This is a live mirror of the Perl 5 development currently hosted at https://github.com/perl/perl5
Don't bother checking for the Greek special
[perl5.git] / regcomp.c
CommitLineData
a0d0e21e
LW
1/* regcomp.c
2 */
3
4/*
5 * "A fair jaw-cracker dwarf-language must be." --Samwise Gamgee
6 */
7
a687059c
LW
8/* NOTE: this is derived from Henry Spencer's regexp code, and should not
9 * confused with the original package (see point 3 below). Thanks, Henry!
10 */
11
12/* Additional note: this code is very heavily munged from Henry's version
13 * in places. In some spots I've traded clarity for efficiency, so don't
14 * blame Henry for some of the lack of readability.
15 */
16
e50aee73
AD
17/* The names of the functions have been changed from regcomp and
18 * regexec to pregcomp and pregexec in order to avoid conflicts
19 * with the POSIX routines of the same names.
20*/
21
b9d5759e
AD
22#ifdef PERL_EXT_RE_BUILD
23/* need to replace pregcomp et al, so enable that */
24# ifndef PERL_IN_XSUB_RE
25# define PERL_IN_XSUB_RE
26# endif
27/* need access to debugger hooks */
cad2e5aa 28# if defined(PERL_EXT_RE_DEBUG) && !defined(DEBUGGING)
b9d5759e
AD
29# define DEBUGGING
30# endif
31#endif
32
33#ifdef PERL_IN_XSUB_RE
d06ea78c 34/* We *really* need to overwrite these symbols: */
56953603
IZ
35# define Perl_pregcomp my_regcomp
36# define Perl_regdump my_regdump
37# define Perl_regprop my_regprop
d06ea78c 38# define Perl_pregfree my_regfree
cad2e5aa
JH
39# define Perl_re_intuit_string my_re_intuit_string
40/* *These* symbols are masked to allow static link. */
d06ea78c 41# define Perl_regnext my_regnext
f0b8d043 42# define Perl_save_re_context my_save_re_context
b81d288d 43# define Perl_reginitcolors my_reginitcolors
c5be433b
GS
44
45# define PERL_NO_GET_CONTEXT
b81d288d 46#endif
56953603 47
f0fcb552 48/*SUPPRESS 112*/
a687059c 49/*
e50aee73 50 * pregcomp and pregexec -- regsub and regerror are not used in perl
a687059c
LW
51 *
52 * Copyright (c) 1986 by University of Toronto.
53 * Written by Henry Spencer. Not derived from licensed software.
54 *
55 * Permission is granted to anyone to use this software for any
56 * purpose on any computer system, and to redistribute it freely,
57 * subject to the following restrictions:
58 *
59 * 1. The author is not responsible for the consequences of use of
60 * this software, no matter how awful, even if they arise
61 * from defects in it.
62 *
63 * 2. The origin of this software must not be misrepresented, either
64 * by explicit claim or by omission.
65 *
66 * 3. Altered versions must be plainly marked as such, and must not
67 * be misrepresented as being the original software.
68 *
69 *
70 **** Alterations to Henry's code are...
71 ****
bc89e66f 72 **** Copyright (c) 1991-2001, Larry Wall
a687059c 73 ****
9ef589d8
LW
74 **** You may distribute under the terms of either the GNU General Public
75 **** License or the Artistic License, as specified in the README file.
76
a687059c
LW
77 *
78 * Beware that some of this code is subtly aware of the way operator
79 * precedence is structured in regular expressions. Serious changes in
80 * regular-expression syntax might require a total rethink.
81 */
82#include "EXTERN.h"
864dbfa3 83#define PERL_IN_REGCOMP_C
a687059c 84#include "perl.h"
d06ea78c 85
acfe0abc 86#ifndef PERL_IN_XSUB_RE
d06ea78c
GS
87# include "INTERN.h"
88#endif
c277df42
IZ
89
90#define REG_COMP_C
a687059c
LW
91#include "regcomp.h"
92
d4cce5f1 93#ifdef op
11343788 94#undef op
d4cce5f1 95#endif /* op */
11343788 96
fe14fcc3
LW
97#ifdef MSDOS
98# if defined(BUGGY_MSC6)
99 /* MSC 6.00A breaks on op/regexp.t test 85 unless we turn this off */
100 # pragma optimize("a",off)
101 /* But MSC 6.00A is happy with 'w', for aliases only across function calls*/
102 # pragma optimize("w",on )
103# endif /* BUGGY_MSC6 */
104#endif /* MSDOS */
105
a687059c
LW
106#ifndef STATIC
107#define STATIC static
108#endif
109
830247a4
IZ
110typedef struct RExC_state_t {
111 U16 flags16; /* are we folding, multilining? */
112 char *precomp; /* uncompiled string. */
113 regexp *rx;
fac92740 114 char *start; /* Start of input for compile */
830247a4
IZ
115 char *end; /* End of input for compile */
116 char *parse; /* Input-scan pointer. */
117 I32 whilem_seen; /* number of WHILEM in this expr */
fac92740 118 regnode *emit_start; /* Start of emitted-code area */
ffc61ed2 119 regnode *emit; /* Code-emit pointer; &regdummy = don't = compiling */
830247a4
IZ
120 I32 naughty; /* How bad is this pattern? */
121 I32 sawback; /* Did we see \1, ...? */
122 U32 seen;
123 I32 size; /* Code size. */
124 I32 npar; /* () count. */
125 I32 extralen;
126 I32 seen_zerolen;
127 I32 seen_evals;
1aa99e6b 128 I32 utf8;
830247a4
IZ
129#if ADD_TO_REGEXEC
130 char *starttry; /* -Dr: where regtry was called. */
131#define RExC_starttry (pRExC_state->starttry)
132#endif
133} RExC_state_t;
134
135#define RExC_flags16 (pRExC_state->flags16)
136#define RExC_precomp (pRExC_state->precomp)
137#define RExC_rx (pRExC_state->rx)
fac92740 138#define RExC_start (pRExC_state->start)
830247a4
IZ
139#define RExC_end (pRExC_state->end)
140#define RExC_parse (pRExC_state->parse)
141#define RExC_whilem_seen (pRExC_state->whilem_seen)
fac92740 142#define RExC_offsets (pRExC_state->rx->offsets) /* I am not like the others */
830247a4 143#define RExC_emit (pRExC_state->emit)
fac92740 144#define RExC_emit_start (pRExC_state->emit_start)
830247a4
IZ
145#define RExC_naughty (pRExC_state->naughty)
146#define RExC_sawback (pRExC_state->sawback)
147#define RExC_seen (pRExC_state->seen)
148#define RExC_size (pRExC_state->size)
149#define RExC_npar (pRExC_state->npar)
150#define RExC_extralen (pRExC_state->extralen)
151#define RExC_seen_zerolen (pRExC_state->seen_zerolen)
152#define RExC_seen_evals (pRExC_state->seen_evals)
1aa99e6b 153#define RExC_utf8 (pRExC_state->utf8)
830247a4 154
a687059c
LW
155#define ISMULT1(c) ((c) == '*' || (c) == '+' || (c) == '?')
156#define ISMULT2(s) ((*s) == '*' || (*s) == '+' || (*s) == '?' || \
157 ((*s) == '{' && regcurly(s)))
a687059c 158
35c8bce7
LW
159#ifdef SPSTART
160#undef SPSTART /* dratted cpp namespace... */
161#endif
a687059c
LW
162/*
163 * Flags to be passed up and down.
164 */
a687059c 165#define WORST 0 /* Worst case. */
821b33a5 166#define HASWIDTH 0x1 /* Known to match non-null strings. */
a0d0e21e
LW
167#define SIMPLE 0x2 /* Simple enough to be STAR/PLUS operand. */
168#define SPSTART 0x4 /* Starts with * or +. */
169#define TRYAGAIN 0x8 /* Weeded out a declaration. */
a687059c 170
2c2d71f5
JH
171/* Length of a variant. */
172
173typedef struct scan_data_t {
174 I32 len_min;
175 I32 len_delta;
176 I32 pos_min;
177 I32 pos_delta;
178 SV *last_found;
179 I32 last_end; /* min value, <0 unless valid. */
180 I32 last_start_min;
181 I32 last_start_max;
182 SV **longest; /* Either &l_fixed, or &l_float. */
183 SV *longest_fixed;
184 I32 offset_fixed;
185 SV *longest_float;
186 I32 offset_float_min;
187 I32 offset_float_max;
188 I32 flags;
189 I32 whilem_c;
cb434fcc 190 I32 *last_closep;
653099ff 191 struct regnode_charclass_class *start_class;
2c2d71f5
JH
192} scan_data_t;
193
a687059c 194/*
e50aee73 195 * Forward declarations for pregcomp()'s friends.
a687059c 196 */
a0d0e21e 197
b81d288d 198static scan_data_t zero_scan_data = { 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
cb434fcc 199 0, 0, 0, 0, 0, 0};
c277df42
IZ
200
201#define SF_BEFORE_EOL (SF_BEFORE_SEOL|SF_BEFORE_MEOL)
202#define SF_BEFORE_SEOL 0x1
203#define SF_BEFORE_MEOL 0x2
204#define SF_FIX_BEFORE_EOL (SF_FIX_BEFORE_SEOL|SF_FIX_BEFORE_MEOL)
205#define SF_FL_BEFORE_EOL (SF_FL_BEFORE_SEOL|SF_FL_BEFORE_MEOL)
206
09b7f37c
CB
207#ifdef NO_UNARY_PLUS
208# define SF_FIX_SHIFT_EOL (0+2)
209# define SF_FL_SHIFT_EOL (0+4)
210#else
211# define SF_FIX_SHIFT_EOL (+2)
212# define SF_FL_SHIFT_EOL (+4)
213#endif
c277df42
IZ
214
215#define SF_FIX_BEFORE_SEOL (SF_BEFORE_SEOL << SF_FIX_SHIFT_EOL)
216#define SF_FIX_BEFORE_MEOL (SF_BEFORE_MEOL << SF_FIX_SHIFT_EOL)
217
218#define SF_FL_BEFORE_SEOL (SF_BEFORE_SEOL << SF_FL_SHIFT_EOL)
219#define SF_FL_BEFORE_MEOL (SF_BEFORE_MEOL << SF_FL_SHIFT_EOL) /* 0x20 */
220#define SF_IS_INF 0x40
221#define SF_HAS_PAR 0x80
222#define SF_IN_PAR 0x100
223#define SF_HAS_EVAL 0x200
4bfe0158 224#define SCF_DO_SUBSTR 0x400
653099ff
GS
225#define SCF_DO_STCLASS_AND 0x0800
226#define SCF_DO_STCLASS_OR 0x1000
227#define SCF_DO_STCLASS (SCF_DO_STCLASS_AND|SCF_DO_STCLASS_OR)
e1901655 228#define SCF_WHILEM_VISITED_POS 0x2000
c277df42 229
1aa99e6b 230#define UTF RExC_utf8
830247a4
IZ
231#define LOC (RExC_flags16 & PMf_LOCALE)
232#define FOLD (RExC_flags16 & PMf_FOLD)
a0ed51b3 233
ffc61ed2 234#define OOB_UNICODE 12345678
93733859 235#define OOB_NAMEDCLASS -1
b8c5462f 236
a0ed51b3
LW
237#define CHR_SVLEN(sv) (UTF ? sv_len_utf8(sv) : SvCUR(sv))
238#define CHR_DIST(a,b) (UTF ? utf8_distance(a,b) : a - b)
239
8615cb43 240
b45f050a
JF
241/* length of regex to show in messages that don't mark a position within */
242#define RegexLengthToShowInErrorMessages 127
243
244/*
245 * If MARKER[12] are adjusted, be sure to adjust the constants at the top
246 * of t/op/regmesg.t, the tests in t/op/re_tests, and those in
247 * op/pragma/warn/regcomp.
248 */
7253e4e3
RK
249#define MARKER1 "<-- HERE" /* marker as it appears in the description */
250#define MARKER2 " <-- HERE " /* marker as it appears within the regex */
b81d288d 251
7253e4e3 252#define REPORT_LOCATION " in regex; marked by " MARKER1 " in m/%.*s" MARKER2 "%s/"
b45f050a
JF
253
254/*
255 * Calls SAVEDESTRUCTOR_X if needed, then calls Perl_croak with the given
256 * arg. Show regex, up to a maximum length. If it's too long, chop and add
257 * "...".
258 */
a4eb266f 259#define FAIL(msg) \
8615cb43 260 STMT_START { \
a4eb266f 261 char *ellipses = ""; \
7a799e20 262 IV len = RExC_end - RExC_precomp; \
b45f050a 263 \
8615cb43 264 if (!SIZE_ONLY) \
830247a4 265 SAVEDESTRUCTOR_X(clear_re,(void*)RExC_rx); \
b45f050a
JF
266 \
267 if (len > RegexLengthToShowInErrorMessages) { \
268 /* chop 10 shorter than the max, to ensure meaning of "..." */ \
269 len = RegexLengthToShowInErrorMessages - 10; \
a4eb266f 270 ellipses = "..."; \
b45f050a
JF
271 } \
272 Perl_croak(aTHX_ "%s in regex m/%.*s%s/", \
830247a4 273 msg, (int)len, RExC_precomp, ellipses); \
8615cb43
JF
274 } STMT_END
275
b45f050a
JF
276/*
277 * Calls SAVEDESTRUCTOR_X if needed, then calls Perl_croak with the given
278 * args. Show regex, up to a maximum length. If it's too long, chop and add
279 * "...".
280 */
a4eb266f 281#define FAIL2(pat,msg) \
8615cb43 282 STMT_START { \
a4eb266f 283 char *ellipses = ""; \
7a799e20 284 IV len = RExC_end - RExC_precomp; \
b45f050a 285 \
8615cb43 286 if (!SIZE_ONLY) \
830247a4 287 SAVEDESTRUCTOR_X(clear_re,(void*)RExC_rx); \
8615cb43 288 \
b45f050a
JF
289 if (len > RegexLengthToShowInErrorMessages) { \
290 /* chop 10 shorter than the max, to ensure meaning of "..." */ \
291 len = RegexLengthToShowInErrorMessages - 10; \
a4eb266f 292 ellipses = "..."; \
b45f050a
JF
293 } \
294 S_re_croak2(aTHX_ pat, " in regex m/%.*s%s/", \
830247a4 295 msg, (int)len, RExC_precomp, ellipses); \
b45f050a
JF
296 } STMT_END
297
298
299/*
300 * Simple_vFAIL -- like FAIL, but marks the current location in the scan
301 */
302#define Simple_vFAIL(m) \
303 STMT_START { \
7a799e20 304 IV offset = RExC_parse - RExC_precomp; \
b45f050a
JF
305 \
306 Perl_croak(aTHX_ "%s" REPORT_LOCATION, \
830247a4 307 m, (int)offset, RExC_precomp, RExC_precomp + offset); \
b45f050a
JF
308 } STMT_END
309
310/*
311 * Calls SAVEDESTRUCTOR_X if needed, then Simple_vFAIL()
312 */
313#define vFAIL(m) \
314 STMT_START { \
315 if (!SIZE_ONLY) \
830247a4 316 SAVEDESTRUCTOR_X(clear_re,(void*)RExC_rx); \
b45f050a
JF
317 Simple_vFAIL(m); \
318 } STMT_END
319
320/*
321 * Like Simple_vFAIL(), but accepts two arguments.
322 */
323#define Simple_vFAIL2(m,a1) \
324 STMT_START { \
7a799e20 325 IV offset = RExC_parse - RExC_precomp; \
b45f050a
JF
326 \
327 S_re_croak2(aTHX_ m, REPORT_LOCATION, a1, \
830247a4 328 (int)offset, RExC_precomp, RExC_precomp + offset); \
b45f050a
JF
329 } STMT_END
330
331/*
332 * Calls SAVEDESTRUCTOR_X if needed, then Simple_vFAIL2().
333 */
334#define vFAIL2(m,a1) \
335 STMT_START { \
336 if (!SIZE_ONLY) \
830247a4 337 SAVEDESTRUCTOR_X(clear_re,(void*)RExC_rx); \
b45f050a
JF
338 Simple_vFAIL2(m, a1); \
339 } STMT_END
340
341
342/*
343 * Like Simple_vFAIL(), but accepts three arguments.
344 */
345#define Simple_vFAIL3(m, a1, a2) \
346 STMT_START { \
7a799e20 347 IV offset = RExC_parse - RExC_precomp; \
b45f050a
JF
348 \
349 S_re_croak2(aTHX_ m, REPORT_LOCATION, a1, a2, \
830247a4 350 (int)offset, RExC_precomp, RExC_precomp + offset); \
b45f050a
JF
351 } STMT_END
352
353/*
354 * Calls SAVEDESTRUCTOR_X if needed, then Simple_vFAIL3().
355 */
356#define vFAIL3(m,a1,a2) \
357 STMT_START { \
358 if (!SIZE_ONLY) \
830247a4 359 SAVEDESTRUCTOR_X(clear_re,(void*)RExC_rx); \
b45f050a
JF
360 Simple_vFAIL3(m, a1, a2); \
361 } STMT_END
362
363/*
364 * Like Simple_vFAIL(), but accepts four arguments.
365 */
366#define Simple_vFAIL4(m, a1, a2, a3) \
367 STMT_START { \
7a799e20 368 IV offset = RExC_parse - RExC_precomp; \
b45f050a
JF
369 \
370 S_re_croak2(aTHX_ m, REPORT_LOCATION, a1, a2, a3,\
830247a4 371 (int)offset, RExC_precomp, RExC_precomp + offset); \
b45f050a
JF
372 } STMT_END
373
374/*
375 * Like Simple_vFAIL(), but accepts five arguments.
376 */
377#define Simple_vFAIL5(m, a1, a2, a3, a4) \
378 STMT_START { \
7a799e20 379 IV offset = RExC_parse - RExC_precomp; \
b45f050a 380 S_re_croak2(aTHX_ m, REPORT_LOCATION, a1, a2, a3, a4,\
830247a4 381 (int)offset, RExC_precomp, RExC_precomp + offset); \
b45f050a
JF
382 } STMT_END
383
384
385#define vWARN(loc,m) \
386 STMT_START { \
7a799e20 387 IV offset = loc - RExC_precomp; \
b45f050a 388 Perl_warner(aTHX_ WARN_REGEXP, "%s" REPORT_LOCATION,\
830247a4 389 m, (int)offset, RExC_precomp, RExC_precomp + offset); \
b45f050a
JF
390 } STMT_END \
391
f9373011
PM
392#define vWARNdep(loc,m) \
393 STMT_START { \
7a799e20 394 IV offset = loc - RExC_precomp; \
f9373011
PM
395 int warn_cat = ckWARN(WARN_REGEXP) ? WARN_REGEXP : WARN_DEPRECATED; \
396 Perl_warner(aTHX_ warn_cat, "%s" REPORT_LOCATION,\
397 m, (int)offset, RExC_precomp, RExC_precomp + offset); \
398 } STMT_END \
399
b45f050a
JF
400
401#define vWARN2(loc, m, a1) \
402 STMT_START { \
7a799e20 403 IV offset = loc - RExC_precomp; \
b45f050a
JF
404 Perl_warner(aTHX_ WARN_REGEXP, m REPORT_LOCATION,\
405 a1, \
830247a4 406 (int)offset, RExC_precomp, RExC_precomp + offset); \
b45f050a
JF
407 } STMT_END
408
409#define vWARN3(loc, m, a1, a2) \
410 STMT_START { \
7a799e20 411 IV offset = loc - RExC_precomp; \
b45f050a
JF
412 Perl_warner(aTHX_ WARN_REGEXP, m REPORT_LOCATION, \
413 a1, a2, \
830247a4 414 (int)offset, RExC_precomp, RExC_precomp + offset); \
b45f050a
JF
415 } STMT_END
416
417#define vWARN4(loc, m, a1, a2, a3) \
418 STMT_START { \
7a799e20 419 IV offset = loc - RExC_precomp; \
b45f050a
JF
420 Perl_warner(aTHX_ WARN_REGEXP, m REPORT_LOCATION,\
421 a1, a2, a3, \
830247a4 422 (int)offset, RExC_precomp, RExC_precomp + offset); \
8615cb43
JF
423 } STMT_END
424
9d1d55b5
JP
425/* used for the parse_flags section for (?c) -- japhy */
426#define vWARN5(loc, m, a1, a2, a3, a4) \
427 STMT_START { \
7a799e20 428 IV offset = loc - RExC_precomp; \
9d1d55b5
JP
429 Perl_warner(aTHX_ WARN_REGEXP, m REPORT_LOCATION, \
430 a1, a2, a3, a4, \
431 (int)offset, RExC_precomp, RExC_precomp + offset); \
432 } STMT_END
433
8615cb43 434
cd439c50 435/* Allow for side effects in s */
fd3f0ae2 436#define REGC(c,s) STMT_START { if (!SIZE_ONLY) *(s) = (c); else (void)(s);} STMT_END
cd439c50 437
fac92740
MJD
438/* Macros for recording node offsets. 20001227 mjd@plover.com
439 * Nodes are numbered 1, 2, 3, 4. Node #n's position is recorded in
440 * element 2*n-1 of the array. Element #2n holds the byte length node #n.
441 * Element 0 holds the number n.
442 */
443
444#define MJD_OFFSET_DEBUG(x)
445/* #define MJD_OFFSET_DEBUG(x) fprintf x */
446
447
448# define Set_Node_Offset_To_R(node,byte) \
449 STMT_START { \
450 if (! SIZE_ONLY) { \
451 if((node) < 0) { \
452 Perl_croak(aTHX_ "value of node is %d in Offset macro", node); \
453 } else { \
454 RExC_offsets[2*(node)-1] = (byte); \
455 } \
456 } \
457 } STMT_END
458
459# define Set_Node_Offset(node,byte) Set_Node_Offset_To_R((node)-RExC_emit_start, (byte)-RExC_start)
460# define Set_Cur_Node_Offset Set_Node_Offset(RExC_emit, RExC_parse)
461
462# define Set_Node_Length_To_R(node,len) \
463 STMT_START { \
464 if (! SIZE_ONLY) { \
465 MJD_OFFSET_DEBUG((stderr, "** (%d) size of node %d is %d.\n", __LINE__, (node), (len))); \
466 if((node) < 0) { \
467 Perl_croak(aTHX_ "value of node is %d in Length macro", node); \
468 } else { \
469 RExC_offsets[2*(node)] = (len); \
470 } \
471 } \
472 } STMT_END
473
474# define Set_Node_Length(node,len) Set_Node_Length_To_R((node)-RExC_emit_start, len)
475# define Set_Cur_Node_Length(len) Set_Node_Length(RExC_emit, len)
476# define Set_Node_Cur_Length(node) Set_Node_Length(node, RExC_parse - parse_start)
477
478/* Get offsets and lengths */
479#define Node_Offset(n) (RExC_offsets[2*((n)-RExC_emit_start)-1])
480#define Node_Length(n) (RExC_offsets[2*((n)-RExC_emit_start)])
481
acfe0abc 482static void clear_re(pTHX_ void *r);
4327152a 483
653099ff
GS
484/* Mark that we cannot extend a found fixed substring at this point.
485 Updata the longest found anchored substring and the longest found
486 floating substrings if needed. */
487
4327152a 488STATIC void
830247a4 489S_scan_commit(pTHX_ RExC_state_t *pRExC_state, scan_data_t *data)
c277df42 490{
a0ed51b3
LW
491 STRLEN l = CHR_SVLEN(data->last_found);
492 STRLEN old_l = CHR_SVLEN(*data->longest);
b81d288d 493
c277df42
IZ
494 if ((l >= old_l) && ((l > old_l) || (data->flags & SF_BEFORE_EOL))) {
495 sv_setsv(*data->longest, data->last_found);
496 if (*data->longest == data->longest_fixed) {
497 data->offset_fixed = l ? data->last_start_min : data->pos_min;
498 if (data->flags & SF_BEFORE_EOL)
b81d288d 499 data->flags
c277df42
IZ
500 |= ((data->flags & SF_BEFORE_EOL) << SF_FIX_SHIFT_EOL);
501 else
502 data->flags &= ~SF_FIX_BEFORE_EOL;
a0ed51b3
LW
503 }
504 else {
c277df42 505 data->offset_float_min = l ? data->last_start_min : data->pos_min;
b81d288d
AB
506 data->offset_float_max = (l
507 ? data->last_start_max
c277df42
IZ
508 : data->pos_min + data->pos_delta);
509 if (data->flags & SF_BEFORE_EOL)
b81d288d 510 data->flags
c277df42
IZ
511 |= ((data->flags & SF_BEFORE_EOL) << SF_FL_SHIFT_EOL);
512 else
513 data->flags &= ~SF_FL_BEFORE_EOL;
514 }
515 }
516 SvCUR_set(data->last_found, 0);
517 data->last_end = -1;
518 data->flags &= ~SF_BEFORE_EOL;
519}
520
653099ff
GS
521/* Can match anything (initialization) */
522STATIC void
830247a4 523S_cl_anything(pTHX_ RExC_state_t *pRExC_state, struct regnode_charclass_class *cl)
653099ff 524{
653099ff 525 ANYOF_CLASS_ZERO(cl);
f8bef550 526 ANYOF_BITMAP_SETALL(cl);
1aa99e6b 527 cl->flags = ANYOF_EOS|ANYOF_UNICODE_ALL;
653099ff
GS
528 if (LOC)
529 cl->flags |= ANYOF_LOCALE;
530}
531
532/* Can match anything (initialization) */
533STATIC int
534S_cl_is_anything(pTHX_ struct regnode_charclass_class *cl)
535{
536 int value;
537
aaa51d5e 538 for (value = 0; value <= ANYOF_MAX; value += 2)
653099ff
GS
539 if (ANYOF_CLASS_TEST(cl, value) && ANYOF_CLASS_TEST(cl, value + 1))
540 return 1;
1aa99e6b
IH
541 if (!(cl->flags & ANYOF_UNICODE_ALL))
542 return 0;
f8bef550
NC
543 if (!ANYOF_BITMAP_TESTALLSET(cl))
544 return 0;
653099ff
GS
545 return 1;
546}
547
548/* Can match anything (initialization) */
549STATIC void
830247a4 550S_cl_init(pTHX_ RExC_state_t *pRExC_state, struct regnode_charclass_class *cl)
653099ff 551{
8ecf7187 552 Zero(cl, 1, struct regnode_charclass_class);
653099ff 553 cl->type = ANYOF;
830247a4 554 cl_anything(pRExC_state, cl);
653099ff
GS
555}
556
557STATIC void
830247a4 558S_cl_init_zero(pTHX_ RExC_state_t *pRExC_state, struct regnode_charclass_class *cl)
653099ff 559{
8ecf7187 560 Zero(cl, 1, struct regnode_charclass_class);
653099ff 561 cl->type = ANYOF;
830247a4 562 cl_anything(pRExC_state, cl);
653099ff
GS
563 if (LOC)
564 cl->flags |= ANYOF_LOCALE;
565}
566
567/* 'And' a given class with another one. Can create false positives */
568/* We assume that cl is not inverted */
569STATIC void
570S_cl_and(pTHX_ struct regnode_charclass_class *cl,
571 struct regnode_charclass_class *and_with)
572{
653099ff
GS
573 if (!(and_with->flags & ANYOF_CLASS)
574 && !(cl->flags & ANYOF_CLASS)
575 && (and_with->flags & ANYOF_LOCALE) == (cl->flags & ANYOF_LOCALE)
576 && !(and_with->flags & ANYOF_FOLD)
577 && !(cl->flags & ANYOF_FOLD)) {
578 int i;
579
580 if (and_with->flags & ANYOF_INVERT)
581 for (i = 0; i < ANYOF_BITMAP_SIZE; i++)
582 cl->bitmap[i] &= ~and_with->bitmap[i];
583 else
584 for (i = 0; i < ANYOF_BITMAP_SIZE; i++)
585 cl->bitmap[i] &= and_with->bitmap[i];
586 } /* XXXX: logic is complicated otherwise, leave it along for a moment. */
587 if (!(and_with->flags & ANYOF_EOS))
588 cl->flags &= ~ANYOF_EOS;
1aa99e6b
IH
589
590 if (cl->flags & ANYOF_UNICODE_ALL && and_with->flags & ANYOF_UNICODE) {
591 cl->flags &= ~ANYOF_UNICODE_ALL;
592 cl->flags |= ANYOF_UNICODE;
593 ARG_SET(cl, ARG(and_with));
594 }
595 if (!(and_with->flags & ANYOF_UNICODE_ALL))
596 cl->flags &= ~ANYOF_UNICODE_ALL;
597 if (!(and_with->flags & (ANYOF_UNICODE|ANYOF_UNICODE_ALL)))
598 cl->flags &= ~ANYOF_UNICODE;
653099ff
GS
599}
600
601/* 'OR' a given class with another one. Can create false positives */
602/* We assume that cl is not inverted */
603STATIC void
830247a4 604S_cl_or(pTHX_ RExC_state_t *pRExC_state, struct regnode_charclass_class *cl, struct regnode_charclass_class *or_with)
653099ff 605{
653099ff
GS
606 if (or_with->flags & ANYOF_INVERT) {
607 /* We do not use
608 * (B1 | CL1) | (!B2 & !CL2) = (B1 | !B2 & !CL2) | (CL1 | (!B2 & !CL2))
609 * <= (B1 | !B2) | (CL1 | !CL2)
610 * which is wasteful if CL2 is small, but we ignore CL2:
611 * (B1 | CL1) | (!B2 & !CL2) <= (B1 | CL1) | !B2 = (B1 | !B2) | CL1
612 * XXXX Can we handle case-fold? Unclear:
613 * (OK1(i) | OK1(i')) | !(OK1(i) | OK1(i')) =
614 * (OK1(i) | OK1(i')) | (!OK1(i) & !OK1(i'))
615 */
616 if ( (or_with->flags & ANYOF_LOCALE) == (cl->flags & ANYOF_LOCALE)
617 && !(or_with->flags & ANYOF_FOLD)
618 && !(cl->flags & ANYOF_FOLD) ) {
619 int i;
620
621 for (i = 0; i < ANYOF_BITMAP_SIZE; i++)
622 cl->bitmap[i] |= ~or_with->bitmap[i];
623 } /* XXXX: logic is complicated otherwise */
624 else {
830247a4 625 cl_anything(pRExC_state, cl);
653099ff
GS
626 }
627 } else {
628 /* (B1 | CL1) | (B2 | CL2) = (B1 | B2) | (CL1 | CL2)) */
629 if ( (or_with->flags & ANYOF_LOCALE) == (cl->flags & ANYOF_LOCALE)
b81d288d 630 && (!(or_with->flags & ANYOF_FOLD)
653099ff
GS
631 || (cl->flags & ANYOF_FOLD)) ) {
632 int i;
633
634 /* OR char bitmap and class bitmap separately */
635 for (i = 0; i < ANYOF_BITMAP_SIZE; i++)
636 cl->bitmap[i] |= or_with->bitmap[i];
637 if (or_with->flags & ANYOF_CLASS) {
638 for (i = 0; i < ANYOF_CLASSBITMAP_SIZE; i++)
639 cl->classflags[i] |= or_with->classflags[i];
640 cl->flags |= ANYOF_CLASS;
641 }
642 }
643 else { /* XXXX: logic is complicated, leave it along for a moment. */
830247a4 644 cl_anything(pRExC_state, cl);
653099ff
GS
645 }
646 }
647 if (or_with->flags & ANYOF_EOS)
648 cl->flags |= ANYOF_EOS;
1aa99e6b
IH
649
650 if (cl->flags & ANYOF_UNICODE && or_with->flags & ANYOF_UNICODE &&
651 ARG(cl) != ARG(or_with)) {
652 cl->flags |= ANYOF_UNICODE_ALL;
653 cl->flags &= ~ANYOF_UNICODE;
654 }
655 if (or_with->flags & ANYOF_UNICODE_ALL) {
656 cl->flags |= ANYOF_UNICODE_ALL;
657 cl->flags &= ~ANYOF_UNICODE;
658 }
653099ff
GS
659}
660
5d1c421c
JH
661/*
662 * There are strange code-generation bugs caused on sparc64 by gcc-2.95.2.
663 * These need to be revisited when a newer toolchain becomes available.
664 */
665#if defined(__sparc64__) && defined(__GNUC__)
666# if __GNUC__ < 2 || (__GNUC__ == 2 && __GNUC_MINOR__ < 96)
667# undef SPARC64_GCC_WORKAROUND
668# define SPARC64_GCC_WORKAROUND 1
669# endif
670#endif
671
653099ff
GS
672/* REx optimizer. Converts nodes into quickier variants "in place".
673 Finds fixed substrings. */
674
c277df42
IZ
675/* Stops at toplevel WHILEM as well as at `last'. At end *scanp is set
676 to the position after last scanned or to NULL. */
677
76e3520e 678STATIC I32
830247a4 679S_study_chunk(pTHX_ RExC_state_t *pRExC_state, regnode **scanp, I32 *deltap, regnode *last, scan_data_t *data, U32 flags)
c277df42
IZ
680 /* scanp: Start here (read-write). */
681 /* deltap: Write maxlen-minlen here. */
682 /* last: Stop before this one. */
683{
684 I32 min = 0, pars = 0, code;
685 regnode *scan = *scanp, *next;
686 I32 delta = 0;
687 int is_inf = (flags & SCF_DO_SUBSTR) && (data->flags & SF_IS_INF);
aca2d497 688 int is_inf_internal = 0; /* The studied chunk is infinite */
c277df42
IZ
689 I32 is_par = OP(scan) == OPEN ? ARG(scan) : 0;
690 scan_data_t data_fake;
653099ff 691 struct regnode_charclass_class and_with; /* Valid if flags & SCF_DO_STCLASS_OR */
b81d288d 692
c277df42
IZ
693 while (scan && OP(scan) != END && scan < last) {
694 /* Peephole optimizer: */
695
22c35a8c 696 if (PL_regkind[(U8)OP(scan)] == EXACT) {
653099ff 697 /* Merge several consecutive EXACTish nodes into one. */
c277df42
IZ
698 regnode *n = regnext(scan);
699 U32 stringok = 1;
700#ifdef DEBUGGING
701 regnode *stop = scan;
b81d288d 702#endif
c277df42 703
cd439c50 704 next = scan + NODE_SZ_STR(scan);
c277df42
IZ
705 /* Skip NOTHING, merge EXACT*. */
706 while (n &&
b81d288d 707 ( PL_regkind[(U8)OP(n)] == NOTHING ||
c277df42
IZ
708 (stringok && (OP(n) == OP(scan))))
709 && NEXT_OFF(n)
710 && NEXT_OFF(scan) + NEXT_OFF(n) < I16_MAX) {
711 if (OP(n) == TAIL || n > next)
712 stringok = 0;
22c35a8c 713 if (PL_regkind[(U8)OP(n)] == NOTHING) {
c277df42
IZ
714 NEXT_OFF(scan) += NEXT_OFF(n);
715 next = n + NODE_STEP_REGNODE;
716#ifdef DEBUGGING
717 if (stringok)
718 stop = n;
b81d288d 719#endif
c277df42 720 n = regnext(n);
a0ed51b3 721 }
f49d4d0f 722 else if (stringok) {
cd439c50 723 int oldl = STR_LEN(scan);
c277df42 724 regnode *nnext = regnext(n);
f49d4d0f 725
b81d288d 726 if (oldl + STR_LEN(n) > U8_MAX)
c277df42
IZ
727 break;
728 NEXT_OFF(scan) += NEXT_OFF(n);
cd439c50
IZ
729 STR_LEN(scan) += STR_LEN(n);
730 next = n + NODE_SZ_STR(n);
c277df42 731 /* Now we can overwrite *n : */
f49d4d0f 732 Move(STRING(n), STRING(scan) + oldl, STR_LEN(n), char);
c277df42 733#ifdef DEBUGGING
f49d4d0f 734 stop = next - 1;
b81d288d 735#endif
c277df42
IZ
736 n = nnext;
737 }
738 }
61a36c01 739
d65e4eab 740 if (UTF && OP(scan) == EXACTF && STR_LEN(scan) >= 6) {
61a36c01
JH
741/*
742 Two problematic code points in Unicode casefolding of EXACT nodes:
743
744 U+0390 - GREEK SMALL LETTER IOTA WITH DIALYTIKA AND TONOS
745 U+03B0 - GREEK SMALL LETTER UPSILON WITH DIALYTIKA AND TONOS
746
747 which casefold to
748
749 Unicode UTF-8
750
751 U+03B9 U+0308 U+0301 0xCE 0xB9 0xCC 0x88 0xCC 0x81
752 U+03C5 U+0308 U+0301 0xCF 0x85 0xCC 0x88 0xCC 0x81
753
754 This means that in case-insensitive matching (or "loose matching",
755 as Unicode calls it), an EXACTF of length six (the UTF-8 encoded byte
756 length of the above casefolded versions) can match a target string
757 of length two (the byte length of UTF-8 encoded U+0390 or U+03B0).
758 This would rather mess up the minimum length computation.
759
760 What we'll do is to look for the tail four bytes, and then peek
761 at the preceding two bytes to see whether we need to decrease
762 the minimum length by four (six minus two).
763
764 Thanks to the design of UTF-8, there cannot be false matches:
765 A sequence of valid UTF-8 bytes cannot be a subsequence of
766 another valid sequence of UTF-8 bytes.
767
768*/
769 char *s0 = STRING(scan), *s, *t;
770 char *s1 = s0 + STR_LEN(scan) - 1, *s2 = s1 - 4;
771 char *t0 = "\xcc\x88\xcc\x81";
772 char *t1 = t0 + 3;
773
774 for (s = s0 + 2;
775 s < s2 && (t = ninstr(s, s1, t0, t1));
776 s = t + 4) {
777 if (((U8)t[-1] == 0xB9 && (U8)t[-2] == 0xCE) ||
778 ((U8)t[-1] == 0x85 && (U8)t[-2] == 0xCF))
779 min -= 4;
780 }
781 }
782
c277df42
IZ
783#ifdef DEBUGGING
784 /* Allow dumping */
cd439c50 785 n = scan + NODE_SZ_STR(scan);
c277df42 786 while (n <= stop) {
22c35a8c 787 if (PL_regkind[(U8)OP(n)] != NOTHING || OP(n) == NOTHING) {
c277df42
IZ
788 OP(n) = OPTIMIZED;
789 NEXT_OFF(n) = 0;
790 }
791 n++;
792 }
653099ff 793#endif
c277df42 794 }
653099ff
GS
795 /* Follow the next-chain of the current node and optimize
796 away all the NOTHINGs from it. */
c277df42 797 if (OP(scan) != CURLYX) {
048cfca1
GS
798 int max = (reg_off_by_arg[OP(scan)]
799 ? I32_MAX
800 /* I32 may be smaller than U16 on CRAYs! */
801 : (I32_MAX < U16_MAX ? I32_MAX : U16_MAX));
c277df42
IZ
802 int off = (reg_off_by_arg[OP(scan)] ? ARG(scan) : NEXT_OFF(scan));
803 int noff;
804 regnode *n = scan;
b81d288d 805
c277df42
IZ
806 /* Skip NOTHING and LONGJMP. */
807 while ((n = regnext(n))
22c35a8c 808 && ((PL_regkind[(U8)OP(n)] == NOTHING && (noff = NEXT_OFF(n)))
c277df42
IZ
809 || ((OP(n) == LONGJMP) && (noff = ARG(n))))
810 && off + noff < max)
811 off += noff;
812 if (reg_off_by_arg[OP(scan)])
813 ARG(scan) = off;
b81d288d 814 else
c277df42
IZ
815 NEXT_OFF(scan) = off;
816 }
653099ff
GS
817 /* The principal pseudo-switch. Cannot be a switch, since we
818 look into several different things. */
b81d288d 819 if (OP(scan) == BRANCH || OP(scan) == BRANCHJ
c277df42
IZ
820 || OP(scan) == IFTHEN || OP(scan) == SUSPEND) {
821 next = regnext(scan);
822 code = OP(scan);
b81d288d
AB
823
824 if (OP(next) == code || code == IFTHEN || code == SUSPEND) {
c277df42 825 I32 max1 = 0, min1 = I32_MAX, num = 0;
653099ff 826 struct regnode_charclass_class accum;
c277df42 827
653099ff 828 if (flags & SCF_DO_SUBSTR) /* XXXX Add !SUSPEND? */
830247a4 829 scan_commit(pRExC_state, data); /* Cannot merge strings after this. */
653099ff 830 if (flags & SCF_DO_STCLASS)
830247a4 831 cl_init_zero(pRExC_state, &accum);
c277df42 832 while (OP(scan) == code) {
830247a4 833 I32 deltanext, minnext, f = 0, fake;
653099ff 834 struct regnode_charclass_class this_class;
c277df42
IZ
835
836 num++;
837 data_fake.flags = 0;
b81d288d 838 if (data) {
2c2d71f5 839 data_fake.whilem_c = data->whilem_c;
cb434fcc
IZ
840 data_fake.last_closep = data->last_closep;
841 }
842 else
843 data_fake.last_closep = &fake;
c277df42
IZ
844 next = regnext(scan);
845 scan = NEXTOPER(scan);
846 if (code != BRANCH)
847 scan = NEXTOPER(scan);
653099ff 848 if (flags & SCF_DO_STCLASS) {
830247a4 849 cl_init(pRExC_state, &this_class);
653099ff
GS
850 data_fake.start_class = &this_class;
851 f = SCF_DO_STCLASS_AND;
b81d288d 852 }
e1901655
IZ
853 if (flags & SCF_WHILEM_VISITED_POS)
854 f |= SCF_WHILEM_VISITED_POS;
653099ff 855 /* we suppose the run is continuous, last=next...*/
830247a4
IZ
856 minnext = study_chunk(pRExC_state, &scan, &deltanext,
857 next, &data_fake, f);
b81d288d 858 if (min1 > minnext)
c277df42
IZ
859 min1 = minnext;
860 if (max1 < minnext + deltanext)
861 max1 = minnext + deltanext;
862 if (deltanext == I32_MAX)
aca2d497 863 is_inf = is_inf_internal = 1;
c277df42
IZ
864 scan = next;
865 if (data_fake.flags & (SF_HAS_PAR|SF_IN_PAR))
866 pars++;
405ff068 867 if (data && (data_fake.flags & SF_HAS_EVAL))
c277df42 868 data->flags |= SF_HAS_EVAL;
2c2d71f5
JH
869 if (data)
870 data->whilem_c = data_fake.whilem_c;
653099ff 871 if (flags & SCF_DO_STCLASS)
830247a4 872 cl_or(pRExC_state, &accum, &this_class);
b81d288d 873 if (code == SUSPEND)
c277df42
IZ
874 break;
875 }
876 if (code == IFTHEN && num < 2) /* Empty ELSE branch */
877 min1 = 0;
878 if (flags & SCF_DO_SUBSTR) {
879 data->pos_min += min1;
880 data->pos_delta += max1 - min1;
881 if (max1 != min1 || is_inf)
882 data->longest = &(data->longest_float);
883 }
884 min += min1;
885 delta += max1 - min1;
653099ff 886 if (flags & SCF_DO_STCLASS_OR) {
830247a4 887 cl_or(pRExC_state, data->start_class, &accum);
653099ff
GS
888 if (min1) {
889 cl_and(data->start_class, &and_with);
890 flags &= ~SCF_DO_STCLASS;
891 }
892 }
893 else if (flags & SCF_DO_STCLASS_AND) {
de0c8cb8
GS
894 if (min1) {
895 cl_and(data->start_class, &accum);
653099ff 896 flags &= ~SCF_DO_STCLASS;
de0c8cb8
GS
897 }
898 else {
b81d288d 899 /* Switch to OR mode: cache the old value of
de0c8cb8
GS
900 * data->start_class */
901 StructCopy(data->start_class, &and_with,
902 struct regnode_charclass_class);
903 flags &= ~SCF_DO_STCLASS_AND;
904 StructCopy(&accum, data->start_class,
905 struct regnode_charclass_class);
906 flags |= SCF_DO_STCLASS_OR;
907 data->start_class->flags |= ANYOF_EOS;
908 }
653099ff 909 }
a0ed51b3
LW
910 }
911 else if (code == BRANCHJ) /* single branch is optimized. */
c277df42
IZ
912 scan = NEXTOPER(NEXTOPER(scan));
913 else /* single branch is optimized. */
914 scan = NEXTOPER(scan);
915 continue;
a0ed51b3
LW
916 }
917 else if (OP(scan) == EXACT) {
cd439c50 918 I32 l = STR_LEN(scan);
1aa99e6b 919 UV uc = *((U8*)STRING(scan));
a0ed51b3 920 if (UTF) {
1aa99e6b
IH
921 U8 *s = (U8*)STRING(scan);
922 l = utf8_length(s, s + l);
9041c2e3 923 uc = utf8_to_uvchr(s, NULL);
a0ed51b3
LW
924 }
925 min += l;
c277df42 926 if (flags & SCF_DO_SUBSTR) { /* Update longest substr. */
c277df42
IZ
927 /* The code below prefers earlier match for fixed
928 offset, later match for variable offset. */
929 if (data->last_end == -1) { /* Update the start info. */
930 data->last_start_min = data->pos_min;
931 data->last_start_max = is_inf
b81d288d 932 ? I32_MAX : data->pos_min + data->pos_delta;
c277df42 933 }
cd439c50 934 sv_catpvn(data->last_found, STRING(scan), STR_LEN(scan));
c277df42
IZ
935 data->last_end = data->pos_min + l;
936 data->pos_min += l; /* As in the first entry. */
937 data->flags &= ~SF_BEFORE_EOL;
938 }
653099ff
GS
939 if (flags & SCF_DO_STCLASS_AND) {
940 /* Check whether it is compatible with what we know already! */
941 int compat = 1;
942
1aa99e6b 943 if (uc >= 0x100 ||
516a5887 944 (!(data->start_class->flags & (ANYOF_CLASS | ANYOF_LOCALE))
1aa99e6b 945 && !ANYOF_BITMAP_TEST(data->start_class, uc)
653099ff 946 && (!(data->start_class->flags & ANYOF_FOLD)
1aa99e6b 947 || !ANYOF_BITMAP_TEST(data->start_class, PL_fold[uc])))
516a5887 948 )
653099ff
GS
949 compat = 0;
950 ANYOF_CLASS_ZERO(data->start_class);
951 ANYOF_BITMAP_ZERO(data->start_class);
952 if (compat)
1aa99e6b 953 ANYOF_BITMAP_SET(data->start_class, uc);
653099ff 954 data->start_class->flags &= ~ANYOF_EOS;
9b877dbb
IH
955 if (uc < 0x100)
956 data->start_class->flags &= ~ANYOF_UNICODE_ALL;
653099ff
GS
957 }
958 else if (flags & SCF_DO_STCLASS_OR) {
959 /* false positive possible if the class is case-folded */
1aa99e6b 960 if (uc < 0x100)
9b877dbb
IH
961 ANYOF_BITMAP_SET(data->start_class, uc);
962 else
963 data->start_class->flags |= ANYOF_UNICODE_ALL;
653099ff
GS
964 data->start_class->flags &= ~ANYOF_EOS;
965 cl_and(data->start_class, &and_with);
966 }
967 flags &= ~SCF_DO_STCLASS;
a0ed51b3 968 }
653099ff 969 else if (PL_regkind[(U8)OP(scan)] == EXACT) { /* But OP != EXACT! */
cd439c50 970 I32 l = STR_LEN(scan);
1aa99e6b 971 UV uc = *((U8*)STRING(scan));
653099ff
GS
972
973 /* Search for fixed substrings supports EXACT only. */
b81d288d 974 if (flags & SCF_DO_SUBSTR)
830247a4 975 scan_commit(pRExC_state, data);
a0ed51b3 976 if (UTF) {
1aa99e6b
IH
977 U8 *s = (U8 *)STRING(scan);
978 l = utf8_length(s, s + l);
9041c2e3 979 uc = utf8_to_uvchr(s, NULL);
a0ed51b3
LW
980 }
981 min += l;
c277df42 982 if (data && (flags & SCF_DO_SUBSTR))
a0ed51b3 983 data->pos_min += l;
653099ff
GS
984 if (flags & SCF_DO_STCLASS_AND) {
985 /* Check whether it is compatible with what we know already! */
986 int compat = 1;
987
1aa99e6b 988 if (uc >= 0x100 ||
516a5887 989 (!(data->start_class->flags & (ANYOF_CLASS | ANYOF_LOCALE))
1aa99e6b 990 && !ANYOF_BITMAP_TEST(data->start_class, uc)
516a5887 991 && !ANYOF_BITMAP_TEST(data->start_class, PL_fold[uc])))
653099ff
GS
992 compat = 0;
993 ANYOF_CLASS_ZERO(data->start_class);
994 ANYOF_BITMAP_ZERO(data->start_class);
995 if (compat) {
1aa99e6b 996 ANYOF_BITMAP_SET(data->start_class, uc);
653099ff
GS
997 data->start_class->flags &= ~ANYOF_EOS;
998 data->start_class->flags |= ANYOF_FOLD;
999 if (OP(scan) == EXACTFL)
1000 data->start_class->flags |= ANYOF_LOCALE;
1001 }
1002 }
1003 else if (flags & SCF_DO_STCLASS_OR) {
1004 if (data->start_class->flags & ANYOF_FOLD) {
1005 /* false positive possible if the class is case-folded.
1006 Assume that the locale settings are the same... */
1aa99e6b
IH
1007 if (uc < 0x100)
1008 ANYOF_BITMAP_SET(data->start_class, uc);
653099ff
GS
1009 data->start_class->flags &= ~ANYOF_EOS;
1010 }
1011 cl_and(data->start_class, &and_with);
1012 }
1013 flags &= ~SCF_DO_STCLASS;
a0ed51b3 1014 }
4d61ec05 1015 else if (strchr((char*)PL_varies,OP(scan))) {
9c5ffd7c 1016 I32 mincount, maxcount, minnext, deltanext, fl = 0;
aa7a4b56 1017 I32 f = flags, pos_before = 0;
c277df42 1018 regnode *oscan = scan;
653099ff
GS
1019 struct regnode_charclass_class this_class;
1020 struct regnode_charclass_class *oclass = NULL;
727f22e3 1021 I32 next_is_eval = 0;
653099ff 1022
22c35a8c 1023 switch (PL_regkind[(U8)OP(scan)]) {
653099ff 1024 case WHILEM: /* End of (?:...)* . */
c277df42
IZ
1025 scan = NEXTOPER(scan);
1026 goto finish;
1027 case PLUS:
653099ff 1028 if (flags & (SCF_DO_SUBSTR | SCF_DO_STCLASS)) {
c277df42 1029 next = NEXTOPER(scan);
653099ff 1030 if (OP(next) == EXACT || (flags & SCF_DO_STCLASS)) {
b81d288d
AB
1031 mincount = 1;
1032 maxcount = REG_INFTY;
c277df42
IZ
1033 next = regnext(scan);
1034 scan = NEXTOPER(scan);
1035 goto do_curly;
1036 }
1037 }
1038 if (flags & SCF_DO_SUBSTR)
1039 data->pos_min++;
1040 min++;
1041 /* Fall through. */
1042 case STAR:
653099ff
GS
1043 if (flags & SCF_DO_STCLASS) {
1044 mincount = 0;
b81d288d 1045 maxcount = REG_INFTY;
653099ff
GS
1046 next = regnext(scan);
1047 scan = NEXTOPER(scan);
1048 goto do_curly;
1049 }
b81d288d 1050 is_inf = is_inf_internal = 1;
c277df42
IZ
1051 scan = regnext(scan);
1052 if (flags & SCF_DO_SUBSTR) {
830247a4 1053 scan_commit(pRExC_state, data); /* Cannot extend fixed substrings */
c277df42
IZ
1054 data->longest = &(data->longest_float);
1055 }
1056 goto optimize_curly_tail;
1057 case CURLY:
b81d288d 1058 mincount = ARG1(scan);
c277df42
IZ
1059 maxcount = ARG2(scan);
1060 next = regnext(scan);
cb434fcc
IZ
1061 if (OP(scan) == CURLYX) {
1062 I32 lp = (data ? *(data->last_closep) : 0);
1063
1064 scan->flags = ((lp <= U8_MAX) ? lp : U8_MAX);
1065 }
c277df42 1066 scan = NEXTOPER(scan) + EXTRA_STEP_2ARGS;
727f22e3 1067 next_is_eval = (OP(scan) == EVAL);
c277df42
IZ
1068 do_curly:
1069 if (flags & SCF_DO_SUBSTR) {
830247a4 1070 if (mincount == 0) scan_commit(pRExC_state,data); /* Cannot extend fixed substrings */
c277df42
IZ
1071 pos_before = data->pos_min;
1072 }
1073 if (data) {
1074 fl = data->flags;
1075 data->flags &= ~(SF_HAS_PAR|SF_IN_PAR|SF_HAS_EVAL);
1076 if (is_inf)
1077 data->flags |= SF_IS_INF;
1078 }
653099ff 1079 if (flags & SCF_DO_STCLASS) {
830247a4 1080 cl_init(pRExC_state, &this_class);
653099ff
GS
1081 oclass = data->start_class;
1082 data->start_class = &this_class;
1083 f |= SCF_DO_STCLASS_AND;
1084 f &= ~SCF_DO_STCLASS_OR;
1085 }
e1901655
IZ
1086 /* These are the cases when once a subexpression
1087 fails at a particular position, it cannot succeed
1088 even after backtracking at the enclosing scope.
b81d288d 1089
e1901655
IZ
1090 XXXX what if minimal match and we are at the
1091 initial run of {n,m}? */
1092 if ((mincount != maxcount - 1) && (maxcount != REG_INFTY))
1093 f &= ~SCF_WHILEM_VISITED_POS;
653099ff 1094
c277df42 1095 /* This will finish on WHILEM, setting scan, or on NULL: */
b81d288d
AB
1096 minnext = study_chunk(pRExC_state, &scan, &deltanext, last, data,
1097 mincount == 0
653099ff
GS
1098 ? (f & ~SCF_DO_SUBSTR) : f);
1099
1100 if (flags & SCF_DO_STCLASS)
1101 data->start_class = oclass;
1102 if (mincount == 0 || minnext == 0) {
1103 if (flags & SCF_DO_STCLASS_OR) {
830247a4 1104 cl_or(pRExC_state, data->start_class, &this_class);
653099ff
GS
1105 }
1106 else if (flags & SCF_DO_STCLASS_AND) {
b81d288d 1107 /* Switch to OR mode: cache the old value of
653099ff
GS
1108 * data->start_class */
1109 StructCopy(data->start_class, &and_with,
1110 struct regnode_charclass_class);
1111 flags &= ~SCF_DO_STCLASS_AND;
1112 StructCopy(&this_class, data->start_class,
1113 struct regnode_charclass_class);
1114 flags |= SCF_DO_STCLASS_OR;
1115 data->start_class->flags |= ANYOF_EOS;
1116 }
1117 } else { /* Non-zero len */
1118 if (flags & SCF_DO_STCLASS_OR) {
830247a4 1119 cl_or(pRExC_state, data->start_class, &this_class);
653099ff
GS
1120 cl_and(data->start_class, &and_with);
1121 }
1122 else if (flags & SCF_DO_STCLASS_AND)
1123 cl_and(data->start_class, &this_class);
1124 flags &= ~SCF_DO_STCLASS;
1125 }
c277df42
IZ
1126 if (!scan) /* It was not CURLYX, but CURLY. */
1127 scan = next;
84037bb0 1128 if (ckWARN(WARN_REGEXP)
727f22e3
JP
1129 /* ? quantifier ok, except for (?{ ... }) */
1130 && (next_is_eval || !(mincount == 0 && maxcount == 1))
84037bb0 1131 && (minnext == 0) && (deltanext == 0)
99799961 1132 && data && !(data->flags & (SF_HAS_PAR|SF_IN_PAR))
17feb5d5 1133 && maxcount <= REG_INFTY/3) /* Complement check for big count */
b45f050a 1134 {
830247a4 1135 vWARN(RExC_parse,
b45f050a
JF
1136 "Quantifier unexpected on zero-length expression");
1137 }
1138
c277df42 1139 min += minnext * mincount;
b81d288d 1140 is_inf_internal |= ((maxcount == REG_INFTY
155aba94
GS
1141 && (minnext + deltanext) > 0)
1142 || deltanext == I32_MAX);
aca2d497 1143 is_inf |= is_inf_internal;
c277df42
IZ
1144 delta += (minnext + deltanext) * maxcount - minnext * mincount;
1145
1146 /* Try powerful optimization CURLYX => CURLYN. */
b81d288d 1147 if ( OP(oscan) == CURLYX && data
c277df42
IZ
1148 && data->flags & SF_IN_PAR
1149 && !(data->flags & SF_HAS_EVAL)
1150 && !deltanext && minnext == 1 ) {
1151 /* Try to optimize to CURLYN. */
1152 regnode *nxt = NEXTOPER(oscan) + EXTRA_STEP_2ARGS;
497b47a8
JH
1153 regnode *nxt1 = nxt;
1154#ifdef DEBUGGING
1155 regnode *nxt2;
1156#endif
c277df42
IZ
1157
1158 /* Skip open. */
1159 nxt = regnext(nxt);
4d61ec05 1160 if (!strchr((char*)PL_simple,OP(nxt))
22c35a8c 1161 && !(PL_regkind[(U8)OP(nxt)] == EXACT
b81d288d 1162 && STR_LEN(nxt) == 1))
c277df42 1163 goto nogo;
497b47a8 1164#ifdef DEBUGGING
c277df42 1165 nxt2 = nxt;
497b47a8 1166#endif
c277df42 1167 nxt = regnext(nxt);
b81d288d 1168 if (OP(nxt) != CLOSE)
c277df42
IZ
1169 goto nogo;
1170 /* Now we know that nxt2 is the only contents: */
1171 oscan->flags = ARG(nxt);
1172 OP(oscan) = CURLYN;
1173 OP(nxt1) = NOTHING; /* was OPEN. */
1174#ifdef DEBUGGING
1175 OP(nxt1 + 1) = OPTIMIZED; /* was count. */
1176 NEXT_OFF(nxt1+ 1) = 0; /* just for consistancy. */
1177 NEXT_OFF(nxt2) = 0; /* just for consistancy with CURLY. */
1178 OP(nxt) = OPTIMIZED; /* was CLOSE. */
1179 OP(nxt + 1) = OPTIMIZED; /* was count. */
1180 NEXT_OFF(nxt+ 1) = 0; /* just for consistancy. */
b81d288d 1181#endif
c277df42 1182 }
c277df42
IZ
1183 nogo:
1184
1185 /* Try optimization CURLYX => CURLYM. */
b81d288d 1186 if ( OP(oscan) == CURLYX && data
c277df42 1187 && !(data->flags & SF_HAS_PAR)
c277df42
IZ
1188 && !(data->flags & SF_HAS_EVAL)
1189 && !deltanext ) {
1190 /* XXXX How to optimize if data == 0? */
1191 /* Optimize to a simpler form. */
1192 regnode *nxt = NEXTOPER(oscan) + EXTRA_STEP_2ARGS; /* OPEN */
1193 regnode *nxt2;
1194
1195 OP(oscan) = CURLYM;
1196 while ( (nxt2 = regnext(nxt)) /* skip over embedded stuff*/
b81d288d 1197 && (OP(nxt2) != WHILEM))
c277df42
IZ
1198 nxt = nxt2;
1199 OP(nxt2) = SUCCEED; /* Whas WHILEM */
c277df42
IZ
1200 /* Need to optimize away parenths. */
1201 if (data->flags & SF_IN_PAR) {
1202 /* Set the parenth number. */
1203 regnode *nxt1 = NEXTOPER(oscan) + EXTRA_STEP_2ARGS; /* OPEN*/
1204
b81d288d 1205 if (OP(nxt) != CLOSE)
b45f050a 1206 FAIL("Panic opt close");
c277df42
IZ
1207 oscan->flags = ARG(nxt);
1208 OP(nxt1) = OPTIMIZED; /* was OPEN. */
1209 OP(nxt) = OPTIMIZED; /* was CLOSE. */
1210#ifdef DEBUGGING
1211 OP(nxt1 + 1) = OPTIMIZED; /* was count. */
1212 OP(nxt + 1) = OPTIMIZED; /* was count. */
1213 NEXT_OFF(nxt1 + 1) = 0; /* just for consistancy. */
1214 NEXT_OFF(nxt + 1) = 0; /* just for consistancy. */
b81d288d 1215#endif
c277df42
IZ
1216#if 0
1217 while ( nxt1 && (OP(nxt1) != WHILEM)) {
1218 regnode *nnxt = regnext(nxt1);
b81d288d 1219
c277df42
IZ
1220 if (nnxt == nxt) {
1221 if (reg_off_by_arg[OP(nxt1)])
1222 ARG_SET(nxt1, nxt2 - nxt1);
1223 else if (nxt2 - nxt1 < U16_MAX)
1224 NEXT_OFF(nxt1) = nxt2 - nxt1;
1225 else
1226 OP(nxt) = NOTHING; /* Cannot beautify */
1227 }
1228 nxt1 = nnxt;
1229 }
1230#endif
1231 /* Optimize again: */
b81d288d 1232 study_chunk(pRExC_state, &nxt1, &deltanext, nxt,
e1901655 1233 NULL, 0);
a0ed51b3
LW
1234 }
1235 else
c277df42 1236 oscan->flags = 0;
c277df42 1237 }
e1901655
IZ
1238 else if ((OP(oscan) == CURLYX)
1239 && (flags & SCF_WHILEM_VISITED_POS)
1240 /* See the comment on a similar expression above.
1241 However, this time it not a subexpression
1242 we care about, but the expression itself. */
1243 && (maxcount == REG_INFTY)
1244 && data && ++data->whilem_c < 16) {
1245 /* This stays as CURLYX, we can put the count/of pair. */
2c2d71f5
JH
1246 /* Find WHILEM (as in regexec.c) */
1247 regnode *nxt = oscan + NEXT_OFF(oscan);
1248
1249 if (OP(PREVOPER(nxt)) == NOTHING) /* LONGJMP */
1250 nxt += ARG(nxt);
1251 PREVOPER(nxt)->flags = data->whilem_c
830247a4 1252 | (RExC_whilem_seen << 4); /* On WHILEM */
2c2d71f5 1253 }
b81d288d 1254 if (data && fl & (SF_HAS_PAR|SF_IN_PAR))
c277df42
IZ
1255 pars++;
1256 if (flags & SCF_DO_SUBSTR) {
1257 SV *last_str = Nullsv;
1258 int counted = mincount != 0;
1259
1260 if (data->last_end > 0 && mincount != 0) { /* Ends with a string. */
5d1c421c
JH
1261#if defined(SPARC64_GCC_WORKAROUND)
1262 I32 b = 0;
1263 STRLEN l = 0;
1264 char *s = NULL;
1265 I32 old = 0;
1266
1267 if (pos_before >= data->last_start_min)
1268 b = pos_before;
1269 else
1270 b = data->last_start_min;
1271
1272 l = 0;
1273 s = SvPV(data->last_found, l);
1274 old = b - data->last_start_min;
1275
1276#else
b81d288d 1277 I32 b = pos_before >= data->last_start_min
c277df42
IZ
1278 ? pos_before : data->last_start_min;
1279 STRLEN l;
1280 char *s = SvPV(data->last_found, l);
a0ed51b3 1281 I32 old = b - data->last_start_min;
5d1c421c 1282#endif
a0ed51b3
LW
1283
1284 if (UTF)
1285 old = utf8_hop((U8*)s, old) - (U8*)s;
c277df42 1286
a0ed51b3 1287 l -= old;
c277df42 1288 /* Get the added string: */
79cb57f6 1289 last_str = newSVpvn(s + old, l);
c277df42
IZ
1290 if (deltanext == 0 && pos_before == b) {
1291 /* What was added is a constant string */
1292 if (mincount > 1) {
1293 SvGROW(last_str, (mincount * l) + 1);
b81d288d 1294 repeatcpy(SvPVX(last_str) + l,
c277df42
IZ
1295 SvPVX(last_str), l, mincount - 1);
1296 SvCUR(last_str) *= mincount;
1297 /* Add additional parts. */
b81d288d 1298 SvCUR_set(data->last_found,
c277df42
IZ
1299 SvCUR(data->last_found) - l);
1300 sv_catsv(data->last_found, last_str);
1301 data->last_end += l * (mincount - 1);
1302 }
2a8d9689
HS
1303 } else {
1304 /* start offset must point into the last copy */
1305 data->last_start_min += minnext * (mincount - 1);
4b2cff9a
HS
1306 data->last_start_max += is_inf ? 0 : (maxcount - 1)
1307 * (minnext + data->pos_delta);
c277df42
IZ
1308 }
1309 }
1310 /* It is counted once already... */
1311 data->pos_min += minnext * (mincount - counted);
1312 data->pos_delta += - counted * deltanext +
1313 (minnext + deltanext) * maxcount - minnext * mincount;
1314 if (mincount != maxcount) {
653099ff
GS
1315 /* Cannot extend fixed substrings found inside
1316 the group. */
830247a4 1317 scan_commit(pRExC_state,data);
c277df42
IZ
1318 if (mincount && last_str) {
1319 sv_setsv(data->last_found, last_str);
1320 data->last_end = data->pos_min;
b81d288d 1321 data->last_start_min =
a0ed51b3 1322 data->pos_min - CHR_SVLEN(last_str);
b81d288d
AB
1323 data->last_start_max = is_inf
1324 ? I32_MAX
c277df42 1325 : data->pos_min + data->pos_delta
a0ed51b3 1326 - CHR_SVLEN(last_str);
c277df42
IZ
1327 }
1328 data->longest = &(data->longest_float);
1329 }
aca2d497 1330 SvREFCNT_dec(last_str);
c277df42 1331 }
405ff068 1332 if (data && (fl & SF_HAS_EVAL))
c277df42
IZ
1333 data->flags |= SF_HAS_EVAL;
1334 optimize_curly_tail:
c277df42 1335 if (OP(oscan) != CURLYX) {
22c35a8c 1336 while (PL_regkind[(U8)OP(next = regnext(oscan))] == NOTHING
c277df42
IZ
1337 && NEXT_OFF(next))
1338 NEXT_OFF(oscan) += NEXT_OFF(next);
1339 }
c277df42 1340 continue;
653099ff 1341 default: /* REF and CLUMP only? */
c277df42 1342 if (flags & SCF_DO_SUBSTR) {
830247a4 1343 scan_commit(pRExC_state,data); /* Cannot expect anything... */
c277df42
IZ
1344 data->longest = &(data->longest_float);
1345 }
aca2d497 1346 is_inf = is_inf_internal = 1;
653099ff 1347 if (flags & SCF_DO_STCLASS_OR)
830247a4 1348 cl_anything(pRExC_state, data->start_class);
653099ff 1349 flags &= ~SCF_DO_STCLASS;
c277df42
IZ
1350 break;
1351 }
a0ed51b3 1352 }
ffc61ed2 1353 else if (strchr((char*)PL_simple,OP(scan))) {
9c5ffd7c 1354 int value = 0;
653099ff 1355
c277df42 1356 if (flags & SCF_DO_SUBSTR) {
830247a4 1357 scan_commit(pRExC_state,data);
c277df42
IZ
1358 data->pos_min++;
1359 }
1360 min++;
653099ff
GS
1361 if (flags & SCF_DO_STCLASS) {
1362 data->start_class->flags &= ~ANYOF_EOS; /* No match on empty */
1363
1364 /* Some of the logic below assumes that switching
1365 locale on will only add false positives. */
1366 switch (PL_regkind[(U8)OP(scan)]) {
653099ff 1367 case SANY:
653099ff
GS
1368 default:
1369 do_default:
1370 /* Perl_croak(aTHX_ "panic: unexpected simple REx opcode %d", OP(scan)); */
1371 if (flags & SCF_DO_STCLASS_OR) /* Allow everything */
830247a4 1372 cl_anything(pRExC_state, data->start_class);
653099ff
GS
1373 break;
1374 case REG_ANY:
1375 if (OP(scan) == SANY)
1376 goto do_default;
1377 if (flags & SCF_DO_STCLASS_OR) { /* Everything but \n */
1378 value = (ANYOF_BITMAP_TEST(data->start_class,'\n')
1379 || (data->start_class->flags & ANYOF_CLASS));
830247a4 1380 cl_anything(pRExC_state, data->start_class);
653099ff
GS
1381 }
1382 if (flags & SCF_DO_STCLASS_AND || !value)
1383 ANYOF_BITMAP_CLEAR(data->start_class,'\n');
1384 break;
1385 case ANYOF:
1386 if (flags & SCF_DO_STCLASS_AND)
1387 cl_and(data->start_class,
1388 (struct regnode_charclass_class*)scan);
1389 else
830247a4 1390 cl_or(pRExC_state, data->start_class,
653099ff
GS
1391 (struct regnode_charclass_class*)scan);
1392 break;
1393 case ALNUM:
1394 if (flags & SCF_DO_STCLASS_AND) {
1395 if (!(data->start_class->flags & ANYOF_LOCALE)) {
1396 ANYOF_CLASS_CLEAR(data->start_class,ANYOF_NALNUM);
1397 for (value = 0; value < 256; value++)
1398 if (!isALNUM(value))
1399 ANYOF_BITMAP_CLEAR(data->start_class, value);
1400 }
1401 }
1402 else {
1403 if (data->start_class->flags & ANYOF_LOCALE)
1404 ANYOF_CLASS_SET(data->start_class,ANYOF_ALNUM);
1405 else {
1406 for (value = 0; value < 256; value++)
1407 if (isALNUM(value))
b81d288d 1408 ANYOF_BITMAP_SET(data->start_class, value);
653099ff
GS
1409 }
1410 }
1411 break;
1412 case ALNUML:
1413 if (flags & SCF_DO_STCLASS_AND) {
1414 if (data->start_class->flags & ANYOF_LOCALE)
1415 ANYOF_CLASS_CLEAR(data->start_class,ANYOF_NALNUM);
1416 }
1417 else {
1418 ANYOF_CLASS_SET(data->start_class,ANYOF_ALNUM);
1419 data->start_class->flags |= ANYOF_LOCALE;
1420 }
1421 break;
1422 case NALNUM:
1423 if (flags & SCF_DO_STCLASS_AND) {
1424 if (!(data->start_class->flags & ANYOF_LOCALE)) {
1425 ANYOF_CLASS_CLEAR(data->start_class,ANYOF_ALNUM);
1426 for (value = 0; value < 256; value++)
1427 if (isALNUM(value))
1428 ANYOF_BITMAP_CLEAR(data->start_class, value);
1429 }
1430 }
1431 else {
1432 if (data->start_class->flags & ANYOF_LOCALE)
1433 ANYOF_CLASS_SET(data->start_class,ANYOF_NALNUM);
1434 else {
1435 for (value = 0; value < 256; value++)
1436 if (!isALNUM(value))
b81d288d 1437 ANYOF_BITMAP_SET(data->start_class, value);
653099ff
GS
1438 }
1439 }
1440 break;
1441 case NALNUML:
1442 if (flags & SCF_DO_STCLASS_AND) {
1443 if (data->start_class->flags & ANYOF_LOCALE)
1444 ANYOF_CLASS_CLEAR(data->start_class,ANYOF_ALNUM);
1445 }
1446 else {
1447 data->start_class->flags |= ANYOF_LOCALE;
1448 ANYOF_CLASS_SET(data->start_class,ANYOF_NALNUM);
1449 }
1450 break;
1451 case SPACE:
1452 if (flags & SCF_DO_STCLASS_AND) {
1453 if (!(data->start_class->flags & ANYOF_LOCALE)) {
1454 ANYOF_CLASS_CLEAR(data->start_class,ANYOF_NSPACE);
1455 for (value = 0; value < 256; value++)
1456 if (!isSPACE(value))
1457 ANYOF_BITMAP_CLEAR(data->start_class, value);
1458 }
1459 }
1460 else {
1461 if (data->start_class->flags & ANYOF_LOCALE)
1462 ANYOF_CLASS_SET(data->start_class,ANYOF_SPACE);
1463 else {
1464 for (value = 0; value < 256; value++)
1465 if (isSPACE(value))
b81d288d 1466 ANYOF_BITMAP_SET(data->start_class, value);
653099ff
GS
1467 }
1468 }
1469 break;
1470 case SPACEL:
1471 if (flags & SCF_DO_STCLASS_AND) {
1472 if (data->start_class->flags & ANYOF_LOCALE)
1473 ANYOF_CLASS_CLEAR(data->start_class,ANYOF_NSPACE);
1474 }
1475 else {
1476 data->start_class->flags |= ANYOF_LOCALE;
1477 ANYOF_CLASS_SET(data->start_class,ANYOF_SPACE);
1478 }
1479 break;
1480 case NSPACE:
1481 if (flags & SCF_DO_STCLASS_AND) {
1482 if (!(data->start_class->flags & ANYOF_LOCALE)) {
1483 ANYOF_CLASS_CLEAR(data->start_class,ANYOF_SPACE);
1484 for (value = 0; value < 256; value++)
1485 if (isSPACE(value))
1486 ANYOF_BITMAP_CLEAR(data->start_class, value);
1487 }
1488 }
1489 else {
1490 if (data->start_class->flags & ANYOF_LOCALE)
1491 ANYOF_CLASS_SET(data->start_class,ANYOF_NSPACE);
1492 else {
1493 for (value = 0; value < 256; value++)
1494 if (!isSPACE(value))
b81d288d 1495 ANYOF_BITMAP_SET(data->start_class, value);
653099ff
GS
1496 }
1497 }
1498 break;
1499 case NSPACEL:
1500 if (flags & SCF_DO_STCLASS_AND) {
1501 if (data->start_class->flags & ANYOF_LOCALE) {
1502 ANYOF_CLASS_CLEAR(data->start_class,ANYOF_SPACE);
1503 for (value = 0; value < 256; value++)
1504 if (!isSPACE(value))
1505 ANYOF_BITMAP_CLEAR(data->start_class, value);
1506 }
1507 }
1508 else {
1509 data->start_class->flags |= ANYOF_LOCALE;
1510 ANYOF_CLASS_SET(data->start_class,ANYOF_NSPACE);
1511 }
1512 break;
1513 case DIGIT:
1514 if (flags & SCF_DO_STCLASS_AND) {
1515 ANYOF_CLASS_CLEAR(data->start_class,ANYOF_NDIGIT);
1516 for (value = 0; value < 256; value++)
1517 if (!isDIGIT(value))
1518 ANYOF_BITMAP_CLEAR(data->start_class, value);
1519 }
1520 else {
1521 if (data->start_class->flags & ANYOF_LOCALE)
1522 ANYOF_CLASS_SET(data->start_class,ANYOF_DIGIT);
1523 else {
1524 for (value = 0; value < 256; value++)
1525 if (isDIGIT(value))
b81d288d 1526 ANYOF_BITMAP_SET(data->start_class, value);
653099ff
GS
1527 }
1528 }
1529 break;
1530 case NDIGIT:
1531 if (flags & SCF_DO_STCLASS_AND) {
1532 ANYOF_CLASS_CLEAR(data->start_class,ANYOF_DIGIT);
1533 for (value = 0; value < 256; value++)
1534 if (isDIGIT(value))
1535 ANYOF_BITMAP_CLEAR(data->start_class, value);
1536 }
1537 else {
1538 if (data->start_class->flags & ANYOF_LOCALE)
1539 ANYOF_CLASS_SET(data->start_class,ANYOF_NDIGIT);
1540 else {
1541 for (value = 0; value < 256; value++)
1542 if (!isDIGIT(value))
b81d288d 1543 ANYOF_BITMAP_SET(data->start_class, value);
653099ff
GS
1544 }
1545 }
1546 break;
1547 }
1548 if (flags & SCF_DO_STCLASS_OR)
1549 cl_and(data->start_class, &and_with);
1550 flags &= ~SCF_DO_STCLASS;
1551 }
a0ed51b3 1552 }
22c35a8c 1553 else if (PL_regkind[(U8)OP(scan)] == EOL && flags & SCF_DO_SUBSTR) {
c277df42
IZ
1554 data->flags |= (OP(scan) == MEOL
1555 ? SF_BEFORE_MEOL
1556 : SF_BEFORE_SEOL);
a0ed51b3 1557 }
653099ff
GS
1558 else if ( PL_regkind[(U8)OP(scan)] == BRANCHJ
1559 /* Lookbehind, or need to calculate parens/evals/stclass: */
1560 && (scan->flags || data || (flags & SCF_DO_STCLASS))
c277df42 1561 && (OP(scan) == IFMATCH || OP(scan) == UNLESSM)) {
653099ff 1562 /* Lookahead/lookbehind */
cb434fcc 1563 I32 deltanext, minnext, fake = 0;
c277df42 1564 regnode *nscan;
653099ff
GS
1565 struct regnode_charclass_class intrnl;
1566 int f = 0;
c277df42
IZ
1567
1568 data_fake.flags = 0;
b81d288d 1569 if (data) {
2c2d71f5 1570 data_fake.whilem_c = data->whilem_c;
cb434fcc
IZ
1571 data_fake.last_closep = data->last_closep;
1572 }
1573 else
1574 data_fake.last_closep = &fake;
653099ff
GS
1575 if ( flags & SCF_DO_STCLASS && !scan->flags
1576 && OP(scan) == IFMATCH ) { /* Lookahead */
830247a4 1577 cl_init(pRExC_state, &intrnl);
653099ff 1578 data_fake.start_class = &intrnl;
e1901655 1579 f |= SCF_DO_STCLASS_AND;
653099ff 1580 }
e1901655
IZ
1581 if (flags & SCF_WHILEM_VISITED_POS)
1582 f |= SCF_WHILEM_VISITED_POS;
c277df42
IZ
1583 next = regnext(scan);
1584 nscan = NEXTOPER(NEXTOPER(scan));
830247a4 1585 minnext = study_chunk(pRExC_state, &nscan, &deltanext, last, &data_fake, f);
c277df42
IZ
1586 if (scan->flags) {
1587 if (deltanext) {
9baa0206 1588 vFAIL("Variable length lookbehind not implemented");
a0ed51b3
LW
1589 }
1590 else if (minnext > U8_MAX) {
9baa0206 1591 vFAIL2("Lookbehind longer than %"UVuf" not implemented", (UV)U8_MAX);
c277df42
IZ
1592 }
1593 scan->flags = minnext;
1594 }
1595 if (data && data_fake.flags & (SF_HAS_PAR|SF_IN_PAR))
1596 pars++;
405ff068 1597 if (data && (data_fake.flags & SF_HAS_EVAL))
c277df42 1598 data->flags |= SF_HAS_EVAL;
2c2d71f5
JH
1599 if (data)
1600 data->whilem_c = data_fake.whilem_c;
e1901655 1601 if (f & SCF_DO_STCLASS_AND) {
653099ff
GS
1602 int was = (data->start_class->flags & ANYOF_EOS);
1603
1604 cl_and(data->start_class, &intrnl);
1605 if (was)
1606 data->start_class->flags |= ANYOF_EOS;
1607 }
a0ed51b3
LW
1608 }
1609 else if (OP(scan) == OPEN) {
c277df42 1610 pars++;
a0ed51b3 1611 }
cb434fcc
IZ
1612 else if (OP(scan) == CLOSE) {
1613 if (ARG(scan) == is_par) {
1614 next = regnext(scan);
c277df42 1615
cb434fcc
IZ
1616 if ( next && (OP(next) != WHILEM) && next < last)
1617 is_par = 0; /* Disable optimization */
1618 }
1619 if (data)
1620 *(data->last_closep) = ARG(scan);
a0ed51b3
LW
1621 }
1622 else if (OP(scan) == EVAL) {
c277df42
IZ
1623 if (data)
1624 data->flags |= SF_HAS_EVAL;
1625 }
96776eda 1626 else if (OP(scan) == LOGICAL && scan->flags == 2) { /* Embedded follows */
0f5d15d6 1627 if (flags & SCF_DO_SUBSTR) {
830247a4 1628 scan_commit(pRExC_state,data);
0f5d15d6
IZ
1629 data->longest = &(data->longest_float);
1630 }
1631 is_inf = is_inf_internal = 1;
653099ff 1632 if (flags & SCF_DO_STCLASS_OR) /* Allow everything */
830247a4 1633 cl_anything(pRExC_state, data->start_class);
96776eda 1634 flags &= ~SCF_DO_STCLASS;
0f5d15d6 1635 }
c277df42
IZ
1636 /* Else: zero-length, ignore. */
1637 scan = regnext(scan);
1638 }
1639
1640 finish:
1641 *scanp = scan;
aca2d497 1642 *deltap = is_inf_internal ? I32_MAX : delta;
b81d288d 1643 if (flags & SCF_DO_SUBSTR && is_inf)
c277df42
IZ
1644 data->pos_delta = I32_MAX - data->pos_min;
1645 if (is_par > U8_MAX)
1646 is_par = 0;
1647 if (is_par && pars==1 && data) {
1648 data->flags |= SF_IN_PAR;
1649 data->flags &= ~SF_HAS_PAR;
a0ed51b3
LW
1650 }
1651 else if (pars && data) {
c277df42
IZ
1652 data->flags |= SF_HAS_PAR;
1653 data->flags &= ~SF_IN_PAR;
1654 }
653099ff
GS
1655 if (flags & SCF_DO_STCLASS_OR)
1656 cl_and(data->start_class, &and_with);
c277df42
IZ
1657 return min;
1658}
1659
76e3520e 1660STATIC I32
830247a4 1661S_add_data(pTHX_ RExC_state_t *pRExC_state, I32 n, char *s)
c277df42 1662{
830247a4 1663 if (RExC_rx->data) {
b81d288d
AB
1664 Renewc(RExC_rx->data,
1665 sizeof(*RExC_rx->data) + sizeof(void*) * (RExC_rx->data->count + n - 1),
c277df42 1666 char, struct reg_data);
830247a4
IZ
1667 Renew(RExC_rx->data->what, RExC_rx->data->count + n, U8);
1668 RExC_rx->data->count += n;
a0ed51b3
LW
1669 }
1670 else {
830247a4 1671 Newc(1207, RExC_rx->data, sizeof(*RExC_rx->data) + sizeof(void*) * (n - 1),
c277df42 1672 char, struct reg_data);
830247a4
IZ
1673 New(1208, RExC_rx->data->what, n, U8);
1674 RExC_rx->data->count = n;
c277df42 1675 }
830247a4
IZ
1676 Copy(s, RExC_rx->data->what + RExC_rx->data->count - n, n, U8);
1677 return RExC_rx->data->count - n;
c277df42
IZ
1678}
1679
d88dccdf 1680void
864dbfa3 1681Perl_reginitcolors(pTHX)
d88dccdf 1682{
d88dccdf
IZ
1683 int i = 0;
1684 char *s = PerlEnv_getenv("PERL_RE_COLORS");
b81d288d 1685
d88dccdf
IZ
1686 if (s) {
1687 PL_colors[0] = s = savepv(s);
1688 while (++i < 6) {
1689 s = strchr(s, '\t');
1690 if (s) {
1691 *s = '\0';
1692 PL_colors[i] = ++s;
1693 }
1694 else
c712d376 1695 PL_colors[i] = s = "";
d88dccdf
IZ
1696 }
1697 } else {
b81d288d 1698 while (i < 6)
d88dccdf
IZ
1699 PL_colors[i++] = "";
1700 }
1701 PL_colorset = 1;
1702}
1703
8615cb43 1704
a687059c 1705/*
e50aee73 1706 - pregcomp - compile a regular expression into internal code
a687059c
LW
1707 *
1708 * We can't allocate space until we know how big the compiled form will be,
1709 * but we can't compile it (and thus know how big it is) until we've got a
1710 * place to put the code. So we cheat: we compile it twice, once with code
1711 * generation turned off and size counting turned on, and once "for real".
1712 * This also means that we don't allocate space until we are sure that the
1713 * thing really will compile successfully, and we never have to move the
1714 * code and thus invalidate pointers into it. (Note that it has to be in
1715 * one piece because free() must be able to free it all.) [NB: not true in perl]
1716 *
1717 * Beware that the optimization-preparation code in here knows about some
1718 * of the structure of the compiled regexp. [I'll say.]
1719 */
1720regexp *
864dbfa3 1721Perl_pregcomp(pTHX_ char *exp, char *xend, PMOP *pm)
a687059c 1722{
a0d0e21e 1723 register regexp *r;
c277df42 1724 regnode *scan;
c277df42 1725 regnode *first;
a0d0e21e 1726 I32 flags;
a0d0e21e
LW
1727 I32 minlen = 0;
1728 I32 sawplus = 0;
1729 I32 sawopen = 0;
2c2d71f5 1730 scan_data_t data;
830247a4
IZ
1731 RExC_state_t RExC_state;
1732 RExC_state_t *pRExC_state = &RExC_state;
a0d0e21e
LW
1733
1734 if (exp == NULL)
c277df42 1735 FAIL("NULL regexp argument");
a0d0e21e 1736
a5961de5 1737 RExC_utf8 = pm->op_pmdynflags & PMdf_CMP_UTF8;
a0ed51b3 1738
5cfc7842 1739 RExC_precomp = exp;
a5961de5
JH
1740 DEBUG_r({
1741 if (!PL_colorset) reginitcolors();
1742 PerlIO_printf(Perl_debug_log, "%sCompiling REx%s `%s%*s%s'\n",
1743 PL_colors[4],PL_colors[5],PL_colors[0],
1744 (int)(xend - exp), RExC_precomp, PL_colors[1]);
1745 });
830247a4
IZ
1746 RExC_flags16 = pm->op_pmflags;
1747 RExC_sawback = 0;
bbce6d69 1748
830247a4
IZ
1749 RExC_seen = 0;
1750 RExC_seen_zerolen = *exp == '^' ? -1 : 0;
1751 RExC_seen_evals = 0;
1752 RExC_extralen = 0;
c277df42 1753
bbce6d69 1754 /* First pass: determine size, legality. */
830247a4 1755 RExC_parse = exp;
fac92740 1756 RExC_start = exp;
830247a4
IZ
1757 RExC_end = xend;
1758 RExC_naughty = 0;
1759 RExC_npar = 1;
1760 RExC_size = 0L;
1761 RExC_emit = &PL_regdummy;
1762 RExC_whilem_seen = 0;
85ddcde9
JH
1763#if 0 /* REGC() is (currently) a NOP at the first pass.
1764 * Clever compilers notice this and complain. --jhi */
830247a4 1765 REGC((U8)REG_MAGIC, (char*)RExC_emit);
85ddcde9 1766#endif
830247a4 1767 if (reg(pRExC_state, 0, &flags) == NULL) {
830247a4 1768 RExC_precomp = Nullch;
a0d0e21e
LW
1769 return(NULL);
1770 }
830247a4 1771 DEBUG_r(PerlIO_printf(Perl_debug_log, "size %"IVdf" ", (IV)RExC_size));
c277df42 1772
c277df42
IZ
1773 /* Small enough for pointer-storage convention?
1774 If extralen==0, this means that we will not need long jumps. */
830247a4
IZ
1775 if (RExC_size >= 0x10000L && RExC_extralen)
1776 RExC_size += RExC_extralen;
c277df42 1777 else
830247a4
IZ
1778 RExC_extralen = 0;
1779 if (RExC_whilem_seen > 15)
1780 RExC_whilem_seen = 15;
a0d0e21e 1781
bbce6d69 1782 /* Allocate space and initialize. */
830247a4 1783 Newc(1001, r, sizeof(regexp) + (unsigned)RExC_size * sizeof(regnode),
c277df42 1784 char, regexp);
a0d0e21e 1785 if (r == NULL)
b45f050a
JF
1786 FAIL("Regexp out of space");
1787
0f79a09d
GS
1788#ifdef DEBUGGING
1789 /* avoid reading uninitialized memory in DEBUGGING code in study_chunk() */
830247a4 1790 Zero(r, sizeof(regexp) + (unsigned)RExC_size * sizeof(regnode), char);
0f79a09d 1791#endif
c277df42 1792 r->refcnt = 1;
bbce6d69 1793 r->prelen = xend - exp;
5cfc7842 1794 r->precomp = savepvn(RExC_precomp, r->prelen);
cf93c79d
IZ
1795 r->subbeg = NULL;
1796 r->reganch = pm->op_pmflags & PMf_COMPILETIME;
830247a4 1797 r->nparens = RExC_npar - 1; /* set early to validate backrefs */
4327152a
IZ
1798
1799 r->substrs = 0; /* Useful during FAIL. */
1800 r->startp = 0; /* Useful during FAIL. */
1801 r->endp = 0; /* Useful during FAIL. */
1802
fac92740
MJD
1803 Newz(1304, r->offsets, 2*RExC_size+1, U32); /* MJD 20001228 */
1804 if (r->offsets) {
1805 r->offsets[0] = RExC_size;
1806 }
1807 DEBUG_r(PerlIO_printf(Perl_debug_log,
392fbf5d 1808 "%s %"UVuf" bytes for offset annotations.\n",
fac92740 1809 r->offsets ? "Got" : "Couldn't get",
392fbf5d 1810 (UV)((2*RExC_size+1) * sizeof(U32))));
fac92740 1811
830247a4 1812 RExC_rx = r;
bbce6d69
PP
1813
1814 /* Second pass: emit code. */
055bb491 1815 RExC_flags16 = pm->op_pmflags; /* don't let top level (?i) bleed */
830247a4
IZ
1816 RExC_parse = exp;
1817 RExC_end = xend;
1818 RExC_naughty = 0;
1819 RExC_npar = 1;
fac92740 1820 RExC_emit_start = r->program;
830247a4 1821 RExC_emit = r->program;
2cd61cdb 1822 /* Store the count of eval-groups for security checks: */
830247a4
IZ
1823 RExC_emit->next_off = ((RExC_seen_evals > U16_MAX) ? U16_MAX : RExC_seen_evals);
1824 REGC((U8)REG_MAGIC, (char*) RExC_emit++);
c277df42 1825 r->data = 0;
830247a4 1826 if (reg(pRExC_state, 0, &flags) == NULL)
a0d0e21e
LW
1827 return(NULL);
1828
1829 /* Dig out information for optimizations. */
cf93c79d 1830 r->reganch = pm->op_pmflags & PMf_COMPILETIME; /* Again? */
830247a4 1831 pm->op_pmflags = RExC_flags16;
a0ed51b3 1832 if (UTF)
5ff6fc6d 1833 r->reganch |= ROPT_UTF8; /* Unicode in it? */
c277df42 1834 r->regstclass = NULL;
830247a4 1835 if (RExC_naughty >= 10) /* Probably an expensive pattern. */
a0ed51b3 1836 r->reganch |= ROPT_NAUGHTY;
c277df42 1837 scan = r->program + 1; /* First BRANCH. */
2779dcf1
IZ
1838
1839 /* XXXX To minimize changes to RE engine we always allocate
1840 3-units-long substrs field. */
1841 Newz(1004, r->substrs, 1, struct reg_substr_data);
1842
2c2d71f5 1843 StructCopy(&zero_scan_data, &data, scan_data_t);
653099ff 1844 /* XXXX Should not we check for something else? Usually it is OPEN1... */
c277df42 1845 if (OP(scan) != BRANCH) { /* Only one top-level choice. */
c277df42 1846 I32 fake;
c5254dd6 1847 STRLEN longest_float_length, longest_fixed_length;
653099ff
GS
1848 struct regnode_charclass_class ch_class;
1849 int stclass_flag;
cb434fcc 1850 I32 last_close = 0;
a0d0e21e
LW
1851
1852 first = scan;
c277df42 1853 /* Skip introductions and multiplicators >= 1. */
a0d0e21e 1854 while ((OP(first) == OPEN && (sawopen = 1)) ||
653099ff 1855 /* An OR of *one* alternative - should not happen now. */
a0d0e21e
LW
1856 (OP(first) == BRANCH && OP(regnext(first)) != BRANCH) ||
1857 (OP(first) == PLUS) ||
1858 (OP(first) == MINMOD) ||
653099ff 1859 /* An {n,m} with n>0 */
22c35a8c 1860 (PL_regkind[(U8)OP(first)] == CURLY && ARG1(first) > 0) ) {
a0d0e21e
LW
1861 if (OP(first) == PLUS)
1862 sawplus = 1;
1863 else
1864 first += regarglen[(U8)OP(first)];
1865 first = NEXTOPER(first);
a687059c
LW
1866 }
1867
a0d0e21e
LW
1868 /* Starting-point info. */
1869 again:
653099ff 1870 if (PL_regkind[(U8)OP(first)] == EXACT) {
1aa99e6b
IH
1871 if (OP(first) == EXACT)
1872 ; /* Empty, get anchored substr later. */
1873 else if ((OP(first) == EXACTF || OP(first) == EXACTFL))
b3c9acc1
IZ
1874 r->regstclass = first;
1875 }
653099ff 1876 else if (strchr((char*)PL_simple,OP(first)))
a0d0e21e 1877 r->regstclass = first;
22c35a8c
GS
1878 else if (PL_regkind[(U8)OP(first)] == BOUND ||
1879 PL_regkind[(U8)OP(first)] == NBOUND)
a0d0e21e 1880 r->regstclass = first;
22c35a8c 1881 else if (PL_regkind[(U8)OP(first)] == BOL) {
cad2e5aa
JH
1882 r->reganch |= (OP(first) == MBOL
1883 ? ROPT_ANCH_MBOL
1884 : (OP(first) == SBOL
1885 ? ROPT_ANCH_SBOL
1886 : ROPT_ANCH_BOL));
a0d0e21e 1887 first = NEXTOPER(first);
774d564b
PP
1888 goto again;
1889 }
1890 else if (OP(first) == GPOS) {
1891 r->reganch |= ROPT_ANCH_GPOS;
1892 first = NEXTOPER(first);
1893 goto again;
a0d0e21e 1894 }
e09294f4 1895 else if (!sawopen && (OP(first) == STAR &&
22c35a8c 1896 PL_regkind[(U8)OP(NEXTOPER(first))] == REG_ANY) &&
a0d0e21e
LW
1897 !(r->reganch & ROPT_ANCH) )
1898 {
1899 /* turn .* into ^.* with an implied $*=1 */
cad2e5aa
JH
1900 int type = OP(NEXTOPER(first));
1901
ffc61ed2 1902 if (type == REG_ANY)
cad2e5aa
JH
1903 type = ROPT_ANCH_MBOL;
1904 else
1905 type = ROPT_ANCH_SBOL;
1906
1907 r->reganch |= type | ROPT_IMPLICIT;
a0d0e21e 1908 first = NEXTOPER(first);
774d564b 1909 goto again;
a0d0e21e 1910 }
b81d288d 1911 if (sawplus && (!sawopen || !RExC_sawback)
830247a4 1912 && !(RExC_seen & REG_SEEN_EVAL)) /* May examine pos and $& */
cad2e5aa
JH
1913 /* x+ must match at the 1st pos of run of x's */
1914 r->reganch |= ROPT_SKIP;
a0d0e21e 1915
c277df42 1916 /* Scan is after the zeroth branch, first is atomic matcher. */
b81d288d 1917 DEBUG_r(PerlIO_printf(Perl_debug_log, "first at %"IVdf"\n",
b900a521 1918 (IV)(first - scan + 1)));
a0d0e21e
LW
1919 /*
1920 * If there's something expensive in the r.e., find the
1921 * longest literal string that must appear and make it the
1922 * regmust. Resolve ties in favor of later strings, since
1923 * the regstart check works with the beginning of the r.e.
1924 * and avoiding duplication strengthens checking. Not a
1925 * strong reason, but sufficient in the absence of others.
1926 * [Now we resolve ties in favor of the earlier string if
c277df42 1927 * it happens that c_offset_min has been invalidated, since the
a0d0e21e
LW
1928 * earlier string may buy us something the later one won't.]
1929 */
a0d0e21e 1930 minlen = 0;
a687059c 1931
79cb57f6
GS
1932 data.longest_fixed = newSVpvn("",0);
1933 data.longest_float = newSVpvn("",0);
1934 data.last_found = newSVpvn("",0);
c277df42
IZ
1935 data.longest = &(data.longest_fixed);
1936 first = scan;
653099ff 1937 if (!r->regstclass) {
830247a4 1938 cl_init(pRExC_state, &ch_class);
653099ff
GS
1939 data.start_class = &ch_class;
1940 stclass_flag = SCF_DO_STCLASS_AND;
1941 } else /* XXXX Check for BOUND? */
1942 stclass_flag = 0;
cb434fcc 1943 data.last_closep = &last_close;
653099ff 1944
830247a4 1945 minlen = study_chunk(pRExC_state, &first, &fake, scan + RExC_size, /* Up to end */
e1901655 1946 &data, SCF_DO_SUBSTR | SCF_WHILEM_VISITED_POS | stclass_flag);
830247a4 1947 if ( RExC_npar == 1 && data.longest == &(data.longest_fixed)
b81d288d 1948 && data.last_start_min == 0 && data.last_end > 0
830247a4
IZ
1949 && !RExC_seen_zerolen
1950 && (!(RExC_seen & REG_SEEN_GPOS) || (r->reganch & ROPT_ANCH_GPOS)))
c277df42 1951 r->reganch |= ROPT_CHECK_ALL;
830247a4 1952 scan_commit(pRExC_state, &data);
c277df42
IZ
1953 SvREFCNT_dec(data.last_found);
1954
a0ed51b3 1955 longest_float_length = CHR_SVLEN(data.longest_float);
c5254dd6 1956 if (longest_float_length
c277df42
IZ
1957 || (data.flags & SF_FL_BEFORE_EOL
1958 && (!(data.flags & SF_FL_BEFORE_MEOL)
830247a4 1959 || (RExC_flags16 & PMf_MULTILINE)))) {
cf93c79d
IZ
1960 int t;
1961
a0ed51b3 1962 if (SvCUR(data.longest_fixed) /* ok to leave SvCUR */
aca2d497
IZ
1963 && data.offset_fixed == data.offset_float_min
1964 && SvCUR(data.longest_fixed) == SvCUR(data.longest_float))
1965 goto remove_float; /* As in (a)+. */
1966
c277df42
IZ
1967 r->float_substr = data.longest_float;
1968 r->float_min_offset = data.offset_float_min;
1969 r->float_max_offset = data.offset_float_max;
cf93c79d
IZ
1970 t = (data.flags & SF_FL_BEFORE_EOL /* Can't have SEOL and MULTI */
1971 && (!(data.flags & SF_FL_BEFORE_MEOL)
830247a4 1972 || (RExC_flags16 & PMf_MULTILINE)));
cf93c79d 1973 fbm_compile(r->float_substr, t ? FBMcf_TAIL : 0);
a0ed51b3
LW
1974 }
1975 else {
aca2d497 1976 remove_float:
c277df42
IZ
1977 r->float_substr = Nullsv;
1978 SvREFCNT_dec(data.longest_float);
c5254dd6 1979 longest_float_length = 0;
a0d0e21e 1980 }
c277df42 1981
a0ed51b3 1982 longest_fixed_length = CHR_SVLEN(data.longest_fixed);
c5254dd6 1983 if (longest_fixed_length
c277df42
IZ
1984 || (data.flags & SF_FIX_BEFORE_EOL /* Cannot have SEOL and MULTI */
1985 && (!(data.flags & SF_FIX_BEFORE_MEOL)
830247a4 1986 || (RExC_flags16 & PMf_MULTILINE)))) {
cf93c79d
IZ
1987 int t;
1988
c277df42
IZ
1989 r->anchored_substr = data.longest_fixed;
1990 r->anchored_offset = data.offset_fixed;
cf93c79d
IZ
1991 t = (data.flags & SF_FIX_BEFORE_EOL /* Can't have SEOL and MULTI */
1992 && (!(data.flags & SF_FIX_BEFORE_MEOL)
830247a4 1993 || (RExC_flags16 & PMf_MULTILINE)));
cf93c79d 1994 fbm_compile(r->anchored_substr, t ? FBMcf_TAIL : 0);
a0ed51b3
LW
1995 }
1996 else {
c277df42
IZ
1997 r->anchored_substr = Nullsv;
1998 SvREFCNT_dec(data.longest_fixed);
c5254dd6 1999 longest_fixed_length = 0;
a0d0e21e 2000 }
b81d288d 2001 if (r->regstclass
ffc61ed2 2002 && (OP(r->regstclass) == REG_ANY || OP(r->regstclass) == SANY))
653099ff
GS
2003 r->regstclass = NULL;
2004 if ((!r->anchored_substr || r->anchored_offset) && stclass_flag
2005 && !(data.start_class->flags & ANYOF_EOS)
2006 && !cl_is_anything(data.start_class)) {
830247a4 2007 I32 n = add_data(pRExC_state, 1, "f");
653099ff 2008
b81d288d 2009 New(1006, RExC_rx->data->data[n], 1,
653099ff
GS
2010 struct regnode_charclass_class);
2011 StructCopy(data.start_class,
830247a4 2012 (struct regnode_charclass_class*)RExC_rx->data->data[n],
653099ff 2013 struct regnode_charclass_class);
830247a4 2014 r->regstclass = (regnode*)RExC_rx->data->data[n];
653099ff 2015 r->reganch &= ~ROPT_SKIP; /* Used in find_byclass(). */
ffc61ed2 2016 PL_regdata = r->data; /* for regprop() */
9c5ffd7c
JH
2017 DEBUG_r({ SV *sv = sv_newmortal();
2018 regprop(sv, (regnode*)data.start_class);
2019 PerlIO_printf(Perl_debug_log,
2020 "synthetic stclass `%s'.\n",
2021 SvPVX(sv));});
653099ff 2022 }
c277df42
IZ
2023
2024 /* A temporary algorithm prefers floated substr to fixed one to dig more info. */
c5254dd6 2025 if (longest_fixed_length > longest_float_length) {
c277df42
IZ
2026 r->check_substr = r->anchored_substr;
2027 r->check_offset_min = r->check_offset_max = r->anchored_offset;
2028 if (r->reganch & ROPT_ANCH_SINGLE)
2029 r->reganch |= ROPT_NOSCAN;
a0ed51b3
LW
2030 }
2031 else {
c277df42
IZ
2032 r->check_substr = r->float_substr;
2033 r->check_offset_min = data.offset_float_min;
2034 r->check_offset_max = data.offset_float_max;
a0d0e21e 2035 }
30382c73
IZ
2036 /* XXXX Currently intuiting is not compatible with ANCH_GPOS.
2037 This should be changed ASAP! */
2038 if (r->check_substr && !(r->reganch & ROPT_ANCH_GPOS)) {
cad2e5aa
JH
2039 r->reganch |= RE_USE_INTUIT;
2040 if (SvTAIL(r->check_substr))
2041 r->reganch |= RE_INTUIT_TAIL;
2042 }
a0ed51b3
LW
2043 }
2044 else {
c277df42
IZ
2045 /* Several toplevels. Best we can is to set minlen. */
2046 I32 fake;
653099ff 2047 struct regnode_charclass_class ch_class;
cb434fcc 2048 I32 last_close = 0;
c277df42
IZ
2049
2050 DEBUG_r(PerlIO_printf(Perl_debug_log, "\n"));
2051 scan = r->program + 1;
830247a4 2052 cl_init(pRExC_state, &ch_class);
653099ff 2053 data.start_class = &ch_class;
cb434fcc 2054 data.last_closep = &last_close;
e1901655 2055 minlen = study_chunk(pRExC_state, &scan, &fake, scan + RExC_size, &data, SCF_DO_STCLASS_AND|SCF_WHILEM_VISITED_POS);
c277df42 2056 r->check_substr = r->anchored_substr = r->float_substr = Nullsv;
653099ff
GS
2057 if (!(data.start_class->flags & ANYOF_EOS)
2058 && !cl_is_anything(data.start_class)) {
830247a4 2059 I32 n = add_data(pRExC_state, 1, "f");
653099ff 2060
b81d288d 2061 New(1006, RExC_rx->data->data[n], 1,
653099ff
GS
2062 struct regnode_charclass_class);
2063 StructCopy(data.start_class,
830247a4 2064 (struct regnode_charclass_class*)RExC_rx->data->data[n],
653099ff 2065 struct regnode_charclass_class);
830247a4 2066 r->regstclass = (regnode*)RExC_rx->data->data[n];
653099ff 2067 r->reganch &= ~ROPT_SKIP; /* Used in find_byclass(). */
9c5ffd7c
JH
2068 DEBUG_r({ SV* sv = sv_newmortal();
2069 regprop(sv, (regnode*)data.start_class);
2070 PerlIO_printf(Perl_debug_log,
2071 "synthetic stclass `%s'.\n",
2072 SvPVX(sv));});
653099ff 2073 }
a0d0e21e
LW
2074 }
2075
a0d0e21e 2076 r->minlen = minlen;
b81d288d 2077 if (RExC_seen & REG_SEEN_GPOS)
c277df42 2078 r->reganch |= ROPT_GPOS_SEEN;
830247a4 2079 if (RExC_seen & REG_SEEN_LOOKBEHIND)
c277df42 2080 r->reganch |= ROPT_LOOKBEHIND_SEEN;
830247a4 2081 if (RExC_seen & REG_SEEN_EVAL)
ce862d02 2082 r->reganch |= ROPT_EVAL_SEEN;
f33976b4
DB
2083 if (RExC_seen & REG_SEEN_CANY)
2084 r->reganch |= ROPT_CANY_SEEN;
830247a4
IZ
2085 Newz(1002, r->startp, RExC_npar, I32);
2086 Newz(1002, r->endp, RExC_npar, I32);
ffc61ed2 2087 PL_regdata = r->data; /* for regprop() */
a0d0e21e
LW
2088 DEBUG_r(regdump(r));
2089 return(r);
a687059c
LW
2090}
2091
2092/*
2093 - reg - regular expression, i.e. main body or parenthesized thing
2094 *
2095 * Caller must absorb opening parenthesis.
2096 *
2097 * Combining parenthesis handling with the base level of regular expression
2098 * is a trifle forced, but the need to tie the tails of the branches to what
2099 * follows makes it hard to avoid.
2100 */
76e3520e 2101STATIC regnode *
830247a4 2102S_reg(pTHX_ RExC_state_t *pRExC_state, I32 paren, I32 *flagp)
c277df42 2103 /* paren: Parenthesized? 0=top, 1=(, inside: changed to letter. */
a687059c 2104{
c277df42
IZ
2105 register regnode *ret; /* Will be the head of the group. */
2106 register regnode *br;
2107 register regnode *lastbr;
2108 register regnode *ender = 0;
a0d0e21e 2109 register I32 parno = 0;
830247a4 2110 I32 flags, oregflags = RExC_flags16, have_branch = 0, open = 0;
9d1d55b5
JP
2111
2112 /* for (?g), (?gc), and (?o) warnings; warning
2113 about (?c) will warn about (?g) -- japhy */
2114
2115 I32 wastedflags = 0x00,
2116 wasted_o = 0x01,
2117 wasted_g = 0x02,
2118 wasted_gc = 0x02 | 0x04,
2119 wasted_c = 0x04;
2120
fac92740 2121 char * parse_start = RExC_parse; /* MJD */
830247a4 2122 char *oregcomp_parse = RExC_parse;
c277df42 2123 char c;
a0d0e21e 2124
821b33a5 2125 *flagp = 0; /* Tentatively. */
a0d0e21e 2126
9d1d55b5 2127
a0d0e21e
LW
2128 /* Make an OPEN node, if parenthesized. */
2129 if (paren) {
fac92740 2130 if (*RExC_parse == '?') { /* (?...) */
ca9dfc88
IZ
2131 U16 posflags = 0, negflags = 0;
2132 U16 *flagsp = &posflags;
0f5d15d6 2133 int logical = 0;
830247a4 2134 char *seqstart = RExC_parse;
ca9dfc88 2135
830247a4
IZ
2136 RExC_parse++;
2137 paren = *RExC_parse++;
c277df42 2138 ret = NULL; /* For look-ahead/behind. */
a0d0e21e 2139 switch (paren) {
fac92740 2140 case '<': /* (?<...) */
830247a4 2141 RExC_seen |= REG_SEEN_LOOKBEHIND;
b81d288d 2142 if (*RExC_parse == '!')
c277df42 2143 paren = ',';
b81d288d 2144 if (*RExC_parse != '=' && *RExC_parse != '!')
c277df42 2145 goto unknown;
830247a4 2146 RExC_parse++;
fac92740
MJD
2147 case '=': /* (?=...) */
2148 case '!': /* (?!...) */
830247a4 2149 RExC_seen_zerolen++;
fac92740
MJD
2150 case ':': /* (?:...) */
2151 case '>': /* (?>...) */
a0d0e21e 2152 break;
fac92740
MJD
2153 case '$': /* (?$...) */
2154 case '@': /* (?@...) */
8615cb43 2155 vFAIL2("Sequence (?%c...) not implemented", (int)paren);
a0d0e21e 2156 break;
fac92740 2157 case '#': /* (?#...) */
830247a4
IZ
2158 while (*RExC_parse && *RExC_parse != ')')
2159 RExC_parse++;
2160 if (*RExC_parse != ')')
c277df42 2161 FAIL("Sequence (?#... not terminated");
830247a4 2162 nextchar(pRExC_state);
a0d0e21e
LW
2163 *flagp = TRYAGAIN;
2164 return NULL;
fac92740 2165 case 'p': /* (?p...) */
f9373011
PM
2166 if (SIZE_ONLY && ckWARN2(WARN_DEPRECATED, WARN_REGEXP))
2167 vWARNdep(RExC_parse, "(?p{}) is deprecated - use (??{})");
8c8ad484 2168 /* FALL THROUGH*/
fac92740 2169 case '?': /* (??...) */
0f5d15d6 2170 logical = 1;
830247a4 2171 paren = *RExC_parse++;
0f5d15d6 2172 /* FALL THROUGH */
fac92740 2173 case '{': /* (?{...}) */
c277df42 2174 {
c277df42
IZ
2175 I32 count = 1, n = 0;
2176 char c;
830247a4 2177 char *s = RExC_parse;
c277df42
IZ
2178 SV *sv;
2179 OP_4tree *sop, *rop;
2180
830247a4
IZ
2181 RExC_seen_zerolen++;
2182 RExC_seen |= REG_SEEN_EVAL;
2183 while (count && (c = *RExC_parse)) {
2184 if (c == '\\' && RExC_parse[1])
2185 RExC_parse++;
b81d288d 2186 else if (c == '{')
c277df42 2187 count++;
b81d288d 2188 else if (c == '}')
c277df42 2189 count--;
830247a4 2190 RExC_parse++;
c277df42 2191 }
830247a4 2192 if (*RExC_parse != ')')
b45f050a 2193 {
b81d288d 2194 RExC_parse = s;
b45f050a
JF
2195 vFAIL("Sequence (?{...}) not terminated or not {}-balanced");
2196 }
c277df42
IZ
2197 if (!SIZE_ONLY) {
2198 AV *av;
b81d288d
AB
2199
2200 if (RExC_parse - 1 - s)
830247a4 2201 sv = newSVpvn(s, RExC_parse - 1 - s);
c277df42 2202 else
79cb57f6 2203 sv = newSVpvn("", 0);
c277df42 2204
569233ed
SB
2205 ENTER;
2206 Perl_save_re_context(aTHX);
c277df42 2207 rop = sv_compile_2op(sv, &sop, "re", &av);
9b978d73
DM
2208 sop->op_private |= OPpREFCOUNTED;
2209 /* re_dup will OpREFCNT_inc */
2210 OpREFCNT_set(sop, 1);
569233ed 2211 LEAVE;
c277df42 2212
830247a4
IZ
2213 n = add_data(pRExC_state, 3, "nop");
2214 RExC_rx->data->data[n] = (void*)rop;
2215 RExC_rx->data->data[n+1] = (void*)sop;
2216 RExC_rx->data->data[n+2] = (void*)av;
c277df42 2217 SvREFCNT_dec(sv);
a0ed51b3 2218 }
e24b16f9 2219 else { /* First pass */
830247a4 2220 if (PL_reginterp_cnt < ++RExC_seen_evals
e24b16f9 2221 && PL_curcop != &PL_compiling)
2cd61cdb
IZ
2222 /* No compiled RE interpolated, has runtime
2223 components ===> unsafe. */
2224 FAIL("Eval-group not allowed at runtime, use re 'eval'");
5b61d3f7 2225 if (PL_tainting && PL_tainted)
cc6b7395 2226 FAIL("Eval-group in insecure regular expression");
c277df42
IZ
2227 }
2228
830247a4 2229 nextchar(pRExC_state);
0f5d15d6 2230 if (logical) {
830247a4 2231 ret = reg_node(pRExC_state, LOGICAL);
0f5d15d6
IZ
2232 if (!SIZE_ONLY)
2233 ret->flags = 2;
830247a4 2234 regtail(pRExC_state, ret, reganode(pRExC_state, EVAL, n));
fac92740 2235 /* deal with the length of this later - MJD */
0f5d15d6
IZ
2236 return ret;
2237 }
830247a4 2238 return reganode(pRExC_state, EVAL, n);
c277df42 2239 }
fac92740 2240 case '(': /* (?(?{...})...) and (?(?=...)...) */
c277df42 2241 {
fac92740 2242 if (RExC_parse[0] == '?') { /* (?(?...)) */
b81d288d
AB
2243 if (RExC_parse[1] == '=' || RExC_parse[1] == '!'
2244 || RExC_parse[1] == '<'
830247a4 2245 || RExC_parse[1] == '{') { /* Lookahead or eval. */
c277df42
IZ
2246 I32 flag;
2247
830247a4 2248 ret = reg_node(pRExC_state, LOGICAL);
0f5d15d6
IZ
2249 if (!SIZE_ONLY)
2250 ret->flags = 1;
830247a4 2251 regtail(pRExC_state, ret, reg(pRExC_state, 1, &flag));
c277df42 2252 goto insert_if;
b81d288d 2253 }
a0ed51b3 2254 }
830247a4 2255 else if (RExC_parse[0] >= '1' && RExC_parse[0] <= '9' ) {
fac92740 2256 /* (?(1)...) */
830247a4 2257 parno = atoi(RExC_parse++);
c277df42 2258
830247a4
IZ
2259 while (isDIGIT(*RExC_parse))
2260 RExC_parse++;
fac92740
MJD
2261 ret = reganode(pRExC_state, GROUPP, parno);
2262
830247a4 2263 if ((c = *nextchar(pRExC_state)) != ')')
b45f050a 2264 vFAIL("Switch condition not recognized");
c277df42 2265 insert_if:
830247a4
IZ
2266 regtail(pRExC_state, ret, reganode(pRExC_state, IFTHEN, 0));
2267 br = regbranch(pRExC_state, &flags, 1);
c277df42 2268 if (br == NULL)
830247a4 2269 br = reganode(pRExC_state, LONGJMP, 0);
c277df42 2270 else
830247a4
IZ
2271 regtail(pRExC_state, br, reganode(pRExC_state, LONGJMP, 0));
2272 c = *nextchar(pRExC_state);
d1b80229
IZ
2273 if (flags&HASWIDTH)
2274 *flagp |= HASWIDTH;
c277df42 2275 if (c == '|') {
830247a4
IZ
2276 lastbr = reganode(pRExC_state, IFTHEN, 0); /* Fake one for optimizer. */
2277 regbranch(pRExC_state, &flags, 1);
2278 regtail(pRExC_state, ret, lastbr);
d1b80229
IZ
2279 if (flags&HASWIDTH)
2280 *flagp |= HASWIDTH;
830247a4 2281 c = *nextchar(pRExC_state);
a0ed51b3
LW
2282 }
2283 else
c277df42
IZ
2284 lastbr = NULL;
2285 if (c != ')')
8615cb43 2286 vFAIL("Switch (?(condition)... contains too many branches");
830247a4
IZ
2287 ender = reg_node(pRExC_state, TAIL);
2288 regtail(pRExC_state, br, ender);
c277df42 2289 if (lastbr) {
830247a4
IZ
2290 regtail(pRExC_state, lastbr, ender);
2291 regtail(pRExC_state, NEXTOPER(NEXTOPER(lastbr)), ender);
a0ed51b3
LW
2292 }
2293 else
830247a4 2294 regtail(pRExC_state, ret, ender);
c277df42 2295 return ret;
a0ed51b3
LW
2296 }
2297 else {
830247a4 2298 vFAIL2("Unknown switch condition (?(%.2s", RExC_parse);
c277df42
IZ
2299 }
2300 }
1b1626e4 2301 case 0:
830247a4 2302 RExC_parse--; /* for vFAIL to print correctly */
8615cb43 2303 vFAIL("Sequence (? incomplete");
1b1626e4 2304 break;
a0d0e21e 2305 default:
830247a4 2306 --RExC_parse;
fac92740 2307 parse_flags: /* (?i) */
830247a4 2308 while (*RExC_parse && strchr("iogcmsx", *RExC_parse)) {
9d1d55b5
JP
2309 /* (?g), (?gc) and (?o) are useless here
2310 and must be globally applied -- japhy */
2311
2312 if (*RExC_parse == 'o' || *RExC_parse == 'g') {
2313 if (SIZE_ONLY && ckWARN(WARN_REGEXP)) {
2314 I32 wflagbit = *RExC_parse == 'o' ? wasted_o : wasted_g;
2315 if (! (wastedflags & wflagbit) ) {
2316 wastedflags |= wflagbit;
2317 vWARN5(
2318 RExC_parse + 1,
2319 "Useless (%s%c) - %suse /%c modifier",
2320 flagsp == &negflags ? "?-" : "?",
2321 *RExC_parse,
2322 flagsp == &negflags ? "don't " : "",
2323 *RExC_parse
2324 );
2325 }
2326 }
2327 }
2328 else if (*RExC_parse == 'c') {
2329 if (SIZE_ONLY && ckWARN(WARN_REGEXP)) {
2330 if (! (wastedflags & wasted_c) ) {
2331 wastedflags |= wasted_gc;
2332 vWARN3(
2333 RExC_parse + 1,
2334 "Useless (%sc) - %suse /gc modifier",
2335 flagsp == &negflags ? "?-" : "?",
2336 flagsp == &negflags ? "don't " : ""
2337 );
2338 }
2339 }
2340 }
2341 else { pmflag(flagsp, *RExC_parse); }
2342
830247a4 2343 ++RExC_parse;
ca9dfc88 2344 }
830247a4 2345 if (*RExC_parse == '-') {
ca9dfc88 2346 flagsp = &negflags;
9d1d55b5 2347 wastedflags = 0; /* reset so (?g-c) warns twice */
830247a4 2348 ++RExC_parse;
ca9dfc88 2349 goto parse_flags;
48c036b1 2350 }
830247a4
IZ
2351 RExC_flags16 |= posflags;
2352 RExC_flags16 &= ~negflags;
2353 if (*RExC_parse == ':') {
2354 RExC_parse++;
ca9dfc88
IZ
2355 paren = ':';
2356 break;
2357 }
c277df42 2358 unknown:
830247a4
IZ
2359 if (*RExC_parse != ')') {
2360 RExC_parse++;
2361 vFAIL3("Sequence (%.*s...) not recognized", RExC_parse-seqstart, seqstart);
b45f050a 2362 }
830247a4 2363 nextchar(pRExC_state);
a0d0e21e
LW
2364 *flagp = TRYAGAIN;
2365 return NULL;
2366 }
2367 }
fac92740 2368 else { /* (...) */
830247a4
IZ
2369 parno = RExC_npar;
2370 RExC_npar++;
2371 ret = reganode(pRExC_state, OPEN, parno);
fac92740
MJD
2372 Set_Node_Length(ret, 1); /* MJD */
2373 Set_Node_Offset(ret, RExC_parse); /* MJD */
c277df42 2374 open = 1;
a0d0e21e 2375 }
a0ed51b3 2376 }
fac92740 2377 else /* ! paren */
a0d0e21e
LW
2378 ret = NULL;
2379
2380 /* Pick up the branches, linking them together. */
fac92740 2381 parse_start = RExC_parse; /* MJD */
830247a4 2382 br = regbranch(pRExC_state, &flags, 1);
fac92740
MJD
2383 /* branch_len = (paren != 0); */
2384
a0d0e21e
LW
2385 if (br == NULL)
2386 return(NULL);
830247a4
IZ
2387 if (*RExC_parse == '|') {
2388 if (!SIZE_ONLY && RExC_extralen) {
2389 reginsert(pRExC_state, BRANCHJ, br);
a0ed51b3 2390 }
fac92740 2391 else { /* MJD */
830247a4 2392 reginsert(pRExC_state, BRANCH, br);
fac92740
MJD
2393 Set_Node_Length(br, paren != 0);
2394 Set_Node_Offset_To_R(br-RExC_emit_start, parse_start-RExC_start);
2395 }
c277df42
IZ
2396 have_branch = 1;
2397 if (SIZE_ONLY)
830247a4 2398 RExC_extralen += 1; /* For BRANCHJ-BRANCH. */
a0ed51b3
LW
2399 }
2400 else if (paren == ':') {
c277df42
IZ
2401 *flagp |= flags&SIMPLE;
2402 }
2403 if (open) { /* Starts with OPEN. */
830247a4 2404 regtail(pRExC_state, ret, br); /* OPEN -> first. */
a0ed51b3
LW
2405 }
2406 else if (paren != '?') /* Not Conditional */
a0d0e21e 2407 ret = br;
32a0ca98 2408 *flagp |= flags & (SPSTART | HASWIDTH);
c277df42 2409 lastbr = br;
830247a4
IZ
2410 while (*RExC_parse == '|') {
2411 if (!SIZE_ONLY && RExC_extralen) {
2412 ender = reganode(pRExC_state, LONGJMP,0);
2413 regtail(pRExC_state, NEXTOPER(NEXTOPER(lastbr)), ender); /* Append to the previous. */
c277df42
IZ
2414 }
2415 if (SIZE_ONLY)
830247a4
IZ
2416 RExC_extralen += 2; /* Account for LONGJMP. */
2417 nextchar(pRExC_state);
2418 br = regbranch(pRExC_state, &flags, 0);
fac92740 2419
a687059c 2420 if (br == NULL)
a0d0e21e 2421 return(NULL);
830247a4 2422 regtail(pRExC_state, lastbr, br); /* BRANCH -> BRANCH. */
c277df42 2423 lastbr = br;
821b33a5
IZ
2424 if (flags&HASWIDTH)
2425 *flagp |= HASWIDTH;
a687059c 2426 *flagp |= flags&SPSTART;
a0d0e21e
LW
2427 }
2428
c277df42
IZ
2429 if (have_branch || paren != ':') {
2430 /* Make a closing node, and hook it on the end. */
2431 switch (paren) {
2432 case ':':
830247a4 2433 ender = reg_node(pRExC_state, TAIL);
c277df42
IZ
2434 break;
2435 case 1:
830247a4 2436 ender = reganode(pRExC_state, CLOSE, parno);
fac92740
MJD
2437 Set_Node_Offset(ender,RExC_parse+1); /* MJD */
2438 Set_Node_Length(ender,1); /* MJD */
c277df42
IZ
2439 break;
2440 case '<':
c277df42
IZ
2441 case ',':
2442 case '=':
2443 case '!':
c277df42 2444 *flagp &= ~HASWIDTH;
821b33a5
IZ
2445 /* FALL THROUGH */
2446 case '>':
830247a4 2447 ender = reg_node(pRExC_state, SUCCEED);
c277df42
IZ
2448 break;
2449 case 0:
830247a4 2450 ender = reg_node(pRExC_state, END);
c277df42
IZ
2451 break;
2452 }
830247a4 2453 regtail(pRExC_state, lastbr, ender);
a0d0e21e 2454
c277df42
IZ
2455 if (have_branch) {
2456 /* Hook the tails of the branches to the closing node. */
2457 for (br = ret; br != NULL; br = regnext(br)) {
830247a4 2458 regoptail(pRExC_state, br, ender);
c277df42
IZ
2459 }
2460 }
a0d0e21e 2461 }
c277df42
IZ
2462
2463 {
2464 char *p;
2465 static char parens[] = "=!<,>";
2466
2467 if (paren && (p = strchr(parens, paren))) {
2468 int node = ((p - parens) % 2) ? UNLESSM : IFMATCH;
2469 int flag = (p - parens) > 1;
2470
2471 if (paren == '>')
2472 node = SUSPEND, flag = 0;
830247a4 2473 reginsert(pRExC_state, node,ret);
c277df42 2474 ret->flags = flag;
830247a4 2475 regtail(pRExC_state, ret, reg_node(pRExC_state, TAIL));
c277df42 2476 }
a0d0e21e
LW
2477 }
2478
2479 /* Check for proper termination. */
ce3e6498 2480 if (paren) {
830247a4
IZ
2481 RExC_flags16 = oregflags;
2482 if (RExC_parse >= RExC_end || *nextchar(pRExC_state) != ')') {
2483 RExC_parse = oregcomp_parse;
380a0633 2484 vFAIL("Unmatched (");
ce3e6498 2485 }
a0ed51b3 2486 }
830247a4
IZ
2487 else if (!paren && RExC_parse < RExC_end) {
2488 if (*RExC_parse == ')') {
2489 RExC_parse++;
380a0633 2490 vFAIL("Unmatched )");
a0ed51b3
LW
2491 }
2492 else
b45f050a 2493 FAIL("Junk on end of regexp"); /* "Can't happen". */
a0d0e21e
LW
2494 /* NOTREACHED */
2495 }
a687059c 2496
a0d0e21e 2497 return(ret);
a687059c
LW
2498}
2499
2500/*
2501 - regbranch - one alternative of an | operator
2502 *
2503 * Implements the concatenation operator.
2504 */
76e3520e 2505STATIC regnode *
830247a4 2506S_regbranch(pTHX_ RExC_state_t *pRExC_state, I32 *flagp, I32 first)
a687059c 2507{
c277df42
IZ
2508 register regnode *ret;
2509 register regnode *chain = NULL;
2510 register regnode *latest;
2511 I32 flags = 0, c = 0;
a0d0e21e 2512
b81d288d 2513 if (first)
c277df42
IZ
2514 ret = NULL;
2515 else {
b81d288d 2516 if (!SIZE_ONLY && RExC_extralen)
830247a4 2517 ret = reganode(pRExC_state, BRANCHJ,0);
fac92740 2518 else {
830247a4 2519 ret = reg_node(pRExC_state, BRANCH);
fac92740
MJD
2520 Set_Node_Length(ret, 1);
2521 }
c277df42
IZ
2522 }
2523
b81d288d 2524 if (!first && SIZE_ONLY)
830247a4 2525 RExC_extralen += 1; /* BRANCHJ */
b81d288d 2526
c277df42 2527 *flagp = WORST; /* Tentatively. */
a0d0e21e 2528
830247a4
IZ
2529 RExC_parse--;
2530 nextchar(pRExC_state);
2531 while (RExC_parse < RExC_end && *RExC_parse != '|' && *RExC_parse != ')') {
a0d0e21e 2532 flags &= ~TRYAGAIN;
830247a4 2533 latest = regpiece(pRExC_state, &flags);
a0d0e21e
LW
2534 if (latest == NULL) {
2535 if (flags & TRYAGAIN)
2536 continue;
2537 return(NULL);
a0ed51b3
LW
2538 }
2539 else if (ret == NULL)
c277df42 2540 ret = latest;
a0d0e21e 2541 *flagp |= flags&HASWIDTH;
c277df42 2542 if (chain == NULL) /* First piece. */
a0d0e21e
LW
2543 *flagp |= flags&SPSTART;
2544 else {
830247a4
IZ
2545 RExC_naughty++;
2546 regtail(pRExC_state, chain, latest);
a687059c 2547 }
a0d0e21e 2548 chain = latest;
c277df42
IZ
2549 c++;
2550 }
2551 if (chain == NULL) { /* Loop ran zero times. */
830247a4 2552 chain = reg_node(pRExC_state, NOTHING);
c277df42
IZ
2553 if (ret == NULL)
2554 ret = chain;
2555 }
2556 if (c == 1) {
2557 *flagp |= flags&SIMPLE;
a0d0e21e 2558 }
a687059c 2559
a0d0e21e 2560 return(ret);
a687059c
LW
2561}
2562
2563/*
2564 - regpiece - something followed by possible [*+?]
2565 *
2566 * Note that the branching code sequences used for ? and the general cases
2567 * of * and + are somewhat optimized: they use the same NOTHING node as
2568 * both the endmarker for their branch list and the body of the last branch.
2569 * It might seem that this node could be dispensed with entirely, but the
2570 * endmarker role is not redundant.
2571 */
76e3520e 2572STATIC regnode *
830247a4 2573S_regpiece(pTHX_ RExC_state_t *pRExC_state, I32 *flagp)
a687059c 2574{
c277df42 2575 register regnode *ret;
a0d0e21e
LW
2576 register char op;
2577 register char *next;
2578 I32 flags;
830247a4 2579 char *origparse = RExC_parse;
a0d0e21e
LW
2580 char *maxpos;
2581 I32 min;
c277df42 2582 I32 max = REG_INFTY;
fac92740 2583 char *parse_start;
a0d0e21e 2584
830247a4 2585 ret = regatom(pRExC_state, &flags);
a0d0e21e
LW
2586 if (ret == NULL) {
2587 if (flags & TRYAGAIN)
2588 *flagp |= TRYAGAIN;
2589 return(NULL);
2590 }
2591
830247a4 2592 op = *RExC_parse;
a0d0e21e 2593
830247a4 2594 if (op == '{' && regcurly(RExC_parse)) {
fac92740 2595 parse_start = RExC_parse; /* MJD */
830247a4 2596 next = RExC_parse + 1;
a0d0e21e
LW
2597 maxpos = Nullch;
2598 while (isDIGIT(*next) || *next == ',') {
2599 if (*next == ',') {
2600 if (maxpos)
2601 break;
2602 else
2603 maxpos = next;
a687059c 2604 }
a0d0e21e
LW
2605 next++;
2606 }
2607 if (*next == '}') { /* got one */
2608 if (!maxpos)
2609 maxpos = next;
830247a4
IZ
2610 RExC_parse++;
2611 min = atoi(RExC_parse);
a0d0e21e
LW
2612 if (*maxpos == ',')
2613 maxpos++;
2614 else
830247a4 2615 maxpos = RExC_parse;
a0d0e21e
LW
2616 max = atoi(maxpos);
2617 if (!max && *maxpos != '0')
c277df42
IZ
2618 max = REG_INFTY; /* meaning "infinity" */
2619 else if (max >= REG_INFTY)
8615cb43 2620 vFAIL2("Quantifier in {,} bigger than %d", REG_INFTY - 1);
830247a4
IZ
2621 RExC_parse = next;
2622 nextchar(pRExC_state);
a0d0e21e
LW
2623
2624 do_curly:
2625 if ((flags&SIMPLE)) {
830247a4
IZ
2626 RExC_naughty += 2 + RExC_naughty / 2;
2627 reginsert(pRExC_state, CURLY, ret);
fac92740
MJD
2628 Set_Node_Offset(ret, parse_start+1); /* MJD */
2629 Set_Node_Cur_Length(ret);
a0d0e21e
LW
2630 }
2631 else {
830247a4 2632 regnode *w = reg_node(pRExC_state, WHILEM);
2c2d71f5
JH
2633
2634 w->flags = 0;
830247a4
IZ
2635 regtail(pRExC_state, ret, w);
2636 if (!SIZE_ONLY && RExC_extralen) {
2637 reginsert(pRExC_state, LONGJMP,ret);
2638 reginsert(pRExC_state, NOTHING,ret);
c277df42
IZ
2639 NEXT_OFF(ret) = 3; /* Go over LONGJMP. */
2640 }
830247a4 2641 reginsert(pRExC_state, CURLYX,ret);
fac92740
MJD
2642 /* MJD hk */
2643 Set_Node_Offset(ret, parse_start+1);
2644 Set_Node_Length(ret,
2645 op == '{' ? (RExC_parse - parse_start) : 1);
2646
830247a4 2647 if (!SIZE_ONLY && RExC_extralen)
c277df42 2648 NEXT_OFF(ret) = 3; /* Go over NOTHING to LONGJMP. */
830247a4 2649 regtail(pRExC_state, ret, reg_node(pRExC_state, NOTHING));
c277df42 2650 if (SIZE_ONLY)
830247a4
IZ
2651 RExC_whilem_seen++, RExC_extralen += 3;
2652 RExC_naughty += 4 + RExC_naughty; /* compound interest */
a0d0e21e 2653 }
c277df42 2654 ret->flags = 0;
a0d0e21e
LW
2655
2656 if (min > 0)
821b33a5
IZ
2657 *flagp = WORST;
2658 if (max > 0)
2659 *flagp |= HASWIDTH;
a0d0e21e 2660 if (max && max < min)
8615cb43 2661 vFAIL("Can't do {n,m} with n > m");
c277df42
IZ
2662 if (!SIZE_ONLY) {
2663 ARG1_SET(ret, min);
2664 ARG2_SET(ret, max);
a687059c 2665 }
a687059c 2666
a0d0e21e 2667 goto nest_check;
a687059c 2668 }
a0d0e21e 2669 }
a687059c 2670
a0d0e21e
LW
2671 if (!ISMULT1(op)) {
2672 *flagp = flags;
a687059c 2673 return(ret);
a0d0e21e 2674 }
bb20fd44 2675
c277df42 2676#if 0 /* Now runtime fix should be reliable. */
b45f050a
JF
2677
2678 /* if this is reinstated, don't forget to put this back into perldiag:
2679
2680 =item Regexp *+ operand could be empty at {#} in regex m/%s/
2681
2682 (F) The part of the regexp subject to either the * or + quantifier
2683 could match an empty string. The {#} shows in the regular
2684 expression about where the problem was discovered.
2685
2686 */
2687
bb20fd44 2688 if (!(flags&HASWIDTH) && op != '?')
b45f050a 2689 vFAIL("Regexp *+ operand could be empty");
b81d288d 2690#endif
bb20fd44 2691
fac92740 2692 parse_start = RExC_parse;
830247a4 2693 nextchar(pRExC_state);
a0d0e21e 2694
821b33a5 2695 *flagp = (op != '+') ? (WORST|SPSTART|HASWIDTH) : (WORST|HASWIDTH);
a0d0e21e
LW
2696
2697 if (op == '*' && (flags&SIMPLE)) {
830247a4 2698 reginsert(pRExC_state, STAR, ret);
c277df42 2699 ret->flags = 0;
830247a4 2700 RExC_naughty += 4;
a0d0e21e
LW
2701 }
2702 else if (op == '*') {
2703 min = 0;
2704 goto do_curly;
a0ed51b3
LW
2705 }
2706 else if (op == '+' && (flags&SIMPLE)) {
830247a4 2707 reginsert(pRExC_state, PLUS, ret);
c277df42 2708 ret->flags = 0;
830247a4 2709 RExC_naughty += 3;
a0d0e21e
LW
2710 }
2711 else if (op == '+') {
2712 min = 1;
2713 goto do_curly;
a0ed51b3
LW
2714 }
2715 else if (op == '?') {
a0d0e21e
LW
2716 min = 0; max = 1;
2717 goto do_curly;
2718 }
2719 nest_check:
e476b1b5 2720 if (ckWARN(WARN_REGEXP) && !SIZE_ONLY && !(flags&HASWIDTH) && max > REG_INFTY/3) {
830247a4 2721 vWARN3(RExC_parse,
b45f050a 2722 "%.*s matches null string many times",
830247a4 2723 RExC_parse - origparse,
b45f050a 2724 origparse);
a0d0e21e
LW
2725 }
2726
830247a4
IZ
2727 if (*RExC_parse == '?') {
2728 nextchar(pRExC_state);
2729 reginsert(pRExC_state, MINMOD, ret);
2730 regtail(pRExC_state, ret, ret + NODE_STEP_REGNODE);
a0d0e21e 2731 }
830247a4
IZ
2732 if (ISMULT2(RExC_parse)) {
2733 RExC_parse++;
b45f050a
JF
2734 vFAIL("Nested quantifiers");
2735 }
a0d0e21e
LW
2736
2737 return(ret);
a687059c
LW
2738}
2739
2740/*
2741 - regatom - the lowest level
2742 *
2743 * Optimization: gobbles an entire sequence of ordinary characters so that
2744 * it can turn them into a single node, which is smaller to store and
2745 * faster to run. Backslashed characters are exceptions, each becoming a
2746 * separate node; the code is simpler that way and it's not worth fixing.
2747 *
b45f050a 2748 * [Yes, it is worth fixing, some scripts can run twice the speed.] */
76e3520e 2749STATIC regnode *
830247a4 2750S_regatom(pTHX_ RExC_state_t *pRExC_state, I32 *flagp)
a687059c 2751{
c277df42 2752 register regnode *ret = 0;
a0d0e21e 2753 I32 flags;
f06dbbb7 2754 char *parse_start = 0;
a0d0e21e
LW
2755
2756 *flagp = WORST; /* Tentatively. */
2757
2758tryagain:
830247a4 2759 switch (*RExC_parse) {
a0d0e21e 2760 case '^':
830247a4
IZ
2761 RExC_seen_zerolen++;
2762 nextchar(pRExC_state);
2763 if (RExC_flags16 & PMf_MULTILINE)
2764 ret = reg_node(pRExC_state, MBOL);
2765 else if (RExC_flags16 & PMf_SINGLELINE)
2766 ret = reg_node(pRExC_state, SBOL);
a0d0e21e 2767 else
830247a4 2768 ret = reg_node(pRExC_state, BOL);
fac92740 2769 Set_Node_Length(ret, 1); /* MJD */
a0d0e21e
LW
2770 break;
2771 case '$':
830247a4 2772 nextchar(pRExC_state);
b81d288d 2773 if (*RExC_parse)
830247a4
IZ
2774 RExC_seen_zerolen++;
2775 if (RExC_flags16 & PMf_MULTILINE)
2776 ret = reg_node(pRExC_state, MEOL);
2777 else if (RExC_flags16 & PMf_SINGLELINE)
2778 ret = reg_node(pRExC_state, SEOL);
a0d0e21e 2779 else
830247a4 2780 ret = reg_node(pRExC_state, EOL);
fac92740 2781 Set_Node_Length(ret, 1); /* MJD */
a0d0e21e
LW
2782 break;
2783 case '.':
830247a4 2784 nextchar(pRExC_state);
cce850e4 2785 if (RExC_flags16 & PMf_SINGLELINE)
ffc61ed2
JH
2786 ret = reg_node(pRExC_state, SANY);
2787 else
2788 ret = reg_node(pRExC_state, REG_ANY);
2789 *flagp |= HASWIDTH|SIMPLE;
830247a4 2790 RExC_naughty++;
fac92740 2791 Set_Node_Length(ret, 1); /* MJD */
a0d0e21e
LW
2792 break;
2793 case '[':
b45f050a 2794 {
830247a4 2795 char *oregcomp_parse = ++RExC_parse;
ffc61ed2 2796 ret = regclass(pRExC_state);
830247a4
IZ
2797 if (*RExC_parse != ']') {
2798 RExC_parse = oregcomp_parse;
b45f050a
JF
2799 vFAIL("Unmatched [");
2800 }
830247a4 2801 nextchar(pRExC_state);
a0d0e21e 2802 *flagp |= HASWIDTH|SIMPLE;
fac92740 2803 Set_Node_Length(ret, RExC_parse - oregcomp_parse + 1); /* MJD */
a0d0e21e 2804 break;
b45f050a 2805 }
a0d0e21e 2806 case '(':
830247a4
IZ
2807 nextchar(pRExC_state);
2808 ret = reg(pRExC_state, 1, &flags);
a0d0e21e 2809 if (ret == NULL) {
bf93d4cc 2810 if (flags & TRYAGAIN) {
830247a4 2811 if (RExC_parse == RExC_end) {
bf93d4cc
GS
2812 /* Make parent create an empty node if needed. */
2813 *flagp |= TRYAGAIN;
2814 return(NULL);
2815 }
a0d0e21e 2816 goto tryagain;
bf93d4cc 2817 }
a0d0e21e
LW
2818 return(NULL);
2819 }
c277df42 2820 *flagp |= flags&(HASWIDTH|SPSTART|SIMPLE);
a0d0e21e
LW
2821 break;
2822 case '|':
2823 case ')':
2824 if (flags & TRYAGAIN) {
2825 *flagp |= TRYAGAIN;
2826 return NULL;
2827 }
b45f050a 2828 vFAIL("Internal urp");
a0d0e21e
LW
2829 /* Supposed to be caught earlier. */
2830 break;
85afd4ae 2831 case '{':
830247a4
IZ
2832 if (!regcurly(RExC_parse)) {
2833 RExC_parse++;
85afd4ae
CS
2834 goto defchar;
2835 }
2836 /* FALL THROUGH */
a0d0e21e
LW
2837 case '?':
2838 case '+':
2839 case '*':
830247a4 2840 RExC_parse++;
b45f050a 2841 vFAIL("Quantifier follows nothing");
a0d0e21e
LW
2842 break;
2843 case '\\':
830247a4 2844 switch (*++RExC_parse) {
a0d0e21e 2845 case 'A':
830247a4
IZ
2846 RExC_seen_zerolen++;
2847 ret = reg_node(pRExC_state, SBOL);
a0d0e21e 2848 *flagp |= SIMPLE;
830247a4 2849 nextchar(pRExC_state);
fac92740 2850 Set_Node_Length(ret, 2); /* MJD */
a0d0e21e
LW
2851 break;
2852 case 'G':
830247a4
IZ
2853 ret = reg_node(pRExC_state, GPOS);
2854 RExC_seen |= REG_SEEN_GPOS;
a0d0e21e 2855 *flagp |= SIMPLE;
830247a4 2856 nextchar(pRExC_state);
fac92740 2857 Set_Node_Length(ret, 2); /* MJD */
a0d0e21e
LW
2858 break;
2859 case 'Z':
830247a4 2860 ret = reg_node(pRExC_state, SEOL);
a0d0e21e 2861 *flagp |= SIMPLE;
a1917ab9 2862 RExC_seen_zerolen++; /* Do not optimize RE away */
830247a4 2863 nextchar(pRExC_state);
a0d0e21e 2864 break;
b85d18e9 2865 case 'z':
830247a4 2866 ret = reg_node(pRExC_state, EOS);
b85d18e9 2867 *flagp |= SIMPLE;
830247a4
IZ
2868 RExC_seen_zerolen++; /* Do not optimize RE away */
2869 nextchar(pRExC_state);
fac92740 2870 Set_Node_Length(ret, 2); /* MJD */
b85d18e9 2871 break;
4a2d328f 2872 case 'C':
f33976b4
DB
2873 ret = reg_node(pRExC_state, CANY);
2874 RExC_seen |= REG_SEEN_CANY;
a0ed51b3 2875 *flagp |= HASWIDTH|SIMPLE;
830247a4 2876 nextchar(pRExC_state);
fac92740 2877 Set_Node_Length(ret, 2); /* MJD */
a0ed51b3
LW
2878 break;
2879 case 'X':
830247a4 2880 ret = reg_node(pRExC_state, CLUMP);
a0ed51b3 2881 *flagp |= HASWIDTH;
830247a4 2882 nextchar(pRExC_state);
fac92740 2883 Set_Node_Length(ret, 2); /* MJD */
a0ed51b3 2884 break;
a0d0e21e 2885 case 'w':
ffc61ed2 2886 ret = reg_node(pRExC_state, LOC ? ALNUML : ALNUM);
a0d0e21e 2887 *flagp |= HASWIDTH|SIMPLE;
830247a4 2888 nextchar(pRExC_state);
fac92740 2889 Set_Node_Length(ret, 2); /* MJD */
a0d0e21e
LW
2890 break;
2891 case 'W':
ffc61ed2 2892 ret = reg_node(pRExC_state, LOC ? NALNUML : NALNUM);
a0d0e21e 2893 *flagp |= HASWIDTH|SIMPLE;
830247a4 2894 nextchar(pRExC_state);
fac92740 2895 Set_Node_Length(ret, 2); /* MJD */
a0d0e21e
LW
2896 break;
2897 case 'b':
830247a4
IZ
2898 RExC_seen_zerolen++;
2899 RExC_seen |= REG_SEEN_LOOKBEHIND;
ffc61ed2 2900 ret = reg_node(pRExC_state, LOC ? BOUNDL : BOUND);
a0d0e21e 2901 *flagp |= SIMPLE;
830247a4 2902 nextchar(pRExC_state);
fac92740 2903 Set_Node_Length(ret, 2); /* MJD */
a0d0e21e
LW
2904 break;
2905 case 'B':
830247a4
IZ
2906 RExC_seen_zerolen++;
2907 RExC_seen |= REG_SEEN_LOOKBEHIND;
ffc61ed2 2908 ret = reg_node(pRExC_state, LOC ? NBOUNDL : NBOUND);
a0d0e21e 2909 *flagp |= SIMPLE;
830247a4 2910 nextchar(pRExC_state);
fac92740 2911 Set_Node_Length(ret, 2); /* MJD */
a0d0e21e
LW
2912 break;
2913 case 's':
ffc61ed2 2914 ret = reg_node(pRExC_state, LOC ? SPACEL : SPACE);
a0d0e21e 2915 *flagp |= HASWIDTH|SIMPLE;
830247a4 2916 nextchar(pRExC_state);
fac92740 2917 Set_Node_Length(ret, 2); /* MJD */
a0d0e21e
LW
2918 break;
2919 case 'S':
ffc61ed2 2920 ret = reg_node(pRExC_state, LOC ? NSPACEL : NSPACE);
a0d0e21e 2921 *flagp |= HASWIDTH|SIMPLE;
830247a4 2922 nextchar(pRExC_state);
fac92740 2923 Set_Node_Length(ret, 2); /* MJD */
a0d0e21e
LW
2924 break;
2925 case 'd':
ffc61ed2 2926 ret = reg_node(pRExC_state, DIGIT);
a0d0e21e 2927 *flagp |= HASWIDTH|SIMPLE;
830247a4 2928 nextchar(pRExC_state);
fac92740 2929 Set_Node_Length(ret, 2); /* MJD */
a0d0e21e
LW
2930 break;
2931 case 'D':
ffc61ed2 2932 ret = reg_node(pRExC_state, NDIGIT);
a0d0e21e 2933 *flagp |= HASWIDTH|SIMPLE;
830247a4 2934 nextchar(pRExC_state);
fac92740 2935 Set_Node_Length(ret, 2); /* MJD */
a0d0e21e 2936 break;
a14b48bc
LW
2937 case 'p':
2938 case 'P':
3568d838 2939 {
830247a4 2940 char* oldregxend = RExC_end;
fac92740 2941 char* parse_start = RExC_parse;
a14b48bc 2942
830247a4 2943 if (RExC_parse[1] == '{') {
3568d838 2944 /* a lovely hack--pretend we saw [\pX] instead */
830247a4
IZ
2945 RExC_end = strchr(RExC_parse, '}');
2946 if (!RExC_end) {
0da60cf5 2947 U8 c = (U8)*RExC_parse;
830247a4
IZ
2948 RExC_parse += 2;
2949 RExC_end = oldregxend;
0da60cf5 2950 vFAIL2("Missing right brace on \\%c{}", c);
b45f050a 2951 }
830247a4 2952 RExC_end++;
a14b48bc
LW
2953 }
2954 else
830247a4
IZ
2955 RExC_end = RExC_parse + 2;
2956 RExC_parse--;
a14b48bc 2957
ffc61ed2 2958 ret = regclass(pRExC_state);
a14b48bc 2959
830247a4
IZ
2960 RExC_end = oldregxend;
2961 RExC_parse--;
fac92740 2962 Set_Node_Cur_Length(ret); /* MJD */
830247a4 2963 nextchar(pRExC_state);
a14b48bc
LW
2964 *flagp |= HASWIDTH|SIMPLE;
2965 }
2966 break;
a0d0e21e
LW
2967 case 'n':
2968 case 'r':
2969 case 't':
2970 case 'f':
2971 case 'e':
2972 case 'a':
2973 case 'x':
2974 case 'c':
2975 case '0':
2976 goto defchar;
2977 case '1': case '2': case '3': case '4':
2978 case '5': case '6': case '7': case '8': case '9':
2979 {
830247a4 2980 I32 num = atoi(RExC_parse);
a0d0e21e 2981
830247a4 2982 if (num > 9 && num >= RExC_npar)
a0d0e21e
LW
2983 goto defchar;
2984 else {
fac92740 2985 char * parse_start = RExC_parse - 1; /* MJD */
830247a4
IZ
2986 while (isDIGIT(*RExC_parse))
2987 RExC_parse++;
b45f050a 2988
830247a4 2989 if (!SIZE_ONLY && num > RExC_rx->nparens)
9baa0206 2990 vFAIL("Reference to nonexistent group");
830247a4
IZ
2991 RExC_sawback = 1;
2992 ret = reganode(pRExC_state, FOLD
a0ed51b3 2993 ? (LOC ? REFFL : REFF)
c8756f30 2994 : REF, num);
a0d0e21e 2995 *flagp |= HASWIDTH;
fac92740
MJD
2996
2997 /* override incorrect value set in reganode MJD */
2998 Set_Node_Offset(ret, parse_start+1);
2999 Set_Node_Cur_Length(ret); /* MJD */
830247a4
IZ
3000 RExC_parse--;
3001 nextchar(pRExC_state);
a0d0e21e
LW
3002 }
3003 }
3004 break;
3005 case '\0':
830247a4 3006 if (RExC_parse >= RExC_end)
b45f050a 3007 FAIL("Trailing \\");
a0d0e21e
LW
3008 /* FALL THROUGH */
3009 default:
c9f97d15
IZ
3010 /* Do not generate `unrecognized' warnings here, we fall
3011 back into the quick-grab loop below */
a0d0e21e
LW
3012 goto defchar;
3013 }
3014 break;
4633a7c4
LW
3015
3016 case '#':
830247a4
IZ
3017 if (RExC_flags16 & PMf_EXTENDED) {
3018 while (RExC_parse < RExC_end && *RExC_parse != '\n') RExC_parse++;
3019 if (RExC_parse < RExC_end)
4633a7c4
LW
3020 goto tryagain;
3021 }
3022 /* FALL THROUGH */
3023
a0d0e21e 3024 default: {
ba210ebe 3025 register STRLEN len;
a0ed51b3 3026 register UV ender;
a0d0e21e 3027 register char *p;
c277df42 3028 char *oldp, *s;
ba210ebe 3029 STRLEN numlen;
80aecb99 3030 STRLEN foldlen;
60a8b682 3031 U8 tmpbuf[UTF8_MAXLEN_FOLD+1], *foldbuf;
f06dbbb7
JH
3032
3033 parse_start = RExC_parse - 1;
a0d0e21e 3034
830247a4 3035 RExC_parse++;
a0d0e21e
LW
3036
3037 defchar:
830247a4 3038 ret = reg_node(pRExC_state, FOLD
a0ed51b3 3039 ? (LOC ? EXACTFL : EXACTF)
bbce6d69 3040 : EXACT);
cd439c50 3041 s = STRING(ret);
830247a4
IZ
3042 for (len = 0, p = RExC_parse - 1;
3043 len < 127 && p < RExC_end;
a0d0e21e
LW
3044 len++)
3045 {
3046 oldp = p;
5b5a24f7 3047
830247a4
IZ
3048 if (RExC_flags16 & PMf_EXTENDED)
3049 p = regwhite(p, RExC_end);
a0d0e21e
LW
3050 switch (*p) {
3051 case '^':
3052 case '$':
3053 case '.':
3054 case '[':
3055 case '(':
3056 case ')':
3057 case '|':
3058 goto loopdone;
3059 case '\\':
3060 switch (*++p) {
3061 case 'A':
1ed8eac0
JF
3062 case 'C':
3063 case 'X':
a0d0e21e
LW
3064 case 'G':
3065 case 'Z':
b85d18e9 3066 case 'z':
a0d0e21e
LW
3067 case 'w':
3068 case 'W':
3069 case 'b':
3070 case 'B':
3071 case 's':
3072 case 'S':
3073 case 'd':
3074 case 'D':
a14b48bc
LW
3075 case 'p':
3076 case 'P':
a0d0e21e
LW
3077 --p;
3078 goto loopdone;
3079 case 'n':
3080 ender = '\n';
3081 p++;
a687059c 3082 break;
a0d0e21e
LW
3083 case 'r':
3084 ender = '\r';
3085 p++;
a687059c 3086 break;
a0d0e21e
LW
3087 case 't':
3088 ender = '\t';
3089 p++;
a687059c 3090 break;
a0d0e21e
LW
3091 case 'f':
3092 ender = '\f';
3093 p++;
a687059c 3094 break;
a0d0e21e 3095 case 'e':
c7f1f016 3096 ender = ASCII_TO_NATIVE('\033');
a0d0e21e 3097 p++;
a687059c 3098 break;
a0d0e21e 3099 case 'a':
c7f1f016 3100 ender = ASCII_TO_NATIVE('\007');
a0d0e21e 3101 p++;
a687059c 3102 break;
a0d0e21e 3103 case 'x':
a0ed51b3
LW
3104 if (*++p == '{') {
3105 char* e = strchr(p, '}');
b81d288d 3106
b45f050a 3107 if (!e) {
830247a4 3108 RExC_parse = p + 1;
b45f050a
JF
3109 vFAIL("Missing right brace on \\x{}");
3110 }
de5f0749 3111 else {
a4c04bdc
NC
3112 I32 flags = PERL_SCAN_ALLOW_UNDERSCORES
3113 | PERL_SCAN_DISALLOW_PREFIX;
53305cf1
NC
3114 numlen = e - p - 1;
3115 ender = grok_hex(p + 1, &numlen, &flags, NULL);
aaa80028
JH
3116 if (ender > 0xff)
3117 RExC_utf8 = 1;
b21ed0a9
GS
3118 /* numlen is generous */
3119 if (numlen + len >= 127) {
a0ed51b3
LW
3120 p--;
3121 goto loopdone;
3122 }
3123 p = e + 1;
3124 }
a0ed51b3
LW
3125 }
3126 else {
a4c04bdc 3127 I32 flags = PERL_SCAN_DISALLOW_PREFIX;
53305cf1
NC
3128 numlen = 2;
3129 ender = grok_hex(p, &numlen, &flags, NULL);
a0ed51b3
LW
3130 p += numlen;
3131 }
a687059c 3132 break;
a0d0e21e
LW
3133 case 'c':
3134 p++;
bbce6d69
PP
3135 ender = UCHARAT(p++);
3136 ender = toCTRL(ender);
a687059c 3137 break;
a0d0e21e
LW
3138 case '0': case '1': case '2': case '3':case '4':
3139 case '5': case '6': case '7': case '8':case '9':
3140 if (*p == '0' ||
830247a4 3141 (isDIGIT(p[1]) && atoi(p) >= RExC_npar) ) {
53305cf1
NC
3142 I32 flags = 0;
3143 numlen = 3;
3144 ender = grok_oct(p, &numlen, &flags, NULL);
a0d0e21e
LW
3145 p += numlen;
3146 }
3147 else {
3148 --p;
3149 goto loopdone;
a687059c
LW
3150 }
3151 break;
a0d0e21e 3152 case '\0':
830247a4 3153 if (p >= RExC_end)
b45f050a 3154 FAIL("Trailing \\");
a687059c 3155 /* FALL THROUGH */
a0d0e21e 3156 default:
e476b1b5 3157 if (!SIZE_ONLY && ckWARN(WARN_REGEXP) && isALPHA(*p))
4193bef7 3158 vWARN2(p + 1, "Unrecognized escape \\%c passed through", UCHARAT(p));
a0ed51b3 3159 goto normal_default;
a0d0e21e
LW
3160 }
3161 break;
a687059c 3162 default:
a0ed51b3 3163 normal_default:
fd400ab9 3164 if (UTF8_IS_START(*p) && UTF) {
5e12f4fb 3165 ender = utf8n_to_uvchr((U8*)p, RExC_end - p,
ba210ebe 3166 &numlen, 0);
a0ed51b3
LW
3167 p += numlen;
3168 }
3169 else
3170 ender = *p++;
a0d0e21e 3171 break;
a687059c 3172 }
830247a4
IZ
3173 if (RExC_flags16 & PMf_EXTENDED)
3174 p = regwhite(p, RExC_end);
60a8b682
JH
3175 if (UTF && FOLD) {
3176 /* Prime the casefolded buffer. */
ac7e0132 3177 ender = toFOLD_uni(ender, tmpbuf, &foldlen);
60a8b682 3178 }
a0d0e21e
LW
3179 if (ISMULT2(p)) { /* Back off on ?+*. */
3180 if (len)
3181 p = oldp;
16ea2a2e 3182 else if (UTF) {
80aecb99 3183 if (FOLD) {
60a8b682 3184 /* Emit all the Unicode characters. */
80aecb99
JH
3185 for (foldbuf = tmpbuf;
3186 foldlen;
3187 foldlen -= numlen) {
3188 ender = utf8_to_uvchr(foldbuf, &numlen);
3189 reguni(pRExC_state, ender, s, &numlen);
3190 s += numlen;
3191 len += numlen;
3192 foldbuf += numlen;
3193 }
3194 }
3195 else {
3196 reguni(pRExC_state, ender, s, &numlen);
3197 s += numlen;
3198 len += numlen;
3199 }
a0ed51b3 3200 }
a0d0e21e
LW
3201 else {
3202 len++;
cd439c50 3203 REGC(ender, s++);
a0d0e21e
LW
3204 }
3205 break;
a687059c 3206 }
16ea2a2e 3207 if (UTF) {
80aecb99 3208 if (FOLD) {
60a8b682 3209 /* Emit all the Unicode characters. */
80aecb99
JH
3210 for (foldbuf = tmpbuf;
3211 foldlen;
3212 foldlen -= numlen) {
3213 ender = utf8_to_uvchr(foldbuf, &numlen);
3214 reguni(pRExC_state, ender, s, &numlen);
3215 s += numlen;
3216 len += numlen;
3217 foldbuf += numlen;
3218 }
3219 }
3220 else {
3221 reguni(pRExC_state, ender, s, &numlen);
3222 s += numlen;
3223 len += numlen;
3224 }
3225 len--;
a0ed51b3
LW
3226 }
3227 else
cd439c50 3228 REGC(ender, s++);
a0d0e21e
LW
3229 }
3230 loopdone:
830247a4 3231 RExC_parse = p - 1;
fac92740 3232 Set_Node_Cur_Length(ret); /* MJD */
830247a4 3233 nextchar(pRExC_state);
793db0cb
JH
3234 {
3235 /* len is STRLEN which is unsigned, need to copy to signed */
3236 IV iv = len;
3237 if (iv < 0)
3238 vFAIL("Internal disaster");
3239 }
a0d0e21e
LW
3240 if (len > 0)
3241 *flagp |= HASWIDTH;
3242 if (len == 1)
3243 *flagp |= SIMPLE;
c277df42 3244 if (!SIZE_ONLY)
cd439c50
IZ
3245 STR_LEN(ret) = len;
3246 if (SIZE_ONLY)
830247a4 3247 RExC_size += STR_SZ(len);
cd439c50 3248 else
830247a4 3249 RExC_emit += STR_SZ(len);
a687059c 3250 }
a0d0e21e
LW
3251 break;
3252 }
a687059c 3253
60a8b682
JH
3254 /* If the encoding pragma is in effect recode the text of
3255 * any EXACT-kind nodes. */
22c54be3 3256 if (PL_encoding && PL_regkind[(U8)OP(ret)] == EXACT) {
a72c7584
JH
3257 STRLEN oldlen = STR_LEN(ret);
3258 SV *sv = sv_2mortal(newSVpvn(STRING(ret), oldlen));
22c54be3
JH
3259
3260 if (RExC_utf8)
3261 SvUTF8_on(sv);
3262 if (sv_utf8_downgrade(sv, TRUE)) {
3263 char *s = Perl_sv_recode_to_utf8(aTHX_ sv, PL_encoding);
3264 STRLEN newlen = SvCUR(sv);
3265
3266 if (!SIZE_ONLY) {
3267 DEBUG_r(PerlIO_printf(Perl_debug_log, "recode %*s to %*s\n",
3268 (int)oldlen, STRING(ret),
3269 (int)newlen, s));
3270 Copy(s, STRING(ret), newlen, char);
3271 STR_LEN(ret) += newlen - oldlen;
3272 RExC_emit += STR_SZ(newlen) - STR_SZ(oldlen);
3273 } else
3274 RExC_size += STR_SZ(newlen) - STR_SZ(oldlen);
3275 }
a72c7584
JH
3276 }
3277
a0d0e21e 3278 return(ret);
a687059c
LW
3279}
3280
873ef191 3281STATIC char *
cea2e8a9 3282S_regwhite(pTHX_ char *p, char *e)
5b5a24f7
CS
3283{
3284 while (p < e) {
3285 if (isSPACE(*p))
3286 ++p;
3287 else if (*p == '#') {
3288 do {
3289 p++;
3290 } while (p < e && *p != '\n');
3291 }
3292 else
3293 break;
3294 }
3295 return p;
3296}
3297
b8c5462f
JH
3298/* Parse POSIX character classes: [[:foo:]], [[=foo=]], [[.foo.]].
3299 Character classes ([:foo:]) can also be negated ([:^foo:]).
3300 Returns a named class id (ANYOF_XXX) if successful, -1 otherwise.
3301 Equivalence classes ([=foo=]) and composites ([.foo.]) are parsed,
beeb77fc 3302 but trigger failures because they are currently unimplemented. */
9a86a77b
JH
3303
3304#define POSIXCC_DONE(c) ((c) == ':')
3305#define POSIXCC_NOTYET(c) ((c) == '=' || (c) == '.')
3306#define POSIXCC(c) (POSIXCC_DONE(c) || POSIXCC_NOTYET(c))
3307
b8c5462f 3308STATIC I32
830247a4 3309S_regpposixcc(pTHX_ RExC_state_t *pRExC_state, I32 value)
620e46c5
JH
3310{
3311 char *posixcc = 0;
936ed897 3312 I32 namedclass = OOB_NAMEDCLASS;
620e46c5 3313
830247a4 3314 if (value == '[' && RExC_parse + 1 < RExC_end &&
620e46c5 3315 /* I smell either [: or [= or [. -- POSIX has been here, right? */
9a86a77b
JH
3316 POSIXCC(UCHARAT(RExC_parse))) {
3317 char c = UCHARAT(RExC_parse);
830247a4 3318 char* s = RExC_parse++;
b81d288d 3319
9a86a77b 3320 while (RExC_parse < RExC_end && UCHARAT(RExC_parse) != c)
830247a4
IZ
3321 RExC_parse++;
3322 if (RExC_parse == RExC_end)
620e46c5 3323 /* Grandfather lone [:, [=, [. */
830247a4 3324 RExC_parse = s;
620e46c5 3325 else {
830247a4 3326 char* t = RExC_parse++; /* skip over the c */
b8c5462f 3327
9a86a77b 3328 if (UCHARAT(RExC_parse) == ']') {
830247a4 3329 RExC_parse++; /* skip over the ending ] */
b8c5462f
JH
3330 posixcc = s + 1;
3331 if (*s == ':') {
3332 I32 complement = *posixcc == '^' ? *posixcc++ : 0;
3333 I32 skip = 5; /* the most common skip */
3334
3335 switch (*posixcc) {
3336 case 'a':
3337 if (strnEQ(posixcc, "alnum", 5))
3338 namedclass =
3339 complement ? ANYOF_NALNUMC : ANYOF_ALNUMC;
3340 else if (strnEQ(posixcc, "alpha", 5))
3341 namedclass =
3342 complement ? ANYOF_NALPHA : ANYOF_ALPHA;
3343 else if (strnEQ(posixcc, "ascii", 5))
3344 namedclass =
3345 complement ? ANYOF_NASCII : ANYOF_ASCII;
3346 break;
aaa51d5e
JF
3347 case 'b':
3348 if (strnEQ(posixcc, "blank", 5))
3349 namedclass =
3350 complement ? ANYOF_NBLANK : ANYOF_BLANK;
3351 break;
b8c5462f
JH
3352 case 'c':
3353 if (strnEQ(posixcc, "cntrl", 5))
3354 namedclass =
3355 complement ? ANYOF_NCNTRL : ANYOF_CNTRL;
3356 break;
3357 case 'd':
3358 if (strnEQ(posixcc, "digit", 5))
3359 namedclass =
3360 complement ? ANYOF_NDIGIT : ANYOF_DIGIT;
3361 break;
3362 case 'g':
3363 if (strnEQ(posixcc, "graph", 5))
3364 namedclass =
3365 complement ? ANYOF_NGRAPH : ANYOF_GRAPH;
3366 break;
3367 case 'l':
3368 if (strnEQ(posixcc, "lower", 5))
3369 namedclass =
3370 complement ? ANYOF_NLOWER : ANYOF_LOWER;
3371 break;
3372 case 'p':
3373 if (strnEQ(posixcc, "print", 5))
3374 namedclass =
3375 complement ? ANYOF_NPRINT : ANYOF_PRINT;
3376 else if (strnEQ(posixcc, "punct", 5))
3377 namedclass =
3378 complement ? ANYOF_NPUNCT : ANYOF_PUNCT;
3379 break;
3380 case 's':
3381 if (strnEQ(posixcc, "space", 5))
3382 namedclass =
aaa51d5e 3383 complement ? ANYOF_NPSXSPC : ANYOF_PSXSPC;
cc4319de 3384 break;
b8c5462f
JH
3385 case 'u':
3386 if (strnEQ(posixcc, "upper", 5))
3387 namedclass =
3388 complement ? ANYOF_NUPPER : ANYOF_UPPER;
3389 break;
3390 case 'w': /* this is not POSIX, this is the Perl \w */
3391 if (strnEQ(posixcc, "word", 4)) {
3392 namedclass =
3393 complement ? ANYOF_NALNUM : ANYOF_ALNUM;
3394 skip = 4;
3395 }
3396 break;
3397 case 'x':
3398 if (strnEQ(posixcc, "xdigit", 6)) {
3399 namedclass =
3400 complement ? ANYOF_NXDIGIT : ANYOF_XDIGIT;
3401 skip = 6;
3402 }
3403 break;
3404 }
ac561586
JH
3405 if (namedclass == OOB_NAMEDCLASS ||
3406 posixcc[skip] != ':' ||
3407 posixcc[skip+1] != ']')
b45f050a
JF
3408 {
3409 Simple_vFAIL3("POSIX class [:%.*s:] unknown",
3410 t - s - 1, s + 1);
3411 }
3412 } else if (!SIZE_ONLY) {
b8c5462f 3413 /* [[=foo=]] and [[.foo.]] are still future. */
b45f050a 3414
830247a4 3415 /* adjust RExC_parse so the warning shows after
b45f050a 3416 the class closes */
9a86a77b 3417 while (UCHARAT(RExC_parse) && UCHARAT(RExC_parse) != ']')
830247a4 3418 RExC_parse++;
b45f050a
JF
3419 Simple_vFAIL3("POSIX syntax [%c %c] is reserved for future extensions", c, c);
3420 }
b8c5462f
JH
3421 } else {
3422 /* Maternal grandfather:
3423 * "[:" ending in ":" but not in ":]" */
830247a4 3424 RExC_parse = s;
767d463e 3425 }
620e46c5
JH
3426 }
3427 }
3428
b8c5462f
JH
3429 return namedclass;
3430}
3431
3432STATIC void
830247a4 3433S_checkposixcc(pTHX_ RExC_state_t *pRExC_state)
b8c5462f 3434{
e476b1b5 3435 if (!SIZE_ONLY && ckWARN(WARN_REGEXP) &&
9a86a77b 3436 POSIXCC(UCHARAT(RExC_parse))) {
830247a4 3437 char *s = RExC_parse;
93733859 3438 char c = *s++;
b8c5462f
JH
3439
3440 while(*s && isALNUM(*s))
3441 s++;
3442 if (*s && c == *s && s[1] == ']') {
b45f050a
JF
3443 vWARN3(s+2, "POSIX syntax [%c %c] belongs inside character classes", c, c);
3444
3445 /* [[=foo=]] and [[.foo.]] are still future. */
9a86a77b 3446 if (POSIXCC_NOTYET(c)) {
830247a4 3447 /* adjust RExC_parse so the error shows after
b45f050a 3448 the class closes */
9a86a77b 3449 while (UCHARAT(RExC_parse) && UCHARAT(RExC_parse++) != ']')
b45f050a
JF
3450 ;
3451 Simple_vFAIL3("POSIX syntax [%c %c] is reserved for future extensions", c, c);
3452 }
b8c5462f
JH
3453 }
3454 }
620e46c5
JH
3455}
3456
76e3520e 3457STATIC regnode *
830247a4 3458S_regclass(pTHX_ RExC_state_t *pRExC_state)
a687059c 3459{
ffc61ed2 3460 register UV value;
9a86a77b 3461 register UV nextvalue;
3568d838 3462 register IV prevvalue = OOB_UNICODE;
ffc61ed2 3463 register IV range = 0;
c277df42 3464 register regnode *ret;
ba210ebe 3465 STRLEN numlen;
ffc61ed2 3466 IV namedclass;
9c5ffd7c 3467 char *rangebegin = 0;
936ed897 3468 bool need_class = 0;
9c5ffd7c 3469 SV *listsv = Nullsv;
ffc61ed2
JH
3470 register char *e;
3471 UV n;
9e55ce06
JH
3472 bool optimize_invert = TRUE;