This is a live mirror of the Perl 5 development currently hosted at https://github.com/perl/perl5
Re: the remaining bugs in \x escapes (was Re: [PATCH] oct and hex in glorious 64...
[perl5.git] / regcomp.c
CommitLineData
a0d0e21e
LW
1/* regcomp.c
2 */
3
4/*
5 * "A fair jaw-cracker dwarf-language must be." --Samwise Gamgee
6 */
7
a687059c
LW
8/* NOTE: this is derived from Henry Spencer's regexp code, and should not
9 * confused with the original package (see point 3 below). Thanks, Henry!
10 */
11
12/* Additional note: this code is very heavily munged from Henry's version
13 * in places. In some spots I've traded clarity for efficiency, so don't
14 * blame Henry for some of the lack of readability.
15 */
16
e50aee73
AD
17/* The names of the functions have been changed from regcomp and
18 * regexec to pregcomp and pregexec in order to avoid conflicts
19 * with the POSIX routines of the same names.
20*/
21
b9d5759e
AD
22#ifdef PERL_EXT_RE_BUILD
23/* need to replace pregcomp et al, so enable that */
24# ifndef PERL_IN_XSUB_RE
25# define PERL_IN_XSUB_RE
26# endif
27/* need access to debugger hooks */
cad2e5aa 28# if defined(PERL_EXT_RE_DEBUG) && !defined(DEBUGGING)
b9d5759e
AD
29# define DEBUGGING
30# endif
31#endif
32
33#ifdef PERL_IN_XSUB_RE
d06ea78c 34/* We *really* need to overwrite these symbols: */
56953603
IZ
35# define Perl_pregcomp my_regcomp
36# define Perl_regdump my_regdump
37# define Perl_regprop my_regprop
d06ea78c 38# define Perl_pregfree my_regfree
cad2e5aa
JH
39# define Perl_re_intuit_string my_re_intuit_string
40/* *These* symbols are masked to allow static link. */
d06ea78c 41# define Perl_regnext my_regnext
f0b8d043 42# define Perl_save_re_context my_save_re_context
b81d288d 43# define Perl_reginitcolors my_reginitcolors
c5be433b
GS
44
45# define PERL_NO_GET_CONTEXT
b81d288d 46#endif
56953603 47
f0fcb552 48/*SUPPRESS 112*/
a687059c 49/*
e50aee73 50 * pregcomp and pregexec -- regsub and regerror are not used in perl
a687059c
LW
51 *
52 * Copyright (c) 1986 by University of Toronto.
53 * Written by Henry Spencer. Not derived from licensed software.
54 *
55 * Permission is granted to anyone to use this software for any
56 * purpose on any computer system, and to redistribute it freely,
57 * subject to the following restrictions:
58 *
59 * 1. The author is not responsible for the consequences of use of
60 * this software, no matter how awful, even if they arise
61 * from defects in it.
62 *
63 * 2. The origin of this software must not be misrepresented, either
64 * by explicit claim or by omission.
65 *
66 * 3. Altered versions must be plainly marked as such, and must not
67 * be misrepresented as being the original software.
68 *
69 *
70 **** Alterations to Henry's code are...
71 ****
bc89e66f 72 **** Copyright (c) 1991-2001, Larry Wall
a687059c 73 ****
9ef589d8
LW
74 **** You may distribute under the terms of either the GNU General Public
75 **** License or the Artistic License, as specified in the README file.
76
a687059c
LW
77 *
78 * Beware that some of this code is subtly aware of the way operator
79 * precedence is structured in regular expressions. Serious changes in
80 * regular-expression syntax might require a total rethink.
81 */
82#include "EXTERN.h"
864dbfa3 83#define PERL_IN_REGCOMP_C
a687059c 84#include "perl.h"
d06ea78c 85
acfe0abc 86#ifndef PERL_IN_XSUB_RE
d06ea78c
GS
87# include "INTERN.h"
88#endif
c277df42
IZ
89
90#define REG_COMP_C
a687059c
LW
91#include "regcomp.h"
92
d4cce5f1 93#ifdef op
11343788 94#undef op
d4cce5f1 95#endif /* op */
11343788 96
fe14fcc3
LW
97#ifdef MSDOS
98# if defined(BUGGY_MSC6)
99 /* MSC 6.00A breaks on op/regexp.t test 85 unless we turn this off */
100 # pragma optimize("a",off)
101 /* But MSC 6.00A is happy with 'w', for aliases only across function calls*/
102 # pragma optimize("w",on )
103# endif /* BUGGY_MSC6 */
104#endif /* MSDOS */
105
a687059c
LW
106#ifndef STATIC
107#define STATIC static
108#endif
109
830247a4
IZ
110typedef struct RExC_state_t {
111 U16 flags16; /* are we folding, multilining? */
112 char *precomp; /* uncompiled string. */
113 regexp *rx;
fac92740 114 char *start; /* Start of input for compile */
830247a4
IZ
115 char *end; /* End of input for compile */
116 char *parse; /* Input-scan pointer. */
117 I32 whilem_seen; /* number of WHILEM in this expr */
fac92740 118 regnode *emit_start; /* Start of emitted-code area */
ffc61ed2 119 regnode *emit; /* Code-emit pointer; &regdummy = don't = compiling */
830247a4
IZ
120 I32 naughty; /* How bad is this pattern? */
121 I32 sawback; /* Did we see \1, ...? */
122 U32 seen;
123 I32 size; /* Code size. */
124 I32 npar; /* () count. */
125 I32 extralen;
126 I32 seen_zerolen;
127 I32 seen_evals;
1aa99e6b 128 I32 utf8;
830247a4
IZ
129#if ADD_TO_REGEXEC
130 char *starttry; /* -Dr: where regtry was called. */
131#define RExC_starttry (pRExC_state->starttry)
132#endif
133} RExC_state_t;
134
135#define RExC_flags16 (pRExC_state->flags16)
136#define RExC_precomp (pRExC_state->precomp)
137#define RExC_rx (pRExC_state->rx)
fac92740 138#define RExC_start (pRExC_state->start)
830247a4
IZ
139#define RExC_end (pRExC_state->end)
140#define RExC_parse (pRExC_state->parse)
141#define RExC_whilem_seen (pRExC_state->whilem_seen)
fac92740 142#define RExC_offsets (pRExC_state->rx->offsets) /* I am not like the others */
830247a4 143#define RExC_emit (pRExC_state->emit)
fac92740 144#define RExC_emit_start (pRExC_state->emit_start)
830247a4
IZ
145#define RExC_naughty (pRExC_state->naughty)
146#define RExC_sawback (pRExC_state->sawback)
147#define RExC_seen (pRExC_state->seen)
148#define RExC_size (pRExC_state->size)
149#define RExC_npar (pRExC_state->npar)
150#define RExC_extralen (pRExC_state->extralen)
151#define RExC_seen_zerolen (pRExC_state->seen_zerolen)
152#define RExC_seen_evals (pRExC_state->seen_evals)
1aa99e6b 153#define RExC_utf8 (pRExC_state->utf8)
830247a4 154
a687059c
LW
155#define ISMULT1(c) ((c) == '*' || (c) == '+' || (c) == '?')
156#define ISMULT2(s) ((*s) == '*' || (*s) == '+' || (*s) == '?' || \
157 ((*s) == '{' && regcurly(s)))
a687059c 158
35c8bce7
LW
159#ifdef SPSTART
160#undef SPSTART /* dratted cpp namespace... */
161#endif
a687059c
LW
162/*
163 * Flags to be passed up and down.
164 */
a687059c 165#define WORST 0 /* Worst case. */
821b33a5 166#define HASWIDTH 0x1 /* Known to match non-null strings. */
a0d0e21e
LW
167#define SIMPLE 0x2 /* Simple enough to be STAR/PLUS operand. */
168#define SPSTART 0x4 /* Starts with * or +. */
169#define TRYAGAIN 0x8 /* Weeded out a declaration. */
a687059c 170
2c2d71f5
JH
171/* Length of a variant. */
172
173typedef struct scan_data_t {
174 I32 len_min;
175 I32 len_delta;
176 I32 pos_min;
177 I32 pos_delta;
178 SV *last_found;
179 I32 last_end; /* min value, <0 unless valid. */
180 I32 last_start_min;
181 I32 last_start_max;
182 SV **longest; /* Either &l_fixed, or &l_float. */
183 SV *longest_fixed;
184 I32 offset_fixed;
185 SV *longest_float;
186 I32 offset_float_min;
187 I32 offset_float_max;
188 I32 flags;
189 I32 whilem_c;
cb434fcc 190 I32 *last_closep;
653099ff 191 struct regnode_charclass_class *start_class;
2c2d71f5
JH
192} scan_data_t;
193
a687059c 194/*
e50aee73 195 * Forward declarations for pregcomp()'s friends.
a687059c 196 */
a0d0e21e 197
b81d288d 198static scan_data_t zero_scan_data = { 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
cb434fcc 199 0, 0, 0, 0, 0, 0};
c277df42
IZ
200
201#define SF_BEFORE_EOL (SF_BEFORE_SEOL|SF_BEFORE_MEOL)
202#define SF_BEFORE_SEOL 0x1
203#define SF_BEFORE_MEOL 0x2
204#define SF_FIX_BEFORE_EOL (SF_FIX_BEFORE_SEOL|SF_FIX_BEFORE_MEOL)
205#define SF_FL_BEFORE_EOL (SF_FL_BEFORE_SEOL|SF_FL_BEFORE_MEOL)
206
09b7f37c
CB
207#ifdef NO_UNARY_PLUS
208# define SF_FIX_SHIFT_EOL (0+2)
209# define SF_FL_SHIFT_EOL (0+4)
210#else
211# define SF_FIX_SHIFT_EOL (+2)
212# define SF_FL_SHIFT_EOL (+4)
213#endif
c277df42
IZ
214
215#define SF_FIX_BEFORE_SEOL (SF_BEFORE_SEOL << SF_FIX_SHIFT_EOL)
216#define SF_FIX_BEFORE_MEOL (SF_BEFORE_MEOL << SF_FIX_SHIFT_EOL)
217
218#define SF_FL_BEFORE_SEOL (SF_BEFORE_SEOL << SF_FL_SHIFT_EOL)
219#define SF_FL_BEFORE_MEOL (SF_BEFORE_MEOL << SF_FL_SHIFT_EOL) /* 0x20 */
220#define SF_IS_INF 0x40
221#define SF_HAS_PAR 0x80
222#define SF_IN_PAR 0x100
223#define SF_HAS_EVAL 0x200
4bfe0158 224#define SCF_DO_SUBSTR 0x400
653099ff
GS
225#define SCF_DO_STCLASS_AND 0x0800
226#define SCF_DO_STCLASS_OR 0x1000
227#define SCF_DO_STCLASS (SCF_DO_STCLASS_AND|SCF_DO_STCLASS_OR)
e1901655 228#define SCF_WHILEM_VISITED_POS 0x2000
c277df42 229
1aa99e6b 230#define UTF RExC_utf8
830247a4
IZ
231#define LOC (RExC_flags16 & PMf_LOCALE)
232#define FOLD (RExC_flags16 & PMf_FOLD)
a0ed51b3 233
ffc61ed2 234#define OOB_UNICODE 12345678
93733859 235#define OOB_NAMEDCLASS -1
b8c5462f 236
a0ed51b3
LW
237#define CHR_SVLEN(sv) (UTF ? sv_len_utf8(sv) : SvCUR(sv))
238#define CHR_DIST(a,b) (UTF ? utf8_distance(a,b) : a - b)
239
8615cb43 240
b45f050a
JF
241/* length of regex to show in messages that don't mark a position within */
242#define RegexLengthToShowInErrorMessages 127
243
244/*
245 * If MARKER[12] are adjusted, be sure to adjust the constants at the top
246 * of t/op/regmesg.t, the tests in t/op/re_tests, and those in
247 * op/pragma/warn/regcomp.
248 */
7253e4e3
RK
249#define MARKER1 "<-- HERE" /* marker as it appears in the description */
250#define MARKER2 " <-- HERE " /* marker as it appears within the regex */
b81d288d 251
7253e4e3 252#define REPORT_LOCATION " in regex; marked by " MARKER1 " in m/%.*s" MARKER2 "%s/"
b45f050a
JF
253
254/*
255 * Calls SAVEDESTRUCTOR_X if needed, then calls Perl_croak with the given
256 * arg. Show regex, up to a maximum length. If it's too long, chop and add
257 * "...".
258 */
a4eb266f 259#define FAIL(msg) \
8615cb43 260 STMT_START { \
a4eb266f 261 char *ellipses = ""; \
7a799e20 262 IV len = RExC_end - RExC_precomp; \
b45f050a 263 \
8615cb43 264 if (!SIZE_ONLY) \
830247a4 265 SAVEDESTRUCTOR_X(clear_re,(void*)RExC_rx); \
b45f050a
JF
266 \
267 if (len > RegexLengthToShowInErrorMessages) { \
268 /* chop 10 shorter than the max, to ensure meaning of "..." */ \
269 len = RegexLengthToShowInErrorMessages - 10; \
a4eb266f 270 ellipses = "..."; \
b45f050a
JF
271 } \
272 Perl_croak(aTHX_ "%s in regex m/%.*s%s/", \
830247a4 273 msg, (int)len, RExC_precomp, ellipses); \
8615cb43
JF
274 } STMT_END
275
b45f050a
JF
276/*
277 * Calls SAVEDESTRUCTOR_X if needed, then calls Perl_croak with the given
278 * args. Show regex, up to a maximum length. If it's too long, chop and add
279 * "...".
280 */
a4eb266f 281#define FAIL2(pat,msg) \
8615cb43 282 STMT_START { \
a4eb266f 283 char *ellipses = ""; \
7a799e20 284 IV len = RExC_end - RExC_precomp; \
b45f050a 285 \
8615cb43 286 if (!SIZE_ONLY) \
830247a4 287 SAVEDESTRUCTOR_X(clear_re,(void*)RExC_rx); \
8615cb43 288 \
b45f050a
JF
289 if (len > RegexLengthToShowInErrorMessages) { \
290 /* chop 10 shorter than the max, to ensure meaning of "..." */ \
291 len = RegexLengthToShowInErrorMessages - 10; \
a4eb266f 292 ellipses = "..."; \
b45f050a
JF
293 } \
294 S_re_croak2(aTHX_ pat, " in regex m/%.*s%s/", \
830247a4 295 msg, (int)len, RExC_precomp, ellipses); \
b45f050a
JF
296 } STMT_END
297
298
299/*
300 * Simple_vFAIL -- like FAIL, but marks the current location in the scan
301 */
302#define Simple_vFAIL(m) \
303 STMT_START { \
7a799e20 304 IV offset = RExC_parse - RExC_precomp; \
b45f050a
JF
305 \
306 Perl_croak(aTHX_ "%s" REPORT_LOCATION, \
830247a4 307 m, (int)offset, RExC_precomp, RExC_precomp + offset); \
b45f050a
JF
308 } STMT_END
309
310/*
311 * Calls SAVEDESTRUCTOR_X if needed, then Simple_vFAIL()
312 */
313#define vFAIL(m) \
314 STMT_START { \
315 if (!SIZE_ONLY) \
830247a4 316 SAVEDESTRUCTOR_X(clear_re,(void*)RExC_rx); \
b45f050a
JF
317 Simple_vFAIL(m); \
318 } STMT_END
319
320/*
321 * Like Simple_vFAIL(), but accepts two arguments.
322 */
323#define Simple_vFAIL2(m,a1) \
324 STMT_START { \
7a799e20 325 IV offset = RExC_parse - RExC_precomp; \
b45f050a
JF
326 \
327 S_re_croak2(aTHX_ m, REPORT_LOCATION, a1, \
830247a4 328 (int)offset, RExC_precomp, RExC_precomp + offset); \
b45f050a
JF
329 } STMT_END
330
331/*
332 * Calls SAVEDESTRUCTOR_X if needed, then Simple_vFAIL2().
333 */
334#define vFAIL2(m,a1) \
335 STMT_START { \
336 if (!SIZE_ONLY) \
830247a4 337 SAVEDESTRUCTOR_X(clear_re,(void*)RExC_rx); \
b45f050a
JF
338 Simple_vFAIL2(m, a1); \
339 } STMT_END
340
341
342/*
343 * Like Simple_vFAIL(), but accepts three arguments.
344 */
345#define Simple_vFAIL3(m, a1, a2) \
346 STMT_START { \
7a799e20 347 IV offset = RExC_parse - RExC_precomp; \
b45f050a
JF
348 \
349 S_re_croak2(aTHX_ m, REPORT_LOCATION, a1, a2, \
830247a4 350 (int)offset, RExC_precomp, RExC_precomp + offset); \
b45f050a
JF
351 } STMT_END
352
353/*
354 * Calls SAVEDESTRUCTOR_X if needed, then Simple_vFAIL3().
355 */
356#define vFAIL3(m,a1,a2) \
357 STMT_START { \
358 if (!SIZE_ONLY) \
830247a4 359 SAVEDESTRUCTOR_X(clear_re,(void*)RExC_rx); \
b45f050a
JF
360 Simple_vFAIL3(m, a1, a2); \
361 } STMT_END
362
363/*
364 * Like Simple_vFAIL(), but accepts four arguments.
365 */
366#define Simple_vFAIL4(m, a1, a2, a3) \
367 STMT_START { \
7a799e20 368 IV offset = RExC_parse - RExC_precomp; \
b45f050a
JF
369 \
370 S_re_croak2(aTHX_ m, REPORT_LOCATION, a1, a2, a3,\
830247a4 371 (int)offset, RExC_precomp, RExC_precomp + offset); \
b45f050a
JF
372 } STMT_END
373
374/*
375 * Like Simple_vFAIL(), but accepts five arguments.
376 */
377#define Simple_vFAIL5(m, a1, a2, a3, a4) \
378 STMT_START { \
7a799e20 379 IV offset = RExC_parse - RExC_precomp; \
b45f050a 380 S_re_croak2(aTHX_ m, REPORT_LOCATION, a1, a2, a3, a4,\
830247a4 381 (int)offset, RExC_precomp, RExC_precomp + offset); \
b45f050a
JF
382 } STMT_END
383
384
385#define vWARN(loc,m) \
386 STMT_START { \
7a799e20 387 IV offset = loc - RExC_precomp; \
b45f050a 388 Perl_warner(aTHX_ WARN_REGEXP, "%s" REPORT_LOCATION,\
830247a4 389 m, (int)offset, RExC_precomp, RExC_precomp + offset); \
b45f050a
JF
390 } STMT_END \
391
f9373011
PM
392#define vWARNdep(loc,m) \
393 STMT_START { \
7a799e20 394 IV offset = loc - RExC_precomp; \
f9373011
PM
395 int warn_cat = ckWARN(WARN_REGEXP) ? WARN_REGEXP : WARN_DEPRECATED; \
396 Perl_warner(aTHX_ warn_cat, "%s" REPORT_LOCATION,\
397 m, (int)offset, RExC_precomp, RExC_precomp + offset); \
398 } STMT_END \
399
b45f050a
JF
400
401#define vWARN2(loc, m, a1) \
402 STMT_START { \
7a799e20 403 IV offset = loc - RExC_precomp; \
b45f050a
JF
404 Perl_warner(aTHX_ WARN_REGEXP, m REPORT_LOCATION,\
405 a1, \
830247a4 406 (int)offset, RExC_precomp, RExC_precomp + offset); \
b45f050a
JF
407 } STMT_END
408
409#define vWARN3(loc, m, a1, a2) \
410 STMT_START { \
7a799e20 411 IV offset = loc - RExC_precomp; \
b45f050a
JF
412 Perl_warner(aTHX_ WARN_REGEXP, m REPORT_LOCATION, \
413 a1, a2, \
830247a4 414 (int)offset, RExC_precomp, RExC_precomp + offset); \
b45f050a
JF
415 } STMT_END
416
417#define vWARN4(loc, m, a1, a2, a3) \
418 STMT_START { \
7a799e20 419 IV offset = loc - RExC_precomp; \
b45f050a
JF
420 Perl_warner(aTHX_ WARN_REGEXP, m REPORT_LOCATION,\
421 a1, a2, a3, \
830247a4 422 (int)offset, RExC_precomp, RExC_precomp + offset); \
8615cb43
JF
423 } STMT_END
424
9d1d55b5
JP
425/* used for the parse_flags section for (?c) -- japhy */
426#define vWARN5(loc, m, a1, a2, a3, a4) \
427 STMT_START { \
7a799e20 428 IV offset = loc - RExC_precomp; \
9d1d55b5
JP
429 Perl_warner(aTHX_ WARN_REGEXP, m REPORT_LOCATION, \
430 a1, a2, a3, a4, \
431 (int)offset, RExC_precomp, RExC_precomp + offset); \
432 } STMT_END
433
8615cb43 434
cd439c50 435/* Allow for side effects in s */
fd3f0ae2 436#define REGC(c,s) STMT_START { if (!SIZE_ONLY) *(s) = (c); else (void)(s);} STMT_END
cd439c50 437
fac92740
MJD
438/* Macros for recording node offsets. 20001227 mjd@plover.com
439 * Nodes are numbered 1, 2, 3, 4. Node #n's position is recorded in
440 * element 2*n-1 of the array. Element #2n holds the byte length node #n.
441 * Element 0 holds the number n.
442 */
443
444#define MJD_OFFSET_DEBUG(x)
445/* #define MJD_OFFSET_DEBUG(x) fprintf x */
446
447
448# define Set_Node_Offset_To_R(node,byte) \
449 STMT_START { \
450 if (! SIZE_ONLY) { \
451 if((node) < 0) { \
452 Perl_croak(aTHX_ "value of node is %d in Offset macro", node); \
453 } else { \
454 RExC_offsets[2*(node)-1] = (byte); \
455 } \
456 } \
457 } STMT_END
458
459# define Set_Node_Offset(node,byte) Set_Node_Offset_To_R((node)-RExC_emit_start, (byte)-RExC_start)
460# define Set_Cur_Node_Offset Set_Node_Offset(RExC_emit, RExC_parse)
461
462# define Set_Node_Length_To_R(node,len) \
463 STMT_START { \
464 if (! SIZE_ONLY) { \
465 MJD_OFFSET_DEBUG((stderr, "** (%d) size of node %d is %d.\n", __LINE__, (node), (len))); \
466 if((node) < 0) { \
467 Perl_croak(aTHX_ "value of node is %d in Length macro", node); \
468 } else { \
469 RExC_offsets[2*(node)] = (len); \
470 } \
471 } \
472 } STMT_END
473
474# define Set_Node_Length(node,len) Set_Node_Length_To_R((node)-RExC_emit_start, len)
475# define Set_Cur_Node_Length(len) Set_Node_Length(RExC_emit, len)
476# define Set_Node_Cur_Length(node) Set_Node_Length(node, RExC_parse - parse_start)
477
478/* Get offsets and lengths */
479#define Node_Offset(n) (RExC_offsets[2*((n)-RExC_emit_start)-1])
480#define Node_Length(n) (RExC_offsets[2*((n)-RExC_emit_start)])
481
acfe0abc 482static void clear_re(pTHX_ void *r);
4327152a 483
653099ff
GS
484/* Mark that we cannot extend a found fixed substring at this point.
485 Updata the longest found anchored substring and the longest found
486 floating substrings if needed. */
487
4327152a 488STATIC void
830247a4 489S_scan_commit(pTHX_ RExC_state_t *pRExC_state, scan_data_t *data)
c277df42 490{
a0ed51b3
LW
491 STRLEN l = CHR_SVLEN(data->last_found);
492 STRLEN old_l = CHR_SVLEN(*data->longest);
b81d288d 493
c277df42
IZ
494 if ((l >= old_l) && ((l > old_l) || (data->flags & SF_BEFORE_EOL))) {
495 sv_setsv(*data->longest, data->last_found);
496 if (*data->longest == data->longest_fixed) {
497 data->offset_fixed = l ? data->last_start_min : data->pos_min;
498 if (data->flags & SF_BEFORE_EOL)
b81d288d 499 data->flags
c277df42
IZ
500 |= ((data->flags & SF_BEFORE_EOL) << SF_FIX_SHIFT_EOL);
501 else
502 data->flags &= ~SF_FIX_BEFORE_EOL;
a0ed51b3
LW
503 }
504 else {
c277df42 505 data->offset_float_min = l ? data->last_start_min : data->pos_min;
b81d288d
AB
506 data->offset_float_max = (l
507 ? data->last_start_max
c277df42
IZ
508 : data->pos_min + data->pos_delta);
509 if (data->flags & SF_BEFORE_EOL)
b81d288d 510 data->flags
c277df42
IZ
511 |= ((data->flags & SF_BEFORE_EOL) << SF_FL_SHIFT_EOL);
512 else
513 data->flags &= ~SF_FL_BEFORE_EOL;
514 }
515 }
516 SvCUR_set(data->last_found, 0);
517 data->last_end = -1;
518 data->flags &= ~SF_BEFORE_EOL;
519}
520
653099ff
GS
521/* Can match anything (initialization) */
522STATIC void
830247a4 523S_cl_anything(pTHX_ RExC_state_t *pRExC_state, struct regnode_charclass_class *cl)
653099ff 524{
9af0acdb
JH
525 int value;
526
653099ff 527 ANYOF_CLASS_ZERO(cl);
9af0acdb
JH
528 for (value = 0; value < 256; ++value)
529 ANYOF_BITMAP_SET(cl, value);
1aa99e6b 530 cl->flags = ANYOF_EOS|ANYOF_UNICODE_ALL;
653099ff
GS
531 if (LOC)
532 cl->flags |= ANYOF_LOCALE;
533}
534
535/* Can match anything (initialization) */
536STATIC int
537S_cl_is_anything(pTHX_ struct regnode_charclass_class *cl)
538{
539 int value;
540
aaa51d5e 541 for (value = 0; value <= ANYOF_MAX; value += 2)
653099ff
GS
542 if (ANYOF_CLASS_TEST(cl, value) && ANYOF_CLASS_TEST(cl, value + 1))
543 return 1;
1aa99e6b
IH
544 if (!(cl->flags & ANYOF_UNICODE_ALL))
545 return 0;
9af0acdb
JH
546 for (value = 0; value < 256; ++value)
547 if (!ANYOF_BITMAP_TEST(cl, value))
653099ff
GS
548 return 0;
549 return 1;
550}
551
552/* Can match anything (initialization) */
553STATIC void
830247a4 554S_cl_init(pTHX_ RExC_state_t *pRExC_state, struct regnode_charclass_class *cl)
653099ff 555{
8ecf7187 556 Zero(cl, 1, struct regnode_charclass_class);
653099ff 557 cl->type = ANYOF;
830247a4 558 cl_anything(pRExC_state, cl);
653099ff
GS
559}
560
561STATIC void
830247a4 562S_cl_init_zero(pTHX_ RExC_state_t *pRExC_state, struct regnode_charclass_class *cl)
653099ff 563{
8ecf7187 564 Zero(cl, 1, struct regnode_charclass_class);
653099ff 565 cl->type = ANYOF;
830247a4 566 cl_anything(pRExC_state, cl);
653099ff
GS
567 if (LOC)
568 cl->flags |= ANYOF_LOCALE;
569}
570
571/* 'And' a given class with another one. Can create false positives */
572/* We assume that cl is not inverted */
573STATIC void
574S_cl_and(pTHX_ struct regnode_charclass_class *cl,
575 struct regnode_charclass_class *and_with)
576{
653099ff
GS
577 if (!(and_with->flags & ANYOF_CLASS)
578 && !(cl->flags & ANYOF_CLASS)
579 && (and_with->flags & ANYOF_LOCALE) == (cl->flags & ANYOF_LOCALE)
580 && !(and_with->flags & ANYOF_FOLD)
581 && !(cl->flags & ANYOF_FOLD)) {
582 int i;
583
584 if (and_with->flags & ANYOF_INVERT)
585 for (i = 0; i < ANYOF_BITMAP_SIZE; i++)
586 cl->bitmap[i] &= ~and_with->bitmap[i];
587 else
588 for (i = 0; i < ANYOF_BITMAP_SIZE; i++)
589 cl->bitmap[i] &= and_with->bitmap[i];
590 } /* XXXX: logic is complicated otherwise, leave it along for a moment. */
591 if (!(and_with->flags & ANYOF_EOS))
592 cl->flags &= ~ANYOF_EOS;
1aa99e6b
IH
593
594 if (cl->flags & ANYOF_UNICODE_ALL && and_with->flags & ANYOF_UNICODE) {
595 cl->flags &= ~ANYOF_UNICODE_ALL;
596 cl->flags |= ANYOF_UNICODE;
597 ARG_SET(cl, ARG(and_with));
598 }
599 if (!(and_with->flags & ANYOF_UNICODE_ALL))
600 cl->flags &= ~ANYOF_UNICODE_ALL;
601 if (!(and_with->flags & (ANYOF_UNICODE|ANYOF_UNICODE_ALL)))
602 cl->flags &= ~ANYOF_UNICODE;
653099ff
GS
603}
604
605/* 'OR' a given class with another one. Can create false positives */
606/* We assume that cl is not inverted */
607STATIC void
830247a4 608S_cl_or(pTHX_ RExC_state_t *pRExC_state, struct regnode_charclass_class *cl, struct regnode_charclass_class *or_with)
653099ff 609{
653099ff
GS
610 if (or_with->flags & ANYOF_INVERT) {
611 /* We do not use
612 * (B1 | CL1) | (!B2 & !CL2) = (B1 | !B2 & !CL2) | (CL1 | (!B2 & !CL2))
613 * <= (B1 | !B2) | (CL1 | !CL2)
614 * which is wasteful if CL2 is small, but we ignore CL2:
615 * (B1 | CL1) | (!B2 & !CL2) <= (B1 | CL1) | !B2 = (B1 | !B2) | CL1
616 * XXXX Can we handle case-fold? Unclear:
617 * (OK1(i) | OK1(i')) | !(OK1(i) | OK1(i')) =
618 * (OK1(i) | OK1(i')) | (!OK1(i) & !OK1(i'))
619 */
620 if ( (or_with->flags & ANYOF_LOCALE) == (cl->flags & ANYOF_LOCALE)
621 && !(or_with->flags & ANYOF_FOLD)
622 && !(cl->flags & ANYOF_FOLD) ) {
623 int i;
624
625 for (i = 0; i < ANYOF_BITMAP_SIZE; i++)
626 cl->bitmap[i] |= ~or_with->bitmap[i];
627 } /* XXXX: logic is complicated otherwise */
628 else {
830247a4 629 cl_anything(pRExC_state, cl);
653099ff
GS
630 }
631 } else {
632 /* (B1 | CL1) | (B2 | CL2) = (B1 | B2) | (CL1 | CL2)) */
633 if ( (or_with->flags & ANYOF_LOCALE) == (cl->flags & ANYOF_LOCALE)
b81d288d 634 && (!(or_with->flags & ANYOF_FOLD)
653099ff
GS
635 || (cl->flags & ANYOF_FOLD)) ) {
636 int i;
637
638 /* OR char bitmap and class bitmap separately */
639 for (i = 0; i < ANYOF_BITMAP_SIZE; i++)
640 cl->bitmap[i] |= or_with->bitmap[i];
641 if (or_with->flags & ANYOF_CLASS) {
642 for (i = 0; i < ANYOF_CLASSBITMAP_SIZE; i++)
643 cl->classflags[i] |= or_with->classflags[i];
644 cl->flags |= ANYOF_CLASS;
645 }
646 }
647 else { /* XXXX: logic is complicated, leave it along for a moment. */
830247a4 648 cl_anything(pRExC_state, cl);
653099ff
GS
649 }
650 }
651 if (or_with->flags & ANYOF_EOS)
652 cl->flags |= ANYOF_EOS;
1aa99e6b
IH
653
654 if (cl->flags & ANYOF_UNICODE && or_with->flags & ANYOF_UNICODE &&
655 ARG(cl) != ARG(or_with)) {
656 cl->flags |= ANYOF_UNICODE_ALL;
657 cl->flags &= ~ANYOF_UNICODE;
658 }
659 if (or_with->flags & ANYOF_UNICODE_ALL) {
660 cl->flags |= ANYOF_UNICODE_ALL;
661 cl->flags &= ~ANYOF_UNICODE;
662 }
653099ff
GS
663}
664
665/* REx optimizer. Converts nodes into quickier variants "in place".
666 Finds fixed substrings. */
667
c277df42
IZ
668/* Stops at toplevel WHILEM as well as at `last'. At end *scanp is set
669 to the position after last scanned or to NULL. */
670
76e3520e 671STATIC I32
830247a4 672S_study_chunk(pTHX_ RExC_state_t *pRExC_state, regnode **scanp, I32 *deltap, regnode *last, scan_data_t *data, U32 flags)
c277df42
IZ
673 /* scanp: Start here (read-write). */
674 /* deltap: Write maxlen-minlen here. */
675 /* last: Stop before this one. */
676{
677 I32 min = 0, pars = 0, code;
678 regnode *scan = *scanp, *next;
679 I32 delta = 0;
680 int is_inf = (flags & SCF_DO_SUBSTR) && (data->flags & SF_IS_INF);
aca2d497 681 int is_inf_internal = 0; /* The studied chunk is infinite */
c277df42
IZ
682 I32 is_par = OP(scan) == OPEN ? ARG(scan) : 0;
683 scan_data_t data_fake;
653099ff 684 struct regnode_charclass_class and_with; /* Valid if flags & SCF_DO_STCLASS_OR */
b81d288d 685
c277df42
IZ
686 while (scan && OP(scan) != END && scan < last) {
687 /* Peephole optimizer: */
688
22c35a8c 689 if (PL_regkind[(U8)OP(scan)] == EXACT) {
653099ff 690 /* Merge several consecutive EXACTish nodes into one. */
c277df42
IZ
691 regnode *n = regnext(scan);
692 U32 stringok = 1;
693#ifdef DEBUGGING
694 regnode *stop = scan;
b81d288d 695#endif
c277df42 696
cd439c50 697 next = scan + NODE_SZ_STR(scan);
c277df42
IZ
698 /* Skip NOTHING, merge EXACT*. */
699 while (n &&
b81d288d 700 ( PL_regkind[(U8)OP(n)] == NOTHING ||
c277df42
IZ
701 (stringok && (OP(n) == OP(scan))))
702 && NEXT_OFF(n)
703 && NEXT_OFF(scan) + NEXT_OFF(n) < I16_MAX) {
704 if (OP(n) == TAIL || n > next)
705 stringok = 0;
22c35a8c 706 if (PL_regkind[(U8)OP(n)] == NOTHING) {
c277df42
IZ
707 NEXT_OFF(scan) += NEXT_OFF(n);
708 next = n + NODE_STEP_REGNODE;
709#ifdef DEBUGGING
710 if (stringok)
711 stop = n;
b81d288d 712#endif
c277df42 713 n = regnext(n);
a0ed51b3 714 }
f49d4d0f 715 else if (stringok) {
cd439c50 716 int oldl = STR_LEN(scan);
c277df42 717 regnode *nnext = regnext(n);
f49d4d0f 718
b81d288d 719 if (oldl + STR_LEN(n) > U8_MAX)
c277df42
IZ
720 break;
721 NEXT_OFF(scan) += NEXT_OFF(n);
cd439c50
IZ
722 STR_LEN(scan) += STR_LEN(n);
723 next = n + NODE_SZ_STR(n);
c277df42 724 /* Now we can overwrite *n : */
f49d4d0f 725 Move(STRING(n), STRING(scan) + oldl, STR_LEN(n), char);
c277df42 726#ifdef DEBUGGING
f49d4d0f 727 stop = next - 1;
b81d288d 728#endif
c277df42
IZ
729 n = nnext;
730 }
731 }
732#ifdef DEBUGGING
733 /* Allow dumping */
cd439c50 734 n = scan + NODE_SZ_STR(scan);
c277df42 735 while (n <= stop) {
22c35a8c 736 if (PL_regkind[(U8)OP(n)] != NOTHING || OP(n) == NOTHING) {
c277df42
IZ
737 OP(n) = OPTIMIZED;
738 NEXT_OFF(n) = 0;
739 }
740 n++;
741 }
653099ff 742#endif
c277df42 743 }
653099ff
GS
744 /* Follow the next-chain of the current node and optimize
745 away all the NOTHINGs from it. */
c277df42 746 if (OP(scan) != CURLYX) {
048cfca1
GS
747 int max = (reg_off_by_arg[OP(scan)]
748 ? I32_MAX
749 /* I32 may be smaller than U16 on CRAYs! */
750 : (I32_MAX < U16_MAX ? I32_MAX : U16_MAX));
c277df42
IZ
751 int off = (reg_off_by_arg[OP(scan)] ? ARG(scan) : NEXT_OFF(scan));
752 int noff;
753 regnode *n = scan;
b81d288d 754
c277df42
IZ
755 /* Skip NOTHING and LONGJMP. */
756 while ((n = regnext(n))
22c35a8c 757 && ((PL_regkind[(U8)OP(n)] == NOTHING && (noff = NEXT_OFF(n)))
c277df42
IZ
758 || ((OP(n) == LONGJMP) && (noff = ARG(n))))
759 && off + noff < max)
760 off += noff;
761 if (reg_off_by_arg[OP(scan)])
762 ARG(scan) = off;
b81d288d 763 else
c277df42
IZ
764 NEXT_OFF(scan) = off;
765 }
653099ff
GS
766 /* The principal pseudo-switch. Cannot be a switch, since we
767 look into several different things. */
b81d288d 768 if (OP(scan) == BRANCH || OP(scan) == BRANCHJ
c277df42
IZ
769 || OP(scan) == IFTHEN || OP(scan) == SUSPEND) {
770 next = regnext(scan);
771 code = OP(scan);
b81d288d
AB
772
773 if (OP(next) == code || code == IFTHEN || code == SUSPEND) {
c277df42 774 I32 max1 = 0, min1 = I32_MAX, num = 0;
653099ff 775 struct regnode_charclass_class accum;
c277df42 776
653099ff 777 if (flags & SCF_DO_SUBSTR) /* XXXX Add !SUSPEND? */
830247a4 778 scan_commit(pRExC_state, data); /* Cannot merge strings after this. */
653099ff 779 if (flags & SCF_DO_STCLASS)
830247a4 780 cl_init_zero(pRExC_state, &accum);
c277df42 781 while (OP(scan) == code) {
830247a4 782 I32 deltanext, minnext, f = 0, fake;
653099ff 783 struct regnode_charclass_class this_class;
c277df42
IZ
784
785 num++;
786 data_fake.flags = 0;
b81d288d 787 if (data) {
2c2d71f5 788 data_fake.whilem_c = data->whilem_c;
cb434fcc
IZ
789 data_fake.last_closep = data->last_closep;
790 }
791 else
792 data_fake.last_closep = &fake;
c277df42
IZ
793 next = regnext(scan);
794 scan = NEXTOPER(scan);
795 if (code != BRANCH)
796 scan = NEXTOPER(scan);
653099ff 797 if (flags & SCF_DO_STCLASS) {
830247a4 798 cl_init(pRExC_state, &this_class);
653099ff
GS
799 data_fake.start_class = &this_class;
800 f = SCF_DO_STCLASS_AND;
b81d288d 801 }
e1901655
IZ
802 if (flags & SCF_WHILEM_VISITED_POS)
803 f |= SCF_WHILEM_VISITED_POS;
653099ff 804 /* we suppose the run is continuous, last=next...*/
830247a4
IZ
805 minnext = study_chunk(pRExC_state, &scan, &deltanext,
806 next, &data_fake, f);
b81d288d 807 if (min1 > minnext)
c277df42
IZ
808 min1 = minnext;
809 if (max1 < minnext + deltanext)
810 max1 = minnext + deltanext;
811 if (deltanext == I32_MAX)
aca2d497 812 is_inf = is_inf_internal = 1;
c277df42
IZ
813 scan = next;
814 if (data_fake.flags & (SF_HAS_PAR|SF_IN_PAR))
815 pars++;
405ff068 816 if (data && (data_fake.flags & SF_HAS_EVAL))
c277df42 817 data->flags |= SF_HAS_EVAL;
2c2d71f5
JH
818 if (data)
819 data->whilem_c = data_fake.whilem_c;
653099ff 820 if (flags & SCF_DO_STCLASS)
830247a4 821 cl_or(pRExC_state, &accum, &this_class);
b81d288d 822 if (code == SUSPEND)
c277df42
IZ
823 break;
824 }
825 if (code == IFTHEN && num < 2) /* Empty ELSE branch */
826 min1 = 0;
827 if (flags & SCF_DO_SUBSTR) {
828 data->pos_min += min1;
829 data->pos_delta += max1 - min1;
830 if (max1 != min1 || is_inf)
831 data->longest = &(data->longest_float);
832 }
833 min += min1;
834 delta += max1 - min1;
653099ff 835 if (flags & SCF_DO_STCLASS_OR) {
830247a4 836 cl_or(pRExC_state, data->start_class, &accum);
653099ff
GS
837 if (min1) {
838 cl_and(data->start_class, &and_with);
839 flags &= ~SCF_DO_STCLASS;
840 }
841 }
842 else if (flags & SCF_DO_STCLASS_AND) {
de0c8cb8
GS
843 if (min1) {
844 cl_and(data->start_class, &accum);
653099ff 845 flags &= ~SCF_DO_STCLASS;
de0c8cb8
GS
846 }
847 else {
b81d288d 848 /* Switch to OR mode: cache the old value of
de0c8cb8
GS
849 * data->start_class */
850 StructCopy(data->start_class, &and_with,
851 struct regnode_charclass_class);
852 flags &= ~SCF_DO_STCLASS_AND;
853 StructCopy(&accum, data->start_class,
854 struct regnode_charclass_class);
855 flags |= SCF_DO_STCLASS_OR;
856 data->start_class->flags |= ANYOF_EOS;
857 }
653099ff 858 }
a0ed51b3
LW
859 }
860 else if (code == BRANCHJ) /* single branch is optimized. */
c277df42
IZ
861 scan = NEXTOPER(NEXTOPER(scan));
862 else /* single branch is optimized. */
863 scan = NEXTOPER(scan);
864 continue;
a0ed51b3
LW
865 }
866 else if (OP(scan) == EXACT) {
cd439c50 867 I32 l = STR_LEN(scan);
1aa99e6b 868 UV uc = *((U8*)STRING(scan));
a0ed51b3 869 if (UTF) {
1aa99e6b
IH
870 U8 *s = (U8*)STRING(scan);
871 l = utf8_length(s, s + l);
9041c2e3 872 uc = utf8_to_uvchr(s, NULL);
a0ed51b3
LW
873 }
874 min += l;
c277df42 875 if (flags & SCF_DO_SUBSTR) { /* Update longest substr. */
c277df42
IZ
876 /* The code below prefers earlier match for fixed
877 offset, later match for variable offset. */
878 if (data->last_end == -1) { /* Update the start info. */
879 data->last_start_min = data->pos_min;
880 data->last_start_max = is_inf
b81d288d 881 ? I32_MAX : data->pos_min + data->pos_delta;
c277df42 882 }
cd439c50 883 sv_catpvn(data->last_found, STRING(scan), STR_LEN(scan));
c277df42
IZ
884 data->last_end = data->pos_min + l;
885 data->pos_min += l; /* As in the first entry. */
886 data->flags &= ~SF_BEFORE_EOL;
887 }
653099ff
GS
888 if (flags & SCF_DO_STCLASS_AND) {
889 /* Check whether it is compatible with what we know already! */
890 int compat = 1;
891
1aa99e6b 892 if (uc >= 0x100 ||
516a5887 893 (!(data->start_class->flags & (ANYOF_CLASS | ANYOF_LOCALE))
1aa99e6b 894 && !ANYOF_BITMAP_TEST(data->start_class, uc)
653099ff 895 && (!(data->start_class->flags & ANYOF_FOLD)
1aa99e6b 896 || !ANYOF_BITMAP_TEST(data->start_class, PL_fold[uc])))
516a5887 897 )
653099ff
GS
898 compat = 0;
899 ANYOF_CLASS_ZERO(data->start_class);
900 ANYOF_BITMAP_ZERO(data->start_class);
901 if (compat)
1aa99e6b 902 ANYOF_BITMAP_SET(data->start_class, uc);
653099ff 903 data->start_class->flags &= ~ANYOF_EOS;
9b877dbb
IH
904 if (uc < 0x100)
905 data->start_class->flags &= ~ANYOF_UNICODE_ALL;
653099ff
GS
906 }
907 else if (flags & SCF_DO_STCLASS_OR) {
908 /* false positive possible if the class is case-folded */
1aa99e6b 909 if (uc < 0x100)
9b877dbb
IH
910 ANYOF_BITMAP_SET(data->start_class, uc);
911 else
912 data->start_class->flags |= ANYOF_UNICODE_ALL;
653099ff
GS
913 data->start_class->flags &= ~ANYOF_EOS;
914 cl_and(data->start_class, &and_with);
915 }
916 flags &= ~SCF_DO_STCLASS;
a0ed51b3 917 }
653099ff 918 else if (PL_regkind[(U8)OP(scan)] == EXACT) { /* But OP != EXACT! */
cd439c50 919 I32 l = STR_LEN(scan);
1aa99e6b 920 UV uc = *((U8*)STRING(scan));
653099ff
GS
921
922 /* Search for fixed substrings supports EXACT only. */
b81d288d 923 if (flags & SCF_DO_SUBSTR)
830247a4 924 scan_commit(pRExC_state, data);
a0ed51b3 925 if (UTF) {
1aa99e6b
IH
926 U8 *s = (U8 *)STRING(scan);
927 l = utf8_length(s, s + l);
9041c2e3 928 uc = utf8_to_uvchr(s, NULL);
a0ed51b3
LW
929 }
930 min += l;
c277df42 931 if (data && (flags & SCF_DO_SUBSTR))
a0ed51b3 932 data->pos_min += l;
653099ff
GS
933 if (flags & SCF_DO_STCLASS_AND) {
934 /* Check whether it is compatible with what we know already! */
935 int compat = 1;
936
1aa99e6b 937 if (uc >= 0x100 ||
516a5887 938 (!(data->start_class->flags & (ANYOF_CLASS | ANYOF_LOCALE))
1aa99e6b 939 && !ANYOF_BITMAP_TEST(data->start_class, uc)
516a5887 940 && !ANYOF_BITMAP_TEST(data->start_class, PL_fold[uc])))
653099ff
GS
941 compat = 0;
942 ANYOF_CLASS_ZERO(data->start_class);
943 ANYOF_BITMAP_ZERO(data->start_class);
944 if (compat) {
1aa99e6b 945 ANYOF_BITMAP_SET(data->start_class, uc);
653099ff
GS
946 data->start_class->flags &= ~ANYOF_EOS;
947 data->start_class->flags |= ANYOF_FOLD;
948 if (OP(scan) == EXACTFL)
949 data->start_class->flags |= ANYOF_LOCALE;
950 }
951 }
952 else if (flags & SCF_DO_STCLASS_OR) {
953 if (data->start_class->flags & ANYOF_FOLD) {
954 /* false positive possible if the class is case-folded.
955 Assume that the locale settings are the same... */
1aa99e6b
IH
956 if (uc < 0x100)
957 ANYOF_BITMAP_SET(data->start_class, uc);
653099ff
GS
958 data->start_class->flags &= ~ANYOF_EOS;
959 }
960 cl_and(data->start_class, &and_with);
961 }
962 flags &= ~SCF_DO_STCLASS;
a0ed51b3 963 }
4d61ec05 964 else if (strchr((char*)PL_varies,OP(scan))) {
9c5ffd7c 965 I32 mincount, maxcount, minnext, deltanext, fl = 0;
aa7a4b56 966 I32 f = flags, pos_before = 0;
c277df42 967 regnode *oscan = scan;
653099ff
GS
968 struct regnode_charclass_class this_class;
969 struct regnode_charclass_class *oclass = NULL;
727f22e3 970 I32 next_is_eval = 0;
653099ff 971
22c35a8c 972 switch (PL_regkind[(U8)OP(scan)]) {
653099ff 973 case WHILEM: /* End of (?:...)* . */
c277df42
IZ
974 scan = NEXTOPER(scan);
975 goto finish;
976 case PLUS:
653099ff 977 if (flags & (SCF_DO_SUBSTR | SCF_DO_STCLASS)) {
c277df42 978 next = NEXTOPER(scan);
653099ff 979 if (OP(next) == EXACT || (flags & SCF_DO_STCLASS)) {
b81d288d
AB
980 mincount = 1;
981 maxcount = REG_INFTY;
c277df42
IZ
982 next = regnext(scan);
983 scan = NEXTOPER(scan);
984 goto do_curly;
985 }
986 }
987 if (flags & SCF_DO_SUBSTR)
988 data->pos_min++;
989 min++;
990 /* Fall through. */
991 case STAR:
653099ff
GS
992 if (flags & SCF_DO_STCLASS) {
993 mincount = 0;
b81d288d 994 maxcount = REG_INFTY;
653099ff
GS
995 next = regnext(scan);
996 scan = NEXTOPER(scan);
997 goto do_curly;
998 }
b81d288d 999 is_inf = is_inf_internal = 1;
c277df42
IZ
1000 scan = regnext(scan);
1001 if (flags & SCF_DO_SUBSTR) {
830247a4 1002 scan_commit(pRExC_state, data); /* Cannot extend fixed substrings */
c277df42
IZ
1003 data->longest = &(data->longest_float);
1004 }
1005 goto optimize_curly_tail;
1006 case CURLY:
b81d288d 1007 mincount = ARG1(scan);
c277df42
IZ
1008 maxcount = ARG2(scan);
1009 next = regnext(scan);
cb434fcc
IZ
1010 if (OP(scan) == CURLYX) {
1011 I32 lp = (data ? *(data->last_closep) : 0);
1012
1013 scan->flags = ((lp <= U8_MAX) ? lp : U8_MAX);
1014 }
c277df42 1015 scan = NEXTOPER(scan) + EXTRA_STEP_2ARGS;
727f22e3 1016 next_is_eval = (OP(scan) == EVAL);
c277df42
IZ
1017 do_curly:
1018 if (flags & SCF_DO_SUBSTR) {
830247a4 1019 if (mincount == 0) scan_commit(pRExC_state,data); /* Cannot extend fixed substrings */
c277df42
IZ
1020 pos_before = data->pos_min;
1021 }
1022 if (data) {
1023 fl = data->flags;
1024 data->flags &= ~(SF_HAS_PAR|SF_IN_PAR|SF_HAS_EVAL);
1025 if (is_inf)
1026 data->flags |= SF_IS_INF;
1027 }
653099ff 1028 if (flags & SCF_DO_STCLASS) {
830247a4 1029 cl_init(pRExC_state, &this_class);
653099ff
GS
1030 oclass = data->start_class;
1031 data->start_class = &this_class;
1032 f |= SCF_DO_STCLASS_AND;
1033 f &= ~SCF_DO_STCLASS_OR;
1034 }
e1901655
IZ
1035 /* These are the cases when once a subexpression
1036 fails at a particular position, it cannot succeed
1037 even after backtracking at the enclosing scope.
b81d288d 1038
e1901655
IZ
1039 XXXX what if minimal match and we are at the
1040 initial run of {n,m}? */
1041 if ((mincount != maxcount - 1) && (maxcount != REG_INFTY))
1042 f &= ~SCF_WHILEM_VISITED_POS;
653099ff 1043
c277df42 1044 /* This will finish on WHILEM, setting scan, or on NULL: */
b81d288d
AB
1045 minnext = study_chunk(pRExC_state, &scan, &deltanext, last, data,
1046 mincount == 0
653099ff
GS
1047 ? (f & ~SCF_DO_SUBSTR) : f);
1048
1049 if (flags & SCF_DO_STCLASS)
1050 data->start_class = oclass;
1051 if (mincount == 0 || minnext == 0) {
1052 if (flags & SCF_DO_STCLASS_OR) {
830247a4 1053 cl_or(pRExC_state, data->start_class, &this_class);
653099ff
GS
1054 }
1055 else if (flags & SCF_DO_STCLASS_AND) {
b81d288d 1056 /* Switch to OR mode: cache the old value of
653099ff
GS
1057 * data->start_class */
1058 StructCopy(data->start_class, &and_with,
1059 struct regnode_charclass_class);
1060 flags &= ~SCF_DO_STCLASS_AND;
1061 StructCopy(&this_class, data->start_class,
1062 struct regnode_charclass_class);
1063 flags |= SCF_DO_STCLASS_OR;
1064 data->start_class->flags |= ANYOF_EOS;
1065 }
1066 } else { /* Non-zero len */
1067 if (flags & SCF_DO_STCLASS_OR) {
830247a4 1068 cl_or(pRExC_state, data->start_class, &this_class);
653099ff
GS
1069 cl_and(data->start_class, &and_with);
1070 }
1071 else if (flags & SCF_DO_STCLASS_AND)
1072 cl_and(data->start_class, &this_class);
1073 flags &= ~SCF_DO_STCLASS;
1074 }
c277df42
IZ
1075 if (!scan) /* It was not CURLYX, but CURLY. */
1076 scan = next;
84037bb0 1077 if (ckWARN(WARN_REGEXP)
727f22e3
JP
1078 /* ? quantifier ok, except for (?{ ... }) */
1079 && (next_is_eval || !(mincount == 0 && maxcount == 1))
84037bb0 1080 && (minnext == 0) && (deltanext == 0)
99799961 1081 && data && !(data->flags & (SF_HAS_PAR|SF_IN_PAR))
17feb5d5 1082 && maxcount <= REG_INFTY/3) /* Complement check for big count */
b45f050a 1083 {
830247a4 1084 vWARN(RExC_parse,
b45f050a
JF
1085 "Quantifier unexpected on zero-length expression");
1086 }
1087
c277df42 1088 min += minnext * mincount;
b81d288d 1089 is_inf_internal |= ((maxcount == REG_INFTY
155aba94
GS
1090 && (minnext + deltanext) > 0)
1091 || deltanext == I32_MAX);
aca2d497 1092 is_inf |= is_inf_internal;
c277df42
IZ
1093 delta += (minnext + deltanext) * maxcount - minnext * mincount;
1094
1095 /* Try powerful optimization CURLYX => CURLYN. */
b81d288d 1096 if ( OP(oscan) == CURLYX && data
c277df42
IZ
1097 && data->flags & SF_IN_PAR
1098 && !(data->flags & SF_HAS_EVAL)
1099 && !deltanext && minnext == 1 ) {
1100 /* Try to optimize to CURLYN. */
1101 regnode *nxt = NEXTOPER(oscan) + EXTRA_STEP_2ARGS;
497b47a8
JH
1102 regnode *nxt1 = nxt;
1103#ifdef DEBUGGING
1104 regnode *nxt2;
1105#endif
c277df42
IZ
1106
1107 /* Skip open. */
1108 nxt = regnext(nxt);
4d61ec05 1109 if (!strchr((char*)PL_simple,OP(nxt))
22c35a8c 1110 && !(PL_regkind[(U8)OP(nxt)] == EXACT
b81d288d 1111 && STR_LEN(nxt) == 1))
c277df42 1112 goto nogo;
497b47a8 1113#ifdef DEBUGGING
c277df42 1114 nxt2 = nxt;
497b47a8 1115#endif
c277df42 1116 nxt = regnext(nxt);
b81d288d 1117 if (OP(nxt) != CLOSE)
c277df42
IZ
1118 goto nogo;
1119 /* Now we know that nxt2 is the only contents: */
1120 oscan->flags = ARG(nxt);
1121 OP(oscan) = CURLYN;
1122 OP(nxt1) = NOTHING; /* was OPEN. */
1123#ifdef DEBUGGING
1124 OP(nxt1 + 1) = OPTIMIZED; /* was count. */
1125 NEXT_OFF(nxt1+ 1) = 0; /* just for consistancy. */
1126 NEXT_OFF(nxt2) = 0; /* just for consistancy with CURLY. */
1127 OP(nxt) = OPTIMIZED; /* was CLOSE. */
1128 OP(nxt + 1) = OPTIMIZED; /* was count. */
1129 NEXT_OFF(nxt+ 1) = 0; /* just for consistancy. */
b81d288d 1130#endif
c277df42 1131 }
c277df42
IZ
1132 nogo:
1133
1134 /* Try optimization CURLYX => CURLYM. */
b81d288d 1135 if ( OP(oscan) == CURLYX && data
c277df42 1136 && !(data->flags & SF_HAS_PAR)
c277df42
IZ
1137 && !(data->flags & SF_HAS_EVAL)
1138 && !deltanext ) {
1139 /* XXXX How to optimize if data == 0? */
1140 /* Optimize to a simpler form. */
1141 regnode *nxt = NEXTOPER(oscan) + EXTRA_STEP_2ARGS; /* OPEN */
1142 regnode *nxt2;
1143
1144 OP(oscan) = CURLYM;
1145 while ( (nxt2 = regnext(nxt)) /* skip over embedded stuff*/
b81d288d 1146 && (OP(nxt2) != WHILEM))
c277df42
IZ
1147 nxt = nxt2;
1148 OP(nxt2) = SUCCEED; /* Whas WHILEM */
c277df42
IZ
1149 /* Need to optimize away parenths. */
1150 if (data->flags & SF_IN_PAR) {
1151 /* Set the parenth number. */
1152 regnode *nxt1 = NEXTOPER(oscan) + EXTRA_STEP_2ARGS; /* OPEN*/
1153
b81d288d 1154 if (OP(nxt) != CLOSE)
b45f050a 1155 FAIL("Panic opt close");
c277df42
IZ
1156 oscan->flags = ARG(nxt);
1157 OP(nxt1) = OPTIMIZED; /* was OPEN. */
1158 OP(nxt) = OPTIMIZED; /* was CLOSE. */
1159#ifdef DEBUGGING
1160 OP(nxt1 + 1) = OPTIMIZED; /* was count. */
1161 OP(nxt + 1) = OPTIMIZED; /* was count. */
1162 NEXT_OFF(nxt1 + 1) = 0; /* just for consistancy. */
1163 NEXT_OFF(nxt + 1) = 0; /* just for consistancy. */
b81d288d 1164#endif
c277df42
IZ
1165#if 0
1166 while ( nxt1 && (OP(nxt1) != WHILEM)) {
1167 regnode *nnxt = regnext(nxt1);
b81d288d 1168
c277df42
IZ
1169 if (nnxt == nxt) {
1170 if (reg_off_by_arg[OP(nxt1)])
1171 ARG_SET(nxt1, nxt2 - nxt1);
1172 else if (nxt2 - nxt1 < U16_MAX)
1173 NEXT_OFF(nxt1) = nxt2 - nxt1;
1174 else
1175 OP(nxt) = NOTHING; /* Cannot beautify */
1176 }
1177 nxt1 = nnxt;
1178 }
1179#endif
1180 /* Optimize again: */
b81d288d 1181 study_chunk(pRExC_state, &nxt1, &deltanext, nxt,
e1901655 1182 NULL, 0);
a0ed51b3
LW
1183 }
1184 else
c277df42 1185 oscan->flags = 0;
c277df42 1186 }
e1901655
IZ
1187 else if ((OP(oscan) == CURLYX)
1188 && (flags & SCF_WHILEM_VISITED_POS)
1189 /* See the comment on a similar expression above.
1190 However, this time it not a subexpression
1191 we care about, but the expression itself. */
1192 && (maxcount == REG_INFTY)
1193 && data && ++data->whilem_c < 16) {
1194 /* This stays as CURLYX, we can put the count/of pair. */
2c2d71f5
JH
1195 /* Find WHILEM (as in regexec.c) */
1196 regnode *nxt = oscan + NEXT_OFF(oscan);
1197
1198 if (OP(PREVOPER(nxt)) == NOTHING) /* LONGJMP */
1199 nxt += ARG(nxt);
1200 PREVOPER(nxt)->flags = data->whilem_c
830247a4 1201 | (RExC_whilem_seen << 4); /* On WHILEM */
2c2d71f5 1202 }
b81d288d 1203 if (data && fl & (SF_HAS_PAR|SF_IN_PAR))
c277df42
IZ
1204 pars++;
1205 if (flags & SCF_DO_SUBSTR) {
1206 SV *last_str = Nullsv;
1207 int counted = mincount != 0;
1208
1209 if (data->last_end > 0 && mincount != 0) { /* Ends with a string. */
b81d288d 1210 I32 b = pos_before >= data->last_start_min
c277df42
IZ
1211 ? pos_before : data->last_start_min;
1212 STRLEN l;
1213 char *s = SvPV(data->last_found, l);
a0ed51b3
LW
1214 I32 old = b - data->last_start_min;
1215
1216 if (UTF)
1217 old = utf8_hop((U8*)s, old) - (U8*)s;
c277df42 1218
a0ed51b3 1219 l -= old;
c277df42 1220 /* Get the added string: */
79cb57f6 1221 last_str = newSVpvn(s + old, l);
c277df42
IZ
1222 if (deltanext == 0 && pos_before == b) {
1223 /* What was added is a constant string */
1224 if (mincount > 1) {
1225 SvGROW(last_str, (mincount * l) + 1);
b81d288d 1226 repeatcpy(SvPVX(last_str) + l,
c277df42
IZ
1227 SvPVX(last_str), l, mincount - 1);
1228 SvCUR(last_str) *= mincount;
1229 /* Add additional parts. */
b81d288d 1230 SvCUR_set(data->last_found,
c277df42
IZ
1231 SvCUR(data->last_found) - l);
1232 sv_catsv(data->last_found, last_str);
1233 data->last_end += l * (mincount - 1);
1234 }
2a8d9689
HS
1235 } else {
1236 /* start offset must point into the last copy */
1237 data->last_start_min += minnext * (mincount - 1);
4b2cff9a
HS
1238 data->last_start_max += is_inf ? 0 : (maxcount - 1)
1239 * (minnext + data->pos_delta);
c277df42
IZ
1240 }
1241 }
1242 /* It is counted once already... */
1243 data->pos_min += minnext * (mincount - counted);
1244 data->pos_delta += - counted * deltanext +
1245 (minnext + deltanext) * maxcount - minnext * mincount;
1246 if (mincount != maxcount) {
653099ff
GS
1247 /* Cannot extend fixed substrings found inside
1248 the group. */
830247a4 1249 scan_commit(pRExC_state,data);
c277df42
IZ
1250 if (mincount && last_str) {
1251 sv_setsv(data->last_found, last_str);
1252 data->last_end = data->pos_min;
b81d288d 1253 data->last_start_min =
a0ed51b3 1254 data->pos_min - CHR_SVLEN(last_str);
b81d288d
AB
1255 data->last_start_max = is_inf
1256 ? I32_MAX
c277df42 1257 : data->pos_min + data->pos_delta
a0ed51b3 1258 - CHR_SVLEN(last_str);
c277df42
IZ
1259 }
1260 data->longest = &(data->longest_float);
1261 }
aca2d497 1262 SvREFCNT_dec(last_str);
c277df42 1263 }
405ff068 1264 if (data && (fl & SF_HAS_EVAL))
c277df42
IZ
1265 data->flags |= SF_HAS_EVAL;
1266 optimize_curly_tail:
c277df42 1267 if (OP(oscan) != CURLYX) {
22c35a8c 1268 while (PL_regkind[(U8)OP(next = regnext(oscan))] == NOTHING
c277df42
IZ
1269 && NEXT_OFF(next))
1270 NEXT_OFF(oscan) += NEXT_OFF(next);
1271 }
c277df42 1272 continue;
653099ff 1273 default: /* REF and CLUMP only? */
c277df42 1274 if (flags & SCF_DO_SUBSTR) {
830247a4 1275 scan_commit(pRExC_state,data); /* Cannot expect anything... */
c277df42
IZ
1276 data->longest = &(data->longest_float);
1277 }
aca2d497 1278 is_inf = is_inf_internal = 1;
653099ff 1279 if (flags & SCF_DO_STCLASS_OR)
830247a4 1280 cl_anything(pRExC_state, data->start_class);
653099ff 1281 flags &= ~SCF_DO_STCLASS;
c277df42
IZ
1282 break;
1283 }
a0ed51b3 1284 }
ffc61ed2 1285 else if (strchr((char*)PL_simple,OP(scan))) {
9c5ffd7c 1286 int value = 0;
653099ff 1287
c277df42 1288 if (flags & SCF_DO_SUBSTR) {
830247a4 1289 scan_commit(pRExC_state,data);
c277df42
IZ
1290 data->pos_min++;
1291 }
1292 min++;
653099ff
GS
1293 if (flags & SCF_DO_STCLASS) {
1294 data->start_class->flags &= ~ANYOF_EOS; /* No match on empty */
1295
1296 /* Some of the logic below assumes that switching
1297 locale on will only add false positives. */
1298 switch (PL_regkind[(U8)OP(scan)]) {
653099ff 1299 case SANY:
653099ff
GS
1300 default:
1301 do_default:
1302 /* Perl_croak(aTHX_ "panic: unexpected simple REx opcode %d", OP(scan)); */
1303 if (flags & SCF_DO_STCLASS_OR) /* Allow everything */
830247a4 1304 cl_anything(pRExC_state, data->start_class);
653099ff
GS
1305 break;
1306 case REG_ANY:
1307 if (OP(scan) == SANY)
1308 goto do_default;
1309 if (flags & SCF_DO_STCLASS_OR) { /* Everything but \n */
1310 value = (ANYOF_BITMAP_TEST(data->start_class,'\n')
1311 || (data->start_class->flags & ANYOF_CLASS));
830247a4 1312 cl_anything(pRExC_state, data->start_class);
653099ff
GS
1313 }
1314 if (flags & SCF_DO_STCLASS_AND || !value)
1315 ANYOF_BITMAP_CLEAR(data->start_class,'\n');
1316 break;
1317 case ANYOF:
1318 if (flags & SCF_DO_STCLASS_AND)
1319 cl_and(data->start_class,
1320 (struct regnode_charclass_class*)scan);
1321 else
830247a4 1322 cl_or(pRExC_state, data->start_class,
653099ff
GS
1323 (struct regnode_charclass_class*)scan);
1324 break;
1325 case ALNUM:
1326 if (flags & SCF_DO_STCLASS_AND) {
1327 if (!(data->start_class->flags & ANYOF_LOCALE)) {
1328 ANYOF_CLASS_CLEAR(data->start_class,ANYOF_NALNUM);
1329 for (value = 0; value < 256; value++)
1330 if (!isALNUM(value))
1331 ANYOF_BITMAP_CLEAR(data->start_class, value);
1332 }
1333 }
1334 else {
1335 if (data->start_class->flags & ANYOF_LOCALE)
1336 ANYOF_CLASS_SET(data->start_class,ANYOF_ALNUM);
1337 else {
1338 for (value = 0; value < 256; value++)
1339 if (isALNUM(value))
b81d288d 1340 ANYOF_BITMAP_SET(data->start_class, value);
653099ff
GS
1341 }
1342 }
1343 break;
1344 case ALNUML:
1345 if (flags & SCF_DO_STCLASS_AND) {
1346 if (data->start_class->flags & ANYOF_LOCALE)
1347 ANYOF_CLASS_CLEAR(data->start_class,ANYOF_NALNUM);
1348 }
1349 else {
1350 ANYOF_CLASS_SET(data->start_class,ANYOF_ALNUM);
1351 data->start_class->flags |= ANYOF_LOCALE;
1352 }
1353 break;
1354 case NALNUM:
1355 if (flags & SCF_DO_STCLASS_AND) {
1356 if (!(data->start_class->flags & ANYOF_LOCALE)) {
1357 ANYOF_CLASS_CLEAR(data->start_class,ANYOF_ALNUM);
1358 for (value = 0; value < 256; value++)
1359 if (isALNUM(value))
1360 ANYOF_BITMAP_CLEAR(data->start_class, value);
1361 }
1362 }
1363 else {
1364 if (data->start_class->flags & ANYOF_LOCALE)
1365 ANYOF_CLASS_SET(data->start_class,ANYOF_NALNUM);
1366 else {
1367 for (value = 0; value < 256; value++)
1368 if (!isALNUM(value))
b81d288d 1369 ANYOF_BITMAP_SET(data->start_class, value);
653099ff
GS
1370 }
1371 }
1372 break;
1373 case NALNUML:
1374 if (flags & SCF_DO_STCLASS_AND) {
1375 if (data->start_class->flags & ANYOF_LOCALE)
1376 ANYOF_CLASS_CLEAR(data->start_class,ANYOF_ALNUM);
1377 }
1378 else {
1379 data->start_class->flags |= ANYOF_LOCALE;
1380 ANYOF_CLASS_SET(data->start_class,ANYOF_NALNUM);
1381 }
1382 break;
1383 case SPACE:
1384 if (flags & SCF_DO_STCLASS_AND) {
1385 if (!(data->start_class->flags & ANYOF_LOCALE)) {
1386 ANYOF_CLASS_CLEAR(data->start_class,ANYOF_NSPACE);
1387 for (value = 0; value < 256; value++)
1388 if (!isSPACE(value))
1389 ANYOF_BITMAP_CLEAR(data->start_class, value);
1390 }
1391 }
1392 else {
1393 if (data->start_class->flags & ANYOF_LOCALE)
1394 ANYOF_CLASS_SET(data->start_class,ANYOF_SPACE);
1395 else {
1396 for (value = 0; value < 256; value++)
1397 if (isSPACE(value))
b81d288d 1398 ANYOF_BITMAP_SET(data->start_class, value);
653099ff
GS
1399 }
1400 }
1401 break;
1402 case SPACEL:
1403 if (flags & SCF_DO_STCLASS_AND) {
1404 if (data->start_class->flags & ANYOF_LOCALE)
1405 ANYOF_CLASS_CLEAR(data->start_class,ANYOF_NSPACE);
1406 }
1407 else {
1408 data->start_class->flags |= ANYOF_LOCALE;
1409 ANYOF_CLASS_SET(data->start_class,ANYOF_SPACE);
1410 }
1411 break;
1412 case NSPACE:
1413 if (flags & SCF_DO_STCLASS_AND) {
1414 if (!(data->start_class->flags & ANYOF_LOCALE)) {
1415 ANYOF_CLASS_CLEAR(data->start_class,ANYOF_SPACE);
1416 for (value = 0; value < 256; value++)
1417 if (isSPACE(value))
1418 ANYOF_BITMAP_CLEAR(data->start_class, value);
1419 }
1420 }
1421 else {
1422 if (data->start_class->flags & ANYOF_LOCALE)
1423 ANYOF_CLASS_SET(data->start_class,ANYOF_NSPACE);
1424 else {
1425 for (value = 0; value < 256; value++)
1426 if (!isSPACE(value))
b81d288d 1427 ANYOF_BITMAP_SET(data->start_class, value);
653099ff
GS
1428 }
1429 }
1430 break;
1431 case NSPACEL:
1432 if (flags & SCF_DO_STCLASS_AND) {
1433 if (data->start_class->flags & ANYOF_LOCALE) {
1434 ANYOF_CLASS_CLEAR(data->start_class,ANYOF_SPACE);
1435 for (value = 0; value < 256; value++)
1436 if (!isSPACE(value))
1437 ANYOF_BITMAP_CLEAR(data->start_class, value);
1438 }
1439 }
1440 else {
1441 data->start_class->flags |= ANYOF_LOCALE;
1442 ANYOF_CLASS_SET(data->start_class,ANYOF_NSPACE);
1443 }
1444 break;
1445 case DIGIT:
1446 if (flags & SCF_DO_STCLASS_AND) {
1447 ANYOF_CLASS_CLEAR(data->start_class,ANYOF_NDIGIT);
1448 for (value = 0; value < 256; value++)
1449 if (!isDIGIT(value))
1450 ANYOF_BITMAP_CLEAR(data->start_class, value);
1451 }
1452 else {
1453 if (data->start_class->flags & ANYOF_LOCALE)
1454 ANYOF_CLASS_SET(data->start_class,ANYOF_DIGIT);
1455 else {
1456 for (value = 0; value < 256; value++)
1457 if (isDIGIT(value))
b81d288d 1458 ANYOF_BITMAP_SET(data->start_class, value);
653099ff
GS
1459 }
1460 }
1461 break;
1462 case NDIGIT:
1463 if (flags & SCF_DO_STCLASS_AND) {
1464 ANYOF_CLASS_CLEAR(data->start_class,ANYOF_DIGIT);
1465 for (value = 0; value < 256; value++)
1466 if (isDIGIT(value))
1467 ANYOF_BITMAP_CLEAR(data->start_class, value);
1468 }
1469 else {
1470 if (data->start_class->flags & ANYOF_LOCALE)
1471 ANYOF_CLASS_SET(data->start_class,ANYOF_NDIGIT);
1472 else {
1473 for (value = 0; value < 256; value++)
1474 if (!isDIGIT(value))
b81d288d 1475 ANYOF_BITMAP_SET(data->start_class, value);
653099ff
GS
1476 }
1477 }
1478 break;
1479 }
1480 if (flags & SCF_DO_STCLASS_OR)
1481 cl_and(data->start_class, &and_with);
1482 flags &= ~SCF_DO_STCLASS;
1483 }
a0ed51b3 1484 }
22c35a8c 1485 else if (PL_regkind[(U8)OP(scan)] == EOL && flags & SCF_DO_SUBSTR) {
c277df42
IZ
1486 data->flags |= (OP(scan) == MEOL
1487 ? SF_BEFORE_MEOL
1488 : SF_BEFORE_SEOL);
a0ed51b3 1489 }
653099ff
GS
1490 else if ( PL_regkind[(U8)OP(scan)] == BRANCHJ
1491 /* Lookbehind, or need to calculate parens/evals/stclass: */
1492 && (scan->flags || data || (flags & SCF_DO_STCLASS))
c277df42 1493 && (OP(scan) == IFMATCH || OP(scan) == UNLESSM)) {
653099ff 1494 /* Lookahead/lookbehind */
cb434fcc 1495 I32 deltanext, minnext, fake = 0;
c277df42 1496 regnode *nscan;
653099ff
GS
1497 struct regnode_charclass_class intrnl;
1498 int f = 0;
c277df42
IZ
1499
1500 data_fake.flags = 0;
b81d288d 1501 if (data) {
2c2d71f5 1502 data_fake.whilem_c = data->whilem_c;
cb434fcc
IZ
1503 data_fake.last_closep = data->last_closep;
1504 }
1505 else
1506 data_fake.last_closep = &fake;
653099ff
GS
1507 if ( flags & SCF_DO_STCLASS && !scan->flags
1508 && OP(scan) == IFMATCH ) { /* Lookahead */
830247a4 1509 cl_init(pRExC_state, &intrnl);
653099ff 1510 data_fake.start_class = &intrnl;
e1901655 1511 f |= SCF_DO_STCLASS_AND;
653099ff 1512 }
e1901655
IZ
1513 if (flags & SCF_WHILEM_VISITED_POS)
1514 f |= SCF_WHILEM_VISITED_POS;
c277df42
IZ
1515 next = regnext(scan);
1516 nscan = NEXTOPER(NEXTOPER(scan));
830247a4 1517 minnext = study_chunk(pRExC_state, &nscan, &deltanext, last, &data_fake, f);
c277df42
IZ
1518 if (scan->flags) {
1519 if (deltanext) {
9baa0206 1520 vFAIL("Variable length lookbehind not implemented");
a0ed51b3
LW
1521 }
1522 else if (minnext > U8_MAX) {
9baa0206 1523 vFAIL2("Lookbehind longer than %"UVuf" not implemented", (UV)U8_MAX);
c277df42
IZ
1524 }
1525 scan->flags = minnext;
1526 }
1527 if (data && data_fake.flags & (SF_HAS_PAR|SF_IN_PAR))
1528 pars++;
405ff068 1529 if (data && (data_fake.flags & SF_HAS_EVAL))
c277df42 1530 data->flags |= SF_HAS_EVAL;
2c2d71f5
JH
1531 if (data)
1532 data->whilem_c = data_fake.whilem_c;
e1901655 1533 if (f & SCF_DO_STCLASS_AND) {
653099ff
GS
1534 int was = (data->start_class->flags & ANYOF_EOS);
1535
1536 cl_and(data->start_class, &intrnl);
1537 if (was)
1538 data->start_class->flags |= ANYOF_EOS;
1539 }
a0ed51b3
LW
1540 }
1541 else if (OP(scan) == OPEN) {
c277df42 1542 pars++;
a0ed51b3 1543 }
cb434fcc
IZ
1544 else if (OP(scan) == CLOSE) {
1545 if (ARG(scan) == is_par) {
1546 next = regnext(scan);
c277df42 1547
cb434fcc
IZ
1548 if ( next && (OP(next) != WHILEM) && next < last)
1549 is_par = 0; /* Disable optimization */
1550 }
1551 if (data)
1552 *(data->last_closep) = ARG(scan);
a0ed51b3
LW
1553 }
1554 else if (OP(scan) == EVAL) {
c277df42
IZ
1555 if (data)
1556 data->flags |= SF_HAS_EVAL;
1557 }
96776eda 1558 else if (OP(scan) == LOGICAL && scan->flags == 2) { /* Embedded follows */
0f5d15d6 1559 if (flags & SCF_DO_SUBSTR) {
830247a4 1560 scan_commit(pRExC_state,data);
0f5d15d6
IZ
1561 data->longest = &(data->longest_float);
1562 }
1563 is_inf = is_inf_internal = 1;
653099ff 1564 if (flags & SCF_DO_STCLASS_OR) /* Allow everything */
830247a4 1565 cl_anything(pRExC_state, data->start_class);
96776eda 1566 flags &= ~SCF_DO_STCLASS;
0f5d15d6 1567 }
c277df42
IZ
1568 /* Else: zero-length, ignore. */
1569 scan = regnext(scan);
1570 }
1571
1572 finish:
1573 *scanp = scan;
aca2d497 1574 *deltap = is_inf_internal ? I32_MAX : delta;
b81d288d 1575 if (flags & SCF_DO_SUBSTR && is_inf)
c277df42
IZ
1576 data->pos_delta = I32_MAX - data->pos_min;
1577 if (is_par > U8_MAX)
1578 is_par = 0;
1579 if (is_par && pars==1 && data) {
1580 data->flags |= SF_IN_PAR;
1581 data->flags &= ~SF_HAS_PAR;
a0ed51b3
LW
1582 }
1583 else if (pars && data) {
c277df42
IZ
1584 data->flags |= SF_HAS_PAR;
1585 data->flags &= ~SF_IN_PAR;
1586 }
653099ff
GS
1587 if (flags & SCF_DO_STCLASS_OR)
1588 cl_and(data->start_class, &and_with);
c277df42
IZ
1589 return min;
1590}
1591
76e3520e 1592STATIC I32
830247a4 1593S_add_data(pTHX_ RExC_state_t *pRExC_state, I32 n, char *s)
c277df42 1594{
830247a4 1595 if (RExC_rx->data) {
b81d288d
AB
1596 Renewc(RExC_rx->data,
1597 sizeof(*RExC_rx->data) + sizeof(void*) * (RExC_rx->data->count + n - 1),
c277df42 1598 char, struct reg_data);
830247a4
IZ
1599 Renew(RExC_rx->data->what, RExC_rx->data->count + n, U8);
1600 RExC_rx->data->count += n;
a0ed51b3
LW
1601 }
1602 else {
830247a4 1603 Newc(1207, RExC_rx->data, sizeof(*RExC_rx->data) + sizeof(void*) * (n - 1),
c277df42 1604 char, struct reg_data);
830247a4
IZ
1605 New(1208, RExC_rx->data->what, n, U8);
1606 RExC_rx->data->count = n;
c277df42 1607 }
830247a4
IZ
1608 Copy(s, RExC_rx->data->what + RExC_rx->data->count - n, n, U8);
1609 return RExC_rx->data->count - n;
c277df42
IZ
1610}
1611
d88dccdf 1612void
864dbfa3 1613Perl_reginitcolors(pTHX)
d88dccdf 1614{
d88dccdf
IZ
1615 int i = 0;
1616 char *s = PerlEnv_getenv("PERL_RE_COLORS");
b81d288d 1617
d88dccdf
IZ
1618 if (s) {
1619 PL_colors[0] = s = savepv(s);
1620 while (++i < 6) {
1621 s = strchr(s, '\t');
1622 if (s) {
1623 *s = '\0';
1624 PL_colors[i] = ++s;
1625 }
1626 else
c712d376 1627 PL_colors[i] = s = "";
d88dccdf
IZ
1628 }
1629 } else {
b81d288d 1630 while (i < 6)
d88dccdf
IZ
1631 PL_colors[i++] = "";
1632 }
1633 PL_colorset = 1;
1634}
1635
8615cb43 1636
a687059c 1637/*
e50aee73 1638 - pregcomp - compile a regular expression into internal code
a687059c
LW
1639 *
1640 * We can't allocate space until we know how big the compiled form will be,
1641 * but we can't compile it (and thus know how big it is) until we've got a
1642 * place to put the code. So we cheat: we compile it twice, once with code
1643 * generation turned off and size counting turned on, and once "for real".
1644 * This also means that we don't allocate space until we are sure that the
1645 * thing really will compile successfully, and we never have to move the
1646 * code and thus invalidate pointers into it. (Note that it has to be in
1647 * one piece because free() must be able to free it all.) [NB: not true in perl]
1648 *
1649 * Beware that the optimization-preparation code in here knows about some
1650 * of the structure of the compiled regexp. [I'll say.]
1651 */
1652regexp *
864dbfa3 1653Perl_pregcomp(pTHX_ char *exp, char *xend, PMOP *pm)
a687059c 1654{
a0d0e21e 1655 register regexp *r;
c277df42 1656 regnode *scan;
c277df42 1657 regnode *first;
a0d0e21e 1658 I32 flags;
a0d0e21e
LW
1659 I32 minlen = 0;
1660 I32 sawplus = 0;
1661 I32 sawopen = 0;
2c2d71f5 1662 scan_data_t data;
830247a4
IZ
1663 RExC_state_t RExC_state;
1664 RExC_state_t *pRExC_state = &RExC_state;
a0d0e21e
LW
1665
1666 if (exp == NULL)
c277df42 1667 FAIL("NULL regexp argument");
a0d0e21e 1668
84e09d5e
JH
1669 /* XXXX This looks very suspicious... */
1670 if (pm->op_pmdynflags & PMdf_CMP_UTF8)
1671 RExC_utf8 = 1;
1672 else
1673 RExC_utf8 = 0;
a0ed51b3 1674
5cfc7842 1675 RExC_precomp = exp;
35ef4773 1676 DEBUG_r(if (!PL_colorset) reginitcolors());
2c2d71f5 1677 DEBUG_r(PerlIO_printf(Perl_debug_log, "%sCompiling REx%s `%s%*s%s'\n",
d88dccdf 1678 PL_colors[4],PL_colors[5],PL_colors[0],
830247a4
IZ
1679 (int)(xend - exp), RExC_precomp, PL_colors[1]));
1680 RExC_flags16 = pm->op_pmflags;
1681 RExC_sawback = 0;
bbce6d69 1682
830247a4
IZ
1683 RExC_seen = 0;
1684 RExC_seen_zerolen = *exp == '^' ? -1 : 0;
1685 RExC_seen_evals = 0;
1686 RExC_extralen = 0;
c277df42 1687
bbce6d69 1688 /* First pass: determine size, legality. */
830247a4 1689 RExC_parse = exp;
fac92740 1690 RExC_start = exp;
830247a4
IZ
1691 RExC_end = xend;
1692 RExC_naughty = 0;
1693 RExC_npar = 1;
1694 RExC_size = 0L;
1695 RExC_emit = &PL_regdummy;
1696 RExC_whilem_seen = 0;
85ddcde9
JH
1697#if 0 /* REGC() is (currently) a NOP at the first pass.
1698 * Clever compilers notice this and complain. --jhi */
830247a4 1699 REGC((U8)REG_MAGIC, (char*)RExC_emit);
85ddcde9 1700#endif
830247a4 1701 if (reg(pRExC_state, 0, &flags) == NULL) {
830247a4 1702 RExC_precomp = Nullch;
a0d0e21e
LW
1703 return(NULL);
1704 }
830247a4 1705 DEBUG_r(PerlIO_printf(Perl_debug_log, "size %"IVdf" ", (IV)RExC_size));
c277df42 1706
c277df42
IZ
1707 /* Small enough for pointer-storage convention?
1708 If extralen==0, this means that we will not need long jumps. */
830247a4
IZ
1709 if (RExC_size >= 0x10000L && RExC_extralen)
1710 RExC_size += RExC_extralen;
c277df42 1711 else
830247a4
IZ
1712 RExC_extralen = 0;
1713 if (RExC_whilem_seen > 15)
1714 RExC_whilem_seen = 15;
a0d0e21e 1715
bbce6d69 1716 /* Allocate space and initialize. */
830247a4 1717 Newc(1001, r, sizeof(regexp) + (unsigned)RExC_size * sizeof(regnode),
c277df42 1718 char, regexp);
a0d0e21e 1719 if (r == NULL)
b45f050a
JF
1720 FAIL("Regexp out of space");
1721
0f79a09d
GS
1722#ifdef DEBUGGING
1723 /* avoid reading uninitialized memory in DEBUGGING code in study_chunk() */
830247a4 1724 Zero(r, sizeof(regexp) + (unsigned)RExC_size * sizeof(regnode), char);
0f79a09d 1725#endif
c277df42 1726 r->refcnt = 1;
bbce6d69 1727 r->prelen = xend - exp;
5cfc7842 1728 r->precomp = savepvn(RExC_precomp, r->prelen);
cf93c79d
IZ
1729 r->subbeg = NULL;
1730 r->reganch = pm->op_pmflags & PMf_COMPILETIME;
830247a4 1731 r->nparens = RExC_npar - 1; /* set early to validate backrefs */
4327152a
IZ
1732
1733 r->substrs = 0; /* Useful during FAIL. */
1734 r->startp = 0; /* Useful during FAIL. */
1735 r->endp = 0; /* Useful during FAIL. */
1736
fac92740
MJD
1737 Newz(1304, r->offsets, 2*RExC_size+1, U32); /* MJD 20001228 */
1738 if (r->offsets) {
1739 r->offsets[0] = RExC_size;
1740 }
1741 DEBUG_r(PerlIO_printf(Perl_debug_log,
392fbf5d 1742 "%s %"UVuf" bytes for offset annotations.\n",
fac92740 1743 r->offsets ? "Got" : "Couldn't get",
392fbf5d 1744 (UV)((2*RExC_size+1) * sizeof(U32))));
fac92740 1745
830247a4 1746 RExC_rx = r;
bbce6d69 1747
1748 /* Second pass: emit code. */
055bb491 1749 RExC_flags16 = pm->op_pmflags; /* don't let top level (?i) bleed */
830247a4
IZ
1750 RExC_parse = exp;
1751 RExC_end = xend;
1752 RExC_naughty = 0;
1753 RExC_npar = 1;
fac92740 1754 RExC_emit_start = r->program;
830247a4 1755 RExC_emit = r->program;
2cd61cdb 1756 /* Store the count of eval-groups for security checks: */
830247a4
IZ
1757 RExC_emit->next_off = ((RExC_seen_evals > U16_MAX) ? U16_MAX : RExC_seen_evals);
1758 REGC((U8)REG_MAGIC, (char*) RExC_emit++);
c277df42 1759 r->data = 0;
830247a4 1760 if (reg(pRExC_state, 0, &flags) == NULL)
a0d0e21e
LW
1761 return(NULL);
1762
1763 /* Dig out information for optimizations. */
cf93c79d 1764 r->reganch = pm->op_pmflags & PMf_COMPILETIME; /* Again? */
830247a4 1765 pm->op_pmflags = RExC_flags16;
a0ed51b3
LW
1766 if (UTF)
1767 r->reganch |= ROPT_UTF8;
c277df42 1768 r->regstclass = NULL;
830247a4 1769 if (RExC_naughty >= 10) /* Probably an expensive pattern. */
a0ed51b3 1770 r->reganch |= ROPT_NAUGHTY;
c277df42 1771 scan = r->program + 1; /* First BRANCH. */
2779dcf1
IZ
1772
1773 /* XXXX To minimize changes to RE engine we always allocate
1774 3-units-long substrs field. */
1775 Newz(1004, r->substrs, 1, struct reg_substr_data);
1776
2c2d71f5 1777 StructCopy(&zero_scan_data, &data, scan_data_t);
653099ff 1778 /* XXXX Should not we check for something else? Usually it is OPEN1... */
c277df42 1779 if (OP(scan) != BRANCH) { /* Only one top-level choice. */
c277df42 1780 I32 fake;
c5254dd6 1781 STRLEN longest_float_length, longest_fixed_length;
653099ff
GS
1782 struct regnode_charclass_class ch_class;
1783 int stclass_flag;
cb434fcc 1784 I32 last_close = 0;
a0d0e21e
LW
1785
1786 first = scan;
c277df42 1787 /* Skip introductions and multiplicators >= 1. */
a0d0e21e 1788 while ((OP(first) == OPEN && (sawopen = 1)) ||
653099ff 1789 /* An OR of *one* alternative - should not happen now. */
a0d0e21e
LW
1790 (OP(first) == BRANCH && OP(regnext(first)) != BRANCH) ||
1791 (OP(first) == PLUS) ||
1792 (OP(first) == MINMOD) ||
653099ff 1793 /* An {n,m} with n>0 */
22c35a8c 1794 (PL_regkind[(U8)OP(first)] == CURLY && ARG1(first) > 0) ) {
a0d0e21e
LW
1795 if (OP(first) == PLUS)
1796 sawplus = 1;
1797 else
1798 first += regarglen[(U8)OP(first)];
1799 first = NEXTOPER(first);
a687059c
LW
1800 }
1801
a0d0e21e
LW
1802 /* Starting-point info. */
1803 again:
653099ff 1804 if (PL_regkind[(U8)OP(first)] == EXACT) {
1aa99e6b
IH
1805 if (OP(first) == EXACT)
1806 ; /* Empty, get anchored substr later. */
1807 else if ((OP(first) == EXACTF || OP(first) == EXACTFL))
b3c9acc1
IZ
1808 r->regstclass = first;
1809 }
653099ff 1810 else if (strchr((char*)PL_simple,OP(first)))
a0d0e21e 1811 r->regstclass = first;
22c35a8c
GS
1812 else if (PL_regkind[(U8)OP(first)] == BOUND ||
1813 PL_regkind[(U8)OP(first)] == NBOUND)
a0d0e21e 1814 r->regstclass = first;
22c35a8c 1815 else if (PL_regkind[(U8)OP(first)] == BOL) {
cad2e5aa
JH
1816 r->reganch |= (OP(first) == MBOL
1817 ? ROPT_ANCH_MBOL
1818 : (OP(first) == SBOL
1819 ? ROPT_ANCH_SBOL
1820 : ROPT_ANCH_BOL));
a0d0e21e 1821 first = NEXTOPER(first);
774d564b 1822 goto again;
1823 }
1824 else if (OP(first) == GPOS) {
1825 r->reganch |= ROPT_ANCH_GPOS;
1826 first = NEXTOPER(first);
1827 goto again;
a0d0e21e
LW
1828 }
1829 else if ((OP(first) == STAR &&
22c35a8c 1830 PL_regkind[(U8)OP(NEXTOPER(first))] == REG_ANY) &&
a0d0e21e
LW
1831 !(r->reganch & ROPT_ANCH) )
1832 {
1833 /* turn .* into ^.* with an implied $*=1 */
cad2e5aa
JH
1834 int type = OP(NEXTOPER(first));
1835
ffc61ed2 1836 if (type == REG_ANY)
cad2e5aa
JH
1837 type = ROPT_ANCH_MBOL;
1838 else
1839 type = ROPT_ANCH_SBOL;
1840
1841 r->reganch |= type | ROPT_IMPLICIT;
a0d0e21e 1842 first = NEXTOPER(first);
774d564b 1843 goto again;
a0d0e21e 1844 }
b81d288d 1845 if (sawplus && (!sawopen || !RExC_sawback)
830247a4 1846 && !(RExC_seen & REG_SEEN_EVAL)) /* May examine pos and $& */
cad2e5aa
JH
1847 /* x+ must match at the 1st pos of run of x's */
1848 r->reganch |= ROPT_SKIP;
a0d0e21e 1849
c277df42 1850 /* Scan is after the zeroth branch, first is atomic matcher. */
b81d288d 1851 DEBUG_r(PerlIO_printf(Perl_debug_log, "first at %"IVdf"\n",
b900a521 1852 (IV)(first - scan + 1)));
a0d0e21e
LW
1853 /*
1854 * If there's something expensive in the r.e., find the
1855 * longest literal string that must appear and make it the
1856 * regmust. Resolve ties in favor of later strings, since
1857 * the regstart check works with the beginning of the r.e.
1858 * and avoiding duplication strengthens checking. Not a
1859 * strong reason, but sufficient in the absence of others.
1860 * [Now we resolve ties in favor of the earlier string if
c277df42 1861 * it happens that c_offset_min has been invalidated, since the
a0d0e21e
LW
1862 * earlier string may buy us something the later one won't.]
1863 */
a0d0e21e 1864 minlen = 0;
a687059c 1865
79cb57f6
GS
1866 data.longest_fixed = newSVpvn("",0);
1867 data.longest_float = newSVpvn("",0);
1868 data.last_found = newSVpvn("",0);
c277df42
IZ
1869 data.longest = &(data.longest_fixed);
1870 first = scan;
653099ff 1871 if (!r->regstclass) {
830247a4 1872 cl_init(pRExC_state, &ch_class);
653099ff
GS
1873 data.start_class = &ch_class;
1874 stclass_flag = SCF_DO_STCLASS_AND;
1875 } else /* XXXX Check for BOUND? */
1876 stclass_flag = 0;
cb434fcc 1877 data.last_closep = &last_close;
653099ff 1878
830247a4 1879 minlen = study_chunk(pRExC_state, &first, &fake, scan + RExC_size, /* Up to end */
e1901655 1880 &data, SCF_DO_SUBSTR | SCF_WHILEM_VISITED_POS | stclass_flag);
830247a4 1881 if ( RExC_npar == 1 && data.longest == &(data.longest_fixed)
b81d288d 1882 && data.last_start_min == 0 && data.last_end > 0
830247a4
IZ
1883 && !RExC_seen_zerolen
1884 && (!(RExC_seen & REG_SEEN_GPOS) || (r->reganch & ROPT_ANCH_GPOS)))
c277df42 1885 r->reganch |= ROPT_CHECK_ALL;
830247a4 1886 scan_commit(pRExC_state, &data);
c277df42
IZ
1887 SvREFCNT_dec(data.last_found);
1888
a0ed51b3 1889 longest_float_length = CHR_SVLEN(data.longest_float);
c5254dd6 1890 if (longest_float_length
c277df42
IZ
1891 || (data.flags & SF_FL_BEFORE_EOL
1892 && (!(data.flags & SF_FL_BEFORE_MEOL)
830247a4 1893 || (RExC_flags16 & PMf_MULTILINE)))) {
cf93c79d
IZ
1894 int t;
1895
a0ed51b3 1896 if (SvCUR(data.longest_fixed) /* ok to leave SvCUR */
aca2d497
IZ
1897 && data.offset_fixed == data.offset_float_min
1898 && SvCUR(data.longest_fixed) == SvCUR(data.longest_float))
1899 goto remove_float; /* As in (a)+. */
1900
c277df42
IZ
1901 r->float_substr = data.longest_float;
1902 r->float_min_offset = data.offset_float_min;
1903 r->float_max_offset = data.offset_float_max;
cf93c79d
IZ
1904 t = (data.flags & SF_FL_BEFORE_EOL /* Can't have SEOL and MULTI */
1905 && (!(data.flags & SF_FL_BEFORE_MEOL)
830247a4 1906 || (RExC_flags16 & PMf_MULTILINE)));
cf93c79d 1907 fbm_compile(r->float_substr, t ? FBMcf_TAIL : 0);
a0ed51b3
LW
1908 }
1909 else {
aca2d497 1910 remove_float:
c277df42
IZ
1911 r->float_substr = Nullsv;
1912 SvREFCNT_dec(data.longest_float);
c5254dd6 1913 longest_float_length = 0;
a0d0e21e 1914 }
c277df42 1915
a0ed51b3 1916 longest_fixed_length = CHR_SVLEN(data.longest_fixed);
c5254dd6 1917 if (longest_fixed_length
c277df42
IZ
1918 || (data.flags & SF_FIX_BEFORE_EOL /* Cannot have SEOL and MULTI */
1919 && (!(data.flags & SF_FIX_BEFORE_MEOL)
830247a4 1920 || (RExC_flags16 & PMf_MULTILINE)))) {
cf93c79d
IZ
1921 int t;
1922
c277df42
IZ
1923 r->anchored_substr = data.longest_fixed;
1924 r->anchored_offset = data.offset_fixed;
cf93c79d
IZ
1925 t = (data.flags & SF_FIX_BEFORE_EOL /* Can't have SEOL and MULTI */
1926 && (!(data.flags & SF_FIX_BEFORE_MEOL)
830247a4 1927 || (RExC_flags16 & PMf_MULTILINE)));
cf93c79d 1928 fbm_compile(r->anchored_substr, t ? FBMcf_TAIL : 0);
a0ed51b3
LW
1929 }
1930 else {
c277df42
IZ
1931 r->anchored_substr = Nullsv;
1932 SvREFCNT_dec(data.longest_fixed);
c5254dd6 1933 longest_fixed_length = 0;
a0d0e21e 1934 }
b81d288d 1935 if (r->regstclass
ffc61ed2 1936 && (OP(r->regstclass) == REG_ANY || OP(r->regstclass) == SANY))
653099ff
GS
1937 r->regstclass = NULL;
1938 if ((!r->anchored_substr || r->anchored_offset) && stclass_flag
1939 && !(data.start_class->flags & ANYOF_EOS)
1940 && !cl_is_anything(data.start_class)) {
830247a4 1941 I32 n = add_data(pRExC_state, 1, "f");
653099ff 1942
b81d288d 1943 New(1006, RExC_rx->data->data[n], 1,
653099ff
GS
1944 struct regnode_charclass_class);
1945 StructCopy(data.start_class,
830247a4 1946 (struct regnode_charclass_class*)RExC_rx->data->data[n],
653099ff 1947 struct regnode_charclass_class);
830247a4 1948 r->regstclass = (regnode*)RExC_rx->data->data[n];
653099ff 1949 r->reganch &= ~ROPT_SKIP; /* Used in find_byclass(). */
ffc61ed2 1950 PL_regdata = r->data; /* for regprop() */
9c5ffd7c
JH
1951 DEBUG_r({ SV *sv = sv_newmortal();
1952 regprop(sv, (regnode*)data.start_class);
1953 PerlIO_printf(Perl_debug_log,
1954 "synthetic stclass `%s'.\n",
1955 SvPVX(sv));});
653099ff 1956 }
c277df42
IZ
1957
1958 /* A temporary algorithm prefers floated substr to fixed one to dig more info. */
c5254dd6 1959 if (longest_fixed_length > longest_float_length) {
c277df42
IZ
1960 r->check_substr = r->anchored_substr;
1961 r->check_offset_min = r->check_offset_max = r->anchored_offset;
1962 if (r->reganch & ROPT_ANCH_SINGLE)
1963 r->reganch |= ROPT_NOSCAN;
a0ed51b3
LW
1964 }
1965 else {
c277df42
IZ
1966 r->check_substr = r->float_substr;
1967 r->check_offset_min = data.offset_float_min;
1968 r->check_offset_max = data.offset_float_max;
a0d0e21e 1969 }
30382c73
IZ
1970 /* XXXX Currently intuiting is not compatible with ANCH_GPOS.
1971 This should be changed ASAP! */
1972 if (r->check_substr && !(r->reganch & ROPT_ANCH_GPOS)) {
cad2e5aa
JH
1973 r->reganch |= RE_USE_INTUIT;
1974 if (SvTAIL(r->check_substr))
1975 r->reganch |= RE_INTUIT_TAIL;
1976 }
a0ed51b3
LW
1977 }
1978 else {
c277df42
IZ
1979 /* Several toplevels. Best we can is to set minlen. */
1980 I32 fake;
653099ff 1981 struct regnode_charclass_class ch_class;
cb434fcc 1982 I32 last_close = 0;
c277df42
IZ
1983
1984 DEBUG_r(PerlIO_printf(Perl_debug_log, "\n"));
1985 scan = r->program + 1;
830247a4 1986 cl_init(pRExC_state, &ch_class);
653099ff 1987 data.start_class = &ch_class;
cb434fcc 1988 data.last_closep = &last_close;
e1901655 1989 minlen = study_chunk(pRExC_state, &scan, &fake, scan + RExC_size, &data, SCF_DO_STCLASS_AND|SCF_WHILEM_VISITED_POS);
c277df42 1990 r->check_substr = r->anchored_substr = r->float_substr = Nullsv;
653099ff
GS
1991 if (!(data.start_class->flags & ANYOF_EOS)
1992 && !cl_is_anything(data.start_class)) {
830247a4 1993 I32 n = add_data(pRExC_state, 1, "f");
653099ff 1994
b81d288d 1995 New(1006, RExC_rx->data->data[n], 1,
653099ff
GS
1996 struct regnode_charclass_class);
1997 StructCopy(data.start_class,
830247a4 1998 (struct regnode_charclass_class*)RExC_rx->data->data[n],
653099ff 1999 struct regnode_charclass_class);
830247a4 2000 r->regstclass = (regnode*)RExC_rx->data->data[n];
653099ff 2001 r->reganch &= ~ROPT_SKIP; /* Used in find_byclass(). */
9c5ffd7c
JH
2002 DEBUG_r({ SV* sv = sv_newmortal();
2003 regprop(sv, (regnode*)data.start_class);
2004 PerlIO_printf(Perl_debug_log,
2005 "synthetic stclass `%s'.\n",
2006 SvPVX(sv));});
653099ff 2007 }
a0d0e21e
LW
2008 }
2009
a0d0e21e 2010 r->minlen = minlen;
b81d288d 2011 if (RExC_seen & REG_SEEN_GPOS)
c277df42 2012 r->reganch |= ROPT_GPOS_SEEN;
830247a4 2013 if (RExC_seen & REG_SEEN_LOOKBEHIND)
c277df42 2014 r->reganch |= ROPT_LOOKBEHIND_SEEN;
830247a4 2015 if (RExC_seen & REG_SEEN_EVAL)
ce862d02 2016 r->reganch |= ROPT_EVAL_SEEN;
f33976b4
DB
2017 if (RExC_seen & REG_SEEN_CANY)
2018 r->reganch |= ROPT_CANY_SEEN;
830247a4
IZ
2019 Newz(1002, r->startp, RExC_npar, I32);
2020 Newz(1002, r->endp, RExC_npar, I32);
ffc61ed2 2021 PL_regdata = r->data; /* for regprop() */
a0d0e21e
LW
2022 DEBUG_r(regdump(r));
2023 return(r);
a687059c
LW
2024}
2025
2026/*
2027 - reg - regular expression, i.e. main body or parenthesized thing
2028 *
2029 * Caller must absorb opening parenthesis.
2030 *
2031 * Combining parenthesis handling with the base level of regular expression
2032 * is a trifle forced, but the need to tie the tails of the branches to what
2033 * follows makes it hard to avoid.
2034 */
76e3520e 2035STATIC regnode *
830247a4 2036S_reg(pTHX_ RExC_state_t *pRExC_state, I32 paren, I32 *flagp)
c277df42 2037 /* paren: Parenthesized? 0=top, 1=(, inside: changed to letter. */
a687059c 2038{
c277df42
IZ
2039 register regnode *ret; /* Will be the head of the group. */
2040 register regnode *br;
2041 register regnode *lastbr;
2042 register regnode *ender = 0;
a0d0e21e 2043 register I32 parno = 0;
830247a4 2044 I32 flags, oregflags = RExC_flags16, have_branch = 0, open = 0;
9d1d55b5
JP
2045
2046 /* for (?g), (?gc), and (?o) warnings; warning
2047 about (?c) will warn about (?g) -- japhy */
2048
2049 I32 wastedflags = 0x00,
2050 wasted_o = 0x01,
2051 wasted_g = 0x02,
2052 wasted_gc = 0x02 | 0x04,
2053 wasted_c = 0x04;
2054
fac92740 2055 char * parse_start = RExC_parse; /* MJD */
830247a4 2056 char *oregcomp_parse = RExC_parse;
c277df42 2057 char c;
a0d0e21e 2058
821b33a5 2059 *flagp = 0; /* Tentatively. */
a0d0e21e 2060
9d1d55b5 2061
a0d0e21e
LW
2062 /* Make an OPEN node, if parenthesized. */
2063 if (paren) {
fac92740 2064 if (*RExC_parse == '?') { /* (?...) */
ca9dfc88
IZ
2065 U16 posflags = 0, negflags = 0;
2066 U16 *flagsp = &posflags;
0f5d15d6 2067 int logical = 0;
830247a4 2068 char *seqstart = RExC_parse;
ca9dfc88 2069
830247a4
IZ
2070 RExC_parse++;
2071 paren = *RExC_parse++;
c277df42 2072 ret = NULL; /* For look-ahead/behind. */
a0d0e21e 2073 switch (paren) {
fac92740 2074 case '<': /* (?<...) */
830247a4 2075 RExC_seen |= REG_SEEN_LOOKBEHIND;
b81d288d 2076 if (*RExC_parse == '!')
c277df42 2077 paren = ',';
b81d288d 2078 if (*RExC_parse != '=' && *RExC_parse != '!')
c277df42 2079 goto unknown;
830247a4 2080 RExC_parse++;
fac92740
MJD
2081 case '=': /* (?=...) */
2082 case '!': /* (?!...) */
830247a4 2083 RExC_seen_zerolen++;
fac92740
MJD
2084 case ':': /* (?:...) */
2085 case '>': /* (?>...) */
a0d0e21e 2086 break;
fac92740
MJD
2087 case '$': /* (?$...) */
2088 case '@': /* (?@...) */
8615cb43 2089 vFAIL2("Sequence (?%c...) not implemented", (int)paren);
a0d0e21e 2090 break;
fac92740 2091 case '#': /* (?#...) */
830247a4
IZ
2092 while (*RExC_parse && *RExC_parse != ')')
2093 RExC_parse++;
2094 if (*RExC_parse != ')')
c277df42 2095 FAIL("Sequence (?#... not terminated");
830247a4 2096 nextchar(pRExC_state);
a0d0e21e
LW
2097 *flagp = TRYAGAIN;
2098 return NULL;
fac92740 2099 case 'p': /* (?p...) */
f9373011
PM
2100 if (SIZE_ONLY && ckWARN2(WARN_DEPRECATED, WARN_REGEXP))
2101 vWARNdep(RExC_parse, "(?p{}) is deprecated - use (??{})");
8c8ad484 2102 /* FALL THROUGH*/
fac92740 2103 case '?': /* (??...) */
0f5d15d6 2104 logical = 1;
830247a4 2105 paren = *RExC_parse++;
0f5d15d6 2106 /* FALL THROUGH */
fac92740 2107 case '{': /* (?{...}) */
c277df42 2108 {
c277df42
IZ
2109 I32 count = 1, n = 0;
2110 char c;
830247a4 2111 char *s = RExC_parse;
c277df42
IZ
2112 SV *sv;
2113 OP_4tree *sop, *rop;
2114
830247a4
IZ
2115 RExC_seen_zerolen++;
2116 RExC_seen |= REG_SEEN_EVAL;
2117 while (count && (c = *RExC_parse)) {
2118 if (c == '\\' && RExC_parse[1])
2119 RExC_parse++;
b81d288d 2120 else if (c == '{')
c277df42 2121 count++;
b81d288d 2122 else if (c == '}')
c277df42 2123 count--;
830247a4 2124 RExC_parse++;
c277df42 2125 }
830247a4 2126 if (*RExC_parse != ')')
b45f050a 2127 {
b81d288d 2128 RExC_parse = s;
b45f050a
JF
2129 vFAIL("Sequence (?{...}) not terminated or not {}-balanced");
2130 }
c277df42
IZ
2131 if (!SIZE_ONLY) {
2132 AV *av;
b81d288d
AB
2133
2134 if (RExC_parse - 1 - s)
830247a4 2135 sv = newSVpvn(s, RExC_parse - 1 - s);
c277df42 2136 else
79cb57f6 2137 sv = newSVpvn("", 0);
c277df42 2138
569233ed
SB
2139 ENTER;
2140 Perl_save_re_context(aTHX);
c277df42 2141 rop = sv_compile_2op(sv, &sop, "re", &av);
9b978d73
DM
2142 sop->op_private |= OPpREFCOUNTED;
2143 /* re_dup will OpREFCNT_inc */
2144 OpREFCNT_set(sop, 1);
569233ed 2145 LEAVE;
c277df42 2146
830247a4
IZ
2147 n = add_data(pRExC_state, 3, "nop");
2148 RExC_rx->data->data[n] = (void*)rop;
2149 RExC_rx->data->data[n+1] = (void*)sop;
2150 RExC_rx->data->data[n+2] = (void*)av;
c277df42 2151 SvREFCNT_dec(sv);
a0ed51b3 2152 }
e24b16f9 2153 else { /* First pass */
830247a4 2154 if (PL_reginterp_cnt < ++RExC_seen_evals
e24b16f9 2155 && PL_curcop != &PL_compiling)
2cd61cdb
IZ
2156 /* No compiled RE interpolated, has runtime
2157 components ===> unsafe. */
2158 FAIL("Eval-group not allowed at runtime, use re 'eval'");
5b61d3f7 2159 if (PL_tainting && PL_tainted)
cc6b7395 2160 FAIL("Eval-group in insecure regular expression");
c277df42
IZ
2161 }
2162
830247a4 2163 nextchar(pRExC_state);
0f5d15d6 2164 if (logical) {
830247a4 2165 ret = reg_node(pRExC_state, LOGICAL);
0f5d15d6
IZ
2166 if (!SIZE_ONLY)
2167 ret->flags = 2;
830247a4 2168 regtail(pRExC_state, ret, reganode(pRExC_state, EVAL, n));
fac92740 2169 /* deal with the length of this later - MJD */
0f5d15d6
IZ
2170 return ret;
2171 }
830247a4 2172 return reganode(pRExC_state, EVAL, n);
c277df42 2173 }
fac92740 2174 case '(': /* (?(?{...})...) and (?(?=...)...) */
c277df42 2175 {
fac92740 2176 if (RExC_parse[0] == '?') { /* (?(?...)) */
b81d288d
AB
2177 if (RExC_parse[1] == '=' || RExC_parse[1] == '!'
2178 || RExC_parse[1] == '<'
830247a4 2179 || RExC_parse[1] == '{') { /* Lookahead or eval. */
c277df42
IZ
2180 I32 flag;
2181
830247a4 2182 ret = reg_node(pRExC_state, LOGICAL);
0f5d15d6
IZ
2183 if (!SIZE_ONLY)
2184 ret->flags = 1;
830247a4 2185 regtail(pRExC_state, ret, reg(pRExC_state, 1, &flag));
c277df42 2186 goto insert_if;
b81d288d 2187 }
a0ed51b3 2188 }
830247a4 2189 else if (RExC_parse[0] >= '1' && RExC_parse[0] <= '9' ) {
fac92740 2190 /* (?(1)...) */
830247a4 2191 parno = atoi(RExC_parse++);
c277df42 2192
830247a4
IZ
2193 while (isDIGIT(*RExC_parse))
2194 RExC_parse++;
fac92740
MJD
2195 ret = reganode(pRExC_state, GROUPP, parno);
2196
830247a4 2197 if ((c = *nextchar(pRExC_state)) != ')')
b45f050a 2198 vFAIL("Switch condition not recognized");
c277df42 2199 insert_if:
830247a4
IZ
2200 regtail(pRExC_state, ret, reganode(pRExC_state, IFTHEN, 0));
2201 br = regbranch(pRExC_state, &flags, 1);
c277df42 2202 if (br == NULL)
830247a4 2203 br = reganode(pRExC_state, LONGJMP, 0);
c277df42 2204 else
830247a4
IZ
2205 regtail(pRExC_state, br, reganode(pRExC_state, LONGJMP, 0));
2206 c = *nextchar(pRExC_state);
d1b80229
IZ
2207 if (flags&HASWIDTH)
2208 *flagp |= HASWIDTH;
c277df42 2209 if (c == '|') {
830247a4
IZ
2210 lastbr = reganode(pRExC_state, IFTHEN, 0); /* Fake one for optimizer. */
2211 regbranch(pRExC_state, &flags, 1);
2212 regtail(pRExC_state, ret, lastbr);
d1b80229
IZ
2213 if (flags&HASWIDTH)
2214 *flagp |= HASWIDTH;
830247a4 2215 c = *nextchar(pRExC_state);
a0ed51b3
LW
2216 }
2217 else
c277df42
IZ
2218 lastbr = NULL;
2219 if (c != ')')
8615cb43 2220 vFAIL("Switch (?(condition)... contains too many branches");
830247a4
IZ
2221 ender = reg_node(pRExC_state, TAIL);
2222 regtail(pRExC_state, br, ender);
c277df42 2223 if (lastbr) {
830247a4
IZ
2224 regtail(pRExC_state, lastbr, ender);
2225 regtail(pRExC_state, NEXTOPER(NEXTOPER(lastbr)), ender);
a0ed51b3
LW
2226 }
2227 else
830247a4 2228 regtail(pRExC_state, ret, ender);
c277df42 2229 return ret;
a0ed51b3
LW
2230 }
2231 else {
830247a4 2232 vFAIL2("Unknown switch condition (?(%.2s", RExC_parse);
c277df42
IZ
2233 }
2234 }
1b1626e4 2235 case 0:
830247a4 2236 RExC_parse--; /* for vFAIL to print correctly */
8615cb43 2237 vFAIL("Sequence (? incomplete");
1b1626e4 2238 break;
a0d0e21e 2239 default:
830247a4 2240 --RExC_parse;
fac92740 2241 parse_flags: /* (?i) */
830247a4 2242 while (*RExC_parse && strchr("iogcmsx", *RExC_parse)) {
9d1d55b5
JP
2243 /* (?g), (?gc) and (?o) are useless here
2244 and must be globally applied -- japhy */
2245
2246 if (*RExC_parse == 'o' || *RExC_parse == 'g') {
2247 if (SIZE_ONLY && ckWARN(WARN_REGEXP)) {
2248 I32 wflagbit = *RExC_parse == 'o' ? wasted_o : wasted_g;
2249 if (! (wastedflags & wflagbit) ) {
2250 wastedflags |= wflagbit;
2251 vWARN5(
2252 RExC_parse + 1,
2253 "Useless (%s%c) - %suse /%c modifier",
2254 flagsp == &negflags ? "?-" : "?",
2255 *RExC_parse,
2256 flagsp == &negflags ? "don't " : "",
2257 *RExC_parse
2258 );
2259 }
2260 }
2261 }
2262 else if (*RExC_parse == 'c') {
2263 if (SIZE_ONLY && ckWARN(WARN_REGEXP)) {
2264 if (! (wastedflags & wasted_c) ) {
2265 wastedflags |= wasted_gc;
2266 vWARN3(
2267 RExC_parse + 1,
2268 "Useless (%sc) - %suse /gc modifier",
2269 flagsp == &negflags ? "?-" : "?",
2270 flagsp == &negflags ? "don't " : ""
2271 );
2272 }
2273 }
2274 }
2275 else { pmflag(flagsp, *RExC_parse); }
2276
830247a4 2277 ++RExC_parse;
ca9dfc88 2278 }
830247a4 2279 if (*RExC_parse == '-') {
ca9dfc88 2280 flagsp = &negflags;
9d1d55b5 2281 wastedflags = 0; /* reset so (?g-c) warns twice */
830247a4 2282 ++RExC_parse;
ca9dfc88 2283 goto parse_flags;
48c036b1 2284 }
830247a4
IZ
2285 RExC_flags16 |= posflags;
2286 RExC_flags16 &= ~negflags;
2287 if (*RExC_parse == ':') {
2288 RExC_parse++;
ca9dfc88
IZ
2289 paren = ':';
2290 break;
2291 }
c277df42 2292 unknown:
830247a4
IZ
2293 if (*RExC_parse != ')') {
2294 RExC_parse++;
2295 vFAIL3("Sequence (%.*s...) not recognized", RExC_parse-seqstart, seqstart);
b45f050a 2296 }
830247a4 2297 nextchar(pRExC_state);
a0d0e21e
LW
2298 *flagp = TRYAGAIN;
2299 return NULL;
2300 }
2301 }
fac92740 2302 else { /* (...) */
830247a4
IZ
2303 parno = RExC_npar;
2304 RExC_npar++;
2305 ret = reganode(pRExC_state, OPEN, parno);
fac92740
MJD
2306 Set_Node_Length(ret, 1); /* MJD */
2307 Set_Node_Offset(ret, RExC_parse); /* MJD */
c277df42 2308 open = 1;
a0d0e21e 2309 }
a0ed51b3 2310 }
fac92740 2311 else /* ! paren */
a0d0e21e
LW
2312 ret = NULL;
2313
2314 /* Pick up the branches, linking them together. */
fac92740 2315 parse_start = RExC_parse; /* MJD */
830247a4 2316 br = regbranch(pRExC_state, &flags, 1);
fac92740
MJD
2317 /* branch_len = (paren != 0); */
2318
a0d0e21e
LW
2319 if (br == NULL)
2320 return(NULL);
830247a4
IZ
2321 if (*RExC_parse == '|') {
2322 if (!SIZE_ONLY && RExC_extralen) {
2323 reginsert(pRExC_state, BRANCHJ, br);
a0ed51b3 2324 }
fac92740 2325 else { /* MJD */
830247a4 2326 reginsert(pRExC_state, BRANCH, br);
fac92740
MJD
2327 Set_Node_Length(br, paren != 0);
2328 Set_Node_Offset_To_R(br-RExC_emit_start, parse_start-RExC_start);
2329 }
c277df42
IZ
2330 have_branch = 1;
2331 if (SIZE_ONLY)
830247a4 2332 RExC_extralen += 1; /* For BRANCHJ-BRANCH. */
a0ed51b3
LW
2333 }
2334 else if (paren == ':') {
c277df42
IZ
2335 *flagp |= flags&SIMPLE;
2336 }
2337 if (open) { /* Starts with OPEN. */
830247a4 2338 regtail(pRExC_state, ret, br); /* OPEN -> first. */
a0ed51b3
LW
2339 }
2340 else if (paren != '?') /* Not Conditional */
a0d0e21e 2341 ret = br;
821b33a5
IZ
2342 if (flags&HASWIDTH)
2343 *flagp |= HASWIDTH;
a0d0e21e 2344 *flagp |= flags&SPSTART;
c277df42 2345 lastbr = br;
830247a4
IZ
2346 while (*RExC_parse == '|') {
2347 if (!SIZE_ONLY && RExC_extralen) {
2348 ender = reganode(pRExC_state, LONGJMP,0);
2349 regtail(pRExC_state, NEXTOPER(NEXTOPER(lastbr)), ender); /* Append to the previous. */
c277df42
IZ
2350 }
2351 if (SIZE_ONLY)
830247a4
IZ
2352 RExC_extralen += 2; /* Account for LONGJMP. */
2353 nextchar(pRExC_state);
2354 br = regbranch(pRExC_state, &flags, 0);
fac92740 2355
a687059c 2356 if (br == NULL)
a0d0e21e 2357 return(NULL);
830247a4 2358 regtail(pRExC_state, lastbr, br); /* BRANCH -> BRANCH. */
c277df42 2359 lastbr = br;
821b33a5
IZ
2360 if (flags&HASWIDTH)
2361 *flagp |= HASWIDTH;
a687059c 2362 *flagp |= flags&SPSTART;
a0d0e21e
LW
2363 }
2364
c277df42
IZ
2365 if (have_branch || paren != ':') {
2366 /* Make a closing node, and hook it on the end. */
2367 switch (paren) {
2368 case ':':
830247a4 2369 ender = reg_node(pRExC_state, TAIL);
c277df42
IZ
2370 break;
2371 case 1:
830247a4 2372 ender = reganode(pRExC_state, CLOSE, parno);
fac92740
MJD
2373 Set_Node_Offset(ender,RExC_parse+1); /* MJD */
2374 Set_Node_Length(ender,1); /* MJD */
c277df42
IZ
2375 break;
2376 case '<':
c277df42
IZ
2377 case ',':
2378 case '=':
2379 case '!':
c277df42 2380 *flagp &= ~HASWIDTH;
821b33a5
IZ
2381 /* FALL THROUGH */
2382 case '>':
830247a4 2383 ender = reg_node(pRExC_state, SUCCEED);
c277df42
IZ
2384 break;
2385 case 0:
830247a4 2386 ender = reg_node(pRExC_state, END);
c277df42
IZ
2387 break;
2388 }
830247a4 2389 regtail(pRExC_state, lastbr, ender);
a0d0e21e 2390
c277df42
IZ
2391 if (have_branch) {
2392 /* Hook the tails of the branches to the closing node. */
2393 for (br = ret; br != NULL; br = regnext(br)) {
830247a4 2394 regoptail(pRExC_state, br, ender);
c277df42
IZ
2395 }
2396 }
a0d0e21e 2397 }
c277df42
IZ
2398
2399 {
2400 char *p;
2401 static char parens[] = "=!<,>";
2402
2403 if (paren && (p = strchr(parens, paren))) {
2404 int node = ((p - parens) % 2) ? UNLESSM : IFMATCH;
2405 int flag = (p - parens) > 1;
2406
2407 if (paren == '>')
2408 node = SUSPEND, flag = 0;
830247a4 2409 reginsert(pRExC_state, node,ret);
c277df42 2410 ret->flags = flag;
830247a4 2411 regtail(pRExC_state, ret, reg_node(pRExC_state, TAIL));
c277df42 2412 }
a0d0e21e
LW
2413 }
2414
2415 /* Check for proper termination. */
ce3e6498 2416 if (paren) {
830247a4
IZ
2417 RExC_flags16 = oregflags;
2418 if (RExC_parse >= RExC_end || *nextchar(pRExC_state) != ')') {
2419 RExC_parse = oregcomp_parse;
380a0633 2420 vFAIL("Unmatched (");
ce3e6498 2421 }
a0ed51b3 2422 }
830247a4
IZ
2423 else if (!paren && RExC_parse < RExC_end) {
2424 if (*RExC_parse == ')') {
2425 RExC_parse++;
380a0633 2426 vFAIL("Unmatched )");
a0ed51b3
LW
2427 }
2428 else
b45f050a 2429 FAIL("Junk on end of regexp"); /* "Can't happen". */
a0d0e21e
LW
2430 /* NOTREACHED */
2431 }
a687059c 2432
a0d0e21e 2433 return(ret);
a687059c
LW
2434}
2435
2436/*
2437 - regbranch - one alternative of an | operator
2438 *
2439 * Implements the concatenation operator.
2440 */
76e3520e 2441STATIC regnode *
830247a4 2442S_regbranch(pTHX_ RExC_state_t *pRExC_state, I32 *flagp, I32 first)
a687059c 2443{
c277df42
IZ
2444 register regnode *ret;
2445 register regnode *chain = NULL;
2446 register regnode *latest;
2447 I32 flags = 0, c = 0;
a0d0e21e 2448
b81d288d 2449 if (first)
c277df42
IZ
2450 ret = NULL;
2451 else {
b81d288d 2452 if (!SIZE_ONLY && RExC_extralen)
830247a4 2453 ret = reganode(pRExC_state, BRANCHJ,0);
fac92740 2454 else {
830247a4 2455 ret = reg_node(pRExC_state, BRANCH);
fac92740
MJD
2456 Set_Node_Length(ret, 1);
2457 }
c277df42
IZ
2458 }
2459
b81d288d 2460 if (!first && SIZE_ONLY)
830247a4 2461 RExC_extralen += 1; /* BRANCHJ */
b81d288d 2462
c277df42 2463 *flagp = WORST; /* Tentatively. */
a0d0e21e 2464
830247a4
IZ
2465 RExC_parse--;
2466 nextchar(pRExC_state);
2467 while (RExC_parse < RExC_end && *RExC_parse != '|' && *RExC_parse != ')') {
a0d0e21e 2468 flags &= ~TRYAGAIN;
830247a4 2469 latest = regpiece(pRExC_state, &flags);
a0d0e21e
LW
2470 if (latest == NULL) {
2471 if (flags & TRYAGAIN)
2472 continue;
2473 return(NULL);
a0ed51b3
LW
2474 }
2475 else if (ret == NULL)
c277df42 2476 ret = latest;
a0d0e21e 2477 *flagp |= flags&HASWIDTH;
c277df42 2478 if (chain == NULL) /* First piece. */
a0d0e21e
LW
2479 *flagp |= flags&SPSTART;
2480 else {
830247a4
IZ
2481 RExC_naughty++;
2482 regtail(pRExC_state, chain, latest);
a687059c 2483 }
a0d0e21e 2484 chain = latest;
c277df42
IZ
2485 c++;
2486 }
2487 if (chain == NULL) { /* Loop ran zero times. */
830247a4 2488 chain = reg_node(pRExC_state, NOTHING);
c277df42
IZ
2489 if (ret == NULL)
2490 ret = chain;
2491 }
2492 if (c == 1) {
2493 *flagp |= flags&SIMPLE;
a0d0e21e 2494 }
a687059c 2495
a0d0e21e 2496 return(ret);
a687059c
LW
2497}
2498
2499/*
2500 - regpiece - something followed by possible [*+?]
2501 *
2502 * Note that the branching code sequences used for ? and the general cases
2503 * of * and + are somewhat optimized: they use the same NOTHING node as
2504 * both the endmarker for their branch list and the body of the last branch.
2505 * It might seem that this node could be dispensed with entirely, but the
2506 * endmarker role is not redundant.
2507 */
76e3520e 2508STATIC regnode *
830247a4 2509S_regpiece(pTHX_ RExC_state_t *pRExC_state, I32 *flagp)
a687059c 2510{
c277df42 2511 register regnode *ret;
a0d0e21e
LW
2512 register char op;
2513 register char *next;
2514 I32 flags;
830247a4 2515 char *origparse = RExC_parse;
a0d0e21e
LW
2516 char *maxpos;
2517 I32 min;
c277df42 2518 I32 max = REG_INFTY;
fac92740 2519 char *parse_start;
a0d0e21e 2520
830247a4 2521 ret = regatom(pRExC_state, &flags);
a0d0e21e
LW
2522 if (ret == NULL) {
2523 if (flags & TRYAGAIN)
2524 *flagp |= TRYAGAIN;
2525 return(NULL);
2526 }
2527
830247a4 2528 op = *RExC_parse;
a0d0e21e 2529
830247a4 2530 if (op == '{' && regcurly(RExC_parse)) {
fac92740 2531 parse_start = RExC_parse; /* MJD */
830247a4 2532 next = RExC_parse + 1;
a0d0e21e
LW
2533 maxpos = Nullch;
2534 while (isDIGIT(*next) || *next == ',') {
2535 if (*next == ',') {
2536 if (maxpos)
2537 break;
2538 else
2539 maxpos = next;
a687059c 2540 }
a0d0e21e
LW
2541 next++;
2542 }
2543 if (*next == '}') { /* got one */
2544 if (!maxpos)
2545 maxpos = next;
830247a4
IZ
2546 RExC_parse++;
2547 min = atoi(RExC_parse);
a0d0e21e
LW
2548 if (*maxpos == ',')
2549 maxpos++;
2550 else
830247a4 2551 maxpos = RExC_parse;
a0d0e21e
LW
2552 max = atoi(maxpos);
2553 if (!max && *maxpos != '0')
c277df42
IZ
2554 max = REG_INFTY; /* meaning "infinity" */
2555 else if (max >= REG_INFTY)
8615cb43 2556 vFAIL2("Quantifier in {,} bigger than %d", REG_INFTY - 1);
830247a4
IZ
2557 RExC_parse = next;
2558 nextchar(pRExC_state);
a0d0e21e
LW
2559
2560 do_curly:
2561 if ((flags&SIMPLE)) {
830247a4
IZ
2562 RExC_naughty += 2 + RExC_naughty / 2;
2563 reginsert(pRExC_state, CURLY, ret);
fac92740
MJD
2564 Set_Node_Offset(ret, parse_start+1); /* MJD */
2565 Set_Node_Cur_Length(ret);
a0d0e21e
LW
2566 }
2567 else {
830247a4 2568 regnode *w = reg_node(pRExC_state, WHILEM);
2c2d71f5
JH
2569
2570 w->flags = 0;
830247a4
IZ
2571 regtail(pRExC_state, ret, w);
2572 if (!SIZE_ONLY && RExC_extralen) {
2573 reginsert(pRExC_state, LONGJMP,ret);
2574 reginsert(pRExC_state, NOTHING,ret);
c277df42
IZ
2575 NEXT_OFF(ret) = 3; /* Go over LONGJMP. */
2576 }
830247a4 2577 reginsert(pRExC_state, CURLYX,ret);
fac92740
MJD
2578 /* MJD hk */
2579 Set_Node_Offset(ret, parse_start+1);
2580 Set_Node_Length(ret,
2581 op == '{' ? (RExC_parse - parse_start) : 1);
2582
830247a4 2583 if (!SIZE_ONLY && RExC_extralen)
c277df42 2584 NEXT_OFF(ret) = 3; /* Go over NOTHING to LONGJMP. */
830247a4 2585 regtail(pRExC_state, ret, reg_node(pRExC_state, NOTHING));
c277df42 2586 if (SIZE_ONLY)
830247a4
IZ
2587 RExC_whilem_seen++, RExC_extralen += 3;
2588 RExC_naughty += 4 + RExC_naughty; /* compound interest */
a0d0e21e 2589 }
c277df42 2590 ret->flags = 0;
a0d0e21e
LW
2591
2592 if (min > 0)
821b33a5
IZ
2593 *flagp = WORST;
2594 if (max > 0)
2595 *flagp |= HASWIDTH;
a0d0e21e 2596 if (max && max < min)
8615cb43 2597 vFAIL("Can't do {n,m} with n > m");
c277df42
IZ
2598 if (!SIZE_ONLY) {
2599 ARG1_SET(ret, min);
2600 ARG2_SET(ret, max);
a687059c 2601 }
a687059c 2602
a0d0e21e 2603 goto nest_check;
a687059c 2604 }
a0d0e21e 2605 }
a687059c 2606
a0d0e21e
LW
2607 if (!ISMULT1(op)) {
2608 *flagp = flags;
a687059c 2609 return(ret);
a0d0e21e 2610 }
bb20fd44 2611
c277df42 2612#if 0 /* Now runtime fix should be reliable. */
b45f050a
JF
2613
2614 /* if this is reinstated, don't forget to put this back into perldiag:
2615
2616 =item Regexp *+ operand could be empty at {#} in regex m/%s/
2617
2618 (F) The part of the regexp subject to either the * or + quantifier
2619 could match an empty string. The {#} shows in the regular
2620 expression about where the problem was discovered.
2621
2622 */
2623
bb20fd44 2624 if (!(flags&HASWIDTH) && op != '?')
b45f050a 2625 vFAIL("Regexp *+ operand could be empty");
b81d288d 2626#endif
bb20fd44 2627
fac92740 2628 parse_start = RExC_parse;
830247a4 2629 nextchar(pRExC_state);
a0d0e21e 2630
821b33a5 2631 *flagp = (op != '+') ? (WORST|SPSTART|HASWIDTH) : (WORST|HASWIDTH);
a0d0e21e
LW
2632
2633 if (op == '*' && (flags&SIMPLE)) {
830247a4 2634 reginsert(pRExC_state, STAR, ret);
c277df42 2635 ret->flags = 0;
830247a4 2636 RExC_naughty += 4;
a0d0e21e
LW
2637 }
2638 else if (op == '*') {
2639 min = 0;
2640 goto do_curly;
a0ed51b3
LW
2641 }
2642 else if (op == '+' && (flags&SIMPLE)) {
830247a4 2643 reginsert(pRExC_state, PLUS, ret);
c277df42 2644 ret->flags = 0;
830247a4 2645 RExC_naughty += 3;
a0d0e21e
LW
2646 }
2647 else if (op == '+') {
2648 min = 1;
2649 goto do_curly;
a0ed51b3
LW
2650 }
2651 else if (op == '?') {
a0d0e21e
LW
2652 min = 0; max = 1;
2653 goto do_curly;
2654 }
2655 nest_check:
e476b1b5 2656 if (ckWARN(WARN_REGEXP) && !SIZE_ONLY && !(flags&HASWIDTH) && max > REG_INFTY/3) {
830247a4 2657 vWARN3(RExC_parse,
b45f050a 2658 "%.*s matches null string many times",
830247a4 2659 RExC_parse - origparse,
b45f050a 2660 origparse);
a0d0e21e
LW
2661 }
2662
830247a4
IZ
2663 if (*RExC_parse == '?') {
2664 nextchar(pRExC_state);
2665 reginsert(pRExC_state, MINMOD, ret);
2666 regtail(pRExC_state, ret, ret + NODE_STEP_REGNODE);
a0d0e21e 2667 }
830247a4
IZ
2668 if (ISMULT2(RExC_parse)) {
2669 RExC_parse++;
b45f050a
JF
2670 vFAIL("Nested quantifiers");
2671 }
a0d0e21e
LW
2672
2673 return(ret);
a687059c
LW
2674}
2675
2676/*
2677 - regatom - the lowest level
2678 *
2679 * Optimization: gobbles an entire sequence of ordinary characters so that
2680 * it can turn them into a single node, which is smaller to store and
2681 * faster to run. Backslashed characters are exceptions, each becoming a
2682 * separate node; the code is simpler that way and it's not worth fixing.
2683 *
b45f050a 2684 * [Yes, it is worth fixing, some scripts can run twice the speed.] */
76e3520e 2685STATIC regnode *
830247a4 2686S_regatom(pTHX_ RExC_state_t *pRExC_state, I32 *flagp)
a687059c 2687{
c277df42 2688 register regnode *ret = 0;
a0d0e21e 2689 I32 flags;
f06dbbb7 2690 char *parse_start = 0;
a0d0e21e
LW
2691
2692 *flagp = WORST; /* Tentatively. */
2693
2694tryagain:
830247a4 2695 switch (*RExC_parse) {
a0d0e21e 2696 case '^':
830247a4
IZ
2697 RExC_seen_zerolen++;
2698 nextchar(pRExC_state);
2699 if (RExC_flags16 & PMf_MULTILINE)
2700 ret = reg_node(pRExC_state, MBOL);
2701 else if (RExC_flags16 & PMf_SINGLELINE)
2702 ret = reg_node(pRExC_state, SBOL);
a0d0e21e 2703 else
830247a4 2704 ret = reg_node(pRExC_state, BOL);
fac92740 2705 Set_Node_Length(ret, 1); /* MJD */
a0d0e21e
LW
2706 break;
2707 case '$':
830247a4 2708 nextchar(pRExC_state);
b81d288d 2709 if (*RExC_parse)
830247a4
IZ
2710 RExC_seen_zerolen++;
2711 if (RExC_flags16 & PMf_MULTILINE)
2712 ret = reg_node(pRExC_state, MEOL);
2713 else if (RExC_flags16 & PMf_SINGLELINE)
2714 ret = reg_node(pRExC_state, SEOL);
a0d0e21e 2715 else
830247a4 2716 ret = reg_node(pRExC_state, EOL);
fac92740 2717 Set_Node_Length(ret, 1); /* MJD */
a0d0e21e
LW
2718 break;
2719 case '.':
830247a4 2720 nextchar(pRExC_state);
cce850e4 2721 if (RExC_flags16 & PMf_SINGLELINE)
ffc61ed2
JH
2722 ret = reg_node(pRExC_state, SANY);
2723 else
2724 ret = reg_node(pRExC_state, REG_ANY);
2725 *flagp |= HASWIDTH|SIMPLE;
830247a4 2726 RExC_naughty++;
fac92740 2727 Set_Node_Length(ret, 1); /* MJD */
a0d0e21e
LW
2728 break;
2729 case '[':
b45f050a 2730 {
830247a4 2731 char *oregcomp_parse = ++RExC_parse;
ffc61ed2 2732 ret = regclass(pRExC_state);
830247a4
IZ
2733 if (*RExC_parse != ']') {
2734 RExC_parse = oregcomp_parse;
b45f050a
JF
2735 vFAIL("Unmatched [");
2736 }
830247a4 2737 nextchar(pRExC_state);
a0d0e21e 2738 *flagp |= HASWIDTH|SIMPLE;
fac92740 2739 Set_Node_Length(ret, RExC_parse - oregcomp_parse + 1); /* MJD */
a0d0e21e 2740 break;
b45f050a 2741 }
a0d0e21e 2742 case '(':
830247a4
IZ
2743 nextchar(pRExC_state);
2744 ret = reg(pRExC_state, 1, &flags);
a0d0e21e 2745 if (ret == NULL) {
bf93d4cc 2746 if (flags & TRYAGAIN) {
830247a4 2747 if (RExC_parse == RExC_end) {
bf93d4cc
GS
2748 /* Make parent create an empty node if needed. */
2749 *flagp |= TRYAGAIN;
2750 return(NULL);
2751 }
a0d0e21e 2752 goto tryagain;
bf93d4cc 2753 }
a0d0e21e
LW
2754 return(NULL);
2755 }
c277df42 2756 *flagp |= flags&(HASWIDTH|SPSTART|SIMPLE);
a0d0e21e
LW
2757 break;
2758 case '|':
2759 case ')':
2760 if (flags & TRYAGAIN) {
2761 *flagp |= TRYAGAIN;
2762 return NULL;
2763 }
b45f050a 2764 vFAIL("Internal urp");
a0d0e21e
LW
2765 /* Supposed to be caught earlier. */
2766 break;
85afd4ae 2767 case '{':
830247a4
IZ
2768 if (!regcurly(RExC_parse)) {
2769 RExC_parse++;
85afd4ae
CS
2770 goto defchar;
2771 }
2772 /* FALL THROUGH */
a0d0e21e
LW
2773 case '?':
2774 case '+':
2775 case '*':
830247a4 2776 RExC_parse++;
b45f050a 2777 vFAIL("Quantifier follows nothing");
a0d0e21e
LW
2778 break;
2779 case '\\':
830247a4 2780 switch (*++RExC_parse) {
a0d0e21e 2781 case 'A':
830247a4
IZ
2782 RExC_seen_zerolen++;
2783 ret = reg_node(pRExC_state, SBOL);
a0d0e21e 2784 *flagp |= SIMPLE;
830247a4 2785 nextchar(pRExC_state);
fac92740 2786 Set_Node_Length(ret, 2); /* MJD */
a0d0e21e
LW
2787 break;
2788 case 'G':
830247a4
IZ
2789 ret = reg_node(pRExC_state, GPOS);
2790 RExC_seen |= REG_SEEN_GPOS;
a0d0e21e 2791 *flagp |= SIMPLE;
830247a4 2792 nextchar(pRExC_state);
fac92740 2793 Set_Node_Length(ret, 2); /* MJD */
a0d0e21e
LW
2794 break;
2795 case 'Z':
830247a4 2796 ret = reg_node(pRExC_state, SEOL);
a0d0e21e 2797 *flagp |= SIMPLE;
830247a4 2798 nextchar(pRExC_state);
a0d0e21e 2799 break;
b85d18e9 2800 case 'z':
830247a4 2801 ret = reg_node(pRExC_state, EOS);
b85d18e9 2802 *flagp |= SIMPLE;
830247a4
IZ
2803 RExC_seen_zerolen++; /* Do not optimize RE away */
2804 nextchar(pRExC_state);
fac92740 2805 Set_Node_Length(ret, 2); /* MJD */
b85d18e9 2806 break;
4a2d328f 2807 case 'C':
f33976b4
DB
2808 ret = reg_node(pRExC_state, CANY);
2809 RExC_seen |= REG_SEEN_CANY;
a0ed51b3 2810 *flagp |= HASWIDTH|SIMPLE;
830247a4 2811 nextchar(pRExC_state);
fac92740 2812 Set_Node_Length(ret, 2); /* MJD */
a0ed51b3
LW
2813 break;
2814 case 'X':
830247a4 2815 ret = reg_node(pRExC_state, CLUMP);
a0ed51b3 2816 *flagp |= HASWIDTH;
830247a4 2817 nextchar(pRExC_state);
fac92740 2818 Set_Node_Length(ret, 2); /* MJD */
a0ed51b3 2819 break;
a0d0e21e 2820 case 'w':
ffc61ed2 2821 ret = reg_node(pRExC_state, LOC ? ALNUML : ALNUM);
a0d0e21e 2822 *flagp |= HASWIDTH|SIMPLE;
830247a4 2823 nextchar(pRExC_state);
fac92740 2824 Set_Node_Length(ret, 2); /* MJD */
a0d0e21e
LW
2825 break;
2826 case 'W':
ffc61ed2 2827 ret = reg_node(pRExC_state, LOC ? NALNUML : NALNUM);
a0d0e21e 2828 *flagp |= HASWIDTH|SIMPLE;
830247a4 2829 nextchar(pRExC_state);
fac92740 2830 Set_Node_Length(ret, 2); /* MJD */
a0d0e21e
LW
2831 break;
2832 case 'b':
830247a4
IZ
2833 RExC_seen_zerolen++;
2834 RExC_seen |= REG_SEEN_LOOKBEHIND;
ffc61ed2 2835 ret = reg_node(pRExC_state, LOC ? BOUNDL : BOUND);
a0d0e21e 2836 *flagp |= SIMPLE;
830247a4 2837 nextchar(pRExC_state);
fac92740 2838 Set_Node_Length(ret, 2); /* MJD */
a0d0e21e
LW
2839 break;
2840 case 'B':
830247a4
IZ
2841 RExC_seen_zerolen++;
2842 RExC_seen |= REG_SEEN_LOOKBEHIND;
ffc61ed2 2843 ret = reg_node(pRExC_state, LOC ? NBOUNDL : NBOUND);
a0d0e21e 2844 *flagp |= SIMPLE;
830247a4 2845 nextchar(pRExC_state);
fac92740 2846 Set_Node_Length(ret, 2); /* MJD */
a0d0e21e
LW
2847 break;
2848 case 's':
ffc61ed2 2849 ret = reg_node(pRExC_state, LOC ? SPACEL : SPACE);
a0d0e21e 2850 *flagp |= HASWIDTH|SIMPLE;
830247a4 2851 nextchar(pRExC_state);
fac92740 2852 Set_Node_Length(ret, 2); /* MJD */
a0d0e21e
LW
2853 break;
2854 case 'S':
ffc61ed2 2855 ret = reg_node(pRExC_state, LOC ? NSPACEL : NSPACE);
a0d0e21e 2856 *flagp |= HASWIDTH|SIMPLE;
830247a4 2857 nextchar(pRExC_state);
fac92740 2858 Set_Node_Length(ret, 2); /* MJD */
a0d0e21e
LW
2859 break;
2860 case 'd':
ffc61ed2 2861 ret = reg_node(pRExC_state, DIGIT);
a0d0e21e 2862 *flagp |= HASWIDTH|SIMPLE;
830247a4 2863 nextchar(pRExC_state);
fac92740 2864 Set_Node_Length(ret, 2); /* MJD */
a0d0e21e
LW
2865 break;
2866 case 'D':
ffc61ed2 2867 ret = reg_node(pRExC_state, NDIGIT);
a0d0e21e 2868 *flagp |= HASWIDTH|SIMPLE;
830247a4 2869 nextchar(pRExC_state);
fac92740 2870 Set_Node_Length(ret, 2); /* MJD */
a0d0e21e 2871 break;
a14b48bc
LW
2872 case 'p':
2873 case 'P':
3568d838 2874 {
830247a4 2875 char* oldregxend = RExC_end;
fac92740 2876 char* parse_start = RExC_parse;
a14b48bc 2877
830247a4 2878 if (RExC_parse[1] == '{') {
3568d838 2879 /* a lovely hack--pretend we saw [\pX] instead */
830247a4
IZ
2880 RExC_end = strchr(RExC_parse, '}');
2881 if (!RExC_end) {
2882 RExC_parse += 2;
2883 RExC_end = oldregxend;
b45f050a
JF
2884 vFAIL("Missing right brace on \\p{}");
2885 }
830247a4 2886 RExC_end++;
a14b48bc
LW
2887 }
2888 else
830247a4
IZ
2889 RExC_end = RExC_parse + 2;
2890 RExC_parse--;
a14b48bc 2891
ffc61ed2 2892 ret = regclass(pRExC_state);
a14b48bc 2893
830247a4
IZ
2894 RExC_end = oldregxend;
2895 RExC_parse--;
fac92740 2896 Set_Node_Cur_Length(ret); /* MJD */
830247a4 2897 nextchar(pRExC_state);
a14b48bc
LW
2898 *flagp |= HASWIDTH|SIMPLE;
2899 }
2900 break;
a0d0e21e
LW
2901 case 'n':
2902 case 'r':
2903 case 't':
2904 case 'f':
2905 case 'e':
2906 case 'a':
2907 case 'x':
2908 case 'c':
2909 case '0':
2910 goto defchar;
2911 case '1': case '2': case '3': case '4':
2912 case '5': case '6': case '7': case '8': case '9':
2913 {
830247a4 2914 I32 num = atoi(RExC_parse);
a0d0e21e 2915
830247a4 2916 if (num > 9 && num >= RExC_npar)
a0d0e21e
LW
2917 goto defchar;
2918 else {
fac92740 2919 char * parse_start = RExC_parse - 1; /* MJD */
830247a4
IZ
2920 while (isDIGIT(*RExC_parse))
2921 RExC_parse++;
b45f050a 2922
830247a4 2923 if (!SIZE_ONLY && num > RExC_rx->nparens)
9baa0206 2924 vFAIL("Reference to nonexistent group");
830247a4
IZ
2925 RExC_sawback = 1;
2926 ret = reganode(pRExC_state, FOLD
a0ed51b3 2927 ? (LOC ? REFFL : REFF)
c8756f30 2928 : REF, num);
a0d0e21e 2929 *flagp |= HASWIDTH;
fac92740
MJD
2930
2931 /* override incorrect value set in reganode MJD */
2932 Set_Node_Offset(ret, parse_start+1);
2933 Set_Node_Cur_Length(ret); /* MJD */
830247a4
IZ
2934 RExC_parse--;
2935 nextchar(pRExC_state);
a0d0e21e
LW
2936 }
2937 }
2938 break;
2939 case '\0':
830247a4 2940 if (RExC_parse >= RExC_end)
b45f050a 2941 FAIL("Trailing \\");
a0d0e21e
LW
2942 /* FALL THROUGH */
2943 default:
c9f97d15
IZ
2944 /* Do not generate `unrecognized' warnings here, we fall
2945 back into the quick-grab loop below */
a0d0e21e
LW
2946 goto defchar;
2947 }
2948 break;
4633a7c4
LW
2949
2950 case '#':
830247a4
IZ
2951 if (RExC_flags16 & PMf_EXTENDED) {
2952 while (RExC_parse < RExC_end && *RExC_parse != '\n') RExC_parse++;
2953 if (RExC_parse < RExC_end)
4633a7c4
LW
2954 goto tryagain;
2955 }
2956 /* FALL THROUGH */
2957
a0d0e21e 2958 default: {
ba210ebe 2959 register STRLEN len;
a0ed51b3 2960 register UV ender;
a0d0e21e 2961 register char *p;
c277df42 2962 char *oldp, *s;
ba210ebe 2963 STRLEN numlen;
f06dbbb7
JH
2964
2965 parse_start = RExC_parse - 1;
a0d0e21e 2966
830247a4 2967 RExC_parse++;
a0d0e21e
LW
2968
2969 defchar:
830247a4 2970 ret = reg_node(pRExC_state, FOLD
a0ed51b3 2971 ? (LOC ? EXACTFL : EXACTF)
bbce6d69 2972 : EXACT);
cd439c50 2973 s = STRING(ret);
830247a4
IZ
2974 for (len = 0, p = RExC_parse - 1;
2975 len < 127 && p < RExC_end;
a0d0e21e
LW
2976 len++)
2977 {
2978 oldp = p;
5b5a24f7 2979
830247a4
IZ
2980 if (RExC_flags16 & PMf_EXTENDED)
2981 p = regwhite(p, RExC_end);
a0d0e21e
LW
2982 switch (*p) {
2983 case '^':
2984 case '$':
2985 case '.':
2986 case '[':
2987 case '(':
2988 case ')':
2989 case '|':
2990 goto loopdone;
2991 case '\\':
2992 switch (*++p) {
2993 case 'A':
2994 case 'G':
2995 case 'Z':
b85d18e9 2996 case 'z':
a0d0e21e
LW
2997 case 'w':
2998 case 'W':
2999 case 'b':
3000 case 'B':
3001 case 's':
3002 case 'S':
3003 case 'd':
3004 case 'D':
a14b48bc
LW
3005 case 'p':
3006 case 'P':
a0d0e21e
LW
3007 --p;
3008 goto loopdone;
3009 case 'n':
3010 ender = '\n';
3011 p++;
a687059c 3012 break;
a0d0e21e
LW
3013 case 'r':
3014 ender = '\r';
3015 p++;
a687059c 3016 break;
a0d0e21e
LW
3017 case 't':
3018 ender = '\t';
3019 p++;
a687059c 3020 break;
a0d0e21e
LW
3021 case 'f':
3022 ender = '\f';
3023 p++;
a687059c 3024 break;
a0d0e21e 3025 case 'e':
c7f1f016 3026 ender = ASCII_TO_NATIVE('\033');
a0d0e21e 3027 p++;
a687059c 3028 break;
a0d0e21e 3029 case 'a':
c7f1f016 3030 ender = ASCII_TO_NATIVE('\007');
a0d0e21e 3031 p++;
a687059c 3032 break;
a0d0e21e 3033 case 'x':
a0ed51b3
LW
3034 if (*++p == '{') {
3035 char* e = strchr(p, '}');
b81d288d 3036
b45f050a 3037 if (!e) {
830247a4 3038 RExC_parse = p + 1;
b45f050a
JF
3039 vFAIL("Missing right brace on \\x{}");
3040 }
de5f0749 3041 else {
a4c04bdc
NC
3042 I32 flags = PERL_SCAN_ALLOW_UNDERSCORES
3043 | PERL_SCAN_DISALLOW_PREFIX;
53305cf1
NC
3044 numlen = e - p - 1;
3045 ender = grok_hex(p + 1, &numlen, &flags, NULL);
aaa80028
JH
3046 if (ender > 0xff)
3047 RExC_utf8 = 1;
b21ed0a9
GS
3048 /* numlen is generous */
3049 if (numlen + len >= 127) {
a0ed51b3
LW
3050 p--;
3051 goto loopdone;
3052 }
3053 p = e + 1;
3054 }
a0ed51b3
LW
3055 }
3056 else {
a4c04bdc 3057 I32 flags = PERL_SCAN_DISALLOW_PREFIX;
53305cf1
NC
3058 numlen = 2;
3059 ender = grok_hex(p, &numlen, &flags, NULL);
a0ed51b3
LW
3060 p += numlen;
3061 }
a687059c 3062 break;
a0d0e21e
LW
3063 case 'c':
3064 p++;
bbce6d69 3065 ender = UCHARAT(p++);
3066 ender = toCTRL(ender);
a687059c 3067 break;
a0d0e21e
LW
3068 case '0': case '1': case '2': case '3':case '4':
3069 case '5': case '6': case '7': case '8':case '9':
3070 if (*p == '0' ||
830247a4 3071 (isDIGIT(p[1]) && atoi(p) >= RExC_npar) ) {
53305cf1
NC
3072 I32 flags = 0;
3073 numlen = 3;
3074 ender = grok_oct(p, &numlen, &flags, NULL);
a0d0e21e
LW
3075 p += numlen;
3076 }
3077 else {
3078 --p;
3079 goto loopdone;
a687059c
LW
3080 }
3081 break;
a0d0e21e 3082 case '\0':
830247a4 3083 if (p >= RExC_end)
b45f050a 3084 FAIL("Trailing \\");
a687059c 3085 /* FALL THROUGH */
a0d0e21e 3086 default:
e476b1b5 3087 if (!SIZE_ONLY && ckWARN(WARN_REGEXP) && isALPHA(*p))
b45f050a 3088 vWARN2(p +1, "Unrecognized escape \\%c passed through", *p);
a0ed51b3 3089 goto normal_default;
a0d0e21e
LW
3090 }
3091 break;
a687059c 3092 default:
a0ed51b3 3093 normal_default:
fd400ab9 3094 if (UTF8_IS_START(*p) && UTF) {
5e12f4fb 3095 ender = utf8n_to_uvchr((U8*)p, RExC_end - p,
ba210ebe 3096 &numlen, 0);
a0ed51b3
LW
3097 p += numlen;
3098 }
3099 else
3100 ender = *p++;
a0d0e21e 3101 break;
a687059c 3102 }
830247a4
IZ
3103 if (RExC_flags16 & PMf_EXTENDED)
3104 p = regwhite(p, RExC_end);
a0ed51b3
LW
3105 if (UTF && FOLD) {
3106 if (LOC)
5e12f4fb 3107 ender = toLOWER_LC_uvchr(ender);
a0ed51b3
LW
3108 else
3109 ender = toLOWER_uni(ender);
3110 }
a0d0e21e
LW
3111 if (ISMULT2(p)) { /* Back off on ?+*. */
3112 if (len)
3113 p = oldp;
2b9d42f0 3114 else if (!UNI_IS_INVARIANT(NATIVE_TO_UNI(ender)) && UTF) {
830247a4 3115 reguni(pRExC_state, ender, s, &numlen);
a0ed51b3
LW
3116 s += numlen;
3117 len += numlen;
3118 }
a0d0e21e
LW
3119 else {
3120 len++;
cd439c50 3121 REGC(ender, s++);
a0d0e21e
LW
3122 }
3123 break;
a687059c 3124 }
2b9d42f0 3125 if (!UNI_IS_INVARIANT(NATIVE_TO_UNI(ender)) && UTF) {
830247a4 3126 reguni(pRExC_state, ender, s, &numlen);
a0ed51b3
LW
3127 s += numlen;
3128 len += numlen - 1;
3129 }
3130 else
cd439c50 3131 REGC(ender, s++);
a0d0e21e
LW
3132 }
3133 loopdone:
830247a4 3134 RExC_parse = p - 1;
fac92740 3135 Set_Node_Cur_Length(ret); /* MJD */
830247a4 3136 nextchar(pRExC_state);
793db0cb
JH
3137 {
3138 /* len is STRLEN which is unsigned, need to copy to signed */
3139 IV iv = len;
3140 if (iv < 0)
3141 vFAIL("Internal disaster");
3142 }
a0d0e21e
LW
3143 if (len > 0)
3144 *flagp |= HASWIDTH;
3145 if (len == 1)
3146 *flagp |= SIMPLE;
c277df42 3147 if (!SIZE_ONLY)
cd439c50
IZ
3148 STR_LEN(ret) = len;
3149 if (SIZE_ONLY)
830247a4 3150 RExC_size += STR_SZ(len);
cd439c50 3151 else
830247a4 3152 RExC_emit += STR_SZ(len);
a687059c 3153 }
a0d0e21e
LW
3154 break;
3155 }
a687059c 3156
a0d0e21e 3157 return(ret);
a687059c
LW
3158}
3159
873ef191 3160STATIC char *
cea2e8a9 3161S_regwhite(pTHX_ char *p, char *e)
5b5a24f7
CS
3162{
3163 while (p < e) {
3164 if (isSPACE(*p))
3165 ++p;
3166 else if (*p == '#') {
3167 do {
3168 p++;
3169 } while (p < e && *p != '\n');
3170 }
3171 else
3172 break;
3173 }
3174 return p;
3175}
3176
b8c5462f
JH
3177/* Parse POSIX character classes: [[:foo:]], [[=foo=]], [[.foo.]].
3178 Character classes ([:foo:]) can also be negated ([:^foo:]).
3179 Returns a named class id (ANYOF_XXX) if successful, -1 otherwise.
3180 Equivalence classes ([=foo=]) and composites ([.foo.]) are parsed,
3181 but trigger warnings because they are currently unimplemented. */
3182STATIC I32
830247a4 3183S_regpposixcc(pTHX_ RExC_state_t *pRExC_state, I32 value)
620e46c5
JH
3184{
3185 char *posixcc = 0;
936ed897 3186 I32 namedclass = OOB_NAMEDCLASS;
620e46c5 3187
830247a4 3188 if (value == '[' && RExC_parse + 1 < RExC_end &&
620e46c5 3189 /* I smell either [: or [= or [. -- POSIX has been here, right? */
830247a4
IZ
3190 (*RExC_parse == ':' ||
3191 *RExC_parse == '=' ||
3192 *RExC_parse == '.')) {
3193 char c = *RExC_parse;
3194 char* s = RExC_parse++;
b81d288d 3195
830247a4
IZ
3196 while (RExC_parse < RExC_end && *RExC_parse != c)
3197 RExC_parse++;
3198 if (RExC_parse == RExC_end)
620e46c5 3199 /* Grandfather lone [:, [=, [. */
830247a4 3200 RExC_parse = s;
620e46c5 3201 else {
830247a4 3202 char* t = RExC_parse++; /* skip over the c */
b8c5462f 3203
830247a4
IZ
3204 if (*RExC_parse == ']') {
3205 RExC_parse++; /* skip over the ending ] */
b8c5462f
JH
3206 posixcc = s + 1;
3207 if (*s == ':') {
3208 I32 complement = *posixcc == '^' ? *posixcc++ : 0;
3209 I32 skip = 5; /* the most common skip */
3210
3211 switch (*posixcc) {
3212 case 'a':
3213 if (strnEQ(posixcc, "alnum", 5))
3214 namedclass =
3215 complement ? ANYOF_NALNUMC : ANYOF_ALNUMC;
3216 else if (strnEQ(posixcc, "alpha", 5))
3217 namedclass =
3218 complement ? ANYOF_NALPHA : ANYOF_ALPHA;
3219 else if (strnEQ(posixcc, "ascii", 5))
3220 namedclass =
3221 complement ? ANYOF_NASCII : ANYOF_ASCII;
3222 break;
aaa51d5e
JF
3223 case 'b':
3224 if (strnEQ(posixcc, "blank", 5))
3225 namedclass =
3226 complement ? ANYOF_NBLANK : ANYOF_BLANK;
3227 break;
b8c5462f
JH
3228 case 'c':
3229 if (strnEQ(posixcc, "cntrl", 5))
3230 namedclass =
3231 complement ? ANYOF_NCNTRL : ANYOF_CNTRL;
3232 break;
3233 case 'd':
3234 if (strnEQ(posixcc, "digit", 5))
3235 namedclass =
3236 complement ? ANYOF_NDIGIT : ANYOF_DIGIT;
3237 break;
3238 case 'g':
3239 if (strnEQ(posixcc, "graph", 5))
3240 namedclass =
3241 complement ? ANYOF_NGRAPH : ANYOF_GRAPH;
3242 break;
3243 case 'l':
3244 if (strnEQ(posixcc, "lower", 5))
3245 namedclass =
3246 complement ? ANYOF_NLOWER : ANYOF_LOWER;
3247 break;
3248 case 'p':
3249 if (strnEQ(posixcc, "print", 5))
3250 namedclass =
3251 complement ? ANYOF_NPRINT : ANYOF_PRINT;
3252 else if (strnEQ(posixcc, "punct", 5))
3253 namedclass =
3254 complement ? ANYOF_NPUNCT : ANYOF_PUNCT;
3255 break;
3256 case 's':
3257 if (strnEQ(posixcc, "space", 5))
3258 namedclass =
aaa51d5e 3259 complement ? ANYOF_NPSXSPC : ANYOF_PSXSPC;
cc4319de 3260 break;
b8c5462f
JH
3261 case 'u':
3262 if (strnEQ(posixcc, "upper", 5))
3263 namedclass =
3264 complement ? ANYOF_NUPPER : ANYOF_UPPER;
3265 break;
3266 case 'w': /* this is not POSIX, this is the Perl \w */
3267 if (strnEQ(posixcc, "word", 4)) {
3268 namedclass =
3269 complement ? ANYOF_NALNUM : ANYOF_ALNUM;
3270 skip = 4;
3271 }
3272 break;
3273 case 'x':
3274 if (strnEQ(posixcc, "xdigit", 6)) {
3275 namedclass =
3276 complement ? ANYOF_NXDIGIT : ANYOF_XDIGIT;
3277 skip = 6;
3278 }
3279 break;
3280 }
ac561586
JH
3281 if (namedclass == OOB_NAMEDCLASS ||
3282 posixcc[skip] != ':' ||
3283 posixcc[skip+1] != ']')
b45f050a
JF
3284 {
3285 Simple_vFAIL3("POSIX class [:%.*s:] unknown",
3286 t - s - 1, s + 1);
3287 }
3288 } else if (!SIZE_ONLY) {
b8c5462f 3289 /* [[=foo=]] and [[.foo.]] are still future. */
b45f050a 3290
830247a4 3291 /* adjust RExC_parse so the warning shows after
b45f050a 3292 the class closes */
830247a4
IZ
3293 while (*RExC_parse && *RExC_parse != ']')
3294 RExC_parse++;
b45f050a
JF
3295 Simple_vFAIL3("POSIX syntax [%c %c] is reserved for future extensions", c, c);
3296 }
b8c5462f
JH
3297 } else {
3298 /* Maternal grandfather:
3299 * "[:" ending in ":" but not in ":]" */
830247a4 3300 RExC_parse = s;
767d463e 3301 }
620e46c5
JH
3302 }
3303 }
3304
b8c5462f
JH
3305 return namedclass;
3306}
3307
3308STATIC void
830247a4 3309S_checkposixcc(pTHX_ RExC_state_t *pRExC_state)
b8c5462f 3310{
e476b1b5 3311 if (!SIZE_ONLY && ckWARN(WARN_REGEXP) &&
830247a4
IZ
3312 (*RExC_parse == ':' ||
3313 *RExC_parse == '=' ||
3314 *RExC_parse == '.')) {
3315 char *s = RExC_parse;
93733859 3316 char c = *s++;
b8c5462f
JH
3317
3318 while(*s && isALNUM(*s))
3319 s++;
3320 if (*s && c == *s && s[1] == ']') {
b45f050a
JF
3321 vWARN3(s+2, "POSIX syntax [%c %c] belongs inside character classes", c, c);
3322
3323 /* [[=foo=]] and [[.foo.]] are still future. */
b8c5462f 3324 if (c == '=' || c == '.')
b45f050a 3325 {
830247a4 3326 /* adjust RExC_parse so the error shows after
b45f050a 3327 the class closes */
830247a4 3328 while (*RExC_parse && *RExC_parse++ != ']')
b45f050a
JF
3329 ;
3330 Simple_vFAIL3("POSIX syntax [%c %c] is reserved for future extensions", c, c);
3331 }
b8c5462f
JH
3332 }
3333 }
620e46c5
JH
3334}
3335
76e3520e 3336STATIC regnode *
830247a4 3337S_regclass(pTHX_ RExC_state_t *pRExC_state)
a687059c 3338{
ffc61ed2 3339 register UV value;
3568d838 3340 register IV prevvalue = OOB_UNICODE;
ffc61ed2 3341 register IV range = 0;
c277df42 3342 register regnode *ret;
ba210ebe 3343 STRLEN numlen;
ffc61ed2 3344 IV namedclass;
9c5ffd7c 3345 char *rangebegin = 0;
936ed897 3346 bool need_class = 0;
9c5ffd7c 3347 SV *listsv = Nullsv;
ffc61ed2
JH
3348 register char *e;
3349 UV n;
3568d838 3350 bool optimize_invert = TRUE;
ffc61ed2
JH
3351
3352 ret = reganode(pRExC_state, ANYOF, 0);
3353
3354 if (!SIZE_ONLY)
3355 ANYOF_FLAGS(ret) = 0;
3356
3357 if (*RExC_parse == '^') { /* Complement of range. */
3358 RExC_naughty++;
3359 RExC_parse++;
3360 if (!SIZE_ONLY)
3361 ANYOF_FLAGS(ret) |= ANYOF_INVERT;
3362 }
a0d0e21e 3363
936ed897 3364 if (SIZE_ONLY)
830247a4 3365 RExC_size += ANYOF_SKIP;
936ed897 3366 else {
830247a4 3367 RExC_emit += ANYOF_SKIP;
936ed897
IZ
3368 if (FOLD)
3369 ANYOF_FLAGS(ret) |= ANYOF_FOLD;
3370 if (LOC)
3371 ANYOF_FLAGS(ret) |= ANYOF_LOCALE;
ffc61ed2
JH
3372 ANYOF_BITMAP_ZERO(ret);
3373 listsv = newSVpvn("# comment\n", 10);
a0d0e21e 3374 }
b8c5462f 3375
e476b1b5 3376 if (!SIZE_ONLY && ckWARN(WARN_REGEXP))
830247a4 3377 checkposixcc(pRExC_state);
b8c5462f 3378
830247a4 3379 if (*RExC_parse == ']' || *RExC_parse == '-')
ffc61ed2
JH
3380 goto charclassloop; /* allow 1st char to be ] or - */
3381
830247a4 3382 while (RExC_parse < RExC_end && *RExC_parse != ']') {
ffc61ed2
JH
3383
3384 charclassloop:
3385
3386 namedclass = OOB_NAMEDCLASS; /* initialize as illegal */
3387
73b437c8 3388 if (!range)
830247a4 3389 rangebegin = RExC_parse;
ffc61ed2 3390 if (UTF) {
5e12f4fb 3391 value = utf8n_to_uvchr((U8*)RExC_parse,
3568d838
JH
3392 RExC_end - RExC_parse,
3393 &numlen, 0);
ffc61ed2
JH
3394 RExC_parse += numlen;
3395 }
3396 else
3397 value = UCHARAT(RExC_parse++);
620e46c5 3398 if (value == '[')
830247a4 3399 namedclass = regpposixcc(pRExC_state, value);
620e46c5 3400 else if (value == '\\') {
ffc61ed2 3401 if (UTF) {
5e12f4fb 3402 value = utf8n_to_uvchr((U8*)RExC_parse,
ffc61ed2
JH
3403 RExC_end - RExC_parse,
3404 &numlen, 0);
3405 RExC_parse += numlen;
3406 }
3407 else
3408 value = UCHARAT(RExC_parse++);
470c3474 3409 /* Some compilers cannot handle switching on 64-bit integer
ffc61ed2 3410 * values, therefore value cannot be an UV. Yes, this will
e2962f66
JH
3411 * be a problem later if we want switch on Unicode.
3412 * A similar issue a little bit later when switching on
3413 * namedclass. --jhi */
ffc61ed2 3414 switch ((I32)value) {
b8c5462f
JH
3415 case 'w': namedclass = ANYOF_ALNUM; break;
3416 case 'W': namedclass = ANYOF_NALNUM; break;
3417 case 's': namedclass = ANYOF_SPACE; break;
3418 case 'S': namedclass = ANYOF_NSPACE; break;
3419 case 'd': namedclass = ANYOF_DIGIT; break;
3420 case 'D': namedclass = ANYOF_NDIGIT; break;
ffc61ed2
JH
3421 case 'p':
3422 case 'P':
3423 if (*RExC_parse == '{') {
3424 e = strchr(RExC_parse++, '}');
3425 if (!e)
3426 vFAIL("Missing right brace on \\p{}");
3427 n = e - RExC_parse;
3428 }
3429 else {
3430 e = RExC_parse;
3431 n = 1;
3432 }
3433 if (!SIZE_ONLY) {
3434 if (value == 'p')
3435 Perl_sv_catpvf(aTHX_ listsv,
3436 "+utf8::%.*s\n", (int)n, RExC_parse);
3437 else
3438 Perl_sv_catpvf(aTHX_ listsv,
3439 "!utf8::%.*s\n", (int)n, RExC_parse);
3440 }
3441 RExC_parse = e + 1;
3442 ANYOF_FLAGS(ret) |= ANYOF_UNICODE;
3443 continue;
b8c5462f
JH
3444 case 'n': value = '\n'; break;
3445 case 'r': value = '\r'; break;
3446 case 't': value = '\t'; break;
3447 case 'f': value = '\f'; break;
3448 case 'b': value = '\b'; break;
c7f1f016
NIS
3449 case 'e': value = ASCII_TO_NATIVE('\033');break;
3450 case 'a': value = ASCII_TO_NATIVE('\007');break;
b8c5462f 3451 case 'x':
ffc61ed2 3452 if (*RExC_parse == '{') {
a4c04bdc
NC
3453 I32 flags = PERL_SCAN_ALLOW_UNDERSCORES
3454 | PERL_SCAN_DISALLOW_PREFIX;
ffc61ed2 3455 e = strchr(RExC_parse++, '}');
b81d288d 3456 if (!e)
ffc61ed2 3457 vFAIL("Missing right brace on \\x{}");
53305cf1
NC
3458
3459 numlen = e - RExC_parse;
3460 value = grok_hex(RExC_parse, &numlen, &flags, NULL);
ffc61ed2
JH
3461 RExC_parse = e + 1;
3462 }
3463 else {
a4c04bdc 3464 I32 flags = PERL_SCAN_DISALLOW_PREFIX;
53305cf1
NC
3465 numlen = 2;
3466 value = grok_hex(RExC_parse, &numlen, &flags, NULL);
ffc61ed2
JH
3467 RExC_parse += numlen;
3468 }
b8c5462f
JH
3469 break;
3470 case 'c':
830247a4 3471 value = UCHARAT(RExC_parse++);
b8c5462f
JH
3472 value = toCTRL(value);
3473 break;
3474 case '0': case '1': case '2': case '3': case '4':
3475 case '5': case '6': case '7': case '8': case '9':
53305cf1
NC
3476 {
3477 I32 flags = 0;
3478 numlen = 3;
3479 value = grok_oct(--RExC_parse, &numlen, &flags, NULL);
830247a4 3480 RExC_parse += numlen;
b8c5462f 3481 break;
53305cf1 3482 }
1028017a 3483 default:
e476b1b5 3484 if (!SIZE_ONLY && ckWARN(WARN_REGEXP) && isALPHA(value))
ffc61ed2
JH
3485 vWARN2(RExC_parse,
3486 "Unrecognized escape \\%c in character class passed through",
3487 (int)value);
1028017a 3488 break;
b8c5462f 3489 }
ffc61ed2
JH
3490 } /* end of \blah */
3491
3492 if (namedclass > OOB_NAMEDCLASS) { /* this is a named class \blah */
3493
3494 if (!SIZE_ONLY && !need_class)
936ed897 3495 ANYOF_CLASS_ZERO(ret);
ffc61ed2 3496
936ed897 3497 need_class = 1;
ffc61ed2
JH
3498
3499 /* a bad range like a-\d, a-[:digit:] ? */
3500 if (range) {
73b437c8 3501 if (!SIZE_ONLY) {
e476b1b5 3502 if (ckWARN(WARN_REGEXP))
830247a4 3503 vWARN4(RExC_parse,
b45f050a 3504 "False [] range \"%*.*s\"",
830247a4
IZ
3505 RExC_parse - rangebegin,
3506 RExC_parse - rangebegin,
b45f050a 3507 rangebegin);
3568d838
JH
3508 if (prevvalue < 256) {
3509 ANYOF_BITMAP_SET(ret, prevvalue);
ffc61ed2
JH
3510 ANYOF_BITMAP_SET(ret, '-');
3511 }
3512 else {
3513 ANYOF_FLAGS(ret) |= ANYOF_UNICODE;
3514 Perl_sv_catpvf(aTHX_ listsv,
3568d838 3515 "%04"UVxf"\n%04"UVxf"\n", (UV)prevvalue, (UV) '-');
ffc61ed2 3516 }
b8c5462f 3517 }
ffc61ed2
JH
3518
3519 range = 0; /* this was not a true range */
73b437c8 3520 }
ffc61ed2 3521
73b437c8 3522 if (!SIZE_ONLY) {
3568d838
JH
3523 if (namedclass > OOB_NAMEDCLASS)
3524 optimize_invert = FALSE;
e2962f66
JH
3525 /* Possible truncation here but in some 64-bit environments
3526 * the compiler gets heartburn about switch on 64-bit values.
3527 * A similar issue a little earlier when switching on value.
98f323fa 3528 * --jhi */
e2962f66 3529 switch ((I32)namedclass) {
73b437c8
JH
3530 case ANYOF_ALNUM:
3531 if (LOC)
936ed897 3532 ANYOF_CLASS_SET(ret, ANYOF_ALNUM);
73b437c8
JH
3533 else {
3534 for (value = 0; value < 256; value++)
3535 if (isALNUM(value))
936ed897 3536 ANYOF_BITMAP_SET(ret, value);
73b437c8 3537 }
ffc61ed2 3538 Perl_sv_catpvf(aTHX_ listsv, "+utf8::IsWord\n");
73b437c8
JH
3539 break;
3540 case ANYOF_NALNUM:
3541 if (LOC)
936ed897 3542 ANYOF_CLASS_SET(ret, ANYOF_NALNUM);
73b437c8
JH
3543 else {
3544 for (value = 0; value < 256; value++)
3545 if (!isALNUM(value))
936ed897 3546 ANYOF_BITMAP_SET(ret, value);
73b437c8 3547 }
ffc61ed2 3548 Perl_sv_catpvf(aTHX_ listsv, "!utf8::IsWord\n");
73b437c8 3549 break;
ffc61ed2 3550 case ANYOF_ALNUMC:
73b437c8 3551 if (LOC)
ffc61ed2 3552 ANYOF_CLASS_SET(ret, ANYOF_ALNUMC);
73b437c8
JH
3553 else {
3554 for (value = 0; value < 256; value++)
ffc61ed2 3555 if (isALNUMC(value))
936ed897 3556 ANYOF_BITMAP_SET(ret, value);
73b437c8 3557 }
ffc61ed2 3558 Perl_sv_catpvf(aTHX_ listsv, "+utf8::IsAlnum\n");
73b437c8
JH
3559 break;
3560 case ANYOF_NALNUMC:
3561 if (LOC)
936ed897 3562 ANYOF_CLASS_SET(ret, ANYOF_NALNUMC);
73b437c8
JH
3563 else {
3564 for (value = 0; value < 256; value++)
3565 if (!isALNUMC(value))
936ed897 3566 ANYOF_BITMAP_SET(ret, value);
73b437c8 3567 }
ffc61ed2 3568 Perl_sv_catpvf(aTHX_ listsv, "!utf8::IsAlnum\n");
73b437c8
JH
3569 break;
3570 case ANYOF_ALPHA:
3571 if (LOC)
936ed897 3572 ANYOF_CLASS_SET(ret, ANYOF_ALPHA);
73b437c8
JH
3573 else {
3574 for (value = 0; value < 256; value++)
3575 if (isALPHA(value))
936ed897 3576 ANYOF_BITMAP_SET(ret, value);
73b437c8 3577 }
ffc61ed2 3578 Perl_sv_catpvf(aTHX_ listsv, "+utf8::IsAlpha\n");
73b437c8
JH
3579 break;
3580 case ANYOF_NALPHA:
3581 if (LOC)
936ed897 3582 ANYOF_CLASS_SET(ret, ANYOF_NALPHA);
73b437c8
JH
3583 else {
3584 for (value = 0; value < 256; value++)
3585 if (!isALPHA(value))
936ed897 3586 ANYOF_BITMAP_SET(ret, value);
73b437c8 3587 }
ffc61ed2 3588 Perl_sv_catpvf(aTHX_ listsv, "!utf8::IsAlpha\n");
73b437c8
JH
3589 break;
3590 case ANYOF_ASCII:
3591 if (LOC)
936ed897 3592 ANYOF_CLASS_SET(ret, ANYOF_ASCII);
73b437c8 3593 else {
c7f1f016 3594#ifndef EBCDIC
1ba5c669
JH
3595 for (value = 0; value < 128; value++)
3596 ANYOF_BITMAP_SET(ret, value);
3597#else /* EBCDIC */
ffbc6a93 3598 for (value = 0; value < 256; value++) {
3a3c4447
JH
3599 if (isASCII(value))
3600 ANYOF_BITMAP_SET(ret, value);
ffbc6a93 3601 }
1ba5c669 3602#endif /* EBCDIC */
73b437c8 3603 }
ffc61ed2 3604 Perl_sv_catpvf(aTHX_ listsv, "+utf8::IsASCII\n");
73b437c8
JH
3605 break;
3606 case ANYOF_NASCII:
3607 if (LOC)
936ed897 3608 ANYOF_CLASS_SET(ret, ANYOF_NASCII);
73b437c8 3609 else {
c7f1f016 3610#ifndef EBCDIC
1ba5c669
JH
3611 for (value = 128; value < 256; value++)
3612 ANYOF_BITMAP_SET(ret, value);
3613#else /* EBCDIC */
ffbc6a93 3614 for (value = 0; value < 256; value++) {
3a3c4447
JH
3615 if (!isASCII(value))
3616 ANYOF_BITMAP_SET(ret, value);
ffbc6a93 3617 }
1ba5c669 3618#endif /* EBCDIC */
73b437c8 3619 }
ffc61ed2 3620 Perl_sv_catpvf(aTHX_ listsv, "!utf8::IsASCII\n");
73b437c8 3621 break;
aaa51d5e
JF
3622 case ANYOF_BLANK:
3623 if (LOC)
3624 ANYOF_CLASS_SET(ret, ANYOF_BLANK);
3625 else {
3626 for (value = 0; value < 256; value++)
3627 if (isBLANK(value))
3628 ANYOF_BITMAP_SET(ret, value);
3629 }
ffc61ed2 3630 Perl_sv_catpvf(aTHX_ listsv, "+utf8::IsBlank\n");
aaa51d5e
JF
3631 break;
3632 case ANYOF_NBLANK:
3633 if (LOC)
3634 ANYOF_CLASS_SET(ret, ANYOF_NBLANK);
3635 else {
3636 for (value = 0; value < 256; value++)
3637 if (!isBLANK(value))
3638 ANYOF_BITMAP_SET(ret, value);
3639 }
ffc61ed2 3640 Perl_sv_catpvf(aTHX_ listsv, "!utf8::IsBlank\n");
aaa51d5e 3641 break;
73b437c8
JH
3642 case ANYOF_CNTRL:
3643 if (LOC)
936ed897 3644 ANYOF_CLASS_SET(ret, ANYOF_CNTRL);
73b437c8
JH
3645 else {
3646 for (value = 0; value < 256; value++)
3647 if (isCNTRL(value))
936ed897 3648 ANYOF_BITMAP_SET(ret, value);
73b437c8 3649 }
ffc61ed2 3650 Perl_sv_catpvf(aTHX_ listsv, "+utf8::IsCntrl\n");
73b437c8
JH
3651 break;
3652 case ANYOF_NCNTRL:
3653 if (LOC)
936ed897 3654 ANYOF_CLASS_SET(ret, ANYOF_NCNTRL);
73b437c8
JH
3655 else {
3656 for (value = 0; value < 256; value++)
3657 if (!isCNTRL(value))
936ed897 3658 ANYOF_BITMAP_SET(ret, value);
73b437c8 3659 }
ffc61ed2
JH
3660 Perl_sv_catpvf(aTHX_ listsv, "!utf8::IsCntrl\n");
3661 break;
3662 case ANYOF_DIGIT:
3663 if (LOC)
3664 ANYOF_CLASS_SET(ret, ANYOF_DIGIT);
3665 else {
3666 /* consecutive digits assumed */
3667 for (value = '0'; value <= '9'; value++)
3668 ANYOF_BITMAP_SET(ret, value);
3669 }
3670 Perl_sv_catpvf(aTHX_ listsv, "+utf8::IsDigit\n");
3671 break;
3672 case ANYOF_NDIGIT:
3673 if (LOC)
3674 ANYOF_CLASS_SET(ret, ANYOF_NDIGIT);
3675 else {
3676 /* consecutive digits assumed */
3677 for (value = 0; value < '0'; value++)
3678 ANYOF_BITMAP_SET(ret, value);
3679 for (value = '9' + 1; value < 256; value++)
3680 ANYOF_BITMAP_SET(ret, value);
3681 }
3682 Perl_sv_catpvf(aTHX_ listsv, "!utf8::IsDigit\n");
73b437c8
JH
3683 break;
3684 case ANYOF_GRAPH:
3685 if (LOC)
936ed897 3686 ANYOF_CLASS_SET(ret, ANYOF_GRAPH);
73b437c8
JH
3687 else {
3688 for (value = 0; value < 256; value++)
3689 if (isGRAPH(value))
936ed897 3690 ANYOF_BITMAP_SET(ret, value);
73b437c8 3691 }
ffc61ed2 3692 Perl_sv_catpvf(aTHX_ listsv, "+utf8::IsGraph\n");
73b437c8
JH
3693 break;
3694 case ANYOF_NGRAPH:
3695 if (LOC)
936ed897 3696 ANYOF_CLASS_SET(ret, ANYOF_NGRAPH);
73b437c8
JH
3697 else {
3698 for (value = 0; value < 256; value++)
3699 if (!isGRAPH(value))
936ed897 3700 ANYOF_BITMAP_SET(ret, value);
73b437c8 3701 }
ffc61ed2 3702 Perl_sv_catpvf(aTHX_ listsv, "!utf8::IsGraph\n");
73b437c8
JH
3703 break;
3704 case ANYOF_LOWER:
3705 if (LOC)
936ed897 3706 ANYOF_CLASS_SET(ret, ANYOF_LOWER);
73b437c8
JH
3707 else {
3708 for (value = 0; value < 256; value++)
3709 if (isLOWER(value))
936ed897 3710 ANYOF_BITMAP_SET(ret, value);
73b437c8 3711 }
ffc61ed2 3712 Perl_sv_catpvf(aTHX_ listsv, "+utf8::IsLower\n");
73b437c8
JH
3713 break;
3714 case ANYOF_NLOWER:
3715 if (LOC)
936ed897 3716 ANYOF_CLASS_SET(ret, ANYOF_NLOWER);
73b437c8
JH
3717 else {
3718 for (value = 0; value < 256; value++)
3719 if (!isLOWER(value))
936ed897 3720 ANYOF_BITMAP_SET(ret, value);
73b437c8 3721 }
ffc61ed2 3722 Perl_sv_catpvf(aTHX_ listsv, "!utf8::IsLower\n");
73b437c8
JH
3723 break;
3724 case ANYOF_PRINT:
3725 if (LOC)
936ed897 3726 ANYOF_CLASS_SET(ret, ANYOF_PRINT);
73b437c8
JH
3727 else {
3728 for (value = 0; value < 256; value++)
3729 if (isPRINT(value))
936ed897 3730 ANYOF_BITMAP_SET(ret, value);
73b437c8 3731 }
ffc61ed2 3732 Perl_sv_catpvf(aTHX_ listsv, "+utf8::IsPrint\n");
73b437c8
JH
3733 break;
3734 case ANYOF_NPRINT:
3735 if (LOC)
936ed897 3736 ANYOF_CLASS_SET(ret, ANYOF_NPRINT);
73b437c8
JH
3737 else {
3738 for (value = 0; value < 256; value++)
3739 if (!isPRINT(value))
936ed897 3740 ANYOF_BITMAP_SET(ret, value);
73b437c8 3741 }
ffc61ed2 3742 Perl_sv_catpvf(aTHX_ listsv, "!utf8::IsPrint\n");
73b437c8 3743 break;
aaa51d5e
JF
3744 case ANYOF_PSXSPC:
3745 if (LOC)
3746 ANYOF_CLASS_SET(ret, ANYOF_PSXSPC);
3747 else {
3748 for (value = 0; value < 256; value++)
3749 if (isPSXSPC(value))
3750 ANYOF_BITMAP_SET(ret, value);
3751 }
ffc61ed2 3752 Perl_sv_catpvf(aTHX_ listsv, "+utf8::IsSpace\n");
aaa51d5e
JF
3753 break;
3754 case ANYOF_NPSXSPC:
3755 if (LOC)
3756 ANYOF_CLASS_SET(ret, ANYOF_NPSXSPC);
3757 else {
3758 for (value = 0; value < 256; value++)
3759 if (!isPSXSPC(value))
3760 ANYOF_BITMAP_SET(ret, value);
3761 }
ffc61ed2 3762 Perl_sv_catpvf(aTHX_ listsv, "!utf8::IsSpace\n");
aaa51d5e 3763 break;
73b437c8
JH
3764 case ANYOF_PUNCT:
3765 if (LOC)
936ed897 3766 ANYOF_CLASS_SET(ret, ANYOF_PUNCT);
73b437c8
JH
3767 else {
3768 for (value = 0; value < 256; value++)
3769 if (isPUNCT(value))
936ed897 3770 ANYOF_BITMAP_SET(ret, value);
73b437c8 3771 }
ffc61ed2 3772 Perl_sv_catpvf(aTHX_ listsv, "+utf8::IsPunct\n");
73b437c8
JH
3773 break;
3774 case ANYOF_NPUNCT:
3775 if (LOC)
936ed897 3776 ANYOF_CLASS_SET(ret, ANYOF_NPUNCT);
73b437c8
JH
3777 else {
3778 for (value = 0; value < 256; value++)
3779 if (!isPUNCT(value))
936ed897 3780 ANYOF_BITMAP_SET(ret, value);
73b437c8 3781 }
ffc61ed2
JH
3782 Perl_sv_catpvf(aTHX_ listsv, "!utf8::IsPunct\n");
3783 break;
3784 case ANYOF_SPACE:
3785 if (LOC)
3786 ANYOF_CLASS_SET(ret, ANYOF_SPACE);
3787 else {
3788 for (value = 0; value < 256; value++)
3789 if (isSPACE(value))
3790 ANYOF_BITMAP_SET(ret, value);
3791 }
3792 Perl_sv_catpvf(aTHX_ listsv, "+utf8::IsSpacePerl\n");
3793 break;
3794 case ANYOF_NSPACE:
3795 if (LOC)
3796 ANYOF_CLASS_SET(ret, ANYOF_NSPACE);
3797 else {
3798 for (value = 0; value < 256; value++)
3799 if (!isSPACE(value))
3800 ANYOF_BITMAP_SET(ret, value);
3801 }
3802 Perl_sv_catpvf(aTHX_ listsv, "!utf8::IsSpacePerl\n");
73b437c8
JH
3803 break;
3804 case ANYOF_UPPER:
3805 if (LOC)
936ed897 3806 ANYOF_CLASS_SET(ret, ANYOF_UPPER);
73b437c8
JH
3807 else {
3808 for (value = 0; value < 256; value++)
3809 if (isUPPER(value))
936ed897 3810 ANYOF_BITMAP_SET(ret, value);
73b437c8 3811 }
ffc61ed2 3812 Perl_sv_catpvf(aTHX_ listsv, "+utf8::IsUpper\n");
73b437c8
JH
3813 break;
3814 case ANYOF_NUPPER:
3815 if (LOC)
936ed897 3816 ANYOF_CLASS_SET(ret, ANYOF_NUPPER);
73b437c8
JH
3817 else {
3818 for (value = 0; value < 256; value++)
3819 if (!isUPPER(value))
936ed897 3820 ANYOF_BITMAP_SET(ret, value);
73b437c8 3821 }
ffc61ed2 3822 Perl_sv_catpvf(aTHX_ listsv, "!utf8::IsUpper\n");
73b437c8
JH
3823 break;
3824 case ANYOF_XDIGIT:
3825 if (LOC)
936ed897 3826 ANYOF_CLASS_SET(ret, ANYOF_XDIGIT);
73b437c8
JH
3827 else {
3828 for (value = 0; value < 256; value++)
3829 if (isXDIGIT(value))
936ed897 3830 ANYOF_BITMAP_SET(ret, value);
73b437c8 3831 }
ffc61ed2 3832 Perl_sv_catpvf(aTHX_ listsv, "+utf8::IsXDigit\n");
73b437c8
JH
3833 break;
3834 case ANYOF_NXDIGIT:
3835 if (LOC)
936ed897 3836 ANYOF_CLASS_SET(ret, ANYOF_NXDIGIT);
73b437c8
JH
3837 else {
3838 for (value = 0; value < 256; value++)
3839 if (!isXDIGIT(value))
936ed897 3840 ANYOF_BITMAP_SET(ret, value);
73b437c8 3841 }
ffc61ed2 3842 Perl_sv_catpvf(aTHX_ listsv, "!utf8::IsXDigit\n");
73b437c8
JH
3843 break;
3844 default:
b45f050a 3845 vFAIL("Invalid [::] class");
73b437c8 3846 break;
b8c5462f 3847 }
b8c5462f 3848 if (LOC)
936ed897 3849 ANYOF_FLAGS(ret) |= ANYOF_CLASS;
73b437c8 3850 continue;
a0d0e21e 3851 }
ffc61ed2
JH
3852 } /* end of namedclass \blah */
3853
a0d0e21e 3854 if (range) {
3a3c4447 3855 if (prevvalue > value) /* b-a */ {
b45f050a 3856 Simple_vFAIL4("Invalid [] range \"%*.*s\"",
830247a4
IZ
3857 RExC_parse - rangebegin,
3858 RExC_parse - rangebegin,
b45f050a 3859 rangebegin);
3568d838 3860 range = 0; /* not a valid range */
73b437c8 3861 }
a0d0e21e
LW
3862 }
3863 else {
3568d838 3864 prevvalue = value; /* save the beginning of the range */
830247a4
IZ
3865 if (*RExC_parse == '-' && RExC_parse+1 < RExC_end &&
3866 RExC_parse[1] != ']') {
3867 RExC_parse++;
ffc61ed2
JH
3868
3869 /* a bad range like \w-, [:word:]- ? */
3870 if (namedclass > OOB_NAMEDCLASS) {
e476b1b5 3871 if (ckWARN(WARN_REGEXP))
830247a4 3872 vWARN4(RExC_parse,
b45f050a 3873 "False [] range \"%*.*s\"",
830247a4
IZ
3874 RExC_parse - rangebegin,
3875 RExC_parse - rangebegin,
b45f050a 3876 rangebegin);
73b437c8 3877 if (!SIZE_ONLY)
936ed897 3878 ANYOF_BITMAP_SET(ret, '-');
73b437c8 3879 } else
ffc61ed2
JH
3880 range = 1; /* yeah, it's a range! */
3881 continue; /* but do it the next time */
a0d0e21e 3882 }
a687059c 3883 }
ffc61ed2 3884
93733859 3885 /* now is the next time */
ae5c130c 3886 if (!SIZE_ONLY) {
3568d838
JH
3887 IV i;
3888
3889 if (prevvalue < 256) {
3890 IV ceilvalue = value < 256 ? value : 255;
3891
3892#ifdef EBCDIC
3a3c4447
JH
3893 if ((isLOWER(prevvalue) && isLOWER(ceilvalue)) ||
3894 (isUPPER(prevvalue) && isUPPER(ceilvalue)))
ffc61ed2 3895 {
3568d838
JH
3896 if (isLOWER(prevvalue)) {
3897 for (i = prevvalue; i <= ceilvalue; i++)
ffc61ed2
JH
3898 if (isLOWER(i))
3899 ANYOF_BITMAP_SET(ret, i);
3900 } else {
3568d838 3901 for (i = prevvalue; i <= ceilvalue; i++)
ffc61ed2
JH
3902 if (isUPPER(i))
3903 ANYOF_BITMAP_SET(ret, i);
3904 }
8ada0baa 3905 }
ffc61ed2 3906 else
8ada0baa 3907#endif
3568d838
JH
3908 for (i = prevvalue; i <= ceilvalue; i++)
3909 ANYOF_BITMAP_SET(ret, i);
3910 }
3911 if (value > 255) {
ffc61ed2 3912 ANYOF_FLAGS(ret) |= ANYOF_UNICODE;
3568d838 3913 if (prevvalue < value)
ffc61ed2 3914 Perl_sv_catpvf(aTHX_ listsv, "%04"UVxf"\t%04"UVxf"\n",
3568d838
JH
3915 (UV)prevvalue, (UV)value);
3916 else if (prevvalue == value)
ffc61ed2
JH
3917 Perl_sv_catpvf(aTHX_ listsv, "%04"UVxf"\n",
3918 (UV)value);
3919 }
8ada0baa 3920 }
ffc61ed2
JH
3921
3922 range = 0; /* this range (if it was one) is done now */
a0d0e21e 3923 }
ffc61ed2 3924
936ed897 3925 if (need_class) {
4f66b38d 3926 ANYOF_FLAGS(ret) |= ANYOF_LARGE;
936ed897 3927 if (SIZE_ONLY)
830247a4 3928 RExC_size += ANYOF_CLASS_ADD_SKIP;
936ed897 3929 else
830247a4 3930 RExC_emit += ANYOF_CLASS_ADD_SKIP;
936ed897 3931 }
ffc61ed2 3932
ae5c130c 3933 /* optimize case-insensitive simple patterns (e.g. /[a-z]/i) */
b8c5462f 3934 if (!SIZE_ONLY &&
ffc61ed2 3935 /* If the only flag is folding (plus possibly inversion). */
516a5887
JH
3936 ((ANYOF_FLAGS(ret) & (ANYOF_FLAGS_ALL ^ ANYOF_INVERT)) == ANYOF_FOLD)
3937 ) {
a0ed51b3 3938 for (value = 0; value < 256; ++value) {
936ed897 3939 if (ANYOF_BITMAP_TEST(ret, value)) {
ffc61ed2
JH
3940 IV fold = PL_fold[value];
3941
3942 if (fold != value)
3943 ANYOF_BITMAP_SET(ret, fold);
ae5c130c
GS
3944 }
3945 }
936ed897 3946 ANYOF_FLAGS(ret) &= ~ANYOF_FOLD;
ae5c130c 3947 }
ffc61ed2 3948
ae5c130c 3949 /* optimize inverted simple patterns (e.g. [^a-z]) */
3568d838 3950 if (!SIZE_ONLY && optimize_invert &&
ffc61ed2
JH
3951 /* If the only flag is inversion. */
3952 (ANYOF_FLAGS(ret) & ANYOF_FLAGS_ALL) == ANYOF_INVERT) {
b8c5462f 3953 for (value = 0; value < ANYOF_BITMAP_SIZE; ++value)
936ed897 3954 ANYOF_BITMAP(ret)[value] ^= ANYOF_FLAGS_ALL;
1aa99e6b 3955 ANYOF_FLAGS(ret) = ANYOF_UNICODE_ALL;
ae5c130c 3956 }
a0d0e21e 3957
b81d288d 3958 if (!SIZE_ONLY) {
fde631ed 3959 AV *av = newAV();
ffc61ed2
JH
3960 SV *rv;
3961
3962 av_store(av, 0, listsv);
3963 av_store(av, 1, NULL);
3964 rv = newRV_noinc((SV*)av);
19860706 3965 n = add_data(pRExC_state, 1, "s");
830247a4 3966 RExC_rx->data->data[n] = (void*)rv;
ffc61ed2 3967 ARG_SET(ret, n);
a0ed51b3
LW
3968 }
3969
3970 return ret;
3971}
3972
76e3520e 3973STATIC char*
830247a4 3974S_nextchar(pTHX_ RExC_state_t *pRExC_state)
a0d0e21e 3975{
830247a4 3976 char* retval = RExC_parse++;
a0d0e21e 3977
4633a7c4 3978 for (;;) {
830247a4
IZ
3979 if (*RExC_parse == '(' && RExC_parse[1] == '?' &&
3980 RExC_parse[2] == '#') {
3981 while (*RExC_parse && *RExC_parse != ')')
3982 RExC_parse++;
3983 RExC_parse++;
4633a7c4
LW
3984 continue;
3985 }
830247a4
IZ
3986 if (RExC_flags16 & PMf_EXTENDED) {
3987 if (isSPACE(*RExC_parse)) {
3988 RExC_parse++;
748a9306
LW
3989 continue;
3990 }
830247a4
IZ
3991 else if (*RExC_parse == '#') {
3992 while (*RExC_parse && *RExC_parse != '\n')
3993 RExC_parse++;
3994 RExC_parse++;
748a9306
LW
3995 continue;
3996 }
748a9306 3997 }
4633a7c4 3998 return retval;
a0d0e21e 3999 }
a687059c
LW
4000}
4001
4002/*
c277df42 4003- reg_node - emit a node
a0d0e21e 4004*/
76e3520e 4005STATIC regnode * /* Location. */
830247a4 4006S_reg_node(pTHX_ RExC_state_t *pRExC_state, U8 op)
a687059c 4007{
c277df42
IZ
4008 register regnode *ret;
4009 register regnode *ptr;
a687059c 4010
830247a4 4011 ret = RExC_emit;
c277df42 4012 if (SIZE_ONLY) {
830247a4
IZ
4013 SIZE_ALIGN(RExC_size);
4014 RExC_size += 1;
a0d0e21e
LW
4015 return(ret);
4016 }
a687059c 4017
c277df42 4018 NODE_ALIGN_FILL(ret);
a0d0e21e 4019 ptr = ret;
c277df42 4020 FILL_ADVANCE_NODE(ptr, op);
fac92740
MJD
4021 if (RExC_offsets) { /* MJD */
4022 MJD_OFFSET_DEBUG((stderr, "%s:%u: (op %s) %s %u <- %u (len %u) (max %u).\n",
4023 "reg_node", __LINE__,
4024 reg_name[op],
4025 RExC_emit - RExC_emit_start > RExC_offsets[0]
4026 ? "Overwriting end of array!\n" : "OK",
4027 RExC_emit - RExC_emit_start,
4028 RExC_parse - RExC_start,
4029 RExC_offsets[0]));
4030 Set_Node_Offset(RExC_emit, RExC_parse + (op == END));
4031 }
4032
830247a4 4033 RExC_emit = ptr;
a687059c 4034
a0d0e21e 4035 return(ret);
a687059c
LW
4036}
4037
4038/*
a0d0e21e
LW
4039- reganode - emit a node with an argument
4040*/
76e3520e 4041STATIC regnode * /* Location. */
830247a4 4042S_reganode(pTHX_ RExC_state_t *pRExC_state, U8 op, U32 arg)
fe14fcc3 4043{
c277df42
IZ
4044 register regnode *ret;
4045 register regnode *ptr;
fe14fcc3 4046
830247a4 4047 ret = RExC_emit;
c277df42 4048 if (SIZE_ONLY) {
830247a4
IZ
4049 SIZE_ALIGN(RExC_size);
4050 RExC_size += 2;
a0d0e21e
LW
4051 return(ret);
4052 }
fe14fcc3 4053
c277df42 4054 NODE_ALIGN_FILL(ret);
a0d0e21e 4055 ptr = ret;
c277df42 4056 FILL_ADVANCE_NODE_ARG(ptr, op, arg);
fac92740
MJD
4057 if (RExC_offsets) { /* MJD */
4058 MJD_OFFSET_DEBUG((stderr, "%s: %s %u <- %u (max %u).\n",
4059 "reganode",
4060 RExC_emit - RExC_emit_start > RExC_offsets[0] ?
4061 "Overwriting end of array!\n" : "OK",
4062 RExC_emit - RExC_emit_start,
4063 RExC_parse - RExC_start,
4064 RExC_offsets[0]));
4065 Set_Cur_Node_Offset;
4066 }
4067
830247a4 4068 RExC_emit = ptr;
fe14fcc3 4069
a0d0e21e 4070 return(ret);
fe14fcc3
LW
4071}
4072
4073/*
cd439c50 4074- reguni - emit (if appropriate) a Unicode character
a0ed51b3
LW
4075*/
4076STATIC void
830247a4 4077S_reguni(pTHX_ RExC_state_t *pRExC_state, UV uv, char* s, STRLEN* lenp)
a0ed51b3 4078{
5e12f4fb 4079 *lenp = SIZE_ONLY ? UNISKIP(uv) : (uvchr_to_utf8((U8*)s, uv) - (U8*)s);
a0ed51b3
LW
4080}
4081
4082/*
a0d0e21e
LW
4083- reginsert - insert an operator in front of already-emitted operand
4084*
4085* Means relocating the operand.
4086*/
76e3520e 4087STATIC void
830247a4 4088S_reginsert(pTHX_ RExC_state_t *pRExC_state, U8 op, regnode *opnd)
a687059c 4089{
c277df42
IZ
4090 register regnode *src;
4091 register regnode *dst;
4092 register regnode *place;
4093 register int offset = regarglen[(U8)op];
b81d288d 4094
22c35a8c 4095/* (PL_regkind[(U8)op] == CURLY ? EXTRA_STEP_2ARGS : 0); */
c277df42
IZ
4096
4097 if (SIZE_ONLY) {
830247a4 4098 RExC_size += NODE_STEP_REGNODE + offset;
a0d0e21e
LW
4099 return;
4100 }
a687059c 4101
830247a4
IZ
4102 src = RExC_emit;
4103 RExC_emit += NODE_STEP_REGNODE + offset;
4104 dst = RExC_emit;
fac92740 4105 while (src > opnd) {
c277df42 4106 StructCopy(--src, --dst, regnode);
fac92740
MJD
4107 if (RExC_offsets) { /* MJD 20010112 */
4108 MJD_OFFSET_DEBUG((stderr, "%s: %s copy %u -> %u (max %u).\n",
4109 "reg_insert",
4110 dst - RExC_emit_start > RExC_offsets[0]
4111 ? "Overwriting end of array!\n" : "OK",
4112 src - RExC_emit_start,
4113 dst - RExC_emit_start,
4114 RExC_offsets[0]));
4115 Set_Node_Offset_To_R(dst-RExC_emit_start, Node_Offset(src));
4116 Set_Node_Length_To_R(dst-RExC_emit_start, Node_Length(src));
4117 }
4118 }
4119
a0d0e21e
LW
4120
4121 place = opnd; /* Op node, where operand used to be. */
fac92740
MJD
4122 if (RExC_offsets) { /* MJD */
4123 MJD_OFFSET_DEBUG((stderr, "%s: %s %u <- %u (max %u).\n",
4124 "reginsert",
4125 place - RExC_emit_start > RExC_offsets[0]
4126 ? "Overwriting end of array!\n" : "OK",
4127 place - RExC_emit_start,
4128 RExC_parse - RExC_start,
4129 RExC_offsets[0]));
4130 Set_Node_Offset(place, RExC_parse);
4131 }
c277df42
IZ
4132 src = NEXTOPER(place);
4133 FILL_ADVANCE_NODE(place, op);
4134 Zero(src, offset, regnode);
a687059c
LW
4135}
4136
4137/*
c277df42 4138- regtail - set the next-pointer at the end of a node chain of p to val.
a0d0e21e 4139*/
76e3520e 4140STATIC void
830247a4 4141S_regtail(pTHX_ RExC_state_t *pRExC_state, regnode *p, regnode *val)
a687059c 4142{
c277df42
IZ
4143 register regnode *scan;
4144 register regnode *temp;
a0d0e21e 4145
c277df42 4146 if (SIZE_ONLY)
a0d0e21e
LW
4147 return;
4148
4149 /* Find last node. */
4150 scan = p;
4151 for (;;) {
4152 temp = regnext(scan);
4153 if (temp == NULL)
4154 break;
4155 scan = temp;
4156 }
a687059c 4157
c277df42
IZ
4158 if (reg_off_by_arg[OP(scan)]) {
4159 ARG_SET(scan, val - scan);
a0ed51b3
LW
4160 }
4161 else {
c277df42
IZ
4162 NEXT_OFF(scan) = val - scan;
4163 }
a687059c
LW
4164}
4165
4166/*
a0d0e21e
LW
4167- regoptail - regtail on operand of first argument; nop if operandless
4168*/
76e3520e 4169STATIC void
830247a4 4170S_regoptail(pTHX_ RExC_state_t *pRExC_state, regnode *p, regnode *val)
a687059c 4171{
a0d0e21e 4172 /* "Operandless" and "op != BRANCH" are synonymous in practice. */
c277df42
IZ
4173 if (p == NULL || SIZE_ONLY)
4174 return;
22c35a8c 4175 if (PL_regkind[(U8)OP(p)] == BRANCH) {
830247a4 4176 regtail(pRExC_state, NEXTOPER(p), val);
a0ed51b3 4177 }
22c35a8c 4178 else if ( PL_regkind[(U8)OP(p)] == BRANCHJ) {
830247a4 4179 regtail(pRExC_state, NEXTOPER(NEXTOPER(p)), val);
a0ed51b3
LW
4180 }
4181 else
a0d0e21e 4182 return;
a687059c
LW
4183}
4184
4185/*
4186 - regcurly - a little FSA that accepts {\d+,?\d*}
4187 */
79072805 4188STATIC I32
cea2e8a9 4189S_regcurly(pTHX_ register char *s)
a687059c
LW
4190{
4191 if (*s++ != '{')
4192 return FALSE;
f0fcb552 4193 if (!isDIGIT(*s))
a687059c 4194 return FALSE;
f0fcb552 4195 while (isDIGIT(*s))
a687059c
LW
4196 s++;
4197 if (*s == ',')
4198 s++;
f0fcb552 4199 while (isDIGIT(*s))
a687059c
LW
4200 s++;
4201 if (*s != '}')
4202 return FALSE;
4203 return TRUE;
4204}
4205
a687059c 4206
8fa7f367
JH
4207#ifdef DEBUGGING
4208
76e3520e 4209STATIC regnode *
cea2e8a9 4210S_dumpuntil(pTHX_ regnode *start, regnode *node, regnode *last, SV* sv, I32 l)
c277df42 4211{
f248d071 4212 register U8 op = EXACT; /* Arbitrary non-END op. */
155aba94 4213 register regnode *next;
c277df42
IZ
4214
4215 while (op != END && (!last || node < last)) {
4216 /* While that wasn't END last time... */
4217
4218 NODE_ALIGN(node);
4219 op = OP(node);
4220 if (op == CLOSE)
4221 l--;
4222 next = regnext(node);
4223 /* Where, what. */
4224 if (OP(node) == OPTIMIZED)
4225 goto after_print;
4226 regprop(sv, node);
b900a521 4227 PerlIO_printf(Perl_debug_log, "%4"IVdf":%*s%s", (IV)(node - start),
f1dbda3d 4228 (int)(2*l + 1), "", SvPVX(sv));
c277df42
IZ
4229 if (next == NULL) /* Next ptr. */
4230 PerlIO_printf(Perl_debug_log, "(0)");
b81d288d 4231 else
b900a521 4232 PerlIO_printf(Perl_debug_log, "(%"IVdf")", (IV)(next - start));
c277df42
IZ
4233 (void)PerlIO_putc(Perl_debug_log, '\n');
4234 after_print:
22c35a8c 4235 if (PL_regkind[(U8)op] == BRANCHJ) {
b81d288d
AB
4236 register regnode *nnode = (OP(next) == LONGJMP
4237 ? regnext(next)
c277df42
IZ
4238 : next);
4239 if (last && nnode > last)
4240 nnode = last;
4241 node = dumpuntil(start, NEXTOPER(NEXTOPER(node)), nnode, sv, l + 1);
a0ed51b3 4242 }
22c35a8c 4243 else if (PL_regkind[(U8)op] == BRANCH) {
c277df42 4244 node = dumpuntil(start, NEXTOPER(node), next, sv, l + 1);
a0ed51b3
LW
4245 }
4246 else if ( op == CURLY) { /* `next' might be very big: optimizer */
c277df42
IZ
4247 node = dumpuntil(start, NEXTOPER(node) + EXTRA_STEP_2ARGS,
4248 NEXTOPER(node) + EXTRA_STEP_2ARGS + 1, sv, l + 1);
a0ed51b3 4249 }
22c35a8c 4250 else if (PL_regkind[(U8)op] == CURLY && op != CURLYX) {
c277df42
IZ
4251 node = dumpuntil(start, NEXTOPER(node) + EXTRA_STEP_2ARGS,
4252 next, sv, l + 1);
a0ed51b3
LW
4253 }
4254 else if ( op == PLUS || op == STAR) {
c277df42 4255 node = dumpuntil(start, NEXTOPER(node), NEXTOPER(node) + 1, sv, l + 1);
a0ed51b3
LW
4256 }
4257 else if (op == ANYOF) {
4f66b38d
HS
4258 /* arglen 1 + class block */
4259 node += 1 + ((ANYOF_FLAGS(node) & ANYOF_LARGE)
4260 ? ANYOF_CLASS_SKIP : ANYOF_SKIP);
4261 node = NEXTOPER(node);
a0ed51b3 4262 }
22c35a8c 4263 else if (PL_regkind[(U8)op] == EXACT) {
c277df42 4264 /* Literal string, where present. */
cd439c50 4265 node += NODE_SZ_STR(node) - 1;
c277df42 4266 node = NEXTOPER(node);
a0ed51b3
LW
4267 }
4268 else {
c277df42
IZ
4269 node = NEXTOPER(node);
4270 node += regarglen[(U8)op];
4271 }
4272 if (op == CURLYX || op == OPEN)
4273 l++;
4274 else if (op == WHILEM)
4275 l--;
4276 }
4277 return node;
4278}
4279
8fa7f367
JH
4280#endif /* DEBUGGING */
4281
a687059c 4282/*
fd181c75 4283 - regdump - dump a regexp onto Perl_debug_log in vaguely comprehensible form
a687059c
LW
4284 */
4285void
864dbfa3 4286Perl_regdump(pTHX_ regexp *r)
a687059c 4287{
35ff7856 4288#ifdef DEBUGGING
46fc3d4c 4289 SV *sv = sv_newmortal();
a687059c 4290
c277df42 4291 (void)dumpuntil(r->program, r->program + 1, NULL, sv, 0);
a0d0e21e
LW
4292
4293 /* Header fields of interest. */
c277df42 4294 if (r->anchored_substr)
7b0972df 4295 PerlIO_printf(Perl_debug_log,
b81d288d 4296 "anchored `%s%.*s%s'%s at %"IVdf" ",
3280af22 4297 PL_colors[0],
7b0972df 4298 (int)(SvCUR(r->anchored_substr) - (SvTAIL(r->anchored_substr)!=0)),
b81d288d 4299 SvPVX(r->anchored_substr),
3280af22 4300 PL_colors[1],
c277df42 4301 SvTAIL(r->anchored_substr) ? "$" : "",
7b0972df 4302 (IV)r->anchored_offset);
c277df42 4303 if (r->float_substr)
7b0972df 4304 PerlIO_printf(Perl_debug_log,
b81d288d 4305 "floating `%s%.*s%s'%s at %"IVdf"..%"UVuf" ",
3280af22 4306 PL_colors[0],
b81d288d 4307 (int)(SvCUR(r->float_substr) - (SvTAIL(r->float_substr)!=0)),
2c2d71f5 4308 SvPVX(r->float_substr),
3280af22 4309 PL_colors[1],
c277df42 4310 SvTAIL(r->float_substr) ? "$" : "",
7b0972df 4311 (IV)r->float_min_offset, (UV)r->float_max_offset);
c277df42 4312 if (r->check_substr)
b81d288d
AB
4313 PerlIO_printf(Perl_debug_log,
4314 r->check_substr == r->float_substr
c277df42
IZ
4315 ? "(checking floating" : "(checking anchored");
4316 if (r->reganch & ROPT_NOSCAN)
4317 PerlIO_printf(Perl_debug_log, " noscan");
4318 if (r->reganch & ROPT_CHECK_ALL)
4319 PerlIO_printf(Perl_debug_log, " isall");
4320 if (r->check_substr)
4321 PerlIO_printf(Perl_debug_log, ") ");
4322
46fc3d4c 4323 if (r->regstclass) {
4324 regprop(sv, r->regstclass);
4325 PerlIO_printf(Perl_debug_log, "stclass `%s' ", SvPVX(sv));
4326 }
774d564b 4327 if (r->reganch & ROPT_ANCH) {
4328 PerlIO_printf(Perl_debug_log, "anchored");
4329 if (r->reganch & ROPT_ANCH_BOL)
4330 PerlIO_printf(Perl_debug_log, "(BOL)");
c277df42
IZ
4331 if (r->reganch & ROPT_ANCH_MBOL)
4332 PerlIO_printf(Perl_debug_log, "(MBOL)");
cad2e5aa
JH
4333 if (r->reganch & ROPT_ANCH_SBOL)
4334 PerlIO_printf(Perl_debug_log, "(SBOL)");
774d564b 4335 if (r->reganch & ROPT_ANCH_GPOS)
4336 PerlIO_printf(Perl_debug_log, "(GPOS)");
4337 PerlIO_putc(Perl_debug_log, ' ');
4338 }
c277df42
IZ
4339 if (r->reganch & ROPT_GPOS_SEEN)
4340 PerlIO_printf(Perl_debug_log, "GPOS ");
a0d0e21e 4341 if (r->reganch & ROPT_SKIP)
760ac839 4342 PerlIO_printf(Perl_debug_log, "plus ");
a0d0e21e 4343 if (r->reganch & ROPT_IMPLICIT)
760ac839 4344 PerlIO_printf(Perl_debug_log, "implicit ");
760ac839 4345 PerlIO_printf(Perl_debug_log, "minlen %ld ", (long) r->minlen);
ce862d02
IZ
4346 if (r->reganch & ROPT_EVAL_SEEN)
4347 PerlIO_printf(Perl_debug_log, "with eval ");
760ac839 4348 PerlIO_printf(Perl_debug_log, "\n");
fac92740
MJD
4349 if (r->offsets) {
4350 U32 i;
4351 U32 len = r->offsets[0];
392fbf5d 4352 PerlIO_printf(Perl_debug_log, "Offsets: [%"UVuf"]\n\t", (UV)r->offsets[0]);
fac92740 4353 for (i = 1; i <= len; i++)
392fbf5d
RB
4354 PerlIO_printf(Perl_debug_log, "%"UVuf"[%"UVuf"] ",
4355 (UV)r->offsets[i*2-1],
4356 (UV)r->offsets[i*2]);
fac92740
MJD
4357 PerlIO_printf(Perl_debug_log, "\n");
4358 }
17c3b450 4359#endif /* DEBUGGING */
a687059c
LW
4360}
4361
8fa7f367
JH
4362#ifdef DEBUGGING
4363
653099ff
GS
4364STATIC void
4365S_put_byte(pTHX_ SV *sv, int c)
4366{
7be5a6cf 4367 if (isCNTRL(c) || c == 255 || !isPRINT(c))
653099ff
GS
4368 Perl_sv_catpvf(aTHX_ sv, "\\%o", c);
4369 else if (c == '-' || c == ']' || c == '\\' || c == '^')
4370 Perl_sv_catpvf(aTHX_ sv, "\\%c", c);
4371 else
4372 Perl_sv_catpvf(aTHX_ sv, "%c", c);
4373}
4374
8fa7f367
JH
4375#endif /* DEBUGGING */
4376
a687059c 4377/*
a0d0e21e
LW
4378- regprop - printable representation of opcode
4379*/
46fc3d4c 4380void
864dbfa3 4381Perl_regprop(pTHX_ SV *sv, regnode *o)
a687059c 4382{
35ff7856 4383#ifdef DEBUGGING
9b155405 4384 register int k;
a0d0e21e 4385
54dc92de 4386 sv_setpvn(sv, "", 0);
9b155405 4387 if (OP(o) >= reg_num) /* regnode.type is unsigned */
830247a4
IZ
4388 /* It would be nice to FAIL() here, but this may be called from
4389 regexec.c, and it would be hard to supply pRExC_state. */
4390 Perl_croak(aTHX_ "Corrupted regexp opcode");
9b155405
IZ
4391 sv_catpv(sv, (char*)reg_name[OP(o)]); /* Take off const! */
4392
4393 k = PL_regkind[(U8)OP(o)];
4394
4395 if (k == EXACT)
7821416a 4396 Perl_sv_catpvf(aTHX_ sv, " <%s%.*s%s>", PL_colors[0],
cd439c50 4397 STR_LEN(o), STRING(o), PL_colors[1]);
9b155405 4398 else if (k == CURLY) {
cb434fcc 4399 if (OP(o) == CURLYM || OP(o) == CURLYN || OP(o) == CURLYX)
cea2e8a9
GS
4400 Perl_sv_catpvf(aTHX_ sv, "[%d]", o->flags); /* Parenth number */
4401 Perl_sv_catpvf(aTHX_ sv, " {%d,%d}", ARG1(o), ARG2(o));
a0d0e21e 4402 }
2c2d71f5
JH
4403 else if (k == WHILEM && o->flags) /* Ordinal/of */
4404 Perl_sv_catpvf(aTHX_ sv, "[%d/%d]", o->flags & 0xf, o->flags>>4);
9b155405 4405 else if (k == REF || k == OPEN || k == CLOSE || k == GROUPP )
894356b3 4406 Perl_sv_catpvf(aTHX_ sv, "%d", (int)ARG(o)); /* Parenth number */
9b155405 4407 else if (k == LOGICAL)
04ebc1ab 4408 Perl_sv_catpvf(aTHX_ sv, "[%d]", o->flags); /* 2: embedded, otherwise 1 */
653099ff
GS
4409 else if (k == ANYOF) {
4410 int i, rangestart = -1;
ffc61ed2 4411 U8 flags = ANYOF_FLAGS(o);
19860706
JH
4412 const char * const anyofs[] = { /* Should be syncronized with
4413 * ANYOF_ #xdefines in regcomp.h */
653099ff
GS
4414 "\\w",
4415 "\\W",
4416 "\\s",
4417 "\\S",
4418 "\\d",
4419 "\\D",
4420 "[:alnum:]",
4421 "[:^alnum:]",
4422 "[:alpha:]",
4423 "[:^alpha:]",
4424 "[:ascii:]",
4425 "[:^ascii:]",
4426 "[:ctrl:]",
4427 "[:^ctrl:]",
4428 "[:graph:]",
4429 "[:^graph:]",
4430 "[:lower:]",
4431 "[:^lower:]",
4432 "[:print:]",
4433 "[:^print:]",
4434 "[:punct:]",
4435 "[:^punct:]",
4436 "[:upper:]",
aaa51d5e 4437 "[:^upper:]",
653099ff 4438 "[:xdigit:]",
aaa51d5e
JF
4439 "[:^xdigit:]",
4440 "[:space:]",
4441 "[:^space:]",
4442 "[:blank:]",
4443 "[:^blank:]"
653099ff
GS
4444 };
4445
19860706 4446 if (flags & ANYOF_LOCALE)
653099ff 4447 sv_catpv(sv, "{loc}");
19860706 4448 if (flags & ANYOF_FOLD)
653099ff
GS
4449 sv_catpv(sv, "{i}");
4450 Perl_sv_catpvf(aTHX_ sv, "[%s", PL_colors[0]);
19860706 4451 if (flags & ANYOF_INVERT)
653099ff 4452 sv_catpv(sv, "^");
ffc61ed2
JH
4453 for (i = 0; i <= 256; i++) {
4454 if (i < 256 && ANYOF_BITMAP_TEST(o,i)) {
4455 if (rangestart == -1)
4456 rangestart = i;
4457 } else if (rangestart != -1) {
4458 if (i <= rangestart + 3)
4459 for (; rangestart < i; rangestart++)
653099ff 4460 put_byte(sv, rangestart);
ffc61ed2
JH
4461 else {
4462 put_byte(sv, rangestart);
4463 sv_catpv(sv, "-");
4464 put_byte(sv, i - 1);
653099ff 4465 }
ffc61ed2 4466 rangestart = -1;
653099ff 4467 }
847a199f 4468 }
ffc61ed2
JH
4469
4470 if (o->flags & ANYOF_CLASS)
4471 for (i = 0; i < sizeof(anyofs)/sizeof(char*); i++)
4472 if (ANYOF_CLASS_TEST(o,i))
4473 sv_catpv(sv, anyofs[i]);
4474
4475 if (flags & ANYOF_UNICODE)
4476 sv_catpv(sv, "{unicode}");
1aa99e6b
IH
4477 else if (flags & ANYOF_UNICODE_ALL)
4478 sv_catpv(sv, "{all-unicode}");
ffc61ed2
JH
4479
4480 {
4481 SV *lv;
4482 SV *sw = regclass_swash(o, FALSE, &lv);
b81d288d 4483
ffc61ed2
JH
4484 if (lv) {
4485 if (sw) {
4486 UV i;
4487 U8 s[UTF8_MAXLEN+1];
b81d288d 4488
ffc61ed2 4489 for (i = 0; i <= 256; i++) { /* just the first 256 */
2b9d42f0 4490 U8 *e = uvchr_to_utf8(s, i);
ffc61ed2 4491
3568d838 4492 if (i < 256 && swash_fetch(sw, s, TRUE)) {
ffc61ed2
JH
4493 if (rangestart == -1)
4494 rangestart = i;
4495 } else if (rangestart != -1) {
4496 U8 *p;
b81d288d 4497
ffc61ed2
JH
4498 if (i <= rangestart + 3)
4499 for (; rangestart < i; rangestart++) {
2b9d42f0 4500 for(e = uvchr_to_utf8(s, rangestart), p = s; p < e; p++)
ffc61ed2
JH
4501 put_byte(sv, *p);
4502 }
4503 else {
2b9d42f0 4504 for (e = uvchr_to_utf8(s, rangestart), p = s; p < e; p++)
ffc61ed2
JH
4505 put_byte(sv, *p);
4506 sv_catpv(sv, "-");
2b9d42f0 4507 for (e = uvchr_to_utf8(s, i - 1), p = s; p < e; p++)
ffc61ed2
JH
4508 put_byte(sv, *p);
4509 }
4510 rangestart = -1;
4511 }
19860706 4512 }
ffc61ed2
JH
4513
4514 sv_catpv(sv, "..."); /* et cetera */
19860706 4515 }
fde631ed 4516
ffc61ed2
JH
4517 {
4518 char *s = savepv(SvPVX(lv));
4519 char *origs = s;
b81d288d 4520
ffc61ed2 4521 while(*s && *s != '\n') s++;
b81d288d 4522
ffc61ed2
JH
4523 if (*s == '\n') {
4524 char *t = ++s;
4525
4526 while (*s) {
4527 if (*s == '\n')
4528 *s = ' ';
4529 s++;
4530 }
4531 if (s[-1] == ' ')
4532 s[-1] = 0;
4533
4534 sv_catpv(sv, t);
fde631ed 4535 }
b81d288d 4536
ffc61ed2 4537 Safefree(origs);
fde631ed
JH
4538 }
4539 }
653099ff 4540 }
ffc61ed2 4541
653099ff
GS
4542 Perl_sv_catpvf(aTHX_ sv, "%s]", PL_colors[1]);
4543 }
9b155405 4544 else if (k == BRANCHJ && (OP(o) == UNLESSM || OP(o) == IFMATCH))
cea2e8a9 4545 Perl_sv_catpvf(aTHX_ sv, "[-%d]", o->flags);
17c3b450 4546#endif /* DEBUGGING */
35ff7856 4547}
a687059c 4548
cad2e5aa
JH
4549SV *
4550Perl_re_intuit_string(pTHX_ regexp *prog)
4551{ /* Assume that RE_INTUIT is set */
4552 DEBUG_r(
4553 { STRLEN n_a;
4554 char *s = SvPV(prog->check_substr,n_a);
4555
4556 if (!PL_colorset) reginitcolors();
4557 PerlIO_printf(Perl_debug_log,
4558 "%sUsing REx substr:%s `%s%.60s%s%s'\n",
4559 PL_colors[4],PL_colors[5],PL_colors[0],
4560 s,
4561 PL_colors[1],
4562 (strlen(s) > 60 ? "..." : ""));
4563 } );
4564
4565 return prog->check_substr;
4566}
4567
2b69d0c2 4568void
864dbfa3 4569Perl_pregfree(pTHX_ struct regexp *r)
a687059c 4570{
adac82c7 4571 DEBUG_r(if (!PL_colorset) reginitcolors());
7821416a
IZ
4572
4573 if (!r || (--r->refcnt > 0))
4574 return;
cad2e5aa
JH
4575 DEBUG_r(PerlIO_printf(Perl_debug_log,
4576 "%sFreeing REx:%s `%s%.60s%s%s'\n",
4577 PL_colors[4],PL_colors[5],PL_colors[0],
4578 r->precomp,
4579 PL_colors[1],
4580 (strlen(r->precomp) > 60 ? "..." : "")));
4581
c277df42 4582 if (r->precomp)
a0d0e21e 4583 Safefree(r->precomp);
fac92740
MJD
4584 if (r->offsets) /* 20010421 MJD */
4585 Safefree(r->offsets);
cf93c79d
IZ
4586 if (RX_MATCH_COPIED(r))
4587 Safefree(r->subbeg);
a193d654
GS
4588 if (r->substrs) {
4589 if (r->anchored_substr)
4590 SvREFCNT_dec(r->anchored_substr);
4591 if (r->float_substr)
4592 SvREFCNT_dec(r->float_substr);
2779dcf1 4593 Safefree(r->substrs);
a193d654 4594 }
c277df42
IZ
4595 if (r->data) {
4596 int n = r->data->count;
dfad63ad
HS
4597 AV* new_comppad = NULL;
4598 AV* old_comppad;
4599 SV** old_curpad;
4600
c277df42 4601 while (--n >= 0) {
261faec3 4602 /* If you add a ->what type here, update the comment in regcomp.h */
c277df42
IZ
4603 switch (r->data->what[n]) {
4604 case 's':
4605 SvREFCNT_dec((SV*)r->data->data[n]);
4606 break;
653099ff
GS
4607 case 'f':
4608 Safefree(r->data->data[n]);
4609 break;
dfad63ad
HS
4610 case 'p':
4611 new_comppad = (AV*)r->data->data[n];
4612 break;
c277df42 4613 case 'o':
dfad63ad 4614 if (new_comppad == NULL)
cea2e8a9 4615 Perl_croak(aTHX_ "panic: pregfree comppad");
dfad63ad
HS
4616 old_comppad = PL_comppad;
4617 old_curpad = PL_curpad;
1e6dc0b6
SB
4618 /* Watch out for global destruction's random ordering. */
4619 if (SvTYPE(new_comppad) == SVt_PVAV) {
4620 PL_comppad = new_comppad;
4621 PL_curpad = AvARRAY(new_comppad);
4622 }
4623 else
4624 PL_curpad = NULL;
9b978d73
DM
4625
4626 if (!OpREFCNT_dec((OP_4tree*)r->data->data[n])) {
4627 op_free((OP_4tree*)r->data->data[n]);
4628 }
4629
dfad63ad
HS
4630 PL_comppad = old_comppad;
4631 PL_curpad = old_curpad;
4632 SvREFCNT_dec((SV*)new_comppad);
4633 new_comppad = NULL;
c277df42
IZ
4634 break;
4635 case 'n':
4636 break;
4637 default:
830247a4 4638 Perl_croak(aTHX_ "panic: regfree data code '%c'", r->data->what[n]);
c277df42
IZ
4639 }
4640 }
4641 Safefree(r->data->what);
4642 Safefree(r->data);
a0d0e21e
LW
4643 }
4644 Safefree(r->startp);
4645 Safefree(r->endp);
4646 Safefree(r);
a687059c 4647}
c277df42
IZ
4648
4649/*
4650 - regnext - dig the "next" pointer out of a node
4651 *
4652 * [Note, when REGALIGN is defined there are two places in regmatch()
4653 * that bypass this code for speed.]
4654 */
4655regnode *
864dbfa3 4656Perl_regnext(pTHX_ register regnode *p)
c277df42
IZ
4657{
4658 register I32 offset;
4659
3280af22 4660 if (p == &PL_regdummy)
c277df42
IZ
4661 return(NULL);
4662
4663 offset = (reg_off_by_arg[OP(p)] ? ARG(p) : NEXT_OFF(p));
4664 if (offset == 0)
4665 return(NULL);
4666
c277df42 4667 return(p+offset);
c277df42
IZ
4668}
4669
01f988be 4670STATIC void
cea2e8a9 4671S_re_croak2(pTHX_ const char* pat1,const char* pat2,...)
c277df42
IZ
4672{
4673 va_list args;
4674 STRLEN l1 = strlen(pat1);
4675 STRLEN l2 = strlen(pat2);
4676 char buf[512];
06bf62c7 4677 SV *msv;
c277df42
IZ
4678 char *message;
4679
4680 if (l1 > 510)
4681 l1 = 510;
4682 if (l1 + l2 > 510)
4683 l2 = 510 - l1;
4684 Copy(pat1, buf, l1 , char);
4685 Copy(pat2, buf + l1, l2 , char);
3b818b81
GS
4686 buf[l1 + l2] = '\n';
4687 buf[l1 + l2 + 1] = '\0';
8736538c
AS
4688#ifdef I_STDARG
4689 /* ANSI variant takes additional second argument */
c277df42 4690 va_start(args, pat2);
8736538c
AS
4691#else
4692 va_start(args);
4693#endif
5a844595 4694 msv = vmess(buf, &args);
c277df42 4695 va_end(args);
06bf62c7 4696 message = SvPV(msv,l1);
c277df42
IZ
4697 if (l1 > 512)
4698 l1 = 512;
4699 Copy(message, buf, l1 , char);
4700 buf[l1] = '\0'; /* Overwrite \n */
cea2e8a9 4701 Perl_croak(aTHX_ "%s", buf);
c277df42 4702}
a0ed51b3
LW
4703
4704/* XXX Here's a total kludge. But we need to re-enter for swash routines. */
4705
4706void
864dbfa3 4707Perl_save_re_context(pTHX)
b81d288d 4708{
830247a4
IZ
4709#if 0
4710 SAVEPPTR(RExC_precomp); /* uncompiled string. */
4711 SAVEI32(RExC_npar); /* () count. */
4712 SAVEI32(RExC_size); /* Code size. */
4713 SAVEI16(RExC_flags16); /* are we folding, multilining? */
4714 SAVEVPTR(RExC_rx); /* from regcomp.c */
4715 SAVEI32(RExC_seen); /* from regcomp.c */
4716 SAVEI32(RExC_sawback); /* Did we see \1, ...? */
4717 SAVEI32(RExC_naughty); /* How bad is this pattern? */
4718 SAVEVPTR(RExC_emit); /* Code-emit pointer; &regdummy = don't */
4719 SAVEPPTR(RExC_end); /* End of input for compile */
4720 SAVEPPTR(RExC_parse); /* Input-scan pointer. */
4721#endif
4722
4723 SAVEI32(PL_reg_flags); /* from regexec.c */
a0ed51b3 4724 SAVEPPTR(PL_bostr);
a0ed51b3
LW
4725 SAVEPPTR(PL_reginput); /* String-input pointer. */
4726 SAVEPPTR(PL_regbol); /* Beginning of input, for ^ check. */
4727 SAVEPPTR(PL_regeol); /* End of input, for $ check. */
7766f137
GS
4728 SAVEVPTR(PL_regstartp); /* Pointer to startp array. */
4729 SAVEVPTR(PL_regendp); /* Ditto for endp. */
4730 SAVEVPTR(PL_reglastparen); /* Similarly for lastparen. */
a0ed51b3 4731 SAVEPPTR(PL_regtill); /* How far we are required to go. */
b81d288d 4732 SAVEGENERICPV(PL_reg_start_tmp); /* from regexec.c */
a0ed51b3 4733 PL_reg_start_tmp = 0;
a0ed51b3
LW
4734 SAVEI32(PL_reg_start_tmpl); /* from regexec.c */
4735 PL_reg_start_tmpl = 0;
7766f137 4736 SAVEVPTR(PL_regdata);
a0ed51b3
LW
4737 SAVEI32(PL_reg_eval_set); /* from regexec.c */
4738 SAVEI32(PL_regnarrate); /* from regexec.c */
7766f137 4739 SAVEVPTR(PL_regprogram); /* from regexec.c */
a0ed51b3 4740 SAVEINT(PL_regindent); /* from regexec.c */
7766f137
GS
4741 SAVEVPTR(PL_regcc); /* from regexec.c */
4742 SAVEVPTR(PL_curcop);
7766f137
GS
4743 SAVEVPTR(PL_reg_call_cc); /* from regexec.c */
4744 SAVEVPTR(PL_reg_re); /* from regexec.c */
54b6e2fa
IZ
4745 SAVEPPTR(PL_reg_ganch); /* from regexec.c */
4746 SAVESPTR(PL_reg_sv); /* from regexec.c */
53c4c00c 4747 SAVEI8(PL_reg_match_utf8); /* from regexec.c */
7766f137 4748 SAVEVPTR(PL_reg_magic); /* from regexec.c */
54b6e2fa 4749 SAVEI32(PL_reg_oldpos); /* from regexec.c */
7766f137
GS
4750 SAVEVPTR(PL_reg_oldcurpm); /* from regexec.c */
4751 SAVEVPTR(PL_reg_curpm); /* from regexec.c */
5fb7366e 4752 SAVEI32(PL_regnpar); /* () count. */
e49a9654 4753 SAVEI32(PL_regsize); /* from regexec.c */
54b6e2fa 4754#ifdef DEBUGGING
b81d288d 4755 SAVEPPTR(PL_reg_starttry); /* from regexec.c */
54b6e2fa 4756#endif
a0ed51b3 4757}
51371543 4758
51371543 4759static void
acfe0abc 4760clear_re(pTHX_ void *r)
51371543
GS
4761{
4762 ReREFCNT_dec((regexp *)r);
4763}
ffbc6a93 4764