This is a live mirror of the Perl 5 development currently hosted at https://github.com/perl/perl5
Add support for finding shared arrays and hashes.
[perl5.git] / regcomp.c
CommitLineData
a0d0e21e
LW
1/* regcomp.c
2 */
3
4/*
5 * "A fair jaw-cracker dwarf-language must be." --Samwise Gamgee
6 */
7
a687059c
LW
8/* NOTE: this is derived from Henry Spencer's regexp code, and should not
9 * confused with the original package (see point 3 below). Thanks, Henry!
10 */
11
12/* Additional note: this code is very heavily munged from Henry's version
13 * in places. In some spots I've traded clarity for efficiency, so don't
14 * blame Henry for some of the lack of readability.
15 */
16
e50aee73
AD
17/* The names of the functions have been changed from regcomp and
18 * regexec to pregcomp and pregexec in order to avoid conflicts
19 * with the POSIX routines of the same names.
20*/
21
b9d5759e
AD
22#ifdef PERL_EXT_RE_BUILD
23/* need to replace pregcomp et al, so enable that */
24# ifndef PERL_IN_XSUB_RE
25# define PERL_IN_XSUB_RE
26# endif
27/* need access to debugger hooks */
cad2e5aa 28# if defined(PERL_EXT_RE_DEBUG) && !defined(DEBUGGING)
b9d5759e
AD
29# define DEBUGGING
30# endif
31#endif
32
33#ifdef PERL_IN_XSUB_RE
d06ea78c 34/* We *really* need to overwrite these symbols: */
56953603
IZ
35# define Perl_pregcomp my_regcomp
36# define Perl_regdump my_regdump
37# define Perl_regprop my_regprop
d06ea78c 38# define Perl_pregfree my_regfree
cad2e5aa
JH
39# define Perl_re_intuit_string my_re_intuit_string
40/* *These* symbols are masked to allow static link. */
d06ea78c 41# define Perl_regnext my_regnext
f0b8d043 42# define Perl_save_re_context my_save_re_context
b81d288d 43# define Perl_reginitcolors my_reginitcolors
c5be433b
GS
44
45# define PERL_NO_GET_CONTEXT
b81d288d 46#endif
56953603 47
f0fcb552 48/*SUPPRESS 112*/
a687059c 49/*
e50aee73 50 * pregcomp and pregexec -- regsub and regerror are not used in perl
a687059c
LW
51 *
52 * Copyright (c) 1986 by University of Toronto.
53 * Written by Henry Spencer. Not derived from licensed software.
54 *
55 * Permission is granted to anyone to use this software for any
56 * purpose on any computer system, and to redistribute it freely,
57 * subject to the following restrictions:
58 *
59 * 1. The author is not responsible for the consequences of use of
60 * this software, no matter how awful, even if they arise
61 * from defects in it.
62 *
63 * 2. The origin of this software must not be misrepresented, either
64 * by explicit claim or by omission.
65 *
66 * 3. Altered versions must be plainly marked as such, and must not
67 * be misrepresented as being the original software.
68 *
69 *
70 **** Alterations to Henry's code are...
71 ****
bc89e66f 72 **** Copyright (c) 1991-2001, Larry Wall
a687059c 73 ****
9ef589d8
LW
74 **** You may distribute under the terms of either the GNU General Public
75 **** License or the Artistic License, as specified in the README file.
76
a687059c
LW
77 *
78 * Beware that some of this code is subtly aware of the way operator
79 * precedence is structured in regular expressions. Serious changes in
80 * regular-expression syntax might require a total rethink.
81 */
82#include "EXTERN.h"
864dbfa3 83#define PERL_IN_REGCOMP_C
a687059c 84#include "perl.h"
d06ea78c 85
acfe0abc 86#ifndef PERL_IN_XSUB_RE
d06ea78c
GS
87# include "INTERN.h"
88#endif
c277df42
IZ
89
90#define REG_COMP_C
a687059c
LW
91#include "regcomp.h"
92
d4cce5f1 93#ifdef op
11343788 94#undef op
d4cce5f1 95#endif /* op */
11343788 96
fe14fcc3
LW
97#ifdef MSDOS
98# if defined(BUGGY_MSC6)
99 /* MSC 6.00A breaks on op/regexp.t test 85 unless we turn this off */
100 # pragma optimize("a",off)
101 /* But MSC 6.00A is happy with 'w', for aliases only across function calls*/
102 # pragma optimize("w",on )
103# endif /* BUGGY_MSC6 */
104#endif /* MSDOS */
105
a687059c
LW
106#ifndef STATIC
107#define STATIC static
108#endif
109
830247a4
IZ
110typedef struct RExC_state_t {
111 U16 flags16; /* are we folding, multilining? */
112 char *precomp; /* uncompiled string. */
113 regexp *rx;
fac92740 114 char *start; /* Start of input for compile */
830247a4
IZ
115 char *end; /* End of input for compile */
116 char *parse; /* Input-scan pointer. */
117 I32 whilem_seen; /* number of WHILEM in this expr */
fac92740 118 regnode *emit_start; /* Start of emitted-code area */
ffc61ed2 119 regnode *emit; /* Code-emit pointer; &regdummy = don't = compiling */
830247a4
IZ
120 I32 naughty; /* How bad is this pattern? */
121 I32 sawback; /* Did we see \1, ...? */
122 U32 seen;
123 I32 size; /* Code size. */
124 I32 npar; /* () count. */
125 I32 extralen;
126 I32 seen_zerolen;
127 I32 seen_evals;
1aa99e6b 128 I32 utf8;
830247a4
IZ
129#if ADD_TO_REGEXEC
130 char *starttry; /* -Dr: where regtry was called. */
131#define RExC_starttry (pRExC_state->starttry)
132#endif
133} RExC_state_t;
134
135#define RExC_flags16 (pRExC_state->flags16)
136#define RExC_precomp (pRExC_state->precomp)
137#define RExC_rx (pRExC_state->rx)
fac92740 138#define RExC_start (pRExC_state->start)
830247a4
IZ
139#define RExC_end (pRExC_state->end)
140#define RExC_parse (pRExC_state->parse)
141#define RExC_whilem_seen (pRExC_state->whilem_seen)
fac92740 142#define RExC_offsets (pRExC_state->rx->offsets) /* I am not like the others */
830247a4 143#define RExC_emit (pRExC_state->emit)
fac92740 144#define RExC_emit_start (pRExC_state->emit_start)
830247a4
IZ
145#define RExC_naughty (pRExC_state->naughty)
146#define RExC_sawback (pRExC_state->sawback)
147#define RExC_seen (pRExC_state->seen)
148#define RExC_size (pRExC_state->size)
149#define RExC_npar (pRExC_state->npar)
150#define RExC_extralen (pRExC_state->extralen)
151#define RExC_seen_zerolen (pRExC_state->seen_zerolen)
152#define RExC_seen_evals (pRExC_state->seen_evals)
1aa99e6b 153#define RExC_utf8 (pRExC_state->utf8)
830247a4 154
a687059c
LW
155#define ISMULT1(c) ((c) == '*' || (c) == '+' || (c) == '?')
156#define ISMULT2(s) ((*s) == '*' || (*s) == '+' || (*s) == '?' || \
157 ((*s) == '{' && regcurly(s)))
a687059c 158
35c8bce7
LW
159#ifdef SPSTART
160#undef SPSTART /* dratted cpp namespace... */
161#endif
a687059c
LW
162/*
163 * Flags to be passed up and down.
164 */
a687059c 165#define WORST 0 /* Worst case. */
821b33a5 166#define HASWIDTH 0x1 /* Known to match non-null strings. */
a0d0e21e
LW
167#define SIMPLE 0x2 /* Simple enough to be STAR/PLUS operand. */
168#define SPSTART 0x4 /* Starts with * or +. */
169#define TRYAGAIN 0x8 /* Weeded out a declaration. */
a687059c 170
2c2d71f5
JH
171/* Length of a variant. */
172
173typedef struct scan_data_t {
174 I32 len_min;
175 I32 len_delta;
176 I32 pos_min;
177 I32 pos_delta;
178 SV *last_found;
179 I32 last_end; /* min value, <0 unless valid. */
180 I32 last_start_min;
181 I32 last_start_max;
182 SV **longest; /* Either &l_fixed, or &l_float. */
183 SV *longest_fixed;
184 I32 offset_fixed;
185 SV *longest_float;
186 I32 offset_float_min;
187 I32 offset_float_max;
188 I32 flags;
189 I32 whilem_c;
cb434fcc 190 I32 *last_closep;
653099ff 191 struct regnode_charclass_class *start_class;
2c2d71f5
JH
192} scan_data_t;
193
a687059c 194/*
e50aee73 195 * Forward declarations for pregcomp()'s friends.
a687059c 196 */
a0d0e21e 197
b81d288d 198static scan_data_t zero_scan_data = { 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
cb434fcc 199 0, 0, 0, 0, 0, 0};
c277df42
IZ
200
201#define SF_BEFORE_EOL (SF_BEFORE_SEOL|SF_BEFORE_MEOL)
202#define SF_BEFORE_SEOL 0x1
203#define SF_BEFORE_MEOL 0x2
204#define SF_FIX_BEFORE_EOL (SF_FIX_BEFORE_SEOL|SF_FIX_BEFORE_MEOL)
205#define SF_FL_BEFORE_EOL (SF_FL_BEFORE_SEOL|SF_FL_BEFORE_MEOL)
206
09b7f37c
CB
207#ifdef NO_UNARY_PLUS
208# define SF_FIX_SHIFT_EOL (0+2)
209# define SF_FL_SHIFT_EOL (0+4)
210#else
211# define SF_FIX_SHIFT_EOL (+2)
212# define SF_FL_SHIFT_EOL (+4)
213#endif
c277df42
IZ
214
215#define SF_FIX_BEFORE_SEOL (SF_BEFORE_SEOL << SF_FIX_SHIFT_EOL)
216#define SF_FIX_BEFORE_MEOL (SF_BEFORE_MEOL << SF_FIX_SHIFT_EOL)
217
218#define SF_FL_BEFORE_SEOL (SF_BEFORE_SEOL << SF_FL_SHIFT_EOL)
219#define SF_FL_BEFORE_MEOL (SF_BEFORE_MEOL << SF_FL_SHIFT_EOL) /* 0x20 */
220#define SF_IS_INF 0x40
221#define SF_HAS_PAR 0x80
222#define SF_IN_PAR 0x100
223#define SF_HAS_EVAL 0x200
4bfe0158 224#define SCF_DO_SUBSTR 0x400
653099ff
GS
225#define SCF_DO_STCLASS_AND 0x0800
226#define SCF_DO_STCLASS_OR 0x1000
227#define SCF_DO_STCLASS (SCF_DO_STCLASS_AND|SCF_DO_STCLASS_OR)
e1901655 228#define SCF_WHILEM_VISITED_POS 0x2000
c277df42 229
1aa99e6b 230#define UTF RExC_utf8
830247a4
IZ
231#define LOC (RExC_flags16 & PMf_LOCALE)
232#define FOLD (RExC_flags16 & PMf_FOLD)
a0ed51b3 233
ffc61ed2 234#define OOB_UNICODE 12345678
93733859 235#define OOB_NAMEDCLASS -1
b8c5462f 236
a0ed51b3
LW
237#define CHR_SVLEN(sv) (UTF ? sv_len_utf8(sv) : SvCUR(sv))
238#define CHR_DIST(a,b) (UTF ? utf8_distance(a,b) : a - b)
239
8615cb43 240
b45f050a
JF
241/* length of regex to show in messages that don't mark a position within */
242#define RegexLengthToShowInErrorMessages 127
243
244/*
245 * If MARKER[12] are adjusted, be sure to adjust the constants at the top
246 * of t/op/regmesg.t, the tests in t/op/re_tests, and those in
247 * op/pragma/warn/regcomp.
248 */
7253e4e3
RK
249#define MARKER1 "<-- HERE" /* marker as it appears in the description */
250#define MARKER2 " <-- HERE " /* marker as it appears within the regex */
b81d288d 251
7253e4e3 252#define REPORT_LOCATION " in regex; marked by " MARKER1 " in m/%.*s" MARKER2 "%s/"
b45f050a
JF
253
254/*
255 * Calls SAVEDESTRUCTOR_X if needed, then calls Perl_croak with the given
256 * arg. Show regex, up to a maximum length. If it's too long, chop and add
257 * "...".
258 */
a4eb266f 259#define FAIL(msg) \
8615cb43 260 STMT_START { \
a4eb266f 261 char *ellipses = ""; \
7a799e20 262 IV len = RExC_end - RExC_precomp; \
b45f050a 263 \
8615cb43 264 if (!SIZE_ONLY) \
830247a4 265 SAVEDESTRUCTOR_X(clear_re,(void*)RExC_rx); \
b45f050a
JF
266 \
267 if (len > RegexLengthToShowInErrorMessages) { \
268 /* chop 10 shorter than the max, to ensure meaning of "..." */ \
269 len = RegexLengthToShowInErrorMessages - 10; \
a4eb266f 270 ellipses = "..."; \
b45f050a
JF
271 } \
272 Perl_croak(aTHX_ "%s in regex m/%.*s%s/", \
830247a4 273 msg, (int)len, RExC_precomp, ellipses); \
8615cb43
JF
274 } STMT_END
275
b45f050a
JF
276/*
277 * Calls SAVEDESTRUCTOR_X if needed, then calls Perl_croak with the given
278 * args. Show regex, up to a maximum length. If it's too long, chop and add
279 * "...".
280 */
a4eb266f 281#define FAIL2(pat,msg) \
8615cb43 282 STMT_START { \
a4eb266f 283 char *ellipses = ""; \
7a799e20 284 IV len = RExC_end - RExC_precomp; \
b45f050a 285 \
8615cb43 286 if (!SIZE_ONLY) \
830247a4 287 SAVEDESTRUCTOR_X(clear_re,(void*)RExC_rx); \
8615cb43 288 \
b45f050a
JF
289 if (len > RegexLengthToShowInErrorMessages) { \
290 /* chop 10 shorter than the max, to ensure meaning of "..." */ \
291 len = RegexLengthToShowInErrorMessages - 10; \
a4eb266f 292 ellipses = "..."; \
b45f050a
JF
293 } \
294 S_re_croak2(aTHX_ pat, " in regex m/%.*s%s/", \
830247a4 295 msg, (int)len, RExC_precomp, ellipses); \
b45f050a
JF
296 } STMT_END
297
298
299/*
300 * Simple_vFAIL -- like FAIL, but marks the current location in the scan
301 */
302#define Simple_vFAIL(m) \
303 STMT_START { \
7a799e20 304 IV offset = RExC_parse - RExC_precomp; \
b45f050a
JF
305 \
306 Perl_croak(aTHX_ "%s" REPORT_LOCATION, \
830247a4 307 m, (int)offset, RExC_precomp, RExC_precomp + offset); \
b45f050a
JF
308 } STMT_END
309
310/*
311 * Calls SAVEDESTRUCTOR_X if needed, then Simple_vFAIL()
312 */
313#define vFAIL(m) \
314 STMT_START { \
315 if (!SIZE_ONLY) \
830247a4 316 SAVEDESTRUCTOR_X(clear_re,(void*)RExC_rx); \
b45f050a
JF
317 Simple_vFAIL(m); \
318 } STMT_END
319
320/*
321 * Like Simple_vFAIL(), but accepts two arguments.
322 */
323#define Simple_vFAIL2(m,a1) \
324 STMT_START { \
7a799e20 325 IV offset = RExC_parse - RExC_precomp; \
b45f050a
JF
326 \
327 S_re_croak2(aTHX_ m, REPORT_LOCATION, a1, \
830247a4 328 (int)offset, RExC_precomp, RExC_precomp + offset); \
b45f050a
JF
329 } STMT_END
330
331/*
332 * Calls SAVEDESTRUCTOR_X if needed, then Simple_vFAIL2().
333 */
334#define vFAIL2(m,a1) \
335 STMT_START { \
336 if (!SIZE_ONLY) \
830247a4 337 SAVEDESTRUCTOR_X(clear_re,(void*)RExC_rx); \
b45f050a
JF
338 Simple_vFAIL2(m, a1); \
339 } STMT_END
340
341
342/*
343 * Like Simple_vFAIL(), but accepts three arguments.
344 */
345#define Simple_vFAIL3(m, a1, a2) \
346 STMT_START { \
7a799e20 347 IV offset = RExC_parse - RExC_precomp; \
b45f050a
JF
348 \
349 S_re_croak2(aTHX_ m, REPORT_LOCATION, a1, a2, \
830247a4 350 (int)offset, RExC_precomp, RExC_precomp + offset); \
b45f050a
JF
351 } STMT_END
352
353/*
354 * Calls SAVEDESTRUCTOR_X if needed, then Simple_vFAIL3().
355 */
356#define vFAIL3(m,a1,a2) \
357 STMT_START { \
358 if (!SIZE_ONLY) \
830247a4 359 SAVEDESTRUCTOR_X(clear_re,(void*)RExC_rx); \
b45f050a
JF
360 Simple_vFAIL3(m, a1, a2); \
361 } STMT_END
362
363/*
364 * Like Simple_vFAIL(), but accepts four arguments.
365 */
366#define Simple_vFAIL4(m, a1, a2, a3) \
367 STMT_START { \
7a799e20 368 IV offset = RExC_parse - RExC_precomp; \
b45f050a
JF
369 \
370 S_re_croak2(aTHX_ m, REPORT_LOCATION, a1, a2, a3,\
830247a4 371 (int)offset, RExC_precomp, RExC_precomp + offset); \
b45f050a
JF
372 } STMT_END
373
374/*
375 * Like Simple_vFAIL(), but accepts five arguments.
376 */
377#define Simple_vFAIL5(m, a1, a2, a3, a4) \
378 STMT_START { \
7a799e20 379 IV offset = RExC_parse - RExC_precomp; \
b45f050a 380 S_re_croak2(aTHX_ m, REPORT_LOCATION, a1, a2, a3, a4,\
830247a4 381 (int)offset, RExC_precomp, RExC_precomp + offset); \
b45f050a
JF
382 } STMT_END
383
384
385#define vWARN(loc,m) \
386 STMT_START { \
7a799e20 387 IV offset = loc - RExC_precomp; \
b45f050a 388 Perl_warner(aTHX_ WARN_REGEXP, "%s" REPORT_LOCATION,\
830247a4 389 m, (int)offset, RExC_precomp, RExC_precomp + offset); \
b45f050a
JF
390 } STMT_END \
391
f9373011
PM
392#define vWARNdep(loc,m) \
393 STMT_START { \
7a799e20 394 IV offset = loc - RExC_precomp; \
f9373011
PM
395 int warn_cat = ckWARN(WARN_REGEXP) ? WARN_REGEXP : WARN_DEPRECATED; \
396 Perl_warner(aTHX_ warn_cat, "%s" REPORT_LOCATION,\
397 m, (int)offset, RExC_precomp, RExC_precomp + offset); \
398 } STMT_END \
399
b45f050a
JF
400
401#define vWARN2(loc, m, a1) \
402 STMT_START { \
7a799e20 403 IV offset = loc - RExC_precomp; \
b45f050a
JF
404 Perl_warner(aTHX_ WARN_REGEXP, m REPORT_LOCATION,\
405 a1, \
830247a4 406 (int)offset, RExC_precomp, RExC_precomp + offset); \
b45f050a
JF
407 } STMT_END
408
409#define vWARN3(loc, m, a1, a2) \
410 STMT_START { \
7a799e20 411 IV offset = loc - RExC_precomp; \
b45f050a
JF
412 Perl_warner(aTHX_ WARN_REGEXP, m REPORT_LOCATION, \
413 a1, a2, \
830247a4 414 (int)offset, RExC_precomp, RExC_precomp + offset); \
b45f050a
JF
415 } STMT_END
416
417#define vWARN4(loc, m, a1, a2, a3) \
418 STMT_START { \
7a799e20 419 IV offset = loc - RExC_precomp; \
b45f050a
JF
420 Perl_warner(aTHX_ WARN_REGEXP, m REPORT_LOCATION,\
421 a1, a2, a3, \
830247a4 422 (int)offset, RExC_precomp, RExC_precomp + offset); \
8615cb43
JF
423 } STMT_END
424
9d1d55b5
JP
425/* used for the parse_flags section for (?c) -- japhy */
426#define vWARN5(loc, m, a1, a2, a3, a4) \
427 STMT_START { \
7a799e20 428 IV offset = loc - RExC_precomp; \
9d1d55b5
JP
429 Perl_warner(aTHX_ WARN_REGEXP, m REPORT_LOCATION, \
430 a1, a2, a3, a4, \
431 (int)offset, RExC_precomp, RExC_precomp + offset); \
432 } STMT_END
433
8615cb43 434
cd439c50 435/* Allow for side effects in s */
fd3f0ae2 436#define REGC(c,s) STMT_START { if (!SIZE_ONLY) *(s) = (c); else (void)(s);} STMT_END
cd439c50 437
fac92740
MJD
438/* Macros for recording node offsets. 20001227 mjd@plover.com
439 * Nodes are numbered 1, 2, 3, 4. Node #n's position is recorded in
440 * element 2*n-1 of the array. Element #2n holds the byte length node #n.
441 * Element 0 holds the number n.
442 */
443
444#define MJD_OFFSET_DEBUG(x)
445/* #define MJD_OFFSET_DEBUG(x) fprintf x */
446
447
448# define Set_Node_Offset_To_R(node,byte) \
449 STMT_START { \
450 if (! SIZE_ONLY) { \
451 if((node) < 0) { \
452 Perl_croak(aTHX_ "value of node is %d in Offset macro", node); \
453 } else { \
454 RExC_offsets[2*(node)-1] = (byte); \
455 } \
456 } \
457 } STMT_END
458
459# define Set_Node_Offset(node,byte) Set_Node_Offset_To_R((node)-RExC_emit_start, (byte)-RExC_start)
460# define Set_Cur_Node_Offset Set_Node_Offset(RExC_emit, RExC_parse)
461
462# define Set_Node_Length_To_R(node,len) \
463 STMT_START { \
464 if (! SIZE_ONLY) { \
465 MJD_OFFSET_DEBUG((stderr, "** (%d) size of node %d is %d.\n", __LINE__, (node), (len))); \
466 if((node) < 0) { \
467 Perl_croak(aTHX_ "value of node is %d in Length macro", node); \
468 } else { \
469 RExC_offsets[2*(node)] = (len); \
470 } \
471 } \
472 } STMT_END
473
474# define Set_Node_Length(node,len) Set_Node_Length_To_R((node)-RExC_emit_start, len)
475# define Set_Cur_Node_Length(len) Set_Node_Length(RExC_emit, len)
476# define Set_Node_Cur_Length(node) Set_Node_Length(node, RExC_parse - parse_start)
477
478/* Get offsets and lengths */
479#define Node_Offset(n) (RExC_offsets[2*((n)-RExC_emit_start)-1])
480#define Node_Length(n) (RExC_offsets[2*((n)-RExC_emit_start)])
481
acfe0abc 482static void clear_re(pTHX_ void *r);
4327152a 483
653099ff
GS
484/* Mark that we cannot extend a found fixed substring at this point.
485 Updata the longest found anchored substring and the longest found
486 floating substrings if needed. */
487
4327152a 488STATIC void
830247a4 489S_scan_commit(pTHX_ RExC_state_t *pRExC_state, scan_data_t *data)
c277df42 490{
a0ed51b3
LW
491 STRLEN l = CHR_SVLEN(data->last_found);
492 STRLEN old_l = CHR_SVLEN(*data->longest);
b81d288d 493
c277df42
IZ
494 if ((l >= old_l) && ((l > old_l) || (data->flags & SF_BEFORE_EOL))) {
495 sv_setsv(*data->longest, data->last_found);
496 if (*data->longest == data->longest_fixed) {
497 data->offset_fixed = l ? data->last_start_min : data->pos_min;
498 if (data->flags & SF_BEFORE_EOL)
b81d288d 499 data->flags
c277df42
IZ
500 |= ((data->flags & SF_BEFORE_EOL) << SF_FIX_SHIFT_EOL);
501 else
502 data->flags &= ~SF_FIX_BEFORE_EOL;
a0ed51b3
LW
503 }
504 else {
c277df42 505 data->offset_float_min = l ? data->last_start_min : data->pos_min;
b81d288d
AB
506 data->offset_float_max = (l
507 ? data->last_start_max
c277df42
IZ
508 : data->pos_min + data->pos_delta);
509 if (data->flags & SF_BEFORE_EOL)
b81d288d 510 data->flags
c277df42
IZ
511 |= ((data->flags & SF_BEFORE_EOL) << SF_FL_SHIFT_EOL);
512 else
513 data->flags &= ~SF_FL_BEFORE_EOL;
514 }
515 }
516 SvCUR_set(data->last_found, 0);
517 data->last_end = -1;
518 data->flags &= ~SF_BEFORE_EOL;
519}
520
653099ff
GS
521/* Can match anything (initialization) */
522STATIC void
830247a4 523S_cl_anything(pTHX_ RExC_state_t *pRExC_state, struct regnode_charclass_class *cl)
653099ff 524{
9af0acdb
JH
525 int value;
526
653099ff 527 ANYOF_CLASS_ZERO(cl);
9af0acdb
JH
528 for (value = 0; value < 256; ++value)
529 ANYOF_BITMAP_SET(cl, value);
1aa99e6b 530 cl->flags = ANYOF_EOS|ANYOF_UNICODE_ALL;
653099ff
GS
531 if (LOC)
532 cl->flags |= ANYOF_LOCALE;
533}
534
535/* Can match anything (initialization) */
536STATIC int
537S_cl_is_anything(pTHX_ struct regnode_charclass_class *cl)
538{
539 int value;
540
aaa51d5e 541 for (value = 0; value <= ANYOF_MAX; value += 2)
653099ff
GS
542 if (ANYOF_CLASS_TEST(cl, value) && ANYOF_CLASS_TEST(cl, value + 1))
543 return 1;
1aa99e6b
IH
544 if (!(cl->flags & ANYOF_UNICODE_ALL))
545 return 0;
9af0acdb
JH
546 for (value = 0; value < 256; ++value)
547 if (!ANYOF_BITMAP_TEST(cl, value))
653099ff
GS
548 return 0;
549 return 1;
550}
551
552/* Can match anything (initialization) */
553STATIC void
830247a4 554S_cl_init(pTHX_ RExC_state_t *pRExC_state, struct regnode_charclass_class *cl)
653099ff 555{
8ecf7187 556 Zero(cl, 1, struct regnode_charclass_class);
653099ff 557 cl->type = ANYOF;
830247a4 558 cl_anything(pRExC_state, cl);
653099ff
GS
559}
560
561STATIC void
830247a4 562S_cl_init_zero(pTHX_ RExC_state_t *pRExC_state, struct regnode_charclass_class *cl)
653099ff 563{
8ecf7187 564 Zero(cl, 1, struct regnode_charclass_class);
653099ff 565 cl->type = ANYOF;
830247a4 566 cl_anything(pRExC_state, cl);
653099ff
GS
567 if (LOC)
568 cl->flags |= ANYOF_LOCALE;
569}
570
571/* 'And' a given class with another one. Can create false positives */
572/* We assume that cl is not inverted */
573STATIC void
574S_cl_and(pTHX_ struct regnode_charclass_class *cl,
575 struct regnode_charclass_class *and_with)
576{
653099ff
GS
577 if (!(and_with->flags & ANYOF_CLASS)
578 && !(cl->flags & ANYOF_CLASS)
579 && (and_with->flags & ANYOF_LOCALE) == (cl->flags & ANYOF_LOCALE)
580 && !(and_with->flags & ANYOF_FOLD)
581 && !(cl->flags & ANYOF_FOLD)) {
582 int i;
583
584 if (and_with->flags & ANYOF_INVERT)
585 for (i = 0; i < ANYOF_BITMAP_SIZE; i++)
586 cl->bitmap[i] &= ~and_with->bitmap[i];
587 else
588 for (i = 0; i < ANYOF_BITMAP_SIZE; i++)
589 cl->bitmap[i] &= and_with->bitmap[i];
590 } /* XXXX: logic is complicated otherwise, leave it along for a moment. */
591 if (!(and_with->flags & ANYOF_EOS))
592 cl->flags &= ~ANYOF_EOS;
1aa99e6b
IH
593
594 if (cl->flags & ANYOF_UNICODE_ALL && and_with->flags & ANYOF_UNICODE) {
595 cl->flags &= ~ANYOF_UNICODE_ALL;
596 cl->flags |= ANYOF_UNICODE;
597 ARG_SET(cl, ARG(and_with));
598 }
599 if (!(and_with->flags & ANYOF_UNICODE_ALL))
600 cl->flags &= ~ANYOF_UNICODE_ALL;
601 if (!(and_with->flags & (ANYOF_UNICODE|ANYOF_UNICODE_ALL)))
602 cl->flags &= ~ANYOF_UNICODE;
653099ff
GS
603}
604
605/* 'OR' a given class with another one. Can create false positives */
606/* We assume that cl is not inverted */
607STATIC void
830247a4 608S_cl_or(pTHX_ RExC_state_t *pRExC_state, struct regnode_charclass_class *cl, struct regnode_charclass_class *or_with)
653099ff 609{
653099ff
GS
610 if (or_with->flags & ANYOF_INVERT) {
611 /* We do not use
612 * (B1 | CL1) | (!B2 & !CL2) = (B1 | !B2 & !CL2) | (CL1 | (!B2 & !CL2))
613 * <= (B1 | !B2) | (CL1 | !CL2)
614 * which is wasteful if CL2 is small, but we ignore CL2:
615 * (B1 | CL1) | (!B2 & !CL2) <= (B1 | CL1) | !B2 = (B1 | !B2) | CL1
616 * XXXX Can we handle case-fold? Unclear:
617 * (OK1(i) | OK1(i')) | !(OK1(i) | OK1(i')) =
618 * (OK1(i) | OK1(i')) | (!OK1(i) & !OK1(i'))
619 */
620 if ( (or_with->flags & ANYOF_LOCALE) == (cl->flags & ANYOF_LOCALE)
621 && !(or_with->flags & ANYOF_FOLD)
622 && !(cl->flags & ANYOF_FOLD) ) {
623 int i;
624
625 for (i = 0; i < ANYOF_BITMAP_SIZE; i++)
626 cl->bitmap[i] |= ~or_with->bitmap[i];
627 } /* XXXX: logic is complicated otherwise */
628 else {
830247a4 629 cl_anything(pRExC_state, cl);
653099ff
GS
630 }
631 } else {
632 /* (B1 | CL1) | (B2 | CL2) = (B1 | B2) | (CL1 | CL2)) */
633 if ( (or_with->flags & ANYOF_LOCALE) == (cl->flags & ANYOF_LOCALE)
b81d288d 634 && (!(or_with->flags & ANYOF_FOLD)
653099ff
GS
635 || (cl->flags & ANYOF_FOLD)) ) {
636 int i;
637
638 /* OR char bitmap and class bitmap separately */
639 for (i = 0; i < ANYOF_BITMAP_SIZE; i++)
640 cl->bitmap[i] |= or_with->bitmap[i];
641 if (or_with->flags & ANYOF_CLASS) {
642 for (i = 0; i < ANYOF_CLASSBITMAP_SIZE; i++)
643 cl->classflags[i] |= or_with->classflags[i];
644 cl->flags |= ANYOF_CLASS;
645 }
646 }
647 else { /* XXXX: logic is complicated, leave it along for a moment. */
830247a4 648 cl_anything(pRExC_state, cl);
653099ff
GS
649 }
650 }
651 if (or_with->flags & ANYOF_EOS)
652 cl->flags |= ANYOF_EOS;
1aa99e6b
IH
653
654 if (cl->flags & ANYOF_UNICODE && or_with->flags & ANYOF_UNICODE &&
655 ARG(cl) != ARG(or_with)) {
656 cl->flags |= ANYOF_UNICODE_ALL;
657 cl->flags &= ~ANYOF_UNICODE;
658 }
659 if (or_with->flags & ANYOF_UNICODE_ALL) {
660 cl->flags |= ANYOF_UNICODE_ALL;
661 cl->flags &= ~ANYOF_UNICODE;
662 }
653099ff
GS
663}
664
665/* REx optimizer. Converts nodes into quickier variants "in place".
666 Finds fixed substrings. */
667
c277df42
IZ
668/* Stops at toplevel WHILEM as well as at `last'. At end *scanp is set
669 to the position after last scanned or to NULL. */
670
76e3520e 671STATIC I32
830247a4 672S_study_chunk(pTHX_ RExC_state_t *pRExC_state, regnode **scanp, I32 *deltap, regnode *last, scan_data_t *data, U32 flags)
c277df42
IZ
673 /* scanp: Start here (read-write). */
674 /* deltap: Write maxlen-minlen here. */
675 /* last: Stop before this one. */
676{
677 I32 min = 0, pars = 0, code;
678 regnode *scan = *scanp, *next;
679 I32 delta = 0;
680 int is_inf = (flags & SCF_DO_SUBSTR) && (data->flags & SF_IS_INF);
aca2d497 681 int is_inf_internal = 0; /* The studied chunk is infinite */
c277df42
IZ
682 I32 is_par = OP(scan) == OPEN ? ARG(scan) : 0;
683 scan_data_t data_fake;
653099ff 684 struct regnode_charclass_class and_with; /* Valid if flags & SCF_DO_STCLASS_OR */
b81d288d 685
c277df42
IZ
686 while (scan && OP(scan) != END && scan < last) {
687 /* Peephole optimizer: */
688
22c35a8c 689 if (PL_regkind[(U8)OP(scan)] == EXACT) {
653099ff 690 /* Merge several consecutive EXACTish nodes into one. */
c277df42
IZ
691 regnode *n = regnext(scan);
692 U32 stringok = 1;
693#ifdef DEBUGGING
694 regnode *stop = scan;
b81d288d 695#endif
c277df42 696
cd439c50 697 next = scan + NODE_SZ_STR(scan);
c277df42
IZ
698 /* Skip NOTHING, merge EXACT*. */
699 while (n &&
b81d288d 700 ( PL_regkind[(U8)OP(n)] == NOTHING ||
c277df42
IZ
701 (stringok && (OP(n) == OP(scan))))
702 && NEXT_OFF(n)
703 && NEXT_OFF(scan) + NEXT_OFF(n) < I16_MAX) {
704 if (OP(n) == TAIL || n > next)
705 stringok = 0;
22c35a8c 706 if (PL_regkind[(U8)OP(n)] == NOTHING) {
c277df42
IZ
707 NEXT_OFF(scan) += NEXT_OFF(n);
708 next = n + NODE_STEP_REGNODE;
709#ifdef DEBUGGING
710 if (stringok)
711 stop = n;
b81d288d 712#endif
c277df42 713 n = regnext(n);
a0ed51b3 714 }
f49d4d0f 715 else if (stringok) {
cd439c50 716 int oldl = STR_LEN(scan);
c277df42 717 regnode *nnext = regnext(n);
f49d4d0f 718
b81d288d 719 if (oldl + STR_LEN(n) > U8_MAX)
c277df42
IZ
720 break;
721 NEXT_OFF(scan) += NEXT_OFF(n);
cd439c50
IZ
722 STR_LEN(scan) += STR_LEN(n);
723 next = n + NODE_SZ_STR(n);
c277df42 724 /* Now we can overwrite *n : */
f49d4d0f 725 Move(STRING(n), STRING(scan) + oldl, STR_LEN(n), char);
c277df42 726#ifdef DEBUGGING
f49d4d0f 727 stop = next - 1;
b81d288d 728#endif
c277df42
IZ
729 n = nnext;
730 }
731 }
732#ifdef DEBUGGING
733 /* Allow dumping */
cd439c50 734 n = scan + NODE_SZ_STR(scan);
c277df42 735 while (n <= stop) {
22c35a8c 736 if (PL_regkind[(U8)OP(n)] != NOTHING || OP(n) == NOTHING) {
c277df42
IZ
737 OP(n) = OPTIMIZED;
738 NEXT_OFF(n) = 0;
739 }
740 n++;
741 }
653099ff 742#endif
c277df42 743 }
653099ff
GS
744 /* Follow the next-chain of the current node and optimize
745 away all the NOTHINGs from it. */
c277df42 746 if (OP(scan) != CURLYX) {
048cfca1
GS
747 int max = (reg_off_by_arg[OP(scan)]
748 ? I32_MAX
749 /* I32 may be smaller than U16 on CRAYs! */
750 : (I32_MAX < U16_MAX ? I32_MAX : U16_MAX));
c277df42
IZ
751 int off = (reg_off_by_arg[OP(scan)] ? ARG(scan) : NEXT_OFF(scan));
752 int noff;
753 regnode *n = scan;
b81d288d 754
c277df42
IZ
755 /* Skip NOTHING and LONGJMP. */
756 while ((n = regnext(n))
22c35a8c 757 && ((PL_regkind[(U8)OP(n)] == NOTHING && (noff = NEXT_OFF(n)))
c277df42
IZ
758 || ((OP(n) == LONGJMP) && (noff = ARG(n))))
759 && off + noff < max)
760 off += noff;
761 if (reg_off_by_arg[OP(scan)])
762 ARG(scan) = off;
b81d288d 763 else
c277df42
IZ
764 NEXT_OFF(scan) = off;
765 }
653099ff
GS
766 /* The principal pseudo-switch. Cannot be a switch, since we
767 look into several different things. */
b81d288d 768 if (OP(scan) == BRANCH || OP(scan) == BRANCHJ
c277df42
IZ
769 || OP(scan) == IFTHEN || OP(scan) == SUSPEND) {
770 next = regnext(scan);
771 code = OP(scan);
b81d288d
AB
772
773 if (OP(next) == code || code == IFTHEN || code == SUSPEND) {
c277df42 774 I32 max1 = 0, min1 = I32_MAX, num = 0;
653099ff 775 struct regnode_charclass_class accum;
c277df42 776
653099ff 777 if (flags & SCF_DO_SUBSTR) /* XXXX Add !SUSPEND? */
830247a4 778 scan_commit(pRExC_state, data); /* Cannot merge strings after this. */
653099ff 779 if (flags & SCF_DO_STCLASS)
830247a4 780 cl_init_zero(pRExC_state, &accum);
c277df42 781 while (OP(scan) == code) {
830247a4 782 I32 deltanext, minnext, f = 0, fake;
653099ff 783 struct regnode_charclass_class this_class;
c277df42
IZ
784
785 num++;
786 data_fake.flags = 0;
b81d288d 787 if (data) {
2c2d71f5 788 data_fake.whilem_c = data->whilem_c;
cb434fcc
IZ
789 data_fake.last_closep = data->last_closep;
790 }
791 else
792 data_fake.last_closep = &fake;
c277df42
IZ
793 next = regnext(scan);
794 scan = NEXTOPER(scan);
795 if (code != BRANCH)
796 scan = NEXTOPER(scan);
653099ff 797 if (flags & SCF_DO_STCLASS) {
830247a4 798 cl_init(pRExC_state, &this_class);
653099ff
GS
799 data_fake.start_class = &this_class;
800 f = SCF_DO_STCLASS_AND;
b81d288d 801 }
e1901655
IZ
802 if (flags & SCF_WHILEM_VISITED_POS)
803 f |= SCF_WHILEM_VISITED_POS;
653099ff 804 /* we suppose the run is continuous, last=next...*/
830247a4
IZ
805 minnext = study_chunk(pRExC_state, &scan, &deltanext,
806 next, &data_fake, f);
b81d288d 807 if (min1 > minnext)
c277df42
IZ
808 min1 = minnext;
809 if (max1 < minnext + deltanext)
810 max1 = minnext + deltanext;
811 if (deltanext == I32_MAX)
aca2d497 812 is_inf = is_inf_internal = 1;
c277df42
IZ
813 scan = next;
814 if (data_fake.flags & (SF_HAS_PAR|SF_IN_PAR))
815 pars++;
405ff068 816 if (data && (data_fake.flags & SF_HAS_EVAL))
c277df42 817 data->flags |= SF_HAS_EVAL;
2c2d71f5
JH
818 if (data)
819 data->whilem_c = data_fake.whilem_c;
653099ff 820 if (flags & SCF_DO_STCLASS)
830247a4 821 cl_or(pRExC_state, &accum, &this_class);
b81d288d 822 if (code == SUSPEND)
c277df42
IZ
823 break;
824 }
825 if (code == IFTHEN && num < 2) /* Empty ELSE branch */
826 min1 = 0;
827 if (flags & SCF_DO_SUBSTR) {
828 data->pos_min += min1;
829 data->pos_delta += max1 - min1;
830 if (max1 != min1 || is_inf)
831 data->longest = &(data->longest_float);
832 }
833 min += min1;
834 delta += max1 - min1;
653099ff 835 if (flags & SCF_DO_STCLASS_OR) {
830247a4 836 cl_or(pRExC_state, data->start_class, &accum);
653099ff
GS
837 if (min1) {
838 cl_and(data->start_class, &and_with);
839 flags &= ~SCF_DO_STCLASS;
840 }
841 }
842 else if (flags & SCF_DO_STCLASS_AND) {
de0c8cb8
GS
843 if (min1) {
844 cl_and(data->start_class, &accum);
653099ff 845 flags &= ~SCF_DO_STCLASS;
de0c8cb8
GS
846 }
847 else {
b81d288d 848 /* Switch to OR mode: cache the old value of
de0c8cb8
GS
849 * data->start_class */
850 StructCopy(data->start_class, &and_with,
851 struct regnode_charclass_class);
852 flags &= ~SCF_DO_STCLASS_AND;
853 StructCopy(&accum, data->start_class,
854 struct regnode_charclass_class);
855 flags |= SCF_DO_STCLASS_OR;
856 data->start_class->flags |= ANYOF_EOS;
857 }
653099ff 858 }
a0ed51b3
LW
859 }
860 else if (code == BRANCHJ) /* single branch is optimized. */
c277df42
IZ
861 scan = NEXTOPER(NEXTOPER(scan));
862 else /* single branch is optimized. */
863 scan = NEXTOPER(scan);
864 continue;
a0ed51b3
LW
865 }
866 else if (OP(scan) == EXACT) {
cd439c50 867 I32 l = STR_LEN(scan);
1aa99e6b 868 UV uc = *((U8*)STRING(scan));
a0ed51b3 869 if (UTF) {
1aa99e6b
IH
870 U8 *s = (U8*)STRING(scan);
871 l = utf8_length(s, s + l);
9041c2e3 872 uc = utf8_to_uvchr(s, NULL);
a0ed51b3
LW
873 }
874 min += l;
c277df42 875 if (flags & SCF_DO_SUBSTR) { /* Update longest substr. */
c277df42
IZ
876 /* The code below prefers earlier match for fixed
877 offset, later match for variable offset. */
878 if (data->last_end == -1) { /* Update the start info. */
879 data->last_start_min = data->pos_min;
880 data->last_start_max = is_inf
b81d288d 881 ? I32_MAX : data->pos_min + data->pos_delta;
c277df42 882 }
cd439c50 883 sv_catpvn(data->last_found, STRING(scan), STR_LEN(scan));
c277df42
IZ
884 data->last_end = data->pos_min + l;
885 data->pos_min += l; /* As in the first entry. */
886 data->flags &= ~SF_BEFORE_EOL;
887 }
653099ff
GS
888 if (flags & SCF_DO_STCLASS_AND) {
889 /* Check whether it is compatible with what we know already! */
890 int compat = 1;
891
1aa99e6b 892 if (uc >= 0x100 ||
516a5887 893 (!(data->start_class->flags & (ANYOF_CLASS | ANYOF_LOCALE))
1aa99e6b 894 && !ANYOF_BITMAP_TEST(data->start_class, uc)
653099ff 895 && (!(data->start_class->flags & ANYOF_FOLD)
1aa99e6b 896 || !ANYOF_BITMAP_TEST(data->start_class, PL_fold[uc])))
516a5887 897 )
653099ff
GS
898 compat = 0;
899 ANYOF_CLASS_ZERO(data->start_class);
900 ANYOF_BITMAP_ZERO(data->start_class);
901 if (compat)
1aa99e6b 902 ANYOF_BITMAP_SET(data->start_class, uc);
653099ff 903 data->start_class->flags &= ~ANYOF_EOS;
9b877dbb
IH
904 if (uc < 0x100)
905 data->start_class->flags &= ~ANYOF_UNICODE_ALL;
653099ff
GS
906 }
907 else if (flags & SCF_DO_STCLASS_OR) {
908 /* false positive possible if the class is case-folded */
1aa99e6b 909 if (uc < 0x100)
9b877dbb
IH
910 ANYOF_BITMAP_SET(data->start_class, uc);
911 else
912 data->start_class->flags |= ANYOF_UNICODE_ALL;
653099ff
GS
913 data->start_class->flags &= ~ANYOF_EOS;
914 cl_and(data->start_class, &and_with);
915 }
916 flags &= ~SCF_DO_STCLASS;
a0ed51b3 917 }
653099ff 918 else if (PL_regkind[(U8)OP(scan)] == EXACT) { /* But OP != EXACT! */
cd439c50 919 I32 l = STR_LEN(scan);
1aa99e6b 920 UV uc = *((U8*)STRING(scan));
653099ff
GS
921
922 /* Search for fixed substrings supports EXACT only. */
b81d288d 923 if (flags & SCF_DO_SUBSTR)
830247a4 924 scan_commit(pRExC_state, data);
a0ed51b3 925 if (UTF) {
1aa99e6b
IH
926 U8 *s = (U8 *)STRING(scan);
927 l = utf8_length(s, s + l);
9041c2e3 928 uc = utf8_to_uvchr(s, NULL);
a0ed51b3
LW
929 }
930 min += l;
c277df42 931 if (data && (flags & SCF_DO_SUBSTR))
a0ed51b3 932 data->pos_min += l;
653099ff
GS
933 if (flags & SCF_DO_STCLASS_AND) {
934 /* Check whether it is compatible with what we know already! */
935 int compat = 1;
936
1aa99e6b 937 if (uc >= 0x100 ||
516a5887 938 (!(data->start_class->flags & (ANYOF_CLASS | ANYOF_LOCALE))
1aa99e6b 939 && !ANYOF_BITMAP_TEST(data->start_class, uc)
516a5887 940 && !ANYOF_BITMAP_TEST(data->start_class, PL_fold[uc])))
653099ff
GS
941 compat = 0;
942 ANYOF_CLASS_ZERO(data->start_class);
943 ANYOF_BITMAP_ZERO(data->start_class);
944 if (compat) {
1aa99e6b 945 ANYOF_BITMAP_SET(data->start_class, uc);
653099ff
GS
946 data->start_class->flags &= ~ANYOF_EOS;
947 data->start_class->flags |= ANYOF_FOLD;
948 if (OP(scan) == EXACTFL)
949 data->start_class->flags |= ANYOF_LOCALE;
950 }
951 }
952 else if (flags & SCF_DO_STCLASS_OR) {
953 if (data->start_class->flags & ANYOF_FOLD) {
954 /* false positive possible if the class is case-folded.
955 Assume that the locale settings are the same... */
1aa99e6b
IH
956 if (uc < 0x100)
957 ANYOF_BITMAP_SET(data->start_class, uc);
653099ff
GS
958 data->start_class->flags &= ~ANYOF_EOS;
959 }
960 cl_and(data->start_class, &and_with);
961 }
962 flags &= ~SCF_DO_STCLASS;
a0ed51b3 963 }
4d61ec05 964 else if (strchr((char*)PL_varies,OP(scan))) {
9c5ffd7c 965 I32 mincount, maxcount, minnext, deltanext, fl = 0;
aa7a4b56 966 I32 f = flags, pos_before = 0;
c277df42 967 regnode *oscan = scan;
653099ff
GS
968 struct regnode_charclass_class this_class;
969 struct regnode_charclass_class *oclass = NULL;
727f22e3 970 I32 next_is_eval = 0;
653099ff 971
22c35a8c 972 switch (PL_regkind[(U8)OP(scan)]) {
653099ff 973 case WHILEM: /* End of (?:...)* . */
c277df42
IZ
974 scan = NEXTOPER(scan);
975 goto finish;
976 case PLUS:
653099ff 977 if (flags & (SCF_DO_SUBSTR | SCF_DO_STCLASS)) {
c277df42 978 next = NEXTOPER(scan);
653099ff 979 if (OP(next) == EXACT || (flags & SCF_DO_STCLASS)) {
b81d288d
AB
980 mincount = 1;
981 maxcount = REG_INFTY;
c277df42
IZ
982 next = regnext(scan);
983 scan = NEXTOPER(scan);
984 goto do_curly;
985 }
986 }
987 if (flags & SCF_DO_SUBSTR)
988 data->pos_min++;
989 min++;
990 /* Fall through. */
991 case STAR:
653099ff
GS
992 if (flags & SCF_DO_STCLASS) {
993 mincount = 0;
b81d288d 994 maxcount = REG_INFTY;
653099ff
GS
995 next = regnext(scan);
996 scan = NEXTOPER(scan);
997 goto do_curly;
998 }
b81d288d 999 is_inf = is_inf_internal = 1;
c277df42
IZ
1000 scan = regnext(scan);
1001 if (flags & SCF_DO_SUBSTR) {
830247a4 1002 scan_commit(pRExC_state, data); /* Cannot extend fixed substrings */
c277df42
IZ
1003 data->longest = &(data->longest_float);
1004 }
1005 goto optimize_curly_tail;
1006 case CURLY:
b81d288d 1007 mincount = ARG1(scan);
c277df42
IZ
1008 maxcount = ARG2(scan);
1009 next = regnext(scan);
cb434fcc
IZ
1010 if (OP(scan) == CURLYX) {
1011 I32 lp = (data ? *(data->last_closep) : 0);
1012
1013 scan->flags = ((lp <= U8_MAX) ? lp : U8_MAX);
1014 }
c277df42 1015 scan = NEXTOPER(scan) + EXTRA_STEP_2ARGS;
727f22e3 1016 next_is_eval = (OP(scan) == EVAL);
c277df42
IZ
1017 do_curly:
1018 if (flags & SCF_DO_SUBSTR) {
830247a4 1019 if (mincount == 0) scan_commit(pRExC_state,data); /* Cannot extend fixed substrings */
c277df42
IZ
1020 pos_before = data->pos_min;
1021 }
1022 if (data) {
1023 fl = data->flags;
1024 data->flags &= ~(SF_HAS_PAR|SF_IN_PAR|SF_HAS_EVAL);
1025 if (is_inf)
1026 data->flags |= SF_IS_INF;
1027 }
653099ff 1028 if (flags & SCF_DO_STCLASS) {
830247a4 1029 cl_init(pRExC_state, &this_class);
653099ff
GS
1030 oclass = data->start_class;
1031 data->start_class = &this_class;
1032 f |= SCF_DO_STCLASS_AND;
1033 f &= ~SCF_DO_STCLASS_OR;
1034 }
e1901655
IZ
1035 /* These are the cases when once a subexpression
1036 fails at a particular position, it cannot succeed
1037 even after backtracking at the enclosing scope.
b81d288d 1038
e1901655
IZ
1039 XXXX what if minimal match and we are at the
1040 initial run of {n,m}? */
1041 if ((mincount != maxcount - 1) && (maxcount != REG_INFTY))
1042 f &= ~SCF_WHILEM_VISITED_POS;
653099ff 1043
c277df42 1044 /* This will finish on WHILEM, setting scan, or on NULL: */
b81d288d
AB
1045 minnext = study_chunk(pRExC_state, &scan, &deltanext, last, data,
1046 mincount == 0
653099ff
GS
1047 ? (f & ~SCF_DO_SUBSTR) : f);
1048
1049 if (flags & SCF_DO_STCLASS)
1050 data->start_class = oclass;
1051 if (mincount == 0 || minnext == 0) {
1052 if (flags & SCF_DO_STCLASS_OR) {
830247a4 1053 cl_or(pRExC_state, data->start_class, &this_class);
653099ff
GS
1054 }
1055 else if (flags & SCF_DO_STCLASS_AND) {
b81d288d 1056 /* Switch to OR mode: cache the old value of
653099ff
GS
1057 * data->start_class */
1058 StructCopy(data->start_class, &and_with,
1059 struct regnode_charclass_class);
1060 flags &= ~SCF_DO_STCLASS_AND;
1061 StructCopy(&this_class, data->start_class,
1062 struct regnode_charclass_class);
1063 flags |= SCF_DO_STCLASS_OR;
1064 data->start_class->flags |= ANYOF_EOS;
1065 }
1066 } else { /* Non-zero len */
1067 if (flags & SCF_DO_STCLASS_OR) {
830247a4 1068 cl_or(pRExC_state, data->start_class, &this_class);
653099ff
GS
1069 cl_and(data->start_class, &and_with);
1070 }
1071 else if (flags & SCF_DO_STCLASS_AND)
1072 cl_and(data->start_class, &this_class);
1073 flags &= ~SCF_DO_STCLASS;
1074 }
c277df42
IZ
1075 if (!scan) /* It was not CURLYX, but CURLY. */
1076 scan = next;
84037bb0 1077 if (ckWARN(WARN_REGEXP)
727f22e3
JP
1078 /* ? quantifier ok, except for (?{ ... }) */
1079 && (next_is_eval || !(mincount == 0 && maxcount == 1))
84037bb0 1080 && (minnext == 0) && (deltanext == 0)
99799961 1081 && data && !(data->flags & (SF_HAS_PAR|SF_IN_PAR))
17feb5d5 1082 && maxcount <= REG_INFTY/3) /* Complement check for big count */
b45f050a 1083 {
830247a4 1084 vWARN(RExC_parse,
b45f050a
JF
1085 "Quantifier unexpected on zero-length expression");
1086 }
1087
c277df42 1088 min += minnext * mincount;
b81d288d 1089 is_inf_internal |= ((maxcount == REG_INFTY
155aba94
GS
1090 && (minnext + deltanext) > 0)
1091 || deltanext == I32_MAX);
aca2d497 1092 is_inf |= is_inf_internal;
c277df42
IZ
1093 delta += (minnext + deltanext) * maxcount - minnext * mincount;
1094
1095 /* Try powerful optimization CURLYX => CURLYN. */
b81d288d 1096 if ( OP(oscan) == CURLYX && data
c277df42
IZ
1097 && data->flags & SF_IN_PAR
1098 && !(data->flags & SF_HAS_EVAL)
1099 && !deltanext && minnext == 1 ) {
1100 /* Try to optimize to CURLYN. */
1101 regnode *nxt = NEXTOPER(oscan) + EXTRA_STEP_2ARGS;
497b47a8
JH
1102 regnode *nxt1 = nxt;
1103#ifdef DEBUGGING
1104 regnode *nxt2;
1105#endif
c277df42
IZ
1106
1107 /* Skip open. */
1108 nxt = regnext(nxt);
4d61ec05 1109 if (!strchr((char*)PL_simple,OP(nxt))
22c35a8c 1110 && !(PL_regkind[(U8)OP(nxt)] == EXACT
b81d288d 1111 && STR_LEN(nxt) == 1))
c277df42 1112 goto nogo;
497b47a8 1113#ifdef DEBUGGING
c277df42 1114 nxt2 = nxt;
497b47a8 1115#endif
c277df42 1116 nxt = regnext(nxt);
b81d288d 1117 if (OP(nxt) != CLOSE)
c277df42
IZ
1118 goto nogo;
1119 /* Now we know that nxt2 is the only contents: */
1120 oscan->flags = ARG(nxt);
1121 OP(oscan) = CURLYN;
1122 OP(nxt1) = NOTHING; /* was OPEN. */
1123#ifdef DEBUGGING
1124 OP(nxt1 + 1) = OPTIMIZED; /* was count. */
1125 NEXT_OFF(nxt1+ 1) = 0; /* just for consistancy. */
1126 NEXT_OFF(nxt2) = 0; /* just for consistancy with CURLY. */
1127 OP(nxt) = OPTIMIZED; /* was CLOSE. */
1128 OP(nxt + 1) = OPTIMIZED; /* was count. */
1129 NEXT_OFF(nxt+ 1) = 0; /* just for consistancy. */
b81d288d 1130#endif
c277df42 1131 }
c277df42
IZ
1132 nogo:
1133
1134 /* Try optimization CURLYX => CURLYM. */
b81d288d 1135 if ( OP(oscan) == CURLYX && data
c277df42 1136 && !(data->flags & SF_HAS_PAR)
c277df42
IZ
1137 && !(data->flags & SF_HAS_EVAL)
1138 && !deltanext ) {
1139 /* XXXX How to optimize if data == 0? */
1140 /* Optimize to a simpler form. */
1141 regnode *nxt = NEXTOPER(oscan) + EXTRA_STEP_2ARGS; /* OPEN */
1142 regnode *nxt2;
1143
1144 OP(oscan) = CURLYM;
1145 while ( (nxt2 = regnext(nxt)) /* skip over embedded stuff*/
b81d288d 1146 && (OP(nxt2) != WHILEM))
c277df42
IZ
1147 nxt = nxt2;
1148 OP(nxt2) = SUCCEED; /* Whas WHILEM */
c277df42
IZ
1149 /* Need to optimize away parenths. */
1150 if (data->flags & SF_IN_PAR) {
1151 /* Set the parenth number. */
1152 regnode *nxt1 = NEXTOPER(oscan) + EXTRA_STEP_2ARGS; /* OPEN*/
1153
b81d288d 1154 if (OP(nxt) != CLOSE)
b45f050a 1155 FAIL("Panic opt close");
c277df42
IZ
1156 oscan->flags = ARG(nxt);
1157 OP(nxt1) = OPTIMIZED; /* was OPEN. */
1158 OP(nxt) = OPTIMIZED; /* was CLOSE. */
1159#ifdef DEBUGGING
1160 OP(nxt1 + 1) = OPTIMIZED; /* was count. */
1161 OP(nxt + 1) = OPTIMIZED; /* was count. */
1162 NEXT_OFF(nxt1 + 1) = 0; /* just for consistancy. */
1163 NEXT_OFF(nxt + 1) = 0; /* just for consistancy. */
b81d288d 1164#endif
c277df42
IZ
1165#if 0
1166 while ( nxt1 && (OP(nxt1) != WHILEM)) {
1167 regnode *nnxt = regnext(nxt1);
b81d288d 1168
c277df42
IZ
1169 if (nnxt == nxt) {
1170 if (reg_off_by_arg[OP(nxt1)])
1171 ARG_SET(nxt1, nxt2 - nxt1);
1172 else if (nxt2 - nxt1 < U16_MAX)
1173 NEXT_OFF(nxt1) = nxt2 - nxt1;
1174 else
1175 OP(nxt) = NOTHING; /* Cannot beautify */
1176 }
1177 nxt1 = nnxt;
1178 }
1179#endif
1180 /* Optimize again: */
b81d288d 1181 study_chunk(pRExC_state, &nxt1, &deltanext, nxt,
e1901655 1182 NULL, 0);
a0ed51b3
LW
1183 }
1184 else
c277df42 1185 oscan->flags = 0;
c277df42 1186 }
e1901655
IZ
1187 else if ((OP(oscan) == CURLYX)
1188 && (flags & SCF_WHILEM_VISITED_POS)
1189 /* See the comment on a similar expression above.
1190 However, this time it not a subexpression
1191 we care about, but the expression itself. */
1192 && (maxcount == REG_INFTY)
1193 && data && ++data->whilem_c < 16) {
1194 /* This stays as CURLYX, we can put the count/of pair. */
2c2d71f5
JH
1195 /* Find WHILEM (as in regexec.c) */
1196 regnode *nxt = oscan + NEXT_OFF(oscan);
1197
1198 if (OP(PREVOPER(nxt)) == NOTHING) /* LONGJMP */
1199 nxt += ARG(nxt);
1200 PREVOPER(nxt)->flags = data->whilem_c
830247a4 1201 | (RExC_whilem_seen << 4); /* On WHILEM */
2c2d71f5 1202 }
b81d288d 1203 if (data && fl & (SF_HAS_PAR|SF_IN_PAR))
c277df42
IZ
1204 pars++;
1205 if (flags & SCF_DO_SUBSTR) {
1206 SV *last_str = Nullsv;
1207 int counted = mincount != 0;
1208
1209 if (data->last_end > 0 && mincount != 0) { /* Ends with a string. */
b81d288d 1210 I32 b = pos_before >= data->last_start_min
c277df42
IZ
1211 ? pos_before : data->last_start_min;
1212 STRLEN l;
1213 char *s = SvPV(data->last_found, l);
a0ed51b3
LW
1214 I32 old = b - data->last_start_min;
1215
1216 if (UTF)
1217 old = utf8_hop((U8*)s, old) - (U8*)s;
c277df42 1218
a0ed51b3 1219 l -= old;
c277df42 1220 /* Get the added string: */
79cb57f6 1221 last_str = newSVpvn(s + old, l);
c277df42
IZ
1222 if (deltanext == 0 && pos_before == b) {
1223 /* What was added is a constant string */
1224 if (mincount > 1) {
1225 SvGROW(last_str, (mincount * l) + 1);
b81d288d 1226 repeatcpy(SvPVX(last_str) + l,
c277df42
IZ
1227 SvPVX(last_str), l, mincount - 1);
1228 SvCUR(last_str) *= mincount;
1229 /* Add additional parts. */
b81d288d 1230 SvCUR_set(data->last_found,
c277df42
IZ
1231 SvCUR(data->last_found) - l);
1232 sv_catsv(data->last_found, last_str);
1233 data->last_end += l * (mincount - 1);
1234 }
2a8d9689
HS
1235 } else {
1236 /* start offset must point into the last copy */
1237 data->last_start_min += minnext * (mincount - 1);
4b2cff9a
HS
1238 data->last_start_max += is_inf ? 0 : (maxcount - 1)
1239 * (minnext + data->pos_delta);
c277df42
IZ
1240 }
1241 }
1242 /* It is counted once already... */
1243 data->pos_min += minnext * (mincount - counted);
1244 data->pos_delta += - counted * deltanext +
1245 (minnext + deltanext) * maxcount - minnext * mincount;
1246 if (mincount != maxcount) {
653099ff
GS
1247 /* Cannot extend fixed substrings found inside
1248 the group. */
830247a4 1249 scan_commit(pRExC_state,data);
c277df42
IZ
1250 if (mincount && last_str) {
1251 sv_setsv(data->last_found, last_str);
1252 data->last_end = data->pos_min;
b81d288d 1253 data->last_start_min =
a0ed51b3 1254 data->pos_min - CHR_SVLEN(last_str);
b81d288d
AB
1255 data->last_start_max = is_inf
1256 ? I32_MAX
c277df42 1257 : data->pos_min + data->pos_delta
a0ed51b3 1258 - CHR_SVLEN(last_str);
c277df42
IZ
1259 }
1260 data->longest = &(data->longest_float);
1261 }
aca2d497 1262 SvREFCNT_dec(last_str);
c277df42 1263 }
405ff068 1264 if (data && (fl & SF_HAS_EVAL))
c277df42
IZ
1265 data->flags |= SF_HAS_EVAL;
1266 optimize_curly_tail:
c277df42 1267 if (OP(oscan) != CURLYX) {
22c35a8c 1268 while (PL_regkind[(U8)OP(next = regnext(oscan))] == NOTHING
c277df42
IZ
1269 && NEXT_OFF(next))
1270 NEXT_OFF(oscan) += NEXT_OFF(next);
1271 }
c277df42 1272 continue;
653099ff 1273 default: /* REF and CLUMP only? */
c277df42 1274 if (flags & SCF_DO_SUBSTR) {
830247a4 1275 scan_commit(pRExC_state,data); /* Cannot expect anything... */
c277df42
IZ
1276 data->longest = &(data->longest_float);
1277 }
aca2d497 1278 is_inf = is_inf_internal = 1;
653099ff 1279 if (flags & SCF_DO_STCLASS_OR)
830247a4 1280 cl_anything(pRExC_state, data->start_class);
653099ff 1281 flags &= ~SCF_DO_STCLASS;
c277df42
IZ
1282 break;
1283 }
a0ed51b3 1284 }
ffc61ed2 1285 else if (strchr((char*)PL_simple,OP(scan))) {
9c5ffd7c 1286 int value = 0;
653099ff 1287
c277df42 1288 if (flags & SCF_DO_SUBSTR) {
830247a4 1289 scan_commit(pRExC_state,data);
c277df42
IZ
1290 data->pos_min++;
1291 }
1292 min++;
653099ff
GS
1293 if (flags & SCF_DO_STCLASS) {
1294 data->start_class->flags &= ~ANYOF_EOS; /* No match on empty */
1295
1296 /* Some of the logic below assumes that switching
1297 locale on will only add false positives. */
1298 switch (PL_regkind[(U8)OP(scan)]) {
653099ff 1299 case SANY:
653099ff
GS
1300 default:
1301 do_default:
1302 /* Perl_croak(aTHX_ "panic: unexpected simple REx opcode %d", OP(scan)); */
1303 if (flags & SCF_DO_STCLASS_OR) /* Allow everything */
830247a4 1304 cl_anything(pRExC_state, data->start_class);
653099ff
GS
1305 break;
1306 case REG_ANY:
1307 if (OP(scan) == SANY)
1308 goto do_default;
1309 if (flags & SCF_DO_STCLASS_OR) { /* Everything but \n */
1310 value = (ANYOF_BITMAP_TEST(data->start_class,'\n')
1311 || (data->start_class->flags & ANYOF_CLASS));
830247a4 1312 cl_anything(pRExC_state, data->start_class);
653099ff
GS
1313 }
1314 if (flags & SCF_DO_STCLASS_AND || !value)
1315 ANYOF_BITMAP_CLEAR(data->start_class,'\n');
1316 break;
1317 case ANYOF:
1318 if (flags & SCF_DO_STCLASS_AND)
1319 cl_and(data->start_class,
1320 (struct regnode_charclass_class*)scan);
1321 else
830247a4 1322 cl_or(pRExC_state, data->start_class,
653099ff
GS
1323 (struct regnode_charclass_class*)scan);
1324 break;
1325 case ALNUM:
1326 if (flags & SCF_DO_STCLASS_AND) {
1327 if (!(data->start_class->flags & ANYOF_LOCALE)) {
1328 ANYOF_CLASS_CLEAR(data->start_class,ANYOF_NALNUM);
1329 for (value = 0; value < 256; value++)
1330 if (!isALNUM(value))
1331 ANYOF_BITMAP_CLEAR(data->start_class, value);
1332 }
1333 }
1334 else {
1335 if (data->start_class->flags & ANYOF_LOCALE)
1336 ANYOF_CLASS_SET(data->start_class,ANYOF_ALNUM);
1337 else {
1338 for (value = 0; value < 256; value++)
1339 if (isALNUM(value))
b81d288d 1340 ANYOF_BITMAP_SET(data->start_class, value);
653099ff
GS
1341 }
1342 }
1343 break;
1344 case ALNUML:
1345 if (flags & SCF_DO_STCLASS_AND) {
1346 if (data->start_class->flags & ANYOF_LOCALE)
1347 ANYOF_CLASS_CLEAR(data->start_class,ANYOF_NALNUM);
1348 }
1349 else {
1350 ANYOF_CLASS_SET(data->start_class,ANYOF_ALNUM);
1351 data->start_class->flags |= ANYOF_LOCALE;
1352 }
1353 break;
1354 case NALNUM:
1355 if (flags & SCF_DO_STCLASS_AND) {
1356 if (!(data->start_class->flags & ANYOF_LOCALE)) {
1357 ANYOF_CLASS_CLEAR(data->start_class,ANYOF_ALNUM);
1358 for (value = 0; value < 256; value++)
1359 if (isALNUM(value))
1360 ANYOF_BITMAP_CLEAR(data->start_class, value);
1361 }
1362 }
1363 else {
1364 if (data->start_class->flags & ANYOF_LOCALE)
1365 ANYOF_CLASS_SET(data->start_class,ANYOF_NALNUM);
1366 else {
1367 for (value = 0; value < 256; value++)
1368 if (!isALNUM(value))
b81d288d 1369 ANYOF_BITMAP_SET(data->start_class, value);
653099ff
GS
1370 }
1371 }
1372 break;
1373 case NALNUML:
1374 if (flags & SCF_DO_STCLASS_AND) {
1375 if (data->start_class->flags & ANYOF_LOCALE)
1376 ANYOF_CLASS_CLEAR(data->start_class,ANYOF_ALNUM);
1377 }
1378 else {
1379 data->start_class->flags |= ANYOF_LOCALE;
1380 ANYOF_CLASS_SET(data->start_class,ANYOF_NALNUM);
1381 }
1382 break;
1383 case SPACE:
1384 if (flags & SCF_DO_STCLASS_AND) {
1385 if (!(data->start_class->flags & ANYOF_LOCALE)) {
1386 ANYOF_CLASS_CLEAR(data->start_class,ANYOF_NSPACE);
1387 for (value = 0; value < 256; value++)
1388 if (!isSPACE(value))
1389 ANYOF_BITMAP_CLEAR(data->start_class, value);
1390 }
1391 }
1392 else {
1393 if (data->start_class->flags & ANYOF_LOCALE)
1394 ANYOF_CLASS_SET(data->start_class,ANYOF_SPACE);
1395 else {
1396 for (value = 0; value < 256; value++)
1397 if (isSPACE(value))
b81d288d 1398 ANYOF_BITMAP_SET(data->start_class, value);
653099ff
GS
1399 }
1400 }
1401 break;
1402 case SPACEL:
1403 if (flags & SCF_DO_STCLASS_AND) {
1404 if (data->start_class->flags & ANYOF_LOCALE)
1405 ANYOF_CLASS_CLEAR(data->start_class,ANYOF_NSPACE);
1406 }
1407 else {
1408 data->start_class->flags |= ANYOF_LOCALE;
1409 ANYOF_CLASS_SET(data->start_class,ANYOF_SPACE);
1410 }
1411 break;
1412 case NSPACE:
1413 if (flags & SCF_DO_STCLASS_AND) {
1414 if (!(data->start_class->flags & ANYOF_LOCALE)) {
1415 ANYOF_CLASS_CLEAR(data->start_class,ANYOF_SPACE);
1416 for (value = 0; value < 256; value++)
1417 if (isSPACE(value))
1418 ANYOF_BITMAP_CLEAR(data->start_class, value);
1419 }
1420 }
1421 else {
1422 if (data->start_class->flags & ANYOF_LOCALE)
1423 ANYOF_CLASS_SET(data->start_class,ANYOF_NSPACE);
1424 else {
1425 for (value = 0; value < 256; value++)
1426 if (!isSPACE(value))
b81d288d 1427 ANYOF_BITMAP_SET(data->start_class, value);
653099ff
GS
1428 }
1429 }
1430 break;
1431 case NSPACEL:
1432 if (flags & SCF_DO_STCLASS_AND) {
1433 if (data->start_class->flags & ANYOF_LOCALE) {
1434 ANYOF_CLASS_CLEAR(data->start_class,ANYOF_SPACE);
1435 for (value = 0; value < 256; value++)
1436 if (!isSPACE(value))
1437 ANYOF_BITMAP_CLEAR(data->start_class, value);
1438 }
1439 }
1440 else {
1441 data->start_class->flags |= ANYOF_LOCALE;
1442 ANYOF_CLASS_SET(data->start_class,ANYOF_NSPACE);
1443 }
1444 break;
1445 case DIGIT:
1446 if (flags & SCF_DO_STCLASS_AND) {
1447 ANYOF_CLASS_CLEAR(data->start_class,ANYOF_NDIGIT);
1448 for (value = 0; value < 256; value++)
1449 if (!isDIGIT(value))
1450 ANYOF_BITMAP_CLEAR(data->start_class, value);
1451 }
1452 else {
1453 if (data->start_class->flags & ANYOF_LOCALE)
1454 ANYOF_CLASS_SET(data->start_class,ANYOF_DIGIT);
1455 else {
1456 for (value = 0; value < 256; value++)
1457 if (isDIGIT(value))
b81d288d 1458 ANYOF_BITMAP_SET(data->start_class, value);
653099ff
GS
1459 }
1460 }
1461 break;
1462 case NDIGIT:
1463 if (flags & SCF_DO_STCLASS_AND) {
1464 ANYOF_CLASS_CLEAR(data->start_class,ANYOF_DIGIT);
1465 for (value = 0; value < 256; value++)
1466 if (isDIGIT(value))
1467 ANYOF_BITMAP_CLEAR(data->start_class, value);
1468 }
1469 else {
1470 if (data->start_class->flags & ANYOF_LOCALE)
1471 ANYOF_CLASS_SET(data->start_class,ANYOF_NDIGIT);
1472 else {
1473 for (value = 0; value < 256; value++)
1474 if (!isDIGIT(value))
b81d288d 1475 ANYOF_BITMAP_SET(data->start_class, value);
653099ff
GS
1476 }
1477 }
1478 break;
1479 }
1480 if (flags & SCF_DO_STCLASS_OR)
1481 cl_and(data->start_class, &and_with);
1482 flags &= ~SCF_DO_STCLASS;
1483 }
a0ed51b3 1484 }
22c35a8c 1485 else if (PL_regkind[(U8)OP(scan)] == EOL && flags & SCF_DO_SUBSTR) {
c277df42
IZ
1486 data->flags |= (OP(scan) == MEOL
1487 ? SF_BEFORE_MEOL
1488 : SF_BEFORE_SEOL);
a0ed51b3 1489 }
653099ff
GS
1490 else if ( PL_regkind[(U8)OP(scan)] == BRANCHJ
1491 /* Lookbehind, or need to calculate parens/evals/stclass: */
1492 && (scan->flags || data || (flags & SCF_DO_STCLASS))
c277df42 1493 && (OP(scan) == IFMATCH || OP(scan) == UNLESSM)) {
653099ff 1494 /* Lookahead/lookbehind */
cb434fcc 1495 I32 deltanext, minnext, fake = 0;
c277df42 1496 regnode *nscan;
653099ff
GS
1497 struct regnode_charclass_class intrnl;
1498 int f = 0;
c277df42
IZ
1499
1500 data_fake.flags = 0;
b81d288d 1501 if (data) {
2c2d71f5 1502 data_fake.whilem_c = data->whilem_c;
cb434fcc
IZ
1503 data_fake.last_closep = data->last_closep;
1504 }
1505 else
1506 data_fake.last_closep = &fake;
653099ff
GS
1507 if ( flags & SCF_DO_STCLASS && !scan->flags
1508 && OP(scan) == IFMATCH ) { /* Lookahead */
830247a4 1509 cl_init(pRExC_state, &intrnl);
653099ff 1510 data_fake.start_class = &intrnl;
e1901655 1511 f |= SCF_DO_STCLASS_AND;
653099ff 1512 }
e1901655
IZ
1513 if (flags & SCF_WHILEM_VISITED_POS)
1514 f |= SCF_WHILEM_VISITED_POS;
c277df42
IZ
1515 next = regnext(scan);
1516 nscan = NEXTOPER(NEXTOPER(scan));
830247a4 1517 minnext = study_chunk(pRExC_state, &nscan, &deltanext, last, &data_fake, f);
c277df42
IZ
1518 if (scan->flags) {
1519 if (deltanext) {
9baa0206 1520 vFAIL("Variable length lookbehind not implemented");
a0ed51b3
LW
1521 }
1522 else if (minnext > U8_MAX) {
9baa0206 1523 vFAIL2("Lookbehind longer than %"UVuf" not implemented", (UV)U8_MAX);
c277df42
IZ
1524 }
1525 scan->flags = minnext;
1526 }
1527 if (data && data_fake.flags & (SF_HAS_PAR|SF_IN_PAR))
1528 pars++;
405ff068 1529 if (data && (data_fake.flags & SF_HAS_EVAL))
c277df42 1530 data->flags |= SF_HAS_EVAL;
2c2d71f5
JH
1531 if (data)
1532 data->whilem_c = data_fake.whilem_c;
e1901655 1533 if (f & SCF_DO_STCLASS_AND) {
653099ff
GS
1534 int was = (data->start_class->flags & ANYOF_EOS);
1535
1536 cl_and(data->start_class, &intrnl);
1537 if (was)
1538 data->start_class->flags |= ANYOF_EOS;
1539 }
a0ed51b3
LW
1540 }
1541 else if (OP(scan) == OPEN) {
c277df42 1542 pars++;
a0ed51b3 1543 }
cb434fcc
IZ
1544 else if (OP(scan) == CLOSE) {
1545 if (ARG(scan) == is_par) {
1546 next = regnext(scan);
c277df42 1547
cb434fcc
IZ
1548 if ( next && (OP(next) != WHILEM) && next < last)
1549 is_par = 0; /* Disable optimization */
1550 }
1551 if (data)
1552 *(data->last_closep) = ARG(scan);
a0ed51b3
LW
1553 }
1554 else if (OP(scan) == EVAL) {
c277df42
IZ
1555 if (data)
1556 data->flags |= SF_HAS_EVAL;
1557 }
96776eda 1558 else if (OP(scan) == LOGICAL && scan->flags == 2) { /* Embedded follows */
0f5d15d6 1559 if (flags & SCF_DO_SUBSTR) {
830247a4 1560 scan_commit(pRExC_state,data);
0f5d15d6
IZ
1561 data->longest = &(data->longest_float);
1562 }
1563 is_inf = is_inf_internal = 1;
653099ff 1564 if (flags & SCF_DO_STCLASS_OR) /* Allow everything */
830247a4 1565 cl_anything(pRExC_state, data->start_class);
96776eda 1566 flags &= ~SCF_DO_STCLASS;
0f5d15d6 1567 }
c277df42
IZ
1568 /* Else: zero-length, ignore. */
1569 scan = regnext(scan);
1570 }
1571
1572 finish:
1573 *scanp = scan;
aca2d497 1574 *deltap = is_inf_internal ? I32_MAX : delta;
b81d288d 1575 if (flags & SCF_DO_SUBSTR && is_inf)
c277df42
IZ
1576 data->pos_delta = I32_MAX - data->pos_min;
1577 if (is_par > U8_MAX)
1578 is_par = 0;
1579 if (is_par && pars==1 && data) {
1580 data->flags |= SF_IN_PAR;
1581 data->flags &= ~SF_HAS_PAR;
a0ed51b3
LW
1582 }
1583 else if (pars && data) {
c277df42
IZ
1584 data->flags |= SF_HAS_PAR;
1585 data->flags &= ~SF_IN_PAR;
1586 }
653099ff
GS
1587 if (flags & SCF_DO_STCLASS_OR)
1588 cl_and(data->start_class, &and_with);
c277df42
IZ
1589 return min;
1590}
1591
76e3520e 1592STATIC I32
830247a4 1593S_add_data(pTHX_ RExC_state_t *pRExC_state, I32 n, char *s)
c277df42 1594{
830247a4 1595 if (RExC_rx->data) {
b81d288d
AB
1596 Renewc(RExC_rx->data,
1597 sizeof(*RExC_rx->data) + sizeof(void*) * (RExC_rx->data->count + n - 1),
c277df42 1598 char, struct reg_data);
830247a4
IZ
1599 Renew(RExC_rx->data->what, RExC_rx->data->count + n, U8);
1600 RExC_rx->data->count += n;
a0ed51b3
LW
1601 }
1602 else {
830247a4 1603 Newc(1207, RExC_rx->data, sizeof(*RExC_rx->data) + sizeof(void*) * (n - 1),
c277df42 1604 char, struct reg_data);
830247a4
IZ
1605 New(1208, RExC_rx->data->what, n, U8);
1606 RExC_rx->data->count = n;
c277df42 1607 }
830247a4
IZ
1608 Copy(s, RExC_rx->data->what + RExC_rx->data->count - n, n, U8);
1609 return RExC_rx->data->count - n;
c277df42
IZ
1610}
1611
d88dccdf 1612void
864dbfa3 1613Perl_reginitcolors(pTHX)
d88dccdf 1614{
d88dccdf
IZ
1615 int i = 0;
1616 char *s = PerlEnv_getenv("PERL_RE_COLORS");
b81d288d 1617
d88dccdf
IZ
1618 if (s) {
1619 PL_colors[0] = s = savepv(s);
1620 while (++i < 6) {
1621 s = strchr(s, '\t');
1622 if (s) {
1623 *s = '\0';
1624 PL_colors[i] = ++s;
1625 }
1626 else
c712d376 1627 PL_colors[i] = s = "";
d88dccdf
IZ
1628 }
1629 } else {
b81d288d 1630 while (i < 6)
d88dccdf
IZ
1631 PL_colors[i++] = "";
1632 }
1633 PL_colorset = 1;
1634}
1635
8615cb43 1636
a687059c 1637/*
e50aee73 1638 - pregcomp - compile a regular expression into internal code
a687059c
LW
1639 *
1640 * We can't allocate space until we know how big the compiled form will be,
1641 * but we can't compile it (and thus know how big it is) until we've got a
1642 * place to put the code. So we cheat: we compile it twice, once with code
1643 * generation turned off and size counting turned on, and once "for real".
1644 * This also means that we don't allocate space until we are sure that the
1645 * thing really will compile successfully, and we never have to move the
1646 * code and thus invalidate pointers into it. (Note that it has to be in
1647 * one piece because free() must be able to free it all.) [NB: not true in perl]
1648 *
1649 * Beware that the optimization-preparation code in here knows about some
1650 * of the structure of the compiled regexp. [I'll say.]
1651 */
1652regexp *
864dbfa3 1653Perl_pregcomp(pTHX_ char *exp, char *xend, PMOP *pm)
a687059c 1654{
a0d0e21e 1655 register regexp *r;
c277df42 1656 regnode *scan;
c277df42 1657 regnode *first;
a0d0e21e 1658 I32 flags;
a0d0e21e
LW
1659 I32 minlen = 0;
1660 I32 sawplus = 0;
1661 I32 sawopen = 0;
2c2d71f5 1662 scan_data_t data;
830247a4
IZ
1663 RExC_state_t RExC_state;
1664 RExC_state_t *pRExC_state = &RExC_state;
a0d0e21e
LW
1665
1666 if (exp == NULL)
c277df42 1667 FAIL("NULL regexp argument");
a0d0e21e 1668
84e09d5e
JH
1669 /* XXXX This looks very suspicious... */
1670 if (pm->op_pmdynflags & PMdf_CMP_UTF8)
1671 RExC_utf8 = 1;
1672 else
1673 RExC_utf8 = 0;
a0ed51b3 1674
5cfc7842 1675 RExC_precomp = exp;
35ef4773 1676 DEBUG_r(if (!PL_colorset) reginitcolors());
2c2d71f5 1677 DEBUG_r(PerlIO_printf(Perl_debug_log, "%sCompiling REx%s `%s%*s%s'\n",
d88dccdf 1678 PL_colors[4],PL_colors[5],PL_colors[0],
830247a4
IZ
1679 (int)(xend - exp), RExC_precomp, PL_colors[1]));
1680 RExC_flags16 = pm->op_pmflags;
1681 RExC_sawback = 0;
bbce6d69 1682
830247a4
IZ
1683 RExC_seen = 0;
1684 RExC_seen_zerolen = *exp == '^' ? -1 : 0;
1685 RExC_seen_evals = 0;
1686 RExC_extralen = 0;
c277df42 1687
bbce6d69 1688 /* First pass: determine size, legality. */
830247a4 1689 RExC_parse = exp;
fac92740 1690 RExC_start = exp;
830247a4
IZ
1691 RExC_end = xend;
1692 RExC_naughty = 0;
1693 RExC_npar = 1;
1694 RExC_size = 0L;
1695 RExC_emit = &PL_regdummy;
1696 RExC_whilem_seen = 0;
85ddcde9
JH
1697#if 0 /* REGC() is (currently) a NOP at the first pass.
1698 * Clever compilers notice this and complain. --jhi */
830247a4 1699 REGC((U8)REG_MAGIC, (char*)RExC_emit);
85ddcde9 1700#endif
830247a4 1701 if (reg(pRExC_state, 0, &flags) == NULL) {
830247a4 1702 RExC_precomp = Nullch;
a0d0e21e
LW
1703 return(NULL);
1704 }
830247a4 1705 DEBUG_r(PerlIO_printf(Perl_debug_log, "size %"IVdf" ", (IV)RExC_size));
c277df42 1706
c277df42
IZ
1707 /* Small enough for pointer-storage convention?
1708 If extralen==0, this means that we will not need long jumps. */
830247a4
IZ
1709 if (RExC_size >= 0x10000L && RExC_extralen)
1710 RExC_size += RExC_extralen;
c277df42 1711 else
830247a4
IZ
1712 RExC_extralen = 0;
1713 if (RExC_whilem_seen > 15)
1714 RExC_whilem_seen = 15;
a0d0e21e 1715
bbce6d69 1716 /* Allocate space and initialize. */
830247a4 1717 Newc(1001, r, sizeof(regexp) + (unsigned)RExC_size * sizeof(regnode),
c277df42 1718 char, regexp);
a0d0e21e 1719 if (r == NULL)
b45f050a
JF
1720 FAIL("Regexp out of space");
1721
0f79a09d
GS
1722#ifdef DEBUGGING
1723 /* avoid reading uninitialized memory in DEBUGGING code in study_chunk() */
830247a4 1724 Zero(r, sizeof(regexp) + (unsigned)RExC_size * sizeof(regnode), char);
0f79a09d 1725#endif
c277df42 1726 r->refcnt = 1;
bbce6d69 1727 r->prelen = xend - exp;
5cfc7842 1728 r->precomp = savepvn(RExC_precomp, r->prelen);
cf93c79d
IZ
1729 r->subbeg = NULL;
1730 r->reganch = pm->op_pmflags & PMf_COMPILETIME;
830247a4 1731 r->nparens = RExC_npar - 1; /* set early to validate backrefs */
4327152a
IZ
1732
1733 r->substrs = 0; /* Useful during FAIL. */
1734 r->startp = 0; /* Useful during FAIL. */
1735 r->endp = 0; /* Useful during FAIL. */
1736
fac92740
MJD
1737 Newz(1304, r->offsets, 2*RExC_size+1, U32); /* MJD 20001228 */
1738 if (r->offsets) {
1739 r->offsets[0] = RExC_size;
1740 }
1741 DEBUG_r(PerlIO_printf(Perl_debug_log,
392fbf5d 1742 "%s %"UVuf" bytes for offset annotations.\n",
fac92740 1743 r->offsets ? "Got" : "Couldn't get",
392fbf5d 1744 (UV)((2*RExC_size+1) * sizeof(U32))));
fac92740 1745
830247a4 1746 RExC_rx = r;
bbce6d69 1747
1748 /* Second pass: emit code. */
055bb491 1749 RExC_flags16 = pm->op_pmflags; /* don't let top level (?i) bleed */
830247a4
IZ
1750 RExC_parse = exp;
1751 RExC_end = xend;
1752 RExC_naughty = 0;
1753 RExC_npar = 1;
fac92740 1754 RExC_emit_start = r->program;
830247a4 1755 RExC_emit = r->program;
2cd61cdb 1756 /* Store the count of eval-groups for security checks: */
830247a4
IZ
1757 RExC_emit->next_off = ((RExC_seen_evals > U16_MAX) ? U16_MAX : RExC_seen_evals);
1758 REGC((U8)REG_MAGIC, (char*) RExC_emit++);
c277df42 1759 r->data = 0;
830247a4 1760 if (reg(pRExC_state, 0, &flags) == NULL)
a0d0e21e
LW
1761 return(NULL);
1762
1763 /* Dig out information for optimizations. */
cf93c79d 1764 r->reganch = pm->op_pmflags & PMf_COMPILETIME; /* Again? */
830247a4 1765 pm->op_pmflags = RExC_flags16;
a0ed51b3
LW
1766 if (UTF)
1767 r->reganch |= ROPT_UTF8;
c277df42 1768 r->regstclass = NULL;
830247a4 1769 if (RExC_naughty >= 10) /* Probably an expensive pattern. */
a0ed51b3 1770 r->reganch |= ROPT_NAUGHTY;
c277df42 1771 scan = r->program + 1; /* First BRANCH. */
2779dcf1
IZ
1772
1773 /* XXXX To minimize changes to RE engine we always allocate
1774 3-units-long substrs field. */
1775 Newz(1004, r->substrs, 1, struct reg_substr_data);
1776
2c2d71f5 1777 StructCopy(&zero_scan_data, &data, scan_data_t);
653099ff 1778 /* XXXX Should not we check for something else? Usually it is OPEN1... */
c277df42 1779 if (OP(scan) != BRANCH) { /* Only one top-level choice. */
c277df42 1780 I32 fake;
c5254dd6 1781 STRLEN longest_float_length, longest_fixed_length;
653099ff
GS
1782 struct regnode_charclass_class ch_class;
1783 int stclass_flag;
cb434fcc 1784 I32 last_close = 0;
a0d0e21e
LW
1785
1786 first = scan;
c277df42 1787 /* Skip introductions and multiplicators >= 1. */
a0d0e21e 1788 while ((OP(first) == OPEN && (sawopen = 1)) ||
653099ff 1789 /* An OR of *one* alternative - should not happen now. */
a0d0e21e
LW
1790 (OP(first) == BRANCH && OP(regnext(first)) != BRANCH) ||
1791 (OP(first) == PLUS) ||
1792 (OP(first) == MINMOD) ||
653099ff 1793 /* An {n,m} with n>0 */
22c35a8c 1794 (PL_regkind[(U8)OP(first)] == CURLY && ARG1(first) > 0) ) {
a0d0e21e
LW
1795 if (OP(first) == PLUS)
1796 sawplus = 1;
1797 else
1798 first += regarglen[(U8)OP(first)];
1799 first = NEXTOPER(first);
a687059c
LW
1800 }
1801
a0d0e21e
LW
1802 /* Starting-point info. */
1803 again:
653099ff 1804 if (PL_regkind[(U8)OP(first)] == EXACT) {
1aa99e6b
IH
1805 if (OP(first) == EXACT)
1806 ; /* Empty, get anchored substr later. */
1807 else if ((OP(first) == EXACTF || OP(first) == EXACTFL))
b3c9acc1
IZ
1808 r->regstclass = first;
1809 }
653099ff 1810 else if (strchr((char*)PL_simple,OP(first)))
a0d0e21e 1811 r->regstclass = first;
22c35a8c
GS
1812 else if (PL_regkind[(U8)OP(first)] == BOUND ||
1813 PL_regkind[(U8)OP(first)] == NBOUND)
a0d0e21e 1814 r->regstclass = first;
22c35a8c 1815 else if (PL_regkind[(U8)OP(first)] == BOL) {
cad2e5aa
JH
1816 r->reganch |= (OP(first) == MBOL
1817 ? ROPT_ANCH_MBOL
1818 : (OP(first) == SBOL
1819 ? ROPT_ANCH_SBOL
1820 : ROPT_ANCH_BOL));
a0d0e21e 1821 first = NEXTOPER(first);
774d564b 1822 goto again;
1823 }
1824 else if (OP(first) == GPOS) {
1825 r->reganch |= ROPT_ANCH_GPOS;
1826 first = NEXTOPER(first);
1827 goto again;
a0d0e21e
LW
1828 }
1829 else if ((OP(first) == STAR &&
22c35a8c 1830 PL_regkind[(U8)OP(NEXTOPER(first))] == REG_ANY) &&
a0d0e21e
LW
1831 !(r->reganch & ROPT_ANCH) )
1832 {
1833 /* turn .* into ^.* with an implied $*=1 */
cad2e5aa
JH
1834 int type = OP(NEXTOPER(first));
1835
ffc61ed2 1836 if (type == REG_ANY)
cad2e5aa
JH
1837 type = ROPT_ANCH_MBOL;
1838 else
1839 type = ROPT_ANCH_SBOL;
1840
1841 r->reganch |= type | ROPT_IMPLICIT;
a0d0e21e 1842 first = NEXTOPER(first);
774d564b 1843 goto again;
a0d0e21e 1844 }
b81d288d 1845 if (sawplus && (!sawopen || !RExC_sawback)
830247a4 1846 && !(RExC_seen & REG_SEEN_EVAL)) /* May examine pos and $& */
cad2e5aa
JH
1847 /* x+ must match at the 1st pos of run of x's */
1848 r->reganch |= ROPT_SKIP;
a0d0e21e 1849
c277df42 1850 /* Scan is after the zeroth branch, first is atomic matcher. */
b81d288d 1851 DEBUG_r(PerlIO_printf(Perl_debug_log, "first at %"IVdf"\n",
b900a521 1852 (IV)(first - scan + 1)));
a0d0e21e
LW
1853 /*
1854 * If there's something expensive in the r.e., find the
1855 * longest literal string that must appear and make it the
1856 * regmust. Resolve ties in favor of later strings, since
1857 * the regstart check works with the beginning of the r.e.
1858 * and avoiding duplication strengthens checking. Not a
1859 * strong reason, but sufficient in the absence of others.
1860 * [Now we resolve ties in favor of the earlier string if
c277df42 1861 * it happens that c_offset_min has been invalidated, since the
a0d0e21e
LW
1862 * earlier string may buy us something the later one won't.]
1863 */
a0d0e21e 1864 minlen = 0;
a687059c 1865
79cb57f6
GS
1866 data.longest_fixed = newSVpvn("",0);
1867 data.longest_float = newSVpvn("",0);
1868 data.last_found = newSVpvn("",0);
c277df42
IZ
1869 data.longest = &(data.longest_fixed);
1870 first = scan;
653099ff 1871 if (!r->regstclass) {
830247a4 1872 cl_init(pRExC_state, &ch_class);
653099ff
GS
1873 data.start_class = &ch_class;
1874 stclass_flag = SCF_DO_STCLASS_AND;
1875 } else /* XXXX Check for BOUND? */
1876 stclass_flag = 0;
cb434fcc 1877 data.last_closep = &last_close;
653099ff 1878
830247a4 1879 minlen = study_chunk(pRExC_state, &first, &fake, scan + RExC_size, /* Up to end */
e1901655 1880 &data, SCF_DO_SUBSTR | SCF_WHILEM_VISITED_POS | stclass_flag);
830247a4 1881 if ( RExC_npar == 1 && data.longest == &(data.longest_fixed)
b81d288d 1882 && data.last_start_min == 0 && data.last_end > 0
830247a4
IZ
1883 && !RExC_seen_zerolen
1884 && (!(RExC_seen & REG_SEEN_GPOS) || (r->reganch & ROPT_ANCH_GPOS)))
c277df42 1885 r->reganch |= ROPT_CHECK_ALL;
830247a4 1886 scan_commit(pRExC_state, &data);
c277df42
IZ
1887 SvREFCNT_dec(data.last_found);
1888
a0ed51b3 1889 longest_float_length = CHR_SVLEN(data.longest_float);
c5254dd6 1890 if (longest_float_length
c277df42
IZ
1891 || (data.flags & SF_FL_BEFORE_EOL
1892 && (!(data.flags & SF_FL_BEFORE_MEOL)
830247a4 1893 || (RExC_flags16 & PMf_MULTILINE)))) {
cf93c79d
IZ
1894 int t;
1895
a0ed51b3 1896 if (SvCUR(data.longest_fixed) /* ok to leave SvCUR */
aca2d497
IZ
1897 && data.offset_fixed == data.offset_float_min
1898 && SvCUR(data.longest_fixed) == SvCUR(data.longest_float))
1899 goto remove_float; /* As in (a)+. */
1900
c277df42
IZ
1901 r->float_substr = data.longest_float;
1902 r->float_min_offset = data.offset_float_min;
1903 r->float_max_offset = data.offset_float_max;
cf93c79d
IZ
1904 t = (data.flags & SF_FL_BEFORE_EOL /* Can't have SEOL and MULTI */
1905 && (!(data.flags & SF_FL_BEFORE_MEOL)
830247a4 1906 || (RExC_flags16 & PMf_MULTILINE)));
cf93c79d 1907 fbm_compile(r->float_substr, t ? FBMcf_TAIL : 0);
a0ed51b3
LW
1908 }
1909 else {
aca2d497 1910 remove_float:
c277df42
IZ
1911 r->float_substr = Nullsv;
1912 SvREFCNT_dec(data.longest_float);
c5254dd6 1913 longest_float_length = 0;
a0d0e21e 1914 }
c277df42 1915
a0ed51b3 1916 longest_fixed_length = CHR_SVLEN(data.longest_fixed);
c5254dd6 1917 if (longest_fixed_length
c277df42
IZ
1918 || (data.flags & SF_FIX_BEFORE_EOL /* Cannot have SEOL and MULTI */
1919 && (!(data.flags & SF_FIX_BEFORE_MEOL)
830247a4 1920 || (RExC_flags16 & PMf_MULTILINE)))) {
cf93c79d
IZ
1921 int t;
1922
c277df42
IZ
1923 r->anchored_substr = data.longest_fixed;
1924 r->anchored_offset = data.offset_fixed;
cf93c79d
IZ
1925 t = (data.flags & SF_FIX_BEFORE_EOL /* Can't have SEOL and MULTI */
1926 && (!(data.flags & SF_FIX_BEFORE_MEOL)
830247a4 1927 || (RExC_flags16 & PMf_MULTILINE)));
cf93c79d 1928 fbm_compile(r->anchored_substr, t ? FBMcf_TAIL : 0);
a0ed51b3
LW
1929 }
1930 else {
c277df42
IZ
1931 r->anchored_substr = Nullsv;
1932 SvREFCNT_dec(data.longest_fixed);
c5254dd6 1933 longest_fixed_length = 0;
a0d0e21e 1934 }
b81d288d 1935 if (r->regstclass
ffc61ed2 1936 && (OP(r->regstclass) == REG_ANY || OP(r->regstclass) == SANY))
653099ff
GS
1937 r->regstclass = NULL;
1938 if ((!r->anchored_substr || r->anchored_offset) && stclass_flag
1939 && !(data.start_class->flags & ANYOF_EOS)
1940 && !cl_is_anything(data.start_class)) {
830247a4 1941 I32 n = add_data(pRExC_state, 1, "f");
653099ff 1942
b81d288d 1943 New(1006, RExC_rx->data->data[n], 1,
653099ff
GS
1944 struct regnode_charclass_class);
1945 StructCopy(data.start_class,
830247a4 1946 (struct regnode_charclass_class*)RExC_rx->data->data[n],
653099ff 1947 struct regnode_charclass_class);
830247a4 1948 r->regstclass = (regnode*)RExC_rx->data->data[n];
653099ff 1949 r->reganch &= ~ROPT_SKIP; /* Used in find_byclass(). */
ffc61ed2 1950 PL_regdata = r->data; /* for regprop() */
9c5ffd7c
JH
1951 DEBUG_r({ SV *sv = sv_newmortal();
1952 regprop(sv, (regnode*)data.start_class);
1953 PerlIO_printf(Perl_debug_log,
1954 "synthetic stclass `%s'.\n",
1955 SvPVX(sv));});
653099ff 1956 }
c277df42
IZ
1957
1958 /* A temporary algorithm prefers floated substr to fixed one to dig more info. */
c5254dd6 1959 if (longest_fixed_length > longest_float_length) {
c277df42
IZ
1960 r->check_substr = r->anchored_substr;
1961 r->check_offset_min = r->check_offset_max = r->anchored_offset;
1962 if (r->reganch & ROPT_ANCH_SINGLE)
1963 r->reganch |= ROPT_NOSCAN;
a0ed51b3
LW
1964 }
1965 else {
c277df42
IZ
1966 r->check_substr = r->float_substr;
1967 r->check_offset_min = data.offset_float_min;
1968 r->check_offset_max = data.offset_float_max;
a0d0e21e 1969 }
30382c73
IZ
1970 /* XXXX Currently intuiting is not compatible with ANCH_GPOS.
1971 This should be changed ASAP! */
1972 if (r->check_substr && !(r->reganch & ROPT_ANCH_GPOS)) {
cad2e5aa
JH
1973 r->reganch |= RE_USE_INTUIT;
1974 if (SvTAIL(r->check_substr))
1975 r->reganch |= RE_INTUIT_TAIL;
1976 }
a0ed51b3
LW
1977 }
1978 else {
c277df42
IZ
1979 /* Several toplevels. Best we can is to set minlen. */
1980 I32 fake;
653099ff 1981 struct regnode_charclass_class ch_class;
cb434fcc 1982 I32 last_close = 0;
c277df42
IZ
1983
1984 DEBUG_r(PerlIO_printf(Perl_debug_log, "\n"));
1985 scan = r->program + 1;
830247a4 1986 cl_init(pRExC_state, &ch_class);
653099ff 1987 data.start_class = &ch_class;
cb434fcc 1988 data.last_closep = &last_close;
e1901655 1989 minlen = study_chunk(pRExC_state, &scan, &fake, scan + RExC_size, &data, SCF_DO_STCLASS_AND|SCF_WHILEM_VISITED_POS);
c277df42 1990 r->check_substr = r->anchored_substr = r->float_substr = Nullsv;
653099ff
GS
1991 if (!(data.start_class->flags & ANYOF_EOS)
1992 && !cl_is_anything(data.start_class)) {
830247a4 1993 I32 n = add_data(pRExC_state, 1, "f");
653099ff 1994
b81d288d 1995 New(1006, RExC_rx->data->data[n], 1,
653099ff
GS
1996 struct regnode_charclass_class);
1997 StructCopy(data.start_class,
830247a4 1998 (struct regnode_charclass_class*)RExC_rx->data->data[n],
653099ff 1999 struct regnode_charclass_class);
830247a4 2000 r->regstclass = (regnode*)RExC_rx->data->data[n];
653099ff 2001 r->reganch &= ~ROPT_SKIP; /* Used in find_byclass(). */
9c5ffd7c
JH
2002 DEBUG_r({ SV* sv = sv_newmortal();
2003 regprop(sv, (regnode*)data.start_class);
2004 PerlIO_printf(Perl_debug_log,
2005 "synthetic stclass `%s'.\n",
2006 SvPVX(sv));});
653099ff 2007 }
a0d0e21e
LW
2008 }
2009
a0d0e21e 2010 r->minlen = minlen;
b81d288d 2011 if (RExC_seen & REG_SEEN_GPOS)
c277df42 2012 r->reganch |= ROPT_GPOS_SEEN;
830247a4 2013 if (RExC_seen & REG_SEEN_LOOKBEHIND)
c277df42 2014 r->reganch |= ROPT_LOOKBEHIND_SEEN;
830247a4 2015 if (RExC_seen & REG_SEEN_EVAL)
ce862d02 2016 r->reganch |= ROPT_EVAL_SEEN;
f33976b4
DB
2017 if (RExC_seen & REG_SEEN_CANY)
2018 r->reganch |= ROPT_CANY_SEEN;
830247a4
IZ
2019 Newz(1002, r->startp, RExC_npar, I32);
2020 Newz(1002, r->endp, RExC_npar, I32);
ffc61ed2 2021 PL_regdata = r->data; /* for regprop() */
a0d0e21e
LW
2022 DEBUG_r(regdump(r));
2023 return(r);
a687059c
LW
2024}
2025
2026/*
2027 - reg - regular expression, i.e. main body or parenthesized thing
2028 *
2029 * Caller must absorb opening parenthesis.
2030 *
2031 * Combining parenthesis handling with the base level of regular expression
2032 * is a trifle forced, but the need to tie the tails of the branches to what
2033 * follows makes it hard to avoid.
2034 */
76e3520e 2035STATIC regnode *
830247a4 2036S_reg(pTHX_ RExC_state_t *pRExC_state, I32 paren, I32 *flagp)
c277df42 2037 /* paren: Parenthesized? 0=top, 1=(, inside: changed to letter. */
a687059c 2038{
c277df42
IZ
2039 register regnode *ret; /* Will be the head of the group. */
2040 register regnode *br;
2041 register regnode *lastbr;
2042 register regnode *ender = 0;
a0d0e21e 2043 register I32 parno = 0;
830247a4 2044 I32 flags, oregflags = RExC_flags16, have_branch = 0, open = 0;
9d1d55b5
JP
2045
2046 /* for (?g), (?gc), and (?o) warnings; warning
2047 about (?c) will warn about (?g) -- japhy */
2048
2049 I32 wastedflags = 0x00,
2050 wasted_o = 0x01,
2051 wasted_g = 0x02,
2052 wasted_gc = 0x02 | 0x04,
2053 wasted_c = 0x04;
2054
fac92740 2055 char * parse_start = RExC_parse; /* MJD */
830247a4 2056 char *oregcomp_parse = RExC_parse;
c277df42 2057 char c;
a0d0e21e 2058
821b33a5 2059 *flagp = 0; /* Tentatively. */
a0d0e21e 2060
9d1d55b5 2061
a0d0e21e
LW
2062 /* Make an OPEN node, if parenthesized. */
2063 if (paren) {
fac92740 2064 if (*RExC_parse == '?') { /* (?...) */
ca9dfc88
IZ
2065 U16 posflags = 0, negflags = 0;
2066 U16 *flagsp = &posflags;
0f5d15d6 2067 int logical = 0;
830247a4 2068 char *seqstart = RExC_parse;
ca9dfc88 2069
830247a4
IZ
2070 RExC_parse++;
2071 paren = *RExC_parse++;
c277df42 2072 ret = NULL; /* For look-ahead/behind. */
a0d0e21e 2073 switch (paren) {
fac92740 2074 case '<': /* (?<...) */
830247a4 2075 RExC_seen |= REG_SEEN_LOOKBEHIND;
b81d288d 2076 if (*RExC_parse == '!')
c277df42 2077 paren = ',';
b81d288d 2078 if (*RExC_parse != '=' && *RExC_parse != '!')
c277df42 2079 goto unknown;
830247a4 2080 RExC_parse++;
fac92740
MJD
2081 case '=': /* (?=...) */
2082 case '!': /* (?!...) */
830247a4 2083 RExC_seen_zerolen++;
fac92740
MJD
2084 case ':': /* (?:...) */
2085 case '>': /* (?>...) */
a0d0e21e 2086 break;
fac92740
MJD
2087 case '$': /* (?$...) */
2088 case '@': /* (?@...) */
8615cb43 2089 vFAIL2("Sequence (?%c...) not implemented", (int)paren);
a0d0e21e 2090 break;
fac92740 2091 case '#': /* (?#...) */
830247a4
IZ
2092 while (*RExC_parse && *RExC_parse != ')')
2093 RExC_parse++;
2094 if (*RExC_parse != ')')
c277df42 2095 FAIL("Sequence (?#... not terminated");
830247a4 2096 nextchar(pRExC_state);
a0d0e21e
LW
2097 *flagp = TRYAGAIN;
2098 return NULL;
fac92740 2099 case 'p': /* (?p...) */
f9373011
PM
2100 if (SIZE_ONLY && ckWARN2(WARN_DEPRECATED, WARN_REGEXP))
2101 vWARNdep(RExC_parse, "(?p{}) is deprecated - use (??{})");
8c8ad484 2102 /* FALL THROUGH*/
fac92740 2103 case '?': /* (??...) */
0f5d15d6 2104 logical = 1;
830247a4 2105 paren = *RExC_parse++;
0f5d15d6 2106 /* FALL THROUGH */
fac92740 2107 case '{': /* (?{...}) */
c277df42 2108 {
c277df42
IZ
2109 I32 count = 1, n = 0;
2110 char c;
830247a4 2111 char *s = RExC_parse;
c277df42
IZ
2112 SV *sv;
2113 OP_4tree *sop, *rop;
2114
830247a4
IZ
2115 RExC_seen_zerolen++;
2116 RExC_seen |= REG_SEEN_EVAL;
2117 while (count && (c = *RExC_parse)) {
2118 if (c == '\\' && RExC_parse[1])
2119 RExC_parse++;
b81d288d 2120 else if (c == '{')
c277df42 2121 count++;
b81d288d 2122 else if (c == '}')
c277df42 2123 count--;
830247a4 2124 RExC_parse++;
c277df42 2125 }
830247a4 2126 if (*RExC_parse != ')')
b45f050a 2127 {
b81d288d 2128 RExC_parse = s;
b45f050a
JF
2129 vFAIL("Sequence (?{...}) not terminated or not {}-balanced");
2130 }
c277df42
IZ
2131 if (!SIZE_ONLY) {
2132 AV *av;
b81d288d
AB
2133
2134 if (RExC_parse - 1 - s)
830247a4 2135 sv = newSVpvn(s, RExC_parse - 1 - s);
c277df42 2136 else
79cb57f6 2137 sv = newSVpvn("", 0);
c277df42 2138
569233ed
SB
2139 ENTER;
2140 Perl_save_re_context(aTHX);
c277df42 2141 rop = sv_compile_2op(sv, &sop, "re", &av);
9b978d73
DM
2142 sop->op_private |= OPpREFCOUNTED;
2143 /* re_dup will OpREFCNT_inc */
2144 OpREFCNT_set(sop, 1);
569233ed 2145 LEAVE;
c277df42 2146
830247a4
IZ
2147 n = add_data(pRExC_state, 3, "nop");
2148 RExC_rx->data->data[n] = (void*)rop;
2149 RExC_rx->data->data[n+1] = (void*)sop;
2150 RExC_rx->data->data[n+2] = (void*)av;
c277df42 2151 SvREFCNT_dec(sv);
a0ed51b3 2152 }
e24b16f9 2153 else { /* First pass */
830247a4 2154 if (PL_reginterp_cnt < ++RExC_seen_evals
e24b16f9 2155 && PL_curcop != &PL_compiling)
2cd61cdb
IZ
2156 /* No compiled RE interpolated, has runtime
2157 components ===> unsafe. */
2158 FAIL("Eval-group not allowed at runtime, use re 'eval'");
5b61d3f7 2159 if (PL_tainting && PL_tainted)
cc6b7395 2160 FAIL("Eval-group in insecure regular expression");
c277df42
IZ
2161 }
2162
830247a4 2163 nextchar(pRExC_state);
0f5d15d6 2164 if (logical) {
830247a4 2165 ret = reg_node(pRExC_state, LOGICAL);
0f5d15d6
IZ
2166 if (!SIZE_ONLY)
2167 ret->flags = 2;
830247a4 2168 regtail(pRExC_state, ret, reganode(pRExC_state, EVAL, n));
fac92740 2169 /* deal with the length of this later - MJD */
0f5d15d6
IZ
2170 return ret;
2171 }
830247a4 2172 return reganode(pRExC_state, EVAL, n);
c277df42 2173 }
fac92740 2174 case '(': /* (?(?{...})...) and (?(?=...)...) */
c277df42 2175 {
fac92740 2176 if (RExC_parse[0] == '?') { /* (?(?...)) */
b81d288d
AB
2177 if (RExC_parse[1] == '=' || RExC_parse[1] == '!'
2178 || RExC_parse[1] == '<'
830247a4 2179 || RExC_parse[1] == '{') { /* Lookahead or eval. */
c277df42
IZ
2180 I32 flag;
2181
830247a4 2182 ret = reg_node(pRExC_state, LOGICAL);
0f5d15d6
IZ
2183 if (!SIZE_ONLY)
2184 ret->flags = 1;
830247a4 2185 regtail(pRExC_state, ret, reg(pRExC_state, 1, &flag));
c277df42 2186 goto insert_if;
b81d288d 2187 }
a0ed51b3 2188 }
830247a4 2189 else if (RExC_parse[0] >= '1' && RExC_parse[0] <= '9' ) {
fac92740 2190 /* (?(1)...) */
830247a4 2191 parno = atoi(RExC_parse++);
c277df42 2192
830247a4
IZ
2193 while (isDIGIT(*RExC_parse))
2194 RExC_parse++;
fac92740
MJD
2195 ret = reganode(pRExC_state, GROUPP, parno);
2196
830247a4 2197 if ((c = *nextchar(pRExC_state)) != ')')
b45f050a 2198 vFAIL("Switch condition not recognized");
c277df42 2199 insert_if:
830247a4
IZ
2200 regtail(pRExC_state, ret, reganode(pRExC_state, IFTHEN, 0));
2201 br = regbranch(pRExC_state, &flags, 1);
c277df42 2202 if (br == NULL)
830247a4 2203 br = reganode(pRExC_state, LONGJMP, 0);
c277df42 2204 else
830247a4
IZ
2205 regtail(pRExC_state, br, reganode(pRExC_state, LONGJMP, 0));
2206 c = *nextchar(pRExC_state);
d1b80229
IZ
2207 if (flags&HASWIDTH)
2208 *flagp |= HASWIDTH;
c277df42 2209 if (c == '|') {
830247a4
IZ
2210 lastbr = reganode(pRExC_state, IFTHEN, 0); /* Fake one for optimizer. */
2211 regbranch(pRExC_state, &flags, 1);
2212 regtail(pRExC_state, ret, lastbr);
d1b80229
IZ
2213 if (flags&HASWIDTH)
2214 *flagp |= HASWIDTH;
830247a4 2215 c = *nextchar(pRExC_state);
a0ed51b3
LW
2216 }
2217 else
c277df42
IZ
2218 lastbr = NULL;
2219 if (c != ')')
8615cb43 2220 vFAIL("Switch (?(condition)... contains too many branches");
830247a4
IZ
2221 ender = reg_node(pRExC_state, TAIL);
2222 regtail(pRExC_state, br, ender);
c277df42 2223 if (lastbr) {
830247a4
IZ
2224 regtail(pRExC_state, lastbr, ender);
2225 regtail(pRExC_state, NEXTOPER(NEXTOPER(lastbr)), ender);
a0ed51b3
LW
2226 }
2227 else
830247a4 2228 regtail(pRExC_state, ret, ender);
c277df42 2229 return ret;
a0ed51b3
LW
2230 }
2231 else {
830247a4 2232 vFAIL2("Unknown switch condition (?(%.2s", RExC_parse);
c277df42
IZ
2233 }
2234 }
1b1626e4 2235 case 0:
830247a4 2236 RExC_parse--; /* for vFAIL to print correctly */
8615cb43 2237 vFAIL("Sequence (? incomplete");
1b1626e4 2238 break;
a0d0e21e 2239 default:
830247a4 2240 --RExC_parse;
fac92740 2241 parse_flags: /* (?i) */
830247a4 2242 while (*RExC_parse && strchr("iogcmsx", *RExC_parse)) {
9d1d55b5
JP
2243 /* (?g), (?gc) and (?o) are useless here
2244 and must be globally applied -- japhy */
2245
2246 if (*RExC_parse == 'o' || *RExC_parse == 'g') {
2247 if (SIZE_ONLY && ckWARN(WARN_REGEXP)) {
2248 I32 wflagbit = *RExC_parse == 'o' ? wasted_o : wasted_g;
2249 if (! (wastedflags & wflagbit) ) {
2250 wastedflags |= wflagbit;
2251 vWARN5(
2252 RExC_parse + 1,
2253 "Useless (%s%c) - %suse /%c modifier",
2254 flagsp == &negflags ? "?-" : "?",
2255 *RExC_parse,
2256 flagsp == &negflags ? "don't " : "",
2257 *RExC_parse
2258 );
2259 }
2260 }
2261 }
2262 else if (*RExC_parse == 'c') {
2263 if (SIZE_ONLY && ckWARN(WARN_REGEXP)) {
2264 if (! (wastedflags & wasted_c) ) {
2265 wastedflags |= wasted_gc;
2266 vWARN3(
2267 RExC_parse + 1,
2268 "Useless (%sc) - %suse /gc modifier",
2269 flagsp == &negflags ? "?-" : "?",
2270 flagsp == &negflags ? "don't " : ""
2271 );
2272 }
2273 }
2274 }
2275 else { pmflag(flagsp, *RExC_parse); }
2276
830247a4 2277 ++RExC_parse;
ca9dfc88 2278 }
830247a4 2279 if (*RExC_parse == '-') {
ca9dfc88 2280 flagsp = &negflags;
9d1d55b5 2281 wastedflags = 0; /* reset so (?g-c) warns twice */
830247a4 2282 ++RExC_parse;
ca9dfc88 2283 goto parse_flags;
48c036b1 2284 }
830247a4
IZ
2285 RExC_flags16 |= posflags;
2286 RExC_flags16 &= ~negflags;
2287 if (*RExC_parse == ':') {
2288 RExC_parse++;
ca9dfc88
IZ
2289 paren = ':';
2290 break;
2291 }
c277df42 2292 unknown:
830247a4
IZ
2293 if (*RExC_parse != ')') {
2294 RExC_parse++;
2295 vFAIL3("Sequence (%.*s...) not recognized", RExC_parse-seqstart, seqstart);
b45f050a 2296 }
830247a4 2297 nextchar(pRExC_state);
a0d0e21e
LW
2298 *flagp = TRYAGAIN;
2299 return NULL;
2300 }
2301 }
fac92740 2302 else { /* (...) */
830247a4
IZ
2303 parno = RExC_npar;
2304 RExC_npar++;
2305 ret = reganode(pRExC_state, OPEN, parno);
fac92740
MJD
2306 Set_Node_Length(ret, 1); /* MJD */
2307 Set_Node_Offset(ret, RExC_parse); /* MJD */
c277df42 2308 open = 1;
a0d0e21e 2309 }
a0ed51b3 2310 }
fac92740 2311 else /* ! paren */
a0d0e21e
LW
2312 ret = NULL;
2313
2314 /* Pick up the branches, linking them together. */
fac92740 2315 parse_start = RExC_parse; /* MJD */
830247a4 2316 br = regbranch(pRExC_state, &flags, 1);
fac92740
MJD
2317 /* branch_len = (paren != 0); */
2318
a0d0e21e
LW
2319 if (br == NULL)
2320 return(NULL);
830247a4
IZ
2321 if (*RExC_parse == '|') {
2322 if (!SIZE_ONLY && RExC_extralen) {
2323 reginsert(pRExC_state, BRANCHJ, br);
a0ed51b3 2324 }
fac92740 2325 else { /* MJD */
830247a4 2326 reginsert(pRExC_state, BRANCH, br);
fac92740
MJD
2327 Set_Node_Length(br, paren != 0);
2328 Set_Node_Offset_To_R(br-RExC_emit_start, parse_start-RExC_start);
2329 }
c277df42
IZ
2330 have_branch = 1;
2331 if (SIZE_ONLY)
830247a4 2332 RExC_extralen += 1; /* For BRANCHJ-BRANCH. */
a0ed51b3
LW
2333 }
2334 else if (paren == ':') {
c277df42
IZ
2335 *flagp |= flags&SIMPLE;
2336 }
2337 if (open) { /* Starts with OPEN. */
830247a4 2338 regtail(pRExC_state, ret, br); /* OPEN -> first. */
a0ed51b3
LW
2339 }
2340 else if (paren != '?') /* Not Conditional */
a0d0e21e 2341 ret = br;
821b33a5
IZ
2342 if (flags&HASWIDTH)
2343 *flagp |= HASWIDTH;
a0d0e21e 2344 *flagp |= flags&SPSTART;
c277df42 2345 lastbr = br;
830247a4
IZ
2346 while (*RExC_parse == '|') {
2347 if (!SIZE_ONLY && RExC_extralen) {
2348 ender = reganode(pRExC_state, LONGJMP,0);
2349 regtail(pRExC_state, NEXTOPER(NEXTOPER(lastbr)), ender); /* Append to the previous. */
c277df42
IZ
2350 }
2351 if (SIZE_ONLY)
830247a4
IZ
2352 RExC_extralen += 2; /* Account for LONGJMP. */
2353 nextchar(pRExC_state);
2354 br = regbranch(pRExC_state, &flags, 0);
fac92740 2355
a687059c 2356 if (br == NULL)
a0d0e21e 2357 return(NULL);
830247a4 2358 regtail(pRExC_state, lastbr, br); /* BRANCH -> BRANCH. */
c277df42 2359 lastbr = br;
821b33a5
IZ
2360 if (flags&HASWIDTH)
2361 *flagp |= HASWIDTH;
a687059c 2362 *flagp |= flags&SPSTART;
a0d0e21e
LW
2363 }
2364
c277df42
IZ
2365 if (have_branch || paren != ':') {
2366 /* Make a closing node, and hook it on the end. */
2367 switch (paren) {
2368 case ':':
830247a4 2369 ender = reg_node(pRExC_state, TAIL);
c277df42
IZ
2370 break;
2371 case 1:
830247a4 2372 ender = reganode(pRExC_state, CLOSE, parno);
fac92740
MJD
2373 Set_Node_Offset(ender,RExC_parse+1); /* MJD */
2374 Set_Node_Length(ender,1); /* MJD */
c277df42
IZ
2375 break;
2376 case '<':
c277df42
IZ
2377 case ',':
2378 case '=':
2379 case '!':
c277df42 2380 *flagp &= ~HASWIDTH;
821b33a5
IZ
2381 /* FALL THROUGH */
2382 case '>':
830247a4 2383 ender = reg_node(pRExC_state, SUCCEED);
c277df42
IZ
2384 break;
2385 case 0:
830247a4 2386 ender = reg_node(pRExC_state, END);
c277df42
IZ
2387 break;
2388 }
830247a4 2389 regtail(pRExC_state, lastbr, ender);
a0d0e21e 2390
c277df42
IZ
2391 if (have_branch) {
2392 /* Hook the tails of the branches to the closing node. */
2393 for (br = ret; br != NULL; br = regnext(br)) {
830247a4 2394 regoptail(pRExC_state, br, ender);
c277df42
IZ
2395 }
2396 }
a0d0e21e 2397 }
c277df42
IZ
2398
2399 {
2400 char *p;
2401 static char parens[] = "=!<,>";
2402
2403 if (paren && (p = strchr(parens, paren))) {
2404 int node = ((p - parens) % 2) ? UNLESSM : IFMATCH;
2405 int flag = (p - parens) > 1;
2406
2407 if (paren == '>')
2408 node = SUSPEND, flag = 0;
830247a4 2409 reginsert(pRExC_state, node,ret);
c277df42 2410 ret->flags = flag;
830247a4 2411 regtail(pRExC_state, ret, reg_node(pRExC_state, TAIL));
c277df42 2412 }
a0d0e21e
LW
2413 }
2414
2415 /* Check for proper termination. */
ce3e6498 2416 if (paren) {
830247a4
IZ
2417 RExC_flags16 = oregflags;
2418 if (RExC_parse >= RExC_end || *nextchar(pRExC_state) != ')') {
2419 RExC_parse = oregcomp_parse;
380a0633 2420 vFAIL("Unmatched (");
ce3e6498 2421 }
a0ed51b3 2422 }
830247a4
IZ
2423 else if (!paren && RExC_parse < RExC_end) {
2424 if (*RExC_parse == ')') {
2425 RExC_parse++;
380a0633 2426 vFAIL("Unmatched )");
a0ed51b3
LW
2427 }
2428 else
b45f050a 2429 FAIL("Junk on end of regexp"); /* "Can't happen". */
a0d0e21e
LW
2430 /* NOTREACHED */
2431 }
a687059c 2432
a0d0e21e 2433 return(ret);
a687059c
LW
2434}
2435
2436/*
2437 - regbranch - one alternative of an | operator
2438 *
2439 * Implements the concatenation operator.
2440 */
76e3520e 2441STATIC regnode *
830247a4 2442S_regbranch(pTHX_ RExC_state_t *pRExC_state, I32 *flagp, I32 first)
a687059c 2443{
c277df42
IZ
2444 register regnode *ret;
2445 register regnode *chain = NULL;
2446 register regnode *latest;
2447 I32 flags = 0, c = 0;
a0d0e21e 2448
b81d288d 2449 if (first)
c277df42
IZ
2450 ret = NULL;
2451 else {
b81d288d 2452 if (!SIZE_ONLY && RExC_extralen)
830247a4 2453 ret = reganode(pRExC_state, BRANCHJ,0);
fac92740 2454 else {
830247a4 2455 ret = reg_node(pRExC_state, BRANCH);
fac92740
MJD
2456 Set_Node_Length(ret, 1);
2457 }
c277df42
IZ
2458 }
2459
b81d288d 2460 if (!first && SIZE_ONLY)
830247a4 2461 RExC_extralen += 1; /* BRANCHJ */
b81d288d 2462
c277df42 2463 *flagp = WORST; /* Tentatively. */
a0d0e21e 2464
830247a4
IZ
2465 RExC_parse--;
2466 nextchar(pRExC_state);
2467 while (RExC_parse < RExC_end && *RExC_parse != '|' && *RExC_parse != ')') {
a0d0e21e 2468 flags &= ~TRYAGAIN;
830247a4 2469 latest = regpiece(pRExC_state, &flags);
a0d0e21e
LW
2470 if (latest == NULL) {
2471 if (flags & TRYAGAIN)
2472 continue;
2473 return(NULL);
a0ed51b3
LW
2474 }
2475 else if (ret == NULL)
c277df42 2476 ret = latest;
a0d0e21e 2477 *flagp |= flags&HASWIDTH;
c277df42 2478 if (chain == NULL) /* First piece. */
a0d0e21e
LW
2479 *flagp |= flags&SPSTART;
2480 else {
830247a4
IZ
2481 RExC_naughty++;
2482 regtail(pRExC_state, chain, latest);
a687059c 2483 }
a0d0e21e 2484 chain = latest;
c277df42
IZ
2485 c++;
2486 }
2487 if (chain == NULL) { /* Loop ran zero times. */
830247a4 2488 chain = reg_node(pRExC_state, NOTHING);
c277df42
IZ
2489 if (ret == NULL)
2490 ret = chain;
2491 }
2492 if (c == 1) {
2493 *flagp |= flags&SIMPLE;
a0d0e21e 2494 }
a687059c 2495
a0d0e21e 2496 return(ret);
a687059c
LW
2497}
2498
2499/*
2500 - regpiece - something followed by possible [*+?]
2501 *
2502 * Note that the branching code sequences used for ? and the general cases
2503 * of * and + are somewhat optimized: they use the same NOTHING node as
2504 * both the endmarker for their branch list and the body of the last branch.
2505 * It might seem that this node could be dispensed with entirely, but the
2506 * endmarker role is not redundant.
2507 */
76e3520e 2508STATIC regnode *
830247a4 2509S_regpiece(pTHX_ RExC_state_t *pRExC_state, I32 *flagp)
a687059c 2510{
c277df42 2511 register regnode *ret;
a0d0e21e
LW
2512 register char op;
2513 register char *next;
2514 I32 flags;
830247a4 2515 char *origparse = RExC_parse;
a0d0e21e
LW
2516 char *maxpos;
2517 I32 min;
c277df42 2518 I32 max = REG_INFTY;
fac92740 2519 char *parse_start;
a0d0e21e 2520
830247a4 2521 ret = regatom(pRExC_state, &flags);
a0d0e21e
LW
2522 if (ret == NULL) {
2523 if (flags & TRYAGAIN)
2524 *flagp |= TRYAGAIN;
2525 return(NULL);
2526 }
2527
830247a4 2528 op = *RExC_parse;
a0d0e21e 2529
830247a4 2530 if (op == '{' && regcurly(RExC_parse)) {
fac92740 2531 parse_start = RExC_parse; /* MJD */
830247a4 2532 next = RExC_parse + 1;
a0d0e21e
LW
2533 maxpos = Nullch;
2534 while (isDIGIT(*next) || *next == ',') {
2535 if (*next == ',') {
2536 if (maxpos)
2537 break;
2538 else
2539 maxpos = next;
a687059c 2540 }
a0d0e21e
LW
2541 next++;
2542 }
2543 if (*next == '}') { /* got one */
2544 if (!maxpos)
2545 maxpos = next;
830247a4
IZ
2546 RExC_parse++;
2547 min = atoi(RExC_parse);
a0d0e21e
LW
2548 if (*maxpos == ',')
2549 maxpos++;
2550 else
830247a4 2551 maxpos = RExC_parse;
a0d0e21e
LW
2552 max = atoi(maxpos);
2553 if (!max && *maxpos != '0')
c277df42
IZ
2554 max = REG_INFTY; /* meaning "infinity" */
2555 else if (max >= REG_INFTY)
8615cb43 2556 vFAIL2("Quantifier in {,} bigger than %d", REG_INFTY - 1);
830247a4
IZ
2557 RExC_parse = next;
2558 nextchar(pRExC_state);
a0d0e21e
LW
2559
2560 do_curly:
2561 if ((flags&SIMPLE)) {
830247a4
IZ
2562 RExC_naughty += 2 + RExC_naughty / 2;
2563 reginsert(pRExC_state, CURLY, ret);
fac92740
MJD
2564 Set_Node_Offset(ret, parse_start+1); /* MJD */
2565 Set_Node_Cur_Length(ret);
a0d0e21e
LW
2566 }
2567 else {
830247a4 2568 regnode *w = reg_node(pRExC_state, WHILEM);
2c2d71f5
JH
2569
2570 w->flags = 0;
830247a4
IZ
2571 regtail(pRExC_state, ret, w);
2572 if (!SIZE_ONLY && RExC_extralen) {
2573 reginsert(pRExC_state, LONGJMP,ret);
2574 reginsert(pRExC_state, NOTHING,ret);
c277df42
IZ
2575 NEXT_OFF(ret) = 3; /* Go over LONGJMP. */
2576 }
830247a4 2577 reginsert(pRExC_state, CURLYX,ret);
fac92740
MJD
2578 /* MJD hk */
2579 Set_Node_Offset(ret, parse_start+1);
2580 Set_Node_Length(ret,
2581 op == '{' ? (RExC_parse - parse_start) : 1);
2582
830247a4 2583 if (!SIZE_ONLY && RExC_extralen)
c277df42 2584 NEXT_OFF(ret) = 3; /* Go over NOTHING to LONGJMP. */
830247a4 2585 regtail(pRExC_state, ret, reg_node(pRExC_state, NOTHING));
c277df42 2586 if (SIZE_ONLY)
830247a4
IZ
2587 RExC_whilem_seen++, RExC_extralen += 3;
2588 RExC_naughty += 4 + RExC_naughty; /* compound interest */
a0d0e21e 2589 }
c277df42 2590 ret->flags = 0;
a0d0e21e
LW
2591
2592 if (min > 0)
821b33a5
IZ
2593 *flagp = WORST;
2594 if (max > 0)
2595 *flagp |= HASWIDTH;
a0d0e21e 2596 if (max && max < min)
8615cb43 2597 vFAIL("Can't do {n,m} with n > m");
c277df42
IZ
2598 if (!SIZE_ONLY) {
2599 ARG1_SET(ret, min);
2600 ARG2_SET(ret, max);
a687059c 2601 }
a687059c 2602
a0d0e21e 2603 goto nest_check;
a687059c 2604 }
a0d0e21e 2605 }
a687059c 2606
a0d0e21e
LW
2607 if (!ISMULT1(op)) {
2608 *flagp = flags;
a687059c 2609 return(ret);
a0d0e21e 2610 }
bb20fd44 2611
c277df42 2612#if 0 /* Now runtime fix should be reliable. */
b45f050a
JF
2613
2614 /* if this is reinstated, don't forget to put this back into perldiag:
2615
2616 =item Regexp *+ operand could be empty at {#} in regex m/%s/
2617
2618 (F) The part of the regexp subject to either the * or + quantifier
2619 could match an empty string. The {#} shows in the regular
2620 expression about where the problem was discovered.
2621
2622 */
2623
bb20fd44 2624 if (!(flags&HASWIDTH) && op != '?')
b45f050a 2625 vFAIL("Regexp *+ operand could be empty");
b81d288d 2626#endif
bb20fd44 2627
fac92740 2628 parse_start = RExC_parse;
830247a4 2629 nextchar(pRExC_state);
a0d0e21e 2630
821b33a5 2631 *flagp = (op != '+') ? (WORST|SPSTART|HASWIDTH) : (WORST|HASWIDTH);
a0d0e21e
LW
2632
2633 if (op == '*' && (flags&SIMPLE)) {
830247a4 2634 reginsert(pRExC_state, STAR, ret);
c277df42 2635 ret->flags = 0;
830247a4 2636 RExC_naughty += 4;
a0d0e21e
LW
2637 }
2638 else if (op == '*') {
2639 min = 0;
2640 goto do_curly;
a0ed51b3
LW
2641 }
2642 else if (op == '+' && (flags&SIMPLE)) {
830247a4 2643 reginsert(pRExC_state, PLUS, ret);
c277df42 2644 ret->flags = 0;
830247a4 2645 RExC_naughty += 3;
a0d0e21e
LW
2646 }
2647 else if (op == '+') {
2648 min = 1;
2649 goto do_curly;
a0ed51b3
LW
2650 }
2651 else if (op == '?') {
a0d0e21e
LW
2652 min = 0; max = 1;
2653 goto do_curly;
2654 }
2655 nest_check:
e476b1b5 2656 if (ckWARN(WARN_REGEXP) && !SIZE_ONLY && !(flags&HASWIDTH) && max > REG_INFTY/3) {
830247a4 2657 vWARN3(RExC_parse,
b45f050a 2658 "%.*s matches null string many times",
830247a4 2659 RExC_parse - origparse,
b45f050a 2660 origparse);
a0d0e21e
LW
2661 }
2662
830247a4
IZ
2663 if (*RExC_parse == '?') {
2664 nextchar(pRExC_state);
2665 reginsert(pRExC_state, MINMOD, ret);
2666 regtail(pRExC_state, ret, ret + NODE_STEP_REGNODE);
a0d0e21e 2667 }
830247a4
IZ
2668 if (ISMULT2(RExC_parse)) {
2669 RExC_parse++;
b45f050a
JF
2670 vFAIL("Nested quantifiers");
2671 }
a0d0e21e
LW
2672
2673 return(ret);
a687059c
LW
2674}
2675
2676/*
2677 - regatom - the lowest level
2678 *
2679 * Optimization: gobbles an entire sequence of ordinary characters so that
2680 * it can turn them into a single node, which is smaller to store and
2681 * faster to run. Backslashed characters are exceptions, each becoming a
2682 * separate node; the code is simpler that way and it's not worth fixing.
2683 *
b45f050a 2684 * [Yes, it is worth fixing, some scripts can run twice the speed.] */
76e3520e 2685STATIC regnode *
830247a4 2686S_regatom(pTHX_ RExC_state_t *pRExC_state, I32 *flagp)
a687059c 2687{
c277df42 2688 register regnode *ret = 0;
a0d0e21e 2689 I32 flags;
f06dbbb7 2690 char *parse_start = 0;
a0d0e21e
LW
2691
2692 *flagp = WORST; /* Tentatively. */
2693
2694tryagain:
830247a4 2695 switch (*RExC_parse) {
a0d0e21e 2696 case '^':
830247a4
IZ
2697 RExC_seen_zerolen++;
2698 nextchar(pRExC_state);
2699 if (RExC_flags16 & PMf_MULTILINE)
2700 ret = reg_node(pRExC_state, MBOL);
2701 else if (RExC_flags16 & PMf_SINGLELINE)
2702 ret = reg_node(pRExC_state, SBOL);
a0d0e21e 2703 else
830247a4 2704 ret = reg_node(pRExC_state, BOL);
fac92740 2705 Set_Node_Length(ret, 1); /* MJD */
a0d0e21e
LW
2706 break;
2707 case '$':
830247a4 2708 nextchar(pRExC_state);
b81d288d 2709 if (*RExC_parse)
830247a4
IZ
2710 RExC_seen_zerolen++;
2711 if (RExC_flags16 & PMf_MULTILINE)
2712 ret = reg_node(pRExC_state, MEOL);
2713 else if (RExC_flags16 & PMf_SINGLELINE)
2714 ret = reg_node(pRExC_state, SEOL);
a0d0e21e 2715 else
830247a4 2716 ret = reg_node(pRExC_state, EOL);
fac92740 2717 Set_Node_Length(ret, 1); /* MJD */
a0d0e21e
LW
2718 break;
2719 case '.':
830247a4 2720 nextchar(pRExC_state);
cce850e4 2721 if (RExC_flags16 & PMf_SINGLELINE)
ffc61ed2
JH
2722 ret = reg_node(pRExC_state, SANY);
2723 else
2724 ret = reg_node(pRExC_state, REG_ANY);
2725 *flagp |= HASWIDTH|SIMPLE;
830247a4 2726 RExC_naughty++;
fac92740 2727 Set_Node_Length(ret, 1); /* MJD */
a0d0e21e
LW
2728 break;
2729 case '[':
b45f050a 2730 {
830247a4 2731 char *oregcomp_parse = ++RExC_parse;
ffc61ed2 2732 ret = regclass(pRExC_state);
830247a4
IZ
2733 if (*RExC_parse != ']') {
2734 RExC_parse = oregcomp_parse;
b45f050a
JF
2735 vFAIL("Unmatched [");
2736 }
830247a4 2737 nextchar(pRExC_state);
a0d0e21e 2738 *flagp |= HASWIDTH|SIMPLE;
fac92740 2739 Set_Node_Length(ret, RExC_parse - oregcomp_parse + 1); /* MJD */
a0d0e21e 2740 break;
b45f050a 2741 }
a0d0e21e 2742 case '(':
830247a4
IZ
2743 nextchar(pRExC_state);
2744 ret = reg(pRExC_state, 1, &flags);
a0d0e21e 2745 if (ret == NULL) {
bf93d4cc 2746 if (flags & TRYAGAIN) {
830247a4 2747 if (RExC_parse == RExC_end) {
bf93d4cc
GS
2748 /* Make parent create an empty node if needed. */
2749 *flagp |= TRYAGAIN;
2750 return(NULL);
2751 }
a0d0e21e 2752 goto tryagain;
bf93d4cc 2753 }
a0d0e21e
LW
2754 return(NULL);
2755 }
c277df42 2756 *flagp |= flags&(HASWIDTH|SPSTART|SIMPLE);
a0d0e21e
LW
2757 break;
2758 case '|':
2759 case ')':
2760 if (flags & TRYAGAIN) {
2761 *flagp |= TRYAGAIN;
2762 return NULL;
2763 }
b45f050a 2764 vFAIL("Internal urp");
a0d0e21e
LW
2765 /* Supposed to be caught earlier. */
2766 break;
85afd4ae 2767 case '{':
830247a4
IZ
2768 if (!regcurly(RExC_parse)) {
2769 RExC_parse++;
85afd4ae
CS
2770 goto defchar;
2771 }
2772 /* FALL THROUGH */
a0d0e21e
LW
2773 case '?':
2774 case '+':
2775 case '*':
830247a4 2776 RExC_parse++;
b45f050a 2777 vFAIL("Quantifier follows nothing");
a0d0e21e
LW
2778 break;
2779 case '\\':
830247a4 2780 switch (*++RExC_parse) {
a0d0e21e 2781 case 'A':
830247a4
IZ
2782 RExC_seen_zerolen++;
2783 ret = reg_node(pRExC_state, SBOL);
a0d0e21e 2784 *flagp |= SIMPLE;
830247a4 2785 nextchar(pRExC_state);
fac92740 2786 Set_Node_Length(ret, 2); /* MJD */
a0d0e21e
LW
2787 break;
2788 case 'G':
830247a4
IZ
2789 ret = reg_node(pRExC_state, GPOS);
2790 RExC_seen |= REG_SEEN_GPOS;
a0d0e21e 2791 *flagp |= SIMPLE;
830247a4 2792 nextchar(pRExC_state);
fac92740 2793 Set_Node_Length(ret, 2); /* MJD */
a0d0e21e
LW
2794 break;
2795 case 'Z':
830247a4 2796 ret = reg_node(pRExC_state, SEOL);
a0d0e21e 2797 *flagp |= SIMPLE;
830247a4 2798 nextchar(pRExC_state);
a0d0e21e 2799 break;
b85d18e9 2800 case 'z':
830247a4 2801 ret = reg_node(pRExC_state, EOS);
b85d18e9 2802 *flagp |= SIMPLE;
830247a4
IZ
2803 RExC_seen_zerolen++; /* Do not optimize RE away */
2804 nextchar(pRExC_state);
fac92740 2805 Set_Node_Length(ret, 2); /* MJD */
b85d18e9 2806 break;
4a2d328f 2807 case 'C':
f33976b4
DB
2808 ret = reg_node(pRExC_state, CANY);
2809 RExC_seen |= REG_SEEN_CANY;
a0ed51b3 2810 *flagp |= HASWIDTH|SIMPLE;
830247a4 2811 nextchar(pRExC_state);
fac92740 2812 Set_Node_Length(ret, 2); /* MJD */
a0ed51b3
LW
2813 break;
2814 case 'X':
830247a4 2815 ret = reg_node(pRExC_state, CLUMP);
a0ed51b3 2816 *flagp |= HASWIDTH;
830247a4 2817 nextchar(pRExC_state);
fac92740 2818 Set_Node_Length(ret, 2); /* MJD */
a0ed51b3 2819 break;
a0d0e21e 2820 case 'w':
ffc61ed2 2821 ret = reg_node(pRExC_state, LOC ? ALNUML : ALNUM);
a0d0e21e 2822 *flagp |= HASWIDTH|SIMPLE;
830247a4 2823 nextchar(pRExC_state);
fac92740 2824 Set_Node_Length(ret, 2); /* MJD */
a0d0e21e
LW
2825 break;
2826 case 'W':
ffc61ed2 2827 ret = reg_node(pRExC_state, LOC ? NALNUML : NALNUM);
a0d0e21e 2828 *flagp |= HASWIDTH|SIMPLE;
830247a4 2829 nextchar(pRExC_state);
fac92740 2830 Set_Node_Length(ret, 2); /* MJD */
a0d0e21e
LW
2831 break;
2832 case 'b':
830247a4
IZ
2833 RExC_seen_zerolen++;
2834 RExC_seen |= REG_SEEN_LOOKBEHIND;
ffc61ed2 2835 ret = reg_node(pRExC_state, LOC ? BOUNDL : BOUND);
a0d0e21e 2836 *flagp |= SIMPLE;
830247a4 2837 nextchar(pRExC_state);
fac92740 2838 Set_Node_Length(ret, 2); /* MJD */
a0d0e21e
LW
2839 break;
2840 case 'B':
830247a4
IZ
2841 RExC_seen_zerolen++;
2842 RExC_seen |= REG_SEEN_LOOKBEHIND;
ffc61ed2 2843 ret = reg_node(pRExC_state, LOC ? NBOUNDL : NBOUND);
a0d0e21e 2844 *flagp |= SIMPLE;
830247a4 2845 nextchar(pRExC_state);
fac92740 2846 Set_Node_Length(ret, 2); /* MJD */
a0d0e21e
LW
2847 break;
2848 case 's':
ffc61ed2 2849 ret = reg_node(pRExC_state, LOC ? SPACEL : SPACE);
a0d0e21e 2850 *flagp |= HASWIDTH|SIMPLE;
830247a4 2851 nextchar(pRExC_state);
fac92740 2852 Set_Node_Length(ret, 2); /* MJD */
a0d0e21e
LW
2853 break;
2854 case 'S':
ffc61ed2 2855 ret = reg_node(pRExC_state, LOC ? NSPACEL : NSPACE);
a0d0e21e 2856 *flagp |= HASWIDTH|SIMPLE;
830247a4 2857 nextchar(pRExC_state);
fac92740 2858 Set_Node_Length(ret, 2); /* MJD */
a0d0e21e
LW
2859 break;
2860 case 'd':
ffc61ed2 2861 ret = reg_node(pRExC_state, DIGIT);
a0d0e21e 2862 *flagp |= HASWIDTH|SIMPLE;
830247a4 2863 nextchar(pRExC_state);
fac92740 2864 Set_Node_Length(ret, 2); /* MJD */
a0d0e21e
LW
2865 break;
2866 case 'D':
ffc61ed2 2867 ret = reg_node(pRExC_state, NDIGIT);
a0d0e21e 2868 *flagp |= HASWIDTH|SIMPLE;
830247a4 2869 nextchar(pRExC_state);
fac92740 2870 Set_Node_Length(ret, 2); /* MJD */
a0d0e21e 2871 break;
a14b48bc
LW
2872 case 'p':
2873 case 'P':
3568d838 2874 {
830247a4 2875 char* oldregxend = RExC_end;
fac92740 2876 char* parse_start = RExC_parse;
a14b48bc 2877
830247a4 2878 if (RExC_parse[1] == '{') {
3568d838 2879 /* a lovely hack--pretend we saw [\pX] instead */
830247a4
IZ
2880 RExC_end = strchr(RExC_parse, '}');
2881 if (!RExC_end) {
0da60cf5 2882 U8 c = (U8)*RExC_parse;
830247a4
IZ
2883 RExC_parse += 2;
2884 RExC_end = oldregxend;
0da60cf5 2885 vFAIL2("Missing right brace on \\%c{}", c);
b45f050a 2886 }
830247a4 2887 RExC_end++;
a14b48bc
LW
2888 }
2889 else
830247a4
IZ
2890 RExC_end = RExC_parse + 2;
2891 RExC_parse--;
a14b48bc 2892
ffc61ed2 2893 ret = regclass(pRExC_state);
a14b48bc 2894
830247a4
IZ
2895 RExC_end = oldregxend;
2896 RExC_parse--;
fac92740 2897 Set_Node_Cur_Length(ret); /* MJD */
830247a4 2898 nextchar(pRExC_state);
a14b48bc
LW
2899 *flagp |= HASWIDTH|SIMPLE;
2900 }
2901 break;
a0d0e21e
LW
2902 case 'n':
2903 case 'r':
2904 case 't':
2905 case 'f':
2906 case 'e':
2907 case 'a':
2908 case 'x':
2909 case 'c':
2910 case '0':
2911 goto defchar;
2912 case '1': case '2': case '3': case '4':
2913 case '5': case '6': case '7': case '8': case '9':
2914 {
830247a4 2915 I32 num = atoi(RExC_parse);
a0d0e21e 2916
830247a4 2917 if (num > 9 && num >= RExC_npar)
a0d0e21e
LW
2918 goto defchar;
2919 else {
fac92740 2920 char * parse_start = RExC_parse - 1; /* MJD */
830247a4
IZ
2921 while (isDIGIT(*RExC_parse))
2922 RExC_parse++;
b45f050a 2923
830247a4 2924 if (!SIZE_ONLY && num > RExC_rx->nparens)
9baa0206 2925 vFAIL("Reference to nonexistent group");
830247a4
IZ
2926 RExC_sawback = 1;
2927 ret = reganode(pRExC_state, FOLD
a0ed51b3 2928 ? (LOC ? REFFL : REFF)
c8756f30 2929 : REF, num);
a0d0e21e 2930 *flagp |= HASWIDTH;
fac92740
MJD
2931
2932 /* override incorrect value set in reganode MJD */
2933 Set_Node_Offset(ret, parse_start+1);
2934 Set_Node_Cur_Length(ret); /* MJD */
830247a4
IZ
2935 RExC_parse--;
2936 nextchar(pRExC_state);
a0d0e21e
LW
2937 }
2938 }
2939 break;
2940 case '\0':
830247a4 2941 if (RExC_parse >= RExC_end)
b45f050a 2942 FAIL("Trailing \\");
a0d0e21e
LW
2943 /* FALL THROUGH */
2944 default:
c9f97d15
IZ
2945 /* Do not generate `unrecognized' warnings here, we fall
2946 back into the quick-grab loop below */
a0d0e21e
LW
2947 goto defchar;
2948 }
2949 break;
4633a7c4
LW
2950
2951 case '#':
830247a4
IZ
2952 if (RExC_flags16 & PMf_EXTENDED) {
2953 while (RExC_parse < RExC_end && *RExC_parse != '\n') RExC_parse++;
2954 if (RExC_parse < RExC_end)
4633a7c4
LW
2955 goto tryagain;
2956 }
2957 /* FALL THROUGH */
2958
a0d0e21e 2959 default: {
ba210ebe 2960 register STRLEN len;
a0ed51b3 2961 register UV ender;
a0d0e21e 2962 register char *p;
c277df42 2963 char *oldp, *s;
ba210ebe 2964 STRLEN numlen;
a2a2844f
JH
2965 STRLEN ulen;
2966 U8 tmpbuf[UTF8_MAXLEN*2+1];
f06dbbb7
JH
2967
2968 parse_start = RExC_parse - 1;
a0d0e21e 2969
830247a4 2970 RExC_parse++;
a0d0e21e
LW
2971
2972 defchar:
830247a4 2973 ret = reg_node(pRExC_state, FOLD
a0ed51b3 2974 ? (LOC ? EXACTFL : EXACTF)
bbce6d69 2975 : EXACT);
cd439c50 2976 s = STRING(ret);
830247a4
IZ
2977 for (len = 0, p = RExC_parse - 1;
2978 len < 127 && p < RExC_end;
a0d0e21e
LW
2979 len++)
2980 {
2981 oldp = p;
5b5a24f7 2982
830247a4
IZ
2983 if (RExC_flags16 & PMf_EXTENDED)
2984 p = regwhite(p, RExC_end);
a0d0e21e
LW
2985 switch (*p) {
2986 case '^':
2987 case '$':
2988 case '.':
2989 case '[':
2990 case '(':
2991 case ')':
2992 case '|':
2993 goto loopdone;
2994 case '\\':
2995 switch (*++p) {
2996 case 'A':
2997 case 'G':
2998 case 'Z':
b85d18e9 2999 case 'z':
a0d0e21e
LW
3000 case 'w':
3001 case 'W':
3002 case 'b':
3003 case 'B':
3004 case 's':
3005 case 'S':
3006 case 'd':
3007 case 'D':
a14b48bc
LW
3008 case 'p':
3009 case 'P':
a0d0e21e
LW
3010 --p;
3011 goto loopdone;
3012 case 'n':
3013 ender = '\n';
3014 p++;
a687059c 3015 break;
a0d0e21e
LW
3016 case 'r':
3017 ender = '\r';
3018 p++;
a687059c 3019 break;
a0d0e21e
LW
3020 case 't':
3021 ender = '\t';
3022 p++;
a687059c 3023 break;
a0d0e21e
LW
3024 case 'f':
3025 ender = '\f';
3026 p++;
a687059c 3027 break;
a0d0e21e 3028 case 'e':
c7f1f016 3029 ender = ASCII_TO_NATIVE('\033');
a0d0e21e 3030 p++;
a687059c 3031 break;
a0d0e21e 3032 case 'a':
c7f1f016 3033 ender = ASCII_TO_NATIVE('\007');
a0d0e21e 3034 p++;
a687059c 3035 break;
a0d0e21e 3036 case 'x':
a0ed51b3
LW
3037 if (*++p == '{') {
3038 char* e = strchr(p, '}');
b81d288d 3039
b45f050a 3040 if (!e) {
830247a4 3041 RExC_parse = p + 1;
b45f050a
JF
3042 vFAIL("Missing right brace on \\x{}");
3043 }
de5f0749 3044 else {
a4c04bdc
NC
3045 I32 flags = PERL_SCAN_ALLOW_UNDERSCORES
3046 | PERL_SCAN_DISALLOW_PREFIX;
53305cf1
NC
3047 numlen = e - p - 1;
3048 ender = grok_hex(p + 1, &numlen, &flags, NULL);
aaa80028
JH
3049 if (ender > 0xff)
3050 RExC_utf8 = 1;
b21ed0a9
GS
3051 /* numlen is generous */
3052 if (numlen + len >= 127) {
a0ed51b3
LW
3053 p--;
3054 goto loopdone;
3055 }
3056 p = e + 1;
3057 }
a0ed51b3
LW
3058 }
3059 else {
a4c04bdc 3060 I32 flags = PERL_SCAN_DISALLOW_PREFIX;
53305cf1
NC
3061 numlen = 2;
3062 ender = grok_hex(p, &numlen, &flags, NULL);
a0ed51b3
LW
3063 p += numlen;
3064 }
a687059c 3065 break;
a0d0e21e
LW
3066 case 'c':
3067 p++;
bbce6d69 3068 ender = UCHARAT(p++);
3069 ender = toCTRL(ender);
a687059c 3070 break;
a0d0e21e
LW
3071 case '0': case '1': case '2': case '3':case '4':
3072 case '5': case '6': case '7': case '8':case '9':
3073 if (*p == '0' ||
830247a4 3074 (isDIGIT(p[1]) && atoi(p) >= RExC_npar) ) {
53305cf1
NC
3075 I32 flags = 0;
3076 numlen = 3;
3077 ender = grok_oct(p, &numlen, &flags, NULL);
a0d0e21e
LW
3078 p += numlen;
3079 }
3080 else {
3081 --p;
3082 goto loopdone;
a687059c
LW
3083 }
3084 break;
a0d0e21e 3085 case '\0':
830247a4 3086 if (p >= RExC_end)
b45f050a 3087 FAIL("Trailing \\");
a687059c 3088 /* FALL THROUGH */
a0d0e21e 3089 default:
e476b1b5 3090 if (!SIZE_ONLY && ckWARN(WARN_REGEXP) && isALPHA(*p))
4193bef7 3091 vWARN2(p + 1, "Unrecognized escape \\%c passed through", UCHARAT(p));
a0ed51b3 3092 goto normal_default;
a0d0e21e
LW
3093 }
3094 break;
a687059c 3095 default:
a0ed51b3 3096 normal_default:
fd400ab9 3097 if (UTF8_IS_START(*p) && UTF) {
5e12f4fb 3098 ender = utf8n_to_uvchr((U8*)p, RExC_end - p,
ba210ebe 3099 &numlen, 0);
a0ed51b3
LW
3100 p += numlen;
3101 }
3102 else
3103 ender = *p++;
a0d0e21e 3104 break;
a687059c 3105 }
830247a4
IZ
3106 if (RExC_flags16 & PMf_EXTENDED)
3107 p = regwhite(p, RExC_end);
a0ed51b3 3108 if (UTF && FOLD) {
a2a2844f
JH
3109 toLOWER_uni(ender, tmpbuf, &ulen);
3110 ender = utf8_to_uvchr(tmpbuf, 0);
a0ed51b3 3111 }
a0d0e21e
LW
3112 if (ISMULT2(p)) { /* Back off on ?+*. */
3113 if (len)
3114 p = oldp;
2b9d42f0 3115 else if (!UNI_IS_INVARIANT(NATIVE_TO_UNI(ender)) && UTF) {
830247a4 3116 reguni(pRExC_state, ender, s, &numlen);
a0ed51b3
LW
3117 s += numlen;
3118 len += numlen;
3119 }
a0d0e21e
LW
3120 else {
3121 len++;
cd439c50 3122 REGC(ender, s++);
a0d0e21e
LW
3123 }
3124 break;
a687059c 3125 }
2b9d42f0 3126 if (!UNI_IS_INVARIANT(NATIVE_TO_UNI(ender)) && UTF) {
830247a4 3127 reguni(pRExC_state, ender, s, &numlen);
a0ed51b3
LW
3128 s += numlen;
3129 len += numlen - 1;
3130 }
3131 else
cd439c50 3132 REGC(ender, s++);
a0d0e21e
LW
3133 }
3134 loopdone:
830247a4 3135 RExC_parse = p - 1;
fac92740 3136 Set_Node_Cur_Length(ret); /* MJD */
830247a4 3137 nextchar(pRExC_state);
793db0cb
JH
3138 {
3139 /* len is STRLEN which is unsigned, need to copy to signed */
3140 IV iv = len;
3141 if (iv < 0)
3142 vFAIL("Internal disaster");
3143 }
a0d0e21e
LW
3144 if (len > 0)
3145 *flagp |= HASWIDTH;
3146 if (len == 1)
3147 *flagp |= SIMPLE;
c277df42 3148 if (!SIZE_ONLY)
cd439c50
IZ
3149 STR_LEN(ret) = len;
3150 if (SIZE_ONLY)
830247a4 3151 RExC_size += STR_SZ(len);
cd439c50 3152 else
830247a4 3153 RExC_emit += STR_SZ(len);
a687059c 3154 }
a0d0e21e
LW
3155 break;
3156 }
a687059c 3157
a0d0e21e 3158 return(ret);
a687059c
LW
3159}
3160
873ef191 3161STATIC char *
cea2e8a9 3162S_regwhite(pTHX_ char *p, char *e)
5b5a24f7
CS
3163{
3164 while (p < e) {
3165 if (isSPACE(*p))
3166 ++p;
3167 else if (*p == '#') {
3168 do {
3169 p++;
3170 } while (p < e && *p != '\n');
3171 }
3172 else
3173 break;
3174 }
3175 return p;
3176}
3177
b8c5462f
JH
3178/* Parse POSIX character classes: [[:foo:]], [[=foo=]], [[.foo.]].
3179 Character classes ([:foo:]) can also be negated ([:^foo:]).
3180 Returns a named class id (ANYOF_XXX) if successful, -1 otherwise.
3181 Equivalence classes ([=foo=]) and composites ([.foo.]) are parsed,
3182 but trigger warnings because they are currently unimplemented. */
3183STATIC I32
830247a4 3184S_regpposixcc(pTHX_ RExC_state_t *pRExC_state, I32 value)
620e46c5
JH
3185{
3186 char *posixcc = 0;
936ed897 3187 I32 namedclass = OOB_NAMEDCLASS;
620e46c5 3188
830247a4 3189 if (value == '[' && RExC_parse + 1 < RExC_end &&
620e46c5 3190 /* I smell either [: or [= or [. -- POSIX has been here, right? */
830247a4
IZ
3191 (*RExC_parse == ':' ||
3192 *RExC_parse == '=' ||
3193 *RExC_parse == '.')) {
3194 char c = *RExC_parse;
3195 char* s = RExC_parse++;
b81d288d 3196
830247a4
IZ
3197 while (RExC_parse < RExC_end && *RExC_parse != c)
3198 RExC_parse++;
3199 if (RExC_parse == RExC_end)
620e46c5 3200 /* Grandfather lone [:, [=, [. */
830247a4 3201 RExC_parse = s;
620e46c5 3202 else {
830247a4 3203 char* t = RExC_parse++; /* skip over the c */
b8c5462f 3204
830247a4
IZ
3205 if (*RExC_parse == ']') {
3206 RExC_parse++; /* skip over the ending ] */
b8c5462f
JH
3207 posixcc = s + 1;
3208 if (*s == ':') {
3209 I32 complement = *posixcc == '^' ? *posixcc++ : 0;
3210 I32 skip = 5; /* the most common skip */
3211
3212 switch (*posixcc) {
3213 case 'a':
3214 if (strnEQ(posixcc, "alnum", 5))
3215 namedclass =
3216 complement ? ANYOF_NALNUMC : ANYOF_ALNUMC;
3217 else if (strnEQ(posixcc, "alpha", 5))
3218 namedclass =
3219 complement ? ANYOF_NALPHA : ANYOF_ALPHA;
3220 else if (strnEQ(posixcc, "ascii", 5))
3221 namedclass =
3222 complement ? ANYOF_NASCII : ANYOF_ASCII;
3223 break;
aaa51d5e
JF
3224 case 'b':
3225 if (strnEQ(posixcc, "blank", 5))
3226 namedclass =
3227 complement ? ANYOF_NBLANK : ANYOF_BLANK;
3228 break;
b8c5462f
JH
3229 case 'c':
3230 if (strnEQ(posixcc, "cntrl", 5))
3231 namedclass =
3232 complement ? ANYOF_NCNTRL : ANYOF_CNTRL;
3233 break;
3234 case 'd':
3235 if (strnEQ(posixcc, "digit", 5))
3236 namedclass =
3237 complement ? ANYOF_NDIGIT : ANYOF_DIGIT;
3238 break;
3239 case 'g':
3240 if (strnEQ(posixcc, "graph", 5))
3241 namedclass =
3242 complement ? ANYOF_NGRAPH : ANYOF_GRAPH;
3243 break;
3244 case 'l':
3245 if (strnEQ(posixcc, "lower", 5))
3246 namedclass =
3247 complement ? ANYOF_NLOWER : ANYOF_LOWER;
3248 break;
3249 case 'p':
3250 if (strnEQ(posixcc, "print", 5))
3251 namedclass =
3252 complement ? ANYOF_NPRINT : ANYOF_PRINT;
3253 else if (strnEQ(posixcc, "punct", 5))
3254 namedclass =
3255 complement ? ANYOF_NPUNCT : ANYOF_PUNCT;
3256 break;
3257 case 's':
3258 if (strnEQ(posixcc, "space", 5))
3259 namedclass =
aaa51d5e 3260 complement ? ANYOF_NPSXSPC : ANYOF_PSXSPC;
cc4319de 3261 break;
b8c5462f
JH
3262 case 'u':
3263 if (strnEQ(posixcc, "upper", 5))
3264 namedclass =
3265 complement ? ANYOF_NUPPER : ANYOF_UPPER;
3266 break;
3267 case 'w': /* this is not POSIX, this is the Perl \w */
3268 if (strnEQ(posixcc, "word", 4)) {
3269 namedclass =
3270 complement ? ANYOF_NALNUM : ANYOF_ALNUM;
3271 skip = 4;
3272 }
3273 break;
3274 case 'x':
3275 if (strnEQ(posixcc, "xdigit", 6)) {
3276 namedclass =
3277 complement ? ANYOF_NXDIGIT : ANYOF_XDIGIT;
3278 skip = 6;
3279 }
3280 break;
3281 }
ac561586
JH
3282 if (namedclass == OOB_NAMEDCLASS ||
3283 posixcc[skip] != ':' ||
3284 posixcc[skip+1] != ']')
b45f050a
JF
3285 {
3286 Simple_vFAIL3("POSIX class [:%.*s:] unknown",
3287 t - s - 1, s + 1);
3288 }
3289 } else if (!SIZE_ONLY) {
b8c5462f 3290 /* [[=foo=]] and [[.foo.]] are still future. */
b45f050a 3291
830247a4 3292 /* adjust RExC_parse so the warning shows after
b45f050a 3293 the class closes */
830247a4
IZ
3294 while (*RExC_parse && *RExC_parse != ']')
3295 RExC_parse++;
b45f050a
JF
3296 Simple_vFAIL3("POSIX syntax [%c %c] is reserved for future extensions", c, c);
3297 }
b8c5462f
JH
3298 } else {
3299 /* Maternal grandfather:
3300 * "[:" ending in ":" but not in ":]" */
830247a4 3301 RExC_parse = s;
767d463e 3302 }
620e46c5
JH
3303 }
3304 }
3305
b8c5462f
JH
3306 return namedclass;
3307}
3308
3309STATIC void
830247a4 3310S_checkposixcc(pTHX_ RExC_state_t *pRExC_state)
b8c5462f 3311{
e476b1b5 3312 if (!SIZE_ONLY && ckWARN(WARN_REGEXP) &&
830247a4
IZ
3313 (*RExC_parse == ':' ||
3314 *RExC_parse == '=' ||
3315 *RExC_parse == '.')) {
3316 char *s = RExC_parse;
93733859 3317 char c = *s++;
b8c5462f
JH
3318
3319 while(*s && isALNUM(*s))
3320 s++;
3321 if (*s && c == *s && s[1] == ']') {
b45f050a
JF
3322 vWARN3(s+2, "POSIX syntax [%c %c] belongs inside character classes", c, c);
3323
3324 /* [[=foo=]] and [[.foo.]] are still future. */
b8c5462f 3325 if (c == '=' || c == '.')
b45f050a 3326 {
830247a4 3327 /* adjust RExC_parse so the error shows after
b45f050a 3328 the class closes */
830247a4 3329 while (*RExC_parse && *RExC_parse++ != ']')
b45f050a
JF
3330 ;
3331 Simple_vFAIL3("POSIX syntax [%c %c] is reserved for future extensions", c, c);
3332 }
b8c5462f
JH
3333 }
3334 }
620e46c5
JH
3335}
3336
76e3520e 3337STATIC regnode *
830247a4 3338S_regclass(pTHX_ RExC_state_t *pRExC_state)
a687059c 3339{
ffc61ed2 3340 register UV value;
3568d838 3341 register IV prevvalue = OOB_UNICODE;
ffc61ed2 3342 register IV range = 0;
c277df42 3343 register regnode *ret;
ba210ebe 3344 STRLEN numlen;
ffc61ed2 3345 IV namedclass;
9c5ffd7c 3346 char *rangebegin = 0;
936ed897 3347 bool need_class = 0;
9c5ffd7c 3348 SV *listsv = Nullsv;
ffc61ed2
JH
3349 register char *e;
3350 UV n;
3568d838 3351 bool optimize_invert = TRUE;
ffc61ed2
JH
3352
3353 ret = reganode(pRExC_state, ANYOF, 0);
3354
3355 if (!SIZE_ONLY)
3356 ANYOF_FLAGS(ret) = 0;
3357
3358 if (*RExC_parse == '^') { /* Complement of range. */
3359 RExC_naughty++;
3360 RExC_parse++;
3361 if (!SIZE_ONLY)
3362 ANYOF_FLAGS(ret) |= ANYOF_INVERT;
3363 }
a0d0e21e 3364
936ed897 3365 if (SIZE_ONLY)
830247a4 3366 RExC_size += ANYOF_SKIP;
936ed897 3367 else {
830247a4 3368 RExC_emit += ANYOF_SKIP;
936ed897
IZ
3369 if (FOLD)
3370 ANYOF_FLAGS(ret) |= ANYOF_FOLD;
3371 if (LOC)
3372 ANYOF_FLAGS(ret) |= ANYOF_LOCALE;
ffc61ed2
JH
3373 ANYOF_BITMAP_ZERO(ret);
3374 listsv = newSVpvn("# comment\n", 10);
a0d0e21e 3375 }
b8c5462f 3376
e476b1b5 3377 if (!SIZE_ONLY && ckWARN(WARN_REGEXP))
830247a4 3378 checkposixcc(pRExC_state);
b8c5462f 3379
830247a4 3380 if (*RExC_parse == ']' || *RExC_parse == '-')
ffc61ed2
JH
3381 goto charclassloop; /* allow 1st char to be ] or - */
3382
830247a4 3383 while (RExC_parse < RExC_end && *RExC_parse != ']') {
ffc61ed2
JH
3384
3385 charclassloop:
3386
3387 namedclass = OOB_NAMEDCLASS; /* initialize as illegal */
3388
73b437c8 3389 if (!range)
830247a4 3390 rangebegin = RExC_parse;
ffc61ed2 3391 if (UTF) {
5e12f4fb 3392 value = utf8n_to_uvchr((U8*)RExC_parse,
3568d838
JH
3393 RExC_end - RExC_parse,
3394 &numlen, 0);
ffc61ed2
JH
3395 RExC_parse += numlen;
3396 }
3397 else
3398 value = UCHARAT(RExC_parse++);
620e46c5 3399 if (value == '[')
830247a4 3400 namedclass = regpposixcc(pRExC_state, value);
620e46c5 3401 else if (value == '\\') {
ffc61ed2 3402 if (UTF) {
5e12f4fb 3403 value = utf8n_to_uvchr((U8*)RExC_parse,
ffc61ed2
JH
3404 RExC_end - RExC_parse,
3405 &numlen, 0);
3406 RExC_parse += numlen;
3407 }
3408 else
3409 value = UCHARAT(RExC_parse++);
470c3474 3410 /* Some compilers cannot handle switching on 64-bit integer
ffc61ed2 3411 * values, therefore value cannot be an UV. Yes, this will
e2962f66
JH
3412 * be a problem later if we want switch on Unicode.
3413 * A similar issue a little bit later when switching on
3414 * namedclass. --jhi */
ffc61ed2 3415 switch ((I32)value) {
b8c5462f
JH
3416 case 'w': namedclass = ANYOF_ALNUM; break;
3417 case 'W': namedclass = ANYOF_NALNUM; break;
3418 case 's': namedclass = ANYOF_SPACE; break;
3419 case 'S': namedclass = ANYOF_NSPACE; break;
3420 case 'd': namedclass = ANYOF_DIGIT; break;
3421 case 'D': namedclass = ANYOF_NDIGIT; break;
ffc61ed2
JH
3422 case 'p':
3423 case 'P':
3424 if (*RExC_parse == '{') {
0da60cf5 3425 U8 c = (U8)value;
ffc61ed2
JH
3426 e = strchr(RExC_parse++, '}');
3427 if (!e)
0da60cf5 3428 vFAIL2("Missing right brace on \\%c{}", c);
ab13f0c7
JH
3429 while (isSPACE(UCHARAT(RExC_parse)))
3430 RExC_parse++;
3431 if (e == RExC_parse)
0da60cf5 3432 vFAIL2("Empty \\%c{}", c);
ffc61ed2 3433 n = e - RExC_parse;
ab13f0c7
JH
3434 while (isSPACE(UCHARAT(RExC_parse + n - 1)))
3435 n--;
ffc61ed2
JH
3436 }
3437 else {
3438 e = RExC_parse;
3439 n = 1;
3440 }
3441 if (!SIZE_ONLY) {
ab13f0c7
JH
3442 if (UCHARAT(RExC_parse) == '^') {
3443 RExC_parse++;
3444 n--;
3445 value = value == 'p' ? 'P' : 'p'; /* toggle */
3446 while (isSPACE(UCHARAT(RExC_parse))) {
3447 RExC_parse++;
3448 n--;
3449 }
3450 }
ffc61ed2 3451 if (value == 'p')
ab13f0c7
JH
3452 Perl_sv_catpvf(aTHX_ listsv,
3453 "+utf8::%.*s\n", (int)n, RExC_parse);
ffc61ed2 3454 else
ab13f0c7
JH
3455 Perl_sv_catpvf(aTHX_ listsv,
3456 "!utf8::%.*s\n", (int)n, RExC_parse);
ffc61ed2
JH
3457 }
3458 RExC_parse = e + 1;
3459 ANYOF_FLAGS(ret) |= ANYOF_UNICODE;
3460 continue;
b8c5462f
JH
3461 case 'n': value = '\n'; break;
3462 case 'r': value = '\r'; break;
3463 case 't': value = '\t'; break;
3464 case 'f': value = '\f'; break;
3465 case 'b': value = '\b'; break;
c7f1f016
NIS
3466 case 'e': value = ASCII_TO_NATIVE('\033');break;
3467 case 'a': value = ASCII_TO_NATIVE('\007');break;
b8c5462f 3468 case 'x':
ffc61ed2 3469 if (*RExC_parse == '{') {
a4c04bdc
NC
3470 I32 flags = PERL_SCAN_ALLOW_UNDERSCORES
3471 | PERL_SCAN_DISALLOW_PREFIX;
ffc61ed2 3472 e = strchr(RExC_parse++, '}');
b81d288d 3473 if (!e)
ffc61ed2 3474 vFAIL("Missing right brace on \\x{}");
53305cf1
NC
3475
3476 numlen = e - RExC_parse;
3477 value = grok_hex(RExC_parse, &numlen, &flags, NULL);
ffc61ed2
JH
3478 RExC_parse = e + 1;
3479 }
3480 else {
a4c04bdc 3481 I32 flags = PERL_SCAN_DISALLOW_PREFIX;
53305cf1
NC
3482 numlen = 2;
3483 value = grok_hex(RExC_parse, &numlen, &flags, NULL);
ffc61ed2
JH
3484 RExC_parse += numlen;
3485 }
b8c5462f
JH
3486 break;
3487 case 'c':
830247a4 3488 value = UCHARAT(RExC_parse++);
b8c5462f
JH
3489 value = toCTRL(value);
3490 break;
3491 case '0': case '1': case '2': case '3': case '4':
3492 case '5': case '6': case '7': case '8': case '9':
53305cf1
NC
3493 {
3494 I32 flags = 0;
3495 numlen = 3;
3496 value = grok_oct(--RExC_parse, &numlen, &flags, NULL);
830247a4 3497 RExC_parse += numlen;
b8c5462f 3498 break;
53305cf1 3499 }
1028017a 3500 default:
e476b1b5 3501 if (!SIZE_ONLY && ckWARN(WARN_REGEXP) && isALPHA(value))
ffc61ed2
JH
3502 vWARN2(RExC_parse,
3503 "Unrecognized escape \\%c in character class passed through",
3504 (int)value);
1028017a 3505 break;
b8c5462f 3506 }
ffc61ed2
JH
3507 } /* end of \blah */
3508
3509 if (namedclass > OOB_NAMEDCLASS) { /* this is a named class \blah */
3510
3511 if (!SIZE_ONLY && !need_class)
936ed897 3512 ANYOF_CLASS_ZERO(ret);
ffc61ed2 3513
936ed897 3514 need_class = 1;
ffc61ed2
JH
3515
3516 /* a bad range like a-\d, a-[:digit:] ? */
3517 if (range) {
73b437c8 3518 if (!SIZE_ONLY) {
e476b1b5 3519 if (ckWARN(WARN_REGEXP))
830247a4 3520 vWARN4(RExC_parse,
b45f050a 3521 "False [] range \"%*.*s\"",
830247a4
IZ
3522 RExC_parse - rangebegin,
3523 RExC_parse - rangebegin,
b45f050a 3524 rangebegin);
3568d838
JH
3525 if (prevvalue < 256) {
3526 ANYOF_BITMAP_SET(ret, prevvalue);
ffc61ed2
JH
3527 ANYOF_BITMAP_SET(ret, '-');
3528 }
3529 else {
3530 ANYOF_FLAGS(ret) |= ANYOF_UNICODE;
3531 Perl_sv_catpvf(aTHX_ listsv,
3568d838 3532 "%04"UVxf"\n%04"UVxf"\n", (UV)prevvalue, (UV) '-');
ffc61ed2 3533 }
b8c5462f 3534 }
ffc61ed2
JH
3535
3536 range = 0; /* this was not a true range */
73b437c8 3537 }
ffc61ed2 3538
73b437c8 3539 if (!SIZE_ONLY) {
3568d838
JH
3540 if (namedclass > OOB_NAMEDCLASS)
3541 optimize_invert = FALSE;
e2962f66
JH
3542 /* Possible truncation here but in some 64-bit environments
3543 * the compiler gets heartburn about switch on 64-bit values.
3544 * A similar issue a little earlier when switching on value.
98f323fa 3545 * --jhi */
e2962f66 3546 switch ((I32)namedclass) {
73b437c8
JH
3547 case ANYOF_ALNUM:
3548 if (LOC)
936ed897 3549 ANYOF_CLASS_SET(ret, ANYOF_ALNUM);
73b437c8
JH
3550 else {
3551 for (value = 0; value < 256; value++)
3552 if (isALNUM(value))
936ed897 3553 ANYOF_BITMAP_SET(ret, value);
73b437c8 3554 }
ffc61ed2 3555 Perl_sv_catpvf(aTHX_ listsv, "+utf8::IsWord\n");
73b437c8
JH
3556 break;
3557 case ANYOF_NALNUM:
3558 if (LOC)
936ed897 3559 ANYOF_CLASS_SET(ret, ANYOF_NALNUM);
73b437c8
JH
3560 else {
3561 for (value = 0; value < 256; value++)
3562 if (!isALNUM(value))
936ed897 3563 ANYOF_BITMAP_SET(ret, value);
73b437c8 3564 }
ffc61ed2 3565 Perl_sv_catpvf(aTHX_ listsv, "!utf8::IsWord\n");
73b437c8 3566 break;
ffc61ed2 3567 case ANYOF_ALNUMC:
73b437c8 3568 if (LOC)
ffc61ed2 3569 ANYOF_CLASS_SET(ret, ANYOF_ALNUMC);
73b437c8
JH
3570 else {
3571 for (value = 0; value < 256; value++)
ffc61ed2 3572 if (isALNUMC(value))
936ed897 3573 ANYOF_BITMAP_SET(ret, value);
73b437c8 3574 }
ffc61ed2 3575 Perl_sv_catpvf(aTHX_ listsv, "+utf8::IsAlnum\n");
73b437c8
JH
3576 break;
3577 case ANYOF_NALNUMC:
3578 if (LOC)
936ed897 3579 ANYOF_CLASS_SET(ret, ANYOF_NALNUMC);
73b437c8
JH
3580 else {
3581 for (value = 0; value < 256; value++)
3582 if (!isALNUMC(value))
936ed897 3583 ANYOF_BITMAP_SET(ret, value);
73b437c8 3584 }
ffc61ed2 3585 Perl_sv_catpvf(aTHX_ listsv, "!utf8::IsAlnum\n");
73b437c8
JH
3586 break;
3587 case ANYOF_ALPHA:
3588 if (LOC)
936ed897 3589 ANYOF_CLASS_SET(ret, ANYOF_ALPHA);
73b437c8
JH
3590 else {
3591 for (value = 0; value < 256; value++)
3592 if (isALPHA(value))
936ed897 3593 ANYOF_BITMAP_SET(ret, value);
73b437c8 3594 }
ffc61ed2 3595 Perl_sv_catpvf(aTHX_ listsv, "+utf8::IsAlpha\n");
73b437c8
JH
3596 break;
3597 case ANYOF_NALPHA:
3598 if (LOC)
936ed897 3599 ANYOF_CLASS_SET(ret, ANYOF_NALPHA);
73b437c8
JH
3600 else {
3601 for (value = 0; value < 256; value++)
3602 if (!isALPHA(value))
936ed897 3603 ANYOF_BITMAP_SET(ret, value);
73b437c8 3604 }
ffc61ed2 3605 Perl_sv_catpvf(aTHX_ listsv, "!utf8::IsAlpha\n");
73b437c8
JH
3606 break;
3607 case ANYOF_ASCII:
3608 if (LOC)
936ed897 3609 ANYOF_CLASS_SET(ret, ANYOF_ASCII);
73b437c8 3610 else {
c7f1f016 3611#ifndef EBCDIC
1ba5c669
JH
3612 for (value = 0; value < 128; value++)
3613 ANYOF_BITMAP_SET(ret, value);
3614#else /* EBCDIC */
ffbc6a93 3615 for (value = 0; value < 256; value++) {
3a3c4447
JH
3616 if (isASCII(value))
3617 ANYOF_BITMAP_SET(ret, value);
ffbc6a93 3618 }
1ba5c669 3619#endif /* EBCDIC */
73b437c8 3620 }
ffc61ed2 3621 Perl_sv_catpvf(aTHX_ listsv, "+utf8::IsASCII\n");
73b437c8
JH
3622 break;
3623 case ANYOF_NASCII:
3624 if (LOC)
936ed897 3625 ANYOF_CLASS_SET(ret, ANYOF_NASCII);
73b437c8 3626 else {
c7f1f016 3627#ifndef EBCDIC
1ba5c669
JH
3628 for (value = 128; value < 256; value++)
3629 ANYOF_BITMAP_SET(ret, value);
3630#else /* EBCDIC */
ffbc6a93 3631 for (value = 0; value < 256; value++) {
3a3c4447
JH
3632 if (!isASCII(value))
3633 ANYOF_BITMAP_SET(ret, value);
ffbc6a93 3634 }
1ba5c669 3635#endif /* EBCDIC */
73b437c8 3636 }
ffc61ed2 3637 Perl_sv_catpvf(aTHX_ listsv, "!utf8::IsASCII\n");
73b437c8 3638 break;
aaa51d5e
JF
3639 case ANYOF_BLANK:
3640 if (LOC)
3641 ANYOF_CLASS_SET(ret, ANYOF_BLANK);
3642 else {
3643 for (value = 0; value < 256; value++)
3644 if (isBLANK(value))
3645 ANYOF_BITMAP_SET(ret, value);
3646 }
ffc61ed2 3647 Perl_sv_catpvf(aTHX_ listsv, "+utf8::IsBlank\n");
aaa51d5e
JF
3648 break;
3649 case ANYOF_NBLANK:
3650 if (LOC)
3651 ANYOF_CLASS_SET(ret, ANYOF_NBLANK);
3652 else {
3653 for (value = 0; value < 256; value++)
3654 if (!isBLANK(value))
3655 ANYOF_BITMAP_SET(ret, value);
3656 }
ffc61ed2 3657 Perl_sv_catpvf(aTHX_ listsv, "!utf8::IsBlank\n");
aaa51d5e 3658 break;
73b437c8
JH
3659 case ANYOF_CNTRL:
3660 if (LOC)
936ed897 3661 ANYOF_CLASS_SET(ret, ANYOF_CNTRL);
73b437c8
JH
3662 else {
3663 for (value = 0; value < 256; value++)
3664 if (isCNTRL(value))
936ed897 3665 ANYOF_BITMAP_SET(ret, value);
73b437c8 3666 }
ffc61ed2 3667 Perl_sv_catpvf(aTHX_ listsv, "+utf8::IsCntrl\n");
73b437c8
JH
3668 break;
3669 case ANYOF_NCNTRL:
3670 if (LOC)
936ed897 3671 ANYOF_CLASS_SET(ret, ANYOF_NCNTRL);
73b437c8
JH
3672 else {
3673 for (value = 0; value < 256; value++)
3674 if (!isCNTRL(value))
936ed897 3675 ANYOF_BITMAP_SET(ret, value);
73b437c8 3676 }
ffc61ed2
JH
3677 Perl_sv_catpvf(aTHX_ listsv, "!utf8::IsCntrl\n");
3678 break;
3679 case ANYOF_DIGIT:
3680 if (LOC)
3681 ANYOF_CLASS_SET(ret, ANYOF_DIGIT);
3682 else {
3683 /* consecutive digits assumed */
3684 for (value = '0'; value <= '9'; value++)
3685 ANYOF_BITMAP_SET(ret, value);
3686 }
3687 Perl_sv_catpvf(aTHX_ listsv, "+utf8::IsDigit\n");
3688 break;
3689 case ANYOF_NDIGIT:
3690 if (LOC)
3691 ANYOF_CLASS_SET(ret, ANYOF_NDIGIT);
3692 else {
3693 /* consecutive digits assumed */
3694 for (value = 0; value < '0'; value++)
3695 ANYOF_BITMAP_SET(ret, value);
3696 for (value = '9' + 1; value < 256; value++)
3697 ANYOF_BITMAP_SET(ret, value);
3698 }
3699 Perl_sv_catpvf(aTHX_ listsv, "!utf8::IsDigit\n");
73b437c8
JH
3700 break;
3701 case ANYOF_GRAPH:
3702 if (LOC)
936ed897 3703 ANYOF_CLASS_SET(ret, ANYOF_GRAPH);
73b437c8
JH
3704 else {
3705 for (value = 0; value < 256; value++)
3706 if (isGRAPH(value))
936ed897 3707 ANYOF_BITMAP_SET(ret, value);
73b437c8 3708 }
ffc61ed2 3709 Perl_sv_catpvf(aTHX_ listsv, "+utf8::IsGraph\n");
73b437c8
JH
3710 break;
3711 case ANYOF_NGRAPH:
3712 if (LOC)
936ed897 3713 ANYOF_CLASS_SET(ret, ANYOF_NGRAPH);
73b437c8
JH
3714 else {
3715 for (value = 0; value < 256; value++)
3716 if (!isGRAPH(value))
936ed897 3717 ANYOF_BITMAP_SET(ret, value);
73b437c8 3718 }
ffc61ed2 3719 Perl_sv_catpvf(aTHX_ listsv, "!utf8::IsGraph\n");
73b437c8
JH
3720 break;
3721 case ANYOF_LOWER:
3722 if (LOC)
936ed897 3723 ANYOF_CLASS_SET(ret, ANYOF_LOWER);
73b437c8
JH
3724 else {
3725 for (value = 0; value < 256; value++)
3726 if (isLOWER(value))
936ed897 3727 ANYOF_BITMAP_SET(ret, value);
73b437c8 3728 }
ffc61ed2 3729 Perl_sv_catpvf(aTHX_ listsv, "+utf8::IsLower\n");
73b437c8
JH
3730 break;
3731 case ANYOF_NLOWER:
3732 if (LOC)
936ed897 3733 ANYOF_CLASS_SET(ret, ANYOF_NLOWER);
73b437c8
JH
3734 else {
3735 for (value = 0; value < 256; value++)
3736 if (!isLOWER(value))
936ed897 3737 ANYOF_BITMAP_SET(ret, value);
73b437c8 3738 }
ffc61ed2 3739 Perl_sv_catpvf(aTHX_ listsv, "!utf8::IsLower\n");
73b437c8
JH
3740 break;
3741 case ANYOF_PRINT:
3742 if (LOC)
936ed897 3743 ANYOF_CLASS_SET(ret, ANYOF_PRINT);
73b437c8
JH
3744 else {
3745 for (value = 0; value < 256; value++)
3746 if (isPRINT(value))
936ed897 3747 ANYOF_BITMAP_SET(ret, value);
73b437c8 3748 }
ffc61ed2 3749 Perl_sv_catpvf(aTHX_ listsv, "+utf8::IsPrint\n");
73b437c8
JH
3750 break;
3751 case ANYOF_NPRINT:
3752 if (LOC)
936ed897 3753 ANYOF_CLASS_SET(ret, ANYOF_NPRINT);
73b437c8
JH
3754 else {
3755 for (value = 0; value < 256; value++)
3756 if (!isPRINT(value))
936ed897 3757 ANYOF_BITMAP_SET(ret, value);
73b437c8 3758 }
ffc61ed2 3759 Perl_sv_catpvf(aTHX_ listsv, "!utf8::IsPrint\n");
73b437c8 3760 break;
aaa51d5e
JF
3761 case ANYOF_PSXSPC:
3762 if (LOC)
3763 ANYOF_CLASS_SET(ret, ANYOF_PSXSPC);
3764 else {
3765 for (value = 0; value < 256; value++)
3766 if (isPSXSPC(value))
3767 ANYOF_BITMAP_SET(ret, value);
3768 }
ffc61ed2 3769 Perl_sv_catpvf(aTHX_ listsv, "+utf8::IsSpace\n");
aaa51d5e
JF
3770 break;
3771 case ANYOF_NPSXSPC:
3772 if (LOC)
3773 ANYOF_CLASS_SET(ret, ANYOF_NPSXSPC);
3774 else {
3775 for (value = 0; value < 256; value++)
3776 if (!isPSXSPC(value))
3777 ANYOF_BITMAP_SET(ret, value);
3778 }
ffc61ed2 3779 Perl_sv_catpvf(aTHX_ listsv, "!utf8::IsSpace\n");
aaa51d5e 3780 break;
73b437c8
JH
3781 case ANYOF_PUNCT:
3782 if (LOC)
936ed897 3783 ANYOF_CLASS_SET(ret, ANYOF_PUNCT);
73b437c8
JH
3784 else {
3785 for (value = 0; value < 256; value++)
3786 if (isPUNCT(value))
936ed897 3787 ANYOF_BITMAP_SET(ret, value);
73b437c8 3788 }
ffc61ed2 3789 Perl_sv_catpvf(aTHX_ listsv, "+utf8::IsPunct\n");
73b437c8
JH
3790 break;
3791 case ANYOF_NPUNCT:
3792 if (LOC)
936ed897 3793 ANYOF_CLASS_SET(ret, ANYOF_NPUNCT);
73b437c8
JH
3794 else {
3795 for (value = 0; value < 256; value++)
3796 if (!isPUNCT(value))
936ed897 3797 ANYOF_BITMAP_SET(ret, value);
73b437c8 3798 }
ffc61ed2
JH
3799 Perl_sv_catpvf(aTHX_ listsv, "!utf8::IsPunct\n");
3800 break;
3801 case ANYOF_SPACE:
3802 if (LOC)
3803 ANYOF_CLASS_SET(ret, ANYOF_SPACE);
3804 else {
3805 for (value = 0; value < 256; value++)
3806 if (isSPACE(value))
3807 ANYOF_BITMAP_SET(ret, value);
3808 }
3809 Perl_sv_catpvf(aTHX_ listsv, "+utf8::IsSpacePerl\n");
3810 break;
3811 case ANYOF_NSPACE:
3812 if (LOC)
3813 ANYOF_CLASS_SET(ret, ANYOF_NSPACE);
3814 else {
3815 for (value = 0; value < 256; value++)
3816 if (!isSPACE(value))
3817 ANYOF_BITMAP_SET(ret, value);
3818 }
3819 Perl_sv_catpvf(aTHX_ listsv, "!utf8::IsSpacePerl\n");
73b437c8
JH
3820 break;
3821 case ANYOF_UPPER:
3822 if (LOC)
936ed897 3823 ANYOF_CLASS_SET(ret, ANYOF_UPPER);
73b437c8
JH
3824 else {
3825 for (value = 0; value < 256; value++)
3826 if (isUPPER(value))
936ed897 3827 ANYOF_BITMAP_SET(ret, value);
73b437c8 3828 }
ffc61ed2 3829 Perl_sv_catpvf(aTHX_ listsv, "+utf8::IsUpper\n");
73b437c8
JH
3830 break;
3831 case ANYOF_NUPPER:
3832 if (LOC)
936ed897 3833 ANYOF_CLASS_SET(ret, ANYOF_NUPPER);
73b437c8
JH
3834 else {
3835 for (value = 0; value < 256; value++)
3836 if (!isUPPER(value))
936ed897 3837 ANYOF_BITMAP_SET(ret, value);
73b437c8 3838 }
ffc61ed2 3839 Perl_sv_catpvf(aTHX_ listsv, "!utf8::IsUpper\n");
73b437c8
JH
3840 break;
3841 case ANYOF_XDIGIT:
3842 if (LOC)
936ed897 3843 ANYOF_CLASS_SET(ret, ANYOF_XDIGIT);
73b437c8
JH
3844 else {
3845 for (value = 0; value < 256; value++)
3846 if (isXDIGIT(value))
936ed897 3847 ANYOF_BITMAP_SET(ret, value);
73b437c8 3848 }
ffc61ed2 3849 Perl_sv_catpvf(aTHX_ listsv, "+utf8::IsXDigit\n");
73b437c8
JH
3850 break;
3851 case ANYOF_NXDIGIT:
3852 if (LOC)
936ed897 3853 ANYOF_CLASS_SET(ret, ANYOF_NXDIGIT);
73b437c8
JH
3854 else {
3855 for (value = 0; value < 256; value++)
3856 if (!isXDIGIT(value))
936ed897 3857 ANYOF_BITMAP_SET(ret, value);
73b437c8 3858 }
ffc61ed2 3859 Perl_sv_catpvf(aTHX_ listsv, "!utf8::IsXDigit\n");
73b437c8
JH
3860 break;
3861 default:
b45f050a 3862 vFAIL("Invalid [::] class");
73b437c8 3863 break;
b8c5462f 3864 }
b8c5462f 3865 if (LOC)
936ed897 3866 ANYOF_FLAGS(ret) |= ANYOF_CLASS;
73b437c8 3867 continue;
a0d0e21e 3868 }
ffc61ed2
JH
3869 } /* end of namedclass \blah */
3870
a0d0e21e 3871 if (range) {
3a3c4447 3872 if (prevvalue > value) /* b-a */ {
b45f050a 3873 Simple_vFAIL4("Invalid [] range \"%*.*s\"",
830247a4
IZ
3874 RExC_parse - rangebegin,
3875 RExC_parse - rangebegin,
b45f050a 3876 rangebegin);
3568d838 3877 range = 0; /* not a valid range */
73b437c8 3878 }
a0d0e21e
LW
3879 }
3880 else {
3568d838 3881 prevvalue = value; /* save the beginning of the range */
830247a4
IZ
3882 if (*RExC_parse == '-' && RExC_parse+1 < RExC_end &&
3883 RExC_parse[1] != ']') {
3884 RExC_parse++;
ffc61ed2
JH
3885
3886 /* a bad range like \w-, [:word:]- ? */
3887 if (namedclass > OOB_NAMEDCLASS) {
e476b1b5 3888 if (ckWARN(WARN_REGEXP))
830247a4 3889 vWARN4(RExC_parse,
b45f050a 3890 "False [] range \"%*.*s\"",
830247a4
IZ
3891 RExC_parse - rangebegin,
3892 RExC_parse - rangebegin,
b45f050a 3893 rangebegin);
73b437c8 3894 if (!SIZE_ONLY)
936ed897 3895 ANYOF_BITMAP_SET(ret, '-');
73b437c8 3896 } else
ffc61ed2
JH
3897 range = 1; /* yeah, it's a range! */
3898 continue; /* but do it the next time */
a0d0e21e 3899 }
a687059c 3900 }
ffc61ed2 3901
93733859 3902 /* now is the next time */
ae5c130c 3903 if (!SIZE_ONLY) {
3568d838
JH
3904 IV i;
3905
3906 if (prevvalue < 256) {
3907 IV ceilvalue = value < 256 ? value : 255;
3908
3909#ifdef EBCDIC
3a3c4447
JH
3910 if ((isLOWER(prevvalue) && isLOWER(ceilvalue)) ||
3911 (isUPPER(prevvalue) && isUPPER(ceilvalue)))
ffc61ed2 3912 {
3568d838
JH
3913 if (isLOWER(prevvalue)) {
3914 for (i = prevvalue; i <= ceilvalue; i++)
ffc61ed2
JH
3915 if (isLOWER(i))
3916 ANYOF_BITMAP_SET(ret, i);
3917 } else {
3568d838 3918 for (i = prevvalue; i <= ceilvalue; i++)
ffc61ed2
JH
3919 if (isUPPER(i))
3920 ANYOF_BITMAP_SET(ret, i);
3921 }
8ada0baa 3922 }
ffc61ed2 3923 else
8ada0baa 3924#endif
3568d838
JH
3925 for (i = prevvalue; i <= ceilvalue; i++)
3926 ANYOF_BITMAP_SET(ret, i);
3927 }
3928 if (value > 255) {
ffc61ed2 3929 ANYOF_FLAGS(ret) |= ANYOF_UNICODE;
3568d838 3930 if (prevvalue < value)
ffc61ed2 3931 Perl_sv_catpvf(aTHX_ listsv, "%04"UVxf"\t%04"UVxf"\n",
3568d838
JH
3932 (UV)prevvalue, (UV)value);
3933 else if (prevvalue == value)
ffc61ed2
JH
3934 Perl_sv_catpvf(aTHX_ listsv, "%04"UVxf"\n",
3935 (UV)value);
3936 }
8ada0baa 3937 }
ffc61ed2
JH
3938
3939 range = 0; /* this range (if it was one) is done now */
a0d0e21e 3940 }
ffc61ed2 3941
936ed897 3942 if (need_class) {
4f66b38d 3943 ANYOF_FLAGS(ret) |= ANYOF_LARGE;
936ed897 3944 if (SIZE_ONLY)
830247a4 3945 RExC_size += ANYOF_CLASS_ADD_SKIP;
936ed897 3946 else
830247a4 3947 RExC_emit += ANYOF_CLASS_ADD_SKIP;
936ed897 3948 }
ffc61ed2 3949
ae5c130c 3950 /* optimize case-insensitive simple patterns (e.g. /[a-z]/i) */
b8c5462f 3951 if (!SIZE_ONLY &&
ffc61ed2 3952 /* If the only flag is folding (plus possibly inversion). */
516a5887
JH
3953 ((ANYOF_FLAGS(ret) & (ANYOF_FLAGS_ALL ^ ANYOF_INVERT)) == ANYOF_FOLD)
3954 ) {
a0ed51b3 3955 for (value = 0; value < 256; ++value) {
936ed897 3956 if (ANYOF_BITMAP_TEST(ret, value)) {
ffc61ed2
JH
3957 IV fold = PL_fold[value];
3958
3959 if (fold != value)
3960 ANYOF_BITMAP_SET(ret, fold);
ae5c130c
GS
3961 }
3962 }
936ed897 3963 ANYOF_FLAGS(ret) &= ~ANYOF_FOLD;
ae5c130c 3964 }
ffc61ed2 3965
ae5c130c 3966 /* optimize inverted simple patterns (e.g. [^a-z]) */
3568d838 3967 if (!SIZE_ONLY && optimize_invert &&
ffc61ed2
JH
3968 /* If the only flag is inversion. */
3969 (ANYOF_FLAGS(ret) & ANYOF_FLAGS_ALL) == ANYOF_INVERT) {
b8c5462f 3970 for (value = 0; value < ANYOF_BITMAP_SIZE; ++value)
936ed897 3971 ANYOF_BITMAP(ret)[value] ^= ANYOF_FLAGS_ALL;
1aa99e6b 3972 ANYOF_FLAGS(ret) = ANYOF_UNICODE_ALL;
ae5c130c 3973 }
a0d0e21e 3974
b81d288d 3975 if (!SIZE_ONLY) {
fde631ed 3976 AV *av = newAV();
ffc61ed2
JH
3977 SV *rv;
3978
3979 av_store(av, 0, listsv);
3980 av_store(av, 1, NULL);
3981 rv = newRV_noinc((SV*)av);
19860706 3982 n = add_data(pRExC_state, 1, "s");
830247a4 3983 RExC_rx->data->data[n] = (void*)rv;
ffc61ed2 3984 ARG_SET(ret, n);
a0ed51b3
LW
3985 }
3986
3987 return ret;
3988}
3989
76e3520e 3990STATIC char*
830247a4 3991S_nextchar(pTHX_ RExC_state_t *pRExC_state)
a0d0e21e 3992{
830247a4 3993 char* retval = RExC_parse++;
a0d0e21e 3994
4633a7c4 3995 for (;;) {
830247a4
IZ
3996 if (*RExC_parse == '(' && RExC_parse[1] == '?' &&
3997 RExC_parse[2] == '#') {
3998 while (*RExC_parse && *RExC_parse != ')')
3999 RExC_parse++;
4000 RExC_parse++;
4633a7c4
LW
4001 continue;
4002 }
830247a4
IZ
4003 if (RExC_flags16 & PMf_EXTENDED) {
4004 if (isSPACE(*RExC_parse)) {
4005 RExC_parse++;
748a9306
LW
4006 continue;
4007 }
830247a4
IZ
4008 else if (*RExC_parse == '#') {
4009 while (*RExC_parse && *RExC_parse != '\n')
4010 RExC_parse++;
4011 RExC_parse++;
748a9306
LW
4012 continue;
4013 }
748a9306 4014 }
4633a7c4 4015 return retval;
a0d0e21e 4016 }
a687059c
LW
4017}
4018
4019/*
c277df42 4020- reg_node - emit a node
a0d0e21e 4021*/
76e3520e 4022STATIC regnode * /* Location. */
830247a4 4023S_reg_node(pTHX_ RExC_state_t *pRExC_state, U8 op)
a687059c 4024{
c277df42
IZ
4025 register regnode *ret;
4026 register regnode *ptr;
a687059c 4027
830247a4 4028 ret = RExC_emit;
c277df42 4029 if (SIZE_ONLY) {
830247a4
IZ
4030 SIZE_ALIGN(RExC_size);
4031 RExC_size += 1;
a0d0e21e
LW
4032 return(ret);
4033 }
a687059c 4034
c277df42 4035 NODE_ALIGN_FILL(ret);
a0d0e21e 4036 ptr = ret;
c277df42 4037 FILL_ADVANCE_NODE(ptr, op);
fac92740
MJD
4038 if (RExC_offsets) { /* MJD */
4039 MJD_OFFSET_DEBUG((stderr, "%s:%u: (op %s) %s %u <- %u (len %u) (max %u).\n",
4040 "reg_node", __LINE__,
4041 reg_name[op],
4042 RExC_emit - RExC_emit_start > RExC_offsets[0]
4043 ? "Overwriting end of array!\n" : "OK",
4044 RExC_emit - RExC_emit_start,
4045 RExC_parse - RExC_start,
4046 RExC_offsets[0]));
4047 Set_Node_Offset(RExC_emit, RExC_parse + (op == END));
4048 }
4049
830247a4 4050 RExC_emit = ptr;
a687059c 4051
a0d0e21e 4052 return(ret);
a687059c
LW
4053}
4054
4055/*
a0d0e21e
LW
4056- reganode - emit a node with an argument
4057*/
76e3520e 4058STATIC regnode * /* Location. */
830247a4 4059S_reganode(pTHX_ RExC_state_t *pRExC_state, U8 op, U32 arg)
fe14fcc3 4060{
c277df42
IZ
4061 register regnode *ret;
4062 register regnode *ptr;
fe14fcc3 4063
830247a4 4064 ret = RExC_emit;
c277df42 4065 if (SIZE_ONLY) {
830247a4
IZ
4066 SIZE_ALIGN(RExC_size);
4067 RExC_size += 2;
a0d0e21e
LW
4068 return(ret);
4069 }
fe14fcc3 4070
c277df42 4071 NODE_ALIGN_FILL(ret);
a0d0e21e 4072 ptr = ret;
c277df42 4073 FILL_ADVANCE_NODE_ARG(ptr, op, arg);
fac92740
MJD
4074 if (RExC_offsets) { /* MJD */
4075 MJD_OFFSET_DEBUG((stderr, "%s: %s %u <- %u (max %u).\n",
4076 "reganode",
4077 RExC_emit - RExC_emit_start > RExC_offsets[0] ?
4078 "Overwriting end of array!\n" : "OK",
4079 RExC_emit - RExC_emit_start,
4080 RExC_parse - RExC_start,
4081 RExC_offsets[0]));
4082 Set_Cur_Node_Offset;
4083 }
4084
830247a4 4085 RExC_emit = ptr;
fe14fcc3 4086
a0d0e21e 4087 return(ret);
fe14fcc3
LW
4088}
4089
4090/*
cd439c50 4091- reguni - emit (if appropriate) a Unicode character
a0ed51b3
LW
4092*/
4093STATIC void
830247a4 4094S_reguni(pTHX_ RExC_state_t *pRExC_state, UV uv, char* s, STRLEN* lenp)
a0ed51b3 4095{
5e12f4fb 4096 *lenp = SIZE_ONLY ? UNISKIP(uv) : (uvchr_to_utf8((U8*)s, uv) - (U8*)s);
a0ed51b3
LW
4097}
4098
4099/*
a0d0e21e
LW
4100- reginsert - insert an operator in front of already-emitted operand
4101*
4102* Means relocating the operand.
4103*/
76e3520e 4104STATIC void
830247a4 4105S_reginsert(pTHX_ RExC_state_t *pRExC_state, U8 op, regnode *opnd)
a687059c 4106{
c277df42
IZ
4107 register regnode *src;
4108 register regnode *dst;
4109 register regnode *place;
4110 register int offset = regarglen[(U8)op];
b81d288d 4111
22c35a8c 4112/* (PL_regkind[(U8)op] == CURLY ? EXTRA_STEP_2ARGS : 0); */
c277df42
IZ
4113
4114 if (SIZE_ONLY) {
830247a4 4115 RExC_size += NODE_STEP_REGNODE + offset;
a0d0e21e
LW
4116 return;
4117 }
a687059c 4118
830247a4
IZ
4119 src = RExC_emit;
4120 RExC_emit += NODE_STEP_REGNODE + offset;
4121 dst = RExC_emit;
fac92740 4122 while (src > opnd) {
c277df42 4123 StructCopy(--src, --dst, regnode);
fac92740
MJD
4124 if (RExC_offsets) { /* MJD 20010112 */
4125 MJD_OFFSET_DEBUG((stderr, "%s: %s copy %u -> %u (max %u).\n",
4126 "reg_insert",
4127 dst - RExC_emit_start > RExC_offsets[0]
4128 ? "Overwriting end of array!\n" : "OK",
4129 src - RExC_emit_start,
4130 dst - RExC_emit_start,
4131 RExC_offsets[0]));
4132 Set_Node_Offset_To_R(dst-RExC_emit_start, Node_Offset(src));
4133 Set_Node_Length_To_R(dst-RExC_emit_start, Node_Length(src));
4134 }
4135 }
4136
a0d0e21e
LW
4137
4138 place = opnd; /* Op node, where operand used to be. */
fac92740
MJD
4139 if (RExC_offsets) { /* MJD */
4140 MJD_OFFSET_DEBUG((stderr, "%s: %s %u <- %u (max %u).\n",
4141 "reginsert",
4142 place - RExC_emit_start > RExC_offsets[0]
4143 ? "Overwriting end of array!\n" : "OK",
4144 place - RExC_emit_start,
4145 RExC_parse - RExC_start,
4146 RExC_offsets[0]));
4147 Set_Node_Offset(place, RExC_parse);
4148 }
c277df42
IZ
4149 src = NEXTOPER(place);
4150 FILL_ADVANCE_NODE(place, op);
4151 Zero(src, offset, regnode);
a687059c
LW
4152}
4153
4154/*
c277df42 4155- regtail - set the next-pointer at the end of a node chain of p to val.
a0d0e21e 4156*/
76e3520e 4157STATIC void
830247a4 4158S_regtail(pTHX_ RExC_state_t *pRExC_state, regnode *p, regnode *val)
a687059c 4159{
c277df42
IZ
4160 register regnode *scan;
4161 register regnode *temp;
a0d0e21e 4162
c277df42 4163 if (SIZE_ONLY)
a0d0e21e
LW
4164 return;
4165
4166 /* Find last node. */
4167 scan = p;
4168 for (;;) {
4169 temp = regnext(scan);
4170 if (temp == NULL)
4171 break;
4172 scan = temp;
4173 }
a687059c 4174
c277df42
IZ
4175 if (reg_off_by_arg[OP(scan)]) {
4176 ARG_SET(scan, val - scan);
a0ed51b3
LW
4177 }
4178 else {
c277df42
IZ
4179 NEXT_OFF(scan) = val - scan;
4180 }
a687059c
LW
4181}
4182
4183/*
a0d0e21e
LW
4184- regoptail - regtail on operand of first argument; nop if operandless
4185*/
76e3520e 4186STATIC void
830247a4 4187S_regoptail(pTHX_ RExC_state_t *pRExC_state, regnode *p, regnode *val)
a687059c 4188{
a0d0e21e 4189 /* "Operandless" and "op != BRANCH" are synonymous in practice. */
c277df42
IZ
4190 if (p == NULL || SIZE_ONLY)
4191 return;
22c35a8c 4192 if (PL_regkind[(U8)OP(p)] == BRANCH) {
830247a4 4193 regtail(pRExC_state, NEXTOPER(p), val);
a0ed51b3 4194 }
22c35a8c 4195 else if ( PL_regkind[(U8)OP(p)] == BRANCHJ) {
830247a4 4196 regtail(pRExC_state, NEXTOPER(NEXTOPER(p)), val);
a0ed51b3
LW
4197 }
4198 else
a0d0e21e 4199 return;
a687059c
LW
4200}
4201
4202/*
4203 - regcurly - a little FSA that accepts {\d+,?\d*}
4204 */
79072805 4205STATIC I32
cea2e8a9 4206S_regcurly(pTHX_ register char *s)
a687059c
LW
4207{
4208 if (*s++ != '{')
4209 return FALSE;
f0fcb552 4210 if (!isDIGIT(*s))
a687059c 4211 return FALSE;
f0fcb552 4212 while (isDIGIT(*s))
a687059c
LW
4213 s++;
4214 if (*s == ',')
4215 s++;
f0fcb552 4216 while (isDIGIT(*s))
a687059c
LW
4217 s++;
4218 if (*s != '}')
4219 return FALSE;
4220 return TRUE;
4221}
4222
a687059c 4223
8fa7f367
JH
4224#ifdef DEBUGGING
4225
76e3520e 4226STATIC regnode *
cea2e8a9 4227S_dumpuntil(pTHX_ regnode *start, regnode *node, regnode *last, SV* sv, I32 l)
c277df42 4228{
f248d071 4229 register U8 op = EXACT; /* Arbitrary non-END op. */
155aba94 4230 register regnode *next;
c277df42
IZ
4231
4232 while (op != END && (!last || node < last)) {
4233 /* While that wasn't END last time... */
4234
4235 NODE_ALIGN(node);
4236 op = OP(node);
4237 if (op == CLOSE)
4238 l--;
4239 next = regnext(node);
4240 /* Where, what. */
4241 if (OP(node) == OPTIMIZED)
4242 goto after_print;
4243 regprop(sv, node);
b900a521 4244 PerlIO_printf(Perl_debug_log, "%4"IVdf":%*s%s", (IV)(node - start),
f1dbda3d 4245 (int)(2*l + 1), "", SvPVX(sv));
c277df42
IZ
4246 if (next == NULL) /* Next ptr. */
4247 PerlIO_printf(Perl_debug_log, "(0)");
b81d288d 4248 else
b900a521 4249 PerlIO_printf(Perl_debug_log, "(%"IVdf")", (IV)(next - start));
c277df42
IZ
4250 (void)PerlIO_putc(Perl_debug_log, '\n');
4251 after_print:
22c35a8c 4252 if (PL_regkind[(U8)op] == BRANCHJ) {
b81d288d
AB
4253 register regnode *nnode = (OP(next) == LONGJMP
4254 ? regnext(next)
c277df42
IZ
4255 : next);
4256 if (last && nnode > last)
4257 nnode = last;
4258 node = dumpuntil(start, NEXTOPER(NEXTOPER(node)), nnode, sv, l + 1);
a0ed51b3 4259 }
22c35a8c 4260 else if (PL_regkind[(U8)op] == BRANCH) {
c277df42 4261 node = dumpuntil(start, NEXTOPER(node), next, sv, l + 1);
a0ed51b3
LW
4262 }
4263 else if ( op == CURLY) { /* `next' might be very big: optimizer */
c277df42
IZ
4264 node = dumpuntil(start, NEXTOPER(node) + EXTRA_STEP_2ARGS,
4265 NEXTOPER(node) + EXTRA_STEP_2ARGS + 1, sv, l + 1);
a0ed51b3 4266 }
22c35a8c 4267 else if (PL_regkind[(U8)op] == CURLY && op != CURLYX) {
c277df42
IZ
4268 node = dumpuntil(start, NEXTOPER(node) + EXTRA_STEP_2ARGS,
4269 next, sv, l + 1);
a0ed51b3
LW
4270 }
4271 else if ( op == PLUS || op == STAR) {
c277df42 4272 node = dumpuntil(start, NEXTOPER(node), NEXTOPER(node) + 1, sv, l + 1);
a0ed51b3
LW
4273 }
4274 else if (op == ANYOF) {
4f66b38d
HS
4275 /* arglen 1 + class block */
4276 node += 1 + ((ANYOF_FLAGS(node) & ANYOF_LARGE)
4277 ? ANYOF_CLASS_SKIP : ANYOF_SKIP);
4278 node = NEXTOPER(node);
a0ed51b3 4279 }
22c35a8c 4280 else if (PL_regkind[(U8)op] == EXACT) {
c277df42 4281 /* Literal string, where present. */
cd439c50 4282 node += NODE_SZ_STR(node) - 1;
c277df42 4283 node = NEXTOPER(node);
a0ed51b3
LW
4284 }
4285 else {
c277df42
IZ
4286 node = NEXTOPER(node);
4287 node += regarglen[(U8)op];
4288 }
4289 if (op == CURLYX || op == OPEN)
4290 l++;
4291 else if (op == WHILEM)
4292 l--;
4293 }
4294 return node;
4295}
4296
8fa7f367
JH
4297#endif /* DEBUGGING */
4298
a687059c 4299/*
fd181c75 4300 - regdump - dump a regexp onto Perl_debug_log in vaguely comprehensible form
a687059c
LW
4301 */
4302void
864dbfa3 4303Perl_regdump(pTHX_ regexp *r)
a687059c 4304{
35ff7856 4305#ifdef DEBUGGING
46fc3d4c 4306 SV *sv = sv_newmortal();
a687059c 4307
c277df42 4308 (void)dumpuntil(r->program, r->program + 1, NULL, sv, 0);
a0d0e21e
LW
4309
4310 /* Header fields of interest. */
c277df42 4311 if (r->anchored_substr)
7b0972df 4312 PerlIO_printf(Perl_debug_log,
b81d288d 4313 "anchored `%s%.*s%s'%s at %"IVdf" ",
3280af22 4314 PL_colors[0],
7b0972df 4315 (int)(SvCUR(r->anchored_substr) - (SvTAIL(r->anchored_substr)!=0)),
b81d288d 4316 SvPVX(r->anchored_substr),
3280af22 4317 PL_colors[1],
c277df42 4318 SvTAIL(r->anchored_substr) ? "$" : "",
7b0972df 4319 (IV)r->anchored_offset);
c277df42 4320 if (r->float_substr)
7b0972df 4321 PerlIO_printf(Perl_debug_log,
b81d288d 4322 "floating `%s%.*s%s'%s at %"IVdf"..%"UVuf" ",
3280af22 4323 PL_colors[0],
b81d288d 4324 (int)(SvCUR(r->float_substr) - (SvTAIL(r->float_substr)!=0)),
2c2d71f5 4325 SvPVX(r->float_substr),
3280af22 4326 PL_colors[1],
c277df42 4327 SvTAIL(r->float_substr) ? "$" : "",
7b0972df 4328 (IV)r->float_min_offset, (UV)r->float_max_offset);
c277df42 4329 if (r->check_substr)
b81d288d
AB
4330 PerlIO_printf(Perl_debug_log,
4331 r->check_substr == r->float_substr
c277df42
IZ
4332 ? "(checking floating" : "(checking anchored");
4333 if (r->reganch & ROPT_NOSCAN)
4334 PerlIO_printf(Perl_debug_log, " noscan");
4335 if (r->reganch & ROPT_CHECK_ALL)
4336 PerlIO_printf(Perl_debug_log, " isall");
4337 if (r->check_substr)
4338 PerlIO_printf(Perl_debug_log, ") ");
4339
46fc3d4c 4340 if (r->regstclass) {
4341 regprop(sv, r->regstclass);
4342 PerlIO_printf(Perl_debug_log, "stclass `%s' ", SvPVX(sv));
4343 }
774d564b 4344 if (r->reganch & ROPT_ANCH) {
4345 PerlIO_printf(Perl_debug_log, "anchored");
4346 if (r->reganch & ROPT_ANCH_BOL)
4347 PerlIO_printf(Perl_debug_log, "(BOL)");
c277df42
IZ
4348 if (r->reganch & ROPT_ANCH_MBOL)
4349 PerlIO_printf(Perl_debug_log, "(MBOL)");
cad2e5aa
JH
4350 if (r->reganch & ROPT_ANCH_SBOL)
4351 PerlIO_printf(Perl_debug_log, "(SBOL)");
774d564b 4352 if (r->reganch & ROPT_ANCH_GPOS)
4353 PerlIO_printf(Perl_debug_log, "(GPOS)");
4354 PerlIO_putc(Perl_debug_log, ' ');
4355 }
c277df42
IZ
4356 if (r->reganch & ROPT_GPOS_SEEN)
4357 PerlIO_printf(Perl_debug_log, "GPOS ");
a0d0e21e 4358 if (r->reganch & ROPT_SKIP)
760ac839 4359 PerlIO_printf(Perl_debug_log, "plus ");
a0d0e21e 4360 if (r->reganch & ROPT_IMPLICIT)
760ac839 4361 PerlIO_printf(Perl_debug_log, "implicit ");
760ac839 4362 PerlIO_printf(Perl_debug_log, "minlen %ld ", (long) r->minlen);
ce862d02
IZ
4363 if (r->reganch & ROPT_EVAL_SEEN)
4364 PerlIO_printf(Perl_debug_log, "with eval ");
760ac839 4365 PerlIO_printf(Perl_debug_log, "\n");
fac92740
MJD
4366 if (r->offsets) {
4367 U32 i;
4368 U32 len = r->offsets[0];
392fbf5d 4369 PerlIO_printf(Perl_debug_log, "Offsets: [%"UVuf"]\n\t", (UV)r->offsets[0]);
fac92740 4370 for (i = 1; i <= len; i++)
392fbf5d
RB
4371 PerlIO_printf(Perl_debug_log, "%"UVuf"[%"UVuf"] ",
4372 (UV)r->offsets[i*2-1],
4373 (UV)r->offsets[i*2]);
fac92740
MJD
4374 PerlIO_printf(Perl_debug_log, "\n");
4375 }
17c3b450 4376#endif /* DEBUGGING */
a687059c
LW
4377}
4378
8fa7f367
JH
4379#ifdef DEBUGGING
4380
653099ff
GS
4381STATIC void
4382S_put_byte(pTHX_ SV *sv, int c)
4383{
7be5a6cf 4384 if (isCNTRL(c) || c == 255 || !isPRINT(c))
653099ff
GS
4385 Perl_sv_catpvf(aTHX_ sv, "\\%o", c);
4386 else if (c == '-' || c == ']' || c == '\\' || c == '^')
4387 Perl_sv_catpvf(aTHX_ sv, "\\%c", c);
4388 else
4389 Perl_sv_catpvf(aTHX_ sv, "%c", c);
4390}
4391
8fa7f367
JH
4392#endif /* DEBUGGING */
4393
a687059c 4394/*
a0d0e21e
LW
4395- regprop - printable representation of opcode
4396*/
46fc3d4c 4397void
864dbfa3 4398Perl_regprop(pTHX_ SV *sv, regnode *o)
a687059c 4399{
35ff7856 4400#ifdef DEBUGGING
9b155405 4401 register int k;
a0d0e21e 4402
54dc92de 4403 sv_setpvn(sv, "", 0);
9b155405 4404 if (OP(o) >= reg_num) /* regnode.type is unsigned */
830247a4
IZ
4405 /* It would be nice to FAIL() here, but this may be called from
4406 regexec.c, and it would be hard to supply pRExC_state. */
4407 Perl_croak(aTHX_ "Corrupted regexp opcode");
9b155405
IZ
4408 sv_catpv(sv, (char*)reg_name[OP(o)]); /* Take off const! */
4409
4410 k = PL_regkind[(U8)OP(o)];
4411
4412 if (k == EXACT)
7821416a 4413 Perl_sv_catpvf(aTHX_ sv, " <%s%.*s%s>", PL_colors[0],
cd439c50 4414 STR_LEN(o), STRING(o), PL_colors[1]);
9b155405 4415 else if (k == CURLY) {
cb434fcc 4416 if (OP(o) == CURLYM || OP(o) == CURLYN || OP(o) == CURLYX)
cea2e8a9
GS
4417 Perl_sv_catpvf(aTHX_ sv, "[%d]", o->flags); /* Parenth number */
4418 Perl_sv_catpvf(aTHX_ sv, " {%d,%d}", ARG1(o), ARG2(o));
a0d0e21e 4419 }
2c2d71f5
JH
4420 else if (k == WHILEM && o->flags) /* Ordinal/of */
4421 Perl_sv_catpvf(aTHX_ sv, "[%d/%d]", o->flags & 0xf, o->flags>>4);
9b155405 4422 else if (k == REF || k == OPEN || k == CLOSE || k == GROUPP )
894356b3 4423 Perl_sv_catpvf(aTHX_ sv, "%d", (int)ARG(o)); /* Parenth number */
9b155405 4424 else if (k == LOGICAL)
04ebc1ab 4425 Perl_sv_catpvf(aTHX_ sv, "[%d]", o->flags); /* 2: embedded, otherwise 1 */
653099ff
GS
4426 else if (k == ANYOF) {
4427 int i, rangestart = -1;
ffc61ed2 4428 U8 flags = ANYOF_FLAGS(o);
19860706
JH
4429 const char * const anyofs[] = { /* Should be syncronized with
4430 * ANYOF_ #xdefines in regcomp.h */
653099ff
GS
4431 "\\w",
4432 "\\W",
4433 "\\s",
4434 "\\S",
4435 "\\d",
4436 "\\D",
4437 "[:alnum:]",
4438 "[:^alnum:]",
4439 "[:alpha:]",
4440 "[:^alpha:]",
4441 "[:ascii:]",
4442 "[:^ascii:]",
4443 "[:ctrl:]",
4444 "[:^ctrl:]",
4445 "[:graph:]",
4446 "[:^graph:]",
4447 "[:lower:]",
4448 "[:^lower:]",
4449 "[:print:]",
4450 "[:^print:]",
4451 "[:punct:]",
4452 "[:^punct:]",
4453 "[:upper:]",
aaa51d5e 4454 "[:^upper:]",
653099ff 4455 "[:xdigit:]",
aaa51d5e
JF
4456 "[:^xdigit:]",
4457 "[:space:]",
4458 "[:^space:]",
4459 "[:blank:]",
4460 "[:^blank:]"
653099ff
GS
4461 };
4462
19860706 4463 if (flags & ANYOF_LOCALE)
653099ff 4464 sv_catpv(sv, "{loc}");
19860706 4465 if (flags & ANYOF_FOLD)
653099ff
GS
4466 sv_catpv(sv, "{i}");
4467 Perl_sv_catpvf(aTHX_ sv, "[%s", PL_colors[0]);
19860706 4468 if (flags & ANYOF_INVERT)
653099ff 4469 sv_catpv(sv, "^");
ffc61ed2
JH
4470 for (i = 0; i <= 256; i++) {
4471 if (i < 256 && ANYOF_BITMAP_TEST(o,i)) {
4472 if (rangestart == -1)
4473 rangestart = i;
4474 } else if (rangestart != -1) {
4475 if (i <= rangestart + 3)
4476 for (; rangestart < i; rangestart++)
653099ff 4477 put_byte(sv, rangestart);
ffc61ed2
JH
4478 else {
4479 put_byte(sv, rangestart);
4480 sv_catpv(sv, "-");
4481 put_byte(sv, i - 1);
653099ff 4482 }
ffc61ed2 4483 rangestart = -1;
653099ff 4484 }
847a199f 4485 }
ffc61ed2
JH
4486
4487 if (o->flags & ANYOF_CLASS)
4488 for (i = 0; i < sizeof(anyofs)/sizeof(char*); i++)
4489 if (ANYOF_CLASS_TEST(o,i))
4490 sv_catpv(sv, anyofs[i]);
4491
4492 if (flags & ANYOF_UNICODE)
4493 sv_catpv(sv, "{unicode}");
1aa99e6b
IH
4494 else if (flags & ANYOF_UNICODE_ALL)
4495 sv_catpv(sv, "{all-unicode}");
ffc61ed2
JH
4496
4497 {
4498 SV *lv;
4499 SV *sw = regclass_swash(o, FALSE, &lv);
b81d288d 4500
ffc61ed2
JH
4501 if (lv) {
4502 if (sw) {
4503 UV i;
4504 U8 s[UTF8_MAXLEN+1];
b81d288d 4505
ffc61ed2 4506 for (i = 0; i <= 256; i++) { /* just the first 256 */
2b9d42f0 4507 U8 *e = uvchr_to_utf8(s, i);
ffc61ed2 4508
3568d838 4509 if (i < 256 && swash_fetch(sw, s, TRUE)) {
ffc61ed2
JH
4510 if (rangestart == -1)
4511 rangestart = i;
4512 } else if (rangestart != -1) {
4513 U8 *p;
b81d288d 4514
ffc61ed2
JH
4515 if (i <= rangestart + 3)
4516 for (; rangestart < i; rangestart++) {
2b9d42f0 4517 for(e = uvchr_to_utf8(s, rangestart), p = s; p < e; p++)
ffc61ed2
JH
4518 put_byte(sv, *p);
4519 }
4520 else {
2b9d42f0 4521 for (e = uvchr_to_utf8(s, rangestart), p = s; p < e; p++)
ffc61ed2
JH
4522 put_byte(sv, *p);
4523 sv_catpv(sv, "-");
2b9d42f0 4524 for (e = uvchr_to_utf8(s, i - 1), p = s; p < e; p++)
ffc61ed2
JH
4525 put_byte(sv, *p);
4526 }
4527 rangestart = -1;
4528 }
19860706 4529 }
ffc61ed2
JH
4530
4531 sv_catpv(sv, "..."); /* et cetera */
19860706 4532 }
fde631ed 4533
ffc61ed2
JH
4534 {
4535 char *s = savepv(SvPVX(lv));
4536 char *origs = s;
b81d288d 4537
ffc61ed2 4538 while(*s && *s != '\n') s++;
b81d288d 4539
ffc61ed2
JH
4540 if (*s == '\n') {
4541 char *t = ++s;
4542
4543 while (*s) {
4544 if (*s == '\n')
4545 *s = ' ';
4546 s++;
4547 }
4548 if (s[-1] == ' ')
4549 s[-1] = 0;
4550
4551 sv_catpv(sv, t);
fde631ed 4552 }
b81d288d 4553
ffc61ed2 4554 Safefree(origs);
fde631ed
JH
4555 }
4556 }
653099ff 4557 }
ffc61ed2 4558
653099ff
GS
4559 Perl_sv_catpvf(aTHX_ sv, "%s]", PL_colors[1]);
4560 }
9b155405 4561 else if (k == BRANCHJ && (OP(o) == UNLESSM || OP(o) == IFMATCH))
cea2e8a9 4562 Perl_sv_catpvf(aTHX_ sv, "[-%d]", o->flags);
17c3b450 4563#endif /* DEBUGGING */
35ff7856 4564}
a687059c 4565
cad2e5aa
JH
4566SV *
4567Perl_re_intuit_string(pTHX_ regexp *prog)
4568{ /* Assume that RE_INTUIT is set */
4569 DEBUG_r(
4570 { STRLEN n_a;
4571 char *s = SvPV(prog->check_substr,n_a);
4572
4573 if (!PL_colorset) reginitcolors();
4574 PerlIO_printf(Perl_debug_log,
4575 "%sUsing REx substr:%s `%s%.60s%s%s'\n",
4576 PL_colors[4],PL_colors[5],PL_colors[0],
4577 s,
4578 PL_colors[1],
4579 (strlen(s) > 60 ? "..." : ""));
4580 } );
4581
4582 return prog->check_substr;
4583}
4584
2b69d0c2 4585void
864dbfa3 4586Perl_pregfree(pTHX_ struct regexp *r)
a687059c 4587{
adac82c7 4588 DEBUG_r(if (!PL_colorset) reginitcolors());
7821416a
IZ
4589
4590 if (!r || (--r->refcnt > 0))
4591 return;
cad2e5aa
JH
4592 DEBUG_r(PerlIO_printf(Perl_debug_log,
4593 "%sFreeing REx:%s `%s%.60s%s%s'\n",
4594 PL_colors[4],PL_colors[5],PL_colors[0],
4595 r->precomp,
4596 PL_colors[1],
4597 (strlen(r->precomp) > 60 ? "..." : "")));
4598
c277df42 4599 if (r->precomp)
a0d0e21e 4600 Safefree(r->precomp);
fac92740
MJD
4601 if (r->offsets) /* 20010421 MJD */
4602 Safefree(r->offsets);
cf93c79d
IZ
4603 if (RX_MATCH_COPIED(r))
4604 Safefree(r->subbeg);
a193d654
GS
4605 if (r->substrs) {
4606 if (r->anchored_substr)
4607 SvREFCNT_dec(r->anchored_substr);
4608 if (r->float_substr)
4609 SvREFCNT_dec(r->float_substr);
2779dcf1 4610 Safefree(r->substrs);
a193d654 4611 }
c277df42
IZ
4612 if (r->data) {
4613 int n = r->data->count;
dfad63ad
HS
4614 AV* new_comppad = NULL;
4615 AV* old_comppad;
4616 SV** old_curpad;
4617
c277df42 4618 while (--n >= 0) {
261faec3 4619 /* If you add a ->what type here, update the comment in regcomp.h */
c277df42
IZ
4620 switch (r->data->what[n]) {
4621 case 's':
4622 SvREFCNT_dec((SV*)r->data->data[n]);
4623 break;
653099ff
GS
4624 case 'f':
4625 Safefree(r->data->data[n]);
4626 break;
dfad63ad
HS
4627 case 'p':
4628 new_comppad = (AV*)r->data->data[n];
4629 break;
c277df42 4630 case 'o':
dfad63ad 4631 if (new_comppad == NULL)
cea2e8a9 4632 Perl_croak(aTHX_ "panic: pregfree comppad");
dfad63ad
HS
4633 old_comppad = PL_comppad;
4634 old_curpad = PL_curpad;
1e6dc0b6
SB
4635 /* Watch out for global destruction's random ordering. */
4636 if (SvTYPE(new_comppad) == SVt_PVAV) {
4637 PL_comppad = new_comppad;
4638 PL_curpad = AvARRAY(new_comppad);
4639 }
4640 else
4641 PL_curpad = NULL;
9b978d73
DM
4642
4643 if (!OpREFCNT_dec((OP_4tree*)r->data->data[n])) {
4644 op_free((OP_4tree*)r->data->data[n]);
4645 }
4646
dfad63ad
HS
4647 PL_comppad = old_comppad;
4648 PL_curpad = old_curpad;
4649 SvREFCNT_dec((SV*)new_comppad);
4650 new_comppad = NULL;
c277df42
IZ
4651 break;
4652 case 'n':
4653 break;
4654 default:
830247a4 4655 Perl_croak(aTHX_ "panic: regfree data code '%c'", r->data->what[n]);
c277df42
IZ
4656 }
4657 }
4658 Safefree(r->data->what);
4659 Safefree(r->data);
a0d0e21e
LW
4660 }
4661 Safefree(r->startp);
4662 Safefree(r->endp);
4663 Safefree(r);
a687059c 4664}
c277df42
IZ
4665
4666/*
4667 - regnext - dig the "next" pointer out of a node
4668 *
4669 * [Note, when REGALIGN is defined there are two places in regmatch()
4670 * that bypass this code for speed.]
4671 */
4672regnode *
864dbfa3 4673Perl_regnext(pTHX_ register regnode *p)
c277df42
IZ
4674{
4675 register I32 offset;
4676
3280af22 4677 if (p == &PL_regdummy)
c277df42
IZ
4678 return(NULL);
4679
4680 offset = (reg_off_by_arg[OP(p)] ? ARG(p) : NEXT_OFF(p));
4681 if (offset == 0)
4682 return(NULL);
4683
c277df42 4684 return(p+offset);
c277df42
IZ
4685}
4686
01f988be 4687STATIC void
cea2e8a9 4688S_re_croak2(pTHX_ const char* pat1,const char* pat2,...)
c277df42
IZ
4689{
4690 va_list args;
4691 STRLEN l1 = strlen(pat1);
4692 STRLEN l2 = strlen(pat2);
4693 char buf[512];
06bf62c7 4694 SV *msv;
c277df42
IZ
4695 char *message;
4696
4697 if (l1 > 510)
4698 l1 = 510;
4699 if (l1 + l2 > 510)
4700 l2 = 510 - l1;
4701 Copy(pat1, buf, l1 , char);
4702 Copy(pat2, buf + l1, l2 , char);
3b818b81
GS
4703 buf[l1 + l2] = '\n';
4704 buf[l1 + l2 + 1] = '\0';
8736538c
AS
4705#ifdef I_STDARG
4706 /* ANSI variant takes additional second argument */
c277df42 4707 va_start(args, pat2);
8736538c
AS
4708#else
4709 va_start(args);
4710#endif
5a844595 4711 msv = vmess(buf, &args);
c277df42 4712 va_end(args);
06bf62c7 4713 message = SvPV(msv,l1);
c277df42
IZ
4714 if (l1 > 512)
4715 l1 = 512;
4716 Copy(message, buf, l1 , char);
4717 buf[l1] = '\0'; /* Overwrite \n */
cea2e8a9 4718 Perl_croak(aTHX_ "%s", buf);
c277df42 4719}
a0ed51b3
LW
4720
4721/* XXX Here's a total kludge. But we need to re-enter for swash routines. */
4722
4723void
864dbfa3 4724Perl_save_re_context(pTHX)
b81d288d 4725{
830247a4
IZ
4726#if 0
4727 SAVEPPTR(RExC_precomp); /* uncompiled string. */
4728 SAVEI32(RExC_npar); /* () count. */
4729 SAVEI32(RExC_size); /* Code size. */
4730 SAVEI16(RExC_flags16); /* are we folding, multilining? */
4731 SAVEVPTR(RExC_rx); /* from regcomp.c */
4732 SAVEI32(RExC_seen); /* from regcomp.c */
4733 SAVEI32(RExC_sawback); /* Did we see \1, ...? */
4734 SAVEI32(RExC_naughty); /* How bad is this pattern? */
4735 SAVEVPTR(RExC_emit); /* Code-emit pointer; &regdummy = don't */
4736 SAVEPPTR(RExC_end); /* End of input for compile */
4737 SAVEPPTR(RExC_parse); /* Input-scan pointer. */
4738#endif
4739
4740 SAVEI32(PL_reg_flags); /* from regexec.c */
a0ed51b3 4741 SAVEPPTR(PL_bostr);
a0ed51b3
LW
4742 SAVEPPTR(PL_reginput); /* String-input pointer. */
4743 SAVEPPTR(PL_regbol); /* Beginning of input, for ^ check. */
4744 SAVEPPTR(PL_regeol); /* End of input, for $ check. */
7766f137
GS
4745 SAVEVPTR(PL_regstartp); /* Pointer to startp array. */
4746 SAVEVPTR(PL_regendp); /* Ditto for endp. */
4747 SAVEVPTR(PL_reglastparen); /* Similarly for lastparen. */
a0ed51b3 4748 SAVEPPTR(PL_regtill); /* How far we are required to go. */
b81d288d 4749 SAVEGENERICPV(PL_reg_start_tmp); /* from regexec.c */
a0ed51b3 4750 PL_reg_start_tmp = 0;
a0ed51b3
LW
4751 SAVEI32(PL_reg_start_tmpl); /* from regexec.c */
4752 PL_reg_start_tmpl = 0;
7766f137 4753 SAVEVPTR(PL_regdata);
a0ed51b3
LW
4754 SAVEI32(PL_reg_eval_set); /* from regexec.c */
4755 SAVEI32(PL_regnarrate); /* from regexec.c */
7766f137 4756 SAVEVPTR(PL_regprogram); /* from regexec.c */
a0ed51b3 4757 SAVEINT(PL_regindent); /* from regexec.c */
7766f137
GS
4758 SAVEVPTR(PL_regcc); /* from regexec.c */
4759 SAVEVPTR(PL_curcop);
7766f137
GS
4760 SAVEVPTR(PL_reg_call_cc); /* from regexec.c */
4761 SAVEVPTR(PL_reg_re); /* from regexec.c */
54b6e2fa
IZ
4762 SAVEPPTR(PL_reg_ganch); /* from regexec.c */
4763 SAVESPTR(PL_reg_sv); /* from regexec.c */
53c4c00c 4764 SAVEI8(PL_reg_match_utf8); /* from regexec.c */
7766f137 4765 SAVEVPTR(PL_reg_magic); /* from regexec.c */
54b6e2fa 4766 SAVEI32(PL_reg_oldpos); /* from regexec.c */
7766f137
GS
4767 SAVEVPTR(PL_reg_oldcurpm); /* from regexec.c */
4768 SAVEVPTR(PL_reg_curpm); /* from regexec.c */
5fb7366e 4769 SAVEI32(PL_regnpar); /* () count. */
e49a9654 4770 SAVEI32(PL_regsize); /* from regexec.c */
54b6e2fa 4771#ifdef DEBUGGING
b81d288d 4772 SAVEPPTR(PL_reg_starttry); /* from regexec.c */
54b6e2fa 4773#endif
a0ed51b3 4774}
51371543 4775
51371543 4776static void
acfe0abc 4777clear_re(pTHX_ void *r)
51371543
GS
4778{
4779 ReREFCNT_dec((regexp *)r);
4780}
ffbc6a93 4781