This is a live mirror of the Perl 5 development currently hosted at https://github.com/perl/perl5
Add Tests for the less Pragma
[perl5.git] / regcomp.c
CommitLineData
a0d0e21e
LW
1/* regcomp.c
2 */
3
4/*
5 * "A fair jaw-cracker dwarf-language must be." --Samwise Gamgee
6 */
7
a687059c
LW
8/* NOTE: this is derived from Henry Spencer's regexp code, and should not
9 * confused with the original package (see point 3 below). Thanks, Henry!
10 */
11
12/* Additional note: this code is very heavily munged from Henry's version
13 * in places. In some spots I've traded clarity for efficiency, so don't
14 * blame Henry for some of the lack of readability.
15 */
16
e50aee73
AD
17/* The names of the functions have been changed from regcomp and
18 * regexec to pregcomp and pregexec in order to avoid conflicts
19 * with the POSIX routines of the same names.
20*/
21
b9d5759e
AD
22#ifdef PERL_EXT_RE_BUILD
23/* need to replace pregcomp et al, so enable that */
24# ifndef PERL_IN_XSUB_RE
25# define PERL_IN_XSUB_RE
26# endif
27/* need access to debugger hooks */
cad2e5aa 28# if defined(PERL_EXT_RE_DEBUG) && !defined(DEBUGGING)
b9d5759e
AD
29# define DEBUGGING
30# endif
31#endif
32
33#ifdef PERL_IN_XSUB_RE
d06ea78c 34/* We *really* need to overwrite these symbols: */
56953603
IZ
35# define Perl_pregcomp my_regcomp
36# define Perl_regdump my_regdump
37# define Perl_regprop my_regprop
d06ea78c 38# define Perl_pregfree my_regfree
cad2e5aa
JH
39# define Perl_re_intuit_string my_re_intuit_string
40/* *These* symbols are masked to allow static link. */
d06ea78c 41# define Perl_regnext my_regnext
f0b8d043 42# define Perl_save_re_context my_save_re_context
b81d288d 43# define Perl_reginitcolors my_reginitcolors
c5be433b
GS
44
45# define PERL_NO_GET_CONTEXT
b81d288d 46#endif
56953603 47
f0fcb552 48/*SUPPRESS 112*/
a687059c 49/*
e50aee73 50 * pregcomp and pregexec -- regsub and regerror are not used in perl
a687059c
LW
51 *
52 * Copyright (c) 1986 by University of Toronto.
53 * Written by Henry Spencer. Not derived from licensed software.
54 *
55 * Permission is granted to anyone to use this software for any
56 * purpose on any computer system, and to redistribute it freely,
57 * subject to the following restrictions:
58 *
59 * 1. The author is not responsible for the consequences of use of
60 * this software, no matter how awful, even if they arise
61 * from defects in it.
62 *
63 * 2. The origin of this software must not be misrepresented, either
64 * by explicit claim or by omission.
65 *
66 * 3. Altered versions must be plainly marked as such, and must not
67 * be misrepresented as being the original software.
68 *
69 *
70 **** Alterations to Henry's code are...
71 ****
bc89e66f 72 **** Copyright (c) 1991-2001, Larry Wall
a687059c 73 ****
9ef589d8
LW
74 **** You may distribute under the terms of either the GNU General Public
75 **** License or the Artistic License, as specified in the README file.
76
a687059c
LW
77 *
78 * Beware that some of this code is subtly aware of the way operator
79 * precedence is structured in regular expressions. Serious changes in
80 * regular-expression syntax might require a total rethink.
81 */
82#include "EXTERN.h"
864dbfa3 83#define PERL_IN_REGCOMP_C
a687059c 84#include "perl.h"
d06ea78c 85
acfe0abc 86#ifndef PERL_IN_XSUB_RE
d06ea78c
GS
87# include "INTERN.h"
88#endif
c277df42
IZ
89
90#define REG_COMP_C
a687059c
LW
91#include "regcomp.h"
92
d4cce5f1 93#ifdef op
11343788 94#undef op
d4cce5f1 95#endif /* op */
11343788 96
fe14fcc3
LW
97#ifdef MSDOS
98# if defined(BUGGY_MSC6)
99 /* MSC 6.00A breaks on op/regexp.t test 85 unless we turn this off */
100 # pragma optimize("a",off)
101 /* But MSC 6.00A is happy with 'w', for aliases only across function calls*/
102 # pragma optimize("w",on )
103# endif /* BUGGY_MSC6 */
104#endif /* MSDOS */
105
a687059c
LW
106#ifndef STATIC
107#define STATIC static
108#endif
109
830247a4
IZ
110typedef struct RExC_state_t {
111 U16 flags16; /* are we folding, multilining? */
112 char *precomp; /* uncompiled string. */
113 regexp *rx;
fac92740 114 char *start; /* Start of input for compile */
830247a4
IZ
115 char *end; /* End of input for compile */
116 char *parse; /* Input-scan pointer. */
117 I32 whilem_seen; /* number of WHILEM in this expr */
fac92740 118 regnode *emit_start; /* Start of emitted-code area */
ffc61ed2 119 regnode *emit; /* Code-emit pointer; &regdummy = don't = compiling */
830247a4
IZ
120 I32 naughty; /* How bad is this pattern? */
121 I32 sawback; /* Did we see \1, ...? */
122 U32 seen;
123 I32 size; /* Code size. */
124 I32 npar; /* () count. */
125 I32 extralen;
126 I32 seen_zerolen;
127 I32 seen_evals;
1aa99e6b 128 I32 utf8;
830247a4
IZ
129#if ADD_TO_REGEXEC
130 char *starttry; /* -Dr: where regtry was called. */
131#define RExC_starttry (pRExC_state->starttry)
132#endif
133} RExC_state_t;
134
135#define RExC_flags16 (pRExC_state->flags16)
136#define RExC_precomp (pRExC_state->precomp)
137#define RExC_rx (pRExC_state->rx)
fac92740 138#define RExC_start (pRExC_state->start)
830247a4
IZ
139#define RExC_end (pRExC_state->end)
140#define RExC_parse (pRExC_state->parse)
141#define RExC_whilem_seen (pRExC_state->whilem_seen)
fac92740 142#define RExC_offsets (pRExC_state->rx->offsets) /* I am not like the others */
830247a4 143#define RExC_emit (pRExC_state->emit)
fac92740 144#define RExC_emit_start (pRExC_state->emit_start)
830247a4
IZ
145#define RExC_naughty (pRExC_state->naughty)
146#define RExC_sawback (pRExC_state->sawback)
147#define RExC_seen (pRExC_state->seen)
148#define RExC_size (pRExC_state->size)
149#define RExC_npar (pRExC_state->npar)
150#define RExC_extralen (pRExC_state->extralen)
151#define RExC_seen_zerolen (pRExC_state->seen_zerolen)
152#define RExC_seen_evals (pRExC_state->seen_evals)
1aa99e6b 153#define RExC_utf8 (pRExC_state->utf8)
830247a4 154
a687059c
LW
155#define ISMULT1(c) ((c) == '*' || (c) == '+' || (c) == '?')
156#define ISMULT2(s) ((*s) == '*' || (*s) == '+' || (*s) == '?' || \
157 ((*s) == '{' && regcurly(s)))
a687059c 158
35c8bce7
LW
159#ifdef SPSTART
160#undef SPSTART /* dratted cpp namespace... */
161#endif
a687059c
LW
162/*
163 * Flags to be passed up and down.
164 */
a687059c 165#define WORST 0 /* Worst case. */
821b33a5 166#define HASWIDTH 0x1 /* Known to match non-null strings. */
a0d0e21e
LW
167#define SIMPLE 0x2 /* Simple enough to be STAR/PLUS operand. */
168#define SPSTART 0x4 /* Starts with * or +. */
169#define TRYAGAIN 0x8 /* Weeded out a declaration. */
a687059c 170
2c2d71f5
JH
171/* Length of a variant. */
172
173typedef struct scan_data_t {
174 I32 len_min;
175 I32 len_delta;
176 I32 pos_min;
177 I32 pos_delta;
178 SV *last_found;
179 I32 last_end; /* min value, <0 unless valid. */
180 I32 last_start_min;
181 I32 last_start_max;
182 SV **longest; /* Either &l_fixed, or &l_float. */
183 SV *longest_fixed;
184 I32 offset_fixed;
185 SV *longest_float;
186 I32 offset_float_min;
187 I32 offset_float_max;
188 I32 flags;
189 I32 whilem_c;
cb434fcc 190 I32 *last_closep;
653099ff 191 struct regnode_charclass_class *start_class;
2c2d71f5
JH
192} scan_data_t;
193
a687059c 194/*
e50aee73 195 * Forward declarations for pregcomp()'s friends.
a687059c 196 */
a0d0e21e 197
b81d288d 198static scan_data_t zero_scan_data = { 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
cb434fcc 199 0, 0, 0, 0, 0, 0};
c277df42
IZ
200
201#define SF_BEFORE_EOL (SF_BEFORE_SEOL|SF_BEFORE_MEOL)
202#define SF_BEFORE_SEOL 0x1
203#define SF_BEFORE_MEOL 0x2
204#define SF_FIX_BEFORE_EOL (SF_FIX_BEFORE_SEOL|SF_FIX_BEFORE_MEOL)
205#define SF_FL_BEFORE_EOL (SF_FL_BEFORE_SEOL|SF_FL_BEFORE_MEOL)
206
09b7f37c
CB
207#ifdef NO_UNARY_PLUS
208# define SF_FIX_SHIFT_EOL (0+2)
209# define SF_FL_SHIFT_EOL (0+4)
210#else
211# define SF_FIX_SHIFT_EOL (+2)
212# define SF_FL_SHIFT_EOL (+4)
213#endif
c277df42
IZ
214
215#define SF_FIX_BEFORE_SEOL (SF_BEFORE_SEOL << SF_FIX_SHIFT_EOL)
216#define SF_FIX_BEFORE_MEOL (SF_BEFORE_MEOL << SF_FIX_SHIFT_EOL)
217
218#define SF_FL_BEFORE_SEOL (SF_BEFORE_SEOL << SF_FL_SHIFT_EOL)
219#define SF_FL_BEFORE_MEOL (SF_BEFORE_MEOL << SF_FL_SHIFT_EOL) /* 0x20 */
220#define SF_IS_INF 0x40
221#define SF_HAS_PAR 0x80
222#define SF_IN_PAR 0x100
223#define SF_HAS_EVAL 0x200
4bfe0158 224#define SCF_DO_SUBSTR 0x400
653099ff
GS
225#define SCF_DO_STCLASS_AND 0x0800
226#define SCF_DO_STCLASS_OR 0x1000
227#define SCF_DO_STCLASS (SCF_DO_STCLASS_AND|SCF_DO_STCLASS_OR)
e1901655 228#define SCF_WHILEM_VISITED_POS 0x2000
c277df42 229
1aa99e6b 230#define UTF RExC_utf8
830247a4
IZ
231#define LOC (RExC_flags16 & PMf_LOCALE)
232#define FOLD (RExC_flags16 & PMf_FOLD)
a0ed51b3 233
ffc61ed2 234#define OOB_UNICODE 12345678
93733859 235#define OOB_NAMEDCLASS -1
b8c5462f 236
a0ed51b3
LW
237#define CHR_SVLEN(sv) (UTF ? sv_len_utf8(sv) : SvCUR(sv))
238#define CHR_DIST(a,b) (UTF ? utf8_distance(a,b) : a - b)
239
8615cb43 240
b45f050a
JF
241/* length of regex to show in messages that don't mark a position within */
242#define RegexLengthToShowInErrorMessages 127
243
244/*
245 * If MARKER[12] are adjusted, be sure to adjust the constants at the top
246 * of t/op/regmesg.t, the tests in t/op/re_tests, and those in
247 * op/pragma/warn/regcomp.
248 */
7253e4e3
RK
249#define MARKER1 "<-- HERE" /* marker as it appears in the description */
250#define MARKER2 " <-- HERE " /* marker as it appears within the regex */
b81d288d 251
7253e4e3 252#define REPORT_LOCATION " in regex; marked by " MARKER1 " in m/%.*s" MARKER2 "%s/"
b45f050a
JF
253
254/*
255 * Calls SAVEDESTRUCTOR_X if needed, then calls Perl_croak with the given
256 * arg. Show regex, up to a maximum length. If it's too long, chop and add
257 * "...".
258 */
a4eb266f 259#define FAIL(msg) \
8615cb43 260 STMT_START { \
a4eb266f 261 char *ellipses = ""; \
830247a4 262 unsigned len = strlen(RExC_precomp); \
b45f050a 263 \
8615cb43 264 if (!SIZE_ONLY) \
830247a4 265 SAVEDESTRUCTOR_X(clear_re,(void*)RExC_rx); \
b45f050a
JF
266 \
267 if (len > RegexLengthToShowInErrorMessages) { \
268 /* chop 10 shorter than the max, to ensure meaning of "..." */ \
269 len = RegexLengthToShowInErrorMessages - 10; \
a4eb266f 270 ellipses = "..."; \
b45f050a
JF
271 } \
272 Perl_croak(aTHX_ "%s in regex m/%.*s%s/", \
830247a4 273 msg, (int)len, RExC_precomp, ellipses); \
8615cb43
JF
274 } STMT_END
275
b45f050a
JF
276/*
277 * Calls SAVEDESTRUCTOR_X if needed, then calls Perl_croak with the given
278 * args. Show regex, up to a maximum length. If it's too long, chop and add
279 * "...".
280 */
a4eb266f 281#define FAIL2(pat,msg) \
8615cb43 282 STMT_START { \
a4eb266f 283 char *ellipses = ""; \
830247a4 284 unsigned len = strlen(RExC_precomp); \
b45f050a 285 \
8615cb43 286 if (!SIZE_ONLY) \
830247a4 287 SAVEDESTRUCTOR_X(clear_re,(void*)RExC_rx); \
8615cb43 288 \
b45f050a
JF
289 if (len > RegexLengthToShowInErrorMessages) { \
290 /* chop 10 shorter than the max, to ensure meaning of "..." */ \
291 len = RegexLengthToShowInErrorMessages - 10; \
a4eb266f 292 ellipses = "..."; \
b45f050a
JF
293 } \
294 S_re_croak2(aTHX_ pat, " in regex m/%.*s%s/", \
830247a4 295 msg, (int)len, RExC_precomp, ellipses); \
b45f050a
JF
296 } STMT_END
297
298
299/*
300 * Simple_vFAIL -- like FAIL, but marks the current location in the scan
301 */
302#define Simple_vFAIL(m) \
303 STMT_START { \
830247a4 304 unsigned offset = strlen(RExC_precomp)-(RExC_end-RExC_parse); \
b45f050a
JF
305 \
306 Perl_croak(aTHX_ "%s" REPORT_LOCATION, \
830247a4 307 m, (int)offset, RExC_precomp, RExC_precomp + offset); \
b45f050a
JF
308 } STMT_END
309
310/*
311 * Calls SAVEDESTRUCTOR_X if needed, then Simple_vFAIL()
312 */
313#define vFAIL(m) \
314 STMT_START { \
315 if (!SIZE_ONLY) \
830247a4 316 SAVEDESTRUCTOR_X(clear_re,(void*)RExC_rx); \
b45f050a
JF
317 Simple_vFAIL(m); \
318 } STMT_END
319
320/*
321 * Like Simple_vFAIL(), but accepts two arguments.
322 */
323#define Simple_vFAIL2(m,a1) \
324 STMT_START { \
830247a4 325 unsigned offset = strlen(RExC_precomp)-(RExC_end-RExC_parse); \
b45f050a
JF
326 \
327 S_re_croak2(aTHX_ m, REPORT_LOCATION, a1, \
830247a4 328 (int)offset, RExC_precomp, RExC_precomp + offset); \
b45f050a
JF
329 } STMT_END
330
331/*
332 * Calls SAVEDESTRUCTOR_X if needed, then Simple_vFAIL2().
333 */
334#define vFAIL2(m,a1) \
335 STMT_START { \
336 if (!SIZE_ONLY) \
830247a4 337 SAVEDESTRUCTOR_X(clear_re,(void*)RExC_rx); \
b45f050a
JF
338 Simple_vFAIL2(m, a1); \
339 } STMT_END
340
341
342/*
343 * Like Simple_vFAIL(), but accepts three arguments.
344 */
345#define Simple_vFAIL3(m, a1, a2) \
346 STMT_START { \
830247a4 347 unsigned offset = strlen(RExC_precomp)-(RExC_end-RExC_parse); \
b45f050a
JF
348 \
349 S_re_croak2(aTHX_ m, REPORT_LOCATION, a1, a2, \
830247a4 350 (int)offset, RExC_precomp, RExC_precomp + offset); \
b45f050a
JF
351 } STMT_END
352
353/*
354 * Calls SAVEDESTRUCTOR_X if needed, then Simple_vFAIL3().
355 */
356#define vFAIL3(m,a1,a2) \
357 STMT_START { \
358 if (!SIZE_ONLY) \
830247a4 359 SAVEDESTRUCTOR_X(clear_re,(void*)RExC_rx); \
b45f050a
JF
360 Simple_vFAIL3(m, a1, a2); \
361 } STMT_END
362
363/*
364 * Like Simple_vFAIL(), but accepts four arguments.
365 */
366#define Simple_vFAIL4(m, a1, a2, a3) \
367 STMT_START { \
830247a4 368 unsigned offset = strlen(RExC_precomp)-(RExC_end-RExC_parse); \
b45f050a
JF
369 \
370 S_re_croak2(aTHX_ m, REPORT_LOCATION, a1, a2, a3,\
830247a4 371 (int)offset, RExC_precomp, RExC_precomp + offset); \
b45f050a
JF
372 } STMT_END
373
374/*
375 * Like Simple_vFAIL(), but accepts five arguments.
376 */
377#define Simple_vFAIL5(m, a1, a2, a3, a4) \
378 STMT_START { \
830247a4 379 unsigned offset = strlen(RExC_precomp)-(RExC_end-RExC_parse); \
b45f050a 380 S_re_croak2(aTHX_ m, REPORT_LOCATION, a1, a2, a3, a4,\
830247a4 381 (int)offset, RExC_precomp, RExC_precomp + offset); \
b45f050a
JF
382 } STMT_END
383
384
385#define vWARN(loc,m) \
386 STMT_START { \
830247a4 387 unsigned offset = strlen(RExC_precomp)-(RExC_end-(loc)); \
b45f050a 388 Perl_warner(aTHX_ WARN_REGEXP, "%s" REPORT_LOCATION,\
830247a4 389 m, (int)offset, RExC_precomp, RExC_precomp + offset); \
b45f050a
JF
390 } STMT_END \
391
f9373011
PM
392#define vWARNdep(loc,m) \
393 STMT_START { \
394 unsigned offset = strlen(RExC_precomp)-(RExC_end-(loc)); \
395 int warn_cat = ckWARN(WARN_REGEXP) ? WARN_REGEXP : WARN_DEPRECATED; \
396 Perl_warner(aTHX_ warn_cat, "%s" REPORT_LOCATION,\
397 m, (int)offset, RExC_precomp, RExC_precomp + offset); \
398 } STMT_END \
399
b45f050a
JF
400
401#define vWARN2(loc, m, a1) \
402 STMT_START { \
830247a4 403 unsigned offset = strlen(RExC_precomp)-(RExC_end-(loc)); \
b45f050a
JF
404 Perl_warner(aTHX_ WARN_REGEXP, m REPORT_LOCATION,\
405 a1, \
830247a4 406 (int)offset, RExC_precomp, RExC_precomp + offset); \
b45f050a
JF
407 } STMT_END
408
409#define vWARN3(loc, m, a1, a2) \
410 STMT_START { \
830247a4 411 unsigned offset = strlen(RExC_precomp) - (RExC_end - (loc)); \
b45f050a
JF
412 Perl_warner(aTHX_ WARN_REGEXP, m REPORT_LOCATION, \
413 a1, a2, \
830247a4 414 (int)offset, RExC_precomp, RExC_precomp + offset); \
b45f050a
JF
415 } STMT_END
416
417#define vWARN4(loc, m, a1, a2, a3) \
418 STMT_START { \
830247a4 419 unsigned offset = strlen(RExC_precomp)-(RExC_end-(loc)); \
b45f050a
JF
420 Perl_warner(aTHX_ WARN_REGEXP, m REPORT_LOCATION,\
421 a1, a2, a3, \
830247a4 422 (int)offset, RExC_precomp, RExC_precomp + offset); \
8615cb43
JF
423 } STMT_END
424
9d1d55b5
JP
425/* used for the parse_flags section for (?c) -- japhy */
426#define vWARN5(loc, m, a1, a2, a3, a4) \
427 STMT_START { \
428 unsigned offset = strlen(RExC_precomp)-(RExC_end-(loc)); \
429 Perl_warner(aTHX_ WARN_REGEXP, m REPORT_LOCATION, \
430 a1, a2, a3, a4, \
431 (int)offset, RExC_precomp, RExC_precomp + offset); \
432 } STMT_END
433
8615cb43 434
cd439c50 435/* Allow for side effects in s */
fd3f0ae2 436#define REGC(c,s) STMT_START { if (!SIZE_ONLY) *(s) = (c); else (void)(s);} STMT_END
cd439c50 437
fac92740
MJD
438/* Macros for recording node offsets. 20001227 mjd@plover.com
439 * Nodes are numbered 1, 2, 3, 4. Node #n's position is recorded in
440 * element 2*n-1 of the array. Element #2n holds the byte length node #n.
441 * Element 0 holds the number n.
442 */
443
444#define MJD_OFFSET_DEBUG(x)
445/* #define MJD_OFFSET_DEBUG(x) fprintf x */
446
447
448# define Set_Node_Offset_To_R(node,byte) \
449 STMT_START { \
450 if (! SIZE_ONLY) { \
451 if((node) < 0) { \
452 Perl_croak(aTHX_ "value of node is %d in Offset macro", node); \
453 } else { \
454 RExC_offsets[2*(node)-1] = (byte); \
455 } \
456 } \
457 } STMT_END
458
459# define Set_Node_Offset(node,byte) Set_Node_Offset_To_R((node)-RExC_emit_start, (byte)-RExC_start)
460# define Set_Cur_Node_Offset Set_Node_Offset(RExC_emit, RExC_parse)
461
462# define Set_Node_Length_To_R(node,len) \
463 STMT_START { \
464 if (! SIZE_ONLY) { \
465 MJD_OFFSET_DEBUG((stderr, "** (%d) size of node %d is %d.\n", __LINE__, (node), (len))); \
466 if((node) < 0) { \
467 Perl_croak(aTHX_ "value of node is %d in Length macro", node); \
468 } else { \
469 RExC_offsets[2*(node)] = (len); \
470 } \
471 } \
472 } STMT_END
473
474# define Set_Node_Length(node,len) Set_Node_Length_To_R((node)-RExC_emit_start, len)
475# define Set_Cur_Node_Length(len) Set_Node_Length(RExC_emit, len)
476# define Set_Node_Cur_Length(node) Set_Node_Length(node, RExC_parse - parse_start)
477
478/* Get offsets and lengths */
479#define Node_Offset(n) (RExC_offsets[2*((n)-RExC_emit_start)-1])
480#define Node_Length(n) (RExC_offsets[2*((n)-RExC_emit_start)])
481
acfe0abc 482static void clear_re(pTHX_ void *r);
4327152a 483
653099ff
GS
484/* Mark that we cannot extend a found fixed substring at this point.
485 Updata the longest found anchored substring and the longest found
486 floating substrings if needed. */
487
4327152a 488STATIC void
830247a4 489S_scan_commit(pTHX_ RExC_state_t *pRExC_state, scan_data_t *data)
c277df42 490{
a0ed51b3
LW
491 STRLEN l = CHR_SVLEN(data->last_found);
492 STRLEN old_l = CHR_SVLEN(*data->longest);
b81d288d 493
c277df42
IZ
494 if ((l >= old_l) && ((l > old_l) || (data->flags & SF_BEFORE_EOL))) {
495 sv_setsv(*data->longest, data->last_found);
496 if (*data->longest == data->longest_fixed) {
497 data->offset_fixed = l ? data->last_start_min : data->pos_min;
498 if (data->flags & SF_BEFORE_EOL)
b81d288d 499 data->flags
c277df42
IZ
500 |= ((data->flags & SF_BEFORE_EOL) << SF_FIX_SHIFT_EOL);
501 else
502 data->flags &= ~SF_FIX_BEFORE_EOL;
a0ed51b3
LW
503 }
504 else {
c277df42 505 data->offset_float_min = l ? data->last_start_min : data->pos_min;
b81d288d
AB
506 data->offset_float_max = (l
507 ? data->last_start_max
c277df42
IZ
508 : data->pos_min + data->pos_delta);
509 if (data->flags & SF_BEFORE_EOL)
b81d288d 510 data->flags
c277df42
IZ
511 |= ((data->flags & SF_BEFORE_EOL) << SF_FL_SHIFT_EOL);
512 else
513 data->flags &= ~SF_FL_BEFORE_EOL;
514 }
515 }
516 SvCUR_set(data->last_found, 0);
517 data->last_end = -1;
518 data->flags &= ~SF_BEFORE_EOL;
519}
520
653099ff
GS
521/* Can match anything (initialization) */
522STATIC void
830247a4 523S_cl_anything(pTHX_ RExC_state_t *pRExC_state, struct regnode_charclass_class *cl)
653099ff 524{
9af0acdb
JH
525 int value;
526
653099ff 527 ANYOF_CLASS_ZERO(cl);
9af0acdb
JH
528 for (value = 0; value < 256; ++value)
529 ANYOF_BITMAP_SET(cl, value);
1aa99e6b 530 cl->flags = ANYOF_EOS|ANYOF_UNICODE_ALL;
653099ff
GS
531 if (LOC)
532 cl->flags |= ANYOF_LOCALE;
533}
534
535/* Can match anything (initialization) */
536STATIC int
537S_cl_is_anything(pTHX_ struct regnode_charclass_class *cl)
538{
539 int value;
540
aaa51d5e 541 for (value = 0; value <= ANYOF_MAX; value += 2)
653099ff
GS
542 if (ANYOF_CLASS_TEST(cl, value) && ANYOF_CLASS_TEST(cl, value + 1))
543 return 1;
1aa99e6b
IH
544 if (!(cl->flags & ANYOF_UNICODE_ALL))
545 return 0;
9af0acdb
JH
546 for (value = 0; value < 256; ++value)
547 if (!ANYOF_BITMAP_TEST(cl, value))
653099ff
GS
548 return 0;
549 return 1;
550}
551
552/* Can match anything (initialization) */
553STATIC void
830247a4 554S_cl_init(pTHX_ RExC_state_t *pRExC_state, struct regnode_charclass_class *cl)
653099ff 555{
8ecf7187 556 Zero(cl, 1, struct regnode_charclass_class);
653099ff 557 cl->type = ANYOF;
830247a4 558 cl_anything(pRExC_state, cl);
653099ff
GS
559}
560
561STATIC void
830247a4 562S_cl_init_zero(pTHX_ RExC_state_t *pRExC_state, struct regnode_charclass_class *cl)
653099ff 563{
8ecf7187 564 Zero(cl, 1, struct regnode_charclass_class);
653099ff 565 cl->type = ANYOF;
830247a4 566 cl_anything(pRExC_state, cl);
653099ff
GS
567 if (LOC)
568 cl->flags |= ANYOF_LOCALE;
569}
570
571/* 'And' a given class with another one. Can create false positives */
572/* We assume that cl is not inverted */
573STATIC void
574S_cl_and(pTHX_ struct regnode_charclass_class *cl,
575 struct regnode_charclass_class *and_with)
576{
653099ff
GS
577 if (!(and_with->flags & ANYOF_CLASS)
578 && !(cl->flags & ANYOF_CLASS)
579 && (and_with->flags & ANYOF_LOCALE) == (cl->flags & ANYOF_LOCALE)
580 && !(and_with->flags & ANYOF_FOLD)
581 && !(cl->flags & ANYOF_FOLD)) {
582 int i;
583
584 if (and_with->flags & ANYOF_INVERT)
585 for (i = 0; i < ANYOF_BITMAP_SIZE; i++)
586 cl->bitmap[i] &= ~and_with->bitmap[i];
587 else
588 for (i = 0; i < ANYOF_BITMAP_SIZE; i++)
589 cl->bitmap[i] &= and_with->bitmap[i];
590 } /* XXXX: logic is complicated otherwise, leave it along for a moment. */
591 if (!(and_with->flags & ANYOF_EOS))
592 cl->flags &= ~ANYOF_EOS;
1aa99e6b
IH
593
594 if (cl->flags & ANYOF_UNICODE_ALL && and_with->flags & ANYOF_UNICODE) {
595 cl->flags &= ~ANYOF_UNICODE_ALL;
596 cl->flags |= ANYOF_UNICODE;
597 ARG_SET(cl, ARG(and_with));
598 }
599 if (!(and_with->flags & ANYOF_UNICODE_ALL))
600 cl->flags &= ~ANYOF_UNICODE_ALL;
601 if (!(and_with->flags & (ANYOF_UNICODE|ANYOF_UNICODE_ALL)))
602 cl->flags &= ~ANYOF_UNICODE;
653099ff
GS
603}
604
605/* 'OR' a given class with another one. Can create false positives */
606/* We assume that cl is not inverted */
607STATIC void
830247a4 608S_cl_or(pTHX_ RExC_state_t *pRExC_state, struct regnode_charclass_class *cl, struct regnode_charclass_class *or_with)
653099ff 609{
653099ff
GS
610 if (or_with->flags & ANYOF_INVERT) {
611 /* We do not use
612 * (B1 | CL1) | (!B2 & !CL2) = (B1 | !B2 & !CL2) | (CL1 | (!B2 & !CL2))
613 * <= (B1 | !B2) | (CL1 | !CL2)
614 * which is wasteful if CL2 is small, but we ignore CL2:
615 * (B1 | CL1) | (!B2 & !CL2) <= (B1 | CL1) | !B2 = (B1 | !B2) | CL1
616 * XXXX Can we handle case-fold? Unclear:
617 * (OK1(i) | OK1(i')) | !(OK1(i) | OK1(i')) =
618 * (OK1(i) | OK1(i')) | (!OK1(i) & !OK1(i'))
619 */
620 if ( (or_with->flags & ANYOF_LOCALE) == (cl->flags & ANYOF_LOCALE)
621 && !(or_with->flags & ANYOF_FOLD)
622 && !(cl->flags & ANYOF_FOLD) ) {
623 int i;
624
625 for (i = 0; i < ANYOF_BITMAP_SIZE; i++)
626 cl->bitmap[i] |= ~or_with->bitmap[i];
627 } /* XXXX: logic is complicated otherwise */
628 else {
830247a4 629 cl_anything(pRExC_state, cl);
653099ff
GS
630 }
631 } else {
632 /* (B1 | CL1) | (B2 | CL2) = (B1 | B2) | (CL1 | CL2)) */
633 if ( (or_with->flags & ANYOF_LOCALE) == (cl->flags & ANYOF_LOCALE)
b81d288d 634 && (!(or_with->flags & ANYOF_FOLD)
653099ff
GS
635 || (cl->flags & ANYOF_FOLD)) ) {
636 int i;
637
638 /* OR char bitmap and class bitmap separately */
639 for (i = 0; i < ANYOF_BITMAP_SIZE; i++)
640 cl->bitmap[i] |= or_with->bitmap[i];
641 if (or_with->flags & ANYOF_CLASS) {
642 for (i = 0; i < ANYOF_CLASSBITMAP_SIZE; i++)
643 cl->classflags[i] |= or_with->classflags[i];
644 cl->flags |= ANYOF_CLASS;
645 }
646 }
647 else { /* XXXX: logic is complicated, leave it along for a moment. */
830247a4 648 cl_anything(pRExC_state, cl);
653099ff
GS
649 }
650 }
651 if (or_with->flags & ANYOF_EOS)
652 cl->flags |= ANYOF_EOS;
1aa99e6b
IH
653
654 if (cl->flags & ANYOF_UNICODE && or_with->flags & ANYOF_UNICODE &&
655 ARG(cl) != ARG(or_with)) {
656 cl->flags |= ANYOF_UNICODE_ALL;
657 cl->flags &= ~ANYOF_UNICODE;
658 }
659 if (or_with->flags & ANYOF_UNICODE_ALL) {
660 cl->flags |= ANYOF_UNICODE_ALL;
661 cl->flags &= ~ANYOF_UNICODE;
662 }
653099ff
GS
663}
664
665/* REx optimizer. Converts nodes into quickier variants "in place".
666 Finds fixed substrings. */
667
c277df42
IZ
668/* Stops at toplevel WHILEM as well as at `last'. At end *scanp is set
669 to the position after last scanned or to NULL. */
670
76e3520e 671STATIC I32
830247a4 672S_study_chunk(pTHX_ RExC_state_t *pRExC_state, regnode **scanp, I32 *deltap, regnode *last, scan_data_t *data, U32 flags)
c277df42
IZ
673 /* scanp: Start here (read-write). */
674 /* deltap: Write maxlen-minlen here. */
675 /* last: Stop before this one. */
676{
677 I32 min = 0, pars = 0, code;
678 regnode *scan = *scanp, *next;
679 I32 delta = 0;
680 int is_inf = (flags & SCF_DO_SUBSTR) && (data->flags & SF_IS_INF);
aca2d497 681 int is_inf_internal = 0; /* The studied chunk is infinite */
c277df42
IZ
682 I32 is_par = OP(scan) == OPEN ? ARG(scan) : 0;
683 scan_data_t data_fake;
653099ff 684 struct regnode_charclass_class and_with; /* Valid if flags & SCF_DO_STCLASS_OR */
b81d288d 685
c277df42
IZ
686 while (scan && OP(scan) != END && scan < last) {
687 /* Peephole optimizer: */
688
22c35a8c 689 if (PL_regkind[(U8)OP(scan)] == EXACT) {
653099ff 690 /* Merge several consecutive EXACTish nodes into one. */
c277df42
IZ
691 regnode *n = regnext(scan);
692 U32 stringok = 1;
693#ifdef DEBUGGING
694 regnode *stop = scan;
b81d288d 695#endif
c277df42 696
cd439c50 697 next = scan + NODE_SZ_STR(scan);
c277df42
IZ
698 /* Skip NOTHING, merge EXACT*. */
699 while (n &&
b81d288d 700 ( PL_regkind[(U8)OP(n)] == NOTHING ||
c277df42
IZ
701 (stringok && (OP(n) == OP(scan))))
702 && NEXT_OFF(n)
703 && NEXT_OFF(scan) + NEXT_OFF(n) < I16_MAX) {
704 if (OP(n) == TAIL || n > next)
705 stringok = 0;
22c35a8c 706 if (PL_regkind[(U8)OP(n)] == NOTHING) {
c277df42
IZ
707 NEXT_OFF(scan) += NEXT_OFF(n);
708 next = n + NODE_STEP_REGNODE;
709#ifdef DEBUGGING
710 if (stringok)
711 stop = n;
b81d288d 712#endif
c277df42 713 n = regnext(n);
a0ed51b3 714 }
f49d4d0f 715 else if (stringok) {
cd439c50 716 int oldl = STR_LEN(scan);
c277df42 717 regnode *nnext = regnext(n);
f49d4d0f 718
b81d288d 719 if (oldl + STR_LEN(n) > U8_MAX)
c277df42
IZ
720 break;
721 NEXT_OFF(scan) += NEXT_OFF(n);
cd439c50
IZ
722 STR_LEN(scan) += STR_LEN(n);
723 next = n + NODE_SZ_STR(n);
c277df42 724 /* Now we can overwrite *n : */
f49d4d0f 725 Move(STRING(n), STRING(scan) + oldl, STR_LEN(n), char);
c277df42 726#ifdef DEBUGGING
f49d4d0f 727 stop = next - 1;
b81d288d 728#endif
c277df42
IZ
729 n = nnext;
730 }
731 }
732#ifdef DEBUGGING
733 /* Allow dumping */
cd439c50 734 n = scan + NODE_SZ_STR(scan);
c277df42 735 while (n <= stop) {
22c35a8c 736 if (PL_regkind[(U8)OP(n)] != NOTHING || OP(n) == NOTHING) {
c277df42
IZ
737 OP(n) = OPTIMIZED;
738 NEXT_OFF(n) = 0;
739 }
740 n++;
741 }
653099ff 742#endif
c277df42 743 }
653099ff
GS
744 /* Follow the next-chain of the current node and optimize
745 away all the NOTHINGs from it. */
c277df42 746 if (OP(scan) != CURLYX) {
048cfca1
GS
747 int max = (reg_off_by_arg[OP(scan)]
748 ? I32_MAX
749 /* I32 may be smaller than U16 on CRAYs! */
750 : (I32_MAX < U16_MAX ? I32_MAX : U16_MAX));
c277df42
IZ
751 int off = (reg_off_by_arg[OP(scan)] ? ARG(scan) : NEXT_OFF(scan));
752 int noff;
753 regnode *n = scan;
b81d288d 754
c277df42
IZ
755 /* Skip NOTHING and LONGJMP. */
756 while ((n = regnext(n))
22c35a8c 757 && ((PL_regkind[(U8)OP(n)] == NOTHING && (noff = NEXT_OFF(n)))
c277df42
IZ
758 || ((OP(n) == LONGJMP) && (noff = ARG(n))))
759 && off + noff < max)
760 off += noff;
761 if (reg_off_by_arg[OP(scan)])
762 ARG(scan) = off;
b81d288d 763 else
c277df42
IZ
764 NEXT_OFF(scan) = off;
765 }
653099ff
GS
766 /* The principal pseudo-switch. Cannot be a switch, since we
767 look into several different things. */
b81d288d 768 if (OP(scan) == BRANCH || OP(scan) == BRANCHJ
c277df42
IZ
769 || OP(scan) == IFTHEN || OP(scan) == SUSPEND) {
770 next = regnext(scan);
771 code = OP(scan);
b81d288d
AB
772
773 if (OP(next) == code || code == IFTHEN || code == SUSPEND) {
c277df42 774 I32 max1 = 0, min1 = I32_MAX, num = 0;
653099ff 775 struct regnode_charclass_class accum;
c277df42 776
653099ff 777 if (flags & SCF_DO_SUBSTR) /* XXXX Add !SUSPEND? */
830247a4 778 scan_commit(pRExC_state, data); /* Cannot merge strings after this. */
653099ff 779 if (flags & SCF_DO_STCLASS)
830247a4 780 cl_init_zero(pRExC_state, &accum);
c277df42 781 while (OP(scan) == code) {
830247a4 782 I32 deltanext, minnext, f = 0, fake;
653099ff 783 struct regnode_charclass_class this_class;
c277df42
IZ
784
785 num++;
786 data_fake.flags = 0;
b81d288d 787 if (data) {
2c2d71f5 788 data_fake.whilem_c = data->whilem_c;
cb434fcc
IZ
789 data_fake.last_closep = data->last_closep;
790 }
791 else
792 data_fake.last_closep = &fake;
c277df42
IZ
793 next = regnext(scan);
794 scan = NEXTOPER(scan);
795 if (code != BRANCH)
796 scan = NEXTOPER(scan);
653099ff 797 if (flags & SCF_DO_STCLASS) {
830247a4 798 cl_init(pRExC_state, &this_class);
653099ff
GS
799 data_fake.start_class = &this_class;
800 f = SCF_DO_STCLASS_AND;
b81d288d 801 }
e1901655
IZ
802 if (flags & SCF_WHILEM_VISITED_POS)
803 f |= SCF_WHILEM_VISITED_POS;
653099ff 804 /* we suppose the run is continuous, last=next...*/
830247a4
IZ
805 minnext = study_chunk(pRExC_state, &scan, &deltanext,
806 next, &data_fake, f);
b81d288d 807 if (min1 > minnext)
c277df42
IZ
808 min1 = minnext;
809 if (max1 < minnext + deltanext)
810 max1 = minnext + deltanext;
811 if (deltanext == I32_MAX)
aca2d497 812 is_inf = is_inf_internal = 1;
c277df42
IZ
813 scan = next;
814 if (data_fake.flags & (SF_HAS_PAR|SF_IN_PAR))
815 pars++;
405ff068 816 if (data && (data_fake.flags & SF_HAS_EVAL))
c277df42 817 data->flags |= SF_HAS_EVAL;
2c2d71f5
JH
818 if (data)
819 data->whilem_c = data_fake.whilem_c;
653099ff 820 if (flags & SCF_DO_STCLASS)
830247a4 821 cl_or(pRExC_state, &accum, &this_class);
b81d288d 822 if (code == SUSPEND)
c277df42
IZ
823 break;
824 }
825 if (code == IFTHEN && num < 2) /* Empty ELSE branch */
826 min1 = 0;
827 if (flags & SCF_DO_SUBSTR) {
828 data->pos_min += min1;
829 data->pos_delta += max1 - min1;
830 if (max1 != min1 || is_inf)
831 data->longest = &(data->longest_float);
832 }
833 min += min1;
834 delta += max1 - min1;
653099ff 835 if (flags & SCF_DO_STCLASS_OR) {
830247a4 836 cl_or(pRExC_state, data->start_class, &accum);
653099ff
GS
837 if (min1) {
838 cl_and(data->start_class, &and_with);
839 flags &= ~SCF_DO_STCLASS;
840 }
841 }
842 else if (flags & SCF_DO_STCLASS_AND) {
de0c8cb8
GS
843 if (min1) {
844 cl_and(data->start_class, &accum);
653099ff 845 flags &= ~SCF_DO_STCLASS;
de0c8cb8
GS
846 }
847 else {
b81d288d 848 /* Switch to OR mode: cache the old value of
de0c8cb8
GS
849 * data->start_class */
850 StructCopy(data->start_class, &and_with,
851 struct regnode_charclass_class);
852 flags &= ~SCF_DO_STCLASS_AND;
853 StructCopy(&accum, data->start_class,
854 struct regnode_charclass_class);
855 flags |= SCF_DO_STCLASS_OR;
856 data->start_class->flags |= ANYOF_EOS;
857 }
653099ff 858 }
a0ed51b3
LW
859 }
860 else if (code == BRANCHJ) /* single branch is optimized. */
c277df42
IZ
861 scan = NEXTOPER(NEXTOPER(scan));
862 else /* single branch is optimized. */
863 scan = NEXTOPER(scan);
864 continue;
a0ed51b3
LW
865 }
866 else if (OP(scan) == EXACT) {
cd439c50 867 I32 l = STR_LEN(scan);
1aa99e6b 868 UV uc = *((U8*)STRING(scan));
a0ed51b3 869 if (UTF) {
1aa99e6b
IH
870 U8 *s = (U8*)STRING(scan);
871 l = utf8_length(s, s + l);
9041c2e3 872 uc = utf8_to_uvchr(s, NULL);
a0ed51b3
LW
873 }
874 min += l;
c277df42 875 if (flags & SCF_DO_SUBSTR) { /* Update longest substr. */
c277df42
IZ
876 /* The code below prefers earlier match for fixed
877 offset, later match for variable offset. */
878 if (data->last_end == -1) { /* Update the start info. */
879 data->last_start_min = data->pos_min;
880 data->last_start_max = is_inf
b81d288d 881 ? I32_MAX : data->pos_min + data->pos_delta;
c277df42 882 }
cd439c50 883 sv_catpvn(data->last_found, STRING(scan), STR_LEN(scan));
c277df42
IZ
884 data->last_end = data->pos_min + l;
885 data->pos_min += l; /* As in the first entry. */
886 data->flags &= ~SF_BEFORE_EOL;
887 }
653099ff
GS
888 if (flags & SCF_DO_STCLASS_AND) {
889 /* Check whether it is compatible with what we know already! */
890 int compat = 1;
891
1aa99e6b 892 if (uc >= 0x100 ||
516a5887 893 (!(data->start_class->flags & (ANYOF_CLASS | ANYOF_LOCALE))
1aa99e6b 894 && !ANYOF_BITMAP_TEST(data->start_class, uc)
653099ff 895 && (!(data->start_class->flags & ANYOF_FOLD)
1aa99e6b 896 || !ANYOF_BITMAP_TEST(data->start_class, PL_fold[uc])))
516a5887 897 )
653099ff
GS
898 compat = 0;
899 ANYOF_CLASS_ZERO(data->start_class);
900 ANYOF_BITMAP_ZERO(data->start_class);
901 if (compat)
1aa99e6b 902 ANYOF_BITMAP_SET(data->start_class, uc);
653099ff 903 data->start_class->flags &= ~ANYOF_EOS;
9b877dbb
IH
904 if (uc < 0x100)
905 data->start_class->flags &= ~ANYOF_UNICODE_ALL;
653099ff
GS
906 }
907 else if (flags & SCF_DO_STCLASS_OR) {
908 /* false positive possible if the class is case-folded */
1aa99e6b 909 if (uc < 0x100)
9b877dbb
IH
910 ANYOF_BITMAP_SET(data->start_class, uc);
911 else
912 data->start_class->flags |= ANYOF_UNICODE_ALL;
653099ff
GS
913 data->start_class->flags &= ~ANYOF_EOS;
914 cl_and(data->start_class, &and_with);
915 }
916 flags &= ~SCF_DO_STCLASS;
a0ed51b3 917 }
653099ff 918 else if (PL_regkind[(U8)OP(scan)] == EXACT) { /* But OP != EXACT! */
cd439c50 919 I32 l = STR_LEN(scan);
1aa99e6b 920 UV uc = *((U8*)STRING(scan));
653099ff
GS
921
922 /* Search for fixed substrings supports EXACT only. */
b81d288d 923 if (flags & SCF_DO_SUBSTR)
830247a4 924 scan_commit(pRExC_state, data);
a0ed51b3 925 if (UTF) {
1aa99e6b
IH
926 U8 *s = (U8 *)STRING(scan);
927 l = utf8_length(s, s + l);
9041c2e3 928 uc = utf8_to_uvchr(s, NULL);
a0ed51b3
LW
929 }
930 min += l;
c277df42 931 if (data && (flags & SCF_DO_SUBSTR))
a0ed51b3 932 data->pos_min += l;
653099ff
GS
933 if (flags & SCF_DO_STCLASS_AND) {
934 /* Check whether it is compatible with what we know already! */
935 int compat = 1;
936
1aa99e6b 937 if (uc >= 0x100 ||
516a5887 938 (!(data->start_class->flags & (ANYOF_CLASS | ANYOF_LOCALE))
1aa99e6b 939 && !ANYOF_BITMAP_TEST(data->start_class, uc)
516a5887 940 && !ANYOF_BITMAP_TEST(data->start_class, PL_fold[uc])))
653099ff
GS
941 compat = 0;
942 ANYOF_CLASS_ZERO(data->start_class);
943 ANYOF_BITMAP_ZERO(data->start_class);
944 if (compat) {
1aa99e6b 945 ANYOF_BITMAP_SET(data->start_class, uc);
653099ff
GS
946 data->start_class->flags &= ~ANYOF_EOS;
947 data->start_class->flags |= ANYOF_FOLD;
948 if (OP(scan) == EXACTFL)
949 data->start_class->flags |= ANYOF_LOCALE;
950 }
951 }
952 else if (flags & SCF_DO_STCLASS_OR) {
953 if (data->start_class->flags & ANYOF_FOLD) {
954 /* false positive possible if the class is case-folded.
955 Assume that the locale settings are the same... */
1aa99e6b
IH
956 if (uc < 0x100)
957 ANYOF_BITMAP_SET(data->start_class, uc);
653099ff
GS
958 data->start_class->flags &= ~ANYOF_EOS;
959 }
960 cl_and(data->start_class, &and_with);
961 }
962 flags &= ~SCF_DO_STCLASS;
a0ed51b3 963 }
4d61ec05 964 else if (strchr((char*)PL_varies,OP(scan))) {
9c5ffd7c 965 I32 mincount, maxcount, minnext, deltanext, fl = 0;
aa7a4b56 966 I32 f = flags, pos_before = 0;
c277df42 967 regnode *oscan = scan;
653099ff
GS
968 struct regnode_charclass_class this_class;
969 struct regnode_charclass_class *oclass = NULL;
970
22c35a8c 971 switch (PL_regkind[(U8)OP(scan)]) {
653099ff 972 case WHILEM: /* End of (?:...)* . */
c277df42
IZ
973 scan = NEXTOPER(scan);
974 goto finish;
975 case PLUS:
653099ff 976 if (flags & (SCF_DO_SUBSTR | SCF_DO_STCLASS)) {
c277df42 977 next = NEXTOPER(scan);
653099ff 978 if (OP(next) == EXACT || (flags & SCF_DO_STCLASS)) {
b81d288d
AB
979 mincount = 1;
980 maxcount = REG_INFTY;
c277df42
IZ
981 next = regnext(scan);
982 scan = NEXTOPER(scan);
983 goto do_curly;
984 }
985 }
986 if (flags & SCF_DO_SUBSTR)
987 data->pos_min++;
988 min++;
989 /* Fall through. */
990 case STAR:
653099ff
GS
991 if (flags & SCF_DO_STCLASS) {
992 mincount = 0;
b81d288d 993 maxcount = REG_INFTY;
653099ff
GS
994 next = regnext(scan);
995 scan = NEXTOPER(scan);
996 goto do_curly;
997 }
b81d288d 998 is_inf = is_inf_internal = 1;
c277df42
IZ
999 scan = regnext(scan);
1000 if (flags & SCF_DO_SUBSTR) {
830247a4 1001 scan_commit(pRExC_state, data); /* Cannot extend fixed substrings */
c277df42
IZ
1002 data->longest = &(data->longest_float);
1003 }
1004 goto optimize_curly_tail;
1005 case CURLY:
b81d288d 1006 mincount = ARG1(scan);
c277df42
IZ
1007 maxcount = ARG2(scan);
1008 next = regnext(scan);
cb434fcc
IZ
1009 if (OP(scan) == CURLYX) {
1010 I32 lp = (data ? *(data->last_closep) : 0);
1011
1012 scan->flags = ((lp <= U8_MAX) ? lp : U8_MAX);
1013 }
c277df42
IZ
1014 scan = NEXTOPER(scan) + EXTRA_STEP_2ARGS;
1015 do_curly:
1016 if (flags & SCF_DO_SUBSTR) {
830247a4 1017 if (mincount == 0) scan_commit(pRExC_state,data); /* Cannot extend fixed substrings */
c277df42
IZ
1018 pos_before = data->pos_min;
1019 }
1020 if (data) {
1021 fl = data->flags;
1022 data->flags &= ~(SF_HAS_PAR|SF_IN_PAR|SF_HAS_EVAL);
1023 if (is_inf)
1024 data->flags |= SF_IS_INF;
1025 }
653099ff 1026 if (flags & SCF_DO_STCLASS) {
830247a4 1027 cl_init(pRExC_state, &this_class);
653099ff
GS
1028 oclass = data->start_class;
1029 data->start_class = &this_class;
1030 f |= SCF_DO_STCLASS_AND;
1031 f &= ~SCF_DO_STCLASS_OR;
1032 }
e1901655
IZ
1033 /* These are the cases when once a subexpression
1034 fails at a particular position, it cannot succeed
1035 even after backtracking at the enclosing scope.
b81d288d 1036
e1901655
IZ
1037 XXXX what if minimal match and we are at the
1038 initial run of {n,m}? */
1039 if ((mincount != maxcount - 1) && (maxcount != REG_INFTY))
1040 f &= ~SCF_WHILEM_VISITED_POS;
653099ff 1041
c277df42 1042 /* This will finish on WHILEM, setting scan, or on NULL: */
b81d288d
AB
1043 minnext = study_chunk(pRExC_state, &scan, &deltanext, last, data,
1044 mincount == 0
653099ff
GS
1045 ? (f & ~SCF_DO_SUBSTR) : f);
1046
1047 if (flags & SCF_DO_STCLASS)
1048 data->start_class = oclass;
1049 if (mincount == 0 || minnext == 0) {
1050 if (flags & SCF_DO_STCLASS_OR) {
830247a4 1051 cl_or(pRExC_state, data->start_class, &this_class);
653099ff
GS
1052 }
1053 else if (flags & SCF_DO_STCLASS_AND) {
b81d288d 1054 /* Switch to OR mode: cache the old value of
653099ff
GS
1055 * data->start_class */
1056 StructCopy(data->start_class, &and_with,
1057 struct regnode_charclass_class);
1058 flags &= ~SCF_DO_STCLASS_AND;
1059 StructCopy(&this_class, data->start_class,
1060 struct regnode_charclass_class);
1061 flags |= SCF_DO_STCLASS_OR;
1062 data->start_class->flags |= ANYOF_EOS;
1063 }
1064 } else { /* Non-zero len */
1065 if (flags & SCF_DO_STCLASS_OR) {
830247a4 1066 cl_or(pRExC_state, data->start_class, &this_class);
653099ff
GS
1067 cl_and(data->start_class, &and_with);
1068 }
1069 else if (flags & SCF_DO_STCLASS_AND)
1070 cl_and(data->start_class, &this_class);
1071 flags &= ~SCF_DO_STCLASS;
1072 }
c277df42
IZ
1073 if (!scan) /* It was not CURLYX, but CURLY. */
1074 scan = next;
84037bb0
HS
1075 if (ckWARN(WARN_REGEXP)
1076 && (minnext == 0) && (deltanext == 0)
99799961 1077 && data && !(data->flags & (SF_HAS_PAR|SF_IN_PAR))
17feb5d5 1078 && maxcount <= REG_INFTY/3) /* Complement check for big count */
b45f050a 1079 {
830247a4 1080 vWARN(RExC_parse,
b45f050a
JF
1081 "Quantifier unexpected on zero-length expression");
1082 }
1083
c277df42 1084 min += minnext * mincount;
b81d288d 1085 is_inf_internal |= ((maxcount == REG_INFTY
155aba94
GS
1086 && (minnext + deltanext) > 0)
1087 || deltanext == I32_MAX);
aca2d497 1088 is_inf |= is_inf_internal;
c277df42
IZ
1089 delta += (minnext + deltanext) * maxcount - minnext * mincount;
1090
1091 /* Try powerful optimization CURLYX => CURLYN. */
b81d288d 1092 if ( OP(oscan) == CURLYX && data
c277df42
IZ
1093 && data->flags & SF_IN_PAR
1094 && !(data->flags & SF_HAS_EVAL)
1095 && !deltanext && minnext == 1 ) {
1096 /* Try to optimize to CURLYN. */
1097 regnode *nxt = NEXTOPER(oscan) + EXTRA_STEP_2ARGS;
497b47a8
JH
1098 regnode *nxt1 = nxt;
1099#ifdef DEBUGGING
1100 regnode *nxt2;
1101#endif
c277df42
IZ
1102
1103 /* Skip open. */
1104 nxt = regnext(nxt);
4d61ec05 1105 if (!strchr((char*)PL_simple,OP(nxt))
22c35a8c 1106 && !(PL_regkind[(U8)OP(nxt)] == EXACT
b81d288d 1107 && STR_LEN(nxt) == 1))
c277df42 1108 goto nogo;
497b47a8 1109#ifdef DEBUGGING
c277df42 1110 nxt2 = nxt;
497b47a8 1111#endif
c277df42 1112 nxt = regnext(nxt);
b81d288d 1113 if (OP(nxt) != CLOSE)
c277df42
IZ
1114 goto nogo;
1115 /* Now we know that nxt2 is the only contents: */
1116 oscan->flags = ARG(nxt);
1117 OP(oscan) = CURLYN;
1118 OP(nxt1) = NOTHING; /* was OPEN. */
1119#ifdef DEBUGGING
1120 OP(nxt1 + 1) = OPTIMIZED; /* was count. */
1121 NEXT_OFF(nxt1+ 1) = 0; /* just for consistancy. */
1122 NEXT_OFF(nxt2) = 0; /* just for consistancy with CURLY. */
1123 OP(nxt) = OPTIMIZED; /* was CLOSE. */
1124 OP(nxt + 1) = OPTIMIZED; /* was count. */
1125 NEXT_OFF(nxt+ 1) = 0; /* just for consistancy. */
b81d288d 1126#endif
c277df42 1127 }
c277df42
IZ
1128 nogo:
1129
1130 /* Try optimization CURLYX => CURLYM. */
b81d288d 1131 if ( OP(oscan) == CURLYX && data
c277df42 1132 && !(data->flags & SF_HAS_PAR)
c277df42
IZ
1133 && !(data->flags & SF_HAS_EVAL)
1134 && !deltanext ) {
1135 /* XXXX How to optimize if data == 0? */
1136 /* Optimize to a simpler form. */
1137 regnode *nxt = NEXTOPER(oscan) + EXTRA_STEP_2ARGS; /* OPEN */
1138 regnode *nxt2;
1139
1140 OP(oscan) = CURLYM;
1141 while ( (nxt2 = regnext(nxt)) /* skip over embedded stuff*/
b81d288d 1142 && (OP(nxt2) != WHILEM))
c277df42
IZ
1143 nxt = nxt2;
1144 OP(nxt2) = SUCCEED; /* Whas WHILEM */
c277df42
IZ
1145 /* Need to optimize away parenths. */
1146 if (data->flags & SF_IN_PAR) {
1147 /* Set the parenth number. */
1148 regnode *nxt1 = NEXTOPER(oscan) + EXTRA_STEP_2ARGS; /* OPEN*/
1149
b81d288d 1150 if (OP(nxt) != CLOSE)
b45f050a 1151 FAIL("Panic opt close");
c277df42
IZ
1152 oscan->flags = ARG(nxt);
1153 OP(nxt1) = OPTIMIZED; /* was OPEN. */
1154 OP(nxt) = OPTIMIZED; /* was CLOSE. */
1155#ifdef DEBUGGING
1156 OP(nxt1 + 1) = OPTIMIZED; /* was count. */
1157 OP(nxt + 1) = OPTIMIZED; /* was count. */
1158 NEXT_OFF(nxt1 + 1) = 0; /* just for consistancy. */
1159 NEXT_OFF(nxt + 1) = 0; /* just for consistancy. */
b81d288d 1160#endif
c277df42
IZ
1161#if 0
1162 while ( nxt1 && (OP(nxt1) != WHILEM)) {
1163 regnode *nnxt = regnext(nxt1);
b81d288d 1164
c277df42
IZ
1165 if (nnxt == nxt) {
1166 if (reg_off_by_arg[OP(nxt1)])
1167 ARG_SET(nxt1, nxt2 - nxt1);
1168 else if (nxt2 - nxt1 < U16_MAX)
1169 NEXT_OFF(nxt1) = nxt2 - nxt1;
1170 else
1171 OP(nxt) = NOTHING; /* Cannot beautify */
1172 }
1173 nxt1 = nnxt;
1174 }
1175#endif
1176 /* Optimize again: */
b81d288d 1177 study_chunk(pRExC_state, &nxt1, &deltanext, nxt,
e1901655 1178 NULL, 0);
a0ed51b3
LW
1179 }
1180 else
c277df42 1181 oscan->flags = 0;
c277df42 1182 }
e1901655
IZ
1183 else if ((OP(oscan) == CURLYX)
1184 && (flags & SCF_WHILEM_VISITED_POS)
1185 /* See the comment on a similar expression above.
1186 However, this time it not a subexpression
1187 we care about, but the expression itself. */
1188 && (maxcount == REG_INFTY)
1189 && data && ++data->whilem_c < 16) {
1190 /* This stays as CURLYX, we can put the count/of pair. */
2c2d71f5
JH
1191 /* Find WHILEM (as in regexec.c) */
1192 regnode *nxt = oscan + NEXT_OFF(oscan);
1193
1194 if (OP(PREVOPER(nxt)) == NOTHING) /* LONGJMP */
1195 nxt += ARG(nxt);
1196 PREVOPER(nxt)->flags = data->whilem_c
830247a4 1197 | (RExC_whilem_seen << 4); /* On WHILEM */
2c2d71f5 1198 }
b81d288d 1199 if (data && fl & (SF_HAS_PAR|SF_IN_PAR))
c277df42
IZ
1200 pars++;
1201 if (flags & SCF_DO_SUBSTR) {
1202 SV *last_str = Nullsv;
1203 int counted = mincount != 0;
1204
1205 if (data->last_end > 0 && mincount != 0) { /* Ends with a string. */
b81d288d 1206 I32 b = pos_before >= data->last_start_min
c277df42
IZ
1207 ? pos_before : data->last_start_min;
1208 STRLEN l;
1209 char *s = SvPV(data->last_found, l);
a0ed51b3
LW
1210 I32 old = b - data->last_start_min;
1211
1212 if (UTF)
1213 old = utf8_hop((U8*)s, old) - (U8*)s;
c277df42 1214
a0ed51b3 1215 l -= old;
c277df42 1216 /* Get the added string: */
79cb57f6 1217 last_str = newSVpvn(s + old, l);
c277df42
IZ
1218 if (deltanext == 0 && pos_before == b) {
1219 /* What was added is a constant string */
1220 if (mincount > 1) {
1221 SvGROW(last_str, (mincount * l) + 1);
b81d288d 1222 repeatcpy(SvPVX(last_str) + l,
c277df42
IZ
1223 SvPVX(last_str), l, mincount - 1);
1224 SvCUR(last_str) *= mincount;
1225 /* Add additional parts. */
b81d288d 1226 SvCUR_set(data->last_found,
c277df42
IZ
1227 SvCUR(data->last_found) - l);
1228 sv_catsv(data->last_found, last_str);
1229 data->last_end += l * (mincount - 1);
1230 }
2a8d9689
HS
1231 } else {
1232 /* start offset must point into the last copy */
1233 data->last_start_min += minnext * (mincount - 1);
4b2cff9a
HS
1234 data->last_start_max += is_inf ? 0 : (maxcount - 1)
1235 * (minnext + data->pos_delta);
c277df42
IZ
1236 }
1237 }
1238 /* It is counted once already... */
1239 data->pos_min += minnext * (mincount - counted);
1240 data->pos_delta += - counted * deltanext +
1241 (minnext + deltanext) * maxcount - minnext * mincount;
1242 if (mincount != maxcount) {
653099ff
GS
1243 /* Cannot extend fixed substrings found inside
1244 the group. */
830247a4 1245 scan_commit(pRExC_state,data);
c277df42
IZ
1246 if (mincount && last_str) {
1247 sv_setsv(data->last_found, last_str);
1248 data->last_end = data->pos_min;
b81d288d 1249 data->last_start_min =
a0ed51b3 1250 data->pos_min - CHR_SVLEN(last_str);
b81d288d
AB
1251 data->last_start_max = is_inf
1252 ? I32_MAX
c277df42 1253 : data->pos_min + data->pos_delta
a0ed51b3 1254 - CHR_SVLEN(last_str);
c277df42
IZ
1255 }
1256 data->longest = &(data->longest_float);
1257 }
aca2d497 1258 SvREFCNT_dec(last_str);
c277df42 1259 }
405ff068 1260 if (data && (fl & SF_HAS_EVAL))
c277df42
IZ
1261 data->flags |= SF_HAS_EVAL;
1262 optimize_curly_tail:
c277df42 1263 if (OP(oscan) != CURLYX) {
22c35a8c 1264 while (PL_regkind[(U8)OP(next = regnext(oscan))] == NOTHING
c277df42
IZ
1265 && NEXT_OFF(next))
1266 NEXT_OFF(oscan) += NEXT_OFF(next);
1267 }
c277df42 1268 continue;
653099ff 1269 default: /* REF and CLUMP only? */
c277df42 1270 if (flags & SCF_DO_SUBSTR) {
830247a4 1271 scan_commit(pRExC_state,data); /* Cannot expect anything... */
c277df42
IZ
1272 data->longest = &(data->longest_float);
1273 }
aca2d497 1274 is_inf = is_inf_internal = 1;
653099ff 1275 if (flags & SCF_DO_STCLASS_OR)
830247a4 1276 cl_anything(pRExC_state, data->start_class);
653099ff 1277 flags &= ~SCF_DO_STCLASS;
c277df42
IZ
1278 break;
1279 }
a0ed51b3 1280 }
ffc61ed2 1281 else if (strchr((char*)PL_simple,OP(scan))) {
9c5ffd7c 1282 int value = 0;
653099ff 1283
c277df42 1284 if (flags & SCF_DO_SUBSTR) {
830247a4 1285 scan_commit(pRExC_state,data);
c277df42
IZ
1286 data->pos_min++;
1287 }
1288 min++;
653099ff
GS
1289 if (flags & SCF_DO_STCLASS) {
1290 data->start_class->flags &= ~ANYOF_EOS; /* No match on empty */
1291
1292 /* Some of the logic below assumes that switching
1293 locale on will only add false positives. */
1294 switch (PL_regkind[(U8)OP(scan)]) {
653099ff 1295 case SANY:
653099ff
GS
1296 default:
1297 do_default:
1298 /* Perl_croak(aTHX_ "panic: unexpected simple REx opcode %d", OP(scan)); */
1299 if (flags & SCF_DO_STCLASS_OR) /* Allow everything */
830247a4 1300 cl_anything(pRExC_state, data->start_class);
653099ff
GS
1301 break;
1302 case REG_ANY:
1303 if (OP(scan) == SANY)
1304 goto do_default;
1305 if (flags & SCF_DO_STCLASS_OR) { /* Everything but \n */
1306 value = (ANYOF_BITMAP_TEST(data->start_class,'\n')
1307 || (data->start_class->flags & ANYOF_CLASS));
830247a4 1308 cl_anything(pRExC_state, data->start_class);
653099ff
GS
1309 }
1310 if (flags & SCF_DO_STCLASS_AND || !value)
1311 ANYOF_BITMAP_CLEAR(data->start_class,'\n');
1312 break;
1313 case ANYOF:
1314 if (flags & SCF_DO_STCLASS_AND)
1315 cl_and(data->start_class,
1316 (struct regnode_charclass_class*)scan);
1317 else
830247a4 1318 cl_or(pRExC_state, data->start_class,
653099ff
GS
1319 (struct regnode_charclass_class*)scan);
1320 break;
1321 case ALNUM:
1322 if (flags & SCF_DO_STCLASS_AND) {
1323 if (!(data->start_class->flags & ANYOF_LOCALE)) {
1324 ANYOF_CLASS_CLEAR(data->start_class,ANYOF_NALNUM);
1325 for (value = 0; value < 256; value++)
1326 if (!isALNUM(value))
1327 ANYOF_BITMAP_CLEAR(data->start_class, value);
1328 }
1329 }
1330 else {
1331 if (data->start_class->flags & ANYOF_LOCALE)
1332 ANYOF_CLASS_SET(data->start_class,ANYOF_ALNUM);
1333 else {
1334 for (value = 0; value < 256; value++)
1335 if (isALNUM(value))
b81d288d 1336 ANYOF_BITMAP_SET(data->start_class, value);
653099ff
GS
1337 }
1338 }
1339 break;
1340 case ALNUML:
1341 if (flags & SCF_DO_STCLASS_AND) {
1342 if (data->start_class->flags & ANYOF_LOCALE)
1343 ANYOF_CLASS_CLEAR(data->start_class,ANYOF_NALNUM);
1344 }
1345 else {
1346 ANYOF_CLASS_SET(data->start_class,ANYOF_ALNUM);
1347 data->start_class->flags |= ANYOF_LOCALE;
1348 }
1349 break;
1350 case NALNUM:
1351 if (flags & SCF_DO_STCLASS_AND) {
1352 if (!(data->start_class->flags & ANYOF_LOCALE)) {
1353 ANYOF_CLASS_CLEAR(data->start_class,ANYOF_ALNUM);
1354 for (value = 0; value < 256; value++)
1355 if (isALNUM(value))
1356 ANYOF_BITMAP_CLEAR(data->start_class, value);
1357 }
1358 }
1359 else {
1360 if (data->start_class->flags & ANYOF_LOCALE)
1361 ANYOF_CLASS_SET(data->start_class,ANYOF_NALNUM);
1362 else {
1363 for (value = 0; value < 256; value++)
1364 if (!isALNUM(value))
b81d288d 1365 ANYOF_BITMAP_SET(data->start_class, value);
653099ff
GS
1366 }
1367 }
1368 break;
1369 case NALNUML:
1370 if (flags & SCF_DO_STCLASS_AND) {
1371 if (data->start_class->flags & ANYOF_LOCALE)
1372 ANYOF_CLASS_CLEAR(data->start_class,ANYOF_ALNUM);
1373 }
1374 else {
1375 data->start_class->flags |= ANYOF_LOCALE;
1376 ANYOF_CLASS_SET(data->start_class,ANYOF_NALNUM);
1377 }
1378 break;
1379 case SPACE:
1380 if (flags & SCF_DO_STCLASS_AND) {
1381 if (!(data->start_class->flags & ANYOF_LOCALE)) {
1382 ANYOF_CLASS_CLEAR(data->start_class,ANYOF_NSPACE);
1383 for (value = 0; value < 256; value++)
1384 if (!isSPACE(value))
1385 ANYOF_BITMAP_CLEAR(data->start_class, value);
1386 }
1387 }
1388 else {
1389 if (data->start_class->flags & ANYOF_LOCALE)
1390 ANYOF_CLASS_SET(data->start_class,ANYOF_SPACE);
1391 else {
1392 for (value = 0; value < 256; value++)
1393 if (isSPACE(value))
b81d288d 1394 ANYOF_BITMAP_SET(data->start_class, value);
653099ff
GS
1395 }
1396 }
1397 break;
1398 case SPACEL:
1399 if (flags & SCF_DO_STCLASS_AND) {
1400 if (data->start_class->flags & ANYOF_LOCALE)
1401 ANYOF_CLASS_CLEAR(data->start_class,ANYOF_NSPACE);
1402 }
1403 else {
1404 data->start_class->flags |= ANYOF_LOCALE;
1405 ANYOF_CLASS_SET(data->start_class,ANYOF_SPACE);
1406 }
1407 break;
1408 case NSPACE:
1409 if (flags & SCF_DO_STCLASS_AND) {
1410 if (!(data->start_class->flags & ANYOF_LOCALE)) {
1411 ANYOF_CLASS_CLEAR(data->start_class,ANYOF_SPACE);
1412 for (value = 0; value < 256; value++)
1413 if (isSPACE(value))
1414 ANYOF_BITMAP_CLEAR(data->start_class, value);
1415 }
1416 }
1417 else {
1418 if (data->start_class->flags & ANYOF_LOCALE)
1419 ANYOF_CLASS_SET(data->start_class,ANYOF_NSPACE);
1420 else {
1421 for (value = 0; value < 256; value++)
1422 if (!isSPACE(value))
b81d288d 1423 ANYOF_BITMAP_SET(data->start_class, value);
653099ff
GS
1424 }
1425 }
1426 break;
1427 case NSPACEL:
1428 if (flags & SCF_DO_STCLASS_AND) {
1429 if (data->start_class->flags & ANYOF_LOCALE) {
1430 ANYOF_CLASS_CLEAR(data->start_class,ANYOF_SPACE);
1431 for (value = 0; value < 256; value++)
1432 if (!isSPACE(value))
1433 ANYOF_BITMAP_CLEAR(data->start_class, value);
1434 }
1435 }
1436 else {
1437 data->start_class->flags |= ANYOF_LOCALE;
1438 ANYOF_CLASS_SET(data->start_class,ANYOF_NSPACE);
1439 }
1440 break;
1441 case DIGIT:
1442 if (flags & SCF_DO_STCLASS_AND) {
1443 ANYOF_CLASS_CLEAR(data->start_class,ANYOF_NDIGIT);
1444 for (value = 0; value < 256; value++)
1445 if (!isDIGIT(value))
1446 ANYOF_BITMAP_CLEAR(data->start_class, value);
1447 }
1448 else {
1449 if (data->start_class->flags & ANYOF_LOCALE)
1450 ANYOF_CLASS_SET(data->start_class,ANYOF_DIGIT);
1451 else {
1452 for (value = 0; value < 256; value++)
1453 if (isDIGIT(value))
b81d288d 1454 ANYOF_BITMAP_SET(data->start_class, value);
653099ff
GS
1455 }
1456 }
1457 break;
1458 case NDIGIT:
1459 if (flags & SCF_DO_STCLASS_AND) {
1460 ANYOF_CLASS_CLEAR(data->start_class,ANYOF_DIGIT);
1461 for (value = 0; value < 256; value++)
1462 if (isDIGIT(value))
1463 ANYOF_BITMAP_CLEAR(data->start_class, value);
1464 }
1465 else {
1466 if (data->start_class->flags & ANYOF_LOCALE)
1467 ANYOF_CLASS_SET(data->start_class,ANYOF_NDIGIT);
1468 else {
1469 for (value = 0; value < 256; value++)
1470 if (!isDIGIT(value))
b81d288d 1471 ANYOF_BITMAP_SET(data->start_class, value);
653099ff
GS
1472 }
1473 }
1474 break;
1475 }
1476 if (flags & SCF_DO_STCLASS_OR)
1477 cl_and(data->start_class, &and_with);
1478 flags &= ~SCF_DO_STCLASS;
1479 }
a0ed51b3 1480 }
22c35a8c 1481 else if (PL_regkind[(U8)OP(scan)] == EOL && flags & SCF_DO_SUBSTR) {
c277df42
IZ
1482 data->flags |= (OP(scan) == MEOL
1483 ? SF_BEFORE_MEOL
1484 : SF_BEFORE_SEOL);
a0ed51b3 1485 }
653099ff
GS
1486 else if ( PL_regkind[(U8)OP(scan)] == BRANCHJ
1487 /* Lookbehind, or need to calculate parens/evals/stclass: */
1488 && (scan->flags || data || (flags & SCF_DO_STCLASS))
c277df42 1489 && (OP(scan) == IFMATCH || OP(scan) == UNLESSM)) {
653099ff 1490 /* Lookahead/lookbehind */
cb434fcc 1491 I32 deltanext, minnext, fake = 0;
c277df42 1492 regnode *nscan;
653099ff
GS
1493 struct regnode_charclass_class intrnl;
1494 int f = 0;
c277df42
IZ
1495
1496 data_fake.flags = 0;
b81d288d 1497 if (data) {
2c2d71f5 1498 data_fake.whilem_c = data->whilem_c;
cb434fcc
IZ
1499 data_fake.last_closep = data->last_closep;
1500 }
1501 else
1502 data_fake.last_closep = &fake;
653099ff
GS
1503 if ( flags & SCF_DO_STCLASS && !scan->flags
1504 && OP(scan) == IFMATCH ) { /* Lookahead */
830247a4 1505 cl_init(pRExC_state, &intrnl);
653099ff 1506 data_fake.start_class = &intrnl;
e1901655 1507 f |= SCF_DO_STCLASS_AND;
653099ff 1508 }
e1901655
IZ
1509 if (flags & SCF_WHILEM_VISITED_POS)
1510 f |= SCF_WHILEM_VISITED_POS;
c277df42
IZ
1511 next = regnext(scan);
1512 nscan = NEXTOPER(NEXTOPER(scan));
830247a4 1513 minnext = study_chunk(pRExC_state, &nscan, &deltanext, last, &data_fake, f);
c277df42
IZ
1514 if (scan->flags) {
1515 if (deltanext) {
9baa0206 1516 vFAIL("Variable length lookbehind not implemented");
a0ed51b3
LW
1517 }
1518 else if (minnext > U8_MAX) {
9baa0206 1519 vFAIL2("Lookbehind longer than %"UVuf" not implemented", (UV)U8_MAX);
c277df42
IZ
1520 }
1521 scan->flags = minnext;
1522 }
1523 if (data && data_fake.flags & (SF_HAS_PAR|SF_IN_PAR))
1524 pars++;
405ff068 1525 if (data && (data_fake.flags & SF_HAS_EVAL))
c277df42 1526 data->flags |= SF_HAS_EVAL;
2c2d71f5
JH
1527 if (data)
1528 data->whilem_c = data_fake.whilem_c;
e1901655 1529 if (f & SCF_DO_STCLASS_AND) {
653099ff
GS
1530 int was = (data->start_class->flags & ANYOF_EOS);
1531
1532 cl_and(data->start_class, &intrnl);
1533 if (was)
1534 data->start_class->flags |= ANYOF_EOS;
1535 }
a0ed51b3
LW
1536 }
1537 else if (OP(scan) == OPEN) {
c277df42 1538 pars++;
a0ed51b3 1539 }
cb434fcc
IZ
1540 else if (OP(scan) == CLOSE) {
1541 if (ARG(scan) == is_par) {
1542 next = regnext(scan);
c277df42 1543
cb434fcc
IZ
1544 if ( next && (OP(next) != WHILEM) && next < last)
1545 is_par = 0; /* Disable optimization */
1546 }
1547 if (data)
1548 *(data->last_closep) = ARG(scan);
a0ed51b3
LW
1549 }
1550 else if (OP(scan) == EVAL) {
c277df42
IZ
1551 if (data)
1552 data->flags |= SF_HAS_EVAL;
1553 }
96776eda 1554 else if (OP(scan) == LOGICAL && scan->flags == 2) { /* Embedded follows */
0f5d15d6 1555 if (flags & SCF_DO_SUBSTR) {
830247a4 1556 scan_commit(pRExC_state,data);
0f5d15d6
IZ
1557 data->longest = &(data->longest_float);
1558 }
1559 is_inf = is_inf_internal = 1;
653099ff 1560 if (flags & SCF_DO_STCLASS_OR) /* Allow everything */
830247a4 1561 cl_anything(pRExC_state, data->start_class);
96776eda 1562 flags &= ~SCF_DO_STCLASS;
0f5d15d6 1563 }
c277df42
IZ
1564 /* Else: zero-length, ignore. */
1565 scan = regnext(scan);
1566 }
1567
1568 finish:
1569 *scanp = scan;
aca2d497 1570 *deltap = is_inf_internal ? I32_MAX : delta;
b81d288d 1571 if (flags & SCF_DO_SUBSTR && is_inf)
c277df42
IZ
1572 data->pos_delta = I32_MAX - data->pos_min;
1573 if (is_par > U8_MAX)
1574 is_par = 0;
1575 if (is_par && pars==1 && data) {
1576 data->flags |= SF_IN_PAR;
1577 data->flags &= ~SF_HAS_PAR;
a0ed51b3
LW
1578 }
1579 else if (pars && data) {
c277df42
IZ
1580 data->flags |= SF_HAS_PAR;
1581 data->flags &= ~SF_IN_PAR;
1582 }
653099ff
GS
1583 if (flags & SCF_DO_STCLASS_OR)
1584 cl_and(data->start_class, &and_with);
c277df42
IZ
1585 return min;
1586}
1587
76e3520e 1588STATIC I32
830247a4 1589S_add_data(pTHX_ RExC_state_t *pRExC_state, I32 n, char *s)
c277df42 1590{
830247a4 1591 if (RExC_rx->data) {
b81d288d
AB
1592 Renewc(RExC_rx->data,
1593 sizeof(*RExC_rx->data) + sizeof(void*) * (RExC_rx->data->count + n - 1),
c277df42 1594 char, struct reg_data);
830247a4
IZ
1595 Renew(RExC_rx->data->what, RExC_rx->data->count + n, U8);
1596 RExC_rx->data->count += n;
a0ed51b3
LW
1597 }
1598 else {
830247a4 1599 Newc(1207, RExC_rx->data, sizeof(*RExC_rx->data) + sizeof(void*) * (n - 1),
c277df42 1600 char, struct reg_data);
830247a4
IZ
1601 New(1208, RExC_rx->data->what, n, U8);
1602 RExC_rx->data->count = n;
c277df42 1603 }
830247a4
IZ
1604 Copy(s, RExC_rx->data->what + RExC_rx->data->count - n, n, U8);
1605 return RExC_rx->data->count - n;
c277df42
IZ
1606}
1607
d88dccdf 1608void
864dbfa3 1609Perl_reginitcolors(pTHX)
d88dccdf 1610{
d88dccdf
IZ
1611 int i = 0;
1612 char *s = PerlEnv_getenv("PERL_RE_COLORS");
b81d288d 1613
d88dccdf
IZ
1614 if (s) {
1615 PL_colors[0] = s = savepv(s);
1616 while (++i < 6) {
1617 s = strchr(s, '\t');
1618 if (s) {
1619 *s = '\0';
1620 PL_colors[i] = ++s;
1621 }
1622 else
c712d376 1623 PL_colors[i] = s = "";
d88dccdf
IZ
1624 }
1625 } else {
b81d288d 1626 while (i < 6)
d88dccdf
IZ
1627 PL_colors[i++] = "";
1628 }
1629 PL_colorset = 1;
1630}
1631
8615cb43 1632
a687059c 1633/*
e50aee73 1634 - pregcomp - compile a regular expression into internal code
a687059c
LW
1635 *
1636 * We can't allocate space until we know how big the compiled form will be,
1637 * but we can't compile it (and thus know how big it is) until we've got a
1638 * place to put the code. So we cheat: we compile it twice, once with code
1639 * generation turned off and size counting turned on, and once "for real".
1640 * This also means that we don't allocate space until we are sure that the
1641 * thing really will compile successfully, and we never have to move the
1642 * code and thus invalidate pointers into it. (Note that it has to be in
1643 * one piece because free() must be able to free it all.) [NB: not true in perl]
1644 *
1645 * Beware that the optimization-preparation code in here knows about some
1646 * of the structure of the compiled regexp. [I'll say.]
1647 */
1648regexp *
864dbfa3 1649Perl_pregcomp(pTHX_ char *exp, char *xend, PMOP *pm)
a687059c 1650{
a0d0e21e 1651 register regexp *r;
c277df42 1652 regnode *scan;
c277df42 1653 regnode *first;
a0d0e21e 1654 I32 flags;
a0d0e21e
LW
1655 I32 minlen = 0;
1656 I32 sawplus = 0;
1657 I32 sawopen = 0;
2c2d71f5 1658 scan_data_t data;
830247a4
IZ
1659 RExC_state_t RExC_state;
1660 RExC_state_t *pRExC_state = &RExC_state;
a0d0e21e
LW
1661
1662 if (exp == NULL)
c277df42 1663 FAIL("NULL regexp argument");
a0d0e21e 1664
84e09d5e
JH
1665 /* XXXX This looks very suspicious... */
1666 if (pm->op_pmdynflags & PMdf_CMP_UTF8)
1667 RExC_utf8 = 1;
1668 else
1669 RExC_utf8 = 0;
a0ed51b3 1670
5cfc7842 1671 RExC_precomp = exp;
35ef4773 1672 DEBUG_r(if (!PL_colorset) reginitcolors());
2c2d71f5 1673 DEBUG_r(PerlIO_printf(Perl_debug_log, "%sCompiling REx%s `%s%*s%s'\n",
d88dccdf 1674 PL_colors[4],PL_colors[5],PL_colors[0],
830247a4
IZ
1675 (int)(xend - exp), RExC_precomp, PL_colors[1]));
1676 RExC_flags16 = pm->op_pmflags;
1677 RExC_sawback = 0;
bbce6d69 1678
830247a4
IZ
1679 RExC_seen = 0;
1680 RExC_seen_zerolen = *exp == '^' ? -1 : 0;
1681 RExC_seen_evals = 0;
1682 RExC_extralen = 0;
c277df42 1683
bbce6d69 1684 /* First pass: determine size, legality. */
830247a4 1685 RExC_parse = exp;
fac92740 1686 RExC_start = exp;
830247a4
IZ
1687 RExC_end = xend;
1688 RExC_naughty = 0;
1689 RExC_npar = 1;
1690 RExC_size = 0L;
1691 RExC_emit = &PL_regdummy;
1692 RExC_whilem_seen = 0;
85ddcde9
JH
1693#if 0 /* REGC() is (currently) a NOP at the first pass.
1694 * Clever compilers notice this and complain. --jhi */
830247a4 1695 REGC((U8)REG_MAGIC, (char*)RExC_emit);
85ddcde9 1696#endif
830247a4 1697 if (reg(pRExC_state, 0, &flags) == NULL) {
830247a4 1698 RExC_precomp = Nullch;
a0d0e21e
LW
1699 return(NULL);
1700 }
830247a4 1701 DEBUG_r(PerlIO_printf(Perl_debug_log, "size %"IVdf" ", (IV)RExC_size));
c277df42 1702
c277df42
IZ
1703 /* Small enough for pointer-storage convention?
1704 If extralen==0, this means that we will not need long jumps. */
830247a4
IZ
1705 if (RExC_size >= 0x10000L && RExC_extralen)
1706 RExC_size += RExC_extralen;
c277df42 1707 else
830247a4
IZ
1708 RExC_extralen = 0;
1709 if (RExC_whilem_seen > 15)
1710 RExC_whilem_seen = 15;
a0d0e21e 1711
bbce6d69 1712 /* Allocate space and initialize. */
830247a4 1713 Newc(1001, r, sizeof(regexp) + (unsigned)RExC_size * sizeof(regnode),
c277df42 1714 char, regexp);
a0d0e21e 1715 if (r == NULL)
b45f050a
JF
1716 FAIL("Regexp out of space");
1717
0f79a09d
GS
1718#ifdef DEBUGGING
1719 /* avoid reading uninitialized memory in DEBUGGING code in study_chunk() */
830247a4 1720 Zero(r, sizeof(regexp) + (unsigned)RExC_size * sizeof(regnode), char);
0f79a09d 1721#endif
c277df42 1722 r->refcnt = 1;
bbce6d69 1723 r->prelen = xend - exp;
5cfc7842 1724 r->precomp = savepvn(RExC_precomp, r->prelen);
cf93c79d
IZ
1725 r->subbeg = NULL;
1726 r->reganch = pm->op_pmflags & PMf_COMPILETIME;
830247a4 1727 r->nparens = RExC_npar - 1; /* set early to validate backrefs */
4327152a
IZ
1728
1729 r->substrs = 0; /* Useful during FAIL. */
1730 r->startp = 0; /* Useful during FAIL. */
1731 r->endp = 0; /* Useful during FAIL. */
1732
fac92740
MJD
1733 Newz(1304, r->offsets, 2*RExC_size+1, U32); /* MJD 20001228 */
1734 if (r->offsets) {
1735 r->offsets[0] = RExC_size;
1736 }
1737 DEBUG_r(PerlIO_printf(Perl_debug_log,
392fbf5d 1738 "%s %"UVuf" bytes for offset annotations.\n",
fac92740 1739 r->offsets ? "Got" : "Couldn't get",
392fbf5d 1740 (UV)((2*RExC_size+1) * sizeof(U32))));
fac92740 1741
830247a4 1742 RExC_rx = r;
bbce6d69 1743
1744 /* Second pass: emit code. */
055bb491 1745 RExC_flags16 = pm->op_pmflags; /* don't let top level (?i) bleed */
830247a4
IZ
1746 RExC_parse = exp;
1747 RExC_end = xend;
1748 RExC_naughty = 0;
1749 RExC_npar = 1;
fac92740 1750 RExC_emit_start = r->program;
830247a4 1751 RExC_emit = r->program;
2cd61cdb 1752 /* Store the count of eval-groups for security checks: */
830247a4
IZ
1753 RExC_emit->next_off = ((RExC_seen_evals > U16_MAX) ? U16_MAX : RExC_seen_evals);
1754 REGC((U8)REG_MAGIC, (char*) RExC_emit++);
c277df42 1755 r->data = 0;
830247a4 1756 if (reg(pRExC_state, 0, &flags) == NULL)
a0d0e21e
LW
1757 return(NULL);
1758
1759 /* Dig out information for optimizations. */
cf93c79d 1760 r->reganch = pm->op_pmflags & PMf_COMPILETIME; /* Again? */
830247a4 1761 pm->op_pmflags = RExC_flags16;
a0ed51b3
LW
1762 if (UTF)
1763 r->reganch |= ROPT_UTF8;
c277df42 1764 r->regstclass = NULL;
830247a4 1765 if (RExC_naughty >= 10) /* Probably an expensive pattern. */
a0ed51b3 1766 r->reganch |= ROPT_NAUGHTY;
c277df42 1767 scan = r->program + 1; /* First BRANCH. */
2779dcf1
IZ
1768
1769 /* XXXX To minimize changes to RE engine we always allocate
1770 3-units-long substrs field. */
1771 Newz(1004, r->substrs, 1, struct reg_substr_data);
1772
2c2d71f5 1773 StructCopy(&zero_scan_data, &data, scan_data_t);
653099ff 1774 /* XXXX Should not we check for something else? Usually it is OPEN1... */
c277df42 1775 if (OP(scan) != BRANCH) { /* Only one top-level choice. */
c277df42 1776 I32 fake;
c5254dd6 1777 STRLEN longest_float_length, longest_fixed_length;
653099ff
GS
1778 struct regnode_charclass_class ch_class;
1779 int stclass_flag;
cb434fcc 1780 I32 last_close = 0;
a0d0e21e
LW
1781
1782 first = scan;
c277df42 1783 /* Skip introductions and multiplicators >= 1. */
a0d0e21e 1784 while ((OP(first) == OPEN && (sawopen = 1)) ||
653099ff 1785 /* An OR of *one* alternative - should not happen now. */
a0d0e21e
LW
1786 (OP(first) == BRANCH && OP(regnext(first)) != BRANCH) ||
1787 (OP(first) == PLUS) ||
1788 (OP(first) == MINMOD) ||
653099ff 1789 /* An {n,m} with n>0 */
22c35a8c 1790 (PL_regkind[(U8)OP(first)] == CURLY && ARG1(first) > 0) ) {
a0d0e21e
LW
1791 if (OP(first) == PLUS)
1792 sawplus = 1;
1793 else
1794 first += regarglen[(U8)OP(first)];
1795 first = NEXTOPER(first);
a687059c
LW
1796 }
1797
a0d0e21e
LW
1798 /* Starting-point info. */
1799 again:
653099ff 1800 if (PL_regkind[(U8)OP(first)] == EXACT) {
1aa99e6b
IH
1801 if (OP(first) == EXACT)
1802 ; /* Empty, get anchored substr later. */
1803 else if ((OP(first) == EXACTF || OP(first) == EXACTFL))
b3c9acc1
IZ
1804 r->regstclass = first;
1805 }
653099ff 1806 else if (strchr((char*)PL_simple,OP(first)))
a0d0e21e 1807 r->regstclass = first;
22c35a8c
GS
1808 else if (PL_regkind[(U8)OP(first)] == BOUND ||
1809 PL_regkind[(U8)OP(first)] == NBOUND)
a0d0e21e 1810 r->regstclass = first;
22c35a8c 1811 else if (PL_regkind[(U8)OP(first)] == BOL) {
cad2e5aa
JH
1812 r->reganch |= (OP(first) == MBOL
1813 ? ROPT_ANCH_MBOL
1814 : (OP(first) == SBOL
1815 ? ROPT_ANCH_SBOL
1816 : ROPT_ANCH_BOL));
a0d0e21e 1817 first = NEXTOPER(first);
774d564b 1818 goto again;
1819 }
1820 else if (OP(first) == GPOS) {
1821 r->reganch |= ROPT_ANCH_GPOS;
1822 first = NEXTOPER(first);
1823 goto again;
a0d0e21e
LW
1824 }
1825 else if ((OP(first) == STAR &&
22c35a8c 1826 PL_regkind[(U8)OP(NEXTOPER(first))] == REG_ANY) &&
a0d0e21e
LW
1827 !(r->reganch & ROPT_ANCH) )
1828 {
1829 /* turn .* into ^.* with an implied $*=1 */
cad2e5aa
JH
1830 int type = OP(NEXTOPER(first));
1831
ffc61ed2 1832 if (type == REG_ANY)
cad2e5aa
JH
1833 type = ROPT_ANCH_MBOL;
1834 else
1835 type = ROPT_ANCH_SBOL;
1836
1837 r->reganch |= type | ROPT_IMPLICIT;
a0d0e21e 1838 first = NEXTOPER(first);
774d564b 1839 goto again;
a0d0e21e 1840 }
b81d288d 1841 if (sawplus && (!sawopen || !RExC_sawback)
830247a4 1842 && !(RExC_seen & REG_SEEN_EVAL)) /* May examine pos and $& */
cad2e5aa
JH
1843 /* x+ must match at the 1st pos of run of x's */
1844 r->reganch |= ROPT_SKIP;
a0d0e21e 1845
c277df42 1846 /* Scan is after the zeroth branch, first is atomic matcher. */
b81d288d 1847 DEBUG_r(PerlIO_printf(Perl_debug_log, "first at %"IVdf"\n",
b900a521 1848 (IV)(first - scan + 1)));
a0d0e21e
LW
1849 /*
1850 * If there's something expensive in the r.e., find the
1851 * longest literal string that must appear and make it the
1852 * regmust. Resolve ties in favor of later strings, since
1853 * the regstart check works with the beginning of the r.e.
1854 * and avoiding duplication strengthens checking. Not a
1855 * strong reason, but sufficient in the absence of others.
1856 * [Now we resolve ties in favor of the earlier string if
c277df42 1857 * it happens that c_offset_min has been invalidated, since the
a0d0e21e
LW
1858 * earlier string may buy us something the later one won't.]
1859 */
a0d0e21e 1860 minlen = 0;
a687059c 1861
79cb57f6
GS
1862 data.longest_fixed = newSVpvn("",0);
1863 data.longest_float = newSVpvn("",0);
1864 data.last_found = newSVpvn("",0);
c277df42
IZ
1865 data.longest = &(data.longest_fixed);
1866 first = scan;
653099ff 1867 if (!r->regstclass) {
830247a4 1868 cl_init(pRExC_state, &ch_class);
653099ff
GS
1869 data.start_class = &ch_class;
1870 stclass_flag = SCF_DO_STCLASS_AND;
1871 } else /* XXXX Check for BOUND? */
1872 stclass_flag = 0;
cb434fcc 1873 data.last_closep = &last_close;
653099ff 1874
830247a4 1875 minlen = study_chunk(pRExC_state, &first, &fake, scan + RExC_size, /* Up to end */
e1901655 1876 &data, SCF_DO_SUBSTR | SCF_WHILEM_VISITED_POS | stclass_flag);
830247a4 1877 if ( RExC_npar == 1 && data.longest == &(data.longest_fixed)
b81d288d 1878 && data.last_start_min == 0 && data.last_end > 0
830247a4
IZ
1879 && !RExC_seen_zerolen
1880 && (!(RExC_seen & REG_SEEN_GPOS) || (r->reganch & ROPT_ANCH_GPOS)))
c277df42 1881 r->reganch |= ROPT_CHECK_ALL;
830247a4 1882 scan_commit(pRExC_state, &data);
c277df42
IZ
1883 SvREFCNT_dec(data.last_found);
1884
a0ed51b3 1885 longest_float_length = CHR_SVLEN(data.longest_float);
c5254dd6 1886 if (longest_float_length
c277df42
IZ
1887 || (data.flags & SF_FL_BEFORE_EOL
1888 && (!(data.flags & SF_FL_BEFORE_MEOL)
830247a4 1889 || (RExC_flags16 & PMf_MULTILINE)))) {
cf93c79d
IZ
1890 int t;
1891
a0ed51b3 1892 if (SvCUR(data.longest_fixed) /* ok to leave SvCUR */
aca2d497
IZ
1893 && data.offset_fixed == data.offset_float_min
1894 && SvCUR(data.longest_fixed) == SvCUR(data.longest_float))
1895 goto remove_float; /* As in (a)+. */
1896
c277df42
IZ
1897 r->float_substr = data.longest_float;
1898 r->float_min_offset = data.offset_float_min;
1899 r->float_max_offset = data.offset_float_max;
cf93c79d
IZ
1900 t = (data.flags & SF_FL_BEFORE_EOL /* Can't have SEOL and MULTI */
1901 && (!(data.flags & SF_FL_BEFORE_MEOL)
830247a4 1902 || (RExC_flags16 & PMf_MULTILINE)));
cf93c79d 1903 fbm_compile(r->float_substr, t ? FBMcf_TAIL : 0);
a0ed51b3
LW
1904 }
1905 else {
aca2d497 1906 remove_float:
c277df42
IZ
1907 r->float_substr = Nullsv;
1908 SvREFCNT_dec(data.longest_float);
c5254dd6 1909 longest_float_length = 0;
a0d0e21e 1910 }
c277df42 1911
a0ed51b3 1912 longest_fixed_length = CHR_SVLEN(data.longest_fixed);
c5254dd6 1913 if (longest_fixed_length
c277df42
IZ
1914 || (data.flags & SF_FIX_BEFORE_EOL /* Cannot have SEOL and MULTI */
1915 && (!(data.flags & SF_FIX_BEFORE_MEOL)
830247a4 1916 || (RExC_flags16 & PMf_MULTILINE)))) {
cf93c79d
IZ
1917 int t;
1918
c277df42
IZ
1919 r->anchored_substr = data.longest_fixed;
1920 r->anchored_offset = data.offset_fixed;
cf93c79d
IZ
1921 t = (data.flags & SF_FIX_BEFORE_EOL /* Can't have SEOL and MULTI */
1922 && (!(data.flags & SF_FIX_BEFORE_MEOL)
830247a4 1923 || (RExC_flags16 & PMf_MULTILINE)));
cf93c79d 1924 fbm_compile(r->anchored_substr, t ? FBMcf_TAIL : 0);
a0ed51b3
LW
1925 }
1926 else {
c277df42
IZ
1927 r->anchored_substr = Nullsv;
1928 SvREFCNT_dec(data.longest_fixed);
c5254dd6 1929 longest_fixed_length = 0;
a0d0e21e 1930 }
b81d288d 1931 if (r->regstclass
ffc61ed2 1932 && (OP(r->regstclass) == REG_ANY || OP(r->regstclass) == SANY))
653099ff
GS
1933 r->regstclass = NULL;
1934 if ((!r->anchored_substr || r->anchored_offset) && stclass_flag
1935 && !(data.start_class->flags & ANYOF_EOS)
1936 && !cl_is_anything(data.start_class)) {
830247a4 1937 I32 n = add_data(pRExC_state, 1, "f");
653099ff 1938
b81d288d 1939 New(1006, RExC_rx->data->data[n], 1,
653099ff
GS
1940 struct regnode_charclass_class);
1941 StructCopy(data.start_class,
830247a4 1942 (struct regnode_charclass_class*)RExC_rx->data->data[n],
653099ff 1943 struct regnode_charclass_class);
830247a4 1944 r->regstclass = (regnode*)RExC_rx->data->data[n];
653099ff 1945 r->reganch &= ~ROPT_SKIP; /* Used in find_byclass(). */
ffc61ed2 1946 PL_regdata = r->data; /* for regprop() */
9c5ffd7c
JH
1947 DEBUG_r({ SV *sv = sv_newmortal();
1948 regprop(sv, (regnode*)data.start_class);
1949 PerlIO_printf(Perl_debug_log,
1950 "synthetic stclass `%s'.\n",
1951 SvPVX(sv));});
653099ff 1952 }
c277df42
IZ
1953
1954 /* A temporary algorithm prefers floated substr to fixed one to dig more info. */
c5254dd6 1955 if (longest_fixed_length > longest_float_length) {
c277df42
IZ
1956 r->check_substr = r->anchored_substr;
1957 r->check_offset_min = r->check_offset_max = r->anchored_offset;
1958 if (r->reganch & ROPT_ANCH_SINGLE)
1959 r->reganch |= ROPT_NOSCAN;
a0ed51b3
LW
1960 }
1961 else {
c277df42
IZ
1962 r->check_substr = r->float_substr;
1963 r->check_offset_min = data.offset_float_min;
1964 r->check_offset_max = data.offset_float_max;
a0d0e21e 1965 }
30382c73
IZ
1966 /* XXXX Currently intuiting is not compatible with ANCH_GPOS.
1967 This should be changed ASAP! */
1968 if (r->check_substr && !(r->reganch & ROPT_ANCH_GPOS)) {
cad2e5aa
JH
1969 r->reganch |= RE_USE_INTUIT;
1970 if (SvTAIL(r->check_substr))
1971 r->reganch |= RE_INTUIT_TAIL;
1972 }
a0ed51b3
LW
1973 }
1974 else {
c277df42
IZ
1975 /* Several toplevels. Best we can is to set minlen. */
1976 I32 fake;
653099ff 1977 struct regnode_charclass_class ch_class;
cb434fcc 1978 I32 last_close = 0;
c277df42
IZ
1979
1980 DEBUG_r(PerlIO_printf(Perl_debug_log, "\n"));
1981 scan = r->program + 1;
830247a4 1982 cl_init(pRExC_state, &ch_class);
653099ff 1983 data.start_class = &ch_class;
cb434fcc 1984 data.last_closep = &last_close;
e1901655 1985 minlen = study_chunk(pRExC_state, &scan, &fake, scan + RExC_size, &data, SCF_DO_STCLASS_AND|SCF_WHILEM_VISITED_POS);
c277df42 1986 r->check_substr = r->anchored_substr = r->float_substr = Nullsv;
653099ff
GS
1987 if (!(data.start_class->flags & ANYOF_EOS)
1988 && !cl_is_anything(data.start_class)) {
830247a4 1989 I32 n = add_data(pRExC_state, 1, "f");
653099ff 1990
b81d288d 1991 New(1006, RExC_rx->data->data[n], 1,
653099ff
GS
1992 struct regnode_charclass_class);
1993 StructCopy(data.start_class,
830247a4 1994 (struct regnode_charclass_class*)RExC_rx->data->data[n],
653099ff 1995 struct regnode_charclass_class);
830247a4 1996 r->regstclass = (regnode*)RExC_rx->data->data[n];
653099ff 1997 r->reganch &= ~ROPT_SKIP; /* Used in find_byclass(). */
9c5ffd7c
JH
1998 DEBUG_r({ SV* sv = sv_newmortal();
1999 regprop(sv, (regnode*)data.start_class);
2000 PerlIO_printf(Perl_debug_log,
2001 "synthetic stclass `%s'.\n",
2002 SvPVX(sv));});
653099ff 2003 }
a0d0e21e
LW
2004 }
2005
a0d0e21e 2006 r->minlen = minlen;
b81d288d 2007 if (RExC_seen & REG_SEEN_GPOS)
c277df42 2008 r->reganch |= ROPT_GPOS_SEEN;
830247a4 2009 if (RExC_seen & REG_SEEN_LOOKBEHIND)
c277df42 2010 r->reganch |= ROPT_LOOKBEHIND_SEEN;
830247a4 2011 if (RExC_seen & REG_SEEN_EVAL)
ce862d02 2012 r->reganch |= ROPT_EVAL_SEEN;
f33976b4
DB
2013 if (RExC_seen & REG_SEEN_CANY)
2014 r->reganch |= ROPT_CANY_SEEN;
830247a4
IZ
2015 Newz(1002, r->startp, RExC_npar, I32);
2016 Newz(1002, r->endp, RExC_npar, I32);
ffc61ed2 2017 PL_regdata = r->data; /* for regprop() */
a0d0e21e
LW
2018 DEBUG_r(regdump(r));
2019 return(r);
a687059c
LW
2020}
2021
2022/*
2023 - reg - regular expression, i.e. main body or parenthesized thing
2024 *
2025 * Caller must absorb opening parenthesis.
2026 *
2027 * Combining parenthesis handling with the base level of regular expression
2028 * is a trifle forced, but the need to tie the tails of the branches to what
2029 * follows makes it hard to avoid.
2030 */
76e3520e 2031STATIC regnode *
830247a4 2032S_reg(pTHX_ RExC_state_t *pRExC_state, I32 paren, I32 *flagp)
c277df42 2033 /* paren: Parenthesized? 0=top, 1=(, inside: changed to letter. */
a687059c 2034{
c277df42
IZ
2035 register regnode *ret; /* Will be the head of the group. */
2036 register regnode *br;
2037 register regnode *lastbr;
2038 register regnode *ender = 0;
a0d0e21e 2039 register I32 parno = 0;
830247a4 2040 I32 flags, oregflags = RExC_flags16, have_branch = 0, open = 0;
9d1d55b5
JP
2041
2042 /* for (?g), (?gc), and (?o) warnings; warning
2043 about (?c) will warn about (?g) -- japhy */
2044
2045 I32 wastedflags = 0x00,
2046 wasted_o = 0x01,
2047 wasted_g = 0x02,
2048 wasted_gc = 0x02 | 0x04,
2049 wasted_c = 0x04;
2050
fac92740 2051 char * parse_start = RExC_parse; /* MJD */
830247a4 2052 char *oregcomp_parse = RExC_parse;
c277df42 2053 char c;
a0d0e21e 2054
821b33a5 2055 *flagp = 0; /* Tentatively. */
a0d0e21e 2056
9d1d55b5 2057
a0d0e21e
LW
2058 /* Make an OPEN node, if parenthesized. */
2059 if (paren) {
fac92740 2060 if (*RExC_parse == '?') { /* (?...) */
ca9dfc88
IZ
2061 U16 posflags = 0, negflags = 0;
2062 U16 *flagsp = &posflags;
0f5d15d6 2063 int logical = 0;
830247a4 2064 char *seqstart = RExC_parse;
ca9dfc88 2065
830247a4
IZ
2066 RExC_parse++;
2067 paren = *RExC_parse++;
c277df42 2068 ret = NULL; /* For look-ahead/behind. */
a0d0e21e 2069 switch (paren) {
fac92740 2070 case '<': /* (?<...) */
830247a4 2071 RExC_seen |= REG_SEEN_LOOKBEHIND;
b81d288d 2072 if (*RExC_parse == '!')
c277df42 2073 paren = ',';
b81d288d 2074 if (*RExC_parse != '=' && *RExC_parse != '!')
c277df42 2075 goto unknown;
830247a4 2076 RExC_parse++;
fac92740
MJD
2077 case '=': /* (?=...) */
2078 case '!': /* (?!...) */
830247a4 2079 RExC_seen_zerolen++;
fac92740
MJD
2080 case ':': /* (?:...) */
2081 case '>': /* (?>...) */
a0d0e21e 2082 break;
fac92740
MJD
2083 case '$': /* (?$...) */
2084 case '@': /* (?@...) */
8615cb43 2085 vFAIL2("Sequence (?%c...) not implemented", (int)paren);
a0d0e21e 2086 break;
fac92740 2087 case '#': /* (?#...) */
830247a4
IZ
2088 while (*RExC_parse && *RExC_parse != ')')
2089 RExC_parse++;
2090 if (*RExC_parse != ')')
c277df42 2091 FAIL("Sequence (?#... not terminated");
830247a4 2092 nextchar(pRExC_state);
a0d0e21e
LW
2093 *flagp = TRYAGAIN;
2094 return NULL;
fac92740 2095 case 'p': /* (?p...) */
f9373011
PM
2096 if (SIZE_ONLY && ckWARN2(WARN_DEPRECATED, WARN_REGEXP))
2097 vWARNdep(RExC_parse, "(?p{}) is deprecated - use (??{})");
8c8ad484 2098 /* FALL THROUGH*/
fac92740 2099 case '?': /* (??...) */
0f5d15d6 2100 logical = 1;
830247a4 2101 paren = *RExC_parse++;
0f5d15d6 2102 /* FALL THROUGH */
fac92740 2103 case '{': /* (?{...}) */
c277df42 2104 {
c277df42
IZ
2105 I32 count = 1, n = 0;
2106 char c;
830247a4 2107 char *s = RExC_parse;
c277df42
IZ
2108 SV *sv;
2109 OP_4tree *sop, *rop;
2110
830247a4
IZ
2111 RExC_seen_zerolen++;
2112 RExC_seen |= REG_SEEN_EVAL;
2113 while (count && (c = *RExC_parse)) {
2114 if (c == '\\' && RExC_parse[1])
2115 RExC_parse++;
b81d288d 2116 else if (c == '{')
c277df42 2117 count++;
b81d288d 2118 else if (c == '}')
c277df42 2119 count--;
830247a4 2120 RExC_parse++;
c277df42 2121 }
830247a4 2122 if (*RExC_parse != ')')
b45f050a 2123 {
b81d288d 2124 RExC_parse = s;
b45f050a
JF
2125 vFAIL("Sequence (?{...}) not terminated or not {}-balanced");
2126 }
c277df42
IZ
2127 if (!SIZE_ONLY) {
2128 AV *av;
b81d288d
AB
2129
2130 if (RExC_parse - 1 - s)
830247a4 2131 sv = newSVpvn(s, RExC_parse - 1 - s);
c277df42 2132 else
79cb57f6 2133 sv = newSVpvn("", 0);
c277df42 2134
569233ed
SB
2135 ENTER;
2136 Perl_save_re_context(aTHX);
c277df42 2137 rop = sv_compile_2op(sv, &sop, "re", &av);
9b978d73
DM
2138 sop->op_private |= OPpREFCOUNTED;
2139 /* re_dup will OpREFCNT_inc */
2140 OpREFCNT_set(sop, 1);
569233ed 2141 LEAVE;
c277df42 2142
830247a4
IZ
2143 n = add_data(pRExC_state, 3, "nop");
2144 RExC_rx->data->data[n] = (void*)rop;
2145 RExC_rx->data->data[n+1] = (void*)sop;
2146 RExC_rx->data->data[n+2] = (void*)av;
c277df42 2147 SvREFCNT_dec(sv);
a0ed51b3 2148 }
e24b16f9 2149 else { /* First pass */
830247a4 2150 if (PL_reginterp_cnt < ++RExC_seen_evals
e24b16f9 2151 && PL_curcop != &PL_compiling)
2cd61cdb
IZ
2152 /* No compiled RE interpolated, has runtime
2153 components ===> unsafe. */
2154 FAIL("Eval-group not allowed at runtime, use re 'eval'");
5b61d3f7 2155 if (PL_tainting && PL_tainted)
cc6b7395 2156 FAIL("Eval-group in insecure regular expression");
c277df42
IZ
2157 }
2158
830247a4 2159 nextchar(pRExC_state);
0f5d15d6 2160 if (logical) {
830247a4 2161 ret = reg_node(pRExC_state, LOGICAL);
0f5d15d6
IZ
2162 if (!SIZE_ONLY)
2163 ret->flags = 2;
830247a4 2164 regtail(pRExC_state, ret, reganode(pRExC_state, EVAL, n));
fac92740 2165 /* deal with the length of this later - MJD */
0f5d15d6
IZ
2166 return ret;
2167 }
830247a4 2168 return reganode(pRExC_state, EVAL, n);
c277df42 2169 }
fac92740 2170 case '(': /* (?(?{...})...) and (?(?=...)...) */
c277df42 2171 {
fac92740 2172 if (RExC_parse[0] == '?') { /* (?(?...)) */
b81d288d
AB
2173 if (RExC_parse[1] == '=' || RExC_parse[1] == '!'
2174 || RExC_parse[1] == '<'
830247a4 2175 || RExC_parse[1] == '{') { /* Lookahead or eval. */
c277df42
IZ
2176 I32 flag;
2177
830247a4 2178 ret = reg_node(pRExC_state, LOGICAL);
0f5d15d6
IZ
2179 if (!SIZE_ONLY)
2180 ret->flags = 1;
830247a4 2181 regtail(pRExC_state, ret, reg(pRExC_state, 1, &flag));
c277df42 2182 goto insert_if;
b81d288d 2183 }
a0ed51b3 2184 }
830247a4 2185 else if (RExC_parse[0] >= '1' && RExC_parse[0] <= '9' ) {
fac92740 2186 /* (?(1)...) */
830247a4 2187 parno = atoi(RExC_parse++);
c277df42 2188
830247a4
IZ
2189 while (isDIGIT(*RExC_parse))
2190 RExC_parse++;
fac92740
MJD
2191 ret = reganode(pRExC_state, GROUPP, parno);
2192
830247a4 2193 if ((c = *nextchar(pRExC_state)) != ')')
b45f050a 2194 vFAIL("Switch condition not recognized");
c277df42 2195 insert_if:
830247a4
IZ
2196 regtail(pRExC_state, ret, reganode(pRExC_state, IFTHEN, 0));
2197 br = regbranch(pRExC_state, &flags, 1);
c277df42 2198 if (br == NULL)
830247a4 2199 br = reganode(pRExC_state, LONGJMP, 0);
c277df42 2200 else
830247a4
IZ
2201 regtail(pRExC_state, br, reganode(pRExC_state, LONGJMP, 0));
2202 c = *nextchar(pRExC_state);
d1b80229
IZ
2203 if (flags&HASWIDTH)
2204 *flagp |= HASWIDTH;
c277df42 2205 if (c == '|') {
830247a4
IZ
2206 lastbr = reganode(pRExC_state, IFTHEN, 0); /* Fake one for optimizer. */
2207 regbranch(pRExC_state, &flags, 1);
2208 regtail(pRExC_state, ret, lastbr);
d1b80229
IZ
2209 if (flags&HASWIDTH)
2210 *flagp |= HASWIDTH;
830247a4 2211 c = *nextchar(pRExC_state);
a0ed51b3
LW
2212 }
2213 else
c277df42
IZ
2214 lastbr = NULL;
2215 if (c != ')')
8615cb43 2216 vFAIL("Switch (?(condition)... contains too many branches");
830247a4
IZ
2217 ender = reg_node(pRExC_state, TAIL);
2218 regtail(pRExC_state, br, ender);
c277df42 2219 if (lastbr) {
830247a4
IZ
2220 regtail(pRExC_state, lastbr, ender);
2221 regtail(pRExC_state, NEXTOPER(NEXTOPER(lastbr)), ender);
a0ed51b3
LW
2222 }
2223 else
830247a4 2224 regtail(pRExC_state, ret, ender);
c277df42 2225 return ret;
a0ed51b3
LW
2226 }
2227 else {
830247a4 2228 vFAIL2("Unknown switch condition (?(%.2s", RExC_parse);
c277df42
IZ
2229 }
2230 }
1b1626e4 2231 case 0:
830247a4 2232 RExC_parse--; /* for vFAIL to print correctly */
8615cb43 2233 vFAIL("Sequence (? incomplete");
1b1626e4 2234 break;
a0d0e21e 2235 default:
830247a4 2236 --RExC_parse;
fac92740 2237 parse_flags: /* (?i) */
830247a4 2238 while (*RExC_parse && strchr("iogcmsx", *RExC_parse)) {
9d1d55b5
JP
2239 /* (?g), (?gc) and (?o) are useless here
2240 and must be globally applied -- japhy */
2241
2242 if (*RExC_parse == 'o' || *RExC_parse == 'g') {
2243 if (SIZE_ONLY && ckWARN(WARN_REGEXP)) {
2244 I32 wflagbit = *RExC_parse == 'o' ? wasted_o : wasted_g;
2245 if (! (wastedflags & wflagbit) ) {
2246 wastedflags |= wflagbit;
2247 vWARN5(
2248 RExC_parse + 1,
2249 "Useless (%s%c) - %suse /%c modifier",
2250 flagsp == &negflags ? "?-" : "?",
2251 *RExC_parse,
2252 flagsp == &negflags ? "don't " : "",
2253 *RExC_parse
2254 );
2255 }
2256 }
2257 }
2258 else if (*RExC_parse == 'c') {
2259 if (SIZE_ONLY && ckWARN(WARN_REGEXP)) {
2260 if (! (wastedflags & wasted_c) ) {
2261 wastedflags |= wasted_gc;
2262 vWARN3(
2263 RExC_parse + 1,
2264 "Useless (%sc) - %suse /gc modifier",
2265 flagsp == &negflags ? "?-" : "?",
2266 flagsp == &negflags ? "don't " : ""
2267 );
2268 }
2269 }
2270 }
2271 else { pmflag(flagsp, *RExC_parse); }
2272
830247a4 2273 ++RExC_parse;
ca9dfc88 2274 }
830247a4 2275 if (*RExC_parse == '-') {
ca9dfc88 2276 flagsp = &negflags;
9d1d55b5 2277 wastedflags = 0; /* reset so (?g-c) warns twice */
830247a4 2278 ++RExC_parse;
ca9dfc88 2279 goto parse_flags;
48c036b1 2280 }
830247a4
IZ
2281 RExC_flags16 |= posflags;
2282 RExC_flags16 &= ~negflags;
2283 if (*RExC_parse == ':') {
2284 RExC_parse++;
ca9dfc88
IZ
2285 paren = ':';
2286 break;
2287 }
c277df42 2288 unknown:
830247a4
IZ
2289 if (*RExC_parse != ')') {
2290 RExC_parse++;
2291 vFAIL3("Sequence (%.*s...) not recognized", RExC_parse-seqstart, seqstart);
b45f050a 2292 }
830247a4 2293 nextchar(pRExC_state);
a0d0e21e
LW
2294 *flagp = TRYAGAIN;
2295 return NULL;
2296 }
2297 }
fac92740 2298 else { /* (...) */
830247a4
IZ
2299 parno = RExC_npar;
2300 RExC_npar++;
2301 ret = reganode(pRExC_state, OPEN, parno);
fac92740
MJD
2302 Set_Node_Length(ret, 1); /* MJD */
2303 Set_Node_Offset(ret, RExC_parse); /* MJD */
c277df42 2304 open = 1;
a0d0e21e 2305 }
a0ed51b3 2306 }
fac92740 2307 else /* ! paren */
a0d0e21e
LW
2308 ret = NULL;
2309
2310 /* Pick up the branches, linking them together. */
fac92740 2311 parse_start = RExC_parse; /* MJD */
830247a4 2312 br = regbranch(pRExC_state, &flags, 1);
fac92740
MJD
2313 /* branch_len = (paren != 0); */
2314
a0d0e21e
LW
2315 if (br == NULL)
2316 return(NULL);
830247a4
IZ
2317 if (*RExC_parse == '|') {
2318 if (!SIZE_ONLY && RExC_extralen) {
2319 reginsert(pRExC_state, BRANCHJ, br);
a0ed51b3 2320 }
fac92740 2321 else { /* MJD */
830247a4 2322 reginsert(pRExC_state, BRANCH, br);
fac92740
MJD
2323 Set_Node_Length(br, paren != 0);
2324 Set_Node_Offset_To_R(br-RExC_emit_start, parse_start-RExC_start);
2325 }
c277df42
IZ
2326 have_branch = 1;
2327 if (SIZE_ONLY)
830247a4 2328 RExC_extralen += 1; /* For BRANCHJ-BRANCH. */
a0ed51b3
LW
2329 }
2330 else if (paren == ':') {
c277df42
IZ
2331 *flagp |= flags&SIMPLE;
2332 }
2333 if (open) { /* Starts with OPEN. */
830247a4 2334 regtail(pRExC_state, ret, br); /* OPEN -> first. */
a0ed51b3
LW
2335 }
2336 else if (paren != '?') /* Not Conditional */
a0d0e21e 2337 ret = br;
821b33a5
IZ
2338 if (flags&HASWIDTH)
2339 *flagp |= HASWIDTH;
a0d0e21e 2340 *flagp |= flags&SPSTART;
c277df42 2341 lastbr = br;
830247a4
IZ
2342 while (*RExC_parse == '|') {
2343 if (!SIZE_ONLY && RExC_extralen) {
2344 ender = reganode(pRExC_state, LONGJMP,0);
2345 regtail(pRExC_state, NEXTOPER(NEXTOPER(lastbr)), ender); /* Append to the previous. */
c277df42
IZ
2346 }
2347 if (SIZE_ONLY)
830247a4
IZ
2348 RExC_extralen += 2; /* Account for LONGJMP. */
2349 nextchar(pRExC_state);
2350 br = regbranch(pRExC_state, &flags, 0);
fac92740 2351
a687059c 2352 if (br == NULL)
a0d0e21e 2353 return(NULL);
830247a4 2354 regtail(pRExC_state, lastbr, br); /* BRANCH -> BRANCH. */
c277df42 2355 lastbr = br;
821b33a5
IZ
2356 if (flags&HASWIDTH)
2357 *flagp |= HASWIDTH;
a687059c 2358 *flagp |= flags&SPSTART;
a0d0e21e
LW
2359 }
2360
c277df42
IZ
2361 if (have_branch || paren != ':') {
2362 /* Make a closing node, and hook it on the end. */
2363 switch (paren) {
2364 case ':':
830247a4 2365 ender = reg_node(pRExC_state, TAIL);
c277df42
IZ
2366 break;
2367 case 1:
830247a4 2368 ender = reganode(pRExC_state, CLOSE, parno);
fac92740
MJD
2369 Set_Node_Offset(ender,RExC_parse+1); /* MJD */
2370 Set_Node_Length(ender,1); /* MJD */
c277df42
IZ
2371 break;
2372 case '<':
c277df42
IZ
2373 case ',':
2374 case '=':
2375 case '!':
c277df42 2376 *flagp &= ~HASWIDTH;
821b33a5
IZ
2377 /* FALL THROUGH */
2378 case '>':
830247a4 2379 ender = reg_node(pRExC_state, SUCCEED);
c277df42
IZ
2380 break;
2381 case 0:
830247a4 2382 ender = reg_node(pRExC_state, END);
c277df42
IZ
2383 break;
2384 }
830247a4 2385 regtail(pRExC_state, lastbr, ender);
a0d0e21e 2386
c277df42
IZ
2387 if (have_branch) {
2388 /* Hook the tails of the branches to the closing node. */
2389 for (br = ret; br != NULL; br = regnext(br)) {
830247a4 2390 regoptail(pRExC_state, br, ender);
c277df42
IZ
2391 }
2392 }
a0d0e21e 2393 }
c277df42
IZ
2394
2395 {
2396 char *p;
2397 static char parens[] = "=!<,>";
2398
2399 if (paren && (p = strchr(parens, paren))) {
2400 int node = ((p - parens) % 2) ? UNLESSM : IFMATCH;
2401 int flag = (p - parens) > 1;
2402
2403 if (paren == '>')
2404 node = SUSPEND, flag = 0;
830247a4 2405 reginsert(pRExC_state, node,ret);
c277df42 2406 ret->flags = flag;
830247a4 2407 regtail(pRExC_state, ret, reg_node(pRExC_state, TAIL));
c277df42 2408 }
a0d0e21e
LW
2409 }
2410
2411 /* Check for proper termination. */
ce3e6498 2412 if (paren) {
830247a4
IZ
2413 RExC_flags16 = oregflags;
2414 if (RExC_parse >= RExC_end || *nextchar(pRExC_state) != ')') {
2415 RExC_parse = oregcomp_parse;
380a0633 2416 vFAIL("Unmatched (");
ce3e6498 2417 }
a0ed51b3 2418 }
830247a4
IZ
2419 else if (!paren && RExC_parse < RExC_end) {
2420 if (*RExC_parse == ')') {
2421 RExC_parse++;
380a0633 2422 vFAIL("Unmatched )");
a0ed51b3
LW
2423 }
2424 else
b45f050a 2425 FAIL("Junk on end of regexp"); /* "Can't happen". */
a0d0e21e
LW
2426 /* NOTREACHED */
2427 }
a687059c 2428
a0d0e21e 2429 return(ret);
a687059c
LW
2430}
2431
2432/*
2433 - regbranch - one alternative of an | operator
2434 *
2435 * Implements the concatenation operator.
2436 */
76e3520e 2437STATIC regnode *
830247a4 2438S_regbranch(pTHX_ RExC_state_t *pRExC_state, I32 *flagp, I32 first)
a687059c 2439{
c277df42
IZ
2440 register regnode *ret;
2441 register regnode *chain = NULL;
2442 register regnode *latest;
2443 I32 flags = 0, c = 0;
a0d0e21e 2444
b81d288d 2445 if (first)
c277df42
IZ
2446 ret = NULL;
2447 else {
b81d288d 2448 if (!SIZE_ONLY && RExC_extralen)
830247a4 2449 ret = reganode(pRExC_state, BRANCHJ,0);
fac92740 2450 else {
830247a4 2451 ret = reg_node(pRExC_state, BRANCH);
fac92740
MJD
2452 Set_Node_Length(ret, 1);
2453 }
c277df42
IZ
2454 }
2455
b81d288d 2456 if (!first && SIZE_ONLY)
830247a4 2457 RExC_extralen += 1; /* BRANCHJ */
b81d288d 2458
c277df42 2459 *flagp = WORST; /* Tentatively. */
a0d0e21e 2460
830247a4
IZ
2461 RExC_parse--;
2462 nextchar(pRExC_state);
2463 while (RExC_parse < RExC_end && *RExC_parse != '|' && *RExC_parse != ')') {
a0d0e21e 2464 flags &= ~TRYAGAIN;
830247a4 2465 latest = regpiece(pRExC_state, &flags);
a0d0e21e
LW
2466 if (latest == NULL) {
2467 if (flags & TRYAGAIN)
2468 continue;
2469 return(NULL);
a0ed51b3
LW
2470 }
2471 else if (ret == NULL)
c277df42 2472 ret = latest;
a0d0e21e 2473 *flagp |= flags&HASWIDTH;
c277df42 2474 if (chain == NULL) /* First piece. */
a0d0e21e
LW
2475 *flagp |= flags&SPSTART;
2476 else {
830247a4
IZ
2477 RExC_naughty++;
2478 regtail(pRExC_state, chain, latest);
a687059c 2479 }
a0d0e21e 2480 chain = latest;
c277df42
IZ
2481 c++;
2482 }
2483 if (chain == NULL) { /* Loop ran zero times. */
830247a4 2484 chain = reg_node(pRExC_state, NOTHING);
c277df42
IZ
2485 if (ret == NULL)
2486 ret = chain;
2487 }
2488 if (c == 1) {
2489 *flagp |= flags&SIMPLE;
a0d0e21e 2490 }
a687059c 2491
a0d0e21e 2492 return(ret);
a687059c
LW
2493}
2494
2495/*
2496 - regpiece - something followed by possible [*+?]
2497 *
2498 * Note that the branching code sequences used for ? and the general cases
2499 * of * and + are somewhat optimized: they use the same NOTHING node as
2500 * both the endmarker for their branch list and the body of the last branch.
2501 * It might seem that this node could be dispensed with entirely, but the
2502 * endmarker role is not redundant.
2503 */
76e3520e 2504STATIC regnode *
830247a4 2505S_regpiece(pTHX_ RExC_state_t *pRExC_state, I32 *flagp)
a687059c 2506{
c277df42 2507 register regnode *ret;
a0d0e21e
LW
2508 register char op;
2509 register char *next;
2510 I32 flags;
830247a4 2511 char *origparse = RExC_parse;
a0d0e21e
LW
2512 char *maxpos;
2513 I32 min;
c277df42 2514 I32 max = REG_INFTY;
fac92740 2515 char *parse_start;
a0d0e21e 2516
830247a4 2517 ret = regatom(pRExC_state, &flags);
a0d0e21e
LW
2518 if (ret == NULL) {
2519 if (flags & TRYAGAIN)
2520 *flagp |= TRYAGAIN;
2521 return(NULL);
2522 }
2523
830247a4 2524 op = *RExC_parse;
a0d0e21e 2525
830247a4 2526 if (op == '{' && regcurly(RExC_parse)) {
fac92740 2527 parse_start = RExC_parse; /* MJD */
830247a4 2528 next = RExC_parse + 1;
a0d0e21e
LW
2529 maxpos = Nullch;
2530 while (isDIGIT(*next) || *next == ',') {
2531 if (*next == ',') {
2532 if (maxpos)
2533 break;
2534 else
2535 maxpos = next;
a687059c 2536 }
a0d0e21e
LW
2537 next++;
2538 }
2539 if (*next == '}') { /* got one */
2540 if (!maxpos)
2541 maxpos = next;
830247a4
IZ
2542 RExC_parse++;
2543 min = atoi(RExC_parse);
a0d0e21e
LW
2544 if (*maxpos == ',')
2545 maxpos++;
2546 else
830247a4 2547 maxpos = RExC_parse;
a0d0e21e
LW
2548 max = atoi(maxpos);
2549 if (!max && *maxpos != '0')
c277df42
IZ
2550 max = REG_INFTY; /* meaning "infinity" */
2551 else if (max >= REG_INFTY)
8615cb43 2552 vFAIL2("Quantifier in {,} bigger than %d", REG_INFTY - 1);
830247a4
IZ
2553 RExC_parse = next;
2554 nextchar(pRExC_state);
a0d0e21e
LW
2555
2556 do_curly:
2557 if ((flags&SIMPLE)) {
830247a4
IZ
2558 RExC_naughty += 2 + RExC_naughty / 2;
2559 reginsert(pRExC_state, CURLY, ret);
fac92740
MJD
2560 Set_Node_Offset(ret, parse_start+1); /* MJD */
2561 Set_Node_Cur_Length(ret);
a0d0e21e
LW
2562 }
2563 else {
830247a4 2564 regnode *w = reg_node(pRExC_state, WHILEM);
2c2d71f5
JH
2565
2566 w->flags = 0;
830247a4
IZ
2567 regtail(pRExC_state, ret, w);
2568 if (!SIZE_ONLY && RExC_extralen) {
2569 reginsert(pRExC_state, LONGJMP,ret);
2570 reginsert(pRExC_state, NOTHING,ret);
c277df42
IZ
2571 NEXT_OFF(ret) = 3; /* Go over LONGJMP. */
2572 }
830247a4 2573 reginsert(pRExC_state, CURLYX,ret);
fac92740
MJD
2574 /* MJD hk */
2575 Set_Node_Offset(ret, parse_start+1);
2576 Set_Node_Length(ret,
2577 op == '{' ? (RExC_parse - parse_start) : 1);
2578
830247a4 2579 if (!SIZE_ONLY && RExC_extralen)
c277df42 2580 NEXT_OFF(ret) = 3; /* Go over NOTHING to LONGJMP. */
830247a4 2581 regtail(pRExC_state, ret, reg_node(pRExC_state, NOTHING));
c277df42 2582 if (SIZE_ONLY)
830247a4
IZ
2583 RExC_whilem_seen++, RExC_extralen += 3;
2584 RExC_naughty += 4 + RExC_naughty; /* compound interest */
a0d0e21e 2585 }
c277df42 2586 ret->flags = 0;
a0d0e21e
LW
2587
2588 if (min > 0)
821b33a5
IZ
2589 *flagp = WORST;
2590 if (max > 0)
2591 *flagp |= HASWIDTH;
a0d0e21e 2592 if (max && max < min)
8615cb43 2593 vFAIL("Can't do {n,m} with n > m");
c277df42
IZ
2594 if (!SIZE_ONLY) {
2595 ARG1_SET(ret, min);
2596 ARG2_SET(ret, max);
a687059c 2597 }
a687059c 2598
a0d0e21e 2599 goto nest_check;
a687059c 2600 }
a0d0e21e 2601 }
a687059c 2602
a0d0e21e
LW
2603 if (!ISMULT1(op)) {
2604 *flagp = flags;
a687059c 2605 return(ret);
a0d0e21e 2606 }
bb20fd44 2607
c277df42 2608#if 0 /* Now runtime fix should be reliable. */
b45f050a
JF
2609
2610 /* if this is reinstated, don't forget to put this back into perldiag:
2611
2612 =item Regexp *+ operand could be empty at {#} in regex m/%s/
2613
2614 (F) The part of the regexp subject to either the * or + quantifier
2615 could match an empty string. The {#} shows in the regular
2616 expression about where the problem was discovered.
2617
2618 */
2619
bb20fd44 2620 if (!(flags&HASWIDTH) && op != '?')
b45f050a 2621 vFAIL("Regexp *+ operand could be empty");
b81d288d 2622#endif
bb20fd44 2623
fac92740 2624 parse_start = RExC_parse;
830247a4 2625 nextchar(pRExC_state);
a0d0e21e 2626
821b33a5 2627 *flagp = (op != '+') ? (WORST|SPSTART|HASWIDTH) : (WORST|HASWIDTH);
a0d0e21e
LW
2628
2629 if (op == '*' && (flags&SIMPLE)) {
830247a4 2630 reginsert(pRExC_state, STAR, ret);
c277df42 2631 ret->flags = 0;
830247a4 2632 RExC_naughty += 4;
a0d0e21e
LW
2633 }
2634 else if (op == '*') {
2635 min = 0;
2636 goto do_curly;
a0ed51b3
LW
2637 }
2638 else if (op == '+' && (flags&SIMPLE)) {
830247a4 2639 reginsert(pRExC_state, PLUS, ret);
c277df42 2640 ret->flags = 0;
830247a4 2641 RExC_naughty += 3;
a0d0e21e
LW
2642 }
2643 else if (op == '+') {
2644 min = 1;
2645 goto do_curly;
a0ed51b3
LW
2646 }
2647 else if (op == '?') {
a0d0e21e
LW
2648 min = 0; max = 1;
2649 goto do_curly;
2650 }
2651 nest_check:
e476b1b5 2652 if (ckWARN(WARN_REGEXP) && !SIZE_ONLY && !(flags&HASWIDTH) && max > REG_INFTY/3) {
830247a4 2653 vWARN3(RExC_parse,
b45f050a 2654 "%.*s matches null string many times",
830247a4 2655 RExC_parse - origparse,
b45f050a 2656 origparse);
a0d0e21e
LW
2657 }
2658
830247a4
IZ
2659 if (*RExC_parse == '?') {
2660 nextchar(pRExC_state);
2661 reginsert(pRExC_state, MINMOD, ret);
2662 regtail(pRExC_state, ret, ret + NODE_STEP_REGNODE);
a0d0e21e 2663 }
830247a4
IZ
2664 if (ISMULT2(RExC_parse)) {
2665 RExC_parse++;
b45f050a
JF
2666 vFAIL("Nested quantifiers");
2667 }
a0d0e21e
LW
2668
2669 return(ret);
a687059c
LW
2670}
2671
2672/*
2673 - regatom - the lowest level
2674 *
2675 * Optimization: gobbles an entire sequence of ordinary characters so that
2676 * it can turn them into a single node, which is smaller to store and
2677 * faster to run. Backslashed characters are exceptions, each becoming a
2678 * separate node; the code is simpler that way and it's not worth fixing.
2679 *
b45f050a 2680 * [Yes, it is worth fixing, some scripts can run twice the speed.] */
76e3520e 2681STATIC regnode *
830247a4 2682S_regatom(pTHX_ RExC_state_t *pRExC_state, I32 *flagp)
a687059c 2683{
c277df42 2684 register regnode *ret = 0;
a0d0e21e 2685 I32 flags;
f06dbbb7 2686 char *parse_start = 0;
a0d0e21e
LW
2687
2688 *flagp = WORST; /* Tentatively. */
2689
2690tryagain:
830247a4 2691 switch (*RExC_parse) {
a0d0e21e 2692 case '^':
830247a4
IZ
2693 RExC_seen_zerolen++;
2694 nextchar(pRExC_state);
2695 if (RExC_flags16 & PMf_MULTILINE)
2696 ret = reg_node(pRExC_state, MBOL);
2697 else if (RExC_flags16 & PMf_SINGLELINE)
2698 ret = reg_node(pRExC_state, SBOL);
a0d0e21e 2699 else
830247a4 2700 ret = reg_node(pRExC_state, BOL);
fac92740 2701 Set_Node_Length(ret, 1); /* MJD */
a0d0e21e
LW
2702 break;
2703 case '$':
830247a4 2704 nextchar(pRExC_state);
b81d288d 2705 if (*RExC_parse)
830247a4
IZ
2706 RExC_seen_zerolen++;
2707 if (RExC_flags16 & PMf_MULTILINE)
2708 ret = reg_node(pRExC_state, MEOL);
2709 else if (RExC_flags16 & PMf_SINGLELINE)
2710 ret = reg_node(pRExC_state, SEOL);
a0d0e21e 2711 else
830247a4 2712 ret = reg_node(pRExC_state, EOL);
fac92740 2713 Set_Node_Length(ret, 1); /* MJD */
a0d0e21e
LW
2714 break;
2715 case '.':
830247a4 2716 nextchar(pRExC_state);
cce850e4 2717 if (RExC_flags16 & PMf_SINGLELINE)
ffc61ed2
JH
2718 ret = reg_node(pRExC_state, SANY);
2719 else
2720 ret = reg_node(pRExC_state, REG_ANY);
2721 *flagp |= HASWIDTH|SIMPLE;
830247a4 2722 RExC_naughty++;
fac92740 2723 Set_Node_Length(ret, 1); /* MJD */
a0d0e21e
LW
2724 break;
2725 case '[':
b45f050a 2726 {
830247a4 2727 char *oregcomp_parse = ++RExC_parse;
ffc61ed2 2728 ret = regclass(pRExC_state);
830247a4
IZ
2729 if (*RExC_parse != ']') {
2730 RExC_parse = oregcomp_parse;
b45f050a
JF
2731 vFAIL("Unmatched [");
2732 }
830247a4 2733 nextchar(pRExC_state);
a0d0e21e 2734 *flagp |= HASWIDTH|SIMPLE;
fac92740 2735 Set_Node_Length(ret, RExC_parse - oregcomp_parse + 1); /* MJD */
a0d0e21e 2736 break;
b45f050a 2737 }
a0d0e21e 2738 case '(':
830247a4
IZ
2739 nextchar(pRExC_state);
2740 ret = reg(pRExC_state, 1, &flags);
a0d0e21e 2741 if (ret == NULL) {
bf93d4cc 2742 if (flags & TRYAGAIN) {
830247a4 2743 if (RExC_parse == RExC_end) {
bf93d4cc
GS
2744 /* Make parent create an empty node if needed. */
2745 *flagp |= TRYAGAIN;
2746 return(NULL);
2747 }
a0d0e21e 2748 goto tryagain;
bf93d4cc 2749 }
a0d0e21e
LW
2750 return(NULL);
2751 }
c277df42 2752 *flagp |= flags&(HASWIDTH|SPSTART|SIMPLE);
a0d0e21e
LW
2753 break;
2754 case '|':
2755 case ')':
2756 if (flags & TRYAGAIN) {
2757 *flagp |= TRYAGAIN;
2758 return NULL;
2759 }
b45f050a 2760 vFAIL("Internal urp");
a0d0e21e
LW
2761 /* Supposed to be caught earlier. */
2762 break;
85afd4ae 2763 case '{':
830247a4
IZ
2764 if (!regcurly(RExC_parse)) {
2765 RExC_parse++;
85afd4ae
CS
2766 goto defchar;
2767 }
2768 /* FALL THROUGH */
a0d0e21e
LW
2769 case '?':
2770 case '+':
2771 case '*':
830247a4 2772 RExC_parse++;
b45f050a 2773 vFAIL("Quantifier follows nothing");
a0d0e21e
LW
2774 break;
2775 case '\\':
830247a4 2776 switch (*++RExC_parse) {
a0d0e21e 2777 case 'A':
830247a4
IZ
2778 RExC_seen_zerolen++;
2779 ret = reg_node(pRExC_state, SBOL);
a0d0e21e 2780 *flagp |= SIMPLE;
830247a4 2781 nextchar(pRExC_state);
fac92740 2782 Set_Node_Length(ret, 2); /* MJD */
a0d0e21e
LW
2783 break;
2784 case 'G':
830247a4
IZ
2785 ret = reg_node(pRExC_state, GPOS);
2786 RExC_seen |= REG_SEEN_GPOS;
a0d0e21e 2787 *flagp |= SIMPLE;
830247a4 2788 nextchar(pRExC_state);
fac92740 2789 Set_Node_Length(ret, 2); /* MJD */
a0d0e21e
LW
2790 break;
2791 case 'Z':
830247a4 2792 ret = reg_node(pRExC_state, SEOL);
a0d0e21e 2793 *flagp |= SIMPLE;
830247a4 2794 nextchar(pRExC_state);
a0d0e21e 2795 break;
b85d18e9 2796 case 'z':
830247a4 2797 ret = reg_node(pRExC_state, EOS);
b85d18e9 2798 *flagp |= SIMPLE;
830247a4
IZ
2799 RExC_seen_zerolen++; /* Do not optimize RE away */
2800 nextchar(pRExC_state);
fac92740 2801 Set_Node_Length(ret, 2); /* MJD */
b85d18e9 2802 break;
4a2d328f 2803 case 'C':
f33976b4
DB
2804 ret = reg_node(pRExC_state, CANY);
2805 RExC_seen |= REG_SEEN_CANY;
a0ed51b3 2806 *flagp |= HASWIDTH|SIMPLE;
830247a4 2807 nextchar(pRExC_state);
fac92740 2808 Set_Node_Length(ret, 2); /* MJD */
a0ed51b3
LW
2809 break;
2810 case 'X':
830247a4 2811 ret = reg_node(pRExC_state, CLUMP);
a0ed51b3 2812 *flagp |= HASWIDTH;
830247a4 2813 nextchar(pRExC_state);
fac92740 2814 Set_Node_Length(ret, 2); /* MJD */
a0ed51b3 2815 break;
a0d0e21e 2816 case 'w':
ffc61ed2 2817 ret = reg_node(pRExC_state, LOC ? ALNUML : ALNUM);
a0d0e21e 2818 *flagp |= HASWIDTH|SIMPLE;
830247a4 2819 nextchar(pRExC_state);
fac92740 2820 Set_Node_Length(ret, 2); /* MJD */
a0d0e21e
LW
2821 break;
2822 case 'W':
ffc61ed2 2823 ret = reg_node(pRExC_state, LOC ? NALNUML : NALNUM);
a0d0e21e 2824 *flagp |= HASWIDTH|SIMPLE;
830247a4 2825 nextchar(pRExC_state);
fac92740 2826 Set_Node_Length(ret, 2); /* MJD */
a0d0e21e
LW
2827 break;
2828 case 'b':
830247a4
IZ
2829 RExC_seen_zerolen++;
2830 RExC_seen |= REG_SEEN_LOOKBEHIND;
ffc61ed2 2831 ret = reg_node(pRExC_state, LOC ? BOUNDL : BOUND);
a0d0e21e 2832 *flagp |= SIMPLE;
830247a4 2833 nextchar(pRExC_state);
fac92740 2834 Set_Node_Length(ret, 2); /* MJD */
a0d0e21e
LW
2835 break;
2836 case 'B':
830247a4
IZ
2837 RExC_seen_zerolen++;
2838 RExC_seen |= REG_SEEN_LOOKBEHIND;
ffc61ed2 2839 ret = reg_node(pRExC_state, LOC ? NBOUNDL : NBOUND);
a0d0e21e 2840 *flagp |= SIMPLE;
830247a4 2841 nextchar(pRExC_state);
fac92740 2842 Set_Node_Length(ret, 2); /* MJD */
a0d0e21e
LW
2843 break;
2844 case 's':
ffc61ed2 2845 ret = reg_node(pRExC_state, LOC ? SPACEL : SPACE);
a0d0e21e 2846 *flagp |= HASWIDTH|SIMPLE;
830247a4 2847 nextchar(pRExC_state);
fac92740 2848 Set_Node_Length(ret, 2); /* MJD */
a0d0e21e
LW
2849 break;
2850 case 'S':
ffc61ed2 2851 ret = reg_node(pRExC_state, LOC ? NSPACEL : NSPACE);
a0d0e21e 2852 *flagp |= HASWIDTH|SIMPLE;
830247a4 2853 nextchar(pRExC_state);
fac92740 2854 Set_Node_Length(ret, 2); /* MJD */
a0d0e21e
LW
2855 break;
2856 case 'd':
ffc61ed2 2857 ret = reg_node(pRExC_state, DIGIT);
a0d0e21e 2858 *flagp |= HASWIDTH|SIMPLE;
830247a4 2859 nextchar(pRExC_state);
fac92740 2860 Set_Node_Length(ret, 2); /* MJD */
a0d0e21e
LW
2861 break;
2862 case 'D':
ffc61ed2 2863 ret = reg_node(pRExC_state, NDIGIT);
a0d0e21e 2864 *flagp |= HASWIDTH|SIMPLE;
830247a4 2865 nextchar(pRExC_state);
fac92740 2866 Set_Node_Length(ret, 2); /* MJD */
a0d0e21e 2867 break;
a14b48bc
LW
2868 case 'p':
2869 case 'P':
3568d838 2870 {
830247a4 2871 char* oldregxend = RExC_end;
fac92740 2872 char* parse_start = RExC_parse;
a14b48bc 2873
830247a4 2874 if (RExC_parse[1] == '{') {
3568d838 2875 /* a lovely hack--pretend we saw [\pX] instead */
830247a4
IZ
2876 RExC_end = strchr(RExC_parse, '}');
2877 if (!RExC_end) {
2878 RExC_parse += 2;
2879 RExC_end = oldregxend;
b45f050a
JF
2880 vFAIL("Missing right brace on \\p{}");
2881 }
830247a4 2882 RExC_end++;
a14b48bc
LW
2883 }
2884 else
830247a4
IZ
2885 RExC_end = RExC_parse + 2;
2886 RExC_parse--;
a14b48bc 2887
ffc61ed2 2888 ret = regclass(pRExC_state);
a14b48bc 2889
830247a4
IZ
2890 RExC_end = oldregxend;
2891 RExC_parse--;
fac92740 2892 Set_Node_Cur_Length(ret); /* MJD */
830247a4 2893 nextchar(pRExC_state);
a14b48bc
LW
2894 *flagp |= HASWIDTH|SIMPLE;
2895 }
2896 break;
a0d0e21e
LW
2897 case 'n':
2898 case 'r':
2899 case 't':
2900 case 'f':
2901 case 'e':
2902 case 'a':
2903 case 'x':
2904 case 'c':
2905 case '0':
2906 goto defchar;
2907 case '1': case '2': case '3': case '4':
2908 case '5': case '6': case '7': case '8': case '9':
2909 {
830247a4 2910 I32 num = atoi(RExC_parse);
a0d0e21e 2911
830247a4 2912 if (num > 9 && num >= RExC_npar)
a0d0e21e
LW
2913 goto defchar;
2914 else {
fac92740 2915 char * parse_start = RExC_parse - 1; /* MJD */
830247a4
IZ
2916 while (isDIGIT(*RExC_parse))
2917 RExC_parse++;
b45f050a 2918
830247a4 2919 if (!SIZE_ONLY && num > RExC_rx->nparens)
9baa0206 2920 vFAIL("Reference to nonexistent group");
830247a4
IZ
2921 RExC_sawback = 1;
2922 ret = reganode(pRExC_state, FOLD
a0ed51b3 2923 ? (LOC ? REFFL : REFF)
c8756f30 2924 : REF, num);
a0d0e21e 2925 *flagp |= HASWIDTH;
fac92740
MJD
2926
2927 /* override incorrect value set in reganode MJD */
2928 Set_Node_Offset(ret, parse_start+1);
2929 Set_Node_Cur_Length(ret); /* MJD */
830247a4
IZ
2930 RExC_parse--;
2931 nextchar(pRExC_state);
a0d0e21e
LW
2932 }
2933 }
2934 break;
2935 case '\0':
830247a4 2936 if (RExC_parse >= RExC_end)
b45f050a 2937 FAIL("Trailing \\");
a0d0e21e
LW
2938 /* FALL THROUGH */
2939 default:
c9f97d15
IZ
2940 /* Do not generate `unrecognized' warnings here, we fall
2941 back into the quick-grab loop below */
a0d0e21e
LW
2942 goto defchar;
2943 }
2944 break;
4633a7c4
LW
2945
2946 case '#':
830247a4
IZ
2947 if (RExC_flags16 & PMf_EXTENDED) {
2948 while (RExC_parse < RExC_end && *RExC_parse != '\n') RExC_parse++;
2949 if (RExC_parse < RExC_end)
4633a7c4
LW
2950 goto tryagain;
2951 }
2952 /* FALL THROUGH */
2953
a0d0e21e 2954 default: {
ba210ebe 2955 register STRLEN len;
a0ed51b3 2956 register UV ender;
a0d0e21e 2957 register char *p;
c277df42 2958 char *oldp, *s;
ba210ebe 2959 STRLEN numlen;
f06dbbb7
JH
2960
2961 parse_start = RExC_parse - 1;
a0d0e21e 2962
830247a4 2963 RExC_parse++;
a0d0e21e
LW
2964
2965 defchar:
830247a4 2966 ret = reg_node(pRExC_state, FOLD
a0ed51b3 2967 ? (LOC ? EXACTFL : EXACTF)
bbce6d69 2968 : EXACT);
cd439c50 2969 s = STRING(ret);
830247a4
IZ
2970 for (len = 0, p = RExC_parse - 1;
2971 len < 127 && p < RExC_end;
a0d0e21e
LW
2972 len++)
2973 {
2974 oldp = p;
5b5a24f7 2975
830247a4
IZ
2976 if (RExC_flags16 & PMf_EXTENDED)
2977 p = regwhite(p, RExC_end);
a0d0e21e
LW
2978 switch (*p) {
2979 case '^':
2980 case '$':
2981 case '.':
2982 case '[':
2983 case '(':
2984 case ')':
2985 case '|':
2986 goto loopdone;
2987 case '\\':
2988 switch (*++p) {
2989 case 'A':
2990 case 'G':
2991 case 'Z':
b85d18e9 2992 case 'z':
a0d0e21e
LW
2993 case 'w':
2994 case 'W':
2995 case 'b':
2996 case 'B':
2997 case 's':
2998 case 'S':
2999 case 'd':
3000 case 'D':
a14b48bc
LW
3001 case 'p':
3002 case 'P':
a0d0e21e
LW
3003 --p;
3004 goto loopdone;
3005 case 'n':
3006 ender = '\n';
3007 p++;
a687059c 3008 break;
a0d0e21e
LW
3009 case 'r':
3010 ender = '\r';
3011 p++;
a687059c 3012 break;
a0d0e21e
LW
3013 case 't':
3014 ender = '\t';
3015 p++;
a687059c 3016 break;
a0d0e21e
LW
3017 case 'f':
3018 ender = '\f';
3019 p++;
a687059c 3020 break;
a0d0e21e 3021 case 'e':
c7f1f016 3022 ender = ASCII_TO_NATIVE('\033');
a0d0e21e 3023 p++;
a687059c 3024 break;
a0d0e21e 3025 case 'a':
c7f1f016 3026 ender = ASCII_TO_NATIVE('\007');
a0d0e21e 3027 p++;
a687059c 3028 break;
a0d0e21e 3029 case 'x':
a0ed51b3
LW
3030 if (*++p == '{') {
3031 char* e = strchr(p, '}');
b81d288d 3032
b45f050a 3033 if (!e) {
830247a4 3034 RExC_parse = p + 1;
b45f050a
JF
3035 vFAIL("Missing right brace on \\x{}");
3036 }
de5f0749 3037 else {
53305cf1
NC
3038 I32 flags = PERL_SCAN_ALLOW_UNDERSCORES;
3039 numlen = e - p - 1;
3040 ender = grok_hex(p + 1, &numlen, &flags, NULL);
aaa80028
JH
3041 if (ender > 0xff)
3042 RExC_utf8 = 1;
b21ed0a9
GS
3043 /* numlen is generous */
3044 if (numlen + len >= 127) {
a0ed51b3
LW
3045 p--;
3046 goto loopdone;
3047 }
3048 p = e + 1;
3049 }
a0ed51b3
LW
3050 }
3051 else {
53305cf1
NC
3052 I32 flags = 0;
3053 numlen = 2;
3054 ender = grok_hex(p, &numlen, &flags, NULL);
a0ed51b3
LW
3055 p += numlen;
3056 }
a687059c 3057 break;
a0d0e21e
LW
3058 case 'c':
3059 p++;
bbce6d69 3060 ender = UCHARAT(p++);
3061 ender = toCTRL(ender);
a687059c 3062 break;
a0d0e21e
LW
3063 case '0': case '1': case '2': case '3':case '4':
3064 case '5': case '6': case '7': case '8':case '9':
3065 if (*p == '0' ||
830247a4 3066 (isDIGIT(p[1]) && atoi(p) >= RExC_npar) ) {
53305cf1
NC
3067 I32 flags = 0;
3068 numlen = 3;
3069 ender = grok_oct(p, &numlen, &flags, NULL);
a0d0e21e
LW
3070 p += numlen;
3071 }
3072 else {
3073 --p;
3074 goto loopdone;
a687059c
LW
3075 }
3076 break;
a0d0e21e 3077 case '\0':
830247a4 3078 if (p >= RExC_end)
b45f050a 3079 FAIL("Trailing \\");
a687059c 3080 /* FALL THROUGH */
a0d0e21e 3081 default:
e476b1b5 3082 if (!SIZE_ONLY && ckWARN(WARN_REGEXP) && isALPHA(*p))
b45f050a 3083 vWARN2(p +1, "Unrecognized escape \\%c passed through", *p);
a0ed51b3 3084 goto normal_default;
a0d0e21e
LW
3085 }
3086 break;
a687059c 3087 default:
a0ed51b3 3088 normal_default:
fd400ab9 3089 if (UTF8_IS_START(*p) && UTF) {
5e12f4fb 3090 ender = utf8n_to_uvchr((U8*)p, RExC_end - p,
ba210ebe 3091 &numlen, 0);
a0ed51b3
LW
3092 p += numlen;
3093 }
3094 else
3095 ender = *p++;
a0d0e21e 3096 break;
a687059c 3097 }
830247a4
IZ
3098 if (RExC_flags16 & PMf_EXTENDED)
3099 p = regwhite(p, RExC_end);
a0ed51b3
LW
3100 if (UTF && FOLD) {
3101 if (LOC)
5e12f4fb 3102 ender = toLOWER_LC_uvchr(ender);
a0ed51b3
LW
3103 else
3104 ender = toLOWER_uni(ender);
3105 }
a0d0e21e
LW
3106 if (ISMULT2(p)) { /* Back off on ?+*. */
3107 if (len)
3108 p = oldp;
2b9d42f0 3109 else if (!UNI_IS_INVARIANT(NATIVE_TO_UNI(ender)) && UTF) {
830247a4 3110 reguni(pRExC_state, ender, s, &numlen);
a0ed51b3
LW
3111 s += numlen;
3112 len += numlen;
3113 }
a0d0e21e
LW
3114 else {
3115 len++;
cd439c50 3116 REGC(ender, s++);
a0d0e21e
LW
3117 }
3118 break;
a687059c 3119 }
2b9d42f0 3120 if (!UNI_IS_INVARIANT(NATIVE_TO_UNI(ender)) && UTF) {
830247a4 3121 reguni(pRExC_state, ender, s, &numlen);
a0ed51b3
LW
3122 s += numlen;
3123 len += numlen - 1;
3124 }
3125 else
cd439c50 3126 REGC(ender, s++);
a0d0e21e
LW
3127 }
3128 loopdone:
830247a4 3129 RExC_parse = p - 1;
fac92740 3130 Set_Node_Cur_Length(ret); /* MJD */
830247a4 3131 nextchar(pRExC_state);
793db0cb
JH
3132 {
3133 /* len is STRLEN which is unsigned, need to copy to signed */
3134 IV iv = len;
3135 if (iv < 0)
3136 vFAIL("Internal disaster");
3137 }
a0d0e21e
LW
3138 if (len > 0)
3139 *flagp |= HASWIDTH;
3140 if (len == 1)
3141 *flagp |= SIMPLE;
c277df42 3142 if (!SIZE_ONLY)
cd439c50
IZ
3143 STR_LEN(ret) = len;
3144 if (SIZE_ONLY)
830247a4 3145 RExC_size += STR_SZ(len);
cd439c50 3146 else
830247a4 3147 RExC_emit += STR_SZ(len);
a687059c 3148 }
a0d0e21e
LW
3149 break;
3150 }
a687059c 3151
a0d0e21e 3152 return(ret);
a687059c
LW
3153}
3154
873ef191 3155STATIC char *
cea2e8a9 3156S_regwhite(pTHX_ char *p, char *e)
5b5a24f7
CS
3157{
3158 while (p < e) {
3159 if (isSPACE(*p))
3160 ++p;
3161 else if (*p == '#') {
3162 do {
3163 p++;
3164 } while (p < e && *p != '\n');
3165 }
3166 else
3167 break;
3168 }
3169 return p;
3170}
3171
b8c5462f
JH
3172/* Parse POSIX character classes: [[:foo:]], [[=foo=]], [[.foo.]].
3173 Character classes ([:foo:]) can also be negated ([:^foo:]).
3174 Returns a named class id (ANYOF_XXX) if successful, -1 otherwise.
3175 Equivalence classes ([=foo=]) and composites ([.foo.]) are parsed,
3176 but trigger warnings because they are currently unimplemented. */
3177STATIC I32
830247a4 3178S_regpposixcc(pTHX_ RExC_state_t *pRExC_state, I32 value)
620e46c5
JH
3179{
3180 char *posixcc = 0;
936ed897 3181 I32 namedclass = OOB_NAMEDCLASS;
620e46c5 3182
830247a4 3183 if (value == '[' && RExC_parse + 1 < RExC_end &&
620e46c5 3184 /* I smell either [: or [= or [. -- POSIX has been here, right? */
830247a4
IZ
3185 (*RExC_parse == ':' ||
3186 *RExC_parse == '=' ||
3187 *RExC_parse == '.')) {
3188 char c = *RExC_parse;
3189 char* s = RExC_parse++;
b81d288d 3190
830247a4
IZ
3191 while (RExC_parse < RExC_end && *RExC_parse != c)
3192 RExC_parse++;
3193 if (RExC_parse == RExC_end)
620e46c5 3194 /* Grandfather lone [:, [=, [. */
830247a4 3195 RExC_parse = s;
620e46c5 3196 else {
830247a4 3197 char* t = RExC_parse++; /* skip over the c */
b8c5462f 3198
830247a4
IZ
3199 if (*RExC_parse == ']') {
3200 RExC_parse++; /* skip over the ending ] */
b8c5462f
JH
3201 posixcc = s + 1;
3202 if (*s == ':') {
3203 I32 complement = *posixcc == '^' ? *posixcc++ : 0;
3204 I32 skip = 5; /* the most common skip */
3205
3206 switch (*posixcc) {
3207 case 'a':
3208 if (strnEQ(posixcc, "alnum", 5))
3209 namedclass =
3210 complement ? ANYOF_NALNUMC : ANYOF_ALNUMC;
3211 else if (strnEQ(posixcc, "alpha", 5))
3212 namedclass =
3213 complement ? ANYOF_NALPHA : ANYOF_ALPHA;
3214 else if (strnEQ(posixcc, "ascii", 5))
3215 namedclass =
3216 complement ? ANYOF_NASCII : ANYOF_ASCII;
3217 break;
aaa51d5e
JF
3218 case 'b':
3219 if (strnEQ(posixcc, "blank", 5))
3220 namedclass =
3221 complement ? ANYOF_NBLANK : ANYOF_BLANK;
3222 break;
b8c5462f
JH
3223 case 'c':
3224 if (strnEQ(posixcc, "cntrl", 5))
3225 namedclass =
3226 complement ? ANYOF_NCNTRL : ANYOF_CNTRL;
3227 break;
3228 case 'd':
3229 if (strnEQ(posixcc, "digit", 5))
3230 namedclass =
3231 complement ? ANYOF_NDIGIT : ANYOF_DIGIT;
3232 break;
3233 case 'g':
3234 if (strnEQ(posixcc, "graph", 5))
3235 namedclass =
3236 complement ? ANYOF_NGRAPH : ANYOF_GRAPH;
3237 break;
3238 case 'l':
3239 if (strnEQ(posixcc, "lower", 5))
3240 namedclass =
3241 complement ? ANYOF_NLOWER : ANYOF_LOWER;
3242 break;
3243 case 'p':
3244 if (strnEQ(posixcc, "print", 5))
3245 namedclass =
3246 complement ? ANYOF_NPRINT : ANYOF_PRINT;
3247 else if (strnEQ(posixcc, "punct", 5))
3248 namedclass =
3249 complement ? ANYOF_NPUNCT : ANYOF_PUNCT;
3250 break;
3251 case 's':
3252 if (strnEQ(posixcc, "space", 5))
3253 namedclass =
aaa51d5e 3254 complement ? ANYOF_NPSXSPC : ANYOF_PSXSPC;
cc4319de 3255 break;
b8c5462f
JH
3256 case 'u':
3257 if (strnEQ(posixcc, "upper", 5))
3258 namedclass =
3259 complement ? ANYOF_NUPPER : ANYOF_UPPER;
3260 break;
3261 case 'w': /* this is not POSIX, this is the Perl \w */
3262 if (strnEQ(posixcc, "word", 4)) {
3263 namedclass =
3264 complement ? ANYOF_NALNUM : ANYOF_ALNUM;
3265 skip = 4;
3266 }
3267 break;
3268 case 'x':
3269 if (strnEQ(posixcc, "xdigit", 6)) {
3270 namedclass =
3271 complement ? ANYOF_NXDIGIT : ANYOF_XDIGIT;
3272 skip = 6;
3273 }
3274 break;
3275 }
ac561586
JH
3276 if (namedclass == OOB_NAMEDCLASS ||
3277 posixcc[skip] != ':' ||
3278 posixcc[skip+1] != ']')
b45f050a
JF
3279 {
3280 Simple_vFAIL3("POSIX class [:%.*s:] unknown",
3281 t - s - 1, s + 1);
3282 }
3283 } else if (!SIZE_ONLY) {
b8c5462f 3284 /* [[=foo=]] and [[.foo.]] are still future. */
b45f050a 3285
830247a4 3286 /* adjust RExC_parse so the warning shows after
b45f050a 3287 the class closes */
830247a4
IZ
3288 while (*RExC_parse && *RExC_parse != ']')
3289 RExC_parse++;
b45f050a
JF
3290 Simple_vFAIL3("POSIX syntax [%c %c] is reserved for future extensions", c, c);
3291 }
b8c5462f
JH
3292 } else {
3293 /* Maternal grandfather:
3294 * "[:" ending in ":" but not in ":]" */
830247a4 3295 RExC_parse = s;
767d463e 3296 }
620e46c5
JH
3297 }
3298 }
3299
b8c5462f
JH
3300 return namedclass;
3301}
3302
3303STATIC void
830247a4 3304S_checkposixcc(pTHX_ RExC_state_t *pRExC_state)
b8c5462f 3305{
e476b1b5 3306 if (!SIZE_ONLY && ckWARN(WARN_REGEXP) &&
830247a4
IZ
3307 (*RExC_parse == ':' ||
3308 *RExC_parse == '=' ||
3309 *RExC_parse == '.')) {
3310 char *s = RExC_parse;
93733859 3311 char c = *s++;
b8c5462f
JH
3312
3313 while(*s && isALNUM(*s))
3314 s++;
3315 if (*s && c == *s && s[1] == ']') {
b45f050a
JF
3316 vWARN3(s+2, "POSIX syntax [%c %c] belongs inside character classes", c, c);
3317
3318 /* [[=foo=]] and [[.foo.]] are still future. */
b8c5462f 3319 if (c == '=' || c == '.')
b45f050a 3320 {
830247a4 3321 /* adjust RExC_parse so the error shows after
b45f050a 3322 the class closes */
830247a4 3323 while (*RExC_parse && *RExC_parse++ != ']')
b45f050a
JF
3324 ;
3325 Simple_vFAIL3("POSIX syntax [%c %c] is reserved for future extensions", c, c);
3326 }
b8c5462f
JH
3327 }
3328 }
620e46c5
JH
3329}
3330
76e3520e 3331STATIC regnode *
830247a4 3332S_regclass(pTHX_ RExC_state_t *pRExC_state)
a687059c 3333{
ffc61ed2 3334 register UV value;
3568d838 3335 register IV prevvalue = OOB_UNICODE;
ffc61ed2 3336 register IV range = 0;
c277df42 3337 register regnode *ret;
ba210ebe 3338 STRLEN numlen;
ffc61ed2 3339 IV namedclass;
9c5ffd7c 3340 char *rangebegin = 0;
936ed897 3341 bool need_class = 0;
9c5ffd7c 3342 SV *listsv = Nullsv;
ffc61ed2
JH
3343 register char *e;
3344 UV n;
3568d838 3345 bool optimize_invert = TRUE;
ffc61ed2
JH
3346
3347 ret = reganode(pRExC_state, ANYOF, 0);
3348
3349 if (!SIZE_ONLY)
3350 ANYOF_FLAGS(ret) = 0;
3351
3352 if (*RExC_parse == '^') { /* Complement of range. */
3353 RExC_naughty++;
3354 RExC_parse++;
3355 if (!SIZE_ONLY)
3356 ANYOF_FLAGS(ret) |= ANYOF_INVERT;
3357 }
a0d0e21e 3358
936ed897 3359 if (SIZE_ONLY)
830247a4 3360 RExC_size += ANYOF_SKIP;
936ed897 3361 else {
830247a4 3362 RExC_emit += ANYOF_SKIP;
936ed897
IZ
3363 if (FOLD)
3364 ANYOF_FLAGS(ret) |= ANYOF_FOLD;
3365 if (LOC)
3366 ANYOF_FLAGS(ret) |= ANYOF_LOCALE;
ffc61ed2
JH
3367 ANYOF_BITMAP_ZERO(ret);
3368 listsv = newSVpvn("# comment\n", 10);
a0d0e21e 3369 }
b8c5462f 3370
e476b1b5 3371 if (!SIZE_ONLY && ckWARN(WARN_REGEXP))
830247a4 3372 checkposixcc(pRExC_state);
b8c5462f 3373
830247a4 3374 if (*RExC_parse == ']' || *RExC_parse == '-')
ffc61ed2
JH
3375 goto charclassloop; /* allow 1st char to be ] or - */
3376
830247a4 3377 while (RExC_parse < RExC_end && *RExC_parse != ']') {
ffc61ed2
JH
3378
3379 charclassloop:
3380
3381 namedclass = OOB_NAMEDCLASS; /* initialize as illegal */
3382
73b437c8 3383 if (!range)
830247a4 3384 rangebegin = RExC_parse;
ffc61ed2 3385 if (UTF) {
5e12f4fb 3386 value = utf8n_to_uvchr((U8*)RExC_parse,
3568d838
JH
3387 RExC_end - RExC_parse,
3388 &numlen, 0);
ffc61ed2
JH
3389 RExC_parse += numlen;
3390 }
3391 else
3392 value = UCHARAT(RExC_parse++);
620e46c5 3393 if (value == '[')
830247a4 3394 namedclass = regpposixcc(pRExC_state, value);
620e46c5 3395 else if (value == '\\') {
ffc61ed2 3396 if (UTF) {
5e12f4fb 3397 value = utf8n_to_uvchr((U8*)RExC_parse,
ffc61ed2
JH
3398 RExC_end - RExC_parse,
3399 &numlen, 0);
3400 RExC_parse += numlen;
3401 }
3402 else
3403 value = UCHARAT(RExC_parse++);
470c3474 3404 /* Some compilers cannot handle switching on 64-bit integer
ffc61ed2 3405 * values, therefore value cannot be an UV. Yes, this will
e2962f66
JH
3406 * be a problem later if we want switch on Unicode.
3407 * A similar issue a little bit later when switching on
3408 * namedclass. --jhi */
ffc61ed2 3409 switch ((I32)value) {
b8c5462f
JH
3410 case 'w': namedclass = ANYOF_ALNUM; break;
3411 case 'W': namedclass = ANYOF_NALNUM; break;
3412 case 's': namedclass = ANYOF_SPACE; break;
3413 case 'S': namedclass = ANYOF_NSPACE; break;
3414 case 'd': namedclass = ANYOF_DIGIT; break;
3415 case 'D': namedclass = ANYOF_NDIGIT; break;
ffc61ed2
JH
3416 case 'p':
3417 case 'P':
3418 if (*RExC_parse == '{') {
3419 e = strchr(RExC_parse++, '}');
3420 if (!e)
3421 vFAIL("Missing right brace on \\p{}");
3422 n = e - RExC_parse;
3423 }
3424 else {
3425 e = RExC_parse;
3426 n = 1;
3427 }
3428 if (!SIZE_ONLY) {
3429 if (value == 'p')
3430 Perl_sv_catpvf(aTHX_ listsv,
3431 "+utf8::%.*s\n", (int)n, RExC_parse);
3432 else
3433 Perl_sv_catpvf(aTHX_ listsv,
3434 "!utf8::%.*s\n", (int)n, RExC_parse);
3435 }
3436 RExC_parse = e + 1;
3437 ANYOF_FLAGS(ret) |= ANYOF_UNICODE;
3438 continue;
b8c5462f
JH
3439 case 'n': value = '\n'; break;
3440 case 'r': value = '\r'; break;
3441 case 't': value = '\t'; break;
3442 case 'f': value = '\f'; break;
3443 case 'b': value = '\b'; break;
c7f1f016
NIS
3444 case 'e': value = ASCII_TO_NATIVE('\033');break;
3445 case 'a': value = ASCII_TO_NATIVE('\007');break;
b8c5462f 3446 case 'x':
ffc61ed2 3447 if (*RExC_parse == '{') {
53305cf1 3448 I32 flags = PERL_SCAN_ALLOW_UNDERSCORES;
ffc61ed2 3449 e = strchr(RExC_parse++, '}');
b81d288d 3450 if (!e)
ffc61ed2 3451 vFAIL("Missing right brace on \\x{}");
53305cf1
NC
3452
3453 numlen = e - RExC_parse;
3454 value = grok_hex(RExC_parse, &numlen, &flags, NULL);
ffc61ed2
JH
3455 RExC_parse = e + 1;
3456 }
3457 else {
53305cf1
NC
3458 I32 flags = 0;
3459 numlen = 2;
3460 value = grok_hex(RExC_parse, &numlen, &flags, NULL);
ffc61ed2
JH
3461 RExC_parse += numlen;
3462 }
b8c5462f
JH
3463 break;
3464 case 'c':
830247a4 3465 value = UCHARAT(RExC_parse++);
b8c5462f
JH
3466 value = toCTRL(value);
3467 break;
3468 case '0': case '1': case '2': case '3': case '4':
3469 case '5': case '6': case '7': case '8': case '9':
53305cf1
NC
3470 {
3471 I32 flags = 0;
3472 numlen = 3;
3473 value = grok_oct(--RExC_parse, &numlen, &flags, NULL);
830247a4 3474 RExC_parse += numlen;
b8c5462f 3475 break;
53305cf1 3476 }
1028017a 3477 default:
e476b1b5 3478 if (!SIZE_ONLY && ckWARN(WARN_REGEXP) && isALPHA(value))
ffc61ed2
JH
3479 vWARN2(RExC_parse,
3480 "Unrecognized escape \\%c in character class passed through",
3481 (int)value);
1028017a 3482 break;
b8c5462f 3483 }
ffc61ed2
JH
3484 } /* end of \blah */
3485
3486 if (namedclass > OOB_NAMEDCLASS) { /* this is a named class \blah */
3487
3488 if (!SIZE_ONLY && !need_class)
936ed897 3489 ANYOF_CLASS_ZERO(ret);
ffc61ed2 3490
936ed897 3491 need_class = 1;
ffc61ed2
JH
3492
3493 /* a bad range like a-\d, a-[:digit:] ? */
3494 if (range) {
73b437c8 3495 if (!SIZE_ONLY) {
e476b1b5 3496 if (ckWARN(WARN_REGEXP))
830247a4 3497 vWARN4(RExC_parse,
b45f050a 3498 "False [] range \"%*.*s\"",
830247a4
IZ
3499 RExC_parse - rangebegin,
3500 RExC_parse - rangebegin,
b45f050a 3501 rangebegin);
3568d838
JH
3502 if (prevvalue < 256) {
3503 ANYOF_BITMAP_SET(ret, prevvalue);
ffc61ed2
JH
3504 ANYOF_BITMAP_SET(ret, '-');
3505 }
3506 else {
3507 ANYOF_FLAGS(ret) |= ANYOF_UNICODE;
3508 Perl_sv_catpvf(aTHX_ listsv,
3568d838 3509 "%04"UVxf"\n%04"UVxf"\n", (UV)prevvalue, (UV) '-');
ffc61ed2 3510 }
b8c5462f 3511 }
ffc61ed2
JH
3512
3513 range = 0; /* this was not a true range */
73b437c8 3514 }
ffc61ed2 3515
73b437c8 3516 if (!SIZE_ONLY) {
3568d838
JH
3517 if (namedclass > OOB_NAMEDCLASS)
3518 optimize_invert = FALSE;
e2962f66
JH
3519 /* Possible truncation here but in some 64-bit environments
3520 * the compiler gets heartburn about switch on 64-bit values.
3521 * A similar issue a little earlier when switching on value.
98f323fa 3522 * --jhi */
e2962f66 3523 switch ((I32)namedclass) {
73b437c8
JH
3524 case ANYOF_ALNUM:
3525 if (LOC)
936ed897 3526 ANYOF_CLASS_SET(ret, ANYOF_ALNUM);
73b437c8
JH
3527 else {
3528 for (value = 0; value < 256; value++)
3529 if (isALNUM(value))
936ed897 3530 ANYOF_BITMAP_SET(ret, value);
73b437c8 3531 }
ffc61ed2 3532 Perl_sv_catpvf(aTHX_ listsv, "+utf8::IsWord\n");
73b437c8
JH
3533 break;
3534 case ANYOF_NALNUM:
3535 if (LOC)
936ed897 3536 ANYOF_CLASS_SET(ret, ANYOF_NALNUM);
73b437c8
JH
3537 else {
3538 for (value = 0; value < 256; value++)
3539 if (!isALNUM(value))
936ed897 3540 ANYOF_BITMAP_SET(ret, value);
73b437c8 3541 }
ffc61ed2 3542 Perl_sv_catpvf(aTHX_ listsv, "!utf8::IsWord\n");
73b437c8 3543 break;
ffc61ed2 3544 case ANYOF_ALNUMC:
73b437c8 3545 if (LOC)
ffc61ed2 3546 ANYOF_CLASS_SET(ret, ANYOF_ALNUMC);
73b437c8
JH
3547 else {
3548 for (value = 0; value < 256; value++)
ffc61ed2 3549 if (isALNUMC(value))
936ed897 3550 ANYOF_BITMAP_SET(ret, value);
73b437c8 3551 }
ffc61ed2 3552 Perl_sv_catpvf(aTHX_ listsv, "+utf8::IsAlnum\n");
73b437c8
JH
3553 break;
3554 case ANYOF_NALNUMC:
3555 if (LOC)
936ed897 3556 ANYOF_CLASS_SET(ret, ANYOF_NALNUMC);
73b437c8
JH
3557 else {
3558 for (value = 0; value < 256; value++)
3559 if (!isALNUMC(value))
936ed897 3560 ANYOF_BITMAP_SET(ret, value);
73b437c8 3561 }
ffc61ed2 3562 Perl_sv_catpvf(aTHX_ listsv, "!utf8::IsAlnum\n");
73b437c8
JH
3563 break;
3564 case ANYOF_ALPHA:
3565 if (LOC)
936ed897 3566 ANYOF_CLASS_SET(ret, ANYOF_ALPHA);
73b437c8
JH
3567 else {
3568 for (value = 0; value < 256; value++)
3569 if (isALPHA(value))
936ed897 3570 ANYOF_BITMAP_SET(ret, value);
73b437c8 3571 }
ffc61ed2 3572 Perl_sv_catpvf(aTHX_ listsv, "+utf8::IsAlpha\n");
73b437c8
JH
3573 break;
3574 case ANYOF_NALPHA:
3575 if (LOC)
936ed897 3576 ANYOF_CLASS_SET(ret, ANYOF_NALPHA);
73b437c8
JH
3577 else {
3578 for (value = 0; value < 256; value++)
3579 if (!isALPHA(value))
936ed897 3580 ANYOF_BITMAP_SET(ret, value);
73b437c8 3581 }
ffc61ed2 3582 Perl_sv_catpvf(aTHX_ listsv, "!utf8::IsAlpha\n");
73b437c8
JH
3583 break;
3584 case ANYOF_ASCII:
3585 if (LOC)
936ed897 3586 ANYOF_CLASS_SET(ret, ANYOF_ASCII);
73b437c8 3587 else {
c7f1f016 3588#ifndef EBCDIC
1ba5c669
JH
3589 for (value = 0; value < 128; value++)
3590 ANYOF_BITMAP_SET(ret, value);
3591#else /* EBCDIC */
ffbc6a93 3592 for (value = 0; value < 256; value++) {
3a3c4447
JH
3593 if (isASCII(value))
3594 ANYOF_BITMAP_SET(ret, value);
ffbc6a93 3595 }
1ba5c669 3596#endif /* EBCDIC */
73b437c8 3597 }
ffc61ed2 3598 Perl_sv_catpvf(aTHX_ listsv, "+utf8::IsASCII\n");
73b437c8
JH
3599 break;
3600 case ANYOF_NASCII:
3601 if (LOC)
936ed897 3602 ANYOF_CLASS_SET(ret, ANYOF_NASCII);
73b437c8 3603 else {
c7f1f016 3604#ifndef EBCDIC
1ba5c669
JH
3605 for (value = 128; value < 256; value++)
3606 ANYOF_BITMAP_SET(ret, value);
3607#else /* EBCDIC */
ffbc6a93 3608 for (value = 0; value < 256; value++) {
3a3c4447
JH
3609 if (!isASCII(value))
3610 ANYOF_BITMAP_SET(ret, value);
ffbc6a93 3611 }
1ba5c669 3612#endif /* EBCDIC */
73b437c8 3613 }
ffc61ed2 3614 Perl_sv_catpvf(aTHX_ listsv, "!utf8::IsASCII\n");
73b437c8 3615 break;
aaa51d5e
JF
3616 case ANYOF_BLANK:
3617 if (LOC)
3618 ANYOF_CLASS_SET(ret, ANYOF_BLANK);
3619 else {
3620 for (value = 0; value < 256; value++)
3621 if (isBLANK(value))
3622 ANYOF_BITMAP_SET(ret, value);
3623 }
ffc61ed2 3624 Perl_sv_catpvf(aTHX_ listsv, "+utf8::IsBlank\n");
aaa51d5e
JF
3625 break;
3626 case ANYOF_NBLANK:
3627 if (LOC)
3628 ANYOF_CLASS_SET(ret, ANYOF_NBLANK);
3629 else {
3630 for (value = 0; value < 256; value++)
3631 if (!isBLANK(value))
3632 ANYOF_BITMAP_SET(ret, value);
3633 }
ffc61ed2 3634 Perl_sv_catpvf(aTHX_ listsv, "!utf8::IsBlank\n");
aaa51d5e 3635 break;
73b437c8
JH
3636 case ANYOF_CNTRL:
3637 if (LOC)
936ed897 3638 ANYOF_CLASS_SET(ret, ANYOF_CNTRL);
73b437c8
JH
3639 else {
3640 for (value = 0; value < 256; value++)
3641 if (isCNTRL(value))
936ed897 3642 ANYOF_BITMAP_SET(ret, value);
73b437c8 3643 }
ffc61ed2 3644 Perl_sv_catpvf(aTHX_ listsv, "+utf8::IsCntrl\n");
73b437c8
JH
3645 break;
3646 case ANYOF_NCNTRL:
3647 if (LOC)
936ed897 3648 ANYOF_CLASS_SET(ret, ANYOF_NCNTRL);
73b437c8
JH
3649 else {
3650 for (value = 0; value < 256; value++)
3651 if (!isCNTRL(value))
936ed897 3652 ANYOF_BITMAP_SET(ret, value);
73b437c8 3653 }
ffc61ed2
JH
3654 Perl_sv_catpvf(aTHX_ listsv, "!utf8::IsCntrl\n");
3655 break;
3656 case ANYOF_DIGIT:
3657 if (LOC)
3658 ANYOF_CLASS_SET(ret, ANYOF_DIGIT);
3659 else {
3660 /* consecutive digits assumed */
3661 for (value = '0'; value <= '9'; value++)
3662 ANYOF_BITMAP_SET(ret, value);
3663 }
3664 Perl_sv_catpvf(aTHX_ listsv, "+utf8::IsDigit\n");
3665 break;
3666 case ANYOF_NDIGIT:
3667 if (LOC)
3668 ANYOF_CLASS_SET(ret, ANYOF_NDIGIT);
3669 else {
3670 /* consecutive digits assumed */
3671 for (value = 0; value < '0'; value++)
3672 ANYOF_BITMAP_SET(ret, value);
3673 for (value = '9' + 1; value < 256; value++)
3674 ANYOF_BITMAP_SET(ret, value);
3675 }
3676 Perl_sv_catpvf(aTHX_ listsv, "!utf8::IsDigit\n");
73b437c8
JH
3677 break;
3678 case ANYOF_GRAPH:
3679 if (LOC)
936ed897 3680 ANYOF_CLASS_SET(ret, ANYOF_GRAPH);
73b437c8
JH
3681 else {
3682 for (value = 0; value < 256; value++)
3683 if (isGRAPH(value))
936ed897 3684 ANYOF_BITMAP_SET(ret, value);
73b437c8 3685 }
ffc61ed2 3686 Perl_sv_catpvf(aTHX_ listsv, "+utf8::IsGraph\n");
73b437c8
JH
3687 break;
3688 case ANYOF_NGRAPH:
3689 if (LOC)
936ed897 3690 ANYOF_CLASS_SET(ret, ANYOF_NGRAPH);
73b437c8
JH
3691 else {
3692 for (value = 0; value < 256; value++)
3693 if (!isGRAPH(value))
936ed897 3694 ANYOF_BITMAP_SET(ret, value);
73b437c8 3695 }
ffc61ed2 3696 Perl_sv_catpvf(aTHX_ listsv, "!utf8::IsGraph\n");
73b437c8
JH
3697 break;
3698 case ANYOF_LOWER:
3699 if (LOC)
936ed897 3700 ANYOF_CLASS_SET(ret, ANYOF_LOWER);
73b437c8
JH
3701 else {
3702 for (value = 0; value < 256; value++)
3703 if (isLOWER(value))
936ed897 3704 ANYOF_BITMAP_SET(ret, value);
73b437c8 3705 }
ffc61ed2 3706 Perl_sv_catpvf(aTHX_ listsv, "+utf8::IsLower\n");
73b437c8
JH
3707 break;
3708 case ANYOF_NLOWER:
3709 if (LOC)
936ed897 3710 ANYOF_CLASS_SET(ret, ANYOF_NLOWER);
73b437c8
JH
3711 else {
3712 for (value = 0; value < 256; value++)
3713 if (!isLOWER(value))
936ed897 3714 ANYOF_BITMAP_SET(ret, value);
73b437c8 3715 }
ffc61ed2 3716 Perl_sv_catpvf(aTHX_ listsv, "!utf8::IsLower\n");
73b437c8
JH
3717 break;
3718 case ANYOF_PRINT:
3719 if (LOC)
936ed897 3720 ANYOF_CLASS_SET(ret, ANYOF_PRINT);
73b437c8
JH
3721 else {
3722 for (value = 0; value < 256; value++)
3723 if (isPRINT(value))
936ed897 3724 ANYOF_BITMAP_SET(ret, value);
73b437c8 3725 }
ffc61ed2 3726 Perl_sv_catpvf(aTHX_ listsv, "+utf8::IsPrint\n");
73b437c8
JH
3727 break;
3728 case ANYOF_NPRINT:
3729 if (LOC)
936ed897 3730 ANYOF_CLASS_SET(ret, ANYOF_NPRINT);
73b437c8
JH
3731 else {
3732 for (value = 0; value < 256; value++)
3733 if (!isPRINT(value))
936ed897 3734 ANYOF_BITMAP_SET(ret, value);
73b437c8 3735 }
ffc61ed2 3736 Perl_sv_catpvf(aTHX_ listsv, "!utf8::IsPrint\n");
73b437c8 3737 break;
aaa51d5e
JF
3738 case ANYOF_PSXSPC:
3739 if (LOC)
3740 ANYOF_CLASS_SET(ret, ANYOF_PSXSPC);
3741 else {
3742 for (value = 0; value < 256; value++)
3743 if (isPSXSPC(value))
3744 ANYOF_BITMAP_SET(ret, value);
3745 }
ffc61ed2 3746 Perl_sv_catpvf(aTHX_ listsv, "+utf8::IsSpace\n");
aaa51d5e
JF
3747 break;
3748 case ANYOF_NPSXSPC:
3749 if (LOC)
3750 ANYOF_CLASS_SET(ret, ANYOF_NPSXSPC);
3751 else {
3752 for (value = 0; value < 256; value++)
3753 if (!isPSXSPC(value))
3754 ANYOF_BITMAP_SET(ret, value);
3755 }
ffc61ed2 3756 Perl_sv_catpvf(aTHX_ listsv, "!utf8::IsSpace\n");
aaa51d5e 3757 break;
73b437c8
JH
3758 case ANYOF_PUNCT:
3759 if (LOC)
936ed897 3760 ANYOF_CLASS_SET(ret, ANYOF_PUNCT);
73b437c8
JH
3761 else {
3762 for (value = 0; value < 256; value++)
3763 if (isPUNCT(value))
936ed897 3764 ANYOF_BITMAP_SET(ret, value);
73b437c8 3765 }
ffc61ed2 3766 Perl_sv_catpvf(aTHX_ listsv, "+utf8::IsPunct\n");
73b437c8
JH
3767 break;
3768 case ANYOF_NPUNCT:
3769 if (LOC)
936ed897 3770 ANYOF_CLASS_SET(ret, ANYOF_NPUNCT);
73b437c8
JH
3771 else {
3772 for (value = 0; value < 256; value++)
3773 if (!isPUNCT(value))
936ed897 3774 ANYOF_BITMAP_SET(ret, value);
73b437c8 3775 }
ffc61ed2
JH
3776 Perl_sv_catpvf(aTHX_ listsv, "!utf8::IsPunct\n");
3777 break;
3778 case ANYOF_SPACE:
3779 if (LOC)
3780 ANYOF_CLASS_SET(ret, ANYOF_SPACE);
3781 else {
3782 for (value = 0; value < 256; value++)
3783 if (isSPACE(value))
3784 ANYOF_BITMAP_SET(ret, value);
3785 }
3786 Perl_sv_catpvf(aTHX_ listsv, "+utf8::IsSpacePerl\n");
3787 break;
3788 case ANYOF_NSPACE:
3789 if (LOC)
3790 ANYOF_CLASS_SET(ret, ANYOF_NSPACE);
3791 else {
3792 for (value = 0; value < 256; value++)
3793 if (!isSPACE(value))
3794 ANYOF_BITMAP_SET(ret, value);
3795 }
3796 Perl_sv_catpvf(aTHX_ listsv, "!utf8::IsSpacePerl\n");
73b437c8
JH
3797 break;
3798 case ANYOF_UPPER:
3799 if (LOC)
936ed897 3800 ANYOF_CLASS_SET(ret, ANYOF_UPPER);
73b437c8
JH
3801 else {
3802 for (value = 0; value < 256; value++)
3803 if (isUPPER(value))
936ed897 3804 ANYOF_BITMAP_SET(ret, value);
73b437c8 3805 }
ffc61ed2 3806 Perl_sv_catpvf(aTHX_ listsv, "+utf8::IsUpper\n");
73b437c8
JH
3807 break;
3808 case ANYOF_NUPPER:
3809 if (LOC)
936ed897 3810 ANYOF_CLASS_SET(ret, ANYOF_NUPPER);
73b437c8
JH
3811 else {
3812 for (value = 0; value < 256; value++)
3813 if (!isUPPER(value))
936ed897 3814 ANYOF_BITMAP_SET(ret, value);
73b437c8 3815 }
ffc61ed2 3816 Perl_sv_catpvf(aTHX_ listsv, "!utf8::IsUpper\n");
73b437c8
JH
3817 break;
3818 case ANYOF_XDIGIT:
3819 if (LOC)
936ed897 3820 ANYOF_CLASS_SET(ret, ANYOF_XDIGIT);
73b437c8
JH
3821 else {
3822 for (value = 0; value < 256; value++)
3823 if (isXDIGIT(value))
936ed897 3824 ANYOF_BITMAP_SET(ret, value);
73b437c8 3825 }
ffc61ed2 3826 Perl_sv_catpvf(aTHX_ listsv, "+utf8::IsXDigit\n");
73b437c8
JH
3827 break;
3828 case ANYOF_NXDIGIT:
3829 if (LOC)
936ed897 3830 ANYOF_CLASS_SET(ret, ANYOF_NXDIGIT);
73b437c8
JH
3831 else {
3832 for (value = 0; value < 256; value++)
3833 if (!isXDIGIT(value))
936ed897 3834 ANYOF_BITMAP_SET(ret, value);
73b437c8 3835 }
ffc61ed2 3836 Perl_sv_catpvf(aTHX_ listsv, "!utf8::IsXDigit\n");
73b437c8
JH
3837 break;
3838 default:
b45f050a 3839 vFAIL("Invalid [::] class");
73b437c8 3840 break;
b8c5462f 3841 }
b8c5462f 3842 if (LOC)
936ed897 3843 ANYOF_FLAGS(ret) |= ANYOF_CLASS;
73b437c8 3844 continue;
a0d0e21e 3845 }
ffc61ed2
JH
3846 } /* end of namedclass \blah */
3847
a0d0e21e 3848 if (range) {
3a3c4447 3849 if (prevvalue > value) /* b-a */ {
b45f050a 3850 Simple_vFAIL4("Invalid [] range \"%*.*s\"",
830247a4
IZ
3851 RExC_parse - rangebegin,
3852 RExC_parse - rangebegin,
b45f050a 3853 rangebegin);
3568d838 3854 range = 0; /* not a valid range */
73b437c8 3855 }
a0d0e21e
LW
3856 }
3857 else {
3568d838 3858 prevvalue = value; /* save the beginning of the range */
830247a4
IZ
3859 if (*RExC_parse == '-' && RExC_parse+1 < RExC_end &&
3860 RExC_parse[1] != ']') {
3861 RExC_parse++;
ffc61ed2
JH
3862
3863 /* a bad range like \w-, [:word:]- ? */
3864 if (namedclass > OOB_NAMEDCLASS) {
e476b1b5 3865 if (ckWARN(WARN_REGEXP))
830247a4 3866 vWARN4(RExC_parse,
b45f050a 3867 "False [] range \"%*.*s\"",
830247a4
IZ
3868 RExC_parse - rangebegin,
3869 RExC_parse - rangebegin,
b45f050a 3870 rangebegin);
73b437c8 3871 if (!SIZE_ONLY)
936ed897 3872 ANYOF_BITMAP_SET(ret, '-');
73b437c8 3873 } else
ffc61ed2
JH
3874 range = 1; /* yeah, it's a range! */
3875 continue; /* but do it the next time */
a0d0e21e 3876 }
a687059c 3877 }
ffc61ed2 3878
93733859 3879 /* now is the next time */
ae5c130c 3880 if (!SIZE_ONLY) {
3568d838
JH
3881 IV i;
3882
3883 if (prevvalue < 256) {
3884 IV ceilvalue = value < 256 ? value : 255;
3885
3886#ifdef EBCDIC
3a3c4447
JH
3887 if ((isLOWER(prevvalue) && isLOWER(ceilvalue)) ||
3888 (isUPPER(prevvalue) && isUPPER(ceilvalue)))
ffc61ed2 3889 {
3568d838
JH
3890 if (isLOWER(prevvalue)) {
3891 for (i = prevvalue; i <= ceilvalue; i++)
ffc61ed2
JH
3892 if (isLOWER(i))
3893 ANYOF_BITMAP_SET(ret, i);
3894 } else {
3568d838 3895 for (i = prevvalue; i <= ceilvalue; i++)
ffc61ed2
JH
3896 if (isUPPER(i))
3897 ANYOF_BITMAP_SET(ret, i);
3898 }
8ada0baa 3899 }
ffc61ed2 3900 else
8ada0baa 3901#endif
3568d838
JH
3902 for (i = prevvalue; i <= ceilvalue; i++)
3903 ANYOF_BITMAP_SET(ret, i);
3904 }
3905 if (value > 255) {
ffc61ed2 3906 ANYOF_FLAGS(ret) |= ANYOF_UNICODE;
3568d838 3907 if (prevvalue < value)
ffc61ed2 3908 Perl_sv_catpvf(aTHX_ listsv, "%04"UVxf"\t%04"UVxf"\n",
3568d838
JH
3909 (UV)prevvalue, (UV)value);
3910 else if (prevvalue == value)
ffc61ed2
JH
3911 Perl_sv_catpvf(aTHX_ listsv, "%04"UVxf"\n",
3912 (UV)value);
3913 }
8ada0baa 3914 }
ffc61ed2
JH
3915
3916 range = 0; /* this range (if it was one) is done now */
a0d0e21e 3917 }
ffc61ed2 3918
936ed897 3919 if (need_class) {
4f66b38d 3920 ANYOF_FLAGS(ret) |= ANYOF_LARGE;
936ed897 3921 if (SIZE_ONLY)
830247a4 3922 RExC_size += ANYOF_CLASS_ADD_SKIP;
936ed897 3923 else
830247a4 3924 RExC_emit += ANYOF_CLASS_ADD_SKIP;
936ed897 3925 }
ffc61ed2 3926
ae5c130c 3927 /* optimize case-insensitive simple patterns (e.g. /[a-z]/i) */
b8c5462f 3928 if (!SIZE_ONLY &&
ffc61ed2 3929 /* If the only flag is folding (plus possibly inversion). */
516a5887
JH
3930 ((ANYOF_FLAGS(ret) & (ANYOF_FLAGS_ALL ^ ANYOF_INVERT)) == ANYOF_FOLD)
3931 ) {
a0ed51b3 3932 for (value = 0; value < 256; ++value) {
936ed897 3933 if (ANYOF_BITMAP_TEST(ret, value)) {
ffc61ed2
JH
3934 IV fold = PL_fold[value];
3935
3936 if (fold != value)
3937 ANYOF_BITMAP_SET(ret, fold);
ae5c130c
GS
3938 }
3939 }
936ed897 3940 ANYOF_FLAGS(ret) &= ~ANYOF_FOLD;
ae5c130c 3941 }
ffc61ed2 3942
ae5c130c 3943 /* optimize inverted simple patterns (e.g. [^a-z]) */
3568d838 3944 if (!SIZE_ONLY && optimize_invert &&
ffc61ed2
JH
3945 /* If the only flag is inversion. */
3946 (ANYOF_FLAGS(ret) & ANYOF_FLAGS_ALL) == ANYOF_INVERT) {
b8c5462f 3947 for (value = 0; value < ANYOF_BITMAP_SIZE; ++value)
936ed897 3948 ANYOF_BITMAP(ret)[value] ^= ANYOF_FLAGS_ALL;
1aa99e6b 3949 ANYOF_FLAGS(ret) = ANYOF_UNICODE_ALL;
ae5c130c 3950 }
a0d0e21e 3951
b81d288d 3952 if (!SIZE_ONLY) {
fde631ed 3953 AV *av = newAV();
ffc61ed2
JH
3954 SV *rv;
3955
3956 av_store(av, 0, listsv);
3957 av_store(av, 1, NULL);
3958 rv = newRV_noinc((SV*)av);
19860706 3959 n = add_data(pRExC_state, 1, "s");
830247a4 3960 RExC_rx->data->data[n] = (void*)rv;
ffc61ed2 3961 ARG_SET(ret, n);
a0ed51b3
LW
3962 }
3963
3964 return ret;
3965}
3966
76e3520e 3967STATIC char*
830247a4 3968S_nextchar(pTHX_ RExC_state_t *pRExC_state)
a0d0e21e 3969{
830247a4 3970 char* retval = RExC_parse++;
a0d0e21e 3971
4633a7c4 3972 for (;;) {
830247a4
IZ
3973 if (*RExC_parse == '(' && RExC_parse[1] == '?' &&
3974 RExC_parse[2] == '#') {
3975 while (*RExC_parse && *RExC_parse != ')')
3976 RExC_parse++;
3977 RExC_parse++;
4633a7c4
LW
3978 continue;
3979 }
830247a4
IZ
3980 if (RExC_flags16 & PMf_EXTENDED) {
3981 if (isSPACE(*RExC_parse)) {
3982 RExC_parse++;
748a9306
LW
3983 continue;
3984 }
830247a4
IZ
3985 else if (*RExC_parse == '#') {
3986 while (*RExC_parse && *RExC_parse != '\n')
3987 RExC_parse++;
3988 RExC_parse++;
748a9306
LW
3989 continue;
3990 }
748a9306 3991 }
4633a7c4 3992 return retval;
a0d0e21e 3993 }
a687059c
LW
3994}
3995
3996/*
c277df42 3997- reg_node - emit a node
a0d0e21e 3998*/
76e3520e 3999STATIC regnode * /* Location. */
830247a4 4000S_reg_node(pTHX_ RExC_state_t *pRExC_state, U8 op)
a687059c 4001{
c277df42
IZ
4002 register regnode *ret;
4003 register regnode *ptr;
a687059c 4004
830247a4 4005 ret = RExC_emit;
c277df42 4006 if (SIZE_ONLY) {
830247a4
IZ
4007 SIZE_ALIGN(RExC_size);
4008 RExC_size += 1;
a0d0e21e
LW
4009 return(ret);
4010 }
a687059c 4011
c277df42 4012 NODE_ALIGN_FILL(ret);
a0d0e21e 4013 ptr = ret;
c277df42 4014 FILL_ADVANCE_NODE(ptr, op);
fac92740
MJD
4015 if (RExC_offsets) { /* MJD */
4016 MJD_OFFSET_DEBUG((stderr, "%s:%u: (op %s) %s %u <- %u (len %u) (max %u).\n",
4017 "reg_node", __LINE__,
4018 reg_name[op],
4019 RExC_emit - RExC_emit_start > RExC_offsets[0]
4020 ? "Overwriting end of array!\n" : "OK",
4021 RExC_emit - RExC_emit_start,
4022 RExC_parse - RExC_start,
4023 RExC_offsets[0]));
4024 Set_Node_Offset(RExC_emit, RExC_parse + (op == END));
4025 }
4026
830247a4 4027 RExC_emit = ptr;
a687059c 4028
a0d0e21e 4029 return(ret);
a687059c
LW
4030}
4031
4032/*
a0d0e21e
LW
4033- reganode - emit a node with an argument
4034*/
76e3520e 4035STATIC regnode * /* Location. */
830247a4 4036S_reganode(pTHX_ RExC_state_t *pRExC_state, U8 op, U32 arg)
fe14fcc3 4037{
c277df42
IZ
4038 register regnode *ret;
4039 register regnode *ptr;
fe14fcc3 4040
830247a4 4041 ret = RExC_emit;
c277df42 4042 if (SIZE_ONLY) {
830247a4
IZ
4043 SIZE_ALIGN(RExC_size);
4044 RExC_size += 2;
a0d0e21e
LW
4045 return(ret);
4046 }
fe14fcc3 4047
c277df42 4048 NODE_ALIGN_FILL(ret);
a0d0e21e 4049 ptr = ret;
c277df42 4050 FILL_ADVANCE_NODE_ARG(ptr, op, arg);
fac92740
MJD
4051 if (RExC_offsets) { /* MJD */
4052 MJD_OFFSET_DEBUG((stderr, "%s: %s %u <- %u (max %u).\n",
4053 "reganode",
4054 RExC_emit - RExC_emit_start > RExC_offsets[0] ?
4055 "Overwriting end of array!\n" : "OK",
4056 RExC_emit - RExC_emit_start,
4057 RExC_parse - RExC_start,
4058 RExC_offsets[0]));
4059 Set_Cur_Node_Offset;
4060 }
4061
830247a4 4062 RExC_emit = ptr;
fe14fcc3 4063
a0d0e21e 4064 return(ret);
fe14fcc3
LW
4065}
4066
4067/*
cd439c50 4068- reguni - emit (if appropriate) a Unicode character
a0ed51b3
LW
4069*/
4070STATIC void
830247a4 4071S_reguni(pTHX_ RExC_state_t *pRExC_state, UV uv, char* s, STRLEN* lenp)
a0ed51b3 4072{
5e12f4fb 4073 *lenp = SIZE_ONLY ? UNISKIP(uv) : (uvchr_to_utf8((U8*)s, uv) - (U8*)s);
a0ed51b3
LW
4074}
4075
4076/*
a0d0e21e
LW
4077- reginsert - insert an operator in front of already-emitted operand
4078*
4079* Means relocating the operand.
4080*/
76e3520e 4081STATIC void
830247a4 4082S_reginsert(pTHX_ RExC_state_t *pRExC_state, U8 op, regnode *opnd)
a687059c 4083{
c277df42
IZ
4084 register regnode *src;
4085 register regnode *dst;
4086 register regnode *place;
4087 register int offset = regarglen[(U8)op];
b81d288d 4088
22c35a8c 4089/* (PL_regkind[(U8)op] == CURLY ? EXTRA_STEP_2ARGS : 0); */
c277df42
IZ
4090
4091 if (SIZE_ONLY) {
830247a4 4092 RExC_size += NODE_STEP_REGNODE + offset;
a0d0e21e
LW
4093 return;
4094 }
a687059c 4095
830247a4
IZ
4096 src = RExC_emit;
4097 RExC_emit += NODE_STEP_REGNODE + offset;
4098 dst = RExC_emit;
fac92740 4099 while (src > opnd) {
c277df42 4100 StructCopy(--src, --dst, regnode);
fac92740
MJD
4101 if (RExC_offsets) { /* MJD 20010112 */
4102 MJD_OFFSET_DEBUG((stderr, "%s: %s copy %u -> %u (max %u).\n",
4103 "reg_insert",
4104 dst - RExC_emit_start > RExC_offsets[0]
4105 ? "Overwriting end of array!\n" : "OK",
4106 src - RExC_emit_start,
4107 dst - RExC_emit_start,
4108 RExC_offsets[0]));
4109 Set_Node_Offset_To_R(dst-RExC_emit_start, Node_Offset(src));
4110 Set_Node_Length_To_R(dst-RExC_emit_start, Node_Length(src));
4111 }
4112 }
4113
a0d0e21e
LW
4114
4115 place = opnd; /* Op node, where operand used to be. */
fac92740
MJD
4116 if (RExC_offsets) { /* MJD */
4117 MJD_OFFSET_DEBUG((stderr, "%s: %s %u <- %u (max %u).\n",
4118 "reginsert",
4119 place - RExC_emit_start > RExC_offsets[0]
4120 ? "Overwriting end of array!\n" : "OK",
4121 place - RExC_emit_start,
4122 RExC_parse - RExC_start,
4123 RExC_offsets[0]));
4124 Set_Node_Offset(place, RExC_parse);
4125 }
c277df42
IZ
4126 src = NEXTOPER(place);
4127 FILL_ADVANCE_NODE(place, op);
4128 Zero(src, offset, regnode);
a687059c
LW
4129}
4130
4131/*
c277df42 4132- regtail - set the next-pointer at the end of a node chain of p to val.
a0d0e21e 4133*/
76e3520e 4134STATIC void
830247a4 4135S_regtail(pTHX_ RExC_state_t *pRExC_state, regnode *p, regnode *val)
a687059c 4136{
c277df42
IZ
4137 register regnode *scan;
4138 register regnode *temp;
a0d0e21e 4139
c277df42 4140 if (SIZE_ONLY)
a0d0e21e
LW
4141 return;
4142
4143 /* Find last node. */
4144 scan = p;
4145 for (;;) {
4146 temp = regnext(scan);
4147 if (temp == NULL)
4148 break;
4149 scan = temp;
4150 }
a687059c 4151
c277df42
IZ
4152 if (reg_off_by_arg[OP(scan)]) {
4153 ARG_SET(scan, val - scan);
a0ed51b3
LW
4154 }
4155 else {
c277df42
IZ
4156 NEXT_OFF(scan) = val - scan;
4157 }
a687059c
LW
4158}
4159
4160/*
a0d0e21e
LW
4161- regoptail - regtail on operand of first argument; nop if operandless
4162*/
76e3520e 4163STATIC void
830247a4 4164S_regoptail(pTHX_ RExC_state_t *pRExC_state, regnode *p, regnode *val)
a687059c 4165{
a0d0e21e 4166 /* "Operandless" and "op != BRANCH" are synonymous in practice. */
c277df42
IZ
4167 if (p == NULL || SIZE_ONLY)
4168 return;
22c35a8c 4169 if (PL_regkind[(U8)OP(p)] == BRANCH) {
830247a4 4170 regtail(pRExC_state, NEXTOPER(p), val);
a0ed51b3 4171 }
22c35a8c 4172 else if ( PL_regkind[(U8)OP(p)] == BRANCHJ) {
830247a4 4173 regtail(pRExC_state, NEXTOPER(NEXTOPER(p)), val);
a0ed51b3
LW
4174 }
4175 else
a0d0e21e 4176 return;
a687059c
LW
4177}
4178
4179/*
4180 - regcurly - a little FSA that accepts {\d+,?\d*}
4181 */
79072805 4182STATIC I32
cea2e8a9 4183S_regcurly(pTHX_ register char *s)
a687059c
LW
4184{
4185 if (*s++ != '{')
4186 return FALSE;
f0fcb552 4187 if (!isDIGIT(*s))
a687059c 4188 return FALSE;
f0fcb552 4189 while (isDIGIT(*s))
a687059c
LW
4190 s++;
4191 if (*s == ',')
4192 s++;
f0fcb552 4193 while (isDIGIT(*s))
a687059c
LW
4194 s++;
4195 if (*s != '}')
4196 return FALSE;
4197 return TRUE;
4198}
4199
a687059c 4200
8fa7f367
JH
4201#ifdef DEBUGGING
4202
76e3520e 4203STATIC regnode *
cea2e8a9 4204S_dumpuntil(pTHX_ regnode *start, regnode *node, regnode *last, SV* sv, I32 l)
c277df42 4205{
f248d071 4206 register U8 op = EXACT; /* Arbitrary non-END op. */
155aba94 4207 register regnode *next;
c277df42
IZ
4208
4209 while (op != END && (!last || node < last)) {
4210 /* While that wasn't END last time... */
4211
4212 NODE_ALIGN(node);
4213 op = OP(node);
4214 if (op == CLOSE)
4215 l--;
4216 next = regnext(node);
4217 /* Where, what. */
4218 if (OP(node) == OPTIMIZED)
4219 goto after_print;
4220 regprop(sv, node);
b900a521 4221 PerlIO_printf(Perl_debug_log, "%4"IVdf":%*s%s", (IV)(node - start),
f1dbda3d 4222 (int)(2*l + 1), "", SvPVX(sv));
c277df42
IZ
4223 if (next == NULL) /* Next ptr. */
4224 PerlIO_printf(Perl_debug_log, "(0)");
b81d288d 4225 else
b900a521 4226 PerlIO_printf(Perl_debug_log, "(%"IVdf")", (IV)(next - start));
c277df42
IZ
4227 (void)PerlIO_putc(Perl_debug_log, '\n');
4228 after_print:
22c35a8c 4229 if (PL_regkind[(U8)op] == BRANCHJ) {
b81d288d
AB
4230 register regnode *nnode = (OP(next) == LONGJMP
4231 ? regnext(next)
c277df42
IZ
4232 : next);
4233 if (last && nnode > last)
4234 nnode = last;
4235 node = dumpuntil(start, NEXTOPER(NEXTOPER(node)), nnode, sv, l + 1);
a0ed51b3 4236 }
22c35a8c 4237 else if (PL_regkind[(U8)op] == BRANCH) {
c277df42 4238 node = dumpuntil(start, NEXTOPER(node), next, sv, l + 1);
a0ed51b3
LW
4239 }
4240 else if ( op == CURLY) { /* `next' might be very big: optimizer */
c277df42
IZ
4241 node = dumpuntil(start, NEXTOPER(node) + EXTRA_STEP_2ARGS,
4242 NEXTOPER(node) + EXTRA_STEP_2ARGS + 1, sv, l + 1);
a0ed51b3 4243 }
22c35a8c 4244 else if (PL_regkind[(U8)op] == CURLY && op != CURLYX) {
c277df42
IZ
4245 node = dumpuntil(start, NEXTOPER(node) + EXTRA_STEP_2ARGS,
4246 next, sv, l + 1);
a0ed51b3
LW
4247 }
4248 else if ( op == PLUS || op == STAR) {
c277df42 4249 node = dumpuntil(start, NEXTOPER(node), NEXTOPER(node) + 1, sv, l + 1);
a0ed51b3
LW
4250 }
4251 else if (op == ANYOF) {
4f66b38d
HS
4252 /* arglen 1 + class block */
4253 node += 1 + ((ANYOF_FLAGS(node) & ANYOF_LARGE)
4254 ? ANYOF_CLASS_SKIP : ANYOF_SKIP);
4255 node = NEXTOPER(node);
a0ed51b3 4256 }
22c35a8c 4257 else if (PL_regkind[(U8)op] == EXACT) {
c277df42 4258 /* Literal string, where present. */
cd439c50 4259 node += NODE_SZ_STR(node) - 1;
c277df42 4260 node = NEXTOPER(node);
a0ed51b3
LW
4261 }
4262 else {
c277df42
IZ
4263 node = NEXTOPER(node);
4264 node += regarglen[(U8)op];
4265 }
4266 if (op == CURLYX || op == OPEN)
4267 l++;
4268 else if (op == WHILEM)
4269 l--;
4270 }
4271 return node;
4272}
4273
8fa7f367
JH
4274#endif /* DEBUGGING */
4275
a687059c 4276/*
fd181c75 4277 - regdump - dump a regexp onto Perl_debug_log in vaguely comprehensible form
a687059c
LW
4278 */
4279void
864dbfa3 4280Perl_regdump(pTHX_ regexp *r)
a687059c 4281{
35ff7856 4282#ifdef DEBUGGING
46fc3d4c 4283 SV *sv = sv_newmortal();
a687059c 4284
c277df42 4285 (void)dumpuntil(r->program, r->program + 1, NULL, sv, 0);
a0d0e21e
LW
4286
4287 /* Header fields of interest. */
c277df42 4288 if (r->anchored_substr)
7b0972df 4289 PerlIO_printf(Perl_debug_log,
b81d288d 4290 "anchored `%s%.*s%s'%s at %"IVdf" ",
3280af22 4291 PL_colors[0],
7b0972df 4292 (int)(SvCUR(r->anchored_substr) - (SvTAIL(r->anchored_substr)!=0)),
b81d288d 4293 SvPVX(r->anchored_substr),
3280af22 4294 PL_colors[1],
c277df42 4295 SvTAIL(r->anchored_substr) ? "$" : "",
7b0972df 4296 (IV)r->anchored_offset);
c277df42 4297 if (r->float_substr)
7b0972df 4298 PerlIO_printf(Perl_debug_log,
b81d288d 4299 "floating `%s%.*s%s'%s at %"IVdf"..%"UVuf" ",
3280af22 4300 PL_colors[0],
b81d288d 4301 (int)(SvCUR(r->float_substr) - (SvTAIL(r->float_substr)!=0)),
2c2d71f5 4302 SvPVX(r->float_substr),
3280af22 4303 PL_colors[1],
c277df42 4304 SvTAIL(r->float_substr) ? "$" : "",
7b0972df 4305 (IV)r->float_min_offset, (UV)r->float_max_offset);
c277df42 4306 if (r->check_substr)
b81d288d
AB
4307 PerlIO_printf(Perl_debug_log,
4308 r->check_substr == r->float_substr
c277df42
IZ
4309 ? "(checking floating" : "(checking anchored");
4310 if (r->reganch & ROPT_NOSCAN)
4311 PerlIO_printf(Perl_debug_log, " noscan");
4312 if (r->reganch & ROPT_CHECK_ALL)
4313 PerlIO_printf(Perl_debug_log, " isall");
4314 if (r->check_substr)
4315 PerlIO_printf(Perl_debug_log, ") ");
4316
46fc3d4c 4317 if (r->regstclass) {
4318 regprop(sv, r->regstclass);
4319 PerlIO_printf(Perl_debug_log, "stclass `%s' ", SvPVX(sv));
4320 }
774d564b 4321 if (r->reganch & ROPT_ANCH) {
4322 PerlIO_printf(Perl_debug_log, "anchored");
4323 if (r->reganch & ROPT_ANCH_BOL)
4324 PerlIO_printf(Perl_debug_log, "(BOL)");
c277df42
IZ
4325 if (r->reganch & ROPT_ANCH_MBOL)
4326 PerlIO_printf(Perl_debug_log, "(MBOL)");
cad2e5aa
JH
4327 if (r->reganch & ROPT_ANCH_SBOL)
4328 PerlIO_printf(Perl_debug_log, "(SBOL)");
774d564b 4329 if (r->reganch & ROPT_ANCH_GPOS)
4330 PerlIO_printf(Perl_debug_log, "(GPOS)");
4331 PerlIO_putc(Perl_debug_log, ' ');
4332 }
c277df42
IZ
4333 if (r->reganch & ROPT_GPOS_SEEN)
4334 PerlIO_printf(Perl_debug_log, "GPOS ");
a0d0e21e 4335 if (r->reganch & ROPT_SKIP)
760ac839 4336 PerlIO_printf(Perl_debug_log, "plus ");
a0d0e21e 4337 if (r->reganch & ROPT_IMPLICIT)
760ac839 4338 PerlIO_printf(Perl_debug_log, "implicit ");
760ac839 4339 PerlIO_printf(Perl_debug_log, "minlen %ld ", (long) r->minlen);
ce862d02
IZ
4340 if (r->reganch & ROPT_EVAL_SEEN)
4341 PerlIO_printf(Perl_debug_log, "with eval ");
760ac839 4342 PerlIO_printf(Perl_debug_log, "\n");
fac92740
MJD
4343 if (r->offsets) {
4344 U32 i;
4345 U32 len = r->offsets[0];
392fbf5d 4346 PerlIO_printf(Perl_debug_log, "Offsets: [%"UVuf"]\n\t", (UV)r->offsets[0]);
fac92740 4347 for (i = 1; i <= len; i++)
392fbf5d
RB
4348 PerlIO_printf(Perl_debug_log, "%"UVuf"[%"UVuf"] ",
4349 (UV)r->offsets[i*2-1],
4350 (UV)r->offsets[i*2]);
fac92740
MJD
4351 PerlIO_printf(Perl_debug_log, "\n");
4352 }
17c3b450 4353#endif /* DEBUGGING */
a687059c
LW
4354}
4355
8fa7f367
JH
4356#ifdef DEBUGGING
4357
653099ff
GS
4358STATIC void
4359S_put_byte(pTHX_ SV *sv, int c)
4360{
7be5a6cf 4361 if (isCNTRL(c) || c == 255 || !isPRINT(c))
653099ff
GS
4362 Perl_sv_catpvf(aTHX_ sv, "\\%o", c);
4363 else if (c == '-' || c == ']' || c == '\\' || c == '^')
4364 Perl_sv_catpvf(aTHX_ sv, "\\%c", c);
4365 else
4366 Perl_sv_catpvf(aTHX_ sv, "%c", c);
4367}
4368
8fa7f367
JH
4369#endif /* DEBUGGING */
4370
a687059c 4371/*
a0d0e21e
LW
4372- regprop - printable representation of opcode
4373*/
46fc3d4c 4374void
864dbfa3 4375Perl_regprop(pTHX_ SV *sv, regnode *o)
a687059c 4376{
35ff7856 4377#ifdef DEBUGGING
9b155405 4378 register int k;
a0d0e21e 4379
54dc92de 4380 sv_setpvn(sv, "", 0);
9b155405 4381 if (OP(o) >= reg_num) /* regnode.type is unsigned */
830247a4
IZ
4382 /* It would be nice to FAIL() here, but this may be called from
4383 regexec.c, and it would be hard to supply pRExC_state. */
4384 Perl_croak(aTHX_ "Corrupted regexp opcode");
9b155405
IZ
4385 sv_catpv(sv, (char*)reg_name[OP(o)]); /* Take off const! */
4386
4387 k = PL_regkind[(U8)OP(o)];
4388
4389 if (k == EXACT)
7821416a 4390 Perl_sv_catpvf(aTHX_ sv, " <%s%.*s%s>", PL_colors[0],
cd439c50 4391 STR_LEN(o), STRING(o), PL_colors[1]);
9b155405 4392 else if (k == CURLY) {
cb434fcc 4393 if (OP(o) == CURLYM || OP(o) == CURLYN || OP(o) == CURLYX)
cea2e8a9
GS
4394 Perl_sv_catpvf(aTHX_ sv, "[%d]", o->flags); /* Parenth number */
4395 Perl_sv_catpvf(aTHX_ sv, " {%d,%d}", ARG1(o), ARG2(o));
a0d0e21e 4396 }
2c2d71f5
JH
4397 else if (k == WHILEM && o->flags) /* Ordinal/of */
4398 Perl_sv_catpvf(aTHX_ sv, "[%d/%d]", o->flags & 0xf, o->flags>>4);
9b155405 4399 else if (k == REF || k == OPEN || k == CLOSE || k == GROUPP )
894356b3 4400 Perl_sv_catpvf(aTHX_ sv, "%d", (int)ARG(o)); /* Parenth number */
9b155405 4401 else if (k == LOGICAL)
04ebc1ab 4402 Perl_sv_catpvf(aTHX_ sv, "[%d]", o->flags); /* 2: embedded, otherwise 1 */
653099ff
GS
4403 else if (k == ANYOF) {
4404 int i, rangestart = -1;
ffc61ed2 4405 U8 flags = ANYOF_FLAGS(o);
19860706
JH
4406 const char * const anyofs[] = { /* Should be syncronized with
4407 * ANYOF_ #xdefines in regcomp.h */
653099ff
GS
4408 "\\w",
4409 "\\W",
4410 "\\s",
4411 "\\S",
4412 "\\d",
4413 "\\D",
4414 "[:alnum:]",
4415 "[:^alnum:]",
4416 "[:alpha:]",
4417 "[:^alpha:]",
4418 "[:ascii:]",
4419 "[:^ascii:]",
4420 "[:ctrl:]",
4421 "[:^ctrl:]",
4422 "[:graph:]",
4423 "[:^graph:]",
4424 "[:lower:]",
4425 "[:^lower:]",
4426 "[:print:]",
4427 "[:^print:]",
4428 "[:punct:]",
4429 "[:^punct:]",
4430 "[:upper:]",
aaa51d5e 4431 "[:^upper:]",
653099ff 4432 "[:xdigit:]",
aaa51d5e
JF
4433 "[:^xdigit:]",
4434 "[:space:]",
4435 "[:^space:]",
4436 "[:blank:]",
4437 "[:^blank:]"
653099ff
GS
4438 };
4439
19860706 4440 if (flags & ANYOF_LOCALE)
653099ff 4441 sv_catpv(sv, "{loc}");
19860706 4442 if (flags & ANYOF_FOLD)
653099ff
GS
4443 sv_catpv(sv, "{i}");
4444 Perl_sv_catpvf(aTHX_ sv, "[%s", PL_colors[0]);
19860706 4445 if (flags & ANYOF_INVERT)
653099ff 4446 sv_catpv(sv, "^");
ffc61ed2
JH
4447 for (i = 0; i <= 256; i++) {
4448 if (i < 256 && ANYOF_BITMAP_TEST(o,i)) {
4449 if (rangestart == -1)
4450 rangestart = i;
4451 } else if (rangestart != -1) {
4452 if (i <= rangestart + 3)
4453 for (; rangestart < i; rangestart++)
653099ff 4454 put_byte(sv, rangestart);
ffc61ed2
JH
4455 else {
4456 put_byte(sv, rangestart);
4457 sv_catpv(sv, "-");
4458 put_byte(sv, i - 1);
653099ff 4459 }
ffc61ed2 4460 rangestart = -1;
653099ff 4461 }
847a199f 4462 }
ffc61ed2
JH
4463
4464 if (o->flags & ANYOF_CLASS)
4465 for (i = 0; i < sizeof(anyofs)/sizeof(char*); i++)
4466 if (ANYOF_CLASS_TEST(o,i))
4467 sv_catpv(sv, anyofs[i]);
4468
4469 if (flags & ANYOF_UNICODE)
4470 sv_catpv(sv, "{unicode}");
1aa99e6b
IH
4471 else if (flags & ANYOF_UNICODE_ALL)
4472 sv_catpv(sv, "{all-unicode}");
ffc61ed2
JH
4473
4474 {
4475 SV *lv;
4476 SV *sw = regclass_swash(o, FALSE, &lv);
b81d288d 4477
ffc61ed2
JH
4478 if (lv) {
4479 if (sw) {
4480 UV i;
4481 U8 s[UTF8_MAXLEN+1];
b81d288d 4482
ffc61ed2 4483 for (i = 0; i <= 256; i++) { /* just the first 256 */
2b9d42f0 4484 U8 *e = uvchr_to_utf8(s, i);
ffc61ed2 4485
3568d838 4486 if (i < 256 && swash_fetch(sw, s, TRUE)) {
ffc61ed2
JH
4487 if (rangestart == -1)
4488 rangestart = i;
4489 } else if (rangestart != -1) {
4490 U8 *p;
b81d288d 4491
ffc61ed2
JH
4492 if (i <= rangestart + 3)
4493 for (; rangestart < i; rangestart++) {
2b9d42f0 4494 for(e = uvchr_to_utf8(s, rangestart), p = s; p < e; p++)
ffc61ed2
JH
4495 put_byte(sv, *p);
4496 }
4497 else {
2b9d42f0 4498 for (e = uvchr_to_utf8(s, rangestart), p = s; p < e; p++)
ffc61ed2
JH
4499 put_byte(sv, *p);
4500 sv_catpv(sv, "-");
2b9d42f0 4501 for (e = uvchr_to_utf8(s, i - 1), p = s; p < e; p++)
ffc61ed2
JH
4502 put_byte(sv, *p);
4503 }
4504 rangestart = -1;
4505 }
19860706 4506 }
ffc61ed2
JH
4507
4508 sv_catpv(sv, "..."); /* et cetera */
19860706 4509 }
fde631ed 4510
ffc61ed2
JH
4511 {
4512 char *s = savepv(SvPVX(lv));
4513 char *origs = s;
b81d288d 4514
ffc61ed2 4515 while(*s && *s != '\n') s++;
b81d288d 4516
ffc61ed2
JH
4517 if (*s == '\n') {
4518 char *t = ++s;
4519
4520 while (*s) {
4521 if (*s == '\n')
4522 *s = ' ';
4523 s++;
4524 }
4525 if (s[-1] == ' ')
4526 s[-1] = 0;
4527
4528 sv_catpv(sv, t);
fde631ed 4529 }
b81d288d 4530
ffc61ed2 4531 Safefree(origs);
fde631ed
JH
4532 }
4533 }
653099ff 4534 }
ffc61ed2 4535
653099ff
GS
4536 Perl_sv_catpvf(aTHX_ sv, "%s]", PL_colors[1]);
4537 }
9b155405 4538 else if (k == BRANCHJ && (OP(o) == UNLESSM || OP(o) == IFMATCH))
cea2e8a9 4539 Perl_sv_catpvf(aTHX_ sv, "[-%d]", o->flags);
17c3b450 4540#endif /* DEBUGGING */
35ff7856 4541}
a687059c 4542
cad2e5aa
JH
4543SV *
4544Perl_re_intuit_string(pTHX_ regexp *prog)
4545{ /* Assume that RE_INTUIT is set */
4546 DEBUG_r(
4547 { STRLEN n_a;
4548 char *s = SvPV(prog->check_substr,n_a);
4549
4550 if (!PL_colorset) reginitcolors();
4551 PerlIO_printf(Perl_debug_log,
4552 "%sUsing REx substr:%s `%s%.60s%s%s'\n",
4553 PL_colors[4],PL_colors[5],PL_colors[0],
4554 s,
4555 PL_colors[1],
4556 (strlen(s) > 60 ? "..." : ""));
4557 } );
4558
4559 return prog->check_substr;
4560}
4561
2b69d0c2 4562void
864dbfa3 4563Perl_pregfree(pTHX_ struct regexp *r)
a687059c 4564{
adac82c7 4565 DEBUG_r(if (!PL_colorset) reginitcolors());
7821416a
IZ
4566
4567 if (!r || (--r->refcnt > 0))
4568 return;
cad2e5aa
JH
4569 DEBUG_r(PerlIO_printf(Perl_debug_log,
4570 "%sFreeing REx:%s `%s%.60s%s%s'\n",
4571 PL_colors[4],PL_colors[5],PL_colors[0],
4572 r->precomp,
4573 PL_colors[1],
4574 (strlen(r->precomp) > 60 ? "..." : "")));
4575
c277df42 4576 if (r->precomp)
a0d0e21e 4577 Safefree(r->precomp);
fac92740
MJD
4578 if (r->offsets) /* 20010421 MJD */
4579 Safefree(r->offsets);
cf93c79d
IZ
4580 if (RX_MATCH_COPIED(r))
4581 Safefree(r->subbeg);
a193d654
GS
4582 if (r->substrs) {
4583 if (r->anchored_substr)
4584 SvREFCNT_dec(r->anchored_substr);
4585 if (r->float_substr)
4586 SvREFCNT_dec(r->float_substr);
2779dcf1 4587 Safefree(r->substrs);
a193d654 4588 }
c277df42
IZ
4589 if (r->data) {
4590 int n = r->data->count;
dfad63ad
HS
4591 AV* new_comppad = NULL;
4592 AV* old_comppad;
4593 SV** old_curpad;
4594
c277df42 4595 while (--n >= 0) {
261faec3 4596 /* If you add a ->what type here, update the comment in regcomp.h */
c277df42
IZ
4597 switch (r->data->what[n]) {
4598 case 's':
4599 SvREFCNT_dec((SV*)r->data->data[n]);
4600 break;
653099ff
GS
4601 case 'f':
4602 Safefree(r->data->data[n]);
4603 break;
dfad63ad
HS
4604 case 'p':
4605 new_comppad = (AV*)r->data->data[n];
4606 break;
c277df42 4607 case 'o':
dfad63ad 4608 if (new_comppad == NULL)
cea2e8a9 4609 Perl_croak(aTHX_ "panic: pregfree comppad");
dfad63ad
HS
4610 old_comppad = PL_comppad;
4611 old_curpad = PL_curpad;
1e6dc0b6
SB
4612 /* Watch out for global destruction's random ordering. */
4613 if (SvTYPE(new_comppad) == SVt_PVAV) {
4614 PL_comppad = new_comppad;
4615 PL_curpad = AvARRAY(new_comppad);
4616 }
4617 else
4618 PL_curpad = NULL;
9b978d73
DM
4619
4620 if (!OpREFCNT_dec((OP_4tree*)r->data->data[n])) {
4621 op_free((OP_4tree*)r->data->data[n]);
4622 }
4623
dfad63ad
HS
4624 PL_comppad = old_comppad;
4625 PL_curpad = old_curpad;
4626 SvREFCNT_dec((SV*)new_comppad);
4627 new_comppad = NULL;
c277df42
IZ
4628 break;
4629 case 'n':
4630 break;
4631 default:
830247a4 4632 Perl_croak(aTHX_ "panic: regfree data code '%c'", r->data->what[n]);
c277df42
IZ
4633 }
4634 }
4635 Safefree(r->data->what);
4636 Safefree(r->data);
a0d0e21e
LW
4637 }
4638 Safefree(r->startp);
4639 Safefree(r->endp);
4640 Safefree(r);
a687059c 4641}
c277df42
IZ
4642
4643/*
4644 - regnext - dig the "next" pointer out of a node
4645 *
4646 * [Note, when REGALIGN is defined there are two places in regmatch()
4647 * that bypass this code for speed.]
4648 */
4649regnode *
864dbfa3 4650Perl_regnext(pTHX_ register regnode *p)
c277df42
IZ
4651{
4652 register I32 offset;
4653
3280af22 4654 if (p == &PL_regdummy)
c277df42
IZ
4655 return(NULL);
4656
4657 offset = (reg_off_by_arg[OP(p)] ? ARG(p) : NEXT_OFF(p));
4658 if (offset == 0)
4659 return(NULL);
4660
c277df42 4661 return(p+offset);
c277df42
IZ
4662}
4663
01f988be 4664STATIC void
cea2e8a9 4665S_re_croak2(pTHX_ const char* pat1,const char* pat2,...)
c277df42
IZ
4666{
4667 va_list args;
4668 STRLEN l1 = strlen(pat1);
4669 STRLEN l2 = strlen(pat2);
4670 char buf[512];
06bf62c7 4671 SV *msv;
c277df42
IZ
4672 char *message;
4673
4674 if (l1 > 510)
4675 l1 = 510;
4676 if (l1 + l2 > 510)
4677 l2 = 510 - l1;
4678 Copy(pat1, buf, l1 , char);
4679 Copy(pat2, buf + l1, l2 , char);
3b818b81
GS
4680 buf[l1 + l2] = '\n';
4681 buf[l1 + l2 + 1] = '\0';
8736538c
AS
4682#ifdef I_STDARG
4683 /* ANSI variant takes additional second argument */
c277df42 4684 va_start(args, pat2);
8736538c
AS
4685#else
4686 va_start(args);
4687#endif
5a844595 4688 msv = vmess(buf, &args);
c277df42 4689 va_end(args);
06bf62c7 4690 message = SvPV(msv,l1);
c277df42
IZ
4691 if (l1 > 512)
4692 l1 = 512;
4693 Copy(message, buf, l1 , char);
4694 buf[l1] = '\0'; /* Overwrite \n */
cea2e8a9 4695 Perl_croak(aTHX_ "%s", buf);
c277df42 4696}
a0ed51b3
LW
4697
4698/* XXX Here's a total kludge. But we need to re-enter for swash routines. */
4699
4700void
864dbfa3 4701Perl_save_re_context(pTHX)
b81d288d 4702{
830247a4
IZ
4703#if 0
4704 SAVEPPTR(RExC_precomp); /* uncompiled string. */
4705 SAVEI32(RExC_npar); /* () count. */
4706 SAVEI32(RExC_size); /* Code size. */
4707 SAVEI16(RExC_flags16); /* are we folding, multilining? */
4708 SAVEVPTR(RExC_rx); /* from regcomp.c */
4709 SAVEI32(RExC_seen); /* from regcomp.c */
4710 SAVEI32(RExC_sawback); /* Did we see \1, ...? */
4711 SAVEI32(RExC_naughty); /* How bad is this pattern? */
4712 SAVEVPTR(RExC_emit); /* Code-emit pointer; &regdummy = don't */
4713 SAVEPPTR(RExC_end); /* End of input for compile */
4714 SAVEPPTR(RExC_parse); /* Input-scan pointer. */
4715#endif
4716
4717 SAVEI32(PL_reg_flags); /* from regexec.c */
a0ed51b3 4718 SAVEPPTR(PL_bostr);
a0ed51b3
LW
4719 SAVEPPTR(PL_reginput); /* String-input pointer. */
4720 SAVEPPTR(PL_regbol); /* Beginning of input, for ^ check. */
4721 SAVEPPTR(PL_regeol); /* End of input, for $ check. */
7766f137
GS
4722 SAVEVPTR(PL_regstartp); /* Pointer to startp array. */
4723 SAVEVPTR(PL_regendp); /* Ditto for endp. */
4724 SAVEVPTR(PL_reglastparen); /* Similarly for lastparen. */
a0ed51b3 4725 SAVEPPTR(PL_regtill); /* How far we are required to go. */
b81d288d 4726 SAVEGENERICPV(PL_reg_start_tmp); /* from regexec.c */
a0ed51b3 4727 PL_reg_start_tmp = 0;
a0ed51b3
LW
4728 SAVEI32(PL_reg_start_tmpl); /* from regexec.c */
4729 PL_reg_start_tmpl = 0;
7766f137 4730 SAVEVPTR(PL_regdata);
a0ed51b3
LW
4731 SAVEI32(PL_reg_eval_set); /* from regexec.c */
4732 SAVEI32(PL_regnarrate); /* from regexec.c */
7766f137 4733 SAVEVPTR(PL_regprogram); /* from regexec.c */
a0ed51b3 4734 SAVEINT(PL_regindent); /* from regexec.c */
7766f137
GS
4735 SAVEVPTR(PL_regcc); /* from regexec.c */
4736 SAVEVPTR(PL_curcop);
7766f137
GS
4737 SAVEVPTR(PL_reg_call_cc); /* from regexec.c */
4738 SAVEVPTR(PL_reg_re); /* from regexec.c */
54b6e2fa
IZ
4739 SAVEPPTR(PL_reg_ganch); /* from regexec.c */
4740 SAVESPTR(PL_reg_sv); /* from regexec.c */
53c4c00c 4741 SAVEI8(PL_reg_match_utf8); /* from regexec.c */
7766f137 4742 SAVEVPTR(PL_reg_magic); /* from regexec.c */
54b6e2fa 4743 SAVEI32(PL_reg_oldpos); /* from regexec.c */
7766f137
GS
4744 SAVEVPTR(PL_reg_oldcurpm); /* from regexec.c */
4745 SAVEVPTR(PL_reg_curpm); /* from regexec.c */
5fb7366e 4746 SAVEI32(PL_regnpar); /* () count. */
e49a9654 4747 SAVEI32(PL_regsize); /* from regexec.c */
54b6e2fa 4748#ifdef DEBUGGING
b81d288d 4749 SAVEPPTR(PL_reg_starttry); /* from regexec.c */
54b6e2fa 4750#endif
a0ed51b3 4751}
51371543 4752
51371543 4753static void
acfe0abc 4754clear_re(pTHX_ void *r)
51371543
GS
4755{
4756 ReREFCNT_dec((regexp *)r);
4757}
ffbc6a93 4758