This is a live mirror of the Perl 5 development currently hosted at https://github.com/perl/perl5
Make the test to cleanup the test directory tree
[perl5.git] / regcomp.c
CommitLineData
a0d0e21e
LW
1/* regcomp.c
2 */
3
4/*
5 * "A fair jaw-cracker dwarf-language must be." --Samwise Gamgee
6 */
7
a687059c
LW
8/* NOTE: this is derived from Henry Spencer's regexp code, and should not
9 * confused with the original package (see point 3 below). Thanks, Henry!
10 */
11
12/* Additional note: this code is very heavily munged from Henry's version
13 * in places. In some spots I've traded clarity for efficiency, so don't
14 * blame Henry for some of the lack of readability.
15 */
16
e50aee73
AD
17/* The names of the functions have been changed from regcomp and
18 * regexec to pregcomp and pregexec in order to avoid conflicts
19 * with the POSIX routines of the same names.
20*/
21
b9d5759e
AD
22#ifdef PERL_EXT_RE_BUILD
23/* need to replace pregcomp et al, so enable that */
24# ifndef PERL_IN_XSUB_RE
25# define PERL_IN_XSUB_RE
26# endif
27/* need access to debugger hooks */
cad2e5aa 28# if defined(PERL_EXT_RE_DEBUG) && !defined(DEBUGGING)
b9d5759e
AD
29# define DEBUGGING
30# endif
31#endif
32
33#ifdef PERL_IN_XSUB_RE
d06ea78c 34/* We *really* need to overwrite these symbols: */
56953603
IZ
35# define Perl_pregcomp my_regcomp
36# define Perl_regdump my_regdump
37# define Perl_regprop my_regprop
d06ea78c 38# define Perl_pregfree my_regfree
cad2e5aa
JH
39# define Perl_re_intuit_string my_re_intuit_string
40/* *These* symbols are masked to allow static link. */
d06ea78c 41# define Perl_regnext my_regnext
f0b8d043 42# define Perl_save_re_context my_save_re_context
b81d288d 43# define Perl_reginitcolors my_reginitcolors
c5be433b
GS
44
45# define PERL_NO_GET_CONTEXT
b81d288d 46#endif
56953603 47
f0fcb552 48/*SUPPRESS 112*/
a687059c 49/*
e50aee73 50 * pregcomp and pregexec -- regsub and regerror are not used in perl
a687059c
LW
51 *
52 * Copyright (c) 1986 by University of Toronto.
53 * Written by Henry Spencer. Not derived from licensed software.
54 *
55 * Permission is granted to anyone to use this software for any
56 * purpose on any computer system, and to redistribute it freely,
57 * subject to the following restrictions:
58 *
59 * 1. The author is not responsible for the consequences of use of
60 * this software, no matter how awful, even if they arise
61 * from defects in it.
62 *
63 * 2. The origin of this software must not be misrepresented, either
64 * by explicit claim or by omission.
65 *
66 * 3. Altered versions must be plainly marked as such, and must not
67 * be misrepresented as being the original software.
68 *
69 *
70 **** Alterations to Henry's code are...
71 ****
bc89e66f 72 **** Copyright (c) 1991-2001, Larry Wall
a687059c 73 ****
9ef589d8
LW
74 **** You may distribute under the terms of either the GNU General Public
75 **** License or the Artistic License, as specified in the README file.
76
a687059c
LW
77 *
78 * Beware that some of this code is subtly aware of the way operator
79 * precedence is structured in regular expressions. Serious changes in
80 * regular-expression syntax might require a total rethink.
81 */
82#include "EXTERN.h"
864dbfa3 83#define PERL_IN_REGCOMP_C
a687059c 84#include "perl.h"
d06ea78c 85
c5be433b
GS
86#ifdef PERL_IN_XSUB_RE
87# if defined(PERL_CAPI) || defined(PERL_OBJECT)
88# include "XSUB.h"
89# endif
90#else
d06ea78c
GS
91# include "INTERN.h"
92#endif
c277df42
IZ
93
94#define REG_COMP_C
a687059c
LW
95#include "regcomp.h"
96
d4cce5f1 97#ifdef op
11343788 98#undef op
d4cce5f1 99#endif /* op */
11343788 100
fe14fcc3
LW
101#ifdef MSDOS
102# if defined(BUGGY_MSC6)
103 /* MSC 6.00A breaks on op/regexp.t test 85 unless we turn this off */
104 # pragma optimize("a",off)
105 /* But MSC 6.00A is happy with 'w', for aliases only across function calls*/
106 # pragma optimize("w",on )
107# endif /* BUGGY_MSC6 */
108#endif /* MSDOS */
109
a687059c
LW
110#ifndef STATIC
111#define STATIC static
112#endif
113
830247a4
IZ
114typedef struct RExC_state_t {
115 U16 flags16; /* are we folding, multilining? */
116 char *precomp; /* uncompiled string. */
117 regexp *rx;
fac92740 118 char *start; /* Start of input for compile */
830247a4
IZ
119 char *end; /* End of input for compile */
120 char *parse; /* Input-scan pointer. */
121 I32 whilem_seen; /* number of WHILEM in this expr */
fac92740 122 regnode *emit_start; /* Start of emitted-code area */
ffc61ed2 123 regnode *emit; /* Code-emit pointer; &regdummy = don't = compiling */
830247a4
IZ
124 I32 naughty; /* How bad is this pattern? */
125 I32 sawback; /* Did we see \1, ...? */
126 U32 seen;
127 I32 size; /* Code size. */
128 I32 npar; /* () count. */
129 I32 extralen;
130 I32 seen_zerolen;
131 I32 seen_evals;
1aa99e6b 132 I32 utf8;
830247a4
IZ
133#if ADD_TO_REGEXEC
134 char *starttry; /* -Dr: where regtry was called. */
135#define RExC_starttry (pRExC_state->starttry)
136#endif
137} RExC_state_t;
138
139#define RExC_flags16 (pRExC_state->flags16)
140#define RExC_precomp (pRExC_state->precomp)
141#define RExC_rx (pRExC_state->rx)
fac92740 142#define RExC_start (pRExC_state->start)
830247a4
IZ
143#define RExC_end (pRExC_state->end)
144#define RExC_parse (pRExC_state->parse)
145#define RExC_whilem_seen (pRExC_state->whilem_seen)
fac92740 146#define RExC_offsets (pRExC_state->rx->offsets) /* I am not like the others */
830247a4 147#define RExC_emit (pRExC_state->emit)
fac92740 148#define RExC_emit_start (pRExC_state->emit_start)
830247a4
IZ
149#define RExC_naughty (pRExC_state->naughty)
150#define RExC_sawback (pRExC_state->sawback)
151#define RExC_seen (pRExC_state->seen)
152#define RExC_size (pRExC_state->size)
153#define RExC_npar (pRExC_state->npar)
154#define RExC_extralen (pRExC_state->extralen)
155#define RExC_seen_zerolen (pRExC_state->seen_zerolen)
156#define RExC_seen_evals (pRExC_state->seen_evals)
1aa99e6b 157#define RExC_utf8 (pRExC_state->utf8)
830247a4 158
a687059c
LW
159#define ISMULT1(c) ((c) == '*' || (c) == '+' || (c) == '?')
160#define ISMULT2(s) ((*s) == '*' || (*s) == '+' || (*s) == '?' || \
161 ((*s) == '{' && regcurly(s)))
a687059c 162
35c8bce7
LW
163#ifdef SPSTART
164#undef SPSTART /* dratted cpp namespace... */
165#endif
a687059c
LW
166/*
167 * Flags to be passed up and down.
168 */
a687059c 169#define WORST 0 /* Worst case. */
821b33a5 170#define HASWIDTH 0x1 /* Known to match non-null strings. */
a0d0e21e
LW
171#define SIMPLE 0x2 /* Simple enough to be STAR/PLUS operand. */
172#define SPSTART 0x4 /* Starts with * or +. */
173#define TRYAGAIN 0x8 /* Weeded out a declaration. */
a687059c 174
2c2d71f5
JH
175/* Length of a variant. */
176
177typedef struct scan_data_t {
178 I32 len_min;
179 I32 len_delta;
180 I32 pos_min;
181 I32 pos_delta;
182 SV *last_found;
183 I32 last_end; /* min value, <0 unless valid. */
184 I32 last_start_min;
185 I32 last_start_max;
186 SV **longest; /* Either &l_fixed, or &l_float. */
187 SV *longest_fixed;
188 I32 offset_fixed;
189 SV *longest_float;
190 I32 offset_float_min;
191 I32 offset_float_max;
192 I32 flags;
193 I32 whilem_c;
cb434fcc 194 I32 *last_closep;
653099ff 195 struct regnode_charclass_class *start_class;
2c2d71f5
JH
196} scan_data_t;
197
a687059c 198/*
e50aee73 199 * Forward declarations for pregcomp()'s friends.
a687059c 200 */
a0d0e21e 201
b81d288d 202static scan_data_t zero_scan_data = { 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
cb434fcc 203 0, 0, 0, 0, 0, 0};
c277df42
IZ
204
205#define SF_BEFORE_EOL (SF_BEFORE_SEOL|SF_BEFORE_MEOL)
206#define SF_BEFORE_SEOL 0x1
207#define SF_BEFORE_MEOL 0x2
208#define SF_FIX_BEFORE_EOL (SF_FIX_BEFORE_SEOL|SF_FIX_BEFORE_MEOL)
209#define SF_FL_BEFORE_EOL (SF_FL_BEFORE_SEOL|SF_FL_BEFORE_MEOL)
210
09b7f37c
CB
211#ifdef NO_UNARY_PLUS
212# define SF_FIX_SHIFT_EOL (0+2)
213# define SF_FL_SHIFT_EOL (0+4)
214#else
215# define SF_FIX_SHIFT_EOL (+2)
216# define SF_FL_SHIFT_EOL (+4)
217#endif
c277df42
IZ
218
219#define SF_FIX_BEFORE_SEOL (SF_BEFORE_SEOL << SF_FIX_SHIFT_EOL)
220#define SF_FIX_BEFORE_MEOL (SF_BEFORE_MEOL << SF_FIX_SHIFT_EOL)
221
222#define SF_FL_BEFORE_SEOL (SF_BEFORE_SEOL << SF_FL_SHIFT_EOL)
223#define SF_FL_BEFORE_MEOL (SF_BEFORE_MEOL << SF_FL_SHIFT_EOL) /* 0x20 */
224#define SF_IS_INF 0x40
225#define SF_HAS_PAR 0x80
226#define SF_IN_PAR 0x100
227#define SF_HAS_EVAL 0x200
4bfe0158 228#define SCF_DO_SUBSTR 0x400
653099ff
GS
229#define SCF_DO_STCLASS_AND 0x0800
230#define SCF_DO_STCLASS_OR 0x1000
231#define SCF_DO_STCLASS (SCF_DO_STCLASS_AND|SCF_DO_STCLASS_OR)
e1901655 232#define SCF_WHILEM_VISITED_POS 0x2000
c277df42 233
1aa99e6b 234#define UTF RExC_utf8
830247a4
IZ
235#define LOC (RExC_flags16 & PMf_LOCALE)
236#define FOLD (RExC_flags16 & PMf_FOLD)
a0ed51b3 237
ffc61ed2 238#define OOB_UNICODE 12345678
93733859 239#define OOB_NAMEDCLASS -1
b8c5462f 240
a0ed51b3
LW
241#define CHR_SVLEN(sv) (UTF ? sv_len_utf8(sv) : SvCUR(sv))
242#define CHR_DIST(a,b) (UTF ? utf8_distance(a,b) : a - b)
243
8615cb43 244
b45f050a
JF
245/* length of regex to show in messages that don't mark a position within */
246#define RegexLengthToShowInErrorMessages 127
247
248/*
249 * If MARKER[12] are adjusted, be sure to adjust the constants at the top
250 * of t/op/regmesg.t, the tests in t/op/re_tests, and those in
251 * op/pragma/warn/regcomp.
252 */
79eeca27
JH
253#define MARKER1 "HERE" /* marker as it appears in the description */
254#define MARKER2 " << HERE " /* marker as it appears within the regex */
b81d288d 255
79eeca27 256#define REPORT_LOCATION " before " MARKER1 " mark in regex m/%.*s" MARKER2 "%s/"
b45f050a
JF
257
258/*
259 * Calls SAVEDESTRUCTOR_X if needed, then calls Perl_croak with the given
260 * arg. Show regex, up to a maximum length. If it's too long, chop and add
261 * "...".
262 */
a4eb266f 263#define FAIL(msg) \
8615cb43 264 STMT_START { \
a4eb266f 265 char *ellipses = ""; \
830247a4 266 unsigned len = strlen(RExC_precomp); \
b45f050a 267 \
8615cb43 268 if (!SIZE_ONLY) \
830247a4 269 SAVEDESTRUCTOR_X(clear_re,(void*)RExC_rx); \
b45f050a
JF
270 \
271 if (len > RegexLengthToShowInErrorMessages) { \
272 /* chop 10 shorter than the max, to ensure meaning of "..." */ \
273 len = RegexLengthToShowInErrorMessages - 10; \
a4eb266f 274 ellipses = "..."; \
b45f050a
JF
275 } \
276 Perl_croak(aTHX_ "%s in regex m/%.*s%s/", \
830247a4 277 msg, (int)len, RExC_precomp, ellipses); \
8615cb43
JF
278 } STMT_END
279
b45f050a
JF
280/*
281 * Calls SAVEDESTRUCTOR_X if needed, then calls Perl_croak with the given
282 * args. Show regex, up to a maximum length. If it's too long, chop and add
283 * "...".
284 */
a4eb266f 285#define FAIL2(pat,msg) \
8615cb43 286 STMT_START { \
a4eb266f 287 char *ellipses = ""; \
830247a4 288 unsigned len = strlen(RExC_precomp); \
b45f050a 289 \
8615cb43 290 if (!SIZE_ONLY) \
830247a4 291 SAVEDESTRUCTOR_X(clear_re,(void*)RExC_rx); \
8615cb43 292 \
b45f050a
JF
293 if (len > RegexLengthToShowInErrorMessages) { \
294 /* chop 10 shorter than the max, to ensure meaning of "..." */ \
295 len = RegexLengthToShowInErrorMessages - 10; \
a4eb266f 296 ellipses = "..."; \
b45f050a
JF
297 } \
298 S_re_croak2(aTHX_ pat, " in regex m/%.*s%s/", \
830247a4 299 msg, (int)len, RExC_precomp, ellipses); \
b45f050a
JF
300 } STMT_END
301
302
303/*
304 * Simple_vFAIL -- like FAIL, but marks the current location in the scan
305 */
306#define Simple_vFAIL(m) \
307 STMT_START { \
830247a4 308 unsigned offset = strlen(RExC_precomp)-(RExC_end-RExC_parse); \
b45f050a
JF
309 \
310 Perl_croak(aTHX_ "%s" REPORT_LOCATION, \
830247a4 311 m, (int)offset, RExC_precomp, RExC_precomp + offset); \
b45f050a
JF
312 } STMT_END
313
314/*
315 * Calls SAVEDESTRUCTOR_X if needed, then Simple_vFAIL()
316 */
317#define vFAIL(m) \
318 STMT_START { \
319 if (!SIZE_ONLY) \
830247a4 320 SAVEDESTRUCTOR_X(clear_re,(void*)RExC_rx); \
b45f050a
JF
321 Simple_vFAIL(m); \
322 } STMT_END
323
324/*
325 * Like Simple_vFAIL(), but accepts two arguments.
326 */
327#define Simple_vFAIL2(m,a1) \
328 STMT_START { \
830247a4 329 unsigned offset = strlen(RExC_precomp)-(RExC_end-RExC_parse); \
b45f050a
JF
330 \
331 S_re_croak2(aTHX_ m, REPORT_LOCATION, a1, \
830247a4 332 (int)offset, RExC_precomp, RExC_precomp + offset); \
b45f050a
JF
333 } STMT_END
334
335/*
336 * Calls SAVEDESTRUCTOR_X if needed, then Simple_vFAIL2().
337 */
338#define vFAIL2(m,a1) \
339 STMT_START { \
340 if (!SIZE_ONLY) \
830247a4 341 SAVEDESTRUCTOR_X(clear_re,(void*)RExC_rx); \
b45f050a
JF
342 Simple_vFAIL2(m, a1); \
343 } STMT_END
344
345
346/*
347 * Like Simple_vFAIL(), but accepts three arguments.
348 */
349#define Simple_vFAIL3(m, a1, a2) \
350 STMT_START { \
830247a4 351 unsigned offset = strlen(RExC_precomp)-(RExC_end-RExC_parse); \
b45f050a
JF
352 \
353 S_re_croak2(aTHX_ m, REPORT_LOCATION, a1, a2, \
830247a4 354 (int)offset, RExC_precomp, RExC_precomp + offset); \
b45f050a
JF
355 } STMT_END
356
357/*
358 * Calls SAVEDESTRUCTOR_X if needed, then Simple_vFAIL3().
359 */
360#define vFAIL3(m,a1,a2) \
361 STMT_START { \
362 if (!SIZE_ONLY) \
830247a4 363 SAVEDESTRUCTOR_X(clear_re,(void*)RExC_rx); \
b45f050a
JF
364 Simple_vFAIL3(m, a1, a2); \
365 } STMT_END
366
367/*
368 * Like Simple_vFAIL(), but accepts four arguments.
369 */
370#define Simple_vFAIL4(m, a1, a2, a3) \
371 STMT_START { \
830247a4 372 unsigned offset = strlen(RExC_precomp)-(RExC_end-RExC_parse); \
b45f050a
JF
373 \
374 S_re_croak2(aTHX_ m, REPORT_LOCATION, a1, a2, a3,\
830247a4 375 (int)offset, RExC_precomp, RExC_precomp + offset); \
b45f050a
JF
376 } STMT_END
377
378/*
379 * Like Simple_vFAIL(), but accepts five arguments.
380 */
381#define Simple_vFAIL5(m, a1, a2, a3, a4) \
382 STMT_START { \
830247a4 383 unsigned offset = strlen(RExC_precomp)-(RExC_end-RExC_parse); \
b45f050a 384 S_re_croak2(aTHX_ m, REPORT_LOCATION, a1, a2, a3, a4,\
830247a4 385 (int)offset, RExC_precomp, RExC_precomp + offset); \
b45f050a
JF
386 } STMT_END
387
388
389#define vWARN(loc,m) \
390 STMT_START { \
830247a4 391 unsigned offset = strlen(RExC_precomp)-(RExC_end-(loc)); \
b45f050a 392 Perl_warner(aTHX_ WARN_REGEXP, "%s" REPORT_LOCATION,\
830247a4 393 m, (int)offset, RExC_precomp, RExC_precomp + offset); \
b45f050a
JF
394 } STMT_END \
395
396
397#define vWARN2(loc, m, a1) \
398 STMT_START { \
830247a4 399 unsigned offset = strlen(RExC_precomp)-(RExC_end-(loc)); \
b45f050a
JF
400 Perl_warner(aTHX_ WARN_REGEXP, m REPORT_LOCATION,\
401 a1, \
830247a4 402 (int)offset, RExC_precomp, RExC_precomp + offset); \
b45f050a
JF
403 } STMT_END
404
405#define vWARN3(loc, m, a1, a2) \
406 STMT_START { \
830247a4 407 unsigned offset = strlen(RExC_precomp) - (RExC_end - (loc)); \
b45f050a
JF
408 Perl_warner(aTHX_ WARN_REGEXP, m REPORT_LOCATION, \
409 a1, a2, \
830247a4 410 (int)offset, RExC_precomp, RExC_precomp + offset); \
b45f050a
JF
411 } STMT_END
412
413#define vWARN4(loc, m, a1, a2, a3) \
414 STMT_START { \
830247a4 415 unsigned offset = strlen(RExC_precomp)-(RExC_end-(loc)); \
b45f050a
JF
416 Perl_warner(aTHX_ WARN_REGEXP, m REPORT_LOCATION,\
417 a1, a2, a3, \
830247a4 418 (int)offset, RExC_precomp, RExC_precomp + offset); \
8615cb43
JF
419 } STMT_END
420
421
cd439c50
IZ
422/* Allow for side effects in s */
423#define REGC(c,s) STMT_START { if (!SIZE_ONLY) *(s) = (c); else (s);} STMT_END
424
fac92740
MJD
425/* Macros for recording node offsets. 20001227 mjd@plover.com
426 * Nodes are numbered 1, 2, 3, 4. Node #n's position is recorded in
427 * element 2*n-1 of the array. Element #2n holds the byte length node #n.
428 * Element 0 holds the number n.
429 */
430
431#define MJD_OFFSET_DEBUG(x)
432/* #define MJD_OFFSET_DEBUG(x) fprintf x */
433
434
435# define Set_Node_Offset_To_R(node,byte) \
436 STMT_START { \
437 if (! SIZE_ONLY) { \
438 if((node) < 0) { \
439 Perl_croak(aTHX_ "value of node is %d in Offset macro", node); \
440 } else { \
441 RExC_offsets[2*(node)-1] = (byte); \
442 } \
443 } \
444 } STMT_END
445
446# define Set_Node_Offset(node,byte) Set_Node_Offset_To_R((node)-RExC_emit_start, (byte)-RExC_start)
447# define Set_Cur_Node_Offset Set_Node_Offset(RExC_emit, RExC_parse)
448
449# define Set_Node_Length_To_R(node,len) \
450 STMT_START { \
451 if (! SIZE_ONLY) { \
452 MJD_OFFSET_DEBUG((stderr, "** (%d) size of node %d is %d.\n", __LINE__, (node), (len))); \
453 if((node) < 0) { \
454 Perl_croak(aTHX_ "value of node is %d in Length macro", node); \
455 } else { \
456 RExC_offsets[2*(node)] = (len); \
457 } \
458 } \
459 } STMT_END
460
461# define Set_Node_Length(node,len) Set_Node_Length_To_R((node)-RExC_emit_start, len)
462# define Set_Cur_Node_Length(len) Set_Node_Length(RExC_emit, len)
463# define Set_Node_Cur_Length(node) Set_Node_Length(node, RExC_parse - parse_start)
464
465/* Get offsets and lengths */
466#define Node_Offset(n) (RExC_offsets[2*((n)-RExC_emit_start)-1])
467#define Node_Length(n) (RExC_offsets[2*((n)-RExC_emit_start)])
468
51371543 469static void clear_re(pTHXo_ void *r);
4327152a 470
653099ff
GS
471/* Mark that we cannot extend a found fixed substring at this point.
472 Updata the longest found anchored substring and the longest found
473 floating substrings if needed. */
474
4327152a 475STATIC void
830247a4 476S_scan_commit(pTHX_ RExC_state_t *pRExC_state, scan_data_t *data)
c277df42 477{
a0ed51b3
LW
478 STRLEN l = CHR_SVLEN(data->last_found);
479 STRLEN old_l = CHR_SVLEN(*data->longest);
b81d288d 480
c277df42
IZ
481 if ((l >= old_l) && ((l > old_l) || (data->flags & SF_BEFORE_EOL))) {
482 sv_setsv(*data->longest, data->last_found);
483 if (*data->longest == data->longest_fixed) {
484 data->offset_fixed = l ? data->last_start_min : data->pos_min;
485 if (data->flags & SF_BEFORE_EOL)
b81d288d 486 data->flags
c277df42
IZ
487 |= ((data->flags & SF_BEFORE_EOL) << SF_FIX_SHIFT_EOL);
488 else
489 data->flags &= ~SF_FIX_BEFORE_EOL;
a0ed51b3
LW
490 }
491 else {
c277df42 492 data->offset_float_min = l ? data->last_start_min : data->pos_min;
b81d288d
AB
493 data->offset_float_max = (l
494 ? data->last_start_max
c277df42
IZ
495 : data->pos_min + data->pos_delta);
496 if (data->flags & SF_BEFORE_EOL)
b81d288d 497 data->flags
c277df42
IZ
498 |= ((data->flags & SF_BEFORE_EOL) << SF_FL_SHIFT_EOL);
499 else
500 data->flags &= ~SF_FL_BEFORE_EOL;
501 }
502 }
503 SvCUR_set(data->last_found, 0);
504 data->last_end = -1;
505 data->flags &= ~SF_BEFORE_EOL;
506}
507
653099ff
GS
508/* Can match anything (initialization) */
509STATIC void
830247a4 510S_cl_anything(pTHX_ RExC_state_t *pRExC_state, struct regnode_charclass_class *cl)
653099ff 511{
9af0acdb
JH
512 int value;
513
653099ff 514 ANYOF_CLASS_ZERO(cl);
9af0acdb
JH
515 for (value = 0; value < 256; ++value)
516 ANYOF_BITMAP_SET(cl, value);
1aa99e6b 517 cl->flags = ANYOF_EOS|ANYOF_UNICODE_ALL;
653099ff
GS
518 if (LOC)
519 cl->flags |= ANYOF_LOCALE;
520}
521
522/* Can match anything (initialization) */
523STATIC int
524S_cl_is_anything(pTHX_ struct regnode_charclass_class *cl)
525{
526 int value;
527
aaa51d5e 528 for (value = 0; value <= ANYOF_MAX; value += 2)
653099ff
GS
529 if (ANYOF_CLASS_TEST(cl, value) && ANYOF_CLASS_TEST(cl, value + 1))
530 return 1;
1aa99e6b
IH
531 if (!(cl->flags & ANYOF_UNICODE_ALL))
532 return 0;
9af0acdb
JH
533 for (value = 0; value < 256; ++value)
534 if (!ANYOF_BITMAP_TEST(cl, value))
653099ff
GS
535 return 0;
536 return 1;
537}
538
539/* Can match anything (initialization) */
540STATIC void
830247a4 541S_cl_init(pTHX_ RExC_state_t *pRExC_state, struct regnode_charclass_class *cl)
653099ff 542{
8ecf7187 543 Zero(cl, 1, struct regnode_charclass_class);
653099ff 544 cl->type = ANYOF;
830247a4 545 cl_anything(pRExC_state, cl);
653099ff
GS
546}
547
548STATIC void
830247a4 549S_cl_init_zero(pTHX_ RExC_state_t *pRExC_state, struct regnode_charclass_class *cl)
653099ff 550{
8ecf7187 551 Zero(cl, 1, struct regnode_charclass_class);
653099ff 552 cl->type = ANYOF;
830247a4 553 cl_anything(pRExC_state, cl);
653099ff
GS
554 if (LOC)
555 cl->flags |= ANYOF_LOCALE;
556}
557
558/* 'And' a given class with another one. Can create false positives */
559/* We assume that cl is not inverted */
560STATIC void
561S_cl_and(pTHX_ struct regnode_charclass_class *cl,
562 struct regnode_charclass_class *and_with)
563{
653099ff
GS
564 if (!(and_with->flags & ANYOF_CLASS)
565 && !(cl->flags & ANYOF_CLASS)
566 && (and_with->flags & ANYOF_LOCALE) == (cl->flags & ANYOF_LOCALE)
567 && !(and_with->flags & ANYOF_FOLD)
568 && !(cl->flags & ANYOF_FOLD)) {
569 int i;
570
571 if (and_with->flags & ANYOF_INVERT)
572 for (i = 0; i < ANYOF_BITMAP_SIZE; i++)
573 cl->bitmap[i] &= ~and_with->bitmap[i];
574 else
575 for (i = 0; i < ANYOF_BITMAP_SIZE; i++)
576 cl->bitmap[i] &= and_with->bitmap[i];
577 } /* XXXX: logic is complicated otherwise, leave it along for a moment. */
578 if (!(and_with->flags & ANYOF_EOS))
579 cl->flags &= ~ANYOF_EOS;
1aa99e6b
IH
580
581 if (cl->flags & ANYOF_UNICODE_ALL && and_with->flags & ANYOF_UNICODE) {
582 cl->flags &= ~ANYOF_UNICODE_ALL;
583 cl->flags |= ANYOF_UNICODE;
584 ARG_SET(cl, ARG(and_with));
585 }
586 if (!(and_with->flags & ANYOF_UNICODE_ALL))
587 cl->flags &= ~ANYOF_UNICODE_ALL;
588 if (!(and_with->flags & (ANYOF_UNICODE|ANYOF_UNICODE_ALL)))
589 cl->flags &= ~ANYOF_UNICODE;
653099ff
GS
590}
591
592/* 'OR' a given class with another one. Can create false positives */
593/* We assume that cl is not inverted */
594STATIC void
830247a4 595S_cl_or(pTHX_ RExC_state_t *pRExC_state, struct regnode_charclass_class *cl, struct regnode_charclass_class *or_with)
653099ff 596{
653099ff
GS
597 if (or_with->flags & ANYOF_INVERT) {
598 /* We do not use
599 * (B1 | CL1) | (!B2 & !CL2) = (B1 | !B2 & !CL2) | (CL1 | (!B2 & !CL2))
600 * <= (B1 | !B2) | (CL1 | !CL2)
601 * which is wasteful if CL2 is small, but we ignore CL2:
602 * (B1 | CL1) | (!B2 & !CL2) <= (B1 | CL1) | !B2 = (B1 | !B2) | CL1
603 * XXXX Can we handle case-fold? Unclear:
604 * (OK1(i) | OK1(i')) | !(OK1(i) | OK1(i')) =
605 * (OK1(i) | OK1(i')) | (!OK1(i) & !OK1(i'))
606 */
607 if ( (or_with->flags & ANYOF_LOCALE) == (cl->flags & ANYOF_LOCALE)
608 && !(or_with->flags & ANYOF_FOLD)
609 && !(cl->flags & ANYOF_FOLD) ) {
610 int i;
611
612 for (i = 0; i < ANYOF_BITMAP_SIZE; i++)
613 cl->bitmap[i] |= ~or_with->bitmap[i];
614 } /* XXXX: logic is complicated otherwise */
615 else {
830247a4 616 cl_anything(pRExC_state, cl);
653099ff
GS
617 }
618 } else {
619 /* (B1 | CL1) | (B2 | CL2) = (B1 | B2) | (CL1 | CL2)) */
620 if ( (or_with->flags & ANYOF_LOCALE) == (cl->flags & ANYOF_LOCALE)
b81d288d 621 && (!(or_with->flags & ANYOF_FOLD)
653099ff
GS
622 || (cl->flags & ANYOF_FOLD)) ) {
623 int i;
624
625 /* OR char bitmap and class bitmap separately */
626 for (i = 0; i < ANYOF_BITMAP_SIZE; i++)
627 cl->bitmap[i] |= or_with->bitmap[i];
628 if (or_with->flags & ANYOF_CLASS) {
629 for (i = 0; i < ANYOF_CLASSBITMAP_SIZE; i++)
630 cl->classflags[i] |= or_with->classflags[i];
631 cl->flags |= ANYOF_CLASS;
632 }
633 }
634 else { /* XXXX: logic is complicated, leave it along for a moment. */
830247a4 635 cl_anything(pRExC_state, cl);
653099ff
GS
636 }
637 }
638 if (or_with->flags & ANYOF_EOS)
639 cl->flags |= ANYOF_EOS;
1aa99e6b
IH
640
641 if (cl->flags & ANYOF_UNICODE && or_with->flags & ANYOF_UNICODE &&
642 ARG(cl) != ARG(or_with)) {
643 cl->flags |= ANYOF_UNICODE_ALL;
644 cl->flags &= ~ANYOF_UNICODE;
645 }
646 if (or_with->flags & ANYOF_UNICODE_ALL) {
647 cl->flags |= ANYOF_UNICODE_ALL;
648 cl->flags &= ~ANYOF_UNICODE;
649 }
653099ff
GS
650}
651
652/* REx optimizer. Converts nodes into quickier variants "in place".
653 Finds fixed substrings. */
654
c277df42
IZ
655/* Stops at toplevel WHILEM as well as at `last'. At end *scanp is set
656 to the position after last scanned or to NULL. */
657
76e3520e 658STATIC I32
830247a4 659S_study_chunk(pTHX_ RExC_state_t *pRExC_state, regnode **scanp, I32 *deltap, regnode *last, scan_data_t *data, U32 flags)
c277df42
IZ
660 /* scanp: Start here (read-write). */
661 /* deltap: Write maxlen-minlen here. */
662 /* last: Stop before this one. */
663{
664 I32 min = 0, pars = 0, code;
665 regnode *scan = *scanp, *next;
666 I32 delta = 0;
667 int is_inf = (flags & SCF_DO_SUBSTR) && (data->flags & SF_IS_INF);
aca2d497 668 int is_inf_internal = 0; /* The studied chunk is infinite */
c277df42
IZ
669 I32 is_par = OP(scan) == OPEN ? ARG(scan) : 0;
670 scan_data_t data_fake;
653099ff 671 struct regnode_charclass_class and_with; /* Valid if flags & SCF_DO_STCLASS_OR */
b81d288d 672
c277df42
IZ
673 while (scan && OP(scan) != END && scan < last) {
674 /* Peephole optimizer: */
675
22c35a8c 676 if (PL_regkind[(U8)OP(scan)] == EXACT) {
653099ff 677 /* Merge several consecutive EXACTish nodes into one. */
c277df42
IZ
678 regnode *n = regnext(scan);
679 U32 stringok = 1;
680#ifdef DEBUGGING
681 regnode *stop = scan;
b81d288d 682#endif
c277df42 683
cd439c50 684 next = scan + NODE_SZ_STR(scan);
c277df42
IZ
685 /* Skip NOTHING, merge EXACT*. */
686 while (n &&
b81d288d 687 ( PL_regkind[(U8)OP(n)] == NOTHING ||
c277df42
IZ
688 (stringok && (OP(n) == OP(scan))))
689 && NEXT_OFF(n)
690 && NEXT_OFF(scan) + NEXT_OFF(n) < I16_MAX) {
691 if (OP(n) == TAIL || n > next)
692 stringok = 0;
22c35a8c 693 if (PL_regkind[(U8)OP(n)] == NOTHING) {
c277df42
IZ
694 NEXT_OFF(scan) += NEXT_OFF(n);
695 next = n + NODE_STEP_REGNODE;
696#ifdef DEBUGGING
697 if (stringok)
698 stop = n;
b81d288d 699#endif
c277df42 700 n = regnext(n);
a0ed51b3 701 }
f49d4d0f 702 else if (stringok) {
cd439c50 703 int oldl = STR_LEN(scan);
c277df42 704 regnode *nnext = regnext(n);
f49d4d0f 705
b81d288d 706 if (oldl + STR_LEN(n) > U8_MAX)
c277df42
IZ
707 break;
708 NEXT_OFF(scan) += NEXT_OFF(n);
cd439c50
IZ
709 STR_LEN(scan) += STR_LEN(n);
710 next = n + NODE_SZ_STR(n);
c277df42 711 /* Now we can overwrite *n : */
f49d4d0f 712 Move(STRING(n), STRING(scan) + oldl, STR_LEN(n), char);
c277df42 713#ifdef DEBUGGING
f49d4d0f 714 stop = next - 1;
b81d288d 715#endif
c277df42
IZ
716 n = nnext;
717 }
718 }
719#ifdef DEBUGGING
720 /* Allow dumping */
cd439c50 721 n = scan + NODE_SZ_STR(scan);
c277df42 722 while (n <= stop) {
22c35a8c 723 if (PL_regkind[(U8)OP(n)] != NOTHING || OP(n) == NOTHING) {
c277df42
IZ
724 OP(n) = OPTIMIZED;
725 NEXT_OFF(n) = 0;
726 }
727 n++;
728 }
653099ff 729#endif
c277df42 730 }
653099ff
GS
731 /* Follow the next-chain of the current node and optimize
732 away all the NOTHINGs from it. */
c277df42 733 if (OP(scan) != CURLYX) {
048cfca1
GS
734 int max = (reg_off_by_arg[OP(scan)]
735 ? I32_MAX
736 /* I32 may be smaller than U16 on CRAYs! */
737 : (I32_MAX < U16_MAX ? I32_MAX : U16_MAX));
c277df42
IZ
738 int off = (reg_off_by_arg[OP(scan)] ? ARG(scan) : NEXT_OFF(scan));
739 int noff;
740 regnode *n = scan;
b81d288d 741
c277df42
IZ
742 /* Skip NOTHING and LONGJMP. */
743 while ((n = regnext(n))
22c35a8c 744 && ((PL_regkind[(U8)OP(n)] == NOTHING && (noff = NEXT_OFF(n)))
c277df42
IZ
745 || ((OP(n) == LONGJMP) && (noff = ARG(n))))
746 && off + noff < max)
747 off += noff;
748 if (reg_off_by_arg[OP(scan)])
749 ARG(scan) = off;
b81d288d 750 else
c277df42
IZ
751 NEXT_OFF(scan) = off;
752 }
653099ff
GS
753 /* The principal pseudo-switch. Cannot be a switch, since we
754 look into several different things. */
b81d288d 755 if (OP(scan) == BRANCH || OP(scan) == BRANCHJ
c277df42
IZ
756 || OP(scan) == IFTHEN || OP(scan) == SUSPEND) {
757 next = regnext(scan);
758 code = OP(scan);
b81d288d
AB
759
760 if (OP(next) == code || code == IFTHEN || code == SUSPEND) {
c277df42 761 I32 max1 = 0, min1 = I32_MAX, num = 0;
653099ff 762 struct regnode_charclass_class accum;
c277df42 763
653099ff 764 if (flags & SCF_DO_SUBSTR) /* XXXX Add !SUSPEND? */
830247a4 765 scan_commit(pRExC_state, data); /* Cannot merge strings after this. */
653099ff 766 if (flags & SCF_DO_STCLASS)
830247a4 767 cl_init_zero(pRExC_state, &accum);
c277df42 768 while (OP(scan) == code) {
830247a4 769 I32 deltanext, minnext, f = 0, fake;
653099ff 770 struct regnode_charclass_class this_class;
c277df42
IZ
771
772 num++;
773 data_fake.flags = 0;
b81d288d 774 if (data) {
2c2d71f5 775 data_fake.whilem_c = data->whilem_c;
cb434fcc
IZ
776 data_fake.last_closep = data->last_closep;
777 }
778 else
779 data_fake.last_closep = &fake;
c277df42
IZ
780 next = regnext(scan);
781 scan = NEXTOPER(scan);
782 if (code != BRANCH)
783 scan = NEXTOPER(scan);
653099ff 784 if (flags & SCF_DO_STCLASS) {
830247a4 785 cl_init(pRExC_state, &this_class);
653099ff
GS
786 data_fake.start_class = &this_class;
787 f = SCF_DO_STCLASS_AND;
b81d288d 788 }
e1901655
IZ
789 if (flags & SCF_WHILEM_VISITED_POS)
790 f |= SCF_WHILEM_VISITED_POS;
653099ff 791 /* we suppose the run is continuous, last=next...*/
830247a4
IZ
792 minnext = study_chunk(pRExC_state, &scan, &deltanext,
793 next, &data_fake, f);
b81d288d 794 if (min1 > minnext)
c277df42
IZ
795 min1 = minnext;
796 if (max1 < minnext + deltanext)
797 max1 = minnext + deltanext;
798 if (deltanext == I32_MAX)
aca2d497 799 is_inf = is_inf_internal = 1;
c277df42
IZ
800 scan = next;
801 if (data_fake.flags & (SF_HAS_PAR|SF_IN_PAR))
802 pars++;
405ff068 803 if (data && (data_fake.flags & SF_HAS_EVAL))
c277df42 804 data->flags |= SF_HAS_EVAL;
2c2d71f5
JH
805 if (data)
806 data->whilem_c = data_fake.whilem_c;
653099ff 807 if (flags & SCF_DO_STCLASS)
830247a4 808 cl_or(pRExC_state, &accum, &this_class);
b81d288d 809 if (code == SUSPEND)
c277df42
IZ
810 break;
811 }
812 if (code == IFTHEN && num < 2) /* Empty ELSE branch */
813 min1 = 0;
814 if (flags & SCF_DO_SUBSTR) {
815 data->pos_min += min1;
816 data->pos_delta += max1 - min1;
817 if (max1 != min1 || is_inf)
818 data->longest = &(data->longest_float);
819 }
820 min += min1;
821 delta += max1 - min1;
653099ff 822 if (flags & SCF_DO_STCLASS_OR) {
830247a4 823 cl_or(pRExC_state, data->start_class, &accum);
653099ff
GS
824 if (min1) {
825 cl_and(data->start_class, &and_with);
826 flags &= ~SCF_DO_STCLASS;
827 }
828 }
829 else if (flags & SCF_DO_STCLASS_AND) {
de0c8cb8
GS
830 if (min1) {
831 cl_and(data->start_class, &accum);
653099ff 832 flags &= ~SCF_DO_STCLASS;
de0c8cb8
GS
833 }
834 else {
b81d288d 835 /* Switch to OR mode: cache the old value of
de0c8cb8
GS
836 * data->start_class */
837 StructCopy(data->start_class, &and_with,
838 struct regnode_charclass_class);
839 flags &= ~SCF_DO_STCLASS_AND;
840 StructCopy(&accum, data->start_class,
841 struct regnode_charclass_class);
842 flags |= SCF_DO_STCLASS_OR;
843 data->start_class->flags |= ANYOF_EOS;
844 }
653099ff 845 }
a0ed51b3
LW
846 }
847 else if (code == BRANCHJ) /* single branch is optimized. */
c277df42
IZ
848 scan = NEXTOPER(NEXTOPER(scan));
849 else /* single branch is optimized. */
850 scan = NEXTOPER(scan);
851 continue;
a0ed51b3
LW
852 }
853 else if (OP(scan) == EXACT) {
cd439c50 854 I32 l = STR_LEN(scan);
1aa99e6b 855 UV uc = *((U8*)STRING(scan));
a0ed51b3 856 if (UTF) {
1aa99e6b
IH
857 U8 *s = (U8*)STRING(scan);
858 l = utf8_length(s, s + l);
9041c2e3 859 uc = utf8_to_uvchr(s, NULL);
a0ed51b3
LW
860 }
861 min += l;
c277df42 862 if (flags & SCF_DO_SUBSTR) { /* Update longest substr. */
c277df42
IZ
863 /* The code below prefers earlier match for fixed
864 offset, later match for variable offset. */
865 if (data->last_end == -1) { /* Update the start info. */
866 data->last_start_min = data->pos_min;
867 data->last_start_max = is_inf
b81d288d 868 ? I32_MAX : data->pos_min + data->pos_delta;
c277df42 869 }
cd439c50 870 sv_catpvn(data->last_found, STRING(scan), STR_LEN(scan));
c277df42
IZ
871 data->last_end = data->pos_min + l;
872 data->pos_min += l; /* As in the first entry. */
873 data->flags &= ~SF_BEFORE_EOL;
874 }
653099ff
GS
875 if (flags & SCF_DO_STCLASS_AND) {
876 /* Check whether it is compatible with what we know already! */
877 int compat = 1;
878
1aa99e6b 879 if (uc >= 0x100 ||
b81d288d 880 !(data->start_class->flags & (ANYOF_CLASS | ANYOF_LOCALE))
1aa99e6b 881 && !ANYOF_BITMAP_TEST(data->start_class, uc)
653099ff 882 && (!(data->start_class->flags & ANYOF_FOLD)
1aa99e6b 883 || !ANYOF_BITMAP_TEST(data->start_class, PL_fold[uc])))
653099ff
GS
884 compat = 0;
885 ANYOF_CLASS_ZERO(data->start_class);
886 ANYOF_BITMAP_ZERO(data->start_class);
887 if (compat)
1aa99e6b 888 ANYOF_BITMAP_SET(data->start_class, uc);
653099ff 889 data->start_class->flags &= ~ANYOF_EOS;
9b877dbb
IH
890 if (uc < 0x100)
891 data->start_class->flags &= ~ANYOF_UNICODE_ALL;
653099ff
GS
892 }
893 else if (flags & SCF_DO_STCLASS_OR) {
894 /* false positive possible if the class is case-folded */
1aa99e6b 895 if (uc < 0x100)
9b877dbb
IH
896 ANYOF_BITMAP_SET(data->start_class, uc);
897 else
898 data->start_class->flags |= ANYOF_UNICODE_ALL;
653099ff
GS
899 data->start_class->flags &= ~ANYOF_EOS;
900 cl_and(data->start_class, &and_with);
901 }
902 flags &= ~SCF_DO_STCLASS;
a0ed51b3 903 }
653099ff 904 else if (PL_regkind[(U8)OP(scan)] == EXACT) { /* But OP != EXACT! */
cd439c50 905 I32 l = STR_LEN(scan);
1aa99e6b 906 UV uc = *((U8*)STRING(scan));
653099ff
GS
907
908 /* Search for fixed substrings supports EXACT only. */
b81d288d 909 if (flags & SCF_DO_SUBSTR)
830247a4 910 scan_commit(pRExC_state, data);
a0ed51b3 911 if (UTF) {
1aa99e6b
IH
912 U8 *s = (U8 *)STRING(scan);
913 l = utf8_length(s, s + l);
9041c2e3 914 uc = utf8_to_uvchr(s, NULL);
a0ed51b3
LW
915 }
916 min += l;
c277df42 917 if (data && (flags & SCF_DO_SUBSTR))
a0ed51b3 918 data->pos_min += l;
653099ff
GS
919 if (flags & SCF_DO_STCLASS_AND) {
920 /* Check whether it is compatible with what we know already! */
921 int compat = 1;
922
1aa99e6b 923 if (uc >= 0x100 ||
b81d288d 924 !(data->start_class->flags & (ANYOF_CLASS | ANYOF_LOCALE))
1aa99e6b
IH
925 && !ANYOF_BITMAP_TEST(data->start_class, uc)
926 && !ANYOF_BITMAP_TEST(data->start_class, PL_fold[uc]))
653099ff
GS
927 compat = 0;
928 ANYOF_CLASS_ZERO(data->start_class);
929 ANYOF_BITMAP_ZERO(data->start_class);
930 if (compat) {
1aa99e6b 931 ANYOF_BITMAP_SET(data->start_class, uc);
653099ff
GS
932 data->start_class->flags &= ~ANYOF_EOS;
933 data->start_class->flags |= ANYOF_FOLD;
934 if (OP(scan) == EXACTFL)
935 data->start_class->flags |= ANYOF_LOCALE;
936 }
937 }
938 else if (flags & SCF_DO_STCLASS_OR) {
939 if (data->start_class->flags & ANYOF_FOLD) {
940 /* false positive possible if the class is case-folded.
941 Assume that the locale settings are the same... */
1aa99e6b
IH
942 if (uc < 0x100)
943 ANYOF_BITMAP_SET(data->start_class, uc);
653099ff
GS
944 data->start_class->flags &= ~ANYOF_EOS;
945 }
946 cl_and(data->start_class, &and_with);
947 }
948 flags &= ~SCF_DO_STCLASS;
a0ed51b3 949 }
4d61ec05 950 else if (strchr((char*)PL_varies,OP(scan))) {
aa7a4b56
MH
951 I32 mincount, maxcount, minnext, deltanext, fl;
952 I32 f = flags, pos_before = 0;
c277df42 953 regnode *oscan = scan;
653099ff
GS
954 struct regnode_charclass_class this_class;
955 struct regnode_charclass_class *oclass = NULL;
956
22c35a8c 957 switch (PL_regkind[(U8)OP(scan)]) {
653099ff 958 case WHILEM: /* End of (?:...)* . */
c277df42
IZ
959 scan = NEXTOPER(scan);
960 goto finish;
961 case PLUS:
653099ff 962 if (flags & (SCF_DO_SUBSTR | SCF_DO_STCLASS)) {
c277df42 963 next = NEXTOPER(scan);
653099ff 964 if (OP(next) == EXACT || (flags & SCF_DO_STCLASS)) {
b81d288d
AB
965 mincount = 1;
966 maxcount = REG_INFTY;
c277df42
IZ
967 next = regnext(scan);
968 scan = NEXTOPER(scan);
969 goto do_curly;
970 }
971 }
972 if (flags & SCF_DO_SUBSTR)
973 data->pos_min++;
974 min++;
975 /* Fall through. */
976 case STAR:
653099ff
GS
977 if (flags & SCF_DO_STCLASS) {
978 mincount = 0;
b81d288d 979 maxcount = REG_INFTY;
653099ff
GS
980 next = regnext(scan);
981 scan = NEXTOPER(scan);
982 goto do_curly;
983 }
b81d288d 984 is_inf = is_inf_internal = 1;
c277df42
IZ
985 scan = regnext(scan);
986 if (flags & SCF_DO_SUBSTR) {
830247a4 987 scan_commit(pRExC_state, data); /* Cannot extend fixed substrings */
c277df42
IZ
988 data->longest = &(data->longest_float);
989 }
990 goto optimize_curly_tail;
991 case CURLY:
b81d288d 992 mincount = ARG1(scan);
c277df42
IZ
993 maxcount = ARG2(scan);
994 next = regnext(scan);
cb434fcc
IZ
995 if (OP(scan) == CURLYX) {
996 I32 lp = (data ? *(data->last_closep) : 0);
997
998 scan->flags = ((lp <= U8_MAX) ? lp : U8_MAX);
999 }
c277df42
IZ
1000 scan = NEXTOPER(scan) + EXTRA_STEP_2ARGS;
1001 do_curly:
1002 if (flags & SCF_DO_SUBSTR) {
830247a4 1003 if (mincount == 0) scan_commit(pRExC_state,data); /* Cannot extend fixed substrings */
c277df42
IZ
1004 pos_before = data->pos_min;
1005 }
1006 if (data) {
1007 fl = data->flags;
1008 data->flags &= ~(SF_HAS_PAR|SF_IN_PAR|SF_HAS_EVAL);
1009 if (is_inf)
1010 data->flags |= SF_IS_INF;
1011 }
653099ff 1012 if (flags & SCF_DO_STCLASS) {
830247a4 1013 cl_init(pRExC_state, &this_class);
653099ff
GS
1014 oclass = data->start_class;
1015 data->start_class = &this_class;
1016 f |= SCF_DO_STCLASS_AND;
1017 f &= ~SCF_DO_STCLASS_OR;
1018 }
e1901655
IZ
1019 /* These are the cases when once a subexpression
1020 fails at a particular position, it cannot succeed
1021 even after backtracking at the enclosing scope.
b81d288d 1022
e1901655
IZ
1023 XXXX what if minimal match and we are at the
1024 initial run of {n,m}? */
1025 if ((mincount != maxcount - 1) && (maxcount != REG_INFTY))
1026 f &= ~SCF_WHILEM_VISITED_POS;
653099ff 1027
c277df42 1028 /* This will finish on WHILEM, setting scan, or on NULL: */
b81d288d
AB
1029 minnext = study_chunk(pRExC_state, &scan, &deltanext, last, data,
1030 mincount == 0
653099ff
GS
1031 ? (f & ~SCF_DO_SUBSTR) : f);
1032
1033 if (flags & SCF_DO_STCLASS)
1034 data->start_class = oclass;
1035 if (mincount == 0 || minnext == 0) {
1036 if (flags & SCF_DO_STCLASS_OR) {
830247a4 1037 cl_or(pRExC_state, data->start_class, &this_class);
653099ff
GS
1038 }
1039 else if (flags & SCF_DO_STCLASS_AND) {
b81d288d 1040 /* Switch to OR mode: cache the old value of
653099ff
GS
1041 * data->start_class */
1042 StructCopy(data->start_class, &and_with,
1043 struct regnode_charclass_class);
1044 flags &= ~SCF_DO_STCLASS_AND;
1045 StructCopy(&this_class, data->start_class,
1046 struct regnode_charclass_class);
1047 flags |= SCF_DO_STCLASS_OR;
1048 data->start_class->flags |= ANYOF_EOS;
1049 }
1050 } else { /* Non-zero len */
1051 if (flags & SCF_DO_STCLASS_OR) {
830247a4 1052 cl_or(pRExC_state, data->start_class, &this_class);
653099ff
GS
1053 cl_and(data->start_class, &and_with);
1054 }
1055 else if (flags & SCF_DO_STCLASS_AND)
1056 cl_and(data->start_class, &this_class);
1057 flags &= ~SCF_DO_STCLASS;
1058 }
c277df42
IZ
1059 if (!scan) /* It was not CURLYX, but CURLY. */
1060 scan = next;
b81d288d 1061 if (ckWARN(WARN_REGEXP) && (minnext + deltanext == 0)
821b33a5 1062 && !(data->flags & (SF_HAS_PAR|SF_IN_PAR))
17feb5d5 1063 && maxcount <= REG_INFTY/3) /* Complement check for big count */
b45f050a 1064 {
830247a4 1065 vWARN(RExC_parse,
b45f050a
JF
1066 "Quantifier unexpected on zero-length expression");
1067 }
1068
c277df42 1069 min += minnext * mincount;
b81d288d 1070 is_inf_internal |= ((maxcount == REG_INFTY
155aba94
GS
1071 && (minnext + deltanext) > 0)
1072 || deltanext == I32_MAX);
aca2d497 1073 is_inf |= is_inf_internal;
c277df42
IZ
1074 delta += (minnext + deltanext) * maxcount - minnext * mincount;
1075
1076 /* Try powerful optimization CURLYX => CURLYN. */
b81d288d 1077 if ( OP(oscan) == CURLYX && data
c277df42
IZ
1078 && data->flags & SF_IN_PAR
1079 && !(data->flags & SF_HAS_EVAL)
1080 && !deltanext && minnext == 1 ) {
1081 /* Try to optimize to CURLYN. */
1082 regnode *nxt = NEXTOPER(oscan) + EXTRA_STEP_2ARGS;
1083 regnode *nxt1 = nxt, *nxt2;
1084
1085 /* Skip open. */
1086 nxt = regnext(nxt);
4d61ec05 1087 if (!strchr((char*)PL_simple,OP(nxt))
22c35a8c 1088 && !(PL_regkind[(U8)OP(nxt)] == EXACT
b81d288d 1089 && STR_LEN(nxt) == 1))
c277df42
IZ
1090 goto nogo;
1091 nxt2 = nxt;
1092 nxt = regnext(nxt);
b81d288d 1093 if (OP(nxt) != CLOSE)
c277df42
IZ
1094 goto nogo;
1095 /* Now we know that nxt2 is the only contents: */
1096 oscan->flags = ARG(nxt);
1097 OP(oscan) = CURLYN;
1098 OP(nxt1) = NOTHING; /* was OPEN. */
1099#ifdef DEBUGGING
1100 OP(nxt1 + 1) = OPTIMIZED; /* was count. */
1101 NEXT_OFF(nxt1+ 1) = 0; /* just for consistancy. */
1102 NEXT_OFF(nxt2) = 0; /* just for consistancy with CURLY. */
1103 OP(nxt) = OPTIMIZED; /* was CLOSE. */
1104 OP(nxt + 1) = OPTIMIZED; /* was count. */
1105 NEXT_OFF(nxt+ 1) = 0; /* just for consistancy. */
b81d288d 1106#endif
c277df42 1107 }
c277df42
IZ
1108 nogo:
1109
1110 /* Try optimization CURLYX => CURLYM. */
b81d288d 1111 if ( OP(oscan) == CURLYX && data
c277df42 1112 && !(data->flags & SF_HAS_PAR)
c277df42
IZ
1113 && !(data->flags & SF_HAS_EVAL)
1114 && !deltanext ) {
1115 /* XXXX How to optimize if data == 0? */
1116 /* Optimize to a simpler form. */
1117 regnode *nxt = NEXTOPER(oscan) + EXTRA_STEP_2ARGS; /* OPEN */
1118 regnode *nxt2;
1119
1120 OP(oscan) = CURLYM;
1121 while ( (nxt2 = regnext(nxt)) /* skip over embedded stuff*/
b81d288d 1122 && (OP(nxt2) != WHILEM))
c277df42
IZ
1123 nxt = nxt2;
1124 OP(nxt2) = SUCCEED; /* Whas WHILEM */
c277df42
IZ
1125 /* Need to optimize away parenths. */
1126 if (data->flags & SF_IN_PAR) {
1127 /* Set the parenth number. */
1128 regnode *nxt1 = NEXTOPER(oscan) + EXTRA_STEP_2ARGS; /* OPEN*/
1129
b81d288d 1130 if (OP(nxt) != CLOSE)
b45f050a 1131 FAIL("Panic opt close");
c277df42
IZ
1132 oscan->flags = ARG(nxt);
1133 OP(nxt1) = OPTIMIZED; /* was OPEN. */
1134 OP(nxt) = OPTIMIZED; /* was CLOSE. */
1135#ifdef DEBUGGING
1136 OP(nxt1 + 1) = OPTIMIZED; /* was count. */
1137 OP(nxt + 1) = OPTIMIZED; /* was count. */
1138 NEXT_OFF(nxt1 + 1) = 0; /* just for consistancy. */
1139 NEXT_OFF(nxt + 1) = 0; /* just for consistancy. */
b81d288d 1140#endif
c277df42
IZ
1141#if 0
1142 while ( nxt1 && (OP(nxt1) != WHILEM)) {
1143 regnode *nnxt = regnext(nxt1);
b81d288d 1144
c277df42
IZ
1145 if (nnxt == nxt) {
1146 if (reg_off_by_arg[OP(nxt1)])
1147 ARG_SET(nxt1, nxt2 - nxt1);
1148 else if (nxt2 - nxt1 < U16_MAX)
1149 NEXT_OFF(nxt1) = nxt2 - nxt1;
1150 else
1151 OP(nxt) = NOTHING; /* Cannot beautify */
1152 }
1153 nxt1 = nnxt;
1154 }
1155#endif
1156 /* Optimize again: */
b81d288d 1157 study_chunk(pRExC_state, &nxt1, &deltanext, nxt,
e1901655 1158 NULL, 0);
a0ed51b3
LW
1159 }
1160 else
c277df42 1161 oscan->flags = 0;
c277df42 1162 }
e1901655
IZ
1163 else if ((OP(oscan) == CURLYX)
1164 && (flags & SCF_WHILEM_VISITED_POS)
1165 /* See the comment on a similar expression above.
1166 However, this time it not a subexpression
1167 we care about, but the expression itself. */
1168 && (maxcount == REG_INFTY)
1169 && data && ++data->whilem_c < 16) {
1170 /* This stays as CURLYX, we can put the count/of pair. */
2c2d71f5
JH
1171 /* Find WHILEM (as in regexec.c) */
1172 regnode *nxt = oscan + NEXT_OFF(oscan);
1173
1174 if (OP(PREVOPER(nxt)) == NOTHING) /* LONGJMP */
1175 nxt += ARG(nxt);
1176 PREVOPER(nxt)->flags = data->whilem_c
830247a4 1177 | (RExC_whilem_seen << 4); /* On WHILEM */
2c2d71f5 1178 }
b81d288d 1179 if (data && fl & (SF_HAS_PAR|SF_IN_PAR))
c277df42
IZ
1180 pars++;
1181 if (flags & SCF_DO_SUBSTR) {
1182 SV *last_str = Nullsv;
1183 int counted = mincount != 0;
1184
1185 if (data->last_end > 0 && mincount != 0) { /* Ends with a string. */
b81d288d 1186 I32 b = pos_before >= data->last_start_min
c277df42
IZ
1187 ? pos_before : data->last_start_min;
1188 STRLEN l;
1189 char *s = SvPV(data->last_found, l);
a0ed51b3
LW
1190 I32 old = b - data->last_start_min;
1191
1192 if (UTF)
1193 old = utf8_hop((U8*)s, old) - (U8*)s;
c277df42 1194
a0ed51b3 1195 l -= old;
c277df42 1196 /* Get the added string: */
79cb57f6 1197 last_str = newSVpvn(s + old, l);
c277df42
IZ
1198 if (deltanext == 0 && pos_before == b) {
1199 /* What was added is a constant string */
1200 if (mincount > 1) {
1201 SvGROW(last_str, (mincount * l) + 1);
b81d288d 1202 repeatcpy(SvPVX(last_str) + l,
c277df42
IZ
1203 SvPVX(last_str), l, mincount - 1);
1204 SvCUR(last_str) *= mincount;
1205 /* Add additional parts. */
b81d288d 1206 SvCUR_set(data->last_found,
c277df42
IZ
1207 SvCUR(data->last_found) - l);
1208 sv_catsv(data->last_found, last_str);
1209 data->last_end += l * (mincount - 1);
1210 }
2a8d9689
HS
1211 } else {
1212 /* start offset must point into the last copy */
1213 data->last_start_min += minnext * (mincount - 1);
4b2cff9a
HS
1214 data->last_start_max += is_inf ? 0 : (maxcount - 1)
1215 * (minnext + data->pos_delta);
c277df42
IZ
1216 }
1217 }
1218 /* It is counted once already... */
1219 data->pos_min += minnext * (mincount - counted);
1220 data->pos_delta += - counted * deltanext +
1221 (minnext + deltanext) * maxcount - minnext * mincount;
1222 if (mincount != maxcount) {
653099ff
GS
1223 /* Cannot extend fixed substrings found inside
1224 the group. */
830247a4 1225 scan_commit(pRExC_state,data);
c277df42
IZ
1226 if (mincount && last_str) {
1227 sv_setsv(data->last_found, last_str);
1228 data->last_end = data->pos_min;
b81d288d 1229 data->last_start_min =
a0ed51b3 1230 data->pos_min - CHR_SVLEN(last_str);
b81d288d
AB
1231 data->last_start_max = is_inf
1232 ? I32_MAX
c277df42 1233 : data->pos_min + data->pos_delta
a0ed51b3 1234 - CHR_SVLEN(last_str);
c277df42
IZ
1235 }
1236 data->longest = &(data->longest_float);
1237 }
aca2d497 1238 SvREFCNT_dec(last_str);
c277df42 1239 }
405ff068 1240 if (data && (fl & SF_HAS_EVAL))
c277df42
IZ
1241 data->flags |= SF_HAS_EVAL;
1242 optimize_curly_tail:
c277df42 1243 if (OP(oscan) != CURLYX) {
22c35a8c 1244 while (PL_regkind[(U8)OP(next = regnext(oscan))] == NOTHING
c277df42
IZ
1245 && NEXT_OFF(next))
1246 NEXT_OFF(oscan) += NEXT_OFF(next);
1247 }
c277df42 1248 continue;
653099ff 1249 default: /* REF and CLUMP only? */
c277df42 1250 if (flags & SCF_DO_SUBSTR) {
830247a4 1251 scan_commit(pRExC_state,data); /* Cannot expect anything... */
c277df42
IZ
1252 data->longest = &(data->longest_float);
1253 }
aca2d497 1254 is_inf = is_inf_internal = 1;
653099ff 1255 if (flags & SCF_DO_STCLASS_OR)
830247a4 1256 cl_anything(pRExC_state, data->start_class);
653099ff 1257 flags &= ~SCF_DO_STCLASS;
c277df42
IZ
1258 break;
1259 }
a0ed51b3 1260 }
ffc61ed2 1261 else if (strchr((char*)PL_simple,OP(scan))) {
653099ff
GS
1262 int value;
1263
c277df42 1264 if (flags & SCF_DO_SUBSTR) {
830247a4 1265 scan_commit(pRExC_state,data);
c277df42
IZ
1266 data->pos_min++;
1267 }
1268 min++;
653099ff
GS
1269 if (flags & SCF_DO_STCLASS) {
1270 data->start_class->flags &= ~ANYOF_EOS; /* No match on empty */
1271
1272 /* Some of the logic below assumes that switching
1273 locale on will only add false positives. */
1274 switch (PL_regkind[(U8)OP(scan)]) {
653099ff 1275 case SANY:
653099ff
GS
1276 default:
1277 do_default:
1278 /* Perl_croak(aTHX_ "panic: unexpected simple REx opcode %d", OP(scan)); */
1279 if (flags & SCF_DO_STCLASS_OR) /* Allow everything */
830247a4 1280 cl_anything(pRExC_state, data->start_class);
653099ff
GS
1281 break;
1282 case REG_ANY:
1283 if (OP(scan) == SANY)
1284 goto do_default;
1285 if (flags & SCF_DO_STCLASS_OR) { /* Everything but \n */
1286 value = (ANYOF_BITMAP_TEST(data->start_class,'\n')
1287 || (data->start_class->flags & ANYOF_CLASS));
830247a4 1288 cl_anything(pRExC_state, data->start_class);
653099ff
GS
1289 }
1290 if (flags & SCF_DO_STCLASS_AND || !value)
1291 ANYOF_BITMAP_CLEAR(data->start_class,'\n');
1292 break;
1293 case ANYOF:
1294 if (flags & SCF_DO_STCLASS_AND)
1295 cl_and(data->start_class,
1296 (struct regnode_charclass_class*)scan);
1297 else
830247a4 1298 cl_or(pRExC_state, data->start_class,
653099ff
GS
1299 (struct regnode_charclass_class*)scan);
1300 break;
1301 case ALNUM:
1302 if (flags & SCF_DO_STCLASS_AND) {
1303 if (!(data->start_class->flags & ANYOF_LOCALE)) {
1304 ANYOF_CLASS_CLEAR(data->start_class,ANYOF_NALNUM);
1305 for (value = 0; value < 256; value++)
1306 if (!isALNUM(value))
1307 ANYOF_BITMAP_CLEAR(data->start_class, value);
1308 }
1309 }
1310 else {
1311 if (data->start_class->flags & ANYOF_LOCALE)
1312 ANYOF_CLASS_SET(data->start_class,ANYOF_ALNUM);
1313 else {
1314 for (value = 0; value < 256; value++)
1315 if (isALNUM(value))
b81d288d 1316 ANYOF_BITMAP_SET(data->start_class, value);
653099ff
GS
1317 }
1318 }
1319 break;
1320 case ALNUML:
1321 if (flags & SCF_DO_STCLASS_AND) {
1322 if (data->start_class->flags & ANYOF_LOCALE)
1323 ANYOF_CLASS_CLEAR(data->start_class,ANYOF_NALNUM);
1324 }
1325 else {
1326 ANYOF_CLASS_SET(data->start_class,ANYOF_ALNUM);
1327 data->start_class->flags |= ANYOF_LOCALE;
1328 }
1329 break;
1330 case NALNUM:
1331 if (flags & SCF_DO_STCLASS_AND) {
1332 if (!(data->start_class->flags & ANYOF_LOCALE)) {
1333 ANYOF_CLASS_CLEAR(data->start_class,ANYOF_ALNUM);
1334 for (value = 0; value < 256; value++)
1335 if (isALNUM(value))
1336 ANYOF_BITMAP_CLEAR(data->start_class, value);
1337 }
1338 }
1339 else {
1340 if (data->start_class->flags & ANYOF_LOCALE)
1341 ANYOF_CLASS_SET(data->start_class,ANYOF_NALNUM);
1342 else {
1343 for (value = 0; value < 256; value++)
1344 if (!isALNUM(value))
b81d288d 1345 ANYOF_BITMAP_SET(data->start_class, value);
653099ff
GS
1346 }
1347 }
1348 break;
1349 case NALNUML:
1350 if (flags & SCF_DO_STCLASS_AND) {
1351 if (data->start_class->flags & ANYOF_LOCALE)
1352 ANYOF_CLASS_CLEAR(data->start_class,ANYOF_ALNUM);
1353 }
1354 else {
1355 data->start_class->flags |= ANYOF_LOCALE;
1356 ANYOF_CLASS_SET(data->start_class,ANYOF_NALNUM);
1357 }
1358 break;
1359 case SPACE:
1360 if (flags & SCF_DO_STCLASS_AND) {
1361 if (!(data->start_class->flags & ANYOF_LOCALE)) {
1362 ANYOF_CLASS_CLEAR(data->start_class,ANYOF_NSPACE);
1363 for (value = 0; value < 256; value++)
1364 if (!isSPACE(value))
1365 ANYOF_BITMAP_CLEAR(data->start_class, value);
1366 }
1367 }
1368 else {
1369 if (data->start_class->flags & ANYOF_LOCALE)
1370 ANYOF_CLASS_SET(data->start_class,ANYOF_SPACE);
1371 else {
1372 for (value = 0; value < 256; value++)
1373 if (isSPACE(value))
b81d288d 1374 ANYOF_BITMAP_SET(data->start_class, value);
653099ff
GS
1375 }
1376 }
1377 break;
1378 case SPACEL:
1379 if (flags & SCF_DO_STCLASS_AND) {
1380 if (data->start_class->flags & ANYOF_LOCALE)
1381 ANYOF_CLASS_CLEAR(data->start_class,ANYOF_NSPACE);
1382 }
1383 else {
1384 data->start_class->flags |= ANYOF_LOCALE;
1385 ANYOF_CLASS_SET(data->start_class,ANYOF_SPACE);
1386 }
1387 break;
1388 case NSPACE:
1389 if (flags & SCF_DO_STCLASS_AND) {
1390 if (!(data->start_class->flags & ANYOF_LOCALE)) {
1391 ANYOF_CLASS_CLEAR(data->start_class,ANYOF_SPACE);
1392 for (value = 0; value < 256; value++)
1393 if (isSPACE(value))
1394 ANYOF_BITMAP_CLEAR(data->start_class, value);
1395 }
1396 }
1397 else {
1398 if (data->start_class->flags & ANYOF_LOCALE)
1399 ANYOF_CLASS_SET(data->start_class,ANYOF_NSPACE);
1400 else {
1401 for (value = 0; value < 256; value++)
1402 if (!isSPACE(value))
b81d288d 1403 ANYOF_BITMAP_SET(data->start_class, value);
653099ff
GS
1404 }
1405 }
1406 break;
1407 case NSPACEL:
1408 if (flags & SCF_DO_STCLASS_AND) {
1409 if (data->start_class->flags & ANYOF_LOCALE) {
1410 ANYOF_CLASS_CLEAR(data->start_class,ANYOF_SPACE);
1411 for (value = 0; value < 256; value++)
1412 if (!isSPACE(value))
1413 ANYOF_BITMAP_CLEAR(data->start_class, value);
1414 }
1415 }
1416 else {
1417 data->start_class->flags |= ANYOF_LOCALE;
1418 ANYOF_CLASS_SET(data->start_class,ANYOF_NSPACE);
1419 }
1420 break;
1421 case DIGIT:
1422 if (flags & SCF_DO_STCLASS_AND) {
1423 ANYOF_CLASS_CLEAR(data->start_class,ANYOF_NDIGIT);
1424 for (value = 0; value < 256; value++)
1425 if (!isDIGIT(value))
1426 ANYOF_BITMAP_CLEAR(data->start_class, value);
1427 }
1428 else {
1429 if (data->start_class->flags & ANYOF_LOCALE)
1430 ANYOF_CLASS_SET(data->start_class,ANYOF_DIGIT);
1431 else {
1432 for (value = 0; value < 256; value++)
1433 if (isDIGIT(value))
b81d288d 1434 ANYOF_BITMAP_SET(data->start_class, value);
653099ff
GS
1435 }
1436 }
1437 break;
1438 case NDIGIT:
1439 if (flags & SCF_DO_STCLASS_AND) {
1440 ANYOF_CLASS_CLEAR(data->start_class,ANYOF_DIGIT);
1441 for (value = 0; value < 256; value++)
1442 if (isDIGIT(value))
1443 ANYOF_BITMAP_CLEAR(data->start_class, value);
1444 }
1445 else {
1446 if (data->start_class->flags & ANYOF_LOCALE)
1447 ANYOF_CLASS_SET(data->start_class,ANYOF_NDIGIT);
1448 else {
1449 for (value = 0; value < 256; value++)
1450 if (!isDIGIT(value))
b81d288d 1451 ANYOF_BITMAP_SET(data->start_class, value);
653099ff
GS
1452 }
1453 }
1454 break;
1455 }
1456 if (flags & SCF_DO_STCLASS_OR)
1457 cl_and(data->start_class, &and_with);
1458 flags &= ~SCF_DO_STCLASS;
1459 }
a0ed51b3 1460 }
22c35a8c 1461 else if (PL_regkind[(U8)OP(scan)] == EOL && flags & SCF_DO_SUBSTR) {
c277df42
IZ
1462 data->flags |= (OP(scan) == MEOL
1463 ? SF_BEFORE_MEOL
1464 : SF_BEFORE_SEOL);
a0ed51b3 1465 }
653099ff
GS
1466 else if ( PL_regkind[(U8)OP(scan)] == BRANCHJ
1467 /* Lookbehind, or need to calculate parens/evals/stclass: */
1468 && (scan->flags || data || (flags & SCF_DO_STCLASS))
c277df42 1469 && (OP(scan) == IFMATCH || OP(scan) == UNLESSM)) {
653099ff 1470 /* Lookahead/lookbehind */
cb434fcc 1471 I32 deltanext, minnext, fake = 0;
c277df42 1472 regnode *nscan;
653099ff
GS
1473 struct regnode_charclass_class intrnl;
1474 int f = 0;
c277df42
IZ
1475
1476 data_fake.flags = 0;
b81d288d 1477 if (data) {
2c2d71f5 1478 data_fake.whilem_c = data->whilem_c;
cb434fcc
IZ
1479 data_fake.last_closep = data->last_closep;
1480 }
1481 else
1482 data_fake.last_closep = &fake;
653099ff
GS
1483 if ( flags & SCF_DO_STCLASS && !scan->flags
1484 && OP(scan) == IFMATCH ) { /* Lookahead */
830247a4 1485 cl_init(pRExC_state, &intrnl);
653099ff 1486 data_fake.start_class = &intrnl;
e1901655 1487 f |= SCF_DO_STCLASS_AND;
653099ff 1488 }
e1901655
IZ
1489 if (flags & SCF_WHILEM_VISITED_POS)
1490 f |= SCF_WHILEM_VISITED_POS;
c277df42
IZ
1491 next = regnext(scan);
1492 nscan = NEXTOPER(NEXTOPER(scan));
830247a4 1493 minnext = study_chunk(pRExC_state, &nscan, &deltanext, last, &data_fake, f);
c277df42
IZ
1494 if (scan->flags) {
1495 if (deltanext) {
9baa0206 1496 vFAIL("Variable length lookbehind not implemented");
a0ed51b3
LW
1497 }
1498 else if (minnext > U8_MAX) {
9baa0206 1499 vFAIL2("Lookbehind longer than %"UVuf" not implemented", (UV)U8_MAX);
c277df42
IZ
1500 }
1501 scan->flags = minnext;
1502 }
1503 if (data && data_fake.flags & (SF_HAS_PAR|SF_IN_PAR))
1504 pars++;
405ff068 1505 if (data && (data_fake.flags & SF_HAS_EVAL))
c277df42 1506 data->flags |= SF_HAS_EVAL;
2c2d71f5
JH
1507 if (data)
1508 data->whilem_c = data_fake.whilem_c;
e1901655 1509 if (f & SCF_DO_STCLASS_AND) {
653099ff
GS
1510 int was = (data->start_class->flags & ANYOF_EOS);
1511
1512 cl_and(data->start_class, &intrnl);
1513 if (was)
1514 data->start_class->flags |= ANYOF_EOS;
1515 }
a0ed51b3
LW
1516 }
1517 else if (OP(scan) == OPEN) {
c277df42 1518 pars++;
a0ed51b3 1519 }
cb434fcc
IZ
1520 else if (OP(scan) == CLOSE) {
1521 if (ARG(scan) == is_par) {
1522 next = regnext(scan);
c277df42 1523
cb434fcc
IZ
1524 if ( next && (OP(next) != WHILEM) && next < last)
1525 is_par = 0; /* Disable optimization */
1526 }
1527 if (data)
1528 *(data->last_closep) = ARG(scan);
a0ed51b3
LW
1529 }
1530 else if (OP(scan) == EVAL) {
c277df42
IZ
1531 if (data)
1532 data->flags |= SF_HAS_EVAL;
1533 }
96776eda 1534 else if (OP(scan) == LOGICAL && scan->flags == 2) { /* Embedded follows */
0f5d15d6 1535 if (flags & SCF_DO_SUBSTR) {
830247a4 1536 scan_commit(pRExC_state,data);
0f5d15d6
IZ
1537 data->longest = &(data->longest_float);
1538 }
1539 is_inf = is_inf_internal = 1;
653099ff 1540 if (flags & SCF_DO_STCLASS_OR) /* Allow everything */
830247a4 1541 cl_anything(pRExC_state, data->start_class);
96776eda 1542 flags &= ~SCF_DO_STCLASS;
0f5d15d6 1543 }
c277df42
IZ
1544 /* Else: zero-length, ignore. */
1545 scan = regnext(scan);
1546 }
1547
1548 finish:
1549 *scanp = scan;
aca2d497 1550 *deltap = is_inf_internal ? I32_MAX : delta;
b81d288d 1551 if (flags & SCF_DO_SUBSTR && is_inf)
c277df42
IZ
1552 data->pos_delta = I32_MAX - data->pos_min;
1553 if (is_par > U8_MAX)
1554 is_par = 0;
1555 if (is_par && pars==1 && data) {
1556 data->flags |= SF_IN_PAR;
1557 data->flags &= ~SF_HAS_PAR;
a0ed51b3
LW
1558 }
1559 else if (pars && data) {
c277df42
IZ
1560 data->flags |= SF_HAS_PAR;
1561 data->flags &= ~SF_IN_PAR;
1562 }
653099ff
GS
1563 if (flags & SCF_DO_STCLASS_OR)
1564 cl_and(data->start_class, &and_with);
c277df42
IZ
1565 return min;
1566}
1567
76e3520e 1568STATIC I32
830247a4 1569S_add_data(pTHX_ RExC_state_t *pRExC_state, I32 n, char *s)
c277df42 1570{
830247a4 1571 if (RExC_rx->data) {
b81d288d
AB
1572 Renewc(RExC_rx->data,
1573 sizeof(*RExC_rx->data) + sizeof(void*) * (RExC_rx->data->count + n - 1),
c277df42 1574 char, struct reg_data);
830247a4
IZ
1575 Renew(RExC_rx->data->what, RExC_rx->data->count + n, U8);
1576 RExC_rx->data->count += n;
a0ed51b3
LW
1577 }
1578 else {
830247a4 1579 Newc(1207, RExC_rx->data, sizeof(*RExC_rx->data) + sizeof(void*) * (n - 1),
c277df42 1580 char, struct reg_data);
830247a4
IZ
1581 New(1208, RExC_rx->data->what, n, U8);
1582 RExC_rx->data->count = n;
c277df42 1583 }
830247a4
IZ
1584 Copy(s, RExC_rx->data->what + RExC_rx->data->count - n, n, U8);
1585 return RExC_rx->data->count - n;
c277df42
IZ
1586}
1587
d88dccdf 1588void
864dbfa3 1589Perl_reginitcolors(pTHX)
d88dccdf 1590{
d88dccdf
IZ
1591 int i = 0;
1592 char *s = PerlEnv_getenv("PERL_RE_COLORS");
b81d288d 1593
d88dccdf
IZ
1594 if (s) {
1595 PL_colors[0] = s = savepv(s);
1596 while (++i < 6) {
1597 s = strchr(s, '\t');
1598 if (s) {
1599 *s = '\0';
1600 PL_colors[i] = ++s;
1601 }
1602 else
c712d376 1603 PL_colors[i] = s = "";
d88dccdf
IZ
1604 }
1605 } else {
b81d288d 1606 while (i < 6)
d88dccdf
IZ
1607 PL_colors[i++] = "";
1608 }
1609 PL_colorset = 1;
1610}
1611
8615cb43 1612
a687059c 1613/*
e50aee73 1614 - pregcomp - compile a regular expression into internal code
a687059c
LW
1615 *
1616 * We can't allocate space until we know how big the compiled form will be,
1617 * but we can't compile it (and thus know how big it is) until we've got a
1618 * place to put the code. So we cheat: we compile it twice, once with code
1619 * generation turned off and size counting turned on, and once "for real".
1620 * This also means that we don't allocate space until we are sure that the
1621 * thing really will compile successfully, and we never have to move the
1622 * code and thus invalidate pointers into it. (Note that it has to be in
1623 * one piece because free() must be able to free it all.) [NB: not true in perl]
1624 *
1625 * Beware that the optimization-preparation code in here knows about some
1626 * of the structure of the compiled regexp. [I'll say.]
1627 */
1628regexp *
864dbfa3 1629Perl_pregcomp(pTHX_ char *exp, char *xend, PMOP *pm)
a687059c 1630{
a0d0e21e 1631 register regexp *r;
c277df42 1632 regnode *scan;
c277df42 1633 regnode *first;
a0d0e21e 1634 I32 flags;
a0d0e21e
LW
1635 I32 minlen = 0;
1636 I32 sawplus = 0;
1637 I32 sawopen = 0;
2c2d71f5 1638 scan_data_t data;
830247a4
IZ
1639 RExC_state_t RExC_state;
1640 RExC_state_t *pRExC_state = &RExC_state;
a0d0e21e
LW
1641
1642 if (exp == NULL)
c277df42 1643 FAIL("NULL regexp argument");
a0d0e21e 1644
830247a4 1645 /* XXXX This looks very suspicious... */
1aa99e6b
IH
1646 if (pm->op_pmdynflags & PMdf_CMP_UTF8)
1647 RExC_utf8 = 1;
a0ed51b3 1648 else
1aa99e6b 1649 RExC_utf8 = 0;
a0ed51b3 1650
5cfc7842 1651 RExC_precomp = exp;
35ef4773 1652 DEBUG_r(if (!PL_colorset) reginitcolors());
2c2d71f5 1653 DEBUG_r(PerlIO_printf(Perl_debug_log, "%sCompiling REx%s `%s%*s%s'\n",
d88dccdf 1654 PL_colors[4],PL_colors[5],PL_colors[0],
830247a4
IZ
1655 (int)(xend - exp), RExC_precomp, PL_colors[1]));
1656 RExC_flags16 = pm->op_pmflags;
1657 RExC_sawback = 0;
bbce6d69 1658
830247a4
IZ
1659 RExC_seen = 0;
1660 RExC_seen_zerolen = *exp == '^' ? -1 : 0;
1661 RExC_seen_evals = 0;
1662 RExC_extralen = 0;
c277df42 1663
bbce6d69 1664 /* First pass: determine size, legality. */
830247a4 1665 RExC_parse = exp;
fac92740 1666 RExC_start = exp;
830247a4
IZ
1667 RExC_end = xend;
1668 RExC_naughty = 0;
1669 RExC_npar = 1;
1670 RExC_size = 0L;
1671 RExC_emit = &PL_regdummy;
1672 RExC_whilem_seen = 0;
85ddcde9
JH
1673#if 0 /* REGC() is (currently) a NOP at the first pass.
1674 * Clever compilers notice this and complain. --jhi */
830247a4 1675 REGC((U8)REG_MAGIC, (char*)RExC_emit);
85ddcde9 1676#endif
830247a4 1677 if (reg(pRExC_state, 0, &flags) == NULL) {
830247a4 1678 RExC_precomp = Nullch;
a0d0e21e
LW
1679 return(NULL);
1680 }
830247a4 1681 DEBUG_r(PerlIO_printf(Perl_debug_log, "size %"IVdf" ", (IV)RExC_size));
c277df42 1682
c277df42
IZ
1683 /* Small enough for pointer-storage convention?
1684 If extralen==0, this means that we will not need long jumps. */
830247a4
IZ
1685 if (RExC_size >= 0x10000L && RExC_extralen)
1686 RExC_size += RExC_extralen;
c277df42 1687 else
830247a4
IZ
1688 RExC_extralen = 0;
1689 if (RExC_whilem_seen > 15)
1690 RExC_whilem_seen = 15;
a0d0e21e 1691
bbce6d69 1692 /* Allocate space and initialize. */
830247a4 1693 Newc(1001, r, sizeof(regexp) + (unsigned)RExC_size * sizeof(regnode),
c277df42 1694 char, regexp);
a0d0e21e 1695 if (r == NULL)
b45f050a
JF
1696 FAIL("Regexp out of space");
1697
0f79a09d
GS
1698#ifdef DEBUGGING
1699 /* avoid reading uninitialized memory in DEBUGGING code in study_chunk() */
830247a4 1700 Zero(r, sizeof(regexp) + (unsigned)RExC_size * sizeof(regnode), char);
0f79a09d 1701#endif
c277df42 1702 r->refcnt = 1;
bbce6d69 1703 r->prelen = xend - exp;
5cfc7842 1704 r->precomp = savepvn(RExC_precomp, r->prelen);
cf93c79d
IZ
1705 r->subbeg = NULL;
1706 r->reganch = pm->op_pmflags & PMf_COMPILETIME;
830247a4 1707 r->nparens = RExC_npar - 1; /* set early to validate backrefs */
4327152a
IZ
1708
1709 r->substrs = 0; /* Useful during FAIL. */
1710 r->startp = 0; /* Useful during FAIL. */
1711 r->endp = 0; /* Useful during FAIL. */
1712
fac92740
MJD
1713 Newz(1304, r->offsets, 2*RExC_size+1, U32); /* MJD 20001228 */
1714 if (r->offsets) {
1715 r->offsets[0] = RExC_size;
1716 }
1717 DEBUG_r(PerlIO_printf(Perl_debug_log,
392fbf5d 1718 "%s %"UVuf" bytes for offset annotations.\n",
fac92740 1719 r->offsets ? "Got" : "Couldn't get",
392fbf5d 1720 (UV)((2*RExC_size+1) * sizeof(U32))));
fac92740 1721
830247a4 1722 RExC_rx = r;
bbce6d69 1723
1724 /* Second pass: emit code. */
830247a4
IZ
1725 RExC_parse = exp;
1726 RExC_end = xend;
1727 RExC_naughty = 0;
1728 RExC_npar = 1;
fac92740 1729 RExC_emit_start = r->program;
830247a4 1730 RExC_emit = r->program;
2cd61cdb 1731 /* Store the count of eval-groups for security checks: */
830247a4
IZ
1732 RExC_emit->next_off = ((RExC_seen_evals > U16_MAX) ? U16_MAX : RExC_seen_evals);
1733 REGC((U8)REG_MAGIC, (char*) RExC_emit++);
c277df42 1734 r->data = 0;
830247a4 1735 if (reg(pRExC_state, 0, &flags) == NULL)
a0d0e21e
LW
1736 return(NULL);
1737
1738 /* Dig out information for optimizations. */
cf93c79d 1739 r->reganch = pm->op_pmflags & PMf_COMPILETIME; /* Again? */
830247a4 1740 pm->op_pmflags = RExC_flags16;
a0ed51b3
LW
1741 if (UTF)
1742 r->reganch |= ROPT_UTF8;
c277df42 1743 r->regstclass = NULL;
830247a4 1744 if (RExC_naughty >= 10) /* Probably an expensive pattern. */
a0ed51b3 1745 r->reganch |= ROPT_NAUGHTY;
c277df42 1746 scan = r->program + 1; /* First BRANCH. */
2779dcf1
IZ
1747
1748 /* XXXX To minimize changes to RE engine we always allocate
1749 3-units-long substrs field. */
1750 Newz(1004, r->substrs, 1, struct reg_substr_data);
1751
2c2d71f5 1752 StructCopy(&zero_scan_data, &data, scan_data_t);
653099ff 1753 /* XXXX Should not we check for something else? Usually it is OPEN1... */
c277df42 1754 if (OP(scan) != BRANCH) { /* Only one top-level choice. */
c277df42 1755 I32 fake;
c5254dd6 1756 STRLEN longest_float_length, longest_fixed_length;
653099ff
GS
1757 struct regnode_charclass_class ch_class;
1758 int stclass_flag;
cb434fcc 1759 I32 last_close = 0;
a0d0e21e
LW
1760
1761 first = scan;
c277df42 1762 /* Skip introductions and multiplicators >= 1. */
a0d0e21e 1763 while ((OP(first) == OPEN && (sawopen = 1)) ||
653099ff 1764 /* An OR of *one* alternative - should not happen now. */
a0d0e21e
LW
1765 (OP(first) == BRANCH && OP(regnext(first)) != BRANCH) ||
1766 (OP(first) == PLUS) ||
1767 (OP(first) == MINMOD) ||
653099ff 1768 /* An {n,m} with n>0 */
22c35a8c 1769 (PL_regkind[(U8)OP(first)] == CURLY && ARG1(first) > 0) ) {
a0d0e21e
LW
1770 if (OP(first) == PLUS)
1771 sawplus = 1;
1772 else
1773 first += regarglen[(U8)OP(first)];
1774 first = NEXTOPER(first);
a687059c
LW
1775 }
1776
a0d0e21e
LW
1777 /* Starting-point info. */
1778 again:
653099ff 1779 if (PL_regkind[(U8)OP(first)] == EXACT) {
1aa99e6b
IH
1780 if (OP(first) == EXACT)
1781 ; /* Empty, get anchored substr later. */
1782 else if ((OP(first) == EXACTF || OP(first) == EXACTFL))
b3c9acc1
IZ
1783 r->regstclass = first;
1784 }
653099ff 1785 else if (strchr((char*)PL_simple,OP(first)))
a0d0e21e 1786 r->regstclass = first;
22c35a8c
GS
1787 else if (PL_regkind[(U8)OP(first)] == BOUND ||
1788 PL_regkind[(U8)OP(first)] == NBOUND)
a0d0e21e 1789 r->regstclass = first;
22c35a8c 1790 else if (PL_regkind[(U8)OP(first)] == BOL) {
cad2e5aa
JH
1791 r->reganch |= (OP(first) == MBOL
1792 ? ROPT_ANCH_MBOL
1793 : (OP(first) == SBOL
1794 ? ROPT_ANCH_SBOL
1795 : ROPT_ANCH_BOL));
a0d0e21e 1796 first = NEXTOPER(first);
774d564b 1797 goto again;
1798 }
1799 else if (OP(first) == GPOS) {
1800 r->reganch |= ROPT_ANCH_GPOS;
1801 first = NEXTOPER(first);
1802 goto again;
a0d0e21e
LW
1803 }
1804 else if ((OP(first) == STAR &&
22c35a8c 1805 PL_regkind[(U8)OP(NEXTOPER(first))] == REG_ANY) &&
a0d0e21e
LW
1806 !(r->reganch & ROPT_ANCH) )
1807 {
1808 /* turn .* into ^.* with an implied $*=1 */
cad2e5aa
JH
1809 int type = OP(NEXTOPER(first));
1810
ffc61ed2 1811 if (type == REG_ANY)
cad2e5aa
JH
1812 type = ROPT_ANCH_MBOL;
1813 else
1814 type = ROPT_ANCH_SBOL;
1815
1816 r->reganch |= type | ROPT_IMPLICIT;
a0d0e21e 1817 first = NEXTOPER(first);
774d564b 1818 goto again;
a0d0e21e 1819 }
b81d288d 1820 if (sawplus && (!sawopen || !RExC_sawback)
830247a4 1821 && !(RExC_seen & REG_SEEN_EVAL)) /* May examine pos and $& */
cad2e5aa
JH
1822 /* x+ must match at the 1st pos of run of x's */
1823 r->reganch |= ROPT_SKIP;
a0d0e21e 1824
c277df42 1825 /* Scan is after the zeroth branch, first is atomic matcher. */
b81d288d 1826 DEBUG_r(PerlIO_printf(Perl_debug_log, "first at %"IVdf"\n",
b900a521 1827 (IV)(first - scan + 1)));
a0d0e21e
LW
1828 /*
1829 * If there's something expensive in the r.e., find the
1830 * longest literal string that must appear and make it the
1831 * regmust. Resolve ties in favor of later strings, since
1832 * the regstart check works with the beginning of the r.e.
1833 * and avoiding duplication strengthens checking. Not a
1834 * strong reason, but sufficient in the absence of others.
1835 * [Now we resolve ties in favor of the earlier string if
c277df42 1836 * it happens that c_offset_min has been invalidated, since the
a0d0e21e
LW
1837 * earlier string may buy us something the later one won't.]
1838 */
a0d0e21e 1839 minlen = 0;
a687059c 1840
79cb57f6
GS
1841 data.longest_fixed = newSVpvn("",0);
1842 data.longest_float = newSVpvn("",0);
1843 data.last_found = newSVpvn("",0);
c277df42
IZ
1844 data.longest = &(data.longest_fixed);
1845 first = scan;
653099ff 1846 if (!r->regstclass) {
830247a4 1847 cl_init(pRExC_state, &ch_class);
653099ff
GS
1848 data.start_class = &ch_class;
1849 stclass_flag = SCF_DO_STCLASS_AND;
1850 } else /* XXXX Check for BOUND? */
1851 stclass_flag = 0;
cb434fcc 1852 data.last_closep = &last_close;
653099ff 1853
830247a4 1854 minlen = study_chunk(pRExC_state, &first, &fake, scan + RExC_size, /* Up to end */
e1901655 1855 &data, SCF_DO_SUBSTR | SCF_WHILEM_VISITED_POS | stclass_flag);
830247a4 1856 if ( RExC_npar == 1 && data.longest == &(data.longest_fixed)
b81d288d 1857 && data.last_start_min == 0 && data.last_end > 0
830247a4
IZ
1858 && !RExC_seen_zerolen
1859 && (!(RExC_seen & REG_SEEN_GPOS) || (r->reganch & ROPT_ANCH_GPOS)))
c277df42 1860 r->reganch |= ROPT_CHECK_ALL;
830247a4 1861 scan_commit(pRExC_state, &data);
c277df42
IZ
1862 SvREFCNT_dec(data.last_found);
1863
a0ed51b3 1864 longest_float_length = CHR_SVLEN(data.longest_float);
c5254dd6 1865 if (longest_float_length
c277df42
IZ
1866 || (data.flags & SF_FL_BEFORE_EOL
1867 && (!(data.flags & SF_FL_BEFORE_MEOL)
830247a4 1868 || (RExC_flags16 & PMf_MULTILINE)))) {
cf93c79d
IZ
1869 int t;
1870
a0ed51b3 1871 if (SvCUR(data.longest_fixed) /* ok to leave SvCUR */
aca2d497
IZ
1872 && data.offset_fixed == data.offset_float_min
1873 && SvCUR(data.longest_fixed) == SvCUR(data.longest_float))
1874 goto remove_float; /* As in (a)+. */
1875
c277df42
IZ
1876 r->float_substr = data.longest_float;
1877 r->float_min_offset = data.offset_float_min;
1878 r->float_max_offset = data.offset_float_max;
cf93c79d
IZ
1879 t = (data.flags & SF_FL_BEFORE_EOL /* Can't have SEOL and MULTI */
1880 && (!(data.flags & SF_FL_BEFORE_MEOL)
830247a4 1881 || (RExC_flags16 & PMf_MULTILINE)));
cf93c79d 1882 fbm_compile(r->float_substr, t ? FBMcf_TAIL : 0);
a0ed51b3
LW
1883 }
1884 else {
aca2d497 1885 remove_float:
c277df42
IZ
1886 r->float_substr = Nullsv;
1887 SvREFCNT_dec(data.longest_float);
c5254dd6 1888 longest_float_length = 0;
a0d0e21e 1889 }
c277df42 1890
a0ed51b3 1891 longest_fixed_length = CHR_SVLEN(data.longest_fixed);
c5254dd6 1892 if (longest_fixed_length
c277df42
IZ
1893 || (data.flags & SF_FIX_BEFORE_EOL /* Cannot have SEOL and MULTI */
1894 && (!(data.flags & SF_FIX_BEFORE_MEOL)
830247a4 1895 || (RExC_flags16 & PMf_MULTILINE)))) {
cf93c79d
IZ
1896 int t;
1897
c277df42
IZ
1898 r->anchored_substr = data.longest_fixed;
1899 r->anchored_offset = data.offset_fixed;
cf93c79d
IZ
1900 t = (data.flags & SF_FIX_BEFORE_EOL /* Can't have SEOL and MULTI */
1901 && (!(data.flags & SF_FIX_BEFORE_MEOL)
830247a4 1902 || (RExC_flags16 & PMf_MULTILINE)));
cf93c79d 1903 fbm_compile(r->anchored_substr, t ? FBMcf_TAIL : 0);
a0ed51b3
LW
1904 }
1905 else {
c277df42
IZ
1906 r->anchored_substr = Nullsv;
1907 SvREFCNT_dec(data.longest_fixed);
c5254dd6 1908 longest_fixed_length = 0;
a0d0e21e 1909 }
b81d288d 1910 if (r->regstclass
ffc61ed2 1911 && (OP(r->regstclass) == REG_ANY || OP(r->regstclass) == SANY))
653099ff
GS
1912 r->regstclass = NULL;
1913 if ((!r->anchored_substr || r->anchored_offset) && stclass_flag
1914 && !(data.start_class->flags & ANYOF_EOS)
1915 && !cl_is_anything(data.start_class)) {
1916 SV *sv;
830247a4 1917 I32 n = add_data(pRExC_state, 1, "f");
653099ff 1918
b81d288d 1919 New(1006, RExC_rx->data->data[n], 1,
653099ff
GS
1920 struct regnode_charclass_class);
1921 StructCopy(data.start_class,
830247a4 1922 (struct regnode_charclass_class*)RExC_rx->data->data[n],
653099ff 1923 struct regnode_charclass_class);
830247a4 1924 r->regstclass = (regnode*)RExC_rx->data->data[n];
653099ff 1925 r->reganch &= ~ROPT_SKIP; /* Used in find_byclass(). */
ffc61ed2 1926 PL_regdata = r->data; /* for regprop() */
653099ff
GS
1927 DEBUG_r((sv = sv_newmortal(),
1928 regprop(sv, (regnode*)data.start_class),
894356b3 1929 PerlIO_printf(Perl_debug_log, "synthetic stclass `%s'.\n",
653099ff
GS
1930 SvPVX(sv))));
1931 }
c277df42
IZ
1932
1933 /* A temporary algorithm prefers floated substr to fixed one to dig more info. */
c5254dd6 1934 if (longest_fixed_length > longest_float_length) {
c277df42
IZ
1935 r->check_substr = r->anchored_substr;
1936 r->check_offset_min = r->check_offset_max = r->anchored_offset;
1937 if (r->reganch & ROPT_ANCH_SINGLE)
1938 r->reganch |= ROPT_NOSCAN;
a0ed51b3
LW
1939 }
1940 else {
c277df42
IZ
1941 r->check_substr = r->float_substr;
1942 r->check_offset_min = data.offset_float_min;
1943 r->check_offset_max = data.offset_float_max;
a0d0e21e 1944 }
30382c73
IZ
1945 /* XXXX Currently intuiting is not compatible with ANCH_GPOS.
1946 This should be changed ASAP! */
1947 if (r->check_substr && !(r->reganch & ROPT_ANCH_GPOS)) {
cad2e5aa
JH
1948 r->reganch |= RE_USE_INTUIT;
1949 if (SvTAIL(r->check_substr))
1950 r->reganch |= RE_INTUIT_TAIL;
1951 }
a0ed51b3
LW
1952 }
1953 else {
c277df42
IZ
1954 /* Several toplevels. Best we can is to set minlen. */
1955 I32 fake;
653099ff 1956 struct regnode_charclass_class ch_class;
cb434fcc 1957 I32 last_close = 0;
c277df42
IZ
1958
1959 DEBUG_r(PerlIO_printf(Perl_debug_log, "\n"));
1960 scan = r->program + 1;
830247a4 1961 cl_init(pRExC_state, &ch_class);
653099ff 1962 data.start_class = &ch_class;
cb434fcc 1963 data.last_closep = &last_close;
e1901655 1964 minlen = study_chunk(pRExC_state, &scan, &fake, scan + RExC_size, &data, SCF_DO_STCLASS_AND|SCF_WHILEM_VISITED_POS);
c277df42 1965 r->check_substr = r->anchored_substr = r->float_substr = Nullsv;
653099ff
GS
1966 if (!(data.start_class->flags & ANYOF_EOS)
1967 && !cl_is_anything(data.start_class)) {
1968 SV *sv;
830247a4 1969 I32 n = add_data(pRExC_state, 1, "f");
653099ff 1970
b81d288d 1971 New(1006, RExC_rx->data->data[n], 1,
653099ff
GS
1972 struct regnode_charclass_class);
1973 StructCopy(data.start_class,
830247a4 1974 (struct regnode_charclass_class*)RExC_rx->data->data[n],
653099ff 1975 struct regnode_charclass_class);
830247a4 1976 r->regstclass = (regnode*)RExC_rx->data->data[n];
653099ff
GS
1977 r->reganch &= ~ROPT_SKIP; /* Used in find_byclass(). */
1978 DEBUG_r((sv = sv_newmortal(),
1979 regprop(sv, (regnode*)data.start_class),
894356b3 1980 PerlIO_printf(Perl_debug_log, "synthetic stclass `%s'.\n",
653099ff
GS
1981 SvPVX(sv))));
1982 }
a0d0e21e
LW
1983 }
1984
a0d0e21e 1985 r->minlen = minlen;
b81d288d 1986 if (RExC_seen & REG_SEEN_GPOS)
c277df42 1987 r->reganch |= ROPT_GPOS_SEEN;
830247a4 1988 if (RExC_seen & REG_SEEN_LOOKBEHIND)
c277df42 1989 r->reganch |= ROPT_LOOKBEHIND_SEEN;
830247a4 1990 if (RExC_seen & REG_SEEN_EVAL)
ce862d02 1991 r->reganch |= ROPT_EVAL_SEEN;
3baa4c62
JH
1992 if (RExC_seen & REG_SEEN_SANY)
1993 r->reganch |= ROPT_SANY_SEEN;
830247a4
IZ
1994 Newz(1002, r->startp, RExC_npar, I32);
1995 Newz(1002, r->endp, RExC_npar, I32);
ffc61ed2 1996 PL_regdata = r->data; /* for regprop() */
a0d0e21e
LW
1997 DEBUG_r(regdump(r));
1998 return(r);
a687059c
LW
1999}
2000
2001/*
2002 - reg - regular expression, i.e. main body or parenthesized thing
2003 *
2004 * Caller must absorb opening parenthesis.
2005 *
2006 * Combining parenthesis handling with the base level of regular expression
2007 * is a trifle forced, but the need to tie the tails of the branches to what
2008 * follows makes it hard to avoid.
2009 */
76e3520e 2010STATIC regnode *
830247a4 2011S_reg(pTHX_ RExC_state_t *pRExC_state, I32 paren, I32 *flagp)
c277df42 2012 /* paren: Parenthesized? 0=top, 1=(, inside: changed to letter. */
a687059c 2013{
c277df42
IZ
2014 register regnode *ret; /* Will be the head of the group. */
2015 register regnode *br;
2016 register regnode *lastbr;
2017 register regnode *ender = 0;
a0d0e21e 2018 register I32 parno = 0;
830247a4 2019 I32 flags, oregflags = RExC_flags16, have_branch = 0, open = 0;
fac92740 2020 char * parse_start = RExC_parse; /* MJD */
830247a4 2021 char *oregcomp_parse = RExC_parse;
c277df42 2022 char c;
a0d0e21e 2023
821b33a5 2024 *flagp = 0; /* Tentatively. */
a0d0e21e
LW
2025
2026 /* Make an OPEN node, if parenthesized. */
2027 if (paren) {
fac92740 2028 if (*RExC_parse == '?') { /* (?...) */
ca9dfc88
IZ
2029 U16 posflags = 0, negflags = 0;
2030 U16 *flagsp = &posflags;
0f5d15d6 2031 int logical = 0;
830247a4 2032 char *seqstart = RExC_parse;
ca9dfc88 2033
830247a4
IZ
2034 RExC_parse++;
2035 paren = *RExC_parse++;
c277df42 2036 ret = NULL; /* For look-ahead/behind. */
a0d0e21e 2037 switch (paren) {
fac92740 2038 case '<': /* (?<...) */
830247a4 2039 RExC_seen |= REG_SEEN_LOOKBEHIND;
b81d288d 2040 if (*RExC_parse == '!')
c277df42 2041 paren = ',';
b81d288d 2042 if (*RExC_parse != '=' && *RExC_parse != '!')
c277df42 2043 goto unknown;
830247a4 2044 RExC_parse++;
fac92740
MJD
2045 case '=': /* (?=...) */
2046 case '!': /* (?!...) */
830247a4 2047 RExC_seen_zerolen++;
fac92740
MJD
2048 case ':': /* (?:...) */
2049 case '>': /* (?>...) */
a0d0e21e 2050 break;
fac92740
MJD
2051 case '$': /* (?$...) */
2052 case '@': /* (?@...) */
8615cb43 2053 vFAIL2("Sequence (?%c...) not implemented", (int)paren);
a0d0e21e 2054 break;
fac92740 2055 case '#': /* (?#...) */
830247a4
IZ
2056 while (*RExC_parse && *RExC_parse != ')')
2057 RExC_parse++;
2058 if (*RExC_parse != ')')
c277df42 2059 FAIL("Sequence (?#... not terminated");
830247a4 2060 nextchar(pRExC_state);
a0d0e21e
LW
2061 *flagp = TRYAGAIN;
2062 return NULL;
fac92740 2063 case 'p': /* (?p...) */
d7e9e385 2064 if (SIZE_ONLY)
830247a4 2065 vWARN(RExC_parse, "(?p{}) is deprecated - use (??{})");
8c8ad484 2066 /* FALL THROUGH*/
fac92740 2067 case '?': /* (??...) */
0f5d15d6 2068 logical = 1;
830247a4 2069 paren = *RExC_parse++;
0f5d15d6 2070 /* FALL THROUGH */
fac92740 2071 case '{': /* (?{...}) */
c277df42 2072 {
c277df42
IZ
2073 I32 count = 1, n = 0;
2074 char c;
830247a4 2075 char *s = RExC_parse;
c277df42
IZ
2076 SV *sv;
2077 OP_4tree *sop, *rop;
2078
830247a4
IZ
2079 RExC_seen_zerolen++;
2080 RExC_seen |= REG_SEEN_EVAL;
2081 while (count && (c = *RExC_parse)) {
2082 if (c == '\\' && RExC_parse[1])
2083 RExC_parse++;
b81d288d 2084 else if (c == '{')
c277df42 2085 count++;
b81d288d 2086 else if (c == '}')
c277df42 2087 count--;
830247a4 2088 RExC_parse++;
c277df42 2089 }
830247a4 2090 if (*RExC_parse != ')')
b45f050a 2091 {
b81d288d 2092 RExC_parse = s;
b45f050a
JF
2093 vFAIL("Sequence (?{...}) not terminated or not {}-balanced");
2094 }
c277df42
IZ
2095 if (!SIZE_ONLY) {
2096 AV *av;
b81d288d
AB
2097
2098 if (RExC_parse - 1 - s)
830247a4 2099 sv = newSVpvn(s, RExC_parse - 1 - s);
c277df42 2100 else
79cb57f6 2101 sv = newSVpvn("", 0);
c277df42 2102
569233ed
SB
2103 ENTER;
2104 Perl_save_re_context(aTHX);
c277df42 2105 rop = sv_compile_2op(sv, &sop, "re", &av);
569233ed 2106 LEAVE;
c277df42 2107
830247a4
IZ
2108 n = add_data(pRExC_state, 3, "nop");
2109 RExC_rx->data->data[n] = (void*)rop;
2110 RExC_rx->data->data[n+1] = (void*)sop;
2111 RExC_rx->data->data[n+2] = (void*)av;
c277df42 2112 SvREFCNT_dec(sv);
a0ed51b3 2113 }
e24b16f9 2114 else { /* First pass */
830247a4 2115 if (PL_reginterp_cnt < ++RExC_seen_evals
e24b16f9 2116 && PL_curcop != &PL_compiling)
2cd61cdb
IZ
2117 /* No compiled RE interpolated, has runtime
2118 components ===> unsafe. */
2119 FAIL("Eval-group not allowed at runtime, use re 'eval'");
5b61d3f7 2120 if (PL_tainting && PL_tainted)
cc6b7395 2121 FAIL("Eval-group in insecure regular expression");
c277df42
IZ
2122 }
2123
830247a4 2124 nextchar(pRExC_state);
0f5d15d6 2125 if (logical) {
830247a4 2126 ret = reg_node(pRExC_state, LOGICAL);
0f5d15d6
IZ
2127 if (!SIZE_ONLY)
2128 ret->flags = 2;
830247a4 2129 regtail(pRExC_state, ret, reganode(pRExC_state, EVAL, n));
fac92740 2130 /* deal with the length of this later - MJD */
0f5d15d6
IZ
2131 return ret;
2132 }
830247a4 2133 return reganode(pRExC_state, EVAL, n);
c277df42 2134 }
fac92740 2135 case '(': /* (?(?{...})...) and (?(?=...)...) */
c277df42 2136 {
fac92740 2137 if (RExC_parse[0] == '?') { /* (?(?...)) */
b81d288d
AB
2138 if (RExC_parse[1] == '=' || RExC_parse[1] == '!'
2139 || RExC_parse[1] == '<'
830247a4 2140 || RExC_parse[1] == '{') { /* Lookahead or eval. */
c277df42
IZ
2141 I32 flag;
2142
830247a4 2143 ret = reg_node(pRExC_state, LOGICAL);
0f5d15d6
IZ
2144 if (!SIZE_ONLY)
2145 ret->flags = 1;
830247a4 2146 regtail(pRExC_state, ret, reg(pRExC_state, 1, &flag));
c277df42 2147 goto insert_if;
b81d288d 2148 }
a0ed51b3 2149 }
830247a4 2150 else if (RExC_parse[0] >= '1' && RExC_parse[0] <= '9' ) {
fac92740 2151 /* (?(1)...) */
830247a4 2152 parno = atoi(RExC_parse++);
c277df42 2153
830247a4
IZ
2154 while (isDIGIT(*RExC_parse))
2155 RExC_parse++;
fac92740
MJD
2156 ret = reganode(pRExC_state, GROUPP, parno);
2157
830247a4 2158 if ((c = *nextchar(pRExC_state)) != ')')
b45f050a 2159 vFAIL("Switch condition not recognized");
c277df42 2160 insert_if:
830247a4
IZ
2161 regtail(pRExC_state, ret, reganode(pRExC_state, IFTHEN, 0));
2162 br = regbranch(pRExC_state, &flags, 1);
c277df42 2163 if (br == NULL)
830247a4 2164 br = reganode(pRExC_state, LONGJMP, 0);
c277df42 2165 else
830247a4
IZ
2166 regtail(pRExC_state, br, reganode(pRExC_state, LONGJMP, 0));
2167 c = *nextchar(pRExC_state);
d1b80229
IZ
2168 if (flags&HASWIDTH)
2169 *flagp |= HASWIDTH;
c277df42 2170 if (c == '|') {
830247a4
IZ
2171 lastbr = reganode(pRExC_state, IFTHEN, 0); /* Fake one for optimizer. */
2172 regbranch(pRExC_state, &flags, 1);
2173 regtail(pRExC_state, ret, lastbr);
d1b80229
IZ
2174 if (flags&HASWIDTH)
2175 *flagp |= HASWIDTH;
830247a4 2176 c = *nextchar(pRExC_state);
a0ed51b3
LW
2177 }
2178 else
c277df42
IZ
2179 lastbr = NULL;
2180 if (c != ')')
8615cb43 2181 vFAIL("Switch (?(condition)... contains too many branches");
830247a4
IZ
2182 ender = reg_node(pRExC_state, TAIL);
2183 regtail(pRExC_state, br, ender);
c277df42 2184 if (lastbr) {
830247a4
IZ
2185 regtail(pRExC_state, lastbr, ender);
2186 regtail(pRExC_state, NEXTOPER(NEXTOPER(lastbr)), ender);
a0ed51b3
LW
2187 }
2188 else
830247a4 2189 regtail(pRExC_state, ret, ender);
c277df42 2190 return ret;
a0ed51b3
LW
2191 }
2192 else {
830247a4 2193 vFAIL2("Unknown switch condition (?(%.2s", RExC_parse);
c277df42
IZ
2194 }
2195 }
1b1626e4 2196 case 0:
830247a4 2197 RExC_parse--; /* for vFAIL to print correctly */
8615cb43 2198 vFAIL("Sequence (? incomplete");
1b1626e4 2199 break;
a0d0e21e 2200 default:
830247a4 2201 --RExC_parse;
fac92740 2202 parse_flags: /* (?i) */
830247a4
IZ
2203 while (*RExC_parse && strchr("iogcmsx", *RExC_parse)) {
2204 if (*RExC_parse != 'o')
2205 pmflag(flagsp, *RExC_parse);
2206 ++RExC_parse;
ca9dfc88 2207 }
830247a4 2208 if (*RExC_parse == '-') {
ca9dfc88 2209 flagsp = &negflags;
830247a4 2210 ++RExC_parse;
ca9dfc88 2211 goto parse_flags;
48c036b1 2212 }
830247a4
IZ
2213 RExC_flags16 |= posflags;
2214 RExC_flags16 &= ~negflags;
2215 if (*RExC_parse == ':') {
2216 RExC_parse++;
ca9dfc88
IZ
2217 paren = ':';
2218 break;
2219 }
c277df42 2220 unknown:
830247a4
IZ
2221 if (*RExC_parse != ')') {
2222 RExC_parse++;
2223 vFAIL3("Sequence (%.*s...) not recognized", RExC_parse-seqstart, seqstart);
b45f050a 2224 }
830247a4 2225 nextchar(pRExC_state);
a0d0e21e
LW
2226 *flagp = TRYAGAIN;
2227 return NULL;
2228 }
2229 }
fac92740 2230 else { /* (...) */
830247a4
IZ
2231 parno = RExC_npar;
2232 RExC_npar++;
2233 ret = reganode(pRExC_state, OPEN, parno);
fac92740
MJD
2234 Set_Node_Length(ret, 1); /* MJD */
2235 Set_Node_Offset(ret, RExC_parse); /* MJD */
c277df42 2236 open = 1;
a0d0e21e 2237 }
a0ed51b3 2238 }
fac92740 2239 else /* ! paren */
a0d0e21e
LW
2240 ret = NULL;
2241
2242 /* Pick up the branches, linking them together. */
fac92740 2243 parse_start = RExC_parse; /* MJD */
830247a4 2244 br = regbranch(pRExC_state, &flags, 1);
fac92740
MJD
2245 /* branch_len = (paren != 0); */
2246
a0d0e21e
LW
2247 if (br == NULL)
2248 return(NULL);
830247a4
IZ
2249 if (*RExC_parse == '|') {
2250 if (!SIZE_ONLY && RExC_extralen) {
2251 reginsert(pRExC_state, BRANCHJ, br);
a0ed51b3 2252 }
fac92740 2253 else { /* MJD */
830247a4 2254 reginsert(pRExC_state, BRANCH, br);
fac92740
MJD
2255 Set_Node_Length(br, paren != 0);
2256 Set_Node_Offset_To_R(br-RExC_emit_start, parse_start-RExC_start);
2257 }
c277df42
IZ
2258 have_branch = 1;
2259 if (SIZE_ONLY)
830247a4 2260 RExC_extralen += 1; /* For BRANCHJ-BRANCH. */
a0ed51b3
LW
2261 }
2262 else if (paren == ':') {
c277df42
IZ
2263 *flagp |= flags&SIMPLE;
2264 }
2265 if (open) { /* Starts with OPEN. */
830247a4 2266 regtail(pRExC_state, ret, br); /* OPEN -> first. */
a0ed51b3
LW
2267 }
2268 else if (paren != '?') /* Not Conditional */
a0d0e21e 2269 ret = br;
821b33a5
IZ
2270 if (flags&HASWIDTH)
2271 *flagp |= HASWIDTH;
a0d0e21e 2272 *flagp |= flags&SPSTART;
c277df42 2273 lastbr = br;
830247a4
IZ
2274 while (*RExC_parse == '|') {
2275 if (!SIZE_ONLY && RExC_extralen) {
2276 ender = reganode(pRExC_state, LONGJMP,0);
2277 regtail(pRExC_state, NEXTOPER(NEXTOPER(lastbr)), ender); /* Append to the previous. */
c277df42
IZ
2278 }
2279 if (SIZE_ONLY)
830247a4
IZ
2280 RExC_extralen += 2; /* Account for LONGJMP. */
2281 nextchar(pRExC_state);
2282 br = regbranch(pRExC_state, &flags, 0);
fac92740 2283
a687059c 2284 if (br == NULL)
a0d0e21e 2285 return(NULL);
830247a4 2286 regtail(pRExC_state, lastbr, br); /* BRANCH -> BRANCH. */
c277df42 2287 lastbr = br;
821b33a5
IZ
2288 if (flags&HASWIDTH)
2289 *flagp |= HASWIDTH;
a687059c 2290 *flagp |= flags&SPSTART;
a0d0e21e
LW
2291 }
2292
c277df42
IZ
2293 if (have_branch || paren != ':') {
2294 /* Make a closing node, and hook it on the end. */
2295 switch (paren) {
2296 case ':':
830247a4 2297 ender = reg_node(pRExC_state, TAIL);
c277df42
IZ
2298 break;
2299 case 1:
830247a4 2300 ender = reganode(pRExC_state, CLOSE, parno);
fac92740
MJD
2301 Set_Node_Offset(ender,RExC_parse+1); /* MJD */
2302 Set_Node_Length(ender,1); /* MJD */
c277df42
IZ
2303 break;
2304 case '<':
c277df42
IZ
2305 case ',':
2306 case '=':
2307 case '!':
c277df42 2308 *flagp &= ~HASWIDTH;
821b33a5
IZ
2309 /* FALL THROUGH */
2310 case '>':
830247a4 2311 ender = reg_node(pRExC_state, SUCCEED);
c277df42
IZ
2312 break;
2313 case 0:
830247a4 2314 ender = reg_node(pRExC_state, END);
c277df42
IZ
2315 break;
2316 }
830247a4 2317 regtail(pRExC_state, lastbr, ender);
a0d0e21e 2318
c277df42
IZ
2319 if (have_branch) {
2320 /* Hook the tails of the branches to the closing node. */
2321 for (br = ret; br != NULL; br = regnext(br)) {
830247a4 2322 regoptail(pRExC_state, br, ender);
c277df42
IZ
2323 }
2324 }
a0d0e21e 2325 }
c277df42
IZ
2326
2327 {
2328 char *p;
2329 static char parens[] = "=!<,>";
2330
2331 if (paren && (p = strchr(parens, paren))) {
2332 int node = ((p - parens) % 2) ? UNLESSM : IFMATCH;
2333 int flag = (p - parens) > 1;
2334
2335 if (paren == '>')
2336 node = SUSPEND, flag = 0;
830247a4 2337 reginsert(pRExC_state, node,ret);
c277df42 2338 ret->flags = flag;
830247a4 2339 regtail(pRExC_state, ret, reg_node(pRExC_state, TAIL));
c277df42 2340 }
a0d0e21e
LW
2341 }
2342
2343 /* Check for proper termination. */
ce3e6498 2344 if (paren) {
830247a4
IZ
2345 RExC_flags16 = oregflags;
2346 if (RExC_parse >= RExC_end || *nextchar(pRExC_state) != ')') {
2347 RExC_parse = oregcomp_parse;
380a0633 2348 vFAIL("Unmatched (");
ce3e6498 2349 }
a0ed51b3 2350 }
830247a4
IZ
2351 else if (!paren && RExC_parse < RExC_end) {
2352 if (*RExC_parse == ')') {
2353 RExC_parse++;
380a0633 2354 vFAIL("Unmatched )");
a0ed51b3
LW
2355 }
2356 else
b45f050a 2357 FAIL("Junk on end of regexp"); /* "Can't happen". */
a0d0e21e
LW
2358 /* NOTREACHED */
2359 }
a687059c 2360
a0d0e21e 2361 return(ret);
a687059c
LW
2362}
2363
2364/*
2365 - regbranch - one alternative of an | operator
2366 *
2367 * Implements the concatenation operator.
2368 */
76e3520e 2369STATIC regnode *
830247a4 2370S_regbranch(pTHX_ RExC_state_t *pRExC_state, I32 *flagp, I32 first)
a687059c 2371{
c277df42
IZ
2372 register regnode *ret;
2373 register regnode *chain = NULL;
2374 register regnode *latest;
2375 I32 flags = 0, c = 0;
a0d0e21e 2376
b81d288d 2377 if (first)
c277df42
IZ
2378 ret = NULL;
2379 else {
b81d288d 2380 if (!SIZE_ONLY && RExC_extralen)
830247a4 2381 ret = reganode(pRExC_state, BRANCHJ,0);
fac92740 2382 else {
830247a4 2383 ret = reg_node(pRExC_state, BRANCH);
fac92740
MJD
2384 Set_Node_Length(ret, 1);
2385 }
c277df42
IZ
2386 }
2387
b81d288d 2388 if (!first && SIZE_ONLY)
830247a4 2389 RExC_extralen += 1; /* BRANCHJ */
b81d288d 2390
c277df42 2391 *flagp = WORST; /* Tentatively. */
a0d0e21e 2392
830247a4
IZ
2393 RExC_parse--;
2394 nextchar(pRExC_state);
2395 while (RExC_parse < RExC_end && *RExC_parse != '|' && *RExC_parse != ')') {
a0d0e21e 2396 flags &= ~TRYAGAIN;
830247a4 2397 latest = regpiece(pRExC_state, &flags);
a0d0e21e
LW
2398 if (latest == NULL) {
2399 if (flags & TRYAGAIN)
2400 continue;
2401 return(NULL);
a0ed51b3
LW
2402 }
2403 else if (ret == NULL)
c277df42 2404 ret = latest;
a0d0e21e 2405 *flagp |= flags&HASWIDTH;
c277df42 2406 if (chain == NULL) /* First piece. */
a0d0e21e
LW
2407 *flagp |= flags&SPSTART;
2408 else {
830247a4
IZ
2409 RExC_naughty++;
2410 regtail(pRExC_state, chain, latest);
a687059c 2411 }
a0d0e21e 2412 chain = latest;
c277df42
IZ
2413 c++;
2414 }
2415 if (chain == NULL) { /* Loop ran zero times. */
830247a4 2416 chain = reg_node(pRExC_state, NOTHING);
c277df42
IZ
2417 if (ret == NULL)
2418 ret = chain;
2419 }
2420 if (c == 1) {
2421 *flagp |= flags&SIMPLE;
a0d0e21e 2422 }
a687059c 2423
a0d0e21e 2424 return(ret);
a687059c
LW
2425}
2426
2427/*
2428 - regpiece - something followed by possible [*+?]
2429 *
2430 * Note that the branching code sequences used for ? and the general cases
2431 * of * and + are somewhat optimized: they use the same NOTHING node as
2432 * both the endmarker for their branch list and the body of the last branch.
2433 * It might seem that this node could be dispensed with entirely, but the
2434 * endmarker role is not redundant.
2435 */
76e3520e 2436STATIC regnode *
830247a4 2437S_regpiece(pTHX_ RExC_state_t *pRExC_state, I32 *flagp)
a687059c 2438{
c277df42 2439 register regnode *ret;
a0d0e21e
LW
2440 register char op;
2441 register char *next;
2442 I32 flags;
830247a4 2443 char *origparse = RExC_parse;
a0d0e21e
LW
2444 char *maxpos;
2445 I32 min;
c277df42 2446 I32 max = REG_INFTY;
fac92740 2447 char *parse_start;
a0d0e21e 2448
830247a4 2449 ret = regatom(pRExC_state, &flags);
a0d0e21e
LW
2450 if (ret == NULL) {
2451 if (flags & TRYAGAIN)
2452 *flagp |= TRYAGAIN;
2453 return(NULL);
2454 }
2455
830247a4 2456 op = *RExC_parse;
a0d0e21e 2457
830247a4 2458 if (op == '{' && regcurly(RExC_parse)) {
fac92740 2459 parse_start = RExC_parse; /* MJD */
830247a4 2460 next = RExC_parse + 1;
a0d0e21e
LW
2461 maxpos = Nullch;
2462 while (isDIGIT(*next) || *next == ',') {
2463 if (*next == ',') {
2464 if (maxpos)
2465 break;
2466 else
2467 maxpos = next;
a687059c 2468 }
a0d0e21e
LW
2469 next++;
2470 }
2471 if (*next == '}') { /* got one */
2472 if (!maxpos)
2473 maxpos = next;
830247a4
IZ
2474 RExC_parse++;
2475 min = atoi(RExC_parse);
a0d0e21e
LW
2476 if (*maxpos == ',')
2477 maxpos++;
2478 else
830247a4 2479 maxpos = RExC_parse;
a0d0e21e
LW
2480 max = atoi(maxpos);
2481 if (!max && *maxpos != '0')
c277df42
IZ
2482 max = REG_INFTY; /* meaning "infinity" */
2483 else if (max >= REG_INFTY)
8615cb43 2484 vFAIL2("Quantifier in {,} bigger than %d", REG_INFTY - 1);
830247a4
IZ
2485 RExC_parse = next;
2486 nextchar(pRExC_state);
a0d0e21e
LW
2487
2488 do_curly:
2489 if ((flags&SIMPLE)) {
830247a4
IZ
2490 RExC_naughty += 2 + RExC_naughty / 2;
2491 reginsert(pRExC_state, CURLY, ret);
fac92740
MJD
2492 Set_Node_Offset(ret, parse_start+1); /* MJD */
2493 Set_Node_Cur_Length(ret);
a0d0e21e
LW
2494 }
2495 else {
830247a4 2496 regnode *w = reg_node(pRExC_state, WHILEM);
2c2d71f5
JH
2497
2498 w->flags = 0;
830247a4
IZ
2499 regtail(pRExC_state, ret, w);
2500 if (!SIZE_ONLY && RExC_extralen) {
2501 reginsert(pRExC_state, LONGJMP,ret);
2502 reginsert(pRExC_state, NOTHING,ret);
c277df42
IZ
2503 NEXT_OFF(ret) = 3; /* Go over LONGJMP. */
2504 }
830247a4 2505 reginsert(pRExC_state, CURLYX,ret);
fac92740
MJD
2506 /* MJD hk */
2507 Set_Node_Offset(ret, parse_start+1);
2508 Set_Node_Length(ret,
2509 op == '{' ? (RExC_parse - parse_start) : 1);
2510
830247a4 2511 if (!SIZE_ONLY && RExC_extralen)
c277df42 2512 NEXT_OFF(ret) = 3; /* Go over NOTHING to LONGJMP. */
830247a4 2513 regtail(pRExC_state, ret, reg_node(pRExC_state, NOTHING));
c277df42 2514 if (SIZE_ONLY)
830247a4
IZ
2515 RExC_whilem_seen++, RExC_extralen += 3;
2516 RExC_naughty += 4 + RExC_naughty; /* compound interest */
a0d0e21e 2517 }
c277df42 2518 ret->flags = 0;
a0d0e21e
LW
2519
2520 if (min > 0)
821b33a5
IZ
2521 *flagp = WORST;
2522 if (max > 0)
2523 *flagp |= HASWIDTH;
a0d0e21e 2524 if (max && max < min)
8615cb43 2525 vFAIL("Can't do {n,m} with n > m");
c277df42
IZ
2526 if (!SIZE_ONLY) {
2527 ARG1_SET(ret, min);
2528 ARG2_SET(ret, max);
a687059c 2529 }
a687059c 2530
a0d0e21e 2531 goto nest_check;
a687059c 2532 }
a0d0e21e 2533 }
a687059c 2534
a0d0e21e
LW
2535 if (!ISMULT1(op)) {
2536 *flagp = flags;
a687059c 2537 return(ret);
a0d0e21e 2538 }
bb20fd44 2539
c277df42 2540#if 0 /* Now runtime fix should be reliable. */
b45f050a
JF
2541
2542 /* if this is reinstated, don't forget to put this back into perldiag:
2543
2544 =item Regexp *+ operand could be empty at {#} in regex m/%s/
2545
2546 (F) The part of the regexp subject to either the * or + quantifier
2547 could match an empty string. The {#} shows in the regular
2548 expression about where the problem was discovered.
2549
2550 */
2551
bb20fd44 2552 if (!(flags&HASWIDTH) && op != '?')
b45f050a 2553 vFAIL("Regexp *+ operand could be empty");
b81d288d 2554#endif
bb20fd44 2555
fac92740 2556 parse_start = RExC_parse;
830247a4 2557 nextchar(pRExC_state);
a0d0e21e 2558
821b33a5 2559 *flagp = (op != '+') ? (WORST|SPSTART|HASWIDTH) : (WORST|HASWIDTH);
a0d0e21e
LW
2560
2561 if (op == '*' && (flags&SIMPLE)) {
830247a4 2562 reginsert(pRExC_state, STAR, ret);
c277df42 2563 ret->flags = 0;
830247a4 2564 RExC_naughty += 4;
a0d0e21e
LW
2565 }
2566 else if (op == '*') {
2567 min = 0;
2568 goto do_curly;
a0ed51b3
LW
2569 }
2570 else if (op == '+' && (flags&SIMPLE)) {
830247a4 2571 reginsert(pRExC_state, PLUS, ret);
c277df42 2572 ret->flags = 0;
830247a4 2573 RExC_naughty += 3;
a0d0e21e
LW
2574 }
2575 else if (op == '+') {
2576 min = 1;
2577 goto do_curly;
a0ed51b3
LW
2578 }
2579 else if (op == '?') {
a0d0e21e
LW
2580 min = 0; max = 1;
2581 goto do_curly;
2582 }
2583 nest_check:
e476b1b5 2584 if (ckWARN(WARN_REGEXP) && !SIZE_ONLY && !(flags&HASWIDTH) && max > REG_INFTY/3) {
830247a4 2585 vWARN3(RExC_parse,
b45f050a 2586 "%.*s matches null string many times",
830247a4 2587 RExC_parse - origparse,
b45f050a 2588 origparse);
a0d0e21e
LW
2589 }
2590
830247a4
IZ
2591 if (*RExC_parse == '?') {
2592 nextchar(pRExC_state);
2593 reginsert(pRExC_state, MINMOD, ret);
2594 regtail(pRExC_state, ret, ret + NODE_STEP_REGNODE);
a0d0e21e 2595 }
830247a4
IZ
2596 if (ISMULT2(RExC_parse)) {
2597 RExC_parse++;
b45f050a
JF
2598 vFAIL("Nested quantifiers");
2599 }
a0d0e21e
LW
2600
2601 return(ret);
a687059c
LW
2602}
2603
2604/*
2605 - regatom - the lowest level
2606 *
2607 * Optimization: gobbles an entire sequence of ordinary characters so that
2608 * it can turn them into a single node, which is smaller to store and
2609 * faster to run. Backslashed characters are exceptions, each becoming a
2610 * separate node; the code is simpler that way and it's not worth fixing.
2611 *
b45f050a 2612 * [Yes, it is worth fixing, some scripts can run twice the speed.] */
76e3520e 2613STATIC regnode *
830247a4 2614S_regatom(pTHX_ RExC_state_t *pRExC_state, I32 *flagp)
a687059c 2615{
c277df42 2616 register regnode *ret = 0;
a0d0e21e 2617 I32 flags;
fac92740 2618 char *parse_start = RExC_parse;
a0d0e21e
LW
2619
2620 *flagp = WORST; /* Tentatively. */
2621
2622tryagain:
830247a4 2623 switch (*RExC_parse) {
a0d0e21e 2624 case '^':
830247a4
IZ
2625 RExC_seen_zerolen++;
2626 nextchar(pRExC_state);
2627 if (RExC_flags16 & PMf_MULTILINE)
2628 ret = reg_node(pRExC_state, MBOL);
2629 else if (RExC_flags16 & PMf_SINGLELINE)
2630 ret = reg_node(pRExC_state, SBOL);
a0d0e21e 2631 else
830247a4 2632 ret = reg_node(pRExC_state, BOL);
fac92740 2633 Set_Node_Length(ret, 1); /* MJD */
a0d0e21e
LW
2634 break;
2635 case '$':
830247a4 2636 nextchar(pRExC_state);
b81d288d 2637 if (*RExC_parse)
830247a4
IZ
2638 RExC_seen_zerolen++;
2639 if (RExC_flags16 & PMf_MULTILINE)
2640 ret = reg_node(pRExC_state, MEOL);
2641 else if (RExC_flags16 & PMf_SINGLELINE)
2642 ret = reg_node(pRExC_state, SEOL);
a0d0e21e 2643 else
830247a4 2644 ret = reg_node(pRExC_state, EOL);
fac92740 2645 Set_Node_Length(ret, 1); /* MJD */
a0d0e21e
LW
2646 break;
2647 case '.':
830247a4 2648 nextchar(pRExC_state);
ffc61ed2
JH
2649 if (RExC_flags16 & PMf_SINGLELINE)
2650 ret = reg_node(pRExC_state, SANY);
2651 else
2652 ret = reg_node(pRExC_state, REG_ANY);
2653 *flagp |= HASWIDTH|SIMPLE;
830247a4 2654 RExC_naughty++;
fac92740 2655 Set_Node_Length(ret, 1); /* MJD */
a0d0e21e
LW
2656 break;
2657 case '[':
b45f050a 2658 {
830247a4 2659 char *oregcomp_parse = ++RExC_parse;
ffc61ed2 2660 ret = regclass(pRExC_state);
830247a4
IZ
2661 if (*RExC_parse != ']') {
2662 RExC_parse = oregcomp_parse;
b45f050a
JF
2663 vFAIL("Unmatched [");
2664 }
830247a4 2665 nextchar(pRExC_state);
a0d0e21e 2666 *flagp |= HASWIDTH|SIMPLE;
fac92740 2667 Set_Node_Length(ret, RExC_parse - oregcomp_parse + 1); /* MJD */
a0d0e21e 2668 break;
b45f050a 2669 }
a0d0e21e 2670 case '(':
830247a4
IZ
2671 nextchar(pRExC_state);
2672 ret = reg(pRExC_state, 1, &flags);
a0d0e21e 2673 if (ret == NULL) {
bf93d4cc 2674 if (flags & TRYAGAIN) {
830247a4 2675 if (RExC_parse == RExC_end) {
bf93d4cc
GS
2676 /* Make parent create an empty node if needed. */
2677 *flagp |= TRYAGAIN;
2678 return(NULL);
2679 }
a0d0e21e 2680 goto tryagain;
bf93d4cc 2681 }
a0d0e21e
LW
2682 return(NULL);
2683 }
c277df42 2684 *flagp |= flags&(HASWIDTH|SPSTART|SIMPLE);
a0d0e21e
LW
2685 break;
2686 case '|':
2687 case ')':
2688 if (flags & TRYAGAIN) {
2689 *flagp |= TRYAGAIN;
2690 return NULL;
2691 }
b45f050a 2692 vFAIL("Internal urp");
a0d0e21e
LW
2693 /* Supposed to be caught earlier. */
2694 break;
85afd4ae 2695 case '{':
830247a4
IZ
2696 if (!regcurly(RExC_parse)) {
2697 RExC_parse++;
85afd4ae
CS
2698 goto defchar;
2699 }
2700 /* FALL THROUGH */
a0d0e21e
LW
2701 case '?':
2702 case '+':
2703 case '*':
830247a4 2704 RExC_parse++;
b45f050a 2705 vFAIL("Quantifier follows nothing");
a0d0e21e
LW
2706 break;
2707 case '\\':
830247a4 2708 switch (*++RExC_parse) {
a0d0e21e 2709 case 'A':
830247a4
IZ
2710 RExC_seen_zerolen++;
2711 ret = reg_node(pRExC_state, SBOL);
a0d0e21e 2712 *flagp |= SIMPLE;
830247a4 2713 nextchar(pRExC_state);
fac92740 2714 Set_Node_Length(ret, 2); /* MJD */
a0d0e21e
LW
2715 break;
2716 case 'G':
830247a4
IZ
2717 ret = reg_node(pRExC_state, GPOS);
2718 RExC_seen |= REG_SEEN_GPOS;
a0d0e21e 2719 *flagp |= SIMPLE;
830247a4 2720 nextchar(pRExC_state);
fac92740 2721 Set_Node_Length(ret, 2); /* MJD */
a0d0e21e
LW
2722 break;
2723 case 'Z':
830247a4 2724 ret = reg_node(pRExC_state, SEOL);
a0d0e21e 2725 *flagp |= SIMPLE;
830247a4 2726 nextchar(pRExC_state);
a0d0e21e 2727 break;
b85d18e9 2728 case 'z':
830247a4 2729 ret = reg_node(pRExC_state, EOS);
b85d18e9 2730 *flagp |= SIMPLE;
830247a4
IZ
2731 RExC_seen_zerolen++; /* Do not optimize RE away */
2732 nextchar(pRExC_state);
fac92740 2733 Set_Node_Length(ret, 2); /* MJD */
b85d18e9 2734 break;
4a2d328f 2735 case 'C':
830247a4 2736 ret = reg_node(pRExC_state, SANY);
3baa4c62 2737 RExC_seen |= REG_SEEN_SANY;
a0ed51b3 2738 *flagp |= HASWIDTH|SIMPLE;
830247a4 2739 nextchar(pRExC_state);
fac92740 2740 Set_Node_Length(ret, 2); /* MJD */
a0ed51b3
LW
2741 break;
2742 case 'X':
830247a4 2743 ret = reg_node(pRExC_state, CLUMP);
a0ed51b3 2744 *flagp |= HASWIDTH;
830247a4 2745 nextchar(pRExC_state);
fac92740 2746 Set_Node_Length(ret, 2); /* MJD */
a0ed51b3 2747 break;
a0d0e21e 2748 case 'w':
ffc61ed2 2749 ret = reg_node(pRExC_state, LOC ? ALNUML : ALNUM);
a0d0e21e 2750 *flagp |= HASWIDTH|SIMPLE;
830247a4 2751 nextchar(pRExC_state);
fac92740 2752 Set_Node_Length(ret, 2); /* MJD */
a0d0e21e
LW
2753 break;
2754 case 'W':
ffc61ed2 2755 ret = reg_node(pRExC_state, LOC ? NALNUML : NALNUM);
a0d0e21e 2756 *flagp |= HASWIDTH|SIMPLE;
830247a4 2757 nextchar(pRExC_state);
fac92740 2758 Set_Node_Length(ret, 2); /* MJD */
a0d0e21e
LW
2759 break;
2760 case 'b':
830247a4
IZ
2761 RExC_seen_zerolen++;
2762 RExC_seen |= REG_SEEN_LOOKBEHIND;
ffc61ed2 2763 ret = reg_node(pRExC_state, LOC ? BOUNDL : BOUND);
a0d0e21e 2764 *flagp |= SIMPLE;
830247a4 2765 nextchar(pRExC_state);
fac92740 2766 Set_Node_Length(ret, 2); /* MJD */
a0d0e21e
LW
2767 break;
2768 case 'B':
830247a4
IZ
2769 RExC_seen_zerolen++;
2770 RExC_seen |= REG_SEEN_LOOKBEHIND;
ffc61ed2 2771 ret = reg_node(pRExC_state, LOC ? NBOUNDL : NBOUND);
a0d0e21e 2772 *flagp |= SIMPLE;
830247a4 2773 nextchar(pRExC_state);
fac92740 2774 Set_Node_Length(ret, 2); /* MJD */
a0d0e21e
LW
2775 break;
2776 case 's':
ffc61ed2 2777 ret = reg_node(pRExC_state, LOC ? SPACEL : SPACE);
a0d0e21e 2778 *flagp |= HASWIDTH|SIMPLE;
830247a4 2779 nextchar(pRExC_state);
fac92740 2780 Set_Node_Length(ret, 2); /* MJD */
a0d0e21e
LW
2781 break;
2782 case 'S':
ffc61ed2 2783 ret = reg_node(pRExC_state, LOC ? NSPACEL : NSPACE);
a0d0e21e 2784 *flagp |= HASWIDTH|SIMPLE;
830247a4 2785 nextchar(pRExC_state);
fac92740 2786 Set_Node_Length(ret, 2); /* MJD */
a0d0e21e
LW
2787 break;
2788 case 'd':
ffc61ed2 2789 ret = reg_node(pRExC_state, DIGIT);
a0d0e21e 2790 *flagp |= HASWIDTH|SIMPLE;
830247a4 2791 nextchar(pRExC_state);
fac92740 2792 Set_Node_Length(ret, 2); /* MJD */
a0d0e21e
LW
2793 break;
2794 case 'D':
ffc61ed2 2795 ret = reg_node(pRExC_state, NDIGIT);
a0d0e21e 2796 *flagp |= HASWIDTH|SIMPLE;
830247a4 2797 nextchar(pRExC_state);
fac92740 2798 Set_Node_Length(ret, 2); /* MJD */
a0d0e21e 2799 break;
a14b48bc
LW
2800 case 'p':
2801 case 'P':
3568d838 2802 {
830247a4 2803 char* oldregxend = RExC_end;
fac92740 2804 char* parse_start = RExC_parse;
a14b48bc 2805
830247a4 2806 if (RExC_parse[1] == '{') {
3568d838 2807 /* a lovely hack--pretend we saw [\pX] instead */
830247a4
IZ
2808 RExC_end = strchr(RExC_parse, '}');
2809 if (!RExC_end) {
2810 RExC_parse += 2;
2811 RExC_end = oldregxend;
b45f050a
JF
2812 vFAIL("Missing right brace on \\p{}");
2813 }
830247a4 2814 RExC_end++;
a14b48bc
LW
2815 }
2816 else
830247a4
IZ
2817 RExC_end = RExC_parse + 2;
2818 RExC_parse--;
a14b48bc 2819
ffc61ed2 2820 ret = regclass(pRExC_state);
a14b48bc 2821
830247a4
IZ
2822 RExC_end = oldregxend;
2823 RExC_parse--;
fac92740 2824 Set_Node_Cur_Length(ret); /* MJD */
830247a4 2825 nextchar(pRExC_state);
a14b48bc
LW
2826 *flagp |= HASWIDTH|SIMPLE;
2827 }
2828 break;
a0d0e21e
LW
2829 case 'n':
2830 case 'r':
2831 case 't':
2832 case 'f':
2833 case 'e':
2834 case 'a':
2835 case 'x':
2836 case 'c':
2837 case '0':
2838 goto defchar;
2839 case '1': case '2': case '3': case '4':
2840 case '5': case '6': case '7': case '8': case '9':
2841 {
830247a4 2842 I32 num = atoi(RExC_parse);
a0d0e21e 2843
830247a4 2844 if (num > 9 && num >= RExC_npar)
a0d0e21e
LW
2845 goto defchar;
2846 else {
fac92740 2847 char * parse_start = RExC_parse - 1; /* MJD */
830247a4
IZ
2848 while (isDIGIT(*RExC_parse))
2849 RExC_parse++;
b45f050a 2850
830247a4 2851 if (!SIZE_ONLY && num > RExC_rx->nparens)
9baa0206 2852 vFAIL("Reference to nonexistent group");
830247a4
IZ
2853 RExC_sawback = 1;
2854 ret = reganode(pRExC_state, FOLD
a0ed51b3 2855 ? (LOC ? REFFL : REFF)
c8756f30 2856 : REF, num);
a0d0e21e 2857 *flagp |= HASWIDTH;
fac92740
MJD
2858
2859 /* override incorrect value set in reganode MJD */
2860 Set_Node_Offset(ret, parse_start+1);
2861 Set_Node_Cur_Length(ret); /* MJD */
830247a4
IZ
2862 RExC_parse--;
2863 nextchar(pRExC_state);
a0d0e21e
LW
2864 }
2865 }
2866 break;
2867 case '\0':
830247a4 2868 if (RExC_parse >= RExC_end)
b45f050a 2869 FAIL("Trailing \\");
a0d0e21e
LW
2870 /* FALL THROUGH */
2871 default:
c9f97d15
IZ
2872 /* Do not generate `unrecognized' warnings here, we fall
2873 back into the quick-grab loop below */
a0d0e21e
LW
2874 goto defchar;
2875 }
2876 break;
4633a7c4
LW
2877
2878 case '#':
830247a4
IZ
2879 if (RExC_flags16 & PMf_EXTENDED) {
2880 while (RExC_parse < RExC_end && *RExC_parse != '\n') RExC_parse++;
2881 if (RExC_parse < RExC_end)
4633a7c4
LW
2882 goto tryagain;
2883 }
2884 /* FALL THROUGH */
2885
a0d0e21e 2886 default: {
ba210ebe 2887 register STRLEN len;
a0ed51b3 2888 register UV ender;
a0d0e21e 2889 register char *p;
c277df42 2890 char *oldp, *s;
ba210ebe 2891 STRLEN numlen;
fac92740 2892 char *parse_start = RExC_parse - 1;
a0d0e21e 2893
830247a4 2894 RExC_parse++;
a0d0e21e
LW
2895
2896 defchar:
830247a4 2897 ret = reg_node(pRExC_state, FOLD
a0ed51b3 2898 ? (LOC ? EXACTFL : EXACTF)
bbce6d69 2899 : EXACT);
cd439c50 2900 s = STRING(ret);
830247a4
IZ
2901 for (len = 0, p = RExC_parse - 1;
2902 len < 127 && p < RExC_end;
a0d0e21e
LW
2903 len++)
2904 {
2905 oldp = p;
5b5a24f7 2906
830247a4
IZ
2907 if (RExC_flags16 & PMf_EXTENDED)
2908 p = regwhite(p, RExC_end);
a0d0e21e
LW
2909 switch (*p) {
2910 case '^':
2911 case '$':
2912 case '.':
2913 case '[':
2914 case '(':
2915 case ')':
2916 case '|':
2917 goto loopdone;
2918 case '\\':
2919 switch (*++p) {
2920 case 'A':
2921 case 'G':
2922 case 'Z':
b85d18e9 2923 case 'z':
a0d0e21e
LW
2924 case 'w':
2925 case 'W':
2926 case 'b':
2927 case 'B':
2928 case 's':
2929 case 'S':
2930 case 'd':
2931 case 'D':
a14b48bc
LW
2932 case 'p':
2933 case 'P':
a0d0e21e
LW
2934 --p;
2935 goto loopdone;
2936 case 'n':
2937 ender = '\n';
2938 p++;
a687059c 2939 break;
a0d0e21e
LW
2940 case 'r':
2941 ender = '\r';
2942 p++;
a687059c 2943 break;
a0d0e21e
LW
2944 case 't':
2945 ender = '\t';
2946 p++;
a687059c 2947 break;
a0d0e21e
LW
2948 case 'f':
2949 ender = '\f';
2950 p++;
a687059c 2951 break;
a0d0e21e 2952 case 'e':
c7f1f016 2953 ender = ASCII_TO_NATIVE('\033');
a0d0e21e 2954 p++;
a687059c 2955 break;
a0d0e21e 2956 case 'a':
c7f1f016 2957 ender = ASCII_TO_NATIVE('\007');
a0d0e21e 2958 p++;
a687059c 2959 break;
a0d0e21e 2960 case 'x':
a0ed51b3
LW
2961 if (*++p == '{') {
2962 char* e = strchr(p, '}');
b81d288d 2963
b45f050a 2964 if (!e) {
830247a4 2965 RExC_parse = p + 1;
b45f050a
JF
2966 vFAIL("Missing right brace on \\x{}");
2967 }
de5f0749 2968 else {
b21ed0a9 2969 numlen = 1; /* allow underscores */
de35ba6f 2970 ender = (UV)scan_hex(p + 1, e - p - 1, &numlen);
aaa80028
JH
2971 if (ender > 0xff)
2972 RExC_utf8 = 1;
b21ed0a9
GS
2973 /* numlen is generous */
2974 if (numlen + len >= 127) {
a0ed51b3
LW
2975 p--;
2976 goto loopdone;
2977 }
2978 p = e + 1;
2979 }
a0ed51b3
LW
2980 }
2981 else {
b21ed0a9 2982 numlen = 0; /* disallow underscores */
dff6d3cd 2983 ender = (UV)scan_hex(p, 2, &numlen);
a0ed51b3
LW
2984 p += numlen;
2985 }
a687059c 2986 break;
a0d0e21e
LW
2987 case 'c':
2988 p++;
bbce6d69 2989 ender = UCHARAT(p++);
2990 ender = toCTRL(ender);
a687059c 2991 break;
a0d0e21e
LW
2992 case '0': case '1': case '2': case '3':case '4':
2993 case '5': case '6': case '7': case '8':case '9':
2994 if (*p == '0' ||
830247a4 2995 (isDIGIT(p[1]) && atoi(p) >= RExC_npar) ) {
b21ed0a9 2996 numlen = 0; /* disallow underscores */
dff6d3cd 2997 ender = (UV)scan_oct(p, 3, &numlen);
a0d0e21e
LW
2998 p += numlen;
2999 }
3000 else {
3001 --p;
3002 goto loopdone;
a687059c
LW
3003 }
3004 break;
a0d0e21e 3005 case '\0':
830247a4 3006 if (p >= RExC_end)
b45f050a 3007 FAIL("Trailing \\");
a687059c 3008 /* FALL THROUGH */
a0d0e21e 3009 default:
e476b1b5 3010 if (!SIZE_ONLY && ckWARN(WARN_REGEXP) && isALPHA(*p))
b45f050a 3011 vWARN2(p +1, "Unrecognized escape \\%c passed through", *p);
a0ed51b3 3012 goto normal_default;
a0d0e21e
LW
3013 }
3014 break;
a687059c 3015 default:
a0ed51b3 3016 normal_default:
fd400ab9 3017 if (UTF8_IS_START(*p) && UTF) {
5e12f4fb 3018 ender = utf8n_to_uvchr((U8*)p, RExC_end - p,
ba210ebe 3019 &numlen, 0);
a0ed51b3
LW
3020 p += numlen;
3021 }
3022 else
3023 ender = *p++;
a0d0e21e 3024 break;
a687059c 3025 }
830247a4
IZ
3026 if (RExC_flags16 & PMf_EXTENDED)
3027 p = regwhite(p, RExC_end);
a0ed51b3
LW
3028 if (UTF && FOLD) {
3029 if (LOC)
5e12f4fb 3030 ender = toLOWER_LC_uvchr(ender);
a0ed51b3
LW
3031 else
3032 ender = toLOWER_uni(ender);
3033 }
a0d0e21e
LW
3034 if (ISMULT2(p)) { /* Back off on ?+*. */
3035 if (len)
3036 p = oldp;
2b9d42f0 3037 else if (!UNI_IS_INVARIANT(NATIVE_TO_UNI(ender)) && UTF) {
830247a4 3038 reguni(pRExC_state, ender, s, &numlen);
a0ed51b3
LW
3039 s += numlen;
3040 len += numlen;
3041 }
a0d0e21e
LW
3042 else {
3043 len++;
cd439c50 3044 REGC(ender, s++);
a0d0e21e
LW
3045 }
3046 break;
a687059c 3047 }
2b9d42f0 3048 if (!UNI_IS_INVARIANT(NATIVE_TO_UNI(ender)) && UTF) {
830247a4 3049 reguni(pRExC_state, ender, s, &numlen);
a0ed51b3
LW
3050 s += numlen;
3051 len += numlen - 1;
3052 }
3053 else
cd439c50 3054 REGC(ender, s++);
a0d0e21e
LW
3055 }
3056 loopdone:
830247a4 3057 RExC_parse = p - 1;
fac92740 3058 Set_Node_Cur_Length(ret); /* MJD */
830247a4 3059 nextchar(pRExC_state);
793db0cb
JH
3060 {
3061 /* len is STRLEN which is unsigned, need to copy to signed */
3062 IV iv = len;
3063 if (iv < 0)
3064 vFAIL("Internal disaster");
3065 }
a0d0e21e
LW
3066 if (len > 0)
3067 *flagp |= HASWIDTH;
3068 if (len == 1)
3069 *flagp |= SIMPLE;
c277df42 3070 if (!SIZE_ONLY)
cd439c50
IZ
3071 STR_LEN(ret) = len;
3072 if (SIZE_ONLY)
830247a4 3073 RExC_size += STR_SZ(len);
cd439c50 3074 else
830247a4 3075 RExC_emit += STR_SZ(len);
a687059c 3076 }
a0d0e21e
LW
3077 break;
3078 }
a687059c 3079
a0d0e21e 3080 return(ret);
a687059c
LW
3081}
3082
873ef191 3083STATIC char *
cea2e8a9 3084S_regwhite(pTHX_ char *p, char *e)
5b5a24f7
CS
3085{
3086 while (p < e) {
3087 if (isSPACE(*p))
3088 ++p;
3089 else if (*p == '#') {
3090 do {
3091 p++;
3092 } while (p < e && *p != '\n');
3093 }
3094 else
3095 break;
3096 }
3097 return p;
3098}
3099
b8c5462f
JH
3100/* Parse POSIX character classes: [[:foo:]], [[=foo=]], [[.foo.]].
3101 Character classes ([:foo:]) can also be negated ([:^foo:]).
3102 Returns a named class id (ANYOF_XXX) if successful, -1 otherwise.
3103 Equivalence classes ([=foo=]) and composites ([.foo.]) are parsed,
3104 but trigger warnings because they are currently unimplemented. */
3105STATIC I32
830247a4 3106S_regpposixcc(pTHX_ RExC_state_t *pRExC_state, I32 value)
620e46c5
JH
3107{
3108 char *posixcc = 0;
936ed897 3109 I32 namedclass = OOB_NAMEDCLASS;
620e46c5 3110
830247a4 3111 if (value == '[' && RExC_parse + 1 < RExC_end &&
620e46c5 3112 /* I smell either [: or [= or [. -- POSIX has been here, right? */
830247a4
IZ
3113 (*RExC_parse == ':' ||
3114 *RExC_parse == '=' ||
3115 *RExC_parse == '.')) {
3116 char c = *RExC_parse;
3117 char* s = RExC_parse++;
b81d288d 3118
830247a4
IZ
3119 while (RExC_parse < RExC_end && *RExC_parse != c)
3120 RExC_parse++;
3121 if (RExC_parse == RExC_end)
620e46c5 3122 /* Grandfather lone [:, [=, [. */
830247a4 3123 RExC_parse = s;
620e46c5 3124 else {
830247a4 3125 char* t = RExC_parse++; /* skip over the c */
b8c5462f 3126
830247a4
IZ
3127 if (*RExC_parse == ']') {
3128 RExC_parse++; /* skip over the ending ] */
b8c5462f
JH
3129 posixcc = s + 1;
3130 if (*s == ':') {
3131 I32 complement = *posixcc == '^' ? *posixcc++ : 0;
3132 I32 skip = 5; /* the most common skip */
3133
3134 switch (*posixcc) {
3135 case 'a':
3136 if (strnEQ(posixcc, "alnum", 5))
3137 namedclass =
3138 complement ? ANYOF_NALNUMC : ANYOF_ALNUMC;
3139 else if (strnEQ(posixcc, "alpha", 5))
3140 namedclass =
3141 complement ? ANYOF_NALPHA : ANYOF_ALPHA;
3142 else if (strnEQ(posixcc, "ascii", 5))
3143 namedclass =
3144 complement ? ANYOF_NASCII : ANYOF_ASCII;
3145 break;
aaa51d5e
JF
3146 case 'b':
3147 if (strnEQ(posixcc, "blank", 5))
3148 namedclass =
3149 complement ? ANYOF_NBLANK : ANYOF_BLANK;
3150 break;
b8c5462f
JH
3151 case 'c':
3152 if (strnEQ(posixcc, "cntrl", 5))
3153 namedclass =
3154 complement ? ANYOF_NCNTRL : ANYOF_CNTRL;
3155 break;
3156 case 'd':
3157 if (strnEQ(posixcc, "digit", 5))
3158 namedclass =
3159 complement ? ANYOF_NDIGIT : ANYOF_DIGIT;
3160 break;
3161 case 'g':
3162 if (strnEQ(posixcc, "graph", 5))
3163 namedclass =
3164 complement ? ANYOF_NGRAPH : ANYOF_GRAPH;
3165 break;
3166 case 'l':
3167 if (strnEQ(posixcc, "lower", 5))
3168 namedclass =
3169 complement ? ANYOF_NLOWER : ANYOF_LOWER;
3170 break;
3171 case 'p':
3172 if (strnEQ(posixcc, "print", 5))
3173 namedclass =
3174 complement ? ANYOF_NPRINT : ANYOF_PRINT;
3175 else if (strnEQ(posixcc, "punct", 5))
3176 namedclass =
3177 complement ? ANYOF_NPUNCT : ANYOF_PUNCT;
3178 break;
3179 case 's':
3180 if (strnEQ(posixcc, "space", 5))
3181 namedclass =
aaa51d5e 3182 complement ? ANYOF_NPSXSPC : ANYOF_PSXSPC;
cc4319de 3183 break;
b8c5462f
JH
3184 case 'u':
3185 if (strnEQ(posixcc, "upper", 5))
3186 namedclass =
3187 complement ? ANYOF_NUPPER : ANYOF_UPPER;
3188 break;
3189 case 'w': /* this is not POSIX, this is the Perl \w */
3190 if (strnEQ(posixcc, "word", 4)) {
3191 namedclass =
3192 complement ? ANYOF_NALNUM : ANYOF_ALNUM;
3193 skip = 4;
3194 }
3195 break;
3196 case 'x':
3197 if (strnEQ(posixcc, "xdigit", 6)) {
3198 namedclass =
3199 complement ? ANYOF_NXDIGIT : ANYOF_XDIGIT;
3200 skip = 6;
3201 }
3202 break;
3203 }
ac561586
JH
3204 if (namedclass == OOB_NAMEDCLASS ||
3205 posixcc[skip] != ':' ||
3206 posixcc[skip+1] != ']')
b45f050a
JF
3207 {
3208 Simple_vFAIL3("POSIX class [:%.*s:] unknown",
3209 t - s - 1, s + 1);
3210 }
3211 } else if (!SIZE_ONLY) {
b8c5462f 3212 /* [[=foo=]] and [[.foo.]] are still future. */
b45f050a 3213
830247a4 3214 /* adjust RExC_parse so the warning shows after
b45f050a 3215 the class closes */
830247a4
IZ
3216 while (*RExC_parse && *RExC_parse != ']')
3217 RExC_parse++;
b45f050a
JF
3218 Simple_vFAIL3("POSIX syntax [%c %c] is reserved for future extensions", c, c);
3219 }
b8c5462f
JH
3220 } else {
3221 /* Maternal grandfather:
3222 * "[:" ending in ":" but not in ":]" */
830247a4 3223 RExC_parse = s;
767d463e 3224 }
620e46c5
JH
3225 }
3226 }
3227
b8c5462f
JH
3228 return namedclass;
3229}
3230
3231STATIC void
830247a4 3232S_checkposixcc(pTHX_ RExC_state_t *pRExC_state)
b8c5462f 3233{
e476b1b5 3234 if (!SIZE_ONLY && ckWARN(WARN_REGEXP) &&
830247a4
IZ
3235 (*RExC_parse == ':' ||
3236 *RExC_parse == '=' ||
3237 *RExC_parse == '.')) {
3238 char *s = RExC_parse;
93733859 3239 char c = *s++;
b8c5462f
JH
3240
3241 while(*s && isALNUM(*s))
3242 s++;
3243 if (*s && c == *s && s[1] == ']') {
b45f050a
JF
3244 vWARN3(s+2, "POSIX syntax [%c %c] belongs inside character classes", c, c);
3245
3246 /* [[=foo=]] and [[.foo.]] are still future. */
b8c5462f 3247 if (c == '=' || c == '.')
b45f050a 3248 {
830247a4 3249 /* adjust RExC_parse so the error shows after
b45f050a 3250 the class closes */
830247a4 3251 while (*RExC_parse && *RExC_parse++ != ']')
b45f050a
JF
3252 ;
3253 Simple_vFAIL3("POSIX syntax [%c %c] is reserved for future extensions", c, c);
3254 }
b8c5462f
JH
3255 }
3256 }
620e46c5
JH
3257}
3258
76e3520e 3259STATIC regnode *
830247a4 3260S_regclass(pTHX_ RExC_state_t *pRExC_state)
a687059c 3261{
ffc61ed2 3262 register UV value;
3568d838 3263 register IV prevvalue = OOB_UNICODE;
ffc61ed2 3264 register IV range = 0;
c277df42 3265 register regnode *ret;
ba210ebe 3266 STRLEN numlen;
ffc61ed2 3267 IV namedclass;
73b437c8 3268 char *rangebegin;
936ed897 3269 bool need_class = 0;
ffc61ed2
JH
3270 SV *listsv;
3271 register char *e;
fac92740 3272 char *parse_start = RExC_parse; /* MJD */
ffc61ed2 3273 UV n;
3568d838 3274 bool optimize_invert = TRUE;
ffc61ed2
JH
3275
3276 ret = reganode(pRExC_state, ANYOF, 0);
3277
3278 if (!SIZE_ONLY)
3279 ANYOF_FLAGS(ret) = 0;
3280
3281 if (*RExC_parse == '^') { /* Complement of range. */
3282 RExC_naughty++;
3283 RExC_parse++;
3284 if (!SIZE_ONLY)
3285 ANYOF_FLAGS(ret) |= ANYOF_INVERT;
3286 }
a0d0e21e 3287
936ed897 3288 if (SIZE_ONLY)
830247a4 3289 RExC_size += ANYOF_SKIP;
936ed897 3290 else {
830247a4 3291 RExC_emit += ANYOF_SKIP;
936ed897
IZ
3292 if (FOLD)
3293 ANYOF_FLAGS(ret) |= ANYOF_FOLD;
3294 if (LOC)
3295 ANYOF_FLAGS(ret) |= ANYOF_LOCALE;
ffc61ed2
JH
3296 ANYOF_BITMAP_ZERO(ret);
3297 listsv = newSVpvn("# comment\n", 10);
a0d0e21e 3298 }
b8c5462f 3299
e476b1b5 3300 if (!SIZE_ONLY && ckWARN(WARN_REGEXP))
830247a4 3301 checkposixcc(pRExC_state);
b8c5462f 3302
830247a4 3303 if (*RExC_parse == ']' || *RExC_parse == '-')
ffc61ed2
JH
3304 goto charclassloop; /* allow 1st char to be ] or - */
3305
830247a4 3306 while (RExC_parse < RExC_end && *RExC_parse != ']') {
ffc61ed2
JH
3307
3308 charclassloop:
3309
3310 namedclass = OOB_NAMEDCLASS; /* initialize as illegal */
3311
73b437c8 3312 if (!range)
830247a4 3313 rangebegin = RExC_parse;
ffc61ed2 3314 if (UTF) {
5e12f4fb 3315 value = utf8n_to_uvchr((U8*)RExC_parse,
3568d838
JH
3316 RExC_end - RExC_parse,
3317 &numlen, 0);
ffc61ed2
JH
3318 RExC_parse += numlen;
3319 }
3320 else
3321 value = UCHARAT(RExC_parse++);
620e46c5 3322 if (value == '[')
830247a4 3323 namedclass = regpposixcc(pRExC_state, value);
620e46c5 3324 else if (value == '\\') {
ffc61ed2 3325 if (UTF) {
5e12f4fb 3326 value = utf8n_to_uvchr((U8*)RExC_parse,
ffc61ed2
JH
3327 RExC_end - RExC_parse,
3328 &numlen, 0);
3329 RExC_parse += numlen;
3330 }
3331 else
3332 value = UCHARAT(RExC_parse++);
470c3474 3333 /* Some compilers cannot handle switching on 64-bit integer
ffc61ed2 3334 * values, therefore value cannot be an UV. Yes, this will
e2962f66
JH
3335 * be a problem later if we want switch on Unicode.
3336 * A similar issue a little bit later when switching on
3337 * namedclass. --jhi */
ffc61ed2 3338 switch ((I32)value) {
b8c5462f
JH
3339 case 'w': namedclass = ANYOF_ALNUM; break;
3340 case 'W': namedclass = ANYOF_NALNUM; break;
3341 case 's': namedclass = ANYOF_SPACE; break;
3342 case 'S': namedclass = ANYOF_NSPACE; break;
3343 case 'd': namedclass = ANYOF_DIGIT; break;
3344 case 'D': namedclass = ANYOF_NDIGIT; break;
ffc61ed2
JH
3345 case 'p':
3346 case 'P':
3347 if (*RExC_parse == '{') {
3348 e = strchr(RExC_parse++, '}');
3349 if (!e)
3350 vFAIL("Missing right brace on \\p{}");
3351 n = e - RExC_parse;
3352 }
3353 else {
3354 e = RExC_parse;
3355 n = 1;
3356 }
3357 if (!SIZE_ONLY) {
3358 if (value == 'p')
3359 Perl_sv_catpvf(aTHX_ listsv,
3360 "+utf8::%.*s\n", (int)n, RExC_parse);
3361 else
3362 Perl_sv_catpvf(aTHX_ listsv,
3363 "!utf8::%.*s\n", (int)n, RExC_parse);
3364 }
3365 RExC_parse = e + 1;
3366 ANYOF_FLAGS(ret) |= ANYOF_UNICODE;
3367 continue;
b8c5462f
JH
3368 case 'n': value = '\n'; break;
3369 case 'r': value = '\r'; break;
3370 case 't': value = '\t'; break;
3371 case 'f': value = '\f'; break;
3372 case 'b': value = '\b'; break;
c7f1f016
NIS
3373 case 'e': value = ASCII_TO_NATIVE('\033');break;
3374 case 'a': value = ASCII_TO_NATIVE('\007');break;
b8c5462f 3375 case 'x':
ffc61ed2
JH
3376 if (*RExC_parse == '{') {
3377 e = strchr(RExC_parse++, '}');
b81d288d 3378 if (!e)
ffc61ed2
JH
3379 vFAIL("Missing right brace on \\x{}");
3380 numlen = 1; /* allow underscores */
3381 value = (UV)scan_hex(RExC_parse,
3382 e - RExC_parse,
3383 &numlen);
3384 RExC_parse = e + 1;
3385 }
3386 else {
3387 numlen = 0; /* disallow underscores */
3388 value = (UV)scan_hex(RExC_parse, 2, &numlen);
3389 RExC_parse += numlen;
3390 }
b8c5462f
JH
3391 break;
3392 case 'c':
830247a4 3393 value = UCHARAT(RExC_parse++);
b8c5462f
JH
3394 value = toCTRL(value);
3395 break;
3396 case '0': case '1': case '2': case '3': case '4':
3397 case '5': case '6': case '7': case '8': case '9':
b21ed0a9 3398 numlen = 0; /* disallow underscores */
830247a4
IZ
3399 value = (UV)scan_oct(--RExC_parse, 3, &numlen);
3400 RExC_parse += numlen;
b8c5462f 3401 break;
1028017a 3402 default:
e476b1b5 3403 if (!SIZE_ONLY && ckWARN(WARN_REGEXP) && isALPHA(value))
ffc61ed2
JH
3404 vWARN2(RExC_parse,
3405 "Unrecognized escape \\%c in character class passed through",
3406 (int)value);
1028017a 3407 break;
b8c5462f 3408 }
ffc61ed2
JH
3409 } /* end of \blah */
3410
3411 if (namedclass > OOB_NAMEDCLASS) { /* this is a named class \blah */
3412
3413 if (!SIZE_ONLY && !need_class)
936ed897 3414 ANYOF_CLASS_ZERO(ret);
ffc61ed2 3415
936ed897 3416 need_class = 1;
ffc61ed2
JH
3417
3418 /* a bad range like a-\d, a-[:digit:] ? */
3419 if (range) {
73b437c8 3420 if (!SIZE_ONLY) {
e476b1b5 3421 if (ckWARN(WARN_REGEXP))
830247a4 3422 vWARN4(RExC_parse,
b45f050a 3423 "False [] range \"%*.*s\"",
830247a4
IZ
3424 RExC_parse - rangebegin,
3425 RExC_parse - rangebegin,
b45f050a 3426 rangebegin);
3568d838
JH
3427 if (prevvalue < 256) {
3428 ANYOF_BITMAP_SET(ret, prevvalue);
ffc61ed2
JH
3429 ANYOF_BITMAP_SET(ret, '-');
3430 }
3431 else {
3432 ANYOF_FLAGS(ret) |= ANYOF_UNICODE;
3433 Perl_sv_catpvf(aTHX_ listsv,
3568d838 3434 "%04"UVxf"\n%04"UVxf"\n", (UV)prevvalue, (UV) '-');
ffc61ed2 3435 }
b8c5462f 3436 }
ffc61ed2
JH
3437
3438 range = 0; /* this was not a true range */
73b437c8 3439 }
ffc61ed2 3440
73b437c8 3441 if (!SIZE_ONLY) {
3568d838
JH
3442 if (namedclass > OOB_NAMEDCLASS)
3443 optimize_invert = FALSE;
e2962f66
JH
3444 /* Possible truncation here but in some 64-bit environments
3445 * the compiler gets heartburn about switch on 64-bit values.
3446 * A similar issue a little earlier when switching on value.
98f323fa 3447 * --jhi */
e2962f66 3448 switch ((I32)namedclass) {
73b437c8
JH
3449 case ANYOF_ALNUM:
3450 if (LOC)
936ed897 3451 ANYOF_CLASS_SET(ret, ANYOF_ALNUM);
73b437c8
JH
3452 else {
3453 for (value = 0; value < 256; value++)
3454 if (isALNUM(value))
936ed897 3455 ANYOF_BITMAP_SET(ret, value);
73b437c8 3456 }
ffc61ed2 3457 Perl_sv_catpvf(aTHX_ listsv, "+utf8::IsWord\n");
73b437c8
JH
3458 break;
3459 case ANYOF_NALNUM:
3460 if (LOC)
936ed897 3461 ANYOF_CLASS_SET(ret, ANYOF_NALNUM);
73b437c8
JH
3462 else {
3463 for (value = 0; value < 256; value++)
3464 if (!isALNUM(value))
936ed897 3465 ANYOF_BITMAP_SET(ret, value);
73b437c8 3466 }
ffc61ed2 3467 Perl_sv_catpvf(aTHX_ listsv, "!utf8::IsWord\n");
73b437c8 3468 break;
ffc61ed2 3469 case ANYOF_ALNUMC:
73b437c8 3470 if (LOC)
ffc61ed2 3471 ANYOF_CLASS_SET(ret, ANYOF_ALNUMC);
73b437c8
JH
3472 else {
3473 for (value = 0; value < 256; value++)
ffc61ed2 3474 if (isALNUMC(value))
936ed897 3475 ANYOF_BITMAP_SET(ret, value);
73b437c8 3476 }
ffc61ed2 3477 Perl_sv_catpvf(aTHX_ listsv, "+utf8::IsAlnum\n");
73b437c8
JH
3478 break;
3479 case ANYOF_NALNUMC:
3480 if (LOC)
936ed897 3481 ANYOF_CLASS_SET(ret, ANYOF_NALNUMC);
73b437c8
JH
3482 else {
3483 for (value = 0; value < 256; value++)
3484 if (!isALNUMC(value))
936ed897 3485 ANYOF_BITMAP_SET(ret, value);
73b437c8 3486 }
ffc61ed2 3487 Perl_sv_catpvf(aTHX_ listsv, "!utf8::IsAlnum\n");
73b437c8
JH
3488 break;
3489 case ANYOF_ALPHA:
3490 if (LOC)
936ed897 3491 ANYOF_CLASS_SET(ret, ANYOF_ALPHA);
73b437c8
JH
3492 else {
3493 for (value = 0; value < 256; value++)
3494 if (isALPHA(value))
936ed897 3495 ANYOF_BITMAP_SET(ret, value);
73b437c8 3496 }
ffc61ed2 3497 Perl_sv_catpvf(aTHX_ listsv, "+utf8::IsAlpha\n");
73b437c8
JH
3498 break;
3499 case ANYOF_NALPHA:
3500 if (LOC)
936ed897 3501 ANYOF_CLASS_SET(ret, ANYOF_NALPHA);
73b437c8
JH
3502 else {
3503 for (value = 0; value < 256; value++)
3504 if (!isALPHA(value))
936ed897 3505 ANYOF_BITMAP_SET(ret, value);
73b437c8 3506 }
ffc61ed2 3507 Perl_sv_catpvf(aTHX_ listsv, "!utf8::IsAlpha\n");
73b437c8
JH
3508 break;
3509 case ANYOF_ASCII:
3510 if (LOC)
936ed897 3511 ANYOF_CLASS_SET(ret, ANYOF_ASCII);
73b437c8 3512 else {
c7f1f016 3513#ifndef EBCDIC
1ba5c669
JH
3514 for (value = 0; value < 128; value++)
3515 ANYOF_BITMAP_SET(ret, value);
3516#else /* EBCDIC */
ffbc6a93
JH
3517 for (value = 0; value < 256; value++) {
3518 if (PL_hints & HINT_RE_ASCIIR) {
3519 if (NATIVE_TO_ASCII(value) < 128)
3520 ANYOF_BITMAP_SET(ret, value);
3521 }
3522 else {
3523 if (isASCII(value))
3524 ANYOF_BITMAP_SET(ret, value);
3525 }
3526 }
1ba5c669 3527#endif /* EBCDIC */
73b437c8 3528 }
ffc61ed2 3529 Perl_sv_catpvf(aTHX_ listsv, "+utf8::IsASCII\n");
73b437c8
JH
3530 break;
3531 case ANYOF_NASCII:
3532 if (LOC)
936ed897 3533 ANYOF_CLASS_SET(ret, ANYOF_NASCII);
73b437c8 3534 else {
c7f1f016 3535#ifndef EBCDIC
1ba5c669
JH
3536 for (value = 128; value < 256; value++)
3537 ANYOF_BITMAP_SET(ret, value);
3538#else /* EBCDIC */
ffbc6a93
JH
3539 for (value = 0; value < 256; value++) {
3540 if (PL_hints & HINT_RE_ASCIIR) {
3541 if (NATIVE_TO_ASCII(value) >= 128)
3542 ANYOF_BITMAP_SET(ret, value);
3543 }
3544 else {
3545 if (!isASCII(value))
3546 ANYOF_BITMAP_SET(ret, value);
3547 }
3548 }
1ba5c669 3549#endif /* EBCDIC */
73b437c8 3550 }
ffc61ed2 3551 Perl_sv_catpvf(aTHX_ listsv, "!utf8::IsASCII\n");
73b437c8 3552 break;
aaa51d5e
JF
3553 case ANYOF_BLANK:
3554 if (LOC)
3555 ANYOF_CLASS_SET(ret, ANYOF_BLANK);
3556 else {
3557 for (value = 0; value < 256; value++)
3558 if (isBLANK(value))
3559 ANYOF_BITMAP_SET(ret, value);
3560 }
ffc61ed2 3561 Perl_sv_catpvf(aTHX_ listsv, "+utf8::IsBlank\n");
aaa51d5e
JF
3562 break;
3563 case ANYOF_NBLANK:
3564 if (LOC)
3565 ANYOF_CLASS_SET(ret, ANYOF_NBLANK);
3566 else {
3567 for (value = 0; value < 256; value++)
3568 if (!isBLANK(value))
3569 ANYOF_BITMAP_SET(ret, value);
3570 }
ffc61ed2 3571 Perl_sv_catpvf(aTHX_ listsv, "!utf8::IsBlank\n");
aaa51d5e 3572 break;
73b437c8
JH
3573 case ANYOF_CNTRL:
3574 if (LOC)
936ed897 3575 ANYOF_CLASS_SET(ret, ANYOF_CNTRL);
73b437c8
JH
3576 else {
3577 for (value = 0; value < 256; value++)
3578 if (isCNTRL(value))
936ed897 3579 ANYOF_BITMAP_SET(ret, value);
73b437c8 3580 }
ffc61ed2 3581 Perl_sv_catpvf(aTHX_ listsv, "+utf8::IsCntrl\n");
73b437c8
JH
3582 break;
3583 case ANYOF_NCNTRL:
3584 if (LOC)
936ed897 3585 ANYOF_CLASS_SET(ret, ANYOF_NCNTRL);
73b437c8
JH
3586 else {
3587 for (value = 0; value < 256; value++)
3588 if (!isCNTRL(value))
936ed897 3589 ANYOF_BITMAP_SET(ret, value);
73b437c8 3590 }
ffc61ed2
JH
3591 Perl_sv_catpvf(aTHX_ listsv, "!utf8::IsCntrl\n");
3592 break;
3593 case ANYOF_DIGIT:
3594 if (LOC)
3595 ANYOF_CLASS_SET(ret, ANYOF_DIGIT);
3596 else {
3597 /* consecutive digits assumed */
3598 for (value = '0'; value <= '9'; value++)
3599 ANYOF_BITMAP_SET(ret, value);
3600 }
3601 Perl_sv_catpvf(aTHX_ listsv, "+utf8::IsDigit\n");
3602 break;
3603 case ANYOF_NDIGIT:
3604 if (LOC)
3605 ANYOF_CLASS_SET(ret, ANYOF_NDIGIT);
3606 else {
3607 /* consecutive digits assumed */
3608 for (value = 0; value < '0'; value++)
3609 ANYOF_BITMAP_SET(ret, value);
3610 for (value = '9' + 1; value < 256; value++)
3611 ANYOF_BITMAP_SET(ret, value);
3612 }
3613 Perl_sv_catpvf(aTHX_ listsv, "!utf8::IsDigit\n");
73b437c8
JH
3614 break;
3615 case ANYOF_GRAPH:
3616 if (LOC)
936ed897 3617 ANYOF_CLASS_SET(ret, ANYOF_GRAPH);
73b437c8
JH
3618 else {
3619 for (value = 0; value < 256; value++)
3620 if (isGRAPH(value))
936ed897 3621 ANYOF_BITMAP_SET(ret, value);
73b437c8 3622 }
ffc61ed2 3623 Perl_sv_catpvf(aTHX_ listsv, "+utf8::IsGraph\n");
73b437c8
JH
3624 break;
3625 case ANYOF_NGRAPH:
3626 if (LOC)
936ed897 3627 ANYOF_CLASS_SET(ret, ANYOF_NGRAPH);
73b437c8
JH
3628 else {
3629 for (value = 0; value < 256; value++)
3630 if (!isGRAPH(value))
936ed897 3631 ANYOF_BITMAP_SET(ret, value);
73b437c8 3632 }
ffc61ed2 3633 Perl_sv_catpvf(aTHX_ listsv, "!utf8::IsGraph\n");
73b437c8
JH
3634 break;
3635 case ANYOF_LOWER:
3636 if (LOC)
936ed897 3637 ANYOF_CLASS_SET(ret, ANYOF_LOWER);
73b437c8
JH
3638 else {
3639 for (value = 0; value < 256; value++)
3640 if (isLOWER(value))
936ed897 3641 ANYOF_BITMAP_SET(ret, value);
73b437c8 3642 }
ffc61ed2 3643 Perl_sv_catpvf(aTHX_ listsv, "+utf8::IsLower\n");
73b437c8
JH
3644 break;
3645 case ANYOF_NLOWER:
3646 if (LOC)
936ed897 3647 ANYOF_CLASS_SET(ret, ANYOF_NLOWER);
73b437c8
JH
3648 else {
3649 for (value = 0; value < 256; value++)
3650 if (!isLOWER(value))
936ed897 3651 ANYOF_BITMAP_SET(ret, value);
73b437c8 3652 }
ffc61ed2 3653 Perl_sv_catpvf(aTHX_ listsv, "!utf8::IsLower\n");
73b437c8
JH
3654 break;
3655 case ANYOF_PRINT:
3656 if (LOC)
936ed897 3657 ANYOF_CLASS_SET(ret, ANYOF_PRINT);
73b437c8
JH
3658 else {
3659 for (value = 0; value < 256; value++)
3660 if (isPRINT(value))
936ed897 3661 ANYOF_BITMAP_SET(ret, value);
73b437c8 3662 }
ffc61ed2 3663 Perl_sv_catpvf(aTHX_ listsv, "+utf8::IsPrint\n");
73b437c8
JH
3664 break;
3665 case ANYOF_NPRINT:
3666 if (LOC)
936ed897 3667 ANYOF_CLASS_SET(ret, ANYOF_NPRINT);
73b437c8
JH
3668 else {
3669 for (value = 0; value < 256; value++)
3670 if (!isPRINT(value))
936ed897 3671 ANYOF_BITMAP_SET(ret, value);
73b437c8 3672 }
ffc61ed2 3673 Perl_sv_catpvf(aTHX_ listsv, "!utf8::IsPrint\n");
73b437c8 3674 break;
aaa51d5e
JF
3675 case ANYOF_PSXSPC:
3676 if (LOC)
3677 ANYOF_CLASS_SET(ret, ANYOF_PSXSPC);
3678 else {
3679 for (value = 0; value < 256; value++)
3680 if (isPSXSPC(value))
3681 ANYOF_BITMAP_SET(ret, value);
3682 }
ffc61ed2 3683 Perl_sv_catpvf(aTHX_ listsv, "+utf8::IsSpace\n");
aaa51d5e
JF
3684 break;
3685 case ANYOF_NPSXSPC:
3686 if (LOC)
3687 ANYOF_CLASS_SET(ret, ANYOF_NPSXSPC);
3688 else {
3689 for (value = 0; value < 256; value++)
3690 if (!isPSXSPC(value))
3691 ANYOF_BITMAP_SET(ret, value);
3692 }
ffc61ed2 3693 Perl_sv_catpvf(aTHX_ listsv, "!utf8::IsSpace\n");
aaa51d5e 3694 break;
73b437c8
JH
3695 case ANYOF_PUNCT:
3696 if (LOC)
936ed897 3697 ANYOF_CLASS_SET(ret, ANYOF_PUNCT);
73b437c8
JH
3698 else {
3699 for (value = 0; value < 256; value++)
3700 if (isPUNCT(value))
936ed897 3701 ANYOF_BITMAP_SET(ret, value);
73b437c8 3702 }
ffc61ed2 3703 Perl_sv_catpvf(aTHX_ listsv, "+utf8::IsPunct\n");
73b437c8
JH
3704 break;
3705 case ANYOF_NPUNCT:
3706 if (LOC)
936ed897 3707 ANYOF_CLASS_SET(ret, ANYOF_NPUNCT);
73b437c8
JH
3708 else {
3709 for (value = 0; value < 256; value++)
3710 if (!isPUNCT(value))
936ed897 3711 ANYOF_BITMAP_SET(ret, value);
73b437c8 3712 }
ffc61ed2
JH
3713 Perl_sv_catpvf(aTHX_ listsv, "!utf8::IsPunct\n");
3714 break;
3715 case ANYOF_SPACE:
3716 if (LOC)
3717 ANYOF_CLASS_SET(ret, ANYOF_SPACE);
3718 else {
3719 for (value = 0; value < 256; value++)
3720 if (isSPACE(value))
3721 ANYOF_BITMAP_SET(ret, value);
3722 }
3723 Perl_sv_catpvf(aTHX_ listsv, "+utf8::IsSpacePerl\n");
3724 break;
3725 case ANYOF_NSPACE:
3726 if (LOC)
3727 ANYOF_CLASS_SET(ret, ANYOF_NSPACE);
3728 else {
3729 for (value = 0; value < 256; value++)
3730 if (!isSPACE(value))
3731 ANYOF_BITMAP_SET(ret, value);
3732 }
3733 Perl_sv_catpvf(aTHX_ listsv, "!utf8::IsSpacePerl\n");
73b437c8
JH
3734 break;
3735 case ANYOF_UPPER:
3736 if (LOC)
936ed897 3737 ANYOF_CLASS_SET(ret, ANYOF_UPPER);
73b437c8
JH
3738 else {
3739 for (value = 0; value < 256; value++)
3740 if (isUPPER(value))
936ed897 3741 ANYOF_BITMAP_SET(ret, value);
73b437c8 3742 }
ffc61ed2 3743 Perl_sv_catpvf(aTHX_ listsv, "+utf8::IsUpper\n");
73b437c8
JH
3744 break;
3745 case ANYOF_NUPPER:
3746 if (LOC)
936ed897 3747 ANYOF_CLASS_SET(ret, ANYOF_NUPPER);
73b437c8
JH
3748 else {
3749 for (value = 0; value < 256; value++)
3750 if (!isUPPER(value))
936ed897 3751 ANYOF_BITMAP_SET(ret, value);
73b437c8 3752 }
ffc61ed2 3753 Perl_sv_catpvf(aTHX_ listsv, "!utf8::IsUpper\n");
73b437c8
JH
3754 break;
3755 case ANYOF_XDIGIT:
3756 if (LOC)
936ed897 3757 ANYOF_CLASS_SET(ret, ANYOF_XDIGIT);
73b437c8
JH
3758 else {
3759 for (value = 0; value < 256; value++)
3760 if (isXDIGIT(value))
936ed897 3761 ANYOF_BITMAP_SET(ret, value);
73b437c8 3762 }
ffc61ed2 3763 Perl_sv_catpvf(aTHX_ listsv, "+utf8::IsXDigit\n");
73b437c8
JH
3764 break;
3765 case ANYOF_NXDIGIT:
3766 if (LOC)
936ed897 3767 ANYOF_CLASS_SET(ret, ANYOF_NXDIGIT);
73b437c8
JH
3768 else {
3769 for (value = 0; value < 256; value++)
3770 if (!isXDIGIT(value))
936ed897 3771 ANYOF_BITMAP_SET(ret, value);
73b437c8 3772 }
ffc61ed2 3773 Perl_sv_catpvf(aTHX_ listsv, "!utf8::IsXDigit\n");
73b437c8
JH
3774 break;
3775 default:
b45f050a 3776 vFAIL("Invalid [::] class");
73b437c8 3777 break;
b8c5462f 3778 }
b8c5462f 3779 if (LOC)
936ed897 3780 ANYOF_FLAGS(ret) |= ANYOF_CLASS;
73b437c8 3781 continue;
a0d0e21e 3782 }
ffc61ed2
JH
3783 } /* end of namedclass \blah */
3784
a0d0e21e 3785 if (range) {
3568d838
JH
3786 if (((prevvalue > value) && !(PL_hints & HINT_RE_ASCIIR)) ||
3787 ((NATIVE_TO_UNI(prevvalue) > NATIVE_TO_UNI(value)) &&
3788 (PL_hints & HINT_RE_ASCIIR))) /* b-a */ {
b45f050a 3789 Simple_vFAIL4("Invalid [] range \"%*.*s\"",
830247a4
IZ
3790 RExC_parse - rangebegin,
3791 RExC_parse - rangebegin,
b45f050a 3792 rangebegin);
3568d838 3793 range = 0; /* not a valid range */
73b437c8 3794 }
a0d0e21e
LW
3795 }
3796 else {
3568d838 3797 prevvalue = value; /* save the beginning of the range */
830247a4
IZ
3798 if (*RExC_parse == '-' && RExC_parse+1 < RExC_end &&
3799 RExC_parse[1] != ']') {
3800 RExC_parse++;
ffc61ed2
JH
3801
3802 /* a bad range like \w-, [:word:]- ? */
3803 if (namedclass > OOB_NAMEDCLASS) {
e476b1b5 3804 if (ckWARN(WARN_REGEXP))
830247a4 3805 vWARN4(RExC_parse,
b45f050a 3806 "False [] range \"%*.*s\"",
830247a4
IZ
3807 RExC_parse - rangebegin,
3808 RExC_parse - rangebegin,
b45f050a 3809 rangebegin);
73b437c8 3810 if (!SIZE_ONLY)
936ed897 3811 ANYOF_BITMAP_SET(ret, '-');
73b437c8 3812 } else
ffc61ed2
JH
3813 range = 1; /* yeah, it's a range! */
3814 continue; /* but do it the next time */
a0d0e21e 3815 }
a687059c 3816 }
ffc61ed2 3817
93733859 3818 /* now is the next time */
ae5c130c 3819 if (!SIZE_ONLY) {
3568d838
JH
3820 IV i;
3821
3822 if (prevvalue < 256) {
3823 IV ceilvalue = value < 256 ? value : 255;
3824
3825#ifdef EBCDIC
ba953a12 3826 if (PL_hints & HINT_RE_ASCIIR) {
ffbc6a93 3827 /* New style scheme for ranges:
ffbc6a93
JH
3828 * use re 'asciir';
3829 * do ranges in ASCII/Unicode space
3830 */
3568d838
JH
3831 for (i = NATIVE_TO_ASCII(prevvalue);
3832 i <= NATIVE_TO_ASCII(ceilvalue);
3833 i++)
3834 ANYOF_BITMAP_SET(ret, ASCII_TO_NATIVE(i));
ffbc6a93 3835 }
3568d838
JH
3836 else if ((isLOWER(prevvalue) && isLOWER(ceilvalue)) ||
3837 (isUPPER(prevvalue) && isUPPER(ceilvalue)))
ffc61ed2 3838 {
3568d838
JH
3839 if (isLOWER(prevvalue)) {
3840 for (i = prevvalue; i <= ceilvalue; i++)
ffc61ed2
JH
3841 if (isLOWER(i))
3842 ANYOF_BITMAP_SET(ret, i);
3843 } else {
3568d838 3844 for (i = prevvalue; i <= ceilvalue; i++)
ffc61ed2
JH
3845 if (isUPPER(i))
3846 ANYOF_BITMAP_SET(ret, i);
3847 }
8ada0baa 3848 }
ffc61ed2 3849 else
8ada0baa 3850#endif
3568d838
JH
3851 for (i = prevvalue; i <= ceilvalue; i++)
3852 ANYOF_BITMAP_SET(ret, i);
3853 }
3854 if (value > 255) {
ffc61ed2 3855 ANYOF_FLAGS(ret) |= ANYOF_UNICODE;
3568d838 3856 if (prevvalue < value)
ffc61ed2 3857 Perl_sv_catpvf(aTHX_ listsv, "%04"UVxf"\t%04"UVxf"\n",
3568d838
JH
3858 (UV)prevvalue, (UV)value);
3859 else if (prevvalue == value)
ffc61ed2
JH
3860 Perl_sv_catpvf(aTHX_ listsv, "%04"UVxf"\n",
3861 (UV)value);
3862 }
8ada0baa 3863 }
ffc61ed2
JH
3864
3865 range = 0; /* this range (if it was one) is done now */
a0d0e21e 3866 }
ffc61ed2 3867
936ed897 3868 if (need_class) {
4f66b38d 3869 ANYOF_FLAGS(ret) |= ANYOF_LARGE;
936ed897 3870 if (SIZE_ONLY)
830247a4 3871 RExC_size += ANYOF_CLASS_ADD_SKIP;
936ed897 3872 else
830247a4 3873 RExC_emit += ANYOF_CLASS_ADD_SKIP;
936ed897 3874 }
ffc61ed2 3875
ae5c130c 3876 /* optimize case-insensitive simple patterns (e.g. /[a-z]/i) */
b8c5462f 3877 if (!SIZE_ONLY &&
ffc61ed2
JH
3878 (ANYOF_FLAGS(ret) &
3879 /* If the only flag is folding (plus possibly inversion). */
3880 (ANYOF_FLAGS_ALL ^ ANYOF_INVERT) == ANYOF_FOLD)) {
a0ed51b3 3881 for (value = 0; value < 256; ++value) {
936ed897 3882 if (ANYOF_BITMAP_TEST(ret, value)) {
ffc61ed2
JH
3883 IV fold = PL_fold[value];
3884
3885 if (fold != value)
3886 ANYOF_BITMAP_SET(ret, fold);
ae5c130c
GS
3887 }
3888 }
936ed897 3889 ANYOF_FLAGS(ret) &= ~ANYOF_FOLD;
ae5c130c 3890 }
ffc61ed2 3891
ae5c130c 3892 /* optimize inverted simple patterns (e.g. [^a-z]) */
3568d838 3893 if (!SIZE_ONLY && optimize_invert &&
ffc61ed2
JH
3894 /* If the only flag is inversion. */
3895 (ANYOF_FLAGS(ret) & ANYOF_FLAGS_ALL) == ANYOF_INVERT) {
b8c5462f 3896 for (value = 0; value < ANYOF_BITMAP_SIZE; ++value)
936ed897 3897 ANYOF_BITMAP(ret)[value] ^= ANYOF_FLAGS_ALL;
1aa99e6b 3898 ANYOF_FLAGS(ret) = ANYOF_UNICODE_ALL;
ae5c130c 3899 }
a0d0e21e 3900
b81d288d 3901 if (!SIZE_ONLY) {
fde631ed 3902 AV *av = newAV();
ffc61ed2
JH
3903 SV *rv;
3904
3905 av_store(av, 0, listsv);
3906 av_store(av, 1, NULL);
3907 rv = newRV_noinc((SV*)av);
19860706 3908 n = add_data(pRExC_state, 1, "s");
830247a4 3909 RExC_rx->data->data[n] = (void*)rv;
ffc61ed2 3910 ARG_SET(ret, n);
a0ed51b3
LW
3911 }
3912
3913 return ret;
3914}
3915
76e3520e 3916STATIC char*
830247a4 3917S_nextchar(pTHX_ RExC_state_t *pRExC_state)
a0d0e21e 3918{
830247a4 3919 char* retval = RExC_parse++;
a0d0e21e 3920
4633a7c4 3921 for (;;) {
830247a4
IZ
3922 if (*RExC_parse == '(' && RExC_parse[1] == '?' &&
3923 RExC_parse[2] == '#') {
3924 while (*RExC_parse && *RExC_parse != ')')
3925 RExC_parse++;
3926 RExC_parse++;
4633a7c4
LW
3927 continue;
3928 }
830247a4
IZ
3929 if (RExC_flags16 & PMf_EXTENDED) {
3930 if (isSPACE(*RExC_parse)) {
3931 RExC_parse++;
748a9306
LW
3932 continue;
3933 }
830247a4
IZ
3934 else if (*RExC_parse == '#') {
3935 while (*RExC_parse && *RExC_parse != '\n')
3936 RExC_parse++;
3937 RExC_parse++;
748a9306
LW
3938 continue;
3939 }
748a9306 3940 }
4633a7c4 3941 return retval;
a0d0e21e 3942 }
a687059c
LW
3943}
3944
3945/*
c277df42 3946- reg_node - emit a node
a0d0e21e 3947*/
76e3520e 3948STATIC regnode * /* Location. */
830247a4 3949S_reg_node(pTHX_ RExC_state_t *pRExC_state, U8 op)
a687059c 3950{
c277df42
IZ
3951 register regnode *ret;
3952 register regnode *ptr;
a687059c 3953
830247a4 3954 ret = RExC_emit;
c277df42 3955 if (SIZE_ONLY) {
830247a4
IZ
3956 SIZE_ALIGN(RExC_size);
3957 RExC_size += 1;
a0d0e21e
LW
3958 return(ret);
3959 }
a687059c 3960
c277df42 3961 NODE_ALIGN_FILL(ret);
a0d0e21e 3962 ptr = ret;
c277df42 3963 FILL_ADVANCE_NODE(ptr, op);
fac92740
MJD
3964 if (RExC_offsets) { /* MJD */
3965 MJD_OFFSET_DEBUG((stderr, "%s:%u: (op %s) %s %u <- %u (len %u) (max %u).\n",
3966 "reg_node", __LINE__,
3967 reg_name[op],
3968 RExC_emit - RExC_emit_start > RExC_offsets[0]
3969 ? "Overwriting end of array!\n" : "OK",
3970 RExC_emit - RExC_emit_start,
3971 RExC_parse - RExC_start,
3972 RExC_offsets[0]));
3973 Set_Node_Offset(RExC_emit, RExC_parse + (op == END));
3974 }
3975
830247a4 3976 RExC_emit = ptr;
a687059c 3977
a0d0e21e 3978 return(ret);
a687059c
LW
3979}
3980
3981/*
a0d0e21e
LW
3982- reganode - emit a node with an argument
3983*/
76e3520e 3984STATIC regnode * /* Location. */
830247a4 3985S_reganode(pTHX_ RExC_state_t *pRExC_state, U8 op, U32 arg)
fe14fcc3 3986{
c277df42
IZ
3987 register regnode *ret;
3988 register regnode *ptr;
fe14fcc3 3989
830247a4 3990 ret = RExC_emit;
c277df42 3991 if (SIZE_ONLY) {
830247a4
IZ
3992 SIZE_ALIGN(RExC_size);
3993 RExC_size += 2;
a0d0e21e
LW
3994 return(ret);
3995 }
fe14fcc3 3996
c277df42 3997 NODE_ALIGN_FILL(ret);
a0d0e21e 3998 ptr = ret;
c277df42 3999 FILL_ADVANCE_NODE_ARG(ptr, op, arg);
fac92740
MJD
4000 if (RExC_offsets) { /* MJD */
4001 MJD_OFFSET_DEBUG((stderr, "%s: %s %u <- %u (max %u).\n",
4002 "reganode",
4003 RExC_emit - RExC_emit_start > RExC_offsets[0] ?
4004 "Overwriting end of array!\n" : "OK",
4005 RExC_emit - RExC_emit_start,
4006 RExC_parse - RExC_start,
4007 RExC_offsets[0]));
4008 Set_Cur_Node_Offset;
4009 }
4010
830247a4 4011 RExC_emit = ptr;
fe14fcc3 4012
a0d0e21e 4013 return(ret);
fe14fcc3
LW
4014}
4015
4016/*
cd439c50 4017- reguni - emit (if appropriate) a Unicode character
a0ed51b3
LW
4018*/
4019STATIC void
830247a4 4020S_reguni(pTHX_ RExC_state_t *pRExC_state, UV uv, char* s, STRLEN* lenp)
a0ed51b3 4021{
5e12f4fb 4022 *lenp = SIZE_ONLY ? UNISKIP(uv) : (uvchr_to_utf8((U8*)s, uv) - (U8*)s);
a0ed51b3
LW
4023}
4024
4025/*
a0d0e21e
LW
4026- reginsert - insert an operator in front of already-emitted operand
4027*
4028* Means relocating the operand.
4029*/
76e3520e 4030STATIC void
830247a4 4031S_reginsert(pTHX_ RExC_state_t *pRExC_state, U8 op, regnode *opnd)
a687059c 4032{
c277df42
IZ
4033 register regnode *src;
4034 register regnode *dst;
4035 register regnode *place;
4036 register int offset = regarglen[(U8)op];
b81d288d 4037
22c35a8c 4038/* (PL_regkind[(U8)op] == CURLY ? EXTRA_STEP_2ARGS : 0); */
c277df42
IZ
4039
4040 if (SIZE_ONLY) {
830247a4 4041 RExC_size += NODE_STEP_REGNODE + offset;
a0d0e21e
LW
4042 return;
4043 }
a687059c 4044
830247a4
IZ
4045 src = RExC_emit;
4046 RExC_emit += NODE_STEP_REGNODE + offset;
4047 dst = RExC_emit;
fac92740 4048 while (src > opnd) {
c277df42 4049 StructCopy(--src, --dst, regnode);
fac92740
MJD
4050 if (RExC_offsets) { /* MJD 20010112 */
4051 MJD_OFFSET_DEBUG((stderr, "%s: %s copy %u -> %u (max %u).\n",
4052 "reg_insert",
4053 dst - RExC_emit_start > RExC_offsets[0]
4054 ? "Overwriting end of array!\n" : "OK",
4055 src - RExC_emit_start,
4056 dst - RExC_emit_start,
4057 RExC_offsets[0]));
4058 Set_Node_Offset_To_R(dst-RExC_emit_start, Node_Offset(src));
4059 Set_Node_Length_To_R(dst-RExC_emit_start, Node_Length(src));
4060 }
4061 }
4062
a0d0e21e
LW
4063
4064 place = opnd; /* Op node, where operand used to be. */
fac92740
MJD
4065 if (RExC_offsets) { /* MJD */
4066 MJD_OFFSET_DEBUG((stderr, "%s: %s %u <- %u (max %u).\n",
4067 "reginsert",
4068 place - RExC_emit_start > RExC_offsets[0]
4069 ? "Overwriting end of array!\n" : "OK",
4070 place - RExC_emit_start,
4071 RExC_parse - RExC_start,
4072 RExC_offsets[0]));
4073 Set_Node_Offset(place, RExC_parse);
4074 }
c277df42
IZ
4075 src = NEXTOPER(place);
4076 FILL_ADVANCE_NODE(place, op);
4077 Zero(src, offset, regnode);
a687059c
LW
4078}
4079
4080/*
c277df42 4081- regtail - set the next-pointer at the end of a node chain of p to val.
a0d0e21e 4082*/
76e3520e 4083STATIC void
830247a4 4084S_regtail(pTHX_ RExC_state_t *pRExC_state, regnode *p, regnode *val)
a687059c 4085{
c277df42
IZ
4086 register regnode *scan;
4087 register regnode *temp;
a0d0e21e 4088
c277df42 4089 if (SIZE_ONLY)
a0d0e21e
LW
4090 return;
4091
4092 /* Find last node. */
4093 scan = p;
4094 for (;;) {
4095 temp = regnext(scan);
4096 if (temp == NULL)
4097 break;
4098 scan = temp;
4099 }
a687059c 4100
c277df42
IZ
4101 if (reg_off_by_arg[OP(scan)]) {
4102 ARG_SET(scan, val - scan);
a0ed51b3
LW
4103 }
4104 else {
c277df42
IZ
4105 NEXT_OFF(scan) = val - scan;
4106 }
a687059c
LW
4107}
4108
4109/*
a0d0e21e
LW
4110- regoptail - regtail on operand of first argument; nop if operandless
4111*/
76e3520e 4112STATIC void
830247a4 4113S_regoptail(pTHX_ RExC_state_t *pRExC_state, regnode *p, regnode *val)
a687059c 4114{
a0d0e21e 4115 /* "Operandless" and "op != BRANCH" are synonymous in practice. */
c277df42
IZ
4116 if (p == NULL || SIZE_ONLY)
4117 return;
22c35a8c 4118 if (PL_regkind[(U8)OP(p)] == BRANCH) {
830247a4 4119 regtail(pRExC_state, NEXTOPER(p), val);
a0ed51b3 4120 }
22c35a8c 4121 else if ( PL_regkind[(U8)OP(p)] == BRANCHJ) {
830247a4 4122 regtail(pRExC_state, NEXTOPER(NEXTOPER(p)), val);
a0ed51b3
LW
4123 }
4124 else
a0d0e21e 4125 return;
a687059c
LW
4126}
4127
4128/*
4129 - regcurly - a little FSA that accepts {\d+,?\d*}
4130 */
79072805 4131STATIC I32
cea2e8a9 4132S_regcurly(pTHX_ register char *s)
a687059c
LW
4133{
4134 if (*s++ != '{')
4135 return FALSE;
f0fcb552 4136 if (!isDIGIT(*s))
a687059c 4137 return FALSE;
f0fcb552 4138 while (isDIGIT(*s))
a687059c
LW
4139 s++;
4140 if (*s == ',')
4141 s++;
f0fcb552 4142 while (isDIGIT(*s))
a687059c
LW
4143 s++;
4144 if (*s != '}')
4145 return FALSE;
4146 return TRUE;
4147}
4148
a687059c 4149
76e3520e 4150STATIC regnode *
cea2e8a9 4151S_dumpuntil(pTHX_ regnode *start, regnode *node, regnode *last, SV* sv, I32 l)
c277df42 4152{
35ff7856 4153#ifdef DEBUGGING
f248d071 4154 register U8 op = EXACT; /* Arbitrary non-END op. */
155aba94 4155 register regnode *next;
c277df42
IZ
4156
4157 while (op != END && (!last || node < last)) {
4158 /* While that wasn't END last time... */
4159
4160 NODE_ALIGN(node);
4161 op = OP(node);
4162 if (op == CLOSE)
4163 l--;
4164 next = regnext(node);
4165 /* Where, what. */
4166 if (OP(node) == OPTIMIZED)
4167 goto after_print;
4168 regprop(sv, node);
b900a521 4169 PerlIO_printf(Perl_debug_log, "%4"IVdf":%*s%s", (IV)(node - start),
f1dbda3d 4170 (int)(2*l + 1), "", SvPVX(sv));
c277df42
IZ
4171 if (next == NULL) /* Next ptr. */
4172 PerlIO_printf(Perl_debug_log, "(0)");
b81d288d 4173 else
b900a521 4174 PerlIO_printf(Perl_debug_log, "(%"IVdf")", (IV)(next - start));
c277df42
IZ
4175 (void)PerlIO_putc(Perl_debug_log, '\n');
4176 after_print:
22c35a8c 4177 if (PL_regkind[(U8)op] == BRANCHJ) {
b81d288d
AB
4178 register regnode *nnode = (OP(next) == LONGJMP
4179 ? regnext(next)
c277df42
IZ
4180 : next);
4181 if (last && nnode > last)
4182 nnode = last;
4183 node = dumpuntil(start, NEXTOPER(NEXTOPER(node)), nnode, sv, l + 1);
a0ed51b3 4184 }
22c35a8c 4185 else if (PL_regkind[(U8)op] == BRANCH) {
c277df42 4186 node = dumpuntil(start, NEXTOPER(node), next, sv, l + 1);
a0ed51b3
LW
4187 }
4188 else if ( op == CURLY) { /* `next' might be very big: optimizer */
c277df42
IZ
4189 node = dumpuntil(start, NEXTOPER(node) + EXTRA_STEP_2ARGS,
4190 NEXTOPER(node) + EXTRA_STEP_2ARGS + 1, sv, l + 1);
a0ed51b3 4191 }
22c35a8c 4192 else if (PL_regkind[(U8)op] == CURLY && op != CURLYX) {
c277df42
IZ
4193 node = dumpuntil(start, NEXTOPER(node) + EXTRA_STEP_2ARGS,
4194 next, sv, l + 1);
a0ed51b3
LW
4195 }
4196 else if ( op == PLUS || op == STAR) {
c277df42 4197 node = dumpuntil(start, NEXTOPER(node), NEXTOPER(node) + 1, sv, l + 1);
a0ed51b3
LW
4198 }
4199 else if (op == ANYOF) {
4f66b38d
HS
4200 /* arglen 1 + class block */
4201 node += 1 + ((ANYOF_FLAGS(node) & ANYOF_LARGE)
4202 ? ANYOF_CLASS_SKIP : ANYOF_SKIP);
4203 node = NEXTOPER(node);
a0ed51b3 4204 }
22c35a8c 4205 else if (PL_regkind[(U8)op] == EXACT) {
c277df42 4206 /* Literal string, where present. */
cd439c50 4207 node += NODE_SZ_STR(node) - 1;
c277df42 4208 node = NEXTOPER(node);
a0ed51b3
LW
4209 }
4210 else {
c277df42
IZ
4211 node = NEXTOPER(node);
4212 node += regarglen[(U8)op];
4213 }
4214 if (op == CURLYX || op == OPEN)
4215 l++;
4216 else if (op == WHILEM)
4217 l--;
4218 }
17c3b450 4219#endif /* DEBUGGING */
c277df42
IZ
4220 return node;
4221}
4222
a687059c 4223/*
fd181c75 4224 - regdump - dump a regexp onto Perl_debug_log in vaguely comprehensible form
a687059c
LW
4225 */
4226void
864dbfa3 4227Perl_regdump(pTHX_ regexp *r)
a687059c 4228{
35ff7856 4229#ifdef DEBUGGING
46fc3d4c 4230 SV *sv = sv_newmortal();
a687059c 4231
c277df42 4232 (void)dumpuntil(r->program, r->program + 1, NULL, sv, 0);
a0d0e21e
LW
4233
4234 /* Header fields of interest. */
c277df42 4235 if (r->anchored_substr)
7b0972df 4236 PerlIO_printf(Perl_debug_log,
b81d288d 4237 "anchored `%s%.*s%s'%s at %"IVdf" ",
3280af22 4238 PL_colors[0],
7b0972df 4239 (int)(SvCUR(r->anchored_substr) - (SvTAIL(r->anchored_substr)!=0)),
b81d288d 4240 SvPVX(r->anchored_substr),
3280af22 4241 PL_colors[1],
c277df42 4242 SvTAIL(r->anchored_substr) ? "$" : "",
7b0972df 4243 (IV)r->anchored_offset);
c277df42 4244 if (r->float_substr)
7b0972df 4245 PerlIO_printf(Perl_debug_log,
b81d288d 4246 "floating `%s%.*s%s'%s at %"IVdf"..%"UVuf" ",
3280af22 4247 PL_colors[0],
b81d288d 4248 (int)(SvCUR(r->float_substr) - (SvTAIL(r->float_substr)!=0)),
2c2d71f5 4249 SvPVX(r->float_substr),
3280af22 4250 PL_colors[1],
c277df42 4251 SvTAIL(r->float_substr) ? "$" : "",
7b0972df 4252 (IV)r->float_min_offset, (UV)r->float_max_offset);
c277df42 4253 if (r->check_substr)
b81d288d
AB
4254 PerlIO_printf(Perl_debug_log,
4255 r->check_substr == r->float_substr
c277df42
IZ
4256 ? "(checking floating" : "(checking anchored");
4257 if (r->reganch & ROPT_NOSCAN)
4258 PerlIO_printf(Perl_debug_log, " noscan");
4259 if (r->reganch & ROPT_CHECK_ALL)
4260 PerlIO_printf(Perl_debug_log, " isall");
4261 if (r->check_substr)
4262 PerlIO_printf(Perl_debug_log, ") ");
4263
46fc3d4c 4264 if (r->regstclass) {
4265 regprop(sv, r->regstclass);
4266 PerlIO_printf(Perl_debug_log, "stclass `%s' ", SvPVX(sv));
4267 }
774d564b 4268 if (r->reganch & ROPT_ANCH) {
4269 PerlIO_printf(Perl_debug_log, "anchored");
4270 if (r->reganch & ROPT_ANCH_BOL)
4271 PerlIO_printf(Perl_debug_log, "(BOL)");
c277df42
IZ
4272 if (r->reganch & ROPT_ANCH_MBOL)
4273 PerlIO_printf(Perl_debug_log, "(MBOL)");
cad2e5aa
JH
4274 if (r->reganch & ROPT_ANCH_SBOL)
4275 PerlIO_printf(Perl_debug_log, "(SBOL)");
774d564b 4276 if (r->reganch & ROPT_ANCH_GPOS)
4277 PerlIO_printf(Perl_debug_log, "(GPOS)");
4278 PerlIO_putc(Perl_debug_log, ' ');
4279 }
c277df42
IZ
4280 if (r->reganch & ROPT_GPOS_SEEN)
4281 PerlIO_printf(Perl_debug_log, "GPOS ");
a0d0e21e 4282 if (r->reganch & ROPT_SKIP)
760ac839 4283 PerlIO_printf(Perl_debug_log, "plus ");
a0d0e21e 4284 if (r->reganch & ROPT_IMPLICIT)
760ac839 4285 PerlIO_printf(Perl_debug_log, "implicit ");
760ac839 4286 PerlIO_printf(Perl_debug_log, "minlen %ld ", (long) r->minlen);
ce862d02
IZ
4287 if (r->reganch & ROPT_EVAL_SEEN)
4288 PerlIO_printf(Perl_debug_log, "with eval ");
760ac839 4289 PerlIO_printf(Perl_debug_log, "\n");
fac92740
MJD
4290 if (r->offsets) {
4291 U32 i;
4292 U32 len = r->offsets[0];
392fbf5d 4293 PerlIO_printf(Perl_debug_log, "Offsets: [%"UVuf"]\n\t", (UV)r->offsets[0]);
fac92740 4294 for (i = 1; i <= len; i++)
392fbf5d
RB
4295 PerlIO_printf(Perl_debug_log, "%"UVuf"[%"UVuf"] ",
4296 (UV)r->offsets[i*2-1],
4297 (UV)r->offsets[i*2]);
fac92740
MJD
4298 PerlIO_printf(Perl_debug_log, "\n");
4299 }
17c3b450 4300#endif /* DEBUGGING */
a687059c
LW
4301}
4302
653099ff
GS
4303STATIC void
4304S_put_byte(pTHX_ SV *sv, int c)
4305{
ffc61ed2 4306 if (isCNTRL(c) || c == 127 || c == 255 || !isPRINT(c))
653099ff
GS
4307 Perl_sv_catpvf(aTHX_ sv, "\\%o", c);
4308 else if (c == '-' || c == ']' || c == '\\' || c == '^')
4309 Perl_sv_catpvf(aTHX_ sv, "\\%c", c);
4310 else
4311 Perl_sv_catpvf(aTHX_ sv, "%c", c);
4312}
4313
a687059c 4314/*
a0d0e21e
LW
4315- regprop - printable representation of opcode
4316*/
46fc3d4c 4317void
864dbfa3 4318Perl_regprop(pTHX_ SV *sv, regnode *o)
a687059c 4319{
35ff7856 4320#ifdef DEBUGGING
9b155405 4321 register int k;
a0d0e21e 4322
54dc92de 4323 sv_setpvn(sv, "", 0);
9b155405 4324 if (OP(o) >= reg_num) /* regnode.type is unsigned */
830247a4
IZ
4325 /* It would be nice to FAIL() here, but this may be called from
4326 regexec.c, and it would be hard to supply pRExC_state. */
4327 Perl_croak(aTHX_ "Corrupted regexp opcode");
9b155405
IZ
4328 sv_catpv(sv, (char*)reg_name[OP(o)]); /* Take off const! */
4329
4330 k = PL_regkind[(U8)OP(o)];
4331
4332 if (k == EXACT)
7821416a 4333 Perl_sv_catpvf(aTHX_ sv, " <%s%.*s%s>", PL_colors[0],
cd439c50 4334 STR_LEN(o), STRING(o), PL_colors[1]);
9b155405 4335 else if (k == CURLY) {
cb434fcc 4336 if (OP(o) == CURLYM || OP(o) == CURLYN || OP(o) == CURLYX)
cea2e8a9
GS
4337 Perl_sv_catpvf(aTHX_ sv, "[%d]", o->flags); /* Parenth number */
4338 Perl_sv_catpvf(aTHX_ sv, " {%d,%d}", ARG1(o), ARG2(o));
a0d0e21e 4339 }
2c2d71f5
JH
4340 else if (k == WHILEM && o->flags) /* Ordinal/of */
4341 Perl_sv_catpvf(aTHX_ sv, "[%d/%d]", o->flags & 0xf, o->flags>>4);
9b155405 4342 else if (k == REF || k == OPEN || k == CLOSE || k == GROUPP )
894356b3 4343 Perl_sv_catpvf(aTHX_ sv, "%d", (int)ARG(o)); /* Parenth number */
9b155405 4344 else if (k == LOGICAL)
04ebc1ab 4345 Perl_sv_catpvf(aTHX_ sv, "[%d]", o->flags); /* 2: embedded, otherwise 1 */
653099ff
GS
4346 else if (k == ANYOF) {
4347 int i, rangestart = -1;
ffc61ed2 4348 U8 flags = ANYOF_FLAGS(o);
19860706
JH
4349 const char * const anyofs[] = { /* Should be syncronized with
4350 * ANYOF_ #xdefines in regcomp.h */
653099ff
GS
4351 "\\w",
4352 "\\W",
4353 "\\s",
4354 "\\S",
4355 "\\d",
4356 "\\D",
4357 "[:alnum:]",
4358 "[:^alnum:]",
4359 "[:alpha:]",
4360 "[:^alpha:]",
4361 "[:ascii:]",
4362 "[:^ascii:]",
4363 "[:ctrl:]",
4364 "[:^ctrl:]",
4365 "[:graph:]",
4366 "[:^graph:]",
4367 "[:lower:]",
4368 "[:^lower:]",
4369 "[:print:]",
4370 "[:^print:]",
4371 "[:punct:]",
4372 "[:^punct:]",
4373 "[:upper:]",
aaa51d5e 4374 "[:^upper:]",
653099ff 4375 "[:xdigit:]",
aaa51d5e
JF
4376 "[:^xdigit:]",
4377 "[:space:]",
4378 "[:^space:]",
4379 "[:blank:]",
4380 "[:^blank:]"
653099ff
GS
4381 };
4382
19860706 4383 if (flags & ANYOF_LOCALE)
653099ff 4384 sv_catpv(sv, "{loc}");
19860706 4385 if (flags & ANYOF_FOLD)
653099ff
GS
4386 sv_catpv(sv, "{i}");
4387 Perl_sv_catpvf(aTHX_ sv, "[%s", PL_colors[0]);
19860706 4388 if (flags & ANYOF_INVERT)
653099ff 4389 sv_catpv(sv, "^");
ffc61ed2
JH
4390 for (i = 0; i <= 256; i++) {
4391 if (i < 256 && ANYOF_BITMAP_TEST(o,i)) {
4392 if (rangestart == -1)
4393 rangestart = i;
4394 } else if (rangestart != -1) {
4395 if (i <= rangestart + 3)
4396 for (; rangestart < i; rangestart++)
653099ff 4397 put_byte(sv, rangestart);
ffc61ed2
JH
4398 else {
4399 put_byte(sv, rangestart);
4400 sv_catpv(sv, "-");
4401 put_byte(sv, i - 1);
653099ff 4402 }
ffc61ed2 4403 rangestart = -1;
653099ff 4404 }
847a199f 4405 }
ffc61ed2
JH
4406
4407 if (o->flags & ANYOF_CLASS)
4408 for (i = 0; i < sizeof(anyofs)/sizeof(char*); i++)
4409 if (ANYOF_CLASS_TEST(o,i))
4410 sv_catpv(sv, anyofs[i]);
4411
4412 if (flags & ANYOF_UNICODE)
4413 sv_catpv(sv, "{unicode}");
1aa99e6b
IH
4414 else if (flags & ANYOF_UNICODE_ALL)
4415 sv_catpv(sv, "{all-unicode}");
ffc61ed2
JH
4416
4417 {
4418 SV *lv;
4419 SV *sw = regclass_swash(o, FALSE, &lv);
b81d288d 4420
ffc61ed2
JH
4421 if (lv) {
4422 if (sw) {
4423 UV i;
4424 U8 s[UTF8_MAXLEN+1];
b81d288d 4425
ffc61ed2 4426 for (i = 0; i <= 256; i++) { /* just the first 256 */
2b9d42f0 4427 U8 *e = uvchr_to_utf8(s, i);
ffc61ed2 4428
3568d838 4429 if (i < 256 && swash_fetch(sw, s, TRUE)) {
ffc61ed2
JH
4430 if (rangestart == -1)
4431 rangestart = i;
4432 } else if (rangestart != -1) {
4433 U8 *p;
b81d288d 4434
ffc61ed2
JH
4435 if (i <= rangestart + 3)
4436 for (; rangestart < i; rangestart++) {
2b9d42f0 4437 for(e = uvchr_to_utf8(s, rangestart), p = s; p < e; p++)
ffc61ed2
JH
4438 put_byte(sv, *p);
4439 }
4440 else {
2b9d42f0 4441 for (e = uvchr_to_utf8(s, rangestart), p = s; p < e; p++)
ffc61ed2
JH
4442 put_byte(sv, *p);
4443 sv_catpv(sv, "-");
2b9d42f0 4444 for (e = uvchr_to_utf8(s, i - 1), p = s; p < e; p++)
ffc61ed2
JH
4445 put_byte(sv, *p);
4446 }
4447 rangestart = -1;
4448 }
19860706 4449 }
ffc61ed2
JH
4450
4451 sv_catpv(sv, "..."); /* et cetera */
19860706 4452 }
fde631ed 4453
ffc61ed2
JH
4454 {
4455 char *s = savepv(SvPVX(lv));
4456 char *origs = s;
b81d288d 4457
ffc61ed2 4458 while(*s && *s != '\n') s++;
b81d288d 4459
ffc61ed2
JH
4460 if (*s == '\n') {
4461 char *t = ++s;
4462
4463 while (*s) {
4464 if (*s == '\n')
4465 *s = ' ';
4466 s++;
4467 }
4468 if (s[-1] == ' ')
4469 s[-1] = 0;
4470
4471 sv_catpv(sv, t);
fde631ed 4472 }
b81d288d 4473
ffc61ed2 4474 Safefree(origs);
fde631ed
JH
4475 }
4476 }
653099ff 4477 }
ffc61ed2 4478
653099ff
GS
4479 Perl_sv_catpvf(aTHX_ sv, "%s]", PL_colors[1]);
4480 }
9b155405 4481 else if (k == BRANCHJ && (OP(o) == UNLESSM || OP(o) == IFMATCH))
cea2e8a9 4482 Perl_sv_catpvf(aTHX_ sv, "[-%d]", o->flags);
17c3b450 4483#endif /* DEBUGGING */
35ff7856 4484}
a687059c 4485
cad2e5aa
JH
4486SV *
4487Perl_re_intuit_string(pTHX_ regexp *prog)
4488{ /* Assume that RE_INTUIT is set */
4489 DEBUG_r(
4490 { STRLEN n_a;
4491 char *s = SvPV(prog->check_substr,n_a);
4492
4493 if (!PL_colorset) reginitcolors();
4494 PerlIO_printf(Perl_debug_log,
4495 "%sUsing REx substr:%s `%s%.60s%s%s'\n",
4496 PL_colors[4],PL_colors[5],PL_colors[0],
4497 s,
4498 PL_colors[1],
4499 (strlen(s) > 60 ? "..." : ""));
4500 } );
4501
4502 return prog->check_substr;
4503}
4504
2b69d0c2 4505void
864dbfa3 4506Perl_pregfree(pTHX_ struct regexp *r)
a687059c 4507{
adac82c7 4508 DEBUG_r(if (!PL_colorset) reginitcolors());
7821416a
IZ
4509
4510 if (!r || (--r->refcnt > 0))
4511 return;
cad2e5aa
JH
4512 DEBUG_r(PerlIO_printf(Perl_debug_log,
4513 "%sFreeing REx:%s `%s%.60s%s%s'\n",
4514 PL_colors[4],PL_colors[5],PL_colors[0],
4515 r->precomp,
4516 PL_colors[1],
4517 (strlen(r->precomp) > 60 ? "..." : "")));
4518
c277df42 4519 if (r->precomp)
a0d0e21e 4520 Safefree(r->precomp);
fac92740
MJD
4521 if (r->offsets) /* 20010421 MJD */
4522 Safefree(r->offsets);
cf93c79d
IZ
4523 if (RX_MATCH_COPIED(r))
4524 Safefree(r->subbeg);
a193d654
GS
4525 if (r->substrs) {
4526 if (r->anchored_substr)
4527 SvREFCNT_dec(r->anchored_substr);
4528 if (r->float_substr)
4529 SvREFCNT_dec(r->float_substr);
2779dcf1 4530 Safefree(r->substrs);
a193d654 4531 }
c277df42
IZ
4532 if (r->data) {
4533 int n = r->data->count;
dfad63ad
HS
4534 AV* new_comppad = NULL;
4535 AV* old_comppad;
4536 SV** old_curpad;
4537
c277df42
IZ
4538 while (--n >= 0) {
4539 switch (r->data->what[n]) {
4540 case 's':
4541 SvREFCNT_dec((SV*)r->data->data[n]);
4542 break;
653099ff
GS
4543 case 'f':
4544 Safefree(r->data->data[n]);
4545 break;
dfad63ad
HS
4546 case 'p':
4547 new_comppad = (AV*)r->data->data[n];
4548 break;
c277df42 4549 case 'o':
dfad63ad 4550 if (new_comppad == NULL)
cea2e8a9 4551 Perl_croak(aTHX_ "panic: pregfree comppad");
dfad63ad
HS
4552 old_comppad = PL_comppad;
4553 old_curpad = PL_curpad;
1e6dc0b6
SB
4554 /* Watch out for global destruction's random ordering. */
4555 if (SvTYPE(new_comppad) == SVt_PVAV) {
4556 PL_comppad = new_comppad;
4557 PL_curpad = AvARRAY(new_comppad);
4558 }
4559 else
4560 PL_curpad = NULL;
c277df42 4561 op_free((OP_4tree*)r->data->data[n]);
dfad63ad
HS
4562 PL_comppad = old_comppad;
4563 PL_curpad = old_curpad;
4564 SvREFCNT_dec((SV*)new_comppad);
4565 new_comppad = NULL;
c277df42
IZ
4566 break;
4567 case 'n':
4568 break;
4569 default:
830247a4 4570 Perl_croak(aTHX_ "panic: regfree data code '%c'", r->data->what[n]);
c277df42
IZ
4571 }
4572 }
4573 Safefree(r->data->what);
4574 Safefree(r->data);
a0d0e21e
LW
4575 }
4576 Safefree(r->startp);
4577 Safefree(r->endp);
4578 Safefree(r);
a687059c 4579}
c277df42
IZ
4580
4581/*
4582 - regnext - dig the "next" pointer out of a node
4583 *
4584 * [Note, when REGALIGN is defined there are two places in regmatch()
4585 * that bypass this code for speed.]
4586 */
4587regnode *
864dbfa3 4588Perl_regnext(pTHX_ register regnode *p)
c277df42
IZ
4589{
4590 register I32 offset;
4591
3280af22 4592 if (p == &PL_regdummy)
c277df42
IZ
4593 return(NULL);
4594
4595 offset = (reg_off_by_arg[OP(p)] ? ARG(p) : NEXT_OFF(p));
4596 if (offset == 0)
4597 return(NULL);
4598
c277df42 4599 return(p+offset);
c277df42
IZ
4600}
4601
01f988be 4602STATIC void
cea2e8a9 4603S_re_croak2(pTHX_ const char* pat1,const char* pat2,...)
c277df42
IZ
4604{
4605 va_list args;
4606 STRLEN l1 = strlen(pat1);
4607 STRLEN l2 = strlen(pat2);
4608 char buf[512];
06bf62c7 4609 SV *msv;
c277df42
IZ
4610 char *message;
4611
4612 if (l1 > 510)
4613 l1 = 510;
4614 if (l1 + l2 > 510)
4615 l2 = 510 - l1;
4616 Copy(pat1, buf, l1 , char);
4617 Copy(pat2, buf + l1, l2 , char);
3b818b81
GS
4618 buf[l1 + l2] = '\n';
4619 buf[l1 + l2 + 1] = '\0';
8736538c
AS
4620#ifdef I_STDARG
4621 /* ANSI variant takes additional second argument */
c277df42 4622 va_start(args, pat2);
8736538c
AS
4623#else
4624 va_start(args);
4625#endif
5a844595 4626 msv = vmess(buf, &args);
c277df42 4627 va_end(args);
06bf62c7 4628 message = SvPV(msv,l1);
c277df42
IZ
4629 if (l1 > 512)
4630 l1 = 512;
4631 Copy(message, buf, l1 , char);
4632 buf[l1] = '\0'; /* Overwrite \n */
cea2e8a9 4633 Perl_croak(aTHX_ "%s", buf);
c277df42 4634}
a0ed51b3
LW
4635
4636/* XXX Here's a total kludge. But we need to re-enter for swash routines. */
4637
4638void
864dbfa3 4639Perl_save_re_context(pTHX)
b81d288d 4640{
830247a4
IZ
4641#if 0
4642 SAVEPPTR(RExC_precomp); /* uncompiled string. */
4643 SAVEI32(RExC_npar); /* () count. */
4644 SAVEI32(RExC_size); /* Code size. */
4645 SAVEI16(RExC_flags16); /* are we folding, multilining? */
4646 SAVEVPTR(RExC_rx); /* from regcomp.c */
4647 SAVEI32(RExC_seen); /* from regcomp.c */
4648 SAVEI32(RExC_sawback); /* Did we see \1, ...? */
4649 SAVEI32(RExC_naughty); /* How bad is this pattern? */
4650 SAVEVPTR(RExC_emit); /* Code-emit pointer; &regdummy = don't */
4651 SAVEPPTR(RExC_end); /* End of input for compile */
4652 SAVEPPTR(RExC_parse); /* Input-scan pointer. */
4653#endif
4654
4655 SAVEI32(PL_reg_flags); /* from regexec.c */
a0ed51b3 4656 SAVEPPTR(PL_bostr);
a0ed51b3
LW
4657 SAVEPPTR(PL_reginput); /* String-input pointer. */
4658 SAVEPPTR(PL_regbol); /* Beginning of input, for ^ check. */
4659 SAVEPPTR(PL_regeol); /* End of input, for $ check. */
7766f137
GS
4660 SAVEVPTR(PL_regstartp); /* Pointer to startp array. */
4661 SAVEVPTR(PL_regendp); /* Ditto for endp. */
4662 SAVEVPTR(PL_reglastparen); /* Similarly for lastparen. */
a0ed51b3 4663 SAVEPPTR(PL_regtill); /* How far we are required to go. */
b81d288d 4664 SAVEGENERICPV(PL_reg_start_tmp); /* from regexec.c */
a0ed51b3 4665 PL_reg_start_tmp = 0;
a0ed51b3
LW
4666 SAVEI32(PL_reg_start_tmpl); /* from regexec.c */
4667 PL_reg_start_tmpl = 0;
7766f137 4668 SAVEVPTR(PL_regdata);
a0ed51b3
LW
4669 SAVEI32(PL_reg_eval_set); /* from regexec.c */
4670 SAVEI32(PL_regnarrate); /* from regexec.c */
7766f137 4671 SAVEVPTR(PL_regprogram); /* from regexec.c */
a0ed51b3 4672 SAVEINT(PL_regindent); /* from regexec.c */
7766f137
GS
4673 SAVEVPTR(PL_regcc); /* from regexec.c */
4674 SAVEVPTR(PL_curcop);
7766f137
GS
4675 SAVEVPTR(PL_reg_call_cc); /* from regexec.c */
4676 SAVEVPTR(PL_reg_re); /* from regexec.c */
54b6e2fa
IZ
4677 SAVEPPTR(PL_reg_ganch); /* from regexec.c */
4678 SAVESPTR(PL_reg_sv); /* from regexec.c */
7766f137 4679 SAVEVPTR(PL_reg_magic); /* from regexec.c */
54b6e2fa 4680 SAVEI32(PL_reg_oldpos); /* from regexec.c */
7766f137
GS
4681 SAVEVPTR(PL_reg_oldcurpm); /* from regexec.c */
4682 SAVEVPTR(PL_reg_curpm); /* from regexec.c */
5fb7366e 4683 SAVEI32(PL_regnpar); /* () count. */
54b6e2fa 4684#ifdef DEBUGGING
b81d288d 4685 SAVEPPTR(PL_reg_starttry); /* from regexec.c */
54b6e2fa 4686#endif
a0ed51b3 4687}
51371543
GS
4688
4689#ifdef PERL_OBJECT
51371543
GS
4690#include "XSUB.h"
4691#undef this
4692#define this pPerl
4693#endif
4694
4695static void
4696clear_re(pTHXo_ void *r)
4697{
4698 ReREFCNT_dec((regexp *)r);
4699}
ffbc6a93 4700