This is a live mirror of the Perl 5 development currently hosted at https://github.com/perl/perl5
Retract #10324 and #10333; not needed.
[perl5.git] / regcomp.c
CommitLineData
a0d0e21e
LW
1/* regcomp.c
2 */
3
4/*
5 * "A fair jaw-cracker dwarf-language must be." --Samwise Gamgee
6 */
7
a687059c
LW
8/* NOTE: this is derived from Henry Spencer's regexp code, and should not
9 * confused with the original package (see point 3 below). Thanks, Henry!
10 */
11
12/* Additional note: this code is very heavily munged from Henry's version
13 * in places. In some spots I've traded clarity for efficiency, so don't
14 * blame Henry for some of the lack of readability.
15 */
16
e50aee73
AD
17/* The names of the functions have been changed from regcomp and
18 * regexec to pregcomp and pregexec in order to avoid conflicts
19 * with the POSIX routines of the same names.
20*/
21
b9d5759e
AD
22#ifdef PERL_EXT_RE_BUILD
23/* need to replace pregcomp et al, so enable that */
24# ifndef PERL_IN_XSUB_RE
25# define PERL_IN_XSUB_RE
26# endif
27/* need access to debugger hooks */
cad2e5aa 28# if defined(PERL_EXT_RE_DEBUG) && !defined(DEBUGGING)
b9d5759e
AD
29# define DEBUGGING
30# endif
31#endif
32
33#ifdef PERL_IN_XSUB_RE
d06ea78c 34/* We *really* need to overwrite these symbols: */
56953603
IZ
35# define Perl_pregcomp my_regcomp
36# define Perl_regdump my_regdump
37# define Perl_regprop my_regprop
d06ea78c 38# define Perl_pregfree my_regfree
cad2e5aa
JH
39# define Perl_re_intuit_string my_re_intuit_string
40/* *These* symbols are masked to allow static link. */
d06ea78c 41# define Perl_regnext my_regnext
f0b8d043 42# define Perl_save_re_context my_save_re_context
b81d288d 43# define Perl_reginitcolors my_reginitcolors
c5be433b
GS
44
45# define PERL_NO_GET_CONTEXT
b81d288d 46#endif
56953603 47
f0fcb552 48/*SUPPRESS 112*/
a687059c 49/*
e50aee73 50 * pregcomp and pregexec -- regsub and regerror are not used in perl
a687059c
LW
51 *
52 * Copyright (c) 1986 by University of Toronto.
53 * Written by Henry Spencer. Not derived from licensed software.
54 *
55 * Permission is granted to anyone to use this software for any
56 * purpose on any computer system, and to redistribute it freely,
57 * subject to the following restrictions:
58 *
59 * 1. The author is not responsible for the consequences of use of
60 * this software, no matter how awful, even if they arise
61 * from defects in it.
62 *
63 * 2. The origin of this software must not be misrepresented, either
64 * by explicit claim or by omission.
65 *
66 * 3. Altered versions must be plainly marked as such, and must not
67 * be misrepresented as being the original software.
68 *
69 *
70 **** Alterations to Henry's code are...
71 ****
bc89e66f 72 **** Copyright (c) 1991-2001, Larry Wall
a687059c 73 ****
9ef589d8
LW
74 **** You may distribute under the terms of either the GNU General Public
75 **** License or the Artistic License, as specified in the README file.
76
a687059c
LW
77 *
78 * Beware that some of this code is subtly aware of the way operator
79 * precedence is structured in regular expressions. Serious changes in
80 * regular-expression syntax might require a total rethink.
81 */
82#include "EXTERN.h"
864dbfa3 83#define PERL_IN_REGCOMP_C
a687059c 84#include "perl.h"
d06ea78c 85
c5be433b
GS
86#ifdef PERL_IN_XSUB_RE
87# if defined(PERL_CAPI) || defined(PERL_OBJECT)
88# include "XSUB.h"
89# endif
90#else
d06ea78c
GS
91# include "INTERN.h"
92#endif
c277df42
IZ
93
94#define REG_COMP_C
a687059c
LW
95#include "regcomp.h"
96
d4cce5f1 97#ifdef op
11343788 98#undef op
d4cce5f1 99#endif /* op */
11343788 100
fe14fcc3
LW
101#ifdef MSDOS
102# if defined(BUGGY_MSC6)
103 /* MSC 6.00A breaks on op/regexp.t test 85 unless we turn this off */
104 # pragma optimize("a",off)
105 /* But MSC 6.00A is happy with 'w', for aliases only across function calls*/
106 # pragma optimize("w",on )
107# endif /* BUGGY_MSC6 */
108#endif /* MSDOS */
109
a687059c
LW
110#ifndef STATIC
111#define STATIC static
112#endif
113
830247a4
IZ
114typedef struct RExC_state_t {
115 U16 flags16; /* are we folding, multilining? */
116 char *precomp; /* uncompiled string. */
117 regexp *rx;
fac92740 118 char *start; /* Start of input for compile */
830247a4
IZ
119 char *end; /* End of input for compile */
120 char *parse; /* Input-scan pointer. */
121 I32 whilem_seen; /* number of WHILEM in this expr */
fac92740 122 regnode *emit_start; /* Start of emitted-code area */
ffc61ed2 123 regnode *emit; /* Code-emit pointer; &regdummy = don't = compiling */
830247a4
IZ
124 I32 naughty; /* How bad is this pattern? */
125 I32 sawback; /* Did we see \1, ...? */
126 U32 seen;
127 I32 size; /* Code size. */
128 I32 npar; /* () count. */
129 I32 extralen;
130 I32 seen_zerolen;
131 I32 seen_evals;
1aa99e6b 132 I32 utf8;
830247a4
IZ
133#if ADD_TO_REGEXEC
134 char *starttry; /* -Dr: where regtry was called. */
135#define RExC_starttry (pRExC_state->starttry)
136#endif
137} RExC_state_t;
138
139#define RExC_flags16 (pRExC_state->flags16)
140#define RExC_precomp (pRExC_state->precomp)
141#define RExC_rx (pRExC_state->rx)
fac92740 142#define RExC_start (pRExC_state->start)
830247a4
IZ
143#define RExC_end (pRExC_state->end)
144#define RExC_parse (pRExC_state->parse)
145#define RExC_whilem_seen (pRExC_state->whilem_seen)
fac92740 146#define RExC_offsets (pRExC_state->rx->offsets) /* I am not like the others */
830247a4 147#define RExC_emit (pRExC_state->emit)
fac92740 148#define RExC_emit_start (pRExC_state->emit_start)
830247a4
IZ
149#define RExC_naughty (pRExC_state->naughty)
150#define RExC_sawback (pRExC_state->sawback)
151#define RExC_seen (pRExC_state->seen)
152#define RExC_size (pRExC_state->size)
153#define RExC_npar (pRExC_state->npar)
154#define RExC_extralen (pRExC_state->extralen)
155#define RExC_seen_zerolen (pRExC_state->seen_zerolen)
156#define RExC_seen_evals (pRExC_state->seen_evals)
1aa99e6b 157#define RExC_utf8 (pRExC_state->utf8)
830247a4 158
a687059c
LW
159#define ISMULT1(c) ((c) == '*' || (c) == '+' || (c) == '?')
160#define ISMULT2(s) ((*s) == '*' || (*s) == '+' || (*s) == '?' || \
161 ((*s) == '{' && regcurly(s)))
a687059c 162
35c8bce7
LW
163#ifdef SPSTART
164#undef SPSTART /* dratted cpp namespace... */
165#endif
a687059c
LW
166/*
167 * Flags to be passed up and down.
168 */
a687059c 169#define WORST 0 /* Worst case. */
821b33a5 170#define HASWIDTH 0x1 /* Known to match non-null strings. */
a0d0e21e
LW
171#define SIMPLE 0x2 /* Simple enough to be STAR/PLUS operand. */
172#define SPSTART 0x4 /* Starts with * or +. */
173#define TRYAGAIN 0x8 /* Weeded out a declaration. */
a687059c 174
2c2d71f5
JH
175/* Length of a variant. */
176
177typedef struct scan_data_t {
178 I32 len_min;
179 I32 len_delta;
180 I32 pos_min;
181 I32 pos_delta;
182 SV *last_found;
183 I32 last_end; /* min value, <0 unless valid. */
184 I32 last_start_min;
185 I32 last_start_max;
186 SV **longest; /* Either &l_fixed, or &l_float. */
187 SV *longest_fixed;
188 I32 offset_fixed;
189 SV *longest_float;
190 I32 offset_float_min;
191 I32 offset_float_max;
192 I32 flags;
193 I32 whilem_c;
cb434fcc 194 I32 *last_closep;
653099ff 195 struct regnode_charclass_class *start_class;
2c2d71f5
JH
196} scan_data_t;
197
a687059c 198/*
e50aee73 199 * Forward declarations for pregcomp()'s friends.
a687059c 200 */
a0d0e21e 201
b81d288d 202static scan_data_t zero_scan_data = { 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
cb434fcc 203 0, 0, 0, 0, 0, 0};
c277df42
IZ
204
205#define SF_BEFORE_EOL (SF_BEFORE_SEOL|SF_BEFORE_MEOL)
206#define SF_BEFORE_SEOL 0x1
207#define SF_BEFORE_MEOL 0x2
208#define SF_FIX_BEFORE_EOL (SF_FIX_BEFORE_SEOL|SF_FIX_BEFORE_MEOL)
209#define SF_FL_BEFORE_EOL (SF_FL_BEFORE_SEOL|SF_FL_BEFORE_MEOL)
210
09b7f37c
CB
211#ifdef NO_UNARY_PLUS
212# define SF_FIX_SHIFT_EOL (0+2)
213# define SF_FL_SHIFT_EOL (0+4)
214#else
215# define SF_FIX_SHIFT_EOL (+2)
216# define SF_FL_SHIFT_EOL (+4)
217#endif
c277df42
IZ
218
219#define SF_FIX_BEFORE_SEOL (SF_BEFORE_SEOL << SF_FIX_SHIFT_EOL)
220#define SF_FIX_BEFORE_MEOL (SF_BEFORE_MEOL << SF_FIX_SHIFT_EOL)
221
222#define SF_FL_BEFORE_SEOL (SF_BEFORE_SEOL << SF_FL_SHIFT_EOL)
223#define SF_FL_BEFORE_MEOL (SF_BEFORE_MEOL << SF_FL_SHIFT_EOL) /* 0x20 */
224#define SF_IS_INF 0x40
225#define SF_HAS_PAR 0x80
226#define SF_IN_PAR 0x100
227#define SF_HAS_EVAL 0x200
4bfe0158 228#define SCF_DO_SUBSTR 0x400
653099ff
GS
229#define SCF_DO_STCLASS_AND 0x0800
230#define SCF_DO_STCLASS_OR 0x1000
231#define SCF_DO_STCLASS (SCF_DO_STCLASS_AND|SCF_DO_STCLASS_OR)
e1901655 232#define SCF_WHILEM_VISITED_POS 0x2000
c277df42 233
1aa99e6b 234#define UTF RExC_utf8
830247a4
IZ
235#define LOC (RExC_flags16 & PMf_LOCALE)
236#define FOLD (RExC_flags16 & PMf_FOLD)
a0ed51b3 237
ffc61ed2 238#define OOB_UNICODE 12345678
93733859 239#define OOB_NAMEDCLASS -1
b8c5462f 240
a0ed51b3
LW
241#define CHR_SVLEN(sv) (UTF ? sv_len_utf8(sv) : SvCUR(sv))
242#define CHR_DIST(a,b) (UTF ? utf8_distance(a,b) : a - b)
243
8615cb43 244
b45f050a
JF
245/* length of regex to show in messages that don't mark a position within */
246#define RegexLengthToShowInErrorMessages 127
247
248/*
249 * If MARKER[12] are adjusted, be sure to adjust the constants at the top
250 * of t/op/regmesg.t, the tests in t/op/re_tests, and those in
251 * op/pragma/warn/regcomp.
252 */
7253e4e3
RK
253#define MARKER1 "<-- HERE" /* marker as it appears in the description */
254#define MARKER2 " <-- HERE " /* marker as it appears within the regex */
b81d288d 255
7253e4e3 256#define REPORT_LOCATION " in regex; marked by " MARKER1 " in m/%.*s" MARKER2 "%s/"
b45f050a
JF
257
258/*
259 * Calls SAVEDESTRUCTOR_X if needed, then calls Perl_croak with the given
260 * arg. Show regex, up to a maximum length. If it's too long, chop and add
261 * "...".
262 */
a4eb266f 263#define FAIL(msg) \
8615cb43 264 STMT_START { \
a4eb266f 265 char *ellipses = ""; \
830247a4 266 unsigned len = strlen(RExC_precomp); \
b45f050a 267 \
8615cb43 268 if (!SIZE_ONLY) \
830247a4 269 SAVEDESTRUCTOR_X(clear_re,(void*)RExC_rx); \
b45f050a
JF
270 \
271 if (len > RegexLengthToShowInErrorMessages) { \
272 /* chop 10 shorter than the max, to ensure meaning of "..." */ \
273 len = RegexLengthToShowInErrorMessages - 10; \
a4eb266f 274 ellipses = "..."; \
b45f050a
JF
275 } \
276 Perl_croak(aTHX_ "%s in regex m/%.*s%s/", \
830247a4 277 msg, (int)len, RExC_precomp, ellipses); \
8615cb43
JF
278 } STMT_END
279
b45f050a
JF
280/*
281 * Calls SAVEDESTRUCTOR_X if needed, then calls Perl_croak with the given
282 * args. Show regex, up to a maximum length. If it's too long, chop and add
283 * "...".
284 */
a4eb266f 285#define FAIL2(pat,msg) \
8615cb43 286 STMT_START { \
a4eb266f 287 char *ellipses = ""; \
830247a4 288 unsigned len = strlen(RExC_precomp); \
b45f050a 289 \
8615cb43 290 if (!SIZE_ONLY) \
830247a4 291 SAVEDESTRUCTOR_X(clear_re,(void*)RExC_rx); \
8615cb43 292 \
b45f050a
JF
293 if (len > RegexLengthToShowInErrorMessages) { \
294 /* chop 10 shorter than the max, to ensure meaning of "..." */ \
295 len = RegexLengthToShowInErrorMessages - 10; \
a4eb266f 296 ellipses = "..."; \
b45f050a
JF
297 } \
298 S_re_croak2(aTHX_ pat, " in regex m/%.*s%s/", \
830247a4 299 msg, (int)len, RExC_precomp, ellipses); \
b45f050a
JF
300 } STMT_END
301
302
303/*
304 * Simple_vFAIL -- like FAIL, but marks the current location in the scan
305 */
306#define Simple_vFAIL(m) \
307 STMT_START { \
830247a4 308 unsigned offset = strlen(RExC_precomp)-(RExC_end-RExC_parse); \
b45f050a
JF
309 \
310 Perl_croak(aTHX_ "%s" REPORT_LOCATION, \
830247a4 311 m, (int)offset, RExC_precomp, RExC_precomp + offset); \
b45f050a
JF
312 } STMT_END
313
314/*
315 * Calls SAVEDESTRUCTOR_X if needed, then Simple_vFAIL()
316 */
317#define vFAIL(m) \
318 STMT_START { \
319 if (!SIZE_ONLY) \
830247a4 320 SAVEDESTRUCTOR_X(clear_re,(void*)RExC_rx); \
b45f050a
JF
321 Simple_vFAIL(m); \
322 } STMT_END
323
324/*
325 * Like Simple_vFAIL(), but accepts two arguments.
326 */
327#define Simple_vFAIL2(m,a1) \
328 STMT_START { \
830247a4 329 unsigned offset = strlen(RExC_precomp)-(RExC_end-RExC_parse); \
b45f050a
JF
330 \
331 S_re_croak2(aTHX_ m, REPORT_LOCATION, a1, \
830247a4 332 (int)offset, RExC_precomp, RExC_precomp + offset); \
b45f050a
JF
333 } STMT_END
334
335/*
336 * Calls SAVEDESTRUCTOR_X if needed, then Simple_vFAIL2().
337 */
338#define vFAIL2(m,a1) \
339 STMT_START { \
340 if (!SIZE_ONLY) \
830247a4 341 SAVEDESTRUCTOR_X(clear_re,(void*)RExC_rx); \
b45f050a
JF
342 Simple_vFAIL2(m, a1); \
343 } STMT_END
344
345
346/*
347 * Like Simple_vFAIL(), but accepts three arguments.
348 */
349#define Simple_vFAIL3(m, a1, a2) \
350 STMT_START { \
830247a4 351 unsigned offset = strlen(RExC_precomp)-(RExC_end-RExC_parse); \
b45f050a
JF
352 \
353 S_re_croak2(aTHX_ m, REPORT_LOCATION, a1, a2, \
830247a4 354 (int)offset, RExC_precomp, RExC_precomp + offset); \
b45f050a
JF
355 } STMT_END
356
357/*
358 * Calls SAVEDESTRUCTOR_X if needed, then Simple_vFAIL3().
359 */
360#define vFAIL3(m,a1,a2) \
361 STMT_START { \
362 if (!SIZE_ONLY) \
830247a4 363 SAVEDESTRUCTOR_X(clear_re,(void*)RExC_rx); \
b45f050a
JF
364 Simple_vFAIL3(m, a1, a2); \
365 } STMT_END
366
367/*
368 * Like Simple_vFAIL(), but accepts four arguments.
369 */
370#define Simple_vFAIL4(m, a1, a2, a3) \
371 STMT_START { \
830247a4 372 unsigned offset = strlen(RExC_precomp)-(RExC_end-RExC_parse); \
b45f050a
JF
373 \
374 S_re_croak2(aTHX_ m, REPORT_LOCATION, a1, a2, a3,\
830247a4 375 (int)offset, RExC_precomp, RExC_precomp + offset); \
b45f050a
JF
376 } STMT_END
377
378/*
379 * Like Simple_vFAIL(), but accepts five arguments.
380 */
381#define Simple_vFAIL5(m, a1, a2, a3, a4) \
382 STMT_START { \
830247a4 383 unsigned offset = strlen(RExC_precomp)-(RExC_end-RExC_parse); \
b45f050a 384 S_re_croak2(aTHX_ m, REPORT_LOCATION, a1, a2, a3, a4,\
830247a4 385 (int)offset, RExC_precomp, RExC_precomp + offset); \
b45f050a
JF
386 } STMT_END
387
388
389#define vWARN(loc,m) \
390 STMT_START { \
830247a4 391 unsigned offset = strlen(RExC_precomp)-(RExC_end-(loc)); \
b45f050a 392 Perl_warner(aTHX_ WARN_REGEXP, "%s" REPORT_LOCATION,\
830247a4 393 m, (int)offset, RExC_precomp, RExC_precomp + offset); \
b45f050a
JF
394 } STMT_END \
395
396
397#define vWARN2(loc, m, a1) \
398 STMT_START { \
830247a4 399 unsigned offset = strlen(RExC_precomp)-(RExC_end-(loc)); \
b45f050a
JF
400 Perl_warner(aTHX_ WARN_REGEXP, m REPORT_LOCATION,\
401 a1, \
830247a4 402 (int)offset, RExC_precomp, RExC_precomp + offset); \
b45f050a
JF
403 } STMT_END
404
405#define vWARN3(loc, m, a1, a2) \
406 STMT_START { \
830247a4 407 unsigned offset = strlen(RExC_precomp) - (RExC_end - (loc)); \
b45f050a
JF
408 Perl_warner(aTHX_ WARN_REGEXP, m REPORT_LOCATION, \
409 a1, a2, \
830247a4 410 (int)offset, RExC_precomp, RExC_precomp + offset); \
b45f050a
JF
411 } STMT_END
412
413#define vWARN4(loc, m, a1, a2, a3) \
414 STMT_START { \
830247a4 415 unsigned offset = strlen(RExC_precomp)-(RExC_end-(loc)); \
b45f050a
JF
416 Perl_warner(aTHX_ WARN_REGEXP, m REPORT_LOCATION,\
417 a1, a2, a3, \
830247a4 418 (int)offset, RExC_precomp, RExC_precomp + offset); \
8615cb43
JF
419 } STMT_END
420
421
cd439c50
IZ
422/* Allow for side effects in s */
423#define REGC(c,s) STMT_START { if (!SIZE_ONLY) *(s) = (c); else (s);} STMT_END
424
fac92740
MJD
425/* Macros for recording node offsets. 20001227 mjd@plover.com
426 * Nodes are numbered 1, 2, 3, 4. Node #n's position is recorded in
427 * element 2*n-1 of the array. Element #2n holds the byte length node #n.
428 * Element 0 holds the number n.
429 */
430
431#define MJD_OFFSET_DEBUG(x)
432/* #define MJD_OFFSET_DEBUG(x) fprintf x */
433
434
435# define Set_Node_Offset_To_R(node,byte) \
436 STMT_START { \
437 if (! SIZE_ONLY) { \
438 if((node) < 0) { \
439 Perl_croak(aTHX_ "value of node is %d in Offset macro", node); \
440 } else { \
441 RExC_offsets[2*(node)-1] = (byte); \
442 } \
443 } \
444 } STMT_END
445
446# define Set_Node_Offset(node,byte) Set_Node_Offset_To_R((node)-RExC_emit_start, (byte)-RExC_start)
447# define Set_Cur_Node_Offset Set_Node_Offset(RExC_emit, RExC_parse)
448
449# define Set_Node_Length_To_R(node,len) \
450 STMT_START { \
451 if (! SIZE_ONLY) { \
452 MJD_OFFSET_DEBUG((stderr, "** (%d) size of node %d is %d.\n", __LINE__, (node), (len))); \
453 if((node) < 0) { \
454 Perl_croak(aTHX_ "value of node is %d in Length macro", node); \
455 } else { \
456 RExC_offsets[2*(node)] = (len); \
457 } \
458 } \
459 } STMT_END
460
461# define Set_Node_Length(node,len) Set_Node_Length_To_R((node)-RExC_emit_start, len)
462# define Set_Cur_Node_Length(len) Set_Node_Length(RExC_emit, len)
463# define Set_Node_Cur_Length(node) Set_Node_Length(node, RExC_parse - parse_start)
464
465/* Get offsets and lengths */
466#define Node_Offset(n) (RExC_offsets[2*((n)-RExC_emit_start)-1])
467#define Node_Length(n) (RExC_offsets[2*((n)-RExC_emit_start)])
468
51371543 469static void clear_re(pTHXo_ void *r);
4327152a 470
653099ff
GS
471/* Mark that we cannot extend a found fixed substring at this point.
472 Updata the longest found anchored substring and the longest found
473 floating substrings if needed. */
474
4327152a 475STATIC void
830247a4 476S_scan_commit(pTHX_ RExC_state_t *pRExC_state, scan_data_t *data)
c277df42 477{
a0ed51b3
LW
478 STRLEN l = CHR_SVLEN(data->last_found);
479 STRLEN old_l = CHR_SVLEN(*data->longest);
b81d288d 480
c277df42
IZ
481 if ((l >= old_l) && ((l > old_l) || (data->flags & SF_BEFORE_EOL))) {
482 sv_setsv(*data->longest, data->last_found);
483 if (*data->longest == data->longest_fixed) {
484 data->offset_fixed = l ? data->last_start_min : data->pos_min;
485 if (data->flags & SF_BEFORE_EOL)
b81d288d 486 data->flags
c277df42
IZ
487 |= ((data->flags & SF_BEFORE_EOL) << SF_FIX_SHIFT_EOL);
488 else
489 data->flags &= ~SF_FIX_BEFORE_EOL;
a0ed51b3
LW
490 }
491 else {
c277df42 492 data->offset_float_min = l ? data->last_start_min : data->pos_min;
b81d288d
AB
493 data->offset_float_max = (l
494 ? data->last_start_max
c277df42
IZ
495 : data->pos_min + data->pos_delta);
496 if (data->flags & SF_BEFORE_EOL)
b81d288d 497 data->flags
c277df42
IZ
498 |= ((data->flags & SF_BEFORE_EOL) << SF_FL_SHIFT_EOL);
499 else
500 data->flags &= ~SF_FL_BEFORE_EOL;
501 }
502 }
503 SvCUR_set(data->last_found, 0);
504 data->last_end = -1;
505 data->flags &= ~SF_BEFORE_EOL;
506}
507
653099ff
GS
508/* Can match anything (initialization) */
509STATIC void
830247a4 510S_cl_anything(pTHX_ RExC_state_t *pRExC_state, struct regnode_charclass_class *cl)
653099ff 511{
9af0acdb
JH
512 int value;
513
653099ff 514 ANYOF_CLASS_ZERO(cl);
9af0acdb
JH
515 for (value = 0; value < 256; ++value)
516 ANYOF_BITMAP_SET(cl, value);
1aa99e6b 517 cl->flags = ANYOF_EOS|ANYOF_UNICODE_ALL;
653099ff
GS
518 if (LOC)
519 cl->flags |= ANYOF_LOCALE;
520}
521
522/* Can match anything (initialization) */
523STATIC int
524S_cl_is_anything(pTHX_ struct regnode_charclass_class *cl)
525{
526 int value;
527
aaa51d5e 528 for (value = 0; value <= ANYOF_MAX; value += 2)
653099ff
GS
529 if (ANYOF_CLASS_TEST(cl, value) && ANYOF_CLASS_TEST(cl, value + 1))
530 return 1;
1aa99e6b
IH
531 if (!(cl->flags & ANYOF_UNICODE_ALL))
532 return 0;
9af0acdb
JH
533 for (value = 0; value < 256; ++value)
534 if (!ANYOF_BITMAP_TEST(cl, value))
653099ff
GS
535 return 0;
536 return 1;
537}
538
539/* Can match anything (initialization) */
540STATIC void
830247a4 541S_cl_init(pTHX_ RExC_state_t *pRExC_state, struct regnode_charclass_class *cl)
653099ff 542{
8ecf7187 543 Zero(cl, 1, struct regnode_charclass_class);
653099ff 544 cl->type = ANYOF;
830247a4 545 cl_anything(pRExC_state, cl);
653099ff
GS
546}
547
548STATIC void
830247a4 549S_cl_init_zero(pTHX_ RExC_state_t *pRExC_state, struct regnode_charclass_class *cl)
653099ff 550{
8ecf7187 551 Zero(cl, 1, struct regnode_charclass_class);
653099ff 552 cl->type = ANYOF;
830247a4 553 cl_anything(pRExC_state, cl);
653099ff
GS
554 if (LOC)
555 cl->flags |= ANYOF_LOCALE;
556}
557
558/* 'And' a given class with another one. Can create false positives */
559/* We assume that cl is not inverted */
560STATIC void
561S_cl_and(pTHX_ struct regnode_charclass_class *cl,
562 struct regnode_charclass_class *and_with)
563{
653099ff
GS
564 if (!(and_with->flags & ANYOF_CLASS)
565 && !(cl->flags & ANYOF_CLASS)
566 && (and_with->flags & ANYOF_LOCALE) == (cl->flags & ANYOF_LOCALE)
567 && !(and_with->flags & ANYOF_FOLD)
568 && !(cl->flags & ANYOF_FOLD)) {
569 int i;
570
571 if (and_with->flags & ANYOF_INVERT)
572 for (i = 0; i < ANYOF_BITMAP_SIZE; i++)
573 cl->bitmap[i] &= ~and_with->bitmap[i];
574 else
575 for (i = 0; i < ANYOF_BITMAP_SIZE; i++)
576 cl->bitmap[i] &= and_with->bitmap[i];
577 } /* XXXX: logic is complicated otherwise, leave it along for a moment. */
578 if (!(and_with->flags & ANYOF_EOS))
579 cl->flags &= ~ANYOF_EOS;
1aa99e6b
IH
580
581 if (cl->flags & ANYOF_UNICODE_ALL && and_with->flags & ANYOF_UNICODE) {
582 cl->flags &= ~ANYOF_UNICODE_ALL;
583 cl->flags |= ANYOF_UNICODE;
584 ARG_SET(cl, ARG(and_with));
585 }
586 if (!(and_with->flags & ANYOF_UNICODE_ALL))
587 cl->flags &= ~ANYOF_UNICODE_ALL;
588 if (!(and_with->flags & (ANYOF_UNICODE|ANYOF_UNICODE_ALL)))
589 cl->flags &= ~ANYOF_UNICODE;
653099ff
GS
590}
591
592/* 'OR' a given class with another one. Can create false positives */
593/* We assume that cl is not inverted */
594STATIC void
830247a4 595S_cl_or(pTHX_ RExC_state_t *pRExC_state, struct regnode_charclass_class *cl, struct regnode_charclass_class *or_with)
653099ff 596{
653099ff
GS
597 if (or_with->flags & ANYOF_INVERT) {
598 /* We do not use
599 * (B1 | CL1) | (!B2 & !CL2) = (B1 | !B2 & !CL2) | (CL1 | (!B2 & !CL2))
600 * <= (B1 | !B2) | (CL1 | !CL2)
601 * which is wasteful if CL2 is small, but we ignore CL2:
602 * (B1 | CL1) | (!B2 & !CL2) <= (B1 | CL1) | !B2 = (B1 | !B2) | CL1
603 * XXXX Can we handle case-fold? Unclear:
604 * (OK1(i) | OK1(i')) | !(OK1(i) | OK1(i')) =
605 * (OK1(i) | OK1(i')) | (!OK1(i) & !OK1(i'))
606 */
607 if ( (or_with->flags & ANYOF_LOCALE) == (cl->flags & ANYOF_LOCALE)
608 && !(or_with->flags & ANYOF_FOLD)
609 && !(cl->flags & ANYOF_FOLD) ) {
610 int i;
611
612 for (i = 0; i < ANYOF_BITMAP_SIZE; i++)
613 cl->bitmap[i] |= ~or_with->bitmap[i];
614 } /* XXXX: logic is complicated otherwise */
615 else {
830247a4 616 cl_anything(pRExC_state, cl);
653099ff
GS
617 }
618 } else {
619 /* (B1 | CL1) | (B2 | CL2) = (B1 | B2) | (CL1 | CL2)) */
620 if ( (or_with->flags & ANYOF_LOCALE) == (cl->flags & ANYOF_LOCALE)
b81d288d 621 && (!(or_with->flags & ANYOF_FOLD)
653099ff
GS
622 || (cl->flags & ANYOF_FOLD)) ) {
623 int i;
624
625 /* OR char bitmap and class bitmap separately */
626 for (i = 0; i < ANYOF_BITMAP_SIZE; i++)
627 cl->bitmap[i] |= or_with->bitmap[i];
628 if (or_with->flags & ANYOF_CLASS) {
629 for (i = 0; i < ANYOF_CLASSBITMAP_SIZE; i++)
630 cl->classflags[i] |= or_with->classflags[i];
631 cl->flags |= ANYOF_CLASS;
632 }
633 }
634 else { /* XXXX: logic is complicated, leave it along for a moment. */
830247a4 635 cl_anything(pRExC_state, cl);
653099ff
GS
636 }
637 }
638 if (or_with->flags & ANYOF_EOS)
639 cl->flags |= ANYOF_EOS;
1aa99e6b
IH
640
641 if (cl->flags & ANYOF_UNICODE && or_with->flags & ANYOF_UNICODE &&
642 ARG(cl) != ARG(or_with)) {
643 cl->flags |= ANYOF_UNICODE_ALL;
644 cl->flags &= ~ANYOF_UNICODE;
645 }
646 if (or_with->flags & ANYOF_UNICODE_ALL) {
647 cl->flags |= ANYOF_UNICODE_ALL;
648 cl->flags &= ~ANYOF_UNICODE;
649 }
653099ff
GS
650}
651
652/* REx optimizer. Converts nodes into quickier variants "in place".
653 Finds fixed substrings. */
654
c277df42
IZ
655/* Stops at toplevel WHILEM as well as at `last'. At end *scanp is set
656 to the position after last scanned or to NULL. */
657
76e3520e 658STATIC I32
830247a4 659S_study_chunk(pTHX_ RExC_state_t *pRExC_state, regnode **scanp, I32 *deltap, regnode *last, scan_data_t *data, U32 flags)
c277df42
IZ
660 /* scanp: Start here (read-write). */
661 /* deltap: Write maxlen-minlen here. */
662 /* last: Stop before this one. */
663{
664 I32 min = 0, pars = 0, code;
665 regnode *scan = *scanp, *next;
666 I32 delta = 0;
667 int is_inf = (flags & SCF_DO_SUBSTR) && (data->flags & SF_IS_INF);
aca2d497 668 int is_inf_internal = 0; /* The studied chunk is infinite */
c277df42
IZ
669 I32 is_par = OP(scan) == OPEN ? ARG(scan) : 0;
670 scan_data_t data_fake;
653099ff 671 struct regnode_charclass_class and_with; /* Valid if flags & SCF_DO_STCLASS_OR */
b81d288d 672
c277df42
IZ
673 while (scan && OP(scan) != END && scan < last) {
674 /* Peephole optimizer: */
675
22c35a8c 676 if (PL_regkind[(U8)OP(scan)] == EXACT) {
653099ff 677 /* Merge several consecutive EXACTish nodes into one. */
c277df42
IZ
678 regnode *n = regnext(scan);
679 U32 stringok = 1;
680#ifdef DEBUGGING
681 regnode *stop = scan;
b81d288d 682#endif
c277df42 683
cd439c50 684 next = scan + NODE_SZ_STR(scan);
c277df42
IZ
685 /* Skip NOTHING, merge EXACT*. */
686 while (n &&
b81d288d 687 ( PL_regkind[(U8)OP(n)] == NOTHING ||
c277df42
IZ
688 (stringok && (OP(n) == OP(scan))))
689 && NEXT_OFF(n)
690 && NEXT_OFF(scan) + NEXT_OFF(n) < I16_MAX) {
691 if (OP(n) == TAIL || n > next)
692 stringok = 0;
22c35a8c 693 if (PL_regkind[(U8)OP(n)] == NOTHING) {
c277df42
IZ
694 NEXT_OFF(scan) += NEXT_OFF(n);
695 next = n + NODE_STEP_REGNODE;
696#ifdef DEBUGGING
697 if (stringok)
698 stop = n;
b81d288d 699#endif
c277df42 700 n = regnext(n);
a0ed51b3 701 }
f49d4d0f 702 else if (stringok) {
cd439c50 703 int oldl = STR_LEN(scan);
c277df42 704 regnode *nnext = regnext(n);
f49d4d0f 705
b81d288d 706 if (oldl + STR_LEN(n) > U8_MAX)
c277df42
IZ
707 break;
708 NEXT_OFF(scan) += NEXT_OFF(n);
cd439c50
IZ
709 STR_LEN(scan) += STR_LEN(n);
710 next = n + NODE_SZ_STR(n);
c277df42 711 /* Now we can overwrite *n : */
f49d4d0f 712 Move(STRING(n), STRING(scan) + oldl, STR_LEN(n), char);
c277df42 713#ifdef DEBUGGING
f49d4d0f 714 stop = next - 1;
b81d288d 715#endif
c277df42
IZ
716 n = nnext;
717 }
718 }
719#ifdef DEBUGGING
720 /* Allow dumping */
cd439c50 721 n = scan + NODE_SZ_STR(scan);
c277df42 722 while (n <= stop) {
22c35a8c 723 if (PL_regkind[(U8)OP(n)] != NOTHING || OP(n) == NOTHING) {
c277df42
IZ
724 OP(n) = OPTIMIZED;
725 NEXT_OFF(n) = 0;
726 }
727 n++;
728 }
653099ff 729#endif
c277df42 730 }
653099ff
GS
731 /* Follow the next-chain of the current node and optimize
732 away all the NOTHINGs from it. */
c277df42 733 if (OP(scan) != CURLYX) {
048cfca1
GS
734 int max = (reg_off_by_arg[OP(scan)]
735 ? I32_MAX
736 /* I32 may be smaller than U16 on CRAYs! */
737 : (I32_MAX < U16_MAX ? I32_MAX : U16_MAX));
c277df42
IZ
738 int off = (reg_off_by_arg[OP(scan)] ? ARG(scan) : NEXT_OFF(scan));
739 int noff;
740 regnode *n = scan;
b81d288d 741
c277df42
IZ
742 /* Skip NOTHING and LONGJMP. */
743 while ((n = regnext(n))
22c35a8c 744 && ((PL_regkind[(U8)OP(n)] == NOTHING && (noff = NEXT_OFF(n)))
c277df42
IZ
745 || ((OP(n) == LONGJMP) && (noff = ARG(n))))
746 && off + noff < max)
747 off += noff;
748 if (reg_off_by_arg[OP(scan)])
749 ARG(scan) = off;
b81d288d 750 else
c277df42
IZ
751 NEXT_OFF(scan) = off;
752 }
653099ff
GS
753 /* The principal pseudo-switch. Cannot be a switch, since we
754 look into several different things. */
b81d288d 755 if (OP(scan) == BRANCH || OP(scan) == BRANCHJ
c277df42
IZ
756 || OP(scan) == IFTHEN || OP(scan) == SUSPEND) {
757 next = regnext(scan);
758 code = OP(scan);
b81d288d
AB
759
760 if (OP(next) == code || code == IFTHEN || code == SUSPEND) {
c277df42 761 I32 max1 = 0, min1 = I32_MAX, num = 0;
653099ff 762 struct regnode_charclass_class accum;
c277df42 763
653099ff 764 if (flags & SCF_DO_SUBSTR) /* XXXX Add !SUSPEND? */
830247a4 765 scan_commit(pRExC_state, data); /* Cannot merge strings after this. */
653099ff 766 if (flags & SCF_DO_STCLASS)
830247a4 767 cl_init_zero(pRExC_state, &accum);
c277df42 768 while (OP(scan) == code) {
830247a4 769 I32 deltanext, minnext, f = 0, fake;
653099ff 770 struct regnode_charclass_class this_class;
c277df42
IZ
771
772 num++;
773 data_fake.flags = 0;
b81d288d 774 if (data) {
2c2d71f5 775 data_fake.whilem_c = data->whilem_c;
cb434fcc
IZ
776 data_fake.last_closep = data->last_closep;
777 }
778 else
779 data_fake.last_closep = &fake;
c277df42
IZ
780 next = regnext(scan);
781 scan = NEXTOPER(scan);
782 if (code != BRANCH)
783 scan = NEXTOPER(scan);
653099ff 784 if (flags & SCF_DO_STCLASS) {
830247a4 785 cl_init(pRExC_state, &this_class);
653099ff
GS
786 data_fake.start_class = &this_class;
787 f = SCF_DO_STCLASS_AND;
b81d288d 788 }
e1901655
IZ
789 if (flags & SCF_WHILEM_VISITED_POS)
790 f |= SCF_WHILEM_VISITED_POS;
653099ff 791 /* we suppose the run is continuous, last=next...*/
830247a4
IZ
792 minnext = study_chunk(pRExC_state, &scan, &deltanext,
793 next, &data_fake, f);
b81d288d 794 if (min1 > minnext)
c277df42
IZ
795 min1 = minnext;
796 if (max1 < minnext + deltanext)
797 max1 = minnext + deltanext;
798 if (deltanext == I32_MAX)
aca2d497 799 is_inf = is_inf_internal = 1;
c277df42
IZ
800 scan = next;
801 if (data_fake.flags & (SF_HAS_PAR|SF_IN_PAR))
802 pars++;
405ff068 803 if (data && (data_fake.flags & SF_HAS_EVAL))
c277df42 804 data->flags |= SF_HAS_EVAL;
2c2d71f5
JH
805 if (data)
806 data->whilem_c = data_fake.whilem_c;
653099ff 807 if (flags & SCF_DO_STCLASS)
830247a4 808 cl_or(pRExC_state, &accum, &this_class);
b81d288d 809 if (code == SUSPEND)
c277df42
IZ
810 break;
811 }
812 if (code == IFTHEN && num < 2) /* Empty ELSE branch */
813 min1 = 0;
814 if (flags & SCF_DO_SUBSTR) {
815 data->pos_min += min1;
816 data->pos_delta += max1 - min1;
817 if (max1 != min1 || is_inf)
818 data->longest = &(data->longest_float);
819 }
820 min += min1;
821 delta += max1 - min1;
653099ff 822 if (flags & SCF_DO_STCLASS_OR) {
830247a4 823 cl_or(pRExC_state, data->start_class, &accum);
653099ff
GS
824 if (min1) {
825 cl_and(data->start_class, &and_with);
826 flags &= ~SCF_DO_STCLASS;
827 }
828 }
829 else if (flags & SCF_DO_STCLASS_AND) {
de0c8cb8
GS
830 if (min1) {
831 cl_and(data->start_class, &accum);
653099ff 832 flags &= ~SCF_DO_STCLASS;
de0c8cb8
GS
833 }
834 else {
b81d288d 835 /* Switch to OR mode: cache the old value of
de0c8cb8
GS
836 * data->start_class */
837 StructCopy(data->start_class, &and_with,
838 struct regnode_charclass_class);
839 flags &= ~SCF_DO_STCLASS_AND;
840 StructCopy(&accum, data->start_class,
841 struct regnode_charclass_class);
842 flags |= SCF_DO_STCLASS_OR;
843 data->start_class->flags |= ANYOF_EOS;
844 }
653099ff 845 }
a0ed51b3
LW
846 }
847 else if (code == BRANCHJ) /* single branch is optimized. */
c277df42
IZ
848 scan = NEXTOPER(NEXTOPER(scan));
849 else /* single branch is optimized. */
850 scan = NEXTOPER(scan);
851 continue;
a0ed51b3
LW
852 }
853 else if (OP(scan) == EXACT) {
cd439c50 854 I32 l = STR_LEN(scan);
1aa99e6b 855 UV uc = *((U8*)STRING(scan));
a0ed51b3 856 if (UTF) {
1aa99e6b
IH
857 U8 *s = (U8*)STRING(scan);
858 l = utf8_length(s, s + l);
9041c2e3 859 uc = utf8_to_uvchr(s, NULL);
a0ed51b3
LW
860 }
861 min += l;
c277df42 862 if (flags & SCF_DO_SUBSTR) { /* Update longest substr. */
c277df42
IZ
863 /* The code below prefers earlier match for fixed
864 offset, later match for variable offset. */
865 if (data->last_end == -1) { /* Update the start info. */
866 data->last_start_min = data->pos_min;
867 data->last_start_max = is_inf
b81d288d 868 ? I32_MAX : data->pos_min + data->pos_delta;
c277df42 869 }
cd439c50 870 sv_catpvn(data->last_found, STRING(scan), STR_LEN(scan));
c277df42
IZ
871 data->last_end = data->pos_min + l;
872 data->pos_min += l; /* As in the first entry. */
873 data->flags &= ~SF_BEFORE_EOL;
874 }
653099ff
GS
875 if (flags & SCF_DO_STCLASS_AND) {
876 /* Check whether it is compatible with what we know already! */
877 int compat = 1;
878
1aa99e6b 879 if (uc >= 0x100 ||
516a5887 880 (!(data->start_class->flags & (ANYOF_CLASS | ANYOF_LOCALE))
1aa99e6b 881 && !ANYOF_BITMAP_TEST(data->start_class, uc)
653099ff 882 && (!(data->start_class->flags & ANYOF_FOLD)
1aa99e6b 883 || !ANYOF_BITMAP_TEST(data->start_class, PL_fold[uc])))
516a5887 884 )
653099ff
GS
885 compat = 0;
886 ANYOF_CLASS_ZERO(data->start_class);
887 ANYOF_BITMAP_ZERO(data->start_class);
888 if (compat)
1aa99e6b 889 ANYOF_BITMAP_SET(data->start_class, uc);
653099ff 890 data->start_class->flags &= ~ANYOF_EOS;
9b877dbb
IH
891 if (uc < 0x100)
892 data->start_class->flags &= ~ANYOF_UNICODE_ALL;
653099ff
GS
893 }
894 else if (flags & SCF_DO_STCLASS_OR) {
895 /* false positive possible if the class is case-folded */
1aa99e6b 896 if (uc < 0x100)
9b877dbb
IH
897 ANYOF_BITMAP_SET(data->start_class, uc);
898 else
899 data->start_class->flags |= ANYOF_UNICODE_ALL;
653099ff
GS
900 data->start_class->flags &= ~ANYOF_EOS;
901 cl_and(data->start_class, &and_with);
902 }
903 flags &= ~SCF_DO_STCLASS;
a0ed51b3 904 }
653099ff 905 else if (PL_regkind[(U8)OP(scan)] == EXACT) { /* But OP != EXACT! */
cd439c50 906 I32 l = STR_LEN(scan);
1aa99e6b 907 UV uc = *((U8*)STRING(scan));
653099ff
GS
908
909 /* Search for fixed substrings supports EXACT only. */
b81d288d 910 if (flags & SCF_DO_SUBSTR)
830247a4 911 scan_commit(pRExC_state, data);
a0ed51b3 912 if (UTF) {
1aa99e6b
IH
913 U8 *s = (U8 *)STRING(scan);
914 l = utf8_length(s, s + l);
9041c2e3 915 uc = utf8_to_uvchr(s, NULL);
a0ed51b3
LW
916 }
917 min += l;
c277df42 918 if (data && (flags & SCF_DO_SUBSTR))
a0ed51b3 919 data->pos_min += l;
653099ff
GS
920 if (flags & SCF_DO_STCLASS_AND) {
921 /* Check whether it is compatible with what we know already! */
922 int compat = 1;
923
1aa99e6b 924 if (uc >= 0x100 ||
516a5887 925 (!(data->start_class->flags & (ANYOF_CLASS | ANYOF_LOCALE))
1aa99e6b 926 && !ANYOF_BITMAP_TEST(data->start_class, uc)
516a5887 927 && !ANYOF_BITMAP_TEST(data->start_class, PL_fold[uc])))
653099ff
GS
928 compat = 0;
929 ANYOF_CLASS_ZERO(data->start_class);
930 ANYOF_BITMAP_ZERO(data->start_class);
931 if (compat) {
1aa99e6b 932 ANYOF_BITMAP_SET(data->start_class, uc);
653099ff
GS
933 data->start_class->flags &= ~ANYOF_EOS;
934 data->start_class->flags |= ANYOF_FOLD;
935 if (OP(scan) == EXACTFL)
936 data->start_class->flags |= ANYOF_LOCALE;
937 }
938 }
939 else if (flags & SCF_DO_STCLASS_OR) {
940 if (data->start_class->flags & ANYOF_FOLD) {
941 /* false positive possible if the class is case-folded.
942 Assume that the locale settings are the same... */
1aa99e6b
IH
943 if (uc < 0x100)
944 ANYOF_BITMAP_SET(data->start_class, uc);
653099ff
GS
945 data->start_class->flags &= ~ANYOF_EOS;
946 }
947 cl_and(data->start_class, &and_with);
948 }
949 flags &= ~SCF_DO_STCLASS;
a0ed51b3 950 }
4d61ec05 951 else if (strchr((char*)PL_varies,OP(scan))) {
9c5ffd7c 952 I32 mincount, maxcount, minnext, deltanext, fl = 0;
aa7a4b56 953 I32 f = flags, pos_before = 0;
c277df42 954 regnode *oscan = scan;
653099ff
GS
955 struct regnode_charclass_class this_class;
956 struct regnode_charclass_class *oclass = NULL;
957
22c35a8c 958 switch (PL_regkind[(U8)OP(scan)]) {
653099ff 959 case WHILEM: /* End of (?:...)* . */
c277df42
IZ
960 scan = NEXTOPER(scan);
961 goto finish;
962 case PLUS:
653099ff 963 if (flags & (SCF_DO_SUBSTR | SCF_DO_STCLASS)) {
c277df42 964 next = NEXTOPER(scan);
653099ff 965 if (OP(next) == EXACT || (flags & SCF_DO_STCLASS)) {
b81d288d
AB
966 mincount = 1;
967 maxcount = REG_INFTY;
c277df42
IZ
968 next = regnext(scan);
969 scan = NEXTOPER(scan);
970 goto do_curly;
971 }
972 }
973 if (flags & SCF_DO_SUBSTR)
974 data->pos_min++;
975 min++;
976 /* Fall through. */
977 case STAR:
653099ff
GS
978 if (flags & SCF_DO_STCLASS) {
979 mincount = 0;
b81d288d 980 maxcount = REG_INFTY;
653099ff
GS
981 next = regnext(scan);
982 scan = NEXTOPER(scan);
983 goto do_curly;
984 }
b81d288d 985 is_inf = is_inf_internal = 1;
c277df42
IZ
986 scan = regnext(scan);
987 if (flags & SCF_DO_SUBSTR) {
830247a4 988 scan_commit(pRExC_state, data); /* Cannot extend fixed substrings */
c277df42
IZ
989 data->longest = &(data->longest_float);
990 }
991 goto optimize_curly_tail;
992 case CURLY:
b81d288d 993 mincount = ARG1(scan);
c277df42
IZ
994 maxcount = ARG2(scan);
995 next = regnext(scan);
cb434fcc
IZ
996 if (OP(scan) == CURLYX) {
997 I32 lp = (data ? *(data->last_closep) : 0);
998
999 scan->flags = ((lp <= U8_MAX) ? lp : U8_MAX);
1000 }
c277df42
IZ
1001 scan = NEXTOPER(scan) + EXTRA_STEP_2ARGS;
1002 do_curly:
1003 if (flags & SCF_DO_SUBSTR) {
830247a4 1004 if (mincount == 0) scan_commit(pRExC_state,data); /* Cannot extend fixed substrings */
c277df42
IZ
1005 pos_before = data->pos_min;
1006 }
1007 if (data) {
1008 fl = data->flags;
1009 data->flags &= ~(SF_HAS_PAR|SF_IN_PAR|SF_HAS_EVAL);
1010 if (is_inf)
1011 data->flags |= SF_IS_INF;
1012 }
653099ff 1013 if (flags & SCF_DO_STCLASS) {
830247a4 1014 cl_init(pRExC_state, &this_class);
653099ff
GS
1015 oclass = data->start_class;
1016 data->start_class = &this_class;
1017 f |= SCF_DO_STCLASS_AND;
1018 f &= ~SCF_DO_STCLASS_OR;
1019 }
e1901655
IZ
1020 /* These are the cases when once a subexpression
1021 fails at a particular position, it cannot succeed
1022 even after backtracking at the enclosing scope.
b81d288d 1023
e1901655
IZ
1024 XXXX what if minimal match and we are at the
1025 initial run of {n,m}? */
1026 if ((mincount != maxcount - 1) && (maxcount != REG_INFTY))
1027 f &= ~SCF_WHILEM_VISITED_POS;
653099ff 1028
c277df42 1029 /* This will finish on WHILEM, setting scan, or on NULL: */
b81d288d
AB
1030 minnext = study_chunk(pRExC_state, &scan, &deltanext, last, data,
1031 mincount == 0
653099ff
GS
1032 ? (f & ~SCF_DO_SUBSTR) : f);
1033
1034 if (flags & SCF_DO_STCLASS)
1035 data->start_class = oclass;
1036 if (mincount == 0 || minnext == 0) {
1037 if (flags & SCF_DO_STCLASS_OR) {
830247a4 1038 cl_or(pRExC_state, data->start_class, &this_class);
653099ff
GS
1039 }
1040 else if (flags & SCF_DO_STCLASS_AND) {
b81d288d 1041 /* Switch to OR mode: cache the old value of
653099ff
GS
1042 * data->start_class */
1043 StructCopy(data->start_class, &and_with,
1044 struct regnode_charclass_class);
1045 flags &= ~SCF_DO_STCLASS_AND;
1046 StructCopy(&this_class, data->start_class,
1047 struct regnode_charclass_class);
1048 flags |= SCF_DO_STCLASS_OR;
1049 data->start_class->flags |= ANYOF_EOS;
1050 }
1051 } else { /* Non-zero len */
1052 if (flags & SCF_DO_STCLASS_OR) {
830247a4 1053 cl_or(pRExC_state, data->start_class, &this_class);
653099ff
GS
1054 cl_and(data->start_class, &and_with);
1055 }
1056 else if (flags & SCF_DO_STCLASS_AND)
1057 cl_and(data->start_class, &this_class);
1058 flags &= ~SCF_DO_STCLASS;
1059 }
c277df42
IZ
1060 if (!scan) /* It was not CURLYX, but CURLY. */
1061 scan = next;
b81d288d 1062 if (ckWARN(WARN_REGEXP) && (minnext + deltanext == 0)
99799961 1063 && data && !(data->flags & (SF_HAS_PAR|SF_IN_PAR))
17feb5d5 1064 && maxcount <= REG_INFTY/3) /* Complement check for big count */
b45f050a 1065 {
830247a4 1066 vWARN(RExC_parse,
b45f050a
JF
1067 "Quantifier unexpected on zero-length expression");
1068 }
1069
c277df42 1070 min += minnext * mincount;
b81d288d 1071 is_inf_internal |= ((maxcount == REG_INFTY
155aba94
GS
1072 && (minnext + deltanext) > 0)
1073 || deltanext == I32_MAX);
aca2d497 1074 is_inf |= is_inf_internal;
c277df42
IZ
1075 delta += (minnext + deltanext) * maxcount - minnext * mincount;
1076
1077 /* Try powerful optimization CURLYX => CURLYN. */
b81d288d 1078 if ( OP(oscan) == CURLYX && data
c277df42
IZ
1079 && data->flags & SF_IN_PAR
1080 && !(data->flags & SF_HAS_EVAL)
1081 && !deltanext && minnext == 1 ) {
1082 /* Try to optimize to CURLYN. */
1083 regnode *nxt = NEXTOPER(oscan) + EXTRA_STEP_2ARGS;
1084 regnode *nxt1 = nxt, *nxt2;
1085
1086 /* Skip open. */
1087 nxt = regnext(nxt);
4d61ec05 1088 if (!strchr((char*)PL_simple,OP(nxt))
22c35a8c 1089 && !(PL_regkind[(U8)OP(nxt)] == EXACT
b81d288d 1090 && STR_LEN(nxt) == 1))
c277df42
IZ
1091 goto nogo;
1092 nxt2 = nxt;
1093 nxt = regnext(nxt);
b81d288d 1094 if (OP(nxt) != CLOSE)
c277df42
IZ
1095 goto nogo;
1096 /* Now we know that nxt2 is the only contents: */
1097 oscan->flags = ARG(nxt);
1098 OP(oscan) = CURLYN;
1099 OP(nxt1) = NOTHING; /* was OPEN. */
1100#ifdef DEBUGGING
1101 OP(nxt1 + 1) = OPTIMIZED; /* was count. */
1102 NEXT_OFF(nxt1+ 1) = 0; /* just for consistancy. */
1103 NEXT_OFF(nxt2) = 0; /* just for consistancy with CURLY. */
1104 OP(nxt) = OPTIMIZED; /* was CLOSE. */
1105 OP(nxt + 1) = OPTIMIZED; /* was count. */
1106 NEXT_OFF(nxt+ 1) = 0; /* just for consistancy. */
b81d288d 1107#endif
c277df42 1108 }
c277df42
IZ
1109 nogo:
1110
1111 /* Try optimization CURLYX => CURLYM. */
b81d288d 1112 if ( OP(oscan) == CURLYX && data
c277df42 1113 && !(data->flags & SF_HAS_PAR)
c277df42
IZ
1114 && !(data->flags & SF_HAS_EVAL)
1115 && !deltanext ) {
1116 /* XXXX How to optimize if data == 0? */
1117 /* Optimize to a simpler form. */
1118 regnode *nxt = NEXTOPER(oscan) + EXTRA_STEP_2ARGS; /* OPEN */
1119 regnode *nxt2;
1120
1121 OP(oscan) = CURLYM;
1122 while ( (nxt2 = regnext(nxt)) /* skip over embedded stuff*/
b81d288d 1123 && (OP(nxt2) != WHILEM))
c277df42
IZ
1124 nxt = nxt2;
1125 OP(nxt2) = SUCCEED; /* Whas WHILEM */
c277df42
IZ
1126 /* Need to optimize away parenths. */
1127 if (data->flags & SF_IN_PAR) {
1128 /* Set the parenth number. */
1129 regnode *nxt1 = NEXTOPER(oscan) + EXTRA_STEP_2ARGS; /* OPEN*/
1130
b81d288d 1131 if (OP(nxt) != CLOSE)
b45f050a 1132 FAIL("Panic opt close");
c277df42
IZ
1133 oscan->flags = ARG(nxt);
1134 OP(nxt1) = OPTIMIZED; /* was OPEN. */
1135 OP(nxt) = OPTIMIZED; /* was CLOSE. */
1136#ifdef DEBUGGING
1137 OP(nxt1 + 1) = OPTIMIZED; /* was count. */
1138 OP(nxt + 1) = OPTIMIZED; /* was count. */
1139 NEXT_OFF(nxt1 + 1) = 0; /* just for consistancy. */
1140 NEXT_OFF(nxt + 1) = 0; /* just for consistancy. */
b81d288d 1141#endif
c277df42
IZ
1142#if 0
1143 while ( nxt1 && (OP(nxt1) != WHILEM)) {
1144 regnode *nnxt = regnext(nxt1);
b81d288d 1145
c277df42
IZ
1146 if (nnxt == nxt) {
1147 if (reg_off_by_arg[OP(nxt1)])
1148 ARG_SET(nxt1, nxt2 - nxt1);
1149 else if (nxt2 - nxt1 < U16_MAX)
1150 NEXT_OFF(nxt1) = nxt2 - nxt1;
1151 else
1152 OP(nxt) = NOTHING; /* Cannot beautify */
1153 }
1154 nxt1 = nnxt;
1155 }
1156#endif
1157 /* Optimize again: */
b81d288d 1158 study_chunk(pRExC_state, &nxt1, &deltanext, nxt,
e1901655 1159 NULL, 0);
a0ed51b3
LW
1160 }
1161 else
c277df42 1162 oscan->flags = 0;
c277df42 1163 }
e1901655
IZ
1164 else if ((OP(oscan) == CURLYX)
1165 && (flags & SCF_WHILEM_VISITED_POS)
1166 /* See the comment on a similar expression above.
1167 However, this time it not a subexpression
1168 we care about, but the expression itself. */
1169 && (maxcount == REG_INFTY)
1170 && data && ++data->whilem_c < 16) {
1171 /* This stays as CURLYX, we can put the count/of pair. */
2c2d71f5
JH
1172 /* Find WHILEM (as in regexec.c) */
1173 regnode *nxt = oscan + NEXT_OFF(oscan);
1174
1175 if (OP(PREVOPER(nxt)) == NOTHING) /* LONGJMP */
1176 nxt += ARG(nxt);
1177 PREVOPER(nxt)->flags = data->whilem_c
830247a4 1178 | (RExC_whilem_seen << 4); /* On WHILEM */
2c2d71f5 1179 }
b81d288d 1180 if (data && fl & (SF_HAS_PAR|SF_IN_PAR))
c277df42
IZ
1181 pars++;
1182 if (flags & SCF_DO_SUBSTR) {
1183 SV *last_str = Nullsv;
1184 int counted = mincount != 0;
1185
1186 if (data->last_end > 0 && mincount != 0) { /* Ends with a string. */
b81d288d 1187 I32 b = pos_before >= data->last_start_min
c277df42
IZ
1188 ? pos_before : data->last_start_min;
1189 STRLEN l;
1190 char *s = SvPV(data->last_found, l);
a0ed51b3
LW
1191 I32 old = b - data->last_start_min;
1192
1193 if (UTF)
1194 old = utf8_hop((U8*)s, old) - (U8*)s;
c277df42 1195
a0ed51b3 1196 l -= old;
c277df42 1197 /* Get the added string: */
79cb57f6 1198 last_str = newSVpvn(s + old, l);
c277df42
IZ
1199 if (deltanext == 0 && pos_before == b) {
1200 /* What was added is a constant string */
1201 if (mincount > 1) {
1202 SvGROW(last_str, (mincount * l) + 1);
b81d288d 1203 repeatcpy(SvPVX(last_str) + l,
c277df42
IZ
1204 SvPVX(last_str), l, mincount - 1);
1205 SvCUR(last_str) *= mincount;
1206 /* Add additional parts. */
b81d288d 1207 SvCUR_set(data->last_found,
c277df42
IZ
1208 SvCUR(data->last_found) - l);
1209 sv_catsv(data->last_found, last_str);
1210 data->last_end += l * (mincount - 1);
1211 }
2a8d9689
HS
1212 } else {
1213 /* start offset must point into the last copy */
1214 data->last_start_min += minnext * (mincount - 1);
4b2cff9a
HS
1215 data->last_start_max += is_inf ? 0 : (maxcount - 1)
1216 * (minnext + data->pos_delta);
c277df42
IZ
1217 }
1218 }
1219 /* It is counted once already... */
1220 data->pos_min += minnext * (mincount - counted);
1221 data->pos_delta += - counted * deltanext +
1222 (minnext + deltanext) * maxcount - minnext * mincount;
1223 if (mincount != maxcount) {
653099ff
GS
1224 /* Cannot extend fixed substrings found inside
1225 the group. */
830247a4 1226 scan_commit(pRExC_state,data);
c277df42
IZ
1227 if (mincount && last_str) {
1228 sv_setsv(data->last_found, last_str);
1229 data->last_end = data->pos_min;
b81d288d 1230 data->last_start_min =
a0ed51b3 1231 data->pos_min - CHR_SVLEN(last_str);
b81d288d
AB
1232 data->last_start_max = is_inf
1233 ? I32_MAX
c277df42 1234 : data->pos_min + data->pos_delta
a0ed51b3 1235 - CHR_SVLEN(last_str);
c277df42
IZ
1236 }
1237 data->longest = &(data->longest_float);
1238 }
aca2d497 1239 SvREFCNT_dec(last_str);
c277df42 1240 }
405ff068 1241 if (data && (fl & SF_HAS_EVAL))
c277df42
IZ
1242 data->flags |= SF_HAS_EVAL;
1243 optimize_curly_tail:
c277df42 1244 if (OP(oscan) != CURLYX) {
22c35a8c 1245 while (PL_regkind[(U8)OP(next = regnext(oscan))] == NOTHING
c277df42
IZ
1246 && NEXT_OFF(next))
1247 NEXT_OFF(oscan) += NEXT_OFF(next);
1248 }
c277df42 1249 continue;
653099ff 1250 default: /* REF and CLUMP only? */
c277df42 1251 if (flags & SCF_DO_SUBSTR) {
830247a4 1252 scan_commit(pRExC_state,data); /* Cannot expect anything... */
c277df42
IZ
1253 data->longest = &(data->longest_float);
1254 }
aca2d497 1255 is_inf = is_inf_internal = 1;
653099ff 1256 if (flags & SCF_DO_STCLASS_OR)
830247a4 1257 cl_anything(pRExC_state, data->start_class);
653099ff 1258 flags &= ~SCF_DO_STCLASS;
c277df42
IZ
1259 break;
1260 }
a0ed51b3 1261 }
ffc61ed2 1262 else if (strchr((char*)PL_simple,OP(scan))) {
9c5ffd7c 1263 int value = 0;
653099ff 1264
c277df42 1265 if (flags & SCF_DO_SUBSTR) {
830247a4 1266 scan_commit(pRExC_state,data);
c277df42
IZ
1267 data->pos_min++;
1268 }
1269 min++;
653099ff
GS
1270 if (flags & SCF_DO_STCLASS) {
1271 data->start_class->flags &= ~ANYOF_EOS; /* No match on empty */
1272
1273 /* Some of the logic below assumes that switching
1274 locale on will only add false positives. */
1275 switch (PL_regkind[(U8)OP(scan)]) {
653099ff 1276 case SANY:
653099ff
GS
1277 default:
1278 do_default:
1279 /* Perl_croak(aTHX_ "panic: unexpected simple REx opcode %d", OP(scan)); */
1280 if (flags & SCF_DO_STCLASS_OR) /* Allow everything */
830247a4 1281 cl_anything(pRExC_state, data->start_class);
653099ff
GS
1282 break;
1283 case REG_ANY:
1284 if (OP(scan) == SANY)
1285 goto do_default;
1286 if (flags & SCF_DO_STCLASS_OR) { /* Everything but \n */
1287 value = (ANYOF_BITMAP_TEST(data->start_class,'\n')
1288 || (data->start_class->flags & ANYOF_CLASS));
830247a4 1289 cl_anything(pRExC_state, data->start_class);
653099ff
GS
1290 }
1291 if (flags & SCF_DO_STCLASS_AND || !value)
1292 ANYOF_BITMAP_CLEAR(data->start_class,'\n');
1293 break;
1294 case ANYOF:
1295 if (flags & SCF_DO_STCLASS_AND)
1296 cl_and(data->start_class,
1297 (struct regnode_charclass_class*)scan);
1298 else
830247a4 1299 cl_or(pRExC_state, data->start_class,
653099ff
GS
1300 (struct regnode_charclass_class*)scan);
1301 break;
1302 case ALNUM:
1303 if (flags & SCF_DO_STCLASS_AND) {
1304 if (!(data->start_class->flags & ANYOF_LOCALE)) {
1305 ANYOF_CLASS_CLEAR(data->start_class,ANYOF_NALNUM);
1306 for (value = 0; value < 256; value++)
1307 if (!isALNUM(value))
1308 ANYOF_BITMAP_CLEAR(data->start_class, value);
1309 }
1310 }
1311 else {
1312 if (data->start_class->flags & ANYOF_LOCALE)
1313 ANYOF_CLASS_SET(data->start_class,ANYOF_ALNUM);
1314 else {
1315 for (value = 0; value < 256; value++)
1316 if (isALNUM(value))
b81d288d 1317 ANYOF_BITMAP_SET(data->start_class, value);
653099ff
GS
1318 }
1319 }
1320 break;
1321 case ALNUML:
1322 if (flags & SCF_DO_STCLASS_AND) {
1323 if (data->start_class->flags & ANYOF_LOCALE)
1324 ANYOF_CLASS_CLEAR(data->start_class,ANYOF_NALNUM);
1325 }
1326 else {
1327 ANYOF_CLASS_SET(data->start_class,ANYOF_ALNUM);
1328 data->start_class->flags |= ANYOF_LOCALE;
1329 }
1330 break;
1331 case NALNUM:
1332 if (flags & SCF_DO_STCLASS_AND) {
1333 if (!(data->start_class->flags & ANYOF_LOCALE)) {
1334 ANYOF_CLASS_CLEAR(data->start_class,ANYOF_ALNUM);
1335 for (value = 0; value < 256; value++)
1336 if (isALNUM(value))
1337 ANYOF_BITMAP_CLEAR(data->start_class, value);
1338 }
1339 }
1340 else {
1341 if (data->start_class->flags & ANYOF_LOCALE)
1342 ANYOF_CLASS_SET(data->start_class,ANYOF_NALNUM);
1343 else {
1344 for (value = 0; value < 256; value++)
1345 if (!isALNUM(value))
b81d288d 1346 ANYOF_BITMAP_SET(data->start_class, value);
653099ff
GS
1347 }
1348 }
1349 break;
1350 case NALNUML:
1351 if (flags & SCF_DO_STCLASS_AND) {
1352 if (data->start_class->flags & ANYOF_LOCALE)
1353 ANYOF_CLASS_CLEAR(data->start_class,ANYOF_ALNUM);
1354 }
1355 else {
1356 data->start_class->flags |= ANYOF_LOCALE;
1357 ANYOF_CLASS_SET(data->start_class,ANYOF_NALNUM);
1358 }
1359 break;
1360 case SPACE:
1361 if (flags & SCF_DO_STCLASS_AND) {
1362 if (!(data->start_class->flags & ANYOF_LOCALE)) {
1363 ANYOF_CLASS_CLEAR(data->start_class,ANYOF_NSPACE);
1364 for (value = 0; value < 256; value++)
1365 if (!isSPACE(value))
1366 ANYOF_BITMAP_CLEAR(data->start_class, value);
1367 }
1368 }
1369 else {
1370 if (data->start_class->flags & ANYOF_LOCALE)
1371 ANYOF_CLASS_SET(data->start_class,ANYOF_SPACE);
1372 else {
1373 for (value = 0; value < 256; value++)
1374 if (isSPACE(value))
b81d288d 1375 ANYOF_BITMAP_SET(data->start_class, value);
653099ff
GS
1376 }
1377 }
1378 break;
1379 case SPACEL:
1380 if (flags & SCF_DO_STCLASS_AND) {
1381 if (data->start_class->flags & ANYOF_LOCALE)
1382 ANYOF_CLASS_CLEAR(data->start_class,ANYOF_NSPACE);
1383 }
1384 else {
1385 data->start_class->flags |= ANYOF_LOCALE;
1386 ANYOF_CLASS_SET(data->start_class,ANYOF_SPACE);
1387 }
1388 break;
1389 case NSPACE:
1390 if (flags & SCF_DO_STCLASS_AND) {
1391 if (!(data->start_class->flags & ANYOF_LOCALE)) {
1392 ANYOF_CLASS_CLEAR(data->start_class,ANYOF_SPACE);
1393 for (value = 0; value < 256; value++)
1394 if (isSPACE(value))
1395 ANYOF_BITMAP_CLEAR(data->start_class, value);
1396 }
1397 }
1398 else {
1399 if (data->start_class->flags & ANYOF_LOCALE)
1400 ANYOF_CLASS_SET(data->start_class,ANYOF_NSPACE);
1401 else {
1402 for (value = 0; value < 256; value++)
1403 if (!isSPACE(value))
b81d288d 1404 ANYOF_BITMAP_SET(data->start_class, value);
653099ff
GS
1405 }
1406 }
1407 break;
1408 case NSPACEL:
1409 if (flags & SCF_DO_STCLASS_AND) {
1410 if (data->start_class->flags & ANYOF_LOCALE) {
1411 ANYOF_CLASS_CLEAR(data->start_class,ANYOF_SPACE);
1412 for (value = 0; value < 256; value++)
1413 if (!isSPACE(value))
1414 ANYOF_BITMAP_CLEAR(data->start_class, value);
1415 }
1416 }
1417 else {
1418 data->start_class->flags |= ANYOF_LOCALE;
1419 ANYOF_CLASS_SET(data->start_class,ANYOF_NSPACE);
1420 }
1421 break;
1422 case DIGIT:
1423 if (flags & SCF_DO_STCLASS_AND) {
1424 ANYOF_CLASS_CLEAR(data->start_class,ANYOF_NDIGIT);
1425 for (value = 0; value < 256; value++)
1426 if (!isDIGIT(value))
1427 ANYOF_BITMAP_CLEAR(data->start_class, value);
1428 }
1429 else {
1430 if (data->start_class->flags & ANYOF_LOCALE)
1431 ANYOF_CLASS_SET(data->start_class,ANYOF_DIGIT);
1432 else {
1433 for (value = 0; value < 256; value++)
1434 if (isDIGIT(value))
b81d288d 1435 ANYOF_BITMAP_SET(data->start_class, value);
653099ff
GS
1436 }
1437 }
1438 break;
1439 case NDIGIT:
1440 if (flags & SCF_DO_STCLASS_AND) {
1441 ANYOF_CLASS_CLEAR(data->start_class,ANYOF_DIGIT);
1442 for (value = 0; value < 256; value++)
1443 if (isDIGIT(value))
1444 ANYOF_BITMAP_CLEAR(data->start_class, value);
1445 }
1446 else {
1447 if (data->start_class->flags & ANYOF_LOCALE)
1448 ANYOF_CLASS_SET(data->start_class,ANYOF_NDIGIT);
1449 else {
1450 for (value = 0; value < 256; value++)
1451 if (!isDIGIT(value))
b81d288d 1452 ANYOF_BITMAP_SET(data->start_class, value);
653099ff
GS
1453 }
1454 }
1455 break;
1456 }
1457 if (flags & SCF_DO_STCLASS_OR)
1458 cl_and(data->start_class, &and_with);
1459 flags &= ~SCF_DO_STCLASS;
1460 }
a0ed51b3 1461 }
22c35a8c 1462 else if (PL_regkind[(U8)OP(scan)] == EOL && flags & SCF_DO_SUBSTR) {
c277df42
IZ
1463 data->flags |= (OP(scan) == MEOL
1464 ? SF_BEFORE_MEOL
1465 : SF_BEFORE_SEOL);
a0ed51b3 1466 }
653099ff
GS
1467 else if ( PL_regkind[(U8)OP(scan)] == BRANCHJ
1468 /* Lookbehind, or need to calculate parens/evals/stclass: */
1469 && (scan->flags || data || (flags & SCF_DO_STCLASS))
c277df42 1470 && (OP(scan) == IFMATCH || OP(scan) == UNLESSM)) {
653099ff 1471 /* Lookahead/lookbehind */
cb434fcc 1472 I32 deltanext, minnext, fake = 0;
c277df42 1473 regnode *nscan;
653099ff
GS
1474 struct regnode_charclass_class intrnl;
1475 int f = 0;
c277df42
IZ
1476
1477 data_fake.flags = 0;
b81d288d 1478 if (data) {
2c2d71f5 1479 data_fake.whilem_c = data->whilem_c;
cb434fcc
IZ
1480 data_fake.last_closep = data->last_closep;
1481 }
1482 else
1483 data_fake.last_closep = &fake;
653099ff
GS
1484 if ( flags & SCF_DO_STCLASS && !scan->flags
1485 && OP(scan) == IFMATCH ) { /* Lookahead */
830247a4 1486 cl_init(pRExC_state, &intrnl);
653099ff 1487 data_fake.start_class = &intrnl;
e1901655 1488 f |= SCF_DO_STCLASS_AND;
653099ff 1489 }
e1901655
IZ
1490 if (flags & SCF_WHILEM_VISITED_POS)
1491 f |= SCF_WHILEM_VISITED_POS;
c277df42
IZ
1492 next = regnext(scan);
1493 nscan = NEXTOPER(NEXTOPER(scan));
830247a4 1494 minnext = study_chunk(pRExC_state, &nscan, &deltanext, last, &data_fake, f);
c277df42
IZ
1495 if (scan->flags) {
1496 if (deltanext) {
9baa0206 1497 vFAIL("Variable length lookbehind not implemented");
a0ed51b3
LW
1498 }
1499 else if (minnext > U8_MAX) {
9baa0206 1500 vFAIL2("Lookbehind longer than %"UVuf" not implemented", (UV)U8_MAX);
c277df42
IZ
1501 }
1502 scan->flags = minnext;
1503 }
1504 if (data && data_fake.flags & (SF_HAS_PAR|SF_IN_PAR))
1505 pars++;
405ff068 1506 if (data && (data_fake.flags & SF_HAS_EVAL))
c277df42 1507 data->flags |= SF_HAS_EVAL;
2c2d71f5
JH
1508 if (data)
1509 data->whilem_c = data_fake.whilem_c;
e1901655 1510 if (f & SCF_DO_STCLASS_AND) {
653099ff
GS
1511 int was = (data->start_class->flags & ANYOF_EOS);
1512
1513 cl_and(data->start_class, &intrnl);
1514 if (was)
1515 data->start_class->flags |= ANYOF_EOS;
1516 }
a0ed51b3
LW
1517 }
1518 else if (OP(scan) == OPEN) {
c277df42 1519 pars++;
a0ed51b3 1520 }
cb434fcc
IZ
1521 else if (OP(scan) == CLOSE) {
1522 if (ARG(scan) == is_par) {
1523 next = regnext(scan);
c277df42 1524
cb434fcc
IZ
1525 if ( next && (OP(next) != WHILEM) && next < last)
1526 is_par = 0; /* Disable optimization */
1527 }
1528 if (data)
1529 *(data->last_closep) = ARG(scan);
a0ed51b3
LW
1530 }
1531 else if (OP(scan) == EVAL) {
c277df42
IZ
1532 if (data)
1533 data->flags |= SF_HAS_EVAL;
1534 }
96776eda 1535 else if (OP(scan) == LOGICAL && scan->flags == 2) { /* Embedded follows */
0f5d15d6 1536 if (flags & SCF_DO_SUBSTR) {
830247a4 1537 scan_commit(pRExC_state,data);
0f5d15d6
IZ
1538 data->longest = &(data->longest_float);
1539 }
1540 is_inf = is_inf_internal = 1;
653099ff 1541 if (flags & SCF_DO_STCLASS_OR) /* Allow everything */
830247a4 1542 cl_anything(pRExC_state, data->start_class);
96776eda 1543 flags &= ~SCF_DO_STCLASS;
0f5d15d6 1544 }
c277df42
IZ
1545 /* Else: zero-length, ignore. */
1546 scan = regnext(scan);
1547 }
1548
1549 finish:
1550 *scanp = scan;
aca2d497 1551 *deltap = is_inf_internal ? I32_MAX : delta;
b81d288d 1552 if (flags & SCF_DO_SUBSTR && is_inf)
c277df42
IZ
1553 data->pos_delta = I32_MAX - data->pos_min;
1554 if (is_par > U8_MAX)
1555 is_par = 0;
1556 if (is_par && pars==1 && data) {
1557 data->flags |= SF_IN_PAR;
1558 data->flags &= ~SF_HAS_PAR;
a0ed51b3
LW
1559 }
1560 else if (pars && data) {
c277df42
IZ
1561 data->flags |= SF_HAS_PAR;
1562 data->flags &= ~SF_IN_PAR;
1563 }
653099ff
GS
1564 if (flags & SCF_DO_STCLASS_OR)
1565 cl_and(data->start_class, &and_with);
c277df42
IZ
1566 return min;
1567}
1568
76e3520e 1569STATIC I32
830247a4 1570S_add_data(pTHX_ RExC_state_t *pRExC_state, I32 n, char *s)
c277df42 1571{
830247a4 1572 if (RExC_rx->data) {
b81d288d
AB
1573 Renewc(RExC_rx->data,
1574 sizeof(*RExC_rx->data) + sizeof(void*) * (RExC_rx->data->count + n - 1),
c277df42 1575 char, struct reg_data);
830247a4
IZ
1576 Renew(RExC_rx->data->what, RExC_rx->data->count + n, U8);
1577 RExC_rx->data->count += n;
a0ed51b3
LW
1578 }
1579 else {
830247a4 1580 Newc(1207, RExC_rx->data, sizeof(*RExC_rx->data) + sizeof(void*) * (n - 1),
c277df42 1581 char, struct reg_data);
830247a4
IZ
1582 New(1208, RExC_rx->data->what, n, U8);
1583 RExC_rx->data->count = n;
c277df42 1584 }
830247a4
IZ
1585 Copy(s, RExC_rx->data->what + RExC_rx->data->count - n, n, U8);
1586 return RExC_rx->data->count - n;
c277df42
IZ
1587}
1588
d88dccdf 1589void
864dbfa3 1590Perl_reginitcolors(pTHX)
d88dccdf 1591{
d88dccdf
IZ
1592 int i = 0;
1593 char *s = PerlEnv_getenv("PERL_RE_COLORS");
b81d288d 1594
d88dccdf
IZ
1595 if (s) {
1596 PL_colors[0] = s = savepv(s);
1597 while (++i < 6) {
1598 s = strchr(s, '\t');
1599 if (s) {
1600 *s = '\0';
1601 PL_colors[i] = ++s;
1602 }
1603 else
c712d376 1604 PL_colors[i] = s = "";
d88dccdf
IZ
1605 }
1606 } else {
b81d288d 1607 while (i < 6)
d88dccdf
IZ
1608 PL_colors[i++] = "";
1609 }
1610 PL_colorset = 1;
1611}
1612
8615cb43 1613
a687059c 1614/*
e50aee73 1615 - pregcomp - compile a regular expression into internal code
a687059c
LW
1616 *
1617 * We can't allocate space until we know how big the compiled form will be,
1618 * but we can't compile it (and thus know how big it is) until we've got a
1619 * place to put the code. So we cheat: we compile it twice, once with code
1620 * generation turned off and size counting turned on, and once "for real".
1621 * This also means that we don't allocate space until we are sure that the
1622 * thing really will compile successfully, and we never have to move the
1623 * code and thus invalidate pointers into it. (Note that it has to be in
1624 * one piece because free() must be able to free it all.) [NB: not true in perl]
1625 *
1626 * Beware that the optimization-preparation code in here knows about some
1627 * of the structure of the compiled regexp. [I'll say.]
1628 */
1629regexp *
864dbfa3 1630Perl_pregcomp(pTHX_ char *exp, char *xend, PMOP *pm)
a687059c 1631{
a0d0e21e 1632 register regexp *r;
c277df42 1633 regnode *scan;
c277df42 1634 regnode *first;
a0d0e21e 1635 I32 flags;
a0d0e21e
LW
1636 I32 minlen = 0;
1637 I32 sawplus = 0;
1638 I32 sawopen = 0;
2c2d71f5 1639 scan_data_t data;
830247a4
IZ
1640 RExC_state_t RExC_state;
1641 RExC_state_t *pRExC_state = &RExC_state;
a0d0e21e
LW
1642
1643 if (exp == NULL)
c277df42 1644 FAIL("NULL regexp argument");
a0d0e21e 1645
830247a4 1646 /* XXXX This looks very suspicious... */
1aa99e6b
IH
1647 if (pm->op_pmdynflags & PMdf_CMP_UTF8)
1648 RExC_utf8 = 1;
a0ed51b3 1649 else
1aa99e6b 1650 RExC_utf8 = 0;
a0ed51b3 1651
5cfc7842 1652 RExC_precomp = exp;
35ef4773 1653 DEBUG_r(if (!PL_colorset) reginitcolors());
2c2d71f5 1654 DEBUG_r(PerlIO_printf(Perl_debug_log, "%sCompiling REx%s `%s%*s%s'\n",
d88dccdf 1655 PL_colors[4],PL_colors[5],PL_colors[0],
830247a4
IZ
1656 (int)(xend - exp), RExC_precomp, PL_colors[1]));
1657 RExC_flags16 = pm->op_pmflags;
1658 RExC_sawback = 0;
bbce6d69 1659
830247a4
IZ
1660 RExC_seen = 0;
1661 RExC_seen_zerolen = *exp == '^' ? -1 : 0;
1662 RExC_seen_evals = 0;
1663 RExC_extralen = 0;
c277df42 1664
bbce6d69 1665 /* First pass: determine size, legality. */
830247a4 1666 RExC_parse = exp;
fac92740 1667 RExC_start = exp;
830247a4
IZ
1668 RExC_end = xend;
1669 RExC_naughty = 0;
1670 RExC_npar = 1;
1671 RExC_size = 0L;
1672 RExC_emit = &PL_regdummy;
1673 RExC_whilem_seen = 0;
85ddcde9
JH
1674#if 0 /* REGC() is (currently) a NOP at the first pass.
1675 * Clever compilers notice this and complain. --jhi */
830247a4 1676 REGC((U8)REG_MAGIC, (char*)RExC_emit);
85ddcde9 1677#endif
830247a4 1678 if (reg(pRExC_state, 0, &flags) == NULL) {
830247a4 1679 RExC_precomp = Nullch;
a0d0e21e
LW
1680 return(NULL);
1681 }
830247a4 1682 DEBUG_r(PerlIO_printf(Perl_debug_log, "size %"IVdf" ", (IV)RExC_size));
c277df42 1683
c277df42
IZ
1684 /* Small enough for pointer-storage convention?
1685 If extralen==0, this means that we will not need long jumps. */
830247a4
IZ
1686 if (RExC_size >= 0x10000L && RExC_extralen)
1687 RExC_size += RExC_extralen;
c277df42 1688 else
830247a4
IZ
1689 RExC_extralen = 0;
1690 if (RExC_whilem_seen > 15)
1691 RExC_whilem_seen = 15;
a0d0e21e 1692
bbce6d69 1693 /* Allocate space and initialize. */
830247a4 1694 Newc(1001, r, sizeof(regexp) + (unsigned)RExC_size * sizeof(regnode),
c277df42 1695 char, regexp);
a0d0e21e 1696 if (r == NULL)
b45f050a
JF
1697 FAIL("Regexp out of space");
1698
0f79a09d
GS
1699#ifdef DEBUGGING
1700 /* avoid reading uninitialized memory in DEBUGGING code in study_chunk() */
830247a4 1701 Zero(r, sizeof(regexp) + (unsigned)RExC_size * sizeof(regnode), char);
0f79a09d 1702#endif
c277df42 1703 r->refcnt = 1;
bbce6d69 1704 r->prelen = xend - exp;
5cfc7842 1705 r->precomp = savepvn(RExC_precomp, r->prelen);
cf93c79d
IZ
1706 r->subbeg = NULL;
1707 r->reganch = pm->op_pmflags & PMf_COMPILETIME;
830247a4 1708 r->nparens = RExC_npar - 1; /* set early to validate backrefs */
4327152a
IZ
1709
1710 r->substrs = 0; /* Useful during FAIL. */
1711 r->startp = 0; /* Useful during FAIL. */
1712 r->endp = 0; /* Useful during FAIL. */
1713
fac92740
MJD
1714 Newz(1304, r->offsets, 2*RExC_size+1, U32); /* MJD 20001228 */
1715 if (r->offsets) {
1716 r->offsets[0] = RExC_size;
1717 }
1718 DEBUG_r(PerlIO_printf(Perl_debug_log,
392fbf5d 1719 "%s %"UVuf" bytes for offset annotations.\n",
fac92740 1720 r->offsets ? "Got" : "Couldn't get",
392fbf5d 1721 (UV)((2*RExC_size+1) * sizeof(U32))));
fac92740 1722
830247a4 1723 RExC_rx = r;
bbce6d69 1724
1725 /* Second pass: emit code. */
830247a4
IZ
1726 RExC_parse = exp;
1727 RExC_end = xend;
1728 RExC_naughty = 0;
1729 RExC_npar = 1;
fac92740 1730 RExC_emit_start = r->program;
830247a4 1731 RExC_emit = r->program;
2cd61cdb 1732 /* Store the count of eval-groups for security checks: */
830247a4
IZ
1733 RExC_emit->next_off = ((RExC_seen_evals > U16_MAX) ? U16_MAX : RExC_seen_evals);
1734 REGC((U8)REG_MAGIC, (char*) RExC_emit++);
c277df42 1735 r->data = 0;
830247a4 1736 if (reg(pRExC_state, 0, &flags) == NULL)
a0d0e21e
LW
1737 return(NULL);
1738
1739 /* Dig out information for optimizations. */
cf93c79d 1740 r->reganch = pm->op_pmflags & PMf_COMPILETIME; /* Again? */
830247a4 1741 pm->op_pmflags = RExC_flags16;
a0ed51b3
LW
1742 if (UTF)
1743 r->reganch |= ROPT_UTF8;
c277df42 1744 r->regstclass = NULL;
830247a4 1745 if (RExC_naughty >= 10) /* Probably an expensive pattern. */
a0ed51b3 1746 r->reganch |= ROPT_NAUGHTY;
c277df42 1747 scan = r->program + 1; /* First BRANCH. */
2779dcf1
IZ
1748
1749 /* XXXX To minimize changes to RE engine we always allocate
1750 3-units-long substrs field. */
1751 Newz(1004, r->substrs, 1, struct reg_substr_data);
1752
2c2d71f5 1753 StructCopy(&zero_scan_data, &data, scan_data_t);
653099ff 1754 /* XXXX Should not we check for something else? Usually it is OPEN1... */
c277df42 1755 if (OP(scan) != BRANCH) { /* Only one top-level choice. */
c277df42 1756 I32 fake;
c5254dd6 1757 STRLEN longest_float_length, longest_fixed_length;
653099ff
GS
1758 struct regnode_charclass_class ch_class;
1759 int stclass_flag;
cb434fcc 1760 I32 last_close = 0;
a0d0e21e
LW
1761
1762 first = scan;
c277df42 1763 /* Skip introductions and multiplicators >= 1. */
a0d0e21e 1764 while ((OP(first) == OPEN && (sawopen = 1)) ||
653099ff 1765 /* An OR of *one* alternative - should not happen now. */
a0d0e21e
LW
1766 (OP(first) == BRANCH && OP(regnext(first)) != BRANCH) ||
1767 (OP(first) == PLUS) ||
1768 (OP(first) == MINMOD) ||
653099ff 1769 /* An {n,m} with n>0 */
22c35a8c 1770 (PL_regkind[(U8)OP(first)] == CURLY && ARG1(first) > 0) ) {
a0d0e21e
LW
1771 if (OP(first) == PLUS)
1772 sawplus = 1;
1773 else
1774 first += regarglen[(U8)OP(first)];
1775 first = NEXTOPER(first);
a687059c
LW
1776 }
1777
a0d0e21e
LW
1778 /* Starting-point info. */
1779 again:
653099ff 1780 if (PL_regkind[(U8)OP(first)] == EXACT) {
1aa99e6b
IH
1781 if (OP(first) == EXACT)
1782 ; /* Empty, get anchored substr later. */
1783 else if ((OP(first) == EXACTF || OP(first) == EXACTFL))
b3c9acc1
IZ
1784 r->regstclass = first;
1785 }
653099ff 1786 else if (strchr((char*)PL_simple,OP(first)))
a0d0e21e 1787 r->regstclass = first;
22c35a8c
GS
1788 else if (PL_regkind[(U8)OP(first)] == BOUND ||
1789 PL_regkind[(U8)OP(first)] == NBOUND)
a0d0e21e 1790 r->regstclass = first;
22c35a8c 1791 else if (PL_regkind[(U8)OP(first)] == BOL) {
cad2e5aa
JH
1792 r->reganch |= (OP(first) == MBOL
1793 ? ROPT_ANCH_MBOL
1794 : (OP(first) == SBOL
1795 ? ROPT_ANCH_SBOL
1796 : ROPT_ANCH_BOL));
a0d0e21e 1797 first = NEXTOPER(first);
774d564b 1798 goto again;
1799 }
1800 else if (OP(first) == GPOS) {
1801 r->reganch |= ROPT_ANCH_GPOS;
1802 first = NEXTOPER(first);
1803 goto again;
a0d0e21e
LW
1804 }
1805 else if ((OP(first) == STAR &&
22c35a8c 1806 PL_regkind[(U8)OP(NEXTOPER(first))] == REG_ANY) &&
a0d0e21e
LW
1807 !(r->reganch & ROPT_ANCH) )
1808 {
1809 /* turn .* into ^.* with an implied $*=1 */
cad2e5aa
JH
1810 int type = OP(NEXTOPER(first));
1811
ffc61ed2 1812 if (type == REG_ANY)
cad2e5aa
JH
1813 type = ROPT_ANCH_MBOL;
1814 else
1815 type = ROPT_ANCH_SBOL;
1816
1817 r->reganch |= type | ROPT_IMPLICIT;
a0d0e21e 1818 first = NEXTOPER(first);
774d564b 1819 goto again;
a0d0e21e 1820 }
b81d288d 1821 if (sawplus && (!sawopen || !RExC_sawback)
830247a4 1822 && !(RExC_seen & REG_SEEN_EVAL)) /* May examine pos and $& */
cad2e5aa
JH
1823 /* x+ must match at the 1st pos of run of x's */
1824 r->reganch |= ROPT_SKIP;
a0d0e21e 1825
c277df42 1826 /* Scan is after the zeroth branch, first is atomic matcher. */
b81d288d 1827 DEBUG_r(PerlIO_printf(Perl_debug_log, "first at %"IVdf"\n",
b900a521 1828 (IV)(first - scan + 1)));
a0d0e21e
LW
1829 /*
1830 * If there's something expensive in the r.e., find the
1831 * longest literal string that must appear and make it the
1832 * regmust. Resolve ties in favor of later strings, since
1833 * the regstart check works with the beginning of the r.e.
1834 * and avoiding duplication strengthens checking. Not a
1835 * strong reason, but sufficient in the absence of others.
1836 * [Now we resolve ties in favor of the earlier string if
c277df42 1837 * it happens that c_offset_min has been invalidated, since the
a0d0e21e
LW
1838 * earlier string may buy us something the later one won't.]
1839 */
a0d0e21e 1840 minlen = 0;
a687059c 1841
79cb57f6
GS
1842 data.longest_fixed = newSVpvn("",0);
1843 data.longest_float = newSVpvn("",0);
1844 data.last_found = newSVpvn("",0);
c277df42
IZ
1845 data.longest = &(data.longest_fixed);
1846 first = scan;
653099ff 1847 if (!r->regstclass) {
830247a4 1848 cl_init(pRExC_state, &ch_class);
653099ff
GS
1849 data.start_class = &ch_class;
1850 stclass_flag = SCF_DO_STCLASS_AND;
1851 } else /* XXXX Check for BOUND? */
1852 stclass_flag = 0;
cb434fcc 1853 data.last_closep = &last_close;
653099ff 1854
830247a4 1855 minlen = study_chunk(pRExC_state, &first, &fake, scan + RExC_size, /* Up to end */
e1901655 1856 &data, SCF_DO_SUBSTR | SCF_WHILEM_VISITED_POS | stclass_flag);
830247a4 1857 if ( RExC_npar == 1 && data.longest == &(data.longest_fixed)
b81d288d 1858 && data.last_start_min == 0 && data.last_end > 0
830247a4
IZ
1859 && !RExC_seen_zerolen
1860 && (!(RExC_seen & REG_SEEN_GPOS) || (r->reganch & ROPT_ANCH_GPOS)))
c277df42 1861 r->reganch |= ROPT_CHECK_ALL;
830247a4 1862 scan_commit(pRExC_state, &data);
c277df42
IZ
1863 SvREFCNT_dec(data.last_found);
1864
a0ed51b3 1865 longest_float_length = CHR_SVLEN(data.longest_float);
c5254dd6 1866 if (longest_float_length
c277df42
IZ
1867 || (data.flags & SF_FL_BEFORE_EOL
1868 && (!(data.flags & SF_FL_BEFORE_MEOL)
830247a4 1869 || (RExC_flags16 & PMf_MULTILINE)))) {
cf93c79d
IZ
1870 int t;
1871
a0ed51b3 1872 if (SvCUR(data.longest_fixed) /* ok to leave SvCUR */
aca2d497
IZ
1873 && data.offset_fixed == data.offset_float_min
1874 && SvCUR(data.longest_fixed) == SvCUR(data.longest_float))
1875 goto remove_float; /* As in (a)+. */
1876
c277df42
IZ
1877 r->float_substr = data.longest_float;
1878 r->float_min_offset = data.offset_float_min;
1879 r->float_max_offset = data.offset_float_max;
cf93c79d
IZ
1880 t = (data.flags & SF_FL_BEFORE_EOL /* Can't have SEOL and MULTI */
1881 && (!(data.flags & SF_FL_BEFORE_MEOL)
830247a4 1882 || (RExC_flags16 & PMf_MULTILINE)));
cf93c79d 1883 fbm_compile(r->float_substr, t ? FBMcf_TAIL : 0);
a0ed51b3
LW
1884 }
1885 else {
aca2d497 1886 remove_float:
c277df42
IZ
1887 r->float_substr = Nullsv;
1888 SvREFCNT_dec(data.longest_float);
c5254dd6 1889 longest_float_length = 0;
a0d0e21e 1890 }
c277df42 1891
a0ed51b3 1892 longest_fixed_length = CHR_SVLEN(data.longest_fixed);
c5254dd6 1893 if (longest_fixed_length
c277df42
IZ
1894 || (data.flags & SF_FIX_BEFORE_EOL /* Cannot have SEOL and MULTI */
1895 && (!(data.flags & SF_FIX_BEFORE_MEOL)
830247a4 1896 || (RExC_flags16 & PMf_MULTILINE)))) {
cf93c79d
IZ
1897 int t;
1898
c277df42
IZ
1899 r->anchored_substr = data.longest_fixed;
1900 r->anchored_offset = data.offset_fixed;
cf93c79d
IZ
1901 t = (data.flags & SF_FIX_BEFORE_EOL /* Can't have SEOL and MULTI */
1902 && (!(data.flags & SF_FIX_BEFORE_MEOL)
830247a4 1903 || (RExC_flags16 & PMf_MULTILINE)));
cf93c79d 1904 fbm_compile(r->anchored_substr, t ? FBMcf_TAIL : 0);
a0ed51b3
LW
1905 }
1906 else {
c277df42
IZ
1907 r->anchored_substr = Nullsv;
1908 SvREFCNT_dec(data.longest_fixed);
c5254dd6 1909 longest_fixed_length = 0;
a0d0e21e 1910 }
b81d288d 1911 if (r->regstclass
ffc61ed2 1912 && (OP(r->regstclass) == REG_ANY || OP(r->regstclass) == SANY))
653099ff
GS
1913 r->regstclass = NULL;
1914 if ((!r->anchored_substr || r->anchored_offset) && stclass_flag
1915 && !(data.start_class->flags & ANYOF_EOS)
1916 && !cl_is_anything(data.start_class)) {
830247a4 1917 I32 n = add_data(pRExC_state, 1, "f");
653099ff 1918
b81d288d 1919 New(1006, RExC_rx->data->data[n], 1,
653099ff
GS
1920 struct regnode_charclass_class);
1921 StructCopy(data.start_class,
830247a4 1922 (struct regnode_charclass_class*)RExC_rx->data->data[n],
653099ff 1923 struct regnode_charclass_class);
830247a4 1924 r->regstclass = (regnode*)RExC_rx->data->data[n];
653099ff 1925 r->reganch &= ~ROPT_SKIP; /* Used in find_byclass(). */
ffc61ed2 1926 PL_regdata = r->data; /* for regprop() */
9c5ffd7c
JH
1927 DEBUG_r({ SV *sv = sv_newmortal();
1928 regprop(sv, (regnode*)data.start_class);
1929 PerlIO_printf(Perl_debug_log,
1930 "synthetic stclass `%s'.\n",
1931 SvPVX(sv));});
653099ff 1932 }
c277df42
IZ
1933
1934 /* A temporary algorithm prefers floated substr to fixed one to dig more info. */
c5254dd6 1935 if (longest_fixed_length > longest_float_length) {
c277df42
IZ
1936 r->check_substr = r->anchored_substr;
1937 r->check_offset_min = r->check_offset_max = r->anchored_offset;
1938 if (r->reganch & ROPT_ANCH_SINGLE)
1939 r->reganch |= ROPT_NOSCAN;
a0ed51b3
LW
1940 }
1941 else {
c277df42
IZ
1942 r->check_substr = r->float_substr;
1943 r->check_offset_min = data.offset_float_min;
1944 r->check_offset_max = data.offset_float_max;
a0d0e21e 1945 }
30382c73
IZ
1946 /* XXXX Currently intuiting is not compatible with ANCH_GPOS.
1947 This should be changed ASAP! */
1948 if (r->check_substr && !(r->reganch & ROPT_ANCH_GPOS)) {
cad2e5aa
JH
1949 r->reganch |= RE_USE_INTUIT;
1950 if (SvTAIL(r->check_substr))
1951 r->reganch |= RE_INTUIT_TAIL;
1952 }
a0ed51b3
LW
1953 }
1954 else {
c277df42
IZ
1955 /* Several toplevels. Best we can is to set minlen. */
1956 I32 fake;
653099ff 1957 struct regnode_charclass_class ch_class;
cb434fcc 1958 I32 last_close = 0;
c277df42
IZ
1959
1960 DEBUG_r(PerlIO_printf(Perl_debug_log, "\n"));
1961 scan = r->program + 1;
830247a4 1962 cl_init(pRExC_state, &ch_class);
653099ff 1963 data.start_class = &ch_class;
cb434fcc 1964 data.last_closep = &last_close;
e1901655 1965 minlen = study_chunk(pRExC_state, &scan, &fake, scan + RExC_size, &data, SCF_DO_STCLASS_AND|SCF_WHILEM_VISITED_POS);
c277df42 1966 r->check_substr = r->anchored_substr = r->float_substr = Nullsv;
653099ff
GS
1967 if (!(data.start_class->flags & ANYOF_EOS)
1968 && !cl_is_anything(data.start_class)) {
830247a4 1969 I32 n = add_data(pRExC_state, 1, "f");
653099ff 1970
b81d288d 1971 New(1006, RExC_rx->data->data[n], 1,
653099ff
GS
1972 struct regnode_charclass_class);
1973 StructCopy(data.start_class,
830247a4 1974 (struct regnode_charclass_class*)RExC_rx->data->data[n],
653099ff 1975 struct regnode_charclass_class);
830247a4 1976 r->regstclass = (regnode*)RExC_rx->data->data[n];
653099ff 1977 r->reganch &= ~ROPT_SKIP; /* Used in find_byclass(). */
9c5ffd7c
JH
1978 DEBUG_r({ SV* sv = sv_newmortal();
1979 regprop(sv, (regnode*)data.start_class);
1980 PerlIO_printf(Perl_debug_log,
1981 "synthetic stclass `%s'.\n",
1982 SvPVX(sv));});
653099ff 1983 }
a0d0e21e
LW
1984 }
1985
a0d0e21e 1986 r->minlen = minlen;
b81d288d 1987 if (RExC_seen & REG_SEEN_GPOS)
c277df42 1988 r->reganch |= ROPT_GPOS_SEEN;
830247a4 1989 if (RExC_seen & REG_SEEN_LOOKBEHIND)
c277df42 1990 r->reganch |= ROPT_LOOKBEHIND_SEEN;
830247a4 1991 if (RExC_seen & REG_SEEN_EVAL)
ce862d02 1992 r->reganch |= ROPT_EVAL_SEEN;
3baa4c62
JH
1993 if (RExC_seen & REG_SEEN_SANY)
1994 r->reganch |= ROPT_SANY_SEEN;
830247a4
IZ
1995 Newz(1002, r->startp, RExC_npar, I32);
1996 Newz(1002, r->endp, RExC_npar, I32);
ffc61ed2 1997 PL_regdata = r->data; /* for regprop() */
a0d0e21e
LW
1998 DEBUG_r(regdump(r));
1999 return(r);
a687059c
LW
2000}
2001
2002/*
2003 - reg - regular expression, i.e. main body or parenthesized thing
2004 *
2005 * Caller must absorb opening parenthesis.
2006 *
2007 * Combining parenthesis handling with the base level of regular expression
2008 * is a trifle forced, but the need to tie the tails of the branches to what
2009 * follows makes it hard to avoid.
2010 */
76e3520e 2011STATIC regnode *
830247a4 2012S_reg(pTHX_ RExC_state_t *pRExC_state, I32 paren, I32 *flagp)
c277df42 2013 /* paren: Parenthesized? 0=top, 1=(, inside: changed to letter. */
a687059c 2014{
c277df42
IZ
2015 register regnode *ret; /* Will be the head of the group. */
2016 register regnode *br;
2017 register regnode *lastbr;
2018 register regnode *ender = 0;
a0d0e21e 2019 register I32 parno = 0;
830247a4 2020 I32 flags, oregflags = RExC_flags16, have_branch = 0, open = 0;
fac92740 2021 char * parse_start = RExC_parse; /* MJD */
830247a4 2022 char *oregcomp_parse = RExC_parse;
c277df42 2023 char c;
a0d0e21e 2024
821b33a5 2025 *flagp = 0; /* Tentatively. */
a0d0e21e
LW
2026
2027 /* Make an OPEN node, if parenthesized. */
2028 if (paren) {
fac92740 2029 if (*RExC_parse == '?') { /* (?...) */
ca9dfc88
IZ
2030 U16 posflags = 0, negflags = 0;
2031 U16 *flagsp = &posflags;
0f5d15d6 2032 int logical = 0;
830247a4 2033 char *seqstart = RExC_parse;
ca9dfc88 2034
830247a4
IZ
2035 RExC_parse++;
2036 paren = *RExC_parse++;
c277df42 2037 ret = NULL; /* For look-ahead/behind. */
a0d0e21e 2038 switch (paren) {
fac92740 2039 case '<': /* (?<...) */
830247a4 2040 RExC_seen |= REG_SEEN_LOOKBEHIND;
b81d288d 2041 if (*RExC_parse == '!')
c277df42 2042 paren = ',';
b81d288d 2043 if (*RExC_parse != '=' && *RExC_parse != '!')
c277df42 2044 goto unknown;
830247a4 2045 RExC_parse++;
fac92740
MJD
2046 case '=': /* (?=...) */
2047 case '!': /* (?!...) */
830247a4 2048 RExC_seen_zerolen++;
fac92740
MJD
2049 case ':': /* (?:...) */
2050 case '>': /* (?>...) */
a0d0e21e 2051 break;
fac92740
MJD
2052 case '$': /* (?$...) */
2053 case '@': /* (?@...) */
8615cb43 2054 vFAIL2("Sequence (?%c...) not implemented", (int)paren);
a0d0e21e 2055 break;
fac92740 2056 case '#': /* (?#...) */
830247a4
IZ
2057 while (*RExC_parse && *RExC_parse != ')')
2058 RExC_parse++;
2059 if (*RExC_parse != ')')
c277df42 2060 FAIL("Sequence (?#... not terminated");
830247a4 2061 nextchar(pRExC_state);
a0d0e21e
LW
2062 *flagp = TRYAGAIN;
2063 return NULL;
fac92740 2064 case 'p': /* (?p...) */
d7e9e385 2065 if (SIZE_ONLY)
830247a4 2066 vWARN(RExC_parse, "(?p{}) is deprecated - use (??{})");
8c8ad484 2067 /* FALL THROUGH*/
fac92740 2068 case '?': /* (??...) */
0f5d15d6 2069 logical = 1;
830247a4 2070 paren = *RExC_parse++;
0f5d15d6 2071 /* FALL THROUGH */
fac92740 2072 case '{': /* (?{...}) */
c277df42 2073 {
c277df42
IZ
2074 I32 count = 1, n = 0;
2075 char c;
830247a4 2076 char *s = RExC_parse;
c277df42
IZ
2077 SV *sv;
2078 OP_4tree *sop, *rop;
2079
830247a4
IZ
2080 RExC_seen_zerolen++;
2081 RExC_seen |= REG_SEEN_EVAL;
2082 while (count && (c = *RExC_parse)) {
2083 if (c == '\\' && RExC_parse[1])
2084 RExC_parse++;
b81d288d 2085 else if (c == '{')
c277df42 2086 count++;
b81d288d 2087 else if (c == '}')
c277df42 2088 count--;
830247a4 2089 RExC_parse++;
c277df42 2090 }
830247a4 2091 if (*RExC_parse != ')')
b45f050a 2092 {
b81d288d 2093 RExC_parse = s;
b45f050a
JF
2094 vFAIL("Sequence (?{...}) not terminated or not {}-balanced");
2095 }
c277df42
IZ
2096 if (!SIZE_ONLY) {
2097 AV *av;
b81d288d
AB
2098
2099 if (RExC_parse - 1 - s)
830247a4 2100 sv = newSVpvn(s, RExC_parse - 1 - s);
c277df42 2101 else
79cb57f6 2102 sv = newSVpvn("", 0);
c277df42 2103
569233ed
SB
2104 ENTER;
2105 Perl_save_re_context(aTHX);
c277df42 2106 rop = sv_compile_2op(sv, &sop, "re", &av);
569233ed 2107 LEAVE;
c277df42 2108
830247a4
IZ
2109 n = add_data(pRExC_state, 3, "nop");
2110 RExC_rx->data->data[n] = (void*)rop;
2111 RExC_rx->data->data[n+1] = (void*)sop;
2112 RExC_rx->data->data[n+2] = (void*)av;
c277df42 2113 SvREFCNT_dec(sv);
a0ed51b3 2114 }
e24b16f9 2115 else { /* First pass */
830247a4 2116 if (PL_reginterp_cnt < ++RExC_seen_evals
e24b16f9 2117 && PL_curcop != &PL_compiling)
2cd61cdb
IZ
2118 /* No compiled RE interpolated, has runtime
2119 components ===> unsafe. */
2120 FAIL("Eval-group not allowed at runtime, use re 'eval'");
5b61d3f7 2121 if (PL_tainting && PL_tainted)
cc6b7395 2122 FAIL("Eval-group in insecure regular expression");
c277df42
IZ
2123 }
2124
830247a4 2125 nextchar(pRExC_state);
0f5d15d6 2126 if (logical) {
830247a4 2127 ret = reg_node(pRExC_state, LOGICAL);
0f5d15d6
IZ
2128 if (!SIZE_ONLY)
2129 ret->flags = 2;
830247a4 2130 regtail(pRExC_state, ret, reganode(pRExC_state, EVAL, n));
fac92740 2131 /* deal with the length of this later - MJD */
0f5d15d6
IZ
2132 return ret;
2133 }
830247a4 2134 return reganode(pRExC_state, EVAL, n);
c277df42 2135 }
fac92740 2136 case '(': /* (?(?{...})...) and (?(?=...)...) */
c277df42 2137 {
fac92740 2138 if (RExC_parse[0] == '?') { /* (?(?...)) */
b81d288d
AB
2139 if (RExC_parse[1] == '=' || RExC_parse[1] == '!'
2140 || RExC_parse[1] == '<'
830247a4 2141 || RExC_parse[1] == '{') { /* Lookahead or eval. */
c277df42
IZ
2142 I32 flag;
2143
830247a4 2144 ret = reg_node(pRExC_state, LOGICAL);
0f5d15d6
IZ
2145 if (!SIZE_ONLY)
2146 ret->flags = 1;
830247a4 2147 regtail(pRExC_state, ret, reg(pRExC_state, 1, &flag));
c277df42 2148 goto insert_if;
b81d288d 2149 }
a0ed51b3 2150 }
830247a4 2151 else if (RExC_parse[0] >= '1' && RExC_parse[0] <= '9' ) {
fac92740 2152 /* (?(1)...) */
830247a4 2153 parno = atoi(RExC_parse++);
c277df42 2154
830247a4
IZ
2155 while (isDIGIT(*RExC_parse))
2156 RExC_parse++;
fac92740
MJD
2157 ret = reganode(pRExC_state, GROUPP, parno);
2158
830247a4 2159 if ((c = *nextchar(pRExC_state)) != ')')
b45f050a 2160 vFAIL("Switch condition not recognized");
c277df42 2161 insert_if:
830247a4
IZ
2162 regtail(pRExC_state, ret, reganode(pRExC_state, IFTHEN, 0));
2163 br = regbranch(pRExC_state, &flags, 1);
c277df42 2164 if (br == NULL)
830247a4 2165 br = reganode(pRExC_state, LONGJMP, 0);
c277df42 2166 else
830247a4
IZ
2167 regtail(pRExC_state, br, reganode(pRExC_state, LONGJMP, 0));
2168 c = *nextchar(pRExC_state);
d1b80229
IZ
2169 if (flags&HASWIDTH)
2170 *flagp |= HASWIDTH;
c277df42 2171 if (c == '|') {
830247a4
IZ
2172 lastbr = reganode(pRExC_state, IFTHEN, 0); /* Fake one for optimizer. */
2173 regbranch(pRExC_state, &flags, 1);
2174 regtail(pRExC_state, ret, lastbr);
d1b80229
IZ
2175 if (flags&HASWIDTH)
2176 *flagp |= HASWIDTH;
830247a4 2177 c = *nextchar(pRExC_state);
a0ed51b3
LW
2178 }
2179 else
c277df42
IZ
2180 lastbr = NULL;
2181 if (c != ')')
8615cb43 2182 vFAIL("Switch (?(condition)... contains too many branches");
830247a4
IZ
2183 ender = reg_node(pRExC_state, TAIL);
2184 regtail(pRExC_state, br, ender);
c277df42 2185 if (lastbr) {
830247a4
IZ
2186 regtail(pRExC_state, lastbr, ender);
2187 regtail(pRExC_state, NEXTOPER(NEXTOPER(lastbr)), ender);
a0ed51b3
LW
2188 }
2189 else
830247a4 2190 regtail(pRExC_state, ret, ender);
c277df42 2191 return ret;
a0ed51b3
LW
2192 }
2193 else {
830247a4 2194 vFAIL2("Unknown switch condition (?(%.2s", RExC_parse);
c277df42
IZ
2195 }
2196 }
1b1626e4 2197 case 0:
830247a4 2198 RExC_parse--; /* for vFAIL to print correctly */
8615cb43 2199 vFAIL("Sequence (? incomplete");
1b1626e4 2200 break;
a0d0e21e 2201 default:
830247a4 2202 --RExC_parse;
fac92740 2203 parse_flags: /* (?i) */
830247a4
IZ
2204 while (*RExC_parse && strchr("iogcmsx", *RExC_parse)) {
2205 if (*RExC_parse != 'o')
2206 pmflag(flagsp, *RExC_parse);
2207 ++RExC_parse;
ca9dfc88 2208 }
830247a4 2209 if (*RExC_parse == '-') {
ca9dfc88 2210 flagsp = &negflags;
830247a4 2211 ++RExC_parse;
ca9dfc88 2212 goto parse_flags;
48c036b1 2213 }
830247a4
IZ
2214 RExC_flags16 |= posflags;
2215 RExC_flags16 &= ~negflags;
2216 if (*RExC_parse == ':') {
2217 RExC_parse++;
ca9dfc88
IZ
2218 paren = ':';
2219 break;
2220 }
c277df42 2221 unknown:
830247a4
IZ
2222 if (*RExC_parse != ')') {
2223 RExC_parse++;
2224 vFAIL3("Sequence (%.*s...) not recognized", RExC_parse-seqstart, seqstart);
b45f050a 2225 }
830247a4 2226 nextchar(pRExC_state);
a0d0e21e
LW
2227 *flagp = TRYAGAIN;
2228 return NULL;
2229 }
2230 }
fac92740 2231 else { /* (...) */
830247a4
IZ
2232 parno = RExC_npar;
2233 RExC_npar++;
2234 ret = reganode(pRExC_state, OPEN, parno);
fac92740
MJD
2235 Set_Node_Length(ret, 1); /* MJD */
2236 Set_Node_Offset(ret, RExC_parse); /* MJD */
c277df42 2237 open = 1;
a0d0e21e 2238 }
a0ed51b3 2239 }
fac92740 2240 else /* ! paren */
a0d0e21e
LW
2241 ret = NULL;
2242
2243 /* Pick up the branches, linking them together. */
fac92740 2244 parse_start = RExC_parse; /* MJD */
830247a4 2245 br = regbranch(pRExC_state, &flags, 1);
fac92740
MJD
2246 /* branch_len = (paren != 0); */
2247
a0d0e21e
LW
2248 if (br == NULL)
2249 return(NULL);
830247a4
IZ
2250 if (*RExC_parse == '|') {
2251 if (!SIZE_ONLY && RExC_extralen) {
2252 reginsert(pRExC_state, BRANCHJ, br);
a0ed51b3 2253 }
fac92740 2254 else { /* MJD */
830247a4 2255 reginsert(pRExC_state, BRANCH, br);
fac92740
MJD
2256 Set_Node_Length(br, paren != 0);
2257 Set_Node_Offset_To_R(br-RExC_emit_start, parse_start-RExC_start);
2258 }
c277df42
IZ
2259 have_branch = 1;
2260 if (SIZE_ONLY)
830247a4 2261 RExC_extralen += 1; /* For BRANCHJ-BRANCH. */
a0ed51b3
LW
2262 }
2263 else if (paren == ':') {
c277df42
IZ
2264 *flagp |= flags&SIMPLE;
2265 }
2266 if (open) { /* Starts with OPEN. */
830247a4 2267 regtail(pRExC_state, ret, br); /* OPEN -> first. */
a0ed51b3
LW
2268 }
2269 else if (paren != '?') /* Not Conditional */
a0d0e21e 2270 ret = br;
821b33a5
IZ
2271 if (flags&HASWIDTH)
2272 *flagp |= HASWIDTH;
a0d0e21e 2273 *flagp |= flags&SPSTART;
c277df42 2274 lastbr = br;
830247a4
IZ
2275 while (*RExC_parse == '|') {
2276 if (!SIZE_ONLY && RExC_extralen) {
2277 ender = reganode(pRExC_state, LONGJMP,0);
2278 regtail(pRExC_state, NEXTOPER(NEXTOPER(lastbr)), ender); /* Append to the previous. */
c277df42
IZ
2279 }
2280 if (SIZE_ONLY)
830247a4
IZ
2281 RExC_extralen += 2; /* Account for LONGJMP. */
2282 nextchar(pRExC_state);
2283 br = regbranch(pRExC_state, &flags, 0);
fac92740 2284
a687059c 2285 if (br == NULL)
a0d0e21e 2286 return(NULL);
830247a4 2287 regtail(pRExC_state, lastbr, br); /* BRANCH -> BRANCH. */
c277df42 2288 lastbr = br;
821b33a5
IZ
2289 if (flags&HASWIDTH)
2290 *flagp |= HASWIDTH;
a687059c 2291 *flagp |= flags&SPSTART;
a0d0e21e
LW
2292 }
2293
c277df42
IZ
2294 if (have_branch || paren != ':') {
2295 /* Make a closing node, and hook it on the end. */
2296 switch (paren) {
2297 case ':':
830247a4 2298 ender = reg_node(pRExC_state, TAIL);
c277df42
IZ
2299 break;
2300 case 1:
830247a4 2301 ender = reganode(pRExC_state, CLOSE, parno);
fac92740
MJD
2302 Set_Node_Offset(ender,RExC_parse+1); /* MJD */
2303 Set_Node_Length(ender,1); /* MJD */
c277df42
IZ
2304 break;
2305 case '<':
c277df42
IZ
2306 case ',':
2307 case '=':
2308 case '!':
c277df42 2309 *flagp &= ~HASWIDTH;
821b33a5
IZ
2310 /* FALL THROUGH */
2311 case '>':
830247a4 2312 ender = reg_node(pRExC_state, SUCCEED);
c277df42
IZ
2313 break;
2314 case 0:
830247a4 2315 ender = reg_node(pRExC_state, END);
c277df42
IZ
2316 break;
2317 }
830247a4 2318 regtail(pRExC_state, lastbr, ender);
a0d0e21e 2319
c277df42
IZ
2320 if (have_branch) {
2321 /* Hook the tails of the branches to the closing node. */
2322 for (br = ret; br != NULL; br = regnext(br)) {
830247a4 2323 regoptail(pRExC_state, br, ender);
c277df42
IZ
2324 }
2325 }
a0d0e21e 2326 }
c277df42
IZ
2327
2328 {
2329 char *p;
2330 static char parens[] = "=!<,>";
2331
2332 if (paren && (p = strchr(parens, paren))) {
2333 int node = ((p - parens) % 2) ? UNLESSM : IFMATCH;
2334 int flag = (p - parens) > 1;
2335
2336 if (paren == '>')
2337 node = SUSPEND, flag = 0;
830247a4 2338 reginsert(pRExC_state, node,ret);
c277df42 2339 ret->flags = flag;
830247a4 2340 regtail(pRExC_state, ret, reg_node(pRExC_state, TAIL));
c277df42 2341 }
a0d0e21e
LW
2342 }
2343
2344 /* Check for proper termination. */
ce3e6498 2345 if (paren) {
830247a4
IZ
2346 RExC_flags16 = oregflags;
2347 if (RExC_parse >= RExC_end || *nextchar(pRExC_state) != ')') {
2348 RExC_parse = oregcomp_parse;
380a0633 2349 vFAIL("Unmatched (");
ce3e6498 2350 }
a0ed51b3 2351 }
830247a4
IZ
2352 else if (!paren && RExC_parse < RExC_end) {
2353 if (*RExC_parse == ')') {
2354 RExC_parse++;
380a0633 2355 vFAIL("Unmatched )");
a0ed51b3
LW
2356 }
2357 else
b45f050a 2358 FAIL("Junk on end of regexp"); /* "Can't happen". */
a0d0e21e
LW
2359 /* NOTREACHED */
2360 }
a687059c 2361
a0d0e21e 2362 return(ret);
a687059c
LW
2363}
2364
2365/*
2366 - regbranch - one alternative of an | operator
2367 *
2368 * Implements the concatenation operator.
2369 */
76e3520e 2370STATIC regnode *
830247a4 2371S_regbranch(pTHX_ RExC_state_t *pRExC_state, I32 *flagp, I32 first)
a687059c 2372{
c277df42
IZ
2373 register regnode *ret;
2374 register regnode *chain = NULL;
2375 register regnode *latest;
2376 I32 flags = 0, c = 0;
a0d0e21e 2377
b81d288d 2378 if (first)
c277df42
IZ
2379 ret = NULL;
2380 else {
b81d288d 2381 if (!SIZE_ONLY && RExC_extralen)
830247a4 2382 ret = reganode(pRExC_state, BRANCHJ,0);
fac92740 2383 else {
830247a4 2384 ret = reg_node(pRExC_state, BRANCH);
fac92740
MJD
2385 Set_Node_Length(ret, 1);
2386 }
c277df42
IZ
2387 }
2388
b81d288d 2389 if (!first && SIZE_ONLY)
830247a4 2390 RExC_extralen += 1; /* BRANCHJ */
b81d288d 2391
c277df42 2392 *flagp = WORST; /* Tentatively. */
a0d0e21e 2393
830247a4
IZ
2394 RExC_parse--;
2395 nextchar(pRExC_state);
2396 while (RExC_parse < RExC_end && *RExC_parse != '|' && *RExC_parse != ')') {
a0d0e21e 2397 flags &= ~TRYAGAIN;
830247a4 2398 latest = regpiece(pRExC_state, &flags);
a0d0e21e
LW
2399 if (latest == NULL) {
2400 if (flags & TRYAGAIN)
2401 continue;
2402 return(NULL);
a0ed51b3
LW
2403 }
2404 else if (ret == NULL)
c277df42 2405 ret = latest;
a0d0e21e 2406 *flagp |= flags&HASWIDTH;
c277df42 2407 if (chain == NULL) /* First piece. */
a0d0e21e
LW
2408 *flagp |= flags&SPSTART;
2409 else {
830247a4
IZ
2410 RExC_naughty++;
2411 regtail(pRExC_state, chain, latest);
a687059c 2412 }
a0d0e21e 2413 chain = latest;
c277df42
IZ
2414 c++;
2415 }
2416 if (chain == NULL) { /* Loop ran zero times. */
830247a4 2417 chain = reg_node(pRExC_state, NOTHING);
c277df42
IZ
2418 if (ret == NULL)
2419 ret = chain;
2420 }
2421 if (c == 1) {
2422 *flagp |= flags&SIMPLE;
a0d0e21e 2423 }
a687059c 2424
a0d0e21e 2425 return(ret);
a687059c
LW
2426}
2427
2428/*
2429 - regpiece - something followed by possible [*+?]
2430 *
2431 * Note that the branching code sequences used for ? and the general cases
2432 * of * and + are somewhat optimized: they use the same NOTHING node as
2433 * both the endmarker for their branch list and the body of the last branch.
2434 * It might seem that this node could be dispensed with entirely, but the
2435 * endmarker role is not redundant.
2436 */
76e3520e 2437STATIC regnode *
830247a4 2438S_regpiece(pTHX_ RExC_state_t *pRExC_state, I32 *flagp)
a687059c 2439{
c277df42 2440 register regnode *ret;
a0d0e21e
LW
2441 register char op;
2442 register char *next;
2443 I32 flags;
830247a4 2444 char *origparse = RExC_parse;
a0d0e21e
LW
2445 char *maxpos;
2446 I32 min;
c277df42 2447 I32 max = REG_INFTY;
fac92740 2448 char *parse_start;
a0d0e21e 2449
830247a4 2450 ret = regatom(pRExC_state, &flags);
a0d0e21e
LW
2451 if (ret == NULL) {
2452 if (flags & TRYAGAIN)
2453 *flagp |= TRYAGAIN;
2454 return(NULL);
2455 }
2456
830247a4 2457 op = *RExC_parse;
a0d0e21e 2458
830247a4 2459 if (op == '{' && regcurly(RExC_parse)) {
fac92740 2460 parse_start = RExC_parse; /* MJD */
830247a4 2461 next = RExC_parse + 1;
a0d0e21e
LW
2462 maxpos = Nullch;
2463 while (isDIGIT(*next) || *next == ',') {
2464 if (*next == ',') {
2465 if (maxpos)
2466 break;
2467 else
2468 maxpos = next;
a687059c 2469 }
a0d0e21e
LW
2470 next++;
2471 }
2472 if (*next == '}') { /* got one */
2473 if (!maxpos)
2474 maxpos = next;
830247a4
IZ
2475 RExC_parse++;
2476 min = atoi(RExC_parse);
a0d0e21e
LW
2477 if (*maxpos == ',')
2478 maxpos++;
2479 else
830247a4 2480 maxpos = RExC_parse;
a0d0e21e
LW
2481 max = atoi(maxpos);
2482 if (!max && *maxpos != '0')
c277df42
IZ
2483 max = REG_INFTY; /* meaning "infinity" */
2484 else if (max >= REG_INFTY)
8615cb43 2485 vFAIL2("Quantifier in {,} bigger than %d", REG_INFTY - 1);
830247a4
IZ
2486 RExC_parse = next;
2487 nextchar(pRExC_state);
a0d0e21e
LW
2488
2489 do_curly:
2490 if ((flags&SIMPLE)) {
830247a4
IZ
2491 RExC_naughty += 2 + RExC_naughty / 2;
2492 reginsert(pRExC_state, CURLY, ret);
fac92740
MJD
2493 Set_Node_Offset(ret, parse_start+1); /* MJD */
2494 Set_Node_Cur_Length(ret);
a0d0e21e
LW
2495 }
2496 else {
830247a4 2497 regnode *w = reg_node(pRExC_state, WHILEM);
2c2d71f5
JH
2498
2499 w->flags = 0;
830247a4
IZ
2500 regtail(pRExC_state, ret, w);
2501 if (!SIZE_ONLY && RExC_extralen) {
2502 reginsert(pRExC_state, LONGJMP,ret);
2503 reginsert(pRExC_state, NOTHING,ret);
c277df42
IZ
2504 NEXT_OFF(ret) = 3; /* Go over LONGJMP. */
2505 }
830247a4 2506 reginsert(pRExC_state, CURLYX,ret);
fac92740
MJD
2507 /* MJD hk */
2508 Set_Node_Offset(ret, parse_start+1);
2509 Set_Node_Length(ret,
2510 op == '{' ? (RExC_parse - parse_start) : 1);
2511
830247a4 2512 if (!SIZE_ONLY && RExC_extralen)
c277df42 2513 NEXT_OFF(ret) = 3; /* Go over NOTHING to LONGJMP. */
830247a4 2514 regtail(pRExC_state, ret, reg_node(pRExC_state, NOTHING));
c277df42 2515 if (SIZE_ONLY)
830247a4
IZ
2516 RExC_whilem_seen++, RExC_extralen += 3;
2517 RExC_naughty += 4 + RExC_naughty; /* compound interest */
a0d0e21e 2518 }
c277df42 2519 ret->flags = 0;
a0d0e21e
LW
2520
2521 if (min > 0)
821b33a5
IZ
2522 *flagp = WORST;
2523 if (max > 0)
2524 *flagp |= HASWIDTH;
a0d0e21e 2525 if (max && max < min)
8615cb43 2526 vFAIL("Can't do {n,m} with n > m");
c277df42
IZ
2527 if (!SIZE_ONLY) {
2528 ARG1_SET(ret, min);
2529 ARG2_SET(ret, max);
a687059c 2530 }
a687059c 2531
a0d0e21e 2532 goto nest_check;
a687059c 2533 }
a0d0e21e 2534 }
a687059c 2535
a0d0e21e
LW
2536 if (!ISMULT1(op)) {
2537 *flagp = flags;
a687059c 2538 return(ret);
a0d0e21e 2539 }
bb20fd44 2540
c277df42 2541#if 0 /* Now runtime fix should be reliable. */
b45f050a
JF
2542
2543 /* if this is reinstated, don't forget to put this back into perldiag:
2544
2545 =item Regexp *+ operand could be empty at {#} in regex m/%s/
2546
2547 (F) The part of the regexp subject to either the * or + quantifier
2548 could match an empty string. The {#} shows in the regular
2549 expression about where the problem was discovered.
2550
2551 */
2552
bb20fd44 2553 if (!(flags&HASWIDTH) && op != '?')
b45f050a 2554 vFAIL("Regexp *+ operand could be empty");
b81d288d 2555#endif
bb20fd44 2556
fac92740 2557 parse_start = RExC_parse;
830247a4 2558 nextchar(pRExC_state);
a0d0e21e 2559
821b33a5 2560 *flagp = (op != '+') ? (WORST|SPSTART|HASWIDTH) : (WORST|HASWIDTH);
a0d0e21e
LW
2561
2562 if (op == '*' && (flags&SIMPLE)) {
830247a4 2563 reginsert(pRExC_state, STAR, ret);
c277df42 2564 ret->flags = 0;
830247a4 2565 RExC_naughty += 4;
a0d0e21e
LW
2566 }
2567 else if (op == '*') {
2568 min = 0;
2569 goto do_curly;
a0ed51b3
LW
2570 }
2571 else if (op == '+' && (flags&SIMPLE)) {
830247a4 2572 reginsert(pRExC_state, PLUS, ret);
c277df42 2573 ret->flags = 0;
830247a4 2574 RExC_naughty += 3;
a0d0e21e
LW
2575 }
2576 else if (op == '+') {
2577 min = 1;
2578 goto do_curly;
a0ed51b3
LW
2579 }
2580 else if (op == '?') {
a0d0e21e
LW
2581 min = 0; max = 1;
2582 goto do_curly;
2583 }
2584 nest_check:
e476b1b5 2585 if (ckWARN(WARN_REGEXP) && !SIZE_ONLY && !(flags&HASWIDTH) && max > REG_INFTY/3) {
830247a4 2586 vWARN3(RExC_parse,
b45f050a 2587 "%.*s matches null string many times",
830247a4 2588 RExC_parse - origparse,
b45f050a 2589 origparse);
a0d0e21e
LW
2590 }
2591
830247a4
IZ
2592 if (*RExC_parse == '?') {
2593 nextchar(pRExC_state);
2594 reginsert(pRExC_state, MINMOD, ret);
2595 regtail(pRExC_state, ret, ret + NODE_STEP_REGNODE);
a0d0e21e 2596 }
830247a4
IZ
2597 if (ISMULT2(RExC_parse)) {
2598 RExC_parse++;
b45f050a
JF
2599 vFAIL("Nested quantifiers");
2600 }
a0d0e21e
LW
2601
2602 return(ret);
a687059c
LW
2603}
2604
2605/*
2606 - regatom - the lowest level
2607 *
2608 * Optimization: gobbles an entire sequence of ordinary characters so that
2609 * it can turn them into a single node, which is smaller to store and
2610 * faster to run. Backslashed characters are exceptions, each becoming a
2611 * separate node; the code is simpler that way and it's not worth fixing.
2612 *
b45f050a 2613 * [Yes, it is worth fixing, some scripts can run twice the speed.] */
76e3520e 2614STATIC regnode *
830247a4 2615S_regatom(pTHX_ RExC_state_t *pRExC_state, I32 *flagp)
a687059c 2616{
c277df42 2617 register regnode *ret = 0;
a0d0e21e
LW
2618 I32 flags;
2619
2620 *flagp = WORST; /* Tentatively. */
2621
2622tryagain:
830247a4 2623 switch (*RExC_parse) {
a0d0e21e 2624 case '^':
830247a4
IZ
2625 RExC_seen_zerolen++;
2626 nextchar(pRExC_state);
2627 if (RExC_flags16 & PMf_MULTILINE)
2628 ret = reg_node(pRExC_state, MBOL);
2629 else if (RExC_flags16 & PMf_SINGLELINE)
2630 ret = reg_node(pRExC_state, SBOL);
a0d0e21e 2631 else
830247a4 2632 ret = reg_node(pRExC_state, BOL);
fac92740 2633 Set_Node_Length(ret, 1); /* MJD */
a0d0e21e
LW
2634 break;
2635 case '$':
830247a4 2636 nextchar(pRExC_state);
b81d288d 2637 if (*RExC_parse)
830247a4
IZ
2638 RExC_seen_zerolen++;
2639 if (RExC_flags16 & PMf_MULTILINE)
2640 ret = reg_node(pRExC_state, MEOL);
2641 else if (RExC_flags16 & PMf_SINGLELINE)
2642 ret = reg_node(pRExC_state, SEOL);
a0d0e21e 2643 else
830247a4 2644 ret = reg_node(pRExC_state, EOL);
fac92740 2645 Set_Node_Length(ret, 1); /* MJD */
a0d0e21e
LW
2646 break;
2647 case '.':
830247a4 2648 nextchar(pRExC_state);
ffc61ed2
JH
2649 if (RExC_flags16 & PMf_SINGLELINE)
2650 ret = reg_node(pRExC_state, SANY);
2651 else
2652 ret = reg_node(pRExC_state, REG_ANY);
2653 *flagp |= HASWIDTH|SIMPLE;
830247a4 2654 RExC_naughty++;
fac92740 2655 Set_Node_Length(ret, 1); /* MJD */
a0d0e21e
LW
2656 break;
2657 case '[':
b45f050a 2658 {
830247a4 2659 char *oregcomp_parse = ++RExC_parse;
ffc61ed2 2660 ret = regclass(pRExC_state);
830247a4
IZ
2661 if (*RExC_parse != ']') {
2662 RExC_parse = oregcomp_parse;
b45f050a
JF
2663 vFAIL("Unmatched [");
2664 }
830247a4 2665 nextchar(pRExC_state);
a0d0e21e 2666 *flagp |= HASWIDTH|SIMPLE;
fac92740 2667 Set_Node_Length(ret, RExC_parse - oregcomp_parse + 1); /* MJD */
a0d0e21e 2668 break;
b45f050a 2669 }
a0d0e21e 2670 case '(':
830247a4
IZ
2671 nextchar(pRExC_state);
2672 ret = reg(pRExC_state, 1, &flags);
a0d0e21e 2673 if (ret == NULL) {
bf93d4cc 2674 if (flags & TRYAGAIN) {
830247a4 2675 if (RExC_parse == RExC_end) {
bf93d4cc
GS
2676 /* Make parent create an empty node if needed. */
2677 *flagp |= TRYAGAIN;
2678 return(NULL);
2679 }
a0d0e21e 2680 goto tryagain;
bf93d4cc 2681 }
a0d0e21e
LW
2682 return(NULL);
2683 }
c277df42 2684 *flagp |= flags&(HASWIDTH|SPSTART|SIMPLE);
a0d0e21e
LW
2685 break;
2686 case '|':
2687 case ')':
2688 if (flags & TRYAGAIN) {
2689 *flagp |= TRYAGAIN;
2690 return NULL;
2691 }
b45f050a 2692 vFAIL("Internal urp");
a0d0e21e
LW
2693 /* Supposed to be caught earlier. */
2694 break;
85afd4ae 2695 case '{':
830247a4
IZ
2696 if (!regcurly(RExC_parse)) {
2697 RExC_parse++;
85afd4ae
CS
2698 goto defchar;
2699 }
2700 /* FALL THROUGH */
a0d0e21e
LW
2701 case '?':
2702 case '+':
2703 case '*':
830247a4 2704 RExC_parse++;
b45f050a 2705 vFAIL("Quantifier follows nothing");
a0d0e21e
LW
2706 break;
2707 case '\\':
830247a4 2708 switch (*++RExC_parse) {
a0d0e21e 2709 case 'A':
830247a4
IZ
2710 RExC_seen_zerolen++;
2711 ret = reg_node(pRExC_state, SBOL);
a0d0e21e 2712 *flagp |= SIMPLE;
830247a4 2713 nextchar(pRExC_state);
fac92740 2714 Set_Node_Length(ret, 2); /* MJD */
a0d0e21e
LW
2715 break;
2716 case 'G':
830247a4
IZ
2717 ret = reg_node(pRExC_state, GPOS);
2718 RExC_seen |= REG_SEEN_GPOS;
a0d0e21e 2719 *flagp |= SIMPLE;
830247a4 2720 nextchar(pRExC_state);
fac92740 2721 Set_Node_Length(ret, 2); /* MJD */
a0d0e21e
LW
2722 break;
2723 case 'Z':
830247a4 2724 ret = reg_node(pRExC_state, SEOL);
a0d0e21e 2725 *flagp |= SIMPLE;
830247a4 2726 nextchar(pRExC_state);
a0d0e21e 2727 break;
b85d18e9 2728 case 'z':
830247a4 2729 ret = reg_node(pRExC_state, EOS);
b85d18e9 2730 *flagp |= SIMPLE;
830247a4
IZ
2731 RExC_seen_zerolen++; /* Do not optimize RE away */
2732 nextchar(pRExC_state);
fac92740 2733 Set_Node_Length(ret, 2); /* MJD */
b85d18e9 2734 break;
4a2d328f 2735 case 'C':
830247a4 2736 ret = reg_node(pRExC_state, SANY);
3baa4c62 2737 RExC_seen |= REG_SEEN_SANY;
a0ed51b3 2738 *flagp |= HASWIDTH|SIMPLE;
830247a4 2739 nextchar(pRExC_state);
fac92740 2740 Set_Node_Length(ret, 2); /* MJD */
a0ed51b3
LW
2741 break;
2742 case 'X':
830247a4 2743 ret = reg_node(pRExC_state, CLUMP);
a0ed51b3 2744 *flagp |= HASWIDTH;
830247a4 2745 nextchar(pRExC_state);
fac92740 2746 Set_Node_Length(ret, 2); /* MJD */
a0ed51b3 2747 break;
a0d0e21e 2748 case 'w':
ffc61ed2 2749 ret = reg_node(pRExC_state, LOC ? ALNUML : ALNUM);
a0d0e21e 2750 *flagp |= HASWIDTH|SIMPLE;
830247a4 2751 nextchar(pRExC_state);
fac92740 2752 Set_Node_Length(ret, 2); /* MJD */
a0d0e21e
LW
2753 break;
2754 case 'W':
ffc61ed2 2755 ret = reg_node(pRExC_state, LOC ? NALNUML : NALNUM);
a0d0e21e 2756 *flagp |= HASWIDTH|SIMPLE;
830247a4 2757 nextchar(pRExC_state);
fac92740 2758 Set_Node_Length(ret, 2); /* MJD */
a0d0e21e
LW
2759 break;
2760 case 'b':
830247a4
IZ
2761 RExC_seen_zerolen++;
2762 RExC_seen |= REG_SEEN_LOOKBEHIND;
ffc61ed2 2763 ret = reg_node(pRExC_state, LOC ? BOUNDL : BOUND);
a0d0e21e 2764 *flagp |= SIMPLE;
830247a4 2765 nextchar(pRExC_state);
fac92740 2766 Set_Node_Length(ret, 2); /* MJD */
a0d0e21e
LW
2767 break;
2768 case 'B':
830247a4
IZ
2769 RExC_seen_zerolen++;
2770 RExC_seen |= REG_SEEN_LOOKBEHIND;
ffc61ed2 2771 ret = reg_node(pRExC_state, LOC ? NBOUNDL : NBOUND);
a0d0e21e 2772 *flagp |= SIMPLE;
830247a4 2773 nextchar(pRExC_state);
fac92740 2774 Set_Node_Length(ret, 2); /* MJD */
a0d0e21e
LW
2775 break;
2776 case 's':
ffc61ed2 2777 ret = reg_node(pRExC_state, LOC ? SPACEL : SPACE);
a0d0e21e 2778 *flagp |= HASWIDTH|SIMPLE;
830247a4 2779 nextchar(pRExC_state);
fac92740 2780 Set_Node_Length(ret, 2); /* MJD */
a0d0e21e
LW
2781 break;
2782 case 'S':
ffc61ed2 2783 ret = reg_node(pRExC_state, LOC ? NSPACEL : NSPACE);
a0d0e21e 2784 *flagp |= HASWIDTH|SIMPLE;
830247a4 2785 nextchar(pRExC_state);
fac92740 2786 Set_Node_Length(ret, 2); /* MJD */
a0d0e21e
LW
2787 break;
2788 case 'd':
ffc61ed2 2789 ret = reg_node(pRExC_state, DIGIT);
a0d0e21e 2790 *flagp |= HASWIDTH|SIMPLE;
830247a4 2791 nextchar(pRExC_state);
fac92740 2792 Set_Node_Length(ret, 2); /* MJD */
a0d0e21e
LW
2793 break;
2794 case 'D':
ffc61ed2 2795 ret = reg_node(pRExC_state, NDIGIT);
a0d0e21e 2796 *flagp |= HASWIDTH|SIMPLE;
830247a4 2797 nextchar(pRExC_state);
fac92740 2798 Set_Node_Length(ret, 2); /* MJD */
a0d0e21e 2799 break;
a14b48bc
LW
2800 case 'p':
2801 case 'P':
3568d838 2802 {
830247a4 2803 char* oldregxend = RExC_end;
fac92740 2804 char* parse_start = RExC_parse;
a14b48bc 2805
830247a4 2806 if (RExC_parse[1] == '{') {
3568d838 2807 /* a lovely hack--pretend we saw [\pX] instead */
830247a4
IZ
2808 RExC_end = strchr(RExC_parse, '}');
2809 if (!RExC_end) {
2810 RExC_parse += 2;
2811 RExC_end = oldregxend;
b45f050a
JF
2812 vFAIL("Missing right brace on \\p{}");
2813 }
830247a4 2814 RExC_end++;
a14b48bc
LW
2815 }
2816 else
830247a4
IZ
2817 RExC_end = RExC_parse + 2;
2818 RExC_parse--;
a14b48bc 2819
ffc61ed2 2820 ret = regclass(pRExC_state);
a14b48bc 2821
830247a4
IZ
2822 RExC_end = oldregxend;
2823 RExC_parse--;
fac92740 2824 Set_Node_Cur_Length(ret); /* MJD */
830247a4 2825 nextchar(pRExC_state);
a14b48bc
LW
2826 *flagp |= HASWIDTH|SIMPLE;
2827 }
2828 break;
a0d0e21e
LW
2829 case 'n':
2830 case 'r':
2831 case 't':
2832 case 'f':
2833 case 'e':
2834 case 'a':
2835 case 'x':
2836 case 'c':
2837 case '0':
2838 goto defchar;
2839 case '1': case '2': case '3': case '4':
2840 case '5': case '6': case '7': case '8': case '9':
2841 {
830247a4 2842 I32 num = atoi(RExC_parse);
a0d0e21e 2843
830247a4 2844 if (num > 9 && num >= RExC_npar)
a0d0e21e
LW
2845 goto defchar;
2846 else {
fac92740 2847 char * parse_start = RExC_parse - 1; /* MJD */
830247a4
IZ
2848 while (isDIGIT(*RExC_parse))
2849 RExC_parse++;
b45f050a 2850
830247a4 2851 if (!SIZE_ONLY && num > RExC_rx->nparens)
9baa0206 2852 vFAIL("Reference to nonexistent group");
830247a4
IZ
2853 RExC_sawback = 1;
2854 ret = reganode(pRExC_state, FOLD
a0ed51b3 2855 ? (LOC ? REFFL : REFF)
c8756f30 2856 : REF, num);
a0d0e21e 2857 *flagp |= HASWIDTH;
fac92740
MJD
2858
2859 /* override incorrect value set in reganode MJD */
2860 Set_Node_Offset(ret, parse_start+1);
2861 Set_Node_Cur_Length(ret); /* MJD */
830247a4
IZ
2862 RExC_parse--;
2863 nextchar(pRExC_state);
a0d0e21e
LW
2864 }
2865 }
2866 break;
2867 case '\0':
830247a4 2868 if (RExC_parse >= RExC_end)
b45f050a 2869 FAIL("Trailing \\");
a0d0e21e
LW
2870 /* FALL THROUGH */
2871 default:
c9f97d15
IZ
2872 /* Do not generate `unrecognized' warnings here, we fall
2873 back into the quick-grab loop below */
a0d0e21e
LW
2874 goto defchar;
2875 }
2876 break;
4633a7c4
LW
2877
2878 case '#':
830247a4
IZ
2879 if (RExC_flags16 & PMf_EXTENDED) {
2880 while (RExC_parse < RExC_end && *RExC_parse != '\n') RExC_parse++;
2881 if (RExC_parse < RExC_end)
4633a7c4
LW
2882 goto tryagain;
2883 }
2884 /* FALL THROUGH */
2885
a0d0e21e 2886 default: {
ba210ebe 2887 register STRLEN len;
a0ed51b3 2888 register UV ender;
a0d0e21e 2889 register char *p;
c277df42 2890 char *oldp, *s;
ba210ebe 2891 STRLEN numlen;
fac92740 2892 char *parse_start = RExC_parse - 1;
a0d0e21e 2893
830247a4 2894 RExC_parse++;
a0d0e21e
LW
2895
2896 defchar:
830247a4 2897 ret = reg_node(pRExC_state, FOLD
a0ed51b3 2898 ? (LOC ? EXACTFL : EXACTF)
bbce6d69 2899 : EXACT);
cd439c50 2900 s = STRING(ret);
830247a4
IZ
2901 for (len = 0, p = RExC_parse - 1;
2902 len < 127 && p < RExC_end;
a0d0e21e
LW
2903 len++)
2904 {
2905 oldp = p;
5b5a24f7 2906
830247a4
IZ
2907 if (RExC_flags16 & PMf_EXTENDED)
2908 p = regwhite(p, RExC_end);
a0d0e21e
LW
2909 switch (*p) {
2910 case '^':
2911 case '$':
2912 case '.':
2913 case '[':
2914 case '(':
2915 case ')':
2916 case '|':
2917 goto loopdone;
2918 case '\\':
2919 switch (*++p) {
2920 case 'A':
2921 case 'G':
2922 case 'Z':
b85d18e9 2923 case 'z':
a0d0e21e
LW
2924 case 'w':
2925 case 'W':
2926 case 'b':
2927 case 'B':
2928 case 's':
2929 case 'S':
2930 case 'd':
2931 case 'D':
a14b48bc
LW
2932 case 'p':
2933 case 'P':
a0d0e21e
LW
2934 --p;
2935 goto loopdone;
2936 case 'n':
2937 ender = '\n';
2938 p++;
a687059c 2939 break;
a0d0e21e
LW
2940 case 'r':
2941 ender = '\r';
2942 p++;
a687059c 2943 break;
a0d0e21e
LW
2944 case 't':
2945 ender = '\t';
2946 p++;
a687059c 2947 break;
a0d0e21e
LW
2948 case 'f':
2949 ender = '\f';
2950 p++;
a687059c 2951 break;
a0d0e21e 2952 case 'e':
c7f1f016 2953 ender = ASCII_TO_NATIVE('\033');
a0d0e21e 2954 p++;
a687059c 2955 break;
a0d0e21e 2956 case 'a':
c7f1f016 2957 ender = ASCII_TO_NATIVE('\007');
a0d0e21e 2958 p++;
a687059c 2959 break;
a0d0e21e 2960 case 'x':
a0ed51b3
LW
2961 if (*++p == '{') {
2962 char* e = strchr(p, '}');
b81d288d 2963
b45f050a 2964 if (!e) {
830247a4 2965 RExC_parse = p + 1;
b45f050a
JF
2966 vFAIL("Missing right brace on \\x{}");
2967 }
de5f0749 2968 else {
b21ed0a9 2969 numlen = 1; /* allow underscores */
de35ba6f 2970 ender = (UV)scan_hex(p + 1, e - p - 1, &numlen);
aaa80028
JH
2971 if (ender > 0xff)
2972 RExC_utf8 = 1;
b21ed0a9
GS
2973 /* numlen is generous */
2974 if (numlen + len >= 127) {
a0ed51b3
LW
2975 p--;
2976 goto loopdone;
2977 }
2978 p = e + 1;
2979 }
a0ed51b3
LW
2980 }
2981 else {
b21ed0a9 2982 numlen = 0; /* disallow underscores */
dff6d3cd 2983 ender = (UV)scan_hex(p, 2, &numlen);
a0ed51b3
LW
2984 p += numlen;
2985 }
a687059c 2986 break;
a0d0e21e
LW
2987 case 'c':
2988 p++;
bbce6d69 2989 ender = UCHARAT(p++);
2990 ender = toCTRL(ender);
a687059c 2991 break;
a0d0e21e
LW
2992 case '0': case '1': case '2': case '3':case '4':
2993 case '5': case '6': case '7': case '8':case '9':
2994 if (*p == '0' ||
830247a4 2995 (isDIGIT(p[1]) && atoi(p) >= RExC_npar) ) {
b21ed0a9 2996 numlen = 0; /* disallow underscores */
dff6d3cd 2997 ender = (UV)scan_oct(p, 3, &numlen);
a0d0e21e
LW
2998 p += numlen;
2999 }
3000 else {
3001 --p;
3002 goto loopdone;
a687059c
LW
3003 }
3004 break;
a0d0e21e 3005 case '\0':
830247a4 3006 if (p >= RExC_end)
b45f050a 3007 FAIL("Trailing \\");
a687059c 3008 /* FALL THROUGH */
a0d0e21e 3009 default:
e476b1b5 3010 if (!SIZE_ONLY && ckWARN(WARN_REGEXP) && isALPHA(*p))
b45f050a 3011 vWARN2(p +1, "Unrecognized escape \\%c passed through", *p);
a0ed51b3 3012 goto normal_default;
a0d0e21e
LW
3013 }
3014 break;
a687059c 3015 default:
a0ed51b3 3016 normal_default:
fd400ab9 3017 if (UTF8_IS_START(*p) && UTF) {
5e12f4fb 3018 ender = utf8n_to_uvchr((U8*)p, RExC_end - p,
ba210ebe 3019 &numlen, 0);
a0ed51b3
LW
3020 p += numlen;
3021 }
3022 else
3023 ender = *p++;
a0d0e21e 3024 break;
a687059c 3025 }
830247a4
IZ
3026 if (RExC_flags16 & PMf_EXTENDED)
3027 p = regwhite(p, RExC_end);
a0ed51b3
LW
3028 if (UTF && FOLD) {
3029 if (LOC)
5e12f4fb 3030 ender = toLOWER_LC_uvchr(ender);
a0ed51b3
LW
3031 else
3032 ender = toLOWER_uni(ender);
3033 }
a0d0e21e
LW
3034 if (ISMULT2(p)) { /* Back off on ?+*. */
3035 if (len)
3036 p = oldp;
2b9d42f0 3037 else if (!UNI_IS_INVARIANT(NATIVE_TO_UNI(ender)) && UTF) {
830247a4 3038 reguni(pRExC_state, ender, s, &numlen);
a0ed51b3
LW
3039 s += numlen;
3040 len += numlen;
3041 }
a0d0e21e
LW
3042 else {
3043 len++;
cd439c50 3044 REGC(ender, s++);
a0d0e21e
LW
3045 }
3046 break;
a687059c 3047 }
2b9d42f0 3048 if (!UNI_IS_INVARIANT(NATIVE_TO_UNI(ender)) && UTF) {
830247a4 3049 reguni(pRExC_state, ender, s, &numlen);
a0ed51b3
LW
3050 s += numlen;
3051 len += numlen - 1;
3052 }
3053 else
cd439c50 3054 REGC(ender, s++);
a0d0e21e
LW
3055 }
3056 loopdone:
830247a4 3057 RExC_parse = p - 1;
fac92740 3058 Set_Node_Cur_Length(ret); /* MJD */
830247a4 3059 nextchar(pRExC_state);
793db0cb
JH
3060 {
3061 /* len is STRLEN which is unsigned, need to copy to signed */
3062 IV iv = len;
3063 if (iv < 0)
3064 vFAIL("Internal disaster");
3065 }
a0d0e21e
LW
3066 if (len > 0)
3067 *flagp |= HASWIDTH;
3068 if (len == 1)
3069 *flagp |= SIMPLE;
c277df42 3070 if (!SIZE_ONLY)
cd439c50
IZ
3071 STR_LEN(ret) = len;
3072 if (SIZE_ONLY)
830247a4 3073 RExC_size += STR_SZ(len);
cd439c50 3074 else
830247a4 3075 RExC_emit += STR_SZ(len);
a687059c 3076 }
a0d0e21e
LW
3077 break;
3078 }
a687059c 3079
a0d0e21e 3080 return(ret);
a687059c
LW
3081}
3082
873ef191 3083STATIC char *
cea2e8a9 3084S_regwhite(pTHX_ char *p, char *e)
5b5a24f7
CS
3085{
3086 while (p < e) {
3087 if (isSPACE(*p))
3088 ++p;
3089 else if (*p == '#') {
3090 do {
3091 p++;
3092 } while (p < e && *p != '\n');
3093 }
3094 else
3095 break;
3096 }
3097 return p;
3098}
3099
b8c5462f
JH
3100/* Parse POSIX character classes: [[:foo:]], [[=foo=]], [[.foo.]].
3101 Character classes ([:foo:]) can also be negated ([:^foo:]).
3102 Returns a named class id (ANYOF_XXX) if successful, -1 otherwise.
3103 Equivalence classes ([=foo=]) and composites ([.foo.]) are parsed,
3104 but trigger warnings because they are currently unimplemented. */
3105STATIC I32
830247a4 3106S_regpposixcc(pTHX_ RExC_state_t *pRExC_state, I32 value)
620e46c5
JH
3107{
3108 char *posixcc = 0;
936ed897 3109 I32 namedclass = OOB_NAMEDCLASS;
620e46c5 3110
830247a4 3111 if (value == '[' && RExC_parse + 1 < RExC_end &&
620e46c5 3112 /* I smell either [: or [= or [. -- POSIX has been here, right? */
830247a4
IZ
3113 (*RExC_parse == ':' ||
3114 *RExC_parse == '=' ||
3115 *RExC_parse == '.')) {
3116 char c = *RExC_parse;
3117 char* s = RExC_parse++;
b81d288d 3118
830247a4
IZ
3119 while (RExC_parse < RExC_end && *RExC_parse != c)
3120 RExC_parse++;
3121 if (RExC_parse == RExC_end)
620e46c5 3122 /* Grandfather lone [:, [=, [. */
830247a4 3123 RExC_parse = s;
620e46c5 3124 else {
830247a4 3125 char* t = RExC_parse++; /* skip over the c */
b8c5462f 3126
830247a4
IZ
3127 if (*RExC_parse == ']') {
3128 RExC_parse++; /* skip over the ending ] */
b8c5462f
JH
3129 posixcc = s + 1;
3130 if (*s == ':') {
3131 I32 complement = *posixcc == '^' ? *posixcc++ : 0;
3132 I32 skip = 5; /* the most common skip */
3133
3134 switch (*posixcc) {
3135 case 'a':
3136 if (strnEQ(posixcc, "alnum", 5))
3137 namedclass =
3138 complement ? ANYOF_NALNUMC : ANYOF_ALNUMC;
3139 else if (strnEQ(posixcc, "alpha", 5))
3140 namedclass =
3141 complement ? ANYOF_NALPHA : ANYOF_ALPHA;
3142 else if (strnEQ(posixcc, "ascii", 5))
3143 namedclass =
3144 complement ? ANYOF_NASCII : ANYOF_ASCII;
3145 break;
aaa51d5e
JF
3146 case 'b':
3147 if (strnEQ(posixcc, "blank", 5))
3148 namedclass =
3149 complement ? ANYOF_NBLANK : ANYOF_BLANK;
3150 break;
b8c5462f
JH
3151 case 'c':
3152 if (strnEQ(posixcc, "cntrl", 5))
3153 namedclass =
3154 complement ? ANYOF_NCNTRL : ANYOF_CNTRL;
3155 break;
3156 case 'd':
3157 if (strnEQ(posixcc, "digit", 5))
3158 namedclass =
3159 complement ? ANYOF_NDIGIT : ANYOF_DIGIT;
3160 break;
3161 case 'g':
3162 if (strnEQ(posixcc, "graph", 5))
3163 namedclass =
3164 complement ? ANYOF_NGRAPH : ANYOF_GRAPH;
3165 break;
3166 case 'l':
3167 if (strnEQ(posixcc, "lower", 5))
3168 namedclass =
3169 complement ? ANYOF_NLOWER : ANYOF_LOWER;
3170 break;
3171 case 'p':
3172 if (strnEQ(posixcc, "print", 5))
3173 namedclass =
3174 complement ? ANYOF_NPRINT : ANYOF_PRINT;
3175 else if (strnEQ(posixcc, "punct", 5))
3176 namedclass =
3177 complement ? ANYOF_NPUNCT : ANYOF_PUNCT;
3178 break;
3179 case 's':
3180 if (strnEQ(posixcc, "space", 5))
3181 namedclass =
aaa51d5e 3182 complement ? ANYOF_NPSXSPC : ANYOF_PSXSPC;
cc4319de 3183 break;
b8c5462f
JH
3184 case 'u':
3185 if (strnEQ(posixcc, "upper", 5))
3186 namedclass =
3187 complement ? ANYOF_NUPPER : ANYOF_UPPER;
3188 break;
3189 case 'w': /* this is not POSIX, this is the Perl \w */
3190 if (strnEQ(posixcc, "word", 4)) {
3191 namedclass =
3192 complement ? ANYOF_NALNUM : ANYOF_ALNUM;
3193 skip = 4;
3194 }
3195 break;
3196 case 'x':
3197 if (strnEQ(posixcc, "xdigit", 6)) {
3198 namedclass =
3199 complement ? ANYOF_NXDIGIT : ANYOF_XDIGIT;
3200 skip = 6;
3201 }
3202 break;
3203 }
ac561586
JH
3204 if (namedclass == OOB_NAMEDCLASS ||
3205 posixcc[skip] != ':' ||
3206 posixcc[skip+1] != ']')
b45f050a
JF
3207 {
3208 Simple_vFAIL3("POSIX class [:%.*s:] unknown",
3209 t - s - 1, s + 1);
3210 }
3211 } else if (!SIZE_ONLY) {
b8c5462f 3212 /* [[=foo=]] and [[.foo.]] are still future. */
b45f050a 3213
830247a4 3214 /* adjust RExC_parse so the warning shows after
b45f050a 3215 the class closes */
830247a4
IZ
3216 while (*RExC_parse && *RExC_parse != ']')
3217 RExC_parse++;
b45f050a
JF
3218 Simple_vFAIL3("POSIX syntax [%c %c] is reserved for future extensions", c, c);
3219 }
b8c5462f
JH
3220 } else {
3221 /* Maternal grandfather:
3222 * "[:" ending in ":" but not in ":]" */
830247a4 3223 RExC_parse = s;
767d463e 3224 }
620e46c5
JH
3225 }
3226 }
3227
b8c5462f
JH
3228 return namedclass;
3229}
3230
3231STATIC void
830247a4 3232S_checkposixcc(pTHX_ RExC_state_t *pRExC_state)
b8c5462f 3233{
e476b1b5 3234 if (!SIZE_ONLY && ckWARN(WARN_REGEXP) &&
830247a4
IZ
3235 (*RExC_parse == ':' ||
3236 *RExC_parse == '=' ||
3237 *RExC_parse == '.')) {
3238 char *s = RExC_parse;
93733859 3239 char c = *s++;
b8c5462f
JH
3240
3241 while(*s && isALNUM(*s))
3242 s++;
3243 if (*s && c == *s && s[1] == ']') {
b45f050a
JF
3244 vWARN3(s+2, "POSIX syntax [%c %c] belongs inside character classes", c, c);
3245
3246 /* [[=foo=]] and [[.foo.]] are still future. */
b8c5462f 3247 if (c == '=' || c == '.')
b45f050a 3248 {
830247a4 3249 /* adjust RExC_parse so the error shows after
b45f050a 3250 the class closes */
830247a4 3251 while (*RExC_parse && *RExC_parse++ != ']')
b45f050a
JF
3252 ;
3253 Simple_vFAIL3("POSIX syntax [%c %c] is reserved for future extensions", c, c);
3254 }
b8c5462f
JH
3255 }
3256 }
620e46c5
JH
3257}
3258
76e3520e 3259STATIC regnode *
830247a4 3260S_regclass(pTHX_ RExC_state_t *pRExC_state)
a687059c 3261{
ffc61ed2 3262 register UV value;
3568d838 3263 register IV prevvalue = OOB_UNICODE;
ffc61ed2 3264 register IV range = 0;
c277df42 3265 register regnode *ret;
ba210ebe 3266 STRLEN numlen;
ffc61ed2 3267 IV namedclass;
9c5ffd7c 3268 char *rangebegin = 0;
936ed897 3269 bool need_class = 0;
9c5ffd7c 3270 SV *listsv = Nullsv;
ffc61ed2
JH
3271 register char *e;
3272 UV n;
3568d838 3273 bool optimize_invert = TRUE;
ffc61ed2
JH
3274
3275 ret = reganode(pRExC_state, ANYOF, 0);
3276
3277 if (!SIZE_ONLY)
3278 ANYOF_FLAGS(ret) = 0;
3279
3280 if (*RExC_parse == '^') { /* Complement of range. */
3281 RExC_naughty++;
3282 RExC_parse++;
3283 if (!SIZE_ONLY)
3284 ANYOF_FLAGS(ret) |= ANYOF_INVERT;
3285 }
a0d0e21e 3286
936ed897 3287 if (SIZE_ONLY)
830247a4 3288 RExC_size += ANYOF_SKIP;
936ed897 3289 else {
830247a4 3290 RExC_emit += ANYOF_SKIP;
936ed897
IZ
3291 if (FOLD)
3292 ANYOF_FLAGS(ret) |= ANYOF_FOLD;
3293 if (LOC)
3294 ANYOF_FLAGS(ret) |= ANYOF_LOCALE;
ffc61ed2
JH
3295 ANYOF_BITMAP_ZERO(ret);
3296 listsv = newSVpvn("# comment\n", 10);
a0d0e21e 3297 }
b8c5462f 3298
e476b1b5 3299 if (!SIZE_ONLY && ckWARN(WARN_REGEXP))
830247a4 3300 checkposixcc(pRExC_state);
b8c5462f 3301
830247a4 3302 if (*RExC_parse == ']' || *RExC_parse == '-')
ffc61ed2
JH
3303 goto charclassloop; /* allow 1st char to be ] or - */
3304
830247a4 3305 while (RExC_parse < RExC_end && *RExC_parse != ']') {
ffc61ed2
JH
3306
3307 charclassloop:
3308
3309 namedclass = OOB_NAMEDCLASS; /* initialize as illegal */
3310
73b437c8 3311 if (!range)
830247a4 3312 rangebegin = RExC_parse;
ffc61ed2 3313 if (UTF) {
5e12f4fb 3314 value = utf8n_to_uvchr((U8*)RExC_parse,
3568d838
JH
3315 RExC_end - RExC_parse,
3316 &numlen, 0);
ffc61ed2
JH
3317 RExC_parse += numlen;
3318 }
3319 else
3320 value = UCHARAT(RExC_parse++);
620e46c5 3321 if (value == '[')
830247a4 3322 namedclass = regpposixcc(pRExC_state, value);
620e46c5 3323 else if (value == '\\') {
ffc61ed2 3324 if (UTF) {
5e12f4fb 3325 value = utf8n_to_uvchr((U8*)RExC_parse,
ffc61ed2
JH
3326 RExC_end - RExC_parse,
3327 &numlen, 0);
3328 RExC_parse += numlen;
3329 }
3330 else
3331 value = UCHARAT(RExC_parse++);
470c3474 3332 /* Some compilers cannot handle switching on 64-bit integer
ffc61ed2 3333 * values, therefore value cannot be an UV. Yes, this will
e2962f66
JH
3334 * be a problem later if we want switch on Unicode.
3335 * A similar issue a little bit later when switching on
3336 * namedclass. --jhi */
ffc61ed2 3337 switch ((I32)value) {
b8c5462f
JH
3338 case 'w': namedclass = ANYOF_ALNUM; break;
3339 case 'W': namedclass = ANYOF_NALNUM; break;
3340 case 's': namedclass = ANYOF_SPACE; break;
3341 case 'S': namedclass = ANYOF_NSPACE; break;
3342 case 'd': namedclass = ANYOF_DIGIT; break;
3343 case 'D': namedclass = ANYOF_NDIGIT; break;
ffc61ed2
JH
3344 case 'p':
3345 case 'P':
3346 if (*RExC_parse == '{') {
3347 e = strchr(RExC_parse++, '}');
3348 if (!e)
3349 vFAIL("Missing right brace on \\p{}");
3350 n = e - RExC_parse;
3351 }
3352 else {
3353 e = RExC_parse;
3354 n = 1;
3355 }
3356 if (!SIZE_ONLY) {
3357 if (value == 'p')
3358 Perl_sv_catpvf(aTHX_ listsv,
3359 "+utf8::%.*s\n", (int)n, RExC_parse);
3360 else
3361 Perl_sv_catpvf(aTHX_ listsv,
3362 "!utf8::%.*s\n", (int)n, RExC_parse);
3363 }
3364 RExC_parse = e + 1;
3365 ANYOF_FLAGS(ret) |= ANYOF_UNICODE;
3366 continue;
b8c5462f
JH
3367 case 'n': value = '\n'; break;
3368 case 'r': value = '\r'; break;
3369 case 't': value = '\t'; break;
3370 case 'f': value = '\f'; break;
3371 case 'b': value = '\b'; break;
c7f1f016
NIS
3372 case 'e': value = ASCII_TO_NATIVE('\033');break;
3373 case 'a': value = ASCII_TO_NATIVE('\007');break;
b8c5462f 3374 case 'x':
ffc61ed2
JH
3375 if (*RExC_parse == '{') {
3376 e = strchr(RExC_parse++, '}');
b81d288d 3377 if (!e)
ffc61ed2
JH
3378 vFAIL("Missing right brace on \\x{}");
3379 numlen = 1; /* allow underscores */
3380 value = (UV)scan_hex(RExC_parse,
3381 e - RExC_parse,
3382 &numlen);
3383 RExC_parse = e + 1;
3384 }
3385 else {
3386 numlen = 0; /* disallow underscores */
3387 value = (UV)scan_hex(RExC_parse, 2, &numlen);
3388 RExC_parse += numlen;
3389 }
b8c5462f
JH
3390 break;
3391 case 'c':
830247a4 3392 value = UCHARAT(RExC_parse++);
b8c5462f
JH
3393 value = toCTRL(value);
3394 break;
3395 case '0': case '1': case '2': case '3': case '4':
3396 case '5': case '6': case '7': case '8': case '9':
b21ed0a9 3397 numlen = 0; /* disallow underscores */
830247a4
IZ
3398 value = (UV)scan_oct(--RExC_parse, 3, &numlen);
3399 RExC_parse += numlen;
b8c5462f 3400 break;
1028017a 3401 default:
e476b1b5 3402 if (!SIZE_ONLY && ckWARN(WARN_REGEXP) && isALPHA(value))
ffc61ed2
JH
3403 vWARN2(RExC_parse,
3404 "Unrecognized escape \\%c in character class passed through",
3405 (int)value);
1028017a 3406 break;
b8c5462f 3407 }
ffc61ed2
JH
3408 } /* end of \blah */
3409
3410 if (namedclass > OOB_NAMEDCLASS) { /* this is a named class \blah */
3411
3412 if (!SIZE_ONLY && !need_class)
936ed897 3413 ANYOF_CLASS_ZERO(ret);
ffc61ed2 3414
936ed897 3415 need_class = 1;
ffc61ed2
JH
3416
3417 /* a bad range like a-\d, a-[:digit:] ? */
3418 if (range) {
73b437c8 3419 if (!SIZE_ONLY) {
e476b1b5 3420 if (ckWARN(WARN_REGEXP))
830247a4 3421 vWARN4(RExC_parse,
b45f050a 3422 "False [] range \"%*.*s\"",
830247a4
IZ
3423 RExC_parse - rangebegin,
3424 RExC_parse - rangebegin,
b45f050a 3425 rangebegin);
3568d838
JH
3426 if (prevvalue < 256) {
3427 ANYOF_BITMAP_SET(ret, prevvalue);
ffc61ed2
JH
3428 ANYOF_BITMAP_SET(ret, '-');
3429 }
3430 else {
3431 ANYOF_FLAGS(ret) |= ANYOF_UNICODE;
3432 Perl_sv_catpvf(aTHX_ listsv,
3568d838 3433 "%04"UVxf"\n%04"UVxf"\n", (UV)prevvalue, (UV) '-');
ffc61ed2 3434 }
b8c5462f 3435 }
ffc61ed2
JH
3436
3437 range = 0; /* this was not a true range */
73b437c8 3438 }
ffc61ed2 3439
73b437c8 3440 if (!SIZE_ONLY) {
3568d838
JH
3441 if (namedclass > OOB_NAMEDCLASS)
3442 optimize_invert = FALSE;
e2962f66
JH
3443 /* Possible truncation here but in some 64-bit environments
3444 * the compiler gets heartburn about switch on 64-bit values.
3445 * A similar issue a little earlier when switching on value.
98f323fa 3446 * --jhi */
e2962f66 3447 switch ((I32)namedclass) {
73b437c8
JH
3448 case ANYOF_ALNUM:
3449 if (LOC)
936ed897 3450 ANYOF_CLASS_SET(ret, ANYOF_ALNUM);
73b437c8
JH
3451 else {
3452 for (value = 0; value < 256; value++)
3453 if (isALNUM(value))
936ed897 3454 ANYOF_BITMAP_SET(ret, value);
73b437c8 3455 }
ffc61ed2 3456 Perl_sv_catpvf(aTHX_ listsv, "+utf8::IsWord\n");
73b437c8
JH
3457 break;
3458 case ANYOF_NALNUM:
3459 if (LOC)
936ed897 3460 ANYOF_CLASS_SET(ret, ANYOF_NALNUM);
73b437c8
JH
3461 else {
3462 for (value = 0; value < 256; value++)
3463 if (!isALNUM(value))
936ed897 3464 ANYOF_BITMAP_SET(ret, value);
73b437c8 3465 }
ffc61ed2 3466 Perl_sv_catpvf(aTHX_ listsv, "!utf8::IsWord\n");
73b437c8 3467 break;
ffc61ed2 3468 case ANYOF_ALNUMC:
73b437c8 3469 if (LOC)
ffc61ed2 3470 ANYOF_CLASS_SET(ret, ANYOF_ALNUMC);
73b437c8
JH
3471 else {
3472 for (value = 0; value < 256; value++)
ffc61ed2 3473 if (isALNUMC(value))
936ed897 3474 ANYOF_BITMAP_SET(ret, value);
73b437c8 3475 }
ffc61ed2 3476 Perl_sv_catpvf(aTHX_ listsv, "+utf8::IsAlnum\n");
73b437c8
JH
3477 break;
3478 case ANYOF_NALNUMC:
3479 if (LOC)
936ed897 3480 ANYOF_CLASS_SET(ret, ANYOF_NALNUMC);
73b437c8
JH
3481 else {
3482 for (value = 0; value < 256; value++)
3483 if (!isALNUMC(value))
936ed897 3484 ANYOF_BITMAP_SET(ret, value);
73b437c8 3485 }
ffc61ed2 3486 Perl_sv_catpvf(aTHX_ listsv, "!utf8::IsAlnum\n");
73b437c8
JH
3487 break;
3488 case ANYOF_ALPHA:
3489 if (LOC)
936ed897 3490 ANYOF_CLASS_SET(ret, ANYOF_ALPHA);
73b437c8
JH
3491 else {
3492 for (value = 0; value < 256; value++)
3493 if (isALPHA(value))
936ed897 3494 ANYOF_BITMAP_SET(ret, value);
73b437c8 3495 }
ffc61ed2 3496 Perl_sv_catpvf(aTHX_ listsv, "+utf8::IsAlpha\n");
73b437c8
JH
3497 break;
3498 case ANYOF_NALPHA:
3499 if (LOC)
936ed897 3500 ANYOF_CLASS_SET(ret, ANYOF_NALPHA);
73b437c8
JH
3501 else {
3502 for (value = 0; value < 256; value++)
3503 if (!isALPHA(value))
936ed897 3504 ANYOF_BITMAP_SET(ret, value);
73b437c8 3505 }
ffc61ed2 3506 Perl_sv_catpvf(aTHX_ listsv, "!utf8::IsAlpha\n");
73b437c8
JH
3507 break;
3508 case ANYOF_ASCII:
3509 if (LOC)
936ed897 3510 ANYOF_CLASS_SET(ret, ANYOF_ASCII);
73b437c8 3511 else {
c7f1f016 3512#ifndef EBCDIC
1ba5c669
JH
3513 for (value = 0; value < 128; value++)
3514 ANYOF_BITMAP_SET(ret, value);
3515#else /* EBCDIC */
ffbc6a93 3516 for (value = 0; value < 256; value++) {
3a3c4447
JH
3517 if (isASCII(value))
3518 ANYOF_BITMAP_SET(ret, value);
ffbc6a93 3519 }
1ba5c669 3520#endif /* EBCDIC */
73b437c8 3521 }
ffc61ed2 3522 Perl_sv_catpvf(aTHX_ listsv, "+utf8::IsASCII\n");
73b437c8
JH
3523 break;
3524 case ANYOF_NASCII:
3525 if (LOC)
936ed897 3526 ANYOF_CLASS_SET(ret, ANYOF_NASCII);
73b437c8 3527 else {
c7f1f016 3528#ifndef EBCDIC
1ba5c669
JH
3529 for (value = 128; value < 256; value++)
3530 ANYOF_BITMAP_SET(ret, value);
3531#else /* EBCDIC */
ffbc6a93 3532 for (value = 0; value < 256; value++) {
3a3c4447
JH
3533 if (!isASCII(value))
3534 ANYOF_BITMAP_SET(ret, value);
ffbc6a93 3535 }
1ba5c669 3536#endif /* EBCDIC */
73b437c8 3537 }
ffc61ed2 3538 Perl_sv_catpvf(aTHX_ listsv, "!utf8::IsASCII\n");
73b437c8 3539 break;
aaa51d5e
JF
3540 case ANYOF_BLANK:
3541 if (LOC)
3542 ANYOF_CLASS_SET(ret, ANYOF_BLANK);
3543 else {
3544 for (value = 0; value < 256; value++)
3545 if (isBLANK(value))
3546 ANYOF_BITMAP_SET(ret, value);
3547 }
ffc61ed2 3548 Perl_sv_catpvf(aTHX_ listsv, "+utf8::IsBlank\n");
aaa51d5e
JF
3549 break;
3550 case ANYOF_NBLANK:
3551 if (LOC)
3552 ANYOF_CLASS_SET(ret, ANYOF_NBLANK);
3553 else {
3554 for (value = 0; value < 256; value++)
3555 if (!isBLANK(value))
3556 ANYOF_BITMAP_SET(ret, value);
3557 }
ffc61ed2 3558 Perl_sv_catpvf(aTHX_ listsv, "!utf8::IsBlank\n");
aaa51d5e 3559 break;
73b437c8
JH
3560 case ANYOF_CNTRL:
3561 if (LOC)
936ed897 3562 ANYOF_CLASS_SET(ret, ANYOF_CNTRL);
73b437c8
JH
3563 else {
3564 for (value = 0; value < 256; value++)
3565 if (isCNTRL(value))
936ed897 3566 ANYOF_BITMAP_SET(ret, value);
73b437c8 3567 }
ffc61ed2 3568 Perl_sv_catpvf(aTHX_ listsv, "+utf8::IsCntrl\n");
73b437c8
JH
3569 break;
3570 case ANYOF_NCNTRL:
3571 if (LOC)
936ed897 3572 ANYOF_CLASS_SET(ret, ANYOF_NCNTRL);
73b437c8
JH
3573 else {
3574 for (value = 0; value < 256; value++)
3575 if (!isCNTRL(value))
936ed897 3576 ANYOF_BITMAP_SET(ret, value);
73b437c8 3577 }
ffc61ed2
JH
3578 Perl_sv_catpvf(aTHX_ listsv, "!utf8::IsCntrl\n");
3579 break;
3580 case ANYOF_DIGIT:
3581 if (LOC)
3582 ANYOF_CLASS_SET(ret, ANYOF_DIGIT);
3583 else {
3584 /* consecutive digits assumed */
3585 for (value = '0'; value <= '9'; value++)
3586 ANYOF_BITMAP_SET(ret, value);
3587 }
3588 Perl_sv_catpvf(aTHX_ listsv, "+utf8::IsDigit\n");
3589 break;
3590 case ANYOF_NDIGIT:
3591 if (LOC)
3592 ANYOF_CLASS_SET(ret, ANYOF_NDIGIT);
3593 else {
3594 /* consecutive digits assumed */
3595 for (value = 0; value < '0'; value++)
3596 ANYOF_BITMAP_SET(ret, value);
3597 for (value = '9' + 1; value < 256; value++)
3598 ANYOF_BITMAP_SET(ret, value);
3599 }
3600 Perl_sv_catpvf(aTHX_ listsv, "!utf8::IsDigit\n");
73b437c8
JH
3601 break;
3602 case ANYOF_GRAPH:
3603 if (LOC)
936ed897 3604 ANYOF_CLASS_SET(ret, ANYOF_GRAPH);
73b437c8
JH
3605 else {
3606 for (value = 0; value < 256; value++)
3607 if (isGRAPH(value))
936ed897 3608 ANYOF_BITMAP_SET(ret, value);
73b437c8 3609 }
ffc61ed2 3610 Perl_sv_catpvf(aTHX_ listsv, "+utf8::IsGraph\n");
73b437c8
JH
3611 break;
3612 case ANYOF_NGRAPH:
3613 if (LOC)
936ed897 3614 ANYOF_CLASS_SET(ret, ANYOF_NGRAPH);
73b437c8
JH
3615 else {
3616 for (value = 0; value < 256; value++)
3617 if (!isGRAPH(value))
936ed897 3618 ANYOF_BITMAP_SET(ret, value);
73b437c8 3619 }
ffc61ed2 3620 Perl_sv_catpvf(aTHX_ listsv, "!utf8::IsGraph\n");
73b437c8
JH
3621 break;
3622 case ANYOF_LOWER:
3623 if (LOC)
936ed897 3624 ANYOF_CLASS_SET(ret, ANYOF_LOWER);
73b437c8
JH
3625 else {
3626 for (value = 0; value < 256; value++)
3627 if (isLOWER(value))
936ed897 3628 ANYOF_BITMAP_SET(ret, value);
73b437c8 3629 }
ffc61ed2 3630 Perl_sv_catpvf(aTHX_ listsv, "+utf8::IsLower\n");
73b437c8
JH
3631 break;
3632 case ANYOF_NLOWER:
3633 if (LOC)
936ed897 3634 ANYOF_CLASS_SET(ret, ANYOF_NLOWER);
73b437c8
JH
3635 else {
3636 for (value = 0; value < 256; value++)
3637 if (!isLOWER(value))
936ed897 3638 ANYOF_BITMAP_SET(ret, value);
73b437c8 3639 }
ffc61ed2 3640 Perl_sv_catpvf(aTHX_ listsv, "!utf8::IsLower\n");
73b437c8
JH
3641 break;
3642 case ANYOF_PRINT:
3643 if (LOC)
936ed897 3644 ANYOF_CLASS_SET(ret, ANYOF_PRINT);
73b437c8
JH
3645 else {
3646 for (value = 0; value < 256; value++)
3647 if (isPRINT(value))
936ed897 3648 ANYOF_BITMAP_SET(ret, value);
73b437c8 3649 }
ffc61ed2 3650 Perl_sv_catpvf(aTHX_ listsv, "+utf8::IsPrint\n");
73b437c8
JH
3651 break;
3652 case ANYOF_NPRINT:
3653 if (LOC)
936ed897 3654 ANYOF_CLASS_SET(ret, ANYOF_NPRINT);
73b437c8
JH
3655 else {
3656 for (value = 0; value < 256; value++)
3657 if (!isPRINT(value))
936ed897 3658 ANYOF_BITMAP_SET(ret, value);
73b437c8 3659 }
ffc61ed2 3660 Perl_sv_catpvf(aTHX_ listsv, "!utf8::IsPrint\n");
73b437c8 3661 break;
aaa51d5e
JF
3662 case ANYOF_PSXSPC:
3663 if (LOC)
3664 ANYOF_CLASS_SET(ret, ANYOF_PSXSPC);
3665 else {
3666 for (value = 0; value < 256; value++)
3667 if (isPSXSPC(value))
3668 ANYOF_BITMAP_SET(ret, value);
3669 }
ffc61ed2 3670 Perl_sv_catpvf(aTHX_ listsv, "+utf8::IsSpace\n");
aaa51d5e
JF
3671 break;
3672 case ANYOF_NPSXSPC:
3673 if (LOC)
3674 ANYOF_CLASS_SET(ret, ANYOF_NPSXSPC);
3675 else {
3676 for (value = 0; value < 256; value++)
3677 if (!isPSXSPC(value))
3678 ANYOF_BITMAP_SET(ret, value);
3679 }
ffc61ed2 3680 Perl_sv_catpvf(aTHX_ listsv, "!utf8::IsSpace\n");
aaa51d5e 3681 break;
73b437c8
JH
3682 case ANYOF_PUNCT:
3683 if (LOC)
936ed897 3684 ANYOF_CLASS_SET(ret, ANYOF_PUNCT);
73b437c8
JH
3685 else {
3686 for (value = 0; value < 256; value++)
3687 if (isPUNCT(value))
936ed897 3688 ANYOF_BITMAP_SET(ret, value);
73b437c8 3689 }
ffc61ed2 3690 Perl_sv_catpvf(aTHX_ listsv, "+utf8::IsPunct\n");
73b437c8
JH
3691 break;
3692 case ANYOF_NPUNCT:
3693 if (LOC)
936ed897 3694 ANYOF_CLASS_SET(ret, ANYOF_NPUNCT);
73b437c8
JH
3695 else {
3696 for (value = 0; value < 256; value++)
3697 if (!isPUNCT(value))
936ed897 3698 ANYOF_BITMAP_SET(ret, value);
73b437c8 3699 }
ffc61ed2
JH
3700 Perl_sv_catpvf(aTHX_ listsv, "!utf8::IsPunct\n");
3701 break;
3702 case ANYOF_SPACE:
3703 if (LOC)
3704 ANYOF_CLASS_SET(ret, ANYOF_SPACE);
3705 else {
3706 for (value = 0; value < 256; value++)
3707 if (isSPACE(value))
3708 ANYOF_BITMAP_SET(ret, value);
3709 }
3710 Perl_sv_catpvf(aTHX_ listsv, "+utf8::IsSpacePerl\n");
3711 break;
3712 case ANYOF_NSPACE:
3713 if (LOC)
3714 ANYOF_CLASS_SET(ret, ANYOF_NSPACE);
3715 else {
3716 for (value = 0; value < 256; value++)
3717 if (!isSPACE(value))
3718 ANYOF_BITMAP_SET(ret, value);
3719 }
3720 Perl_sv_catpvf(aTHX_ listsv, "!utf8::IsSpacePerl\n");
73b437c8
JH
3721 break;
3722 case ANYOF_UPPER:
3723 if (LOC)
936ed897 3724 ANYOF_CLASS_SET(ret, ANYOF_UPPER);
73b437c8
JH
3725 else {
3726 for (value = 0; value < 256; value++)
3727 if (isUPPER(value))
936ed897 3728 ANYOF_BITMAP_SET(ret, value);
73b437c8 3729 }
ffc61ed2 3730 Perl_sv_catpvf(aTHX_ listsv, "+utf8::IsUpper\n");
73b437c8
JH
3731 break;
3732 case ANYOF_NUPPER:
3733 if (LOC)
936ed897 3734 ANYOF_CLASS_SET(ret, ANYOF_NUPPER);
73b437c8
JH
3735 else {
3736 for (value = 0; value < 256; value++)
3737 if (!isUPPER(value))
936ed897 3738 ANYOF_BITMAP_SET(ret, value);
73b437c8 3739 }
ffc61ed2 3740 Perl_sv_catpvf(aTHX_ listsv, "!utf8::IsUpper\n");
73b437c8
JH
3741 break;
3742 case ANYOF_XDIGIT:
3743 if (LOC)
936ed897 3744 ANYOF_CLASS_SET(ret, ANYOF_XDIGIT);
73b437c8
JH
3745 else {
3746 for (value = 0; value < 256; value++)
3747 if (isXDIGIT(value))
936ed897 3748 ANYOF_BITMAP_SET(ret, value);
73b437c8 3749 }
ffc61ed2 3750 Perl_sv_catpvf(aTHX_ listsv, "+utf8::IsXDigit\n");
73b437c8
JH
3751 break;
3752 case ANYOF_NXDIGIT:
3753 if (LOC)
936ed897 3754 ANYOF_CLASS_SET(ret, ANYOF_NXDIGIT);
73b437c8
JH
3755 else {
3756 for (value = 0; value < 256; value++)
3757 if (!isXDIGIT(value))
936ed897 3758 ANYOF_BITMAP_SET(ret, value);
73b437c8 3759 }
ffc61ed2 3760 Perl_sv_catpvf(aTHX_ listsv, "!utf8::IsXDigit\n");
73b437c8
JH
3761 break;
3762 default:
b45f050a 3763 vFAIL("Invalid [::] class");
73b437c8 3764 break;
b8c5462f 3765 }
b8c5462f 3766 if (LOC)
936ed897 3767 ANYOF_FLAGS(ret) |= ANYOF_CLASS;
73b437c8 3768 continue;
a0d0e21e 3769 }
ffc61ed2
JH
3770 } /* end of namedclass \blah */
3771
a0d0e21e 3772 if (range) {
3a3c4447 3773 if (prevvalue > value) /* b-a */ {
b45f050a 3774 Simple_vFAIL4("Invalid [] range \"%*.*s\"",
830247a4
IZ
3775 RExC_parse - rangebegin,
3776 RExC_parse - rangebegin,
b45f050a 3777 rangebegin);
3568d838 3778 range = 0; /* not a valid range */
73b437c8 3779 }
a0d0e21e
LW
3780 }
3781 else {
3568d838 3782 prevvalue = value; /* save the beginning of the range */
830247a4
IZ
3783 if (*RExC_parse == '-' && RExC_parse+1 < RExC_end &&
3784 RExC_parse[1] != ']') {
3785 RExC_parse++;
ffc61ed2
JH
3786
3787 /* a bad range like \w-, [:word:]- ? */
3788 if (namedclass > OOB_NAMEDCLASS) {
e476b1b5 3789 if (ckWARN(WARN_REGEXP))
830247a4 3790 vWARN4(RExC_parse,
b45f050a 3791 "False [] range \"%*.*s\"",
830247a4
IZ
3792 RExC_parse - rangebegin,
3793 RExC_parse - rangebegin,
b45f050a 3794 rangebegin);
73b437c8 3795 if (!SIZE_ONLY)
936ed897 3796 ANYOF_BITMAP_SET(ret, '-');
73b437c8 3797 } else
ffc61ed2
JH
3798 range = 1; /* yeah, it's a range! */
3799 continue; /* but do it the next time */
a0d0e21e 3800 }
a687059c 3801 }
ffc61ed2 3802
93733859 3803 /* now is the next time */
ae5c130c 3804 if (!SIZE_ONLY) {
3568d838
JH
3805 IV i;
3806
3807 if (prevvalue < 256) {
3808 IV ceilvalue = value < 256 ? value : 255;
3809
3810#ifdef EBCDIC
3a3c4447
JH
3811 if ((isLOWER(prevvalue) && isLOWER(ceilvalue)) ||
3812 (isUPPER(prevvalue) && isUPPER(ceilvalue)))
ffc61ed2 3813 {
3568d838
JH
3814 if (isLOWER(prevvalue)) {
3815 for (i = prevvalue; i <= ceilvalue; i++)
ffc61ed2
JH
3816 if (isLOWER(i))
3817 ANYOF_BITMAP_SET(ret, i);
3818 } else {
3568d838 3819 for (i = prevvalue; i <= ceilvalue; i++)
ffc61ed2
JH
3820 if (isUPPER(i))
3821 ANYOF_BITMAP_SET(ret, i);
3822 }
8ada0baa 3823 }
ffc61ed2 3824 else
8ada0baa 3825#endif
3568d838
JH
3826 for (i = prevvalue; i <= ceilvalue; i++)
3827 ANYOF_BITMAP_SET(ret, i);
3828 }
3829 if (value > 255) {
ffc61ed2 3830 ANYOF_FLAGS(ret) |= ANYOF_UNICODE;
3568d838 3831 if (prevvalue < value)
ffc61ed2 3832 Perl_sv_catpvf(aTHX_ listsv, "%04"UVxf"\t%04"UVxf"\n",
3568d838
JH
3833 (UV)prevvalue, (UV)value);
3834 else if (prevvalue == value)
ffc61ed2
JH
3835 Perl_sv_catpvf(aTHX_ listsv, "%04"UVxf"\n",
3836 (UV)value);
3837 }
8ada0baa 3838 }
ffc61ed2
JH
3839
3840 range = 0; /* this range (if it was one) is done now */
a0d0e21e 3841 }
ffc61ed2 3842
936ed897 3843 if (need_class) {
4f66b38d 3844 ANYOF_FLAGS(ret) |= ANYOF_LARGE;
936ed897 3845 if (SIZE_ONLY)
830247a4 3846 RExC_size += ANYOF_CLASS_ADD_SKIP;
936ed897 3847 else
830247a4 3848 RExC_emit += ANYOF_CLASS_ADD_SKIP;
936ed897 3849 }
ffc61ed2 3850
ae5c130c 3851 /* optimize case-insensitive simple patterns (e.g. /[a-z]/i) */
b8c5462f 3852 if (!SIZE_ONLY &&
ffc61ed2 3853 /* If the only flag is folding (plus possibly inversion). */
516a5887
JH
3854 ((ANYOF_FLAGS(ret) & (ANYOF_FLAGS_ALL ^ ANYOF_INVERT)) == ANYOF_FOLD)
3855 ) {
a0ed51b3 3856 for (value = 0; value < 256; ++value) {
936ed897 3857 if (ANYOF_BITMAP_TEST(ret, value)) {
ffc61ed2
JH
3858 IV fold = PL_fold[value];
3859
3860 if (fold != value)
3861 ANYOF_BITMAP_SET(ret, fold);
ae5c130c
GS
3862 }
3863 }
936ed897 3864 ANYOF_FLAGS(ret) &= ~ANYOF_FOLD;
ae5c130c 3865 }
ffc61ed2 3866
ae5c130c 3867 /* optimize inverted simple patterns (e.g. [^a-z]) */
3568d838 3868 if (!SIZE_ONLY && optimize_invert &&
ffc61ed2
JH
3869 /* If the only flag is inversion. */
3870 (ANYOF_FLAGS(ret) & ANYOF_FLAGS_ALL) == ANYOF_INVERT) {
b8c5462f 3871 for (value = 0; value < ANYOF_BITMAP_SIZE; ++value)
936ed897 3872 ANYOF_BITMAP(ret)[value] ^= ANYOF_FLAGS_ALL;
1aa99e6b 3873 ANYOF_FLAGS(ret) = ANYOF_UNICODE_ALL;
ae5c130c 3874 }
a0d0e21e 3875
b81d288d 3876 if (!SIZE_ONLY) {
fde631ed 3877 AV *av = newAV();
ffc61ed2
JH
3878 SV *rv;
3879
3880 av_store(av, 0, listsv);
3881 av_store(av, 1, NULL);
3882 rv = newRV_noinc((SV*)av);
19860706 3883 n = add_data(pRExC_state, 1, "s");
830247a4 3884 RExC_rx->data->data[n] = (void*)rv;
ffc61ed2 3885 ARG_SET(ret, n);
a0ed51b3
LW
3886 }
3887
3888 return ret;
3889}
3890
76e3520e 3891STATIC char*
830247a4 3892S_nextchar(pTHX_ RExC_state_t *pRExC_state)
a0d0e21e 3893{
830247a4 3894 char* retval = RExC_parse++;
a0d0e21e 3895
4633a7c4 3896 for (;;) {
830247a4
IZ
3897 if (*RExC_parse == '(' && RExC_parse[1] == '?' &&
3898 RExC_parse[2] == '#') {
3899 while (*RExC_parse && *RExC_parse != ')')
3900 RExC_parse++;
3901 RExC_parse++;
4633a7c4
LW
3902 continue;
3903 }
830247a4
IZ
3904 if (RExC_flags16 & PMf_EXTENDED) {
3905 if (isSPACE(*RExC_parse)) {
3906 RExC_parse++;
748a9306
LW
3907 continue;
3908 }
830247a4
IZ
3909 else if (*RExC_parse == '#') {
3910 while (*RExC_parse && *RExC_parse != '\n')
3911 RExC_parse++;
3912 RExC_parse++;
748a9306
LW
3913 continue;
3914 }
748a9306 3915 }
4633a7c4 3916 return retval;
a0d0e21e 3917 }
a687059c
LW
3918}
3919
3920/*
c277df42 3921- reg_node - emit a node
a0d0e21e 3922*/
76e3520e 3923STATIC regnode * /* Location. */
830247a4 3924S_reg_node(pTHX_ RExC_state_t *pRExC_state, U8 op)
a687059c 3925{
c277df42
IZ
3926 register regnode *ret;
3927 register regnode *ptr;
a687059c 3928
830247a4 3929 ret = RExC_emit;
c277df42 3930 if (SIZE_ONLY) {
830247a4
IZ
3931 SIZE_ALIGN(RExC_size);
3932 RExC_size += 1;
a0d0e21e
LW
3933 return(ret);
3934 }
a687059c 3935
c277df42 3936 NODE_ALIGN_FILL(ret);
a0d0e21e 3937 ptr = ret;
c277df42 3938 FILL_ADVANCE_NODE(ptr, op);
fac92740
MJD
3939 if (RExC_offsets) { /* MJD */
3940 MJD_OFFSET_DEBUG((stderr, "%s:%u: (op %s) %s %u <- %u (len %u) (max %u).\n",
3941 "reg_node", __LINE__,
3942 reg_name[op],
3943 RExC_emit - RExC_emit_start > RExC_offsets[0]
3944 ? "Overwriting end of array!\n" : "OK",
3945 RExC_emit - RExC_emit_start,
3946 RExC_parse - RExC_start,
3947 RExC_offsets[0]));
3948 Set_Node_Offset(RExC_emit, RExC_parse + (op == END));
3949 }
3950
830247a4 3951 RExC_emit = ptr;
a687059c 3952
a0d0e21e 3953 return(ret);
a687059c
LW
3954}
3955
3956/*
a0d0e21e
LW
3957- reganode - emit a node with an argument
3958*/
76e3520e 3959STATIC regnode * /* Location. */
830247a4 3960S_reganode(pTHX_ RExC_state_t *pRExC_state, U8 op, U32 arg)
fe14fcc3 3961{
c277df42
IZ
3962 register regnode *ret;
3963 register regnode *ptr;
fe14fcc3 3964
830247a4 3965 ret = RExC_emit;
c277df42 3966 if (SIZE_ONLY) {
830247a4
IZ
3967 SIZE_ALIGN(RExC_size);
3968 RExC_size += 2;
a0d0e21e
LW
3969 return(ret);
3970 }
fe14fcc3 3971
c277df42 3972 NODE_ALIGN_FILL(ret);
a0d0e21e 3973 ptr = ret;
c277df42 3974 FILL_ADVANCE_NODE_ARG(ptr, op, arg);
fac92740
MJD
3975 if (RExC_offsets) { /* MJD */
3976 MJD_OFFSET_DEBUG((stderr, "%s: %s %u <- %u (max %u).\n",
3977 "reganode",
3978 RExC_emit - RExC_emit_start > RExC_offsets[0] ?
3979 "Overwriting end of array!\n" : "OK",
3980 RExC_emit - RExC_emit_start,
3981 RExC_parse - RExC_start,
3982 RExC_offsets[0]));
3983 Set_Cur_Node_Offset;
3984 }
3985
830247a4 3986 RExC_emit = ptr;
fe14fcc3 3987
a0d0e21e 3988 return(ret);
fe14fcc3
LW
3989}
3990
3991/*
cd439c50 3992- reguni - emit (if appropriate) a Unicode character
a0ed51b3
LW
3993*/
3994STATIC void
830247a4 3995S_reguni(pTHX_ RExC_state_t *pRExC_state, UV uv, char* s, STRLEN* lenp)
a0ed51b3 3996{
5e12f4fb 3997 *lenp = SIZE_ONLY ? UNISKIP(uv) : (uvchr_to_utf8((U8*)s, uv) - (U8*)s);
a0ed51b3
LW
3998}
3999
4000/*
a0d0e21e
LW
4001- reginsert - insert an operator in front of already-emitted operand
4002*
4003* Means relocating the operand.
4004*/
76e3520e 4005STATIC void
830247a4 4006S_reginsert(pTHX_ RExC_state_t *pRExC_state, U8 op, regnode *opnd)
a687059c 4007{
c277df42
IZ
4008 register regnode *src;
4009 register regnode *dst;
4010 register regnode *place;
4011 register int offset = regarglen[(U8)op];
b81d288d 4012
22c35a8c 4013/* (PL_regkind[(U8)op] == CURLY ? EXTRA_STEP_2ARGS : 0); */
c277df42
IZ
4014
4015 if (SIZE_ONLY) {
830247a4 4016 RExC_size += NODE_STEP_REGNODE + offset;
a0d0e21e
LW
4017 return;
4018 }
a687059c 4019
830247a4
IZ
4020 src = RExC_emit;
4021 RExC_emit += NODE_STEP_REGNODE + offset;
4022 dst = RExC_emit;
fac92740 4023 while (src > opnd) {
c277df42 4024 StructCopy(--src, --dst, regnode);
fac92740
MJD
4025 if (RExC_offsets) { /* MJD 20010112 */
4026 MJD_OFFSET_DEBUG((stderr, "%s: %s copy %u -> %u (max %u).\n",
4027 "reg_insert",
4028 dst - RExC_emit_start > RExC_offsets[0]
4029 ? "Overwriting end of array!\n" : "OK",
4030 src - RExC_emit_start,
4031 dst - RExC_emit_start,
4032 RExC_offsets[0]));
4033 Set_Node_Offset_To_R(dst-RExC_emit_start, Node_Offset(src));
4034 Set_Node_Length_To_R(dst-RExC_emit_start, Node_Length(src));
4035 }
4036 }
4037
a0d0e21e
LW
4038
4039 place = opnd; /* Op node, where operand used to be. */
fac92740
MJD
4040 if (RExC_offsets) { /* MJD */
4041 MJD_OFFSET_DEBUG((stderr, "%s: %s %u <- %u (max %u).\n",
4042 "reginsert",
4043 place - RExC_emit_start > RExC_offsets[0]
4044 ? "Overwriting end of array!\n" : "OK",
4045 place - RExC_emit_start,
4046 RExC_parse - RExC_start,
4047 RExC_offsets[0]));
4048 Set_Node_Offset(place, RExC_parse);
4049 }
c277df42
IZ
4050 src = NEXTOPER(place);
4051 FILL_ADVANCE_NODE(place, op);
4052 Zero(src, offset, regnode);
a687059c
LW
4053}
4054
4055/*
c277df42 4056- regtail - set the next-pointer at the end of a node chain of p to val.
a0d0e21e 4057*/
76e3520e 4058STATIC void
830247a4 4059S_regtail(pTHX_ RExC_state_t *pRExC_state, regnode *p, regnode *val)
a687059c 4060{
c277df42
IZ
4061 register regnode *scan;
4062 register regnode *temp;
a0d0e21e 4063
c277df42 4064 if (SIZE_ONLY)
a0d0e21e
LW
4065 return;
4066
4067 /* Find last node. */
4068 scan = p;
4069 for (;;) {
4070 temp = regnext(scan);
4071 if (temp == NULL)
4072 break;
4073 scan = temp;
4074 }
a687059c 4075
c277df42
IZ
4076 if (reg_off_by_arg[OP(scan)]) {
4077 ARG_SET(scan, val - scan);
a0ed51b3
LW
4078 }
4079 else {
c277df42
IZ
4080 NEXT_OFF(scan) = val - scan;
4081 }
a687059c
LW
4082}
4083
4084/*
a0d0e21e
LW
4085- regoptail - regtail on operand of first argument; nop if operandless
4086*/
76e3520e 4087STATIC void
830247a4 4088S_regoptail(pTHX_ RExC_state_t *pRExC_state, regnode *p, regnode *val)
a687059c 4089{
a0d0e21e 4090 /* "Operandless" and "op != BRANCH" are synonymous in practice. */
c277df42
IZ
4091 if (p == NULL || SIZE_ONLY)
4092 return;
22c35a8c 4093 if (PL_regkind[(U8)OP(p)] == BRANCH) {
830247a4 4094 regtail(pRExC_state, NEXTOPER(p), val);
a0ed51b3 4095 }
22c35a8c 4096 else if ( PL_regkind[(U8)OP(p)] == BRANCHJ) {
830247a4 4097 regtail(pRExC_state, NEXTOPER(NEXTOPER(p)), val);
a0ed51b3
LW
4098 }
4099 else
a0d0e21e 4100 return;
a687059c
LW
4101}
4102
4103/*
4104 - regcurly - a little FSA that accepts {\d+,?\d*}
4105 */
79072805 4106STATIC I32
cea2e8a9 4107S_regcurly(pTHX_ register char *s)
a687059c
LW
4108{
4109 if (*s++ != '{')
4110 return FALSE;
f0fcb552 4111 if (!isDIGIT(*s))
a687059c 4112 return FALSE;
f0fcb552 4113 while (isDIGIT(*s))
a687059c
LW
4114 s++;
4115 if (*s == ',')
4116 s++;
f0fcb552 4117 while (isDIGIT(*s))
a687059c
LW
4118 s++;
4119 if (*s != '}')
4120 return FALSE;
4121 return TRUE;
4122}
4123
a687059c 4124
76e3520e 4125STATIC regnode *
cea2e8a9 4126S_dumpuntil(pTHX_ regnode *start, regnode *node, regnode *last, SV* sv, I32 l)
c277df42 4127{
35ff7856 4128#ifdef DEBUGGING
f248d071 4129 register U8 op = EXACT; /* Arbitrary non-END op. */
155aba94 4130 register regnode *next;
c277df42
IZ
4131
4132 while (op != END && (!last || node < last)) {
4133 /* While that wasn't END last time... */
4134
4135 NODE_ALIGN(node);
4136 op = OP(node);
4137 if (op == CLOSE)
4138 l--;
4139 next = regnext(node);
4140 /* Where, what. */
4141 if (OP(node) == OPTIMIZED)
4142 goto after_print;
4143 regprop(sv, node);
b900a521 4144 PerlIO_printf(Perl_debug_log, "%4"IVdf":%*s%s", (IV)(node - start),
f1dbda3d 4145 (int)(2*l + 1), "", SvPVX(sv));
c277df42
IZ
4146 if (next == NULL) /* Next ptr. */
4147 PerlIO_printf(Perl_debug_log, "(0)");
b81d288d 4148 else
b900a521 4149 PerlIO_printf(Perl_debug_log, "(%"IVdf")", (IV)(next - start));
c277df42
IZ
4150 (void)PerlIO_putc(Perl_debug_log, '\n');
4151 after_print:
22c35a8c 4152 if (PL_regkind[(U8)op] == BRANCHJ) {
b81d288d
AB
4153 register regnode *nnode = (OP(next) == LONGJMP
4154 ? regnext(next)
c277df42
IZ
4155 : next);
4156 if (last && nnode > last)
4157 nnode = last;
4158 node = dumpuntil(start, NEXTOPER(NEXTOPER(node)), nnode, sv, l + 1);
a0ed51b3 4159 }
22c35a8c 4160 else if (PL_regkind[(U8)op] == BRANCH) {
c277df42 4161 node = dumpuntil(start, NEXTOPER(node), next, sv, l + 1);
a0ed51b3
LW
4162 }
4163 else if ( op == CURLY) { /* `next' might be very big: optimizer */
c277df42
IZ
4164 node = dumpuntil(start, NEXTOPER(node) + EXTRA_STEP_2ARGS,
4165 NEXTOPER(node) + EXTRA_STEP_2ARGS + 1, sv, l + 1);
a0ed51b3 4166 }
22c35a8c 4167 else if (PL_regkind[(U8)op] == CURLY && op != CURLYX) {
c277df42
IZ
4168 node = dumpuntil(start, NEXTOPER(node) + EXTRA_STEP_2ARGS,
4169 next, sv, l + 1);
a0ed51b3
LW
4170 }
4171 else if ( op == PLUS || op == STAR) {
c277df42 4172 node = dumpuntil(start, NEXTOPER(node), NEXTOPER(node) + 1, sv, l + 1);
a0ed51b3
LW
4173 }
4174 else if (op == ANYOF) {
4f66b38d
HS
4175 /* arglen 1 + class block */
4176 node += 1 + ((ANYOF_FLAGS(node) & ANYOF_LARGE)
4177 ? ANYOF_CLASS_SKIP : ANYOF_SKIP);
4178 node = NEXTOPER(node);
a0ed51b3 4179 }
22c35a8c 4180 else if (PL_regkind[(U8)op] == EXACT) {
c277df42 4181 /* Literal string, where present. */
cd439c50 4182 node += NODE_SZ_STR(node) - 1;
c277df42 4183 node = NEXTOPER(node);
a0ed51b3
LW
4184 }
4185 else {
c277df42
IZ
4186 node = NEXTOPER(node);
4187 node += regarglen[(U8)op];
4188 }
4189 if (op == CURLYX || op == OPEN)
4190 l++;
4191 else if (op == WHILEM)
4192 l--;
4193 }
17c3b450 4194#endif /* DEBUGGING */
c277df42
IZ
4195 return node;
4196}
4197
a687059c 4198/*
fd181c75 4199 - regdump - dump a regexp onto Perl_debug_log in vaguely comprehensible form
a687059c
LW
4200 */
4201void
864dbfa3 4202Perl_regdump(pTHX_ regexp *r)
a687059c 4203{
35ff7856 4204#ifdef DEBUGGING
46fc3d4c 4205 SV *sv = sv_newmortal();
a687059c 4206
c277df42 4207 (void)dumpuntil(r->program, r->program + 1, NULL, sv, 0);
a0d0e21e
LW
4208
4209 /* Header fields of interest. */
c277df42 4210 if (r->anchored_substr)
7b0972df 4211 PerlIO_printf(Perl_debug_log,
b81d288d 4212 "anchored `%s%.*s%s'%s at %"IVdf" ",
3280af22 4213 PL_colors[0],
7b0972df 4214 (int)(SvCUR(r->anchored_substr) - (SvTAIL(r->anchored_substr)!=0)),
b81d288d 4215 SvPVX(r->anchored_substr),
3280af22 4216 PL_colors[1],
c277df42 4217 SvTAIL(r->anchored_substr) ? "$" : "",
7b0972df 4218 (IV)r->anchored_offset);
c277df42 4219 if (r->float_substr)
7b0972df 4220 PerlIO_printf(Perl_debug_log,
b81d288d 4221 "floating `%s%.*s%s'%s at %"IVdf"..%"UVuf" ",
3280af22 4222 PL_colors[0],
b81d288d 4223 (int)(SvCUR(r->float_substr) - (SvTAIL(r->float_substr)!=0)),
2c2d71f5 4224 SvPVX(r->float_substr),
3280af22 4225 PL_colors[1],
c277df42 4226 SvTAIL(r->float_substr) ? "$" : "",
7b0972df 4227 (IV)r->float_min_offset, (UV)r->float_max_offset);
c277df42 4228 if (r->check_substr)
b81d288d
AB
4229 PerlIO_printf(Perl_debug_log,
4230 r->check_substr == r->float_substr
c277df42
IZ
4231 ? "(checking floating" : "(checking anchored");
4232 if (r->reganch & ROPT_NOSCAN)
4233 PerlIO_printf(Perl_debug_log, " noscan");
4234 if (r->reganch & ROPT_CHECK_ALL)
4235 PerlIO_printf(Perl_debug_log, " isall");
4236 if (r->check_substr)
4237 PerlIO_printf(Perl_debug_log, ") ");
4238
46fc3d4c 4239 if (r->regstclass) {
4240 regprop(sv, r->regstclass);
4241 PerlIO_printf(Perl_debug_log, "stclass `%s' ", SvPVX(sv));
4242 }
774d564b 4243 if (r->reganch & ROPT_ANCH) {
4244 PerlIO_printf(Perl_debug_log, "anchored");
4245 if (r->reganch & ROPT_ANCH_BOL)
4246 PerlIO_printf(Perl_debug_log, "(BOL)");
c277df42
IZ
4247 if (r->reganch & ROPT_ANCH_MBOL)
4248 PerlIO_printf(Perl_debug_log, "(MBOL)");
cad2e5aa
JH
4249 if (r->reganch & ROPT_ANCH_SBOL)
4250 PerlIO_printf(Perl_debug_log, "(SBOL)");
774d564b 4251 if (r->reganch & ROPT_ANCH_GPOS)
4252 PerlIO_printf(Perl_debug_log, "(GPOS)");
4253 PerlIO_putc(Perl_debug_log, ' ');
4254 }
c277df42
IZ
4255 if (r->reganch & ROPT_GPOS_SEEN)
4256 PerlIO_printf(Perl_debug_log, "GPOS ");
a0d0e21e 4257 if (r->reganch & ROPT_SKIP)
760ac839 4258 PerlIO_printf(Perl_debug_log, "plus ");
a0d0e21e 4259 if (r->reganch & ROPT_IMPLICIT)
760ac839 4260 PerlIO_printf(Perl_debug_log, "implicit ");
760ac839 4261 PerlIO_printf(Perl_debug_log, "minlen %ld ", (long) r->minlen);
ce862d02
IZ
4262 if (r->reganch & ROPT_EVAL_SEEN)
4263 PerlIO_printf(Perl_debug_log, "with eval ");
760ac839 4264 PerlIO_printf(Perl_debug_log, "\n");
fac92740
MJD
4265 if (r->offsets) {
4266 U32 i;
4267 U32 len = r->offsets[0];
392fbf5d 4268 PerlIO_printf(Perl_debug_log, "Offsets: [%"UVuf"]\n\t", (UV)r->offsets[0]);
fac92740 4269 for (i = 1; i <= len; i++)
392fbf5d
RB
4270 PerlIO_printf(Perl_debug_log, "%"UVuf"[%"UVuf"] ",
4271 (UV)r->offsets[i*2-1],
4272 (UV)r->offsets[i*2]);
fac92740
MJD
4273 PerlIO_printf(Perl_debug_log, "\n");
4274 }
17c3b450 4275#endif /* DEBUGGING */
a687059c
LW
4276}
4277
653099ff
GS
4278STATIC void
4279S_put_byte(pTHX_ SV *sv, int c)
4280{
ffc61ed2 4281 if (isCNTRL(c) || c == 127 || c == 255 || !isPRINT(c))
653099ff
GS
4282 Perl_sv_catpvf(aTHX_ sv, "\\%o", c);
4283 else if (c == '-' || c == ']' || c == '\\' || c == '^')
4284 Perl_sv_catpvf(aTHX_ sv, "\\%c", c);
4285 else
4286 Perl_sv_catpvf(aTHX_ sv, "%c", c);
4287}
4288
a687059c 4289/*
a0d0e21e
LW
4290- regprop - printable representation of opcode
4291*/
46fc3d4c 4292void
864dbfa3 4293Perl_regprop(pTHX_ SV *sv, regnode *o)
a687059c 4294{
35ff7856 4295#ifdef DEBUGGING
9b155405 4296 register int k;
a0d0e21e 4297
54dc92de 4298 sv_setpvn(sv, "", 0);
9b155405 4299 if (OP(o) >= reg_num) /* regnode.type is unsigned */
830247a4
IZ
4300 /* It would be nice to FAIL() here, but this may be called from
4301 regexec.c, and it would be hard to supply pRExC_state. */
4302 Perl_croak(aTHX_ "Corrupted regexp opcode");
9b155405
IZ
4303 sv_catpv(sv, (char*)reg_name[OP(o)]); /* Take off const! */
4304
4305 k = PL_regkind[(U8)OP(o)];
4306
4307 if (k == EXACT)
7821416a 4308 Perl_sv_catpvf(aTHX_ sv, " <%s%.*s%s>", PL_colors[0],
cd439c50 4309 STR_LEN(o), STRING(o), PL_colors[1]);
9b155405 4310 else if (k == CURLY) {
cb434fcc 4311 if (OP(o) == CURLYM || OP(o) == CURLYN || OP(o) == CURLYX)
cea2e8a9
GS
4312 Perl_sv_catpvf(aTHX_ sv, "[%d]", o->flags); /* Parenth number */
4313 Perl_sv_catpvf(aTHX_ sv, " {%d,%d}", ARG1(o), ARG2(o));
a0d0e21e 4314 }
2c2d71f5
JH
4315 else if (k == WHILEM && o->flags) /* Ordinal/of */
4316 Perl_sv_catpvf(aTHX_ sv, "[%d/%d]", o->flags & 0xf, o->flags>>4);
9b155405 4317 else if (k == REF || k == OPEN || k == CLOSE || k == GROUPP )
894356b3 4318 Perl_sv_catpvf(aTHX_ sv, "%d", (int)ARG(o)); /* Parenth number */
9b155405 4319 else if (k == LOGICAL)
04ebc1ab 4320 Perl_sv_catpvf(aTHX_ sv, "[%d]", o->flags); /* 2: embedded, otherwise 1 */
653099ff
GS
4321 else if (k == ANYOF) {
4322 int i, rangestart = -1;
ffc61ed2 4323 U8 flags = ANYOF_FLAGS(o);
19860706
JH
4324 const char * const anyofs[] = { /* Should be syncronized with
4325 * ANYOF_ #xdefines in regcomp.h */
653099ff
GS
4326 "\\w",
4327 "\\W",
4328 "\\s",
4329 "\\S",
4330 "\\d",
4331 "\\D",
4332 "[:alnum:]",
4333 "[:^alnum:]",
4334 "[:alpha:]",
4335 "[:^alpha:]",
4336 "[:ascii:]",
4337 "[:^ascii:]",
4338 "[:ctrl:]",
4339 "[:^ctrl:]",
4340 "[:graph:]",
4341 "[:^graph:]",
4342 "[:lower:]",
4343 "[:^lower:]",
4344 "[:print:]",
4345 "[:^print:]",
4346 "[:punct:]",
4347 "[:^punct:]",
4348 "[:upper:]",
aaa51d5e 4349 "[:^upper:]",
653099ff 4350 "[:xdigit:]",
aaa51d5e
JF
4351 "[:^xdigit:]",
4352 "[:space:]",
4353 "[:^space:]",
4354 "[:blank:]",
4355 "[:^blank:]"
653099ff
GS
4356 };
4357
19860706 4358 if (flags & ANYOF_LOCALE)
653099ff 4359 sv_catpv(sv, "{loc}");
19860706 4360 if (flags & ANYOF_FOLD)
653099ff
GS
4361 sv_catpv(sv, "{i}");
4362 Perl_sv_catpvf(aTHX_ sv, "[%s", PL_colors[0]);
19860706 4363 if (flags & ANYOF_INVERT)
653099ff 4364 sv_catpv(sv, "^");
ffc61ed2
JH
4365 for (i = 0; i <= 256; i++) {
4366 if (i < 256 && ANYOF_BITMAP_TEST(o,i)) {
4367 if (rangestart == -1)
4368 rangestart = i;
4369 } else if (rangestart != -1) {
4370 if (i <= rangestart + 3)
4371 for (; rangestart < i; rangestart++)
653099ff 4372 put_byte(sv, rangestart);
ffc61ed2
JH
4373 else {
4374 put_byte(sv, rangestart);
4375 sv_catpv(sv, "-");
4376 put_byte(sv, i - 1);
653099ff 4377 }
ffc61ed2 4378 rangestart = -1;
653099ff 4379 }
847a199f 4380 }
ffc61ed2
JH
4381
4382 if (o->flags & ANYOF_CLASS)
4383 for (i = 0; i < sizeof(anyofs)/sizeof(char*); i++)
4384 if (ANYOF_CLASS_TEST(o,i))
4385 sv_catpv(sv, anyofs[i]);
4386
4387 if (flags & ANYOF_UNICODE)
4388 sv_catpv(sv, "{unicode}");
1aa99e6b
IH
4389 else if (flags & ANYOF_UNICODE_ALL)
4390 sv_catpv(sv, "{all-unicode}");
ffc61ed2
JH
4391
4392 {
4393 SV *lv;
4394 SV *sw = regclass_swash(o, FALSE, &lv);
b81d288d 4395
ffc61ed2
JH
4396 if (lv) {
4397 if (sw) {
4398 UV i;
4399 U8 s[UTF8_MAXLEN+1];
b81d288d 4400
ffc61ed2 4401 for (i = 0; i <= 256; i++) { /* just the first 256 */
2b9d42f0 4402 U8 *e = uvchr_to_utf8(s, i);
ffc61ed2 4403
3568d838 4404 if (i < 256 && swash_fetch(sw, s, TRUE)) {
ffc61ed2
JH
4405 if (rangestart == -1)
4406 rangestart = i;
4407 } else if (rangestart != -1) {
4408 U8 *p;
b81d288d 4409
ffc61ed2
JH
4410 if (i <= rangestart + 3)
4411 for (; rangestart < i; rangestart++) {
2b9d42f0 4412 for(e = uvchr_to_utf8(s, rangestart), p = s; p < e; p++)
ffc61ed2
JH
4413 put_byte(sv, *p);
4414 }
4415 else {
2b9d42f0 4416 for (e = uvchr_to_utf8(s, rangestart), p = s; p < e; p++)
ffc61ed2
JH
4417 put_byte(sv, *p);
4418 sv_catpv(sv, "-");
2b9d42f0 4419 for (e = uvchr_to_utf8(s, i - 1), p = s; p < e; p++)
ffc61ed2
JH
4420 put_byte(sv, *p);
4421 }
4422 rangestart = -1;
4423 }
19860706 4424 }
ffc61ed2
JH
4425
4426 sv_catpv(sv, "..."); /* et cetera */
19860706 4427 }
fde631ed 4428
ffc61ed2
JH
4429 {
4430 char *s = savepv(SvPVX(lv));
4431 char *origs = s;
b81d288d 4432
ffc61ed2 4433 while(*s && *s != '\n') s++;
b81d288d 4434
ffc61ed2
JH
4435 if (*s == '\n') {
4436 char *t = ++s;
4437
4438 while (*s) {
4439 if (*s == '\n')
4440 *s = ' ';
4441 s++;
4442 }
4443 if (s[-1] == ' ')
4444 s[-1] = 0;
4445
4446 sv_catpv(sv, t);
fde631ed 4447 }
b81d288d 4448
ffc61ed2 4449 Safefree(origs);
fde631ed
JH
4450 }
4451 }
653099ff 4452 }
ffc61ed2 4453
653099ff
GS
4454 Perl_sv_catpvf(aTHX_ sv, "%s]", PL_colors[1]);
4455 }
9b155405 4456 else if (k == BRANCHJ && (OP(o) == UNLESSM || OP(o) == IFMATCH))
cea2e8a9 4457 Perl_sv_catpvf(aTHX_ sv, "[-%d]", o->flags);
17c3b450 4458#endif /* DEBUGGING */
35ff7856 4459}
a687059c 4460
cad2e5aa
JH
4461SV *
4462Perl_re_intuit_string(pTHX_ regexp *prog)
4463{ /* Assume that RE_INTUIT is set */
4464 DEBUG_r(
4465 { STRLEN n_a;
4466 char *s = SvPV(prog->check_substr,n_a);
4467
4468 if (!PL_colorset) reginitcolors();
4469 PerlIO_printf(Perl_debug_log,
4470 "%sUsing REx substr:%s `%s%.60s%s%s'\n",
4471 PL_colors[4],PL_colors[5],PL_colors[0],
4472 s,
4473 PL_colors[1],
4474 (strlen(s) > 60 ? "..." : ""));
4475 } );
4476
4477 return prog->check_substr;
4478}
4479
2b69d0c2 4480void
864dbfa3 4481Perl_pregfree(pTHX_ struct regexp *r)
a687059c 4482{
adac82c7 4483 DEBUG_r(if (!PL_colorset) reginitcolors());
7821416a
IZ
4484
4485 if (!r || (--r->refcnt > 0))
4486 return;
cad2e5aa
JH
4487 DEBUG_r(PerlIO_printf(Perl_debug_log,
4488 "%sFreeing REx:%s `%s%.60s%s%s'\n",
4489 PL_colors[4],PL_colors[5],PL_colors[0],
4490 r->precomp,
4491 PL_colors[1],
4492 (strlen(r->precomp) > 60 ? "..." : "")));
4493
c277df42 4494 if (r->precomp)
a0d0e21e 4495 Safefree(r->precomp);
fac92740
MJD
4496 if (r->offsets) /* 20010421 MJD */
4497 Safefree(r->offsets);
cf93c79d
IZ
4498 if (RX_MATCH_COPIED(r))
4499 Safefree(r->subbeg);
a193d654
GS
4500 if (r->substrs) {
4501 if (r->anchored_substr)
4502 SvREFCNT_dec(r->anchored_substr);
4503 if (r->float_substr)
4504 SvREFCNT_dec(r->float_substr);
2779dcf1 4505 Safefree(r->substrs);
a193d654 4506 }
c277df42
IZ
4507 if (r->data) {
4508 int n = r->data->count;
dfad63ad
HS
4509 AV* new_comppad = NULL;
4510 AV* old_comppad;
4511 SV** old_curpad;
4512
c277df42
IZ
4513 while (--n >= 0) {
4514 switch (r->data->what[n]) {
4515 case 's':
4516 SvREFCNT_dec((SV*)r->data->data[n]);
4517 break;
653099ff
GS
4518 case 'f':
4519 Safefree(r->data->data[n]);
4520 break;
dfad63ad
HS
4521 case 'p':
4522 new_comppad = (AV*)r->data->data[n];
4523 break;
c277df42 4524 case 'o':
dfad63ad 4525 if (new_comppad == NULL)
cea2e8a9 4526 Perl_croak(aTHX_ "panic: pregfree comppad");
dfad63ad
HS
4527 old_comppad = PL_comppad;
4528 old_curpad = PL_curpad;
1e6dc0b6
SB
4529 /* Watch out for global destruction's random ordering. */
4530 if (SvTYPE(new_comppad) == SVt_PVAV) {
4531 PL_comppad = new_comppad;
4532 PL_curpad = AvARRAY(new_comppad);
4533 }
4534 else
4535 PL_curpad = NULL;
c277df42 4536 op_free((OP_4tree*)r->data->data[n]);
dfad63ad
HS
4537 PL_comppad = old_comppad;
4538 PL_curpad = old_curpad;
4539 SvREFCNT_dec((SV*)new_comppad);
4540 new_comppad = NULL;
c277df42
IZ
4541 break;
4542 case 'n':
4543 break;
4544 default:
830247a4 4545 Perl_croak(aTHX_ "panic: regfree data code '%c'", r->data->what[n]);
c277df42
IZ
4546 }
4547 }
4548 Safefree(r->data->what);
4549 Safefree(r->data);
a0d0e21e
LW
4550 }
4551 Safefree(r->startp);
4552 Safefree(r->endp);
4553 Safefree(r);
a687059c 4554}
c277df42
IZ
4555
4556/*
4557 - regnext - dig the "next" pointer out of a node
4558 *
4559 * [Note, when REGALIGN is defined there are two places in regmatch()
4560 * that bypass this code for speed.]
4561 */
4562regnode *
864dbfa3 4563Perl_regnext(pTHX_ register regnode *p)
c277df42
IZ
4564{
4565 register I32 offset;
4566
3280af22 4567 if (p == &PL_regdummy)
c277df42
IZ
4568 return(NULL);
4569
4570 offset = (reg_off_by_arg[OP(p)] ? ARG(p) : NEXT_OFF(p));
4571 if (offset == 0)
4572 return(NULL);
4573
c277df42 4574 return(p+offset);
c277df42
IZ
4575}
4576
01f988be 4577STATIC void
cea2e8a9 4578S_re_croak2(pTHX_ const char* pat1,const char* pat2,...)
c277df42
IZ
4579{
4580 va_list args;
4581 STRLEN l1 = strlen(pat1);
4582 STRLEN l2 = strlen(pat2);
4583 char buf[512];
06bf62c7 4584 SV *msv;
c277df42
IZ
4585 char *message;
4586
4587 if (l1 > 510)
4588 l1 = 510;
4589 if (l1 + l2 > 510)
4590 l2 = 510 - l1;
4591 Copy(pat1, buf, l1 , char);
4592 Copy(pat2, buf + l1, l2 , char);
3b818b81
GS
4593 buf[l1 + l2] = '\n';
4594 buf[l1 + l2 + 1] = '\0';
8736538c
AS
4595#ifdef I_STDARG
4596 /* ANSI variant takes additional second argument */
c277df42 4597 va_start(args, pat2);
8736538c
AS
4598#else
4599 va_start(args);
4600#endif
5a844595 4601 msv = vmess(buf, &args);
c277df42 4602 va_end(args);
06bf62c7 4603 message = SvPV(msv,l1);
c277df42
IZ
4604 if (l1 > 512)
4605 l1 = 512;
4606 Copy(message, buf, l1 , char);
4607 buf[l1] = '\0'; /* Overwrite \n */
cea2e8a9 4608 Perl_croak(aTHX_ "%s", buf);
c277df42 4609}
a0ed51b3
LW
4610
4611/* XXX Here's a total kludge. But we need to re-enter for swash routines. */
4612
4613void
864dbfa3 4614Perl_save_re_context(pTHX)
b81d288d 4615{
830247a4
IZ
4616#if 0
4617 SAVEPPTR(RExC_precomp); /* uncompiled string. */
4618 SAVEI32(RExC_npar); /* () count. */
4619 SAVEI32(RExC_size); /* Code size. */
4620 SAVEI16(RExC_flags16); /* are we folding, multilining? */
4621 SAVEVPTR(RExC_rx); /* from regcomp.c */
4622 SAVEI32(RExC_seen); /* from regcomp.c */
4623 SAVEI32(RExC_sawback); /* Did we see \1, ...? */
4624 SAVEI32(RExC_naughty); /* How bad is this pattern? */
4625 SAVEVPTR(RExC_emit); /* Code-emit pointer; &regdummy = don't */
4626 SAVEPPTR(RExC_end); /* End of input for compile */
4627 SAVEPPTR(RExC_parse); /* Input-scan pointer. */
4628#endif
4629
4630 SAVEI32(PL_reg_flags); /* from regexec.c */
a0ed51b3 4631 SAVEPPTR(PL_bostr);
a0ed51b3
LW
4632 SAVEPPTR(PL_reginput); /* String-input pointer. */
4633 SAVEPPTR(PL_regbol); /* Beginning of input, for ^ check. */
4634 SAVEPPTR(PL_regeol); /* End of input, for $ check. */
7766f137
GS
4635 SAVEVPTR(PL_regstartp); /* Pointer to startp array. */
4636 SAVEVPTR(PL_regendp); /* Ditto for endp. */
4637 SAVEVPTR(PL_reglastparen); /* Similarly for lastparen. */
a0ed51b3 4638 SAVEPPTR(PL_regtill); /* How far we are required to go. */
b81d288d 4639 SAVEGENERICPV(PL_reg_start_tmp); /* from regexec.c */
a0ed51b3 4640 PL_reg_start_tmp = 0;
a0ed51b3
LW
4641 SAVEI32(PL_reg_start_tmpl); /* from regexec.c */
4642 PL_reg_start_tmpl = 0;
7766f137 4643 SAVEVPTR(PL_regdata);
a0ed51b3
LW
4644 SAVEI32(PL_reg_eval_set); /* from regexec.c */
4645 SAVEI32(PL_regnarrate); /* from regexec.c */
7766f137 4646 SAVEVPTR(PL_regprogram); /* from regexec.c */
a0ed51b3 4647 SAVEINT(PL_regindent); /* from regexec.c */
7766f137
GS
4648 SAVEVPTR(PL_regcc); /* from regexec.c */
4649 SAVEVPTR(PL_curcop);
7766f137
GS
4650 SAVEVPTR(PL_reg_call_cc); /* from regexec.c */
4651 SAVEVPTR(PL_reg_re); /* from regexec.c */
54b6e2fa
IZ
4652 SAVEPPTR(PL_reg_ganch); /* from regexec.c */
4653 SAVESPTR(PL_reg_sv); /* from regexec.c */
7766f137 4654 SAVEVPTR(PL_reg_magic); /* from regexec.c */
54b6e2fa 4655 SAVEI32(PL_reg_oldpos); /* from regexec.c */
7766f137
GS
4656 SAVEVPTR(PL_reg_oldcurpm); /* from regexec.c */
4657 SAVEVPTR(PL_reg_curpm); /* from regexec.c */
5fb7366e 4658 SAVEI32(PL_regnpar); /* () count. */
e49a9654 4659 SAVEI32(PL_regsize); /* from regexec.c */
54b6e2fa 4660#ifdef DEBUGGING
b81d288d 4661 SAVEPPTR(PL_reg_starttry); /* from regexec.c */
54b6e2fa 4662#endif
a0ed51b3 4663}
51371543
GS
4664
4665#ifdef PERL_OBJECT
51371543
GS
4666#include "XSUB.h"
4667#undef this
4668#define this pPerl
4669#endif
4670
4671static void
4672clear_re(pTHXo_ void *r)
4673{
4674 ReREFCNT_dec((regexp *)r);
4675}
ffbc6a93 4676