This is a live mirror of the Perl 5 development currently hosted at https://github.com/perl/perl5
Drop the -ansi from the default gcc flags.
[perl5.git] / regcomp.c
CommitLineData
a0d0e21e
LW
1/* regcomp.c
2 */
3
4/*
5 * "A fair jaw-cracker dwarf-language must be." --Samwise Gamgee
6 */
7
a687059c
LW
8/* NOTE: this is derived from Henry Spencer's regexp code, and should not
9 * confused with the original package (see point 3 below). Thanks, Henry!
10 */
11
12/* Additional note: this code is very heavily munged from Henry's version
13 * in places. In some spots I've traded clarity for efficiency, so don't
14 * blame Henry for some of the lack of readability.
15 */
16
e50aee73
AD
17/* The names of the functions have been changed from regcomp and
18 * regexec to pregcomp and pregexec in order to avoid conflicts
19 * with the POSIX routines of the same names.
20*/
21
b9d5759e
AD
22#ifdef PERL_EXT_RE_BUILD
23/* need to replace pregcomp et al, so enable that */
24# ifndef PERL_IN_XSUB_RE
25# define PERL_IN_XSUB_RE
26# endif
27/* need access to debugger hooks */
cad2e5aa 28# if defined(PERL_EXT_RE_DEBUG) && !defined(DEBUGGING)
b9d5759e
AD
29# define DEBUGGING
30# endif
31#endif
32
33#ifdef PERL_IN_XSUB_RE
d06ea78c 34/* We *really* need to overwrite these symbols: */
56953603
IZ
35# define Perl_pregcomp my_regcomp
36# define Perl_regdump my_regdump
37# define Perl_regprop my_regprop
d06ea78c 38# define Perl_pregfree my_regfree
cad2e5aa
JH
39# define Perl_re_intuit_string my_re_intuit_string
40/* *These* symbols are masked to allow static link. */
d06ea78c 41# define Perl_regnext my_regnext
f0b8d043 42# define Perl_save_re_context my_save_re_context
b81d288d 43# define Perl_reginitcolors my_reginitcolors
c5be433b
GS
44
45# define PERL_NO_GET_CONTEXT
b81d288d 46#endif
56953603 47
f0fcb552 48/*SUPPRESS 112*/
a687059c 49/*
e50aee73 50 * pregcomp and pregexec -- regsub and regerror are not used in perl
a687059c
LW
51 *
52 * Copyright (c) 1986 by University of Toronto.
53 * Written by Henry Spencer. Not derived from licensed software.
54 *
55 * Permission is granted to anyone to use this software for any
56 * purpose on any computer system, and to redistribute it freely,
57 * subject to the following restrictions:
58 *
59 * 1. The author is not responsible for the consequences of use of
60 * this software, no matter how awful, even if they arise
61 * from defects in it.
62 *
63 * 2. The origin of this software must not be misrepresented, either
64 * by explicit claim or by omission.
65 *
66 * 3. Altered versions must be plainly marked as such, and must not
67 * be misrepresented as being the original software.
68 *
69 *
70 **** Alterations to Henry's code are...
71 ****
bc89e66f 72 **** Copyright (c) 1991-2001, Larry Wall
a687059c 73 ****
9ef589d8
LW
74 **** You may distribute under the terms of either the GNU General Public
75 **** License or the Artistic License, as specified in the README file.
76
a687059c
LW
77 *
78 * Beware that some of this code is subtly aware of the way operator
79 * precedence is structured in regular expressions. Serious changes in
80 * regular-expression syntax might require a total rethink.
81 */
82#include "EXTERN.h"
864dbfa3 83#define PERL_IN_REGCOMP_C
a687059c 84#include "perl.h"
d06ea78c 85
c5be433b
GS
86#ifdef PERL_IN_XSUB_RE
87# if defined(PERL_CAPI) || defined(PERL_OBJECT)
88# include "XSUB.h"
89# endif
90#else
d06ea78c
GS
91# include "INTERN.h"
92#endif
c277df42
IZ
93
94#define REG_COMP_C
a687059c
LW
95#include "regcomp.h"
96
d4cce5f1 97#ifdef op
11343788 98#undef op
d4cce5f1 99#endif /* op */
11343788 100
fe14fcc3
LW
101#ifdef MSDOS
102# if defined(BUGGY_MSC6)
103 /* MSC 6.00A breaks on op/regexp.t test 85 unless we turn this off */
104 # pragma optimize("a",off)
105 /* But MSC 6.00A is happy with 'w', for aliases only across function calls*/
106 # pragma optimize("w",on )
107# endif /* BUGGY_MSC6 */
108#endif /* MSDOS */
109
a687059c
LW
110#ifndef STATIC
111#define STATIC static
112#endif
113
830247a4
IZ
114typedef struct RExC_state_t {
115 U16 flags16; /* are we folding, multilining? */
116 char *precomp; /* uncompiled string. */
117 regexp *rx;
fac92740 118 char *start; /* Start of input for compile */
830247a4
IZ
119 char *end; /* End of input for compile */
120 char *parse; /* Input-scan pointer. */
121 I32 whilem_seen; /* number of WHILEM in this expr */
fac92740 122 regnode *emit_start; /* Start of emitted-code area */
ffc61ed2 123 regnode *emit; /* Code-emit pointer; &regdummy = don't = compiling */
830247a4
IZ
124 I32 naughty; /* How bad is this pattern? */
125 I32 sawback; /* Did we see \1, ...? */
126 U32 seen;
127 I32 size; /* Code size. */
128 I32 npar; /* () count. */
129 I32 extralen;
130 I32 seen_zerolen;
131 I32 seen_evals;
1aa99e6b 132 I32 utf8;
830247a4
IZ
133#if ADD_TO_REGEXEC
134 char *starttry; /* -Dr: where regtry was called. */
135#define RExC_starttry (pRExC_state->starttry)
136#endif
137} RExC_state_t;
138
139#define RExC_flags16 (pRExC_state->flags16)
140#define RExC_precomp (pRExC_state->precomp)
141#define RExC_rx (pRExC_state->rx)
fac92740 142#define RExC_start (pRExC_state->start)
830247a4
IZ
143#define RExC_end (pRExC_state->end)
144#define RExC_parse (pRExC_state->parse)
145#define RExC_whilem_seen (pRExC_state->whilem_seen)
fac92740 146#define RExC_offsets (pRExC_state->rx->offsets) /* I am not like the others */
830247a4 147#define RExC_emit (pRExC_state->emit)
fac92740 148#define RExC_emit_start (pRExC_state->emit_start)
830247a4
IZ
149#define RExC_naughty (pRExC_state->naughty)
150#define RExC_sawback (pRExC_state->sawback)
151#define RExC_seen (pRExC_state->seen)
152#define RExC_size (pRExC_state->size)
153#define RExC_npar (pRExC_state->npar)
154#define RExC_extralen (pRExC_state->extralen)
155#define RExC_seen_zerolen (pRExC_state->seen_zerolen)
156#define RExC_seen_evals (pRExC_state->seen_evals)
1aa99e6b 157#define RExC_utf8 (pRExC_state->utf8)
830247a4 158
a687059c
LW
159#define ISMULT1(c) ((c) == '*' || (c) == '+' || (c) == '?')
160#define ISMULT2(s) ((*s) == '*' || (*s) == '+' || (*s) == '?' || \
161 ((*s) == '{' && regcurly(s)))
a687059c 162
35c8bce7
LW
163#ifdef SPSTART
164#undef SPSTART /* dratted cpp namespace... */
165#endif
a687059c
LW
166/*
167 * Flags to be passed up and down.
168 */
a687059c 169#define WORST 0 /* Worst case. */
821b33a5 170#define HASWIDTH 0x1 /* Known to match non-null strings. */
a0d0e21e
LW
171#define SIMPLE 0x2 /* Simple enough to be STAR/PLUS operand. */
172#define SPSTART 0x4 /* Starts with * or +. */
173#define TRYAGAIN 0x8 /* Weeded out a declaration. */
a687059c 174
2c2d71f5
JH
175/* Length of a variant. */
176
177typedef struct scan_data_t {
178 I32 len_min;
179 I32 len_delta;
180 I32 pos_min;
181 I32 pos_delta;
182 SV *last_found;
183 I32 last_end; /* min value, <0 unless valid. */
184 I32 last_start_min;
185 I32 last_start_max;
186 SV **longest; /* Either &l_fixed, or &l_float. */
187 SV *longest_fixed;
188 I32 offset_fixed;
189 SV *longest_float;
190 I32 offset_float_min;
191 I32 offset_float_max;
192 I32 flags;
193 I32 whilem_c;
cb434fcc 194 I32 *last_closep;
653099ff 195 struct regnode_charclass_class *start_class;
2c2d71f5
JH
196} scan_data_t;
197
a687059c 198/*
e50aee73 199 * Forward declarations for pregcomp()'s friends.
a687059c 200 */
a0d0e21e 201
b81d288d 202static scan_data_t zero_scan_data = { 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
cb434fcc 203 0, 0, 0, 0, 0, 0};
c277df42
IZ
204
205#define SF_BEFORE_EOL (SF_BEFORE_SEOL|SF_BEFORE_MEOL)
206#define SF_BEFORE_SEOL 0x1
207#define SF_BEFORE_MEOL 0x2
208#define SF_FIX_BEFORE_EOL (SF_FIX_BEFORE_SEOL|SF_FIX_BEFORE_MEOL)
209#define SF_FL_BEFORE_EOL (SF_FL_BEFORE_SEOL|SF_FL_BEFORE_MEOL)
210
09b7f37c
CB
211#ifdef NO_UNARY_PLUS
212# define SF_FIX_SHIFT_EOL (0+2)
213# define SF_FL_SHIFT_EOL (0+4)
214#else
215# define SF_FIX_SHIFT_EOL (+2)
216# define SF_FL_SHIFT_EOL (+4)
217#endif
c277df42
IZ
218
219#define SF_FIX_BEFORE_SEOL (SF_BEFORE_SEOL << SF_FIX_SHIFT_EOL)
220#define SF_FIX_BEFORE_MEOL (SF_BEFORE_MEOL << SF_FIX_SHIFT_EOL)
221
222#define SF_FL_BEFORE_SEOL (SF_BEFORE_SEOL << SF_FL_SHIFT_EOL)
223#define SF_FL_BEFORE_MEOL (SF_BEFORE_MEOL << SF_FL_SHIFT_EOL) /* 0x20 */
224#define SF_IS_INF 0x40
225#define SF_HAS_PAR 0x80
226#define SF_IN_PAR 0x100
227#define SF_HAS_EVAL 0x200
4bfe0158 228#define SCF_DO_SUBSTR 0x400
653099ff
GS
229#define SCF_DO_STCLASS_AND 0x0800
230#define SCF_DO_STCLASS_OR 0x1000
231#define SCF_DO_STCLASS (SCF_DO_STCLASS_AND|SCF_DO_STCLASS_OR)
e1901655 232#define SCF_WHILEM_VISITED_POS 0x2000
c277df42 233
1aa99e6b 234#define UTF RExC_utf8
830247a4
IZ
235#define LOC (RExC_flags16 & PMf_LOCALE)
236#define FOLD (RExC_flags16 & PMf_FOLD)
a0ed51b3 237
ffc61ed2 238#define OOB_UNICODE 12345678
93733859 239#define OOB_NAMEDCLASS -1
b8c5462f 240
a0ed51b3
LW
241#define CHR_SVLEN(sv) (UTF ? sv_len_utf8(sv) : SvCUR(sv))
242#define CHR_DIST(a,b) (UTF ? utf8_distance(a,b) : a - b)
243
8615cb43 244
b45f050a
JF
245/* length of regex to show in messages that don't mark a position within */
246#define RegexLengthToShowInErrorMessages 127
247
248/*
249 * If MARKER[12] are adjusted, be sure to adjust the constants at the top
250 * of t/op/regmesg.t, the tests in t/op/re_tests, and those in
251 * op/pragma/warn/regcomp.
252 */
7253e4e3
RK
253#define MARKER1 "<-- HERE" /* marker as it appears in the description */
254#define MARKER2 " <-- HERE " /* marker as it appears within the regex */
b81d288d 255
7253e4e3 256#define REPORT_LOCATION " in regex; marked by " MARKER1 " in m/%.*s" MARKER2 "%s/"
b45f050a
JF
257
258/*
259 * Calls SAVEDESTRUCTOR_X if needed, then calls Perl_croak with the given
260 * arg. Show regex, up to a maximum length. If it's too long, chop and add
261 * "...".
262 */
a4eb266f 263#define FAIL(msg) \
8615cb43 264 STMT_START { \
a4eb266f 265 char *ellipses = ""; \
830247a4 266 unsigned len = strlen(RExC_precomp); \
b45f050a 267 \
8615cb43 268 if (!SIZE_ONLY) \
830247a4 269 SAVEDESTRUCTOR_X(clear_re,(void*)RExC_rx); \
b45f050a
JF
270 \
271 if (len > RegexLengthToShowInErrorMessages) { \
272 /* chop 10 shorter than the max, to ensure meaning of "..." */ \
273 len = RegexLengthToShowInErrorMessages - 10; \
a4eb266f 274 ellipses = "..."; \
b45f050a
JF
275 } \
276 Perl_croak(aTHX_ "%s in regex m/%.*s%s/", \
830247a4 277 msg, (int)len, RExC_precomp, ellipses); \
8615cb43
JF
278 } STMT_END
279
b45f050a
JF
280/*
281 * Calls SAVEDESTRUCTOR_X if needed, then calls Perl_croak with the given
282 * args. Show regex, up to a maximum length. If it's too long, chop and add
283 * "...".
284 */
a4eb266f 285#define FAIL2(pat,msg) \
8615cb43 286 STMT_START { \
a4eb266f 287 char *ellipses = ""; \
830247a4 288 unsigned len = strlen(RExC_precomp); \
b45f050a 289 \
8615cb43 290 if (!SIZE_ONLY) \
830247a4 291 SAVEDESTRUCTOR_X(clear_re,(void*)RExC_rx); \
8615cb43 292 \
b45f050a
JF
293 if (len > RegexLengthToShowInErrorMessages) { \
294 /* chop 10 shorter than the max, to ensure meaning of "..." */ \
295 len = RegexLengthToShowInErrorMessages - 10; \
a4eb266f 296 ellipses = "..."; \
b45f050a
JF
297 } \
298 S_re_croak2(aTHX_ pat, " in regex m/%.*s%s/", \
830247a4 299 msg, (int)len, RExC_precomp, ellipses); \
b45f050a
JF
300 } STMT_END
301
302
303/*
304 * Simple_vFAIL -- like FAIL, but marks the current location in the scan
305 */
306#define Simple_vFAIL(m) \
307 STMT_START { \
830247a4 308 unsigned offset = strlen(RExC_precomp)-(RExC_end-RExC_parse); \
b45f050a
JF
309 \
310 Perl_croak(aTHX_ "%s" REPORT_LOCATION, \
830247a4 311 m, (int)offset, RExC_precomp, RExC_precomp + offset); \
b45f050a
JF
312 } STMT_END
313
314/*
315 * Calls SAVEDESTRUCTOR_X if needed, then Simple_vFAIL()
316 */
317#define vFAIL(m) \
318 STMT_START { \
319 if (!SIZE_ONLY) \
830247a4 320 SAVEDESTRUCTOR_X(clear_re,(void*)RExC_rx); \
b45f050a
JF
321 Simple_vFAIL(m); \
322 } STMT_END
323
324/*
325 * Like Simple_vFAIL(), but accepts two arguments.
326 */
327#define Simple_vFAIL2(m,a1) \
328 STMT_START { \
830247a4 329 unsigned offset = strlen(RExC_precomp)-(RExC_end-RExC_parse); \
b45f050a
JF
330 \
331 S_re_croak2(aTHX_ m, REPORT_LOCATION, a1, \
830247a4 332 (int)offset, RExC_precomp, RExC_precomp + offset); \
b45f050a
JF
333 } STMT_END
334
335/*
336 * Calls SAVEDESTRUCTOR_X if needed, then Simple_vFAIL2().
337 */
338#define vFAIL2(m,a1) \
339 STMT_START { \
340 if (!SIZE_ONLY) \
830247a4 341 SAVEDESTRUCTOR_X(clear_re,(void*)RExC_rx); \
b45f050a
JF
342 Simple_vFAIL2(m, a1); \
343 } STMT_END
344
345
346/*
347 * Like Simple_vFAIL(), but accepts three arguments.
348 */
349#define Simple_vFAIL3(m, a1, a2) \
350 STMT_START { \
830247a4 351 unsigned offset = strlen(RExC_precomp)-(RExC_end-RExC_parse); \
b45f050a
JF
352 \
353 S_re_croak2(aTHX_ m, REPORT_LOCATION, a1, a2, \
830247a4 354 (int)offset, RExC_precomp, RExC_precomp + offset); \
b45f050a
JF
355 } STMT_END
356
357/*
358 * Calls SAVEDESTRUCTOR_X if needed, then Simple_vFAIL3().
359 */
360#define vFAIL3(m,a1,a2) \
361 STMT_START { \
362 if (!SIZE_ONLY) \
830247a4 363 SAVEDESTRUCTOR_X(clear_re,(void*)RExC_rx); \
b45f050a
JF
364 Simple_vFAIL3(m, a1, a2); \
365 } STMT_END
366
367/*
368 * Like Simple_vFAIL(), but accepts four arguments.
369 */
370#define Simple_vFAIL4(m, a1, a2, a3) \
371 STMT_START { \
830247a4 372 unsigned offset = strlen(RExC_precomp)-(RExC_end-RExC_parse); \
b45f050a
JF
373 \
374 S_re_croak2(aTHX_ m, REPORT_LOCATION, a1, a2, a3,\
830247a4 375 (int)offset, RExC_precomp, RExC_precomp + offset); \
b45f050a
JF
376 } STMT_END
377
378/*
379 * Like Simple_vFAIL(), but accepts five arguments.
380 */
381#define Simple_vFAIL5(m, a1, a2, a3, a4) \
382 STMT_START { \
830247a4 383 unsigned offset = strlen(RExC_precomp)-(RExC_end-RExC_parse); \
b45f050a 384 S_re_croak2(aTHX_ m, REPORT_LOCATION, a1, a2, a3, a4,\
830247a4 385 (int)offset, RExC_precomp, RExC_precomp + offset); \
b45f050a
JF
386 } STMT_END
387
388
389#define vWARN(loc,m) \
390 STMT_START { \
830247a4 391 unsigned offset = strlen(RExC_precomp)-(RExC_end-(loc)); \
b45f050a 392 Perl_warner(aTHX_ WARN_REGEXP, "%s" REPORT_LOCATION,\
830247a4 393 m, (int)offset, RExC_precomp, RExC_precomp + offset); \
b45f050a
JF
394 } STMT_END \
395
396
397#define vWARN2(loc, m, a1) \
398 STMT_START { \
830247a4 399 unsigned offset = strlen(RExC_precomp)-(RExC_end-(loc)); \
b45f050a
JF
400 Perl_warner(aTHX_ WARN_REGEXP, m REPORT_LOCATION,\
401 a1, \
830247a4 402 (int)offset, RExC_precomp, RExC_precomp + offset); \
b45f050a
JF
403 } STMT_END
404
405#define vWARN3(loc, m, a1, a2) \
406 STMT_START { \
830247a4 407 unsigned offset = strlen(RExC_precomp) - (RExC_end - (loc)); \
b45f050a
JF
408 Perl_warner(aTHX_ WARN_REGEXP, m REPORT_LOCATION, \
409 a1, a2, \
830247a4 410 (int)offset, RExC_precomp, RExC_precomp + offset); \
b45f050a
JF
411 } STMT_END
412
413#define vWARN4(loc, m, a1, a2, a3) \
414 STMT_START { \
830247a4 415 unsigned offset = strlen(RExC_precomp)-(RExC_end-(loc)); \
b45f050a
JF
416 Perl_warner(aTHX_ WARN_REGEXP, m REPORT_LOCATION,\
417 a1, a2, a3, \
830247a4 418 (int)offset, RExC_precomp, RExC_precomp + offset); \
8615cb43
JF
419 } STMT_END
420
9d1d55b5
JP
421/* used for the parse_flags section for (?c) -- japhy */
422#define vWARN5(loc, m, a1, a2, a3, a4) \
423 STMT_START { \
424 unsigned offset = strlen(RExC_precomp)-(RExC_end-(loc)); \
425 Perl_warner(aTHX_ WARN_REGEXP, m REPORT_LOCATION, \
426 a1, a2, a3, a4, \
427 (int)offset, RExC_precomp, RExC_precomp + offset); \
428 } STMT_END
429
8615cb43 430
cd439c50
IZ
431/* Allow for side effects in s */
432#define REGC(c,s) STMT_START { if (!SIZE_ONLY) *(s) = (c); else (s);} STMT_END
433
fac92740
MJD
434/* Macros for recording node offsets. 20001227 mjd@plover.com
435 * Nodes are numbered 1, 2, 3, 4. Node #n's position is recorded in
436 * element 2*n-1 of the array. Element #2n holds the byte length node #n.
437 * Element 0 holds the number n.
438 */
439
440#define MJD_OFFSET_DEBUG(x)
441/* #define MJD_OFFSET_DEBUG(x) fprintf x */
442
443
444# define Set_Node_Offset_To_R(node,byte) \
445 STMT_START { \
446 if (! SIZE_ONLY) { \
447 if((node) < 0) { \
448 Perl_croak(aTHX_ "value of node is %d in Offset macro", node); \
449 } else { \
450 RExC_offsets[2*(node)-1] = (byte); \
451 } \
452 } \
453 } STMT_END
454
455# define Set_Node_Offset(node,byte) Set_Node_Offset_To_R((node)-RExC_emit_start, (byte)-RExC_start)
456# define Set_Cur_Node_Offset Set_Node_Offset(RExC_emit, RExC_parse)
457
458# define Set_Node_Length_To_R(node,len) \
459 STMT_START { \
460 if (! SIZE_ONLY) { \
461 MJD_OFFSET_DEBUG((stderr, "** (%d) size of node %d is %d.\n", __LINE__, (node), (len))); \
462 if((node) < 0) { \
463 Perl_croak(aTHX_ "value of node is %d in Length macro", node); \
464 } else { \
465 RExC_offsets[2*(node)] = (len); \
466 } \
467 } \
468 } STMT_END
469
470# define Set_Node_Length(node,len) Set_Node_Length_To_R((node)-RExC_emit_start, len)
471# define Set_Cur_Node_Length(len) Set_Node_Length(RExC_emit, len)
472# define Set_Node_Cur_Length(node) Set_Node_Length(node, RExC_parse - parse_start)
473
474/* Get offsets and lengths */
475#define Node_Offset(n) (RExC_offsets[2*((n)-RExC_emit_start)-1])
476#define Node_Length(n) (RExC_offsets[2*((n)-RExC_emit_start)])
477
51371543 478static void clear_re(pTHXo_ void *r);
4327152a 479
653099ff
GS
480/* Mark that we cannot extend a found fixed substring at this point.
481 Updata the longest found anchored substring and the longest found
482 floating substrings if needed. */
483
4327152a 484STATIC void
830247a4 485S_scan_commit(pTHX_ RExC_state_t *pRExC_state, scan_data_t *data)
c277df42 486{
a0ed51b3
LW
487 STRLEN l = CHR_SVLEN(data->last_found);
488 STRLEN old_l = CHR_SVLEN(*data->longest);
b81d288d 489
c277df42
IZ
490 if ((l >= old_l) && ((l > old_l) || (data->flags & SF_BEFORE_EOL))) {
491 sv_setsv(*data->longest, data->last_found);
492 if (*data->longest == data->longest_fixed) {
493 data->offset_fixed = l ? data->last_start_min : data->pos_min;
494 if (data->flags & SF_BEFORE_EOL)
b81d288d 495 data->flags
c277df42
IZ
496 |= ((data->flags & SF_BEFORE_EOL) << SF_FIX_SHIFT_EOL);
497 else
498 data->flags &= ~SF_FIX_BEFORE_EOL;
a0ed51b3
LW
499 }
500 else {
c277df42 501 data->offset_float_min = l ? data->last_start_min : data->pos_min;
b81d288d
AB
502 data->offset_float_max = (l
503 ? data->last_start_max
c277df42
IZ
504 : data->pos_min + data->pos_delta);
505 if (data->flags & SF_BEFORE_EOL)
b81d288d 506 data->flags
c277df42
IZ
507 |= ((data->flags & SF_BEFORE_EOL) << SF_FL_SHIFT_EOL);
508 else
509 data->flags &= ~SF_FL_BEFORE_EOL;
510 }
511 }
512 SvCUR_set(data->last_found, 0);
513 data->last_end = -1;
514 data->flags &= ~SF_BEFORE_EOL;
515}
516
653099ff
GS
517/* Can match anything (initialization) */
518STATIC void
830247a4 519S_cl_anything(pTHX_ RExC_state_t *pRExC_state, struct regnode_charclass_class *cl)
653099ff 520{
9af0acdb
JH
521 int value;
522
653099ff 523 ANYOF_CLASS_ZERO(cl);
9af0acdb
JH
524 for (value = 0; value < 256; ++value)
525 ANYOF_BITMAP_SET(cl, value);
1aa99e6b 526 cl->flags = ANYOF_EOS|ANYOF_UNICODE_ALL;
653099ff
GS
527 if (LOC)
528 cl->flags |= ANYOF_LOCALE;
529}
530
531/* Can match anything (initialization) */
532STATIC int
533S_cl_is_anything(pTHX_ struct regnode_charclass_class *cl)
534{
535 int value;
536
aaa51d5e 537 for (value = 0; value <= ANYOF_MAX; value += 2)
653099ff
GS
538 if (ANYOF_CLASS_TEST(cl, value) && ANYOF_CLASS_TEST(cl, value + 1))
539 return 1;
1aa99e6b
IH
540 if (!(cl->flags & ANYOF_UNICODE_ALL))
541 return 0;
9af0acdb
JH
542 for (value = 0; value < 256; ++value)
543 if (!ANYOF_BITMAP_TEST(cl, value))
653099ff
GS
544 return 0;
545 return 1;
546}
547
548/* Can match anything (initialization) */
549STATIC void
830247a4 550S_cl_init(pTHX_ RExC_state_t *pRExC_state, struct regnode_charclass_class *cl)
653099ff 551{
8ecf7187 552 Zero(cl, 1, struct regnode_charclass_class);
653099ff 553 cl->type = ANYOF;
830247a4 554 cl_anything(pRExC_state, cl);
653099ff
GS
555}
556
557STATIC void
830247a4 558S_cl_init_zero(pTHX_ RExC_state_t *pRExC_state, struct regnode_charclass_class *cl)
653099ff 559{
8ecf7187 560 Zero(cl, 1, struct regnode_charclass_class);
653099ff 561 cl->type = ANYOF;
830247a4 562 cl_anything(pRExC_state, cl);
653099ff
GS
563 if (LOC)
564 cl->flags |= ANYOF_LOCALE;
565}
566
567/* 'And' a given class with another one. Can create false positives */
568/* We assume that cl is not inverted */
569STATIC void
570S_cl_and(pTHX_ struct regnode_charclass_class *cl,
571 struct regnode_charclass_class *and_with)
572{
653099ff
GS
573 if (!(and_with->flags & ANYOF_CLASS)
574 && !(cl->flags & ANYOF_CLASS)
575 && (and_with->flags & ANYOF_LOCALE) == (cl->flags & ANYOF_LOCALE)
576 && !(and_with->flags & ANYOF_FOLD)
577 && !(cl->flags & ANYOF_FOLD)) {
578 int i;
579
580 if (and_with->flags & ANYOF_INVERT)
581 for (i = 0; i < ANYOF_BITMAP_SIZE; i++)
582 cl->bitmap[i] &= ~and_with->bitmap[i];
583 else
584 for (i = 0; i < ANYOF_BITMAP_SIZE; i++)
585 cl->bitmap[i] &= and_with->bitmap[i];
586 } /* XXXX: logic is complicated otherwise, leave it along for a moment. */
587 if (!(and_with->flags & ANYOF_EOS))
588 cl->flags &= ~ANYOF_EOS;
1aa99e6b
IH
589
590 if (cl->flags & ANYOF_UNICODE_ALL && and_with->flags & ANYOF_UNICODE) {
591 cl->flags &= ~ANYOF_UNICODE_ALL;
592 cl->flags |= ANYOF_UNICODE;
593 ARG_SET(cl, ARG(and_with));
594 }
595 if (!(and_with->flags & ANYOF_UNICODE_ALL))
596 cl->flags &= ~ANYOF_UNICODE_ALL;
597 if (!(and_with->flags & (ANYOF_UNICODE|ANYOF_UNICODE_ALL)))
598 cl->flags &= ~ANYOF_UNICODE;
653099ff
GS
599}
600
601/* 'OR' a given class with another one. Can create false positives */
602/* We assume that cl is not inverted */
603STATIC void
830247a4 604S_cl_or(pTHX_ RExC_state_t *pRExC_state, struct regnode_charclass_class *cl, struct regnode_charclass_class *or_with)
653099ff 605{
653099ff
GS
606 if (or_with->flags & ANYOF_INVERT) {
607 /* We do not use
608 * (B1 | CL1) | (!B2 & !CL2) = (B1 | !B2 & !CL2) | (CL1 | (!B2 & !CL2))
609 * <= (B1 | !B2) | (CL1 | !CL2)
610 * which is wasteful if CL2 is small, but we ignore CL2:
611 * (B1 | CL1) | (!B2 & !CL2) <= (B1 | CL1) | !B2 = (B1 | !B2) | CL1
612 * XXXX Can we handle case-fold? Unclear:
613 * (OK1(i) | OK1(i')) | !(OK1(i) | OK1(i')) =
614 * (OK1(i) | OK1(i')) | (!OK1(i) & !OK1(i'))
615 */
616 if ( (or_with->flags & ANYOF_LOCALE) == (cl->flags & ANYOF_LOCALE)
617 && !(or_with->flags & ANYOF_FOLD)
618 && !(cl->flags & ANYOF_FOLD) ) {
619 int i;
620
621 for (i = 0; i < ANYOF_BITMAP_SIZE; i++)
622 cl->bitmap[i] |= ~or_with->bitmap[i];
623 } /* XXXX: logic is complicated otherwise */
624 else {
830247a4 625 cl_anything(pRExC_state, cl);
653099ff
GS
626 }
627 } else {
628 /* (B1 | CL1) | (B2 | CL2) = (B1 | B2) | (CL1 | CL2)) */
629 if ( (or_with->flags & ANYOF_LOCALE) == (cl->flags & ANYOF_LOCALE)
b81d288d 630 && (!(or_with->flags & ANYOF_FOLD)
653099ff
GS
631 || (cl->flags & ANYOF_FOLD)) ) {
632 int i;
633
634 /* OR char bitmap and class bitmap separately */
635 for (i = 0; i < ANYOF_BITMAP_SIZE; i++)
636 cl->bitmap[i] |= or_with->bitmap[i];
637 if (or_with->flags & ANYOF_CLASS) {
638 for (i = 0; i < ANYOF_CLASSBITMAP_SIZE; i++)
639 cl->classflags[i] |= or_with->classflags[i];
640 cl->flags |= ANYOF_CLASS;
641 }
642 }
643 else { /* XXXX: logic is complicated, leave it along for a moment. */
830247a4 644 cl_anything(pRExC_state, cl);
653099ff
GS
645 }
646 }
647 if (or_with->flags & ANYOF_EOS)
648 cl->flags |= ANYOF_EOS;
1aa99e6b
IH
649
650 if (cl->flags & ANYOF_UNICODE && or_with->flags & ANYOF_UNICODE &&
651 ARG(cl) != ARG(or_with)) {
652 cl->flags |= ANYOF_UNICODE_ALL;
653 cl->flags &= ~ANYOF_UNICODE;
654 }
655 if (or_with->flags & ANYOF_UNICODE_ALL) {
656 cl->flags |= ANYOF_UNICODE_ALL;
657 cl->flags &= ~ANYOF_UNICODE;
658 }
653099ff
GS
659}
660
661/* REx optimizer. Converts nodes into quickier variants "in place".
662 Finds fixed substrings. */
663
c277df42
IZ
664/* Stops at toplevel WHILEM as well as at `last'. At end *scanp is set
665 to the position after last scanned or to NULL. */
666
76e3520e 667STATIC I32
830247a4 668S_study_chunk(pTHX_ RExC_state_t *pRExC_state, regnode **scanp, I32 *deltap, regnode *last, scan_data_t *data, U32 flags)
c277df42
IZ
669 /* scanp: Start here (read-write). */
670 /* deltap: Write maxlen-minlen here. */
671 /* last: Stop before this one. */
672{
673 I32 min = 0, pars = 0, code;
674 regnode *scan = *scanp, *next;
675 I32 delta = 0;
676 int is_inf = (flags & SCF_DO_SUBSTR) && (data->flags & SF_IS_INF);
aca2d497 677 int is_inf_internal = 0; /* The studied chunk is infinite */
c277df42
IZ
678 I32 is_par = OP(scan) == OPEN ? ARG(scan) : 0;
679 scan_data_t data_fake;
653099ff 680 struct regnode_charclass_class and_with; /* Valid if flags & SCF_DO_STCLASS_OR */
b81d288d 681
c277df42
IZ
682 while (scan && OP(scan) != END && scan < last) {
683 /* Peephole optimizer: */
684
22c35a8c 685 if (PL_regkind[(U8)OP(scan)] == EXACT) {
653099ff 686 /* Merge several consecutive EXACTish nodes into one. */
c277df42
IZ
687 regnode *n = regnext(scan);
688 U32 stringok = 1;
689#ifdef DEBUGGING
690 regnode *stop = scan;
b81d288d 691#endif
c277df42 692
cd439c50 693 next = scan + NODE_SZ_STR(scan);
c277df42
IZ
694 /* Skip NOTHING, merge EXACT*. */
695 while (n &&
b81d288d 696 ( PL_regkind[(U8)OP(n)] == NOTHING ||
c277df42
IZ
697 (stringok && (OP(n) == OP(scan))))
698 && NEXT_OFF(n)
699 && NEXT_OFF(scan) + NEXT_OFF(n) < I16_MAX) {
700 if (OP(n) == TAIL || n > next)
701 stringok = 0;
22c35a8c 702 if (PL_regkind[(U8)OP(n)] == NOTHING) {
c277df42
IZ
703 NEXT_OFF(scan) += NEXT_OFF(n);
704 next = n + NODE_STEP_REGNODE;
705#ifdef DEBUGGING
706 if (stringok)
707 stop = n;
b81d288d 708#endif
c277df42 709 n = regnext(n);
a0ed51b3 710 }
f49d4d0f 711 else if (stringok) {
cd439c50 712 int oldl = STR_LEN(scan);
c277df42 713 regnode *nnext = regnext(n);
f49d4d0f 714
b81d288d 715 if (oldl + STR_LEN(n) > U8_MAX)
c277df42
IZ
716 break;
717 NEXT_OFF(scan) += NEXT_OFF(n);
cd439c50
IZ
718 STR_LEN(scan) += STR_LEN(n);
719 next = n + NODE_SZ_STR(n);
c277df42 720 /* Now we can overwrite *n : */
f49d4d0f 721 Move(STRING(n), STRING(scan) + oldl, STR_LEN(n), char);
c277df42 722#ifdef DEBUGGING
f49d4d0f 723 stop = next - 1;
b81d288d 724#endif
c277df42
IZ
725 n = nnext;
726 }
727 }
728#ifdef DEBUGGING
729 /* Allow dumping */
cd439c50 730 n = scan + NODE_SZ_STR(scan);
c277df42 731 while (n <= stop) {
22c35a8c 732 if (PL_regkind[(U8)OP(n)] != NOTHING || OP(n) == NOTHING) {
c277df42
IZ
733 OP(n) = OPTIMIZED;
734 NEXT_OFF(n) = 0;
735 }
736 n++;
737 }
653099ff 738#endif
c277df42 739 }
653099ff
GS
740 /* Follow the next-chain of the current node and optimize
741 away all the NOTHINGs from it. */
c277df42 742 if (OP(scan) != CURLYX) {
048cfca1
GS
743 int max = (reg_off_by_arg[OP(scan)]
744 ? I32_MAX
745 /* I32 may be smaller than U16 on CRAYs! */
746 : (I32_MAX < U16_MAX ? I32_MAX : U16_MAX));
c277df42
IZ
747 int off = (reg_off_by_arg[OP(scan)] ? ARG(scan) : NEXT_OFF(scan));
748 int noff;
749 regnode *n = scan;
b81d288d 750
c277df42
IZ
751 /* Skip NOTHING and LONGJMP. */
752 while ((n = regnext(n))
22c35a8c 753 && ((PL_regkind[(U8)OP(n)] == NOTHING && (noff = NEXT_OFF(n)))
c277df42
IZ
754 || ((OP(n) == LONGJMP) && (noff = ARG(n))))
755 && off + noff < max)
756 off += noff;
757 if (reg_off_by_arg[OP(scan)])
758 ARG(scan) = off;
b81d288d 759 else
c277df42
IZ
760 NEXT_OFF(scan) = off;
761 }
653099ff
GS
762 /* The principal pseudo-switch. Cannot be a switch, since we
763 look into several different things. */
b81d288d 764 if (OP(scan) == BRANCH || OP(scan) == BRANCHJ
c277df42
IZ
765 || OP(scan) == IFTHEN || OP(scan) == SUSPEND) {
766 next = regnext(scan);
767 code = OP(scan);
b81d288d
AB
768
769 if (OP(next) == code || code == IFTHEN || code == SUSPEND) {
c277df42 770 I32 max1 = 0, min1 = I32_MAX, num = 0;
653099ff 771 struct regnode_charclass_class accum;
c277df42 772
653099ff 773 if (flags & SCF_DO_SUBSTR) /* XXXX Add !SUSPEND? */
830247a4 774 scan_commit(pRExC_state, data); /* Cannot merge strings after this. */
653099ff 775 if (flags & SCF_DO_STCLASS)
830247a4 776 cl_init_zero(pRExC_state, &accum);
c277df42 777 while (OP(scan) == code) {
830247a4 778 I32 deltanext, minnext, f = 0, fake;
653099ff 779 struct regnode_charclass_class this_class;
c277df42
IZ
780
781 num++;
782 data_fake.flags = 0;
b81d288d 783 if (data) {
2c2d71f5 784 data_fake.whilem_c = data->whilem_c;
cb434fcc
IZ
785 data_fake.last_closep = data->last_closep;
786 }
787 else
788 data_fake.last_closep = &fake;
c277df42
IZ
789 next = regnext(scan);
790 scan = NEXTOPER(scan);
791 if (code != BRANCH)
792 scan = NEXTOPER(scan);
653099ff 793 if (flags & SCF_DO_STCLASS) {
830247a4 794 cl_init(pRExC_state, &this_class);
653099ff
GS
795 data_fake.start_class = &this_class;
796 f = SCF_DO_STCLASS_AND;
b81d288d 797 }
e1901655
IZ
798 if (flags & SCF_WHILEM_VISITED_POS)
799 f |= SCF_WHILEM_VISITED_POS;
653099ff 800 /* we suppose the run is continuous, last=next...*/
830247a4
IZ
801 minnext = study_chunk(pRExC_state, &scan, &deltanext,
802 next, &data_fake, f);
b81d288d 803 if (min1 > minnext)
c277df42
IZ
804 min1 = minnext;
805 if (max1 < minnext + deltanext)
806 max1 = minnext + deltanext;
807 if (deltanext == I32_MAX)
aca2d497 808 is_inf = is_inf_internal = 1;
c277df42
IZ
809 scan = next;
810 if (data_fake.flags & (SF_HAS_PAR|SF_IN_PAR))
811 pars++;
405ff068 812 if (data && (data_fake.flags & SF_HAS_EVAL))
c277df42 813 data->flags |= SF_HAS_EVAL;
2c2d71f5
JH
814 if (data)
815 data->whilem_c = data_fake.whilem_c;
653099ff 816 if (flags & SCF_DO_STCLASS)
830247a4 817 cl_or(pRExC_state, &accum, &this_class);
b81d288d 818 if (code == SUSPEND)
c277df42
IZ
819 break;
820 }
821 if (code == IFTHEN && num < 2) /* Empty ELSE branch */
822 min1 = 0;
823 if (flags & SCF_DO_SUBSTR) {
824 data->pos_min += min1;
825 data->pos_delta += max1 - min1;
826 if (max1 != min1 || is_inf)
827 data->longest = &(data->longest_float);
828 }
829 min += min1;
830 delta += max1 - min1;
653099ff 831 if (flags & SCF_DO_STCLASS_OR) {
830247a4 832 cl_or(pRExC_state, data->start_class, &accum);
653099ff
GS
833 if (min1) {
834 cl_and(data->start_class, &and_with);
835 flags &= ~SCF_DO_STCLASS;
836 }
837 }
838 else if (flags & SCF_DO_STCLASS_AND) {
de0c8cb8
GS
839 if (min1) {
840 cl_and(data->start_class, &accum);
653099ff 841 flags &= ~SCF_DO_STCLASS;
de0c8cb8
GS
842 }
843 else {
b81d288d 844 /* Switch to OR mode: cache the old value of
de0c8cb8
GS
845 * data->start_class */
846 StructCopy(data->start_class, &and_with,
847 struct regnode_charclass_class);
848 flags &= ~SCF_DO_STCLASS_AND;
849 StructCopy(&accum, data->start_class,
850 struct regnode_charclass_class);
851 flags |= SCF_DO_STCLASS_OR;
852 data->start_class->flags |= ANYOF_EOS;
853 }
653099ff 854 }
a0ed51b3
LW
855 }
856 else if (code == BRANCHJ) /* single branch is optimized. */
c277df42
IZ
857 scan = NEXTOPER(NEXTOPER(scan));
858 else /* single branch is optimized. */
859 scan = NEXTOPER(scan);
860 continue;
a0ed51b3
LW
861 }
862 else if (OP(scan) == EXACT) {
cd439c50 863 I32 l = STR_LEN(scan);
1aa99e6b 864 UV uc = *((U8*)STRING(scan));
a0ed51b3 865 if (UTF) {
1aa99e6b
IH
866 U8 *s = (U8*)STRING(scan);
867 l = utf8_length(s, s + l);
9041c2e3 868 uc = utf8_to_uvchr(s, NULL);
a0ed51b3
LW
869 }
870 min += l;
c277df42 871 if (flags & SCF_DO_SUBSTR) { /* Update longest substr. */
c277df42
IZ
872 /* The code below prefers earlier match for fixed
873 offset, later match for variable offset. */
874 if (data->last_end == -1) { /* Update the start info. */
875 data->last_start_min = data->pos_min;
876 data->last_start_max = is_inf
b81d288d 877 ? I32_MAX : data->pos_min + data->pos_delta;
c277df42 878 }
cd439c50 879 sv_catpvn(data->last_found, STRING(scan), STR_LEN(scan));
c277df42
IZ
880 data->last_end = data->pos_min + l;
881 data->pos_min += l; /* As in the first entry. */
882 data->flags &= ~SF_BEFORE_EOL;
883 }
653099ff
GS
884 if (flags & SCF_DO_STCLASS_AND) {
885 /* Check whether it is compatible with what we know already! */
886 int compat = 1;
887
1aa99e6b 888 if (uc >= 0x100 ||
516a5887 889 (!(data->start_class->flags & (ANYOF_CLASS | ANYOF_LOCALE))
1aa99e6b 890 && !ANYOF_BITMAP_TEST(data->start_class, uc)
653099ff 891 && (!(data->start_class->flags & ANYOF_FOLD)
1aa99e6b 892 || !ANYOF_BITMAP_TEST(data->start_class, PL_fold[uc])))
516a5887 893 )
653099ff
GS
894 compat = 0;
895 ANYOF_CLASS_ZERO(data->start_class);
896 ANYOF_BITMAP_ZERO(data->start_class);
897 if (compat)
1aa99e6b 898 ANYOF_BITMAP_SET(data->start_class, uc);
653099ff 899 data->start_class->flags &= ~ANYOF_EOS;
9b877dbb
IH
900 if (uc < 0x100)
901 data->start_class->flags &= ~ANYOF_UNICODE_ALL;
653099ff
GS
902 }
903 else if (flags & SCF_DO_STCLASS_OR) {
904 /* false positive possible if the class is case-folded */
1aa99e6b 905 if (uc < 0x100)
9b877dbb
IH
906 ANYOF_BITMAP_SET(data->start_class, uc);
907 else
908 data->start_class->flags |= ANYOF_UNICODE_ALL;
653099ff
GS
909 data->start_class->flags &= ~ANYOF_EOS;
910 cl_and(data->start_class, &and_with);
911 }
912 flags &= ~SCF_DO_STCLASS;
a0ed51b3 913 }
653099ff 914 else if (PL_regkind[(U8)OP(scan)] == EXACT) { /* But OP != EXACT! */
cd439c50 915 I32 l = STR_LEN(scan);
1aa99e6b 916 UV uc = *((U8*)STRING(scan));
653099ff
GS
917
918 /* Search for fixed substrings supports EXACT only. */
b81d288d 919 if (flags & SCF_DO_SUBSTR)
830247a4 920 scan_commit(pRExC_state, data);
a0ed51b3 921 if (UTF) {
1aa99e6b
IH
922 U8 *s = (U8 *)STRING(scan);
923 l = utf8_length(s, s + l);
9041c2e3 924 uc = utf8_to_uvchr(s, NULL);
a0ed51b3
LW
925 }
926 min += l;
c277df42 927 if (data && (flags & SCF_DO_SUBSTR))
a0ed51b3 928 data->pos_min += l;
653099ff
GS
929 if (flags & SCF_DO_STCLASS_AND) {
930 /* Check whether it is compatible with what we know already! */
931 int compat = 1;
932
1aa99e6b 933 if (uc >= 0x100 ||
516a5887 934 (!(data->start_class->flags & (ANYOF_CLASS | ANYOF_LOCALE))
1aa99e6b 935 && !ANYOF_BITMAP_TEST(data->start_class, uc)
516a5887 936 && !ANYOF_BITMAP_TEST(data->start_class, PL_fold[uc])))
653099ff
GS
937 compat = 0;
938 ANYOF_CLASS_ZERO(data->start_class);
939 ANYOF_BITMAP_ZERO(data->start_class);
940 if (compat) {
1aa99e6b 941 ANYOF_BITMAP_SET(data->start_class, uc);
653099ff
GS
942 data->start_class->flags &= ~ANYOF_EOS;
943 data->start_class->flags |= ANYOF_FOLD;
944 if (OP(scan) == EXACTFL)
945 data->start_class->flags |= ANYOF_LOCALE;
946 }
947 }
948 else if (flags & SCF_DO_STCLASS_OR) {
949 if (data->start_class->flags & ANYOF_FOLD) {
950 /* false positive possible if the class is case-folded.
951 Assume that the locale settings are the same... */
1aa99e6b
IH
952 if (uc < 0x100)
953 ANYOF_BITMAP_SET(data->start_class, uc);
653099ff
GS
954 data->start_class->flags &= ~ANYOF_EOS;
955 }
956 cl_and(data->start_class, &and_with);
957 }
958 flags &= ~SCF_DO_STCLASS;
a0ed51b3 959 }
4d61ec05 960 else if (strchr((char*)PL_varies,OP(scan))) {
9c5ffd7c 961 I32 mincount, maxcount, minnext, deltanext, fl = 0;
aa7a4b56 962 I32 f = flags, pos_before = 0;
c277df42 963 regnode *oscan = scan;
653099ff
GS
964 struct regnode_charclass_class this_class;
965 struct regnode_charclass_class *oclass = NULL;
966
22c35a8c 967 switch (PL_regkind[(U8)OP(scan)]) {
653099ff 968 case WHILEM: /* End of (?:...)* . */
c277df42
IZ
969 scan = NEXTOPER(scan);
970 goto finish;
971 case PLUS:
653099ff 972 if (flags & (SCF_DO_SUBSTR | SCF_DO_STCLASS)) {
c277df42 973 next = NEXTOPER(scan);
653099ff 974 if (OP(next) == EXACT || (flags & SCF_DO_STCLASS)) {
b81d288d
AB
975 mincount = 1;
976 maxcount = REG_INFTY;
c277df42
IZ
977 next = regnext(scan);
978 scan = NEXTOPER(scan);
979 goto do_curly;
980 }
981 }
982 if (flags & SCF_DO_SUBSTR)
983 data->pos_min++;
984 min++;
985 /* Fall through. */
986 case STAR:
653099ff
GS
987 if (flags & SCF_DO_STCLASS) {
988 mincount = 0;
b81d288d 989 maxcount = REG_INFTY;
653099ff
GS
990 next = regnext(scan);
991 scan = NEXTOPER(scan);
992 goto do_curly;
993 }
b81d288d 994 is_inf = is_inf_internal = 1;
c277df42
IZ
995 scan = regnext(scan);
996 if (flags & SCF_DO_SUBSTR) {
830247a4 997 scan_commit(pRExC_state, data); /* Cannot extend fixed substrings */
c277df42
IZ
998 data->longest = &(data->longest_float);
999 }
1000 goto optimize_curly_tail;
1001 case CURLY:
b81d288d 1002 mincount = ARG1(scan);
c277df42
IZ
1003 maxcount = ARG2(scan);
1004 next = regnext(scan);
cb434fcc
IZ
1005 if (OP(scan) == CURLYX) {
1006 I32 lp = (data ? *(data->last_closep) : 0);
1007
1008 scan->flags = ((lp <= U8_MAX) ? lp : U8_MAX);
1009 }
c277df42
IZ
1010 scan = NEXTOPER(scan) + EXTRA_STEP_2ARGS;
1011 do_curly:
1012 if (flags & SCF_DO_SUBSTR) {
830247a4 1013 if (mincount == 0) scan_commit(pRExC_state,data); /* Cannot extend fixed substrings */
c277df42
IZ
1014 pos_before = data->pos_min;
1015 }
1016 if (data) {
1017 fl = data->flags;
1018 data->flags &= ~(SF_HAS_PAR|SF_IN_PAR|SF_HAS_EVAL);
1019 if (is_inf)
1020 data->flags |= SF_IS_INF;
1021 }
653099ff 1022 if (flags & SCF_DO_STCLASS) {
830247a4 1023 cl_init(pRExC_state, &this_class);
653099ff
GS
1024 oclass = data->start_class;
1025 data->start_class = &this_class;
1026 f |= SCF_DO_STCLASS_AND;
1027 f &= ~SCF_DO_STCLASS_OR;
1028 }
e1901655
IZ
1029 /* These are the cases when once a subexpression
1030 fails at a particular position, it cannot succeed
1031 even after backtracking at the enclosing scope.
b81d288d 1032
e1901655
IZ
1033 XXXX what if minimal match and we are at the
1034 initial run of {n,m}? */
1035 if ((mincount != maxcount - 1) && (maxcount != REG_INFTY))
1036 f &= ~SCF_WHILEM_VISITED_POS;
653099ff 1037
c277df42 1038 /* This will finish on WHILEM, setting scan, or on NULL: */
b81d288d
AB
1039 minnext = study_chunk(pRExC_state, &scan, &deltanext, last, data,
1040 mincount == 0
653099ff
GS
1041 ? (f & ~SCF_DO_SUBSTR) : f);
1042
1043 if (flags & SCF_DO_STCLASS)
1044 data->start_class = oclass;
1045 if (mincount == 0 || minnext == 0) {
1046 if (flags & SCF_DO_STCLASS_OR) {
830247a4 1047 cl_or(pRExC_state, data->start_class, &this_class);
653099ff
GS
1048 }
1049 else if (flags & SCF_DO_STCLASS_AND) {
b81d288d 1050 /* Switch to OR mode: cache the old value of
653099ff
GS
1051 * data->start_class */
1052 StructCopy(data->start_class, &and_with,
1053 struct regnode_charclass_class);
1054 flags &= ~SCF_DO_STCLASS_AND;
1055 StructCopy(&this_class, data->start_class,
1056 struct regnode_charclass_class);
1057 flags |= SCF_DO_STCLASS_OR;
1058 data->start_class->flags |= ANYOF_EOS;
1059 }
1060 } else { /* Non-zero len */
1061 if (flags & SCF_DO_STCLASS_OR) {
830247a4 1062 cl_or(pRExC_state, data->start_class, &this_class);
653099ff
GS
1063 cl_and(data->start_class, &and_with);
1064 }
1065 else if (flags & SCF_DO_STCLASS_AND)
1066 cl_and(data->start_class, &this_class);
1067 flags &= ~SCF_DO_STCLASS;
1068 }
c277df42
IZ
1069 if (!scan) /* It was not CURLYX, but CURLY. */
1070 scan = next;
b81d288d 1071 if (ckWARN(WARN_REGEXP) && (minnext + deltanext == 0)
99799961 1072 && data && !(data->flags & (SF_HAS_PAR|SF_IN_PAR))
17feb5d5 1073 && maxcount <= REG_INFTY/3) /* Complement check for big count */
b45f050a 1074 {
830247a4 1075 vWARN(RExC_parse,
b45f050a
JF
1076 "Quantifier unexpected on zero-length expression");
1077 }
1078
c277df42 1079 min += minnext * mincount;
b81d288d 1080 is_inf_internal |= ((maxcount == REG_INFTY
155aba94
GS
1081 && (minnext + deltanext) > 0)
1082 || deltanext == I32_MAX);
aca2d497 1083 is_inf |= is_inf_internal;
c277df42
IZ
1084 delta += (minnext + deltanext) * maxcount - minnext * mincount;
1085
1086 /* Try powerful optimization CURLYX => CURLYN. */
b81d288d 1087 if ( OP(oscan) == CURLYX && data
c277df42
IZ
1088 && data->flags & SF_IN_PAR
1089 && !(data->flags & SF_HAS_EVAL)
1090 && !deltanext && minnext == 1 ) {
1091 /* Try to optimize to CURLYN. */
1092 regnode *nxt = NEXTOPER(oscan) + EXTRA_STEP_2ARGS;
1093 regnode *nxt1 = nxt, *nxt2;
1094
1095 /* Skip open. */
1096 nxt = regnext(nxt);
4d61ec05 1097 if (!strchr((char*)PL_simple,OP(nxt))
22c35a8c 1098 && !(PL_regkind[(U8)OP(nxt)] == EXACT
b81d288d 1099 && STR_LEN(nxt) == 1))
c277df42
IZ
1100 goto nogo;
1101 nxt2 = nxt;
1102 nxt = regnext(nxt);
b81d288d 1103 if (OP(nxt) != CLOSE)
c277df42
IZ
1104 goto nogo;
1105 /* Now we know that nxt2 is the only contents: */
1106 oscan->flags = ARG(nxt);
1107 OP(oscan) = CURLYN;
1108 OP(nxt1) = NOTHING; /* was OPEN. */
1109#ifdef DEBUGGING
1110 OP(nxt1 + 1) = OPTIMIZED; /* was count. */
1111 NEXT_OFF(nxt1+ 1) = 0; /* just for consistancy. */
1112 NEXT_OFF(nxt2) = 0; /* just for consistancy with CURLY. */
1113 OP(nxt) = OPTIMIZED; /* was CLOSE. */
1114 OP(nxt + 1) = OPTIMIZED; /* was count. */
1115 NEXT_OFF(nxt+ 1) = 0; /* just for consistancy. */
b81d288d 1116#endif
c277df42 1117 }
c277df42
IZ
1118 nogo:
1119
1120 /* Try optimization CURLYX => CURLYM. */
b81d288d 1121 if ( OP(oscan) == CURLYX && data
c277df42 1122 && !(data->flags & SF_HAS_PAR)
c277df42
IZ
1123 && !(data->flags & SF_HAS_EVAL)
1124 && !deltanext ) {
1125 /* XXXX How to optimize if data == 0? */
1126 /* Optimize to a simpler form. */
1127 regnode *nxt = NEXTOPER(oscan) + EXTRA_STEP_2ARGS; /* OPEN */
1128 regnode *nxt2;
1129
1130 OP(oscan) = CURLYM;
1131 while ( (nxt2 = regnext(nxt)) /* skip over embedded stuff*/
b81d288d 1132 && (OP(nxt2) != WHILEM))
c277df42
IZ
1133 nxt = nxt2;
1134 OP(nxt2) = SUCCEED; /* Whas WHILEM */
c277df42
IZ
1135 /* Need to optimize away parenths. */
1136 if (data->flags & SF_IN_PAR) {
1137 /* Set the parenth number. */
1138 regnode *nxt1 = NEXTOPER(oscan) + EXTRA_STEP_2ARGS; /* OPEN*/
1139
b81d288d 1140 if (OP(nxt) != CLOSE)
b45f050a 1141 FAIL("Panic opt close");
c277df42
IZ
1142 oscan->flags = ARG(nxt);
1143 OP(nxt1) = OPTIMIZED; /* was OPEN. */
1144 OP(nxt) = OPTIMIZED; /* was CLOSE. */
1145#ifdef DEBUGGING
1146 OP(nxt1 + 1) = OPTIMIZED; /* was count. */
1147 OP(nxt + 1) = OPTIMIZED; /* was count. */
1148 NEXT_OFF(nxt1 + 1) = 0; /* just for consistancy. */
1149 NEXT_OFF(nxt + 1) = 0; /* just for consistancy. */
b81d288d 1150#endif
c277df42
IZ
1151#if 0
1152 while ( nxt1 && (OP(nxt1) != WHILEM)) {
1153 regnode *nnxt = regnext(nxt1);
b81d288d 1154
c277df42
IZ
1155 if (nnxt == nxt) {
1156 if (reg_off_by_arg[OP(nxt1)])
1157 ARG_SET(nxt1, nxt2 - nxt1);
1158 else if (nxt2 - nxt1 < U16_MAX)
1159 NEXT_OFF(nxt1) = nxt2 - nxt1;
1160 else
1161 OP(nxt) = NOTHING; /* Cannot beautify */
1162 }
1163 nxt1 = nnxt;
1164 }
1165#endif
1166 /* Optimize again: */
b81d288d 1167 study_chunk(pRExC_state, &nxt1, &deltanext, nxt,
e1901655 1168 NULL, 0);
a0ed51b3
LW
1169 }
1170 else
c277df42 1171 oscan->flags = 0;
c277df42 1172 }
e1901655
IZ
1173 else if ((OP(oscan) == CURLYX)
1174 && (flags & SCF_WHILEM_VISITED_POS)
1175 /* See the comment on a similar expression above.
1176 However, this time it not a subexpression
1177 we care about, but the expression itself. */
1178 && (maxcount == REG_INFTY)
1179 && data && ++data->whilem_c < 16) {
1180 /* This stays as CURLYX, we can put the count/of pair. */
2c2d71f5
JH
1181 /* Find WHILEM (as in regexec.c) */
1182 regnode *nxt = oscan + NEXT_OFF(oscan);
1183
1184 if (OP(PREVOPER(nxt)) == NOTHING) /* LONGJMP */
1185 nxt += ARG(nxt);
1186 PREVOPER(nxt)->flags = data->whilem_c
830247a4 1187 | (RExC_whilem_seen << 4); /* On WHILEM */
2c2d71f5 1188 }
b81d288d 1189 if (data && fl & (SF_HAS_PAR|SF_IN_PAR))
c277df42
IZ
1190 pars++;
1191 if (flags & SCF_DO_SUBSTR) {
1192 SV *last_str = Nullsv;
1193 int counted = mincount != 0;
1194
1195 if (data->last_end > 0 && mincount != 0) { /* Ends with a string. */
b81d288d 1196 I32 b = pos_before >= data->last_start_min
c277df42
IZ
1197 ? pos_before : data->last_start_min;
1198 STRLEN l;
1199 char *s = SvPV(data->last_found, l);
a0ed51b3
LW
1200 I32 old = b - data->last_start_min;
1201
1202 if (UTF)
1203 old = utf8_hop((U8*)s, old) - (U8*)s;
c277df42 1204
a0ed51b3 1205 l -= old;
c277df42 1206 /* Get the added string: */
79cb57f6 1207 last_str = newSVpvn(s + old, l);
c277df42
IZ
1208 if (deltanext == 0 && pos_before == b) {
1209 /* What was added is a constant string */
1210 if (mincount > 1) {
1211 SvGROW(last_str, (mincount * l) + 1);
b81d288d 1212 repeatcpy(SvPVX(last_str) + l,
c277df42
IZ
1213 SvPVX(last_str), l, mincount - 1);
1214 SvCUR(last_str) *= mincount;
1215 /* Add additional parts. */
b81d288d 1216 SvCUR_set(data->last_found,
c277df42
IZ
1217 SvCUR(data->last_found) - l);
1218 sv_catsv(data->last_found, last_str);
1219 data->last_end += l * (mincount - 1);
1220 }
2a8d9689
HS
1221 } else {
1222 /* start offset must point into the last copy */
1223 data->last_start_min += minnext * (mincount - 1);
4b2cff9a
HS
1224 data->last_start_max += is_inf ? 0 : (maxcount - 1)
1225 * (minnext + data->pos_delta);
c277df42
IZ
1226 }
1227 }
1228 /* It is counted once already... */
1229 data->pos_min += minnext * (mincount - counted);
1230 data->pos_delta += - counted * deltanext +
1231 (minnext + deltanext) * maxcount - minnext * mincount;
1232 if (mincount != maxcount) {
653099ff
GS
1233 /* Cannot extend fixed substrings found inside
1234 the group. */
830247a4 1235 scan_commit(pRExC_state,data);
c277df42
IZ
1236 if (mincount && last_str) {
1237 sv_setsv(data->last_found, last_str);
1238 data->last_end = data->pos_min;
b81d288d 1239 data->last_start_min =
a0ed51b3 1240 data->pos_min - CHR_SVLEN(last_str);
b81d288d
AB
1241 data->last_start_max = is_inf
1242 ? I32_MAX
c277df42 1243 : data->pos_min + data->pos_delta
a0ed51b3 1244 - CHR_SVLEN(last_str);
c277df42
IZ
1245 }
1246 data->longest = &(data->longest_float);
1247 }
aca2d497 1248 SvREFCNT_dec(last_str);
c277df42 1249 }
405ff068 1250 if (data && (fl & SF_HAS_EVAL))
c277df42
IZ
1251 data->flags |= SF_HAS_EVAL;
1252 optimize_curly_tail:
c277df42 1253 if (OP(oscan) != CURLYX) {
22c35a8c 1254 while (PL_regkind[(U8)OP(next = regnext(oscan))] == NOTHING
c277df42
IZ
1255 && NEXT_OFF(next))
1256 NEXT_OFF(oscan) += NEXT_OFF(next);
1257 }
c277df42 1258 continue;
653099ff 1259 default: /* REF and CLUMP only? */
c277df42 1260 if (flags & SCF_DO_SUBSTR) {
830247a4 1261 scan_commit(pRExC_state,data); /* Cannot expect anything... */
c277df42
IZ
1262 data->longest = &(data->longest_float);
1263 }
aca2d497 1264 is_inf = is_inf_internal = 1;
653099ff 1265 if (flags & SCF_DO_STCLASS_OR)
830247a4 1266 cl_anything(pRExC_state, data->start_class);
653099ff 1267 flags &= ~SCF_DO_STCLASS;
c277df42
IZ
1268 break;
1269 }
a0ed51b3 1270 }
ffc61ed2 1271 else if (strchr((char*)PL_simple,OP(scan))) {
9c5ffd7c 1272 int value = 0;
653099ff 1273
c277df42 1274 if (flags & SCF_DO_SUBSTR) {
830247a4 1275 scan_commit(pRExC_state,data);
c277df42
IZ
1276 data->pos_min++;
1277 }
1278 min++;
653099ff
GS
1279 if (flags & SCF_DO_STCLASS) {
1280 data->start_class->flags &= ~ANYOF_EOS; /* No match on empty */
1281
1282 /* Some of the logic below assumes that switching
1283 locale on will only add false positives. */
1284 switch (PL_regkind[(U8)OP(scan)]) {
653099ff 1285 case SANY:
653099ff
GS
1286 default:
1287 do_default:
1288 /* Perl_croak(aTHX_ "panic: unexpected simple REx opcode %d", OP(scan)); */
1289 if (flags & SCF_DO_STCLASS_OR) /* Allow everything */
830247a4 1290 cl_anything(pRExC_state, data->start_class);
653099ff
GS
1291 break;
1292 case REG_ANY:
1293 if (OP(scan) == SANY)
1294 goto do_default;
1295 if (flags & SCF_DO_STCLASS_OR) { /* Everything but \n */
1296 value = (ANYOF_BITMAP_TEST(data->start_class,'\n')
1297 || (data->start_class->flags & ANYOF_CLASS));
830247a4 1298 cl_anything(pRExC_state, data->start_class);
653099ff
GS
1299 }
1300 if (flags & SCF_DO_STCLASS_AND || !value)
1301 ANYOF_BITMAP_CLEAR(data->start_class,'\n');
1302 break;
1303 case ANYOF:
1304 if (flags & SCF_DO_STCLASS_AND)
1305 cl_and(data->start_class,
1306 (struct regnode_charclass_class*)scan);
1307 else
830247a4 1308 cl_or(pRExC_state, data->start_class,
653099ff
GS
1309 (struct regnode_charclass_class*)scan);
1310 break;
1311 case ALNUM:
1312 if (flags & SCF_DO_STCLASS_AND) {
1313 if (!(data->start_class->flags & ANYOF_LOCALE)) {
1314 ANYOF_CLASS_CLEAR(data->start_class,ANYOF_NALNUM);
1315 for (value = 0; value < 256; value++)
1316 if (!isALNUM(value))
1317 ANYOF_BITMAP_CLEAR(data->start_class, value);
1318 }
1319 }
1320 else {
1321 if (data->start_class->flags & ANYOF_LOCALE)
1322 ANYOF_CLASS_SET(data->start_class,ANYOF_ALNUM);
1323 else {
1324 for (value = 0; value < 256; value++)
1325 if (isALNUM(value))
b81d288d 1326 ANYOF_BITMAP_SET(data->start_class, value);
653099ff
GS
1327 }
1328 }
1329 break;
1330 case ALNUML:
1331 if (flags & SCF_DO_STCLASS_AND) {
1332 if (data->start_class->flags & ANYOF_LOCALE)
1333 ANYOF_CLASS_CLEAR(data->start_class,ANYOF_NALNUM);
1334 }
1335 else {
1336 ANYOF_CLASS_SET(data->start_class,ANYOF_ALNUM);
1337 data->start_class->flags |= ANYOF_LOCALE;
1338 }
1339 break;
1340 case NALNUM:
1341 if (flags & SCF_DO_STCLASS_AND) {
1342 if (!(data->start_class->flags & ANYOF_LOCALE)) {
1343 ANYOF_CLASS_CLEAR(data->start_class,ANYOF_ALNUM);
1344 for (value = 0; value < 256; value++)
1345 if (isALNUM(value))
1346 ANYOF_BITMAP_CLEAR(data->start_class, value);
1347 }
1348 }
1349 else {
1350 if (data->start_class->flags & ANYOF_LOCALE)
1351 ANYOF_CLASS_SET(data->start_class,ANYOF_NALNUM);
1352 else {
1353 for (value = 0; value < 256; value++)
1354 if (!isALNUM(value))
b81d288d 1355 ANYOF_BITMAP_SET(data->start_class, value);
653099ff
GS
1356 }
1357 }
1358 break;
1359 case NALNUML:
1360 if (flags & SCF_DO_STCLASS_AND) {
1361 if (data->start_class->flags & ANYOF_LOCALE)
1362 ANYOF_CLASS_CLEAR(data->start_class,ANYOF_ALNUM);
1363 }
1364 else {
1365 data->start_class->flags |= ANYOF_LOCALE;
1366 ANYOF_CLASS_SET(data->start_class,ANYOF_NALNUM);
1367 }
1368 break;
1369 case SPACE:
1370 if (flags & SCF_DO_STCLASS_AND) {
1371 if (!(data->start_class->flags & ANYOF_LOCALE)) {
1372 ANYOF_CLASS_CLEAR(data->start_class,ANYOF_NSPACE);
1373 for (value = 0; value < 256; value++)
1374 if (!isSPACE(value))
1375 ANYOF_BITMAP_CLEAR(data->start_class, value);
1376 }
1377 }
1378 else {
1379 if (data->start_class->flags & ANYOF_LOCALE)
1380 ANYOF_CLASS_SET(data->start_class,ANYOF_SPACE);
1381 else {
1382 for (value = 0; value < 256; value++)
1383 if (isSPACE(value))
b81d288d 1384 ANYOF_BITMAP_SET(data->start_class, value);
653099ff
GS
1385 }
1386 }
1387 break;
1388 case SPACEL:
1389 if (flags & SCF_DO_STCLASS_AND) {
1390 if (data->start_class->flags & ANYOF_LOCALE)
1391 ANYOF_CLASS_CLEAR(data->start_class,ANYOF_NSPACE);
1392 }
1393 else {
1394 data->start_class->flags |= ANYOF_LOCALE;
1395 ANYOF_CLASS_SET(data->start_class,ANYOF_SPACE);
1396 }
1397 break;
1398 case NSPACE:
1399 if (flags & SCF_DO_STCLASS_AND) {
1400 if (!(data->start_class->flags & ANYOF_LOCALE)) {
1401 ANYOF_CLASS_CLEAR(data->start_class,ANYOF_SPACE);
1402 for (value = 0; value < 256; value++)
1403 if (isSPACE(value))
1404 ANYOF_BITMAP_CLEAR(data->start_class, value);
1405 }
1406 }
1407 else {
1408 if (data->start_class->flags & ANYOF_LOCALE)
1409 ANYOF_CLASS_SET(data->start_class,ANYOF_NSPACE);
1410 else {
1411 for (value = 0; value < 256; value++)
1412 if (!isSPACE(value))
b81d288d 1413 ANYOF_BITMAP_SET(data->start_class, value);
653099ff
GS
1414 }
1415 }
1416 break;
1417 case NSPACEL:
1418 if (flags & SCF_DO_STCLASS_AND) {
1419 if (data->start_class->flags & ANYOF_LOCALE) {
1420 ANYOF_CLASS_CLEAR(data->start_class,ANYOF_SPACE);
1421 for (value = 0; value < 256; value++)
1422 if (!isSPACE(value))
1423 ANYOF_BITMAP_CLEAR(data->start_class, value);
1424 }
1425 }
1426 else {
1427 data->start_class->flags |= ANYOF_LOCALE;
1428 ANYOF_CLASS_SET(data->start_class,ANYOF_NSPACE);
1429 }
1430 break;
1431 case DIGIT:
1432 if (flags & SCF_DO_STCLASS_AND) {
1433 ANYOF_CLASS_CLEAR(data->start_class,ANYOF_NDIGIT);
1434 for (value = 0; value < 256; value++)
1435 if (!isDIGIT(value))
1436 ANYOF_BITMAP_CLEAR(data->start_class, value);
1437 }
1438 else {
1439 if (data->start_class->flags & ANYOF_LOCALE)
1440 ANYOF_CLASS_SET(data->start_class,ANYOF_DIGIT);
1441 else {
1442 for (value = 0; value < 256; value++)
1443 if (isDIGIT(value))
b81d288d 1444 ANYOF_BITMAP_SET(data->start_class, value);
653099ff
GS
1445 }
1446 }
1447 break;
1448 case NDIGIT:
1449 if (flags & SCF_DO_STCLASS_AND) {
1450 ANYOF_CLASS_CLEAR(data->start_class,ANYOF_DIGIT);
1451 for (value = 0; value < 256; value++)
1452 if (isDIGIT(value))
1453 ANYOF_BITMAP_CLEAR(data->start_class, value);
1454 }
1455 else {
1456 if (data->start_class->flags & ANYOF_LOCALE)
1457 ANYOF_CLASS_SET(data->start_class,ANYOF_NDIGIT);
1458 else {
1459 for (value = 0; value < 256; value++)
1460 if (!isDIGIT(value))
b81d288d 1461 ANYOF_BITMAP_SET(data->start_class, value);
653099ff
GS
1462 }
1463 }
1464 break;
1465 }
1466 if (flags & SCF_DO_STCLASS_OR)
1467 cl_and(data->start_class, &and_with);
1468 flags &= ~SCF_DO_STCLASS;
1469 }
a0ed51b3 1470 }
22c35a8c 1471 else if (PL_regkind[(U8)OP(scan)] == EOL && flags & SCF_DO_SUBSTR) {
c277df42
IZ
1472 data->flags |= (OP(scan) == MEOL
1473 ? SF_BEFORE_MEOL
1474 : SF_BEFORE_SEOL);
a0ed51b3 1475 }
653099ff
GS
1476 else if ( PL_regkind[(U8)OP(scan)] == BRANCHJ
1477 /* Lookbehind, or need to calculate parens/evals/stclass: */
1478 && (scan->flags || data || (flags & SCF_DO_STCLASS))
c277df42 1479 && (OP(scan) == IFMATCH || OP(scan) == UNLESSM)) {
653099ff 1480 /* Lookahead/lookbehind */
cb434fcc 1481 I32 deltanext, minnext, fake = 0;
c277df42 1482 regnode *nscan;
653099ff
GS
1483 struct regnode_charclass_class intrnl;
1484 int f = 0;
c277df42
IZ
1485
1486 data_fake.flags = 0;
b81d288d 1487 if (data) {
2c2d71f5 1488 data_fake.whilem_c = data->whilem_c;
cb434fcc
IZ
1489 data_fake.last_closep = data->last_closep;
1490 }
1491 else
1492 data_fake.last_closep = &fake;
653099ff
GS
1493 if ( flags & SCF_DO_STCLASS && !scan->flags
1494 && OP(scan) == IFMATCH ) { /* Lookahead */
830247a4 1495 cl_init(pRExC_state, &intrnl);
653099ff 1496 data_fake.start_class = &intrnl;
e1901655 1497 f |= SCF_DO_STCLASS_AND;
653099ff 1498 }
e1901655
IZ
1499 if (flags & SCF_WHILEM_VISITED_POS)
1500 f |= SCF_WHILEM_VISITED_POS;
c277df42
IZ
1501 next = regnext(scan);
1502 nscan = NEXTOPER(NEXTOPER(scan));
830247a4 1503 minnext = study_chunk(pRExC_state, &nscan, &deltanext, last, &data_fake, f);
c277df42
IZ
1504 if (scan->flags) {
1505 if (deltanext) {
9baa0206 1506 vFAIL("Variable length lookbehind not implemented");
a0ed51b3
LW
1507 }
1508 else if (minnext > U8_MAX) {
9baa0206 1509 vFAIL2("Lookbehind longer than %"UVuf" not implemented", (UV)U8_MAX);
c277df42
IZ
1510 }
1511 scan->flags = minnext;
1512 }
1513 if (data && data_fake.flags & (SF_HAS_PAR|SF_IN_PAR))
1514 pars++;
405ff068 1515 if (data && (data_fake.flags & SF_HAS_EVAL))
c277df42 1516 data->flags |= SF_HAS_EVAL;
2c2d71f5
JH
1517 if (data)
1518 data->whilem_c = data_fake.whilem_c;
e1901655 1519 if (f & SCF_DO_STCLASS_AND) {
653099ff
GS
1520 int was = (data->start_class->flags & ANYOF_EOS);
1521
1522 cl_and(data->start_class, &intrnl);
1523 if (was)
1524 data->start_class->flags |= ANYOF_EOS;
1525 }
a0ed51b3
LW
1526 }
1527 else if (OP(scan) == OPEN) {
c277df42 1528 pars++;
a0ed51b3 1529 }
cb434fcc
IZ
1530 else if (OP(scan) == CLOSE) {
1531 if (ARG(scan) == is_par) {
1532 next = regnext(scan);
c277df42 1533
cb434fcc
IZ
1534 if ( next && (OP(next) != WHILEM) && next < last)
1535 is_par = 0; /* Disable optimization */
1536 }
1537 if (data)
1538 *(data->last_closep) = ARG(scan);
a0ed51b3
LW
1539 }
1540 else if (OP(scan) == EVAL) {
c277df42
IZ
1541 if (data)
1542 data->flags |= SF_HAS_EVAL;
1543 }
96776eda 1544 else if (OP(scan) == LOGICAL && scan->flags == 2) { /* Embedded follows */
0f5d15d6 1545 if (flags & SCF_DO_SUBSTR) {
830247a4 1546 scan_commit(pRExC_state,data);
0f5d15d6
IZ
1547 data->longest = &(data->longest_float);
1548 }
1549 is_inf = is_inf_internal = 1;
653099ff 1550 if (flags & SCF_DO_STCLASS_OR) /* Allow everything */
830247a4 1551 cl_anything(pRExC_state, data->start_class);
96776eda 1552 flags &= ~SCF_DO_STCLASS;
0f5d15d6 1553 }
c277df42
IZ
1554 /* Else: zero-length, ignore. */
1555 scan = regnext(scan);
1556 }
1557
1558 finish:
1559 *scanp = scan;
aca2d497 1560 *deltap = is_inf_internal ? I32_MAX : delta;
b81d288d 1561 if (flags & SCF_DO_SUBSTR && is_inf)
c277df42
IZ
1562 data->pos_delta = I32_MAX - data->pos_min;
1563 if (is_par > U8_MAX)
1564 is_par = 0;
1565 if (is_par && pars==1 && data) {
1566 data->flags |= SF_IN_PAR;
1567 data->flags &= ~SF_HAS_PAR;
a0ed51b3
LW
1568 }
1569 else if (pars && data) {
c277df42
IZ
1570 data->flags |= SF_HAS_PAR;
1571 data->flags &= ~SF_IN_PAR;
1572 }
653099ff
GS
1573 if (flags & SCF_DO_STCLASS_OR)
1574 cl_and(data->start_class, &and_with);
c277df42
IZ
1575 return min;
1576}
1577
76e3520e 1578STATIC I32
830247a4 1579S_add_data(pTHX_ RExC_state_t *pRExC_state, I32 n, char *s)
c277df42 1580{
830247a4 1581 if (RExC_rx->data) {
b81d288d
AB
1582 Renewc(RExC_rx->data,
1583 sizeof(*RExC_rx->data) + sizeof(void*) * (RExC_rx->data->count + n - 1),
c277df42 1584 char, struct reg_data);
830247a4
IZ
1585 Renew(RExC_rx->data->what, RExC_rx->data->count + n, U8);
1586 RExC_rx->data->count += n;
a0ed51b3
LW
1587 }
1588 else {
830247a4 1589 Newc(1207, RExC_rx->data, sizeof(*RExC_rx->data) + sizeof(void*) * (n - 1),
c277df42 1590 char, struct reg_data);
830247a4
IZ
1591 New(1208, RExC_rx->data->what, n, U8);
1592 RExC_rx->data->count = n;
c277df42 1593 }
830247a4
IZ
1594 Copy(s, RExC_rx->data->what + RExC_rx->data->count - n, n, U8);
1595 return RExC_rx->data->count - n;
c277df42
IZ
1596}
1597
d88dccdf 1598void
864dbfa3 1599Perl_reginitcolors(pTHX)
d88dccdf 1600{
d88dccdf
IZ
1601 int i = 0;
1602 char *s = PerlEnv_getenv("PERL_RE_COLORS");
b81d288d 1603
d88dccdf
IZ
1604 if (s) {
1605 PL_colors[0] = s = savepv(s);
1606 while (++i < 6) {
1607 s = strchr(s, '\t');
1608 if (s) {
1609 *s = '\0';
1610 PL_colors[i] = ++s;
1611 }
1612 else
c712d376 1613 PL_colors[i] = s = "";
d88dccdf
IZ
1614 }
1615 } else {
b81d288d 1616 while (i < 6)
d88dccdf
IZ
1617 PL_colors[i++] = "";
1618 }
1619 PL_colorset = 1;
1620}
1621
8615cb43 1622
a687059c 1623/*
e50aee73 1624 - pregcomp - compile a regular expression into internal code
a687059c
LW
1625 *
1626 * We can't allocate space until we know how big the compiled form will be,
1627 * but we can't compile it (and thus know how big it is) until we've got a
1628 * place to put the code. So we cheat: we compile it twice, once with code
1629 * generation turned off and size counting turned on, and once "for real".
1630 * This also means that we don't allocate space until we are sure that the
1631 * thing really will compile successfully, and we never have to move the
1632 * code and thus invalidate pointers into it. (Note that it has to be in
1633 * one piece because free() must be able to free it all.) [NB: not true in perl]
1634 *
1635 * Beware that the optimization-preparation code in here knows about some
1636 * of the structure of the compiled regexp. [I'll say.]
1637 */
1638regexp *
864dbfa3 1639Perl_pregcomp(pTHX_ char *exp, char *xend, PMOP *pm)
a687059c 1640{
a0d0e21e 1641 register regexp *r;
c277df42 1642 regnode *scan;
c277df42 1643 regnode *first;
a0d0e21e 1644 I32 flags;
a0d0e21e
LW
1645 I32 minlen = 0;
1646 I32 sawplus = 0;
1647 I32 sawopen = 0;
2c2d71f5 1648 scan_data_t data;
830247a4
IZ
1649 RExC_state_t RExC_state;
1650 RExC_state_t *pRExC_state = &RExC_state;
a0d0e21e
LW
1651
1652 if (exp == NULL)
c277df42 1653 FAIL("NULL regexp argument");
a0d0e21e 1654
830247a4 1655 /* XXXX This looks very suspicious... */
1aa99e6b
IH
1656 if (pm->op_pmdynflags & PMdf_CMP_UTF8)
1657 RExC_utf8 = 1;
a0ed51b3 1658 else
1aa99e6b 1659 RExC_utf8 = 0;
a0ed51b3 1660
5cfc7842 1661 RExC_precomp = exp;
35ef4773 1662 DEBUG_r(if (!PL_colorset) reginitcolors());
2c2d71f5 1663 DEBUG_r(PerlIO_printf(Perl_debug_log, "%sCompiling REx%s `%s%*s%s'\n",
d88dccdf 1664 PL_colors[4],PL_colors[5],PL_colors[0],
830247a4
IZ
1665 (int)(xend - exp), RExC_precomp, PL_colors[1]));
1666 RExC_flags16 = pm->op_pmflags;
1667 RExC_sawback = 0;
bbce6d69 1668
830247a4
IZ
1669 RExC_seen = 0;
1670 RExC_seen_zerolen = *exp == '^' ? -1 : 0;
1671 RExC_seen_evals = 0;
1672 RExC_extralen = 0;
c277df42 1673
bbce6d69 1674 /* First pass: determine size, legality. */
830247a4 1675 RExC_parse = exp;
fac92740 1676 RExC_start = exp;
830247a4
IZ
1677 RExC_end = xend;
1678 RExC_naughty = 0;
1679 RExC_npar = 1;
1680 RExC_size = 0L;
1681 RExC_emit = &PL_regdummy;
1682 RExC_whilem_seen = 0;
85ddcde9
JH
1683#if 0 /* REGC() is (currently) a NOP at the first pass.
1684 * Clever compilers notice this and complain. --jhi */
830247a4 1685 REGC((U8)REG_MAGIC, (char*)RExC_emit);
85ddcde9 1686#endif
830247a4 1687 if (reg(pRExC_state, 0, &flags) == NULL) {
830247a4 1688 RExC_precomp = Nullch;
a0d0e21e
LW
1689 return(NULL);
1690 }
830247a4 1691 DEBUG_r(PerlIO_printf(Perl_debug_log, "size %"IVdf" ", (IV)RExC_size));
c277df42 1692
c277df42
IZ
1693 /* Small enough for pointer-storage convention?
1694 If extralen==0, this means that we will not need long jumps. */
830247a4
IZ
1695 if (RExC_size >= 0x10000L && RExC_extralen)
1696 RExC_size += RExC_extralen;
c277df42 1697 else
830247a4
IZ
1698 RExC_extralen = 0;
1699 if (RExC_whilem_seen > 15)
1700 RExC_whilem_seen = 15;
a0d0e21e 1701
bbce6d69 1702 /* Allocate space and initialize. */
830247a4 1703 Newc(1001, r, sizeof(regexp) + (unsigned)RExC_size * sizeof(regnode),
c277df42 1704 char, regexp);
a0d0e21e 1705 if (r == NULL)
b45f050a
JF
1706 FAIL("Regexp out of space");
1707
0f79a09d
GS
1708#ifdef DEBUGGING
1709 /* avoid reading uninitialized memory in DEBUGGING code in study_chunk() */
830247a4 1710 Zero(r, sizeof(regexp) + (unsigned)RExC_size * sizeof(regnode), char);
0f79a09d 1711#endif
c277df42 1712 r->refcnt = 1;
bbce6d69 1713 r->prelen = xend - exp;
5cfc7842 1714 r->precomp = savepvn(RExC_precomp, r->prelen);
cf93c79d
IZ
1715 r->subbeg = NULL;
1716 r->reganch = pm->op_pmflags & PMf_COMPILETIME;
830247a4 1717 r->nparens = RExC_npar - 1; /* set early to validate backrefs */
4327152a
IZ
1718
1719 r->substrs = 0; /* Useful during FAIL. */
1720 r->startp = 0; /* Useful during FAIL. */
1721 r->endp = 0; /* Useful during FAIL. */
1722
fac92740
MJD
1723 Newz(1304, r->offsets, 2*RExC_size+1, U32); /* MJD 20001228 */
1724 if (r->offsets) {
1725 r->offsets[0] = RExC_size;
1726 }
1727 DEBUG_r(PerlIO_printf(Perl_debug_log,
392fbf5d 1728 "%s %"UVuf" bytes for offset annotations.\n",
fac92740 1729 r->offsets ? "Got" : "Couldn't get",
392fbf5d 1730 (UV)((2*RExC_size+1) * sizeof(U32))));
fac92740 1731
830247a4 1732 RExC_rx = r;
bbce6d69 1733
1734 /* Second pass: emit code. */
830247a4
IZ
1735 RExC_parse = exp;
1736 RExC_end = xend;
1737 RExC_naughty = 0;
1738 RExC_npar = 1;
fac92740 1739 RExC_emit_start = r->program;
830247a4 1740 RExC_emit = r->program;
2cd61cdb 1741 /* Store the count of eval-groups for security checks: */
830247a4
IZ
1742 RExC_emit->next_off = ((RExC_seen_evals > U16_MAX) ? U16_MAX : RExC_seen_evals);
1743 REGC((U8)REG_MAGIC, (char*) RExC_emit++);
c277df42 1744 r->data = 0;
830247a4 1745 if (reg(pRExC_state, 0, &flags) == NULL)
a0d0e21e
LW
1746 return(NULL);
1747
1748 /* Dig out information for optimizations. */
cf93c79d 1749 r->reganch = pm->op_pmflags & PMf_COMPILETIME; /* Again? */
830247a4 1750 pm->op_pmflags = RExC_flags16;
a0ed51b3
LW
1751 if (UTF)
1752 r->reganch |= ROPT_UTF8;
c277df42 1753 r->regstclass = NULL;
830247a4 1754 if (RExC_naughty >= 10) /* Probably an expensive pattern. */
a0ed51b3 1755 r->reganch |= ROPT_NAUGHTY;
c277df42 1756 scan = r->program + 1; /* First BRANCH. */
2779dcf1
IZ
1757
1758 /* XXXX To minimize changes to RE engine we always allocate
1759 3-units-long substrs field. */
1760 Newz(1004, r->substrs, 1, struct reg_substr_data);
1761
2c2d71f5 1762 StructCopy(&zero_scan_data, &data, scan_data_t);
653099ff 1763 /* XXXX Should not we check for something else? Usually it is OPEN1... */
c277df42 1764 if (OP(scan) != BRANCH) { /* Only one top-level choice. */
c277df42 1765 I32 fake;
c5254dd6 1766 STRLEN longest_float_length, longest_fixed_length;
653099ff
GS
1767 struct regnode_charclass_class ch_class;
1768 int stclass_flag;
cb434fcc 1769 I32 last_close = 0;
a0d0e21e
LW
1770
1771 first = scan;
c277df42 1772 /* Skip introductions and multiplicators >= 1. */
a0d0e21e 1773 while ((OP(first) == OPEN && (sawopen = 1)) ||
653099ff 1774 /* An OR of *one* alternative - should not happen now. */
a0d0e21e
LW
1775 (OP(first) == BRANCH && OP(regnext(first)) != BRANCH) ||
1776 (OP(first) == PLUS) ||
1777 (OP(first) == MINMOD) ||
653099ff 1778 /* An {n,m} with n>0 */
22c35a8c 1779 (PL_regkind[(U8)OP(first)] == CURLY && ARG1(first) > 0) ) {
a0d0e21e
LW
1780 if (OP(first) == PLUS)
1781 sawplus = 1;
1782 else
1783 first += regarglen[(U8)OP(first)];
1784 first = NEXTOPER(first);
a687059c
LW
1785 }
1786
a0d0e21e
LW
1787 /* Starting-point info. */
1788 again:
653099ff 1789 if (PL_regkind[(U8)OP(first)] == EXACT) {
1aa99e6b
IH
1790 if (OP(first) == EXACT)
1791 ; /* Empty, get anchored substr later. */
1792 else if ((OP(first) == EXACTF || OP(first) == EXACTFL))
b3c9acc1
IZ
1793 r->regstclass = first;
1794 }
653099ff 1795 else if (strchr((char*)PL_simple,OP(first)))
a0d0e21e 1796 r->regstclass = first;
22c35a8c
GS
1797 else if (PL_regkind[(U8)OP(first)] == BOUND ||
1798 PL_regkind[(U8)OP(first)] == NBOUND)
a0d0e21e 1799 r->regstclass = first;
22c35a8c 1800 else if (PL_regkind[(U8)OP(first)] == BOL) {
cad2e5aa
JH
1801 r->reganch |= (OP(first) == MBOL
1802 ? ROPT_ANCH_MBOL
1803 : (OP(first) == SBOL
1804 ? ROPT_ANCH_SBOL
1805 : ROPT_ANCH_BOL));
a0d0e21e 1806 first = NEXTOPER(first);
774d564b 1807 goto again;
1808 }
1809 else if (OP(first) == GPOS) {
1810 r->reganch |= ROPT_ANCH_GPOS;
1811 first = NEXTOPER(first);
1812 goto again;
a0d0e21e
LW
1813 }
1814 else if ((OP(first) == STAR &&
22c35a8c 1815 PL_regkind[(U8)OP(NEXTOPER(first))] == REG_ANY) &&
a0d0e21e
LW
1816 !(r->reganch & ROPT_ANCH) )
1817 {
1818 /* turn .* into ^.* with an implied $*=1 */
cad2e5aa
JH
1819 int type = OP(NEXTOPER(first));
1820
ffc61ed2 1821 if (type == REG_ANY)
cad2e5aa
JH
1822 type = ROPT_ANCH_MBOL;
1823 else
1824 type = ROPT_ANCH_SBOL;
1825
1826 r->reganch |= type | ROPT_IMPLICIT;
a0d0e21e 1827 first = NEXTOPER(first);
774d564b 1828 goto again;
a0d0e21e 1829 }
b81d288d 1830 if (sawplus && (!sawopen || !RExC_sawback)
830247a4 1831 && !(RExC_seen & REG_SEEN_EVAL)) /* May examine pos and $& */
cad2e5aa
JH
1832 /* x+ must match at the 1st pos of run of x's */
1833 r->reganch |= ROPT_SKIP;
a0d0e21e 1834
c277df42 1835 /* Scan is after the zeroth branch, first is atomic matcher. */
b81d288d 1836 DEBUG_r(PerlIO_printf(Perl_debug_log, "first at %"IVdf"\n",
b900a521 1837 (IV)(first - scan + 1)));
a0d0e21e
LW
1838 /*
1839 * If there's something expensive in the r.e., find the
1840 * longest literal string that must appear and make it the
1841 * regmust. Resolve ties in favor of later strings, since
1842 * the regstart check works with the beginning of the r.e.
1843 * and avoiding duplication strengthens checking. Not a
1844 * strong reason, but sufficient in the absence of others.
1845 * [Now we resolve ties in favor of the earlier string if
c277df42 1846 * it happens that c_offset_min has been invalidated, since the
a0d0e21e
LW
1847 * earlier string may buy us something the later one won't.]
1848 */
a0d0e21e 1849 minlen = 0;
a687059c 1850
79cb57f6
GS
1851 data.longest_fixed = newSVpvn("",0);
1852 data.longest_float = newSVpvn("",0);
1853 data.last_found = newSVpvn("",0);
c277df42
IZ
1854 data.longest = &(data.longest_fixed);
1855 first = scan;
653099ff 1856 if (!r->regstclass) {
830247a4 1857 cl_init(pRExC_state, &ch_class);
653099ff
GS
1858 data.start_class = &ch_class;
1859 stclass_flag = SCF_DO_STCLASS_AND;
1860 } else /* XXXX Check for BOUND? */
1861 stclass_flag = 0;
cb434fcc 1862 data.last_closep = &last_close;
653099ff 1863
830247a4 1864 minlen = study_chunk(pRExC_state, &first, &fake, scan + RExC_size, /* Up to end */
e1901655 1865 &data, SCF_DO_SUBSTR | SCF_WHILEM_VISITED_POS | stclass_flag);
830247a4 1866 if ( RExC_npar == 1 && data.longest == &(data.longest_fixed)
b81d288d 1867 && data.last_start_min == 0 && data.last_end > 0
830247a4
IZ
1868 && !RExC_seen_zerolen
1869 && (!(RExC_seen & REG_SEEN_GPOS) || (r->reganch & ROPT_ANCH_GPOS)))
c277df42 1870 r->reganch |= ROPT_CHECK_ALL;
830247a4 1871 scan_commit(pRExC_state, &data);
c277df42
IZ
1872 SvREFCNT_dec(data.last_found);
1873
a0ed51b3 1874 longest_float_length = CHR_SVLEN(data.longest_float);
c5254dd6 1875 if (longest_float_length
c277df42
IZ
1876 || (data.flags & SF_FL_BEFORE_EOL
1877 && (!(data.flags & SF_FL_BEFORE_MEOL)
830247a4 1878 || (RExC_flags16 & PMf_MULTILINE)))) {
cf93c79d
IZ
1879 int t;
1880
a0ed51b3 1881 if (SvCUR(data.longest_fixed) /* ok to leave SvCUR */
aca2d497
IZ
1882 && data.offset_fixed == data.offset_float_min
1883 && SvCUR(data.longest_fixed) == SvCUR(data.longest_float))
1884 goto remove_float; /* As in (a)+. */
1885
c277df42
IZ
1886 r->float_substr = data.longest_float;
1887 r->float_min_offset = data.offset_float_min;
1888 r->float_max_offset = data.offset_float_max;
cf93c79d
IZ
1889 t = (data.flags & SF_FL_BEFORE_EOL /* Can't have SEOL and MULTI */
1890 && (!(data.flags & SF_FL_BEFORE_MEOL)
830247a4 1891 || (RExC_flags16 & PMf_MULTILINE)));
cf93c79d 1892 fbm_compile(r->float_substr, t ? FBMcf_TAIL : 0);
a0ed51b3
LW
1893 }
1894 else {
aca2d497 1895 remove_float:
c277df42
IZ
1896 r->float_substr = Nullsv;
1897 SvREFCNT_dec(data.longest_float);
c5254dd6 1898 longest_float_length = 0;
a0d0e21e 1899 }
c277df42 1900
a0ed51b3 1901 longest_fixed_length = CHR_SVLEN(data.longest_fixed);
c5254dd6 1902 if (longest_fixed_length
c277df42
IZ
1903 || (data.flags & SF_FIX_BEFORE_EOL /* Cannot have SEOL and MULTI */
1904 && (!(data.flags & SF_FIX_BEFORE_MEOL)
830247a4 1905 || (RExC_flags16 & PMf_MULTILINE)))) {
cf93c79d
IZ
1906 int t;
1907
c277df42
IZ
1908 r->anchored_substr = data.longest_fixed;
1909 r->anchored_offset = data.offset_fixed;
cf93c79d
IZ
1910 t = (data.flags & SF_FIX_BEFORE_EOL /* Can't have SEOL and MULTI */
1911 && (!(data.flags & SF_FIX_BEFORE_MEOL)
830247a4 1912 || (RExC_flags16 & PMf_MULTILINE)));
cf93c79d 1913 fbm_compile(r->anchored_substr, t ? FBMcf_TAIL : 0);
a0ed51b3
LW
1914 }
1915 else {
c277df42
IZ
1916 r->anchored_substr = Nullsv;
1917 SvREFCNT_dec(data.longest_fixed);
c5254dd6 1918 longest_fixed_length = 0;
a0d0e21e 1919 }
b81d288d 1920 if (r->regstclass
ffc61ed2 1921 && (OP(r->regstclass) == REG_ANY || OP(r->regstclass) == SANY))
653099ff
GS
1922 r->regstclass = NULL;
1923 if ((!r->anchored_substr || r->anchored_offset) && stclass_flag
1924 && !(data.start_class->flags & ANYOF_EOS)
1925 && !cl_is_anything(data.start_class)) {
830247a4 1926 I32 n = add_data(pRExC_state, 1, "f");
653099ff 1927
b81d288d 1928 New(1006, RExC_rx->data->data[n], 1,
653099ff
GS
1929 struct regnode_charclass_class);
1930 StructCopy(data.start_class,
830247a4 1931 (struct regnode_charclass_class*)RExC_rx->data->data[n],
653099ff 1932 struct regnode_charclass_class);
830247a4 1933 r->regstclass = (regnode*)RExC_rx->data->data[n];
653099ff 1934 r->reganch &= ~ROPT_SKIP; /* Used in find_byclass(). */
ffc61ed2 1935 PL_regdata = r->data; /* for regprop() */
9c5ffd7c
JH
1936 DEBUG_r({ SV *sv = sv_newmortal();
1937 regprop(sv, (regnode*)data.start_class);
1938 PerlIO_printf(Perl_debug_log,
1939 "synthetic stclass `%s'.\n",
1940 SvPVX(sv));});
653099ff 1941 }
c277df42
IZ
1942
1943 /* A temporary algorithm prefers floated substr to fixed one to dig more info. */
c5254dd6 1944 if (longest_fixed_length > longest_float_length) {
c277df42
IZ
1945 r->check_substr = r->anchored_substr;
1946 r->check_offset_min = r->check_offset_max = r->anchored_offset;
1947 if (r->reganch & ROPT_ANCH_SINGLE)
1948 r->reganch |= ROPT_NOSCAN;
a0ed51b3
LW
1949 }
1950 else {
c277df42
IZ
1951 r->check_substr = r->float_substr;
1952 r->check_offset_min = data.offset_float_min;
1953 r->check_offset_max = data.offset_float_max;
a0d0e21e 1954 }
30382c73
IZ
1955 /* XXXX Currently intuiting is not compatible with ANCH_GPOS.
1956 This should be changed ASAP! */
1957 if (r->check_substr && !(r->reganch & ROPT_ANCH_GPOS)) {
cad2e5aa
JH
1958 r->reganch |= RE_USE_INTUIT;
1959 if (SvTAIL(r->check_substr))
1960 r->reganch |= RE_INTUIT_TAIL;
1961 }
a0ed51b3
LW
1962 }
1963 else {
c277df42
IZ
1964 /* Several toplevels. Best we can is to set minlen. */
1965 I32 fake;
653099ff 1966 struct regnode_charclass_class ch_class;
cb434fcc 1967 I32 last_close = 0;
c277df42
IZ
1968
1969 DEBUG_r(PerlIO_printf(Perl_debug_log, "\n"));
1970 scan = r->program + 1;
830247a4 1971 cl_init(pRExC_state, &ch_class);
653099ff 1972 data.start_class = &ch_class;
cb434fcc 1973 data.last_closep = &last_close;
e1901655 1974 minlen = study_chunk(pRExC_state, &scan, &fake, scan + RExC_size, &data, SCF_DO_STCLASS_AND|SCF_WHILEM_VISITED_POS);
c277df42 1975 r->check_substr = r->anchored_substr = r->float_substr = Nullsv;
653099ff
GS
1976 if (!(data.start_class->flags & ANYOF_EOS)
1977 && !cl_is_anything(data.start_class)) {
830247a4 1978 I32 n = add_data(pRExC_state, 1, "f");
653099ff 1979
b81d288d 1980 New(1006, RExC_rx->data->data[n], 1,
653099ff
GS
1981 struct regnode_charclass_class);
1982 StructCopy(data.start_class,
830247a4 1983 (struct regnode_charclass_class*)RExC_rx->data->data[n],
653099ff 1984 struct regnode_charclass_class);
830247a4 1985 r->regstclass = (regnode*)RExC_rx->data->data[n];
653099ff 1986 r->reganch &= ~ROPT_SKIP; /* Used in find_byclass(). */
9c5ffd7c
JH
1987 DEBUG_r({ SV* sv = sv_newmortal();
1988 regprop(sv, (regnode*)data.start_class);
1989 PerlIO_printf(Perl_debug_log,
1990 "synthetic stclass `%s'.\n",
1991 SvPVX(sv));});
653099ff 1992 }
a0d0e21e
LW
1993 }
1994
a0d0e21e 1995 r->minlen = minlen;
b81d288d 1996 if (RExC_seen & REG_SEEN_GPOS)
c277df42 1997 r->reganch |= ROPT_GPOS_SEEN;
830247a4 1998 if (RExC_seen & REG_SEEN_LOOKBEHIND)
c277df42 1999 r->reganch |= ROPT_LOOKBEHIND_SEEN;
830247a4 2000 if (RExC_seen & REG_SEEN_EVAL)
ce862d02 2001 r->reganch |= ROPT_EVAL_SEEN;
3baa4c62
JH
2002 if (RExC_seen & REG_SEEN_SANY)
2003 r->reganch |= ROPT_SANY_SEEN;
830247a4
IZ
2004 Newz(1002, r->startp, RExC_npar, I32);
2005 Newz(1002, r->endp, RExC_npar, I32);
ffc61ed2 2006 PL_regdata = r->data; /* for regprop() */
a0d0e21e
LW
2007 DEBUG_r(regdump(r));
2008 return(r);
a687059c
LW
2009}
2010
2011/*
2012 - reg - regular expression, i.e. main body or parenthesized thing
2013 *
2014 * Caller must absorb opening parenthesis.
2015 *
2016 * Combining parenthesis handling with the base level of regular expression
2017 * is a trifle forced, but the need to tie the tails of the branches to what
2018 * follows makes it hard to avoid.
2019 */
76e3520e 2020STATIC regnode *
830247a4 2021S_reg(pTHX_ RExC_state_t *pRExC_state, I32 paren, I32 *flagp)
c277df42 2022 /* paren: Parenthesized? 0=top, 1=(, inside: changed to letter. */
a687059c 2023{
c277df42
IZ
2024 register regnode *ret; /* Will be the head of the group. */
2025 register regnode *br;
2026 register regnode *lastbr;
2027 register regnode *ender = 0;
a0d0e21e 2028 register I32 parno = 0;
830247a4 2029 I32 flags, oregflags = RExC_flags16, have_branch = 0, open = 0;
9d1d55b5
JP
2030
2031 /* for (?g), (?gc), and (?o) warnings; warning
2032 about (?c) will warn about (?g) -- japhy */
2033
2034 I32 wastedflags = 0x00,
2035 wasted_o = 0x01,
2036 wasted_g = 0x02,
2037 wasted_gc = 0x02 | 0x04,
2038 wasted_c = 0x04;
2039
fac92740 2040 char * parse_start = RExC_parse; /* MJD */
830247a4 2041 char *oregcomp_parse = RExC_parse;
c277df42 2042 char c;
a0d0e21e 2043
821b33a5 2044 *flagp = 0; /* Tentatively. */
a0d0e21e 2045
9d1d55b5 2046
a0d0e21e
LW
2047 /* Make an OPEN node, if parenthesized. */
2048 if (paren) {
fac92740 2049 if (*RExC_parse == '?') { /* (?...) */
ca9dfc88
IZ
2050 U16 posflags = 0, negflags = 0;
2051 U16 *flagsp = &posflags;
0f5d15d6 2052 int logical = 0;
830247a4 2053 char *seqstart = RExC_parse;
ca9dfc88 2054
830247a4
IZ
2055 RExC_parse++;
2056 paren = *RExC_parse++;
c277df42 2057 ret = NULL; /* For look-ahead/behind. */
a0d0e21e 2058 switch (paren) {
fac92740 2059 case '<': /* (?<...) */
830247a4 2060 RExC_seen |= REG_SEEN_LOOKBEHIND;
b81d288d 2061 if (*RExC_parse == '!')
c277df42 2062 paren = ',';
b81d288d 2063 if (*RExC_parse != '=' && *RExC_parse != '!')
c277df42 2064 goto unknown;
830247a4 2065 RExC_parse++;
fac92740
MJD
2066 case '=': /* (?=...) */
2067 case '!': /* (?!...) */
830247a4 2068 RExC_seen_zerolen++;
fac92740
MJD
2069 case ':': /* (?:...) */
2070 case '>': /* (?>...) */
a0d0e21e 2071 break;
fac92740
MJD
2072 case '$': /* (?$...) */
2073 case '@': /* (?@...) */
8615cb43 2074 vFAIL2("Sequence (?%c...) not implemented", (int)paren);
a0d0e21e 2075 break;
fac92740 2076 case '#': /* (?#...) */
830247a4
IZ
2077 while (*RExC_parse && *RExC_parse != ')')
2078 RExC_parse++;
2079 if (*RExC_parse != ')')
c277df42 2080 FAIL("Sequence (?#... not terminated");
830247a4 2081 nextchar(pRExC_state);
a0d0e21e
LW
2082 *flagp = TRYAGAIN;
2083 return NULL;
fac92740 2084 case 'p': /* (?p...) */
d7e9e385 2085 if (SIZE_ONLY)
830247a4 2086 vWARN(RExC_parse, "(?p{}) is deprecated - use (??{})");
8c8ad484 2087 /* FALL THROUGH*/
fac92740 2088 case '?': /* (??...) */
0f5d15d6 2089 logical = 1;
830247a4 2090 paren = *RExC_parse++;
0f5d15d6 2091 /* FALL THROUGH */
fac92740 2092 case '{': /* (?{...}) */
c277df42 2093 {
c277df42
IZ
2094 I32 count = 1, n = 0;
2095 char c;
830247a4 2096 char *s = RExC_parse;
c277df42
IZ
2097 SV *sv;
2098 OP_4tree *sop, *rop;
2099
830247a4
IZ
2100 RExC_seen_zerolen++;
2101 RExC_seen |= REG_SEEN_EVAL;
2102 while (count && (c = *RExC_parse)) {
2103 if (c == '\\' && RExC_parse[1])
2104 RExC_parse++;
b81d288d 2105 else if (c == '{')
c277df42 2106 count++;
b81d288d 2107 else if (c == '}')
c277df42 2108 count--;
830247a4 2109 RExC_parse++;
c277df42 2110 }
830247a4 2111 if (*RExC_parse != ')')
b45f050a 2112 {
b81d288d 2113 RExC_parse = s;
b45f050a
JF
2114 vFAIL("Sequence (?{...}) not terminated or not {}-balanced");
2115 }
c277df42
IZ
2116 if (!SIZE_ONLY) {
2117 AV *av;
b81d288d
AB
2118
2119 if (RExC_parse - 1 - s)
830247a4 2120 sv = newSVpvn(s, RExC_parse - 1 - s);
c277df42 2121 else
79cb57f6 2122 sv = newSVpvn("", 0);
c277df42 2123
569233ed
SB
2124 ENTER;
2125 Perl_save_re_context(aTHX);
c277df42 2126 rop = sv_compile_2op(sv, &sop, "re", &av);
569233ed 2127 LEAVE;
c277df42 2128
830247a4
IZ
2129 n = add_data(pRExC_state, 3, "nop");
2130 RExC_rx->data->data[n] = (void*)rop;
2131 RExC_rx->data->data[n+1] = (void*)sop;
2132 RExC_rx->data->data[n+2] = (void*)av;
c277df42 2133 SvREFCNT_dec(sv);
a0ed51b3 2134 }
e24b16f9 2135 else { /* First pass */
830247a4 2136 if (PL_reginterp_cnt < ++RExC_seen_evals
e24b16f9 2137 && PL_curcop != &PL_compiling)
2cd61cdb
IZ
2138 /* No compiled RE interpolated, has runtime
2139 components ===> unsafe. */
2140 FAIL("Eval-group not allowed at runtime, use re 'eval'");
5b61d3f7 2141 if (PL_tainting && PL_tainted)
cc6b7395 2142 FAIL("Eval-group in insecure regular expression");
c277df42
IZ
2143 }
2144
830247a4 2145 nextchar(pRExC_state);
0f5d15d6 2146 if (logical) {
830247a4 2147 ret = reg_node(pRExC_state, LOGICAL);
0f5d15d6
IZ
2148 if (!SIZE_ONLY)
2149 ret->flags = 2;
830247a4 2150 regtail(pRExC_state, ret, reganode(pRExC_state, EVAL, n));
fac92740 2151 /* deal with the length of this later - MJD */
0f5d15d6
IZ
2152 return ret;
2153 }
830247a4 2154 return reganode(pRExC_state, EVAL, n);
c277df42 2155 }
fac92740 2156 case '(': /* (?(?{...})...) and (?(?=...)...) */
c277df42 2157 {
fac92740 2158 if (RExC_parse[0] == '?') { /* (?(?...)) */
b81d288d
AB
2159 if (RExC_parse[1] == '=' || RExC_parse[1] == '!'
2160 || RExC_parse[1] == '<'
830247a4 2161 || RExC_parse[1] == '{') { /* Lookahead or eval. */
c277df42
IZ
2162 I32 flag;
2163
830247a4 2164 ret = reg_node(pRExC_state, LOGICAL);
0f5d15d6
IZ
2165 if (!SIZE_ONLY)
2166 ret->flags = 1;
830247a4 2167 regtail(pRExC_state, ret, reg(pRExC_state, 1, &flag));
c277df42 2168 goto insert_if;
b81d288d 2169 }
a0ed51b3 2170 }
830247a4 2171 else if (RExC_parse[0] >= '1' && RExC_parse[0] <= '9' ) {
fac92740 2172 /* (?(1)...) */
830247a4 2173 parno = atoi(RExC_parse++);
c277df42 2174
830247a4
IZ
2175 while (isDIGIT(*RExC_parse))
2176 RExC_parse++;
fac92740
MJD
2177 ret = reganode(pRExC_state, GROUPP, parno);
2178
830247a4 2179 if ((c = *nextchar(pRExC_state)) != ')')
b45f050a 2180 vFAIL("Switch condition not recognized");
c277df42 2181 insert_if:
830247a4
IZ
2182 regtail(pRExC_state, ret, reganode(pRExC_state, IFTHEN, 0));
2183 br = regbranch(pRExC_state, &flags, 1);
c277df42 2184 if (br == NULL)
830247a4 2185 br = reganode(pRExC_state, LONGJMP, 0);
c277df42 2186 else
830247a4
IZ
2187 regtail(pRExC_state, br, reganode(pRExC_state, LONGJMP, 0));
2188 c = *nextchar(pRExC_state);
d1b80229
IZ
2189 if (flags&HASWIDTH)
2190 *flagp |= HASWIDTH;
c277df42 2191 if (c == '|') {
830247a4
IZ
2192 lastbr = reganode(pRExC_state, IFTHEN, 0); /* Fake one for optimizer. */
2193 regbranch(pRExC_state, &flags, 1);
2194 regtail(pRExC_state, ret, lastbr);
d1b80229
IZ
2195 if (flags&HASWIDTH)
2196 *flagp |= HASWIDTH;
830247a4 2197 c = *nextchar(pRExC_state);
a0ed51b3
LW
2198 }
2199 else
c277df42
IZ
2200 lastbr = NULL;
2201 if (c != ')')
8615cb43 2202 vFAIL("Switch (?(condition)... contains too many branches");
830247a4
IZ
2203 ender = reg_node(pRExC_state, TAIL);
2204 regtail(pRExC_state, br, ender);
c277df42 2205 if (lastbr) {
830247a4
IZ
2206 regtail(pRExC_state, lastbr, ender);
2207 regtail(pRExC_state, NEXTOPER(NEXTOPER(lastbr)), ender);
a0ed51b3
LW
2208 }
2209 else
830247a4 2210 regtail(pRExC_state, ret, ender);
c277df42 2211 return ret;
a0ed51b3
LW
2212 }
2213 else {
830247a4 2214 vFAIL2("Unknown switch condition (?(%.2s", RExC_parse);
c277df42
IZ
2215 }
2216 }
1b1626e4 2217 case 0:
830247a4 2218 RExC_parse--; /* for vFAIL to print correctly */
8615cb43 2219 vFAIL("Sequence (? incomplete");
1b1626e4 2220 break;
a0d0e21e 2221 default:
830247a4 2222 --RExC_parse;
fac92740 2223 parse_flags: /* (?i) */
830247a4 2224 while (*RExC_parse && strchr("iogcmsx", *RExC_parse)) {
9d1d55b5
JP
2225 /* (?g), (?gc) and (?o) are useless here
2226 and must be globally applied -- japhy */
2227
2228 if (*RExC_parse == 'o' || *RExC_parse == 'g') {
2229 if (SIZE_ONLY && ckWARN(WARN_REGEXP)) {
2230 I32 wflagbit = *RExC_parse == 'o' ? wasted_o : wasted_g;
2231 if (! (wastedflags & wflagbit) ) {
2232 wastedflags |= wflagbit;
2233 vWARN5(
2234 RExC_parse + 1,
2235 "Useless (%s%c) - %suse /%c modifier",
2236 flagsp == &negflags ? "?-" : "?",
2237 *RExC_parse,
2238 flagsp == &negflags ? "don't " : "",
2239 *RExC_parse
2240 );
2241 }
2242 }
2243 }
2244 else if (*RExC_parse == 'c') {
2245 if (SIZE_ONLY && ckWARN(WARN_REGEXP)) {
2246 if (! (wastedflags & wasted_c) ) {
2247 wastedflags |= wasted_gc;
2248 vWARN3(
2249 RExC_parse + 1,
2250 "Useless (%sc) - %suse /gc modifier",
2251 flagsp == &negflags ? "?-" : "?",
2252 flagsp == &negflags ? "don't " : ""
2253 );
2254 }
2255 }
2256 }
2257 else { pmflag(flagsp, *RExC_parse); }
2258
830247a4 2259 ++RExC_parse;
ca9dfc88 2260 }
830247a4 2261 if (*RExC_parse == '-') {
ca9dfc88 2262 flagsp = &negflags;
9d1d55b5 2263 wastedflags = 0; /* reset so (?g-c) warns twice */
830247a4 2264 ++RExC_parse;
ca9dfc88 2265 goto parse_flags;
48c036b1 2266 }
830247a4
IZ
2267 RExC_flags16 |= posflags;
2268 RExC_flags16 &= ~negflags;
2269 if (*RExC_parse == ':') {
2270 RExC_parse++;
ca9dfc88
IZ
2271 paren = ':';
2272 break;
2273 }
c277df42 2274 unknown:
830247a4
IZ
2275 if (*RExC_parse != ')') {
2276 RExC_parse++;
2277 vFAIL3("Sequence (%.*s...) not recognized", RExC_parse-seqstart, seqstart);
b45f050a 2278 }
830247a4 2279 nextchar(pRExC_state);
a0d0e21e
LW
2280 *flagp = TRYAGAIN;
2281 return NULL;
2282 }
2283 }
fac92740 2284 else { /* (...) */
830247a4
IZ
2285 parno = RExC_npar;
2286 RExC_npar++;
2287 ret = reganode(pRExC_state, OPEN, parno);
fac92740
MJD
2288 Set_Node_Length(ret, 1); /* MJD */
2289 Set_Node_Offset(ret, RExC_parse); /* MJD */
c277df42 2290 open = 1;
a0d0e21e 2291 }
a0ed51b3 2292 }
fac92740 2293 else /* ! paren */
a0d0e21e
LW
2294 ret = NULL;
2295
2296 /* Pick up the branches, linking them together. */
fac92740 2297 parse_start = RExC_parse; /* MJD */
830247a4 2298 br = regbranch(pRExC_state, &flags, 1);
fac92740
MJD
2299 /* branch_len = (paren != 0); */
2300
a0d0e21e
LW
2301 if (br == NULL)
2302 return(NULL);
830247a4
IZ
2303 if (*RExC_parse == '|') {
2304 if (!SIZE_ONLY && RExC_extralen) {
2305 reginsert(pRExC_state, BRANCHJ, br);
a0ed51b3 2306 }
fac92740 2307 else { /* MJD */
830247a4 2308 reginsert(pRExC_state, BRANCH, br);
fac92740
MJD
2309 Set_Node_Length(br, paren != 0);
2310 Set_Node_Offset_To_R(br-RExC_emit_start, parse_start-RExC_start);
2311 }
c277df42
IZ
2312 have_branch = 1;
2313 if (SIZE_ONLY)
830247a4 2314 RExC_extralen += 1; /* For BRANCHJ-BRANCH. */
a0ed51b3
LW
2315 }
2316 else if (paren == ':') {
c277df42
IZ
2317 *flagp |= flags&SIMPLE;
2318 }
2319 if (open) { /* Starts with OPEN. */
830247a4 2320 regtail(pRExC_state, ret, br); /* OPEN -> first. */
a0ed51b3
LW
2321 }
2322 else if (paren != '?') /* Not Conditional */
a0d0e21e 2323 ret = br;
821b33a5
IZ
2324 if (flags&HASWIDTH)
2325 *flagp |= HASWIDTH;
a0d0e21e 2326 *flagp |= flags&SPSTART;
c277df42 2327 lastbr = br;
830247a4
IZ
2328 while (*RExC_parse == '|') {
2329 if (!SIZE_ONLY && RExC_extralen) {
2330 ender = reganode(pRExC_state, LONGJMP,0);
2331 regtail(pRExC_state, NEXTOPER(NEXTOPER(lastbr)), ender); /* Append to the previous. */
c277df42
IZ
2332 }
2333 if (SIZE_ONLY)
830247a4
IZ
2334 RExC_extralen += 2; /* Account for LONGJMP. */
2335 nextchar(pRExC_state);
2336 br = regbranch(pRExC_state, &flags, 0);
fac92740 2337
a687059c 2338 if (br == NULL)
a0d0e21e 2339 return(NULL);
830247a4 2340 regtail(pRExC_state, lastbr, br); /* BRANCH -> BRANCH. */
c277df42 2341 lastbr = br;
821b33a5
IZ
2342 if (flags&HASWIDTH)
2343 *flagp |= HASWIDTH;
a687059c 2344 *flagp |= flags&SPSTART;
a0d0e21e
LW
2345 }
2346
c277df42
IZ
2347 if (have_branch || paren != ':') {
2348 /* Make a closing node, and hook it on the end. */
2349 switch (paren) {
2350 case ':':
830247a4 2351 ender = reg_node(pRExC_state, TAIL);
c277df42
IZ
2352 break;
2353 case 1:
830247a4 2354 ender = reganode(pRExC_state, CLOSE, parno);
fac92740
MJD
2355 Set_Node_Offset(ender,RExC_parse+1); /* MJD */
2356 Set_Node_Length(ender,1); /* MJD */
c277df42
IZ
2357 break;
2358 case '<':
c277df42
IZ
2359 case ',':
2360 case '=':
2361 case '!':
c277df42 2362 *flagp &= ~HASWIDTH;
821b33a5
IZ
2363 /* FALL THROUGH */
2364 case '>':
830247a4 2365 ender = reg_node(pRExC_state, SUCCEED);
c277df42
IZ
2366 break;
2367 case 0:
830247a4 2368 ender = reg_node(pRExC_state, END);
c277df42
IZ
2369 break;
2370 }
830247a4 2371 regtail(pRExC_state, lastbr, ender);
a0d0e21e 2372
c277df42
IZ
2373 if (have_branch) {
2374 /* Hook the tails of the branches to the closing node. */
2375 for (br = ret; br != NULL; br = regnext(br)) {
830247a4 2376 regoptail(pRExC_state, br, ender);
c277df42
IZ
2377 }
2378 }
a0d0e21e 2379 }
c277df42
IZ
2380
2381 {
2382 char *p;
2383 static char parens[] = "=!<,>";
2384
2385 if (paren && (p = strchr(parens, paren))) {
2386 int node = ((p - parens) % 2) ? UNLESSM : IFMATCH;
2387 int flag = (p - parens) > 1;
2388
2389 if (paren == '>')
2390 node = SUSPEND, flag = 0;
830247a4 2391 reginsert(pRExC_state, node,ret);
c277df42 2392 ret->flags = flag;
830247a4 2393 regtail(pRExC_state, ret, reg_node(pRExC_state, TAIL));
c277df42 2394 }
a0d0e21e
LW
2395 }
2396
2397 /* Check for proper termination. */
ce3e6498 2398 if (paren) {
830247a4
IZ
2399 RExC_flags16 = oregflags;
2400 if (RExC_parse >= RExC_end || *nextchar(pRExC_state) != ')') {
2401 RExC_parse = oregcomp_parse;
380a0633 2402 vFAIL("Unmatched (");
ce3e6498 2403 }
a0ed51b3 2404 }
830247a4
IZ
2405 else if (!paren && RExC_parse < RExC_end) {
2406 if (*RExC_parse == ')') {
2407 RExC_parse++;
380a0633 2408 vFAIL("Unmatched )");
a0ed51b3
LW
2409 }
2410 else
b45f050a 2411 FAIL("Junk on end of regexp"); /* "Can't happen". */
a0d0e21e
LW
2412 /* NOTREACHED */
2413 }
a687059c 2414
a0d0e21e 2415 return(ret);
a687059c
LW
2416}
2417
2418/*
2419 - regbranch - one alternative of an | operator
2420 *
2421 * Implements the concatenation operator.
2422 */
76e3520e 2423STATIC regnode *
830247a4 2424S_regbranch(pTHX_ RExC_state_t *pRExC_state, I32 *flagp, I32 first)
a687059c 2425{
c277df42
IZ
2426 register regnode *ret;
2427 register regnode *chain = NULL;
2428 register regnode *latest;
2429 I32 flags = 0, c = 0;
a0d0e21e 2430
b81d288d 2431 if (first)
c277df42
IZ
2432 ret = NULL;
2433 else {
b81d288d 2434 if (!SIZE_ONLY && RExC_extralen)
830247a4 2435 ret = reganode(pRExC_state, BRANCHJ,0);
fac92740 2436 else {
830247a4 2437 ret = reg_node(pRExC_state, BRANCH);
fac92740
MJD
2438 Set_Node_Length(ret, 1);
2439 }
c277df42
IZ
2440 }
2441
b81d288d 2442 if (!first && SIZE_ONLY)
830247a4 2443 RExC_extralen += 1; /* BRANCHJ */
b81d288d 2444
c277df42 2445 *flagp = WORST; /* Tentatively. */
a0d0e21e 2446
830247a4
IZ
2447 RExC_parse--;
2448 nextchar(pRExC_state);
2449 while (RExC_parse < RExC_end && *RExC_parse != '|' && *RExC_parse != ')') {
a0d0e21e 2450 flags &= ~TRYAGAIN;
830247a4 2451 latest = regpiece(pRExC_state, &flags);
a0d0e21e
LW
2452 if (latest == NULL) {
2453 if (flags & TRYAGAIN)
2454 continue;
2455 return(NULL);
a0ed51b3
LW
2456 }
2457 else if (ret == NULL)
c277df42 2458 ret = latest;
a0d0e21e 2459 *flagp |= flags&HASWIDTH;
c277df42 2460 if (chain == NULL) /* First piece. */
a0d0e21e
LW
2461 *flagp |= flags&SPSTART;
2462 else {
830247a4
IZ
2463 RExC_naughty++;
2464 regtail(pRExC_state, chain, latest);
a687059c 2465 }
a0d0e21e 2466 chain = latest;
c277df42
IZ
2467 c++;
2468 }
2469 if (chain == NULL) { /* Loop ran zero times. */
830247a4 2470 chain = reg_node(pRExC_state, NOTHING);
c277df42
IZ
2471 if (ret == NULL)
2472 ret = chain;
2473 }
2474 if (c == 1) {
2475 *flagp |= flags&SIMPLE;
a0d0e21e 2476 }
a687059c 2477
a0d0e21e 2478 return(ret);
a687059c
LW
2479}
2480
2481/*
2482 - regpiece - something followed by possible [*+?]
2483 *
2484 * Note that the branching code sequences used for ? and the general cases
2485 * of * and + are somewhat optimized: they use the same NOTHING node as
2486 * both the endmarker for their branch list and the body of the last branch.
2487 * It might seem that this node could be dispensed with entirely, but the
2488 * endmarker role is not redundant.
2489 */
76e3520e 2490STATIC regnode *
830247a4 2491S_regpiece(pTHX_ RExC_state_t *pRExC_state, I32 *flagp)
a687059c 2492{
c277df42 2493 register regnode *ret;
a0d0e21e
LW
2494 register char op;
2495 register char *next;
2496 I32 flags;
830247a4 2497 char *origparse = RExC_parse;
a0d0e21e
LW
2498 char *maxpos;
2499 I32 min;
c277df42 2500 I32 max = REG_INFTY;
fac92740 2501 char *parse_start;
a0d0e21e 2502
830247a4 2503 ret = regatom(pRExC_state, &flags);
a0d0e21e
LW
2504 if (ret == NULL) {
2505 if (flags & TRYAGAIN)
2506 *flagp |= TRYAGAIN;
2507 return(NULL);
2508 }
2509
830247a4 2510 op = *RExC_parse;
a0d0e21e 2511
830247a4 2512 if (op == '{' && regcurly(RExC_parse)) {
fac92740 2513 parse_start = RExC_parse; /* MJD */
830247a4 2514 next = RExC_parse + 1;
a0d0e21e
LW
2515 maxpos = Nullch;
2516 while (isDIGIT(*next) || *next == ',') {
2517 if (*next == ',') {
2518 if (maxpos)
2519 break;
2520 else
2521 maxpos = next;
a687059c 2522 }
a0d0e21e
LW
2523 next++;
2524 }
2525 if (*next == '}') { /* got one */
2526 if (!maxpos)
2527 maxpos = next;
830247a4
IZ
2528 RExC_parse++;
2529 min = atoi(RExC_parse);
a0d0e21e
LW
2530 if (*maxpos == ',')
2531 maxpos++;
2532 else
830247a4 2533 maxpos = RExC_parse;
a0d0e21e
LW
2534 max = atoi(maxpos);
2535 if (!max && *maxpos != '0')
c277df42
IZ
2536 max = REG_INFTY; /* meaning "infinity" */
2537 else if (max >= REG_INFTY)
8615cb43 2538 vFAIL2("Quantifier in {,} bigger than %d", REG_INFTY - 1);
830247a4
IZ
2539 RExC_parse = next;
2540 nextchar(pRExC_state);
a0d0e21e
LW
2541
2542 do_curly:
2543 if ((flags&SIMPLE)) {
830247a4
IZ
2544 RExC_naughty += 2 + RExC_naughty / 2;
2545 reginsert(pRExC_state, CURLY, ret);
fac92740
MJD
2546 Set_Node_Offset(ret, parse_start+1); /* MJD */
2547 Set_Node_Cur_Length(ret);
a0d0e21e
LW
2548 }
2549 else {
830247a4 2550 regnode *w = reg_node(pRExC_state, WHILEM);
2c2d71f5
JH
2551
2552 w->flags = 0;
830247a4
IZ
2553 regtail(pRExC_state, ret, w);
2554 if (!SIZE_ONLY && RExC_extralen) {
2555 reginsert(pRExC_state, LONGJMP,ret);
2556 reginsert(pRExC_state, NOTHING,ret);
c277df42
IZ
2557 NEXT_OFF(ret) = 3; /* Go over LONGJMP. */
2558 }
830247a4 2559 reginsert(pRExC_state, CURLYX,ret);
fac92740
MJD
2560 /* MJD hk */
2561 Set_Node_Offset(ret, parse_start+1);
2562 Set_Node_Length(ret,
2563 op == '{' ? (RExC_parse - parse_start) : 1);
2564
830247a4 2565 if (!SIZE_ONLY && RExC_extralen)
c277df42 2566 NEXT_OFF(ret) = 3; /* Go over NOTHING to LONGJMP. */
830247a4 2567 regtail(pRExC_state, ret, reg_node(pRExC_state, NOTHING));
c277df42 2568 if (SIZE_ONLY)
830247a4
IZ
2569 RExC_whilem_seen++, RExC_extralen += 3;
2570 RExC_naughty += 4 + RExC_naughty; /* compound interest */
a0d0e21e 2571 }
c277df42 2572 ret->flags = 0;
a0d0e21e
LW
2573
2574 if (min > 0)
821b33a5
IZ
2575 *flagp = WORST;
2576 if (max > 0)
2577 *flagp |= HASWIDTH;
a0d0e21e 2578 if (max && max < min)
8615cb43 2579 vFAIL("Can't do {n,m} with n > m");
c277df42
IZ
2580 if (!SIZE_ONLY) {
2581 ARG1_SET(ret, min);
2582 ARG2_SET(ret, max);
a687059c 2583 }
a687059c 2584
a0d0e21e 2585 goto nest_check;
a687059c 2586 }
a0d0e21e 2587 }
a687059c 2588
a0d0e21e
LW
2589 if (!ISMULT1(op)) {
2590 *flagp = flags;
a687059c 2591 return(ret);
a0d0e21e 2592 }
bb20fd44 2593
c277df42 2594#if 0 /* Now runtime fix should be reliable. */
b45f050a
JF
2595
2596 /* if this is reinstated, don't forget to put this back into perldiag:
2597
2598 =item Regexp *+ operand could be empty at {#} in regex m/%s/
2599
2600 (F) The part of the regexp subject to either the * or + quantifier
2601 could match an empty string. The {#} shows in the regular
2602 expression about where the problem was discovered.
2603
2604 */
2605
bb20fd44 2606 if (!(flags&HASWIDTH) && op != '?')
b45f050a 2607 vFAIL("Regexp *+ operand could be empty");
b81d288d 2608#endif
bb20fd44 2609
fac92740 2610 parse_start = RExC_parse;
830247a4 2611 nextchar(pRExC_state);
a0d0e21e 2612
821b33a5 2613 *flagp = (op != '+') ? (WORST|SPSTART|HASWIDTH) : (WORST|HASWIDTH);
a0d0e21e
LW
2614
2615 if (op == '*' && (flags&SIMPLE)) {
830247a4 2616 reginsert(pRExC_state, STAR, ret);
c277df42 2617 ret->flags = 0;
830247a4 2618 RExC_naughty += 4;
a0d0e21e
LW
2619 }
2620 else if (op == '*') {
2621 min = 0;
2622 goto do_curly;
a0ed51b3
LW
2623 }
2624 else if (op == '+' && (flags&SIMPLE)) {
830247a4 2625 reginsert(pRExC_state, PLUS, ret);
c277df42 2626 ret->flags = 0;
830247a4 2627 RExC_naughty += 3;
a0d0e21e
LW
2628 }
2629 else if (op == '+') {
2630 min = 1;
2631 goto do_curly;
a0ed51b3
LW
2632 }
2633 else if (op == '?') {
a0d0e21e
LW
2634 min = 0; max = 1;
2635 goto do_curly;
2636 }
2637 nest_check:
e476b1b5 2638 if (ckWARN(WARN_REGEXP) && !SIZE_ONLY && !(flags&HASWIDTH) && max > REG_INFTY/3) {
830247a4 2639 vWARN3(RExC_parse,
b45f050a 2640 "%.*s matches null string many times",
830247a4 2641 RExC_parse - origparse,
b45f050a 2642 origparse);
a0d0e21e
LW
2643 }
2644
830247a4
IZ
2645 if (*RExC_parse == '?') {
2646 nextchar(pRExC_state);
2647 reginsert(pRExC_state, MINMOD, ret);
2648 regtail(pRExC_state, ret, ret + NODE_STEP_REGNODE);
a0d0e21e 2649 }
830247a4
IZ
2650 if (ISMULT2(RExC_parse)) {
2651 RExC_parse++;
b45f050a
JF
2652 vFAIL("Nested quantifiers");
2653 }
a0d0e21e
LW
2654
2655 return(ret);
a687059c
LW
2656}
2657
2658/*
2659 - regatom - the lowest level
2660 *
2661 * Optimization: gobbles an entire sequence of ordinary characters so that
2662 * it can turn them into a single node, which is smaller to store and
2663 * faster to run. Backslashed characters are exceptions, each becoming a
2664 * separate node; the code is simpler that way and it's not worth fixing.
2665 *
b45f050a 2666 * [Yes, it is worth fixing, some scripts can run twice the speed.] */
76e3520e 2667STATIC regnode *
830247a4 2668S_regatom(pTHX_ RExC_state_t *pRExC_state, I32 *flagp)
a687059c 2669{
c277df42 2670 register regnode *ret = 0;
a0d0e21e
LW
2671 I32 flags;
2672
2673 *flagp = WORST; /* Tentatively. */
2674
2675tryagain:
830247a4 2676 switch (*RExC_parse) {
a0d0e21e 2677 case '^':
830247a4
IZ
2678 RExC_seen_zerolen++;
2679 nextchar(pRExC_state);
2680 if (RExC_flags16 & PMf_MULTILINE)
2681 ret = reg_node(pRExC_state, MBOL);
2682 else if (RExC_flags16 & PMf_SINGLELINE)
2683 ret = reg_node(pRExC_state, SBOL);
a0d0e21e 2684 else
830247a4 2685 ret = reg_node(pRExC_state, BOL);
fac92740 2686 Set_Node_Length(ret, 1); /* MJD */
a0d0e21e
LW
2687 break;
2688 case '$':
830247a4 2689 nextchar(pRExC_state);
b81d288d 2690 if (*RExC_parse)
830247a4
IZ
2691 RExC_seen_zerolen++;
2692 if (RExC_flags16 & PMf_MULTILINE)
2693 ret = reg_node(pRExC_state, MEOL);
2694 else if (RExC_flags16 & PMf_SINGLELINE)
2695 ret = reg_node(pRExC_state, SEOL);
a0d0e21e 2696 else
830247a4 2697 ret = reg_node(pRExC_state, EOL);
fac92740 2698 Set_Node_Length(ret, 1); /* MJD */
a0d0e21e
LW
2699 break;
2700 case '.':
830247a4 2701 nextchar(pRExC_state);
ffc61ed2
JH
2702 if (RExC_flags16 & PMf_SINGLELINE)
2703 ret = reg_node(pRExC_state, SANY);
2704 else
2705 ret = reg_node(pRExC_state, REG_ANY);
2706 *flagp |= HASWIDTH|SIMPLE;
830247a4 2707 RExC_naughty++;
fac92740 2708 Set_Node_Length(ret, 1); /* MJD */
a0d0e21e
LW
2709 break;
2710 case '[':
b45f050a 2711 {
830247a4 2712 char *oregcomp_parse = ++RExC_parse;
ffc61ed2 2713 ret = regclass(pRExC_state);
830247a4
IZ
2714 if (*RExC_parse != ']') {
2715 RExC_parse = oregcomp_parse;
b45f050a
JF
2716 vFAIL("Unmatched [");
2717 }
830247a4 2718 nextchar(pRExC_state);
a0d0e21e 2719 *flagp |= HASWIDTH|SIMPLE;
fac92740 2720 Set_Node_Length(ret, RExC_parse - oregcomp_parse + 1); /* MJD */
a0d0e21e 2721 break;
b45f050a 2722 }
a0d0e21e 2723 case '(':
830247a4
IZ
2724 nextchar(pRExC_state);
2725 ret = reg(pRExC_state, 1, &flags);
a0d0e21e 2726 if (ret == NULL) {
bf93d4cc 2727 if (flags & TRYAGAIN) {
830247a4 2728 if (RExC_parse == RExC_end) {
bf93d4cc
GS
2729 /* Make parent create an empty node if needed. */
2730 *flagp |= TRYAGAIN;
2731 return(NULL);
2732 }
a0d0e21e 2733 goto tryagain;
bf93d4cc 2734 }
a0d0e21e
LW
2735 return(NULL);
2736 }
c277df42 2737 *flagp |= flags&(HASWIDTH|SPSTART|SIMPLE);
a0d0e21e
LW
2738 break;
2739 case '|':
2740 case ')':
2741 if (flags & TRYAGAIN) {
2742 *flagp |= TRYAGAIN;
2743 return NULL;
2744 }
b45f050a 2745 vFAIL("Internal urp");
a0d0e21e
LW
2746 /* Supposed to be caught earlier. */
2747 break;
85afd4ae 2748 case '{':
830247a4
IZ
2749 if (!regcurly(RExC_parse)) {
2750 RExC_parse++;
85afd4ae
CS
2751 goto defchar;
2752 }
2753 /* FALL THROUGH */
a0d0e21e
LW
2754 case '?':
2755 case '+':
2756 case '*':
830247a4 2757 RExC_parse++;
b45f050a 2758 vFAIL("Quantifier follows nothing");
a0d0e21e
LW
2759 break;
2760 case '\\':
830247a4 2761 switch (*++RExC_parse) {
a0d0e21e 2762 case 'A':
830247a4
IZ
2763 RExC_seen_zerolen++;
2764 ret = reg_node(pRExC_state, SBOL);
a0d0e21e 2765 *flagp |= SIMPLE;
830247a4 2766 nextchar(pRExC_state);
fac92740 2767 Set_Node_Length(ret, 2); /* MJD */
a0d0e21e
LW
2768 break;
2769 case 'G':
830247a4
IZ
2770 ret = reg_node(pRExC_state, GPOS);
2771 RExC_seen |= REG_SEEN_GPOS;
a0d0e21e 2772 *flagp |= SIMPLE;
830247a4 2773 nextchar(pRExC_state);
fac92740 2774 Set_Node_Length(ret, 2); /* MJD */
a0d0e21e
LW
2775 break;
2776 case 'Z':
830247a4 2777 ret = reg_node(pRExC_state, SEOL);
a0d0e21e 2778 *flagp |= SIMPLE;
830247a4 2779 nextchar(pRExC_state);
a0d0e21e 2780 break;
b85d18e9 2781 case 'z':
830247a4 2782 ret = reg_node(pRExC_state, EOS);
b85d18e9 2783 *flagp |= SIMPLE;
830247a4
IZ
2784 RExC_seen_zerolen++; /* Do not optimize RE away */
2785 nextchar(pRExC_state);
fac92740 2786 Set_Node_Length(ret, 2); /* MJD */
b85d18e9 2787 break;
4a2d328f 2788 case 'C':
830247a4 2789 ret = reg_node(pRExC_state, SANY);
3baa4c62 2790 RExC_seen |= REG_SEEN_SANY;
a0ed51b3 2791 *flagp |= HASWIDTH|SIMPLE;
830247a4 2792 nextchar(pRExC_state);
fac92740 2793 Set_Node_Length(ret, 2); /* MJD */
a0ed51b3
LW
2794 break;
2795 case 'X':
830247a4 2796 ret = reg_node(pRExC_state, CLUMP);
a0ed51b3 2797 *flagp |= HASWIDTH;
830247a4 2798 nextchar(pRExC_state);
fac92740 2799 Set_Node_Length(ret, 2); /* MJD */
a0ed51b3 2800 break;
a0d0e21e 2801 case 'w':
ffc61ed2 2802 ret = reg_node(pRExC_state, LOC ? ALNUML : ALNUM);
a0d0e21e 2803 *flagp |= HASWIDTH|SIMPLE;
830247a4 2804 nextchar(pRExC_state);
fac92740 2805 Set_Node_Length(ret, 2); /* MJD */
a0d0e21e
LW
2806 break;
2807 case 'W':
ffc61ed2 2808 ret = reg_node(pRExC_state, LOC ? NALNUML : NALNUM);
a0d0e21e 2809 *flagp |= HASWIDTH|SIMPLE;
830247a4 2810 nextchar(pRExC_state);
fac92740 2811 Set_Node_Length(ret, 2); /* MJD */
a0d0e21e
LW
2812 break;
2813 case 'b':
830247a4
IZ
2814 RExC_seen_zerolen++;
2815 RExC_seen |= REG_SEEN_LOOKBEHIND;
ffc61ed2 2816 ret = reg_node(pRExC_state, LOC ? BOUNDL : BOUND);
a0d0e21e 2817 *flagp |= SIMPLE;
830247a4 2818 nextchar(pRExC_state);
fac92740 2819 Set_Node_Length(ret, 2); /* MJD */
a0d0e21e
LW
2820 break;
2821 case 'B':
830247a4
IZ
2822 RExC_seen_zerolen++;
2823 RExC_seen |= REG_SEEN_LOOKBEHIND;
ffc61ed2 2824 ret = reg_node(pRExC_state, LOC ? NBOUNDL : NBOUND);
a0d0e21e 2825 *flagp |= SIMPLE;
830247a4 2826 nextchar(pRExC_state);
fac92740 2827 Set_Node_Length(ret, 2); /* MJD */
a0d0e21e
LW
2828 break;
2829 case 's':
ffc61ed2 2830 ret = reg_node(pRExC_state, LOC ? SPACEL : SPACE);
a0d0e21e 2831 *flagp |= HASWIDTH|SIMPLE;
830247a4 2832 nextchar(pRExC_state);
fac92740 2833 Set_Node_Length(ret, 2); /* MJD */
a0d0e21e
LW
2834 break;
2835 case 'S':
ffc61ed2 2836 ret = reg_node(pRExC_state, LOC ? NSPACEL : NSPACE);
a0d0e21e 2837 *flagp |= HASWIDTH|SIMPLE;
830247a4 2838 nextchar(pRExC_state);
fac92740 2839 Set_Node_Length(ret, 2); /* MJD */
a0d0e21e
LW
2840 break;
2841 case 'd':
ffc61ed2 2842 ret = reg_node(pRExC_state, DIGIT);
a0d0e21e 2843 *flagp |= HASWIDTH|SIMPLE;
830247a4 2844 nextchar(pRExC_state);
fac92740 2845 Set_Node_Length(ret, 2); /* MJD */
a0d0e21e
LW
2846 break;
2847 case 'D':
ffc61ed2 2848 ret = reg_node(pRExC_state, NDIGIT);
a0d0e21e 2849 *flagp |= HASWIDTH|SIMPLE;
830247a4 2850 nextchar(pRExC_state);
fac92740 2851 Set_Node_Length(ret, 2); /* MJD */
a0d0e21e 2852 break;
a14b48bc
LW
2853 case 'p':
2854 case 'P':
3568d838 2855 {
830247a4 2856 char* oldregxend = RExC_end;
fac92740 2857 char* parse_start = RExC_parse;
a14b48bc 2858
830247a4 2859 if (RExC_parse[1] == '{') {
3568d838 2860 /* a lovely hack--pretend we saw [\pX] instead */
830247a4
IZ
2861 RExC_end = strchr(RExC_parse, '}');
2862 if (!RExC_end) {
2863 RExC_parse += 2;
2864 RExC_end = oldregxend;
b45f050a
JF
2865 vFAIL("Missing right brace on \\p{}");
2866 }
830247a4 2867 RExC_end++;
a14b48bc
LW
2868 }
2869 else
830247a4
IZ
2870 RExC_end = RExC_parse + 2;
2871 RExC_parse--;
a14b48bc 2872
ffc61ed2 2873 ret = regclass(pRExC_state);
a14b48bc 2874
830247a4
IZ
2875 RExC_end = oldregxend;
2876 RExC_parse--;
fac92740 2877 Set_Node_Cur_Length(ret); /* MJD */
830247a4 2878 nextchar(pRExC_state);
a14b48bc
LW
2879 *flagp |= HASWIDTH|SIMPLE;
2880 }
2881 break;
a0d0e21e
LW
2882 case 'n':
2883 case 'r':
2884 case 't':
2885 case 'f':
2886 case 'e':
2887 case 'a':
2888 case 'x':
2889 case 'c':
2890 case '0':
2891 goto defchar;
2892 case '1': case '2': case '3': case '4':
2893 case '5': case '6': case '7': case '8': case '9':
2894 {
830247a4 2895 I32 num = atoi(RExC_parse);
a0d0e21e 2896
830247a4 2897 if (num > 9 && num >= RExC_npar)
a0d0e21e
LW
2898 goto defchar;
2899 else {
fac92740 2900 char * parse_start = RExC_parse - 1; /* MJD */
830247a4
IZ
2901 while (isDIGIT(*RExC_parse))
2902 RExC_parse++;
b45f050a 2903
830247a4 2904 if (!SIZE_ONLY && num > RExC_rx->nparens)
9baa0206 2905 vFAIL("Reference to nonexistent group");
830247a4
IZ
2906 RExC_sawback = 1;
2907 ret = reganode(pRExC_state, FOLD
a0ed51b3 2908 ? (LOC ? REFFL : REFF)
c8756f30 2909 : REF, num);
a0d0e21e 2910 *flagp |= HASWIDTH;
fac92740
MJD
2911
2912 /* override incorrect value set in reganode MJD */
2913 Set_Node_Offset(ret, parse_start+1);
2914 Set_Node_Cur_Length(ret); /* MJD */
830247a4
IZ
2915 RExC_parse--;
2916 nextchar(pRExC_state);
a0d0e21e
LW
2917 }
2918 }
2919 break;
2920 case '\0':
830247a4 2921 if (RExC_parse >= RExC_end)
b45f050a 2922 FAIL("Trailing \\");
a0d0e21e
LW
2923 /* FALL THROUGH */
2924 default:
c9f97d15
IZ
2925 /* Do not generate `unrecognized' warnings here, we fall
2926 back into the quick-grab loop below */
a0d0e21e
LW
2927 goto defchar;
2928 }
2929 break;
4633a7c4
LW
2930
2931 case '#':
830247a4
IZ
2932 if (RExC_flags16 & PMf_EXTENDED) {
2933 while (RExC_parse < RExC_end && *RExC_parse != '\n') RExC_parse++;
2934 if (RExC_parse < RExC_end)
4633a7c4
LW
2935 goto tryagain;
2936 }
2937 /* FALL THROUGH */
2938
a0d0e21e 2939 default: {
ba210ebe 2940 register STRLEN len;
a0ed51b3 2941 register UV ender;
a0d0e21e 2942 register char *p;
c277df42 2943 char *oldp, *s;
ba210ebe 2944 STRLEN numlen;
fac92740 2945 char *parse_start = RExC_parse - 1;
a0d0e21e 2946
830247a4 2947 RExC_parse++;
a0d0e21e
LW
2948
2949 defchar:
830247a4 2950 ret = reg_node(pRExC_state, FOLD
a0ed51b3 2951 ? (LOC ? EXACTFL : EXACTF)
bbce6d69 2952 : EXACT);
cd439c50 2953 s = STRING(ret);
830247a4
IZ
2954 for (len = 0, p = RExC_parse - 1;
2955 len < 127 && p < RExC_end;
a0d0e21e
LW
2956 len++)
2957 {
2958 oldp = p;
5b5a24f7 2959
830247a4
IZ
2960 if (RExC_flags16 & PMf_EXTENDED)
2961 p = regwhite(p, RExC_end);
a0d0e21e
LW
2962 switch (*p) {
2963 case '^':
2964 case '$':
2965 case '.':
2966 case '[':
2967 case '(':
2968 case ')':
2969 case '|':
2970 goto loopdone;
2971 case '\\':
2972 switch (*++p) {
2973 case 'A':
2974 case 'G':
2975 case 'Z':
b85d18e9 2976 case 'z':
a0d0e21e
LW
2977 case 'w':
2978 case 'W':
2979 case 'b':
2980 case 'B':
2981 case 's':
2982 case 'S':
2983 case 'd':
2984 case 'D':
a14b48bc
LW
2985 case 'p':
2986 case 'P':
a0d0e21e
LW
2987 --p;
2988 goto loopdone;
2989 case 'n':
2990 ender = '\n';
2991 p++;
a687059c 2992 break;
a0d0e21e
LW
2993 case 'r':
2994 ender = '\r';
2995 p++;
a687059c 2996 break;
a0d0e21e
LW
2997 case 't':
2998 ender = '\t';
2999 p++;
a687059c 3000 break;
a0d0e21e
LW
3001 case 'f':
3002 ender = '\f';
3003 p++;
a687059c 3004 break;
a0d0e21e 3005 case 'e':
c7f1f016 3006 ender = ASCII_TO_NATIVE('\033');
a0d0e21e 3007 p++;
a687059c 3008 break;
a0d0e21e 3009 case 'a':
c7f1f016 3010 ender = ASCII_TO_NATIVE('\007');
a0d0e21e 3011 p++;
a687059c 3012 break;
a0d0e21e 3013 case 'x':
a0ed51b3
LW
3014 if (*++p == '{') {
3015 char* e = strchr(p, '}');
b81d288d 3016
b45f050a 3017 if (!e) {
830247a4 3018 RExC_parse = p + 1;
b45f050a
JF
3019 vFAIL("Missing right brace on \\x{}");
3020 }
de5f0749 3021 else {
b21ed0a9 3022 numlen = 1; /* allow underscores */
de35ba6f 3023 ender = (UV)scan_hex(p + 1, e - p - 1, &numlen);
aaa80028
JH
3024 if (ender > 0xff)
3025 RExC_utf8 = 1;
b21ed0a9
GS
3026 /* numlen is generous */
3027 if (numlen + len >= 127) {
a0ed51b3
LW
3028 p--;
3029 goto loopdone;
3030 }
3031 p = e + 1;
3032 }
a0ed51b3
LW
3033 }
3034 else {
b21ed0a9 3035 numlen = 0; /* disallow underscores */
dff6d3cd 3036 ender = (UV)scan_hex(p, 2, &numlen);
a0ed51b3
LW
3037 p += numlen;
3038 }
a687059c 3039 break;
a0d0e21e
LW
3040 case 'c':
3041 p++;
bbce6d69 3042 ender = UCHARAT(p++);
3043 ender = toCTRL(ender);
a687059c 3044 break;
a0d0e21e
LW
3045 case '0': case '1': case '2': case '3':case '4':
3046 case '5': case '6': case '7': case '8':case '9':
3047 if (*p == '0' ||
830247a4 3048 (isDIGIT(p[1]) && atoi(p) >= RExC_npar) ) {
b21ed0a9 3049 numlen = 0; /* disallow underscores */
dff6d3cd 3050 ender = (UV)scan_oct(p, 3, &numlen);
a0d0e21e
LW
3051 p += numlen;
3052 }
3053 else {
3054 --p;
3055 goto loopdone;
a687059c
LW
3056 }
3057 break;
a0d0e21e 3058 case '\0':
830247a4 3059 if (p >= RExC_end)
b45f050a 3060 FAIL("Trailing \\");
a687059c 3061 /* FALL THROUGH */
a0d0e21e 3062 default:
e476b1b5 3063 if (!SIZE_ONLY && ckWARN(WARN_REGEXP) && isALPHA(*p))
b45f050a 3064 vWARN2(p +1, "Unrecognized escape \\%c passed through", *p);
a0ed51b3 3065 goto normal_default;
a0d0e21e
LW
3066 }
3067 break;
a687059c 3068 default:
a0ed51b3 3069 normal_default:
fd400ab9 3070 if (UTF8_IS_START(*p) && UTF) {
5e12f4fb 3071 ender = utf8n_to_uvchr((U8*)p, RExC_end - p,
ba210ebe 3072 &numlen, 0);
a0ed51b3
LW
3073 p += numlen;
3074 }
3075 else
3076 ender = *p++;
a0d0e21e 3077 break;
a687059c 3078 }
830247a4
IZ
3079 if (RExC_flags16 & PMf_EXTENDED)
3080 p = regwhite(p, RExC_end);
a0ed51b3
LW
3081 if (UTF && FOLD) {
3082 if (LOC)
5e12f4fb 3083 ender = toLOWER_LC_uvchr(ender);
a0ed51b3
LW
3084 else
3085 ender = toLOWER_uni(ender);
3086 }
a0d0e21e
LW
3087 if (ISMULT2(p)) { /* Back off on ?+*. */
3088 if (len)
3089 p = oldp;
2b9d42f0 3090 else if (!UNI_IS_INVARIANT(NATIVE_TO_UNI(ender)) && UTF) {
830247a4 3091 reguni(pRExC_state, ender, s, &numlen);
a0ed51b3
LW
3092 s += numlen;
3093 len += numlen;
3094 }
a0d0e21e
LW
3095 else {
3096 len++;
cd439c50 3097 REGC(ender, s++);
a0d0e21e
LW
3098 }
3099 break;
a687059c 3100 }
2b9d42f0 3101 if (!UNI_IS_INVARIANT(NATIVE_TO_UNI(ender)) && UTF) {
830247a4 3102 reguni(pRExC_state, ender, s, &numlen);
a0ed51b3
LW
3103 s += numlen;
3104 len += numlen - 1;
3105 }
3106 else
cd439c50 3107 REGC(ender, s++);
a0d0e21e
LW
3108 }
3109 loopdone:
830247a4 3110 RExC_parse = p - 1;
fac92740 3111 Set_Node_Cur_Length(ret); /* MJD */
830247a4 3112 nextchar(pRExC_state);
793db0cb
JH
3113 {
3114 /* len is STRLEN which is unsigned, need to copy to signed */
3115 IV iv = len;
3116 if (iv < 0)
3117 vFAIL("Internal disaster");
3118 }
a0d0e21e
LW
3119 if (len > 0)
3120 *flagp |= HASWIDTH;
3121 if (len == 1)
3122 *flagp |= SIMPLE;
c277df42 3123 if (!SIZE_ONLY)
cd439c50
IZ
3124 STR_LEN(ret) = len;
3125 if (SIZE_ONLY)
830247a4 3126 RExC_size += STR_SZ(len);
cd439c50 3127 else
830247a4 3128 RExC_emit += STR_SZ(len);
a687059c 3129 }
a0d0e21e
LW
3130 break;
3131 }
a687059c 3132
a0d0e21e 3133 return(ret);
a687059c
LW
3134}
3135
873ef191 3136STATIC char *
cea2e8a9 3137S_regwhite(pTHX_ char *p, char *e)
5b5a24f7
CS
3138{
3139 while (p < e) {
3140 if (isSPACE(*p))
3141 ++p;
3142 else if (*p == '#') {
3143 do {
3144 p++;
3145 } while (p < e && *p != '\n');
3146 }
3147 else
3148 break;
3149 }
3150 return p;
3151}
3152
b8c5462f
JH
3153/* Parse POSIX character classes: [[:foo:]], [[=foo=]], [[.foo.]].
3154 Character classes ([:foo:]) can also be negated ([:^foo:]).
3155 Returns a named class id (ANYOF_XXX) if successful, -1 otherwise.
3156 Equivalence classes ([=foo=]) and composites ([.foo.]) are parsed,
3157 but trigger warnings because they are currently unimplemented. */
3158STATIC I32
830247a4 3159S_regpposixcc(pTHX_ RExC_state_t *pRExC_state, I32 value)
620e46c5
JH
3160{
3161 char *posixcc = 0;
936ed897 3162 I32 namedclass = OOB_NAMEDCLASS;
620e46c5 3163
830247a4 3164 if (value == '[' && RExC_parse + 1 < RExC_end &&
620e46c5 3165 /* I smell either [: or [= or [. -- POSIX has been here, right? */
830247a4
IZ
3166 (*RExC_parse == ':' ||
3167 *RExC_parse == '=' ||
3168 *RExC_parse == '.')) {
3169 char c = *RExC_parse;
3170 char* s = RExC_parse++;
b81d288d 3171
830247a4
IZ
3172 while (RExC_parse < RExC_end && *RExC_parse != c)
3173 RExC_parse++;
3174 if (RExC_parse == RExC_end)
620e46c5 3175 /* Grandfather lone [:, [=, [. */
830247a4 3176 RExC_parse = s;
620e46c5 3177 else {
830247a4 3178 char* t = RExC_parse++; /* skip over the c */
b8c5462f 3179
830247a4
IZ
3180 if (*RExC_parse == ']') {
3181 RExC_parse++; /* skip over the ending ] */
b8c5462f
JH
3182 posixcc = s + 1;
3183 if (*s == ':') {
3184 I32 complement = *posixcc == '^' ? *posixcc++ : 0;
3185 I32 skip = 5; /* the most common skip */
3186
3187 switch (*posixcc) {
3188 case 'a':
3189 if (strnEQ(posixcc, "alnum", 5))
3190 namedclass =
3191 complement ? ANYOF_NALNUMC : ANYOF_ALNUMC;
3192 else if (strnEQ(posixcc, "alpha", 5))
3193 namedclass =
3194 complement ? ANYOF_NALPHA : ANYOF_ALPHA;
3195 else if (strnEQ(posixcc, "ascii", 5))
3196 namedclass =
3197 complement ? ANYOF_NASCII : ANYOF_ASCII;
3198 break;
aaa51d5e
JF
3199 case 'b':
3200 if (strnEQ(posixcc, "blank", 5))
3201 namedclass =
3202 complement ? ANYOF_NBLANK : ANYOF_BLANK;
3203 break;
b8c5462f
JH
3204 case 'c':
3205 if (strnEQ(posixcc, "cntrl", 5))
3206 namedclass =
3207 complement ? ANYOF_NCNTRL : ANYOF_CNTRL;
3208 break;
3209 case 'd':
3210 if (strnEQ(posixcc, "digit", 5))
3211 namedclass =
3212 complement ? ANYOF_NDIGIT : ANYOF_DIGIT;
3213 break;
3214 case 'g':
3215 if (strnEQ(posixcc, "graph", 5))
3216 namedclass =
3217 complement ? ANYOF_NGRAPH : ANYOF_GRAPH;
3218 break;
3219 case 'l':
3220 if (strnEQ(posixcc, "lower", 5))
3221 namedclass =
3222 complement ? ANYOF_NLOWER : ANYOF_LOWER;
3223 break;
3224 case 'p':
3225 if (strnEQ(posixcc, "print", 5))
3226 namedclass =
3227 complement ? ANYOF_NPRINT : ANYOF_PRINT;
3228 else if (strnEQ(posixcc, "punct", 5))
3229 namedclass =
3230 complement ? ANYOF_NPUNCT : ANYOF_PUNCT;
3231 break;
3232 case 's':
3233 if (strnEQ(posixcc, "space", 5))
3234 namedclass =
aaa51d5e 3235 complement ? ANYOF_NPSXSPC : ANYOF_PSXSPC;
cc4319de 3236 break;
b8c5462f
JH
3237 case 'u':
3238 if (strnEQ(posixcc, "upper", 5))
3239 namedclass =
3240 complement ? ANYOF_NUPPER : ANYOF_UPPER;
3241 break;
3242 case 'w': /* this is not POSIX, this is the Perl \w */
3243 if (strnEQ(posixcc, "word", 4)) {
3244 namedclass =
3245 complement ? ANYOF_NALNUM : ANYOF_ALNUM;
3246 skip = 4;
3247 }
3248 break;
3249 case 'x':
3250 if (strnEQ(posixcc, "xdigit", 6)) {
3251 namedclass =
3252 complement ? ANYOF_NXDIGIT : ANYOF_XDIGIT;
3253 skip = 6;
3254 }
3255 break;
3256 }
ac561586
JH
3257 if (namedclass == OOB_NAMEDCLASS ||
3258 posixcc[skip] != ':' ||
3259 posixcc[skip+1] != ']')
b45f050a
JF
3260 {
3261 Simple_vFAIL3("POSIX class [:%.*s:] unknown",
3262 t - s - 1, s + 1);
3263 }
3264 } else if (!SIZE_ONLY) {
b8c5462f 3265 /* [[=foo=]] and [[.foo.]] are still future. */
b45f050a 3266
830247a4 3267 /* adjust RExC_parse so the warning shows after
b45f050a 3268 the class closes */
830247a4
IZ
3269 while (*RExC_parse && *RExC_parse != ']')
3270 RExC_parse++;
b45f050a
JF
3271 Simple_vFAIL3("POSIX syntax [%c %c] is reserved for future extensions", c, c);
3272 }
b8c5462f
JH
3273 } else {
3274 /* Maternal grandfather:
3275 * "[:" ending in ":" but not in ":]" */
830247a4 3276 RExC_parse = s;
767d463e 3277 }
620e46c5
JH
3278 }
3279 }
3280
b8c5462f
JH
3281 return namedclass;
3282}
3283
3284STATIC void
830247a4 3285S_checkposixcc(pTHX_ RExC_state_t *pRExC_state)
b8c5462f 3286{
e476b1b5 3287 if (!SIZE_ONLY && ckWARN(WARN_REGEXP) &&
830247a4
IZ
3288 (*RExC_parse == ':' ||
3289 *RExC_parse == '=' ||
3290 *RExC_parse == '.')) {
3291 char *s = RExC_parse;
93733859 3292 char c = *s++;
b8c5462f
JH
3293
3294 while(*s && isALNUM(*s))
3295 s++;
3296 if (*s && c == *s && s[1] == ']') {
b45f050a
JF
3297 vWARN3(s+2, "POSIX syntax [%c %c] belongs inside character classes", c, c);
3298
3299 /* [[=foo=]] and [[.foo.]] are still future. */
b8c5462f 3300 if (c == '=' || c == '.')
b45f050a 3301 {
830247a4 3302 /* adjust RExC_parse so the error shows after
b45f050a 3303 the class closes */
830247a4 3304 while (*RExC_parse && *RExC_parse++ != ']')
b45f050a
JF
3305 ;
3306 Simple_vFAIL3("POSIX syntax [%c %c] is reserved for future extensions", c, c);
3307 }
b8c5462f
JH
3308 }
3309 }
620e46c5
JH
3310}
3311
76e3520e 3312STATIC regnode *
830247a4 3313S_regclass(pTHX_ RExC_state_t *pRExC_state)
a687059c 3314{
ffc61ed2 3315 register UV value;
3568d838 3316 register IV prevvalue = OOB_UNICODE;
ffc61ed2 3317 register IV range = 0;
c277df42 3318 register regnode *ret;
ba210ebe 3319 STRLEN numlen;
ffc61ed2 3320 IV namedclass;
9c5ffd7c 3321 char *rangebegin = 0;
936ed897 3322 bool need_class = 0;
9c5ffd7c 3323 SV *listsv = Nullsv;
ffc61ed2
JH
3324 register char *e;
3325 UV n;
3568d838 3326 bool optimize_invert = TRUE;
ffc61ed2
JH
3327
3328 ret = reganode(pRExC_state, ANYOF, 0);
3329
3330 if (!SIZE_ONLY)
3331 ANYOF_FLAGS(ret) = 0;
3332
3333 if (*RExC_parse == '^') { /* Complement of range. */
3334 RExC_naughty++;
3335 RExC_parse++;
3336 if (!SIZE_ONLY)
3337 ANYOF_FLAGS(ret) |= ANYOF_INVERT;
3338 }
a0d0e21e 3339
936ed897 3340 if (SIZE_ONLY)
830247a4 3341 RExC_size += ANYOF_SKIP;
936ed897 3342 else {
830247a4 3343 RExC_emit += ANYOF_SKIP;
936ed897
IZ
3344 if (FOLD)
3345 ANYOF_FLAGS(ret) |= ANYOF_FOLD;
3346 if (LOC)
3347 ANYOF_FLAGS(ret) |= ANYOF_LOCALE;
ffc61ed2
JH
3348 ANYOF_BITMAP_ZERO(ret);
3349 listsv = newSVpvn("# comment\n", 10);
a0d0e21e 3350 }
b8c5462f 3351
e476b1b5 3352 if (!SIZE_ONLY && ckWARN(WARN_REGEXP))
830247a4 3353 checkposixcc(pRExC_state);
b8c5462f 3354
830247a4 3355 if (*RExC_parse == ']' || *RExC_parse == '-')
ffc61ed2
JH
3356 goto charclassloop; /* allow 1st char to be ] or - */
3357
830247a4 3358 while (RExC_parse < RExC_end && *RExC_parse != ']') {
ffc61ed2
JH
3359
3360 charclassloop:
3361
3362 namedclass = OOB_NAMEDCLASS; /* initialize as illegal */
3363
73b437c8 3364 if (!range)
830247a4 3365 rangebegin = RExC_parse;
ffc61ed2 3366 if (UTF) {
5e12f4fb 3367 value = utf8n_to_uvchr((U8*)RExC_parse,
3568d838
JH
3368 RExC_end - RExC_parse,
3369 &numlen, 0);
ffc61ed2
JH
3370 RExC_parse += numlen;
3371 }
3372 else
3373 value = UCHARAT(RExC_parse++);
620e46c5 3374 if (value == '[')
830247a4 3375 namedclass = regpposixcc(pRExC_state, value);
620e46c5 3376 else if (value == '\\') {
ffc61ed2 3377 if (UTF) {
5e12f4fb 3378 value = utf8n_to_uvchr((U8*)RExC_parse,
ffc61ed2
JH
3379 RExC_end - RExC_parse,
3380 &numlen, 0);
3381 RExC_parse += numlen;
3382 }
3383 else
3384 value = UCHARAT(RExC_parse++);
470c3474 3385 /* Some compilers cannot handle switching on 64-bit integer
ffc61ed2 3386 * values, therefore value cannot be an UV. Yes, this will
e2962f66
JH
3387 * be a problem later if we want switch on Unicode.
3388 * A similar issue a little bit later when switching on
3389 * namedclass. --jhi */
ffc61ed2 3390 switch ((I32)value) {
b8c5462f
JH
3391 case 'w': namedclass = ANYOF_ALNUM; break;
3392 case 'W': namedclass = ANYOF_NALNUM; break;
3393 case 's': namedclass = ANYOF_SPACE; break;
3394 case 'S': namedclass = ANYOF_NSPACE; break;
3395 case 'd': namedclass = ANYOF_DIGIT; break;
3396 case 'D': namedclass = ANYOF_NDIGIT; break;
ffc61ed2
JH
3397 case 'p':
3398 case 'P':
3399 if (*RExC_parse == '{') {
3400 e = strchr(RExC_parse++, '}');
3401 if (!e)
3402 vFAIL("Missing right brace on \\p{}");
3403 n = e - RExC_parse;
3404 }
3405 else {
3406 e = RExC_parse;
3407 n = 1;
3408 }
3409 if (!SIZE_ONLY) {
3410 if (value == 'p')
3411 Perl_sv_catpvf(aTHX_ listsv,
3412 "+utf8::%.*s\n", (int)n, RExC_parse);
3413 else
3414 Perl_sv_catpvf(aTHX_ listsv,
3415 "!utf8::%.*s\n", (int)n, RExC_parse);
3416 }
3417 RExC_parse = e + 1;
3418 ANYOF_FLAGS(ret) |= ANYOF_UNICODE;
3419 continue;
b8c5462f
JH
3420 case 'n': value = '\n'; break;
3421 case 'r': value = '\r'; break;
3422 case 't': value = '\t'; break;
3423 case 'f': value = '\f'; break;
3424 case 'b': value = '\b'; break;
c7f1f016
NIS
3425 case 'e': value = ASCII_TO_NATIVE('\033');break;
3426 case 'a': value = ASCII_TO_NATIVE('\007');break;
b8c5462f 3427 case 'x':
ffc61ed2
JH
3428 if (*RExC_parse == '{') {
3429 e = strchr(RExC_parse++, '}');
b81d288d 3430 if (!e)
ffc61ed2
JH
3431 vFAIL("Missing right brace on \\x{}");
3432 numlen = 1; /* allow underscores */
3433 value = (UV)scan_hex(RExC_parse,
3434 e - RExC_parse,
3435 &numlen);
3436 RExC_parse = e + 1;
3437 }
3438 else {
3439 numlen = 0; /* disallow underscores */
3440 value = (UV)scan_hex(RExC_parse, 2, &numlen);
3441 RExC_parse += numlen;
3442 }
b8c5462f
JH
3443 break;
3444 case 'c':
830247a4 3445 value = UCHARAT(RExC_parse++);
b8c5462f
JH
3446 value = toCTRL(value);
3447 break;
3448 case '0': case '1': case '2': case '3': case '4':
3449 case '5': case '6': case '7': case '8': case '9':
b21ed0a9 3450 numlen = 0; /* disallow underscores */
830247a4
IZ
3451 value = (UV)scan_oct(--RExC_parse, 3, &numlen);
3452 RExC_parse += numlen;
b8c5462f 3453 break;
1028017a 3454 default:
e476b1b5 3455 if (!SIZE_ONLY && ckWARN(WARN_REGEXP) && isALPHA(value))
ffc61ed2
JH
3456 vWARN2(RExC_parse,
3457 "Unrecognized escape \\%c in character class passed through",
3458 (int)value);
1028017a 3459 break;
b8c5462f 3460 }
ffc61ed2
JH
3461 } /* end of \blah */
3462
3463 if (namedclass > OOB_NAMEDCLASS) { /* this is a named class \blah */
3464
3465 if (!SIZE_ONLY && !need_class)
936ed897 3466 ANYOF_CLASS_ZERO(ret);
ffc61ed2 3467
936ed897 3468 need_class = 1;
ffc61ed2
JH
3469
3470 /* a bad range like a-\d, a-[:digit:] ? */
3471 if (range) {
73b437c8 3472 if (!SIZE_ONLY) {
e476b1b5 3473 if (ckWARN(WARN_REGEXP))
830247a4 3474 vWARN4(RExC_parse,
b45f050a 3475 "False [] range \"%*.*s\"",
830247a4
IZ
3476 RExC_parse - rangebegin,
3477 RExC_parse - rangebegin,
b45f050a 3478 rangebegin);
3568d838
JH
3479 if (prevvalue < 256) {
3480 ANYOF_BITMAP_SET(ret, prevvalue);
ffc61ed2
JH
3481 ANYOF_BITMAP_SET(ret, '-');
3482 }
3483 else {
3484 ANYOF_FLAGS(ret) |= ANYOF_UNICODE;
3485 Perl_sv_catpvf(aTHX_ listsv,
3568d838 3486 "%04"UVxf"\n%04"UVxf"\n", (UV)prevvalue, (UV) '-');
ffc61ed2 3487 }
b8c5462f 3488 }
ffc61ed2
JH
3489
3490 range = 0; /* this was not a true range */
73b437c8 3491 }
ffc61ed2 3492
73b437c8 3493 if (!SIZE_ONLY) {
3568d838
JH
3494 if (namedclass > OOB_NAMEDCLASS)
3495 optimize_invert = FALSE;
e2962f66
JH
3496 /* Possible truncation here but in some 64-bit environments
3497 * the compiler gets heartburn about switch on 64-bit values.
3498 * A similar issue a little earlier when switching on value.
98f323fa 3499 * --jhi */
e2962f66 3500 switch ((I32)namedclass) {
73b437c8
JH
3501 case ANYOF_ALNUM:
3502 if (LOC)
936ed897 3503 ANYOF_CLASS_SET(ret, ANYOF_ALNUM);
73b437c8
JH
3504 else {
3505 for (value = 0; value < 256; value++)
3506 if (isALNUM(value))
936ed897 3507 ANYOF_BITMAP_SET(ret, value);
73b437c8 3508 }
ffc61ed2 3509 Perl_sv_catpvf(aTHX_ listsv, "+utf8::IsWord\n");
73b437c8
JH
3510 break;
3511 case ANYOF_NALNUM:
3512 if (LOC)
936ed897 3513 ANYOF_CLASS_SET(ret, ANYOF_NALNUM);
73b437c8
JH
3514 else {
3515 for (value = 0; value < 256; value++)
3516 if (!isALNUM(value))
936ed897 3517 ANYOF_BITMAP_SET(ret, value);
73b437c8 3518 }
ffc61ed2 3519 Perl_sv_catpvf(aTHX_ listsv, "!utf8::IsWord\n");
73b437c8 3520 break;
ffc61ed2 3521 case ANYOF_ALNUMC:
73b437c8 3522 if (LOC)
ffc61ed2 3523 ANYOF_CLASS_SET(ret, ANYOF_ALNUMC);
73b437c8
JH
3524 else {
3525 for (value = 0; value < 256; value++)
ffc61ed2 3526 if (isALNUMC(value))
936ed897 3527 ANYOF_BITMAP_SET(ret, value);
73b437c8 3528 }
ffc61ed2 3529 Perl_sv_catpvf(aTHX_ listsv, "+utf8::IsAlnum\n");
73b437c8
JH
3530 break;
3531 case ANYOF_NALNUMC:
3532 if (LOC)
936ed897 3533 ANYOF_CLASS_SET(ret, ANYOF_NALNUMC);
73b437c8
JH
3534 else {
3535 for (value = 0; value < 256; value++)
3536 if (!isALNUMC(value))
936ed897 3537 ANYOF_BITMAP_SET(ret, value);
73b437c8 3538 }
ffc61ed2 3539 Perl_sv_catpvf(aTHX_ listsv, "!utf8::IsAlnum\n");
73b437c8
JH
3540 break;
3541 case ANYOF_ALPHA:
3542 if (LOC)
936ed897 3543 ANYOF_CLASS_SET(ret, ANYOF_ALPHA);
73b437c8
JH
3544 else {
3545 for (value = 0; value < 256; value++)
3546 if (isALPHA(value))
936ed897 3547 ANYOF_BITMAP_SET(ret, value);
73b437c8 3548 }
ffc61ed2 3549 Perl_sv_catpvf(aTHX_ listsv, "+utf8::IsAlpha\n");
73b437c8
JH
3550 break;
3551 case ANYOF_NALPHA:
3552 if (LOC)
936ed897 3553 ANYOF_CLASS_SET(ret, ANYOF_NALPHA);
73b437c8
JH
3554 else {
3555 for (value = 0; value < 256; value++)
3556 if (!isALPHA(value))
936ed897 3557 ANYOF_BITMAP_SET(ret, value);
73b437c8 3558 }
ffc61ed2 3559 Perl_sv_catpvf(aTHX_ listsv, "!utf8::IsAlpha\n");
73b437c8
JH
3560 break;
3561 case ANYOF_ASCII:
3562 if (LOC)
936ed897 3563 ANYOF_CLASS_SET(ret, ANYOF_ASCII);
73b437c8 3564 else {
c7f1f016 3565#ifndef EBCDIC
1ba5c669
JH
3566 for (value = 0; value < 128; value++)
3567 ANYOF_BITMAP_SET(ret, value);
3568#else /* EBCDIC */
ffbc6a93 3569 for (value = 0; value < 256; value++) {
3a3c4447
JH
3570 if (isASCII(value))
3571 ANYOF_BITMAP_SET(ret, value);
ffbc6a93 3572 }
1ba5c669 3573#endif /* EBCDIC */
73b437c8 3574 }
ffc61ed2 3575 Perl_sv_catpvf(aTHX_ listsv, "+utf8::IsASCII\n");
73b437c8
JH
3576 break;
3577 case ANYOF_NASCII:
3578 if (LOC)
936ed897 3579 ANYOF_CLASS_SET(ret, ANYOF_NASCII);
73b437c8 3580 else {
c7f1f016 3581#ifndef EBCDIC
1ba5c669
JH
3582 for (value = 128; value < 256; value++)
3583 ANYOF_BITMAP_SET(ret, value);
3584#else /* EBCDIC */
ffbc6a93 3585 for (value = 0; value < 256; value++) {
3a3c4447
JH
3586 if (!isASCII(value))
3587 ANYOF_BITMAP_SET(ret, value);
ffbc6a93 3588 }
1ba5c669 3589#endif /* EBCDIC */
73b437c8 3590 }
ffc61ed2 3591 Perl_sv_catpvf(aTHX_ listsv, "!utf8::IsASCII\n");
73b437c8 3592 break;
aaa51d5e
JF
3593 case ANYOF_BLANK:
3594 if (LOC)
3595 ANYOF_CLASS_SET(ret, ANYOF_BLANK);
3596 else {
3597 for (value = 0; value < 256; value++)
3598 if (isBLANK(value))
3599 ANYOF_BITMAP_SET(ret, value);
3600 }
ffc61ed2 3601 Perl_sv_catpvf(aTHX_ listsv, "+utf8::IsBlank\n");
aaa51d5e
JF
3602 break;
3603 case ANYOF_NBLANK:
3604 if (LOC)
3605 ANYOF_CLASS_SET(ret, ANYOF_NBLANK);
3606 else {
3607 for (value = 0; value < 256; value++)
3608 if (!isBLANK(value))
3609 ANYOF_BITMAP_SET(ret, value);
3610 }
ffc61ed2 3611 Perl_sv_catpvf(aTHX_ listsv, "!utf8::IsBlank\n");
aaa51d5e 3612 break;
73b437c8
JH
3613 case ANYOF_CNTRL:
3614 if (LOC)
936ed897 3615 ANYOF_CLASS_SET(ret, ANYOF_CNTRL);
73b437c8
JH
3616 else {
3617 for (value = 0; value < 256; value++)
3618 if (isCNTRL(value))
936ed897 3619 ANYOF_BITMAP_SET(ret, value);
73b437c8 3620 }
ffc61ed2 3621 Perl_sv_catpvf(aTHX_ listsv, "+utf8::IsCntrl\n");
73b437c8
JH
3622 break;
3623 case ANYOF_NCNTRL:
3624 if (LOC)
936ed897 3625 ANYOF_CLASS_SET(ret, ANYOF_NCNTRL);
73b437c8
JH
3626 else {
3627 for (value = 0; value < 256; value++)
3628 if (!isCNTRL(value))
936ed897 3629 ANYOF_BITMAP_SET(ret, value);
73b437c8 3630 }
ffc61ed2
JH
3631 Perl_sv_catpvf(aTHX_ listsv, "!utf8::IsCntrl\n");
3632 break;
3633 case ANYOF_DIGIT:
3634 if (LOC)
3635 ANYOF_CLASS_SET(ret, ANYOF_DIGIT);
3636 else {
3637 /* consecutive digits assumed */
3638 for (value = '0'; value <= '9'; value++)
3639 ANYOF_BITMAP_SET(ret, value);
3640 }
3641 Perl_sv_catpvf(aTHX_ listsv, "+utf8::IsDigit\n");
3642 break;
3643 case ANYOF_NDIGIT:
3644 if (LOC)
3645 ANYOF_CLASS_SET(ret, ANYOF_NDIGIT);
3646 else {
3647 /* consecutive digits assumed */
3648 for (value = 0; value < '0'; value++)
3649 ANYOF_BITMAP_SET(ret, value);
3650 for (value = '9' + 1; value < 256; value++)
3651 ANYOF_BITMAP_SET(ret, value);
3652 }
3653 Perl_sv_catpvf(aTHX_ listsv, "!utf8::IsDigit\n");
73b437c8
JH
3654 break;
3655 case ANYOF_GRAPH:
3656 if (LOC)
936ed897 3657 ANYOF_CLASS_SET(ret, ANYOF_GRAPH);
73b437c8
JH
3658 else {
3659 for (value = 0; value < 256; value++)
3660 if (isGRAPH(value))
936ed897 3661 ANYOF_BITMAP_SET(ret, value);
73b437c8 3662 }
ffc61ed2 3663 Perl_sv_catpvf(aTHX_ listsv, "+utf8::IsGraph\n");
73b437c8
JH
3664 break;
3665 case ANYOF_NGRAPH:
3666 if (LOC)
936ed897 3667 ANYOF_CLASS_SET(ret, ANYOF_NGRAPH);
73b437c8
JH
3668 else {
3669 for (value = 0; value < 256; value++)
3670 if (!isGRAPH(value))
936ed897 3671 ANYOF_BITMAP_SET(ret, value);
73b437c8 3672 }
ffc61ed2 3673 Perl_sv_catpvf(aTHX_ listsv, "!utf8::IsGraph\n");
73b437c8
JH
3674 break;
3675 case ANYOF_LOWER:
3676 if (LOC)
936ed897 3677 ANYOF_CLASS_SET(ret, ANYOF_LOWER);
73b437c8
JH
3678 else {
3679 for (value = 0; value < 256; value++)
3680 if (isLOWER(value))
936ed897 3681 ANYOF_BITMAP_SET(ret, value);
73b437c8 3682 }
ffc61ed2 3683 Perl_sv_catpvf(aTHX_ listsv, "+utf8::IsLower\n");
73b437c8
JH
3684 break;
3685 case ANYOF_NLOWER:
3686 if (LOC)
936ed897 3687 ANYOF_CLASS_SET(ret, ANYOF_NLOWER);
73b437c8
JH
3688 else {
3689 for (value = 0; value < 256; value++)
3690 if (!isLOWER(value))
936ed897 3691 ANYOF_BITMAP_SET(ret, value);
73b437c8 3692 }
ffc61ed2 3693 Perl_sv_catpvf(aTHX_ listsv, "!utf8::IsLower\n");
73b437c8
JH
3694 break;
3695 case ANYOF_PRINT:
3696 if (LOC)
936ed897 3697 ANYOF_CLASS_SET(ret, ANYOF_PRINT);
73b437c8
JH
3698 else {
3699 for (value = 0; value < 256; value++)
3700 if (isPRINT(value))
936ed897 3701 ANYOF_BITMAP_SET(ret, value);
73b437c8 3702 }
ffc61ed2 3703 Perl_sv_catpvf(aTHX_ listsv, "+utf8::IsPrint\n");
73b437c8
JH
3704 break;
3705 case ANYOF_NPRINT:
3706 if (LOC)
936ed897 3707 ANYOF_CLASS_SET(ret, ANYOF_NPRINT);
73b437c8
JH
3708 else {
3709 for (value = 0; value < 256; value++)
3710 if (!isPRINT(value))
936ed897 3711 ANYOF_BITMAP_SET(ret, value);
73b437c8 3712 }
ffc61ed2 3713 Perl_sv_catpvf(aTHX_ listsv, "!utf8::IsPrint\n");
73b437c8 3714 break;
aaa51d5e
JF
3715 case ANYOF_PSXSPC:
3716 if (LOC)
3717 ANYOF_CLASS_SET(ret, ANYOF_PSXSPC);
3718 else {
3719 for (value = 0; value < 256; value++)
3720 if (isPSXSPC(value))
3721 ANYOF_BITMAP_SET(ret, value);
3722 }
ffc61ed2 3723 Perl_sv_catpvf(aTHX_ listsv, "+utf8::IsSpace\n");
aaa51d5e
JF
3724 break;
3725 case ANYOF_NPSXSPC:
3726 if (LOC)
3727 ANYOF_CLASS_SET(ret, ANYOF_NPSXSPC);
3728 else {
3729 for (value = 0; value < 256; value++)
3730 if (!isPSXSPC(value))
3731 ANYOF_BITMAP_SET(ret, value);
3732 }
ffc61ed2 3733 Perl_sv_catpvf(aTHX_ listsv, "!utf8::IsSpace\n");
aaa51d5e 3734 break;
73b437c8
JH
3735 case ANYOF_PUNCT:
3736 if (LOC)
936ed897 3737 ANYOF_CLASS_SET(ret, ANYOF_PUNCT);
73b437c8
JH
3738 else {
3739 for (value = 0; value < 256; value++)
3740 if (isPUNCT(value))
936ed897 3741 ANYOF_BITMAP_SET(ret, value);
73b437c8 3742 }
ffc61ed2 3743 Perl_sv_catpvf(aTHX_ listsv, "+utf8::IsPunct\n");
73b437c8
JH
3744 break;
3745 case ANYOF_NPUNCT:
3746 if (LOC)
936ed897 3747 ANYOF_CLASS_SET(ret, ANYOF_NPUNCT);
73b437c8
JH
3748 else {
3749 for (value = 0; value < 256; value++)
3750 if (!isPUNCT(value))
936ed897 3751 ANYOF_BITMAP_SET(ret, value);
73b437c8 3752 }
ffc61ed2
JH
3753 Perl_sv_catpvf(aTHX_ listsv, "!utf8::IsPunct\n");
3754 break;
3755 case ANYOF_SPACE:
3756 if (LOC)
3757 ANYOF_CLASS_SET(ret, ANYOF_SPACE);
3758 else {
3759 for (value = 0; value < 256; value++)
3760 if (isSPACE(value))
3761 ANYOF_BITMAP_SET(ret, value);
3762 }
3763 Perl_sv_catpvf(aTHX_ listsv, "+utf8::IsSpacePerl\n");
3764 break;
3765 case ANYOF_NSPACE:
3766 if (LOC)
3767 ANYOF_CLASS_SET(ret, ANYOF_NSPACE);
3768 else {
3769 for (value = 0; value < 256; value++)
3770 if (!isSPACE(value))
3771 ANYOF_BITMAP_SET(ret, value);
3772 }
3773 Perl_sv_catpvf(aTHX_ listsv, "!utf8::IsSpacePerl\n");
73b437c8
JH
3774 break;
3775 case ANYOF_UPPER:
3776 if (LOC)
936ed897 3777 ANYOF_CLASS_SET(ret, ANYOF_UPPER);
73b437c8
JH
3778 else {
3779 for (value = 0; value < 256; value++)
3780 if (isUPPER(value))
936ed897 3781 ANYOF_BITMAP_SET(ret, value);
73b437c8 3782 }
ffc61ed2 3783 Perl_sv_catpvf(aTHX_ listsv, "+utf8::IsUpper\n");
73b437c8
JH
3784 break;
3785 case ANYOF_NUPPER:
3786 if (LOC)
936ed897 3787 ANYOF_CLASS_SET(ret, ANYOF_NUPPER);
73b437c8
JH
3788 else {
3789 for (value = 0; value < 256; value++)
3790 if (!isUPPER(value))
936ed897 3791 ANYOF_BITMAP_SET(ret, value);
73b437c8 3792 }
ffc61ed2 3793 Perl_sv_catpvf(aTHX_ listsv, "!utf8::IsUpper\n");
73b437c8
JH
3794 break;
3795 case ANYOF_XDIGIT:
3796 if (LOC)
936ed897 3797 ANYOF_CLASS_SET(ret, ANYOF_XDIGIT);
73b437c8
JH
3798 else {
3799 for (value = 0; value < 256; value++)
3800 if (isXDIGIT(value))
936ed897 3801 ANYOF_BITMAP_SET(ret, value);
73b437c8 3802 }
ffc61ed2 3803 Perl_sv_catpvf(aTHX_ listsv, "+utf8::IsXDigit\n");
73b437c8
JH
3804 break;
3805 case ANYOF_NXDIGIT:
3806 if (LOC)
936ed897 3807 ANYOF_CLASS_SET(ret, ANYOF_NXDIGIT);
73b437c8
JH
3808 else {
3809 for (value = 0; value < 256; value++)
3810 if (!isXDIGIT(value))
936ed897 3811 ANYOF_BITMAP_SET(ret, value);
73b437c8 3812 }
ffc61ed2 3813 Perl_sv_catpvf(aTHX_ listsv, "!utf8::IsXDigit\n");
73b437c8
JH
3814 break;
3815 default:
b45f050a 3816 vFAIL("Invalid [::] class");
73b437c8 3817 break;
b8c5462f 3818 }
b8c5462f 3819 if (LOC)
936ed897 3820 ANYOF_FLAGS(ret) |= ANYOF_CLASS;
73b437c8 3821 continue;
a0d0e21e 3822 }
ffc61ed2
JH
3823 } /* end of namedclass \blah */
3824
a0d0e21e 3825 if (range) {
3a3c4447 3826 if (prevvalue > value) /* b-a */ {
b45f050a 3827 Simple_vFAIL4("Invalid [] range \"%*.*s\"",
830247a4
IZ
3828 RExC_parse - rangebegin,
3829 RExC_parse - rangebegin,
b45f050a 3830 rangebegin);
3568d838 3831 range = 0; /* not a valid range */
73b437c8 3832 }
a0d0e21e
LW
3833 }
3834 else {
3568d838 3835 prevvalue = value; /* save the beginning of the range */
830247a4
IZ
3836 if (*RExC_parse == '-' && RExC_parse+1 < RExC_end &&
3837 RExC_parse[1] != ']') {
3838 RExC_parse++;
ffc61ed2
JH
3839
3840 /* a bad range like \w-, [:word:]- ? */
3841 if (namedclass > OOB_NAMEDCLASS) {
e476b1b5 3842 if (ckWARN(WARN_REGEXP))
830247a4 3843 vWARN4(RExC_parse,
b45f050a 3844 "False [] range \"%*.*s\"",
830247a4
IZ
3845 RExC_parse - rangebegin,
3846 RExC_parse - rangebegin,
b45f050a 3847 rangebegin);
73b437c8 3848 if (!SIZE_ONLY)
936ed897 3849 ANYOF_BITMAP_SET(ret, '-');
73b437c8 3850 } else
ffc61ed2
JH
3851 range = 1; /* yeah, it's a range! */
3852 continue; /* but do it the next time */
a0d0e21e 3853 }
a687059c 3854 }
ffc61ed2 3855
93733859 3856 /* now is the next time */
ae5c130c 3857 if (!SIZE_ONLY) {
3568d838
JH
3858 IV i;
3859
3860 if (prevvalue < 256) {
3861 IV ceilvalue = value < 256 ? value : 255;
3862
3863#ifdef EBCDIC
3a3c4447
JH
3864 if ((isLOWER(prevvalue) && isLOWER(ceilvalue)) ||
3865 (isUPPER(prevvalue) && isUPPER(ceilvalue)))
ffc61ed2 3866 {
3568d838
JH
3867 if (isLOWER(prevvalue)) {
3868 for (i = prevvalue; i <= ceilvalue; i++)
ffc61ed2
JH
3869 if (isLOWER(i))
3870 ANYOF_BITMAP_SET(ret, i);
3871 } else {
3568d838 3872 for (i = prevvalue; i <= ceilvalue; i++)
ffc61ed2
JH
3873 if (isUPPER(i))
3874 ANYOF_BITMAP_SET(ret, i);
3875 }
8ada0baa 3876 }
ffc61ed2 3877 else
8ada0baa 3878#endif
3568d838
JH
3879 for (i = prevvalue; i <= ceilvalue; i++)
3880 ANYOF_BITMAP_SET(ret, i);
3881 }
3882 if (value > 255) {
ffc61ed2 3883 ANYOF_FLAGS(ret) |= ANYOF_UNICODE;
3568d838 3884 if (prevvalue < value)
ffc61ed2 3885 Perl_sv_catpvf(aTHX_ listsv, "%04"UVxf"\t%04"UVxf"\n",
3568d838
JH
3886 (UV)prevvalue, (UV)value);
3887 else if (prevvalue == value)
ffc61ed2
JH
3888 Perl_sv_catpvf(aTHX_ listsv, "%04"UVxf"\n",
3889 (UV)value);
3890 }
8ada0baa 3891 }
ffc61ed2
JH
3892
3893 range = 0; /* this range (if it was one) is done now */
a0d0e21e 3894 }
ffc61ed2 3895
936ed897 3896 if (need_class) {
4f66b38d 3897 ANYOF_FLAGS(ret) |= ANYOF_LARGE;
936ed897 3898 if (SIZE_ONLY)
830247a4 3899 RExC_size += ANYOF_CLASS_ADD_SKIP;
936ed897 3900 else
830247a4 3901 RExC_emit += ANYOF_CLASS_ADD_SKIP;
936ed897 3902 }
ffc61ed2 3903
ae5c130c 3904 /* optimize case-insensitive simple patterns (e.g. /[a-z]/i) */
b8c5462f 3905 if (!SIZE_ONLY &&
ffc61ed2 3906 /* If the only flag is folding (plus possibly inversion). */
516a5887
JH
3907 ((ANYOF_FLAGS(ret) & (ANYOF_FLAGS_ALL ^ ANYOF_INVERT)) == ANYOF_FOLD)
3908 ) {
a0ed51b3 3909 for (value = 0; value < 256; ++value) {
936ed897 3910 if (ANYOF_BITMAP_TEST(ret, value)) {
ffc61ed2
JH
3911 IV fold = PL_fold[value];
3912
3913 if (fold != value)
3914 ANYOF_BITMAP_SET(ret, fold);
ae5c130c
GS
3915 }
3916 }
936ed897 3917 ANYOF_FLAGS(ret) &= ~ANYOF_FOLD;
ae5c130c 3918 }
ffc61ed2 3919
ae5c130c 3920 /* optimize inverted simple patterns (e.g. [^a-z]) */
3568d838 3921 if (!SIZE_ONLY && optimize_invert &&
ffc61ed2
JH
3922 /* If the only flag is inversion. */
3923 (ANYOF_FLAGS(ret) & ANYOF_FLAGS_ALL) == ANYOF_INVERT) {
b8c5462f 3924 for (value = 0; value < ANYOF_BITMAP_SIZE; ++value)
936ed897 3925 ANYOF_BITMAP(ret)[value] ^= ANYOF_FLAGS_ALL;
1aa99e6b 3926 ANYOF_FLAGS(ret) = ANYOF_UNICODE_ALL;
ae5c130c 3927 }
a0d0e21e 3928
b81d288d 3929 if (!SIZE_ONLY) {
fde631ed 3930 AV *av = newAV();
ffc61ed2
JH
3931 SV *rv;
3932
3933 av_store(av, 0, listsv);
3934 av_store(av, 1, NULL);
3935 rv = newRV_noinc((SV*)av);
19860706 3936 n = add_data(pRExC_state, 1, "s");
830247a4 3937 RExC_rx->data->data[n] = (void*)rv;
ffc61ed2 3938 ARG_SET(ret, n);
a0ed51b3
LW
3939 }
3940
3941 return ret;
3942}
3943
76e3520e 3944STATIC char*
830247a4 3945S_nextchar(pTHX_ RExC_state_t *pRExC_state)
a0d0e21e 3946{
830247a4 3947 char* retval = RExC_parse++;
a0d0e21e 3948
4633a7c4 3949 for (;;) {
830247a4
IZ
3950 if (*RExC_parse == '(' && RExC_parse[1] == '?' &&
3951 RExC_parse[2] == '#') {
3952 while (*RExC_parse && *RExC_parse != ')')
3953 RExC_parse++;
3954 RExC_parse++;
4633a7c4
LW
3955 continue;
3956 }
830247a4
IZ
3957 if (RExC_flags16 & PMf_EXTENDED) {
3958 if (isSPACE(*RExC_parse)) {
3959 RExC_parse++;
748a9306
LW
3960 continue;
3961 }
830247a4
IZ
3962 else if (*RExC_parse == '#') {
3963 while (*RExC_parse && *RExC_parse != '\n')
3964 RExC_parse++;
3965 RExC_parse++;
748a9306
LW
3966 continue;
3967 }
748a9306 3968 }
4633a7c4 3969 return retval;
a0d0e21e 3970 }
a687059c
LW
3971}
3972
3973/*
c277df42 3974- reg_node - emit a node
a0d0e21e 3975*/
76e3520e 3976STATIC regnode * /* Location. */
830247a4 3977S_reg_node(pTHX_ RExC_state_t *pRExC_state, U8 op)
a687059c 3978{
c277df42
IZ
3979 register regnode *ret;
3980 register regnode *ptr;
a687059c 3981
830247a4 3982 ret = RExC_emit;
c277df42 3983 if (SIZE_ONLY) {
830247a4
IZ
3984 SIZE_ALIGN(RExC_size);
3985 RExC_size += 1;
a0d0e21e
LW
3986 return(ret);
3987 }
a687059c 3988
c277df42 3989 NODE_ALIGN_FILL(ret);
a0d0e21e 3990 ptr = ret;
c277df42 3991 FILL_ADVANCE_NODE(ptr, op);
fac92740
MJD
3992 if (RExC_offsets) { /* MJD */
3993 MJD_OFFSET_DEBUG((stderr, "%s:%u: (op %s) %s %u <- %u (len %u) (max %u).\n",
3994 "reg_node", __LINE__,
3995 reg_name[op],
3996 RExC_emit - RExC_emit_start > RExC_offsets[0]
3997 ? "Overwriting end of array!\n" : "OK",
3998 RExC_emit - RExC_emit_start,
3999 RExC_parse - RExC_start,
4000 RExC_offsets[0]));
4001 Set_Node_Offset(RExC_emit, RExC_parse + (op == END));
4002 }
4003
830247a4 4004 RExC_emit = ptr;
a687059c 4005
a0d0e21e 4006 return(ret);
a687059c
LW
4007}
4008
4009/*
a0d0e21e
LW
4010- reganode - emit a node with an argument
4011*/
76e3520e 4012STATIC regnode * /* Location. */
830247a4 4013S_reganode(pTHX_ RExC_state_t *pRExC_state, U8 op, U32 arg)
fe14fcc3 4014{
c277df42
IZ
4015 register regnode *ret;
4016 register regnode *ptr;
fe14fcc3 4017
830247a4 4018 ret = RExC_emit;
c277df42 4019 if (SIZE_ONLY) {
830247a4
IZ
4020 SIZE_ALIGN(RExC_size);
4021 RExC_size += 2;
a0d0e21e
LW
4022 return(ret);
4023 }
fe14fcc3 4024
c277df42 4025 NODE_ALIGN_FILL(ret);
a0d0e21e 4026 ptr = ret;
c277df42 4027 FILL_ADVANCE_NODE_ARG(ptr, op, arg);
fac92740
MJD
4028 if (RExC_offsets) { /* MJD */
4029 MJD_OFFSET_DEBUG((stderr, "%s: %s %u <- %u (max %u).\n",
4030 "reganode",
4031 RExC_emit - RExC_emit_start > RExC_offsets[0] ?
4032 "Overwriting end of array!\n" : "OK",
4033 RExC_emit - RExC_emit_start,
4034 RExC_parse - RExC_start,
4035 RExC_offsets[0]));
4036 Set_Cur_Node_Offset;
4037 }
4038
830247a4 4039 RExC_emit = ptr;
fe14fcc3 4040
a0d0e21e 4041 return(ret);
fe14fcc3
LW
4042}
4043
4044/*
cd439c50 4045- reguni - emit (if appropriate) a Unicode character
a0ed51b3
LW
4046*/
4047STATIC void
830247a4 4048S_reguni(pTHX_ RExC_state_t *pRExC_state, UV uv, char* s, STRLEN* lenp)
a0ed51b3 4049{
5e12f4fb 4050 *lenp = SIZE_ONLY ? UNISKIP(uv) : (uvchr_to_utf8((U8*)s, uv) - (U8*)s);
a0ed51b3
LW
4051}
4052
4053/*
a0d0e21e
LW
4054- reginsert - insert an operator in front of already-emitted operand
4055*
4056* Means relocating the operand.
4057*/
76e3520e 4058STATIC void
830247a4 4059S_reginsert(pTHX_ RExC_state_t *pRExC_state, U8 op, regnode *opnd)
a687059c 4060{
c277df42
IZ
4061 register regnode *src;
4062 register regnode *dst;
4063 register regnode *place;
4064 register int offset = regarglen[(U8)op];
b81d288d 4065
22c35a8c 4066/* (PL_regkind[(U8)op] == CURLY ? EXTRA_STEP_2ARGS : 0); */
c277df42
IZ
4067
4068 if (SIZE_ONLY) {
830247a4 4069 RExC_size += NODE_STEP_REGNODE + offset;
a0d0e21e
LW
4070 return;
4071 }
a687059c 4072
830247a4
IZ
4073 src = RExC_emit;
4074 RExC_emit += NODE_STEP_REGNODE + offset;
4075 dst = RExC_emit;
fac92740 4076 while (src > opnd) {
c277df42 4077 StructCopy(--src, --dst, regnode);
fac92740
MJD
4078 if (RExC_offsets) { /* MJD 20010112 */
4079 MJD_OFFSET_DEBUG((stderr, "%s: %s copy %u -> %u (max %u).\n",
4080 "reg_insert",
4081 dst - RExC_emit_start > RExC_offsets[0]
4082 ? "Overwriting end of array!\n" : "OK",
4083 src - RExC_emit_start,
4084 dst - RExC_emit_start,
4085 RExC_offsets[0]));
4086 Set_Node_Offset_To_R(dst-RExC_emit_start, Node_Offset(src));
4087 Set_Node_Length_To_R(dst-RExC_emit_start, Node_Length(src));
4088 }
4089 }
4090
a0d0e21e
LW
4091
4092 place = opnd; /* Op node, where operand used to be. */
fac92740
MJD
4093 if (RExC_offsets) { /* MJD */
4094 MJD_OFFSET_DEBUG((stderr, "%s: %s %u <- %u (max %u).\n",
4095 "reginsert",
4096 place - RExC_emit_start > RExC_offsets[0]
4097 ? "Overwriting end of array!\n" : "OK",
4098 place - RExC_emit_start,
4099 RExC_parse - RExC_start,
4100 RExC_offsets[0]));
4101 Set_Node_Offset(place, RExC_parse);
4102 }
c277df42
IZ
4103 src = NEXTOPER(place);
4104 FILL_ADVANCE_NODE(place, op);
4105 Zero(src, offset, regnode);
a687059c
LW
4106}
4107
4108/*
c277df42 4109- regtail - set the next-pointer at the end of a node chain of p to val.
a0d0e21e 4110*/
76e3520e 4111STATIC void
830247a4 4112S_regtail(pTHX_ RExC_state_t *pRExC_state, regnode *p, regnode *val)
a687059c 4113{
c277df42
IZ
4114 register regnode *scan;
4115 register regnode *temp;
a0d0e21e 4116
c277df42 4117 if (SIZE_ONLY)
a0d0e21e
LW
4118 return;
4119
4120 /* Find last node. */
4121 scan = p;
4122 for (;;) {
4123 temp = regnext(scan);
4124 if (temp == NULL)
4125 break;
4126 scan = temp;
4127 }
a687059c 4128
c277df42
IZ
4129 if (reg_off_by_arg[OP(scan)]) {
4130 ARG_SET(scan, val - scan);
a0ed51b3
LW
4131 }
4132 else {
c277df42
IZ
4133 NEXT_OFF(scan) = val - scan;
4134 }
a687059c
LW
4135}
4136
4137/*
a0d0e21e
LW
4138- regoptail - regtail on operand of first argument; nop if operandless
4139*/
76e3520e 4140STATIC void
830247a4 4141S_regoptail(pTHX_ RExC_state_t *pRExC_state, regnode *p, regnode *val)
a687059c 4142{
a0d0e21e 4143 /* "Operandless" and "op != BRANCH" are synonymous in practice. */
c277df42
IZ
4144 if (p == NULL || SIZE_ONLY)
4145 return;
22c35a8c 4146 if (PL_regkind[(U8)OP(p)] == BRANCH) {
830247a4 4147 regtail(pRExC_state, NEXTOPER(p), val);
a0ed51b3 4148 }
22c35a8c 4149 else if ( PL_regkind[(U8)OP(p)] == BRANCHJ) {
830247a4 4150 regtail(pRExC_state, NEXTOPER(NEXTOPER(p)), val);
a0ed51b3
LW
4151 }
4152 else
a0d0e21e 4153 return;
a687059c
LW
4154}
4155
4156/*
4157 - regcurly - a little FSA that accepts {\d+,?\d*}
4158 */
79072805 4159STATIC I32
cea2e8a9 4160S_regcurly(pTHX_ register char *s)
a687059c
LW
4161{
4162 if (*s++ != '{')
4163 return FALSE;
f0fcb552 4164 if (!isDIGIT(*s))
a687059c 4165 return FALSE;
f0fcb552 4166 while (isDIGIT(*s))
a687059c
LW
4167 s++;
4168 if (*s == ',')
4169 s++;
f0fcb552 4170 while (isDIGIT(*s))
a687059c
LW
4171 s++;
4172 if (*s != '}')
4173 return FALSE;
4174 return TRUE;
4175}
4176
a687059c 4177
76e3520e 4178STATIC regnode *
cea2e8a9 4179S_dumpuntil(pTHX_ regnode *start, regnode *node, regnode *last, SV* sv, I32 l)
c277df42 4180{
35ff7856 4181#ifdef DEBUGGING
f248d071 4182 register U8 op = EXACT; /* Arbitrary non-END op. */
155aba94 4183 register regnode *next;
c277df42
IZ
4184
4185 while (op != END && (!last || node < last)) {
4186 /* While that wasn't END last time... */
4187
4188 NODE_ALIGN(node);
4189 op = OP(node);
4190 if (op == CLOSE)
4191 l--;
4192 next = regnext(node);
4193 /* Where, what. */
4194 if (OP(node) == OPTIMIZED)
4195 goto after_print;
4196 regprop(sv, node);
b900a521 4197 PerlIO_printf(Perl_debug_log, "%4"IVdf":%*s%s", (IV)(node - start),
f1dbda3d 4198 (int)(2*l + 1), "", SvPVX(sv));
c277df42
IZ
4199 if (next == NULL) /* Next ptr. */
4200 PerlIO_printf(Perl_debug_log, "(0)");
b81d288d 4201 else
b900a521 4202 PerlIO_printf(Perl_debug_log, "(%"IVdf")", (IV)(next - start));
c277df42
IZ
4203 (void)PerlIO_putc(Perl_debug_log, '\n');
4204 after_print:
22c35a8c 4205 if (PL_regkind[(U8)op] == BRANCHJ) {
b81d288d
AB
4206 register regnode *nnode = (OP(next) == LONGJMP
4207 ? regnext(next)
c277df42
IZ
4208 : next);
4209 if (last && nnode > last)
4210 nnode = last;
4211 node = dumpuntil(start, NEXTOPER(NEXTOPER(node)), nnode, sv, l + 1);
a0ed51b3 4212 }
22c35a8c 4213 else if (PL_regkind[(U8)op] == BRANCH) {
c277df42 4214 node = dumpuntil(start, NEXTOPER(node), next, sv, l + 1);
a0ed51b3
LW
4215 }
4216 else if ( op == CURLY) { /* `next' might be very big: optimizer */
c277df42
IZ
4217 node = dumpuntil(start, NEXTOPER(node) + EXTRA_STEP_2ARGS,
4218 NEXTOPER(node) + EXTRA_STEP_2ARGS + 1, sv, l + 1);
a0ed51b3 4219 }
22c35a8c 4220 else if (PL_regkind[(U8)op] == CURLY && op != CURLYX) {
c277df42
IZ
4221 node = dumpuntil(start, NEXTOPER(node) + EXTRA_STEP_2ARGS,
4222 next, sv, l + 1);
a0ed51b3
LW
4223 }
4224 else if ( op == PLUS || op == STAR) {
c277df42 4225 node = dumpuntil(start, NEXTOPER(node), NEXTOPER(node) + 1, sv, l + 1);
a0ed51b3
LW
4226 }
4227 else if (op == ANYOF) {
4f66b38d
HS
4228 /* arglen 1 + class block */
4229 node += 1 + ((ANYOF_FLAGS(node) & ANYOF_LARGE)
4230 ? ANYOF_CLASS_SKIP : ANYOF_SKIP);
4231 node = NEXTOPER(node);
a0ed51b3 4232 }
22c35a8c 4233 else if (PL_regkind[(U8)op] == EXACT) {
c277df42 4234 /* Literal string, where present. */
cd439c50 4235 node += NODE_SZ_STR(node) - 1;
c277df42 4236 node = NEXTOPER(node);
a0ed51b3
LW
4237 }
4238 else {
c277df42
IZ
4239 node = NEXTOPER(node);
4240 node += regarglen[(U8)op];
4241 }
4242 if (op == CURLYX || op == OPEN)
4243 l++;
4244 else if (op == WHILEM)
4245 l--;
4246 }
17c3b450 4247#endif /* DEBUGGING */
c277df42
IZ
4248 return node;
4249}
4250
a687059c 4251/*
fd181c75 4252 - regdump - dump a regexp onto Perl_debug_log in vaguely comprehensible form
a687059c
LW
4253 */
4254void
864dbfa3 4255Perl_regdump(pTHX_ regexp *r)
a687059c 4256{
35ff7856 4257#ifdef DEBUGGING
46fc3d4c 4258 SV *sv = sv_newmortal();
a687059c 4259
c277df42 4260 (void)dumpuntil(r->program, r->program + 1, NULL, sv, 0);
a0d0e21e
LW
4261
4262 /* Header fields of interest. */
c277df42 4263 if (r->anchored_substr)
7b0972df 4264 PerlIO_printf(Perl_debug_log,
b81d288d 4265 "anchored `%s%.*s%s'%s at %"IVdf" ",
3280af22 4266 PL_colors[0],
7b0972df 4267 (int)(SvCUR(r->anchored_substr) - (SvTAIL(r->anchored_substr)!=0)),
b81d288d 4268 SvPVX(r->anchored_substr),
3280af22 4269 PL_colors[1],
c277df42 4270 SvTAIL(r->anchored_substr) ? "$" : "",
7b0972df 4271 (IV)r->anchored_offset);
c277df42 4272 if (r->float_substr)
7b0972df 4273 PerlIO_printf(Perl_debug_log,
b81d288d 4274 "floating `%s%.*s%s'%s at %"IVdf"..%"UVuf" ",
3280af22 4275 PL_colors[0],
b81d288d 4276 (int)(SvCUR(r->float_substr) - (SvTAIL(r->float_substr)!=0)),
2c2d71f5 4277 SvPVX(r->float_substr),
3280af22 4278 PL_colors[1],
c277df42 4279 SvTAIL(r->float_substr) ? "$" : "",
7b0972df 4280 (IV)r->float_min_offset, (UV)r->float_max_offset);
c277df42 4281 if (r->check_substr)
b81d288d
AB
4282 PerlIO_printf(Perl_debug_log,
4283 r->check_substr == r->float_substr
c277df42
IZ
4284 ? "(checking floating" : "(checking anchored");
4285 if (r->reganch & ROPT_NOSCAN)
4286 PerlIO_printf(Perl_debug_log, " noscan");
4287 if (r->reganch & ROPT_CHECK_ALL)
4288 PerlIO_printf(Perl_debug_log, " isall");
4289 if (r->check_substr)
4290 PerlIO_printf(Perl_debug_log, ") ");
4291
46fc3d4c 4292 if (r->regstclass) {
4293 regprop(sv, r->regstclass);
4294 PerlIO_printf(Perl_debug_log, "stclass `%s' ", SvPVX(sv));
4295 }
774d564b 4296 if (r->reganch & ROPT_ANCH) {
4297 PerlIO_printf(Perl_debug_log, "anchored");
4298 if (r->reganch & ROPT_ANCH_BOL)
4299 PerlIO_printf(Perl_debug_log, "(BOL)");
c277df42
IZ
4300 if (r->reganch & ROPT_ANCH_MBOL)
4301 PerlIO_printf(Perl_debug_log, "(MBOL)");
cad2e5aa
JH
4302 if (r->reganch & ROPT_ANCH_SBOL)
4303 PerlIO_printf(Perl_debug_log, "(SBOL)");
774d564b 4304 if (r->reganch & ROPT_ANCH_GPOS)
4305 PerlIO_printf(Perl_debug_log, "(GPOS)");
4306 PerlIO_putc(Perl_debug_log, ' ');
4307 }
c277df42
IZ
4308 if (r->reganch & ROPT_GPOS_SEEN)
4309 PerlIO_printf(Perl_debug_log, "GPOS ");
a0d0e21e 4310 if (r->reganch & ROPT_SKIP)
760ac839 4311 PerlIO_printf(Perl_debug_log, "plus ");
a0d0e21e 4312 if (r->reganch & ROPT_IMPLICIT)
760ac839 4313 PerlIO_printf(Perl_debug_log, "implicit ");
760ac839 4314 PerlIO_printf(Perl_debug_log, "minlen %ld ", (long) r->minlen);
ce862d02
IZ
4315 if (r->reganch & ROPT_EVAL_SEEN)
4316 PerlIO_printf(Perl_debug_log, "with eval ");
760ac839 4317 PerlIO_printf(Perl_debug_log, "\n");
fac92740
MJD
4318 if (r->offsets) {
4319 U32 i;
4320 U32 len = r->offsets[0];
392fbf5d 4321 PerlIO_printf(Perl_debug_log, "Offsets: [%"UVuf"]\n\t", (UV)r->offsets[0]);
fac92740 4322 for (i = 1; i <= len; i++)
392fbf5d
RB
4323 PerlIO_printf(Perl_debug_log, "%"UVuf"[%"UVuf"] ",
4324 (UV)r->offsets[i*2-1],
4325 (UV)r->offsets[i*2]);
fac92740
MJD
4326 PerlIO_printf(Perl_debug_log, "\n");
4327 }
17c3b450 4328#endif /* DEBUGGING */
a687059c
LW
4329}
4330
653099ff
GS
4331STATIC void
4332S_put_byte(pTHX_ SV *sv, int c)
4333{
ffc61ed2 4334 if (isCNTRL(c) || c == 127 || c == 255 || !isPRINT(c))
653099ff
GS
4335 Perl_sv_catpvf(aTHX_ sv, "\\%o", c);
4336 else if (c == '-' || c == ']' || c == '\\' || c == '^')
4337 Perl_sv_catpvf(aTHX_ sv, "\\%c", c);
4338 else
4339 Perl_sv_catpvf(aTHX_ sv, "%c", c);
4340}
4341
a687059c 4342/*
a0d0e21e
LW
4343- regprop - printable representation of opcode
4344*/
46fc3d4c 4345void
864dbfa3 4346Perl_regprop(pTHX_ SV *sv, regnode *o)
a687059c 4347{
35ff7856 4348#ifdef DEBUGGING
9b155405 4349 register int k;
a0d0e21e 4350
54dc92de 4351 sv_setpvn(sv, "", 0);
9b155405 4352 if (OP(o) >= reg_num) /* regnode.type is unsigned */
830247a4
IZ
4353 /* It would be nice to FAIL() here, but this may be called from
4354 regexec.c, and it would be hard to supply pRExC_state. */
4355 Perl_croak(aTHX_ "Corrupted regexp opcode");
9b155405
IZ
4356 sv_catpv(sv, (char*)reg_name[OP(o)]); /* Take off const! */
4357
4358 k = PL_regkind[(U8)OP(o)];
4359
4360 if (k == EXACT)
7821416a 4361 Perl_sv_catpvf(aTHX_ sv, " <%s%.*s%s>", PL_colors[0],
cd439c50 4362 STR_LEN(o), STRING(o), PL_colors[1]);
9b155405 4363 else if (k == CURLY) {
cb434fcc 4364 if (OP(o) == CURLYM || OP(o) == CURLYN || OP(o) == CURLYX)
cea2e8a9
GS
4365 Perl_sv_catpvf(aTHX_ sv, "[%d]", o->flags); /* Parenth number */
4366 Perl_sv_catpvf(aTHX_ sv, " {%d,%d}", ARG1(o), ARG2(o));
a0d0e21e 4367 }
2c2d71f5
JH
4368 else if (k == WHILEM && o->flags) /* Ordinal/of */
4369 Perl_sv_catpvf(aTHX_ sv, "[%d/%d]", o->flags & 0xf, o->flags>>4);
9b155405 4370 else if (k == REF || k == OPEN || k == CLOSE || k == GROUPP )
894356b3 4371 Perl_sv_catpvf(aTHX_ sv, "%d", (int)ARG(o)); /* Parenth number */
9b155405 4372 else if (k == LOGICAL)
04ebc1ab 4373 Perl_sv_catpvf(aTHX_ sv, "[%d]", o->flags); /* 2: embedded, otherwise 1 */
653099ff
GS
4374 else if (k == ANYOF) {
4375 int i, rangestart = -1;
ffc61ed2 4376 U8 flags = ANYOF_FLAGS(o);
19860706
JH
4377 const char * const anyofs[] = { /* Should be syncronized with
4378 * ANYOF_ #xdefines in regcomp.h */
653099ff
GS
4379 "\\w",
4380 "\\W",
4381 "\\s",
4382 "\\S",
4383 "\\d",
4384 "\\D",
4385 "[:alnum:]",
4386 "[:^alnum:]",
4387 "[:alpha:]",
4388 "[:^alpha:]",
4389 "[:ascii:]",
4390 "[:^ascii:]",
4391 "[:ctrl:]",
4392 "[:^ctrl:]",
4393 "[:graph:]",
4394 "[:^graph:]",
4395 "[:lower:]",
4396 "[:^lower:]",
4397 "[:print:]",
4398 "[:^print:]",
4399 "[:punct:]",
4400 "[:^punct:]",
4401 "[:upper:]",
aaa51d5e 4402 "[:^upper:]",
653099ff 4403 "[:xdigit:]",
aaa51d5e
JF
4404 "[:^xdigit:]",
4405 "[:space:]",
4406 "[:^space:]",
4407 "[:blank:]",
4408 "[:^blank:]"
653099ff
GS
4409 };
4410
19860706 4411 if (flags & ANYOF_LOCALE)
653099ff 4412 sv_catpv(sv, "{loc}");
19860706 4413 if (flags & ANYOF_FOLD)
653099ff
GS
4414 sv_catpv(sv, "{i}");
4415 Perl_sv_catpvf(aTHX_ sv, "[%s", PL_colors[0]);
19860706 4416 if (flags & ANYOF_INVERT)
653099ff 4417 sv_catpv(sv, "^");
ffc61ed2
JH
4418 for (i = 0; i <= 256; i++) {
4419 if (i < 256 && ANYOF_BITMAP_TEST(o,i)) {
4420 if (rangestart == -1)
4421 rangestart = i;
4422 } else if (rangestart != -1) {
4423 if (i <= rangestart + 3)
4424 for (; rangestart < i; rangestart++)
653099ff 4425 put_byte(sv, rangestart);
ffc61ed2
JH
4426 else {
4427 put_byte(sv, rangestart);
4428 sv_catpv(sv, "-");
4429 put_byte(sv, i - 1);
653099ff 4430 }
ffc61ed2 4431 rangestart = -1;
653099ff 4432 }
847a199f 4433 }
ffc61ed2
JH
4434
4435 if (o->flags & ANYOF_CLASS)
4436 for (i = 0; i < sizeof(anyofs)/sizeof(char*); i++)
4437 if (ANYOF_CLASS_TEST(o,i))
4438 sv_catpv(sv, anyofs[i]);
4439
4440 if (flags & ANYOF_UNICODE)
4441 sv_catpv(sv, "{unicode}");
1aa99e6b
IH
4442 else if (flags & ANYOF_UNICODE_ALL)
4443 sv_catpv(sv, "{all-unicode}");
ffc61ed2
JH
4444
4445 {
4446 SV *lv;
4447 SV *sw = regclass_swash(o, FALSE, &lv);
b81d288d 4448
ffc61ed2
JH
4449 if (lv) {
4450 if (sw) {
4451 UV i;
4452 U8 s[UTF8_MAXLEN+1];
b81d288d 4453
ffc61ed2 4454 for (i = 0; i <= 256; i++) { /* just the first 256 */
2b9d42f0 4455 U8 *e = uvchr_to_utf8(s, i);
ffc61ed2 4456
3568d838 4457 if (i < 256 && swash_fetch(sw, s, TRUE)) {
ffc61ed2
JH
4458 if (rangestart == -1)
4459 rangestart = i;
4460 } else if (rangestart != -1) {
4461 U8 *p;
b81d288d 4462
ffc61ed2
JH
4463 if (i <= rangestart + 3)
4464 for (; rangestart < i; rangestart++) {
2b9d42f0 4465 for(e = uvchr_to_utf8(s, rangestart), p = s; p < e; p++)
ffc61ed2
JH
4466 put_byte(sv, *p);
4467 }
4468 else {
2b9d42f0 4469 for (e = uvchr_to_utf8(s, rangestart), p = s; p < e; p++)
ffc61ed2
JH
4470 put_byte(sv, *p);
4471 sv_catpv(sv, "-");
2b9d42f0 4472 for (e = uvchr_to_utf8(s, i - 1), p = s; p < e; p++)
ffc61ed2
JH
4473 put_byte(sv, *p);
4474 }
4475 rangestart = -1;
4476 }
19860706 4477 }
ffc61ed2
JH
4478
4479 sv_catpv(sv, "..."); /* et cetera */
19860706 4480 }
fde631ed 4481
ffc61ed2
JH
4482 {
4483 char *s = savepv(SvPVX(lv));
4484 char *origs = s;
b81d288d 4485
ffc61ed2 4486 while(*s && *s != '\n') s++;
b81d288d 4487
ffc61ed2
JH
4488 if (*s == '\n') {
4489 char *t = ++s;
4490
4491 while (*s) {
4492 if (*s == '\n')
4493 *s = ' ';
4494 s++;
4495 }
4496 if (s[-1] == ' ')
4497 s[-1] = 0;
4498
4499 sv_catpv(sv, t);
fde631ed 4500 }
b81d288d 4501
ffc61ed2 4502 Safefree(origs);
fde631ed
JH
4503 }
4504 }
653099ff 4505 }
ffc61ed2 4506
653099ff
GS
4507 Perl_sv_catpvf(aTHX_ sv, "%s]", PL_colors[1]);
4508 }
9b155405 4509 else if (k == BRANCHJ && (OP(o) == UNLESSM || OP(o) == IFMATCH))
cea2e8a9 4510 Perl_sv_catpvf(aTHX_ sv, "[-%d]", o->flags);
17c3b450 4511#endif /* DEBUGGING */
35ff7856 4512}
a687059c 4513
cad2e5aa
JH
4514SV *
4515Perl_re_intuit_string(pTHX_ regexp *prog)
4516{ /* Assume that RE_INTUIT is set */
4517 DEBUG_r(
4518 { STRLEN n_a;
4519 char *s = SvPV(prog->check_substr,n_a);
4520
4521 if (!PL_colorset) reginitcolors();
4522 PerlIO_printf(Perl_debug_log,
4523 "%sUsing REx substr:%s `%s%.60s%s%s'\n",
4524 PL_colors[4],PL_colors[5],PL_colors[0],
4525 s,
4526 PL_colors[1],
4527 (strlen(s) > 60 ? "..." : ""));
4528 } );
4529
4530 return prog->check_substr;
4531}
4532
2b69d0c2 4533void
864dbfa3 4534Perl_pregfree(pTHX_ struct regexp *r)
a687059c 4535{
adac82c7 4536 DEBUG_r(if (!PL_colorset) reginitcolors());
7821416a
IZ
4537
4538 if (!r || (--r->refcnt > 0))
4539 return;
cad2e5aa
JH
4540 DEBUG_r(PerlIO_printf(Perl_debug_log,
4541 "%sFreeing REx:%s `%s%.60s%s%s'\n",
4542 PL_colors[4],PL_colors[5],PL_colors[0],
4543 r->precomp,
4544 PL_colors[1],
4545 (strlen(r->precomp) > 60 ? "..." : "")));
4546
c277df42 4547 if (r->precomp)
a0d0e21e 4548 Safefree(r->precomp);
fac92740
MJD
4549 if (r->offsets) /* 20010421 MJD */
4550 Safefree(r->offsets);
cf93c79d
IZ
4551 if (RX_MATCH_COPIED(r))
4552 Safefree(r->subbeg);
a193d654
GS
4553 if (r->substrs) {
4554 if (r->anchored_substr)
4555 SvREFCNT_dec(r->anchored_substr);
4556 if (r->float_substr)
4557 SvREFCNT_dec(r->float_substr);
2779dcf1 4558 Safefree(r->substrs);
a193d654 4559 }
c277df42
IZ
4560 if (r->data) {
4561 int n = r->data->count;
dfad63ad
HS
4562 AV* new_comppad = NULL;
4563 AV* old_comppad;
4564 SV** old_curpad;
4565
c277df42
IZ
4566 while (--n >= 0) {
4567 switch (r->data->what[n]) {
4568 case 's':
4569 SvREFCNT_dec((SV*)r->data->data[n]);
4570 break;
653099ff
GS
4571 case 'f':
4572 Safefree(r->data->data[n]);
4573 break;
dfad63ad
HS
4574 case 'p':
4575 new_comppad = (AV*)r->data->data[n];
4576 break;
c277df42 4577 case 'o':
dfad63ad 4578 if (new_comppad == NULL)
cea2e8a9 4579 Perl_croak(aTHX_ "panic: pregfree comppad");
dfad63ad
HS
4580 old_comppad = PL_comppad;
4581 old_curpad = PL_curpad;
1e6dc0b6
SB
4582 /* Watch out for global destruction's random ordering. */
4583 if (SvTYPE(new_comppad) == SVt_PVAV) {
4584 PL_comppad = new_comppad;
4585 PL_curpad = AvARRAY(new_comppad);
4586 }
4587 else
4588 PL_curpad = NULL;
c277df42 4589 op_free((OP_4tree*)r->data->data[n]);
dfad63ad
HS
4590 PL_comppad = old_comppad;
4591 PL_curpad = old_curpad;
4592 SvREFCNT_dec((SV*)new_comppad);
4593 new_comppad = NULL;
c277df42
IZ
4594 break;
4595 case 'n':
4596 break;
4597 default:
830247a4 4598 Perl_croak(aTHX_ "panic: regfree data code '%c'", r->data->what[n]);
c277df42
IZ
4599 }
4600 }
4601 Safefree(r->data->what);
4602 Safefree(r->data);
a0d0e21e
LW
4603 }
4604 Safefree(r->startp);
4605 Safefree(r->endp);
4606 Safefree(r);
a687059c 4607}
c277df42
IZ
4608
4609/*
4610 - regnext - dig the "next" pointer out of a node
4611 *
4612 * [Note, when REGALIGN is defined there are two places in regmatch()
4613 * that bypass this code for speed.]
4614 */
4615regnode *
864dbfa3 4616Perl_regnext(pTHX_ register regnode *p)
c277df42
IZ
4617{
4618 register I32 offset;
4619
3280af22 4620 if (p == &PL_regdummy)
c277df42
IZ
4621 return(NULL);
4622
4623 offset = (reg_off_by_arg[OP(p)] ? ARG(p) : NEXT_OFF(p));
4624 if (offset == 0)
4625 return(NULL);
4626
c277df42 4627 return(p+offset);
c277df42
IZ
4628}
4629
01f988be 4630STATIC void
cea2e8a9 4631S_re_croak2(pTHX_ const char* pat1,const char* pat2,...)
c277df42
IZ
4632{
4633 va_list args;
4634 STRLEN l1 = strlen(pat1);
4635 STRLEN l2 = strlen(pat2);
4636 char buf[512];
06bf62c7 4637 SV *msv;
c277df42
IZ
4638 char *message;
4639
4640 if (l1 > 510)
4641 l1 = 510;
4642 if (l1 + l2 > 510)
4643 l2 = 510 - l1;
4644 Copy(pat1, buf, l1 , char);
4645 Copy(pat2, buf + l1, l2 , char);
3b818b81
GS
4646 buf[l1 + l2] = '\n';
4647 buf[l1 + l2 + 1] = '\0';
8736538c
AS
4648#ifdef I_STDARG
4649 /* ANSI variant takes additional second argument */
c277df42 4650 va_start(args, pat2);
8736538c
AS
4651#else
4652 va_start(args);
4653#endif
5a844595 4654 msv = vmess(buf, &args);
c277df42 4655 va_end(args);
06bf62c7 4656 message = SvPV(msv,l1);
c277df42
IZ
4657 if (l1 > 512)
4658 l1 = 512;
4659 Copy(message, buf, l1 , char);
4660 buf[l1] = '\0'; /* Overwrite \n */
cea2e8a9 4661 Perl_croak(aTHX_ "%s", buf);
c277df42 4662}
a0ed51b3
LW
4663
4664/* XXX Here's a total kludge. But we need to re-enter for swash routines. */
4665
4666void
864dbfa3 4667Perl_save_re_context(pTHX)
b81d288d 4668{
830247a4
IZ
4669#if 0
4670 SAVEPPTR(RExC_precomp); /* uncompiled string. */
4671 SAVEI32(RExC_npar); /* () count. */
4672 SAVEI32(RExC_size); /* Code size. */
4673 SAVEI16(RExC_flags16); /* are we folding, multilining? */
4674 SAVEVPTR(RExC_rx); /* from regcomp.c */
4675 SAVEI32(RExC_seen); /* from regcomp.c */
4676 SAVEI32(RExC_sawback); /* Did we see \1, ...? */
4677 SAVEI32(RExC_naughty); /* How bad is this pattern? */
4678 SAVEVPTR(RExC_emit); /* Code-emit pointer; &regdummy = don't */
4679 SAVEPPTR(RExC_end); /* End of input for compile */
4680 SAVEPPTR(RExC_parse); /* Input-scan pointer. */
4681#endif
4682
4683 SAVEI32(PL_reg_flags); /* from regexec.c */
a0ed51b3 4684 SAVEPPTR(PL_bostr);
a0ed51b3
LW
4685 SAVEPPTR(PL_reginput); /* String-input pointer. */
4686 SAVEPPTR(PL_regbol); /* Beginning of input, for ^ check. */
4687 SAVEPPTR(PL_regeol); /* End of input, for $ check. */
7766f137
GS
4688 SAVEVPTR(PL_regstartp); /* Pointer to startp array. */
4689 SAVEVPTR(PL_regendp); /* Ditto for endp. */
4690 SAVEVPTR(PL_reglastparen); /* Similarly for lastparen. */
a0ed51b3 4691 SAVEPPTR(PL_regtill); /* How far we are required to go. */
b81d288d 4692 SAVEGENERICPV(PL_reg_start_tmp); /* from regexec.c */
a0ed51b3 4693 PL_reg_start_tmp = 0;
a0ed51b3
LW
4694 SAVEI32(PL_reg_start_tmpl); /* from regexec.c */
4695 PL_reg_start_tmpl = 0;
7766f137 4696 SAVEVPTR(PL_regdata);
a0ed51b3
LW
4697 SAVEI32(PL_reg_eval_set); /* from regexec.c */
4698 SAVEI32(PL_regnarrate); /* from regexec.c */
7766f137 4699 SAVEVPTR(PL_regprogram); /* from regexec.c */
a0ed51b3 4700 SAVEINT(PL_regindent); /* from regexec.c */
7766f137
GS
4701 SAVEVPTR(PL_regcc); /* from regexec.c */
4702 SAVEVPTR(PL_curcop);
7766f137
GS
4703 SAVEVPTR(PL_reg_call_cc); /* from regexec.c */
4704 SAVEVPTR(PL_reg_re); /* from regexec.c */
54b6e2fa
IZ
4705 SAVEPPTR(PL_reg_ganch); /* from regexec.c */
4706 SAVESPTR(PL_reg_sv); /* from regexec.c */
7766f137 4707 SAVEVPTR(PL_reg_magic); /* from regexec.c */
54b6e2fa 4708 SAVEI32(PL_reg_oldpos); /* from regexec.c */
7766f137
GS
4709 SAVEVPTR(PL_reg_oldcurpm); /* from regexec.c */
4710 SAVEVPTR(PL_reg_curpm); /* from regexec.c */
5fb7366e 4711 SAVEI32(PL_regnpar); /* () count. */
e49a9654 4712 SAVEI32(PL_regsize); /* from regexec.c */
54b6e2fa 4713#ifdef DEBUGGING
b81d288d 4714 SAVEPPTR(PL_reg_starttry); /* from regexec.c */
54b6e2fa 4715#endif
a0ed51b3 4716}
51371543
GS
4717
4718#ifdef PERL_OBJECT
51371543
GS
4719#include "XSUB.h"
4720#undef this
4721#define this pPerl
4722#endif
4723
4724static void
4725clear_re(pTHXo_ void *r)
4726{
4727 ReREFCNT_dec((regexp *)r);
4728}
ffbc6a93 4729