This is a live mirror of the Perl 5 development currently hosted at https://github.com/perl/perl5
Delta delta.
[perl5.git] / regcomp.c
CommitLineData
a0d0e21e
LW
1/* regcomp.c
2 */
3
4/*
5 * "A fair jaw-cracker dwarf-language must be." --Samwise Gamgee
6 */
7
a687059c
LW
8/* NOTE: this is derived from Henry Spencer's regexp code, and should not
9 * confused with the original package (see point 3 below). Thanks, Henry!
10 */
11
12/* Additional note: this code is very heavily munged from Henry's version
13 * in places. In some spots I've traded clarity for efficiency, so don't
14 * blame Henry for some of the lack of readability.
15 */
16
e50aee73
AD
17/* The names of the functions have been changed from regcomp and
18 * regexec to pregcomp and pregexec in order to avoid conflicts
19 * with the POSIX routines of the same names.
20*/
21
b9d5759e
AD
22#ifdef PERL_EXT_RE_BUILD
23/* need to replace pregcomp et al, so enable that */
24# ifndef PERL_IN_XSUB_RE
25# define PERL_IN_XSUB_RE
26# endif
27/* need access to debugger hooks */
cad2e5aa 28# if defined(PERL_EXT_RE_DEBUG) && !defined(DEBUGGING)
b9d5759e
AD
29# define DEBUGGING
30# endif
31#endif
32
33#ifdef PERL_IN_XSUB_RE
d06ea78c 34/* We *really* need to overwrite these symbols: */
56953603
IZ
35# define Perl_pregcomp my_regcomp
36# define Perl_regdump my_regdump
37# define Perl_regprop my_regprop
d06ea78c 38# define Perl_pregfree my_regfree
cad2e5aa
JH
39# define Perl_re_intuit_string my_re_intuit_string
40/* *These* symbols are masked to allow static link. */
d06ea78c 41# define Perl_regnext my_regnext
f0b8d043 42# define Perl_save_re_context my_save_re_context
b81d288d 43# define Perl_reginitcolors my_reginitcolors
c5be433b
GS
44
45# define PERL_NO_GET_CONTEXT
b81d288d 46#endif
56953603 47
f0fcb552 48/*SUPPRESS 112*/
a687059c 49/*
e50aee73 50 * pregcomp and pregexec -- regsub and regerror are not used in perl
a687059c
LW
51 *
52 * Copyright (c) 1986 by University of Toronto.
53 * Written by Henry Spencer. Not derived from licensed software.
54 *
55 * Permission is granted to anyone to use this software for any
56 * purpose on any computer system, and to redistribute it freely,
57 * subject to the following restrictions:
58 *
59 * 1. The author is not responsible for the consequences of use of
60 * this software, no matter how awful, even if they arise
61 * from defects in it.
62 *
63 * 2. The origin of this software must not be misrepresented, either
64 * by explicit claim or by omission.
65 *
66 * 3. Altered versions must be plainly marked as such, and must not
67 * be misrepresented as being the original software.
68 *
69 *
70 **** Alterations to Henry's code are...
71 ****
bc89e66f 72 **** Copyright (c) 1991-2001, Larry Wall
a687059c 73 ****
9ef589d8
LW
74 **** You may distribute under the terms of either the GNU General Public
75 **** License or the Artistic License, as specified in the README file.
76
a687059c
LW
77 *
78 * Beware that some of this code is subtly aware of the way operator
79 * precedence is structured in regular expressions. Serious changes in
80 * regular-expression syntax might require a total rethink.
81 */
82#include "EXTERN.h"
864dbfa3 83#define PERL_IN_REGCOMP_C
a687059c 84#include "perl.h"
d06ea78c 85
c5be433b
GS
86#ifdef PERL_IN_XSUB_RE
87# if defined(PERL_CAPI) || defined(PERL_OBJECT)
88# include "XSUB.h"
89# endif
90#else
d06ea78c
GS
91# include "INTERN.h"
92#endif
c277df42
IZ
93
94#define REG_COMP_C
a687059c
LW
95#include "regcomp.h"
96
d4cce5f1 97#ifdef op
11343788 98#undef op
d4cce5f1 99#endif /* op */
11343788 100
fe14fcc3
LW
101#ifdef MSDOS
102# if defined(BUGGY_MSC6)
103 /* MSC 6.00A breaks on op/regexp.t test 85 unless we turn this off */
104 # pragma optimize("a",off)
105 /* But MSC 6.00A is happy with 'w', for aliases only across function calls*/
106 # pragma optimize("w",on )
107# endif /* BUGGY_MSC6 */
108#endif /* MSDOS */
109
a687059c
LW
110#ifndef STATIC
111#define STATIC static
112#endif
113
830247a4
IZ
114typedef struct RExC_state_t {
115 U16 flags16; /* are we folding, multilining? */
116 char *precomp; /* uncompiled string. */
117 regexp *rx;
fac92740 118 char *start; /* Start of input for compile */
830247a4
IZ
119 char *end; /* End of input for compile */
120 char *parse; /* Input-scan pointer. */
121 I32 whilem_seen; /* number of WHILEM in this expr */
fac92740 122 regnode *emit_start; /* Start of emitted-code area */
ffc61ed2 123 regnode *emit; /* Code-emit pointer; &regdummy = don't = compiling */
830247a4
IZ
124 I32 naughty; /* How bad is this pattern? */
125 I32 sawback; /* Did we see \1, ...? */
126 U32 seen;
127 I32 size; /* Code size. */
128 I32 npar; /* () count. */
129 I32 extralen;
130 I32 seen_zerolen;
131 I32 seen_evals;
1aa99e6b 132 I32 utf8;
830247a4
IZ
133#if ADD_TO_REGEXEC
134 char *starttry; /* -Dr: where regtry was called. */
135#define RExC_starttry (pRExC_state->starttry)
136#endif
137} RExC_state_t;
138
139#define RExC_flags16 (pRExC_state->flags16)
140#define RExC_precomp (pRExC_state->precomp)
141#define RExC_rx (pRExC_state->rx)
fac92740 142#define RExC_start (pRExC_state->start)
830247a4
IZ
143#define RExC_end (pRExC_state->end)
144#define RExC_parse (pRExC_state->parse)
145#define RExC_whilem_seen (pRExC_state->whilem_seen)
fac92740 146#define RExC_offsets (pRExC_state->rx->offsets) /* I am not like the others */
830247a4 147#define RExC_emit (pRExC_state->emit)
fac92740 148#define RExC_emit_start (pRExC_state->emit_start)
830247a4
IZ
149#define RExC_naughty (pRExC_state->naughty)
150#define RExC_sawback (pRExC_state->sawback)
151#define RExC_seen (pRExC_state->seen)
152#define RExC_size (pRExC_state->size)
153#define RExC_npar (pRExC_state->npar)
154#define RExC_extralen (pRExC_state->extralen)
155#define RExC_seen_zerolen (pRExC_state->seen_zerolen)
156#define RExC_seen_evals (pRExC_state->seen_evals)
1aa99e6b 157#define RExC_utf8 (pRExC_state->utf8)
830247a4 158
a687059c
LW
159#define ISMULT1(c) ((c) == '*' || (c) == '+' || (c) == '?')
160#define ISMULT2(s) ((*s) == '*' || (*s) == '+' || (*s) == '?' || \
161 ((*s) == '{' && regcurly(s)))
a687059c 162
35c8bce7
LW
163#ifdef SPSTART
164#undef SPSTART /* dratted cpp namespace... */
165#endif
a687059c
LW
166/*
167 * Flags to be passed up and down.
168 */
a687059c 169#define WORST 0 /* Worst case. */
821b33a5 170#define HASWIDTH 0x1 /* Known to match non-null strings. */
a0d0e21e
LW
171#define SIMPLE 0x2 /* Simple enough to be STAR/PLUS operand. */
172#define SPSTART 0x4 /* Starts with * or +. */
173#define TRYAGAIN 0x8 /* Weeded out a declaration. */
a687059c 174
2c2d71f5
JH
175/* Length of a variant. */
176
177typedef struct scan_data_t {
178 I32 len_min;
179 I32 len_delta;
180 I32 pos_min;
181 I32 pos_delta;
182 SV *last_found;
183 I32 last_end; /* min value, <0 unless valid. */
184 I32 last_start_min;
185 I32 last_start_max;
186 SV **longest; /* Either &l_fixed, or &l_float. */
187 SV *longest_fixed;
188 I32 offset_fixed;
189 SV *longest_float;
190 I32 offset_float_min;
191 I32 offset_float_max;
192 I32 flags;
193 I32 whilem_c;
cb434fcc 194 I32 *last_closep;
653099ff 195 struct regnode_charclass_class *start_class;
2c2d71f5
JH
196} scan_data_t;
197
a687059c 198/*
e50aee73 199 * Forward declarations for pregcomp()'s friends.
a687059c 200 */
a0d0e21e 201
b81d288d 202static scan_data_t zero_scan_data = { 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
cb434fcc 203 0, 0, 0, 0, 0, 0};
c277df42
IZ
204
205#define SF_BEFORE_EOL (SF_BEFORE_SEOL|SF_BEFORE_MEOL)
206#define SF_BEFORE_SEOL 0x1
207#define SF_BEFORE_MEOL 0x2
208#define SF_FIX_BEFORE_EOL (SF_FIX_BEFORE_SEOL|SF_FIX_BEFORE_MEOL)
209#define SF_FL_BEFORE_EOL (SF_FL_BEFORE_SEOL|SF_FL_BEFORE_MEOL)
210
09b7f37c
CB
211#ifdef NO_UNARY_PLUS
212# define SF_FIX_SHIFT_EOL (0+2)
213# define SF_FL_SHIFT_EOL (0+4)
214#else
215# define SF_FIX_SHIFT_EOL (+2)
216# define SF_FL_SHIFT_EOL (+4)
217#endif
c277df42
IZ
218
219#define SF_FIX_BEFORE_SEOL (SF_BEFORE_SEOL << SF_FIX_SHIFT_EOL)
220#define SF_FIX_BEFORE_MEOL (SF_BEFORE_MEOL << SF_FIX_SHIFT_EOL)
221
222#define SF_FL_BEFORE_SEOL (SF_BEFORE_SEOL << SF_FL_SHIFT_EOL)
223#define SF_FL_BEFORE_MEOL (SF_BEFORE_MEOL << SF_FL_SHIFT_EOL) /* 0x20 */
224#define SF_IS_INF 0x40
225#define SF_HAS_PAR 0x80
226#define SF_IN_PAR 0x100
227#define SF_HAS_EVAL 0x200
4bfe0158 228#define SCF_DO_SUBSTR 0x400
653099ff
GS
229#define SCF_DO_STCLASS_AND 0x0800
230#define SCF_DO_STCLASS_OR 0x1000
231#define SCF_DO_STCLASS (SCF_DO_STCLASS_AND|SCF_DO_STCLASS_OR)
e1901655 232#define SCF_WHILEM_VISITED_POS 0x2000
c277df42 233
1aa99e6b 234#define UTF RExC_utf8
830247a4
IZ
235#define LOC (RExC_flags16 & PMf_LOCALE)
236#define FOLD (RExC_flags16 & PMf_FOLD)
a0ed51b3 237
ffc61ed2 238#define OOB_UNICODE 12345678
93733859 239#define OOB_NAMEDCLASS -1
b8c5462f 240
a0ed51b3
LW
241#define CHR_SVLEN(sv) (UTF ? sv_len_utf8(sv) : SvCUR(sv))
242#define CHR_DIST(a,b) (UTF ? utf8_distance(a,b) : a - b)
243
8615cb43 244
b45f050a
JF
245/* length of regex to show in messages that don't mark a position within */
246#define RegexLengthToShowInErrorMessages 127
247
248/*
249 * If MARKER[12] are adjusted, be sure to adjust the constants at the top
250 * of t/op/regmesg.t, the tests in t/op/re_tests, and those in
251 * op/pragma/warn/regcomp.
252 */
7253e4e3
RK
253#define MARKER1 "<-- HERE" /* marker as it appears in the description */
254#define MARKER2 " <-- HERE " /* marker as it appears within the regex */
b81d288d 255
7253e4e3 256#define REPORT_LOCATION " in regex; marked by " MARKER1 " in m/%.*s" MARKER2 "%s/"
b45f050a
JF
257
258/*
259 * Calls SAVEDESTRUCTOR_X if needed, then calls Perl_croak with the given
260 * arg. Show regex, up to a maximum length. If it's too long, chop and add
261 * "...".
262 */
a4eb266f 263#define FAIL(msg) \
8615cb43 264 STMT_START { \
a4eb266f 265 char *ellipses = ""; \
830247a4 266 unsigned len = strlen(RExC_precomp); \
b45f050a 267 \
8615cb43 268 if (!SIZE_ONLY) \
830247a4 269 SAVEDESTRUCTOR_X(clear_re,(void*)RExC_rx); \
b45f050a
JF
270 \
271 if (len > RegexLengthToShowInErrorMessages) { \
272 /* chop 10 shorter than the max, to ensure meaning of "..." */ \
273 len = RegexLengthToShowInErrorMessages - 10; \
a4eb266f 274 ellipses = "..."; \
b45f050a
JF
275 } \
276 Perl_croak(aTHX_ "%s in regex m/%.*s%s/", \
830247a4 277 msg, (int)len, RExC_precomp, ellipses); \
8615cb43
JF
278 } STMT_END
279
b45f050a
JF
280/*
281 * Calls SAVEDESTRUCTOR_X if needed, then calls Perl_croak with the given
282 * args. Show regex, up to a maximum length. If it's too long, chop and add
283 * "...".
284 */
a4eb266f 285#define FAIL2(pat,msg) \
8615cb43 286 STMT_START { \
a4eb266f 287 char *ellipses = ""; \
830247a4 288 unsigned len = strlen(RExC_precomp); \
b45f050a 289 \
8615cb43 290 if (!SIZE_ONLY) \
830247a4 291 SAVEDESTRUCTOR_X(clear_re,(void*)RExC_rx); \
8615cb43 292 \
b45f050a
JF
293 if (len > RegexLengthToShowInErrorMessages) { \
294 /* chop 10 shorter than the max, to ensure meaning of "..." */ \
295 len = RegexLengthToShowInErrorMessages - 10; \
a4eb266f 296 ellipses = "..."; \
b45f050a
JF
297 } \
298 S_re_croak2(aTHX_ pat, " in regex m/%.*s%s/", \
830247a4 299 msg, (int)len, RExC_precomp, ellipses); \
b45f050a
JF
300 } STMT_END
301
302
303/*
304 * Simple_vFAIL -- like FAIL, but marks the current location in the scan
305 */
306#define Simple_vFAIL(m) \
307 STMT_START { \
830247a4 308 unsigned offset = strlen(RExC_precomp)-(RExC_end-RExC_parse); \
b45f050a
JF
309 \
310 Perl_croak(aTHX_ "%s" REPORT_LOCATION, \
830247a4 311 m, (int)offset, RExC_precomp, RExC_precomp + offset); \
b45f050a
JF
312 } STMT_END
313
314/*
315 * Calls SAVEDESTRUCTOR_X if needed, then Simple_vFAIL()
316 */
317#define vFAIL(m) \
318 STMT_START { \
319 if (!SIZE_ONLY) \
830247a4 320 SAVEDESTRUCTOR_X(clear_re,(void*)RExC_rx); \
b45f050a
JF
321 Simple_vFAIL(m); \
322 } STMT_END
323
324/*
325 * Like Simple_vFAIL(), but accepts two arguments.
326 */
327#define Simple_vFAIL2(m,a1) \
328 STMT_START { \
830247a4 329 unsigned offset = strlen(RExC_precomp)-(RExC_end-RExC_parse); \
b45f050a
JF
330 \
331 S_re_croak2(aTHX_ m, REPORT_LOCATION, a1, \
830247a4 332 (int)offset, RExC_precomp, RExC_precomp + offset); \
b45f050a
JF
333 } STMT_END
334
335/*
336 * Calls SAVEDESTRUCTOR_X if needed, then Simple_vFAIL2().
337 */
338#define vFAIL2(m,a1) \
339 STMT_START { \
340 if (!SIZE_ONLY) \
830247a4 341 SAVEDESTRUCTOR_X(clear_re,(void*)RExC_rx); \
b45f050a
JF
342 Simple_vFAIL2(m, a1); \
343 } STMT_END
344
345
346/*
347 * Like Simple_vFAIL(), but accepts three arguments.
348 */
349#define Simple_vFAIL3(m, a1, a2) \
350 STMT_START { \
830247a4 351 unsigned offset = strlen(RExC_precomp)-(RExC_end-RExC_parse); \
b45f050a
JF
352 \
353 S_re_croak2(aTHX_ m, REPORT_LOCATION, a1, a2, \
830247a4 354 (int)offset, RExC_precomp, RExC_precomp + offset); \
b45f050a
JF
355 } STMT_END
356
357/*
358 * Calls SAVEDESTRUCTOR_X if needed, then Simple_vFAIL3().
359 */
360#define vFAIL3(m,a1,a2) \
361 STMT_START { \
362 if (!SIZE_ONLY) \
830247a4 363 SAVEDESTRUCTOR_X(clear_re,(void*)RExC_rx); \
b45f050a
JF
364 Simple_vFAIL3(m, a1, a2); \
365 } STMT_END
366
367/*
368 * Like Simple_vFAIL(), but accepts four arguments.
369 */
370#define Simple_vFAIL4(m, a1, a2, a3) \
371 STMT_START { \
830247a4 372 unsigned offset = strlen(RExC_precomp)-(RExC_end-RExC_parse); \
b45f050a
JF
373 \
374 S_re_croak2(aTHX_ m, REPORT_LOCATION, a1, a2, a3,\
830247a4 375 (int)offset, RExC_precomp, RExC_precomp + offset); \
b45f050a
JF
376 } STMT_END
377
378/*
379 * Like Simple_vFAIL(), but accepts five arguments.
380 */
381#define Simple_vFAIL5(m, a1, a2, a3, a4) \
382 STMT_START { \
830247a4 383 unsigned offset = strlen(RExC_precomp)-(RExC_end-RExC_parse); \
b45f050a 384 S_re_croak2(aTHX_ m, REPORT_LOCATION, a1, a2, a3, a4,\
830247a4 385 (int)offset, RExC_precomp, RExC_precomp + offset); \
b45f050a
JF
386 } STMT_END
387
388
389#define vWARN(loc,m) \
390 STMT_START { \
830247a4 391 unsigned offset = strlen(RExC_precomp)-(RExC_end-(loc)); \
b45f050a 392 Perl_warner(aTHX_ WARN_REGEXP, "%s" REPORT_LOCATION,\
830247a4 393 m, (int)offset, RExC_precomp, RExC_precomp + offset); \
b45f050a
JF
394 } STMT_END \
395
f9373011
PM
396#define vWARNdep(loc,m) \
397 STMT_START { \
398 unsigned offset = strlen(RExC_precomp)-(RExC_end-(loc)); \
399 int warn_cat = ckWARN(WARN_REGEXP) ? WARN_REGEXP : WARN_DEPRECATED; \
400 Perl_warner(aTHX_ warn_cat, "%s" REPORT_LOCATION,\
401 m, (int)offset, RExC_precomp, RExC_precomp + offset); \
402 } STMT_END \
403
b45f050a
JF
404
405#define vWARN2(loc, m, a1) \
406 STMT_START { \
830247a4 407 unsigned offset = strlen(RExC_precomp)-(RExC_end-(loc)); \
b45f050a
JF
408 Perl_warner(aTHX_ WARN_REGEXP, m REPORT_LOCATION,\
409 a1, \
830247a4 410 (int)offset, RExC_precomp, RExC_precomp + offset); \
b45f050a
JF
411 } STMT_END
412
413#define vWARN3(loc, m, a1, a2) \
414 STMT_START { \
830247a4 415 unsigned offset = strlen(RExC_precomp) - (RExC_end - (loc)); \
b45f050a
JF
416 Perl_warner(aTHX_ WARN_REGEXP, m REPORT_LOCATION, \
417 a1, a2, \
830247a4 418 (int)offset, RExC_precomp, RExC_precomp + offset); \
b45f050a
JF
419 } STMT_END
420
421#define vWARN4(loc, m, a1, a2, a3) \
422 STMT_START { \
830247a4 423 unsigned offset = strlen(RExC_precomp)-(RExC_end-(loc)); \
b45f050a
JF
424 Perl_warner(aTHX_ WARN_REGEXP, m REPORT_LOCATION,\
425 a1, a2, a3, \
830247a4 426 (int)offset, RExC_precomp, RExC_precomp + offset); \
8615cb43
JF
427 } STMT_END
428
9d1d55b5
JP
429/* used for the parse_flags section for (?c) -- japhy */
430#define vWARN5(loc, m, a1, a2, a3, a4) \
431 STMT_START { \
432 unsigned offset = strlen(RExC_precomp)-(RExC_end-(loc)); \
433 Perl_warner(aTHX_ WARN_REGEXP, m REPORT_LOCATION, \
434 a1, a2, a3, a4, \
435 (int)offset, RExC_precomp, RExC_precomp + offset); \
436 } STMT_END
437
8615cb43 438
cd439c50 439/* Allow for side effects in s */
fd3f0ae2 440#define REGC(c,s) STMT_START { if (!SIZE_ONLY) *(s) = (c); else (void)(s);} STMT_END
cd439c50 441
fac92740
MJD
442/* Macros for recording node offsets. 20001227 mjd@plover.com
443 * Nodes are numbered 1, 2, 3, 4. Node #n's position is recorded in
444 * element 2*n-1 of the array. Element #2n holds the byte length node #n.
445 * Element 0 holds the number n.
446 */
447
448#define MJD_OFFSET_DEBUG(x)
449/* #define MJD_OFFSET_DEBUG(x) fprintf x */
450
451
452# define Set_Node_Offset_To_R(node,byte) \
453 STMT_START { \
454 if (! SIZE_ONLY) { \
455 if((node) < 0) { \
456 Perl_croak(aTHX_ "value of node is %d in Offset macro", node); \
457 } else { \
458 RExC_offsets[2*(node)-1] = (byte); \
459 } \
460 } \
461 } STMT_END
462
463# define Set_Node_Offset(node,byte) Set_Node_Offset_To_R((node)-RExC_emit_start, (byte)-RExC_start)
464# define Set_Cur_Node_Offset Set_Node_Offset(RExC_emit, RExC_parse)
465
466# define Set_Node_Length_To_R(node,len) \
467 STMT_START { \
468 if (! SIZE_ONLY) { \
469 MJD_OFFSET_DEBUG((stderr, "** (%d) size of node %d is %d.\n", __LINE__, (node), (len))); \
470 if((node) < 0) { \
471 Perl_croak(aTHX_ "value of node is %d in Length macro", node); \
472 } else { \
473 RExC_offsets[2*(node)] = (len); \
474 } \
475 } \
476 } STMT_END
477
478# define Set_Node_Length(node,len) Set_Node_Length_To_R((node)-RExC_emit_start, len)
479# define Set_Cur_Node_Length(len) Set_Node_Length(RExC_emit, len)
480# define Set_Node_Cur_Length(node) Set_Node_Length(node, RExC_parse - parse_start)
481
482/* Get offsets and lengths */
483#define Node_Offset(n) (RExC_offsets[2*((n)-RExC_emit_start)-1])
484#define Node_Length(n) (RExC_offsets[2*((n)-RExC_emit_start)])
485
51371543 486static void clear_re(pTHXo_ void *r);
4327152a 487
653099ff
GS
488/* Mark that we cannot extend a found fixed substring at this point.
489 Updata the longest found anchored substring and the longest found
490 floating substrings if needed. */
491
4327152a 492STATIC void
830247a4 493S_scan_commit(pTHX_ RExC_state_t *pRExC_state, scan_data_t *data)
c277df42 494{
a0ed51b3
LW
495 STRLEN l = CHR_SVLEN(data->last_found);
496 STRLEN old_l = CHR_SVLEN(*data->longest);
b81d288d 497
c277df42
IZ
498 if ((l >= old_l) && ((l > old_l) || (data->flags & SF_BEFORE_EOL))) {
499 sv_setsv(*data->longest, data->last_found);
500 if (*data->longest == data->longest_fixed) {
501 data->offset_fixed = l ? data->last_start_min : data->pos_min;
502 if (data->flags & SF_BEFORE_EOL)
b81d288d 503 data->flags
c277df42
IZ
504 |= ((data->flags & SF_BEFORE_EOL) << SF_FIX_SHIFT_EOL);
505 else
506 data->flags &= ~SF_FIX_BEFORE_EOL;
a0ed51b3
LW
507 }
508 else {
c277df42 509 data->offset_float_min = l ? data->last_start_min : data->pos_min;
b81d288d
AB
510 data->offset_float_max = (l
511 ? data->last_start_max
c277df42
IZ
512 : data->pos_min + data->pos_delta);
513 if (data->flags & SF_BEFORE_EOL)
b81d288d 514 data->flags
c277df42
IZ
515 |= ((data->flags & SF_BEFORE_EOL) << SF_FL_SHIFT_EOL);
516 else
517 data->flags &= ~SF_FL_BEFORE_EOL;
518 }
519 }
520 SvCUR_set(data->last_found, 0);
521 data->last_end = -1;
522 data->flags &= ~SF_BEFORE_EOL;
523}
524
653099ff
GS
525/* Can match anything (initialization) */
526STATIC void
830247a4 527S_cl_anything(pTHX_ RExC_state_t *pRExC_state, struct regnode_charclass_class *cl)
653099ff 528{
9af0acdb
JH
529 int value;
530
653099ff 531 ANYOF_CLASS_ZERO(cl);
9af0acdb
JH
532 for (value = 0; value < 256; ++value)
533 ANYOF_BITMAP_SET(cl, value);
1aa99e6b 534 cl->flags = ANYOF_EOS|ANYOF_UNICODE_ALL;
653099ff
GS
535 if (LOC)
536 cl->flags |= ANYOF_LOCALE;
537}
538
539/* Can match anything (initialization) */
540STATIC int
541S_cl_is_anything(pTHX_ struct regnode_charclass_class *cl)
542{
543 int value;
544
aaa51d5e 545 for (value = 0; value <= ANYOF_MAX; value += 2)
653099ff
GS
546 if (ANYOF_CLASS_TEST(cl, value) && ANYOF_CLASS_TEST(cl, value + 1))
547 return 1;
1aa99e6b
IH
548 if (!(cl->flags & ANYOF_UNICODE_ALL))
549 return 0;
9af0acdb
JH
550 for (value = 0; value < 256; ++value)
551 if (!ANYOF_BITMAP_TEST(cl, value))
653099ff
GS
552 return 0;
553 return 1;
554}
555
556/* Can match anything (initialization) */
557STATIC void
830247a4 558S_cl_init(pTHX_ RExC_state_t *pRExC_state, struct regnode_charclass_class *cl)
653099ff 559{
8ecf7187 560 Zero(cl, 1, struct regnode_charclass_class);
653099ff 561 cl->type = ANYOF;
830247a4 562 cl_anything(pRExC_state, cl);
653099ff
GS
563}
564
565STATIC void
830247a4 566S_cl_init_zero(pTHX_ RExC_state_t *pRExC_state, struct regnode_charclass_class *cl)
653099ff 567{
8ecf7187 568 Zero(cl, 1, struct regnode_charclass_class);
653099ff 569 cl->type = ANYOF;
830247a4 570 cl_anything(pRExC_state, cl);
653099ff
GS
571 if (LOC)
572 cl->flags |= ANYOF_LOCALE;
573}
574
575/* 'And' a given class with another one. Can create false positives */
576/* We assume that cl is not inverted */
577STATIC void
578S_cl_and(pTHX_ struct regnode_charclass_class *cl,
579 struct regnode_charclass_class *and_with)
580{
653099ff
GS
581 if (!(and_with->flags & ANYOF_CLASS)
582 && !(cl->flags & ANYOF_CLASS)
583 && (and_with->flags & ANYOF_LOCALE) == (cl->flags & ANYOF_LOCALE)
584 && !(and_with->flags & ANYOF_FOLD)
585 && !(cl->flags & ANYOF_FOLD)) {
586 int i;
587
588 if (and_with->flags & ANYOF_INVERT)
589 for (i = 0; i < ANYOF_BITMAP_SIZE; i++)
590 cl->bitmap[i] &= ~and_with->bitmap[i];
591 else
592 for (i = 0; i < ANYOF_BITMAP_SIZE; i++)
593 cl->bitmap[i] &= and_with->bitmap[i];
594 } /* XXXX: logic is complicated otherwise, leave it along for a moment. */
595 if (!(and_with->flags & ANYOF_EOS))
596 cl->flags &= ~ANYOF_EOS;
1aa99e6b
IH
597
598 if (cl->flags & ANYOF_UNICODE_ALL && and_with->flags & ANYOF_UNICODE) {
599 cl->flags &= ~ANYOF_UNICODE_ALL;
600 cl->flags |= ANYOF_UNICODE;
601 ARG_SET(cl, ARG(and_with));
602 }
603 if (!(and_with->flags & ANYOF_UNICODE_ALL))
604 cl->flags &= ~ANYOF_UNICODE_ALL;
605 if (!(and_with->flags & (ANYOF_UNICODE|ANYOF_UNICODE_ALL)))
606 cl->flags &= ~ANYOF_UNICODE;
653099ff
GS
607}
608
609/* 'OR' a given class with another one. Can create false positives */
610/* We assume that cl is not inverted */
611STATIC void
830247a4 612S_cl_or(pTHX_ RExC_state_t *pRExC_state, struct regnode_charclass_class *cl, struct regnode_charclass_class *or_with)
653099ff 613{
653099ff
GS
614 if (or_with->flags & ANYOF_INVERT) {
615 /* We do not use
616 * (B1 | CL1) | (!B2 & !CL2) = (B1 | !B2 & !CL2) | (CL1 | (!B2 & !CL2))
617 * <= (B1 | !B2) | (CL1 | !CL2)
618 * which is wasteful if CL2 is small, but we ignore CL2:
619 * (B1 | CL1) | (!B2 & !CL2) <= (B1 | CL1) | !B2 = (B1 | !B2) | CL1
620 * XXXX Can we handle case-fold? Unclear:
621 * (OK1(i) | OK1(i')) | !(OK1(i) | OK1(i')) =
622 * (OK1(i) | OK1(i')) | (!OK1(i) & !OK1(i'))
623 */
624 if ( (or_with->flags & ANYOF_LOCALE) == (cl->flags & ANYOF_LOCALE)
625 && !(or_with->flags & ANYOF_FOLD)
626 && !(cl->flags & ANYOF_FOLD) ) {
627 int i;
628
629 for (i = 0; i < ANYOF_BITMAP_SIZE; i++)
630 cl->bitmap[i] |= ~or_with->bitmap[i];
631 } /* XXXX: logic is complicated otherwise */
632 else {
830247a4 633 cl_anything(pRExC_state, cl);
653099ff
GS
634 }
635 } else {
636 /* (B1 | CL1) | (B2 | CL2) = (B1 | B2) | (CL1 | CL2)) */
637 if ( (or_with->flags & ANYOF_LOCALE) == (cl->flags & ANYOF_LOCALE)
b81d288d 638 && (!(or_with->flags & ANYOF_FOLD)
653099ff
GS
639 || (cl->flags & ANYOF_FOLD)) ) {
640 int i;
641
642 /* OR char bitmap and class bitmap separately */
643 for (i = 0; i < ANYOF_BITMAP_SIZE; i++)
644 cl->bitmap[i] |= or_with->bitmap[i];
645 if (or_with->flags & ANYOF_CLASS) {
646 for (i = 0; i < ANYOF_CLASSBITMAP_SIZE; i++)
647 cl->classflags[i] |= or_with->classflags[i];
648 cl->flags |= ANYOF_CLASS;
649 }
650 }
651 else { /* XXXX: logic is complicated, leave it along for a moment. */
830247a4 652 cl_anything(pRExC_state, cl);
653099ff
GS
653 }
654 }
655 if (or_with->flags & ANYOF_EOS)
656 cl->flags |= ANYOF_EOS;
1aa99e6b
IH
657
658 if (cl->flags & ANYOF_UNICODE && or_with->flags & ANYOF_UNICODE &&
659 ARG(cl) != ARG(or_with)) {
660 cl->flags |= ANYOF_UNICODE_ALL;
661 cl->flags &= ~ANYOF_UNICODE;
662 }
663 if (or_with->flags & ANYOF_UNICODE_ALL) {
664 cl->flags |= ANYOF_UNICODE_ALL;
665 cl->flags &= ~ANYOF_UNICODE;
666 }
653099ff
GS
667}
668
669/* REx optimizer. Converts nodes into quickier variants "in place".
670 Finds fixed substrings. */
671
c277df42
IZ
672/* Stops at toplevel WHILEM as well as at `last'. At end *scanp is set
673 to the position after last scanned or to NULL. */
674
76e3520e 675STATIC I32
830247a4 676S_study_chunk(pTHX_ RExC_state_t *pRExC_state, regnode **scanp, I32 *deltap, regnode *last, scan_data_t *data, U32 flags)
c277df42
IZ
677 /* scanp: Start here (read-write). */
678 /* deltap: Write maxlen-minlen here. */
679 /* last: Stop before this one. */
680{
681 I32 min = 0, pars = 0, code;
682 regnode *scan = *scanp, *next;
683 I32 delta = 0;
684 int is_inf = (flags & SCF_DO_SUBSTR) && (data->flags & SF_IS_INF);
aca2d497 685 int is_inf_internal = 0; /* The studied chunk is infinite */
c277df42
IZ
686 I32 is_par = OP(scan) == OPEN ? ARG(scan) : 0;
687 scan_data_t data_fake;
653099ff 688 struct regnode_charclass_class and_with; /* Valid if flags & SCF_DO_STCLASS_OR */
b81d288d 689
c277df42
IZ
690 while (scan && OP(scan) != END && scan < last) {
691 /* Peephole optimizer: */
692
22c35a8c 693 if (PL_regkind[(U8)OP(scan)] == EXACT) {
653099ff 694 /* Merge several consecutive EXACTish nodes into one. */
c277df42
IZ
695 regnode *n = regnext(scan);
696 U32 stringok = 1;
697#ifdef DEBUGGING
698 regnode *stop = scan;
b81d288d 699#endif
c277df42 700
cd439c50 701 next = scan + NODE_SZ_STR(scan);
c277df42
IZ
702 /* Skip NOTHING, merge EXACT*. */
703 while (n &&
b81d288d 704 ( PL_regkind[(U8)OP(n)] == NOTHING ||
c277df42
IZ
705 (stringok && (OP(n) == OP(scan))))
706 && NEXT_OFF(n)
707 && NEXT_OFF(scan) + NEXT_OFF(n) < I16_MAX) {
708 if (OP(n) == TAIL || n > next)
709 stringok = 0;
22c35a8c 710 if (PL_regkind[(U8)OP(n)] == NOTHING) {
c277df42
IZ
711 NEXT_OFF(scan) += NEXT_OFF(n);
712 next = n + NODE_STEP_REGNODE;
713#ifdef DEBUGGING
714 if (stringok)
715 stop = n;
b81d288d 716#endif
c277df42 717 n = regnext(n);
a0ed51b3 718 }
f49d4d0f 719 else if (stringok) {
cd439c50 720 int oldl = STR_LEN(scan);
c277df42 721 regnode *nnext = regnext(n);
f49d4d0f 722
b81d288d 723 if (oldl + STR_LEN(n) > U8_MAX)
c277df42
IZ
724 break;
725 NEXT_OFF(scan) += NEXT_OFF(n);
cd439c50
IZ
726 STR_LEN(scan) += STR_LEN(n);
727 next = n + NODE_SZ_STR(n);
c277df42 728 /* Now we can overwrite *n : */
f49d4d0f 729 Move(STRING(n), STRING(scan) + oldl, STR_LEN(n), char);
c277df42 730#ifdef DEBUGGING
f49d4d0f 731 stop = next - 1;
b81d288d 732#endif
c277df42
IZ
733 n = nnext;
734 }
735 }
736#ifdef DEBUGGING
737 /* Allow dumping */
cd439c50 738 n = scan + NODE_SZ_STR(scan);
c277df42 739 while (n <= stop) {
22c35a8c 740 if (PL_regkind[(U8)OP(n)] != NOTHING || OP(n) == NOTHING) {
c277df42
IZ
741 OP(n) = OPTIMIZED;
742 NEXT_OFF(n) = 0;
743 }
744 n++;
745 }
653099ff 746#endif
c277df42 747 }
653099ff
GS
748 /* Follow the next-chain of the current node and optimize
749 away all the NOTHINGs from it. */
c277df42 750 if (OP(scan) != CURLYX) {
048cfca1
GS
751 int max = (reg_off_by_arg[OP(scan)]
752 ? I32_MAX
753 /* I32 may be smaller than U16 on CRAYs! */
754 : (I32_MAX < U16_MAX ? I32_MAX : U16_MAX));
c277df42
IZ
755 int off = (reg_off_by_arg[OP(scan)] ? ARG(scan) : NEXT_OFF(scan));
756 int noff;
757 regnode *n = scan;
b81d288d 758
c277df42
IZ
759 /* Skip NOTHING and LONGJMP. */
760 while ((n = regnext(n))
22c35a8c 761 && ((PL_regkind[(U8)OP(n)] == NOTHING && (noff = NEXT_OFF(n)))
c277df42
IZ
762 || ((OP(n) == LONGJMP) && (noff = ARG(n))))
763 && off + noff < max)
764 off += noff;
765 if (reg_off_by_arg[OP(scan)])
766 ARG(scan) = off;
b81d288d 767 else
c277df42
IZ
768 NEXT_OFF(scan) = off;
769 }
653099ff
GS
770 /* The principal pseudo-switch. Cannot be a switch, since we
771 look into several different things. */
b81d288d 772 if (OP(scan) == BRANCH || OP(scan) == BRANCHJ
c277df42
IZ
773 || OP(scan) == IFTHEN || OP(scan) == SUSPEND) {
774 next = regnext(scan);
775 code = OP(scan);
b81d288d
AB
776
777 if (OP(next) == code || code == IFTHEN || code == SUSPEND) {
c277df42 778 I32 max1 = 0, min1 = I32_MAX, num = 0;
653099ff 779 struct regnode_charclass_class accum;
c277df42 780
653099ff 781 if (flags & SCF_DO_SUBSTR) /* XXXX Add !SUSPEND? */
830247a4 782 scan_commit(pRExC_state, data); /* Cannot merge strings after this. */
653099ff 783 if (flags & SCF_DO_STCLASS)
830247a4 784 cl_init_zero(pRExC_state, &accum);
c277df42 785 while (OP(scan) == code) {
830247a4 786 I32 deltanext, minnext, f = 0, fake;
653099ff 787 struct regnode_charclass_class this_class;
c277df42
IZ
788
789 num++;
790 data_fake.flags = 0;
b81d288d 791 if (data) {
2c2d71f5 792 data_fake.whilem_c = data->whilem_c;
cb434fcc
IZ
793 data_fake.last_closep = data->last_closep;
794 }
795 else
796 data_fake.last_closep = &fake;
c277df42
IZ
797 next = regnext(scan);
798 scan = NEXTOPER(scan);
799 if (code != BRANCH)
800 scan = NEXTOPER(scan);
653099ff 801 if (flags & SCF_DO_STCLASS) {
830247a4 802 cl_init(pRExC_state, &this_class);
653099ff
GS
803 data_fake.start_class = &this_class;
804 f = SCF_DO_STCLASS_AND;
b81d288d 805 }
e1901655
IZ
806 if (flags & SCF_WHILEM_VISITED_POS)
807 f |= SCF_WHILEM_VISITED_POS;
653099ff 808 /* we suppose the run is continuous, last=next...*/
830247a4
IZ
809 minnext = study_chunk(pRExC_state, &scan, &deltanext,
810 next, &data_fake, f);
b81d288d 811 if (min1 > minnext)
c277df42
IZ
812 min1 = minnext;
813 if (max1 < minnext + deltanext)
814 max1 = minnext + deltanext;
815 if (deltanext == I32_MAX)
aca2d497 816 is_inf = is_inf_internal = 1;
c277df42
IZ
817 scan = next;
818 if (data_fake.flags & (SF_HAS_PAR|SF_IN_PAR))
819 pars++;
405ff068 820 if (data && (data_fake.flags & SF_HAS_EVAL))
c277df42 821 data->flags |= SF_HAS_EVAL;
2c2d71f5
JH
822 if (data)
823 data->whilem_c = data_fake.whilem_c;
653099ff 824 if (flags & SCF_DO_STCLASS)
830247a4 825 cl_or(pRExC_state, &accum, &this_class);
b81d288d 826 if (code == SUSPEND)
c277df42
IZ
827 break;
828 }
829 if (code == IFTHEN && num < 2) /* Empty ELSE branch */
830 min1 = 0;
831 if (flags & SCF_DO_SUBSTR) {
832 data->pos_min += min1;
833 data->pos_delta += max1 - min1;
834 if (max1 != min1 || is_inf)
835 data->longest = &(data->longest_float);
836 }
837 min += min1;
838 delta += max1 - min1;
653099ff 839 if (flags & SCF_DO_STCLASS_OR) {
830247a4 840 cl_or(pRExC_state, data->start_class, &accum);
653099ff
GS
841 if (min1) {
842 cl_and(data->start_class, &and_with);
843 flags &= ~SCF_DO_STCLASS;
844 }
845 }
846 else if (flags & SCF_DO_STCLASS_AND) {
de0c8cb8
GS
847 if (min1) {
848 cl_and(data->start_class, &accum);
653099ff 849 flags &= ~SCF_DO_STCLASS;
de0c8cb8
GS
850 }
851 else {
b81d288d 852 /* Switch to OR mode: cache the old value of
de0c8cb8
GS
853 * data->start_class */
854 StructCopy(data->start_class, &and_with,
855 struct regnode_charclass_class);
856 flags &= ~SCF_DO_STCLASS_AND;
857 StructCopy(&accum, data->start_class,
858 struct regnode_charclass_class);
859 flags |= SCF_DO_STCLASS_OR;
860 data->start_class->flags |= ANYOF_EOS;
861 }
653099ff 862 }
a0ed51b3
LW
863 }
864 else if (code == BRANCHJ) /* single branch is optimized. */
c277df42
IZ
865 scan = NEXTOPER(NEXTOPER(scan));
866 else /* single branch is optimized. */
867 scan = NEXTOPER(scan);
868 continue;
a0ed51b3
LW
869 }
870 else if (OP(scan) == EXACT) {
cd439c50 871 I32 l = STR_LEN(scan);
1aa99e6b 872 UV uc = *((U8*)STRING(scan));
a0ed51b3 873 if (UTF) {
1aa99e6b
IH
874 U8 *s = (U8*)STRING(scan);
875 l = utf8_length(s, s + l);
9041c2e3 876 uc = utf8_to_uvchr(s, NULL);
a0ed51b3
LW
877 }
878 min += l;
c277df42 879 if (flags & SCF_DO_SUBSTR) { /* Update longest substr. */
c277df42
IZ
880 /* The code below prefers earlier match for fixed
881 offset, later match for variable offset. */
882 if (data->last_end == -1) { /* Update the start info. */
883 data->last_start_min = data->pos_min;
884 data->last_start_max = is_inf
b81d288d 885 ? I32_MAX : data->pos_min + data->pos_delta;
c277df42 886 }
cd439c50 887 sv_catpvn(data->last_found, STRING(scan), STR_LEN(scan));
c277df42
IZ
888 data->last_end = data->pos_min + l;
889 data->pos_min += l; /* As in the first entry. */
890 data->flags &= ~SF_BEFORE_EOL;
891 }
653099ff
GS
892 if (flags & SCF_DO_STCLASS_AND) {
893 /* Check whether it is compatible with what we know already! */
894 int compat = 1;
895
1aa99e6b 896 if (uc >= 0x100 ||
516a5887 897 (!(data->start_class->flags & (ANYOF_CLASS | ANYOF_LOCALE))
1aa99e6b 898 && !ANYOF_BITMAP_TEST(data->start_class, uc)
653099ff 899 && (!(data->start_class->flags & ANYOF_FOLD)
1aa99e6b 900 || !ANYOF_BITMAP_TEST(data->start_class, PL_fold[uc])))
516a5887 901 )
653099ff
GS
902 compat = 0;
903 ANYOF_CLASS_ZERO(data->start_class);
904 ANYOF_BITMAP_ZERO(data->start_class);
905 if (compat)
1aa99e6b 906 ANYOF_BITMAP_SET(data->start_class, uc);
653099ff 907 data->start_class->flags &= ~ANYOF_EOS;
9b877dbb
IH
908 if (uc < 0x100)
909 data->start_class->flags &= ~ANYOF_UNICODE_ALL;
653099ff
GS
910 }
911 else if (flags & SCF_DO_STCLASS_OR) {
912 /* false positive possible if the class is case-folded */
1aa99e6b 913 if (uc < 0x100)
9b877dbb
IH
914 ANYOF_BITMAP_SET(data->start_class, uc);
915 else
916 data->start_class->flags |= ANYOF_UNICODE_ALL;
653099ff
GS
917 data->start_class->flags &= ~ANYOF_EOS;
918 cl_and(data->start_class, &and_with);
919 }
920 flags &= ~SCF_DO_STCLASS;
a0ed51b3 921 }
653099ff 922 else if (PL_regkind[(U8)OP(scan)] == EXACT) { /* But OP != EXACT! */
cd439c50 923 I32 l = STR_LEN(scan);
1aa99e6b 924 UV uc = *((U8*)STRING(scan));
653099ff
GS
925
926 /* Search for fixed substrings supports EXACT only. */
b81d288d 927 if (flags & SCF_DO_SUBSTR)
830247a4 928 scan_commit(pRExC_state, data);
a0ed51b3 929 if (UTF) {
1aa99e6b
IH
930 U8 *s = (U8 *)STRING(scan);
931 l = utf8_length(s, s + l);
9041c2e3 932 uc = utf8_to_uvchr(s, NULL);
a0ed51b3
LW
933 }
934 min += l;
c277df42 935 if (data && (flags & SCF_DO_SUBSTR))
a0ed51b3 936 data->pos_min += l;
653099ff
GS
937 if (flags & SCF_DO_STCLASS_AND) {
938 /* Check whether it is compatible with what we know already! */
939 int compat = 1;
940
1aa99e6b 941 if (uc >= 0x100 ||
516a5887 942 (!(data->start_class->flags & (ANYOF_CLASS | ANYOF_LOCALE))
1aa99e6b 943 && !ANYOF_BITMAP_TEST(data->start_class, uc)
516a5887 944 && !ANYOF_BITMAP_TEST(data->start_class, PL_fold[uc])))
653099ff
GS
945 compat = 0;
946 ANYOF_CLASS_ZERO(data->start_class);
947 ANYOF_BITMAP_ZERO(data->start_class);
948 if (compat) {
1aa99e6b 949 ANYOF_BITMAP_SET(data->start_class, uc);
653099ff
GS
950 data->start_class->flags &= ~ANYOF_EOS;
951 data->start_class->flags |= ANYOF_FOLD;
952 if (OP(scan) == EXACTFL)
953 data->start_class->flags |= ANYOF_LOCALE;
954 }
955 }
956 else if (flags & SCF_DO_STCLASS_OR) {
957 if (data->start_class->flags & ANYOF_FOLD) {
958 /* false positive possible if the class is case-folded.
959 Assume that the locale settings are the same... */
1aa99e6b
IH
960 if (uc < 0x100)
961 ANYOF_BITMAP_SET(data->start_class, uc);
653099ff
GS
962 data->start_class->flags &= ~ANYOF_EOS;
963 }
964 cl_and(data->start_class, &and_with);
965 }
966 flags &= ~SCF_DO_STCLASS;
a0ed51b3 967 }
4d61ec05 968 else if (strchr((char*)PL_varies,OP(scan))) {
9c5ffd7c 969 I32 mincount, maxcount, minnext, deltanext, fl = 0;
aa7a4b56 970 I32 f = flags, pos_before = 0;
c277df42 971 regnode *oscan = scan;
653099ff
GS
972 struct regnode_charclass_class this_class;
973 struct regnode_charclass_class *oclass = NULL;
974
22c35a8c 975 switch (PL_regkind[(U8)OP(scan)]) {
653099ff 976 case WHILEM: /* End of (?:...)* . */
c277df42
IZ
977 scan = NEXTOPER(scan);
978 goto finish;
979 case PLUS:
653099ff 980 if (flags & (SCF_DO_SUBSTR | SCF_DO_STCLASS)) {
c277df42 981 next = NEXTOPER(scan);
653099ff 982 if (OP(next) == EXACT || (flags & SCF_DO_STCLASS)) {
b81d288d
AB
983 mincount = 1;
984 maxcount = REG_INFTY;
c277df42
IZ
985 next = regnext(scan);
986 scan = NEXTOPER(scan);
987 goto do_curly;
988 }
989 }
990 if (flags & SCF_DO_SUBSTR)
991 data->pos_min++;
992 min++;
993 /* Fall through. */
994 case STAR:
653099ff
GS
995 if (flags & SCF_DO_STCLASS) {
996 mincount = 0;
b81d288d 997 maxcount = REG_INFTY;
653099ff
GS
998 next = regnext(scan);
999 scan = NEXTOPER(scan);
1000 goto do_curly;
1001 }
b81d288d 1002 is_inf = is_inf_internal = 1;
c277df42
IZ
1003 scan = regnext(scan);
1004 if (flags & SCF_DO_SUBSTR) {
830247a4 1005 scan_commit(pRExC_state, data); /* Cannot extend fixed substrings */
c277df42
IZ
1006 data->longest = &(data->longest_float);
1007 }
1008 goto optimize_curly_tail;
1009 case CURLY:
b81d288d 1010 mincount = ARG1(scan);
c277df42
IZ
1011 maxcount = ARG2(scan);
1012 next = regnext(scan);
cb434fcc
IZ
1013 if (OP(scan) == CURLYX) {
1014 I32 lp = (data ? *(data->last_closep) : 0);
1015
1016 scan->flags = ((lp <= U8_MAX) ? lp : U8_MAX);
1017 }
c277df42
IZ
1018 scan = NEXTOPER(scan) + EXTRA_STEP_2ARGS;
1019 do_curly:
1020 if (flags & SCF_DO_SUBSTR) {
830247a4 1021 if (mincount == 0) scan_commit(pRExC_state,data); /* Cannot extend fixed substrings */
c277df42
IZ
1022 pos_before = data->pos_min;
1023 }
1024 if (data) {
1025 fl = data->flags;
1026 data->flags &= ~(SF_HAS_PAR|SF_IN_PAR|SF_HAS_EVAL);
1027 if (is_inf)
1028 data->flags |= SF_IS_INF;
1029 }
653099ff 1030 if (flags & SCF_DO_STCLASS) {
830247a4 1031 cl_init(pRExC_state, &this_class);
653099ff
GS
1032 oclass = data->start_class;
1033 data->start_class = &this_class;
1034 f |= SCF_DO_STCLASS_AND;
1035 f &= ~SCF_DO_STCLASS_OR;
1036 }
e1901655
IZ
1037 /* These are the cases when once a subexpression
1038 fails at a particular position, it cannot succeed
1039 even after backtracking at the enclosing scope.
b81d288d 1040
e1901655
IZ
1041 XXXX what if minimal match and we are at the
1042 initial run of {n,m}? */
1043 if ((mincount != maxcount - 1) && (maxcount != REG_INFTY))
1044 f &= ~SCF_WHILEM_VISITED_POS;
653099ff 1045
c277df42 1046 /* This will finish on WHILEM, setting scan, or on NULL: */
b81d288d
AB
1047 minnext = study_chunk(pRExC_state, &scan, &deltanext, last, data,
1048 mincount == 0
653099ff
GS
1049 ? (f & ~SCF_DO_SUBSTR) : f);
1050
1051 if (flags & SCF_DO_STCLASS)
1052 data->start_class = oclass;
1053 if (mincount == 0 || minnext == 0) {
1054 if (flags & SCF_DO_STCLASS_OR) {
830247a4 1055 cl_or(pRExC_state, data->start_class, &this_class);
653099ff
GS
1056 }
1057 else if (flags & SCF_DO_STCLASS_AND) {
b81d288d 1058 /* Switch to OR mode: cache the old value of
653099ff
GS
1059 * data->start_class */
1060 StructCopy(data->start_class, &and_with,
1061 struct regnode_charclass_class);
1062 flags &= ~SCF_DO_STCLASS_AND;
1063 StructCopy(&this_class, data->start_class,
1064 struct regnode_charclass_class);
1065 flags |= SCF_DO_STCLASS_OR;
1066 data->start_class->flags |= ANYOF_EOS;
1067 }
1068 } else { /* Non-zero len */
1069 if (flags & SCF_DO_STCLASS_OR) {
830247a4 1070 cl_or(pRExC_state, data->start_class, &this_class);
653099ff
GS
1071 cl_and(data->start_class, &and_with);
1072 }
1073 else if (flags & SCF_DO_STCLASS_AND)
1074 cl_and(data->start_class, &this_class);
1075 flags &= ~SCF_DO_STCLASS;
1076 }
c277df42
IZ
1077 if (!scan) /* It was not CURLYX, but CURLY. */
1078 scan = next;
84037bb0
HS
1079 if (ckWARN(WARN_REGEXP)
1080 && (minnext == 0) && (deltanext == 0)
99799961 1081 && data && !(data->flags & (SF_HAS_PAR|SF_IN_PAR))
17feb5d5 1082 && maxcount <= REG_INFTY/3) /* Complement check for big count */
b45f050a 1083 {
830247a4 1084 vWARN(RExC_parse,
b45f050a
JF
1085 "Quantifier unexpected on zero-length expression");
1086 }
1087
c277df42 1088 min += minnext * mincount;
b81d288d 1089 is_inf_internal |= ((maxcount == REG_INFTY
155aba94
GS
1090 && (minnext + deltanext) > 0)
1091 || deltanext == I32_MAX);
aca2d497 1092 is_inf |= is_inf_internal;
c277df42
IZ
1093 delta += (minnext + deltanext) * maxcount - minnext * mincount;
1094
1095 /* Try powerful optimization CURLYX => CURLYN. */
b81d288d 1096 if ( OP(oscan) == CURLYX && data
c277df42
IZ
1097 && data->flags & SF_IN_PAR
1098 && !(data->flags & SF_HAS_EVAL)
1099 && !deltanext && minnext == 1 ) {
1100 /* Try to optimize to CURLYN. */
1101 regnode *nxt = NEXTOPER(oscan) + EXTRA_STEP_2ARGS;
1102 regnode *nxt1 = nxt, *nxt2;
1103
1104 /* Skip open. */
1105 nxt = regnext(nxt);
4d61ec05 1106 if (!strchr((char*)PL_simple,OP(nxt))
22c35a8c 1107 && !(PL_regkind[(U8)OP(nxt)] == EXACT
b81d288d 1108 && STR_LEN(nxt) == 1))
c277df42
IZ
1109 goto nogo;
1110 nxt2 = nxt;
1111 nxt = regnext(nxt);
b81d288d 1112 if (OP(nxt) != CLOSE)
c277df42
IZ
1113 goto nogo;
1114 /* Now we know that nxt2 is the only contents: */
1115 oscan->flags = ARG(nxt);
1116 OP(oscan) = CURLYN;
1117 OP(nxt1) = NOTHING; /* was OPEN. */
1118#ifdef DEBUGGING
1119 OP(nxt1 + 1) = OPTIMIZED; /* was count. */
1120 NEXT_OFF(nxt1+ 1) = 0; /* just for consistancy. */
1121 NEXT_OFF(nxt2) = 0; /* just for consistancy with CURLY. */
1122 OP(nxt) = OPTIMIZED; /* was CLOSE. */
1123 OP(nxt + 1) = OPTIMIZED; /* was count. */
1124 NEXT_OFF(nxt+ 1) = 0; /* just for consistancy. */
b81d288d 1125#endif
c277df42 1126 }
c277df42
IZ
1127 nogo:
1128
1129 /* Try optimization CURLYX => CURLYM. */
b81d288d 1130 if ( OP(oscan) == CURLYX && data
c277df42 1131 && !(data->flags & SF_HAS_PAR)
c277df42
IZ
1132 && !(data->flags & SF_HAS_EVAL)
1133 && !deltanext ) {
1134 /* XXXX How to optimize if data == 0? */
1135 /* Optimize to a simpler form. */
1136 regnode *nxt = NEXTOPER(oscan) + EXTRA_STEP_2ARGS; /* OPEN */
1137 regnode *nxt2;
1138
1139 OP(oscan) = CURLYM;
1140 while ( (nxt2 = regnext(nxt)) /* skip over embedded stuff*/
b81d288d 1141 && (OP(nxt2) != WHILEM))
c277df42
IZ
1142 nxt = nxt2;
1143 OP(nxt2) = SUCCEED; /* Whas WHILEM */
c277df42
IZ
1144 /* Need to optimize away parenths. */
1145 if (data->flags & SF_IN_PAR) {
1146 /* Set the parenth number. */
1147 regnode *nxt1 = NEXTOPER(oscan) + EXTRA_STEP_2ARGS; /* OPEN*/
1148
b81d288d 1149 if (OP(nxt) != CLOSE)
b45f050a 1150 FAIL("Panic opt close");
c277df42
IZ
1151 oscan->flags = ARG(nxt);
1152 OP(nxt1) = OPTIMIZED; /* was OPEN. */
1153 OP(nxt) = OPTIMIZED; /* was CLOSE. */
1154#ifdef DEBUGGING
1155 OP(nxt1 + 1) = OPTIMIZED; /* was count. */
1156 OP(nxt + 1) = OPTIMIZED; /* was count. */
1157 NEXT_OFF(nxt1 + 1) = 0; /* just for consistancy. */
1158 NEXT_OFF(nxt + 1) = 0; /* just for consistancy. */
b81d288d 1159#endif
c277df42
IZ
1160#if 0
1161 while ( nxt1 && (OP(nxt1) != WHILEM)) {
1162 regnode *nnxt = regnext(nxt1);
b81d288d 1163
c277df42
IZ
1164 if (nnxt == nxt) {
1165 if (reg_off_by_arg[OP(nxt1)])
1166 ARG_SET(nxt1, nxt2 - nxt1);
1167 else if (nxt2 - nxt1 < U16_MAX)
1168 NEXT_OFF(nxt1) = nxt2 - nxt1;
1169 else
1170 OP(nxt) = NOTHING; /* Cannot beautify */
1171 }
1172 nxt1 = nnxt;
1173 }
1174#endif
1175 /* Optimize again: */
b81d288d 1176 study_chunk(pRExC_state, &nxt1, &deltanext, nxt,
e1901655 1177 NULL, 0);
a0ed51b3
LW
1178 }
1179 else
c277df42 1180 oscan->flags = 0;
c277df42 1181 }
e1901655
IZ
1182 else if ((OP(oscan) == CURLYX)
1183 && (flags & SCF_WHILEM_VISITED_POS)
1184 /* See the comment on a similar expression above.
1185 However, this time it not a subexpression
1186 we care about, but the expression itself. */
1187 && (maxcount == REG_INFTY)
1188 && data && ++data->whilem_c < 16) {
1189 /* This stays as CURLYX, we can put the count/of pair. */
2c2d71f5
JH
1190 /* Find WHILEM (as in regexec.c) */
1191 regnode *nxt = oscan + NEXT_OFF(oscan);
1192
1193 if (OP(PREVOPER(nxt)) == NOTHING) /* LONGJMP */
1194 nxt += ARG(nxt);
1195 PREVOPER(nxt)->flags = data->whilem_c
830247a4 1196 | (RExC_whilem_seen << 4); /* On WHILEM */
2c2d71f5 1197 }
b81d288d 1198 if (data && fl & (SF_HAS_PAR|SF_IN_PAR))
c277df42
IZ
1199 pars++;
1200 if (flags & SCF_DO_SUBSTR) {
1201 SV *last_str = Nullsv;
1202 int counted = mincount != 0;
1203
1204 if (data->last_end > 0 && mincount != 0) { /* Ends with a string. */
b81d288d 1205 I32 b = pos_before >= data->last_start_min
c277df42
IZ
1206 ? pos_before : data->last_start_min;
1207 STRLEN l;
1208 char *s = SvPV(data->last_found, l);
a0ed51b3
LW
1209 I32 old = b - data->last_start_min;
1210
1211 if (UTF)
1212 old = utf8_hop((U8*)s, old) - (U8*)s;
c277df42 1213
a0ed51b3 1214 l -= old;
c277df42 1215 /* Get the added string: */
79cb57f6 1216 last_str = newSVpvn(s + old, l);
c277df42
IZ
1217 if (deltanext == 0 && pos_before == b) {
1218 /* What was added is a constant string */
1219 if (mincount > 1) {
1220 SvGROW(last_str, (mincount * l) + 1);
b81d288d 1221 repeatcpy(SvPVX(last_str) + l,
c277df42
IZ
1222 SvPVX(last_str), l, mincount - 1);
1223 SvCUR(last_str) *= mincount;
1224 /* Add additional parts. */
b81d288d 1225 SvCUR_set(data->last_found,
c277df42
IZ
1226 SvCUR(data->last_found) - l);
1227 sv_catsv(data->last_found, last_str);
1228 data->last_end += l * (mincount - 1);
1229 }
2a8d9689
HS
1230 } else {
1231 /* start offset must point into the last copy */
1232 data->last_start_min += minnext * (mincount - 1);
4b2cff9a
HS
1233 data->last_start_max += is_inf ? 0 : (maxcount - 1)
1234 * (minnext + data->pos_delta);
c277df42
IZ
1235 }
1236 }
1237 /* It is counted once already... */
1238 data->pos_min += minnext * (mincount - counted);
1239 data->pos_delta += - counted * deltanext +
1240 (minnext + deltanext) * maxcount - minnext * mincount;
1241 if (mincount != maxcount) {
653099ff
GS
1242 /* Cannot extend fixed substrings found inside
1243 the group. */
830247a4 1244 scan_commit(pRExC_state,data);
c277df42
IZ
1245 if (mincount && last_str) {
1246 sv_setsv(data->last_found, last_str);
1247 data->last_end = data->pos_min;
b81d288d 1248 data->last_start_min =
a0ed51b3 1249 data->pos_min - CHR_SVLEN(last_str);
b81d288d
AB
1250 data->last_start_max = is_inf
1251 ? I32_MAX
c277df42 1252 : data->pos_min + data->pos_delta
a0ed51b3 1253 - CHR_SVLEN(last_str);
c277df42
IZ
1254 }
1255 data->longest = &(data->longest_float);
1256 }
aca2d497 1257 SvREFCNT_dec(last_str);
c277df42 1258 }
405ff068 1259 if (data && (fl & SF_HAS_EVAL))
c277df42
IZ
1260 data->flags |= SF_HAS_EVAL;
1261 optimize_curly_tail:
c277df42 1262 if (OP(oscan) != CURLYX) {
22c35a8c 1263 while (PL_regkind[(U8)OP(next = regnext(oscan))] == NOTHING
c277df42
IZ
1264 && NEXT_OFF(next))
1265 NEXT_OFF(oscan) += NEXT_OFF(next);
1266 }
c277df42 1267 continue;
653099ff 1268 default: /* REF and CLUMP only? */
c277df42 1269 if (flags & SCF_DO_SUBSTR) {
830247a4 1270 scan_commit(pRExC_state,data); /* Cannot expect anything... */
c277df42
IZ
1271 data->longest = &(data->longest_float);
1272 }
aca2d497 1273 is_inf = is_inf_internal = 1;
653099ff 1274 if (flags & SCF_DO_STCLASS_OR)
830247a4 1275 cl_anything(pRExC_state, data->start_class);
653099ff 1276 flags &= ~SCF_DO_STCLASS;
c277df42
IZ
1277 break;
1278 }
a0ed51b3 1279 }
ffc61ed2 1280 else if (strchr((char*)PL_simple,OP(scan))) {
9c5ffd7c 1281 int value = 0;
653099ff 1282
c277df42 1283 if (flags & SCF_DO_SUBSTR) {
830247a4 1284 scan_commit(pRExC_state,data);
c277df42
IZ
1285 data->pos_min++;
1286 }
1287 min++;
653099ff
GS
1288 if (flags & SCF_DO_STCLASS) {
1289 data->start_class->flags &= ~ANYOF_EOS; /* No match on empty */
1290
1291 /* Some of the logic below assumes that switching
1292 locale on will only add false positives. */
1293 switch (PL_regkind[(U8)OP(scan)]) {
653099ff 1294 case SANY:
653099ff
GS
1295 default:
1296 do_default:
1297 /* Perl_croak(aTHX_ "panic: unexpected simple REx opcode %d", OP(scan)); */
1298 if (flags & SCF_DO_STCLASS_OR) /* Allow everything */
830247a4 1299 cl_anything(pRExC_state, data->start_class);
653099ff
GS
1300 break;
1301 case REG_ANY:
1302 if (OP(scan) == SANY)
1303 goto do_default;
1304 if (flags & SCF_DO_STCLASS_OR) { /* Everything but \n */
1305 value = (ANYOF_BITMAP_TEST(data->start_class,'\n')
1306 || (data->start_class->flags & ANYOF_CLASS));
830247a4 1307 cl_anything(pRExC_state, data->start_class);
653099ff
GS
1308 }
1309 if (flags & SCF_DO_STCLASS_AND || !value)
1310 ANYOF_BITMAP_CLEAR(data->start_class,'\n');
1311 break;
1312 case ANYOF:
1313 if (flags & SCF_DO_STCLASS_AND)
1314 cl_and(data->start_class,
1315 (struct regnode_charclass_class*)scan);
1316 else
830247a4 1317 cl_or(pRExC_state, data->start_class,
653099ff
GS
1318 (struct regnode_charclass_class*)scan);
1319 break;
1320 case ALNUM:
1321 if (flags & SCF_DO_STCLASS_AND) {
1322 if (!(data->start_class->flags & ANYOF_LOCALE)) {
1323 ANYOF_CLASS_CLEAR(data->start_class,ANYOF_NALNUM);
1324 for (value = 0; value < 256; value++)
1325 if (!isALNUM(value))
1326 ANYOF_BITMAP_CLEAR(data->start_class, value);
1327 }
1328 }
1329 else {
1330 if (data->start_class->flags & ANYOF_LOCALE)
1331 ANYOF_CLASS_SET(data->start_class,ANYOF_ALNUM);
1332 else {
1333 for (value = 0; value < 256; value++)
1334 if (isALNUM(value))
b81d288d 1335 ANYOF_BITMAP_SET(data->start_class, value);
653099ff
GS
1336 }
1337 }
1338 break;
1339 case ALNUML:
1340 if (flags & SCF_DO_STCLASS_AND) {
1341 if (data->start_class->flags & ANYOF_LOCALE)
1342 ANYOF_CLASS_CLEAR(data->start_class,ANYOF_NALNUM);
1343 }
1344 else {
1345 ANYOF_CLASS_SET(data->start_class,ANYOF_ALNUM);
1346 data->start_class->flags |= ANYOF_LOCALE;
1347 }
1348 break;
1349 case NALNUM:
1350 if (flags & SCF_DO_STCLASS_AND) {
1351 if (!(data->start_class->flags & ANYOF_LOCALE)) {
1352 ANYOF_CLASS_CLEAR(data->start_class,ANYOF_ALNUM);
1353 for (value = 0; value < 256; value++)
1354 if (isALNUM(value))
1355 ANYOF_BITMAP_CLEAR(data->start_class, value);
1356 }
1357 }
1358 else {
1359 if (data->start_class->flags & ANYOF_LOCALE)
1360 ANYOF_CLASS_SET(data->start_class,ANYOF_NALNUM);
1361 else {
1362 for (value = 0; value < 256; value++)
1363 if (!isALNUM(value))
b81d288d 1364 ANYOF_BITMAP_SET(data->start_class, value);
653099ff
GS
1365 }
1366 }
1367 break;
1368 case NALNUML:
1369 if (flags & SCF_DO_STCLASS_AND) {
1370 if (data->start_class->flags & ANYOF_LOCALE)
1371 ANYOF_CLASS_CLEAR(data->start_class,ANYOF_ALNUM);
1372 }
1373 else {
1374 data->start_class->flags |= ANYOF_LOCALE;
1375 ANYOF_CLASS_SET(data->start_class,ANYOF_NALNUM);
1376 }
1377 break;
1378 case SPACE:
1379 if (flags & SCF_DO_STCLASS_AND) {
1380 if (!(data->start_class->flags & ANYOF_LOCALE)) {
1381 ANYOF_CLASS_CLEAR(data->start_class,ANYOF_NSPACE);
1382 for (value = 0; value < 256; value++)
1383 if (!isSPACE(value))
1384 ANYOF_BITMAP_CLEAR(data->start_class, value);
1385 }
1386 }
1387 else {
1388 if (data->start_class->flags & ANYOF_LOCALE)
1389 ANYOF_CLASS_SET(data->start_class,ANYOF_SPACE);
1390 else {
1391 for (value = 0; value < 256; value++)
1392 if (isSPACE(value))
b81d288d 1393 ANYOF_BITMAP_SET(data->start_class, value);
653099ff
GS
1394 }
1395 }
1396 break;
1397 case SPACEL:
1398 if (flags & SCF_DO_STCLASS_AND) {
1399 if (data->start_class->flags & ANYOF_LOCALE)
1400 ANYOF_CLASS_CLEAR(data->start_class,ANYOF_NSPACE);
1401 }
1402 else {
1403 data->start_class->flags |= ANYOF_LOCALE;
1404 ANYOF_CLASS_SET(data->start_class,ANYOF_SPACE);
1405 }
1406 break;
1407 case NSPACE:
1408 if (flags & SCF_DO_STCLASS_AND) {
1409 if (!(data->start_class->flags & ANYOF_LOCALE)) {
1410 ANYOF_CLASS_CLEAR(data->start_class,ANYOF_SPACE);
1411 for (value = 0; value < 256; value++)
1412 if (isSPACE(value))
1413 ANYOF_BITMAP_CLEAR(data->start_class, value);
1414 }
1415 }
1416 else {
1417 if (data->start_class->flags & ANYOF_LOCALE)
1418 ANYOF_CLASS_SET(data->start_class,ANYOF_NSPACE);
1419 else {
1420 for (value = 0; value < 256; value++)
1421 if (!isSPACE(value))
b81d288d 1422 ANYOF_BITMAP_SET(data->start_class, value);
653099ff
GS
1423 }
1424 }
1425 break;
1426 case NSPACEL:
1427 if (flags & SCF_DO_STCLASS_AND) {
1428 if (data->start_class->flags & ANYOF_LOCALE) {
1429 ANYOF_CLASS_CLEAR(data->start_class,ANYOF_SPACE);
1430 for (value = 0; value < 256; value++)
1431 if (!isSPACE(value))
1432 ANYOF_BITMAP_CLEAR(data->start_class, value);
1433 }
1434 }
1435 else {
1436 data->start_class->flags |= ANYOF_LOCALE;
1437 ANYOF_CLASS_SET(data->start_class,ANYOF_NSPACE);
1438 }
1439 break;
1440 case DIGIT:
1441 if (flags & SCF_DO_STCLASS_AND) {
1442 ANYOF_CLASS_CLEAR(data->start_class,ANYOF_NDIGIT);
1443 for (value = 0; value < 256; value++)
1444 if (!isDIGIT(value))
1445 ANYOF_BITMAP_CLEAR(data->start_class, value);
1446 }
1447 else {
1448 if (data->start_class->flags & ANYOF_LOCALE)
1449 ANYOF_CLASS_SET(data->start_class,ANYOF_DIGIT);
1450 else {
1451 for (value = 0; value < 256; value++)
1452 if (isDIGIT(value))
b81d288d 1453 ANYOF_BITMAP_SET(data->start_class, value);
653099ff
GS
1454 }
1455 }
1456 break;
1457 case NDIGIT:
1458 if (flags & SCF_DO_STCLASS_AND) {
1459 ANYOF_CLASS_CLEAR(data->start_class,ANYOF_DIGIT);
1460 for (value = 0; value < 256; value++)
1461 if (isDIGIT(value))
1462 ANYOF_BITMAP_CLEAR(data->start_class, value);
1463 }
1464 else {
1465 if (data->start_class->flags & ANYOF_LOCALE)
1466 ANYOF_CLASS_SET(data->start_class,ANYOF_NDIGIT);
1467 else {
1468 for (value = 0; value < 256; value++)
1469 if (!isDIGIT(value))
b81d288d 1470 ANYOF_BITMAP_SET(data->start_class, value);
653099ff
GS
1471 }
1472 }
1473 break;
1474 }
1475 if (flags & SCF_DO_STCLASS_OR)
1476 cl_and(data->start_class, &and_with);
1477 flags &= ~SCF_DO_STCLASS;
1478 }
a0ed51b3 1479 }
22c35a8c 1480 else if (PL_regkind[(U8)OP(scan)] == EOL && flags & SCF_DO_SUBSTR) {
c277df42
IZ
1481 data->flags |= (OP(scan) == MEOL
1482 ? SF_BEFORE_MEOL
1483 : SF_BEFORE_SEOL);
a0ed51b3 1484 }
653099ff
GS
1485 else if ( PL_regkind[(U8)OP(scan)] == BRANCHJ
1486 /* Lookbehind, or need to calculate parens/evals/stclass: */
1487 && (scan->flags || data || (flags & SCF_DO_STCLASS))
c277df42 1488 && (OP(scan) == IFMATCH || OP(scan) == UNLESSM)) {
653099ff 1489 /* Lookahead/lookbehind */
cb434fcc 1490 I32 deltanext, minnext, fake = 0;
c277df42 1491 regnode *nscan;
653099ff
GS
1492 struct regnode_charclass_class intrnl;
1493 int f = 0;
c277df42
IZ
1494
1495 data_fake.flags = 0;
b81d288d 1496 if (data) {
2c2d71f5 1497 data_fake.whilem_c = data->whilem_c;
cb434fcc
IZ
1498 data_fake.last_closep = data->last_closep;
1499 }
1500 else
1501 data_fake.last_closep = &fake;
653099ff
GS
1502 if ( flags & SCF_DO_STCLASS && !scan->flags
1503 && OP(scan) == IFMATCH ) { /* Lookahead */
830247a4 1504 cl_init(pRExC_state, &intrnl);
653099ff 1505 data_fake.start_class = &intrnl;
e1901655 1506 f |= SCF_DO_STCLASS_AND;
653099ff 1507 }
e1901655
IZ
1508 if (flags & SCF_WHILEM_VISITED_POS)
1509 f |= SCF_WHILEM_VISITED_POS;
c277df42
IZ
1510 next = regnext(scan);
1511 nscan = NEXTOPER(NEXTOPER(scan));
830247a4 1512 minnext = study_chunk(pRExC_state, &nscan, &deltanext, last, &data_fake, f);
c277df42
IZ
1513 if (scan->flags) {
1514 if (deltanext) {
9baa0206 1515 vFAIL("Variable length lookbehind not implemented");
a0ed51b3
LW
1516 }
1517 else if (minnext > U8_MAX) {
9baa0206 1518 vFAIL2("Lookbehind longer than %"UVuf" not implemented", (UV)U8_MAX);
c277df42
IZ
1519 }
1520 scan->flags = minnext;
1521 }
1522 if (data && data_fake.flags & (SF_HAS_PAR|SF_IN_PAR))
1523 pars++;
405ff068 1524 if (data && (data_fake.flags & SF_HAS_EVAL))
c277df42 1525 data->flags |= SF_HAS_EVAL;
2c2d71f5
JH
1526 if (data)
1527 data->whilem_c = data_fake.whilem_c;
e1901655 1528 if (f & SCF_DO_STCLASS_AND) {
653099ff
GS
1529 int was = (data->start_class->flags & ANYOF_EOS);
1530
1531 cl_and(data->start_class, &intrnl);
1532 if (was)
1533 data->start_class->flags |= ANYOF_EOS;
1534 }
a0ed51b3
LW
1535 }
1536 else if (OP(scan) == OPEN) {
c277df42 1537 pars++;
a0ed51b3 1538 }
cb434fcc
IZ
1539 else if (OP(scan) == CLOSE) {
1540 if (ARG(scan) == is_par) {
1541 next = regnext(scan);
c277df42 1542
cb434fcc
IZ
1543 if ( next && (OP(next) != WHILEM) && next < last)
1544 is_par = 0; /* Disable optimization */
1545 }
1546 if (data)
1547 *(data->last_closep) = ARG(scan);
a0ed51b3
LW
1548 }
1549 else if (OP(scan) == EVAL) {
c277df42
IZ
1550 if (data)
1551 data->flags |= SF_HAS_EVAL;
1552 }
96776eda 1553 else if (OP(scan) == LOGICAL && scan->flags == 2) { /* Embedded follows */
0f5d15d6 1554 if (flags & SCF_DO_SUBSTR) {
830247a4 1555 scan_commit(pRExC_state,data);
0f5d15d6
IZ
1556 data->longest = &(data->longest_float);
1557 }
1558 is_inf = is_inf_internal = 1;
653099ff 1559 if (flags & SCF_DO_STCLASS_OR) /* Allow everything */
830247a4 1560 cl_anything(pRExC_state, data->start_class);
96776eda 1561 flags &= ~SCF_DO_STCLASS;
0f5d15d6 1562 }
c277df42
IZ
1563 /* Else: zero-length, ignore. */
1564 scan = regnext(scan);
1565 }
1566
1567 finish:
1568 *scanp = scan;
aca2d497 1569 *deltap = is_inf_internal ? I32_MAX : delta;
b81d288d 1570 if (flags & SCF_DO_SUBSTR && is_inf)
c277df42
IZ
1571 data->pos_delta = I32_MAX - data->pos_min;
1572 if (is_par > U8_MAX)
1573 is_par = 0;
1574 if (is_par && pars==1 && data) {
1575 data->flags |= SF_IN_PAR;
1576 data->flags &= ~SF_HAS_PAR;
a0ed51b3
LW
1577 }
1578 else if (pars && data) {
c277df42
IZ
1579 data->flags |= SF_HAS_PAR;
1580 data->flags &= ~SF_IN_PAR;
1581 }
653099ff
GS
1582 if (flags & SCF_DO_STCLASS_OR)
1583 cl_and(data->start_class, &and_with);
c277df42
IZ
1584 return min;
1585}
1586
76e3520e 1587STATIC I32
830247a4 1588S_add_data(pTHX_ RExC_state_t *pRExC_state, I32 n, char *s)
c277df42 1589{
830247a4 1590 if (RExC_rx->data) {
b81d288d
AB
1591 Renewc(RExC_rx->data,
1592 sizeof(*RExC_rx->data) + sizeof(void*) * (RExC_rx->data->count + n - 1),
c277df42 1593 char, struct reg_data);
830247a4
IZ
1594 Renew(RExC_rx->data->what, RExC_rx->data->count + n, U8);
1595 RExC_rx->data->count += n;
a0ed51b3
LW
1596 }
1597 else {
830247a4 1598 Newc(1207, RExC_rx->data, sizeof(*RExC_rx->data) + sizeof(void*) * (n - 1),
c277df42 1599 char, struct reg_data);
830247a4
IZ
1600 New(1208, RExC_rx->data->what, n, U8);
1601 RExC_rx->data->count = n;
c277df42 1602 }
830247a4
IZ
1603 Copy(s, RExC_rx->data->what + RExC_rx->data->count - n, n, U8);
1604 return RExC_rx->data->count - n;
c277df42
IZ
1605}
1606
d88dccdf 1607void
864dbfa3 1608Perl_reginitcolors(pTHX)
d88dccdf 1609{
d88dccdf
IZ
1610 int i = 0;
1611 char *s = PerlEnv_getenv("PERL_RE_COLORS");
b81d288d 1612
d88dccdf
IZ
1613 if (s) {
1614 PL_colors[0] = s = savepv(s);
1615 while (++i < 6) {
1616 s = strchr(s, '\t');
1617 if (s) {
1618 *s = '\0';
1619 PL_colors[i] = ++s;
1620 }
1621 else
c712d376 1622 PL_colors[i] = s = "";
d88dccdf
IZ
1623 }
1624 } else {
b81d288d 1625 while (i < 6)
d88dccdf
IZ
1626 PL_colors[i++] = "";
1627 }
1628 PL_colorset = 1;
1629}
1630
8615cb43 1631
a687059c 1632/*
e50aee73 1633 - pregcomp - compile a regular expression into internal code
a687059c
LW
1634 *
1635 * We can't allocate space until we know how big the compiled form will be,
1636 * but we can't compile it (and thus know how big it is) until we've got a
1637 * place to put the code. So we cheat: we compile it twice, once with code
1638 * generation turned off and size counting turned on, and once "for real".
1639 * This also means that we don't allocate space until we are sure that the
1640 * thing really will compile successfully, and we never have to move the
1641 * code and thus invalidate pointers into it. (Note that it has to be in
1642 * one piece because free() must be able to free it all.) [NB: not true in perl]
1643 *
1644 * Beware that the optimization-preparation code in here knows about some
1645 * of the structure of the compiled regexp. [I'll say.]
1646 */
1647regexp *
864dbfa3 1648Perl_pregcomp(pTHX_ char *exp, char *xend, PMOP *pm)
a687059c 1649{
a0d0e21e 1650 register regexp *r;
c277df42 1651 regnode *scan;
c277df42 1652 regnode *first;
a0d0e21e 1653 I32 flags;
a0d0e21e
LW
1654 I32 minlen = 0;
1655 I32 sawplus = 0;
1656 I32 sawopen = 0;
2c2d71f5 1657 scan_data_t data;
830247a4
IZ
1658 RExC_state_t RExC_state;
1659 RExC_state_t *pRExC_state = &RExC_state;
a0d0e21e
LW
1660
1661 if (exp == NULL)
c277df42 1662 FAIL("NULL regexp argument");
a0d0e21e 1663
830247a4 1664 /* XXXX This looks very suspicious... */
1aa99e6b
IH
1665 if (pm->op_pmdynflags & PMdf_CMP_UTF8)
1666 RExC_utf8 = 1;
a0ed51b3 1667 else
1aa99e6b 1668 RExC_utf8 = 0;
a0ed51b3 1669
5cfc7842 1670 RExC_precomp = exp;
35ef4773 1671 DEBUG_r(if (!PL_colorset) reginitcolors());
2c2d71f5 1672 DEBUG_r(PerlIO_printf(Perl_debug_log, "%sCompiling REx%s `%s%*s%s'\n",
d88dccdf 1673 PL_colors[4],PL_colors[5],PL_colors[0],
830247a4
IZ
1674 (int)(xend - exp), RExC_precomp, PL_colors[1]));
1675 RExC_flags16 = pm->op_pmflags;
1676 RExC_sawback = 0;
bbce6d69 1677
830247a4
IZ
1678 RExC_seen = 0;
1679 RExC_seen_zerolen = *exp == '^' ? -1 : 0;
1680 RExC_seen_evals = 0;
1681 RExC_extralen = 0;
c277df42 1682
bbce6d69 1683 /* First pass: determine size, legality. */
830247a4 1684 RExC_parse = exp;
fac92740 1685 RExC_start = exp;
830247a4
IZ
1686 RExC_end = xend;
1687 RExC_naughty = 0;
1688 RExC_npar = 1;
1689 RExC_size = 0L;
1690 RExC_emit = &PL_regdummy;
1691 RExC_whilem_seen = 0;
85ddcde9
JH
1692#if 0 /* REGC() is (currently) a NOP at the first pass.
1693 * Clever compilers notice this and complain. --jhi */
830247a4 1694 REGC((U8)REG_MAGIC, (char*)RExC_emit);
85ddcde9 1695#endif
830247a4 1696 if (reg(pRExC_state, 0, &flags) == NULL) {
830247a4 1697 RExC_precomp = Nullch;
a0d0e21e
LW
1698 return(NULL);
1699 }
830247a4 1700 DEBUG_r(PerlIO_printf(Perl_debug_log, "size %"IVdf" ", (IV)RExC_size));
c277df42 1701
c277df42
IZ
1702 /* Small enough for pointer-storage convention?
1703 If extralen==0, this means that we will not need long jumps. */
830247a4
IZ
1704 if (RExC_size >= 0x10000L && RExC_extralen)
1705 RExC_size += RExC_extralen;
c277df42 1706 else
830247a4
IZ
1707 RExC_extralen = 0;
1708 if (RExC_whilem_seen > 15)
1709 RExC_whilem_seen = 15;
a0d0e21e 1710
bbce6d69 1711 /* Allocate space and initialize. */
830247a4 1712 Newc(1001, r, sizeof(regexp) + (unsigned)RExC_size * sizeof(regnode),
c277df42 1713 char, regexp);
a0d0e21e 1714 if (r == NULL)
b45f050a
JF
1715 FAIL("Regexp out of space");
1716
0f79a09d
GS
1717#ifdef DEBUGGING
1718 /* avoid reading uninitialized memory in DEBUGGING code in study_chunk() */
830247a4 1719 Zero(r, sizeof(regexp) + (unsigned)RExC_size * sizeof(regnode), char);
0f79a09d 1720#endif
c277df42 1721 r->refcnt = 1;
bbce6d69 1722 r->prelen = xend - exp;
5cfc7842 1723 r->precomp = savepvn(RExC_precomp, r->prelen);
cf93c79d
IZ
1724 r->subbeg = NULL;
1725 r->reganch = pm->op_pmflags & PMf_COMPILETIME;
830247a4 1726 r->nparens = RExC_npar - 1; /* set early to validate backrefs */
4327152a
IZ
1727
1728 r->substrs = 0; /* Useful during FAIL. */
1729 r->startp = 0; /* Useful during FAIL. */
1730 r->endp = 0; /* Useful during FAIL. */
1731
fac92740
MJD
1732 Newz(1304, r->offsets, 2*RExC_size+1, U32); /* MJD 20001228 */
1733 if (r->offsets) {
1734 r->offsets[0] = RExC_size;
1735 }
1736 DEBUG_r(PerlIO_printf(Perl_debug_log,
392fbf5d 1737 "%s %"UVuf" bytes for offset annotations.\n",
fac92740 1738 r->offsets ? "Got" : "Couldn't get",
392fbf5d 1739 (UV)((2*RExC_size+1) * sizeof(U32))));
fac92740 1740
830247a4 1741 RExC_rx = r;
bbce6d69 1742
1743 /* Second pass: emit code. */
830247a4
IZ
1744 RExC_parse = exp;
1745 RExC_end = xend;
1746 RExC_naughty = 0;
1747 RExC_npar = 1;
fac92740 1748 RExC_emit_start = r->program;
830247a4 1749 RExC_emit = r->program;
2cd61cdb 1750 /* Store the count of eval-groups for security checks: */
830247a4
IZ
1751 RExC_emit->next_off = ((RExC_seen_evals > U16_MAX) ? U16_MAX : RExC_seen_evals);
1752 REGC((U8)REG_MAGIC, (char*) RExC_emit++);
c277df42 1753 r->data = 0;
830247a4 1754 if (reg(pRExC_state, 0, &flags) == NULL)
a0d0e21e
LW
1755 return(NULL);
1756
1757 /* Dig out information for optimizations. */
cf93c79d 1758 r->reganch = pm->op_pmflags & PMf_COMPILETIME; /* Again? */
830247a4 1759 pm->op_pmflags = RExC_flags16;
a0ed51b3
LW
1760 if (UTF)
1761 r->reganch |= ROPT_UTF8;
c277df42 1762 r->regstclass = NULL;
830247a4 1763 if (RExC_naughty >= 10) /* Probably an expensive pattern. */
a0ed51b3 1764 r->reganch |= ROPT_NAUGHTY;
c277df42 1765 scan = r->program + 1; /* First BRANCH. */
2779dcf1
IZ
1766
1767 /* XXXX To minimize changes to RE engine we always allocate
1768 3-units-long substrs field. */
1769 Newz(1004, r->substrs, 1, struct reg_substr_data);
1770
2c2d71f5 1771 StructCopy(&zero_scan_data, &data, scan_data_t);
653099ff 1772 /* XXXX Should not we check for something else? Usually it is OPEN1... */
c277df42 1773 if (OP(scan) != BRANCH) { /* Only one top-level choice. */
c277df42 1774 I32 fake;
c5254dd6 1775 STRLEN longest_float_length, longest_fixed_length;
653099ff
GS
1776 struct regnode_charclass_class ch_class;
1777 int stclass_flag;
cb434fcc 1778 I32 last_close = 0;
a0d0e21e
LW
1779
1780 first = scan;
c277df42 1781 /* Skip introductions and multiplicators >= 1. */
a0d0e21e 1782 while ((OP(first) == OPEN && (sawopen = 1)) ||
653099ff 1783 /* An OR of *one* alternative - should not happen now. */
a0d0e21e
LW
1784 (OP(first) == BRANCH && OP(regnext(first)) != BRANCH) ||
1785 (OP(first) == PLUS) ||
1786 (OP(first) == MINMOD) ||
653099ff 1787 /* An {n,m} with n>0 */
22c35a8c 1788 (PL_regkind[(U8)OP(first)] == CURLY && ARG1(first) > 0) ) {
a0d0e21e
LW
1789 if (OP(first) == PLUS)
1790 sawplus = 1;
1791 else
1792 first += regarglen[(U8)OP(first)];
1793 first = NEXTOPER(first);
a687059c
LW
1794 }
1795
a0d0e21e
LW
1796 /* Starting-point info. */
1797 again:
653099ff 1798 if (PL_regkind[(U8)OP(first)] == EXACT) {
1aa99e6b
IH
1799 if (OP(first) == EXACT)
1800 ; /* Empty, get anchored substr later. */
1801 else if ((OP(first) == EXACTF || OP(first) == EXACTFL))
b3c9acc1
IZ
1802 r->regstclass = first;
1803 }
653099ff 1804 else if (strchr((char*)PL_simple,OP(first)))
a0d0e21e 1805 r->regstclass = first;
22c35a8c
GS
1806 else if (PL_regkind[(U8)OP(first)] == BOUND ||
1807 PL_regkind[(U8)OP(first)] == NBOUND)
a0d0e21e 1808 r->regstclass = first;
22c35a8c 1809 else if (PL_regkind[(U8)OP(first)] == BOL) {
cad2e5aa
JH
1810 r->reganch |= (OP(first) == MBOL
1811 ? ROPT_ANCH_MBOL
1812 : (OP(first) == SBOL
1813 ? ROPT_ANCH_SBOL
1814 : ROPT_ANCH_BOL));
a0d0e21e 1815 first = NEXTOPER(first);
774d564b 1816 goto again;
1817 }
1818 else if (OP(first) == GPOS) {
1819 r->reganch |= ROPT_ANCH_GPOS;
1820 first = NEXTOPER(first);
1821 goto again;
a0d0e21e
LW
1822 }
1823 else if ((OP(first) == STAR &&
22c35a8c 1824 PL_regkind[(U8)OP(NEXTOPER(first))] == REG_ANY) &&
a0d0e21e
LW
1825 !(r->reganch & ROPT_ANCH) )
1826 {
1827 /* turn .* into ^.* with an implied $*=1 */
cad2e5aa
JH
1828 int type = OP(NEXTOPER(first));
1829
ffc61ed2 1830 if (type == REG_ANY)
cad2e5aa
JH
1831 type = ROPT_ANCH_MBOL;
1832 else
1833 type = ROPT_ANCH_SBOL;
1834
1835 r->reganch |= type | ROPT_IMPLICIT;
a0d0e21e 1836 first = NEXTOPER(first);
774d564b 1837 goto again;
a0d0e21e 1838 }
b81d288d 1839 if (sawplus && (!sawopen || !RExC_sawback)
830247a4 1840 && !(RExC_seen & REG_SEEN_EVAL)) /* May examine pos and $& */
cad2e5aa
JH
1841 /* x+ must match at the 1st pos of run of x's */
1842 r->reganch |= ROPT_SKIP;
a0d0e21e 1843
c277df42 1844 /* Scan is after the zeroth branch, first is atomic matcher. */
b81d288d 1845 DEBUG_r(PerlIO_printf(Perl_debug_log, "first at %"IVdf"\n",
b900a521 1846 (IV)(first - scan + 1)));
a0d0e21e
LW
1847 /*
1848 * If there's something expensive in the r.e., find the
1849 * longest literal string that must appear and make it the
1850 * regmust. Resolve ties in favor of later strings, since
1851 * the regstart check works with the beginning of the r.e.
1852 * and avoiding duplication strengthens checking. Not a
1853 * strong reason, but sufficient in the absence of others.
1854 * [Now we resolve ties in favor of the earlier string if
c277df42 1855 * it happens that c_offset_min has been invalidated, since the
a0d0e21e
LW
1856 * earlier string may buy us something the later one won't.]
1857 */
a0d0e21e 1858 minlen = 0;
a687059c 1859
79cb57f6
GS
1860 data.longest_fixed = newSVpvn("",0);
1861 data.longest_float = newSVpvn("",0);
1862 data.last_found = newSVpvn("",0);
c277df42
IZ
1863 data.longest = &(data.longest_fixed);
1864 first = scan;
653099ff 1865 if (!r->regstclass) {
830247a4 1866 cl_init(pRExC_state, &ch_class);
653099ff
GS
1867 data.start_class = &ch_class;
1868 stclass_flag = SCF_DO_STCLASS_AND;
1869 } else /* XXXX Check for BOUND? */
1870 stclass_flag = 0;
cb434fcc 1871 data.last_closep = &last_close;
653099ff 1872
830247a4 1873 minlen = study_chunk(pRExC_state, &first, &fake, scan + RExC_size, /* Up to end */
e1901655 1874 &data, SCF_DO_SUBSTR | SCF_WHILEM_VISITED_POS | stclass_flag);
830247a4 1875 if ( RExC_npar == 1 && data.longest == &(data.longest_fixed)
b81d288d 1876 && data.last_start_min == 0 && data.last_end > 0
830247a4
IZ
1877 && !RExC_seen_zerolen
1878 && (!(RExC_seen & REG_SEEN_GPOS) || (r->reganch & ROPT_ANCH_GPOS)))
c277df42 1879 r->reganch |= ROPT_CHECK_ALL;
830247a4 1880 scan_commit(pRExC_state, &data);
c277df42
IZ
1881 SvREFCNT_dec(data.last_found);
1882
a0ed51b3 1883 longest_float_length = CHR_SVLEN(data.longest_float);
c5254dd6 1884 if (longest_float_length
c277df42
IZ
1885 || (data.flags & SF_FL_BEFORE_EOL
1886 && (!(data.flags & SF_FL_BEFORE_MEOL)
830247a4 1887 || (RExC_flags16 & PMf_MULTILINE)))) {
cf93c79d
IZ
1888 int t;
1889
a0ed51b3 1890 if (SvCUR(data.longest_fixed) /* ok to leave SvCUR */
aca2d497
IZ
1891 && data.offset_fixed == data.offset_float_min
1892 && SvCUR(data.longest_fixed) == SvCUR(data.longest_float))
1893 goto remove_float; /* As in (a)+. */
1894
c277df42
IZ
1895 r->float_substr = data.longest_float;
1896 r->float_min_offset = data.offset_float_min;
1897 r->float_max_offset = data.offset_float_max;
cf93c79d
IZ
1898 t = (data.flags & SF_FL_BEFORE_EOL /* Can't have SEOL and MULTI */
1899 && (!(data.flags & SF_FL_BEFORE_MEOL)
830247a4 1900 || (RExC_flags16 & PMf_MULTILINE)));
cf93c79d 1901 fbm_compile(r->float_substr, t ? FBMcf_TAIL : 0);
a0ed51b3
LW
1902 }
1903 else {
aca2d497 1904 remove_float:
c277df42
IZ
1905 r->float_substr = Nullsv;
1906 SvREFCNT_dec(data.longest_float);
c5254dd6 1907 longest_float_length = 0;
a0d0e21e 1908 }
c277df42 1909
a0ed51b3 1910 longest_fixed_length = CHR_SVLEN(data.longest_fixed);
c5254dd6 1911 if (longest_fixed_length
c277df42
IZ
1912 || (data.flags & SF_FIX_BEFORE_EOL /* Cannot have SEOL and MULTI */
1913 && (!(data.flags & SF_FIX_BEFORE_MEOL)
830247a4 1914 || (RExC_flags16 & PMf_MULTILINE)))) {
cf93c79d
IZ
1915 int t;
1916
c277df42
IZ
1917 r->anchored_substr = data.longest_fixed;
1918 r->anchored_offset = data.offset_fixed;
cf93c79d
IZ
1919 t = (data.flags & SF_FIX_BEFORE_EOL /* Can't have SEOL and MULTI */
1920 && (!(data.flags & SF_FIX_BEFORE_MEOL)
830247a4 1921 || (RExC_flags16 & PMf_MULTILINE)));
cf93c79d 1922 fbm_compile(r->anchored_substr, t ? FBMcf_TAIL : 0);
a0ed51b3
LW
1923 }
1924 else {
c277df42
IZ
1925 r->anchored_substr = Nullsv;
1926 SvREFCNT_dec(data.longest_fixed);
c5254dd6 1927 longest_fixed_length = 0;
a0d0e21e 1928 }
b81d288d 1929 if (r->regstclass
ffc61ed2 1930 && (OP(r->regstclass) == REG_ANY || OP(r->regstclass) == SANY))
653099ff
GS
1931 r->regstclass = NULL;
1932 if ((!r->anchored_substr || r->anchored_offset) && stclass_flag
1933 && !(data.start_class->flags & ANYOF_EOS)
1934 && !cl_is_anything(data.start_class)) {
830247a4 1935 I32 n = add_data(pRExC_state, 1, "f");
653099ff 1936
b81d288d 1937 New(1006, RExC_rx->data->data[n], 1,
653099ff
GS
1938 struct regnode_charclass_class);
1939 StructCopy(data.start_class,
830247a4 1940 (struct regnode_charclass_class*)RExC_rx->data->data[n],
653099ff 1941 struct regnode_charclass_class);
830247a4 1942 r->regstclass = (regnode*)RExC_rx->data->data[n];
653099ff 1943 r->reganch &= ~ROPT_SKIP; /* Used in find_byclass(). */
ffc61ed2 1944 PL_regdata = r->data; /* for regprop() */
9c5ffd7c
JH
1945 DEBUG_r({ SV *sv = sv_newmortal();
1946 regprop(sv, (regnode*)data.start_class);
1947 PerlIO_printf(Perl_debug_log,
1948 "synthetic stclass `%s'.\n",
1949 SvPVX(sv));});
653099ff 1950 }
c277df42
IZ
1951
1952 /* A temporary algorithm prefers floated substr to fixed one to dig more info. */
c5254dd6 1953 if (longest_fixed_length > longest_float_length) {
c277df42
IZ
1954 r->check_substr = r->anchored_substr;
1955 r->check_offset_min = r->check_offset_max = r->anchored_offset;
1956 if (r->reganch & ROPT_ANCH_SINGLE)
1957 r->reganch |= ROPT_NOSCAN;
a0ed51b3
LW
1958 }
1959 else {
c277df42
IZ
1960 r->check_substr = r->float_substr;
1961 r->check_offset_min = data.offset_float_min;
1962 r->check_offset_max = data.offset_float_max;
a0d0e21e 1963 }
30382c73
IZ
1964 /* XXXX Currently intuiting is not compatible with ANCH_GPOS.
1965 This should be changed ASAP! */
1966 if (r->check_substr && !(r->reganch & ROPT_ANCH_GPOS)) {
cad2e5aa
JH
1967 r->reganch |= RE_USE_INTUIT;
1968 if (SvTAIL(r->check_substr))
1969 r->reganch |= RE_INTUIT_TAIL;
1970 }
a0ed51b3
LW
1971 }
1972 else {
c277df42
IZ
1973 /* Several toplevels. Best we can is to set minlen. */
1974 I32 fake;
653099ff 1975 struct regnode_charclass_class ch_class;
cb434fcc 1976 I32 last_close = 0;
c277df42
IZ
1977
1978 DEBUG_r(PerlIO_printf(Perl_debug_log, "\n"));
1979 scan = r->program + 1;
830247a4 1980 cl_init(pRExC_state, &ch_class);
653099ff 1981 data.start_class = &ch_class;
cb434fcc 1982 data.last_closep = &last_close;
e1901655 1983 minlen = study_chunk(pRExC_state, &scan, &fake, scan + RExC_size, &data, SCF_DO_STCLASS_AND|SCF_WHILEM_VISITED_POS);
c277df42 1984 r->check_substr = r->anchored_substr = r->float_substr = Nullsv;
653099ff
GS
1985 if (!(data.start_class->flags & ANYOF_EOS)
1986 && !cl_is_anything(data.start_class)) {
830247a4 1987 I32 n = add_data(pRExC_state, 1, "f");
653099ff 1988
b81d288d 1989 New(1006, RExC_rx->data->data[n], 1,
653099ff
GS
1990 struct regnode_charclass_class);
1991 StructCopy(data.start_class,
830247a4 1992 (struct regnode_charclass_class*)RExC_rx->data->data[n],
653099ff 1993 struct regnode_charclass_class);
830247a4 1994 r->regstclass = (regnode*)RExC_rx->data->data[n];
653099ff 1995 r->reganch &= ~ROPT_SKIP; /* Used in find_byclass(). */
9c5ffd7c
JH
1996 DEBUG_r({ SV* sv = sv_newmortal();
1997 regprop(sv, (regnode*)data.start_class);
1998 PerlIO_printf(Perl_debug_log,
1999 "synthetic stclass `%s'.\n",
2000 SvPVX(sv));});
653099ff 2001 }
a0d0e21e
LW
2002 }
2003
a0d0e21e 2004 r->minlen = minlen;
b81d288d 2005 if (RExC_seen & REG_SEEN_GPOS)
c277df42 2006 r->reganch |= ROPT_GPOS_SEEN;
830247a4 2007 if (RExC_seen & REG_SEEN_LOOKBEHIND)
c277df42 2008 r->reganch |= ROPT_LOOKBEHIND_SEEN;
830247a4 2009 if (RExC_seen & REG_SEEN_EVAL)
ce862d02 2010 r->reganch |= ROPT_EVAL_SEEN;
3baa4c62
JH
2011 if (RExC_seen & REG_SEEN_SANY)
2012 r->reganch |= ROPT_SANY_SEEN;
830247a4
IZ
2013 Newz(1002, r->startp, RExC_npar, I32);
2014 Newz(1002, r->endp, RExC_npar, I32);
ffc61ed2 2015 PL_regdata = r->data; /* for regprop() */
a0d0e21e
LW
2016 DEBUG_r(regdump(r));
2017 return(r);
a687059c
LW
2018}
2019
2020/*
2021 - reg - regular expression, i.e. main body or parenthesized thing
2022 *
2023 * Caller must absorb opening parenthesis.
2024 *
2025 * Combining parenthesis handling with the base level of regular expression
2026 * is a trifle forced, but the need to tie the tails of the branches to what
2027 * follows makes it hard to avoid.
2028 */
76e3520e 2029STATIC regnode *
830247a4 2030S_reg(pTHX_ RExC_state_t *pRExC_state, I32 paren, I32 *flagp)
c277df42 2031 /* paren: Parenthesized? 0=top, 1=(, inside: changed to letter. */
a687059c 2032{
c277df42
IZ
2033 register regnode *ret; /* Will be the head of the group. */
2034 register regnode *br;
2035 register regnode *lastbr;
2036 register regnode *ender = 0;
a0d0e21e 2037 register I32 parno = 0;
830247a4 2038 I32 flags, oregflags = RExC_flags16, have_branch = 0, open = 0;
9d1d55b5
JP
2039
2040 /* for (?g), (?gc), and (?o) warnings; warning
2041 about (?c) will warn about (?g) -- japhy */
2042
2043 I32 wastedflags = 0x00,
2044 wasted_o = 0x01,
2045 wasted_g = 0x02,
2046 wasted_gc = 0x02 | 0x04,
2047 wasted_c = 0x04;
2048
fac92740 2049 char * parse_start = RExC_parse; /* MJD */
830247a4 2050 char *oregcomp_parse = RExC_parse;
c277df42 2051 char c;
a0d0e21e 2052
821b33a5 2053 *flagp = 0; /* Tentatively. */
a0d0e21e 2054
9d1d55b5 2055
a0d0e21e
LW
2056 /* Make an OPEN node, if parenthesized. */
2057 if (paren) {
fac92740 2058 if (*RExC_parse == '?') { /* (?...) */
ca9dfc88
IZ
2059 U16 posflags = 0, negflags = 0;
2060 U16 *flagsp = &posflags;
0f5d15d6 2061 int logical = 0;
830247a4 2062 char *seqstart = RExC_parse;
ca9dfc88 2063
830247a4
IZ
2064 RExC_parse++;
2065 paren = *RExC_parse++;
c277df42 2066 ret = NULL; /* For look-ahead/behind. */
a0d0e21e 2067 switch (paren) {
fac92740 2068 case '<': /* (?<...) */
830247a4 2069 RExC_seen |= REG_SEEN_LOOKBEHIND;
b81d288d 2070 if (*RExC_parse == '!')
c277df42 2071 paren = ',';
b81d288d 2072 if (*RExC_parse != '=' && *RExC_parse != '!')
c277df42 2073 goto unknown;
830247a4 2074 RExC_parse++;
fac92740
MJD
2075 case '=': /* (?=...) */
2076 case '!': /* (?!...) */
830247a4 2077 RExC_seen_zerolen++;
fac92740
MJD
2078 case ':': /* (?:...) */
2079 case '>': /* (?>...) */
a0d0e21e 2080 break;
fac92740
MJD
2081 case '$': /* (?$...) */
2082 case '@': /* (?@...) */
8615cb43 2083 vFAIL2("Sequence (?%c...) not implemented", (int)paren);
a0d0e21e 2084 break;
fac92740 2085 case '#': /* (?#...) */
830247a4
IZ
2086 while (*RExC_parse && *RExC_parse != ')')
2087 RExC_parse++;
2088 if (*RExC_parse != ')')
c277df42 2089 FAIL("Sequence (?#... not terminated");
830247a4 2090 nextchar(pRExC_state);
a0d0e21e
LW
2091 *flagp = TRYAGAIN;
2092 return NULL;
fac92740 2093 case 'p': /* (?p...) */
f9373011
PM
2094 if (SIZE_ONLY && ckWARN2(WARN_DEPRECATED, WARN_REGEXP))
2095 vWARNdep(RExC_parse, "(?p{}) is deprecated - use (??{})");
8c8ad484 2096 /* FALL THROUGH*/
fac92740 2097 case '?': /* (??...) */
0f5d15d6 2098 logical = 1;
830247a4 2099 paren = *RExC_parse++;
0f5d15d6 2100 /* FALL THROUGH */
fac92740 2101 case '{': /* (?{...}) */
c277df42 2102 {
c277df42
IZ
2103 I32 count = 1, n = 0;
2104 char c;
830247a4 2105 char *s = RExC_parse;
c277df42
IZ
2106 SV *sv;
2107 OP_4tree *sop, *rop;
2108
830247a4
IZ
2109 RExC_seen_zerolen++;
2110 RExC_seen |= REG_SEEN_EVAL;
2111 while (count && (c = *RExC_parse)) {
2112 if (c == '\\' && RExC_parse[1])
2113 RExC_parse++;
b81d288d 2114 else if (c == '{')
c277df42 2115 count++;
b81d288d 2116 else if (c == '}')
c277df42 2117 count--;
830247a4 2118 RExC_parse++;
c277df42 2119 }
830247a4 2120 if (*RExC_parse != ')')
b45f050a 2121 {
b81d288d 2122 RExC_parse = s;
b45f050a
JF
2123 vFAIL("Sequence (?{...}) not terminated or not {}-balanced");
2124 }
c277df42
IZ
2125 if (!SIZE_ONLY) {
2126 AV *av;
b81d288d
AB
2127
2128 if (RExC_parse - 1 - s)
830247a4 2129 sv = newSVpvn(s, RExC_parse - 1 - s);
c277df42 2130 else
79cb57f6 2131 sv = newSVpvn("", 0);
c277df42 2132
569233ed
SB
2133 ENTER;
2134 Perl_save_re_context(aTHX);
c277df42 2135 rop = sv_compile_2op(sv, &sop, "re", &av);
569233ed 2136 LEAVE;
c277df42 2137
830247a4
IZ
2138 n = add_data(pRExC_state, 3, "nop");
2139 RExC_rx->data->data[n] = (void*)rop;
2140 RExC_rx->data->data[n+1] = (void*)sop;
2141 RExC_rx->data->data[n+2] = (void*)av;
c277df42 2142 SvREFCNT_dec(sv);
a0ed51b3 2143 }
e24b16f9 2144 else { /* First pass */
830247a4 2145 if (PL_reginterp_cnt < ++RExC_seen_evals
e24b16f9 2146 && PL_curcop != &PL_compiling)
2cd61cdb
IZ
2147 /* No compiled RE interpolated, has runtime
2148 components ===> unsafe. */
2149 FAIL("Eval-group not allowed at runtime, use re 'eval'");
5b61d3f7 2150 if (PL_tainting && PL_tainted)
cc6b7395 2151 FAIL("Eval-group in insecure regular expression");
c277df42
IZ
2152 }
2153
830247a4 2154 nextchar(pRExC_state);
0f5d15d6 2155 if (logical) {
830247a4 2156 ret = reg_node(pRExC_state, LOGICAL);
0f5d15d6
IZ
2157 if (!SIZE_ONLY)
2158 ret->flags = 2;
830247a4 2159 regtail(pRExC_state, ret, reganode(pRExC_state, EVAL, n));
fac92740 2160 /* deal with the length of this later - MJD */
0f5d15d6
IZ
2161 return ret;
2162 }
830247a4 2163 return reganode(pRExC_state, EVAL, n);
c277df42 2164 }
fac92740 2165 case '(': /* (?(?{...})...) and (?(?=...)...) */
c277df42 2166 {
fac92740 2167 if (RExC_parse[0] == '?') { /* (?(?...)) */
b81d288d
AB
2168 if (RExC_parse[1] == '=' || RExC_parse[1] == '!'
2169 || RExC_parse[1] == '<'
830247a4 2170 || RExC_parse[1] == '{') { /* Lookahead or eval. */
c277df42
IZ
2171 I32 flag;
2172
830247a4 2173 ret = reg_node(pRExC_state, LOGICAL);
0f5d15d6
IZ
2174 if (!SIZE_ONLY)
2175 ret->flags = 1;
830247a4 2176 regtail(pRExC_state, ret, reg(pRExC_state, 1, &flag));
c277df42 2177 goto insert_if;
b81d288d 2178 }
a0ed51b3 2179 }
830247a4 2180 else if (RExC_parse[0] >= '1' && RExC_parse[0] <= '9' ) {
fac92740 2181 /* (?(1)...) */
830247a4 2182 parno = atoi(RExC_parse++);
c277df42 2183
830247a4
IZ
2184 while (isDIGIT(*RExC_parse))
2185 RExC_parse++;
fac92740
MJD
2186 ret = reganode(pRExC_state, GROUPP, parno);
2187
830247a4 2188 if ((c = *nextchar(pRExC_state)) != ')')
b45f050a 2189 vFAIL("Switch condition not recognized");
c277df42 2190 insert_if:
830247a4
IZ
2191 regtail(pRExC_state, ret, reganode(pRExC_state, IFTHEN, 0));
2192 br = regbranch(pRExC_state, &flags, 1);
c277df42 2193 if (br == NULL)
830247a4 2194 br = reganode(pRExC_state, LONGJMP, 0);
c277df42 2195 else
830247a4
IZ
2196 regtail(pRExC_state, br, reganode(pRExC_state, LONGJMP, 0));
2197 c = *nextchar(pRExC_state);
d1b80229
IZ
2198 if (flags&HASWIDTH)
2199 *flagp |= HASWIDTH;
c277df42 2200 if (c == '|') {
830247a4
IZ
2201 lastbr = reganode(pRExC_state, IFTHEN, 0); /* Fake one for optimizer. */
2202 regbranch(pRExC_state, &flags, 1);
2203 regtail(pRExC_state, ret, lastbr);
d1b80229
IZ
2204 if (flags&HASWIDTH)
2205 *flagp |= HASWIDTH;
830247a4 2206 c = *nextchar(pRExC_state);
a0ed51b3
LW
2207 }
2208 else
c277df42
IZ
2209 lastbr = NULL;
2210 if (c != ')')
8615cb43 2211 vFAIL("Switch (?(condition)... contains too many branches");
830247a4
IZ
2212 ender = reg_node(pRExC_state, TAIL);
2213 regtail(pRExC_state, br, ender);
c277df42 2214 if (lastbr) {
830247a4
IZ
2215 regtail(pRExC_state, lastbr, ender);
2216 regtail(pRExC_state, NEXTOPER(NEXTOPER(lastbr)), ender);
a0ed51b3
LW
2217 }
2218 else
830247a4 2219 regtail(pRExC_state, ret, ender);
c277df42 2220 return ret;
a0ed51b3
LW
2221 }
2222 else {
830247a4 2223 vFAIL2("Unknown switch condition (?(%.2s", RExC_parse);
c277df42
IZ
2224 }
2225 }
1b1626e4 2226 case 0:
830247a4 2227 RExC_parse--; /* for vFAIL to print correctly */
8615cb43 2228 vFAIL("Sequence (? incomplete");
1b1626e4 2229 break;
a0d0e21e 2230 default:
830247a4 2231 --RExC_parse;
fac92740 2232 parse_flags: /* (?i) */
830247a4 2233 while (*RExC_parse && strchr("iogcmsx", *RExC_parse)) {
9d1d55b5
JP
2234 /* (?g), (?gc) and (?o) are useless here
2235 and must be globally applied -- japhy */
2236
2237 if (*RExC_parse == 'o' || *RExC_parse == 'g') {
2238 if (SIZE_ONLY && ckWARN(WARN_REGEXP)) {
2239 I32 wflagbit = *RExC_parse == 'o' ? wasted_o : wasted_g;
2240 if (! (wastedflags & wflagbit) ) {
2241 wastedflags |= wflagbit;
2242 vWARN5(
2243 RExC_parse + 1,
2244 "Useless (%s%c) - %suse /%c modifier",
2245 flagsp == &negflags ? "?-" : "?",
2246 *RExC_parse,
2247 flagsp == &negflags ? "don't " : "",
2248 *RExC_parse
2249 );
2250 }
2251 }
2252 }
2253 else if (*RExC_parse == 'c') {
2254 if (SIZE_ONLY && ckWARN(WARN_REGEXP)) {
2255 if (! (wastedflags & wasted_c) ) {
2256 wastedflags |= wasted_gc;
2257 vWARN3(
2258 RExC_parse + 1,
2259 "Useless (%sc) - %suse /gc modifier",
2260 flagsp == &negflags ? "?-" : "?",
2261 flagsp == &negflags ? "don't " : ""
2262 );
2263 }
2264 }
2265 }
2266 else { pmflag(flagsp, *RExC_parse); }
2267
830247a4 2268 ++RExC_parse;
ca9dfc88 2269 }
830247a4 2270 if (*RExC_parse == '-') {
ca9dfc88 2271 flagsp = &negflags;
9d1d55b5 2272 wastedflags = 0; /* reset so (?g-c) warns twice */
830247a4 2273 ++RExC_parse;
ca9dfc88 2274 goto parse_flags;
48c036b1 2275 }
830247a4
IZ
2276 RExC_flags16 |= posflags;
2277 RExC_flags16 &= ~negflags;
2278 if (*RExC_parse == ':') {
2279 RExC_parse++;
ca9dfc88
IZ
2280 paren = ':';
2281 break;
2282 }
c277df42 2283 unknown:
830247a4
IZ
2284 if (*RExC_parse != ')') {
2285 RExC_parse++;
2286 vFAIL3("Sequence (%.*s...) not recognized", RExC_parse-seqstart, seqstart);
b45f050a 2287 }
830247a4 2288 nextchar(pRExC_state);
a0d0e21e
LW
2289 *flagp = TRYAGAIN;
2290 return NULL;
2291 }
2292 }
fac92740 2293 else { /* (...) */
830247a4
IZ
2294 parno = RExC_npar;
2295 RExC_npar++;
2296 ret = reganode(pRExC_state, OPEN, parno);
fac92740
MJD
2297 Set_Node_Length(ret, 1); /* MJD */
2298 Set_Node_Offset(ret, RExC_parse); /* MJD */
c277df42 2299 open = 1;
a0d0e21e 2300 }
a0ed51b3 2301 }
fac92740 2302 else /* ! paren */
a0d0e21e
LW
2303 ret = NULL;
2304
2305 /* Pick up the branches, linking them together. */
fac92740 2306 parse_start = RExC_parse; /* MJD */
830247a4 2307 br = regbranch(pRExC_state, &flags, 1);
fac92740
MJD
2308 /* branch_len = (paren != 0); */
2309
a0d0e21e
LW
2310 if (br == NULL)
2311 return(NULL);
830247a4
IZ
2312 if (*RExC_parse == '|') {
2313 if (!SIZE_ONLY && RExC_extralen) {
2314 reginsert(pRExC_state, BRANCHJ, br);
a0ed51b3 2315 }
fac92740 2316 else { /* MJD */
830247a4 2317 reginsert(pRExC_state, BRANCH, br);
fac92740
MJD
2318 Set_Node_Length(br, paren != 0);
2319 Set_Node_Offset_To_R(br-RExC_emit_start, parse_start-RExC_start);
2320 }
c277df42
IZ
2321 have_branch = 1;
2322 if (SIZE_ONLY)
830247a4 2323 RExC_extralen += 1; /* For BRANCHJ-BRANCH. */
a0ed51b3
LW
2324 }
2325 else if (paren == ':') {
c277df42
IZ
2326 *flagp |= flags&SIMPLE;
2327 }
2328 if (open) { /* Starts with OPEN. */
830247a4 2329 regtail(pRExC_state, ret, br); /* OPEN -> first. */
a0ed51b3
LW
2330 }
2331 else if (paren != '?') /* Not Conditional */
a0d0e21e 2332 ret = br;
821b33a5
IZ
2333 if (flags&HASWIDTH)
2334 *flagp |= HASWIDTH;
a0d0e21e 2335 *flagp |= flags&SPSTART;
c277df42 2336 lastbr = br;
830247a4
IZ
2337 while (*RExC_parse == '|') {
2338 if (!SIZE_ONLY && RExC_extralen) {
2339 ender = reganode(pRExC_state, LONGJMP,0);
2340 regtail(pRExC_state, NEXTOPER(NEXTOPER(lastbr)), ender); /* Append to the previous. */
c277df42
IZ
2341 }
2342 if (SIZE_ONLY)
830247a4
IZ
2343 RExC_extralen += 2; /* Account for LONGJMP. */
2344 nextchar(pRExC_state);
2345 br = regbranch(pRExC_state, &flags, 0);
fac92740 2346
a687059c 2347 if (br == NULL)
a0d0e21e 2348 return(NULL);
830247a4 2349 regtail(pRExC_state, lastbr, br); /* BRANCH -> BRANCH. */
c277df42 2350 lastbr = br;
821b33a5
IZ
2351 if (flags&HASWIDTH)
2352 *flagp |= HASWIDTH;
a687059c 2353 *flagp |= flags&SPSTART;
a0d0e21e
LW
2354 }
2355
c277df42
IZ
2356 if (have_branch || paren != ':') {
2357 /* Make a closing node, and hook it on the end. */
2358 switch (paren) {
2359 case ':':
830247a4 2360 ender = reg_node(pRExC_state, TAIL);
c277df42
IZ
2361 break;
2362 case 1:
830247a4 2363 ender = reganode(pRExC_state, CLOSE, parno);
fac92740
MJD
2364 Set_Node_Offset(ender,RExC_parse+1); /* MJD */
2365 Set_Node_Length(ender,1); /* MJD */
c277df42
IZ
2366 break;
2367 case '<':
c277df42
IZ
2368 case ',':
2369 case '=':
2370 case '!':
c277df42 2371 *flagp &= ~HASWIDTH;
821b33a5
IZ
2372 /* FALL THROUGH */
2373 case '>':
830247a4 2374 ender = reg_node(pRExC_state, SUCCEED);
c277df42
IZ
2375 break;
2376 case 0:
830247a4 2377 ender = reg_node(pRExC_state, END);
c277df42
IZ
2378 break;
2379 }
830247a4 2380 regtail(pRExC_state, lastbr, ender);
a0d0e21e 2381
c277df42
IZ
2382 if (have_branch) {
2383 /* Hook the tails of the branches to the closing node. */
2384 for (br = ret; br != NULL; br = regnext(br)) {
830247a4 2385 regoptail(pRExC_state, br, ender);
c277df42
IZ
2386 }
2387 }
a0d0e21e 2388 }
c277df42
IZ
2389
2390 {
2391 char *p;
2392 static char parens[] = "=!<,>";
2393
2394 if (paren && (p = strchr(parens, paren))) {
2395 int node = ((p - parens) % 2) ? UNLESSM : IFMATCH;
2396 int flag = (p - parens) > 1;
2397
2398 if (paren == '>')
2399 node = SUSPEND, flag = 0;
830247a4 2400 reginsert(pRExC_state, node,ret);
c277df42 2401 ret->flags = flag;
830247a4 2402 regtail(pRExC_state, ret, reg_node(pRExC_state, TAIL));
c277df42 2403 }
a0d0e21e
LW
2404 }
2405
2406 /* Check for proper termination. */
ce3e6498 2407 if (paren) {
830247a4
IZ
2408 RExC_flags16 = oregflags;
2409 if (RExC_parse >= RExC_end || *nextchar(pRExC_state) != ')') {
2410 RExC_parse = oregcomp_parse;
380a0633 2411 vFAIL("Unmatched (");
ce3e6498 2412 }
a0ed51b3 2413 }
830247a4
IZ
2414 else if (!paren && RExC_parse < RExC_end) {
2415 if (*RExC_parse == ')') {
2416 RExC_parse++;
380a0633 2417 vFAIL("Unmatched )");
a0ed51b3
LW
2418 }
2419 else
b45f050a 2420 FAIL("Junk on end of regexp"); /* "Can't happen". */
a0d0e21e
LW
2421 /* NOTREACHED */
2422 }
a687059c 2423
a0d0e21e 2424 return(ret);
a687059c
LW
2425}
2426
2427/*
2428 - regbranch - one alternative of an | operator
2429 *
2430 * Implements the concatenation operator.
2431 */
76e3520e 2432STATIC regnode *
830247a4 2433S_regbranch(pTHX_ RExC_state_t *pRExC_state, I32 *flagp, I32 first)
a687059c 2434{
c277df42
IZ
2435 register regnode *ret;
2436 register regnode *chain = NULL;
2437 register regnode *latest;
2438 I32 flags = 0, c = 0;
a0d0e21e 2439
b81d288d 2440 if (first)
c277df42
IZ
2441 ret = NULL;
2442 else {
b81d288d 2443 if (!SIZE_ONLY && RExC_extralen)
830247a4 2444 ret = reganode(pRExC_state, BRANCHJ,0);
fac92740 2445 else {
830247a4 2446 ret = reg_node(pRExC_state, BRANCH);
fac92740
MJD
2447 Set_Node_Length(ret, 1);
2448 }
c277df42
IZ
2449 }
2450
b81d288d 2451 if (!first && SIZE_ONLY)
830247a4 2452 RExC_extralen += 1; /* BRANCHJ */
b81d288d 2453
c277df42 2454 *flagp = WORST; /* Tentatively. */
a0d0e21e 2455
830247a4
IZ
2456 RExC_parse--;
2457 nextchar(pRExC_state);
2458 while (RExC_parse < RExC_end && *RExC_parse != '|' && *RExC_parse != ')') {
a0d0e21e 2459 flags &= ~TRYAGAIN;
830247a4 2460 latest = regpiece(pRExC_state, &flags);
a0d0e21e
LW
2461 if (latest == NULL) {
2462 if (flags & TRYAGAIN)
2463 continue;
2464 return(NULL);
a0ed51b3
LW
2465 }
2466 else if (ret == NULL)
c277df42 2467 ret = latest;
a0d0e21e 2468 *flagp |= flags&HASWIDTH;
c277df42 2469 if (chain == NULL) /* First piece. */
a0d0e21e
LW
2470 *flagp |= flags&SPSTART;
2471 else {
830247a4
IZ
2472 RExC_naughty++;
2473 regtail(pRExC_state, chain, latest);
a687059c 2474 }
a0d0e21e 2475 chain = latest;
c277df42
IZ
2476 c++;
2477 }
2478 if (chain == NULL) { /* Loop ran zero times. */
830247a4 2479 chain = reg_node(pRExC_state, NOTHING);
c277df42
IZ
2480 if (ret == NULL)
2481 ret = chain;
2482 }
2483 if (c == 1) {
2484 *flagp |= flags&SIMPLE;
a0d0e21e 2485 }
a687059c 2486
a0d0e21e 2487 return(ret);
a687059c
LW
2488}
2489
2490/*
2491 - regpiece - something followed by possible [*+?]
2492 *
2493 * Note that the branching code sequences used for ? and the general cases
2494 * of * and + are somewhat optimized: they use the same NOTHING node as
2495 * both the endmarker for their branch list and the body of the last branch.
2496 * It might seem that this node could be dispensed with entirely, but the
2497 * endmarker role is not redundant.
2498 */
76e3520e 2499STATIC regnode *
830247a4 2500S_regpiece(pTHX_ RExC_state_t *pRExC_state, I32 *flagp)
a687059c 2501{
c277df42 2502 register regnode *ret;
a0d0e21e
LW
2503 register char op;
2504 register char *next;
2505 I32 flags;
830247a4 2506 char *origparse = RExC_parse;
a0d0e21e
LW
2507 char *maxpos;
2508 I32 min;
c277df42 2509 I32 max = REG_INFTY;
fac92740 2510 char *parse_start;
a0d0e21e 2511
830247a4 2512 ret = regatom(pRExC_state, &flags);
a0d0e21e
LW
2513 if (ret == NULL) {
2514 if (flags & TRYAGAIN)
2515 *flagp |= TRYAGAIN;
2516 return(NULL);
2517 }
2518
830247a4 2519 op = *RExC_parse;
a0d0e21e 2520
830247a4 2521 if (op == '{' && regcurly(RExC_parse)) {
fac92740 2522 parse_start = RExC_parse; /* MJD */
830247a4 2523 next = RExC_parse + 1;
a0d0e21e
LW
2524 maxpos = Nullch;
2525 while (isDIGIT(*next) || *next == ',') {
2526 if (*next == ',') {
2527 if (maxpos)
2528 break;
2529 else
2530 maxpos = next;
a687059c 2531 }
a0d0e21e
LW
2532 next++;
2533 }
2534 if (*next == '}') { /* got one */
2535 if (!maxpos)
2536 maxpos = next;
830247a4
IZ
2537 RExC_parse++;
2538 min = atoi(RExC_parse);
a0d0e21e
LW
2539 if (*maxpos == ',')
2540 maxpos++;
2541 else
830247a4 2542 maxpos = RExC_parse;
a0d0e21e
LW
2543 max = atoi(maxpos);
2544 if (!max && *maxpos != '0')
c277df42
IZ
2545 max = REG_INFTY; /* meaning "infinity" */
2546 else if (max >= REG_INFTY)
8615cb43 2547 vFAIL2("Quantifier in {,} bigger than %d", REG_INFTY - 1);
830247a4
IZ
2548 RExC_parse = next;
2549 nextchar(pRExC_state);
a0d0e21e
LW
2550
2551 do_curly:
2552 if ((flags&SIMPLE)) {
830247a4
IZ
2553 RExC_naughty += 2 + RExC_naughty / 2;
2554 reginsert(pRExC_state, CURLY, ret);
fac92740
MJD
2555 Set_Node_Offset(ret, parse_start+1); /* MJD */
2556 Set_Node_Cur_Length(ret);
a0d0e21e
LW
2557 }
2558 else {
830247a4 2559 regnode *w = reg_node(pRExC_state, WHILEM);
2c2d71f5
JH
2560
2561 w->flags = 0;
830247a4
IZ
2562 regtail(pRExC_state, ret, w);
2563 if (!SIZE_ONLY && RExC_extralen) {
2564 reginsert(pRExC_state, LONGJMP,ret);
2565 reginsert(pRExC_state, NOTHING,ret);
c277df42
IZ
2566 NEXT_OFF(ret) = 3; /* Go over LONGJMP. */
2567 }
830247a4 2568 reginsert(pRExC_state, CURLYX,ret);
fac92740
MJD
2569 /* MJD hk */
2570 Set_Node_Offset(ret, parse_start+1);
2571 Set_Node_Length(ret,
2572 op == '{' ? (RExC_parse - parse_start) : 1);
2573
830247a4 2574 if (!SIZE_ONLY && RExC_extralen)
c277df42 2575 NEXT_OFF(ret) = 3; /* Go over NOTHING to LONGJMP. */
830247a4 2576 regtail(pRExC_state, ret, reg_node(pRExC_state, NOTHING));
c277df42 2577 if (SIZE_ONLY)
830247a4
IZ
2578 RExC_whilem_seen++, RExC_extralen += 3;
2579 RExC_naughty += 4 + RExC_naughty; /* compound interest */
a0d0e21e 2580 }
c277df42 2581 ret->flags = 0;
a0d0e21e
LW
2582
2583 if (min > 0)
821b33a5
IZ
2584 *flagp = WORST;
2585 if (max > 0)
2586 *flagp |= HASWIDTH;
a0d0e21e 2587 if (max && max < min)
8615cb43 2588 vFAIL("Can't do {n,m} with n > m");
c277df42
IZ
2589 if (!SIZE_ONLY) {
2590 ARG1_SET(ret, min);
2591 ARG2_SET(ret, max);
a687059c 2592 }
a687059c 2593
a0d0e21e 2594 goto nest_check;
a687059c 2595 }
a0d0e21e 2596 }
a687059c 2597
a0d0e21e
LW
2598 if (!ISMULT1(op)) {
2599 *flagp = flags;
a687059c 2600 return(ret);
a0d0e21e 2601 }
bb20fd44 2602
c277df42 2603#if 0 /* Now runtime fix should be reliable. */
b45f050a
JF
2604
2605 /* if this is reinstated, don't forget to put this back into perldiag:
2606
2607 =item Regexp *+ operand could be empty at {#} in regex m/%s/
2608
2609 (F) The part of the regexp subject to either the * or + quantifier
2610 could match an empty string. The {#} shows in the regular
2611 expression about where the problem was discovered.
2612
2613 */
2614
bb20fd44 2615 if (!(flags&HASWIDTH) && op != '?')
b45f050a 2616 vFAIL("Regexp *+ operand could be empty");
b81d288d 2617#endif
bb20fd44 2618
fac92740 2619 parse_start = RExC_parse;
830247a4 2620 nextchar(pRExC_state);
a0d0e21e 2621
821b33a5 2622 *flagp = (op != '+') ? (WORST|SPSTART|HASWIDTH) : (WORST|HASWIDTH);
a0d0e21e
LW
2623
2624 if (op == '*' && (flags&SIMPLE)) {
830247a4 2625 reginsert(pRExC_state, STAR, ret);
c277df42 2626 ret->flags = 0;
830247a4 2627 RExC_naughty += 4;
a0d0e21e
LW
2628 }
2629 else if (op == '*') {
2630 min = 0;
2631 goto do_curly;
a0ed51b3
LW
2632 }
2633 else if (op == '+' && (flags&SIMPLE)) {
830247a4 2634 reginsert(pRExC_state, PLUS, ret);
c277df42 2635 ret->flags = 0;
830247a4 2636 RExC_naughty += 3;
a0d0e21e
LW
2637 }
2638 else if (op == '+') {
2639 min = 1;
2640 goto do_curly;
a0ed51b3
LW
2641 }
2642 else if (op == '?') {
a0d0e21e
LW
2643 min = 0; max = 1;
2644 goto do_curly;
2645 }
2646 nest_check:
e476b1b5 2647 if (ckWARN(WARN_REGEXP) && !SIZE_ONLY && !(flags&HASWIDTH) && max > REG_INFTY/3) {
830247a4 2648 vWARN3(RExC_parse,
b45f050a 2649 "%.*s matches null string many times",
830247a4 2650 RExC_parse - origparse,
b45f050a 2651 origparse);
a0d0e21e
LW
2652 }
2653
830247a4
IZ
2654 if (*RExC_parse == '?') {
2655 nextchar(pRExC_state);
2656 reginsert(pRExC_state, MINMOD, ret);
2657 regtail(pRExC_state, ret, ret + NODE_STEP_REGNODE);
a0d0e21e 2658 }
830247a4
IZ
2659 if (ISMULT2(RExC_parse)) {
2660 RExC_parse++;
b45f050a
JF
2661 vFAIL("Nested quantifiers");
2662 }
a0d0e21e
LW
2663
2664 return(ret);
a687059c
LW
2665}
2666
2667/*
2668 - regatom - the lowest level
2669 *
2670 * Optimization: gobbles an entire sequence of ordinary characters so that
2671 * it can turn them into a single node, which is smaller to store and
2672 * faster to run. Backslashed characters are exceptions, each becoming a
2673 * separate node; the code is simpler that way and it's not worth fixing.
2674 *
b45f050a 2675 * [Yes, it is worth fixing, some scripts can run twice the speed.] */
76e3520e 2676STATIC regnode *
830247a4 2677S_regatom(pTHX_ RExC_state_t *pRExC_state, I32 *flagp)
a687059c 2678{
c277df42 2679 register regnode *ret = 0;
a0d0e21e 2680 I32 flags;
f06dbbb7 2681 char *parse_start = 0;
a0d0e21e
LW
2682
2683 *flagp = WORST; /* Tentatively. */
2684
2685tryagain:
830247a4 2686 switch (*RExC_parse) {
a0d0e21e 2687 case '^':
830247a4
IZ
2688 RExC_seen_zerolen++;
2689 nextchar(pRExC_state);
2690 if (RExC_flags16 & PMf_MULTILINE)
2691 ret = reg_node(pRExC_state, MBOL);
2692 else if (RExC_flags16 & PMf_SINGLELINE)
2693 ret = reg_node(pRExC_state, SBOL);
a0d0e21e 2694 else
830247a4 2695 ret = reg_node(pRExC_state, BOL);
fac92740 2696 Set_Node_Length(ret, 1); /* MJD */
a0d0e21e
LW
2697 break;
2698 case '$':
830247a4 2699 nextchar(pRExC_state);
b81d288d 2700 if (*RExC_parse)
830247a4
IZ
2701 RExC_seen_zerolen++;
2702 if (RExC_flags16 & PMf_MULTILINE)
2703 ret = reg_node(pRExC_state, MEOL);
2704 else if (RExC_flags16 & PMf_SINGLELINE)
2705 ret = reg_node(pRExC_state, SEOL);
a0d0e21e 2706 else
830247a4 2707 ret = reg_node(pRExC_state, EOL);
fac92740 2708 Set_Node_Length(ret, 1); /* MJD */
a0d0e21e
LW
2709 break;
2710 case '.':
830247a4 2711 nextchar(pRExC_state);
ffc61ed2
JH
2712 if (RExC_flags16 & PMf_SINGLELINE)
2713 ret = reg_node(pRExC_state, SANY);
2714 else
2715 ret = reg_node(pRExC_state, REG_ANY);
2716 *flagp |= HASWIDTH|SIMPLE;
830247a4 2717 RExC_naughty++;
fac92740 2718 Set_Node_Length(ret, 1); /* MJD */
a0d0e21e
LW
2719 break;
2720 case '[':
b45f050a 2721 {
830247a4 2722 char *oregcomp_parse = ++RExC_parse;
ffc61ed2 2723 ret = regclass(pRExC_state);
830247a4
IZ
2724 if (*RExC_parse != ']') {
2725 RExC_parse = oregcomp_parse;
b45f050a
JF
2726 vFAIL("Unmatched [");
2727 }
830247a4 2728 nextchar(pRExC_state);
a0d0e21e 2729 *flagp |= HASWIDTH|SIMPLE;
fac92740 2730 Set_Node_Length(ret, RExC_parse - oregcomp_parse + 1); /* MJD */
a0d0e21e 2731 break;
b45f050a 2732 }
a0d0e21e 2733 case '(':
830247a4
IZ
2734 nextchar(pRExC_state);
2735 ret = reg(pRExC_state, 1, &flags);
a0d0e21e 2736 if (ret == NULL) {
bf93d4cc 2737 if (flags & TRYAGAIN) {
830247a4 2738 if (RExC_parse == RExC_end) {
bf93d4cc
GS
2739 /* Make parent create an empty node if needed. */
2740 *flagp |= TRYAGAIN;
2741 return(NULL);
2742 }
a0d0e21e 2743 goto tryagain;
bf93d4cc 2744 }
a0d0e21e
LW
2745 return(NULL);
2746 }
c277df42 2747 *flagp |= flags&(HASWIDTH|SPSTART|SIMPLE);
a0d0e21e
LW
2748 break;
2749 case '|':
2750 case ')':
2751 if (flags & TRYAGAIN) {
2752 *flagp |= TRYAGAIN;
2753 return NULL;
2754 }
b45f050a 2755 vFAIL("Internal urp");
a0d0e21e
LW
2756 /* Supposed to be caught earlier. */
2757 break;
85afd4ae 2758 case '{':
830247a4
IZ
2759 if (!regcurly(RExC_parse)) {
2760 RExC_parse++;
85afd4ae
CS
2761 goto defchar;
2762 }
2763 /* FALL THROUGH */
a0d0e21e
LW
2764 case '?':
2765 case '+':
2766 case '*':
830247a4 2767 RExC_parse++;
b45f050a 2768 vFAIL("Quantifier follows nothing");
a0d0e21e
LW
2769 break;
2770 case '\\':
830247a4 2771 switch (*++RExC_parse) {
a0d0e21e 2772 case 'A':
830247a4
IZ
2773 RExC_seen_zerolen++;
2774 ret = reg_node(pRExC_state, SBOL);
a0d0e21e 2775 *flagp |= SIMPLE;
830247a4 2776 nextchar(pRExC_state);
fac92740 2777 Set_Node_Length(ret, 2); /* MJD */
a0d0e21e
LW
2778 break;
2779 case 'G':
830247a4
IZ
2780 ret = reg_node(pRExC_state, GPOS);
2781 RExC_seen |= REG_SEEN_GPOS;
a0d0e21e 2782 *flagp |= SIMPLE;
830247a4 2783 nextchar(pRExC_state);
fac92740 2784 Set_Node_Length(ret, 2); /* MJD */
a0d0e21e
LW
2785 break;
2786 case 'Z':
830247a4 2787 ret = reg_node(pRExC_state, SEOL);
a0d0e21e 2788 *flagp |= SIMPLE;
830247a4 2789 nextchar(pRExC_state);
a0d0e21e 2790 break;
b85d18e9 2791 case 'z':
830247a4 2792 ret = reg_node(pRExC_state, EOS);
b85d18e9 2793 *flagp |= SIMPLE;
830247a4
IZ
2794 RExC_seen_zerolen++; /* Do not optimize RE away */
2795 nextchar(pRExC_state);
fac92740 2796 Set_Node_Length(ret, 2); /* MJD */
b85d18e9 2797 break;
4a2d328f 2798 case 'C':
830247a4 2799 ret = reg_node(pRExC_state, SANY);
3baa4c62 2800 RExC_seen |= REG_SEEN_SANY;
a0ed51b3 2801 *flagp |= HASWIDTH|SIMPLE;
830247a4 2802 nextchar(pRExC_state);
fac92740 2803 Set_Node_Length(ret, 2); /* MJD */
a0ed51b3
LW
2804 break;
2805 case 'X':
830247a4 2806 ret = reg_node(pRExC_state, CLUMP);
a0ed51b3 2807 *flagp |= HASWIDTH;
830247a4 2808 nextchar(pRExC_state);
fac92740 2809 Set_Node_Length(ret, 2); /* MJD */
a0ed51b3 2810 break;
a0d0e21e 2811 case 'w':
ffc61ed2 2812 ret = reg_node(pRExC_state, LOC ? ALNUML : ALNUM);
a0d0e21e 2813 *flagp |= HASWIDTH|SIMPLE;
830247a4 2814 nextchar(pRExC_state);
fac92740 2815 Set_Node_Length(ret, 2); /* MJD */
a0d0e21e
LW
2816 break;
2817 case 'W':
ffc61ed2 2818 ret = reg_node(pRExC_state, LOC ? NALNUML : NALNUM);
a0d0e21e 2819 *flagp |= HASWIDTH|SIMPLE;
830247a4 2820 nextchar(pRExC_state);
fac92740 2821 Set_Node_Length(ret, 2); /* MJD */
a0d0e21e
LW
2822 break;
2823 case 'b':
830247a4
IZ
2824 RExC_seen_zerolen++;
2825 RExC_seen |= REG_SEEN_LOOKBEHIND;
ffc61ed2 2826 ret = reg_node(pRExC_state, LOC ? BOUNDL : BOUND);
a0d0e21e 2827 *flagp |= SIMPLE;
830247a4 2828 nextchar(pRExC_state);
fac92740 2829 Set_Node_Length(ret, 2); /* MJD */
a0d0e21e
LW
2830 break;
2831 case 'B':
830247a4
IZ
2832 RExC_seen_zerolen++;
2833 RExC_seen |= REG_SEEN_LOOKBEHIND;
ffc61ed2 2834 ret = reg_node(pRExC_state, LOC ? NBOUNDL : NBOUND);
a0d0e21e 2835 *flagp |= SIMPLE;
830247a4 2836 nextchar(pRExC_state);
fac92740 2837 Set_Node_Length(ret, 2); /* MJD */
a0d0e21e
LW
2838 break;
2839 case 's':
ffc61ed2 2840 ret = reg_node(pRExC_state, LOC ? SPACEL : SPACE);
a0d0e21e 2841 *flagp |= HASWIDTH|SIMPLE;
830247a4 2842 nextchar(pRExC_state);
fac92740 2843 Set_Node_Length(ret, 2); /* MJD */
a0d0e21e
LW
2844 break;
2845 case 'S':
ffc61ed2 2846 ret = reg_node(pRExC_state, LOC ? NSPACEL : NSPACE);
a0d0e21e 2847 *flagp |= HASWIDTH|SIMPLE;
830247a4 2848 nextchar(pRExC_state);
fac92740 2849 Set_Node_Length(ret, 2); /* MJD */
a0d0e21e
LW
2850 break;
2851 case 'd':
ffc61ed2 2852 ret = reg_node(pRExC_state, DIGIT);
a0d0e21e 2853 *flagp |= HASWIDTH|SIMPLE;
830247a4 2854 nextchar(pRExC_state);
fac92740 2855 Set_Node_Length(ret, 2); /* MJD */
a0d0e21e
LW
2856 break;
2857 case 'D':
ffc61ed2 2858 ret = reg_node(pRExC_state, NDIGIT);
a0d0e21e 2859 *flagp |= HASWIDTH|SIMPLE;
830247a4 2860 nextchar(pRExC_state);
fac92740 2861 Set_Node_Length(ret, 2); /* MJD */
a0d0e21e 2862 break;
a14b48bc
LW
2863 case 'p':
2864 case 'P':
3568d838 2865 {
830247a4 2866 char* oldregxend = RExC_end;
fac92740 2867 char* parse_start = RExC_parse;
a14b48bc 2868
830247a4 2869 if (RExC_parse[1] == '{') {
3568d838 2870 /* a lovely hack--pretend we saw [\pX] instead */
830247a4
IZ
2871 RExC_end = strchr(RExC_parse, '}');
2872 if (!RExC_end) {
2873 RExC_parse += 2;
2874 RExC_end = oldregxend;
b45f050a
JF
2875 vFAIL("Missing right brace on \\p{}");
2876 }
830247a4 2877 RExC_end++;
a14b48bc
LW
2878 }
2879 else
830247a4
IZ
2880 RExC_end = RExC_parse + 2;
2881 RExC_parse--;
a14b48bc 2882
ffc61ed2 2883 ret = regclass(pRExC_state);
a14b48bc 2884
830247a4
IZ
2885 RExC_end = oldregxend;
2886 RExC_parse--;
fac92740 2887 Set_Node_Cur_Length(ret); /* MJD */
830247a4 2888 nextchar(pRExC_state);
a14b48bc
LW
2889 *flagp |= HASWIDTH|SIMPLE;
2890 }
2891 break;
a0d0e21e
LW
2892 case 'n':
2893 case 'r':
2894 case 't':
2895 case 'f':
2896 case 'e':
2897 case 'a':
2898 case 'x':
2899 case 'c':
2900 case '0':
2901 goto defchar;
2902 case '1': case '2': case '3': case '4':
2903 case '5': case '6': case '7': case '8': case '9':
2904 {
830247a4 2905 I32 num = atoi(RExC_parse);
a0d0e21e 2906
830247a4 2907 if (num > 9 && num >= RExC_npar)
a0d0e21e
LW
2908 goto defchar;
2909 else {
fac92740 2910 char * parse_start = RExC_parse - 1; /* MJD */
830247a4
IZ
2911 while (isDIGIT(*RExC_parse))
2912 RExC_parse++;
b45f050a 2913
830247a4 2914 if (!SIZE_ONLY && num > RExC_rx->nparens)
9baa0206 2915 vFAIL("Reference to nonexistent group");
830247a4
IZ
2916 RExC_sawback = 1;
2917 ret = reganode(pRExC_state, FOLD
a0ed51b3 2918 ? (LOC ? REFFL : REFF)
c8756f30 2919 : REF, num);
a0d0e21e 2920 *flagp |= HASWIDTH;
fac92740
MJD
2921
2922 /* override incorrect value set in reganode MJD */
2923 Set_Node_Offset(ret, parse_start+1);
2924 Set_Node_Cur_Length(ret); /* MJD */
830247a4
IZ
2925 RExC_parse--;
2926 nextchar(pRExC_state);
a0d0e21e
LW
2927 }
2928 }
2929 break;
2930 case '\0':
830247a4 2931 if (RExC_parse >= RExC_end)
b45f050a 2932 FAIL("Trailing \\");
a0d0e21e
LW
2933 /* FALL THROUGH */
2934 default:
c9f97d15
IZ
2935 /* Do not generate `unrecognized' warnings here, we fall
2936 back into the quick-grab loop below */
a0d0e21e
LW
2937 goto defchar;
2938 }
2939 break;
4633a7c4
LW
2940
2941 case '#':
830247a4
IZ
2942 if (RExC_flags16 & PMf_EXTENDED) {
2943 while (RExC_parse < RExC_end && *RExC_parse != '\n') RExC_parse++;
2944 if (RExC_parse < RExC_end)
4633a7c4
LW
2945 goto tryagain;
2946 }
2947 /* FALL THROUGH */
2948
a0d0e21e 2949 default: {
ba210ebe 2950 register STRLEN len;
a0ed51b3 2951 register UV ender;
a0d0e21e 2952 register char *p;
c277df42 2953 char *oldp, *s;
ba210ebe 2954 STRLEN numlen;
f06dbbb7
JH
2955
2956 parse_start = RExC_parse - 1;
a0d0e21e 2957
830247a4 2958 RExC_parse++;
a0d0e21e
LW
2959
2960 defchar:
830247a4 2961 ret = reg_node(pRExC_state, FOLD
a0ed51b3 2962 ? (LOC ? EXACTFL : EXACTF)
bbce6d69 2963 : EXACT);
cd439c50 2964 s = STRING(ret);
830247a4
IZ
2965 for (len = 0, p = RExC_parse - 1;
2966 len < 127 && p < RExC_end;
a0d0e21e
LW
2967 len++)
2968 {
2969 oldp = p;
5b5a24f7 2970
830247a4
IZ
2971 if (RExC_flags16 & PMf_EXTENDED)
2972 p = regwhite(p, RExC_end);
a0d0e21e
LW
2973 switch (*p) {
2974 case '^':
2975 case '$':
2976 case '.':
2977 case '[':
2978 case '(':
2979 case ')':
2980 case '|':
2981 goto loopdone;
2982 case '\\':
2983 switch (*++p) {
2984 case 'A':
2985 case 'G':
2986 case 'Z':
b85d18e9 2987 case 'z':
a0d0e21e
LW
2988 case 'w':
2989 case 'W':
2990 case 'b':
2991 case 'B':
2992 case 's':
2993 case 'S':
2994 case 'd':
2995 case 'D':
a14b48bc
LW
2996 case 'p':
2997 case 'P':
a0d0e21e
LW
2998 --p;
2999 goto loopdone;
3000 case 'n':
3001 ender = '\n';
3002 p++;
a687059c 3003 break;
a0d0e21e
LW
3004 case 'r':
3005 ender = '\r';
3006 p++;
a687059c 3007 break;
a0d0e21e
LW
3008 case 't':
3009 ender = '\t';
3010 p++;
a687059c 3011 break;
a0d0e21e
LW
3012 case 'f':
3013 ender = '\f';
3014 p++;
a687059c 3015 break;
a0d0e21e 3016 case 'e':
c7f1f016 3017 ender = ASCII_TO_NATIVE('\033');
a0d0e21e 3018 p++;
a687059c 3019 break;
a0d0e21e 3020 case 'a':
c7f1f016 3021 ender = ASCII_TO_NATIVE('\007');
a0d0e21e 3022 p++;
a687059c 3023 break;
a0d0e21e 3024 case 'x':
a0ed51b3
LW
3025 if (*++p == '{') {
3026 char* e = strchr(p, '}');
b81d288d 3027
b45f050a 3028 if (!e) {
830247a4 3029 RExC_parse = p + 1;
b45f050a
JF
3030 vFAIL("Missing right brace on \\x{}");
3031 }
de5f0749 3032 else {
b21ed0a9 3033 numlen = 1; /* allow underscores */
de35ba6f 3034 ender = (UV)scan_hex(p + 1, e - p - 1, &numlen);
aaa80028
JH
3035 if (ender > 0xff)
3036 RExC_utf8 = 1;
b21ed0a9
GS
3037 /* numlen is generous */
3038 if (numlen + len >= 127) {
a0ed51b3
LW
3039 p--;
3040 goto loopdone;
3041 }
3042 p = e + 1;
3043 }
a0ed51b3
LW
3044 }
3045 else {
b21ed0a9 3046 numlen = 0; /* disallow underscores */
dff6d3cd 3047 ender = (UV)scan_hex(p, 2, &numlen);
a0ed51b3
LW
3048 p += numlen;
3049 }
a687059c 3050 break;
a0d0e21e
LW
3051 case 'c':
3052 p++;
bbce6d69 3053 ender = UCHARAT(p++);
3054 ender = toCTRL(ender);
a687059c 3055 break;
a0d0e21e
LW
3056 case '0': case '1': case '2': case '3':case '4':
3057 case '5': case '6': case '7': case '8':case '9':
3058 if (*p == '0' ||
830247a4 3059 (isDIGIT(p[1]) && atoi(p) >= RExC_npar) ) {
b21ed0a9 3060 numlen = 0; /* disallow underscores */
dff6d3cd 3061 ender = (UV)scan_oct(p, 3, &numlen);
a0d0e21e
LW
3062 p += numlen;
3063 }
3064 else {
3065 --p;
3066 goto loopdone;
a687059c
LW
3067 }
3068 break;
a0d0e21e 3069 case '\0':
830247a4 3070 if (p >= RExC_end)
b45f050a 3071 FAIL("Trailing \\");
a687059c 3072 /* FALL THROUGH */
a0d0e21e 3073 default:
e476b1b5 3074 if (!SIZE_ONLY && ckWARN(WARN_REGEXP) && isALPHA(*p))
b45f050a 3075 vWARN2(p +1, "Unrecognized escape \\%c passed through", *p);
a0ed51b3 3076 goto normal_default;
a0d0e21e
LW
3077 }
3078 break;
a687059c 3079 default:
a0ed51b3 3080 normal_default:
fd400ab9 3081 if (UTF8_IS_START(*p) && UTF) {
5e12f4fb 3082 ender = utf8n_to_uvchr((U8*)p, RExC_end - p,
ba210ebe 3083 &numlen, 0);
a0ed51b3
LW
3084 p += numlen;
3085 }
3086 else
3087 ender = *p++;
a0d0e21e 3088 break;
a687059c 3089 }
830247a4
IZ
3090 if (RExC_flags16 & PMf_EXTENDED)
3091 p = regwhite(p, RExC_end);
a0ed51b3
LW
3092 if (UTF && FOLD) {
3093 if (LOC)
5e12f4fb 3094 ender = toLOWER_LC_uvchr(ender);
a0ed51b3
LW
3095 else
3096 ender = toLOWER_uni(ender);
3097 }
a0d0e21e
LW
3098 if (ISMULT2(p)) { /* Back off on ?+*. */
3099 if (len)
3100 p = oldp;
2b9d42f0 3101 else if (!UNI_IS_INVARIANT(NATIVE_TO_UNI(ender)) && UTF) {
830247a4 3102 reguni(pRExC_state, ender, s, &numlen);
a0ed51b3
LW
3103 s += numlen;
3104 len += numlen;
3105 }
a0d0e21e
LW
3106 else {
3107 len++;
cd439c50 3108 REGC(ender, s++);
a0d0e21e
LW
3109 }
3110 break;
a687059c 3111 }
2b9d42f0 3112 if (!UNI_IS_INVARIANT(NATIVE_TO_UNI(ender)) && UTF) {
830247a4 3113 reguni(pRExC_state, ender, s, &numlen);
a0ed51b3
LW
3114 s += numlen;
3115 len += numlen - 1;
3116 }
3117 else
cd439c50 3118 REGC(ender, s++);
a0d0e21e
LW
3119 }
3120 loopdone:
830247a4 3121 RExC_parse = p - 1;
fac92740 3122 Set_Node_Cur_Length(ret); /* MJD */
830247a4 3123 nextchar(pRExC_state);
793db0cb
JH
3124 {
3125 /* len is STRLEN which is unsigned, need to copy to signed */
3126 IV iv = len;
3127 if (iv < 0)
3128 vFAIL("Internal disaster");
3129 }
a0d0e21e
LW
3130 if (len > 0)
3131 *flagp |= HASWIDTH;
3132 if (len == 1)
3133 *flagp |= SIMPLE;
c277df42 3134 if (!SIZE_ONLY)
cd439c50
IZ
3135 STR_LEN(ret) = len;
3136 if (SIZE_ONLY)
830247a4 3137 RExC_size += STR_SZ(len);
cd439c50 3138 else
830247a4 3139 RExC_emit += STR_SZ(len);
a687059c 3140 }
a0d0e21e
LW
3141 break;
3142 }
a687059c 3143
a0d0e21e 3144 return(ret);
a687059c
LW
3145}
3146
873ef191 3147STATIC char *
cea2e8a9 3148S_regwhite(pTHX_ char *p, char *e)
5b5a24f7
CS
3149{
3150 while (p < e) {
3151 if (isSPACE(*p))
3152 ++p;
3153 else if (*p == '#') {
3154 do {
3155 p++;
3156 } while (p < e && *p != '\n');
3157 }
3158 else
3159 break;
3160 }
3161 return p;
3162}
3163
b8c5462f
JH
3164/* Parse POSIX character classes: [[:foo:]], [[=foo=]], [[.foo.]].
3165 Character classes ([:foo:]) can also be negated ([:^foo:]).
3166 Returns a named class id (ANYOF_XXX) if successful, -1 otherwise.
3167 Equivalence classes ([=foo=]) and composites ([.foo.]) are parsed,
3168 but trigger warnings because they are currently unimplemented. */
3169STATIC I32
830247a4 3170S_regpposixcc(pTHX_ RExC_state_t *pRExC_state, I32 value)
620e46c5
JH
3171{
3172 char *posixcc = 0;
936ed897 3173 I32 namedclass = OOB_NAMEDCLASS;
620e46c5 3174
830247a4 3175 if (value == '[' && RExC_parse + 1 < RExC_end &&
620e46c5 3176 /* I smell either [: or [= or [. -- POSIX has been here, right? */
830247a4
IZ
3177 (*RExC_parse == ':' ||
3178 *RExC_parse == '=' ||
3179 *RExC_parse == '.')) {
3180 char c = *RExC_parse;
3181 char* s = RExC_parse++;
b81d288d 3182
830247a4
IZ
3183 while (RExC_parse < RExC_end && *RExC_parse != c)
3184 RExC_parse++;
3185 if (RExC_parse == RExC_end)
620e46c5 3186 /* Grandfather lone [:, [=, [. */
830247a4 3187 RExC_parse = s;
620e46c5 3188 else {
830247a4 3189 char* t = RExC_parse++; /* skip over the c */
b8c5462f 3190
830247a4
IZ
3191 if (*RExC_parse == ']') {
3192 RExC_parse++; /* skip over the ending ] */
b8c5462f
JH
3193 posixcc = s + 1;
3194 if (*s == ':') {
3195 I32 complement = *posixcc == '^' ? *posixcc++ : 0;
3196 I32 skip = 5; /* the most common skip */
3197
3198 switch (*posixcc) {
3199 case 'a':
3200 if (strnEQ(posixcc, "alnum", 5))
3201 namedclass =
3202 complement ? ANYOF_NALNUMC : ANYOF_ALNUMC;
3203 else if (strnEQ(posixcc, "alpha", 5))
3204 namedclass =
3205 complement ? ANYOF_NALPHA : ANYOF_ALPHA;
3206 else if (strnEQ(posixcc, "ascii", 5))
3207 namedclass =
3208 complement ? ANYOF_NASCII : ANYOF_ASCII;
3209 break;
aaa51d5e
JF
3210 case 'b':
3211 if (strnEQ(posixcc, "blank", 5))
3212 namedclass =
3213 complement ? ANYOF_NBLANK : ANYOF_BLANK;
3214 break;
b8c5462f
JH
3215 case 'c':
3216 if (strnEQ(posixcc, "cntrl", 5))
3217 namedclass =
3218 complement ? ANYOF_NCNTRL : ANYOF_CNTRL;
3219 break;
3220 case 'd':
3221 if (strnEQ(posixcc, "digit", 5))
3222 namedclass =
3223 complement ? ANYOF_NDIGIT : ANYOF_DIGIT;
3224 break;
3225 case 'g':
3226 if (strnEQ(posixcc, "graph", 5))
3227 namedclass =
3228 complement ? ANYOF_NGRAPH : ANYOF_GRAPH;
3229 break;
3230 case 'l':
3231 if (strnEQ(posixcc, "lower", 5))
3232 namedclass =
3233 complement ? ANYOF_NLOWER : ANYOF_LOWER;
3234 break;
3235 case 'p':
3236 if (strnEQ(posixcc, "print", 5))
3237 namedclass =
3238 complement ? ANYOF_NPRINT : ANYOF_PRINT;
3239 else if (strnEQ(posixcc, "punct", 5))
3240 namedclass =
3241 complement ? ANYOF_NPUNCT : ANYOF_PUNCT;
3242 break;
3243 case 's':
3244 if (strnEQ(posixcc, "space", 5))
3245 namedclass =
aaa51d5e 3246 complement ? ANYOF_NPSXSPC : ANYOF_PSXSPC;
cc4319de 3247 break;
b8c5462f
JH
3248 case 'u':
3249 if (strnEQ(posixcc, "upper", 5))
3250 namedclass =
3251 complement ? ANYOF_NUPPER : ANYOF_UPPER;
3252 break;
3253 case 'w': /* this is not POSIX, this is the Perl \w */
3254 if (strnEQ(posixcc, "word", 4)) {
3255 namedclass =
3256 complement ? ANYOF_NALNUM : ANYOF_ALNUM;
3257 skip = 4;
3258 }
3259 break;
3260 case 'x':
3261 if (strnEQ(posixcc, "xdigit", 6)) {
3262 namedclass =
3263 complement ? ANYOF_NXDIGIT : ANYOF_XDIGIT;
3264 skip = 6;
3265 }
3266 break;
3267 }
ac561586
JH
3268 if (namedclass == OOB_NAMEDCLASS ||
3269 posixcc[skip] != ':' ||
3270 posixcc[skip+1] != ']')
b45f050a
JF
3271 {
3272 Simple_vFAIL3("POSIX class [:%.*s:] unknown",
3273 t - s - 1, s + 1);
3274 }
3275 } else if (!SIZE_ONLY) {
b8c5462f 3276 /* [[=foo=]] and [[.foo.]] are still future. */
b45f050a 3277
830247a4 3278 /* adjust RExC_parse so the warning shows after
b45f050a 3279 the class closes */
830247a4
IZ
3280 while (*RExC_parse && *RExC_parse != ']')
3281 RExC_parse++;
b45f050a
JF
3282 Simple_vFAIL3("POSIX syntax [%c %c] is reserved for future extensions", c, c);
3283 }
b8c5462f
JH
3284 } else {
3285 /* Maternal grandfather:
3286 * "[:" ending in ":" but not in ":]" */
830247a4 3287 RExC_parse = s;
767d463e 3288 }
620e46c5
JH
3289 }
3290 }
3291
b8c5462f
JH
3292 return namedclass;
3293}
3294
3295STATIC void
830247a4 3296S_checkposixcc(pTHX_ RExC_state_t *pRExC_state)
b8c5462f 3297{
e476b1b5 3298 if (!SIZE_ONLY && ckWARN(WARN_REGEXP) &&
830247a4
IZ
3299 (*RExC_parse == ':' ||
3300 *RExC_parse == '=' ||
3301 *RExC_parse == '.')) {
3302 char *s = RExC_parse;
93733859 3303 char c = *s++;
b8c5462f
JH
3304
3305 while(*s && isALNUM(*s))
3306 s++;
3307 if (*s && c == *s && s[1] == ']') {
b45f050a
JF
3308 vWARN3(s+2, "POSIX syntax [%c %c] belongs inside character classes", c, c);
3309
3310 /* [[=foo=]] and [[.foo.]] are still future. */
b8c5462f 3311 if (c == '=' || c == '.')
b45f050a 3312 {
830247a4 3313 /* adjust RExC_parse so the error shows after
b45f050a 3314 the class closes */
830247a4 3315 while (*RExC_parse && *RExC_parse++ != ']')
b45f050a
JF
3316 ;
3317 Simple_vFAIL3("POSIX syntax [%c %c] is reserved for future extensions", c, c);
3318 }
b8c5462f
JH
3319 }
3320 }
620e46c5
JH
3321}
3322
76e3520e 3323STATIC regnode *
830247a4 3324S_regclass(pTHX_ RExC_state_t *pRExC_state)
a687059c 3325{
ffc61ed2 3326 register UV value;
3568d838 3327 register IV prevvalue = OOB_UNICODE;
ffc61ed2 3328 register IV range = 0;
c277df42 3329 register regnode *ret;
ba210ebe 3330 STRLEN numlen;
ffc61ed2 3331 IV namedclass;
9c5ffd7c 3332 char *rangebegin = 0;
936ed897 3333 bool need_class = 0;
9c5ffd7c 3334 SV *listsv = Nullsv;
ffc61ed2
JH
3335 register char *e;
3336 UV n;
3568d838 3337 bool optimize_invert = TRUE;
ffc61ed2
JH
3338
3339 ret = reganode(pRExC_state, ANYOF, 0);
3340
3341 if (!SIZE_ONLY)
3342 ANYOF_FLAGS(ret) = 0;
3343
3344 if (*RExC_parse == '^') { /* Complement of range. */
3345 RExC_naughty++;
3346 RExC_parse++;
3347 if (!SIZE_ONLY)
3348 ANYOF_FLAGS(ret) |= ANYOF_INVERT;
3349 }
a0d0e21e 3350
936ed897 3351 if (SIZE_ONLY)
830247a4 3352 RExC_size += ANYOF_SKIP;
936ed897 3353 else {
830247a4 3354 RExC_emit += ANYOF_SKIP;
936ed897
IZ
3355 if (FOLD)
3356 ANYOF_FLAGS(ret) |= ANYOF_FOLD;
3357 if (LOC)
3358 ANYOF_FLAGS(ret) |= ANYOF_LOCALE;
ffc61ed2
JH
3359 ANYOF_BITMAP_ZERO(ret);
3360 listsv = newSVpvn("# comment\n", 10);
a0d0e21e 3361 }
b8c5462f 3362
e476b1b5 3363 if (!SIZE_ONLY && ckWARN(WARN_REGEXP))
830247a4 3364 checkposixcc(pRExC_state);
b8c5462f 3365
830247a4 3366 if (*RExC_parse == ']' || *RExC_parse == '-')
ffc61ed2
JH
3367 goto charclassloop; /* allow 1st char to be ] or - */
3368
830247a4 3369 while (RExC_parse < RExC_end && *RExC_parse != ']') {
ffc61ed2
JH
3370
3371 charclassloop:
3372
3373 namedclass = OOB_NAMEDCLASS; /* initialize as illegal */
3374
73b437c8 3375 if (!range)
830247a4 3376 rangebegin = RExC_parse;
ffc61ed2 3377 if (UTF) {
5e12f4fb 3378 value = utf8n_to_uvchr((U8*)RExC_parse,
3568d838
JH
3379 RExC_end - RExC_parse,
3380 &numlen, 0);
ffc61ed2
JH
3381 RExC_parse += numlen;
3382 }
3383 else
3384 value = UCHARAT(RExC_parse++);
620e46c5 3385 if (value == '[')
830247a4 3386 namedclass = regpposixcc(pRExC_state, value);
620e46c5 3387 else if (value == '\\') {
ffc61ed2 3388 if (UTF) {
5e12f4fb 3389 value = utf8n_to_uvchr((U8*)RExC_parse,
ffc61ed2
JH
3390 RExC_end - RExC_parse,
3391 &numlen, 0);
3392 RExC_parse += numlen;
3393 }
3394 else
3395 value = UCHARAT(RExC_parse++);
470c3474 3396 /* Some compilers cannot handle switching on 64-bit integer
ffc61ed2 3397 * values, therefore value cannot be an UV. Yes, this will
e2962f66
JH
3398 * be a problem later if we want switch on Unicode.
3399 * A similar issue a little bit later when switching on
3400 * namedclass. --jhi */
ffc61ed2 3401 switch ((I32)value) {
b8c5462f
JH
3402 case 'w': namedclass = ANYOF_ALNUM; break;
3403 case 'W': namedclass = ANYOF_NALNUM; break;
3404 case 's': namedclass = ANYOF_SPACE; break;
3405 case 'S': namedclass = ANYOF_NSPACE; break;
3406 case 'd': namedclass = ANYOF_DIGIT; break;
3407 case 'D': namedclass = ANYOF_NDIGIT; break;
ffc61ed2
JH
3408 case 'p':
3409 case 'P':
3410 if (*RExC_parse == '{') {
3411 e = strchr(RExC_parse++, '}');
3412 if (!e)
3413 vFAIL("Missing right brace on \\p{}");
3414 n = e - RExC_parse;
3415 }
3416 else {
3417 e = RExC_parse;
3418 n = 1;
3419 }
3420 if (!SIZE_ONLY) {
3421 if (value == 'p')
3422 Perl_sv_catpvf(aTHX_ listsv,
3423 "+utf8::%.*s\n", (int)n, RExC_parse);
3424 else
3425 Perl_sv_catpvf(aTHX_ listsv,
3426 "!utf8::%.*s\n", (int)n, RExC_parse);
3427 }
3428 RExC_parse = e + 1;
3429 ANYOF_FLAGS(ret) |= ANYOF_UNICODE;
3430 continue;
b8c5462f
JH
3431 case 'n': value = '\n'; break;
3432 case 'r': value = '\r'; break;
3433 case 't': value = '\t'; break;
3434 case 'f': value = '\f'; break;
3435 case 'b': value = '\b'; break;
c7f1f016
NIS
3436 case 'e': value = ASCII_TO_NATIVE('\033');break;
3437 case 'a': value = ASCII_TO_NATIVE('\007');break;
b8c5462f 3438 case 'x':
ffc61ed2
JH
3439 if (*RExC_parse == '{') {
3440 e = strchr(RExC_parse++, '}');
b81d288d 3441 if (!e)
ffc61ed2
JH
3442 vFAIL("Missing right brace on \\x{}");
3443 numlen = 1; /* allow underscores */
3444 value = (UV)scan_hex(RExC_parse,
3445 e - RExC_parse,
3446 &numlen);
3447 RExC_parse = e + 1;
3448 }
3449 else {
3450 numlen = 0; /* disallow underscores */
3451 value = (UV)scan_hex(RExC_parse, 2, &numlen);
3452 RExC_parse += numlen;
3453 }
b8c5462f
JH
3454 break;
3455 case 'c':
830247a4 3456 value = UCHARAT(RExC_parse++);
b8c5462f
JH
3457 value = toCTRL(value);
3458 break;
3459 case '0': case '1': case '2': case '3': case '4':
3460 case '5': case '6': case '7': case '8': case '9':
b21ed0a9 3461 numlen = 0; /* disallow underscores */
830247a4
IZ
3462 value = (UV)scan_oct(--RExC_parse, 3, &numlen);
3463 RExC_parse += numlen;
b8c5462f 3464 break;
1028017a 3465 default:
e476b1b5 3466 if (!SIZE_ONLY && ckWARN(WARN_REGEXP) && isALPHA(value))
ffc61ed2
JH
3467 vWARN2(RExC_parse,
3468 "Unrecognized escape \\%c in character class passed through",
3469 (int)value);
1028017a 3470 break;
b8c5462f 3471 }
ffc61ed2
JH
3472 } /* end of \blah */
3473
3474 if (namedclass > OOB_NAMEDCLASS) { /* this is a named class \blah */
3475
3476 if (!SIZE_ONLY && !need_class)
936ed897 3477 ANYOF_CLASS_ZERO(ret);
ffc61ed2 3478
936ed897 3479 need_class = 1;
ffc61ed2
JH
3480
3481 /* a bad range like a-\d, a-[:digit:] ? */
3482 if (range) {
73b437c8 3483 if (!SIZE_ONLY) {
e476b1b5 3484 if (ckWARN(WARN_REGEXP))
830247a4 3485 vWARN4(RExC_parse,
b45f050a 3486 "False [] range \"%*.*s\"",
830247a4
IZ
3487 RExC_parse - rangebegin,
3488 RExC_parse - rangebegin,
b45f050a 3489 rangebegin);
3568d838
JH
3490 if (prevvalue < 256) {
3491 ANYOF_BITMAP_SET(ret, prevvalue);
ffc61ed2
JH
3492 ANYOF_BITMAP_SET(ret, '-');
3493 }
3494 else {
3495 ANYOF_FLAGS(ret) |= ANYOF_UNICODE;
3496 Perl_sv_catpvf(aTHX_ listsv,
3568d838 3497 "%04"UVxf"\n%04"UVxf"\n", (UV)prevvalue, (UV) '-');
ffc61ed2 3498 }
b8c5462f 3499 }
ffc61ed2
JH
3500
3501 range = 0; /* this was not a true range */
73b437c8 3502 }
ffc61ed2 3503
73b437c8 3504 if (!SIZE_ONLY) {
3568d838
JH
3505 if (namedclass > OOB_NAMEDCLASS)
3506 optimize_invert = FALSE;
e2962f66
JH
3507 /* Possible truncation here but in some 64-bit environments
3508 * the compiler gets heartburn about switch on 64-bit values.
3509 * A similar issue a little earlier when switching on value.
98f323fa 3510 * --jhi */
e2962f66 3511 switch ((I32)namedclass) {
73b437c8
JH
3512 case ANYOF_ALNUM:
3513 if (LOC)
936ed897 3514 ANYOF_CLASS_SET(ret, ANYOF_ALNUM);
73b437c8
JH
3515 else {
3516 for (value = 0; value < 256; value++)
3517 if (isALNUM(value))
936ed897 3518 ANYOF_BITMAP_SET(ret, value);
73b437c8 3519 }
ffc61ed2 3520 Perl_sv_catpvf(aTHX_ listsv, "+utf8::IsWord\n");
73b437c8
JH
3521 break;
3522 case ANYOF_NALNUM:
3523 if (LOC)
936ed897 3524 ANYOF_CLASS_SET(ret, ANYOF_NALNUM);
73b437c8
JH
3525 else {
3526 for (value = 0; value < 256; value++)
3527 if (!isALNUM(value))
936ed897 3528 ANYOF_BITMAP_SET(ret, value);
73b437c8 3529 }
ffc61ed2 3530 Perl_sv_catpvf(aTHX_ listsv, "!utf8::IsWord\n");
73b437c8 3531 break;
ffc61ed2 3532 case ANYOF_ALNUMC:
73b437c8 3533 if (LOC)
ffc61ed2 3534 ANYOF_CLASS_SET(ret, ANYOF_ALNUMC);
73b437c8
JH
3535 else {
3536 for (value = 0; value < 256; value++)
ffc61ed2 3537 if (isALNUMC(value))
936ed897 3538 ANYOF_BITMAP_SET(ret, value);
73b437c8 3539 }
ffc61ed2 3540 Perl_sv_catpvf(aTHX_ listsv, "+utf8::IsAlnum\n");
73b437c8
JH
3541 break;
3542 case ANYOF_NALNUMC:
3543 if (LOC)
936ed897 3544 ANYOF_CLASS_SET(ret, ANYOF_NALNUMC);
73b437c8
JH
3545 else {
3546 for (value = 0; value < 256; value++)
3547 if (!isALNUMC(value))
936ed897 3548 ANYOF_BITMAP_SET(ret, value);
73b437c8 3549 }
ffc61ed2 3550 Perl_sv_catpvf(aTHX_ listsv, "!utf8::IsAlnum\n");
73b437c8
JH
3551 break;
3552 case ANYOF_ALPHA:
3553 if (LOC)
936ed897 3554 ANYOF_CLASS_SET(ret, ANYOF_ALPHA);
73b437c8
JH
3555 else {
3556 for (value = 0; value < 256; value++)
3557 if (isALPHA(value))
936ed897 3558 ANYOF_BITMAP_SET(ret, value);
73b437c8 3559 }
ffc61ed2 3560 Perl_sv_catpvf(aTHX_ listsv, "+utf8::IsAlpha\n");
73b437c8
JH
3561 break;
3562 case ANYOF_NALPHA:
3563 if (LOC)
936ed897 3564 ANYOF_CLASS_SET(ret, ANYOF_NALPHA);
73b437c8
JH
3565 else {
3566 for (value = 0; value < 256; value++)
3567 if (!isALPHA(value))
936ed897 3568 ANYOF_BITMAP_SET(ret, value);
73b437c8 3569 }
ffc61ed2 3570 Perl_sv_catpvf(aTHX_ listsv, "!utf8::IsAlpha\n");
73b437c8
JH
3571 break;
3572 case ANYOF_ASCII:
3573 if (LOC)
936ed897 3574 ANYOF_CLASS_SET(ret, ANYOF_ASCII);
73b437c8 3575 else {
c7f1f016 3576#ifndef EBCDIC
1ba5c669
JH
3577 for (value = 0; value < 128; value++)
3578 ANYOF_BITMAP_SET(ret, value);
3579#else /* EBCDIC */
ffbc6a93 3580 for (value = 0; value < 256; value++) {
3a3c4447
JH
3581 if (isASCII(value))
3582 ANYOF_BITMAP_SET(ret, value);
ffbc6a93 3583 }
1ba5c669 3584#endif /* EBCDIC */
73b437c8 3585 }
ffc61ed2 3586 Perl_sv_catpvf(aTHX_ listsv, "+utf8::IsASCII\n");
73b437c8
JH
3587 break;
3588 case ANYOF_NASCII:
3589 if (LOC)
936ed897 3590 ANYOF_CLASS_SET(ret, ANYOF_NASCII);
73b437c8 3591 else {
c7f1f016 3592#ifndef EBCDIC
1ba5c669
JH
3593 for (value = 128; value < 256; value++)
3594 ANYOF_BITMAP_SET(ret, value);
3595#else /* EBCDIC */
ffbc6a93 3596 for (value = 0; value < 256; value++) {
3a3c4447
JH
3597 if (!isASCII(value))
3598 ANYOF_BITMAP_SET(ret, value);
ffbc6a93 3599 }
1ba5c669 3600#endif /* EBCDIC */
73b437c8 3601 }
ffc61ed2 3602 Perl_sv_catpvf(aTHX_ listsv, "!utf8::IsASCII\n");
73b437c8 3603 break;
aaa51d5e
JF
3604 case ANYOF_BLANK:
3605 if (LOC)
3606 ANYOF_CLASS_SET(ret, ANYOF_BLANK);
3607 else {
3608 for (value = 0; value < 256; value++)
3609 if (isBLANK(value))
3610 ANYOF_BITMAP_SET(ret, value);
3611 }
ffc61ed2 3612 Perl_sv_catpvf(aTHX_ listsv, "+utf8::IsBlank\n");
aaa51d5e
JF
3613 break;
3614 case ANYOF_NBLANK:
3615 if (LOC)
3616 ANYOF_CLASS_SET(ret, ANYOF_NBLANK);
3617 else {
3618 for (value = 0; value < 256; value++)
3619 if (!isBLANK(value))
3620 ANYOF_BITMAP_SET(ret, value);
3621 }
ffc61ed2 3622 Perl_sv_catpvf(aTHX_ listsv, "!utf8::IsBlank\n");
aaa51d5e 3623 break;
73b437c8
JH
3624 case ANYOF_CNTRL:
3625 if (LOC)
936ed897 3626 ANYOF_CLASS_SET(ret, ANYOF_CNTRL);
73b437c8
JH
3627 else {
3628 for (value = 0; value < 256; value++)
3629 if (isCNTRL(value))
936ed897 3630 ANYOF_BITMAP_SET(ret, value);
73b437c8 3631 }
ffc61ed2 3632 Perl_sv_catpvf(aTHX_ listsv, "+utf8::IsCntrl\n");
73b437c8
JH
3633 break;
3634 case ANYOF_NCNTRL:
3635 if (LOC)
936ed897 3636 ANYOF_CLASS_SET(ret, ANYOF_NCNTRL);
73b437c8
JH
3637 else {
3638 for (value = 0; value < 256; value++)
3639 if (!isCNTRL(value))
936ed897 3640 ANYOF_BITMAP_SET(ret, value);
73b437c8 3641 }
ffc61ed2
JH
3642 Perl_sv_catpvf(aTHX_ listsv, "!utf8::IsCntrl\n");
3643 break;
3644 case ANYOF_DIGIT:
3645 if (LOC)
3646 ANYOF_CLASS_SET(ret, ANYOF_DIGIT);
3647 else {
3648 /* consecutive digits assumed */
3649 for (value = '0'; value <= '9'; value++)
3650 ANYOF_BITMAP_SET(ret, value);
3651 }
3652 Perl_sv_catpvf(aTHX_ listsv, "+utf8::IsDigit\n");
3653 break;
3654 case ANYOF_NDIGIT:
3655 if (LOC)
3656 ANYOF_CLASS_SET(ret, ANYOF_NDIGIT);
3657 else {
3658 /* consecutive digits assumed */
3659 for (value = 0; value < '0'; value++)
3660 ANYOF_BITMAP_SET(ret, value);
3661 for (value = '9' + 1; value < 256; value++)
3662 ANYOF_BITMAP_SET(ret, value);
3663 }
3664 Perl_sv_catpvf(aTHX_ listsv, "!utf8::IsDigit\n");
73b437c8
JH
3665 break;
3666 case ANYOF_GRAPH:
3667 if (LOC)
936ed897 3668 ANYOF_CLASS_SET(ret, ANYOF_GRAPH);
73b437c8
JH
3669 else {
3670 for (value = 0; value < 256; value++)
3671 if (isGRAPH(value))
936ed897 3672 ANYOF_BITMAP_SET(ret, value);
73b437c8 3673 }
ffc61ed2 3674 Perl_sv_catpvf(aTHX_ listsv, "+utf8::IsGraph\n");
73b437c8
JH
3675 break;
3676 case ANYOF_NGRAPH:
3677 if (LOC)
936ed897 3678 ANYOF_CLASS_SET(ret, ANYOF_NGRAPH);
73b437c8
JH
3679 else {
3680 for (value = 0; value < 256; value++)
3681 if (!isGRAPH(value))
936ed897 3682 ANYOF_BITMAP_SET(ret, value);
73b437c8 3683 }
ffc61ed2 3684 Perl_sv_catpvf(aTHX_ listsv, "!utf8::IsGraph\n");
73b437c8
JH
3685 break;
3686 case ANYOF_LOWER:
3687 if (LOC)
936ed897 3688 ANYOF_CLASS_SET(ret, ANYOF_LOWER);
73b437c8
JH
3689 else {
3690 for (value = 0; value < 256; value++)
3691 if (isLOWER(value))
936ed897 3692 ANYOF_BITMAP_SET(ret, value);
73b437c8 3693 }
ffc61ed2 3694 Perl_sv_catpvf(aTHX_ listsv, "+utf8::IsLower\n");
73b437c8
JH
3695 break;
3696 case ANYOF_NLOWER:
3697 if (LOC)
936ed897 3698 ANYOF_CLASS_SET(ret, ANYOF_NLOWER);
73b437c8
JH
3699 else {
3700 for (value = 0; value < 256; value++)
3701 if (!isLOWER(value))
936ed897 3702 ANYOF_BITMAP_SET(ret, value);
73b437c8 3703 }
ffc61ed2 3704 Perl_sv_catpvf(aTHX_ listsv, "!utf8::IsLower\n");
73b437c8
JH
3705 break;
3706 case ANYOF_PRINT:
3707 if (LOC)
936ed897 3708 ANYOF_CLASS_SET(ret, ANYOF_PRINT);
73b437c8
JH
3709 else {
3710 for (value = 0; value < 256; value++)
3711 if (isPRINT(value))
936ed897 3712 ANYOF_BITMAP_SET(ret, value);
73b437c8 3713 }
ffc61ed2 3714 Perl_sv_catpvf(aTHX_ listsv, "+utf8::IsPrint\n");
73b437c8
JH
3715 break;
3716 case ANYOF_NPRINT:
3717 if (LOC)
936ed897 3718 ANYOF_CLASS_SET(ret, ANYOF_NPRINT);
73b437c8
JH
3719 else {
3720 for (value = 0; value < 256; value++)
3721 if (!isPRINT(value))
936ed897 3722 ANYOF_BITMAP_SET(ret, value);
73b437c8 3723 }
ffc61ed2 3724 Perl_sv_catpvf(aTHX_ listsv, "!utf8::IsPrint\n");
73b437c8 3725 break;
aaa51d5e
JF
3726 case ANYOF_PSXSPC:
3727 if (LOC)
3728 ANYOF_CLASS_SET(ret, ANYOF_PSXSPC);
3729 else {
3730 for (value = 0; value < 256; value++)
3731 if (isPSXSPC(value))
3732 ANYOF_BITMAP_SET(ret, value);
3733 }
ffc61ed2 3734 Perl_sv_catpvf(aTHX_ listsv, "+utf8::IsSpace\n");
aaa51d5e
JF
3735 break;
3736 case ANYOF_NPSXSPC:
3737 if (LOC)
3738 ANYOF_CLASS_SET(ret, ANYOF_NPSXSPC);
3739 else {
3740 for (value = 0; value < 256; value++)
3741 if (!isPSXSPC(value))
3742 ANYOF_BITMAP_SET(ret, value);
3743 }
ffc61ed2 3744 Perl_sv_catpvf(aTHX_ listsv, "!utf8::IsSpace\n");
aaa51d5e 3745 break;
73b437c8
JH
3746 case ANYOF_PUNCT:
3747 if (LOC)
936ed897 3748 ANYOF_CLASS_SET(ret, ANYOF_PUNCT);
73b437c8
JH
3749 else {
3750 for (value = 0; value < 256; value++)
3751 if (isPUNCT(value))
936ed897 3752 ANYOF_BITMAP_SET(ret, value);
73b437c8 3753 }
ffc61ed2 3754 Perl_sv_catpvf(aTHX_ listsv, "+utf8::IsPunct\n");
73b437c8
JH
3755 break;
3756 case ANYOF_NPUNCT:
3757 if (LOC)
936ed897 3758 ANYOF_CLASS_SET(ret, ANYOF_NPUNCT);
73b437c8
JH
3759 else {
3760 for (value = 0; value < 256; value++)
3761 if (!isPUNCT(value))
936ed897 3762 ANYOF_BITMAP_SET(ret, value);
73b437c8 3763 }
ffc61ed2
JH
3764 Perl_sv_catpvf(aTHX_ listsv, "!utf8::IsPunct\n");
3765 break;
3766 case ANYOF_SPACE:
3767 if (LOC)
3768 ANYOF_CLASS_SET(ret, ANYOF_SPACE);
3769 else {
3770 for (value = 0; value < 256; value++)
3771 if (isSPACE(value))
3772 ANYOF_BITMAP_SET(ret, value);
3773 }
3774 Perl_sv_catpvf(aTHX_ listsv, "+utf8::IsSpacePerl\n");
3775 break;
3776 case ANYOF_NSPACE:
3777 if (LOC)
3778 ANYOF_CLASS_SET(ret, ANYOF_NSPACE);
3779 else {
3780 for (value = 0; value < 256; value++)
3781 if (!isSPACE(value))
3782 ANYOF_BITMAP_SET(ret, value);
3783 }
3784 Perl_sv_catpvf(aTHX_ listsv, "!utf8::IsSpacePerl\n");
73b437c8
JH
3785 break;
3786 case ANYOF_UPPER:
3787 if (LOC)
936ed897 3788 ANYOF_CLASS_SET(ret, ANYOF_UPPER);
73b437c8
JH
3789 else {
3790 for (value = 0; value < 256; value++)
3791 if (isUPPER(value))
936ed897 3792 ANYOF_BITMAP_SET(ret, value);
73b437c8 3793 }
ffc61ed2 3794 Perl_sv_catpvf(aTHX_ listsv, "+utf8::IsUpper\n");
73b437c8
JH
3795 break;
3796 case ANYOF_NUPPER:
3797 if (LOC)
936ed897 3798 ANYOF_CLASS_SET(ret, ANYOF_NUPPER);
73b437c8
JH
3799 else {
3800 for (value = 0; value < 256; value++)
3801 if (!isUPPER(value))
936ed897 3802 ANYOF_BITMAP_SET(ret, value);
73b437c8 3803 }
ffc61ed2 3804 Perl_sv_catpvf(aTHX_ listsv, "!utf8::IsUpper\n");
73b437c8
JH
3805 break;
3806 case ANYOF_XDIGIT:
3807 if (LOC)
936ed897 3808 ANYOF_CLASS_SET(ret, ANYOF_XDIGIT);
73b437c8
JH
3809 else {
3810 for (value = 0; value < 256; value++)
3811 if (isXDIGIT(value))
936ed897 3812 ANYOF_BITMAP_SET(ret, value);
73b437c8 3813 }
ffc61ed2 3814 Perl_sv_catpvf(aTHX_ listsv, "+utf8::IsXDigit\n");
73b437c8
JH
3815 break;
3816 case ANYOF_NXDIGIT:
3817 if (LOC)
936ed897 3818 ANYOF_CLASS_SET(ret, ANYOF_NXDIGIT);
73b437c8
JH
3819 else {
3820 for (value = 0; value < 256; value++)
3821 if (!isXDIGIT(value))
936ed897 3822 ANYOF_BITMAP_SET(ret, value);
73b437c8 3823 }
ffc61ed2 3824 Perl_sv_catpvf(aTHX_ listsv, "!utf8::IsXDigit\n");
73b437c8
JH
3825 break;
3826 default:
b45f050a 3827 vFAIL("Invalid [::] class");
73b437c8 3828 break;
b8c5462f 3829 }
b8c5462f 3830 if (LOC)
936ed897 3831 ANYOF_FLAGS(ret) |= ANYOF_CLASS;
73b437c8 3832 continue;
a0d0e21e 3833 }
ffc61ed2
JH
3834 } /* end of namedclass \blah */
3835
a0d0e21e 3836 if (range) {
3a3c4447 3837 if (prevvalue > value) /* b-a */ {
b45f050a 3838 Simple_vFAIL4("Invalid [] range \"%*.*s\"",
830247a4
IZ
3839 RExC_parse - rangebegin,
3840 RExC_parse - rangebegin,
b45f050a 3841 rangebegin);
3568d838 3842 range = 0; /* not a valid range */
73b437c8 3843 }
a0d0e21e
LW
3844 }
3845 else {
3568d838 3846 prevvalue = value; /* save the beginning of the range */
830247a4
IZ
3847 if (*RExC_parse == '-' && RExC_parse+1 < RExC_end &&
3848 RExC_parse[1] != ']') {
3849 RExC_parse++;
ffc61ed2
JH
3850
3851 /* a bad range like \w-, [:word:]- ? */
3852 if (namedclass > OOB_NAMEDCLASS) {
e476b1b5 3853 if (ckWARN(WARN_REGEXP))
830247a4 3854 vWARN4(RExC_parse,
b45f050a 3855 "False [] range \"%*.*s\"",
830247a4
IZ
3856 RExC_parse - rangebegin,
3857 RExC_parse - rangebegin,
b45f050a 3858 rangebegin);
73b437c8 3859 if (!SIZE_ONLY)
936ed897 3860 ANYOF_BITMAP_SET(ret, '-');
73b437c8 3861 } else
ffc61ed2
JH
3862 range = 1; /* yeah, it's a range! */
3863 continue; /* but do it the next time */
a0d0e21e 3864 }
a687059c 3865 }
ffc61ed2 3866
93733859 3867 /* now is the next time */
ae5c130c 3868 if (!SIZE_ONLY) {
3568d838
JH
3869 IV i;
3870
3871 if (prevvalue < 256) {
3872 IV ceilvalue = value < 256 ? value : 255;
3873
3874#ifdef EBCDIC
3a3c4447
JH
3875 if ((isLOWER(prevvalue) && isLOWER(ceilvalue)) ||
3876 (isUPPER(prevvalue) && isUPPER(ceilvalue)))
ffc61ed2 3877 {
3568d838
JH
3878 if (isLOWER(prevvalue)) {
3879 for (i = prevvalue; i <= ceilvalue; i++)
ffc61ed2
JH
3880 if (isLOWER(i))
3881 ANYOF_BITMAP_SET(ret, i);
3882 } else {
3568d838 3883 for (i = prevvalue; i <= ceilvalue; i++)
ffc61ed2
JH
3884 if (isUPPER(i))
3885 ANYOF_BITMAP_SET(ret, i);
3886 }
8ada0baa 3887 }
ffc61ed2 3888 else
8ada0baa 3889#endif
3568d838
JH
3890 for (i = prevvalue; i <= ceilvalue; i++)
3891 ANYOF_BITMAP_SET(ret, i);
3892 }
3893 if (value > 255) {
ffc61ed2 3894 ANYOF_FLAGS(ret) |= ANYOF_UNICODE;
3568d838 3895 if (prevvalue < value)
ffc61ed2 3896 Perl_sv_catpvf(aTHX_ listsv, "%04"UVxf"\t%04"UVxf"\n",
3568d838
JH
3897 (UV)prevvalue, (UV)value);
3898 else if (prevvalue == value)
ffc61ed2
JH
3899 Perl_sv_catpvf(aTHX_ listsv, "%04"UVxf"\n",
3900 (UV)value);
3901 }
8ada0baa 3902 }
ffc61ed2
JH
3903
3904 range = 0; /* this range (if it was one) is done now */
a0d0e21e 3905 }
ffc61ed2 3906
936ed897 3907 if (need_class) {
4f66b38d 3908 ANYOF_FLAGS(ret) |= ANYOF_LARGE;
936ed897 3909 if (SIZE_ONLY)
830247a4 3910 RExC_size += ANYOF_CLASS_ADD_SKIP;
936ed897 3911 else
830247a4 3912 RExC_emit += ANYOF_CLASS_ADD_SKIP;
936ed897 3913 }
ffc61ed2 3914
ae5c130c 3915 /* optimize case-insensitive simple patterns (e.g. /[a-z]/i) */
b8c5462f 3916 if (!SIZE_ONLY &&
ffc61ed2 3917 /* If the only flag is folding (plus possibly inversion). */
516a5887
JH
3918 ((ANYOF_FLAGS(ret) & (ANYOF_FLAGS_ALL ^ ANYOF_INVERT)) == ANYOF_FOLD)
3919 ) {
a0ed51b3 3920 for (value = 0; value < 256; ++value) {
936ed897 3921 if (ANYOF_BITMAP_TEST(ret, value)) {
ffc61ed2
JH
3922 IV fold = PL_fold[value];
3923
3924 if (fold != value)
3925 ANYOF_BITMAP_SET(ret, fold);
ae5c130c
GS
3926 }
3927 }
936ed897 3928 ANYOF_FLAGS(ret) &= ~ANYOF_FOLD;
ae5c130c 3929 }
ffc61ed2 3930
ae5c130c 3931 /* optimize inverted simple patterns (e.g. [^a-z]) */
3568d838 3932 if (!SIZE_ONLY && optimize_invert &&
ffc61ed2
JH
3933 /* If the only flag is inversion. */
3934 (ANYOF_FLAGS(ret) & ANYOF_FLAGS_ALL) == ANYOF_INVERT) {
b8c5462f 3935 for (value = 0; value < ANYOF_BITMAP_SIZE; ++value)
936ed897 3936 ANYOF_BITMAP(ret)[value] ^= ANYOF_FLAGS_ALL;
1aa99e6b 3937 ANYOF_FLAGS(ret) = ANYOF_UNICODE_ALL;
ae5c130c 3938 }
a0d0e21e 3939
b81d288d 3940 if (!SIZE_ONLY) {
fde631ed 3941 AV *av = newAV();
ffc61ed2
JH
3942 SV *rv;
3943
3944 av_store(av, 0, listsv);
3945 av_store(av, 1, NULL);
3946 rv = newRV_noinc((SV*)av);
19860706 3947 n = add_data(pRExC_state, 1, "s");
830247a4 3948 RExC_rx->data->data[n] = (void*)rv;
ffc61ed2 3949 ARG_SET(ret, n);
a0ed51b3
LW
3950 }
3951
3952 return ret;
3953}
3954
76e3520e 3955STATIC char*
830247a4 3956S_nextchar(pTHX_ RExC_state_t *pRExC_state)
a0d0e21e 3957{
830247a4 3958 char* retval = RExC_parse++;
a0d0e21e 3959
4633a7c4 3960 for (;;) {
830247a4
IZ
3961 if (*RExC_parse == '(' && RExC_parse[1] == '?' &&
3962 RExC_parse[2] == '#') {
3963 while (*RExC_parse && *RExC_parse != ')')
3964 RExC_parse++;
3965 RExC_parse++;
4633a7c4
LW
3966 continue;
3967 }
830247a4
IZ
3968 if (RExC_flags16 & PMf_EXTENDED) {
3969 if (isSPACE(*RExC_parse)) {
3970 RExC_parse++;
748a9306
LW
3971 continue;
3972 }
830247a4
IZ
3973 else if (*RExC_parse == '#') {
3974 while (*RExC_parse && *RExC_parse != '\n')
3975 RExC_parse++;
3976 RExC_parse++;
748a9306
LW
3977 continue;
3978 }
748a9306 3979 }
4633a7c4 3980 return retval;
a0d0e21e 3981 }
a687059c
LW
3982}
3983
3984/*
c277df42 3985- reg_node - emit a node
a0d0e21e 3986*/
76e3520e 3987STATIC regnode * /* Location. */
830247a4 3988S_reg_node(pTHX_ RExC_state_t *pRExC_state, U8 op)
a687059c 3989{
c277df42
IZ
3990 register regnode *ret;
3991 register regnode *ptr;
a687059c 3992
830247a4 3993 ret = RExC_emit;
c277df42 3994 if (SIZE_ONLY) {
830247a4
IZ
3995 SIZE_ALIGN(RExC_size);
3996 RExC_size += 1;
a0d0e21e
LW
3997 return(ret);
3998 }
a687059c 3999
c277df42 4000 NODE_ALIGN_FILL(ret);
a0d0e21e 4001 ptr = ret;
c277df42 4002 FILL_ADVANCE_NODE(ptr, op);
fac92740
MJD
4003 if (RExC_offsets) { /* MJD */
4004 MJD_OFFSET_DEBUG((stderr, "%s:%u: (op %s) %s %u <- %u (len %u) (max %u).\n",
4005 "reg_node", __LINE__,
4006 reg_name[op],
4007 RExC_emit - RExC_emit_start > RExC_offsets[0]
4008 ? "Overwriting end of array!\n" : "OK",
4009 RExC_emit - RExC_emit_start,
4010 RExC_parse - RExC_start,
4011 RExC_offsets[0]));
4012 Set_Node_Offset(RExC_emit, RExC_parse + (op == END));
4013 }
4014
830247a4 4015 RExC_emit = ptr;
a687059c 4016
a0d0e21e 4017 return(ret);
a687059c
LW
4018}
4019
4020/*
a0d0e21e
LW
4021- reganode - emit a node with an argument
4022*/
76e3520e 4023STATIC regnode * /* Location. */
830247a4 4024S_reganode(pTHX_ RExC_state_t *pRExC_state, U8 op, U32 arg)
fe14fcc3 4025{
c277df42
IZ
4026 register regnode *ret;
4027 register regnode *ptr;
fe14fcc3 4028
830247a4 4029 ret = RExC_emit;
c277df42 4030 if (SIZE_ONLY) {
830247a4
IZ
4031 SIZE_ALIGN(RExC_size);
4032 RExC_size += 2;
a0d0e21e
LW
4033 return(ret);
4034 }
fe14fcc3 4035
c277df42 4036 NODE_ALIGN_FILL(ret);
a0d0e21e 4037 ptr = ret;
c277df42 4038 FILL_ADVANCE_NODE_ARG(ptr, op, arg);
fac92740
MJD
4039 if (RExC_offsets) { /* MJD */
4040 MJD_OFFSET_DEBUG((stderr, "%s: %s %u <- %u (max %u).\n",
4041 "reganode",
4042 RExC_emit - RExC_emit_start > RExC_offsets[0] ?
4043 "Overwriting end of array!\n" : "OK",
4044 RExC_emit - RExC_emit_start,
4045 RExC_parse - RExC_start,
4046 RExC_offsets[0]));
4047 Set_Cur_Node_Offset;
4048 }
4049
830247a4 4050 RExC_emit = ptr;
fe14fcc3 4051
a0d0e21e 4052 return(ret);
fe14fcc3
LW
4053}
4054
4055/*
cd439c50 4056- reguni - emit (if appropriate) a Unicode character
a0ed51b3
LW
4057*/
4058STATIC void
830247a4 4059S_reguni(pTHX_ RExC_state_t *pRExC_state, UV uv, char* s, STRLEN* lenp)
a0ed51b3 4060{
5e12f4fb 4061 *lenp = SIZE_ONLY ? UNISKIP(uv) : (uvchr_to_utf8((U8*)s, uv) - (U8*)s);
a0ed51b3
LW
4062}
4063
4064/*
a0d0e21e
LW
4065- reginsert - insert an operator in front of already-emitted operand
4066*
4067* Means relocating the operand.
4068*/
76e3520e 4069STATIC void
830247a4 4070S_reginsert(pTHX_ RExC_state_t *pRExC_state, U8 op, regnode *opnd)
a687059c 4071{
c277df42
IZ
4072 register regnode *src;
4073 register regnode *dst;
4074 register regnode *place;
4075 register int offset = regarglen[(U8)op];
b81d288d 4076
22c35a8c 4077/* (PL_regkind[(U8)op] == CURLY ? EXTRA_STEP_2ARGS : 0); */
c277df42
IZ
4078
4079 if (SIZE_ONLY) {
830247a4 4080 RExC_size += NODE_STEP_REGNODE + offset;
a0d0e21e
LW
4081 return;
4082 }
a687059c 4083
830247a4
IZ
4084 src = RExC_emit;
4085 RExC_emit += NODE_STEP_REGNODE + offset;
4086 dst = RExC_emit;
fac92740 4087 while (src > opnd) {
c277df42 4088 StructCopy(--src, --dst, regnode);
fac92740
MJD
4089 if (RExC_offsets) { /* MJD 20010112 */
4090 MJD_OFFSET_DEBUG((stderr, "%s: %s copy %u -> %u (max %u).\n",
4091 "reg_insert",
4092 dst - RExC_emit_start > RExC_offsets[0]
4093 ? "Overwriting end of array!\n" : "OK",
4094 src - RExC_emit_start,
4095 dst - RExC_emit_start,
4096 RExC_offsets[0]));
4097 Set_Node_Offset_To_R(dst-RExC_emit_start, Node_Offset(src));
4098 Set_Node_Length_To_R(dst-RExC_emit_start, Node_Length(src));
4099 }
4100 }
4101
a0d0e21e
LW
4102
4103 place = opnd; /* Op node, where operand used to be. */
fac92740
MJD
4104 if (RExC_offsets) { /* MJD */
4105 MJD_OFFSET_DEBUG((stderr, "%s: %s %u <- %u (max %u).\n",
4106 "reginsert",
4107 place - RExC_emit_start > RExC_offsets[0]
4108 ? "Overwriting end of array!\n" : "OK",
4109 place - RExC_emit_start,
4110 RExC_parse - RExC_start,
4111 RExC_offsets[0]));
4112 Set_Node_Offset(place, RExC_parse);
4113 }
c277df42
IZ
4114 src = NEXTOPER(place);
4115 FILL_ADVANCE_NODE(place, op);
4116 Zero(src, offset, regnode);
a687059c
LW
4117}
4118
4119/*
c277df42 4120- regtail - set the next-pointer at the end of a node chain of p to val.
a0d0e21e 4121*/
76e3520e 4122STATIC void
830247a4 4123S_regtail(pTHX_ RExC_state_t *pRExC_state, regnode *p, regnode *val)
a687059c 4124{
c277df42
IZ
4125 register regnode *scan;
4126 register regnode *temp;
a0d0e21e 4127
c277df42 4128 if (SIZE_ONLY)
a0d0e21e
LW
4129 return;
4130
4131 /* Find last node. */
4132 scan = p;
4133 for (;;) {
4134 temp = regnext(scan);
4135 if (temp == NULL)
4136 break;
4137 scan = temp;
4138 }
a687059c 4139
c277df42
IZ
4140 if (reg_off_by_arg[OP(scan)]) {
4141 ARG_SET(scan, val - scan);
a0ed51b3
LW
4142 }
4143 else {
c277df42
IZ
4144 NEXT_OFF(scan) = val - scan;
4145 }
a687059c
LW
4146}
4147
4148/*
a0d0e21e
LW
4149- regoptail - regtail on operand of first argument; nop if operandless
4150*/
76e3520e 4151STATIC void
830247a4 4152S_regoptail(pTHX_ RExC_state_t *pRExC_state, regnode *p, regnode *val)
a687059c 4153{
a0d0e21e 4154 /* "Operandless" and "op != BRANCH" are synonymous in practice. */
c277df42
IZ
4155 if (p == NULL || SIZE_ONLY)
4156 return;
22c35a8c 4157 if (PL_regkind[(U8)OP(p)] == BRANCH) {
830247a4 4158 regtail(pRExC_state, NEXTOPER(p), val);
a0ed51b3 4159 }
22c35a8c 4160 else if ( PL_regkind[(U8)OP(p)] == BRANCHJ) {
830247a4 4161 regtail(pRExC_state, NEXTOPER(NEXTOPER(p)), val);
a0ed51b3
LW
4162 }
4163 else
a0d0e21e 4164 return;
a687059c
LW
4165}
4166
4167/*
4168 - regcurly - a little FSA that accepts {\d+,?\d*}
4169 */
79072805 4170STATIC I32
cea2e8a9 4171S_regcurly(pTHX_ register char *s)
a687059c
LW
4172{
4173 if (*s++ != '{')
4174 return FALSE;
f0fcb552 4175 if (!isDIGIT(*s))
a687059c 4176 return FALSE;
f0fcb552 4177 while (isDIGIT(*s))
a687059c
LW
4178 s++;
4179 if (*s == ',')
4180 s++;
f0fcb552 4181 while (isDIGIT(*s))
a687059c
LW
4182 s++;
4183 if (*s != '}')
4184 return FALSE;
4185 return TRUE;
4186}
4187
a687059c 4188
8fa7f367
JH
4189#ifdef DEBUGGING
4190
76e3520e 4191STATIC regnode *
cea2e8a9 4192S_dumpuntil(pTHX_ regnode *start, regnode *node, regnode *last, SV* sv, I32 l)
c277df42 4193{
f248d071 4194 register U8 op = EXACT; /* Arbitrary non-END op. */
155aba94 4195 register regnode *next;
c277df42
IZ
4196
4197 while (op != END && (!last || node < last)) {
4198 /* While that wasn't END last time... */
4199
4200 NODE_ALIGN(node);
4201 op = OP(node);
4202 if (op == CLOSE)
4203 l--;
4204 next = regnext(node);
4205 /* Where, what. */
4206 if (OP(node) == OPTIMIZED)
4207 goto after_print;
4208 regprop(sv, node);
b900a521 4209 PerlIO_printf(Perl_debug_log, "%4"IVdf":%*s%s", (IV)(node - start),
f1dbda3d 4210 (int)(2*l + 1), "", SvPVX(sv));
c277df42
IZ
4211 if (next == NULL) /* Next ptr. */
4212 PerlIO_printf(Perl_debug_log, "(0)");
b81d288d 4213 else
b900a521 4214 PerlIO_printf(Perl_debug_log, "(%"IVdf")", (IV)(next - start));
c277df42
IZ
4215 (void)PerlIO_putc(Perl_debug_log, '\n');
4216 after_print:
22c35a8c 4217 if (PL_regkind[(U8)op] == BRANCHJ) {
b81d288d
AB
4218 register regnode *nnode = (OP(next) == LONGJMP
4219 ? regnext(next)
c277df42
IZ
4220 : next);
4221 if (last && nnode > last)
4222 nnode = last;
4223 node = dumpuntil(start, NEXTOPER(NEXTOPER(node)), nnode, sv, l + 1);
a0ed51b3 4224 }
22c35a8c 4225 else if (PL_regkind[(U8)op] == BRANCH) {
c277df42 4226 node = dumpuntil(start, NEXTOPER(node), next, sv, l + 1);
a0ed51b3
LW
4227 }
4228 else if ( op == CURLY) { /* `next' might be very big: optimizer */
c277df42
IZ
4229 node = dumpuntil(start, NEXTOPER(node) + EXTRA_STEP_2ARGS,
4230 NEXTOPER(node) + EXTRA_STEP_2ARGS + 1, sv, l + 1);
a0ed51b3 4231 }
22c35a8c 4232 else if (PL_regkind[(U8)op] == CURLY && op != CURLYX) {
c277df42
IZ
4233 node = dumpuntil(start, NEXTOPER(node) + EXTRA_STEP_2ARGS,
4234 next, sv, l + 1);
a0ed51b3
LW
4235 }
4236 else if ( op == PLUS || op == STAR) {
c277df42 4237 node = dumpuntil(start, NEXTOPER(node), NEXTOPER(node) + 1, sv, l + 1);
a0ed51b3
LW
4238 }
4239 else if (op == ANYOF) {
4f66b38d
HS
4240 /* arglen 1 + class block */
4241 node += 1 + ((ANYOF_FLAGS(node) & ANYOF_LARGE)
4242 ? ANYOF_CLASS_SKIP : ANYOF_SKIP);
4243 node = NEXTOPER(node);
a0ed51b3 4244 }
22c35a8c 4245 else if (PL_regkind[(U8)op] == EXACT) {
c277df42 4246 /* Literal string, where present. */
cd439c50 4247 node += NODE_SZ_STR(node) - 1;
c277df42 4248 node = NEXTOPER(node);
a0ed51b3
LW
4249 }
4250 else {
c277df42
IZ
4251 node = NEXTOPER(node);
4252 node += regarglen[(U8)op];
4253 }
4254 if (op == CURLYX || op == OPEN)
4255 l++;
4256 else if (op == WHILEM)
4257 l--;
4258 }
4259 return node;
4260}
4261
8fa7f367
JH
4262#endif /* DEBUGGING */
4263
a687059c 4264/*
fd181c75 4265 - regdump - dump a regexp onto Perl_debug_log in vaguely comprehensible form
a687059c
LW
4266 */
4267void
864dbfa3 4268Perl_regdump(pTHX_ regexp *r)
a687059c 4269{
35ff7856 4270#ifdef DEBUGGING
46fc3d4c 4271 SV *sv = sv_newmortal();
a687059c 4272
c277df42 4273 (void)dumpuntil(r->program, r->program + 1, NULL, sv, 0);
a0d0e21e
LW
4274
4275 /* Header fields of interest. */
c277df42 4276 if (r->anchored_substr)
7b0972df 4277 PerlIO_printf(Perl_debug_log,
b81d288d 4278 "anchored `%s%.*s%s'%s at %"IVdf" ",
3280af22 4279 PL_colors[0],
7b0972df 4280 (int)(SvCUR(r->anchored_substr) - (SvTAIL(r->anchored_substr)!=0)),
b81d288d 4281 SvPVX(r->anchored_substr),
3280af22 4282 PL_colors[1],
c277df42 4283 SvTAIL(r->anchored_substr) ? "$" : "",
7b0972df 4284 (IV)r->anchored_offset);
c277df42 4285 if (r->float_substr)
7b0972df 4286 PerlIO_printf(Perl_debug_log,
b81d288d 4287 "floating `%s%.*s%s'%s at %"IVdf"..%"UVuf" ",
3280af22 4288 PL_colors[0],
b81d288d 4289 (int)(SvCUR(r->float_substr) - (SvTAIL(r->float_substr)!=0)),
2c2d71f5 4290 SvPVX(r->float_substr),
3280af22 4291 PL_colors[1],
c277df42 4292 SvTAIL(r->float_substr) ? "$" : "",
7b0972df 4293 (IV)r->float_min_offset, (UV)r->float_max_offset);
c277df42 4294 if (r->check_substr)
b81d288d
AB
4295 PerlIO_printf(Perl_debug_log,
4296 r->check_substr == r->float_substr
c277df42
IZ
4297 ? "(checking floating" : "(checking anchored");
4298 if (r->reganch & ROPT_NOSCAN)
4299 PerlIO_printf(Perl_debug_log, " noscan");
4300 if (r->reganch & ROPT_CHECK_ALL)
4301 PerlIO_printf(Perl_debug_log, " isall");
4302 if (r->check_substr)
4303 PerlIO_printf(Perl_debug_log, ") ");
4304
46fc3d4c 4305 if (r->regstclass) {
4306 regprop(sv, r->regstclass);
4307 PerlIO_printf(Perl_debug_log, "stclass `%s' ", SvPVX(sv));
4308 }
774d564b 4309 if (r->reganch & ROPT_ANCH) {
4310 PerlIO_printf(Perl_debug_log, "anchored");
4311 if (r->reganch & ROPT_ANCH_BOL)
4312 PerlIO_printf(Perl_debug_log, "(BOL)");
c277df42
IZ
4313 if (r->reganch & ROPT_ANCH_MBOL)
4314 PerlIO_printf(Perl_debug_log, "(MBOL)");
cad2e5aa
JH
4315 if (r->reganch & ROPT_ANCH_SBOL)
4316 PerlIO_printf(Perl_debug_log, "(SBOL)");
774d564b 4317 if (r->reganch & ROPT_ANCH_GPOS)
4318 PerlIO_printf(Perl_debug_log, "(GPOS)");
4319 PerlIO_putc(Perl_debug_log, ' ');
4320 }
c277df42
IZ
4321 if (r->reganch & ROPT_GPOS_SEEN)
4322 PerlIO_printf(Perl_debug_log, "GPOS ");
a0d0e21e 4323 if (r->reganch & ROPT_SKIP)
760ac839 4324 PerlIO_printf(Perl_debug_log, "plus ");
a0d0e21e 4325 if (r->reganch & ROPT_IMPLICIT)
760ac839 4326 PerlIO_printf(Perl_debug_log, "implicit ");
760ac839 4327 PerlIO_printf(Perl_debug_log, "minlen %ld ", (long) r->minlen);
ce862d02
IZ
4328 if (r->reganch & ROPT_EVAL_SEEN)
4329 PerlIO_printf(Perl_debug_log, "with eval ");
760ac839 4330 PerlIO_printf(Perl_debug_log, "\n");
fac92740
MJD
4331 if (r->offsets) {
4332 U32 i;
4333 U32 len = r->offsets[0];
392fbf5d 4334 PerlIO_printf(Perl_debug_log, "Offsets: [%"UVuf"]\n\t", (UV)r->offsets[0]);
fac92740 4335 for (i = 1; i <= len; i++)
392fbf5d
RB
4336 PerlIO_printf(Perl_debug_log, "%"UVuf"[%"UVuf"] ",
4337 (UV)r->offsets[i*2-1],
4338 (UV)r->offsets[i*2]);
fac92740
MJD
4339 PerlIO_printf(Perl_debug_log, "\n");
4340 }
17c3b450 4341#endif /* DEBUGGING */
a687059c
LW
4342}
4343
8fa7f367
JH
4344#ifdef DEBUGGING
4345
653099ff
GS
4346STATIC void
4347S_put_byte(pTHX_ SV *sv, int c)
4348{
ffc61ed2 4349 if (isCNTRL(c) || c == 127 || c == 255 || !isPRINT(c))
653099ff
GS
4350 Perl_sv_catpvf(aTHX_ sv, "\\%o", c);
4351 else if (c == '-' || c == ']' || c == '\\' || c == '^')
4352 Perl_sv_catpvf(aTHX_ sv, "\\%c", c);
4353 else
4354 Perl_sv_catpvf(aTHX_ sv, "%c", c);
4355}
4356
8fa7f367
JH
4357#endif /* DEBUGGING */
4358
a687059c 4359/*
a0d0e21e
LW
4360- regprop - printable representation of opcode
4361*/
46fc3d4c 4362void
864dbfa3 4363Perl_regprop(pTHX_ SV *sv, regnode *o)
a687059c 4364{
35ff7856 4365#ifdef DEBUGGING
9b155405 4366 register int k;
a0d0e21e 4367
54dc92de 4368 sv_setpvn(sv, "", 0);
9b155405 4369 if (OP(o) >= reg_num) /* regnode.type is unsigned */
830247a4
IZ
4370 /* It would be nice to FAIL() here, but this may be called from
4371 regexec.c, and it would be hard to supply pRExC_state. */
4372 Perl_croak(aTHX_ "Corrupted regexp opcode");
9b155405
IZ
4373 sv_catpv(sv, (char*)reg_name[OP(o)]); /* Take off const! */
4374
4375 k = PL_regkind[(U8)OP(o)];
4376
4377 if (k == EXACT)
7821416a 4378 Perl_sv_catpvf(aTHX_ sv, " <%s%.*s%s>", PL_colors[0],
cd439c50 4379 STR_LEN(o), STRING(o), PL_colors[1]);
9b155405 4380 else if (k == CURLY) {
cb434fcc 4381 if (OP(o) == CURLYM || OP(o) == CURLYN || OP(o) == CURLYX)
cea2e8a9
GS
4382 Perl_sv_catpvf(aTHX_ sv, "[%d]", o->flags); /* Parenth number */
4383 Perl_sv_catpvf(aTHX_ sv, " {%d,%d}", ARG1(o), ARG2(o));
a0d0e21e 4384 }
2c2d71f5
JH
4385 else if (k == WHILEM && o->flags) /* Ordinal/of */
4386 Perl_sv_catpvf(aTHX_ sv, "[%d/%d]", o->flags & 0xf, o->flags>>4);
9b155405 4387 else if (k == REF || k == OPEN || k == CLOSE || k == GROUPP )
894356b3 4388 Perl_sv_catpvf(aTHX_ sv, "%d", (int)ARG(o)); /* Parenth number */
9b155405 4389 else if (k == LOGICAL)
04ebc1ab 4390 Perl_sv_catpvf(aTHX_ sv, "[%d]", o->flags); /* 2: embedded, otherwise 1 */
653099ff
GS
4391 else if (k == ANYOF) {
4392 int i, rangestart = -1;
ffc61ed2 4393 U8 flags = ANYOF_FLAGS(o);
19860706
JH
4394 const char * const anyofs[] = { /* Should be syncronized with
4395 * ANYOF_ #xdefines in regcomp.h */
653099ff
GS
4396 "\\w",
4397 "\\W",
4398 "\\s",
4399 "\\S",
4400 "\\d",
4401 "\\D",
4402 "[:alnum:]",
4403 "[:^alnum:]",
4404 "[:alpha:]",
4405 "[:^alpha:]",
4406 "[:ascii:]",
4407 "[:^ascii:]",
4408 "[:ctrl:]",
4409 "[:^ctrl:]",
4410 "[:graph:]",
4411 "[:^graph:]",
4412 "[:lower:]",
4413 "[:^lower:]",
4414 "[:print:]",
4415 "[:^print:]",
4416 "[:punct:]",
4417 "[:^punct:]",
4418 "[:upper:]",
aaa51d5e 4419 "[:^upper:]",
653099ff 4420 "[:xdigit:]",
aaa51d5e
JF
4421 "[:^xdigit:]",
4422 "[:space:]",
4423 "[:^space:]",
4424 "[:blank:]",
4425 "[:^blank:]"
653099ff
GS
4426 };
4427
19860706 4428 if (flags & ANYOF_LOCALE)
653099ff 4429 sv_catpv(sv, "{loc}");
19860706 4430 if (flags & ANYOF_FOLD)
653099ff
GS
4431 sv_catpv(sv, "{i}");
4432 Perl_sv_catpvf(aTHX_ sv, "[%s", PL_colors[0]);
19860706 4433 if (flags & ANYOF_INVERT)
653099ff 4434 sv_catpv(sv, "^");
ffc61ed2
JH
4435 for (i = 0; i <= 256; i++) {
4436 if (i < 256 && ANYOF_BITMAP_TEST(o,i)) {
4437 if (rangestart == -1)
4438 rangestart = i;
4439 } else if (rangestart != -1) {
4440 if (i <= rangestart + 3)
4441 for (; rangestart < i; rangestart++)
653099ff 4442 put_byte(sv, rangestart);
ffc61ed2
JH
4443 else {
4444 put_byte(sv, rangestart);
4445 sv_catpv(sv, "-");
4446 put_byte(sv, i - 1);
653099ff 4447 }
ffc61ed2 4448 rangestart = -1;
653099ff 4449 }
847a199f 4450 }
ffc61ed2
JH
4451
4452 if (o->flags & ANYOF_CLASS)
4453 for (i = 0; i < sizeof(anyofs)/sizeof(char*); i++)
4454 if (ANYOF_CLASS_TEST(o,i))
4455 sv_catpv(sv, anyofs[i]);
4456
4457 if (flags & ANYOF_UNICODE)
4458 sv_catpv(sv, "{unicode}");
1aa99e6b
IH
4459 else if (flags & ANYOF_UNICODE_ALL)
4460 sv_catpv(sv, "{all-unicode}");
ffc61ed2
JH
4461
4462 {
4463 SV *lv;
4464 SV *sw = regclass_swash(o, FALSE, &lv);
b81d288d 4465
ffc61ed2
JH
4466 if (lv) {
4467 if (sw) {
4468 UV i;
4469 U8 s[UTF8_MAXLEN+1];
b81d288d 4470
ffc61ed2 4471 for (i = 0; i <= 256; i++) { /* just the first 256 */
2b9d42f0 4472 U8 *e = uvchr_to_utf8(s, i);
ffc61ed2 4473
3568d838 4474 if (i < 256 && swash_fetch(sw, s, TRUE)) {
ffc61ed2
JH
4475 if (rangestart == -1)
4476 rangestart = i;
4477 } else if (rangestart != -1) {
4478 U8 *p;
b81d288d 4479
ffc61ed2
JH
4480 if (i <= rangestart + 3)
4481 for (; rangestart < i; rangestart++) {
2b9d42f0 4482 for(e = uvchr_to_utf8(s, rangestart), p = s; p < e; p++)
ffc61ed2
JH
4483 put_byte(sv, *p);
4484 }
4485 else {
2b9d42f0 4486 for (e = uvchr_to_utf8(s, rangestart), p = s; p < e; p++)
ffc61ed2
JH
4487 put_byte(sv, *p);
4488 sv_catpv(sv, "-");
2b9d42f0 4489 for (e = uvchr_to_utf8(s, i - 1), p = s; p < e; p++)
ffc61ed2
JH
4490 put_byte(sv, *p);
4491 }
4492 rangestart = -1;
4493 }
19860706 4494 }
ffc61ed2
JH
4495
4496 sv_catpv(sv, "..."); /* et cetera */
19860706 4497 }
fde631ed 4498
ffc61ed2
JH
4499 {
4500 char *s = savepv(SvPVX(lv));
4501 char *origs = s;
b81d288d 4502
ffc61ed2 4503 while(*s && *s != '\n') s++;
b81d288d 4504
ffc61ed2
JH
4505 if (*s == '\n') {
4506 char *t = ++s;
4507
4508 while (*s) {
4509 if (*s == '\n')
4510 *s = ' ';
4511 s++;
4512 }
4513 if (s[-1] == ' ')
4514 s[-1] = 0;
4515
4516 sv_catpv(sv, t);
fde631ed 4517 }
b81d288d 4518
ffc61ed2 4519 Safefree(origs);
fde631ed
JH
4520 }
4521 }
653099ff 4522 }
ffc61ed2 4523
653099ff
GS
4524 Perl_sv_catpvf(aTHX_ sv, "%s]", PL_colors[1]);
4525 }
9b155405 4526 else if (k == BRANCHJ && (OP(o) == UNLESSM || OP(o) == IFMATCH))
cea2e8a9 4527 Perl_sv_catpvf(aTHX_ sv, "[-%d]", o->flags);
17c3b450 4528#endif /* DEBUGGING */
35ff7856 4529}
a687059c 4530
cad2e5aa
JH
4531SV *
4532Perl_re_intuit_string(pTHX_ regexp *prog)
4533{ /* Assume that RE_INTUIT is set */
4534 DEBUG_r(
4535 { STRLEN n_a;
4536 char *s = SvPV(prog->check_substr,n_a);
4537
4538 if (!PL_colorset) reginitcolors();
4539 PerlIO_printf(Perl_debug_log,
4540 "%sUsing REx substr:%s `%s%.60s%s%s'\n",
4541 PL_colors[4],PL_colors[5],PL_colors[0],
4542 s,
4543 PL_colors[1],
4544 (strlen(s) > 60 ? "..." : ""));
4545 } );
4546
4547 return prog->check_substr;
4548}
4549
2b69d0c2 4550void
864dbfa3 4551Perl_pregfree(pTHX_ struct regexp *r)
a687059c 4552{
adac82c7 4553 DEBUG_r(if (!PL_colorset) reginitcolors());
7821416a
IZ
4554
4555 if (!r || (--r->refcnt > 0))
4556 return;
cad2e5aa
JH
4557 DEBUG_r(PerlIO_printf(Perl_debug_log,
4558 "%sFreeing REx:%s `%s%.60s%s%s'\n",
4559 PL_colors[4],PL_colors[5],PL_colors[0],
4560 r->precomp,
4561 PL_colors[1],
4562 (strlen(r->precomp) > 60 ? "..." : "")));
4563
c277df42 4564 if (r->precomp)
a0d0e21e 4565 Safefree(r->precomp);
fac92740
MJD
4566 if (r->offsets) /* 20010421 MJD */
4567 Safefree(r->offsets);
cf93c79d
IZ
4568 if (RX_MATCH_COPIED(r))
4569 Safefree(r->subbeg);
a193d654
GS
4570 if (r->substrs) {
4571 if (r->anchored_substr)
4572 SvREFCNT_dec(r->anchored_substr);
4573 if (r->float_substr)
4574 SvREFCNT_dec(r->float_substr);
2779dcf1 4575 Safefree(r->substrs);
a193d654 4576 }
c277df42
IZ
4577 if (r->data) {
4578 int n = r->data->count;
dfad63ad
HS
4579 AV* new_comppad = NULL;
4580 AV* old_comppad;
4581 SV** old_curpad;
4582
c277df42
IZ
4583 while (--n >= 0) {
4584 switch (r->data->what[n]) {
4585 case 's':
4586 SvREFCNT_dec((SV*)r->data->data[n]);
4587 break;
653099ff
GS
4588 case 'f':
4589 Safefree(r->data->data[n]);
4590 break;
dfad63ad
HS
4591 case 'p':
4592 new_comppad = (AV*)r->data->data[n];
4593 break;
c277df42 4594 case 'o':
dfad63ad 4595 if (new_comppad == NULL)
cea2e8a9 4596 Perl_croak(aTHX_ "panic: pregfree comppad");
dfad63ad
HS
4597 old_comppad = PL_comppad;
4598 old_curpad = PL_curpad;
1e6dc0b6
SB
4599 /* Watch out for global destruction's random ordering. */
4600 if (SvTYPE(new_comppad) == SVt_PVAV) {
4601 PL_comppad = new_comppad;
4602 PL_curpad = AvARRAY(new_comppad);
4603 }
4604 else
4605 PL_curpad = NULL;
c277df42 4606 op_free((OP_4tree*)r->data->data[n]);
dfad63ad
HS
4607 PL_comppad = old_comppad;
4608 PL_curpad = old_curpad;
4609 SvREFCNT_dec((SV*)new_comppad);
4610 new_comppad = NULL;
c277df42
IZ
4611 break;
4612 case 'n':
4613 break;
4614 default:
830247a4 4615 Perl_croak(aTHX_ "panic: regfree data code '%c'", r->data->what[n]);
c277df42
IZ
4616 }
4617 }
4618 Safefree(r->data->what);
4619 Safefree(r->data);
a0d0e21e
LW
4620 }
4621 Safefree(r->startp);
4622 Safefree(r->endp);
4623 Safefree(r);
a687059c 4624}
c277df42
IZ
4625
4626/*
4627 - regnext - dig the "next" pointer out of a node
4628 *
4629 * [Note, when REGALIGN is defined there are two places in regmatch()
4630 * that bypass this code for speed.]
4631 */
4632regnode *
864dbfa3 4633Perl_regnext(pTHX_ register regnode *p)
c277df42
IZ
4634{
4635 register I32 offset;
4636
3280af22 4637 if (p == &PL_regdummy)
c277df42
IZ
4638 return(NULL);
4639
4640 offset = (reg_off_by_arg[OP(p)] ? ARG(p) : NEXT_OFF(p));
4641 if (offset == 0)
4642 return(NULL);
4643
c277df42 4644 return(p+offset);
c277df42
IZ
4645}
4646
01f988be 4647STATIC void
cea2e8a9 4648S_re_croak2(pTHX_ const char* pat1,const char* pat2,...)
c277df42
IZ
4649{
4650 va_list args;
4651 STRLEN l1 = strlen(pat1);
4652 STRLEN l2 = strlen(pat2);
4653 char buf[512];
06bf62c7 4654 SV *msv;
c277df42
IZ
4655 char *message;
4656
4657 if (l1 > 510)
4658 l1 = 510;
4659 if (l1 + l2 > 510)
4660 l2 = 510 - l1;
4661 Copy(pat1, buf, l1 , char);
4662 Copy(pat2, buf + l1, l2 , char);
3b818b81
GS
4663 buf[l1 + l2] = '\n';
4664 buf[l1 + l2 + 1] = '\0';
8736538c
AS
4665#ifdef I_STDARG
4666 /* ANSI variant takes additional second argument */
c277df42 4667 va_start(args, pat2);
8736538c
AS
4668#else
4669 va_start(args);
4670#endif
5a844595 4671 msv = vmess(buf, &args);
c277df42 4672 va_end(args);
06bf62c7 4673 message = SvPV(msv,l1);
c277df42
IZ
4674 if (l1 > 512)
4675 l1 = 512;
4676 Copy(message, buf, l1 , char);
4677 buf[l1] = '\0'; /* Overwrite \n */
cea2e8a9 4678 Perl_croak(aTHX_ "%s", buf);
c277df42 4679}
a0ed51b3
LW
4680
4681/* XXX Here's a total kludge. But we need to re-enter for swash routines. */
4682
4683void
864dbfa3 4684Perl_save_re_context(pTHX)
b81d288d 4685{
830247a4
IZ
4686#if 0
4687 SAVEPPTR(RExC_precomp); /* uncompiled string. */
4688 SAVEI32(RExC_npar); /* () count. */
4689 SAVEI32(RExC_size); /* Code size. */
4690 SAVEI16(RExC_flags16); /* are we folding, multilining? */
4691 SAVEVPTR(RExC_rx); /* from regcomp.c */
4692 SAVEI32(RExC_seen); /* from regcomp.c */
4693 SAVEI32(RExC_sawback); /* Did we see \1, ...? */
4694 SAVEI32(RExC_naughty); /* How bad is this pattern? */
4695 SAVEVPTR(RExC_emit); /* Code-emit pointer; &regdummy = don't */
4696 SAVEPPTR(RExC_end); /* End of input for compile */
4697 SAVEPPTR(RExC_parse); /* Input-scan pointer. */
4698#endif
4699
4700 SAVEI32(PL_reg_flags); /* from regexec.c */
a0ed51b3 4701 SAVEPPTR(PL_bostr);
a0ed51b3
LW
4702 SAVEPPTR(PL_reginput); /* String-input pointer. */
4703 SAVEPPTR(PL_regbol); /* Beginning of input, for ^ check. */
4704 SAVEPPTR(PL_regeol); /* End of input, for $ check. */
7766f137
GS
4705 SAVEVPTR(PL_regstartp); /* Pointer to startp array. */
4706 SAVEVPTR(PL_regendp); /* Ditto for endp. */
4707 SAVEVPTR(PL_reglastparen); /* Similarly for lastparen. */
a0ed51b3 4708 SAVEPPTR(PL_regtill); /* How far we are required to go. */
b81d288d 4709 SAVEGENERICPV(PL_reg_start_tmp); /* from regexec.c */
a0ed51b3 4710 PL_reg_start_tmp = 0;
a0ed51b3
LW
4711 SAVEI32(PL_reg_start_tmpl); /* from regexec.c */
4712 PL_reg_start_tmpl = 0;
7766f137 4713 SAVEVPTR(PL_regdata);
a0ed51b3
LW
4714 SAVEI32(PL_reg_eval_set); /* from regexec.c */
4715 SAVEI32(PL_regnarrate); /* from regexec.c */
7766f137 4716 SAVEVPTR(PL_regprogram); /* from regexec.c */
a0ed51b3 4717 SAVEINT(PL_regindent); /* from regexec.c */
7766f137
GS
4718 SAVEVPTR(PL_regcc); /* from regexec.c */
4719 SAVEVPTR(PL_curcop);
7766f137
GS
4720 SAVEVPTR(PL_reg_call_cc); /* from regexec.c */
4721 SAVEVPTR(PL_reg_re); /* from regexec.c */
54b6e2fa
IZ
4722 SAVEPPTR(PL_reg_ganch); /* from regexec.c */
4723 SAVESPTR(PL_reg_sv); /* from regexec.c */
7766f137 4724 SAVEVPTR(PL_reg_magic); /* from regexec.c */
54b6e2fa 4725 SAVEI32(PL_reg_oldpos); /* from regexec.c */
7766f137
GS
4726 SAVEVPTR(PL_reg_oldcurpm); /* from regexec.c */
4727 SAVEVPTR(PL_reg_curpm); /* from regexec.c */
5fb7366e 4728 SAVEI32(PL_regnpar); /* () count. */
e49a9654 4729 SAVEI32(PL_regsize); /* from regexec.c */
54b6e2fa 4730#ifdef DEBUGGING
b81d288d 4731 SAVEPPTR(PL_reg_starttry); /* from regexec.c */
54b6e2fa 4732#endif
a0ed51b3 4733}
51371543
GS
4734
4735#ifdef PERL_OBJECT
51371543
GS
4736#include "XSUB.h"
4737#undef this
4738#define this pPerl
4739#endif
4740
4741static void
4742clear_re(pTHXo_ void *r)
4743{
4744 ReREFCNT_dec((regexp *)r);
4745}
ffbc6a93 4746