This is a live mirror of the Perl 5 development currently hosted at https://github.com/perl/perl5
Integrate Memoize 0.64. Few tweaks were required in
[perl5.git] / regcomp.c
CommitLineData
a0d0e21e
LW
1/* regcomp.c
2 */
3
4/*
5 * "A fair jaw-cracker dwarf-language must be." --Samwise Gamgee
6 */
7
a687059c
LW
8/* NOTE: this is derived from Henry Spencer's regexp code, and should not
9 * confused with the original package (see point 3 below). Thanks, Henry!
10 */
11
12/* Additional note: this code is very heavily munged from Henry's version
13 * in places. In some spots I've traded clarity for efficiency, so don't
14 * blame Henry for some of the lack of readability.
15 */
16
e50aee73
AD
17/* The names of the functions have been changed from regcomp and
18 * regexec to pregcomp and pregexec in order to avoid conflicts
19 * with the POSIX routines of the same names.
20*/
21
b9d5759e
AD
22#ifdef PERL_EXT_RE_BUILD
23/* need to replace pregcomp et al, so enable that */
24# ifndef PERL_IN_XSUB_RE
25# define PERL_IN_XSUB_RE
26# endif
27/* need access to debugger hooks */
cad2e5aa 28# if defined(PERL_EXT_RE_DEBUG) && !defined(DEBUGGING)
b9d5759e
AD
29# define DEBUGGING
30# endif
31#endif
32
33#ifdef PERL_IN_XSUB_RE
d06ea78c 34/* We *really* need to overwrite these symbols: */
56953603
IZ
35# define Perl_pregcomp my_regcomp
36# define Perl_regdump my_regdump
37# define Perl_regprop my_regprop
d06ea78c 38# define Perl_pregfree my_regfree
cad2e5aa
JH
39# define Perl_re_intuit_string my_re_intuit_string
40/* *These* symbols are masked to allow static link. */
d06ea78c 41# define Perl_regnext my_regnext
f0b8d043 42# define Perl_save_re_context my_save_re_context
b81d288d 43# define Perl_reginitcolors my_reginitcolors
c5be433b
GS
44
45# define PERL_NO_GET_CONTEXT
b81d288d 46#endif
56953603 47
f0fcb552 48/*SUPPRESS 112*/
a687059c 49/*
e50aee73 50 * pregcomp and pregexec -- regsub and regerror are not used in perl
a687059c
LW
51 *
52 * Copyright (c) 1986 by University of Toronto.
53 * Written by Henry Spencer. Not derived from licensed software.
54 *
55 * Permission is granted to anyone to use this software for any
56 * purpose on any computer system, and to redistribute it freely,
57 * subject to the following restrictions:
58 *
59 * 1. The author is not responsible for the consequences of use of
60 * this software, no matter how awful, even if they arise
61 * from defects in it.
62 *
63 * 2. The origin of this software must not be misrepresented, either
64 * by explicit claim or by omission.
65 *
66 * 3. Altered versions must be plainly marked as such, and must not
67 * be misrepresented as being the original software.
68 *
69 *
70 **** Alterations to Henry's code are...
71 ****
bc89e66f 72 **** Copyright (c) 1991-2001, Larry Wall
a687059c 73 ****
9ef589d8
LW
74 **** You may distribute under the terms of either the GNU General Public
75 **** License or the Artistic License, as specified in the README file.
76
a687059c
LW
77 *
78 * Beware that some of this code is subtly aware of the way operator
79 * precedence is structured in regular expressions. Serious changes in
80 * regular-expression syntax might require a total rethink.
81 */
82#include "EXTERN.h"
864dbfa3 83#define PERL_IN_REGCOMP_C
a687059c 84#include "perl.h"
d06ea78c 85
c5be433b
GS
86#ifdef PERL_IN_XSUB_RE
87# if defined(PERL_CAPI) || defined(PERL_OBJECT)
88# include "XSUB.h"
89# endif
90#else
d06ea78c
GS
91# include "INTERN.h"
92#endif
c277df42
IZ
93
94#define REG_COMP_C
a687059c
LW
95#include "regcomp.h"
96
d4cce5f1 97#ifdef op
11343788 98#undef op
d4cce5f1 99#endif /* op */
11343788 100
fe14fcc3
LW
101#ifdef MSDOS
102# if defined(BUGGY_MSC6)
103 /* MSC 6.00A breaks on op/regexp.t test 85 unless we turn this off */
104 # pragma optimize("a",off)
105 /* But MSC 6.00A is happy with 'w', for aliases only across function calls*/
106 # pragma optimize("w",on )
107# endif /* BUGGY_MSC6 */
108#endif /* MSDOS */
109
a687059c
LW
110#ifndef STATIC
111#define STATIC static
112#endif
113
830247a4
IZ
114typedef struct RExC_state_t {
115 U16 flags16; /* are we folding, multilining? */
116 char *precomp; /* uncompiled string. */
117 regexp *rx;
fac92740 118 char *start; /* Start of input for compile */
830247a4
IZ
119 char *end; /* End of input for compile */
120 char *parse; /* Input-scan pointer. */
121 I32 whilem_seen; /* number of WHILEM in this expr */
fac92740 122 regnode *emit_start; /* Start of emitted-code area */
ffc61ed2 123 regnode *emit; /* Code-emit pointer; &regdummy = don't = compiling */
830247a4
IZ
124 I32 naughty; /* How bad is this pattern? */
125 I32 sawback; /* Did we see \1, ...? */
126 U32 seen;
127 I32 size; /* Code size. */
128 I32 npar; /* () count. */
129 I32 extralen;
130 I32 seen_zerolen;
131 I32 seen_evals;
1aa99e6b 132 I32 utf8;
830247a4
IZ
133#if ADD_TO_REGEXEC
134 char *starttry; /* -Dr: where regtry was called. */
135#define RExC_starttry (pRExC_state->starttry)
136#endif
137} RExC_state_t;
138
139#define RExC_flags16 (pRExC_state->flags16)
140#define RExC_precomp (pRExC_state->precomp)
141#define RExC_rx (pRExC_state->rx)
fac92740 142#define RExC_start (pRExC_state->start)
830247a4
IZ
143#define RExC_end (pRExC_state->end)
144#define RExC_parse (pRExC_state->parse)
145#define RExC_whilem_seen (pRExC_state->whilem_seen)
fac92740 146#define RExC_offsets (pRExC_state->rx->offsets) /* I am not like the others */
830247a4 147#define RExC_emit (pRExC_state->emit)
fac92740 148#define RExC_emit_start (pRExC_state->emit_start)
830247a4
IZ
149#define RExC_naughty (pRExC_state->naughty)
150#define RExC_sawback (pRExC_state->sawback)
151#define RExC_seen (pRExC_state->seen)
152#define RExC_size (pRExC_state->size)
153#define RExC_npar (pRExC_state->npar)
154#define RExC_extralen (pRExC_state->extralen)
155#define RExC_seen_zerolen (pRExC_state->seen_zerolen)
156#define RExC_seen_evals (pRExC_state->seen_evals)
1aa99e6b 157#define RExC_utf8 (pRExC_state->utf8)
830247a4 158
a687059c
LW
159#define ISMULT1(c) ((c) == '*' || (c) == '+' || (c) == '?')
160#define ISMULT2(s) ((*s) == '*' || (*s) == '+' || (*s) == '?' || \
161 ((*s) == '{' && regcurly(s)))
a687059c 162
35c8bce7
LW
163#ifdef SPSTART
164#undef SPSTART /* dratted cpp namespace... */
165#endif
a687059c
LW
166/*
167 * Flags to be passed up and down.
168 */
a687059c 169#define WORST 0 /* Worst case. */
821b33a5 170#define HASWIDTH 0x1 /* Known to match non-null strings. */
a0d0e21e
LW
171#define SIMPLE 0x2 /* Simple enough to be STAR/PLUS operand. */
172#define SPSTART 0x4 /* Starts with * or +. */
173#define TRYAGAIN 0x8 /* Weeded out a declaration. */
a687059c 174
2c2d71f5
JH
175/* Length of a variant. */
176
177typedef struct scan_data_t {
178 I32 len_min;
179 I32 len_delta;
180 I32 pos_min;
181 I32 pos_delta;
182 SV *last_found;
183 I32 last_end; /* min value, <0 unless valid. */
184 I32 last_start_min;
185 I32 last_start_max;
186 SV **longest; /* Either &l_fixed, or &l_float. */
187 SV *longest_fixed;
188 I32 offset_fixed;
189 SV *longest_float;
190 I32 offset_float_min;
191 I32 offset_float_max;
192 I32 flags;
193 I32 whilem_c;
cb434fcc 194 I32 *last_closep;
653099ff 195 struct regnode_charclass_class *start_class;
2c2d71f5
JH
196} scan_data_t;
197
a687059c 198/*
e50aee73 199 * Forward declarations for pregcomp()'s friends.
a687059c 200 */
a0d0e21e 201
b81d288d 202static scan_data_t zero_scan_data = { 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
cb434fcc 203 0, 0, 0, 0, 0, 0};
c277df42
IZ
204
205#define SF_BEFORE_EOL (SF_BEFORE_SEOL|SF_BEFORE_MEOL)
206#define SF_BEFORE_SEOL 0x1
207#define SF_BEFORE_MEOL 0x2
208#define SF_FIX_BEFORE_EOL (SF_FIX_BEFORE_SEOL|SF_FIX_BEFORE_MEOL)
209#define SF_FL_BEFORE_EOL (SF_FL_BEFORE_SEOL|SF_FL_BEFORE_MEOL)
210
09b7f37c
CB
211#ifdef NO_UNARY_PLUS
212# define SF_FIX_SHIFT_EOL (0+2)
213# define SF_FL_SHIFT_EOL (0+4)
214#else
215# define SF_FIX_SHIFT_EOL (+2)
216# define SF_FL_SHIFT_EOL (+4)
217#endif
c277df42
IZ
218
219#define SF_FIX_BEFORE_SEOL (SF_BEFORE_SEOL << SF_FIX_SHIFT_EOL)
220#define SF_FIX_BEFORE_MEOL (SF_BEFORE_MEOL << SF_FIX_SHIFT_EOL)
221
222#define SF_FL_BEFORE_SEOL (SF_BEFORE_SEOL << SF_FL_SHIFT_EOL)
223#define SF_FL_BEFORE_MEOL (SF_BEFORE_MEOL << SF_FL_SHIFT_EOL) /* 0x20 */
224#define SF_IS_INF 0x40
225#define SF_HAS_PAR 0x80
226#define SF_IN_PAR 0x100
227#define SF_HAS_EVAL 0x200
4bfe0158 228#define SCF_DO_SUBSTR 0x400
653099ff
GS
229#define SCF_DO_STCLASS_AND 0x0800
230#define SCF_DO_STCLASS_OR 0x1000
231#define SCF_DO_STCLASS (SCF_DO_STCLASS_AND|SCF_DO_STCLASS_OR)
e1901655 232#define SCF_WHILEM_VISITED_POS 0x2000
c277df42 233
1aa99e6b 234#define UTF RExC_utf8
830247a4
IZ
235#define LOC (RExC_flags16 & PMf_LOCALE)
236#define FOLD (RExC_flags16 & PMf_FOLD)
a0ed51b3 237
ffc61ed2 238#define OOB_UNICODE 12345678
93733859 239#define OOB_NAMEDCLASS -1
b8c5462f 240
a0ed51b3
LW
241#define CHR_SVLEN(sv) (UTF ? sv_len_utf8(sv) : SvCUR(sv))
242#define CHR_DIST(a,b) (UTF ? utf8_distance(a,b) : a - b)
243
8615cb43 244
b45f050a
JF
245/* length of regex to show in messages that don't mark a position within */
246#define RegexLengthToShowInErrorMessages 127
247
248/*
249 * If MARKER[12] are adjusted, be sure to adjust the constants at the top
250 * of t/op/regmesg.t, the tests in t/op/re_tests, and those in
251 * op/pragma/warn/regcomp.
252 */
7253e4e3
RK
253#define MARKER1 "<-- HERE" /* marker as it appears in the description */
254#define MARKER2 " <-- HERE " /* marker as it appears within the regex */
b81d288d 255
7253e4e3 256#define REPORT_LOCATION " in regex; marked by " MARKER1 " in m/%.*s" MARKER2 "%s/"
b45f050a
JF
257
258/*
259 * Calls SAVEDESTRUCTOR_X if needed, then calls Perl_croak with the given
260 * arg. Show regex, up to a maximum length. If it's too long, chop and add
261 * "...".
262 */
a4eb266f 263#define FAIL(msg) \
8615cb43 264 STMT_START { \
a4eb266f 265 char *ellipses = ""; \
830247a4 266 unsigned len = strlen(RExC_precomp); \
b45f050a 267 \
8615cb43 268 if (!SIZE_ONLY) \
830247a4 269 SAVEDESTRUCTOR_X(clear_re,(void*)RExC_rx); \
b45f050a
JF
270 \
271 if (len > RegexLengthToShowInErrorMessages) { \
272 /* chop 10 shorter than the max, to ensure meaning of "..." */ \
273 len = RegexLengthToShowInErrorMessages - 10; \
a4eb266f 274 ellipses = "..."; \
b45f050a
JF
275 } \
276 Perl_croak(aTHX_ "%s in regex m/%.*s%s/", \
830247a4 277 msg, (int)len, RExC_precomp, ellipses); \
8615cb43
JF
278 } STMT_END
279
b45f050a
JF
280/*
281 * Calls SAVEDESTRUCTOR_X if needed, then calls Perl_croak with the given
282 * args. Show regex, up to a maximum length. If it's too long, chop and add
283 * "...".
284 */
a4eb266f 285#define FAIL2(pat,msg) \
8615cb43 286 STMT_START { \
a4eb266f 287 char *ellipses = ""; \
830247a4 288 unsigned len = strlen(RExC_precomp); \
b45f050a 289 \
8615cb43 290 if (!SIZE_ONLY) \
830247a4 291 SAVEDESTRUCTOR_X(clear_re,(void*)RExC_rx); \
8615cb43 292 \
b45f050a
JF
293 if (len > RegexLengthToShowInErrorMessages) { \
294 /* chop 10 shorter than the max, to ensure meaning of "..." */ \
295 len = RegexLengthToShowInErrorMessages - 10; \
a4eb266f 296 ellipses = "..."; \
b45f050a
JF
297 } \
298 S_re_croak2(aTHX_ pat, " in regex m/%.*s%s/", \
830247a4 299 msg, (int)len, RExC_precomp, ellipses); \
b45f050a
JF
300 } STMT_END
301
302
303/*
304 * Simple_vFAIL -- like FAIL, but marks the current location in the scan
305 */
306#define Simple_vFAIL(m) \
307 STMT_START { \
830247a4 308 unsigned offset = strlen(RExC_precomp)-(RExC_end-RExC_parse); \
b45f050a
JF
309 \
310 Perl_croak(aTHX_ "%s" REPORT_LOCATION, \
830247a4 311 m, (int)offset, RExC_precomp, RExC_precomp + offset); \
b45f050a
JF
312 } STMT_END
313
314/*
315 * Calls SAVEDESTRUCTOR_X if needed, then Simple_vFAIL()
316 */
317#define vFAIL(m) \
318 STMT_START { \
319 if (!SIZE_ONLY) \
830247a4 320 SAVEDESTRUCTOR_X(clear_re,(void*)RExC_rx); \
b45f050a
JF
321 Simple_vFAIL(m); \
322 } STMT_END
323
324/*
325 * Like Simple_vFAIL(), but accepts two arguments.
326 */
327#define Simple_vFAIL2(m,a1) \
328 STMT_START { \
830247a4 329 unsigned offset = strlen(RExC_precomp)-(RExC_end-RExC_parse); \
b45f050a
JF
330 \
331 S_re_croak2(aTHX_ m, REPORT_LOCATION, a1, \
830247a4 332 (int)offset, RExC_precomp, RExC_precomp + offset); \
b45f050a
JF
333 } STMT_END
334
335/*
336 * Calls SAVEDESTRUCTOR_X if needed, then Simple_vFAIL2().
337 */
338#define vFAIL2(m,a1) \
339 STMT_START { \
340 if (!SIZE_ONLY) \
830247a4 341 SAVEDESTRUCTOR_X(clear_re,(void*)RExC_rx); \
b45f050a
JF
342 Simple_vFAIL2(m, a1); \
343 } STMT_END
344
345
346/*
347 * Like Simple_vFAIL(), but accepts three arguments.
348 */
349#define Simple_vFAIL3(m, a1, a2) \
350 STMT_START { \
830247a4 351 unsigned offset = strlen(RExC_precomp)-(RExC_end-RExC_parse); \
b45f050a
JF
352 \
353 S_re_croak2(aTHX_ m, REPORT_LOCATION, a1, a2, \
830247a4 354 (int)offset, RExC_precomp, RExC_precomp + offset); \
b45f050a
JF
355 } STMT_END
356
357/*
358 * Calls SAVEDESTRUCTOR_X if needed, then Simple_vFAIL3().
359 */
360#define vFAIL3(m,a1,a2) \
361 STMT_START { \
362 if (!SIZE_ONLY) \
830247a4 363 SAVEDESTRUCTOR_X(clear_re,(void*)RExC_rx); \
b45f050a
JF
364 Simple_vFAIL3(m, a1, a2); \
365 } STMT_END
366
367/*
368 * Like Simple_vFAIL(), but accepts four arguments.
369 */
370#define Simple_vFAIL4(m, a1, a2, a3) \
371 STMT_START { \
830247a4 372 unsigned offset = strlen(RExC_precomp)-(RExC_end-RExC_parse); \
b45f050a
JF
373 \
374 S_re_croak2(aTHX_ m, REPORT_LOCATION, a1, a2, a3,\
830247a4 375 (int)offset, RExC_precomp, RExC_precomp + offset); \
b45f050a
JF
376 } STMT_END
377
378/*
379 * Like Simple_vFAIL(), but accepts five arguments.
380 */
381#define Simple_vFAIL5(m, a1, a2, a3, a4) \
382 STMT_START { \
830247a4 383 unsigned offset = strlen(RExC_precomp)-(RExC_end-RExC_parse); \
b45f050a 384 S_re_croak2(aTHX_ m, REPORT_LOCATION, a1, a2, a3, a4,\
830247a4 385 (int)offset, RExC_precomp, RExC_precomp + offset); \
b45f050a
JF
386 } STMT_END
387
388
389#define vWARN(loc,m) \
390 STMT_START { \
830247a4 391 unsigned offset = strlen(RExC_precomp)-(RExC_end-(loc)); \
b45f050a 392 Perl_warner(aTHX_ WARN_REGEXP, "%s" REPORT_LOCATION,\
830247a4 393 m, (int)offset, RExC_precomp, RExC_precomp + offset); \
b45f050a
JF
394 } STMT_END \
395
f9373011
PM
396#define vWARNdep(loc,m) \
397 STMT_START { \
398 unsigned offset = strlen(RExC_precomp)-(RExC_end-(loc)); \
399 int warn_cat = ckWARN(WARN_REGEXP) ? WARN_REGEXP : WARN_DEPRECATED; \
400 Perl_warner(aTHX_ warn_cat, "%s" REPORT_LOCATION,\
401 m, (int)offset, RExC_precomp, RExC_precomp + offset); \
402 } STMT_END \
403
b45f050a
JF
404
405#define vWARN2(loc, m, a1) \
406 STMT_START { \
830247a4 407 unsigned offset = strlen(RExC_precomp)-(RExC_end-(loc)); \
b45f050a
JF
408 Perl_warner(aTHX_ WARN_REGEXP, m REPORT_LOCATION,\
409 a1, \
830247a4 410 (int)offset, RExC_precomp, RExC_precomp + offset); \
b45f050a
JF
411 } STMT_END
412
413#define vWARN3(loc, m, a1, a2) \
414 STMT_START { \
830247a4 415 unsigned offset = strlen(RExC_precomp) - (RExC_end - (loc)); \
b45f050a
JF
416 Perl_warner(aTHX_ WARN_REGEXP, m REPORT_LOCATION, \
417 a1, a2, \
830247a4 418 (int)offset, RExC_precomp, RExC_precomp + offset); \
b45f050a
JF
419 } STMT_END
420
421#define vWARN4(loc, m, a1, a2, a3) \
422 STMT_START { \
830247a4 423 unsigned offset = strlen(RExC_precomp)-(RExC_end-(loc)); \
b45f050a
JF
424 Perl_warner(aTHX_ WARN_REGEXP, m REPORT_LOCATION,\
425 a1, a2, a3, \
830247a4 426 (int)offset, RExC_precomp, RExC_precomp + offset); \
8615cb43
JF
427 } STMT_END
428
9d1d55b5
JP
429/* used for the parse_flags section for (?c) -- japhy */
430#define vWARN5(loc, m, a1, a2, a3, a4) \
431 STMT_START { \
432 unsigned offset = strlen(RExC_precomp)-(RExC_end-(loc)); \
433 Perl_warner(aTHX_ WARN_REGEXP, m REPORT_LOCATION, \
434 a1, a2, a3, a4, \
435 (int)offset, RExC_precomp, RExC_precomp + offset); \
436 } STMT_END
437
8615cb43 438
cd439c50 439/* Allow for side effects in s */
fd3f0ae2 440#define REGC(c,s) STMT_START { if (!SIZE_ONLY) *(s) = (c); else (void)(s);} STMT_END
cd439c50 441
fac92740
MJD
442/* Macros for recording node offsets. 20001227 mjd@plover.com
443 * Nodes are numbered 1, 2, 3, 4. Node #n's position is recorded in
444 * element 2*n-1 of the array. Element #2n holds the byte length node #n.
445 * Element 0 holds the number n.
446 */
447
448#define MJD_OFFSET_DEBUG(x)
449/* #define MJD_OFFSET_DEBUG(x) fprintf x */
450
451
452# define Set_Node_Offset_To_R(node,byte) \
453 STMT_START { \
454 if (! SIZE_ONLY) { \
455 if((node) < 0) { \
456 Perl_croak(aTHX_ "value of node is %d in Offset macro", node); \
457 } else { \
458 RExC_offsets[2*(node)-1] = (byte); \
459 } \
460 } \
461 } STMT_END
462
463# define Set_Node_Offset(node,byte) Set_Node_Offset_To_R((node)-RExC_emit_start, (byte)-RExC_start)
464# define Set_Cur_Node_Offset Set_Node_Offset(RExC_emit, RExC_parse)
465
466# define Set_Node_Length_To_R(node,len) \
467 STMT_START { \
468 if (! SIZE_ONLY) { \
469 MJD_OFFSET_DEBUG((stderr, "** (%d) size of node %d is %d.\n", __LINE__, (node), (len))); \
470 if((node) < 0) { \
471 Perl_croak(aTHX_ "value of node is %d in Length macro", node); \
472 } else { \
473 RExC_offsets[2*(node)] = (len); \
474 } \
475 } \
476 } STMT_END
477
478# define Set_Node_Length(node,len) Set_Node_Length_To_R((node)-RExC_emit_start, len)
479# define Set_Cur_Node_Length(len) Set_Node_Length(RExC_emit, len)
480# define Set_Node_Cur_Length(node) Set_Node_Length(node, RExC_parse - parse_start)
481
482/* Get offsets and lengths */
483#define Node_Offset(n) (RExC_offsets[2*((n)-RExC_emit_start)-1])
484#define Node_Length(n) (RExC_offsets[2*((n)-RExC_emit_start)])
485
51371543 486static void clear_re(pTHXo_ void *r);
4327152a 487
653099ff
GS
488/* Mark that we cannot extend a found fixed substring at this point.
489 Updata the longest found anchored substring and the longest found
490 floating substrings if needed. */
491
4327152a 492STATIC void
830247a4 493S_scan_commit(pTHX_ RExC_state_t *pRExC_state, scan_data_t *data)
c277df42 494{
a0ed51b3
LW
495 STRLEN l = CHR_SVLEN(data->last_found);
496 STRLEN old_l = CHR_SVLEN(*data->longest);
b81d288d 497
c277df42
IZ
498 if ((l >= old_l) && ((l > old_l) || (data->flags & SF_BEFORE_EOL))) {
499 sv_setsv(*data->longest, data->last_found);
500 if (*data->longest == data->longest_fixed) {
501 data->offset_fixed = l ? data->last_start_min : data->pos_min;
502 if (data->flags & SF_BEFORE_EOL)
b81d288d 503 data->flags
c277df42
IZ
504 |= ((data->flags & SF_BEFORE_EOL) << SF_FIX_SHIFT_EOL);
505 else
506 data->flags &= ~SF_FIX_BEFORE_EOL;
a0ed51b3
LW
507 }
508 else {
c277df42 509 data->offset_float_min = l ? data->last_start_min : data->pos_min;
b81d288d
AB
510 data->offset_float_max = (l
511 ? data->last_start_max
c277df42
IZ
512 : data->pos_min + data->pos_delta);
513 if (data->flags & SF_BEFORE_EOL)
b81d288d 514 data->flags
c277df42
IZ
515 |= ((data->flags & SF_BEFORE_EOL) << SF_FL_SHIFT_EOL);
516 else
517 data->flags &= ~SF_FL_BEFORE_EOL;
518 }
519 }
520 SvCUR_set(data->last_found, 0);
521 data->last_end = -1;
522 data->flags &= ~SF_BEFORE_EOL;
523}
524
653099ff
GS
525/* Can match anything (initialization) */
526STATIC void
830247a4 527S_cl_anything(pTHX_ RExC_state_t *pRExC_state, struct regnode_charclass_class *cl)
653099ff 528{
9af0acdb
JH
529 int value;
530
653099ff 531 ANYOF_CLASS_ZERO(cl);
9af0acdb
JH
532 for (value = 0; value < 256; ++value)
533 ANYOF_BITMAP_SET(cl, value);
1aa99e6b 534 cl->flags = ANYOF_EOS|ANYOF_UNICODE_ALL;
653099ff
GS
535 if (LOC)
536 cl->flags |= ANYOF_LOCALE;
537}
538
539/* Can match anything (initialization) */
540STATIC int
541S_cl_is_anything(pTHX_ struct regnode_charclass_class *cl)
542{
543 int value;
544
aaa51d5e 545 for (value = 0; value <= ANYOF_MAX; value += 2)
653099ff
GS
546 if (ANYOF_CLASS_TEST(cl, value) && ANYOF_CLASS_TEST(cl, value + 1))
547 return 1;
1aa99e6b
IH
548 if (!(cl->flags & ANYOF_UNICODE_ALL))
549 return 0;
9af0acdb
JH
550 for (value = 0; value < 256; ++value)
551 if (!ANYOF_BITMAP_TEST(cl, value))
653099ff
GS
552 return 0;
553 return 1;
554}
555
556/* Can match anything (initialization) */
557STATIC void
830247a4 558S_cl_init(pTHX_ RExC_state_t *pRExC_state, struct regnode_charclass_class *cl)
653099ff 559{
8ecf7187 560 Zero(cl, 1, struct regnode_charclass_class);
653099ff 561 cl->type = ANYOF;
830247a4 562 cl_anything(pRExC_state, cl);
653099ff
GS
563}
564
565STATIC void
830247a4 566S_cl_init_zero(pTHX_ RExC_state_t *pRExC_state, struct regnode_charclass_class *cl)
653099ff 567{
8ecf7187 568 Zero(cl, 1, struct regnode_charclass_class);
653099ff 569 cl->type = ANYOF;
830247a4 570 cl_anything(pRExC_state, cl);
653099ff
GS
571 if (LOC)
572 cl->flags |= ANYOF_LOCALE;
573}
574
575/* 'And' a given class with another one. Can create false positives */
576/* We assume that cl is not inverted */
577STATIC void
578S_cl_and(pTHX_ struct regnode_charclass_class *cl,
579 struct regnode_charclass_class *and_with)
580{
653099ff
GS
581 if (!(and_with->flags & ANYOF_CLASS)
582 && !(cl->flags & ANYOF_CLASS)
583 && (and_with->flags & ANYOF_LOCALE) == (cl->flags & ANYOF_LOCALE)
584 && !(and_with->flags & ANYOF_FOLD)
585 && !(cl->flags & ANYOF_FOLD)) {
586 int i;
587
588 if (and_with->flags & ANYOF_INVERT)
589 for (i = 0; i < ANYOF_BITMAP_SIZE; i++)
590 cl->bitmap[i] &= ~and_with->bitmap[i];
591 else
592 for (i = 0; i < ANYOF_BITMAP_SIZE; i++)
593 cl->bitmap[i] &= and_with->bitmap[i];
594 } /* XXXX: logic is complicated otherwise, leave it along for a moment. */
595 if (!(and_with->flags & ANYOF_EOS))
596 cl->flags &= ~ANYOF_EOS;
1aa99e6b
IH
597
598 if (cl->flags & ANYOF_UNICODE_ALL && and_with->flags & ANYOF_UNICODE) {
599 cl->flags &= ~ANYOF_UNICODE_ALL;
600 cl->flags |= ANYOF_UNICODE;
601 ARG_SET(cl, ARG(and_with));
602 }
603 if (!(and_with->flags & ANYOF_UNICODE_ALL))
604 cl->flags &= ~ANYOF_UNICODE_ALL;
605 if (!(and_with->flags & (ANYOF_UNICODE|ANYOF_UNICODE_ALL)))
606 cl->flags &= ~ANYOF_UNICODE;
653099ff
GS
607}
608
609/* 'OR' a given class with another one. Can create false positives */
610/* We assume that cl is not inverted */
611STATIC void
830247a4 612S_cl_or(pTHX_ RExC_state_t *pRExC_state, struct regnode_charclass_class *cl, struct regnode_charclass_class *or_with)
653099ff 613{
653099ff
GS
614 if (or_with->flags & ANYOF_INVERT) {
615 /* We do not use
616 * (B1 | CL1) | (!B2 & !CL2) = (B1 | !B2 & !CL2) | (CL1 | (!B2 & !CL2))
617 * <= (B1 | !B2) | (CL1 | !CL2)
618 * which is wasteful if CL2 is small, but we ignore CL2:
619 * (B1 | CL1) | (!B2 & !CL2) <= (B1 | CL1) | !B2 = (B1 | !B2) | CL1
620 * XXXX Can we handle case-fold? Unclear:
621 * (OK1(i) | OK1(i')) | !(OK1(i) | OK1(i')) =
622 * (OK1(i) | OK1(i')) | (!OK1(i) & !OK1(i'))
623 */
624 if ( (or_with->flags & ANYOF_LOCALE) == (cl->flags & ANYOF_LOCALE)
625 && !(or_with->flags & ANYOF_FOLD)
626 && !(cl->flags & ANYOF_FOLD) ) {
627 int i;
628
629 for (i = 0; i < ANYOF_BITMAP_SIZE; i++)
630 cl->bitmap[i] |= ~or_with->bitmap[i];
631 } /* XXXX: logic is complicated otherwise */
632 else {
830247a4 633 cl_anything(pRExC_state, cl);
653099ff
GS
634 }
635 } else {
636 /* (B1 | CL1) | (B2 | CL2) = (B1 | B2) | (CL1 | CL2)) */
637 if ( (or_with->flags & ANYOF_LOCALE) == (cl->flags & ANYOF_LOCALE)
b81d288d 638 && (!(or_with->flags & ANYOF_FOLD)
653099ff
GS
639 || (cl->flags & ANYOF_FOLD)) ) {
640 int i;
641
642 /* OR char bitmap and class bitmap separately */
643 for (i = 0; i < ANYOF_BITMAP_SIZE; i++)
644 cl->bitmap[i] |= or_with->bitmap[i];
645 if (or_with->flags & ANYOF_CLASS) {
646 for (i = 0; i < ANYOF_CLASSBITMAP_SIZE; i++)
647 cl->classflags[i] |= or_with->classflags[i];
648 cl->flags |= ANYOF_CLASS;
649 }
650 }
651 else { /* XXXX: logic is complicated, leave it along for a moment. */
830247a4 652 cl_anything(pRExC_state, cl);
653099ff
GS
653 }
654 }
655 if (or_with->flags & ANYOF_EOS)
656 cl->flags |= ANYOF_EOS;
1aa99e6b
IH
657
658 if (cl->flags & ANYOF_UNICODE && or_with->flags & ANYOF_UNICODE &&
659 ARG(cl) != ARG(or_with)) {
660 cl->flags |= ANYOF_UNICODE_ALL;
661 cl->flags &= ~ANYOF_UNICODE;
662 }
663 if (or_with->flags & ANYOF_UNICODE_ALL) {
664 cl->flags |= ANYOF_UNICODE_ALL;
665 cl->flags &= ~ANYOF_UNICODE;
666 }
653099ff
GS
667}
668
669/* REx optimizer. Converts nodes into quickier variants "in place".
670 Finds fixed substrings. */
671
c277df42
IZ
672/* Stops at toplevel WHILEM as well as at `last'. At end *scanp is set
673 to the position after last scanned or to NULL. */
674
76e3520e 675STATIC I32
830247a4 676S_study_chunk(pTHX_ RExC_state_t *pRExC_state, regnode **scanp, I32 *deltap, regnode *last, scan_data_t *data, U32 flags)
c277df42
IZ
677 /* scanp: Start here (read-write). */
678 /* deltap: Write maxlen-minlen here. */
679 /* last: Stop before this one. */
680{
681 I32 min = 0, pars = 0, code;
682 regnode *scan = *scanp, *next;
683 I32 delta = 0;
684 int is_inf = (flags & SCF_DO_SUBSTR) && (data->flags & SF_IS_INF);
aca2d497 685 int is_inf_internal = 0; /* The studied chunk is infinite */
c277df42
IZ
686 I32 is_par = OP(scan) == OPEN ? ARG(scan) : 0;
687 scan_data_t data_fake;
653099ff 688 struct regnode_charclass_class and_with; /* Valid if flags & SCF_DO_STCLASS_OR */
b81d288d 689
c277df42
IZ
690 while (scan && OP(scan) != END && scan < last) {
691 /* Peephole optimizer: */
692
22c35a8c 693 if (PL_regkind[(U8)OP(scan)] == EXACT) {
653099ff 694 /* Merge several consecutive EXACTish nodes into one. */
c277df42
IZ
695 regnode *n = regnext(scan);
696 U32 stringok = 1;
697#ifdef DEBUGGING
698 regnode *stop = scan;
b81d288d 699#endif
c277df42 700
cd439c50 701 next = scan + NODE_SZ_STR(scan);
c277df42
IZ
702 /* Skip NOTHING, merge EXACT*. */
703 while (n &&
b81d288d 704 ( PL_regkind[(U8)OP(n)] == NOTHING ||
c277df42
IZ
705 (stringok && (OP(n) == OP(scan))))
706 && NEXT_OFF(n)
707 && NEXT_OFF(scan) + NEXT_OFF(n) < I16_MAX) {
708 if (OP(n) == TAIL || n > next)
709 stringok = 0;
22c35a8c 710 if (PL_regkind[(U8)OP(n)] == NOTHING) {
c277df42
IZ
711 NEXT_OFF(scan) += NEXT_OFF(n);
712 next = n + NODE_STEP_REGNODE;
713#ifdef DEBUGGING
714 if (stringok)
715 stop = n;
b81d288d 716#endif
c277df42 717 n = regnext(n);
a0ed51b3 718 }
f49d4d0f 719 else if (stringok) {
cd439c50 720 int oldl = STR_LEN(scan);
c277df42 721 regnode *nnext = regnext(n);
f49d4d0f 722
b81d288d 723 if (oldl + STR_LEN(n) > U8_MAX)
c277df42
IZ
724 break;
725 NEXT_OFF(scan) += NEXT_OFF(n);
cd439c50
IZ
726 STR_LEN(scan) += STR_LEN(n);
727 next = n + NODE_SZ_STR(n);
c277df42 728 /* Now we can overwrite *n : */
f49d4d0f 729 Move(STRING(n), STRING(scan) + oldl, STR_LEN(n), char);
c277df42 730#ifdef DEBUGGING
f49d4d0f 731 stop = next - 1;
b81d288d 732#endif
c277df42
IZ
733 n = nnext;
734 }
735 }
736#ifdef DEBUGGING
737 /* Allow dumping */
cd439c50 738 n = scan + NODE_SZ_STR(scan);
c277df42 739 while (n <= stop) {
22c35a8c 740 if (PL_regkind[(U8)OP(n)] != NOTHING || OP(n) == NOTHING) {
c277df42
IZ
741 OP(n) = OPTIMIZED;
742 NEXT_OFF(n) = 0;
743 }
744 n++;
745 }
653099ff 746#endif
c277df42 747 }
653099ff
GS
748 /* Follow the next-chain of the current node and optimize
749 away all the NOTHINGs from it. */
c277df42 750 if (OP(scan) != CURLYX) {
048cfca1
GS
751 int max = (reg_off_by_arg[OP(scan)]
752 ? I32_MAX
753 /* I32 may be smaller than U16 on CRAYs! */
754 : (I32_MAX < U16_MAX ? I32_MAX : U16_MAX));
c277df42
IZ
755 int off = (reg_off_by_arg[OP(scan)] ? ARG(scan) : NEXT_OFF(scan));
756 int noff;
757 regnode *n = scan;
b81d288d 758
c277df42
IZ
759 /* Skip NOTHING and LONGJMP. */
760 while ((n = regnext(n))
22c35a8c 761 && ((PL_regkind[(U8)OP(n)] == NOTHING && (noff = NEXT_OFF(n)))
c277df42
IZ
762 || ((OP(n) == LONGJMP) && (noff = ARG(n))))
763 && off + noff < max)
764 off += noff;
765 if (reg_off_by_arg[OP(scan)])
766 ARG(scan) = off;
b81d288d 767 else
c277df42
IZ
768 NEXT_OFF(scan) = off;
769 }
653099ff
GS
770 /* The principal pseudo-switch. Cannot be a switch, since we
771 look into several different things. */
b81d288d 772 if (OP(scan) == BRANCH || OP(scan) == BRANCHJ
c277df42
IZ
773 || OP(scan) == IFTHEN || OP(scan) == SUSPEND) {
774 next = regnext(scan);
775 code = OP(scan);
b81d288d
AB
776
777 if (OP(next) == code || code == IFTHEN || code == SUSPEND) {
c277df42 778 I32 max1 = 0, min1 = I32_MAX, num = 0;
653099ff 779 struct regnode_charclass_class accum;
c277df42 780
653099ff 781 if (flags & SCF_DO_SUBSTR) /* XXXX Add !SUSPEND? */
830247a4 782 scan_commit(pRExC_state, data); /* Cannot merge strings after this. */
653099ff 783 if (flags & SCF_DO_STCLASS)
830247a4 784 cl_init_zero(pRExC_state, &accum);
c277df42 785 while (OP(scan) == code) {
830247a4 786 I32 deltanext, minnext, f = 0, fake;
653099ff 787 struct regnode_charclass_class this_class;
c277df42
IZ
788
789 num++;
790 data_fake.flags = 0;
b81d288d 791 if (data) {
2c2d71f5 792 data_fake.whilem_c = data->whilem_c;
cb434fcc
IZ
793 data_fake.last_closep = data->last_closep;
794 }
795 else
796 data_fake.last_closep = &fake;
c277df42
IZ
797 next = regnext(scan);
798 scan = NEXTOPER(scan);
799 if (code != BRANCH)
800 scan = NEXTOPER(scan);
653099ff 801 if (flags & SCF_DO_STCLASS) {
830247a4 802 cl_init(pRExC_state, &this_class);
653099ff
GS
803 data_fake.start_class = &this_class;
804 f = SCF_DO_STCLASS_AND;
b81d288d 805 }
e1901655
IZ
806 if (flags & SCF_WHILEM_VISITED_POS)
807 f |= SCF_WHILEM_VISITED_POS;
653099ff 808 /* we suppose the run is continuous, last=next...*/
830247a4
IZ
809 minnext = study_chunk(pRExC_state, &scan, &deltanext,
810 next, &data_fake, f);
b81d288d 811 if (min1 > minnext)
c277df42
IZ
812 min1 = minnext;
813 if (max1 < minnext + deltanext)
814 max1 = minnext + deltanext;
815 if (deltanext == I32_MAX)
aca2d497 816 is_inf = is_inf_internal = 1;
c277df42
IZ
817 scan = next;
818 if (data_fake.flags & (SF_HAS_PAR|SF_IN_PAR))
819 pars++;
405ff068 820 if (data && (data_fake.flags & SF_HAS_EVAL))
c277df42 821 data->flags |= SF_HAS_EVAL;
2c2d71f5
JH
822 if (data)
823 data->whilem_c = data_fake.whilem_c;
653099ff 824 if (flags & SCF_DO_STCLASS)
830247a4 825 cl_or(pRExC_state, &accum, &this_class);
b81d288d 826 if (code == SUSPEND)
c277df42
IZ
827 break;
828 }
829 if (code == IFTHEN && num < 2) /* Empty ELSE branch */
830 min1 = 0;
831 if (flags & SCF_DO_SUBSTR) {
832 data->pos_min += min1;
833 data->pos_delta += max1 - min1;
834 if (max1 != min1 || is_inf)
835 data->longest = &(data->longest_float);
836 }
837 min += min1;
838 delta += max1 - min1;
653099ff 839 if (flags & SCF_DO_STCLASS_OR) {
830247a4 840 cl_or(pRExC_state, data->start_class, &accum);
653099ff
GS
841 if (min1) {
842 cl_and(data->start_class, &and_with);
843 flags &= ~SCF_DO_STCLASS;
844 }
845 }
846 else if (flags & SCF_DO_STCLASS_AND) {
de0c8cb8
GS
847 if (min1) {
848 cl_and(data->start_class, &accum);
653099ff 849 flags &= ~SCF_DO_STCLASS;
de0c8cb8
GS
850 }
851 else {
b81d288d 852 /* Switch to OR mode: cache the old value of
de0c8cb8
GS
853 * data->start_class */
854 StructCopy(data->start_class, &and_with,
855 struct regnode_charclass_class);
856 flags &= ~SCF_DO_STCLASS_AND;
857 StructCopy(&accum, data->start_class,
858 struct regnode_charclass_class);
859 flags |= SCF_DO_STCLASS_OR;
860 data->start_class->flags |= ANYOF_EOS;
861 }
653099ff 862 }
a0ed51b3
LW
863 }
864 else if (code == BRANCHJ) /* single branch is optimized. */
c277df42
IZ
865 scan = NEXTOPER(NEXTOPER(scan));
866 else /* single branch is optimized. */
867 scan = NEXTOPER(scan);
868 continue;
a0ed51b3
LW
869 }
870 else if (OP(scan) == EXACT) {
cd439c50 871 I32 l = STR_LEN(scan);
1aa99e6b 872 UV uc = *((U8*)STRING(scan));
a0ed51b3 873 if (UTF) {
1aa99e6b
IH
874 U8 *s = (U8*)STRING(scan);
875 l = utf8_length(s, s + l);
9041c2e3 876 uc = utf8_to_uvchr(s, NULL);
a0ed51b3
LW
877 }
878 min += l;
c277df42 879 if (flags & SCF_DO_SUBSTR) { /* Update longest substr. */
c277df42
IZ
880 /* The code below prefers earlier match for fixed
881 offset, later match for variable offset. */
882 if (data->last_end == -1) { /* Update the start info. */
883 data->last_start_min = data->pos_min;
884 data->last_start_max = is_inf
b81d288d 885 ? I32_MAX : data->pos_min + data->pos_delta;
c277df42 886 }
cd439c50 887 sv_catpvn(data->last_found, STRING(scan), STR_LEN(scan));
c277df42
IZ
888 data->last_end = data->pos_min + l;
889 data->pos_min += l; /* As in the first entry. */
890 data->flags &= ~SF_BEFORE_EOL;
891 }
653099ff
GS
892 if (flags & SCF_DO_STCLASS_AND) {
893 /* Check whether it is compatible with what we know already! */
894 int compat = 1;
895
1aa99e6b 896 if (uc >= 0x100 ||
516a5887 897 (!(data->start_class->flags & (ANYOF_CLASS | ANYOF_LOCALE))
1aa99e6b 898 && !ANYOF_BITMAP_TEST(data->start_class, uc)
653099ff 899 && (!(data->start_class->flags & ANYOF_FOLD)
1aa99e6b 900 || !ANYOF_BITMAP_TEST(data->start_class, PL_fold[uc])))
516a5887 901 )
653099ff
GS
902 compat = 0;
903 ANYOF_CLASS_ZERO(data->start_class);
904 ANYOF_BITMAP_ZERO(data->start_class);
905 if (compat)
1aa99e6b 906 ANYOF_BITMAP_SET(data->start_class, uc);
653099ff 907 data->start_class->flags &= ~ANYOF_EOS;
9b877dbb
IH
908 if (uc < 0x100)
909 data->start_class->flags &= ~ANYOF_UNICODE_ALL;
653099ff
GS
910 }
911 else if (flags & SCF_DO_STCLASS_OR) {
912 /* false positive possible if the class is case-folded */
1aa99e6b 913 if (uc < 0x100)
9b877dbb
IH
914 ANYOF_BITMAP_SET(data->start_class, uc);
915 else
916 data->start_class->flags |= ANYOF_UNICODE_ALL;
653099ff
GS
917 data->start_class->flags &= ~ANYOF_EOS;
918 cl_and(data->start_class, &and_with);
919 }
920 flags &= ~SCF_DO_STCLASS;
a0ed51b3 921 }
653099ff 922 else if (PL_regkind[(U8)OP(scan)] == EXACT) { /* But OP != EXACT! */
cd439c50 923 I32 l = STR_LEN(scan);
1aa99e6b 924 UV uc = *((U8*)STRING(scan));
653099ff
GS
925
926 /* Search for fixed substrings supports EXACT only. */
b81d288d 927 if (flags & SCF_DO_SUBSTR)
830247a4 928 scan_commit(pRExC_state, data);
a0ed51b3 929 if (UTF) {
1aa99e6b
IH
930 U8 *s = (U8 *)STRING(scan);
931 l = utf8_length(s, s + l);
9041c2e3 932 uc = utf8_to_uvchr(s, NULL);
a0ed51b3
LW
933 }
934 min += l;
c277df42 935 if (data && (flags & SCF_DO_SUBSTR))
a0ed51b3 936 data->pos_min += l;
653099ff
GS
937 if (flags & SCF_DO_STCLASS_AND) {
938 /* Check whether it is compatible with what we know already! */
939 int compat = 1;
940
1aa99e6b 941 if (uc >= 0x100 ||
516a5887 942 (!(data->start_class->flags & (ANYOF_CLASS | ANYOF_LOCALE))
1aa99e6b 943 && !ANYOF_BITMAP_TEST(data->start_class, uc)
516a5887 944 && !ANYOF_BITMAP_TEST(data->start_class, PL_fold[uc])))
653099ff
GS
945 compat = 0;
946 ANYOF_CLASS_ZERO(data->start_class);
947 ANYOF_BITMAP_ZERO(data->start_class);
948 if (compat) {
1aa99e6b 949 ANYOF_BITMAP_SET(data->start_class, uc);
653099ff
GS
950 data->start_class->flags &= ~ANYOF_EOS;
951 data->start_class->flags |= ANYOF_FOLD;
952 if (OP(scan) == EXACTFL)
953 data->start_class->flags |= ANYOF_LOCALE;
954 }
955 }
956 else if (flags & SCF_DO_STCLASS_OR) {
957 if (data->start_class->flags & ANYOF_FOLD) {
958 /* false positive possible if the class is case-folded.
959 Assume that the locale settings are the same... */
1aa99e6b
IH
960 if (uc < 0x100)
961 ANYOF_BITMAP_SET(data->start_class, uc);
653099ff
GS
962 data->start_class->flags &= ~ANYOF_EOS;
963 }
964 cl_and(data->start_class, &and_with);
965 }
966 flags &= ~SCF_DO_STCLASS;
a0ed51b3 967 }
4d61ec05 968 else if (strchr((char*)PL_varies,OP(scan))) {
9c5ffd7c 969 I32 mincount, maxcount, minnext, deltanext, fl = 0;
aa7a4b56 970 I32 f = flags, pos_before = 0;
c277df42 971 regnode *oscan = scan;
653099ff
GS
972 struct regnode_charclass_class this_class;
973 struct regnode_charclass_class *oclass = NULL;
974
22c35a8c 975 switch (PL_regkind[(U8)OP(scan)]) {
653099ff 976 case WHILEM: /* End of (?:...)* . */
c277df42
IZ
977 scan = NEXTOPER(scan);
978 goto finish;
979 case PLUS:
653099ff 980 if (flags & (SCF_DO_SUBSTR | SCF_DO_STCLASS)) {
c277df42 981 next = NEXTOPER(scan);
653099ff 982 if (OP(next) == EXACT || (flags & SCF_DO_STCLASS)) {
b81d288d
AB
983 mincount = 1;
984 maxcount = REG_INFTY;
c277df42
IZ
985 next = regnext(scan);
986 scan = NEXTOPER(scan);
987 goto do_curly;
988 }
989 }
990 if (flags & SCF_DO_SUBSTR)
991 data->pos_min++;
992 min++;
993 /* Fall through. */
994 case STAR:
653099ff
GS
995 if (flags & SCF_DO_STCLASS) {
996 mincount = 0;
b81d288d 997 maxcount = REG_INFTY;
653099ff
GS
998 next = regnext(scan);
999 scan = NEXTOPER(scan);
1000 goto do_curly;
1001 }
b81d288d 1002 is_inf = is_inf_internal = 1;
c277df42
IZ
1003 scan = regnext(scan);
1004 if (flags & SCF_DO_SUBSTR) {
830247a4 1005 scan_commit(pRExC_state, data); /* Cannot extend fixed substrings */
c277df42
IZ
1006 data->longest = &(data->longest_float);
1007 }
1008 goto optimize_curly_tail;
1009 case CURLY:
b81d288d 1010 mincount = ARG1(scan);
c277df42
IZ
1011 maxcount = ARG2(scan);
1012 next = regnext(scan);
cb434fcc
IZ
1013 if (OP(scan) == CURLYX) {
1014 I32 lp = (data ? *(data->last_closep) : 0);
1015
1016 scan->flags = ((lp <= U8_MAX) ? lp : U8_MAX);
1017 }
c277df42
IZ
1018 scan = NEXTOPER(scan) + EXTRA_STEP_2ARGS;
1019 do_curly:
1020 if (flags & SCF_DO_SUBSTR) {
830247a4 1021 if (mincount == 0) scan_commit(pRExC_state,data); /* Cannot extend fixed substrings */
c277df42
IZ
1022 pos_before = data->pos_min;
1023 }
1024 if (data) {
1025 fl = data->flags;
1026 data->flags &= ~(SF_HAS_PAR|SF_IN_PAR|SF_HAS_EVAL);
1027 if (is_inf)
1028 data->flags |= SF_IS_INF;
1029 }
653099ff 1030 if (flags & SCF_DO_STCLASS) {
830247a4 1031 cl_init(pRExC_state, &this_class);
653099ff
GS
1032 oclass = data->start_class;
1033 data->start_class = &this_class;
1034 f |= SCF_DO_STCLASS_AND;
1035 f &= ~SCF_DO_STCLASS_OR;
1036 }
e1901655
IZ
1037 /* These are the cases when once a subexpression
1038 fails at a particular position, it cannot succeed
1039 even after backtracking at the enclosing scope.
b81d288d 1040
e1901655
IZ
1041 XXXX what if minimal match and we are at the
1042 initial run of {n,m}? */
1043 if ((mincount != maxcount - 1) && (maxcount != REG_INFTY))
1044 f &= ~SCF_WHILEM_VISITED_POS;
653099ff 1045
c277df42 1046 /* This will finish on WHILEM, setting scan, or on NULL: */
b81d288d
AB
1047 minnext = study_chunk(pRExC_state, &scan, &deltanext, last, data,
1048 mincount == 0
653099ff
GS
1049 ? (f & ~SCF_DO_SUBSTR) : f);
1050
1051 if (flags & SCF_DO_STCLASS)
1052 data->start_class = oclass;
1053 if (mincount == 0 || minnext == 0) {
1054 if (flags & SCF_DO_STCLASS_OR) {
830247a4 1055 cl_or(pRExC_state, data->start_class, &this_class);
653099ff
GS
1056 }
1057 else if (flags & SCF_DO_STCLASS_AND) {
b81d288d 1058 /* Switch to OR mode: cache the old value of
653099ff
GS
1059 * data->start_class */
1060 StructCopy(data->start_class, &and_with,
1061 struct regnode_charclass_class);
1062 flags &= ~SCF_DO_STCLASS_AND;
1063 StructCopy(&this_class, data->start_class,
1064 struct regnode_charclass_class);
1065 flags |= SCF_DO_STCLASS_OR;
1066 data->start_class->flags |= ANYOF_EOS;
1067 }
1068 } else { /* Non-zero len */
1069 if (flags & SCF_DO_STCLASS_OR) {
830247a4 1070 cl_or(pRExC_state, data->start_class, &this_class);
653099ff
GS
1071 cl_and(data->start_class, &and_with);
1072 }
1073 else if (flags & SCF_DO_STCLASS_AND)
1074 cl_and(data->start_class, &this_class);
1075 flags &= ~SCF_DO_STCLASS;
1076 }
c277df42
IZ
1077 if (!scan) /* It was not CURLYX, but CURLY. */
1078 scan = next;
b81d288d 1079 if (ckWARN(WARN_REGEXP) && (minnext + deltanext == 0)
99799961 1080 && data && !(data->flags & (SF_HAS_PAR|SF_IN_PAR))
17feb5d5 1081 && maxcount <= REG_INFTY/3) /* Complement check for big count */
b45f050a 1082 {
830247a4 1083 vWARN(RExC_parse,
b45f050a
JF
1084 "Quantifier unexpected on zero-length expression");
1085 }
1086
c277df42 1087 min += minnext * mincount;
b81d288d 1088 is_inf_internal |= ((maxcount == REG_INFTY
155aba94
GS
1089 && (minnext + deltanext) > 0)
1090 || deltanext == I32_MAX);
aca2d497 1091 is_inf |= is_inf_internal;
c277df42
IZ
1092 delta += (minnext + deltanext) * maxcount - minnext * mincount;
1093
1094 /* Try powerful optimization CURLYX => CURLYN. */
b81d288d 1095 if ( OP(oscan) == CURLYX && data
c277df42
IZ
1096 && data->flags & SF_IN_PAR
1097 && !(data->flags & SF_HAS_EVAL)
1098 && !deltanext && minnext == 1 ) {
1099 /* Try to optimize to CURLYN. */
1100 regnode *nxt = NEXTOPER(oscan) + EXTRA_STEP_2ARGS;
1101 regnode *nxt1 = nxt, *nxt2;
1102
1103 /* Skip open. */
1104 nxt = regnext(nxt);
4d61ec05 1105 if (!strchr((char*)PL_simple,OP(nxt))
22c35a8c 1106 && !(PL_regkind[(U8)OP(nxt)] == EXACT
b81d288d 1107 && STR_LEN(nxt) == 1))
c277df42
IZ
1108 goto nogo;
1109 nxt2 = nxt;
1110 nxt = regnext(nxt);
b81d288d 1111 if (OP(nxt) != CLOSE)
c277df42
IZ
1112 goto nogo;
1113 /* Now we know that nxt2 is the only contents: */
1114 oscan->flags = ARG(nxt);
1115 OP(oscan) = CURLYN;
1116 OP(nxt1) = NOTHING; /* was OPEN. */
1117#ifdef DEBUGGING
1118 OP(nxt1 + 1) = OPTIMIZED; /* was count. */
1119 NEXT_OFF(nxt1+ 1) = 0; /* just for consistancy. */
1120 NEXT_OFF(nxt2) = 0; /* just for consistancy with CURLY. */
1121 OP(nxt) = OPTIMIZED; /* was CLOSE. */
1122 OP(nxt + 1) = OPTIMIZED; /* was count. */
1123 NEXT_OFF(nxt+ 1) = 0; /* just for consistancy. */
b81d288d 1124#endif
c277df42 1125 }
c277df42
IZ
1126 nogo:
1127
1128 /* Try optimization CURLYX => CURLYM. */
b81d288d 1129 if ( OP(oscan) == CURLYX && data
c277df42 1130 && !(data->flags & SF_HAS_PAR)
c277df42
IZ
1131 && !(data->flags & SF_HAS_EVAL)
1132 && !deltanext ) {
1133 /* XXXX How to optimize if data == 0? */
1134 /* Optimize to a simpler form. */
1135 regnode *nxt = NEXTOPER(oscan) + EXTRA_STEP_2ARGS; /* OPEN */
1136 regnode *nxt2;
1137
1138 OP(oscan) = CURLYM;
1139 while ( (nxt2 = regnext(nxt)) /* skip over embedded stuff*/
b81d288d 1140 && (OP(nxt2) != WHILEM))
c277df42
IZ
1141 nxt = nxt2;
1142 OP(nxt2) = SUCCEED; /* Whas WHILEM */
c277df42
IZ
1143 /* Need to optimize away parenths. */
1144 if (data->flags & SF_IN_PAR) {
1145 /* Set the parenth number. */
1146 regnode *nxt1 = NEXTOPER(oscan) + EXTRA_STEP_2ARGS; /* OPEN*/
1147
b81d288d 1148 if (OP(nxt) != CLOSE)
b45f050a 1149 FAIL("Panic opt close");
c277df42
IZ
1150 oscan->flags = ARG(nxt);
1151 OP(nxt1) = OPTIMIZED; /* was OPEN. */
1152 OP(nxt) = OPTIMIZED; /* was CLOSE. */
1153#ifdef DEBUGGING
1154 OP(nxt1 + 1) = OPTIMIZED; /* was count. */
1155 OP(nxt + 1) = OPTIMIZED; /* was count. */
1156 NEXT_OFF(nxt1 + 1) = 0; /* just for consistancy. */
1157 NEXT_OFF(nxt + 1) = 0; /* just for consistancy. */
b81d288d 1158#endif
c277df42
IZ
1159#if 0
1160 while ( nxt1 && (OP(nxt1) != WHILEM)) {
1161 regnode *nnxt = regnext(nxt1);
b81d288d 1162
c277df42
IZ
1163 if (nnxt == nxt) {
1164 if (reg_off_by_arg[OP(nxt1)])
1165 ARG_SET(nxt1, nxt2 - nxt1);
1166 else if (nxt2 - nxt1 < U16_MAX)
1167 NEXT_OFF(nxt1) = nxt2 - nxt1;
1168 else
1169 OP(nxt) = NOTHING; /* Cannot beautify */
1170 }
1171 nxt1 = nnxt;
1172 }
1173#endif
1174 /* Optimize again: */
b81d288d 1175 study_chunk(pRExC_state, &nxt1, &deltanext, nxt,
e1901655 1176 NULL, 0);
a0ed51b3
LW
1177 }
1178 else
c277df42 1179 oscan->flags = 0;
c277df42 1180 }
e1901655
IZ
1181 else if ((OP(oscan) == CURLYX)
1182 && (flags & SCF_WHILEM_VISITED_POS)
1183 /* See the comment on a similar expression above.
1184 However, this time it not a subexpression
1185 we care about, but the expression itself. */
1186 && (maxcount == REG_INFTY)
1187 && data && ++data->whilem_c < 16) {
1188 /* This stays as CURLYX, we can put the count/of pair. */
2c2d71f5
JH
1189 /* Find WHILEM (as in regexec.c) */
1190 regnode *nxt = oscan + NEXT_OFF(oscan);
1191
1192 if (OP(PREVOPER(nxt)) == NOTHING) /* LONGJMP */
1193 nxt += ARG(nxt);
1194 PREVOPER(nxt)->flags = data->whilem_c
830247a4 1195 | (RExC_whilem_seen << 4); /* On WHILEM */
2c2d71f5 1196 }
b81d288d 1197 if (data && fl & (SF_HAS_PAR|SF_IN_PAR))
c277df42
IZ
1198 pars++;
1199 if (flags & SCF_DO_SUBSTR) {
1200 SV *last_str = Nullsv;
1201 int counted = mincount != 0;
1202
1203 if (data->last_end > 0 && mincount != 0) { /* Ends with a string. */
b81d288d 1204 I32 b = pos_before >= data->last_start_min
c277df42
IZ
1205 ? pos_before : data->last_start_min;
1206 STRLEN l;
1207 char *s = SvPV(data->last_found, l);
a0ed51b3
LW
1208 I32 old = b - data->last_start_min;
1209
1210 if (UTF)
1211 old = utf8_hop((U8*)s, old) - (U8*)s;
c277df42 1212
a0ed51b3 1213 l -= old;
c277df42 1214 /* Get the added string: */
79cb57f6 1215 last_str = newSVpvn(s + old, l);
c277df42
IZ
1216 if (deltanext == 0 && pos_before == b) {
1217 /* What was added is a constant string */
1218 if (mincount > 1) {
1219 SvGROW(last_str, (mincount * l) + 1);
b81d288d 1220 repeatcpy(SvPVX(last_str) + l,
c277df42
IZ
1221 SvPVX(last_str), l, mincount - 1);
1222 SvCUR(last_str) *= mincount;
1223 /* Add additional parts. */
b81d288d 1224 SvCUR_set(data->last_found,
c277df42
IZ
1225 SvCUR(data->last_found) - l);
1226 sv_catsv(data->last_found, last_str);
1227 data->last_end += l * (mincount - 1);
1228 }
2a8d9689
HS
1229 } else {
1230 /* start offset must point into the last copy */
1231 data->last_start_min += minnext * (mincount - 1);
4b2cff9a
HS
1232 data->last_start_max += is_inf ? 0 : (maxcount - 1)
1233 * (minnext + data->pos_delta);
c277df42
IZ
1234 }
1235 }
1236 /* It is counted once already... */
1237 data->pos_min += minnext * (mincount - counted);
1238 data->pos_delta += - counted * deltanext +
1239 (minnext + deltanext) * maxcount - minnext * mincount;
1240 if (mincount != maxcount) {
653099ff
GS
1241 /* Cannot extend fixed substrings found inside
1242 the group. */
830247a4 1243 scan_commit(pRExC_state,data);
c277df42
IZ
1244 if (mincount && last_str) {
1245 sv_setsv(data->last_found, last_str);
1246 data->last_end = data->pos_min;
b81d288d 1247 data->last_start_min =
a0ed51b3 1248 data->pos_min - CHR_SVLEN(last_str);
b81d288d
AB
1249 data->last_start_max = is_inf
1250 ? I32_MAX
c277df42 1251 : data->pos_min + data->pos_delta
a0ed51b3 1252 - CHR_SVLEN(last_str);
c277df42
IZ
1253 }
1254 data->longest = &(data->longest_float);
1255 }
aca2d497 1256 SvREFCNT_dec(last_str);
c277df42 1257 }
405ff068 1258 if (data && (fl & SF_HAS_EVAL))
c277df42
IZ
1259 data->flags |= SF_HAS_EVAL;
1260 optimize_curly_tail:
c277df42 1261 if (OP(oscan) != CURLYX) {
22c35a8c 1262 while (PL_regkind[(U8)OP(next = regnext(oscan))] == NOTHING
c277df42
IZ
1263 && NEXT_OFF(next))
1264 NEXT_OFF(oscan) += NEXT_OFF(next);
1265 }
c277df42 1266 continue;
653099ff 1267 default: /* REF and CLUMP only? */
c277df42 1268 if (flags & SCF_DO_SUBSTR) {
830247a4 1269 scan_commit(pRExC_state,data); /* Cannot expect anything... */
c277df42
IZ
1270 data->longest = &(data->longest_float);
1271 }
aca2d497 1272 is_inf = is_inf_internal = 1;
653099ff 1273 if (flags & SCF_DO_STCLASS_OR)
830247a4 1274 cl_anything(pRExC_state, data->start_class);
653099ff 1275 flags &= ~SCF_DO_STCLASS;
c277df42
IZ
1276 break;
1277 }
a0ed51b3 1278 }
ffc61ed2 1279 else if (strchr((char*)PL_simple,OP(scan))) {
9c5ffd7c 1280 int value = 0;
653099ff 1281
c277df42 1282 if (flags & SCF_DO_SUBSTR) {
830247a4 1283 scan_commit(pRExC_state,data);
c277df42
IZ
1284 data->pos_min++;
1285 }
1286 min++;
653099ff
GS
1287 if (flags & SCF_DO_STCLASS) {
1288 data->start_class->flags &= ~ANYOF_EOS; /* No match on empty */
1289
1290 /* Some of the logic below assumes that switching
1291 locale on will only add false positives. */
1292 switch (PL_regkind[(U8)OP(scan)]) {
653099ff 1293 case SANY:
653099ff
GS
1294 default:
1295 do_default:
1296 /* Perl_croak(aTHX_ "panic: unexpected simple REx opcode %d", OP(scan)); */
1297 if (flags & SCF_DO_STCLASS_OR) /* Allow everything */
830247a4 1298 cl_anything(pRExC_state, data->start_class);
653099ff
GS
1299 break;
1300 case REG_ANY:
1301 if (OP(scan) == SANY)
1302 goto do_default;
1303 if (flags & SCF_DO_STCLASS_OR) { /* Everything but \n */
1304 value = (ANYOF_BITMAP_TEST(data->start_class,'\n')
1305 || (data->start_class->flags & ANYOF_CLASS));
830247a4 1306 cl_anything(pRExC_state, data->start_class);
653099ff
GS
1307 }
1308 if (flags & SCF_DO_STCLASS_AND || !value)
1309 ANYOF_BITMAP_CLEAR(data->start_class,'\n');
1310 break;
1311 case ANYOF:
1312 if (flags & SCF_DO_STCLASS_AND)
1313 cl_and(data->start_class,
1314 (struct regnode_charclass_class*)scan);
1315 else
830247a4 1316 cl_or(pRExC_state, data->start_class,
653099ff
GS
1317 (struct regnode_charclass_class*)scan);
1318 break;
1319 case ALNUM:
1320 if (flags & SCF_DO_STCLASS_AND) {
1321 if (!(data->start_class->flags & ANYOF_LOCALE)) {
1322 ANYOF_CLASS_CLEAR(data->start_class,ANYOF_NALNUM);
1323 for (value = 0; value < 256; value++)
1324 if (!isALNUM(value))
1325 ANYOF_BITMAP_CLEAR(data->start_class, value);
1326 }
1327 }
1328 else {
1329 if (data->start_class->flags & ANYOF_LOCALE)
1330 ANYOF_CLASS_SET(data->start_class,ANYOF_ALNUM);
1331 else {
1332 for (value = 0; value < 256; value++)
1333 if (isALNUM(value))
b81d288d 1334 ANYOF_BITMAP_SET(data->start_class, value);
653099ff
GS
1335 }
1336 }
1337 break;
1338 case ALNUML:
1339 if (flags & SCF_DO_STCLASS_AND) {
1340 if (data->start_class->flags & ANYOF_LOCALE)
1341 ANYOF_CLASS_CLEAR(data->start_class,ANYOF_NALNUM);
1342 }
1343 else {
1344 ANYOF_CLASS_SET(data->start_class,ANYOF_ALNUM);
1345 data->start_class->flags |= ANYOF_LOCALE;
1346 }
1347 break;
1348 case NALNUM:
1349 if (flags & SCF_DO_STCLASS_AND) {
1350 if (!(data->start_class->flags & ANYOF_LOCALE)) {
1351 ANYOF_CLASS_CLEAR(data->start_class,ANYOF_ALNUM);
1352 for (value = 0; value < 256; value++)
1353 if (isALNUM(value))
1354 ANYOF_BITMAP_CLEAR(data->start_class, value);
1355 }
1356 }
1357 else {
1358 if (data->start_class->flags & ANYOF_LOCALE)
1359 ANYOF_CLASS_SET(data->start_class,ANYOF_NALNUM);
1360 else {
1361 for (value = 0; value < 256; value++)
1362 if (!isALNUM(value))
b81d288d 1363 ANYOF_BITMAP_SET(data->start_class, value);
653099ff
GS
1364 }
1365 }
1366 break;
1367 case NALNUML:
1368 if (flags & SCF_DO_STCLASS_AND) {
1369 if (data->start_class->flags & ANYOF_LOCALE)
1370 ANYOF_CLASS_CLEAR(data->start_class,ANYOF_ALNUM);
1371 }
1372 else {
1373 data->start_class->flags |= ANYOF_LOCALE;
1374 ANYOF_CLASS_SET(data->start_class,ANYOF_NALNUM);
1375 }
1376 break;
1377 case SPACE:
1378 if (flags & SCF_DO_STCLASS_AND) {
1379 if (!(data->start_class->flags & ANYOF_LOCALE)) {
1380 ANYOF_CLASS_CLEAR(data->start_class,ANYOF_NSPACE);
1381 for (value = 0; value < 256; value++)
1382 if (!isSPACE(value))
1383 ANYOF_BITMAP_CLEAR(data->start_class, value);
1384 }
1385 }
1386 else {
1387 if (data->start_class->flags & ANYOF_LOCALE)
1388 ANYOF_CLASS_SET(data->start_class,ANYOF_SPACE);
1389 else {
1390 for (value = 0; value < 256; value++)
1391 if (isSPACE(value))
b81d288d 1392 ANYOF_BITMAP_SET(data->start_class, value);
653099ff
GS
1393 }
1394 }
1395 break;
1396 case SPACEL:
1397 if (flags & SCF_DO_STCLASS_AND) {
1398 if (data->start_class->flags & ANYOF_LOCALE)
1399 ANYOF_CLASS_CLEAR(data->start_class,ANYOF_NSPACE);
1400 }
1401 else {
1402 data->start_class->flags |= ANYOF_LOCALE;
1403 ANYOF_CLASS_SET(data->start_class,ANYOF_SPACE);
1404 }
1405 break;
1406 case NSPACE:
1407 if (flags & SCF_DO_STCLASS_AND) {
1408 if (!(data->start_class->flags & ANYOF_LOCALE)) {
1409 ANYOF_CLASS_CLEAR(data->start_class,ANYOF_SPACE);
1410 for (value = 0; value < 256; value++)
1411 if (isSPACE(value))
1412 ANYOF_BITMAP_CLEAR(data->start_class, value);
1413 }
1414 }
1415 else {
1416 if (data->start_class->flags & ANYOF_LOCALE)
1417 ANYOF_CLASS_SET(data->start_class,ANYOF_NSPACE);
1418 else {
1419 for (value = 0; value < 256; value++)
1420 if (!isSPACE(value))
b81d288d 1421 ANYOF_BITMAP_SET(data->start_class, value);
653099ff
GS
1422 }
1423 }
1424 break;
1425 case NSPACEL:
1426 if (flags & SCF_DO_STCLASS_AND) {
1427 if (data->start_class->flags & ANYOF_LOCALE) {
1428 ANYOF_CLASS_CLEAR(data->start_class,ANYOF_SPACE);
1429 for (value = 0; value < 256; value++)
1430 if (!isSPACE(value))
1431 ANYOF_BITMAP_CLEAR(data->start_class, value);
1432 }
1433 }
1434 else {
1435 data->start_class->flags |= ANYOF_LOCALE;
1436 ANYOF_CLASS_SET(data->start_class,ANYOF_NSPACE);
1437 }
1438 break;
1439 case DIGIT:
1440 if (flags & SCF_DO_STCLASS_AND) {
1441 ANYOF_CLASS_CLEAR(data->start_class,ANYOF_NDIGIT);
1442 for (value = 0; value < 256; value++)
1443 if (!isDIGIT(value))
1444 ANYOF_BITMAP_CLEAR(data->start_class, value);
1445 }
1446 else {
1447 if (data->start_class->flags & ANYOF_LOCALE)
1448 ANYOF_CLASS_SET(data->start_class,ANYOF_DIGIT);
1449 else {
1450 for (value = 0; value < 256; value++)
1451 if (isDIGIT(value))
b81d288d 1452 ANYOF_BITMAP_SET(data->start_class, value);
653099ff
GS
1453 }
1454 }
1455 break;
1456 case NDIGIT:
1457 if (flags & SCF_DO_STCLASS_AND) {
1458 ANYOF_CLASS_CLEAR(data->start_class,ANYOF_DIGIT);
1459 for (value = 0; value < 256; value++)
1460 if (isDIGIT(value))
1461 ANYOF_BITMAP_CLEAR(data->start_class, value);
1462 }
1463 else {
1464 if (data->start_class->flags & ANYOF_LOCALE)
1465 ANYOF_CLASS_SET(data->start_class,ANYOF_NDIGIT);
1466 else {
1467 for (value = 0; value < 256; value++)
1468 if (!isDIGIT(value))
b81d288d 1469 ANYOF_BITMAP_SET(data->start_class, value);
653099ff
GS
1470 }
1471 }
1472 break;
1473 }
1474 if (flags & SCF_DO_STCLASS_OR)
1475 cl_and(data->start_class, &and_with);
1476 flags &= ~SCF_DO_STCLASS;
1477 }
a0ed51b3 1478 }
22c35a8c 1479 else if (PL_regkind[(U8)OP(scan)] == EOL && flags & SCF_DO_SUBSTR) {
c277df42
IZ
1480 data->flags |= (OP(scan) == MEOL
1481 ? SF_BEFORE_MEOL
1482 : SF_BEFORE_SEOL);
a0ed51b3 1483 }
653099ff
GS
1484 else if ( PL_regkind[(U8)OP(scan)] == BRANCHJ
1485 /* Lookbehind, or need to calculate parens/evals/stclass: */
1486 && (scan->flags || data || (flags & SCF_DO_STCLASS))
c277df42 1487 && (OP(scan) == IFMATCH || OP(scan) == UNLESSM)) {
653099ff 1488 /* Lookahead/lookbehind */
cb434fcc 1489 I32 deltanext, minnext, fake = 0;
c277df42 1490 regnode *nscan;
653099ff
GS
1491 struct regnode_charclass_class intrnl;
1492 int f = 0;
c277df42
IZ
1493
1494 data_fake.flags = 0;
b81d288d 1495 if (data) {
2c2d71f5 1496 data_fake.whilem_c = data->whilem_c;
cb434fcc
IZ
1497 data_fake.last_closep = data->last_closep;
1498 }
1499 else
1500 data_fake.last_closep = &fake;
653099ff
GS
1501 if ( flags & SCF_DO_STCLASS && !scan->flags
1502 && OP(scan) == IFMATCH ) { /* Lookahead */
830247a4 1503 cl_init(pRExC_state, &intrnl);
653099ff 1504 data_fake.start_class = &intrnl;
e1901655 1505 f |= SCF_DO_STCLASS_AND;
653099ff 1506 }
e1901655
IZ
1507 if (flags & SCF_WHILEM_VISITED_POS)
1508 f |= SCF_WHILEM_VISITED_POS;
c277df42
IZ
1509 next = regnext(scan);
1510 nscan = NEXTOPER(NEXTOPER(scan));
830247a4 1511 minnext = study_chunk(pRExC_state, &nscan, &deltanext, last, &data_fake, f);
c277df42
IZ
1512 if (scan->flags) {
1513 if (deltanext) {
9baa0206 1514 vFAIL("Variable length lookbehind not implemented");
a0ed51b3
LW
1515 }
1516 else if (minnext > U8_MAX) {
9baa0206 1517 vFAIL2("Lookbehind longer than %"UVuf" not implemented", (UV)U8_MAX);
c277df42
IZ
1518 }
1519 scan->flags = minnext;
1520 }
1521 if (data && data_fake.flags & (SF_HAS_PAR|SF_IN_PAR))
1522 pars++;
405ff068 1523 if (data && (data_fake.flags & SF_HAS_EVAL))
c277df42 1524 data->flags |= SF_HAS_EVAL;
2c2d71f5
JH
1525 if (data)
1526 data->whilem_c = data_fake.whilem_c;
e1901655 1527 if (f & SCF_DO_STCLASS_AND) {
653099ff
GS
1528 int was = (data->start_class->flags & ANYOF_EOS);
1529
1530 cl_and(data->start_class, &intrnl);
1531 if (was)
1532 data->start_class->flags |= ANYOF_EOS;
1533 }
a0ed51b3
LW
1534 }
1535 else if (OP(scan) == OPEN) {
c277df42 1536 pars++;
a0ed51b3 1537 }
cb434fcc
IZ
1538 else if (OP(scan) == CLOSE) {
1539 if (ARG(scan) == is_par) {
1540 next = regnext(scan);
c277df42 1541
cb434fcc
IZ
1542 if ( next && (OP(next) != WHILEM) && next < last)
1543 is_par = 0; /* Disable optimization */
1544 }
1545 if (data)
1546 *(data->last_closep) = ARG(scan);
a0ed51b3
LW
1547 }
1548 else if (OP(scan) == EVAL) {
c277df42
IZ
1549 if (data)
1550 data->flags |= SF_HAS_EVAL;
1551 }
96776eda 1552 else if (OP(scan) == LOGICAL && scan->flags == 2) { /* Embedded follows */
0f5d15d6 1553 if (flags & SCF_DO_SUBSTR) {
830247a4 1554 scan_commit(pRExC_state,data);
0f5d15d6
IZ
1555 data->longest = &(data->longest_float);
1556 }
1557 is_inf = is_inf_internal = 1;
653099ff 1558 if (flags & SCF_DO_STCLASS_OR) /* Allow everything */
830247a4 1559 cl_anything(pRExC_state, data->start_class);
96776eda 1560 flags &= ~SCF_DO_STCLASS;
0f5d15d6 1561 }
c277df42
IZ
1562 /* Else: zero-length, ignore. */
1563 scan = regnext(scan);
1564 }
1565
1566 finish:
1567 *scanp = scan;
aca2d497 1568 *deltap = is_inf_internal ? I32_MAX : delta;
b81d288d 1569 if (flags & SCF_DO_SUBSTR && is_inf)
c277df42
IZ
1570 data->pos_delta = I32_MAX - data->pos_min;
1571 if (is_par > U8_MAX)
1572 is_par = 0;
1573 if (is_par && pars==1 && data) {
1574 data->flags |= SF_IN_PAR;
1575 data->flags &= ~SF_HAS_PAR;
a0ed51b3
LW
1576 }
1577 else if (pars && data) {
c277df42
IZ
1578 data->flags |= SF_HAS_PAR;
1579 data->flags &= ~SF_IN_PAR;
1580 }
653099ff
GS
1581 if (flags & SCF_DO_STCLASS_OR)
1582 cl_and(data->start_class, &and_with);
c277df42
IZ
1583 return min;
1584}
1585
76e3520e 1586STATIC I32
830247a4 1587S_add_data(pTHX_ RExC_state_t *pRExC_state, I32 n, char *s)
c277df42 1588{
830247a4 1589 if (RExC_rx->data) {
b81d288d
AB
1590 Renewc(RExC_rx->data,
1591 sizeof(*RExC_rx->data) + sizeof(void*) * (RExC_rx->data->count + n - 1),
c277df42 1592 char, struct reg_data);
830247a4
IZ
1593 Renew(RExC_rx->data->what, RExC_rx->data->count + n, U8);
1594 RExC_rx->data->count += n;
a0ed51b3
LW
1595 }
1596 else {
830247a4 1597 Newc(1207, RExC_rx->data, sizeof(*RExC_rx->data) + sizeof(void*) * (n - 1),
c277df42 1598 char, struct reg_data);
830247a4
IZ
1599 New(1208, RExC_rx->data->what, n, U8);
1600 RExC_rx->data->count = n;
c277df42 1601 }
830247a4
IZ
1602 Copy(s, RExC_rx->data->what + RExC_rx->data->count - n, n, U8);
1603 return RExC_rx->data->count - n;
c277df42
IZ
1604}
1605
d88dccdf 1606void
864dbfa3 1607Perl_reginitcolors(pTHX)
d88dccdf 1608{
d88dccdf
IZ
1609 int i = 0;
1610 char *s = PerlEnv_getenv("PERL_RE_COLORS");
b81d288d 1611
d88dccdf
IZ
1612 if (s) {
1613 PL_colors[0] = s = savepv(s);
1614 while (++i < 6) {
1615 s = strchr(s, '\t');
1616 if (s) {
1617 *s = '\0';
1618 PL_colors[i] = ++s;
1619 }
1620 else
c712d376 1621 PL_colors[i] = s = "";
d88dccdf
IZ
1622 }
1623 } else {
b81d288d 1624 while (i < 6)
d88dccdf
IZ
1625 PL_colors[i++] = "";
1626 }
1627 PL_colorset = 1;
1628}
1629
8615cb43 1630
a687059c 1631/*
e50aee73 1632 - pregcomp - compile a regular expression into internal code
a687059c
LW
1633 *
1634 * We can't allocate space until we know how big the compiled form will be,
1635 * but we can't compile it (and thus know how big it is) until we've got a
1636 * place to put the code. So we cheat: we compile it twice, once with code
1637 * generation turned off and size counting turned on, and once "for real".
1638 * This also means that we don't allocate space until we are sure that the
1639 * thing really will compile successfully, and we never have to move the
1640 * code and thus invalidate pointers into it. (Note that it has to be in
1641 * one piece because free() must be able to free it all.) [NB: not true in perl]
1642 *
1643 * Beware that the optimization-preparation code in here knows about some
1644 * of the structure of the compiled regexp. [I'll say.]
1645 */
1646regexp *
864dbfa3 1647Perl_pregcomp(pTHX_ char *exp, char *xend, PMOP *pm)
a687059c 1648{
a0d0e21e 1649 register regexp *r;
c277df42 1650 regnode *scan;
c277df42 1651 regnode *first;
a0d0e21e 1652 I32 flags;
a0d0e21e
LW
1653 I32 minlen = 0;
1654 I32 sawplus = 0;
1655 I32 sawopen = 0;
2c2d71f5 1656 scan_data_t data;
830247a4
IZ
1657 RExC_state_t RExC_state;
1658 RExC_state_t *pRExC_state = &RExC_state;
a0d0e21e
LW
1659
1660 if (exp == NULL)
c277df42 1661 FAIL("NULL regexp argument");
a0d0e21e 1662
830247a4 1663 /* XXXX This looks very suspicious... */
1aa99e6b
IH
1664 if (pm->op_pmdynflags & PMdf_CMP_UTF8)
1665 RExC_utf8 = 1;
a0ed51b3 1666 else
1aa99e6b 1667 RExC_utf8 = 0;
a0ed51b3 1668
5cfc7842 1669 RExC_precomp = exp;
35ef4773 1670 DEBUG_r(if (!PL_colorset) reginitcolors());
2c2d71f5 1671 DEBUG_r(PerlIO_printf(Perl_debug_log, "%sCompiling REx%s `%s%*s%s'\n",
d88dccdf 1672 PL_colors[4],PL_colors[5],PL_colors[0],
830247a4
IZ
1673 (int)(xend - exp), RExC_precomp, PL_colors[1]));
1674 RExC_flags16 = pm->op_pmflags;
1675 RExC_sawback = 0;
bbce6d69 1676
830247a4
IZ
1677 RExC_seen = 0;
1678 RExC_seen_zerolen = *exp == '^' ? -1 : 0;
1679 RExC_seen_evals = 0;
1680 RExC_extralen = 0;
c277df42 1681
bbce6d69 1682 /* First pass: determine size, legality. */
830247a4 1683 RExC_parse = exp;
fac92740 1684 RExC_start = exp;
830247a4
IZ
1685 RExC_end = xend;
1686 RExC_naughty = 0;
1687 RExC_npar = 1;
1688 RExC_size = 0L;
1689 RExC_emit = &PL_regdummy;
1690 RExC_whilem_seen = 0;
85ddcde9
JH
1691#if 0 /* REGC() is (currently) a NOP at the first pass.
1692 * Clever compilers notice this and complain. --jhi */
830247a4 1693 REGC((U8)REG_MAGIC, (char*)RExC_emit);
85ddcde9 1694#endif
830247a4 1695 if (reg(pRExC_state, 0, &flags) == NULL) {
830247a4 1696 RExC_precomp = Nullch;
a0d0e21e
LW
1697 return(NULL);
1698 }
830247a4 1699 DEBUG_r(PerlIO_printf(Perl_debug_log, "size %"IVdf" ", (IV)RExC_size));
c277df42 1700
c277df42
IZ
1701 /* Small enough for pointer-storage convention?
1702 If extralen==0, this means that we will not need long jumps. */
830247a4
IZ
1703 if (RExC_size >= 0x10000L && RExC_extralen)
1704 RExC_size += RExC_extralen;
c277df42 1705 else
830247a4
IZ
1706 RExC_extralen = 0;
1707 if (RExC_whilem_seen > 15)
1708 RExC_whilem_seen = 15;
a0d0e21e 1709
bbce6d69 1710 /* Allocate space and initialize. */
830247a4 1711 Newc(1001, r, sizeof(regexp) + (unsigned)RExC_size * sizeof(regnode),
c277df42 1712 char, regexp);
a0d0e21e 1713 if (r == NULL)
b45f050a
JF
1714 FAIL("Regexp out of space");
1715
0f79a09d
GS
1716#ifdef DEBUGGING
1717 /* avoid reading uninitialized memory in DEBUGGING code in study_chunk() */
830247a4 1718 Zero(r, sizeof(regexp) + (unsigned)RExC_size * sizeof(regnode), char);
0f79a09d 1719#endif
c277df42 1720 r->refcnt = 1;
bbce6d69 1721 r->prelen = xend - exp;
5cfc7842 1722 r->precomp = savepvn(RExC_precomp, r->prelen);
cf93c79d
IZ
1723 r->subbeg = NULL;
1724 r->reganch = pm->op_pmflags & PMf_COMPILETIME;
830247a4 1725 r->nparens = RExC_npar - 1; /* set early to validate backrefs */
4327152a
IZ
1726
1727 r->substrs = 0; /* Useful during FAIL. */
1728 r->startp = 0; /* Useful during FAIL. */
1729 r->endp = 0; /* Useful during FAIL. */
1730
fac92740
MJD
1731 Newz(1304, r->offsets, 2*RExC_size+1, U32); /* MJD 20001228 */
1732 if (r->offsets) {
1733 r->offsets[0] = RExC_size;
1734 }
1735 DEBUG_r(PerlIO_printf(Perl_debug_log,
392fbf5d 1736 "%s %"UVuf" bytes for offset annotations.\n",
fac92740 1737 r->offsets ? "Got" : "Couldn't get",
392fbf5d 1738 (UV)((2*RExC_size+1) * sizeof(U32))));
fac92740 1739
830247a4 1740 RExC_rx = r;
bbce6d69 1741
1742 /* Second pass: emit code. */
830247a4
IZ
1743 RExC_parse = exp;
1744 RExC_end = xend;
1745 RExC_naughty = 0;
1746 RExC_npar = 1;
fac92740 1747 RExC_emit_start = r->program;
830247a4 1748 RExC_emit = r->program;
2cd61cdb 1749 /* Store the count of eval-groups for security checks: */
830247a4
IZ
1750 RExC_emit->next_off = ((RExC_seen_evals > U16_MAX) ? U16_MAX : RExC_seen_evals);
1751 REGC((U8)REG_MAGIC, (char*) RExC_emit++);
c277df42 1752 r->data = 0;
830247a4 1753 if (reg(pRExC_state, 0, &flags) == NULL)
a0d0e21e
LW
1754 return(NULL);
1755
1756 /* Dig out information for optimizations. */
cf93c79d 1757 r->reganch = pm->op_pmflags & PMf_COMPILETIME; /* Again? */
830247a4 1758 pm->op_pmflags = RExC_flags16;
a0ed51b3
LW
1759 if (UTF)
1760 r->reganch |= ROPT_UTF8;
c277df42 1761 r->regstclass = NULL;
830247a4 1762 if (RExC_naughty >= 10) /* Probably an expensive pattern. */
a0ed51b3 1763 r->reganch |= ROPT_NAUGHTY;
c277df42 1764 scan = r->program + 1; /* First BRANCH. */
2779dcf1
IZ
1765
1766 /* XXXX To minimize changes to RE engine we always allocate
1767 3-units-long substrs field. */
1768 Newz(1004, r->substrs, 1, struct reg_substr_data);
1769
2c2d71f5 1770 StructCopy(&zero_scan_data, &data, scan_data_t);
653099ff 1771 /* XXXX Should not we check for something else? Usually it is OPEN1... */
c277df42 1772 if (OP(scan) != BRANCH) { /* Only one top-level choice. */
c277df42 1773 I32 fake;
c5254dd6 1774 STRLEN longest_float_length, longest_fixed_length;
653099ff
GS
1775 struct regnode_charclass_class ch_class;
1776 int stclass_flag;
cb434fcc 1777 I32 last_close = 0;
a0d0e21e
LW
1778
1779 first = scan;
c277df42 1780 /* Skip introductions and multiplicators >= 1. */
a0d0e21e 1781 while ((OP(first) == OPEN && (sawopen = 1)) ||
653099ff 1782 /* An OR of *one* alternative - should not happen now. */
a0d0e21e
LW
1783 (OP(first) == BRANCH && OP(regnext(first)) != BRANCH) ||
1784 (OP(first) == PLUS) ||
1785 (OP(first) == MINMOD) ||
653099ff 1786 /* An {n,m} with n>0 */
22c35a8c 1787 (PL_regkind[(U8)OP(first)] == CURLY && ARG1(first) > 0) ) {
a0d0e21e
LW
1788 if (OP(first) == PLUS)
1789 sawplus = 1;
1790 else
1791 first += regarglen[(U8)OP(first)];
1792 first = NEXTOPER(first);
a687059c
LW
1793 }
1794
a0d0e21e
LW
1795 /* Starting-point info. */
1796 again:
653099ff 1797 if (PL_regkind[(U8)OP(first)] == EXACT) {
1aa99e6b
IH
1798 if (OP(first) == EXACT)
1799 ; /* Empty, get anchored substr later. */
1800 else if ((OP(first) == EXACTF || OP(first) == EXACTFL))
b3c9acc1
IZ
1801 r->regstclass = first;
1802 }
653099ff 1803 else if (strchr((char*)PL_simple,OP(first)))
a0d0e21e 1804 r->regstclass = first;
22c35a8c
GS
1805 else if (PL_regkind[(U8)OP(first)] == BOUND ||
1806 PL_regkind[(U8)OP(first)] == NBOUND)
a0d0e21e 1807 r->regstclass = first;
22c35a8c 1808 else if (PL_regkind[(U8)OP(first)] == BOL) {
cad2e5aa
JH
1809 r->reganch |= (OP(first) == MBOL
1810 ? ROPT_ANCH_MBOL
1811 : (OP(first) == SBOL
1812 ? ROPT_ANCH_SBOL
1813 : ROPT_ANCH_BOL));
a0d0e21e 1814 first = NEXTOPER(first);
774d564b 1815 goto again;
1816 }
1817 else if (OP(first) == GPOS) {
1818 r->reganch |= ROPT_ANCH_GPOS;
1819 first = NEXTOPER(first);
1820 goto again;
a0d0e21e
LW
1821 }
1822 else if ((OP(first) == STAR &&
22c35a8c 1823 PL_regkind[(U8)OP(NEXTOPER(first))] == REG_ANY) &&
a0d0e21e
LW
1824 !(r->reganch & ROPT_ANCH) )
1825 {
1826 /* turn .* into ^.* with an implied $*=1 */
cad2e5aa
JH
1827 int type = OP(NEXTOPER(first));
1828
ffc61ed2 1829 if (type == REG_ANY)
cad2e5aa
JH
1830 type = ROPT_ANCH_MBOL;
1831 else
1832 type = ROPT_ANCH_SBOL;
1833
1834 r->reganch |= type | ROPT_IMPLICIT;
a0d0e21e 1835 first = NEXTOPER(first);
774d564b 1836 goto again;
a0d0e21e 1837 }
b81d288d 1838 if (sawplus && (!sawopen || !RExC_sawback)
830247a4 1839 && !(RExC_seen & REG_SEEN_EVAL)) /* May examine pos and $& */
cad2e5aa
JH
1840 /* x+ must match at the 1st pos of run of x's */
1841 r->reganch |= ROPT_SKIP;
a0d0e21e 1842
c277df42 1843 /* Scan is after the zeroth branch, first is atomic matcher. */
b81d288d 1844 DEBUG_r(PerlIO_printf(Perl_debug_log, "first at %"IVdf"\n",
b900a521 1845 (IV)(first - scan + 1)));
a0d0e21e
LW
1846 /*
1847 * If there's something expensive in the r.e., find the
1848 * longest literal string that must appear and make it the
1849 * regmust. Resolve ties in favor of later strings, since
1850 * the regstart check works with the beginning of the r.e.
1851 * and avoiding duplication strengthens checking. Not a
1852 * strong reason, but sufficient in the absence of others.
1853 * [Now we resolve ties in favor of the earlier string if
c277df42 1854 * it happens that c_offset_min has been invalidated, since the
a0d0e21e
LW
1855 * earlier string may buy us something the later one won't.]
1856 */
a0d0e21e 1857 minlen = 0;
a687059c 1858
79cb57f6
GS
1859 data.longest_fixed = newSVpvn("",0);
1860 data.longest_float = newSVpvn("",0);
1861 data.last_found = newSVpvn("",0);
c277df42
IZ
1862 data.longest = &(data.longest_fixed);
1863 first = scan;
653099ff 1864 if (!r->regstclass) {
830247a4 1865 cl_init(pRExC_state, &ch_class);
653099ff
GS
1866 data.start_class = &ch_class;
1867 stclass_flag = SCF_DO_STCLASS_AND;
1868 } else /* XXXX Check for BOUND? */
1869 stclass_flag = 0;
cb434fcc 1870 data.last_closep = &last_close;
653099ff 1871
830247a4 1872 minlen = study_chunk(pRExC_state, &first, &fake, scan + RExC_size, /* Up to end */
e1901655 1873 &data, SCF_DO_SUBSTR | SCF_WHILEM_VISITED_POS | stclass_flag);
830247a4 1874 if ( RExC_npar == 1 && data.longest == &(data.longest_fixed)
b81d288d 1875 && data.last_start_min == 0 && data.last_end > 0
830247a4
IZ
1876 && !RExC_seen_zerolen
1877 && (!(RExC_seen & REG_SEEN_GPOS) || (r->reganch & ROPT_ANCH_GPOS)))
c277df42 1878 r->reganch |= ROPT_CHECK_ALL;
830247a4 1879 scan_commit(pRExC_state, &data);
c277df42
IZ
1880 SvREFCNT_dec(data.last_found);
1881
a0ed51b3 1882 longest_float_length = CHR_SVLEN(data.longest_float);
c5254dd6 1883 if (longest_float_length
c277df42
IZ
1884 || (data.flags & SF_FL_BEFORE_EOL
1885 && (!(data.flags & SF_FL_BEFORE_MEOL)
830247a4 1886 || (RExC_flags16 & PMf_MULTILINE)))) {
cf93c79d
IZ
1887 int t;
1888
a0ed51b3 1889 if (SvCUR(data.longest_fixed) /* ok to leave SvCUR */
aca2d497
IZ
1890 && data.offset_fixed == data.offset_float_min
1891 && SvCUR(data.longest_fixed) == SvCUR(data.longest_float))
1892 goto remove_float; /* As in (a)+. */
1893
c277df42
IZ
1894 r->float_substr = data.longest_float;
1895 r->float_min_offset = data.offset_float_min;
1896 r->float_max_offset = data.offset_float_max;
cf93c79d
IZ
1897 t = (data.flags & SF_FL_BEFORE_EOL /* Can't have SEOL and MULTI */
1898 && (!(data.flags & SF_FL_BEFORE_MEOL)
830247a4 1899 || (RExC_flags16 & PMf_MULTILINE)));
cf93c79d 1900 fbm_compile(r->float_substr, t ? FBMcf_TAIL : 0);
a0ed51b3
LW
1901 }
1902 else {
aca2d497 1903 remove_float:
c277df42
IZ
1904 r->float_substr = Nullsv;
1905 SvREFCNT_dec(data.longest_float);
c5254dd6 1906 longest_float_length = 0;
a0d0e21e 1907 }
c277df42 1908
a0ed51b3 1909 longest_fixed_length = CHR_SVLEN(data.longest_fixed);
c5254dd6 1910 if (longest_fixed_length
c277df42
IZ
1911 || (data.flags & SF_FIX_BEFORE_EOL /* Cannot have SEOL and MULTI */
1912 && (!(data.flags & SF_FIX_BEFORE_MEOL)
830247a4 1913 || (RExC_flags16 & PMf_MULTILINE)))) {
cf93c79d
IZ
1914 int t;
1915
c277df42
IZ
1916 r->anchored_substr = data.longest_fixed;
1917 r->anchored_offset = data.offset_fixed;
cf93c79d
IZ
1918 t = (data.flags & SF_FIX_BEFORE_EOL /* Can't have SEOL and MULTI */
1919 && (!(data.flags & SF_FIX_BEFORE_MEOL)
830247a4 1920 || (RExC_flags16 & PMf_MULTILINE)));
cf93c79d 1921 fbm_compile(r->anchored_substr, t ? FBMcf_TAIL : 0);
a0ed51b3
LW
1922 }
1923 else {
c277df42
IZ
1924 r->anchored_substr = Nullsv;
1925 SvREFCNT_dec(data.longest_fixed);
c5254dd6 1926 longest_fixed_length = 0;
a0d0e21e 1927 }
b81d288d 1928 if (r->regstclass
ffc61ed2 1929 && (OP(r->regstclass) == REG_ANY || OP(r->regstclass) == SANY))
653099ff
GS
1930 r->regstclass = NULL;
1931 if ((!r->anchored_substr || r->anchored_offset) && stclass_flag
1932 && !(data.start_class->flags & ANYOF_EOS)
1933 && !cl_is_anything(data.start_class)) {
830247a4 1934 I32 n = add_data(pRExC_state, 1, "f");
653099ff 1935
b81d288d 1936 New(1006, RExC_rx->data->data[n], 1,
653099ff
GS
1937 struct regnode_charclass_class);
1938 StructCopy(data.start_class,
830247a4 1939 (struct regnode_charclass_class*)RExC_rx->data->data[n],
653099ff 1940 struct regnode_charclass_class);
830247a4 1941 r->regstclass = (regnode*)RExC_rx->data->data[n];
653099ff 1942 r->reganch &= ~ROPT_SKIP; /* Used in find_byclass(). */
ffc61ed2 1943 PL_regdata = r->data; /* for regprop() */
9c5ffd7c
JH
1944 DEBUG_r({ SV *sv = sv_newmortal();
1945 regprop(sv, (regnode*)data.start_class);
1946 PerlIO_printf(Perl_debug_log,
1947 "synthetic stclass `%s'.\n",
1948 SvPVX(sv));});
653099ff 1949 }
c277df42
IZ
1950
1951 /* A temporary algorithm prefers floated substr to fixed one to dig more info. */
c5254dd6 1952 if (longest_fixed_length > longest_float_length) {
c277df42
IZ
1953 r->check_substr = r->anchored_substr;
1954 r->check_offset_min = r->check_offset_max = r->anchored_offset;
1955 if (r->reganch & ROPT_ANCH_SINGLE)
1956 r->reganch |= ROPT_NOSCAN;
a0ed51b3
LW
1957 }
1958 else {
c277df42
IZ
1959 r->check_substr = r->float_substr;
1960 r->check_offset_min = data.offset_float_min;
1961 r->check_offset_max = data.offset_float_max;
a0d0e21e 1962 }
30382c73
IZ
1963 /* XXXX Currently intuiting is not compatible with ANCH_GPOS.
1964 This should be changed ASAP! */
1965 if (r->check_substr && !(r->reganch & ROPT_ANCH_GPOS)) {
cad2e5aa
JH
1966 r->reganch |= RE_USE_INTUIT;
1967 if (SvTAIL(r->check_substr))
1968 r->reganch |= RE_INTUIT_TAIL;
1969 }
a0ed51b3
LW
1970 }
1971 else {
c277df42
IZ
1972 /* Several toplevels. Best we can is to set minlen. */
1973 I32 fake;
653099ff 1974 struct regnode_charclass_class ch_class;
cb434fcc 1975 I32 last_close = 0;
c277df42
IZ
1976
1977 DEBUG_r(PerlIO_printf(Perl_debug_log, "\n"));
1978 scan = r->program + 1;
830247a4 1979 cl_init(pRExC_state, &ch_class);
653099ff 1980 data.start_class = &ch_class;
cb434fcc 1981 data.last_closep = &last_close;
e1901655 1982 minlen = study_chunk(pRExC_state, &scan, &fake, scan + RExC_size, &data, SCF_DO_STCLASS_AND|SCF_WHILEM_VISITED_POS);
c277df42 1983 r->check_substr = r->anchored_substr = r->float_substr = Nullsv;
653099ff
GS
1984 if (!(data.start_class->flags & ANYOF_EOS)
1985 && !cl_is_anything(data.start_class)) {
830247a4 1986 I32 n = add_data(pRExC_state, 1, "f");
653099ff 1987
b81d288d 1988 New(1006, RExC_rx->data->data[n], 1,
653099ff
GS
1989 struct regnode_charclass_class);
1990 StructCopy(data.start_class,
830247a4 1991 (struct regnode_charclass_class*)RExC_rx->data->data[n],
653099ff 1992 struct regnode_charclass_class);
830247a4 1993 r->regstclass = (regnode*)RExC_rx->data->data[n];
653099ff 1994 r->reganch &= ~ROPT_SKIP; /* Used in find_byclass(). */
9c5ffd7c
JH
1995 DEBUG_r({ SV* sv = sv_newmortal();
1996 regprop(sv, (regnode*)data.start_class);
1997 PerlIO_printf(Perl_debug_log,
1998 "synthetic stclass `%s'.\n",
1999 SvPVX(sv));});
653099ff 2000 }
a0d0e21e
LW
2001 }
2002
a0d0e21e 2003 r->minlen = minlen;
b81d288d 2004 if (RExC_seen & REG_SEEN_GPOS)
c277df42 2005 r->reganch |= ROPT_GPOS_SEEN;
830247a4 2006 if (RExC_seen & REG_SEEN_LOOKBEHIND)
c277df42 2007 r->reganch |= ROPT_LOOKBEHIND_SEEN;
830247a4 2008 if (RExC_seen & REG_SEEN_EVAL)
ce862d02 2009 r->reganch |= ROPT_EVAL_SEEN;
3baa4c62
JH
2010 if (RExC_seen & REG_SEEN_SANY)
2011 r->reganch |= ROPT_SANY_SEEN;
830247a4
IZ
2012 Newz(1002, r->startp, RExC_npar, I32);
2013 Newz(1002, r->endp, RExC_npar, I32);
ffc61ed2 2014 PL_regdata = r->data; /* for regprop() */
a0d0e21e
LW
2015 DEBUG_r(regdump(r));
2016 return(r);
a687059c
LW
2017}
2018
2019/*
2020 - reg - regular expression, i.e. main body or parenthesized thing
2021 *
2022 * Caller must absorb opening parenthesis.
2023 *
2024 * Combining parenthesis handling with the base level of regular expression
2025 * is a trifle forced, but the need to tie the tails of the branches to what
2026 * follows makes it hard to avoid.
2027 */
76e3520e 2028STATIC regnode *
830247a4 2029S_reg(pTHX_ RExC_state_t *pRExC_state, I32 paren, I32 *flagp)
c277df42 2030 /* paren: Parenthesized? 0=top, 1=(, inside: changed to letter. */
a687059c 2031{
c277df42
IZ
2032 register regnode *ret; /* Will be the head of the group. */
2033 register regnode *br;
2034 register regnode *lastbr;
2035 register regnode *ender = 0;
a0d0e21e 2036 register I32 parno = 0;
830247a4 2037 I32 flags, oregflags = RExC_flags16, have_branch = 0, open = 0;
9d1d55b5
JP
2038
2039 /* for (?g), (?gc), and (?o) warnings; warning
2040 about (?c) will warn about (?g) -- japhy */
2041
2042 I32 wastedflags = 0x00,
2043 wasted_o = 0x01,
2044 wasted_g = 0x02,
2045 wasted_gc = 0x02 | 0x04,
2046 wasted_c = 0x04;
2047
fac92740 2048 char * parse_start = RExC_parse; /* MJD */
830247a4 2049 char *oregcomp_parse = RExC_parse;
c277df42 2050 char c;
a0d0e21e 2051
821b33a5 2052 *flagp = 0; /* Tentatively. */
a0d0e21e 2053
9d1d55b5 2054
a0d0e21e
LW
2055 /* Make an OPEN node, if parenthesized. */
2056 if (paren) {
fac92740 2057 if (*RExC_parse == '?') { /* (?...) */
ca9dfc88
IZ
2058 U16 posflags = 0, negflags = 0;
2059 U16 *flagsp = &posflags;
0f5d15d6 2060 int logical = 0;
830247a4 2061 char *seqstart = RExC_parse;
ca9dfc88 2062
830247a4
IZ
2063 RExC_parse++;
2064 paren = *RExC_parse++;
c277df42 2065 ret = NULL; /* For look-ahead/behind. */
a0d0e21e 2066 switch (paren) {
fac92740 2067 case '<': /* (?<...) */
830247a4 2068 RExC_seen |= REG_SEEN_LOOKBEHIND;
b81d288d 2069 if (*RExC_parse == '!')
c277df42 2070 paren = ',';
b81d288d 2071 if (*RExC_parse != '=' && *RExC_parse != '!')
c277df42 2072 goto unknown;
830247a4 2073 RExC_parse++;
fac92740
MJD
2074 case '=': /* (?=...) */
2075 case '!': /* (?!...) */
830247a4 2076 RExC_seen_zerolen++;
fac92740
MJD
2077 case ':': /* (?:...) */
2078 case '>': /* (?>...) */
a0d0e21e 2079 break;
fac92740
MJD
2080 case '$': /* (?$...) */
2081 case '@': /* (?@...) */
8615cb43 2082 vFAIL2("Sequence (?%c...) not implemented", (int)paren);
a0d0e21e 2083 break;
fac92740 2084 case '#': /* (?#...) */
830247a4
IZ
2085 while (*RExC_parse && *RExC_parse != ')')
2086 RExC_parse++;
2087 if (*RExC_parse != ')')
c277df42 2088 FAIL("Sequence (?#... not terminated");
830247a4 2089 nextchar(pRExC_state);
a0d0e21e
LW
2090 *flagp = TRYAGAIN;
2091 return NULL;
fac92740 2092 case 'p': /* (?p...) */
f9373011
PM
2093 if (SIZE_ONLY && ckWARN2(WARN_DEPRECATED, WARN_REGEXP))
2094 vWARNdep(RExC_parse, "(?p{}) is deprecated - use (??{})");
8c8ad484 2095 /* FALL THROUGH*/
fac92740 2096 case '?': /* (??...) */
0f5d15d6 2097 logical = 1;
830247a4 2098 paren = *RExC_parse++;
0f5d15d6 2099 /* FALL THROUGH */
fac92740 2100 case '{': /* (?{...}) */
c277df42 2101 {
c277df42
IZ
2102 I32 count = 1, n = 0;
2103 char c;
830247a4 2104 char *s = RExC_parse;
c277df42
IZ
2105 SV *sv;
2106 OP_4tree *sop, *rop;
2107
830247a4
IZ
2108 RExC_seen_zerolen++;
2109 RExC_seen |= REG_SEEN_EVAL;
2110 while (count && (c = *RExC_parse)) {
2111 if (c == '\\' && RExC_parse[1])
2112 RExC_parse++;
b81d288d 2113 else if (c == '{')
c277df42 2114 count++;
b81d288d 2115 else if (c == '}')
c277df42 2116 count--;
830247a4 2117 RExC_parse++;
c277df42 2118 }
830247a4 2119 if (*RExC_parse != ')')
b45f050a 2120 {
b81d288d 2121 RExC_parse = s;
b45f050a
JF
2122 vFAIL("Sequence (?{...}) not terminated or not {}-balanced");
2123 }
c277df42
IZ
2124 if (!SIZE_ONLY) {
2125 AV *av;
b81d288d
AB
2126
2127 if (RExC_parse - 1 - s)
830247a4 2128 sv = newSVpvn(s, RExC_parse - 1 - s);
c277df42 2129 else
79cb57f6 2130 sv = newSVpvn("", 0);
c277df42 2131
569233ed
SB
2132 ENTER;
2133 Perl_save_re_context(aTHX);
c277df42 2134 rop = sv_compile_2op(sv, &sop, "re", &av);
569233ed 2135 LEAVE;
c277df42 2136
830247a4
IZ
2137 n = add_data(pRExC_state, 3, "nop");
2138 RExC_rx->data->data[n] = (void*)rop;
2139 RExC_rx->data->data[n+1] = (void*)sop;
2140 RExC_rx->data->data[n+2] = (void*)av;
c277df42 2141 SvREFCNT_dec(sv);
a0ed51b3 2142 }
e24b16f9 2143 else { /* First pass */
830247a4 2144 if (PL_reginterp_cnt < ++RExC_seen_evals
e24b16f9 2145 && PL_curcop != &PL_compiling)
2cd61cdb
IZ
2146 /* No compiled RE interpolated, has runtime
2147 components ===> unsafe. */
2148 FAIL("Eval-group not allowed at runtime, use re 'eval'");
5b61d3f7 2149 if (PL_tainting && PL_tainted)
cc6b7395 2150 FAIL("Eval-group in insecure regular expression");
c277df42
IZ
2151 }
2152
830247a4 2153 nextchar(pRExC_state);
0f5d15d6 2154 if (logical) {
830247a4 2155 ret = reg_node(pRExC_state, LOGICAL);
0f5d15d6
IZ
2156 if (!SIZE_ONLY)
2157 ret->flags = 2;
830247a4 2158 regtail(pRExC_state, ret, reganode(pRExC_state, EVAL, n));
fac92740 2159 /* deal with the length of this later - MJD */
0f5d15d6
IZ
2160 return ret;
2161 }
830247a4 2162 return reganode(pRExC_state, EVAL, n);
c277df42 2163 }
fac92740 2164 case '(': /* (?(?{...})...) and (?(?=...)...) */
c277df42 2165 {
fac92740 2166 if (RExC_parse[0] == '?') { /* (?(?...)) */
b81d288d
AB
2167 if (RExC_parse[1] == '=' || RExC_parse[1] == '!'
2168 || RExC_parse[1] == '<'
830247a4 2169 || RExC_parse[1] == '{') { /* Lookahead or eval. */
c277df42
IZ
2170 I32 flag;
2171
830247a4 2172 ret = reg_node(pRExC_state, LOGICAL);
0f5d15d6
IZ
2173 if (!SIZE_ONLY)
2174 ret->flags = 1;
830247a4 2175 regtail(pRExC_state, ret, reg(pRExC_state, 1, &flag));
c277df42 2176 goto insert_if;
b81d288d 2177 }
a0ed51b3 2178 }
830247a4 2179 else if (RExC_parse[0] >= '1' && RExC_parse[0] <= '9' ) {
fac92740 2180 /* (?(1)...) */
830247a4 2181 parno = atoi(RExC_parse++);
c277df42 2182
830247a4
IZ
2183 while (isDIGIT(*RExC_parse))
2184 RExC_parse++;
fac92740
MJD
2185 ret = reganode(pRExC_state, GROUPP, parno);
2186
830247a4 2187 if ((c = *nextchar(pRExC_state)) != ')')
b45f050a 2188 vFAIL("Switch condition not recognized");
c277df42 2189 insert_if:
830247a4
IZ
2190 regtail(pRExC_state, ret, reganode(pRExC_state, IFTHEN, 0));
2191 br = regbranch(pRExC_state, &flags, 1);
c277df42 2192 if (br == NULL)
830247a4 2193 br = reganode(pRExC_state, LONGJMP, 0);
c277df42 2194 else
830247a4
IZ
2195 regtail(pRExC_state, br, reganode(pRExC_state, LONGJMP, 0));
2196 c = *nextchar(pRExC_state);
d1b80229
IZ
2197 if (flags&HASWIDTH)
2198 *flagp |= HASWIDTH;
c277df42 2199 if (c == '|') {
830247a4
IZ
2200 lastbr = reganode(pRExC_state, IFTHEN, 0); /* Fake one for optimizer. */
2201 regbranch(pRExC_state, &flags, 1);
2202 regtail(pRExC_state, ret, lastbr);
d1b80229
IZ
2203 if (flags&HASWIDTH)
2204 *flagp |= HASWIDTH;
830247a4 2205 c = *nextchar(pRExC_state);
a0ed51b3
LW
2206 }
2207 else
c277df42
IZ
2208 lastbr = NULL;
2209 if (c != ')')
8615cb43 2210 vFAIL("Switch (?(condition)... contains too many branches");
830247a4
IZ
2211 ender = reg_node(pRExC_state, TAIL);
2212 regtail(pRExC_state, br, ender);
c277df42 2213 if (lastbr) {
830247a4
IZ
2214 regtail(pRExC_state, lastbr, ender);
2215 regtail(pRExC_state, NEXTOPER(NEXTOPER(lastbr)), ender);
a0ed51b3
LW
2216 }
2217 else
830247a4 2218 regtail(pRExC_state, ret, ender);
c277df42 2219 return ret;
a0ed51b3
LW
2220 }
2221 else {
830247a4 2222 vFAIL2("Unknown switch condition (?(%.2s", RExC_parse);
c277df42
IZ
2223 }
2224 }
1b1626e4 2225 case 0:
830247a4 2226 RExC_parse--; /* for vFAIL to print correctly */
8615cb43 2227 vFAIL("Sequence (? incomplete");
1b1626e4 2228 break;
a0d0e21e 2229 default:
830247a4 2230 --RExC_parse;
fac92740 2231 parse_flags: /* (?i) */
830247a4 2232 while (*RExC_parse && strchr("iogcmsx", *RExC_parse)) {
9d1d55b5
JP
2233 /* (?g), (?gc) and (?o) are useless here
2234 and must be globally applied -- japhy */
2235
2236 if (*RExC_parse == 'o' || *RExC_parse == 'g') {
2237 if (SIZE_ONLY && ckWARN(WARN_REGEXP)) {
2238 I32 wflagbit = *RExC_parse == 'o' ? wasted_o : wasted_g;
2239 if (! (wastedflags & wflagbit) ) {
2240 wastedflags |= wflagbit;
2241 vWARN5(
2242 RExC_parse + 1,
2243 "Useless (%s%c) - %suse /%c modifier",
2244 flagsp == &negflags ? "?-" : "?",
2245 *RExC_parse,
2246 flagsp == &negflags ? "don't " : "",
2247 *RExC_parse
2248 );
2249 }
2250 }
2251 }
2252 else if (*RExC_parse == 'c') {
2253 if (SIZE_ONLY && ckWARN(WARN_REGEXP)) {
2254 if (! (wastedflags & wasted_c) ) {
2255 wastedflags |= wasted_gc;
2256 vWARN3(
2257 RExC_parse + 1,
2258 "Useless (%sc) - %suse /gc modifier",
2259 flagsp == &negflags ? "?-" : "?",
2260 flagsp == &negflags ? "don't " : ""
2261 );
2262 }
2263 }
2264 }
2265 else { pmflag(flagsp, *RExC_parse); }
2266
830247a4 2267 ++RExC_parse;
ca9dfc88 2268 }
830247a4 2269 if (*RExC_parse == '-') {
ca9dfc88 2270 flagsp = &negflags;
9d1d55b5 2271 wastedflags = 0; /* reset so (?g-c) warns twice */
830247a4 2272 ++RExC_parse;
ca9dfc88 2273 goto parse_flags;
48c036b1 2274 }
830247a4
IZ
2275 RExC_flags16 |= posflags;
2276 RExC_flags16 &= ~negflags;
2277 if (*RExC_parse == ':') {
2278 RExC_parse++;
ca9dfc88
IZ
2279 paren = ':';
2280 break;
2281 }
c277df42 2282 unknown:
830247a4
IZ
2283 if (*RExC_parse != ')') {
2284 RExC_parse++;
2285 vFAIL3("Sequence (%.*s...) not recognized", RExC_parse-seqstart, seqstart);
b45f050a 2286 }
830247a4 2287 nextchar(pRExC_state);
a0d0e21e
LW
2288 *flagp = TRYAGAIN;
2289 return NULL;
2290 }
2291 }
fac92740 2292 else { /* (...) */
830247a4
IZ
2293 parno = RExC_npar;
2294 RExC_npar++;
2295 ret = reganode(pRExC_state, OPEN, parno);
fac92740
MJD
2296 Set_Node_Length(ret, 1); /* MJD */
2297 Set_Node_Offset(ret, RExC_parse); /* MJD */
c277df42 2298 open = 1;
a0d0e21e 2299 }
a0ed51b3 2300 }
fac92740 2301 else /* ! paren */
a0d0e21e
LW
2302 ret = NULL;
2303
2304 /* Pick up the branches, linking them together. */
fac92740 2305 parse_start = RExC_parse; /* MJD */
830247a4 2306 br = regbranch(pRExC_state, &flags, 1);
fac92740
MJD
2307 /* branch_len = (paren != 0); */
2308
a0d0e21e
LW
2309 if (br == NULL)
2310 return(NULL);
830247a4
IZ
2311 if (*RExC_parse == '|') {
2312 if (!SIZE_ONLY && RExC_extralen) {
2313 reginsert(pRExC_state, BRANCHJ, br);
a0ed51b3 2314 }
fac92740 2315 else { /* MJD */
830247a4 2316 reginsert(pRExC_state, BRANCH, br);
fac92740
MJD
2317 Set_Node_Length(br, paren != 0);
2318 Set_Node_Offset_To_R(br-RExC_emit_start, parse_start-RExC_start);
2319 }
c277df42
IZ
2320 have_branch = 1;
2321 if (SIZE_ONLY)
830247a4 2322 RExC_extralen += 1; /* For BRANCHJ-BRANCH. */
a0ed51b3
LW
2323 }
2324 else if (paren == ':') {
c277df42
IZ
2325 *flagp |= flags&SIMPLE;
2326 }
2327 if (open) { /* Starts with OPEN. */
830247a4 2328 regtail(pRExC_state, ret, br); /* OPEN -> first. */
a0ed51b3
LW
2329 }
2330 else if (paren != '?') /* Not Conditional */
a0d0e21e 2331 ret = br;
821b33a5
IZ
2332 if (flags&HASWIDTH)
2333 *flagp |= HASWIDTH;
a0d0e21e 2334 *flagp |= flags&SPSTART;
c277df42 2335 lastbr = br;
830247a4
IZ
2336 while (*RExC_parse == '|') {
2337 if (!SIZE_ONLY && RExC_extralen) {
2338 ender = reganode(pRExC_state, LONGJMP,0);
2339 regtail(pRExC_state, NEXTOPER(NEXTOPER(lastbr)), ender); /* Append to the previous. */
c277df42
IZ
2340 }
2341 if (SIZE_ONLY)
830247a4
IZ
2342 RExC_extralen += 2; /* Account for LONGJMP. */
2343 nextchar(pRExC_state);
2344 br = regbranch(pRExC_state, &flags, 0);
fac92740 2345
a687059c 2346 if (br == NULL)
a0d0e21e 2347 return(NULL);
830247a4 2348 regtail(pRExC_state, lastbr, br); /* BRANCH -> BRANCH. */
c277df42 2349 lastbr = br;
821b33a5
IZ
2350 if (flags&HASWIDTH)
2351 *flagp |= HASWIDTH;
a687059c 2352 *flagp |= flags&SPSTART;
a0d0e21e
LW
2353 }
2354
c277df42
IZ
2355 if (have_branch || paren != ':') {
2356 /* Make a closing node, and hook it on the end. */
2357 switch (paren) {
2358 case ':':
830247a4 2359 ender = reg_node(pRExC_state, TAIL);
c277df42
IZ
2360 break;
2361 case 1:
830247a4 2362 ender = reganode(pRExC_state, CLOSE, parno);
fac92740
MJD
2363 Set_Node_Offset(ender,RExC_parse+1); /* MJD */
2364 Set_Node_Length(ender,1); /* MJD */
c277df42
IZ
2365 break;
2366 case '<':
c277df42
IZ
2367 case ',':
2368 case '=':
2369 case '!':
c277df42 2370 *flagp &= ~HASWIDTH;
821b33a5
IZ
2371 /* FALL THROUGH */
2372 case '>':
830247a4 2373 ender = reg_node(pRExC_state, SUCCEED);
c277df42
IZ
2374 break;
2375 case 0:
830247a4 2376 ender = reg_node(pRExC_state, END);
c277df42
IZ
2377 break;
2378 }
830247a4 2379 regtail(pRExC_state, lastbr, ender);
a0d0e21e 2380
c277df42
IZ
2381 if (have_branch) {
2382 /* Hook the tails of the branches to the closing node. */
2383 for (br = ret; br != NULL; br = regnext(br)) {
830247a4 2384 regoptail(pRExC_state, br, ender);
c277df42
IZ
2385 }
2386 }
a0d0e21e 2387 }
c277df42
IZ
2388
2389 {
2390 char *p;
2391 static char parens[] = "=!<,>";
2392
2393 if (paren && (p = strchr(parens, paren))) {
2394 int node = ((p - parens) % 2) ? UNLESSM : IFMATCH;
2395 int flag = (p - parens) > 1;
2396
2397 if (paren == '>')
2398 node = SUSPEND, flag = 0;
830247a4 2399 reginsert(pRExC_state, node,ret);
c277df42 2400 ret->flags = flag;
830247a4 2401 regtail(pRExC_state, ret, reg_node(pRExC_state, TAIL));
c277df42 2402 }
a0d0e21e
LW
2403 }
2404
2405 /* Check for proper termination. */
ce3e6498 2406 if (paren) {
830247a4
IZ
2407 RExC_flags16 = oregflags;
2408 if (RExC_parse >= RExC_end || *nextchar(pRExC_state) != ')') {
2409 RExC_parse = oregcomp_parse;
380a0633 2410 vFAIL("Unmatched (");
ce3e6498 2411 }
a0ed51b3 2412 }
830247a4
IZ
2413 else if (!paren && RExC_parse < RExC_end) {
2414 if (*RExC_parse == ')') {
2415 RExC_parse++;
380a0633 2416 vFAIL("Unmatched )");
a0ed51b3
LW
2417 }
2418 else
b45f050a 2419 FAIL("Junk on end of regexp"); /* "Can't happen". */
a0d0e21e
LW
2420 /* NOTREACHED */
2421 }
a687059c 2422
a0d0e21e 2423 return(ret);
a687059c
LW
2424}
2425
2426/*
2427 - regbranch - one alternative of an | operator
2428 *
2429 * Implements the concatenation operator.
2430 */
76e3520e 2431STATIC regnode *
830247a4 2432S_regbranch(pTHX_ RExC_state_t *pRExC_state, I32 *flagp, I32 first)
a687059c 2433{
c277df42
IZ
2434 register regnode *ret;
2435 register regnode *chain = NULL;
2436 register regnode *latest;
2437 I32 flags = 0, c = 0;
a0d0e21e 2438
b81d288d 2439 if (first)
c277df42
IZ
2440 ret = NULL;
2441 else {
b81d288d 2442 if (!SIZE_ONLY && RExC_extralen)
830247a4 2443 ret = reganode(pRExC_state, BRANCHJ,0);
fac92740 2444 else {
830247a4 2445 ret = reg_node(pRExC_state, BRANCH);
fac92740
MJD
2446 Set_Node_Length(ret, 1);
2447 }
c277df42
IZ
2448 }
2449
b81d288d 2450 if (!first && SIZE_ONLY)
830247a4 2451 RExC_extralen += 1; /* BRANCHJ */
b81d288d 2452
c277df42 2453 *flagp = WORST; /* Tentatively. */
a0d0e21e 2454
830247a4
IZ
2455 RExC_parse--;
2456 nextchar(pRExC_state);
2457 while (RExC_parse < RExC_end && *RExC_parse != '|' && *RExC_parse != ')') {
a0d0e21e 2458 flags &= ~TRYAGAIN;
830247a4 2459 latest = regpiece(pRExC_state, &flags);
a0d0e21e
LW
2460 if (latest == NULL) {
2461 if (flags & TRYAGAIN)
2462 continue;
2463 return(NULL);
a0ed51b3
LW
2464 }
2465 else if (ret == NULL)
c277df42 2466 ret = latest;
a0d0e21e 2467 *flagp |= flags&HASWIDTH;
c277df42 2468 if (chain == NULL) /* First piece. */
a0d0e21e
LW
2469 *flagp |= flags&SPSTART;
2470 else {
830247a4
IZ
2471 RExC_naughty++;
2472 regtail(pRExC_state, chain, latest);
a687059c 2473 }
a0d0e21e 2474 chain = latest;
c277df42
IZ
2475 c++;
2476 }
2477 if (chain == NULL) { /* Loop ran zero times. */
830247a4 2478 chain = reg_node(pRExC_state, NOTHING);
c277df42
IZ
2479 if (ret == NULL)
2480 ret = chain;
2481 }
2482 if (c == 1) {
2483 *flagp |= flags&SIMPLE;
a0d0e21e 2484 }
a687059c 2485
a0d0e21e 2486 return(ret);
a687059c
LW
2487}
2488
2489/*
2490 - regpiece - something followed by possible [*+?]
2491 *
2492 * Note that the branching code sequences used for ? and the general cases
2493 * of * and + are somewhat optimized: they use the same NOTHING node as
2494 * both the endmarker for their branch list and the body of the last branch.
2495 * It might seem that this node could be dispensed with entirely, but the
2496 * endmarker role is not redundant.
2497 */
76e3520e 2498STATIC regnode *
830247a4 2499S_regpiece(pTHX_ RExC_state_t *pRExC_state, I32 *flagp)
a687059c 2500{
c277df42 2501 register regnode *ret;
a0d0e21e
LW
2502 register char op;
2503 register char *next;
2504 I32 flags;
830247a4 2505 char *origparse = RExC_parse;
a0d0e21e
LW
2506 char *maxpos;
2507 I32 min;
c277df42 2508 I32 max = REG_INFTY;
fac92740 2509 char *parse_start;
a0d0e21e 2510
830247a4 2511 ret = regatom(pRExC_state, &flags);
a0d0e21e
LW
2512 if (ret == NULL) {
2513 if (flags & TRYAGAIN)
2514 *flagp |= TRYAGAIN;
2515 return(NULL);
2516 }
2517
830247a4 2518 op = *RExC_parse;
a0d0e21e 2519
830247a4 2520 if (op == '{' && regcurly(RExC_parse)) {
fac92740 2521 parse_start = RExC_parse; /* MJD */
830247a4 2522 next = RExC_parse + 1;
a0d0e21e
LW
2523 maxpos = Nullch;
2524 while (isDIGIT(*next) || *next == ',') {
2525 if (*next == ',') {
2526 if (maxpos)
2527 break;
2528 else
2529 maxpos = next;
a687059c 2530 }
a0d0e21e
LW
2531 next++;
2532 }
2533 if (*next == '}') { /* got one */
2534 if (!maxpos)
2535 maxpos = next;
830247a4
IZ
2536 RExC_parse++;
2537 min = atoi(RExC_parse);
a0d0e21e
LW
2538 if (*maxpos == ',')
2539 maxpos++;
2540 else
830247a4 2541 maxpos = RExC_parse;
a0d0e21e
LW
2542 max = atoi(maxpos);
2543 if (!max && *maxpos != '0')
c277df42
IZ
2544 max = REG_INFTY; /* meaning "infinity" */
2545 else if (max >= REG_INFTY)
8615cb43 2546 vFAIL2("Quantifier in {,} bigger than %d", REG_INFTY - 1);
830247a4
IZ
2547 RExC_parse = next;
2548 nextchar(pRExC_state);
a0d0e21e
LW
2549
2550 do_curly:
2551 if ((flags&SIMPLE)) {
830247a4
IZ
2552 RExC_naughty += 2 + RExC_naughty / 2;
2553 reginsert(pRExC_state, CURLY, ret);
fac92740
MJD
2554 Set_Node_Offset(ret, parse_start+1); /* MJD */
2555 Set_Node_Cur_Length(ret);
a0d0e21e
LW
2556 }
2557 else {
830247a4 2558 regnode *w = reg_node(pRExC_state, WHILEM);
2c2d71f5
JH
2559
2560 w->flags = 0;
830247a4
IZ
2561 regtail(pRExC_state, ret, w);
2562 if (!SIZE_ONLY && RExC_extralen) {
2563 reginsert(pRExC_state, LONGJMP,ret);
2564 reginsert(pRExC_state, NOTHING,ret);
c277df42
IZ
2565 NEXT_OFF(ret) = 3; /* Go over LONGJMP. */
2566 }
830247a4 2567 reginsert(pRExC_state, CURLYX,ret);
fac92740
MJD
2568 /* MJD hk */
2569 Set_Node_Offset(ret, parse_start+1);
2570 Set_Node_Length(ret,
2571 op == '{' ? (RExC_parse - parse_start) : 1);
2572
830247a4 2573 if (!SIZE_ONLY && RExC_extralen)
c277df42 2574 NEXT_OFF(ret) = 3; /* Go over NOTHING to LONGJMP. */
830247a4 2575 regtail(pRExC_state, ret, reg_node(pRExC_state, NOTHING));
c277df42 2576 if (SIZE_ONLY)
830247a4
IZ
2577 RExC_whilem_seen++, RExC_extralen += 3;
2578 RExC_naughty += 4 + RExC_naughty; /* compound interest */
a0d0e21e 2579 }
c277df42 2580 ret->flags = 0;
a0d0e21e
LW
2581
2582 if (min > 0)
821b33a5
IZ
2583 *flagp = WORST;
2584 if (max > 0)
2585 *flagp |= HASWIDTH;
a0d0e21e 2586 if (max && max < min)
8615cb43 2587 vFAIL("Can't do {n,m} with n > m");
c277df42
IZ
2588 if (!SIZE_ONLY) {
2589 ARG1_SET(ret, min);
2590 ARG2_SET(ret, max);
a687059c 2591 }
a687059c 2592
a0d0e21e 2593 goto nest_check;
a687059c 2594 }
a0d0e21e 2595 }
a687059c 2596
a0d0e21e
LW
2597 if (!ISMULT1(op)) {
2598 *flagp = flags;
a687059c 2599 return(ret);
a0d0e21e 2600 }
bb20fd44 2601
c277df42 2602#if 0 /* Now runtime fix should be reliable. */
b45f050a
JF
2603
2604 /* if this is reinstated, don't forget to put this back into perldiag:
2605
2606 =item Regexp *+ operand could be empty at {#} in regex m/%s/
2607
2608 (F) The part of the regexp subject to either the * or + quantifier
2609 could match an empty string. The {#} shows in the regular
2610 expression about where the problem was discovered.
2611
2612 */
2613
bb20fd44 2614 if (!(flags&HASWIDTH) && op != '?')
b45f050a 2615 vFAIL("Regexp *+ operand could be empty");
b81d288d 2616#endif
bb20fd44 2617
fac92740 2618 parse_start = RExC_parse;
830247a4 2619 nextchar(pRExC_state);
a0d0e21e 2620
821b33a5 2621 *flagp = (op != '+') ? (WORST|SPSTART|HASWIDTH) : (WORST|HASWIDTH);
a0d0e21e
LW
2622
2623 if (op == '*' && (flags&SIMPLE)) {
830247a4 2624 reginsert(pRExC_state, STAR, ret);
c277df42 2625 ret->flags = 0;
830247a4 2626 RExC_naughty += 4;
a0d0e21e
LW
2627 }
2628 else if (op == '*') {
2629 min = 0;
2630 goto do_curly;
a0ed51b3
LW
2631 }
2632 else if (op == '+' && (flags&SIMPLE)) {
830247a4 2633 reginsert(pRExC_state, PLUS, ret);
c277df42 2634 ret->flags = 0;
830247a4 2635 RExC_naughty += 3;
a0d0e21e
LW
2636 }
2637 else if (op == '+') {
2638 min = 1;
2639 goto do_curly;
a0ed51b3
LW
2640 }
2641 else if (op == '?') {
a0d0e21e
LW
2642 min = 0; max = 1;
2643 goto do_curly;
2644 }
2645 nest_check:
e476b1b5 2646 if (ckWARN(WARN_REGEXP) && !SIZE_ONLY && !(flags&HASWIDTH) && max > REG_INFTY/3) {
830247a4 2647 vWARN3(RExC_parse,
b45f050a 2648 "%.*s matches null string many times",
830247a4 2649 RExC_parse - origparse,
b45f050a 2650 origparse);
a0d0e21e
LW
2651 }
2652
830247a4
IZ
2653 if (*RExC_parse == '?') {
2654 nextchar(pRExC_state);
2655 reginsert(pRExC_state, MINMOD, ret);
2656 regtail(pRExC_state, ret, ret + NODE_STEP_REGNODE);
a0d0e21e 2657 }
830247a4
IZ
2658 if (ISMULT2(RExC_parse)) {
2659 RExC_parse++;
b45f050a
JF
2660 vFAIL("Nested quantifiers");
2661 }
a0d0e21e
LW
2662
2663 return(ret);
a687059c
LW
2664}
2665
2666/*
2667 - regatom - the lowest level
2668 *
2669 * Optimization: gobbles an entire sequence of ordinary characters so that
2670 * it can turn them into a single node, which is smaller to store and
2671 * faster to run. Backslashed characters are exceptions, each becoming a
2672 * separate node; the code is simpler that way and it's not worth fixing.
2673 *
b45f050a 2674 * [Yes, it is worth fixing, some scripts can run twice the speed.] */
76e3520e 2675STATIC regnode *
830247a4 2676S_regatom(pTHX_ RExC_state_t *pRExC_state, I32 *flagp)
a687059c 2677{
c277df42 2678 register regnode *ret = 0;
a0d0e21e
LW
2679 I32 flags;
2680
2681 *flagp = WORST; /* Tentatively. */
2682
2683tryagain:
830247a4 2684 switch (*RExC_parse) {
a0d0e21e 2685 case '^':
830247a4
IZ
2686 RExC_seen_zerolen++;
2687 nextchar(pRExC_state);
2688 if (RExC_flags16 & PMf_MULTILINE)
2689 ret = reg_node(pRExC_state, MBOL);
2690 else if (RExC_flags16 & PMf_SINGLELINE)
2691 ret = reg_node(pRExC_state, SBOL);
a0d0e21e 2692 else
830247a4 2693 ret = reg_node(pRExC_state, BOL);
fac92740 2694 Set_Node_Length(ret, 1); /* MJD */
a0d0e21e
LW
2695 break;
2696 case '$':
830247a4 2697 nextchar(pRExC_state);
b81d288d 2698 if (*RExC_parse)
830247a4
IZ
2699 RExC_seen_zerolen++;
2700 if (RExC_flags16 & PMf_MULTILINE)
2701 ret = reg_node(pRExC_state, MEOL);
2702 else if (RExC_flags16 & PMf_SINGLELINE)
2703 ret = reg_node(pRExC_state, SEOL);
a0d0e21e 2704 else
830247a4 2705 ret = reg_node(pRExC_state, EOL);
fac92740 2706 Set_Node_Length(ret, 1); /* MJD */
a0d0e21e
LW
2707 break;
2708 case '.':
830247a4 2709 nextchar(pRExC_state);
ffc61ed2
JH
2710 if (RExC_flags16 & PMf_SINGLELINE)
2711 ret = reg_node(pRExC_state, SANY);
2712 else
2713 ret = reg_node(pRExC_state, REG_ANY);
2714 *flagp |= HASWIDTH|SIMPLE;
830247a4 2715 RExC_naughty++;
fac92740 2716 Set_Node_Length(ret, 1); /* MJD */
a0d0e21e
LW
2717 break;
2718 case '[':
b45f050a 2719 {
830247a4 2720 char *oregcomp_parse = ++RExC_parse;
ffc61ed2 2721 ret = regclass(pRExC_state);
830247a4
IZ
2722 if (*RExC_parse != ']') {
2723 RExC_parse = oregcomp_parse;
b45f050a
JF
2724 vFAIL("Unmatched [");
2725 }
830247a4 2726 nextchar(pRExC_state);
a0d0e21e 2727 *flagp |= HASWIDTH|SIMPLE;
fac92740 2728 Set_Node_Length(ret, RExC_parse - oregcomp_parse + 1); /* MJD */
a0d0e21e 2729 break;
b45f050a 2730 }
a0d0e21e 2731 case '(':
830247a4
IZ
2732 nextchar(pRExC_state);
2733 ret = reg(pRExC_state, 1, &flags);
a0d0e21e 2734 if (ret == NULL) {
bf93d4cc 2735 if (flags & TRYAGAIN) {
830247a4 2736 if (RExC_parse == RExC_end) {
bf93d4cc
GS
2737 /* Make parent create an empty node if needed. */
2738 *flagp |= TRYAGAIN;
2739 return(NULL);
2740 }
a0d0e21e 2741 goto tryagain;
bf93d4cc 2742 }
a0d0e21e
LW
2743 return(NULL);
2744 }
c277df42 2745 *flagp |= flags&(HASWIDTH|SPSTART|SIMPLE);
a0d0e21e
LW
2746 break;
2747 case '|':
2748 case ')':
2749 if (flags & TRYAGAIN) {
2750 *flagp |= TRYAGAIN;
2751 return NULL;
2752 }
b45f050a 2753 vFAIL("Internal urp");
a0d0e21e
LW
2754 /* Supposed to be caught earlier. */
2755 break;
85afd4ae 2756 case '{':
830247a4
IZ
2757 if (!regcurly(RExC_parse)) {
2758 RExC_parse++;
85afd4ae
CS
2759 goto defchar;
2760 }
2761 /* FALL THROUGH */
a0d0e21e
LW
2762 case '?':
2763 case '+':
2764 case '*':
830247a4 2765 RExC_parse++;
b45f050a 2766 vFAIL("Quantifier follows nothing");
a0d0e21e
LW
2767 break;
2768 case '\\':
830247a4 2769 switch (*++RExC_parse) {
a0d0e21e 2770 case 'A':
830247a4
IZ
2771 RExC_seen_zerolen++;
2772 ret = reg_node(pRExC_state, SBOL);
a0d0e21e 2773 *flagp |= SIMPLE;
830247a4 2774 nextchar(pRExC_state);
fac92740 2775 Set_Node_Length(ret, 2); /* MJD */
a0d0e21e
LW
2776 break;
2777 case 'G':
830247a4
IZ
2778 ret = reg_node(pRExC_state, GPOS);
2779 RExC_seen |= REG_SEEN_GPOS;
a0d0e21e 2780 *flagp |= SIMPLE;
830247a4 2781 nextchar(pRExC_state);
fac92740 2782 Set_Node_Length(ret, 2); /* MJD */
a0d0e21e
LW
2783 break;
2784 case 'Z':
830247a4 2785 ret = reg_node(pRExC_state, SEOL);
a0d0e21e 2786 *flagp |= SIMPLE;
830247a4 2787 nextchar(pRExC_state);
a0d0e21e 2788 break;
b85d18e9 2789 case 'z':
830247a4 2790 ret = reg_node(pRExC_state, EOS);
b85d18e9 2791 *flagp |= SIMPLE;
830247a4
IZ
2792 RExC_seen_zerolen++; /* Do not optimize RE away */
2793 nextchar(pRExC_state);
fac92740 2794 Set_Node_Length(ret, 2); /* MJD */
b85d18e9 2795 break;
4a2d328f 2796 case 'C':
830247a4 2797 ret = reg_node(pRExC_state, SANY);
3baa4c62 2798 RExC_seen |= REG_SEEN_SANY;
a0ed51b3 2799 *flagp |= HASWIDTH|SIMPLE;
830247a4 2800 nextchar(pRExC_state);
fac92740 2801 Set_Node_Length(ret, 2); /* MJD */
a0ed51b3
LW
2802 break;
2803 case 'X':
830247a4 2804 ret = reg_node(pRExC_state, CLUMP);
a0ed51b3 2805 *flagp |= HASWIDTH;
830247a4 2806 nextchar(pRExC_state);
fac92740 2807 Set_Node_Length(ret, 2); /* MJD */
a0ed51b3 2808 break;
a0d0e21e 2809 case 'w':
ffc61ed2 2810 ret = reg_node(pRExC_state, LOC ? ALNUML : ALNUM);
a0d0e21e 2811 *flagp |= HASWIDTH|SIMPLE;
830247a4 2812 nextchar(pRExC_state);
fac92740 2813 Set_Node_Length(ret, 2); /* MJD */
a0d0e21e
LW
2814 break;
2815 case 'W':
ffc61ed2 2816 ret = reg_node(pRExC_state, LOC ? NALNUML : NALNUM);
a0d0e21e 2817 *flagp |= HASWIDTH|SIMPLE;
830247a4 2818 nextchar(pRExC_state);
fac92740 2819 Set_Node_Length(ret, 2); /* MJD */
a0d0e21e
LW
2820 break;
2821 case 'b':
830247a4
IZ
2822 RExC_seen_zerolen++;
2823 RExC_seen |= REG_SEEN_LOOKBEHIND;
ffc61ed2 2824 ret = reg_node(pRExC_state, LOC ? BOUNDL : BOUND);
a0d0e21e 2825 *flagp |= SIMPLE;
830247a4 2826 nextchar(pRExC_state);
fac92740 2827 Set_Node_Length(ret, 2); /* MJD */
a0d0e21e
LW
2828 break;
2829 case 'B':
830247a4
IZ
2830 RExC_seen_zerolen++;
2831 RExC_seen |= REG_SEEN_LOOKBEHIND;
ffc61ed2 2832 ret = reg_node(pRExC_state, LOC ? NBOUNDL : NBOUND);
a0d0e21e 2833 *flagp |= SIMPLE;
830247a4 2834 nextchar(pRExC_state);
fac92740 2835 Set_Node_Length(ret, 2); /* MJD */
a0d0e21e
LW
2836 break;
2837 case 's':
ffc61ed2 2838 ret = reg_node(pRExC_state, LOC ? SPACEL : SPACE);
a0d0e21e 2839 *flagp |= HASWIDTH|SIMPLE;
830247a4 2840 nextchar(pRExC_state);
fac92740 2841 Set_Node_Length(ret, 2); /* MJD */
a0d0e21e
LW
2842 break;
2843 case 'S':
ffc61ed2 2844 ret = reg_node(pRExC_state, LOC ? NSPACEL : NSPACE);
a0d0e21e 2845 *flagp |= HASWIDTH|SIMPLE;
830247a4 2846 nextchar(pRExC_state);
fac92740 2847 Set_Node_Length(ret, 2); /* MJD */
a0d0e21e
LW
2848 break;
2849 case 'd':
ffc61ed2 2850 ret = reg_node(pRExC_state, DIGIT);
a0d0e21e 2851 *flagp |= HASWIDTH|SIMPLE;
830247a4 2852 nextchar(pRExC_state);
fac92740 2853 Set_Node_Length(ret, 2); /* MJD */
a0d0e21e
LW
2854 break;
2855 case 'D':
ffc61ed2 2856 ret = reg_node(pRExC_state, NDIGIT);
a0d0e21e 2857 *flagp |= HASWIDTH|SIMPLE;
830247a4 2858 nextchar(pRExC_state);
fac92740 2859 Set_Node_Length(ret, 2); /* MJD */
a0d0e21e 2860 break;
a14b48bc
LW
2861 case 'p':
2862 case 'P':
3568d838 2863 {
830247a4 2864 char* oldregxend = RExC_end;
fac92740 2865 char* parse_start = RExC_parse;
a14b48bc 2866
830247a4 2867 if (RExC_parse[1] == '{') {
3568d838 2868 /* a lovely hack--pretend we saw [\pX] instead */
830247a4
IZ
2869 RExC_end = strchr(RExC_parse, '}');
2870 if (!RExC_end) {
2871 RExC_parse += 2;
2872 RExC_end = oldregxend;
b45f050a
JF
2873 vFAIL("Missing right brace on \\p{}");
2874 }
830247a4 2875 RExC_end++;
a14b48bc
LW
2876 }
2877 else
830247a4
IZ
2878 RExC_end = RExC_parse + 2;
2879 RExC_parse--;
a14b48bc 2880
ffc61ed2 2881 ret = regclass(pRExC_state);
a14b48bc 2882
830247a4
IZ
2883 RExC_end = oldregxend;
2884 RExC_parse--;
fac92740 2885 Set_Node_Cur_Length(ret); /* MJD */
830247a4 2886 nextchar(pRExC_state);
a14b48bc
LW
2887 *flagp |= HASWIDTH|SIMPLE;
2888 }
2889 break;
a0d0e21e
LW
2890 case 'n':
2891 case 'r':
2892 case 't':
2893 case 'f':
2894 case 'e':
2895 case 'a':
2896 case 'x':
2897 case 'c':
2898 case '0':
2899 goto defchar;
2900 case '1': case '2': case '3': case '4':
2901 case '5': case '6': case '7': case '8': case '9':
2902 {
830247a4 2903 I32 num = atoi(RExC_parse);
a0d0e21e 2904
830247a4 2905 if (num > 9 && num >= RExC_npar)
a0d0e21e
LW
2906 goto defchar;
2907 else {
fac92740 2908 char * parse_start = RExC_parse - 1; /* MJD */
830247a4
IZ
2909 while (isDIGIT(*RExC_parse))
2910 RExC_parse++;
b45f050a 2911
830247a4 2912 if (!SIZE_ONLY && num > RExC_rx->nparens)
9baa0206 2913 vFAIL("Reference to nonexistent group");
830247a4
IZ
2914 RExC_sawback = 1;
2915 ret = reganode(pRExC_state, FOLD
a0ed51b3 2916 ? (LOC ? REFFL : REFF)
c8756f30 2917 : REF, num);
a0d0e21e 2918 *flagp |= HASWIDTH;
fac92740
MJD
2919
2920 /* override incorrect value set in reganode MJD */
2921 Set_Node_Offset(ret, parse_start+1);
2922 Set_Node_Cur_Length(ret); /* MJD */
830247a4
IZ
2923 RExC_parse--;
2924 nextchar(pRExC_state);
a0d0e21e
LW
2925 }
2926 }
2927 break;
2928 case '\0':
830247a4 2929 if (RExC_parse >= RExC_end)
b45f050a 2930 FAIL("Trailing \\");
a0d0e21e
LW
2931 /* FALL THROUGH */
2932 default:
c9f97d15
IZ
2933 /* Do not generate `unrecognized' warnings here, we fall
2934 back into the quick-grab loop below */
a0d0e21e
LW
2935 goto defchar;
2936 }
2937 break;
4633a7c4
LW
2938
2939 case '#':
830247a4
IZ
2940 if (RExC_flags16 & PMf_EXTENDED) {
2941 while (RExC_parse < RExC_end && *RExC_parse != '\n') RExC_parse++;
2942 if (RExC_parse < RExC_end)
4633a7c4
LW
2943 goto tryagain;
2944 }
2945 /* FALL THROUGH */
2946
a0d0e21e 2947 default: {
ba210ebe 2948 register STRLEN len;
a0ed51b3 2949 register UV ender;
a0d0e21e 2950 register char *p;
c277df42 2951 char *oldp, *s;
ba210ebe 2952 STRLEN numlen;
fac92740 2953 char *parse_start = RExC_parse - 1;
a0d0e21e 2954
830247a4 2955 RExC_parse++;
a0d0e21e
LW
2956
2957 defchar:
830247a4 2958 ret = reg_node(pRExC_state, FOLD
a0ed51b3 2959 ? (LOC ? EXACTFL : EXACTF)
bbce6d69 2960 : EXACT);
cd439c50 2961 s = STRING(ret);
830247a4
IZ
2962 for (len = 0, p = RExC_parse - 1;
2963 len < 127 && p < RExC_end;
a0d0e21e
LW
2964 len++)
2965 {
2966 oldp = p;
5b5a24f7 2967
830247a4
IZ
2968 if (RExC_flags16 & PMf_EXTENDED)
2969 p = regwhite(p, RExC_end);
a0d0e21e
LW
2970 switch (*p) {
2971 case '^':
2972 case '$':
2973 case '.':
2974 case '[':
2975 case '(':
2976 case ')':
2977 case '|':
2978 goto loopdone;
2979 case '\\':
2980 switch (*++p) {
2981 case 'A':
2982 case 'G':
2983 case 'Z':
b85d18e9 2984 case 'z':
a0d0e21e
LW
2985 case 'w':
2986 case 'W':
2987 case 'b':
2988 case 'B':
2989 case 's':
2990 case 'S':
2991 case 'd':
2992 case 'D':
a14b48bc
LW
2993 case 'p':
2994 case 'P':
a0d0e21e
LW
2995 --p;
2996 goto loopdone;
2997 case 'n':
2998 ender = '\n';
2999 p++;
a687059c 3000 break;
a0d0e21e
LW
3001 case 'r':
3002 ender = '\r';
3003 p++;
a687059c 3004 break;
a0d0e21e
LW
3005 case 't':
3006 ender = '\t';
3007 p++;
a687059c 3008 break;
a0d0e21e
LW
3009 case 'f':
3010 ender = '\f';
3011 p++;
a687059c 3012 break;
a0d0e21e 3013 case 'e':
c7f1f016 3014 ender = ASCII_TO_NATIVE('\033');
a0d0e21e 3015 p++;
a687059c 3016 break;
a0d0e21e 3017 case 'a':
c7f1f016 3018 ender = ASCII_TO_NATIVE('\007');
a0d0e21e 3019 p++;
a687059c 3020 break;
a0d0e21e 3021 case 'x':
a0ed51b3
LW
3022 if (*++p == '{') {
3023 char* e = strchr(p, '}');
b81d288d 3024
b45f050a 3025 if (!e) {
830247a4 3026 RExC_parse = p + 1;
b45f050a
JF
3027 vFAIL("Missing right brace on \\x{}");
3028 }
de5f0749 3029 else {
b21ed0a9 3030 numlen = 1; /* allow underscores */
de35ba6f 3031 ender = (UV)scan_hex(p + 1, e - p - 1, &numlen);
aaa80028
JH
3032 if (ender > 0xff)
3033 RExC_utf8 = 1;
b21ed0a9
GS
3034 /* numlen is generous */
3035 if (numlen + len >= 127) {
a0ed51b3
LW
3036 p--;
3037 goto loopdone;
3038 }
3039 p = e + 1;
3040 }
a0ed51b3
LW
3041 }
3042 else {
b21ed0a9 3043 numlen = 0; /* disallow underscores */
dff6d3cd 3044 ender = (UV)scan_hex(p, 2, &numlen);
a0ed51b3
LW
3045 p += numlen;
3046 }
a687059c 3047 break;
a0d0e21e
LW
3048 case 'c':
3049 p++;
bbce6d69 3050 ender = UCHARAT(p++);
3051 ender = toCTRL(ender);
a687059c 3052 break;
a0d0e21e
LW
3053 case '0': case '1': case '2': case '3':case '4':
3054 case '5': case '6': case '7': case '8':case '9':
3055 if (*p == '0' ||
830247a4 3056 (isDIGIT(p[1]) && atoi(p) >= RExC_npar) ) {
b21ed0a9 3057 numlen = 0; /* disallow underscores */
dff6d3cd 3058 ender = (UV)scan_oct(p, 3, &numlen);
a0d0e21e
LW
3059 p += numlen;
3060 }
3061 else {
3062 --p;
3063 goto loopdone;
a687059c
LW
3064 }
3065 break;
a0d0e21e 3066 case '\0':
830247a4 3067 if (p >= RExC_end)
b45f050a 3068 FAIL("Trailing \\");
a687059c 3069 /* FALL THROUGH */
a0d0e21e 3070 default:
e476b1b5 3071 if (!SIZE_ONLY && ckWARN(WARN_REGEXP) && isALPHA(*p))
b45f050a 3072 vWARN2(p +1, "Unrecognized escape \\%c passed through", *p);
a0ed51b3 3073 goto normal_default;
a0d0e21e
LW
3074 }
3075 break;
a687059c 3076 default:
a0ed51b3 3077 normal_default:
fd400ab9 3078 if (UTF8_IS_START(*p) && UTF) {
5e12f4fb 3079 ender = utf8n_to_uvchr((U8*)p, RExC_end - p,
ba210ebe 3080 &numlen, 0);
a0ed51b3
LW
3081 p += numlen;
3082 }
3083 else
3084 ender = *p++;
a0d0e21e 3085 break;
a687059c 3086 }
830247a4
IZ
3087 if (RExC_flags16 & PMf_EXTENDED)
3088 p = regwhite(p, RExC_end);
a0ed51b3
LW
3089 if (UTF && FOLD) {
3090 if (LOC)
5e12f4fb 3091 ender = toLOWER_LC_uvchr(ender);
a0ed51b3
LW
3092 else
3093 ender = toLOWER_uni(ender);
3094 }
a0d0e21e
LW
3095 if (ISMULT2(p)) { /* Back off on ?+*. */
3096 if (len)
3097 p = oldp;
2b9d42f0 3098 else if (!UNI_IS_INVARIANT(NATIVE_TO_UNI(ender)) && UTF) {
830247a4 3099 reguni(pRExC_state, ender, s, &numlen);
a0ed51b3
LW
3100 s += numlen;
3101 len += numlen;
3102 }
a0d0e21e
LW
3103 else {
3104 len++;
cd439c50 3105 REGC(ender, s++);
a0d0e21e
LW
3106 }
3107 break;
a687059c 3108 }
2b9d42f0 3109 if (!UNI_IS_INVARIANT(NATIVE_TO_UNI(ender)) && UTF) {
830247a4 3110 reguni(pRExC_state, ender, s, &numlen);
a0ed51b3
LW
3111 s += numlen;
3112 len += numlen - 1;
3113 }
3114 else
cd439c50 3115 REGC(ender, s++);
a0d0e21e
LW
3116 }
3117 loopdone:
830247a4 3118 RExC_parse = p - 1;
fac92740 3119 Set_Node_Cur_Length(ret); /* MJD */
830247a4 3120 nextchar(pRExC_state);
793db0cb
JH
3121 {
3122 /* len is STRLEN which is unsigned, need to copy to signed */
3123 IV iv = len;
3124 if (iv < 0)
3125 vFAIL("Internal disaster");
3126 }
a0d0e21e
LW
3127 if (len > 0)
3128 *flagp |= HASWIDTH;
3129 if (len == 1)
3130 *flagp |= SIMPLE;
c277df42 3131 if (!SIZE_ONLY)
cd439c50
IZ
3132 STR_LEN(ret) = len;
3133 if (SIZE_ONLY)
830247a4 3134 RExC_size += STR_SZ(len);
cd439c50 3135 else
830247a4 3136 RExC_emit += STR_SZ(len);
a687059c 3137 }
a0d0e21e
LW
3138 break;
3139 }
a687059c 3140
a0d0e21e 3141 return(ret);
a687059c
LW
3142}
3143
873ef191 3144STATIC char *
cea2e8a9 3145S_regwhite(pTHX_ char *p, char *e)
5b5a24f7
CS
3146{
3147 while (p < e) {
3148 if (isSPACE(*p))
3149 ++p;
3150 else if (*p == '#') {
3151 do {
3152 p++;
3153 } while (p < e && *p != '\n');
3154 }
3155 else
3156 break;
3157 }
3158 return p;
3159}
3160
b8c5462f
JH
3161/* Parse POSIX character classes: [[:foo:]], [[=foo=]], [[.foo.]].
3162 Character classes ([:foo:]) can also be negated ([:^foo:]).
3163 Returns a named class id (ANYOF_XXX) if successful, -1 otherwise.
3164 Equivalence classes ([=foo=]) and composites ([.foo.]) are parsed,
3165 but trigger warnings because they are currently unimplemented. */
3166STATIC I32
830247a4 3167S_regpposixcc(pTHX_ RExC_state_t *pRExC_state, I32 value)
620e46c5
JH
3168{
3169 char *posixcc = 0;
936ed897 3170 I32 namedclass = OOB_NAMEDCLASS;
620e46c5 3171
830247a4 3172 if (value == '[' && RExC_parse + 1 < RExC_end &&
620e46c5 3173 /* I smell either [: or [= or [. -- POSIX has been here, right? */
830247a4
IZ
3174 (*RExC_parse == ':' ||
3175 *RExC_parse == '=' ||
3176 *RExC_parse == '.')) {
3177 char c = *RExC_parse;
3178 char* s = RExC_parse++;
b81d288d 3179
830247a4
IZ
3180 while (RExC_parse < RExC_end && *RExC_parse != c)
3181 RExC_parse++;
3182 if (RExC_parse == RExC_end)
620e46c5 3183 /* Grandfather lone [:, [=, [. */
830247a4 3184 RExC_parse = s;
620e46c5 3185 else {
830247a4 3186 char* t = RExC_parse++; /* skip over the c */
b8c5462f 3187
830247a4
IZ
3188 if (*RExC_parse == ']') {
3189 RExC_parse++; /* skip over the ending ] */
b8c5462f
JH
3190 posixcc = s + 1;
3191 if (*s == ':') {
3192 I32 complement = *posixcc == '^' ? *posixcc++ : 0;
3193 I32 skip = 5; /* the most common skip */
3194
3195 switch (*posixcc) {
3196 case 'a':
3197 if (strnEQ(posixcc, "alnum", 5))
3198 namedclass =
3199 complement ? ANYOF_NALNUMC : ANYOF_ALNUMC;
3200 else if (strnEQ(posixcc, "alpha", 5))
3201 namedclass =
3202 complement ? ANYOF_NALPHA : ANYOF_ALPHA;
3203 else if (strnEQ(posixcc, "ascii", 5))
3204 namedclass =
3205 complement ? ANYOF_NASCII : ANYOF_ASCII;
3206 break;
aaa51d5e
JF
3207 case 'b':
3208 if (strnEQ(posixcc, "blank", 5))
3209 namedclass =
3210 complement ? ANYOF_NBLANK : ANYOF_BLANK;
3211 break;
b8c5462f
JH
3212 case 'c':
3213 if (strnEQ(posixcc, "cntrl", 5))
3214 namedclass =
3215 complement ? ANYOF_NCNTRL : ANYOF_CNTRL;
3216 break;
3217 case 'd':
3218 if (strnEQ(posixcc, "digit", 5))
3219 namedclass =
3220 complement ? ANYOF_NDIGIT : ANYOF_DIGIT;
3221 break;
3222 case 'g':
3223 if (strnEQ(posixcc, "graph", 5))
3224 namedclass =
3225 complement ? ANYOF_NGRAPH : ANYOF_GRAPH;
3226 break;
3227 case 'l':
3228 if (strnEQ(posixcc, "lower", 5))
3229 namedclass =
3230 complement ? ANYOF_NLOWER : ANYOF_LOWER;
3231 break;
3232 case 'p':
3233 if (strnEQ(posixcc, "print", 5))
3234 namedclass =
3235 complement ? ANYOF_NPRINT : ANYOF_PRINT;
3236 else if (strnEQ(posixcc, "punct", 5))
3237 namedclass =
3238 complement ? ANYOF_NPUNCT : ANYOF_PUNCT;
3239 break;
3240 case 's':
3241 if (strnEQ(posixcc, "space", 5))
3242 namedclass =
aaa51d5e 3243 complement ? ANYOF_NPSXSPC : ANYOF_PSXSPC;
cc4319de 3244 break;
b8c5462f
JH
3245 case 'u':
3246 if (strnEQ(posixcc, "upper", 5))
3247 namedclass =
3248 complement ? ANYOF_NUPPER : ANYOF_UPPER;
3249 break;
3250 case 'w': /* this is not POSIX, this is the Perl \w */
3251 if (strnEQ(posixcc, "word", 4)) {
3252 namedclass =
3253 complement ? ANYOF_NALNUM : ANYOF_ALNUM;
3254 skip = 4;
3255 }
3256 break;
3257 case 'x':
3258 if (strnEQ(posixcc, "xdigit", 6)) {
3259 namedclass =
3260 complement ? ANYOF_NXDIGIT : ANYOF_XDIGIT;
3261 skip = 6;
3262 }
3263 break;
3264 }
ac561586
JH
3265 if (namedclass == OOB_NAMEDCLASS ||
3266 posixcc[skip] != ':' ||
3267 posixcc[skip+1] != ']')
b45f050a
JF
3268 {
3269 Simple_vFAIL3("POSIX class [:%.*s:] unknown",
3270 t - s - 1, s + 1);
3271 }
3272 } else if (!SIZE_ONLY) {
b8c5462f 3273 /* [[=foo=]] and [[.foo.]] are still future. */
b45f050a 3274
830247a4 3275 /* adjust RExC_parse so the warning shows after
b45f050a 3276 the class closes */
830247a4
IZ
3277 while (*RExC_parse && *RExC_parse != ']')
3278 RExC_parse++;
b45f050a
JF
3279 Simple_vFAIL3("POSIX syntax [%c %c] is reserved for future extensions", c, c);
3280 }
b8c5462f
JH
3281 } else {
3282 /* Maternal grandfather:
3283 * "[:" ending in ":" but not in ":]" */
830247a4 3284 RExC_parse = s;
767d463e 3285 }
620e46c5
JH
3286 }
3287 }
3288
b8c5462f
JH
3289 return namedclass;
3290}
3291
3292STATIC void
830247a4 3293S_checkposixcc(pTHX_ RExC_state_t *pRExC_state)
b8c5462f 3294{
e476b1b5 3295 if (!SIZE_ONLY && ckWARN(WARN_REGEXP) &&
830247a4
IZ
3296 (*RExC_parse == ':' ||
3297 *RExC_parse == '=' ||
3298 *RExC_parse == '.')) {
3299 char *s = RExC_parse;
93733859 3300 char c = *s++;
b8c5462f
JH
3301
3302 while(*s && isALNUM(*s))
3303 s++;
3304 if (*s && c == *s && s[1] == ']') {
b45f050a
JF
3305 vWARN3(s+2, "POSIX syntax [%c %c] belongs inside character classes", c, c);
3306
3307 /* [[=foo=]] and [[.foo.]] are still future. */
b8c5462f 3308 if (c == '=' || c == '.')
b45f050a 3309 {
830247a4 3310 /* adjust RExC_parse so the error shows after
b45f050a 3311 the class closes */
830247a4 3312 while (*RExC_parse && *RExC_parse++ != ']')
b45f050a
JF
3313 ;
3314 Simple_vFAIL3("POSIX syntax [%c %c] is reserved for future extensions", c, c);
3315 }
b8c5462f
JH
3316 }
3317 }
620e46c5
JH
3318}
3319
76e3520e 3320STATIC regnode *
830247a4 3321S_regclass(pTHX_ RExC_state_t *pRExC_state)
a687059c 3322{
ffc61ed2 3323 register UV value;
3568d838 3324 register IV prevvalue = OOB_UNICODE;
ffc61ed2 3325 register IV range = 0;
c277df42 3326 register regnode *ret;
ba210ebe 3327 STRLEN numlen;
ffc61ed2 3328 IV namedclass;
9c5ffd7c 3329 char *rangebegin = 0;
936ed897 3330 bool need_class = 0;
9c5ffd7c 3331 SV *listsv = Nullsv;
ffc61ed2
JH
3332 register char *e;
3333 UV n;
3568d838 3334 bool optimize_invert = TRUE;
ffc61ed2
JH
3335
3336 ret = reganode(pRExC_state, ANYOF, 0);
3337
3338 if (!SIZE_ONLY)
3339 ANYOF_FLAGS(ret) = 0;
3340
3341 if (*RExC_parse == '^') { /* Complement of range. */
3342 RExC_naughty++;
3343 RExC_parse++;
3344 if (!SIZE_ONLY)
3345 ANYOF_FLAGS(ret) |= ANYOF_INVERT;
3346 }
a0d0e21e 3347
936ed897 3348 if (SIZE_ONLY)
830247a4 3349 RExC_size += ANYOF_SKIP;
936ed897 3350 else {
830247a4 3351 RExC_emit += ANYOF_SKIP;
936ed897
IZ
3352 if (FOLD)
3353 ANYOF_FLAGS(ret) |= ANYOF_FOLD;
3354 if (LOC)
3355 ANYOF_FLAGS(ret) |= ANYOF_LOCALE;
ffc61ed2
JH
3356 ANYOF_BITMAP_ZERO(ret);
3357 listsv = newSVpvn("# comment\n", 10);
a0d0e21e 3358 }
b8c5462f 3359
e476b1b5 3360 if (!SIZE_ONLY && ckWARN(WARN_REGEXP))
830247a4 3361 checkposixcc(pRExC_state);
b8c5462f 3362
830247a4 3363 if (*RExC_parse == ']' || *RExC_parse == '-')
ffc61ed2
JH
3364 goto charclassloop; /* allow 1st char to be ] or - */
3365
830247a4 3366 while (RExC_parse < RExC_end && *RExC_parse != ']') {
ffc61ed2
JH
3367
3368 charclassloop:
3369
3370 namedclass = OOB_NAMEDCLASS; /* initialize as illegal */
3371
73b437c8 3372 if (!range)
830247a4 3373 rangebegin = RExC_parse;
ffc61ed2 3374 if (UTF) {
5e12f4fb 3375 value = utf8n_to_uvchr((U8*)RExC_parse,
3568d838
JH
3376 RExC_end - RExC_parse,
3377 &numlen, 0);
ffc61ed2
JH
3378 RExC_parse += numlen;
3379 }
3380 else
3381 value = UCHARAT(RExC_parse++);
620e46c5 3382 if (value == '[')
830247a4 3383 namedclass = regpposixcc(pRExC_state, value);
620e46c5 3384 else if (value == '\\') {
ffc61ed2 3385 if (UTF) {
5e12f4fb 3386 value = utf8n_to_uvchr((U8*)RExC_parse,
ffc61ed2
JH
3387 RExC_end - RExC_parse,
3388 &numlen, 0);
3389 RExC_parse += numlen;
3390 }
3391 else
3392 value = UCHARAT(RExC_parse++);
470c3474 3393 /* Some compilers cannot handle switching on 64-bit integer
ffc61ed2 3394 * values, therefore value cannot be an UV. Yes, this will
e2962f66
JH
3395 * be a problem later if we want switch on Unicode.
3396 * A similar issue a little bit later when switching on
3397 * namedclass. --jhi */
ffc61ed2 3398 switch ((I32)value) {
b8c5462f
JH
3399 case 'w': namedclass = ANYOF_ALNUM; break;
3400 case 'W': namedclass = ANYOF_NALNUM; break;
3401 case 's': namedclass = ANYOF_SPACE; break;
3402 case 'S': namedclass = ANYOF_NSPACE; break;
3403 case 'd': namedclass = ANYOF_DIGIT; break;
3404 case 'D': namedclass = ANYOF_NDIGIT; break;
ffc61ed2
JH
3405 case 'p':
3406 case 'P':
3407 if (*RExC_parse == '{') {
3408 e = strchr(RExC_parse++, '}');
3409 if (!e)
3410 vFAIL("Missing right brace on \\p{}");
3411 n = e - RExC_parse;
3412 }
3413 else {
3414 e = RExC_parse;
3415 n = 1;
3416 }
3417 if (!SIZE_ONLY) {
3418 if (value == 'p')
3419 Perl_sv_catpvf(aTHX_ listsv,
3420 "+utf8::%.*s\n", (int)n, RExC_parse);
3421 else
3422 Perl_sv_catpvf(aTHX_ listsv,
3423 "!utf8::%.*s\n", (int)n, RExC_parse);
3424 }
3425 RExC_parse = e + 1;
3426 ANYOF_FLAGS(ret) |= ANYOF_UNICODE;
3427 continue;
b8c5462f
JH
3428 case 'n': value = '\n'; break;
3429 case 'r': value = '\r'; break;
3430 case 't': value = '\t'; break;
3431 case 'f': value = '\f'; break;
3432 case 'b': value = '\b'; break;
c7f1f016
NIS
3433 case 'e': value = ASCII_TO_NATIVE('\033');break;
3434 case 'a': value = ASCII_TO_NATIVE('\007');break;
b8c5462f 3435 case 'x':
ffc61ed2
JH
3436 if (*RExC_parse == '{') {
3437 e = strchr(RExC_parse++, '}');
b81d288d 3438 if (!e)
ffc61ed2
JH
3439 vFAIL("Missing right brace on \\x{}");
3440 numlen = 1; /* allow underscores */
3441 value = (UV)scan_hex(RExC_parse,
3442 e - RExC_parse,
3443 &numlen);
3444 RExC_parse = e + 1;
3445 }
3446 else {
3447 numlen = 0; /* disallow underscores */
3448 value = (UV)scan_hex(RExC_parse, 2, &numlen);
3449 RExC_parse += numlen;
3450 }
b8c5462f
JH
3451 break;
3452 case 'c':
830247a4 3453 value = UCHARAT(RExC_parse++);
b8c5462f
JH
3454 value = toCTRL(value);
3455 break;
3456 case '0': case '1': case '2': case '3': case '4':
3457 case '5': case '6': case '7': case '8': case '9':
b21ed0a9 3458 numlen = 0; /* disallow underscores */
830247a4
IZ
3459 value = (UV)scan_oct(--RExC_parse, 3, &numlen);
3460 RExC_parse += numlen;
b8c5462f 3461 break;
1028017a 3462 default:
e476b1b5 3463 if (!SIZE_ONLY && ckWARN(WARN_REGEXP) && isALPHA(value))
ffc61ed2
JH
3464 vWARN2(RExC_parse,
3465 "Unrecognized escape \\%c in character class passed through",
3466 (int)value);
1028017a 3467 break;
b8c5462f 3468 }
ffc61ed2
JH
3469 } /* end of \blah */
3470
3471 if (namedclass > OOB_NAMEDCLASS) { /* this is a named class \blah */
3472
3473 if (!SIZE_ONLY && !need_class)
936ed897 3474 ANYOF_CLASS_ZERO(ret);
ffc61ed2 3475
936ed897 3476 need_class = 1;
ffc61ed2
JH
3477
3478 /* a bad range like a-\d, a-[:digit:] ? */
3479 if (range) {
73b437c8 3480 if (!SIZE_ONLY) {
e476b1b5 3481 if (ckWARN(WARN_REGEXP))
830247a4 3482 vWARN4(RExC_parse,
b45f050a 3483 "False [] range \"%*.*s\"",
830247a4
IZ
3484 RExC_parse - rangebegin,
3485 RExC_parse - rangebegin,
b45f050a 3486 rangebegin);
3568d838
JH
3487 if (prevvalue < 256) {
3488 ANYOF_BITMAP_SET(ret, prevvalue);
ffc61ed2
JH
3489 ANYOF_BITMAP_SET(ret, '-');
3490 }
3491 else {
3492 ANYOF_FLAGS(ret) |= ANYOF_UNICODE;
3493 Perl_sv_catpvf(aTHX_ listsv,
3568d838 3494 "%04"UVxf"\n%04"UVxf"\n", (UV)prevvalue, (UV) '-');
ffc61ed2 3495 }
b8c5462f 3496 }
ffc61ed2
JH
3497
3498 range = 0; /* this was not a true range */
73b437c8 3499 }
ffc61ed2 3500
73b437c8 3501 if (!SIZE_ONLY) {
3568d838
JH
3502 if (namedclass > OOB_NAMEDCLASS)
3503 optimize_invert = FALSE;
e2962f66
JH
3504 /* Possible truncation here but in some 64-bit environments
3505 * the compiler gets heartburn about switch on 64-bit values.
3506 * A similar issue a little earlier when switching on value.
98f323fa 3507 * --jhi */
e2962f66 3508 switch ((I32)namedclass) {
73b437c8
JH
3509 case ANYOF_ALNUM:
3510 if (LOC)
936ed897 3511 ANYOF_CLASS_SET(ret, ANYOF_ALNUM);
73b437c8
JH
3512 else {
3513 for (value = 0; value < 256; value++)
3514 if (isALNUM(value))
936ed897 3515 ANYOF_BITMAP_SET(ret, value);
73b437c8 3516 }
ffc61ed2 3517 Perl_sv_catpvf(aTHX_ listsv, "+utf8::IsWord\n");
73b437c8
JH
3518 break;
3519 case ANYOF_NALNUM:
3520 if (LOC)
936ed897 3521 ANYOF_CLASS_SET(ret, ANYOF_NALNUM);
73b437c8
JH
3522 else {
3523 for (value = 0; value < 256; value++)
3524 if (!isALNUM(value))
936ed897 3525 ANYOF_BITMAP_SET(ret, value);
73b437c8 3526 }
ffc61ed2 3527 Perl_sv_catpvf(aTHX_ listsv, "!utf8::IsWord\n");
73b437c8 3528 break;
ffc61ed2 3529 case ANYOF_ALNUMC:
73b437c8 3530 if (LOC)
ffc61ed2 3531 ANYOF_CLASS_SET(ret, ANYOF_ALNUMC);
73b437c8
JH
3532 else {
3533 for (value = 0; value < 256; value++)
ffc61ed2 3534 if (isALNUMC(value))
936ed897 3535 ANYOF_BITMAP_SET(ret, value);
73b437c8 3536 }
ffc61ed2 3537 Perl_sv_catpvf(aTHX_ listsv, "+utf8::IsAlnum\n");
73b437c8
JH
3538 break;
3539 case ANYOF_NALNUMC:
3540 if (LOC)
936ed897 3541 ANYOF_CLASS_SET(ret, ANYOF_NALNUMC);
73b437c8
JH
3542 else {
3543 for (value = 0; value < 256; value++)
3544 if (!isALNUMC(value))
936ed897 3545 ANYOF_BITMAP_SET(ret, value);
73b437c8 3546 }
ffc61ed2 3547 Perl_sv_catpvf(aTHX_ listsv, "!utf8::IsAlnum\n");
73b437c8
JH
3548 break;
3549 case ANYOF_ALPHA:
3550 if (LOC)
936ed897 3551 ANYOF_CLASS_SET(ret, ANYOF_ALPHA);
73b437c8
JH
3552 else {
3553 for (value = 0; value < 256; value++)
3554 if (isALPHA(value))
936ed897 3555 ANYOF_BITMAP_SET(ret, value);
73b437c8 3556 }
ffc61ed2 3557 Perl_sv_catpvf(aTHX_ listsv, "+utf8::IsAlpha\n");
73b437c8
JH
3558 break;
3559 case ANYOF_NALPHA:
3560 if (LOC)
936ed897 3561 ANYOF_CLASS_SET(ret, ANYOF_NALPHA);
73b437c8
JH
3562 else {
3563 for (value = 0; value < 256; value++)
3564 if (!isALPHA(value))
936ed897 3565 ANYOF_BITMAP_SET(ret, value);
73b437c8 3566 }
ffc61ed2 3567 Perl_sv_catpvf(aTHX_ listsv, "!utf8::IsAlpha\n");
73b437c8
JH
3568 break;
3569 case ANYOF_ASCII:
3570 if (LOC)
936ed897 3571 ANYOF_CLASS_SET(ret, ANYOF_ASCII);
73b437c8 3572 else {
c7f1f016 3573#ifndef EBCDIC
1ba5c669
JH
3574 for (value = 0; value < 128; value++)
3575 ANYOF_BITMAP_SET(ret, value);
3576#else /* EBCDIC */
ffbc6a93 3577 for (value = 0; value < 256; value++) {
3a3c4447
JH
3578 if (isASCII(value))
3579 ANYOF_BITMAP_SET(ret, value);
ffbc6a93 3580 }
1ba5c669 3581#endif /* EBCDIC */
73b437c8 3582 }
ffc61ed2 3583 Perl_sv_catpvf(aTHX_ listsv, "+utf8::IsASCII\n");
73b437c8
JH
3584 break;
3585 case ANYOF_NASCII:
3586 if (LOC)
936ed897 3587 ANYOF_CLASS_SET(ret, ANYOF_NASCII);
73b437c8 3588 else {
c7f1f016 3589#ifndef EBCDIC
1ba5c669
JH
3590 for (value = 128; value < 256; value++)
3591 ANYOF_BITMAP_SET(ret, value);
3592#else /* EBCDIC */
ffbc6a93 3593 for (value = 0; value < 256; value++) {
3a3c4447
JH
3594 if (!isASCII(value))
3595 ANYOF_BITMAP_SET(ret, value);
ffbc6a93 3596 }
1ba5c669 3597#endif /* EBCDIC */
73b437c8 3598 }
ffc61ed2 3599 Perl_sv_catpvf(aTHX_ listsv, "!utf8::IsASCII\n");
73b437c8 3600 break;
aaa51d5e
JF
3601 case ANYOF_BLANK:
3602 if (LOC)
3603 ANYOF_CLASS_SET(ret, ANYOF_BLANK);
3604 else {
3605 for (value = 0; value < 256; value++)
3606 if (isBLANK(value))
3607 ANYOF_BITMAP_SET(ret, value);
3608 }
ffc61ed2 3609 Perl_sv_catpvf(aTHX_ listsv, "+utf8::IsBlank\n");
aaa51d5e
JF
3610 break;
3611 case ANYOF_NBLANK:
3612 if (LOC)
3613 ANYOF_CLASS_SET(ret, ANYOF_NBLANK);
3614 else {
3615 for (value = 0; value < 256; value++)
3616 if (!isBLANK(value))
3617 ANYOF_BITMAP_SET(ret, value);
3618 }
ffc61ed2 3619 Perl_sv_catpvf(aTHX_ listsv, "!utf8::IsBlank\n");
aaa51d5e 3620 break;
73b437c8
JH
3621 case ANYOF_CNTRL:
3622 if (LOC)
936ed897 3623 ANYOF_CLASS_SET(ret, ANYOF_CNTRL);
73b437c8
JH
3624 else {
3625 for (value = 0; value < 256; value++)
3626 if (isCNTRL(value))
936ed897 3627 ANYOF_BITMAP_SET(ret, value);
73b437c8 3628 }
ffc61ed2 3629 Perl_sv_catpvf(aTHX_ listsv, "+utf8::IsCntrl\n");
73b437c8
JH
3630 break;
3631 case ANYOF_NCNTRL:
3632 if (LOC)
936ed897 3633 ANYOF_CLASS_SET(ret, ANYOF_NCNTRL);
73b437c8
JH
3634 else {
3635 for (value = 0; value < 256; value++)
3636 if (!isCNTRL(value))
936ed897 3637 ANYOF_BITMAP_SET(ret, value);
73b437c8 3638 }
ffc61ed2
JH
3639 Perl_sv_catpvf(aTHX_ listsv, "!utf8::IsCntrl\n");
3640 break;
3641 case ANYOF_DIGIT:
3642 if (LOC)
3643 ANYOF_CLASS_SET(ret, ANYOF_DIGIT);
3644 else {
3645 /* consecutive digits assumed */
3646 for (value = '0'; value <= '9'; value++)
3647 ANYOF_BITMAP_SET(ret, value);
3648 }
3649 Perl_sv_catpvf(aTHX_ listsv, "+utf8::IsDigit\n");
3650 break;
3651 case ANYOF_NDIGIT:
3652 if (LOC)
3653 ANYOF_CLASS_SET(ret, ANYOF_NDIGIT);
3654 else {
3655 /* consecutive digits assumed */
3656 for (value = 0; value < '0'; value++)
3657 ANYOF_BITMAP_SET(ret, value);
3658 for (value = '9' + 1; value < 256; value++)
3659 ANYOF_BITMAP_SET(ret, value);
3660 }
3661 Perl_sv_catpvf(aTHX_ listsv, "!utf8::IsDigit\n");
73b437c8
JH
3662 break;
3663 case ANYOF_GRAPH:
3664 if (LOC)
936ed897 3665 ANYOF_CLASS_SET(ret, ANYOF_GRAPH);
73b437c8
JH
3666 else {
3667 for (value = 0; value < 256; value++)
3668 if (isGRAPH(value))
936ed897 3669 ANYOF_BITMAP_SET(ret, value);
73b437c8 3670 }
ffc61ed2 3671 Perl_sv_catpvf(aTHX_ listsv, "+utf8::IsGraph\n");
73b437c8
JH
3672 break;
3673 case ANYOF_NGRAPH:
3674 if (LOC)
936ed897 3675 ANYOF_CLASS_SET(ret, ANYOF_NGRAPH);
73b437c8
JH
3676 else {
3677 for (value = 0; value < 256; value++)
3678 if (!isGRAPH(value))
936ed897 3679 ANYOF_BITMAP_SET(ret, value);
73b437c8 3680 }
ffc61ed2 3681 Perl_sv_catpvf(aTHX_ listsv, "!utf8::IsGraph\n");
73b437c8
JH
3682 break;
3683 case ANYOF_LOWER:
3684 if (LOC)
936ed897 3685 ANYOF_CLASS_SET(ret, ANYOF_LOWER);
73b437c8
JH
3686 else {
3687 for (value = 0; value < 256; value++)
3688 if (isLOWER(value))
936ed897 3689 ANYOF_BITMAP_SET(ret, value);
73b437c8 3690 }
ffc61ed2 3691 Perl_sv_catpvf(aTHX_ listsv, "+utf8::IsLower\n");
73b437c8
JH
3692 break;
3693 case ANYOF_NLOWER:
3694 if (LOC)
936ed897 3695 ANYOF_CLASS_SET(ret, ANYOF_NLOWER);
73b437c8
JH
3696 else {
3697 for (value = 0; value < 256; value++)
3698 if (!isLOWER(value))
936ed897 3699 ANYOF_BITMAP_SET(ret, value);
73b437c8 3700 }
ffc61ed2 3701 Perl_sv_catpvf(aTHX_ listsv, "!utf8::IsLower\n");
73b437c8
JH
3702 break;
3703 case ANYOF_PRINT:
3704 if (LOC)
936ed897 3705 ANYOF_CLASS_SET(ret, ANYOF_PRINT);
73b437c8
JH
3706 else {
3707 for (value = 0; value < 256; value++)
3708 if (isPRINT(value))
936ed897 3709 ANYOF_BITMAP_SET(ret, value);
73b437c8 3710 }
ffc61ed2 3711 Perl_sv_catpvf(aTHX_ listsv, "+utf8::IsPrint\n");
73b437c8
JH
3712 break;
3713 case ANYOF_NPRINT:
3714 if (LOC)
936ed897 3715 ANYOF_CLASS_SET(ret, ANYOF_NPRINT);
73b437c8
JH
3716 else {
3717 for (value = 0; value < 256; value++)
3718 if (!isPRINT(value))
936ed897 3719 ANYOF_BITMAP_SET(ret, value);
73b437c8 3720 }
ffc61ed2 3721 Perl_sv_catpvf(aTHX_ listsv, "!utf8::IsPrint\n");
73b437c8 3722 break;
aaa51d5e
JF
3723 case ANYOF_PSXSPC:
3724 if (LOC)
3725 ANYOF_CLASS_SET(ret, ANYOF_PSXSPC);
3726 else {
3727 for (value = 0; value < 256; value++)
3728 if (isPSXSPC(value))
3729 ANYOF_BITMAP_SET(ret, value);
3730 }
ffc61ed2 3731 Perl_sv_catpvf(aTHX_ listsv, "+utf8::IsSpace\n");
aaa51d5e
JF
3732 break;
3733 case ANYOF_NPSXSPC:
3734 if (LOC)
3735 ANYOF_CLASS_SET(ret, ANYOF_NPSXSPC);
3736 else {
3737 for (value = 0; value < 256; value++)
3738 if (!isPSXSPC(value))
3739 ANYOF_BITMAP_SET(ret, value);
3740 }
ffc61ed2 3741 Perl_sv_catpvf(aTHX_ listsv, "!utf8::IsSpace\n");
aaa51d5e 3742 break;
73b437c8
JH
3743 case ANYOF_PUNCT:
3744 if (LOC)
936ed897 3745 ANYOF_CLASS_SET(ret, ANYOF_PUNCT);
73b437c8
JH
3746 else {
3747 for (value = 0; value < 256; value++)
3748 if (isPUNCT(value))
936ed897 3749 ANYOF_BITMAP_SET(ret, value);
73b437c8 3750 }
ffc61ed2 3751 Perl_sv_catpvf(aTHX_ listsv, "+utf8::IsPunct\n");
73b437c8
JH
3752 break;
3753 case ANYOF_NPUNCT:
3754 if (LOC)
936ed897 3755 ANYOF_CLASS_SET(ret, ANYOF_NPUNCT);
73b437c8
JH
3756 else {
3757 for (value = 0; value < 256; value++)
3758 if (!isPUNCT(value))
936ed897 3759 ANYOF_BITMAP_SET(ret, value);
73b437c8 3760 }
ffc61ed2
JH
3761 Perl_sv_catpvf(aTHX_ listsv, "!utf8::IsPunct\n");
3762 break;
3763 case ANYOF_SPACE:
3764 if (LOC)
3765 ANYOF_CLASS_SET(ret, ANYOF_SPACE);
3766 else {
3767 for (value = 0; value < 256; value++)
3768 if (isSPACE(value))
3769 ANYOF_BITMAP_SET(ret, value);
3770 }
3771 Perl_sv_catpvf(aTHX_ listsv, "+utf8::IsSpacePerl\n");
3772 break;
3773 case ANYOF_NSPACE:
3774 if (LOC)
3775 ANYOF_CLASS_SET(ret, ANYOF_NSPACE);
3776 else {
3777 for (value = 0; value < 256; value++)
3778 if (!isSPACE(value))
3779 ANYOF_BITMAP_SET(ret, value);
3780 }
3781 Perl_sv_catpvf(aTHX_ listsv, "!utf8::IsSpacePerl\n");
73b437c8
JH
3782 break;
3783 case ANYOF_UPPER:
3784 if (LOC)
936ed897 3785 ANYOF_CLASS_SET(ret, ANYOF_UPPER);
73b437c8
JH
3786 else {
3787 for (value = 0; value < 256; value++)
3788 if (isUPPER(value))
936ed897 3789 ANYOF_BITMAP_SET(ret, value);
73b437c8 3790 }
ffc61ed2 3791 Perl_sv_catpvf(aTHX_ listsv, "+utf8::IsUpper\n");
73b437c8
JH
3792 break;
3793 case ANYOF_NUPPER:
3794 if (LOC)
936ed897 3795 ANYOF_CLASS_SET(ret, ANYOF_NUPPER);
73b437c8
JH
3796 else {
3797 for (value = 0; value < 256; value++)
3798 if (!isUPPER(value))
936ed897 3799 ANYOF_BITMAP_SET(ret, value);
73b437c8 3800 }
ffc61ed2 3801 Perl_sv_catpvf(aTHX_ listsv, "!utf8::IsUpper\n");
73b437c8
JH
3802 break;
3803 case ANYOF_XDIGIT:
3804 if (LOC)
936ed897 3805 ANYOF_CLASS_SET(ret, ANYOF_XDIGIT);
73b437c8
JH
3806 else {
3807 for (value = 0; value < 256; value++)
3808 if (isXDIGIT(value))
936ed897 3809 ANYOF_BITMAP_SET(ret, value);
73b437c8 3810 }
ffc61ed2 3811 Perl_sv_catpvf(aTHX_ listsv, "+utf8::IsXDigit\n");
73b437c8
JH
3812 break;
3813 case ANYOF_NXDIGIT:
3814 if (LOC)
936ed897 3815 ANYOF_CLASS_SET(ret, ANYOF_NXDIGIT);
73b437c8
JH
3816 else {
3817 for (value = 0; value < 256; value++)
3818 if (!isXDIGIT(value))
936ed897 3819 ANYOF_BITMAP_SET(ret, value);
73b437c8 3820 }
ffc61ed2 3821 Perl_sv_catpvf(aTHX_ listsv, "!utf8::IsXDigit\n");
73b437c8
JH
3822 break;
3823 default:
b45f050a 3824 vFAIL("Invalid [::] class");
73b437c8 3825 break;
b8c5462f 3826 }
b8c5462f 3827 if (LOC)
936ed897 3828 ANYOF_FLAGS(ret) |= ANYOF_CLASS;
73b437c8 3829 continue;
a0d0e21e 3830 }
ffc61ed2
JH
3831 } /* end of namedclass \blah */
3832
a0d0e21e 3833 if (range) {
3a3c4447 3834 if (prevvalue > value) /* b-a */ {
b45f050a 3835 Simple_vFAIL4("Invalid [] range \"%*.*s\"",
830247a4
IZ
3836 RExC_parse - rangebegin,
3837 RExC_parse - rangebegin,
b45f050a 3838 rangebegin);
3568d838 3839 range = 0; /* not a valid range */
73b437c8 3840 }
a0d0e21e
LW
3841 }
3842 else {
3568d838 3843 prevvalue = value; /* save the beginning of the range */
830247a4
IZ
3844 if (*RExC_parse == '-' && RExC_parse+1 < RExC_end &&
3845 RExC_parse[1] != ']') {
3846 RExC_parse++;
ffc61ed2
JH
3847
3848 /* a bad range like \w-, [:word:]- ? */
3849 if (namedclass > OOB_NAMEDCLASS) {
e476b1b5 3850 if (ckWARN(WARN_REGEXP))
830247a4 3851 vWARN4(RExC_parse,
b45f050a 3852 "False [] range \"%*.*s\"",
830247a4
IZ
3853 RExC_parse - rangebegin,
3854 RExC_parse - rangebegin,
b45f050a 3855 rangebegin);
73b437c8 3856 if (!SIZE_ONLY)
936ed897 3857 ANYOF_BITMAP_SET(ret, '-');
73b437c8 3858 } else
ffc61ed2
JH
3859 range = 1; /* yeah, it's a range! */
3860 continue; /* but do it the next time */
a0d0e21e 3861 }
a687059c 3862 }
ffc61ed2 3863
93733859 3864 /* now is the next time */
ae5c130c 3865 if (!SIZE_ONLY) {
3568d838
JH
3866 IV i;
3867
3868 if (prevvalue < 256) {
3869 IV ceilvalue = value < 256 ? value : 255;
3870
3871#ifdef EBCDIC
3a3c4447
JH
3872 if ((isLOWER(prevvalue) && isLOWER(ceilvalue)) ||
3873 (isUPPER(prevvalue) && isUPPER(ceilvalue)))
ffc61ed2 3874 {
3568d838
JH
3875 if (isLOWER(prevvalue)) {
3876 for (i = prevvalue; i <= ceilvalue; i++)
ffc61ed2
JH
3877 if (isLOWER(i))
3878 ANYOF_BITMAP_SET(ret, i);
3879 } else {
3568d838 3880 for (i = prevvalue; i <= ceilvalue; i++)
ffc61ed2
JH
3881 if (isUPPER(i))
3882 ANYOF_BITMAP_SET(ret, i);
3883 }
8ada0baa 3884 }
ffc61ed2 3885 else
8ada0baa 3886#endif
3568d838
JH
3887 for (i = prevvalue; i <= ceilvalue; i++)
3888 ANYOF_BITMAP_SET(ret, i);
3889 }
3890 if (value > 255) {
ffc61ed2 3891 ANYOF_FLAGS(ret) |= ANYOF_UNICODE;
3568d838 3892 if (prevvalue < value)
ffc61ed2 3893 Perl_sv_catpvf(aTHX_ listsv, "%04"UVxf"\t%04"UVxf"\n",
3568d838
JH
3894 (UV)prevvalue, (UV)value);
3895 else if (prevvalue == value)
ffc61ed2
JH
3896 Perl_sv_catpvf(aTHX_ listsv, "%04"UVxf"\n",
3897 (UV)value);
3898 }
8ada0baa 3899 }
ffc61ed2
JH
3900
3901 range = 0; /* this range (if it was one) is done now */
a0d0e21e 3902 }
ffc61ed2 3903
936ed897 3904 if (need_class) {
4f66b38d 3905 ANYOF_FLAGS(ret) |= ANYOF_LARGE;
936ed897 3906 if (SIZE_ONLY)
830247a4 3907 RExC_size += ANYOF_CLASS_ADD_SKIP;
936ed897 3908 else
830247a4 3909 RExC_emit += ANYOF_CLASS_ADD_SKIP;
936ed897 3910 }
ffc61ed2 3911
ae5c130c 3912 /* optimize case-insensitive simple patterns (e.g. /[a-z]/i) */
b8c5462f 3913 if (!SIZE_ONLY &&
ffc61ed2 3914 /* If the only flag is folding (plus possibly inversion). */
516a5887
JH
3915 ((ANYOF_FLAGS(ret) & (ANYOF_FLAGS_ALL ^ ANYOF_INVERT)) == ANYOF_FOLD)
3916 ) {
a0ed51b3 3917 for (value = 0; value < 256; ++value) {
936ed897 3918 if (ANYOF_BITMAP_TEST(ret, value)) {
ffc61ed2
JH
3919 IV fold = PL_fold[value];
3920
3921 if (fold != value)
3922 ANYOF_BITMAP_SET(ret, fold);
ae5c130c
GS
3923 }
3924 }
936ed897 3925 ANYOF_FLAGS(ret) &= ~ANYOF_FOLD;
ae5c130c 3926 }
ffc61ed2 3927
ae5c130c 3928 /* optimize inverted simple patterns (e.g. [^a-z]) */
3568d838 3929 if (!SIZE_ONLY && optimize_invert &&
ffc61ed2
JH
3930 /* If the only flag is inversion. */
3931 (ANYOF_FLAGS(ret) & ANYOF_FLAGS_ALL) == ANYOF_INVERT) {
b8c5462f 3932 for (value = 0; value < ANYOF_BITMAP_SIZE; ++value)
936ed897 3933 ANYOF_BITMAP(ret)[value] ^= ANYOF_FLAGS_ALL;
1aa99e6b 3934 ANYOF_FLAGS(ret) = ANYOF_UNICODE_ALL;
ae5c130c 3935 }
a0d0e21e 3936
b81d288d 3937 if (!SIZE_ONLY) {
fde631ed 3938 AV *av = newAV();
ffc61ed2
JH
3939 SV *rv;
3940
3941 av_store(av, 0, listsv);
3942 av_store(av, 1, NULL);
3943 rv = newRV_noinc((SV*)av);
19860706 3944 n = add_data(pRExC_state, 1, "s");
830247a4 3945 RExC_rx->data->data[n] = (void*)rv;
ffc61ed2 3946 ARG_SET(ret, n);
a0ed51b3
LW
3947 }
3948
3949 return ret;
3950}
3951
76e3520e 3952STATIC char*
830247a4 3953S_nextchar(pTHX_ RExC_state_t *pRExC_state)
a0d0e21e 3954{
830247a4 3955 char* retval = RExC_parse++;
a0d0e21e 3956
4633a7c4 3957 for (;;) {
830247a4
IZ
3958 if (*RExC_parse == '(' && RExC_parse[1] == '?' &&
3959 RExC_parse[2] == '#') {
3960 while (*RExC_parse && *RExC_parse != ')')
3961 RExC_parse++;
3962 RExC_parse++;
4633a7c4
LW
3963 continue;
3964 }
830247a4
IZ
3965 if (RExC_flags16 & PMf_EXTENDED) {
3966 if (isSPACE(*RExC_parse)) {
3967 RExC_parse++;
748a9306
LW
3968 continue;
3969 }
830247a4
IZ
3970 else if (*RExC_parse == '#') {
3971 while (*RExC_parse && *RExC_parse != '\n')
3972 RExC_parse++;
3973 RExC_parse++;
748a9306
LW
3974 continue;
3975 }
748a9306 3976 }
4633a7c4 3977 return retval;
a0d0e21e 3978 }
a687059c
LW
3979}
3980
3981/*
c277df42 3982- reg_node - emit a node
a0d0e21e 3983*/
76e3520e 3984STATIC regnode * /* Location. */
830247a4 3985S_reg_node(pTHX_ RExC_state_t *pRExC_state, U8 op)
a687059c 3986{
c277df42
IZ
3987 register regnode *ret;
3988 register regnode *ptr;
a687059c 3989
830247a4 3990 ret = RExC_emit;
c277df42 3991 if (SIZE_ONLY) {
830247a4
IZ
3992 SIZE_ALIGN(RExC_size);
3993 RExC_size += 1;
a0d0e21e
LW
3994 return(ret);
3995 }
a687059c 3996
c277df42 3997 NODE_ALIGN_FILL(ret);
a0d0e21e 3998 ptr = ret;
c277df42 3999 FILL_ADVANCE_NODE(ptr, op);
fac92740
MJD
4000 if (RExC_offsets) { /* MJD */
4001 MJD_OFFSET_DEBUG((stderr, "%s:%u: (op %s) %s %u <- %u (len %u) (max %u).\n",
4002 "reg_node", __LINE__,
4003 reg_name[op],
4004 RExC_emit - RExC_emit_start > RExC_offsets[0]
4005 ? "Overwriting end of array!\n" : "OK",
4006 RExC_emit - RExC_emit_start,
4007 RExC_parse - RExC_start,
4008 RExC_offsets[0]));
4009 Set_Node_Offset(RExC_emit, RExC_parse + (op == END));
4010 }
4011
830247a4 4012 RExC_emit = ptr;
a687059c 4013
a0d0e21e 4014 return(ret);
a687059c
LW
4015}
4016
4017/*
a0d0e21e
LW
4018- reganode - emit a node with an argument
4019*/
76e3520e 4020STATIC regnode * /* Location. */
830247a4 4021S_reganode(pTHX_ RExC_state_t *pRExC_state, U8 op, U32 arg)
fe14fcc3 4022{
c277df42
IZ
4023 register regnode *ret;
4024 register regnode *ptr;
fe14fcc3 4025
830247a4 4026 ret = RExC_emit;
c277df42 4027 if (SIZE_ONLY) {
830247a4
IZ
4028 SIZE_ALIGN(RExC_size);
4029 RExC_size += 2;
a0d0e21e
LW
4030 return(ret);
4031 }
fe14fcc3 4032
c277df42 4033 NODE_ALIGN_FILL(ret);
a0d0e21e 4034 ptr = ret;
c277df42 4035 FILL_ADVANCE_NODE_ARG(ptr, op, arg);
fac92740
MJD
4036 if (RExC_offsets) { /* MJD */
4037 MJD_OFFSET_DEBUG((stderr, "%s: %s %u <- %u (max %u).\n",
4038 "reganode",
4039 RExC_emit - RExC_emit_start > RExC_offsets[0] ?
4040 "Overwriting end of array!\n" : "OK",
4041 RExC_emit - RExC_emit_start,
4042 RExC_parse - RExC_start,
4043 RExC_offsets[0]));
4044 Set_Cur_Node_Offset;
4045 }
4046
830247a4 4047 RExC_emit = ptr;
fe14fcc3 4048
a0d0e21e 4049 return(ret);
fe14fcc3
LW
4050}
4051
4052/*
cd439c50 4053- reguni - emit (if appropriate) a Unicode character
a0ed51b3
LW
4054*/
4055STATIC void
830247a4 4056S_reguni(pTHX_ RExC_state_t *pRExC_state, UV uv, char* s, STRLEN* lenp)
a0ed51b3 4057{
5e12f4fb 4058 *lenp = SIZE_ONLY ? UNISKIP(uv) : (uvchr_to_utf8((U8*)s, uv) - (U8*)s);
a0ed51b3
LW
4059}
4060
4061/*
a0d0e21e
LW
4062- reginsert - insert an operator in front of already-emitted operand
4063*
4064* Means relocating the operand.
4065*/
76e3520e 4066STATIC void
830247a4 4067S_reginsert(pTHX_ RExC_state_t *pRExC_state, U8 op, regnode *opnd)
a687059c 4068{
c277df42
IZ
4069 register regnode *src;
4070 register regnode *dst;
4071 register regnode *place;
4072 register int offset = regarglen[(U8)op];
b81d288d 4073
22c35a8c 4074/* (PL_regkind[(U8)op] == CURLY ? EXTRA_STEP_2ARGS : 0); */
c277df42
IZ
4075
4076 if (SIZE_ONLY) {
830247a4 4077 RExC_size += NODE_STEP_REGNODE + offset;
a0d0e21e
LW
4078 return;
4079 }
a687059c 4080
830247a4
IZ
4081 src = RExC_emit;
4082 RExC_emit += NODE_STEP_REGNODE + offset;
4083 dst = RExC_emit;
fac92740 4084 while (src > opnd) {
c277df42 4085 StructCopy(--src, --dst, regnode);
fac92740
MJD
4086 if (RExC_offsets) { /* MJD 20010112 */
4087 MJD_OFFSET_DEBUG((stderr, "%s: %s copy %u -> %u (max %u).\n",
4088 "reg_insert",
4089 dst - RExC_emit_start > RExC_offsets[0]
4090 ? "Overwriting end of array!\n" : "OK",
4091 src - RExC_emit_start,
4092 dst - RExC_emit_start,
4093 RExC_offsets[0]));
4094 Set_Node_Offset_To_R(dst-RExC_emit_start, Node_Offset(src));
4095 Set_Node_Length_To_R(dst-RExC_emit_start, Node_Length(src));
4096 }
4097 }
4098
a0d0e21e
LW
4099
4100 place = opnd; /* Op node, where operand used to be. */
fac92740
MJD
4101 if (RExC_offsets) { /* MJD */
4102 MJD_OFFSET_DEBUG((stderr, "%s: %s %u <- %u (max %u).\n",
4103 "reginsert",
4104 place - RExC_emit_start > RExC_offsets[0]
4105 ? "Overwriting end of array!\n" : "OK",
4106 place - RExC_emit_start,
4107 RExC_parse - RExC_start,
4108 RExC_offsets[0]));
4109 Set_Node_Offset(place, RExC_parse);
4110 }
c277df42
IZ
4111 src = NEXTOPER(place);
4112 FILL_ADVANCE_NODE(place, op);
4113 Zero(src, offset, regnode);
a687059c
LW
4114}
4115
4116/*
c277df42 4117- regtail - set the next-pointer at the end of a node chain of p to val.
a0d0e21e 4118*/
76e3520e 4119STATIC void
830247a4 4120S_regtail(pTHX_ RExC_state_t *pRExC_state, regnode *p, regnode *val)
a687059c 4121{
c277df42
IZ
4122 register regnode *scan;
4123 register regnode *temp;
a0d0e21e 4124
c277df42 4125 if (SIZE_ONLY)
a0d0e21e
LW
4126 return;
4127
4128 /* Find last node. */
4129 scan = p;
4130 for (;;) {
4131 temp = regnext(scan);
4132 if (temp == NULL)
4133 break;
4134 scan = temp;
4135 }
a687059c 4136
c277df42
IZ
4137 if (reg_off_by_arg[OP(scan)]) {
4138 ARG_SET(scan, val - scan);
a0ed51b3
LW
4139 }
4140 else {
c277df42
IZ
4141 NEXT_OFF(scan) = val - scan;
4142 }
a687059c
LW
4143}
4144
4145/*
a0d0e21e
LW
4146- regoptail - regtail on operand of first argument; nop if operandless
4147*/
76e3520e 4148STATIC void
830247a4 4149S_regoptail(pTHX_ RExC_state_t *pRExC_state, regnode *p, regnode *val)
a687059c 4150{
a0d0e21e 4151 /* "Operandless" and "op != BRANCH" are synonymous in practice. */
c277df42
IZ
4152 if (p == NULL || SIZE_ONLY)
4153 return;
22c35a8c 4154 if (PL_regkind[(U8)OP(p)] == BRANCH) {
830247a4 4155 regtail(pRExC_state, NEXTOPER(p), val);
a0ed51b3 4156 }
22c35a8c 4157 else if ( PL_regkind[(U8)OP(p)] == BRANCHJ) {
830247a4 4158 regtail(pRExC_state, NEXTOPER(NEXTOPER(p)), val);
a0ed51b3
LW
4159 }
4160 else
a0d0e21e 4161 return;
a687059c
LW
4162}
4163
4164/*
4165 - regcurly - a little FSA that accepts {\d+,?\d*}
4166 */
79072805 4167STATIC I32
cea2e8a9 4168S_regcurly(pTHX_ register char *s)
a687059c
LW
4169{
4170 if (*s++ != '{')
4171 return FALSE;
f0fcb552 4172 if (!isDIGIT(*s))
a687059c 4173 return FALSE;
f0fcb552 4174 while (isDIGIT(*s))
a687059c
LW
4175 s++;
4176 if (*s == ',')
4177 s++;
f0fcb552 4178 while (isDIGIT(*s))
a687059c
LW
4179 s++;
4180 if (*s != '}')
4181 return FALSE;
4182 return TRUE;
4183}
4184
a687059c 4185
8fa7f367
JH
4186#ifdef DEBUGGING
4187
76e3520e 4188STATIC regnode *
cea2e8a9 4189S_dumpuntil(pTHX_ regnode *start, regnode *node, regnode *last, SV* sv, I32 l)
c277df42 4190{
f248d071 4191 register U8 op = EXACT; /* Arbitrary non-END op. */
155aba94 4192 register regnode *next;
c277df42
IZ
4193
4194 while (op != END && (!last || node < last)) {
4195 /* While that wasn't END last time... */
4196
4197 NODE_ALIGN(node);
4198 op = OP(node);
4199 if (op == CLOSE)
4200 l--;
4201 next = regnext(node);
4202 /* Where, what. */
4203 if (OP(node) == OPTIMIZED)
4204 goto after_print;
4205 regprop(sv, node);
b900a521 4206 PerlIO_printf(Perl_debug_log, "%4"IVdf":%*s%s", (IV)(node - start),
f1dbda3d 4207 (int)(2*l + 1), "", SvPVX(sv));
c277df42
IZ
4208 if (next == NULL) /* Next ptr. */
4209 PerlIO_printf(Perl_debug_log, "(0)");
b81d288d 4210 else
b900a521 4211 PerlIO_printf(Perl_debug_log, "(%"IVdf")", (IV)(next - start));
c277df42
IZ
4212 (void)PerlIO_putc(Perl_debug_log, '\n');
4213 after_print:
22c35a8c 4214 if (PL_regkind[(U8)op] == BRANCHJ) {
b81d288d
AB
4215 register regnode *nnode = (OP(next) == LONGJMP
4216 ? regnext(next)
c277df42
IZ
4217 : next);
4218 if (last && nnode > last)
4219 nnode = last;
4220 node = dumpuntil(start, NEXTOPER(NEXTOPER(node)), nnode, sv, l + 1);
a0ed51b3 4221 }
22c35a8c 4222 else if (PL_regkind[(U8)op] == BRANCH) {
c277df42 4223 node = dumpuntil(start, NEXTOPER(node), next, sv, l + 1);
a0ed51b3
LW
4224 }
4225 else if ( op == CURLY) { /* `next' might be very big: optimizer */
c277df42
IZ
4226 node = dumpuntil(start, NEXTOPER(node) + EXTRA_STEP_2ARGS,
4227 NEXTOPER(node) + EXTRA_STEP_2ARGS + 1, sv, l + 1);
a0ed51b3 4228 }
22c35a8c 4229 else if (PL_regkind[(U8)op] == CURLY && op != CURLYX) {
c277df42
IZ
4230 node = dumpuntil(start, NEXTOPER(node) + EXTRA_STEP_2ARGS,
4231 next, sv, l + 1);
a0ed51b3
LW
4232 }
4233 else if ( op == PLUS || op == STAR) {
c277df42 4234 node = dumpuntil(start, NEXTOPER(node), NEXTOPER(node) + 1, sv, l + 1);
a0ed51b3
LW
4235 }
4236 else if (op == ANYOF) {
4f66b38d
HS
4237 /* arglen 1 + class block */
4238 node += 1 + ((ANYOF_FLAGS(node) & ANYOF_LARGE)
4239 ? ANYOF_CLASS_SKIP : ANYOF_SKIP);
4240 node = NEXTOPER(node);
a0ed51b3 4241 }
22c35a8c 4242 else if (PL_regkind[(U8)op] == EXACT) {
c277df42 4243 /* Literal string, where present. */
cd439c50 4244 node += NODE_SZ_STR(node) - 1;
c277df42 4245 node = NEXTOPER(node);
a0ed51b3
LW
4246 }
4247 else {
c277df42
IZ
4248 node = NEXTOPER(node);
4249 node += regarglen[(U8)op];
4250 }
4251 if (op == CURLYX || op == OPEN)
4252 l++;
4253 else if (op == WHILEM)
4254 l--;
4255 }
4256 return node;
4257}
4258
8fa7f367
JH
4259#endif /* DEBUGGING */
4260
a687059c 4261/*
fd181c75 4262 - regdump - dump a regexp onto Perl_debug_log in vaguely comprehensible form
a687059c
LW
4263 */
4264void
864dbfa3 4265Perl_regdump(pTHX_ regexp *r)
a687059c 4266{
35ff7856 4267#ifdef DEBUGGING
46fc3d4c 4268 SV *sv = sv_newmortal();
a687059c 4269
c277df42 4270 (void)dumpuntil(r->program, r->program + 1, NULL, sv, 0);
a0d0e21e
LW
4271
4272 /* Header fields of interest. */
c277df42 4273 if (r->anchored_substr)
7b0972df 4274 PerlIO_printf(Perl_debug_log,
b81d288d 4275 "anchored `%s%.*s%s'%s at %"IVdf" ",
3280af22 4276 PL_colors[0],
7b0972df 4277 (int)(SvCUR(r->anchored_substr) - (SvTAIL(r->anchored_substr)!=0)),
b81d288d 4278 SvPVX(r->anchored_substr),
3280af22 4279 PL_colors[1],
c277df42 4280 SvTAIL(r->anchored_substr) ? "$" : "",
7b0972df 4281 (IV)r->anchored_offset);
c277df42 4282 if (r->float_substr)
7b0972df 4283 PerlIO_printf(Perl_debug_log,
b81d288d 4284 "floating `%s%.*s%s'%s at %"IVdf"..%"UVuf" ",
3280af22 4285 PL_colors[0],
b81d288d 4286 (int)(SvCUR(r->float_substr) - (SvTAIL(r->float_substr)!=0)),
2c2d71f5 4287 SvPVX(r->float_substr),
3280af22 4288 PL_colors[1],
c277df42 4289 SvTAIL(r->float_substr) ? "$" : "",
7b0972df 4290 (IV)r->float_min_offset, (UV)r->float_max_offset);
c277df42 4291 if (r->check_substr)
b81d288d
AB
4292 PerlIO_printf(Perl_debug_log,
4293 r->check_substr == r->float_substr
c277df42
IZ
4294 ? "(checking floating" : "(checking anchored");
4295 if (r->reganch & ROPT_NOSCAN)
4296 PerlIO_printf(Perl_debug_log, " noscan");
4297 if (r->reganch & ROPT_CHECK_ALL)
4298 PerlIO_printf(Perl_debug_log, " isall");
4299 if (r->check_substr)
4300 PerlIO_printf(Perl_debug_log, ") ");
4301
46fc3d4c 4302 if (r->regstclass) {
4303 regprop(sv, r->regstclass);
4304 PerlIO_printf(Perl_debug_log, "stclass `%s' ", SvPVX(sv));
4305 }
774d564b 4306 if (r->reganch & ROPT_ANCH) {
4307 PerlIO_printf(Perl_debug_log, "anchored");
4308 if (r->reganch & ROPT_ANCH_BOL)
4309 PerlIO_printf(Perl_debug_log, "(BOL)");
c277df42
IZ
4310 if (r->reganch & ROPT_ANCH_MBOL)
4311 PerlIO_printf(Perl_debug_log, "(MBOL)");
cad2e5aa
JH
4312 if (r->reganch & ROPT_ANCH_SBOL)
4313 PerlIO_printf(Perl_debug_log, "(SBOL)");
774d564b 4314 if (r->reganch & ROPT_ANCH_GPOS)
4315 PerlIO_printf(Perl_debug_log, "(GPOS)");
4316 PerlIO_putc(Perl_debug_log, ' ');
4317 }
c277df42
IZ
4318 if (r->reganch & ROPT_GPOS_SEEN)
4319 PerlIO_printf(Perl_debug_log, "GPOS ");
a0d0e21e 4320 if (r->reganch & ROPT_SKIP)
760ac839 4321 PerlIO_printf(Perl_debug_log, "plus ");
a0d0e21e 4322 if (r->reganch & ROPT_IMPLICIT)
760ac839 4323 PerlIO_printf(Perl_debug_log, "implicit ");
760ac839 4324 PerlIO_printf(Perl_debug_log, "minlen %ld ", (long) r->minlen);
ce862d02
IZ
4325 if (r->reganch & ROPT_EVAL_SEEN)
4326 PerlIO_printf(Perl_debug_log, "with eval ");
760ac839 4327 PerlIO_printf(Perl_debug_log, "\n");
fac92740
MJD
4328 if (r->offsets) {
4329 U32 i;
4330 U32 len = r->offsets[0];
392fbf5d 4331 PerlIO_printf(Perl_debug_log, "Offsets: [%"UVuf"]\n\t", (UV)r->offsets[0]);
fac92740 4332 for (i = 1; i <= len; i++)
392fbf5d
RB
4333 PerlIO_printf(Perl_debug_log, "%"UVuf"[%"UVuf"] ",
4334 (UV)r->offsets[i*2-1],
4335 (UV)r->offsets[i*2]);
fac92740
MJD
4336 PerlIO_printf(Perl_debug_log, "\n");
4337 }
17c3b450 4338#endif /* DEBUGGING */
a687059c
LW
4339}
4340
8fa7f367
JH
4341#ifdef DEBUGGING
4342
653099ff
GS
4343STATIC void
4344S_put_byte(pTHX_ SV *sv, int c)
4345{
ffc61ed2 4346 if (isCNTRL(c) || c == 127 || c == 255 || !isPRINT(c))
653099ff
GS
4347 Perl_sv_catpvf(aTHX_ sv, "\\%o", c);
4348 else if (c == '-' || c == ']' || c == '\\' || c == '^')
4349 Perl_sv_catpvf(aTHX_ sv, "\\%c", c);
4350 else
4351 Perl_sv_catpvf(aTHX_ sv, "%c", c);
4352}
4353
8fa7f367
JH
4354#endif /* DEBUGGING */
4355
a687059c 4356/*
a0d0e21e
LW
4357- regprop - printable representation of opcode
4358*/
46fc3d4c 4359void
864dbfa3 4360Perl_regprop(pTHX_ SV *sv, regnode *o)
a687059c 4361{
35ff7856 4362#ifdef DEBUGGING
9b155405 4363 register int k;
a0d0e21e 4364
54dc92de 4365 sv_setpvn(sv, "", 0);
9b155405 4366 if (OP(o) >= reg_num) /* regnode.type is unsigned */
830247a4
IZ
4367 /* It would be nice to FAIL() here, but this may be called from
4368 regexec.c, and it would be hard to supply pRExC_state. */
4369 Perl_croak(aTHX_ "Corrupted regexp opcode");
9b155405
IZ
4370 sv_catpv(sv, (char*)reg_name[OP(o)]); /* Take off const! */
4371
4372 k = PL_regkind[(U8)OP(o)];
4373
4374 if (k == EXACT)
7821416a 4375 Perl_sv_catpvf(aTHX_ sv, " <%s%.*s%s>", PL_colors[0],
cd439c50 4376 STR_LEN(o), STRING(o), PL_colors[1]);
9b155405 4377 else if (k == CURLY) {
cb434fcc 4378 if (OP(o) == CURLYM || OP(o) == CURLYN || OP(o) == CURLYX)
cea2e8a9
GS
4379 Perl_sv_catpvf(aTHX_ sv, "[%d]", o->flags); /* Parenth number */
4380 Perl_sv_catpvf(aTHX_ sv, " {%d,%d}", ARG1(o), ARG2(o));
a0d0e21e 4381 }
2c2d71f5
JH
4382 else if (k == WHILEM && o->flags) /* Ordinal/of */
4383 Perl_sv_catpvf(aTHX_ sv, "[%d/%d]", o->flags & 0xf, o->flags>>4);
9b155405 4384 else if (k == REF || k == OPEN || k == CLOSE || k == GROUPP )
894356b3 4385 Perl_sv_catpvf(aTHX_ sv, "%d", (int)ARG(o)); /* Parenth number */
9b155405 4386 else if (k == LOGICAL)
04ebc1ab 4387 Perl_sv_catpvf(aTHX_ sv, "[%d]", o->flags); /* 2: embedded, otherwise 1 */
653099ff
GS
4388 else if (k == ANYOF) {
4389 int i, rangestart = -1;
ffc61ed2 4390 U8 flags = ANYOF_FLAGS(o);
19860706
JH
4391 const char * const anyofs[] = { /* Should be syncronized with
4392 * ANYOF_ #xdefines in regcomp.h */
653099ff
GS
4393 "\\w",
4394 "\\W",
4395 "\\s",
4396 "\\S",
4397 "\\d",
4398 "\\D",
4399 "[:alnum:]",
4400 "[:^alnum:]",
4401 "[:alpha:]",
4402 "[:^alpha:]",
4403 "[:ascii:]",
4404 "[:^ascii:]",
4405 "[:ctrl:]",
4406 "[:^ctrl:]",
4407 "[:graph:]",
4408 "[:^graph:]",
4409 "[:lower:]",
4410 "[:^lower:]",
4411 "[:print:]",
4412 "[:^print:]",
4413 "[:punct:]",
4414 "[:^punct:]",
4415 "[:upper:]",
aaa51d5e 4416 "[:^upper:]",
653099ff 4417 "[:xdigit:]",
aaa51d5e
JF
4418 "[:^xdigit:]",
4419 "[:space:]",
4420 "[:^space:]",
4421 "[:blank:]",
4422 "[:^blank:]"
653099ff
GS
4423 };
4424
19860706 4425 if (flags & ANYOF_LOCALE)
653099ff 4426 sv_catpv(sv, "{loc}");
19860706 4427 if (flags & ANYOF_FOLD)
653099ff
GS
4428 sv_catpv(sv, "{i}");
4429 Perl_sv_catpvf(aTHX_ sv, "[%s", PL_colors[0]);
19860706 4430 if (flags & ANYOF_INVERT)
653099ff 4431 sv_catpv(sv, "^");
ffc61ed2
JH
4432 for (i = 0; i <= 256; i++) {
4433 if (i < 256 && ANYOF_BITMAP_TEST(o,i)) {
4434 if (rangestart == -1)
4435 rangestart = i;
4436 } else if (rangestart != -1) {
4437 if (i <= rangestart + 3)
4438 for (; rangestart < i; rangestart++)
653099ff 4439 put_byte(sv, rangestart);
ffc61ed2
JH
4440 else {
4441 put_byte(sv, rangestart);
4442 sv_catpv(sv, "-");
4443 put_byte(sv, i - 1);
653099ff 4444 }
ffc61ed2 4445 rangestart = -1;
653099ff 4446 }
847a199f 4447 }
ffc61ed2
JH
4448
4449 if (o->flags & ANYOF_CLASS)
4450 for (i = 0; i < sizeof(anyofs)/sizeof(char*); i++)
4451 if (ANYOF_CLASS_TEST(o,i))
4452 sv_catpv(sv, anyofs[i]);
4453
4454 if (flags & ANYOF_UNICODE)
4455 sv_catpv(sv, "{unicode}");
1aa99e6b
IH
4456 else if (flags & ANYOF_UNICODE_ALL)
4457 sv_catpv(sv, "{all-unicode}");
ffc61ed2
JH
4458
4459 {
4460 SV *lv;
4461 SV *sw = regclass_swash(o, FALSE, &lv);
b81d288d 4462
ffc61ed2
JH
4463 if (lv) {
4464 if (sw) {
4465 UV i;
4466 U8 s[UTF8_MAXLEN+1];
b81d288d 4467
ffc61ed2 4468 for (i = 0; i <= 256; i++) { /* just the first 256 */
2b9d42f0 4469 U8 *e = uvchr_to_utf8(s, i);
ffc61ed2 4470
3568d838 4471 if (i < 256 && swash_fetch(sw, s, TRUE)) {
ffc61ed2
JH
4472 if (rangestart == -1)
4473 rangestart = i;
4474 } else if (rangestart != -1) {
4475 U8 *p;
b81d288d 4476
ffc61ed2
JH
4477 if (i <= rangestart + 3)
4478 for (; rangestart < i; rangestart++) {
2b9d42f0 4479 for(e = uvchr_to_utf8(s, rangestart), p = s; p < e; p++)
ffc61ed2
JH
4480 put_byte(sv, *p);
4481 }
4482 else {
2b9d42f0 4483 for (e = uvchr_to_utf8(s, rangestart), p = s; p < e; p++)
ffc61ed2
JH
4484 put_byte(sv, *p);
4485 sv_catpv(sv, "-");
2b9d42f0 4486 for (e = uvchr_to_utf8(s, i - 1), p = s; p < e; p++)
ffc61ed2
JH
4487 put_byte(sv, *p);
4488 }
4489 rangestart = -1;
4490 }
19860706 4491 }
ffc61ed2
JH
4492
4493 sv_catpv(sv, "..."); /* et cetera */
19860706 4494 }
fde631ed 4495
ffc61ed2
JH
4496 {
4497 char *s = savepv(SvPVX(lv));
4498 char *origs = s;
b81d288d 4499
ffc61ed2 4500 while(*s && *s != '\n') s++;
b81d288d 4501
ffc61ed2
JH
4502 if (*s == '\n') {
4503 char *t = ++s;
4504
4505 while (*s) {
4506 if (*s == '\n')
4507 *s = ' ';
4508 s++;
4509 }
4510 if (s[-1] == ' ')
4511 s[-1] = 0;
4512
4513 sv_catpv(sv, t);
fde631ed 4514 }
b81d288d 4515
ffc61ed2 4516 Safefree(origs);
fde631ed
JH
4517 }
4518 }
653099ff 4519 }
ffc61ed2 4520
653099ff
GS
4521 Perl_sv_catpvf(aTHX_ sv, "%s]", PL_colors[1]);
4522 }
9b155405 4523 else if (k == BRANCHJ && (OP(o) == UNLESSM || OP(o) == IFMATCH))
cea2e8a9 4524 Perl_sv_catpvf(aTHX_ sv, "[-%d]", o->flags);
17c3b450 4525#endif /* DEBUGGING */
35ff7856 4526}
a687059c 4527
cad2e5aa
JH
4528SV *
4529Perl_re_intuit_string(pTHX_ regexp *prog)
4530{ /* Assume that RE_INTUIT is set */
4531 DEBUG_r(
4532 { STRLEN n_a;
4533 char *s = SvPV(prog->check_substr,n_a);
4534
4535 if (!PL_colorset) reginitcolors();
4536 PerlIO_printf(Perl_debug_log,
4537 "%sUsing REx substr:%s `%s%.60s%s%s'\n",
4538 PL_colors[4],PL_colors[5],PL_colors[0],
4539 s,
4540 PL_colors[1],
4541 (strlen(s) > 60 ? "..." : ""));
4542 } );
4543
4544 return prog->check_substr;
4545}
4546
2b69d0c2 4547void
864dbfa3 4548Perl_pregfree(pTHX_ struct regexp *r)
a687059c 4549{
adac82c7 4550 DEBUG_r(if (!PL_colorset) reginitcolors());
7821416a
IZ
4551
4552 if (!r || (--r->refcnt > 0))
4553 return;
cad2e5aa
JH
4554 DEBUG_r(PerlIO_printf(Perl_debug_log,
4555 "%sFreeing REx:%s `%s%.60s%s%s'\n",
4556 PL_colors[4],PL_colors[5],PL_colors[0],
4557 r->precomp,
4558 PL_colors[1],
4559 (strlen(r->precomp) > 60 ? "..." : "")));
4560
c277df42 4561 if (r->precomp)
a0d0e21e 4562 Safefree(r->precomp);
fac92740
MJD
4563 if (r->offsets) /* 20010421 MJD */
4564 Safefree(r->offsets);
cf93c79d
IZ
4565 if (RX_MATCH_COPIED(r))
4566 Safefree(r->subbeg);
a193d654
GS
4567 if (r->substrs) {
4568 if (r->anchored_substr)
4569 SvREFCNT_dec(r->anchored_substr);
4570 if (r->float_substr)
4571 SvREFCNT_dec(r->float_substr);
2779dcf1 4572 Safefree(r->substrs);
a193d654 4573 }
c277df42
IZ
4574 if (r->data) {
4575 int n = r->data->count;
dfad63ad
HS
4576 AV* new_comppad = NULL;
4577 AV* old_comppad;
4578 SV** old_curpad;
4579
c277df42
IZ
4580 while (--n >= 0) {
4581 switch (r->data->what[n]) {
4582 case 's':
4583 SvREFCNT_dec((SV*)r->data->data[n]);
4584 break;
653099ff
GS
4585 case 'f':
4586 Safefree(r->data->data[n]);
4587 break;
dfad63ad
HS
4588 case 'p':
4589 new_comppad = (AV*)r->data->data[n];
4590 break;
c277df42 4591 case 'o':
dfad63ad 4592 if (new_comppad == NULL)
cea2e8a9 4593 Perl_croak(aTHX_ "panic: pregfree comppad");
dfad63ad
HS
4594 old_comppad = PL_comppad;
4595 old_curpad = PL_curpad;
1e6dc0b6
SB
4596 /* Watch out for global destruction's random ordering. */
4597 if (SvTYPE(new_comppad) == SVt_PVAV) {
4598 PL_comppad = new_comppad;
4599 PL_curpad = AvARRAY(new_comppad);
4600 }
4601 else
4602 PL_curpad = NULL;
c277df42 4603 op_free((OP_4tree*)r->data->data[n]);
dfad63ad
HS
4604 PL_comppad = old_comppad;
4605 PL_curpad = old_curpad;
4606 SvREFCNT_dec((SV*)new_comppad);
4607 new_comppad = NULL;
c277df42
IZ
4608 break;
4609 case 'n':
4610 break;
4611 default:
830247a4 4612 Perl_croak(aTHX_ "panic: regfree data code '%c'", r->data->what[n]);
c277df42
IZ
4613 }
4614 }
4615 Safefree(r->data->what);
4616 Safefree(r->data);
a0d0e21e
LW
4617 }
4618 Safefree(r->startp);
4619 Safefree(r->endp);
4620 Safefree(r);
a687059c 4621}
c277df42
IZ
4622
4623/*
4624 - regnext - dig the "next" pointer out of a node
4625 *
4626 * [Note, when REGALIGN is defined there are two places in regmatch()
4627 * that bypass this code for speed.]
4628 */
4629regnode *
864dbfa3 4630Perl_regnext(pTHX_ register regnode *p)
c277df42
IZ
4631{
4632 register I32 offset;
4633
3280af22 4634 if (p == &PL_regdummy)
c277df42
IZ
4635 return(NULL);
4636
4637 offset = (reg_off_by_arg[OP(p)] ? ARG(p) : NEXT_OFF(p));
4638 if (offset == 0)
4639 return(NULL);
4640
c277df42 4641 return(p+offset);
c277df42
IZ
4642}
4643
01f988be 4644STATIC void
cea2e8a9 4645S_re_croak2(pTHX_ const char* pat1,const char* pat2,...)
c277df42
IZ
4646{
4647 va_list args;
4648 STRLEN l1 = strlen(pat1);
4649 STRLEN l2 = strlen(pat2);
4650 char buf[512];
06bf62c7 4651 SV *msv;
c277df42
IZ
4652 char *message;
4653
4654 if (l1 > 510)
4655 l1 = 510;
4656 if (l1 + l2 > 510)
4657 l2 = 510 - l1;
4658 Copy(pat1, buf, l1 , char);
4659 Copy(pat2, buf + l1, l2 , char);
3b818b81
GS
4660 buf[l1 + l2] = '\n';
4661 buf[l1 + l2 + 1] = '\0';
8736538c
AS
4662#ifdef I_STDARG
4663 /* ANSI variant takes additional second argument */
c277df42 4664 va_start(args, pat2);
8736538c
AS
4665#else
4666 va_start(args);
4667#endif
5a844595 4668 msv = vmess(buf, &args);
c277df42 4669 va_end(args);
06bf62c7 4670 message = SvPV(msv,l1);
c277df42
IZ
4671 if (l1 > 512)
4672 l1 = 512;
4673 Copy(message, buf, l1 , char);
4674 buf[l1] = '\0'; /* Overwrite \n */
cea2e8a9 4675 Perl_croak(aTHX_ "%s", buf);
c277df42 4676}
a0ed51b3
LW
4677
4678/* XXX Here's a total kludge. But we need to re-enter for swash routines. */
4679
4680void
864dbfa3 4681Perl_save_re_context(pTHX)
b81d288d 4682{
830247a4
IZ
4683#if 0
4684 SAVEPPTR(RExC_precomp); /* uncompiled string. */
4685 SAVEI32(RExC_npar); /* () count. */
4686 SAVEI32(RExC_size); /* Code size. */
4687 SAVEI16(RExC_flags16); /* are we folding, multilining? */
4688 SAVEVPTR(RExC_rx); /* from regcomp.c */
4689 SAVEI32(RExC_seen); /* from regcomp.c */
4690 SAVEI32(RExC_sawback); /* Did we see \1, ...? */
4691 SAVEI32(RExC_naughty); /* How bad is this pattern? */
4692 SAVEVPTR(RExC_emit); /* Code-emit pointer; &regdummy = don't */
4693 SAVEPPTR(RExC_end); /* End of input for compile */
4694 SAVEPPTR(RExC_parse); /* Input-scan pointer. */
4695#endif
4696
4697 SAVEI32(PL_reg_flags); /* from regexec.c */
a0ed51b3 4698 SAVEPPTR(PL_bostr);
a0ed51b3
LW
4699 SAVEPPTR(PL_reginput); /* String-input pointer. */
4700 SAVEPPTR(PL_regbol); /* Beginning of input, for ^ check. */
4701 SAVEPPTR(PL_regeol); /* End of input, for $ check. */
7766f137
GS
4702 SAVEVPTR(PL_regstartp); /* Pointer to startp array. */
4703 SAVEVPTR(PL_regendp); /* Ditto for endp. */
4704 SAVEVPTR(PL_reglastparen); /* Similarly for lastparen. */
a0ed51b3 4705 SAVEPPTR(PL_regtill); /* How far we are required to go. */
b81d288d 4706 SAVEGENERICPV(PL_reg_start_tmp); /* from regexec.c */
a0ed51b3 4707 PL_reg_start_tmp = 0;
a0ed51b3
LW
4708 SAVEI32(PL_reg_start_tmpl); /* from regexec.c */
4709 PL_reg_start_tmpl = 0;
7766f137 4710 SAVEVPTR(PL_regdata);
a0ed51b3
LW
4711 SAVEI32(PL_reg_eval_set); /* from regexec.c */
4712 SAVEI32(PL_regnarrate); /* from regexec.c */
7766f137 4713 SAVEVPTR(PL_regprogram); /* from regexec.c */
a0ed51b3 4714 SAVEINT(PL_regindent); /* from regexec.c */
7766f137
GS
4715 SAVEVPTR(PL_regcc); /* from regexec.c */
4716 SAVEVPTR(PL_curcop);
7766f137
GS
4717 SAVEVPTR(PL_reg_call_cc); /* from regexec.c */
4718 SAVEVPTR(PL_reg_re); /* from regexec.c */
54b6e2fa
IZ
4719 SAVEPPTR(PL_reg_ganch); /* from regexec.c */
4720 SAVESPTR(PL_reg_sv); /* from regexec.c */
7766f137 4721 SAVEVPTR(PL_reg_magic); /* from regexec.c */
54b6e2fa 4722 SAVEI32(PL_reg_oldpos); /* from regexec.c */
7766f137
GS
4723 SAVEVPTR(PL_reg_oldcurpm); /* from regexec.c */
4724 SAVEVPTR(PL_reg_curpm); /* from regexec.c */
5fb7366e 4725 SAVEI32(PL_regnpar); /* () count. */
e49a9654 4726 SAVEI32(PL_regsize); /* from regexec.c */
54b6e2fa 4727#ifdef DEBUGGING
b81d288d 4728 SAVEPPTR(PL_reg_starttry); /* from regexec.c */
54b6e2fa 4729#endif
a0ed51b3 4730}
51371543
GS
4731
4732#ifdef PERL_OBJECT
51371543
GS
4733#include "XSUB.h"
4734#undef this
4735#define this pPerl
4736#endif
4737
4738static void
4739clear_re(pTHXo_ void *r)
4740{
4741 ReREFCNT_dec((regexp *)r);
4742}
ffbc6a93 4743