This is a live mirror of the Perl 5 development currently hosted at https://github.com/perl/perl5
rescue perldoc for VMS
[perl5.git] / regcomp.c
CommitLineData
a0d0e21e
LW
1/* regcomp.c
2 */
3
4/*
5 * "A fair jaw-cracker dwarf-language must be." --Samwise Gamgee
6 */
7
a687059c
LW
8/* NOTE: this is derived from Henry Spencer's regexp code, and should not
9 * confused with the original package (see point 3 below). Thanks, Henry!
10 */
11
12/* Additional note: this code is very heavily munged from Henry's version
13 * in places. In some spots I've traded clarity for efficiency, so don't
14 * blame Henry for some of the lack of readability.
15 */
16
e50aee73
AD
17/* The names of the functions have been changed from regcomp and
18 * regexec to pregcomp and pregexec in order to avoid conflicts
19 * with the POSIX routines of the same names.
20*/
21
b9d5759e
AD
22#ifdef PERL_EXT_RE_BUILD
23/* need to replace pregcomp et al, so enable that */
24# ifndef PERL_IN_XSUB_RE
25# define PERL_IN_XSUB_RE
26# endif
27/* need access to debugger hooks */
cad2e5aa 28# if defined(PERL_EXT_RE_DEBUG) && !defined(DEBUGGING)
b9d5759e
AD
29# define DEBUGGING
30# endif
31#endif
32
33#ifdef PERL_IN_XSUB_RE
d06ea78c 34/* We *really* need to overwrite these symbols: */
56953603
IZ
35# define Perl_pregcomp my_regcomp
36# define Perl_regdump my_regdump
37# define Perl_regprop my_regprop
d06ea78c 38# define Perl_pregfree my_regfree
cad2e5aa
JH
39# define Perl_re_intuit_string my_re_intuit_string
40/* *These* symbols are masked to allow static link. */
d06ea78c 41# define Perl_regnext my_regnext
f0b8d043 42# define Perl_save_re_context my_save_re_context
b81d288d 43# define Perl_reginitcolors my_reginitcolors
c5be433b
GS
44
45# define PERL_NO_GET_CONTEXT
b81d288d 46#endif
56953603 47
f0fcb552 48/*SUPPRESS 112*/
a687059c 49/*
e50aee73 50 * pregcomp and pregexec -- regsub and regerror are not used in perl
a687059c
LW
51 *
52 * Copyright (c) 1986 by University of Toronto.
53 * Written by Henry Spencer. Not derived from licensed software.
54 *
55 * Permission is granted to anyone to use this software for any
56 * purpose on any computer system, and to redistribute it freely,
57 * subject to the following restrictions:
58 *
59 * 1. The author is not responsible for the consequences of use of
60 * this software, no matter how awful, even if they arise
61 * from defects in it.
62 *
63 * 2. The origin of this software must not be misrepresented, either
64 * by explicit claim or by omission.
65 *
66 * 3. Altered versions must be plainly marked as such, and must not
67 * be misrepresented as being the original software.
68 *
69 *
70 **** Alterations to Henry's code are...
71 ****
be3c0a43 72 **** Copyright (c) 1991-2002, Larry Wall
a687059c 73 ****
9ef589d8
LW
74 **** You may distribute under the terms of either the GNU General Public
75 **** License or the Artistic License, as specified in the README file.
76
a687059c
LW
77 *
78 * Beware that some of this code is subtly aware of the way operator
79 * precedence is structured in regular expressions. Serious changes in
80 * regular-expression syntax might require a total rethink.
81 */
82#include "EXTERN.h"
864dbfa3 83#define PERL_IN_REGCOMP_C
a687059c 84#include "perl.h"
d06ea78c 85
acfe0abc 86#ifndef PERL_IN_XSUB_RE
d06ea78c
GS
87# include "INTERN.h"
88#endif
c277df42
IZ
89
90#define REG_COMP_C
a687059c
LW
91#include "regcomp.h"
92
d4cce5f1 93#ifdef op
11343788 94#undef op
d4cce5f1 95#endif /* op */
11343788 96
fe14fcc3
LW
97#ifdef MSDOS
98# if defined(BUGGY_MSC6)
99 /* MSC 6.00A breaks on op/regexp.t test 85 unless we turn this off */
100 # pragma optimize("a",off)
101 /* But MSC 6.00A is happy with 'w', for aliases only across function calls*/
102 # pragma optimize("w",on )
103# endif /* BUGGY_MSC6 */
104#endif /* MSDOS */
105
a687059c
LW
106#ifndef STATIC
107#define STATIC static
108#endif
109
830247a4
IZ
110typedef struct RExC_state_t {
111 U16 flags16; /* are we folding, multilining? */
112 char *precomp; /* uncompiled string. */
113 regexp *rx;
fac92740 114 char *start; /* Start of input for compile */
830247a4
IZ
115 char *end; /* End of input for compile */
116 char *parse; /* Input-scan pointer. */
117 I32 whilem_seen; /* number of WHILEM in this expr */
fac92740 118 regnode *emit_start; /* Start of emitted-code area */
ffc61ed2 119 regnode *emit; /* Code-emit pointer; &regdummy = don't = compiling */
830247a4
IZ
120 I32 naughty; /* How bad is this pattern? */
121 I32 sawback; /* Did we see \1, ...? */
122 U32 seen;
123 I32 size; /* Code size. */
124 I32 npar; /* () count. */
125 I32 extralen;
126 I32 seen_zerolen;
127 I32 seen_evals;
1aa99e6b 128 I32 utf8;
830247a4
IZ
129#if ADD_TO_REGEXEC
130 char *starttry; /* -Dr: where regtry was called. */
131#define RExC_starttry (pRExC_state->starttry)
132#endif
133} RExC_state_t;
134
135#define RExC_flags16 (pRExC_state->flags16)
136#define RExC_precomp (pRExC_state->precomp)
137#define RExC_rx (pRExC_state->rx)
fac92740 138#define RExC_start (pRExC_state->start)
830247a4
IZ
139#define RExC_end (pRExC_state->end)
140#define RExC_parse (pRExC_state->parse)
141#define RExC_whilem_seen (pRExC_state->whilem_seen)
fac92740 142#define RExC_offsets (pRExC_state->rx->offsets) /* I am not like the others */
830247a4 143#define RExC_emit (pRExC_state->emit)
fac92740 144#define RExC_emit_start (pRExC_state->emit_start)
830247a4
IZ
145#define RExC_naughty (pRExC_state->naughty)
146#define RExC_sawback (pRExC_state->sawback)
147#define RExC_seen (pRExC_state->seen)
148#define RExC_size (pRExC_state->size)
149#define RExC_npar (pRExC_state->npar)
150#define RExC_extralen (pRExC_state->extralen)
151#define RExC_seen_zerolen (pRExC_state->seen_zerolen)
152#define RExC_seen_evals (pRExC_state->seen_evals)
1aa99e6b 153#define RExC_utf8 (pRExC_state->utf8)
830247a4 154
a687059c
LW
155#define ISMULT1(c) ((c) == '*' || (c) == '+' || (c) == '?')
156#define ISMULT2(s) ((*s) == '*' || (*s) == '+' || (*s) == '?' || \
157 ((*s) == '{' && regcurly(s)))
a687059c 158
35c8bce7
LW
159#ifdef SPSTART
160#undef SPSTART /* dratted cpp namespace... */
161#endif
a687059c
LW
162/*
163 * Flags to be passed up and down.
164 */
a687059c 165#define WORST 0 /* Worst case. */
821b33a5 166#define HASWIDTH 0x1 /* Known to match non-null strings. */
a0d0e21e
LW
167#define SIMPLE 0x2 /* Simple enough to be STAR/PLUS operand. */
168#define SPSTART 0x4 /* Starts with * or +. */
169#define TRYAGAIN 0x8 /* Weeded out a declaration. */
a687059c 170
2c2d71f5
JH
171/* Length of a variant. */
172
173typedef struct scan_data_t {
174 I32 len_min;
175 I32 len_delta;
176 I32 pos_min;
177 I32 pos_delta;
178 SV *last_found;
179 I32 last_end; /* min value, <0 unless valid. */
180 I32 last_start_min;
181 I32 last_start_max;
182 SV **longest; /* Either &l_fixed, or &l_float. */
183 SV *longest_fixed;
184 I32 offset_fixed;
185 SV *longest_float;
186 I32 offset_float_min;
187 I32 offset_float_max;
188 I32 flags;
189 I32 whilem_c;
cb434fcc 190 I32 *last_closep;
653099ff 191 struct regnode_charclass_class *start_class;
2c2d71f5
JH
192} scan_data_t;
193
a687059c 194/*
e50aee73 195 * Forward declarations for pregcomp()'s friends.
a687059c 196 */
a0d0e21e 197
b81d288d 198static scan_data_t zero_scan_data = { 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
cb434fcc 199 0, 0, 0, 0, 0, 0};
c277df42
IZ
200
201#define SF_BEFORE_EOL (SF_BEFORE_SEOL|SF_BEFORE_MEOL)
202#define SF_BEFORE_SEOL 0x1
203#define SF_BEFORE_MEOL 0x2
204#define SF_FIX_BEFORE_EOL (SF_FIX_BEFORE_SEOL|SF_FIX_BEFORE_MEOL)
205#define SF_FL_BEFORE_EOL (SF_FL_BEFORE_SEOL|SF_FL_BEFORE_MEOL)
206
09b7f37c
CB
207#ifdef NO_UNARY_PLUS
208# define SF_FIX_SHIFT_EOL (0+2)
209# define SF_FL_SHIFT_EOL (0+4)
210#else
211# define SF_FIX_SHIFT_EOL (+2)
212# define SF_FL_SHIFT_EOL (+4)
213#endif
c277df42
IZ
214
215#define SF_FIX_BEFORE_SEOL (SF_BEFORE_SEOL << SF_FIX_SHIFT_EOL)
216#define SF_FIX_BEFORE_MEOL (SF_BEFORE_MEOL << SF_FIX_SHIFT_EOL)
217
218#define SF_FL_BEFORE_SEOL (SF_BEFORE_SEOL << SF_FL_SHIFT_EOL)
219#define SF_FL_BEFORE_MEOL (SF_BEFORE_MEOL << SF_FL_SHIFT_EOL) /* 0x20 */
220#define SF_IS_INF 0x40
221#define SF_HAS_PAR 0x80
222#define SF_IN_PAR 0x100
223#define SF_HAS_EVAL 0x200
4bfe0158 224#define SCF_DO_SUBSTR 0x400
653099ff
GS
225#define SCF_DO_STCLASS_AND 0x0800
226#define SCF_DO_STCLASS_OR 0x1000
227#define SCF_DO_STCLASS (SCF_DO_STCLASS_AND|SCF_DO_STCLASS_OR)
e1901655 228#define SCF_WHILEM_VISITED_POS 0x2000
c277df42 229
1aa99e6b 230#define UTF RExC_utf8
830247a4
IZ
231#define LOC (RExC_flags16 & PMf_LOCALE)
232#define FOLD (RExC_flags16 & PMf_FOLD)
a0ed51b3 233
ffc61ed2 234#define OOB_UNICODE 12345678
93733859 235#define OOB_NAMEDCLASS -1
b8c5462f 236
a0ed51b3
LW
237#define CHR_SVLEN(sv) (UTF ? sv_len_utf8(sv) : SvCUR(sv))
238#define CHR_DIST(a,b) (UTF ? utf8_distance(a,b) : a - b)
239
8615cb43 240
b45f050a
JF
241/* length of regex to show in messages that don't mark a position within */
242#define RegexLengthToShowInErrorMessages 127
243
244/*
245 * If MARKER[12] are adjusted, be sure to adjust the constants at the top
246 * of t/op/regmesg.t, the tests in t/op/re_tests, and those in
247 * op/pragma/warn/regcomp.
248 */
7253e4e3
RK
249#define MARKER1 "<-- HERE" /* marker as it appears in the description */
250#define MARKER2 " <-- HERE " /* marker as it appears within the regex */
b81d288d 251
7253e4e3 252#define REPORT_LOCATION " in regex; marked by " MARKER1 " in m/%.*s" MARKER2 "%s/"
b45f050a
JF
253
254/*
255 * Calls SAVEDESTRUCTOR_X if needed, then calls Perl_croak with the given
256 * arg. Show regex, up to a maximum length. If it's too long, chop and add
257 * "...".
258 */
a4eb266f 259#define FAIL(msg) \
8615cb43 260 STMT_START { \
a4eb266f 261 char *ellipses = ""; \
7a799e20 262 IV len = RExC_end - RExC_precomp; \
b45f050a 263 \
8615cb43 264 if (!SIZE_ONLY) \
830247a4 265 SAVEDESTRUCTOR_X(clear_re,(void*)RExC_rx); \
b45f050a
JF
266 \
267 if (len > RegexLengthToShowInErrorMessages) { \
268 /* chop 10 shorter than the max, to ensure meaning of "..." */ \
269 len = RegexLengthToShowInErrorMessages - 10; \
a4eb266f 270 ellipses = "..."; \
b45f050a
JF
271 } \
272 Perl_croak(aTHX_ "%s in regex m/%.*s%s/", \
830247a4 273 msg, (int)len, RExC_precomp, ellipses); \
8615cb43
JF
274 } STMT_END
275
b45f050a
JF
276/*
277 * Calls SAVEDESTRUCTOR_X if needed, then calls Perl_croak with the given
278 * args. Show regex, up to a maximum length. If it's too long, chop and add
279 * "...".
280 */
a4eb266f 281#define FAIL2(pat,msg) \
8615cb43 282 STMT_START { \
a4eb266f 283 char *ellipses = ""; \
7a799e20 284 IV len = RExC_end - RExC_precomp; \
b45f050a 285 \
8615cb43 286 if (!SIZE_ONLY) \
830247a4 287 SAVEDESTRUCTOR_X(clear_re,(void*)RExC_rx); \
8615cb43 288 \
b45f050a
JF
289 if (len > RegexLengthToShowInErrorMessages) { \
290 /* chop 10 shorter than the max, to ensure meaning of "..." */ \
291 len = RegexLengthToShowInErrorMessages - 10; \
a4eb266f 292 ellipses = "..."; \
b45f050a
JF
293 } \
294 S_re_croak2(aTHX_ pat, " in regex m/%.*s%s/", \
830247a4 295 msg, (int)len, RExC_precomp, ellipses); \
b45f050a
JF
296 } STMT_END
297
298
299/*
300 * Simple_vFAIL -- like FAIL, but marks the current location in the scan
301 */
302#define Simple_vFAIL(m) \
303 STMT_START { \
7a799e20 304 IV offset = RExC_parse - RExC_precomp; \
b45f050a
JF
305 \
306 Perl_croak(aTHX_ "%s" REPORT_LOCATION, \
830247a4 307 m, (int)offset, RExC_precomp, RExC_precomp + offset); \
b45f050a
JF
308 } STMT_END
309
310/*
311 * Calls SAVEDESTRUCTOR_X if needed, then Simple_vFAIL()
312 */
313#define vFAIL(m) \
314 STMT_START { \
315 if (!SIZE_ONLY) \
830247a4 316 SAVEDESTRUCTOR_X(clear_re,(void*)RExC_rx); \
b45f050a
JF
317 Simple_vFAIL(m); \
318 } STMT_END
319
320/*
321 * Like Simple_vFAIL(), but accepts two arguments.
322 */
323#define Simple_vFAIL2(m,a1) \
324 STMT_START { \
7a799e20 325 IV offset = RExC_parse - RExC_precomp; \
b45f050a
JF
326 \
327 S_re_croak2(aTHX_ m, REPORT_LOCATION, a1, \
830247a4 328 (int)offset, RExC_precomp, RExC_precomp + offset); \
b45f050a
JF
329 } STMT_END
330
331/*
332 * Calls SAVEDESTRUCTOR_X if needed, then Simple_vFAIL2().
333 */
334#define vFAIL2(m,a1) \
335 STMT_START { \
336 if (!SIZE_ONLY) \
830247a4 337 SAVEDESTRUCTOR_X(clear_re,(void*)RExC_rx); \
b45f050a
JF
338 Simple_vFAIL2(m, a1); \
339 } STMT_END
340
341
342/*
343 * Like Simple_vFAIL(), but accepts three arguments.
344 */
345#define Simple_vFAIL3(m, a1, a2) \
346 STMT_START { \
7a799e20 347 IV offset = RExC_parse - RExC_precomp; \
b45f050a
JF
348 \
349 S_re_croak2(aTHX_ m, REPORT_LOCATION, a1, a2, \
830247a4 350 (int)offset, RExC_precomp, RExC_precomp + offset); \
b45f050a
JF
351 } STMT_END
352
353/*
354 * Calls SAVEDESTRUCTOR_X if needed, then Simple_vFAIL3().
355 */
356#define vFAIL3(m,a1,a2) \
357 STMT_START { \
358 if (!SIZE_ONLY) \
830247a4 359 SAVEDESTRUCTOR_X(clear_re,(void*)RExC_rx); \
b45f050a
JF
360 Simple_vFAIL3(m, a1, a2); \
361 } STMT_END
362
363/*
364 * Like Simple_vFAIL(), but accepts four arguments.
365 */
366#define Simple_vFAIL4(m, a1, a2, a3) \
367 STMT_START { \
7a799e20 368 IV offset = RExC_parse - RExC_precomp; \
b45f050a
JF
369 \
370 S_re_croak2(aTHX_ m, REPORT_LOCATION, a1, a2, a3,\
830247a4 371 (int)offset, RExC_precomp, RExC_precomp + offset); \
b45f050a
JF
372 } STMT_END
373
374/*
375 * Like Simple_vFAIL(), but accepts five arguments.
376 */
377#define Simple_vFAIL5(m, a1, a2, a3, a4) \
378 STMT_START { \
7a799e20 379 IV offset = RExC_parse - RExC_precomp; \
b45f050a 380 S_re_croak2(aTHX_ m, REPORT_LOCATION, a1, a2, a3, a4,\
830247a4 381 (int)offset, RExC_precomp, RExC_precomp + offset); \
b45f050a
JF
382 } STMT_END
383
384
385#define vWARN(loc,m) \
386 STMT_START { \
7a799e20 387 IV offset = loc - RExC_precomp; \
9014280d 388 Perl_warner(aTHX_ packWARN(WARN_REGEXP), "%s" REPORT_LOCATION,\
830247a4 389 m, (int)offset, RExC_precomp, RExC_precomp + offset); \
b45f050a
JF
390 } STMT_END \
391
f9373011
PM
392#define vWARNdep(loc,m) \
393 STMT_START { \
7a799e20 394 IV offset = loc - RExC_precomp; \
9014280d 395 Perl_warner(aTHX_ packWARN2(WARN_DEPRECATED, WARN_REGEXP), "%s" REPORT_LOCATION,\
f9373011
PM
396 m, (int)offset, RExC_precomp, RExC_precomp + offset); \
397 } STMT_END \
398
b45f050a
JF
399
400#define vWARN2(loc, m, a1) \
401 STMT_START { \
7a799e20 402 IV offset = loc - RExC_precomp; \
9014280d 403 Perl_warner(aTHX_ packWARN(WARN_REGEXP), m REPORT_LOCATION,\
b45f050a 404 a1, \
830247a4 405 (int)offset, RExC_precomp, RExC_precomp + offset); \
b45f050a
JF
406 } STMT_END
407
408#define vWARN3(loc, m, a1, a2) \
409 STMT_START { \
7a799e20 410 IV offset = loc - RExC_precomp; \
9014280d 411 Perl_warner(aTHX_ packWARN(WARN_REGEXP), m REPORT_LOCATION, \
b45f050a 412 a1, a2, \
830247a4 413 (int)offset, RExC_precomp, RExC_precomp + offset); \
b45f050a
JF
414 } STMT_END
415
416#define vWARN4(loc, m, a1, a2, a3) \
417 STMT_START { \
7a799e20 418 IV offset = loc - RExC_precomp; \
9014280d 419 Perl_warner(aTHX_ packWARN(WARN_REGEXP), m REPORT_LOCATION,\
b45f050a 420 a1, a2, a3, \
830247a4 421 (int)offset, RExC_precomp, RExC_precomp + offset); \
8615cb43
JF
422 } STMT_END
423
9d1d55b5
JP
424/* used for the parse_flags section for (?c) -- japhy */
425#define vWARN5(loc, m, a1, a2, a3, a4) \
426 STMT_START { \
7a799e20 427 IV offset = loc - RExC_precomp; \
9014280d 428 Perl_warner(aTHX_ packWARN(WARN_REGEXP), m REPORT_LOCATION, \
9d1d55b5
JP
429 a1, a2, a3, a4, \
430 (int)offset, RExC_precomp, RExC_precomp + offset); \
431 } STMT_END
432
8615cb43 433
cd439c50 434/* Allow for side effects in s */
fd3f0ae2 435#define REGC(c,s) STMT_START { if (!SIZE_ONLY) *(s) = (c); else (void)(s);} STMT_END
cd439c50 436
fac92740
MJD
437/* Macros for recording node offsets. 20001227 mjd@plover.com
438 * Nodes are numbered 1, 2, 3, 4. Node #n's position is recorded in
439 * element 2*n-1 of the array. Element #2n holds the byte length node #n.
440 * Element 0 holds the number n.
441 */
442
443#define MJD_OFFSET_DEBUG(x)
444/* #define MJD_OFFSET_DEBUG(x) fprintf x */
445
446
447# define Set_Node_Offset_To_R(node,byte) \
448 STMT_START { \
449 if (! SIZE_ONLY) { \
450 if((node) < 0) { \
451 Perl_croak(aTHX_ "value of node is %d in Offset macro", node); \
452 } else { \
453 RExC_offsets[2*(node)-1] = (byte); \
454 } \
455 } \
456 } STMT_END
457
458# define Set_Node_Offset(node,byte) Set_Node_Offset_To_R((node)-RExC_emit_start, (byte)-RExC_start)
459# define Set_Cur_Node_Offset Set_Node_Offset(RExC_emit, RExC_parse)
460
461# define Set_Node_Length_To_R(node,len) \
462 STMT_START { \
463 if (! SIZE_ONLY) { \
464 MJD_OFFSET_DEBUG((stderr, "** (%d) size of node %d is %d.\n", __LINE__, (node), (len))); \
465 if((node) < 0) { \
466 Perl_croak(aTHX_ "value of node is %d in Length macro", node); \
467 } else { \
468 RExC_offsets[2*(node)] = (len); \
469 } \
470 } \
471 } STMT_END
472
473# define Set_Node_Length(node,len) Set_Node_Length_To_R((node)-RExC_emit_start, len)
474# define Set_Cur_Node_Length(len) Set_Node_Length(RExC_emit, len)
475# define Set_Node_Cur_Length(node) Set_Node_Length(node, RExC_parse - parse_start)
476
477/* Get offsets and lengths */
478#define Node_Offset(n) (RExC_offsets[2*((n)-RExC_emit_start)-1])
479#define Node_Length(n) (RExC_offsets[2*((n)-RExC_emit_start)])
480
acfe0abc 481static void clear_re(pTHX_ void *r);
4327152a 482
653099ff
GS
483/* Mark that we cannot extend a found fixed substring at this point.
484 Updata the longest found anchored substring and the longest found
485 floating substrings if needed. */
486
4327152a 487STATIC void
830247a4 488S_scan_commit(pTHX_ RExC_state_t *pRExC_state, scan_data_t *data)
c277df42 489{
a0ed51b3
LW
490 STRLEN l = CHR_SVLEN(data->last_found);
491 STRLEN old_l = CHR_SVLEN(*data->longest);
b81d288d 492
c277df42
IZ
493 if ((l >= old_l) && ((l > old_l) || (data->flags & SF_BEFORE_EOL))) {
494 sv_setsv(*data->longest, data->last_found);
495 if (*data->longest == data->longest_fixed) {
496 data->offset_fixed = l ? data->last_start_min : data->pos_min;
497 if (data->flags & SF_BEFORE_EOL)
b81d288d 498 data->flags
c277df42
IZ
499 |= ((data->flags & SF_BEFORE_EOL) << SF_FIX_SHIFT_EOL);
500 else
501 data->flags &= ~SF_FIX_BEFORE_EOL;
a0ed51b3
LW
502 }
503 else {
c277df42 504 data->offset_float_min = l ? data->last_start_min : data->pos_min;
b81d288d
AB
505 data->offset_float_max = (l
506 ? data->last_start_max
c277df42 507 : data->pos_min + data->pos_delta);
9051bda5
HS
508 if ((U32)data->offset_float_max > (U32)I32_MAX)
509 data->offset_float_max = I32_MAX;
c277df42 510 if (data->flags & SF_BEFORE_EOL)
b81d288d 511 data->flags
c277df42
IZ
512 |= ((data->flags & SF_BEFORE_EOL) << SF_FL_SHIFT_EOL);
513 else
514 data->flags &= ~SF_FL_BEFORE_EOL;
515 }
516 }
517 SvCUR_set(data->last_found, 0);
518 data->last_end = -1;
519 data->flags &= ~SF_BEFORE_EOL;
520}
521
653099ff
GS
522/* Can match anything (initialization) */
523STATIC void
830247a4 524S_cl_anything(pTHX_ RExC_state_t *pRExC_state, struct regnode_charclass_class *cl)
653099ff 525{
653099ff 526 ANYOF_CLASS_ZERO(cl);
f8bef550 527 ANYOF_BITMAP_SETALL(cl);
1aa99e6b 528 cl->flags = ANYOF_EOS|ANYOF_UNICODE_ALL;
653099ff
GS
529 if (LOC)
530 cl->flags |= ANYOF_LOCALE;
531}
532
533/* Can match anything (initialization) */
534STATIC int
535S_cl_is_anything(pTHX_ struct regnode_charclass_class *cl)
536{
537 int value;
538
aaa51d5e 539 for (value = 0; value <= ANYOF_MAX; value += 2)
653099ff
GS
540 if (ANYOF_CLASS_TEST(cl, value) && ANYOF_CLASS_TEST(cl, value + 1))
541 return 1;
1aa99e6b
IH
542 if (!(cl->flags & ANYOF_UNICODE_ALL))
543 return 0;
f8bef550
NC
544 if (!ANYOF_BITMAP_TESTALLSET(cl))
545 return 0;
653099ff
GS
546 return 1;
547}
548
549/* Can match anything (initialization) */
550STATIC void
830247a4 551S_cl_init(pTHX_ RExC_state_t *pRExC_state, struct regnode_charclass_class *cl)
653099ff 552{
8ecf7187 553 Zero(cl, 1, struct regnode_charclass_class);
653099ff 554 cl->type = ANYOF;
830247a4 555 cl_anything(pRExC_state, cl);
653099ff
GS
556}
557
558STATIC void
830247a4 559S_cl_init_zero(pTHX_ RExC_state_t *pRExC_state, struct regnode_charclass_class *cl)
653099ff 560{
8ecf7187 561 Zero(cl, 1, struct regnode_charclass_class);
653099ff 562 cl->type = ANYOF;
830247a4 563 cl_anything(pRExC_state, cl);
653099ff
GS
564 if (LOC)
565 cl->flags |= ANYOF_LOCALE;
566}
567
568/* 'And' a given class with another one. Can create false positives */
569/* We assume that cl is not inverted */
570STATIC void
571S_cl_and(pTHX_ struct regnode_charclass_class *cl,
572 struct regnode_charclass_class *and_with)
573{
653099ff
GS
574 if (!(and_with->flags & ANYOF_CLASS)
575 && !(cl->flags & ANYOF_CLASS)
576 && (and_with->flags & ANYOF_LOCALE) == (cl->flags & ANYOF_LOCALE)
577 && !(and_with->flags & ANYOF_FOLD)
578 && !(cl->flags & ANYOF_FOLD)) {
579 int i;
580
581 if (and_with->flags & ANYOF_INVERT)
582 for (i = 0; i < ANYOF_BITMAP_SIZE; i++)
583 cl->bitmap[i] &= ~and_with->bitmap[i];
584 else
585 for (i = 0; i < ANYOF_BITMAP_SIZE; i++)
586 cl->bitmap[i] &= and_with->bitmap[i];
587 } /* XXXX: logic is complicated otherwise, leave it along for a moment. */
588 if (!(and_with->flags & ANYOF_EOS))
589 cl->flags &= ~ANYOF_EOS;
1aa99e6b
IH
590
591 if (cl->flags & ANYOF_UNICODE_ALL && and_with->flags & ANYOF_UNICODE) {
592 cl->flags &= ~ANYOF_UNICODE_ALL;
593 cl->flags |= ANYOF_UNICODE;
594 ARG_SET(cl, ARG(and_with));
595 }
596 if (!(and_with->flags & ANYOF_UNICODE_ALL))
597 cl->flags &= ~ANYOF_UNICODE_ALL;
598 if (!(and_with->flags & (ANYOF_UNICODE|ANYOF_UNICODE_ALL)))
599 cl->flags &= ~ANYOF_UNICODE;
653099ff
GS
600}
601
602/* 'OR' a given class with another one. Can create false positives */
603/* We assume that cl is not inverted */
604STATIC void
830247a4 605S_cl_or(pTHX_ RExC_state_t *pRExC_state, struct regnode_charclass_class *cl, struct regnode_charclass_class *or_with)
653099ff 606{
653099ff
GS
607 if (or_with->flags & ANYOF_INVERT) {
608 /* We do not use
609 * (B1 | CL1) | (!B2 & !CL2) = (B1 | !B2 & !CL2) | (CL1 | (!B2 & !CL2))
610 * <= (B1 | !B2) | (CL1 | !CL2)
611 * which is wasteful if CL2 is small, but we ignore CL2:
612 * (B1 | CL1) | (!B2 & !CL2) <= (B1 | CL1) | !B2 = (B1 | !B2) | CL1
613 * XXXX Can we handle case-fold? Unclear:
614 * (OK1(i) | OK1(i')) | !(OK1(i) | OK1(i')) =
615 * (OK1(i) | OK1(i')) | (!OK1(i) & !OK1(i'))
616 */
617 if ( (or_with->flags & ANYOF_LOCALE) == (cl->flags & ANYOF_LOCALE)
618 && !(or_with->flags & ANYOF_FOLD)
619 && !(cl->flags & ANYOF_FOLD) ) {
620 int i;
621
622 for (i = 0; i < ANYOF_BITMAP_SIZE; i++)
623 cl->bitmap[i] |= ~or_with->bitmap[i];
624 } /* XXXX: logic is complicated otherwise */
625 else {
830247a4 626 cl_anything(pRExC_state, cl);
653099ff
GS
627 }
628 } else {
629 /* (B1 | CL1) | (B2 | CL2) = (B1 | B2) | (CL1 | CL2)) */
630 if ( (or_with->flags & ANYOF_LOCALE) == (cl->flags & ANYOF_LOCALE)
b81d288d 631 && (!(or_with->flags & ANYOF_FOLD)
653099ff
GS
632 || (cl->flags & ANYOF_FOLD)) ) {
633 int i;
634
635 /* OR char bitmap and class bitmap separately */
636 for (i = 0; i < ANYOF_BITMAP_SIZE; i++)
637 cl->bitmap[i] |= or_with->bitmap[i];
638 if (or_with->flags & ANYOF_CLASS) {
639 for (i = 0; i < ANYOF_CLASSBITMAP_SIZE; i++)
640 cl->classflags[i] |= or_with->classflags[i];
641 cl->flags |= ANYOF_CLASS;
642 }
643 }
644 else { /* XXXX: logic is complicated, leave it along for a moment. */
830247a4 645 cl_anything(pRExC_state, cl);
653099ff
GS
646 }
647 }
648 if (or_with->flags & ANYOF_EOS)
649 cl->flags |= ANYOF_EOS;
1aa99e6b
IH
650
651 if (cl->flags & ANYOF_UNICODE && or_with->flags & ANYOF_UNICODE &&
652 ARG(cl) != ARG(or_with)) {
653 cl->flags |= ANYOF_UNICODE_ALL;
654 cl->flags &= ~ANYOF_UNICODE;
655 }
656 if (or_with->flags & ANYOF_UNICODE_ALL) {
657 cl->flags |= ANYOF_UNICODE_ALL;
658 cl->flags &= ~ANYOF_UNICODE;
659 }
653099ff
GS
660}
661
5d1c421c
JH
662/*
663 * There are strange code-generation bugs caused on sparc64 by gcc-2.95.2.
664 * These need to be revisited when a newer toolchain becomes available.
665 */
666#if defined(__sparc64__) && defined(__GNUC__)
667# if __GNUC__ < 2 || (__GNUC__ == 2 && __GNUC_MINOR__ < 96)
668# undef SPARC64_GCC_WORKAROUND
669# define SPARC64_GCC_WORKAROUND 1
670# endif
671#endif
672
653099ff
GS
673/* REx optimizer. Converts nodes into quickier variants "in place".
674 Finds fixed substrings. */
675
c277df42
IZ
676/* Stops at toplevel WHILEM as well as at `last'. At end *scanp is set
677 to the position after last scanned or to NULL. */
678
76e3520e 679STATIC I32
830247a4 680S_study_chunk(pTHX_ RExC_state_t *pRExC_state, regnode **scanp, I32 *deltap, regnode *last, scan_data_t *data, U32 flags)
c277df42
IZ
681 /* scanp: Start here (read-write). */
682 /* deltap: Write maxlen-minlen here. */
683 /* last: Stop before this one. */
684{
685 I32 min = 0, pars = 0, code;
686 regnode *scan = *scanp, *next;
687 I32 delta = 0;
688 int is_inf = (flags & SCF_DO_SUBSTR) && (data->flags & SF_IS_INF);
aca2d497 689 int is_inf_internal = 0; /* The studied chunk is infinite */
c277df42
IZ
690 I32 is_par = OP(scan) == OPEN ? ARG(scan) : 0;
691 scan_data_t data_fake;
653099ff 692 struct regnode_charclass_class and_with; /* Valid if flags & SCF_DO_STCLASS_OR */
b81d288d 693
c277df42
IZ
694 while (scan && OP(scan) != END && scan < last) {
695 /* Peephole optimizer: */
696
22c35a8c 697 if (PL_regkind[(U8)OP(scan)] == EXACT) {
653099ff 698 /* Merge several consecutive EXACTish nodes into one. */
c277df42
IZ
699 regnode *n = regnext(scan);
700 U32 stringok = 1;
701#ifdef DEBUGGING
702 regnode *stop = scan;
b81d288d 703#endif
c277df42 704
cd439c50 705 next = scan + NODE_SZ_STR(scan);
c277df42
IZ
706 /* Skip NOTHING, merge EXACT*. */
707 while (n &&
b81d288d 708 ( PL_regkind[(U8)OP(n)] == NOTHING ||
c277df42
IZ
709 (stringok && (OP(n) == OP(scan))))
710 && NEXT_OFF(n)
711 && NEXT_OFF(scan) + NEXT_OFF(n) < I16_MAX) {
712 if (OP(n) == TAIL || n > next)
713 stringok = 0;
22c35a8c 714 if (PL_regkind[(U8)OP(n)] == NOTHING) {
c277df42
IZ
715 NEXT_OFF(scan) += NEXT_OFF(n);
716 next = n + NODE_STEP_REGNODE;
717#ifdef DEBUGGING
718 if (stringok)
719 stop = n;
b81d288d 720#endif
c277df42 721 n = regnext(n);
a0ed51b3 722 }
f49d4d0f 723 else if (stringok) {
cd439c50 724 int oldl = STR_LEN(scan);
c277df42 725 regnode *nnext = regnext(n);
f49d4d0f 726
b81d288d 727 if (oldl + STR_LEN(n) > U8_MAX)
c277df42
IZ
728 break;
729 NEXT_OFF(scan) += NEXT_OFF(n);
cd439c50
IZ
730 STR_LEN(scan) += STR_LEN(n);
731 next = n + NODE_SZ_STR(n);
c277df42 732 /* Now we can overwrite *n : */
f49d4d0f 733 Move(STRING(n), STRING(scan) + oldl, STR_LEN(n), char);
c277df42 734#ifdef DEBUGGING
f49d4d0f 735 stop = next - 1;
b81d288d 736#endif
c277df42
IZ
737 n = nnext;
738 }
739 }
61a36c01 740
d65e4eab 741 if (UTF && OP(scan) == EXACTF && STR_LEN(scan) >= 6) {
61a36c01
JH
742/*
743 Two problematic code points in Unicode casefolding of EXACT nodes:
744
745 U+0390 - GREEK SMALL LETTER IOTA WITH DIALYTIKA AND TONOS
746 U+03B0 - GREEK SMALL LETTER UPSILON WITH DIALYTIKA AND TONOS
747
748 which casefold to
749
750 Unicode UTF-8
751
752 U+03B9 U+0308 U+0301 0xCE 0xB9 0xCC 0x88 0xCC 0x81
753 U+03C5 U+0308 U+0301 0xCF 0x85 0xCC 0x88 0xCC 0x81
754
755 This means that in case-insensitive matching (or "loose matching",
756 as Unicode calls it), an EXACTF of length six (the UTF-8 encoded byte
757 length of the above casefolded versions) can match a target string
758 of length two (the byte length of UTF-8 encoded U+0390 or U+03B0).
759 This would rather mess up the minimum length computation.
760
761 What we'll do is to look for the tail four bytes, and then peek
762 at the preceding two bytes to see whether we need to decrease
763 the minimum length by four (six minus two).
764
765 Thanks to the design of UTF-8, there cannot be false matches:
766 A sequence of valid UTF-8 bytes cannot be a subsequence of
767 another valid sequence of UTF-8 bytes.
768
769*/
770 char *s0 = STRING(scan), *s, *t;
771 char *s1 = s0 + STR_LEN(scan) - 1, *s2 = s1 - 4;
772 char *t0 = "\xcc\x88\xcc\x81";
773 char *t1 = t0 + 3;
774
775 for (s = s0 + 2;
776 s < s2 && (t = ninstr(s, s1, t0, t1));
777 s = t + 4) {
778 if (((U8)t[-1] == 0xB9 && (U8)t[-2] == 0xCE) ||
779 ((U8)t[-1] == 0x85 && (U8)t[-2] == 0xCF))
780 min -= 4;
781 }
782 }
783
c277df42
IZ
784#ifdef DEBUGGING
785 /* Allow dumping */
cd439c50 786 n = scan + NODE_SZ_STR(scan);
c277df42 787 while (n <= stop) {
22c35a8c 788 if (PL_regkind[(U8)OP(n)] != NOTHING || OP(n) == NOTHING) {
c277df42
IZ
789 OP(n) = OPTIMIZED;
790 NEXT_OFF(n) = 0;
791 }
792 n++;
793 }
653099ff 794#endif
c277df42 795 }
653099ff
GS
796 /* Follow the next-chain of the current node and optimize
797 away all the NOTHINGs from it. */
c277df42 798 if (OP(scan) != CURLYX) {
048cfca1
GS
799 int max = (reg_off_by_arg[OP(scan)]
800 ? I32_MAX
801 /* I32 may be smaller than U16 on CRAYs! */
802 : (I32_MAX < U16_MAX ? I32_MAX : U16_MAX));
c277df42
IZ
803 int off = (reg_off_by_arg[OP(scan)] ? ARG(scan) : NEXT_OFF(scan));
804 int noff;
805 regnode *n = scan;
b81d288d 806
c277df42
IZ
807 /* Skip NOTHING and LONGJMP. */
808 while ((n = regnext(n))
22c35a8c 809 && ((PL_regkind[(U8)OP(n)] == NOTHING && (noff = NEXT_OFF(n)))
c277df42
IZ
810 || ((OP(n) == LONGJMP) && (noff = ARG(n))))
811 && off + noff < max)
812 off += noff;
813 if (reg_off_by_arg[OP(scan)])
814 ARG(scan) = off;
b81d288d 815 else
c277df42
IZ
816 NEXT_OFF(scan) = off;
817 }
653099ff
GS
818 /* The principal pseudo-switch. Cannot be a switch, since we
819 look into several different things. */
b81d288d 820 if (OP(scan) == BRANCH || OP(scan) == BRANCHJ
c277df42
IZ
821 || OP(scan) == IFTHEN || OP(scan) == SUSPEND) {
822 next = regnext(scan);
823 code = OP(scan);
b81d288d
AB
824
825 if (OP(next) == code || code == IFTHEN || code == SUSPEND) {
c277df42 826 I32 max1 = 0, min1 = I32_MAX, num = 0;
653099ff 827 struct regnode_charclass_class accum;
c277df42 828
653099ff 829 if (flags & SCF_DO_SUBSTR) /* XXXX Add !SUSPEND? */
830247a4 830 scan_commit(pRExC_state, data); /* Cannot merge strings after this. */
653099ff 831 if (flags & SCF_DO_STCLASS)
830247a4 832 cl_init_zero(pRExC_state, &accum);
c277df42 833 while (OP(scan) == code) {
830247a4 834 I32 deltanext, minnext, f = 0, fake;
653099ff 835 struct regnode_charclass_class this_class;
c277df42
IZ
836
837 num++;
838 data_fake.flags = 0;
b81d288d 839 if (data) {
2c2d71f5 840 data_fake.whilem_c = data->whilem_c;
cb434fcc
IZ
841 data_fake.last_closep = data->last_closep;
842 }
843 else
844 data_fake.last_closep = &fake;
c277df42
IZ
845 next = regnext(scan);
846 scan = NEXTOPER(scan);
847 if (code != BRANCH)
848 scan = NEXTOPER(scan);
653099ff 849 if (flags & SCF_DO_STCLASS) {
830247a4 850 cl_init(pRExC_state, &this_class);
653099ff
GS
851 data_fake.start_class = &this_class;
852 f = SCF_DO_STCLASS_AND;
b81d288d 853 }
e1901655
IZ
854 if (flags & SCF_WHILEM_VISITED_POS)
855 f |= SCF_WHILEM_VISITED_POS;
653099ff 856 /* we suppose the run is continuous, last=next...*/
830247a4
IZ
857 minnext = study_chunk(pRExC_state, &scan, &deltanext,
858 next, &data_fake, f);
b81d288d 859 if (min1 > minnext)
c277df42
IZ
860 min1 = minnext;
861 if (max1 < minnext + deltanext)
862 max1 = minnext + deltanext;
863 if (deltanext == I32_MAX)
aca2d497 864 is_inf = is_inf_internal = 1;
c277df42
IZ
865 scan = next;
866 if (data_fake.flags & (SF_HAS_PAR|SF_IN_PAR))
867 pars++;
405ff068 868 if (data && (data_fake.flags & SF_HAS_EVAL))
c277df42 869 data->flags |= SF_HAS_EVAL;
2c2d71f5
JH
870 if (data)
871 data->whilem_c = data_fake.whilem_c;
653099ff 872 if (flags & SCF_DO_STCLASS)
830247a4 873 cl_or(pRExC_state, &accum, &this_class);
b81d288d 874 if (code == SUSPEND)
c277df42
IZ
875 break;
876 }
877 if (code == IFTHEN && num < 2) /* Empty ELSE branch */
878 min1 = 0;
879 if (flags & SCF_DO_SUBSTR) {
880 data->pos_min += min1;
881 data->pos_delta += max1 - min1;
882 if (max1 != min1 || is_inf)
883 data->longest = &(data->longest_float);
884 }
885 min += min1;
886 delta += max1 - min1;
653099ff 887 if (flags & SCF_DO_STCLASS_OR) {
830247a4 888 cl_or(pRExC_state, data->start_class, &accum);
653099ff
GS
889 if (min1) {
890 cl_and(data->start_class, &and_with);
891 flags &= ~SCF_DO_STCLASS;
892 }
893 }
894 else if (flags & SCF_DO_STCLASS_AND) {
de0c8cb8
GS
895 if (min1) {
896 cl_and(data->start_class, &accum);
653099ff 897 flags &= ~SCF_DO_STCLASS;
de0c8cb8
GS
898 }
899 else {
b81d288d 900 /* Switch to OR mode: cache the old value of
de0c8cb8
GS
901 * data->start_class */
902 StructCopy(data->start_class, &and_with,
903 struct regnode_charclass_class);
904 flags &= ~SCF_DO_STCLASS_AND;
905 StructCopy(&accum, data->start_class,
906 struct regnode_charclass_class);
907 flags |= SCF_DO_STCLASS_OR;
908 data->start_class->flags |= ANYOF_EOS;
909 }
653099ff 910 }
a0ed51b3
LW
911 }
912 else if (code == BRANCHJ) /* single branch is optimized. */
c277df42
IZ
913 scan = NEXTOPER(NEXTOPER(scan));
914 else /* single branch is optimized. */
915 scan = NEXTOPER(scan);
916 continue;
a0ed51b3
LW
917 }
918 else if (OP(scan) == EXACT) {
cd439c50 919 I32 l = STR_LEN(scan);
1aa99e6b 920 UV uc = *((U8*)STRING(scan));
a0ed51b3 921 if (UTF) {
1aa99e6b
IH
922 U8 *s = (U8*)STRING(scan);
923 l = utf8_length(s, s + l);
9041c2e3 924 uc = utf8_to_uvchr(s, NULL);
a0ed51b3
LW
925 }
926 min += l;
c277df42 927 if (flags & SCF_DO_SUBSTR) { /* Update longest substr. */
c277df42
IZ
928 /* The code below prefers earlier match for fixed
929 offset, later match for variable offset. */
930 if (data->last_end == -1) { /* Update the start info. */
931 data->last_start_min = data->pos_min;
932 data->last_start_max = is_inf
b81d288d 933 ? I32_MAX : data->pos_min + data->pos_delta;
c277df42 934 }
cd439c50 935 sv_catpvn(data->last_found, STRING(scan), STR_LEN(scan));
33b8afdf
JH
936 if (UTF)
937 SvUTF8_on(data->last_found);
c277df42
IZ
938 data->last_end = data->pos_min + l;
939 data->pos_min += l; /* As in the first entry. */
940 data->flags &= ~SF_BEFORE_EOL;
941 }
653099ff
GS
942 if (flags & SCF_DO_STCLASS_AND) {
943 /* Check whether it is compatible with what we know already! */
944 int compat = 1;
945
1aa99e6b 946 if (uc >= 0x100 ||
516a5887 947 (!(data->start_class->flags & (ANYOF_CLASS | ANYOF_LOCALE))
1aa99e6b 948 && !ANYOF_BITMAP_TEST(data->start_class, uc)
653099ff 949 && (!(data->start_class->flags & ANYOF_FOLD)
1aa99e6b 950 || !ANYOF_BITMAP_TEST(data->start_class, PL_fold[uc])))
516a5887 951 )
653099ff
GS
952 compat = 0;
953 ANYOF_CLASS_ZERO(data->start_class);
954 ANYOF_BITMAP_ZERO(data->start_class);
955 if (compat)
1aa99e6b 956 ANYOF_BITMAP_SET(data->start_class, uc);
653099ff 957 data->start_class->flags &= ~ANYOF_EOS;
9b877dbb
IH
958 if (uc < 0x100)
959 data->start_class->flags &= ~ANYOF_UNICODE_ALL;
653099ff
GS
960 }
961 else if (flags & SCF_DO_STCLASS_OR) {
962 /* false positive possible if the class is case-folded */
1aa99e6b 963 if (uc < 0x100)
9b877dbb
IH
964 ANYOF_BITMAP_SET(data->start_class, uc);
965 else
966 data->start_class->flags |= ANYOF_UNICODE_ALL;
653099ff
GS
967 data->start_class->flags &= ~ANYOF_EOS;
968 cl_and(data->start_class, &and_with);
969 }
970 flags &= ~SCF_DO_STCLASS;
a0ed51b3 971 }
653099ff 972 else if (PL_regkind[(U8)OP(scan)] == EXACT) { /* But OP != EXACT! */
cd439c50 973 I32 l = STR_LEN(scan);
1aa99e6b 974 UV uc = *((U8*)STRING(scan));
653099ff
GS
975
976 /* Search for fixed substrings supports EXACT only. */
b81d288d 977 if (flags & SCF_DO_SUBSTR)
830247a4 978 scan_commit(pRExC_state, data);
a0ed51b3 979 if (UTF) {
1aa99e6b
IH
980 U8 *s = (U8 *)STRING(scan);
981 l = utf8_length(s, s + l);
9041c2e3 982 uc = utf8_to_uvchr(s, NULL);
a0ed51b3
LW
983 }
984 min += l;
c277df42 985 if (data && (flags & SCF_DO_SUBSTR))
a0ed51b3 986 data->pos_min += l;
653099ff
GS
987 if (flags & SCF_DO_STCLASS_AND) {
988 /* Check whether it is compatible with what we know already! */
989 int compat = 1;
990
1aa99e6b 991 if (uc >= 0x100 ||
516a5887 992 (!(data->start_class->flags & (ANYOF_CLASS | ANYOF_LOCALE))
1aa99e6b 993 && !ANYOF_BITMAP_TEST(data->start_class, uc)
516a5887 994 && !ANYOF_BITMAP_TEST(data->start_class, PL_fold[uc])))
653099ff
GS
995 compat = 0;
996 ANYOF_CLASS_ZERO(data->start_class);
997 ANYOF_BITMAP_ZERO(data->start_class);
998 if (compat) {
1aa99e6b 999 ANYOF_BITMAP_SET(data->start_class, uc);
653099ff
GS
1000 data->start_class->flags &= ~ANYOF_EOS;
1001 data->start_class->flags |= ANYOF_FOLD;
1002 if (OP(scan) == EXACTFL)
1003 data->start_class->flags |= ANYOF_LOCALE;
1004 }
1005 }
1006 else if (flags & SCF_DO_STCLASS_OR) {
1007 if (data->start_class->flags & ANYOF_FOLD) {
1008 /* false positive possible if the class is case-folded.
1009 Assume that the locale settings are the same... */
1aa99e6b
IH
1010 if (uc < 0x100)
1011 ANYOF_BITMAP_SET(data->start_class, uc);
653099ff
GS
1012 data->start_class->flags &= ~ANYOF_EOS;
1013 }
1014 cl_and(data->start_class, &and_with);
1015 }
1016 flags &= ~SCF_DO_STCLASS;
a0ed51b3 1017 }
4d61ec05 1018 else if (strchr((char*)PL_varies,OP(scan))) {
9c5ffd7c 1019 I32 mincount, maxcount, minnext, deltanext, fl = 0;
aa7a4b56 1020 I32 f = flags, pos_before = 0;
c277df42 1021 regnode *oscan = scan;
653099ff
GS
1022 struct regnode_charclass_class this_class;
1023 struct regnode_charclass_class *oclass = NULL;
727f22e3 1024 I32 next_is_eval = 0;
653099ff 1025
22c35a8c 1026 switch (PL_regkind[(U8)OP(scan)]) {
653099ff 1027 case WHILEM: /* End of (?:...)* . */
c277df42
IZ
1028 scan = NEXTOPER(scan);
1029 goto finish;
1030 case PLUS:
653099ff 1031 if (flags & (SCF_DO_SUBSTR | SCF_DO_STCLASS)) {
c277df42 1032 next = NEXTOPER(scan);
653099ff 1033 if (OP(next) == EXACT || (flags & SCF_DO_STCLASS)) {
b81d288d
AB
1034 mincount = 1;
1035 maxcount = REG_INFTY;
c277df42
IZ
1036 next = regnext(scan);
1037 scan = NEXTOPER(scan);
1038 goto do_curly;
1039 }
1040 }
1041 if (flags & SCF_DO_SUBSTR)
1042 data->pos_min++;
1043 min++;
1044 /* Fall through. */
1045 case STAR:
653099ff
GS
1046 if (flags & SCF_DO_STCLASS) {
1047 mincount = 0;
b81d288d 1048 maxcount = REG_INFTY;
653099ff
GS
1049 next = regnext(scan);
1050 scan = NEXTOPER(scan);
1051 goto do_curly;
1052 }
b81d288d 1053 is_inf = is_inf_internal = 1;
c277df42
IZ
1054 scan = regnext(scan);
1055 if (flags & SCF_DO_SUBSTR) {
830247a4 1056 scan_commit(pRExC_state, data); /* Cannot extend fixed substrings */
c277df42
IZ
1057 data->longest = &(data->longest_float);
1058 }
1059 goto optimize_curly_tail;
1060 case CURLY:
b81d288d 1061 mincount = ARG1(scan);
c277df42
IZ
1062 maxcount = ARG2(scan);
1063 next = regnext(scan);
cb434fcc
IZ
1064 if (OP(scan) == CURLYX) {
1065 I32 lp = (data ? *(data->last_closep) : 0);
1066
1067 scan->flags = ((lp <= U8_MAX) ? lp : U8_MAX);
1068 }
c277df42 1069 scan = NEXTOPER(scan) + EXTRA_STEP_2ARGS;
727f22e3 1070 next_is_eval = (OP(scan) == EVAL);
c277df42
IZ
1071 do_curly:
1072 if (flags & SCF_DO_SUBSTR) {
830247a4 1073 if (mincount == 0) scan_commit(pRExC_state,data); /* Cannot extend fixed substrings */
c277df42
IZ
1074 pos_before = data->pos_min;
1075 }
1076 if (data) {
1077 fl = data->flags;
1078 data->flags &= ~(SF_HAS_PAR|SF_IN_PAR|SF_HAS_EVAL);
1079 if (is_inf)
1080 data->flags |= SF_IS_INF;
1081 }
653099ff 1082 if (flags & SCF_DO_STCLASS) {
830247a4 1083 cl_init(pRExC_state, &this_class);
653099ff
GS
1084 oclass = data->start_class;
1085 data->start_class = &this_class;
1086 f |= SCF_DO_STCLASS_AND;
1087 f &= ~SCF_DO_STCLASS_OR;
1088 }
e1901655
IZ
1089 /* These are the cases when once a subexpression
1090 fails at a particular position, it cannot succeed
1091 even after backtracking at the enclosing scope.
b81d288d 1092
e1901655
IZ
1093 XXXX what if minimal match and we are at the
1094 initial run of {n,m}? */
1095 if ((mincount != maxcount - 1) && (maxcount != REG_INFTY))
1096 f &= ~SCF_WHILEM_VISITED_POS;
653099ff 1097
c277df42 1098 /* This will finish on WHILEM, setting scan, or on NULL: */
b81d288d
AB
1099 minnext = study_chunk(pRExC_state, &scan, &deltanext, last, data,
1100 mincount == 0
653099ff
GS
1101 ? (f & ~SCF_DO_SUBSTR) : f);
1102
1103 if (flags & SCF_DO_STCLASS)
1104 data->start_class = oclass;
1105 if (mincount == 0 || minnext == 0) {
1106 if (flags & SCF_DO_STCLASS_OR) {
830247a4 1107 cl_or(pRExC_state, data->start_class, &this_class);
653099ff
GS
1108 }
1109 else if (flags & SCF_DO_STCLASS_AND) {
b81d288d 1110 /* Switch to OR mode: cache the old value of
653099ff
GS
1111 * data->start_class */
1112 StructCopy(data->start_class, &and_with,
1113 struct regnode_charclass_class);
1114 flags &= ~SCF_DO_STCLASS_AND;
1115 StructCopy(&this_class, data->start_class,
1116 struct regnode_charclass_class);
1117 flags |= SCF_DO_STCLASS_OR;
1118 data->start_class->flags |= ANYOF_EOS;
1119 }
1120 } else { /* Non-zero len */
1121 if (flags & SCF_DO_STCLASS_OR) {
830247a4 1122 cl_or(pRExC_state, data->start_class, &this_class);
653099ff
GS
1123 cl_and(data->start_class, &and_with);
1124 }
1125 else if (flags & SCF_DO_STCLASS_AND)
1126 cl_and(data->start_class, &this_class);
1127 flags &= ~SCF_DO_STCLASS;
1128 }
c277df42
IZ
1129 if (!scan) /* It was not CURLYX, but CURLY. */
1130 scan = next;
84037bb0 1131 if (ckWARN(WARN_REGEXP)
727f22e3
JP
1132 /* ? quantifier ok, except for (?{ ... }) */
1133 && (next_is_eval || !(mincount == 0 && maxcount == 1))
84037bb0 1134 && (minnext == 0) && (deltanext == 0)
99799961 1135 && data && !(data->flags & (SF_HAS_PAR|SF_IN_PAR))
17feb5d5 1136 && maxcount <= REG_INFTY/3) /* Complement check for big count */
b45f050a 1137 {
830247a4 1138 vWARN(RExC_parse,
b45f050a
JF
1139 "Quantifier unexpected on zero-length expression");
1140 }
1141
c277df42 1142 min += minnext * mincount;
b81d288d 1143 is_inf_internal |= ((maxcount == REG_INFTY
155aba94
GS
1144 && (minnext + deltanext) > 0)
1145 || deltanext == I32_MAX);
aca2d497 1146 is_inf |= is_inf_internal;
c277df42
IZ
1147 delta += (minnext + deltanext) * maxcount - minnext * mincount;
1148
1149 /* Try powerful optimization CURLYX => CURLYN. */
b81d288d 1150 if ( OP(oscan) == CURLYX && data
c277df42
IZ
1151 && data->flags & SF_IN_PAR
1152 && !(data->flags & SF_HAS_EVAL)
1153 && !deltanext && minnext == 1 ) {
1154 /* Try to optimize to CURLYN. */
1155 regnode *nxt = NEXTOPER(oscan) + EXTRA_STEP_2ARGS;
497b47a8
JH
1156 regnode *nxt1 = nxt;
1157#ifdef DEBUGGING
1158 regnode *nxt2;
1159#endif
c277df42
IZ
1160
1161 /* Skip open. */
1162 nxt = regnext(nxt);
4d61ec05 1163 if (!strchr((char*)PL_simple,OP(nxt))
22c35a8c 1164 && !(PL_regkind[(U8)OP(nxt)] == EXACT
b81d288d 1165 && STR_LEN(nxt) == 1))
c277df42 1166 goto nogo;
497b47a8 1167#ifdef DEBUGGING
c277df42 1168 nxt2 = nxt;
497b47a8 1169#endif
c277df42 1170 nxt = regnext(nxt);
b81d288d 1171 if (OP(nxt) != CLOSE)
c277df42
IZ
1172 goto nogo;
1173 /* Now we know that nxt2 is the only contents: */
1174 oscan->flags = ARG(nxt);
1175 OP(oscan) = CURLYN;
1176 OP(nxt1) = NOTHING; /* was OPEN. */
1177#ifdef DEBUGGING
1178 OP(nxt1 + 1) = OPTIMIZED; /* was count. */
1179 NEXT_OFF(nxt1+ 1) = 0; /* just for consistancy. */
1180 NEXT_OFF(nxt2) = 0; /* just for consistancy with CURLY. */
1181 OP(nxt) = OPTIMIZED; /* was CLOSE. */
1182 OP(nxt + 1) = OPTIMIZED; /* was count. */
1183 NEXT_OFF(nxt+ 1) = 0; /* just for consistancy. */
b81d288d 1184#endif
c277df42 1185 }
c277df42
IZ
1186 nogo:
1187
1188 /* Try optimization CURLYX => CURLYM. */
b81d288d 1189 if ( OP(oscan) == CURLYX && data
c277df42 1190 && !(data->flags & SF_HAS_PAR)
c277df42
IZ
1191 && !(data->flags & SF_HAS_EVAL)
1192 && !deltanext ) {
1193 /* XXXX How to optimize if data == 0? */
1194 /* Optimize to a simpler form. */
1195 regnode *nxt = NEXTOPER(oscan) + EXTRA_STEP_2ARGS; /* OPEN */
1196 regnode *nxt2;
1197
1198 OP(oscan) = CURLYM;
1199 while ( (nxt2 = regnext(nxt)) /* skip over embedded stuff*/
b81d288d 1200 && (OP(nxt2) != WHILEM))
c277df42
IZ
1201 nxt = nxt2;
1202 OP(nxt2) = SUCCEED; /* Whas WHILEM */
c277df42
IZ
1203 /* Need to optimize away parenths. */
1204 if (data->flags & SF_IN_PAR) {
1205 /* Set the parenth number. */
1206 regnode *nxt1 = NEXTOPER(oscan) + EXTRA_STEP_2ARGS; /* OPEN*/
1207
b81d288d 1208 if (OP(nxt) != CLOSE)
b45f050a 1209 FAIL("Panic opt close");
c277df42
IZ
1210 oscan->flags = ARG(nxt);
1211 OP(nxt1) = OPTIMIZED; /* was OPEN. */
1212 OP(nxt) = OPTIMIZED; /* was CLOSE. */
1213#ifdef DEBUGGING
1214 OP(nxt1 + 1) = OPTIMIZED; /* was count. */
1215 OP(nxt + 1) = OPTIMIZED; /* was count. */
1216 NEXT_OFF(nxt1 + 1) = 0; /* just for consistancy. */
1217 NEXT_OFF(nxt + 1) = 0; /* just for consistancy. */
b81d288d 1218#endif
c277df42
IZ
1219#if 0
1220 while ( nxt1 && (OP(nxt1) != WHILEM)) {
1221 regnode *nnxt = regnext(nxt1);
b81d288d 1222
c277df42
IZ
1223 if (nnxt == nxt) {
1224 if (reg_off_by_arg[OP(nxt1)])
1225 ARG_SET(nxt1, nxt2 - nxt1);
1226 else if (nxt2 - nxt1 < U16_MAX)
1227 NEXT_OFF(nxt1) = nxt2 - nxt1;
1228 else
1229 OP(nxt) = NOTHING; /* Cannot beautify */
1230 }
1231 nxt1 = nnxt;
1232 }
1233#endif
1234 /* Optimize again: */
b81d288d 1235 study_chunk(pRExC_state, &nxt1, &deltanext, nxt,
e1901655 1236 NULL, 0);
a0ed51b3
LW
1237 }
1238 else
c277df42 1239 oscan->flags = 0;
c277df42 1240 }
e1901655
IZ
1241 else if ((OP(oscan) == CURLYX)
1242 && (flags & SCF_WHILEM_VISITED_POS)
1243 /* See the comment on a similar expression above.
1244 However, this time it not a subexpression
1245 we care about, but the expression itself. */
1246 && (maxcount == REG_INFTY)
1247 && data && ++data->whilem_c < 16) {
1248 /* This stays as CURLYX, we can put the count/of pair. */
2c2d71f5
JH
1249 /* Find WHILEM (as in regexec.c) */
1250 regnode *nxt = oscan + NEXT_OFF(oscan);
1251
1252 if (OP(PREVOPER(nxt)) == NOTHING) /* LONGJMP */
1253 nxt += ARG(nxt);
1254 PREVOPER(nxt)->flags = data->whilem_c
830247a4 1255 | (RExC_whilem_seen << 4); /* On WHILEM */
2c2d71f5 1256 }
b81d288d 1257 if (data && fl & (SF_HAS_PAR|SF_IN_PAR))
c277df42
IZ
1258 pars++;
1259 if (flags & SCF_DO_SUBSTR) {
1260 SV *last_str = Nullsv;
1261 int counted = mincount != 0;
1262
1263 if (data->last_end > 0 && mincount != 0) { /* Ends with a string. */
5d1c421c
JH
1264#if defined(SPARC64_GCC_WORKAROUND)
1265 I32 b = 0;
1266 STRLEN l = 0;
1267 char *s = NULL;
1268 I32 old = 0;
1269
1270 if (pos_before >= data->last_start_min)
1271 b = pos_before;
1272 else
1273 b = data->last_start_min;
1274
1275 l = 0;
1276 s = SvPV(data->last_found, l);
1277 old = b - data->last_start_min;
1278
1279#else
b81d288d 1280 I32 b = pos_before >= data->last_start_min
c277df42
IZ
1281 ? pos_before : data->last_start_min;
1282 STRLEN l;
1283 char *s = SvPV(data->last_found, l);
a0ed51b3 1284 I32 old = b - data->last_start_min;
5d1c421c 1285#endif
a0ed51b3
LW
1286
1287 if (UTF)
1288 old = utf8_hop((U8*)s, old) - (U8*)s;
c277df42 1289
a0ed51b3 1290 l -= old;
c277df42 1291 /* Get the added string: */
79cb57f6 1292 last_str = newSVpvn(s + old, l);
c277df42
IZ
1293 if (deltanext == 0 && pos_before == b) {
1294 /* What was added is a constant string */
1295 if (mincount > 1) {
1296 SvGROW(last_str, (mincount * l) + 1);
b81d288d 1297 repeatcpy(SvPVX(last_str) + l,
c277df42
IZ
1298 SvPVX(last_str), l, mincount - 1);
1299 SvCUR(last_str) *= mincount;
1300 /* Add additional parts. */
b81d288d 1301 SvCUR_set(data->last_found,
c277df42
IZ
1302 SvCUR(data->last_found) - l);
1303 sv_catsv(data->last_found, last_str);
1304 data->last_end += l * (mincount - 1);
1305 }
2a8d9689
HS
1306 } else {
1307 /* start offset must point into the last copy */
1308 data->last_start_min += minnext * (mincount - 1);
4b2cff9a
HS
1309 data->last_start_max += is_inf ? 0 : (maxcount - 1)
1310 * (minnext + data->pos_delta);
c277df42
IZ
1311 }
1312 }
1313 /* It is counted once already... */
1314 data->pos_min += minnext * (mincount - counted);
1315 data->pos_delta += - counted * deltanext +
1316 (minnext + deltanext) * maxcount - minnext * mincount;
1317 if (mincount != maxcount) {
653099ff
GS
1318 /* Cannot extend fixed substrings found inside
1319 the group. */
830247a4 1320 scan_commit(pRExC_state,data);
c277df42
IZ
1321 if (mincount && last_str) {
1322 sv_setsv(data->last_found, last_str);
1323 data->last_end = data->pos_min;
b81d288d 1324 data->last_start_min =
a0ed51b3 1325 data->pos_min - CHR_SVLEN(last_str);
b81d288d
AB
1326 data->last_start_max = is_inf
1327 ? I32_MAX
c277df42 1328 : data->pos_min + data->pos_delta
a0ed51b3 1329 - CHR_SVLEN(last_str);
c277df42
IZ
1330 }
1331 data->longest = &(data->longest_float);
1332 }
aca2d497 1333 SvREFCNT_dec(last_str);
c277df42 1334 }
405ff068 1335 if (data && (fl & SF_HAS_EVAL))
c277df42
IZ
1336 data->flags |= SF_HAS_EVAL;
1337 optimize_curly_tail:
c277df42 1338 if (OP(oscan) != CURLYX) {
22c35a8c 1339 while (PL_regkind[(U8)OP(next = regnext(oscan))] == NOTHING
c277df42
IZ
1340 && NEXT_OFF(next))
1341 NEXT_OFF(oscan) += NEXT_OFF(next);
1342 }
c277df42 1343 continue;
653099ff 1344 default: /* REF and CLUMP only? */
c277df42 1345 if (flags & SCF_DO_SUBSTR) {
830247a4 1346 scan_commit(pRExC_state,data); /* Cannot expect anything... */
c277df42
IZ
1347 data->longest = &(data->longest_float);
1348 }
aca2d497 1349 is_inf = is_inf_internal = 1;
653099ff 1350 if (flags & SCF_DO_STCLASS_OR)
830247a4 1351 cl_anything(pRExC_state, data->start_class);
653099ff 1352 flags &= ~SCF_DO_STCLASS;
c277df42
IZ
1353 break;
1354 }
a0ed51b3 1355 }
ffc61ed2 1356 else if (strchr((char*)PL_simple,OP(scan))) {
9c5ffd7c 1357 int value = 0;
653099ff 1358
c277df42 1359 if (flags & SCF_DO_SUBSTR) {
830247a4 1360 scan_commit(pRExC_state,data);
c277df42
IZ
1361 data->pos_min++;
1362 }
1363 min++;
653099ff
GS
1364 if (flags & SCF_DO_STCLASS) {
1365 data->start_class->flags &= ~ANYOF_EOS; /* No match on empty */
1366
1367 /* Some of the logic below assumes that switching
1368 locale on will only add false positives. */
1369 switch (PL_regkind[(U8)OP(scan)]) {
653099ff 1370 case SANY:
653099ff
GS
1371 default:
1372 do_default:
1373 /* Perl_croak(aTHX_ "panic: unexpected simple REx opcode %d", OP(scan)); */
1374 if (flags & SCF_DO_STCLASS_OR) /* Allow everything */
830247a4 1375 cl_anything(pRExC_state, data->start_class);
653099ff
GS
1376 break;
1377 case REG_ANY:
1378 if (OP(scan) == SANY)
1379 goto do_default;
1380 if (flags & SCF_DO_STCLASS_OR) { /* Everything but \n */
1381 value = (ANYOF_BITMAP_TEST(data->start_class,'\n')
1382 || (data->start_class->flags & ANYOF_CLASS));
830247a4 1383 cl_anything(pRExC_state, data->start_class);
653099ff
GS
1384 }
1385 if (flags & SCF_DO_STCLASS_AND || !value)
1386 ANYOF_BITMAP_CLEAR(data->start_class,'\n');
1387 break;
1388 case ANYOF:
1389 if (flags & SCF_DO_STCLASS_AND)
1390 cl_and(data->start_class,
1391 (struct regnode_charclass_class*)scan);
1392 else
830247a4 1393 cl_or(pRExC_state, data->start_class,
653099ff
GS
1394 (struct regnode_charclass_class*)scan);
1395 break;
1396 case ALNUM:
1397 if (flags & SCF_DO_STCLASS_AND) {
1398 if (!(data->start_class->flags & ANYOF_LOCALE)) {
1399 ANYOF_CLASS_CLEAR(data->start_class,ANYOF_NALNUM);
1400 for (value = 0; value < 256; value++)
1401 if (!isALNUM(value))
1402 ANYOF_BITMAP_CLEAR(data->start_class, value);
1403 }
1404 }
1405 else {
1406 if (data->start_class->flags & ANYOF_LOCALE)
1407 ANYOF_CLASS_SET(data->start_class,ANYOF_ALNUM);
1408 else {
1409 for (value = 0; value < 256; value++)
1410 if (isALNUM(value))
b81d288d 1411 ANYOF_BITMAP_SET(data->start_class, value);
653099ff
GS
1412 }
1413 }
1414 break;
1415 case ALNUML:
1416 if (flags & SCF_DO_STCLASS_AND) {
1417 if (data->start_class->flags & ANYOF_LOCALE)
1418 ANYOF_CLASS_CLEAR(data->start_class,ANYOF_NALNUM);
1419 }
1420 else {
1421 ANYOF_CLASS_SET(data->start_class,ANYOF_ALNUM);
1422 data->start_class->flags |= ANYOF_LOCALE;
1423 }
1424 break;
1425 case NALNUM:
1426 if (flags & SCF_DO_STCLASS_AND) {
1427 if (!(data->start_class->flags & ANYOF_LOCALE)) {
1428 ANYOF_CLASS_CLEAR(data->start_class,ANYOF_ALNUM);
1429 for (value = 0; value < 256; value++)
1430 if (isALNUM(value))
1431 ANYOF_BITMAP_CLEAR(data->start_class, value);
1432 }
1433 }
1434 else {
1435 if (data->start_class->flags & ANYOF_LOCALE)
1436 ANYOF_CLASS_SET(data->start_class,ANYOF_NALNUM);
1437 else {
1438 for (value = 0; value < 256; value++)
1439 if (!isALNUM(value))
b81d288d 1440 ANYOF_BITMAP_SET(data->start_class, value);
653099ff
GS
1441 }
1442 }
1443 break;
1444 case NALNUML:
1445 if (flags & SCF_DO_STCLASS_AND) {
1446 if (data->start_class->flags & ANYOF_LOCALE)
1447 ANYOF_CLASS_CLEAR(data->start_class,ANYOF_ALNUM);
1448 }
1449 else {
1450 data->start_class->flags |= ANYOF_LOCALE;
1451 ANYOF_CLASS_SET(data->start_class,ANYOF_NALNUM);
1452 }
1453 break;
1454 case SPACE:
1455 if (flags & SCF_DO_STCLASS_AND) {
1456 if (!(data->start_class->flags & ANYOF_LOCALE)) {
1457 ANYOF_CLASS_CLEAR(data->start_class,ANYOF_NSPACE);
1458 for (value = 0; value < 256; value++)
1459 if (!isSPACE(value))
1460 ANYOF_BITMAP_CLEAR(data->start_class, value);
1461 }
1462 }
1463 else {
1464 if (data->start_class->flags & ANYOF_LOCALE)
1465 ANYOF_CLASS_SET(data->start_class,ANYOF_SPACE);
1466 else {
1467 for (value = 0; value < 256; value++)
1468 if (isSPACE(value))
b81d288d 1469 ANYOF_BITMAP_SET(data->start_class, value);
653099ff
GS
1470 }
1471 }
1472 break;
1473 case SPACEL:
1474 if (flags & SCF_DO_STCLASS_AND) {
1475 if (data->start_class->flags & ANYOF_LOCALE)
1476 ANYOF_CLASS_CLEAR(data->start_class,ANYOF_NSPACE);
1477 }
1478 else {
1479 data->start_class->flags |= ANYOF_LOCALE;
1480 ANYOF_CLASS_SET(data->start_class,ANYOF_SPACE);
1481 }
1482 break;
1483 case NSPACE:
1484 if (flags & SCF_DO_STCLASS_AND) {
1485 if (!(data->start_class->flags & ANYOF_LOCALE)) {
1486 ANYOF_CLASS_CLEAR(data->start_class,ANYOF_SPACE);
1487 for (value = 0; value < 256; value++)
1488 if (isSPACE(value))
1489 ANYOF_BITMAP_CLEAR(data->start_class, value);
1490 }
1491 }
1492 else {
1493 if (data->start_class->flags & ANYOF_LOCALE)
1494 ANYOF_CLASS_SET(data->start_class,ANYOF_NSPACE);
1495 else {
1496 for (value = 0; value < 256; value++)
1497 if (!isSPACE(value))
b81d288d 1498 ANYOF_BITMAP_SET(data->start_class, value);
653099ff
GS
1499 }
1500 }
1501 break;
1502 case NSPACEL:
1503 if (flags & SCF_DO_STCLASS_AND) {
1504 if (data->start_class->flags & ANYOF_LOCALE) {
1505 ANYOF_CLASS_CLEAR(data->start_class,ANYOF_SPACE);
1506 for (value = 0; value < 256; value++)
1507 if (!isSPACE(value))
1508 ANYOF_BITMAP_CLEAR(data->start_class, value);
1509 }
1510 }
1511 else {
1512 data->start_class->flags |= ANYOF_LOCALE;
1513 ANYOF_CLASS_SET(data->start_class,ANYOF_NSPACE);
1514 }
1515 break;
1516 case DIGIT:
1517 if (flags & SCF_DO_STCLASS_AND) {
1518 ANYOF_CLASS_CLEAR(data->start_class,ANYOF_NDIGIT);
1519 for (value = 0; value < 256; value++)
1520 if (!isDIGIT(value))
1521 ANYOF_BITMAP_CLEAR(data->start_class, value);
1522 }
1523 else {
1524 if (data->start_class->flags & ANYOF_LOCALE)
1525 ANYOF_CLASS_SET(data->start_class,ANYOF_DIGIT);
1526 else {
1527 for (value = 0; value < 256; value++)
1528 if (isDIGIT(value))
b81d288d 1529 ANYOF_BITMAP_SET(data->start_class, value);
653099ff
GS
1530 }
1531 }
1532 break;
1533 case NDIGIT:
1534 if (flags & SCF_DO_STCLASS_AND) {
1535 ANYOF_CLASS_CLEAR(data->start_class,ANYOF_DIGIT);
1536 for (value = 0; value < 256; value++)
1537 if (isDIGIT(value))
1538 ANYOF_BITMAP_CLEAR(data->start_class, value);
1539 }
1540 else {
1541 if (data->start_class->flags & ANYOF_LOCALE)
1542 ANYOF_CLASS_SET(data->start_class,ANYOF_NDIGIT);
1543 else {
1544 for (value = 0; value < 256; value++)
1545 if (!isDIGIT(value))
b81d288d 1546 ANYOF_BITMAP_SET(data->start_class, value);
653099ff
GS
1547 }
1548 }
1549 break;
1550 }
1551 if (flags & SCF_DO_STCLASS_OR)
1552 cl_and(data->start_class, &and_with);
1553 flags &= ~SCF_DO_STCLASS;
1554 }
a0ed51b3 1555 }
22c35a8c 1556 else if (PL_regkind[(U8)OP(scan)] == EOL && flags & SCF_DO_SUBSTR) {
c277df42
IZ
1557 data->flags |= (OP(scan) == MEOL
1558 ? SF_BEFORE_MEOL
1559 : SF_BEFORE_SEOL);
a0ed51b3 1560 }
653099ff
GS
1561 else if ( PL_regkind[(U8)OP(scan)] == BRANCHJ
1562 /* Lookbehind, or need to calculate parens/evals/stclass: */
1563 && (scan->flags || data || (flags & SCF_DO_STCLASS))
c277df42 1564 && (OP(scan) == IFMATCH || OP(scan) == UNLESSM)) {
653099ff 1565 /* Lookahead/lookbehind */
cb434fcc 1566 I32 deltanext, minnext, fake = 0;
c277df42 1567 regnode *nscan;
653099ff
GS
1568 struct regnode_charclass_class intrnl;
1569 int f = 0;
c277df42
IZ
1570
1571 data_fake.flags = 0;
b81d288d 1572 if (data) {
2c2d71f5 1573 data_fake.whilem_c = data->whilem_c;
cb434fcc
IZ
1574 data_fake.last_closep = data->last_closep;
1575 }
1576 else
1577 data_fake.last_closep = &fake;
653099ff
GS
1578 if ( flags & SCF_DO_STCLASS && !scan->flags
1579 && OP(scan) == IFMATCH ) { /* Lookahead */
830247a4 1580 cl_init(pRExC_state, &intrnl);
653099ff 1581 data_fake.start_class = &intrnl;
e1901655 1582 f |= SCF_DO_STCLASS_AND;
653099ff 1583 }
e1901655
IZ
1584 if (flags & SCF_WHILEM_VISITED_POS)
1585 f |= SCF_WHILEM_VISITED_POS;
c277df42
IZ
1586 next = regnext(scan);
1587 nscan = NEXTOPER(NEXTOPER(scan));
830247a4 1588 minnext = study_chunk(pRExC_state, &nscan, &deltanext, last, &data_fake, f);
c277df42
IZ
1589 if (scan->flags) {
1590 if (deltanext) {
9baa0206 1591 vFAIL("Variable length lookbehind not implemented");
a0ed51b3
LW
1592 }
1593 else if (minnext > U8_MAX) {
9baa0206 1594 vFAIL2("Lookbehind longer than %"UVuf" not implemented", (UV)U8_MAX);
c277df42
IZ
1595 }
1596 scan->flags = minnext;
1597 }
1598 if (data && data_fake.flags & (SF_HAS_PAR|SF_IN_PAR))
1599 pars++;
405ff068 1600 if (data && (data_fake.flags & SF_HAS_EVAL))
c277df42 1601 data->flags |= SF_HAS_EVAL;
2c2d71f5
JH
1602 if (data)
1603 data->whilem_c = data_fake.whilem_c;
e1901655 1604 if (f & SCF_DO_STCLASS_AND) {
653099ff
GS
1605 int was = (data->start_class->flags & ANYOF_EOS);
1606
1607 cl_and(data->start_class, &intrnl);
1608 if (was)
1609 data->start_class->flags |= ANYOF_EOS;
1610 }
a0ed51b3
LW
1611 }
1612 else if (OP(scan) == OPEN) {
c277df42 1613 pars++;
a0ed51b3 1614 }
cb434fcc
IZ
1615 else if (OP(scan) == CLOSE) {
1616 if (ARG(scan) == is_par) {
1617 next = regnext(scan);
c277df42 1618
cb434fcc
IZ
1619 if ( next && (OP(next) != WHILEM) && next < last)
1620 is_par = 0; /* Disable optimization */
1621 }
1622 if (data)
1623 *(data->last_closep) = ARG(scan);
a0ed51b3
LW
1624 }
1625 else if (OP(scan) == EVAL) {
c277df42
IZ
1626 if (data)
1627 data->flags |= SF_HAS_EVAL;
1628 }
96776eda 1629 else if (OP(scan) == LOGICAL && scan->flags == 2) { /* Embedded follows */
0f5d15d6 1630 if (flags & SCF_DO_SUBSTR) {
830247a4 1631 scan_commit(pRExC_state,data);
0f5d15d6
IZ
1632 data->longest = &(data->longest_float);
1633 }
1634 is_inf = is_inf_internal = 1;
653099ff 1635 if (flags & SCF_DO_STCLASS_OR) /* Allow everything */
830247a4 1636 cl_anything(pRExC_state, data->start_class);
96776eda 1637 flags &= ~SCF_DO_STCLASS;
0f5d15d6 1638 }
c277df42
IZ
1639 /* Else: zero-length, ignore. */
1640 scan = regnext(scan);
1641 }
1642
1643 finish:
1644 *scanp = scan;
aca2d497 1645 *deltap = is_inf_internal ? I32_MAX : delta;
b81d288d 1646 if (flags & SCF_DO_SUBSTR && is_inf)
c277df42
IZ
1647 data->pos_delta = I32_MAX - data->pos_min;
1648 if (is_par > U8_MAX)
1649 is_par = 0;
1650 if (is_par && pars==1 && data) {
1651 data->flags |= SF_IN_PAR;
1652 data->flags &= ~SF_HAS_PAR;
a0ed51b3
LW
1653 }
1654 else if (pars && data) {
c277df42
IZ
1655 data->flags |= SF_HAS_PAR;
1656 data->flags &= ~SF_IN_PAR;
1657 }
653099ff
GS
1658 if (flags & SCF_DO_STCLASS_OR)
1659 cl_and(data->start_class, &and_with);
c277df42
IZ
1660 return min;
1661}
1662
76e3520e 1663STATIC I32
830247a4 1664S_add_data(pTHX_ RExC_state_t *pRExC_state, I32 n, char *s)
c277df42 1665{
830247a4 1666 if (RExC_rx->data) {
b81d288d
AB
1667 Renewc(RExC_rx->data,
1668 sizeof(*RExC_rx->data) + sizeof(void*) * (RExC_rx->data->count + n - 1),
c277df42 1669 char, struct reg_data);
830247a4
IZ
1670 Renew(RExC_rx->data->what, RExC_rx->data->count + n, U8);
1671 RExC_rx->data->count += n;
a0ed51b3
LW
1672 }
1673 else {
830247a4 1674 Newc(1207, RExC_rx->data, sizeof(*RExC_rx->data) + sizeof(void*) * (n - 1),
c277df42 1675 char, struct reg_data);
830247a4
IZ
1676 New(1208, RExC_rx->data->what, n, U8);
1677 RExC_rx->data->count = n;
c277df42 1678 }
830247a4
IZ
1679 Copy(s, RExC_rx->data->what + RExC_rx->data->count - n, n, U8);
1680 return RExC_rx->data->count - n;
c277df42
IZ
1681}
1682
d88dccdf 1683void
864dbfa3 1684Perl_reginitcolors(pTHX)
d88dccdf 1685{
d88dccdf
IZ
1686 int i = 0;
1687 char *s = PerlEnv_getenv("PERL_RE_COLORS");
b81d288d 1688
d88dccdf
IZ
1689 if (s) {
1690 PL_colors[0] = s = savepv(s);
1691 while (++i < 6) {
1692 s = strchr(s, '\t');
1693 if (s) {
1694 *s = '\0';
1695 PL_colors[i] = ++s;
1696 }
1697 else
c712d376 1698 PL_colors[i] = s = "";
d88dccdf
IZ
1699 }
1700 } else {
b81d288d 1701 while (i < 6)
d88dccdf
IZ
1702 PL_colors[i++] = "";
1703 }
1704 PL_colorset = 1;
1705}
1706
8615cb43 1707
a687059c 1708/*
e50aee73 1709 - pregcomp - compile a regular expression into internal code
a687059c
LW
1710 *
1711 * We can't allocate space until we know how big the compiled form will be,
1712 * but we can't compile it (and thus know how big it is) until we've got a
1713 * place to put the code. So we cheat: we compile it twice, once with code
1714 * generation turned off and size counting turned on, and once "for real".
1715 * This also means that we don't allocate space until we are sure that the
1716 * thing really will compile successfully, and we never have to move the
1717 * code and thus invalidate pointers into it. (Note that it has to be in
1718 * one piece because free() must be able to free it all.) [NB: not true in perl]
1719 *
1720 * Beware that the optimization-preparation code in here knows about some
1721 * of the structure of the compiled regexp. [I'll say.]
1722 */
1723regexp *
864dbfa3 1724Perl_pregcomp(pTHX_ char *exp, char *xend, PMOP *pm)
a687059c 1725{
a0d0e21e 1726 register regexp *r;
c277df42 1727 regnode *scan;
c277df42 1728 regnode *first;
a0d0e21e 1729 I32 flags;
a0d0e21e
LW
1730 I32 minlen = 0;
1731 I32 sawplus = 0;
1732 I32 sawopen = 0;
2c2d71f5 1733 scan_data_t data;
830247a4
IZ
1734 RExC_state_t RExC_state;
1735 RExC_state_t *pRExC_state = &RExC_state;
a0d0e21e
LW
1736
1737 if (exp == NULL)
c277df42 1738 FAIL("NULL regexp argument");
a0d0e21e 1739
a5961de5 1740 RExC_utf8 = pm->op_pmdynflags & PMdf_CMP_UTF8;
a0ed51b3 1741
5cfc7842 1742 RExC_precomp = exp;
a5961de5
JH
1743 DEBUG_r({
1744 if (!PL_colorset) reginitcolors();
1745 PerlIO_printf(Perl_debug_log, "%sCompiling REx%s `%s%*s%s'\n",
1746 PL_colors[4],PL_colors[5],PL_colors[0],
1747 (int)(xend - exp), RExC_precomp, PL_colors[1]);
1748 });
830247a4
IZ
1749 RExC_flags16 = pm->op_pmflags;
1750 RExC_sawback = 0;
bbce6d69 1751
830247a4
IZ
1752 RExC_seen = 0;
1753 RExC_seen_zerolen = *exp == '^' ? -1 : 0;
1754 RExC_seen_evals = 0;
1755 RExC_extralen = 0;
c277df42 1756
bbce6d69 1757 /* First pass: determine size, legality. */
830247a4 1758 RExC_parse = exp;
fac92740 1759 RExC_start = exp;
830247a4
IZ
1760 RExC_end = xend;
1761 RExC_naughty = 0;
1762 RExC_npar = 1;
1763 RExC_size = 0L;
1764 RExC_emit = &PL_regdummy;
1765 RExC_whilem_seen = 0;
85ddcde9
JH
1766#if 0 /* REGC() is (currently) a NOP at the first pass.
1767 * Clever compilers notice this and complain. --jhi */
830247a4 1768 REGC((U8)REG_MAGIC, (char*)RExC_emit);
85ddcde9 1769#endif
830247a4 1770 if (reg(pRExC_state, 0, &flags) == NULL) {
830247a4 1771 RExC_precomp = Nullch;
a0d0e21e
LW
1772 return(NULL);
1773 }
830247a4 1774 DEBUG_r(PerlIO_printf(Perl_debug_log, "size %"IVdf" ", (IV)RExC_size));
c277df42 1775
c277df42
IZ
1776 /* Small enough for pointer-storage convention?
1777 If extralen==0, this means that we will not need long jumps. */
830247a4
IZ
1778 if (RExC_size >= 0x10000L && RExC_extralen)
1779 RExC_size += RExC_extralen;
c277df42 1780 else
830247a4
IZ
1781 RExC_extralen = 0;
1782 if (RExC_whilem_seen > 15)
1783 RExC_whilem_seen = 15;
a0d0e21e 1784
bbce6d69 1785 /* Allocate space and initialize. */
830247a4 1786 Newc(1001, r, sizeof(regexp) + (unsigned)RExC_size * sizeof(regnode),
c277df42 1787 char, regexp);
a0d0e21e 1788 if (r == NULL)
b45f050a
JF
1789 FAIL("Regexp out of space");
1790
0f79a09d
GS
1791#ifdef DEBUGGING
1792 /* avoid reading uninitialized memory in DEBUGGING code in study_chunk() */
830247a4 1793 Zero(r, sizeof(regexp) + (unsigned)RExC_size * sizeof(regnode), char);
0f79a09d 1794#endif
c277df42 1795 r->refcnt = 1;
bbce6d69 1796 r->prelen = xend - exp;
5cfc7842 1797 r->precomp = savepvn(RExC_precomp, r->prelen);
cf93c79d
IZ
1798 r->subbeg = NULL;
1799 r->reganch = pm->op_pmflags & PMf_COMPILETIME;
830247a4 1800 r->nparens = RExC_npar - 1; /* set early to validate backrefs */
4327152a
IZ
1801
1802 r->substrs = 0; /* Useful during FAIL. */
1803 r->startp = 0; /* Useful during FAIL. */
1804 r->endp = 0; /* Useful during FAIL. */
1805
fac92740
MJD
1806 Newz(1304, r->offsets, 2*RExC_size+1, U32); /* MJD 20001228 */
1807 if (r->offsets) {
1808 r->offsets[0] = RExC_size;
1809 }
1810 DEBUG_r(PerlIO_printf(Perl_debug_log,
392fbf5d 1811 "%s %"UVuf" bytes for offset annotations.\n",
fac92740 1812 r->offsets ? "Got" : "Couldn't get",
392fbf5d 1813 (UV)((2*RExC_size+1) * sizeof(U32))));
fac92740 1814
830247a4 1815 RExC_rx = r;
bbce6d69 1816
1817 /* Second pass: emit code. */
055bb491 1818 RExC_flags16 = pm->op_pmflags; /* don't let top level (?i) bleed */
830247a4
IZ
1819 RExC_parse = exp;
1820 RExC_end = xend;
1821 RExC_naughty = 0;
1822 RExC_npar = 1;
fac92740 1823 RExC_emit_start = r->program;
830247a4 1824 RExC_emit = r->program;
2cd61cdb 1825 /* Store the count of eval-groups for security checks: */
830247a4
IZ
1826 RExC_emit->next_off = ((RExC_seen_evals > U16_MAX) ? U16_MAX : RExC_seen_evals);
1827 REGC((U8)REG_MAGIC, (char*) RExC_emit++);
c277df42 1828 r->data = 0;
830247a4 1829 if (reg(pRExC_state, 0, &flags) == NULL)
a0d0e21e
LW
1830 return(NULL);
1831
1832 /* Dig out information for optimizations. */
cf93c79d 1833 r->reganch = pm->op_pmflags & PMf_COMPILETIME; /* Again? */
830247a4 1834 pm->op_pmflags = RExC_flags16;
a0ed51b3 1835 if (UTF)
5ff6fc6d 1836 r->reganch |= ROPT_UTF8; /* Unicode in it? */
c277df42 1837 r->regstclass = NULL;
830247a4 1838 if (RExC_naughty >= 10) /* Probably an expensive pattern. */
a0ed51b3 1839 r->reganch |= ROPT_NAUGHTY;
c277df42 1840 scan = r->program + 1; /* First BRANCH. */
2779dcf1
IZ
1841
1842 /* XXXX To minimize changes to RE engine we always allocate
1843 3-units-long substrs field. */
1844 Newz(1004, r->substrs, 1, struct reg_substr_data);
1845
2c2d71f5 1846 StructCopy(&zero_scan_data, &data, scan_data_t);
653099ff 1847 /* XXXX Should not we check for something else? Usually it is OPEN1... */
c277df42 1848 if (OP(scan) != BRANCH) { /* Only one top-level choice. */
c277df42 1849 I32 fake;
c5254dd6 1850 STRLEN longest_float_length, longest_fixed_length;
653099ff
GS
1851 struct regnode_charclass_class ch_class;
1852 int stclass_flag;
cb434fcc 1853 I32 last_close = 0;
a0d0e21e
LW
1854
1855 first = scan;
c277df42 1856 /* Skip introductions and multiplicators >= 1. */
a0d0e21e 1857 while ((OP(first) == OPEN && (sawopen = 1)) ||
653099ff 1858 /* An OR of *one* alternative - should not happen now. */
a0d0e21e
LW
1859 (OP(first) == BRANCH && OP(regnext(first)) != BRANCH) ||
1860 (OP(first) == PLUS) ||
1861 (OP(first) == MINMOD) ||
653099ff 1862 /* An {n,m} with n>0 */
22c35a8c 1863 (PL_regkind[(U8)OP(first)] == CURLY && ARG1(first) > 0) ) {
a0d0e21e
LW
1864 if (OP(first) == PLUS)
1865 sawplus = 1;
1866 else
1867 first += regarglen[(U8)OP(first)];
1868 first = NEXTOPER(first);
a687059c
LW
1869 }
1870
a0d0e21e
LW
1871 /* Starting-point info. */
1872 again:
653099ff 1873 if (PL_regkind[(U8)OP(first)] == EXACT) {
1aa99e6b
IH
1874 if (OP(first) == EXACT)
1875 ; /* Empty, get anchored substr later. */
1876 else if ((OP(first) == EXACTF || OP(first) == EXACTFL))
b3c9acc1
IZ
1877 r->regstclass = first;
1878 }
653099ff 1879 else if (strchr((char*)PL_simple,OP(first)))
a0d0e21e 1880 r->regstclass = first;
22c35a8c
GS
1881 else if (PL_regkind[(U8)OP(first)] == BOUND ||
1882 PL_regkind[(U8)OP(first)] == NBOUND)
a0d0e21e 1883 r->regstclass = first;
22c35a8c 1884 else if (PL_regkind[(U8)OP(first)] == BOL) {
cad2e5aa
JH
1885 r->reganch |= (OP(first) == MBOL
1886 ? ROPT_ANCH_MBOL
1887 : (OP(first) == SBOL
1888 ? ROPT_ANCH_SBOL
1889 : ROPT_ANCH_BOL));
a0d0e21e 1890 first = NEXTOPER(first);
774d564b 1891 goto again;
1892 }
1893 else if (OP(first) == GPOS) {
1894 r->reganch |= ROPT_ANCH_GPOS;
1895 first = NEXTOPER(first);
1896 goto again;
a0d0e21e 1897 }
e09294f4 1898 else if (!sawopen && (OP(first) == STAR &&
22c35a8c 1899 PL_regkind[(U8)OP(NEXTOPER(first))] == REG_ANY) &&
a0d0e21e
LW
1900 !(r->reganch & ROPT_ANCH) )
1901 {
1902 /* turn .* into ^.* with an implied $*=1 */
cad2e5aa
JH
1903 int type = OP(NEXTOPER(first));
1904
ffc61ed2 1905 if (type == REG_ANY)
cad2e5aa
JH
1906 type = ROPT_ANCH_MBOL;
1907 else
1908 type = ROPT_ANCH_SBOL;
1909
1910 r->reganch |= type | ROPT_IMPLICIT;
a0d0e21e 1911 first = NEXTOPER(first);
774d564b 1912 goto again;
a0d0e21e 1913 }
b81d288d 1914 if (sawplus && (!sawopen || !RExC_sawback)
830247a4 1915 && !(RExC_seen & REG_SEEN_EVAL)) /* May examine pos and $& */
cad2e5aa
JH
1916 /* x+ must match at the 1st pos of run of x's */
1917 r->reganch |= ROPT_SKIP;
a0d0e21e 1918
c277df42 1919 /* Scan is after the zeroth branch, first is atomic matcher. */
b81d288d 1920 DEBUG_r(PerlIO_printf(Perl_debug_log, "first at %"IVdf"\n",
b900a521 1921 (IV)(first - scan + 1)));
a0d0e21e
LW
1922 /*
1923 * If there's something expensive in the r.e., find the
1924 * longest literal string that must appear and make it the
1925 * regmust. Resolve ties in favor of later strings, since
1926 * the regstart check works with the beginning of the r.e.
1927 * and avoiding duplication strengthens checking. Not a
1928 * strong reason, but sufficient in the absence of others.
1929 * [Now we resolve ties in favor of the earlier string if
c277df42 1930 * it happens that c_offset_min has been invalidated, since the
a0d0e21e
LW
1931 * earlier string may buy us something the later one won't.]
1932 */
a0d0e21e 1933 minlen = 0;
a687059c 1934
79cb57f6
GS
1935 data.longest_fixed = newSVpvn("",0);
1936 data.longest_float = newSVpvn("",0);
1937 data.last_found = newSVpvn("",0);
c277df42
IZ
1938 data.longest = &(data.longest_fixed);
1939 first = scan;
653099ff 1940 if (!r->regstclass) {
830247a4 1941 cl_init(pRExC_state, &ch_class);
653099ff
GS
1942 data.start_class = &ch_class;
1943 stclass_flag = SCF_DO_STCLASS_AND;
1944 } else /* XXXX Check for BOUND? */
1945 stclass_flag = 0;
cb434fcc 1946 data.last_closep = &last_close;
653099ff 1947
830247a4 1948 minlen = study_chunk(pRExC_state, &first, &fake, scan + RExC_size, /* Up to end */
e1901655 1949 &data, SCF_DO_SUBSTR | SCF_WHILEM_VISITED_POS | stclass_flag);
830247a4 1950 if ( RExC_npar == 1 && data.longest == &(data.longest_fixed)
b81d288d 1951 && data.last_start_min == 0 && data.last_end > 0
830247a4
IZ
1952 && !RExC_seen_zerolen
1953 && (!(RExC_seen & REG_SEEN_GPOS) || (r->reganch & ROPT_ANCH_GPOS)))
c277df42 1954 r->reganch |= ROPT_CHECK_ALL;
830247a4 1955 scan_commit(pRExC_state, &data);
c277df42
IZ
1956 SvREFCNT_dec(data.last_found);
1957
a0ed51b3 1958 longest_float_length = CHR_SVLEN(data.longest_float);
c5254dd6 1959 if (longest_float_length
c277df42
IZ
1960 || (data.flags & SF_FL_BEFORE_EOL
1961 && (!(data.flags & SF_FL_BEFORE_MEOL)
830247a4 1962 || (RExC_flags16 & PMf_MULTILINE)))) {
cf93c79d
IZ
1963 int t;
1964
a0ed51b3 1965 if (SvCUR(data.longest_fixed) /* ok to leave SvCUR */
aca2d497
IZ
1966 && data.offset_fixed == data.offset_float_min
1967 && SvCUR(data.longest_fixed) == SvCUR(data.longest_float))
1968 goto remove_float; /* As in (a)+. */
1969
33b8afdf
JH
1970 if (SvUTF8(data.longest_float)) {
1971 r->float_utf8 = data.longest_float;
1972 r->float_substr = Nullsv;
1973 } else {
1974 r->float_substr = data.longest_float;
1975 r->float_utf8 = Nullsv;
1976 }
c277df42
IZ
1977 r->float_min_offset = data.offset_float_min;
1978 r->float_max_offset = data.offset_float_max;
cf93c79d
IZ
1979 t = (data.flags & SF_FL_BEFORE_EOL /* Can't have SEOL and MULTI */
1980 && (!(data.flags & SF_FL_BEFORE_MEOL)
830247a4 1981 || (RExC_flags16 & PMf_MULTILINE)));
33b8afdf 1982 fbm_compile(data.longest_float, t ? FBMcf_TAIL : 0);
a0ed51b3
LW
1983 }
1984 else {
aca2d497 1985 remove_float:
33b8afdf 1986 r->float_substr = r->float_utf8 = Nullsv;
c277df42 1987 SvREFCNT_dec(data.longest_float);
c5254dd6 1988 longest_float_length = 0;
a0d0e21e 1989 }
c277df42 1990
a0ed51b3 1991 longest_fixed_length = CHR_SVLEN(data.longest_fixed);
c5254dd6 1992 if (longest_fixed_length
c277df42
IZ
1993 || (data.flags & SF_FIX_BEFORE_EOL /* Cannot have SEOL and MULTI */
1994 && (!(data.flags & SF_FIX_BEFORE_MEOL)
830247a4 1995 || (RExC_flags16 & PMf_MULTILINE)))) {
cf93c79d
IZ
1996 int t;
1997
33b8afdf
JH
1998 if (SvUTF8(data.longest_fixed)) {
1999 r->anchored_utf8 = data.longest_fixed;
2000 r->anchored_substr = Nullsv;
2001 } else {
2002 r->anchored_substr = data.longest_fixed;
2003 r->anchored_utf8 = Nullsv;
2004 }
c277df42 2005 r->anchored_offset = data.offset_fixed;
cf93c79d
IZ
2006 t = (data.flags & SF_FIX_BEFORE_EOL /* Can't have SEOL and MULTI */
2007 && (!(data.flags & SF_FIX_BEFORE_MEOL)
830247a4 2008 || (RExC_flags16 & PMf_MULTILINE)));
33b8afdf 2009 fbm_compile(data.longest_fixed, t ? FBMcf_TAIL : 0);
a0ed51b3
LW
2010 }
2011 else {
33b8afdf 2012 r->anchored_substr = r->anchored_utf8 = Nullsv;
c277df42 2013 SvREFCNT_dec(data.longest_fixed);
c5254dd6 2014 longest_fixed_length = 0;
a0d0e21e 2015 }
b81d288d 2016 if (r->regstclass
ffc61ed2 2017 && (OP(r->regstclass) == REG_ANY || OP(r->regstclass) == SANY))
653099ff 2018 r->regstclass = NULL;
33b8afdf
JH
2019 if ((!(r->anchored_substr || r->anchored_utf8) || r->anchored_offset)
2020 && stclass_flag
653099ff
GS
2021 && !(data.start_class->flags & ANYOF_EOS)
2022 && !cl_is_anything(data.start_class)) {
830247a4 2023 I32 n = add_data(pRExC_state, 1, "f");
653099ff 2024
b81d288d 2025 New(1006, RExC_rx->data->data[n], 1,
653099ff
GS
2026 struct regnode_charclass_class);
2027 StructCopy(data.start_class,
830247a4 2028 (struct regnode_charclass_class*)RExC_rx->data->data[n],
653099ff 2029 struct regnode_charclass_class);
830247a4 2030 r->regstclass = (regnode*)RExC_rx->data->data[n];
653099ff 2031 r->reganch &= ~ROPT_SKIP; /* Used in find_byclass(). */
ffc61ed2 2032 PL_regdata = r->data; /* for regprop() */
9c5ffd7c
JH
2033 DEBUG_r({ SV *sv = sv_newmortal();
2034 regprop(sv, (regnode*)data.start_class);
2035 PerlIO_printf(Perl_debug_log,
2036 "synthetic stclass `%s'.\n",
2037 SvPVX(sv));});
653099ff 2038 }
c277df42
IZ
2039
2040 /* A temporary algorithm prefers floated substr to fixed one to dig more info. */
c5254dd6 2041 if (longest_fixed_length > longest_float_length) {
c277df42 2042 r->check_substr = r->anchored_substr;
33b8afdf 2043 r->check_utf8 = r->anchored_utf8;
c277df42
IZ
2044 r->check_offset_min = r->check_offset_max = r->anchored_offset;
2045 if (r->reganch & ROPT_ANCH_SINGLE)
2046 r->reganch |= ROPT_NOSCAN;
a0ed51b3
LW
2047 }
2048 else {
c277df42 2049 r->check_substr = r->float_substr;
33b8afdf 2050 r->check_utf8 = r->float_utf8;
c277df42
IZ
2051 r->check_offset_min = data.offset_float_min;
2052 r->check_offset_max = data.offset_float_max;
a0d0e21e 2053 }
30382c73
IZ
2054 /* XXXX Currently intuiting is not compatible with ANCH_GPOS.
2055 This should be changed ASAP! */
33b8afdf 2056 if ((r->check_substr || r->check_utf8) && !(r->reganch & ROPT_ANCH_GPOS)) {
cad2e5aa 2057 r->reganch |= RE_USE_INTUIT;
33b8afdf 2058 if (SvTAIL(r->check_substr ? r->check_substr : r->check_utf8))
cad2e5aa
JH
2059 r->reganch |= RE_INTUIT_TAIL;
2060 }
a0ed51b3
LW
2061 }
2062 else {
c277df42
IZ
2063 /* Several toplevels. Best we can is to set minlen. */
2064 I32 fake;
653099ff 2065 struct regnode_charclass_class ch_class;
cb434fcc 2066 I32 last_close = 0;
c277df42
IZ
2067
2068 DEBUG_r(PerlIO_printf(Perl_debug_log, "\n"));
2069 scan = r->program + 1;
830247a4 2070 cl_init(pRExC_state, &ch_class);
653099ff 2071 data.start_class = &ch_class;
cb434fcc 2072 data.last_closep = &last_close;
e1901655 2073 minlen = study_chunk(pRExC_state, &scan, &fake, scan + RExC_size, &data, SCF_DO_STCLASS_AND|SCF_WHILEM_VISITED_POS);
33b8afdf
JH
2074 r->check_substr = r->check_utf8 = r->anchored_substr = r->anchored_utf8
2075 = r->float_substr = r->float_utf8 = Nullsv;
653099ff
GS
2076 if (!(data.start_class->flags & ANYOF_EOS)
2077 && !cl_is_anything(data.start_class)) {
830247a4 2078 I32 n = add_data(pRExC_state, 1, "f");
653099ff 2079
b81d288d 2080 New(1006, RExC_rx->data->data[n], 1,
653099ff
GS
2081 struct regnode_charclass_class);
2082 StructCopy(data.start_class,
830247a4 2083 (struct regnode_charclass_class*)RExC_rx->data->data[n],
653099ff 2084 struct regnode_charclass_class);
830247a4 2085 r->regstclass = (regnode*)RExC_rx->data->data[n];
653099ff 2086 r->reganch &= ~ROPT_SKIP; /* Used in find_byclass(). */
9c5ffd7c
JH
2087 DEBUG_r({ SV* sv = sv_newmortal();
2088 regprop(sv, (regnode*)data.start_class);
2089 PerlIO_printf(Perl_debug_log,
2090 "synthetic stclass `%s'.\n",
2091 SvPVX(sv));});
653099ff 2092 }
a0d0e21e
LW
2093 }
2094
a0d0e21e 2095 r->minlen = minlen;
b81d288d 2096 if (RExC_seen & REG_SEEN_GPOS)
c277df42 2097 r->reganch |= ROPT_GPOS_SEEN;
830247a4 2098 if (RExC_seen & REG_SEEN_LOOKBEHIND)
c277df42 2099 r->reganch |= ROPT_LOOKBEHIND_SEEN;
830247a4 2100 if (RExC_seen & REG_SEEN_EVAL)
ce862d02 2101 r->reganch |= ROPT_EVAL_SEEN;
f33976b4
DB
2102 if (RExC_seen & REG_SEEN_CANY)
2103 r->reganch |= ROPT_CANY_SEEN;
830247a4
IZ
2104 Newz(1002, r->startp, RExC_npar, I32);
2105 Newz(1002, r->endp, RExC_npar, I32);
ffc61ed2 2106 PL_regdata = r->data; /* for regprop() */
a0d0e21e
LW
2107 DEBUG_r(regdump(r));
2108 return(r);
a687059c
LW
2109}
2110
2111/*
2112 - reg - regular expression, i.e. main body or parenthesized thing
2113 *
2114 * Caller must absorb opening parenthesis.
2115 *
2116 * Combining parenthesis handling with the base level of regular expression
2117 * is a trifle forced, but the need to tie the tails of the branches to what
2118 * follows makes it hard to avoid.
2119 */
76e3520e 2120STATIC regnode *
830247a4 2121S_reg(pTHX_ RExC_state_t *pRExC_state, I32 paren, I32 *flagp)
c277df42 2122 /* paren: Parenthesized? 0=top, 1=(, inside: changed to letter. */
a687059c 2123{
c277df42
IZ
2124 register regnode *ret; /* Will be the head of the group. */
2125 register regnode *br;
2126 register regnode *lastbr;
2127 register regnode *ender = 0;
a0d0e21e 2128 register I32 parno = 0;
830247a4 2129 I32 flags, oregflags = RExC_flags16, have_branch = 0, open = 0;
9d1d55b5
JP
2130
2131 /* for (?g), (?gc), and (?o) warnings; warning
2132 about (?c) will warn about (?g) -- japhy */
2133
2134 I32 wastedflags = 0x00,
2135 wasted_o = 0x01,
2136 wasted_g = 0x02,
2137 wasted_gc = 0x02 | 0x04,
2138 wasted_c = 0x04;
2139
fac92740 2140 char * parse_start = RExC_parse; /* MJD */
830247a4 2141 char *oregcomp_parse = RExC_parse;
c277df42 2142 char c;
a0d0e21e 2143
821b33a5 2144 *flagp = 0; /* Tentatively. */
a0d0e21e 2145
9d1d55b5 2146
a0d0e21e
LW
2147 /* Make an OPEN node, if parenthesized. */
2148 if (paren) {
fac92740 2149 if (*RExC_parse == '?') { /* (?...) */
ca9dfc88
IZ
2150 U16 posflags = 0, negflags = 0;
2151 U16 *flagsp = &posflags;
0f5d15d6 2152 int logical = 0;
830247a4 2153 char *seqstart = RExC_parse;
ca9dfc88 2154
830247a4
IZ
2155 RExC_parse++;
2156 paren = *RExC_parse++;
c277df42 2157 ret = NULL; /* For look-ahead/behind. */
a0d0e21e 2158 switch (paren) {
fac92740 2159 case '<': /* (?<...) */
830247a4 2160 RExC_seen |= REG_SEEN_LOOKBEHIND;
b81d288d 2161 if (*RExC_parse == '!')
c277df42 2162 paren = ',';
b81d288d 2163 if (*RExC_parse != '=' && *RExC_parse != '!')
c277df42 2164 goto unknown;
830247a4 2165 RExC_parse++;
fac92740
MJD
2166 case '=': /* (?=...) */
2167 case '!': /* (?!...) */
830247a4 2168 RExC_seen_zerolen++;
fac92740
MJD
2169 case ':': /* (?:...) */
2170 case '>': /* (?>...) */
a0d0e21e 2171 break;
fac92740
MJD
2172 case '$': /* (?$...) */
2173 case '@': /* (?@...) */
8615cb43 2174 vFAIL2("Sequence (?%c...) not implemented", (int)paren);
a0d0e21e 2175 break;
fac92740 2176 case '#': /* (?#...) */
830247a4
IZ
2177 while (*RExC_parse && *RExC_parse != ')')
2178 RExC_parse++;
2179 if (*RExC_parse != ')')
c277df42 2180 FAIL("Sequence (?#... not terminated");
830247a4 2181 nextchar(pRExC_state);
a0d0e21e
LW
2182 *flagp = TRYAGAIN;
2183 return NULL;
fac92740 2184 case 'p': /* (?p...) */
9014280d 2185 if (SIZE_ONLY && ckWARN2(WARN_DEPRECATED, WARN_REGEXP))
f9373011 2186 vWARNdep(RExC_parse, "(?p{}) is deprecated - use (??{})");
8c8ad484 2187 /* FALL THROUGH*/
fac92740 2188 case '?': /* (??...) */
0f5d15d6 2189 logical = 1;
438a3801
YST
2190 if (*RExC_parse != '{')
2191 goto unknown;
830247a4 2192 paren = *RExC_parse++;
0f5d15d6 2193 /* FALL THROUGH */
fac92740 2194 case '{': /* (?{...}) */
c277df42 2195 {
c277df42
IZ
2196 I32 count = 1, n = 0;
2197 char c;
830247a4 2198 char *s = RExC_parse;
c277df42
IZ
2199 SV *sv;
2200 OP_4tree *sop, *rop;
2201
830247a4
IZ
2202 RExC_seen_zerolen++;
2203 RExC_seen |= REG_SEEN_EVAL;
2204 while (count && (c = *RExC_parse)) {
2205 if (c == '\\' && RExC_parse[1])
2206 RExC_parse++;
b81d288d 2207 else if (c == '{')
c277df42 2208 count++;
b81d288d 2209 else if (c == '}')
c277df42 2210 count--;
830247a4 2211 RExC_parse++;
c277df42 2212 }
830247a4 2213 if (*RExC_parse != ')')
b45f050a 2214 {
b81d288d 2215 RExC_parse = s;
b45f050a
JF
2216 vFAIL("Sequence (?{...}) not terminated or not {}-balanced");
2217 }
c277df42
IZ
2218 if (!SIZE_ONLY) {
2219 AV *av;
b81d288d
AB
2220
2221 if (RExC_parse - 1 - s)
830247a4 2222 sv = newSVpvn(s, RExC_parse - 1 - s);
c277df42 2223 else
79cb57f6 2224 sv = newSVpvn("", 0);
c277df42 2225
569233ed
SB
2226 ENTER;
2227 Perl_save_re_context(aTHX);
c277df42 2228 rop = sv_compile_2op(sv, &sop, "re", &av);
9b978d73
DM
2229 sop->op_private |= OPpREFCOUNTED;
2230 /* re_dup will OpREFCNT_inc */
2231 OpREFCNT_set(sop, 1);
569233ed 2232 LEAVE;
c277df42 2233
830247a4
IZ
2234 n = add_data(pRExC_state, 3, "nop");
2235 RExC_rx->data->data[n] = (void*)rop;
2236 RExC_rx->data->data[n+1] = (void*)sop;
2237 RExC_rx->data->data[n+2] = (void*)av;
c277df42 2238 SvREFCNT_dec(sv);
a0ed51b3 2239 }
e24b16f9 2240 else { /* First pass */
830247a4 2241 if (PL_reginterp_cnt < ++RExC_seen_evals
e24b16f9 2242 && PL_curcop != &PL_compiling)
2cd61cdb
IZ
2243 /* No compiled RE interpolated, has runtime
2244 components ===> unsafe. */
2245 FAIL("Eval-group not allowed at runtime, use re 'eval'");
5b61d3f7 2246 if (PL_tainting && PL_tainted)
cc6b7395 2247 FAIL("Eval-group in insecure regular expression");
c277df42
IZ
2248 }
2249
830247a4 2250 nextchar(pRExC_state);
0f5d15d6 2251 if (logical) {
830247a4 2252 ret = reg_node(pRExC_state, LOGICAL);
0f5d15d6
IZ
2253 if (!SIZE_ONLY)
2254 ret->flags = 2;
830247a4 2255 regtail(pRExC_state, ret, reganode(pRExC_state, EVAL, n));
fac92740 2256 /* deal with the length of this later - MJD */
0f5d15d6
IZ
2257 return ret;
2258 }
830247a4 2259 return reganode(pRExC_state, EVAL, n);
c277df42 2260 }
fac92740 2261 case '(': /* (?(?{...})...) and (?(?=...)...) */
c277df42 2262 {
fac92740 2263 if (RExC_parse[0] == '?') { /* (?(?...)) */
b81d288d
AB
2264 if (RExC_parse[1] == '=' || RExC_parse[1] == '!'
2265 || RExC_parse[1] == '<'
830247a4 2266 || RExC_parse[1] == '{') { /* Lookahead or eval. */
c277df42
IZ
2267 I32 flag;
2268
830247a4 2269 ret = reg_node(pRExC_state, LOGICAL);
0f5d15d6
IZ
2270 if (!SIZE_ONLY)
2271 ret->flags = 1;
830247a4 2272 regtail(pRExC_state, ret, reg(pRExC_state, 1, &flag));
c277df42 2273 goto insert_if;
b81d288d 2274 }
a0ed51b3 2275 }
830247a4 2276 else if (RExC_parse[0] >= '1' && RExC_parse[0] <= '9' ) {
fac92740 2277 /* (?(1)...) */
830247a4 2278 parno = atoi(RExC_parse++);
c277df42 2279
830247a4
IZ
2280 while (isDIGIT(*RExC_parse))
2281 RExC_parse++;
fac92740
MJD
2282 ret = reganode(pRExC_state, GROUPP, parno);
2283
830247a4 2284 if ((c = *nextchar(pRExC_state)) != ')')
b45f050a 2285 vFAIL("Switch condition not recognized");
c277df42 2286 insert_if:
830247a4
IZ
2287 regtail(pRExC_state, ret, reganode(pRExC_state, IFTHEN, 0));
2288 br = regbranch(pRExC_state, &flags, 1);
c277df42 2289 if (br == NULL)
830247a4 2290 br = reganode(pRExC_state, LONGJMP, 0);
c277df42 2291 else
830247a4
IZ
2292 regtail(pRExC_state, br, reganode(pRExC_state, LONGJMP, 0));
2293 c = *nextchar(pRExC_state);
d1b80229
IZ
2294 if (flags&HASWIDTH)
2295 *flagp |= HASWIDTH;
c277df42 2296 if (c == '|') {
830247a4
IZ
2297 lastbr = reganode(pRExC_state, IFTHEN, 0); /* Fake one for optimizer. */
2298 regbranch(pRExC_state, &flags, 1);
2299 regtail(pRExC_state, ret, lastbr);
d1b80229
IZ
2300 if (flags&HASWIDTH)
2301 *flagp |= HASWIDTH;
830247a4 2302 c = *nextchar(pRExC_state);
a0ed51b3
LW
2303 }
2304 else
c277df42
IZ
2305 lastbr = NULL;
2306 if (c != ')')
8615cb43 2307 vFAIL("Switch (?(condition)... contains too many branches");
830247a4
IZ
2308 ender = reg_node(pRExC_state, TAIL);
2309 regtail(pRExC_state, br, ender);
c277df42 2310 if (lastbr) {
830247a4
IZ
2311 regtail(pRExC_state, lastbr, ender);
2312 regtail(pRExC_state, NEXTOPER(NEXTOPER(lastbr)), ender);
a0ed51b3
LW
2313 }
2314 else
830247a4 2315 regtail(pRExC_state, ret, ender);
c277df42 2316 return ret;
a0ed51b3
LW
2317 }
2318 else {
830247a4 2319 vFAIL2("Unknown switch condition (?(%.2s", RExC_parse);
c277df42
IZ
2320 }
2321 }
1b1626e4 2322 case 0:
830247a4 2323 RExC_parse--; /* for vFAIL to print correctly */
8615cb43 2324 vFAIL("Sequence (? incomplete");
1b1626e4 2325 break;
a0d0e21e 2326 default:
830247a4 2327 --RExC_parse;
fac92740 2328 parse_flags: /* (?i) */
830247a4 2329 while (*RExC_parse && strchr("iogcmsx", *RExC_parse)) {
9d1d55b5
JP
2330 /* (?g), (?gc) and (?o) are useless here
2331 and must be globally applied -- japhy */
2332
2333 if (*RExC_parse == 'o' || *RExC_parse == 'g') {
2334 if (SIZE_ONLY && ckWARN(WARN_REGEXP)) {
2335 I32 wflagbit = *RExC_parse == 'o' ? wasted_o : wasted_g;
2336 if (! (wastedflags & wflagbit) ) {
2337 wastedflags |= wflagbit;
2338 vWARN5(
2339 RExC_parse + 1,
2340 "Useless (%s%c) - %suse /%c modifier",
2341 flagsp == &negflags ? "?-" : "?",
2342 *RExC_parse,
2343 flagsp == &negflags ? "don't " : "",
2344 *RExC_parse
2345 );
2346 }
2347 }
2348 }
2349 else if (*RExC_parse == 'c') {
2350 if (SIZE_ONLY && ckWARN(WARN_REGEXP)) {
2351 if (! (wastedflags & wasted_c) ) {
2352 wastedflags |= wasted_gc;
2353 vWARN3(
2354 RExC_parse + 1,
2355 "Useless (%sc) - %suse /gc modifier",
2356 flagsp == &negflags ? "?-" : "?",
2357 flagsp == &negflags ? "don't " : ""
2358 );
2359 }
2360 }
2361 }
2362 else { pmflag(flagsp, *RExC_parse); }
2363
830247a4 2364 ++RExC_parse;
ca9dfc88 2365 }
830247a4 2366 if (*RExC_parse == '-') {
ca9dfc88 2367 flagsp = &negflags;
9d1d55b5 2368 wastedflags = 0; /* reset so (?g-c) warns twice */
830247a4 2369 ++RExC_parse;
ca9dfc88 2370 goto parse_flags;
48c036b1 2371 }
830247a4
IZ
2372 RExC_flags16 |= posflags;
2373 RExC_flags16 &= ~negflags;
2374 if (*RExC_parse == ':') {
2375 RExC_parse++;
ca9dfc88
IZ
2376 paren = ':';
2377 break;
2378 }
c277df42 2379 unknown:
830247a4
IZ
2380 if (*RExC_parse != ')') {
2381 RExC_parse++;
2382 vFAIL3("Sequence (%.*s...) not recognized", RExC_parse-seqstart, seqstart);
b45f050a 2383 }
830247a4 2384 nextchar(pRExC_state);
a0d0e21e
LW
2385 *flagp = TRYAGAIN;
2386 return NULL;
2387 }
2388 }
fac92740 2389 else { /* (...) */
830247a4
IZ
2390 parno = RExC_npar;
2391 RExC_npar++;
2392 ret = reganode(pRExC_state, OPEN, parno);
fac92740
MJD
2393 Set_Node_Length(ret, 1); /* MJD */
2394 Set_Node_Offset(ret, RExC_parse); /* MJD */
c277df42 2395 open = 1;
a0d0e21e 2396 }
a0ed51b3 2397 }
fac92740 2398 else /* ! paren */
a0d0e21e
LW
2399 ret = NULL;
2400
2401 /* Pick up the branches, linking them together. */
fac92740 2402 parse_start = RExC_parse; /* MJD */
830247a4 2403 br = regbranch(pRExC_state, &flags, 1);
fac92740
MJD
2404 /* branch_len = (paren != 0); */
2405
a0d0e21e
LW
2406 if (br == NULL)
2407 return(NULL);
830247a4
IZ
2408 if (*RExC_parse == '|') {
2409 if (!SIZE_ONLY && RExC_extralen) {
2410 reginsert(pRExC_state, BRANCHJ, br);
a0ed51b3 2411 }
fac92740 2412 else { /* MJD */
830247a4 2413 reginsert(pRExC_state, BRANCH, br);
fac92740
MJD
2414 Set_Node_Length(br, paren != 0);
2415 Set_Node_Offset_To_R(br-RExC_emit_start, parse_start-RExC_start);
2416 }
c277df42
IZ
2417 have_branch = 1;
2418 if (SIZE_ONLY)
830247a4 2419 RExC_extralen += 1; /* For BRANCHJ-BRANCH. */
a0ed51b3
LW
2420 }
2421 else if (paren == ':') {
c277df42
IZ
2422 *flagp |= flags&SIMPLE;
2423 }
2424 if (open) { /* Starts with OPEN. */
830247a4 2425 regtail(pRExC_state, ret, br); /* OPEN -> first. */
a0ed51b3
LW
2426 }
2427 else if (paren != '?') /* Not Conditional */
a0d0e21e 2428 ret = br;
32a0ca98 2429 *flagp |= flags & (SPSTART | HASWIDTH);
c277df42 2430 lastbr = br;
830247a4
IZ
2431 while (*RExC_parse == '|') {
2432 if (!SIZE_ONLY && RExC_extralen) {
2433 ender = reganode(pRExC_state, LONGJMP,0);
2434 regtail(pRExC_state, NEXTOPER(NEXTOPER(lastbr)), ender); /* Append to the previous. */
c277df42
IZ
2435 }
2436 if (SIZE_ONLY)
830247a4
IZ
2437 RExC_extralen += 2; /* Account for LONGJMP. */
2438 nextchar(pRExC_state);
2439 br = regbranch(pRExC_state, &flags, 0);
fac92740 2440
a687059c 2441 if (br == NULL)
a0d0e21e 2442 return(NULL);
830247a4 2443 regtail(pRExC_state, lastbr, br); /* BRANCH -> BRANCH. */
c277df42 2444 lastbr = br;
821b33a5
IZ
2445 if (flags&HASWIDTH)
2446 *flagp |= HASWIDTH;
a687059c 2447 *flagp |= flags&SPSTART;
a0d0e21e
LW
2448 }
2449
c277df42
IZ
2450 if (have_branch || paren != ':') {
2451 /* Make a closing node, and hook it on the end. */
2452 switch (paren) {
2453 case ':':
830247a4 2454 ender = reg_node(pRExC_state, TAIL);
c277df42
IZ
2455 break;
2456 case 1:
830247a4 2457 ender = reganode(pRExC_state, CLOSE, parno);
fac92740
MJD
2458 Set_Node_Offset(ender,RExC_parse+1); /* MJD */
2459 Set_Node_Length(ender,1); /* MJD */
c277df42
IZ
2460 break;
2461 case '<':
c277df42
IZ
2462 case ',':
2463 case '=':
2464 case '!':
c277df42 2465 *flagp &= ~HASWIDTH;
821b33a5
IZ
2466 /* FALL THROUGH */
2467 case '>':
830247a4 2468 ender = reg_node(pRExC_state, SUCCEED);
c277df42
IZ
2469 break;
2470 case 0:
830247a4 2471 ender = reg_node(pRExC_state, END);
c277df42
IZ
2472 break;
2473 }
830247a4 2474 regtail(pRExC_state, lastbr, ender);
a0d0e21e 2475
c277df42
IZ
2476 if (have_branch) {
2477 /* Hook the tails of the branches to the closing node. */
2478 for (br = ret; br != NULL; br = regnext(br)) {
830247a4 2479 regoptail(pRExC_state, br, ender);
c277df42
IZ
2480 }
2481 }
a0d0e21e 2482 }
c277df42
IZ
2483
2484 {
2485 char *p;
2486 static char parens[] = "=!<,>";
2487
2488 if (paren && (p = strchr(parens, paren))) {
2489 int node = ((p - parens) % 2) ? UNLESSM : IFMATCH;
2490 int flag = (p - parens) > 1;
2491
2492 if (paren == '>')
2493 node = SUSPEND, flag = 0;
830247a4 2494 reginsert(pRExC_state, node,ret);
c277df42 2495 ret->flags = flag;
830247a4 2496 regtail(pRExC_state, ret, reg_node(pRExC_state, TAIL));
c277df42 2497 }
a0d0e21e
LW
2498 }
2499
2500 /* Check for proper termination. */
ce3e6498 2501 if (paren) {
830247a4
IZ
2502 RExC_flags16 = oregflags;
2503 if (RExC_parse >= RExC_end || *nextchar(pRExC_state) != ')') {
2504 RExC_parse = oregcomp_parse;
380a0633 2505 vFAIL("Unmatched (");
ce3e6498 2506 }
a0ed51b3 2507 }
830247a4
IZ
2508 else if (!paren && RExC_parse < RExC_end) {
2509 if (*RExC_parse == ')') {
2510 RExC_parse++;
380a0633 2511 vFAIL("Unmatched )");
a0ed51b3
LW
2512 }
2513 else
b45f050a 2514 FAIL("Junk on end of regexp"); /* "Can't happen". */
a0d0e21e
LW
2515 /* NOTREACHED */
2516 }
a687059c 2517
a0d0e21e 2518 return(ret);
a687059c
LW
2519}
2520
2521/*
2522 - regbranch - one alternative of an | operator
2523 *
2524 * Implements the concatenation operator.
2525 */
76e3520e 2526STATIC regnode *
830247a4 2527S_regbranch(pTHX_ RExC_state_t *pRExC_state, I32 *flagp, I32 first)
a687059c 2528{
c277df42
IZ
2529 register regnode *ret;
2530 register regnode *chain = NULL;
2531 register regnode *latest;
2532 I32 flags = 0, c = 0;
a0d0e21e 2533
b81d288d 2534 if (first)
c277df42
IZ
2535 ret = NULL;
2536 else {
b81d288d 2537 if (!SIZE_ONLY && RExC_extralen)
830247a4 2538 ret = reganode(pRExC_state, BRANCHJ,0);
fac92740 2539 else {
830247a4 2540 ret = reg_node(pRExC_state, BRANCH);
fac92740
MJD
2541 Set_Node_Length(ret, 1);
2542 }
c277df42
IZ
2543 }
2544
b81d288d 2545 if (!first && SIZE_ONLY)
830247a4 2546 RExC_extralen += 1; /* BRANCHJ */
b81d288d 2547
c277df42 2548 *flagp = WORST; /* Tentatively. */
a0d0e21e 2549
830247a4
IZ
2550 RExC_parse--;
2551 nextchar(pRExC_state);
2552 while (RExC_parse < RExC_end && *RExC_parse != '|' && *RExC_parse != ')') {
a0d0e21e 2553 flags &= ~TRYAGAIN;
830247a4 2554 latest = regpiece(pRExC_state, &flags);
a0d0e21e
LW
2555 if (latest == NULL) {
2556 if (flags & TRYAGAIN)
2557 continue;
2558 return(NULL);
a0ed51b3
LW
2559 }
2560 else if (ret == NULL)
c277df42 2561 ret = latest;
a0d0e21e 2562 *flagp |= flags&HASWIDTH;
c277df42 2563 if (chain == NULL) /* First piece. */
a0d0e21e
LW
2564 *flagp |= flags&SPSTART;
2565 else {
830247a4
IZ
2566 RExC_naughty++;
2567 regtail(pRExC_state, chain, latest);
a687059c 2568 }
a0d0e21e 2569 chain = latest;
c277df42
IZ
2570 c++;
2571 }
2572 if (chain == NULL) { /* Loop ran zero times. */
830247a4 2573 chain = reg_node(pRExC_state, NOTHING);
c277df42
IZ
2574 if (ret == NULL)
2575 ret = chain;
2576 }
2577 if (c == 1) {
2578 *flagp |= flags&SIMPLE;
a0d0e21e 2579 }
a687059c 2580
a0d0e21e 2581 return(ret);
a687059c
LW
2582}
2583
2584/*
2585 - regpiece - something followed by possible [*+?]
2586 *
2587 * Note that the branching code sequences used for ? and the general cases
2588 * of * and + are somewhat optimized: they use the same NOTHING node as
2589 * both the endmarker for their branch list and the body of the last branch.
2590 * It might seem that this node could be dispensed with entirely, but the
2591 * endmarker role is not redundant.
2592 */
76e3520e 2593STATIC regnode *
830247a4 2594S_regpiece(pTHX_ RExC_state_t *pRExC_state, I32 *flagp)
a687059c 2595{
c277df42 2596 register regnode *ret;
a0d0e21e
LW
2597 register char op;
2598 register char *next;
2599 I32 flags;
830247a4 2600 char *origparse = RExC_parse;
a0d0e21e
LW
2601 char *maxpos;
2602 I32 min;
c277df42 2603 I32 max = REG_INFTY;
fac92740 2604 char *parse_start;
a0d0e21e 2605
830247a4 2606 ret = regatom(pRExC_state, &flags);
a0d0e21e
LW
2607 if (ret == NULL) {
2608 if (flags & TRYAGAIN)
2609 *flagp |= TRYAGAIN;
2610 return(NULL);
2611 }
2612
830247a4 2613 op = *RExC_parse;
a0d0e21e 2614
830247a4 2615 if (op == '{' && regcurly(RExC_parse)) {
fac92740 2616 parse_start = RExC_parse; /* MJD */
830247a4 2617 next = RExC_parse + 1;
a0d0e21e
LW
2618 maxpos = Nullch;
2619 while (isDIGIT(*next) || *next == ',') {
2620 if (*next == ',') {
2621 if (maxpos)
2622 break;
2623 else
2624 maxpos = next;
a687059c 2625 }
a0d0e21e
LW
2626 next++;
2627 }
2628 if (*next == '}') { /* got one */
2629 if (!maxpos)
2630 maxpos = next;
830247a4
IZ
2631 RExC_parse++;
2632 min = atoi(RExC_parse);
a0d0e21e
LW
2633 if (*maxpos == ',')
2634 maxpos++;
2635 else
830247a4 2636 maxpos = RExC_parse;
a0d0e21e
LW
2637 max = atoi(maxpos);
2638 if (!max && *maxpos != '0')
c277df42
IZ
2639 max = REG_INFTY; /* meaning "infinity" */
2640 else if (max >= REG_INFTY)
8615cb43 2641 vFAIL2("Quantifier in {,} bigger than %d", REG_INFTY - 1);
830247a4
IZ
2642 RExC_parse = next;
2643 nextchar(pRExC_state);
a0d0e21e
LW
2644
2645 do_curly:
2646 if ((flags&SIMPLE)) {
830247a4
IZ
2647 RExC_naughty += 2 + RExC_naughty / 2;
2648 reginsert(pRExC_state, CURLY, ret);
fac92740
MJD
2649 Set_Node_Offset(ret, parse_start+1); /* MJD */
2650 Set_Node_Cur_Length(ret);
a0d0e21e
LW
2651 }
2652 else {
830247a4 2653 regnode *w = reg_node(pRExC_state, WHILEM);
2c2d71f5
JH
2654
2655 w->flags = 0;
830247a4
IZ
2656 regtail(pRExC_state, ret, w);
2657 if (!SIZE_ONLY && RExC_extralen) {
2658 reginsert(pRExC_state, LONGJMP,ret);
2659 reginsert(pRExC_state, NOTHING,ret);
c277df42
IZ
2660 NEXT_OFF(ret) = 3; /* Go over LONGJMP. */
2661 }
830247a4 2662 reginsert(pRExC_state, CURLYX,ret);
fac92740
MJD
2663 /* MJD hk */
2664 Set_Node_Offset(ret, parse_start+1);
2665 Set_Node_Length(ret,
2666 op == '{' ? (RExC_parse - parse_start) : 1);
2667
830247a4 2668 if (!SIZE_ONLY && RExC_extralen)
c277df42 2669 NEXT_OFF(ret) = 3; /* Go over NOTHING to LONGJMP. */
830247a4 2670 regtail(pRExC_state, ret, reg_node(pRExC_state, NOTHING));
c277df42 2671 if (SIZE_ONLY)
830247a4
IZ
2672 RExC_whilem_seen++, RExC_extralen += 3;
2673 RExC_naughty += 4 + RExC_naughty; /* compound interest */
a0d0e21e 2674 }
c277df42 2675 ret->flags = 0;
a0d0e21e
LW
2676
2677 if (min > 0)
821b33a5
IZ
2678 *flagp = WORST;
2679 if (max > 0)
2680 *flagp |= HASWIDTH;
a0d0e21e 2681 if (max && max < min)
8615cb43 2682 vFAIL("Can't do {n,m} with n > m");
c277df42
IZ
2683 if (!SIZE_ONLY) {
2684 ARG1_SET(ret, min);
2685 ARG2_SET(ret, max);
a687059c 2686 }
a687059c 2687
a0d0e21e 2688 goto nest_check;
a687059c 2689 }
a0d0e21e 2690 }
a687059c 2691
a0d0e21e
LW
2692 if (!ISMULT1(op)) {
2693 *flagp = flags;
a687059c 2694 return(ret);
a0d0e21e 2695 }
bb20fd44 2696
c277df42 2697#if 0 /* Now runtime fix should be reliable. */
b45f050a
JF
2698
2699 /* if this is reinstated, don't forget to put this back into perldiag:
2700
2701 =item Regexp *+ operand could be empty at {#} in regex m/%s/
2702
2703 (F) The part of the regexp subject to either the * or + quantifier
2704 could match an empty string. The {#} shows in the regular
2705 expression about where the problem was discovered.
2706
2707 */
2708
bb20fd44 2709 if (!(flags&HASWIDTH) && op != '?')
b45f050a 2710 vFAIL("Regexp *+ operand could be empty");
b81d288d 2711#endif
bb20fd44 2712
fac92740 2713 parse_start = RExC_parse;
830247a4 2714 nextchar(pRExC_state);
a0d0e21e 2715
821b33a5 2716 *flagp = (op != '+') ? (WORST|SPSTART|HASWIDTH) : (WORST|HASWIDTH);
a0d0e21e
LW
2717
2718 if (op == '*' && (flags&SIMPLE)) {
830247a4 2719 reginsert(pRExC_state, STAR, ret);
c277df42 2720 ret->flags = 0;
830247a4 2721 RExC_naughty += 4;
a0d0e21e
LW
2722 }
2723 else if (op == '*') {
2724 min = 0;
2725 goto do_curly;
a0ed51b3
LW
2726 }
2727 else if (op == '+' && (flags&SIMPLE)) {
830247a4 2728 reginsert(pRExC_state, PLUS, ret);
c277df42 2729 ret->flags = 0;
830247a4 2730 RExC_naughty += 3;
a0d0e21e
LW
2731 }
2732 else if (op == '+') {
2733 min = 1;
2734 goto do_curly;
a0ed51b3
LW
2735 }
2736 else if (op == '?') {
a0d0e21e
LW
2737 min = 0; max = 1;
2738 goto do_curly;
2739 }
2740 nest_check:
e476b1b5 2741 if (ckWARN(WARN_REGEXP) && !SIZE_ONLY && !(flags&HASWIDTH) && max > REG_INFTY/3) {
830247a4 2742 vWARN3(RExC_parse,
b45f050a 2743 "%.*s matches null string many times",
830247a4 2744 RExC_parse - origparse,
b45f050a 2745 origparse);
a0d0e21e
LW
2746 }
2747
830247a4
IZ
2748 if (*RExC_parse == '?') {
2749 nextchar(pRExC_state);
2750 reginsert(pRExC_state, MINMOD, ret);
2751 regtail(pRExC_state, ret, ret + NODE_STEP_REGNODE);
a0d0e21e 2752 }
830247a4
IZ
2753 if (ISMULT2(RExC_parse)) {
2754 RExC_parse++;
b45f050a
JF
2755 vFAIL("Nested quantifiers");
2756 }
a0d0e21e
LW
2757
2758 return(ret);
a687059c
LW
2759}
2760
2761/*
2762 - regatom - the lowest level
2763 *
2764 * Optimization: gobbles an entire sequence of ordinary characters so that
2765 * it can turn them into a single node, which is smaller to store and
2766 * faster to run. Backslashed characters are exceptions, each becoming a
2767 * separate node; the code is simpler that way and it's not worth fixing.
2768 *
b45f050a 2769 * [Yes, it is worth fixing, some scripts can run twice the speed.] */
76e3520e 2770STATIC regnode *
830247a4 2771S_regatom(pTHX_ RExC_state_t *pRExC_state, I32 *flagp)
a687059c 2772{
c277df42 2773 register regnode *ret = 0;
a0d0e21e 2774 I32 flags;
f06dbbb7 2775 char *parse_start = 0;
a0d0e21e
LW
2776
2777 *flagp = WORST; /* Tentatively. */
2778
2779tryagain:
830247a4 2780 switch (*RExC_parse) {
a0d0e21e 2781 case '^':
830247a4
IZ
2782 RExC_seen_zerolen++;
2783 nextchar(pRExC_state);
2784 if (RExC_flags16 & PMf_MULTILINE)
2785 ret = reg_node(pRExC_state, MBOL);
2786 else if (RExC_flags16 & PMf_SINGLELINE)
2787 ret = reg_node(pRExC_state, SBOL);
a0d0e21e 2788 else
830247a4 2789 ret = reg_node(pRExC_state, BOL);
fac92740 2790 Set_Node_Length(ret, 1); /* MJD */
a0d0e21e
LW
2791 break;
2792 case '$':
830247a4 2793 nextchar(pRExC_state);
b81d288d 2794 if (*RExC_parse)
830247a4
IZ
2795 RExC_seen_zerolen++;
2796 if (RExC_flags16 & PMf_MULTILINE)
2797 ret = reg_node(pRExC_state, MEOL);
2798 else if (RExC_flags16 & PMf_SINGLELINE)
2799 ret = reg_node(pRExC_state, SEOL);
a0d0e21e 2800 else
830247a4 2801 ret = reg_node(pRExC_state, EOL);
fac92740 2802 Set_Node_Length(ret, 1); /* MJD */
a0d0e21e
LW
2803 break;
2804 case '.':
830247a4 2805 nextchar(pRExC_state);
cce850e4 2806 if (RExC_flags16 & PMf_SINGLELINE)
ffc61ed2
JH
2807 ret = reg_node(pRExC_state, SANY);
2808 else
2809 ret = reg_node(pRExC_state, REG_ANY);
2810 *flagp |= HASWIDTH|SIMPLE;
830247a4 2811 RExC_naughty++;
fac92740 2812 Set_Node_Length(ret, 1); /* MJD */
a0d0e21e
LW
2813 break;
2814 case '[':
b45f050a 2815 {
830247a4 2816 char *oregcomp_parse = ++RExC_parse;
ffc61ed2 2817 ret = regclass(pRExC_state);
830247a4
IZ
2818 if (*RExC_parse != ']') {
2819 RExC_parse = oregcomp_parse;
b45f050a
JF
2820 vFAIL("Unmatched [");
2821 }
830247a4 2822 nextchar(pRExC_state);
a0d0e21e 2823 *flagp |= HASWIDTH|SIMPLE;
fac92740 2824 Set_Node_Length(ret, RExC_parse - oregcomp_parse + 1); /* MJD */
a0d0e21e 2825 break;
b45f050a 2826 }
a0d0e21e 2827 case '(':
830247a4
IZ
2828 nextchar(pRExC_state);
2829 ret = reg(pRExC_state, 1, &flags);
a0d0e21e 2830 if (ret == NULL) {
bf93d4cc 2831 if (flags & TRYAGAIN) {
830247a4 2832 if (RExC_parse == RExC_end) {
bf93d4cc
GS
2833 /* Make parent create an empty node if needed. */
2834 *flagp |= TRYAGAIN;
2835 return(NULL);
2836 }
a0d0e21e 2837 goto tryagain;
bf93d4cc 2838 }
a0d0e21e
LW
2839 return(NULL);
2840 }
c277df42 2841 *flagp |= flags&(HASWIDTH|SPSTART|SIMPLE);
a0d0e21e
LW
2842 break;
2843 case '|':
2844 case ')':
2845 if (flags & TRYAGAIN) {
2846 *flagp |= TRYAGAIN;
2847 return NULL;
2848 }
b45f050a 2849 vFAIL("Internal urp");
a0d0e21e
LW
2850 /* Supposed to be caught earlier. */
2851 break;
85afd4ae 2852 case '{':
830247a4
IZ
2853 if (!regcurly(RExC_parse)) {
2854 RExC_parse++;
85afd4ae
CS
2855 goto defchar;
2856 }
2857 /* FALL THROUGH */
a0d0e21e
LW
2858 case '?':
2859 case '+':
2860 case '*':
830247a4 2861 RExC_parse++;
b45f050a 2862 vFAIL("Quantifier follows nothing");
a0d0e21e
LW
2863 break;
2864 case '\\':
830247a4 2865 switch (*++RExC_parse) {
a0d0e21e 2866 case 'A':
830247a4
IZ
2867 RExC_seen_zerolen++;
2868 ret = reg_node(pRExC_state, SBOL);
a0d0e21e 2869 *flagp |= SIMPLE;
830247a4 2870 nextchar(pRExC_state);
fac92740 2871 Set_Node_Length(ret, 2); /* MJD */
a0d0e21e
LW
2872 break;
2873 case 'G':
830247a4
IZ
2874 ret = reg_node(pRExC_state, GPOS);
2875 RExC_seen |= REG_SEEN_GPOS;
a0d0e21e 2876 *flagp |= SIMPLE;
830247a4 2877 nextchar(pRExC_state);
fac92740 2878 Set_Node_Length(ret, 2); /* MJD */
a0d0e21e
LW
2879 break;
2880 case 'Z':
830247a4 2881 ret = reg_node(pRExC_state, SEOL);
a0d0e21e 2882 *flagp |= SIMPLE;
a1917ab9 2883 RExC_seen_zerolen++; /* Do not optimize RE away */
830247a4 2884 nextchar(pRExC_state);
a0d0e21e 2885 break;
b85d18e9 2886 case 'z':
830247a4 2887 ret = reg_node(pRExC_state, EOS);
b85d18e9 2888 *flagp |= SIMPLE;
830247a4
IZ
2889 RExC_seen_zerolen++; /* Do not optimize RE away */
2890 nextchar(pRExC_state);
fac92740 2891 Set_Node_Length(ret, 2); /* MJD */
b85d18e9 2892 break;
4a2d328f 2893 case 'C':
f33976b4
DB
2894 ret = reg_node(pRExC_state, CANY);
2895 RExC_seen |= REG_SEEN_CANY;
a0ed51b3 2896 *flagp |= HASWIDTH|SIMPLE;
830247a4 2897 nextchar(pRExC_state);
fac92740 2898 Set_Node_Length(ret, 2); /* MJD */
a0ed51b3
LW
2899 break;
2900 case 'X':
830247a4 2901 ret = reg_node(pRExC_state, CLUMP);
a0ed51b3 2902 *flagp |= HASWIDTH;
830247a4 2903 nextchar(pRExC_state);
fac92740 2904 Set_Node_Length(ret, 2); /* MJD */
a0ed51b3 2905 break;
a0d0e21e 2906 case 'w':
ffc61ed2 2907 ret = reg_node(pRExC_state, LOC ? ALNUML : ALNUM);
a0d0e21e 2908 *flagp |= HASWIDTH|SIMPLE;
830247a4 2909 nextchar(pRExC_state);
fac92740 2910 Set_Node_Length(ret, 2); /* MJD */
a0d0e21e
LW
2911 break;
2912 case 'W':
ffc61ed2 2913 ret = reg_node(pRExC_state, LOC ? NALNUML : NALNUM);
a0d0e21e 2914 *flagp |= HASWIDTH|SIMPLE;
830247a4 2915 nextchar(pRExC_state);
fac92740 2916 Set_Node_Length(ret, 2); /* MJD */
a0d0e21e
LW
2917 break;
2918 case 'b':
830247a4
IZ
2919 RExC_seen_zerolen++;
2920 RExC_seen |= REG_SEEN_LOOKBEHIND;
ffc61ed2 2921 ret = reg_node(pRExC_state, LOC ? BOUNDL : BOUND);
a0d0e21e 2922 *flagp |= SIMPLE;
830247a4 2923 nextchar(pRExC_state);
fac92740 2924 Set_Node_Length(ret, 2); /* MJD */
a0d0e21e
LW
2925 break;
2926 case 'B':
830247a4
IZ
2927 RExC_seen_zerolen++;
2928 RExC_seen |= REG_SEEN_LOOKBEHIND;
ffc61ed2 2929 ret = reg_node(pRExC_state, LOC ? NBOUNDL : NBOUND);
a0d0e21e 2930 *flagp |= SIMPLE;
830247a4 2931 nextchar(pRExC_state);
fac92740 2932 Set_Node_Length(ret, 2); /* MJD */
a0d0e21e
LW
2933 break;
2934 case 's':
ffc61ed2 2935 ret = reg_node(pRExC_state, LOC ? SPACEL : SPACE);
a0d0e21e 2936 *flagp |= HASWIDTH|SIMPLE;
830247a4 2937 nextchar(pRExC_state);
fac92740 2938 Set_Node_Length(ret, 2); /* MJD */
a0d0e21e
LW
2939 break;
2940 case 'S':
ffc61ed2 2941 ret = reg_node(pRExC_state, LOC ? NSPACEL : NSPACE);
a0d0e21e 2942 *flagp |= HASWIDTH|SIMPLE;
830247a4 2943 nextchar(pRExC_state);
fac92740 2944 Set_Node_Length(ret, 2); /* MJD */
a0d0e21e
LW
2945 break;
2946 case 'd':
ffc61ed2 2947 ret = reg_node(pRExC_state, DIGIT);
a0d0e21e 2948 *flagp |= HASWIDTH|SIMPLE;
830247a4 2949 nextchar(pRExC_state);
fac92740 2950 Set_Node_Length(ret, 2); /* MJD */
a0d0e21e
LW
2951 break;
2952 case 'D':
ffc61ed2 2953 ret = reg_node(pRExC_state, NDIGIT);
a0d0e21e 2954 *flagp |= HASWIDTH|SIMPLE;
830247a4 2955 nextchar(pRExC_state);
fac92740 2956 Set_Node_Length(ret, 2); /* MJD */
a0d0e21e 2957 break;
a14b48bc
LW
2958 case 'p':
2959 case 'P':
3568d838 2960 {
830247a4 2961 char* oldregxend = RExC_end;
fac92740 2962 char* parse_start = RExC_parse;
a14b48bc 2963
830247a4 2964 if (RExC_parse[1] == '{') {
3568d838 2965 /* a lovely hack--pretend we saw [\pX] instead */
830247a4
IZ
2966 RExC_end = strchr(RExC_parse, '}');
2967 if (!RExC_end) {
0da60cf5 2968 U8 c = (U8)*RExC_parse;
830247a4
IZ
2969 RExC_parse += 2;
2970 RExC_end = oldregxend;
0da60cf5 2971 vFAIL2("Missing right brace on \\%c{}", c);
b45f050a 2972 }
830247a4 2973 RExC_end++;
a14b48bc 2974 }
af6f566e 2975 else {
830247a4 2976 RExC_end = RExC_parse + 2;
af6f566e
HS
2977 if (RExC_end > oldregxend)
2978 RExC_end = oldregxend;
2979 }
830247a4 2980 RExC_parse--;
a14b48bc 2981
ffc61ed2 2982 ret = regclass(pRExC_state);
a14b48bc 2983
830247a4
IZ
2984 RExC_end = oldregxend;
2985 RExC_parse--;
fac92740 2986 Set_Node_Cur_Length(ret); /* MJD */
830247a4 2987 nextchar(pRExC_state);
a14b48bc
LW
2988 *flagp |= HASWIDTH|SIMPLE;
2989 }
2990 break;
a0d0e21e
LW
2991 case 'n':
2992 case 'r':
2993 case 't':
2994 case 'f':
2995 case 'e':
2996 case 'a':
2997 case 'x':
2998 case 'c':
2999 case '0':
3000 goto defchar;
3001 case '1': case '2': case '3': case '4':
3002 case '5': case '6': case '7': case '8': case '9':
3003 {
830247a4 3004 I32 num = atoi(RExC_parse);
a0d0e21e 3005
830247a4 3006 if (num > 9 && num >= RExC_npar)
a0d0e21e
LW
3007 goto defchar;
3008 else {
fac92740 3009 char * parse_start = RExC_parse - 1; /* MJD */
830247a4
IZ
3010 while (isDIGIT(*RExC_parse))
3011 RExC_parse++;
b45f050a 3012
830247a4 3013 if (!SIZE_ONLY && num > RExC_rx->nparens)
9baa0206 3014 vFAIL("Reference to nonexistent group");
830247a4
IZ
3015 RExC_sawback = 1;
3016 ret = reganode(pRExC_state, FOLD
a0ed51b3 3017 ? (LOC ? REFFL : REFF)
c8756f30 3018 : REF, num);
a0d0e21e 3019 *flagp |= HASWIDTH;
fac92740
MJD
3020
3021 /* override incorrect value set in reganode MJD */
3022 Set_Node_Offset(ret, parse_start+1);
3023 Set_Node_Cur_Length(ret); /* MJD */
830247a4
IZ
3024 RExC_parse--;
3025 nextchar(pRExC_state);
a0d0e21e
LW
3026 }
3027 }
3028 break;
3029 case '\0':
830247a4 3030 if (RExC_parse >= RExC_end)
b45f050a 3031 FAIL("Trailing \\");
a0d0e21e
LW
3032 /* FALL THROUGH */
3033 default:
c9f97d15
IZ
3034 /* Do not generate `unrecognized' warnings here, we fall
3035 back into the quick-grab loop below */
a0d0e21e
LW
3036 goto defchar;
3037 }
3038 break;
4633a7c4
LW
3039
3040 case '#':
830247a4
IZ
3041 if (RExC_flags16 & PMf_EXTENDED) {
3042 while (RExC_parse < RExC_end && *RExC_parse != '\n') RExC_parse++;
3043 if (RExC_parse < RExC_end)
4633a7c4
LW
3044 goto tryagain;
3045 }
3046 /* FALL THROUGH */
3047
a0d0e21e 3048 default: {
ba210ebe 3049 register STRLEN len;
58ae7d3f 3050 register UV ender;
a0d0e21e 3051 register char *p;
c277df42 3052 char *oldp, *s;
ba210ebe 3053 STRLEN numlen;
80aecb99 3054 STRLEN foldlen;
60a8b682 3055 U8 tmpbuf[UTF8_MAXLEN_FOLD+1], *foldbuf;
f06dbbb7
JH
3056
3057 parse_start = RExC_parse - 1;
a0d0e21e 3058
830247a4 3059 RExC_parse++;
a0d0e21e
LW
3060
3061 defchar:
58ae7d3f 3062 ender = 0;
830247a4 3063 ret = reg_node(pRExC_state, FOLD
a0ed51b3 3064 ? (LOC ? EXACTFL : EXACTF)
bbce6d69 3065 : EXACT);
cd439c50 3066 s = STRING(ret);
830247a4
IZ
3067 for (len = 0, p = RExC_parse - 1;
3068 len < 127 && p < RExC_end;
a0d0e21e
LW
3069 len++)
3070 {
3071 oldp = p;
5b5a24f7 3072
830247a4
IZ
3073 if (RExC_flags16 & PMf_EXTENDED)
3074 p = regwhite(p, RExC_end);
a0d0e21e
LW
3075 switch (*p) {
3076 case '^':
3077 case '$':
3078 case '.':
3079 case '[':
3080 case '(':
3081 case ')':
3082 case '|':
3083 goto loopdone;
3084 case '\\':
3085 switch (*++p) {
3086 case 'A':
1ed8eac0
JF
3087 case 'C':
3088 case 'X':
a0d0e21e
LW
3089 case 'G':
3090 case 'Z':
b85d18e9 3091 case 'z':
a0d0e21e
LW
3092 case 'w':
3093 case 'W':
3094 case 'b':
3095 case 'B':
3096 case 's':
3097 case 'S':
3098 case 'd':
3099 case 'D':
a14b48bc
LW
3100 case 'p':
3101 case 'P':
a0d0e21e
LW
3102 --p;
3103 goto loopdone;
3104 case 'n':
3105 ender = '\n';
3106 p++;
a687059c 3107 break;
a0d0e21e
LW
3108 case 'r':
3109 ender = '\r';
3110 p++;
a687059c 3111 break;
a0d0e21e
LW
3112 case 't':
3113 ender = '\t';
3114 p++;
a687059c 3115 break;
a0d0e21e
LW
3116 case 'f':
3117 ender = '\f';
3118 p++;
a687059c 3119 break;
a0d0e21e 3120 case 'e':
c7f1f016 3121 ender = ASCII_TO_NATIVE('\033');
a0d0e21e 3122 p++;
a687059c 3123 break;
a0d0e21e 3124 case 'a':
c7f1f016 3125 ender = ASCII_TO_NATIVE('\007');
a0d0e21e 3126 p++;
a687059c 3127 break;
a0d0e21e 3128 case 'x':
a0ed51b3
LW
3129 if (*++p == '{') {
3130 char* e = strchr(p, '}');
b81d288d 3131
b45f050a 3132 if (!e) {
830247a4 3133 RExC_parse = p + 1;
b45f050a
JF
3134 vFAIL("Missing right brace on \\x{}");
3135 }
de5f0749 3136 else {
a4c04bdc
NC
3137 I32 flags = PERL_SCAN_ALLOW_UNDERSCORES
3138 | PERL_SCAN_DISALLOW_PREFIX;
53305cf1
NC
3139 numlen = e - p - 1;
3140 ender = grok_hex(p + 1, &numlen, &flags, NULL);
aaa80028
JH
3141 if (ender > 0xff)
3142 RExC_utf8 = 1;
b21ed0a9
GS
3143 /* numlen is generous */
3144 if (numlen + len >= 127) {
a0ed51b3
LW
3145 p--;
3146 goto loopdone;
3147 }
3148 p = e + 1;
3149 }
a0ed51b3
LW
3150 }
3151 else {
a4c04bdc 3152 I32 flags = PERL_SCAN_DISALLOW_PREFIX;
53305cf1
NC
3153 numlen = 2;
3154 ender = grok_hex(p, &numlen, &flags, NULL);
a0ed51b3
LW
3155 p += numlen;
3156 }
a687059c 3157 break;
a0d0e21e
LW
3158 case 'c':
3159 p++;
bbce6d69 3160 ender = UCHARAT(p++);
3161 ender = toCTRL(ender);
a687059c 3162 break;
a0d0e21e
LW
3163 case '0': case '1': case '2': case '3':case '4':
3164 case '5': case '6': case '7': case '8':case '9':
3165 if (*p == '0' ||
830247a4 3166 (isDIGIT(p[1]) && atoi(p) >= RExC_npar) ) {
53305cf1
NC
3167 I32 flags = 0;
3168 numlen = 3;
3169 ender = grok_oct(p, &numlen, &flags, NULL);
a0d0e21e
LW
3170 p += numlen;
3171 }
3172 else {
3173 --p;
3174 goto loopdone;
a687059c
LW
3175 }
3176 break;
a0d0e21e 3177 case '\0':
830247a4 3178 if (p >= RExC_end)
b45f050a 3179 FAIL("Trailing \\");
a687059c 3180 /* FALL THROUGH */
a0d0e21e 3181 default:
e476b1b5 3182 if (!SIZE_ONLY && ckWARN(WARN_REGEXP) && isALPHA(*p))
4193bef7 3183 vWARN2(p + 1, "Unrecognized escape \\%c passed through", UCHARAT(p));
a0ed51b3 3184 goto normal_default;
a0d0e21e
LW
3185 }
3186 break;
a687059c 3187 default:
a0ed51b3 3188 normal_default:
fd400ab9 3189 if (UTF8_IS_START(*p) && UTF) {
5e12f4fb 3190 ender = utf8n_to_uvchr((U8*)p, RExC_end - p,
ba210ebe 3191 &numlen, 0);
a0ed51b3
LW
3192 p += numlen;
3193 }
3194 else
3195 ender = *p++;
a0d0e21e 3196 break;
a687059c 3197 }
830247a4
IZ
3198 if (RExC_flags16 & PMf_EXTENDED)
3199 p = regwhite(p, RExC_end);
60a8b682
JH
3200 if (UTF && FOLD) {
3201 /* Prime the casefolded buffer. */
ac7e0132 3202 ender = toFOLD_uni(ender, tmpbuf, &foldlen);
60a8b682 3203 }
a0d0e21e
LW
3204 if (ISMULT2(p)) { /* Back off on ?+*. */
3205 if (len)
3206 p = oldp;
16ea2a2e 3207 else if (UTF) {
0ebc6274
JH
3208 STRLEN unilen;
3209
80aecb99 3210 if (FOLD) {
60a8b682 3211 /* Emit all the Unicode characters. */
80aecb99
JH
3212 for (foldbuf = tmpbuf;
3213 foldlen;
3214 foldlen -= numlen) {
3215 ender = utf8_to_uvchr(foldbuf, &numlen);
9dc45d57 3216 if (numlen > 0) {
0ebc6274
JH
3217 reguni(pRExC_state, ender, s, &unilen);
3218 s += unilen;
3219 len += unilen;
3220 /* In EBCDIC the numlen
3221 * and unilen can differ. */
9dc45d57 3222 foldbuf += numlen;
47654450
JH
3223 if (numlen >= foldlen)
3224 break;
9dc45d57
JH
3225 }
3226 else
3227 break; /* "Can't happen." */
80aecb99
JH
3228 }
3229 }
3230 else {
0ebc6274 3231 reguni(pRExC_state, ender, s, &unilen);
9ede7db1 3232 if (unilen > 0) {
0ebc6274
JH
3233 s += unilen;
3234 len += unilen;
9dc45d57 3235 }
80aecb99 3236 }
a0ed51b3 3237 }
a0d0e21e
LW
3238 else {
3239 len++;
cd439c50 3240 REGC(ender, s++);
a0d0e21e
LW
3241 }
3242 break;
a687059c 3243 }
16ea2a2e 3244 if (UTF) {
0ebc6274
JH
3245 STRLEN unilen;
3246
80aecb99 3247 if (FOLD) {
60a8b682 3248 /* Emit all the Unicode characters. */
80aecb99
JH
3249 for (foldbuf = tmpbuf;
3250 foldlen;
3251 foldlen -= numlen) {
3252 ender = utf8_to_uvchr(foldbuf, &numlen);
9dc45d57 3253 if (numlen > 0) {
0ebc6274
JH
3254 reguni(pRExC_state, ender, s, &unilen);
3255 len += unilen;
3256 s += unilen;
3257 /* In EBCDIC the numlen
3258 * and unilen can differ. */
9dc45d57 3259 foldbuf += numlen;
47654450
JH
3260 if (numlen >= foldlen)
3261 break;
9dc45d57
JH
3262 }
3263 else
3264 break;
80aecb99
JH
3265 }
3266 }
3267 else {
0ebc6274 3268 reguni(pRExC_state, ender, s, &unilen);
9ede7db1 3269 if (unilen > 0) {
0ebc6274
JH
3270 s += unilen;
3271 len += unilen;
9dc45d57 3272 }
80aecb99
JH
3273 }
3274 len--;
a0ed51b3
LW
3275 }
3276 else
cd439c50 3277 REGC(ender, s++);
a0d0e21e
LW
3278 }
3279 loopdone:
830247a4 3280 RExC_parse = p - 1;
fac92740 3281 Set_Node_Cur_Length(ret); /* MJD */
830247a4 3282 nextchar(pRExC_state);
793db0cb
JH
3283 {
3284 /* len is STRLEN which is unsigned, need to copy to signed */
3285 IV iv = len;
3286 if (iv < 0)
3287 vFAIL("Internal disaster");
3288 }
a0d0e21e
LW
3289 if (len > 0)
3290 *flagp |= HASWIDTH;
3291 if (len == 1)
3292 *flagp |= SIMPLE;
c277df42 3293 if (!SIZE_ONLY)
cd439c50
IZ
3294 STR_LEN(ret) = len;
3295 if (SIZE_ONLY)
830247a4 3296 RExC_size += STR_SZ(len);
cd439c50 3297 else
830247a4 3298 RExC_emit += STR_SZ(len);
a687059c 3299 }
a0d0e21e
LW
3300 break;
3301 }
a687059c 3302
60a8b682
JH
3303 /* If the encoding pragma is in effect recode the text of
3304 * any EXACT-kind nodes. */
22c54be3 3305 if (PL_encoding && PL_regkind[(U8)OP(ret)] == EXACT) {
a72c7584
JH
3306 STRLEN oldlen = STR_LEN(ret);
3307 SV *sv = sv_2mortal(newSVpvn(STRING(ret), oldlen));
22c54be3
JH
3308
3309 if (RExC_utf8)
3310 SvUTF8_on(sv);
3311 if (sv_utf8_downgrade(sv, TRUE)) {
799ef3cb 3312 char *s = sv_recode_to_utf8(sv, PL_encoding);
22c54be3
JH
3313 STRLEN newlen = SvCUR(sv);
3314
3315 if (!SIZE_ONLY) {
3316 DEBUG_r(PerlIO_printf(Perl_debug_log, "recode %*s to %*s\n",
3317 (int)oldlen, STRING(ret),
3318 (int)newlen, s));
3319 Copy(s, STRING(ret), newlen, char);
3320 STR_LEN(ret) += newlen - oldlen;
3321 RExC_emit += STR_SZ(newlen) - STR_SZ(oldlen);
3322 } else
3323 RExC_size += STR_SZ(newlen) - STR_SZ(oldlen);
3324 }
a72c7584
JH
3325 }
3326
a0d0e21e 3327 return(ret);
a687059c
LW
3328}
3329
873ef191 3330STATIC char *
cea2e8a9 3331S_regwhite(pTHX_ char *p, char *e)
5b5a24f7
CS
3332{
3333 while (p < e) {
3334 if (isSPACE(*p))
3335 ++p;
3336 else if (*p == '#') {
3337 do {
3338 p++;
3339 } while (p < e && *p != '\n');
3340 }
3341 else
3342 break;
3343 }
3344 return p;
3345}
3346
b8c5462f
JH
3347/* Parse POSIX character classes: [[:foo:]], [[=foo=]], [[.foo.]].
3348 Character classes ([:foo:]) can also be negated ([:^foo:]).
3349 Returns a named class id (ANYOF_XXX) if successful, -1 otherwise.
3350 Equivalence classes ([=foo=]) and composites ([.foo.]) are parsed,
beeb77fc 3351 but trigger failures because they are currently unimplemented. */
9a86a77b
JH
3352
3353#define POSIXCC_DONE(c) ((c) == ':')
3354#define POSIXCC_NOTYET(c) ((c) == '=' || (c) == '.')
3355#define POSIXCC(c) (POSIXCC_DONE(c) || POSIXCC_NOTYET(c))
3356
b8c5462f 3357STATIC I32
830247a4 3358S_regpposixcc(pTHX_ RExC_state_t *pRExC_state, I32 value)
620e46c5
JH
3359{
3360 char *posixcc = 0;
936ed897 3361 I32 namedclass = OOB_NAMEDCLASS;
620e46c5 3362
830247a4 3363 if (value == '[' && RExC_parse + 1 < RExC_end &&
620e46c5 3364 /* I smell either [: or [= or [. -- POSIX has been here, right? */
9a86a77b
JH
3365 POSIXCC(UCHARAT(RExC_parse))) {
3366 char c = UCHARAT(RExC_parse);
830247a4 3367 char* s = RExC_parse++;
b81d288d 3368
9a86a77b 3369 while (RExC_parse < RExC_end && UCHARAT(RExC_parse) != c)
830247a4
IZ
3370 RExC_parse++;
3371 if (RExC_parse == RExC_end)
620e46c5 3372 /* Grandfather lone [:, [=, [. */
830247a4 3373 RExC_parse = s;
620e46c5 3374 else {
830247a4 3375 char* t = RExC_parse++; /* skip over the c */
b8c5462f 3376
9a86a77b 3377 if (UCHARAT(RExC_parse) == ']') {
830247a4 3378 RExC_parse++; /* skip over the ending ] */
b8c5462f
JH
3379 posixcc = s + 1;
3380 if (*s == ':') {
3381 I32 complement = *posixcc == '^' ? *posixcc++ : 0;
3382 I32 skip = 5; /* the most common skip */
3383
3384 switch (*posixcc) {
3385 case 'a':
3386 if (strnEQ(posixcc, "alnum", 5))
3387 namedclass =
3388 complement ? ANYOF_NALNUMC : ANYOF_ALNUMC;
3389 else if (strnEQ(posixcc, "alpha", 5))
3390 namedclass =
3391 complement ? ANYOF_NALPHA : ANYOF_ALPHA;
3392 else if (strnEQ(posixcc, "ascii", 5))
3393 namedclass =
3394 complement ? ANYOF_NASCII : ANYOF_ASCII;
3395 break;
aaa51d5e
JF
3396 case 'b':
3397 if (strnEQ(posixcc, "blank", 5))
3398 namedclass =
3399 complement ? ANYOF_NBLANK : ANYOF_BLANK;
3400 break;
b8c5462f
JH
3401 case 'c':
3402 if (strnEQ(posixcc, "cntrl", 5))
3403 namedclass =
3404 complement ? ANYOF_NCNTRL : ANYOF_CNTRL;
3405 break;
3406 case 'd':
3407 if (strnEQ(posixcc, "digit", 5))
3408 namedclass =
3409 complement ? ANYOF_NDIGIT : ANYOF_DIGIT;
3410 break;
3411 case 'g':
3412 if (strnEQ(posixcc, "graph", 5))
3413 namedclass =
3414 complement ? ANYOF_NGRAPH : ANYOF_GRAPH;
3415 break;
3416 case 'l':
3417 if (strnEQ(posixcc, "lower", 5))
3418 namedclass =
3419 complement ? ANYOF_NLOWER : ANYOF_LOWER;
3420 break;
3421 case 'p':
3422 if (strnEQ(posixcc, "print", 5))
3423 namedclass =
3424 complement ? ANYOF_NPRINT : ANYOF_PRINT;
3425 else if (strnEQ(posixcc, "punct", 5))
3426 namedclass =
3427 complement ? ANYOF_NPUNCT : ANYOF_PUNCT;
3428 break;
3429 case 's':
3430 if (strnEQ(posixcc, "space", 5))
3431 namedclass =
aaa51d5e 3432 complement ? ANYOF_NPSXSPC : ANYOF_PSXSPC;
cc4319de 3433 break;
b8c5462f
JH
3434 case 'u':
3435 if (strnEQ(posixcc, "upper", 5))
3436 namedclass =
3437 complement ? ANYOF_NUPPER : ANYOF_UPPER;
3438 break;
3439 case 'w': /* this is not POSIX, this is the Perl \w */
3440 if (strnEQ(posixcc, "word", 4)) {
3441 namedclass =
3442 complement ? ANYOF_NALNUM : ANYOF_ALNUM;
3443 skip = 4;
3444 }
3445 break;
3446 case 'x':
3447 if (strnEQ(posixcc, "xdigit", 6)) {
3448 namedclass =
3449 complement ? ANYOF_NXDIGIT : ANYOF_XDIGIT;
3450 skip = 6;
3451 }
3452 break;
3453 }
ac561586
JH
3454 if (namedclass == OOB_NAMEDCLASS ||
3455 posixcc[skip] != ':' ||
3456 posixcc[skip+1] != ']')
b45f050a
JF
3457 {
3458 Simple_vFAIL3("POSIX class [:%.*s:] unknown",
3459 t - s - 1, s + 1);
3460 }
3461 } else if (!SIZE_ONLY) {
b8c5462f 3462 /* [[=foo=]] and [[.foo.]] are still future. */
b45f050a 3463
830247a4 3464 /* adjust RExC_parse so the warning shows after
b45f050a 3465 the class closes */
9a86a77b 3466 while (UCHARAT(RExC_parse) && UCHARAT(RExC_parse) != ']')
830247a4 3467 RExC_parse++;
b45f050a
JF
3468 Simple_vFAIL3("POSIX syntax [%c %c] is reserved for future extensions", c, c);
3469 }
b8c5462f
JH
3470 } else {
3471 /* Maternal grandfather:
3472 * "[:" ending in ":" but not in ":]" */
830247a4 3473 RExC_parse = s;
767d463e 3474 }
620e46c5
JH
3475 }
3476 }
3477
b8c5462f
JH
3478 return namedclass;
3479}
3480
3481STATIC void
830247a4 3482S_checkposixcc(pTHX_ RExC_state_t *pRExC_state)
b8c5462f 3483{
b938889d 3484 if (!SIZE_ONLY && POSIXCC(UCHARAT(RExC_parse))) {
830247a4 3485 char *s = RExC_parse;
93733859 3486 char c = *s++;
b8c5462f
JH
3487
3488 while(*s && isALNUM(*s))
3489 s++;
3490 if (*s && c == *s && s[1] == ']') {
cd84f5b2
RGS
3491 if (ckWARN(WARN_REGEXP))
3492 vWARN3(s+2,
3493 "POSIX syntax [%c %c] belongs inside character classes",
3494 c, c);
b45f050a
JF
3495
3496 /* [[=foo=]] and [[.foo.]] are still future. */
9a86a77b 3497 if (POSIXCC_NOTYET(c)) {
830247a4 3498 /* adjust RExC_parse so the error shows after
b45f050a 3499 the class closes */
9a86a77b 3500 while (UCHARAT(RExC_parse) && UCHARAT(RExC_parse++) != ']')
b45f050a
JF
3501 ;
3502 Simple_vFAIL3("POSIX syntax [%c %c] is reserved for future extensions", c, c);
3503 }
b8c5462f
JH
3504 }
3505 }
620e46c5
JH
3506}
3507
76e3520e 3508STATIC regnode *
830247a4 3509S_regclass(pTHX_ RExC_state_t *pRExC_state)
a687059c 3510{
ffc61ed2 3511 register UV value;
9a86a77b 3512 register UV nextvalue;
3568d838 3513 register IV prevvalue = OOB_UNICODE;
ffc61ed2 3514 register IV range = 0;
c277df42 3515 register regnode *ret;
ba210ebe 3516 STRLEN numlen;
ffc61ed2 3517 IV namedclass;
9c5ffd7c 3518 char *rangebegin = 0;
936ed897 3519 bool need_class = 0;
9c5ffd7c 3520 SV *listsv = Nullsv;
ffc61ed2
JH
3521 register char *e;
3522 UV n;
9e55ce06
JH
3523 bool optimize_invert = TRUE;
3524 AV* unicode_alternate = 0;
ffc61ed2
JH
3525
3526 ret = reganode(pRExC_state, ANYOF, 0);
3527
3528 if (!SIZE_ONLY)
3529 ANYOF_FLAGS(ret) = 0;
3530
9a86a77b 3531 if (UCHARAT(RExC_parse) == '^') { /* Complement of range. */
ffc61ed2
JH
3532 RExC_naughty++;
3533 RExC_parse++;
3534 if (!SIZE_ONLY)
3535 ANYOF_FLAGS(ret) |= ANYOF_INVERT;
3536 }
a0d0e21e 3537
936ed897 3538 if (SIZE_ONLY)
830247a4 3539 RExC_size += ANYOF_SKIP;
936ed897 3540 else {
830247a4 3541 RExC_emit += ANYOF_SKIP;
936ed897
IZ
3542 if (FOLD)
3543 ANYOF_FLAGS(ret) |= ANYOF_FOLD;
3544 if (LOC)
3545 ANYOF_FLAGS(ret) |= ANYOF_LOCALE;
ffc61ed2
JH
3546 ANYOF_BITMAP_ZERO(ret);
3547 listsv = newSVpvn("# comment\n", 10);
a0d0e21e 3548 }
b8c5462f 3549
9a86a77b
JH
3550 nextvalue = RExC_parse < RExC_end ? UCHARAT(RExC_parse) : 0;
3551
b938889d 3552 if (!SIZE_ONLY && POSIXCC(nextvalue))
830247a4 3553 checkposixcc(pRExC_state);
b8c5462f 3554
f064b6ad
HS
3555 /* allow 1st char to be ] (allowing it to be - is dealt with later) */
3556 if (UCHARAT(RExC_parse) == ']')
3557 goto charclassloop;
ffc61ed2 3558
9a86a77b 3559 while (RExC_parse < RExC_end && UCHARAT(RExC_parse) != ']') {
ffc61ed2
JH
3560
3561 charclassloop:
3562
3563 namedclass = OOB_NAMEDCLASS; /* initialize as illegal */
3564
73b437c8 3565 if (!range)
830247a4 3566 rangebegin = RExC_parse;
ffc61ed2 3567 if (UTF) {
5e12f4fb 3568 value = utf8n_to_uvchr((U8*)RExC_parse,
3568d838
JH
3569 RExC_end - RExC_parse,
3570 &numlen, 0);
ffc61ed2
JH
3571 RExC_parse += numlen;
3572 }
3573 else
3574 value = UCHARAT(RExC_parse++);
9a86a77b
JH
3575 nextvalue = RExC_parse < RExC_end ? UCHARAT(RExC_parse) : 0;
3576 if (value == '[' && POSIXCC(nextvalue))
830247a4 3577 namedclass = regpposixcc(pRExC_state, value);
620e46c5 3578 else if (value == '\\') {
ffc61ed2 3579 if (UTF) {
5e12f4fb 3580 value = utf8n_to_uvchr((U8*)RExC_parse,
ffc61ed2
JH
3581 RExC_end - RExC_parse,
3582 &numlen, 0);
3583 RExC_parse += numlen;
3584 }
3585 else
3586 value = UCHARAT(RExC_parse++);
470c3474 3587 /* Some compilers cannot handle switching on 64-bit integer
ffc61ed2 3588 * values, therefore value cannot be an UV. Yes, this will
e2962f66
JH
3589 * be a problem later if we want switch on Unicode.
3590 * A similar issue a little bit later when switching on
3591 * namedclass. --jhi */
ffc61ed2 3592 switch ((I32)value) {
b8c5462f
JH
3593 case 'w': namedclass = ANYOF_ALNUM; break;
3594 case 'W': namedclass = ANYOF_NALNUM; break;
3595 case 's': namedclass = ANYOF_SPACE; break;
3596 case 'S': namedclass = ANYOF_NSPACE; break;
3597 case 'd': namedclass = ANYOF_DIGIT; break;
3598 case 'D': namedclass = ANYOF_NDIGIT; break;
ffc61ed2
JH
3599 case 'p':
3600 case 'P':
af6f566e 3601 if (RExC_parse >= RExC_end)
2a4859cd 3602 vFAIL2("Empty \\%c{}", (U8)value);
ffc61ed2 3603 if (*RExC_parse == '{') {
0da60cf5 3604 U8 c = (U8)value;
ffc61ed2
JH
3605 e = strchr(RExC_parse++, '}');
3606 if (!e)
0da60cf5 3607 vFAIL2("Missing right brace on \\%c{}", c);
ab13f0c7
JH
3608 while (isSPACE(UCHARAT(RExC_parse)))
3609 RExC_parse++;
3610 if (e == RExC_parse)
0da60cf5 3611 vFAIL2("Empty \\%c{}", c);
ffc61ed2 3612 n = e - RExC_parse;
ab13f0c7
JH
3613 while (isSPACE(UCHARAT(RExC_parse + n - 1)))
3614 n--;
ffc61ed2
JH
3615 }
3616 else {
3617 e = RExC_parse;
3618 n = 1;
3619 }
3620 if (!SIZE_ONLY) {
ab13f0c7
JH
3621 if (UCHARAT(RExC_parse) == '^') {
3622 RExC_parse++;
3623 n--;
3624 value = value == 'p' ? 'P' : 'p'; /* toggle */
3625 while (isSPACE(UCHARAT(RExC_parse))) {
3626 RExC_parse++;
3627 n--;
3628 }
3629 }
ffc61ed2 3630 if (value == 'p')
ab13f0c7
JH
3631 Perl_sv_catpvf(aTHX_ listsv,
3632 "+utf8::%.*s\n", (int)n, RExC_parse);
ffc61ed2 3633 else
ab13f0c7
JH
3634 Perl_sv_catpvf(aTHX_ listsv,
3635 "!utf8::%.*s\n", (int)n, RExC_parse);
ffc61ed2
JH
3636 }
3637 RExC_parse = e + 1;
3638 ANYOF_FLAGS(ret) |= ANYOF_UNICODE;
3639 continue;
b8c5462f
JH
3640 case 'n': value = '\n'; break;
3641 case 'r': value = '\r'; break;
3642 case 't': value = '\t'; break;
3643 case 'f': value = '\f'; break;
3644 case 'b': value = '\b'; break;
c7f1f016
NIS
3645 case 'e': value = ASCII_TO_NATIVE('\033');break;
3646 case 'a': value = ASCII_TO_NATIVE('\007');break;
b8c5462f 3647 case 'x':
ffc61ed2 3648 if (*RExC_parse == '{') {
a4c04bdc
NC
3649 I32 flags = PERL_SCAN_ALLOW_UNDERSCORES
3650 | PERL_SCAN_DISALLOW_PREFIX;
ffc61ed2 3651 e = strchr(RExC_parse++, '}');
b81d288d 3652 if (!e)
ffc61ed2 3653 vFAIL("Missing right brace on \\x{}");
53305cf1
NC
3654
3655 numlen = e - RExC_parse;
3656 value = grok_hex(RExC_parse, &numlen, &flags, NULL);
ffc61ed2
JH
3657 RExC_parse = e + 1;
3658 }
3659 else {
a4c04bdc 3660 I32 flags = PERL_SCAN_DISALLOW_PREFIX;
53305cf1
NC
3661 numlen = 2;
3662 value = grok_hex(RExC_parse, &numlen, &flags, NULL);
ffc61ed2
JH
3663 RExC_parse += numlen;
3664 }
b8c5462f
JH
3665 break;
3666 case 'c':
830247a4 3667 value = UCHARAT(RExC_parse++);
b8c5462f
JH
3668 value = toCTRL(value);
3669 break;
3670 case '0': case '1': case '2': case '3': case '4':
3671 case '5': case '6': case '7': case '8': case '9':
53305cf1
NC
3672 {
3673 I32 flags = 0;
3674 numlen = 3;
3675 value = grok_oct(--RExC_parse, &numlen, &flags, NULL);
830247a4 3676 RExC_parse += numlen;
b8c5462f 3677 break;
53305cf1 3678 }
1028017a 3679 default:
e476b1b5 3680 if (!SIZE_ONLY && ckWARN(WARN_REGEXP) && isALPHA(value))
ffc61ed2
JH
3681 vWARN2(RExC_parse,
3682 "Unrecognized escape \\%c in character class passed through",
3683 (int)value);
1028017a 3684 break;
b8c5462f 3685 }
ffc61ed2
JH
3686 } /* end of \blah */
3687
3688 if (namedclass > OOB_NAMEDCLASS) { /* this is a named class \blah */
3689
3690 if (!SIZE_ONLY && !need_class)
936ed897 3691 ANYOF_CLASS_ZERO(ret);
ffc61ed2 3692
936ed897 3693 need_class = 1;
ffc61ed2
JH
3694
3695 /* a bad range like a-\d, a-[:digit:] ? */
3696 if (range) {
73b437c8 3697 if (!SIZE_ONLY) {
e476b1b5 3698 if (ckWARN(WARN_REGEXP))
830247a4 3699 vWARN4(RExC_parse,
b45f050a 3700 "False [] range \"%*.*s\"",
830247a4
IZ
3701 RExC_parse - rangebegin,
3702 RExC_parse - rangebegin,
b45f050a 3703 rangebegin);
3568d838
JH
3704 if (prevvalue < 256) {
3705 ANYOF_BITMAP_SET(ret, prevvalue);
ffc61ed2
JH
3706 ANYOF_BITMAP_SET(ret, '-');
3707 }
3708 else {
3709 ANYOF_FLAGS(ret) |= ANYOF_UNICODE;
3710 Perl_sv_catpvf(aTHX_ listsv,
3568d838 3711 "%04"UVxf"\n%04"UVxf"\n", (UV)prevvalue, (UV) '-');
ffc61ed2 3712 }
b8c5462f 3713 }
ffc61ed2
JH
3714
3715 range = 0; /* this was not a true range */
73b437c8 3716 }
ffc61ed2 3717
73b437c8 3718 if (!SIZE_ONLY) {
3568d838
JH
3719 if (namedclass > OOB_NAMEDCLASS)
3720 optimize_invert = FALSE;
e2962f66
JH
3721 /* Possible truncation here but in some 64-bit environments
3722 * the compiler gets heartburn about switch on 64-bit values.
3723 * A similar issue a little earlier when switching on value.
98f323fa 3724 * --jhi */
e2962f66 3725 switch ((I32)namedclass) {
73b437c8
JH
3726 case ANYOF_ALNUM:
3727 if (LOC)
936ed897 3728 ANYOF_CLASS_SET(ret, ANYOF_ALNUM);
73b437c8
JH
3729 else {
3730 for (value = 0; value < 256; value++)
3731 if (isALNUM(value))
936ed897 3732 ANYOF_BITMAP_SET(ret, value);
73b437c8 3733 }
ffc61ed2 3734 Perl_sv_catpvf(aTHX_ listsv, "+utf8::IsWord\n");
73b437c8
JH
3735 break;
3736 case ANYOF_NALNUM:
3737 if (LOC)
936ed897 3738 ANYOF_CLASS_SET(ret, ANYOF_NALNUM);
73b437c8
JH
3739 else {
3740 for (value = 0; value < 256; value++)
3741 if (!isALNUM(value))
936ed897 3742 ANYOF_BITMAP_SET(ret, value);
73b437c8 3743 }
ffc61ed2 3744 Perl_sv_catpvf(aTHX_ listsv, "!utf8::IsWord\n");
73b437c8 3745 break;
ffc61ed2 3746 case ANYOF_ALNUMC:
73b437c8 3747 if (LOC)
ffc61ed2 3748 ANYOF_CLASS_SET(ret, ANYOF_ALNUMC);
73b437c8
JH
3749 else {
3750 for (value = 0; value < 256; value++)
ffc61ed2 3751 if (isALNUMC(value))
936ed897 3752 ANYOF_BITMAP_SET(ret, value);
73b437c8 3753 }
ffc61ed2 3754 Perl_sv_catpvf(aTHX_ listsv, "+utf8::IsAlnum\n");
73b437c8
JH
3755 break;
3756 case ANYOF_NALNUMC:
3757 if (LOC)
936ed897 3758 ANYOF_CLASS_SET(ret, ANYOF_NALNUMC);
73b437c8
JH
3759 else {
3760 for (value = 0; value < 256; value++)
3761 if (!isALNUMC(value))
936ed897 3762 ANYOF_BITMAP_SET(ret, value);
73b437c8 3763 }
ffc61ed2 3764 Perl_sv_catpvf(aTHX_ listsv, "!utf8::IsAlnum\n");
73b437c8
JH
3765 break;
3766 case ANYOF_ALPHA:
3767 if (LOC)
936ed897 3768 ANYOF_CLASS_SET(ret, ANYOF_ALPHA);
73b437c8
JH
3769 else {
3770 for (value = 0; value < 256; value++)
3771 if (isALPHA(value))
936ed897 3772 ANYOF_BITMAP_SET(ret, value);
73b437c8 3773 }
ffc61ed2 3774 Perl_sv_catpvf(aTHX_ listsv, "+utf8::IsAlpha\n");
73b437c8
JH
3775 break;
3776 case ANYOF_NALPHA:
3777 if (LOC)
936ed897 3778 ANYOF_CLASS_SET(ret, ANYOF_NALPHA);
73b437c8
JH
3779 else {
3780 for (value = 0; value < 256; value++)
3781 if (!isALPHA(value))
936ed897 3782 ANYOF_BITMAP_SET(ret, value);
73b437c8 3783 }
ffc61ed2 3784 Perl_sv_catpvf(aTHX_ listsv, "!utf8::IsAlpha\n");
73b437c8
JH
3785 break;
3786 case ANYOF_ASCII:
3787 if (LOC)
936ed897 3788 ANYOF_CLASS_SET(ret, ANYOF_ASCII);
73b437c8 3789 else {
c7f1f016 3790#ifndef EBCDIC
1ba5c669
JH
3791 for (value = 0; value < 128; value++)
3792 ANYOF_BITMAP_SET(ret, value);
3793#else /* EBCDIC */
ffbc6a93 3794 for (value = 0; value < 256; value++) {
3a3c4447
JH
3795 if (isASCII(value))
3796 ANYOF_BITMAP_SET(ret, value);
ffbc6a93 3797 }
1ba5c669 3798#endif /* EBCDIC */
73b437c8 3799 }
ffc61ed2 3800 Perl_sv_catpvf(aTHX_ listsv, "+utf8::IsASCII\n");
73b437c8
JH
3801 break;
3802 case ANYOF_NASCII:
3803 if (LOC)
936ed897 3804 ANYOF_CLASS_SET(ret, ANYOF_NASCII);
73b437c8 3805 else {
c7f1f016 3806#ifndef EBCDIC
1ba5c669
JH
3807 for (value = 128; value < 256; value++)
3808 ANYOF_BITMAP_SET(ret, value);
3809#else /* EBCDIC */
ffbc6a93 3810 for (value = 0; value < 256; value++) {
3a3c4447
JH
3811 if (!isASCII(value))
3812 ANYOF_BITMAP_SET(ret, value);
ffbc6a93 3813 }
1ba5c669 3814#endif /* EBCDIC */
73b437c8 3815 }
ffc61ed2 3816 Perl_sv_catpvf(aTHX_ listsv, "!utf8::IsASCII\n");
73b437c8 3817 break;
aaa51d5e
JF
3818 case ANYOF_BLANK:
3819 if (LOC)
3820 ANYOF_CLASS_SET(ret, ANYOF_BLANK);
3821 else {
3822 for (value = 0; value < 256; value++)
3823 if (isBLANK(value))
3824 ANYOF_BITMAP_SET(ret, value);
3825 }
ffc61ed2 3826 Perl_sv_catpvf(aTHX_ listsv, "+utf8::IsBlank\n");
aaa51d5e
JF
3827 break;
3828 case ANYOF_NBLANK:
3829 if (LOC)
3830 ANYOF_CLASS_SET(ret, ANYOF_NBLANK);
3831 else {
3832 for (value = 0; value < 256; value++)
3833 if (!isBLANK(value))
3834 ANYOF_BITMAP_SET(ret, value);
3835 }
ffc61ed2 3836 Perl_sv_catpvf(aTHX_ listsv, "!utf8::IsBlank\n");
aaa51d5e 3837 break;
73b437c8
JH
3838 case ANYOF_CNTRL:
3839 if (LOC)
936ed897 3840 ANYOF_CLASS_SET(ret, ANYOF_CNTRL);
73b437c8
JH
3841 else {
3842 for (value = 0; value < 256; value++)
3843 if (isCNTRL(value))
936ed897 3844 ANYOF_BITMAP_SET(ret, value);
73b437c8 3845 }
ffc61ed2 3846 Perl_sv_catpvf(aTHX_ listsv, "+utf8::IsCntrl\n");
73b437c8
JH
3847 break;
3848 case ANYOF_NCNTRL:
3849 if (LOC)
936ed897 3850 ANYOF_CLASS_SET(ret, ANYOF_NCNTRL);
73b437c8
JH
3851 else {
3852 for (value = 0; value < 256; value++)
3853 if (!isCNTRL(value))
936ed897 3854 ANYOF_BITMAP_SET(ret, value);
73b437c8 3855 }
ffc61ed2
JH
3856 Perl_sv_catpvf(aTHX_ listsv, "!utf8::IsCntrl\n");
3857 break;
3858 case ANYOF_DIGIT:
3859 if (LOC)
3860 ANYOF_CLASS_SET(ret, ANYOF_DIGIT);
3861 else {
3862 /* consecutive digits assumed */
3863 for (value = '0'; value <= '9'; value++)
3864 ANYOF_BITMAP_SET(ret, value);
3865 }
3866 Perl_sv_catpvf(aTHX_ listsv, "+utf8::IsDigit\n");
3867 break;
3868 case ANYOF_NDIGIT:
3869 if (LOC)
3870 ANYOF_CLASS_SET(ret, ANYOF_NDIGIT);
3871 else {
3872 /* consecutive digits assumed */
3873 for (value = 0; value < '0'; value++)
3874 ANYOF_BITMAP_SET(ret, value);
3875 for (value = '9' + 1; value < 256; value++)
3876 ANYOF_BITMAP_SET(ret, value);
3877 }
3878 Perl_sv_catpvf(aTHX_ listsv, "!utf8::IsDigit\n");
73b437c8
JH
3879 break;
3880 case ANYOF_GRAPH:
3881 if (LOC)
936ed897 3882 ANYOF_CLASS_SET(ret, ANYOF_GRAPH);
73b437c8
JH
3883 else {
3884 for (value = 0; value < 256; value++)
3885 if (isGRAPH(value))
936ed897 3886 ANYOF_BITMAP_SET(ret, value);
73b437c8 3887 }
ffc61ed2 3888 Perl_sv_catpvf(aTHX_ listsv, "+utf8::IsGraph\n");
73b437c8
JH
3889 break;
3890 case ANYOF_NGRAPH:
3891 if (LOC)
936ed897 3892 ANYOF_CLASS_SET(ret, ANYOF_NGRAPH);
73b437c8
JH
3893 else {
3894 for (value = 0; value < 256; value++)
3895 if (!isGRAPH(value))
936ed897 3896 ANYOF_BITMAP_SET(ret, value);
73b437c8 3897 }
ffc61ed2 3898 Perl_sv_catpvf(aTHX_ listsv, "!utf8::IsGraph\n");
73b437c8
JH
3899 break;
3900 case ANYOF_LOWER:
3901 if (LOC)
936ed897 3902 ANYOF_CLASS_SET(ret, ANYOF_LOWER);
73b437c8
JH
3903 else {
3904 for (value = 0; value < 256; value++)
3905 if (isLOWER(value))
936ed897 3906 ANYOF_BITMAP_SET(ret, value);
73b437c8 3907 }
ffc61ed2 3908 Perl_sv_catpvf(aTHX_ listsv, "+utf8::IsLower\n");
73b437c8
JH
3909 break;
3910 case ANYOF_NLOWER:
3911 if (LOC)
936ed897 3912 ANYOF_CLASS_SET(ret, ANYOF_NLOWER);
73b437c8
JH
3913 else {
3914 for (value = 0; value < 256; value++)
3915 if (!isLOWER(value))
936ed897 3916 ANYOF_BITMAP_SET(ret, value);
73b437c8 3917 }
ffc61ed2 3918 Perl_sv_catpvf(aTHX_ listsv, "!utf8::IsLower\n");
73b437c8
JH
3919 break;
3920 case ANYOF_PRINT:
3921 if (LOC)
936ed897 3922 ANYOF_CLASS_SET(ret, ANYOF_PRINT);
73b437c8
JH
3923 else {
3924 for (value = 0; value < 256; value++)
3925 if (isPRINT(value))
936ed897 3926 ANYOF_BITMAP_SET(ret, value);
73b437c8 3927 }
ffc61ed2 3928 Perl_sv_catpvf(aTHX_ listsv, "+utf8::IsPrint\n");
73b437c8
JH
3929 break;
3930 case ANYOF_NPRINT:
3931 if (LOC)
936ed897 3932 ANYOF_CLASS_SET(ret, ANYOF_NPRINT);
73b437c8
JH
3933 else {
3934 for (value = 0; value < 256; value++)
3935 if (!isPRINT(value))
936ed897 3936 ANYOF_BITMAP_SET(ret, value);
73b437c8 3937 }
ffc61ed2 3938 Perl_sv_catpvf(aTHX_ listsv, "!utf8::IsPrint\n");
73b437c8 3939 break;
aaa51d5e
JF
3940 case ANYOF_PSXSPC:
3941 if (LOC)
3942 ANYOF_CLASS_SET(ret, ANYOF_PSXSPC);
3943 else {
3944 for (value = 0; value < 256; value++)
3945 if (isPSXSPC(value))
3946 ANYOF_BITMAP_SET(ret, value);
3947 }
ffc61ed2 3948 Perl_sv_catpvf(aTHX_ listsv, "+utf8::IsSpace\n");
aaa51d5e
JF
3949 break;
3950 case ANYOF_NPSXSPC:
3951 if (LOC)
3952 ANYOF_CLASS_SET(ret, ANYOF_NPSXSPC);
3953 else {
3954 for (value = 0; value < 256; value++)
3955 if (!isPSXSPC(value))
3956 ANYOF_BITMAP_SET(ret, value);
3957 }
ffc61ed2 3958 Perl_sv_catpvf(aTHX_ listsv, "!utf8::IsSpace\n");
aaa51d5e 3959 break;
73b437c8
JH
3960 case ANYOF_PUNCT:
3961 if (LOC)
936ed897 3962 ANYOF_CLASS_SET(ret, ANYOF_PUNCT);
73b437c8
JH
3963 else {
3964 for (value = 0; value < 256; value++)
3965 if (isPUNCT(value))
936ed897 3966 ANYOF_BITMAP_SET(ret, value);
73b437c8 3967 }
ffc61ed2 3968 Perl_sv_catpvf(aTHX_ listsv, "+utf8::IsPunct\n");
73b437c8
JH
3969 break;
3970 case ANYOF_NPUNCT:
3971 if (LOC)
936ed897 3972 ANYOF_CLASS_SET(ret, ANYOF_NPUNCT);
73b437c8
JH
3973 else {
3974 for (value = 0; value < 256; value++)
3975 if (!isPUNCT(value))
936ed897 3976 ANYOF_BITMAP_SET(ret, value);
73b437c8 3977 }
ffc61ed2
JH
3978 Perl_sv_catpvf(aTHX_ listsv, "!utf8::IsPunct\n");
3979 break;
3980 case ANYOF_SPACE:
3981 if (LOC)
3982 ANYOF_CLASS_SET(ret, ANYOF_SPACE);
3983 else {
3984 for (value = 0; value < 256; value++)
3985 if (isSPACE(value))
3986 ANYOF_BITMAP_SET(ret, value);
3987 }
3988 Perl_sv_catpvf(aTHX_ listsv, "+utf8::IsSpacePerl\n");
3989 break;
3990 case ANYOF_NSPACE:
3991 if (LOC)
3992 ANYOF_CLASS_SET(ret, ANYOF_NSPACE);
3993 else {
3994 for (value = 0; value < 256; value++)
3995 if (!isSPACE(value))
3996 ANYOF_BITMAP_SET(ret, value);
3997 }
3998 Perl_sv_catpvf(aTHX_ listsv, "!utf8::IsSpacePerl\n");
73b437c8
JH
3999 break;
4000 case ANYOF_UPPER:
4001 if (LOC)
936ed897 4002 ANYOF_CLASS_SET(ret, ANYOF_UPPER);
73b437c8
JH
4003 else {
4004 for (value = 0; value < 256; value++)
4005 if (isUPPER(value))
936ed897 4006 ANYOF_BITMAP_SET(ret, value);
73b437c8 4007 }
ffc61ed2 4008 Perl_sv_catpvf(aTHX_ listsv, "+utf8::IsUpper\n");
73b437c8
JH
4009 break;
4010 case ANYOF_NUPPER:
4011 if (LOC)
936ed897 4012 ANYOF_CLASS_SET(ret, ANYOF_NUPPER);
73b437c8
JH
4013 else {
4014 for (value = 0; value < 256; value++)
4015 if (!isUPPER(value))
936ed897 4016 ANYOF_BITMAP_SET(ret, value);
73b437c8 4017 }
ffc61ed2 4018 Perl_sv_catpvf(aTHX_ listsv, "!utf8::IsUpper\n");
73b437c8
JH
4019 break;
4020 case ANYOF_XDIGIT:
4021 if (LOC)
936ed897 4022 ANYOF_CLASS_SET(ret, ANYOF_XDIGIT);
73b437c8
JH
4023 else {
4024 for (value = 0; value < 256; value++)
4025 if (isXDIGIT(value))
936ed897 4026 ANYOF_BITMAP_SET(ret, value);
73b437c8 4027 }
ffc61ed2 4028 Perl_sv_catpvf(aTHX_ listsv, "+utf8::IsXDigit\n");
73b437c8
JH
4029 break;
4030 case ANYOF_NXDIGIT:
4031 if (LOC)
936ed897 4032 ANYOF_CLASS_SET(ret, ANYOF_NXDIGIT);
73b437c8
JH
4033 else {
4034 for (value = 0; value < 256; value++)
4035 if (!isXDIGIT(value))
936ed897 4036 ANYOF_BITMAP_SET(ret, value);
73b437c8 4037 }
ffc61ed2 4038 Perl_sv_catpvf(aTHX_ listsv, "!utf8::IsXDigit\n");
73b437c8
JH
4039 break;
4040 default:
b45f050a 4041 vFAIL("Invalid [::] class");
73b437c8 4042 break;
b8c5462f 4043 }
b8c5462f 4044 if (LOC)
936ed897 4045 ANYOF_FLAGS(ret) |= ANYOF_CLASS;
73b437c8 4046 continue;
a0d0e21e 4047 }
ffc61ed2
JH
4048 } /* end of namedclass \blah */
4049
a0d0e21e 4050 if (range) {
3a3c4447 4051 if (prevvalue > value) /* b-a */ {
b45f050a 4052 Simple_vFAIL4("Invalid [] range \"%*.*s\"",
830247a4
IZ
4053 RExC_parse - rangebegin,
4054 RExC_parse - rangebegin,
b45f050a 4055 rangebegin);
3568d838 4056 range = 0; /* not a valid range */
73b437c8 4057 }
a0d0e21e
LW
4058 }
4059 else {
3568d838 4060 prevvalue = value; /* save the beginning of the range */
830247a4
IZ
4061 if (*RExC_parse == '-' && RExC_parse+1 < RExC_end &&
4062 RExC_parse[1] != ']') {
4063 RExC_parse++;
ffc61ed2
JH
4064
4065 /* a bad range like \w-, [:word:]- ? */
4066 if (namedclass > OOB_NAMEDCLASS) {
e476b1b5 4067 if (ckWARN(WARN_REGEXP))
830247a4 4068 vWARN4(RExC_parse,
b45f050a 4069 "False [] range \"%*.*s\"",
830247a4
IZ
4070 RExC_parse - rangebegin,
4071 RExC_parse - rangebegin,
b45f050a 4072 rangebegin);
73b437c8 4073 if (!SIZE_ONLY)
936ed897 4074 ANYOF_BITMAP_SET(ret, '-');
73b437c8 4075 } else
ffc61ed2
JH
4076 range = 1; /* yeah, it's a range! */
4077 continue; /* but do it the next time */
a0d0e21e 4078 }
a687059c 4079 }
ffc61ed2 4080
93733859 4081 /* now is the next time */
ae5c130c 4082 if (!SIZE_ONLY) {
3568d838
JH
4083 IV i;
4084
4085 if (prevvalue < 256) {
4086 IV ceilvalue = value < 256 ? value : 255;
4087
4088#ifdef EBCDIC
3a3c4447
JH
4089 if ((isLOWER(prevvalue) && isLOWER(ceilvalue)) ||
4090 (isUPPER(prevvalue) && isUPPER(ceilvalue)))
ffc61ed2 4091 {
3568d838
JH
4092 if (isLOWER(prevvalue)) {
4093 for (i = prevvalue; i <= ceilvalue; i++)
ffc61ed2
JH
4094 if (isLOWER(i))
4095 ANYOF_BITMAP_SET(ret, i);
4096 } else {
3568d838 4097 for (i = prevvalue; i <= ceilvalue; i++)
ffc61ed2
JH
4098 if (isUPPER(i))
4099 ANYOF_BITMAP_SET(ret, i);
4100 }
8ada0baa 4101 }
ffc61ed2 4102 else
8ada0baa 4103#endif
a5961de5
JH
4104 for (i = prevvalue; i <= ceilvalue; i++)
4105 ANYOF_BITMAP_SET(ret, i);
3568d838 4106 }
a5961de5 4107 if (value > 255 || UTF) {
b08decb7
JH
4108 UV prevnatvalue = NATIVE_TO_UNI(prevvalue);
4109 UV natvalue = NATIVE_TO_UNI(value);
4110
ffc61ed2 4111 ANYOF_FLAGS(ret) |= ANYOF_UNICODE;
b08decb7 4112 if (prevnatvalue < natvalue) { /* what about > ? */
ffc61ed2 4113 Perl_sv_catpvf(aTHX_ listsv, "%04"UVxf"\t%04"UVxf"\n",
b08decb7
JH
4114 prevnatvalue, natvalue);
4115 }
4116 else if (prevnatvalue == natvalue) {
4117 Perl_sv_catpvf(aTHX_ listsv, "%04"UVxf"\n", natvalue);
09091399 4118 if (FOLD) {
254ba52a
JH
4119 U8 foldbuf[UTF8_MAXLEN_FOLD+1];
4120 STRLEN foldlen;
2f3bf011 4121 UV f = to_uni_fold(natvalue, foldbuf, &foldlen);
254ba52a 4122
c840d2a2
JH
4123 /* If folding and foldable and a single
4124 * character, insert also the folded version
4125 * to the charclass. */
9e55ce06
JH
4126 if (f != value) {
4127 if (foldlen == UNISKIP(f))
4128 Perl_sv_catpvf(aTHX_ listsv,
4129 "%04"UVxf"\n", f);
4130 else {
4131 /* Any multicharacter foldings
4132 * require the following transform:
4133 * [ABCDEF] -> (?:[ABCabcDEFd]|pq|rst)
4134 * where E folds into "pq" and F folds
4135 * into "rst", all other characters
4136 * fold to single characters. We save
4137 * away these multicharacter foldings,
4138 * to be later saved as part of the
4139 * additional "s" data. */
4140 SV *sv;
4141
4142 if (!unicode_alternate)
4143 unicode_alternate = newAV();
4144 sv = newSVpvn((char*)foldbuf, foldlen);
4145 SvUTF8_on(sv);
4146 av_push(unicode_alternate, sv);
4147 }
4148 }
254ba52a 4149
60a8b682
JH
4150 /* If folding and the value is one of the Greek
4151 * sigmas insert a few more sigmas to make the
4152 * folding rules of the sigmas to work right.
4153 * Note that not all the possible combinations
4154 * are handled here: some of them are handled
9e55ce06
JH
4155 * by the standard folding rules, and some of
4156 * them (literal or EXACTF cases) are handled
4157 * during runtime in regexec.c:S_find_byclass(). */
09091399
JH
4158 if (value == UNICODE_GREEK_SMALL_LETTER_FINAL_SIGMA) {
4159 Perl_sv_catpvf(aTHX_ listsv, "%04"UVxf"\n",
eb050b28 4160 (UV)UNICODE_GREEK_CAPITAL_LETTER_SIGMA);
09091399 4161 Perl_sv_catpvf(aTHX_ listsv, "%04"UVxf"\n",
eb050b28 4162 (UV)UNICODE_GREEK_SMALL_LETTER_SIGMA);
09091399
JH
4163 }
4164 else if (value == UNICODE_GREEK_CAPITAL_LETTER_SIGMA)
4165 Perl_sv_catpvf(aTHX_ listsv, "%04"UVxf"\n",
eb050b28 4166 (UV)UNICODE_GREEK_SMALL_LETTER_SIGMA);
09091399
JH
4167 }
4168 }
ffc61ed2 4169 }
8ada0baa 4170 }
ffc61ed2
JH
4171
4172 range = 0; /* this range (if it was one) is done now */
a0d0e21e 4173 }
ffc61ed2 4174
936ed897 4175 if (need_class) {
4f66b38d 4176 ANYOF_FLAGS(ret) |= ANYOF_LARGE;
936ed897 4177 if (SIZE_ONLY)
830247a4 4178 RExC_size += ANYOF_CLASS_ADD_SKIP;
936ed897 4179 else
830247a4 4180 RExC_emit += ANYOF_CLASS_ADD_SKIP;
936ed897 4181 }
ffc61ed2 4182
ae5c130c 4183 /* optimize case-insensitive simple patterns (e.g. /[a-z]/i) */
b8c5462f 4184 if (!SIZE_ONLY &&
ffc61ed2 4185 /* If the only flag is folding (plus possibly inversion). */
516a5887
JH
4186 ((ANYOF_FLAGS(ret) & (ANYOF_FLAGS_ALL ^ ANYOF_INVERT)) == ANYOF_FOLD)
4187 ) {
a0ed51b3 4188 for (value = 0; value < 256; ++value) {
936ed897 4189 if (ANYOF_BITMAP_TEST(ret, value)) {
ffc61ed2
JH
4190 IV fold = PL_fold[value];
4191
4192 if (fold != value)
4193 ANYOF_BITMAP_SET(ret, fold);
ae5c130c
GS
4194 }
4195 }
936ed897 4196 ANYOF_FLAGS(ret) &= ~ANYOF_FOLD;
ae5c130c 4197 }
ffc61ed2 4198
ae5c130c 4199 /* optimize inverted simple patterns (e.g. [^a-z]) */
3568d838 4200 if (!SIZE_ONLY && optimize_invert &&
ffc61ed2
JH
4201 /* If the only flag is inversion. */
4202 (ANYOF_FLAGS(ret) & ANYOF_FLAGS_ALL) == ANYOF_INVERT) {
b8c5462f 4203 for (value = 0; value < ANYOF_BITMAP_SIZE; ++value)
936ed897 4204 ANYOF_BITMAP(ret)[value] ^= ANYOF_FLAGS_ALL;
1aa99e6b 4205 ANYOF_FLAGS(ret) = ANYOF_UNICODE_ALL;
ae5c130c 4206 }
a0d0e21e 4207
b81d288d 4208 if (!SIZE_ONLY) {
fde631ed 4209 AV *av = newAV();
ffc61ed2
JH
4210 SV *rv;
4211
9e55ce06
JH
4212 /* The 0th element stores the character class description
4213 * in its textual form: used later (regexec.c:Perl_regclass_swatch())
4214 * to initialize the appropriate swash (which gets stored in
4215 * the 1st element), and also useful for dumping the regnode.
4216 * The 2nd element stores the multicharacter foldings,
4217 * used later (regexec.c:s_reginclasslen()). */
ffc61ed2
JH
4218 av_store(av, 0, listsv);
4219 av_store(av, 1, NULL);
9e55ce06 4220 av_store(av, 2, (SV*)unicode_alternate);
ffc61ed2 4221 rv = newRV_noinc((SV*)av);
19860706 4222 n = add_data(pRExC_state, 1, "s");
830247a4 4223 RExC_rx->data->data[n] = (void*)rv;
ffc61ed2 4224 ARG_SET(ret, n);
a0ed51b3
LW
4225 }
4226
4227 return ret;
4228}
4229
76e3520e 4230STATIC char*
830247a4 4231S_nextchar(pTHX_ RExC_state_t *pRExC_state)
a0d0e21e 4232{
830247a4 4233 char* retval = RExC_parse++;
a0d0e21e 4234
4633a7c4 4235 for (;;) {
830247a4
IZ
4236 if (*RExC_parse == '(' && RExC_parse[1] == '?' &&
4237 RExC_parse[2] == '#') {
4238 while (*RExC_parse && *RExC_parse != ')')
4239 RExC_parse++;
4240 RExC_parse++;
4633a7c4
LW
4241 continue;
4242 }
830247a4
IZ
4243 if (RExC_flags16 & PMf_EXTENDED) {
4244 if (isSPACE(*RExC_parse)) {
4245 RExC_parse++;
748a9306
LW
4246 continue;
4247 }
830247a4
IZ
4248 else if (*RExC_parse == '#') {
4249 while (*RExC_parse && *RExC_parse != '\n')
4250 RExC_parse++;
4251 RExC_parse++;
748a9306
LW
4252 continue;
4253 }
748a9306 4254 }
4633a7c4 4255 return retval;
a0d0e21e 4256 }
a687059c
LW
4257}
4258
4259/*
c277df42 4260- reg_node - emit a node
a0d0e21e 4261*/
76e3520e 4262STATIC regnode * /* Location. */
830247a4 4263S_reg_node(pTHX_ RExC_state_t *pRExC_state, U8 op)
a687059c 4264{
c277df42
IZ
4265 register regnode *ret;
4266 register regnode *ptr;
a687059c 4267
830247a4 4268 ret = RExC_emit;
c277df42 4269 if (SIZE_ONLY) {
830247a4
IZ
4270 SIZE_ALIGN(RExC_size);
4271 RExC_size += 1;
a0d0e21e
LW
4272 return(ret);
4273 }
a687059c 4274
c277df42 4275 NODE_ALIGN_FILL(ret);
a0d0e21e 4276 ptr = ret;
c277df42 4277 FILL_ADVANCE_NODE(ptr, op);
fac92740
MJD
4278 if (RExC_offsets) { /* MJD */
4279 MJD_OFFSET_DEBUG((stderr, "%s:%u: (op %s) %s %u <- %u (len %u) (max %u).\n",
4280 "reg_node", __LINE__,
4281 reg_name[op],
4282 RExC_emit - RExC_emit_start > RExC_offsets[0]
4283 ? "Overwriting end of array!\n" : "OK",
4284 RExC_emit - RExC_emit_start,
4285 RExC_parse - RExC_start,
4286 RExC_offsets[0]));
4287 Set_Node_Offset(RExC_emit, RExC_parse + (op == END));
4288 }
4289
830247a4 4290 RExC_emit = ptr;
a687059c 4291
a0d0e21e 4292 return(ret);
a687059c
LW
4293}
4294
4295/*
a0d0e21e
LW
4296- reganode - emit a node with an argument
4297*/
76e3520e 4298STATIC regnode * /* Location. */
830247a4 4299S_reganode(pTHX_ RExC_state_t *pRExC_state, U8 op, U32 arg)
fe14fcc3 4300{
c277df42
IZ
4301 register regnode *ret;
4302 register regnode *ptr;
fe14fcc3 4303
830247a4 4304 ret = RExC_emit;
c277df42 4305 if (SIZE_ONLY) {
830247a4
IZ
4306 SIZE_ALIGN(RExC_size);
4307 RExC_size += 2;
a0d0e21e
LW
4308 return(ret);
4309 }
fe14fcc3 4310
c277df42 4311 NODE_ALIGN_FILL(ret);
a0d0e21e 4312 ptr = ret;
c277df42 4313 FILL_ADVANCE_NODE_ARG(ptr, op, arg);
fac92740
MJD
4314 if (RExC_offsets) { /* MJD */
4315 MJD_OFFSET_DEBUG((stderr, "%s: %s %u <- %u (max %u).\n",
4316 "reganode",
4317 RExC_emit - RExC_emit_start > RExC_offsets[0] ?
4318 "Overwriting end of array!\n" : "OK",
4319 RExC_emit - RExC_emit_start,
4320 RExC_parse - RExC_start,
4321 RExC_offsets[0]));
4322 Set_Cur_Node_Offset;
4323 }
4324
830247a4 4325 RExC_emit = ptr;
fe14fcc3 4326
a0d0e21e 4327 return(ret);
fe14fcc3
LW
4328}
4329
4330/*
cd439c50 4331- reguni - emit (if appropriate) a Unicode character
a0ed51b3
LW
4332*/
4333STATIC void
830247a4 4334S_reguni(pTHX_ RExC_state_t *pRExC_state, UV uv, char* s, STRLEN* lenp)
a0ed51b3 4335{
5e12f4fb 4336 *lenp = SIZE_ONLY ? UNISKIP(uv) : (uvchr_to_utf8((U8*)s, uv) - (U8*)s);
a0ed51b3
LW
4337}
4338
4339/*
a0d0e21e
LW
4340- reginsert - insert an operator in front of already-emitted operand
4341*
4342* Means relocating the operand.
4343*/
76e3520e 4344STATIC void
830247a4 4345S_reginsert(pTHX_ RExC_state_t *pRExC_state, U8 op, regnode *opnd)
a687059c 4346{
c277df42
IZ
4347 register regnode *src;
4348 register regnode *dst;
4349 register regnode *place;
4350 register int offset = regarglen[(U8)op];
b81d288d 4351
22c35a8c 4352/* (PL_regkind[(U8)op] == CURLY ? EXTRA_STEP_2ARGS : 0); */
c277df42
IZ
4353
4354 if (SIZE_ONLY) {
830247a4 4355 RExC_size += NODE_STEP_REGNODE + offset;
a0d0e21e
LW
4356 return;
4357 }
a687059c 4358
830247a4
IZ
4359 src = RExC_emit;
4360 RExC_emit += NODE_STEP_REGNODE + offset;
4361 dst = RExC_emit;
fac92740 4362 while (src > opnd) {
c277df42 4363 StructCopy(--src, --dst, regnode);
fac92740
MJD
4364 if (RExC_offsets) { /* MJD 20010112 */
4365 MJD_OFFSET_DEBUG((stderr, "%s: %s copy %u -> %u (max %u).\n",
4366 "reg_insert",
4367 dst - RExC_emit_start > RExC_offsets[0]
4368 ? "Overwriting end of array!\n" : "OK",
4369 src - RExC_emit_start,
4370 dst - RExC_emit_start,
4371 RExC_offsets[0]));
4372 Set_Node_Offset_To_R(dst-RExC_emit_start, Node_Offset(src));
4373 Set_Node_Length_To_R(dst-RExC_emit_start, Node_Length(src));
4374 }
4375 }
4376
a0d0e21e
LW
4377
4378 place = opnd; /* Op node, where operand used to be. */
fac92740
MJD
4379 if (RExC_offsets) { /* MJD */
4380 MJD_OFFSET_DEBUG((stderr, "%s: %s %u <- %u (max %u).\n",
4381 "reginsert",
4382 place - RExC_emit_start > RExC_offsets[0]
4383 ? "Overwriting end of array!\n" : "OK",
4384 place - RExC_emit_start,
4385 RExC_parse - RExC_start,
4386 RExC_offsets[0]));
4387 Set_Node_Offset(place, RExC_parse);
4388 }
c277df42
IZ
4389 src = NEXTOPER(place);
4390 FILL_ADVANCE_NODE(place, op);
4391 Zero(src, offset, regnode);
a687059c
LW
4392}
4393
4394/*
c277df42 4395- regtail - set the next-pointer at the end of a node chain of p to val.
a0d0e21e 4396*/
76e3520e 4397STATIC void
830247a4 4398S_regtail(pTHX_ RExC_state_t *pRExC_state, regnode *p, regnode *val)
a687059c 4399{
c277df42
IZ
4400 register regnode *scan;
4401 register regnode *temp;
a0d0e21e 4402
c277df42 4403 if (SIZE_ONLY)
a0d0e21e
LW
4404 return;
4405
4406 /* Find last node. */
4407 scan = p;
4408 for (;;) {
4409 temp = regnext(scan);
4410 if (temp == NULL)
4411 break;
4412 scan = temp;
4413 }
a687059c 4414
c277df42
IZ
4415 if (reg_off_by_arg[OP(scan)]) {
4416 ARG_SET(scan, val - scan);
a0ed51b3
LW
4417 }
4418 else {
c277df42
IZ
4419 NEXT_OFF(scan) = val - scan;
4420 }
a687059c
LW
4421}
4422
4423/*
a0d0e21e
LW
4424- regoptail - regtail on operand of first argument; nop if operandless
4425*/
76e3520e 4426STATIC void
830247a4 4427S_regoptail(pTHX_ RExC_state_t *pRExC_state, regnode *p, regnode *val)
a687059c 4428{
a0d0e21e 4429 /* "Operandless" and "op != BRANCH" are synonymous in practice. */
c277df42
IZ
4430 if (p == NULL || SIZE_ONLY)
4431 return;
22c35a8c 4432 if (PL_regkind[(U8)OP(p)] == BRANCH) {
830247a4 4433 regtail(pRExC_state, NEXTOPER(p), val);
a0ed51b3 4434 }
22c35a8c 4435 else if ( PL_regkind[(U8)OP(p)] == BRANCHJ) {
830247a4 4436 regtail(pRExC_state, NEXTOPER(NEXTOPER(p)), val);
a0ed51b3
LW
4437 }
4438 else
a0d0e21e 4439 return;
a687059c
LW
4440}
4441
4442/*
4443 - regcurly - a little FSA that accepts {\d+,?\d*}
4444 */
79072805 4445STATIC I32
cea2e8a9 4446S_regcurly(pTHX_ register char *s)
a687059c
LW
4447{
4448 if (*s++ != '{')
4449 return FALSE;
f0fcb552 4450 if (!isDIGIT(*s))
a687059c 4451 return FALSE;
f0fcb552 4452 while (isDIGIT(*s))
a687059c
LW
4453 s++;
4454 if (*s == ',')
4455 s++;
f0fcb552 4456 while (isDIGIT(*s))
a687059c
LW
4457 s++;
4458 if (*s != '}')
4459 return FALSE;
4460 return TRUE;
4461}
4462
a687059c 4463
8fa7f367
JH
4464#ifdef DEBUGGING
4465
76e3520e 4466STATIC regnode *
cea2e8a9 4467S_dumpuntil(pTHX_ regnode *start, regnode *node, regnode *last, SV* sv, I32 l)
c277df42 4468{
f248d071 4469 register U8 op = EXACT; /* Arbitrary non-END op. */
155aba94 4470 register regnode *next;
c277df42
IZ
4471
4472 while (op != END && (!last || node < last)) {
4473 /* While that wasn't END last time... */
4474
4475 NODE_ALIGN(node);
4476 op = OP(node);
4477 if (op == CLOSE)
4478 l--;
4479 next = regnext(node);
4480 /* Where, what. */
4481 if (OP(node) == OPTIMIZED)
4482 goto after_print;
4483 regprop(sv, node);
b900a521 4484 PerlIO_printf(Perl_debug_log, "%4"IVdf":%*s%s", (IV)(node - start),
f1dbda3d 4485 (int)(2*l + 1), "", SvPVX(sv));
c277df42
IZ
4486 if (next == NULL) /* Next ptr. */
4487 PerlIO_printf(Perl_debug_log, "(0)");
b81d288d 4488 else
b900a521 4489 PerlIO_printf(Perl_debug_log, "(%"IVdf")", (IV)(next - start));
c277df42
IZ
4490 (void)PerlIO_putc(Perl_debug_log, '\n');
4491 after_print:
22c35a8c 4492 if (PL_regkind[(U8)op] == BRANCHJ) {
b81d288d
AB
4493 register regnode *nnode = (OP(next) == LONGJMP
4494 ? regnext(next)
c277df42
IZ
4495 : next);
4496 if (last && nnode > last)
4497 nnode = last;
4498 node = dumpuntil(start, NEXTOPER(NEXTOPER(node)), nnode, sv, l + 1);
a0ed51b3 4499 }
22c35a8c 4500 else if (PL_regkind[(U8)op] == BRANCH) {
c277df42 4501 node = dumpuntil(start, NEXTOPER(node), next, sv, l + 1);
a0ed51b3
LW
4502 }
4503 else if ( op == CURLY) { /* `next' might be very big: optimizer */
c277df42
IZ
4504 node = dumpuntil(start, NEXTOPER(node) + EXTRA_STEP_2ARGS,
4505 NEXTOPER(node) + EXTRA_STEP_2ARGS + 1, sv, l + 1);
a0ed51b3 4506 }
22c35a8c 4507 else if (PL_regkind[(U8)op] == CURLY && op != CURLYX) {
c277df42
IZ
4508 node = dumpuntil(start, NEXTOPER(node) + EXTRA_STEP_2ARGS,
4509 next, sv, l + 1);
a0ed51b3
LW
4510 }
4511 else if ( op == PLUS || op == STAR) {
c277df42 4512 node = dumpuntil(start, NEXTOPER(node), NEXTOPER(node) + 1, sv, l + 1);
a0ed51b3
LW
4513 }
4514 else if (op == ANYOF) {
4f66b38d
HS
4515 /* arglen 1 + class block */
4516 node += 1 + ((ANYOF_FLAGS(node) & ANYOF_LARGE)
4517 ? ANYOF_CLASS_SKIP : ANYOF_SKIP);
4518 node = NEXTOPER(node);
a0ed51b3 4519 }
22c35a8c 4520 else if (PL_regkind[(U8)op] == EXACT) {
c277df42 4521 /* Literal string, where present. */
cd439c50 4522 node += NODE_SZ_STR(node) - 1;
c277df42 4523 node = NEXTOPER(node);
a0ed51b3
LW
4524 }
4525 else {
c277df42
IZ
4526 node = NEXTOPER(node);
4527 node += regarglen[(U8)op];
4528 }
4529 if (op == CURLYX || op == OPEN)
4530 l++;
4531 else if (op == WHILEM)
4532 l--;
4533 }
4534 return node;
4535}
4536
8fa7f367
JH
4537#endif /* DEBUGGING */
4538
a687059c 4539/*
fd181c75 4540 - regdump - dump a regexp onto Perl_debug_log in vaguely comprehensible form
a687059c
LW
4541 */
4542void
864dbfa3 4543Perl_regdump(pTHX_ regexp *r)
a687059c 4544{
35ff7856 4545#ifdef DEBUGGING
46fc3d4c 4546 SV *sv = sv_newmortal();
a687059c 4547
c277df42 4548 (void)dumpuntil(r->program, r->program + 1, NULL, sv, 0);
a0d0e21e
LW
4549
4550 /* Header fields of interest. */
c277df42 4551 if (r->anchored_substr)
7b0972df 4552 PerlIO_printf(Perl_debug_log,
b81d288d 4553 "anchored `%s%.*s%s'%s at %"IVdf" ",
3280af22 4554 PL_colors[0],
7b0972df 4555 (int)(SvCUR(r->anchored_substr) - (SvTAIL(r->anchored_substr)!=0)),
b81d288d 4556 SvPVX(r->anchored_substr),
3280af22 4557 PL_colors[1],
c277df42 4558 SvTAIL(r->anchored_substr) ? "$" : "",
7b0972df 4559 (IV)r->anchored_offset);
33b8afdf
JH
4560 else if (r->anchored_utf8)
4561 PerlIO_printf(Perl_debug_log,
4562 "anchored utf8 `%s%.*s%s'%s at %"IVdf" ",
4563 PL_colors[0],
4564 (int)(SvCUR(r->anchored_utf8) - (SvTAIL(r->anchored_utf8)!=0)),
4565 SvPVX(r->anchored_utf8),
4566 PL_colors[1],
4567 SvTAIL(r->anchored_utf8) ? "$" : "",
4568 (IV)r->anchored_offset);
c277df42 4569 if (r->float_substr)
7b0972df 4570 PerlIO_printf(Perl_debug_log,
b81d288d 4571 "floating `%s%.*s%s'%s at %"IVdf"..%"UVuf" ",
3280af22 4572 PL_colors[0],
b81d288d 4573 (int)(SvCUR(r->float_substr) - (SvTAIL(r->float_substr)!=0)),
2c2d71f5 4574 SvPVX(r->float_substr),
3280af22 4575 PL_colors[1],
c277df42 4576 SvTAIL(r->float_substr) ? "$" : "",
7b0972df 4577 (IV)r->float_min_offset, (UV)r->float_max_offset);
33b8afdf
JH
4578 else if (r->float_utf8)
4579 PerlIO_printf(Perl_debug_log,
4580 "floating utf8 `%s%.*s%s'%s at %"IVdf"..%"UVuf" ",
4581 PL_colors[0],
4582 (int)(SvCUR(r->float_utf8) - (SvTAIL(r->float_utf8)!=0)),
4583 SvPVX(r->float_utf8),
4584 PL_colors[1],
4585 SvTAIL(r->float_utf8) ? "$" : "",
4586 (IV)r->float_min_offset, (UV)r->float_max_offset);
4587 if (r->check_substr || r->check_utf8)
b81d288d
AB
4588 PerlIO_printf(Perl_debug_log,
4589 r->check_substr == r->float_substr
33b8afdf 4590 && r->check_utf8 == r->float_utf8
c277df42
IZ
4591 ? "(checking floating" : "(checking anchored");
4592 if (r->reganch & ROPT_NOSCAN)
4593 PerlIO_printf(Perl_debug_log, " noscan");
4594 if (r->reganch & ROPT_CHECK_ALL)
4595 PerlIO_printf(Perl_debug_log, " isall");
33b8afdf 4596 if (r->check_substr || r->check_utf8)
c277df42
IZ
4597 PerlIO_printf(Perl_debug_log, ") ");
4598
46fc3d4c 4599 if (r->regstclass) {
4600 regprop(sv, r->regstclass);
4601 PerlIO_printf(Perl_debug_log, "stclass `%s' ", SvPVX(sv));
4602 }
774d564b 4603 if (r->reganch & ROPT_ANCH) {
4604 PerlIO_printf(Perl_debug_log, "anchored");
4605 if (r->reganch & ROPT_ANCH_BOL)
4606 PerlIO_printf(Perl_debug_log, "(BOL)");
c277df42
IZ
4607 if (r->reganch & ROPT_ANCH_MBOL)
4608 PerlIO_printf(Perl_debug_log, "(MBOL)");
cad2e5aa
JH
4609 if (r->reganch & ROPT_ANCH_SBOL)
4610 PerlIO_printf(Perl_debug_log, "(SBOL)");
774d564b 4611 if (r->reganch & ROPT_ANCH_GPOS)
4612 PerlIO_printf(Perl_debug_log, "(GPOS)");
4613 PerlIO_putc(Perl_debug_log, ' ');
4614 }
c277df42
IZ
4615 if (r->reganch & ROPT_GPOS_SEEN)
4616 PerlIO_printf(Perl_debug_log, "GPOS ");
a0d0e21e 4617 if (r->reganch & ROPT_SKIP)
760ac839 4618 PerlIO_printf(Perl_debug_log, "plus ");
a0d0e21e 4619 if (r->reganch & ROPT_IMPLICIT)
760ac839 4620 PerlIO_printf(Perl_debug_log, "implicit ");
760ac839 4621 PerlIO_printf(Perl_debug_log, "minlen %ld ", (long) r->minlen);
ce862d02
IZ
4622 if (r->reganch & ROPT_EVAL_SEEN)
4623 PerlIO_printf(Perl_debug_log, "with eval ");
760ac839 4624 PerlIO_printf(Perl_debug_log, "\n");
fac92740
MJD
4625 if (r->offsets) {
4626 U32 i;
4627 U32 len = r->offsets[0];
392fbf5d 4628 PerlIO_printf(Perl_debug_log, "Offsets: [%"UVuf"]\n\t", (UV)r->offsets[0]);
fac92740 4629 for (i = 1; i <= len; i++)
392fbf5d
RB
4630 PerlIO_printf(Perl_debug_log, "%"UVuf"[%"UVuf"] ",
4631 (UV)r->offsets[i*2-1],
4632 (UV)r->offsets[i*2]);
fac92740
MJD
4633 PerlIO_printf(Perl_debug_log, "\n");
4634 }
17c3b450 4635#endif /* DEBUGGING */
a687059c
LW
4636}
4637
8fa7f367
JH
4638#ifdef DEBUGGING
4639
653099ff
GS
4640STATIC void
4641S_put_byte(pTHX_ SV *sv, int c)
4642{
7be5a6cf 4643 if (isCNTRL(c) || c == 255 || !isPRINT(c))
653099ff
GS
4644 Perl_sv_catpvf(aTHX_ sv, "\\%o", c);
4645 else if (c == '-' || c == ']' || c == '\\' || c == '^')
4646 Perl_sv_catpvf(aTHX_ sv, "\\%c", c);
4647 else
4648 Perl_sv_catpvf(aTHX_ sv, "%c", c);
4649}
4650
8fa7f367
JH
4651#endif /* DEBUGGING */
4652
a687059c 4653/*
a0d0e21e
LW
4654- regprop - printable representation of opcode
4655*/
46fc3d4c 4656void
864dbfa3 4657Perl_regprop(pTHX_ SV *sv, regnode *o)
a687059c 4658{
35ff7856 4659#ifdef DEBUGGING
9b155405 4660 register int k;
a0d0e21e 4661
54dc92de 4662 sv_setpvn(sv, "", 0);
9b155405 4663 if (OP(o) >= reg_num) /* regnode.type is unsigned */
830247a4
IZ
4664 /* It would be nice to FAIL() here, but this may be called from
4665 regexec.c, and it would be hard to supply pRExC_state. */
4666 Perl_croak(aTHX_ "Corrupted regexp opcode");
9b155405
IZ
4667 sv_catpv(sv, (char*)reg_name[OP(o)]); /* Take off const! */
4668
4669 k = PL_regkind[(U8)OP(o)];
4670
2a782b5b
JH
4671 if (k == EXACT) {
4672 SV *dsv = sv_2mortal(newSVpvn("", 0));
c728cb41
JH
4673 /* Using is_utf8_string() is a crude hack but it may
4674 * be the best for now since we have no flag "this EXACTish
4675 * node was UTF-8" --jhi */
4676 bool do_utf8 = is_utf8_string((U8*)STRING(o), STR_LEN(o));
8a989385 4677 char *s = do_utf8 ?
c728cb41
JH
4678 pv_uni_display(dsv, (U8*)STRING(o), STR_LEN(o), 60,
4679 UNI_DISPLAY_REGEX) :
2a782b5b 4680 STRING(o);
40eddc46 4681 int len = do_utf8 ?
2a782b5b
JH
4682 strlen(s) :
4683 STR_LEN(o);
4684 Perl_sv_catpvf(aTHX_ sv, " <%s%.*s%s>",
4685 PL_colors[0],
4686 len, s,
4687 PL_colors[1]);
4688 }
9b155405 4689 else if (k == CURLY) {
cb434fcc 4690 if (OP(o) == CURLYM || OP(o) == CURLYN || OP(o) == CURLYX)
cea2e8a9
GS
4691 Perl_sv_catpvf(aTHX_ sv, "[%d]", o->flags); /* Parenth number */
4692 Perl_sv_catpvf(aTHX_ sv, " {%d,%d}", ARG1(o), ARG2(o));
a0d0e21e 4693 }
2c2d71f5
JH
4694 else if (k == WHILEM && o->flags) /* Ordinal/of */
4695 Perl_sv_catpvf(aTHX_ sv, "[%d/%d]", o->flags & 0xf, o->flags>>4);
9b155405 4696 else if (k == REF || k == OPEN || k == CLOSE || k == GROUPP )
894356b3 4697 Perl_sv_catpvf(aTHX_ sv, "%d", (int)ARG(o)); /* Parenth number */
9b155405 4698 else if (k == LOGICAL)
04ebc1ab 4699 Perl_sv_catpvf(aTHX_ sv, "[%d]", o->flags); /* 2: embedded, otherwise 1 */
653099ff
GS
4700 else if (k == ANYOF) {
4701 int i, rangestart = -1;
ffc61ed2 4702 U8 flags = ANYOF_FLAGS(o);
19860706
JH
4703 const char * const anyofs[] = { /* Should be syncronized with
4704 * ANYOF_ #xdefines in regcomp.h */
653099ff
GS
4705 "\\w",
4706 "\\W",
4707 "\\s",
4708 "\\S",
4709 "\\d",
4710 "\\D",
4711 "[:alnum:]",
4712 "[:^alnum:]",
4713 "[:alpha:]",
4714 "[:^alpha:]",
4715 "[:ascii:]",
4716 "[:^ascii:]",
4717 "[:ctrl:]",
4718 "[:^ctrl:]",
4719 "[:graph:]",
4720 "[:^graph:]",
4721 "[:lower:]",
4722 "[:^lower:]",
4723 "[:print:]",
4724 "[:^print:]",
4725 "[:punct:]",
4726 "[:^punct:]",
4727 "[:upper:]",
aaa51d5e 4728 "[:^upper:]",
653099ff 4729 "[:xdigit:]",
aaa51d5e
JF
4730 "[:^xdigit:]",
4731 "[:space:]",
4732 "[:^space:]",
4733 "[:blank:]",
4734 "[:^blank:]"
653099ff
GS
4735 };
4736
19860706 4737 if (flags & ANYOF_LOCALE)
653099ff 4738 sv_catpv(sv, "{loc}");
19860706 4739 if (flags & ANYOF_FOLD)
653099ff
GS
4740 sv_catpv(sv, "{i}");
4741 Perl_sv_catpvf(aTHX_ sv, "[%s", PL_colors[0]);
19860706 4742 if (flags & ANYOF_INVERT)
653099ff 4743 sv_catpv(sv, "^");
ffc61ed2
JH
4744 for (i = 0; i <= 256; i++) {
4745 if (i < 256 && ANYOF_BITMAP_TEST(o,i)) {
4746 if (rangestart == -1)
4747 rangestart = i;
4748 } else if (rangestart != -1) {
4749 if (i <= rangestart + 3)
4750 for (; rangestart < i; rangestart++)
653099ff 4751 put_byte(sv, rangestart);
ffc61ed2
JH
4752 else {
4753 put_byte(sv, rangestart);
4754 sv_catpv(sv, "-");
4755 put_byte(sv, i - 1);
653099ff 4756 }
ffc61ed2 4757 rangestart = -1;
653099ff 4758 }
847a199f 4759 }
ffc61ed2
JH
4760
4761 if (o->flags & ANYOF_CLASS)
4762 for (i = 0; i < sizeof(anyofs)/sizeof(char*); i++)
4763 if (ANYOF_CLASS_TEST(o,i))
4764 sv_catpv(sv, anyofs[i]);
4765
4766 if (flags & ANYOF_UNICODE)
4767 sv_catpv(sv, "{unicode}");
1aa99e6b 4768 else if (flags & ANYOF_UNICODE_ALL)
2a782b5b 4769 sv_catpv(sv, "{unicode_all}");
ffc61ed2
JH
4770
4771 {
4772 SV *lv;
9e55ce06 4773 SV *sw = regclass_swash(o, FALSE, &lv, 0);
b81d288d 4774
ffc61ed2
JH
4775 if (lv) {
4776 if (sw) {
4777 UV i;
4778 U8 s[UTF8_MAXLEN+1];
b81d288d 4779
ffc61ed2 4780 for (i = 0; i <= 256; i++) { /* just the first 256 */
2b9d42f0 4781 U8 *e = uvchr_to_utf8(s, i);
ffc61ed2 4782
3568d838 4783 if (i < 256 && swash_fetch(sw, s, TRUE)) {
ffc61ed2
JH
4784 if (rangestart == -1)
4785 rangestart = i;
4786 } else if (rangestart != -1) {
4787 U8 *p;
b81d288d 4788
ffc61ed2
JH
4789 if (i <= rangestart + 3)
4790 for (; rangestart < i; rangestart++) {
2b9d42f0 4791 for(e = uvchr_to_utf8(s, rangestart), p = s; p < e; p++)
ffc61ed2
JH
4792 put_byte(sv, *p);
4793 }
4794 else {
2b9d42f0 4795 for (e = uvchr_to_utf8(s, rangestart), p = s; p < e; p++)
ffc61ed2
JH
4796 put_byte(sv, *p);
4797 sv_catpv(sv, "-");
2b9d42f0 4798 for (e = uvchr_to_utf8(s, i - 1), p = s; p < e; p++)
ffc61ed2
JH
4799 put_byte(sv, *p);
4800 }
4801 rangestart = -1;
4802 }
19860706 4803 }
ffc61ed2
JH
4804
4805 sv_catpv(sv, "..."); /* et cetera */
19860706 4806 }
fde631ed 4807
ffc61ed2
JH
4808 {
4809 char *s = savepv(SvPVX(lv));
4810 char *origs = s;
b81d288d 4811
ffc61ed2 4812 while(*s && *s != '\n') s++;
b81d288d 4813
ffc61ed2
JH
4814 if (*s == '\n') {
4815 char *t = ++s;
4816
4817 while (*s) {
4818 if (*s == '\n')
4819 *s = ' ';
4820 s++;
4821 }
4822 if (s[-1] == ' ')
4823 s[-1] = 0;
4824
4825 sv_catpv(sv, t);
fde631ed 4826 }
b81d288d 4827
ffc61ed2 4828 Safefree(origs);
fde631ed
JH
4829 }
4830 }
653099ff 4831 }
ffc61ed2 4832
653099ff
GS
4833 Perl_sv_catpvf(aTHX_ sv, "%s]", PL_colors[1]);
4834 }
9b155405 4835 else if (k == BRANCHJ && (OP(o) == UNLESSM || OP(o) == IFMATCH))
cea2e8a9 4836 Perl_sv_catpvf(aTHX_ sv, "[-%d]", o->flags);
17c3b450 4837#endif /* DEBUGGING */
35ff7856 4838}
a687059c 4839
cad2e5aa
JH
4840SV *
4841Perl_re_intuit_string(pTHX_ regexp *prog)
4842{ /* Assume that RE_INTUIT is set */
4843 DEBUG_r(
4844 { STRLEN n_a;
33b8afdf
JH
4845 char *s = SvPV(prog->check_substr
4846 ? prog->check_substr : prog->check_utf8, n_a);
cad2e5aa
JH
4847
4848 if (!PL_colorset) reginitcolors();
4849 PerlIO_printf(Perl_debug_log,
33b8afdf
JH
4850 "%sUsing REx %ssubstr:%s `%s%.60s%s%s'\n",
4851 PL_colors[4],
4852 prog->check_substr ? "" : "utf8 ",
4853 PL_colors[5],PL_colors[0],
cad2e5aa
JH
4854 s,
4855 PL_colors[1],
4856 (strlen(s) > 60 ? "..." : ""));
4857 } );
4858
33b8afdf 4859 return prog->check_substr ? prog->check_substr : prog->check_utf8;
cad2e5aa
JH
4860}
4861
2b69d0c2 4862void
864dbfa3 4863Perl_pregfree(pTHX_ struct regexp *r)
a687059c 4864{
9e55ce06
JH
4865#ifdef DEBUGGING
4866 SV *dsv = PERL_DEBUG_PAD_ZERO(0);
4867#endif
7821416a
IZ
4868
4869 if (!r || (--r->refcnt > 0))
4870 return;
9e55ce06 4871 DEBUG_r({
9e55ce06 4872 char *s = pv_uni_display(dsv, (U8*)r->precomp, r->prelen, 60,
c728cb41 4873 UNI_DISPLAY_REGEX);
9e55ce06
JH
4874 int len = SvCUR(dsv);
4875 if (!PL_colorset)
4876 reginitcolors();
4877 PerlIO_printf(Perl_debug_log,
4878 "%sFreeing REx:%s `%s%*.*s%s%s'\n",
4879 PL_colors[4],PL_colors[5],PL_colors[0],
4880 len, len, s,
4881 PL_colors[1],
4882 len > 60 ? "..." : "");
4883 });
cad2e5aa 4884
c277df42 4885 if (r->precomp)
a0d0e21e 4886 Safefree(r->precomp);
fac92740
MJD
4887 if (r->offsets) /* 20010421 MJD */
4888 Safefree(r->offsets);
cf93c79d
IZ
4889 if (RX_MATCH_COPIED(r))
4890 Safefree(r->subbeg);
a193d654
GS
4891 if (r->substrs) {
4892 if (r->anchored_substr)
4893 SvREFCNT_dec(r->anchored_substr);
33b8afdf
JH
4894 if (r->anchored_utf8)
4895 SvREFCNT_dec(r->anchored_utf8);
a193d654
GS
4896 if (r->float_substr)
4897 SvREFCNT_dec(r->float_substr);
33b8afdf
JH
4898 if (r->float_utf8)
4899 SvREFCNT_dec(r->float_utf8);
2779dcf1 4900 Safefree(r->substrs);
a193d654 4901 }
c277df42
IZ
4902 if (r->data) {
4903 int n = r->data->count;
dfad63ad
HS
4904 AV* new_comppad = NULL;
4905 AV* old_comppad;
4906 SV** old_curpad;
4907
c277df42 4908 while (--n >= 0) {
261faec3 4909 /* If you add a ->what type here, update the comment in regcomp.h */
c277df42
IZ
4910 switch (r->data->what[n]) {
4911 case 's':
4912 SvREFCNT_dec((SV*)r->data->data[n]);
4913 break;
653099ff
GS
4914 case 'f':
4915 Safefree(r->data->data[n]);
4916 break;
dfad63ad
HS
4917 case 'p':
4918 new_comppad = (AV*)r->data->data[n];
4919 break;
c277df42 4920 case 'o':
dfad63ad 4921 if (new_comppad == NULL)
cea2e8a9 4922 Perl_croak(aTHX_ "panic: pregfree comppad");
dfad63ad
HS
4923 old_comppad = PL_comppad;
4924 old_curpad = PL_curpad;
1e6dc0b6
SB
4925 /* Watch out for global destruction's random ordering. */
4926 if (SvTYPE(new_comppad) == SVt_PVAV) {
4927 PL_comppad = new_comppad;
4928 PL_curpad = AvARRAY(new_comppad);
4929 }
4930 else
4931 PL_curpad = NULL;
9b978d73
DM
4932
4933 if (!OpREFCNT_dec((OP_4tree*)r->data->data[n])) {
4934 op_free((OP_4tree*)r->data->data[n]);
4935 }
4936
dfad63ad
HS
4937 PL_comppad = old_comppad;
4938 PL_curpad = old_curpad;
4939 SvREFCNT_dec((SV*)new_comppad);
4940 new_comppad = NULL;
c277df42
IZ
4941 break;
4942 case 'n':
9e55ce06 4943 break;
c277df42 4944 default:
830247a4 4945 Perl_croak(aTHX_ "panic: regfree data code '%c'", r->data->what[n]);
c277df42
IZ
4946 }
4947 }
4948 Safefree(r->data->what);
4949 Safefree(r->data);
a0d0e21e
LW
4950 }
4951 Safefree(r->startp);
4952 Safefree(r->endp);
4953 Safefree(r);
a687059c 4954}
c277df42
IZ
4955
4956/*
4957 - regnext - dig the "next" pointer out of a node
4958 *
4959 * [Note, when REGALIGN is defined there are two places in regmatch()
4960 * that bypass this code for speed.]
4961 */
4962regnode *
864dbfa3 4963Perl_regnext(pTHX_ register regnode *p)
c277df42
IZ
4964{
4965 register I32 offset;
4966
3280af22 4967 if (p == &PL_regdummy)
c277df42
IZ
4968 return(NULL);
4969
4970 offset = (reg_off_by_arg[OP(p)] ? ARG(p) : NEXT_OFF(p));
4971 if (offset == 0)
4972 return(NULL);
4973
c277df42 4974 return(p+offset);
c277df42
IZ
4975}
4976
01f988be 4977STATIC void
cea2e8a9 4978S_re_croak2(pTHX_ const char* pat1,const char* pat2,...)
c277df42
IZ
4979{
4980 va_list args;
4981 STRLEN l1 = strlen(pat1);
4982 STRLEN l2 = strlen(pat2);
4983 char buf[512];
06bf62c7 4984 SV *msv;
c277df42
IZ
4985 char *message;
4986
4987 if (l1 > 510)
4988 l1 = 510;
4989 if (l1 + l2 > 510)
4990 l2 = 510 - l1;
4991 Copy(pat1, buf, l1 , char);
4992 Copy(pat2, buf + l1, l2 , char);
3b818b81
GS
4993 buf[l1 + l2] = '\n';
4994 buf[l1 + l2 + 1] = '\0';
8736538c
AS
4995#ifdef I_STDARG
4996 /* ANSI variant takes additional second argument */
c277df42 4997 va_start(args, pat2);
8736538c
AS
4998#else
4999 va_start(args);
5000#endif
5a844595 5001 msv = vmess(buf, &args);
c277df42 5002 va_end(args);
06bf62c7 5003 message = SvPV(msv,l1);
c277df42
IZ
5004 if (l1 > 512)
5005 l1 = 512;
5006 Copy(message, buf, l1 , char);
5007 buf[l1] = '\0'; /* Overwrite \n */
cea2e8a9 5008 Perl_croak(aTHX_ "%s", buf);
c277df42 5009}
a0ed51b3
LW
5010
5011/* XXX Here's a total kludge. But we need to re-enter for swash routines. */
5012
5013void
864dbfa3 5014Perl_save_re_context(pTHX)
b81d288d 5015{
830247a4
IZ
5016#if 0
5017 SAVEPPTR(RExC_precomp); /* uncompiled string. */
5018 SAVEI32(RExC_npar); /* () count. */
5019 SAVEI32(RExC_size); /* Code size. */
5020 SAVEI16(RExC_flags16); /* are we folding, multilining? */
5021 SAVEVPTR(RExC_rx); /* from regcomp.c */
5022 SAVEI32(RExC_seen); /* from regcomp.c */
5023 SAVEI32(RExC_sawback); /* Did we see \1, ...? */
5024 SAVEI32(RExC_naughty); /* How bad is this pattern? */
5025 SAVEVPTR(RExC_emit); /* Code-emit pointer; &regdummy = don't */
5026 SAVEPPTR(RExC_end); /* End of input for compile */
5027 SAVEPPTR(RExC_parse); /* Input-scan pointer. */
5028#endif
5029
5030 SAVEI32(PL_reg_flags); /* from regexec.c */
a0ed51b3 5031 SAVEPPTR(PL_bostr);
a0ed51b3
LW
5032 SAVEPPTR(PL_reginput); /* String-input pointer. */
5033 SAVEPPTR(PL_regbol); /* Beginning of input, for ^ check. */
5034 SAVEPPTR(PL_regeol); /* End of input, for $ check. */
7766f137
GS
5035 SAVEVPTR(PL_regstartp); /* Pointer to startp array. */
5036 SAVEVPTR(PL_regendp); /* Ditto for endp. */
5037 SAVEVPTR(PL_reglastparen); /* Similarly for lastparen. */
a0ed51b3 5038 SAVEPPTR(PL_regtill); /* How far we are required to go. */
b81d288d 5039 SAVEGENERICPV(PL_reg_start_tmp); /* from regexec.c */
a0ed51b3 5040 PL_reg_start_tmp = 0;
a0ed51b3
LW
5041 SAVEI32(PL_reg_start_tmpl); /* from regexec.c */
5042 PL_reg_start_tmpl = 0;
7766f137 5043 SAVEVPTR(PL_regdata);
a0ed51b3
LW
5044 SAVEI32(PL_reg_eval_set); /* from regexec.c */
5045 SAVEI32(PL_regnarrate); /* from regexec.c */
7766f137 5046 SAVEVPTR(PL_regprogram); /* from regexec.c */
a0ed51b3 5047 SAVEINT(PL_regindent); /* from regexec.c */
7766f137
GS
5048 SAVEVPTR(PL_regcc); /* from regexec.c */
5049 SAVEVPTR(PL_curcop);
7766f137
GS
5050 SAVEVPTR(PL_reg_call_cc); /* from regexec.c */
5051 SAVEVPTR(PL_reg_re); /* from regexec.c */
54b6e2fa
IZ
5052 SAVEPPTR(PL_reg_ganch); /* from regexec.c */
5053 SAVESPTR(PL_reg_sv); /* from regexec.c */
53c4c00c 5054 SAVEI8(PL_reg_match_utf8); /* from regexec.c */
7766f137 5055 SAVEVPTR(PL_reg_magic); /* from regexec.c */
54b6e2fa 5056 SAVEI32(PL_reg_oldpos); /* from regexec.c */
7766f137
GS
5057 SAVEVPTR(PL_reg_oldcurpm); /* from regexec.c */
5058 SAVEVPTR(PL_reg_curpm); /* from regexec.c */
5fb7366e 5059 SAVEI32(PL_regnpar); /* () count. */
e49a9654 5060 SAVEI32(PL_regsize); /* from regexec.c */
54b6e2fa 5061#ifdef DEBUGGING
b81d288d 5062 SAVEPPTR(PL_reg_starttry); /* from regexec.c */
54b6e2fa 5063#endif
a0ed51b3 5064}
51371543 5065
51371543 5066static void
acfe0abc 5067clear_re(pTHX_ void *r)
51371543
GS
5068{
5069 ReREFCNT_dec((regexp *)r);
5070}
ffbc6a93 5071