This is a live mirror of the Perl 5 development currently hosted at https://github.com/perl/perl5
silence another signed/unsigned mismatch warning
[perl5.git] / regcomp.c
CommitLineData
a0d0e21e
LW
1/* regcomp.c
2 */
3
4/*
5 * "A fair jaw-cracker dwarf-language must be." --Samwise Gamgee
6 */
7
a687059c
LW
8/* NOTE: this is derived from Henry Spencer's regexp code, and should not
9 * confused with the original package (see point 3 below). Thanks, Henry!
10 */
11
12/* Additional note: this code is very heavily munged from Henry's version
13 * in places. In some spots I've traded clarity for efficiency, so don't
14 * blame Henry for some of the lack of readability.
15 */
16
e50aee73
AD
17/* The names of the functions have been changed from regcomp and
18 * regexec to pregcomp and pregexec in order to avoid conflicts
19 * with the POSIX routines of the same names.
20*/
21
b9d5759e
AD
22#ifdef PERL_EXT_RE_BUILD
23/* need to replace pregcomp et al, so enable that */
24# ifndef PERL_IN_XSUB_RE
25# define PERL_IN_XSUB_RE
26# endif
27/* need access to debugger hooks */
cad2e5aa 28# if defined(PERL_EXT_RE_DEBUG) && !defined(DEBUGGING)
b9d5759e
AD
29# define DEBUGGING
30# endif
31#endif
32
33#ifdef PERL_IN_XSUB_RE
d06ea78c 34/* We *really* need to overwrite these symbols: */
56953603
IZ
35# define Perl_pregcomp my_regcomp
36# define Perl_regdump my_regdump
37# define Perl_regprop my_regprop
d06ea78c 38# define Perl_pregfree my_regfree
cad2e5aa
JH
39# define Perl_re_intuit_string my_re_intuit_string
40/* *These* symbols are masked to allow static link. */
d06ea78c 41# define Perl_regnext my_regnext
f0b8d043 42# define Perl_save_re_context my_save_re_context
b81d288d 43# define Perl_reginitcolors my_reginitcolors
c5be433b
GS
44
45# define PERL_NO_GET_CONTEXT
b81d288d 46#endif
56953603 47
f0fcb552 48/*SUPPRESS 112*/
a687059c 49/*
e50aee73 50 * pregcomp and pregexec -- regsub and regerror are not used in perl
a687059c
LW
51 *
52 * Copyright (c) 1986 by University of Toronto.
53 * Written by Henry Spencer. Not derived from licensed software.
54 *
55 * Permission is granted to anyone to use this software for any
56 * purpose on any computer system, and to redistribute it freely,
57 * subject to the following restrictions:
58 *
59 * 1. The author is not responsible for the consequences of use of
60 * this software, no matter how awful, even if they arise
61 * from defects in it.
62 *
63 * 2. The origin of this software must not be misrepresented, either
64 * by explicit claim or by omission.
65 *
66 * 3. Altered versions must be plainly marked as such, and must not
67 * be misrepresented as being the original software.
68 *
69 *
70 **** Alterations to Henry's code are...
71 ****
be3c0a43 72 **** Copyright (c) 1991-2002, Larry Wall
a687059c 73 ****
9ef589d8
LW
74 **** You may distribute under the terms of either the GNU General Public
75 **** License or the Artistic License, as specified in the README file.
76
a687059c
LW
77 *
78 * Beware that some of this code is subtly aware of the way operator
79 * precedence is structured in regular expressions. Serious changes in
80 * regular-expression syntax might require a total rethink.
81 */
82#include "EXTERN.h"
864dbfa3 83#define PERL_IN_REGCOMP_C
a687059c 84#include "perl.h"
d06ea78c 85
acfe0abc 86#ifndef PERL_IN_XSUB_RE
d06ea78c
GS
87# include "INTERN.h"
88#endif
c277df42
IZ
89
90#define REG_COMP_C
a687059c
LW
91#include "regcomp.h"
92
d4cce5f1 93#ifdef op
11343788 94#undef op
d4cce5f1 95#endif /* op */
11343788 96
fe14fcc3
LW
97#ifdef MSDOS
98# if defined(BUGGY_MSC6)
99 /* MSC 6.00A breaks on op/regexp.t test 85 unless we turn this off */
100 # pragma optimize("a",off)
101 /* But MSC 6.00A is happy with 'w', for aliases only across function calls*/
102 # pragma optimize("w",on )
103# endif /* BUGGY_MSC6 */
104#endif /* MSDOS */
105
a687059c
LW
106#ifndef STATIC
107#define STATIC static
108#endif
109
830247a4 110typedef struct RExC_state_t {
e2509266 111 U32 flags; /* are we folding, multilining? */
830247a4
IZ
112 char *precomp; /* uncompiled string. */
113 regexp *rx;
fac92740 114 char *start; /* Start of input for compile */
830247a4
IZ
115 char *end; /* End of input for compile */
116 char *parse; /* Input-scan pointer. */
117 I32 whilem_seen; /* number of WHILEM in this expr */
fac92740 118 regnode *emit_start; /* Start of emitted-code area */
ffc61ed2 119 regnode *emit; /* Code-emit pointer; &regdummy = don't = compiling */
830247a4
IZ
120 I32 naughty; /* How bad is this pattern? */
121 I32 sawback; /* Did we see \1, ...? */
122 U32 seen;
123 I32 size; /* Code size. */
124 I32 npar; /* () count. */
125 I32 extralen;
126 I32 seen_zerolen;
127 I32 seen_evals;
1aa99e6b 128 I32 utf8;
830247a4
IZ
129#if ADD_TO_REGEXEC
130 char *starttry; /* -Dr: where regtry was called. */
131#define RExC_starttry (pRExC_state->starttry)
132#endif
133} RExC_state_t;
134
e2509266 135#define RExC_flags (pRExC_state->flags)
830247a4
IZ
136#define RExC_precomp (pRExC_state->precomp)
137#define RExC_rx (pRExC_state->rx)
fac92740 138#define RExC_start (pRExC_state->start)
830247a4
IZ
139#define RExC_end (pRExC_state->end)
140#define RExC_parse (pRExC_state->parse)
141#define RExC_whilem_seen (pRExC_state->whilem_seen)
fac92740 142#define RExC_offsets (pRExC_state->rx->offsets) /* I am not like the others */
830247a4 143#define RExC_emit (pRExC_state->emit)
fac92740 144#define RExC_emit_start (pRExC_state->emit_start)
830247a4
IZ
145#define RExC_naughty (pRExC_state->naughty)
146#define RExC_sawback (pRExC_state->sawback)
147#define RExC_seen (pRExC_state->seen)
148#define RExC_size (pRExC_state->size)
149#define RExC_npar (pRExC_state->npar)
150#define RExC_extralen (pRExC_state->extralen)
151#define RExC_seen_zerolen (pRExC_state->seen_zerolen)
152#define RExC_seen_evals (pRExC_state->seen_evals)
1aa99e6b 153#define RExC_utf8 (pRExC_state->utf8)
830247a4 154
a687059c
LW
155#define ISMULT1(c) ((c) == '*' || (c) == '+' || (c) == '?')
156#define ISMULT2(s) ((*s) == '*' || (*s) == '+' || (*s) == '?' || \
157 ((*s) == '{' && regcurly(s)))
a687059c 158
35c8bce7
LW
159#ifdef SPSTART
160#undef SPSTART /* dratted cpp namespace... */
161#endif
a687059c
LW
162/*
163 * Flags to be passed up and down.
164 */
a687059c 165#define WORST 0 /* Worst case. */
821b33a5 166#define HASWIDTH 0x1 /* Known to match non-null strings. */
a0d0e21e
LW
167#define SIMPLE 0x2 /* Simple enough to be STAR/PLUS operand. */
168#define SPSTART 0x4 /* Starts with * or +. */
169#define TRYAGAIN 0x8 /* Weeded out a declaration. */
a687059c 170
2c2d71f5
JH
171/* Length of a variant. */
172
173typedef struct scan_data_t {
174 I32 len_min;
175 I32 len_delta;
176 I32 pos_min;
177 I32 pos_delta;
178 SV *last_found;
179 I32 last_end; /* min value, <0 unless valid. */
180 I32 last_start_min;
181 I32 last_start_max;
182 SV **longest; /* Either &l_fixed, or &l_float. */
183 SV *longest_fixed;
184 I32 offset_fixed;
185 SV *longest_float;
186 I32 offset_float_min;
187 I32 offset_float_max;
188 I32 flags;
189 I32 whilem_c;
cb434fcc 190 I32 *last_closep;
653099ff 191 struct regnode_charclass_class *start_class;
2c2d71f5
JH
192} scan_data_t;
193
a687059c 194/*
e50aee73 195 * Forward declarations for pregcomp()'s friends.
a687059c 196 */
a0d0e21e 197
b81d288d 198static scan_data_t zero_scan_data = { 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
cb434fcc 199 0, 0, 0, 0, 0, 0};
c277df42
IZ
200
201#define SF_BEFORE_EOL (SF_BEFORE_SEOL|SF_BEFORE_MEOL)
202#define SF_BEFORE_SEOL 0x1
203#define SF_BEFORE_MEOL 0x2
204#define SF_FIX_BEFORE_EOL (SF_FIX_BEFORE_SEOL|SF_FIX_BEFORE_MEOL)
205#define SF_FL_BEFORE_EOL (SF_FL_BEFORE_SEOL|SF_FL_BEFORE_MEOL)
206
09b7f37c
CB
207#ifdef NO_UNARY_PLUS
208# define SF_FIX_SHIFT_EOL (0+2)
209# define SF_FL_SHIFT_EOL (0+4)
210#else
211# define SF_FIX_SHIFT_EOL (+2)
212# define SF_FL_SHIFT_EOL (+4)
213#endif
c277df42
IZ
214
215#define SF_FIX_BEFORE_SEOL (SF_BEFORE_SEOL << SF_FIX_SHIFT_EOL)
216#define SF_FIX_BEFORE_MEOL (SF_BEFORE_MEOL << SF_FIX_SHIFT_EOL)
217
218#define SF_FL_BEFORE_SEOL (SF_BEFORE_SEOL << SF_FL_SHIFT_EOL)
219#define SF_FL_BEFORE_MEOL (SF_BEFORE_MEOL << SF_FL_SHIFT_EOL) /* 0x20 */
220#define SF_IS_INF 0x40
221#define SF_HAS_PAR 0x80
222#define SF_IN_PAR 0x100
223#define SF_HAS_EVAL 0x200
4bfe0158 224#define SCF_DO_SUBSTR 0x400
653099ff
GS
225#define SCF_DO_STCLASS_AND 0x0800
226#define SCF_DO_STCLASS_OR 0x1000
227#define SCF_DO_STCLASS (SCF_DO_STCLASS_AND|SCF_DO_STCLASS_OR)
e1901655 228#define SCF_WHILEM_VISITED_POS 0x2000
c277df42 229
eb160463 230#define UTF (RExC_utf8 != 0)
e2509266
JH
231#define LOC ((RExC_flags & PMf_LOCALE) != 0)
232#define FOLD ((RExC_flags & PMf_FOLD) != 0)
a0ed51b3 233
ffc61ed2 234#define OOB_UNICODE 12345678
93733859 235#define OOB_NAMEDCLASS -1
b8c5462f 236
a0ed51b3
LW
237#define CHR_SVLEN(sv) (UTF ? sv_len_utf8(sv) : SvCUR(sv))
238#define CHR_DIST(a,b) (UTF ? utf8_distance(a,b) : a - b)
239
8615cb43 240
b45f050a
JF
241/* length of regex to show in messages that don't mark a position within */
242#define RegexLengthToShowInErrorMessages 127
243
244/*
245 * If MARKER[12] are adjusted, be sure to adjust the constants at the top
246 * of t/op/regmesg.t, the tests in t/op/re_tests, and those in
247 * op/pragma/warn/regcomp.
248 */
7253e4e3
RK
249#define MARKER1 "<-- HERE" /* marker as it appears in the description */
250#define MARKER2 " <-- HERE " /* marker as it appears within the regex */
b81d288d 251
7253e4e3 252#define REPORT_LOCATION " in regex; marked by " MARKER1 " in m/%.*s" MARKER2 "%s/"
b45f050a
JF
253
254/*
255 * Calls SAVEDESTRUCTOR_X if needed, then calls Perl_croak with the given
256 * arg. Show regex, up to a maximum length. If it's too long, chop and add
257 * "...".
258 */
ccb2c380
MP
259#define FAIL(msg) STMT_START { \
260 char *ellipses = ""; \
261 IV len = RExC_end - RExC_precomp; \
262 \
263 if (!SIZE_ONLY) \
264 SAVEDESTRUCTOR_X(clear_re,(void*)RExC_rx); \
265 if (len > RegexLengthToShowInErrorMessages) { \
266 /* chop 10 shorter than the max, to ensure meaning of "..." */ \
267 len = RegexLengthToShowInErrorMessages - 10; \
268 ellipses = "..."; \
269 } \
270 Perl_croak(aTHX_ "%s in regex m/%.*s%s/", \
271 msg, (int)len, RExC_precomp, ellipses); \
272} STMT_END
8615cb43 273
b45f050a
JF
274/*
275 * Calls SAVEDESTRUCTOR_X if needed, then calls Perl_croak with the given
276 * args. Show regex, up to a maximum length. If it's too long, chop and add
277 * "...".
278 */
ccb2c380
MP
279#define FAIL2(pat,msg) STMT_START { \
280 char *ellipses = ""; \
281 IV len = RExC_end - RExC_precomp; \
282 \
283 if (!SIZE_ONLY) \
284 SAVEDESTRUCTOR_X(clear_re,(void*)RExC_rx); \
285 if (len > RegexLengthToShowInErrorMessages) { \
286 /* chop 10 shorter than the max, to ensure meaning of "..." */ \
287 len = RegexLengthToShowInErrorMessages - 10; \
288 ellipses = "..."; \
289 } \
290 S_re_croak2(aTHX_ pat, " in regex m/%.*s%s/", \
291 msg, (int)len, RExC_precomp, ellipses); \
292} STMT_END
b45f050a
JF
293
294
295/*
296 * Simple_vFAIL -- like FAIL, but marks the current location in the scan
297 */
ccb2c380
MP
298#define Simple_vFAIL(m) STMT_START { \
299 IV offset = RExC_parse - RExC_precomp; \
300 Perl_croak(aTHX_ "%s" REPORT_LOCATION, \
301 m, (int)offset, RExC_precomp, RExC_precomp + offset); \
302} STMT_END
b45f050a
JF
303
304/*
305 * Calls SAVEDESTRUCTOR_X if needed, then Simple_vFAIL()
306 */
ccb2c380
MP
307#define vFAIL(m) STMT_START { \
308 if (!SIZE_ONLY) \
309 SAVEDESTRUCTOR_X(clear_re,(void*)RExC_rx); \
310 Simple_vFAIL(m); \
311} STMT_END
b45f050a
JF
312
313/*
314 * Like Simple_vFAIL(), but accepts two arguments.
315 */
ccb2c380
MP
316#define Simple_vFAIL2(m,a1) STMT_START { \
317 IV offset = RExC_parse - RExC_precomp; \
318 S_re_croak2(aTHX_ m, REPORT_LOCATION, a1, \
319 (int)offset, RExC_precomp, RExC_precomp + offset); \
320} STMT_END
b45f050a
JF
321
322/*
323 * Calls SAVEDESTRUCTOR_X if needed, then Simple_vFAIL2().
324 */
ccb2c380
MP
325#define vFAIL2(m,a1) STMT_START { \
326 if (!SIZE_ONLY) \
327 SAVEDESTRUCTOR_X(clear_re,(void*)RExC_rx); \
328 Simple_vFAIL2(m, a1); \
329} STMT_END
b45f050a
JF
330
331
332/*
333 * Like Simple_vFAIL(), but accepts three arguments.
334 */
ccb2c380
MP
335#define Simple_vFAIL3(m, a1, a2) STMT_START { \
336 IV offset = RExC_parse - RExC_precomp; \
337 S_re_croak2(aTHX_ m, REPORT_LOCATION, a1, a2, \
338 (int)offset, RExC_precomp, RExC_precomp + offset); \
339} STMT_END
b45f050a
JF
340
341/*
342 * Calls SAVEDESTRUCTOR_X if needed, then Simple_vFAIL3().
343 */
ccb2c380
MP
344#define vFAIL3(m,a1,a2) STMT_START { \
345 if (!SIZE_ONLY) \
346 SAVEDESTRUCTOR_X(clear_re,(void*)RExC_rx); \
347 Simple_vFAIL3(m, a1, a2); \
348} STMT_END
b45f050a
JF
349
350/*
351 * Like Simple_vFAIL(), but accepts four arguments.
352 */
ccb2c380
MP
353#define Simple_vFAIL4(m, a1, a2, a3) STMT_START { \
354 IV offset = RExC_parse - RExC_precomp; \
355 S_re_croak2(aTHX_ m, REPORT_LOCATION, a1, a2, a3, \
356 (int)offset, RExC_precomp, RExC_precomp + offset); \
357} STMT_END
b45f050a
JF
358
359/*
360 * Like Simple_vFAIL(), but accepts five arguments.
361 */
ccb2c380
MP
362#define Simple_vFAIL5(m, a1, a2, a3, a4) STMT_START { \
363 IV offset = RExC_parse - RExC_precomp; \
364 S_re_croak2(aTHX_ m, REPORT_LOCATION, a1, a2, a3, a4, \
365 (int)offset, RExC_precomp, RExC_precomp + offset); \
366} STMT_END
367
368
369#define vWARN(loc,m) STMT_START { \
370 IV offset = loc - RExC_precomp; \
371 Perl_warner(aTHX_ packWARN(WARN_REGEXP), "%s" REPORT_LOCATION, \
372 m, (int)offset, RExC_precomp, RExC_precomp + offset); \
373} STMT_END
374
375#define vWARNdep(loc,m) STMT_START { \
376 IV offset = loc - RExC_precomp; \
377 Perl_warner(aTHX_ packWARN2(WARN_DEPRECATED, WARN_REGEXP), \
378 "%s" REPORT_LOCATION, \
379 m, (int)offset, RExC_precomp, RExC_precomp + offset); \
380} STMT_END
381
382
383#define vWARN2(loc, m, a1) STMT_START { \
384 IV offset = loc - RExC_precomp; \
385 Perl_warner(aTHX_ packWARN(WARN_REGEXP), m REPORT_LOCATION, \
386 a1, (int)offset, RExC_precomp, RExC_precomp + offset); \
387} STMT_END
388
389#define vWARN3(loc, m, a1, a2) STMT_START { \
390 IV offset = loc - RExC_precomp; \
391 Perl_warner(aTHX_ packWARN(WARN_REGEXP), m REPORT_LOCATION, \
392 a1, a2, (int)offset, RExC_precomp, RExC_precomp + offset); \
393} STMT_END
394
395#define vWARN4(loc, m, a1, a2, a3) STMT_START { \
396 IV offset = loc - RExC_precomp; \
397 Perl_warner(aTHX_ packWARN(WARN_REGEXP), m REPORT_LOCATION, \
398 a1, a2, a3, (int)offset, RExC_precomp, RExC_precomp + offset); \
399} STMT_END
400
401#define vWARN5(loc, m, a1, a2, a3, a4) STMT_START { \
402 IV offset = loc - RExC_precomp; \
403 Perl_warner(aTHX_ packWARN(WARN_REGEXP), m REPORT_LOCATION, \
404 a1, a2, a3, a4, (int)offset, RExC_precomp, RExC_precomp + offset); \
405} STMT_END
9d1d55b5 406
8615cb43 407
cd439c50 408/* Allow for side effects in s */
ccb2c380
MP
409#define REGC(c,s) STMT_START { \
410 if (!SIZE_ONLY) *(s) = (c); else (void)(s); \
411} STMT_END
cd439c50 412
fac92740
MJD
413/* Macros for recording node offsets. 20001227 mjd@plover.com
414 * Nodes are numbered 1, 2, 3, 4. Node #n's position is recorded in
415 * element 2*n-1 of the array. Element #2n holds the byte length node #n.
416 * Element 0 holds the number n.
417 */
418
419#define MJD_OFFSET_DEBUG(x)
ccb2c380
MP
420/* #define MJD_OFFSET_DEBUG(x) Perl_warn_nocontext x */
421
422
423#define Set_Node_Offset_To_R(node,byte) STMT_START { \
424 if (! SIZE_ONLY) { \
425 MJD_OFFSET_DEBUG(("** (%d) offset of node %d is %d.\n", \
426 __LINE__, (node), (byte))); \
427 if((node) < 0) { \
428 Perl_croak(aTHX_ "value of node is %d in Offset macro", node); \
429 } else { \
430 RExC_offsets[2*(node)-1] = (byte); \
431 } \
432 } \
433} STMT_END
434
435#define Set_Node_Offset(node,byte) \
436 Set_Node_Offset_To_R((node)-RExC_emit_start, (byte)-RExC_start)
437#define Set_Cur_Node_Offset Set_Node_Offset(RExC_emit, RExC_parse)
438
439#define Set_Node_Length_To_R(node,len) STMT_START { \
440 if (! SIZE_ONLY) { \
441 MJD_OFFSET_DEBUG(("** (%d) size of node %d is %d.\n", \
442 __LINE__, (node), (len))); \
443 if((node) < 0) { \
444 Perl_croak(aTHX_ "value of node is %d in Length macro", node); \
445 } else { \
446 RExC_offsets[2*(node)] = (len); \
447 } \
448 } \
449} STMT_END
450
451#define Set_Node_Length(node,len) \
452 Set_Node_Length_To_R((node)-RExC_emit_start, len)
453#define Set_Cur_Node_Length(len) Set_Node_Length(RExC_emit, len)
454#define Set_Node_Cur_Length(node) \
455 Set_Node_Length(node, RExC_parse - parse_start)
fac92740
MJD
456
457/* Get offsets and lengths */
458#define Node_Offset(n) (RExC_offsets[2*((n)-RExC_emit_start)-1])
459#define Node_Length(n) (RExC_offsets[2*((n)-RExC_emit_start)])
460
acfe0abc 461static void clear_re(pTHX_ void *r);
4327152a 462
653099ff
GS
463/* Mark that we cannot extend a found fixed substring at this point.
464 Updata the longest found anchored substring and the longest found
465 floating substrings if needed. */
466
4327152a 467STATIC void
830247a4 468S_scan_commit(pTHX_ RExC_state_t *pRExC_state, scan_data_t *data)
c277df42 469{
a0ed51b3
LW
470 STRLEN l = CHR_SVLEN(data->last_found);
471 STRLEN old_l = CHR_SVLEN(*data->longest);
b81d288d 472
c277df42
IZ
473 if ((l >= old_l) && ((l > old_l) || (data->flags & SF_BEFORE_EOL))) {
474 sv_setsv(*data->longest, data->last_found);
475 if (*data->longest == data->longest_fixed) {
476 data->offset_fixed = l ? data->last_start_min : data->pos_min;
477 if (data->flags & SF_BEFORE_EOL)
b81d288d 478 data->flags
c277df42
IZ
479 |= ((data->flags & SF_BEFORE_EOL) << SF_FIX_SHIFT_EOL);
480 else
481 data->flags &= ~SF_FIX_BEFORE_EOL;
a0ed51b3
LW
482 }
483 else {
c277df42 484 data->offset_float_min = l ? data->last_start_min : data->pos_min;
b81d288d
AB
485 data->offset_float_max = (l
486 ? data->last_start_max
c277df42 487 : data->pos_min + data->pos_delta);
9051bda5
HS
488 if ((U32)data->offset_float_max > (U32)I32_MAX)
489 data->offset_float_max = I32_MAX;
c277df42 490 if (data->flags & SF_BEFORE_EOL)
b81d288d 491 data->flags
c277df42
IZ
492 |= ((data->flags & SF_BEFORE_EOL) << SF_FL_SHIFT_EOL);
493 else
494 data->flags &= ~SF_FL_BEFORE_EOL;
495 }
496 }
497 SvCUR_set(data->last_found, 0);
498 data->last_end = -1;
499 data->flags &= ~SF_BEFORE_EOL;
500}
501
653099ff
GS
502/* Can match anything (initialization) */
503STATIC void
830247a4 504S_cl_anything(pTHX_ RExC_state_t *pRExC_state, struct regnode_charclass_class *cl)
653099ff 505{
653099ff 506 ANYOF_CLASS_ZERO(cl);
f8bef550 507 ANYOF_BITMAP_SETALL(cl);
1aa99e6b 508 cl->flags = ANYOF_EOS|ANYOF_UNICODE_ALL;
653099ff
GS
509 if (LOC)
510 cl->flags |= ANYOF_LOCALE;
511}
512
513/* Can match anything (initialization) */
514STATIC int
515S_cl_is_anything(pTHX_ struct regnode_charclass_class *cl)
516{
517 int value;
518
aaa51d5e 519 for (value = 0; value <= ANYOF_MAX; value += 2)
653099ff
GS
520 if (ANYOF_CLASS_TEST(cl, value) && ANYOF_CLASS_TEST(cl, value + 1))
521 return 1;
1aa99e6b
IH
522 if (!(cl->flags & ANYOF_UNICODE_ALL))
523 return 0;
f8bef550
NC
524 if (!ANYOF_BITMAP_TESTALLSET(cl))
525 return 0;
653099ff
GS
526 return 1;
527}
528
529/* Can match anything (initialization) */
530STATIC void
830247a4 531S_cl_init(pTHX_ RExC_state_t *pRExC_state, struct regnode_charclass_class *cl)
653099ff 532{
8ecf7187 533 Zero(cl, 1, struct regnode_charclass_class);
653099ff 534 cl->type = ANYOF;
830247a4 535 cl_anything(pRExC_state, cl);
653099ff
GS
536}
537
538STATIC void
830247a4 539S_cl_init_zero(pTHX_ RExC_state_t *pRExC_state, struct regnode_charclass_class *cl)
653099ff 540{
8ecf7187 541 Zero(cl, 1, struct regnode_charclass_class);
653099ff 542 cl->type = ANYOF;
830247a4 543 cl_anything(pRExC_state, cl);
653099ff
GS
544 if (LOC)
545 cl->flags |= ANYOF_LOCALE;
546}
547
548/* 'And' a given class with another one. Can create false positives */
549/* We assume that cl is not inverted */
550STATIC void
551S_cl_and(pTHX_ struct regnode_charclass_class *cl,
552 struct regnode_charclass_class *and_with)
553{
653099ff
GS
554 if (!(and_with->flags & ANYOF_CLASS)
555 && !(cl->flags & ANYOF_CLASS)
556 && (and_with->flags & ANYOF_LOCALE) == (cl->flags & ANYOF_LOCALE)
557 && !(and_with->flags & ANYOF_FOLD)
558 && !(cl->flags & ANYOF_FOLD)) {
559 int i;
560
561 if (and_with->flags & ANYOF_INVERT)
562 for (i = 0; i < ANYOF_BITMAP_SIZE; i++)
563 cl->bitmap[i] &= ~and_with->bitmap[i];
564 else
565 for (i = 0; i < ANYOF_BITMAP_SIZE; i++)
566 cl->bitmap[i] &= and_with->bitmap[i];
567 } /* XXXX: logic is complicated otherwise, leave it along for a moment. */
568 if (!(and_with->flags & ANYOF_EOS))
569 cl->flags &= ~ANYOF_EOS;
1aa99e6b
IH
570
571 if (cl->flags & ANYOF_UNICODE_ALL && and_with->flags & ANYOF_UNICODE) {
572 cl->flags &= ~ANYOF_UNICODE_ALL;
573 cl->flags |= ANYOF_UNICODE;
574 ARG_SET(cl, ARG(and_with));
575 }
576 if (!(and_with->flags & ANYOF_UNICODE_ALL))
577 cl->flags &= ~ANYOF_UNICODE_ALL;
578 if (!(and_with->flags & (ANYOF_UNICODE|ANYOF_UNICODE_ALL)))
579 cl->flags &= ~ANYOF_UNICODE;
653099ff
GS
580}
581
582/* 'OR' a given class with another one. Can create false positives */
583/* We assume that cl is not inverted */
584STATIC void
830247a4 585S_cl_or(pTHX_ RExC_state_t *pRExC_state, struct regnode_charclass_class *cl, struct regnode_charclass_class *or_with)
653099ff 586{
653099ff
GS
587 if (or_with->flags & ANYOF_INVERT) {
588 /* We do not use
589 * (B1 | CL1) | (!B2 & !CL2) = (B1 | !B2 & !CL2) | (CL1 | (!B2 & !CL2))
590 * <= (B1 | !B2) | (CL1 | !CL2)
591 * which is wasteful if CL2 is small, but we ignore CL2:
592 * (B1 | CL1) | (!B2 & !CL2) <= (B1 | CL1) | !B2 = (B1 | !B2) | CL1
593 * XXXX Can we handle case-fold? Unclear:
594 * (OK1(i) | OK1(i')) | !(OK1(i) | OK1(i')) =
595 * (OK1(i) | OK1(i')) | (!OK1(i) & !OK1(i'))
596 */
597 if ( (or_with->flags & ANYOF_LOCALE) == (cl->flags & ANYOF_LOCALE)
598 && !(or_with->flags & ANYOF_FOLD)
599 && !(cl->flags & ANYOF_FOLD) ) {
600 int i;
601
602 for (i = 0; i < ANYOF_BITMAP_SIZE; i++)
603 cl->bitmap[i] |= ~or_with->bitmap[i];
604 } /* XXXX: logic is complicated otherwise */
605 else {
830247a4 606 cl_anything(pRExC_state, cl);
653099ff
GS
607 }
608 } else {
609 /* (B1 | CL1) | (B2 | CL2) = (B1 | B2) | (CL1 | CL2)) */
610 if ( (or_with->flags & ANYOF_LOCALE) == (cl->flags & ANYOF_LOCALE)
b81d288d 611 && (!(or_with->flags & ANYOF_FOLD)
653099ff
GS
612 || (cl->flags & ANYOF_FOLD)) ) {
613 int i;
614
615 /* OR char bitmap and class bitmap separately */
616 for (i = 0; i < ANYOF_BITMAP_SIZE; i++)
617 cl->bitmap[i] |= or_with->bitmap[i];
618 if (or_with->flags & ANYOF_CLASS) {
619 for (i = 0; i < ANYOF_CLASSBITMAP_SIZE; i++)
620 cl->classflags[i] |= or_with->classflags[i];
621 cl->flags |= ANYOF_CLASS;
622 }
623 }
624 else { /* XXXX: logic is complicated, leave it along for a moment. */
830247a4 625 cl_anything(pRExC_state, cl);
653099ff
GS
626 }
627 }
628 if (or_with->flags & ANYOF_EOS)
629 cl->flags |= ANYOF_EOS;
1aa99e6b
IH
630
631 if (cl->flags & ANYOF_UNICODE && or_with->flags & ANYOF_UNICODE &&
632 ARG(cl) != ARG(or_with)) {
633 cl->flags |= ANYOF_UNICODE_ALL;
634 cl->flags &= ~ANYOF_UNICODE;
635 }
636 if (or_with->flags & ANYOF_UNICODE_ALL) {
637 cl->flags |= ANYOF_UNICODE_ALL;
638 cl->flags &= ~ANYOF_UNICODE;
639 }
653099ff
GS
640}
641
5d1c421c
JH
642/*
643 * There are strange code-generation bugs caused on sparc64 by gcc-2.95.2.
644 * These need to be revisited when a newer toolchain becomes available.
645 */
646#if defined(__sparc64__) && defined(__GNUC__)
647# if __GNUC__ < 2 || (__GNUC__ == 2 && __GNUC_MINOR__ < 96)
648# undef SPARC64_GCC_WORKAROUND
649# define SPARC64_GCC_WORKAROUND 1
650# endif
651#endif
652
653099ff
GS
653/* REx optimizer. Converts nodes into quickier variants "in place".
654 Finds fixed substrings. */
655
c277df42
IZ
656/* Stops at toplevel WHILEM as well as at `last'. At end *scanp is set
657 to the position after last scanned or to NULL. */
658
76e3520e 659STATIC I32
830247a4 660S_study_chunk(pTHX_ RExC_state_t *pRExC_state, regnode **scanp, I32 *deltap, regnode *last, scan_data_t *data, U32 flags)
c277df42
IZ
661 /* scanp: Start here (read-write). */
662 /* deltap: Write maxlen-minlen here. */
663 /* last: Stop before this one. */
664{
665 I32 min = 0, pars = 0, code;
666 regnode *scan = *scanp, *next;
667 I32 delta = 0;
668 int is_inf = (flags & SCF_DO_SUBSTR) && (data->flags & SF_IS_INF);
aca2d497 669 int is_inf_internal = 0; /* The studied chunk is infinite */
c277df42
IZ
670 I32 is_par = OP(scan) == OPEN ? ARG(scan) : 0;
671 scan_data_t data_fake;
653099ff 672 struct regnode_charclass_class and_with; /* Valid if flags & SCF_DO_STCLASS_OR */
b81d288d 673
c277df42
IZ
674 while (scan && OP(scan) != END && scan < last) {
675 /* Peephole optimizer: */
676
22c35a8c 677 if (PL_regkind[(U8)OP(scan)] == EXACT) {
653099ff 678 /* Merge several consecutive EXACTish nodes into one. */
c277df42
IZ
679 regnode *n = regnext(scan);
680 U32 stringok = 1;
681#ifdef DEBUGGING
682 regnode *stop = scan;
b81d288d 683#endif
c277df42 684
cd439c50 685 next = scan + NODE_SZ_STR(scan);
c277df42
IZ
686 /* Skip NOTHING, merge EXACT*. */
687 while (n &&
b81d288d 688 ( PL_regkind[(U8)OP(n)] == NOTHING ||
c277df42
IZ
689 (stringok && (OP(n) == OP(scan))))
690 && NEXT_OFF(n)
691 && NEXT_OFF(scan) + NEXT_OFF(n) < I16_MAX) {
692 if (OP(n) == TAIL || n > next)
693 stringok = 0;
22c35a8c 694 if (PL_regkind[(U8)OP(n)] == NOTHING) {
c277df42
IZ
695 NEXT_OFF(scan) += NEXT_OFF(n);
696 next = n + NODE_STEP_REGNODE;
697#ifdef DEBUGGING
698 if (stringok)
699 stop = n;
b81d288d 700#endif
c277df42 701 n = regnext(n);
a0ed51b3 702 }
f49d4d0f 703 else if (stringok) {
cd439c50 704 int oldl = STR_LEN(scan);
c277df42 705 regnode *nnext = regnext(n);
f49d4d0f 706
b81d288d 707 if (oldl + STR_LEN(n) > U8_MAX)
c277df42
IZ
708 break;
709 NEXT_OFF(scan) += NEXT_OFF(n);
cd439c50
IZ
710 STR_LEN(scan) += STR_LEN(n);
711 next = n + NODE_SZ_STR(n);
c277df42 712 /* Now we can overwrite *n : */
f49d4d0f 713 Move(STRING(n), STRING(scan) + oldl, STR_LEN(n), char);
c277df42 714#ifdef DEBUGGING
f49d4d0f 715 stop = next - 1;
b81d288d 716#endif
c277df42
IZ
717 n = nnext;
718 }
719 }
61a36c01 720
d65e4eab 721 if (UTF && OP(scan) == EXACTF && STR_LEN(scan) >= 6) {
61a36c01
JH
722/*
723 Two problematic code points in Unicode casefolding of EXACT nodes:
724
725 U+0390 - GREEK SMALL LETTER IOTA WITH DIALYTIKA AND TONOS
726 U+03B0 - GREEK SMALL LETTER UPSILON WITH DIALYTIKA AND TONOS
727
728 which casefold to
729
730 Unicode UTF-8
731
732 U+03B9 U+0308 U+0301 0xCE 0xB9 0xCC 0x88 0xCC 0x81
733 U+03C5 U+0308 U+0301 0xCF 0x85 0xCC 0x88 0xCC 0x81
734
735 This means that in case-insensitive matching (or "loose matching",
736 as Unicode calls it), an EXACTF of length six (the UTF-8 encoded byte
737 length of the above casefolded versions) can match a target string
738 of length two (the byte length of UTF-8 encoded U+0390 or U+03B0).
739 This would rather mess up the minimum length computation.
740
741 What we'll do is to look for the tail four bytes, and then peek
742 at the preceding two bytes to see whether we need to decrease
743 the minimum length by four (six minus two).
744
745 Thanks to the design of UTF-8, there cannot be false matches:
746 A sequence of valid UTF-8 bytes cannot be a subsequence of
747 another valid sequence of UTF-8 bytes.
748
749*/
750 char *s0 = STRING(scan), *s, *t;
751 char *s1 = s0 + STR_LEN(scan) - 1, *s2 = s1 - 4;
752 char *t0 = "\xcc\x88\xcc\x81";
753 char *t1 = t0 + 3;
754
755 for (s = s0 + 2;
756 s < s2 && (t = ninstr(s, s1, t0, t1));
757 s = t + 4) {
758 if (((U8)t[-1] == 0xB9 && (U8)t[-2] == 0xCE) ||
759 ((U8)t[-1] == 0x85 && (U8)t[-2] == 0xCF))
760 min -= 4;
761 }
762 }
763
c277df42
IZ
764#ifdef DEBUGGING
765 /* Allow dumping */
cd439c50 766 n = scan + NODE_SZ_STR(scan);
c277df42 767 while (n <= stop) {
22c35a8c 768 if (PL_regkind[(U8)OP(n)] != NOTHING || OP(n) == NOTHING) {
c277df42
IZ
769 OP(n) = OPTIMIZED;
770 NEXT_OFF(n) = 0;
771 }
772 n++;
773 }
653099ff 774#endif
c277df42 775 }
653099ff
GS
776 /* Follow the next-chain of the current node and optimize
777 away all the NOTHINGs from it. */
c277df42 778 if (OP(scan) != CURLYX) {
048cfca1
GS
779 int max = (reg_off_by_arg[OP(scan)]
780 ? I32_MAX
781 /* I32 may be smaller than U16 on CRAYs! */
782 : (I32_MAX < U16_MAX ? I32_MAX : U16_MAX));
c277df42
IZ
783 int off = (reg_off_by_arg[OP(scan)] ? ARG(scan) : NEXT_OFF(scan));
784 int noff;
785 regnode *n = scan;
b81d288d 786
c277df42
IZ
787 /* Skip NOTHING and LONGJMP. */
788 while ((n = regnext(n))
22c35a8c 789 && ((PL_regkind[(U8)OP(n)] == NOTHING && (noff = NEXT_OFF(n)))
c277df42
IZ
790 || ((OP(n) == LONGJMP) && (noff = ARG(n))))
791 && off + noff < max)
792 off += noff;
793 if (reg_off_by_arg[OP(scan)])
794 ARG(scan) = off;
b81d288d 795 else
c277df42
IZ
796 NEXT_OFF(scan) = off;
797 }
653099ff
GS
798 /* The principal pseudo-switch. Cannot be a switch, since we
799 look into several different things. */
b81d288d 800 if (OP(scan) == BRANCH || OP(scan) == BRANCHJ
c277df42
IZ
801 || OP(scan) == IFTHEN || OP(scan) == SUSPEND) {
802 next = regnext(scan);
803 code = OP(scan);
b81d288d
AB
804
805 if (OP(next) == code || code == IFTHEN || code == SUSPEND) {
c277df42 806 I32 max1 = 0, min1 = I32_MAX, num = 0;
653099ff 807 struct regnode_charclass_class accum;
c277df42 808
653099ff 809 if (flags & SCF_DO_SUBSTR) /* XXXX Add !SUSPEND? */
830247a4 810 scan_commit(pRExC_state, data); /* Cannot merge strings after this. */
653099ff 811 if (flags & SCF_DO_STCLASS)
830247a4 812 cl_init_zero(pRExC_state, &accum);
c277df42 813 while (OP(scan) == code) {
830247a4 814 I32 deltanext, minnext, f = 0, fake;
653099ff 815 struct regnode_charclass_class this_class;
c277df42
IZ
816
817 num++;
818 data_fake.flags = 0;
b81d288d 819 if (data) {
2c2d71f5 820 data_fake.whilem_c = data->whilem_c;
cb434fcc
IZ
821 data_fake.last_closep = data->last_closep;
822 }
823 else
824 data_fake.last_closep = &fake;
c277df42
IZ
825 next = regnext(scan);
826 scan = NEXTOPER(scan);
827 if (code != BRANCH)
828 scan = NEXTOPER(scan);
653099ff 829 if (flags & SCF_DO_STCLASS) {
830247a4 830 cl_init(pRExC_state, &this_class);
653099ff
GS
831 data_fake.start_class = &this_class;
832 f = SCF_DO_STCLASS_AND;
b81d288d 833 }
e1901655
IZ
834 if (flags & SCF_WHILEM_VISITED_POS)
835 f |= SCF_WHILEM_VISITED_POS;
653099ff 836 /* we suppose the run is continuous, last=next...*/
830247a4
IZ
837 minnext = study_chunk(pRExC_state, &scan, &deltanext,
838 next, &data_fake, f);
b81d288d 839 if (min1 > minnext)
c277df42
IZ
840 min1 = minnext;
841 if (max1 < minnext + deltanext)
842 max1 = minnext + deltanext;
843 if (deltanext == I32_MAX)
aca2d497 844 is_inf = is_inf_internal = 1;
c277df42
IZ
845 scan = next;
846 if (data_fake.flags & (SF_HAS_PAR|SF_IN_PAR))
847 pars++;
405ff068 848 if (data && (data_fake.flags & SF_HAS_EVAL))
c277df42 849 data->flags |= SF_HAS_EVAL;
2c2d71f5
JH
850 if (data)
851 data->whilem_c = data_fake.whilem_c;
653099ff 852 if (flags & SCF_DO_STCLASS)
830247a4 853 cl_or(pRExC_state, &accum, &this_class);
b81d288d 854 if (code == SUSPEND)
c277df42
IZ
855 break;
856 }
857 if (code == IFTHEN && num < 2) /* Empty ELSE branch */
858 min1 = 0;
859 if (flags & SCF_DO_SUBSTR) {
860 data->pos_min += min1;
861 data->pos_delta += max1 - min1;
862 if (max1 != min1 || is_inf)
863 data->longest = &(data->longest_float);
864 }
865 min += min1;
866 delta += max1 - min1;
653099ff 867 if (flags & SCF_DO_STCLASS_OR) {
830247a4 868 cl_or(pRExC_state, data->start_class, &accum);
653099ff
GS
869 if (min1) {
870 cl_and(data->start_class, &and_with);
871 flags &= ~SCF_DO_STCLASS;
872 }
873 }
874 else if (flags & SCF_DO_STCLASS_AND) {
de0c8cb8
GS
875 if (min1) {
876 cl_and(data->start_class, &accum);
653099ff 877 flags &= ~SCF_DO_STCLASS;
de0c8cb8
GS
878 }
879 else {
b81d288d 880 /* Switch to OR mode: cache the old value of
de0c8cb8
GS
881 * data->start_class */
882 StructCopy(data->start_class, &and_with,
883 struct regnode_charclass_class);
884 flags &= ~SCF_DO_STCLASS_AND;
885 StructCopy(&accum, data->start_class,
886 struct regnode_charclass_class);
887 flags |= SCF_DO_STCLASS_OR;
888 data->start_class->flags |= ANYOF_EOS;
889 }
653099ff 890 }
a0ed51b3
LW
891 }
892 else if (code == BRANCHJ) /* single branch is optimized. */
c277df42
IZ
893 scan = NEXTOPER(NEXTOPER(scan));
894 else /* single branch is optimized. */
895 scan = NEXTOPER(scan);
896 continue;
a0ed51b3
LW
897 }
898 else if (OP(scan) == EXACT) {
cd439c50 899 I32 l = STR_LEN(scan);
1aa99e6b 900 UV uc = *((U8*)STRING(scan));
a0ed51b3 901 if (UTF) {
1aa99e6b
IH
902 U8 *s = (U8*)STRING(scan);
903 l = utf8_length(s, s + l);
9041c2e3 904 uc = utf8_to_uvchr(s, NULL);
a0ed51b3
LW
905 }
906 min += l;
c277df42 907 if (flags & SCF_DO_SUBSTR) { /* Update longest substr. */
c277df42
IZ
908 /* The code below prefers earlier match for fixed
909 offset, later match for variable offset. */
910 if (data->last_end == -1) { /* Update the start info. */
911 data->last_start_min = data->pos_min;
912 data->last_start_max = is_inf
b81d288d 913 ? I32_MAX : data->pos_min + data->pos_delta;
c277df42 914 }
cd439c50 915 sv_catpvn(data->last_found, STRING(scan), STR_LEN(scan));
33b8afdf
JH
916 if (UTF)
917 SvUTF8_on(data->last_found);
c277df42
IZ
918 data->last_end = data->pos_min + l;
919 data->pos_min += l; /* As in the first entry. */
920 data->flags &= ~SF_BEFORE_EOL;
921 }
653099ff
GS
922 if (flags & SCF_DO_STCLASS_AND) {
923 /* Check whether it is compatible with what we know already! */
924 int compat = 1;
925
1aa99e6b 926 if (uc >= 0x100 ||
516a5887 927 (!(data->start_class->flags & (ANYOF_CLASS | ANYOF_LOCALE))
1aa99e6b 928 && !ANYOF_BITMAP_TEST(data->start_class, uc)
653099ff 929 && (!(data->start_class->flags & ANYOF_FOLD)
1aa99e6b 930 || !ANYOF_BITMAP_TEST(data->start_class, PL_fold[uc])))
516a5887 931 )
653099ff
GS
932 compat = 0;
933 ANYOF_CLASS_ZERO(data->start_class);
934 ANYOF_BITMAP_ZERO(data->start_class);
935 if (compat)
1aa99e6b 936 ANYOF_BITMAP_SET(data->start_class, uc);
653099ff 937 data->start_class->flags &= ~ANYOF_EOS;
9b877dbb
IH
938 if (uc < 0x100)
939 data->start_class->flags &= ~ANYOF_UNICODE_ALL;
653099ff
GS
940 }
941 else if (flags & SCF_DO_STCLASS_OR) {
942 /* false positive possible if the class is case-folded */
1aa99e6b 943 if (uc < 0x100)
9b877dbb
IH
944 ANYOF_BITMAP_SET(data->start_class, uc);
945 else
946 data->start_class->flags |= ANYOF_UNICODE_ALL;
653099ff
GS
947 data->start_class->flags &= ~ANYOF_EOS;
948 cl_and(data->start_class, &and_with);
949 }
950 flags &= ~SCF_DO_STCLASS;
a0ed51b3 951 }
653099ff 952 else if (PL_regkind[(U8)OP(scan)] == EXACT) { /* But OP != EXACT! */
cd439c50 953 I32 l = STR_LEN(scan);
1aa99e6b 954 UV uc = *((U8*)STRING(scan));
653099ff
GS
955
956 /* Search for fixed substrings supports EXACT only. */
b81d288d 957 if (flags & SCF_DO_SUBSTR)
830247a4 958 scan_commit(pRExC_state, data);
a0ed51b3 959 if (UTF) {
1aa99e6b
IH
960 U8 *s = (U8 *)STRING(scan);
961 l = utf8_length(s, s + l);
9041c2e3 962 uc = utf8_to_uvchr(s, NULL);
a0ed51b3
LW
963 }
964 min += l;
c277df42 965 if (data && (flags & SCF_DO_SUBSTR))
a0ed51b3 966 data->pos_min += l;
653099ff
GS
967 if (flags & SCF_DO_STCLASS_AND) {
968 /* Check whether it is compatible with what we know already! */
969 int compat = 1;
970
1aa99e6b 971 if (uc >= 0x100 ||
516a5887 972 (!(data->start_class->flags & (ANYOF_CLASS | ANYOF_LOCALE))
1aa99e6b 973 && !ANYOF_BITMAP_TEST(data->start_class, uc)
516a5887 974 && !ANYOF_BITMAP_TEST(data->start_class, PL_fold[uc])))
653099ff
GS
975 compat = 0;
976 ANYOF_CLASS_ZERO(data->start_class);
977 ANYOF_BITMAP_ZERO(data->start_class);
978 if (compat) {
1aa99e6b 979 ANYOF_BITMAP_SET(data->start_class, uc);
653099ff
GS
980 data->start_class->flags &= ~ANYOF_EOS;
981 data->start_class->flags |= ANYOF_FOLD;
982 if (OP(scan) == EXACTFL)
983 data->start_class->flags |= ANYOF_LOCALE;
984 }
985 }
986 else if (flags & SCF_DO_STCLASS_OR) {
987 if (data->start_class->flags & ANYOF_FOLD) {
988 /* false positive possible if the class is case-folded.
989 Assume that the locale settings are the same... */
1aa99e6b
IH
990 if (uc < 0x100)
991 ANYOF_BITMAP_SET(data->start_class, uc);
653099ff
GS
992 data->start_class->flags &= ~ANYOF_EOS;
993 }
994 cl_and(data->start_class, &and_with);
995 }
996 flags &= ~SCF_DO_STCLASS;
a0ed51b3 997 }
4d61ec05 998 else if (strchr((char*)PL_varies,OP(scan))) {
9c5ffd7c 999 I32 mincount, maxcount, minnext, deltanext, fl = 0;
aa7a4b56 1000 I32 f = flags, pos_before = 0;
c277df42 1001 regnode *oscan = scan;
653099ff
GS
1002 struct regnode_charclass_class this_class;
1003 struct regnode_charclass_class *oclass = NULL;
727f22e3 1004 I32 next_is_eval = 0;
653099ff 1005
22c35a8c 1006 switch (PL_regkind[(U8)OP(scan)]) {
653099ff 1007 case WHILEM: /* End of (?:...)* . */
c277df42
IZ
1008 scan = NEXTOPER(scan);
1009 goto finish;
1010 case PLUS:
653099ff 1011 if (flags & (SCF_DO_SUBSTR | SCF_DO_STCLASS)) {
c277df42 1012 next = NEXTOPER(scan);
653099ff 1013 if (OP(next) == EXACT || (flags & SCF_DO_STCLASS)) {
b81d288d
AB
1014 mincount = 1;
1015 maxcount = REG_INFTY;
c277df42
IZ
1016 next = regnext(scan);
1017 scan = NEXTOPER(scan);
1018 goto do_curly;
1019 }
1020 }
1021 if (flags & SCF_DO_SUBSTR)
1022 data->pos_min++;
1023 min++;
1024 /* Fall through. */
1025 case STAR:
653099ff
GS
1026 if (flags & SCF_DO_STCLASS) {
1027 mincount = 0;
b81d288d 1028 maxcount = REG_INFTY;
653099ff
GS
1029 next = regnext(scan);
1030 scan = NEXTOPER(scan);
1031 goto do_curly;
1032 }
b81d288d 1033 is_inf = is_inf_internal = 1;
c277df42
IZ
1034 scan = regnext(scan);
1035 if (flags & SCF_DO_SUBSTR) {
830247a4 1036 scan_commit(pRExC_state, data); /* Cannot extend fixed substrings */
c277df42
IZ
1037 data->longest = &(data->longest_float);
1038 }
1039 goto optimize_curly_tail;
1040 case CURLY:
b81d288d 1041 mincount = ARG1(scan);
c277df42
IZ
1042 maxcount = ARG2(scan);
1043 next = regnext(scan);
cb434fcc
IZ
1044 if (OP(scan) == CURLYX) {
1045 I32 lp = (data ? *(data->last_closep) : 0);
1046
1047 scan->flags = ((lp <= U8_MAX) ? lp : U8_MAX);
1048 }
c277df42 1049 scan = NEXTOPER(scan) + EXTRA_STEP_2ARGS;
727f22e3 1050 next_is_eval = (OP(scan) == EVAL);
c277df42
IZ
1051 do_curly:
1052 if (flags & SCF_DO_SUBSTR) {
830247a4 1053 if (mincount == 0) scan_commit(pRExC_state,data); /* Cannot extend fixed substrings */
c277df42
IZ
1054 pos_before = data->pos_min;
1055 }
1056 if (data) {
1057 fl = data->flags;
1058 data->flags &= ~(SF_HAS_PAR|SF_IN_PAR|SF_HAS_EVAL);
1059 if (is_inf)
1060 data->flags |= SF_IS_INF;
1061 }
653099ff 1062 if (flags & SCF_DO_STCLASS) {
830247a4 1063 cl_init(pRExC_state, &this_class);
653099ff
GS
1064 oclass = data->start_class;
1065 data->start_class = &this_class;
1066 f |= SCF_DO_STCLASS_AND;
1067 f &= ~SCF_DO_STCLASS_OR;
1068 }
e1901655
IZ
1069 /* These are the cases when once a subexpression
1070 fails at a particular position, it cannot succeed
1071 even after backtracking at the enclosing scope.
b81d288d 1072
e1901655
IZ
1073 XXXX what if minimal match and we are at the
1074 initial run of {n,m}? */
1075 if ((mincount != maxcount - 1) && (maxcount != REG_INFTY))
1076 f &= ~SCF_WHILEM_VISITED_POS;
653099ff 1077
c277df42 1078 /* This will finish on WHILEM, setting scan, or on NULL: */
b81d288d
AB
1079 minnext = study_chunk(pRExC_state, &scan, &deltanext, last, data,
1080 mincount == 0
653099ff
GS
1081 ? (f & ~SCF_DO_SUBSTR) : f);
1082
1083 if (flags & SCF_DO_STCLASS)
1084 data->start_class = oclass;
1085 if (mincount == 0 || minnext == 0) {
1086 if (flags & SCF_DO_STCLASS_OR) {
830247a4 1087 cl_or(pRExC_state, data->start_class, &this_class);
653099ff
GS
1088 }
1089 else if (flags & SCF_DO_STCLASS_AND) {
b81d288d 1090 /* Switch to OR mode: cache the old value of
653099ff
GS
1091 * data->start_class */
1092 StructCopy(data->start_class, &and_with,
1093 struct regnode_charclass_class);
1094 flags &= ~SCF_DO_STCLASS_AND;
1095 StructCopy(&this_class, data->start_class,
1096 struct regnode_charclass_class);
1097 flags |= SCF_DO_STCLASS_OR;
1098 data->start_class->flags |= ANYOF_EOS;
1099 }
1100 } else { /* Non-zero len */
1101 if (flags & SCF_DO_STCLASS_OR) {
830247a4 1102 cl_or(pRExC_state, data->start_class, &this_class);
653099ff
GS
1103 cl_and(data->start_class, &and_with);
1104 }
1105 else if (flags & SCF_DO_STCLASS_AND)
1106 cl_and(data->start_class, &this_class);
1107 flags &= ~SCF_DO_STCLASS;
1108 }
c277df42
IZ
1109 if (!scan) /* It was not CURLYX, but CURLY. */
1110 scan = next;
84037bb0 1111 if (ckWARN(WARN_REGEXP)
727f22e3
JP
1112 /* ? quantifier ok, except for (?{ ... }) */
1113 && (next_is_eval || !(mincount == 0 && maxcount == 1))
84037bb0 1114 && (minnext == 0) && (deltanext == 0)
99799961 1115 && data && !(data->flags & (SF_HAS_PAR|SF_IN_PAR))
17feb5d5 1116 && maxcount <= REG_INFTY/3) /* Complement check for big count */
b45f050a 1117 {
830247a4 1118 vWARN(RExC_parse,
b45f050a
JF
1119 "Quantifier unexpected on zero-length expression");
1120 }
1121
c277df42 1122 min += minnext * mincount;
b81d288d 1123 is_inf_internal |= ((maxcount == REG_INFTY
155aba94
GS
1124 && (minnext + deltanext) > 0)
1125 || deltanext == I32_MAX);
aca2d497 1126 is_inf |= is_inf_internal;
c277df42
IZ
1127 delta += (minnext + deltanext) * maxcount - minnext * mincount;
1128
1129 /* Try powerful optimization CURLYX => CURLYN. */
b81d288d 1130 if ( OP(oscan) == CURLYX && data
c277df42
IZ
1131 && data->flags & SF_IN_PAR
1132 && !(data->flags & SF_HAS_EVAL)
1133 && !deltanext && minnext == 1 ) {
1134 /* Try to optimize to CURLYN. */
1135 regnode *nxt = NEXTOPER(oscan) + EXTRA_STEP_2ARGS;
497b47a8
JH
1136 regnode *nxt1 = nxt;
1137#ifdef DEBUGGING
1138 regnode *nxt2;
1139#endif
c277df42
IZ
1140
1141 /* Skip open. */
1142 nxt = regnext(nxt);
4d61ec05 1143 if (!strchr((char*)PL_simple,OP(nxt))
22c35a8c 1144 && !(PL_regkind[(U8)OP(nxt)] == EXACT
b81d288d 1145 && STR_LEN(nxt) == 1))
c277df42 1146 goto nogo;
497b47a8 1147#ifdef DEBUGGING
c277df42 1148 nxt2 = nxt;
497b47a8 1149#endif
c277df42 1150 nxt = regnext(nxt);
b81d288d 1151 if (OP(nxt) != CLOSE)
c277df42
IZ
1152 goto nogo;
1153 /* Now we know that nxt2 is the only contents: */
eb160463 1154 oscan->flags = (U8)ARG(nxt);
c277df42
IZ
1155 OP(oscan) = CURLYN;
1156 OP(nxt1) = NOTHING; /* was OPEN. */
1157#ifdef DEBUGGING
1158 OP(nxt1 + 1) = OPTIMIZED; /* was count. */
1159 NEXT_OFF(nxt1+ 1) = 0; /* just for consistancy. */
1160 NEXT_OFF(nxt2) = 0; /* just for consistancy with CURLY. */
1161 OP(nxt) = OPTIMIZED; /* was CLOSE. */
1162 OP(nxt + 1) = OPTIMIZED; /* was count. */
1163 NEXT_OFF(nxt+ 1) = 0; /* just for consistancy. */
b81d288d 1164#endif
c277df42 1165 }
c277df42
IZ
1166 nogo:
1167
1168 /* Try optimization CURLYX => CURLYM. */
b81d288d 1169 if ( OP(oscan) == CURLYX && data
c277df42 1170 && !(data->flags & SF_HAS_PAR)
c277df42
IZ
1171 && !(data->flags & SF_HAS_EVAL)
1172 && !deltanext ) {
1173 /* XXXX How to optimize if data == 0? */
1174 /* Optimize to a simpler form. */
1175 regnode *nxt = NEXTOPER(oscan) + EXTRA_STEP_2ARGS; /* OPEN */
1176 regnode *nxt2;
1177
1178 OP(oscan) = CURLYM;
1179 while ( (nxt2 = regnext(nxt)) /* skip over embedded stuff*/
b81d288d 1180 && (OP(nxt2) != WHILEM))
c277df42
IZ
1181 nxt = nxt2;
1182 OP(nxt2) = SUCCEED; /* Whas WHILEM */
c277df42
IZ
1183 /* Need to optimize away parenths. */
1184 if (data->flags & SF_IN_PAR) {
1185 /* Set the parenth number. */
1186 regnode *nxt1 = NEXTOPER(oscan) + EXTRA_STEP_2ARGS; /* OPEN*/
1187
b81d288d 1188 if (OP(nxt) != CLOSE)
b45f050a 1189 FAIL("Panic opt close");
eb160463 1190 oscan->flags = (U8)ARG(nxt);
c277df42
IZ
1191 OP(nxt1) = OPTIMIZED; /* was OPEN. */
1192 OP(nxt) = OPTIMIZED; /* was CLOSE. */
1193#ifdef DEBUGGING
1194 OP(nxt1 + 1) = OPTIMIZED; /* was count. */
1195 OP(nxt + 1) = OPTIMIZED; /* was count. */
1196 NEXT_OFF(nxt1 + 1) = 0; /* just for consistancy. */
1197 NEXT_OFF(nxt + 1) = 0; /* just for consistancy. */
b81d288d 1198#endif
c277df42
IZ
1199#if 0
1200 while ( nxt1 && (OP(nxt1) != WHILEM)) {
1201 regnode *nnxt = regnext(nxt1);
b81d288d 1202
c277df42
IZ
1203 if (nnxt == nxt) {
1204 if (reg_off_by_arg[OP(nxt1)])
1205 ARG_SET(nxt1, nxt2 - nxt1);
1206 else if (nxt2 - nxt1 < U16_MAX)
1207 NEXT_OFF(nxt1) = nxt2 - nxt1;
1208 else
1209 OP(nxt) = NOTHING; /* Cannot beautify */
1210 }
1211 nxt1 = nnxt;
1212 }
1213#endif
1214 /* Optimize again: */
b81d288d 1215 study_chunk(pRExC_state, &nxt1, &deltanext, nxt,
e1901655 1216 NULL, 0);
a0ed51b3
LW
1217 }
1218 else
c277df42 1219 oscan->flags = 0;
c277df42 1220 }
e1901655
IZ
1221 else if ((OP(oscan) == CURLYX)
1222 && (flags & SCF_WHILEM_VISITED_POS)
1223 /* See the comment on a similar expression above.
1224 However, this time it not a subexpression
1225 we care about, but the expression itself. */
1226 && (maxcount == REG_INFTY)
1227 && data && ++data->whilem_c < 16) {
1228 /* This stays as CURLYX, we can put the count/of pair. */
2c2d71f5
JH
1229 /* Find WHILEM (as in regexec.c) */
1230 regnode *nxt = oscan + NEXT_OFF(oscan);
1231
1232 if (OP(PREVOPER(nxt)) == NOTHING) /* LONGJMP */
1233 nxt += ARG(nxt);
eb160463
GS
1234 PREVOPER(nxt)->flags = (U8)(data->whilem_c
1235 | (RExC_whilem_seen << 4)); /* On WHILEM */
2c2d71f5 1236 }
b81d288d 1237 if (data && fl & (SF_HAS_PAR|SF_IN_PAR))
c277df42
IZ
1238 pars++;
1239 if (flags & SCF_DO_SUBSTR) {
1240 SV *last_str = Nullsv;
1241 int counted = mincount != 0;
1242
1243 if (data->last_end > 0 && mincount != 0) { /* Ends with a string. */
5d1c421c
JH
1244#if defined(SPARC64_GCC_WORKAROUND)
1245 I32 b = 0;
1246 STRLEN l = 0;
1247 char *s = NULL;
1248 I32 old = 0;
1249
1250 if (pos_before >= data->last_start_min)
1251 b = pos_before;
1252 else
1253 b = data->last_start_min;
1254
1255 l = 0;
1256 s = SvPV(data->last_found, l);
1257 old = b - data->last_start_min;
1258
1259#else
b81d288d 1260 I32 b = pos_before >= data->last_start_min
c277df42
IZ
1261 ? pos_before : data->last_start_min;
1262 STRLEN l;
1263 char *s = SvPV(data->last_found, l);
a0ed51b3 1264 I32 old = b - data->last_start_min;
5d1c421c 1265#endif
a0ed51b3
LW
1266
1267 if (UTF)
1268 old = utf8_hop((U8*)s, old) - (U8*)s;
c277df42 1269
a0ed51b3 1270 l -= old;
c277df42 1271 /* Get the added string: */
79cb57f6 1272 last_str = newSVpvn(s + old, l);
0e933229
IH
1273 if (UTF)
1274 SvUTF8_on(last_str);
c277df42
IZ
1275 if (deltanext == 0 && pos_before == b) {
1276 /* What was added is a constant string */
1277 if (mincount > 1) {
1278 SvGROW(last_str, (mincount * l) + 1);
b81d288d 1279 repeatcpy(SvPVX(last_str) + l,
c277df42
IZ
1280 SvPVX(last_str), l, mincount - 1);
1281 SvCUR(last_str) *= mincount;
1282 /* Add additional parts. */
b81d288d 1283 SvCUR_set(data->last_found,
c277df42
IZ
1284 SvCUR(data->last_found) - l);
1285 sv_catsv(data->last_found, last_str);
1286 data->last_end += l * (mincount - 1);
1287 }
2a8d9689
HS
1288 } else {
1289 /* start offset must point into the last copy */
1290 data->last_start_min += minnext * (mincount - 1);
c152dc43
HS
1291 data->last_start_max += is_inf ? I32_MAX
1292 : (maxcount - 1) * (minnext + data->pos_delta);
c277df42
IZ
1293 }
1294 }
1295 /* It is counted once already... */
1296 data->pos_min += minnext * (mincount - counted);
1297 data->pos_delta += - counted * deltanext +
1298 (minnext + deltanext) * maxcount - minnext * mincount;
1299 if (mincount != maxcount) {
653099ff
GS
1300 /* Cannot extend fixed substrings found inside
1301 the group. */
830247a4 1302 scan_commit(pRExC_state,data);
c277df42
IZ
1303 if (mincount && last_str) {
1304 sv_setsv(data->last_found, last_str);
1305 data->last_end = data->pos_min;
b81d288d 1306 data->last_start_min =
a0ed51b3 1307 data->pos_min - CHR_SVLEN(last_str);
b81d288d
AB
1308 data->last_start_max = is_inf
1309 ? I32_MAX
c277df42 1310 : data->pos_min + data->pos_delta
a0ed51b3 1311 - CHR_SVLEN(last_str);
c277df42
IZ
1312 }
1313 data->longest = &(data->longest_float);
1314 }
aca2d497 1315 SvREFCNT_dec(last_str);
c277df42 1316 }
405ff068 1317 if (data && (fl & SF_HAS_EVAL))
c277df42
IZ
1318 data->flags |= SF_HAS_EVAL;
1319 optimize_curly_tail:
c277df42 1320 if (OP(oscan) != CURLYX) {
22c35a8c 1321 while (PL_regkind[(U8)OP(next = regnext(oscan))] == NOTHING
c277df42
IZ
1322 && NEXT_OFF(next))
1323 NEXT_OFF(oscan) += NEXT_OFF(next);
1324 }
c277df42 1325 continue;
653099ff 1326 default: /* REF and CLUMP only? */
c277df42 1327 if (flags & SCF_DO_SUBSTR) {
830247a4 1328 scan_commit(pRExC_state,data); /* Cannot expect anything... */
c277df42
IZ
1329 data->longest = &(data->longest_float);
1330 }
aca2d497 1331 is_inf = is_inf_internal = 1;
653099ff 1332 if (flags & SCF_DO_STCLASS_OR)
830247a4 1333 cl_anything(pRExC_state, data->start_class);
653099ff 1334 flags &= ~SCF_DO_STCLASS;
c277df42
IZ
1335 break;
1336 }
a0ed51b3 1337 }
ffc61ed2 1338 else if (strchr((char*)PL_simple,OP(scan))) {
9c5ffd7c 1339 int value = 0;
653099ff 1340
c277df42 1341 if (flags & SCF_DO_SUBSTR) {
830247a4 1342 scan_commit(pRExC_state,data);
c277df42
IZ
1343 data->pos_min++;
1344 }
1345 min++;
653099ff
GS
1346 if (flags & SCF_DO_STCLASS) {
1347 data->start_class->flags &= ~ANYOF_EOS; /* No match on empty */
1348
1349 /* Some of the logic below assumes that switching
1350 locale on will only add false positives. */
1351 switch (PL_regkind[(U8)OP(scan)]) {
653099ff 1352 case SANY:
653099ff
GS
1353 default:
1354 do_default:
1355 /* Perl_croak(aTHX_ "panic: unexpected simple REx opcode %d", OP(scan)); */
1356 if (flags & SCF_DO_STCLASS_OR) /* Allow everything */
830247a4 1357 cl_anything(pRExC_state, data->start_class);
653099ff
GS
1358 break;
1359 case REG_ANY:
1360 if (OP(scan) == SANY)
1361 goto do_default;
1362 if (flags & SCF_DO_STCLASS_OR) { /* Everything but \n */
1363 value = (ANYOF_BITMAP_TEST(data->start_class,'\n')
1364 || (data->start_class->flags & ANYOF_CLASS));
830247a4 1365 cl_anything(pRExC_state, data->start_class);
653099ff
GS
1366 }
1367 if (flags & SCF_DO_STCLASS_AND || !value)
1368 ANYOF_BITMAP_CLEAR(data->start_class,'\n');
1369 break;
1370 case ANYOF:
1371 if (flags & SCF_DO_STCLASS_AND)
1372 cl_and(data->start_class,
1373 (struct regnode_charclass_class*)scan);
1374 else
830247a4 1375 cl_or(pRExC_state, data->start_class,
653099ff
GS
1376 (struct regnode_charclass_class*)scan);
1377 break;
1378 case ALNUM:
1379 if (flags & SCF_DO_STCLASS_AND) {
1380 if (!(data->start_class->flags & ANYOF_LOCALE)) {
1381 ANYOF_CLASS_CLEAR(data->start_class,ANYOF_NALNUM);
1382 for (value = 0; value < 256; value++)
1383 if (!isALNUM(value))
1384 ANYOF_BITMAP_CLEAR(data->start_class, value);
1385 }
1386 }
1387 else {
1388 if (data->start_class->flags & ANYOF_LOCALE)
1389 ANYOF_CLASS_SET(data->start_class,ANYOF_ALNUM);
1390 else {
1391 for (value = 0; value < 256; value++)
1392 if (isALNUM(value))
b81d288d 1393 ANYOF_BITMAP_SET(data->start_class, value);
653099ff
GS
1394 }
1395 }
1396 break;
1397 case ALNUML:
1398 if (flags & SCF_DO_STCLASS_AND) {
1399 if (data->start_class->flags & ANYOF_LOCALE)
1400 ANYOF_CLASS_CLEAR(data->start_class,ANYOF_NALNUM);
1401 }
1402 else {
1403 ANYOF_CLASS_SET(data->start_class,ANYOF_ALNUM);
1404 data->start_class->flags |= ANYOF_LOCALE;
1405 }
1406 break;
1407 case NALNUM:
1408 if (flags & SCF_DO_STCLASS_AND) {
1409 if (!(data->start_class->flags & ANYOF_LOCALE)) {
1410 ANYOF_CLASS_CLEAR(data->start_class,ANYOF_ALNUM);
1411 for (value = 0; value < 256; value++)
1412 if (isALNUM(value))
1413 ANYOF_BITMAP_CLEAR(data->start_class, value);
1414 }
1415 }
1416 else {
1417 if (data->start_class->flags & ANYOF_LOCALE)
1418 ANYOF_CLASS_SET(data->start_class,ANYOF_NALNUM);
1419 else {
1420 for (value = 0; value < 256; value++)
1421 if (!isALNUM(value))
b81d288d 1422 ANYOF_BITMAP_SET(data->start_class, value);
653099ff
GS
1423 }
1424 }
1425 break;
1426 case NALNUML:
1427 if (flags & SCF_DO_STCLASS_AND) {
1428 if (data->start_class->flags & ANYOF_LOCALE)
1429 ANYOF_CLASS_CLEAR(data->start_class,ANYOF_ALNUM);
1430 }
1431 else {
1432 data->start_class->flags |= ANYOF_LOCALE;
1433 ANYOF_CLASS_SET(data->start_class,ANYOF_NALNUM);
1434 }
1435 break;
1436 case SPACE:
1437 if (flags & SCF_DO_STCLASS_AND) {
1438 if (!(data->start_class->flags & ANYOF_LOCALE)) {
1439 ANYOF_CLASS_CLEAR(data->start_class,ANYOF_NSPACE);
1440 for (value = 0; value < 256; value++)
1441 if (!isSPACE(value))
1442 ANYOF_BITMAP_CLEAR(data->start_class, value);
1443 }
1444 }
1445 else {
1446 if (data->start_class->flags & ANYOF_LOCALE)
1447 ANYOF_CLASS_SET(data->start_class,ANYOF_SPACE);
1448 else {
1449 for (value = 0; value < 256; value++)
1450 if (isSPACE(value))
b81d288d 1451 ANYOF_BITMAP_SET(data->start_class, value);
653099ff
GS
1452 }
1453 }
1454 break;
1455 case SPACEL:
1456 if (flags & SCF_DO_STCLASS_AND) {
1457 if (data->start_class->flags & ANYOF_LOCALE)
1458 ANYOF_CLASS_CLEAR(data->start_class,ANYOF_NSPACE);
1459 }
1460 else {
1461 data->start_class->flags |= ANYOF_LOCALE;
1462 ANYOF_CLASS_SET(data->start_class,ANYOF_SPACE);
1463 }
1464 break;
1465 case NSPACE:
1466 if (flags & SCF_DO_STCLASS_AND) {
1467 if (!(data->start_class->flags & ANYOF_LOCALE)) {
1468 ANYOF_CLASS_CLEAR(data->start_class,ANYOF_SPACE);
1469 for (value = 0; value < 256; value++)
1470 if (isSPACE(value))
1471 ANYOF_BITMAP_CLEAR(data->start_class, value);
1472 }
1473 }
1474 else {
1475 if (data->start_class->flags & ANYOF_LOCALE)
1476 ANYOF_CLASS_SET(data->start_class,ANYOF_NSPACE);
1477 else {
1478 for (value = 0; value < 256; value++)
1479 if (!isSPACE(value))
b81d288d 1480 ANYOF_BITMAP_SET(data->start_class, value);
653099ff
GS
1481 }
1482 }
1483 break;
1484 case NSPACEL:
1485 if (flags & SCF_DO_STCLASS_AND) {
1486 if (data->start_class->flags & ANYOF_LOCALE) {
1487 ANYOF_CLASS_CLEAR(data->start_class,ANYOF_SPACE);
1488 for (value = 0; value < 256; value++)
1489 if (!isSPACE(value))
1490 ANYOF_BITMAP_CLEAR(data->start_class, value);
1491 }
1492 }
1493 else {
1494 data->start_class->flags |= ANYOF_LOCALE;
1495 ANYOF_CLASS_SET(data->start_class,ANYOF_NSPACE);
1496 }
1497 break;
1498 case DIGIT:
1499 if (flags & SCF_DO_STCLASS_AND) {
1500 ANYOF_CLASS_CLEAR(data->start_class,ANYOF_NDIGIT);
1501 for (value = 0; value < 256; value++)
1502 if (!isDIGIT(value))
1503 ANYOF_BITMAP_CLEAR(data->start_class, value);
1504 }
1505 else {
1506 if (data->start_class->flags & ANYOF_LOCALE)
1507 ANYOF_CLASS_SET(data->start_class,ANYOF_DIGIT);
1508 else {
1509 for (value = 0; value < 256; value++)
1510 if (isDIGIT(value))
b81d288d 1511 ANYOF_BITMAP_SET(data->start_class, value);
653099ff
GS
1512 }
1513 }
1514 break;
1515 case NDIGIT:
1516 if (flags & SCF_DO_STCLASS_AND) {
1517 ANYOF_CLASS_CLEAR(data->start_class,ANYOF_DIGIT);
1518 for (value = 0; value < 256; value++)
1519 if (isDIGIT(value))
1520 ANYOF_BITMAP_CLEAR(data->start_class, value);
1521 }
1522 else {
1523 if (data->start_class->flags & ANYOF_LOCALE)
1524 ANYOF_CLASS_SET(data->start_class,ANYOF_NDIGIT);
1525 else {
1526 for (value = 0; value < 256; value++)
1527 if (!isDIGIT(value))
b81d288d 1528 ANYOF_BITMAP_SET(data->start_class, value);
653099ff
GS
1529 }
1530 }
1531 break;
1532 }
1533 if (flags & SCF_DO_STCLASS_OR)
1534 cl_and(data->start_class, &and_with);
1535 flags &= ~SCF_DO_STCLASS;
1536 }
a0ed51b3 1537 }
22c35a8c 1538 else if (PL_regkind[(U8)OP(scan)] == EOL && flags & SCF_DO_SUBSTR) {
c277df42
IZ
1539 data->flags |= (OP(scan) == MEOL
1540 ? SF_BEFORE_MEOL
1541 : SF_BEFORE_SEOL);
a0ed51b3 1542 }
653099ff
GS
1543 else if ( PL_regkind[(U8)OP(scan)] == BRANCHJ
1544 /* Lookbehind, or need to calculate parens/evals/stclass: */
1545 && (scan->flags || data || (flags & SCF_DO_STCLASS))
c277df42 1546 && (OP(scan) == IFMATCH || OP(scan) == UNLESSM)) {
653099ff 1547 /* Lookahead/lookbehind */
cb434fcc 1548 I32 deltanext, minnext, fake = 0;
c277df42 1549 regnode *nscan;
653099ff
GS
1550 struct regnode_charclass_class intrnl;
1551 int f = 0;
c277df42
IZ
1552
1553 data_fake.flags = 0;
b81d288d 1554 if (data) {
2c2d71f5 1555 data_fake.whilem_c = data->whilem_c;
cb434fcc
IZ
1556 data_fake.last_closep = data->last_closep;
1557 }
1558 else
1559 data_fake.last_closep = &fake;
653099ff
GS
1560 if ( flags & SCF_DO_STCLASS && !scan->flags
1561 && OP(scan) == IFMATCH ) { /* Lookahead */
830247a4 1562 cl_init(pRExC_state, &intrnl);
653099ff 1563 data_fake.start_class = &intrnl;
e1901655 1564 f |= SCF_DO_STCLASS_AND;
653099ff 1565 }
e1901655
IZ
1566 if (flags & SCF_WHILEM_VISITED_POS)
1567 f |= SCF_WHILEM_VISITED_POS;
c277df42
IZ
1568 next = regnext(scan);
1569 nscan = NEXTOPER(NEXTOPER(scan));
830247a4 1570 minnext = study_chunk(pRExC_state, &nscan, &deltanext, last, &data_fake, f);
c277df42
IZ
1571 if (scan->flags) {
1572 if (deltanext) {
9baa0206 1573 vFAIL("Variable length lookbehind not implemented");
a0ed51b3
LW
1574 }
1575 else if (minnext > U8_MAX) {
9baa0206 1576 vFAIL2("Lookbehind longer than %"UVuf" not implemented", (UV)U8_MAX);
c277df42 1577 }
eb160463 1578 scan->flags = (U8)minnext;
c277df42
IZ
1579 }
1580 if (data && data_fake.flags & (SF_HAS_PAR|SF_IN_PAR))
1581 pars++;
405ff068 1582 if (data && (data_fake.flags & SF_HAS_EVAL))
c277df42 1583 data->flags |= SF_HAS_EVAL;
2c2d71f5
JH
1584 if (data)
1585 data->whilem_c = data_fake.whilem_c;
e1901655 1586 if (f & SCF_DO_STCLASS_AND) {
653099ff
GS
1587 int was = (data->start_class->flags & ANYOF_EOS);
1588
1589 cl_and(data->start_class, &intrnl);
1590 if (was)
1591 data->start_class->flags |= ANYOF_EOS;
1592 }
a0ed51b3
LW
1593 }
1594 else if (OP(scan) == OPEN) {
c277df42 1595 pars++;
a0ed51b3 1596 }
cb434fcc 1597 else if (OP(scan) == CLOSE) {
eb160463 1598 if ((I32)ARG(scan) == is_par) {
cb434fcc 1599 next = regnext(scan);
c277df42 1600
cb434fcc
IZ
1601 if ( next && (OP(next) != WHILEM) && next < last)
1602 is_par = 0; /* Disable optimization */
1603 }
1604 if (data)
1605 *(data->last_closep) = ARG(scan);
a0ed51b3
LW
1606 }
1607 else if (OP(scan) == EVAL) {
c277df42
IZ
1608 if (data)
1609 data->flags |= SF_HAS_EVAL;
1610 }
96776eda 1611 else if (OP(scan) == LOGICAL && scan->flags == 2) { /* Embedded follows */
0f5d15d6 1612 if (flags & SCF_DO_SUBSTR) {
830247a4 1613 scan_commit(pRExC_state,data);
0f5d15d6
IZ
1614 data->longest = &(data->longest_float);
1615 }
1616 is_inf = is_inf_internal = 1;
653099ff 1617 if (flags & SCF_DO_STCLASS_OR) /* Allow everything */
830247a4 1618 cl_anything(pRExC_state, data->start_class);
96776eda 1619 flags &= ~SCF_DO_STCLASS;
0f5d15d6 1620 }
c277df42
IZ
1621 /* Else: zero-length, ignore. */
1622 scan = regnext(scan);
1623 }
1624
1625 finish:
1626 *scanp = scan;
aca2d497 1627 *deltap = is_inf_internal ? I32_MAX : delta;
b81d288d 1628 if (flags & SCF_DO_SUBSTR && is_inf)
c277df42
IZ
1629 data->pos_delta = I32_MAX - data->pos_min;
1630 if (is_par > U8_MAX)
1631 is_par = 0;
1632 if (is_par && pars==1 && data) {
1633 data->flags |= SF_IN_PAR;
1634 data->flags &= ~SF_HAS_PAR;
a0ed51b3
LW
1635 }
1636 else if (pars && data) {
c277df42
IZ
1637 data->flags |= SF_HAS_PAR;
1638 data->flags &= ~SF_IN_PAR;
1639 }
653099ff
GS
1640 if (flags & SCF_DO_STCLASS_OR)
1641 cl_and(data->start_class, &and_with);
c277df42
IZ
1642 return min;
1643}
1644
76e3520e 1645STATIC I32
830247a4 1646S_add_data(pTHX_ RExC_state_t *pRExC_state, I32 n, char *s)
c277df42 1647{
830247a4 1648 if (RExC_rx->data) {
b81d288d
AB
1649 Renewc(RExC_rx->data,
1650 sizeof(*RExC_rx->data) + sizeof(void*) * (RExC_rx->data->count + n - 1),
c277df42 1651 char, struct reg_data);
830247a4
IZ
1652 Renew(RExC_rx->data->what, RExC_rx->data->count + n, U8);
1653 RExC_rx->data->count += n;
a0ed51b3
LW
1654 }
1655 else {
830247a4 1656 Newc(1207, RExC_rx->data, sizeof(*RExC_rx->data) + sizeof(void*) * (n - 1),
c277df42 1657 char, struct reg_data);
830247a4
IZ
1658 New(1208, RExC_rx->data->what, n, U8);
1659 RExC_rx->data->count = n;
c277df42 1660 }
830247a4
IZ
1661 Copy(s, RExC_rx->data->what + RExC_rx->data->count - n, n, U8);
1662 return RExC_rx->data->count - n;
c277df42
IZ
1663}
1664
d88dccdf 1665void
864dbfa3 1666Perl_reginitcolors(pTHX)
d88dccdf 1667{
d88dccdf
IZ
1668 int i = 0;
1669 char *s = PerlEnv_getenv("PERL_RE_COLORS");
b81d288d 1670
d88dccdf
IZ
1671 if (s) {
1672 PL_colors[0] = s = savepv(s);
1673 while (++i < 6) {
1674 s = strchr(s, '\t');
1675 if (s) {
1676 *s = '\0';
1677 PL_colors[i] = ++s;
1678 }
1679 else
c712d376 1680 PL_colors[i] = s = "";
d88dccdf
IZ
1681 }
1682 } else {
b81d288d 1683 while (i < 6)
d88dccdf
IZ
1684 PL_colors[i++] = "";
1685 }
1686 PL_colorset = 1;
1687}
1688
8615cb43 1689
a687059c 1690/*
e50aee73 1691 - pregcomp - compile a regular expression into internal code
a687059c
LW
1692 *
1693 * We can't allocate space until we know how big the compiled form will be,
1694 * but we can't compile it (and thus know how big it is) until we've got a
1695 * place to put the code. So we cheat: we compile it twice, once with code
1696 * generation turned off and size counting turned on, and once "for real".
1697 * This also means that we don't allocate space until we are sure that the
1698 * thing really will compile successfully, and we never have to move the
1699 * code and thus invalidate pointers into it. (Note that it has to be in
1700 * one piece because free() must be able to free it all.) [NB: not true in perl]
1701 *
1702 * Beware that the optimization-preparation code in here knows about some
1703 * of the structure of the compiled regexp. [I'll say.]
1704 */
1705regexp *
864dbfa3 1706Perl_pregcomp(pTHX_ char *exp, char *xend, PMOP *pm)
a687059c 1707{
a0d0e21e 1708 register regexp *r;
c277df42 1709 regnode *scan;
c277df42 1710 regnode *first;
a0d0e21e 1711 I32 flags;
a0d0e21e
LW
1712 I32 minlen = 0;
1713 I32 sawplus = 0;
1714 I32 sawopen = 0;
2c2d71f5 1715 scan_data_t data;
830247a4
IZ
1716 RExC_state_t RExC_state;
1717 RExC_state_t *pRExC_state = &RExC_state;
a0d0e21e
LW
1718
1719 if (exp == NULL)
c277df42 1720 FAIL("NULL regexp argument");
a0d0e21e 1721
a5961de5 1722 RExC_utf8 = pm->op_pmdynflags & PMdf_CMP_UTF8;
a0ed51b3 1723
5cfc7842 1724 RExC_precomp = exp;
a5961de5
JH
1725 DEBUG_r({
1726 if (!PL_colorset) reginitcolors();
1727 PerlIO_printf(Perl_debug_log, "%sCompiling REx%s `%s%*s%s'\n",
1728 PL_colors[4],PL_colors[5],PL_colors[0],
1729 (int)(xend - exp), RExC_precomp, PL_colors[1]);
1730 });
e2509266 1731 RExC_flags = pm->op_pmflags;
830247a4 1732 RExC_sawback = 0;
bbce6d69 1733
830247a4
IZ
1734 RExC_seen = 0;
1735 RExC_seen_zerolen = *exp == '^' ? -1 : 0;
1736 RExC_seen_evals = 0;
1737 RExC_extralen = 0;
c277df42 1738
bbce6d69 1739 /* First pass: determine size, legality. */
830247a4 1740 RExC_parse = exp;
fac92740 1741 RExC_start = exp;
830247a4
IZ
1742 RExC_end = xend;
1743 RExC_naughty = 0;
1744 RExC_npar = 1;
1745 RExC_size = 0L;
1746 RExC_emit = &PL_regdummy;
1747 RExC_whilem_seen = 0;
85ddcde9
JH
1748#if 0 /* REGC() is (currently) a NOP at the first pass.
1749 * Clever compilers notice this and complain. --jhi */
830247a4 1750 REGC((U8)REG_MAGIC, (char*)RExC_emit);
85ddcde9 1751#endif
830247a4 1752 if (reg(pRExC_state, 0, &flags) == NULL) {
830247a4 1753 RExC_precomp = Nullch;
a0d0e21e
LW
1754 return(NULL);
1755 }
830247a4 1756 DEBUG_r(PerlIO_printf(Perl_debug_log, "size %"IVdf" ", (IV)RExC_size));
c277df42 1757
c277df42
IZ
1758 /* Small enough for pointer-storage convention?
1759 If extralen==0, this means that we will not need long jumps. */
830247a4
IZ
1760 if (RExC_size >= 0x10000L && RExC_extralen)
1761 RExC_size += RExC_extralen;
c277df42 1762 else
830247a4
IZ
1763 RExC_extralen = 0;
1764 if (RExC_whilem_seen > 15)
1765 RExC_whilem_seen = 15;
a0d0e21e 1766
bbce6d69 1767 /* Allocate space and initialize. */
830247a4 1768 Newc(1001, r, sizeof(regexp) + (unsigned)RExC_size * sizeof(regnode),
c277df42 1769 char, regexp);
a0d0e21e 1770 if (r == NULL)
b45f050a
JF
1771 FAIL("Regexp out of space");
1772
0f79a09d
GS
1773#ifdef DEBUGGING
1774 /* avoid reading uninitialized memory in DEBUGGING code in study_chunk() */
830247a4 1775 Zero(r, sizeof(regexp) + (unsigned)RExC_size * sizeof(regnode), char);
0f79a09d 1776#endif
c277df42 1777 r->refcnt = 1;
bbce6d69 1778 r->prelen = xend - exp;
5cfc7842 1779 r->precomp = savepvn(RExC_precomp, r->prelen);
cf93c79d
IZ
1780 r->subbeg = NULL;
1781 r->reganch = pm->op_pmflags & PMf_COMPILETIME;
830247a4 1782 r->nparens = RExC_npar - 1; /* set early to validate backrefs */
4327152a
IZ
1783
1784 r->substrs = 0; /* Useful during FAIL. */
1785 r->startp = 0; /* Useful during FAIL. */
1786 r->endp = 0; /* Useful during FAIL. */
1787
fac92740
MJD
1788 Newz(1304, r->offsets, 2*RExC_size+1, U32); /* MJD 20001228 */
1789 if (r->offsets) {
1790 r->offsets[0] = RExC_size;
1791 }
1792 DEBUG_r(PerlIO_printf(Perl_debug_log,
392fbf5d 1793 "%s %"UVuf" bytes for offset annotations.\n",
fac92740 1794 r->offsets ? "Got" : "Couldn't get",
392fbf5d 1795 (UV)((2*RExC_size+1) * sizeof(U32))));
fac92740 1796
830247a4 1797 RExC_rx = r;
bbce6d69 1798
1799 /* Second pass: emit code. */
e2509266 1800 RExC_flags = pm->op_pmflags; /* don't let top level (?i) bleed */
830247a4
IZ
1801 RExC_parse = exp;
1802 RExC_end = xend;
1803 RExC_naughty = 0;
1804 RExC_npar = 1;
fac92740 1805 RExC_emit_start = r->program;
830247a4 1806 RExC_emit = r->program;
2cd61cdb 1807 /* Store the count of eval-groups for security checks: */
eb160463 1808 RExC_emit->next_off = (U16)((RExC_seen_evals > U16_MAX) ? U16_MAX : RExC_seen_evals);
830247a4 1809 REGC((U8)REG_MAGIC, (char*) RExC_emit++);
c277df42 1810 r->data = 0;
830247a4 1811 if (reg(pRExC_state, 0, &flags) == NULL)
a0d0e21e
LW
1812 return(NULL);
1813
1814 /* Dig out information for optimizations. */
cf93c79d 1815 r->reganch = pm->op_pmflags & PMf_COMPILETIME; /* Again? */
e2509266 1816 pm->op_pmflags = RExC_flags;
a0ed51b3 1817 if (UTF)
5ff6fc6d 1818 r->reganch |= ROPT_UTF8; /* Unicode in it? */
c277df42 1819 r->regstclass = NULL;
830247a4 1820 if (RExC_naughty >= 10) /* Probably an expensive pattern. */
a0ed51b3 1821 r->reganch |= ROPT_NAUGHTY;
c277df42 1822 scan = r->program + 1; /* First BRANCH. */
2779dcf1
IZ
1823
1824 /* XXXX To minimize changes to RE engine we always allocate
1825 3-units-long substrs field. */
1826 Newz(1004, r->substrs, 1, struct reg_substr_data);
1827
2c2d71f5 1828 StructCopy(&zero_scan_data, &data, scan_data_t);
653099ff 1829 /* XXXX Should not we check for something else? Usually it is OPEN1... */
c277df42 1830 if (OP(scan) != BRANCH) { /* Only one top-level choice. */
c277df42 1831 I32 fake;
c5254dd6 1832 STRLEN longest_float_length, longest_fixed_length;
653099ff
GS
1833 struct regnode_charclass_class ch_class;
1834 int stclass_flag;
cb434fcc 1835 I32 last_close = 0;
a0d0e21e
LW
1836
1837 first = scan;
c277df42 1838 /* Skip introductions and multiplicators >= 1. */
a0d0e21e 1839 while ((OP(first) == OPEN && (sawopen = 1)) ||
653099ff 1840 /* An OR of *one* alternative - should not happen now. */
a0d0e21e
LW
1841 (OP(first) == BRANCH && OP(regnext(first)) != BRANCH) ||
1842 (OP(first) == PLUS) ||
1843 (OP(first) == MINMOD) ||
653099ff 1844 /* An {n,m} with n>0 */
22c35a8c 1845 (PL_regkind[(U8)OP(first)] == CURLY && ARG1(first) > 0) ) {
a0d0e21e
LW
1846 if (OP(first) == PLUS)
1847 sawplus = 1;
1848 else
1849 first += regarglen[(U8)OP(first)];
1850 first = NEXTOPER(first);
a687059c
LW
1851 }
1852
a0d0e21e
LW
1853 /* Starting-point info. */
1854 again:
653099ff 1855 if (PL_regkind[(U8)OP(first)] == EXACT) {
1aa99e6b
IH
1856 if (OP(first) == EXACT)
1857 ; /* Empty, get anchored substr later. */
1858 else if ((OP(first) == EXACTF || OP(first) == EXACTFL))
b3c9acc1
IZ
1859 r->regstclass = first;
1860 }
653099ff 1861 else if (strchr((char*)PL_simple,OP(first)))
a0d0e21e 1862 r->regstclass = first;
22c35a8c
GS
1863 else if (PL_regkind[(U8)OP(first)] == BOUND ||
1864 PL_regkind[(U8)OP(first)] == NBOUND)
a0d0e21e 1865 r->regstclass = first;
22c35a8c 1866 else if (PL_regkind[(U8)OP(first)] == BOL) {
cad2e5aa
JH
1867 r->reganch |= (OP(first) == MBOL
1868 ? ROPT_ANCH_MBOL
1869 : (OP(first) == SBOL
1870 ? ROPT_ANCH_SBOL
1871 : ROPT_ANCH_BOL));
a0d0e21e 1872 first = NEXTOPER(first);
774d564b 1873 goto again;
1874 }
1875 else if (OP(first) == GPOS) {
1876 r->reganch |= ROPT_ANCH_GPOS;
1877 first = NEXTOPER(first);
1878 goto again;
a0d0e21e 1879 }
e09294f4 1880 else if (!sawopen && (OP(first) == STAR &&
22c35a8c 1881 PL_regkind[(U8)OP(NEXTOPER(first))] == REG_ANY) &&
a0d0e21e
LW
1882 !(r->reganch & ROPT_ANCH) )
1883 {
1884 /* turn .* into ^.* with an implied $*=1 */
cad2e5aa
JH
1885 int type = OP(NEXTOPER(first));
1886
ffc61ed2 1887 if (type == REG_ANY)
cad2e5aa
JH
1888 type = ROPT_ANCH_MBOL;
1889 else
1890 type = ROPT_ANCH_SBOL;
1891
1892 r->reganch |= type | ROPT_IMPLICIT;
a0d0e21e 1893 first = NEXTOPER(first);
774d564b 1894 goto again;
a0d0e21e 1895 }
b81d288d 1896 if (sawplus && (!sawopen || !RExC_sawback)
830247a4 1897 && !(RExC_seen & REG_SEEN_EVAL)) /* May examine pos and $& */
cad2e5aa
JH
1898 /* x+ must match at the 1st pos of run of x's */
1899 r->reganch |= ROPT_SKIP;
a0d0e21e 1900
c277df42 1901 /* Scan is after the zeroth branch, first is atomic matcher. */
b81d288d 1902 DEBUG_r(PerlIO_printf(Perl_debug_log, "first at %"IVdf"\n",
b900a521 1903 (IV)(first - scan + 1)));
a0d0e21e
LW
1904 /*
1905 * If there's something expensive in the r.e., find the
1906 * longest literal string that must appear and make it the
1907 * regmust. Resolve ties in favor of later strings, since
1908 * the regstart check works with the beginning of the r.e.
1909 * and avoiding duplication strengthens checking. Not a
1910 * strong reason, but sufficient in the absence of others.
1911 * [Now we resolve ties in favor of the earlier string if
c277df42 1912 * it happens that c_offset_min has been invalidated, since the
a0d0e21e
LW
1913 * earlier string may buy us something the later one won't.]
1914 */
a0d0e21e 1915 minlen = 0;
a687059c 1916
79cb57f6
GS
1917 data.longest_fixed = newSVpvn("",0);
1918 data.longest_float = newSVpvn("",0);
1919 data.last_found = newSVpvn("",0);
c277df42
IZ
1920 data.longest = &(data.longest_fixed);
1921 first = scan;
653099ff 1922 if (!r->regstclass) {
830247a4 1923 cl_init(pRExC_state, &ch_class);
653099ff
GS
1924 data.start_class = &ch_class;
1925 stclass_flag = SCF_DO_STCLASS_AND;
1926 } else /* XXXX Check for BOUND? */
1927 stclass_flag = 0;
cb434fcc 1928 data.last_closep = &last_close;
653099ff 1929
830247a4 1930 minlen = study_chunk(pRExC_state, &first, &fake, scan + RExC_size, /* Up to end */
e1901655 1931 &data, SCF_DO_SUBSTR | SCF_WHILEM_VISITED_POS | stclass_flag);
830247a4 1932 if ( RExC_npar == 1 && data.longest == &(data.longest_fixed)
b81d288d 1933 && data.last_start_min == 0 && data.last_end > 0
830247a4
IZ
1934 && !RExC_seen_zerolen
1935 && (!(RExC_seen & REG_SEEN_GPOS) || (r->reganch & ROPT_ANCH_GPOS)))
c277df42 1936 r->reganch |= ROPT_CHECK_ALL;
830247a4 1937 scan_commit(pRExC_state, &data);
c277df42
IZ
1938 SvREFCNT_dec(data.last_found);
1939
a0ed51b3 1940 longest_float_length = CHR_SVLEN(data.longest_float);
c5254dd6 1941 if (longest_float_length
c277df42
IZ
1942 || (data.flags & SF_FL_BEFORE_EOL
1943 && (!(data.flags & SF_FL_BEFORE_MEOL)
e2509266 1944 || (RExC_flags & PMf_MULTILINE)))) {
cf93c79d
IZ
1945 int t;
1946
a0ed51b3 1947 if (SvCUR(data.longest_fixed) /* ok to leave SvCUR */
aca2d497
IZ
1948 && data.offset_fixed == data.offset_float_min
1949 && SvCUR(data.longest_fixed) == SvCUR(data.longest_float))
1950 goto remove_float; /* As in (a)+. */
1951
33b8afdf
JH
1952 if (SvUTF8(data.longest_float)) {
1953 r->float_utf8 = data.longest_float;
1954 r->float_substr = Nullsv;
1955 } else {
1956 r->float_substr = data.longest_float;
1957 r->float_utf8 = Nullsv;
1958 }
c277df42
IZ
1959 r->float_min_offset = data.offset_float_min;
1960 r->float_max_offset = data.offset_float_max;
cf93c79d
IZ
1961 t = (data.flags & SF_FL_BEFORE_EOL /* Can't have SEOL and MULTI */
1962 && (!(data.flags & SF_FL_BEFORE_MEOL)
e2509266 1963 || (RExC_flags & PMf_MULTILINE)));
33b8afdf 1964 fbm_compile(data.longest_float, t ? FBMcf_TAIL : 0);
a0ed51b3
LW
1965 }
1966 else {
aca2d497 1967 remove_float:
33b8afdf 1968 r->float_substr = r->float_utf8 = Nullsv;
c277df42 1969 SvREFCNT_dec(data.longest_float);
c5254dd6 1970 longest_float_length = 0;
a0d0e21e 1971 }
c277df42 1972
a0ed51b3 1973 longest_fixed_length = CHR_SVLEN(data.longest_fixed);
c5254dd6 1974 if (longest_fixed_length
c277df42
IZ
1975 || (data.flags & SF_FIX_BEFORE_EOL /* Cannot have SEOL and MULTI */
1976 && (!(data.flags & SF_FIX_BEFORE_MEOL)
e2509266 1977 || (RExC_flags & PMf_MULTILINE)))) {
cf93c79d
IZ
1978 int t;
1979
33b8afdf
JH
1980 if (SvUTF8(data.longest_fixed)) {
1981 r->anchored_utf8 = data.longest_fixed;
1982 r->anchored_substr = Nullsv;
1983 } else {
1984 r->anchored_substr = data.longest_fixed;
1985 r->anchored_utf8 = Nullsv;
1986 }
c277df42 1987 r->anchored_offset = data.offset_fixed;
cf93c79d
IZ
1988 t = (data.flags & SF_FIX_BEFORE_EOL /* Can't have SEOL and MULTI */
1989 && (!(data.flags & SF_FIX_BEFORE_MEOL)
e2509266 1990 || (RExC_flags & PMf_MULTILINE)));
33b8afdf 1991 fbm_compile(data.longest_fixed, t ? FBMcf_TAIL : 0);
a0ed51b3
LW
1992 }
1993 else {
33b8afdf 1994 r->anchored_substr = r->anchored_utf8 = Nullsv;
c277df42 1995 SvREFCNT_dec(data.longest_fixed);
c5254dd6 1996 longest_fixed_length = 0;
a0d0e21e 1997 }
b81d288d 1998 if (r->regstclass
ffc61ed2 1999 && (OP(r->regstclass) == REG_ANY || OP(r->regstclass) == SANY))
653099ff 2000 r->regstclass = NULL;
33b8afdf
JH
2001 if ((!(r->anchored_substr || r->anchored_utf8) || r->anchored_offset)
2002 && stclass_flag
653099ff 2003 && !(data.start_class->flags & ANYOF_EOS)
eb160463
GS
2004 && !cl_is_anything(data.start_class))
2005 {
830247a4 2006 I32 n = add_data(pRExC_state, 1, "f");
653099ff 2007
b81d288d 2008 New(1006, RExC_rx->data->data[n], 1,
653099ff
GS
2009 struct regnode_charclass_class);
2010 StructCopy(data.start_class,
830247a4 2011 (struct regnode_charclass_class*)RExC_rx->data->data[n],
653099ff 2012 struct regnode_charclass_class);
830247a4 2013 r->regstclass = (regnode*)RExC_rx->data->data[n];
653099ff 2014 r->reganch &= ~ROPT_SKIP; /* Used in find_byclass(). */
ffc61ed2 2015 PL_regdata = r->data; /* for regprop() */
9c5ffd7c
JH
2016 DEBUG_r({ SV *sv = sv_newmortal();
2017 regprop(sv, (regnode*)data.start_class);
2018 PerlIO_printf(Perl_debug_log,
2019 "synthetic stclass `%s'.\n",
2020 SvPVX(sv));});
653099ff 2021 }
c277df42
IZ
2022
2023 /* A temporary algorithm prefers floated substr to fixed one to dig more info. */
c5254dd6 2024 if (longest_fixed_length > longest_float_length) {
c277df42 2025 r->check_substr = r->anchored_substr;
33b8afdf 2026 r->check_utf8 = r->anchored_utf8;
c277df42
IZ
2027 r->check_offset_min = r->check_offset_max = r->anchored_offset;
2028 if (r->reganch & ROPT_ANCH_SINGLE)
2029 r->reganch |= ROPT_NOSCAN;
a0ed51b3
LW
2030 }
2031 else {
c277df42 2032 r->check_substr = r->float_substr;
33b8afdf 2033 r->check_utf8 = r->float_utf8;
c277df42
IZ
2034 r->check_offset_min = data.offset_float_min;
2035 r->check_offset_max = data.offset_float_max;
a0d0e21e 2036 }
30382c73
IZ
2037 /* XXXX Currently intuiting is not compatible with ANCH_GPOS.
2038 This should be changed ASAP! */
33b8afdf 2039 if ((r->check_substr || r->check_utf8) && !(r->reganch & ROPT_ANCH_GPOS)) {
cad2e5aa 2040 r->reganch |= RE_USE_INTUIT;
33b8afdf 2041 if (SvTAIL(r->check_substr ? r->check_substr : r->check_utf8))
cad2e5aa
JH
2042 r->reganch |= RE_INTUIT_TAIL;
2043 }
a0ed51b3
LW
2044 }
2045 else {
c277df42
IZ
2046 /* Several toplevels. Best we can is to set minlen. */
2047 I32 fake;
653099ff 2048 struct regnode_charclass_class ch_class;
cb434fcc 2049 I32 last_close = 0;
c277df42
IZ
2050
2051 DEBUG_r(PerlIO_printf(Perl_debug_log, "\n"));
2052 scan = r->program + 1;
830247a4 2053 cl_init(pRExC_state, &ch_class);
653099ff 2054 data.start_class = &ch_class;
cb434fcc 2055 data.last_closep = &last_close;
e1901655 2056 minlen = study_chunk(pRExC_state, &scan, &fake, scan + RExC_size, &data, SCF_DO_STCLASS_AND|SCF_WHILEM_VISITED_POS);
33b8afdf
JH
2057 r->check_substr = r->check_utf8 = r->anchored_substr = r->anchored_utf8
2058 = r->float_substr = r->float_utf8 = Nullsv;
653099ff 2059 if (!(data.start_class->flags & ANYOF_EOS)
eb160463
GS
2060 && !cl_is_anything(data.start_class))
2061 {
830247a4 2062 I32 n = add_data(pRExC_state, 1, "f");
653099ff 2063
b81d288d 2064 New(1006, RExC_rx->data->data[n], 1,
653099ff
GS
2065 struct regnode_charclass_class);
2066 StructCopy(data.start_class,
830247a4 2067 (struct regnode_charclass_class*)RExC_rx->data->data[n],
653099ff 2068 struct regnode_charclass_class);
830247a4 2069 r->regstclass = (regnode*)RExC_rx->data->data[n];
653099ff 2070 r->reganch &= ~ROPT_SKIP; /* Used in find_byclass(). */
9c5ffd7c
JH
2071 DEBUG_r({ SV* sv = sv_newmortal();
2072 regprop(sv, (regnode*)data.start_class);
2073 PerlIO_printf(Perl_debug_log,
2074 "synthetic stclass `%s'.\n",
2075 SvPVX(sv));});
653099ff 2076 }
a0d0e21e
LW
2077 }
2078
a0d0e21e 2079 r->minlen = minlen;
b81d288d 2080 if (RExC_seen & REG_SEEN_GPOS)
c277df42 2081 r->reganch |= ROPT_GPOS_SEEN;
830247a4 2082 if (RExC_seen & REG_SEEN_LOOKBEHIND)
c277df42 2083 r->reganch |= ROPT_LOOKBEHIND_SEEN;
830247a4 2084 if (RExC_seen & REG_SEEN_EVAL)
ce862d02 2085 r->reganch |= ROPT_EVAL_SEEN;
f33976b4
DB
2086 if (RExC_seen & REG_SEEN_CANY)
2087 r->reganch |= ROPT_CANY_SEEN;
830247a4
IZ
2088 Newz(1002, r->startp, RExC_npar, I32);
2089 Newz(1002, r->endp, RExC_npar, I32);
ffc61ed2 2090 PL_regdata = r->data; /* for regprop() */
a0d0e21e
LW
2091 DEBUG_r(regdump(r));
2092 return(r);
a687059c
LW
2093}
2094
2095/*
2096 - reg - regular expression, i.e. main body or parenthesized thing
2097 *
2098 * Caller must absorb opening parenthesis.
2099 *
2100 * Combining parenthesis handling with the base level of regular expression
2101 * is a trifle forced, but the need to tie the tails of the branches to what
2102 * follows makes it hard to avoid.
2103 */
76e3520e 2104STATIC regnode *
830247a4 2105S_reg(pTHX_ RExC_state_t *pRExC_state, I32 paren, I32 *flagp)
c277df42 2106 /* paren: Parenthesized? 0=top, 1=(, inside: changed to letter. */
a687059c 2107{
c277df42
IZ
2108 register regnode *ret; /* Will be the head of the group. */
2109 register regnode *br;
2110 register regnode *lastbr;
2111 register regnode *ender = 0;
a0d0e21e 2112 register I32 parno = 0;
e2509266 2113 I32 flags, oregflags = RExC_flags, have_branch = 0, open = 0;
9d1d55b5
JP
2114
2115 /* for (?g), (?gc), and (?o) warnings; warning
2116 about (?c) will warn about (?g) -- japhy */
2117
2118 I32 wastedflags = 0x00,
2119 wasted_o = 0x01,
2120 wasted_g = 0x02,
2121 wasted_gc = 0x02 | 0x04,
2122 wasted_c = 0x04;
2123
fac92740 2124 char * parse_start = RExC_parse; /* MJD */
830247a4 2125 char *oregcomp_parse = RExC_parse;
c277df42 2126 char c;
a0d0e21e 2127
821b33a5 2128 *flagp = 0; /* Tentatively. */
a0d0e21e 2129
9d1d55b5 2130
a0d0e21e
LW
2131 /* Make an OPEN node, if parenthesized. */
2132 if (paren) {
fac92740 2133 if (*RExC_parse == '?') { /* (?...) */
2b36a5a0
JH
2134 U32 posflags = 0, negflags = 0;
2135 U32 *flagsp = &posflags;
0f5d15d6 2136 int logical = 0;
830247a4 2137 char *seqstart = RExC_parse;
ca9dfc88 2138
830247a4
IZ
2139 RExC_parse++;
2140 paren = *RExC_parse++;
c277df42 2141 ret = NULL; /* For look-ahead/behind. */
a0d0e21e 2142 switch (paren) {
fac92740 2143 case '<': /* (?<...) */
830247a4 2144 RExC_seen |= REG_SEEN_LOOKBEHIND;
b81d288d 2145 if (*RExC_parse == '!')
c277df42 2146 paren = ',';
b81d288d 2147 if (*RExC_parse != '=' && *RExC_parse != '!')
c277df42 2148 goto unknown;
830247a4 2149 RExC_parse++;
fac92740
MJD
2150 case '=': /* (?=...) */
2151 case '!': /* (?!...) */
830247a4 2152 RExC_seen_zerolen++;
fac92740
MJD
2153 case ':': /* (?:...) */
2154 case '>': /* (?>...) */
a0d0e21e 2155 break;
fac92740
MJD
2156 case '$': /* (?$...) */
2157 case '@': /* (?@...) */
8615cb43 2158 vFAIL2("Sequence (?%c...) not implemented", (int)paren);
a0d0e21e 2159 break;
fac92740 2160 case '#': /* (?#...) */
830247a4
IZ
2161 while (*RExC_parse && *RExC_parse != ')')
2162 RExC_parse++;
2163 if (*RExC_parse != ')')
c277df42 2164 FAIL("Sequence (?#... not terminated");
830247a4 2165 nextchar(pRExC_state);
a0d0e21e
LW
2166 *flagp = TRYAGAIN;
2167 return NULL;
fac92740 2168 case 'p': /* (?p...) */
9014280d 2169 if (SIZE_ONLY && ckWARN2(WARN_DEPRECATED, WARN_REGEXP))
f9373011 2170 vWARNdep(RExC_parse, "(?p{}) is deprecated - use (??{})");
8c8ad484 2171 /* FALL THROUGH*/
fac92740 2172 case '?': /* (??...) */
0f5d15d6 2173 logical = 1;
438a3801
YST
2174 if (*RExC_parse != '{')
2175 goto unknown;
830247a4 2176 paren = *RExC_parse++;
0f5d15d6 2177 /* FALL THROUGH */
fac92740 2178 case '{': /* (?{...}) */
c277df42 2179 {
c277df42
IZ
2180 I32 count = 1, n = 0;
2181 char c;
830247a4 2182 char *s = RExC_parse;
c277df42
IZ
2183 SV *sv;
2184 OP_4tree *sop, *rop;
2185
830247a4
IZ
2186 RExC_seen_zerolen++;
2187 RExC_seen |= REG_SEEN_EVAL;
2188 while (count && (c = *RExC_parse)) {
2189 if (c == '\\' && RExC_parse[1])
2190 RExC_parse++;
b81d288d 2191 else if (c == '{')
c277df42 2192 count++;
b81d288d 2193 else if (c == '}')
c277df42 2194 count--;
830247a4 2195 RExC_parse++;
c277df42 2196 }
830247a4 2197 if (*RExC_parse != ')')
b45f050a 2198 {
b81d288d 2199 RExC_parse = s;
b45f050a
JF
2200 vFAIL("Sequence (?{...}) not terminated or not {}-balanced");
2201 }
c277df42 2202 if (!SIZE_ONLY) {
f3548bdc 2203 PAD *pad;
b81d288d
AB
2204
2205 if (RExC_parse - 1 - s)
830247a4 2206 sv = newSVpvn(s, RExC_parse - 1 - s);
c277df42 2207 else
79cb57f6 2208 sv = newSVpvn("", 0);
c277df42 2209
569233ed
SB
2210 ENTER;
2211 Perl_save_re_context(aTHX);
f3548bdc 2212 rop = sv_compile_2op(sv, &sop, "re", &pad);
9b978d73
DM
2213 sop->op_private |= OPpREFCOUNTED;
2214 /* re_dup will OpREFCNT_inc */
2215 OpREFCNT_set(sop, 1);
569233ed 2216 LEAVE;
c277df42 2217
830247a4
IZ
2218 n = add_data(pRExC_state, 3, "nop");
2219 RExC_rx->data->data[n] = (void*)rop;
2220 RExC_rx->data->data[n+1] = (void*)sop;
f3548bdc 2221 RExC_rx->data->data[n+2] = (void*)pad;
c277df42 2222 SvREFCNT_dec(sv);
a0ed51b3 2223 }
e24b16f9 2224 else { /* First pass */
830247a4 2225 if (PL_reginterp_cnt < ++RExC_seen_evals
e24b16f9 2226 && PL_curcop != &PL_compiling)
2cd61cdb
IZ
2227 /* No compiled RE interpolated, has runtime
2228 components ===> unsafe. */
2229 FAIL("Eval-group not allowed at runtime, use re 'eval'");
5b61d3f7 2230 if (PL_tainting && PL_tainted)
cc6b7395 2231 FAIL("Eval-group in insecure regular expression");
c277df42
IZ
2232 }
2233
830247a4 2234 nextchar(pRExC_state);
0f5d15d6 2235 if (logical) {
830247a4 2236 ret = reg_node(pRExC_state, LOGICAL);
0f5d15d6
IZ
2237 if (!SIZE_ONLY)
2238 ret->flags = 2;
830247a4 2239 regtail(pRExC_state, ret, reganode(pRExC_state, EVAL, n));
fac92740 2240 /* deal with the length of this later - MJD */
0f5d15d6
IZ
2241 return ret;
2242 }
ccb2c380
MP
2243 ret = reganode(pRExC_state, EVAL, n);
2244 Set_Node_Length(ret, RExC_parse - parse_start + 1);
2245 Set_Node_Offset(ret, parse_start);
2246 return ret;
c277df42 2247 }
fac92740 2248 case '(': /* (?(?{...})...) and (?(?=...)...) */
c277df42 2249 {
fac92740 2250 if (RExC_parse[0] == '?') { /* (?(?...)) */
b81d288d
AB
2251 if (RExC_parse[1] == '=' || RExC_parse[1] == '!'
2252 || RExC_parse[1] == '<'
830247a4 2253 || RExC_parse[1] == '{') { /* Lookahead or eval. */
c277df42
IZ
2254 I32 flag;
2255
830247a4 2256 ret = reg_node(pRExC_state, LOGICAL);
0f5d15d6
IZ
2257 if (!SIZE_ONLY)
2258 ret->flags = 1;
830247a4 2259 regtail(pRExC_state, ret, reg(pRExC_state, 1, &flag));
c277df42 2260 goto insert_if;
b81d288d 2261 }
a0ed51b3 2262 }
830247a4 2263 else if (RExC_parse[0] >= '1' && RExC_parse[0] <= '9' ) {
fac92740 2264 /* (?(1)...) */
830247a4 2265 parno = atoi(RExC_parse++);
c277df42 2266
830247a4
IZ
2267 while (isDIGIT(*RExC_parse))
2268 RExC_parse++;
fac92740
MJD
2269 ret = reganode(pRExC_state, GROUPP, parno);
2270
830247a4 2271 if ((c = *nextchar(pRExC_state)) != ')')
b45f050a 2272 vFAIL("Switch condition not recognized");
c277df42 2273 insert_if:
830247a4
IZ
2274 regtail(pRExC_state, ret, reganode(pRExC_state, IFTHEN, 0));
2275 br = regbranch(pRExC_state, &flags, 1);
c277df42 2276 if (br == NULL)
830247a4 2277 br = reganode(pRExC_state, LONGJMP, 0);
c277df42 2278 else
830247a4
IZ
2279 regtail(pRExC_state, br, reganode(pRExC_state, LONGJMP, 0));
2280 c = *nextchar(pRExC_state);
d1b80229
IZ
2281 if (flags&HASWIDTH)
2282 *flagp |= HASWIDTH;
c277df42 2283 if (c == '|') {
830247a4
IZ
2284 lastbr = reganode(pRExC_state, IFTHEN, 0); /* Fake one for optimizer. */
2285 regbranch(pRExC_state, &flags, 1);
2286 regtail(pRExC_state, ret, lastbr);
d1b80229
IZ
2287 if (flags&HASWIDTH)
2288 *flagp |= HASWIDTH;
830247a4 2289 c = *nextchar(pRExC_state);
a0ed51b3
LW
2290 }
2291 else
c277df42
IZ
2292 lastbr = NULL;
2293 if (c != ')')
8615cb43 2294 vFAIL("Switch (?(condition)... contains too many branches");
830247a4
IZ
2295 ender = reg_node(pRExC_state, TAIL);
2296 regtail(pRExC_state, br, ender);
c277df42 2297 if (lastbr) {
830247a4
IZ
2298 regtail(pRExC_state, lastbr, ender);
2299 regtail(pRExC_state, NEXTOPER(NEXTOPER(lastbr)), ender);
a0ed51b3
LW
2300 }
2301 else
830247a4 2302 regtail(pRExC_state, ret, ender);
c277df42 2303 return ret;
a0ed51b3
LW
2304 }
2305 else {
830247a4 2306 vFAIL2("Unknown switch condition (?(%.2s", RExC_parse);
c277df42
IZ
2307 }
2308 }
1b1626e4 2309 case 0:
830247a4 2310 RExC_parse--; /* for vFAIL to print correctly */
8615cb43 2311 vFAIL("Sequence (? incomplete");
1b1626e4 2312 break;
a0d0e21e 2313 default:
830247a4 2314 --RExC_parse;
fac92740 2315 parse_flags: /* (?i) */
830247a4 2316 while (*RExC_parse && strchr("iogcmsx", *RExC_parse)) {
9d1d55b5
JP
2317 /* (?g), (?gc) and (?o) are useless here
2318 and must be globally applied -- japhy */
2319
2320 if (*RExC_parse == 'o' || *RExC_parse == 'g') {
2321 if (SIZE_ONLY && ckWARN(WARN_REGEXP)) {
2322 I32 wflagbit = *RExC_parse == 'o' ? wasted_o : wasted_g;
2323 if (! (wastedflags & wflagbit) ) {
2324 wastedflags |= wflagbit;
2325 vWARN5(
2326 RExC_parse + 1,
2327 "Useless (%s%c) - %suse /%c modifier",
2328 flagsp == &negflags ? "?-" : "?",
2329 *RExC_parse,
2330 flagsp == &negflags ? "don't " : "",
2331 *RExC_parse
2332 );
2333 }
2334 }
2335 }
2336 else if (*RExC_parse == 'c') {
2337 if (SIZE_ONLY && ckWARN(WARN_REGEXP)) {
2338 if (! (wastedflags & wasted_c) ) {
2339 wastedflags |= wasted_gc;
2340 vWARN3(
2341 RExC_parse + 1,
2342 "Useless (%sc) - %suse /gc modifier",
2343 flagsp == &negflags ? "?-" : "?",
2344 flagsp == &negflags ? "don't " : ""
2345 );
2346 }
2347 }
2348 }
2349 else { pmflag(flagsp, *RExC_parse); }
2350
830247a4 2351 ++RExC_parse;
ca9dfc88 2352 }
830247a4 2353 if (*RExC_parse == '-') {
ca9dfc88 2354 flagsp = &negflags;
9d1d55b5 2355 wastedflags = 0; /* reset so (?g-c) warns twice */
830247a4 2356 ++RExC_parse;
ca9dfc88 2357 goto parse_flags;
48c036b1 2358 }
e2509266
JH
2359 RExC_flags |= posflags;
2360 RExC_flags &= ~negflags;
830247a4
IZ
2361 if (*RExC_parse == ':') {
2362 RExC_parse++;
ca9dfc88
IZ
2363 paren = ':';
2364 break;
2365 }
c277df42 2366 unknown:
830247a4
IZ
2367 if (*RExC_parse != ')') {
2368 RExC_parse++;
2369 vFAIL3("Sequence (%.*s...) not recognized", RExC_parse-seqstart, seqstart);
b45f050a 2370 }
830247a4 2371 nextchar(pRExC_state);
a0d0e21e
LW
2372 *flagp = TRYAGAIN;
2373 return NULL;
2374 }
2375 }
fac92740 2376 else { /* (...) */
830247a4
IZ
2377 parno = RExC_npar;
2378 RExC_npar++;
2379 ret = reganode(pRExC_state, OPEN, parno);
fac92740
MJD
2380 Set_Node_Length(ret, 1); /* MJD */
2381 Set_Node_Offset(ret, RExC_parse); /* MJD */
c277df42 2382 open = 1;
a0d0e21e 2383 }
a0ed51b3 2384 }
fac92740 2385 else /* ! paren */
a0d0e21e
LW
2386 ret = NULL;
2387
2388 /* Pick up the branches, linking them together. */
fac92740 2389 parse_start = RExC_parse; /* MJD */
830247a4 2390 br = regbranch(pRExC_state, &flags, 1);
fac92740
MJD
2391 /* branch_len = (paren != 0); */
2392
a0d0e21e
LW
2393 if (br == NULL)
2394 return(NULL);
830247a4
IZ
2395 if (*RExC_parse == '|') {
2396 if (!SIZE_ONLY && RExC_extralen) {
2397 reginsert(pRExC_state, BRANCHJ, br);
a0ed51b3 2398 }
fac92740 2399 else { /* MJD */
830247a4 2400 reginsert(pRExC_state, BRANCH, br);
fac92740
MJD
2401 Set_Node_Length(br, paren != 0);
2402 Set_Node_Offset_To_R(br-RExC_emit_start, parse_start-RExC_start);
2403 }
c277df42
IZ
2404 have_branch = 1;
2405 if (SIZE_ONLY)
830247a4 2406 RExC_extralen += 1; /* For BRANCHJ-BRANCH. */
a0ed51b3
LW
2407 }
2408 else if (paren == ':') {
c277df42
IZ
2409 *flagp |= flags&SIMPLE;
2410 }
2411 if (open) { /* Starts with OPEN. */
830247a4 2412 regtail(pRExC_state, ret, br); /* OPEN -> first. */
a0ed51b3
LW
2413 }
2414 else if (paren != '?') /* Not Conditional */
a0d0e21e 2415 ret = br;
32a0ca98 2416 *flagp |= flags & (SPSTART | HASWIDTH);
c277df42 2417 lastbr = br;
830247a4
IZ
2418 while (*RExC_parse == '|') {
2419 if (!SIZE_ONLY && RExC_extralen) {
2420 ender = reganode(pRExC_state, LONGJMP,0);
2421 regtail(pRExC_state, NEXTOPER(NEXTOPER(lastbr)), ender); /* Append to the previous. */
c277df42
IZ
2422 }
2423 if (SIZE_ONLY)
830247a4
IZ
2424 RExC_extralen += 2; /* Account for LONGJMP. */
2425 nextchar(pRExC_state);
2426 br = regbranch(pRExC_state, &flags, 0);
fac92740 2427
a687059c 2428 if (br == NULL)
a0d0e21e 2429 return(NULL);
830247a4 2430 regtail(pRExC_state, lastbr, br); /* BRANCH -> BRANCH. */
c277df42 2431 lastbr = br;
821b33a5
IZ
2432 if (flags&HASWIDTH)
2433 *flagp |= HASWIDTH;
a687059c 2434 *flagp |= flags&SPSTART;
a0d0e21e
LW
2435 }
2436
c277df42
IZ
2437 if (have_branch || paren != ':') {
2438 /* Make a closing node, and hook it on the end. */
2439 switch (paren) {
2440 case ':':
830247a4 2441 ender = reg_node(pRExC_state, TAIL);
c277df42
IZ
2442 break;
2443 case 1:
830247a4 2444 ender = reganode(pRExC_state, CLOSE, parno);
fac92740
MJD
2445 Set_Node_Offset(ender,RExC_parse+1); /* MJD */
2446 Set_Node_Length(ender,1); /* MJD */
c277df42
IZ
2447 break;
2448 case '<':
c277df42
IZ
2449 case ',':
2450 case '=':
2451 case '!':
c277df42 2452 *flagp &= ~HASWIDTH;
821b33a5
IZ
2453 /* FALL THROUGH */
2454 case '>':
830247a4 2455 ender = reg_node(pRExC_state, SUCCEED);
c277df42
IZ
2456 break;
2457 case 0:
830247a4 2458 ender = reg_node(pRExC_state, END);
c277df42
IZ
2459 break;
2460 }
830247a4 2461 regtail(pRExC_state, lastbr, ender);
a0d0e21e 2462
c277df42
IZ
2463 if (have_branch) {
2464 /* Hook the tails of the branches to the closing node. */
2465 for (br = ret; br != NULL; br = regnext(br)) {
830247a4 2466 regoptail(pRExC_state, br, ender);
c277df42
IZ
2467 }
2468 }
a0d0e21e 2469 }
c277df42
IZ
2470
2471 {
2472 char *p;
2473 static char parens[] = "=!<,>";
2474
2475 if (paren && (p = strchr(parens, paren))) {
eb160463 2476 U8 node = ((p - parens) % 2) ? UNLESSM : IFMATCH;
c277df42
IZ
2477 int flag = (p - parens) > 1;
2478
2479 if (paren == '>')
2480 node = SUSPEND, flag = 0;
830247a4 2481 reginsert(pRExC_state, node,ret);
ccb2c380
MP
2482 Set_Node_Offset(ret, oregcomp_parse);
2483 Set_Node_Length(ret, RExC_parse - oregcomp_parse + 2);
c277df42 2484 ret->flags = flag;
830247a4 2485 regtail(pRExC_state, ret, reg_node(pRExC_state, TAIL));
c277df42 2486 }
a0d0e21e
LW
2487 }
2488
2489 /* Check for proper termination. */
ce3e6498 2490 if (paren) {
e2509266 2491 RExC_flags = oregflags;
830247a4
IZ
2492 if (RExC_parse >= RExC_end || *nextchar(pRExC_state) != ')') {
2493 RExC_parse = oregcomp_parse;
380a0633 2494 vFAIL("Unmatched (");
ce3e6498 2495 }
a0ed51b3 2496 }
830247a4
IZ
2497 else if (!paren && RExC_parse < RExC_end) {
2498 if (*RExC_parse == ')') {
2499 RExC_parse++;
380a0633 2500 vFAIL("Unmatched )");
a0ed51b3
LW
2501 }
2502 else
b45f050a 2503 FAIL("Junk on end of regexp"); /* "Can't happen". */
a0d0e21e
LW
2504 /* NOTREACHED */
2505 }
a687059c 2506
a0d0e21e 2507 return(ret);
a687059c
LW
2508}
2509
2510/*
2511 - regbranch - one alternative of an | operator
2512 *
2513 * Implements the concatenation operator.
2514 */
76e3520e 2515STATIC regnode *
830247a4 2516S_regbranch(pTHX_ RExC_state_t *pRExC_state, I32 *flagp, I32 first)
a687059c 2517{
c277df42
IZ
2518 register regnode *ret;
2519 register regnode *chain = NULL;
2520 register regnode *latest;
2521 I32 flags = 0, c = 0;
a0d0e21e 2522
b81d288d 2523 if (first)
c277df42
IZ
2524 ret = NULL;
2525 else {
b81d288d 2526 if (!SIZE_ONLY && RExC_extralen)
830247a4 2527 ret = reganode(pRExC_state, BRANCHJ,0);
fac92740 2528 else {
830247a4 2529 ret = reg_node(pRExC_state, BRANCH);
fac92740
MJD
2530 Set_Node_Length(ret, 1);
2531 }
c277df42
IZ
2532 }
2533
b81d288d 2534 if (!first && SIZE_ONLY)
830247a4 2535 RExC_extralen += 1; /* BRANCHJ */
b81d288d 2536
c277df42 2537 *flagp = WORST; /* Tentatively. */
a0d0e21e 2538
830247a4
IZ
2539 RExC_parse--;
2540 nextchar(pRExC_state);
2541 while (RExC_parse < RExC_end && *RExC_parse != '|' && *RExC_parse != ')') {
a0d0e21e 2542 flags &= ~TRYAGAIN;
830247a4 2543 latest = regpiece(pRExC_state, &flags);
a0d0e21e
LW
2544 if (latest == NULL) {
2545 if (flags & TRYAGAIN)
2546 continue;
2547 return(NULL);
a0ed51b3
LW
2548 }
2549 else if (ret == NULL)
c277df42 2550 ret = latest;
a0d0e21e 2551 *flagp |= flags&HASWIDTH;
c277df42 2552 if (chain == NULL) /* First piece. */
a0d0e21e
LW
2553 *flagp |= flags&SPSTART;
2554 else {
830247a4
IZ
2555 RExC_naughty++;
2556 regtail(pRExC_state, chain, latest);
a687059c 2557 }
a0d0e21e 2558 chain = latest;
c277df42
IZ
2559 c++;
2560 }
2561 if (chain == NULL) { /* Loop ran zero times. */
830247a4 2562 chain = reg_node(pRExC_state, NOTHING);
c277df42
IZ
2563 if (ret == NULL)
2564 ret = chain;
2565 }
2566 if (c == 1) {
2567 *flagp |= flags&SIMPLE;
a0d0e21e 2568 }
a687059c 2569
a0d0e21e 2570 return(ret);
a687059c
LW
2571}
2572
2573/*
2574 - regpiece - something followed by possible [*+?]
2575 *
2576 * Note that the branching code sequences used for ? and the general cases
2577 * of * and + are somewhat optimized: they use the same NOTHING node as
2578 * both the endmarker for their branch list and the body of the last branch.
2579 * It might seem that this node could be dispensed with entirely, but the
2580 * endmarker role is not redundant.
2581 */
76e3520e 2582STATIC regnode *
830247a4 2583S_regpiece(pTHX_ RExC_state_t *pRExC_state, I32 *flagp)
a687059c 2584{
c277df42 2585 register regnode *ret;
a0d0e21e
LW
2586 register char op;
2587 register char *next;
2588 I32 flags;
830247a4 2589 char *origparse = RExC_parse;
a0d0e21e
LW
2590 char *maxpos;
2591 I32 min;
c277df42 2592 I32 max = REG_INFTY;
fac92740 2593 char *parse_start;
a0d0e21e 2594
830247a4 2595 ret = regatom(pRExC_state, &flags);
a0d0e21e
LW
2596 if (ret == NULL) {
2597 if (flags & TRYAGAIN)
2598 *flagp |= TRYAGAIN;
2599 return(NULL);
2600 }
2601
830247a4 2602 op = *RExC_parse;
a0d0e21e 2603
830247a4 2604 if (op == '{' && regcurly(RExC_parse)) {
fac92740 2605 parse_start = RExC_parse; /* MJD */
830247a4 2606 next = RExC_parse + 1;
a0d0e21e
LW
2607 maxpos = Nullch;
2608 while (isDIGIT(*next) || *next == ',') {
2609 if (*next == ',') {
2610 if (maxpos)
2611 break;
2612 else
2613 maxpos = next;
a687059c 2614 }
a0d0e21e
LW
2615 next++;
2616 }
2617 if (*next == '}') { /* got one */
2618 if (!maxpos)
2619 maxpos = next;
830247a4
IZ
2620 RExC_parse++;
2621 min = atoi(RExC_parse);
a0d0e21e
LW
2622 if (*maxpos == ',')
2623 maxpos++;
2624 else
830247a4 2625 maxpos = RExC_parse;
a0d0e21e
LW
2626 max = atoi(maxpos);
2627 if (!max && *maxpos != '0')
c277df42
IZ
2628 max = REG_INFTY; /* meaning "infinity" */
2629 else if (max >= REG_INFTY)
8615cb43 2630 vFAIL2("Quantifier in {,} bigger than %d", REG_INFTY - 1);
830247a4
IZ
2631 RExC_parse = next;
2632 nextchar(pRExC_state);
a0d0e21e
LW
2633
2634 do_curly:
2635 if ((flags&SIMPLE)) {
830247a4
IZ
2636 RExC_naughty += 2 + RExC_naughty / 2;
2637 reginsert(pRExC_state, CURLY, ret);
fac92740
MJD
2638 Set_Node_Offset(ret, parse_start+1); /* MJD */
2639 Set_Node_Cur_Length(ret);
a0d0e21e
LW
2640 }
2641 else {
830247a4 2642 regnode *w = reg_node(pRExC_state, WHILEM);
2c2d71f5
JH
2643
2644 w->flags = 0;
830247a4
IZ
2645 regtail(pRExC_state, ret, w);
2646 if (!SIZE_ONLY && RExC_extralen) {
2647 reginsert(pRExC_state, LONGJMP,ret);
2648 reginsert(pRExC_state, NOTHING,ret);
c277df42
IZ
2649 NEXT_OFF(ret) = 3; /* Go over LONGJMP. */
2650 }
830247a4 2651 reginsert(pRExC_state, CURLYX,ret);
fac92740
MJD
2652 /* MJD hk */
2653 Set_Node_Offset(ret, parse_start+1);
2654 Set_Node_Length(ret,
2655 op == '{' ? (RExC_parse - parse_start) : 1);
2656
830247a4 2657 if (!SIZE_ONLY && RExC_extralen)
c277df42 2658 NEXT_OFF(ret) = 3; /* Go over NOTHING to LONGJMP. */
830247a4 2659 regtail(pRExC_state, ret, reg_node(pRExC_state, NOTHING));
c277df42 2660 if (SIZE_ONLY)
830247a4
IZ
2661 RExC_whilem_seen++, RExC_extralen += 3;
2662 RExC_naughty += 4 + RExC_naughty; /* compound interest */
a0d0e21e 2663 }
c277df42 2664 ret->flags = 0;
a0d0e21e
LW
2665
2666 if (min > 0)
821b33a5
IZ
2667 *flagp = WORST;
2668 if (max > 0)
2669 *flagp |= HASWIDTH;
a0d0e21e 2670 if (max && max < min)
8615cb43 2671 vFAIL("Can't do {n,m} with n > m");
c277df42 2672 if (!SIZE_ONLY) {
eb160463
GS
2673 ARG1_SET(ret, (U16)min);
2674 ARG2_SET(ret, (U16)max);
a687059c 2675 }
a687059c 2676
a0d0e21e 2677 goto nest_check;
a687059c 2678 }
a0d0e21e 2679 }
a687059c 2680
a0d0e21e
LW
2681 if (!ISMULT1(op)) {
2682 *flagp = flags;
a687059c 2683 return(ret);
a0d0e21e 2684 }
bb20fd44 2685
c277df42 2686#if 0 /* Now runtime fix should be reliable. */
b45f050a
JF
2687
2688 /* if this is reinstated, don't forget to put this back into perldiag:
2689
2690 =item Regexp *+ operand could be empty at {#} in regex m/%s/
2691
2692 (F) The part of the regexp subject to either the * or + quantifier
2693 could match an empty string. The {#} shows in the regular
2694 expression about where the problem was discovered.
2695
2696 */
2697
bb20fd44 2698 if (!(flags&HASWIDTH) && op != '?')
b45f050a 2699 vFAIL("Regexp *+ operand could be empty");
b81d288d 2700#endif
bb20fd44 2701
fac92740 2702 parse_start = RExC_parse;
830247a4 2703 nextchar(pRExC_state);
a0d0e21e 2704
821b33a5 2705 *flagp = (op != '+') ? (WORST|SPSTART|HASWIDTH) : (WORST|HASWIDTH);
a0d0e21e
LW
2706
2707 if (op == '*' && (flags&SIMPLE)) {
830247a4 2708 reginsert(pRExC_state, STAR, ret);
c277df42 2709 ret->flags = 0;
830247a4 2710 RExC_naughty += 4;
a0d0e21e
LW
2711 }
2712 else if (op == '*') {
2713 min = 0;
2714 goto do_curly;
a0ed51b3
LW
2715 }
2716 else if (op == '+' && (flags&SIMPLE)) {
830247a4 2717 reginsert(pRExC_state, PLUS, ret);
c277df42 2718 ret->flags = 0;
830247a4 2719 RExC_naughty += 3;
a0d0e21e
LW
2720 }
2721 else if (op == '+') {
2722 min = 1;
2723 goto do_curly;
a0ed51b3
LW
2724 }
2725 else if (op == '?') {
a0d0e21e
LW
2726 min = 0; max = 1;
2727 goto do_curly;
2728 }
2729 nest_check:
e476b1b5 2730 if (ckWARN(WARN_REGEXP) && !SIZE_ONLY && !(flags&HASWIDTH) && max > REG_INFTY/3) {
830247a4 2731 vWARN3(RExC_parse,
b45f050a 2732 "%.*s matches null string many times",
830247a4 2733 RExC_parse - origparse,
b45f050a 2734 origparse);
a0d0e21e
LW
2735 }
2736
830247a4
IZ
2737 if (*RExC_parse == '?') {
2738 nextchar(pRExC_state);
2739 reginsert(pRExC_state, MINMOD, ret);
2740 regtail(pRExC_state, ret, ret + NODE_STEP_REGNODE);
a0d0e21e 2741 }
830247a4
IZ
2742 if (ISMULT2(RExC_parse)) {
2743 RExC_parse++;
b45f050a
JF
2744 vFAIL("Nested quantifiers");
2745 }
a0d0e21e
LW
2746
2747 return(ret);
a687059c
LW
2748}
2749
2750/*
2751 - regatom - the lowest level
2752 *
2753 * Optimization: gobbles an entire sequence of ordinary characters so that
2754 * it can turn them into a single node, which is smaller to store and
2755 * faster to run. Backslashed characters are exceptions, each becoming a
2756 * separate node; the code is simpler that way and it's not worth fixing.
2757 *
b45f050a 2758 * [Yes, it is worth fixing, some scripts can run twice the speed.] */
76e3520e 2759STATIC regnode *
830247a4 2760S_regatom(pTHX_ RExC_state_t *pRExC_state, I32 *flagp)
a687059c 2761{
c277df42 2762 register regnode *ret = 0;
a0d0e21e 2763 I32 flags;
f06dbbb7 2764 char *parse_start = 0;
a0d0e21e
LW
2765
2766 *flagp = WORST; /* Tentatively. */
2767
2768tryagain:
830247a4 2769 switch (*RExC_parse) {
a0d0e21e 2770 case '^':
830247a4
IZ
2771 RExC_seen_zerolen++;
2772 nextchar(pRExC_state);
e2509266 2773 if (RExC_flags & PMf_MULTILINE)
830247a4 2774 ret = reg_node(pRExC_state, MBOL);
e2509266 2775 else if (RExC_flags & PMf_SINGLELINE)
830247a4 2776 ret = reg_node(pRExC_state, SBOL);
a0d0e21e 2777 else
830247a4 2778 ret = reg_node(pRExC_state, BOL);
fac92740 2779 Set_Node_Length(ret, 1); /* MJD */
a0d0e21e
LW
2780 break;
2781 case '$':
830247a4 2782 nextchar(pRExC_state);
b81d288d 2783 if (*RExC_parse)
830247a4 2784 RExC_seen_zerolen++;
e2509266 2785 if (RExC_flags & PMf_MULTILINE)
830247a4 2786 ret = reg_node(pRExC_state, MEOL);
e2509266 2787 else if (RExC_flags & PMf_SINGLELINE)
830247a4 2788 ret = reg_node(pRExC_state, SEOL);
a0d0e21e 2789 else
830247a4 2790 ret = reg_node(pRExC_state, EOL);
fac92740 2791 Set_Node_Length(ret, 1); /* MJD */
a0d0e21e
LW
2792 break;
2793 case '.':
830247a4 2794 nextchar(pRExC_state);
e2509266 2795 if (RExC_flags & PMf_SINGLELINE)
ffc61ed2
JH
2796 ret = reg_node(pRExC_state, SANY);
2797 else
2798 ret = reg_node(pRExC_state, REG_ANY);
2799 *flagp |= HASWIDTH|SIMPLE;
830247a4 2800 RExC_naughty++;
fac92740 2801 Set_Node_Length(ret, 1); /* MJD */
a0d0e21e
LW
2802 break;
2803 case '[':
b45f050a 2804 {
830247a4 2805 char *oregcomp_parse = ++RExC_parse;
ffc61ed2 2806 ret = regclass(pRExC_state);
830247a4
IZ
2807 if (*RExC_parse != ']') {
2808 RExC_parse = oregcomp_parse;
b45f050a
JF
2809 vFAIL("Unmatched [");
2810 }
830247a4 2811 nextchar(pRExC_state);
a0d0e21e 2812 *flagp |= HASWIDTH|SIMPLE;
fac92740 2813 Set_Node_Length(ret, RExC_parse - oregcomp_parse + 1); /* MJD */
a0d0e21e 2814 break;
b45f050a 2815 }
a0d0e21e 2816 case '(':
830247a4
IZ
2817 nextchar(pRExC_state);
2818 ret = reg(pRExC_state, 1, &flags);
a0d0e21e 2819 if (ret == NULL) {
bf93d4cc 2820 if (flags & TRYAGAIN) {
830247a4 2821 if (RExC_parse == RExC_end) {
bf93d4cc
GS
2822 /* Make parent create an empty node if needed. */
2823 *flagp |= TRYAGAIN;
2824 return(NULL);
2825 }
a0d0e21e 2826 goto tryagain;
bf93d4cc 2827 }
a0d0e21e
LW
2828 return(NULL);
2829 }
c277df42 2830 *flagp |= flags&(HASWIDTH|SPSTART|SIMPLE);
a0d0e21e
LW
2831 break;
2832 case '|':
2833 case ')':
2834 if (flags & TRYAGAIN) {
2835 *flagp |= TRYAGAIN;
2836 return NULL;
2837 }
b45f050a 2838 vFAIL("Internal urp");
a0d0e21e
LW
2839 /* Supposed to be caught earlier. */
2840 break;
85afd4ae 2841 case '{':
830247a4
IZ
2842 if (!regcurly(RExC_parse)) {
2843 RExC_parse++;
85afd4ae
CS
2844 goto defchar;
2845 }
2846 /* FALL THROUGH */
a0d0e21e
LW
2847 case '?':
2848 case '+':
2849 case '*':
830247a4 2850 RExC_parse++;
b45f050a 2851 vFAIL("Quantifier follows nothing");
a0d0e21e
LW
2852 break;
2853 case '\\':
830247a4 2854 switch (*++RExC_parse) {
a0d0e21e 2855 case 'A':
830247a4
IZ
2856 RExC_seen_zerolen++;
2857 ret = reg_node(pRExC_state, SBOL);
a0d0e21e 2858 *flagp |= SIMPLE;
830247a4 2859 nextchar(pRExC_state);
fac92740 2860 Set_Node_Length(ret, 2); /* MJD */
a0d0e21e
LW
2861 break;
2862 case 'G':
830247a4
IZ
2863 ret = reg_node(pRExC_state, GPOS);
2864 RExC_seen |= REG_SEEN_GPOS;
a0d0e21e 2865 *flagp |= SIMPLE;
830247a4 2866 nextchar(pRExC_state);
fac92740 2867 Set_Node_Length(ret, 2); /* MJD */
a0d0e21e
LW
2868 break;
2869 case 'Z':
830247a4 2870 ret = reg_node(pRExC_state, SEOL);
a0d0e21e 2871 *flagp |= SIMPLE;
a1917ab9 2872 RExC_seen_zerolen++; /* Do not optimize RE away */
830247a4 2873 nextchar(pRExC_state);
a0d0e21e 2874 break;
b85d18e9 2875 case 'z':
830247a4 2876 ret = reg_node(pRExC_state, EOS);
b85d18e9 2877 *flagp |= SIMPLE;
830247a4
IZ
2878 RExC_seen_zerolen++; /* Do not optimize RE away */
2879 nextchar(pRExC_state);
fac92740 2880 Set_Node_Length(ret, 2); /* MJD */
b85d18e9 2881 break;
4a2d328f 2882 case 'C':
f33976b4
DB
2883 ret = reg_node(pRExC_state, CANY);
2884 RExC_seen |= REG_SEEN_CANY;
a0ed51b3 2885 *flagp |= HASWIDTH|SIMPLE;
830247a4 2886 nextchar(pRExC_state);
fac92740 2887 Set_Node_Length(ret, 2); /* MJD */
a0ed51b3
LW
2888 break;
2889 case 'X':
830247a4 2890 ret = reg_node(pRExC_state, CLUMP);
a0ed51b3 2891 *flagp |= HASWIDTH;
830247a4 2892 nextchar(pRExC_state);
fac92740 2893 Set_Node_Length(ret, 2); /* MJD */
a0ed51b3 2894 break;
a0d0e21e 2895 case 'w':
eb160463 2896 ret = reg_node(pRExC_state, (U8)(LOC ? ALNUML : ALNUM));
a0d0e21e 2897 *flagp |= HASWIDTH|SIMPLE;
830247a4 2898 nextchar(pRExC_state);
fac92740 2899 Set_Node_Length(ret, 2); /* MJD */
a0d0e21e
LW
2900 break;
2901 case 'W':
eb160463 2902 ret = reg_node(pRExC_state, (U8)(LOC ? NALNUML : NALNUM));
a0d0e21e 2903 *flagp |= HASWIDTH|SIMPLE;
830247a4 2904 nextchar(pRExC_state);
fac92740 2905 Set_Node_Length(ret, 2); /* MJD */
a0d0e21e
LW
2906 break;
2907 case 'b':
830247a4
IZ
2908 RExC_seen_zerolen++;
2909 RExC_seen |= REG_SEEN_LOOKBEHIND;
eb160463 2910 ret = reg_node(pRExC_state, (U8)(LOC ? BOUNDL : BOUND));
a0d0e21e 2911 *flagp |= SIMPLE;
830247a4 2912 nextchar(pRExC_state);
fac92740 2913 Set_Node_Length(ret, 2); /* MJD */
a0d0e21e
LW
2914 break;
2915 case 'B':
830247a4
IZ
2916 RExC_seen_zerolen++;
2917 RExC_seen |= REG_SEEN_LOOKBEHIND;
eb160463 2918 ret = reg_node(pRExC_state, (U8)(LOC ? NBOUNDL : NBOUND));
a0d0e21e 2919 *flagp |= SIMPLE;
830247a4 2920 nextchar(pRExC_state);
fac92740 2921 Set_Node_Length(ret, 2); /* MJD */
a0d0e21e
LW
2922 break;
2923 case 's':
eb160463 2924 ret = reg_node(pRExC_state, (U8)(LOC ? SPACEL : SPACE));
a0d0e21e 2925 *flagp |= HASWIDTH|SIMPLE;
830247a4 2926 nextchar(pRExC_state);
fac92740 2927 Set_Node_Length(ret, 2); /* MJD */
a0d0e21e
LW
2928 break;
2929 case 'S':
eb160463 2930 ret = reg_node(pRExC_state, (U8)(LOC ? NSPACEL : NSPACE));
a0d0e21e 2931 *flagp |= HASWIDTH|SIMPLE;
830247a4 2932 nextchar(pRExC_state);
fac92740 2933 Set_Node_Length(ret, 2); /* MJD */
a0d0e21e
LW
2934 break;
2935 case 'd':
ffc61ed2 2936 ret = reg_node(pRExC_state, DIGIT);
a0d0e21e 2937 *flagp |= HASWIDTH|SIMPLE;
830247a4 2938 nextchar(pRExC_state);
fac92740 2939 Set_Node_Length(ret, 2); /* MJD */
a0d0e21e
LW
2940 break;
2941 case 'D':
ffc61ed2 2942 ret = reg_node(pRExC_state, NDIGIT);
a0d0e21e 2943 *flagp |= HASWIDTH|SIMPLE;
830247a4 2944 nextchar(pRExC_state);
fac92740 2945 Set_Node_Length(ret, 2); /* MJD */
a0d0e21e 2946 break;
a14b48bc
LW
2947 case 'p':
2948 case 'P':
3568d838 2949 {
830247a4 2950 char* oldregxend = RExC_end;
ccb2c380 2951 char* parse_start = RExC_parse - 2;
a14b48bc 2952
830247a4 2953 if (RExC_parse[1] == '{') {
3568d838 2954 /* a lovely hack--pretend we saw [\pX] instead */
830247a4
IZ
2955 RExC_end = strchr(RExC_parse, '}');
2956 if (!RExC_end) {
0da60cf5 2957 U8 c = (U8)*RExC_parse;
830247a4
IZ
2958 RExC_parse += 2;
2959 RExC_end = oldregxend;
0da60cf5 2960 vFAIL2("Missing right brace on \\%c{}", c);
b45f050a 2961 }
830247a4 2962 RExC_end++;
a14b48bc 2963 }
af6f566e 2964 else {
830247a4 2965 RExC_end = RExC_parse + 2;
af6f566e
HS
2966 if (RExC_end > oldregxend)
2967 RExC_end = oldregxend;
2968 }
830247a4 2969 RExC_parse--;
a14b48bc 2970
ffc61ed2 2971 ret = regclass(pRExC_state);
a14b48bc 2972
830247a4
IZ
2973 RExC_end = oldregxend;
2974 RExC_parse--;
ccb2c380
MP
2975
2976 Set_Node_Offset(ret, parse_start + 2);
2977 Set_Node_Cur_Length(ret);
830247a4 2978 nextchar(pRExC_state);
a14b48bc
LW
2979 *flagp |= HASWIDTH|SIMPLE;
2980 }
2981 break;
a0d0e21e
LW
2982 case 'n':
2983 case 'r':
2984 case 't':
2985 case 'f':
2986 case 'e':
2987 case 'a':
2988 case 'x':
2989 case 'c':
2990 case '0':
2991 goto defchar;
2992 case '1': case '2': case '3': case '4':
2993 case '5': case '6': case '7': case '8': case '9':
2994 {
830247a4 2995 I32 num = atoi(RExC_parse);
a0d0e21e 2996
830247a4 2997 if (num > 9 && num >= RExC_npar)
a0d0e21e
LW
2998 goto defchar;
2999 else {
fac92740 3000 char * parse_start = RExC_parse - 1; /* MJD */
830247a4
IZ
3001 while (isDIGIT(*RExC_parse))
3002 RExC_parse++;
b45f050a 3003
eb160463 3004 if (!SIZE_ONLY && num > (I32)RExC_rx->nparens)
9baa0206 3005 vFAIL("Reference to nonexistent group");
830247a4 3006 RExC_sawback = 1;
eb160463
GS
3007 ret = reganode(pRExC_state,
3008 (U8)(FOLD ? (LOC ? REFFL : REFF) : REF),
3009 num);
a0d0e21e 3010 *flagp |= HASWIDTH;
fac92740
MJD
3011
3012 /* override incorrect value set in reganode MJD */
3013 Set_Node_Offset(ret, parse_start+1);
3014 Set_Node_Cur_Length(ret); /* MJD */
830247a4
IZ
3015 RExC_parse--;
3016 nextchar(pRExC_state);
a0d0e21e
LW
3017 }
3018 }
3019 break;
3020 case '\0':
830247a4 3021 if (RExC_parse >= RExC_end)
b45f050a 3022 FAIL("Trailing \\");
a0d0e21e
LW
3023 /* FALL THROUGH */
3024 default:
c9f97d15
IZ
3025 /* Do not generate `unrecognized' warnings here, we fall
3026 back into the quick-grab loop below */
a0d0e21e
LW
3027 goto defchar;
3028 }
3029 break;
4633a7c4
LW
3030
3031 case '#':
e2509266 3032 if (RExC_flags & PMf_EXTENDED) {
830247a4
IZ
3033 while (RExC_parse < RExC_end && *RExC_parse != '\n') RExC_parse++;
3034 if (RExC_parse < RExC_end)
4633a7c4
LW
3035 goto tryagain;
3036 }
3037 /* FALL THROUGH */
3038
a0d0e21e 3039 default: {
ba210ebe 3040 register STRLEN len;
58ae7d3f 3041 register UV ender;
a0d0e21e 3042 register char *p;
c277df42 3043 char *oldp, *s;
ba210ebe 3044 STRLEN numlen;
80aecb99 3045 STRLEN foldlen;
60a8b682 3046 U8 tmpbuf[UTF8_MAXLEN_FOLD+1], *foldbuf;
f06dbbb7
JH
3047
3048 parse_start = RExC_parse - 1;
a0d0e21e 3049
830247a4 3050 RExC_parse++;
a0d0e21e
LW
3051
3052 defchar:
58ae7d3f 3053 ender = 0;
eb160463
GS
3054 ret = reg_node(pRExC_state,
3055 (U8)(FOLD ? (LOC ? EXACTFL : EXACTF) : EXACT));
cd439c50 3056 s = STRING(ret);
830247a4
IZ
3057 for (len = 0, p = RExC_parse - 1;
3058 len < 127 && p < RExC_end;
a0d0e21e
LW
3059 len++)
3060 {
3061 oldp = p;
5b5a24f7 3062
e2509266 3063 if (RExC_flags & PMf_EXTENDED)
830247a4 3064 p = regwhite(p, RExC_end);
a0d0e21e
LW
3065 switch (*p) {
3066 case '^':
3067 case '$':
3068 case '.':
3069 case '[':
3070 case '(':
3071 case ')':
3072 case '|':
3073 goto loopdone;
3074 case '\\':
3075 switch (*++p) {
3076 case 'A':
1ed8eac0
JF
3077 case 'C':
3078 case 'X':
a0d0e21e
LW
3079 case 'G':
3080 case 'Z':
b85d18e9 3081 case 'z':
a0d0e21e
LW
3082 case 'w':
3083 case 'W':
3084 case 'b':
3085 case 'B':
3086 case 's':
3087 case 'S':
3088 case 'd':
3089 case 'D':
a14b48bc
LW
3090 case 'p':
3091 case 'P':
a0d0e21e
LW
3092 --p;
3093 goto loopdone;
3094 case 'n':
3095 ender = '\n';
3096 p++;
a687059c 3097 break;
a0d0e21e
LW
3098 case 'r':
3099 ender = '\r';
3100 p++;
a687059c 3101 break;
a0d0e21e
LW
3102 case 't':
3103 ender = '\t';
3104 p++;
a687059c 3105 break;
a0d0e21e
LW
3106 case 'f':
3107 ender = '\f';
3108 p++;
a687059c 3109 break;
a0d0e21e 3110 case 'e':
c7f1f016 3111 ender = ASCII_TO_NATIVE('\033');
a0d0e21e 3112 p++;
a687059c 3113 break;
a0d0e21e 3114 case 'a':
c7f1f016 3115 ender = ASCII_TO_NATIVE('\007');
a0d0e21e 3116 p++;
a687059c 3117 break;
a0d0e21e 3118 case 'x':
a0ed51b3
LW
3119 if (*++p == '{') {
3120 char* e = strchr(p, '}');
b81d288d 3121
b45f050a 3122 if (!e) {
830247a4 3123 RExC_parse = p + 1;
b45f050a
JF
3124 vFAIL("Missing right brace on \\x{}");
3125 }
de5f0749 3126 else {
a4c04bdc
NC
3127 I32 flags = PERL_SCAN_ALLOW_UNDERSCORES
3128 | PERL_SCAN_DISALLOW_PREFIX;
53305cf1
NC
3129 numlen = e - p - 1;
3130 ender = grok_hex(p + 1, &numlen, &flags, NULL);
aaa80028
JH
3131 if (ender > 0xff)
3132 RExC_utf8 = 1;
b21ed0a9
GS
3133 /* numlen is generous */
3134 if (numlen + len >= 127) {
a0ed51b3
LW
3135 p--;
3136 goto loopdone;
3137 }
3138 p = e + 1;
3139 }
a0ed51b3
LW
3140 }
3141 else {
a4c04bdc 3142 I32 flags = PERL_SCAN_DISALLOW_PREFIX;
53305cf1
NC
3143 numlen = 2;
3144 ender = grok_hex(p, &numlen, &flags, NULL);
a0ed51b3
LW
3145 p += numlen;
3146 }
a687059c 3147 break;
a0d0e21e
LW
3148 case 'c':
3149 p++;
bbce6d69 3150 ender = UCHARAT(p++);
3151 ender = toCTRL(ender);
a687059c 3152 break;
a0d0e21e
LW
3153 case '0': case '1': case '2': case '3':case '4':
3154 case '5': case '6': case '7': case '8':case '9':
3155 if (*p == '0' ||
830247a4 3156 (isDIGIT(p[1]) && atoi(p) >= RExC_npar) ) {
53305cf1
NC
3157 I32 flags = 0;
3158 numlen = 3;
3159 ender = grok_oct(p, &numlen, &flags, NULL);
a0d0e21e
LW
3160 p += numlen;
3161 }
3162 else {
3163 --p;
3164 goto loopdone;
a687059c
LW
3165 }
3166 break;
a0d0e21e 3167 case '\0':
830247a4 3168 if (p >= RExC_end)
b45f050a 3169 FAIL("Trailing \\");
a687059c 3170 /* FALL THROUGH */
a0d0e21e 3171 default:
e476b1b5 3172 if (!SIZE_ONLY && ckWARN(WARN_REGEXP) && isALPHA(*p))
4193bef7 3173 vWARN2(p + 1, "Unrecognized escape \\%c passed through", UCHARAT(p));
a0ed51b3 3174 goto normal_default;
a0d0e21e
LW
3175 }
3176 break;
a687059c 3177 default:
a0ed51b3 3178 normal_default:
fd400ab9 3179 if (UTF8_IS_START(*p) && UTF) {
5e12f4fb 3180 ender = utf8n_to_uvchr((U8*)p, RExC_end - p,
ba210ebe 3181 &numlen, 0);
a0ed51b3
LW
3182 p += numlen;
3183 }
3184 else
3185 ender = *p++;
a0d0e21e 3186 break;
a687059c 3187 }
e2509266 3188 if (RExC_flags & PMf_EXTENDED)
830247a4 3189 p = regwhite(p, RExC_end);
60a8b682
JH
3190 if (UTF && FOLD) {
3191 /* Prime the casefolded buffer. */
ac7e0132 3192 ender = toFOLD_uni(ender, tmpbuf, &foldlen);
60a8b682 3193 }
a0d0e21e
LW
3194 if (ISMULT2(p)) { /* Back off on ?+*. */
3195 if (len)
3196 p = oldp;
16ea2a2e 3197 else if (UTF) {
0ebc6274
JH
3198 STRLEN unilen;
3199
80aecb99 3200 if (FOLD) {
60a8b682 3201 /* Emit all the Unicode characters. */
80aecb99
JH
3202 for (foldbuf = tmpbuf;
3203 foldlen;
3204 foldlen -= numlen) {
3205 ender = utf8_to_uvchr(foldbuf, &numlen);
9dc45d57 3206 if (numlen > 0) {
0ebc6274
JH
3207 reguni(pRExC_state, ender, s, &unilen);
3208 s += unilen;
3209 len += unilen;
3210 /* In EBCDIC the numlen
3211 * and unilen can differ. */
9dc45d57 3212 foldbuf += numlen;
47654450
JH
3213 if (numlen >= foldlen)
3214 break;
9dc45d57
JH
3215 }
3216 else
3217 break; /* "Can't happen." */
80aecb99
JH
3218 }
3219 }
3220 else {
0ebc6274 3221 reguni(pRExC_state, ender, s, &unilen);
9ede7db1 3222 if (unilen > 0) {
0ebc6274
JH
3223 s += unilen;
3224 len += unilen;
9dc45d57 3225 }
80aecb99 3226 }
a0ed51b3 3227 }
a0d0e21e
LW
3228 else {
3229 len++;
eb160463 3230 REGC((char)ender, s++);
a0d0e21e
LW
3231 }
3232 break;
a687059c 3233 }
16ea2a2e 3234 if (UTF) {
0ebc6274
JH
3235 STRLEN unilen;
3236
80aecb99 3237 if (FOLD) {
60a8b682 3238 /* Emit all the Unicode characters. */
80aecb99
JH
3239 for (foldbuf = tmpbuf;
3240 foldlen;
3241 foldlen -= numlen) {
3242 ender = utf8_to_uvchr(foldbuf, &numlen);
9dc45d57 3243 if (numlen > 0) {
0ebc6274
JH
3244 reguni(pRExC_state, ender, s, &unilen);
3245 len += unilen;
3246 s += unilen;
3247 /* In EBCDIC the numlen
3248 * and unilen can differ. */
9dc45d57 3249 foldbuf += numlen;
47654450
JH
3250 if (numlen >= foldlen)
3251 break;
9dc45d57
JH
3252 }
3253 else
3254 break;
80aecb99
JH
3255 }
3256 }
3257 else {
0ebc6274 3258 reguni(pRExC_state, ender, s, &unilen);
9ede7db1 3259 if (unilen > 0) {
0ebc6274
JH
3260 s += unilen;
3261 len += unilen;
9dc45d57 3262 }
80aecb99
JH
3263 }
3264 len--;
a0ed51b3
LW
3265 }
3266 else
eb160463 3267 REGC((char)ender, s++);
a0d0e21e
LW
3268 }
3269 loopdone:
830247a4 3270 RExC_parse = p - 1;
fac92740 3271 Set_Node_Cur_Length(ret); /* MJD */
830247a4 3272 nextchar(pRExC_state);
793db0cb
JH
3273 {
3274 /* len is STRLEN which is unsigned, need to copy to signed */
3275 IV iv = len;
3276 if (iv < 0)
3277 vFAIL("Internal disaster");
3278 }
a0d0e21e
LW
3279 if (len > 0)
3280 *flagp |= HASWIDTH;
3281 if (len == 1)
3282 *flagp |= SIMPLE;
c277df42 3283 if (!SIZE_ONLY)
cd439c50
IZ
3284 STR_LEN(ret) = len;
3285 if (SIZE_ONLY)
830247a4 3286 RExC_size += STR_SZ(len);
cd439c50 3287 else
830247a4 3288 RExC_emit += STR_SZ(len);
a687059c 3289 }
a0d0e21e
LW
3290 break;
3291 }
a687059c 3292
60a8b682
JH
3293 /* If the encoding pragma is in effect recode the text of
3294 * any EXACT-kind nodes. */
22c54be3 3295 if (PL_encoding && PL_regkind[(U8)OP(ret)] == EXACT) {
d0063567
DK
3296 STRLEN oldlen = STR_LEN(ret);
3297 SV *sv = sv_2mortal(newSVpvn(STRING(ret), oldlen));
3298
3299 if (RExC_utf8)
3300 SvUTF8_on(sv);
3301 if (sv_utf8_downgrade(sv, TRUE)) {
3302 char *s = sv_recode_to_utf8(sv, PL_encoding);
3303 STRLEN newlen = SvCUR(sv);
3304
3305 if (SvUTF8(sv))
3306 RExC_utf8 = 1;
3307 if (!SIZE_ONLY) {
3308 DEBUG_r(PerlIO_printf(Perl_debug_log, "recode %*s to %*s\n",
3309 (int)oldlen, STRING(ret),
3310 (int)newlen, s));
3311 Copy(s, STRING(ret), newlen, char);
3312 STR_LEN(ret) += newlen - oldlen;
3313 RExC_emit += STR_SZ(newlen) - STR_SZ(oldlen);
3314 } else
3315 RExC_size += STR_SZ(newlen) - STR_SZ(oldlen);
3316 }
a72c7584
JH
3317 }
3318
a0d0e21e 3319 return(ret);
a687059c
LW
3320}
3321
873ef191 3322STATIC char *
cea2e8a9 3323S_regwhite(pTHX_ char *p, char *e)
5b5a24f7
CS
3324{
3325 while (p < e) {
3326 if (isSPACE(*p))
3327 ++p;
3328 else if (*p == '#') {
3329 do {
3330 p++;
3331 } while (p < e && *p != '\n');
3332 }
3333 else
3334 break;
3335 }
3336 return p;
3337}
3338
b8c5462f
JH
3339/* Parse POSIX character classes: [[:foo:]], [[=foo=]], [[.foo.]].
3340 Character classes ([:foo:]) can also be negated ([:^foo:]).
3341 Returns a named class id (ANYOF_XXX) if successful, -1 otherwise.
3342 Equivalence classes ([=foo=]) and composites ([.foo.]) are parsed,
beeb77fc 3343 but trigger failures because they are currently unimplemented. */
9a86a77b
JH
3344
3345#define POSIXCC_DONE(c) ((c) == ':')
3346#define POSIXCC_NOTYET(c) ((c) == '=' || (c) == '.')
3347#define POSIXCC(c) (POSIXCC_DONE(c) || POSIXCC_NOTYET(c))
3348
b8c5462f 3349STATIC I32
830247a4 3350S_regpposixcc(pTHX_ RExC_state_t *pRExC_state, I32 value)
620e46c5
JH
3351{
3352 char *posixcc = 0;
936ed897 3353 I32 namedclass = OOB_NAMEDCLASS;
620e46c5 3354
830247a4 3355 if (value == '[' && RExC_parse + 1 < RExC_end &&
620e46c5 3356 /* I smell either [: or [= or [. -- POSIX has been here, right? */
9a86a77b
JH
3357 POSIXCC(UCHARAT(RExC_parse))) {
3358 char c = UCHARAT(RExC_parse);
830247a4 3359 char* s = RExC_parse++;
b81d288d 3360
9a86a77b 3361 while (RExC_parse < RExC_end && UCHARAT(RExC_parse) != c)
830247a4
IZ
3362 RExC_parse++;
3363 if (RExC_parse == RExC_end)
620e46c5 3364 /* Grandfather lone [:, [=, [. */
830247a4 3365 RExC_parse = s;
620e46c5 3366 else {
830247a4 3367 char* t = RExC_parse++; /* skip over the c */
b8c5462f 3368
9a86a77b 3369 if (UCHARAT(RExC_parse) == ']') {
830247a4 3370 RExC_parse++; /* skip over the ending ] */
b8c5462f
JH
3371 posixcc = s + 1;
3372 if (*s == ':') {
3373 I32 complement = *posixcc == '^' ? *posixcc++ : 0;
3374 I32 skip = 5; /* the most common skip */
3375
3376 switch (*posixcc) {
3377 case 'a':
3378 if (strnEQ(posixcc, "alnum", 5))
3379 namedclass =
3380 complement ? ANYOF_NALNUMC : ANYOF_ALNUMC;
3381 else if (strnEQ(posixcc, "alpha", 5))
3382 namedclass =
3383 complement ? ANYOF_NALPHA : ANYOF_ALPHA;
3384 else if (strnEQ(posixcc, "ascii", 5))
3385 namedclass =
3386 complement ? ANYOF_NASCII : ANYOF_ASCII;
3387 break;
aaa51d5e
JF
3388 case 'b':
3389 if (strnEQ(posixcc, "blank", 5))
3390 namedclass =
3391 complement ? ANYOF_NBLANK : ANYOF_BLANK;
3392 break;
b8c5462f
JH
3393 case 'c':
3394 if (strnEQ(posixcc, "cntrl", 5))
3395 namedclass =
3396 complement ? ANYOF_NCNTRL : ANYOF_CNTRL;
3397 break;
3398 case 'd':
3399 if (strnEQ(posixcc, "digit", 5))
3400 namedclass =
3401 complement ? ANYOF_NDIGIT : ANYOF_DIGIT;
3402 break;
3403 case 'g':
3404 if (strnEQ(posixcc, "graph", 5))
3405 namedclass =
3406 complement ? ANYOF_NGRAPH : ANYOF_GRAPH;
3407 break;
3408 case 'l':
3409 if (strnEQ(posixcc, "lower", 5))
3410 namedclass =
3411 complement ? ANYOF_NLOWER : ANYOF_LOWER;
3412 break;
3413 case 'p':
3414 if (strnEQ(posixcc, "print", 5))
3415 namedclass =
3416 complement ? ANYOF_NPRINT : ANYOF_PRINT;
3417 else if (strnEQ(posixcc, "punct", 5))
3418 namedclass =
3419 complement ? ANYOF_NPUNCT : ANYOF_PUNCT;
3420 break;
3421 case 's':
3422 if (strnEQ(posixcc, "space", 5))
3423 namedclass =
aaa51d5e 3424 complement ? ANYOF_NPSXSPC : ANYOF_PSXSPC;
cc4319de 3425 break;
b8c5462f
JH
3426 case 'u':
3427 if (strnEQ(posixcc, "upper", 5))
3428 namedclass =
3429 complement ? ANYOF_NUPPER : ANYOF_UPPER;
3430 break;
3431 case 'w': /* this is not POSIX, this is the Perl \w */
3432 if (strnEQ(posixcc, "word", 4)) {
3433 namedclass =
3434 complement ? ANYOF_NALNUM : ANYOF_ALNUM;
3435 skip = 4;
3436 }
3437 break;
3438 case 'x':
3439 if (strnEQ(posixcc, "xdigit", 6)) {
3440 namedclass =
3441 complement ? ANYOF_NXDIGIT : ANYOF_XDIGIT;
3442 skip = 6;
3443 }
3444 break;
3445 }
ac561586
JH
3446 if (namedclass == OOB_NAMEDCLASS ||
3447 posixcc[skip] != ':' ||
3448 posixcc[skip+1] != ']')
b45f050a
JF
3449 {
3450 Simple_vFAIL3("POSIX class [:%.*s:] unknown",
3451 t - s - 1, s + 1);
3452 }
3453 } else if (!SIZE_ONLY) {
b8c5462f 3454 /* [[=foo=]] and [[.foo.]] are still future. */
b45f050a 3455
830247a4 3456 /* adjust RExC_parse so the warning shows after
b45f050a 3457 the class closes */
9a86a77b 3458 while (UCHARAT(RExC_parse) && UCHARAT(RExC_parse) != ']')
830247a4 3459 RExC_parse++;
b45f050a
JF
3460 Simple_vFAIL3("POSIX syntax [%c %c] is reserved for future extensions", c, c);
3461 }
b8c5462f
JH
3462 } else {
3463 /* Maternal grandfather:
3464 * "[:" ending in ":" but not in ":]" */
830247a4 3465 RExC_parse = s;
767d463e 3466 }
620e46c5
JH
3467 }
3468 }
3469
b8c5462f
JH
3470 return namedclass;
3471}
3472
3473STATIC void
830247a4 3474S_checkposixcc(pTHX_ RExC_state_t *pRExC_state)
b8c5462f 3475{
b938889d 3476 if (!SIZE_ONLY && POSIXCC(UCHARAT(RExC_parse))) {
830247a4 3477 char *s = RExC_parse;
93733859 3478 char c = *s++;
b8c5462f
JH
3479
3480 while(*s && isALNUM(*s))
3481 s++;
3482 if (*s && c == *s && s[1] == ']') {
cd84f5b2
RGS
3483 if (ckWARN(WARN_REGEXP))
3484 vWARN3(s+2,
3485 "POSIX syntax [%c %c] belongs inside character classes",
3486 c, c);
b45f050a
JF
3487
3488 /* [[=foo=]] and [[.foo.]] are still future. */
9a86a77b 3489 if (POSIXCC_NOTYET(c)) {
830247a4 3490 /* adjust RExC_parse so the error shows after
b45f050a 3491 the class closes */
9a86a77b 3492 while (UCHARAT(RExC_parse) && UCHARAT(RExC_parse++) != ']')
b45f050a
JF
3493 ;
3494 Simple_vFAIL3("POSIX syntax [%c %c] is reserved for future extensions", c, c);
3495 }
b8c5462f
JH
3496 }
3497 }
620e46c5
JH
3498}
3499
76e3520e 3500STATIC regnode *
830247a4 3501S_regclass(pTHX_ RExC_state_t *pRExC_state)
a687059c 3502{
ffc61ed2 3503 register UV value;
9a86a77b 3504 register UV nextvalue;
3568d838 3505 register IV prevvalue = OOB_UNICODE;
ffc61ed2 3506 register IV range = 0;
c277df42 3507 register regnode *ret;
ba210ebe 3508 STRLEN numlen;
ffc61ed2 3509 IV namedclass;
9c5ffd7c 3510 char *rangebegin = 0;
936ed897 3511 bool need_class = 0;
9c5ffd7c 3512 SV *listsv = Nullsv;
ffc61ed2
JH
3513 register char *e;
3514 UV n;
9e55ce06
JH
3515 bool optimize_invert = TRUE;
3516 AV* unicode_alternate = 0;
1b2d223b
JH
3517#ifdef EBCDIC
3518 UV literal_endpoint = 0;
3519#endif
ffc61ed2
JH
3520
3521 ret = reganode(pRExC_state, ANYOF, 0);
3522
3523 if (!SIZE_ONLY)
3524 ANYOF_FLAGS(ret) = 0;
3525
9a86a77b 3526 if (UCHARAT(RExC_parse) == '^') { /* Complement of range. */
ffc61ed2
JH
3527 RExC_naughty++;
3528 RExC_parse++;
3529 if (!SIZE_ONLY)
3530 ANYOF_FLAGS(ret) |= ANYOF_INVERT;
3531 }
a0d0e21e 3532
936ed897 3533 if (SIZE_ONLY)
830247a4 3534 RExC_size += ANYOF_SKIP;
936ed897 3535 else {
830247a4 3536 RExC_emit += ANYOF_SKIP;
936ed897
IZ
3537 if (FOLD)
3538 ANYOF_FLAGS(ret) |= ANYOF_FOLD;
3539 if (LOC)
3540 ANYOF_FLAGS(ret) |= ANYOF_LOCALE;
ffc61ed2
JH
3541 ANYOF_BITMAP_ZERO(ret);
3542 listsv = newSVpvn("# comment\n", 10);
a0d0e21e 3543 }
b8c5462f 3544
9a86a77b
JH
3545 nextvalue = RExC_parse < RExC_end ? UCHARAT(RExC_parse) : 0;
3546
b938889d 3547 if (!SIZE_ONLY && POSIXCC(nextvalue))
830247a4 3548 checkposixcc(pRExC_state);
b8c5462f 3549
f064b6ad
HS
3550 /* allow 1st char to be ] (allowing it to be - is dealt with later) */
3551 if (UCHARAT(RExC_parse) == ']')
3552 goto charclassloop;
ffc61ed2 3553
9a86a77b 3554 while (RExC_parse < RExC_end && UCHARAT(RExC_parse) != ']') {
ffc61ed2
JH
3555
3556 charclassloop:
3557
3558 namedclass = OOB_NAMEDCLASS; /* initialize as illegal */
3559
73b437c8 3560 if (!range)
830247a4 3561 rangebegin = RExC_parse;
ffc61ed2 3562 if (UTF) {
5e12f4fb 3563 value = utf8n_to_uvchr((U8*)RExC_parse,
3568d838
JH
3564 RExC_end - RExC_parse,
3565 &numlen, 0);
ffc61ed2
JH
3566 RExC_parse += numlen;
3567 }
3568 else
3569 value = UCHARAT(RExC_parse++);
9a86a77b
JH
3570 nextvalue = RExC_parse < RExC_end ? UCHARAT(RExC_parse) : 0;
3571 if (value == '[' && POSIXCC(nextvalue))
830247a4 3572 namedclass = regpposixcc(pRExC_state, value);
620e46c5 3573 else if (value == '\\') {
ffc61ed2 3574 if (UTF) {
5e12f4fb 3575 value = utf8n_to_uvchr((U8*)RExC_parse,
ffc61ed2
JH
3576 RExC_end - RExC_parse,
3577 &numlen, 0);
3578 RExC_parse += numlen;
3579 }
3580 else
3581 value = UCHARAT(RExC_parse++);
470c3474 3582 /* Some compilers cannot handle switching on 64-bit integer
ffc61ed2 3583 * values, therefore value cannot be an UV. Yes, this will
e2962f66
JH
3584 * be a problem later if we want switch on Unicode.
3585 * A similar issue a little bit later when switching on
3586 * namedclass. --jhi */
ffc61ed2 3587 switch ((I32)value) {
b8c5462f
JH
3588 case 'w': namedclass = ANYOF_ALNUM; break;
3589 case 'W': namedclass = ANYOF_NALNUM; break;
3590 case 's': namedclass = ANYOF_SPACE; break;
3591 case 'S': namedclass = ANYOF_NSPACE; break;
3592 case 'd': namedclass = ANYOF_DIGIT; break;
3593 case 'D': namedclass = ANYOF_NDIGIT; break;
ffc61ed2
JH
3594 case 'p':
3595 case 'P':
af6f566e 3596 if (RExC_parse >= RExC_end)
2a4859cd 3597 vFAIL2("Empty \\%c{}", (U8)value);
ffc61ed2 3598 if (*RExC_parse == '{') {
0da60cf5 3599 U8 c = (U8)value;
ffc61ed2
JH
3600 e = strchr(RExC_parse++, '}');
3601 if (!e)
0da60cf5 3602 vFAIL2("Missing right brace on \\%c{}", c);
ab13f0c7
JH
3603 while (isSPACE(UCHARAT(RExC_parse)))
3604 RExC_parse++;
3605 if (e == RExC_parse)
0da60cf5 3606 vFAIL2("Empty \\%c{}", c);
ffc61ed2 3607 n = e - RExC_parse;
ab13f0c7
JH
3608 while (isSPACE(UCHARAT(RExC_parse + n - 1)))
3609 n--;
ffc61ed2
JH
3610 }
3611 else {
3612 e = RExC_parse;
3613 n = 1;
3614 }
3615 if (!SIZE_ONLY) {
ab13f0c7
JH
3616 if (UCHARAT(RExC_parse) == '^') {
3617 RExC_parse++;
3618 n--;
3619 value = value == 'p' ? 'P' : 'p'; /* toggle */
3620 while (isSPACE(UCHARAT(RExC_parse))) {
3621 RExC_parse++;
3622 n--;
3623 }
3624 }
ffc61ed2 3625 if (value == 'p')
ab13f0c7
JH
3626 Perl_sv_catpvf(aTHX_ listsv,
3627 "+utf8::%.*s\n", (int)n, RExC_parse);
ffc61ed2 3628 else
ab13f0c7
JH
3629 Perl_sv_catpvf(aTHX_ listsv,
3630 "!utf8::%.*s\n", (int)n, RExC_parse);
ffc61ed2
JH
3631 }
3632 RExC_parse = e + 1;
3633 ANYOF_FLAGS(ret) |= ANYOF_UNICODE;
3634 continue;
b8c5462f
JH
3635 case 'n': value = '\n'; break;
3636 case 'r': value = '\r'; break;
3637 case 't': value = '\t'; break;
3638 case 'f': value = '\f'; break;
3639 case 'b': value = '\b'; break;
c7f1f016
NIS
3640 case 'e': value = ASCII_TO_NATIVE('\033');break;
3641 case 'a': value = ASCII_TO_NATIVE('\007');break;
b8c5462f 3642 case 'x':
ffc61ed2 3643 if (*RExC_parse == '{') {
a4c04bdc
NC
3644 I32 flags = PERL_SCAN_ALLOW_UNDERSCORES
3645 | PERL_SCAN_DISALLOW_PREFIX;
ffc61ed2 3646 e = strchr(RExC_parse++, '}');
b81d288d 3647 if (!e)
ffc61ed2 3648 vFAIL("Missing right brace on \\x{}");
53305cf1
NC
3649
3650 numlen = e - RExC_parse;
3651 value = grok_hex(RExC_parse, &numlen, &flags, NULL);
ffc61ed2
JH
3652 RExC_parse = e + 1;
3653 }
3654 else {
a4c04bdc 3655 I32 flags = PERL_SCAN_DISALLOW_PREFIX;
53305cf1
NC
3656 numlen = 2;
3657 value = grok_hex(RExC_parse, &numlen, &flags, NULL);
ffc61ed2
JH
3658 RExC_parse += numlen;
3659 }
b8c5462f
JH
3660 break;
3661 case 'c':
830247a4 3662 value = UCHARAT(RExC_parse++);
b8c5462f
JH
3663 value = toCTRL(value);
3664 break;
3665 case '0': case '1': case '2': case '3': case '4':
3666 case '5': case '6': case '7': case '8': case '9':
53305cf1
NC
3667 {
3668 I32 flags = 0;
3669 numlen = 3;
3670 value = grok_oct(--RExC_parse, &numlen, &flags, NULL);
830247a4 3671 RExC_parse += numlen;
b8c5462f 3672 break;
53305cf1 3673 }
1028017a 3674 default:
e476b1b5 3675 if (!SIZE_ONLY && ckWARN(WARN_REGEXP) && isALPHA(value))
ffc61ed2
JH
3676 vWARN2(RExC_parse,
3677 "Unrecognized escape \\%c in character class passed through",
3678 (int)value);
1028017a 3679 break;
b8c5462f 3680 }
ffc61ed2 3681 } /* end of \blah */
1b2d223b
JH
3682#ifdef EBCDIC
3683 else
3684 literal_endpoint++;
3685#endif
ffc61ed2
JH
3686
3687 if (namedclass > OOB_NAMEDCLASS) { /* this is a named class \blah */
3688
3689 if (!SIZE_ONLY && !need_class)
936ed897 3690 ANYOF_CLASS_ZERO(ret);
ffc61ed2 3691
936ed897 3692 need_class = 1;
ffc61ed2
JH
3693
3694 /* a bad range like a-\d, a-[:digit:] ? */
3695 if (range) {
73b437c8 3696 if (!SIZE_ONLY) {
e476b1b5 3697 if (ckWARN(WARN_REGEXP))
830247a4 3698 vWARN4(RExC_parse,
b45f050a 3699 "False [] range \"%*.*s\"",
830247a4
IZ
3700 RExC_parse - rangebegin,
3701 RExC_parse - rangebegin,
b45f050a 3702 rangebegin);
3568d838
JH
3703 if (prevvalue < 256) {
3704 ANYOF_BITMAP_SET(ret, prevvalue);
ffc61ed2
JH
3705 ANYOF_BITMAP_SET(ret, '-');
3706 }
3707 else {
3708 ANYOF_FLAGS(ret) |= ANYOF_UNICODE;
3709 Perl_sv_catpvf(aTHX_ listsv,
3568d838 3710 "%04"UVxf"\n%04"UVxf"\n", (UV)prevvalue, (UV) '-');
ffc61ed2 3711 }
b8c5462f 3712 }
ffc61ed2
JH
3713
3714 range = 0; /* this was not a true range */
73b437c8 3715 }
ffc61ed2 3716
73b437c8 3717 if (!SIZE_ONLY) {
3568d838
JH
3718 if (namedclass > OOB_NAMEDCLASS)
3719 optimize_invert = FALSE;
e2962f66
JH
3720 /* Possible truncation here but in some 64-bit environments
3721 * the compiler gets heartburn about switch on 64-bit values.
3722 * A similar issue a little earlier when switching on value.
98f323fa 3723 * --jhi */
e2962f66 3724 switch ((I32)namedclass) {
73b437c8
JH
3725 case ANYOF_ALNUM:
3726 if (LOC)
936ed897 3727 ANYOF_CLASS_SET(ret, ANYOF_ALNUM);
73b437c8
JH
3728 else {
3729 for (value = 0; value < 256; value++)
3730 if (isALNUM(value))
936ed897 3731 ANYOF_BITMAP_SET(ret, value);
73b437c8 3732 }
ffc61ed2 3733 Perl_sv_catpvf(aTHX_ listsv, "+utf8::IsWord\n");
73b437c8
JH
3734 break;
3735 case ANYOF_NALNUM:
3736 if (LOC)
936ed897 3737 ANYOF_CLASS_SET(ret, ANYOF_NALNUM);
73b437c8
JH
3738 else {
3739 for (value = 0; value < 256; value++)
3740 if (!isALNUM(value))
936ed897 3741 ANYOF_BITMAP_SET(ret, value);
73b437c8 3742 }
ffc61ed2 3743 Perl_sv_catpvf(aTHX_ listsv, "!utf8::IsWord\n");
73b437c8 3744 break;
ffc61ed2 3745 case ANYOF_ALNUMC:
73b437c8 3746 if (LOC)
ffc61ed2 3747 ANYOF_CLASS_SET(ret, ANYOF_ALNUMC);
73b437c8
JH
3748 else {
3749 for (value = 0; value < 256; value++)
ffc61ed2 3750 if (isALNUMC(value))
936ed897 3751 ANYOF_BITMAP_SET(ret, value);
73b437c8 3752 }
ffc61ed2 3753 Perl_sv_catpvf(aTHX_ listsv, "+utf8::IsAlnum\n");
73b437c8
JH
3754 break;
3755 case ANYOF_NALNUMC:
3756 if (LOC)
936ed897 3757 ANYOF_CLASS_SET(ret, ANYOF_NALNUMC);
73b437c8
JH
3758 else {
3759 for (value = 0; value < 256; value++)
3760 if (!isALNUMC(value))
936ed897 3761 ANYOF_BITMAP_SET(ret, value);
73b437c8 3762 }
ffc61ed2 3763 Perl_sv_catpvf(aTHX_ listsv, "!utf8::IsAlnum\n");
73b437c8
JH
3764 break;
3765 case ANYOF_ALPHA:
3766 if (LOC)
936ed897 3767 ANYOF_CLASS_SET(ret, ANYOF_ALPHA);
73b437c8
JH
3768 else {
3769 for (value = 0; value < 256; value++)
3770 if (isALPHA(value))
936ed897 3771 ANYOF_BITMAP_SET(ret, value);
73b437c8 3772 }
ffc61ed2 3773 Perl_sv_catpvf(aTHX_ listsv, "+utf8::IsAlpha\n");
73b437c8
JH
3774 break;
3775 case ANYOF_NALPHA:
3776 if (LOC)
936ed897 3777 ANYOF_CLASS_SET(ret, ANYOF_NALPHA);
73b437c8
JH
3778 else {
3779 for (value = 0; value < 256; value++)
3780 if (!isALPHA(value))
936ed897 3781 ANYOF_BITMAP_SET(ret, value);
73b437c8 3782 }
ffc61ed2 3783 Perl_sv_catpvf(aTHX_ listsv, "!utf8::IsAlpha\n");
73b437c8
JH
3784 break;
3785 case ANYOF_ASCII:
3786 if (LOC)
936ed897 3787 ANYOF_CLASS_SET(ret, ANYOF_ASCII);
73b437c8 3788 else {
c7f1f016 3789#ifndef EBCDIC
1ba5c669
JH
3790 for (value = 0; value < 128; value++)
3791 ANYOF_BITMAP_SET(ret, value);
3792#else /* EBCDIC */
ffbc6a93 3793 for (value = 0; value < 256; value++) {
3a3c4447
JH
3794 if (isASCII(value))
3795 ANYOF_BITMAP_SET(ret, value);
ffbc6a93 3796 }
1ba5c669 3797#endif /* EBCDIC */
73b437c8 3798 }
ffc61ed2 3799 Perl_sv_catpvf(aTHX_ listsv, "+utf8::IsASCII\n");
73b437c8
JH
3800 break;
3801 case ANYOF_NASCII:
3802 if (LOC)
936ed897 3803 ANYOF_CLASS_SET(ret, ANYOF_NASCII);
73b437c8 3804 else {
c7f1f016 3805#ifndef EBCDIC
1ba5c669
JH
3806 for (value = 128; value < 256; value++)
3807 ANYOF_BITMAP_SET(ret, value);
3808#else /* EBCDIC */
ffbc6a93 3809 for (value = 0; value < 256; value++) {
3a3c4447
JH
3810 if (!isASCII(value))
3811 ANYOF_BITMAP_SET(ret, value);
ffbc6a93 3812 }
1ba5c669 3813#endif /* EBCDIC */
73b437c8 3814 }
ffc61ed2 3815 Perl_sv_catpvf(aTHX_ listsv, "!utf8::IsASCII\n");
73b437c8 3816 break;
aaa51d5e
JF
3817 case ANYOF_BLANK:
3818 if (LOC)
3819 ANYOF_CLASS_SET(ret, ANYOF_BLANK);
3820 else {
3821 for (value = 0; value < 256; value++)
3822 if (isBLANK(value))
3823 ANYOF_BITMAP_SET(ret, value);
3824 }
ffc61ed2 3825 Perl_sv_catpvf(aTHX_ listsv, "+utf8::IsBlank\n");
aaa51d5e
JF
3826 break;
3827 case ANYOF_NBLANK:
3828 if (LOC)
3829 ANYOF_CLASS_SET(ret, ANYOF_NBLANK);
3830 else {
3831 for (value = 0; value < 256; value++)
3832 if (!isBLANK(value))
3833 ANYOF_BITMAP_SET(ret, value);
3834 }
ffc61ed2 3835 Perl_sv_catpvf(aTHX_ listsv, "!utf8::IsBlank\n");
aaa51d5e 3836 break;
73b437c8
JH
3837 case ANYOF_CNTRL:
3838 if (LOC)
936ed897 3839 ANYOF_CLASS_SET(ret, ANYOF_CNTRL);
73b437c8
JH
3840 else {
3841 for (value = 0; value < 256; value++)
3842 if (isCNTRL(value))
936ed897 3843 ANYOF_BITMAP_SET(ret, value);
73b437c8 3844 }
ffc61ed2 3845 Perl_sv_catpvf(aTHX_ listsv, "+utf8::IsCntrl\n");
73b437c8
JH
3846 break;
3847 case ANYOF_NCNTRL:
3848 if (LOC)
936ed897 3849 ANYOF_CLASS_SET(ret, ANYOF_NCNTRL);
73b437c8
JH
3850 else {
3851 for (value = 0; value < 256; value++)
3852 if (!isCNTRL(value))
936ed897 3853 ANYOF_BITMAP_SET(ret, value);
73b437c8 3854 }
ffc61ed2
JH
3855 Perl_sv_catpvf(aTHX_ listsv, "!utf8::IsCntrl\n");
3856 break;
3857 case ANYOF_DIGIT:
3858 if (LOC)
3859 ANYOF_CLASS_SET(ret, ANYOF_DIGIT);
3860 else {
3861 /* consecutive digits assumed */
3862 for (value = '0'; value <= '9'; value++)
3863 ANYOF_BITMAP_SET(ret, value);
3864 }
3865 Perl_sv_catpvf(aTHX_ listsv, "+utf8::IsDigit\n");
3866 break;
3867 case ANYOF_NDIGIT:
3868 if (LOC)
3869 ANYOF_CLASS_SET(ret, ANYOF_NDIGIT);
3870 else {
3871 /* consecutive digits assumed */
3872 for (value = 0; value < '0'; value++)
3873 ANYOF_BITMAP_SET(ret, value);
3874 for (value = '9' + 1; value < 256; value++)
3875 ANYOF_BITMAP_SET(ret, value);
3876 }
3877 Perl_sv_catpvf(aTHX_ listsv, "!utf8::IsDigit\n");
73b437c8
JH
3878 break;
3879 case ANYOF_GRAPH:
3880 if (LOC)
936ed897 3881 ANYOF_CLASS_SET(ret, ANYOF_GRAPH);
73b437c8
JH
3882 else {
3883 for (value = 0; value < 256; value++)
3884 if (isGRAPH(value))
936ed897 3885 ANYOF_BITMAP_SET(ret, value);
73b437c8 3886 }
ffc61ed2 3887 Perl_sv_catpvf(aTHX_ listsv, "+utf8::IsGraph\n");
73b437c8
JH
3888 break;
3889 case ANYOF_NGRAPH:
3890 if (LOC)
936ed897 3891 ANYOF_CLASS_SET(ret, ANYOF_NGRAPH);
73b437c8
JH
3892 else {
3893 for (value = 0; value < 256; value++)
3894 if (!isGRAPH(value))
936ed897 3895 ANYOF_BITMAP_SET(ret, value);
73b437c8 3896 }
ffc61ed2 3897 Perl_sv_catpvf(aTHX_ listsv, "!utf8::IsGraph\n");
73b437c8
JH
3898 break;
3899 case ANYOF_LOWER:
3900 if (LOC)
936ed897 3901 ANYOF_CLASS_SET(ret, ANYOF_LOWER);
73b437c8
JH
3902 else {
3903 for (value = 0; value < 256; value++)
3904 if (isLOWER(value))
936ed897 3905 ANYOF_BITMAP_SET(ret, value);
73b437c8 3906 }
ffc61ed2 3907 Perl_sv_catpvf(aTHX_ listsv, "+utf8::IsLower\n");
73b437c8
JH
3908 break;
3909 case ANYOF_NLOWER:
3910 if (LOC)
936ed897 3911 ANYOF_CLASS_SET(ret, ANYOF_NLOWER);
73b437c8
JH
3912 else {
3913 for (value = 0; value < 256; value++)
3914 if (!isLOWER(value))
936ed897 3915 ANYOF_BITMAP_SET(ret, value);
73b437c8 3916 }
ffc61ed2 3917 Perl_sv_catpvf(aTHX_ listsv, "!utf8::IsLower\n");
73b437c8
JH
3918 break;
3919 case ANYOF_PRINT:
3920 if (LOC)
936ed897 3921 ANYOF_CLASS_SET(ret, ANYOF_PRINT);
73b437c8
JH
3922 else {
3923 for (value = 0; value < 256; value++)
3924 if (isPRINT(value))
936ed897 3925 ANYOF_BITMAP_SET(ret, value);
73b437c8 3926 }
ffc61ed2 3927 Perl_sv_catpvf(aTHX_ listsv, "+utf8::IsPrint\n");
73b437c8
JH
3928 break;
3929 case ANYOF_NPRINT:
3930 if (LOC)
936ed897 3931 ANYOF_CLASS_SET(ret, ANYOF_NPRINT);
73b437c8
JH
3932 else {
3933 for (value = 0; value < 256; value++)
3934 if (!isPRINT(value))
936ed897 3935 ANYOF_BITMAP_SET(ret, value);
73b437c8 3936 }
ffc61ed2 3937 Perl_sv_catpvf(aTHX_ listsv, "!utf8::IsPrint\n");
73b437c8 3938 break;
aaa51d5e
JF
3939 case ANYOF_PSXSPC:
3940 if (LOC)
3941 ANYOF_CLASS_SET(ret, ANYOF_PSXSPC);
3942 else {
3943 for (value = 0; value < 256; value++)
3944 if (isPSXSPC(value))
3945 ANYOF_BITMAP_SET(ret, value);
3946 }
ffc61ed2 3947 Perl_sv_catpvf(aTHX_ listsv, "+utf8::IsSpace\n");
aaa51d5e
JF
3948 break;
3949 case ANYOF_NPSXSPC:
3950 if (LOC)
3951 ANYOF_CLASS_SET(ret, ANYOF_NPSXSPC);
3952 else {
3953 for (value = 0; value < 256; value++)
3954 if (!isPSXSPC(value))
3955 ANYOF_BITMAP_SET(ret, value);
3956 }
ffc61ed2 3957 Perl_sv_catpvf(aTHX_ listsv, "!utf8::IsSpace\n");
aaa51d5e 3958 break;
73b437c8
JH
3959 case ANYOF_PUNCT:
3960 if (LOC)
936ed897 3961 ANYOF_CLASS_SET(ret, ANYOF_PUNCT);
73b437c8
JH
3962 else {
3963 for (value = 0; value < 256; value++)
3964 if (isPUNCT(value))
936ed897 3965 ANYOF_BITMAP_SET(ret, value);
73b437c8 3966 }
ffc61ed2 3967 Perl_sv_catpvf(aTHX_ listsv, "+utf8::IsPunct\n");
73b437c8
JH
3968 break;
3969 case ANYOF_NPUNCT:
3970 if (LOC)
936ed897 3971 ANYOF_CLASS_SET(ret, ANYOF_NPUNCT);
73b437c8
JH
3972 else {
3973 for (value = 0; value < 256; value++)
3974 if (!isPUNCT(value))
936ed897 3975 ANYOF_BITMAP_SET(ret, value);
73b437c8 3976 }
ffc61ed2
JH
3977 Perl_sv_catpvf(aTHX_ listsv, "!utf8::IsPunct\n");
3978 break;
3979 case ANYOF_SPACE:
3980 if (LOC)
3981 ANYOF_CLASS_SET(ret, ANYOF_SPACE);
3982 else {
3983 for (value = 0; value < 256; value++)
3984 if (isSPACE(value))
3985 ANYOF_BITMAP_SET(ret, value);
3986 }
3987 Perl_sv_catpvf(aTHX_ listsv, "+utf8::IsSpacePerl\n");
3988 break;
3989 case ANYOF_NSPACE:
3990 if (LOC)
3991 ANYOF_CLASS_SET(ret, ANYOF_NSPACE);
3992 else {
3993 for (value = 0; value < 256; value++)
3994 if (!isSPACE(value))
3995 ANYOF_BITMAP_SET(ret, value);
3996 }
3997 Perl_sv_catpvf(aTHX_ listsv, "!utf8::IsSpacePerl\n");
73b437c8
JH
3998 break;
3999 case ANYOF_UPPER:
4000 if (LOC)
936ed897 4001 ANYOF_CLASS_SET(ret, ANYOF_UPPER);
73b437c8
JH
4002 else {
4003 for (value = 0; value < 256; value++)
4004 if (isUPPER(value))
936ed897 4005 ANYOF_BITMAP_SET(ret, value);
73b437c8 4006 }
ffc61ed2 4007 Perl_sv_catpvf(aTHX_ listsv, "+utf8::IsUpper\n");
73b437c8
JH
4008 break;
4009 case ANYOF_NUPPER:
4010 if (LOC)
936ed897 4011 ANYOF_CLASS_SET(ret, ANYOF_NUPPER);
73b437c8
JH
4012 else {
4013 for (value = 0; value < 256; value++)
4014 if (!isUPPER(value))
936ed897 4015 ANYOF_BITMAP_SET(ret, value);
73b437c8 4016 }
ffc61ed2 4017 Perl_sv_catpvf(aTHX_ listsv, "!utf8::IsUpper\n");
73b437c8
JH
4018 break;
4019 case ANYOF_XDIGIT:
4020 if (LOC)
936ed897 4021 ANYOF_CLASS_SET(ret, ANYOF_XDIGIT);
73b437c8
JH
4022 else {
4023 for (value = 0; value < 256; value++)
4024 if (isXDIGIT(value))
936ed897 4025 ANYOF_BITMAP_SET(ret, value);
73b437c8 4026 }
ffc61ed2 4027 Perl_sv_catpvf(aTHX_ listsv, "+utf8::IsXDigit\n");
73b437c8
JH
4028 break;
4029 case ANYOF_NXDIGIT:
4030 if (LOC)
936ed897 4031 ANYOF_CLASS_SET(ret, ANYOF_NXDIGIT);
73b437c8
JH
4032 else {
4033 for (value = 0; value < 256; value++)
4034 if (!isXDIGIT(value))
936ed897 4035 ANYOF_BITMAP_SET(ret, value);
73b437c8 4036 }
ffc61ed2 4037 Perl_sv_catpvf(aTHX_ listsv, "!utf8::IsXDigit\n");
73b437c8
JH
4038 break;
4039 default:
b45f050a 4040 vFAIL("Invalid [::] class");
73b437c8 4041 break;
b8c5462f 4042 }
b8c5462f 4043 if (LOC)
936ed897 4044 ANYOF_FLAGS(ret) |= ANYOF_CLASS;
73b437c8 4045 continue;
a0d0e21e 4046 }
ffc61ed2
JH
4047 } /* end of namedclass \blah */
4048
a0d0e21e 4049 if (range) {
eb160463 4050 if (prevvalue > (IV)value) /* b-a */ {
b45f050a 4051 Simple_vFAIL4("Invalid [] range \"%*.*s\"",
830247a4
IZ
4052 RExC_parse - rangebegin,
4053 RExC_parse - rangebegin,
b45f050a 4054 rangebegin);
3568d838 4055 range = 0; /* not a valid range */
73b437c8 4056 }
a0d0e21e
LW
4057 }
4058 else {
3568d838 4059 prevvalue = value; /* save the beginning of the range */
830247a4
IZ
4060 if (*RExC_parse == '-' && RExC_parse+1 < RExC_end &&
4061 RExC_parse[1] != ']') {
4062 RExC_parse++;
ffc61ed2
JH
4063
4064 /* a bad range like \w-, [:word:]- ? */
4065 if (namedclass > OOB_NAMEDCLASS) {
e476b1b5 4066 if (ckWARN(WARN_REGEXP))
830247a4 4067 vWARN4(RExC_parse,
b45f050a 4068 "False [] range \"%*.*s\"",
830247a4
IZ
4069 RExC_parse - rangebegin,
4070 RExC_parse - rangebegin,
b45f050a 4071 rangebegin);
73b437c8 4072 if (!SIZE_ONLY)
936ed897 4073 ANYOF_BITMAP_SET(ret, '-');
73b437c8 4074 } else
ffc61ed2
JH
4075 range = 1; /* yeah, it's a range! */
4076 continue; /* but do it the next time */
a0d0e21e 4077 }
a687059c 4078 }
ffc61ed2 4079
93733859 4080 /* now is the next time */
ae5c130c 4081 if (!SIZE_ONLY) {
3568d838
JH
4082 IV i;
4083
4084 if (prevvalue < 256) {
4085 IV ceilvalue = value < 256 ? value : 255;
4086
4087#ifdef EBCDIC
1b2d223b
JH
4088 /* In EBCDIC [\x89-\x91] should include
4089 * the \x8e but [i-j] should not. */
4090 if (literal_endpoint == 2 &&
4091 ((isLOWER(prevvalue) && isLOWER(ceilvalue)) ||
4092 (isUPPER(prevvalue) && isUPPER(ceilvalue))))
ffc61ed2 4093 {
3568d838
JH
4094 if (isLOWER(prevvalue)) {
4095 for (i = prevvalue; i <= ceilvalue; i++)
ffc61ed2
JH
4096 if (isLOWER(i))
4097 ANYOF_BITMAP_SET(ret, i);
4098 } else {
3568d838 4099 for (i = prevvalue; i <= ceilvalue; i++)
ffc61ed2
JH
4100 if (isUPPER(i))
4101 ANYOF_BITMAP_SET(ret, i);
4102 }
8ada0baa 4103 }
ffc61ed2 4104 else
8ada0baa 4105#endif
a5961de5
JH
4106 for (i = prevvalue; i <= ceilvalue; i++)
4107 ANYOF_BITMAP_SET(ret, i);
3568d838 4108 }
a5961de5 4109 if (value > 255 || UTF) {
b08decb7
JH
4110 UV prevnatvalue = NATIVE_TO_UNI(prevvalue);
4111 UV natvalue = NATIVE_TO_UNI(value);
4112
ffc61ed2 4113 ANYOF_FLAGS(ret) |= ANYOF_UNICODE;
b08decb7 4114 if (prevnatvalue < natvalue) { /* what about > ? */
ffc61ed2 4115 Perl_sv_catpvf(aTHX_ listsv, "%04"UVxf"\t%04"UVxf"\n",
b08decb7
JH
4116 prevnatvalue, natvalue);
4117 }
4118 else if (prevnatvalue == natvalue) {
4119 Perl_sv_catpvf(aTHX_ listsv, "%04"UVxf"\n", natvalue);
09091399 4120 if (FOLD) {
254ba52a
JH
4121 U8 foldbuf[UTF8_MAXLEN_FOLD+1];
4122 STRLEN foldlen;
2f3bf011 4123 UV f = to_uni_fold(natvalue, foldbuf, &foldlen);
254ba52a 4124
c840d2a2
JH
4125 /* If folding and foldable and a single
4126 * character, insert also the folded version
4127 * to the charclass. */
9e55ce06 4128 if (f != value) {
eb160463 4129 if (foldlen == (STRLEN)UNISKIP(f))
9e55ce06
JH
4130 Perl_sv_catpvf(aTHX_ listsv,
4131 "%04"UVxf"\n", f);
4132 else {
4133 /* Any multicharacter foldings
4134 * require the following transform:
4135 * [ABCDEF] -> (?:[ABCabcDEFd]|pq|rst)
4136 * where E folds into "pq" and F folds
4137 * into "rst", all other characters
4138 * fold to single characters. We save
4139 * away these multicharacter foldings,
4140 * to be later saved as part of the
4141 * additional "s" data. */
4142 SV *sv;
4143
4144 if (!unicode_alternate)
4145 unicode_alternate = newAV();
4146 sv = newSVpvn((char*)foldbuf, foldlen);
4147 SvUTF8_on(sv);
4148 av_push(unicode_alternate, sv);
4149 }
4150 }
254ba52a 4151
60a8b682
JH
4152 /* If folding and the value is one of the Greek
4153 * sigmas insert a few more sigmas to make the
4154 * folding rules of the sigmas to work right.
4155 * Note that not all the possible combinations
4156 * are handled here: some of them are handled
9e55ce06
JH
4157 * by the standard folding rules, and some of
4158 * them (literal or EXACTF cases) are handled
4159 * during runtime in regexec.c:S_find_byclass(). */
09091399
JH
4160 if (value == UNICODE_GREEK_SMALL_LETTER_FINAL_SIGMA) {
4161 Perl_sv_catpvf(aTHX_ listsv, "%04"UVxf"\n",
eb050b28 4162 (UV)UNICODE_GREEK_CAPITAL_LETTER_SIGMA);
09091399 4163 Perl_sv_catpvf(aTHX_ listsv, "%04"UVxf"\n",
eb050b28 4164 (UV)UNICODE_GREEK_SMALL_LETTER_SIGMA);
09091399
JH
4165 }
4166 else if (value == UNICODE_GREEK_CAPITAL_LETTER_SIGMA)
4167 Perl_sv_catpvf(aTHX_ listsv, "%04"UVxf"\n",
eb050b28 4168 (UV)UNICODE_GREEK_SMALL_LETTER_SIGMA);
09091399
JH
4169 }
4170 }
ffc61ed2 4171 }
1b2d223b
JH
4172#ifdef EBCDIC
4173 literal_endpoint = 0;
4174#endif
8ada0baa 4175 }
ffc61ed2
JH
4176
4177 range = 0; /* this range (if it was one) is done now */
a0d0e21e 4178 }
ffc61ed2 4179
936ed897 4180 if (need_class) {
4f66b38d 4181 ANYOF_FLAGS(ret) |= ANYOF_LARGE;
936ed897 4182 if (SIZE_ONLY)
830247a4 4183 RExC_size += ANYOF_CLASS_ADD_SKIP;
936ed897 4184 else
830247a4 4185 RExC_emit += ANYOF_CLASS_ADD_SKIP;
936ed897 4186 }
ffc61ed2 4187
ae5c130c 4188 /* optimize case-insensitive simple patterns (e.g. /[a-z]/i) */
b8c5462f 4189 if (!SIZE_ONLY &&
ffc61ed2 4190 /* If the only flag is folding (plus possibly inversion). */
516a5887
JH
4191 ((ANYOF_FLAGS(ret) & (ANYOF_FLAGS_ALL ^ ANYOF_INVERT)) == ANYOF_FOLD)
4192 ) {
a0ed51b3 4193 for (value = 0; value < 256; ++value) {
936ed897 4194 if (ANYOF_BITMAP_TEST(ret, value)) {
eb160463 4195 UV fold = PL_fold[value];
ffc61ed2
JH
4196
4197 if (fold != value)
4198 ANYOF_BITMAP_SET(ret, fold);
ae5c130c
GS
4199 }
4200 }
936ed897 4201 ANYOF_FLAGS(ret) &= ~ANYOF_FOLD;
ae5c130c 4202 }
ffc61ed2 4203
ae5c130c 4204 /* optimize inverted simple patterns (e.g. [^a-z]) */
3568d838 4205 if (!SIZE_ONLY && optimize_invert &&
ffc61ed2
JH
4206 /* If the only flag is inversion. */
4207 (ANYOF_FLAGS(ret) & ANYOF_FLAGS_ALL) == ANYOF_INVERT) {
b8c5462f 4208 for (value = 0; value < ANYOF_BITMAP_SIZE; ++value)
936ed897 4209 ANYOF_BITMAP(ret)[value] ^= ANYOF_FLAGS_ALL;
1aa99e6b 4210 ANYOF_FLAGS(ret) = ANYOF_UNICODE_ALL;
ae5c130c 4211 }
a0d0e21e 4212
b81d288d 4213 if (!SIZE_ONLY) {
fde631ed 4214 AV *av = newAV();
ffc61ed2
JH
4215 SV *rv;
4216
9e55ce06 4217 /* The 0th element stores the character class description
6a0407ee 4218 * in its textual form: used later (regexec.c:Perl_regclass_swash())
9e55ce06
JH
4219 * to initialize the appropriate swash (which gets stored in
4220 * the 1st element), and also useful for dumping the regnode.
4221 * The 2nd element stores the multicharacter foldings,
6a0407ee 4222 * used later (regexec.c:S_reginclass()). */
ffc61ed2
JH
4223 av_store(av, 0, listsv);
4224 av_store(av, 1, NULL);
9e55ce06 4225 av_store(av, 2, (SV*)unicode_alternate);
ffc61ed2 4226 rv = newRV_noinc((SV*)av);
19860706 4227 n = add_data(pRExC_state, 1, "s");
830247a4 4228 RExC_rx->data->data[n] = (void*)rv;
ffc61ed2 4229 ARG_SET(ret, n);
a0ed51b3
LW
4230 }
4231
4232 return ret;
4233}
4234
76e3520e 4235STATIC char*
830247a4 4236S_nextchar(pTHX_ RExC_state_t *pRExC_state)
a0d0e21e 4237{
830247a4 4238 char* retval = RExC_parse++;
a0d0e21e 4239
4633a7c4 4240 for (;;) {
830247a4
IZ
4241 if (*RExC_parse == '(' && RExC_parse[1] == '?' &&
4242 RExC_parse[2] == '#') {
4243 while (*RExC_parse && *RExC_parse != ')')
4244 RExC_parse++;
4245 RExC_parse++;
4633a7c4
LW
4246 continue;
4247 }
e2509266 4248 if (RExC_flags & PMf_EXTENDED) {
830247a4
IZ
4249 if (isSPACE(*RExC_parse)) {
4250 RExC_parse++;
748a9306
LW
4251 continue;
4252 }
830247a4
IZ
4253 else if (*RExC_parse == '#') {
4254 while (*RExC_parse && *RExC_parse != '\n')
4255 RExC_parse++;
4256 RExC_parse++;
748a9306
LW
4257 continue;
4258 }
748a9306 4259 }
4633a7c4 4260 return retval;
a0d0e21e 4261 }
a687059c
LW
4262}
4263
4264/*
c277df42 4265- reg_node - emit a node
a0d0e21e 4266*/
76e3520e 4267STATIC regnode * /* Location. */
830247a4 4268S_reg_node(pTHX_ RExC_state_t *pRExC_state, U8 op)
a687059c 4269{
c277df42
IZ
4270 register regnode *ret;
4271 register regnode *ptr;
a687059c 4272
830247a4 4273 ret = RExC_emit;
c277df42 4274 if (SIZE_ONLY) {
830247a4
IZ
4275 SIZE_ALIGN(RExC_size);
4276 RExC_size += 1;
a0d0e21e
LW
4277 return(ret);
4278 }
a687059c 4279
c277df42 4280 NODE_ALIGN_FILL(ret);
a0d0e21e 4281 ptr = ret;
c277df42 4282 FILL_ADVANCE_NODE(ptr, op);
fac92740 4283 if (RExC_offsets) { /* MJD */
ccb2c380 4284 MJD_OFFSET_DEBUG(("%s:%u: (op %s) %s %u <- %u (len %u) (max %u).\n",
fac92740
MJD
4285 "reg_node", __LINE__,
4286 reg_name[op],
4287 RExC_emit - RExC_emit_start > RExC_offsets[0]
4288 ? "Overwriting end of array!\n" : "OK",
4289 RExC_emit - RExC_emit_start,
4290 RExC_parse - RExC_start,
4291 RExC_offsets[0]));
ccb2c380 4292 Set_Node_Offset(RExC_emit, RExC_parse + (op == END));
fac92740
MJD
4293 }
4294
830247a4 4295 RExC_emit = ptr;
a687059c 4296
a0d0e21e 4297 return(ret);
a687059c
LW
4298}
4299
4300/*
a0d0e21e
LW
4301- reganode - emit a node with an argument
4302*/
76e3520e 4303STATIC regnode * /* Location. */
830247a4 4304S_reganode(pTHX_ RExC_state_t *pRExC_state, U8 op, U32 arg)
fe14fcc3 4305{
c277df42
IZ
4306 register regnode *ret;
4307 register regnode *ptr;
fe14fcc3 4308
830247a4 4309 ret = RExC_emit;
c277df42 4310 if (SIZE_ONLY) {
830247a4
IZ
4311 SIZE_ALIGN(RExC_size);
4312 RExC_size += 2;
a0d0e21e
LW
4313 return(ret);
4314 }
fe14fcc3 4315
c277df42 4316 NODE_ALIGN_FILL(ret);
a0d0e21e 4317 ptr = ret;
c277df42 4318 FILL_ADVANCE_NODE_ARG(ptr, op, arg);
fac92740 4319 if (RExC_offsets) { /* MJD */
ccb2c380 4320 MJD_OFFSET_DEBUG(("%s(%d): (op %s) %s %u <- %u (max %u).\n",
fac92740 4321 "reganode",
ccb2c380
MP
4322 __LINE__,
4323 reg_name[op],
fac92740
MJD
4324 RExC_emit - RExC_emit_start > RExC_offsets[0] ?
4325 "Overwriting end of array!\n" : "OK",
4326 RExC_emit - RExC_emit_start,
4327 RExC_parse - RExC_start,
4328 RExC_offsets[0]));
ccb2c380 4329 Set_Cur_Node_Offset;
fac92740
MJD
4330 }
4331
830247a4 4332 RExC_emit = ptr;
fe14fcc3 4333
a0d0e21e 4334 return(ret);
fe14fcc3
LW
4335}
4336
4337/*
cd439c50 4338- reguni - emit (if appropriate) a Unicode character
a0ed51b3
LW
4339*/
4340STATIC void
830247a4 4341S_reguni(pTHX_ RExC_state_t *pRExC_state, UV uv, char* s, STRLEN* lenp)
a0ed51b3 4342{
5e12f4fb 4343 *lenp = SIZE_ONLY ? UNISKIP(uv) : (uvchr_to_utf8((U8*)s, uv) - (U8*)s);
a0ed51b3
LW
4344}
4345
4346/*
a0d0e21e
LW
4347- reginsert - insert an operator in front of already-emitted operand
4348*
4349* Means relocating the operand.
4350*/
76e3520e 4351STATIC void
830247a4 4352S_reginsert(pTHX_ RExC_state_t *pRExC_state, U8 op, regnode *opnd)
a687059c 4353{
c277df42
IZ
4354 register regnode *src;
4355 register regnode *dst;
4356 register regnode *place;
4357 register int offset = regarglen[(U8)op];
b81d288d 4358
22c35a8c 4359/* (PL_regkind[(U8)op] == CURLY ? EXTRA_STEP_2ARGS : 0); */
c277df42
IZ
4360
4361 if (SIZE_ONLY) {
830247a4 4362 RExC_size += NODE_STEP_REGNODE + offset;
a0d0e21e
LW
4363 return;
4364 }
a687059c 4365
830247a4
IZ
4366 src = RExC_emit;
4367 RExC_emit += NODE_STEP_REGNODE + offset;
4368 dst = RExC_emit;
fac92740 4369 while (src > opnd) {
c277df42 4370 StructCopy(--src, --dst, regnode);
fac92740 4371 if (RExC_offsets) { /* MJD 20010112 */
ccb2c380 4372 MJD_OFFSET_DEBUG(("%s(%d): (op %s) %s copy %u -> %u (max %u).\n",
fac92740 4373 "reg_insert",
ccb2c380
MP
4374 __LINE__,
4375 reg_name[op],
fac92740
MJD
4376 dst - RExC_emit_start > RExC_offsets[0]
4377 ? "Overwriting end of array!\n" : "OK",
4378 src - RExC_emit_start,
4379 dst - RExC_emit_start,
4380 RExC_offsets[0]));
ccb2c380
MP
4381 Set_Node_Offset_To_R(dst-RExC_emit_start, Node_Offset(src));
4382 Set_Node_Length_To_R(dst-RExC_emit_start, Node_Length(src));
fac92740
MJD
4383 }
4384 }
4385
a0d0e21e
LW
4386
4387 place = opnd; /* Op node, where operand used to be. */
fac92740 4388 if (RExC_offsets) { /* MJD */
ccb2c380 4389 MJD_OFFSET_DEBUG(("%s(%d): (op %s) %s %u <- %u (max %u).\n",
fac92740 4390 "reginsert",
ccb2c380
MP
4391 __LINE__,
4392 reg_name[op],
fac92740
MJD
4393 place - RExC_emit_start > RExC_offsets[0]
4394 ? "Overwriting end of array!\n" : "OK",
4395 place - RExC_emit_start,
4396 RExC_parse - RExC_start,
4397 RExC_offsets[0]));
ccb2c380 4398 Set_Node_Offset(place, RExC_parse);
fac92740 4399 }
c277df42
IZ
4400 src = NEXTOPER(place);
4401 FILL_ADVANCE_NODE(place, op);
4402 Zero(src, offset, regnode);
a687059c
LW
4403}
4404
4405/*
c277df42 4406- regtail - set the next-pointer at the end of a node chain of p to val.
a0d0e21e 4407*/
76e3520e 4408STATIC void
830247a4 4409S_regtail(pTHX_ RExC_state_t *pRExC_state, regnode *p, regnode *val)
a687059c 4410{
c277df42
IZ
4411 register regnode *scan;
4412 register regnode *temp;
a0d0e21e 4413
c277df42 4414 if (SIZE_ONLY)
a0d0e21e
LW
4415 return;
4416
4417 /* Find last node. */
4418 scan = p;
4419 for (;;) {
4420 temp = regnext(scan);
4421 if (temp == NULL)
4422 break;
4423 scan = temp;
4424 }
a687059c 4425
c277df42
IZ
4426 if (reg_off_by_arg[OP(scan)]) {
4427 ARG_SET(scan, val - scan);
a0ed51b3
LW
4428 }
4429 else {
c277df42
IZ
4430 NEXT_OFF(scan) = val - scan;
4431 }
a687059c
LW
4432}
4433
4434/*
a0d0e21e
LW
4435- regoptail - regtail on operand of first argument; nop if operandless
4436*/
76e3520e 4437STATIC void
830247a4 4438S_regoptail(pTHX_ RExC_state_t *pRExC_state, regnode *p, regnode *val)
a687059c 4439{
a0d0e21e 4440 /* "Operandless" and "op != BRANCH" are synonymous in practice. */
c277df42
IZ
4441 if (p == NULL || SIZE_ONLY)
4442 return;
22c35a8c 4443 if (PL_regkind[(U8)OP(p)] == BRANCH) {
830247a4 4444 regtail(pRExC_state, NEXTOPER(p), val);
a0ed51b3 4445 }
22c35a8c 4446 else if ( PL_regkind[(U8)OP(p)] == BRANCHJ) {
830247a4 4447 regtail(pRExC_state, NEXTOPER(NEXTOPER(p)), val);
a0ed51b3
LW
4448 }
4449 else
a0d0e21e 4450 return;
a687059c
LW
4451}
4452
4453/*
4454 - regcurly - a little FSA that accepts {\d+,?\d*}
4455 */
79072805 4456STATIC I32
cea2e8a9 4457S_regcurly(pTHX_ register char *s)
a687059c
LW
4458{
4459 if (*s++ != '{')
4460 return FALSE;
f0fcb552 4461 if (!isDIGIT(*s))
a687059c 4462 return FALSE;
f0fcb552 4463 while (isDIGIT(*s))
a687059c
LW
4464 s++;
4465 if (*s == ',')
4466 s++;
f0fcb552 4467 while (isDIGIT(*s))
a687059c
LW
4468 s++;
4469 if (*s != '}')
4470 return FALSE;
4471 return TRUE;
4472}
4473
a687059c 4474
8fa7f367
JH
4475#ifdef DEBUGGING
4476
76e3520e 4477STATIC regnode *
cea2e8a9 4478S_dumpuntil(pTHX_ regnode *start, regnode *node, regnode *last, SV* sv, I32 l)
c277df42 4479{
f248d071 4480 register U8 op = EXACT; /* Arbitrary non-END op. */
155aba94 4481 register regnode *next;
c277df42
IZ
4482
4483 while (op != END && (!last || node < last)) {
4484 /* While that wasn't END last time... */
4485
4486 NODE_ALIGN(node);
4487 op = OP(node);
4488 if (op == CLOSE)
4489 l--;
4490 next = regnext(node);
4491 /* Where, what. */
4492 if (OP(node) == OPTIMIZED)
4493 goto after_print;
4494 regprop(sv, node);
b900a521 4495 PerlIO_printf(Perl_debug_log, "%4"IVdf":%*s%s", (IV)(node - start),
f1dbda3d 4496 (int)(2*l + 1), "", SvPVX(sv));
c277df42
IZ
4497 if (next == NULL) /* Next ptr. */
4498 PerlIO_printf(Perl_debug_log, "(0)");
b81d288d 4499 else
b900a521 4500 PerlIO_printf(Perl_debug_log, "(%"IVdf")", (IV)(next - start));
c277df42
IZ
4501 (void)PerlIO_putc(Perl_debug_log, '\n');
4502 after_print:
22c35a8c 4503 if (PL_regkind[(U8)op] == BRANCHJ) {
b81d288d
AB
4504 register regnode *nnode = (OP(next) == LONGJMP
4505 ? regnext(next)
c277df42
IZ
4506 : next);
4507 if (last && nnode > last)
4508 nnode = last;
4509 node = dumpuntil(start, NEXTOPER(NEXTOPER(node)), nnode, sv, l + 1);
a0ed51b3 4510 }
22c35a8c 4511 else if (PL_regkind[(U8)op] == BRANCH) {
c277df42 4512 node = dumpuntil(start, NEXTOPER(node), next, sv, l + 1);
a0ed51b3
LW
4513 }
4514 else if ( op == CURLY) { /* `next' might be very big: optimizer */
c277df42
IZ
4515 node = dumpuntil(start, NEXTOPER(node) + EXTRA_STEP_2ARGS,
4516 NEXTOPER(node) + EXTRA_STEP_2ARGS + 1, sv, l + 1);
a0ed51b3 4517 }
22c35a8c 4518 else if (PL_regkind[(U8)op] == CURLY && op != CURLYX) {
c277df42
IZ
4519 node = dumpuntil(start, NEXTOPER(node) + EXTRA_STEP_2ARGS,
4520 next, sv, l + 1);
a0ed51b3
LW
4521 }
4522 else if ( op == PLUS || op == STAR) {
c277df42 4523 node = dumpuntil(start, NEXTOPER(node), NEXTOPER(node) + 1, sv, l + 1);
a0ed51b3
LW
4524 }
4525 else if (op == ANYOF) {
4f66b38d
HS
4526 /* arglen 1 + class block */
4527 node += 1 + ((ANYOF_FLAGS(node) & ANYOF_LARGE)
4528 ? ANYOF_CLASS_SKIP : ANYOF_SKIP);
4529 node = NEXTOPER(node);
a0ed51b3 4530 }
22c35a8c 4531 else if (PL_regkind[(U8)op] == EXACT) {
c277df42 4532 /* Literal string, where present. */
cd439c50 4533 node += NODE_SZ_STR(node) - 1;
c277df42 4534 node = NEXTOPER(node);
a0ed51b3
LW
4535 }
4536 else {
c277df42
IZ
4537 node = NEXTOPER(node);
4538 node += regarglen[(U8)op];
4539 }
4540 if (op == CURLYX || op == OPEN)
4541 l++;
4542 else if (op == WHILEM)
4543 l--;
4544 }
4545 return node;
4546}
4547
8fa7f367
JH
4548#endif /* DEBUGGING */
4549
a687059c 4550/*
fd181c75 4551 - regdump - dump a regexp onto Perl_debug_log in vaguely comprehensible form
a687059c
LW
4552 */
4553void
864dbfa3 4554Perl_regdump(pTHX_ regexp *r)
a687059c 4555{
35ff7856 4556#ifdef DEBUGGING
46fc3d4c 4557 SV *sv = sv_newmortal();
a687059c 4558
c277df42 4559 (void)dumpuntil(r->program, r->program + 1, NULL, sv, 0);
a0d0e21e
LW
4560
4561 /* Header fields of interest. */
c277df42 4562 if (r->anchored_substr)
7b0972df 4563 PerlIO_printf(Perl_debug_log,
b81d288d 4564 "anchored `%s%.*s%s'%s at %"IVdf" ",
3280af22 4565 PL_colors[0],
7b0972df 4566 (int)(SvCUR(r->anchored_substr) - (SvTAIL(r->anchored_substr)!=0)),
b81d288d 4567 SvPVX(r->anchored_substr),
3280af22 4568 PL_colors[1],
c277df42 4569 SvTAIL(r->anchored_substr) ? "$" : "",
7b0972df 4570 (IV)r->anchored_offset);
33b8afdf
JH
4571 else if (r->anchored_utf8)
4572 PerlIO_printf(Perl_debug_log,
4573 "anchored utf8 `%s%.*s%s'%s at %"IVdf" ",
4574 PL_colors[0],
4575 (int)(SvCUR(r->anchored_utf8) - (SvTAIL(r->anchored_utf8)!=0)),
4576 SvPVX(r->anchored_utf8),
4577 PL_colors[1],
4578 SvTAIL(r->anchored_utf8) ? "$" : "",
4579 (IV)r->anchored_offset);
c277df42 4580 if (r->float_substr)
7b0972df 4581 PerlIO_printf(Perl_debug_log,
b81d288d 4582 "floating `%s%.*s%s'%s at %"IVdf"..%"UVuf" ",
3280af22 4583 PL_colors[0],
b81d288d 4584 (int)(SvCUR(r->float_substr) - (SvTAIL(r->float_substr)!=0)),
2c2d71f5 4585 SvPVX(r->float_substr),
3280af22 4586 PL_colors[1],
c277df42 4587 SvTAIL(r->float_substr) ? "$" : "",
7b0972df 4588 (IV)r->float_min_offset, (UV)r->float_max_offset);
33b8afdf
JH
4589 else if (r->float_utf8)
4590 PerlIO_printf(Perl_debug_log,
4591 "floating utf8 `%s%.*s%s'%s at %"IVdf"..%"UVuf" ",
4592 PL_colors[0],
4593 (int)(SvCUR(r->float_utf8) - (SvTAIL(r->float_utf8)!=0)),
4594 SvPVX(r->float_utf8),
4595 PL_colors[1],
4596 SvTAIL(r->float_utf8) ? "$" : "",
4597 (IV)r->float_min_offset, (UV)r->float_max_offset);
4598 if (r->check_substr || r->check_utf8)
b81d288d
AB
4599 PerlIO_printf(Perl_debug_log,
4600 r->check_substr == r->float_substr
33b8afdf 4601 && r->check_utf8 == r->float_utf8
c277df42
IZ
4602 ? "(checking floating" : "(checking anchored");
4603 if (r->reganch & ROPT_NOSCAN)
4604 PerlIO_printf(Perl_debug_log, " noscan");
4605 if (r->reganch & ROPT_CHECK_ALL)
4606 PerlIO_printf(Perl_debug_log, " isall");
33b8afdf 4607 if (r->check_substr || r->check_utf8)
c277df42
IZ
4608 PerlIO_printf(Perl_debug_log, ") ");
4609
46fc3d4c 4610 if (r->regstclass) {
4611 regprop(sv, r->regstclass);
4612 PerlIO_printf(Perl_debug_log, "stclass `%s' ", SvPVX(sv));
4613 }
774d564b 4614 if (r->reganch & ROPT_ANCH) {
4615 PerlIO_printf(Perl_debug_log, "anchored");
4616 if (r->reganch & ROPT_ANCH_BOL)
4617 PerlIO_printf(Perl_debug_log, "(BOL)");
c277df42
IZ
4618 if (r->reganch & ROPT_ANCH_MBOL)
4619 PerlIO_printf(Perl_debug_log, "(MBOL)");
cad2e5aa
JH
4620 if (r->reganch & ROPT_ANCH_SBOL)
4621 PerlIO_printf(Perl_debug_log, "(SBOL)");
774d564b 4622 if (r->reganch & ROPT_ANCH_GPOS)
4623 PerlIO_printf(Perl_debug_log, "(GPOS)");
4624 PerlIO_putc(Perl_debug_log, ' ');
4625 }
c277df42
IZ
4626 if (r->reganch & ROPT_GPOS_SEEN)
4627 PerlIO_printf(Perl_debug_log, "GPOS ");
a0d0e21e 4628 if (r->reganch & ROPT_SKIP)
760ac839 4629 PerlIO_printf(Perl_debug_log, "plus ");
a0d0e21e 4630 if (r->reganch & ROPT_IMPLICIT)
760ac839 4631 PerlIO_printf(Perl_debug_log, "implicit ");
760ac839 4632 PerlIO_printf(Perl_debug_log, "minlen %ld ", (long) r->minlen);
ce862d02
IZ
4633 if (r->reganch & ROPT_EVAL_SEEN)
4634 PerlIO_printf(Perl_debug_log, "with eval ");
760ac839 4635 PerlIO_printf(Perl_debug_log, "\n");
fac92740
MJD
4636 if (r->offsets) {
4637 U32 i;
4638 U32 len = r->offsets[0];
392fbf5d 4639 PerlIO_printf(Perl_debug_log, "Offsets: [%"UVuf"]\n\t", (UV)r->offsets[0]);
fac92740 4640 for (i = 1; i <= len; i++)
392fbf5d
RB
4641 PerlIO_printf(Perl_debug_log, "%"UVuf"[%"UVuf"] ",
4642 (UV)r->offsets[i*2-1],
4643 (UV)r->offsets[i*2]);
fac92740
MJD
4644 PerlIO_printf(Perl_debug_log, "\n");
4645 }
17c3b450 4646#endif /* DEBUGGING */
a687059c
LW
4647}
4648
8fa7f367
JH
4649#ifdef DEBUGGING
4650
653099ff
GS
4651STATIC void
4652S_put_byte(pTHX_ SV *sv, int c)
4653{
7be5a6cf 4654 if (isCNTRL(c) || c == 255 || !isPRINT(c))
653099ff
GS
4655 Perl_sv_catpvf(aTHX_ sv, "\\%o", c);
4656 else if (c == '-' || c == ']' || c == '\\' || c == '^')
4657 Perl_sv_catpvf(aTHX_ sv, "\\%c", c);
4658 else
4659 Perl_sv_catpvf(aTHX_ sv, "%c", c);
4660}
4661
8fa7f367
JH
4662#endif /* DEBUGGING */
4663
a687059c 4664/*
a0d0e21e
LW
4665- regprop - printable representation of opcode
4666*/
46fc3d4c 4667void
864dbfa3 4668Perl_regprop(pTHX_ SV *sv, regnode *o)
a687059c 4669{
35ff7856 4670#ifdef DEBUGGING
9b155405 4671 register int k;
a0d0e21e 4672
54dc92de 4673 sv_setpvn(sv, "", 0);
9b155405 4674 if (OP(o) >= reg_num) /* regnode.type is unsigned */
830247a4
IZ
4675 /* It would be nice to FAIL() here, but this may be called from
4676 regexec.c, and it would be hard to supply pRExC_state. */
4677 Perl_croak(aTHX_ "Corrupted regexp opcode");
9b155405
IZ
4678 sv_catpv(sv, (char*)reg_name[OP(o)]); /* Take off const! */
4679
4680 k = PL_regkind[(U8)OP(o)];
4681
2a782b5b
JH
4682 if (k == EXACT) {
4683 SV *dsv = sv_2mortal(newSVpvn("", 0));
c728cb41
JH
4684 /* Using is_utf8_string() is a crude hack but it may
4685 * be the best for now since we have no flag "this EXACTish
4686 * node was UTF-8" --jhi */
4687 bool do_utf8 = is_utf8_string((U8*)STRING(o), STR_LEN(o));
8a989385 4688 char *s = do_utf8 ?
c728cb41
JH
4689 pv_uni_display(dsv, (U8*)STRING(o), STR_LEN(o), 60,
4690 UNI_DISPLAY_REGEX) :
2a782b5b 4691 STRING(o);
40eddc46 4692 int len = do_utf8 ?
2a782b5b
JH
4693 strlen(s) :
4694 STR_LEN(o);
4695 Perl_sv_catpvf(aTHX_ sv, " <%s%.*s%s>",
4696 PL_colors[0],
4697 len, s,
4698 PL_colors[1]);
4699 }
9b155405 4700 else if (k == CURLY) {
cb434fcc 4701 if (OP(o) == CURLYM || OP(o) == CURLYN || OP(o) == CURLYX)
cea2e8a9
GS
4702 Perl_sv_catpvf(aTHX_ sv, "[%d]", o->flags); /* Parenth number */
4703 Perl_sv_catpvf(aTHX_ sv, " {%d,%d}", ARG1(o), ARG2(o));
a0d0e21e 4704 }
2c2d71f5
JH
4705 else if (k == WHILEM && o->flags) /* Ordinal/of */
4706 Perl_sv_catpvf(aTHX_ sv, "[%d/%d]", o->flags & 0xf, o->flags>>4);
9b155405 4707 else if (k == REF || k == OPEN || k == CLOSE || k == GROUPP )
894356b3 4708 Perl_sv_catpvf(aTHX_ sv, "%d", (int)ARG(o)); /* Parenth number */
9b155405 4709 else if (k == LOGICAL)
04ebc1ab 4710 Perl_sv_catpvf(aTHX_ sv, "[%d]", o->flags); /* 2: embedded, otherwise 1 */
653099ff
GS
4711 else if (k == ANYOF) {
4712 int i, rangestart = -1;
ffc61ed2 4713 U8 flags = ANYOF_FLAGS(o);
19860706
JH
4714 const char * const anyofs[] = { /* Should be syncronized with
4715 * ANYOF_ #xdefines in regcomp.h */
653099ff
GS
4716 "\\w",
4717 "\\W",
4718 "\\s",
4719 "\\S",
4720 "\\d",
4721 "\\D",
4722 "[:alnum:]",
4723 "[:^alnum:]",
4724 "[:alpha:]",
4725 "[:^alpha:]",
4726 "[:ascii:]",
4727 "[:^ascii:]",
4728 "[:ctrl:]",
4729 "[:^ctrl:]",
4730 "[:graph:]",
4731 "[:^graph:]",
4732 "[:lower:]",
4733 "[:^lower:]",
4734 "[:print:]",
4735 "[:^print:]",
4736 "[:punct:]",
4737 "[:^punct:]",
4738 "[:upper:]",
aaa51d5e 4739 "[:^upper:]",
653099ff 4740 "[:xdigit:]",
aaa51d5e
JF
4741 "[:^xdigit:]",
4742 "[:space:]",
4743 "[:^space:]",
4744 "[:blank:]",
4745 "[:^blank:]"
653099ff
GS
4746 };
4747
19860706 4748 if (flags & ANYOF_LOCALE)
653099ff 4749 sv_catpv(sv, "{loc}");
19860706 4750 if (flags & ANYOF_FOLD)
653099ff
GS
4751 sv_catpv(sv, "{i}");
4752 Perl_sv_catpvf(aTHX_ sv, "[%s", PL_colors[0]);
19860706 4753 if (flags & ANYOF_INVERT)
653099ff 4754 sv_catpv(sv, "^");
ffc61ed2
JH
4755 for (i = 0; i <= 256; i++) {
4756 if (i < 256 && ANYOF_BITMAP_TEST(o,i)) {
4757 if (rangestart == -1)
4758 rangestart = i;
4759 } else if (rangestart != -1) {
4760 if (i <= rangestart + 3)
4761 for (; rangestart < i; rangestart++)
653099ff 4762 put_byte(sv, rangestart);
ffc61ed2
JH
4763 else {
4764 put_byte(sv, rangestart);
4765 sv_catpv(sv, "-");
4766 put_byte(sv, i - 1);
653099ff 4767 }
ffc61ed2 4768 rangestart = -1;
653099ff 4769 }
847a199f 4770 }
ffc61ed2
JH
4771
4772 if (o->flags & ANYOF_CLASS)
4773 for (i = 0; i < sizeof(anyofs)/sizeof(char*); i++)
4774 if (ANYOF_CLASS_TEST(o,i))
4775 sv_catpv(sv, anyofs[i]);
4776
4777 if (flags & ANYOF_UNICODE)
4778 sv_catpv(sv, "{unicode}");
1aa99e6b 4779 else if (flags & ANYOF_UNICODE_ALL)
2a782b5b 4780 sv_catpv(sv, "{unicode_all}");
ffc61ed2
JH
4781
4782 {
4783 SV *lv;
9e55ce06 4784 SV *sw = regclass_swash(o, FALSE, &lv, 0);
b81d288d 4785
ffc61ed2
JH
4786 if (lv) {
4787 if (sw) {
ffc61ed2 4788 U8 s[UTF8_MAXLEN+1];
b81d288d 4789
ffc61ed2 4790 for (i = 0; i <= 256; i++) { /* just the first 256 */
2b9d42f0 4791 U8 *e = uvchr_to_utf8(s, i);
ffc61ed2 4792
3568d838 4793 if (i < 256 && swash_fetch(sw, s, TRUE)) {
ffc61ed2
JH
4794 if (rangestart == -1)
4795 rangestart = i;
4796 } else if (rangestart != -1) {
4797 U8 *p;
b81d288d 4798
ffc61ed2
JH
4799 if (i <= rangestart + 3)
4800 for (; rangestart < i; rangestart++) {
2b9d42f0 4801 for(e = uvchr_to_utf8(s, rangestart), p = s; p < e; p++)
ffc61ed2
JH
4802 put_byte(sv, *p);
4803 }
4804 else {
2b9d42f0 4805 for (e = uvchr_to_utf8(s, rangestart), p = s; p < e; p++)
ffc61ed2
JH
4806 put_byte(sv, *p);
4807 sv_catpv(sv, "-");
2b9d42f0 4808 for (e = uvchr_to_utf8(s, i - 1), p = s; p < e; p++)
ffc61ed2
JH
4809 put_byte(sv, *p);
4810 }
4811 rangestart = -1;
4812 }
19860706 4813 }
ffc61ed2
JH
4814
4815 sv_catpv(sv, "..."); /* et cetera */
19860706 4816 }
fde631ed 4817
ffc61ed2
JH
4818 {
4819 char *s = savepv(SvPVX(lv));
4820 char *origs = s;
b81d288d 4821
ffc61ed2 4822 while(*s && *s != '\n') s++;
b81d288d 4823
ffc61ed2
JH
4824 if (*s == '\n') {
4825 char *t = ++s;
4826
4827 while (*s) {
4828 if (*s == '\n')
4829 *s = ' ';
4830 s++;
4831 }
4832 if (s[-1] == ' ')
4833 s[-1] = 0;
4834
4835 sv_catpv(sv, t);
fde631ed 4836 }
b81d288d 4837
ffc61ed2 4838 Safefree(origs);
fde631ed
JH
4839 }
4840 }
653099ff 4841 }
ffc61ed2 4842
653099ff
GS
4843 Perl_sv_catpvf(aTHX_ sv, "%s]", PL_colors[1]);
4844 }
9b155405 4845 else if (k == BRANCHJ && (OP(o) == UNLESSM || OP(o) == IFMATCH))
cea2e8a9 4846 Perl_sv_catpvf(aTHX_ sv, "[-%d]", o->flags);
17c3b450 4847#endif /* DEBUGGING */
35ff7856 4848}
a687059c 4849
cad2e5aa
JH
4850SV *
4851Perl_re_intuit_string(pTHX_ regexp *prog)
4852{ /* Assume that RE_INTUIT is set */
4853 DEBUG_r(
4854 { STRLEN n_a;
33b8afdf
JH
4855 char *s = SvPV(prog->check_substr
4856 ? prog->check_substr : prog->check_utf8, n_a);
cad2e5aa
JH
4857
4858 if (!PL_colorset) reginitcolors();
4859 PerlIO_printf(Perl_debug_log,
33b8afdf
JH
4860 "%sUsing REx %ssubstr:%s `%s%.60s%s%s'\n",
4861 PL_colors[4],
4862 prog->check_substr ? "" : "utf8 ",
4863 PL_colors[5],PL_colors[0],
cad2e5aa
JH
4864 s,
4865 PL_colors[1],
4866 (strlen(s) > 60 ? "..." : ""));
4867 } );
4868
33b8afdf 4869 return prog->check_substr ? prog->check_substr : prog->check_utf8;
cad2e5aa
JH
4870}
4871
2b69d0c2 4872void
864dbfa3 4873Perl_pregfree(pTHX_ struct regexp *r)
a687059c 4874{
9e55ce06
JH
4875#ifdef DEBUGGING
4876 SV *dsv = PERL_DEBUG_PAD_ZERO(0);
4877#endif
7821416a
IZ
4878
4879 if (!r || (--r->refcnt > 0))
4880 return;
9e55ce06 4881 DEBUG_r({
d103360b
HS
4882 int len;
4883 char *s;
4884
4885 s = (r->reganch & ROPT_UTF8) ? pv_uni_display(dsv, (U8*)r->precomp,
4886 r->prelen, 60, UNI_DISPLAY_REGEX)
9f369894 4887 : pv_display(dsv, r->precomp, r->prelen, 0, 60);
d103360b 4888 len = SvCUR(dsv);
9e55ce06
JH
4889 if (!PL_colorset)
4890 reginitcolors();
4891 PerlIO_printf(Perl_debug_log,
4892 "%sFreeing REx:%s `%s%*.*s%s%s'\n",
4893 PL_colors[4],PL_colors[5],PL_colors[0],
4894 len, len, s,
4895 PL_colors[1],
4896 len > 60 ? "..." : "");
4897 });
cad2e5aa 4898
c277df42 4899 if (r->precomp)
a0d0e21e 4900 Safefree(r->precomp);
fac92740
MJD
4901 if (r->offsets) /* 20010421 MJD */
4902 Safefree(r->offsets);
cf93c79d
IZ
4903 if (RX_MATCH_COPIED(r))
4904 Safefree(r->subbeg);
a193d654
GS
4905 if (r->substrs) {
4906 if (r->anchored_substr)
4907 SvREFCNT_dec(r->anchored_substr);
33b8afdf
JH
4908 if (r->anchored_utf8)
4909 SvREFCNT_dec(r->anchored_utf8);
a193d654
GS
4910 if (r->float_substr)
4911 SvREFCNT_dec(r->float_substr);
33b8afdf
JH
4912 if (r->float_utf8)
4913 SvREFCNT_dec(r->float_utf8);
2779dcf1 4914 Safefree(r->substrs);
a193d654 4915 }
c277df42
IZ
4916 if (r->data) {
4917 int n = r->data->count;
f3548bdc
DM
4918 PAD* new_comppad = NULL;
4919 PAD* old_comppad;
dfad63ad 4920
c277df42 4921 while (--n >= 0) {
261faec3 4922 /* If you add a ->what type here, update the comment in regcomp.h */
c277df42
IZ
4923 switch (r->data->what[n]) {
4924 case 's':
4925 SvREFCNT_dec((SV*)r->data->data[n]);
4926 break;
653099ff
GS
4927 case 'f':
4928 Safefree(r->data->data[n]);
4929 break;
dfad63ad
HS
4930 case 'p':
4931 new_comppad = (AV*)r->data->data[n];
4932 break;
c277df42 4933 case 'o':
dfad63ad 4934 if (new_comppad == NULL)
cea2e8a9 4935 Perl_croak(aTHX_ "panic: pregfree comppad");
f3548bdc
DM
4936 PAD_SAVE_LOCAL(old_comppad,
4937 /* Watch out for global destruction's random ordering. */
4938 (SvTYPE(new_comppad) == SVt_PVAV) ?
4939 new_comppad : Null(PAD *)
4940 );
9b978d73
DM
4941 if (!OpREFCNT_dec((OP_4tree*)r->data->data[n])) {
4942 op_free((OP_4tree*)r->data->data[n]);
4943 }
4944
f3548bdc 4945 PAD_RESTORE_LOCAL(old_comppad);
dfad63ad
HS
4946 SvREFCNT_dec((SV*)new_comppad);
4947 new_comppad = NULL;
c277df42
IZ
4948 break;
4949 case 'n':
9e55ce06 4950 break;
c277df42 4951 default:
830247a4 4952 Perl_croak(aTHX_ "panic: regfree data code '%c'", r->data->what[n]);
c277df42
IZ
4953 }
4954 }
4955 Safefree(r->data->what);
4956 Safefree(r->data);
a0d0e21e
LW
4957 }
4958 Safefree(r->startp);
4959 Safefree(r->endp);
4960 Safefree(r);
a687059c 4961}
c277df42
IZ
4962
4963/*
4964 - regnext - dig the "next" pointer out of a node
4965 *
4966 * [Note, when REGALIGN is defined there are two places in regmatch()
4967 * that bypass this code for speed.]
4968 */
4969regnode *
864dbfa3 4970Perl_regnext(pTHX_ register regnode *p)
c277df42
IZ
4971{
4972 register I32 offset;
4973
3280af22 4974 if (p == &PL_regdummy)
c277df42
IZ
4975 return(NULL);
4976
4977 offset = (reg_off_by_arg[OP(p)] ? ARG(p) : NEXT_OFF(p));
4978 if (offset == 0)
4979 return(NULL);
4980
c277df42 4981 return(p+offset);
c277df42
IZ
4982}
4983
01f988be 4984STATIC void
cea2e8a9 4985S_re_croak2(pTHX_ const char* pat1,const char* pat2,...)
c277df42
IZ
4986{
4987 va_list args;
4988 STRLEN l1 = strlen(pat1);
4989 STRLEN l2 = strlen(pat2);
4990 char buf[512];
06bf62c7 4991 SV *msv;
c277df42
IZ
4992 char *message;
4993
4994 if (l1 > 510)
4995 l1 = 510;
4996 if (l1 + l2 > 510)
4997 l2 = 510 - l1;
4998 Copy(pat1, buf, l1 , char);
4999 Copy(pat2, buf + l1, l2 , char);
3b818b81
GS
5000 buf[l1 + l2] = '\n';
5001 buf[l1 + l2 + 1] = '\0';
8736538c
AS
5002#ifdef I_STDARG
5003 /* ANSI variant takes additional second argument */
c277df42 5004 va_start(args, pat2);
8736538c
AS
5005#else
5006 va_start(args);
5007#endif
5a844595 5008 msv = vmess(buf, &args);
c277df42 5009 va_end(args);
06bf62c7 5010 message = SvPV(msv,l1);
c277df42
IZ
5011 if (l1 > 512)
5012 l1 = 512;
5013 Copy(message, buf, l1 , char);
5014 buf[l1] = '\0'; /* Overwrite \n */
cea2e8a9 5015 Perl_croak(aTHX_ "%s", buf);
c277df42 5016}
a0ed51b3
LW
5017
5018/* XXX Here's a total kludge. But we need to re-enter for swash routines. */
5019
5020void
864dbfa3 5021Perl_save_re_context(pTHX)
b81d288d 5022{
830247a4
IZ
5023#if 0
5024 SAVEPPTR(RExC_precomp); /* uncompiled string. */
5025 SAVEI32(RExC_npar); /* () count. */
5026 SAVEI32(RExC_size); /* Code size. */
e2509266 5027 SAVEI32(RExC_flags); /* are we folding, multilining? */
830247a4
IZ
5028 SAVEVPTR(RExC_rx); /* from regcomp.c */
5029 SAVEI32(RExC_seen); /* from regcomp.c */
5030 SAVEI32(RExC_sawback); /* Did we see \1, ...? */
5031 SAVEI32(RExC_naughty); /* How bad is this pattern? */
5032 SAVEVPTR(RExC_emit); /* Code-emit pointer; &regdummy = don't */
5033 SAVEPPTR(RExC_end); /* End of input for compile */
5034 SAVEPPTR(RExC_parse); /* Input-scan pointer. */
5035#endif
5036
5037 SAVEI32(PL_reg_flags); /* from regexec.c */
a0ed51b3 5038 SAVEPPTR(PL_bostr);
a0ed51b3
LW
5039 SAVEPPTR(PL_reginput); /* String-input pointer. */
5040 SAVEPPTR(PL_regbol); /* Beginning of input, for ^ check. */
5041 SAVEPPTR(PL_regeol); /* End of input, for $ check. */
7766f137
GS
5042 SAVEVPTR(PL_regstartp); /* Pointer to startp array. */
5043 SAVEVPTR(PL_regendp); /* Ditto for endp. */
5044 SAVEVPTR(PL_reglastparen); /* Similarly for lastparen. */
a0ed51b3 5045 SAVEPPTR(PL_regtill); /* How far we are required to go. */
b81d288d 5046 SAVEGENERICPV(PL_reg_start_tmp); /* from regexec.c */
a0ed51b3 5047 PL_reg_start_tmp = 0;
a0ed51b3
LW
5048 SAVEI32(PL_reg_start_tmpl); /* from regexec.c */
5049 PL_reg_start_tmpl = 0;
7766f137 5050 SAVEVPTR(PL_regdata);
a0ed51b3
LW
5051 SAVEI32(PL_reg_eval_set); /* from regexec.c */
5052 SAVEI32(PL_regnarrate); /* from regexec.c */
7766f137 5053 SAVEVPTR(PL_regprogram); /* from regexec.c */
a0ed51b3 5054 SAVEINT(PL_regindent); /* from regexec.c */
7766f137
GS
5055 SAVEVPTR(PL_regcc); /* from regexec.c */
5056 SAVEVPTR(PL_curcop);
7766f137
GS
5057 SAVEVPTR(PL_reg_call_cc); /* from regexec.c */
5058 SAVEVPTR(PL_reg_re); /* from regexec.c */
54b6e2fa
IZ
5059 SAVEPPTR(PL_reg_ganch); /* from regexec.c */
5060 SAVESPTR(PL_reg_sv); /* from regexec.c */
9febdf04 5061 SAVEBOOL(PL_reg_match_utf8); /* from regexec.c */
7766f137 5062 SAVEVPTR(PL_reg_magic); /* from regexec.c */
54b6e2fa 5063 SAVEI32(PL_reg_oldpos); /* from regexec.c */
7766f137
GS
5064 SAVEVPTR(PL_reg_oldcurpm); /* from regexec.c */
5065 SAVEVPTR(PL_reg_curpm); /* from regexec.c */
5fb7366e 5066 SAVEI32(PL_regnpar); /* () count. */
e49a9654 5067 SAVEI32(PL_regsize); /* from regexec.c */
ada6e8a9
AMS
5068
5069 {
5070 /* Save $1..$n (#18107: UTF-8 s/(\w+)/uc($1)/e); AMS 20021106. */
a8514157 5071 U32 i;
ada6e8a9
AMS
5072 GV *mgv;
5073 REGEXP *rx;
5074 char digits[16];
5075
5076 if (PL_curpm && (rx = PM_GETRE(PL_curpm))) {
5077 for (i = 1; i <= rx->nparens; i++) {
d994d9a1 5078 sprintf(digits, "%lu", (long)i);
ada6e8a9
AMS
5079 if ((mgv = gv_fetchpv(digits, FALSE, SVt_PV)))
5080 save_scalar(mgv);
5081 }
5082 }
5083 }
5084
54b6e2fa 5085#ifdef DEBUGGING
b81d288d 5086 SAVEPPTR(PL_reg_starttry); /* from regexec.c */
54b6e2fa 5087#endif
a0ed51b3 5088}
51371543 5089
51371543 5090static void
acfe0abc 5091clear_re(pTHX_ void *r)
51371543
GS
5092{
5093 ReREFCNT_dec((regexp *)r);
5094}
ffbc6a93 5095