This is a live mirror of the Perl 5 development currently hosted at https://github.com/perl/perl5
Make the :bytes conditional on PerlIO.
[perl5.git] / regcomp.c
CommitLineData
a0d0e21e
LW
1/* regcomp.c
2 */
3
4/*
5 * "A fair jaw-cracker dwarf-language must be." --Samwise Gamgee
6 */
7
a687059c
LW
8/* NOTE: this is derived from Henry Spencer's regexp code, and should not
9 * confused with the original package (see point 3 below). Thanks, Henry!
10 */
11
12/* Additional note: this code is very heavily munged from Henry's version
13 * in places. In some spots I've traded clarity for efficiency, so don't
14 * blame Henry for some of the lack of readability.
15 */
16
e50aee73
AD
17/* The names of the functions have been changed from regcomp and
18 * regexec to pregcomp and pregexec in order to avoid conflicts
19 * with the POSIX routines of the same names.
20*/
21
b9d5759e
AD
22#ifdef PERL_EXT_RE_BUILD
23/* need to replace pregcomp et al, so enable that */
24# ifndef PERL_IN_XSUB_RE
25# define PERL_IN_XSUB_RE
26# endif
27/* need access to debugger hooks */
cad2e5aa 28# if defined(PERL_EXT_RE_DEBUG) && !defined(DEBUGGING)
b9d5759e
AD
29# define DEBUGGING
30# endif
31#endif
32
33#ifdef PERL_IN_XSUB_RE
d06ea78c 34/* We *really* need to overwrite these symbols: */
56953603
IZ
35# define Perl_pregcomp my_regcomp
36# define Perl_regdump my_regdump
37# define Perl_regprop my_regprop
d06ea78c 38# define Perl_pregfree my_regfree
cad2e5aa
JH
39# define Perl_re_intuit_string my_re_intuit_string
40/* *These* symbols are masked to allow static link. */
d06ea78c 41# define Perl_regnext my_regnext
f0b8d043 42# define Perl_save_re_context my_save_re_context
b81d288d 43# define Perl_reginitcolors my_reginitcolors
c5be433b
GS
44
45# define PERL_NO_GET_CONTEXT
b81d288d 46#endif
56953603 47
f0fcb552 48/*SUPPRESS 112*/
a687059c 49/*
e50aee73 50 * pregcomp and pregexec -- regsub and regerror are not used in perl
a687059c
LW
51 *
52 * Copyright (c) 1986 by University of Toronto.
53 * Written by Henry Spencer. Not derived from licensed software.
54 *
55 * Permission is granted to anyone to use this software for any
56 * purpose on any computer system, and to redistribute it freely,
57 * subject to the following restrictions:
58 *
59 * 1. The author is not responsible for the consequences of use of
60 * this software, no matter how awful, even if they arise
61 * from defects in it.
62 *
63 * 2. The origin of this software must not be misrepresented, either
64 * by explicit claim or by omission.
65 *
66 * 3. Altered versions must be plainly marked as such, and must not
67 * be misrepresented as being the original software.
68 *
69 *
70 **** Alterations to Henry's code are...
71 ****
4c79ee7a 72 **** Copyright (c) 1991-2003, Larry Wall
a687059c 73 ****
9ef589d8
LW
74 **** You may distribute under the terms of either the GNU General Public
75 **** License or the Artistic License, as specified in the README file.
76
a687059c
LW
77 *
78 * Beware that some of this code is subtly aware of the way operator
79 * precedence is structured in regular expressions. Serious changes in
80 * regular-expression syntax might require a total rethink.
81 */
82#include "EXTERN.h"
864dbfa3 83#define PERL_IN_REGCOMP_C
a687059c 84#include "perl.h"
d06ea78c 85
acfe0abc 86#ifndef PERL_IN_XSUB_RE
d06ea78c
GS
87# include "INTERN.h"
88#endif
c277df42
IZ
89
90#define REG_COMP_C
a687059c
LW
91#include "regcomp.h"
92
d4cce5f1 93#ifdef op
11343788 94#undef op
d4cce5f1 95#endif /* op */
11343788 96
fe14fcc3
LW
97#ifdef MSDOS
98# if defined(BUGGY_MSC6)
99 /* MSC 6.00A breaks on op/regexp.t test 85 unless we turn this off */
100 # pragma optimize("a",off)
101 /* But MSC 6.00A is happy with 'w', for aliases only across function calls*/
102 # pragma optimize("w",on )
103# endif /* BUGGY_MSC6 */
104#endif /* MSDOS */
105
a687059c
LW
106#ifndef STATIC
107#define STATIC static
108#endif
109
830247a4 110typedef struct RExC_state_t {
e2509266 111 U32 flags; /* are we folding, multilining? */
830247a4
IZ
112 char *precomp; /* uncompiled string. */
113 regexp *rx;
fac92740 114 char *start; /* Start of input for compile */
830247a4
IZ
115 char *end; /* End of input for compile */
116 char *parse; /* Input-scan pointer. */
117 I32 whilem_seen; /* number of WHILEM in this expr */
fac92740 118 regnode *emit_start; /* Start of emitted-code area */
ffc61ed2 119 regnode *emit; /* Code-emit pointer; &regdummy = don't = compiling */
830247a4
IZ
120 I32 naughty; /* How bad is this pattern? */
121 I32 sawback; /* Did we see \1, ...? */
122 U32 seen;
123 I32 size; /* Code size. */
124 I32 npar; /* () count. */
125 I32 extralen;
126 I32 seen_zerolen;
127 I32 seen_evals;
1aa99e6b 128 I32 utf8;
830247a4
IZ
129#if ADD_TO_REGEXEC
130 char *starttry; /* -Dr: where regtry was called. */
131#define RExC_starttry (pRExC_state->starttry)
132#endif
133} RExC_state_t;
134
e2509266 135#define RExC_flags (pRExC_state->flags)
830247a4
IZ
136#define RExC_precomp (pRExC_state->precomp)
137#define RExC_rx (pRExC_state->rx)
fac92740 138#define RExC_start (pRExC_state->start)
830247a4
IZ
139#define RExC_end (pRExC_state->end)
140#define RExC_parse (pRExC_state->parse)
141#define RExC_whilem_seen (pRExC_state->whilem_seen)
fac92740 142#define RExC_offsets (pRExC_state->rx->offsets) /* I am not like the others */
830247a4 143#define RExC_emit (pRExC_state->emit)
fac92740 144#define RExC_emit_start (pRExC_state->emit_start)
830247a4
IZ
145#define RExC_naughty (pRExC_state->naughty)
146#define RExC_sawback (pRExC_state->sawback)
147#define RExC_seen (pRExC_state->seen)
148#define RExC_size (pRExC_state->size)
149#define RExC_npar (pRExC_state->npar)
150#define RExC_extralen (pRExC_state->extralen)
151#define RExC_seen_zerolen (pRExC_state->seen_zerolen)
152#define RExC_seen_evals (pRExC_state->seen_evals)
1aa99e6b 153#define RExC_utf8 (pRExC_state->utf8)
830247a4 154
a687059c
LW
155#define ISMULT1(c) ((c) == '*' || (c) == '+' || (c) == '?')
156#define ISMULT2(s) ((*s) == '*' || (*s) == '+' || (*s) == '?' || \
157 ((*s) == '{' && regcurly(s)))
a687059c 158
35c8bce7
LW
159#ifdef SPSTART
160#undef SPSTART /* dratted cpp namespace... */
161#endif
a687059c
LW
162/*
163 * Flags to be passed up and down.
164 */
a687059c 165#define WORST 0 /* Worst case. */
821b33a5 166#define HASWIDTH 0x1 /* Known to match non-null strings. */
a0d0e21e
LW
167#define SIMPLE 0x2 /* Simple enough to be STAR/PLUS operand. */
168#define SPSTART 0x4 /* Starts with * or +. */
169#define TRYAGAIN 0x8 /* Weeded out a declaration. */
a687059c 170
2c2d71f5
JH
171/* Length of a variant. */
172
173typedef struct scan_data_t {
174 I32 len_min;
175 I32 len_delta;
176 I32 pos_min;
177 I32 pos_delta;
178 SV *last_found;
179 I32 last_end; /* min value, <0 unless valid. */
180 I32 last_start_min;
181 I32 last_start_max;
182 SV **longest; /* Either &l_fixed, or &l_float. */
183 SV *longest_fixed;
184 I32 offset_fixed;
185 SV *longest_float;
186 I32 offset_float_min;
187 I32 offset_float_max;
188 I32 flags;
189 I32 whilem_c;
cb434fcc 190 I32 *last_closep;
653099ff 191 struct regnode_charclass_class *start_class;
2c2d71f5
JH
192} scan_data_t;
193
a687059c 194/*
e50aee73 195 * Forward declarations for pregcomp()'s friends.
a687059c 196 */
a0d0e21e 197
b81d288d 198static scan_data_t zero_scan_data = { 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
cb434fcc 199 0, 0, 0, 0, 0, 0};
c277df42
IZ
200
201#define SF_BEFORE_EOL (SF_BEFORE_SEOL|SF_BEFORE_MEOL)
202#define SF_BEFORE_SEOL 0x1
203#define SF_BEFORE_MEOL 0x2
204#define SF_FIX_BEFORE_EOL (SF_FIX_BEFORE_SEOL|SF_FIX_BEFORE_MEOL)
205#define SF_FL_BEFORE_EOL (SF_FL_BEFORE_SEOL|SF_FL_BEFORE_MEOL)
206
09b7f37c
CB
207#ifdef NO_UNARY_PLUS
208# define SF_FIX_SHIFT_EOL (0+2)
209# define SF_FL_SHIFT_EOL (0+4)
210#else
211# define SF_FIX_SHIFT_EOL (+2)
212# define SF_FL_SHIFT_EOL (+4)
213#endif
c277df42
IZ
214
215#define SF_FIX_BEFORE_SEOL (SF_BEFORE_SEOL << SF_FIX_SHIFT_EOL)
216#define SF_FIX_BEFORE_MEOL (SF_BEFORE_MEOL << SF_FIX_SHIFT_EOL)
217
218#define SF_FL_BEFORE_SEOL (SF_BEFORE_SEOL << SF_FL_SHIFT_EOL)
219#define SF_FL_BEFORE_MEOL (SF_BEFORE_MEOL << SF_FL_SHIFT_EOL) /* 0x20 */
220#define SF_IS_INF 0x40
221#define SF_HAS_PAR 0x80
222#define SF_IN_PAR 0x100
223#define SF_HAS_EVAL 0x200
4bfe0158 224#define SCF_DO_SUBSTR 0x400
653099ff
GS
225#define SCF_DO_STCLASS_AND 0x0800
226#define SCF_DO_STCLASS_OR 0x1000
227#define SCF_DO_STCLASS (SCF_DO_STCLASS_AND|SCF_DO_STCLASS_OR)
e1901655 228#define SCF_WHILEM_VISITED_POS 0x2000
c277df42 229
eb160463 230#define UTF (RExC_utf8 != 0)
e2509266
JH
231#define LOC ((RExC_flags & PMf_LOCALE) != 0)
232#define FOLD ((RExC_flags & PMf_FOLD) != 0)
a0ed51b3 233
ffc61ed2 234#define OOB_UNICODE 12345678
93733859 235#define OOB_NAMEDCLASS -1
b8c5462f 236
a0ed51b3
LW
237#define CHR_SVLEN(sv) (UTF ? sv_len_utf8(sv) : SvCUR(sv))
238#define CHR_DIST(a,b) (UTF ? utf8_distance(a,b) : a - b)
239
8615cb43 240
b45f050a
JF
241/* length of regex to show in messages that don't mark a position within */
242#define RegexLengthToShowInErrorMessages 127
243
244/*
245 * If MARKER[12] are adjusted, be sure to adjust the constants at the top
246 * of t/op/regmesg.t, the tests in t/op/re_tests, and those in
247 * op/pragma/warn/regcomp.
248 */
7253e4e3
RK
249#define MARKER1 "<-- HERE" /* marker as it appears in the description */
250#define MARKER2 " <-- HERE " /* marker as it appears within the regex */
b81d288d 251
7253e4e3 252#define REPORT_LOCATION " in regex; marked by " MARKER1 " in m/%.*s" MARKER2 "%s/"
b45f050a
JF
253
254/*
255 * Calls SAVEDESTRUCTOR_X if needed, then calls Perl_croak with the given
256 * arg. Show regex, up to a maximum length. If it's too long, chop and add
257 * "...".
258 */
ccb2c380
MP
259#define FAIL(msg) STMT_START { \
260 char *ellipses = ""; \
261 IV len = RExC_end - RExC_precomp; \
262 \
263 if (!SIZE_ONLY) \
264 SAVEDESTRUCTOR_X(clear_re,(void*)RExC_rx); \
265 if (len > RegexLengthToShowInErrorMessages) { \
266 /* chop 10 shorter than the max, to ensure meaning of "..." */ \
267 len = RegexLengthToShowInErrorMessages - 10; \
268 ellipses = "..."; \
269 } \
270 Perl_croak(aTHX_ "%s in regex m/%.*s%s/", \
271 msg, (int)len, RExC_precomp, ellipses); \
272} STMT_END
8615cb43 273
b45f050a
JF
274/*
275 * Calls SAVEDESTRUCTOR_X if needed, then calls Perl_croak with the given
276 * args. Show regex, up to a maximum length. If it's too long, chop and add
277 * "...".
278 */
ccb2c380
MP
279#define FAIL2(pat,msg) STMT_START { \
280 char *ellipses = ""; \
281 IV len = RExC_end - RExC_precomp; \
282 \
283 if (!SIZE_ONLY) \
284 SAVEDESTRUCTOR_X(clear_re,(void*)RExC_rx); \
285 if (len > RegexLengthToShowInErrorMessages) { \
286 /* chop 10 shorter than the max, to ensure meaning of "..." */ \
287 len = RegexLengthToShowInErrorMessages - 10; \
288 ellipses = "..."; \
289 } \
290 S_re_croak2(aTHX_ pat, " in regex m/%.*s%s/", \
291 msg, (int)len, RExC_precomp, ellipses); \
292} STMT_END
b45f050a
JF
293
294
295/*
296 * Simple_vFAIL -- like FAIL, but marks the current location in the scan
297 */
ccb2c380
MP
298#define Simple_vFAIL(m) STMT_START { \
299 IV offset = RExC_parse - RExC_precomp; \
300 Perl_croak(aTHX_ "%s" REPORT_LOCATION, \
301 m, (int)offset, RExC_precomp, RExC_precomp + offset); \
302} STMT_END
b45f050a
JF
303
304/*
305 * Calls SAVEDESTRUCTOR_X if needed, then Simple_vFAIL()
306 */
ccb2c380
MP
307#define vFAIL(m) STMT_START { \
308 if (!SIZE_ONLY) \
309 SAVEDESTRUCTOR_X(clear_re,(void*)RExC_rx); \
310 Simple_vFAIL(m); \
311} STMT_END
b45f050a
JF
312
313/*
314 * Like Simple_vFAIL(), but accepts two arguments.
315 */
ccb2c380
MP
316#define Simple_vFAIL2(m,a1) STMT_START { \
317 IV offset = RExC_parse - RExC_precomp; \
318 S_re_croak2(aTHX_ m, REPORT_LOCATION, a1, \
319 (int)offset, RExC_precomp, RExC_precomp + offset); \
320} STMT_END
b45f050a
JF
321
322/*
323 * Calls SAVEDESTRUCTOR_X if needed, then Simple_vFAIL2().
324 */
ccb2c380
MP
325#define vFAIL2(m,a1) STMT_START { \
326 if (!SIZE_ONLY) \
327 SAVEDESTRUCTOR_X(clear_re,(void*)RExC_rx); \
328 Simple_vFAIL2(m, a1); \
329} STMT_END
b45f050a
JF
330
331
332/*
333 * Like Simple_vFAIL(), but accepts three arguments.
334 */
ccb2c380
MP
335#define Simple_vFAIL3(m, a1, a2) STMT_START { \
336 IV offset = RExC_parse - RExC_precomp; \
337 S_re_croak2(aTHX_ m, REPORT_LOCATION, a1, a2, \
338 (int)offset, RExC_precomp, RExC_precomp + offset); \
339} STMT_END
b45f050a
JF
340
341/*
342 * Calls SAVEDESTRUCTOR_X if needed, then Simple_vFAIL3().
343 */
ccb2c380
MP
344#define vFAIL3(m,a1,a2) STMT_START { \
345 if (!SIZE_ONLY) \
346 SAVEDESTRUCTOR_X(clear_re,(void*)RExC_rx); \
347 Simple_vFAIL3(m, a1, a2); \
348} STMT_END
b45f050a
JF
349
350/*
351 * Like Simple_vFAIL(), but accepts four arguments.
352 */
ccb2c380
MP
353#define Simple_vFAIL4(m, a1, a2, a3) STMT_START { \
354 IV offset = RExC_parse - RExC_precomp; \
355 S_re_croak2(aTHX_ m, REPORT_LOCATION, a1, a2, a3, \
356 (int)offset, RExC_precomp, RExC_precomp + offset); \
357} STMT_END
b45f050a
JF
358
359/*
360 * Like Simple_vFAIL(), but accepts five arguments.
361 */
ccb2c380
MP
362#define Simple_vFAIL5(m, a1, a2, a3, a4) STMT_START { \
363 IV offset = RExC_parse - RExC_precomp; \
364 S_re_croak2(aTHX_ m, REPORT_LOCATION, a1, a2, a3, a4, \
365 (int)offset, RExC_precomp, RExC_precomp + offset); \
366} STMT_END
367
368
369#define vWARN(loc,m) STMT_START { \
370 IV offset = loc - RExC_precomp; \
371 Perl_warner(aTHX_ packWARN(WARN_REGEXP), "%s" REPORT_LOCATION, \
372 m, (int)offset, RExC_precomp, RExC_precomp + offset); \
373} STMT_END
374
375#define vWARNdep(loc,m) STMT_START { \
376 IV offset = loc - RExC_precomp; \
377 Perl_warner(aTHX_ packWARN2(WARN_DEPRECATED, WARN_REGEXP), \
378 "%s" REPORT_LOCATION, \
379 m, (int)offset, RExC_precomp, RExC_precomp + offset); \
380} STMT_END
381
382
383#define vWARN2(loc, m, a1) STMT_START { \
384 IV offset = loc - RExC_precomp; \
385 Perl_warner(aTHX_ packWARN(WARN_REGEXP), m REPORT_LOCATION, \
386 a1, (int)offset, RExC_precomp, RExC_precomp + offset); \
387} STMT_END
388
389#define vWARN3(loc, m, a1, a2) STMT_START { \
390 IV offset = loc - RExC_precomp; \
391 Perl_warner(aTHX_ packWARN(WARN_REGEXP), m REPORT_LOCATION, \
392 a1, a2, (int)offset, RExC_precomp, RExC_precomp + offset); \
393} STMT_END
394
395#define vWARN4(loc, m, a1, a2, a3) STMT_START { \
396 IV offset = loc - RExC_precomp; \
397 Perl_warner(aTHX_ packWARN(WARN_REGEXP), m REPORT_LOCATION, \
398 a1, a2, a3, (int)offset, RExC_precomp, RExC_precomp + offset); \
399} STMT_END
400
401#define vWARN5(loc, m, a1, a2, a3, a4) STMT_START { \
402 IV offset = loc - RExC_precomp; \
403 Perl_warner(aTHX_ packWARN(WARN_REGEXP), m REPORT_LOCATION, \
404 a1, a2, a3, a4, (int)offset, RExC_precomp, RExC_precomp + offset); \
405} STMT_END
9d1d55b5 406
8615cb43 407
cd439c50 408/* Allow for side effects in s */
ccb2c380
MP
409#define REGC(c,s) STMT_START { \
410 if (!SIZE_ONLY) *(s) = (c); else (void)(s); \
411} STMT_END
cd439c50 412
fac92740
MJD
413/* Macros for recording node offsets. 20001227 mjd@plover.com
414 * Nodes are numbered 1, 2, 3, 4. Node #n's position is recorded in
415 * element 2*n-1 of the array. Element #2n holds the byte length node #n.
416 * Element 0 holds the number n.
417 */
418
419#define MJD_OFFSET_DEBUG(x)
ccb2c380
MP
420/* #define MJD_OFFSET_DEBUG(x) Perl_warn_nocontext x */
421
422
423#define Set_Node_Offset_To_R(node,byte) STMT_START { \
424 if (! SIZE_ONLY) { \
425 MJD_OFFSET_DEBUG(("** (%d) offset of node %d is %d.\n", \
426 __LINE__, (node), (byte))); \
427 if((node) < 0) { \
428 Perl_croak(aTHX_ "value of node is %d in Offset macro", node); \
429 } else { \
430 RExC_offsets[2*(node)-1] = (byte); \
431 } \
432 } \
433} STMT_END
434
435#define Set_Node_Offset(node,byte) \
436 Set_Node_Offset_To_R((node)-RExC_emit_start, (byte)-RExC_start)
437#define Set_Cur_Node_Offset Set_Node_Offset(RExC_emit, RExC_parse)
438
439#define Set_Node_Length_To_R(node,len) STMT_START { \
440 if (! SIZE_ONLY) { \
441 MJD_OFFSET_DEBUG(("** (%d) size of node %d is %d.\n", \
442 __LINE__, (node), (len))); \
443 if((node) < 0) { \
444 Perl_croak(aTHX_ "value of node is %d in Length macro", node); \
445 } else { \
446 RExC_offsets[2*(node)] = (len); \
447 } \
448 } \
449} STMT_END
450
451#define Set_Node_Length(node,len) \
452 Set_Node_Length_To_R((node)-RExC_emit_start, len)
453#define Set_Cur_Node_Length(len) Set_Node_Length(RExC_emit, len)
454#define Set_Node_Cur_Length(node) \
455 Set_Node_Length(node, RExC_parse - parse_start)
fac92740
MJD
456
457/* Get offsets and lengths */
458#define Node_Offset(n) (RExC_offsets[2*((n)-RExC_emit_start)-1])
459#define Node_Length(n) (RExC_offsets[2*((n)-RExC_emit_start)])
460
acfe0abc 461static void clear_re(pTHX_ void *r);
4327152a 462
653099ff
GS
463/* Mark that we cannot extend a found fixed substring at this point.
464 Updata the longest found anchored substring and the longest found
465 floating substrings if needed. */
466
4327152a 467STATIC void
830247a4 468S_scan_commit(pTHX_ RExC_state_t *pRExC_state, scan_data_t *data)
c277df42 469{
a0ed51b3
LW
470 STRLEN l = CHR_SVLEN(data->last_found);
471 STRLEN old_l = CHR_SVLEN(*data->longest);
b81d288d 472
c277df42
IZ
473 if ((l >= old_l) && ((l > old_l) || (data->flags & SF_BEFORE_EOL))) {
474 sv_setsv(*data->longest, data->last_found);
475 if (*data->longest == data->longest_fixed) {
476 data->offset_fixed = l ? data->last_start_min : data->pos_min;
477 if (data->flags & SF_BEFORE_EOL)
b81d288d 478 data->flags
c277df42
IZ
479 |= ((data->flags & SF_BEFORE_EOL) << SF_FIX_SHIFT_EOL);
480 else
481 data->flags &= ~SF_FIX_BEFORE_EOL;
a0ed51b3
LW
482 }
483 else {
c277df42 484 data->offset_float_min = l ? data->last_start_min : data->pos_min;
b81d288d
AB
485 data->offset_float_max = (l
486 ? data->last_start_max
c277df42 487 : data->pos_min + data->pos_delta);
9051bda5
HS
488 if ((U32)data->offset_float_max > (U32)I32_MAX)
489 data->offset_float_max = I32_MAX;
c277df42 490 if (data->flags & SF_BEFORE_EOL)
b81d288d 491 data->flags
c277df42
IZ
492 |= ((data->flags & SF_BEFORE_EOL) << SF_FL_SHIFT_EOL);
493 else
494 data->flags &= ~SF_FL_BEFORE_EOL;
495 }
496 }
497 SvCUR_set(data->last_found, 0);
498 data->last_end = -1;
499 data->flags &= ~SF_BEFORE_EOL;
500}
501
653099ff
GS
502/* Can match anything (initialization) */
503STATIC void
830247a4 504S_cl_anything(pTHX_ RExC_state_t *pRExC_state, struct regnode_charclass_class *cl)
653099ff 505{
653099ff 506 ANYOF_CLASS_ZERO(cl);
f8bef550 507 ANYOF_BITMAP_SETALL(cl);
1aa99e6b 508 cl->flags = ANYOF_EOS|ANYOF_UNICODE_ALL;
653099ff
GS
509 if (LOC)
510 cl->flags |= ANYOF_LOCALE;
511}
512
513/* Can match anything (initialization) */
514STATIC int
515S_cl_is_anything(pTHX_ struct regnode_charclass_class *cl)
516{
517 int value;
518
aaa51d5e 519 for (value = 0; value <= ANYOF_MAX; value += 2)
653099ff
GS
520 if (ANYOF_CLASS_TEST(cl, value) && ANYOF_CLASS_TEST(cl, value + 1))
521 return 1;
1aa99e6b
IH
522 if (!(cl->flags & ANYOF_UNICODE_ALL))
523 return 0;
f8bef550
NC
524 if (!ANYOF_BITMAP_TESTALLSET(cl))
525 return 0;
653099ff
GS
526 return 1;
527}
528
529/* Can match anything (initialization) */
530STATIC void
830247a4 531S_cl_init(pTHX_ RExC_state_t *pRExC_state, struct regnode_charclass_class *cl)
653099ff 532{
8ecf7187 533 Zero(cl, 1, struct regnode_charclass_class);
653099ff 534 cl->type = ANYOF;
830247a4 535 cl_anything(pRExC_state, cl);
653099ff
GS
536}
537
538STATIC void
830247a4 539S_cl_init_zero(pTHX_ RExC_state_t *pRExC_state, struct regnode_charclass_class *cl)
653099ff 540{
8ecf7187 541 Zero(cl, 1, struct regnode_charclass_class);
653099ff 542 cl->type = ANYOF;
830247a4 543 cl_anything(pRExC_state, cl);
653099ff
GS
544 if (LOC)
545 cl->flags |= ANYOF_LOCALE;
546}
547
548/* 'And' a given class with another one. Can create false positives */
549/* We assume that cl is not inverted */
550STATIC void
551S_cl_and(pTHX_ struct regnode_charclass_class *cl,
552 struct regnode_charclass_class *and_with)
553{
653099ff
GS
554 if (!(and_with->flags & ANYOF_CLASS)
555 && !(cl->flags & ANYOF_CLASS)
556 && (and_with->flags & ANYOF_LOCALE) == (cl->flags & ANYOF_LOCALE)
557 && !(and_with->flags & ANYOF_FOLD)
558 && !(cl->flags & ANYOF_FOLD)) {
559 int i;
560
561 if (and_with->flags & ANYOF_INVERT)
562 for (i = 0; i < ANYOF_BITMAP_SIZE; i++)
563 cl->bitmap[i] &= ~and_with->bitmap[i];
564 else
565 for (i = 0; i < ANYOF_BITMAP_SIZE; i++)
566 cl->bitmap[i] &= and_with->bitmap[i];
567 } /* XXXX: logic is complicated otherwise, leave it along for a moment. */
568 if (!(and_with->flags & ANYOF_EOS))
569 cl->flags &= ~ANYOF_EOS;
1aa99e6b
IH
570
571 if (cl->flags & ANYOF_UNICODE_ALL && and_with->flags & ANYOF_UNICODE) {
572 cl->flags &= ~ANYOF_UNICODE_ALL;
573 cl->flags |= ANYOF_UNICODE;
574 ARG_SET(cl, ARG(and_with));
575 }
576 if (!(and_with->flags & ANYOF_UNICODE_ALL))
577 cl->flags &= ~ANYOF_UNICODE_ALL;
578 if (!(and_with->flags & (ANYOF_UNICODE|ANYOF_UNICODE_ALL)))
579 cl->flags &= ~ANYOF_UNICODE;
653099ff
GS
580}
581
582/* 'OR' a given class with another one. Can create false positives */
583/* We assume that cl is not inverted */
584STATIC void
830247a4 585S_cl_or(pTHX_ RExC_state_t *pRExC_state, struct regnode_charclass_class *cl, struct regnode_charclass_class *or_with)
653099ff 586{
653099ff
GS
587 if (or_with->flags & ANYOF_INVERT) {
588 /* We do not use
589 * (B1 | CL1) | (!B2 & !CL2) = (B1 | !B2 & !CL2) | (CL1 | (!B2 & !CL2))
590 * <= (B1 | !B2) | (CL1 | !CL2)
591 * which is wasteful if CL2 is small, but we ignore CL2:
592 * (B1 | CL1) | (!B2 & !CL2) <= (B1 | CL1) | !B2 = (B1 | !B2) | CL1
593 * XXXX Can we handle case-fold? Unclear:
594 * (OK1(i) | OK1(i')) | !(OK1(i) | OK1(i')) =
595 * (OK1(i) | OK1(i')) | (!OK1(i) & !OK1(i'))
596 */
597 if ( (or_with->flags & ANYOF_LOCALE) == (cl->flags & ANYOF_LOCALE)
598 && !(or_with->flags & ANYOF_FOLD)
599 && !(cl->flags & ANYOF_FOLD) ) {
600 int i;
601
602 for (i = 0; i < ANYOF_BITMAP_SIZE; i++)
603 cl->bitmap[i] |= ~or_with->bitmap[i];
604 } /* XXXX: logic is complicated otherwise */
605 else {
830247a4 606 cl_anything(pRExC_state, cl);
653099ff
GS
607 }
608 } else {
609 /* (B1 | CL1) | (B2 | CL2) = (B1 | B2) | (CL1 | CL2)) */
610 if ( (or_with->flags & ANYOF_LOCALE) == (cl->flags & ANYOF_LOCALE)
b81d288d 611 && (!(or_with->flags & ANYOF_FOLD)
653099ff
GS
612 || (cl->flags & ANYOF_FOLD)) ) {
613 int i;
614
615 /* OR char bitmap and class bitmap separately */
616 for (i = 0; i < ANYOF_BITMAP_SIZE; i++)
617 cl->bitmap[i] |= or_with->bitmap[i];
618 if (or_with->flags & ANYOF_CLASS) {
619 for (i = 0; i < ANYOF_CLASSBITMAP_SIZE; i++)
620 cl->classflags[i] |= or_with->classflags[i];
621 cl->flags |= ANYOF_CLASS;
622 }
623 }
624 else { /* XXXX: logic is complicated, leave it along for a moment. */
830247a4 625 cl_anything(pRExC_state, cl);
653099ff
GS
626 }
627 }
628 if (or_with->flags & ANYOF_EOS)
629 cl->flags |= ANYOF_EOS;
1aa99e6b
IH
630
631 if (cl->flags & ANYOF_UNICODE && or_with->flags & ANYOF_UNICODE &&
632 ARG(cl) != ARG(or_with)) {
633 cl->flags |= ANYOF_UNICODE_ALL;
634 cl->flags &= ~ANYOF_UNICODE;
635 }
636 if (or_with->flags & ANYOF_UNICODE_ALL) {
637 cl->flags |= ANYOF_UNICODE_ALL;
638 cl->flags &= ~ANYOF_UNICODE;
639 }
653099ff
GS
640}
641
5d1c421c
JH
642/*
643 * There are strange code-generation bugs caused on sparc64 by gcc-2.95.2.
644 * These need to be revisited when a newer toolchain becomes available.
645 */
646#if defined(__sparc64__) && defined(__GNUC__)
647# if __GNUC__ < 2 || (__GNUC__ == 2 && __GNUC_MINOR__ < 96)
648# undef SPARC64_GCC_WORKAROUND
649# define SPARC64_GCC_WORKAROUND 1
650# endif
651#endif
652
653099ff
GS
653/* REx optimizer. Converts nodes into quickier variants "in place".
654 Finds fixed substrings. */
655
c277df42
IZ
656/* Stops at toplevel WHILEM as well as at `last'. At end *scanp is set
657 to the position after last scanned or to NULL. */
658
76e3520e 659STATIC I32
830247a4 660S_study_chunk(pTHX_ RExC_state_t *pRExC_state, regnode **scanp, I32 *deltap, regnode *last, scan_data_t *data, U32 flags)
c277df42
IZ
661 /* scanp: Start here (read-write). */
662 /* deltap: Write maxlen-minlen here. */
663 /* last: Stop before this one. */
664{
665 I32 min = 0, pars = 0, code;
666 regnode *scan = *scanp, *next;
667 I32 delta = 0;
668 int is_inf = (flags & SCF_DO_SUBSTR) && (data->flags & SF_IS_INF);
aca2d497 669 int is_inf_internal = 0; /* The studied chunk is infinite */
c277df42
IZ
670 I32 is_par = OP(scan) == OPEN ? ARG(scan) : 0;
671 scan_data_t data_fake;
653099ff 672 struct regnode_charclass_class and_with; /* Valid if flags & SCF_DO_STCLASS_OR */
b81d288d 673
c277df42
IZ
674 while (scan && OP(scan) != END && scan < last) {
675 /* Peephole optimizer: */
676
22c35a8c 677 if (PL_regkind[(U8)OP(scan)] == EXACT) {
653099ff 678 /* Merge several consecutive EXACTish nodes into one. */
c277df42
IZ
679 regnode *n = regnext(scan);
680 U32 stringok = 1;
681#ifdef DEBUGGING
682 regnode *stop = scan;
b81d288d 683#endif
c277df42 684
cd439c50 685 next = scan + NODE_SZ_STR(scan);
c277df42
IZ
686 /* Skip NOTHING, merge EXACT*. */
687 while (n &&
b81d288d 688 ( PL_regkind[(U8)OP(n)] == NOTHING ||
c277df42
IZ
689 (stringok && (OP(n) == OP(scan))))
690 && NEXT_OFF(n)
691 && NEXT_OFF(scan) + NEXT_OFF(n) < I16_MAX) {
692 if (OP(n) == TAIL || n > next)
693 stringok = 0;
22c35a8c 694 if (PL_regkind[(U8)OP(n)] == NOTHING) {
c277df42
IZ
695 NEXT_OFF(scan) += NEXT_OFF(n);
696 next = n + NODE_STEP_REGNODE;
697#ifdef DEBUGGING
698 if (stringok)
699 stop = n;
b81d288d 700#endif
c277df42 701 n = regnext(n);
a0ed51b3 702 }
f49d4d0f 703 else if (stringok) {
cd439c50 704 int oldl = STR_LEN(scan);
c277df42 705 regnode *nnext = regnext(n);
f49d4d0f 706
b81d288d 707 if (oldl + STR_LEN(n) > U8_MAX)
c277df42
IZ
708 break;
709 NEXT_OFF(scan) += NEXT_OFF(n);
cd439c50
IZ
710 STR_LEN(scan) += STR_LEN(n);
711 next = n + NODE_SZ_STR(n);
c277df42 712 /* Now we can overwrite *n : */
f49d4d0f 713 Move(STRING(n), STRING(scan) + oldl, STR_LEN(n), char);
c277df42 714#ifdef DEBUGGING
f49d4d0f 715 stop = next - 1;
b81d288d 716#endif
c277df42
IZ
717 n = nnext;
718 }
719 }
61a36c01 720
d65e4eab 721 if (UTF && OP(scan) == EXACTF && STR_LEN(scan) >= 6) {
61a36c01
JH
722/*
723 Two problematic code points in Unicode casefolding of EXACT nodes:
724
725 U+0390 - GREEK SMALL LETTER IOTA WITH DIALYTIKA AND TONOS
726 U+03B0 - GREEK SMALL LETTER UPSILON WITH DIALYTIKA AND TONOS
727
728 which casefold to
729
730 Unicode UTF-8
731
732 U+03B9 U+0308 U+0301 0xCE 0xB9 0xCC 0x88 0xCC 0x81
733 U+03C5 U+0308 U+0301 0xCF 0x85 0xCC 0x88 0xCC 0x81
734
735 This means that in case-insensitive matching (or "loose matching",
736 as Unicode calls it), an EXACTF of length six (the UTF-8 encoded byte
737 length of the above casefolded versions) can match a target string
738 of length two (the byte length of UTF-8 encoded U+0390 or U+03B0).
739 This would rather mess up the minimum length computation.
740
741 What we'll do is to look for the tail four bytes, and then peek
742 at the preceding two bytes to see whether we need to decrease
743 the minimum length by four (six minus two).
744
745 Thanks to the design of UTF-8, there cannot be false matches:
746 A sequence of valid UTF-8 bytes cannot be a subsequence of
747 another valid sequence of UTF-8 bytes.
748
749*/
750 char *s0 = STRING(scan), *s, *t;
751 char *s1 = s0 + STR_LEN(scan) - 1, *s2 = s1 - 4;
752 char *t0 = "\xcc\x88\xcc\x81";
753 char *t1 = t0 + 3;
754
755 for (s = s0 + 2;
756 s < s2 && (t = ninstr(s, s1, t0, t1));
757 s = t + 4) {
758 if (((U8)t[-1] == 0xB9 && (U8)t[-2] == 0xCE) ||
759 ((U8)t[-1] == 0x85 && (U8)t[-2] == 0xCF))
760 min -= 4;
761 }
762 }
763
c277df42
IZ
764#ifdef DEBUGGING
765 /* Allow dumping */
cd439c50 766 n = scan + NODE_SZ_STR(scan);
c277df42 767 while (n <= stop) {
22c35a8c 768 if (PL_regkind[(U8)OP(n)] != NOTHING || OP(n) == NOTHING) {
c277df42
IZ
769 OP(n) = OPTIMIZED;
770 NEXT_OFF(n) = 0;
771 }
772 n++;
773 }
653099ff 774#endif
c277df42 775 }
653099ff
GS
776 /* Follow the next-chain of the current node and optimize
777 away all the NOTHINGs from it. */
c277df42 778 if (OP(scan) != CURLYX) {
048cfca1
GS
779 int max = (reg_off_by_arg[OP(scan)]
780 ? I32_MAX
781 /* I32 may be smaller than U16 on CRAYs! */
782 : (I32_MAX < U16_MAX ? I32_MAX : U16_MAX));
c277df42
IZ
783 int off = (reg_off_by_arg[OP(scan)] ? ARG(scan) : NEXT_OFF(scan));
784 int noff;
785 regnode *n = scan;
b81d288d 786
c277df42
IZ
787 /* Skip NOTHING and LONGJMP. */
788 while ((n = regnext(n))
22c35a8c 789 && ((PL_regkind[(U8)OP(n)] == NOTHING && (noff = NEXT_OFF(n)))
c277df42
IZ
790 || ((OP(n) == LONGJMP) && (noff = ARG(n))))
791 && off + noff < max)
792 off += noff;
793 if (reg_off_by_arg[OP(scan)])
794 ARG(scan) = off;
b81d288d 795 else
c277df42
IZ
796 NEXT_OFF(scan) = off;
797 }
653099ff
GS
798 /* The principal pseudo-switch. Cannot be a switch, since we
799 look into several different things. */
b81d288d 800 if (OP(scan) == BRANCH || OP(scan) == BRANCHJ
c277df42
IZ
801 || OP(scan) == IFTHEN || OP(scan) == SUSPEND) {
802 next = regnext(scan);
803 code = OP(scan);
b81d288d
AB
804
805 if (OP(next) == code || code == IFTHEN || code == SUSPEND) {
c277df42 806 I32 max1 = 0, min1 = I32_MAX, num = 0;
653099ff 807 struct regnode_charclass_class accum;
c277df42 808
653099ff 809 if (flags & SCF_DO_SUBSTR) /* XXXX Add !SUSPEND? */
830247a4 810 scan_commit(pRExC_state, data); /* Cannot merge strings after this. */
653099ff 811 if (flags & SCF_DO_STCLASS)
830247a4 812 cl_init_zero(pRExC_state, &accum);
c277df42 813 while (OP(scan) == code) {
830247a4 814 I32 deltanext, minnext, f = 0, fake;
653099ff 815 struct regnode_charclass_class this_class;
c277df42
IZ
816
817 num++;
818 data_fake.flags = 0;
b81d288d 819 if (data) {
2c2d71f5 820 data_fake.whilem_c = data->whilem_c;
cb434fcc
IZ
821 data_fake.last_closep = data->last_closep;
822 }
823 else
824 data_fake.last_closep = &fake;
c277df42
IZ
825 next = regnext(scan);
826 scan = NEXTOPER(scan);
827 if (code != BRANCH)
828 scan = NEXTOPER(scan);
653099ff 829 if (flags & SCF_DO_STCLASS) {
830247a4 830 cl_init(pRExC_state, &this_class);
653099ff
GS
831 data_fake.start_class = &this_class;
832 f = SCF_DO_STCLASS_AND;
b81d288d 833 }
e1901655
IZ
834 if (flags & SCF_WHILEM_VISITED_POS)
835 f |= SCF_WHILEM_VISITED_POS;
653099ff 836 /* we suppose the run is continuous, last=next...*/
830247a4
IZ
837 minnext = study_chunk(pRExC_state, &scan, &deltanext,
838 next, &data_fake, f);
b81d288d 839 if (min1 > minnext)
c277df42
IZ
840 min1 = minnext;
841 if (max1 < minnext + deltanext)
842 max1 = minnext + deltanext;
843 if (deltanext == I32_MAX)
aca2d497 844 is_inf = is_inf_internal = 1;
c277df42
IZ
845 scan = next;
846 if (data_fake.flags & (SF_HAS_PAR|SF_IN_PAR))
847 pars++;
405ff068 848 if (data && (data_fake.flags & SF_HAS_EVAL))
c277df42 849 data->flags |= SF_HAS_EVAL;
2c2d71f5
JH
850 if (data)
851 data->whilem_c = data_fake.whilem_c;
653099ff 852 if (flags & SCF_DO_STCLASS)
830247a4 853 cl_or(pRExC_state, &accum, &this_class);
b81d288d 854 if (code == SUSPEND)
c277df42
IZ
855 break;
856 }
857 if (code == IFTHEN && num < 2) /* Empty ELSE branch */
858 min1 = 0;
859 if (flags & SCF_DO_SUBSTR) {
860 data->pos_min += min1;
861 data->pos_delta += max1 - min1;
862 if (max1 != min1 || is_inf)
863 data->longest = &(data->longest_float);
864 }
865 min += min1;
866 delta += max1 - min1;
653099ff 867 if (flags & SCF_DO_STCLASS_OR) {
830247a4 868 cl_or(pRExC_state, data->start_class, &accum);
653099ff
GS
869 if (min1) {
870 cl_and(data->start_class, &and_with);
871 flags &= ~SCF_DO_STCLASS;
872 }
873 }
874 else if (flags & SCF_DO_STCLASS_AND) {
de0c8cb8
GS
875 if (min1) {
876 cl_and(data->start_class, &accum);
653099ff 877 flags &= ~SCF_DO_STCLASS;
de0c8cb8
GS
878 }
879 else {
b81d288d 880 /* Switch to OR mode: cache the old value of
de0c8cb8
GS
881 * data->start_class */
882 StructCopy(data->start_class, &and_with,
883 struct regnode_charclass_class);
884 flags &= ~SCF_DO_STCLASS_AND;
885 StructCopy(&accum, data->start_class,
886 struct regnode_charclass_class);
887 flags |= SCF_DO_STCLASS_OR;
888 data->start_class->flags |= ANYOF_EOS;
889 }
653099ff 890 }
a0ed51b3
LW
891 }
892 else if (code == BRANCHJ) /* single branch is optimized. */
c277df42
IZ
893 scan = NEXTOPER(NEXTOPER(scan));
894 else /* single branch is optimized. */
895 scan = NEXTOPER(scan);
896 continue;
a0ed51b3
LW
897 }
898 else if (OP(scan) == EXACT) {
cd439c50 899 I32 l = STR_LEN(scan);
1aa99e6b 900 UV uc = *((U8*)STRING(scan));
a0ed51b3 901 if (UTF) {
1aa99e6b
IH
902 U8 *s = (U8*)STRING(scan);
903 l = utf8_length(s, s + l);
9041c2e3 904 uc = utf8_to_uvchr(s, NULL);
a0ed51b3
LW
905 }
906 min += l;
c277df42 907 if (flags & SCF_DO_SUBSTR) { /* Update longest substr. */
c277df42
IZ
908 /* The code below prefers earlier match for fixed
909 offset, later match for variable offset. */
910 if (data->last_end == -1) { /* Update the start info. */
911 data->last_start_min = data->pos_min;
912 data->last_start_max = is_inf
b81d288d 913 ? I32_MAX : data->pos_min + data->pos_delta;
c277df42 914 }
cd439c50 915 sv_catpvn(data->last_found, STRING(scan), STR_LEN(scan));
33b8afdf
JH
916 if (UTF)
917 SvUTF8_on(data->last_found);
c277df42
IZ
918 data->last_end = data->pos_min + l;
919 data->pos_min += l; /* As in the first entry. */
920 data->flags &= ~SF_BEFORE_EOL;
921 }
653099ff
GS
922 if (flags & SCF_DO_STCLASS_AND) {
923 /* Check whether it is compatible with what we know already! */
924 int compat = 1;
925
1aa99e6b 926 if (uc >= 0x100 ||
516a5887 927 (!(data->start_class->flags & (ANYOF_CLASS | ANYOF_LOCALE))
1aa99e6b 928 && !ANYOF_BITMAP_TEST(data->start_class, uc)
653099ff 929 && (!(data->start_class->flags & ANYOF_FOLD)
1aa99e6b 930 || !ANYOF_BITMAP_TEST(data->start_class, PL_fold[uc])))
516a5887 931 )
653099ff
GS
932 compat = 0;
933 ANYOF_CLASS_ZERO(data->start_class);
934 ANYOF_BITMAP_ZERO(data->start_class);
935 if (compat)
1aa99e6b 936 ANYOF_BITMAP_SET(data->start_class, uc);
653099ff 937 data->start_class->flags &= ~ANYOF_EOS;
9b877dbb
IH
938 if (uc < 0x100)
939 data->start_class->flags &= ~ANYOF_UNICODE_ALL;
653099ff
GS
940 }
941 else if (flags & SCF_DO_STCLASS_OR) {
942 /* false positive possible if the class is case-folded */
1aa99e6b 943 if (uc < 0x100)
9b877dbb
IH
944 ANYOF_BITMAP_SET(data->start_class, uc);
945 else
946 data->start_class->flags |= ANYOF_UNICODE_ALL;
653099ff
GS
947 data->start_class->flags &= ~ANYOF_EOS;
948 cl_and(data->start_class, &and_with);
949 }
950 flags &= ~SCF_DO_STCLASS;
a0ed51b3 951 }
653099ff 952 else if (PL_regkind[(U8)OP(scan)] == EXACT) { /* But OP != EXACT! */
cd439c50 953 I32 l = STR_LEN(scan);
1aa99e6b 954 UV uc = *((U8*)STRING(scan));
653099ff
GS
955
956 /* Search for fixed substrings supports EXACT only. */
b81d288d 957 if (flags & SCF_DO_SUBSTR)
830247a4 958 scan_commit(pRExC_state, data);
a0ed51b3 959 if (UTF) {
1aa99e6b
IH
960 U8 *s = (U8 *)STRING(scan);
961 l = utf8_length(s, s + l);
9041c2e3 962 uc = utf8_to_uvchr(s, NULL);
a0ed51b3
LW
963 }
964 min += l;
c277df42 965 if (data && (flags & SCF_DO_SUBSTR))
a0ed51b3 966 data->pos_min += l;
653099ff
GS
967 if (flags & SCF_DO_STCLASS_AND) {
968 /* Check whether it is compatible with what we know already! */
969 int compat = 1;
970
1aa99e6b 971 if (uc >= 0x100 ||
516a5887 972 (!(data->start_class->flags & (ANYOF_CLASS | ANYOF_LOCALE))
1aa99e6b 973 && !ANYOF_BITMAP_TEST(data->start_class, uc)
516a5887 974 && !ANYOF_BITMAP_TEST(data->start_class, PL_fold[uc])))
653099ff
GS
975 compat = 0;
976 ANYOF_CLASS_ZERO(data->start_class);
977 ANYOF_BITMAP_ZERO(data->start_class);
978 if (compat) {
1aa99e6b 979 ANYOF_BITMAP_SET(data->start_class, uc);
653099ff
GS
980 data->start_class->flags &= ~ANYOF_EOS;
981 data->start_class->flags |= ANYOF_FOLD;
982 if (OP(scan) == EXACTFL)
983 data->start_class->flags |= ANYOF_LOCALE;
984 }
985 }
986 else if (flags & SCF_DO_STCLASS_OR) {
987 if (data->start_class->flags & ANYOF_FOLD) {
988 /* false positive possible if the class is case-folded.
989 Assume that the locale settings are the same... */
1aa99e6b
IH
990 if (uc < 0x100)
991 ANYOF_BITMAP_SET(data->start_class, uc);
653099ff
GS
992 data->start_class->flags &= ~ANYOF_EOS;
993 }
994 cl_and(data->start_class, &and_with);
995 }
996 flags &= ~SCF_DO_STCLASS;
a0ed51b3 997 }
4d61ec05 998 else if (strchr((char*)PL_varies,OP(scan))) {
9c5ffd7c 999 I32 mincount, maxcount, minnext, deltanext, fl = 0;
aa7a4b56 1000 I32 f = flags, pos_before = 0;
c277df42 1001 regnode *oscan = scan;
653099ff
GS
1002 struct regnode_charclass_class this_class;
1003 struct regnode_charclass_class *oclass = NULL;
727f22e3 1004 I32 next_is_eval = 0;
653099ff 1005
22c35a8c 1006 switch (PL_regkind[(U8)OP(scan)]) {
653099ff 1007 case WHILEM: /* End of (?:...)* . */
c277df42
IZ
1008 scan = NEXTOPER(scan);
1009 goto finish;
1010 case PLUS:
653099ff 1011 if (flags & (SCF_DO_SUBSTR | SCF_DO_STCLASS)) {
c277df42 1012 next = NEXTOPER(scan);
653099ff 1013 if (OP(next) == EXACT || (flags & SCF_DO_STCLASS)) {
b81d288d
AB
1014 mincount = 1;
1015 maxcount = REG_INFTY;
c277df42
IZ
1016 next = regnext(scan);
1017 scan = NEXTOPER(scan);
1018 goto do_curly;
1019 }
1020 }
1021 if (flags & SCF_DO_SUBSTR)
1022 data->pos_min++;
1023 min++;
1024 /* Fall through. */
1025 case STAR:
653099ff
GS
1026 if (flags & SCF_DO_STCLASS) {
1027 mincount = 0;
b81d288d 1028 maxcount = REG_INFTY;
653099ff
GS
1029 next = regnext(scan);
1030 scan = NEXTOPER(scan);
1031 goto do_curly;
1032 }
b81d288d 1033 is_inf = is_inf_internal = 1;
c277df42
IZ
1034 scan = regnext(scan);
1035 if (flags & SCF_DO_SUBSTR) {
830247a4 1036 scan_commit(pRExC_state, data); /* Cannot extend fixed substrings */
c277df42
IZ
1037 data->longest = &(data->longest_float);
1038 }
1039 goto optimize_curly_tail;
1040 case CURLY:
b81d288d 1041 mincount = ARG1(scan);
c277df42
IZ
1042 maxcount = ARG2(scan);
1043 next = regnext(scan);
cb434fcc
IZ
1044 if (OP(scan) == CURLYX) {
1045 I32 lp = (data ? *(data->last_closep) : 0);
1046
1047 scan->flags = ((lp <= U8_MAX) ? lp : U8_MAX);
1048 }
c277df42 1049 scan = NEXTOPER(scan) + EXTRA_STEP_2ARGS;
727f22e3 1050 next_is_eval = (OP(scan) == EVAL);
c277df42
IZ
1051 do_curly:
1052 if (flags & SCF_DO_SUBSTR) {
830247a4 1053 if (mincount == 0) scan_commit(pRExC_state,data); /* Cannot extend fixed substrings */
c277df42
IZ
1054 pos_before = data->pos_min;
1055 }
1056 if (data) {
1057 fl = data->flags;
1058 data->flags &= ~(SF_HAS_PAR|SF_IN_PAR|SF_HAS_EVAL);
1059 if (is_inf)
1060 data->flags |= SF_IS_INF;
1061 }
653099ff 1062 if (flags & SCF_DO_STCLASS) {
830247a4 1063 cl_init(pRExC_state, &this_class);
653099ff
GS
1064 oclass = data->start_class;
1065 data->start_class = &this_class;
1066 f |= SCF_DO_STCLASS_AND;
1067 f &= ~SCF_DO_STCLASS_OR;
1068 }
e1901655
IZ
1069 /* These are the cases when once a subexpression
1070 fails at a particular position, it cannot succeed
1071 even after backtracking at the enclosing scope.
b81d288d 1072
e1901655
IZ
1073 XXXX what if minimal match and we are at the
1074 initial run of {n,m}? */
1075 if ((mincount != maxcount - 1) && (maxcount != REG_INFTY))
1076 f &= ~SCF_WHILEM_VISITED_POS;
653099ff 1077
c277df42 1078 /* This will finish on WHILEM, setting scan, or on NULL: */
b81d288d
AB
1079 minnext = study_chunk(pRExC_state, &scan, &deltanext, last, data,
1080 mincount == 0
653099ff
GS
1081 ? (f & ~SCF_DO_SUBSTR) : f);
1082
1083 if (flags & SCF_DO_STCLASS)
1084 data->start_class = oclass;
1085 if (mincount == 0 || minnext == 0) {
1086 if (flags & SCF_DO_STCLASS_OR) {
830247a4 1087 cl_or(pRExC_state, data->start_class, &this_class);
653099ff
GS
1088 }
1089 else if (flags & SCF_DO_STCLASS_AND) {
b81d288d 1090 /* Switch to OR mode: cache the old value of
653099ff
GS
1091 * data->start_class */
1092 StructCopy(data->start_class, &and_with,
1093 struct regnode_charclass_class);
1094 flags &= ~SCF_DO_STCLASS_AND;
1095 StructCopy(&this_class, data->start_class,
1096 struct regnode_charclass_class);
1097 flags |= SCF_DO_STCLASS_OR;
1098 data->start_class->flags |= ANYOF_EOS;
1099 }
1100 } else { /* Non-zero len */
1101 if (flags & SCF_DO_STCLASS_OR) {
830247a4 1102 cl_or(pRExC_state, data->start_class, &this_class);
653099ff
GS
1103 cl_and(data->start_class, &and_with);
1104 }
1105 else if (flags & SCF_DO_STCLASS_AND)
1106 cl_and(data->start_class, &this_class);
1107 flags &= ~SCF_DO_STCLASS;
1108 }
c277df42
IZ
1109 if (!scan) /* It was not CURLYX, but CURLY. */
1110 scan = next;
84037bb0 1111 if (ckWARN(WARN_REGEXP)
727f22e3
JP
1112 /* ? quantifier ok, except for (?{ ... }) */
1113 && (next_is_eval || !(mincount == 0 && maxcount == 1))
84037bb0 1114 && (minnext == 0) && (deltanext == 0)
99799961 1115 && data && !(data->flags & (SF_HAS_PAR|SF_IN_PAR))
17feb5d5 1116 && maxcount <= REG_INFTY/3) /* Complement check for big count */
b45f050a 1117 {
830247a4 1118 vWARN(RExC_parse,
b45f050a
JF
1119 "Quantifier unexpected on zero-length expression");
1120 }
1121
c277df42 1122 min += minnext * mincount;
b81d288d 1123 is_inf_internal |= ((maxcount == REG_INFTY
155aba94
GS
1124 && (minnext + deltanext) > 0)
1125 || deltanext == I32_MAX);
aca2d497 1126 is_inf |= is_inf_internal;
c277df42
IZ
1127 delta += (minnext + deltanext) * maxcount - minnext * mincount;
1128
1129 /* Try powerful optimization CURLYX => CURLYN. */
b81d288d 1130 if ( OP(oscan) == CURLYX && data
c277df42
IZ
1131 && data->flags & SF_IN_PAR
1132 && !(data->flags & SF_HAS_EVAL)
1133 && !deltanext && minnext == 1 ) {
1134 /* Try to optimize to CURLYN. */
1135 regnode *nxt = NEXTOPER(oscan) + EXTRA_STEP_2ARGS;
497b47a8
JH
1136 regnode *nxt1 = nxt;
1137#ifdef DEBUGGING
1138 regnode *nxt2;
1139#endif
c277df42
IZ
1140
1141 /* Skip open. */
1142 nxt = regnext(nxt);
4d61ec05 1143 if (!strchr((char*)PL_simple,OP(nxt))
22c35a8c 1144 && !(PL_regkind[(U8)OP(nxt)] == EXACT
b81d288d 1145 && STR_LEN(nxt) == 1))
c277df42 1146 goto nogo;
497b47a8 1147#ifdef DEBUGGING
c277df42 1148 nxt2 = nxt;
497b47a8 1149#endif
c277df42 1150 nxt = regnext(nxt);
b81d288d 1151 if (OP(nxt) != CLOSE)
c277df42
IZ
1152 goto nogo;
1153 /* Now we know that nxt2 is the only contents: */
eb160463 1154 oscan->flags = (U8)ARG(nxt);
c277df42
IZ
1155 OP(oscan) = CURLYN;
1156 OP(nxt1) = NOTHING; /* was OPEN. */
1157#ifdef DEBUGGING
1158 OP(nxt1 + 1) = OPTIMIZED; /* was count. */
1159 NEXT_OFF(nxt1+ 1) = 0; /* just for consistancy. */
1160 NEXT_OFF(nxt2) = 0; /* just for consistancy with CURLY. */
1161 OP(nxt) = OPTIMIZED; /* was CLOSE. */
1162 OP(nxt + 1) = OPTIMIZED; /* was count. */
1163 NEXT_OFF(nxt+ 1) = 0; /* just for consistancy. */
b81d288d 1164#endif
c277df42 1165 }
c277df42
IZ
1166 nogo:
1167
1168 /* Try optimization CURLYX => CURLYM. */
b81d288d 1169 if ( OP(oscan) == CURLYX && data
c277df42 1170 && !(data->flags & SF_HAS_PAR)
c277df42
IZ
1171 && !(data->flags & SF_HAS_EVAL)
1172 && !deltanext ) {
1173 /* XXXX How to optimize if data == 0? */
1174 /* Optimize to a simpler form. */
1175 regnode *nxt = NEXTOPER(oscan) + EXTRA_STEP_2ARGS; /* OPEN */
1176 regnode *nxt2;
1177
1178 OP(oscan) = CURLYM;
1179 while ( (nxt2 = regnext(nxt)) /* skip over embedded stuff*/
b81d288d 1180 && (OP(nxt2) != WHILEM))
c277df42
IZ
1181 nxt = nxt2;
1182 OP(nxt2) = SUCCEED; /* Whas WHILEM */
c277df42
IZ
1183 /* Need to optimize away parenths. */
1184 if (data->flags & SF_IN_PAR) {
1185 /* Set the parenth number. */
1186 regnode *nxt1 = NEXTOPER(oscan) + EXTRA_STEP_2ARGS; /* OPEN*/
1187
b81d288d 1188 if (OP(nxt) != CLOSE)
b45f050a 1189 FAIL("Panic opt close");
eb160463 1190 oscan->flags = (U8)ARG(nxt);
c277df42
IZ
1191 OP(nxt1) = OPTIMIZED; /* was OPEN. */
1192 OP(nxt) = OPTIMIZED; /* was CLOSE. */
1193#ifdef DEBUGGING
1194 OP(nxt1 + 1) = OPTIMIZED; /* was count. */
1195 OP(nxt + 1) = OPTIMIZED; /* was count. */
1196 NEXT_OFF(nxt1 + 1) = 0; /* just for consistancy. */
1197 NEXT_OFF(nxt + 1) = 0; /* just for consistancy. */
b81d288d 1198#endif
c277df42
IZ
1199#if 0
1200 while ( nxt1 && (OP(nxt1) != WHILEM)) {
1201 regnode *nnxt = regnext(nxt1);
b81d288d 1202
c277df42
IZ
1203 if (nnxt == nxt) {
1204 if (reg_off_by_arg[OP(nxt1)])
1205 ARG_SET(nxt1, nxt2 - nxt1);
1206 else if (nxt2 - nxt1 < U16_MAX)
1207 NEXT_OFF(nxt1) = nxt2 - nxt1;
1208 else
1209 OP(nxt) = NOTHING; /* Cannot beautify */
1210 }
1211 nxt1 = nnxt;
1212 }
1213#endif
1214 /* Optimize again: */
b81d288d 1215 study_chunk(pRExC_state, &nxt1, &deltanext, nxt,
e1901655 1216 NULL, 0);
a0ed51b3
LW
1217 }
1218 else
c277df42 1219 oscan->flags = 0;
c277df42 1220 }
e1901655
IZ
1221 else if ((OP(oscan) == CURLYX)
1222 && (flags & SCF_WHILEM_VISITED_POS)
1223 /* See the comment on a similar expression above.
1224 However, this time it not a subexpression
1225 we care about, but the expression itself. */
1226 && (maxcount == REG_INFTY)
1227 && data && ++data->whilem_c < 16) {
1228 /* This stays as CURLYX, we can put the count/of pair. */
2c2d71f5
JH
1229 /* Find WHILEM (as in regexec.c) */
1230 regnode *nxt = oscan + NEXT_OFF(oscan);
1231
1232 if (OP(PREVOPER(nxt)) == NOTHING) /* LONGJMP */
1233 nxt += ARG(nxt);
eb160463
GS
1234 PREVOPER(nxt)->flags = (U8)(data->whilem_c
1235 | (RExC_whilem_seen << 4)); /* On WHILEM */
2c2d71f5 1236 }
b81d288d 1237 if (data && fl & (SF_HAS_PAR|SF_IN_PAR))
c277df42
IZ
1238 pars++;
1239 if (flags & SCF_DO_SUBSTR) {
1240 SV *last_str = Nullsv;
1241 int counted = mincount != 0;
1242
1243 if (data->last_end > 0 && mincount != 0) { /* Ends with a string. */
5d1c421c
JH
1244#if defined(SPARC64_GCC_WORKAROUND)
1245 I32 b = 0;
1246 STRLEN l = 0;
1247 char *s = NULL;
1248 I32 old = 0;
1249
1250 if (pos_before >= data->last_start_min)
1251 b = pos_before;
1252 else
1253 b = data->last_start_min;
1254
1255 l = 0;
1256 s = SvPV(data->last_found, l);
1257 old = b - data->last_start_min;
1258
1259#else
b81d288d 1260 I32 b = pos_before >= data->last_start_min
c277df42
IZ
1261 ? pos_before : data->last_start_min;
1262 STRLEN l;
1263 char *s = SvPV(data->last_found, l);
a0ed51b3 1264 I32 old = b - data->last_start_min;
5d1c421c 1265#endif
a0ed51b3
LW
1266
1267 if (UTF)
1268 old = utf8_hop((U8*)s, old) - (U8*)s;
c277df42 1269
a0ed51b3 1270 l -= old;
c277df42 1271 /* Get the added string: */
79cb57f6 1272 last_str = newSVpvn(s + old, l);
0e933229
IH
1273 if (UTF)
1274 SvUTF8_on(last_str);
c277df42
IZ
1275 if (deltanext == 0 && pos_before == b) {
1276 /* What was added is a constant string */
1277 if (mincount > 1) {
1278 SvGROW(last_str, (mincount * l) + 1);
b81d288d 1279 repeatcpy(SvPVX(last_str) + l,
c277df42
IZ
1280 SvPVX(last_str), l, mincount - 1);
1281 SvCUR(last_str) *= mincount;
1282 /* Add additional parts. */
b81d288d 1283 SvCUR_set(data->last_found,
c277df42
IZ
1284 SvCUR(data->last_found) - l);
1285 sv_catsv(data->last_found, last_str);
1286 data->last_end += l * (mincount - 1);
1287 }
2a8d9689
HS
1288 } else {
1289 /* start offset must point into the last copy */
1290 data->last_start_min += minnext * (mincount - 1);
c152dc43
HS
1291 data->last_start_max += is_inf ? I32_MAX
1292 : (maxcount - 1) * (minnext + data->pos_delta);
c277df42
IZ
1293 }
1294 }
1295 /* It is counted once already... */
1296 data->pos_min += minnext * (mincount - counted);
1297 data->pos_delta += - counted * deltanext +
1298 (minnext + deltanext) * maxcount - minnext * mincount;
1299 if (mincount != maxcount) {
653099ff
GS
1300 /* Cannot extend fixed substrings found inside
1301 the group. */
830247a4 1302 scan_commit(pRExC_state,data);
c277df42
IZ
1303 if (mincount && last_str) {
1304 sv_setsv(data->last_found, last_str);
1305 data->last_end = data->pos_min;
b81d288d 1306 data->last_start_min =
a0ed51b3 1307 data->pos_min - CHR_SVLEN(last_str);
b81d288d
AB
1308 data->last_start_max = is_inf
1309 ? I32_MAX
c277df42 1310 : data->pos_min + data->pos_delta
a0ed51b3 1311 - CHR_SVLEN(last_str);
c277df42
IZ
1312 }
1313 data->longest = &(data->longest_float);
1314 }
aca2d497 1315 SvREFCNT_dec(last_str);
c277df42 1316 }
405ff068 1317 if (data && (fl & SF_HAS_EVAL))
c277df42
IZ
1318 data->flags |= SF_HAS_EVAL;
1319 optimize_curly_tail:
c277df42 1320 if (OP(oscan) != CURLYX) {
22c35a8c 1321 while (PL_regkind[(U8)OP(next = regnext(oscan))] == NOTHING
c277df42
IZ
1322 && NEXT_OFF(next))
1323 NEXT_OFF(oscan) += NEXT_OFF(next);
1324 }
c277df42 1325 continue;
653099ff 1326 default: /* REF and CLUMP only? */
c277df42 1327 if (flags & SCF_DO_SUBSTR) {
830247a4 1328 scan_commit(pRExC_state,data); /* Cannot expect anything... */
c277df42
IZ
1329 data->longest = &(data->longest_float);
1330 }
aca2d497 1331 is_inf = is_inf_internal = 1;
653099ff 1332 if (flags & SCF_DO_STCLASS_OR)
830247a4 1333 cl_anything(pRExC_state, data->start_class);
653099ff 1334 flags &= ~SCF_DO_STCLASS;
c277df42
IZ
1335 break;
1336 }
a0ed51b3 1337 }
ffc61ed2 1338 else if (strchr((char*)PL_simple,OP(scan))) {
9c5ffd7c 1339 int value = 0;
653099ff 1340
c277df42 1341 if (flags & SCF_DO_SUBSTR) {
830247a4 1342 scan_commit(pRExC_state,data);
c277df42
IZ
1343 data->pos_min++;
1344 }
1345 min++;
653099ff
GS
1346 if (flags & SCF_DO_STCLASS) {
1347 data->start_class->flags &= ~ANYOF_EOS; /* No match on empty */
1348
1349 /* Some of the logic below assumes that switching
1350 locale on will only add false positives. */
1351 switch (PL_regkind[(U8)OP(scan)]) {
653099ff 1352 case SANY:
653099ff
GS
1353 default:
1354 do_default:
1355 /* Perl_croak(aTHX_ "panic: unexpected simple REx opcode %d", OP(scan)); */
1356 if (flags & SCF_DO_STCLASS_OR) /* Allow everything */
830247a4 1357 cl_anything(pRExC_state, data->start_class);
653099ff
GS
1358 break;
1359 case REG_ANY:
1360 if (OP(scan) == SANY)
1361 goto do_default;
1362 if (flags & SCF_DO_STCLASS_OR) { /* Everything but \n */
1363 value = (ANYOF_BITMAP_TEST(data->start_class,'\n')
1364 || (data->start_class->flags & ANYOF_CLASS));
830247a4 1365 cl_anything(pRExC_state, data->start_class);
653099ff
GS
1366 }
1367 if (flags & SCF_DO_STCLASS_AND || !value)
1368 ANYOF_BITMAP_CLEAR(data->start_class,'\n');
1369 break;
1370 case ANYOF:
1371 if (flags & SCF_DO_STCLASS_AND)
1372 cl_and(data->start_class,
1373 (struct regnode_charclass_class*)scan);
1374 else
830247a4 1375 cl_or(pRExC_state, data->start_class,
653099ff
GS
1376 (struct regnode_charclass_class*)scan);
1377 break;
1378 case ALNUM:
1379 if (flags & SCF_DO_STCLASS_AND) {
1380 if (!(data->start_class->flags & ANYOF_LOCALE)) {
1381 ANYOF_CLASS_CLEAR(data->start_class,ANYOF_NALNUM);
1382 for (value = 0; value < 256; value++)
1383 if (!isALNUM(value))
1384 ANYOF_BITMAP_CLEAR(data->start_class, value);
1385 }
1386 }
1387 else {
1388 if (data->start_class->flags & ANYOF_LOCALE)
1389 ANYOF_CLASS_SET(data->start_class,ANYOF_ALNUM);
1390 else {
1391 for (value = 0; value < 256; value++)
1392 if (isALNUM(value))
b81d288d 1393 ANYOF_BITMAP_SET(data->start_class, value);
653099ff
GS
1394 }
1395 }
1396 break;
1397 case ALNUML:
1398 if (flags & SCF_DO_STCLASS_AND) {
1399 if (data->start_class->flags & ANYOF_LOCALE)
1400 ANYOF_CLASS_CLEAR(data->start_class,ANYOF_NALNUM);
1401 }
1402 else {
1403 ANYOF_CLASS_SET(data->start_class,ANYOF_ALNUM);
1404 data->start_class->flags |= ANYOF_LOCALE;
1405 }
1406 break;
1407 case NALNUM:
1408 if (flags & SCF_DO_STCLASS_AND) {
1409 if (!(data->start_class->flags & ANYOF_LOCALE)) {
1410 ANYOF_CLASS_CLEAR(data->start_class,ANYOF_ALNUM);
1411 for (value = 0; value < 256; value++)
1412 if (isALNUM(value))
1413 ANYOF_BITMAP_CLEAR(data->start_class, value);
1414 }
1415 }
1416 else {
1417 if (data->start_class->flags & ANYOF_LOCALE)
1418 ANYOF_CLASS_SET(data->start_class,ANYOF_NALNUM);
1419 else {
1420 for (value = 0; value < 256; value++)
1421 if (!isALNUM(value))
b81d288d 1422 ANYOF_BITMAP_SET(data->start_class, value);
653099ff
GS
1423 }
1424 }
1425 break;
1426 case NALNUML:
1427 if (flags & SCF_DO_STCLASS_AND) {
1428 if (data->start_class->flags & ANYOF_LOCALE)
1429 ANYOF_CLASS_CLEAR(data->start_class,ANYOF_ALNUM);
1430 }
1431 else {
1432 data->start_class->flags |= ANYOF_LOCALE;
1433 ANYOF_CLASS_SET(data->start_class,ANYOF_NALNUM);
1434 }
1435 break;
1436 case SPACE:
1437 if (flags & SCF_DO_STCLASS_AND) {
1438 if (!(data->start_class->flags & ANYOF_LOCALE)) {
1439 ANYOF_CLASS_CLEAR(data->start_class,ANYOF_NSPACE);
1440 for (value = 0; value < 256; value++)
1441 if (!isSPACE(value))
1442 ANYOF_BITMAP_CLEAR(data->start_class, value);
1443 }
1444 }
1445 else {
1446 if (data->start_class->flags & ANYOF_LOCALE)
1447 ANYOF_CLASS_SET(data->start_class,ANYOF_SPACE);
1448 else {
1449 for (value = 0; value < 256; value++)
1450 if (isSPACE(value))
b81d288d 1451 ANYOF_BITMAP_SET(data->start_class, value);
653099ff
GS
1452 }
1453 }
1454 break;
1455 case SPACEL:
1456 if (flags & SCF_DO_STCLASS_AND) {
1457 if (data->start_class->flags & ANYOF_LOCALE)
1458 ANYOF_CLASS_CLEAR(data->start_class,ANYOF_NSPACE);
1459 }
1460 else {
1461 data->start_class->flags |= ANYOF_LOCALE;
1462 ANYOF_CLASS_SET(data->start_class,ANYOF_SPACE);
1463 }
1464 break;
1465 case NSPACE:
1466 if (flags & SCF_DO_STCLASS_AND) {
1467 if (!(data->start_class->flags & ANYOF_LOCALE)) {
1468 ANYOF_CLASS_CLEAR(data->start_class,ANYOF_SPACE);
1469 for (value = 0; value < 256; value++)
1470 if (isSPACE(value))
1471 ANYOF_BITMAP_CLEAR(data->start_class, value);
1472 }
1473 }
1474 else {
1475 if (data->start_class->flags & ANYOF_LOCALE)
1476 ANYOF_CLASS_SET(data->start_class,ANYOF_NSPACE);
1477 else {
1478 for (value = 0; value < 256; value++)
1479 if (!isSPACE(value))
b81d288d 1480 ANYOF_BITMAP_SET(data->start_class, value);
653099ff
GS
1481 }
1482 }
1483 break;
1484 case NSPACEL:
1485 if (flags & SCF_DO_STCLASS_AND) {
1486 if (data->start_class->flags & ANYOF_LOCALE) {
1487 ANYOF_CLASS_CLEAR(data->start_class,ANYOF_SPACE);
1488 for (value = 0; value < 256; value++)
1489 if (!isSPACE(value))
1490 ANYOF_BITMAP_CLEAR(data->start_class, value);
1491 }
1492 }
1493 else {
1494 data->start_class->flags |= ANYOF_LOCALE;
1495 ANYOF_CLASS_SET(data->start_class,ANYOF_NSPACE);
1496 }
1497 break;
1498 case DIGIT:
1499 if (flags & SCF_DO_STCLASS_AND) {
1500 ANYOF_CLASS_CLEAR(data->start_class,ANYOF_NDIGIT);
1501 for (value = 0; value < 256; value++)
1502 if (!isDIGIT(value))
1503 ANYOF_BITMAP_CLEAR(data->start_class, value);
1504 }
1505 else {
1506 if (data->start_class->flags & ANYOF_LOCALE)
1507 ANYOF_CLASS_SET(data->start_class,ANYOF_DIGIT);
1508 else {
1509 for (value = 0; value < 256; value++)
1510 if (isDIGIT(value))
b81d288d 1511 ANYOF_BITMAP_SET(data->start_class, value);
653099ff
GS
1512 }
1513 }
1514 break;
1515 case NDIGIT:
1516 if (flags & SCF_DO_STCLASS_AND) {
1517 ANYOF_CLASS_CLEAR(data->start_class,ANYOF_DIGIT);
1518 for (value = 0; value < 256; value++)
1519 if (isDIGIT(value))
1520 ANYOF_BITMAP_CLEAR(data->start_class, value);
1521 }
1522 else {
1523 if (data->start_class->flags & ANYOF_LOCALE)
1524 ANYOF_CLASS_SET(data->start_class,ANYOF_NDIGIT);
1525 else {
1526 for (value = 0; value < 256; value++)
1527 if (!isDIGIT(value))
b81d288d 1528 ANYOF_BITMAP_SET(data->start_class, value);
653099ff
GS
1529 }
1530 }
1531 break;
1532 }
1533 if (flags & SCF_DO_STCLASS_OR)
1534 cl_and(data->start_class, &and_with);
1535 flags &= ~SCF_DO_STCLASS;
1536 }
a0ed51b3 1537 }
22c35a8c 1538 else if (PL_regkind[(U8)OP(scan)] == EOL && flags & SCF_DO_SUBSTR) {
c277df42
IZ
1539 data->flags |= (OP(scan) == MEOL
1540 ? SF_BEFORE_MEOL
1541 : SF_BEFORE_SEOL);
a0ed51b3 1542 }
653099ff
GS
1543 else if ( PL_regkind[(U8)OP(scan)] == BRANCHJ
1544 /* Lookbehind, or need to calculate parens/evals/stclass: */
1545 && (scan->flags || data || (flags & SCF_DO_STCLASS))
c277df42 1546 && (OP(scan) == IFMATCH || OP(scan) == UNLESSM)) {
653099ff 1547 /* Lookahead/lookbehind */
cb434fcc 1548 I32 deltanext, minnext, fake = 0;
c277df42 1549 regnode *nscan;
653099ff
GS
1550 struct regnode_charclass_class intrnl;
1551 int f = 0;
c277df42
IZ
1552
1553 data_fake.flags = 0;
b81d288d 1554 if (data) {
2c2d71f5 1555 data_fake.whilem_c = data->whilem_c;
cb434fcc
IZ
1556 data_fake.last_closep = data->last_closep;
1557 }
1558 else
1559 data_fake.last_closep = &fake;
653099ff
GS
1560 if ( flags & SCF_DO_STCLASS && !scan->flags
1561 && OP(scan) == IFMATCH ) { /* Lookahead */
830247a4 1562 cl_init(pRExC_state, &intrnl);
653099ff 1563 data_fake.start_class = &intrnl;
e1901655 1564 f |= SCF_DO_STCLASS_AND;
653099ff 1565 }
e1901655
IZ
1566 if (flags & SCF_WHILEM_VISITED_POS)
1567 f |= SCF_WHILEM_VISITED_POS;
c277df42
IZ
1568 next = regnext(scan);
1569 nscan = NEXTOPER(NEXTOPER(scan));
830247a4 1570 minnext = study_chunk(pRExC_state, &nscan, &deltanext, last, &data_fake, f);
c277df42
IZ
1571 if (scan->flags) {
1572 if (deltanext) {
9baa0206 1573 vFAIL("Variable length lookbehind not implemented");
a0ed51b3
LW
1574 }
1575 else if (minnext > U8_MAX) {
9baa0206 1576 vFAIL2("Lookbehind longer than %"UVuf" not implemented", (UV)U8_MAX);
c277df42 1577 }
eb160463 1578 scan->flags = (U8)minnext;
c277df42
IZ
1579 }
1580 if (data && data_fake.flags & (SF_HAS_PAR|SF_IN_PAR))
1581 pars++;
405ff068 1582 if (data && (data_fake.flags & SF_HAS_EVAL))
c277df42 1583 data->flags |= SF_HAS_EVAL;
2c2d71f5
JH
1584 if (data)
1585 data->whilem_c = data_fake.whilem_c;
e1901655 1586 if (f & SCF_DO_STCLASS_AND) {
653099ff
GS
1587 int was = (data->start_class->flags & ANYOF_EOS);
1588
1589 cl_and(data->start_class, &intrnl);
1590 if (was)
1591 data->start_class->flags |= ANYOF_EOS;
1592 }
a0ed51b3
LW
1593 }
1594 else if (OP(scan) == OPEN) {
c277df42 1595 pars++;
a0ed51b3 1596 }
cb434fcc 1597 else if (OP(scan) == CLOSE) {
eb160463 1598 if ((I32)ARG(scan) == is_par) {
cb434fcc 1599 next = regnext(scan);
c277df42 1600
cb434fcc
IZ
1601 if ( next && (OP(next) != WHILEM) && next < last)
1602 is_par = 0; /* Disable optimization */
1603 }
1604 if (data)
1605 *(data->last_closep) = ARG(scan);
a0ed51b3
LW
1606 }
1607 else if (OP(scan) == EVAL) {
c277df42
IZ
1608 if (data)
1609 data->flags |= SF_HAS_EVAL;
1610 }
96776eda 1611 else if (OP(scan) == LOGICAL && scan->flags == 2) { /* Embedded follows */
0f5d15d6 1612 if (flags & SCF_DO_SUBSTR) {
830247a4 1613 scan_commit(pRExC_state,data);
0f5d15d6
IZ
1614 data->longest = &(data->longest_float);
1615 }
1616 is_inf = is_inf_internal = 1;
653099ff 1617 if (flags & SCF_DO_STCLASS_OR) /* Allow everything */
830247a4 1618 cl_anything(pRExC_state, data->start_class);
96776eda 1619 flags &= ~SCF_DO_STCLASS;
0f5d15d6 1620 }
c277df42
IZ
1621 /* Else: zero-length, ignore. */
1622 scan = regnext(scan);
1623 }
1624
1625 finish:
1626 *scanp = scan;
aca2d497 1627 *deltap = is_inf_internal ? I32_MAX : delta;
b81d288d 1628 if (flags & SCF_DO_SUBSTR && is_inf)
c277df42
IZ
1629 data->pos_delta = I32_MAX - data->pos_min;
1630 if (is_par > U8_MAX)
1631 is_par = 0;
1632 if (is_par && pars==1 && data) {
1633 data->flags |= SF_IN_PAR;
1634 data->flags &= ~SF_HAS_PAR;
a0ed51b3
LW
1635 }
1636 else if (pars && data) {
c277df42
IZ
1637 data->flags |= SF_HAS_PAR;
1638 data->flags &= ~SF_IN_PAR;
1639 }
653099ff
GS
1640 if (flags & SCF_DO_STCLASS_OR)
1641 cl_and(data->start_class, &and_with);
c277df42
IZ
1642 return min;
1643}
1644
76e3520e 1645STATIC I32
830247a4 1646S_add_data(pTHX_ RExC_state_t *pRExC_state, I32 n, char *s)
c277df42 1647{
830247a4 1648 if (RExC_rx->data) {
b81d288d
AB
1649 Renewc(RExC_rx->data,
1650 sizeof(*RExC_rx->data) + sizeof(void*) * (RExC_rx->data->count + n - 1),
c277df42 1651 char, struct reg_data);
830247a4
IZ
1652 Renew(RExC_rx->data->what, RExC_rx->data->count + n, U8);
1653 RExC_rx->data->count += n;
a0ed51b3
LW
1654 }
1655 else {
830247a4 1656 Newc(1207, RExC_rx->data, sizeof(*RExC_rx->data) + sizeof(void*) * (n - 1),
c277df42 1657 char, struct reg_data);
830247a4
IZ
1658 New(1208, RExC_rx->data->what, n, U8);
1659 RExC_rx->data->count = n;
c277df42 1660 }
830247a4
IZ
1661 Copy(s, RExC_rx->data->what + RExC_rx->data->count - n, n, U8);
1662 return RExC_rx->data->count - n;
c277df42
IZ
1663}
1664
d88dccdf 1665void
864dbfa3 1666Perl_reginitcolors(pTHX)
d88dccdf 1667{
d88dccdf
IZ
1668 int i = 0;
1669 char *s = PerlEnv_getenv("PERL_RE_COLORS");
b81d288d 1670
d88dccdf
IZ
1671 if (s) {
1672 PL_colors[0] = s = savepv(s);
1673 while (++i < 6) {
1674 s = strchr(s, '\t');
1675 if (s) {
1676 *s = '\0';
1677 PL_colors[i] = ++s;
1678 }
1679 else
c712d376 1680 PL_colors[i] = s = "";
d88dccdf
IZ
1681 }
1682 } else {
b81d288d 1683 while (i < 6)
d88dccdf
IZ
1684 PL_colors[i++] = "";
1685 }
1686 PL_colorset = 1;
1687}
1688
8615cb43 1689
a687059c 1690/*
e50aee73 1691 - pregcomp - compile a regular expression into internal code
a687059c
LW
1692 *
1693 * We can't allocate space until we know how big the compiled form will be,
1694 * but we can't compile it (and thus know how big it is) until we've got a
1695 * place to put the code. So we cheat: we compile it twice, once with code
1696 * generation turned off and size counting turned on, and once "for real".
1697 * This also means that we don't allocate space until we are sure that the
1698 * thing really will compile successfully, and we never have to move the
1699 * code and thus invalidate pointers into it. (Note that it has to be in
1700 * one piece because free() must be able to free it all.) [NB: not true in perl]
1701 *
1702 * Beware that the optimization-preparation code in here knows about some
1703 * of the structure of the compiled regexp. [I'll say.]
1704 */
1705regexp *
864dbfa3 1706Perl_pregcomp(pTHX_ char *exp, char *xend, PMOP *pm)
a687059c 1707{
a0d0e21e 1708 register regexp *r;
c277df42 1709 regnode *scan;
c277df42 1710 regnode *first;
a0d0e21e 1711 I32 flags;
a0d0e21e
LW
1712 I32 minlen = 0;
1713 I32 sawplus = 0;
1714 I32 sawopen = 0;
2c2d71f5 1715 scan_data_t data;
830247a4
IZ
1716 RExC_state_t RExC_state;
1717 RExC_state_t *pRExC_state = &RExC_state;
a0d0e21e
LW
1718
1719 if (exp == NULL)
c277df42 1720 FAIL("NULL regexp argument");
a0d0e21e 1721
a5961de5 1722 RExC_utf8 = pm->op_pmdynflags & PMdf_CMP_UTF8;
a0ed51b3 1723
5cfc7842 1724 RExC_precomp = exp;
a5961de5
JH
1725 DEBUG_r({
1726 if (!PL_colorset) reginitcolors();
1727 PerlIO_printf(Perl_debug_log, "%sCompiling REx%s `%s%*s%s'\n",
1728 PL_colors[4],PL_colors[5],PL_colors[0],
1729 (int)(xend - exp), RExC_precomp, PL_colors[1]);
1730 });
e2509266 1731 RExC_flags = pm->op_pmflags;
830247a4 1732 RExC_sawback = 0;
bbce6d69 1733
830247a4
IZ
1734 RExC_seen = 0;
1735 RExC_seen_zerolen = *exp == '^' ? -1 : 0;
1736 RExC_seen_evals = 0;
1737 RExC_extralen = 0;
c277df42 1738
bbce6d69 1739 /* First pass: determine size, legality. */
830247a4 1740 RExC_parse = exp;
fac92740 1741 RExC_start = exp;
830247a4
IZ
1742 RExC_end = xend;
1743 RExC_naughty = 0;
1744 RExC_npar = 1;
1745 RExC_size = 0L;
1746 RExC_emit = &PL_regdummy;
1747 RExC_whilem_seen = 0;
85ddcde9
JH
1748#if 0 /* REGC() is (currently) a NOP at the first pass.
1749 * Clever compilers notice this and complain. --jhi */
830247a4 1750 REGC((U8)REG_MAGIC, (char*)RExC_emit);
85ddcde9 1751#endif
830247a4 1752 if (reg(pRExC_state, 0, &flags) == NULL) {
830247a4 1753 RExC_precomp = Nullch;
a0d0e21e
LW
1754 return(NULL);
1755 }
830247a4 1756 DEBUG_r(PerlIO_printf(Perl_debug_log, "size %"IVdf" ", (IV)RExC_size));
c277df42 1757
c277df42
IZ
1758 /* Small enough for pointer-storage convention?
1759 If extralen==0, this means that we will not need long jumps. */
830247a4
IZ
1760 if (RExC_size >= 0x10000L && RExC_extralen)
1761 RExC_size += RExC_extralen;
c277df42 1762 else
830247a4
IZ
1763 RExC_extralen = 0;
1764 if (RExC_whilem_seen > 15)
1765 RExC_whilem_seen = 15;
a0d0e21e 1766
bbce6d69 1767 /* Allocate space and initialize. */
830247a4 1768 Newc(1001, r, sizeof(regexp) + (unsigned)RExC_size * sizeof(regnode),
c277df42 1769 char, regexp);
a0d0e21e 1770 if (r == NULL)
b45f050a
JF
1771 FAIL("Regexp out of space");
1772
0f79a09d
GS
1773#ifdef DEBUGGING
1774 /* avoid reading uninitialized memory in DEBUGGING code in study_chunk() */
830247a4 1775 Zero(r, sizeof(regexp) + (unsigned)RExC_size * sizeof(regnode), char);
0f79a09d 1776#endif
c277df42 1777 r->refcnt = 1;
bbce6d69 1778 r->prelen = xend - exp;
5cfc7842 1779 r->precomp = savepvn(RExC_precomp, r->prelen);
cf93c79d 1780 r->subbeg = NULL;
ed252734
NC
1781#ifdef PERL_COPY_ON_WRITE
1782 r->saved_copy = Nullsv;
1783#endif
cf93c79d 1784 r->reganch = pm->op_pmflags & PMf_COMPILETIME;
830247a4 1785 r->nparens = RExC_npar - 1; /* set early to validate backrefs */
4327152a
IZ
1786
1787 r->substrs = 0; /* Useful during FAIL. */
1788 r->startp = 0; /* Useful during FAIL. */
1789 r->endp = 0; /* Useful during FAIL. */
1790
fac92740
MJD
1791 Newz(1304, r->offsets, 2*RExC_size+1, U32); /* MJD 20001228 */
1792 if (r->offsets) {
1793 r->offsets[0] = RExC_size;
1794 }
1795 DEBUG_r(PerlIO_printf(Perl_debug_log,
392fbf5d 1796 "%s %"UVuf" bytes for offset annotations.\n",
fac92740 1797 r->offsets ? "Got" : "Couldn't get",
392fbf5d 1798 (UV)((2*RExC_size+1) * sizeof(U32))));
fac92740 1799
830247a4 1800 RExC_rx = r;
bbce6d69 1801
1802 /* Second pass: emit code. */
e2509266 1803 RExC_flags = pm->op_pmflags; /* don't let top level (?i) bleed */
830247a4
IZ
1804 RExC_parse = exp;
1805 RExC_end = xend;
1806 RExC_naughty = 0;
1807 RExC_npar = 1;
fac92740 1808 RExC_emit_start = r->program;
830247a4 1809 RExC_emit = r->program;
2cd61cdb 1810 /* Store the count of eval-groups for security checks: */
eb160463 1811 RExC_emit->next_off = (U16)((RExC_seen_evals > U16_MAX) ? U16_MAX : RExC_seen_evals);
830247a4 1812 REGC((U8)REG_MAGIC, (char*) RExC_emit++);
c277df42 1813 r->data = 0;
830247a4 1814 if (reg(pRExC_state, 0, &flags) == NULL)
a0d0e21e
LW
1815 return(NULL);
1816
1817 /* Dig out information for optimizations. */
cf93c79d 1818 r->reganch = pm->op_pmflags & PMf_COMPILETIME; /* Again? */
e2509266 1819 pm->op_pmflags = RExC_flags;
a0ed51b3 1820 if (UTF)
5ff6fc6d 1821 r->reganch |= ROPT_UTF8; /* Unicode in it? */
c277df42 1822 r->regstclass = NULL;
830247a4 1823 if (RExC_naughty >= 10) /* Probably an expensive pattern. */
a0ed51b3 1824 r->reganch |= ROPT_NAUGHTY;
c277df42 1825 scan = r->program + 1; /* First BRANCH. */
2779dcf1
IZ
1826
1827 /* XXXX To minimize changes to RE engine we always allocate
1828 3-units-long substrs field. */
1829 Newz(1004, r->substrs, 1, struct reg_substr_data);
1830
2c2d71f5 1831 StructCopy(&zero_scan_data, &data, scan_data_t);
653099ff 1832 /* XXXX Should not we check for something else? Usually it is OPEN1... */
c277df42 1833 if (OP(scan) != BRANCH) { /* Only one top-level choice. */
c277df42 1834 I32 fake;
c5254dd6 1835 STRLEN longest_float_length, longest_fixed_length;
653099ff
GS
1836 struct regnode_charclass_class ch_class;
1837 int stclass_flag;
cb434fcc 1838 I32 last_close = 0;
a0d0e21e
LW
1839
1840 first = scan;
c277df42 1841 /* Skip introductions and multiplicators >= 1. */
a0d0e21e 1842 while ((OP(first) == OPEN && (sawopen = 1)) ||
653099ff 1843 /* An OR of *one* alternative - should not happen now. */
a0d0e21e
LW
1844 (OP(first) == BRANCH && OP(regnext(first)) != BRANCH) ||
1845 (OP(first) == PLUS) ||
1846 (OP(first) == MINMOD) ||
653099ff 1847 /* An {n,m} with n>0 */
22c35a8c 1848 (PL_regkind[(U8)OP(first)] == CURLY && ARG1(first) > 0) ) {
a0d0e21e
LW
1849 if (OP(first) == PLUS)
1850 sawplus = 1;
1851 else
1852 first += regarglen[(U8)OP(first)];
1853 first = NEXTOPER(first);
a687059c
LW
1854 }
1855
a0d0e21e
LW
1856 /* Starting-point info. */
1857 again:
653099ff 1858 if (PL_regkind[(U8)OP(first)] == EXACT) {
1aa99e6b
IH
1859 if (OP(first) == EXACT)
1860 ; /* Empty, get anchored substr later. */
1861 else if ((OP(first) == EXACTF || OP(first) == EXACTFL))
b3c9acc1
IZ
1862 r->regstclass = first;
1863 }
653099ff 1864 else if (strchr((char*)PL_simple,OP(first)))
a0d0e21e 1865 r->regstclass = first;
22c35a8c
GS
1866 else if (PL_regkind[(U8)OP(first)] == BOUND ||
1867 PL_regkind[(U8)OP(first)] == NBOUND)
a0d0e21e 1868 r->regstclass = first;
22c35a8c 1869 else if (PL_regkind[(U8)OP(first)] == BOL) {
cad2e5aa
JH
1870 r->reganch |= (OP(first) == MBOL
1871 ? ROPT_ANCH_MBOL
1872 : (OP(first) == SBOL
1873 ? ROPT_ANCH_SBOL
1874 : ROPT_ANCH_BOL));
a0d0e21e 1875 first = NEXTOPER(first);
774d564b 1876 goto again;
1877 }
1878 else if (OP(first) == GPOS) {
1879 r->reganch |= ROPT_ANCH_GPOS;
1880 first = NEXTOPER(first);
1881 goto again;
a0d0e21e 1882 }
e09294f4 1883 else if (!sawopen && (OP(first) == STAR &&
22c35a8c 1884 PL_regkind[(U8)OP(NEXTOPER(first))] == REG_ANY) &&
a0d0e21e
LW
1885 !(r->reganch & ROPT_ANCH) )
1886 {
1887 /* turn .* into ^.* with an implied $*=1 */
cad2e5aa
JH
1888 int type = OP(NEXTOPER(first));
1889
ffc61ed2 1890 if (type == REG_ANY)
cad2e5aa
JH
1891 type = ROPT_ANCH_MBOL;
1892 else
1893 type = ROPT_ANCH_SBOL;
1894
1895 r->reganch |= type | ROPT_IMPLICIT;
a0d0e21e 1896 first = NEXTOPER(first);
774d564b 1897 goto again;
a0d0e21e 1898 }
b81d288d 1899 if (sawplus && (!sawopen || !RExC_sawback)
830247a4 1900 && !(RExC_seen & REG_SEEN_EVAL)) /* May examine pos and $& */
cad2e5aa
JH
1901 /* x+ must match at the 1st pos of run of x's */
1902 r->reganch |= ROPT_SKIP;
a0d0e21e 1903
c277df42 1904 /* Scan is after the zeroth branch, first is atomic matcher. */
b81d288d 1905 DEBUG_r(PerlIO_printf(Perl_debug_log, "first at %"IVdf"\n",
b900a521 1906 (IV)(first - scan + 1)));
a0d0e21e
LW
1907 /*
1908 * If there's something expensive in the r.e., find the
1909 * longest literal string that must appear and make it the
1910 * regmust. Resolve ties in favor of later strings, since
1911 * the regstart check works with the beginning of the r.e.
1912 * and avoiding duplication strengthens checking. Not a
1913 * strong reason, but sufficient in the absence of others.
1914 * [Now we resolve ties in favor of the earlier string if
c277df42 1915 * it happens that c_offset_min has been invalidated, since the
a0d0e21e
LW
1916 * earlier string may buy us something the later one won't.]
1917 */
a0d0e21e 1918 minlen = 0;
a687059c 1919
79cb57f6
GS
1920 data.longest_fixed = newSVpvn("",0);
1921 data.longest_float = newSVpvn("",0);
1922 data.last_found = newSVpvn("",0);
c277df42
IZ
1923 data.longest = &(data.longest_fixed);
1924 first = scan;
653099ff 1925 if (!r->regstclass) {
830247a4 1926 cl_init(pRExC_state, &ch_class);
653099ff
GS
1927 data.start_class = &ch_class;
1928 stclass_flag = SCF_DO_STCLASS_AND;
1929 } else /* XXXX Check for BOUND? */
1930 stclass_flag = 0;
cb434fcc 1931 data.last_closep = &last_close;
653099ff 1932
830247a4 1933 minlen = study_chunk(pRExC_state, &first, &fake, scan + RExC_size, /* Up to end */
e1901655 1934 &data, SCF_DO_SUBSTR | SCF_WHILEM_VISITED_POS | stclass_flag);
830247a4 1935 if ( RExC_npar == 1 && data.longest == &(data.longest_fixed)
b81d288d 1936 && data.last_start_min == 0 && data.last_end > 0
830247a4
IZ
1937 && !RExC_seen_zerolen
1938 && (!(RExC_seen & REG_SEEN_GPOS) || (r->reganch & ROPT_ANCH_GPOS)))
c277df42 1939 r->reganch |= ROPT_CHECK_ALL;
830247a4 1940 scan_commit(pRExC_state, &data);
c277df42
IZ
1941 SvREFCNT_dec(data.last_found);
1942
a0ed51b3 1943 longest_float_length = CHR_SVLEN(data.longest_float);
c5254dd6 1944 if (longest_float_length
c277df42
IZ
1945 || (data.flags & SF_FL_BEFORE_EOL
1946 && (!(data.flags & SF_FL_BEFORE_MEOL)
e2509266 1947 || (RExC_flags & PMf_MULTILINE)))) {
cf93c79d
IZ
1948 int t;
1949
a0ed51b3 1950 if (SvCUR(data.longest_fixed) /* ok to leave SvCUR */
aca2d497
IZ
1951 && data.offset_fixed == data.offset_float_min
1952 && SvCUR(data.longest_fixed) == SvCUR(data.longest_float))
1953 goto remove_float; /* As in (a)+. */
1954
33b8afdf
JH
1955 if (SvUTF8(data.longest_float)) {
1956 r->float_utf8 = data.longest_float;
1957 r->float_substr = Nullsv;
1958 } else {
1959 r->float_substr = data.longest_float;
1960 r->float_utf8 = Nullsv;
1961 }
c277df42
IZ
1962 r->float_min_offset = data.offset_float_min;
1963 r->float_max_offset = data.offset_float_max;
cf93c79d
IZ
1964 t = (data.flags & SF_FL_BEFORE_EOL /* Can't have SEOL and MULTI */
1965 && (!(data.flags & SF_FL_BEFORE_MEOL)
e2509266 1966 || (RExC_flags & PMf_MULTILINE)));
33b8afdf 1967 fbm_compile(data.longest_float, t ? FBMcf_TAIL : 0);
a0ed51b3
LW
1968 }
1969 else {
aca2d497 1970 remove_float:
33b8afdf 1971 r->float_substr = r->float_utf8 = Nullsv;
c277df42 1972 SvREFCNT_dec(data.longest_float);
c5254dd6 1973 longest_float_length = 0;
a0d0e21e 1974 }
c277df42 1975
a0ed51b3 1976 longest_fixed_length = CHR_SVLEN(data.longest_fixed);
c5254dd6 1977 if (longest_fixed_length
c277df42
IZ
1978 || (data.flags & SF_FIX_BEFORE_EOL /* Cannot have SEOL and MULTI */
1979 && (!(data.flags & SF_FIX_BEFORE_MEOL)
e2509266 1980 || (RExC_flags & PMf_MULTILINE)))) {
cf93c79d
IZ
1981 int t;
1982
33b8afdf
JH
1983 if (SvUTF8(data.longest_fixed)) {
1984 r->anchored_utf8 = data.longest_fixed;
1985 r->anchored_substr = Nullsv;
1986 } else {
1987 r->anchored_substr = data.longest_fixed;
1988 r->anchored_utf8 = Nullsv;
1989 }
c277df42 1990 r->anchored_offset = data.offset_fixed;
cf93c79d
IZ
1991 t = (data.flags & SF_FIX_BEFORE_EOL /* Can't have SEOL and MULTI */
1992 && (!(data.flags & SF_FIX_BEFORE_MEOL)
e2509266 1993 || (RExC_flags & PMf_MULTILINE)));
33b8afdf 1994 fbm_compile(data.longest_fixed, t ? FBMcf_TAIL : 0);
a0ed51b3
LW
1995 }
1996 else {
33b8afdf 1997 r->anchored_substr = r->anchored_utf8 = Nullsv;
c277df42 1998 SvREFCNT_dec(data.longest_fixed);
c5254dd6 1999 longest_fixed_length = 0;
a0d0e21e 2000 }
b81d288d 2001 if (r->regstclass
ffc61ed2 2002 && (OP(r->regstclass) == REG_ANY || OP(r->regstclass) == SANY))
653099ff 2003 r->regstclass = NULL;
33b8afdf
JH
2004 if ((!(r->anchored_substr || r->anchored_utf8) || r->anchored_offset)
2005 && stclass_flag
653099ff 2006 && !(data.start_class->flags & ANYOF_EOS)
eb160463
GS
2007 && !cl_is_anything(data.start_class))
2008 {
830247a4 2009 I32 n = add_data(pRExC_state, 1, "f");
653099ff 2010
b81d288d 2011 New(1006, RExC_rx->data->data[n], 1,
653099ff
GS
2012 struct regnode_charclass_class);
2013 StructCopy(data.start_class,
830247a4 2014 (struct regnode_charclass_class*)RExC_rx->data->data[n],
653099ff 2015 struct regnode_charclass_class);
830247a4 2016 r->regstclass = (regnode*)RExC_rx->data->data[n];
653099ff 2017 r->reganch &= ~ROPT_SKIP; /* Used in find_byclass(). */
ffc61ed2 2018 PL_regdata = r->data; /* for regprop() */
9c5ffd7c
JH
2019 DEBUG_r({ SV *sv = sv_newmortal();
2020 regprop(sv, (regnode*)data.start_class);
2021 PerlIO_printf(Perl_debug_log,
2022 "synthetic stclass `%s'.\n",
2023 SvPVX(sv));});
653099ff 2024 }
c277df42
IZ
2025
2026 /* A temporary algorithm prefers floated substr to fixed one to dig more info. */
c5254dd6 2027 if (longest_fixed_length > longest_float_length) {
c277df42 2028 r->check_substr = r->anchored_substr;
33b8afdf 2029 r->check_utf8 = r->anchored_utf8;
c277df42
IZ
2030 r->check_offset_min = r->check_offset_max = r->anchored_offset;
2031 if (r->reganch & ROPT_ANCH_SINGLE)
2032 r->reganch |= ROPT_NOSCAN;
a0ed51b3
LW
2033 }
2034 else {
c277df42 2035 r->check_substr = r->float_substr;
33b8afdf 2036 r->check_utf8 = r->float_utf8;
c277df42
IZ
2037 r->check_offset_min = data.offset_float_min;
2038 r->check_offset_max = data.offset_float_max;
a0d0e21e 2039 }
30382c73
IZ
2040 /* XXXX Currently intuiting is not compatible with ANCH_GPOS.
2041 This should be changed ASAP! */
33b8afdf 2042 if ((r->check_substr || r->check_utf8) && !(r->reganch & ROPT_ANCH_GPOS)) {
cad2e5aa 2043 r->reganch |= RE_USE_INTUIT;
33b8afdf 2044 if (SvTAIL(r->check_substr ? r->check_substr : r->check_utf8))
cad2e5aa
JH
2045 r->reganch |= RE_INTUIT_TAIL;
2046 }
a0ed51b3
LW
2047 }
2048 else {
c277df42
IZ
2049 /* Several toplevels. Best we can is to set minlen. */
2050 I32 fake;
653099ff 2051 struct regnode_charclass_class ch_class;
cb434fcc 2052 I32 last_close = 0;
c277df42
IZ
2053
2054 DEBUG_r(PerlIO_printf(Perl_debug_log, "\n"));
2055 scan = r->program + 1;
830247a4 2056 cl_init(pRExC_state, &ch_class);
653099ff 2057 data.start_class = &ch_class;
cb434fcc 2058 data.last_closep = &last_close;
e1901655 2059 minlen = study_chunk(pRExC_state, &scan, &fake, scan + RExC_size, &data, SCF_DO_STCLASS_AND|SCF_WHILEM_VISITED_POS);
33b8afdf
JH
2060 r->check_substr = r->check_utf8 = r->anchored_substr = r->anchored_utf8
2061 = r->float_substr = r->float_utf8 = Nullsv;
653099ff 2062 if (!(data.start_class->flags & ANYOF_EOS)
eb160463
GS
2063 && !cl_is_anything(data.start_class))
2064 {
830247a4 2065 I32 n = add_data(pRExC_state, 1, "f");
653099ff 2066
b81d288d 2067 New(1006, RExC_rx->data->data[n], 1,
653099ff
GS
2068 struct regnode_charclass_class);
2069 StructCopy(data.start_class,
830247a4 2070 (struct regnode_charclass_class*)RExC_rx->data->data[n],
653099ff 2071 struct regnode_charclass_class);
830247a4 2072 r->regstclass = (regnode*)RExC_rx->data->data[n];
653099ff 2073 r->reganch &= ~ROPT_SKIP; /* Used in find_byclass(). */
9c5ffd7c
JH
2074 DEBUG_r({ SV* sv = sv_newmortal();
2075 regprop(sv, (regnode*)data.start_class);
2076 PerlIO_printf(Perl_debug_log,
2077 "synthetic stclass `%s'.\n",
2078 SvPVX(sv));});
653099ff 2079 }
a0d0e21e
LW
2080 }
2081
a0d0e21e 2082 r->minlen = minlen;
b81d288d 2083 if (RExC_seen & REG_SEEN_GPOS)
c277df42 2084 r->reganch |= ROPT_GPOS_SEEN;
830247a4 2085 if (RExC_seen & REG_SEEN_LOOKBEHIND)
c277df42 2086 r->reganch |= ROPT_LOOKBEHIND_SEEN;
830247a4 2087 if (RExC_seen & REG_SEEN_EVAL)
ce862d02 2088 r->reganch |= ROPT_EVAL_SEEN;
f33976b4
DB
2089 if (RExC_seen & REG_SEEN_CANY)
2090 r->reganch |= ROPT_CANY_SEEN;
830247a4
IZ
2091 Newz(1002, r->startp, RExC_npar, I32);
2092 Newz(1002, r->endp, RExC_npar, I32);
ffc61ed2 2093 PL_regdata = r->data; /* for regprop() */
a0d0e21e
LW
2094 DEBUG_r(regdump(r));
2095 return(r);
a687059c
LW
2096}
2097
2098/*
2099 - reg - regular expression, i.e. main body or parenthesized thing
2100 *
2101 * Caller must absorb opening parenthesis.
2102 *
2103 * Combining parenthesis handling with the base level of regular expression
2104 * is a trifle forced, but the need to tie the tails of the branches to what
2105 * follows makes it hard to avoid.
2106 */
76e3520e 2107STATIC regnode *
830247a4 2108S_reg(pTHX_ RExC_state_t *pRExC_state, I32 paren, I32 *flagp)
c277df42 2109 /* paren: Parenthesized? 0=top, 1=(, inside: changed to letter. */
a687059c 2110{
c277df42
IZ
2111 register regnode *ret; /* Will be the head of the group. */
2112 register regnode *br;
2113 register regnode *lastbr;
2114 register regnode *ender = 0;
a0d0e21e 2115 register I32 parno = 0;
e2509266 2116 I32 flags, oregflags = RExC_flags, have_branch = 0, open = 0;
9d1d55b5
JP
2117
2118 /* for (?g), (?gc), and (?o) warnings; warning
2119 about (?c) will warn about (?g) -- japhy */
2120
2121 I32 wastedflags = 0x00,
2122 wasted_o = 0x01,
2123 wasted_g = 0x02,
2124 wasted_gc = 0x02 | 0x04,
2125 wasted_c = 0x04;
2126
fac92740 2127 char * parse_start = RExC_parse; /* MJD */
830247a4 2128 char *oregcomp_parse = RExC_parse;
c277df42 2129 char c;
a0d0e21e 2130
821b33a5 2131 *flagp = 0; /* Tentatively. */
a0d0e21e 2132
9d1d55b5 2133
a0d0e21e
LW
2134 /* Make an OPEN node, if parenthesized. */
2135 if (paren) {
fac92740 2136 if (*RExC_parse == '?') { /* (?...) */
2b36a5a0
JH
2137 U32 posflags = 0, negflags = 0;
2138 U32 *flagsp = &posflags;
0f5d15d6 2139 int logical = 0;
830247a4 2140 char *seqstart = RExC_parse;
ca9dfc88 2141
830247a4
IZ
2142 RExC_parse++;
2143 paren = *RExC_parse++;
c277df42 2144 ret = NULL; /* For look-ahead/behind. */
a0d0e21e 2145 switch (paren) {
fac92740 2146 case '<': /* (?<...) */
830247a4 2147 RExC_seen |= REG_SEEN_LOOKBEHIND;
b81d288d 2148 if (*RExC_parse == '!')
c277df42 2149 paren = ',';
b81d288d 2150 if (*RExC_parse != '=' && *RExC_parse != '!')
c277df42 2151 goto unknown;
830247a4 2152 RExC_parse++;
fac92740
MJD
2153 case '=': /* (?=...) */
2154 case '!': /* (?!...) */
830247a4 2155 RExC_seen_zerolen++;
fac92740
MJD
2156 case ':': /* (?:...) */
2157 case '>': /* (?>...) */
a0d0e21e 2158 break;
fac92740
MJD
2159 case '$': /* (?$...) */
2160 case '@': /* (?@...) */
8615cb43 2161 vFAIL2("Sequence (?%c...) not implemented", (int)paren);
a0d0e21e 2162 break;
fac92740 2163 case '#': /* (?#...) */
830247a4
IZ
2164 while (*RExC_parse && *RExC_parse != ')')
2165 RExC_parse++;
2166 if (*RExC_parse != ')')
c277df42 2167 FAIL("Sequence (?#... not terminated");
830247a4 2168 nextchar(pRExC_state);
a0d0e21e
LW
2169 *flagp = TRYAGAIN;
2170 return NULL;
fac92740 2171 case 'p': /* (?p...) */
9014280d 2172 if (SIZE_ONLY && ckWARN2(WARN_DEPRECATED, WARN_REGEXP))
f9373011 2173 vWARNdep(RExC_parse, "(?p{}) is deprecated - use (??{})");
8c8ad484 2174 /* FALL THROUGH*/
fac92740 2175 case '?': /* (??...) */
0f5d15d6 2176 logical = 1;
438a3801
YST
2177 if (*RExC_parse != '{')
2178 goto unknown;
830247a4 2179 paren = *RExC_parse++;
0f5d15d6 2180 /* FALL THROUGH */
fac92740 2181 case '{': /* (?{...}) */
c277df42 2182 {
c277df42
IZ
2183 I32 count = 1, n = 0;
2184 char c;
830247a4 2185 char *s = RExC_parse;
c277df42
IZ
2186 SV *sv;
2187 OP_4tree *sop, *rop;
2188
830247a4
IZ
2189 RExC_seen_zerolen++;
2190 RExC_seen |= REG_SEEN_EVAL;
2191 while (count && (c = *RExC_parse)) {
2192 if (c == '\\' && RExC_parse[1])
2193 RExC_parse++;
b81d288d 2194 else if (c == '{')
c277df42 2195 count++;
b81d288d 2196 else if (c == '}')
c277df42 2197 count--;
830247a4 2198 RExC_parse++;
c277df42 2199 }
830247a4 2200 if (*RExC_parse != ')')
b45f050a 2201 {
b81d288d 2202 RExC_parse = s;
b45f050a
JF
2203 vFAIL("Sequence (?{...}) not terminated or not {}-balanced");
2204 }
c277df42 2205 if (!SIZE_ONLY) {
f3548bdc 2206 PAD *pad;
b81d288d
AB
2207
2208 if (RExC_parse - 1 - s)
830247a4 2209 sv = newSVpvn(s, RExC_parse - 1 - s);
c277df42 2210 else
79cb57f6 2211 sv = newSVpvn("", 0);
c277df42 2212
569233ed
SB
2213 ENTER;
2214 Perl_save_re_context(aTHX);
f3548bdc 2215 rop = sv_compile_2op(sv, &sop, "re", &pad);
9b978d73
DM
2216 sop->op_private |= OPpREFCOUNTED;
2217 /* re_dup will OpREFCNT_inc */
2218 OpREFCNT_set(sop, 1);
569233ed 2219 LEAVE;
c277df42 2220
830247a4
IZ
2221 n = add_data(pRExC_state, 3, "nop");
2222 RExC_rx->data->data[n] = (void*)rop;
2223 RExC_rx->data->data[n+1] = (void*)sop;
f3548bdc 2224 RExC_rx->data->data[n+2] = (void*)pad;
c277df42 2225 SvREFCNT_dec(sv);
a0ed51b3 2226 }
e24b16f9 2227 else { /* First pass */
830247a4 2228 if (PL_reginterp_cnt < ++RExC_seen_evals
e24b16f9 2229 && PL_curcop != &PL_compiling)
2cd61cdb
IZ
2230 /* No compiled RE interpolated, has runtime
2231 components ===> unsafe. */
2232 FAIL("Eval-group not allowed at runtime, use re 'eval'");
5b61d3f7 2233 if (PL_tainting && PL_tainted)
cc6b7395 2234 FAIL("Eval-group in insecure regular expression");
c277df42
IZ
2235 }
2236
830247a4 2237 nextchar(pRExC_state);
0f5d15d6 2238 if (logical) {
830247a4 2239 ret = reg_node(pRExC_state, LOGICAL);
0f5d15d6
IZ
2240 if (!SIZE_ONLY)
2241 ret->flags = 2;
830247a4 2242 regtail(pRExC_state, ret, reganode(pRExC_state, EVAL, n));
fac92740 2243 /* deal with the length of this later - MJD */
0f5d15d6
IZ
2244 return ret;
2245 }
ccb2c380
MP
2246 ret = reganode(pRExC_state, EVAL, n);
2247 Set_Node_Length(ret, RExC_parse - parse_start + 1);
2248 Set_Node_Offset(ret, parse_start);
2249 return ret;
c277df42 2250 }
fac92740 2251 case '(': /* (?(?{...})...) and (?(?=...)...) */
c277df42 2252 {
fac92740 2253 if (RExC_parse[0] == '?') { /* (?(?...)) */
b81d288d
AB
2254 if (RExC_parse[1] == '=' || RExC_parse[1] == '!'
2255 || RExC_parse[1] == '<'
830247a4 2256 || RExC_parse[1] == '{') { /* Lookahead or eval. */
c277df42
IZ
2257 I32 flag;
2258
830247a4 2259 ret = reg_node(pRExC_state, LOGICAL);
0f5d15d6
IZ
2260 if (!SIZE_ONLY)
2261 ret->flags = 1;
830247a4 2262 regtail(pRExC_state, ret, reg(pRExC_state, 1, &flag));
c277df42 2263 goto insert_if;
b81d288d 2264 }
a0ed51b3 2265 }
830247a4 2266 else if (RExC_parse[0] >= '1' && RExC_parse[0] <= '9' ) {
fac92740 2267 /* (?(1)...) */
830247a4 2268 parno = atoi(RExC_parse++);
c277df42 2269
830247a4
IZ
2270 while (isDIGIT(*RExC_parse))
2271 RExC_parse++;
fac92740
MJD
2272 ret = reganode(pRExC_state, GROUPP, parno);
2273
830247a4 2274 if ((c = *nextchar(pRExC_state)) != ')')
b45f050a 2275 vFAIL("Switch condition not recognized");
c277df42 2276 insert_if:
830247a4
IZ
2277 regtail(pRExC_state, ret, reganode(pRExC_state, IFTHEN, 0));
2278 br = regbranch(pRExC_state, &flags, 1);
c277df42 2279 if (br == NULL)
830247a4 2280 br = reganode(pRExC_state, LONGJMP, 0);
c277df42 2281 else
830247a4
IZ
2282 regtail(pRExC_state, br, reganode(pRExC_state, LONGJMP, 0));
2283 c = *nextchar(pRExC_state);
d1b80229
IZ
2284 if (flags&HASWIDTH)
2285 *flagp |= HASWIDTH;
c277df42 2286 if (c == '|') {
830247a4
IZ
2287 lastbr = reganode(pRExC_state, IFTHEN, 0); /* Fake one for optimizer. */
2288 regbranch(pRExC_state, &flags, 1);
2289 regtail(pRExC_state, ret, lastbr);
d1b80229
IZ
2290 if (flags&HASWIDTH)
2291 *flagp |= HASWIDTH;
830247a4 2292 c = *nextchar(pRExC_state);
a0ed51b3
LW
2293 }
2294 else
c277df42
IZ
2295 lastbr = NULL;
2296 if (c != ')')
8615cb43 2297 vFAIL("Switch (?(condition)... contains too many branches");
830247a4
IZ
2298 ender = reg_node(pRExC_state, TAIL);
2299 regtail(pRExC_state, br, ender);
c277df42 2300 if (lastbr) {
830247a4
IZ
2301 regtail(pRExC_state, lastbr, ender);
2302 regtail(pRExC_state, NEXTOPER(NEXTOPER(lastbr)), ender);
a0ed51b3
LW
2303 }
2304 else
830247a4 2305 regtail(pRExC_state, ret, ender);
c277df42 2306 return ret;
a0ed51b3
LW
2307 }
2308 else {
830247a4 2309 vFAIL2("Unknown switch condition (?(%.2s", RExC_parse);
c277df42
IZ
2310 }
2311 }
1b1626e4 2312 case 0:
830247a4 2313 RExC_parse--; /* for vFAIL to print correctly */
8615cb43 2314 vFAIL("Sequence (? incomplete");
1b1626e4 2315 break;
a0d0e21e 2316 default:
830247a4 2317 --RExC_parse;
fac92740 2318 parse_flags: /* (?i) */
830247a4 2319 while (*RExC_parse && strchr("iogcmsx", *RExC_parse)) {
9d1d55b5
JP
2320 /* (?g), (?gc) and (?o) are useless here
2321 and must be globally applied -- japhy */
2322
2323 if (*RExC_parse == 'o' || *RExC_parse == 'g') {
2324 if (SIZE_ONLY && ckWARN(WARN_REGEXP)) {
2325 I32 wflagbit = *RExC_parse == 'o' ? wasted_o : wasted_g;
2326 if (! (wastedflags & wflagbit) ) {
2327 wastedflags |= wflagbit;
2328 vWARN5(
2329 RExC_parse + 1,
2330 "Useless (%s%c) - %suse /%c modifier",
2331 flagsp == &negflags ? "?-" : "?",
2332 *RExC_parse,
2333 flagsp == &negflags ? "don't " : "",
2334 *RExC_parse
2335 );
2336 }
2337 }
2338 }
2339 else if (*RExC_parse == 'c') {
2340 if (SIZE_ONLY && ckWARN(WARN_REGEXP)) {
2341 if (! (wastedflags & wasted_c) ) {
2342 wastedflags |= wasted_gc;
2343 vWARN3(
2344 RExC_parse + 1,
2345 "Useless (%sc) - %suse /gc modifier",
2346 flagsp == &negflags ? "?-" : "?",
2347 flagsp == &negflags ? "don't " : ""
2348 );
2349 }
2350 }
2351 }
2352 else { pmflag(flagsp, *RExC_parse); }
2353
830247a4 2354 ++RExC_parse;
ca9dfc88 2355 }
830247a4 2356 if (*RExC_parse == '-') {
ca9dfc88 2357 flagsp = &negflags;
9d1d55b5 2358 wastedflags = 0; /* reset so (?g-c) warns twice */
830247a4 2359 ++RExC_parse;
ca9dfc88 2360 goto parse_flags;
48c036b1 2361 }
e2509266
JH
2362 RExC_flags |= posflags;
2363 RExC_flags &= ~negflags;
830247a4
IZ
2364 if (*RExC_parse == ':') {
2365 RExC_parse++;
ca9dfc88
IZ
2366 paren = ':';
2367 break;
2368 }
c277df42 2369 unknown:
830247a4
IZ
2370 if (*RExC_parse != ')') {
2371 RExC_parse++;
2372 vFAIL3("Sequence (%.*s...) not recognized", RExC_parse-seqstart, seqstart);
b45f050a 2373 }
830247a4 2374 nextchar(pRExC_state);
a0d0e21e
LW
2375 *flagp = TRYAGAIN;
2376 return NULL;
2377 }
2378 }
fac92740 2379 else { /* (...) */
830247a4
IZ
2380 parno = RExC_npar;
2381 RExC_npar++;
2382 ret = reganode(pRExC_state, OPEN, parno);
fac92740
MJD
2383 Set_Node_Length(ret, 1); /* MJD */
2384 Set_Node_Offset(ret, RExC_parse); /* MJD */
c277df42 2385 open = 1;
a0d0e21e 2386 }
a0ed51b3 2387 }
fac92740 2388 else /* ! paren */
a0d0e21e
LW
2389 ret = NULL;
2390
2391 /* Pick up the branches, linking them together. */
fac92740 2392 parse_start = RExC_parse; /* MJD */
830247a4 2393 br = regbranch(pRExC_state, &flags, 1);
fac92740
MJD
2394 /* branch_len = (paren != 0); */
2395
a0d0e21e
LW
2396 if (br == NULL)
2397 return(NULL);
830247a4
IZ
2398 if (*RExC_parse == '|') {
2399 if (!SIZE_ONLY && RExC_extralen) {
2400 reginsert(pRExC_state, BRANCHJ, br);
a0ed51b3 2401 }
fac92740 2402 else { /* MJD */
830247a4 2403 reginsert(pRExC_state, BRANCH, br);
fac92740
MJD
2404 Set_Node_Length(br, paren != 0);
2405 Set_Node_Offset_To_R(br-RExC_emit_start, parse_start-RExC_start);
2406 }
c277df42
IZ
2407 have_branch = 1;
2408 if (SIZE_ONLY)
830247a4 2409 RExC_extralen += 1; /* For BRANCHJ-BRANCH. */
a0ed51b3
LW
2410 }
2411 else if (paren == ':') {
c277df42
IZ
2412 *flagp |= flags&SIMPLE;
2413 }
2414 if (open) { /* Starts with OPEN. */
830247a4 2415 regtail(pRExC_state, ret, br); /* OPEN -> first. */
a0ed51b3
LW
2416 }
2417 else if (paren != '?') /* Not Conditional */
a0d0e21e 2418 ret = br;
32a0ca98 2419 *flagp |= flags & (SPSTART | HASWIDTH);
c277df42 2420 lastbr = br;
830247a4
IZ
2421 while (*RExC_parse == '|') {
2422 if (!SIZE_ONLY && RExC_extralen) {
2423 ender = reganode(pRExC_state, LONGJMP,0);
2424 regtail(pRExC_state, NEXTOPER(NEXTOPER(lastbr)), ender); /* Append to the previous. */
c277df42
IZ
2425 }
2426 if (SIZE_ONLY)
830247a4
IZ
2427 RExC_extralen += 2; /* Account for LONGJMP. */
2428 nextchar(pRExC_state);
2429 br = regbranch(pRExC_state, &flags, 0);
fac92740 2430
a687059c 2431 if (br == NULL)
a0d0e21e 2432 return(NULL);
830247a4 2433 regtail(pRExC_state, lastbr, br); /* BRANCH -> BRANCH. */
c277df42 2434 lastbr = br;
821b33a5
IZ
2435 if (flags&HASWIDTH)
2436 *flagp |= HASWIDTH;
a687059c 2437 *flagp |= flags&SPSTART;
a0d0e21e
LW
2438 }
2439
c277df42
IZ
2440 if (have_branch || paren != ':') {
2441 /* Make a closing node, and hook it on the end. */
2442 switch (paren) {
2443 case ':':
830247a4 2444 ender = reg_node(pRExC_state, TAIL);
c277df42
IZ
2445 break;
2446 case 1:
830247a4 2447 ender = reganode(pRExC_state, CLOSE, parno);
fac92740
MJD
2448 Set_Node_Offset(ender,RExC_parse+1); /* MJD */
2449 Set_Node_Length(ender,1); /* MJD */
c277df42
IZ
2450 break;
2451 case '<':
c277df42
IZ
2452 case ',':
2453 case '=':
2454 case '!':
c277df42 2455 *flagp &= ~HASWIDTH;
821b33a5
IZ
2456 /* FALL THROUGH */
2457 case '>':
830247a4 2458 ender = reg_node(pRExC_state, SUCCEED);
c277df42
IZ
2459 break;
2460 case 0:
830247a4 2461 ender = reg_node(pRExC_state, END);
c277df42
IZ
2462 break;
2463 }
830247a4 2464 regtail(pRExC_state, lastbr, ender);
a0d0e21e 2465
c277df42
IZ
2466 if (have_branch) {
2467 /* Hook the tails of the branches to the closing node. */
2468 for (br = ret; br != NULL; br = regnext(br)) {
830247a4 2469 regoptail(pRExC_state, br, ender);
c277df42
IZ
2470 }
2471 }
a0d0e21e 2472 }
c277df42
IZ
2473
2474 {
2475 char *p;
2476 static char parens[] = "=!<,>";
2477
2478 if (paren && (p = strchr(parens, paren))) {
eb160463 2479 U8 node = ((p - parens) % 2) ? UNLESSM : IFMATCH;
c277df42
IZ
2480 int flag = (p - parens) > 1;
2481
2482 if (paren == '>')
2483 node = SUSPEND, flag = 0;
830247a4 2484 reginsert(pRExC_state, node,ret);
ccb2c380
MP
2485 Set_Node_Offset(ret, oregcomp_parse);
2486 Set_Node_Length(ret, RExC_parse - oregcomp_parse + 2);
c277df42 2487 ret->flags = flag;
830247a4 2488 regtail(pRExC_state, ret, reg_node(pRExC_state, TAIL));
c277df42 2489 }
a0d0e21e
LW
2490 }
2491
2492 /* Check for proper termination. */
ce3e6498 2493 if (paren) {
e2509266 2494 RExC_flags = oregflags;
830247a4
IZ
2495 if (RExC_parse >= RExC_end || *nextchar(pRExC_state) != ')') {
2496 RExC_parse = oregcomp_parse;
380a0633 2497 vFAIL("Unmatched (");
ce3e6498 2498 }
a0ed51b3 2499 }
830247a4
IZ
2500 else if (!paren && RExC_parse < RExC_end) {
2501 if (*RExC_parse == ')') {
2502 RExC_parse++;
380a0633 2503 vFAIL("Unmatched )");
a0ed51b3
LW
2504 }
2505 else
b45f050a 2506 FAIL("Junk on end of regexp"); /* "Can't happen". */
a0d0e21e
LW
2507 /* NOTREACHED */
2508 }
a687059c 2509
a0d0e21e 2510 return(ret);
a687059c
LW
2511}
2512
2513/*
2514 - regbranch - one alternative of an | operator
2515 *
2516 * Implements the concatenation operator.
2517 */
76e3520e 2518STATIC regnode *
830247a4 2519S_regbranch(pTHX_ RExC_state_t *pRExC_state, I32 *flagp, I32 first)
a687059c 2520{
c277df42
IZ
2521 register regnode *ret;
2522 register regnode *chain = NULL;
2523 register regnode *latest;
2524 I32 flags = 0, c = 0;
a0d0e21e 2525
b81d288d 2526 if (first)
c277df42
IZ
2527 ret = NULL;
2528 else {
b81d288d 2529 if (!SIZE_ONLY && RExC_extralen)
830247a4 2530 ret = reganode(pRExC_state, BRANCHJ,0);
fac92740 2531 else {
830247a4 2532 ret = reg_node(pRExC_state, BRANCH);
fac92740
MJD
2533 Set_Node_Length(ret, 1);
2534 }
c277df42
IZ
2535 }
2536
b81d288d 2537 if (!first && SIZE_ONLY)
830247a4 2538 RExC_extralen += 1; /* BRANCHJ */
b81d288d 2539
c277df42 2540 *flagp = WORST; /* Tentatively. */
a0d0e21e 2541
830247a4
IZ
2542 RExC_parse--;
2543 nextchar(pRExC_state);
2544 while (RExC_parse < RExC_end && *RExC_parse != '|' && *RExC_parse != ')') {
a0d0e21e 2545 flags &= ~TRYAGAIN;
830247a4 2546 latest = regpiece(pRExC_state, &flags);
a0d0e21e
LW
2547 if (latest == NULL) {
2548 if (flags & TRYAGAIN)
2549 continue;
2550 return(NULL);
a0ed51b3
LW
2551 }
2552 else if (ret == NULL)
c277df42 2553 ret = latest;
a0d0e21e 2554 *flagp |= flags&HASWIDTH;
c277df42 2555 if (chain == NULL) /* First piece. */
a0d0e21e
LW
2556 *flagp |= flags&SPSTART;
2557 else {
830247a4
IZ
2558 RExC_naughty++;
2559 regtail(pRExC_state, chain, latest);
a687059c 2560 }
a0d0e21e 2561 chain = latest;
c277df42
IZ
2562 c++;
2563 }
2564 if (chain == NULL) { /* Loop ran zero times. */
830247a4 2565 chain = reg_node(pRExC_state, NOTHING);
c277df42
IZ
2566 if (ret == NULL)
2567 ret = chain;
2568 }
2569 if (c == 1) {
2570 *flagp |= flags&SIMPLE;
a0d0e21e 2571 }
a687059c 2572
a0d0e21e 2573 return(ret);
a687059c
LW
2574}
2575
2576/*
2577 - regpiece - something followed by possible [*+?]
2578 *
2579 * Note that the branching code sequences used for ? and the general cases
2580 * of * and + are somewhat optimized: they use the same NOTHING node as
2581 * both the endmarker for their branch list and the body of the last branch.
2582 * It might seem that this node could be dispensed with entirely, but the
2583 * endmarker role is not redundant.
2584 */
76e3520e 2585STATIC regnode *
830247a4 2586S_regpiece(pTHX_ RExC_state_t *pRExC_state, I32 *flagp)
a687059c 2587{
c277df42 2588 register regnode *ret;
a0d0e21e
LW
2589 register char op;
2590 register char *next;
2591 I32 flags;
830247a4 2592 char *origparse = RExC_parse;
a0d0e21e
LW
2593 char *maxpos;
2594 I32 min;
c277df42 2595 I32 max = REG_INFTY;
fac92740 2596 char *parse_start;
a0d0e21e 2597
830247a4 2598 ret = regatom(pRExC_state, &flags);
a0d0e21e
LW
2599 if (ret == NULL) {
2600 if (flags & TRYAGAIN)
2601 *flagp |= TRYAGAIN;
2602 return(NULL);
2603 }
2604
830247a4 2605 op = *RExC_parse;
a0d0e21e 2606
830247a4 2607 if (op == '{' && regcurly(RExC_parse)) {
fac92740 2608 parse_start = RExC_parse; /* MJD */
830247a4 2609 next = RExC_parse + 1;
a0d0e21e
LW
2610 maxpos = Nullch;
2611 while (isDIGIT(*next) || *next == ',') {
2612 if (*next == ',') {
2613 if (maxpos)
2614 break;
2615 else
2616 maxpos = next;
a687059c 2617 }
a0d0e21e
LW
2618 next++;
2619 }
2620 if (*next == '}') { /* got one */
2621 if (!maxpos)
2622 maxpos = next;
830247a4
IZ
2623 RExC_parse++;
2624 min = atoi(RExC_parse);
a0d0e21e
LW
2625 if (*maxpos == ',')
2626 maxpos++;
2627 else
830247a4 2628 maxpos = RExC_parse;
a0d0e21e
LW
2629 max = atoi(maxpos);
2630 if (!max && *maxpos != '0')
c277df42
IZ
2631 max = REG_INFTY; /* meaning "infinity" */
2632 else if (max >= REG_INFTY)
8615cb43 2633 vFAIL2("Quantifier in {,} bigger than %d", REG_INFTY - 1);
830247a4
IZ
2634 RExC_parse = next;
2635 nextchar(pRExC_state);
a0d0e21e
LW
2636
2637 do_curly:
2638 if ((flags&SIMPLE)) {
830247a4
IZ
2639 RExC_naughty += 2 + RExC_naughty / 2;
2640 reginsert(pRExC_state, CURLY, ret);
fac92740
MJD
2641 Set_Node_Offset(ret, parse_start+1); /* MJD */
2642 Set_Node_Cur_Length(ret);
a0d0e21e
LW
2643 }
2644 else {
830247a4 2645 regnode *w = reg_node(pRExC_state, WHILEM);
2c2d71f5
JH
2646
2647 w->flags = 0;
830247a4
IZ
2648 regtail(pRExC_state, ret, w);
2649 if (!SIZE_ONLY && RExC_extralen) {
2650 reginsert(pRExC_state, LONGJMP,ret);
2651 reginsert(pRExC_state, NOTHING,ret);
c277df42
IZ
2652 NEXT_OFF(ret) = 3; /* Go over LONGJMP. */
2653 }
830247a4 2654 reginsert(pRExC_state, CURLYX,ret);
fac92740
MJD
2655 /* MJD hk */
2656 Set_Node_Offset(ret, parse_start+1);
2657 Set_Node_Length(ret,
2658 op == '{' ? (RExC_parse - parse_start) : 1);
2659
830247a4 2660 if (!SIZE_ONLY && RExC_extralen)
c277df42 2661 NEXT_OFF(ret) = 3; /* Go over NOTHING to LONGJMP. */
830247a4 2662 regtail(pRExC_state, ret, reg_node(pRExC_state, NOTHING));
c277df42 2663 if (SIZE_ONLY)
830247a4
IZ
2664 RExC_whilem_seen++, RExC_extralen += 3;
2665 RExC_naughty += 4 + RExC_naughty; /* compound interest */
a0d0e21e 2666 }
c277df42 2667 ret->flags = 0;
a0d0e21e
LW
2668
2669 if (min > 0)
821b33a5
IZ
2670 *flagp = WORST;
2671 if (max > 0)
2672 *flagp |= HASWIDTH;
a0d0e21e 2673 if (max && max < min)
8615cb43 2674 vFAIL("Can't do {n,m} with n > m");
c277df42 2675 if (!SIZE_ONLY) {
eb160463
GS
2676 ARG1_SET(ret, (U16)min);
2677 ARG2_SET(ret, (U16)max);
a687059c 2678 }
a687059c 2679
a0d0e21e 2680 goto nest_check;
a687059c 2681 }
a0d0e21e 2682 }
a687059c 2683
a0d0e21e
LW
2684 if (!ISMULT1(op)) {
2685 *flagp = flags;
a687059c 2686 return(ret);
a0d0e21e 2687 }
bb20fd44 2688
c277df42 2689#if 0 /* Now runtime fix should be reliable. */
b45f050a
JF
2690
2691 /* if this is reinstated, don't forget to put this back into perldiag:
2692
2693 =item Regexp *+ operand could be empty at {#} in regex m/%s/
2694
2695 (F) The part of the regexp subject to either the * or + quantifier
2696 could match an empty string. The {#} shows in the regular
2697 expression about where the problem was discovered.
2698
2699 */
2700
bb20fd44 2701 if (!(flags&HASWIDTH) && op != '?')
b45f050a 2702 vFAIL("Regexp *+ operand could be empty");
b81d288d 2703#endif
bb20fd44 2704
fac92740 2705 parse_start = RExC_parse;
830247a4 2706 nextchar(pRExC_state);
a0d0e21e 2707
821b33a5 2708 *flagp = (op != '+') ? (WORST|SPSTART|HASWIDTH) : (WORST|HASWIDTH);
a0d0e21e
LW
2709
2710 if (op == '*' && (flags&SIMPLE)) {
830247a4 2711 reginsert(pRExC_state, STAR, ret);
c277df42 2712 ret->flags = 0;
830247a4 2713 RExC_naughty += 4;
a0d0e21e
LW
2714 }
2715 else if (op == '*') {
2716 min = 0;
2717 goto do_curly;
a0ed51b3
LW
2718 }
2719 else if (op == '+' && (flags&SIMPLE)) {
830247a4 2720 reginsert(pRExC_state, PLUS, ret);
c277df42 2721 ret->flags = 0;
830247a4 2722 RExC_naughty += 3;
a0d0e21e
LW
2723 }
2724 else if (op == '+') {
2725 min = 1;
2726 goto do_curly;
a0ed51b3
LW
2727 }
2728 else if (op == '?') {
a0d0e21e
LW
2729 min = 0; max = 1;
2730 goto do_curly;
2731 }
2732 nest_check:
e476b1b5 2733 if (ckWARN(WARN_REGEXP) && !SIZE_ONLY && !(flags&HASWIDTH) && max > REG_INFTY/3) {
830247a4 2734 vWARN3(RExC_parse,
b45f050a 2735 "%.*s matches null string many times",
830247a4 2736 RExC_parse - origparse,
b45f050a 2737 origparse);
a0d0e21e
LW
2738 }
2739
830247a4
IZ
2740 if (*RExC_parse == '?') {
2741 nextchar(pRExC_state);
2742 reginsert(pRExC_state, MINMOD, ret);
2743 regtail(pRExC_state, ret, ret + NODE_STEP_REGNODE);
a0d0e21e 2744 }
830247a4
IZ
2745 if (ISMULT2(RExC_parse)) {
2746 RExC_parse++;
b45f050a
JF
2747 vFAIL("Nested quantifiers");
2748 }
a0d0e21e
LW
2749
2750 return(ret);
a687059c
LW
2751}
2752
2753/*
2754 - regatom - the lowest level
2755 *
2756 * Optimization: gobbles an entire sequence of ordinary characters so that
2757 * it can turn them into a single node, which is smaller to store and
2758 * faster to run. Backslashed characters are exceptions, each becoming a
2759 * separate node; the code is simpler that way and it's not worth fixing.
2760 *
b45f050a 2761 * [Yes, it is worth fixing, some scripts can run twice the speed.] */
76e3520e 2762STATIC regnode *
830247a4 2763S_regatom(pTHX_ RExC_state_t *pRExC_state, I32 *flagp)
a687059c 2764{
c277df42 2765 register regnode *ret = 0;
a0d0e21e 2766 I32 flags;
f06dbbb7 2767 char *parse_start = 0;
a0d0e21e
LW
2768
2769 *flagp = WORST; /* Tentatively. */
2770
2771tryagain:
830247a4 2772 switch (*RExC_parse) {
a0d0e21e 2773 case '^':
830247a4
IZ
2774 RExC_seen_zerolen++;
2775 nextchar(pRExC_state);
e2509266 2776 if (RExC_flags & PMf_MULTILINE)
830247a4 2777 ret = reg_node(pRExC_state, MBOL);
e2509266 2778 else if (RExC_flags & PMf_SINGLELINE)
830247a4 2779 ret = reg_node(pRExC_state, SBOL);
a0d0e21e 2780 else
830247a4 2781 ret = reg_node(pRExC_state, BOL);
fac92740 2782 Set_Node_Length(ret, 1); /* MJD */
a0d0e21e
LW
2783 break;
2784 case '$':
830247a4 2785 nextchar(pRExC_state);
b81d288d 2786 if (*RExC_parse)
830247a4 2787 RExC_seen_zerolen++;
e2509266 2788 if (RExC_flags & PMf_MULTILINE)
830247a4 2789 ret = reg_node(pRExC_state, MEOL);
e2509266 2790 else if (RExC_flags & PMf_SINGLELINE)
830247a4 2791 ret = reg_node(pRExC_state, SEOL);
a0d0e21e 2792 else
830247a4 2793 ret = reg_node(pRExC_state, EOL);
fac92740 2794 Set_Node_Length(ret, 1); /* MJD */
a0d0e21e
LW
2795 break;
2796 case '.':
830247a4 2797 nextchar(pRExC_state);
e2509266 2798 if (RExC_flags & PMf_SINGLELINE)
ffc61ed2
JH
2799 ret = reg_node(pRExC_state, SANY);
2800 else
2801 ret = reg_node(pRExC_state, REG_ANY);
2802 *flagp |= HASWIDTH|SIMPLE;
830247a4 2803 RExC_naughty++;
fac92740 2804 Set_Node_Length(ret, 1); /* MJD */
a0d0e21e
LW
2805 break;
2806 case '[':
b45f050a 2807 {
830247a4 2808 char *oregcomp_parse = ++RExC_parse;
ffc61ed2 2809 ret = regclass(pRExC_state);
830247a4
IZ
2810 if (*RExC_parse != ']') {
2811 RExC_parse = oregcomp_parse;
b45f050a
JF
2812 vFAIL("Unmatched [");
2813 }
830247a4 2814 nextchar(pRExC_state);
a0d0e21e 2815 *flagp |= HASWIDTH|SIMPLE;
fac92740 2816 Set_Node_Length(ret, RExC_parse - oregcomp_parse + 1); /* MJD */
a0d0e21e 2817 break;
b45f050a 2818 }
a0d0e21e 2819 case '(':
830247a4
IZ
2820 nextchar(pRExC_state);
2821 ret = reg(pRExC_state, 1, &flags);
a0d0e21e 2822 if (ret == NULL) {
bf93d4cc 2823 if (flags & TRYAGAIN) {
830247a4 2824 if (RExC_parse == RExC_end) {
bf93d4cc
GS
2825 /* Make parent create an empty node if needed. */
2826 *flagp |= TRYAGAIN;
2827 return(NULL);
2828 }
a0d0e21e 2829 goto tryagain;
bf93d4cc 2830 }
a0d0e21e
LW
2831 return(NULL);
2832 }
c277df42 2833 *flagp |= flags&(HASWIDTH|SPSTART|SIMPLE);
a0d0e21e
LW
2834 break;
2835 case '|':
2836 case ')':
2837 if (flags & TRYAGAIN) {
2838 *flagp |= TRYAGAIN;
2839 return NULL;
2840 }
b45f050a 2841 vFAIL("Internal urp");
a0d0e21e
LW
2842 /* Supposed to be caught earlier. */
2843 break;
85afd4ae 2844 case '{':
830247a4
IZ
2845 if (!regcurly(RExC_parse)) {
2846 RExC_parse++;
85afd4ae
CS
2847 goto defchar;
2848 }
2849 /* FALL THROUGH */
a0d0e21e
LW
2850 case '?':
2851 case '+':
2852 case '*':
830247a4 2853 RExC_parse++;
b45f050a 2854 vFAIL("Quantifier follows nothing");
a0d0e21e
LW
2855 break;
2856 case '\\':
830247a4 2857 switch (*++RExC_parse) {
a0d0e21e 2858 case 'A':
830247a4
IZ
2859 RExC_seen_zerolen++;
2860 ret = reg_node(pRExC_state, SBOL);
a0d0e21e 2861 *flagp |= SIMPLE;
830247a4 2862 nextchar(pRExC_state);
fac92740 2863 Set_Node_Length(ret, 2); /* MJD */
a0d0e21e
LW
2864 break;
2865 case 'G':
830247a4
IZ
2866 ret = reg_node(pRExC_state, GPOS);
2867 RExC_seen |= REG_SEEN_GPOS;
a0d0e21e 2868 *flagp |= SIMPLE;
830247a4 2869 nextchar(pRExC_state);
fac92740 2870 Set_Node_Length(ret, 2); /* MJD */
a0d0e21e
LW
2871 break;
2872 case 'Z':
830247a4 2873 ret = reg_node(pRExC_state, SEOL);
a0d0e21e 2874 *flagp |= SIMPLE;
a1917ab9 2875 RExC_seen_zerolen++; /* Do not optimize RE away */
830247a4 2876 nextchar(pRExC_state);
a0d0e21e 2877 break;
b85d18e9 2878 case 'z':
830247a4 2879 ret = reg_node(pRExC_state, EOS);
b85d18e9 2880 *flagp |= SIMPLE;
830247a4
IZ
2881 RExC_seen_zerolen++; /* Do not optimize RE away */
2882 nextchar(pRExC_state);
fac92740 2883 Set_Node_Length(ret, 2); /* MJD */
b85d18e9 2884 break;
4a2d328f 2885 case 'C':
f33976b4
DB
2886 ret = reg_node(pRExC_state, CANY);
2887 RExC_seen |= REG_SEEN_CANY;
a0ed51b3 2888 *flagp |= HASWIDTH|SIMPLE;
830247a4 2889 nextchar(pRExC_state);
fac92740 2890 Set_Node_Length(ret, 2); /* MJD */
a0ed51b3
LW
2891 break;
2892 case 'X':
830247a4 2893 ret = reg_node(pRExC_state, CLUMP);
a0ed51b3 2894 *flagp |= HASWIDTH;
830247a4 2895 nextchar(pRExC_state);
fac92740 2896 Set_Node_Length(ret, 2); /* MJD */
a0ed51b3 2897 break;
a0d0e21e 2898 case 'w':
eb160463 2899 ret = reg_node(pRExC_state, (U8)(LOC ? ALNUML : ALNUM));
a0d0e21e 2900 *flagp |= HASWIDTH|SIMPLE;
830247a4 2901 nextchar(pRExC_state);
fac92740 2902 Set_Node_Length(ret, 2); /* MJD */
a0d0e21e
LW
2903 break;
2904 case 'W':
eb160463 2905 ret = reg_node(pRExC_state, (U8)(LOC ? NALNUML : NALNUM));
a0d0e21e 2906 *flagp |= HASWIDTH|SIMPLE;
830247a4 2907 nextchar(pRExC_state);
fac92740 2908 Set_Node_Length(ret, 2); /* MJD */
a0d0e21e
LW
2909 break;
2910 case 'b':
830247a4
IZ
2911 RExC_seen_zerolen++;
2912 RExC_seen |= REG_SEEN_LOOKBEHIND;
eb160463 2913 ret = reg_node(pRExC_state, (U8)(LOC ? BOUNDL : BOUND));
a0d0e21e 2914 *flagp |= SIMPLE;
830247a4 2915 nextchar(pRExC_state);
fac92740 2916 Set_Node_Length(ret, 2); /* MJD */
a0d0e21e
LW
2917 break;
2918 case 'B':
830247a4
IZ
2919 RExC_seen_zerolen++;
2920 RExC_seen |= REG_SEEN_LOOKBEHIND;
eb160463 2921 ret = reg_node(pRExC_state, (U8)(LOC ? NBOUNDL : NBOUND));
a0d0e21e 2922 *flagp |= SIMPLE;
830247a4 2923 nextchar(pRExC_state);
fac92740 2924 Set_Node_Length(ret, 2); /* MJD */
a0d0e21e
LW
2925 break;
2926 case 's':
eb160463 2927 ret = reg_node(pRExC_state, (U8)(LOC ? SPACEL : SPACE));
a0d0e21e 2928 *flagp |= HASWIDTH|SIMPLE;
830247a4 2929 nextchar(pRExC_state);
fac92740 2930 Set_Node_Length(ret, 2); /* MJD */
a0d0e21e
LW
2931 break;
2932 case 'S':
eb160463 2933 ret = reg_node(pRExC_state, (U8)(LOC ? NSPACEL : NSPACE));
a0d0e21e 2934 *flagp |= HASWIDTH|SIMPLE;
830247a4 2935 nextchar(pRExC_state);
fac92740 2936 Set_Node_Length(ret, 2); /* MJD */
a0d0e21e
LW
2937 break;
2938 case 'd':
ffc61ed2 2939 ret = reg_node(pRExC_state, DIGIT);
a0d0e21e 2940 *flagp |= HASWIDTH|SIMPLE;
830247a4 2941 nextchar(pRExC_state);
fac92740 2942 Set_Node_Length(ret, 2); /* MJD */
a0d0e21e
LW
2943 break;
2944 case 'D':
ffc61ed2 2945 ret = reg_node(pRExC_state, NDIGIT);
a0d0e21e 2946 *flagp |= HASWIDTH|SIMPLE;
830247a4 2947 nextchar(pRExC_state);
fac92740 2948 Set_Node_Length(ret, 2); /* MJD */
a0d0e21e 2949 break;
a14b48bc
LW
2950 case 'p':
2951 case 'P':
3568d838 2952 {
830247a4 2953 char* oldregxend = RExC_end;
ccb2c380 2954 char* parse_start = RExC_parse - 2;
a14b48bc 2955
830247a4 2956 if (RExC_parse[1] == '{') {
3568d838 2957 /* a lovely hack--pretend we saw [\pX] instead */
830247a4
IZ
2958 RExC_end = strchr(RExC_parse, '}');
2959 if (!RExC_end) {
0da60cf5 2960 U8 c = (U8)*RExC_parse;
830247a4
IZ
2961 RExC_parse += 2;
2962 RExC_end = oldregxend;
0da60cf5 2963 vFAIL2("Missing right brace on \\%c{}", c);
b45f050a 2964 }
830247a4 2965 RExC_end++;
a14b48bc 2966 }
af6f566e 2967 else {
830247a4 2968 RExC_end = RExC_parse + 2;
af6f566e
HS
2969 if (RExC_end > oldregxend)
2970 RExC_end = oldregxend;
2971 }
830247a4 2972 RExC_parse--;
a14b48bc 2973
ffc61ed2 2974 ret = regclass(pRExC_state);
a14b48bc 2975
830247a4
IZ
2976 RExC_end = oldregxend;
2977 RExC_parse--;
ccb2c380
MP
2978
2979 Set_Node_Offset(ret, parse_start + 2);
2980 Set_Node_Cur_Length(ret);
830247a4 2981 nextchar(pRExC_state);
a14b48bc
LW
2982 *flagp |= HASWIDTH|SIMPLE;
2983 }
2984 break;
a0d0e21e
LW
2985 case 'n':
2986 case 'r':
2987 case 't':
2988 case 'f':
2989 case 'e':
2990 case 'a':
2991 case 'x':
2992 case 'c':
2993 case '0':
2994 goto defchar;
2995 case '1': case '2': case '3': case '4':
2996 case '5': case '6': case '7': case '8': case '9':
2997 {
830247a4 2998 I32 num = atoi(RExC_parse);
a0d0e21e 2999
830247a4 3000 if (num > 9 && num >= RExC_npar)
a0d0e21e
LW
3001 goto defchar;
3002 else {
fac92740 3003 char * parse_start = RExC_parse - 1; /* MJD */
830247a4
IZ
3004 while (isDIGIT(*RExC_parse))
3005 RExC_parse++;
b45f050a 3006
eb160463 3007 if (!SIZE_ONLY && num > (I32)RExC_rx->nparens)
9baa0206 3008 vFAIL("Reference to nonexistent group");
830247a4 3009 RExC_sawback = 1;
eb160463
GS
3010 ret = reganode(pRExC_state,
3011 (U8)(FOLD ? (LOC ? REFFL : REFF) : REF),
3012 num);
a0d0e21e 3013 *flagp |= HASWIDTH;
fac92740
MJD
3014
3015 /* override incorrect value set in reganode MJD */
3016 Set_Node_Offset(ret, parse_start+1);
3017 Set_Node_Cur_Length(ret); /* MJD */
830247a4
IZ
3018 RExC_parse--;
3019 nextchar(pRExC_state);
a0d0e21e
LW
3020 }
3021 }
3022 break;
3023 case '\0':
830247a4 3024 if (RExC_parse >= RExC_end)
b45f050a 3025 FAIL("Trailing \\");
a0d0e21e
LW
3026 /* FALL THROUGH */
3027 default:
c9f97d15
IZ
3028 /* Do not generate `unrecognized' warnings here, we fall
3029 back into the quick-grab loop below */
a0d0e21e
LW
3030 goto defchar;
3031 }
3032 break;
4633a7c4
LW
3033
3034 case '#':
e2509266 3035 if (RExC_flags & PMf_EXTENDED) {
830247a4
IZ
3036 while (RExC_parse < RExC_end && *RExC_parse != '\n') RExC_parse++;
3037 if (RExC_parse < RExC_end)
4633a7c4
LW
3038 goto tryagain;
3039 }
3040 /* FALL THROUGH */
3041
a0d0e21e 3042 default: {
ba210ebe 3043 register STRLEN len;
58ae7d3f 3044 register UV ender;
a0d0e21e 3045 register char *p;
c277df42 3046 char *oldp, *s;
ba210ebe 3047 STRLEN numlen;
80aecb99 3048 STRLEN foldlen;
60a8b682 3049 U8 tmpbuf[UTF8_MAXLEN_FOLD+1], *foldbuf;
f06dbbb7
JH
3050
3051 parse_start = RExC_parse - 1;
a0d0e21e 3052
830247a4 3053 RExC_parse++;
a0d0e21e
LW
3054
3055 defchar:
58ae7d3f 3056 ender = 0;
eb160463
GS
3057 ret = reg_node(pRExC_state,
3058 (U8)(FOLD ? (LOC ? EXACTFL : EXACTF) : EXACT));
cd439c50 3059 s = STRING(ret);
830247a4
IZ
3060 for (len = 0, p = RExC_parse - 1;
3061 len < 127 && p < RExC_end;
a0d0e21e
LW
3062 len++)
3063 {
3064 oldp = p;
5b5a24f7 3065
e2509266 3066 if (RExC_flags & PMf_EXTENDED)
830247a4 3067 p = regwhite(p, RExC_end);
a0d0e21e
LW
3068 switch (*p) {
3069 case '^':
3070 case '$':
3071 case '.':
3072 case '[':
3073 case '(':
3074 case ')':
3075 case '|':
3076 goto loopdone;
3077 case '\\':
3078 switch (*++p) {
3079 case 'A':
1ed8eac0
JF
3080 case 'C':
3081 case 'X':
a0d0e21e
LW
3082 case 'G':
3083 case 'Z':
b85d18e9 3084 case 'z':
a0d0e21e
LW
3085 case 'w':
3086 case 'W':
3087 case 'b':
3088 case 'B':
3089 case 's':
3090 case 'S':
3091 case 'd':
3092 case 'D':
a14b48bc
LW
3093 case 'p':
3094 case 'P':
a0d0e21e
LW
3095 --p;
3096 goto loopdone;
3097 case 'n':
3098 ender = '\n';
3099 p++;
a687059c 3100 break;
a0d0e21e
LW
3101 case 'r':
3102 ender = '\r';
3103 p++;
a687059c 3104 break;
a0d0e21e
LW
3105 case 't':
3106 ender = '\t';
3107 p++;
a687059c 3108 break;
a0d0e21e
LW
3109 case 'f':
3110 ender = '\f';
3111 p++;
a687059c 3112 break;
a0d0e21e 3113 case 'e':
c7f1f016 3114 ender = ASCII_TO_NATIVE('\033');
a0d0e21e 3115 p++;
a687059c 3116 break;
a0d0e21e 3117 case 'a':
c7f1f016 3118 ender = ASCII_TO_NATIVE('\007');
a0d0e21e 3119 p++;
a687059c 3120 break;
a0d0e21e 3121 case 'x':
a0ed51b3
LW
3122 if (*++p == '{') {
3123 char* e = strchr(p, '}');
b81d288d 3124
b45f050a 3125 if (!e) {
830247a4 3126 RExC_parse = p + 1;
b45f050a
JF
3127 vFAIL("Missing right brace on \\x{}");
3128 }
de5f0749 3129 else {
a4c04bdc
NC
3130 I32 flags = PERL_SCAN_ALLOW_UNDERSCORES
3131 | PERL_SCAN_DISALLOW_PREFIX;
53305cf1
NC
3132 numlen = e - p - 1;
3133 ender = grok_hex(p + 1, &numlen, &flags, NULL);
aaa80028
JH
3134 if (ender > 0xff)
3135 RExC_utf8 = 1;
b21ed0a9
GS
3136 /* numlen is generous */
3137 if (numlen + len >= 127) {
a0ed51b3
LW
3138 p--;
3139 goto loopdone;
3140 }
3141 p = e + 1;
3142 }
a0ed51b3
LW
3143 }
3144 else {
a4c04bdc 3145 I32 flags = PERL_SCAN_DISALLOW_PREFIX;
53305cf1
NC
3146 numlen = 2;
3147 ender = grok_hex(p, &numlen, &flags, NULL);
a0ed51b3
LW
3148 p += numlen;
3149 }
a687059c 3150 break;
a0d0e21e
LW
3151 case 'c':
3152 p++;
bbce6d69 3153 ender = UCHARAT(p++);
3154 ender = toCTRL(ender);
a687059c 3155 break;
a0d0e21e
LW
3156 case '0': case '1': case '2': case '3':case '4':
3157 case '5': case '6': case '7': case '8':case '9':
3158 if (*p == '0' ||
830247a4 3159 (isDIGIT(p[1]) && atoi(p) >= RExC_npar) ) {
53305cf1
NC
3160 I32 flags = 0;
3161 numlen = 3;
3162 ender = grok_oct(p, &numlen, &flags, NULL);
a0d0e21e
LW
3163 p += numlen;
3164 }
3165 else {
3166 --p;
3167 goto loopdone;
a687059c
LW
3168 }
3169 break;
a0d0e21e 3170 case '\0':
830247a4 3171 if (p >= RExC_end)
b45f050a 3172 FAIL("Trailing \\");
a687059c 3173 /* FALL THROUGH */
a0d0e21e 3174 default:
e476b1b5 3175 if (!SIZE_ONLY && ckWARN(WARN_REGEXP) && isALPHA(*p))
4193bef7 3176 vWARN2(p + 1, "Unrecognized escape \\%c passed through", UCHARAT(p));
a0ed51b3 3177 goto normal_default;
a0d0e21e
LW
3178 }
3179 break;
a687059c 3180 default:
a0ed51b3 3181 normal_default:
fd400ab9 3182 if (UTF8_IS_START(*p) && UTF) {
5e12f4fb 3183 ender = utf8n_to_uvchr((U8*)p, RExC_end - p,
ba210ebe 3184 &numlen, 0);
a0ed51b3
LW
3185 p += numlen;
3186 }
3187 else
3188 ender = *p++;
a0d0e21e 3189 break;
a687059c 3190 }
e2509266 3191 if (RExC_flags & PMf_EXTENDED)
830247a4 3192 p = regwhite(p, RExC_end);
60a8b682
JH
3193 if (UTF && FOLD) {
3194 /* Prime the casefolded buffer. */
ac7e0132 3195 ender = toFOLD_uni(ender, tmpbuf, &foldlen);
60a8b682 3196 }
a0d0e21e
LW
3197 if (ISMULT2(p)) { /* Back off on ?+*. */
3198 if (len)
3199 p = oldp;
16ea2a2e 3200 else if (UTF) {
0ebc6274
JH
3201 STRLEN unilen;
3202
80aecb99 3203 if (FOLD) {
60a8b682 3204 /* Emit all the Unicode characters. */
80aecb99
JH
3205 for (foldbuf = tmpbuf;
3206 foldlen;
3207 foldlen -= numlen) {
3208 ender = utf8_to_uvchr(foldbuf, &numlen);
9dc45d57 3209 if (numlen > 0) {
0ebc6274
JH
3210 reguni(pRExC_state, ender, s, &unilen);
3211 s += unilen;
3212 len += unilen;
3213 /* In EBCDIC the numlen
3214 * and unilen can differ. */
9dc45d57 3215 foldbuf += numlen;
47654450
JH
3216 if (numlen >= foldlen)
3217 break;
9dc45d57
JH
3218 }
3219 else
3220 break; /* "Can't happen." */
80aecb99
JH
3221 }
3222 }
3223 else {
0ebc6274 3224 reguni(pRExC_state, ender, s, &unilen);
9ede7db1 3225 if (unilen > 0) {
0ebc6274
JH
3226 s += unilen;
3227 len += unilen;
9dc45d57 3228 }
80aecb99 3229 }
a0ed51b3 3230 }
a0d0e21e
LW
3231 else {
3232 len++;
eb160463 3233 REGC((char)ender, s++);
a0d0e21e
LW
3234 }
3235 break;
a687059c 3236 }
16ea2a2e 3237 if (UTF) {
0ebc6274
JH
3238 STRLEN unilen;
3239
80aecb99 3240 if (FOLD) {
60a8b682 3241 /* Emit all the Unicode characters. */
80aecb99
JH
3242 for (foldbuf = tmpbuf;
3243 foldlen;
3244 foldlen -= numlen) {
3245 ender = utf8_to_uvchr(foldbuf, &numlen);
9dc45d57 3246 if (numlen > 0) {
0ebc6274
JH
3247 reguni(pRExC_state, ender, s, &unilen);
3248 len += unilen;
3249 s += unilen;
3250 /* In EBCDIC the numlen
3251 * and unilen can differ. */
9dc45d57 3252 foldbuf += numlen;
47654450
JH
3253 if (numlen >= foldlen)
3254 break;
9dc45d57
JH
3255 }
3256 else
3257 break;
80aecb99
JH
3258 }
3259 }
3260 else {
0ebc6274 3261 reguni(pRExC_state, ender, s, &unilen);
9ede7db1 3262 if (unilen > 0) {
0ebc6274
JH
3263 s += unilen;
3264 len += unilen;
9dc45d57 3265 }
80aecb99
JH
3266 }
3267 len--;
a0ed51b3
LW
3268 }
3269 else
eb160463 3270 REGC((char)ender, s++);
a0d0e21e
LW
3271 }
3272 loopdone:
830247a4 3273 RExC_parse = p - 1;
fac92740 3274 Set_Node_Cur_Length(ret); /* MJD */
830247a4 3275 nextchar(pRExC_state);
793db0cb
JH
3276 {
3277 /* len is STRLEN which is unsigned, need to copy to signed */
3278 IV iv = len;
3279 if (iv < 0)
3280 vFAIL("Internal disaster");
3281 }
a0d0e21e
LW
3282 if (len > 0)
3283 *flagp |= HASWIDTH;
3284 if (len == 1)
3285 *flagp |= SIMPLE;
c277df42 3286 if (!SIZE_ONLY)
cd439c50
IZ
3287 STR_LEN(ret) = len;
3288 if (SIZE_ONLY)
830247a4 3289 RExC_size += STR_SZ(len);
cd439c50 3290 else
830247a4 3291 RExC_emit += STR_SZ(len);
a687059c 3292 }
a0d0e21e
LW
3293 break;
3294 }
a687059c 3295
60a8b682
JH
3296 /* If the encoding pragma is in effect recode the text of
3297 * any EXACT-kind nodes. */
22c54be3 3298 if (PL_encoding && PL_regkind[(U8)OP(ret)] == EXACT) {
d0063567
DK
3299 STRLEN oldlen = STR_LEN(ret);
3300 SV *sv = sv_2mortal(newSVpvn(STRING(ret), oldlen));
3301
3302 if (RExC_utf8)
3303 SvUTF8_on(sv);
3304 if (sv_utf8_downgrade(sv, TRUE)) {
3305 char *s = sv_recode_to_utf8(sv, PL_encoding);
3306 STRLEN newlen = SvCUR(sv);
3307
3308 if (SvUTF8(sv))
3309 RExC_utf8 = 1;
3310 if (!SIZE_ONLY) {
3311 DEBUG_r(PerlIO_printf(Perl_debug_log, "recode %*s to %*s\n",
3312 (int)oldlen, STRING(ret),
3313 (int)newlen, s));
3314 Copy(s, STRING(ret), newlen, char);
3315 STR_LEN(ret) += newlen - oldlen;
3316 RExC_emit += STR_SZ(newlen) - STR_SZ(oldlen);
3317 } else
3318 RExC_size += STR_SZ(newlen) - STR_SZ(oldlen);
3319 }
a72c7584
JH
3320 }
3321
a0d0e21e 3322 return(ret);
a687059c
LW
3323}
3324
873ef191 3325STATIC char *
cea2e8a9 3326S_regwhite(pTHX_ char *p, char *e)
5b5a24f7
CS
3327{
3328 while (p < e) {
3329 if (isSPACE(*p))
3330 ++p;
3331 else if (*p == '#') {
3332 do {
3333 p++;
3334 } while (p < e && *p != '\n');
3335 }
3336 else
3337 break;
3338 }
3339 return p;
3340}
3341
b8c5462f
JH
3342/* Parse POSIX character classes: [[:foo:]], [[=foo=]], [[.foo.]].
3343 Character classes ([:foo:]) can also be negated ([:^foo:]).
3344 Returns a named class id (ANYOF_XXX) if successful, -1 otherwise.
3345 Equivalence classes ([=foo=]) and composites ([.foo.]) are parsed,
beeb77fc 3346 but trigger failures because they are currently unimplemented. */
9a86a77b
JH
3347
3348#define POSIXCC_DONE(c) ((c) == ':')
3349#define POSIXCC_NOTYET(c) ((c) == '=' || (c) == '.')
3350#define POSIXCC(c) (POSIXCC_DONE(c) || POSIXCC_NOTYET(c))
3351
b8c5462f 3352STATIC I32
830247a4 3353S_regpposixcc(pTHX_ RExC_state_t *pRExC_state, I32 value)
620e46c5
JH
3354{
3355 char *posixcc = 0;
936ed897 3356 I32 namedclass = OOB_NAMEDCLASS;
620e46c5 3357
830247a4 3358 if (value == '[' && RExC_parse + 1 < RExC_end &&
620e46c5 3359 /* I smell either [: or [= or [. -- POSIX has been here, right? */
9a86a77b
JH
3360 POSIXCC(UCHARAT(RExC_parse))) {
3361 char c = UCHARAT(RExC_parse);
830247a4 3362 char* s = RExC_parse++;
b81d288d 3363
9a86a77b 3364 while (RExC_parse < RExC_end && UCHARAT(RExC_parse) != c)
830247a4
IZ
3365 RExC_parse++;
3366 if (RExC_parse == RExC_end)
620e46c5 3367 /* Grandfather lone [:, [=, [. */
830247a4 3368 RExC_parse = s;
620e46c5 3369 else {
830247a4 3370 char* t = RExC_parse++; /* skip over the c */
b8c5462f 3371
9a86a77b 3372 if (UCHARAT(RExC_parse) == ']') {
830247a4 3373 RExC_parse++; /* skip over the ending ] */
b8c5462f
JH
3374 posixcc = s + 1;
3375 if (*s == ':') {
3376 I32 complement = *posixcc == '^' ? *posixcc++ : 0;
3377 I32 skip = 5; /* the most common skip */
3378
3379 switch (*posixcc) {
3380 case 'a':
3381 if (strnEQ(posixcc, "alnum", 5))
3382 namedclass =
3383 complement ? ANYOF_NALNUMC : ANYOF_ALNUMC;
3384 else if (strnEQ(posixcc, "alpha", 5))
3385 namedclass =
3386 complement ? ANYOF_NALPHA : ANYOF_ALPHA;
3387 else if (strnEQ(posixcc, "ascii", 5))
3388 namedclass =
3389 complement ? ANYOF_NASCII : ANYOF_ASCII;
3390 break;
aaa51d5e
JF
3391 case 'b':
3392 if (strnEQ(posixcc, "blank", 5))
3393 namedclass =
3394 complement ? ANYOF_NBLANK : ANYOF_BLANK;
3395 break;
b8c5462f
JH
3396 case 'c':
3397 if (strnEQ(posixcc, "cntrl", 5))
3398 namedclass =
3399 complement ? ANYOF_NCNTRL : ANYOF_CNTRL;
3400 break;
3401 case 'd':
3402 if (strnEQ(posixcc, "digit", 5))
3403 namedclass =
3404 complement ? ANYOF_NDIGIT : ANYOF_DIGIT;
3405 break;
3406 case 'g':
3407 if (strnEQ(posixcc, "graph", 5))
3408 namedclass =
3409 complement ? ANYOF_NGRAPH : ANYOF_GRAPH;
3410 break;
3411 case 'l':
3412 if (strnEQ(posixcc, "lower", 5))
3413 namedclass =
3414 complement ? ANYOF_NLOWER : ANYOF_LOWER;
3415 break;
3416 case 'p':
3417 if (strnEQ(posixcc, "print", 5))
3418 namedclass =
3419 complement ? ANYOF_NPRINT : ANYOF_PRINT;
3420 else if (strnEQ(posixcc, "punct", 5))
3421 namedclass =
3422 complement ? ANYOF_NPUNCT : ANYOF_PUNCT;
3423 break;
3424 case 's':
3425 if (strnEQ(posixcc, "space", 5))
3426 namedclass =
aaa51d5e 3427 complement ? ANYOF_NPSXSPC : ANYOF_PSXSPC;
cc4319de 3428 break;
b8c5462f
JH
3429 case 'u':
3430 if (strnEQ(posixcc, "upper", 5))
3431 namedclass =
3432 complement ? ANYOF_NUPPER : ANYOF_UPPER;
3433 break;
3434 case 'w': /* this is not POSIX, this is the Perl \w */
3435 if (strnEQ(posixcc, "word", 4)) {
3436 namedclass =
3437 complement ? ANYOF_NALNUM : ANYOF_ALNUM;
3438 skip = 4;
3439 }
3440 break;
3441 case 'x':
3442 if (strnEQ(posixcc, "xdigit", 6)) {
3443 namedclass =
3444 complement ? ANYOF_NXDIGIT : ANYOF_XDIGIT;
3445 skip = 6;
3446 }
3447 break;
3448 }
ac561586
JH
3449 if (namedclass == OOB_NAMEDCLASS ||
3450 posixcc[skip] != ':' ||
3451 posixcc[skip+1] != ']')
b45f050a
JF
3452 {
3453 Simple_vFAIL3("POSIX class [:%.*s:] unknown",
3454 t - s - 1, s + 1);
3455 }
3456 } else if (!SIZE_ONLY) {
b8c5462f 3457 /* [[=foo=]] and [[.foo.]] are still future. */
b45f050a 3458
830247a4 3459 /* adjust RExC_parse so the warning shows after
b45f050a 3460 the class closes */
9a86a77b 3461 while (UCHARAT(RExC_parse) && UCHARAT(RExC_parse) != ']')
830247a4 3462 RExC_parse++;
b45f050a
JF
3463 Simple_vFAIL3("POSIX syntax [%c %c] is reserved for future extensions", c, c);
3464 }
b8c5462f
JH
3465 } else {
3466 /* Maternal grandfather:
3467 * "[:" ending in ":" but not in ":]" */
830247a4 3468 RExC_parse = s;
767d463e 3469 }
620e46c5
JH
3470 }
3471 }
3472
b8c5462f
JH
3473 return namedclass;
3474}
3475
3476STATIC void
830247a4 3477S_checkposixcc(pTHX_ RExC_state_t *pRExC_state)
b8c5462f 3478{
b938889d 3479 if (!SIZE_ONLY && POSIXCC(UCHARAT(RExC_parse))) {
830247a4 3480 char *s = RExC_parse;
93733859 3481 char c = *s++;
b8c5462f
JH
3482
3483 while(*s && isALNUM(*s))
3484 s++;
3485 if (*s && c == *s && s[1] == ']') {
cd84f5b2
RGS
3486 if (ckWARN(WARN_REGEXP))
3487 vWARN3(s+2,
3488 "POSIX syntax [%c %c] belongs inside character classes",
3489 c, c);
b45f050a
JF
3490
3491 /* [[=foo=]] and [[.foo.]] are still future. */
9a86a77b 3492 if (POSIXCC_NOTYET(c)) {
830247a4 3493 /* adjust RExC_parse so the error shows after
b45f050a 3494 the class closes */
9a86a77b 3495 while (UCHARAT(RExC_parse) && UCHARAT(RExC_parse++) != ']')
b45f050a
JF
3496 ;
3497 Simple_vFAIL3("POSIX syntax [%c %c] is reserved for future extensions", c, c);
3498 }
b8c5462f
JH
3499 }
3500 }
620e46c5
JH
3501}
3502
76e3520e 3503STATIC regnode *
830247a4 3504S_regclass(pTHX_ RExC_state_t *pRExC_state)
a687059c 3505{
ffc61ed2 3506 register UV value;
9a86a77b 3507 register UV nextvalue;
3568d838 3508 register IV prevvalue = OOB_UNICODE;
ffc61ed2 3509 register IV range = 0;
c277df42 3510 register regnode *ret;
ba210ebe 3511 STRLEN numlen;
ffc61ed2 3512 IV namedclass;
9c5ffd7c 3513 char *rangebegin = 0;
936ed897 3514 bool need_class = 0;
9c5ffd7c 3515 SV *listsv = Nullsv;
ffc61ed2
JH
3516 register char *e;
3517 UV n;
9e55ce06
JH
3518 bool optimize_invert = TRUE;
3519 AV* unicode_alternate = 0;
1b2d223b
JH
3520#ifdef EBCDIC
3521 UV literal_endpoint = 0;
3522#endif
ffc61ed2
JH
3523
3524 ret = reganode(pRExC_state, ANYOF, 0);
3525
3526 if (!SIZE_ONLY)
3527 ANYOF_FLAGS(ret) = 0;
3528
9a86a77b 3529 if (UCHARAT(RExC_parse) == '^') { /* Complement of range. */
ffc61ed2
JH
3530 RExC_naughty++;
3531 RExC_parse++;
3532 if (!SIZE_ONLY)
3533 ANYOF_FLAGS(ret) |= ANYOF_INVERT;
3534 }
a0d0e21e 3535
936ed897 3536 if (SIZE_ONLY)
830247a4 3537 RExC_size += ANYOF_SKIP;
936ed897 3538 else {
830247a4 3539 RExC_emit += ANYOF_SKIP;
936ed897
IZ
3540 if (FOLD)
3541 ANYOF_FLAGS(ret) |= ANYOF_FOLD;
3542 if (LOC)
3543 ANYOF_FLAGS(ret) |= ANYOF_LOCALE;
ffc61ed2
JH
3544 ANYOF_BITMAP_ZERO(ret);
3545 listsv = newSVpvn("# comment\n", 10);
a0d0e21e 3546 }
b8c5462f 3547
9a86a77b
JH
3548 nextvalue = RExC_parse < RExC_end ? UCHARAT(RExC_parse) : 0;
3549
b938889d 3550 if (!SIZE_ONLY && POSIXCC(nextvalue))
830247a4 3551 checkposixcc(pRExC_state);
b8c5462f 3552
f064b6ad
HS
3553 /* allow 1st char to be ] (allowing it to be - is dealt with later) */
3554 if (UCHARAT(RExC_parse) == ']')
3555 goto charclassloop;
ffc61ed2 3556
9a86a77b 3557 while (RExC_parse < RExC_end && UCHARAT(RExC_parse) != ']') {
ffc61ed2
JH
3558
3559 charclassloop:
3560
3561 namedclass = OOB_NAMEDCLASS; /* initialize as illegal */
3562
73b437c8 3563 if (!range)
830247a4 3564 rangebegin = RExC_parse;
ffc61ed2 3565 if (UTF) {
5e12f4fb 3566 value = utf8n_to_uvchr((U8*)RExC_parse,
3568d838
JH
3567 RExC_end - RExC_parse,
3568 &numlen, 0);
ffc61ed2
JH
3569 RExC_parse += numlen;
3570 }
3571 else
3572 value = UCHARAT(RExC_parse++);
9a86a77b
JH
3573 nextvalue = RExC_parse < RExC_end ? UCHARAT(RExC_parse) : 0;
3574 if (value == '[' && POSIXCC(nextvalue))
830247a4 3575 namedclass = regpposixcc(pRExC_state, value);
620e46c5 3576 else if (value == '\\') {
ffc61ed2 3577 if (UTF) {
5e12f4fb 3578 value = utf8n_to_uvchr((U8*)RExC_parse,
ffc61ed2
JH
3579 RExC_end - RExC_parse,
3580 &numlen, 0);
3581 RExC_parse += numlen;
3582 }
3583 else
3584 value = UCHARAT(RExC_parse++);
470c3474 3585 /* Some compilers cannot handle switching on 64-bit integer
ffc61ed2 3586 * values, therefore value cannot be an UV. Yes, this will
e2962f66
JH
3587 * be a problem later if we want switch on Unicode.
3588 * A similar issue a little bit later when switching on
3589 * namedclass. --jhi */
ffc61ed2 3590 switch ((I32)value) {
b8c5462f
JH
3591 case 'w': namedclass = ANYOF_ALNUM; break;
3592 case 'W': namedclass = ANYOF_NALNUM; break;
3593 case 's': namedclass = ANYOF_SPACE; break;
3594 case 'S': namedclass = ANYOF_NSPACE; break;
3595 case 'd': namedclass = ANYOF_DIGIT; break;
3596 case 'D': namedclass = ANYOF_NDIGIT; break;
ffc61ed2
JH
3597 case 'p':
3598 case 'P':
af6f566e 3599 if (RExC_parse >= RExC_end)
2a4859cd 3600 vFAIL2("Empty \\%c{}", (U8)value);
ffc61ed2 3601 if (*RExC_parse == '{') {
0da60cf5 3602 U8 c = (U8)value;
ffc61ed2
JH
3603 e = strchr(RExC_parse++, '}');
3604 if (!e)
0da60cf5 3605 vFAIL2("Missing right brace on \\%c{}", c);
ab13f0c7
JH
3606 while (isSPACE(UCHARAT(RExC_parse)))
3607 RExC_parse++;
3608 if (e == RExC_parse)
0da60cf5 3609 vFAIL2("Empty \\%c{}", c);
ffc61ed2 3610 n = e - RExC_parse;
ab13f0c7
JH
3611 while (isSPACE(UCHARAT(RExC_parse + n - 1)))
3612 n--;
ffc61ed2
JH
3613 }
3614 else {
3615 e = RExC_parse;
3616 n = 1;
3617 }
3618 if (!SIZE_ONLY) {
ab13f0c7
JH
3619 if (UCHARAT(RExC_parse) == '^') {
3620 RExC_parse++;
3621 n--;
3622 value = value == 'p' ? 'P' : 'p'; /* toggle */
3623 while (isSPACE(UCHARAT(RExC_parse))) {
3624 RExC_parse++;
3625 n--;
3626 }
3627 }
ffc61ed2 3628 if (value == 'p')
ab13f0c7
JH
3629 Perl_sv_catpvf(aTHX_ listsv,
3630 "+utf8::%.*s\n", (int)n, RExC_parse);
ffc61ed2 3631 else
ab13f0c7
JH
3632 Perl_sv_catpvf(aTHX_ listsv,
3633 "!utf8::%.*s\n", (int)n, RExC_parse);
ffc61ed2
JH
3634 }
3635 RExC_parse = e + 1;
3636 ANYOF_FLAGS(ret) |= ANYOF_UNICODE;
3637 continue;
b8c5462f
JH
3638 case 'n': value = '\n'; break;
3639 case 'r': value = '\r'; break;
3640 case 't': value = '\t'; break;
3641 case 'f': value = '\f'; break;
3642 case 'b': value = '\b'; break;
c7f1f016
NIS
3643 case 'e': value = ASCII_TO_NATIVE('\033');break;
3644 case 'a': value = ASCII_TO_NATIVE('\007');break;
b8c5462f 3645 case 'x':
ffc61ed2 3646 if (*RExC_parse == '{') {
a4c04bdc
NC
3647 I32 flags = PERL_SCAN_ALLOW_UNDERSCORES
3648 | PERL_SCAN_DISALLOW_PREFIX;
ffc61ed2 3649 e = strchr(RExC_parse++, '}');
b81d288d 3650 if (!e)
ffc61ed2 3651 vFAIL("Missing right brace on \\x{}");
53305cf1
NC
3652
3653 numlen = e - RExC_parse;
3654 value = grok_hex(RExC_parse, &numlen, &flags, NULL);
ffc61ed2
JH
3655 RExC_parse = e + 1;
3656 }
3657 else {
a4c04bdc 3658 I32 flags = PERL_SCAN_DISALLOW_PREFIX;
53305cf1
NC
3659 numlen = 2;
3660 value = grok_hex(RExC_parse, &numlen, &flags, NULL);
ffc61ed2
JH
3661 RExC_parse += numlen;
3662 }
b8c5462f
JH
3663 break;
3664 case 'c':
830247a4 3665 value = UCHARAT(RExC_parse++);
b8c5462f
JH
3666 value = toCTRL(value);
3667 break;
3668 case '0': case '1': case '2': case '3': case '4':
3669 case '5': case '6': case '7': case '8': case '9':
53305cf1
NC
3670 {
3671 I32 flags = 0;
3672 numlen = 3;
3673 value = grok_oct(--RExC_parse, &numlen, &flags, NULL);
830247a4 3674 RExC_parse += numlen;
b8c5462f 3675 break;
53305cf1 3676 }
1028017a 3677 default:
e476b1b5 3678 if (!SIZE_ONLY && ckWARN(WARN_REGEXP) && isALPHA(value))
ffc61ed2
JH
3679 vWARN2(RExC_parse,
3680 "Unrecognized escape \\%c in character class passed through",
3681 (int)value);
1028017a 3682 break;
b8c5462f 3683 }
ffc61ed2 3684 } /* end of \blah */
1b2d223b
JH
3685#ifdef EBCDIC
3686 else
3687 literal_endpoint++;
3688#endif
ffc61ed2
JH
3689
3690 if (namedclass > OOB_NAMEDCLASS) { /* this is a named class \blah */
3691
3692 if (!SIZE_ONLY && !need_class)
936ed897 3693 ANYOF_CLASS_ZERO(ret);
ffc61ed2 3694
936ed897 3695 need_class = 1;
ffc61ed2
JH
3696
3697 /* a bad range like a-\d, a-[:digit:] ? */
3698 if (range) {
73b437c8 3699 if (!SIZE_ONLY) {
e476b1b5 3700 if (ckWARN(WARN_REGEXP))
830247a4 3701 vWARN4(RExC_parse,
b45f050a 3702 "False [] range \"%*.*s\"",
830247a4
IZ
3703 RExC_parse - rangebegin,
3704 RExC_parse - rangebegin,
b45f050a 3705 rangebegin);
3568d838
JH
3706 if (prevvalue < 256) {
3707 ANYOF_BITMAP_SET(ret, prevvalue);
ffc61ed2
JH
3708 ANYOF_BITMAP_SET(ret, '-');
3709 }
3710 else {
3711 ANYOF_FLAGS(ret) |= ANYOF_UNICODE;
3712 Perl_sv_catpvf(aTHX_ listsv,
3568d838 3713 "%04"UVxf"\n%04"UVxf"\n", (UV)prevvalue, (UV) '-');
ffc61ed2 3714 }
b8c5462f 3715 }
ffc61ed2
JH
3716
3717 range = 0; /* this was not a true range */
73b437c8 3718 }
ffc61ed2 3719
73b437c8 3720 if (!SIZE_ONLY) {
3568d838
JH
3721 if (namedclass > OOB_NAMEDCLASS)
3722 optimize_invert = FALSE;
e2962f66
JH
3723 /* Possible truncation here but in some 64-bit environments
3724 * the compiler gets heartburn about switch on 64-bit values.
3725 * A similar issue a little earlier when switching on value.
98f323fa 3726 * --jhi */
e2962f66 3727 switch ((I32)namedclass) {
73b437c8
JH
3728 case ANYOF_ALNUM:
3729 if (LOC)
936ed897 3730 ANYOF_CLASS_SET(ret, ANYOF_ALNUM);
73b437c8
JH
3731 else {
3732 for (value = 0; value < 256; value++)
3733 if (isALNUM(value))
936ed897 3734 ANYOF_BITMAP_SET(ret, value);
73b437c8 3735 }
ffc61ed2 3736 Perl_sv_catpvf(aTHX_ listsv, "+utf8::IsWord\n");
73b437c8
JH
3737 break;
3738 case ANYOF_NALNUM:
3739 if (LOC)
936ed897 3740 ANYOF_CLASS_SET(ret, ANYOF_NALNUM);
73b437c8
JH
3741 else {
3742 for (value = 0; value < 256; value++)
3743 if (!isALNUM(value))
936ed897 3744 ANYOF_BITMAP_SET(ret, value);
73b437c8 3745 }
ffc61ed2 3746 Perl_sv_catpvf(aTHX_ listsv, "!utf8::IsWord\n");
73b437c8 3747 break;
ffc61ed2 3748 case ANYOF_ALNUMC:
73b437c8 3749 if (LOC)
ffc61ed2 3750 ANYOF_CLASS_SET(ret, ANYOF_ALNUMC);
73b437c8
JH
3751 else {
3752 for (value = 0; value < 256; value++)
ffc61ed2 3753 if (isALNUMC(value))
936ed897 3754 ANYOF_BITMAP_SET(ret, value);
73b437c8 3755 }
ffc61ed2 3756 Perl_sv_catpvf(aTHX_ listsv, "+utf8::IsAlnum\n");
73b437c8
JH
3757 break;
3758 case ANYOF_NALNUMC:
3759 if (LOC)
936ed897 3760 ANYOF_CLASS_SET(ret, ANYOF_NALNUMC);
73b437c8
JH
3761 else {
3762 for (value = 0; value < 256; value++)
3763 if (!isALNUMC(value))
936ed897 3764 ANYOF_BITMAP_SET(ret, value);
73b437c8 3765 }
ffc61ed2 3766 Perl_sv_catpvf(aTHX_ listsv, "!utf8::IsAlnum\n");
73b437c8
JH
3767 break;
3768 case ANYOF_ALPHA:
3769 if (LOC)
936ed897 3770 ANYOF_CLASS_SET(ret, ANYOF_ALPHA);
73b437c8
JH
3771 else {
3772 for (value = 0; value < 256; value++)
3773 if (isALPHA(value))
936ed897 3774 ANYOF_BITMAP_SET(ret, value);
73b437c8 3775 }
ffc61ed2 3776 Perl_sv_catpvf(aTHX_ listsv, "+utf8::IsAlpha\n");
73b437c8
JH
3777 break;
3778 case ANYOF_NALPHA:
3779 if (LOC)
936ed897 3780 ANYOF_CLASS_SET(ret, ANYOF_NALPHA);
73b437c8
JH
3781 else {
3782 for (value = 0; value < 256; value++)
3783 if (!isALPHA(value))
936ed897 3784 ANYOF_BITMAP_SET(ret, value);
73b437c8 3785 }
ffc61ed2 3786 Perl_sv_catpvf(aTHX_ listsv, "!utf8::IsAlpha\n");
73b437c8
JH
3787 break;
3788 case ANYOF_ASCII:
3789 if (LOC)
936ed897 3790 ANYOF_CLASS_SET(ret, ANYOF_ASCII);
73b437c8 3791 else {
c7f1f016 3792#ifndef EBCDIC
1ba5c669
JH
3793 for (value = 0; value < 128; value++)
3794 ANYOF_BITMAP_SET(ret, value);
3795#else /* EBCDIC */
ffbc6a93 3796 for (value = 0; value < 256; value++) {
3a3c4447
JH
3797 if (isASCII(value))
3798 ANYOF_BITMAP_SET(ret, value);
ffbc6a93 3799 }
1ba5c669 3800#endif /* EBCDIC */
73b437c8 3801 }
ffc61ed2 3802 Perl_sv_catpvf(aTHX_ listsv, "+utf8::IsASCII\n");
73b437c8
JH
3803 break;
3804 case ANYOF_NASCII:
3805 if (LOC)
936ed897 3806 ANYOF_CLASS_SET(ret, ANYOF_NASCII);
73b437c8 3807 else {
c7f1f016 3808#ifndef EBCDIC
1ba5c669
JH
3809 for (value = 128; value < 256; value++)
3810 ANYOF_BITMAP_SET(ret, value);
3811#else /* EBCDIC */
ffbc6a93 3812 for (value = 0; value < 256; value++) {
3a3c4447
JH
3813 if (!isASCII(value))
3814 ANYOF_BITMAP_SET(ret, value);
ffbc6a93 3815 }
1ba5c669 3816#endif /* EBCDIC */
73b437c8 3817 }
ffc61ed2 3818 Perl_sv_catpvf(aTHX_ listsv, "!utf8::IsASCII\n");
73b437c8 3819 break;
aaa51d5e
JF
3820 case ANYOF_BLANK:
3821 if (LOC)
3822 ANYOF_CLASS_SET(ret, ANYOF_BLANK);
3823 else {
3824 for (value = 0; value < 256; value++)
3825 if (isBLANK(value))
3826 ANYOF_BITMAP_SET(ret, value);
3827 }
ffc61ed2 3828 Perl_sv_catpvf(aTHX_ listsv, "+utf8::IsBlank\n");
aaa51d5e
JF
3829 break;
3830 case ANYOF_NBLANK:
3831 if (LOC)
3832 ANYOF_CLASS_SET(ret, ANYOF_NBLANK);
3833 else {
3834 for (value = 0; value < 256; value++)
3835 if (!isBLANK(value))
3836 ANYOF_BITMAP_SET(ret, value);
3837 }
ffc61ed2 3838 Perl_sv_catpvf(aTHX_ listsv, "!utf8::IsBlank\n");
aaa51d5e 3839 break;
73b437c8
JH
3840 case ANYOF_CNTRL:
3841 if (LOC)
936ed897 3842 ANYOF_CLASS_SET(ret, ANYOF_CNTRL);
73b437c8
JH
3843 else {
3844 for (value = 0; value < 256; value++)
3845 if (isCNTRL(value))
936ed897 3846 ANYOF_BITMAP_SET(ret, value);
73b437c8 3847 }
ffc61ed2 3848 Perl_sv_catpvf(aTHX_ listsv, "+utf8::IsCntrl\n");
73b437c8
JH
3849 break;
3850 case ANYOF_NCNTRL:
3851 if (LOC)
936ed897 3852 ANYOF_CLASS_SET(ret, ANYOF_NCNTRL);
73b437c8
JH
3853 else {
3854 for (value = 0; value < 256; value++)
3855 if (!isCNTRL(value))
936ed897 3856 ANYOF_BITMAP_SET(ret, value);
73b437c8 3857 }
ffc61ed2
JH
3858 Perl_sv_catpvf(aTHX_ listsv, "!utf8::IsCntrl\n");
3859 break;
3860 case ANYOF_DIGIT:
3861 if (LOC)
3862 ANYOF_CLASS_SET(ret, ANYOF_DIGIT);
3863 else {
3864 /* consecutive digits assumed */
3865 for (value = '0'; value <= '9'; value++)
3866 ANYOF_BITMAP_SET(ret, value);
3867 }
3868 Perl_sv_catpvf(aTHX_ listsv, "+utf8::IsDigit\n");
3869 break;
3870 case ANYOF_NDIGIT:
3871 if (LOC)
3872 ANYOF_CLASS_SET(ret, ANYOF_NDIGIT);
3873 else {
3874 /* consecutive digits assumed */
3875 for (value = 0; value < '0'; value++)
3876 ANYOF_BITMAP_SET(ret, value);
3877 for (value = '9' + 1; value < 256; value++)
3878 ANYOF_BITMAP_SET(ret, value);
3879 }
3880 Perl_sv_catpvf(aTHX_ listsv, "!utf8::IsDigit\n");
73b437c8
JH
3881 break;
3882 case ANYOF_GRAPH:
3883 if (LOC)
936ed897 3884 ANYOF_CLASS_SET(ret, ANYOF_GRAPH);
73b437c8
JH
3885 else {
3886 for (value = 0; value < 256; value++)
3887 if (isGRAPH(value))
936ed897 3888 ANYOF_BITMAP_SET(ret, value);
73b437c8 3889 }
ffc61ed2 3890 Perl_sv_catpvf(aTHX_ listsv, "+utf8::IsGraph\n");
73b437c8
JH
3891 break;
3892 case ANYOF_NGRAPH:
3893 if (LOC)
936ed897 3894 ANYOF_CLASS_SET(ret, ANYOF_NGRAPH);
73b437c8
JH
3895 else {
3896 for (value = 0; value < 256; value++)
3897 if (!isGRAPH(value))
936ed897 3898 ANYOF_BITMAP_SET(ret, value);
73b437c8 3899 }
ffc61ed2 3900 Perl_sv_catpvf(aTHX_ listsv, "!utf8::IsGraph\n");
73b437c8
JH
3901 break;
3902 case ANYOF_LOWER:
3903 if (LOC)
936ed897 3904 ANYOF_CLASS_SET(ret, ANYOF_LOWER);
73b437c8
JH
3905 else {
3906 for (value = 0; value < 256; value++)
3907 if (isLOWER(value))
936ed897 3908 ANYOF_BITMAP_SET(ret, value);
73b437c8 3909 }
ffc61ed2 3910 Perl_sv_catpvf(aTHX_ listsv, "+utf8::IsLower\n");
73b437c8
JH
3911 break;
3912 case ANYOF_NLOWER:
3913 if (LOC)
936ed897 3914 ANYOF_CLASS_SET(ret, ANYOF_NLOWER);
73b437c8
JH
3915 else {
3916 for (value = 0; value < 256; value++)
3917 if (!isLOWER(value))
936ed897 3918 ANYOF_BITMAP_SET(ret, value);
73b437c8 3919 }
ffc61ed2 3920 Perl_sv_catpvf(aTHX_ listsv, "!utf8::IsLower\n");
73b437c8
JH
3921 break;
3922 case ANYOF_PRINT:
3923 if (LOC)
936ed897 3924 ANYOF_CLASS_SET(ret, ANYOF_PRINT);
73b437c8
JH
3925 else {
3926 for (value = 0; value < 256; value++)
3927 if (isPRINT(value))
936ed897 3928 ANYOF_BITMAP_SET(ret, value);
73b437c8 3929 }
ffc61ed2 3930 Perl_sv_catpvf(aTHX_ listsv, "+utf8::IsPrint\n");
73b437c8
JH
3931 break;
3932 case ANYOF_NPRINT:
3933 if (LOC)
936ed897 3934 ANYOF_CLASS_SET(ret, ANYOF_NPRINT);
73b437c8
JH
3935 else {
3936 for (value = 0; value < 256; value++)
3937 if (!isPRINT(value))
936ed897 3938 ANYOF_BITMAP_SET(ret, value);
73b437c8 3939 }
ffc61ed2 3940 Perl_sv_catpvf(aTHX_ listsv, "!utf8::IsPrint\n");
73b437c8 3941 break;
aaa51d5e
JF
3942 case ANYOF_PSXSPC:
3943 if (LOC)
3944 ANYOF_CLASS_SET(ret, ANYOF_PSXSPC);
3945 else {
3946 for (value = 0; value < 256; value++)
3947 if (isPSXSPC(value))
3948 ANYOF_BITMAP_SET(ret, value);
3949 }
ffc61ed2 3950 Perl_sv_catpvf(aTHX_ listsv, "+utf8::IsSpace\n");
aaa51d5e
JF
3951 break;
3952 case ANYOF_NPSXSPC:
3953 if (LOC)
3954 ANYOF_CLASS_SET(ret, ANYOF_NPSXSPC);
3955 else {
3956 for (value = 0; value < 256; value++)
3957 if (!isPSXSPC(value))
3958 ANYOF_BITMAP_SET(ret, value);
3959 }
ffc61ed2 3960 Perl_sv_catpvf(aTHX_ listsv, "!utf8::IsSpace\n");
aaa51d5e 3961 break;
73b437c8
JH
3962 case ANYOF_PUNCT:
3963 if (LOC)
936ed897 3964 ANYOF_CLASS_SET(ret, ANYOF_PUNCT);
73b437c8
JH
3965 else {
3966 for (value = 0; value < 256; value++)
3967 if (isPUNCT(value))
936ed897 3968 ANYOF_BITMAP_SET(ret, value);
73b437c8 3969 }
ffc61ed2 3970 Perl_sv_catpvf(aTHX_ listsv, "+utf8::IsPunct\n");
73b437c8
JH
3971 break;
3972 case ANYOF_NPUNCT:
3973 if (LOC)
936ed897 3974 ANYOF_CLASS_SET(ret, ANYOF_NPUNCT);
73b437c8
JH
3975 else {
3976 for (value = 0; value < 256; value++)
3977 if (!isPUNCT(value))
936ed897 3978 ANYOF_BITMAP_SET(ret, value);
73b437c8 3979 }
ffc61ed2
JH
3980 Perl_sv_catpvf(aTHX_ listsv, "!utf8::IsPunct\n");
3981 break;
3982 case ANYOF_SPACE:
3983 if (LOC)
3984 ANYOF_CLASS_SET(ret, ANYOF_SPACE);
3985 else {
3986 for (value = 0; value < 256; value++)
3987 if (isSPACE(value))
3988 ANYOF_BITMAP_SET(ret, value);
3989 }
3990 Perl_sv_catpvf(aTHX_ listsv, "+utf8::IsSpacePerl\n");
3991 break;
3992 case ANYOF_NSPACE:
3993 if (LOC)
3994 ANYOF_CLASS_SET(ret, ANYOF_NSPACE);
3995 else {
3996 for (value = 0; value < 256; value++)
3997 if (!isSPACE(value))
3998 ANYOF_BITMAP_SET(ret, value);
3999 }
4000 Perl_sv_catpvf(aTHX_ listsv, "!utf8::IsSpacePerl\n");
73b437c8
JH
4001 break;
4002 case ANYOF_UPPER:
4003 if (LOC)
936ed897 4004 ANYOF_CLASS_SET(ret, ANYOF_UPPER);
73b437c8
JH
4005 else {
4006 for (value = 0; value < 256; value++)
4007 if (isUPPER(value))
936ed897 4008 ANYOF_BITMAP_SET(ret, value);
73b437c8 4009 }
ffc61ed2 4010 Perl_sv_catpvf(aTHX_ listsv, "+utf8::IsUpper\n");
73b437c8
JH
4011 break;
4012 case ANYOF_NUPPER:
4013 if (LOC)
936ed897 4014 ANYOF_CLASS_SET(ret, ANYOF_NUPPER);
73b437c8
JH
4015 else {
4016 for (value = 0; value < 256; value++)
4017 if (!isUPPER(value))
936ed897 4018 ANYOF_BITMAP_SET(ret, value);
73b437c8 4019 }
ffc61ed2 4020 Perl_sv_catpvf(aTHX_ listsv, "!utf8::IsUpper\n");
73b437c8
JH
4021 break;
4022 case ANYOF_XDIGIT:
4023 if (LOC)
936ed897 4024 ANYOF_CLASS_SET(ret, ANYOF_XDIGIT);
73b437c8
JH
4025 else {
4026 for (value = 0; value < 256; value++)
4027 if (isXDIGIT(value))
936ed897 4028 ANYOF_BITMAP_SET(ret, value);
73b437c8 4029 }
ffc61ed2 4030 Perl_sv_catpvf(aTHX_ listsv, "+utf8::IsXDigit\n");
73b437c8
JH
4031 break;
4032 case ANYOF_NXDIGIT:
4033 if (LOC)
936ed897 4034 ANYOF_CLASS_SET(ret, ANYOF_NXDIGIT);
73b437c8
JH
4035 else {
4036 for (value = 0; value < 256; value++)
4037 if (!isXDIGIT(value))
936ed897 4038 ANYOF_BITMAP_SET(ret, value);
73b437c8 4039 }
ffc61ed2 4040 Perl_sv_catpvf(aTHX_ listsv, "!utf8::IsXDigit\n");
73b437c8
JH
4041 break;
4042 default:
b45f050a 4043 vFAIL("Invalid [::] class");
73b437c8 4044 break;
b8c5462f 4045 }
b8c5462f 4046 if (LOC)
936ed897 4047 ANYOF_FLAGS(ret) |= ANYOF_CLASS;
73b437c8 4048 continue;
a0d0e21e 4049 }
ffc61ed2
JH
4050 } /* end of namedclass \blah */
4051
a0d0e21e 4052 if (range) {
eb160463 4053 if (prevvalue > (IV)value) /* b-a */ {
b45f050a 4054 Simple_vFAIL4("Invalid [] range \"%*.*s\"",
830247a4
IZ
4055 RExC_parse - rangebegin,
4056 RExC_parse - rangebegin,
b45f050a 4057 rangebegin);
3568d838 4058 range = 0; /* not a valid range */
73b437c8 4059 }
a0d0e21e
LW
4060 }
4061 else {
3568d838 4062 prevvalue = value; /* save the beginning of the range */
830247a4
IZ
4063 if (*RExC_parse == '-' && RExC_parse+1 < RExC_end &&
4064 RExC_parse[1] != ']') {
4065 RExC_parse++;
ffc61ed2
JH
4066
4067 /* a bad range like \w-, [:word:]- ? */
4068 if (namedclass > OOB_NAMEDCLASS) {
e476b1b5 4069 if (ckWARN(WARN_REGEXP))
830247a4 4070 vWARN4(RExC_parse,
b45f050a 4071 "False [] range \"%*.*s\"",
830247a4
IZ
4072 RExC_parse - rangebegin,
4073 RExC_parse - rangebegin,
b45f050a 4074 rangebegin);
73b437c8 4075 if (!SIZE_ONLY)
936ed897 4076 ANYOF_BITMAP_SET(ret, '-');
73b437c8 4077 } else
ffc61ed2
JH
4078 range = 1; /* yeah, it's a range! */
4079 continue; /* but do it the next time */
a0d0e21e 4080 }
a687059c 4081 }
ffc61ed2 4082
93733859 4083 /* now is the next time */
ae5c130c 4084 if (!SIZE_ONLY) {
3568d838
JH
4085 IV i;
4086
4087 if (prevvalue < 256) {
4088 IV ceilvalue = value < 256 ? value : 255;
4089
4090#ifdef EBCDIC
1b2d223b
JH
4091 /* In EBCDIC [\x89-\x91] should include
4092 * the \x8e but [i-j] should not. */
4093 if (literal_endpoint == 2 &&
4094 ((isLOWER(prevvalue) && isLOWER(ceilvalue)) ||
4095 (isUPPER(prevvalue) && isUPPER(ceilvalue))))
ffc61ed2 4096 {
3568d838
JH
4097 if (isLOWER(prevvalue)) {
4098 for (i = prevvalue; i <= ceilvalue; i++)
ffc61ed2
JH
4099 if (isLOWER(i))
4100 ANYOF_BITMAP_SET(ret, i);
4101 } else {
3568d838 4102 for (i = prevvalue; i <= ceilvalue; i++)
ffc61ed2
JH
4103 if (isUPPER(i))
4104 ANYOF_BITMAP_SET(ret, i);
4105 }
8ada0baa 4106 }
ffc61ed2 4107 else
8ada0baa 4108#endif
a5961de5
JH
4109 for (i = prevvalue; i <= ceilvalue; i++)
4110 ANYOF_BITMAP_SET(ret, i);
3568d838 4111 }
a5961de5 4112 if (value > 255 || UTF) {
b08decb7
JH
4113 UV prevnatvalue = NATIVE_TO_UNI(prevvalue);
4114 UV natvalue = NATIVE_TO_UNI(value);
4115
ffc61ed2 4116 ANYOF_FLAGS(ret) |= ANYOF_UNICODE;
b08decb7 4117 if (prevnatvalue < natvalue) { /* what about > ? */
ffc61ed2 4118 Perl_sv_catpvf(aTHX_ listsv, "%04"UVxf"\t%04"UVxf"\n",
b08decb7
JH
4119 prevnatvalue, natvalue);
4120 }
4121 else if (prevnatvalue == natvalue) {
4122 Perl_sv_catpvf(aTHX_ listsv, "%04"UVxf"\n", natvalue);
09091399 4123 if (FOLD) {
254ba52a
JH
4124 U8 foldbuf[UTF8_MAXLEN_FOLD+1];
4125 STRLEN foldlen;
2f3bf011 4126 UV f = to_uni_fold(natvalue, foldbuf, &foldlen);
254ba52a 4127
c840d2a2
JH
4128 /* If folding and foldable and a single
4129 * character, insert also the folded version
4130 * to the charclass. */
9e55ce06 4131 if (f != value) {
eb160463 4132 if (foldlen == (STRLEN)UNISKIP(f))
9e55ce06
JH
4133 Perl_sv_catpvf(aTHX_ listsv,
4134 "%04"UVxf"\n", f);
4135 else {
4136 /* Any multicharacter foldings
4137 * require the following transform:
4138 * [ABCDEF] -> (?:[ABCabcDEFd]|pq|rst)
4139 * where E folds into "pq" and F folds
4140 * into "rst", all other characters
4141 * fold to single characters. We save
4142 * away these multicharacter foldings,
4143 * to be later saved as part of the
4144 * additional "s" data. */
4145 SV *sv;
4146
4147 if (!unicode_alternate)
4148 unicode_alternate = newAV();
4149 sv = newSVpvn((char*)foldbuf, foldlen);
4150 SvUTF8_on(sv);
4151 av_push(unicode_alternate, sv);
4152 }
4153 }
254ba52a 4154
60a8b682
JH
4155 /* If folding and the value is one of the Greek
4156 * sigmas insert a few more sigmas to make the
4157 * folding rules of the sigmas to work right.
4158 * Note that not all the possible combinations
4159 * are handled here: some of them are handled
9e55ce06
JH
4160 * by the standard folding rules, and some of
4161 * them (literal or EXACTF cases) are handled
4162 * during runtime in regexec.c:S_find_byclass(). */
09091399
JH
4163 if (value == UNICODE_GREEK_SMALL_LETTER_FINAL_SIGMA) {
4164 Perl_sv_catpvf(aTHX_ listsv, "%04"UVxf"\n",
eb050b28 4165 (UV)UNICODE_GREEK_CAPITAL_LETTER_SIGMA);
09091399 4166 Perl_sv_catpvf(aTHX_ listsv, "%04"UVxf"\n",
eb050b28 4167 (UV)UNICODE_GREEK_SMALL_LETTER_SIGMA);
09091399
JH
4168 }
4169 else if (value == UNICODE_GREEK_CAPITAL_LETTER_SIGMA)
4170 Perl_sv_catpvf(aTHX_ listsv, "%04"UVxf"\n",
eb050b28 4171 (UV)UNICODE_GREEK_SMALL_LETTER_SIGMA);
09091399
JH
4172 }
4173 }
ffc61ed2 4174 }
1b2d223b
JH
4175#ifdef EBCDIC
4176 literal_endpoint = 0;
4177#endif
8ada0baa 4178 }
ffc61ed2
JH
4179
4180 range = 0; /* this range (if it was one) is done now */
a0d0e21e 4181 }
ffc61ed2 4182
936ed897 4183 if (need_class) {
4f66b38d 4184 ANYOF_FLAGS(ret) |= ANYOF_LARGE;
936ed897 4185 if (SIZE_ONLY)
830247a4 4186 RExC_size += ANYOF_CLASS_ADD_SKIP;
936ed897 4187 else
830247a4 4188 RExC_emit += ANYOF_CLASS_ADD_SKIP;
936ed897 4189 }
ffc61ed2 4190
ae5c130c 4191 /* optimize case-insensitive simple patterns (e.g. /[a-z]/i) */
b8c5462f 4192 if (!SIZE_ONLY &&
ffc61ed2 4193 /* If the only flag is folding (plus possibly inversion). */
516a5887
JH
4194 ((ANYOF_FLAGS(ret) & (ANYOF_FLAGS_ALL ^ ANYOF_INVERT)) == ANYOF_FOLD)
4195 ) {
a0ed51b3 4196 for (value = 0; value < 256; ++value) {
936ed897 4197 if (ANYOF_BITMAP_TEST(ret, value)) {
eb160463 4198 UV fold = PL_fold[value];
ffc61ed2
JH
4199
4200 if (fold != value)
4201 ANYOF_BITMAP_SET(ret, fold);
ae5c130c
GS
4202 }
4203 }
936ed897 4204 ANYOF_FLAGS(ret) &= ~ANYOF_FOLD;
ae5c130c 4205 }
ffc61ed2 4206
ae5c130c 4207 /* optimize inverted simple patterns (e.g. [^a-z]) */
3568d838 4208 if (!SIZE_ONLY && optimize_invert &&
ffc61ed2
JH
4209 /* If the only flag is inversion. */
4210 (ANYOF_FLAGS(ret) & ANYOF_FLAGS_ALL) == ANYOF_INVERT) {
b8c5462f 4211 for (value = 0; value < ANYOF_BITMAP_SIZE; ++value)
936ed897 4212 ANYOF_BITMAP(ret)[value] ^= ANYOF_FLAGS_ALL;
1aa99e6b 4213 ANYOF_FLAGS(ret) = ANYOF_UNICODE_ALL;
ae5c130c 4214 }
a0d0e21e 4215
b81d288d 4216 if (!SIZE_ONLY) {
fde631ed 4217 AV *av = newAV();
ffc61ed2
JH
4218 SV *rv;
4219
9e55ce06 4220 /* The 0th element stores the character class description
6a0407ee 4221 * in its textual form: used later (regexec.c:Perl_regclass_swash())
9e55ce06
JH
4222 * to initialize the appropriate swash (which gets stored in
4223 * the 1st element), and also useful for dumping the regnode.
4224 * The 2nd element stores the multicharacter foldings,
6a0407ee 4225 * used later (regexec.c:S_reginclass()). */
ffc61ed2
JH
4226 av_store(av, 0, listsv);
4227 av_store(av, 1, NULL);
9e55ce06 4228 av_store(av, 2, (SV*)unicode_alternate);
ffc61ed2 4229 rv = newRV_noinc((SV*)av);
19860706 4230 n = add_data(pRExC_state, 1, "s");
830247a4 4231 RExC_rx->data->data[n] = (void*)rv;
ffc61ed2 4232 ARG_SET(ret, n);
a0ed51b3
LW
4233 }
4234
4235 return ret;
4236}
4237
76e3520e 4238STATIC char*
830247a4 4239S_nextchar(pTHX_ RExC_state_t *pRExC_state)
a0d0e21e 4240{
830247a4 4241 char* retval = RExC_parse++;
a0d0e21e 4242
4633a7c4 4243 for (;;) {
830247a4
IZ
4244 if (*RExC_parse == '(' && RExC_parse[1] == '?' &&
4245 RExC_parse[2] == '#') {
4246 while (*RExC_parse && *RExC_parse != ')')
4247 RExC_parse++;
4248 RExC_parse++;
4633a7c4
LW
4249 continue;
4250 }
e2509266 4251 if (RExC_flags & PMf_EXTENDED) {
830247a4
IZ
4252 if (isSPACE(*RExC_parse)) {
4253 RExC_parse++;
748a9306
LW
4254 continue;
4255 }
830247a4
IZ
4256 else if (*RExC_parse == '#') {
4257 while (*RExC_parse && *RExC_parse != '\n')
4258 RExC_parse++;
4259 RExC_parse++;
748a9306
LW
4260 continue;
4261 }
748a9306 4262 }
4633a7c4 4263 return retval;
a0d0e21e 4264 }
a687059c
LW
4265}
4266
4267/*
c277df42 4268- reg_node - emit a node
a0d0e21e 4269*/
76e3520e 4270STATIC regnode * /* Location. */
830247a4 4271S_reg_node(pTHX_ RExC_state_t *pRExC_state, U8 op)
a687059c 4272{
c277df42
IZ
4273 register regnode *ret;
4274 register regnode *ptr;
a687059c 4275
830247a4 4276 ret = RExC_emit;
c277df42 4277 if (SIZE_ONLY) {
830247a4
IZ
4278 SIZE_ALIGN(RExC_size);
4279 RExC_size += 1;
a0d0e21e
LW
4280 return(ret);
4281 }
a687059c 4282
c277df42 4283 NODE_ALIGN_FILL(ret);
a0d0e21e 4284 ptr = ret;
c277df42 4285 FILL_ADVANCE_NODE(ptr, op);
fac92740 4286 if (RExC_offsets) { /* MJD */
ccb2c380 4287 MJD_OFFSET_DEBUG(("%s:%u: (op %s) %s %u <- %u (len %u) (max %u).\n",
fac92740
MJD
4288 "reg_node", __LINE__,
4289 reg_name[op],
4290 RExC_emit - RExC_emit_start > RExC_offsets[0]
4291 ? "Overwriting end of array!\n" : "OK",
4292 RExC_emit - RExC_emit_start,
4293 RExC_parse - RExC_start,
4294 RExC_offsets[0]));
ccb2c380 4295 Set_Node_Offset(RExC_emit, RExC_parse + (op == END));
fac92740
MJD
4296 }
4297
830247a4 4298 RExC_emit = ptr;
a687059c 4299
a0d0e21e 4300 return(ret);
a687059c
LW
4301}
4302
4303/*
a0d0e21e
LW
4304- reganode - emit a node with an argument
4305*/
76e3520e 4306STATIC regnode * /* Location. */
830247a4 4307S_reganode(pTHX_ RExC_state_t *pRExC_state, U8 op, U32 arg)
fe14fcc3 4308{
c277df42
IZ
4309 register regnode *ret;
4310 register regnode *ptr;
fe14fcc3 4311
830247a4 4312 ret = RExC_emit;
c277df42 4313 if (SIZE_ONLY) {
830247a4
IZ
4314 SIZE_ALIGN(RExC_size);
4315 RExC_size += 2;
a0d0e21e
LW
4316 return(ret);
4317 }
fe14fcc3 4318
c277df42 4319 NODE_ALIGN_FILL(ret);
a0d0e21e 4320 ptr = ret;
c277df42 4321 FILL_ADVANCE_NODE_ARG(ptr, op, arg);
fac92740 4322 if (RExC_offsets) { /* MJD */
ccb2c380 4323 MJD_OFFSET_DEBUG(("%s(%d): (op %s) %s %u <- %u (max %u).\n",
fac92740 4324 "reganode",
ccb2c380
MP
4325 __LINE__,
4326 reg_name[op],
fac92740
MJD
4327 RExC_emit - RExC_emit_start > RExC_offsets[0] ?
4328 "Overwriting end of array!\n" : "OK",
4329 RExC_emit - RExC_emit_start,
4330 RExC_parse - RExC_start,
4331 RExC_offsets[0]));
ccb2c380 4332 Set_Cur_Node_Offset;
fac92740
MJD
4333 }
4334
830247a4 4335 RExC_emit = ptr;
fe14fcc3 4336
a0d0e21e 4337 return(ret);
fe14fcc3
LW
4338}
4339
4340/*
cd439c50 4341- reguni - emit (if appropriate) a Unicode character
a0ed51b3
LW
4342*/
4343STATIC void
830247a4 4344S_reguni(pTHX_ RExC_state_t *pRExC_state, UV uv, char* s, STRLEN* lenp)
a0ed51b3 4345{
5e12f4fb 4346 *lenp = SIZE_ONLY ? UNISKIP(uv) : (uvchr_to_utf8((U8*)s, uv) - (U8*)s);
a0ed51b3
LW
4347}
4348
4349/*
a0d0e21e
LW
4350- reginsert - insert an operator in front of already-emitted operand
4351*
4352* Means relocating the operand.
4353*/
76e3520e 4354STATIC void
830247a4 4355S_reginsert(pTHX_ RExC_state_t *pRExC_state, U8 op, regnode *opnd)
a687059c 4356{
c277df42
IZ
4357 register regnode *src;
4358 register regnode *dst;
4359 register regnode *place;
4360 register int offset = regarglen[(U8)op];
b81d288d 4361
22c35a8c 4362/* (PL_regkind[(U8)op] == CURLY ? EXTRA_STEP_2ARGS : 0); */
c277df42
IZ
4363
4364 if (SIZE_ONLY) {
830247a4 4365 RExC_size += NODE_STEP_REGNODE + offset;
a0d0e21e
LW
4366 return;
4367 }
a687059c 4368
830247a4
IZ
4369 src = RExC_emit;
4370 RExC_emit += NODE_STEP_REGNODE + offset;
4371 dst = RExC_emit;
fac92740 4372 while (src > opnd) {
c277df42 4373 StructCopy(--src, --dst, regnode);
fac92740 4374 if (RExC_offsets) { /* MJD 20010112 */
ccb2c380 4375 MJD_OFFSET_DEBUG(("%s(%d): (op %s) %s copy %u -> %u (max %u).\n",
fac92740 4376 "reg_insert",
ccb2c380
MP
4377 __LINE__,
4378 reg_name[op],
fac92740
MJD
4379 dst - RExC_emit_start > RExC_offsets[0]
4380 ? "Overwriting end of array!\n" : "OK",
4381 src - RExC_emit_start,
4382 dst - RExC_emit_start,
4383 RExC_offsets[0]));
ccb2c380
MP
4384 Set_Node_Offset_To_R(dst-RExC_emit_start, Node_Offset(src));
4385 Set_Node_Length_To_R(dst-RExC_emit_start, Node_Length(src));
fac92740
MJD
4386 }
4387 }
4388
a0d0e21e
LW
4389
4390 place = opnd; /* Op node, where operand used to be. */
fac92740 4391 if (RExC_offsets) { /* MJD */
ccb2c380 4392 MJD_OFFSET_DEBUG(("%s(%d): (op %s) %s %u <- %u (max %u).\n",
fac92740 4393 "reginsert",
ccb2c380
MP
4394 __LINE__,
4395 reg_name[op],
fac92740
MJD
4396 place - RExC_emit_start > RExC_offsets[0]
4397 ? "Overwriting end of array!\n" : "OK",
4398 place - RExC_emit_start,
4399 RExC_parse - RExC_start,
4400 RExC_offsets[0]));
ccb2c380 4401 Set_Node_Offset(place, RExC_parse);
fac92740 4402 }
c277df42
IZ
4403 src = NEXTOPER(place);
4404 FILL_ADVANCE_NODE(place, op);
4405 Zero(src, offset, regnode);
a687059c
LW
4406}
4407
4408/*
c277df42 4409- regtail - set the next-pointer at the end of a node chain of p to val.
a0d0e21e 4410*/
76e3520e 4411STATIC void
830247a4 4412S_regtail(pTHX_ RExC_state_t *pRExC_state, regnode *p, regnode *val)
a687059c 4413{
c277df42
IZ
4414 register regnode *scan;
4415 register regnode *temp;
a0d0e21e 4416
c277df42 4417 if (SIZE_ONLY)
a0d0e21e
LW
4418 return;
4419
4420 /* Find last node. */
4421 scan = p;
4422 for (;;) {
4423 temp = regnext(scan);
4424 if (temp == NULL)
4425 break;
4426 scan = temp;
4427 }
a687059c 4428
c277df42
IZ
4429 if (reg_off_by_arg[OP(scan)]) {
4430 ARG_SET(scan, val - scan);
a0ed51b3
LW
4431 }
4432 else {
c277df42
IZ
4433 NEXT_OFF(scan) = val - scan;
4434 }
a687059c
LW
4435}
4436
4437/*
a0d0e21e
LW
4438- regoptail - regtail on operand of first argument; nop if operandless
4439*/
76e3520e 4440STATIC void
830247a4 4441S_regoptail(pTHX_ RExC_state_t *pRExC_state, regnode *p, regnode *val)
a687059c 4442{
a0d0e21e 4443 /* "Operandless" and "op != BRANCH" are synonymous in practice. */
c277df42
IZ
4444 if (p == NULL || SIZE_ONLY)
4445 return;
22c35a8c 4446 if (PL_regkind[(U8)OP(p)] == BRANCH) {
830247a4 4447 regtail(pRExC_state, NEXTOPER(p), val);
a0ed51b3 4448 }
22c35a8c 4449 else if ( PL_regkind[(U8)OP(p)] == BRANCHJ) {
830247a4 4450 regtail(pRExC_state, NEXTOPER(NEXTOPER(p)), val);
a0ed51b3
LW
4451 }
4452 else
a0d0e21e 4453 return;
a687059c
LW
4454}
4455
4456/*
4457 - regcurly - a little FSA that accepts {\d+,?\d*}
4458 */
79072805 4459STATIC I32
cea2e8a9 4460S_regcurly(pTHX_ register char *s)
a687059c
LW
4461{
4462 if (*s++ != '{')
4463 return FALSE;
f0fcb552 4464 if (!isDIGIT(*s))
a687059c 4465 return FALSE;
f0fcb552 4466 while (isDIGIT(*s))
a687059c
LW
4467 s++;
4468 if (*s == ',')
4469 s++;
f0fcb552 4470 while (isDIGIT(*s))
a687059c
LW
4471 s++;
4472 if (*s != '}')
4473 return FALSE;
4474 return TRUE;
4475}
4476
a687059c 4477
8fa7f367
JH
4478#ifdef DEBUGGING
4479
76e3520e 4480STATIC regnode *
cea2e8a9 4481S_dumpuntil(pTHX_ regnode *start, regnode *node, regnode *last, SV* sv, I32 l)
c277df42 4482{
f248d071 4483 register U8 op = EXACT; /* Arbitrary non-END op. */
155aba94 4484 register regnode *next;
c277df42
IZ
4485
4486 while (op != END && (!last || node < last)) {
4487 /* While that wasn't END last time... */
4488
4489 NODE_ALIGN(node);
4490 op = OP(node);
4491 if (op == CLOSE)
4492 l--;
4493 next = regnext(node);
4494 /* Where, what. */
4495 if (OP(node) == OPTIMIZED)
4496 goto after_print;
4497 regprop(sv, node);
b900a521 4498 PerlIO_printf(Perl_debug_log, "%4"IVdf":%*s%s", (IV)(node - start),
f1dbda3d 4499 (int)(2*l + 1), "", SvPVX(sv));
c277df42
IZ
4500 if (next == NULL) /* Next ptr. */
4501 PerlIO_printf(Perl_debug_log, "(0)");
b81d288d 4502 else
b900a521 4503 PerlIO_printf(Perl_debug_log, "(%"IVdf")", (IV)(next - start));
c277df42
IZ
4504 (void)PerlIO_putc(Perl_debug_log, '\n');
4505 after_print:
22c35a8c 4506 if (PL_regkind[(U8)op] == BRANCHJ) {
b81d288d
AB
4507 register regnode *nnode = (OP(next) == LONGJMP
4508 ? regnext(next)
c277df42
IZ
4509 : next);
4510 if (last && nnode > last)
4511 nnode = last;
4512 node = dumpuntil(start, NEXTOPER(NEXTOPER(node)), nnode, sv, l + 1);
a0ed51b3 4513 }
22c35a8c 4514 else if (PL_regkind[(U8)op] == BRANCH) {
c277df42 4515 node = dumpuntil(start, NEXTOPER(node), next, sv, l + 1);
a0ed51b3
LW
4516 }
4517 else if ( op == CURLY) { /* `next' might be very big: optimizer */
c277df42
IZ
4518 node = dumpuntil(start, NEXTOPER(node) + EXTRA_STEP_2ARGS,
4519 NEXTOPER(node) + EXTRA_STEP_2ARGS + 1, sv, l + 1);
a0ed51b3 4520 }
22c35a8c 4521 else if (PL_regkind[(U8)op] == CURLY && op != CURLYX) {
c277df42
IZ
4522 node = dumpuntil(start, NEXTOPER(node) + EXTRA_STEP_2ARGS,
4523 next, sv, l + 1);
a0ed51b3
LW
4524 }
4525 else if ( op == PLUS || op == STAR) {
c277df42 4526 node = dumpuntil(start, NEXTOPER(node), NEXTOPER(node) + 1, sv, l + 1);
a0ed51b3
LW
4527 }
4528 else if (op == ANYOF) {
4f66b38d
HS
4529 /* arglen 1 + class block */
4530 node += 1 + ((ANYOF_FLAGS(node) & ANYOF_LARGE)
4531 ? ANYOF_CLASS_SKIP : ANYOF_SKIP);
4532 node = NEXTOPER(node);
a0ed51b3 4533 }
22c35a8c 4534 else if (PL_regkind[(U8)op] == EXACT) {
c277df42 4535 /* Literal string, where present. */
cd439c50 4536 node += NODE_SZ_STR(node) - 1;
c277df42 4537 node = NEXTOPER(node);
a0ed51b3
LW
4538 }
4539 else {
c277df42
IZ
4540 node = NEXTOPER(node);
4541 node += regarglen[(U8)op];
4542 }
4543 if (op == CURLYX || op == OPEN)
4544 l++;
4545 else if (op == WHILEM)
4546 l--;
4547 }
4548 return node;
4549}
4550
8fa7f367
JH
4551#endif /* DEBUGGING */
4552
a687059c 4553/*
fd181c75 4554 - regdump - dump a regexp onto Perl_debug_log in vaguely comprehensible form
a687059c
LW
4555 */
4556void
864dbfa3 4557Perl_regdump(pTHX_ regexp *r)
a687059c 4558{
35ff7856 4559#ifdef DEBUGGING
46fc3d4c 4560 SV *sv = sv_newmortal();
a687059c 4561
c277df42 4562 (void)dumpuntil(r->program, r->program + 1, NULL, sv, 0);
a0d0e21e
LW
4563
4564 /* Header fields of interest. */
c277df42 4565 if (r->anchored_substr)
7b0972df 4566 PerlIO_printf(Perl_debug_log,
b81d288d 4567 "anchored `%s%.*s%s'%s at %"IVdf" ",
3280af22 4568 PL_colors[0],
7b0972df 4569 (int)(SvCUR(r->anchored_substr) - (SvTAIL(r->anchored_substr)!=0)),
b81d288d 4570 SvPVX(r->anchored_substr),
3280af22 4571 PL_colors[1],
c277df42 4572 SvTAIL(r->anchored_substr) ? "$" : "",
7b0972df 4573 (IV)r->anchored_offset);
33b8afdf
JH
4574 else if (r->anchored_utf8)
4575 PerlIO_printf(Perl_debug_log,
4576 "anchored utf8 `%s%.*s%s'%s at %"IVdf" ",
4577 PL_colors[0],
4578 (int)(SvCUR(r->anchored_utf8) - (SvTAIL(r->anchored_utf8)!=0)),
4579 SvPVX(r->anchored_utf8),
4580 PL_colors[1],
4581 SvTAIL(r->anchored_utf8) ? "$" : "",
4582 (IV)r->anchored_offset);
c277df42 4583 if (r->float_substr)
7b0972df 4584 PerlIO_printf(Perl_debug_log,
b81d288d 4585 "floating `%s%.*s%s'%s at %"IVdf"..%"UVuf" ",
3280af22 4586 PL_colors[0],
b81d288d 4587 (int)(SvCUR(r->float_substr) - (SvTAIL(r->float_substr)!=0)),
2c2d71f5 4588 SvPVX(r->float_substr),
3280af22 4589 PL_colors[1],
c277df42 4590 SvTAIL(r->float_substr) ? "$" : "",
7b0972df 4591 (IV)r->float_min_offset, (UV)r->float_max_offset);
33b8afdf
JH
4592 else if (r->float_utf8)
4593 PerlIO_printf(Perl_debug_log,
4594 "floating utf8 `%s%.*s%s'%s at %"IVdf"..%"UVuf" ",
4595 PL_colors[0],
4596 (int)(SvCUR(r->float_utf8) - (SvTAIL(r->float_utf8)!=0)),
4597 SvPVX(r->float_utf8),
4598 PL_colors[1],
4599 SvTAIL(r->float_utf8) ? "$" : "",
4600 (IV)r->float_min_offset, (UV)r->float_max_offset);
4601 if (r->check_substr || r->check_utf8)
b81d288d
AB
4602 PerlIO_printf(Perl_debug_log,
4603 r->check_substr == r->float_substr
33b8afdf 4604 && r->check_utf8 == r->float_utf8
c277df42
IZ
4605 ? "(checking floating" : "(checking anchored");
4606 if (r->reganch & ROPT_NOSCAN)
4607 PerlIO_printf(Perl_debug_log, " noscan");
4608 if (r->reganch & ROPT_CHECK_ALL)
4609 PerlIO_printf(Perl_debug_log, " isall");
33b8afdf 4610 if (r->check_substr || r->check_utf8)
c277df42
IZ
4611 PerlIO_printf(Perl_debug_log, ") ");
4612
46fc3d4c 4613 if (r->regstclass) {
4614 regprop(sv, r->regstclass);
4615 PerlIO_printf(Perl_debug_log, "stclass `%s' ", SvPVX(sv));
4616 }
774d564b 4617 if (r->reganch & ROPT_ANCH) {
4618 PerlIO_printf(Perl_debug_log, "anchored");
4619 if (r->reganch & ROPT_ANCH_BOL)
4620 PerlIO_printf(Perl_debug_log, "(BOL)");
c277df42
IZ
4621 if (r->reganch & ROPT_ANCH_MBOL)
4622 PerlIO_printf(Perl_debug_log, "(MBOL)");
cad2e5aa
JH
4623 if (r->reganch & ROPT_ANCH_SBOL)
4624 PerlIO_printf(Perl_debug_log, "(SBOL)");
774d564b 4625 if (r->reganch & ROPT_ANCH_GPOS)
4626 PerlIO_printf(Perl_debug_log, "(GPOS)");
4627 PerlIO_putc(Perl_debug_log, ' ');
4628 }
c277df42
IZ
4629 if (r->reganch & ROPT_GPOS_SEEN)
4630 PerlIO_printf(Perl_debug_log, "GPOS ");
a0d0e21e 4631 if (r->reganch & ROPT_SKIP)
760ac839 4632 PerlIO_printf(Perl_debug_log, "plus ");
a0d0e21e 4633 if (r->reganch & ROPT_IMPLICIT)
760ac839 4634 PerlIO_printf(Perl_debug_log, "implicit ");
760ac839 4635 PerlIO_printf(Perl_debug_log, "minlen %ld ", (long) r->minlen);
ce862d02
IZ
4636 if (r->reganch & ROPT_EVAL_SEEN)
4637 PerlIO_printf(Perl_debug_log, "with eval ");
760ac839 4638 PerlIO_printf(Perl_debug_log, "\n");
fac92740
MJD
4639 if (r->offsets) {
4640 U32 i;
4641 U32 len = r->offsets[0];
392fbf5d 4642 PerlIO_printf(Perl_debug_log, "Offsets: [%"UVuf"]\n\t", (UV)r->offsets[0]);
fac92740 4643 for (i = 1; i <= len; i++)
392fbf5d
RB
4644 PerlIO_printf(Perl_debug_log, "%"UVuf"[%"UVuf"] ",
4645 (UV)r->offsets[i*2-1],
4646 (UV)r->offsets[i*2]);
fac92740
MJD
4647 PerlIO_printf(Perl_debug_log, "\n");
4648 }
17c3b450 4649#endif /* DEBUGGING */
a687059c
LW
4650}
4651
8fa7f367
JH
4652#ifdef DEBUGGING
4653
653099ff
GS
4654STATIC void
4655S_put_byte(pTHX_ SV *sv, int c)
4656{
7be5a6cf 4657 if (isCNTRL(c) || c == 255 || !isPRINT(c))
653099ff
GS
4658 Perl_sv_catpvf(aTHX_ sv, "\\%o", c);
4659 else if (c == '-' || c == ']' || c == '\\' || c == '^')
4660 Perl_sv_catpvf(aTHX_ sv, "\\%c", c);
4661 else
4662 Perl_sv_catpvf(aTHX_ sv, "%c", c);
4663}
4664
8fa7f367
JH
4665#endif /* DEBUGGING */
4666
a687059c 4667/*
a0d0e21e
LW
4668- regprop - printable representation of opcode
4669*/
46fc3d4c 4670void
864dbfa3 4671Perl_regprop(pTHX_ SV *sv, regnode *o)
a687059c 4672{
35ff7856 4673#ifdef DEBUGGING
9b155405 4674 register int k;
a0d0e21e 4675
54dc92de 4676 sv_setpvn(sv, "", 0);
9b155405 4677 if (OP(o) >= reg_num) /* regnode.type is unsigned */
830247a4
IZ
4678 /* It would be nice to FAIL() here, but this may be called from
4679 regexec.c, and it would be hard to supply pRExC_state. */
4680 Perl_croak(aTHX_ "Corrupted regexp opcode");
9b155405
IZ
4681 sv_catpv(sv, (char*)reg_name[OP(o)]); /* Take off const! */
4682
4683 k = PL_regkind[(U8)OP(o)];
4684
2a782b5b
JH
4685 if (k == EXACT) {
4686 SV *dsv = sv_2mortal(newSVpvn("", 0));
c728cb41
JH
4687 /* Using is_utf8_string() is a crude hack but it may
4688 * be the best for now since we have no flag "this EXACTish
4689 * node was UTF-8" --jhi */
4690 bool do_utf8 = is_utf8_string((U8*)STRING(o), STR_LEN(o));
8a989385 4691 char *s = do_utf8 ?
c728cb41
JH
4692 pv_uni_display(dsv, (U8*)STRING(o), STR_LEN(o), 60,
4693 UNI_DISPLAY_REGEX) :
2a782b5b 4694 STRING(o);
40eddc46 4695 int len = do_utf8 ?
2a782b5b
JH
4696 strlen(s) :
4697 STR_LEN(o);
4698 Perl_sv_catpvf(aTHX_ sv, " <%s%.*s%s>",
4699 PL_colors[0],
4700 len, s,
4701 PL_colors[1]);
4702 }
9b155405 4703 else if (k == CURLY) {
cb434fcc 4704 if (OP(o) == CURLYM || OP(o) == CURLYN || OP(o) == CURLYX)
cea2e8a9
GS
4705 Perl_sv_catpvf(aTHX_ sv, "[%d]", o->flags); /* Parenth number */
4706 Perl_sv_catpvf(aTHX_ sv, " {%d,%d}", ARG1(o), ARG2(o));
a0d0e21e 4707 }
2c2d71f5
JH
4708 else if (k == WHILEM && o->flags) /* Ordinal/of */
4709 Perl_sv_catpvf(aTHX_ sv, "[%d/%d]", o->flags & 0xf, o->flags>>4);
9b155405 4710 else if (k == REF || k == OPEN || k == CLOSE || k == GROUPP )
894356b3 4711 Perl_sv_catpvf(aTHX_ sv, "%d", (int)ARG(o)); /* Parenth number */
9b155405 4712 else if (k == LOGICAL)
04ebc1ab 4713 Perl_sv_catpvf(aTHX_ sv, "[%d]", o->flags); /* 2: embedded, otherwise 1 */
653099ff
GS
4714 else if (k == ANYOF) {
4715 int i, rangestart = -1;
ffc61ed2 4716 U8 flags = ANYOF_FLAGS(o);
a6d05634 4717 const char * const anyofs[] = { /* Should be synchronized with
19860706 4718 * ANYOF_ #xdefines in regcomp.h */
653099ff
GS
4719 "\\w",
4720 "\\W",
4721 "\\s",
4722 "\\S",
4723 "\\d",
4724 "\\D",
4725 "[:alnum:]",
4726 "[:^alnum:]",
4727 "[:alpha:]",
4728 "[:^alpha:]",
4729 "[:ascii:]",
4730 "[:^ascii:]",
4731 "[:ctrl:]",
4732 "[:^ctrl:]",
4733 "[:graph:]",
4734 "[:^graph:]",
4735 "[:lower:]",
4736 "[:^lower:]",
4737 "[:print:]",
4738 "[:^print:]",
4739 "[:punct:]",
4740 "[:^punct:]",
4741 "[:upper:]",
aaa51d5e 4742 "[:^upper:]",
653099ff 4743 "[:xdigit:]",
aaa51d5e
JF
4744 "[:^xdigit:]",
4745 "[:space:]",
4746 "[:^space:]",
4747 "[:blank:]",
4748 "[:^blank:]"
653099ff
GS
4749 };
4750
19860706 4751 if (flags & ANYOF_LOCALE)
653099ff 4752 sv_catpv(sv, "{loc}");
19860706 4753 if (flags & ANYOF_FOLD)
653099ff
GS
4754 sv_catpv(sv, "{i}");
4755 Perl_sv_catpvf(aTHX_ sv, "[%s", PL_colors[0]);
19860706 4756 if (flags & ANYOF_INVERT)
653099ff 4757 sv_catpv(sv, "^");
ffc61ed2
JH
4758 for (i = 0; i <= 256; i++) {
4759 if (i < 256 && ANYOF_BITMAP_TEST(o,i)) {
4760 if (rangestart == -1)
4761 rangestart = i;
4762 } else if (rangestart != -1) {
4763 if (i <= rangestart + 3)
4764 for (; rangestart < i; rangestart++)
653099ff 4765 put_byte(sv, rangestart);
ffc61ed2
JH
4766 else {
4767 put_byte(sv, rangestart);
4768 sv_catpv(sv, "-");
4769 put_byte(sv, i - 1);
653099ff 4770 }
ffc61ed2 4771 rangestart = -1;
653099ff 4772 }
847a199f 4773 }
ffc61ed2
JH
4774
4775 if (o->flags & ANYOF_CLASS)
4776 for (i = 0; i < sizeof(anyofs)/sizeof(char*); i++)
4777 if (ANYOF_CLASS_TEST(o,i))
4778 sv_catpv(sv, anyofs[i]);
4779
4780 if (flags & ANYOF_UNICODE)
4781 sv_catpv(sv, "{unicode}");
1aa99e6b 4782 else if (flags & ANYOF_UNICODE_ALL)
2a782b5b 4783 sv_catpv(sv, "{unicode_all}");
ffc61ed2
JH
4784
4785 {
4786 SV *lv;
9e55ce06 4787 SV *sw = regclass_swash(o, FALSE, &lv, 0);
b81d288d 4788
ffc61ed2
JH
4789 if (lv) {
4790 if (sw) {
ffc61ed2 4791 U8 s[UTF8_MAXLEN+1];
b81d288d 4792
ffc61ed2 4793 for (i = 0; i <= 256; i++) { /* just the first 256 */
2b9d42f0 4794 U8 *e = uvchr_to_utf8(s, i);
ffc61ed2 4795
3568d838 4796 if (i < 256 && swash_fetch(sw, s, TRUE)) {
ffc61ed2
JH
4797 if (rangestart == -1)
4798 rangestart = i;
4799 } else if (rangestart != -1) {
4800 U8 *p;
b81d288d 4801
ffc61ed2
JH
4802 if (i <= rangestart + 3)
4803 for (; rangestart < i; rangestart++) {
2b9d42f0 4804 for(e = uvchr_to_utf8(s, rangestart), p = s; p < e; p++)
ffc61ed2
JH
4805 put_byte(sv, *p);
4806 }
4807 else {
2b9d42f0 4808 for (e = uvchr_to_utf8(s, rangestart), p = s; p < e; p++)
ffc61ed2
JH
4809 put_byte(sv, *p);
4810 sv_catpv(sv, "-");
2b9d42f0 4811 for (e = uvchr_to_utf8(s, i - 1), p = s; p < e; p++)
ffc61ed2
JH
4812 put_byte(sv, *p);
4813 }
4814 rangestart = -1;
4815 }
19860706 4816 }
ffc61ed2
JH
4817
4818 sv_catpv(sv, "..."); /* et cetera */
19860706 4819 }
fde631ed 4820
ffc61ed2
JH
4821 {
4822 char *s = savepv(SvPVX(lv));
4823 char *origs = s;
b81d288d 4824
ffc61ed2 4825 while(*s && *s != '\n') s++;
b81d288d 4826
ffc61ed2
JH
4827 if (*s == '\n') {
4828 char *t = ++s;
4829
4830 while (*s) {
4831 if (*s == '\n')
4832 *s = ' ';
4833 s++;
4834 }
4835 if (s[-1] == ' ')
4836 s[-1] = 0;
4837
4838 sv_catpv(sv, t);
fde631ed 4839 }
b81d288d 4840
ffc61ed2 4841 Safefree(origs);
fde631ed
JH
4842 }
4843 }
653099ff 4844 }
ffc61ed2 4845
653099ff
GS
4846 Perl_sv_catpvf(aTHX_ sv, "%s]", PL_colors[1]);
4847 }
9b155405 4848 else if (k == BRANCHJ && (OP(o) == UNLESSM || OP(o) == IFMATCH))
cea2e8a9 4849 Perl_sv_catpvf(aTHX_ sv, "[-%d]", o->flags);
17c3b450 4850#endif /* DEBUGGING */
35ff7856 4851}
a687059c 4852
cad2e5aa
JH
4853SV *
4854Perl_re_intuit_string(pTHX_ regexp *prog)
4855{ /* Assume that RE_INTUIT is set */
4856 DEBUG_r(
4857 { STRLEN n_a;
33b8afdf
JH
4858 char *s = SvPV(prog->check_substr
4859 ? prog->check_substr : prog->check_utf8, n_a);
cad2e5aa
JH
4860
4861 if (!PL_colorset) reginitcolors();
4862 PerlIO_printf(Perl_debug_log,
33b8afdf
JH
4863 "%sUsing REx %ssubstr:%s `%s%.60s%s%s'\n",
4864 PL_colors[4],
4865 prog->check_substr ? "" : "utf8 ",
4866 PL_colors[5],PL_colors[0],
cad2e5aa
JH
4867 s,
4868 PL_colors[1],
4869 (strlen(s) > 60 ? "..." : ""));
4870 } );
4871
33b8afdf 4872 return prog->check_substr ? prog->check_substr : prog->check_utf8;
cad2e5aa
JH
4873}
4874
2b69d0c2 4875void
864dbfa3 4876Perl_pregfree(pTHX_ struct regexp *r)
a687059c 4877{
9e55ce06
JH
4878#ifdef DEBUGGING
4879 SV *dsv = PERL_DEBUG_PAD_ZERO(0);
4880#endif
7821416a
IZ
4881
4882 if (!r || (--r->refcnt > 0))
4883 return;
9e55ce06 4884 DEBUG_r({
d103360b
HS
4885 int len;
4886 char *s;
4887
4888 s = (r->reganch & ROPT_UTF8) ? pv_uni_display(dsv, (U8*)r->precomp,
4889 r->prelen, 60, UNI_DISPLAY_REGEX)
9f369894 4890 : pv_display(dsv, r->precomp, r->prelen, 0, 60);
d103360b 4891 len = SvCUR(dsv);
9e55ce06
JH
4892 if (!PL_colorset)
4893 reginitcolors();
4894 PerlIO_printf(Perl_debug_log,
4895 "%sFreeing REx:%s `%s%*.*s%s%s'\n",
4896 PL_colors[4],PL_colors[5],PL_colors[0],
4897 len, len, s,
4898 PL_colors[1],
4899 len > 60 ? "..." : "");
4900 });
cad2e5aa 4901
c277df42 4902 if (r->precomp)
a0d0e21e 4903 Safefree(r->precomp);
fac92740
MJD
4904 if (r->offsets) /* 20010421 MJD */
4905 Safefree(r->offsets);
ed252734
NC
4906 RX_MATCH_COPY_FREE(r);
4907#ifdef PERL_COPY_ON_WRITE
4908 if (r->saved_copy)
4909 SvREFCNT_dec(r->saved_copy);
4910#endif
a193d654
GS
4911 if (r->substrs) {
4912 if (r->anchored_substr)
4913 SvREFCNT_dec(r->anchored_substr);
33b8afdf
JH
4914 if (r->anchored_utf8)
4915 SvREFCNT_dec(r->anchored_utf8);
a193d654
GS
4916 if (r->float_substr)
4917 SvREFCNT_dec(r->float_substr);
33b8afdf
JH
4918 if (r->float_utf8)
4919 SvREFCNT_dec(r->float_utf8);
2779dcf1 4920 Safefree(r->substrs);
a193d654 4921 }
c277df42
IZ
4922 if (r->data) {
4923 int n = r->data->count;
f3548bdc
DM
4924 PAD* new_comppad = NULL;
4925 PAD* old_comppad;
dfad63ad 4926
c277df42 4927 while (--n >= 0) {
261faec3 4928 /* If you add a ->what type here, update the comment in regcomp.h */
c277df42
IZ
4929 switch (r->data->what[n]) {
4930 case 's':
4931 SvREFCNT_dec((SV*)r->data->data[n]);
4932 break;
653099ff
GS
4933 case 'f':
4934 Safefree(r->data->data[n]);
4935 break;
dfad63ad
HS
4936 case 'p':
4937 new_comppad = (AV*)r->data->data[n];
4938 break;
c277df42 4939 case 'o':
dfad63ad 4940 if (new_comppad == NULL)
cea2e8a9 4941 Perl_croak(aTHX_ "panic: pregfree comppad");
f3548bdc
DM
4942 PAD_SAVE_LOCAL(old_comppad,
4943 /* Watch out for global destruction's random ordering. */
4944 (SvTYPE(new_comppad) == SVt_PVAV) ?
4945 new_comppad : Null(PAD *)
4946 );
9b978d73
DM
4947 if (!OpREFCNT_dec((OP_4tree*)r->data->data[n])) {
4948 op_free((OP_4tree*)r->data->data[n]);
4949 }
4950
f3548bdc 4951 PAD_RESTORE_LOCAL(old_comppad);
dfad63ad
HS
4952 SvREFCNT_dec((SV*)new_comppad);
4953 new_comppad = NULL;
c277df42
IZ
4954 break;
4955 case 'n':
9e55ce06 4956 break;
c277df42 4957 default:
830247a4 4958 Perl_croak(aTHX_ "panic: regfree data code '%c'", r->data->what[n]);
c277df42
IZ
4959 }
4960 }
4961 Safefree(r->data->what);
4962 Safefree(r->data);
a0d0e21e
LW
4963 }
4964 Safefree(r->startp);
4965 Safefree(r->endp);
4966 Safefree(r);
a687059c 4967}
c277df42
IZ
4968
4969/*
4970 - regnext - dig the "next" pointer out of a node
4971 *
4972 * [Note, when REGALIGN is defined there are two places in regmatch()
4973 * that bypass this code for speed.]
4974 */
4975regnode *
864dbfa3 4976Perl_regnext(pTHX_ register regnode *p)
c277df42
IZ
4977{
4978 register I32 offset;
4979
3280af22 4980 if (p == &PL_regdummy)
c277df42
IZ
4981 return(NULL);
4982
4983 offset = (reg_off_by_arg[OP(p)] ? ARG(p) : NEXT_OFF(p));
4984 if (offset == 0)
4985 return(NULL);
4986
c277df42 4987 return(p+offset);
c277df42
IZ
4988}
4989
01f988be 4990STATIC void
cea2e8a9 4991S_re_croak2(pTHX_ const char* pat1,const char* pat2,...)
c277df42
IZ
4992{
4993 va_list args;
4994 STRLEN l1 = strlen(pat1);
4995 STRLEN l2 = strlen(pat2);
4996 char buf[512];
06bf62c7 4997 SV *msv;
c277df42
IZ
4998 char *message;
4999
5000 if (l1 > 510)
5001 l1 = 510;
5002 if (l1 + l2 > 510)
5003 l2 = 510 - l1;
5004 Copy(pat1, buf, l1 , char);
5005 Copy(pat2, buf + l1, l2 , char);
3b818b81
GS
5006 buf[l1 + l2] = '\n';
5007 buf[l1 + l2 + 1] = '\0';
8736538c
AS
5008#ifdef I_STDARG
5009 /* ANSI variant takes additional second argument */
c277df42 5010 va_start(args, pat2);
8736538c
AS
5011#else
5012 va_start(args);
5013#endif
5a844595 5014 msv = vmess(buf, &args);
c277df42 5015 va_end(args);
06bf62c7 5016 message = SvPV(msv,l1);
c277df42
IZ
5017 if (l1 > 512)
5018 l1 = 512;
5019 Copy(message, buf, l1 , char);
5020 buf[l1] = '\0'; /* Overwrite \n */
cea2e8a9 5021 Perl_croak(aTHX_ "%s", buf);
c277df42 5022}
a0ed51b3
LW
5023
5024/* XXX Here's a total kludge. But we need to re-enter for swash routines. */
5025
5026void
864dbfa3 5027Perl_save_re_context(pTHX)
b81d288d 5028{
830247a4 5029 SAVEI32(PL_reg_flags); /* from regexec.c */
a0ed51b3 5030 SAVEPPTR(PL_bostr);
a0ed51b3
LW
5031 SAVEPPTR(PL_reginput); /* String-input pointer. */
5032 SAVEPPTR(PL_regbol); /* Beginning of input, for ^ check. */
5033 SAVEPPTR(PL_regeol); /* End of input, for $ check. */
7766f137
GS
5034 SAVEVPTR(PL_regstartp); /* Pointer to startp array. */
5035 SAVEVPTR(PL_regendp); /* Ditto for endp. */
5036 SAVEVPTR(PL_reglastparen); /* Similarly for lastparen. */
a5db57d6 5037 SAVEVPTR(PL_reglastcloseparen); /* Similarly for lastcloseparen. */
a0ed51b3 5038 SAVEPPTR(PL_regtill); /* How far we are required to go. */
b81d288d 5039 SAVEGENERICPV(PL_reg_start_tmp); /* from regexec.c */
a0ed51b3 5040 PL_reg_start_tmp = 0;
a0ed51b3
LW
5041 SAVEI32(PL_reg_start_tmpl); /* from regexec.c */
5042 PL_reg_start_tmpl = 0;
7766f137 5043 SAVEVPTR(PL_regdata);
a0ed51b3
LW
5044 SAVEI32(PL_reg_eval_set); /* from regexec.c */
5045 SAVEI32(PL_regnarrate); /* from regexec.c */
7766f137 5046 SAVEVPTR(PL_regprogram); /* from regexec.c */
a0ed51b3 5047 SAVEINT(PL_regindent); /* from regexec.c */
7766f137
GS
5048 SAVEVPTR(PL_regcc); /* from regexec.c */
5049 SAVEVPTR(PL_curcop);
7766f137
GS
5050 SAVEVPTR(PL_reg_call_cc); /* from regexec.c */
5051 SAVEVPTR(PL_reg_re); /* from regexec.c */
54b6e2fa
IZ
5052 SAVEPPTR(PL_reg_ganch); /* from regexec.c */
5053 SAVESPTR(PL_reg_sv); /* from regexec.c */
9febdf04 5054 SAVEBOOL(PL_reg_match_utf8); /* from regexec.c */
7766f137 5055 SAVEVPTR(PL_reg_magic); /* from regexec.c */
54b6e2fa 5056 SAVEI32(PL_reg_oldpos); /* from regexec.c */
7766f137
GS
5057 SAVEVPTR(PL_reg_oldcurpm); /* from regexec.c */
5058 SAVEVPTR(PL_reg_curpm); /* from regexec.c */
a5db57d6
GS
5059 SAVEPPTR(PL_reg_oldsaved); /* old saved substr during match */
5060 PL_reg_oldsaved = Nullch;
5061 SAVEI32(PL_reg_oldsavedlen); /* old length of saved substr during match */
5062 PL_reg_oldsavedlen = 0;
ed252734
NC
5063#ifdef PERL_COPY_ON_WRITE
5064 SAVESPTR(PL_nrs);
5065 PL_nrs = Nullsv;
5066#endif
a5db57d6
GS
5067 SAVEI32(PL_reg_maxiter); /* max wait until caching pos */
5068 PL_reg_maxiter = 0;
5069 SAVEI32(PL_reg_leftiter); /* wait until caching pos */
5070 PL_reg_leftiter = 0;
5071 SAVEGENERICPV(PL_reg_poscache); /* cache of pos of WHILEM */
5072 PL_reg_poscache = Nullch;
5073 SAVEI32(PL_reg_poscache_size); /* size of pos cache of WHILEM */
5074 PL_reg_poscache_size = 0;
5075 SAVEPPTR(PL_regprecomp); /* uncompiled string. */
5fb7366e 5076 SAVEI32(PL_regnpar); /* () count. */
e49a9654 5077 SAVEI32(PL_regsize); /* from regexec.c */
ada6e8a9
AMS
5078
5079 {
5080 /* Save $1..$n (#18107: UTF-8 s/(\w+)/uc($1)/e); AMS 20021106. */
a8514157 5081 U32 i;
ada6e8a9
AMS
5082 GV *mgv;
5083 REGEXP *rx;
5084 char digits[16];
5085
5086 if (PL_curpm && (rx = PM_GETRE(PL_curpm))) {
5087 for (i = 1; i <= rx->nparens; i++) {
d994d9a1 5088 sprintf(digits, "%lu", (long)i);
ada6e8a9
AMS
5089 if ((mgv = gv_fetchpv(digits, FALSE, SVt_PV)))
5090 save_scalar(mgv);
5091 }
5092 }
5093 }
5094
54b6e2fa 5095#ifdef DEBUGGING
b81d288d 5096 SAVEPPTR(PL_reg_starttry); /* from regexec.c */
54b6e2fa 5097#endif
a0ed51b3 5098}
51371543 5099
51371543 5100static void
acfe0abc 5101clear_re(pTHX_ void *r)
51371543
GS
5102{
5103 ReREFCNT_dec((regexp *)r);
5104}
ffbc6a93 5105