This is a live mirror of the Perl 5 development currently hosted at https://github.com/perl/perl5
The leavetry crash should now be history.
[perl5.git] / regcomp.c
CommitLineData
a0d0e21e
LW
1/* regcomp.c
2 */
3
4/*
5 * "A fair jaw-cracker dwarf-language must be." --Samwise Gamgee
6 */
7
a687059c
LW
8/* NOTE: this is derived from Henry Spencer's regexp code, and should not
9 * confused with the original package (see point 3 below). Thanks, Henry!
10 */
11
12/* Additional note: this code is very heavily munged from Henry's version
13 * in places. In some spots I've traded clarity for efficiency, so don't
14 * blame Henry for some of the lack of readability.
15 */
16
e50aee73
AD
17/* The names of the functions have been changed from regcomp and
18 * regexec to pregcomp and pregexec in order to avoid conflicts
19 * with the POSIX routines of the same names.
20*/
21
b9d5759e
AD
22#ifdef PERL_EXT_RE_BUILD
23/* need to replace pregcomp et al, so enable that */
24# ifndef PERL_IN_XSUB_RE
25# define PERL_IN_XSUB_RE
26# endif
27/* need access to debugger hooks */
cad2e5aa 28# if defined(PERL_EXT_RE_DEBUG) && !defined(DEBUGGING)
b9d5759e
AD
29# define DEBUGGING
30# endif
31#endif
32
33#ifdef PERL_IN_XSUB_RE
d06ea78c 34/* We *really* need to overwrite these symbols: */
56953603
IZ
35# define Perl_pregcomp my_regcomp
36# define Perl_regdump my_regdump
37# define Perl_regprop my_regprop
d06ea78c 38# define Perl_pregfree my_regfree
cad2e5aa
JH
39# define Perl_re_intuit_string my_re_intuit_string
40/* *These* symbols are masked to allow static link. */
d06ea78c 41# define Perl_regnext my_regnext
f0b8d043 42# define Perl_save_re_context my_save_re_context
b81d288d 43# define Perl_reginitcolors my_reginitcolors
c5be433b
GS
44
45# define PERL_NO_GET_CONTEXT
b81d288d 46#endif
56953603 47
f0fcb552 48/*SUPPRESS 112*/
a687059c 49/*
e50aee73 50 * pregcomp and pregexec -- regsub and regerror are not used in perl
a687059c
LW
51 *
52 * Copyright (c) 1986 by University of Toronto.
53 * Written by Henry Spencer. Not derived from licensed software.
54 *
55 * Permission is granted to anyone to use this software for any
56 * purpose on any computer system, and to redistribute it freely,
57 * subject to the following restrictions:
58 *
59 * 1. The author is not responsible for the consequences of use of
60 * this software, no matter how awful, even if they arise
61 * from defects in it.
62 *
63 * 2. The origin of this software must not be misrepresented, either
64 * by explicit claim or by omission.
65 *
66 * 3. Altered versions must be plainly marked as such, and must not
67 * be misrepresented as being the original software.
68 *
69 *
70 **** Alterations to Henry's code are...
71 ****
be3c0a43 72 **** Copyright (c) 1991-2002, Larry Wall
a687059c 73 ****
9ef589d8
LW
74 **** You may distribute under the terms of either the GNU General Public
75 **** License or the Artistic License, as specified in the README file.
76
a687059c
LW
77 *
78 * Beware that some of this code is subtly aware of the way operator
79 * precedence is structured in regular expressions. Serious changes in
80 * regular-expression syntax might require a total rethink.
81 */
82#include "EXTERN.h"
864dbfa3 83#define PERL_IN_REGCOMP_C
a687059c 84#include "perl.h"
d06ea78c 85
acfe0abc 86#ifndef PERL_IN_XSUB_RE
d06ea78c
GS
87# include "INTERN.h"
88#endif
c277df42
IZ
89
90#define REG_COMP_C
a687059c
LW
91#include "regcomp.h"
92
d4cce5f1 93#ifdef op
11343788 94#undef op
d4cce5f1 95#endif /* op */
11343788 96
fe14fcc3
LW
97#ifdef MSDOS
98# if defined(BUGGY_MSC6)
99 /* MSC 6.00A breaks on op/regexp.t test 85 unless we turn this off */
100 # pragma optimize("a",off)
101 /* But MSC 6.00A is happy with 'w', for aliases only across function calls*/
102 # pragma optimize("w",on )
103# endif /* BUGGY_MSC6 */
104#endif /* MSDOS */
105
a687059c
LW
106#ifndef STATIC
107#define STATIC static
108#endif
109
830247a4
IZ
110typedef struct RExC_state_t {
111 U16 flags16; /* are we folding, multilining? */
112 char *precomp; /* uncompiled string. */
113 regexp *rx;
fac92740 114 char *start; /* Start of input for compile */
830247a4
IZ
115 char *end; /* End of input for compile */
116 char *parse; /* Input-scan pointer. */
117 I32 whilem_seen; /* number of WHILEM in this expr */
fac92740 118 regnode *emit_start; /* Start of emitted-code area */
ffc61ed2 119 regnode *emit; /* Code-emit pointer; &regdummy = don't = compiling */
830247a4
IZ
120 I32 naughty; /* How bad is this pattern? */
121 I32 sawback; /* Did we see \1, ...? */
122 U32 seen;
123 I32 size; /* Code size. */
124 I32 npar; /* () count. */
125 I32 extralen;
126 I32 seen_zerolen;
127 I32 seen_evals;
1aa99e6b 128 I32 utf8;
830247a4
IZ
129#if ADD_TO_REGEXEC
130 char *starttry; /* -Dr: where regtry was called. */
131#define RExC_starttry (pRExC_state->starttry)
132#endif
133} RExC_state_t;
134
135#define RExC_flags16 (pRExC_state->flags16)
136#define RExC_precomp (pRExC_state->precomp)
137#define RExC_rx (pRExC_state->rx)
fac92740 138#define RExC_start (pRExC_state->start)
830247a4
IZ
139#define RExC_end (pRExC_state->end)
140#define RExC_parse (pRExC_state->parse)
141#define RExC_whilem_seen (pRExC_state->whilem_seen)
fac92740 142#define RExC_offsets (pRExC_state->rx->offsets) /* I am not like the others */
830247a4 143#define RExC_emit (pRExC_state->emit)
fac92740 144#define RExC_emit_start (pRExC_state->emit_start)
830247a4
IZ
145#define RExC_naughty (pRExC_state->naughty)
146#define RExC_sawback (pRExC_state->sawback)
147#define RExC_seen (pRExC_state->seen)
148#define RExC_size (pRExC_state->size)
149#define RExC_npar (pRExC_state->npar)
150#define RExC_extralen (pRExC_state->extralen)
151#define RExC_seen_zerolen (pRExC_state->seen_zerolen)
152#define RExC_seen_evals (pRExC_state->seen_evals)
1aa99e6b 153#define RExC_utf8 (pRExC_state->utf8)
830247a4 154
a687059c
LW
155#define ISMULT1(c) ((c) == '*' || (c) == '+' || (c) == '?')
156#define ISMULT2(s) ((*s) == '*' || (*s) == '+' || (*s) == '?' || \
157 ((*s) == '{' && regcurly(s)))
a687059c 158
35c8bce7
LW
159#ifdef SPSTART
160#undef SPSTART /* dratted cpp namespace... */
161#endif
a687059c
LW
162/*
163 * Flags to be passed up and down.
164 */
a687059c 165#define WORST 0 /* Worst case. */
821b33a5 166#define HASWIDTH 0x1 /* Known to match non-null strings. */
a0d0e21e
LW
167#define SIMPLE 0x2 /* Simple enough to be STAR/PLUS operand. */
168#define SPSTART 0x4 /* Starts with * or +. */
169#define TRYAGAIN 0x8 /* Weeded out a declaration. */
a687059c 170
2c2d71f5
JH
171/* Length of a variant. */
172
173typedef struct scan_data_t {
174 I32 len_min;
175 I32 len_delta;
176 I32 pos_min;
177 I32 pos_delta;
178 SV *last_found;
179 I32 last_end; /* min value, <0 unless valid. */
180 I32 last_start_min;
181 I32 last_start_max;
182 SV **longest; /* Either &l_fixed, or &l_float. */
183 SV *longest_fixed;
184 I32 offset_fixed;
185 SV *longest_float;
186 I32 offset_float_min;
187 I32 offset_float_max;
188 I32 flags;
189 I32 whilem_c;
cb434fcc 190 I32 *last_closep;
653099ff 191 struct regnode_charclass_class *start_class;
2c2d71f5
JH
192} scan_data_t;
193
a687059c 194/*
e50aee73 195 * Forward declarations for pregcomp()'s friends.
a687059c 196 */
a0d0e21e 197
b81d288d 198static scan_data_t zero_scan_data = { 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
cb434fcc 199 0, 0, 0, 0, 0, 0};
c277df42
IZ
200
201#define SF_BEFORE_EOL (SF_BEFORE_SEOL|SF_BEFORE_MEOL)
202#define SF_BEFORE_SEOL 0x1
203#define SF_BEFORE_MEOL 0x2
204#define SF_FIX_BEFORE_EOL (SF_FIX_BEFORE_SEOL|SF_FIX_BEFORE_MEOL)
205#define SF_FL_BEFORE_EOL (SF_FL_BEFORE_SEOL|SF_FL_BEFORE_MEOL)
206
09b7f37c
CB
207#ifdef NO_UNARY_PLUS
208# define SF_FIX_SHIFT_EOL (0+2)
209# define SF_FL_SHIFT_EOL (0+4)
210#else
211# define SF_FIX_SHIFT_EOL (+2)
212# define SF_FL_SHIFT_EOL (+4)
213#endif
c277df42
IZ
214
215#define SF_FIX_BEFORE_SEOL (SF_BEFORE_SEOL << SF_FIX_SHIFT_EOL)
216#define SF_FIX_BEFORE_MEOL (SF_BEFORE_MEOL << SF_FIX_SHIFT_EOL)
217
218#define SF_FL_BEFORE_SEOL (SF_BEFORE_SEOL << SF_FL_SHIFT_EOL)
219#define SF_FL_BEFORE_MEOL (SF_BEFORE_MEOL << SF_FL_SHIFT_EOL) /* 0x20 */
220#define SF_IS_INF 0x40
221#define SF_HAS_PAR 0x80
222#define SF_IN_PAR 0x100
223#define SF_HAS_EVAL 0x200
4bfe0158 224#define SCF_DO_SUBSTR 0x400
653099ff
GS
225#define SCF_DO_STCLASS_AND 0x0800
226#define SCF_DO_STCLASS_OR 0x1000
227#define SCF_DO_STCLASS (SCF_DO_STCLASS_AND|SCF_DO_STCLASS_OR)
e1901655 228#define SCF_WHILEM_VISITED_POS 0x2000
c277df42 229
1aa99e6b 230#define UTF RExC_utf8
830247a4
IZ
231#define LOC (RExC_flags16 & PMf_LOCALE)
232#define FOLD (RExC_flags16 & PMf_FOLD)
a0ed51b3 233
ffc61ed2 234#define OOB_UNICODE 12345678
93733859 235#define OOB_NAMEDCLASS -1
b8c5462f 236
a0ed51b3
LW
237#define CHR_SVLEN(sv) (UTF ? sv_len_utf8(sv) : SvCUR(sv))
238#define CHR_DIST(a,b) (UTF ? utf8_distance(a,b) : a - b)
239
8615cb43 240
b45f050a
JF
241/* length of regex to show in messages that don't mark a position within */
242#define RegexLengthToShowInErrorMessages 127
243
244/*
245 * If MARKER[12] are adjusted, be sure to adjust the constants at the top
246 * of t/op/regmesg.t, the tests in t/op/re_tests, and those in
247 * op/pragma/warn/regcomp.
248 */
7253e4e3
RK
249#define MARKER1 "<-- HERE" /* marker as it appears in the description */
250#define MARKER2 " <-- HERE " /* marker as it appears within the regex */
b81d288d 251
7253e4e3 252#define REPORT_LOCATION " in regex; marked by " MARKER1 " in m/%.*s" MARKER2 "%s/"
b45f050a
JF
253
254/*
255 * Calls SAVEDESTRUCTOR_X if needed, then calls Perl_croak with the given
256 * arg. Show regex, up to a maximum length. If it's too long, chop and add
257 * "...".
258 */
a4eb266f 259#define FAIL(msg) \
8615cb43 260 STMT_START { \
a4eb266f 261 char *ellipses = ""; \
7a799e20 262 IV len = RExC_end - RExC_precomp; \
b45f050a 263 \
8615cb43 264 if (!SIZE_ONLY) \
830247a4 265 SAVEDESTRUCTOR_X(clear_re,(void*)RExC_rx); \
b45f050a
JF
266 \
267 if (len > RegexLengthToShowInErrorMessages) { \
268 /* chop 10 shorter than the max, to ensure meaning of "..." */ \
269 len = RegexLengthToShowInErrorMessages - 10; \
a4eb266f 270 ellipses = "..."; \
b45f050a
JF
271 } \
272 Perl_croak(aTHX_ "%s in regex m/%.*s%s/", \
830247a4 273 msg, (int)len, RExC_precomp, ellipses); \
8615cb43
JF
274 } STMT_END
275
b45f050a
JF
276/*
277 * Calls SAVEDESTRUCTOR_X if needed, then calls Perl_croak with the given
278 * args. Show regex, up to a maximum length. If it's too long, chop and add
279 * "...".
280 */
a4eb266f 281#define FAIL2(pat,msg) \
8615cb43 282 STMT_START { \
a4eb266f 283 char *ellipses = ""; \
7a799e20 284 IV len = RExC_end - RExC_precomp; \
b45f050a 285 \
8615cb43 286 if (!SIZE_ONLY) \
830247a4 287 SAVEDESTRUCTOR_X(clear_re,(void*)RExC_rx); \
8615cb43 288 \
b45f050a
JF
289 if (len > RegexLengthToShowInErrorMessages) { \
290 /* chop 10 shorter than the max, to ensure meaning of "..." */ \
291 len = RegexLengthToShowInErrorMessages - 10; \
a4eb266f 292 ellipses = "..."; \
b45f050a
JF
293 } \
294 S_re_croak2(aTHX_ pat, " in regex m/%.*s%s/", \
830247a4 295 msg, (int)len, RExC_precomp, ellipses); \
b45f050a
JF
296 } STMT_END
297
298
299/*
300 * Simple_vFAIL -- like FAIL, but marks the current location in the scan
301 */
302#define Simple_vFAIL(m) \
303 STMT_START { \
7a799e20 304 IV offset = RExC_parse - RExC_precomp; \
b45f050a
JF
305 \
306 Perl_croak(aTHX_ "%s" REPORT_LOCATION, \
830247a4 307 m, (int)offset, RExC_precomp, RExC_precomp + offset); \
b45f050a
JF
308 } STMT_END
309
310/*
311 * Calls SAVEDESTRUCTOR_X if needed, then Simple_vFAIL()
312 */
313#define vFAIL(m) \
314 STMT_START { \
315 if (!SIZE_ONLY) \
830247a4 316 SAVEDESTRUCTOR_X(clear_re,(void*)RExC_rx); \
b45f050a
JF
317 Simple_vFAIL(m); \
318 } STMT_END
319
320/*
321 * Like Simple_vFAIL(), but accepts two arguments.
322 */
323#define Simple_vFAIL2(m,a1) \
324 STMT_START { \
7a799e20 325 IV offset = RExC_parse - RExC_precomp; \
b45f050a
JF
326 \
327 S_re_croak2(aTHX_ m, REPORT_LOCATION, a1, \
830247a4 328 (int)offset, RExC_precomp, RExC_precomp + offset); \
b45f050a
JF
329 } STMT_END
330
331/*
332 * Calls SAVEDESTRUCTOR_X if needed, then Simple_vFAIL2().
333 */
334#define vFAIL2(m,a1) \
335 STMT_START { \
336 if (!SIZE_ONLY) \
830247a4 337 SAVEDESTRUCTOR_X(clear_re,(void*)RExC_rx); \
b45f050a
JF
338 Simple_vFAIL2(m, a1); \
339 } STMT_END
340
341
342/*
343 * Like Simple_vFAIL(), but accepts three arguments.
344 */
345#define Simple_vFAIL3(m, a1, a2) \
346 STMT_START { \
7a799e20 347 IV offset = RExC_parse - RExC_precomp; \
b45f050a
JF
348 \
349 S_re_croak2(aTHX_ m, REPORT_LOCATION, a1, a2, \
830247a4 350 (int)offset, RExC_precomp, RExC_precomp + offset); \
b45f050a
JF
351 } STMT_END
352
353/*
354 * Calls SAVEDESTRUCTOR_X if needed, then Simple_vFAIL3().
355 */
356#define vFAIL3(m,a1,a2) \
357 STMT_START { \
358 if (!SIZE_ONLY) \
830247a4 359 SAVEDESTRUCTOR_X(clear_re,(void*)RExC_rx); \
b45f050a
JF
360 Simple_vFAIL3(m, a1, a2); \
361 } STMT_END
362
363/*
364 * Like Simple_vFAIL(), but accepts four arguments.
365 */
366#define Simple_vFAIL4(m, a1, a2, a3) \
367 STMT_START { \
7a799e20 368 IV offset = RExC_parse - RExC_precomp; \
b45f050a
JF
369 \
370 S_re_croak2(aTHX_ m, REPORT_LOCATION, a1, a2, a3,\
830247a4 371 (int)offset, RExC_precomp, RExC_precomp + offset); \
b45f050a
JF
372 } STMT_END
373
374/*
375 * Like Simple_vFAIL(), but accepts five arguments.
376 */
377#define Simple_vFAIL5(m, a1, a2, a3, a4) \
378 STMT_START { \
7a799e20 379 IV offset = RExC_parse - RExC_precomp; \
b45f050a 380 S_re_croak2(aTHX_ m, REPORT_LOCATION, a1, a2, a3, a4,\
830247a4 381 (int)offset, RExC_precomp, RExC_precomp + offset); \
b45f050a
JF
382 } STMT_END
383
384
385#define vWARN(loc,m) \
386 STMT_START { \
7a799e20 387 IV offset = loc - RExC_precomp; \
9014280d 388 Perl_warner(aTHX_ packWARN(WARN_REGEXP), "%s" REPORT_LOCATION,\
830247a4 389 m, (int)offset, RExC_precomp, RExC_precomp + offset); \
b45f050a
JF
390 } STMT_END \
391
f9373011
PM
392#define vWARNdep(loc,m) \
393 STMT_START { \
7a799e20 394 IV offset = loc - RExC_precomp; \
9014280d 395 Perl_warner(aTHX_ packWARN2(WARN_DEPRECATED, WARN_REGEXP), "%s" REPORT_LOCATION,\
f9373011
PM
396 m, (int)offset, RExC_precomp, RExC_precomp + offset); \
397 } STMT_END \
398
b45f050a
JF
399
400#define vWARN2(loc, m, a1) \
401 STMT_START { \
7a799e20 402 IV offset = loc - RExC_precomp; \
9014280d 403 Perl_warner(aTHX_ packWARN(WARN_REGEXP), m REPORT_LOCATION,\
b45f050a 404 a1, \
830247a4 405 (int)offset, RExC_precomp, RExC_precomp + offset); \
b45f050a
JF
406 } STMT_END
407
408#define vWARN3(loc, m, a1, a2) \
409 STMT_START { \
7a799e20 410 IV offset = loc - RExC_precomp; \
9014280d 411 Perl_warner(aTHX_ packWARN(WARN_REGEXP), m REPORT_LOCATION, \
b45f050a 412 a1, a2, \
830247a4 413 (int)offset, RExC_precomp, RExC_precomp + offset); \
b45f050a
JF
414 } STMT_END
415
416#define vWARN4(loc, m, a1, a2, a3) \
417 STMT_START { \
7a799e20 418 IV offset = loc - RExC_precomp; \
9014280d 419 Perl_warner(aTHX_ packWARN(WARN_REGEXP), m REPORT_LOCATION,\
b45f050a 420 a1, a2, a3, \
830247a4 421 (int)offset, RExC_precomp, RExC_precomp + offset); \
8615cb43
JF
422 } STMT_END
423
9d1d55b5
JP
424/* used for the parse_flags section for (?c) -- japhy */
425#define vWARN5(loc, m, a1, a2, a3, a4) \
426 STMT_START { \
7a799e20 427 IV offset = loc - RExC_precomp; \
9014280d 428 Perl_warner(aTHX_ packWARN(WARN_REGEXP), m REPORT_LOCATION, \
9d1d55b5
JP
429 a1, a2, a3, a4, \
430 (int)offset, RExC_precomp, RExC_precomp + offset); \
431 } STMT_END
432
8615cb43 433
cd439c50 434/* Allow for side effects in s */
fd3f0ae2 435#define REGC(c,s) STMT_START { if (!SIZE_ONLY) *(s) = (c); else (void)(s);} STMT_END
cd439c50 436
fac92740
MJD
437/* Macros for recording node offsets. 20001227 mjd@plover.com
438 * Nodes are numbered 1, 2, 3, 4. Node #n's position is recorded in
439 * element 2*n-1 of the array. Element #2n holds the byte length node #n.
440 * Element 0 holds the number n.
441 */
442
443#define MJD_OFFSET_DEBUG(x)
444/* #define MJD_OFFSET_DEBUG(x) fprintf x */
445
446
447# define Set_Node_Offset_To_R(node,byte) \
448 STMT_START { \
449 if (! SIZE_ONLY) { \
450 if((node) < 0) { \
451 Perl_croak(aTHX_ "value of node is %d in Offset macro", node); \
452 } else { \
453 RExC_offsets[2*(node)-1] = (byte); \
454 } \
455 } \
456 } STMT_END
457
458# define Set_Node_Offset(node,byte) Set_Node_Offset_To_R((node)-RExC_emit_start, (byte)-RExC_start)
459# define Set_Cur_Node_Offset Set_Node_Offset(RExC_emit, RExC_parse)
460
461# define Set_Node_Length_To_R(node,len) \
462 STMT_START { \
463 if (! SIZE_ONLY) { \
464 MJD_OFFSET_DEBUG((stderr, "** (%d) size of node %d is %d.\n", __LINE__, (node), (len))); \
465 if((node) < 0) { \
466 Perl_croak(aTHX_ "value of node is %d in Length macro", node); \
467 } else { \
468 RExC_offsets[2*(node)] = (len); \
469 } \
470 } \
471 } STMT_END
472
473# define Set_Node_Length(node,len) Set_Node_Length_To_R((node)-RExC_emit_start, len)
474# define Set_Cur_Node_Length(len) Set_Node_Length(RExC_emit, len)
475# define Set_Node_Cur_Length(node) Set_Node_Length(node, RExC_parse - parse_start)
476
477/* Get offsets and lengths */
478#define Node_Offset(n) (RExC_offsets[2*((n)-RExC_emit_start)-1])
479#define Node_Length(n) (RExC_offsets[2*((n)-RExC_emit_start)])
480
acfe0abc 481static void clear_re(pTHX_ void *r);
4327152a 482
653099ff
GS
483/* Mark that we cannot extend a found fixed substring at this point.
484 Updata the longest found anchored substring and the longest found
485 floating substrings if needed. */
486
4327152a 487STATIC void
830247a4 488S_scan_commit(pTHX_ RExC_state_t *pRExC_state, scan_data_t *data)
c277df42 489{
a0ed51b3
LW
490 STRLEN l = CHR_SVLEN(data->last_found);
491 STRLEN old_l = CHR_SVLEN(*data->longest);
b81d288d 492
c277df42
IZ
493 if ((l >= old_l) && ((l > old_l) || (data->flags & SF_BEFORE_EOL))) {
494 sv_setsv(*data->longest, data->last_found);
495 if (*data->longest == data->longest_fixed) {
496 data->offset_fixed = l ? data->last_start_min : data->pos_min;
497 if (data->flags & SF_BEFORE_EOL)
b81d288d 498 data->flags
c277df42
IZ
499 |= ((data->flags & SF_BEFORE_EOL) << SF_FIX_SHIFT_EOL);
500 else
501 data->flags &= ~SF_FIX_BEFORE_EOL;
a0ed51b3
LW
502 }
503 else {
c277df42 504 data->offset_float_min = l ? data->last_start_min : data->pos_min;
b81d288d
AB
505 data->offset_float_max = (l
506 ? data->last_start_max
c277df42
IZ
507 : data->pos_min + data->pos_delta);
508 if (data->flags & SF_BEFORE_EOL)
b81d288d 509 data->flags
c277df42
IZ
510 |= ((data->flags & SF_BEFORE_EOL) << SF_FL_SHIFT_EOL);
511 else
512 data->flags &= ~SF_FL_BEFORE_EOL;
513 }
514 }
515 SvCUR_set(data->last_found, 0);
516 data->last_end = -1;
517 data->flags &= ~SF_BEFORE_EOL;
518}
519
653099ff
GS
520/* Can match anything (initialization) */
521STATIC void
830247a4 522S_cl_anything(pTHX_ RExC_state_t *pRExC_state, struct regnode_charclass_class *cl)
653099ff 523{
653099ff 524 ANYOF_CLASS_ZERO(cl);
f8bef550 525 ANYOF_BITMAP_SETALL(cl);
1aa99e6b 526 cl->flags = ANYOF_EOS|ANYOF_UNICODE_ALL;
653099ff
GS
527 if (LOC)
528 cl->flags |= ANYOF_LOCALE;
529}
530
531/* Can match anything (initialization) */
532STATIC int
533S_cl_is_anything(pTHX_ struct regnode_charclass_class *cl)
534{
535 int value;
536
aaa51d5e 537 for (value = 0; value <= ANYOF_MAX; value += 2)
653099ff
GS
538 if (ANYOF_CLASS_TEST(cl, value) && ANYOF_CLASS_TEST(cl, value + 1))
539 return 1;
1aa99e6b
IH
540 if (!(cl->flags & ANYOF_UNICODE_ALL))
541 return 0;
f8bef550
NC
542 if (!ANYOF_BITMAP_TESTALLSET(cl))
543 return 0;
653099ff
GS
544 return 1;
545}
546
547/* Can match anything (initialization) */
548STATIC void
830247a4 549S_cl_init(pTHX_ RExC_state_t *pRExC_state, struct regnode_charclass_class *cl)
653099ff 550{
8ecf7187 551 Zero(cl, 1, struct regnode_charclass_class);
653099ff 552 cl->type = ANYOF;
830247a4 553 cl_anything(pRExC_state, cl);
653099ff
GS
554}
555
556STATIC void
830247a4 557S_cl_init_zero(pTHX_ RExC_state_t *pRExC_state, struct regnode_charclass_class *cl)
653099ff 558{
8ecf7187 559 Zero(cl, 1, struct regnode_charclass_class);
653099ff 560 cl->type = ANYOF;
830247a4 561 cl_anything(pRExC_state, cl);
653099ff
GS
562 if (LOC)
563 cl->flags |= ANYOF_LOCALE;
564}
565
566/* 'And' a given class with another one. Can create false positives */
567/* We assume that cl is not inverted */
568STATIC void
569S_cl_and(pTHX_ struct regnode_charclass_class *cl,
570 struct regnode_charclass_class *and_with)
571{
653099ff
GS
572 if (!(and_with->flags & ANYOF_CLASS)
573 && !(cl->flags & ANYOF_CLASS)
574 && (and_with->flags & ANYOF_LOCALE) == (cl->flags & ANYOF_LOCALE)
575 && !(and_with->flags & ANYOF_FOLD)
576 && !(cl->flags & ANYOF_FOLD)) {
577 int i;
578
579 if (and_with->flags & ANYOF_INVERT)
580 for (i = 0; i < ANYOF_BITMAP_SIZE; i++)
581 cl->bitmap[i] &= ~and_with->bitmap[i];
582 else
583 for (i = 0; i < ANYOF_BITMAP_SIZE; i++)
584 cl->bitmap[i] &= and_with->bitmap[i];
585 } /* XXXX: logic is complicated otherwise, leave it along for a moment. */
586 if (!(and_with->flags & ANYOF_EOS))
587 cl->flags &= ~ANYOF_EOS;
1aa99e6b
IH
588
589 if (cl->flags & ANYOF_UNICODE_ALL && and_with->flags & ANYOF_UNICODE) {
590 cl->flags &= ~ANYOF_UNICODE_ALL;
591 cl->flags |= ANYOF_UNICODE;
592 ARG_SET(cl, ARG(and_with));
593 }
594 if (!(and_with->flags & ANYOF_UNICODE_ALL))
595 cl->flags &= ~ANYOF_UNICODE_ALL;
596 if (!(and_with->flags & (ANYOF_UNICODE|ANYOF_UNICODE_ALL)))
597 cl->flags &= ~ANYOF_UNICODE;
653099ff
GS
598}
599
600/* 'OR' a given class with another one. Can create false positives */
601/* We assume that cl is not inverted */
602STATIC void
830247a4 603S_cl_or(pTHX_ RExC_state_t *pRExC_state, struct regnode_charclass_class *cl, struct regnode_charclass_class *or_with)
653099ff 604{
653099ff
GS
605 if (or_with->flags & ANYOF_INVERT) {
606 /* We do not use
607 * (B1 | CL1) | (!B2 & !CL2) = (B1 | !B2 & !CL2) | (CL1 | (!B2 & !CL2))
608 * <= (B1 | !B2) | (CL1 | !CL2)
609 * which is wasteful if CL2 is small, but we ignore CL2:
610 * (B1 | CL1) | (!B2 & !CL2) <= (B1 | CL1) | !B2 = (B1 | !B2) | CL1
611 * XXXX Can we handle case-fold? Unclear:
612 * (OK1(i) | OK1(i')) | !(OK1(i) | OK1(i')) =
613 * (OK1(i) | OK1(i')) | (!OK1(i) & !OK1(i'))
614 */
615 if ( (or_with->flags & ANYOF_LOCALE) == (cl->flags & ANYOF_LOCALE)
616 && !(or_with->flags & ANYOF_FOLD)
617 && !(cl->flags & ANYOF_FOLD) ) {
618 int i;
619
620 for (i = 0; i < ANYOF_BITMAP_SIZE; i++)
621 cl->bitmap[i] |= ~or_with->bitmap[i];
622 } /* XXXX: logic is complicated otherwise */
623 else {
830247a4 624 cl_anything(pRExC_state, cl);
653099ff
GS
625 }
626 } else {
627 /* (B1 | CL1) | (B2 | CL2) = (B1 | B2) | (CL1 | CL2)) */
628 if ( (or_with->flags & ANYOF_LOCALE) == (cl->flags & ANYOF_LOCALE)
b81d288d 629 && (!(or_with->flags & ANYOF_FOLD)
653099ff
GS
630 || (cl->flags & ANYOF_FOLD)) ) {
631 int i;
632
633 /* OR char bitmap and class bitmap separately */
634 for (i = 0; i < ANYOF_BITMAP_SIZE; i++)
635 cl->bitmap[i] |= or_with->bitmap[i];
636 if (or_with->flags & ANYOF_CLASS) {
637 for (i = 0; i < ANYOF_CLASSBITMAP_SIZE; i++)
638 cl->classflags[i] |= or_with->classflags[i];
639 cl->flags |= ANYOF_CLASS;
640 }
641 }
642 else { /* XXXX: logic is complicated, leave it along for a moment. */
830247a4 643 cl_anything(pRExC_state, cl);
653099ff
GS
644 }
645 }
646 if (or_with->flags & ANYOF_EOS)
647 cl->flags |= ANYOF_EOS;
1aa99e6b
IH
648
649 if (cl->flags & ANYOF_UNICODE && or_with->flags & ANYOF_UNICODE &&
650 ARG(cl) != ARG(or_with)) {
651 cl->flags |= ANYOF_UNICODE_ALL;
652 cl->flags &= ~ANYOF_UNICODE;
653 }
654 if (or_with->flags & ANYOF_UNICODE_ALL) {
655 cl->flags |= ANYOF_UNICODE_ALL;
656 cl->flags &= ~ANYOF_UNICODE;
657 }
653099ff
GS
658}
659
5d1c421c
JH
660/*
661 * There are strange code-generation bugs caused on sparc64 by gcc-2.95.2.
662 * These need to be revisited when a newer toolchain becomes available.
663 */
664#if defined(__sparc64__) && defined(__GNUC__)
665# if __GNUC__ < 2 || (__GNUC__ == 2 && __GNUC_MINOR__ < 96)
666# undef SPARC64_GCC_WORKAROUND
667# define SPARC64_GCC_WORKAROUND 1
668# endif
669#endif
670
653099ff
GS
671/* REx optimizer. Converts nodes into quickier variants "in place".
672 Finds fixed substrings. */
673
c277df42
IZ
674/* Stops at toplevel WHILEM as well as at `last'. At end *scanp is set
675 to the position after last scanned or to NULL. */
676
76e3520e 677STATIC I32
830247a4 678S_study_chunk(pTHX_ RExC_state_t *pRExC_state, regnode **scanp, I32 *deltap, regnode *last, scan_data_t *data, U32 flags)
c277df42
IZ
679 /* scanp: Start here (read-write). */
680 /* deltap: Write maxlen-minlen here. */
681 /* last: Stop before this one. */
682{
683 I32 min = 0, pars = 0, code;
684 regnode *scan = *scanp, *next;
685 I32 delta = 0;
686 int is_inf = (flags & SCF_DO_SUBSTR) && (data->flags & SF_IS_INF);
aca2d497 687 int is_inf_internal = 0; /* The studied chunk is infinite */
c277df42
IZ
688 I32 is_par = OP(scan) == OPEN ? ARG(scan) : 0;
689 scan_data_t data_fake;
653099ff 690 struct regnode_charclass_class and_with; /* Valid if flags & SCF_DO_STCLASS_OR */
b81d288d 691
c277df42
IZ
692 while (scan && OP(scan) != END && scan < last) {
693 /* Peephole optimizer: */
694
22c35a8c 695 if (PL_regkind[(U8)OP(scan)] == EXACT) {
653099ff 696 /* Merge several consecutive EXACTish nodes into one. */
c277df42
IZ
697 regnode *n = regnext(scan);
698 U32 stringok = 1;
699#ifdef DEBUGGING
700 regnode *stop = scan;
b81d288d 701#endif
c277df42 702
cd439c50 703 next = scan + NODE_SZ_STR(scan);
c277df42
IZ
704 /* Skip NOTHING, merge EXACT*. */
705 while (n &&
b81d288d 706 ( PL_regkind[(U8)OP(n)] == NOTHING ||
c277df42
IZ
707 (stringok && (OP(n) == OP(scan))))
708 && NEXT_OFF(n)
709 && NEXT_OFF(scan) + NEXT_OFF(n) < I16_MAX) {
710 if (OP(n) == TAIL || n > next)
711 stringok = 0;
22c35a8c 712 if (PL_regkind[(U8)OP(n)] == NOTHING) {
c277df42
IZ
713 NEXT_OFF(scan) += NEXT_OFF(n);
714 next = n + NODE_STEP_REGNODE;
715#ifdef DEBUGGING
716 if (stringok)
717 stop = n;
b81d288d 718#endif
c277df42 719 n = regnext(n);
a0ed51b3 720 }
f49d4d0f 721 else if (stringok) {
cd439c50 722 int oldl = STR_LEN(scan);
c277df42 723 regnode *nnext = regnext(n);
f49d4d0f 724
b81d288d 725 if (oldl + STR_LEN(n) > U8_MAX)
c277df42
IZ
726 break;
727 NEXT_OFF(scan) += NEXT_OFF(n);
cd439c50
IZ
728 STR_LEN(scan) += STR_LEN(n);
729 next = n + NODE_SZ_STR(n);
c277df42 730 /* Now we can overwrite *n : */
f49d4d0f 731 Move(STRING(n), STRING(scan) + oldl, STR_LEN(n), char);
c277df42 732#ifdef DEBUGGING
f49d4d0f 733 stop = next - 1;
b81d288d 734#endif
c277df42
IZ
735 n = nnext;
736 }
737 }
61a36c01 738
d65e4eab 739 if (UTF && OP(scan) == EXACTF && STR_LEN(scan) >= 6) {
61a36c01
JH
740/*
741 Two problematic code points in Unicode casefolding of EXACT nodes:
742
743 U+0390 - GREEK SMALL LETTER IOTA WITH DIALYTIKA AND TONOS
744 U+03B0 - GREEK SMALL LETTER UPSILON WITH DIALYTIKA AND TONOS
745
746 which casefold to
747
748 Unicode UTF-8
749
750 U+03B9 U+0308 U+0301 0xCE 0xB9 0xCC 0x88 0xCC 0x81
751 U+03C5 U+0308 U+0301 0xCF 0x85 0xCC 0x88 0xCC 0x81
752
753 This means that in case-insensitive matching (or "loose matching",
754 as Unicode calls it), an EXACTF of length six (the UTF-8 encoded byte
755 length of the above casefolded versions) can match a target string
756 of length two (the byte length of UTF-8 encoded U+0390 or U+03B0).
757 This would rather mess up the minimum length computation.
758
759 What we'll do is to look for the tail four bytes, and then peek
760 at the preceding two bytes to see whether we need to decrease
761 the minimum length by four (six minus two).
762
763 Thanks to the design of UTF-8, there cannot be false matches:
764 A sequence of valid UTF-8 bytes cannot be a subsequence of
765 another valid sequence of UTF-8 bytes.
766
767*/
768 char *s0 = STRING(scan), *s, *t;
769 char *s1 = s0 + STR_LEN(scan) - 1, *s2 = s1 - 4;
770 char *t0 = "\xcc\x88\xcc\x81";
771 char *t1 = t0 + 3;
772
773 for (s = s0 + 2;
774 s < s2 && (t = ninstr(s, s1, t0, t1));
775 s = t + 4) {
776 if (((U8)t[-1] == 0xB9 && (U8)t[-2] == 0xCE) ||
777 ((U8)t[-1] == 0x85 && (U8)t[-2] == 0xCF))
778 min -= 4;
779 }
780 }
781
c277df42
IZ
782#ifdef DEBUGGING
783 /* Allow dumping */
cd439c50 784 n = scan + NODE_SZ_STR(scan);
c277df42 785 while (n <= stop) {
22c35a8c 786 if (PL_regkind[(U8)OP(n)] != NOTHING || OP(n) == NOTHING) {
c277df42
IZ
787 OP(n) = OPTIMIZED;
788 NEXT_OFF(n) = 0;
789 }
790 n++;
791 }
653099ff 792#endif
c277df42 793 }
653099ff
GS
794 /* Follow the next-chain of the current node and optimize
795 away all the NOTHINGs from it. */
c277df42 796 if (OP(scan) != CURLYX) {
048cfca1
GS
797 int max = (reg_off_by_arg[OP(scan)]
798 ? I32_MAX
799 /* I32 may be smaller than U16 on CRAYs! */
800 : (I32_MAX < U16_MAX ? I32_MAX : U16_MAX));
c277df42
IZ
801 int off = (reg_off_by_arg[OP(scan)] ? ARG(scan) : NEXT_OFF(scan));
802 int noff;
803 regnode *n = scan;
b81d288d 804
c277df42
IZ
805 /* Skip NOTHING and LONGJMP. */
806 while ((n = regnext(n))
22c35a8c 807 && ((PL_regkind[(U8)OP(n)] == NOTHING && (noff = NEXT_OFF(n)))
c277df42
IZ
808 || ((OP(n) == LONGJMP) && (noff = ARG(n))))
809 && off + noff < max)
810 off += noff;
811 if (reg_off_by_arg[OP(scan)])
812 ARG(scan) = off;
b81d288d 813 else
c277df42
IZ
814 NEXT_OFF(scan) = off;
815 }
653099ff
GS
816 /* The principal pseudo-switch. Cannot be a switch, since we
817 look into several different things. */
b81d288d 818 if (OP(scan) == BRANCH || OP(scan) == BRANCHJ
c277df42
IZ
819 || OP(scan) == IFTHEN || OP(scan) == SUSPEND) {
820 next = regnext(scan);
821 code = OP(scan);
b81d288d
AB
822
823 if (OP(next) == code || code == IFTHEN || code == SUSPEND) {
c277df42 824 I32 max1 = 0, min1 = I32_MAX, num = 0;
653099ff 825 struct regnode_charclass_class accum;
c277df42 826
653099ff 827 if (flags & SCF_DO_SUBSTR) /* XXXX Add !SUSPEND? */
830247a4 828 scan_commit(pRExC_state, data); /* Cannot merge strings after this. */
653099ff 829 if (flags & SCF_DO_STCLASS)
830247a4 830 cl_init_zero(pRExC_state, &accum);
c277df42 831 while (OP(scan) == code) {
830247a4 832 I32 deltanext, minnext, f = 0, fake;
653099ff 833 struct regnode_charclass_class this_class;
c277df42
IZ
834
835 num++;
836 data_fake.flags = 0;
b81d288d 837 if (data) {
2c2d71f5 838 data_fake.whilem_c = data->whilem_c;
cb434fcc
IZ
839 data_fake.last_closep = data->last_closep;
840 }
841 else
842 data_fake.last_closep = &fake;
c277df42
IZ
843 next = regnext(scan);
844 scan = NEXTOPER(scan);
845 if (code != BRANCH)
846 scan = NEXTOPER(scan);
653099ff 847 if (flags & SCF_DO_STCLASS) {
830247a4 848 cl_init(pRExC_state, &this_class);
653099ff
GS
849 data_fake.start_class = &this_class;
850 f = SCF_DO_STCLASS_AND;
b81d288d 851 }
e1901655
IZ
852 if (flags & SCF_WHILEM_VISITED_POS)
853 f |= SCF_WHILEM_VISITED_POS;
653099ff 854 /* we suppose the run is continuous, last=next...*/
830247a4
IZ
855 minnext = study_chunk(pRExC_state, &scan, &deltanext,
856 next, &data_fake, f);
b81d288d 857 if (min1 > minnext)
c277df42
IZ
858 min1 = minnext;
859 if (max1 < minnext + deltanext)
860 max1 = minnext + deltanext;
861 if (deltanext == I32_MAX)
aca2d497 862 is_inf = is_inf_internal = 1;
c277df42
IZ
863 scan = next;
864 if (data_fake.flags & (SF_HAS_PAR|SF_IN_PAR))
865 pars++;
405ff068 866 if (data && (data_fake.flags & SF_HAS_EVAL))
c277df42 867 data->flags |= SF_HAS_EVAL;
2c2d71f5
JH
868 if (data)
869 data->whilem_c = data_fake.whilem_c;
653099ff 870 if (flags & SCF_DO_STCLASS)
830247a4 871 cl_or(pRExC_state, &accum, &this_class);
b81d288d 872 if (code == SUSPEND)
c277df42
IZ
873 break;
874 }
875 if (code == IFTHEN && num < 2) /* Empty ELSE branch */
876 min1 = 0;
877 if (flags & SCF_DO_SUBSTR) {
878 data->pos_min += min1;
879 data->pos_delta += max1 - min1;
880 if (max1 != min1 || is_inf)
881 data->longest = &(data->longest_float);
882 }
883 min += min1;
884 delta += max1 - min1;
653099ff 885 if (flags & SCF_DO_STCLASS_OR) {
830247a4 886 cl_or(pRExC_state, data->start_class, &accum);
653099ff
GS
887 if (min1) {
888 cl_and(data->start_class, &and_with);
889 flags &= ~SCF_DO_STCLASS;
890 }
891 }
892 else if (flags & SCF_DO_STCLASS_AND) {
de0c8cb8
GS
893 if (min1) {
894 cl_and(data->start_class, &accum);
653099ff 895 flags &= ~SCF_DO_STCLASS;
de0c8cb8
GS
896 }
897 else {
b81d288d 898 /* Switch to OR mode: cache the old value of
de0c8cb8
GS
899 * data->start_class */
900 StructCopy(data->start_class, &and_with,
901 struct regnode_charclass_class);
902 flags &= ~SCF_DO_STCLASS_AND;
903 StructCopy(&accum, data->start_class,
904 struct regnode_charclass_class);
905 flags |= SCF_DO_STCLASS_OR;
906 data->start_class->flags |= ANYOF_EOS;
907 }
653099ff 908 }
a0ed51b3
LW
909 }
910 else if (code == BRANCHJ) /* single branch is optimized. */
c277df42
IZ
911 scan = NEXTOPER(NEXTOPER(scan));
912 else /* single branch is optimized. */
913 scan = NEXTOPER(scan);
914 continue;
a0ed51b3
LW
915 }
916 else if (OP(scan) == EXACT) {
cd439c50 917 I32 l = STR_LEN(scan);
1aa99e6b 918 UV uc = *((U8*)STRING(scan));
a0ed51b3 919 if (UTF) {
1aa99e6b
IH
920 U8 *s = (U8*)STRING(scan);
921 l = utf8_length(s, s + l);
9041c2e3 922 uc = utf8_to_uvchr(s, NULL);
a0ed51b3
LW
923 }
924 min += l;
c277df42 925 if (flags & SCF_DO_SUBSTR) { /* Update longest substr. */
c277df42
IZ
926 /* The code below prefers earlier match for fixed
927 offset, later match for variable offset. */
928 if (data->last_end == -1) { /* Update the start info. */
929 data->last_start_min = data->pos_min;
930 data->last_start_max = is_inf
b81d288d 931 ? I32_MAX : data->pos_min + data->pos_delta;
c277df42 932 }
cd439c50 933 sv_catpvn(data->last_found, STRING(scan), STR_LEN(scan));
33b8afdf
JH
934 if (UTF)
935 SvUTF8_on(data->last_found);
c277df42
IZ
936 data->last_end = data->pos_min + l;
937 data->pos_min += l; /* As in the first entry. */
938 data->flags &= ~SF_BEFORE_EOL;
939 }
653099ff
GS
940 if (flags & SCF_DO_STCLASS_AND) {
941 /* Check whether it is compatible with what we know already! */
942 int compat = 1;
943
1aa99e6b 944 if (uc >= 0x100 ||
516a5887 945 (!(data->start_class->flags & (ANYOF_CLASS | ANYOF_LOCALE))
1aa99e6b 946 && !ANYOF_BITMAP_TEST(data->start_class, uc)
653099ff 947 && (!(data->start_class->flags & ANYOF_FOLD)
1aa99e6b 948 || !ANYOF_BITMAP_TEST(data->start_class, PL_fold[uc])))
516a5887 949 )
653099ff
GS
950 compat = 0;
951 ANYOF_CLASS_ZERO(data->start_class);
952 ANYOF_BITMAP_ZERO(data->start_class);
953 if (compat)
1aa99e6b 954 ANYOF_BITMAP_SET(data->start_class, uc);
653099ff 955 data->start_class->flags &= ~ANYOF_EOS;
9b877dbb
IH
956 if (uc < 0x100)
957 data->start_class->flags &= ~ANYOF_UNICODE_ALL;
653099ff
GS
958 }
959 else if (flags & SCF_DO_STCLASS_OR) {
960 /* false positive possible if the class is case-folded */
1aa99e6b 961 if (uc < 0x100)
9b877dbb
IH
962 ANYOF_BITMAP_SET(data->start_class, uc);
963 else
964 data->start_class->flags |= ANYOF_UNICODE_ALL;
653099ff
GS
965 data->start_class->flags &= ~ANYOF_EOS;
966 cl_and(data->start_class, &and_with);
967 }
968 flags &= ~SCF_DO_STCLASS;
a0ed51b3 969 }
653099ff 970 else if (PL_regkind[(U8)OP(scan)] == EXACT) { /* But OP != EXACT! */
cd439c50 971 I32 l = STR_LEN(scan);
1aa99e6b 972 UV uc = *((U8*)STRING(scan));
653099ff
GS
973
974 /* Search for fixed substrings supports EXACT only. */
b81d288d 975 if (flags & SCF_DO_SUBSTR)
830247a4 976 scan_commit(pRExC_state, data);
a0ed51b3 977 if (UTF) {
1aa99e6b
IH
978 U8 *s = (U8 *)STRING(scan);
979 l = utf8_length(s, s + l);
9041c2e3 980 uc = utf8_to_uvchr(s, NULL);
a0ed51b3
LW
981 }
982 min += l;
c277df42 983 if (data && (flags & SCF_DO_SUBSTR))
a0ed51b3 984 data->pos_min += l;
653099ff
GS
985 if (flags & SCF_DO_STCLASS_AND) {
986 /* Check whether it is compatible with what we know already! */
987 int compat = 1;
988
1aa99e6b 989 if (uc >= 0x100 ||
516a5887 990 (!(data->start_class->flags & (ANYOF_CLASS | ANYOF_LOCALE))
1aa99e6b 991 && !ANYOF_BITMAP_TEST(data->start_class, uc)
516a5887 992 && !ANYOF_BITMAP_TEST(data->start_class, PL_fold[uc])))
653099ff
GS
993 compat = 0;
994 ANYOF_CLASS_ZERO(data->start_class);
995 ANYOF_BITMAP_ZERO(data->start_class);
996 if (compat) {
1aa99e6b 997 ANYOF_BITMAP_SET(data->start_class, uc);
653099ff
GS
998 data->start_class->flags &= ~ANYOF_EOS;
999 data->start_class->flags |= ANYOF_FOLD;
1000 if (OP(scan) == EXACTFL)
1001 data->start_class->flags |= ANYOF_LOCALE;
1002 }
1003 }
1004 else if (flags & SCF_DO_STCLASS_OR) {
1005 if (data->start_class->flags & ANYOF_FOLD) {
1006 /* false positive possible if the class is case-folded.
1007 Assume that the locale settings are the same... */
1aa99e6b
IH
1008 if (uc < 0x100)
1009 ANYOF_BITMAP_SET(data->start_class, uc);
653099ff
GS
1010 data->start_class->flags &= ~ANYOF_EOS;
1011 }
1012 cl_and(data->start_class, &and_with);
1013 }
1014 flags &= ~SCF_DO_STCLASS;
a0ed51b3 1015 }
4d61ec05 1016 else if (strchr((char*)PL_varies,OP(scan))) {
9c5ffd7c 1017 I32 mincount, maxcount, minnext, deltanext, fl = 0;
aa7a4b56 1018 I32 f = flags, pos_before = 0;
c277df42 1019 regnode *oscan = scan;
653099ff
GS
1020 struct regnode_charclass_class this_class;
1021 struct regnode_charclass_class *oclass = NULL;
727f22e3 1022 I32 next_is_eval = 0;
653099ff 1023
22c35a8c 1024 switch (PL_regkind[(U8)OP(scan)]) {
653099ff 1025 case WHILEM: /* End of (?:...)* . */
c277df42
IZ
1026 scan = NEXTOPER(scan);
1027 goto finish;
1028 case PLUS:
653099ff 1029 if (flags & (SCF_DO_SUBSTR | SCF_DO_STCLASS)) {
c277df42 1030 next = NEXTOPER(scan);
653099ff 1031 if (OP(next) == EXACT || (flags & SCF_DO_STCLASS)) {
b81d288d
AB
1032 mincount = 1;
1033 maxcount = REG_INFTY;
c277df42
IZ
1034 next = regnext(scan);
1035 scan = NEXTOPER(scan);
1036 goto do_curly;
1037 }
1038 }
1039 if (flags & SCF_DO_SUBSTR)
1040 data->pos_min++;
1041 min++;
1042 /* Fall through. */
1043 case STAR:
653099ff
GS
1044 if (flags & SCF_DO_STCLASS) {
1045 mincount = 0;
b81d288d 1046 maxcount = REG_INFTY;
653099ff
GS
1047 next = regnext(scan);
1048 scan = NEXTOPER(scan);
1049 goto do_curly;
1050 }
b81d288d 1051 is_inf = is_inf_internal = 1;
c277df42
IZ
1052 scan = regnext(scan);
1053 if (flags & SCF_DO_SUBSTR) {
830247a4 1054 scan_commit(pRExC_state, data); /* Cannot extend fixed substrings */
c277df42
IZ
1055 data->longest = &(data->longest_float);
1056 }
1057 goto optimize_curly_tail;
1058 case CURLY:
b81d288d 1059 mincount = ARG1(scan);
c277df42
IZ
1060 maxcount = ARG2(scan);
1061 next = regnext(scan);
cb434fcc
IZ
1062 if (OP(scan) == CURLYX) {
1063 I32 lp = (data ? *(data->last_closep) : 0);
1064
1065 scan->flags = ((lp <= U8_MAX) ? lp : U8_MAX);
1066 }
c277df42 1067 scan = NEXTOPER(scan) + EXTRA_STEP_2ARGS;
727f22e3 1068 next_is_eval = (OP(scan) == EVAL);
c277df42
IZ
1069 do_curly:
1070 if (flags & SCF_DO_SUBSTR) {
830247a4 1071 if (mincount == 0) scan_commit(pRExC_state,data); /* Cannot extend fixed substrings */
c277df42
IZ
1072 pos_before = data->pos_min;
1073 }
1074 if (data) {
1075 fl = data->flags;
1076 data->flags &= ~(SF_HAS_PAR|SF_IN_PAR|SF_HAS_EVAL);
1077 if (is_inf)
1078 data->flags |= SF_IS_INF;
1079 }
653099ff 1080 if (flags & SCF_DO_STCLASS) {
830247a4 1081 cl_init(pRExC_state, &this_class);
653099ff
GS
1082 oclass = data->start_class;
1083 data->start_class = &this_class;
1084 f |= SCF_DO_STCLASS_AND;
1085 f &= ~SCF_DO_STCLASS_OR;
1086 }
e1901655
IZ
1087 /* These are the cases when once a subexpression
1088 fails at a particular position, it cannot succeed
1089 even after backtracking at the enclosing scope.
b81d288d 1090
e1901655
IZ
1091 XXXX what if minimal match and we are at the
1092 initial run of {n,m}? */
1093 if ((mincount != maxcount - 1) && (maxcount != REG_INFTY))
1094 f &= ~SCF_WHILEM_VISITED_POS;
653099ff 1095
c277df42 1096 /* This will finish on WHILEM, setting scan, or on NULL: */
b81d288d
AB
1097 minnext = study_chunk(pRExC_state, &scan, &deltanext, last, data,
1098 mincount == 0
653099ff
GS
1099 ? (f & ~SCF_DO_SUBSTR) : f);
1100
1101 if (flags & SCF_DO_STCLASS)
1102 data->start_class = oclass;
1103 if (mincount == 0 || minnext == 0) {
1104 if (flags & SCF_DO_STCLASS_OR) {
830247a4 1105 cl_or(pRExC_state, data->start_class, &this_class);
653099ff
GS
1106 }
1107 else if (flags & SCF_DO_STCLASS_AND) {
b81d288d 1108 /* Switch to OR mode: cache the old value of
653099ff
GS
1109 * data->start_class */
1110 StructCopy(data->start_class, &and_with,
1111 struct regnode_charclass_class);
1112 flags &= ~SCF_DO_STCLASS_AND;
1113 StructCopy(&this_class, data->start_class,
1114 struct regnode_charclass_class);
1115 flags |= SCF_DO_STCLASS_OR;
1116 data->start_class->flags |= ANYOF_EOS;
1117 }
1118 } else { /* Non-zero len */
1119 if (flags & SCF_DO_STCLASS_OR) {
830247a4 1120 cl_or(pRExC_state, data->start_class, &this_class);
653099ff
GS
1121 cl_and(data->start_class, &and_with);
1122 }
1123 else if (flags & SCF_DO_STCLASS_AND)
1124 cl_and(data->start_class, &this_class);
1125 flags &= ~SCF_DO_STCLASS;
1126 }
c277df42
IZ
1127 if (!scan) /* It was not CURLYX, but CURLY. */
1128 scan = next;
84037bb0 1129 if (ckWARN(WARN_REGEXP)
727f22e3
JP
1130 /* ? quantifier ok, except for (?{ ... }) */
1131 && (next_is_eval || !(mincount == 0 && maxcount == 1))
84037bb0 1132 && (minnext == 0) && (deltanext == 0)
99799961 1133 && data && !(data->flags & (SF_HAS_PAR|SF_IN_PAR))
17feb5d5 1134 && maxcount <= REG_INFTY/3) /* Complement check for big count */
b45f050a 1135 {
830247a4 1136 vWARN(RExC_parse,
b45f050a
JF
1137 "Quantifier unexpected on zero-length expression");
1138 }
1139
c277df42 1140 min += minnext * mincount;
b81d288d 1141 is_inf_internal |= ((maxcount == REG_INFTY
155aba94
GS
1142 && (minnext + deltanext) > 0)
1143 || deltanext == I32_MAX);
aca2d497 1144 is_inf |= is_inf_internal;
c277df42
IZ
1145 delta += (minnext + deltanext) * maxcount - minnext * mincount;
1146
1147 /* Try powerful optimization CURLYX => CURLYN. */
b81d288d 1148 if ( OP(oscan) == CURLYX && data
c277df42
IZ
1149 && data->flags & SF_IN_PAR
1150 && !(data->flags & SF_HAS_EVAL)
1151 && !deltanext && minnext == 1 ) {
1152 /* Try to optimize to CURLYN. */
1153 regnode *nxt = NEXTOPER(oscan) + EXTRA_STEP_2ARGS;
497b47a8
JH
1154 regnode *nxt1 = nxt;
1155#ifdef DEBUGGING
1156 regnode *nxt2;
1157#endif
c277df42
IZ
1158
1159 /* Skip open. */
1160 nxt = regnext(nxt);
4d61ec05 1161 if (!strchr((char*)PL_simple,OP(nxt))
22c35a8c 1162 && !(PL_regkind[(U8)OP(nxt)] == EXACT
b81d288d 1163 && STR_LEN(nxt) == 1))
c277df42 1164 goto nogo;
497b47a8 1165#ifdef DEBUGGING
c277df42 1166 nxt2 = nxt;
497b47a8 1167#endif
c277df42 1168 nxt = regnext(nxt);
b81d288d 1169 if (OP(nxt) != CLOSE)
c277df42
IZ
1170 goto nogo;
1171 /* Now we know that nxt2 is the only contents: */
1172 oscan->flags = ARG(nxt);
1173 OP(oscan) = CURLYN;
1174 OP(nxt1) = NOTHING; /* was OPEN. */
1175#ifdef DEBUGGING
1176 OP(nxt1 + 1) = OPTIMIZED; /* was count. */
1177 NEXT_OFF(nxt1+ 1) = 0; /* just for consistancy. */
1178 NEXT_OFF(nxt2) = 0; /* just for consistancy with CURLY. */
1179 OP(nxt) = OPTIMIZED; /* was CLOSE. */
1180 OP(nxt + 1) = OPTIMIZED; /* was count. */
1181 NEXT_OFF(nxt+ 1) = 0; /* just for consistancy. */
b81d288d 1182#endif
c277df42 1183 }
c277df42
IZ
1184 nogo:
1185
1186 /* Try optimization CURLYX => CURLYM. */
b81d288d 1187 if ( OP(oscan) == CURLYX && data
c277df42 1188 && !(data->flags & SF_HAS_PAR)
c277df42
IZ
1189 && !(data->flags & SF_HAS_EVAL)
1190 && !deltanext ) {
1191 /* XXXX How to optimize if data == 0? */
1192 /* Optimize to a simpler form. */
1193 regnode *nxt = NEXTOPER(oscan) + EXTRA_STEP_2ARGS; /* OPEN */
1194 regnode *nxt2;
1195
1196 OP(oscan) = CURLYM;
1197 while ( (nxt2 = regnext(nxt)) /* skip over embedded stuff*/
b81d288d 1198 && (OP(nxt2) != WHILEM))
c277df42
IZ
1199 nxt = nxt2;
1200 OP(nxt2) = SUCCEED; /* Whas WHILEM */
c277df42
IZ
1201 /* Need to optimize away parenths. */
1202 if (data->flags & SF_IN_PAR) {
1203 /* Set the parenth number. */
1204 regnode *nxt1 = NEXTOPER(oscan) + EXTRA_STEP_2ARGS; /* OPEN*/
1205
b81d288d 1206 if (OP(nxt) != CLOSE)
b45f050a 1207 FAIL("Panic opt close");
c277df42
IZ
1208 oscan->flags = ARG(nxt);
1209 OP(nxt1) = OPTIMIZED; /* was OPEN. */
1210 OP(nxt) = OPTIMIZED; /* was CLOSE. */
1211#ifdef DEBUGGING
1212 OP(nxt1 + 1) = OPTIMIZED; /* was count. */
1213 OP(nxt + 1) = OPTIMIZED; /* was count. */
1214 NEXT_OFF(nxt1 + 1) = 0; /* just for consistancy. */
1215 NEXT_OFF(nxt + 1) = 0; /* just for consistancy. */
b81d288d 1216#endif
c277df42
IZ
1217#if 0
1218 while ( nxt1 && (OP(nxt1) != WHILEM)) {
1219 regnode *nnxt = regnext(nxt1);
b81d288d 1220
c277df42
IZ
1221 if (nnxt == nxt) {
1222 if (reg_off_by_arg[OP(nxt1)])
1223 ARG_SET(nxt1, nxt2 - nxt1);
1224 else if (nxt2 - nxt1 < U16_MAX)
1225 NEXT_OFF(nxt1) = nxt2 - nxt1;
1226 else
1227 OP(nxt) = NOTHING; /* Cannot beautify */
1228 }
1229 nxt1 = nnxt;
1230 }
1231#endif
1232 /* Optimize again: */
b81d288d 1233 study_chunk(pRExC_state, &nxt1, &deltanext, nxt,
e1901655 1234 NULL, 0);
a0ed51b3
LW
1235 }
1236 else
c277df42 1237 oscan->flags = 0;
c277df42 1238 }
e1901655
IZ
1239 else if ((OP(oscan) == CURLYX)
1240 && (flags & SCF_WHILEM_VISITED_POS)
1241 /* See the comment on a similar expression above.
1242 However, this time it not a subexpression
1243 we care about, but the expression itself. */
1244 && (maxcount == REG_INFTY)
1245 && data && ++data->whilem_c < 16) {
1246 /* This stays as CURLYX, we can put the count/of pair. */
2c2d71f5
JH
1247 /* Find WHILEM (as in regexec.c) */
1248 regnode *nxt = oscan + NEXT_OFF(oscan);
1249
1250 if (OP(PREVOPER(nxt)) == NOTHING) /* LONGJMP */
1251 nxt += ARG(nxt);
1252 PREVOPER(nxt)->flags = data->whilem_c
830247a4 1253 | (RExC_whilem_seen << 4); /* On WHILEM */
2c2d71f5 1254 }
b81d288d 1255 if (data && fl & (SF_HAS_PAR|SF_IN_PAR))
c277df42
IZ
1256 pars++;
1257 if (flags & SCF_DO_SUBSTR) {
1258 SV *last_str = Nullsv;
1259 int counted = mincount != 0;
1260
1261 if (data->last_end > 0 && mincount != 0) { /* Ends with a string. */
5d1c421c
JH
1262#if defined(SPARC64_GCC_WORKAROUND)
1263 I32 b = 0;
1264 STRLEN l = 0;
1265 char *s = NULL;
1266 I32 old = 0;
1267
1268 if (pos_before >= data->last_start_min)
1269 b = pos_before;
1270 else
1271 b = data->last_start_min;
1272
1273 l = 0;
1274 s = SvPV(data->last_found, l);
1275 old = b - data->last_start_min;
1276
1277#else
b81d288d 1278 I32 b = pos_before >= data->last_start_min
c277df42
IZ
1279 ? pos_before : data->last_start_min;
1280 STRLEN l;
1281 char *s = SvPV(data->last_found, l);
a0ed51b3 1282 I32 old = b - data->last_start_min;
5d1c421c 1283#endif
a0ed51b3
LW
1284
1285 if (UTF)
1286 old = utf8_hop((U8*)s, old) - (U8*)s;
c277df42 1287
a0ed51b3 1288 l -= old;
c277df42 1289 /* Get the added string: */
79cb57f6 1290 last_str = newSVpvn(s + old, l);
c277df42
IZ
1291 if (deltanext == 0 && pos_before == b) {
1292 /* What was added is a constant string */
1293 if (mincount > 1) {
1294 SvGROW(last_str, (mincount * l) + 1);
b81d288d 1295 repeatcpy(SvPVX(last_str) + l,
c277df42
IZ
1296 SvPVX(last_str), l, mincount - 1);
1297 SvCUR(last_str) *= mincount;
1298 /* Add additional parts. */
b81d288d 1299 SvCUR_set(data->last_found,
c277df42
IZ
1300 SvCUR(data->last_found) - l);
1301 sv_catsv(data->last_found, last_str);
1302 data->last_end += l * (mincount - 1);
1303 }
2a8d9689
HS
1304 } else {
1305 /* start offset must point into the last copy */
1306 data->last_start_min += minnext * (mincount - 1);
4b2cff9a
HS
1307 data->last_start_max += is_inf ? 0 : (maxcount - 1)
1308 * (minnext + data->pos_delta);
c277df42
IZ
1309 }
1310 }
1311 /* It is counted once already... */
1312 data->pos_min += minnext * (mincount - counted);
1313 data->pos_delta += - counted * deltanext +
1314 (minnext + deltanext) * maxcount - minnext * mincount;
1315 if (mincount != maxcount) {
653099ff
GS
1316 /* Cannot extend fixed substrings found inside
1317 the group. */
830247a4 1318 scan_commit(pRExC_state,data);
c277df42
IZ
1319 if (mincount && last_str) {
1320 sv_setsv(data->last_found, last_str);
1321 data->last_end = data->pos_min;
b81d288d 1322 data->last_start_min =
a0ed51b3 1323 data->pos_min - CHR_SVLEN(last_str);
b81d288d
AB
1324 data->last_start_max = is_inf
1325 ? I32_MAX
c277df42 1326 : data->pos_min + data->pos_delta
a0ed51b3 1327 - CHR_SVLEN(last_str);
c277df42
IZ
1328 }
1329 data->longest = &(data->longest_float);
1330 }
aca2d497 1331 SvREFCNT_dec(last_str);
c277df42 1332 }
405ff068 1333 if (data && (fl & SF_HAS_EVAL))
c277df42
IZ
1334 data->flags |= SF_HAS_EVAL;
1335 optimize_curly_tail:
c277df42 1336 if (OP(oscan) != CURLYX) {
22c35a8c 1337 while (PL_regkind[(U8)OP(next = regnext(oscan))] == NOTHING
c277df42
IZ
1338 && NEXT_OFF(next))
1339 NEXT_OFF(oscan) += NEXT_OFF(next);
1340 }
c277df42 1341 continue;
653099ff 1342 default: /* REF and CLUMP only? */
c277df42 1343 if (flags & SCF_DO_SUBSTR) {
830247a4 1344 scan_commit(pRExC_state,data); /* Cannot expect anything... */
c277df42
IZ
1345 data->longest = &(data->longest_float);
1346 }
aca2d497 1347 is_inf = is_inf_internal = 1;
653099ff 1348 if (flags & SCF_DO_STCLASS_OR)
830247a4 1349 cl_anything(pRExC_state, data->start_class);
653099ff 1350 flags &= ~SCF_DO_STCLASS;
c277df42
IZ
1351 break;
1352 }
a0ed51b3 1353 }
ffc61ed2 1354 else if (strchr((char*)PL_simple,OP(scan))) {
9c5ffd7c 1355 int value = 0;
653099ff 1356
c277df42 1357 if (flags & SCF_DO_SUBSTR) {
830247a4 1358 scan_commit(pRExC_state,data);
c277df42
IZ
1359 data->pos_min++;
1360 }
1361 min++;
653099ff
GS
1362 if (flags & SCF_DO_STCLASS) {
1363 data->start_class->flags &= ~ANYOF_EOS; /* No match on empty */
1364
1365 /* Some of the logic below assumes that switching
1366 locale on will only add false positives. */
1367 switch (PL_regkind[(U8)OP(scan)]) {
653099ff 1368 case SANY:
653099ff
GS
1369 default:
1370 do_default:
1371 /* Perl_croak(aTHX_ "panic: unexpected simple REx opcode %d", OP(scan)); */
1372 if (flags & SCF_DO_STCLASS_OR) /* Allow everything */
830247a4 1373 cl_anything(pRExC_state, data->start_class);
653099ff
GS
1374 break;
1375 case REG_ANY:
1376 if (OP(scan) == SANY)
1377 goto do_default;
1378 if (flags & SCF_DO_STCLASS_OR) { /* Everything but \n */
1379 value = (ANYOF_BITMAP_TEST(data->start_class,'\n')
1380 || (data->start_class->flags & ANYOF_CLASS));
830247a4 1381 cl_anything(pRExC_state, data->start_class);
653099ff
GS
1382 }
1383 if (flags & SCF_DO_STCLASS_AND || !value)
1384 ANYOF_BITMAP_CLEAR(data->start_class,'\n');
1385 break;
1386 case ANYOF:
1387 if (flags & SCF_DO_STCLASS_AND)
1388 cl_and(data->start_class,
1389 (struct regnode_charclass_class*)scan);
1390 else
830247a4 1391 cl_or(pRExC_state, data->start_class,
653099ff
GS
1392 (struct regnode_charclass_class*)scan);
1393 break;
1394 case ALNUM:
1395 if (flags & SCF_DO_STCLASS_AND) {
1396 if (!(data->start_class->flags & ANYOF_LOCALE)) {
1397 ANYOF_CLASS_CLEAR(data->start_class,ANYOF_NALNUM);
1398 for (value = 0; value < 256; value++)
1399 if (!isALNUM(value))
1400 ANYOF_BITMAP_CLEAR(data->start_class, value);
1401 }
1402 }
1403 else {
1404 if (data->start_class->flags & ANYOF_LOCALE)
1405 ANYOF_CLASS_SET(data->start_class,ANYOF_ALNUM);
1406 else {
1407 for (value = 0; value < 256; value++)
1408 if (isALNUM(value))
b81d288d 1409 ANYOF_BITMAP_SET(data->start_class, value);
653099ff
GS
1410 }
1411 }
1412 break;
1413 case ALNUML:
1414 if (flags & SCF_DO_STCLASS_AND) {
1415 if (data->start_class->flags & ANYOF_LOCALE)
1416 ANYOF_CLASS_CLEAR(data->start_class,ANYOF_NALNUM);
1417 }
1418 else {
1419 ANYOF_CLASS_SET(data->start_class,ANYOF_ALNUM);
1420 data->start_class->flags |= ANYOF_LOCALE;
1421 }
1422 break;
1423 case NALNUM:
1424 if (flags & SCF_DO_STCLASS_AND) {
1425 if (!(data->start_class->flags & ANYOF_LOCALE)) {
1426 ANYOF_CLASS_CLEAR(data->start_class,ANYOF_ALNUM);
1427 for (value = 0; value < 256; value++)
1428 if (isALNUM(value))
1429 ANYOF_BITMAP_CLEAR(data->start_class, value);
1430 }
1431 }
1432 else {
1433 if (data->start_class->flags & ANYOF_LOCALE)
1434 ANYOF_CLASS_SET(data->start_class,ANYOF_NALNUM);
1435 else {
1436 for (value = 0; value < 256; value++)
1437 if (!isALNUM(value))
b81d288d 1438 ANYOF_BITMAP_SET(data->start_class, value);
653099ff
GS
1439 }
1440 }
1441 break;
1442 case NALNUML:
1443 if (flags & SCF_DO_STCLASS_AND) {
1444 if (data->start_class->flags & ANYOF_LOCALE)
1445 ANYOF_CLASS_CLEAR(data->start_class,ANYOF_ALNUM);
1446 }
1447 else {
1448 data->start_class->flags |= ANYOF_LOCALE;
1449 ANYOF_CLASS_SET(data->start_class,ANYOF_NALNUM);
1450 }
1451 break;
1452 case SPACE:
1453 if (flags & SCF_DO_STCLASS_AND) {
1454 if (!(data->start_class->flags & ANYOF_LOCALE)) {
1455 ANYOF_CLASS_CLEAR(data->start_class,ANYOF_NSPACE);
1456 for (value = 0; value < 256; value++)
1457 if (!isSPACE(value))
1458 ANYOF_BITMAP_CLEAR(data->start_class, value);
1459 }
1460 }
1461 else {
1462 if (data->start_class->flags & ANYOF_LOCALE)
1463 ANYOF_CLASS_SET(data->start_class,ANYOF_SPACE);
1464 else {
1465 for (value = 0; value < 256; value++)
1466 if (isSPACE(value))
b81d288d 1467 ANYOF_BITMAP_SET(data->start_class, value);
653099ff
GS
1468 }
1469 }
1470 break;
1471 case SPACEL:
1472 if (flags & SCF_DO_STCLASS_AND) {
1473 if (data->start_class->flags & ANYOF_LOCALE)
1474 ANYOF_CLASS_CLEAR(data->start_class,ANYOF_NSPACE);
1475 }
1476 else {
1477 data->start_class->flags |= ANYOF_LOCALE;
1478 ANYOF_CLASS_SET(data->start_class,ANYOF_SPACE);
1479 }
1480 break;
1481 case NSPACE:
1482 if (flags & SCF_DO_STCLASS_AND) {
1483 if (!(data->start_class->flags & ANYOF_LOCALE)) {
1484 ANYOF_CLASS_CLEAR(data->start_class,ANYOF_SPACE);
1485 for (value = 0; value < 256; value++)
1486 if (isSPACE(value))
1487 ANYOF_BITMAP_CLEAR(data->start_class, value);
1488 }
1489 }
1490 else {
1491 if (data->start_class->flags & ANYOF_LOCALE)
1492 ANYOF_CLASS_SET(data->start_class,ANYOF_NSPACE);
1493 else {
1494 for (value = 0; value < 256; value++)
1495 if (!isSPACE(value))
b81d288d 1496 ANYOF_BITMAP_SET(data->start_class, value);
653099ff
GS
1497 }
1498 }
1499 break;
1500 case NSPACEL:
1501 if (flags & SCF_DO_STCLASS_AND) {
1502 if (data->start_class->flags & ANYOF_LOCALE) {
1503 ANYOF_CLASS_CLEAR(data->start_class,ANYOF_SPACE);
1504 for (value = 0; value < 256; value++)
1505 if (!isSPACE(value))
1506 ANYOF_BITMAP_CLEAR(data->start_class, value);
1507 }
1508 }
1509 else {
1510 data->start_class->flags |= ANYOF_LOCALE;
1511 ANYOF_CLASS_SET(data->start_class,ANYOF_NSPACE);
1512 }
1513 break;
1514 case DIGIT:
1515 if (flags & SCF_DO_STCLASS_AND) {
1516 ANYOF_CLASS_CLEAR(data->start_class,ANYOF_NDIGIT);
1517 for (value = 0; value < 256; value++)
1518 if (!isDIGIT(value))
1519 ANYOF_BITMAP_CLEAR(data->start_class, value);
1520 }
1521 else {
1522 if (data->start_class->flags & ANYOF_LOCALE)
1523 ANYOF_CLASS_SET(data->start_class,ANYOF_DIGIT);
1524 else {
1525 for (value = 0; value < 256; value++)
1526 if (isDIGIT(value))
b81d288d 1527 ANYOF_BITMAP_SET(data->start_class, value);
653099ff
GS
1528 }
1529 }
1530 break;
1531 case NDIGIT:
1532 if (flags & SCF_DO_STCLASS_AND) {
1533 ANYOF_CLASS_CLEAR(data->start_class,ANYOF_DIGIT);
1534 for (value = 0; value < 256; value++)
1535 if (isDIGIT(value))
1536 ANYOF_BITMAP_CLEAR(data->start_class, value);
1537 }
1538 else {
1539 if (data->start_class->flags & ANYOF_LOCALE)
1540 ANYOF_CLASS_SET(data->start_class,ANYOF_NDIGIT);
1541 else {
1542 for (value = 0; value < 256; value++)
1543 if (!isDIGIT(value))
b81d288d 1544 ANYOF_BITMAP_SET(data->start_class, value);
653099ff
GS
1545 }
1546 }
1547 break;
1548 }
1549 if (flags & SCF_DO_STCLASS_OR)
1550 cl_and(data->start_class, &and_with);
1551 flags &= ~SCF_DO_STCLASS;
1552 }
a0ed51b3 1553 }
22c35a8c 1554 else if (PL_regkind[(U8)OP(scan)] == EOL && flags & SCF_DO_SUBSTR) {
c277df42
IZ
1555 data->flags |= (OP(scan) == MEOL
1556 ? SF_BEFORE_MEOL
1557 : SF_BEFORE_SEOL);
a0ed51b3 1558 }
653099ff
GS
1559 else if ( PL_regkind[(U8)OP(scan)] == BRANCHJ
1560 /* Lookbehind, or need to calculate parens/evals/stclass: */
1561 && (scan->flags || data || (flags & SCF_DO_STCLASS))
c277df42 1562 && (OP(scan) == IFMATCH || OP(scan) == UNLESSM)) {
653099ff 1563 /* Lookahead/lookbehind */
cb434fcc 1564 I32 deltanext, minnext, fake = 0;
c277df42 1565 regnode *nscan;
653099ff
GS
1566 struct regnode_charclass_class intrnl;
1567 int f = 0;
c277df42
IZ
1568
1569 data_fake.flags = 0;
b81d288d 1570 if (data) {
2c2d71f5 1571 data_fake.whilem_c = data->whilem_c;
cb434fcc
IZ
1572 data_fake.last_closep = data->last_closep;
1573 }
1574 else
1575 data_fake.last_closep = &fake;
653099ff
GS
1576 if ( flags & SCF_DO_STCLASS && !scan->flags
1577 && OP(scan) == IFMATCH ) { /* Lookahead */
830247a4 1578 cl_init(pRExC_state, &intrnl);
653099ff 1579 data_fake.start_class = &intrnl;
e1901655 1580 f |= SCF_DO_STCLASS_AND;
653099ff 1581 }
e1901655
IZ
1582 if (flags & SCF_WHILEM_VISITED_POS)
1583 f |= SCF_WHILEM_VISITED_POS;
c277df42
IZ
1584 next = regnext(scan);
1585 nscan = NEXTOPER(NEXTOPER(scan));
830247a4 1586 minnext = study_chunk(pRExC_state, &nscan, &deltanext, last, &data_fake, f);
c277df42
IZ
1587 if (scan->flags) {
1588 if (deltanext) {
9baa0206 1589 vFAIL("Variable length lookbehind not implemented");
a0ed51b3
LW
1590 }
1591 else if (minnext > U8_MAX) {
9baa0206 1592 vFAIL2("Lookbehind longer than %"UVuf" not implemented", (UV)U8_MAX);
c277df42
IZ
1593 }
1594 scan->flags = minnext;
1595 }
1596 if (data && data_fake.flags & (SF_HAS_PAR|SF_IN_PAR))
1597 pars++;
405ff068 1598 if (data && (data_fake.flags & SF_HAS_EVAL))
c277df42 1599 data->flags |= SF_HAS_EVAL;
2c2d71f5
JH
1600 if (data)
1601 data->whilem_c = data_fake.whilem_c;
e1901655 1602 if (f & SCF_DO_STCLASS_AND) {
653099ff
GS
1603 int was = (data->start_class->flags & ANYOF_EOS);
1604
1605 cl_and(data->start_class, &intrnl);
1606 if (was)
1607 data->start_class->flags |= ANYOF_EOS;
1608 }
a0ed51b3
LW
1609 }
1610 else if (OP(scan) == OPEN) {
c277df42 1611 pars++;
a0ed51b3 1612 }
cb434fcc
IZ
1613 else if (OP(scan) == CLOSE) {
1614 if (ARG(scan) == is_par) {
1615 next = regnext(scan);
c277df42 1616
cb434fcc
IZ
1617 if ( next && (OP(next) != WHILEM) && next < last)
1618 is_par = 0; /* Disable optimization */
1619 }
1620 if (data)
1621 *(data->last_closep) = ARG(scan);
a0ed51b3
LW
1622 }
1623 else if (OP(scan) == EVAL) {
c277df42
IZ
1624 if (data)
1625 data->flags |= SF_HAS_EVAL;
1626 }
96776eda 1627 else if (OP(scan) == LOGICAL && scan->flags == 2) { /* Embedded follows */
0f5d15d6 1628 if (flags & SCF_DO_SUBSTR) {
830247a4 1629 scan_commit(pRExC_state,data);
0f5d15d6
IZ
1630 data->longest = &(data->longest_float);
1631 }
1632 is_inf = is_inf_internal = 1;
653099ff 1633 if (flags & SCF_DO_STCLASS_OR) /* Allow everything */
830247a4 1634 cl_anything(pRExC_state, data->start_class);
96776eda 1635 flags &= ~SCF_DO_STCLASS;
0f5d15d6 1636 }
c277df42
IZ
1637 /* Else: zero-length, ignore. */
1638 scan = regnext(scan);
1639 }
1640
1641 finish:
1642 *scanp = scan;
aca2d497 1643 *deltap = is_inf_internal ? I32_MAX : delta;
b81d288d 1644 if (flags & SCF_DO_SUBSTR && is_inf)
c277df42
IZ
1645 data->pos_delta = I32_MAX - data->pos_min;
1646 if (is_par > U8_MAX)
1647 is_par = 0;
1648 if (is_par && pars==1 && data) {
1649 data->flags |= SF_IN_PAR;
1650 data->flags &= ~SF_HAS_PAR;
a0ed51b3
LW
1651 }
1652 else if (pars && data) {
c277df42
IZ
1653 data->flags |= SF_HAS_PAR;
1654 data->flags &= ~SF_IN_PAR;
1655 }
653099ff
GS
1656 if (flags & SCF_DO_STCLASS_OR)
1657 cl_and(data->start_class, &and_with);
c277df42
IZ
1658 return min;
1659}
1660
76e3520e 1661STATIC I32
830247a4 1662S_add_data(pTHX_ RExC_state_t *pRExC_state, I32 n, char *s)
c277df42 1663{
830247a4 1664 if (RExC_rx->data) {
b81d288d
AB
1665 Renewc(RExC_rx->data,
1666 sizeof(*RExC_rx->data) + sizeof(void*) * (RExC_rx->data->count + n - 1),
c277df42 1667 char, struct reg_data);
830247a4
IZ
1668 Renew(RExC_rx->data->what, RExC_rx->data->count + n, U8);
1669 RExC_rx->data->count += n;
a0ed51b3
LW
1670 }
1671 else {
830247a4 1672 Newc(1207, RExC_rx->data, sizeof(*RExC_rx->data) + sizeof(void*) * (n - 1),
c277df42 1673 char, struct reg_data);
830247a4
IZ
1674 New(1208, RExC_rx->data->what, n, U8);
1675 RExC_rx->data->count = n;
c277df42 1676 }
830247a4
IZ
1677 Copy(s, RExC_rx->data->what + RExC_rx->data->count - n, n, U8);
1678 return RExC_rx->data->count - n;
c277df42
IZ
1679}
1680
d88dccdf 1681void
864dbfa3 1682Perl_reginitcolors(pTHX)
d88dccdf 1683{
d88dccdf
IZ
1684 int i = 0;
1685 char *s = PerlEnv_getenv("PERL_RE_COLORS");
b81d288d 1686
d88dccdf
IZ
1687 if (s) {
1688 PL_colors[0] = s = savepv(s);
1689 while (++i < 6) {
1690 s = strchr(s, '\t');
1691 if (s) {
1692 *s = '\0';
1693 PL_colors[i] = ++s;
1694 }
1695 else
c712d376 1696 PL_colors[i] = s = "";
d88dccdf
IZ
1697 }
1698 } else {
b81d288d 1699 while (i < 6)
d88dccdf
IZ
1700 PL_colors[i++] = "";
1701 }
1702 PL_colorset = 1;
1703}
1704
8615cb43 1705
a687059c 1706/*
e50aee73 1707 - pregcomp - compile a regular expression into internal code
a687059c
LW
1708 *
1709 * We can't allocate space until we know how big the compiled form will be,
1710 * but we can't compile it (and thus know how big it is) until we've got a
1711 * place to put the code. So we cheat: we compile it twice, once with code
1712 * generation turned off and size counting turned on, and once "for real".
1713 * This also means that we don't allocate space until we are sure that the
1714 * thing really will compile successfully, and we never have to move the
1715 * code and thus invalidate pointers into it. (Note that it has to be in
1716 * one piece because free() must be able to free it all.) [NB: not true in perl]
1717 *
1718 * Beware that the optimization-preparation code in here knows about some
1719 * of the structure of the compiled regexp. [I'll say.]
1720 */
1721regexp *
864dbfa3 1722Perl_pregcomp(pTHX_ char *exp, char *xend, PMOP *pm)
a687059c 1723{
a0d0e21e 1724 register regexp *r;
c277df42 1725 regnode *scan;
c277df42 1726 regnode *first;
a0d0e21e 1727 I32 flags;
a0d0e21e
LW
1728 I32 minlen = 0;
1729 I32 sawplus = 0;
1730 I32 sawopen = 0;
2c2d71f5 1731 scan_data_t data;
830247a4
IZ
1732 RExC_state_t RExC_state;
1733 RExC_state_t *pRExC_state = &RExC_state;
a0d0e21e
LW
1734
1735 if (exp == NULL)
c277df42 1736 FAIL("NULL regexp argument");
a0d0e21e 1737
a5961de5 1738 RExC_utf8 = pm->op_pmdynflags & PMdf_CMP_UTF8;
a0ed51b3 1739
5cfc7842 1740 RExC_precomp = exp;
a5961de5
JH
1741 DEBUG_r({
1742 if (!PL_colorset) reginitcolors();
1743 PerlIO_printf(Perl_debug_log, "%sCompiling REx%s `%s%*s%s'\n",
1744 PL_colors[4],PL_colors[5],PL_colors[0],
1745 (int)(xend - exp), RExC_precomp, PL_colors[1]);
1746 });
830247a4
IZ
1747 RExC_flags16 = pm->op_pmflags;
1748 RExC_sawback = 0;
bbce6d69 1749
830247a4
IZ
1750 RExC_seen = 0;
1751 RExC_seen_zerolen = *exp == '^' ? -1 : 0;
1752 RExC_seen_evals = 0;
1753 RExC_extralen = 0;
c277df42 1754
bbce6d69 1755 /* First pass: determine size, legality. */
830247a4 1756 RExC_parse = exp;
fac92740 1757 RExC_start = exp;
830247a4
IZ
1758 RExC_end = xend;
1759 RExC_naughty = 0;
1760 RExC_npar = 1;
1761 RExC_size = 0L;
1762 RExC_emit = &PL_regdummy;
1763 RExC_whilem_seen = 0;
85ddcde9
JH
1764#if 0 /* REGC() is (currently) a NOP at the first pass.
1765 * Clever compilers notice this and complain. --jhi */
830247a4 1766 REGC((U8)REG_MAGIC, (char*)RExC_emit);
85ddcde9 1767#endif
830247a4 1768 if (reg(pRExC_state, 0, &flags) == NULL) {
830247a4 1769 RExC_precomp = Nullch;
a0d0e21e
LW
1770 return(NULL);
1771 }
830247a4 1772 DEBUG_r(PerlIO_printf(Perl_debug_log, "size %"IVdf" ", (IV)RExC_size));
c277df42 1773
c277df42
IZ
1774 /* Small enough for pointer-storage convention?
1775 If extralen==0, this means that we will not need long jumps. */
830247a4
IZ
1776 if (RExC_size >= 0x10000L && RExC_extralen)
1777 RExC_size += RExC_extralen;
c277df42 1778 else
830247a4
IZ
1779 RExC_extralen = 0;
1780 if (RExC_whilem_seen > 15)
1781 RExC_whilem_seen = 15;
a0d0e21e 1782
bbce6d69 1783 /* Allocate space and initialize. */
830247a4 1784 Newc(1001, r, sizeof(regexp) + (unsigned)RExC_size * sizeof(regnode),
c277df42 1785 char, regexp);
a0d0e21e 1786 if (r == NULL)
b45f050a
JF
1787 FAIL("Regexp out of space");
1788
0f79a09d
GS
1789#ifdef DEBUGGING
1790 /* avoid reading uninitialized memory in DEBUGGING code in study_chunk() */
830247a4 1791 Zero(r, sizeof(regexp) + (unsigned)RExC_size * sizeof(regnode), char);
0f79a09d 1792#endif
c277df42 1793 r->refcnt = 1;
bbce6d69 1794 r->prelen = xend - exp;
5cfc7842 1795 r->precomp = savepvn(RExC_precomp, r->prelen);
cf93c79d
IZ
1796 r->subbeg = NULL;
1797 r->reganch = pm->op_pmflags & PMf_COMPILETIME;
830247a4 1798 r->nparens = RExC_npar - 1; /* set early to validate backrefs */
4327152a
IZ
1799
1800 r->substrs = 0; /* Useful during FAIL. */
1801 r->startp = 0; /* Useful during FAIL. */
1802 r->endp = 0; /* Useful during FAIL. */
1803
fac92740
MJD
1804 Newz(1304, r->offsets, 2*RExC_size+1, U32); /* MJD 20001228 */
1805 if (r->offsets) {
1806 r->offsets[0] = RExC_size;
1807 }
1808 DEBUG_r(PerlIO_printf(Perl_debug_log,
392fbf5d 1809 "%s %"UVuf" bytes for offset annotations.\n",
fac92740 1810 r->offsets ? "Got" : "Couldn't get",
392fbf5d 1811 (UV)((2*RExC_size+1) * sizeof(U32))));
fac92740 1812
830247a4 1813 RExC_rx = r;
bbce6d69 1814
1815 /* Second pass: emit code. */
055bb491 1816 RExC_flags16 = pm->op_pmflags; /* don't let top level (?i) bleed */
830247a4
IZ
1817 RExC_parse = exp;
1818 RExC_end = xend;
1819 RExC_naughty = 0;
1820 RExC_npar = 1;
fac92740 1821 RExC_emit_start = r->program;
830247a4 1822 RExC_emit = r->program;
2cd61cdb 1823 /* Store the count of eval-groups for security checks: */
830247a4
IZ
1824 RExC_emit->next_off = ((RExC_seen_evals > U16_MAX) ? U16_MAX : RExC_seen_evals);
1825 REGC((U8)REG_MAGIC, (char*) RExC_emit++);
c277df42 1826 r->data = 0;
830247a4 1827 if (reg(pRExC_state, 0, &flags) == NULL)
a0d0e21e
LW
1828 return(NULL);
1829
1830 /* Dig out information for optimizations. */
cf93c79d 1831 r->reganch = pm->op_pmflags & PMf_COMPILETIME; /* Again? */
830247a4 1832 pm->op_pmflags = RExC_flags16;
a0ed51b3 1833 if (UTF)
5ff6fc6d 1834 r->reganch |= ROPT_UTF8; /* Unicode in it? */
c277df42 1835 r->regstclass = NULL;
830247a4 1836 if (RExC_naughty >= 10) /* Probably an expensive pattern. */
a0ed51b3 1837 r->reganch |= ROPT_NAUGHTY;
c277df42 1838 scan = r->program + 1; /* First BRANCH. */
2779dcf1
IZ
1839
1840 /* XXXX To minimize changes to RE engine we always allocate
1841 3-units-long substrs field. */
1842 Newz(1004, r->substrs, 1, struct reg_substr_data);
1843
2c2d71f5 1844 StructCopy(&zero_scan_data, &data, scan_data_t);
653099ff 1845 /* XXXX Should not we check for something else? Usually it is OPEN1... */
c277df42 1846 if (OP(scan) != BRANCH) { /* Only one top-level choice. */
c277df42 1847 I32 fake;
c5254dd6 1848 STRLEN longest_float_length, longest_fixed_length;
653099ff
GS
1849 struct regnode_charclass_class ch_class;
1850 int stclass_flag;
cb434fcc 1851 I32 last_close = 0;
a0d0e21e
LW
1852
1853 first = scan;
c277df42 1854 /* Skip introductions and multiplicators >= 1. */
a0d0e21e 1855 while ((OP(first) == OPEN && (sawopen = 1)) ||
653099ff 1856 /* An OR of *one* alternative - should not happen now. */
a0d0e21e
LW
1857 (OP(first) == BRANCH && OP(regnext(first)) != BRANCH) ||
1858 (OP(first) == PLUS) ||
1859 (OP(first) == MINMOD) ||
653099ff 1860 /* An {n,m} with n>0 */
22c35a8c 1861 (PL_regkind[(U8)OP(first)] == CURLY && ARG1(first) > 0) ) {
a0d0e21e
LW
1862 if (OP(first) == PLUS)
1863 sawplus = 1;
1864 else
1865 first += regarglen[(U8)OP(first)];
1866 first = NEXTOPER(first);
a687059c
LW
1867 }
1868
a0d0e21e
LW
1869 /* Starting-point info. */
1870 again:
653099ff 1871 if (PL_regkind[(U8)OP(first)] == EXACT) {
1aa99e6b
IH
1872 if (OP(first) == EXACT)
1873 ; /* Empty, get anchored substr later. */
1874 else if ((OP(first) == EXACTF || OP(first) == EXACTFL))
b3c9acc1
IZ
1875 r->regstclass = first;
1876 }
653099ff 1877 else if (strchr((char*)PL_simple,OP(first)))
a0d0e21e 1878 r->regstclass = first;
22c35a8c
GS
1879 else if (PL_regkind[(U8)OP(first)] == BOUND ||
1880 PL_regkind[(U8)OP(first)] == NBOUND)
a0d0e21e 1881 r->regstclass = first;
22c35a8c 1882 else if (PL_regkind[(U8)OP(first)] == BOL) {
cad2e5aa
JH
1883 r->reganch |= (OP(first) == MBOL
1884 ? ROPT_ANCH_MBOL
1885 : (OP(first) == SBOL
1886 ? ROPT_ANCH_SBOL
1887 : ROPT_ANCH_BOL));
a0d0e21e 1888 first = NEXTOPER(first);
774d564b 1889 goto again;
1890 }
1891 else if (OP(first) == GPOS) {
1892 r->reganch |= ROPT_ANCH_GPOS;
1893 first = NEXTOPER(first);
1894 goto again;
a0d0e21e 1895 }
e09294f4 1896 else if (!sawopen && (OP(first) == STAR &&
22c35a8c 1897 PL_regkind[(U8)OP(NEXTOPER(first))] == REG_ANY) &&
a0d0e21e
LW
1898 !(r->reganch & ROPT_ANCH) )
1899 {
1900 /* turn .* into ^.* with an implied $*=1 */
cad2e5aa
JH
1901 int type = OP(NEXTOPER(first));
1902
ffc61ed2 1903 if (type == REG_ANY)
cad2e5aa
JH
1904 type = ROPT_ANCH_MBOL;
1905 else
1906 type = ROPT_ANCH_SBOL;
1907
1908 r->reganch |= type | ROPT_IMPLICIT;
a0d0e21e 1909 first = NEXTOPER(first);
774d564b 1910 goto again;
a0d0e21e 1911 }
b81d288d 1912 if (sawplus && (!sawopen || !RExC_sawback)
830247a4 1913 && !(RExC_seen & REG_SEEN_EVAL)) /* May examine pos and $& */
cad2e5aa
JH
1914 /* x+ must match at the 1st pos of run of x's */
1915 r->reganch |= ROPT_SKIP;
a0d0e21e 1916
c277df42 1917 /* Scan is after the zeroth branch, first is atomic matcher. */
b81d288d 1918 DEBUG_r(PerlIO_printf(Perl_debug_log, "first at %"IVdf"\n",
b900a521 1919 (IV)(first - scan + 1)));
a0d0e21e
LW
1920 /*
1921 * If there's something expensive in the r.e., find the
1922 * longest literal string that must appear and make it the
1923 * regmust. Resolve ties in favor of later strings, since
1924 * the regstart check works with the beginning of the r.e.
1925 * and avoiding duplication strengthens checking. Not a
1926 * strong reason, but sufficient in the absence of others.
1927 * [Now we resolve ties in favor of the earlier string if
c277df42 1928 * it happens that c_offset_min has been invalidated, since the
a0d0e21e
LW
1929 * earlier string may buy us something the later one won't.]
1930 */
a0d0e21e 1931 minlen = 0;
a687059c 1932
79cb57f6
GS
1933 data.longest_fixed = newSVpvn("",0);
1934 data.longest_float = newSVpvn("",0);
1935 data.last_found = newSVpvn("",0);
c277df42
IZ
1936 data.longest = &(data.longest_fixed);
1937 first = scan;
653099ff 1938 if (!r->regstclass) {
830247a4 1939 cl_init(pRExC_state, &ch_class);
653099ff
GS
1940 data.start_class = &ch_class;
1941 stclass_flag = SCF_DO_STCLASS_AND;
1942 } else /* XXXX Check for BOUND? */
1943 stclass_flag = 0;
cb434fcc 1944 data.last_closep = &last_close;
653099ff 1945
830247a4 1946 minlen = study_chunk(pRExC_state, &first, &fake, scan + RExC_size, /* Up to end */
e1901655 1947 &data, SCF_DO_SUBSTR | SCF_WHILEM_VISITED_POS | stclass_flag);
830247a4 1948 if ( RExC_npar == 1 && data.longest == &(data.longest_fixed)
b81d288d 1949 && data.last_start_min == 0 && data.last_end > 0
830247a4
IZ
1950 && !RExC_seen_zerolen
1951 && (!(RExC_seen & REG_SEEN_GPOS) || (r->reganch & ROPT_ANCH_GPOS)))
c277df42 1952 r->reganch |= ROPT_CHECK_ALL;
830247a4 1953 scan_commit(pRExC_state, &data);
c277df42
IZ
1954 SvREFCNT_dec(data.last_found);
1955
a0ed51b3 1956 longest_float_length = CHR_SVLEN(data.longest_float);
c5254dd6 1957 if (longest_float_length
c277df42
IZ
1958 || (data.flags & SF_FL_BEFORE_EOL
1959 && (!(data.flags & SF_FL_BEFORE_MEOL)
830247a4 1960 || (RExC_flags16 & PMf_MULTILINE)))) {
cf93c79d
IZ
1961 int t;
1962
a0ed51b3 1963 if (SvCUR(data.longest_fixed) /* ok to leave SvCUR */
aca2d497
IZ
1964 && data.offset_fixed == data.offset_float_min
1965 && SvCUR(data.longest_fixed) == SvCUR(data.longest_float))
1966 goto remove_float; /* As in (a)+. */
1967
33b8afdf
JH
1968 if (SvUTF8(data.longest_float)) {
1969 r->float_utf8 = data.longest_float;
1970 r->float_substr = Nullsv;
1971 } else {
1972 r->float_substr = data.longest_float;
1973 r->float_utf8 = Nullsv;
1974 }
c277df42
IZ
1975 r->float_min_offset = data.offset_float_min;
1976 r->float_max_offset = data.offset_float_max;
cf93c79d
IZ
1977 t = (data.flags & SF_FL_BEFORE_EOL /* Can't have SEOL and MULTI */
1978 && (!(data.flags & SF_FL_BEFORE_MEOL)
830247a4 1979 || (RExC_flags16 & PMf_MULTILINE)));
33b8afdf 1980 fbm_compile(data.longest_float, t ? FBMcf_TAIL : 0);
a0ed51b3
LW
1981 }
1982 else {
aca2d497 1983 remove_float:
33b8afdf 1984 r->float_substr = r->float_utf8 = Nullsv;
c277df42 1985 SvREFCNT_dec(data.longest_float);
c5254dd6 1986 longest_float_length = 0;
a0d0e21e 1987 }
c277df42 1988
a0ed51b3 1989 longest_fixed_length = CHR_SVLEN(data.longest_fixed);
c5254dd6 1990 if (longest_fixed_length
c277df42
IZ
1991 || (data.flags & SF_FIX_BEFORE_EOL /* Cannot have SEOL and MULTI */
1992 && (!(data.flags & SF_FIX_BEFORE_MEOL)
830247a4 1993 || (RExC_flags16 & PMf_MULTILINE)))) {
cf93c79d
IZ
1994 int t;
1995
33b8afdf
JH
1996 if (SvUTF8(data.longest_fixed)) {
1997 r->anchored_utf8 = data.longest_fixed;
1998 r->anchored_substr = Nullsv;
1999 } else {
2000 r->anchored_substr = data.longest_fixed;
2001 r->anchored_utf8 = Nullsv;
2002 }
c277df42 2003 r->anchored_offset = data.offset_fixed;
cf93c79d
IZ
2004 t = (data.flags & SF_FIX_BEFORE_EOL /* Can't have SEOL and MULTI */
2005 && (!(data.flags & SF_FIX_BEFORE_MEOL)
830247a4 2006 || (RExC_flags16 & PMf_MULTILINE)));
33b8afdf 2007 fbm_compile(data.longest_fixed, t ? FBMcf_TAIL : 0);
a0ed51b3
LW
2008 }
2009 else {
33b8afdf 2010 r->anchored_substr = r->anchored_utf8 = Nullsv;
c277df42 2011 SvREFCNT_dec(data.longest_fixed);
c5254dd6 2012 longest_fixed_length = 0;
a0d0e21e 2013 }
b81d288d 2014 if (r->regstclass
ffc61ed2 2015 && (OP(r->regstclass) == REG_ANY || OP(r->regstclass) == SANY))
653099ff 2016 r->regstclass = NULL;
33b8afdf
JH
2017 if ((!(r->anchored_substr || r->anchored_utf8) || r->anchored_offset)
2018 && stclass_flag
653099ff
GS
2019 && !(data.start_class->flags & ANYOF_EOS)
2020 && !cl_is_anything(data.start_class)) {
830247a4 2021 I32 n = add_data(pRExC_state, 1, "f");
653099ff 2022
b81d288d 2023 New(1006, RExC_rx->data->data[n], 1,
653099ff
GS
2024 struct regnode_charclass_class);
2025 StructCopy(data.start_class,
830247a4 2026 (struct regnode_charclass_class*)RExC_rx->data->data[n],
653099ff 2027 struct regnode_charclass_class);
830247a4 2028 r->regstclass = (regnode*)RExC_rx->data->data[n];
653099ff 2029 r->reganch &= ~ROPT_SKIP; /* Used in find_byclass(). */
ffc61ed2 2030 PL_regdata = r->data; /* for regprop() */
9c5ffd7c
JH
2031 DEBUG_r({ SV *sv = sv_newmortal();
2032 regprop(sv, (regnode*)data.start_class);
2033 PerlIO_printf(Perl_debug_log,
2034 "synthetic stclass `%s'.\n",
2035 SvPVX(sv));});
653099ff 2036 }
c277df42
IZ
2037
2038 /* A temporary algorithm prefers floated substr to fixed one to dig more info. */
c5254dd6 2039 if (longest_fixed_length > longest_float_length) {
c277df42 2040 r->check_substr = r->anchored_substr;
33b8afdf 2041 r->check_utf8 = r->anchored_utf8;
c277df42
IZ
2042 r->check_offset_min = r->check_offset_max = r->anchored_offset;
2043 if (r->reganch & ROPT_ANCH_SINGLE)
2044 r->reganch |= ROPT_NOSCAN;
a0ed51b3
LW
2045 }
2046 else {
c277df42 2047 r->check_substr = r->float_substr;
33b8afdf 2048 r->check_utf8 = r->float_utf8;
c277df42
IZ
2049 r->check_offset_min = data.offset_float_min;
2050 r->check_offset_max = data.offset_float_max;
a0d0e21e 2051 }
30382c73
IZ
2052 /* XXXX Currently intuiting is not compatible with ANCH_GPOS.
2053 This should be changed ASAP! */
33b8afdf 2054 if ((r->check_substr || r->check_utf8) && !(r->reganch & ROPT_ANCH_GPOS)) {
cad2e5aa 2055 r->reganch |= RE_USE_INTUIT;
33b8afdf 2056 if (SvTAIL(r->check_substr ? r->check_substr : r->check_utf8))
cad2e5aa
JH
2057 r->reganch |= RE_INTUIT_TAIL;
2058 }
a0ed51b3
LW
2059 }
2060 else {
c277df42
IZ
2061 /* Several toplevels. Best we can is to set minlen. */
2062 I32 fake;
653099ff 2063 struct regnode_charclass_class ch_class;
cb434fcc 2064 I32 last_close = 0;
c277df42
IZ
2065
2066 DEBUG_r(PerlIO_printf(Perl_debug_log, "\n"));
2067 scan = r->program + 1;
830247a4 2068 cl_init(pRExC_state, &ch_class);
653099ff 2069 data.start_class = &ch_class;
cb434fcc 2070 data.last_closep = &last_close;
e1901655 2071 minlen = study_chunk(pRExC_state, &scan, &fake, scan + RExC_size, &data, SCF_DO_STCLASS_AND|SCF_WHILEM_VISITED_POS);
33b8afdf
JH
2072 r->check_substr = r->check_utf8 = r->anchored_substr = r->anchored_utf8
2073 = r->float_substr = r->float_utf8 = Nullsv;
653099ff
GS
2074 if (!(data.start_class->flags & ANYOF_EOS)
2075 && !cl_is_anything(data.start_class)) {
830247a4 2076 I32 n = add_data(pRExC_state, 1, "f");
653099ff 2077
b81d288d 2078 New(1006, RExC_rx->data->data[n], 1,
653099ff
GS
2079 struct regnode_charclass_class);
2080 StructCopy(data.start_class,
830247a4 2081 (struct regnode_charclass_class*)RExC_rx->data->data[n],
653099ff 2082 struct regnode_charclass_class);
830247a4 2083 r->regstclass = (regnode*)RExC_rx->data->data[n];
653099ff 2084 r->reganch &= ~ROPT_SKIP; /* Used in find_byclass(). */
9c5ffd7c
JH
2085 DEBUG_r({ SV* sv = sv_newmortal();
2086 regprop(sv, (regnode*)data.start_class);
2087 PerlIO_printf(Perl_debug_log,
2088 "synthetic stclass `%s'.\n",
2089 SvPVX(sv));});
653099ff 2090 }
a0d0e21e
LW
2091 }
2092
a0d0e21e 2093 r->minlen = minlen;
b81d288d 2094 if (RExC_seen & REG_SEEN_GPOS)
c277df42 2095 r->reganch |= ROPT_GPOS_SEEN;
830247a4 2096 if (RExC_seen & REG_SEEN_LOOKBEHIND)
c277df42 2097 r->reganch |= ROPT_LOOKBEHIND_SEEN;
830247a4 2098 if (RExC_seen & REG_SEEN_EVAL)
ce862d02 2099 r->reganch |= ROPT_EVAL_SEEN;
f33976b4
DB
2100 if (RExC_seen & REG_SEEN_CANY)
2101 r->reganch |= ROPT_CANY_SEEN;
830247a4
IZ
2102 Newz(1002, r->startp, RExC_npar, I32);
2103 Newz(1002, r->endp, RExC_npar, I32);
ffc61ed2 2104 PL_regdata = r->data; /* for regprop() */
a0d0e21e
LW
2105 DEBUG_r(regdump(r));
2106 return(r);
a687059c
LW
2107}
2108
2109/*
2110 - reg - regular expression, i.e. main body or parenthesized thing
2111 *
2112 * Caller must absorb opening parenthesis.
2113 *
2114 * Combining parenthesis handling with the base level of regular expression
2115 * is a trifle forced, but the need to tie the tails of the branches to what
2116 * follows makes it hard to avoid.
2117 */
76e3520e 2118STATIC regnode *
830247a4 2119S_reg(pTHX_ RExC_state_t *pRExC_state, I32 paren, I32 *flagp)
c277df42 2120 /* paren: Parenthesized? 0=top, 1=(, inside: changed to letter. */
a687059c 2121{
c277df42
IZ
2122 register regnode *ret; /* Will be the head of the group. */
2123 register regnode *br;
2124 register regnode *lastbr;
2125 register regnode *ender = 0;
a0d0e21e 2126 register I32 parno = 0;
830247a4 2127 I32 flags, oregflags = RExC_flags16, have_branch = 0, open = 0;
9d1d55b5
JP
2128
2129 /* for (?g), (?gc), and (?o) warnings; warning
2130 about (?c) will warn about (?g) -- japhy */
2131
2132 I32 wastedflags = 0x00,
2133 wasted_o = 0x01,
2134 wasted_g = 0x02,
2135 wasted_gc = 0x02 | 0x04,
2136 wasted_c = 0x04;
2137
fac92740 2138 char * parse_start = RExC_parse; /* MJD */
830247a4 2139 char *oregcomp_parse = RExC_parse;
c277df42 2140 char c;
a0d0e21e 2141
821b33a5 2142 *flagp = 0; /* Tentatively. */
a0d0e21e 2143
9d1d55b5 2144
a0d0e21e
LW
2145 /* Make an OPEN node, if parenthesized. */
2146 if (paren) {
fac92740 2147 if (*RExC_parse == '?') { /* (?...) */
ca9dfc88
IZ
2148 U16 posflags = 0, negflags = 0;
2149 U16 *flagsp = &posflags;
0f5d15d6 2150 int logical = 0;
830247a4 2151 char *seqstart = RExC_parse;
ca9dfc88 2152
830247a4
IZ
2153 RExC_parse++;
2154 paren = *RExC_parse++;
c277df42 2155 ret = NULL; /* For look-ahead/behind. */
a0d0e21e 2156 switch (paren) {
fac92740 2157 case '<': /* (?<...) */
830247a4 2158 RExC_seen |= REG_SEEN_LOOKBEHIND;
b81d288d 2159 if (*RExC_parse == '!')
c277df42 2160 paren = ',';
b81d288d 2161 if (*RExC_parse != '=' && *RExC_parse != '!')
c277df42 2162 goto unknown;
830247a4 2163 RExC_parse++;
fac92740
MJD
2164 case '=': /* (?=...) */
2165 case '!': /* (?!...) */
830247a4 2166 RExC_seen_zerolen++;
fac92740
MJD
2167 case ':': /* (?:...) */
2168 case '>': /* (?>...) */
a0d0e21e 2169 break;
fac92740
MJD
2170 case '$': /* (?$...) */
2171 case '@': /* (?@...) */
8615cb43 2172 vFAIL2("Sequence (?%c...) not implemented", (int)paren);
a0d0e21e 2173 break;
fac92740 2174 case '#': /* (?#...) */
830247a4
IZ
2175 while (*RExC_parse && *RExC_parse != ')')
2176 RExC_parse++;
2177 if (*RExC_parse != ')')
c277df42 2178 FAIL("Sequence (?#... not terminated");
830247a4 2179 nextchar(pRExC_state);
a0d0e21e
LW
2180 *flagp = TRYAGAIN;
2181 return NULL;
fac92740 2182 case 'p': /* (?p...) */
9014280d 2183 if (SIZE_ONLY && ckWARN2(WARN_DEPRECATED, WARN_REGEXP))
f9373011 2184 vWARNdep(RExC_parse, "(?p{}) is deprecated - use (??{})");
8c8ad484 2185 /* FALL THROUGH*/
fac92740 2186 case '?': /* (??...) */
0f5d15d6 2187 logical = 1;
438a3801
YST
2188 if (*RExC_parse != '{')
2189 goto unknown;
830247a4 2190 paren = *RExC_parse++;
0f5d15d6 2191 /* FALL THROUGH */
fac92740 2192 case '{': /* (?{...}) */
c277df42 2193 {
c277df42
IZ
2194 I32 count = 1, n = 0;
2195 char c;
830247a4 2196 char *s = RExC_parse;
c277df42
IZ
2197 SV *sv;
2198 OP_4tree *sop, *rop;
2199
830247a4
IZ
2200 RExC_seen_zerolen++;
2201 RExC_seen |= REG_SEEN_EVAL;
2202 while (count && (c = *RExC_parse)) {
2203 if (c == '\\' && RExC_parse[1])
2204 RExC_parse++;
b81d288d 2205 else if (c == '{')
c277df42 2206 count++;
b81d288d 2207 else if (c == '}')
c277df42 2208 count--;
830247a4 2209 RExC_parse++;
c277df42 2210 }
830247a4 2211 if (*RExC_parse != ')')
b45f050a 2212 {
b81d288d 2213 RExC_parse = s;
b45f050a
JF
2214 vFAIL("Sequence (?{...}) not terminated or not {}-balanced");
2215 }
c277df42
IZ
2216 if (!SIZE_ONLY) {
2217 AV *av;
b81d288d
AB
2218
2219 if (RExC_parse - 1 - s)
830247a4 2220 sv = newSVpvn(s, RExC_parse - 1 - s);
c277df42 2221 else
79cb57f6 2222 sv = newSVpvn("", 0);
c277df42 2223
569233ed
SB
2224 ENTER;
2225 Perl_save_re_context(aTHX);
c277df42 2226 rop = sv_compile_2op(sv, &sop, "re", &av);
9b978d73
DM
2227 sop->op_private |= OPpREFCOUNTED;
2228 /* re_dup will OpREFCNT_inc */
2229 OpREFCNT_set(sop, 1);
569233ed 2230 LEAVE;
c277df42 2231
830247a4
IZ
2232 n = add_data(pRExC_state, 3, "nop");
2233 RExC_rx->data->data[n] = (void*)rop;
2234 RExC_rx->data->data[n+1] = (void*)sop;
2235 RExC_rx->data->data[n+2] = (void*)av;
c277df42 2236 SvREFCNT_dec(sv);
a0ed51b3 2237 }
e24b16f9 2238 else { /* First pass */
830247a4 2239 if (PL_reginterp_cnt < ++RExC_seen_evals
e24b16f9 2240 && PL_curcop != &PL_compiling)
2cd61cdb
IZ
2241 /* No compiled RE interpolated, has runtime
2242 components ===> unsafe. */
2243 FAIL("Eval-group not allowed at runtime, use re 'eval'");
5b61d3f7 2244 if (PL_tainting && PL_tainted)
cc6b7395 2245 FAIL("Eval-group in insecure regular expression");
c277df42
IZ
2246 }
2247
830247a4 2248 nextchar(pRExC_state);
0f5d15d6 2249 if (logical) {
830247a4 2250 ret = reg_node(pRExC_state, LOGICAL);
0f5d15d6
IZ
2251 if (!SIZE_ONLY)
2252 ret->flags = 2;
830247a4 2253 regtail(pRExC_state, ret, reganode(pRExC_state, EVAL, n));
fac92740 2254 /* deal with the length of this later - MJD */
0f5d15d6
IZ
2255 return ret;
2256 }
830247a4 2257 return reganode(pRExC_state, EVAL, n);
c277df42 2258 }
fac92740 2259 case '(': /* (?(?{...})...) and (?(?=...)...) */
c277df42 2260 {
fac92740 2261 if (RExC_parse[0] == '?') { /* (?(?...)) */
b81d288d
AB
2262 if (RExC_parse[1] == '=' || RExC_parse[1] == '!'
2263 || RExC_parse[1] == '<'
830247a4 2264 || RExC_parse[1] == '{') { /* Lookahead or eval. */
c277df42
IZ
2265 I32 flag;
2266
830247a4 2267 ret = reg_node(pRExC_state, LOGICAL);
0f5d15d6
IZ
2268 if (!SIZE_ONLY)
2269 ret->flags = 1;
830247a4 2270 regtail(pRExC_state, ret, reg(pRExC_state, 1, &flag));
c277df42 2271 goto insert_if;
b81d288d 2272 }
a0ed51b3 2273 }
830247a4 2274 else if (RExC_parse[0] >= '1' && RExC_parse[0] <= '9' ) {
fac92740 2275 /* (?(1)...) */
830247a4 2276 parno = atoi(RExC_parse++);
c277df42 2277
830247a4
IZ
2278 while (isDIGIT(*RExC_parse))
2279 RExC_parse++;
fac92740
MJD
2280 ret = reganode(pRExC_state, GROUPP, parno);
2281
830247a4 2282 if ((c = *nextchar(pRExC_state)) != ')')
b45f050a 2283 vFAIL("Switch condition not recognized");
c277df42 2284 insert_if:
830247a4
IZ
2285 regtail(pRExC_state, ret, reganode(pRExC_state, IFTHEN, 0));
2286 br = regbranch(pRExC_state, &flags, 1);
c277df42 2287 if (br == NULL)
830247a4 2288 br = reganode(pRExC_state, LONGJMP, 0);
c277df42 2289 else
830247a4
IZ
2290 regtail(pRExC_state, br, reganode(pRExC_state, LONGJMP, 0));
2291 c = *nextchar(pRExC_state);
d1b80229
IZ
2292 if (flags&HASWIDTH)
2293 *flagp |= HASWIDTH;
c277df42 2294 if (c == '|') {
830247a4
IZ
2295 lastbr = reganode(pRExC_state, IFTHEN, 0); /* Fake one for optimizer. */
2296 regbranch(pRExC_state, &flags, 1);
2297 regtail(pRExC_state, ret, lastbr);
d1b80229
IZ
2298 if (flags&HASWIDTH)
2299 *flagp |= HASWIDTH;
830247a4 2300 c = *nextchar(pRExC_state);
a0ed51b3
LW
2301 }
2302 else
c277df42
IZ
2303 lastbr = NULL;
2304 if (c != ')')
8615cb43 2305 vFAIL("Switch (?(condition)... contains too many branches");
830247a4
IZ
2306 ender = reg_node(pRExC_state, TAIL);
2307 regtail(pRExC_state, br, ender);
c277df42 2308 if (lastbr) {
830247a4
IZ
2309 regtail(pRExC_state, lastbr, ender);
2310 regtail(pRExC_state, NEXTOPER(NEXTOPER(lastbr)), ender);
a0ed51b3
LW
2311 }
2312 else
830247a4 2313 regtail(pRExC_state, ret, ender);
c277df42 2314 return ret;
a0ed51b3
LW
2315 }
2316 else {
830247a4 2317 vFAIL2("Unknown switch condition (?(%.2s", RExC_parse);
c277df42
IZ
2318 }
2319 }
1b1626e4 2320 case 0:
830247a4 2321 RExC_parse--; /* for vFAIL to print correctly */
8615cb43 2322 vFAIL("Sequence (? incomplete");
1b1626e4 2323 break;
a0d0e21e 2324 default:
830247a4 2325 --RExC_parse;
fac92740 2326 parse_flags: /* (?i) */
830247a4 2327 while (*RExC_parse && strchr("iogcmsx", *RExC_parse)) {
9d1d55b5
JP
2328 /* (?g), (?gc) and (?o) are useless here
2329 and must be globally applied -- japhy */
2330
2331 if (*RExC_parse == 'o' || *RExC_parse == 'g') {
2332 if (SIZE_ONLY && ckWARN(WARN_REGEXP)) {
2333 I32 wflagbit = *RExC_parse == 'o' ? wasted_o : wasted_g;
2334 if (! (wastedflags & wflagbit) ) {
2335 wastedflags |= wflagbit;
2336 vWARN5(
2337 RExC_parse + 1,
2338 "Useless (%s%c) - %suse /%c modifier",
2339 flagsp == &negflags ? "?-" : "?",
2340 *RExC_parse,
2341 flagsp == &negflags ? "don't " : "",
2342 *RExC_parse
2343 );
2344 }
2345 }
2346 }
2347 else if (*RExC_parse == 'c') {
2348 if (SIZE_ONLY && ckWARN(WARN_REGEXP)) {
2349 if (! (wastedflags & wasted_c) ) {
2350 wastedflags |= wasted_gc;
2351 vWARN3(
2352 RExC_parse + 1,
2353 "Useless (%sc) - %suse /gc modifier",
2354 flagsp == &negflags ? "?-" : "?",
2355 flagsp == &negflags ? "don't " : ""
2356 );
2357 }
2358 }
2359 }
2360 else { pmflag(flagsp, *RExC_parse); }
2361
830247a4 2362 ++RExC_parse;
ca9dfc88 2363 }
830247a4 2364 if (*RExC_parse == '-') {
ca9dfc88 2365 flagsp = &negflags;
9d1d55b5 2366 wastedflags = 0; /* reset so (?g-c) warns twice */
830247a4 2367 ++RExC_parse;
ca9dfc88 2368 goto parse_flags;
48c036b1 2369 }
830247a4
IZ
2370 RExC_flags16 |= posflags;
2371 RExC_flags16 &= ~negflags;
2372 if (*RExC_parse == ':') {
2373 RExC_parse++;
ca9dfc88
IZ
2374 paren = ':';
2375 break;
2376 }
c277df42 2377 unknown:
830247a4
IZ
2378 if (*RExC_parse != ')') {
2379 RExC_parse++;
2380 vFAIL3("Sequence (%.*s...) not recognized", RExC_parse-seqstart, seqstart);
b45f050a 2381 }
830247a4 2382 nextchar(pRExC_state);
a0d0e21e
LW
2383 *flagp = TRYAGAIN;
2384 return NULL;
2385 }
2386 }
fac92740 2387 else { /* (...) */
830247a4
IZ
2388 parno = RExC_npar;
2389 RExC_npar++;
2390 ret = reganode(pRExC_state, OPEN, parno);
fac92740
MJD
2391 Set_Node_Length(ret, 1); /* MJD */
2392 Set_Node_Offset(ret, RExC_parse); /* MJD */
c277df42 2393 open = 1;
a0d0e21e 2394 }
a0ed51b3 2395 }
fac92740 2396 else /* ! paren */
a0d0e21e
LW
2397 ret = NULL;
2398
2399 /* Pick up the branches, linking them together. */
fac92740 2400 parse_start = RExC_parse; /* MJD */
830247a4 2401 br = regbranch(pRExC_state, &flags, 1);
fac92740
MJD
2402 /* branch_len = (paren != 0); */
2403
a0d0e21e
LW
2404 if (br == NULL)
2405 return(NULL);
830247a4
IZ
2406 if (*RExC_parse == '|') {
2407 if (!SIZE_ONLY && RExC_extralen) {
2408 reginsert(pRExC_state, BRANCHJ, br);
a0ed51b3 2409 }
fac92740 2410 else { /* MJD */
830247a4 2411 reginsert(pRExC_state, BRANCH, br);
fac92740
MJD
2412 Set_Node_Length(br, paren != 0);
2413 Set_Node_Offset_To_R(br-RExC_emit_start, parse_start-RExC_start);
2414 }
c277df42
IZ
2415 have_branch = 1;
2416 if (SIZE_ONLY)
830247a4 2417 RExC_extralen += 1; /* For BRANCHJ-BRANCH. */
a0ed51b3
LW
2418 }
2419 else if (paren == ':') {
c277df42
IZ
2420 *flagp |= flags&SIMPLE;
2421 }
2422 if (open) { /* Starts with OPEN. */
830247a4 2423 regtail(pRExC_state, ret, br); /* OPEN -> first. */
a0ed51b3
LW
2424 }
2425 else if (paren != '?') /* Not Conditional */
a0d0e21e 2426 ret = br;
32a0ca98 2427 *flagp |= flags & (SPSTART | HASWIDTH);
c277df42 2428 lastbr = br;
830247a4
IZ
2429 while (*RExC_parse == '|') {
2430 if (!SIZE_ONLY && RExC_extralen) {
2431 ender = reganode(pRExC_state, LONGJMP,0);
2432 regtail(pRExC_state, NEXTOPER(NEXTOPER(lastbr)), ender); /* Append to the previous. */
c277df42
IZ
2433 }
2434 if (SIZE_ONLY)
830247a4
IZ
2435 RExC_extralen += 2; /* Account for LONGJMP. */
2436 nextchar(pRExC_state);
2437 br = regbranch(pRExC_state, &flags, 0);
fac92740 2438
a687059c 2439 if (br == NULL)
a0d0e21e 2440 return(NULL);
830247a4 2441 regtail(pRExC_state, lastbr, br); /* BRANCH -> BRANCH. */
c277df42 2442 lastbr = br;
821b33a5
IZ
2443 if (flags&HASWIDTH)
2444 *flagp |= HASWIDTH;
a687059c 2445 *flagp |= flags&SPSTART;
a0d0e21e
LW
2446 }
2447
c277df42
IZ
2448 if (have_branch || paren != ':') {
2449 /* Make a closing node, and hook it on the end. */
2450 switch (paren) {
2451 case ':':
830247a4 2452 ender = reg_node(pRExC_state, TAIL);
c277df42
IZ
2453 break;
2454 case 1:
830247a4 2455 ender = reganode(pRExC_state, CLOSE, parno);
fac92740
MJD
2456 Set_Node_Offset(ender,RExC_parse+1); /* MJD */
2457 Set_Node_Length(ender,1); /* MJD */
c277df42
IZ
2458 break;
2459 case '<':
c277df42
IZ
2460 case ',':
2461 case '=':
2462 case '!':
c277df42 2463 *flagp &= ~HASWIDTH;
821b33a5
IZ
2464 /* FALL THROUGH */
2465 case '>':
830247a4 2466 ender = reg_node(pRExC_state, SUCCEED);
c277df42
IZ
2467 break;
2468 case 0:
830247a4 2469 ender = reg_node(pRExC_state, END);
c277df42
IZ
2470 break;
2471 }
830247a4 2472 regtail(pRExC_state, lastbr, ender);
a0d0e21e 2473
c277df42
IZ
2474 if (have_branch) {
2475 /* Hook the tails of the branches to the closing node. */
2476 for (br = ret; br != NULL; br = regnext(br)) {
830247a4 2477 regoptail(pRExC_state, br, ender);
c277df42
IZ
2478 }
2479 }
a0d0e21e 2480 }
c277df42
IZ
2481
2482 {
2483 char *p;
2484 static char parens[] = "=!<,>";
2485
2486 if (paren && (p = strchr(parens, paren))) {
2487 int node = ((p - parens) % 2) ? UNLESSM : IFMATCH;
2488 int flag = (p - parens) > 1;
2489
2490 if (paren == '>')
2491 node = SUSPEND, flag = 0;
830247a4 2492 reginsert(pRExC_state, node,ret);
c277df42 2493 ret->flags = flag;
830247a4 2494 regtail(pRExC_state, ret, reg_node(pRExC_state, TAIL));
c277df42 2495 }
a0d0e21e
LW
2496 }
2497
2498 /* Check for proper termination. */
ce3e6498 2499 if (paren) {
830247a4
IZ
2500 RExC_flags16 = oregflags;
2501 if (RExC_parse >= RExC_end || *nextchar(pRExC_state) != ')') {
2502 RExC_parse = oregcomp_parse;
380a0633 2503 vFAIL("Unmatched (");
ce3e6498 2504 }
a0ed51b3 2505 }
830247a4
IZ
2506 else if (!paren && RExC_parse < RExC_end) {
2507 if (*RExC_parse == ')') {
2508 RExC_parse++;
380a0633 2509 vFAIL("Unmatched )");
a0ed51b3
LW
2510 }
2511 else
b45f050a 2512 FAIL("Junk on end of regexp"); /* "Can't happen". */
a0d0e21e
LW
2513 /* NOTREACHED */
2514 }
a687059c 2515
a0d0e21e 2516 return(ret);
a687059c
LW
2517}
2518
2519/*
2520 - regbranch - one alternative of an | operator
2521 *
2522 * Implements the concatenation operator.
2523 */
76e3520e 2524STATIC regnode *
830247a4 2525S_regbranch(pTHX_ RExC_state_t *pRExC_state, I32 *flagp, I32 first)
a687059c 2526{
c277df42
IZ
2527 register regnode *ret;
2528 register regnode *chain = NULL;
2529 register regnode *latest;
2530 I32 flags = 0, c = 0;
a0d0e21e 2531
b81d288d 2532 if (first)
c277df42
IZ
2533 ret = NULL;
2534 else {
b81d288d 2535 if (!SIZE_ONLY && RExC_extralen)
830247a4 2536 ret = reganode(pRExC_state, BRANCHJ,0);
fac92740 2537 else {
830247a4 2538 ret = reg_node(pRExC_state, BRANCH);
fac92740
MJD
2539 Set_Node_Length(ret, 1);
2540 }
c277df42
IZ
2541 }
2542
b81d288d 2543 if (!first && SIZE_ONLY)
830247a4 2544 RExC_extralen += 1; /* BRANCHJ */
b81d288d 2545
c277df42 2546 *flagp = WORST; /* Tentatively. */
a0d0e21e 2547
830247a4
IZ
2548 RExC_parse--;
2549 nextchar(pRExC_state);
2550 while (RExC_parse < RExC_end && *RExC_parse != '|' && *RExC_parse != ')') {
a0d0e21e 2551 flags &= ~TRYAGAIN;
830247a4 2552 latest = regpiece(pRExC_state, &flags);
a0d0e21e
LW
2553 if (latest == NULL) {
2554 if (flags & TRYAGAIN)
2555 continue;
2556 return(NULL);
a0ed51b3
LW
2557 }
2558 else if (ret == NULL)
c277df42 2559 ret = latest;
a0d0e21e 2560 *flagp |= flags&HASWIDTH;
c277df42 2561 if (chain == NULL) /* First piece. */
a0d0e21e
LW
2562 *flagp |= flags&SPSTART;
2563 else {
830247a4
IZ
2564 RExC_naughty++;
2565 regtail(pRExC_state, chain, latest);
a687059c 2566 }
a0d0e21e 2567 chain = latest;
c277df42
IZ
2568 c++;
2569 }
2570 if (chain == NULL) { /* Loop ran zero times. */
830247a4 2571 chain = reg_node(pRExC_state, NOTHING);
c277df42
IZ
2572 if (ret == NULL)
2573 ret = chain;
2574 }
2575 if (c == 1) {
2576 *flagp |= flags&SIMPLE;
a0d0e21e 2577 }
a687059c 2578
a0d0e21e 2579 return(ret);
a687059c
LW
2580}
2581
2582/*
2583 - regpiece - something followed by possible [*+?]
2584 *
2585 * Note that the branching code sequences used for ? and the general cases
2586 * of * and + are somewhat optimized: they use the same NOTHING node as
2587 * both the endmarker for their branch list and the body of the last branch.
2588 * It might seem that this node could be dispensed with entirely, but the
2589 * endmarker role is not redundant.
2590 */
76e3520e 2591STATIC regnode *
830247a4 2592S_regpiece(pTHX_ RExC_state_t *pRExC_state, I32 *flagp)
a687059c 2593{
c277df42 2594 register regnode *ret;
a0d0e21e
LW
2595 register char op;
2596 register char *next;
2597 I32 flags;
830247a4 2598 char *origparse = RExC_parse;
a0d0e21e
LW
2599 char *maxpos;
2600 I32 min;
c277df42 2601 I32 max = REG_INFTY;
fac92740 2602 char *parse_start;
a0d0e21e 2603
830247a4 2604 ret = regatom(pRExC_state, &flags);
a0d0e21e
LW
2605 if (ret == NULL) {
2606 if (flags & TRYAGAIN)
2607 *flagp |= TRYAGAIN;
2608 return(NULL);
2609 }
2610
830247a4 2611 op = *RExC_parse;
a0d0e21e 2612
830247a4 2613 if (op == '{' && regcurly(RExC_parse)) {
fac92740 2614 parse_start = RExC_parse; /* MJD */
830247a4 2615 next = RExC_parse + 1;
a0d0e21e
LW
2616 maxpos = Nullch;
2617 while (isDIGIT(*next) || *next == ',') {
2618 if (*next == ',') {
2619 if (maxpos)
2620 break;
2621 else
2622 maxpos = next;
a687059c 2623 }
a0d0e21e
LW
2624 next++;
2625 }
2626 if (*next == '}') { /* got one */
2627 if (!maxpos)
2628 maxpos = next;
830247a4
IZ
2629 RExC_parse++;
2630 min = atoi(RExC_parse);
a0d0e21e
LW
2631 if (*maxpos == ',')
2632 maxpos++;
2633 else
830247a4 2634 maxpos = RExC_parse;
a0d0e21e
LW
2635 max = atoi(maxpos);
2636 if (!max && *maxpos != '0')
c277df42
IZ
2637 max = REG_INFTY; /* meaning "infinity" */
2638 else if (max >= REG_INFTY)
8615cb43 2639 vFAIL2("Quantifier in {,} bigger than %d", REG_INFTY - 1);
830247a4
IZ
2640 RExC_parse = next;
2641 nextchar(pRExC_state);
a0d0e21e
LW
2642
2643 do_curly:
2644 if ((flags&SIMPLE)) {
830247a4
IZ
2645 RExC_naughty += 2 + RExC_naughty / 2;
2646 reginsert(pRExC_state, CURLY, ret);
fac92740
MJD
2647 Set_Node_Offset(ret, parse_start+1); /* MJD */
2648 Set_Node_Cur_Length(ret);
a0d0e21e
LW
2649 }
2650 else {
830247a4 2651 regnode *w = reg_node(pRExC_state, WHILEM);
2c2d71f5
JH
2652
2653 w->flags = 0;
830247a4
IZ
2654 regtail(pRExC_state, ret, w);
2655 if (!SIZE_ONLY && RExC_extralen) {
2656 reginsert(pRExC_state, LONGJMP,ret);
2657 reginsert(pRExC_state, NOTHING,ret);
c277df42
IZ
2658 NEXT_OFF(ret) = 3; /* Go over LONGJMP. */
2659 }
830247a4 2660 reginsert(pRExC_state, CURLYX,ret);
fac92740
MJD
2661 /* MJD hk */
2662 Set_Node_Offset(ret, parse_start+1);
2663 Set_Node_Length(ret,
2664 op == '{' ? (RExC_parse - parse_start) : 1);
2665
830247a4 2666 if (!SIZE_ONLY && RExC_extralen)
c277df42 2667 NEXT_OFF(ret) = 3; /* Go over NOTHING to LONGJMP. */
830247a4 2668 regtail(pRExC_state, ret, reg_node(pRExC_state, NOTHING));
c277df42 2669 if (SIZE_ONLY)
830247a4
IZ
2670 RExC_whilem_seen++, RExC_extralen += 3;
2671 RExC_naughty += 4 + RExC_naughty; /* compound interest */
a0d0e21e 2672 }
c277df42 2673 ret->flags = 0;
a0d0e21e
LW
2674
2675 if (min > 0)
821b33a5
IZ
2676 *flagp = WORST;
2677 if (max > 0)
2678 *flagp |= HASWIDTH;
a0d0e21e 2679 if (max && max < min)
8615cb43 2680 vFAIL("Can't do {n,m} with n > m");
c277df42
IZ
2681 if (!SIZE_ONLY) {
2682 ARG1_SET(ret, min);
2683 ARG2_SET(ret, max);
a687059c 2684 }
a687059c 2685
a0d0e21e 2686 goto nest_check;
a687059c 2687 }
a0d0e21e 2688 }
a687059c 2689
a0d0e21e
LW
2690 if (!ISMULT1(op)) {
2691 *flagp = flags;
a687059c 2692 return(ret);
a0d0e21e 2693 }
bb20fd44 2694
c277df42 2695#if 0 /* Now runtime fix should be reliable. */
b45f050a
JF
2696
2697 /* if this is reinstated, don't forget to put this back into perldiag:
2698
2699 =item Regexp *+ operand could be empty at {#} in regex m/%s/
2700
2701 (F) The part of the regexp subject to either the * or + quantifier
2702 could match an empty string. The {#} shows in the regular
2703 expression about where the problem was discovered.
2704
2705 */
2706
bb20fd44 2707 if (!(flags&HASWIDTH) && op != '?')
b45f050a 2708 vFAIL("Regexp *+ operand could be empty");
b81d288d 2709#endif
bb20fd44 2710
fac92740 2711 parse_start = RExC_parse;
830247a4 2712 nextchar(pRExC_state);
a0d0e21e 2713
821b33a5 2714 *flagp = (op != '+') ? (WORST|SPSTART|HASWIDTH) : (WORST|HASWIDTH);
a0d0e21e
LW
2715
2716 if (op == '*' && (flags&SIMPLE)) {
830247a4 2717 reginsert(pRExC_state, STAR, ret);
c277df42 2718 ret->flags = 0;
830247a4 2719 RExC_naughty += 4;
a0d0e21e
LW
2720 }
2721 else if (op == '*') {
2722 min = 0;
2723 goto do_curly;
a0ed51b3
LW
2724 }
2725 else if (op == '+' && (flags&SIMPLE)) {
830247a4 2726 reginsert(pRExC_state, PLUS, ret);
c277df42 2727 ret->flags = 0;
830247a4 2728 RExC_naughty += 3;
a0d0e21e
LW
2729 }
2730 else if (op == '+') {
2731 min = 1;
2732 goto do_curly;
a0ed51b3
LW
2733 }
2734 else if (op == '?') {
a0d0e21e
LW
2735 min = 0; max = 1;
2736 goto do_curly;
2737 }
2738 nest_check:
e476b1b5 2739 if (ckWARN(WARN_REGEXP) && !SIZE_ONLY && !(flags&HASWIDTH) && max > REG_INFTY/3) {
830247a4 2740 vWARN3(RExC_parse,
b45f050a 2741 "%.*s matches null string many times",
830247a4 2742 RExC_parse - origparse,
b45f050a 2743 origparse);
a0d0e21e
LW
2744 }
2745
830247a4
IZ
2746 if (*RExC_parse == '?') {
2747 nextchar(pRExC_state);
2748 reginsert(pRExC_state, MINMOD, ret);
2749 regtail(pRExC_state, ret, ret + NODE_STEP_REGNODE);
a0d0e21e 2750 }
830247a4
IZ
2751 if (ISMULT2(RExC_parse)) {
2752 RExC_parse++;
b45f050a
JF
2753 vFAIL("Nested quantifiers");
2754 }
a0d0e21e
LW
2755
2756 return(ret);
a687059c
LW
2757}
2758
2759/*
2760 - regatom - the lowest level
2761 *
2762 * Optimization: gobbles an entire sequence of ordinary characters so that
2763 * it can turn them into a single node, which is smaller to store and
2764 * faster to run. Backslashed characters are exceptions, each becoming a
2765 * separate node; the code is simpler that way and it's not worth fixing.
2766 *
b45f050a 2767 * [Yes, it is worth fixing, some scripts can run twice the speed.] */
76e3520e 2768STATIC regnode *
830247a4 2769S_regatom(pTHX_ RExC_state_t *pRExC_state, I32 *flagp)
a687059c 2770{
c277df42 2771 register regnode *ret = 0;
a0d0e21e 2772 I32 flags;
f06dbbb7 2773 char *parse_start = 0;
a0d0e21e
LW
2774
2775 *flagp = WORST; /* Tentatively. */
2776
2777tryagain:
830247a4 2778 switch (*RExC_parse) {
a0d0e21e 2779 case '^':
830247a4
IZ
2780 RExC_seen_zerolen++;
2781 nextchar(pRExC_state);
2782 if (RExC_flags16 & PMf_MULTILINE)
2783 ret = reg_node(pRExC_state, MBOL);
2784 else if (RExC_flags16 & PMf_SINGLELINE)
2785 ret = reg_node(pRExC_state, SBOL);
a0d0e21e 2786 else
830247a4 2787 ret = reg_node(pRExC_state, BOL);
fac92740 2788 Set_Node_Length(ret, 1); /* MJD */
a0d0e21e
LW
2789 break;
2790 case '$':
830247a4 2791 nextchar(pRExC_state);
b81d288d 2792 if (*RExC_parse)
830247a4
IZ
2793 RExC_seen_zerolen++;
2794 if (RExC_flags16 & PMf_MULTILINE)
2795 ret = reg_node(pRExC_state, MEOL);
2796 else if (RExC_flags16 & PMf_SINGLELINE)
2797 ret = reg_node(pRExC_state, SEOL);
a0d0e21e 2798 else
830247a4 2799 ret = reg_node(pRExC_state, EOL);
fac92740 2800 Set_Node_Length(ret, 1); /* MJD */
a0d0e21e
LW
2801 break;
2802 case '.':
830247a4 2803 nextchar(pRExC_state);
cce850e4 2804 if (RExC_flags16 & PMf_SINGLELINE)
ffc61ed2
JH
2805 ret = reg_node(pRExC_state, SANY);
2806 else
2807 ret = reg_node(pRExC_state, REG_ANY);
2808 *flagp |= HASWIDTH|SIMPLE;
830247a4 2809 RExC_naughty++;
fac92740 2810 Set_Node_Length(ret, 1); /* MJD */
a0d0e21e
LW
2811 break;
2812 case '[':
b45f050a 2813 {
830247a4 2814 char *oregcomp_parse = ++RExC_parse;
ffc61ed2 2815 ret = regclass(pRExC_state);
830247a4
IZ
2816 if (*RExC_parse != ']') {
2817 RExC_parse = oregcomp_parse;
b45f050a
JF
2818 vFAIL("Unmatched [");
2819 }
830247a4 2820 nextchar(pRExC_state);
a0d0e21e 2821 *flagp |= HASWIDTH|SIMPLE;
fac92740 2822 Set_Node_Length(ret, RExC_parse - oregcomp_parse + 1); /* MJD */
a0d0e21e 2823 break;
b45f050a 2824 }
a0d0e21e 2825 case '(':
830247a4
IZ
2826 nextchar(pRExC_state);
2827 ret = reg(pRExC_state, 1, &flags);
a0d0e21e 2828 if (ret == NULL) {
bf93d4cc 2829 if (flags & TRYAGAIN) {
830247a4 2830 if (RExC_parse == RExC_end) {
bf93d4cc
GS
2831 /* Make parent create an empty node if needed. */
2832 *flagp |= TRYAGAIN;
2833 return(NULL);
2834 }
a0d0e21e 2835 goto tryagain;
bf93d4cc 2836 }
a0d0e21e
LW
2837 return(NULL);
2838 }
c277df42 2839 *flagp |= flags&(HASWIDTH|SPSTART|SIMPLE);
a0d0e21e
LW
2840 break;
2841 case '|':
2842 case ')':
2843 if (flags & TRYAGAIN) {
2844 *flagp |= TRYAGAIN;
2845 return NULL;
2846 }
b45f050a 2847 vFAIL("Internal urp");
a0d0e21e
LW
2848 /* Supposed to be caught earlier. */
2849 break;
85afd4ae 2850 case '{':
830247a4
IZ
2851 if (!regcurly(RExC_parse)) {
2852 RExC_parse++;
85afd4ae
CS
2853 goto defchar;
2854 }
2855 /* FALL THROUGH */
a0d0e21e
LW
2856 case '?':
2857 case '+':
2858 case '*':
830247a4 2859 RExC_parse++;
b45f050a 2860 vFAIL("Quantifier follows nothing");
a0d0e21e
LW
2861 break;
2862 case '\\':
830247a4 2863 switch (*++RExC_parse) {
a0d0e21e 2864 case 'A':
830247a4
IZ
2865 RExC_seen_zerolen++;
2866 ret = reg_node(pRExC_state, SBOL);
a0d0e21e 2867 *flagp |= SIMPLE;
830247a4 2868 nextchar(pRExC_state);
fac92740 2869 Set_Node_Length(ret, 2); /* MJD */
a0d0e21e
LW
2870 break;
2871 case 'G':
830247a4
IZ
2872 ret = reg_node(pRExC_state, GPOS);
2873 RExC_seen |= REG_SEEN_GPOS;
a0d0e21e 2874 *flagp |= SIMPLE;
830247a4 2875 nextchar(pRExC_state);
fac92740 2876 Set_Node_Length(ret, 2); /* MJD */
a0d0e21e
LW
2877 break;
2878 case 'Z':
830247a4 2879 ret = reg_node(pRExC_state, SEOL);
a0d0e21e 2880 *flagp |= SIMPLE;
a1917ab9 2881 RExC_seen_zerolen++; /* Do not optimize RE away */
830247a4 2882 nextchar(pRExC_state);
a0d0e21e 2883 break;
b85d18e9 2884 case 'z':
830247a4 2885 ret = reg_node(pRExC_state, EOS);
b85d18e9 2886 *flagp |= SIMPLE;
830247a4
IZ
2887 RExC_seen_zerolen++; /* Do not optimize RE away */
2888 nextchar(pRExC_state);
fac92740 2889 Set_Node_Length(ret, 2); /* MJD */
b85d18e9 2890 break;
4a2d328f 2891 case 'C':
f33976b4
DB
2892 ret = reg_node(pRExC_state, CANY);
2893 RExC_seen |= REG_SEEN_CANY;
a0ed51b3 2894 *flagp |= HASWIDTH|SIMPLE;
830247a4 2895 nextchar(pRExC_state);
fac92740 2896 Set_Node_Length(ret, 2); /* MJD */
a0ed51b3
LW
2897 break;
2898 case 'X':
830247a4 2899 ret = reg_node(pRExC_state, CLUMP);
a0ed51b3 2900 *flagp |= HASWIDTH;
830247a4 2901 nextchar(pRExC_state);
fac92740 2902 Set_Node_Length(ret, 2); /* MJD */
a0ed51b3 2903 break;
a0d0e21e 2904 case 'w':
ffc61ed2 2905 ret = reg_node(pRExC_state, LOC ? ALNUML : ALNUM);
a0d0e21e 2906 *flagp |= HASWIDTH|SIMPLE;
830247a4 2907 nextchar(pRExC_state);
fac92740 2908 Set_Node_Length(ret, 2); /* MJD */
a0d0e21e
LW
2909 break;
2910 case 'W':
ffc61ed2 2911 ret = reg_node(pRExC_state, LOC ? NALNUML : NALNUM);
a0d0e21e 2912 *flagp |= HASWIDTH|SIMPLE;
830247a4 2913 nextchar(pRExC_state);
fac92740 2914 Set_Node_Length(ret, 2); /* MJD */
a0d0e21e
LW
2915 break;
2916 case 'b':
830247a4
IZ
2917 RExC_seen_zerolen++;
2918 RExC_seen |= REG_SEEN_LOOKBEHIND;
ffc61ed2 2919 ret = reg_node(pRExC_state, LOC ? BOUNDL : BOUND);
a0d0e21e 2920 *flagp |= SIMPLE;
830247a4 2921 nextchar(pRExC_state);
fac92740 2922 Set_Node_Length(ret, 2); /* MJD */
a0d0e21e
LW
2923 break;
2924 case 'B':
830247a4
IZ
2925 RExC_seen_zerolen++;
2926 RExC_seen |= REG_SEEN_LOOKBEHIND;
ffc61ed2 2927 ret = reg_node(pRExC_state, LOC ? NBOUNDL : NBOUND);
a0d0e21e 2928 *flagp |= SIMPLE;
830247a4 2929 nextchar(pRExC_state);
fac92740 2930 Set_Node_Length(ret, 2); /* MJD */
a0d0e21e
LW
2931 break;
2932 case 's':
ffc61ed2 2933 ret = reg_node(pRExC_state, LOC ? SPACEL : SPACE);
a0d0e21e 2934 *flagp |= HASWIDTH|SIMPLE;
830247a4 2935 nextchar(pRExC_state);
fac92740 2936 Set_Node_Length(ret, 2); /* MJD */
a0d0e21e
LW
2937 break;
2938 case 'S':
ffc61ed2 2939 ret = reg_node(pRExC_state, LOC ? NSPACEL : NSPACE);
a0d0e21e 2940 *flagp |= HASWIDTH|SIMPLE;
830247a4 2941 nextchar(pRExC_state);
fac92740 2942 Set_Node_Length(ret, 2); /* MJD */
a0d0e21e
LW
2943 break;
2944 case 'd':
ffc61ed2 2945 ret = reg_node(pRExC_state, DIGIT);
a0d0e21e 2946 *flagp |= HASWIDTH|SIMPLE;
830247a4 2947 nextchar(pRExC_state);
fac92740 2948 Set_Node_Length(ret, 2); /* MJD */
a0d0e21e
LW
2949 break;
2950 case 'D':
ffc61ed2 2951 ret = reg_node(pRExC_state, NDIGIT);
a0d0e21e 2952 *flagp |= HASWIDTH|SIMPLE;
830247a4 2953 nextchar(pRExC_state);
fac92740 2954 Set_Node_Length(ret, 2); /* MJD */
a0d0e21e 2955 break;
a14b48bc
LW
2956 case 'p':
2957 case 'P':
3568d838 2958 {
830247a4 2959 char* oldregxend = RExC_end;
fac92740 2960 char* parse_start = RExC_parse;
a14b48bc 2961
830247a4 2962 if (RExC_parse[1] == '{') {
3568d838 2963 /* a lovely hack--pretend we saw [\pX] instead */
830247a4
IZ
2964 RExC_end = strchr(RExC_parse, '}');
2965 if (!RExC_end) {
0da60cf5 2966 U8 c = (U8)*RExC_parse;
830247a4
IZ
2967 RExC_parse += 2;
2968 RExC_end = oldregxend;
0da60cf5 2969 vFAIL2("Missing right brace on \\%c{}", c);
b45f050a 2970 }
830247a4 2971 RExC_end++;
a14b48bc 2972 }
af6f566e 2973 else {
830247a4 2974 RExC_end = RExC_parse + 2;
af6f566e
HS
2975 if (RExC_end > oldregxend)
2976 RExC_end = oldregxend;
2977 }
830247a4 2978 RExC_parse--;
a14b48bc 2979
ffc61ed2 2980 ret = regclass(pRExC_state);
a14b48bc 2981
830247a4
IZ
2982 RExC_end = oldregxend;
2983 RExC_parse--;
fac92740 2984 Set_Node_Cur_Length(ret); /* MJD */
830247a4 2985 nextchar(pRExC_state);
a14b48bc
LW
2986 *flagp |= HASWIDTH|SIMPLE;
2987 }
2988 break;
a0d0e21e
LW
2989 case 'n':
2990 case 'r':
2991 case 't':
2992 case 'f':
2993 case 'e':
2994 case 'a':
2995 case 'x':
2996 case 'c':
2997 case '0':
2998 goto defchar;
2999 case '1': case '2': case '3': case '4':
3000 case '5': case '6': case '7': case '8': case '9':
3001 {
830247a4 3002 I32 num = atoi(RExC_parse);
a0d0e21e 3003
830247a4 3004 if (num > 9 && num >= RExC_npar)
a0d0e21e
LW
3005 goto defchar;
3006 else {
fac92740 3007 char * parse_start = RExC_parse - 1; /* MJD */
830247a4
IZ
3008 while (isDIGIT(*RExC_parse))
3009 RExC_parse++;
b45f050a 3010
830247a4 3011 if (!SIZE_ONLY && num > RExC_rx->nparens)
9baa0206 3012 vFAIL("Reference to nonexistent group");
830247a4
IZ
3013 RExC_sawback = 1;
3014 ret = reganode(pRExC_state, FOLD
a0ed51b3 3015 ? (LOC ? REFFL : REFF)
c8756f30 3016 : REF, num);
a0d0e21e 3017 *flagp |= HASWIDTH;
fac92740
MJD
3018
3019 /* override incorrect value set in reganode MJD */
3020 Set_Node_Offset(ret, parse_start+1);
3021 Set_Node_Cur_Length(ret); /* MJD */
830247a4
IZ
3022 RExC_parse--;
3023 nextchar(pRExC_state);
a0d0e21e
LW
3024 }
3025 }
3026 break;
3027 case '\0':
830247a4 3028 if (RExC_parse >= RExC_end)
b45f050a 3029 FAIL("Trailing \\");
a0d0e21e
LW
3030 /* FALL THROUGH */
3031 default:
c9f97d15
IZ
3032 /* Do not generate `unrecognized' warnings here, we fall
3033 back into the quick-grab loop below */
a0d0e21e
LW
3034 goto defchar;
3035 }
3036 break;
4633a7c4
LW
3037
3038 case '#':
830247a4
IZ
3039 if (RExC_flags16 & PMf_EXTENDED) {
3040 while (RExC_parse < RExC_end && *RExC_parse != '\n') RExC_parse++;
3041 if (RExC_parse < RExC_end)
4633a7c4
LW
3042 goto tryagain;
3043 }
3044 /* FALL THROUGH */
3045
a0d0e21e 3046 default: {
ba210ebe 3047 register STRLEN len;
58ae7d3f 3048 register UV ender;
a0d0e21e 3049 register char *p;
c277df42 3050 char *oldp, *s;
ba210ebe 3051 STRLEN numlen;
80aecb99 3052 STRLEN foldlen;
60a8b682 3053 U8 tmpbuf[UTF8_MAXLEN_FOLD+1], *foldbuf;
f06dbbb7
JH
3054
3055 parse_start = RExC_parse - 1;
a0d0e21e 3056
830247a4 3057 RExC_parse++;
a0d0e21e
LW
3058
3059 defchar:
58ae7d3f 3060 ender = 0;
830247a4 3061 ret = reg_node(pRExC_state, FOLD
a0ed51b3 3062 ? (LOC ? EXACTFL : EXACTF)
bbce6d69 3063 : EXACT);
cd439c50 3064 s = STRING(ret);
830247a4
IZ
3065 for (len = 0, p = RExC_parse - 1;
3066 len < 127 && p < RExC_end;
a0d0e21e
LW
3067 len++)
3068 {
3069 oldp = p;
5b5a24f7 3070
830247a4
IZ
3071 if (RExC_flags16 & PMf_EXTENDED)
3072 p = regwhite(p, RExC_end);
a0d0e21e
LW
3073 switch (*p) {
3074 case '^':
3075 case '$':
3076 case '.':
3077 case '[':
3078 case '(':
3079 case ')':
3080 case '|':
3081 goto loopdone;
3082 case '\\':
3083 switch (*++p) {
3084 case 'A':
1ed8eac0
JF
3085 case 'C':
3086 case 'X':
a0d0e21e
LW
3087 case 'G':
3088 case 'Z':
b85d18e9 3089 case 'z':
a0d0e21e
LW
3090 case 'w':
3091 case 'W':
3092 case 'b':
3093 case 'B':
3094 case 's':
3095 case 'S':
3096 case 'd':
3097 case 'D':
a14b48bc
LW
3098 case 'p':
3099 case 'P':
a0d0e21e
LW
3100 --p;
3101 goto loopdone;
3102 case 'n':
3103 ender = '\n';
3104 p++;
a687059c 3105 break;
a0d0e21e
LW
3106 case 'r':
3107 ender = '\r';
3108 p++;
a687059c 3109 break;
a0d0e21e
LW
3110 case 't':
3111 ender = '\t';
3112 p++;
a687059c 3113 break;
a0d0e21e
LW
3114 case 'f':
3115 ender = '\f';
3116 p++;
a687059c 3117 break;
a0d0e21e 3118 case 'e':
c7f1f016 3119 ender = ASCII_TO_NATIVE('\033');
a0d0e21e 3120 p++;
a687059c 3121 break;
a0d0e21e 3122 case 'a':
c7f1f016 3123 ender = ASCII_TO_NATIVE('\007');
a0d0e21e 3124 p++;
a687059c 3125 break;
a0d0e21e 3126 case 'x':
a0ed51b3
LW
3127 if (*++p == '{') {
3128 char* e = strchr(p, '}');
b81d288d 3129
b45f050a 3130 if (!e) {
830247a4 3131 RExC_parse = p + 1;
b45f050a
JF
3132 vFAIL("Missing right brace on \\x{}");
3133 }
de5f0749 3134 else {
a4c04bdc
NC
3135 I32 flags = PERL_SCAN_ALLOW_UNDERSCORES
3136 | PERL_SCAN_DISALLOW_PREFIX;
53305cf1
NC
3137 numlen = e - p - 1;
3138 ender = grok_hex(p + 1, &numlen, &flags, NULL);
aaa80028
JH
3139 if (ender > 0xff)
3140 RExC_utf8 = 1;
b21ed0a9
GS
3141 /* numlen is generous */
3142 if (numlen + len >= 127) {
a0ed51b3
LW
3143 p--;
3144 goto loopdone;
3145 }
3146 p = e + 1;
3147 }
a0ed51b3
LW
3148 }
3149 else {
a4c04bdc 3150 I32 flags = PERL_SCAN_DISALLOW_PREFIX;
53305cf1
NC
3151 numlen = 2;
3152 ender = grok_hex(p, &numlen, &flags, NULL);
a0ed51b3
LW
3153 p += numlen;
3154 }
a687059c 3155 break;
a0d0e21e
LW
3156 case 'c':
3157 p++;
bbce6d69 3158 ender = UCHARAT(p++);
3159 ender = toCTRL(ender);
a687059c 3160 break;
a0d0e21e
LW
3161 case '0': case '1': case '2': case '3':case '4':
3162 case '5': case '6': case '7': case '8':case '9':
3163 if (*p == '0' ||
830247a4 3164 (isDIGIT(p[1]) && atoi(p) >= RExC_npar) ) {
53305cf1
NC
3165 I32 flags = 0;
3166 numlen = 3;
3167 ender = grok_oct(p, &numlen, &flags, NULL);
a0d0e21e
LW
3168 p += numlen;
3169 }
3170 else {
3171 --p;
3172 goto loopdone;
a687059c
LW
3173 }
3174 break;
a0d0e21e 3175 case '\0':
830247a4 3176 if (p >= RExC_end)
b45f050a 3177 FAIL("Trailing \\");
a687059c 3178 /* FALL THROUGH */
a0d0e21e 3179 default:
e476b1b5 3180 if (!SIZE_ONLY && ckWARN(WARN_REGEXP) && isALPHA(*p))
4193bef7 3181 vWARN2(p + 1, "Unrecognized escape \\%c passed through", UCHARAT(p));
a0ed51b3 3182 goto normal_default;
a0d0e21e
LW
3183 }
3184 break;
a687059c 3185 default:
a0ed51b3 3186 normal_default:
fd400ab9 3187 if (UTF8_IS_START(*p) && UTF) {
5e12f4fb 3188 ender = utf8n_to_uvchr((U8*)p, RExC_end - p,
ba210ebe 3189 &numlen, 0);
a0ed51b3
LW
3190 p += numlen;
3191 }
3192 else
3193 ender = *p++;
a0d0e21e 3194 break;
a687059c 3195 }
830247a4
IZ
3196 if (RExC_flags16 & PMf_EXTENDED)
3197 p = regwhite(p, RExC_end);
60a8b682
JH
3198 if (UTF && FOLD) {
3199 /* Prime the casefolded buffer. */
ac7e0132 3200 ender = toFOLD_uni(ender, tmpbuf, &foldlen);
60a8b682 3201 }
a0d0e21e
LW
3202 if (ISMULT2(p)) { /* Back off on ?+*. */
3203 if (len)
3204 p = oldp;
16ea2a2e 3205 else if (UTF) {
0ebc6274
JH
3206 STRLEN unilen;
3207
80aecb99 3208 if (FOLD) {
60a8b682 3209 /* Emit all the Unicode characters. */
80aecb99
JH
3210 for (foldbuf = tmpbuf;
3211 foldlen;
3212 foldlen -= numlen) {
3213 ender = utf8_to_uvchr(foldbuf, &numlen);
9dc45d57 3214 if (numlen > 0) {
0ebc6274
JH
3215 reguni(pRExC_state, ender, s, &unilen);
3216 s += unilen;
3217 len += unilen;
3218 /* In EBCDIC the numlen
3219 * and unilen can differ. */
9dc45d57 3220 foldbuf += numlen;
47654450
JH
3221 if (numlen >= foldlen)
3222 break;
9dc45d57
JH
3223 }
3224 else
3225 break; /* "Can't happen." */
80aecb99
JH
3226 }
3227 }
3228 else {
0ebc6274 3229 reguni(pRExC_state, ender, s, &unilen);
9ede7db1 3230 if (unilen > 0) {
0ebc6274
JH
3231 s += unilen;
3232 len += unilen;
9dc45d57 3233 }
80aecb99 3234 }
a0ed51b3 3235 }
a0d0e21e
LW
3236 else {
3237 len++;
cd439c50 3238 REGC(ender, s++);
a0d0e21e
LW
3239 }
3240 break;
a687059c 3241 }
16ea2a2e 3242 if (UTF) {
0ebc6274
JH
3243 STRLEN unilen;
3244
80aecb99 3245 if (FOLD) {
60a8b682 3246 /* Emit all the Unicode characters. */
80aecb99
JH
3247 for (foldbuf = tmpbuf;
3248 foldlen;
3249 foldlen -= numlen) {
3250 ender = utf8_to_uvchr(foldbuf, &numlen);
9dc45d57 3251 if (numlen > 0) {
0ebc6274
JH
3252 reguni(pRExC_state, ender, s, &unilen);
3253 len += unilen;
3254 s += unilen;
3255 /* In EBCDIC the numlen
3256 * and unilen can differ. */
9dc45d57 3257 foldbuf += numlen;
47654450
JH
3258 if (numlen >= foldlen)
3259 break;
9dc45d57
JH
3260 }
3261 else
3262 break;
80aecb99
JH
3263 }
3264 }
3265 else {
0ebc6274 3266 reguni(pRExC_state, ender, s, &unilen);
9ede7db1 3267 if (unilen > 0) {
0ebc6274
JH
3268 s += unilen;
3269 len += unilen;
9dc45d57 3270 }
80aecb99
JH
3271 }
3272 len--;
a0ed51b3
LW
3273 }
3274 else
cd439c50 3275 REGC(ender, s++);
a0d0e21e
LW
3276 }
3277 loopdone:
830247a4 3278 RExC_parse = p - 1;
fac92740 3279 Set_Node_Cur_Length(ret); /* MJD */
830247a4 3280 nextchar(pRExC_state);
793db0cb
JH
3281 {
3282 /* len is STRLEN which is unsigned, need to copy to signed */
3283 IV iv = len;
3284 if (iv < 0)
3285 vFAIL("Internal disaster");
3286 }
a0d0e21e
LW
3287 if (len > 0)
3288 *flagp |= HASWIDTH;
3289 if (len == 1)
3290 *flagp |= SIMPLE;
c277df42 3291 if (!SIZE_ONLY)
cd439c50
IZ
3292 STR_LEN(ret) = len;
3293 if (SIZE_ONLY)
830247a4 3294 RExC_size += STR_SZ(len);
cd439c50 3295 else
830247a4 3296 RExC_emit += STR_SZ(len);
a687059c 3297 }
a0d0e21e
LW
3298 break;
3299 }
a687059c 3300
60a8b682
JH
3301 /* If the encoding pragma is in effect recode the text of
3302 * any EXACT-kind nodes. */
22c54be3 3303 if (PL_encoding && PL_regkind[(U8)OP(ret)] == EXACT) {
a72c7584
JH
3304 STRLEN oldlen = STR_LEN(ret);
3305 SV *sv = sv_2mortal(newSVpvn(STRING(ret), oldlen));
22c54be3
JH
3306
3307 if (RExC_utf8)
3308 SvUTF8_on(sv);
3309 if (sv_utf8_downgrade(sv, TRUE)) {
799ef3cb 3310 char *s = sv_recode_to_utf8(sv, PL_encoding);
22c54be3
JH
3311 STRLEN newlen = SvCUR(sv);
3312
3313 if (!SIZE_ONLY) {
3314 DEBUG_r(PerlIO_printf(Perl_debug_log, "recode %*s to %*s\n",
3315 (int)oldlen, STRING(ret),
3316 (int)newlen, s));
3317 Copy(s, STRING(ret), newlen, char);
3318 STR_LEN(ret) += newlen - oldlen;
3319 RExC_emit += STR_SZ(newlen) - STR_SZ(oldlen);
3320 } else
3321 RExC_size += STR_SZ(newlen) - STR_SZ(oldlen);
3322 }
a72c7584
JH
3323 }
3324
a0d0e21e 3325 return(ret);
a687059c
LW
3326}
3327
873ef191 3328STATIC char *
cea2e8a9 3329S_regwhite(pTHX_ char *p, char *e)
5b5a24f7
CS
3330{
3331 while (p < e) {
3332 if (isSPACE(*p))
3333 ++p;
3334 else if (*p == '#') {
3335 do {
3336 p++;
3337 } while (p < e && *p != '\n');
3338 }
3339 else
3340 break;
3341 }
3342 return p;
3343}
3344
b8c5462f
JH
3345/* Parse POSIX character classes: [[:foo:]], [[=foo=]], [[.foo.]].
3346 Character classes ([:foo:]) can also be negated ([:^foo:]).
3347 Returns a named class id (ANYOF_XXX) if successful, -1 otherwise.
3348 Equivalence classes ([=foo=]) and composites ([.foo.]) are parsed,
beeb77fc 3349 but trigger failures because they are currently unimplemented. */
9a86a77b
JH
3350
3351#define POSIXCC_DONE(c) ((c) == ':')
3352#define POSIXCC_NOTYET(c) ((c) == '=' || (c) == '.')
3353#define POSIXCC(c) (POSIXCC_DONE(c) || POSIXCC_NOTYET(c))
3354
b8c5462f 3355STATIC I32
830247a4 3356S_regpposixcc(pTHX_ RExC_state_t *pRExC_state, I32 value)
620e46c5
JH
3357{
3358 char *posixcc = 0;
936ed897 3359 I32 namedclass = OOB_NAMEDCLASS;
620e46c5 3360
830247a4 3361 if (value == '[' && RExC_parse + 1 < RExC_end &&
620e46c5 3362 /* I smell either [: or [= or [. -- POSIX has been here, right? */
9a86a77b
JH
3363 POSIXCC(UCHARAT(RExC_parse))) {
3364 char c = UCHARAT(RExC_parse);
830247a4 3365 char* s = RExC_parse++;
b81d288d 3366
9a86a77b 3367 while (RExC_parse < RExC_end && UCHARAT(RExC_parse) != c)
830247a4
IZ
3368 RExC_parse++;
3369 if (RExC_parse == RExC_end)
620e46c5 3370 /* Grandfather lone [:, [=, [. */
830247a4 3371 RExC_parse = s;
620e46c5 3372 else {
830247a4 3373 char* t = RExC_parse++; /* skip over the c */
b8c5462f 3374
9a86a77b 3375 if (UCHARAT(RExC_parse) == ']') {
830247a4 3376 RExC_parse++; /* skip over the ending ] */
b8c5462f
JH
3377 posixcc = s + 1;
3378 if (*s == ':') {
3379 I32 complement = *posixcc == '^' ? *posixcc++ : 0;
3380 I32 skip = 5; /* the most common skip */
3381
3382 switch (*posixcc) {
3383 case 'a':
3384 if (strnEQ(posixcc, "alnum", 5))
3385 namedclass =
3386 complement ? ANYOF_NALNUMC : ANYOF_ALNUMC;
3387 else if (strnEQ(posixcc, "alpha", 5))
3388 namedclass =
3389 complement ? ANYOF_NALPHA : ANYOF_ALPHA;
3390 else if (strnEQ(posixcc, "ascii", 5))
3391 namedclass =
3392 complement ? ANYOF_NASCII : ANYOF_ASCII;
3393 break;
aaa51d5e
JF
3394 case 'b':
3395 if (strnEQ(posixcc, "blank", 5))
3396 namedclass =
3397 complement ? ANYOF_NBLANK : ANYOF_BLANK;
3398 break;
b8c5462f
JH
3399 case 'c':
3400 if (strnEQ(posixcc, "cntrl", 5))
3401 namedclass =
3402 complement ? ANYOF_NCNTRL : ANYOF_CNTRL;
3403 break;
3404 case 'd':
3405 if (strnEQ(posixcc, "digit", 5))
3406 namedclass =
3407 complement ? ANYOF_NDIGIT : ANYOF_DIGIT;
3408 break;
3409 case 'g':
3410 if (strnEQ(posixcc, "graph", 5))
3411 namedclass =
3412 complement ? ANYOF_NGRAPH : ANYOF_GRAPH;
3413 break;
3414 case 'l':
3415 if (strnEQ(posixcc, "lower", 5))
3416 namedclass =
3417 complement ? ANYOF_NLOWER : ANYOF_LOWER;
3418 break;
3419 case 'p':
3420 if (strnEQ(posixcc, "print", 5))
3421 namedclass =
3422 complement ? ANYOF_NPRINT : ANYOF_PRINT;
3423 else if (strnEQ(posixcc, "punct", 5))
3424 namedclass =
3425 complement ? ANYOF_NPUNCT : ANYOF_PUNCT;
3426 break;
3427 case 's':
3428 if (strnEQ(posixcc, "space", 5))
3429 namedclass =
aaa51d5e 3430 complement ? ANYOF_NPSXSPC : ANYOF_PSXSPC;
cc4319de 3431 break;
b8c5462f
JH
3432 case 'u':
3433 if (strnEQ(posixcc, "upper", 5))
3434 namedclass =
3435 complement ? ANYOF_NUPPER : ANYOF_UPPER;
3436 break;
3437 case 'w': /* this is not POSIX, this is the Perl \w */
3438 if (strnEQ(posixcc, "word", 4)) {
3439 namedclass =
3440 complement ? ANYOF_NALNUM : ANYOF_ALNUM;
3441 skip = 4;
3442 }
3443 break;
3444 case 'x':
3445 if (strnEQ(posixcc, "xdigit", 6)) {
3446 namedclass =
3447 complement ? ANYOF_NXDIGIT : ANYOF_XDIGIT;
3448 skip = 6;
3449 }
3450 break;
3451 }
ac561586
JH
3452 if (namedclass == OOB_NAMEDCLASS ||
3453 posixcc[skip] != ':' ||
3454 posixcc[skip+1] != ']')
b45f050a
JF
3455 {
3456 Simple_vFAIL3("POSIX class [:%.*s:] unknown",
3457 t - s - 1, s + 1);
3458 }
3459 } else if (!SIZE_ONLY) {
b8c5462f 3460 /* [[=foo=]] and [[.foo.]] are still future. */
b45f050a 3461
830247a4 3462 /* adjust RExC_parse so the warning shows after
b45f050a 3463 the class closes */
9a86a77b 3464 while (UCHARAT(RExC_parse) && UCHARAT(RExC_parse) != ']')
830247a4 3465 RExC_parse++;
b45f050a
JF
3466 Simple_vFAIL3("POSIX syntax [%c %c] is reserved for future extensions", c, c);
3467 }
b8c5462f
JH
3468 } else {
3469 /* Maternal grandfather:
3470 * "[:" ending in ":" but not in ":]" */
830247a4 3471 RExC_parse = s;
767d463e 3472 }
620e46c5
JH
3473 }
3474 }
3475
b8c5462f
JH
3476 return namedclass;
3477}
3478
3479STATIC void
830247a4 3480S_checkposixcc(pTHX_ RExC_state_t *pRExC_state)
b8c5462f 3481{
b938889d 3482 if (!SIZE_ONLY && POSIXCC(UCHARAT(RExC_parse))) {
830247a4 3483 char *s = RExC_parse;
93733859 3484 char c = *s++;
b8c5462f
JH
3485
3486 while(*s && isALNUM(*s))
3487 s++;
3488 if (*s && c == *s && s[1] == ']') {
cd84f5b2
RGS
3489 if (ckWARN(WARN_REGEXP))
3490 vWARN3(s+2,
3491 "POSIX syntax [%c %c] belongs inside character classes",
3492 c, c);
b45f050a
JF
3493
3494 /* [[=foo=]] and [[.foo.]] are still future. */
9a86a77b 3495 if (POSIXCC_NOTYET(c)) {
830247a4 3496 /* adjust RExC_parse so the error shows after
b45f050a 3497 the class closes */
9a86a77b 3498 while (UCHARAT(RExC_parse) && UCHARAT(RExC_parse++) != ']')
b45f050a
JF
3499 ;
3500 Simple_vFAIL3("POSIX syntax [%c %c] is reserved for future extensions", c, c);
3501 }
b8c5462f
JH
3502 }
3503 }
620e46c5
JH
3504}
3505
76e3520e 3506STATIC regnode *
830247a4 3507S_regclass(pTHX_ RExC_state_t *pRExC_state)
a687059c 3508{
ffc61ed2 3509 register UV value;
9a86a77b 3510 register UV nextvalue;
3568d838 3511 register IV prevvalue = OOB_UNICODE;
ffc61ed2 3512 register IV range = 0;
c277df42 3513 register regnode *ret;
ba210ebe 3514 STRLEN numlen;
ffc61ed2 3515 IV namedclass;
9c5ffd7c 3516 char *rangebegin = 0;
936ed897 3517 bool need_class = 0;
9c5ffd7c 3518 SV *listsv = Nullsv;
ffc61ed2
JH
3519 register char *e;
3520 UV n;
9e55ce06
JH
3521 bool optimize_invert = TRUE;
3522 AV* unicode_alternate = 0;
ffc61ed2
JH
3523
3524 ret = reganode(pRExC_state, ANYOF, 0);
3525
3526 if (!SIZE_ONLY)
3527 ANYOF_FLAGS(ret) = 0;
3528
9a86a77b 3529 if (UCHARAT(RExC_parse) == '^') { /* Complement of range. */
ffc61ed2
JH
3530 RExC_naughty++;
3531 RExC_parse++;
3532 if (!SIZE_ONLY)
3533 ANYOF_FLAGS(ret) |= ANYOF_INVERT;
3534 }
a0d0e21e 3535
936ed897 3536 if (SIZE_ONLY)
830247a4 3537 RExC_size += ANYOF_SKIP;
936ed897 3538 else {
830247a4 3539 RExC_emit += ANYOF_SKIP;
936ed897
IZ
3540 if (FOLD)
3541 ANYOF_FLAGS(ret) |= ANYOF_FOLD;
3542 if (LOC)
3543 ANYOF_FLAGS(ret) |= ANYOF_LOCALE;
ffc61ed2
JH
3544 ANYOF_BITMAP_ZERO(ret);
3545 listsv = newSVpvn("# comment\n", 10);
a0d0e21e 3546 }
b8c5462f 3547
9a86a77b
JH
3548 nextvalue = RExC_parse < RExC_end ? UCHARAT(RExC_parse) : 0;
3549
b938889d 3550 if (!SIZE_ONLY && POSIXCC(nextvalue))
830247a4 3551 checkposixcc(pRExC_state);
b8c5462f 3552
f064b6ad
HS
3553 /* allow 1st char to be ] (allowing it to be - is dealt with later) */
3554 if (UCHARAT(RExC_parse) == ']')
3555 goto charclassloop;
ffc61ed2 3556
9a86a77b 3557 while (RExC_parse < RExC_end && UCHARAT(RExC_parse) != ']') {
ffc61ed2
JH
3558
3559 charclassloop:
3560
3561 namedclass = OOB_NAMEDCLASS; /* initialize as illegal */
3562
73b437c8 3563 if (!range)
830247a4 3564 rangebegin = RExC_parse;
ffc61ed2 3565 if (UTF) {
5e12f4fb 3566 value = utf8n_to_uvchr((U8*)RExC_parse,
3568d838
JH
3567 RExC_end - RExC_parse,
3568 &numlen, 0);
ffc61ed2
JH
3569 RExC_parse += numlen;
3570 }
3571 else
3572 value = UCHARAT(RExC_parse++);
9a86a77b
JH
3573 nextvalue = RExC_parse < RExC_end ? UCHARAT(RExC_parse) : 0;
3574 if (value == '[' && POSIXCC(nextvalue))
830247a4 3575 namedclass = regpposixcc(pRExC_state, value);
620e46c5 3576 else if (value == '\\') {
ffc61ed2 3577 if (UTF) {
5e12f4fb 3578 value = utf8n_to_uvchr((U8*)RExC_parse,
ffc61ed2
JH
3579 RExC_end - RExC_parse,
3580 &numlen, 0);
3581 RExC_parse += numlen;
3582 }
3583 else
3584 value = UCHARAT(RExC_parse++);
470c3474 3585 /* Some compilers cannot handle switching on 64-bit integer
ffc61ed2 3586 * values, therefore value cannot be an UV. Yes, this will
e2962f66
JH
3587 * be a problem later if we want switch on Unicode.
3588 * A similar issue a little bit later when switching on
3589 * namedclass. --jhi */
ffc61ed2 3590 switch ((I32)value) {
b8c5462f
JH
3591 case 'w': namedclass = ANYOF_ALNUM; break;
3592 case 'W': namedclass = ANYOF_NALNUM; break;
3593 case 's': namedclass = ANYOF_SPACE; break;
3594 case 'S': namedclass = ANYOF_NSPACE; break;
3595 case 'd': namedclass = ANYOF_DIGIT; break;
3596 case 'D': namedclass = ANYOF_NDIGIT; break;
ffc61ed2
JH
3597 case 'p':
3598 case 'P':
af6f566e 3599 if (RExC_parse >= RExC_end)
2a4859cd 3600 vFAIL2("Empty \\%c{}", (U8)value);
ffc61ed2 3601 if (*RExC_parse == '{') {
0da60cf5 3602 U8 c = (U8)value;
ffc61ed2
JH
3603 e = strchr(RExC_parse++, '}');
3604 if (!e)
0da60cf5 3605 vFAIL2("Missing right brace on \\%c{}", c);
ab13f0c7
JH
3606 while (isSPACE(UCHARAT(RExC_parse)))
3607 RExC_parse++;
3608 if (e == RExC_parse)
0da60cf5 3609 vFAIL2("Empty \\%c{}", c);
ffc61ed2 3610 n = e - RExC_parse;
ab13f0c7
JH
3611 while (isSPACE(UCHARAT(RExC_parse + n - 1)))
3612 n--;
ffc61ed2
JH
3613 }
3614 else {
3615 e = RExC_parse;
3616 n = 1;
3617 }
3618 if (!SIZE_ONLY) {
ab13f0c7
JH
3619 if (UCHARAT(RExC_parse) == '^') {
3620 RExC_parse++;
3621 n--;
3622 value = value == 'p' ? 'P' : 'p'; /* toggle */
3623 while (isSPACE(UCHARAT(RExC_parse))) {
3624 RExC_parse++;
3625 n--;
3626 }
3627 }
ffc61ed2 3628 if (value == 'p')
ab13f0c7
JH
3629 Perl_sv_catpvf(aTHX_ listsv,
3630 "+utf8::%.*s\n", (int)n, RExC_parse);
ffc61ed2 3631 else
ab13f0c7
JH
3632 Perl_sv_catpvf(aTHX_ listsv,
3633 "!utf8::%.*s\n", (int)n, RExC_parse);
ffc61ed2
JH
3634 }
3635 RExC_parse = e + 1;
3636 ANYOF_FLAGS(ret) |= ANYOF_UNICODE;
3637 continue;
b8c5462f
JH
3638 case 'n': value = '\n'; break;
3639 case 'r': value = '\r'; break;
3640 case 't': value = '\t'; break;
3641 case 'f': value = '\f'; break;
3642 case 'b': value = '\b'; break;
c7f1f016
NIS
3643 case 'e': value = ASCII_TO_NATIVE('\033');break;
3644 case 'a': value = ASCII_TO_NATIVE('\007');break;
b8c5462f 3645 case 'x':
ffc61ed2 3646 if (*RExC_parse == '{') {
a4c04bdc
NC
3647 I32 flags = PERL_SCAN_ALLOW_UNDERSCORES
3648 | PERL_SCAN_DISALLOW_PREFIX;
ffc61ed2 3649 e = strchr(RExC_parse++, '}');
b81d288d 3650 if (!e)
ffc61ed2 3651 vFAIL("Missing right brace on \\x{}");
53305cf1
NC
3652
3653 numlen = e - RExC_parse;
3654 value = grok_hex(RExC_parse, &numlen, &flags, NULL);
ffc61ed2
JH
3655 RExC_parse = e + 1;
3656 }
3657 else {
a4c04bdc 3658 I32 flags = PERL_SCAN_DISALLOW_PREFIX;
53305cf1
NC
3659 numlen = 2;
3660 value = grok_hex(RExC_parse, &numlen, &flags, NULL);
ffc61ed2
JH
3661 RExC_parse += numlen;
3662 }
b8c5462f
JH
3663 break;
3664 case 'c':
830247a4 3665 value = UCHARAT(RExC_parse++);
b8c5462f
JH
3666 value = toCTRL(value);
3667 break;
3668 case '0': case '1': case '2': case '3': case '4':
3669 case '5': case '6': case '7': case '8': case '9':
53305cf1
NC
3670 {
3671 I32 flags = 0;
3672 numlen = 3;
3673 value = grok_oct(--RExC_parse, &numlen, &flags, NULL);
830247a4 3674 RExC_parse += numlen;
b8c5462f 3675 break;
53305cf1 3676 }
1028017a 3677 default:
e476b1b5 3678 if (!SIZE_ONLY && ckWARN(WARN_REGEXP) && isALPHA(value))
ffc61ed2
JH
3679 vWARN2(RExC_parse,
3680 "Unrecognized escape \\%c in character class passed through",
3681 (int)value);
1028017a 3682 break;
b8c5462f 3683 }
ffc61ed2
JH
3684 } /* end of \blah */
3685
3686 if (namedclass > OOB_NAMEDCLASS) { /* this is a named class \blah */
3687
3688 if (!SIZE_ONLY && !need_class)
936ed897 3689 ANYOF_CLASS_ZERO(ret);
ffc61ed2 3690
936ed897 3691 need_class = 1;
ffc61ed2
JH
3692
3693 /* a bad range like a-\d, a-[:digit:] ? */
3694 if (range) {
73b437c8 3695 if (!SIZE_ONLY) {
e476b1b5 3696 if (ckWARN(WARN_REGEXP))
830247a4 3697 vWARN4(RExC_parse,
b45f050a 3698 "False [] range \"%*.*s\"",
830247a4
IZ
3699 RExC_parse - rangebegin,
3700 RExC_parse - rangebegin,
b45f050a 3701 rangebegin);
3568d838
JH
3702 if (prevvalue < 256) {
3703 ANYOF_BITMAP_SET(ret, prevvalue);
ffc61ed2
JH
3704 ANYOF_BITMAP_SET(ret, '-');
3705 }
3706 else {
3707 ANYOF_FLAGS(ret) |= ANYOF_UNICODE;
3708 Perl_sv_catpvf(aTHX_ listsv,
3568d838 3709 "%04"UVxf"\n%04"UVxf"\n", (UV)prevvalue, (UV) '-');
ffc61ed2 3710 }
b8c5462f 3711 }
ffc61ed2
JH
3712
3713 range = 0; /* this was not a true range */
73b437c8 3714 }
ffc61ed2 3715
73b437c8 3716 if (!SIZE_ONLY) {
3568d838
JH
3717 if (namedclass > OOB_NAMEDCLASS)
3718 optimize_invert = FALSE;
e2962f66
JH
3719 /* Possible truncation here but in some 64-bit environments
3720 * the compiler gets heartburn about switch on 64-bit values.
3721 * A similar issue a little earlier when switching on value.
98f323fa 3722 * --jhi */
e2962f66 3723 switch ((I32)namedclass) {
73b437c8
JH
3724 case ANYOF_ALNUM:
3725 if (LOC)
936ed897 3726 ANYOF_CLASS_SET(ret, ANYOF_ALNUM);
73b437c8
JH
3727 else {
3728 for (value = 0; value < 256; value++)
3729 if (isALNUM(value))
936ed897 3730 ANYOF_BITMAP_SET(ret, value);
73b437c8 3731 }
ffc61ed2 3732 Perl_sv_catpvf(aTHX_ listsv, "+utf8::IsWord\n");
73b437c8
JH
3733 break;
3734 case ANYOF_NALNUM:
3735 if (LOC)
936ed897 3736 ANYOF_CLASS_SET(ret, ANYOF_NALNUM);
73b437c8
JH
3737 else {
3738 for (value = 0; value < 256; value++)
3739 if (!isALNUM(value))
936ed897 3740 ANYOF_BITMAP_SET(ret, value);
73b437c8 3741 }
ffc61ed2 3742 Perl_sv_catpvf(aTHX_ listsv, "!utf8::IsWord\n");
73b437c8 3743 break;
ffc61ed2 3744 case ANYOF_ALNUMC:
73b437c8 3745 if (LOC)
ffc61ed2 3746 ANYOF_CLASS_SET(ret, ANYOF_ALNUMC);
73b437c8
JH
3747 else {
3748 for (value = 0; value < 256; value++)
ffc61ed2 3749 if (isALNUMC(value))
936ed897 3750 ANYOF_BITMAP_SET(ret, value);
73b437c8 3751 }
ffc61ed2 3752 Perl_sv_catpvf(aTHX_ listsv, "+utf8::IsAlnum\n");
73b437c8
JH
3753 break;
3754 case ANYOF_NALNUMC:
3755 if (LOC)
936ed897 3756 ANYOF_CLASS_SET(ret, ANYOF_NALNUMC);
73b437c8
JH
3757 else {
3758 for (value = 0; value < 256; value++)
3759 if (!isALNUMC(value))
936ed897 3760 ANYOF_BITMAP_SET(ret, value);
73b437c8 3761 }
ffc61ed2 3762 Perl_sv_catpvf(aTHX_ listsv, "!utf8::IsAlnum\n");
73b437c8
JH
3763 break;
3764 case ANYOF_ALPHA:
3765 if (LOC)
936ed897 3766 ANYOF_CLASS_SET(ret, ANYOF_ALPHA);
73b437c8
JH
3767 else {
3768 for (value = 0; value < 256; value++)
3769 if (isALPHA(value))
936ed897 3770 ANYOF_BITMAP_SET(ret, value);
73b437c8 3771 }
ffc61ed2 3772 Perl_sv_catpvf(aTHX_ listsv, "+utf8::IsAlpha\n");
73b437c8
JH
3773 break;
3774 case ANYOF_NALPHA:
3775 if (LOC)
936ed897 3776 ANYOF_CLASS_SET(ret, ANYOF_NALPHA);
73b437c8
JH
3777 else {
3778 for (value = 0; value < 256; value++)
3779 if (!isALPHA(value))
936ed897 3780 ANYOF_BITMAP_SET(ret, value);
73b437c8 3781 }
ffc61ed2 3782 Perl_sv_catpvf(aTHX_ listsv, "!utf8::IsAlpha\n");
73b437c8
JH
3783 break;
3784 case ANYOF_ASCII:
3785 if (LOC)
936ed897 3786 ANYOF_CLASS_SET(ret, ANYOF_ASCII);
73b437c8 3787 else {
c7f1f016 3788#ifndef EBCDIC
1ba5c669
JH
3789 for (value = 0; value < 128; value++)
3790 ANYOF_BITMAP_SET(ret, value);
3791#else /* EBCDIC */
ffbc6a93 3792 for (value = 0; value < 256; value++) {
3a3c4447
JH
3793 if (isASCII(value))
3794 ANYOF_BITMAP_SET(ret, value);
ffbc6a93 3795 }
1ba5c669 3796#endif /* EBCDIC */
73b437c8 3797 }
ffc61ed2 3798 Perl_sv_catpvf(aTHX_ listsv, "+utf8::IsASCII\n");
73b437c8
JH
3799 break;
3800 case ANYOF_NASCII:
3801 if (LOC)
936ed897 3802 ANYOF_CLASS_SET(ret, ANYOF_NASCII);
73b437c8 3803 else {
c7f1f016 3804#ifndef EBCDIC
1ba5c669
JH
3805 for (value = 128; value < 256; value++)
3806 ANYOF_BITMAP_SET(ret, value);
3807#else /* EBCDIC */
ffbc6a93 3808 for (value = 0; value < 256; value++) {
3a3c4447
JH
3809 if (!isASCII(value))
3810 ANYOF_BITMAP_SET(ret, value);
ffbc6a93 3811 }
1ba5c669 3812#endif /* EBCDIC */
73b437c8 3813 }
ffc61ed2 3814 Perl_sv_catpvf(aTHX_ listsv, "!utf8::IsASCII\n");
73b437c8 3815 break;
aaa51d5e
JF
3816 case ANYOF_BLANK:
3817 if (LOC)
3818 ANYOF_CLASS_SET(ret, ANYOF_BLANK);
3819 else {
3820 for (value = 0; value < 256; value++)
3821 if (isBLANK(value))
3822 ANYOF_BITMAP_SET(ret, value);
3823 }
ffc61ed2 3824 Perl_sv_catpvf(aTHX_ listsv, "+utf8::IsBlank\n");
aaa51d5e
JF
3825 break;
3826 case ANYOF_NBLANK:
3827 if (LOC)
3828 ANYOF_CLASS_SET(ret, ANYOF_NBLANK);
3829 else {
3830 for (value = 0; value < 256; value++)
3831 if (!isBLANK(value))
3832 ANYOF_BITMAP_SET(ret, value);
3833 }
ffc61ed2 3834 Perl_sv_catpvf(aTHX_ listsv, "!utf8::IsBlank\n");
aaa51d5e 3835 break;
73b437c8
JH
3836 case ANYOF_CNTRL:
3837 if (LOC)
936ed897 3838 ANYOF_CLASS_SET(ret, ANYOF_CNTRL);
73b437c8
JH
3839 else {
3840 for (value = 0; value < 256; value++)
3841 if (isCNTRL(value))
936ed897 3842 ANYOF_BITMAP_SET(ret, value);
73b437c8 3843 }
ffc61ed2 3844 Perl_sv_catpvf(aTHX_ listsv, "+utf8::IsCntrl\n");
73b437c8
JH
3845 break;
3846 case ANYOF_NCNTRL:
3847 if (LOC)
936ed897 3848 ANYOF_CLASS_SET(ret, ANYOF_NCNTRL);
73b437c8
JH
3849 else {
3850 for (value = 0; value < 256; value++)
3851 if (!isCNTRL(value))
936ed897 3852 ANYOF_BITMAP_SET(ret, value);
73b437c8 3853 }
ffc61ed2
JH
3854 Perl_sv_catpvf(aTHX_ listsv, "!utf8::IsCntrl\n");
3855 break;
3856 case ANYOF_DIGIT:
3857 if (LOC)
3858 ANYOF_CLASS_SET(ret, ANYOF_DIGIT);
3859 else {
3860 /* consecutive digits assumed */
3861 for (value = '0'; value <= '9'; value++)
3862 ANYOF_BITMAP_SET(ret, value);
3863 }
3864 Perl_sv_catpvf(aTHX_ listsv, "+utf8::IsDigit\n");
3865 break;
3866 case ANYOF_NDIGIT:
3867 if (LOC)
3868 ANYOF_CLASS_SET(ret, ANYOF_NDIGIT);
3869 else {
3870 /* consecutive digits assumed */
3871 for (value = 0; value < '0'; value++)
3872 ANYOF_BITMAP_SET(ret, value);
3873 for (value = '9' + 1; value < 256; value++)
3874 ANYOF_BITMAP_SET(ret, value);
3875 }
3876 Perl_sv_catpvf(aTHX_ listsv, "!utf8::IsDigit\n");
73b437c8
JH
3877 break;
3878 case ANYOF_GRAPH:
3879 if (LOC)
936ed897 3880 ANYOF_CLASS_SET(ret, ANYOF_GRAPH);
73b437c8
JH
3881 else {
3882 for (value = 0; value < 256; value++)
3883 if (isGRAPH(value))
936ed897 3884 ANYOF_BITMAP_SET(ret, value);
73b437c8 3885 }
ffc61ed2 3886 Perl_sv_catpvf(aTHX_ listsv, "+utf8::IsGraph\n");
73b437c8
JH
3887 break;
3888 case ANYOF_NGRAPH:
3889 if (LOC)
936ed897 3890 ANYOF_CLASS_SET(ret, ANYOF_NGRAPH);
73b437c8
JH
3891 else {
3892 for (value = 0; value < 256; value++)
3893 if (!isGRAPH(value))
936ed897 3894 ANYOF_BITMAP_SET(ret, value);
73b437c8 3895 }
ffc61ed2 3896 Perl_sv_catpvf(aTHX_ listsv, "!utf8::IsGraph\n");
73b437c8
JH
3897 break;
3898 case ANYOF_LOWER:
3899 if (LOC)
936ed897 3900 ANYOF_CLASS_SET(ret, ANYOF_LOWER);
73b437c8
JH
3901 else {
3902 for (value = 0; value < 256; value++)
3903 if (isLOWER(value))
936ed897 3904 ANYOF_BITMAP_SET(ret, value);
73b437c8 3905 }
ffc61ed2 3906 Perl_sv_catpvf(aTHX_ listsv, "+utf8::IsLower\n");
73b437c8
JH
3907 break;
3908 case ANYOF_NLOWER:
3909 if (LOC)
936ed897 3910 ANYOF_CLASS_SET(ret, ANYOF_NLOWER);
73b437c8
JH
3911 else {
3912 for (value = 0; value < 256; value++)
3913 if (!isLOWER(value))
936ed897 3914 ANYOF_BITMAP_SET(ret, value);
73b437c8 3915 }
ffc61ed2 3916 Perl_sv_catpvf(aTHX_ listsv, "!utf8::IsLower\n");
73b437c8
JH
3917 break;
3918 case ANYOF_PRINT:
3919 if (LOC)
936ed897 3920 ANYOF_CLASS_SET(ret, ANYOF_PRINT);
73b437c8
JH
3921 else {
3922 for (value = 0; value < 256; value++)
3923 if (isPRINT(value))
936ed897 3924 ANYOF_BITMAP_SET(ret, value);
73b437c8 3925 }
ffc61ed2 3926 Perl_sv_catpvf(aTHX_ listsv, "+utf8::IsPrint\n");
73b437c8
JH
3927 break;
3928 case ANYOF_NPRINT:
3929 if (LOC)
936ed897 3930 ANYOF_CLASS_SET(ret, ANYOF_NPRINT);
73b437c8
JH
3931 else {
3932 for (value = 0; value < 256; value++)
3933 if (!isPRINT(value))
936ed897 3934 ANYOF_BITMAP_SET(ret, value);
73b437c8 3935 }
ffc61ed2 3936 Perl_sv_catpvf(aTHX_ listsv, "!utf8::IsPrint\n");
73b437c8 3937 break;
aaa51d5e
JF
3938 case ANYOF_PSXSPC:
3939 if (LOC)
3940 ANYOF_CLASS_SET(ret, ANYOF_PSXSPC);
3941 else {
3942 for (value = 0; value < 256; value++)
3943 if (isPSXSPC(value))
3944 ANYOF_BITMAP_SET(ret, value);
3945 }
ffc61ed2 3946 Perl_sv_catpvf(aTHX_ listsv, "+utf8::IsSpace\n");
aaa51d5e
JF
3947 break;
3948 case ANYOF_NPSXSPC:
3949 if (LOC)
3950 ANYOF_CLASS_SET(ret, ANYOF_NPSXSPC);
3951 else {
3952 for (value = 0; value < 256; value++)
3953 if (!isPSXSPC(value))
3954 ANYOF_BITMAP_SET(ret, value);
3955 }
ffc61ed2 3956 Perl_sv_catpvf(aTHX_ listsv, "!utf8::IsSpace\n");
aaa51d5e 3957 break;
73b437c8
JH
3958 case ANYOF_PUNCT:
3959 if (LOC)
936ed897 3960 ANYOF_CLASS_SET(ret, ANYOF_PUNCT);
73b437c8
JH
3961 else {
3962 for (value = 0; value < 256; value++)
3963 if (isPUNCT(value))
936ed897 3964 ANYOF_BITMAP_SET(ret, value);
73b437c8 3965 }
ffc61ed2 3966 Perl_sv_catpvf(aTHX_ listsv, "+utf8::IsPunct\n");
73b437c8
JH
3967 break;
3968 case ANYOF_NPUNCT:
3969 if (LOC)
936ed897 3970 ANYOF_CLASS_SET(ret, ANYOF_NPUNCT);
73b437c8
JH
3971 else {
3972 for (value = 0; value < 256; value++)
3973 if (!isPUNCT(value))
936ed897 3974 ANYOF_BITMAP_SET(ret, value);
73b437c8 3975 }
ffc61ed2
JH
3976 Perl_sv_catpvf(aTHX_ listsv, "!utf8::IsPunct\n");
3977 break;
3978 case ANYOF_SPACE:
3979 if (LOC)
3980 ANYOF_CLASS_SET(ret, ANYOF_SPACE);
3981 else {
3982 for (value = 0; value < 256; value++)
3983 if (isSPACE(value))
3984 ANYOF_BITMAP_SET(ret, value);
3985 }
3986 Perl_sv_catpvf(aTHX_ listsv, "+utf8::IsSpacePerl\n");
3987 break;
3988 case ANYOF_NSPACE:
3989 if (LOC)
3990 ANYOF_CLASS_SET(ret, ANYOF_NSPACE);
3991 else {
3992 for (value = 0; value < 256; value++)
3993 if (!isSPACE(value))
3994 ANYOF_BITMAP_SET(ret, value);
3995 }
3996 Perl_sv_catpvf(aTHX_ listsv, "!utf8::IsSpacePerl\n");
73b437c8
JH
3997 break;
3998 case ANYOF_UPPER:
3999 if (LOC)
936ed897 4000 ANYOF_CLASS_SET(ret, ANYOF_UPPER);
73b437c8
JH
4001 else {
4002 for (value = 0; value < 256; value++)
4003 if (isUPPER(value))
936ed897 4004 ANYOF_BITMAP_SET(ret, value);
73b437c8 4005 }
ffc61ed2 4006 Perl_sv_catpvf(aTHX_ listsv, "+utf8::IsUpper\n");
73b437c8
JH
4007 break;
4008 case ANYOF_NUPPER:
4009 if (LOC)
936ed897 4010 ANYOF_CLASS_SET(ret, ANYOF_NUPPER);
73b437c8
JH
4011 else {
4012 for (value = 0; value < 256; value++)
4013 if (!isUPPER(value))
936ed897 4014 ANYOF_BITMAP_SET(ret, value);
73b437c8 4015 }
ffc61ed2 4016 Perl_sv_catpvf(aTHX_ listsv, "!utf8::IsUpper\n");
73b437c8
JH
4017 break;
4018 case ANYOF_XDIGIT:
4019 if (LOC)
936ed897 4020 ANYOF_CLASS_SET(ret, ANYOF_XDIGIT);
73b437c8
JH
4021 else {
4022 for (value = 0; value < 256; value++)
4023 if (isXDIGIT(value))
936ed897 4024 ANYOF_BITMAP_SET(ret, value);
73b437c8 4025 }
ffc61ed2 4026 Perl_sv_catpvf(aTHX_ listsv, "+utf8::IsXDigit\n");
73b437c8
JH
4027 break;
4028 case ANYOF_NXDIGIT:
4029 if (LOC)
936ed897 4030 ANYOF_CLASS_SET(ret, ANYOF_NXDIGIT);
73b437c8
JH
4031 else {
4032 for (value = 0; value < 256; value++)
4033 if (!isXDIGIT(value))
936ed897 4034 ANYOF_BITMAP_SET(ret, value);
73b437c8 4035 }
ffc61ed2 4036 Perl_sv_catpvf(aTHX_ listsv, "!utf8::IsXDigit\n");
73b437c8
JH
4037 break;
4038 default:
b45f050a 4039 vFAIL("Invalid [::] class");
73b437c8 4040 break;
b8c5462f 4041 }
b8c5462f 4042 if (LOC)
936ed897 4043 ANYOF_FLAGS(ret) |= ANYOF_CLASS;
73b437c8 4044 continue;
a0d0e21e 4045 }
ffc61ed2
JH
4046 } /* end of namedclass \blah */
4047
a0d0e21e 4048 if (range) {
3a3c4447 4049 if (prevvalue > value) /* b-a */ {
b45f050a 4050 Simple_vFAIL4("Invalid [] range \"%*.*s\"",
830247a4
IZ
4051 RExC_parse - rangebegin,
4052 RExC_parse - rangebegin,
b45f050a 4053 rangebegin);
3568d838 4054 range = 0; /* not a valid range */
73b437c8 4055 }
a0d0e21e
LW
4056 }
4057 else {
3568d838 4058 prevvalue = value; /* save the beginning of the range */
830247a4
IZ
4059 if (*RExC_parse == '-' && RExC_parse+1 < RExC_end &&
4060 RExC_parse[1] != ']') {
4061 RExC_parse++;
ffc61ed2
JH
4062
4063 /* a bad range like \w-, [:word:]- ? */
4064 if (namedclass > OOB_NAMEDCLASS) {
e476b1b5 4065 if (ckWARN(WARN_REGEXP))
830247a4 4066 vWARN4(RExC_parse,
b45f050a 4067 "False [] range \"%*.*s\"",
830247a4
IZ
4068 RExC_parse - rangebegin,
4069 RExC_parse - rangebegin,
b45f050a 4070 rangebegin);
73b437c8 4071 if (!SIZE_ONLY)
936ed897 4072 ANYOF_BITMAP_SET(ret, '-');
73b437c8 4073 } else
ffc61ed2
JH
4074 range = 1; /* yeah, it's a range! */
4075 continue; /* but do it the next time */
a0d0e21e 4076 }
a687059c 4077 }
ffc61ed2 4078
93733859 4079 /* now is the next time */
ae5c130c 4080 if (!SIZE_ONLY) {
3568d838
JH
4081 IV i;
4082
4083 if (prevvalue < 256) {
4084 IV ceilvalue = value < 256 ? value : 255;
4085
4086#ifdef EBCDIC
3a3c4447
JH
4087 if ((isLOWER(prevvalue) && isLOWER(ceilvalue)) ||
4088 (isUPPER(prevvalue) && isUPPER(ceilvalue)))
ffc61ed2 4089 {
3568d838
JH
4090 if (isLOWER(prevvalue)) {
4091 for (i = prevvalue; i <= ceilvalue; i++)
ffc61ed2
JH
4092 if (isLOWER(i))
4093 ANYOF_BITMAP_SET(ret, i);
4094 } else {
3568d838 4095 for (i = prevvalue; i <= ceilvalue; i++)
ffc61ed2
JH
4096 if (isUPPER(i))
4097 ANYOF_BITMAP_SET(ret, i);
4098 }
8ada0baa 4099 }
ffc61ed2 4100 else
8ada0baa 4101#endif
a5961de5
JH
4102 for (i = prevvalue; i <= ceilvalue; i++)
4103 ANYOF_BITMAP_SET(ret, i);
3568d838 4104 }
a5961de5 4105 if (value > 255 || UTF) {
b08decb7
JH
4106 UV prevnatvalue = NATIVE_TO_UNI(prevvalue);
4107 UV natvalue = NATIVE_TO_UNI(value);
4108
ffc61ed2 4109 ANYOF_FLAGS(ret) |= ANYOF_UNICODE;
b08decb7 4110 if (prevnatvalue < natvalue) { /* what about > ? */
ffc61ed2 4111 Perl_sv_catpvf(aTHX_ listsv, "%04"UVxf"\t%04"UVxf"\n",
b08decb7
JH
4112 prevnatvalue, natvalue);
4113 }
4114 else if (prevnatvalue == natvalue) {
4115 Perl_sv_catpvf(aTHX_ listsv, "%04"UVxf"\n", natvalue);
09091399 4116 if (FOLD) {
254ba52a
JH
4117 U8 foldbuf[UTF8_MAXLEN_FOLD+1];
4118 STRLEN foldlen;
2f3bf011 4119 UV f = to_uni_fold(natvalue, foldbuf, &foldlen);
254ba52a 4120
c840d2a2
JH
4121 /* If folding and foldable and a single
4122 * character, insert also the folded version
4123 * to the charclass. */
9e55ce06
JH
4124 if (f != value) {
4125 if (foldlen == UNISKIP(f))
4126 Perl_sv_catpvf(aTHX_ listsv,
4127 "%04"UVxf"\n", f);
4128 else {
4129 /* Any multicharacter foldings
4130 * require the following transform:
4131 * [ABCDEF] -> (?:[ABCabcDEFd]|pq|rst)
4132 * where E folds into "pq" and F folds
4133 * into "rst", all other characters
4134 * fold to single characters. We save
4135 * away these multicharacter foldings,
4136 * to be later saved as part of the
4137 * additional "s" data. */
4138 SV *sv;
4139
4140 if (!unicode_alternate)
4141 unicode_alternate = newAV();
4142 sv = newSVpvn((char*)foldbuf, foldlen);
4143 SvUTF8_on(sv);
4144 av_push(unicode_alternate, sv);
4145 }
4146 }
254ba52a 4147
60a8b682
JH
4148 /* If folding and the value is one of the Greek
4149 * sigmas insert a few more sigmas to make the
4150 * folding rules of the sigmas to work right.
4151 * Note that not all the possible combinations
4152 * are handled here: some of them are handled
9e55ce06
JH
4153 * by the standard folding rules, and some of
4154 * them (literal or EXACTF cases) are handled
4155 * during runtime in regexec.c:S_find_byclass(). */
09091399
JH
4156 if (value == UNICODE_GREEK_SMALL_LETTER_FINAL_SIGMA) {
4157 Perl_sv_catpvf(aTHX_ listsv, "%04"UVxf"\n",
eb050b28 4158 (UV)UNICODE_GREEK_CAPITAL_LETTER_SIGMA);
09091399 4159 Perl_sv_catpvf(aTHX_ listsv, "%04"UVxf"\n",
eb050b28 4160 (UV)UNICODE_GREEK_SMALL_LETTER_SIGMA);
09091399
JH
4161 }
4162 else if (value == UNICODE_GREEK_CAPITAL_LETTER_SIGMA)
4163 Perl_sv_catpvf(aTHX_ listsv, "%04"UVxf"\n",
eb050b28 4164 (UV)UNICODE_GREEK_SMALL_LETTER_SIGMA);
09091399
JH
4165 }
4166 }
ffc61ed2 4167 }
8ada0baa 4168 }
ffc61ed2
JH
4169
4170 range = 0; /* this range (if it was one) is done now */
a0d0e21e 4171 }
ffc61ed2 4172
936ed897 4173 if (need_class) {
4f66b38d 4174 ANYOF_FLAGS(ret) |= ANYOF_LARGE;
936ed897 4175 if (SIZE_ONLY)
830247a4 4176 RExC_size += ANYOF_CLASS_ADD_SKIP;
936ed897 4177 else
830247a4 4178 RExC_emit += ANYOF_CLASS_ADD_SKIP;
936ed897 4179 }
ffc61ed2 4180
ae5c130c 4181 /* optimize case-insensitive simple patterns (e.g. /[a-z]/i) */
b8c5462f 4182 if (!SIZE_ONLY &&
ffc61ed2 4183 /* If the only flag is folding (plus possibly inversion). */
516a5887
JH
4184 ((ANYOF_FLAGS(ret) & (ANYOF_FLAGS_ALL ^ ANYOF_INVERT)) == ANYOF_FOLD)
4185 ) {
a0ed51b3 4186 for (value = 0; value < 256; ++value) {
936ed897 4187 if (ANYOF_BITMAP_TEST(ret, value)) {
ffc61ed2
JH
4188 IV fold = PL_fold[value];
4189
4190 if (fold != value)
4191 ANYOF_BITMAP_SET(ret, fold);
ae5c130c
GS
4192 }
4193 }
936ed897 4194 ANYOF_FLAGS(ret) &= ~ANYOF_FOLD;
ae5c130c 4195 }
ffc61ed2 4196
ae5c130c 4197 /* optimize inverted simple patterns (e.g. [^a-z]) */
3568d838 4198 if (!SIZE_ONLY && optimize_invert &&
ffc61ed2
JH
4199 /* If the only flag is inversion. */
4200 (ANYOF_FLAGS(ret) & ANYOF_FLAGS_ALL) == ANYOF_INVERT) {
b8c5462f 4201 for (value = 0; value < ANYOF_BITMAP_SIZE; ++value)
936ed897 4202 ANYOF_BITMAP(ret)[value] ^= ANYOF_FLAGS_ALL;
1aa99e6b 4203 ANYOF_FLAGS(ret) = ANYOF_UNICODE_ALL;
ae5c130c 4204 }
a0d0e21e 4205
b81d288d 4206 if (!SIZE_ONLY) {
fde631ed 4207 AV *av = newAV();
ffc61ed2
JH
4208 SV *rv;
4209
9e55ce06
JH
4210 /* The 0th element stores the character class description
4211 * in its textual form: used later (regexec.c:Perl_regclass_swatch())
4212 * to initialize the appropriate swash (which gets stored in
4213 * the 1st element), and also useful for dumping the regnode.
4214 * The 2nd element stores the multicharacter foldings,
4215 * used later (regexec.c:s_reginclasslen()). */
ffc61ed2
JH
4216 av_store(av, 0, listsv);
4217 av_store(av, 1, NULL);
9e55ce06 4218 av_store(av, 2, (SV*)unicode_alternate);
ffc61ed2 4219 rv = newRV_noinc((SV*)av);
19860706 4220 n = add_data(pRExC_state, 1, "s");
830247a4 4221 RExC_rx->data->data[n] = (void*)rv;
ffc61ed2 4222 ARG_SET(ret, n);
a0ed51b3
LW
4223 }
4224
4225 return ret;
4226}
4227
76e3520e 4228STATIC char*
830247a4 4229S_nextchar(pTHX_ RExC_state_t *pRExC_state)
a0d0e21e 4230{
830247a4 4231 char* retval = RExC_parse++;
a0d0e21e 4232
4633a7c4 4233 for (;;) {
830247a4
IZ
4234 if (*RExC_parse == '(' && RExC_parse[1] == '?' &&
4235 RExC_parse[2] == '#') {
4236 while (*RExC_parse && *RExC_parse != ')')
4237 RExC_parse++;
4238 RExC_parse++;
4633a7c4
LW
4239 continue;
4240 }
830247a4
IZ
4241 if (RExC_flags16 & PMf_EXTENDED) {
4242 if (isSPACE(*RExC_parse)) {
4243 RExC_parse++;
748a9306
LW
4244 continue;
4245 }
830247a4
IZ
4246 else if (*RExC_parse == '#') {
4247 while (*RExC_parse && *RExC_parse != '\n')
4248 RExC_parse++;
4249 RExC_parse++;
748a9306
LW
4250 continue;
4251 }
748a9306 4252 }
4633a7c4 4253 return retval;
a0d0e21e 4254 }
a687059c
LW
4255}
4256
4257/*
c277df42 4258- reg_node - emit a node
a0d0e21e 4259*/
76e3520e 4260STATIC regnode * /* Location. */
830247a4 4261S_reg_node(pTHX_ RExC_state_t *pRExC_state, U8 op)
a687059c 4262{
c277df42
IZ
4263 register regnode *ret;
4264 register regnode *ptr;
a687059c 4265
830247a4 4266 ret = RExC_emit;
c277df42 4267 if (SIZE_ONLY) {
830247a4
IZ
4268 SIZE_ALIGN(RExC_size);
4269 RExC_size += 1;
a0d0e21e
LW
4270 return(ret);
4271 }
a687059c 4272
c277df42 4273 NODE_ALIGN_FILL(ret);
a0d0e21e 4274 ptr = ret;
c277df42 4275 FILL_ADVANCE_NODE(ptr, op);
fac92740
MJD
4276 if (RExC_offsets) { /* MJD */
4277 MJD_OFFSET_DEBUG((stderr, "%s:%u: (op %s) %s %u <- %u (len %u) (max %u).\n",
4278 "reg_node", __LINE__,
4279 reg_name[op],
4280 RExC_emit - RExC_emit_start > RExC_offsets[0]
4281 ? "Overwriting end of array!\n" : "OK",
4282 RExC_emit - RExC_emit_start,
4283 RExC_parse - RExC_start,
4284 RExC_offsets[0]));
4285 Set_Node_Offset(RExC_emit, RExC_parse + (op == END));
4286 }
4287
830247a4 4288 RExC_emit = ptr;
a687059c 4289
a0d0e21e 4290 return(ret);
a687059c
LW
4291}
4292
4293/*
a0d0e21e
LW
4294- reganode - emit a node with an argument
4295*/
76e3520e 4296STATIC regnode * /* Location. */
830247a4 4297S_reganode(pTHX_ RExC_state_t *pRExC_state, U8 op, U32 arg)
fe14fcc3 4298{
c277df42
IZ
4299 register regnode *ret;
4300 register regnode *ptr;
fe14fcc3 4301
830247a4 4302 ret = RExC_emit;
c277df42 4303 if (SIZE_ONLY) {
830247a4
IZ
4304 SIZE_ALIGN(RExC_size);
4305 RExC_size += 2;
a0d0e21e
LW
4306 return(ret);
4307 }
fe14fcc3 4308
c277df42 4309 NODE_ALIGN_FILL(ret);
a0d0e21e 4310 ptr = ret;
c277df42 4311 FILL_ADVANCE_NODE_ARG(ptr, op, arg);
fac92740
MJD
4312 if (RExC_offsets) { /* MJD */
4313 MJD_OFFSET_DEBUG((stderr, "%s: %s %u <- %u (max %u).\n",
4314 "reganode",
4315 RExC_emit - RExC_emit_start > RExC_offsets[0] ?
4316 "Overwriting end of array!\n" : "OK",
4317 RExC_emit - RExC_emit_start,
4318 RExC_parse - RExC_start,
4319 RExC_offsets[0]));
4320 Set_Cur_Node_Offset;
4321 }
4322
830247a4 4323 RExC_emit = ptr;
fe14fcc3 4324
a0d0e21e 4325 return(ret);
fe14fcc3
LW
4326}
4327
4328/*
cd439c50 4329- reguni - emit (if appropriate) a Unicode character
a0ed51b3
LW
4330*/
4331STATIC void
830247a4 4332S_reguni(pTHX_ RExC_state_t *pRExC_state, UV uv, char* s, STRLEN* lenp)
a0ed51b3 4333{
5e12f4fb 4334 *lenp = SIZE_ONLY ? UNISKIP(uv) : (uvchr_to_utf8((U8*)s, uv) - (U8*)s);
a0ed51b3
LW
4335}
4336
4337/*
a0d0e21e
LW
4338- reginsert - insert an operator in front of already-emitted operand
4339*
4340* Means relocating the operand.
4341*/
76e3520e 4342STATIC void
830247a4 4343S_reginsert(pTHX_ RExC_state_t *pRExC_state, U8 op, regnode *opnd)
a687059c 4344{
c277df42
IZ
4345 register regnode *src;
4346 register regnode *dst;
4347 register regnode *place;
4348 register int offset = regarglen[(U8)op];
b81d288d 4349
22c35a8c 4350/* (PL_regkind[(U8)op] == CURLY ? EXTRA_STEP_2ARGS : 0); */
c277df42
IZ
4351
4352 if (SIZE_ONLY) {
830247a4 4353 RExC_size += NODE_STEP_REGNODE + offset;
a0d0e21e
LW
4354 return;
4355 }
a687059c 4356
830247a4
IZ
4357 src = RExC_emit;
4358 RExC_emit += NODE_STEP_REGNODE + offset;
4359 dst = RExC_emit;
fac92740 4360 while (src > opnd) {
c277df42 4361 StructCopy(--src, --dst, regnode);
fac92740
MJD
4362 if (RExC_offsets) { /* MJD 20010112 */
4363 MJD_OFFSET_DEBUG((stderr, "%s: %s copy %u -> %u (max %u).\n",
4364 "reg_insert",
4365 dst - RExC_emit_start > RExC_offsets[0]
4366 ? "Overwriting end of array!\n" : "OK",
4367 src - RExC_emit_start,
4368 dst - RExC_emit_start,
4369 RExC_offsets[0]));
4370 Set_Node_Offset_To_R(dst-RExC_emit_start, Node_Offset(src));
4371 Set_Node_Length_To_R(dst-RExC_emit_start, Node_Length(src));
4372 }
4373 }
4374
a0d0e21e
LW
4375
4376 place = opnd; /* Op node, where operand used to be. */
fac92740
MJD
4377 if (RExC_offsets) { /* MJD */
4378 MJD_OFFSET_DEBUG((stderr, "%s: %s %u <- %u (max %u).\n",
4379 "reginsert",
4380 place - RExC_emit_start > RExC_offsets[0]
4381 ? "Overwriting end of array!\n" : "OK",
4382 place - RExC_emit_start,
4383 RExC_parse - RExC_start,
4384 RExC_offsets[0]));
4385 Set_Node_Offset(place, RExC_parse);
4386 }
c277df42
IZ
4387 src = NEXTOPER(place);
4388 FILL_ADVANCE_NODE(place, op);
4389 Zero(src, offset, regnode);
a687059c
LW
4390}
4391
4392/*
c277df42 4393- regtail - set the next-pointer at the end of a node chain of p to val.
a0d0e21e 4394*/
76e3520e 4395STATIC void
830247a4 4396S_regtail(pTHX_ RExC_state_t *pRExC_state, regnode *p, regnode *val)
a687059c 4397{
c277df42
IZ
4398 register regnode *scan;
4399 register regnode *temp;
a0d0e21e 4400
c277df42 4401 if (SIZE_ONLY)
a0d0e21e
LW
4402 return;
4403
4404 /* Find last node. */
4405 scan = p;
4406 for (;;) {
4407 temp = regnext(scan);
4408 if (temp == NULL)
4409 break;
4410 scan = temp;
4411 }
a687059c 4412
c277df42
IZ
4413 if (reg_off_by_arg[OP(scan)]) {
4414 ARG_SET(scan, val - scan);
a0ed51b3
LW
4415 }
4416 else {
c277df42
IZ
4417 NEXT_OFF(scan) = val - scan;
4418 }
a687059c
LW
4419}
4420
4421/*
a0d0e21e
LW
4422- regoptail - regtail on operand of first argument; nop if operandless
4423*/
76e3520e 4424STATIC void
830247a4 4425S_regoptail(pTHX_ RExC_state_t *pRExC_state, regnode *p, regnode *val)
a687059c 4426{
a0d0e21e 4427 /* "Operandless" and "op != BRANCH" are synonymous in practice. */
c277df42
IZ
4428 if (p == NULL || SIZE_ONLY)
4429 return;
22c35a8c 4430 if (PL_regkind[(U8)OP(p)] == BRANCH) {
830247a4 4431 regtail(pRExC_state, NEXTOPER(p), val);
a0ed51b3 4432 }
22c35a8c 4433 else if ( PL_regkind[(U8)OP(p)] == BRANCHJ) {
830247a4 4434 regtail(pRExC_state, NEXTOPER(NEXTOPER(p)), val);
a0ed51b3
LW
4435 }
4436 else
a0d0e21e 4437 return;
a687059c
LW
4438}
4439
4440/*
4441 - regcurly - a little FSA that accepts {\d+,?\d*}
4442 */
79072805 4443STATIC I32
cea2e8a9 4444S_regcurly(pTHX_ register char *s)
a687059c
LW
4445{
4446 if (*s++ != '{')
4447 return FALSE;
f0fcb552 4448 if (!isDIGIT(*s))
a687059c 4449 return FALSE;
f0fcb552 4450 while (isDIGIT(*s))
a687059c
LW
4451 s++;
4452 if (*s == ',')
4453 s++;
f0fcb552 4454 while (isDIGIT(*s))
a687059c
LW
4455 s++;
4456 if (*s != '}')
4457 return FALSE;
4458 return TRUE;
4459}
4460
a687059c 4461
8fa7f367
JH
4462#ifdef DEBUGGING
4463
76e3520e 4464STATIC regnode *
cea2e8a9 4465S_dumpuntil(pTHX_ regnode *start, regnode *node, regnode *last, SV* sv, I32 l)
c277df42 4466{
f248d071 4467 register U8 op = EXACT; /* Arbitrary non-END op. */
155aba94 4468 register regnode *next;
c277df42
IZ
4469
4470 while (op != END && (!last || node < last)) {
4471 /* While that wasn't END last time... */
4472
4473 NODE_ALIGN(node);
4474 op = OP(node);
4475 if (op == CLOSE)
4476 l--;
4477 next = regnext(node);
4478 /* Where, what. */
4479 if (OP(node) == OPTIMIZED)
4480 goto after_print;
4481 regprop(sv, node);
b900a521 4482 PerlIO_printf(Perl_debug_log, "%4"IVdf":%*s%s", (IV)(node - start),
f1dbda3d 4483 (int)(2*l + 1), "", SvPVX(sv));
c277df42
IZ
4484 if (next == NULL) /* Next ptr. */
4485 PerlIO_printf(Perl_debug_log, "(0)");
b81d288d 4486 else
b900a521 4487 PerlIO_printf(Perl_debug_log, "(%"IVdf")", (IV)(next - start));
c277df42
IZ
4488 (void)PerlIO_putc(Perl_debug_log, '\n');
4489 after_print:
22c35a8c 4490 if (PL_regkind[(U8)op] == BRANCHJ) {
b81d288d
AB
4491 register regnode *nnode = (OP(next) == LONGJMP
4492 ? regnext(next)
c277df42
IZ
4493 : next);
4494 if (last && nnode > last)
4495 nnode = last;
4496 node = dumpuntil(start, NEXTOPER(NEXTOPER(node)), nnode, sv, l + 1);
a0ed51b3 4497 }
22c35a8c 4498 else if (PL_regkind[(U8)op] == BRANCH) {
c277df42 4499 node = dumpuntil(start, NEXTOPER(node), next, sv, l + 1);
a0ed51b3
LW
4500 }
4501 else if ( op == CURLY) { /* `next' might be very big: optimizer */
c277df42
IZ
4502 node = dumpuntil(start, NEXTOPER(node) + EXTRA_STEP_2ARGS,
4503 NEXTOPER(node) + EXTRA_STEP_2ARGS + 1, sv, l + 1);
a0ed51b3 4504 }
22c35a8c 4505 else if (PL_regkind[(U8)op] == CURLY && op != CURLYX) {
c277df42
IZ
4506 node = dumpuntil(start, NEXTOPER(node) + EXTRA_STEP_2ARGS,
4507 next, sv, l + 1);
a0ed51b3
LW
4508 }
4509 else if ( op == PLUS || op == STAR) {
c277df42 4510 node = dumpuntil(start, NEXTOPER(node), NEXTOPER(node) + 1, sv, l + 1);
a0ed51b3
LW
4511 }
4512 else if (op == ANYOF) {
4f66b38d
HS
4513 /* arglen 1 + class block */
4514 node += 1 + ((ANYOF_FLAGS(node) & ANYOF_LARGE)
4515 ? ANYOF_CLASS_SKIP : ANYOF_SKIP);
4516 node = NEXTOPER(node);
a0ed51b3 4517 }
22c35a8c 4518 else if (PL_regkind[(U8)op] == EXACT) {
c277df42 4519 /* Literal string, where present. */
cd439c50 4520 node += NODE_SZ_STR(node) - 1;
c277df42 4521 node = NEXTOPER(node);
a0ed51b3
LW
4522 }
4523 else {
c277df42
IZ
4524 node = NEXTOPER(node);
4525 node += regarglen[(U8)op];
4526 }
4527 if (op == CURLYX || op == OPEN)
4528 l++;
4529 else if (op == WHILEM)
4530 l--;
4531 }
4532 return node;
4533}
4534
8fa7f367
JH
4535#endif /* DEBUGGING */
4536
a687059c 4537/*
fd181c75 4538 - regdump - dump a regexp onto Perl_debug_log in vaguely comprehensible form
a687059c
LW
4539 */
4540void
864dbfa3 4541Perl_regdump(pTHX_ regexp *r)
a687059c 4542{
35ff7856 4543#ifdef DEBUGGING
46fc3d4c 4544 SV *sv = sv_newmortal();
a687059c 4545
c277df42 4546 (void)dumpuntil(r->program, r->program + 1, NULL, sv, 0);
a0d0e21e
LW
4547
4548 /* Header fields of interest. */
c277df42 4549 if (r->anchored_substr)
7b0972df 4550 PerlIO_printf(Perl_debug_log,
b81d288d 4551 "anchored `%s%.*s%s'%s at %"IVdf" ",
3280af22 4552 PL_colors[0],
7b0972df 4553 (int)(SvCUR(r->anchored_substr) - (SvTAIL(r->anchored_substr)!=0)),
b81d288d 4554 SvPVX(r->anchored_substr),
3280af22 4555 PL_colors[1],
c277df42 4556 SvTAIL(r->anchored_substr) ? "$" : "",
7b0972df 4557 (IV)r->anchored_offset);
33b8afdf
JH
4558 else if (r->anchored_utf8)
4559 PerlIO_printf(Perl_debug_log,
4560 "anchored utf8 `%s%.*s%s'%s at %"IVdf" ",
4561 PL_colors[0],
4562 (int)(SvCUR(r->anchored_utf8) - (SvTAIL(r->anchored_utf8)!=0)),
4563 SvPVX(r->anchored_utf8),
4564 PL_colors[1],
4565 SvTAIL(r->anchored_utf8) ? "$" : "",
4566 (IV)r->anchored_offset);
c277df42 4567 if (r->float_substr)
7b0972df 4568 PerlIO_printf(Perl_debug_log,
b81d288d 4569 "floating `%s%.*s%s'%s at %"IVdf"..%"UVuf" ",
3280af22 4570 PL_colors[0],
b81d288d 4571 (int)(SvCUR(r->float_substr) - (SvTAIL(r->float_substr)!=0)),
2c2d71f5 4572 SvPVX(r->float_substr),
3280af22 4573 PL_colors[1],
c277df42 4574 SvTAIL(r->float_substr) ? "$" : "",
7b0972df 4575 (IV)r->float_min_offset, (UV)r->float_max_offset);
33b8afdf
JH
4576 else if (r->float_utf8)
4577 PerlIO_printf(Perl_debug_log,
4578 "floating utf8 `%s%.*s%s'%s at %"IVdf"..%"UVuf" ",
4579 PL_colors[0],
4580 (int)(SvCUR(r->float_utf8) - (SvTAIL(r->float_utf8)!=0)),
4581 SvPVX(r->float_utf8),
4582 PL_colors[1],
4583 SvTAIL(r->float_utf8) ? "$" : "",
4584 (IV)r->float_min_offset, (UV)r->float_max_offset);
4585 if (r->check_substr || r->check_utf8)
b81d288d
AB
4586 PerlIO_printf(Perl_debug_log,
4587 r->check_substr == r->float_substr
33b8afdf 4588 && r->check_utf8 == r->float_utf8
c277df42
IZ
4589 ? "(checking floating" : "(checking anchored");
4590 if (r->reganch & ROPT_NOSCAN)
4591 PerlIO_printf(Perl_debug_log, " noscan");
4592 if (r->reganch & ROPT_CHECK_ALL)
4593 PerlIO_printf(Perl_debug_log, " isall");
33b8afdf 4594 if (r->check_substr || r->check_utf8)
c277df42
IZ
4595 PerlIO_printf(Perl_debug_log, ") ");
4596
46fc3d4c 4597 if (r->regstclass) {
4598 regprop(sv, r->regstclass);
4599 PerlIO_printf(Perl_debug_log, "stclass `%s' ", SvPVX(sv));
4600 }
774d564b 4601 if (r->reganch & ROPT_ANCH) {
4602 PerlIO_printf(Perl_debug_log, "anchored");
4603 if (r->reganch & ROPT_ANCH_BOL)
4604 PerlIO_printf(Perl_debug_log, "(BOL)");
c277df42
IZ
4605 if (r->reganch & ROPT_ANCH_MBOL)
4606 PerlIO_printf(Perl_debug_log, "(MBOL)");
cad2e5aa
JH
4607 if (r->reganch & ROPT_ANCH_SBOL)
4608 PerlIO_printf(Perl_debug_log, "(SBOL)");
774d564b 4609 if (r->reganch & ROPT_ANCH_GPOS)
4610 PerlIO_printf(Perl_debug_log, "(GPOS)");
4611 PerlIO_putc(Perl_debug_log, ' ');
4612 }
c277df42
IZ
4613 if (r->reganch & ROPT_GPOS_SEEN)
4614 PerlIO_printf(Perl_debug_log, "GPOS ");
a0d0e21e 4615 if (r->reganch & ROPT_SKIP)
760ac839 4616 PerlIO_printf(Perl_debug_log, "plus ");
a0d0e21e 4617 if (r->reganch & ROPT_IMPLICIT)
760ac839 4618 PerlIO_printf(Perl_debug_log, "implicit ");
760ac839 4619 PerlIO_printf(Perl_debug_log, "minlen %ld ", (long) r->minlen);
ce862d02
IZ
4620 if (r->reganch & ROPT_EVAL_SEEN)
4621 PerlIO_printf(Perl_debug_log, "with eval ");
760ac839 4622 PerlIO_printf(Perl_debug_log, "\n");
fac92740
MJD
4623 if (r->offsets) {
4624 U32 i;
4625 U32 len = r->offsets[0];
392fbf5d 4626 PerlIO_printf(Perl_debug_log, "Offsets: [%"UVuf"]\n\t", (UV)r->offsets[0]);
fac92740 4627 for (i = 1; i <= len; i++)
392fbf5d
RB
4628 PerlIO_printf(Perl_debug_log, "%"UVuf"[%"UVuf"] ",
4629 (UV)r->offsets[i*2-1],
4630 (UV)r->offsets[i*2]);
fac92740
MJD
4631 PerlIO_printf(Perl_debug_log, "\n");
4632 }
17c3b450 4633#endif /* DEBUGGING */
a687059c
LW
4634}
4635
8fa7f367
JH
4636#ifdef DEBUGGING
4637
653099ff
GS
4638STATIC void
4639S_put_byte(pTHX_ SV *sv, int c)
4640{
7be5a6cf 4641 if (isCNTRL(c) || c == 255 || !isPRINT(c))
653099ff
GS
4642 Perl_sv_catpvf(aTHX_ sv, "\\%o", c);
4643 else if (c == '-' || c == ']' || c == '\\' || c == '^')
4644 Perl_sv_catpvf(aTHX_ sv, "\\%c", c);
4645 else
4646 Perl_sv_catpvf(aTHX_ sv, "%c", c);
4647}
4648
8fa7f367
JH
4649#endif /* DEBUGGING */
4650
a687059c 4651/*
a0d0e21e
LW
4652- regprop - printable representation of opcode
4653*/
46fc3d4c 4654void
864dbfa3 4655Perl_regprop(pTHX_ SV *sv, regnode *o)
a687059c 4656{
35ff7856 4657#ifdef DEBUGGING
9b155405 4658 register int k;
a0d0e21e 4659
54dc92de 4660 sv_setpvn(sv, "", 0);
9b155405 4661 if (OP(o) >= reg_num) /* regnode.type is unsigned */
830247a4
IZ
4662 /* It would be nice to FAIL() here, but this may be called from
4663 regexec.c, and it would be hard to supply pRExC_state. */
4664 Perl_croak(aTHX_ "Corrupted regexp opcode");
9b155405
IZ
4665 sv_catpv(sv, (char*)reg_name[OP(o)]); /* Take off const! */
4666
4667 k = PL_regkind[(U8)OP(o)];
4668
2a782b5b
JH
4669 if (k == EXACT) {
4670 SV *dsv = sv_2mortal(newSVpvn("", 0));
c728cb41
JH
4671 /* Using is_utf8_string() is a crude hack but it may
4672 * be the best for now since we have no flag "this EXACTish
4673 * node was UTF-8" --jhi */
4674 bool do_utf8 = is_utf8_string((U8*)STRING(o), STR_LEN(o));
8a989385 4675 char *s = do_utf8 ?
c728cb41
JH
4676 pv_uni_display(dsv, (U8*)STRING(o), STR_LEN(o), 60,
4677 UNI_DISPLAY_REGEX) :
2a782b5b 4678 STRING(o);
40eddc46 4679 int len = do_utf8 ?
2a782b5b
JH
4680 strlen(s) :
4681 STR_LEN(o);
4682 Perl_sv_catpvf(aTHX_ sv, " <%s%.*s%s>",
4683 PL_colors[0],
4684 len, s,
4685 PL_colors[1]);
4686 }
9b155405 4687 else if (k == CURLY) {
cb434fcc 4688 if (OP(o) == CURLYM || OP(o) == CURLYN || OP(o) == CURLYX)
cea2e8a9
GS
4689 Perl_sv_catpvf(aTHX_ sv, "[%d]", o->flags); /* Parenth number */
4690 Perl_sv_catpvf(aTHX_ sv, " {%d,%d}", ARG1(o), ARG2(o));
a0d0e21e 4691 }
2c2d71f5
JH
4692 else if (k == WHILEM && o->flags) /* Ordinal/of */
4693 Perl_sv_catpvf(aTHX_ sv, "[%d/%d]", o->flags & 0xf, o->flags>>4);
9b155405 4694 else if (k == REF || k == OPEN || k == CLOSE || k == GROUPP )
894356b3 4695 Perl_sv_catpvf(aTHX_ sv, "%d", (int)ARG(o)); /* Parenth number */
9b155405 4696 else if (k == LOGICAL)
04ebc1ab 4697 Perl_sv_catpvf(aTHX_ sv, "[%d]", o->flags); /* 2: embedded, otherwise 1 */
653099ff
GS
4698 else if (k == ANYOF) {
4699 int i, rangestart = -1;
ffc61ed2 4700 U8 flags = ANYOF_FLAGS(o);
19860706
JH
4701 const char * const anyofs[] = { /* Should be syncronized with
4702 * ANYOF_ #xdefines in regcomp.h */
653099ff
GS
4703 "\\w",
4704 "\\W",
4705 "\\s",
4706 "\\S",
4707 "\\d",
4708 "\\D",
4709 "[:alnum:]",
4710 "[:^alnum:]",
4711 "[:alpha:]",
4712 "[:^alpha:]",
4713 "[:ascii:]",
4714 "[:^ascii:]",
4715 "[:ctrl:]",
4716 "[:^ctrl:]",
4717 "[:graph:]",
4718 "[:^graph:]",
4719 "[:lower:]",
4720 "[:^lower:]",
4721 "[:print:]",
4722 "[:^print:]",
4723 "[:punct:]",
4724 "[:^punct:]",
4725 "[:upper:]",
aaa51d5e 4726 "[:^upper:]",
653099ff 4727 "[:xdigit:]",
aaa51d5e
JF
4728 "[:^xdigit:]",
4729 "[:space:]",
4730 "[:^space:]",
4731 "[:blank:]",
4732 "[:^blank:]"
653099ff
GS
4733 };
4734
19860706 4735 if (flags & ANYOF_LOCALE)
653099ff 4736 sv_catpv(sv, "{loc}");
19860706 4737 if (flags & ANYOF_FOLD)
653099ff
GS
4738 sv_catpv(sv, "{i}");
4739 Perl_sv_catpvf(aTHX_ sv, "[%s", PL_colors[0]);
19860706 4740 if (flags & ANYOF_INVERT)
653099ff 4741 sv_catpv(sv, "^");
ffc61ed2
JH
4742 for (i = 0; i <= 256; i++) {
4743 if (i < 256 && ANYOF_BITMAP_TEST(o,i)) {
4744 if (rangestart == -1)
4745 rangestart = i;
4746 } else if (rangestart != -1) {
4747 if (i <= rangestart + 3)
4748 for (; rangestart < i; rangestart++)
653099ff 4749 put_byte(sv, rangestart);
ffc61ed2
JH
4750 else {
4751 put_byte(sv, rangestart);
4752 sv_catpv(sv, "-");
4753 put_byte(sv, i - 1);
653099ff 4754 }
ffc61ed2 4755 rangestart = -1;
653099ff 4756 }
847a199f 4757 }
ffc61ed2
JH
4758
4759 if (o->flags & ANYOF_CLASS)
4760 for (i = 0; i < sizeof(anyofs)/sizeof(char*); i++)
4761 if (ANYOF_CLASS_TEST(o,i))
4762 sv_catpv(sv, anyofs[i]);
4763
4764 if (flags & ANYOF_UNICODE)
4765 sv_catpv(sv, "{unicode}");
1aa99e6b 4766 else if (flags & ANYOF_UNICODE_ALL)
2a782b5b 4767 sv_catpv(sv, "{unicode_all}");
ffc61ed2
JH
4768
4769 {
4770 SV *lv;
9e55ce06 4771 SV *sw = regclass_swash(o, FALSE, &lv, 0);
b81d288d 4772
ffc61ed2
JH
4773 if (lv) {
4774 if (sw) {
4775 UV i;
4776 U8 s[UTF8_MAXLEN+1];
b81d288d 4777
ffc61ed2 4778 for (i = 0; i <= 256; i++) { /* just the first 256 */
2b9d42f0 4779 U8 *e = uvchr_to_utf8(s, i);
ffc61ed2 4780
3568d838 4781 if (i < 256 && swash_fetch(sw, s, TRUE)) {
ffc61ed2
JH
4782 if (rangestart == -1)
4783 rangestart = i;
4784 } else if (rangestart != -1) {
4785 U8 *p;
b81d288d 4786
ffc61ed2
JH
4787 if (i <= rangestart + 3)
4788 for (; rangestart < i; rangestart++) {
2b9d42f0 4789 for(e = uvchr_to_utf8(s, rangestart), p = s; p < e; p++)
ffc61ed2
JH
4790 put_byte(sv, *p);
4791 }
4792 else {
2b9d42f0 4793 for (e = uvchr_to_utf8(s, rangestart), p = s; p < e; p++)
ffc61ed2
JH
4794 put_byte(sv, *p);
4795 sv_catpv(sv, "-");
2b9d42f0 4796 for (e = uvchr_to_utf8(s, i - 1), p = s; p < e; p++)
ffc61ed2
JH
4797 put_byte(sv, *p);
4798 }
4799 rangestart = -1;
4800 }
19860706 4801 }
ffc61ed2
JH
4802
4803 sv_catpv(sv, "..."); /* et cetera */
19860706 4804 }
fde631ed 4805
ffc61ed2
JH
4806 {
4807 char *s = savepv(SvPVX(lv));
4808 char *origs = s;
b81d288d 4809
ffc61ed2 4810 while(*s && *s != '\n') s++;
b81d288d 4811
ffc61ed2
JH
4812 if (*s == '\n') {
4813 char *t = ++s;
4814
4815 while (*s) {
4816 if (*s == '\n')
4817 *s = ' ';
4818 s++;
4819 }
4820 if (s[-1] == ' ')
4821 s[-1] = 0;
4822
4823 sv_catpv(sv, t);
fde631ed 4824 }
b81d288d 4825
ffc61ed2 4826 Safefree(origs);
fde631ed
JH
4827 }
4828 }
653099ff 4829 }
ffc61ed2 4830
653099ff
GS
4831 Perl_sv_catpvf(aTHX_ sv, "%s]", PL_colors[1]);
4832 }
9b155405 4833 else if (k == BRANCHJ && (OP(o) == UNLESSM || OP(o) == IFMATCH))
cea2e8a9 4834 Perl_sv_catpvf(aTHX_ sv, "[-%d]", o->flags);
17c3b450 4835#endif /* DEBUGGING */
35ff7856 4836}
a687059c 4837
cad2e5aa
JH
4838SV *
4839Perl_re_intuit_string(pTHX_ regexp *prog)
4840{ /* Assume that RE_INTUIT is set */
4841 DEBUG_r(
4842 { STRLEN n_a;
33b8afdf
JH
4843 char *s = SvPV(prog->check_substr
4844 ? prog->check_substr : prog->check_utf8, n_a);
cad2e5aa
JH
4845
4846 if (!PL_colorset) reginitcolors();
4847 PerlIO_printf(Perl_debug_log,
33b8afdf
JH
4848 "%sUsing REx %ssubstr:%s `%s%.60s%s%s'\n",
4849 PL_colors[4],
4850 prog->check_substr ? "" : "utf8 ",
4851 PL_colors[5],PL_colors[0],
cad2e5aa
JH
4852 s,
4853 PL_colors[1],
4854 (strlen(s) > 60 ? "..." : ""));
4855 } );
4856
33b8afdf 4857 return prog->check_substr ? prog->check_substr : prog->check_utf8;
cad2e5aa
JH
4858}
4859
2b69d0c2 4860void
864dbfa3 4861Perl_pregfree(pTHX_ struct regexp *r)
a687059c 4862{
9e55ce06
JH
4863#ifdef DEBUGGING
4864 SV *dsv = PERL_DEBUG_PAD_ZERO(0);
4865#endif
7821416a
IZ
4866
4867 if (!r || (--r->refcnt > 0))
4868 return;
9e55ce06 4869 DEBUG_r({
9e55ce06 4870 char *s = pv_uni_display(dsv, (U8*)r->precomp, r->prelen, 60,
c728cb41 4871 UNI_DISPLAY_REGEX);
9e55ce06
JH
4872 int len = SvCUR(dsv);
4873 if (!PL_colorset)
4874 reginitcolors();
4875 PerlIO_printf(Perl_debug_log,
4876 "%sFreeing REx:%s `%s%*.*s%s%s'\n",
4877 PL_colors[4],PL_colors[5],PL_colors[0],
4878 len, len, s,
4879 PL_colors[1],
4880 len > 60 ? "..." : "");
4881 });
cad2e5aa 4882
c277df42 4883 if (r->precomp)
a0d0e21e 4884 Safefree(r->precomp);
fac92740
MJD
4885 if (r->offsets) /* 20010421 MJD */
4886 Safefree(r->offsets);
cf93c79d
IZ
4887 if (RX_MATCH_COPIED(r))
4888 Safefree(r->subbeg);
a193d654
GS
4889 if (r->substrs) {
4890 if (r->anchored_substr)
4891 SvREFCNT_dec(r->anchored_substr);
33b8afdf
JH
4892 if (r->anchored_utf8)
4893 SvREFCNT_dec(r->anchored_utf8);
a193d654
GS
4894 if (r->float_substr)
4895 SvREFCNT_dec(r->float_substr);
33b8afdf
JH
4896 if (r->float_utf8)
4897 SvREFCNT_dec(r->float_utf8);
2779dcf1 4898 Safefree(r->substrs);
a193d654 4899 }
c277df42
IZ
4900 if (r->data) {
4901 int n = r->data->count;
dfad63ad
HS
4902 AV* new_comppad = NULL;
4903 AV* old_comppad;
4904 SV** old_curpad;
4905
c277df42 4906 while (--n >= 0) {
261faec3 4907 /* If you add a ->what type here, update the comment in regcomp.h */
c277df42
IZ
4908 switch (r->data->what[n]) {
4909 case 's':
4910 SvREFCNT_dec((SV*)r->data->data[n]);
4911 break;
653099ff
GS
4912 case 'f':
4913 Safefree(r->data->data[n]);
4914 break;
dfad63ad
HS
4915 case 'p':
4916 new_comppad = (AV*)r->data->data[n];
4917 break;
c277df42 4918 case 'o':
dfad63ad 4919 if (new_comppad == NULL)
cea2e8a9 4920 Perl_croak(aTHX_ "panic: pregfree comppad");
dfad63ad
HS
4921 old_comppad = PL_comppad;
4922 old_curpad = PL_curpad;
1e6dc0b6
SB
4923 /* Watch out for global destruction's random ordering. */
4924 if (SvTYPE(new_comppad) == SVt_PVAV) {
4925 PL_comppad = new_comppad;
4926 PL_curpad = AvARRAY(new_comppad);
4927 }
4928 else
4929 PL_curpad = NULL;
9b978d73
DM
4930
4931 if (!OpREFCNT_dec((OP_4tree*)r->data->data[n])) {
4932 op_free((OP_4tree*)r->data->data[n]);
4933 }
4934
dfad63ad
HS
4935 PL_comppad = old_comppad;
4936 PL_curpad = old_curpad;
4937 SvREFCNT_dec((SV*)new_comppad);
4938 new_comppad = NULL;
c277df42
IZ
4939 break;
4940 case 'n':
9e55ce06 4941 break;
c277df42 4942 default:
830247a4 4943 Perl_croak(aTHX_ "panic: regfree data code '%c'", r->data->what[n]);
c277df42
IZ
4944 }
4945 }
4946 Safefree(r->data->what);
4947 Safefree(r->data);
a0d0e21e
LW
4948 }
4949 Safefree(r->startp);
4950 Safefree(r->endp);
4951 Safefree(r);
a687059c 4952}
c277df42
IZ
4953
4954/*
4955 - regnext - dig the "next" pointer out of a node
4956 *
4957 * [Note, when REGALIGN is defined there are two places in regmatch()
4958 * that bypass this code for speed.]
4959 */
4960regnode *
864dbfa3 4961Perl_regnext(pTHX_ register regnode *p)
c277df42
IZ
4962{
4963 register I32 offset;
4964
3280af22 4965 if (p == &PL_regdummy)
c277df42
IZ
4966 return(NULL);
4967
4968 offset = (reg_off_by_arg[OP(p)] ? ARG(p) : NEXT_OFF(p));
4969 if (offset == 0)
4970 return(NULL);
4971
c277df42 4972 return(p+offset);
c277df42
IZ
4973}
4974
01f988be 4975STATIC void
cea2e8a9 4976S_re_croak2(pTHX_ const char* pat1,const char* pat2,...)
c277df42
IZ
4977{
4978 va_list args;
4979 STRLEN l1 = strlen(pat1);
4980 STRLEN l2 = strlen(pat2);
4981 char buf[512];
06bf62c7 4982 SV *msv;
c277df42
IZ
4983 char *message;
4984
4985 if (l1 > 510)
4986 l1 = 510;
4987 if (l1 + l2 > 510)
4988 l2 = 510 - l1;
4989 Copy(pat1, buf, l1 , char);
4990 Copy(pat2, buf + l1, l2 , char);
3b818b81
GS
4991 buf[l1 + l2] = '\n';
4992 buf[l1 + l2 + 1] = '\0';
8736538c
AS
4993#ifdef I_STDARG
4994 /* ANSI variant takes additional second argument */
c277df42 4995 va_start(args, pat2);
8736538c
AS
4996#else
4997 va_start(args);
4998#endif
5a844595 4999 msv = vmess(buf, &args);
c277df42 5000 va_end(args);
06bf62c7 5001 message = SvPV(msv,l1);
c277df42
IZ
5002 if (l1 > 512)
5003 l1 = 512;
5004 Copy(message, buf, l1 , char);
5005 buf[l1] = '\0'; /* Overwrite \n */
cea2e8a9 5006 Perl_croak(aTHX_ "%s", buf);
c277df42 5007}
a0ed51b3
LW
5008
5009/* XXX Here's a total kludge. But we need to re-enter for swash routines. */
5010
5011void
864dbfa3 5012Perl_save_re_context(pTHX)
b81d288d 5013{
830247a4
IZ
5014#if 0
5015 SAVEPPTR(RExC_precomp); /* uncompiled string. */
5016 SAVEI32(RExC_npar); /* () count. */
5017 SAVEI32(RExC_size); /* Code size. */
5018 SAVEI16(RExC_flags16); /* are we folding, multilining? */
5019 SAVEVPTR(RExC_rx); /* from regcomp.c */
5020 SAVEI32(RExC_seen); /* from regcomp.c */
5021 SAVEI32(RExC_sawback); /* Did we see \1, ...? */
5022 SAVEI32(RExC_naughty); /* How bad is this pattern? */
5023 SAVEVPTR(RExC_emit); /* Code-emit pointer; &regdummy = don't */
5024 SAVEPPTR(RExC_end); /* End of input for compile */
5025 SAVEPPTR(RExC_parse); /* Input-scan pointer. */
5026#endif
5027
5028 SAVEI32(PL_reg_flags); /* from regexec.c */
a0ed51b3 5029 SAVEPPTR(PL_bostr);
a0ed51b3
LW
5030 SAVEPPTR(PL_reginput); /* String-input pointer. */
5031 SAVEPPTR(PL_regbol); /* Beginning of input, for ^ check. */
5032 SAVEPPTR(PL_regeol); /* End of input, for $ check. */
7766f137
GS
5033 SAVEVPTR(PL_regstartp); /* Pointer to startp array. */
5034 SAVEVPTR(PL_regendp); /* Ditto for endp. */
5035 SAVEVPTR(PL_reglastparen); /* Similarly for lastparen. */
a0ed51b3 5036 SAVEPPTR(PL_regtill); /* How far we are required to go. */
b81d288d 5037 SAVEGENERICPV(PL_reg_start_tmp); /* from regexec.c */
a0ed51b3 5038 PL_reg_start_tmp = 0;
a0ed51b3
LW
5039 SAVEI32(PL_reg_start_tmpl); /* from regexec.c */
5040 PL_reg_start_tmpl = 0;
7766f137 5041 SAVEVPTR(PL_regdata);
a0ed51b3
LW
5042 SAVEI32(PL_reg_eval_set); /* from regexec.c */
5043 SAVEI32(PL_regnarrate); /* from regexec.c */
7766f137 5044 SAVEVPTR(PL_regprogram); /* from regexec.c */
a0ed51b3 5045 SAVEINT(PL_regindent); /* from regexec.c */
7766f137
GS
5046 SAVEVPTR(PL_regcc); /* from regexec.c */
5047 SAVEVPTR(PL_curcop);
7766f137
GS
5048 SAVEVPTR(PL_reg_call_cc); /* from regexec.c */
5049 SAVEVPTR(PL_reg_re); /* from regexec.c */
54b6e2fa
IZ
5050 SAVEPPTR(PL_reg_ganch); /* from regexec.c */
5051 SAVESPTR(PL_reg_sv); /* from regexec.c */
53c4c00c 5052 SAVEI8(PL_reg_match_utf8); /* from regexec.c */
7766f137 5053 SAVEVPTR(PL_reg_magic); /* from regexec.c */
54b6e2fa 5054 SAVEI32(PL_reg_oldpos); /* from regexec.c */
7766f137
GS
5055 SAVEVPTR(PL_reg_oldcurpm); /* from regexec.c */
5056 SAVEVPTR(PL_reg_curpm); /* from regexec.c */
5fb7366e 5057 SAVEI32(PL_regnpar); /* () count. */
e49a9654 5058 SAVEI32(PL_regsize); /* from regexec.c */
54b6e2fa 5059#ifdef DEBUGGING
b81d288d 5060 SAVEPPTR(PL_reg_starttry); /* from regexec.c */
54b6e2fa 5061#endif
a0ed51b3 5062}
51371543 5063
51371543 5064static void
acfe0abc 5065clear_re(pTHX_ void *r)
51371543
GS
5066{
5067 ReREFCNT_dec((regexp *)r);
5068}
ffbc6a93 5069