This is a live mirror of the Perl 5 development currently hosted at https://github.com/perl/perl5
Upgrade to Unicode::Collate 0.20.
[perl5.git] / regcomp.c
CommitLineData
a0d0e21e
LW
1/* regcomp.c
2 */
3
4/*
5 * "A fair jaw-cracker dwarf-language must be." --Samwise Gamgee
6 */
7
a687059c
LW
8/* NOTE: this is derived from Henry Spencer's regexp code, and should not
9 * confused with the original package (see point 3 below). Thanks, Henry!
10 */
11
12/* Additional note: this code is very heavily munged from Henry's version
13 * in places. In some spots I've traded clarity for efficiency, so don't
14 * blame Henry for some of the lack of readability.
15 */
16
e50aee73
AD
17/* The names of the functions have been changed from regcomp and
18 * regexec to pregcomp and pregexec in order to avoid conflicts
19 * with the POSIX routines of the same names.
20*/
21
b9d5759e
AD
22#ifdef PERL_EXT_RE_BUILD
23/* need to replace pregcomp et al, so enable that */
24# ifndef PERL_IN_XSUB_RE
25# define PERL_IN_XSUB_RE
26# endif
27/* need access to debugger hooks */
cad2e5aa 28# if defined(PERL_EXT_RE_DEBUG) && !defined(DEBUGGING)
b9d5759e
AD
29# define DEBUGGING
30# endif
31#endif
32
33#ifdef PERL_IN_XSUB_RE
d06ea78c 34/* We *really* need to overwrite these symbols: */
56953603
IZ
35# define Perl_pregcomp my_regcomp
36# define Perl_regdump my_regdump
37# define Perl_regprop my_regprop
d06ea78c 38# define Perl_pregfree my_regfree
cad2e5aa
JH
39# define Perl_re_intuit_string my_re_intuit_string
40/* *These* symbols are masked to allow static link. */
d06ea78c 41# define Perl_regnext my_regnext
f0b8d043 42# define Perl_save_re_context my_save_re_context
b81d288d 43# define Perl_reginitcolors my_reginitcolors
c5be433b
GS
44
45# define PERL_NO_GET_CONTEXT
b81d288d 46#endif
56953603 47
f0fcb552 48/*SUPPRESS 112*/
a687059c 49/*
e50aee73 50 * pregcomp and pregexec -- regsub and regerror are not used in perl
a687059c
LW
51 *
52 * Copyright (c) 1986 by University of Toronto.
53 * Written by Henry Spencer. Not derived from licensed software.
54 *
55 * Permission is granted to anyone to use this software for any
56 * purpose on any computer system, and to redistribute it freely,
57 * subject to the following restrictions:
58 *
59 * 1. The author is not responsible for the consequences of use of
60 * this software, no matter how awful, even if they arise
61 * from defects in it.
62 *
63 * 2. The origin of this software must not be misrepresented, either
64 * by explicit claim or by omission.
65 *
66 * 3. Altered versions must be plainly marked as such, and must not
67 * be misrepresented as being the original software.
68 *
69 *
70 **** Alterations to Henry's code are...
71 ****
be3c0a43 72 **** Copyright (c) 1991-2002, Larry Wall
a687059c 73 ****
9ef589d8
LW
74 **** You may distribute under the terms of either the GNU General Public
75 **** License or the Artistic License, as specified in the README file.
76
a687059c
LW
77 *
78 * Beware that some of this code is subtly aware of the way operator
79 * precedence is structured in regular expressions. Serious changes in
80 * regular-expression syntax might require a total rethink.
81 */
82#include "EXTERN.h"
864dbfa3 83#define PERL_IN_REGCOMP_C
a687059c 84#include "perl.h"
d06ea78c 85
acfe0abc 86#ifndef PERL_IN_XSUB_RE
d06ea78c
GS
87# include "INTERN.h"
88#endif
c277df42
IZ
89
90#define REG_COMP_C
a687059c
LW
91#include "regcomp.h"
92
d4cce5f1 93#ifdef op
11343788 94#undef op
d4cce5f1 95#endif /* op */
11343788 96
fe14fcc3
LW
97#ifdef MSDOS
98# if defined(BUGGY_MSC6)
99 /* MSC 6.00A breaks on op/regexp.t test 85 unless we turn this off */
100 # pragma optimize("a",off)
101 /* But MSC 6.00A is happy with 'w', for aliases only across function calls*/
102 # pragma optimize("w",on )
103# endif /* BUGGY_MSC6 */
104#endif /* MSDOS */
105
a687059c
LW
106#ifndef STATIC
107#define STATIC static
108#endif
109
830247a4 110typedef struct RExC_state_t {
e2509266 111 U32 flags; /* are we folding, multilining? */
830247a4
IZ
112 char *precomp; /* uncompiled string. */
113 regexp *rx;
fac92740 114 char *start; /* Start of input for compile */
830247a4
IZ
115 char *end; /* End of input for compile */
116 char *parse; /* Input-scan pointer. */
117 I32 whilem_seen; /* number of WHILEM in this expr */
fac92740 118 regnode *emit_start; /* Start of emitted-code area */
ffc61ed2 119 regnode *emit; /* Code-emit pointer; &regdummy = don't = compiling */
830247a4
IZ
120 I32 naughty; /* How bad is this pattern? */
121 I32 sawback; /* Did we see \1, ...? */
122 U32 seen;
123 I32 size; /* Code size. */
124 I32 npar; /* () count. */
125 I32 extralen;
126 I32 seen_zerolen;
127 I32 seen_evals;
1aa99e6b 128 I32 utf8;
830247a4
IZ
129#if ADD_TO_REGEXEC
130 char *starttry; /* -Dr: where regtry was called. */
131#define RExC_starttry (pRExC_state->starttry)
132#endif
133} RExC_state_t;
134
e2509266 135#define RExC_flags (pRExC_state->flags)
830247a4
IZ
136#define RExC_precomp (pRExC_state->precomp)
137#define RExC_rx (pRExC_state->rx)
fac92740 138#define RExC_start (pRExC_state->start)
830247a4
IZ
139#define RExC_end (pRExC_state->end)
140#define RExC_parse (pRExC_state->parse)
141#define RExC_whilem_seen (pRExC_state->whilem_seen)
fac92740 142#define RExC_offsets (pRExC_state->rx->offsets) /* I am not like the others */
830247a4 143#define RExC_emit (pRExC_state->emit)
fac92740 144#define RExC_emit_start (pRExC_state->emit_start)
830247a4
IZ
145#define RExC_naughty (pRExC_state->naughty)
146#define RExC_sawback (pRExC_state->sawback)
147#define RExC_seen (pRExC_state->seen)
148#define RExC_size (pRExC_state->size)
149#define RExC_npar (pRExC_state->npar)
150#define RExC_extralen (pRExC_state->extralen)
151#define RExC_seen_zerolen (pRExC_state->seen_zerolen)
152#define RExC_seen_evals (pRExC_state->seen_evals)
1aa99e6b 153#define RExC_utf8 (pRExC_state->utf8)
830247a4 154
a687059c
LW
155#define ISMULT1(c) ((c) == '*' || (c) == '+' || (c) == '?')
156#define ISMULT2(s) ((*s) == '*' || (*s) == '+' || (*s) == '?' || \
157 ((*s) == '{' && regcurly(s)))
a687059c 158
35c8bce7
LW
159#ifdef SPSTART
160#undef SPSTART /* dratted cpp namespace... */
161#endif
a687059c
LW
162/*
163 * Flags to be passed up and down.
164 */
a687059c 165#define WORST 0 /* Worst case. */
821b33a5 166#define HASWIDTH 0x1 /* Known to match non-null strings. */
a0d0e21e
LW
167#define SIMPLE 0x2 /* Simple enough to be STAR/PLUS operand. */
168#define SPSTART 0x4 /* Starts with * or +. */
169#define TRYAGAIN 0x8 /* Weeded out a declaration. */
a687059c 170
2c2d71f5
JH
171/* Length of a variant. */
172
173typedef struct scan_data_t {
174 I32 len_min;
175 I32 len_delta;
176 I32 pos_min;
177 I32 pos_delta;
178 SV *last_found;
179 I32 last_end; /* min value, <0 unless valid. */
180 I32 last_start_min;
181 I32 last_start_max;
182 SV **longest; /* Either &l_fixed, or &l_float. */
183 SV *longest_fixed;
184 I32 offset_fixed;
185 SV *longest_float;
186 I32 offset_float_min;
187 I32 offset_float_max;
188 I32 flags;
189 I32 whilem_c;
cb434fcc 190 I32 *last_closep;
653099ff 191 struct regnode_charclass_class *start_class;
2c2d71f5
JH
192} scan_data_t;
193
a687059c 194/*
e50aee73 195 * Forward declarations for pregcomp()'s friends.
a687059c 196 */
a0d0e21e 197
b81d288d 198static scan_data_t zero_scan_data = { 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
cb434fcc 199 0, 0, 0, 0, 0, 0};
c277df42
IZ
200
201#define SF_BEFORE_EOL (SF_BEFORE_SEOL|SF_BEFORE_MEOL)
202#define SF_BEFORE_SEOL 0x1
203#define SF_BEFORE_MEOL 0x2
204#define SF_FIX_BEFORE_EOL (SF_FIX_BEFORE_SEOL|SF_FIX_BEFORE_MEOL)
205#define SF_FL_BEFORE_EOL (SF_FL_BEFORE_SEOL|SF_FL_BEFORE_MEOL)
206
09b7f37c
CB
207#ifdef NO_UNARY_PLUS
208# define SF_FIX_SHIFT_EOL (0+2)
209# define SF_FL_SHIFT_EOL (0+4)
210#else
211# define SF_FIX_SHIFT_EOL (+2)
212# define SF_FL_SHIFT_EOL (+4)
213#endif
c277df42
IZ
214
215#define SF_FIX_BEFORE_SEOL (SF_BEFORE_SEOL << SF_FIX_SHIFT_EOL)
216#define SF_FIX_BEFORE_MEOL (SF_BEFORE_MEOL << SF_FIX_SHIFT_EOL)
217
218#define SF_FL_BEFORE_SEOL (SF_BEFORE_SEOL << SF_FL_SHIFT_EOL)
219#define SF_FL_BEFORE_MEOL (SF_BEFORE_MEOL << SF_FL_SHIFT_EOL) /* 0x20 */
220#define SF_IS_INF 0x40
221#define SF_HAS_PAR 0x80
222#define SF_IN_PAR 0x100
223#define SF_HAS_EVAL 0x200
4bfe0158 224#define SCF_DO_SUBSTR 0x400
653099ff
GS
225#define SCF_DO_STCLASS_AND 0x0800
226#define SCF_DO_STCLASS_OR 0x1000
227#define SCF_DO_STCLASS (SCF_DO_STCLASS_AND|SCF_DO_STCLASS_OR)
e1901655 228#define SCF_WHILEM_VISITED_POS 0x2000
c277df42 229
eb160463 230#define UTF (RExC_utf8 != 0)
e2509266
JH
231#define LOC ((RExC_flags & PMf_LOCALE) != 0)
232#define FOLD ((RExC_flags & PMf_FOLD) != 0)
a0ed51b3 233
ffc61ed2 234#define OOB_UNICODE 12345678
93733859 235#define OOB_NAMEDCLASS -1
b8c5462f 236
a0ed51b3
LW
237#define CHR_SVLEN(sv) (UTF ? sv_len_utf8(sv) : SvCUR(sv))
238#define CHR_DIST(a,b) (UTF ? utf8_distance(a,b) : a - b)
239
8615cb43 240
b45f050a
JF
241/* length of regex to show in messages that don't mark a position within */
242#define RegexLengthToShowInErrorMessages 127
243
244/*
245 * If MARKER[12] are adjusted, be sure to adjust the constants at the top
246 * of t/op/regmesg.t, the tests in t/op/re_tests, and those in
247 * op/pragma/warn/regcomp.
248 */
7253e4e3
RK
249#define MARKER1 "<-- HERE" /* marker as it appears in the description */
250#define MARKER2 " <-- HERE " /* marker as it appears within the regex */
b81d288d 251
7253e4e3 252#define REPORT_LOCATION " in regex; marked by " MARKER1 " in m/%.*s" MARKER2 "%s/"
b45f050a
JF
253
254/*
255 * Calls SAVEDESTRUCTOR_X if needed, then calls Perl_croak with the given
256 * arg. Show regex, up to a maximum length. If it's too long, chop and add
257 * "...".
258 */
a4eb266f 259#define FAIL(msg) \
8615cb43 260 STMT_START { \
a4eb266f 261 char *ellipses = ""; \
7a799e20 262 IV len = RExC_end - RExC_precomp; \
b45f050a 263 \
8615cb43 264 if (!SIZE_ONLY) \
830247a4 265 SAVEDESTRUCTOR_X(clear_re,(void*)RExC_rx); \
b45f050a
JF
266 \
267 if (len > RegexLengthToShowInErrorMessages) { \
268 /* chop 10 shorter than the max, to ensure meaning of "..." */ \
269 len = RegexLengthToShowInErrorMessages - 10; \
a4eb266f 270 ellipses = "..."; \
b45f050a
JF
271 } \
272 Perl_croak(aTHX_ "%s in regex m/%.*s%s/", \
830247a4 273 msg, (int)len, RExC_precomp, ellipses); \
8615cb43
JF
274 } STMT_END
275
b45f050a
JF
276/*
277 * Calls SAVEDESTRUCTOR_X if needed, then calls Perl_croak with the given
278 * args. Show regex, up to a maximum length. If it's too long, chop and add
279 * "...".
280 */
a4eb266f 281#define FAIL2(pat,msg) \
8615cb43 282 STMT_START { \
a4eb266f 283 char *ellipses = ""; \
7a799e20 284 IV len = RExC_end - RExC_precomp; \
b45f050a 285 \
8615cb43 286 if (!SIZE_ONLY) \
830247a4 287 SAVEDESTRUCTOR_X(clear_re,(void*)RExC_rx); \
8615cb43 288 \
b45f050a
JF
289 if (len > RegexLengthToShowInErrorMessages) { \
290 /* chop 10 shorter than the max, to ensure meaning of "..." */ \
291 len = RegexLengthToShowInErrorMessages - 10; \
a4eb266f 292 ellipses = "..."; \
b45f050a
JF
293 } \
294 S_re_croak2(aTHX_ pat, " in regex m/%.*s%s/", \
830247a4 295 msg, (int)len, RExC_precomp, ellipses); \
b45f050a
JF
296 } STMT_END
297
298
299/*
300 * Simple_vFAIL -- like FAIL, but marks the current location in the scan
301 */
302#define Simple_vFAIL(m) \
303 STMT_START { \
7a799e20 304 IV offset = RExC_parse - RExC_precomp; \
b45f050a
JF
305 \
306 Perl_croak(aTHX_ "%s" REPORT_LOCATION, \
830247a4 307 m, (int)offset, RExC_precomp, RExC_precomp + offset); \
b45f050a
JF
308 } STMT_END
309
310/*
311 * Calls SAVEDESTRUCTOR_X if needed, then Simple_vFAIL()
312 */
313#define vFAIL(m) \
314 STMT_START { \
315 if (!SIZE_ONLY) \
830247a4 316 SAVEDESTRUCTOR_X(clear_re,(void*)RExC_rx); \
b45f050a
JF
317 Simple_vFAIL(m); \
318 } STMT_END
319
320/*
321 * Like Simple_vFAIL(), but accepts two arguments.
322 */
323#define Simple_vFAIL2(m,a1) \
324 STMT_START { \
7a799e20 325 IV offset = RExC_parse - RExC_precomp; \
b45f050a
JF
326 \
327 S_re_croak2(aTHX_ m, REPORT_LOCATION, a1, \
830247a4 328 (int)offset, RExC_precomp, RExC_precomp + offset); \
b45f050a
JF
329 } STMT_END
330
331/*
332 * Calls SAVEDESTRUCTOR_X if needed, then Simple_vFAIL2().
333 */
334#define vFAIL2(m,a1) \
335 STMT_START { \
336 if (!SIZE_ONLY) \
830247a4 337 SAVEDESTRUCTOR_X(clear_re,(void*)RExC_rx); \
b45f050a
JF
338 Simple_vFAIL2(m, a1); \
339 } STMT_END
340
341
342/*
343 * Like Simple_vFAIL(), but accepts three arguments.
344 */
345#define Simple_vFAIL3(m, a1, a2) \
346 STMT_START { \
7a799e20 347 IV offset = RExC_parse - RExC_precomp; \
b45f050a
JF
348 \
349 S_re_croak2(aTHX_ m, REPORT_LOCATION, a1, a2, \
830247a4 350 (int)offset, RExC_precomp, RExC_precomp + offset); \
b45f050a
JF
351 } STMT_END
352
353/*
354 * Calls SAVEDESTRUCTOR_X if needed, then Simple_vFAIL3().
355 */
356#define vFAIL3(m,a1,a2) \
357 STMT_START { \
358 if (!SIZE_ONLY) \
830247a4 359 SAVEDESTRUCTOR_X(clear_re,(void*)RExC_rx); \
b45f050a
JF
360 Simple_vFAIL3(m, a1, a2); \
361 } STMT_END
362
363/*
364 * Like Simple_vFAIL(), but accepts four arguments.
365 */
366#define Simple_vFAIL4(m, a1, a2, a3) \
367 STMT_START { \
7a799e20 368 IV offset = RExC_parse - RExC_precomp; \
b45f050a
JF
369 \
370 S_re_croak2(aTHX_ m, REPORT_LOCATION, a1, a2, a3,\
830247a4 371 (int)offset, RExC_precomp, RExC_precomp + offset); \
b45f050a
JF
372 } STMT_END
373
374/*
375 * Like Simple_vFAIL(), but accepts five arguments.
376 */
377#define Simple_vFAIL5(m, a1, a2, a3, a4) \
378 STMT_START { \
7a799e20 379 IV offset = RExC_parse - RExC_precomp; \
b45f050a 380 S_re_croak2(aTHX_ m, REPORT_LOCATION, a1, a2, a3, a4,\
830247a4 381 (int)offset, RExC_precomp, RExC_precomp + offset); \
b45f050a
JF
382 } STMT_END
383
384
385#define vWARN(loc,m) \
386 STMT_START { \
7a799e20 387 IV offset = loc - RExC_precomp; \
9014280d 388 Perl_warner(aTHX_ packWARN(WARN_REGEXP), "%s" REPORT_LOCATION,\
830247a4 389 m, (int)offset, RExC_precomp, RExC_precomp + offset); \
b45f050a
JF
390 } STMT_END \
391
f9373011
PM
392#define vWARNdep(loc,m) \
393 STMT_START { \
7a799e20 394 IV offset = loc - RExC_precomp; \
9014280d 395 Perl_warner(aTHX_ packWARN2(WARN_DEPRECATED, WARN_REGEXP), "%s" REPORT_LOCATION,\
f9373011
PM
396 m, (int)offset, RExC_precomp, RExC_precomp + offset); \
397 } STMT_END \
398
b45f050a
JF
399
400#define vWARN2(loc, m, a1) \
401 STMT_START { \
7a799e20 402 IV offset = loc - RExC_precomp; \
9014280d 403 Perl_warner(aTHX_ packWARN(WARN_REGEXP), m REPORT_LOCATION,\
b45f050a 404 a1, \
830247a4 405 (int)offset, RExC_precomp, RExC_precomp + offset); \
b45f050a
JF
406 } STMT_END
407
408#define vWARN3(loc, m, a1, a2) \
409 STMT_START { \
7a799e20 410 IV offset = loc - RExC_precomp; \
9014280d 411 Perl_warner(aTHX_ packWARN(WARN_REGEXP), m REPORT_LOCATION, \
b45f050a 412 a1, a2, \
830247a4 413 (int)offset, RExC_precomp, RExC_precomp + offset); \
b45f050a
JF
414 } STMT_END
415
416#define vWARN4(loc, m, a1, a2, a3) \
417 STMT_START { \
7a799e20 418 IV offset = loc - RExC_precomp; \
9014280d 419 Perl_warner(aTHX_ packWARN(WARN_REGEXP), m REPORT_LOCATION,\
b45f050a 420 a1, a2, a3, \
830247a4 421 (int)offset, RExC_precomp, RExC_precomp + offset); \
8615cb43
JF
422 } STMT_END
423
9d1d55b5
JP
424/* used for the parse_flags section for (?c) -- japhy */
425#define vWARN5(loc, m, a1, a2, a3, a4) \
426 STMT_START { \
7a799e20 427 IV offset = loc - RExC_precomp; \
9014280d 428 Perl_warner(aTHX_ packWARN(WARN_REGEXP), m REPORT_LOCATION, \
9d1d55b5
JP
429 a1, a2, a3, a4, \
430 (int)offset, RExC_precomp, RExC_precomp + offset); \
431 } STMT_END
432
8615cb43 433
cd439c50 434/* Allow for side effects in s */
fd3f0ae2 435#define REGC(c,s) STMT_START { if (!SIZE_ONLY) *(s) = (c); else (void)(s);} STMT_END
cd439c50 436
fac92740
MJD
437/* Macros for recording node offsets. 20001227 mjd@plover.com
438 * Nodes are numbered 1, 2, 3, 4. Node #n's position is recorded in
439 * element 2*n-1 of the array. Element #2n holds the byte length node #n.
440 * Element 0 holds the number n.
441 */
442
443#define MJD_OFFSET_DEBUG(x)
444/* #define MJD_OFFSET_DEBUG(x) fprintf x */
445
446
447# define Set_Node_Offset_To_R(node,byte) \
448 STMT_START { \
449 if (! SIZE_ONLY) { \
450 if((node) < 0) { \
451 Perl_croak(aTHX_ "value of node is %d in Offset macro", node); \
452 } else { \
453 RExC_offsets[2*(node)-1] = (byte); \
454 } \
455 } \
456 } STMT_END
457
458# define Set_Node_Offset(node,byte) Set_Node_Offset_To_R((node)-RExC_emit_start, (byte)-RExC_start)
459# define Set_Cur_Node_Offset Set_Node_Offset(RExC_emit, RExC_parse)
460
461# define Set_Node_Length_To_R(node,len) \
462 STMT_START { \
463 if (! SIZE_ONLY) { \
464 MJD_OFFSET_DEBUG((stderr, "** (%d) size of node %d is %d.\n", __LINE__, (node), (len))); \
465 if((node) < 0) { \
466 Perl_croak(aTHX_ "value of node is %d in Length macro", node); \
467 } else { \
468 RExC_offsets[2*(node)] = (len); \
469 } \
470 } \
471 } STMT_END
472
473# define Set_Node_Length(node,len) Set_Node_Length_To_R((node)-RExC_emit_start, len)
474# define Set_Cur_Node_Length(len) Set_Node_Length(RExC_emit, len)
475# define Set_Node_Cur_Length(node) Set_Node_Length(node, RExC_parse - parse_start)
476
477/* Get offsets and lengths */
478#define Node_Offset(n) (RExC_offsets[2*((n)-RExC_emit_start)-1])
479#define Node_Length(n) (RExC_offsets[2*((n)-RExC_emit_start)])
480
acfe0abc 481static void clear_re(pTHX_ void *r);
4327152a 482
653099ff
GS
483/* Mark that we cannot extend a found fixed substring at this point.
484 Updata the longest found anchored substring and the longest found
485 floating substrings if needed. */
486
4327152a 487STATIC void
830247a4 488S_scan_commit(pTHX_ RExC_state_t *pRExC_state, scan_data_t *data)
c277df42 489{
a0ed51b3
LW
490 STRLEN l = CHR_SVLEN(data->last_found);
491 STRLEN old_l = CHR_SVLEN(*data->longest);
b81d288d 492
c277df42
IZ
493 if ((l >= old_l) && ((l > old_l) || (data->flags & SF_BEFORE_EOL))) {
494 sv_setsv(*data->longest, data->last_found);
495 if (*data->longest == data->longest_fixed) {
496 data->offset_fixed = l ? data->last_start_min : data->pos_min;
497 if (data->flags & SF_BEFORE_EOL)
b81d288d 498 data->flags
c277df42
IZ
499 |= ((data->flags & SF_BEFORE_EOL) << SF_FIX_SHIFT_EOL);
500 else
501 data->flags &= ~SF_FIX_BEFORE_EOL;
a0ed51b3
LW
502 }
503 else {
c277df42 504 data->offset_float_min = l ? data->last_start_min : data->pos_min;
b81d288d
AB
505 data->offset_float_max = (l
506 ? data->last_start_max
c277df42 507 : data->pos_min + data->pos_delta);
9051bda5
HS
508 if ((U32)data->offset_float_max > (U32)I32_MAX)
509 data->offset_float_max = I32_MAX;
c277df42 510 if (data->flags & SF_BEFORE_EOL)
b81d288d 511 data->flags
c277df42
IZ
512 |= ((data->flags & SF_BEFORE_EOL) << SF_FL_SHIFT_EOL);
513 else
514 data->flags &= ~SF_FL_BEFORE_EOL;
515 }
516 }
517 SvCUR_set(data->last_found, 0);
518 data->last_end = -1;
519 data->flags &= ~SF_BEFORE_EOL;
520}
521
653099ff
GS
522/* Can match anything (initialization) */
523STATIC void
830247a4 524S_cl_anything(pTHX_ RExC_state_t *pRExC_state, struct regnode_charclass_class *cl)
653099ff 525{
653099ff 526 ANYOF_CLASS_ZERO(cl);
f8bef550 527 ANYOF_BITMAP_SETALL(cl);
1aa99e6b 528 cl->flags = ANYOF_EOS|ANYOF_UNICODE_ALL;
653099ff
GS
529 if (LOC)
530 cl->flags |= ANYOF_LOCALE;
531}
532
533/* Can match anything (initialization) */
534STATIC int
535S_cl_is_anything(pTHX_ struct regnode_charclass_class *cl)
536{
537 int value;
538
aaa51d5e 539 for (value = 0; value <= ANYOF_MAX; value += 2)
653099ff
GS
540 if (ANYOF_CLASS_TEST(cl, value) && ANYOF_CLASS_TEST(cl, value + 1))
541 return 1;
1aa99e6b
IH
542 if (!(cl->flags & ANYOF_UNICODE_ALL))
543 return 0;
f8bef550
NC
544 if (!ANYOF_BITMAP_TESTALLSET(cl))
545 return 0;
653099ff
GS
546 return 1;
547}
548
549/* Can match anything (initialization) */
550STATIC void
830247a4 551S_cl_init(pTHX_ RExC_state_t *pRExC_state, struct regnode_charclass_class *cl)
653099ff 552{
8ecf7187 553 Zero(cl, 1, struct regnode_charclass_class);
653099ff 554 cl->type = ANYOF;
830247a4 555 cl_anything(pRExC_state, cl);
653099ff
GS
556}
557
558STATIC void
830247a4 559S_cl_init_zero(pTHX_ RExC_state_t *pRExC_state, struct regnode_charclass_class *cl)
653099ff 560{
8ecf7187 561 Zero(cl, 1, struct regnode_charclass_class);
653099ff 562 cl->type = ANYOF;
830247a4 563 cl_anything(pRExC_state, cl);
653099ff
GS
564 if (LOC)
565 cl->flags |= ANYOF_LOCALE;
566}
567
568/* 'And' a given class with another one. Can create false positives */
569/* We assume that cl is not inverted */
570STATIC void
571S_cl_and(pTHX_ struct regnode_charclass_class *cl,
572 struct regnode_charclass_class *and_with)
573{
653099ff
GS
574 if (!(and_with->flags & ANYOF_CLASS)
575 && !(cl->flags & ANYOF_CLASS)
576 && (and_with->flags & ANYOF_LOCALE) == (cl->flags & ANYOF_LOCALE)
577 && !(and_with->flags & ANYOF_FOLD)
578 && !(cl->flags & ANYOF_FOLD)) {
579 int i;
580
581 if (and_with->flags & ANYOF_INVERT)
582 for (i = 0; i < ANYOF_BITMAP_SIZE; i++)
583 cl->bitmap[i] &= ~and_with->bitmap[i];
584 else
585 for (i = 0; i < ANYOF_BITMAP_SIZE; i++)
586 cl->bitmap[i] &= and_with->bitmap[i];
587 } /* XXXX: logic is complicated otherwise, leave it along for a moment. */
588 if (!(and_with->flags & ANYOF_EOS))
589 cl->flags &= ~ANYOF_EOS;
1aa99e6b
IH
590
591 if (cl->flags & ANYOF_UNICODE_ALL && and_with->flags & ANYOF_UNICODE) {
592 cl->flags &= ~ANYOF_UNICODE_ALL;
593 cl->flags |= ANYOF_UNICODE;
594 ARG_SET(cl, ARG(and_with));
595 }
596 if (!(and_with->flags & ANYOF_UNICODE_ALL))
597 cl->flags &= ~ANYOF_UNICODE_ALL;
598 if (!(and_with->flags & (ANYOF_UNICODE|ANYOF_UNICODE_ALL)))
599 cl->flags &= ~ANYOF_UNICODE;
653099ff
GS
600}
601
602/* 'OR' a given class with another one. Can create false positives */
603/* We assume that cl is not inverted */
604STATIC void
830247a4 605S_cl_or(pTHX_ RExC_state_t *pRExC_state, struct regnode_charclass_class *cl, struct regnode_charclass_class *or_with)
653099ff 606{
653099ff
GS
607 if (or_with->flags & ANYOF_INVERT) {
608 /* We do not use
609 * (B1 | CL1) | (!B2 & !CL2) = (B1 | !B2 & !CL2) | (CL1 | (!B2 & !CL2))
610 * <= (B1 | !B2) | (CL1 | !CL2)
611 * which is wasteful if CL2 is small, but we ignore CL2:
612 * (B1 | CL1) | (!B2 & !CL2) <= (B1 | CL1) | !B2 = (B1 | !B2) | CL1
613 * XXXX Can we handle case-fold? Unclear:
614 * (OK1(i) | OK1(i')) | !(OK1(i) | OK1(i')) =
615 * (OK1(i) | OK1(i')) | (!OK1(i) & !OK1(i'))
616 */
617 if ( (or_with->flags & ANYOF_LOCALE) == (cl->flags & ANYOF_LOCALE)
618 && !(or_with->flags & ANYOF_FOLD)
619 && !(cl->flags & ANYOF_FOLD) ) {
620 int i;
621
622 for (i = 0; i < ANYOF_BITMAP_SIZE; i++)
623 cl->bitmap[i] |= ~or_with->bitmap[i];
624 } /* XXXX: logic is complicated otherwise */
625 else {
830247a4 626 cl_anything(pRExC_state, cl);
653099ff
GS
627 }
628 } else {
629 /* (B1 | CL1) | (B2 | CL2) = (B1 | B2) | (CL1 | CL2)) */
630 if ( (or_with->flags & ANYOF_LOCALE) == (cl->flags & ANYOF_LOCALE)
b81d288d 631 && (!(or_with->flags & ANYOF_FOLD)
653099ff
GS
632 || (cl->flags & ANYOF_FOLD)) ) {
633 int i;
634
635 /* OR char bitmap and class bitmap separately */
636 for (i = 0; i < ANYOF_BITMAP_SIZE; i++)
637 cl->bitmap[i] |= or_with->bitmap[i];
638 if (or_with->flags & ANYOF_CLASS) {
639 for (i = 0; i < ANYOF_CLASSBITMAP_SIZE; i++)
640 cl->classflags[i] |= or_with->classflags[i];
641 cl->flags |= ANYOF_CLASS;
642 }
643 }
644 else { /* XXXX: logic is complicated, leave it along for a moment. */
830247a4 645 cl_anything(pRExC_state, cl);
653099ff
GS
646 }
647 }
648 if (or_with->flags & ANYOF_EOS)
649 cl->flags |= ANYOF_EOS;
1aa99e6b
IH
650
651 if (cl->flags & ANYOF_UNICODE && or_with->flags & ANYOF_UNICODE &&
652 ARG(cl) != ARG(or_with)) {
653 cl->flags |= ANYOF_UNICODE_ALL;
654 cl->flags &= ~ANYOF_UNICODE;
655 }
656 if (or_with->flags & ANYOF_UNICODE_ALL) {
657 cl->flags |= ANYOF_UNICODE_ALL;
658 cl->flags &= ~ANYOF_UNICODE;
659 }
653099ff
GS
660}
661
5d1c421c
JH
662/*
663 * There are strange code-generation bugs caused on sparc64 by gcc-2.95.2.
664 * These need to be revisited when a newer toolchain becomes available.
665 */
666#if defined(__sparc64__) && defined(__GNUC__)
667# if __GNUC__ < 2 || (__GNUC__ == 2 && __GNUC_MINOR__ < 96)
668# undef SPARC64_GCC_WORKAROUND
669# define SPARC64_GCC_WORKAROUND 1
670# endif
671#endif
672
653099ff
GS
673/* REx optimizer. Converts nodes into quickier variants "in place".
674 Finds fixed substrings. */
675
c277df42
IZ
676/* Stops at toplevel WHILEM as well as at `last'. At end *scanp is set
677 to the position after last scanned or to NULL. */
678
76e3520e 679STATIC I32
830247a4 680S_study_chunk(pTHX_ RExC_state_t *pRExC_state, regnode **scanp, I32 *deltap, regnode *last, scan_data_t *data, U32 flags)
c277df42
IZ
681 /* scanp: Start here (read-write). */
682 /* deltap: Write maxlen-minlen here. */
683 /* last: Stop before this one. */
684{
685 I32 min = 0, pars = 0, code;
686 regnode *scan = *scanp, *next;
687 I32 delta = 0;
688 int is_inf = (flags & SCF_DO_SUBSTR) && (data->flags & SF_IS_INF);
aca2d497 689 int is_inf_internal = 0; /* The studied chunk is infinite */
c277df42
IZ
690 I32 is_par = OP(scan) == OPEN ? ARG(scan) : 0;
691 scan_data_t data_fake;
653099ff 692 struct regnode_charclass_class and_with; /* Valid if flags & SCF_DO_STCLASS_OR */
b81d288d 693
c277df42
IZ
694 while (scan && OP(scan) != END && scan < last) {
695 /* Peephole optimizer: */
696
22c35a8c 697 if (PL_regkind[(U8)OP(scan)] == EXACT) {
653099ff 698 /* Merge several consecutive EXACTish nodes into one. */
c277df42
IZ
699 regnode *n = regnext(scan);
700 U32 stringok = 1;
701#ifdef DEBUGGING
702 regnode *stop = scan;
b81d288d 703#endif
c277df42 704
cd439c50 705 next = scan + NODE_SZ_STR(scan);
c277df42
IZ
706 /* Skip NOTHING, merge EXACT*. */
707 while (n &&
b81d288d 708 ( PL_regkind[(U8)OP(n)] == NOTHING ||
c277df42
IZ
709 (stringok && (OP(n) == OP(scan))))
710 && NEXT_OFF(n)
711 && NEXT_OFF(scan) + NEXT_OFF(n) < I16_MAX) {
712 if (OP(n) == TAIL || n > next)
713 stringok = 0;
22c35a8c 714 if (PL_regkind[(U8)OP(n)] == NOTHING) {
c277df42
IZ
715 NEXT_OFF(scan) += NEXT_OFF(n);
716 next = n + NODE_STEP_REGNODE;
717#ifdef DEBUGGING
718 if (stringok)
719 stop = n;
b81d288d 720#endif
c277df42 721 n = regnext(n);
a0ed51b3 722 }
f49d4d0f 723 else if (stringok) {
cd439c50 724 int oldl = STR_LEN(scan);
c277df42 725 regnode *nnext = regnext(n);
f49d4d0f 726
b81d288d 727 if (oldl + STR_LEN(n) > U8_MAX)
c277df42
IZ
728 break;
729 NEXT_OFF(scan) += NEXT_OFF(n);
cd439c50
IZ
730 STR_LEN(scan) += STR_LEN(n);
731 next = n + NODE_SZ_STR(n);
c277df42 732 /* Now we can overwrite *n : */
f49d4d0f 733 Move(STRING(n), STRING(scan) + oldl, STR_LEN(n), char);
c277df42 734#ifdef DEBUGGING
f49d4d0f 735 stop = next - 1;
b81d288d 736#endif
c277df42
IZ
737 n = nnext;
738 }
739 }
61a36c01 740
d65e4eab 741 if (UTF && OP(scan) == EXACTF && STR_LEN(scan) >= 6) {
61a36c01
JH
742/*
743 Two problematic code points in Unicode casefolding of EXACT nodes:
744
745 U+0390 - GREEK SMALL LETTER IOTA WITH DIALYTIKA AND TONOS
746 U+03B0 - GREEK SMALL LETTER UPSILON WITH DIALYTIKA AND TONOS
747
748 which casefold to
749
750 Unicode UTF-8
751
752 U+03B9 U+0308 U+0301 0xCE 0xB9 0xCC 0x88 0xCC 0x81
753 U+03C5 U+0308 U+0301 0xCF 0x85 0xCC 0x88 0xCC 0x81
754
755 This means that in case-insensitive matching (or "loose matching",
756 as Unicode calls it), an EXACTF of length six (the UTF-8 encoded byte
757 length of the above casefolded versions) can match a target string
758 of length two (the byte length of UTF-8 encoded U+0390 or U+03B0).
759 This would rather mess up the minimum length computation.
760
761 What we'll do is to look for the tail four bytes, and then peek
762 at the preceding two bytes to see whether we need to decrease
763 the minimum length by four (six minus two).
764
765 Thanks to the design of UTF-8, there cannot be false matches:
766 A sequence of valid UTF-8 bytes cannot be a subsequence of
767 another valid sequence of UTF-8 bytes.
768
769*/
770 char *s0 = STRING(scan), *s, *t;
771 char *s1 = s0 + STR_LEN(scan) - 1, *s2 = s1 - 4;
772 char *t0 = "\xcc\x88\xcc\x81";
773 char *t1 = t0 + 3;
774
775 for (s = s0 + 2;
776 s < s2 && (t = ninstr(s, s1, t0, t1));
777 s = t + 4) {
778 if (((U8)t[-1] == 0xB9 && (U8)t[-2] == 0xCE) ||
779 ((U8)t[-1] == 0x85 && (U8)t[-2] == 0xCF))
780 min -= 4;
781 }
782 }
783
c277df42
IZ
784#ifdef DEBUGGING
785 /* Allow dumping */
cd439c50 786 n = scan + NODE_SZ_STR(scan);
c277df42 787 while (n <= stop) {
22c35a8c 788 if (PL_regkind[(U8)OP(n)] != NOTHING || OP(n) == NOTHING) {
c277df42
IZ
789 OP(n) = OPTIMIZED;
790 NEXT_OFF(n) = 0;
791 }
792 n++;
793 }
653099ff 794#endif
c277df42 795 }
653099ff
GS
796 /* Follow the next-chain of the current node and optimize
797 away all the NOTHINGs from it. */
c277df42 798 if (OP(scan) != CURLYX) {
048cfca1
GS
799 int max = (reg_off_by_arg[OP(scan)]
800 ? I32_MAX
801 /* I32 may be smaller than U16 on CRAYs! */
802 : (I32_MAX < U16_MAX ? I32_MAX : U16_MAX));
c277df42
IZ
803 int off = (reg_off_by_arg[OP(scan)] ? ARG(scan) : NEXT_OFF(scan));
804 int noff;
805 regnode *n = scan;
b81d288d 806
c277df42
IZ
807 /* Skip NOTHING and LONGJMP. */
808 while ((n = regnext(n))
22c35a8c 809 && ((PL_regkind[(U8)OP(n)] == NOTHING && (noff = NEXT_OFF(n)))
c277df42
IZ
810 || ((OP(n) == LONGJMP) && (noff = ARG(n))))
811 && off + noff < max)
812 off += noff;
813 if (reg_off_by_arg[OP(scan)])
814 ARG(scan) = off;
b81d288d 815 else
c277df42
IZ
816 NEXT_OFF(scan) = off;
817 }
653099ff
GS
818 /* The principal pseudo-switch. Cannot be a switch, since we
819 look into several different things. */
b81d288d 820 if (OP(scan) == BRANCH || OP(scan) == BRANCHJ
c277df42
IZ
821 || OP(scan) == IFTHEN || OP(scan) == SUSPEND) {
822 next = regnext(scan);
823 code = OP(scan);
b81d288d
AB
824
825 if (OP(next) == code || code == IFTHEN || code == SUSPEND) {
c277df42 826 I32 max1 = 0, min1 = I32_MAX, num = 0;
653099ff 827 struct regnode_charclass_class accum;
c277df42 828
653099ff 829 if (flags & SCF_DO_SUBSTR) /* XXXX Add !SUSPEND? */
830247a4 830 scan_commit(pRExC_state, data); /* Cannot merge strings after this. */
653099ff 831 if (flags & SCF_DO_STCLASS)
830247a4 832 cl_init_zero(pRExC_state, &accum);
c277df42 833 while (OP(scan) == code) {
830247a4 834 I32 deltanext, minnext, f = 0, fake;
653099ff 835 struct regnode_charclass_class this_class;
c277df42
IZ
836
837 num++;
838 data_fake.flags = 0;
b81d288d 839 if (data) {
2c2d71f5 840 data_fake.whilem_c = data->whilem_c;
cb434fcc
IZ
841 data_fake.last_closep = data->last_closep;
842 }
843 else
844 data_fake.last_closep = &fake;
c277df42
IZ
845 next = regnext(scan);
846 scan = NEXTOPER(scan);
847 if (code != BRANCH)
848 scan = NEXTOPER(scan);
653099ff 849 if (flags & SCF_DO_STCLASS) {
830247a4 850 cl_init(pRExC_state, &this_class);
653099ff
GS
851 data_fake.start_class = &this_class;
852 f = SCF_DO_STCLASS_AND;
b81d288d 853 }
e1901655
IZ
854 if (flags & SCF_WHILEM_VISITED_POS)
855 f |= SCF_WHILEM_VISITED_POS;
653099ff 856 /* we suppose the run is continuous, last=next...*/
830247a4
IZ
857 minnext = study_chunk(pRExC_state, &scan, &deltanext,
858 next, &data_fake, f);
b81d288d 859 if (min1 > minnext)
c277df42
IZ
860 min1 = minnext;
861 if (max1 < minnext + deltanext)
862 max1 = minnext + deltanext;
863 if (deltanext == I32_MAX)
aca2d497 864 is_inf = is_inf_internal = 1;
c277df42
IZ
865 scan = next;
866 if (data_fake.flags & (SF_HAS_PAR|SF_IN_PAR))
867 pars++;
405ff068 868 if (data && (data_fake.flags & SF_HAS_EVAL))
c277df42 869 data->flags |= SF_HAS_EVAL;
2c2d71f5
JH
870 if (data)
871 data->whilem_c = data_fake.whilem_c;
653099ff 872 if (flags & SCF_DO_STCLASS)
830247a4 873 cl_or(pRExC_state, &accum, &this_class);
b81d288d 874 if (code == SUSPEND)
c277df42
IZ
875 break;
876 }
877 if (code == IFTHEN && num < 2) /* Empty ELSE branch */
878 min1 = 0;
879 if (flags & SCF_DO_SUBSTR) {
880 data->pos_min += min1;
881 data->pos_delta += max1 - min1;
882 if (max1 != min1 || is_inf)
883 data->longest = &(data->longest_float);
884 }
885 min += min1;
886 delta += max1 - min1;
653099ff 887 if (flags & SCF_DO_STCLASS_OR) {
830247a4 888 cl_or(pRExC_state, data->start_class, &accum);
653099ff
GS
889 if (min1) {
890 cl_and(data->start_class, &and_with);
891 flags &= ~SCF_DO_STCLASS;
892 }
893 }
894 else if (flags & SCF_DO_STCLASS_AND) {
de0c8cb8
GS
895 if (min1) {
896 cl_and(data->start_class, &accum);
653099ff 897 flags &= ~SCF_DO_STCLASS;
de0c8cb8
GS
898 }
899 else {
b81d288d 900 /* Switch to OR mode: cache the old value of
de0c8cb8
GS
901 * data->start_class */
902 StructCopy(data->start_class, &and_with,
903 struct regnode_charclass_class);
904 flags &= ~SCF_DO_STCLASS_AND;
905 StructCopy(&accum, data->start_class,
906 struct regnode_charclass_class);
907 flags |= SCF_DO_STCLASS_OR;
908 data->start_class->flags |= ANYOF_EOS;
909 }
653099ff 910 }
a0ed51b3
LW
911 }
912 else if (code == BRANCHJ) /* single branch is optimized. */
c277df42
IZ
913 scan = NEXTOPER(NEXTOPER(scan));
914 else /* single branch is optimized. */
915 scan = NEXTOPER(scan);
916 continue;
a0ed51b3
LW
917 }
918 else if (OP(scan) == EXACT) {
cd439c50 919 I32 l = STR_LEN(scan);
1aa99e6b 920 UV uc = *((U8*)STRING(scan));
a0ed51b3 921 if (UTF) {
1aa99e6b
IH
922 U8 *s = (U8*)STRING(scan);
923 l = utf8_length(s, s + l);
9041c2e3 924 uc = utf8_to_uvchr(s, NULL);
a0ed51b3
LW
925 }
926 min += l;
c277df42 927 if (flags & SCF_DO_SUBSTR) { /* Update longest substr. */
c277df42
IZ
928 /* The code below prefers earlier match for fixed
929 offset, later match for variable offset. */
930 if (data->last_end == -1) { /* Update the start info. */
931 data->last_start_min = data->pos_min;
932 data->last_start_max = is_inf
b81d288d 933 ? I32_MAX : data->pos_min + data->pos_delta;
c277df42 934 }
cd439c50 935 sv_catpvn(data->last_found, STRING(scan), STR_LEN(scan));
33b8afdf
JH
936 if (UTF)
937 SvUTF8_on(data->last_found);
c277df42
IZ
938 data->last_end = data->pos_min + l;
939 data->pos_min += l; /* As in the first entry. */
940 data->flags &= ~SF_BEFORE_EOL;
941 }
653099ff
GS
942 if (flags & SCF_DO_STCLASS_AND) {
943 /* Check whether it is compatible with what we know already! */
944 int compat = 1;
945
1aa99e6b 946 if (uc >= 0x100 ||
516a5887 947 (!(data->start_class->flags & (ANYOF_CLASS | ANYOF_LOCALE))
1aa99e6b 948 && !ANYOF_BITMAP_TEST(data->start_class, uc)
653099ff 949 && (!(data->start_class->flags & ANYOF_FOLD)
1aa99e6b 950 || !ANYOF_BITMAP_TEST(data->start_class, PL_fold[uc])))
516a5887 951 )
653099ff
GS
952 compat = 0;
953 ANYOF_CLASS_ZERO(data->start_class);
954 ANYOF_BITMAP_ZERO(data->start_class);
955 if (compat)
1aa99e6b 956 ANYOF_BITMAP_SET(data->start_class, uc);
653099ff 957 data->start_class->flags &= ~ANYOF_EOS;
9b877dbb
IH
958 if (uc < 0x100)
959 data->start_class->flags &= ~ANYOF_UNICODE_ALL;
653099ff
GS
960 }
961 else if (flags & SCF_DO_STCLASS_OR) {
962 /* false positive possible if the class is case-folded */
1aa99e6b 963 if (uc < 0x100)
9b877dbb
IH
964 ANYOF_BITMAP_SET(data->start_class, uc);
965 else
966 data->start_class->flags |= ANYOF_UNICODE_ALL;
653099ff
GS
967 data->start_class->flags &= ~ANYOF_EOS;
968 cl_and(data->start_class, &and_with);
969 }
970 flags &= ~SCF_DO_STCLASS;
a0ed51b3 971 }
653099ff 972 else if (PL_regkind[(U8)OP(scan)] == EXACT) { /* But OP != EXACT! */
cd439c50 973 I32 l = STR_LEN(scan);
1aa99e6b 974 UV uc = *((U8*)STRING(scan));
653099ff
GS
975
976 /* Search for fixed substrings supports EXACT only. */
b81d288d 977 if (flags & SCF_DO_SUBSTR)
830247a4 978 scan_commit(pRExC_state, data);
a0ed51b3 979 if (UTF) {
1aa99e6b
IH
980 U8 *s = (U8 *)STRING(scan);
981 l = utf8_length(s, s + l);
9041c2e3 982 uc = utf8_to_uvchr(s, NULL);
a0ed51b3
LW
983 }
984 min += l;
c277df42 985 if (data && (flags & SCF_DO_SUBSTR))
a0ed51b3 986 data->pos_min += l;
653099ff
GS
987 if (flags & SCF_DO_STCLASS_AND) {
988 /* Check whether it is compatible with what we know already! */
989 int compat = 1;
990
1aa99e6b 991 if (uc >= 0x100 ||
516a5887 992 (!(data->start_class->flags & (ANYOF_CLASS | ANYOF_LOCALE))
1aa99e6b 993 && !ANYOF_BITMAP_TEST(data->start_class, uc)
516a5887 994 && !ANYOF_BITMAP_TEST(data->start_class, PL_fold[uc])))
653099ff
GS
995 compat = 0;
996 ANYOF_CLASS_ZERO(data->start_class);
997 ANYOF_BITMAP_ZERO(data->start_class);
998 if (compat) {
1aa99e6b 999 ANYOF_BITMAP_SET(data->start_class, uc);
653099ff
GS
1000 data->start_class->flags &= ~ANYOF_EOS;
1001 data->start_class->flags |= ANYOF_FOLD;
1002 if (OP(scan) == EXACTFL)
1003 data->start_class->flags |= ANYOF_LOCALE;
1004 }
1005 }
1006 else if (flags & SCF_DO_STCLASS_OR) {
1007 if (data->start_class->flags & ANYOF_FOLD) {
1008 /* false positive possible if the class is case-folded.
1009 Assume that the locale settings are the same... */
1aa99e6b
IH
1010 if (uc < 0x100)
1011 ANYOF_BITMAP_SET(data->start_class, uc);
653099ff
GS
1012 data->start_class->flags &= ~ANYOF_EOS;
1013 }
1014 cl_and(data->start_class, &and_with);
1015 }
1016 flags &= ~SCF_DO_STCLASS;
a0ed51b3 1017 }
4d61ec05 1018 else if (strchr((char*)PL_varies,OP(scan))) {
9c5ffd7c 1019 I32 mincount, maxcount, minnext, deltanext, fl = 0;
aa7a4b56 1020 I32 f = flags, pos_before = 0;
c277df42 1021 regnode *oscan = scan;
653099ff
GS
1022 struct regnode_charclass_class this_class;
1023 struct regnode_charclass_class *oclass = NULL;
727f22e3 1024 I32 next_is_eval = 0;
653099ff 1025
22c35a8c 1026 switch (PL_regkind[(U8)OP(scan)]) {
653099ff 1027 case WHILEM: /* End of (?:...)* . */
c277df42
IZ
1028 scan = NEXTOPER(scan);
1029 goto finish;
1030 case PLUS:
653099ff 1031 if (flags & (SCF_DO_SUBSTR | SCF_DO_STCLASS)) {
c277df42 1032 next = NEXTOPER(scan);
653099ff 1033 if (OP(next) == EXACT || (flags & SCF_DO_STCLASS)) {
b81d288d
AB
1034 mincount = 1;
1035 maxcount = REG_INFTY;
c277df42
IZ
1036 next = regnext(scan);
1037 scan = NEXTOPER(scan);
1038 goto do_curly;
1039 }
1040 }
1041 if (flags & SCF_DO_SUBSTR)
1042 data->pos_min++;
1043 min++;
1044 /* Fall through. */
1045 case STAR:
653099ff
GS
1046 if (flags & SCF_DO_STCLASS) {
1047 mincount = 0;
b81d288d 1048 maxcount = REG_INFTY;
653099ff
GS
1049 next = regnext(scan);
1050 scan = NEXTOPER(scan);
1051 goto do_curly;
1052 }
b81d288d 1053 is_inf = is_inf_internal = 1;
c277df42
IZ
1054 scan = regnext(scan);
1055 if (flags & SCF_DO_SUBSTR) {
830247a4 1056 scan_commit(pRExC_state, data); /* Cannot extend fixed substrings */
c277df42
IZ
1057 data->longest = &(data->longest_float);
1058 }
1059 goto optimize_curly_tail;
1060 case CURLY:
b81d288d 1061 mincount = ARG1(scan);
c277df42
IZ
1062 maxcount = ARG2(scan);
1063 next = regnext(scan);
cb434fcc
IZ
1064 if (OP(scan) == CURLYX) {
1065 I32 lp = (data ? *(data->last_closep) : 0);
1066
1067 scan->flags = ((lp <= U8_MAX) ? lp : U8_MAX);
1068 }
c277df42 1069 scan = NEXTOPER(scan) + EXTRA_STEP_2ARGS;
727f22e3 1070 next_is_eval = (OP(scan) == EVAL);
c277df42
IZ
1071 do_curly:
1072 if (flags & SCF_DO_SUBSTR) {
830247a4 1073 if (mincount == 0) scan_commit(pRExC_state,data); /* Cannot extend fixed substrings */
c277df42
IZ
1074 pos_before = data->pos_min;
1075 }
1076 if (data) {
1077 fl = data->flags;
1078 data->flags &= ~(SF_HAS_PAR|SF_IN_PAR|SF_HAS_EVAL);
1079 if (is_inf)
1080 data->flags |= SF_IS_INF;
1081 }
653099ff 1082 if (flags & SCF_DO_STCLASS) {
830247a4 1083 cl_init(pRExC_state, &this_class);
653099ff
GS
1084 oclass = data->start_class;
1085 data->start_class = &this_class;
1086 f |= SCF_DO_STCLASS_AND;
1087 f &= ~SCF_DO_STCLASS_OR;
1088 }
e1901655
IZ
1089 /* These are the cases when once a subexpression
1090 fails at a particular position, it cannot succeed
1091 even after backtracking at the enclosing scope.
b81d288d 1092
e1901655
IZ
1093 XXXX what if minimal match and we are at the
1094 initial run of {n,m}? */
1095 if ((mincount != maxcount - 1) && (maxcount != REG_INFTY))
1096 f &= ~SCF_WHILEM_VISITED_POS;
653099ff 1097
c277df42 1098 /* This will finish on WHILEM, setting scan, or on NULL: */
b81d288d
AB
1099 minnext = study_chunk(pRExC_state, &scan, &deltanext, last, data,
1100 mincount == 0
653099ff
GS
1101 ? (f & ~SCF_DO_SUBSTR) : f);
1102
1103 if (flags & SCF_DO_STCLASS)
1104 data->start_class = oclass;
1105 if (mincount == 0 || minnext == 0) {
1106 if (flags & SCF_DO_STCLASS_OR) {
830247a4 1107 cl_or(pRExC_state, data->start_class, &this_class);
653099ff
GS
1108 }
1109 else if (flags & SCF_DO_STCLASS_AND) {
b81d288d 1110 /* Switch to OR mode: cache the old value of
653099ff
GS
1111 * data->start_class */
1112 StructCopy(data->start_class, &and_with,
1113 struct regnode_charclass_class);
1114 flags &= ~SCF_DO_STCLASS_AND;
1115 StructCopy(&this_class, data->start_class,
1116 struct regnode_charclass_class);
1117 flags |= SCF_DO_STCLASS_OR;
1118 data->start_class->flags |= ANYOF_EOS;
1119 }
1120 } else { /* Non-zero len */
1121 if (flags & SCF_DO_STCLASS_OR) {
830247a4 1122 cl_or(pRExC_state, data->start_class, &this_class);
653099ff
GS
1123 cl_and(data->start_class, &and_with);
1124 }
1125 else if (flags & SCF_DO_STCLASS_AND)
1126 cl_and(data->start_class, &this_class);
1127 flags &= ~SCF_DO_STCLASS;
1128 }
c277df42
IZ
1129 if (!scan) /* It was not CURLYX, but CURLY. */
1130 scan = next;
84037bb0 1131 if (ckWARN(WARN_REGEXP)
727f22e3
JP
1132 /* ? quantifier ok, except for (?{ ... }) */
1133 && (next_is_eval || !(mincount == 0 && maxcount == 1))
84037bb0 1134 && (minnext == 0) && (deltanext == 0)
99799961 1135 && data && !(data->flags & (SF_HAS_PAR|SF_IN_PAR))
17feb5d5 1136 && maxcount <= REG_INFTY/3) /* Complement check for big count */
b45f050a 1137 {
830247a4 1138 vWARN(RExC_parse,
b45f050a
JF
1139 "Quantifier unexpected on zero-length expression");
1140 }
1141
c277df42 1142 min += minnext * mincount;
b81d288d 1143 is_inf_internal |= ((maxcount == REG_INFTY
155aba94
GS
1144 && (minnext + deltanext) > 0)
1145 || deltanext == I32_MAX);
aca2d497 1146 is_inf |= is_inf_internal;
c277df42
IZ
1147 delta += (minnext + deltanext) * maxcount - minnext * mincount;
1148
1149 /* Try powerful optimization CURLYX => CURLYN. */
b81d288d 1150 if ( OP(oscan) == CURLYX && data
c277df42
IZ
1151 && data->flags & SF_IN_PAR
1152 && !(data->flags & SF_HAS_EVAL)
1153 && !deltanext && minnext == 1 ) {
1154 /* Try to optimize to CURLYN. */
1155 regnode *nxt = NEXTOPER(oscan) + EXTRA_STEP_2ARGS;
497b47a8
JH
1156 regnode *nxt1 = nxt;
1157#ifdef DEBUGGING
1158 regnode *nxt2;
1159#endif
c277df42
IZ
1160
1161 /* Skip open. */
1162 nxt = regnext(nxt);
4d61ec05 1163 if (!strchr((char*)PL_simple,OP(nxt))
22c35a8c 1164 && !(PL_regkind[(U8)OP(nxt)] == EXACT
b81d288d 1165 && STR_LEN(nxt) == 1))
c277df42 1166 goto nogo;
497b47a8 1167#ifdef DEBUGGING
c277df42 1168 nxt2 = nxt;
497b47a8 1169#endif
c277df42 1170 nxt = regnext(nxt);
b81d288d 1171 if (OP(nxt) != CLOSE)
c277df42
IZ
1172 goto nogo;
1173 /* Now we know that nxt2 is the only contents: */
eb160463 1174 oscan->flags = (U8)ARG(nxt);
c277df42
IZ
1175 OP(oscan) = CURLYN;
1176 OP(nxt1) = NOTHING; /* was OPEN. */
1177#ifdef DEBUGGING
1178 OP(nxt1 + 1) = OPTIMIZED; /* was count. */
1179 NEXT_OFF(nxt1+ 1) = 0; /* just for consistancy. */
1180 NEXT_OFF(nxt2) = 0; /* just for consistancy with CURLY. */
1181 OP(nxt) = OPTIMIZED; /* was CLOSE. */
1182 OP(nxt + 1) = OPTIMIZED; /* was count. */
1183 NEXT_OFF(nxt+ 1) = 0; /* just for consistancy. */
b81d288d 1184#endif
c277df42 1185 }
c277df42
IZ
1186 nogo:
1187
1188 /* Try optimization CURLYX => CURLYM. */
b81d288d 1189 if ( OP(oscan) == CURLYX && data
c277df42 1190 && !(data->flags & SF_HAS_PAR)
c277df42
IZ
1191 && !(data->flags & SF_HAS_EVAL)
1192 && !deltanext ) {
1193 /* XXXX How to optimize if data == 0? */
1194 /* Optimize to a simpler form. */
1195 regnode *nxt = NEXTOPER(oscan) + EXTRA_STEP_2ARGS; /* OPEN */
1196 regnode *nxt2;
1197
1198 OP(oscan) = CURLYM;
1199 while ( (nxt2 = regnext(nxt)) /* skip over embedded stuff*/
b81d288d 1200 && (OP(nxt2) != WHILEM))
c277df42
IZ
1201 nxt = nxt2;
1202 OP(nxt2) = SUCCEED; /* Whas WHILEM */
c277df42
IZ
1203 /* Need to optimize away parenths. */
1204 if (data->flags & SF_IN_PAR) {
1205 /* Set the parenth number. */
1206 regnode *nxt1 = NEXTOPER(oscan) + EXTRA_STEP_2ARGS; /* OPEN*/
1207
b81d288d 1208 if (OP(nxt) != CLOSE)
b45f050a 1209 FAIL("Panic opt close");
eb160463 1210 oscan->flags = (U8)ARG(nxt);
c277df42
IZ
1211 OP(nxt1) = OPTIMIZED; /* was OPEN. */
1212 OP(nxt) = OPTIMIZED; /* was CLOSE. */
1213#ifdef DEBUGGING
1214 OP(nxt1 + 1) = OPTIMIZED; /* was count. */
1215 OP(nxt + 1) = OPTIMIZED; /* was count. */
1216 NEXT_OFF(nxt1 + 1) = 0; /* just for consistancy. */
1217 NEXT_OFF(nxt + 1) = 0; /* just for consistancy. */
b81d288d 1218#endif
c277df42
IZ
1219#if 0
1220 while ( nxt1 && (OP(nxt1) != WHILEM)) {
1221 regnode *nnxt = regnext(nxt1);
b81d288d 1222
c277df42
IZ
1223 if (nnxt == nxt) {
1224 if (reg_off_by_arg[OP(nxt1)])
1225 ARG_SET(nxt1, nxt2 - nxt1);
1226 else if (nxt2 - nxt1 < U16_MAX)
1227 NEXT_OFF(nxt1) = nxt2 - nxt1;
1228 else
1229 OP(nxt) = NOTHING; /* Cannot beautify */
1230 }
1231 nxt1 = nnxt;
1232 }
1233#endif
1234 /* Optimize again: */
b81d288d 1235 study_chunk(pRExC_state, &nxt1, &deltanext, nxt,
e1901655 1236 NULL, 0);
a0ed51b3
LW
1237 }
1238 else
c277df42 1239 oscan->flags = 0;
c277df42 1240 }
e1901655
IZ
1241 else if ((OP(oscan) == CURLYX)
1242 && (flags & SCF_WHILEM_VISITED_POS)
1243 /* See the comment on a similar expression above.
1244 However, this time it not a subexpression
1245 we care about, but the expression itself. */
1246 && (maxcount == REG_INFTY)
1247 && data && ++data->whilem_c < 16) {
1248 /* This stays as CURLYX, we can put the count/of pair. */
2c2d71f5
JH
1249 /* Find WHILEM (as in regexec.c) */
1250 regnode *nxt = oscan + NEXT_OFF(oscan);
1251
1252 if (OP(PREVOPER(nxt)) == NOTHING) /* LONGJMP */
1253 nxt += ARG(nxt);
eb160463
GS
1254 PREVOPER(nxt)->flags = (U8)(data->whilem_c
1255 | (RExC_whilem_seen << 4)); /* On WHILEM */
2c2d71f5 1256 }
b81d288d 1257 if (data && fl & (SF_HAS_PAR|SF_IN_PAR))
c277df42
IZ
1258 pars++;
1259 if (flags & SCF_DO_SUBSTR) {
1260 SV *last_str = Nullsv;
1261 int counted = mincount != 0;
1262
1263 if (data->last_end > 0 && mincount != 0) { /* Ends with a string. */
5d1c421c
JH
1264#if defined(SPARC64_GCC_WORKAROUND)
1265 I32 b = 0;
1266 STRLEN l = 0;
1267 char *s = NULL;
1268 I32 old = 0;
1269
1270 if (pos_before >= data->last_start_min)
1271 b = pos_before;
1272 else
1273 b = data->last_start_min;
1274
1275 l = 0;
1276 s = SvPV(data->last_found, l);
1277 old = b - data->last_start_min;
1278
1279#else
b81d288d 1280 I32 b = pos_before >= data->last_start_min
c277df42
IZ
1281 ? pos_before : data->last_start_min;
1282 STRLEN l;
1283 char *s = SvPV(data->last_found, l);
a0ed51b3 1284 I32 old = b - data->last_start_min;
5d1c421c 1285#endif
a0ed51b3
LW
1286
1287 if (UTF)
1288 old = utf8_hop((U8*)s, old) - (U8*)s;
c277df42 1289
a0ed51b3 1290 l -= old;
c277df42 1291 /* Get the added string: */
79cb57f6 1292 last_str = newSVpvn(s + old, l);
c277df42
IZ
1293 if (deltanext == 0 && pos_before == b) {
1294 /* What was added is a constant string */
1295 if (mincount > 1) {
1296 SvGROW(last_str, (mincount * l) + 1);
b81d288d 1297 repeatcpy(SvPVX(last_str) + l,
c277df42
IZ
1298 SvPVX(last_str), l, mincount - 1);
1299 SvCUR(last_str) *= mincount;
1300 /* Add additional parts. */
b81d288d 1301 SvCUR_set(data->last_found,
c277df42
IZ
1302 SvCUR(data->last_found) - l);
1303 sv_catsv(data->last_found, last_str);
1304 data->last_end += l * (mincount - 1);
1305 }
2a8d9689
HS
1306 } else {
1307 /* start offset must point into the last copy */
1308 data->last_start_min += minnext * (mincount - 1);
4b2cff9a
HS
1309 data->last_start_max += is_inf ? 0 : (maxcount - 1)
1310 * (minnext + data->pos_delta);
c277df42
IZ
1311 }
1312 }
1313 /* It is counted once already... */
1314 data->pos_min += minnext * (mincount - counted);
1315 data->pos_delta += - counted * deltanext +
1316 (minnext + deltanext) * maxcount - minnext * mincount;
1317 if (mincount != maxcount) {
653099ff
GS
1318 /* Cannot extend fixed substrings found inside
1319 the group. */
830247a4 1320 scan_commit(pRExC_state,data);
c277df42
IZ
1321 if (mincount && last_str) {
1322 sv_setsv(data->last_found, last_str);
1323 data->last_end = data->pos_min;
b81d288d 1324 data->last_start_min =
a0ed51b3 1325 data->pos_min - CHR_SVLEN(last_str);
b81d288d
AB
1326 data->last_start_max = is_inf
1327 ? I32_MAX
c277df42 1328 : data->pos_min + data->pos_delta
a0ed51b3 1329 - CHR_SVLEN(last_str);
c277df42
IZ
1330 }
1331 data->longest = &(data->longest_float);
1332 }
aca2d497 1333 SvREFCNT_dec(last_str);
c277df42 1334 }
405ff068 1335 if (data && (fl & SF_HAS_EVAL))
c277df42
IZ
1336 data->flags |= SF_HAS_EVAL;
1337 optimize_curly_tail:
c277df42 1338 if (OP(oscan) != CURLYX) {
22c35a8c 1339 while (PL_regkind[(U8)OP(next = regnext(oscan))] == NOTHING
c277df42
IZ
1340 && NEXT_OFF(next))
1341 NEXT_OFF(oscan) += NEXT_OFF(next);
1342 }
c277df42 1343 continue;
653099ff 1344 default: /* REF and CLUMP only? */
c277df42 1345 if (flags & SCF_DO_SUBSTR) {
830247a4 1346 scan_commit(pRExC_state,data); /* Cannot expect anything... */
c277df42
IZ
1347 data->longest = &(data->longest_float);
1348 }
aca2d497 1349 is_inf = is_inf_internal = 1;
653099ff 1350 if (flags & SCF_DO_STCLASS_OR)
830247a4 1351 cl_anything(pRExC_state, data->start_class);
653099ff 1352 flags &= ~SCF_DO_STCLASS;
c277df42
IZ
1353 break;
1354 }
a0ed51b3 1355 }
ffc61ed2 1356 else if (strchr((char*)PL_simple,OP(scan))) {
9c5ffd7c 1357 int value = 0;
653099ff 1358
c277df42 1359 if (flags & SCF_DO_SUBSTR) {
830247a4 1360 scan_commit(pRExC_state,data);
c277df42
IZ
1361 data->pos_min++;
1362 }
1363 min++;
653099ff
GS
1364 if (flags & SCF_DO_STCLASS) {
1365 data->start_class->flags &= ~ANYOF_EOS; /* No match on empty */
1366
1367 /* Some of the logic below assumes that switching
1368 locale on will only add false positives. */
1369 switch (PL_regkind[(U8)OP(scan)]) {
653099ff 1370 case SANY:
653099ff
GS
1371 default:
1372 do_default:
1373 /* Perl_croak(aTHX_ "panic: unexpected simple REx opcode %d", OP(scan)); */
1374 if (flags & SCF_DO_STCLASS_OR) /* Allow everything */
830247a4 1375 cl_anything(pRExC_state, data->start_class);
653099ff
GS
1376 break;
1377 case REG_ANY:
1378 if (OP(scan) == SANY)
1379 goto do_default;
1380 if (flags & SCF_DO_STCLASS_OR) { /* Everything but \n */
1381 value = (ANYOF_BITMAP_TEST(data->start_class,'\n')
1382 || (data->start_class->flags & ANYOF_CLASS));
830247a4 1383 cl_anything(pRExC_state, data->start_class);
653099ff
GS
1384 }
1385 if (flags & SCF_DO_STCLASS_AND || !value)
1386 ANYOF_BITMAP_CLEAR(data->start_class,'\n');
1387 break;
1388 case ANYOF:
1389 if (flags & SCF_DO_STCLASS_AND)
1390 cl_and(data->start_class,
1391 (struct regnode_charclass_class*)scan);
1392 else
830247a4 1393 cl_or(pRExC_state, data->start_class,
653099ff
GS
1394 (struct regnode_charclass_class*)scan);
1395 break;
1396 case ALNUM:
1397 if (flags & SCF_DO_STCLASS_AND) {
1398 if (!(data->start_class->flags & ANYOF_LOCALE)) {
1399 ANYOF_CLASS_CLEAR(data->start_class,ANYOF_NALNUM);
1400 for (value = 0; value < 256; value++)
1401 if (!isALNUM(value))
1402 ANYOF_BITMAP_CLEAR(data->start_class, value);
1403 }
1404 }
1405 else {
1406 if (data->start_class->flags & ANYOF_LOCALE)
1407 ANYOF_CLASS_SET(data->start_class,ANYOF_ALNUM);
1408 else {
1409 for (value = 0; value < 256; value++)
1410 if (isALNUM(value))
b81d288d 1411 ANYOF_BITMAP_SET(data->start_class, value);
653099ff
GS
1412 }
1413 }
1414 break;
1415 case ALNUML:
1416 if (flags & SCF_DO_STCLASS_AND) {
1417 if (data->start_class->flags & ANYOF_LOCALE)
1418 ANYOF_CLASS_CLEAR(data->start_class,ANYOF_NALNUM);
1419 }
1420 else {
1421 ANYOF_CLASS_SET(data->start_class,ANYOF_ALNUM);
1422 data->start_class->flags |= ANYOF_LOCALE;
1423 }
1424 break;
1425 case NALNUM:
1426 if (flags & SCF_DO_STCLASS_AND) {
1427 if (!(data->start_class->flags & ANYOF_LOCALE)) {
1428 ANYOF_CLASS_CLEAR(data->start_class,ANYOF_ALNUM);
1429 for (value = 0; value < 256; value++)
1430 if (isALNUM(value))
1431 ANYOF_BITMAP_CLEAR(data->start_class, value);
1432 }
1433 }
1434 else {
1435 if (data->start_class->flags & ANYOF_LOCALE)
1436 ANYOF_CLASS_SET(data->start_class,ANYOF_NALNUM);
1437 else {
1438 for (value = 0; value < 256; value++)
1439 if (!isALNUM(value))
b81d288d 1440 ANYOF_BITMAP_SET(data->start_class, value);
653099ff
GS
1441 }
1442 }
1443 break;
1444 case NALNUML:
1445 if (flags & SCF_DO_STCLASS_AND) {
1446 if (data->start_class->flags & ANYOF_LOCALE)
1447 ANYOF_CLASS_CLEAR(data->start_class,ANYOF_ALNUM);
1448 }
1449 else {
1450 data->start_class->flags |= ANYOF_LOCALE;
1451 ANYOF_CLASS_SET(data->start_class,ANYOF_NALNUM);
1452 }
1453 break;
1454 case SPACE:
1455 if (flags & SCF_DO_STCLASS_AND) {
1456 if (!(data->start_class->flags & ANYOF_LOCALE)) {
1457 ANYOF_CLASS_CLEAR(data->start_class,ANYOF_NSPACE);
1458 for (value = 0; value < 256; value++)
1459 if (!isSPACE(value))
1460 ANYOF_BITMAP_CLEAR(data->start_class, value);
1461 }
1462 }
1463 else {
1464 if (data->start_class->flags & ANYOF_LOCALE)
1465 ANYOF_CLASS_SET(data->start_class,ANYOF_SPACE);
1466 else {
1467 for (value = 0; value < 256; value++)
1468 if (isSPACE(value))
b81d288d 1469 ANYOF_BITMAP_SET(data->start_class, value);
653099ff
GS
1470 }
1471 }
1472 break;
1473 case SPACEL:
1474 if (flags & SCF_DO_STCLASS_AND) {
1475 if (data->start_class->flags & ANYOF_LOCALE)
1476 ANYOF_CLASS_CLEAR(data->start_class,ANYOF_NSPACE);
1477 }
1478 else {
1479 data->start_class->flags |= ANYOF_LOCALE;
1480 ANYOF_CLASS_SET(data->start_class,ANYOF_SPACE);
1481 }
1482 break;
1483 case NSPACE:
1484 if (flags & SCF_DO_STCLASS_AND) {
1485 if (!(data->start_class->flags & ANYOF_LOCALE)) {
1486 ANYOF_CLASS_CLEAR(data->start_class,ANYOF_SPACE);
1487 for (value = 0; value < 256; value++)
1488 if (isSPACE(value))
1489 ANYOF_BITMAP_CLEAR(data->start_class, value);
1490 }
1491 }
1492 else {
1493 if (data->start_class->flags & ANYOF_LOCALE)
1494 ANYOF_CLASS_SET(data->start_class,ANYOF_NSPACE);
1495 else {
1496 for (value = 0; value < 256; value++)
1497 if (!isSPACE(value))
b81d288d 1498 ANYOF_BITMAP_SET(data->start_class, value);
653099ff
GS
1499 }
1500 }
1501 break;
1502 case NSPACEL:
1503 if (flags & SCF_DO_STCLASS_AND) {
1504 if (data->start_class->flags & ANYOF_LOCALE) {
1505 ANYOF_CLASS_CLEAR(data->start_class,ANYOF_SPACE);
1506 for (value = 0; value < 256; value++)
1507 if (!isSPACE(value))
1508 ANYOF_BITMAP_CLEAR(data->start_class, value);
1509 }
1510 }
1511 else {
1512 data->start_class->flags |= ANYOF_LOCALE;
1513 ANYOF_CLASS_SET(data->start_class,ANYOF_NSPACE);
1514 }
1515 break;
1516 case DIGIT:
1517 if (flags & SCF_DO_STCLASS_AND) {
1518 ANYOF_CLASS_CLEAR(data->start_class,ANYOF_NDIGIT);
1519 for (value = 0; value < 256; value++)
1520 if (!isDIGIT(value))
1521 ANYOF_BITMAP_CLEAR(data->start_class, value);
1522 }
1523 else {
1524 if (data->start_class->flags & ANYOF_LOCALE)
1525 ANYOF_CLASS_SET(data->start_class,ANYOF_DIGIT);
1526 else {
1527 for (value = 0; value < 256; value++)
1528 if (isDIGIT(value))
b81d288d 1529 ANYOF_BITMAP_SET(data->start_class, value);
653099ff
GS
1530 }
1531 }
1532 break;
1533 case NDIGIT:
1534 if (flags & SCF_DO_STCLASS_AND) {
1535 ANYOF_CLASS_CLEAR(data->start_class,ANYOF_DIGIT);
1536 for (value = 0; value < 256; value++)
1537 if (isDIGIT(value))
1538 ANYOF_BITMAP_CLEAR(data->start_class, value);
1539 }
1540 else {
1541 if (data->start_class->flags & ANYOF_LOCALE)
1542 ANYOF_CLASS_SET(data->start_class,ANYOF_NDIGIT);
1543 else {
1544 for (value = 0; value < 256; value++)
1545 if (!isDIGIT(value))
b81d288d 1546 ANYOF_BITMAP_SET(data->start_class, value);
653099ff
GS
1547 }
1548 }
1549 break;
1550 }
1551 if (flags & SCF_DO_STCLASS_OR)
1552 cl_and(data->start_class, &and_with);
1553 flags &= ~SCF_DO_STCLASS;
1554 }
a0ed51b3 1555 }
22c35a8c 1556 else if (PL_regkind[(U8)OP(scan)] == EOL && flags & SCF_DO_SUBSTR) {
c277df42
IZ
1557 data->flags |= (OP(scan) == MEOL
1558 ? SF_BEFORE_MEOL
1559 : SF_BEFORE_SEOL);
a0ed51b3 1560 }
653099ff
GS
1561 else if ( PL_regkind[(U8)OP(scan)] == BRANCHJ
1562 /* Lookbehind, or need to calculate parens/evals/stclass: */
1563 && (scan->flags || data || (flags & SCF_DO_STCLASS))
c277df42 1564 && (OP(scan) == IFMATCH || OP(scan) == UNLESSM)) {
653099ff 1565 /* Lookahead/lookbehind */
cb434fcc 1566 I32 deltanext, minnext, fake = 0;
c277df42 1567 regnode *nscan;
653099ff
GS
1568 struct regnode_charclass_class intrnl;
1569 int f = 0;
c277df42
IZ
1570
1571 data_fake.flags = 0;
b81d288d 1572 if (data) {
2c2d71f5 1573 data_fake.whilem_c = data->whilem_c;
cb434fcc
IZ
1574 data_fake.last_closep = data->last_closep;
1575 }
1576 else
1577 data_fake.last_closep = &fake;
653099ff
GS
1578 if ( flags & SCF_DO_STCLASS && !scan->flags
1579 && OP(scan) == IFMATCH ) { /* Lookahead */
830247a4 1580 cl_init(pRExC_state, &intrnl);
653099ff 1581 data_fake.start_class = &intrnl;
e1901655 1582 f |= SCF_DO_STCLASS_AND;
653099ff 1583 }
e1901655
IZ
1584 if (flags & SCF_WHILEM_VISITED_POS)
1585 f |= SCF_WHILEM_VISITED_POS;
c277df42
IZ
1586 next = regnext(scan);
1587 nscan = NEXTOPER(NEXTOPER(scan));
830247a4 1588 minnext = study_chunk(pRExC_state, &nscan, &deltanext, last, &data_fake, f);
c277df42
IZ
1589 if (scan->flags) {
1590 if (deltanext) {
9baa0206 1591 vFAIL("Variable length lookbehind not implemented");
a0ed51b3
LW
1592 }
1593 else if (minnext > U8_MAX) {
9baa0206 1594 vFAIL2("Lookbehind longer than %"UVuf" not implemented", (UV)U8_MAX);
c277df42 1595 }
eb160463 1596 scan->flags = (U8)minnext;
c277df42
IZ
1597 }
1598 if (data && data_fake.flags & (SF_HAS_PAR|SF_IN_PAR))
1599 pars++;
405ff068 1600 if (data && (data_fake.flags & SF_HAS_EVAL))
c277df42 1601 data->flags |= SF_HAS_EVAL;
2c2d71f5
JH
1602 if (data)
1603 data->whilem_c = data_fake.whilem_c;
e1901655 1604 if (f & SCF_DO_STCLASS_AND) {
653099ff
GS
1605 int was = (data->start_class->flags & ANYOF_EOS);
1606
1607 cl_and(data->start_class, &intrnl);
1608 if (was)
1609 data->start_class->flags |= ANYOF_EOS;
1610 }
a0ed51b3
LW
1611 }
1612 else if (OP(scan) == OPEN) {
c277df42 1613 pars++;
a0ed51b3 1614 }
cb434fcc 1615 else if (OP(scan) == CLOSE) {
eb160463 1616 if ((I32)ARG(scan) == is_par) {
cb434fcc 1617 next = regnext(scan);
c277df42 1618
cb434fcc
IZ
1619 if ( next && (OP(next) != WHILEM) && next < last)
1620 is_par = 0; /* Disable optimization */
1621 }
1622 if (data)
1623 *(data->last_closep) = ARG(scan);
a0ed51b3
LW
1624 }
1625 else if (OP(scan) == EVAL) {
c277df42
IZ
1626 if (data)
1627 data->flags |= SF_HAS_EVAL;
1628 }
96776eda 1629 else if (OP(scan) == LOGICAL && scan->flags == 2) { /* Embedded follows */
0f5d15d6 1630 if (flags & SCF_DO_SUBSTR) {
830247a4 1631 scan_commit(pRExC_state,data);
0f5d15d6
IZ
1632 data->longest = &(data->longest_float);
1633 }
1634 is_inf = is_inf_internal = 1;
653099ff 1635 if (flags & SCF_DO_STCLASS_OR) /* Allow everything */
830247a4 1636 cl_anything(pRExC_state, data->start_class);
96776eda 1637 flags &= ~SCF_DO_STCLASS;
0f5d15d6 1638 }
c277df42
IZ
1639 /* Else: zero-length, ignore. */
1640 scan = regnext(scan);
1641 }
1642
1643 finish:
1644 *scanp = scan;
aca2d497 1645 *deltap = is_inf_internal ? I32_MAX : delta;
b81d288d 1646 if (flags & SCF_DO_SUBSTR && is_inf)
c277df42
IZ
1647 data->pos_delta = I32_MAX - data->pos_min;
1648 if (is_par > U8_MAX)
1649 is_par = 0;
1650 if (is_par && pars==1 && data) {
1651 data->flags |= SF_IN_PAR;
1652 data->flags &= ~SF_HAS_PAR;
a0ed51b3
LW
1653 }
1654 else if (pars && data) {
c277df42
IZ
1655 data->flags |= SF_HAS_PAR;
1656 data->flags &= ~SF_IN_PAR;
1657 }
653099ff
GS
1658 if (flags & SCF_DO_STCLASS_OR)
1659 cl_and(data->start_class, &and_with);
c277df42
IZ
1660 return min;
1661}
1662
76e3520e 1663STATIC I32
830247a4 1664S_add_data(pTHX_ RExC_state_t *pRExC_state, I32 n, char *s)
c277df42 1665{
830247a4 1666 if (RExC_rx->data) {
b81d288d
AB
1667 Renewc(RExC_rx->data,
1668 sizeof(*RExC_rx->data) + sizeof(void*) * (RExC_rx->data->count + n - 1),
c277df42 1669 char, struct reg_data);
830247a4
IZ
1670 Renew(RExC_rx->data->what, RExC_rx->data->count + n, U8);
1671 RExC_rx->data->count += n;
a0ed51b3
LW
1672 }
1673 else {
830247a4 1674 Newc(1207, RExC_rx->data, sizeof(*RExC_rx->data) + sizeof(void*) * (n - 1),
c277df42 1675 char, struct reg_data);
830247a4
IZ
1676 New(1208, RExC_rx->data->what, n, U8);
1677 RExC_rx->data->count = n;
c277df42 1678 }
830247a4
IZ
1679 Copy(s, RExC_rx->data->what + RExC_rx->data->count - n, n, U8);
1680 return RExC_rx->data->count - n;
c277df42
IZ
1681}
1682
d88dccdf 1683void
864dbfa3 1684Perl_reginitcolors(pTHX)
d88dccdf 1685{
d88dccdf
IZ
1686 int i = 0;
1687 char *s = PerlEnv_getenv("PERL_RE_COLORS");
b81d288d 1688
d88dccdf
IZ
1689 if (s) {
1690 PL_colors[0] = s = savepv(s);
1691 while (++i < 6) {
1692 s = strchr(s, '\t');
1693 if (s) {
1694 *s = '\0';
1695 PL_colors[i] = ++s;
1696 }
1697 else
c712d376 1698 PL_colors[i] = s = "";
d88dccdf
IZ
1699 }
1700 } else {
b81d288d 1701 while (i < 6)
d88dccdf
IZ
1702 PL_colors[i++] = "";
1703 }
1704 PL_colorset = 1;
1705}
1706
8615cb43 1707
a687059c 1708/*
e50aee73 1709 - pregcomp - compile a regular expression into internal code
a687059c
LW
1710 *
1711 * We can't allocate space until we know how big the compiled form will be,
1712 * but we can't compile it (and thus know how big it is) until we've got a
1713 * place to put the code. So we cheat: we compile it twice, once with code
1714 * generation turned off and size counting turned on, and once "for real".
1715 * This also means that we don't allocate space until we are sure that the
1716 * thing really will compile successfully, and we never have to move the
1717 * code and thus invalidate pointers into it. (Note that it has to be in
1718 * one piece because free() must be able to free it all.) [NB: not true in perl]
1719 *
1720 * Beware that the optimization-preparation code in here knows about some
1721 * of the structure of the compiled regexp. [I'll say.]
1722 */
1723regexp *
864dbfa3 1724Perl_pregcomp(pTHX_ char *exp, char *xend, PMOP *pm)
a687059c 1725{
a0d0e21e 1726 register regexp *r;
c277df42 1727 regnode *scan;
c277df42 1728 regnode *first;
a0d0e21e 1729 I32 flags;
a0d0e21e
LW
1730 I32 minlen = 0;
1731 I32 sawplus = 0;
1732 I32 sawopen = 0;
2c2d71f5 1733 scan_data_t data;
830247a4
IZ
1734 RExC_state_t RExC_state;
1735 RExC_state_t *pRExC_state = &RExC_state;
a0d0e21e
LW
1736
1737 if (exp == NULL)
c277df42 1738 FAIL("NULL regexp argument");
a0d0e21e 1739
a5961de5 1740 RExC_utf8 = pm->op_pmdynflags & PMdf_CMP_UTF8;
a0ed51b3 1741
5cfc7842 1742 RExC_precomp = exp;
a5961de5
JH
1743 DEBUG_r({
1744 if (!PL_colorset) reginitcolors();
1745 PerlIO_printf(Perl_debug_log, "%sCompiling REx%s `%s%*s%s'\n",
1746 PL_colors[4],PL_colors[5],PL_colors[0],
1747 (int)(xend - exp), RExC_precomp, PL_colors[1]);
1748 });
e2509266 1749 RExC_flags = pm->op_pmflags;
830247a4 1750 RExC_sawback = 0;
bbce6d69 1751
830247a4
IZ
1752 RExC_seen = 0;
1753 RExC_seen_zerolen = *exp == '^' ? -1 : 0;
1754 RExC_seen_evals = 0;
1755 RExC_extralen = 0;
c277df42 1756
bbce6d69 1757 /* First pass: determine size, legality. */
830247a4 1758 RExC_parse = exp;
fac92740 1759 RExC_start = exp;
830247a4
IZ
1760 RExC_end = xend;
1761 RExC_naughty = 0;
1762 RExC_npar = 1;
1763 RExC_size = 0L;
1764 RExC_emit = &PL_regdummy;
1765 RExC_whilem_seen = 0;
85ddcde9
JH
1766#if 0 /* REGC() is (currently) a NOP at the first pass.
1767 * Clever compilers notice this and complain. --jhi */
830247a4 1768 REGC((U8)REG_MAGIC, (char*)RExC_emit);
85ddcde9 1769#endif
830247a4 1770 if (reg(pRExC_state, 0, &flags) == NULL) {
830247a4 1771 RExC_precomp = Nullch;
a0d0e21e
LW
1772 return(NULL);
1773 }
830247a4 1774 DEBUG_r(PerlIO_printf(Perl_debug_log, "size %"IVdf" ", (IV)RExC_size));
c277df42 1775
c277df42
IZ
1776 /* Small enough for pointer-storage convention?
1777 If extralen==0, this means that we will not need long jumps. */
830247a4
IZ
1778 if (RExC_size >= 0x10000L && RExC_extralen)
1779 RExC_size += RExC_extralen;
c277df42 1780 else
830247a4
IZ
1781 RExC_extralen = 0;
1782 if (RExC_whilem_seen > 15)
1783 RExC_whilem_seen = 15;
a0d0e21e 1784
bbce6d69 1785 /* Allocate space and initialize. */
830247a4 1786 Newc(1001, r, sizeof(regexp) + (unsigned)RExC_size * sizeof(regnode),
c277df42 1787 char, regexp);
a0d0e21e 1788 if (r == NULL)
b45f050a
JF
1789 FAIL("Regexp out of space");
1790
0f79a09d
GS
1791#ifdef DEBUGGING
1792 /* avoid reading uninitialized memory in DEBUGGING code in study_chunk() */
830247a4 1793 Zero(r, sizeof(regexp) + (unsigned)RExC_size * sizeof(regnode), char);
0f79a09d 1794#endif
c277df42 1795 r->refcnt = 1;
bbce6d69 1796 r->prelen = xend - exp;
5cfc7842 1797 r->precomp = savepvn(RExC_precomp, r->prelen);
cf93c79d
IZ
1798 r->subbeg = NULL;
1799 r->reganch = pm->op_pmflags & PMf_COMPILETIME;
830247a4 1800 r->nparens = RExC_npar - 1; /* set early to validate backrefs */
4327152a
IZ
1801
1802 r->substrs = 0; /* Useful during FAIL. */
1803 r->startp = 0; /* Useful during FAIL. */
1804 r->endp = 0; /* Useful during FAIL. */
1805
fac92740
MJD
1806 Newz(1304, r->offsets, 2*RExC_size+1, U32); /* MJD 20001228 */
1807 if (r->offsets) {
1808 r->offsets[0] = RExC_size;
1809 }
1810 DEBUG_r(PerlIO_printf(Perl_debug_log,
392fbf5d 1811 "%s %"UVuf" bytes for offset annotations.\n",
fac92740 1812 r->offsets ? "Got" : "Couldn't get",
392fbf5d 1813 (UV)((2*RExC_size+1) * sizeof(U32))));
fac92740 1814
830247a4 1815 RExC_rx = r;
bbce6d69 1816
1817 /* Second pass: emit code. */
e2509266 1818 RExC_flags = pm->op_pmflags; /* don't let top level (?i) bleed */
830247a4
IZ
1819 RExC_parse = exp;
1820 RExC_end = xend;
1821 RExC_naughty = 0;
1822 RExC_npar = 1;
fac92740 1823 RExC_emit_start = r->program;
830247a4 1824 RExC_emit = r->program;
2cd61cdb 1825 /* Store the count of eval-groups for security checks: */
eb160463 1826 RExC_emit->next_off = (U16)((RExC_seen_evals > U16_MAX) ? U16_MAX : RExC_seen_evals);
830247a4 1827 REGC((U8)REG_MAGIC, (char*) RExC_emit++);
c277df42 1828 r->data = 0;
830247a4 1829 if (reg(pRExC_state, 0, &flags) == NULL)
a0d0e21e
LW
1830 return(NULL);
1831
1832 /* Dig out information for optimizations. */
cf93c79d 1833 r->reganch = pm->op_pmflags & PMf_COMPILETIME; /* Again? */
e2509266 1834 pm->op_pmflags = RExC_flags;
a0ed51b3 1835 if (UTF)
5ff6fc6d 1836 r->reganch |= ROPT_UTF8; /* Unicode in it? */
c277df42 1837 r->regstclass = NULL;
830247a4 1838 if (RExC_naughty >= 10) /* Probably an expensive pattern. */
a0ed51b3 1839 r->reganch |= ROPT_NAUGHTY;
c277df42 1840 scan = r->program + 1; /* First BRANCH. */
2779dcf1
IZ
1841
1842 /* XXXX To minimize changes to RE engine we always allocate
1843 3-units-long substrs field. */
1844 Newz(1004, r->substrs, 1, struct reg_substr_data);
1845
2c2d71f5 1846 StructCopy(&zero_scan_data, &data, scan_data_t);
653099ff 1847 /* XXXX Should not we check for something else? Usually it is OPEN1... */
c277df42 1848 if (OP(scan) != BRANCH) { /* Only one top-level choice. */
c277df42 1849 I32 fake;
c5254dd6 1850 STRLEN longest_float_length, longest_fixed_length;
653099ff
GS
1851 struct regnode_charclass_class ch_class;
1852 int stclass_flag;
cb434fcc 1853 I32 last_close = 0;
a0d0e21e
LW
1854
1855 first = scan;
c277df42 1856 /* Skip introductions and multiplicators >= 1. */
a0d0e21e 1857 while ((OP(first) == OPEN && (sawopen = 1)) ||
653099ff 1858 /* An OR of *one* alternative - should not happen now. */
a0d0e21e
LW
1859 (OP(first) == BRANCH && OP(regnext(first)) != BRANCH) ||
1860 (OP(first) == PLUS) ||
1861 (OP(first) == MINMOD) ||
653099ff 1862 /* An {n,m} with n>0 */
22c35a8c 1863 (PL_regkind[(U8)OP(first)] == CURLY && ARG1(first) > 0) ) {
a0d0e21e
LW
1864 if (OP(first) == PLUS)
1865 sawplus = 1;
1866 else
1867 first += regarglen[(U8)OP(first)];
1868 first = NEXTOPER(first);
a687059c
LW
1869 }
1870
a0d0e21e
LW
1871 /* Starting-point info. */
1872 again:
653099ff 1873 if (PL_regkind[(U8)OP(first)] == EXACT) {
1aa99e6b
IH
1874 if (OP(first) == EXACT)
1875 ; /* Empty, get anchored substr later. */
1876 else if ((OP(first) == EXACTF || OP(first) == EXACTFL))
b3c9acc1
IZ
1877 r->regstclass = first;
1878 }
653099ff 1879 else if (strchr((char*)PL_simple,OP(first)))
a0d0e21e 1880 r->regstclass = first;
22c35a8c
GS
1881 else if (PL_regkind[(U8)OP(first)] == BOUND ||
1882 PL_regkind[(U8)OP(first)] == NBOUND)
a0d0e21e 1883 r->regstclass = first;
22c35a8c 1884 else if (PL_regkind[(U8)OP(first)] == BOL) {
cad2e5aa
JH
1885 r->reganch |= (OP(first) == MBOL
1886 ? ROPT_ANCH_MBOL
1887 : (OP(first) == SBOL
1888 ? ROPT_ANCH_SBOL
1889 : ROPT_ANCH_BOL));
a0d0e21e 1890 first = NEXTOPER(first);
774d564b 1891 goto again;
1892 }
1893 else if (OP(first) == GPOS) {
1894 r->reganch |= ROPT_ANCH_GPOS;
1895 first = NEXTOPER(first);
1896 goto again;
a0d0e21e 1897 }
e09294f4 1898 else if (!sawopen && (OP(first) == STAR &&
22c35a8c 1899 PL_regkind[(U8)OP(NEXTOPER(first))] == REG_ANY) &&
a0d0e21e
LW
1900 !(r->reganch & ROPT_ANCH) )
1901 {
1902 /* turn .* into ^.* with an implied $*=1 */
cad2e5aa
JH
1903 int type = OP(NEXTOPER(first));
1904
ffc61ed2 1905 if (type == REG_ANY)
cad2e5aa
JH
1906 type = ROPT_ANCH_MBOL;
1907 else
1908 type = ROPT_ANCH_SBOL;
1909
1910 r->reganch |= type | ROPT_IMPLICIT;
a0d0e21e 1911 first = NEXTOPER(first);
774d564b 1912 goto again;
a0d0e21e 1913 }
b81d288d 1914 if (sawplus && (!sawopen || !RExC_sawback)
830247a4 1915 && !(RExC_seen & REG_SEEN_EVAL)) /* May examine pos and $& */
cad2e5aa
JH
1916 /* x+ must match at the 1st pos of run of x's */
1917 r->reganch |= ROPT_SKIP;
a0d0e21e 1918
c277df42 1919 /* Scan is after the zeroth branch, first is atomic matcher. */
b81d288d 1920 DEBUG_r(PerlIO_printf(Perl_debug_log, "first at %"IVdf"\n",
b900a521 1921 (IV)(first - scan + 1)));
a0d0e21e
LW
1922 /*
1923 * If there's something expensive in the r.e., find the
1924 * longest literal string that must appear and make it the
1925 * regmust. Resolve ties in favor of later strings, since
1926 * the regstart check works with the beginning of the r.e.
1927 * and avoiding duplication strengthens checking. Not a
1928 * strong reason, but sufficient in the absence of others.
1929 * [Now we resolve ties in favor of the earlier string if
c277df42 1930 * it happens that c_offset_min has been invalidated, since the
a0d0e21e
LW
1931 * earlier string may buy us something the later one won't.]
1932 */
a0d0e21e 1933 minlen = 0;
a687059c 1934
79cb57f6
GS
1935 data.longest_fixed = newSVpvn("",0);
1936 data.longest_float = newSVpvn("",0);
1937 data.last_found = newSVpvn("",0);
c277df42
IZ
1938 data.longest = &(data.longest_fixed);
1939 first = scan;
653099ff 1940 if (!r->regstclass) {
830247a4 1941 cl_init(pRExC_state, &ch_class);
653099ff
GS
1942 data.start_class = &ch_class;
1943 stclass_flag = SCF_DO_STCLASS_AND;
1944 } else /* XXXX Check for BOUND? */
1945 stclass_flag = 0;
cb434fcc 1946 data.last_closep = &last_close;
653099ff 1947
830247a4 1948 minlen = study_chunk(pRExC_state, &first, &fake, scan + RExC_size, /* Up to end */
e1901655 1949 &data, SCF_DO_SUBSTR | SCF_WHILEM_VISITED_POS | stclass_flag);
830247a4 1950 if ( RExC_npar == 1 && data.longest == &(data.longest_fixed)
b81d288d 1951 && data.last_start_min == 0 && data.last_end > 0
830247a4
IZ
1952 && !RExC_seen_zerolen
1953 && (!(RExC_seen & REG_SEEN_GPOS) || (r->reganch & ROPT_ANCH_GPOS)))
c277df42 1954 r->reganch |= ROPT_CHECK_ALL;
830247a4 1955 scan_commit(pRExC_state, &data);
c277df42
IZ
1956 SvREFCNT_dec(data.last_found);
1957
a0ed51b3 1958 longest_float_length = CHR_SVLEN(data.longest_float);
c5254dd6 1959 if (longest_float_length
c277df42
IZ
1960 || (data.flags & SF_FL_BEFORE_EOL
1961 && (!(data.flags & SF_FL_BEFORE_MEOL)
e2509266 1962 || (RExC_flags & PMf_MULTILINE)))) {
cf93c79d
IZ
1963 int t;
1964
a0ed51b3 1965 if (SvCUR(data.longest_fixed) /* ok to leave SvCUR */
aca2d497
IZ
1966 && data.offset_fixed == data.offset_float_min
1967 && SvCUR(data.longest_fixed) == SvCUR(data.longest_float))
1968 goto remove_float; /* As in (a)+. */
1969
33b8afdf
JH
1970 if (SvUTF8(data.longest_float)) {
1971 r->float_utf8 = data.longest_float;
1972 r->float_substr = Nullsv;
1973 } else {
1974 r->float_substr = data.longest_float;
1975 r->float_utf8 = Nullsv;
1976 }
c277df42
IZ
1977 r->float_min_offset = data.offset_float_min;
1978 r->float_max_offset = data.offset_float_max;
cf93c79d
IZ
1979 t = (data.flags & SF_FL_BEFORE_EOL /* Can't have SEOL and MULTI */
1980 && (!(data.flags & SF_FL_BEFORE_MEOL)
e2509266 1981 || (RExC_flags & PMf_MULTILINE)));
33b8afdf 1982 fbm_compile(data.longest_float, t ? FBMcf_TAIL : 0);
a0ed51b3
LW
1983 }
1984 else {
aca2d497 1985 remove_float:
33b8afdf 1986 r->float_substr = r->float_utf8 = Nullsv;
c277df42 1987 SvREFCNT_dec(data.longest_float);
c5254dd6 1988 longest_float_length = 0;
a0d0e21e 1989 }
c277df42 1990
a0ed51b3 1991 longest_fixed_length = CHR_SVLEN(data.longest_fixed);
c5254dd6 1992 if (longest_fixed_length
c277df42
IZ
1993 || (data.flags & SF_FIX_BEFORE_EOL /* Cannot have SEOL and MULTI */
1994 && (!(data.flags & SF_FIX_BEFORE_MEOL)
e2509266 1995 || (RExC_flags & PMf_MULTILINE)))) {
cf93c79d
IZ
1996 int t;
1997
33b8afdf
JH
1998 if (SvUTF8(data.longest_fixed)) {
1999 r->anchored_utf8 = data.longest_fixed;
2000 r->anchored_substr = Nullsv;
2001 } else {
2002 r->anchored_substr = data.longest_fixed;
2003 r->anchored_utf8 = Nullsv;
2004 }
c277df42 2005 r->anchored_offset = data.offset_fixed;
cf93c79d
IZ
2006 t = (data.flags & SF_FIX_BEFORE_EOL /* Can't have SEOL and MULTI */
2007 && (!(data.flags & SF_FIX_BEFORE_MEOL)
e2509266 2008 || (RExC_flags & PMf_MULTILINE)));
33b8afdf 2009 fbm_compile(data.longest_fixed, t ? FBMcf_TAIL : 0);
a0ed51b3
LW
2010 }
2011 else {
33b8afdf 2012 r->anchored_substr = r->anchored_utf8 = Nullsv;
c277df42 2013 SvREFCNT_dec(data.longest_fixed);
c5254dd6 2014 longest_fixed_length = 0;
a0d0e21e 2015 }
b81d288d 2016 if (r->regstclass
ffc61ed2 2017 && (OP(r->regstclass) == REG_ANY || OP(r->regstclass) == SANY))
653099ff 2018 r->regstclass = NULL;
33b8afdf
JH
2019 if ((!(r->anchored_substr || r->anchored_utf8) || r->anchored_offset)
2020 && stclass_flag
653099ff 2021 && !(data.start_class->flags & ANYOF_EOS)
eb160463
GS
2022 && !cl_is_anything(data.start_class))
2023 {
830247a4 2024 I32 n = add_data(pRExC_state, 1, "f");
653099ff 2025
b81d288d 2026 New(1006, RExC_rx->data->data[n], 1,
653099ff
GS
2027 struct regnode_charclass_class);
2028 StructCopy(data.start_class,
830247a4 2029 (struct regnode_charclass_class*)RExC_rx->data->data[n],
653099ff 2030 struct regnode_charclass_class);
830247a4 2031 r->regstclass = (regnode*)RExC_rx->data->data[n];
653099ff 2032 r->reganch &= ~ROPT_SKIP; /* Used in find_byclass(). */
ffc61ed2 2033 PL_regdata = r->data; /* for regprop() */
9c5ffd7c
JH
2034 DEBUG_r({ SV *sv = sv_newmortal();
2035 regprop(sv, (regnode*)data.start_class);
2036 PerlIO_printf(Perl_debug_log,
2037 "synthetic stclass `%s'.\n",
2038 SvPVX(sv));});
653099ff 2039 }
c277df42
IZ
2040
2041 /* A temporary algorithm prefers floated substr to fixed one to dig more info. */
c5254dd6 2042 if (longest_fixed_length > longest_float_length) {
c277df42 2043 r->check_substr = r->anchored_substr;
33b8afdf 2044 r->check_utf8 = r->anchored_utf8;
c277df42
IZ
2045 r->check_offset_min = r->check_offset_max = r->anchored_offset;
2046 if (r->reganch & ROPT_ANCH_SINGLE)
2047 r->reganch |= ROPT_NOSCAN;
a0ed51b3
LW
2048 }
2049 else {
c277df42 2050 r->check_substr = r->float_substr;
33b8afdf 2051 r->check_utf8 = r->float_utf8;
c277df42
IZ
2052 r->check_offset_min = data.offset_float_min;
2053 r->check_offset_max = data.offset_float_max;
a0d0e21e 2054 }
30382c73
IZ
2055 /* XXXX Currently intuiting is not compatible with ANCH_GPOS.
2056 This should be changed ASAP! */
33b8afdf 2057 if ((r->check_substr || r->check_utf8) && !(r->reganch & ROPT_ANCH_GPOS)) {
cad2e5aa 2058 r->reganch |= RE_USE_INTUIT;
33b8afdf 2059 if (SvTAIL(r->check_substr ? r->check_substr : r->check_utf8))
cad2e5aa
JH
2060 r->reganch |= RE_INTUIT_TAIL;
2061 }
a0ed51b3
LW
2062 }
2063 else {
c277df42
IZ
2064 /* Several toplevels. Best we can is to set minlen. */
2065 I32 fake;
653099ff 2066 struct regnode_charclass_class ch_class;
cb434fcc 2067 I32 last_close = 0;
c277df42
IZ
2068
2069 DEBUG_r(PerlIO_printf(Perl_debug_log, "\n"));
2070 scan = r->program + 1;
830247a4 2071 cl_init(pRExC_state, &ch_class);
653099ff 2072 data.start_class = &ch_class;
cb434fcc 2073 data.last_closep = &last_close;
e1901655 2074 minlen = study_chunk(pRExC_state, &scan, &fake, scan + RExC_size, &data, SCF_DO_STCLASS_AND|SCF_WHILEM_VISITED_POS);
33b8afdf
JH
2075 r->check_substr = r->check_utf8 = r->anchored_substr = r->anchored_utf8
2076 = r->float_substr = r->float_utf8 = Nullsv;
653099ff 2077 if (!(data.start_class->flags & ANYOF_EOS)
eb160463
GS
2078 && !cl_is_anything(data.start_class))
2079 {
830247a4 2080 I32 n = add_data(pRExC_state, 1, "f");
653099ff 2081
b81d288d 2082 New(1006, RExC_rx->data->data[n], 1,
653099ff
GS
2083 struct regnode_charclass_class);
2084 StructCopy(data.start_class,
830247a4 2085 (struct regnode_charclass_class*)RExC_rx->data->data[n],
653099ff 2086 struct regnode_charclass_class);
830247a4 2087 r->regstclass = (regnode*)RExC_rx->data->data[n];
653099ff 2088 r->reganch &= ~ROPT_SKIP; /* Used in find_byclass(). */
9c5ffd7c
JH
2089 DEBUG_r({ SV* sv = sv_newmortal();
2090 regprop(sv, (regnode*)data.start_class);
2091 PerlIO_printf(Perl_debug_log,
2092 "synthetic stclass `%s'.\n",
2093 SvPVX(sv));});
653099ff 2094 }
a0d0e21e
LW
2095 }
2096
a0d0e21e 2097 r->minlen = minlen;
b81d288d 2098 if (RExC_seen & REG_SEEN_GPOS)
c277df42 2099 r->reganch |= ROPT_GPOS_SEEN;
830247a4 2100 if (RExC_seen & REG_SEEN_LOOKBEHIND)
c277df42 2101 r->reganch |= ROPT_LOOKBEHIND_SEEN;
830247a4 2102 if (RExC_seen & REG_SEEN_EVAL)
ce862d02 2103 r->reganch |= ROPT_EVAL_SEEN;
f33976b4
DB
2104 if (RExC_seen & REG_SEEN_CANY)
2105 r->reganch |= ROPT_CANY_SEEN;
830247a4
IZ
2106 Newz(1002, r->startp, RExC_npar, I32);
2107 Newz(1002, r->endp, RExC_npar, I32);
ffc61ed2 2108 PL_regdata = r->data; /* for regprop() */
a0d0e21e
LW
2109 DEBUG_r(regdump(r));
2110 return(r);
a687059c
LW
2111}
2112
2113/*
2114 - reg - regular expression, i.e. main body or parenthesized thing
2115 *
2116 * Caller must absorb opening parenthesis.
2117 *
2118 * Combining parenthesis handling with the base level of regular expression
2119 * is a trifle forced, but the need to tie the tails of the branches to what
2120 * follows makes it hard to avoid.
2121 */
76e3520e 2122STATIC regnode *
830247a4 2123S_reg(pTHX_ RExC_state_t *pRExC_state, I32 paren, I32 *flagp)
c277df42 2124 /* paren: Parenthesized? 0=top, 1=(, inside: changed to letter. */
a687059c 2125{
c277df42
IZ
2126 register regnode *ret; /* Will be the head of the group. */
2127 register regnode *br;
2128 register regnode *lastbr;
2129 register regnode *ender = 0;
a0d0e21e 2130 register I32 parno = 0;
e2509266 2131 I32 flags, oregflags = RExC_flags, have_branch = 0, open = 0;
9d1d55b5
JP
2132
2133 /* for (?g), (?gc), and (?o) warnings; warning
2134 about (?c) will warn about (?g) -- japhy */
2135
2136 I32 wastedflags = 0x00,
2137 wasted_o = 0x01,
2138 wasted_g = 0x02,
2139 wasted_gc = 0x02 | 0x04,
2140 wasted_c = 0x04;
2141
fac92740 2142 char * parse_start = RExC_parse; /* MJD */
830247a4 2143 char *oregcomp_parse = RExC_parse;
c277df42 2144 char c;
a0d0e21e 2145
821b33a5 2146 *flagp = 0; /* Tentatively. */
a0d0e21e 2147
9d1d55b5 2148
a0d0e21e
LW
2149 /* Make an OPEN node, if parenthesized. */
2150 if (paren) {
fac92740 2151 if (*RExC_parse == '?') { /* (?...) */
2b36a5a0
JH
2152 U32 posflags = 0, negflags = 0;
2153 U32 *flagsp = &posflags;
0f5d15d6 2154 int logical = 0;
830247a4 2155 char *seqstart = RExC_parse;
ca9dfc88 2156
830247a4
IZ
2157 RExC_parse++;
2158 paren = *RExC_parse++;
c277df42 2159 ret = NULL; /* For look-ahead/behind. */
a0d0e21e 2160 switch (paren) {
fac92740 2161 case '<': /* (?<...) */
830247a4 2162 RExC_seen |= REG_SEEN_LOOKBEHIND;
b81d288d 2163 if (*RExC_parse == '!')
c277df42 2164 paren = ',';
b81d288d 2165 if (*RExC_parse != '=' && *RExC_parse != '!')
c277df42 2166 goto unknown;
830247a4 2167 RExC_parse++;
fac92740
MJD
2168 case '=': /* (?=...) */
2169 case '!': /* (?!...) */
830247a4 2170 RExC_seen_zerolen++;
fac92740
MJD
2171 case ':': /* (?:...) */
2172 case '>': /* (?>...) */
a0d0e21e 2173 break;
fac92740
MJD
2174 case '$': /* (?$...) */
2175 case '@': /* (?@...) */
8615cb43 2176 vFAIL2("Sequence (?%c...) not implemented", (int)paren);
a0d0e21e 2177 break;
fac92740 2178 case '#': /* (?#...) */
830247a4
IZ
2179 while (*RExC_parse && *RExC_parse != ')')
2180 RExC_parse++;
2181 if (*RExC_parse != ')')
c277df42 2182 FAIL("Sequence (?#... not terminated");
830247a4 2183 nextchar(pRExC_state);
a0d0e21e
LW
2184 *flagp = TRYAGAIN;
2185 return NULL;
fac92740 2186 case 'p': /* (?p...) */
9014280d 2187 if (SIZE_ONLY && ckWARN2(WARN_DEPRECATED, WARN_REGEXP))
f9373011 2188 vWARNdep(RExC_parse, "(?p{}) is deprecated - use (??{})");
8c8ad484 2189 /* FALL THROUGH*/
fac92740 2190 case '?': /* (??...) */
0f5d15d6 2191 logical = 1;
438a3801
YST
2192 if (*RExC_parse != '{')
2193 goto unknown;
830247a4 2194 paren = *RExC_parse++;
0f5d15d6 2195 /* FALL THROUGH */
fac92740 2196 case '{': /* (?{...}) */
c277df42 2197 {
c277df42
IZ
2198 I32 count = 1, n = 0;
2199 char c;
830247a4 2200 char *s = RExC_parse;
c277df42
IZ
2201 SV *sv;
2202 OP_4tree *sop, *rop;
2203
830247a4
IZ
2204 RExC_seen_zerolen++;
2205 RExC_seen |= REG_SEEN_EVAL;
2206 while (count && (c = *RExC_parse)) {
2207 if (c == '\\' && RExC_parse[1])
2208 RExC_parse++;
b81d288d 2209 else if (c == '{')
c277df42 2210 count++;
b81d288d 2211 else if (c == '}')
c277df42 2212 count--;
830247a4 2213 RExC_parse++;
c277df42 2214 }
830247a4 2215 if (*RExC_parse != ')')
b45f050a 2216 {
b81d288d 2217 RExC_parse = s;
b45f050a
JF
2218 vFAIL("Sequence (?{...}) not terminated or not {}-balanced");
2219 }
c277df42
IZ
2220 if (!SIZE_ONLY) {
2221 AV *av;
b81d288d
AB
2222
2223 if (RExC_parse - 1 - s)
830247a4 2224 sv = newSVpvn(s, RExC_parse - 1 - s);
c277df42 2225 else
79cb57f6 2226 sv = newSVpvn("", 0);
c277df42 2227
569233ed
SB
2228 ENTER;
2229 Perl_save_re_context(aTHX);
c277df42 2230 rop = sv_compile_2op(sv, &sop, "re", &av);
9b978d73
DM
2231 sop->op_private |= OPpREFCOUNTED;
2232 /* re_dup will OpREFCNT_inc */
2233 OpREFCNT_set(sop, 1);
569233ed 2234 LEAVE;
c277df42 2235
830247a4
IZ
2236 n = add_data(pRExC_state, 3, "nop");
2237 RExC_rx->data->data[n] = (void*)rop;
2238 RExC_rx->data->data[n+1] = (void*)sop;
2239 RExC_rx->data->data[n+2] = (void*)av;
c277df42 2240 SvREFCNT_dec(sv);
a0ed51b3 2241 }
e24b16f9 2242 else { /* First pass */
830247a4 2243 if (PL_reginterp_cnt < ++RExC_seen_evals
e24b16f9 2244 && PL_curcop != &PL_compiling)
2cd61cdb
IZ
2245 /* No compiled RE interpolated, has runtime
2246 components ===> unsafe. */
2247 FAIL("Eval-group not allowed at runtime, use re 'eval'");
5b61d3f7 2248 if (PL_tainting && PL_tainted)
cc6b7395 2249 FAIL("Eval-group in insecure regular expression");
c277df42
IZ
2250 }
2251
830247a4 2252 nextchar(pRExC_state);
0f5d15d6 2253 if (logical) {
830247a4 2254 ret = reg_node(pRExC_state, LOGICAL);
0f5d15d6
IZ
2255 if (!SIZE_ONLY)
2256 ret->flags = 2;
830247a4 2257 regtail(pRExC_state, ret, reganode(pRExC_state, EVAL, n));
fac92740 2258 /* deal with the length of this later - MJD */
0f5d15d6
IZ
2259 return ret;
2260 }
830247a4 2261 return reganode(pRExC_state, EVAL, n);
c277df42 2262 }
fac92740 2263 case '(': /* (?(?{...})...) and (?(?=...)...) */
c277df42 2264 {
fac92740 2265 if (RExC_parse[0] == '?') { /* (?(?...)) */
b81d288d
AB
2266 if (RExC_parse[1] == '=' || RExC_parse[1] == '!'
2267 || RExC_parse[1] == '<'
830247a4 2268 || RExC_parse[1] == '{') { /* Lookahead or eval. */
c277df42
IZ
2269 I32 flag;
2270
830247a4 2271 ret = reg_node(pRExC_state, LOGICAL);
0f5d15d6
IZ
2272 if (!SIZE_ONLY)
2273 ret->flags = 1;
830247a4 2274 regtail(pRExC_state, ret, reg(pRExC_state, 1, &flag));
c277df42 2275 goto insert_if;
b81d288d 2276 }
a0ed51b3 2277 }
830247a4 2278 else if (RExC_parse[0] >= '1' && RExC_parse[0] <= '9' ) {
fac92740 2279 /* (?(1)...) */
830247a4 2280 parno = atoi(RExC_parse++);
c277df42 2281
830247a4
IZ
2282 while (isDIGIT(*RExC_parse))
2283 RExC_parse++;
fac92740
MJD
2284 ret = reganode(pRExC_state, GROUPP, parno);
2285
830247a4 2286 if ((c = *nextchar(pRExC_state)) != ')')
b45f050a 2287 vFAIL("Switch condition not recognized");
c277df42 2288 insert_if:
830247a4
IZ
2289 regtail(pRExC_state, ret, reganode(pRExC_state, IFTHEN, 0));
2290 br = regbranch(pRExC_state, &flags, 1);
c277df42 2291 if (br == NULL)
830247a4 2292 br = reganode(pRExC_state, LONGJMP, 0);
c277df42 2293 else
830247a4
IZ
2294 regtail(pRExC_state, br, reganode(pRExC_state, LONGJMP, 0));
2295 c = *nextchar(pRExC_state);
d1b80229
IZ
2296 if (flags&HASWIDTH)
2297 *flagp |= HASWIDTH;
c277df42 2298 if (c == '|') {
830247a4
IZ
2299 lastbr = reganode(pRExC_state, IFTHEN, 0); /* Fake one for optimizer. */
2300 regbranch(pRExC_state, &flags, 1);
2301 regtail(pRExC_state, ret, lastbr);
d1b80229
IZ
2302 if (flags&HASWIDTH)
2303 *flagp |= HASWIDTH;
830247a4 2304 c = *nextchar(pRExC_state);
a0ed51b3
LW
2305 }
2306 else
c277df42
IZ
2307 lastbr = NULL;
2308 if (c != ')')
8615cb43 2309 vFAIL("Switch (?(condition)... contains too many branches");
830247a4
IZ
2310 ender = reg_node(pRExC_state, TAIL);
2311 regtail(pRExC_state, br, ender);
c277df42 2312 if (lastbr) {
830247a4
IZ
2313 regtail(pRExC_state, lastbr, ender);
2314 regtail(pRExC_state, NEXTOPER(NEXTOPER(lastbr)), ender);
a0ed51b3
LW
2315 }
2316 else
830247a4 2317 regtail(pRExC_state, ret, ender);
c277df42 2318 return ret;
a0ed51b3
LW
2319 }
2320 else {
830247a4 2321 vFAIL2("Unknown switch condition (?(%.2s", RExC_parse);
c277df42
IZ
2322 }
2323 }
1b1626e4 2324 case 0:
830247a4 2325 RExC_parse--; /* for vFAIL to print correctly */
8615cb43 2326 vFAIL("Sequence (? incomplete");
1b1626e4 2327 break;
a0d0e21e 2328 default:
830247a4 2329 --RExC_parse;
fac92740 2330 parse_flags: /* (?i) */
830247a4 2331 while (*RExC_parse && strchr("iogcmsx", *RExC_parse)) {
9d1d55b5
JP
2332 /* (?g), (?gc) and (?o) are useless here
2333 and must be globally applied -- japhy */
2334
2335 if (*RExC_parse == 'o' || *RExC_parse == 'g') {
2336 if (SIZE_ONLY && ckWARN(WARN_REGEXP)) {
2337 I32 wflagbit = *RExC_parse == 'o' ? wasted_o : wasted_g;
2338 if (! (wastedflags & wflagbit) ) {
2339 wastedflags |= wflagbit;
2340 vWARN5(
2341 RExC_parse + 1,
2342 "Useless (%s%c) - %suse /%c modifier",
2343 flagsp == &negflags ? "?-" : "?",
2344 *RExC_parse,
2345 flagsp == &negflags ? "don't " : "",
2346 *RExC_parse
2347 );
2348 }
2349 }
2350 }
2351 else if (*RExC_parse == 'c') {
2352 if (SIZE_ONLY && ckWARN(WARN_REGEXP)) {
2353 if (! (wastedflags & wasted_c) ) {
2354 wastedflags |= wasted_gc;
2355 vWARN3(
2356 RExC_parse + 1,
2357 "Useless (%sc) - %suse /gc modifier",
2358 flagsp == &negflags ? "?-" : "?",
2359 flagsp == &negflags ? "don't " : ""
2360 );
2361 }
2362 }
2363 }
2364 else { pmflag(flagsp, *RExC_parse); }
2365
830247a4 2366 ++RExC_parse;
ca9dfc88 2367 }
830247a4 2368 if (*RExC_parse == '-') {
ca9dfc88 2369 flagsp = &negflags;
9d1d55b5 2370 wastedflags = 0; /* reset so (?g-c) warns twice */
830247a4 2371 ++RExC_parse;
ca9dfc88 2372 goto parse_flags;
48c036b1 2373 }
e2509266
JH
2374 RExC_flags |= posflags;
2375 RExC_flags &= ~negflags;
830247a4
IZ
2376 if (*RExC_parse == ':') {
2377 RExC_parse++;
ca9dfc88
IZ
2378 paren = ':';
2379 break;
2380 }
c277df42 2381 unknown:
830247a4
IZ
2382 if (*RExC_parse != ')') {
2383 RExC_parse++;
2384 vFAIL3("Sequence (%.*s...) not recognized", RExC_parse-seqstart, seqstart);
b45f050a 2385 }
830247a4 2386 nextchar(pRExC_state);
a0d0e21e
LW
2387 *flagp = TRYAGAIN;
2388 return NULL;
2389 }
2390 }
fac92740 2391 else { /* (...) */
830247a4
IZ
2392 parno = RExC_npar;
2393 RExC_npar++;
2394 ret = reganode(pRExC_state, OPEN, parno);
fac92740
MJD
2395 Set_Node_Length(ret, 1); /* MJD */
2396 Set_Node_Offset(ret, RExC_parse); /* MJD */
c277df42 2397 open = 1;
a0d0e21e 2398 }
a0ed51b3 2399 }
fac92740 2400 else /* ! paren */
a0d0e21e
LW
2401 ret = NULL;
2402
2403 /* Pick up the branches, linking them together. */
fac92740 2404 parse_start = RExC_parse; /* MJD */
830247a4 2405 br = regbranch(pRExC_state, &flags, 1);
fac92740
MJD
2406 /* branch_len = (paren != 0); */
2407
a0d0e21e
LW
2408 if (br == NULL)
2409 return(NULL);
830247a4
IZ
2410 if (*RExC_parse == '|') {
2411 if (!SIZE_ONLY && RExC_extralen) {
2412 reginsert(pRExC_state, BRANCHJ, br);
a0ed51b3 2413 }
fac92740 2414 else { /* MJD */
830247a4 2415 reginsert(pRExC_state, BRANCH, br);
fac92740
MJD
2416 Set_Node_Length(br, paren != 0);
2417 Set_Node_Offset_To_R(br-RExC_emit_start, parse_start-RExC_start);
2418 }
c277df42
IZ
2419 have_branch = 1;
2420 if (SIZE_ONLY)
830247a4 2421 RExC_extralen += 1; /* For BRANCHJ-BRANCH. */
a0ed51b3
LW
2422 }
2423 else if (paren == ':') {
c277df42
IZ
2424 *flagp |= flags&SIMPLE;
2425 }
2426 if (open) { /* Starts with OPEN. */
830247a4 2427 regtail(pRExC_state, ret, br); /* OPEN -> first. */
a0ed51b3
LW
2428 }
2429 else if (paren != '?') /* Not Conditional */
a0d0e21e 2430 ret = br;
32a0ca98 2431 *flagp |= flags & (SPSTART | HASWIDTH);
c277df42 2432 lastbr = br;
830247a4
IZ
2433 while (*RExC_parse == '|') {
2434 if (!SIZE_ONLY && RExC_extralen) {
2435 ender = reganode(pRExC_state, LONGJMP,0);
2436 regtail(pRExC_state, NEXTOPER(NEXTOPER(lastbr)), ender); /* Append to the previous. */
c277df42
IZ
2437 }
2438 if (SIZE_ONLY)
830247a4
IZ
2439 RExC_extralen += 2; /* Account for LONGJMP. */
2440 nextchar(pRExC_state);
2441 br = regbranch(pRExC_state, &flags, 0);
fac92740 2442
a687059c 2443 if (br == NULL)
a0d0e21e 2444 return(NULL);
830247a4 2445 regtail(pRExC_state, lastbr, br); /* BRANCH -> BRANCH. */
c277df42 2446 lastbr = br;
821b33a5
IZ
2447 if (flags&HASWIDTH)
2448 *flagp |= HASWIDTH;
a687059c 2449 *flagp |= flags&SPSTART;
a0d0e21e
LW
2450 }
2451
c277df42
IZ
2452 if (have_branch || paren != ':') {
2453 /* Make a closing node, and hook it on the end. */
2454 switch (paren) {
2455 case ':':
830247a4 2456 ender = reg_node(pRExC_state, TAIL);
c277df42
IZ
2457 break;
2458 case 1:
830247a4 2459 ender = reganode(pRExC_state, CLOSE, parno);
fac92740
MJD
2460 Set_Node_Offset(ender,RExC_parse+1); /* MJD */
2461 Set_Node_Length(ender,1); /* MJD */
c277df42
IZ
2462 break;
2463 case '<':
c277df42
IZ
2464 case ',':
2465 case '=':
2466 case '!':
c277df42 2467 *flagp &= ~HASWIDTH;
821b33a5
IZ
2468 /* FALL THROUGH */
2469 case '>':
830247a4 2470 ender = reg_node(pRExC_state, SUCCEED);
c277df42
IZ
2471 break;
2472 case 0:
830247a4 2473 ender = reg_node(pRExC_state, END);
c277df42
IZ
2474 break;
2475 }
830247a4 2476 regtail(pRExC_state, lastbr, ender);
a0d0e21e 2477
c277df42
IZ
2478 if (have_branch) {
2479 /* Hook the tails of the branches to the closing node. */
2480 for (br = ret; br != NULL; br = regnext(br)) {
830247a4 2481 regoptail(pRExC_state, br, ender);
c277df42
IZ
2482 }
2483 }
a0d0e21e 2484 }
c277df42
IZ
2485
2486 {
2487 char *p;
2488 static char parens[] = "=!<,>";
2489
2490 if (paren && (p = strchr(parens, paren))) {
eb160463 2491 U8 node = ((p - parens) % 2) ? UNLESSM : IFMATCH;
c277df42
IZ
2492 int flag = (p - parens) > 1;
2493
2494 if (paren == '>')
2495 node = SUSPEND, flag = 0;
830247a4 2496 reginsert(pRExC_state, node,ret);
c277df42 2497 ret->flags = flag;
830247a4 2498 regtail(pRExC_state, ret, reg_node(pRExC_state, TAIL));
c277df42 2499 }
a0d0e21e
LW
2500 }
2501
2502 /* Check for proper termination. */
ce3e6498 2503 if (paren) {
e2509266 2504 RExC_flags = oregflags;
830247a4
IZ
2505 if (RExC_parse >= RExC_end || *nextchar(pRExC_state) != ')') {
2506 RExC_parse = oregcomp_parse;
380a0633 2507 vFAIL("Unmatched (");
ce3e6498 2508 }
a0ed51b3 2509 }
830247a4
IZ
2510 else if (!paren && RExC_parse < RExC_end) {
2511 if (*RExC_parse == ')') {
2512 RExC_parse++;
380a0633 2513 vFAIL("Unmatched )");
a0ed51b3
LW
2514 }
2515 else
b45f050a 2516 FAIL("Junk on end of regexp"); /* "Can't happen". */
a0d0e21e
LW
2517 /* NOTREACHED */
2518 }
a687059c 2519
a0d0e21e 2520 return(ret);
a687059c
LW
2521}
2522
2523/*
2524 - regbranch - one alternative of an | operator
2525 *
2526 * Implements the concatenation operator.
2527 */
76e3520e 2528STATIC regnode *
830247a4 2529S_regbranch(pTHX_ RExC_state_t *pRExC_state, I32 *flagp, I32 first)
a687059c 2530{
c277df42
IZ
2531 register regnode *ret;
2532 register regnode *chain = NULL;
2533 register regnode *latest;
2534 I32 flags = 0, c = 0;
a0d0e21e 2535
b81d288d 2536 if (first)
c277df42
IZ
2537 ret = NULL;
2538 else {
b81d288d 2539 if (!SIZE_ONLY && RExC_extralen)
830247a4 2540 ret = reganode(pRExC_state, BRANCHJ,0);
fac92740 2541 else {
830247a4 2542 ret = reg_node(pRExC_state, BRANCH);
fac92740
MJD
2543 Set_Node_Length(ret, 1);
2544 }
c277df42
IZ
2545 }
2546
b81d288d 2547 if (!first && SIZE_ONLY)
830247a4 2548 RExC_extralen += 1; /* BRANCHJ */
b81d288d 2549
c277df42 2550 *flagp = WORST; /* Tentatively. */
a0d0e21e 2551
830247a4
IZ
2552 RExC_parse--;
2553 nextchar(pRExC_state);
2554 while (RExC_parse < RExC_end && *RExC_parse != '|' && *RExC_parse != ')') {
a0d0e21e 2555 flags &= ~TRYAGAIN;
830247a4 2556 latest = regpiece(pRExC_state, &flags);
a0d0e21e
LW
2557 if (latest == NULL) {
2558 if (flags & TRYAGAIN)
2559 continue;
2560 return(NULL);
a0ed51b3
LW
2561 }
2562 else if (ret == NULL)
c277df42 2563 ret = latest;
a0d0e21e 2564 *flagp |= flags&HASWIDTH;
c277df42 2565 if (chain == NULL) /* First piece. */
a0d0e21e
LW
2566 *flagp |= flags&SPSTART;
2567 else {
830247a4
IZ
2568 RExC_naughty++;
2569 regtail(pRExC_state, chain, latest);
a687059c 2570 }
a0d0e21e 2571 chain = latest;
c277df42
IZ
2572 c++;
2573 }
2574 if (chain == NULL) { /* Loop ran zero times. */
830247a4 2575 chain = reg_node(pRExC_state, NOTHING);
c277df42
IZ
2576 if (ret == NULL)
2577 ret = chain;
2578 }
2579 if (c == 1) {
2580 *flagp |= flags&SIMPLE;
a0d0e21e 2581 }
a687059c 2582
a0d0e21e 2583 return(ret);
a687059c
LW
2584}
2585
2586/*
2587 - regpiece - something followed by possible [*+?]
2588 *
2589 * Note that the branching code sequences used for ? and the general cases
2590 * of * and + are somewhat optimized: they use the same NOTHING node as
2591 * both the endmarker for their branch list and the body of the last branch.
2592 * It might seem that this node could be dispensed with entirely, but the
2593 * endmarker role is not redundant.
2594 */
76e3520e 2595STATIC regnode *
830247a4 2596S_regpiece(pTHX_ RExC_state_t *pRExC_state, I32 *flagp)
a687059c 2597{
c277df42 2598 register regnode *ret;
a0d0e21e
LW
2599 register char op;
2600 register char *next;
2601 I32 flags;
830247a4 2602 char *origparse = RExC_parse;
a0d0e21e
LW
2603 char *maxpos;
2604 I32 min;
c277df42 2605 I32 max = REG_INFTY;
fac92740 2606 char *parse_start;
a0d0e21e 2607
830247a4 2608 ret = regatom(pRExC_state, &flags);
a0d0e21e
LW
2609 if (ret == NULL) {
2610 if (flags & TRYAGAIN)
2611 *flagp |= TRYAGAIN;
2612 return(NULL);
2613 }
2614
830247a4 2615 op = *RExC_parse;
a0d0e21e 2616
830247a4 2617 if (op == '{' && regcurly(RExC_parse)) {
fac92740 2618 parse_start = RExC_parse; /* MJD */
830247a4 2619 next = RExC_parse + 1;
a0d0e21e
LW
2620 maxpos = Nullch;
2621 while (isDIGIT(*next) || *next == ',') {
2622 if (*next == ',') {
2623 if (maxpos)
2624 break;
2625 else
2626 maxpos = next;
a687059c 2627 }
a0d0e21e
LW
2628 next++;
2629 }
2630 if (*next == '}') { /* got one */
2631 if (!maxpos)
2632 maxpos = next;
830247a4
IZ
2633 RExC_parse++;
2634 min = atoi(RExC_parse);
a0d0e21e
LW
2635 if (*maxpos == ',')
2636 maxpos++;
2637 else
830247a4 2638 maxpos = RExC_parse;
a0d0e21e
LW
2639 max = atoi(maxpos);
2640 if (!max && *maxpos != '0')
c277df42
IZ
2641 max = REG_INFTY; /* meaning "infinity" */
2642 else if (max >= REG_INFTY)
8615cb43 2643 vFAIL2("Quantifier in {,} bigger than %d", REG_INFTY - 1);
830247a4
IZ
2644 RExC_parse = next;
2645 nextchar(pRExC_state);
a0d0e21e
LW
2646
2647 do_curly:
2648 if ((flags&SIMPLE)) {
830247a4
IZ
2649 RExC_naughty += 2 + RExC_naughty / 2;
2650 reginsert(pRExC_state, CURLY, ret);
fac92740
MJD
2651 Set_Node_Offset(ret, parse_start+1); /* MJD */
2652 Set_Node_Cur_Length(ret);
a0d0e21e
LW
2653 }
2654 else {
830247a4 2655 regnode *w = reg_node(pRExC_state, WHILEM);
2c2d71f5
JH
2656
2657 w->flags = 0;
830247a4
IZ
2658 regtail(pRExC_state, ret, w);
2659 if (!SIZE_ONLY && RExC_extralen) {
2660 reginsert(pRExC_state, LONGJMP,ret);
2661 reginsert(pRExC_state, NOTHING,ret);
c277df42
IZ
2662 NEXT_OFF(ret) = 3; /* Go over LONGJMP. */
2663 }
830247a4 2664 reginsert(pRExC_state, CURLYX,ret);
fac92740
MJD
2665 /* MJD hk */
2666 Set_Node_Offset(ret, parse_start+1);
2667 Set_Node_Length(ret,
2668 op == '{' ? (RExC_parse - parse_start) : 1);
2669
830247a4 2670 if (!SIZE_ONLY && RExC_extralen)
c277df42 2671 NEXT_OFF(ret) = 3; /* Go over NOTHING to LONGJMP. */
830247a4 2672 regtail(pRExC_state, ret, reg_node(pRExC_state, NOTHING));
c277df42 2673 if (SIZE_ONLY)
830247a4
IZ
2674 RExC_whilem_seen++, RExC_extralen += 3;
2675 RExC_naughty += 4 + RExC_naughty; /* compound interest */
a0d0e21e 2676 }
c277df42 2677 ret->flags = 0;
a0d0e21e
LW
2678
2679 if (min > 0)
821b33a5
IZ
2680 *flagp = WORST;
2681 if (max > 0)
2682 *flagp |= HASWIDTH;
a0d0e21e 2683 if (max && max < min)
8615cb43 2684 vFAIL("Can't do {n,m} with n > m");
c277df42 2685 if (!SIZE_ONLY) {
eb160463
GS
2686 ARG1_SET(ret, (U16)min);
2687 ARG2_SET(ret, (U16)max);
a687059c 2688 }
a687059c 2689
a0d0e21e 2690 goto nest_check;
a687059c 2691 }
a0d0e21e 2692 }
a687059c 2693
a0d0e21e
LW
2694 if (!ISMULT1(op)) {
2695 *flagp = flags;
a687059c 2696 return(ret);
a0d0e21e 2697 }
bb20fd44 2698
c277df42 2699#if 0 /* Now runtime fix should be reliable. */
b45f050a
JF
2700
2701 /* if this is reinstated, don't forget to put this back into perldiag:
2702
2703 =item Regexp *+ operand could be empty at {#} in regex m/%s/
2704
2705 (F) The part of the regexp subject to either the * or + quantifier
2706 could match an empty string. The {#} shows in the regular
2707 expression about where the problem was discovered.
2708
2709 */
2710
bb20fd44 2711 if (!(flags&HASWIDTH) && op != '?')
b45f050a 2712 vFAIL("Regexp *+ operand could be empty");
b81d288d 2713#endif
bb20fd44 2714
fac92740 2715 parse_start = RExC_parse;
830247a4 2716 nextchar(pRExC_state);
a0d0e21e 2717
821b33a5 2718 *flagp = (op != '+') ? (WORST|SPSTART|HASWIDTH) : (WORST|HASWIDTH);
a0d0e21e
LW
2719
2720 if (op == '*' && (flags&SIMPLE)) {
830247a4 2721 reginsert(pRExC_state, STAR, ret);
c277df42 2722 ret->flags = 0;
830247a4 2723 RExC_naughty += 4;
a0d0e21e
LW
2724 }
2725 else if (op == '*') {
2726 min = 0;
2727 goto do_curly;
a0ed51b3
LW
2728 }
2729 else if (op == '+' && (flags&SIMPLE)) {
830247a4 2730 reginsert(pRExC_state, PLUS, ret);
c277df42 2731 ret->flags = 0;
830247a4 2732 RExC_naughty += 3;
a0d0e21e
LW
2733 }
2734 else if (op == '+') {
2735 min = 1;
2736 goto do_curly;
a0ed51b3
LW
2737 }
2738 else if (op == '?') {
a0d0e21e
LW
2739 min = 0; max = 1;
2740 goto do_curly;
2741 }
2742 nest_check:
e476b1b5 2743 if (ckWARN(WARN_REGEXP) && !SIZE_ONLY && !(flags&HASWIDTH) && max > REG_INFTY/3) {
830247a4 2744 vWARN3(RExC_parse,
b45f050a 2745 "%.*s matches null string many times",
830247a4 2746 RExC_parse - origparse,
b45f050a 2747 origparse);
a0d0e21e
LW
2748 }
2749
830247a4
IZ
2750 if (*RExC_parse == '?') {
2751 nextchar(pRExC_state);
2752 reginsert(pRExC_state, MINMOD, ret);
2753 regtail(pRExC_state, ret, ret + NODE_STEP_REGNODE);
a0d0e21e 2754 }
830247a4
IZ
2755 if (ISMULT2(RExC_parse)) {
2756 RExC_parse++;
b45f050a
JF
2757 vFAIL("Nested quantifiers");
2758 }
a0d0e21e
LW
2759
2760 return(ret);
a687059c
LW
2761}
2762
2763/*
2764 - regatom - the lowest level
2765 *
2766 * Optimization: gobbles an entire sequence of ordinary characters so that
2767 * it can turn them into a single node, which is smaller to store and
2768 * faster to run. Backslashed characters are exceptions, each becoming a
2769 * separate node; the code is simpler that way and it's not worth fixing.
2770 *
b45f050a 2771 * [Yes, it is worth fixing, some scripts can run twice the speed.] */
76e3520e 2772STATIC regnode *
830247a4 2773S_regatom(pTHX_ RExC_state_t *pRExC_state, I32 *flagp)
a687059c 2774{
c277df42 2775 register regnode *ret = 0;
a0d0e21e 2776 I32 flags;
f06dbbb7 2777 char *parse_start = 0;
a0d0e21e
LW
2778
2779 *flagp = WORST; /* Tentatively. */
2780
2781tryagain:
830247a4 2782 switch (*RExC_parse) {
a0d0e21e 2783 case '^':
830247a4
IZ
2784 RExC_seen_zerolen++;
2785 nextchar(pRExC_state);
e2509266 2786 if (RExC_flags & PMf_MULTILINE)
830247a4 2787 ret = reg_node(pRExC_state, MBOL);
e2509266 2788 else if (RExC_flags & PMf_SINGLELINE)
830247a4 2789 ret = reg_node(pRExC_state, SBOL);
a0d0e21e 2790 else
830247a4 2791 ret = reg_node(pRExC_state, BOL);
fac92740 2792 Set_Node_Length(ret, 1); /* MJD */
a0d0e21e
LW
2793 break;
2794 case '$':
830247a4 2795 nextchar(pRExC_state);
b81d288d 2796 if (*RExC_parse)
830247a4 2797 RExC_seen_zerolen++;
e2509266 2798 if (RExC_flags & PMf_MULTILINE)
830247a4 2799 ret = reg_node(pRExC_state, MEOL);
e2509266 2800 else if (RExC_flags & PMf_SINGLELINE)
830247a4 2801 ret = reg_node(pRExC_state, SEOL);
a0d0e21e 2802 else
830247a4 2803 ret = reg_node(pRExC_state, EOL);
fac92740 2804 Set_Node_Length(ret, 1); /* MJD */
a0d0e21e
LW
2805 break;
2806 case '.':
830247a4 2807 nextchar(pRExC_state);
e2509266 2808 if (RExC_flags & PMf_SINGLELINE)
ffc61ed2
JH
2809 ret = reg_node(pRExC_state, SANY);
2810 else
2811 ret = reg_node(pRExC_state, REG_ANY);
2812 *flagp |= HASWIDTH|SIMPLE;
830247a4 2813 RExC_naughty++;
fac92740 2814 Set_Node_Length(ret, 1); /* MJD */
a0d0e21e
LW
2815 break;
2816 case '[':
b45f050a 2817 {
830247a4 2818 char *oregcomp_parse = ++RExC_parse;
ffc61ed2 2819 ret = regclass(pRExC_state);
830247a4
IZ
2820 if (*RExC_parse != ']') {
2821 RExC_parse = oregcomp_parse;
b45f050a
JF
2822 vFAIL("Unmatched [");
2823 }
830247a4 2824 nextchar(pRExC_state);
a0d0e21e 2825 *flagp |= HASWIDTH|SIMPLE;
fac92740 2826 Set_Node_Length(ret, RExC_parse - oregcomp_parse + 1); /* MJD */
a0d0e21e 2827 break;
b45f050a 2828 }
a0d0e21e 2829 case '(':
830247a4
IZ
2830 nextchar(pRExC_state);
2831 ret = reg(pRExC_state, 1, &flags);
a0d0e21e 2832 if (ret == NULL) {
bf93d4cc 2833 if (flags & TRYAGAIN) {
830247a4 2834 if (RExC_parse == RExC_end) {
bf93d4cc
GS
2835 /* Make parent create an empty node if needed. */
2836 *flagp |= TRYAGAIN;
2837 return(NULL);
2838 }
a0d0e21e 2839 goto tryagain;
bf93d4cc 2840 }
a0d0e21e
LW
2841 return(NULL);
2842 }
c277df42 2843 *flagp |= flags&(HASWIDTH|SPSTART|SIMPLE);
a0d0e21e
LW
2844 break;
2845 case '|':
2846 case ')':
2847 if (flags & TRYAGAIN) {
2848 *flagp |= TRYAGAIN;
2849 return NULL;
2850 }
b45f050a 2851 vFAIL("Internal urp");
a0d0e21e
LW
2852 /* Supposed to be caught earlier. */
2853 break;
85afd4ae 2854 case '{':
830247a4
IZ
2855 if (!regcurly(RExC_parse)) {
2856 RExC_parse++;
85afd4ae
CS
2857 goto defchar;
2858 }
2859 /* FALL THROUGH */
a0d0e21e
LW
2860 case '?':
2861 case '+':
2862 case '*':
830247a4 2863 RExC_parse++;
b45f050a 2864 vFAIL("Quantifier follows nothing");
a0d0e21e
LW
2865 break;
2866 case '\\':
830247a4 2867 switch (*++RExC_parse) {
a0d0e21e 2868 case 'A':
830247a4
IZ
2869 RExC_seen_zerolen++;
2870 ret = reg_node(pRExC_state, SBOL);
a0d0e21e 2871 *flagp |= SIMPLE;
830247a4 2872 nextchar(pRExC_state);
fac92740 2873 Set_Node_Length(ret, 2); /* MJD */
a0d0e21e
LW
2874 break;
2875 case 'G':
830247a4
IZ
2876 ret = reg_node(pRExC_state, GPOS);
2877 RExC_seen |= REG_SEEN_GPOS;
a0d0e21e 2878 *flagp |= SIMPLE;
830247a4 2879 nextchar(pRExC_state);
fac92740 2880 Set_Node_Length(ret, 2); /* MJD */
a0d0e21e
LW
2881 break;
2882 case 'Z':
830247a4 2883 ret = reg_node(pRExC_state, SEOL);
a0d0e21e 2884 *flagp |= SIMPLE;
a1917ab9 2885 RExC_seen_zerolen++; /* Do not optimize RE away */
830247a4 2886 nextchar(pRExC_state);
a0d0e21e 2887 break;
b85d18e9 2888 case 'z':
830247a4 2889 ret = reg_node(pRExC_state, EOS);
b85d18e9 2890 *flagp |= SIMPLE;
830247a4
IZ
2891 RExC_seen_zerolen++; /* Do not optimize RE away */
2892 nextchar(pRExC_state);
fac92740 2893 Set_Node_Length(ret, 2); /* MJD */
b85d18e9 2894 break;
4a2d328f 2895 case 'C':
f33976b4
DB
2896 ret = reg_node(pRExC_state, CANY);
2897 RExC_seen |= REG_SEEN_CANY;
a0ed51b3 2898 *flagp |= HASWIDTH|SIMPLE;
830247a4 2899 nextchar(pRExC_state);
fac92740 2900 Set_Node_Length(ret, 2); /* MJD */
a0ed51b3
LW
2901 break;
2902 case 'X':
830247a4 2903 ret = reg_node(pRExC_state, CLUMP);
a0ed51b3 2904 *flagp |= HASWIDTH;
830247a4 2905 nextchar(pRExC_state);
fac92740 2906 Set_Node_Length(ret, 2); /* MJD */
a0ed51b3 2907 break;
a0d0e21e 2908 case 'w':
eb160463 2909 ret = reg_node(pRExC_state, (U8)(LOC ? ALNUML : ALNUM));
a0d0e21e 2910 *flagp |= HASWIDTH|SIMPLE;
830247a4 2911 nextchar(pRExC_state);
fac92740 2912 Set_Node_Length(ret, 2); /* MJD */
a0d0e21e
LW
2913 break;
2914 case 'W':
eb160463 2915 ret = reg_node(pRExC_state, (U8)(LOC ? NALNUML : NALNUM));
a0d0e21e 2916 *flagp |= HASWIDTH|SIMPLE;
830247a4 2917 nextchar(pRExC_state);
fac92740 2918 Set_Node_Length(ret, 2); /* MJD */
a0d0e21e
LW
2919 break;
2920 case 'b':
830247a4
IZ
2921 RExC_seen_zerolen++;
2922 RExC_seen |= REG_SEEN_LOOKBEHIND;
eb160463 2923 ret = reg_node(pRExC_state, (U8)(LOC ? BOUNDL : BOUND));
a0d0e21e 2924 *flagp |= SIMPLE;
830247a4 2925 nextchar(pRExC_state);
fac92740 2926 Set_Node_Length(ret, 2); /* MJD */
a0d0e21e
LW
2927 break;
2928 case 'B':
830247a4
IZ
2929 RExC_seen_zerolen++;
2930 RExC_seen |= REG_SEEN_LOOKBEHIND;
eb160463 2931 ret = reg_node(pRExC_state, (U8)(LOC ? NBOUNDL : NBOUND));
a0d0e21e 2932 *flagp |= SIMPLE;
830247a4 2933 nextchar(pRExC_state);
fac92740 2934 Set_Node_Length(ret, 2); /* MJD */
a0d0e21e
LW
2935 break;
2936 case 's':
eb160463 2937 ret = reg_node(pRExC_state, (U8)(LOC ? SPACEL : SPACE));
a0d0e21e 2938 *flagp |= HASWIDTH|SIMPLE;
830247a4 2939 nextchar(pRExC_state);
fac92740 2940 Set_Node_Length(ret, 2); /* MJD */
a0d0e21e
LW
2941 break;
2942 case 'S':
eb160463 2943 ret = reg_node(pRExC_state, (U8)(LOC ? NSPACEL : NSPACE));
a0d0e21e 2944 *flagp |= HASWIDTH|SIMPLE;
830247a4 2945 nextchar(pRExC_state);
fac92740 2946 Set_Node_Length(ret, 2); /* MJD */
a0d0e21e
LW
2947 break;
2948 case 'd':
ffc61ed2 2949 ret = reg_node(pRExC_state, DIGIT);
a0d0e21e 2950 *flagp |= HASWIDTH|SIMPLE;
830247a4 2951 nextchar(pRExC_state);
fac92740 2952 Set_Node_Length(ret, 2); /* MJD */
a0d0e21e
LW
2953 break;
2954 case 'D':
ffc61ed2 2955 ret = reg_node(pRExC_state, NDIGIT);
a0d0e21e 2956 *flagp |= HASWIDTH|SIMPLE;
830247a4 2957 nextchar(pRExC_state);
fac92740 2958 Set_Node_Length(ret, 2); /* MJD */
a0d0e21e 2959 break;
a14b48bc
LW
2960 case 'p':
2961 case 'P':
3568d838 2962 {
830247a4 2963 char* oldregxend = RExC_end;
fac92740 2964 char* parse_start = RExC_parse;
a14b48bc 2965
830247a4 2966 if (RExC_parse[1] == '{') {
3568d838 2967 /* a lovely hack--pretend we saw [\pX] instead */
830247a4
IZ
2968 RExC_end = strchr(RExC_parse, '}');
2969 if (!RExC_end) {
0da60cf5 2970 U8 c = (U8)*RExC_parse;
830247a4
IZ
2971 RExC_parse += 2;
2972 RExC_end = oldregxend;
0da60cf5 2973 vFAIL2("Missing right brace on \\%c{}", c);
b45f050a 2974 }
830247a4 2975 RExC_end++;
a14b48bc 2976 }
af6f566e 2977 else {
830247a4 2978 RExC_end = RExC_parse + 2;
af6f566e
HS
2979 if (RExC_end > oldregxend)
2980 RExC_end = oldregxend;
2981 }
830247a4 2982 RExC_parse--;
a14b48bc 2983
ffc61ed2 2984 ret = regclass(pRExC_state);
a14b48bc 2985
830247a4
IZ
2986 RExC_end = oldregxend;
2987 RExC_parse--;
fac92740 2988 Set_Node_Cur_Length(ret); /* MJD */
830247a4 2989 nextchar(pRExC_state);
a14b48bc
LW
2990 *flagp |= HASWIDTH|SIMPLE;
2991 }
2992 break;
a0d0e21e
LW
2993 case 'n':
2994 case 'r':
2995 case 't':
2996 case 'f':
2997 case 'e':
2998 case 'a':
2999 case 'x':
3000 case 'c':
3001 case '0':
3002 goto defchar;
3003 case '1': case '2': case '3': case '4':
3004 case '5': case '6': case '7': case '8': case '9':
3005 {
830247a4 3006 I32 num = atoi(RExC_parse);
a0d0e21e 3007
830247a4 3008 if (num > 9 && num >= RExC_npar)
a0d0e21e
LW
3009 goto defchar;
3010 else {
fac92740 3011 char * parse_start = RExC_parse - 1; /* MJD */
830247a4
IZ
3012 while (isDIGIT(*RExC_parse))
3013 RExC_parse++;
b45f050a 3014
eb160463 3015 if (!SIZE_ONLY && num > (I32)RExC_rx->nparens)
9baa0206 3016 vFAIL("Reference to nonexistent group");
830247a4 3017 RExC_sawback = 1;
eb160463
GS
3018 ret = reganode(pRExC_state,
3019 (U8)(FOLD ? (LOC ? REFFL : REFF) : REF),
3020 num);
a0d0e21e 3021 *flagp |= HASWIDTH;
fac92740
MJD
3022
3023 /* override incorrect value set in reganode MJD */
3024 Set_Node_Offset(ret, parse_start+1);
3025 Set_Node_Cur_Length(ret); /* MJD */
830247a4
IZ
3026 RExC_parse--;
3027 nextchar(pRExC_state);
a0d0e21e
LW
3028 }
3029 }
3030 break;
3031 case '\0':
830247a4 3032 if (RExC_parse >= RExC_end)
b45f050a 3033 FAIL("Trailing \\");
a0d0e21e
LW
3034 /* FALL THROUGH */
3035 default:
c9f97d15
IZ
3036 /* Do not generate `unrecognized' warnings here, we fall
3037 back into the quick-grab loop below */
a0d0e21e
LW
3038 goto defchar;
3039 }
3040 break;
4633a7c4
LW
3041
3042 case '#':
e2509266 3043 if (RExC_flags & PMf_EXTENDED) {
830247a4
IZ
3044 while (RExC_parse < RExC_end && *RExC_parse != '\n') RExC_parse++;
3045 if (RExC_parse < RExC_end)
4633a7c4
LW
3046 goto tryagain;
3047 }
3048 /* FALL THROUGH */
3049
a0d0e21e 3050 default: {
ba210ebe 3051 register STRLEN len;
58ae7d3f 3052 register UV ender;
a0d0e21e 3053 register char *p;
c277df42 3054 char *oldp, *s;
ba210ebe 3055 STRLEN numlen;
80aecb99 3056 STRLEN foldlen;
60a8b682 3057 U8 tmpbuf[UTF8_MAXLEN_FOLD+1], *foldbuf;
f06dbbb7
JH
3058
3059 parse_start = RExC_parse - 1;
a0d0e21e 3060
830247a4 3061 RExC_parse++;
a0d0e21e
LW
3062
3063 defchar:
58ae7d3f 3064 ender = 0;
eb160463
GS
3065 ret = reg_node(pRExC_state,
3066 (U8)(FOLD ? (LOC ? EXACTFL : EXACTF) : EXACT));
cd439c50 3067 s = STRING(ret);
830247a4
IZ
3068 for (len = 0, p = RExC_parse - 1;
3069 len < 127 && p < RExC_end;
a0d0e21e
LW
3070 len++)
3071 {
3072 oldp = p;
5b5a24f7 3073
e2509266 3074 if (RExC_flags & PMf_EXTENDED)
830247a4 3075 p = regwhite(p, RExC_end);
a0d0e21e
LW
3076 switch (*p) {
3077 case '^':
3078 case '$':
3079 case '.':
3080 case '[':
3081 case '(':
3082 case ')':
3083 case '|':
3084 goto loopdone;
3085 case '\\':
3086 switch (*++p) {
3087 case 'A':
1ed8eac0
JF
3088 case 'C':
3089 case 'X':
a0d0e21e
LW
3090 case 'G':
3091 case 'Z':
b85d18e9 3092 case 'z':
a0d0e21e
LW
3093 case 'w':
3094 case 'W':
3095 case 'b':
3096 case 'B':
3097 case 's':
3098 case 'S':
3099 case 'd':
3100 case 'D':
a14b48bc
LW
3101 case 'p':
3102 case 'P':
a0d0e21e
LW
3103 --p;
3104 goto loopdone;
3105 case 'n':
3106 ender = '\n';
3107 p++;
a687059c 3108 break;
a0d0e21e
LW
3109 case 'r':
3110 ender = '\r';
3111 p++;
a687059c 3112 break;
a0d0e21e
LW
3113 case 't':
3114 ender = '\t';
3115 p++;
a687059c 3116 break;
a0d0e21e
LW
3117 case 'f':
3118 ender = '\f';
3119 p++;
a687059c 3120 break;
a0d0e21e 3121 case 'e':
c7f1f016 3122 ender = ASCII_TO_NATIVE('\033');
a0d0e21e 3123 p++;
a687059c 3124 break;
a0d0e21e 3125 case 'a':
c7f1f016 3126 ender = ASCII_TO_NATIVE('\007');
a0d0e21e 3127 p++;
a687059c 3128 break;
a0d0e21e 3129 case 'x':
a0ed51b3
LW
3130 if (*++p == '{') {
3131 char* e = strchr(p, '}');
b81d288d 3132
b45f050a 3133 if (!e) {
830247a4 3134 RExC_parse = p + 1;
b45f050a
JF
3135 vFAIL("Missing right brace on \\x{}");
3136 }
de5f0749 3137 else {
a4c04bdc
NC
3138 I32 flags = PERL_SCAN_ALLOW_UNDERSCORES
3139 | PERL_SCAN_DISALLOW_PREFIX;
53305cf1
NC
3140 numlen = e - p - 1;
3141 ender = grok_hex(p + 1, &numlen, &flags, NULL);
aaa80028
JH
3142 if (ender > 0xff)
3143 RExC_utf8 = 1;
b21ed0a9
GS
3144 /* numlen is generous */
3145 if (numlen + len >= 127) {
a0ed51b3
LW
3146 p--;
3147 goto loopdone;
3148 }
3149 p = e + 1;
3150 }
a0ed51b3
LW
3151 }
3152 else {
a4c04bdc 3153 I32 flags = PERL_SCAN_DISALLOW_PREFIX;
53305cf1
NC
3154 numlen = 2;
3155 ender = grok_hex(p, &numlen, &flags, NULL);
a0ed51b3
LW
3156 p += numlen;
3157 }
a687059c 3158 break;
a0d0e21e
LW
3159 case 'c':
3160 p++;
bbce6d69 3161 ender = UCHARAT(p++);
3162 ender = toCTRL(ender);
a687059c 3163 break;
a0d0e21e
LW
3164 case '0': case '1': case '2': case '3':case '4':
3165 case '5': case '6': case '7': case '8':case '9':
3166 if (*p == '0' ||
830247a4 3167 (isDIGIT(p[1]) && atoi(p) >= RExC_npar) ) {
53305cf1
NC
3168 I32 flags = 0;
3169 numlen = 3;
3170 ender = grok_oct(p, &numlen, &flags, NULL);
a0d0e21e
LW
3171 p += numlen;
3172 }
3173 else {
3174 --p;
3175 goto loopdone;
a687059c
LW
3176 }
3177 break;
a0d0e21e 3178 case '\0':
830247a4 3179 if (p >= RExC_end)
b45f050a 3180 FAIL("Trailing \\");
a687059c 3181 /* FALL THROUGH */
a0d0e21e 3182 default:
e476b1b5 3183 if (!SIZE_ONLY && ckWARN(WARN_REGEXP) && isALPHA(*p))
4193bef7 3184 vWARN2(p + 1, "Unrecognized escape \\%c passed through", UCHARAT(p));
a0ed51b3 3185 goto normal_default;
a0d0e21e
LW
3186 }
3187 break;
a687059c 3188 default:
a0ed51b3 3189 normal_default:
fd400ab9 3190 if (UTF8_IS_START(*p) && UTF) {
5e12f4fb 3191 ender = utf8n_to_uvchr((U8*)p, RExC_end - p,
ba210ebe 3192 &numlen, 0);
a0ed51b3
LW
3193 p += numlen;
3194 }
3195 else
3196 ender = *p++;
a0d0e21e 3197 break;
a687059c 3198 }
e2509266 3199 if (RExC_flags & PMf_EXTENDED)
830247a4 3200 p = regwhite(p, RExC_end);
60a8b682
JH
3201 if (UTF && FOLD) {
3202 /* Prime the casefolded buffer. */
ac7e0132 3203 ender = toFOLD_uni(ender, tmpbuf, &foldlen);
60a8b682 3204 }
a0d0e21e
LW
3205 if (ISMULT2(p)) { /* Back off on ?+*. */
3206 if (len)
3207 p = oldp;
16ea2a2e 3208 else if (UTF) {
0ebc6274
JH
3209 STRLEN unilen;
3210
80aecb99 3211 if (FOLD) {
60a8b682 3212 /* Emit all the Unicode characters. */
80aecb99
JH
3213 for (foldbuf = tmpbuf;
3214 foldlen;
3215 foldlen -= numlen) {
3216 ender = utf8_to_uvchr(foldbuf, &numlen);
9dc45d57 3217 if (numlen > 0) {
0ebc6274
JH
3218 reguni(pRExC_state, ender, s, &unilen);
3219 s += unilen;
3220 len += unilen;
3221 /* In EBCDIC the numlen
3222 * and unilen can differ. */
9dc45d57 3223 foldbuf += numlen;
47654450
JH
3224 if (numlen >= foldlen)
3225 break;
9dc45d57
JH
3226 }
3227 else
3228 break; /* "Can't happen." */
80aecb99
JH
3229 }
3230 }
3231 else {
0ebc6274 3232 reguni(pRExC_state, ender, s, &unilen);
9ede7db1 3233 if (unilen > 0) {
0ebc6274
JH
3234 s += unilen;
3235 len += unilen;
9dc45d57 3236 }
80aecb99 3237 }
a0ed51b3 3238 }
a0d0e21e
LW
3239 else {
3240 len++;
eb160463 3241 REGC((char)ender, s++);
a0d0e21e
LW
3242 }
3243 break;
a687059c 3244 }
16ea2a2e 3245 if (UTF) {
0ebc6274
JH
3246 STRLEN unilen;
3247
80aecb99 3248 if (FOLD) {
60a8b682 3249 /* Emit all the Unicode characters. */
80aecb99
JH
3250 for (foldbuf = tmpbuf;
3251 foldlen;
3252 foldlen -= numlen) {
3253 ender = utf8_to_uvchr(foldbuf, &numlen);
9dc45d57 3254 if (numlen > 0) {
0ebc6274
JH
3255 reguni(pRExC_state, ender, s, &unilen);
3256 len += unilen;
3257 s += unilen;
3258 /* In EBCDIC the numlen
3259 * and unilen can differ. */
9dc45d57 3260 foldbuf += numlen;
47654450
JH
3261 if (numlen >= foldlen)
3262 break;
9dc45d57
JH
3263 }
3264 else
3265 break;
80aecb99
JH
3266 }
3267 }
3268 else {
0ebc6274 3269 reguni(pRExC_state, ender, s, &unilen);
9ede7db1 3270 if (unilen > 0) {
0ebc6274
JH
3271 s += unilen;
3272 len += unilen;
9dc45d57 3273 }
80aecb99
JH
3274 }
3275 len--;
a0ed51b3
LW
3276 }
3277 else
eb160463 3278 REGC((char)ender, s++);
a0d0e21e
LW
3279 }
3280 loopdone:
830247a4 3281 RExC_parse = p - 1;
fac92740 3282 Set_Node_Cur_Length(ret); /* MJD */
830247a4 3283 nextchar(pRExC_state);
793db0cb
JH
3284 {
3285 /* len is STRLEN which is unsigned, need to copy to signed */
3286 IV iv = len;
3287 if (iv < 0)
3288 vFAIL("Internal disaster");
3289 }
a0d0e21e
LW
3290 if (len > 0)
3291 *flagp |= HASWIDTH;
3292 if (len == 1)
3293 *flagp |= SIMPLE;
c277df42 3294 if (!SIZE_ONLY)
cd439c50
IZ
3295 STR_LEN(ret) = len;
3296 if (SIZE_ONLY)
830247a4 3297 RExC_size += STR_SZ(len);
cd439c50 3298 else
830247a4 3299 RExC_emit += STR_SZ(len);
a687059c 3300 }
a0d0e21e
LW
3301 break;
3302 }
a687059c 3303
60a8b682
JH
3304 /* If the encoding pragma is in effect recode the text of
3305 * any EXACT-kind nodes. */
22c54be3 3306 if (PL_encoding && PL_regkind[(U8)OP(ret)] == EXACT) {
a72c7584
JH
3307 STRLEN oldlen = STR_LEN(ret);
3308 SV *sv = sv_2mortal(newSVpvn(STRING(ret), oldlen));
22c54be3
JH
3309
3310 if (RExC_utf8)
3311 SvUTF8_on(sv);
3312 if (sv_utf8_downgrade(sv, TRUE)) {
799ef3cb 3313 char *s = sv_recode_to_utf8(sv, PL_encoding);
22c54be3
JH
3314 STRLEN newlen = SvCUR(sv);
3315
3316 if (!SIZE_ONLY) {
3317 DEBUG_r(PerlIO_printf(Perl_debug_log, "recode %*s to %*s\n",
3318 (int)oldlen, STRING(ret),
3319 (int)newlen, s));
3320 Copy(s, STRING(ret), newlen, char);
3321 STR_LEN(ret) += newlen - oldlen;
3322 RExC_emit += STR_SZ(newlen) - STR_SZ(oldlen);
3323 } else
3324 RExC_size += STR_SZ(newlen) - STR_SZ(oldlen);
3325 }
a72c7584
JH
3326 }
3327
a0d0e21e 3328 return(ret);
a687059c
LW
3329}
3330
873ef191 3331STATIC char *
cea2e8a9 3332S_regwhite(pTHX_ char *p, char *e)
5b5a24f7
CS
3333{
3334 while (p < e) {
3335 if (isSPACE(*p))
3336 ++p;
3337 else if (*p == '#') {
3338 do {
3339 p++;
3340 } while (p < e && *p != '\n');
3341 }
3342 else
3343 break;
3344 }
3345 return p;
3346}
3347
b8c5462f
JH
3348/* Parse POSIX character classes: [[:foo:]], [[=foo=]], [[.foo.]].
3349 Character classes ([:foo:]) can also be negated ([:^foo:]).
3350 Returns a named class id (ANYOF_XXX) if successful, -1 otherwise.
3351 Equivalence classes ([=foo=]) and composites ([.foo.]) are parsed,
beeb77fc 3352 but trigger failures because they are currently unimplemented. */
9a86a77b
JH
3353
3354#define POSIXCC_DONE(c) ((c) == ':')
3355#define POSIXCC_NOTYET(c) ((c) == '=' || (c) == '.')
3356#define POSIXCC(c) (POSIXCC_DONE(c) || POSIXCC_NOTYET(c))
3357
b8c5462f 3358STATIC I32
830247a4 3359S_regpposixcc(pTHX_ RExC_state_t *pRExC_state, I32 value)
620e46c5
JH
3360{
3361 char *posixcc = 0;
936ed897 3362 I32 namedclass = OOB_NAMEDCLASS;
620e46c5 3363
830247a4 3364 if (value == '[' && RExC_parse + 1 < RExC_end &&
620e46c5 3365 /* I smell either [: or [= or [. -- POSIX has been here, right? */
9a86a77b
JH
3366 POSIXCC(UCHARAT(RExC_parse))) {
3367 char c = UCHARAT(RExC_parse);
830247a4 3368 char* s = RExC_parse++;
b81d288d 3369
9a86a77b 3370 while (RExC_parse < RExC_end && UCHARAT(RExC_parse) != c)
830247a4
IZ
3371 RExC_parse++;
3372 if (RExC_parse == RExC_end)
620e46c5 3373 /* Grandfather lone [:, [=, [. */
830247a4 3374 RExC_parse = s;
620e46c5 3375 else {
830247a4 3376 char* t = RExC_parse++; /* skip over the c */
b8c5462f 3377
9a86a77b 3378 if (UCHARAT(RExC_parse) == ']') {
830247a4 3379 RExC_parse++; /* skip over the ending ] */
b8c5462f
JH
3380 posixcc = s + 1;
3381 if (*s == ':') {
3382 I32 complement = *posixcc == '^' ? *posixcc++ : 0;
3383 I32 skip = 5; /* the most common skip */
3384
3385 switch (*posixcc) {
3386 case 'a':
3387 if (strnEQ(posixcc, "alnum", 5))
3388 namedclass =
3389 complement ? ANYOF_NALNUMC : ANYOF_ALNUMC;
3390 else if (strnEQ(posixcc, "alpha", 5))
3391 namedclass =
3392 complement ? ANYOF_NALPHA : ANYOF_ALPHA;
3393 else if (strnEQ(posixcc, "ascii", 5))
3394 namedclass =
3395 complement ? ANYOF_NASCII : ANYOF_ASCII;
3396 break;
aaa51d5e
JF
3397 case 'b':
3398 if (strnEQ(posixcc, "blank", 5))
3399 namedclass =
3400 complement ? ANYOF_NBLANK : ANYOF_BLANK;
3401 break;
b8c5462f
JH
3402 case 'c':
3403 if (strnEQ(posixcc, "cntrl", 5))
3404 namedclass =
3405 complement ? ANYOF_NCNTRL : ANYOF_CNTRL;
3406 break;
3407 case 'd':
3408 if (strnEQ(posixcc, "digit", 5))
3409 namedclass =
3410 complement ? ANYOF_NDIGIT : ANYOF_DIGIT;
3411 break;
3412 case 'g':
3413 if (strnEQ(posixcc, "graph", 5))
3414 namedclass =
3415 complement ? ANYOF_NGRAPH : ANYOF_GRAPH;
3416 break;
3417 case 'l':
3418 if (strnEQ(posixcc, "lower", 5))
3419 namedclass =
3420 complement ? ANYOF_NLOWER : ANYOF_LOWER;
3421 break;
3422 case 'p':
3423 if (strnEQ(posixcc, "print", 5))
3424 namedclass =
3425 complement ? ANYOF_NPRINT : ANYOF_PRINT;
3426 else if (strnEQ(posixcc, "punct", 5))
3427 namedclass =
3428 complement ? ANYOF_NPUNCT : ANYOF_PUNCT;
3429 break;
3430 case 's':
3431 if (strnEQ(posixcc, "space", 5))
3432 namedclass =
aaa51d5e 3433 complement ? ANYOF_NPSXSPC : ANYOF_PSXSPC;
cc4319de 3434 break;
b8c5462f
JH
3435 case 'u':
3436 if (strnEQ(posixcc, "upper", 5))
3437 namedclass =
3438 complement ? ANYOF_NUPPER : ANYOF_UPPER;
3439 break;
3440 case 'w': /* this is not POSIX, this is the Perl \w */
3441 if (strnEQ(posixcc, "word", 4)) {
3442 namedclass =
3443 complement ? ANYOF_NALNUM : ANYOF_ALNUM;
3444 skip = 4;
3445 }
3446 break;
3447 case 'x':
3448 if (strnEQ(posixcc, "xdigit", 6)) {
3449 namedclass =
3450 complement ? ANYOF_NXDIGIT : ANYOF_XDIGIT;
3451 skip = 6;
3452 }
3453 break;
3454 }
ac561586
JH
3455 if (namedclass == OOB_NAMEDCLASS ||
3456 posixcc[skip] != ':' ||
3457 posixcc[skip+1] != ']')
b45f050a
JF
3458 {
3459 Simple_vFAIL3("POSIX class [:%.*s:] unknown",
3460 t - s - 1, s + 1);
3461 }
3462 } else if (!SIZE_ONLY) {
b8c5462f 3463 /* [[=foo=]] and [[.foo.]] are still future. */
b45f050a 3464
830247a4 3465 /* adjust RExC_parse so the warning shows after
b45f050a 3466 the class closes */
9a86a77b 3467 while (UCHARAT(RExC_parse) && UCHARAT(RExC_parse) != ']')
830247a4 3468 RExC_parse++;
b45f050a
JF
3469 Simple_vFAIL3("POSIX syntax [%c %c] is reserved for future extensions", c, c);
3470 }
b8c5462f
JH
3471 } else {
3472 /* Maternal grandfather:
3473 * "[:" ending in ":" but not in ":]" */
830247a4 3474 RExC_parse = s;
767d463e 3475 }
620e46c5
JH
3476 }
3477 }
3478
b8c5462f
JH
3479 return namedclass;
3480}
3481
3482STATIC void
830247a4 3483S_checkposixcc(pTHX_ RExC_state_t *pRExC_state)
b8c5462f 3484{
b938889d 3485 if (!SIZE_ONLY && POSIXCC(UCHARAT(RExC_parse))) {
830247a4 3486 char *s = RExC_parse;
93733859 3487 char c = *s++;
b8c5462f
JH
3488
3489 while(*s && isALNUM(*s))
3490 s++;
3491 if (*s && c == *s && s[1] == ']') {
cd84f5b2
RGS
3492 if (ckWARN(WARN_REGEXP))
3493 vWARN3(s+2,
3494 "POSIX syntax [%c %c] belongs inside character classes",
3495 c, c);
b45f050a
JF
3496
3497 /* [[=foo=]] and [[.foo.]] are still future. */
9a86a77b 3498 if (POSIXCC_NOTYET(c)) {
830247a4 3499 /* adjust RExC_parse so the error shows after
b45f050a 3500 the class closes */
9a86a77b 3501 while (UCHARAT(RExC_parse) && UCHARAT(RExC_parse++) != ']')
b45f050a
JF
3502 ;
3503 Simple_vFAIL3("POSIX syntax [%c %c] is reserved for future extensions", c, c);
3504 }
b8c5462f
JH
3505 }
3506 }
620e46c5
JH
3507}
3508
76e3520e 3509STATIC regnode *
830247a4 3510S_regclass(pTHX_ RExC_state_t *pRExC_state)
a687059c 3511{
ffc61ed2 3512 register UV value;
9a86a77b 3513 register UV nextvalue;
3568d838 3514 register IV prevvalue = OOB_UNICODE;
ffc61ed2 3515 register IV range = 0;
c277df42 3516 register regnode *ret;
ba210ebe 3517 STRLEN numlen;
ffc61ed2 3518 IV namedclass;
9c5ffd7c 3519 char *rangebegin = 0;
936ed897 3520 bool need_class = 0;
9c5ffd7c 3521 SV *listsv = Nullsv;
ffc61ed2
JH
3522 register char *e;
3523 UV n;
9e55ce06
JH
3524 bool optimize_invert = TRUE;
3525 AV* unicode_alternate = 0;
1b2d223b
JH
3526#ifdef EBCDIC
3527 UV literal_endpoint = 0;
3528#endif
ffc61ed2
JH
3529
3530 ret = reganode(pRExC_state, ANYOF, 0);
3531
3532 if (!SIZE_ONLY)
3533 ANYOF_FLAGS(ret) = 0;
3534
9a86a77b 3535 if (UCHARAT(RExC_parse) == '^') { /* Complement of range. */
ffc61ed2
JH
3536 RExC_naughty++;
3537 RExC_parse++;
3538 if (!SIZE_ONLY)
3539 ANYOF_FLAGS(ret) |= ANYOF_INVERT;
3540 }
a0d0e21e 3541
936ed897 3542 if (SIZE_ONLY)
830247a4 3543 RExC_size += ANYOF_SKIP;
936ed897 3544 else {
830247a4 3545 RExC_emit += ANYOF_SKIP;
936ed897
IZ
3546 if (FOLD)
3547 ANYOF_FLAGS(ret) |= ANYOF_FOLD;
3548 if (LOC)
3549 ANYOF_FLAGS(ret) |= ANYOF_LOCALE;
ffc61ed2
JH
3550 ANYOF_BITMAP_ZERO(ret);
3551 listsv = newSVpvn("# comment\n", 10);
a0d0e21e 3552 }
b8c5462f 3553
9a86a77b
JH
3554 nextvalue = RExC_parse < RExC_end ? UCHARAT(RExC_parse) : 0;
3555
b938889d 3556 if (!SIZE_ONLY && POSIXCC(nextvalue))
830247a4 3557 checkposixcc(pRExC_state);
b8c5462f 3558
f064b6ad
HS
3559 /* allow 1st char to be ] (allowing it to be - is dealt with later) */
3560 if (UCHARAT(RExC_parse) == ']')
3561 goto charclassloop;
ffc61ed2 3562
9a86a77b 3563 while (RExC_parse < RExC_end && UCHARAT(RExC_parse) != ']') {
ffc61ed2
JH
3564
3565 charclassloop:
3566
3567 namedclass = OOB_NAMEDCLASS; /* initialize as illegal */
3568
73b437c8 3569 if (!range)
830247a4 3570 rangebegin = RExC_parse;
ffc61ed2 3571 if (UTF) {
5e12f4fb 3572 value = utf8n_to_uvchr((U8*)RExC_parse,
3568d838
JH
3573 RExC_end - RExC_parse,
3574 &numlen, 0);
ffc61ed2
JH
3575 RExC_parse += numlen;
3576 }
3577 else
3578 value = UCHARAT(RExC_parse++);
9a86a77b
JH
3579 nextvalue = RExC_parse < RExC_end ? UCHARAT(RExC_parse) : 0;
3580 if (value == '[' && POSIXCC(nextvalue))
830247a4 3581 namedclass = regpposixcc(pRExC_state, value);
620e46c5 3582 else if (value == '\\') {
ffc61ed2 3583 if (UTF) {
5e12f4fb 3584 value = utf8n_to_uvchr((U8*)RExC_parse,
ffc61ed2
JH
3585 RExC_end - RExC_parse,
3586 &numlen, 0);
3587 RExC_parse += numlen;
3588 }
3589 else
3590 value = UCHARAT(RExC_parse++);
470c3474 3591 /* Some compilers cannot handle switching on 64-bit integer
ffc61ed2 3592 * values, therefore value cannot be an UV. Yes, this will
e2962f66
JH
3593 * be a problem later if we want switch on Unicode.
3594 * A similar issue a little bit later when switching on
3595 * namedclass. --jhi */
ffc61ed2 3596 switch ((I32)value) {
b8c5462f
JH
3597 case 'w': namedclass = ANYOF_ALNUM; break;
3598 case 'W': namedclass = ANYOF_NALNUM; break;
3599 case 's': namedclass = ANYOF_SPACE; break;
3600 case 'S': namedclass = ANYOF_NSPACE; break;
3601 case 'd': namedclass = ANYOF_DIGIT; break;
3602 case 'D': namedclass = ANYOF_NDIGIT; break;
ffc61ed2
JH
3603 case 'p':
3604 case 'P':
af6f566e 3605 if (RExC_parse >= RExC_end)
2a4859cd 3606 vFAIL2("Empty \\%c{}", (U8)value);
ffc61ed2 3607 if (*RExC_parse == '{') {
0da60cf5 3608 U8 c = (U8)value;
ffc61ed2
JH
3609 e = strchr(RExC_parse++, '}');
3610 if (!e)
0da60cf5 3611 vFAIL2("Missing right brace on \\%c{}", c);
ab13f0c7
JH
3612 while (isSPACE(UCHARAT(RExC_parse)))
3613 RExC_parse++;
3614 if (e == RExC_parse)
0da60cf5 3615 vFAIL2("Empty \\%c{}", c);
ffc61ed2 3616 n = e - RExC_parse;
ab13f0c7
JH
3617 while (isSPACE(UCHARAT(RExC_parse + n - 1)))
3618 n--;
ffc61ed2
JH
3619 }
3620 else {
3621 e = RExC_parse;
3622 n = 1;
3623 }
3624 if (!SIZE_ONLY) {
ab13f0c7
JH
3625 if (UCHARAT(RExC_parse) == '^') {
3626 RExC_parse++;
3627 n--;
3628 value = value == 'p' ? 'P' : 'p'; /* toggle */
3629 while (isSPACE(UCHARAT(RExC_parse))) {
3630 RExC_parse++;
3631 n--;
3632 }
3633 }
ffc61ed2 3634 if (value == 'p')
ab13f0c7
JH
3635 Perl_sv_catpvf(aTHX_ listsv,
3636 "+utf8::%.*s\n", (int)n, RExC_parse);
ffc61ed2 3637 else
ab13f0c7
JH
3638 Perl_sv_catpvf(aTHX_ listsv,
3639 "!utf8::%.*s\n", (int)n, RExC_parse);
ffc61ed2
JH
3640 }
3641 RExC_parse = e + 1;
3642 ANYOF_FLAGS(ret) |= ANYOF_UNICODE;
3643 continue;
b8c5462f
JH
3644 case 'n': value = '\n'; break;
3645 case 'r': value = '\r'; break;
3646 case 't': value = '\t'; break;
3647 case 'f': value = '\f'; break;
3648 case 'b': value = '\b'; break;
c7f1f016
NIS
3649 case 'e': value = ASCII_TO_NATIVE('\033');break;
3650 case 'a': value = ASCII_TO_NATIVE('\007');break;
b8c5462f 3651 case 'x':
ffc61ed2 3652 if (*RExC_parse == '{') {
a4c04bdc
NC
3653 I32 flags = PERL_SCAN_ALLOW_UNDERSCORES
3654 | PERL_SCAN_DISALLOW_PREFIX;
ffc61ed2 3655 e = strchr(RExC_parse++, '}');
b81d288d 3656 if (!e)
ffc61ed2 3657 vFAIL("Missing right brace on \\x{}");
53305cf1
NC
3658
3659 numlen = e - RExC_parse;
3660 value = grok_hex(RExC_parse, &numlen, &flags, NULL);
ffc61ed2
JH
3661 RExC_parse = e + 1;
3662 }
3663 else {
a4c04bdc 3664 I32 flags = PERL_SCAN_DISALLOW_PREFIX;
53305cf1
NC
3665 numlen = 2;
3666 value = grok_hex(RExC_parse, &numlen, &flags, NULL);
ffc61ed2
JH
3667 RExC_parse += numlen;
3668 }
b8c5462f
JH
3669 break;
3670 case 'c':
830247a4 3671 value = UCHARAT(RExC_parse++);
b8c5462f
JH
3672 value = toCTRL(value);
3673 break;
3674 case '0': case '1': case '2': case '3': case '4':
3675 case '5': case '6': case '7': case '8': case '9':
53305cf1
NC
3676 {
3677 I32 flags = 0;
3678 numlen = 3;
3679 value = grok_oct(--RExC_parse, &numlen, &flags, NULL);
830247a4 3680 RExC_parse += numlen;
b8c5462f 3681 break;
53305cf1 3682 }
1028017a 3683 default:
e476b1b5 3684 if (!SIZE_ONLY && ckWARN(WARN_REGEXP) && isALPHA(value))
ffc61ed2
JH
3685 vWARN2(RExC_parse,
3686 "Unrecognized escape \\%c in character class passed through",
3687 (int)value);
1028017a 3688 break;
b8c5462f 3689 }
ffc61ed2 3690 } /* end of \blah */
1b2d223b
JH
3691#ifdef EBCDIC
3692 else
3693 literal_endpoint++;
3694#endif
ffc61ed2
JH
3695
3696 if (namedclass > OOB_NAMEDCLASS) { /* this is a named class \blah */
3697
3698 if (!SIZE_ONLY && !need_class)
936ed897 3699 ANYOF_CLASS_ZERO(ret);
ffc61ed2 3700
936ed897 3701 need_class = 1;
ffc61ed2
JH
3702
3703 /* a bad range like a-\d, a-[:digit:] ? */
3704 if (range) {
73b437c8 3705 if (!SIZE_ONLY) {
e476b1b5 3706 if (ckWARN(WARN_REGEXP))
830247a4 3707 vWARN4(RExC_parse,
b45f050a 3708 "False [] range \"%*.*s\"",
830247a4
IZ
3709 RExC_parse - rangebegin,
3710 RExC_parse - rangebegin,
b45f050a 3711 rangebegin);
3568d838
JH
3712 if (prevvalue < 256) {
3713 ANYOF_BITMAP_SET(ret, prevvalue);
ffc61ed2
JH
3714 ANYOF_BITMAP_SET(ret, '-');
3715 }
3716 else {
3717 ANYOF_FLAGS(ret) |= ANYOF_UNICODE;
3718 Perl_sv_catpvf(aTHX_ listsv,
3568d838 3719 "%04"UVxf"\n%04"UVxf"\n", (UV)prevvalue, (UV) '-');
ffc61ed2 3720 }
b8c5462f 3721 }
ffc61ed2
JH
3722
3723 range = 0; /* this was not a true range */
73b437c8 3724 }
ffc61ed2 3725
73b437c8 3726 if (!SIZE_ONLY) {
3568d838
JH
3727 if (namedclass > OOB_NAMEDCLASS)
3728 optimize_invert = FALSE;
e2962f66
JH
3729 /* Possible truncation here but in some 64-bit environments
3730 * the compiler gets heartburn about switch on 64-bit values.
3731 * A similar issue a little earlier when switching on value.
98f323fa 3732 * --jhi */
e2962f66 3733 switch ((I32)namedclass) {
73b437c8
JH
3734 case ANYOF_ALNUM:
3735 if (LOC)
936ed897 3736 ANYOF_CLASS_SET(ret, ANYOF_ALNUM);
73b437c8
JH
3737 else {
3738 for (value = 0; value < 256; value++)
3739 if (isALNUM(value))
936ed897 3740 ANYOF_BITMAP_SET(ret, value);
73b437c8 3741 }
ffc61ed2 3742 Perl_sv_catpvf(aTHX_ listsv, "+utf8::IsWord\n");
73b437c8
JH
3743 break;
3744 case ANYOF_NALNUM:
3745 if (LOC)
936ed897 3746 ANYOF_CLASS_SET(ret, ANYOF_NALNUM);
73b437c8
JH
3747 else {
3748 for (value = 0; value < 256; value++)
3749 if (!isALNUM(value))
936ed897 3750 ANYOF_BITMAP_SET(ret, value);
73b437c8 3751 }
ffc61ed2 3752 Perl_sv_catpvf(aTHX_ listsv, "!utf8::IsWord\n");
73b437c8 3753 break;
ffc61ed2 3754 case ANYOF_ALNUMC:
73b437c8 3755 if (LOC)
ffc61ed2 3756 ANYOF_CLASS_SET(ret, ANYOF_ALNUMC);
73b437c8
JH
3757 else {
3758 for (value = 0; value < 256; value++)
ffc61ed2 3759 if (isALNUMC(value))
936ed897 3760 ANYOF_BITMAP_SET(ret, value);
73b437c8 3761 }
ffc61ed2 3762 Perl_sv_catpvf(aTHX_ listsv, "+utf8::IsAlnum\n");
73b437c8
JH
3763 break;
3764 case ANYOF_NALNUMC:
3765 if (LOC)
936ed897 3766 ANYOF_CLASS_SET(ret, ANYOF_NALNUMC);
73b437c8
JH
3767 else {
3768 for (value = 0; value < 256; value++)
3769 if (!isALNUMC(value))
936ed897 3770 ANYOF_BITMAP_SET(ret, value);
73b437c8 3771 }
ffc61ed2 3772 Perl_sv_catpvf(aTHX_ listsv, "!utf8::IsAlnum\n");
73b437c8
JH
3773 break;
3774 case ANYOF_ALPHA:
3775 if (LOC)
936ed897 3776 ANYOF_CLASS_SET(ret, ANYOF_ALPHA);
73b437c8
JH
3777 else {
3778 for (value = 0; value < 256; value++)
3779 if (isALPHA(value))
936ed897 3780 ANYOF_BITMAP_SET(ret, value);
73b437c8 3781 }
ffc61ed2 3782 Perl_sv_catpvf(aTHX_ listsv, "+utf8::IsAlpha\n");
73b437c8
JH
3783 break;
3784 case ANYOF_NALPHA:
3785 if (LOC)
936ed897 3786 ANYOF_CLASS_SET(ret, ANYOF_NALPHA);
73b437c8
JH
3787 else {
3788 for (value = 0; value < 256; value++)
3789 if (!isALPHA(value))
936ed897 3790 ANYOF_BITMAP_SET(ret, value);
73b437c8 3791 }
ffc61ed2 3792 Perl_sv_catpvf(aTHX_ listsv, "!utf8::IsAlpha\n");
73b437c8
JH
3793 break;
3794 case ANYOF_ASCII:
3795 if (LOC)
936ed897 3796 ANYOF_CLASS_SET(ret, ANYOF_ASCII);
73b437c8 3797 else {
c7f1f016 3798#ifndef EBCDIC
1ba5c669
JH
3799 for (value = 0; value < 128; value++)
3800 ANYOF_BITMAP_SET(ret, value);
3801#else /* EBCDIC */
ffbc6a93 3802 for (value = 0; value < 256; value++) {
3a3c4447
JH
3803 if (isASCII(value))
3804 ANYOF_BITMAP_SET(ret, value);
ffbc6a93 3805 }
1ba5c669 3806#endif /* EBCDIC */
73b437c8 3807 }
ffc61ed2 3808 Perl_sv_catpvf(aTHX_ listsv, "+utf8::IsASCII\n");
73b437c8
JH
3809 break;
3810 case ANYOF_NASCII:
3811 if (LOC)
936ed897 3812 ANYOF_CLASS_SET(ret, ANYOF_NASCII);
73b437c8 3813 else {
c7f1f016 3814#ifndef EBCDIC
1ba5c669
JH
3815 for (value = 128; value < 256; value++)
3816 ANYOF_BITMAP_SET(ret, value);
3817#else /* EBCDIC */
ffbc6a93 3818 for (value = 0; value < 256; value++) {
3a3c4447
JH
3819 if (!isASCII(value))
3820 ANYOF_BITMAP_SET(ret, value);
ffbc6a93 3821 }
1ba5c669 3822#endif /* EBCDIC */
73b437c8 3823 }
ffc61ed2 3824 Perl_sv_catpvf(aTHX_ listsv, "!utf8::IsASCII\n");
73b437c8 3825 break;
aaa51d5e
JF
3826 case ANYOF_BLANK:
3827 if (LOC)
3828 ANYOF_CLASS_SET(ret, ANYOF_BLANK);
3829 else {
3830 for (value = 0; value < 256; value++)
3831 if (isBLANK(value))
3832 ANYOF_BITMAP_SET(ret, value);
3833 }
ffc61ed2 3834 Perl_sv_catpvf(aTHX_ listsv, "+utf8::IsBlank\n");
aaa51d5e
JF
3835 break;
3836 case ANYOF_NBLANK:
3837 if (LOC)
3838 ANYOF_CLASS_SET(ret, ANYOF_NBLANK);
3839 else {
3840 for (value = 0; value < 256; value++)
3841 if (!isBLANK(value))
3842 ANYOF_BITMAP_SET(ret, value);
3843 }
ffc61ed2 3844 Perl_sv_catpvf(aTHX_ listsv, "!utf8::IsBlank\n");
aaa51d5e 3845 break;
73b437c8
JH
3846 case ANYOF_CNTRL:
3847 if (LOC)
936ed897 3848 ANYOF_CLASS_SET(ret, ANYOF_CNTRL);
73b437c8
JH
3849 else {
3850 for (value = 0; value < 256; value++)
3851 if (isCNTRL(value))
936ed897 3852 ANYOF_BITMAP_SET(ret, value);
73b437c8 3853 }
ffc61ed2 3854 Perl_sv_catpvf(aTHX_ listsv, "+utf8::IsCntrl\n");
73b437c8
JH
3855 break;
3856 case ANYOF_NCNTRL:
3857 if (LOC)
936ed897 3858 ANYOF_CLASS_SET(ret, ANYOF_NCNTRL);
73b437c8
JH
3859 else {
3860 for (value = 0; value < 256; value++)
3861 if (!isCNTRL(value))
936ed897 3862 ANYOF_BITMAP_SET(ret, value);
73b437c8 3863 }
ffc61ed2
JH
3864 Perl_sv_catpvf(aTHX_ listsv, "!utf8::IsCntrl\n");
3865 break;
3866 case ANYOF_DIGIT:
3867 if (LOC)
3868 ANYOF_CLASS_SET(ret, ANYOF_DIGIT);
3869 else {
3870 /* consecutive digits assumed */
3871 for (value = '0'; value <= '9'; value++)
3872 ANYOF_BITMAP_SET(ret, value);
3873 }
3874 Perl_sv_catpvf(aTHX_ listsv, "+utf8::IsDigit\n");
3875 break;
3876 case ANYOF_NDIGIT:
3877 if (LOC)
3878 ANYOF_CLASS_SET(ret, ANYOF_NDIGIT);
3879 else {
3880 /* consecutive digits assumed */
3881 for (value = 0; value < '0'; value++)
3882 ANYOF_BITMAP_SET(ret, value);
3883 for (value = '9' + 1; value < 256; value++)
3884 ANYOF_BITMAP_SET(ret, value);
3885 }
3886 Perl_sv_catpvf(aTHX_ listsv, "!utf8::IsDigit\n");
73b437c8
JH
3887 break;
3888 case ANYOF_GRAPH:
3889 if (LOC)
936ed897 3890 ANYOF_CLASS_SET(ret, ANYOF_GRAPH);
73b437c8
JH
3891 else {
3892 for (value = 0; value < 256; value++)
3893 if (isGRAPH(value))
936ed897 3894 ANYOF_BITMAP_SET(ret, value);
73b437c8 3895 }
ffc61ed2 3896 Perl_sv_catpvf(aTHX_ listsv, "+utf8::IsGraph\n");
73b437c8
JH
3897 break;
3898 case ANYOF_NGRAPH:
3899 if (LOC)
936ed897 3900 ANYOF_CLASS_SET(ret, ANYOF_NGRAPH);
73b437c8
JH
3901 else {
3902 for (value = 0; value < 256; value++)
3903 if (!isGRAPH(value))
936ed897 3904 ANYOF_BITMAP_SET(ret, value);
73b437c8 3905 }
ffc61ed2 3906 Perl_sv_catpvf(aTHX_ listsv, "!utf8::IsGraph\n");
73b437c8
JH
3907 break;
3908 case ANYOF_LOWER:
3909 if (LOC)
936ed897 3910 ANYOF_CLASS_SET(ret, ANYOF_LOWER);
73b437c8
JH
3911 else {
3912 for (value = 0; value < 256; value++)
3913 if (isLOWER(value))
936ed897 3914 ANYOF_BITMAP_SET(ret, value);
73b437c8 3915 }
ffc61ed2 3916 Perl_sv_catpvf(aTHX_ listsv, "+utf8::IsLower\n");
73b437c8
JH
3917 break;
3918 case ANYOF_NLOWER:
3919 if (LOC)
936ed897 3920 ANYOF_CLASS_SET(ret, ANYOF_NLOWER);
73b437c8
JH
3921 else {
3922 for (value = 0; value < 256; value++)
3923 if (!isLOWER(value))
936ed897 3924 ANYOF_BITMAP_SET(ret, value);
73b437c8 3925 }
ffc61ed2 3926 Perl_sv_catpvf(aTHX_ listsv, "!utf8::IsLower\n");
73b437c8
JH
3927 break;
3928 case ANYOF_PRINT:
3929 if (LOC)
936ed897 3930 ANYOF_CLASS_SET(ret, ANYOF_PRINT);
73b437c8
JH
3931 else {
3932 for (value = 0; value < 256; value++)
3933 if (isPRINT(value))
936ed897 3934 ANYOF_BITMAP_SET(ret, value);
73b437c8 3935 }
ffc61ed2 3936 Perl_sv_catpvf(aTHX_ listsv, "+utf8::IsPrint\n");
73b437c8
JH
3937 break;
3938 case ANYOF_NPRINT:
3939 if (LOC)
936ed897 3940 ANYOF_CLASS_SET(ret, ANYOF_NPRINT);
73b437c8
JH
3941 else {
3942 for (value = 0; value < 256; value++)
3943 if (!isPRINT(value))
936ed897 3944 ANYOF_BITMAP_SET(ret, value);
73b437c8 3945 }
ffc61ed2 3946 Perl_sv_catpvf(aTHX_ listsv, "!utf8::IsPrint\n");
73b437c8 3947 break;
aaa51d5e
JF
3948 case ANYOF_PSXSPC:
3949 if (LOC)
3950 ANYOF_CLASS_SET(ret, ANYOF_PSXSPC);
3951 else {
3952 for (value = 0; value < 256; value++)
3953 if (isPSXSPC(value))
3954 ANYOF_BITMAP_SET(ret, value);
3955 }
ffc61ed2 3956 Perl_sv_catpvf(aTHX_ listsv, "+utf8::IsSpace\n");
aaa51d5e
JF
3957 break;
3958 case ANYOF_NPSXSPC:
3959 if (LOC)
3960 ANYOF_CLASS_SET(ret, ANYOF_NPSXSPC);
3961 else {
3962 for (value = 0; value < 256; value++)
3963 if (!isPSXSPC(value))
3964 ANYOF_BITMAP_SET(ret, value);
3965 }
ffc61ed2 3966 Perl_sv_catpvf(aTHX_ listsv, "!utf8::IsSpace\n");
aaa51d5e 3967 break;
73b437c8
JH
3968 case ANYOF_PUNCT:
3969 if (LOC)
936ed897 3970 ANYOF_CLASS_SET(ret, ANYOF_PUNCT);
73b437c8
JH
3971 else {
3972 for (value = 0; value < 256; value++)
3973 if (isPUNCT(value))
936ed897 3974 ANYOF_BITMAP_SET(ret, value);
73b437c8 3975 }
ffc61ed2 3976 Perl_sv_catpvf(aTHX_ listsv, "+utf8::IsPunct\n");
73b437c8
JH
3977 break;
3978 case ANYOF_NPUNCT:
3979 if (LOC)
936ed897 3980 ANYOF_CLASS_SET(ret, ANYOF_NPUNCT);
73b437c8
JH
3981 else {
3982 for (value = 0; value < 256; value++)
3983 if (!isPUNCT(value))
936ed897 3984 ANYOF_BITMAP_SET(ret, value);
73b437c8 3985 }
ffc61ed2
JH
3986 Perl_sv_catpvf(aTHX_ listsv, "!utf8::IsPunct\n");
3987 break;
3988 case ANYOF_SPACE:
3989 if (LOC)
3990 ANYOF_CLASS_SET(ret, ANYOF_SPACE);
3991 else {
3992 for (value = 0; value < 256; value++)
3993 if (isSPACE(value))
3994 ANYOF_BITMAP_SET(ret, value);
3995 }
3996 Perl_sv_catpvf(aTHX_ listsv, "+utf8::IsSpacePerl\n");
3997 break;
3998 case ANYOF_NSPACE:
3999 if (LOC)
4000 ANYOF_CLASS_SET(ret, ANYOF_NSPACE);
4001 else {
4002 for (value = 0; value < 256; value++)
4003 if (!isSPACE(value))
4004 ANYOF_BITMAP_SET(ret, value);
4005 }
4006 Perl_sv_catpvf(aTHX_ listsv, "!utf8::IsSpacePerl\n");
73b437c8
JH
4007 break;
4008 case ANYOF_UPPER:
4009 if (LOC)
936ed897 4010 ANYOF_CLASS_SET(ret, ANYOF_UPPER);
73b437c8
JH
4011 else {
4012 for (value = 0; value < 256; value++)
4013 if (isUPPER(value))
936ed897 4014 ANYOF_BITMAP_SET(ret, value);
73b437c8 4015 }
ffc61ed2 4016 Perl_sv_catpvf(aTHX_ listsv, "+utf8::IsUpper\n");
73b437c8
JH
4017 break;
4018 case ANYOF_NUPPER:
4019 if (LOC)
936ed897 4020 ANYOF_CLASS_SET(ret, ANYOF_NUPPER);
73b437c8
JH
4021 else {
4022 for (value = 0; value < 256; value++)
4023 if (!isUPPER(value))
936ed897 4024 ANYOF_BITMAP_SET(ret, value);
73b437c8 4025 }
ffc61ed2 4026 Perl_sv_catpvf(aTHX_ listsv, "!utf8::IsUpper\n");
73b437c8
JH
4027 break;
4028 case ANYOF_XDIGIT:
4029 if (LOC)
936ed897 4030 ANYOF_CLASS_SET(ret, ANYOF_XDIGIT);
73b437c8
JH
4031 else {
4032 for (value = 0; value < 256; value++)
4033 if (isXDIGIT(value))
936ed897 4034 ANYOF_BITMAP_SET(ret, value);
73b437c8 4035 }
ffc61ed2 4036 Perl_sv_catpvf(aTHX_ listsv, "+utf8::IsXDigit\n");
73b437c8
JH
4037 break;
4038 case ANYOF_NXDIGIT:
4039 if (LOC)
936ed897 4040 ANYOF_CLASS_SET(ret, ANYOF_NXDIGIT);
73b437c8
JH
4041 else {
4042 for (value = 0; value < 256; value++)
4043 if (!isXDIGIT(value))
936ed897 4044 ANYOF_BITMAP_SET(ret, value);
73b437c8 4045 }
ffc61ed2 4046 Perl_sv_catpvf(aTHX_ listsv, "!utf8::IsXDigit\n");
73b437c8
JH
4047 break;
4048 default:
b45f050a 4049 vFAIL("Invalid [::] class");
73b437c8 4050 break;
b8c5462f 4051 }
b8c5462f 4052 if (LOC)
936ed897 4053 ANYOF_FLAGS(ret) |= ANYOF_CLASS;
73b437c8 4054 continue;
a0d0e21e 4055 }
ffc61ed2
JH
4056 } /* end of namedclass \blah */
4057
a0d0e21e 4058 if (range) {
eb160463 4059 if (prevvalue > (IV)value) /* b-a */ {
b45f050a 4060 Simple_vFAIL4("Invalid [] range \"%*.*s\"",
830247a4
IZ
4061 RExC_parse - rangebegin,
4062 RExC_parse - rangebegin,
b45f050a 4063 rangebegin);
3568d838 4064 range = 0; /* not a valid range */
73b437c8 4065 }
a0d0e21e
LW
4066 }
4067 else {
3568d838 4068 prevvalue = value; /* save the beginning of the range */
830247a4
IZ
4069 if (*RExC_parse == '-' && RExC_parse+1 < RExC_end &&
4070 RExC_parse[1] != ']') {
4071 RExC_parse++;
ffc61ed2
JH
4072
4073 /* a bad range like \w-, [:word:]- ? */
4074 if (namedclass > OOB_NAMEDCLASS) {
e476b1b5 4075 if (ckWARN(WARN_REGEXP))
830247a4 4076 vWARN4(RExC_parse,
b45f050a 4077 "False [] range \"%*.*s\"",
830247a4
IZ
4078 RExC_parse - rangebegin,
4079 RExC_parse - rangebegin,
b45f050a 4080 rangebegin);
73b437c8 4081 if (!SIZE_ONLY)
936ed897 4082 ANYOF_BITMAP_SET(ret, '-');
73b437c8 4083 } else
ffc61ed2
JH
4084 range = 1; /* yeah, it's a range! */
4085 continue; /* but do it the next time */
a0d0e21e 4086 }
a687059c 4087 }
ffc61ed2 4088
93733859 4089 /* now is the next time */
ae5c130c 4090 if (!SIZE_ONLY) {
3568d838
JH
4091 IV i;
4092
4093 if (prevvalue < 256) {
4094 IV ceilvalue = value < 256 ? value : 255;
4095
4096#ifdef EBCDIC
1b2d223b
JH
4097 /* In EBCDIC [\x89-\x91] should include
4098 * the \x8e but [i-j] should not. */
4099 if (literal_endpoint == 2 &&
4100 ((isLOWER(prevvalue) && isLOWER(ceilvalue)) ||
4101 (isUPPER(prevvalue) && isUPPER(ceilvalue))))
ffc61ed2 4102 {
3568d838
JH
4103 if (isLOWER(prevvalue)) {
4104 for (i = prevvalue; i <= ceilvalue; i++)
ffc61ed2
JH
4105 if (isLOWER(i))
4106 ANYOF_BITMAP_SET(ret, i);
4107 } else {
3568d838 4108 for (i = prevvalue; i <= ceilvalue; i++)
ffc61ed2
JH
4109 if (isUPPER(i))
4110 ANYOF_BITMAP_SET(ret, i);
4111 }
8ada0baa 4112 }
ffc61ed2 4113 else
8ada0baa 4114#endif
a5961de5
JH
4115 for (i = prevvalue; i <= ceilvalue; i++)
4116 ANYOF_BITMAP_SET(ret, i);
3568d838 4117 }
a5961de5 4118 if (value > 255 || UTF) {
b08decb7
JH
4119 UV prevnatvalue = NATIVE_TO_UNI(prevvalue);
4120 UV natvalue = NATIVE_TO_UNI(value);
4121
ffc61ed2 4122 ANYOF_FLAGS(ret) |= ANYOF_UNICODE;
b08decb7 4123 if (prevnatvalue < natvalue) { /* what about > ? */
ffc61ed2 4124 Perl_sv_catpvf(aTHX_ listsv, "%04"UVxf"\t%04"UVxf"\n",
b08decb7
JH
4125 prevnatvalue, natvalue);
4126 }
4127 else if (prevnatvalue == natvalue) {
4128 Perl_sv_catpvf(aTHX_ listsv, "%04"UVxf"\n", natvalue);
09091399 4129 if (FOLD) {
254ba52a
JH
4130 U8 foldbuf[UTF8_MAXLEN_FOLD+1];
4131 STRLEN foldlen;
2f3bf011 4132 UV f = to_uni_fold(natvalue, foldbuf, &foldlen);
254ba52a 4133
c840d2a2
JH
4134 /* If folding and foldable and a single
4135 * character, insert also the folded version
4136 * to the charclass. */
9e55ce06 4137 if (f != value) {
eb160463 4138 if (foldlen == (STRLEN)UNISKIP(f))
9e55ce06
JH
4139 Perl_sv_catpvf(aTHX_ listsv,
4140 "%04"UVxf"\n", f);
4141 else {
4142 /* Any multicharacter foldings
4143 * require the following transform:
4144 * [ABCDEF] -> (?:[ABCabcDEFd]|pq|rst)
4145 * where E folds into "pq" and F folds
4146 * into "rst", all other characters
4147 * fold to single characters. We save
4148 * away these multicharacter foldings,
4149 * to be later saved as part of the
4150 * additional "s" data. */
4151 SV *sv;
4152
4153 if (!unicode_alternate)
4154 unicode_alternate = newAV();
4155 sv = newSVpvn((char*)foldbuf, foldlen);
4156 SvUTF8_on(sv);
4157 av_push(unicode_alternate, sv);
4158 }
4159 }
254ba52a 4160
60a8b682
JH
4161 /* If folding and the value is one of the Greek
4162 * sigmas insert a few more sigmas to make the
4163 * folding rules of the sigmas to work right.
4164 * Note that not all the possible combinations
4165 * are handled here: some of them are handled
9e55ce06
JH
4166 * by the standard folding rules, and some of
4167 * them (literal or EXACTF cases) are handled
4168 * during runtime in regexec.c:S_find_byclass(). */
09091399
JH
4169 if (value == UNICODE_GREEK_SMALL_LETTER_FINAL_SIGMA) {
4170 Perl_sv_catpvf(aTHX_ listsv, "%04"UVxf"\n",
eb050b28 4171 (UV)UNICODE_GREEK_CAPITAL_LETTER_SIGMA);
09091399 4172 Perl_sv_catpvf(aTHX_ listsv, "%04"UVxf"\n",
eb050b28 4173 (UV)UNICODE_GREEK_SMALL_LETTER_SIGMA);
09091399
JH
4174 }
4175 else if (value == UNICODE_GREEK_CAPITAL_LETTER_SIGMA)
4176 Perl_sv_catpvf(aTHX_ listsv, "%04"UVxf"\n",
eb050b28 4177 (UV)UNICODE_GREEK_SMALL_LETTER_SIGMA);
09091399
JH
4178 }
4179 }
ffc61ed2 4180 }
1b2d223b
JH
4181#ifdef EBCDIC
4182 literal_endpoint = 0;
4183#endif
8ada0baa 4184 }
ffc61ed2
JH
4185
4186 range = 0; /* this range (if it was one) is done now */
a0d0e21e 4187 }
ffc61ed2 4188
936ed897 4189 if (need_class) {
4f66b38d 4190 ANYOF_FLAGS(ret) |= ANYOF_LARGE;
936ed897 4191 if (SIZE_ONLY)
830247a4 4192 RExC_size += ANYOF_CLASS_ADD_SKIP;
936ed897 4193 else
830247a4 4194 RExC_emit += ANYOF_CLASS_ADD_SKIP;
936ed897 4195 }
ffc61ed2 4196
ae5c130c 4197 /* optimize case-insensitive simple patterns (e.g. /[a-z]/i) */
b8c5462f 4198 if (!SIZE_ONLY &&
ffc61ed2 4199 /* If the only flag is folding (plus possibly inversion). */
516a5887
JH
4200 ((ANYOF_FLAGS(ret) & (ANYOF_FLAGS_ALL ^ ANYOF_INVERT)) == ANYOF_FOLD)
4201 ) {
a0ed51b3 4202 for (value = 0; value < 256; ++value) {
936ed897 4203 if (ANYOF_BITMAP_TEST(ret, value)) {
eb160463 4204 UV fold = PL_fold[value];
ffc61ed2
JH
4205
4206 if (fold != value)
4207 ANYOF_BITMAP_SET(ret, fold);
ae5c130c
GS
4208 }
4209 }
936ed897 4210 ANYOF_FLAGS(ret) &= ~ANYOF_FOLD;
ae5c130c 4211 }
ffc61ed2 4212
ae5c130c 4213 /* optimize inverted simple patterns (e.g. [^a-z]) */
3568d838 4214 if (!SIZE_ONLY && optimize_invert &&
ffc61ed2
JH
4215 /* If the only flag is inversion. */
4216 (ANYOF_FLAGS(ret) & ANYOF_FLAGS_ALL) == ANYOF_INVERT) {
b8c5462f 4217 for (value = 0; value < ANYOF_BITMAP_SIZE; ++value)
936ed897 4218 ANYOF_BITMAP(ret)[value] ^= ANYOF_FLAGS_ALL;
1aa99e6b 4219 ANYOF_FLAGS(ret) = ANYOF_UNICODE_ALL;
ae5c130c 4220 }
a0d0e21e 4221
b81d288d 4222 if (!SIZE_ONLY) {
fde631ed 4223 AV *av = newAV();
ffc61ed2
JH
4224 SV *rv;
4225
9e55ce06
JH
4226 /* The 0th element stores the character class description
4227 * in its textual form: used later (regexec.c:Perl_regclass_swatch())
4228 * to initialize the appropriate swash (which gets stored in
4229 * the 1st element), and also useful for dumping the regnode.
4230 * The 2nd element stores the multicharacter foldings,
4231 * used later (regexec.c:s_reginclasslen()). */
ffc61ed2
JH
4232 av_store(av, 0, listsv);
4233 av_store(av, 1, NULL);
9e55ce06 4234 av_store(av, 2, (SV*)unicode_alternate);
ffc61ed2 4235 rv = newRV_noinc((SV*)av);
19860706 4236 n = add_data(pRExC_state, 1, "s");
830247a4 4237 RExC_rx->data->data[n] = (void*)rv;
ffc61ed2 4238 ARG_SET(ret, n);
a0ed51b3
LW
4239 }
4240
4241 return ret;
4242}
4243
76e3520e 4244STATIC char*
830247a4 4245S_nextchar(pTHX_ RExC_state_t *pRExC_state)
a0d0e21e 4246{
830247a4 4247 char* retval = RExC_parse++;
a0d0e21e 4248
4633a7c4 4249 for (;;) {
830247a4
IZ
4250 if (*RExC_parse == '(' && RExC_parse[1] == '?' &&
4251 RExC_parse[2] == '#') {
4252 while (*RExC_parse && *RExC_parse != ')')
4253 RExC_parse++;
4254 RExC_parse++;
4633a7c4
LW
4255 continue;
4256 }
e2509266 4257 if (RExC_flags & PMf_EXTENDED) {
830247a4
IZ
4258 if (isSPACE(*RExC_parse)) {
4259 RExC_parse++;
748a9306
LW
4260 continue;
4261 }
830247a4
IZ
4262 else if (*RExC_parse == '#') {
4263 while (*RExC_parse && *RExC_parse != '\n')
4264 RExC_parse++;
4265 RExC_parse++;
748a9306
LW
4266 continue;
4267 }
748a9306 4268 }
4633a7c4 4269 return retval;
a0d0e21e 4270 }
a687059c
LW
4271}
4272
4273/*
c277df42 4274- reg_node - emit a node
a0d0e21e 4275*/
76e3520e 4276STATIC regnode * /* Location. */
830247a4 4277S_reg_node(pTHX_ RExC_state_t *pRExC_state, U8 op)
a687059c 4278{
c277df42
IZ
4279 register regnode *ret;
4280 register regnode *ptr;
a687059c 4281
830247a4 4282 ret = RExC_emit;
c277df42 4283 if (SIZE_ONLY) {
830247a4
IZ
4284 SIZE_ALIGN(RExC_size);
4285 RExC_size += 1;
a0d0e21e
LW
4286 return(ret);
4287 }
a687059c 4288
c277df42 4289 NODE_ALIGN_FILL(ret);
a0d0e21e 4290 ptr = ret;
c277df42 4291 FILL_ADVANCE_NODE(ptr, op);
fac92740
MJD
4292 if (RExC_offsets) { /* MJD */
4293 MJD_OFFSET_DEBUG((stderr, "%s:%u: (op %s) %s %u <- %u (len %u) (max %u).\n",
4294 "reg_node", __LINE__,
4295 reg_name[op],
4296 RExC_emit - RExC_emit_start > RExC_offsets[0]
4297 ? "Overwriting end of array!\n" : "OK",
4298 RExC_emit - RExC_emit_start,
4299 RExC_parse - RExC_start,
4300 RExC_offsets[0]));
4301 Set_Node_Offset(RExC_emit, RExC_parse + (op == END));
4302 }
4303
830247a4 4304 RExC_emit = ptr;
a687059c 4305
a0d0e21e 4306 return(ret);
a687059c
LW
4307}
4308
4309/*
a0d0e21e
LW
4310- reganode - emit a node with an argument
4311*/
76e3520e 4312STATIC regnode * /* Location. */
830247a4 4313S_reganode(pTHX_ RExC_state_t *pRExC_state, U8 op, U32 arg)
fe14fcc3 4314{
c277df42
IZ
4315 register regnode *ret;
4316 register regnode *ptr;
fe14fcc3 4317
830247a4 4318 ret = RExC_emit;
c277df42 4319 if (SIZE_ONLY) {
830247a4
IZ
4320 SIZE_ALIGN(RExC_size);
4321 RExC_size += 2;
a0d0e21e
LW
4322 return(ret);
4323 }
fe14fcc3 4324
c277df42 4325 NODE_ALIGN_FILL(ret);
a0d0e21e 4326 ptr = ret;
c277df42 4327 FILL_ADVANCE_NODE_ARG(ptr, op, arg);
fac92740
MJD
4328 if (RExC_offsets) { /* MJD */
4329 MJD_OFFSET_DEBUG((stderr, "%s: %s %u <- %u (max %u).\n",
4330 "reganode",
4331 RExC_emit - RExC_emit_start > RExC_offsets[0] ?
4332 "Overwriting end of array!\n" : "OK",
4333 RExC_emit - RExC_emit_start,
4334 RExC_parse - RExC_start,
4335 RExC_offsets[0]));
4336 Set_Cur_Node_Offset;
4337 }
4338
830247a4 4339 RExC_emit = ptr;
fe14fcc3 4340
a0d0e21e 4341 return(ret);
fe14fcc3
LW
4342}
4343
4344/*
cd439c50 4345- reguni - emit (if appropriate) a Unicode character
a0ed51b3
LW
4346*/
4347STATIC void
830247a4 4348S_reguni(pTHX_ RExC_state_t *pRExC_state, UV uv, char* s, STRLEN* lenp)
a0ed51b3 4349{
5e12f4fb 4350 *lenp = SIZE_ONLY ? UNISKIP(uv) : (uvchr_to_utf8((U8*)s, uv) - (U8*)s);
a0ed51b3
LW
4351}
4352
4353/*
a0d0e21e
LW
4354- reginsert - insert an operator in front of already-emitted operand
4355*
4356* Means relocating the operand.
4357*/
76e3520e 4358STATIC void
830247a4 4359S_reginsert(pTHX_ RExC_state_t *pRExC_state, U8 op, regnode *opnd)
a687059c 4360{
c277df42
IZ
4361 register regnode *src;
4362 register regnode *dst;
4363 register regnode *place;
4364 register int offset = regarglen[(U8)op];
b81d288d 4365
22c35a8c 4366/* (PL_regkind[(U8)op] == CURLY ? EXTRA_STEP_2ARGS : 0); */
c277df42
IZ
4367
4368 if (SIZE_ONLY) {
830247a4 4369 RExC_size += NODE_STEP_REGNODE + offset;
a0d0e21e
LW
4370 return;
4371 }
a687059c 4372
830247a4
IZ
4373 src = RExC_emit;
4374 RExC_emit += NODE_STEP_REGNODE + offset;
4375 dst = RExC_emit;
fac92740 4376 while (src > opnd) {
c277df42 4377 StructCopy(--src, --dst, regnode);
fac92740
MJD
4378 if (RExC_offsets) { /* MJD 20010112 */
4379 MJD_OFFSET_DEBUG((stderr, "%s: %s copy %u -> %u (max %u).\n",
4380 "reg_insert",
4381 dst - RExC_emit_start > RExC_offsets[0]
4382 ? "Overwriting end of array!\n" : "OK",
4383 src - RExC_emit_start,
4384 dst - RExC_emit_start,
4385 RExC_offsets[0]));
4386 Set_Node_Offset_To_R(dst-RExC_emit_start, Node_Offset(src));
4387 Set_Node_Length_To_R(dst-RExC_emit_start, Node_Length(src));
4388 }
4389 }
4390
a0d0e21e
LW
4391
4392 place = opnd; /* Op node, where operand used to be. */
fac92740
MJD
4393 if (RExC_offsets) { /* MJD */
4394 MJD_OFFSET_DEBUG((stderr, "%s: %s %u <- %u (max %u).\n",
4395 "reginsert",
4396 place - RExC_emit_start > RExC_offsets[0]
4397 ? "Overwriting end of array!\n" : "OK",
4398 place - RExC_emit_start,
4399 RExC_parse - RExC_start,
4400 RExC_offsets[0]));
4401 Set_Node_Offset(place, RExC_parse);
4402 }
c277df42
IZ
4403 src = NEXTOPER(place);
4404 FILL_ADVANCE_NODE(place, op);
4405 Zero(src, offset, regnode);
a687059c
LW
4406}
4407
4408/*
c277df42 4409- regtail - set the next-pointer at the end of a node chain of p to val.
a0d0e21e 4410*/
76e3520e 4411STATIC void
830247a4 4412S_regtail(pTHX_ RExC_state_t *pRExC_state, regnode *p, regnode *val)
a687059c 4413{
c277df42
IZ
4414 register regnode *scan;
4415 register regnode *temp;
a0d0e21e 4416
c277df42 4417 if (SIZE_ONLY)
a0d0e21e
LW
4418 return;
4419
4420 /* Find last node. */
4421 scan = p;
4422 for (;;) {
4423 temp = regnext(scan);
4424 if (temp == NULL)
4425 break;
4426 scan = temp;
4427 }
a687059c 4428
c277df42
IZ
4429 if (reg_off_by_arg[OP(scan)]) {
4430 ARG_SET(scan, val - scan);
a0ed51b3
LW
4431 }
4432 else {
c277df42
IZ
4433 NEXT_OFF(scan) = val - scan;
4434 }
a687059c
LW
4435}
4436
4437/*
a0d0e21e
LW
4438- regoptail - regtail on operand of first argument; nop if operandless
4439*/
76e3520e 4440STATIC void
830247a4 4441S_regoptail(pTHX_ RExC_state_t *pRExC_state, regnode *p, regnode *val)
a687059c 4442{
a0d0e21e 4443 /* "Operandless" and "op != BRANCH" are synonymous in practice. */
c277df42
IZ
4444 if (p == NULL || SIZE_ONLY)
4445 return;
22c35a8c 4446 if (PL_regkind[(U8)OP(p)] == BRANCH) {
830247a4 4447 regtail(pRExC_state, NEXTOPER(p), val);
a0ed51b3 4448 }
22c35a8c 4449 else if ( PL_regkind[(U8)OP(p)] == BRANCHJ) {
830247a4 4450 regtail(pRExC_state, NEXTOPER(NEXTOPER(p)), val);
a0ed51b3
LW
4451 }
4452 else
a0d0e21e 4453 return;
a687059c
LW
4454}
4455
4456/*
4457 - regcurly - a little FSA that accepts {\d+,?\d*}
4458 */
79072805 4459STATIC I32
cea2e8a9 4460S_regcurly(pTHX_ register char *s)
a687059c
LW
4461{
4462 if (*s++ != '{')
4463 return FALSE;
f0fcb552 4464 if (!isDIGIT(*s))
a687059c 4465 return FALSE;
f0fcb552 4466 while (isDIGIT(*s))
a687059c
LW
4467 s++;
4468 if (*s == ',')
4469 s++;
f0fcb552 4470 while (isDIGIT(*s))
a687059c
LW
4471 s++;
4472 if (*s != '}')
4473 return FALSE;
4474 return TRUE;
4475}
4476
a687059c 4477
8fa7f367
JH
4478#ifdef DEBUGGING
4479
76e3520e 4480STATIC regnode *
cea2e8a9 4481S_dumpuntil(pTHX_ regnode *start, regnode *node, regnode *last, SV* sv, I32 l)
c277df42 4482{
f248d071 4483 register U8 op = EXACT; /* Arbitrary non-END op. */
155aba94 4484 register regnode *next;
c277df42
IZ
4485
4486 while (op != END && (!last || node < last)) {
4487 /* While that wasn't END last time... */
4488
4489 NODE_ALIGN(node);
4490 op = OP(node);
4491 if (op == CLOSE)
4492 l--;
4493 next = regnext(node);
4494 /* Where, what. */
4495 if (OP(node) == OPTIMIZED)
4496 goto after_print;
4497 regprop(sv, node);
b900a521 4498 PerlIO_printf(Perl_debug_log, "%4"IVdf":%*s%s", (IV)(node - start),
f1dbda3d 4499 (int)(2*l + 1), "", SvPVX(sv));
c277df42
IZ
4500 if (next == NULL) /* Next ptr. */
4501 PerlIO_printf(Perl_debug_log, "(0)");
b81d288d 4502 else
b900a521 4503 PerlIO_printf(Perl_debug_log, "(%"IVdf")", (IV)(next - start));
c277df42
IZ
4504 (void)PerlIO_putc(Perl_debug_log, '\n');
4505 after_print:
22c35a8c 4506 if (PL_regkind[(U8)op] == BRANCHJ) {
b81d288d
AB
4507 register regnode *nnode = (OP(next) == LONGJMP
4508 ? regnext(next)
c277df42
IZ
4509 : next);
4510 if (last && nnode > last)
4511 nnode = last;
4512 node = dumpuntil(start, NEXTOPER(NEXTOPER(node)), nnode, sv, l + 1);
a0ed51b3 4513 }
22c35a8c 4514 else if (PL_regkind[(U8)op] == BRANCH) {
c277df42 4515 node = dumpuntil(start, NEXTOPER(node), next, sv, l + 1);
a0ed51b3
LW
4516 }
4517 else if ( op == CURLY) { /* `next' might be very big: optimizer */
c277df42
IZ
4518 node = dumpuntil(start, NEXTOPER(node) + EXTRA_STEP_2ARGS,
4519 NEXTOPER(node) + EXTRA_STEP_2ARGS + 1, sv, l + 1);
a0ed51b3 4520 }
22c35a8c 4521 else if (PL_regkind[(U8)op] == CURLY && op != CURLYX) {
c277df42
IZ
4522 node = dumpuntil(start, NEXTOPER(node) + EXTRA_STEP_2ARGS,
4523 next, sv, l + 1);
a0ed51b3
LW
4524 }
4525 else if ( op == PLUS || op == STAR) {
c277df42 4526 node = dumpuntil(start, NEXTOPER(node), NEXTOPER(node) + 1, sv, l + 1);
a0ed51b3
LW
4527 }
4528 else if (op == ANYOF) {
4f66b38d
HS
4529 /* arglen 1 + class block */
4530 node += 1 + ((ANYOF_FLAGS(node) & ANYOF_LARGE)
4531 ? ANYOF_CLASS_SKIP : ANYOF_SKIP);
4532 node = NEXTOPER(node);
a0ed51b3 4533 }
22c35a8c 4534 else if (PL_regkind[(U8)op] == EXACT) {
c277df42 4535 /* Literal string, where present. */
cd439c50 4536 node += NODE_SZ_STR(node) - 1;
c277df42 4537 node = NEXTOPER(node);
a0ed51b3
LW
4538 }
4539 else {
c277df42
IZ
4540 node = NEXTOPER(node);
4541 node += regarglen[(U8)op];
4542 }
4543 if (op == CURLYX || op == OPEN)
4544 l++;
4545 else if (op == WHILEM)
4546 l--;
4547 }
4548 return node;
4549}
4550
8fa7f367
JH
4551#endif /* DEBUGGING */
4552
a687059c 4553/*
fd181c75 4554 - regdump - dump a regexp onto Perl_debug_log in vaguely comprehensible form
a687059c
LW
4555 */
4556void
864dbfa3 4557Perl_regdump(pTHX_ regexp *r)
a687059c 4558{
35ff7856 4559#ifdef DEBUGGING
46fc3d4c 4560 SV *sv = sv_newmortal();
a687059c 4561
c277df42 4562 (void)dumpuntil(r->program, r->program + 1, NULL, sv, 0);
a0d0e21e
LW
4563
4564 /* Header fields of interest. */
c277df42 4565 if (r->anchored_substr)
7b0972df 4566 PerlIO_printf(Perl_debug_log,
b81d288d 4567 "anchored `%s%.*s%s'%s at %"IVdf" ",
3280af22 4568 PL_colors[0],
7b0972df 4569 (int)(SvCUR(r->anchored_substr) - (SvTAIL(r->anchored_substr)!=0)),
b81d288d 4570 SvPVX(r->anchored_substr),
3280af22 4571 PL_colors[1],
c277df42 4572 SvTAIL(r->anchored_substr) ? "$" : "",
7b0972df 4573 (IV)r->anchored_offset);
33b8afdf
JH
4574 else if (r->anchored_utf8)
4575 PerlIO_printf(Perl_debug_log,
4576 "anchored utf8 `%s%.*s%s'%s at %"IVdf" ",
4577 PL_colors[0],
4578 (int)(SvCUR(r->anchored_utf8) - (SvTAIL(r->anchored_utf8)!=0)),
4579 SvPVX(r->anchored_utf8),
4580 PL_colors[1],
4581 SvTAIL(r->anchored_utf8) ? "$" : "",
4582 (IV)r->anchored_offset);
c277df42 4583 if (r->float_substr)
7b0972df 4584 PerlIO_printf(Perl_debug_log,
b81d288d 4585 "floating `%s%.*s%s'%s at %"IVdf"..%"UVuf" ",
3280af22 4586 PL_colors[0],
b81d288d 4587 (int)(SvCUR(r->float_substr) - (SvTAIL(r->float_substr)!=0)),
2c2d71f5 4588 SvPVX(r->float_substr),
3280af22 4589 PL_colors[1],
c277df42 4590 SvTAIL(r->float_substr) ? "$" : "",
7b0972df 4591 (IV)r->float_min_offset, (UV)r->float_max_offset);
33b8afdf
JH
4592 else if (r->float_utf8)
4593 PerlIO_printf(Perl_debug_log,
4594 "floating utf8 `%s%.*s%s'%s at %"IVdf"..%"UVuf" ",
4595 PL_colors[0],
4596 (int)(SvCUR(r->float_utf8) - (SvTAIL(r->float_utf8)!=0)),
4597 SvPVX(r->float_utf8),
4598 PL_colors[1],
4599 SvTAIL(r->float_utf8) ? "$" : "",
4600 (IV)r->float_min_offset, (UV)r->float_max_offset);
4601 if (r->check_substr || r->check_utf8)
b81d288d
AB
4602 PerlIO_printf(Perl_debug_log,
4603 r->check_substr == r->float_substr
33b8afdf 4604 && r->check_utf8 == r->float_utf8
c277df42
IZ
4605 ? "(checking floating" : "(checking anchored");
4606 if (r->reganch & ROPT_NOSCAN)
4607 PerlIO_printf(Perl_debug_log, " noscan");
4608 if (r->reganch & ROPT_CHECK_ALL)
4609 PerlIO_printf(Perl_debug_log, " isall");
33b8afdf 4610 if (r->check_substr || r->check_utf8)
c277df42
IZ
4611 PerlIO_printf(Perl_debug_log, ") ");
4612
46fc3d4c 4613 if (r->regstclass) {
4614 regprop(sv, r->regstclass);
4615 PerlIO_printf(Perl_debug_log, "stclass `%s' ", SvPVX(sv));
4616 }
774d564b 4617 if (r->reganch & ROPT_ANCH) {
4618 PerlIO_printf(Perl_debug_log, "anchored");
4619 if (r->reganch & ROPT_ANCH_BOL)
4620 PerlIO_printf(Perl_debug_log, "(BOL)");
c277df42
IZ
4621 if (r->reganch & ROPT_ANCH_MBOL)
4622 PerlIO_printf(Perl_debug_log, "(MBOL)");
cad2e5aa
JH
4623 if (r->reganch & ROPT_ANCH_SBOL)
4624 PerlIO_printf(Perl_debug_log, "(SBOL)");
774d564b 4625 if (r->reganch & ROPT_ANCH_GPOS)
4626 PerlIO_printf(Perl_debug_log, "(GPOS)");
4627 PerlIO_putc(Perl_debug_log, ' ');
4628 }
c277df42
IZ
4629 if (r->reganch & ROPT_GPOS_SEEN)
4630 PerlIO_printf(Perl_debug_log, "GPOS ");
a0d0e21e 4631 if (r->reganch & ROPT_SKIP)
760ac839 4632 PerlIO_printf(Perl_debug_log, "plus ");
a0d0e21e 4633 if (r->reganch & ROPT_IMPLICIT)
760ac839 4634 PerlIO_printf(Perl_debug_log, "implicit ");
760ac839 4635 PerlIO_printf(Perl_debug_log, "minlen %ld ", (long) r->minlen);
ce862d02
IZ
4636 if (r->reganch & ROPT_EVAL_SEEN)
4637 PerlIO_printf(Perl_debug_log, "with eval ");
760ac839 4638 PerlIO_printf(Perl_debug_log, "\n");
fac92740
MJD
4639 if (r->offsets) {
4640 U32 i;
4641 U32 len = r->offsets[0];
392fbf5d 4642 PerlIO_printf(Perl_debug_log, "Offsets: [%"UVuf"]\n\t", (UV)r->offsets[0]);
fac92740 4643 for (i = 1; i <= len; i++)
392fbf5d
RB
4644 PerlIO_printf(Perl_debug_log, "%"UVuf"[%"UVuf"] ",
4645 (UV)r->offsets[i*2-1],
4646 (UV)r->offsets[i*2]);
fac92740
MJD
4647 PerlIO_printf(Perl_debug_log, "\n");
4648 }
17c3b450 4649#endif /* DEBUGGING */
a687059c
LW
4650}
4651
8fa7f367
JH
4652#ifdef DEBUGGING
4653
653099ff
GS
4654STATIC void
4655S_put_byte(pTHX_ SV *sv, int c)
4656{
7be5a6cf 4657 if (isCNTRL(c) || c == 255 || !isPRINT(c))
653099ff
GS
4658 Perl_sv_catpvf(aTHX_ sv, "\\%o", c);
4659 else if (c == '-' || c == ']' || c == '\\' || c == '^')
4660 Perl_sv_catpvf(aTHX_ sv, "\\%c", c);
4661 else
4662 Perl_sv_catpvf(aTHX_ sv, "%c", c);
4663}
4664
8fa7f367
JH
4665#endif /* DEBUGGING */
4666
a687059c 4667/*
a0d0e21e
LW
4668- regprop - printable representation of opcode
4669*/
46fc3d4c 4670void
864dbfa3 4671Perl_regprop(pTHX_ SV *sv, regnode *o)
a687059c 4672{
35ff7856 4673#ifdef DEBUGGING
9b155405 4674 register int k;
a0d0e21e 4675
54dc92de 4676 sv_setpvn(sv, "", 0);
9b155405 4677 if (OP(o) >= reg_num) /* regnode.type is unsigned */
830247a4
IZ
4678 /* It would be nice to FAIL() here, but this may be called from
4679 regexec.c, and it would be hard to supply pRExC_state. */
4680 Perl_croak(aTHX_ "Corrupted regexp opcode");
9b155405
IZ
4681 sv_catpv(sv, (char*)reg_name[OP(o)]); /* Take off const! */
4682
4683 k = PL_regkind[(U8)OP(o)];
4684
2a782b5b
JH
4685 if (k == EXACT) {
4686 SV *dsv = sv_2mortal(newSVpvn("", 0));
c728cb41
JH
4687 /* Using is_utf8_string() is a crude hack but it may
4688 * be the best for now since we have no flag "this EXACTish
4689 * node was UTF-8" --jhi */
4690 bool do_utf8 = is_utf8_string((U8*)STRING(o), STR_LEN(o));
8a989385 4691 char *s = do_utf8 ?
c728cb41
JH
4692 pv_uni_display(dsv, (U8*)STRING(o), STR_LEN(o), 60,
4693 UNI_DISPLAY_REGEX) :
2a782b5b 4694 STRING(o);
40eddc46 4695 int len = do_utf8 ?
2a782b5b
JH
4696 strlen(s) :
4697 STR_LEN(o);
4698 Perl_sv_catpvf(aTHX_ sv, " <%s%.*s%s>",
4699 PL_colors[0],
4700 len, s,
4701 PL_colors[1]);
4702 }
9b155405 4703 else if (k == CURLY) {
cb434fcc 4704 if (OP(o) == CURLYM || OP(o) == CURLYN || OP(o) == CURLYX)
cea2e8a9
GS
4705 Perl_sv_catpvf(aTHX_ sv, "[%d]", o->flags); /* Parenth number */
4706 Perl_sv_catpvf(aTHX_ sv, " {%d,%d}", ARG1(o), ARG2(o));
a0d0e21e 4707 }
2c2d71f5
JH
4708 else if (k == WHILEM && o->flags) /* Ordinal/of */
4709 Perl_sv_catpvf(aTHX_ sv, "[%d/%d]", o->flags & 0xf, o->flags>>4);
9b155405 4710 else if (k == REF || k == OPEN || k == CLOSE || k == GROUPP )
894356b3 4711 Perl_sv_catpvf(aTHX_ sv, "%d", (int)ARG(o)); /* Parenth number */
9b155405 4712 else if (k == LOGICAL)
04ebc1ab 4713 Perl_sv_catpvf(aTHX_ sv, "[%d]", o->flags); /* 2: embedded, otherwise 1 */
653099ff
GS
4714 else if (k == ANYOF) {
4715 int i, rangestart = -1;
ffc61ed2 4716 U8 flags = ANYOF_FLAGS(o);
19860706
JH
4717 const char * const anyofs[] = { /* Should be syncronized with
4718 * ANYOF_ #xdefines in regcomp.h */
653099ff
GS
4719 "\\w",
4720 "\\W",
4721 "\\s",
4722 "\\S",
4723 "\\d",
4724 "\\D",
4725 "[:alnum:]",
4726 "[:^alnum:]",
4727 "[:alpha:]",
4728 "[:^alpha:]",
4729 "[:ascii:]",
4730 "[:^ascii:]",
4731 "[:ctrl:]",
4732 "[:^ctrl:]",
4733 "[:graph:]",
4734 "[:^graph:]",
4735 "[:lower:]",
4736 "[:^lower:]",
4737 "[:print:]",
4738 "[:^print:]",
4739 "[:punct:]",
4740 "[:^punct:]",
4741 "[:upper:]",
aaa51d5e 4742 "[:^upper:]",
653099ff 4743 "[:xdigit:]",
aaa51d5e
JF
4744 "[:^xdigit:]",
4745 "[:space:]",
4746 "[:^space:]",
4747 "[:blank:]",
4748 "[:^blank:]"
653099ff
GS
4749 };
4750
19860706 4751 if (flags & ANYOF_LOCALE)
653099ff 4752 sv_catpv(sv, "{loc}");
19860706 4753 if (flags & ANYOF_FOLD)
653099ff
GS
4754 sv_catpv(sv, "{i}");
4755 Perl_sv_catpvf(aTHX_ sv, "[%s", PL_colors[0]);
19860706 4756 if (flags & ANYOF_INVERT)
653099ff 4757 sv_catpv(sv, "^");
ffc61ed2
JH
4758 for (i = 0; i <= 256; i++) {
4759 if (i < 256 && ANYOF_BITMAP_TEST(o,i)) {
4760 if (rangestart == -1)
4761 rangestart = i;
4762 } else if (rangestart != -1) {
4763 if (i <= rangestart + 3)
4764 for (; rangestart < i; rangestart++)
653099ff 4765 put_byte(sv, rangestart);
ffc61ed2
JH
4766 else {
4767 put_byte(sv, rangestart);
4768 sv_catpv(sv, "-");
4769 put_byte(sv, i - 1);
653099ff 4770 }
ffc61ed2 4771 rangestart = -1;
653099ff 4772 }
847a199f 4773 }
ffc61ed2
JH
4774
4775 if (o->flags & ANYOF_CLASS)
4776 for (i = 0; i < sizeof(anyofs)/sizeof(char*); i++)
4777 if (ANYOF_CLASS_TEST(o,i))
4778 sv_catpv(sv, anyofs[i]);
4779
4780 if (flags & ANYOF_UNICODE)
4781 sv_catpv(sv, "{unicode}");
1aa99e6b 4782 else if (flags & ANYOF_UNICODE_ALL)
2a782b5b 4783 sv_catpv(sv, "{unicode_all}");
ffc61ed2
JH
4784
4785 {
4786 SV *lv;
9e55ce06 4787 SV *sw = regclass_swash(o, FALSE, &lv, 0);
b81d288d 4788
ffc61ed2
JH
4789 if (lv) {
4790 if (sw) {
ffc61ed2 4791 U8 s[UTF8_MAXLEN+1];
b81d288d 4792
ffc61ed2 4793 for (i = 0; i <= 256; i++) { /* just the first 256 */
2b9d42f0 4794 U8 *e = uvchr_to_utf8(s, i);
ffc61ed2 4795
3568d838 4796 if (i < 256 && swash_fetch(sw, s, TRUE)) {
ffc61ed2
JH
4797 if (rangestart == -1)
4798 rangestart = i;
4799 } else if (rangestart != -1) {
4800 U8 *p;
b81d288d 4801
ffc61ed2
JH
4802 if (i <= rangestart + 3)
4803 for (; rangestart < i; rangestart++) {
2b9d42f0 4804 for(e = uvchr_to_utf8(s, rangestart), p = s; p < e; p++)
ffc61ed2
JH
4805 put_byte(sv, *p);
4806 }
4807 else {
2b9d42f0 4808 for (e = uvchr_to_utf8(s, rangestart), p = s; p < e; p++)
ffc61ed2
JH
4809 put_byte(sv, *p);
4810 sv_catpv(sv, "-");
2b9d42f0 4811 for (e = uvchr_to_utf8(s, i - 1), p = s; p < e; p++)
ffc61ed2
JH
4812 put_byte(sv, *p);
4813 }
4814 rangestart = -1;
4815 }
19860706 4816 }
ffc61ed2
JH
4817
4818 sv_catpv(sv, "..."); /* et cetera */
19860706 4819 }
fde631ed 4820
ffc61ed2
JH
4821 {
4822 char *s = savepv(SvPVX(lv));
4823 char *origs = s;
b81d288d 4824
ffc61ed2 4825 while(*s && *s != '\n') s++;
b81d288d 4826
ffc61ed2
JH
4827 if (*s == '\n') {
4828 char *t = ++s;
4829
4830 while (*s) {
4831 if (*s == '\n')
4832 *s = ' ';
4833 s++;
4834 }
4835 if (s[-1] == ' ')
4836 s[-1] = 0;
4837
4838 sv_catpv(sv, t);
fde631ed 4839 }
b81d288d 4840
ffc61ed2 4841 Safefree(origs);
fde631ed
JH
4842 }
4843 }
653099ff 4844 }
ffc61ed2 4845
653099ff
GS
4846 Perl_sv_catpvf(aTHX_ sv, "%s]", PL_colors[1]);
4847 }
9b155405 4848 else if (k == BRANCHJ && (OP(o) == UNLESSM || OP(o) == IFMATCH))
cea2e8a9 4849 Perl_sv_catpvf(aTHX_ sv, "[-%d]", o->flags);
17c3b450 4850#endif /* DEBUGGING */
35ff7856 4851}
a687059c 4852
cad2e5aa
JH
4853SV *
4854Perl_re_intuit_string(pTHX_ regexp *prog)
4855{ /* Assume that RE_INTUIT is set */
4856 DEBUG_r(
4857 { STRLEN n_a;
33b8afdf
JH
4858 char *s = SvPV(prog->check_substr
4859 ? prog->check_substr : prog->check_utf8, n_a);
cad2e5aa
JH
4860
4861 if (!PL_colorset) reginitcolors();
4862 PerlIO_printf(Perl_debug_log,
33b8afdf
JH
4863 "%sUsing REx %ssubstr:%s `%s%.60s%s%s'\n",
4864 PL_colors[4],
4865 prog->check_substr ? "" : "utf8 ",
4866 PL_colors[5],PL_colors[0],
cad2e5aa
JH
4867 s,
4868 PL_colors[1],
4869 (strlen(s) > 60 ? "..." : ""));
4870 } );
4871
33b8afdf 4872 return prog->check_substr ? prog->check_substr : prog->check_utf8;
cad2e5aa
JH
4873}
4874
2b69d0c2 4875void
864dbfa3 4876Perl_pregfree(pTHX_ struct regexp *r)
a687059c 4877{
9e55ce06
JH
4878#ifdef DEBUGGING
4879 SV *dsv = PERL_DEBUG_PAD_ZERO(0);
4880#endif
7821416a
IZ
4881
4882 if (!r || (--r->refcnt > 0))
4883 return;
9e55ce06 4884 DEBUG_r({
d103360b
HS
4885 int len;
4886 char *s;
4887
4888 s = (r->reganch & ROPT_UTF8) ? pv_uni_display(dsv, (U8*)r->precomp,
4889 r->prelen, 60, UNI_DISPLAY_REGEX)
9f369894 4890 : pv_display(dsv, r->precomp, r->prelen, 0, 60);
d103360b 4891 len = SvCUR(dsv);
9e55ce06
JH
4892 if (!PL_colorset)
4893 reginitcolors();
4894 PerlIO_printf(Perl_debug_log,
4895 "%sFreeing REx:%s `%s%*.*s%s%s'\n",
4896 PL_colors[4],PL_colors[5],PL_colors[0],
4897 len, len, s,
4898 PL_colors[1],
4899 len > 60 ? "..." : "");
4900 });
cad2e5aa 4901
c277df42 4902 if (r->precomp)
a0d0e21e 4903 Safefree(r->precomp);
fac92740
MJD
4904 if (r->offsets) /* 20010421 MJD */
4905 Safefree(r->offsets);
cf93c79d
IZ
4906 if (RX_MATCH_COPIED(r))
4907 Safefree(r->subbeg);
a193d654
GS
4908 if (r->substrs) {
4909 if (r->anchored_substr)
4910 SvREFCNT_dec(r->anchored_substr);
33b8afdf
JH
4911 if (r->anchored_utf8)
4912 SvREFCNT_dec(r->anchored_utf8);
a193d654
GS
4913 if (r->float_substr)
4914 SvREFCNT_dec(r->float_substr);
33b8afdf
JH
4915 if (r->float_utf8)
4916 SvREFCNT_dec(r->float_utf8);
2779dcf1 4917 Safefree(r->substrs);
a193d654 4918 }
c277df42
IZ
4919 if (r->data) {
4920 int n = r->data->count;
dfad63ad
HS
4921 AV* new_comppad = NULL;
4922 AV* old_comppad;
4923 SV** old_curpad;
4924
c277df42 4925 while (--n >= 0) {
261faec3 4926 /* If you add a ->what type here, update the comment in regcomp.h */
c277df42
IZ
4927 switch (r->data->what[n]) {
4928 case 's':
4929 SvREFCNT_dec((SV*)r->data->data[n]);
4930 break;
653099ff
GS
4931 case 'f':
4932 Safefree(r->data->data[n]);
4933 break;
dfad63ad
HS
4934 case 'p':
4935 new_comppad = (AV*)r->data->data[n];
4936 break;
c277df42 4937 case 'o':
dfad63ad 4938 if (new_comppad == NULL)
cea2e8a9 4939 Perl_croak(aTHX_ "panic: pregfree comppad");
dfad63ad
HS
4940 old_comppad = PL_comppad;
4941 old_curpad = PL_curpad;
1e6dc0b6
SB
4942 /* Watch out for global destruction's random ordering. */
4943 if (SvTYPE(new_comppad) == SVt_PVAV) {
4944 PL_comppad = new_comppad;
4945 PL_curpad = AvARRAY(new_comppad);
4946 }
4947 else
4948 PL_curpad = NULL;
9b978d73
DM
4949
4950 if (!OpREFCNT_dec((OP_4tree*)r->data->data[n])) {
4951 op_free((OP_4tree*)r->data->data[n]);
4952 }
4953
dfad63ad
HS
4954 PL_comppad = old_comppad;
4955 PL_curpad = old_curpad;
4956 SvREFCNT_dec((SV*)new_comppad);
4957 new_comppad = NULL;
c277df42
IZ
4958 break;
4959 case 'n':
9e55ce06 4960 break;
c277df42 4961 default:
830247a4 4962 Perl_croak(aTHX_ "panic: regfree data code '%c'", r->data->what[n]);
c277df42
IZ
4963 }
4964 }
4965 Safefree(r->data->what);
4966 Safefree(r->data);
a0d0e21e
LW
4967 }
4968 Safefree(r->startp);
4969 Safefree(r->endp);
4970 Safefree(r);
a687059c 4971}
c277df42
IZ
4972
4973/*
4974 - regnext - dig the "next" pointer out of a node
4975 *
4976 * [Note, when REGALIGN is defined there are two places in regmatch()
4977 * that bypass this code for speed.]
4978 */
4979regnode *
864dbfa3 4980Perl_regnext(pTHX_ register regnode *p)
c277df42
IZ
4981{
4982 register I32 offset;
4983
3280af22 4984 if (p == &PL_regdummy)
c277df42
IZ
4985 return(NULL);
4986
4987 offset = (reg_off_by_arg[OP(p)] ? ARG(p) : NEXT_OFF(p));
4988 if (offset == 0)
4989 return(NULL);
4990
c277df42 4991 return(p+offset);
c277df42
IZ
4992}
4993
01f988be 4994STATIC void
cea2e8a9 4995S_re_croak2(pTHX_ const char* pat1,const char* pat2,...)
c277df42
IZ
4996{
4997 va_list args;
4998 STRLEN l1 = strlen(pat1);
4999 STRLEN l2 = strlen(pat2);
5000 char buf[512];
06bf62c7 5001 SV *msv;
c277df42
IZ
5002 char *message;
5003
5004 if (l1 > 510)
5005 l1 = 510;
5006 if (l1 + l2 > 510)
5007 l2 = 510 - l1;
5008 Copy(pat1, buf, l1 , char);
5009 Copy(pat2, buf + l1, l2 , char);
3b818b81
GS
5010 buf[l1 + l2] = '\n';
5011 buf[l1 + l2 + 1] = '\0';
8736538c
AS
5012#ifdef I_STDARG
5013 /* ANSI variant takes additional second argument */
c277df42 5014 va_start(args, pat2);
8736538c
AS
5015#else
5016 va_start(args);
5017#endif
5a844595 5018 msv = vmess(buf, &args);
c277df42 5019 va_end(args);
06bf62c7 5020 message = SvPV(msv,l1);
c277df42
IZ
5021 if (l1 > 512)
5022 l1 = 512;
5023 Copy(message, buf, l1 , char);
5024 buf[l1] = '\0'; /* Overwrite \n */
cea2e8a9 5025 Perl_croak(aTHX_ "%s", buf);
c277df42 5026}
a0ed51b3
LW
5027
5028/* XXX Here's a total kludge. But we need to re-enter for swash routines. */
5029
5030void
864dbfa3 5031Perl_save_re_context(pTHX)
b81d288d 5032{
830247a4
IZ
5033#if 0
5034 SAVEPPTR(RExC_precomp); /* uncompiled string. */
5035 SAVEI32(RExC_npar); /* () count. */
5036 SAVEI32(RExC_size); /* Code size. */
e2509266 5037 SAVEI32(RExC_flags); /* are we folding, multilining? */
830247a4
IZ
5038 SAVEVPTR(RExC_rx); /* from regcomp.c */
5039 SAVEI32(RExC_seen); /* from regcomp.c */
5040 SAVEI32(RExC_sawback); /* Did we see \1, ...? */
5041 SAVEI32(RExC_naughty); /* How bad is this pattern? */
5042 SAVEVPTR(RExC_emit); /* Code-emit pointer; &regdummy = don't */
5043 SAVEPPTR(RExC_end); /* End of input for compile */
5044 SAVEPPTR(RExC_parse); /* Input-scan pointer. */
5045#endif
5046
5047 SAVEI32(PL_reg_flags); /* from regexec.c */
a0ed51b3 5048 SAVEPPTR(PL_bostr);
a0ed51b3
LW
5049 SAVEPPTR(PL_reginput); /* String-input pointer. */
5050 SAVEPPTR(PL_regbol); /* Beginning of input, for ^ check. */
5051 SAVEPPTR(PL_regeol); /* End of input, for $ check. */
7766f137
GS
5052 SAVEVPTR(PL_regstartp); /* Pointer to startp array. */
5053 SAVEVPTR(PL_regendp); /* Ditto for endp. */
5054 SAVEVPTR(PL_reglastparen); /* Similarly for lastparen. */
a0ed51b3 5055 SAVEPPTR(PL_regtill); /* How far we are required to go. */
b81d288d 5056 SAVEGENERICPV(PL_reg_start_tmp); /* from regexec.c */
a0ed51b3 5057 PL_reg_start_tmp = 0;
a0ed51b3
LW
5058 SAVEI32(PL_reg_start_tmpl); /* from regexec.c */
5059 PL_reg_start_tmpl = 0;
7766f137 5060 SAVEVPTR(PL_regdata);
a0ed51b3
LW
5061 SAVEI32(PL_reg_eval_set); /* from regexec.c */
5062 SAVEI32(PL_regnarrate); /* from regexec.c */
7766f137 5063 SAVEVPTR(PL_regprogram); /* from regexec.c */
a0ed51b3 5064 SAVEINT(PL_regindent); /* from regexec.c */
7766f137
GS
5065 SAVEVPTR(PL_regcc); /* from regexec.c */
5066 SAVEVPTR(PL_curcop);
7766f137
GS
5067 SAVEVPTR(PL_reg_call_cc); /* from regexec.c */
5068 SAVEVPTR(PL_reg_re); /* from regexec.c */
54b6e2fa
IZ
5069 SAVEPPTR(PL_reg_ganch); /* from regexec.c */
5070 SAVESPTR(PL_reg_sv); /* from regexec.c */
53c4c00c 5071 SAVEI8(PL_reg_match_utf8); /* from regexec.c */
7766f137 5072 SAVEVPTR(PL_reg_magic); /* from regexec.c */
54b6e2fa 5073 SAVEI32(PL_reg_oldpos); /* from regexec.c */
7766f137
GS
5074 SAVEVPTR(PL_reg_oldcurpm); /* from regexec.c */
5075 SAVEVPTR(PL_reg_curpm); /* from regexec.c */
5fb7366e 5076 SAVEI32(PL_regnpar); /* () count. */
e49a9654 5077 SAVEI32(PL_regsize); /* from regexec.c */
54b6e2fa 5078#ifdef DEBUGGING
b81d288d 5079 SAVEPPTR(PL_reg_starttry); /* from regexec.c */
54b6e2fa 5080#endif
a0ed51b3 5081}
51371543 5082
51371543 5083static void
acfe0abc 5084clear_re(pTHX_ void *r)
51371543
GS
5085{
5086 ReREFCNT_dec((regexp *)r);
5087}
ffbc6a93 5088