This is a live mirror of the Perl 5 development currently hosted at https://github.com/perl/perl5
MANIFES --> MANIFEST
[perl5.git] / regcomp.c
CommitLineData
a0d0e21e
LW
1/* regcomp.c
2 */
3
4/*
5 * "A fair jaw-cracker dwarf-language must be." --Samwise Gamgee
6 */
7
a687059c
LW
8/* NOTE: this is derived from Henry Spencer's regexp code, and should not
9 * confused with the original package (see point 3 below). Thanks, Henry!
10 */
11
12/* Additional note: this code is very heavily munged from Henry's version
13 * in places. In some spots I've traded clarity for efficiency, so don't
14 * blame Henry for some of the lack of readability.
15 */
16
e50aee73
AD
17/* The names of the functions have been changed from regcomp and
18 * regexec to pregcomp and pregexec in order to avoid conflicts
19 * with the POSIX routines of the same names.
20*/
21
b9d5759e
AD
22#ifdef PERL_EXT_RE_BUILD
23/* need to replace pregcomp et al, so enable that */
24# ifndef PERL_IN_XSUB_RE
25# define PERL_IN_XSUB_RE
26# endif
27/* need access to debugger hooks */
cad2e5aa 28# if defined(PERL_EXT_RE_DEBUG) && !defined(DEBUGGING)
b9d5759e
AD
29# define DEBUGGING
30# endif
31#endif
32
33#ifdef PERL_IN_XSUB_RE
d06ea78c 34/* We *really* need to overwrite these symbols: */
56953603
IZ
35# define Perl_pregcomp my_regcomp
36# define Perl_regdump my_regdump
37# define Perl_regprop my_regprop
d06ea78c 38# define Perl_pregfree my_regfree
cad2e5aa
JH
39# define Perl_re_intuit_string my_re_intuit_string
40/* *These* symbols are masked to allow static link. */
d06ea78c 41# define Perl_regnext my_regnext
f0b8d043 42# define Perl_save_re_context my_save_re_context
b81d288d 43# define Perl_reginitcolors my_reginitcolors
c5be433b
GS
44
45# define PERL_NO_GET_CONTEXT
b81d288d 46#endif
56953603 47
f0fcb552 48/*SUPPRESS 112*/
a687059c 49/*
e50aee73 50 * pregcomp and pregexec -- regsub and regerror are not used in perl
a687059c
LW
51 *
52 * Copyright (c) 1986 by University of Toronto.
53 * Written by Henry Spencer. Not derived from licensed software.
54 *
55 * Permission is granted to anyone to use this software for any
56 * purpose on any computer system, and to redistribute it freely,
57 * subject to the following restrictions:
58 *
59 * 1. The author is not responsible for the consequences of use of
60 * this software, no matter how awful, even if they arise
61 * from defects in it.
62 *
63 * 2. The origin of this software must not be misrepresented, either
64 * by explicit claim or by omission.
65 *
66 * 3. Altered versions must be plainly marked as such, and must not
67 * be misrepresented as being the original software.
68 *
69 *
70 **** Alterations to Henry's code are...
71 ****
be3c0a43 72 **** Copyright (c) 1991-2002, Larry Wall
a687059c 73 ****
9ef589d8
LW
74 **** You may distribute under the terms of either the GNU General Public
75 **** License or the Artistic License, as specified in the README file.
76
a687059c
LW
77 *
78 * Beware that some of this code is subtly aware of the way operator
79 * precedence is structured in regular expressions. Serious changes in
80 * regular-expression syntax might require a total rethink.
81 */
82#include "EXTERN.h"
864dbfa3 83#define PERL_IN_REGCOMP_C
a687059c 84#include "perl.h"
d06ea78c 85
acfe0abc 86#ifndef PERL_IN_XSUB_RE
d06ea78c
GS
87# include "INTERN.h"
88#endif
c277df42
IZ
89
90#define REG_COMP_C
a687059c
LW
91#include "regcomp.h"
92
d4cce5f1 93#ifdef op
11343788 94#undef op
d4cce5f1 95#endif /* op */
11343788 96
fe14fcc3
LW
97#ifdef MSDOS
98# if defined(BUGGY_MSC6)
99 /* MSC 6.00A breaks on op/regexp.t test 85 unless we turn this off */
100 # pragma optimize("a",off)
101 /* But MSC 6.00A is happy with 'w', for aliases only across function calls*/
102 # pragma optimize("w",on )
103# endif /* BUGGY_MSC6 */
104#endif /* MSDOS */
105
a687059c
LW
106#ifndef STATIC
107#define STATIC static
108#endif
109
830247a4
IZ
110typedef struct RExC_state_t {
111 U16 flags16; /* are we folding, multilining? */
112 char *precomp; /* uncompiled string. */
113 regexp *rx;
fac92740 114 char *start; /* Start of input for compile */
830247a4
IZ
115 char *end; /* End of input for compile */
116 char *parse; /* Input-scan pointer. */
117 I32 whilem_seen; /* number of WHILEM in this expr */
fac92740 118 regnode *emit_start; /* Start of emitted-code area */
ffc61ed2 119 regnode *emit; /* Code-emit pointer; &regdummy = don't = compiling */
830247a4
IZ
120 I32 naughty; /* How bad is this pattern? */
121 I32 sawback; /* Did we see \1, ...? */
122 U32 seen;
123 I32 size; /* Code size. */
124 I32 npar; /* () count. */
125 I32 extralen;
126 I32 seen_zerolen;
127 I32 seen_evals;
1aa99e6b 128 I32 utf8;
830247a4
IZ
129#if ADD_TO_REGEXEC
130 char *starttry; /* -Dr: where regtry was called. */
131#define RExC_starttry (pRExC_state->starttry)
132#endif
133} RExC_state_t;
134
135#define RExC_flags16 (pRExC_state->flags16)
136#define RExC_precomp (pRExC_state->precomp)
137#define RExC_rx (pRExC_state->rx)
fac92740 138#define RExC_start (pRExC_state->start)
830247a4
IZ
139#define RExC_end (pRExC_state->end)
140#define RExC_parse (pRExC_state->parse)
141#define RExC_whilem_seen (pRExC_state->whilem_seen)
fac92740 142#define RExC_offsets (pRExC_state->rx->offsets) /* I am not like the others */
830247a4 143#define RExC_emit (pRExC_state->emit)
fac92740 144#define RExC_emit_start (pRExC_state->emit_start)
830247a4
IZ
145#define RExC_naughty (pRExC_state->naughty)
146#define RExC_sawback (pRExC_state->sawback)
147#define RExC_seen (pRExC_state->seen)
148#define RExC_size (pRExC_state->size)
149#define RExC_npar (pRExC_state->npar)
150#define RExC_extralen (pRExC_state->extralen)
151#define RExC_seen_zerolen (pRExC_state->seen_zerolen)
152#define RExC_seen_evals (pRExC_state->seen_evals)
1aa99e6b 153#define RExC_utf8 (pRExC_state->utf8)
830247a4 154
a687059c
LW
155#define ISMULT1(c) ((c) == '*' || (c) == '+' || (c) == '?')
156#define ISMULT2(s) ((*s) == '*' || (*s) == '+' || (*s) == '?' || \
157 ((*s) == '{' && regcurly(s)))
a687059c 158
35c8bce7
LW
159#ifdef SPSTART
160#undef SPSTART /* dratted cpp namespace... */
161#endif
a687059c
LW
162/*
163 * Flags to be passed up and down.
164 */
a687059c 165#define WORST 0 /* Worst case. */
821b33a5 166#define HASWIDTH 0x1 /* Known to match non-null strings. */
a0d0e21e
LW
167#define SIMPLE 0x2 /* Simple enough to be STAR/PLUS operand. */
168#define SPSTART 0x4 /* Starts with * or +. */
169#define TRYAGAIN 0x8 /* Weeded out a declaration. */
a687059c 170
2c2d71f5
JH
171/* Length of a variant. */
172
173typedef struct scan_data_t {
174 I32 len_min;
175 I32 len_delta;
176 I32 pos_min;
177 I32 pos_delta;
178 SV *last_found;
179 I32 last_end; /* min value, <0 unless valid. */
180 I32 last_start_min;
181 I32 last_start_max;
182 SV **longest; /* Either &l_fixed, or &l_float. */
183 SV *longest_fixed;
184 I32 offset_fixed;
185 SV *longest_float;
186 I32 offset_float_min;
187 I32 offset_float_max;
188 I32 flags;
189 I32 whilem_c;
cb434fcc 190 I32 *last_closep;
653099ff 191 struct regnode_charclass_class *start_class;
2c2d71f5
JH
192} scan_data_t;
193
a687059c 194/*
e50aee73 195 * Forward declarations for pregcomp()'s friends.
a687059c 196 */
a0d0e21e 197
b81d288d 198static scan_data_t zero_scan_data = { 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
cb434fcc 199 0, 0, 0, 0, 0, 0};
c277df42
IZ
200
201#define SF_BEFORE_EOL (SF_BEFORE_SEOL|SF_BEFORE_MEOL)
202#define SF_BEFORE_SEOL 0x1
203#define SF_BEFORE_MEOL 0x2
204#define SF_FIX_BEFORE_EOL (SF_FIX_BEFORE_SEOL|SF_FIX_BEFORE_MEOL)
205#define SF_FL_BEFORE_EOL (SF_FL_BEFORE_SEOL|SF_FL_BEFORE_MEOL)
206
09b7f37c
CB
207#ifdef NO_UNARY_PLUS
208# define SF_FIX_SHIFT_EOL (0+2)
209# define SF_FL_SHIFT_EOL (0+4)
210#else
211# define SF_FIX_SHIFT_EOL (+2)
212# define SF_FL_SHIFT_EOL (+4)
213#endif
c277df42
IZ
214
215#define SF_FIX_BEFORE_SEOL (SF_BEFORE_SEOL << SF_FIX_SHIFT_EOL)
216#define SF_FIX_BEFORE_MEOL (SF_BEFORE_MEOL << SF_FIX_SHIFT_EOL)
217
218#define SF_FL_BEFORE_SEOL (SF_BEFORE_SEOL << SF_FL_SHIFT_EOL)
219#define SF_FL_BEFORE_MEOL (SF_BEFORE_MEOL << SF_FL_SHIFT_EOL) /* 0x20 */
220#define SF_IS_INF 0x40
221#define SF_HAS_PAR 0x80
222#define SF_IN_PAR 0x100
223#define SF_HAS_EVAL 0x200
4bfe0158 224#define SCF_DO_SUBSTR 0x400
653099ff
GS
225#define SCF_DO_STCLASS_AND 0x0800
226#define SCF_DO_STCLASS_OR 0x1000
227#define SCF_DO_STCLASS (SCF_DO_STCLASS_AND|SCF_DO_STCLASS_OR)
e1901655 228#define SCF_WHILEM_VISITED_POS 0x2000
c277df42 229
1aa99e6b 230#define UTF RExC_utf8
830247a4
IZ
231#define LOC (RExC_flags16 & PMf_LOCALE)
232#define FOLD (RExC_flags16 & PMf_FOLD)
a0ed51b3 233
ffc61ed2 234#define OOB_UNICODE 12345678
93733859 235#define OOB_NAMEDCLASS -1
b8c5462f 236
a0ed51b3
LW
237#define CHR_SVLEN(sv) (UTF ? sv_len_utf8(sv) : SvCUR(sv))
238#define CHR_DIST(a,b) (UTF ? utf8_distance(a,b) : a - b)
239
8615cb43 240
b45f050a
JF
241/* length of regex to show in messages that don't mark a position within */
242#define RegexLengthToShowInErrorMessages 127
243
244/*
245 * If MARKER[12] are adjusted, be sure to adjust the constants at the top
246 * of t/op/regmesg.t, the tests in t/op/re_tests, and those in
247 * op/pragma/warn/regcomp.
248 */
7253e4e3
RK
249#define MARKER1 "<-- HERE" /* marker as it appears in the description */
250#define MARKER2 " <-- HERE " /* marker as it appears within the regex */
b81d288d 251
7253e4e3 252#define REPORT_LOCATION " in regex; marked by " MARKER1 " in m/%.*s" MARKER2 "%s/"
b45f050a
JF
253
254/*
255 * Calls SAVEDESTRUCTOR_X if needed, then calls Perl_croak with the given
256 * arg. Show regex, up to a maximum length. If it's too long, chop and add
257 * "...".
258 */
a4eb266f 259#define FAIL(msg) \
8615cb43 260 STMT_START { \
a4eb266f 261 char *ellipses = ""; \
7a799e20 262 IV len = RExC_end - RExC_precomp; \
b45f050a 263 \
8615cb43 264 if (!SIZE_ONLY) \
830247a4 265 SAVEDESTRUCTOR_X(clear_re,(void*)RExC_rx); \
b45f050a
JF
266 \
267 if (len > RegexLengthToShowInErrorMessages) { \
268 /* chop 10 shorter than the max, to ensure meaning of "..." */ \
269 len = RegexLengthToShowInErrorMessages - 10; \
a4eb266f 270 ellipses = "..."; \
b45f050a
JF
271 } \
272 Perl_croak(aTHX_ "%s in regex m/%.*s%s/", \
830247a4 273 msg, (int)len, RExC_precomp, ellipses); \
8615cb43
JF
274 } STMT_END
275
b45f050a
JF
276/*
277 * Calls SAVEDESTRUCTOR_X if needed, then calls Perl_croak with the given
278 * args. Show regex, up to a maximum length. If it's too long, chop and add
279 * "...".
280 */
a4eb266f 281#define FAIL2(pat,msg) \
8615cb43 282 STMT_START { \
a4eb266f 283 char *ellipses = ""; \
7a799e20 284 IV len = RExC_end - RExC_precomp; \
b45f050a 285 \
8615cb43 286 if (!SIZE_ONLY) \
830247a4 287 SAVEDESTRUCTOR_X(clear_re,(void*)RExC_rx); \
8615cb43 288 \
b45f050a
JF
289 if (len > RegexLengthToShowInErrorMessages) { \
290 /* chop 10 shorter than the max, to ensure meaning of "..." */ \
291 len = RegexLengthToShowInErrorMessages - 10; \
a4eb266f 292 ellipses = "..."; \
b45f050a
JF
293 } \
294 S_re_croak2(aTHX_ pat, " in regex m/%.*s%s/", \
830247a4 295 msg, (int)len, RExC_precomp, ellipses); \
b45f050a
JF
296 } STMT_END
297
298
299/*
300 * Simple_vFAIL -- like FAIL, but marks the current location in the scan
301 */
302#define Simple_vFAIL(m) \
303 STMT_START { \
7a799e20 304 IV offset = RExC_parse - RExC_precomp; \
b45f050a
JF
305 \
306 Perl_croak(aTHX_ "%s" REPORT_LOCATION, \
830247a4 307 m, (int)offset, RExC_precomp, RExC_precomp + offset); \
b45f050a
JF
308 } STMT_END
309
310/*
311 * Calls SAVEDESTRUCTOR_X if needed, then Simple_vFAIL()
312 */
313#define vFAIL(m) \
314 STMT_START { \
315 if (!SIZE_ONLY) \
830247a4 316 SAVEDESTRUCTOR_X(clear_re,(void*)RExC_rx); \
b45f050a
JF
317 Simple_vFAIL(m); \
318 } STMT_END
319
320/*
321 * Like Simple_vFAIL(), but accepts two arguments.
322 */
323#define Simple_vFAIL2(m,a1) \
324 STMT_START { \
7a799e20 325 IV offset = RExC_parse - RExC_precomp; \
b45f050a
JF
326 \
327 S_re_croak2(aTHX_ m, REPORT_LOCATION, a1, \
830247a4 328 (int)offset, RExC_precomp, RExC_precomp + offset); \
b45f050a
JF
329 } STMT_END
330
331/*
332 * Calls SAVEDESTRUCTOR_X if needed, then Simple_vFAIL2().
333 */
334#define vFAIL2(m,a1) \
335 STMT_START { \
336 if (!SIZE_ONLY) \
830247a4 337 SAVEDESTRUCTOR_X(clear_re,(void*)RExC_rx); \
b45f050a
JF
338 Simple_vFAIL2(m, a1); \
339 } STMT_END
340
341
342/*
343 * Like Simple_vFAIL(), but accepts three arguments.
344 */
345#define Simple_vFAIL3(m, a1, a2) \
346 STMT_START { \
7a799e20 347 IV offset = RExC_parse - RExC_precomp; \
b45f050a
JF
348 \
349 S_re_croak2(aTHX_ m, REPORT_LOCATION, a1, a2, \
830247a4 350 (int)offset, RExC_precomp, RExC_precomp + offset); \
b45f050a
JF
351 } STMT_END
352
353/*
354 * Calls SAVEDESTRUCTOR_X if needed, then Simple_vFAIL3().
355 */
356#define vFAIL3(m,a1,a2) \
357 STMT_START { \
358 if (!SIZE_ONLY) \
830247a4 359 SAVEDESTRUCTOR_X(clear_re,(void*)RExC_rx); \
b45f050a
JF
360 Simple_vFAIL3(m, a1, a2); \
361 } STMT_END
362
363/*
364 * Like Simple_vFAIL(), but accepts four arguments.
365 */
366#define Simple_vFAIL4(m, a1, a2, a3) \
367 STMT_START { \
7a799e20 368 IV offset = RExC_parse - RExC_precomp; \
b45f050a
JF
369 \
370 S_re_croak2(aTHX_ m, REPORT_LOCATION, a1, a2, a3,\
830247a4 371 (int)offset, RExC_precomp, RExC_precomp + offset); \
b45f050a
JF
372 } STMT_END
373
374/*
375 * Like Simple_vFAIL(), but accepts five arguments.
376 */
377#define Simple_vFAIL5(m, a1, a2, a3, a4) \
378 STMT_START { \
7a799e20 379 IV offset = RExC_parse - RExC_precomp; \
b45f050a 380 S_re_croak2(aTHX_ m, REPORT_LOCATION, a1, a2, a3, a4,\
830247a4 381 (int)offset, RExC_precomp, RExC_precomp + offset); \
b45f050a
JF
382 } STMT_END
383
384
385#define vWARN(loc,m) \
386 STMT_START { \
7a799e20 387 IV offset = loc - RExC_precomp; \
9014280d 388 Perl_warner(aTHX_ packWARN(WARN_REGEXP), "%s" REPORT_LOCATION,\
830247a4 389 m, (int)offset, RExC_precomp, RExC_precomp + offset); \
b45f050a
JF
390 } STMT_END \
391
f9373011
PM
392#define vWARNdep(loc,m) \
393 STMT_START { \
7a799e20 394 IV offset = loc - RExC_precomp; \
9014280d 395 Perl_warner(aTHX_ packWARN2(WARN_DEPRECATED, WARN_REGEXP), "%s" REPORT_LOCATION,\
f9373011
PM
396 m, (int)offset, RExC_precomp, RExC_precomp + offset); \
397 } STMT_END \
398
b45f050a
JF
399
400#define vWARN2(loc, m, a1) \
401 STMT_START { \
7a799e20 402 IV offset = loc - RExC_precomp; \
9014280d 403 Perl_warner(aTHX_ packWARN(WARN_REGEXP), m REPORT_LOCATION,\
b45f050a 404 a1, \
830247a4 405 (int)offset, RExC_precomp, RExC_precomp + offset); \
b45f050a
JF
406 } STMT_END
407
408#define vWARN3(loc, m, a1, a2) \
409 STMT_START { \
7a799e20 410 IV offset = loc - RExC_precomp; \
9014280d 411 Perl_warner(aTHX_ packWARN(WARN_REGEXP), m REPORT_LOCATION, \
b45f050a 412 a1, a2, \
830247a4 413 (int)offset, RExC_precomp, RExC_precomp + offset); \
b45f050a
JF
414 } STMT_END
415
416#define vWARN4(loc, m, a1, a2, a3) \
417 STMT_START { \
7a799e20 418 IV offset = loc - RExC_precomp; \
9014280d 419 Perl_warner(aTHX_ packWARN(WARN_REGEXP), m REPORT_LOCATION,\
b45f050a 420 a1, a2, a3, \
830247a4 421 (int)offset, RExC_precomp, RExC_precomp + offset); \
8615cb43
JF
422 } STMT_END
423
9d1d55b5
JP
424/* used for the parse_flags section for (?c) -- japhy */
425#define vWARN5(loc, m, a1, a2, a3, a4) \
426 STMT_START { \
7a799e20 427 IV offset = loc - RExC_precomp; \
9014280d 428 Perl_warner(aTHX_ packWARN(WARN_REGEXP), m REPORT_LOCATION, \
9d1d55b5
JP
429 a1, a2, a3, a4, \
430 (int)offset, RExC_precomp, RExC_precomp + offset); \
431 } STMT_END
432
8615cb43 433
cd439c50 434/* Allow for side effects in s */
fd3f0ae2 435#define REGC(c,s) STMT_START { if (!SIZE_ONLY) *(s) = (c); else (void)(s);} STMT_END
cd439c50 436
fac92740
MJD
437/* Macros for recording node offsets. 20001227 mjd@plover.com
438 * Nodes are numbered 1, 2, 3, 4. Node #n's position is recorded in
439 * element 2*n-1 of the array. Element #2n holds the byte length node #n.
440 * Element 0 holds the number n.
441 */
442
443#define MJD_OFFSET_DEBUG(x)
444/* #define MJD_OFFSET_DEBUG(x) fprintf x */
445
446
447# define Set_Node_Offset_To_R(node,byte) \
448 STMT_START { \
449 if (! SIZE_ONLY) { \
450 if((node) < 0) { \
451 Perl_croak(aTHX_ "value of node is %d in Offset macro", node); \
452 } else { \
453 RExC_offsets[2*(node)-1] = (byte); \
454 } \
455 } \
456 } STMT_END
457
458# define Set_Node_Offset(node,byte) Set_Node_Offset_To_R((node)-RExC_emit_start, (byte)-RExC_start)
459# define Set_Cur_Node_Offset Set_Node_Offset(RExC_emit, RExC_parse)
460
461# define Set_Node_Length_To_R(node,len) \
462 STMT_START { \
463 if (! SIZE_ONLY) { \
464 MJD_OFFSET_DEBUG((stderr, "** (%d) size of node %d is %d.\n", __LINE__, (node), (len))); \
465 if((node) < 0) { \
466 Perl_croak(aTHX_ "value of node is %d in Length macro", node); \
467 } else { \
468 RExC_offsets[2*(node)] = (len); \
469 } \
470 } \
471 } STMT_END
472
473# define Set_Node_Length(node,len) Set_Node_Length_To_R((node)-RExC_emit_start, len)
474# define Set_Cur_Node_Length(len) Set_Node_Length(RExC_emit, len)
475# define Set_Node_Cur_Length(node) Set_Node_Length(node, RExC_parse - parse_start)
476
477/* Get offsets and lengths */
478#define Node_Offset(n) (RExC_offsets[2*((n)-RExC_emit_start)-1])
479#define Node_Length(n) (RExC_offsets[2*((n)-RExC_emit_start)])
480
acfe0abc 481static void clear_re(pTHX_ void *r);
4327152a 482
653099ff
GS
483/* Mark that we cannot extend a found fixed substring at this point.
484 Updata the longest found anchored substring and the longest found
485 floating substrings if needed. */
486
4327152a 487STATIC void
830247a4 488S_scan_commit(pTHX_ RExC_state_t *pRExC_state, scan_data_t *data)
c277df42 489{
a0ed51b3
LW
490 STRLEN l = CHR_SVLEN(data->last_found);
491 STRLEN old_l = CHR_SVLEN(*data->longest);
b81d288d 492
c277df42
IZ
493 if ((l >= old_l) && ((l > old_l) || (data->flags & SF_BEFORE_EOL))) {
494 sv_setsv(*data->longest, data->last_found);
495 if (*data->longest == data->longest_fixed) {
496 data->offset_fixed = l ? data->last_start_min : data->pos_min;
497 if (data->flags & SF_BEFORE_EOL)
b81d288d 498 data->flags
c277df42
IZ
499 |= ((data->flags & SF_BEFORE_EOL) << SF_FIX_SHIFT_EOL);
500 else
501 data->flags &= ~SF_FIX_BEFORE_EOL;
a0ed51b3
LW
502 }
503 else {
c277df42 504 data->offset_float_min = l ? data->last_start_min : data->pos_min;
b81d288d
AB
505 data->offset_float_max = (l
506 ? data->last_start_max
c277df42
IZ
507 : data->pos_min + data->pos_delta);
508 if (data->flags & SF_BEFORE_EOL)
b81d288d 509 data->flags
c277df42
IZ
510 |= ((data->flags & SF_BEFORE_EOL) << SF_FL_SHIFT_EOL);
511 else
512 data->flags &= ~SF_FL_BEFORE_EOL;
513 }
514 }
515 SvCUR_set(data->last_found, 0);
516 data->last_end = -1;
517 data->flags &= ~SF_BEFORE_EOL;
518}
519
653099ff
GS
520/* Can match anything (initialization) */
521STATIC void
830247a4 522S_cl_anything(pTHX_ RExC_state_t *pRExC_state, struct regnode_charclass_class *cl)
653099ff 523{
653099ff 524 ANYOF_CLASS_ZERO(cl);
f8bef550 525 ANYOF_BITMAP_SETALL(cl);
1aa99e6b 526 cl->flags = ANYOF_EOS|ANYOF_UNICODE_ALL;
653099ff
GS
527 if (LOC)
528 cl->flags |= ANYOF_LOCALE;
529}
530
531/* Can match anything (initialization) */
532STATIC int
533S_cl_is_anything(pTHX_ struct regnode_charclass_class *cl)
534{
535 int value;
536
aaa51d5e 537 for (value = 0; value <= ANYOF_MAX; value += 2)
653099ff
GS
538 if (ANYOF_CLASS_TEST(cl, value) && ANYOF_CLASS_TEST(cl, value + 1))
539 return 1;
1aa99e6b
IH
540 if (!(cl->flags & ANYOF_UNICODE_ALL))
541 return 0;
f8bef550
NC
542 if (!ANYOF_BITMAP_TESTALLSET(cl))
543 return 0;
653099ff
GS
544 return 1;
545}
546
547/* Can match anything (initialization) */
548STATIC void
830247a4 549S_cl_init(pTHX_ RExC_state_t *pRExC_state, struct regnode_charclass_class *cl)
653099ff 550{
8ecf7187 551 Zero(cl, 1, struct regnode_charclass_class);
653099ff 552 cl->type = ANYOF;
830247a4 553 cl_anything(pRExC_state, cl);
653099ff
GS
554}
555
556STATIC void
830247a4 557S_cl_init_zero(pTHX_ RExC_state_t *pRExC_state, struct regnode_charclass_class *cl)
653099ff 558{
8ecf7187 559 Zero(cl, 1, struct regnode_charclass_class);
653099ff 560 cl->type = ANYOF;
830247a4 561 cl_anything(pRExC_state, cl);
653099ff
GS
562 if (LOC)
563 cl->flags |= ANYOF_LOCALE;
564}
565
566/* 'And' a given class with another one. Can create false positives */
567/* We assume that cl is not inverted */
568STATIC void
569S_cl_and(pTHX_ struct regnode_charclass_class *cl,
570 struct regnode_charclass_class *and_with)
571{
653099ff
GS
572 if (!(and_with->flags & ANYOF_CLASS)
573 && !(cl->flags & ANYOF_CLASS)
574 && (and_with->flags & ANYOF_LOCALE) == (cl->flags & ANYOF_LOCALE)
575 && !(and_with->flags & ANYOF_FOLD)
576 && !(cl->flags & ANYOF_FOLD)) {
577 int i;
578
579 if (and_with->flags & ANYOF_INVERT)
580 for (i = 0; i < ANYOF_BITMAP_SIZE; i++)
581 cl->bitmap[i] &= ~and_with->bitmap[i];
582 else
583 for (i = 0; i < ANYOF_BITMAP_SIZE; i++)
584 cl->bitmap[i] &= and_with->bitmap[i];
585 } /* XXXX: logic is complicated otherwise, leave it along for a moment. */
586 if (!(and_with->flags & ANYOF_EOS))
587 cl->flags &= ~ANYOF_EOS;
1aa99e6b
IH
588
589 if (cl->flags & ANYOF_UNICODE_ALL && and_with->flags & ANYOF_UNICODE) {
590 cl->flags &= ~ANYOF_UNICODE_ALL;
591 cl->flags |= ANYOF_UNICODE;
592 ARG_SET(cl, ARG(and_with));
593 }
594 if (!(and_with->flags & ANYOF_UNICODE_ALL))
595 cl->flags &= ~ANYOF_UNICODE_ALL;
596 if (!(and_with->flags & (ANYOF_UNICODE|ANYOF_UNICODE_ALL)))
597 cl->flags &= ~ANYOF_UNICODE;
653099ff
GS
598}
599
600/* 'OR' a given class with another one. Can create false positives */
601/* We assume that cl is not inverted */
602STATIC void
830247a4 603S_cl_or(pTHX_ RExC_state_t *pRExC_state, struct regnode_charclass_class *cl, struct regnode_charclass_class *or_with)
653099ff 604{
653099ff
GS
605 if (or_with->flags & ANYOF_INVERT) {
606 /* We do not use
607 * (B1 | CL1) | (!B2 & !CL2) = (B1 | !B2 & !CL2) | (CL1 | (!B2 & !CL2))
608 * <= (B1 | !B2) | (CL1 | !CL2)
609 * which is wasteful if CL2 is small, but we ignore CL2:
610 * (B1 | CL1) | (!B2 & !CL2) <= (B1 | CL1) | !B2 = (B1 | !B2) | CL1
611 * XXXX Can we handle case-fold? Unclear:
612 * (OK1(i) | OK1(i')) | !(OK1(i) | OK1(i')) =
613 * (OK1(i) | OK1(i')) | (!OK1(i) & !OK1(i'))
614 */
615 if ( (or_with->flags & ANYOF_LOCALE) == (cl->flags & ANYOF_LOCALE)
616 && !(or_with->flags & ANYOF_FOLD)
617 && !(cl->flags & ANYOF_FOLD) ) {
618 int i;
619
620 for (i = 0; i < ANYOF_BITMAP_SIZE; i++)
621 cl->bitmap[i] |= ~or_with->bitmap[i];
622 } /* XXXX: logic is complicated otherwise */
623 else {
830247a4 624 cl_anything(pRExC_state, cl);
653099ff
GS
625 }
626 } else {
627 /* (B1 | CL1) | (B2 | CL2) = (B1 | B2) | (CL1 | CL2)) */
628 if ( (or_with->flags & ANYOF_LOCALE) == (cl->flags & ANYOF_LOCALE)
b81d288d 629 && (!(or_with->flags & ANYOF_FOLD)
653099ff
GS
630 || (cl->flags & ANYOF_FOLD)) ) {
631 int i;
632
633 /* OR char bitmap and class bitmap separately */
634 for (i = 0; i < ANYOF_BITMAP_SIZE; i++)
635 cl->bitmap[i] |= or_with->bitmap[i];
636 if (or_with->flags & ANYOF_CLASS) {
637 for (i = 0; i < ANYOF_CLASSBITMAP_SIZE; i++)
638 cl->classflags[i] |= or_with->classflags[i];
639 cl->flags |= ANYOF_CLASS;
640 }
641 }
642 else { /* XXXX: logic is complicated, leave it along for a moment. */
830247a4 643 cl_anything(pRExC_state, cl);
653099ff
GS
644 }
645 }
646 if (or_with->flags & ANYOF_EOS)
647 cl->flags |= ANYOF_EOS;
1aa99e6b
IH
648
649 if (cl->flags & ANYOF_UNICODE && or_with->flags & ANYOF_UNICODE &&
650 ARG(cl) != ARG(or_with)) {
651 cl->flags |= ANYOF_UNICODE_ALL;
652 cl->flags &= ~ANYOF_UNICODE;
653 }
654 if (or_with->flags & ANYOF_UNICODE_ALL) {
655 cl->flags |= ANYOF_UNICODE_ALL;
656 cl->flags &= ~ANYOF_UNICODE;
657 }
653099ff
GS
658}
659
5d1c421c
JH
660/*
661 * There are strange code-generation bugs caused on sparc64 by gcc-2.95.2.
662 * These need to be revisited when a newer toolchain becomes available.
663 */
664#if defined(__sparc64__) && defined(__GNUC__)
665# if __GNUC__ < 2 || (__GNUC__ == 2 && __GNUC_MINOR__ < 96)
666# undef SPARC64_GCC_WORKAROUND
667# define SPARC64_GCC_WORKAROUND 1
668# endif
669#endif
670
653099ff
GS
671/* REx optimizer. Converts nodes into quickier variants "in place".
672 Finds fixed substrings. */
673
c277df42
IZ
674/* Stops at toplevel WHILEM as well as at `last'. At end *scanp is set
675 to the position after last scanned or to NULL. */
676
76e3520e 677STATIC I32
830247a4 678S_study_chunk(pTHX_ RExC_state_t *pRExC_state, regnode **scanp, I32 *deltap, regnode *last, scan_data_t *data, U32 flags)
c277df42
IZ
679 /* scanp: Start here (read-write). */
680 /* deltap: Write maxlen-minlen here. */
681 /* last: Stop before this one. */
682{
683 I32 min = 0, pars = 0, code;
684 regnode *scan = *scanp, *next;
685 I32 delta = 0;
686 int is_inf = (flags & SCF_DO_SUBSTR) && (data->flags & SF_IS_INF);
aca2d497 687 int is_inf_internal = 0; /* The studied chunk is infinite */
c277df42
IZ
688 I32 is_par = OP(scan) == OPEN ? ARG(scan) : 0;
689 scan_data_t data_fake;
653099ff 690 struct regnode_charclass_class and_with; /* Valid if flags & SCF_DO_STCLASS_OR */
b81d288d 691
c277df42
IZ
692 while (scan && OP(scan) != END && scan < last) {
693 /* Peephole optimizer: */
694
22c35a8c 695 if (PL_regkind[(U8)OP(scan)] == EXACT) {
653099ff 696 /* Merge several consecutive EXACTish nodes into one. */
c277df42
IZ
697 regnode *n = regnext(scan);
698 U32 stringok = 1;
699#ifdef DEBUGGING
700 regnode *stop = scan;
b81d288d 701#endif
c277df42 702
cd439c50 703 next = scan + NODE_SZ_STR(scan);
c277df42
IZ
704 /* Skip NOTHING, merge EXACT*. */
705 while (n &&
b81d288d 706 ( PL_regkind[(U8)OP(n)] == NOTHING ||
c277df42
IZ
707 (stringok && (OP(n) == OP(scan))))
708 && NEXT_OFF(n)
709 && NEXT_OFF(scan) + NEXT_OFF(n) < I16_MAX) {
710 if (OP(n) == TAIL || n > next)
711 stringok = 0;
22c35a8c 712 if (PL_regkind[(U8)OP(n)] == NOTHING) {
c277df42
IZ
713 NEXT_OFF(scan) += NEXT_OFF(n);
714 next = n + NODE_STEP_REGNODE;
715#ifdef DEBUGGING
716 if (stringok)
717 stop = n;
b81d288d 718#endif
c277df42 719 n = regnext(n);
a0ed51b3 720 }
f49d4d0f 721 else if (stringok) {
cd439c50 722 int oldl = STR_LEN(scan);
c277df42 723 regnode *nnext = regnext(n);
f49d4d0f 724
b81d288d 725 if (oldl + STR_LEN(n) > U8_MAX)
c277df42
IZ
726 break;
727 NEXT_OFF(scan) += NEXT_OFF(n);
cd439c50
IZ
728 STR_LEN(scan) += STR_LEN(n);
729 next = n + NODE_SZ_STR(n);
c277df42 730 /* Now we can overwrite *n : */
f49d4d0f 731 Move(STRING(n), STRING(scan) + oldl, STR_LEN(n), char);
c277df42 732#ifdef DEBUGGING
f49d4d0f 733 stop = next - 1;
b81d288d 734#endif
c277df42
IZ
735 n = nnext;
736 }
737 }
61a36c01 738
d65e4eab 739 if (UTF && OP(scan) == EXACTF && STR_LEN(scan) >= 6) {
61a36c01
JH
740/*
741 Two problematic code points in Unicode casefolding of EXACT nodes:
742
743 U+0390 - GREEK SMALL LETTER IOTA WITH DIALYTIKA AND TONOS
744 U+03B0 - GREEK SMALL LETTER UPSILON WITH DIALYTIKA AND TONOS
745
746 which casefold to
747
748 Unicode UTF-8
749
750 U+03B9 U+0308 U+0301 0xCE 0xB9 0xCC 0x88 0xCC 0x81
751 U+03C5 U+0308 U+0301 0xCF 0x85 0xCC 0x88 0xCC 0x81
752
753 This means that in case-insensitive matching (or "loose matching",
754 as Unicode calls it), an EXACTF of length six (the UTF-8 encoded byte
755 length of the above casefolded versions) can match a target string
756 of length two (the byte length of UTF-8 encoded U+0390 or U+03B0).
757 This would rather mess up the minimum length computation.
758
759 What we'll do is to look for the tail four bytes, and then peek
760 at the preceding two bytes to see whether we need to decrease
761 the minimum length by four (six minus two).
762
763 Thanks to the design of UTF-8, there cannot be false matches:
764 A sequence of valid UTF-8 bytes cannot be a subsequence of
765 another valid sequence of UTF-8 bytes.
766
767*/
768 char *s0 = STRING(scan), *s, *t;
769 char *s1 = s0 + STR_LEN(scan) - 1, *s2 = s1 - 4;
770 char *t0 = "\xcc\x88\xcc\x81";
771 char *t1 = t0 + 3;
772
773 for (s = s0 + 2;
774 s < s2 && (t = ninstr(s, s1, t0, t1));
775 s = t + 4) {
776 if (((U8)t[-1] == 0xB9 && (U8)t[-2] == 0xCE) ||
777 ((U8)t[-1] == 0x85 && (U8)t[-2] == 0xCF))
778 min -= 4;
779 }
780 }
781
c277df42
IZ
782#ifdef DEBUGGING
783 /* Allow dumping */
cd439c50 784 n = scan + NODE_SZ_STR(scan);
c277df42 785 while (n <= stop) {
22c35a8c 786 if (PL_regkind[(U8)OP(n)] != NOTHING || OP(n) == NOTHING) {
c277df42
IZ
787 OP(n) = OPTIMIZED;
788 NEXT_OFF(n) = 0;
789 }
790 n++;
791 }
653099ff 792#endif
c277df42 793 }
653099ff
GS
794 /* Follow the next-chain of the current node and optimize
795 away all the NOTHINGs from it. */
c277df42 796 if (OP(scan) != CURLYX) {
048cfca1
GS
797 int max = (reg_off_by_arg[OP(scan)]
798 ? I32_MAX
799 /* I32 may be smaller than U16 on CRAYs! */
800 : (I32_MAX < U16_MAX ? I32_MAX : U16_MAX));
c277df42
IZ
801 int off = (reg_off_by_arg[OP(scan)] ? ARG(scan) : NEXT_OFF(scan));
802 int noff;
803 regnode *n = scan;
b81d288d 804
c277df42
IZ
805 /* Skip NOTHING and LONGJMP. */
806 while ((n = regnext(n))
22c35a8c 807 && ((PL_regkind[(U8)OP(n)] == NOTHING && (noff = NEXT_OFF(n)))
c277df42
IZ
808 || ((OP(n) == LONGJMP) && (noff = ARG(n))))
809 && off + noff < max)
810 off += noff;
811 if (reg_off_by_arg[OP(scan)])
812 ARG(scan) = off;
b81d288d 813 else
c277df42
IZ
814 NEXT_OFF(scan) = off;
815 }
653099ff
GS
816 /* The principal pseudo-switch. Cannot be a switch, since we
817 look into several different things. */
b81d288d 818 if (OP(scan) == BRANCH || OP(scan) == BRANCHJ
c277df42
IZ
819 || OP(scan) == IFTHEN || OP(scan) == SUSPEND) {
820 next = regnext(scan);
821 code = OP(scan);
b81d288d
AB
822
823 if (OP(next) == code || code == IFTHEN || code == SUSPEND) {
c277df42 824 I32 max1 = 0, min1 = I32_MAX, num = 0;
653099ff 825 struct regnode_charclass_class accum;
c277df42 826
653099ff 827 if (flags & SCF_DO_SUBSTR) /* XXXX Add !SUSPEND? */
830247a4 828 scan_commit(pRExC_state, data); /* Cannot merge strings after this. */
653099ff 829 if (flags & SCF_DO_STCLASS)
830247a4 830 cl_init_zero(pRExC_state, &accum);
c277df42 831 while (OP(scan) == code) {
830247a4 832 I32 deltanext, minnext, f = 0, fake;
653099ff 833 struct regnode_charclass_class this_class;
c277df42
IZ
834
835 num++;
836 data_fake.flags = 0;
b81d288d 837 if (data) {
2c2d71f5 838 data_fake.whilem_c = data->whilem_c;
cb434fcc
IZ
839 data_fake.last_closep = data->last_closep;
840 }
841 else
842 data_fake.last_closep = &fake;
c277df42
IZ
843 next = regnext(scan);
844 scan = NEXTOPER(scan);
845 if (code != BRANCH)
846 scan = NEXTOPER(scan);
653099ff 847 if (flags & SCF_DO_STCLASS) {
830247a4 848 cl_init(pRExC_state, &this_class);
653099ff
GS
849 data_fake.start_class = &this_class;
850 f = SCF_DO_STCLASS_AND;
b81d288d 851 }
e1901655
IZ
852 if (flags & SCF_WHILEM_VISITED_POS)
853 f |= SCF_WHILEM_VISITED_POS;
653099ff 854 /* we suppose the run is continuous, last=next...*/
830247a4
IZ
855 minnext = study_chunk(pRExC_state, &scan, &deltanext,
856 next, &data_fake, f);
b81d288d 857 if (min1 > minnext)
c277df42
IZ
858 min1 = minnext;
859 if (max1 < minnext + deltanext)
860 max1 = minnext + deltanext;
861 if (deltanext == I32_MAX)
aca2d497 862 is_inf = is_inf_internal = 1;
c277df42
IZ
863 scan = next;
864 if (data_fake.flags & (SF_HAS_PAR|SF_IN_PAR))
865 pars++;
405ff068 866 if (data && (data_fake.flags & SF_HAS_EVAL))
c277df42 867 data->flags |= SF_HAS_EVAL;
2c2d71f5
JH
868 if (data)
869 data->whilem_c = data_fake.whilem_c;
653099ff 870 if (flags & SCF_DO_STCLASS)
830247a4 871 cl_or(pRExC_state, &accum, &this_class);
b81d288d 872 if (code == SUSPEND)
c277df42
IZ
873 break;
874 }
875 if (code == IFTHEN && num < 2) /* Empty ELSE branch */
876 min1 = 0;
877 if (flags & SCF_DO_SUBSTR) {
878 data->pos_min += min1;
879 data->pos_delta += max1 - min1;
880 if (max1 != min1 || is_inf)
881 data->longest = &(data->longest_float);
882 }
883 min += min1;
884 delta += max1 - min1;
653099ff 885 if (flags & SCF_DO_STCLASS_OR) {
830247a4 886 cl_or(pRExC_state, data->start_class, &accum);
653099ff
GS
887 if (min1) {
888 cl_and(data->start_class, &and_with);
889 flags &= ~SCF_DO_STCLASS;
890 }
891 }
892 else if (flags & SCF_DO_STCLASS_AND) {
de0c8cb8
GS
893 if (min1) {
894 cl_and(data->start_class, &accum);
653099ff 895 flags &= ~SCF_DO_STCLASS;
de0c8cb8
GS
896 }
897 else {
b81d288d 898 /* Switch to OR mode: cache the old value of
de0c8cb8
GS
899 * data->start_class */
900 StructCopy(data->start_class, &and_with,
901 struct regnode_charclass_class);
902 flags &= ~SCF_DO_STCLASS_AND;
903 StructCopy(&accum, data->start_class,
904 struct regnode_charclass_class);
905 flags |= SCF_DO_STCLASS_OR;
906 data->start_class->flags |= ANYOF_EOS;
907 }
653099ff 908 }
a0ed51b3
LW
909 }
910 else if (code == BRANCHJ) /* single branch is optimized. */
c277df42
IZ
911 scan = NEXTOPER(NEXTOPER(scan));
912 else /* single branch is optimized. */
913 scan = NEXTOPER(scan);
914 continue;
a0ed51b3
LW
915 }
916 else if (OP(scan) == EXACT) {
cd439c50 917 I32 l = STR_LEN(scan);
1aa99e6b 918 UV uc = *((U8*)STRING(scan));
a0ed51b3 919 if (UTF) {
1aa99e6b
IH
920 U8 *s = (U8*)STRING(scan);
921 l = utf8_length(s, s + l);
9041c2e3 922 uc = utf8_to_uvchr(s, NULL);
a0ed51b3
LW
923 }
924 min += l;
c277df42 925 if (flags & SCF_DO_SUBSTR) { /* Update longest substr. */
c277df42
IZ
926 /* The code below prefers earlier match for fixed
927 offset, later match for variable offset. */
928 if (data->last_end == -1) { /* Update the start info. */
929 data->last_start_min = data->pos_min;
930 data->last_start_max = is_inf
b81d288d 931 ? I32_MAX : data->pos_min + data->pos_delta;
c277df42 932 }
cd439c50 933 sv_catpvn(data->last_found, STRING(scan), STR_LEN(scan));
33b8afdf
JH
934 if (UTF)
935 SvUTF8_on(data->last_found);
c277df42
IZ
936 data->last_end = data->pos_min + l;
937 data->pos_min += l; /* As in the first entry. */
938 data->flags &= ~SF_BEFORE_EOL;
939 }
653099ff
GS
940 if (flags & SCF_DO_STCLASS_AND) {
941 /* Check whether it is compatible with what we know already! */
942 int compat = 1;
943
1aa99e6b 944 if (uc >= 0x100 ||
516a5887 945 (!(data->start_class->flags & (ANYOF_CLASS | ANYOF_LOCALE))
1aa99e6b 946 && !ANYOF_BITMAP_TEST(data->start_class, uc)
653099ff 947 && (!(data->start_class->flags & ANYOF_FOLD)
1aa99e6b 948 || !ANYOF_BITMAP_TEST(data->start_class, PL_fold[uc])))
516a5887 949 )
653099ff
GS
950 compat = 0;
951 ANYOF_CLASS_ZERO(data->start_class);
952 ANYOF_BITMAP_ZERO(data->start_class);
953 if (compat)
1aa99e6b 954 ANYOF_BITMAP_SET(data->start_class, uc);
653099ff 955 data->start_class->flags &= ~ANYOF_EOS;
9b877dbb
IH
956 if (uc < 0x100)
957 data->start_class->flags &= ~ANYOF_UNICODE_ALL;
653099ff
GS
958 }
959 else if (flags & SCF_DO_STCLASS_OR) {
960 /* false positive possible if the class is case-folded */
1aa99e6b 961 if (uc < 0x100)
9b877dbb
IH
962 ANYOF_BITMAP_SET(data->start_class, uc);
963 else
964 data->start_class->flags |= ANYOF_UNICODE_ALL;
653099ff
GS
965 data->start_class->flags &= ~ANYOF_EOS;
966 cl_and(data->start_class, &and_with);
967 }
968 flags &= ~SCF_DO_STCLASS;
a0ed51b3 969 }
653099ff 970 else if (PL_regkind[(U8)OP(scan)] == EXACT) { /* But OP != EXACT! */
cd439c50 971 I32 l = STR_LEN(scan);
1aa99e6b 972 UV uc = *((U8*)STRING(scan));
653099ff
GS
973
974 /* Search for fixed substrings supports EXACT only. */
b81d288d 975 if (flags & SCF_DO_SUBSTR)
830247a4 976 scan_commit(pRExC_state, data);
a0ed51b3 977 if (UTF) {
1aa99e6b
IH
978 U8 *s = (U8 *)STRING(scan);
979 l = utf8_length(s, s + l);
9041c2e3 980 uc = utf8_to_uvchr(s, NULL);
a0ed51b3
LW
981 }
982 min += l;
c277df42 983 if (data && (flags & SCF_DO_SUBSTR))
a0ed51b3 984 data->pos_min += l;
653099ff
GS
985 if (flags & SCF_DO_STCLASS_AND) {
986 /* Check whether it is compatible with what we know already! */
987 int compat = 1;
988
1aa99e6b 989 if (uc >= 0x100 ||
516a5887 990 (!(data->start_class->flags & (ANYOF_CLASS | ANYOF_LOCALE))
1aa99e6b 991 && !ANYOF_BITMAP_TEST(data->start_class, uc)
516a5887 992 && !ANYOF_BITMAP_TEST(data->start_class, PL_fold[uc])))
653099ff
GS
993 compat = 0;
994 ANYOF_CLASS_ZERO(data->start_class);
995 ANYOF_BITMAP_ZERO(data->start_class);
996 if (compat) {
1aa99e6b 997 ANYOF_BITMAP_SET(data->start_class, uc);
653099ff
GS
998 data->start_class->flags &= ~ANYOF_EOS;
999 data->start_class->flags |= ANYOF_FOLD;
1000 if (OP(scan) == EXACTFL)
1001 data->start_class->flags |= ANYOF_LOCALE;
1002 }
1003 }
1004 else if (flags & SCF_DO_STCLASS_OR) {
1005 if (data->start_class->flags & ANYOF_FOLD) {
1006 /* false positive possible if the class is case-folded.
1007 Assume that the locale settings are the same... */
1aa99e6b
IH
1008 if (uc < 0x100)
1009 ANYOF_BITMAP_SET(data->start_class, uc);
653099ff
GS
1010 data->start_class->flags &= ~ANYOF_EOS;
1011 }
1012 cl_and(data->start_class, &and_with);
1013 }
1014 flags &= ~SCF_DO_STCLASS;
a0ed51b3 1015 }
4d61ec05 1016 else if (strchr((char*)PL_varies,OP(scan))) {
9c5ffd7c 1017 I32 mincount, maxcount, minnext, deltanext, fl = 0;
aa7a4b56 1018 I32 f = flags, pos_before = 0;
c277df42 1019 regnode *oscan = scan;
653099ff
GS
1020 struct regnode_charclass_class this_class;
1021 struct regnode_charclass_class *oclass = NULL;
727f22e3 1022 I32 next_is_eval = 0;
653099ff 1023
22c35a8c 1024 switch (PL_regkind[(U8)OP(scan)]) {
653099ff 1025 case WHILEM: /* End of (?:...)* . */
c277df42
IZ
1026 scan = NEXTOPER(scan);
1027 goto finish;
1028 case PLUS:
653099ff 1029 if (flags & (SCF_DO_SUBSTR | SCF_DO_STCLASS)) {
c277df42 1030 next = NEXTOPER(scan);
653099ff 1031 if (OP(next) == EXACT || (flags & SCF_DO_STCLASS)) {
b81d288d
AB
1032 mincount = 1;
1033 maxcount = REG_INFTY;
c277df42
IZ
1034 next = regnext(scan);
1035 scan = NEXTOPER(scan);
1036 goto do_curly;
1037 }
1038 }
1039 if (flags & SCF_DO_SUBSTR)
1040 data->pos_min++;
1041 min++;
1042 /* Fall through. */
1043 case STAR:
653099ff
GS
1044 if (flags & SCF_DO_STCLASS) {
1045 mincount = 0;
b81d288d 1046 maxcount = REG_INFTY;
653099ff
GS
1047 next = regnext(scan);
1048 scan = NEXTOPER(scan);
1049 goto do_curly;
1050 }
b81d288d 1051 is_inf = is_inf_internal = 1;
c277df42
IZ
1052 scan = regnext(scan);
1053 if (flags & SCF_DO_SUBSTR) {
830247a4 1054 scan_commit(pRExC_state, data); /* Cannot extend fixed substrings */
c277df42
IZ
1055 data->longest = &(data->longest_float);
1056 }
1057 goto optimize_curly_tail;
1058 case CURLY:
b81d288d 1059 mincount = ARG1(scan);
c277df42
IZ
1060 maxcount = ARG2(scan);
1061 next = regnext(scan);
cb434fcc
IZ
1062 if (OP(scan) == CURLYX) {
1063 I32 lp = (data ? *(data->last_closep) : 0);
1064
1065 scan->flags = ((lp <= U8_MAX) ? lp : U8_MAX);
1066 }
c277df42 1067 scan = NEXTOPER(scan) + EXTRA_STEP_2ARGS;
727f22e3 1068 next_is_eval = (OP(scan) == EVAL);
c277df42
IZ
1069 do_curly:
1070 if (flags & SCF_DO_SUBSTR) {
830247a4 1071 if (mincount == 0) scan_commit(pRExC_state,data); /* Cannot extend fixed substrings */
c277df42
IZ
1072 pos_before = data->pos_min;
1073 }
1074 if (data) {
1075 fl = data->flags;
1076 data->flags &= ~(SF_HAS_PAR|SF_IN_PAR|SF_HAS_EVAL);
1077 if (is_inf)
1078 data->flags |= SF_IS_INF;
1079 }
653099ff 1080 if (flags & SCF_DO_STCLASS) {
830247a4 1081 cl_init(pRExC_state, &this_class);
653099ff
GS
1082 oclass = data->start_class;
1083 data->start_class = &this_class;
1084 f |= SCF_DO_STCLASS_AND;
1085 f &= ~SCF_DO_STCLASS_OR;
1086 }
e1901655
IZ
1087 /* These are the cases when once a subexpression
1088 fails at a particular position, it cannot succeed
1089 even after backtracking at the enclosing scope.
b81d288d 1090
e1901655
IZ
1091 XXXX what if minimal match and we are at the
1092 initial run of {n,m}? */
1093 if ((mincount != maxcount - 1) && (maxcount != REG_INFTY))
1094 f &= ~SCF_WHILEM_VISITED_POS;
653099ff 1095
c277df42 1096 /* This will finish on WHILEM, setting scan, or on NULL: */
b81d288d
AB
1097 minnext = study_chunk(pRExC_state, &scan, &deltanext, last, data,
1098 mincount == 0
653099ff
GS
1099 ? (f & ~SCF_DO_SUBSTR) : f);
1100
1101 if (flags & SCF_DO_STCLASS)
1102 data->start_class = oclass;
1103 if (mincount == 0 || minnext == 0) {
1104 if (flags & SCF_DO_STCLASS_OR) {
830247a4 1105 cl_or(pRExC_state, data->start_class, &this_class);
653099ff
GS
1106 }
1107 else if (flags & SCF_DO_STCLASS_AND) {
b81d288d 1108 /* Switch to OR mode: cache the old value of
653099ff
GS
1109 * data->start_class */
1110 StructCopy(data->start_class, &and_with,
1111 struct regnode_charclass_class);
1112 flags &= ~SCF_DO_STCLASS_AND;
1113 StructCopy(&this_class, data->start_class,
1114 struct regnode_charclass_class);
1115 flags |= SCF_DO_STCLASS_OR;
1116 data->start_class->flags |= ANYOF_EOS;
1117 }
1118 } else { /* Non-zero len */
1119 if (flags & SCF_DO_STCLASS_OR) {
830247a4 1120 cl_or(pRExC_state, data->start_class, &this_class);
653099ff
GS
1121 cl_and(data->start_class, &and_with);
1122 }
1123 else if (flags & SCF_DO_STCLASS_AND)
1124 cl_and(data->start_class, &this_class);
1125 flags &= ~SCF_DO_STCLASS;
1126 }
c277df42
IZ
1127 if (!scan) /* It was not CURLYX, but CURLY. */
1128 scan = next;
84037bb0 1129 if (ckWARN(WARN_REGEXP)
727f22e3
JP
1130 /* ? quantifier ok, except for (?{ ... }) */
1131 && (next_is_eval || !(mincount == 0 && maxcount == 1))
84037bb0 1132 && (minnext == 0) && (deltanext == 0)
99799961 1133 && data && !(data->flags & (SF_HAS_PAR|SF_IN_PAR))
17feb5d5 1134 && maxcount <= REG_INFTY/3) /* Complement check for big count */
b45f050a 1135 {
830247a4 1136 vWARN(RExC_parse,
b45f050a
JF
1137 "Quantifier unexpected on zero-length expression");
1138 }
1139
c277df42 1140 min += minnext * mincount;
b81d288d 1141 is_inf_internal |= ((maxcount == REG_INFTY
155aba94
GS
1142 && (minnext + deltanext) > 0)
1143 || deltanext == I32_MAX);
aca2d497 1144 is_inf |= is_inf_internal;
c277df42
IZ
1145 delta += (minnext + deltanext) * maxcount - minnext * mincount;
1146
1147 /* Try powerful optimization CURLYX => CURLYN. */
b81d288d 1148 if ( OP(oscan) == CURLYX && data
c277df42
IZ
1149 && data->flags & SF_IN_PAR
1150 && !(data->flags & SF_HAS_EVAL)
1151 && !deltanext && minnext == 1 ) {
1152 /* Try to optimize to CURLYN. */
1153 regnode *nxt = NEXTOPER(oscan) + EXTRA_STEP_2ARGS;
497b47a8
JH
1154 regnode *nxt1 = nxt;
1155#ifdef DEBUGGING
1156 regnode *nxt2;
1157#endif
c277df42
IZ
1158
1159 /* Skip open. */
1160 nxt = regnext(nxt);
4d61ec05 1161 if (!strchr((char*)PL_simple,OP(nxt))
22c35a8c 1162 && !(PL_regkind[(U8)OP(nxt)] == EXACT
b81d288d 1163 && STR_LEN(nxt) == 1))
c277df42 1164 goto nogo;
497b47a8 1165#ifdef DEBUGGING
c277df42 1166 nxt2 = nxt;
497b47a8 1167#endif
c277df42 1168 nxt = regnext(nxt);
b81d288d 1169 if (OP(nxt) != CLOSE)
c277df42
IZ
1170 goto nogo;
1171 /* Now we know that nxt2 is the only contents: */
1172 oscan->flags = ARG(nxt);
1173 OP(oscan) = CURLYN;
1174 OP(nxt1) = NOTHING; /* was OPEN. */
1175#ifdef DEBUGGING
1176 OP(nxt1 + 1) = OPTIMIZED; /* was count. */
1177 NEXT_OFF(nxt1+ 1) = 0; /* just for consistancy. */
1178 NEXT_OFF(nxt2) = 0; /* just for consistancy with CURLY. */
1179 OP(nxt) = OPTIMIZED; /* was CLOSE. */
1180 OP(nxt + 1) = OPTIMIZED; /* was count. */
1181 NEXT_OFF(nxt+ 1) = 0; /* just for consistancy. */
b81d288d 1182#endif
c277df42 1183 }
c277df42
IZ
1184 nogo:
1185
1186 /* Try optimization CURLYX => CURLYM. */
b81d288d 1187 if ( OP(oscan) == CURLYX && data
c277df42 1188 && !(data->flags & SF_HAS_PAR)
c277df42
IZ
1189 && !(data->flags & SF_HAS_EVAL)
1190 && !deltanext ) {
1191 /* XXXX How to optimize if data == 0? */
1192 /* Optimize to a simpler form. */
1193 regnode *nxt = NEXTOPER(oscan) + EXTRA_STEP_2ARGS; /* OPEN */
1194 regnode *nxt2;
1195
1196 OP(oscan) = CURLYM;
1197 while ( (nxt2 = regnext(nxt)) /* skip over embedded stuff*/
b81d288d 1198 && (OP(nxt2) != WHILEM))
c277df42
IZ
1199 nxt = nxt2;
1200 OP(nxt2) = SUCCEED; /* Whas WHILEM */
c277df42
IZ
1201 /* Need to optimize away parenths. */
1202 if (data->flags & SF_IN_PAR) {
1203 /* Set the parenth number. */
1204 regnode *nxt1 = NEXTOPER(oscan) + EXTRA_STEP_2ARGS; /* OPEN*/
1205
b81d288d 1206 if (OP(nxt) != CLOSE)
b45f050a 1207 FAIL("Panic opt close");
c277df42
IZ
1208 oscan->flags = ARG(nxt);
1209 OP(nxt1) = OPTIMIZED; /* was OPEN. */
1210 OP(nxt) = OPTIMIZED; /* was CLOSE. */
1211#ifdef DEBUGGING
1212 OP(nxt1 + 1) = OPTIMIZED; /* was count. */
1213 OP(nxt + 1) = OPTIMIZED; /* was count. */
1214 NEXT_OFF(nxt1 + 1) = 0; /* just for consistancy. */
1215 NEXT_OFF(nxt + 1) = 0; /* just for consistancy. */
b81d288d 1216#endif
c277df42
IZ
1217#if 0
1218 while ( nxt1 && (OP(nxt1) != WHILEM)) {
1219 regnode *nnxt = regnext(nxt1);
b81d288d 1220
c277df42
IZ
1221 if (nnxt == nxt) {
1222 if (reg_off_by_arg[OP(nxt1)])
1223 ARG_SET(nxt1, nxt2 - nxt1);
1224 else if (nxt2 - nxt1 < U16_MAX)
1225 NEXT_OFF(nxt1) = nxt2 - nxt1;
1226 else
1227 OP(nxt) = NOTHING; /* Cannot beautify */
1228 }
1229 nxt1 = nnxt;
1230 }
1231#endif
1232 /* Optimize again: */
b81d288d 1233 study_chunk(pRExC_state, &nxt1, &deltanext, nxt,
e1901655 1234 NULL, 0);
a0ed51b3
LW
1235 }
1236 else
c277df42 1237 oscan->flags = 0;
c277df42 1238 }
e1901655
IZ
1239 else if ((OP(oscan) == CURLYX)
1240 && (flags & SCF_WHILEM_VISITED_POS)
1241 /* See the comment on a similar expression above.
1242 However, this time it not a subexpression
1243 we care about, but the expression itself. */
1244 && (maxcount == REG_INFTY)
1245 && data && ++data->whilem_c < 16) {
1246 /* This stays as CURLYX, we can put the count/of pair. */
2c2d71f5
JH
1247 /* Find WHILEM (as in regexec.c) */
1248 regnode *nxt = oscan + NEXT_OFF(oscan);
1249
1250 if (OP(PREVOPER(nxt)) == NOTHING) /* LONGJMP */
1251 nxt += ARG(nxt);
1252 PREVOPER(nxt)->flags = data->whilem_c
830247a4 1253 | (RExC_whilem_seen << 4); /* On WHILEM */
2c2d71f5 1254 }
b81d288d 1255 if (data && fl & (SF_HAS_PAR|SF_IN_PAR))
c277df42
IZ
1256 pars++;
1257 if (flags & SCF_DO_SUBSTR) {
1258 SV *last_str = Nullsv;
1259 int counted = mincount != 0;
1260
1261 if (data->last_end > 0 && mincount != 0) { /* Ends with a string. */
5d1c421c
JH
1262#if defined(SPARC64_GCC_WORKAROUND)
1263 I32 b = 0;
1264 STRLEN l = 0;
1265 char *s = NULL;
1266 I32 old = 0;
1267
1268 if (pos_before >= data->last_start_min)
1269 b = pos_before;
1270 else
1271 b = data->last_start_min;
1272
1273 l = 0;
1274 s = SvPV(data->last_found, l);
1275 old = b - data->last_start_min;
1276
1277#else
b81d288d 1278 I32 b = pos_before >= data->last_start_min
c277df42
IZ
1279 ? pos_before : data->last_start_min;
1280 STRLEN l;
1281 char *s = SvPV(data->last_found, l);
a0ed51b3 1282 I32 old = b - data->last_start_min;
5d1c421c 1283#endif
a0ed51b3
LW
1284
1285 if (UTF)
1286 old = utf8_hop((U8*)s, old) - (U8*)s;
c277df42 1287
a0ed51b3 1288 l -= old;
c277df42 1289 /* Get the added string: */
79cb57f6 1290 last_str = newSVpvn(s + old, l);
c277df42
IZ
1291 if (deltanext == 0 && pos_before == b) {
1292 /* What was added is a constant string */
1293 if (mincount > 1) {
1294 SvGROW(last_str, (mincount * l) + 1);
b81d288d 1295 repeatcpy(SvPVX(last_str) + l,
c277df42
IZ
1296 SvPVX(last_str), l, mincount - 1);
1297 SvCUR(last_str) *= mincount;
1298 /* Add additional parts. */
b81d288d 1299 SvCUR_set(data->last_found,
c277df42
IZ
1300 SvCUR(data->last_found) - l);
1301 sv_catsv(data->last_found, last_str);
1302 data->last_end += l * (mincount - 1);
1303 }
2a8d9689
HS
1304 } else {
1305 /* start offset must point into the last copy */
1306 data->last_start_min += minnext * (mincount - 1);
4b2cff9a
HS
1307 data->last_start_max += is_inf ? 0 : (maxcount - 1)
1308 * (minnext + data->pos_delta);
c277df42
IZ
1309 }
1310 }
1311 /* It is counted once already... */
1312 data->pos_min += minnext * (mincount - counted);
1313 data->pos_delta += - counted * deltanext +
1314 (minnext + deltanext) * maxcount - minnext * mincount;
1315 if (mincount != maxcount) {
653099ff
GS
1316 /* Cannot extend fixed substrings found inside
1317 the group. */
830247a4 1318 scan_commit(pRExC_state,data);
c277df42
IZ
1319 if (mincount && last_str) {
1320 sv_setsv(data->last_found, last_str);
1321 data->last_end = data->pos_min;
b81d288d 1322 data->last_start_min =
a0ed51b3 1323 data->pos_min - CHR_SVLEN(last_str);
b81d288d
AB
1324 data->last_start_max = is_inf
1325 ? I32_MAX
c277df42 1326 : data->pos_min + data->pos_delta
a0ed51b3 1327 - CHR_SVLEN(last_str);
c277df42
IZ
1328 }
1329 data->longest = &(data->longest_float);
1330 }
aca2d497 1331 SvREFCNT_dec(last_str);
c277df42 1332 }
405ff068 1333 if (data && (fl & SF_HAS_EVAL))
c277df42
IZ
1334 data->flags |= SF_HAS_EVAL;
1335 optimize_curly_tail:
c277df42 1336 if (OP(oscan) != CURLYX) {
22c35a8c 1337 while (PL_regkind[(U8)OP(next = regnext(oscan))] == NOTHING
c277df42
IZ
1338 && NEXT_OFF(next))
1339 NEXT_OFF(oscan) += NEXT_OFF(next);
1340 }
c277df42 1341 continue;
653099ff 1342 default: /* REF and CLUMP only? */
c277df42 1343 if (flags & SCF_DO_SUBSTR) {
830247a4 1344 scan_commit(pRExC_state,data); /* Cannot expect anything... */
c277df42
IZ
1345 data->longest = &(data->longest_float);
1346 }
aca2d497 1347 is_inf = is_inf_internal = 1;
653099ff 1348 if (flags & SCF_DO_STCLASS_OR)
830247a4 1349 cl_anything(pRExC_state, data->start_class);
653099ff 1350 flags &= ~SCF_DO_STCLASS;
c277df42
IZ
1351 break;
1352 }
a0ed51b3 1353 }
ffc61ed2 1354 else if (strchr((char*)PL_simple,OP(scan))) {
9c5ffd7c 1355 int value = 0;
653099ff 1356
c277df42 1357 if (flags & SCF_DO_SUBSTR) {
830247a4 1358 scan_commit(pRExC_state,data);
c277df42
IZ
1359 data->pos_min++;
1360 }
1361 min++;
653099ff
GS
1362 if (flags & SCF_DO_STCLASS) {
1363 data->start_class->flags &= ~ANYOF_EOS; /* No match on empty */
1364
1365 /* Some of the logic below assumes that switching
1366 locale on will only add false positives. */
1367 switch (PL_regkind[(U8)OP(scan)]) {
653099ff 1368 case SANY:
653099ff
GS
1369 default:
1370 do_default:
1371 /* Perl_croak(aTHX_ "panic: unexpected simple REx opcode %d", OP(scan)); */
1372 if (flags & SCF_DO_STCLASS_OR) /* Allow everything */
830247a4 1373 cl_anything(pRExC_state, data->start_class);
653099ff
GS
1374 break;
1375 case REG_ANY:
1376 if (OP(scan) == SANY)
1377 goto do_default;
1378 if (flags & SCF_DO_STCLASS_OR) { /* Everything but \n */
1379 value = (ANYOF_BITMAP_TEST(data->start_class,'\n')
1380 || (data->start_class->flags & ANYOF_CLASS));
830247a4 1381 cl_anything(pRExC_state, data->start_class);
653099ff
GS
1382 }
1383 if (flags & SCF_DO_STCLASS_AND || !value)
1384 ANYOF_BITMAP_CLEAR(data->start_class,'\n');
1385 break;
1386 case ANYOF:
1387 if (flags & SCF_DO_STCLASS_AND)
1388 cl_and(data->start_class,
1389 (struct regnode_charclass_class*)scan);
1390 else
830247a4 1391 cl_or(pRExC_state, data->start_class,
653099ff
GS
1392 (struct regnode_charclass_class*)scan);
1393 break;
1394 case ALNUM:
1395 if (flags & SCF_DO_STCLASS_AND) {
1396 if (!(data->start_class->flags & ANYOF_LOCALE)) {
1397 ANYOF_CLASS_CLEAR(data->start_class,ANYOF_NALNUM);
1398 for (value = 0; value < 256; value++)
1399 if (!isALNUM(value))
1400 ANYOF_BITMAP_CLEAR(data->start_class, value);
1401 }
1402 }
1403 else {
1404 if (data->start_class->flags & ANYOF_LOCALE)
1405 ANYOF_CLASS_SET(data->start_class,ANYOF_ALNUM);
1406 else {
1407 for (value = 0; value < 256; value++)
1408 if (isALNUM(value))
b81d288d 1409 ANYOF_BITMAP_SET(data->start_class, value);
653099ff
GS
1410 }
1411 }
1412 break;
1413 case ALNUML:
1414 if (flags & SCF_DO_STCLASS_AND) {
1415 if (data->start_class->flags & ANYOF_LOCALE)
1416 ANYOF_CLASS_CLEAR(data->start_class,ANYOF_NALNUM);
1417 }
1418 else {
1419 ANYOF_CLASS_SET(data->start_class,ANYOF_ALNUM);
1420 data->start_class->flags |= ANYOF_LOCALE;
1421 }
1422 break;
1423 case NALNUM:
1424 if (flags & SCF_DO_STCLASS_AND) {
1425 if (!(data->start_class->flags & ANYOF_LOCALE)) {
1426 ANYOF_CLASS_CLEAR(data->start_class,ANYOF_ALNUM);
1427 for (value = 0; value < 256; value++)
1428 if (isALNUM(value))
1429 ANYOF_BITMAP_CLEAR(data->start_class, value);
1430 }
1431 }
1432 else {
1433 if (data->start_class->flags & ANYOF_LOCALE)
1434 ANYOF_CLASS_SET(data->start_class,ANYOF_NALNUM);
1435 else {
1436 for (value = 0; value < 256; value++)
1437 if (!isALNUM(value))
b81d288d 1438 ANYOF_BITMAP_SET(data->start_class, value);
653099ff
GS
1439 }
1440 }
1441 break;
1442 case NALNUML:
1443 if (flags & SCF_DO_STCLASS_AND) {
1444 if (data->start_class->flags & ANYOF_LOCALE)
1445 ANYOF_CLASS_CLEAR(data->start_class,ANYOF_ALNUM);
1446 }
1447 else {
1448 data->start_class->flags |= ANYOF_LOCALE;
1449 ANYOF_CLASS_SET(data->start_class,ANYOF_NALNUM);
1450 }
1451 break;
1452 case SPACE:
1453 if (flags & SCF_DO_STCLASS_AND) {
1454 if (!(data->start_class->flags & ANYOF_LOCALE)) {
1455 ANYOF_CLASS_CLEAR(data->start_class,ANYOF_NSPACE);
1456 for (value = 0; value < 256; value++)
1457 if (!isSPACE(value))
1458 ANYOF_BITMAP_CLEAR(data->start_class, value);
1459 }
1460 }
1461 else {
1462 if (data->start_class->flags & ANYOF_LOCALE)
1463 ANYOF_CLASS_SET(data->start_class,ANYOF_SPACE);
1464 else {
1465 for (value = 0; value < 256; value++)
1466 if (isSPACE(value))
b81d288d 1467 ANYOF_BITMAP_SET(data->start_class, value);
653099ff
GS
1468 }
1469 }
1470 break;
1471 case SPACEL:
1472 if (flags & SCF_DO_STCLASS_AND) {
1473 if (data->start_class->flags & ANYOF_LOCALE)
1474 ANYOF_CLASS_CLEAR(data->start_class,ANYOF_NSPACE);
1475 }
1476 else {
1477 data->start_class->flags |= ANYOF_LOCALE;
1478 ANYOF_CLASS_SET(data->start_class,ANYOF_SPACE);
1479 }
1480 break;
1481 case NSPACE:
1482 if (flags & SCF_DO_STCLASS_AND) {
1483 if (!(data->start_class->flags & ANYOF_LOCALE)) {
1484 ANYOF_CLASS_CLEAR(data->start_class,ANYOF_SPACE);
1485 for (value = 0; value < 256; value++)
1486 if (isSPACE(value))
1487 ANYOF_BITMAP_CLEAR(data->start_class, value);
1488 }
1489 }
1490 else {
1491 if (data->start_class->flags & ANYOF_LOCALE)
1492 ANYOF_CLASS_SET(data->start_class,ANYOF_NSPACE);
1493 else {
1494 for (value = 0; value < 256; value++)
1495 if (!isSPACE(value))
b81d288d 1496 ANYOF_BITMAP_SET(data->start_class, value);
653099ff
GS
1497 }
1498 }
1499 break;
1500 case NSPACEL:
1501 if (flags & SCF_DO_STCLASS_AND) {
1502 if (data->start_class->flags & ANYOF_LOCALE) {
1503 ANYOF_CLASS_CLEAR(data->start_class,ANYOF_SPACE);
1504 for (value = 0; value < 256; value++)
1505 if (!isSPACE(value))
1506 ANYOF_BITMAP_CLEAR(data->start_class, value);
1507 }
1508 }
1509 else {
1510 data->start_class->flags |= ANYOF_LOCALE;
1511 ANYOF_CLASS_SET(data->start_class,ANYOF_NSPACE);
1512 }
1513 break;
1514 case DIGIT:
1515 if (flags & SCF_DO_STCLASS_AND) {
1516 ANYOF_CLASS_CLEAR(data->start_class,ANYOF_NDIGIT);
1517 for (value = 0; value < 256; value++)
1518 if (!isDIGIT(value))
1519 ANYOF_BITMAP_CLEAR(data->start_class, value);
1520 }
1521 else {
1522 if (data->start_class->flags & ANYOF_LOCALE)
1523 ANYOF_CLASS_SET(data->start_class,ANYOF_DIGIT);
1524 else {
1525 for (value = 0; value < 256; value++)
1526 if (isDIGIT(value))
b81d288d 1527 ANYOF_BITMAP_SET(data->start_class, value);
653099ff
GS
1528 }
1529 }
1530 break;
1531 case NDIGIT:
1532 if (flags & SCF_DO_STCLASS_AND) {
1533 ANYOF_CLASS_CLEAR(data->start_class,ANYOF_DIGIT);
1534 for (value = 0; value < 256; value++)
1535 if (isDIGIT(value))
1536 ANYOF_BITMAP_CLEAR(data->start_class, value);
1537 }
1538 else {
1539 if (data->start_class->flags & ANYOF_LOCALE)
1540 ANYOF_CLASS_SET(data->start_class,ANYOF_NDIGIT);
1541 else {
1542 for (value = 0; value < 256; value++)
1543 if (!isDIGIT(value))
b81d288d 1544 ANYOF_BITMAP_SET(data->start_class, value);
653099ff
GS
1545 }
1546 }
1547 break;
1548 }
1549 if (flags & SCF_DO_STCLASS_OR)
1550 cl_and(data->start_class, &and_with);
1551 flags &= ~SCF_DO_STCLASS;
1552 }
a0ed51b3 1553 }
22c35a8c 1554 else if (PL_regkind[(U8)OP(scan)] == EOL && flags & SCF_DO_SUBSTR) {
c277df42
IZ
1555 data->flags |= (OP(scan) == MEOL
1556 ? SF_BEFORE_MEOL
1557 : SF_BEFORE_SEOL);
a0ed51b3 1558 }
653099ff
GS
1559 else if ( PL_regkind[(U8)OP(scan)] == BRANCHJ
1560 /* Lookbehind, or need to calculate parens/evals/stclass: */
1561 && (scan->flags || data || (flags & SCF_DO_STCLASS))
c277df42 1562 && (OP(scan) == IFMATCH || OP(scan) == UNLESSM)) {
653099ff 1563 /* Lookahead/lookbehind */
cb434fcc 1564 I32 deltanext, minnext, fake = 0;
c277df42 1565 regnode *nscan;
653099ff
GS
1566 struct regnode_charclass_class intrnl;
1567 int f = 0;
c277df42
IZ
1568
1569 data_fake.flags = 0;
b81d288d 1570 if (data) {
2c2d71f5 1571 data_fake.whilem_c = data->whilem_c;
cb434fcc
IZ
1572 data_fake.last_closep = data->last_closep;
1573 }
1574 else
1575 data_fake.last_closep = &fake;
653099ff
GS
1576 if ( flags & SCF_DO_STCLASS && !scan->flags
1577 && OP(scan) == IFMATCH ) { /* Lookahead */
830247a4 1578 cl_init(pRExC_state, &intrnl);
653099ff 1579 data_fake.start_class = &intrnl;
e1901655 1580 f |= SCF_DO_STCLASS_AND;
653099ff 1581 }
e1901655
IZ
1582 if (flags & SCF_WHILEM_VISITED_POS)
1583 f |= SCF_WHILEM_VISITED_POS;
c277df42
IZ
1584 next = regnext(scan);
1585 nscan = NEXTOPER(NEXTOPER(scan));
830247a4 1586 minnext = study_chunk(pRExC_state, &nscan, &deltanext, last, &data_fake, f);
c277df42
IZ
1587 if (scan->flags) {
1588 if (deltanext) {
9baa0206 1589 vFAIL("Variable length lookbehind not implemented");
a0ed51b3
LW
1590 }
1591 else if (minnext > U8_MAX) {
9baa0206 1592 vFAIL2("Lookbehind longer than %"UVuf" not implemented", (UV)U8_MAX);
c277df42
IZ
1593 }
1594 scan->flags = minnext;
1595 }
1596 if (data && data_fake.flags & (SF_HAS_PAR|SF_IN_PAR))
1597 pars++;
405ff068 1598 if (data && (data_fake.flags & SF_HAS_EVAL))
c277df42 1599 data->flags |= SF_HAS_EVAL;
2c2d71f5
JH
1600 if (data)
1601 data->whilem_c = data_fake.whilem_c;
e1901655 1602 if (f & SCF_DO_STCLASS_AND) {
653099ff
GS
1603 int was = (data->start_class->flags & ANYOF_EOS);
1604
1605 cl_and(data->start_class, &intrnl);
1606 if (was)
1607 data->start_class->flags |= ANYOF_EOS;
1608 }
a0ed51b3
LW
1609 }
1610 else if (OP(scan) == OPEN) {
c277df42 1611 pars++;
a0ed51b3 1612 }
cb434fcc
IZ
1613 else if (OP(scan) == CLOSE) {
1614 if (ARG(scan) == is_par) {
1615 next = regnext(scan);
c277df42 1616
cb434fcc
IZ
1617 if ( next && (OP(next) != WHILEM) && next < last)
1618 is_par = 0; /* Disable optimization */
1619 }
1620 if (data)
1621 *(data->last_closep) = ARG(scan);
a0ed51b3
LW
1622 }
1623 else if (OP(scan) == EVAL) {
c277df42
IZ
1624 if (data)
1625 data->flags |= SF_HAS_EVAL;
1626 }
96776eda 1627 else if (OP(scan) == LOGICAL && scan->flags == 2) { /* Embedded follows */
0f5d15d6 1628 if (flags & SCF_DO_SUBSTR) {
830247a4 1629 scan_commit(pRExC_state,data);
0f5d15d6
IZ
1630 data->longest = &(data->longest_float);
1631 }
1632 is_inf = is_inf_internal = 1;
653099ff 1633 if (flags & SCF_DO_STCLASS_OR) /* Allow everything */
830247a4 1634 cl_anything(pRExC_state, data->start_class);
96776eda 1635 flags &= ~SCF_DO_STCLASS;
0f5d15d6 1636 }
c277df42
IZ
1637 /* Else: zero-length, ignore. */
1638 scan = regnext(scan);
1639 }
1640
1641 finish:
1642 *scanp = scan;
aca2d497 1643 *deltap = is_inf_internal ? I32_MAX : delta;
b81d288d 1644 if (flags & SCF_DO_SUBSTR && is_inf)
c277df42
IZ
1645 data->pos_delta = I32_MAX - data->pos_min;
1646 if (is_par > U8_MAX)
1647 is_par = 0;
1648 if (is_par && pars==1 && data) {
1649 data->flags |= SF_IN_PAR;
1650 data->flags &= ~SF_HAS_PAR;
a0ed51b3
LW
1651 }
1652 else if (pars && data) {
c277df42
IZ
1653 data->flags |= SF_HAS_PAR;
1654 data->flags &= ~SF_IN_PAR;
1655 }
653099ff
GS
1656 if (flags & SCF_DO_STCLASS_OR)
1657 cl_and(data->start_class, &and_with);
c277df42
IZ
1658 return min;
1659}
1660
76e3520e 1661STATIC I32
830247a4 1662S_add_data(pTHX_ RExC_state_t *pRExC_state, I32 n, char *s)
c277df42 1663{
830247a4 1664 if (RExC_rx->data) {
b81d288d
AB
1665 Renewc(RExC_rx->data,
1666 sizeof(*RExC_rx->data) + sizeof(void*) * (RExC_rx->data->count + n - 1),
c277df42 1667 char, struct reg_data);
830247a4
IZ
1668 Renew(RExC_rx->data->what, RExC_rx->data->count + n, U8);
1669 RExC_rx->data->count += n;
a0ed51b3
LW
1670 }
1671 else {
830247a4 1672 Newc(1207, RExC_rx->data, sizeof(*RExC_rx->data) + sizeof(void*) * (n - 1),
c277df42 1673 char, struct reg_data);
830247a4
IZ
1674 New(1208, RExC_rx->data->what, n, U8);
1675 RExC_rx->data->count = n;
c277df42 1676 }
830247a4
IZ
1677 Copy(s, RExC_rx->data->what + RExC_rx->data->count - n, n, U8);
1678 return RExC_rx->data->count - n;
c277df42
IZ
1679}
1680
d88dccdf 1681void
864dbfa3 1682Perl_reginitcolors(pTHX)
d88dccdf 1683{
d88dccdf
IZ
1684 int i = 0;
1685 char *s = PerlEnv_getenv("PERL_RE_COLORS");
b81d288d 1686
d88dccdf
IZ
1687 if (s) {
1688 PL_colors[0] = s = savepv(s);
1689 while (++i < 6) {
1690 s = strchr(s, '\t');
1691 if (s) {
1692 *s = '\0';
1693 PL_colors[i] = ++s;
1694 }
1695 else
c712d376 1696 PL_colors[i] = s = "";
d88dccdf
IZ
1697 }
1698 } else {
b81d288d 1699 while (i < 6)
d88dccdf
IZ
1700 PL_colors[i++] = "";
1701 }
1702 PL_colorset = 1;
1703}
1704
8615cb43 1705
a687059c 1706/*
e50aee73 1707 - pregcomp - compile a regular expression into internal code
a687059c
LW
1708 *
1709 * We can't allocate space until we know how big the compiled form will be,
1710 * but we can't compile it (and thus know how big it is) until we've got a
1711 * place to put the code. So we cheat: we compile it twice, once with code
1712 * generation turned off and size counting turned on, and once "for real".
1713 * This also means that we don't allocate space until we are sure that the
1714 * thing really will compile successfully, and we never have to move the
1715 * code and thus invalidate pointers into it. (Note that it has to be in
1716 * one piece because free() must be able to free it all.) [NB: not true in perl]
1717 *
1718 * Beware that the optimization-preparation code in here knows about some
1719 * of the structure of the compiled regexp. [I'll say.]
1720 */
1721regexp *
864dbfa3 1722Perl_pregcomp(pTHX_ char *exp, char *xend, PMOP *pm)
a687059c 1723{
a0d0e21e 1724 register regexp *r;
c277df42 1725 regnode *scan;
c277df42 1726 regnode *first;
a0d0e21e 1727 I32 flags;
a0d0e21e
LW
1728 I32 minlen = 0;
1729 I32 sawplus = 0;
1730 I32 sawopen = 0;
2c2d71f5 1731 scan_data_t data;
830247a4
IZ
1732 RExC_state_t RExC_state;
1733 RExC_state_t *pRExC_state = &RExC_state;
a0d0e21e
LW
1734
1735 if (exp == NULL)
c277df42 1736 FAIL("NULL regexp argument");
a0d0e21e 1737
a5961de5 1738 RExC_utf8 = pm->op_pmdynflags & PMdf_CMP_UTF8;
a0ed51b3 1739
5cfc7842 1740 RExC_precomp = exp;
a5961de5
JH
1741 DEBUG_r({
1742 if (!PL_colorset) reginitcolors();
1743 PerlIO_printf(Perl_debug_log, "%sCompiling REx%s `%s%*s%s'\n",
1744 PL_colors[4],PL_colors[5],PL_colors[0],
1745 (int)(xend - exp), RExC_precomp, PL_colors[1]);
1746 });
830247a4
IZ
1747 RExC_flags16 = pm->op_pmflags;
1748 RExC_sawback = 0;
bbce6d69 1749
830247a4
IZ
1750 RExC_seen = 0;
1751 RExC_seen_zerolen = *exp == '^' ? -1 : 0;
1752 RExC_seen_evals = 0;
1753 RExC_extralen = 0;
c277df42 1754
bbce6d69 1755 /* First pass: determine size, legality. */
830247a4 1756 RExC_parse = exp;
fac92740 1757 RExC_start = exp;
830247a4
IZ
1758 RExC_end = xend;
1759 RExC_naughty = 0;
1760 RExC_npar = 1;
1761 RExC_size = 0L;
1762 RExC_emit = &PL_regdummy;
1763 RExC_whilem_seen = 0;
85ddcde9
JH
1764#if 0 /* REGC() is (currently) a NOP at the first pass.
1765 * Clever compilers notice this and complain. --jhi */
830247a4 1766 REGC((U8)REG_MAGIC, (char*)RExC_emit);
85ddcde9 1767#endif
830247a4 1768 if (reg(pRExC_state, 0, &flags) == NULL) {
830247a4 1769 RExC_precomp = Nullch;
a0d0e21e
LW
1770 return(NULL);
1771 }
830247a4 1772 DEBUG_r(PerlIO_printf(Perl_debug_log, "size %"IVdf" ", (IV)RExC_size));
c277df42 1773
c277df42
IZ
1774 /* Small enough for pointer-storage convention?
1775 If extralen==0, this means that we will not need long jumps. */
830247a4
IZ
1776 if (RExC_size >= 0x10000L && RExC_extralen)
1777 RExC_size += RExC_extralen;
c277df42 1778 else
830247a4
IZ
1779 RExC_extralen = 0;
1780 if (RExC_whilem_seen > 15)
1781 RExC_whilem_seen = 15;
a0d0e21e 1782
bbce6d69 1783 /* Allocate space and initialize. */
830247a4 1784 Newc(1001, r, sizeof(regexp) + (unsigned)RExC_size * sizeof(regnode),
c277df42 1785 char, regexp);
a0d0e21e 1786 if (r == NULL)
b45f050a
JF
1787 FAIL("Regexp out of space");
1788
0f79a09d
GS
1789#ifdef DEBUGGING
1790 /* avoid reading uninitialized memory in DEBUGGING code in study_chunk() */
830247a4 1791 Zero(r, sizeof(regexp) + (unsigned)RExC_size * sizeof(regnode), char);
0f79a09d 1792#endif
c277df42 1793 r->refcnt = 1;
bbce6d69 1794 r->prelen = xend - exp;
5cfc7842 1795 r->precomp = savepvn(RExC_precomp, r->prelen);
cf93c79d
IZ
1796 r->subbeg = NULL;
1797 r->reganch = pm->op_pmflags & PMf_COMPILETIME;
830247a4 1798 r->nparens = RExC_npar - 1; /* set early to validate backrefs */
4327152a
IZ
1799
1800 r->substrs = 0; /* Useful during FAIL. */
1801 r->startp = 0; /* Useful during FAIL. */
1802 r->endp = 0; /* Useful during FAIL. */
1803
fac92740
MJD
1804 Newz(1304, r->offsets, 2*RExC_size+1, U32); /* MJD 20001228 */
1805 if (r->offsets) {
1806 r->offsets[0] = RExC_size;
1807 }
1808 DEBUG_r(PerlIO_printf(Perl_debug_log,
392fbf5d 1809 "%s %"UVuf" bytes for offset annotations.\n",
fac92740 1810 r->offsets ? "Got" : "Couldn't get",
392fbf5d 1811 (UV)((2*RExC_size+1) * sizeof(U32))));
fac92740 1812
830247a4 1813 RExC_rx = r;
bbce6d69 1814
1815 /* Second pass: emit code. */
055bb491 1816 RExC_flags16 = pm->op_pmflags; /* don't let top level (?i) bleed */
830247a4
IZ
1817 RExC_parse = exp;
1818 RExC_end = xend;
1819 RExC_naughty = 0;
1820 RExC_npar = 1;
fac92740 1821 RExC_emit_start = r->program;
830247a4 1822 RExC_emit = r->program;
2cd61cdb 1823 /* Store the count of eval-groups for security checks: */
830247a4
IZ
1824 RExC_emit->next_off = ((RExC_seen_evals > U16_MAX) ? U16_MAX : RExC_seen_evals);
1825 REGC((U8)REG_MAGIC, (char*) RExC_emit++);
c277df42 1826 r->data = 0;
830247a4 1827 if (reg(pRExC_state, 0, &flags) == NULL)
a0d0e21e
LW
1828 return(NULL);
1829
1830 /* Dig out information for optimizations. */
cf93c79d 1831 r->reganch = pm->op_pmflags & PMf_COMPILETIME; /* Again? */
830247a4 1832 pm->op_pmflags = RExC_flags16;
a0ed51b3 1833 if (UTF)
5ff6fc6d 1834 r->reganch |= ROPT_UTF8; /* Unicode in it? */
c277df42 1835 r->regstclass = NULL;
830247a4 1836 if (RExC_naughty >= 10) /* Probably an expensive pattern. */
a0ed51b3 1837 r->reganch |= ROPT_NAUGHTY;
c277df42 1838 scan = r->program + 1; /* First BRANCH. */
2779dcf1
IZ
1839
1840 /* XXXX To minimize changes to RE engine we always allocate
1841 3-units-long substrs field. */
1842 Newz(1004, r->substrs, 1, struct reg_substr_data);
1843
2c2d71f5 1844 StructCopy(&zero_scan_data, &data, scan_data_t);
653099ff 1845 /* XXXX Should not we check for something else? Usually it is OPEN1... */
c277df42 1846 if (OP(scan) != BRANCH) { /* Only one top-level choice. */
c277df42 1847 I32 fake;
c5254dd6 1848 STRLEN longest_float_length, longest_fixed_length;
653099ff
GS
1849 struct regnode_charclass_class ch_class;
1850 int stclass_flag;
cb434fcc 1851 I32 last_close = 0;
a0d0e21e
LW
1852
1853 first = scan;
c277df42 1854 /* Skip introductions and multiplicators >= 1. */
a0d0e21e 1855 while ((OP(first) == OPEN && (sawopen = 1)) ||
653099ff 1856 /* An OR of *one* alternative - should not happen now. */
a0d0e21e
LW
1857 (OP(first) == BRANCH && OP(regnext(first)) != BRANCH) ||
1858 (OP(first) == PLUS) ||
1859 (OP(first) == MINMOD) ||
653099ff 1860 /* An {n,m} with n>0 */
22c35a8c 1861 (PL_regkind[(U8)OP(first)] == CURLY && ARG1(first) > 0) ) {
a0d0e21e
LW
1862 if (OP(first) == PLUS)
1863 sawplus = 1;
1864 else
1865 first += regarglen[(U8)OP(first)];
1866 first = NEXTOPER(first);
a687059c
LW
1867 }
1868
a0d0e21e
LW
1869 /* Starting-point info. */
1870 again:
653099ff 1871 if (PL_regkind[(U8)OP(first)] == EXACT) {
1aa99e6b
IH
1872 if (OP(first) == EXACT)
1873 ; /* Empty, get anchored substr later. */
1874 else if ((OP(first) == EXACTF || OP(first) == EXACTFL))
b3c9acc1
IZ
1875 r->regstclass = first;
1876 }
653099ff 1877 else if (strchr((char*)PL_simple,OP(first)))
a0d0e21e 1878 r->regstclass = first;
22c35a8c
GS
1879 else if (PL_regkind[(U8)OP(first)] == BOUND ||
1880 PL_regkind[(U8)OP(first)] == NBOUND)
a0d0e21e 1881 r->regstclass = first;
22c35a8c 1882 else if (PL_regkind[(U8)OP(first)] == BOL) {
cad2e5aa
JH
1883 r->reganch |= (OP(first) == MBOL
1884 ? ROPT_ANCH_MBOL
1885 : (OP(first) == SBOL
1886 ? ROPT_ANCH_SBOL
1887 : ROPT_ANCH_BOL));
a0d0e21e 1888 first = NEXTOPER(first);
774d564b 1889 goto again;
1890 }
1891 else if (OP(first) == GPOS) {
1892 r->reganch |= ROPT_ANCH_GPOS;
1893 first = NEXTOPER(first);
1894 goto again;
a0d0e21e 1895 }
e09294f4 1896 else if (!sawopen && (OP(first) == STAR &&
22c35a8c 1897 PL_regkind[(U8)OP(NEXTOPER(first))] == REG_ANY) &&
a0d0e21e
LW
1898 !(r->reganch & ROPT_ANCH) )
1899 {
1900 /* turn .* into ^.* with an implied $*=1 */
cad2e5aa
JH
1901 int type = OP(NEXTOPER(first));
1902
ffc61ed2 1903 if (type == REG_ANY)
cad2e5aa
JH
1904 type = ROPT_ANCH_MBOL;
1905 else
1906 type = ROPT_ANCH_SBOL;
1907
1908 r->reganch |= type | ROPT_IMPLICIT;
a0d0e21e 1909 first = NEXTOPER(first);
774d564b 1910 goto again;
a0d0e21e 1911 }
b81d288d 1912 if (sawplus && (!sawopen || !RExC_sawback)
830247a4 1913 && !(RExC_seen & REG_SEEN_EVAL)) /* May examine pos and $& */
cad2e5aa
JH
1914 /* x+ must match at the 1st pos of run of x's */
1915 r->reganch |= ROPT_SKIP;
a0d0e21e 1916
c277df42 1917 /* Scan is after the zeroth branch, first is atomic matcher. */
b81d288d 1918 DEBUG_r(PerlIO_printf(Perl_debug_log, "first at %"IVdf"\n",
b900a521 1919 (IV)(first - scan + 1)));
a0d0e21e
LW
1920 /*
1921 * If there's something expensive in the r.e., find the
1922 * longest literal string that must appear and make it the
1923 * regmust. Resolve ties in favor of later strings, since
1924 * the regstart check works with the beginning of the r.e.
1925 * and avoiding duplication strengthens checking. Not a
1926 * strong reason, but sufficient in the absence of others.
1927 * [Now we resolve ties in favor of the earlier string if
c277df42 1928 * it happens that c_offset_min has been invalidated, since the
a0d0e21e
LW
1929 * earlier string may buy us something the later one won't.]
1930 */
a0d0e21e 1931 minlen = 0;
a687059c 1932
79cb57f6
GS
1933 data.longest_fixed = newSVpvn("",0);
1934 data.longest_float = newSVpvn("",0);
1935 data.last_found = newSVpvn("",0);
c277df42
IZ
1936 data.longest = &(data.longest_fixed);
1937 first = scan;
653099ff 1938 if (!r->regstclass) {
830247a4 1939 cl_init(pRExC_state, &ch_class);
653099ff
GS
1940 data.start_class = &ch_class;
1941 stclass_flag = SCF_DO_STCLASS_AND;
1942 } else /* XXXX Check for BOUND? */
1943 stclass_flag = 0;
cb434fcc 1944 data.last_closep = &last_close;
653099ff 1945
830247a4 1946 minlen = study_chunk(pRExC_state, &first, &fake, scan + RExC_size, /* Up to end */
e1901655 1947 &data, SCF_DO_SUBSTR | SCF_WHILEM_VISITED_POS | stclass_flag);
830247a4 1948 if ( RExC_npar == 1 && data.longest == &(data.longest_fixed)
b81d288d 1949 && data.last_start_min == 0 && data.last_end > 0
830247a4
IZ
1950 && !RExC_seen_zerolen
1951 && (!(RExC_seen & REG_SEEN_GPOS) || (r->reganch & ROPT_ANCH_GPOS)))
c277df42 1952 r->reganch |= ROPT_CHECK_ALL;
830247a4 1953 scan_commit(pRExC_state, &data);
c277df42
IZ
1954 SvREFCNT_dec(data.last_found);
1955
a0ed51b3 1956 longest_float_length = CHR_SVLEN(data.longest_float);
c5254dd6 1957 if (longest_float_length
c277df42
IZ
1958 || (data.flags & SF_FL_BEFORE_EOL
1959 && (!(data.flags & SF_FL_BEFORE_MEOL)
830247a4 1960 || (RExC_flags16 & PMf_MULTILINE)))) {
cf93c79d
IZ
1961 int t;
1962
a0ed51b3 1963 if (SvCUR(data.longest_fixed) /* ok to leave SvCUR */
aca2d497
IZ
1964 && data.offset_fixed == data.offset_float_min
1965 && SvCUR(data.longest_fixed) == SvCUR(data.longest_float))
1966 goto remove_float; /* As in (a)+. */
1967
33b8afdf
JH
1968 if (SvUTF8(data.longest_float)) {
1969 r->float_utf8 = data.longest_float;
1970 r->float_substr = Nullsv;
1971 } else {
1972 r->float_substr = data.longest_float;
1973 r->float_utf8 = Nullsv;
1974 }
c277df42
IZ
1975 r->float_min_offset = data.offset_float_min;
1976 r->float_max_offset = data.offset_float_max;
cf93c79d
IZ
1977 t = (data.flags & SF_FL_BEFORE_EOL /* Can't have SEOL and MULTI */
1978 && (!(data.flags & SF_FL_BEFORE_MEOL)
830247a4 1979 || (RExC_flags16 & PMf_MULTILINE)));
33b8afdf 1980 fbm_compile(data.longest_float, t ? FBMcf_TAIL : 0);
a0ed51b3
LW
1981 }
1982 else {
aca2d497 1983 remove_float:
33b8afdf 1984 r->float_substr = r->float_utf8 = Nullsv;
c277df42 1985 SvREFCNT_dec(data.longest_float);
c5254dd6 1986 longest_float_length = 0;
a0d0e21e 1987 }
c277df42 1988
a0ed51b3 1989 longest_fixed_length = CHR_SVLEN(data.longest_fixed);
c5254dd6 1990 if (longest_fixed_length
c277df42
IZ
1991 || (data.flags & SF_FIX_BEFORE_EOL /* Cannot have SEOL and MULTI */
1992 && (!(data.flags & SF_FIX_BEFORE_MEOL)
830247a4 1993 || (RExC_flags16 & PMf_MULTILINE)))) {
cf93c79d
IZ
1994 int t;
1995
33b8afdf
JH
1996 if (SvUTF8(data.longest_fixed)) {
1997 r->anchored_utf8 = data.longest_fixed;
1998 r->anchored_substr = Nullsv;
1999 } else {
2000 r->anchored_substr = data.longest_fixed;
2001 r->anchored_utf8 = Nullsv;
2002 }
c277df42 2003 r->anchored_offset = data.offset_fixed;
cf93c79d
IZ
2004 t = (data.flags & SF_FIX_BEFORE_EOL /* Can't have SEOL and MULTI */
2005 && (!(data.flags & SF_FIX_BEFORE_MEOL)
830247a4 2006 || (RExC_flags16 & PMf_MULTILINE)));
33b8afdf 2007 fbm_compile(data.longest_fixed, t ? FBMcf_TAIL : 0);
a0ed51b3
LW
2008 }
2009 else {
33b8afdf 2010 r->anchored_substr = r->anchored_utf8 = Nullsv;
c277df42 2011 SvREFCNT_dec(data.longest_fixed);
c5254dd6 2012 longest_fixed_length = 0;
a0d0e21e 2013 }
b81d288d 2014 if (r->regstclass
ffc61ed2 2015 && (OP(r->regstclass) == REG_ANY || OP(r->regstclass) == SANY))
653099ff 2016 r->regstclass = NULL;
33b8afdf
JH
2017 if ((!(r->anchored_substr || r->anchored_utf8) || r->anchored_offset)
2018 && stclass_flag
653099ff
GS
2019 && !(data.start_class->flags & ANYOF_EOS)
2020 && !cl_is_anything(data.start_class)) {
830247a4 2021 I32 n = add_data(pRExC_state, 1, "f");
653099ff 2022
b81d288d 2023 New(1006, RExC_rx->data->data[n], 1,
653099ff
GS
2024 struct regnode_charclass_class);
2025 StructCopy(data.start_class,
830247a4 2026 (struct regnode_charclass_class*)RExC_rx->data->data[n],
653099ff 2027 struct regnode_charclass_class);
830247a4 2028 r->regstclass = (regnode*)RExC_rx->data->data[n];
653099ff 2029 r->reganch &= ~ROPT_SKIP; /* Used in find_byclass(). */
ffc61ed2 2030 PL_regdata = r->data; /* for regprop() */
9c5ffd7c
JH
2031 DEBUG_r({ SV *sv = sv_newmortal();
2032 regprop(sv, (regnode*)data.start_class);
2033 PerlIO_printf(Perl_debug_log,
2034 "synthetic stclass `%s'.\n",
2035 SvPVX(sv));});
653099ff 2036 }
c277df42
IZ
2037
2038 /* A temporary algorithm prefers floated substr to fixed one to dig more info. */
c5254dd6 2039 if (longest_fixed_length > longest_float_length) {
c277df42 2040 r->check_substr = r->anchored_substr;
33b8afdf 2041 r->check_utf8 = r->anchored_utf8;
c277df42
IZ
2042 r->check_offset_min = r->check_offset_max = r->anchored_offset;
2043 if (r->reganch & ROPT_ANCH_SINGLE)
2044 r->reganch |= ROPT_NOSCAN;
a0ed51b3
LW
2045 }
2046 else {
c277df42 2047 r->check_substr = r->float_substr;
33b8afdf 2048 r->check_utf8 = r->float_utf8;
c277df42
IZ
2049 r->check_offset_min = data.offset_float_min;
2050 r->check_offset_max = data.offset_float_max;
a0d0e21e 2051 }
30382c73
IZ
2052 /* XXXX Currently intuiting is not compatible with ANCH_GPOS.
2053 This should be changed ASAP! */
33b8afdf 2054 if ((r->check_substr || r->check_utf8) && !(r->reganch & ROPT_ANCH_GPOS)) {
cad2e5aa 2055 r->reganch |= RE_USE_INTUIT;
33b8afdf 2056 if (SvTAIL(r->check_substr ? r->check_substr : r->check_utf8))
cad2e5aa
JH
2057 r->reganch |= RE_INTUIT_TAIL;
2058 }
a0ed51b3
LW
2059 }
2060 else {
c277df42
IZ
2061 /* Several toplevels. Best we can is to set minlen. */
2062 I32 fake;
653099ff 2063 struct regnode_charclass_class ch_class;
cb434fcc 2064 I32 last_close = 0;
c277df42
IZ
2065
2066 DEBUG_r(PerlIO_printf(Perl_debug_log, "\n"));
2067 scan = r->program + 1;
830247a4 2068 cl_init(pRExC_state, &ch_class);
653099ff 2069 data.start_class = &ch_class;
cb434fcc 2070 data.last_closep = &last_close;
e1901655 2071 minlen = study_chunk(pRExC_state, &scan, &fake, scan + RExC_size, &data, SCF_DO_STCLASS_AND|SCF_WHILEM_VISITED_POS);
33b8afdf
JH
2072 r->check_substr = r->check_utf8 = r->anchored_substr = r->anchored_utf8
2073 = r->float_substr = r->float_utf8 = Nullsv;
653099ff
GS
2074 if (!(data.start_class->flags & ANYOF_EOS)
2075 && !cl_is_anything(data.start_class)) {
830247a4 2076 I32 n = add_data(pRExC_state, 1, "f");
653099ff 2077
b81d288d 2078 New(1006, RExC_rx->data->data[n], 1,
653099ff
GS
2079 struct regnode_charclass_class);
2080 StructCopy(data.start_class,
830247a4 2081 (struct regnode_charclass_class*)RExC_rx->data->data[n],
653099ff 2082 struct regnode_charclass_class);
830247a4 2083 r->regstclass = (regnode*)RExC_rx->data->data[n];
653099ff 2084 r->reganch &= ~ROPT_SKIP; /* Used in find_byclass(). */
9c5ffd7c
JH
2085 DEBUG_r({ SV* sv = sv_newmortal();
2086 regprop(sv, (regnode*)data.start_class);
2087 PerlIO_printf(Perl_debug_log,
2088 "synthetic stclass `%s'.\n",
2089 SvPVX(sv));});
653099ff 2090 }
a0d0e21e
LW
2091 }
2092
a0d0e21e 2093 r->minlen = minlen;
b81d288d 2094 if (RExC_seen & REG_SEEN_GPOS)
c277df42 2095 r->reganch |= ROPT_GPOS_SEEN;
830247a4 2096 if (RExC_seen & REG_SEEN_LOOKBEHIND)
c277df42 2097 r->reganch |= ROPT_LOOKBEHIND_SEEN;
830247a4 2098 if (RExC_seen & REG_SEEN_EVAL)
ce862d02 2099 r->reganch |= ROPT_EVAL_SEEN;
f33976b4
DB
2100 if (RExC_seen & REG_SEEN_CANY)
2101 r->reganch |= ROPT_CANY_SEEN;
830247a4
IZ
2102 Newz(1002, r->startp, RExC_npar, I32);
2103 Newz(1002, r->endp, RExC_npar, I32);
ffc61ed2 2104 PL_regdata = r->data; /* for regprop() */
a0d0e21e
LW
2105 DEBUG_r(regdump(r));
2106 return(r);
a687059c
LW
2107}
2108
2109/*
2110 - reg - regular expression, i.e. main body or parenthesized thing
2111 *
2112 * Caller must absorb opening parenthesis.
2113 *
2114 * Combining parenthesis handling with the base level of regular expression
2115 * is a trifle forced, but the need to tie the tails of the branches to what
2116 * follows makes it hard to avoid.
2117 */
76e3520e 2118STATIC regnode *
830247a4 2119S_reg(pTHX_ RExC_state_t *pRExC_state, I32 paren, I32 *flagp)
c277df42 2120 /* paren: Parenthesized? 0=top, 1=(, inside: changed to letter. */
a687059c 2121{
c277df42
IZ
2122 register regnode *ret; /* Will be the head of the group. */
2123 register regnode *br;
2124 register regnode *lastbr;
2125 register regnode *ender = 0;
a0d0e21e 2126 register I32 parno = 0;
830247a4 2127 I32 flags, oregflags = RExC_flags16, have_branch = 0, open = 0;
9d1d55b5
JP
2128
2129 /* for (?g), (?gc), and (?o) warnings; warning
2130 about (?c) will warn about (?g) -- japhy */
2131
2132 I32 wastedflags = 0x00,
2133 wasted_o = 0x01,
2134 wasted_g = 0x02,
2135 wasted_gc = 0x02 | 0x04,
2136 wasted_c = 0x04;
2137
fac92740 2138 char * parse_start = RExC_parse; /* MJD */
830247a4 2139 char *oregcomp_parse = RExC_parse;
c277df42 2140 char c;
a0d0e21e 2141
821b33a5 2142 *flagp = 0; /* Tentatively. */
a0d0e21e 2143
9d1d55b5 2144
a0d0e21e
LW
2145 /* Make an OPEN node, if parenthesized. */
2146 if (paren) {
fac92740 2147 if (*RExC_parse == '?') { /* (?...) */
ca9dfc88
IZ
2148 U16 posflags = 0, negflags = 0;
2149 U16 *flagsp = &posflags;
0f5d15d6 2150 int logical = 0;
830247a4 2151 char *seqstart = RExC_parse;
ca9dfc88 2152
830247a4
IZ
2153 RExC_parse++;
2154 paren = *RExC_parse++;
c277df42 2155 ret = NULL; /* For look-ahead/behind. */
a0d0e21e 2156 switch (paren) {
fac92740 2157 case '<': /* (?<...) */
830247a4 2158 RExC_seen |= REG_SEEN_LOOKBEHIND;
b81d288d 2159 if (*RExC_parse == '!')
c277df42 2160 paren = ',';
b81d288d 2161 if (*RExC_parse != '=' && *RExC_parse != '!')
c277df42 2162 goto unknown;
830247a4 2163 RExC_parse++;
fac92740
MJD
2164 case '=': /* (?=...) */
2165 case '!': /* (?!...) */
830247a4 2166 RExC_seen_zerolen++;
fac92740
MJD
2167 case ':': /* (?:...) */
2168 case '>': /* (?>...) */
a0d0e21e 2169 break;
fac92740
MJD
2170 case '$': /* (?$...) */
2171 case '@': /* (?@...) */
8615cb43 2172 vFAIL2("Sequence (?%c...) not implemented", (int)paren);
a0d0e21e 2173 break;
fac92740 2174 case '#': /* (?#...) */
830247a4
IZ
2175 while (*RExC_parse && *RExC_parse != ')')
2176 RExC_parse++;
2177 if (*RExC_parse != ')')
c277df42 2178 FAIL("Sequence (?#... not terminated");
830247a4 2179 nextchar(pRExC_state);
a0d0e21e
LW
2180 *flagp = TRYAGAIN;
2181 return NULL;
fac92740 2182 case 'p': /* (?p...) */
9014280d 2183 if (SIZE_ONLY && ckWARN2(WARN_DEPRECATED, WARN_REGEXP))
f9373011 2184 vWARNdep(RExC_parse, "(?p{}) is deprecated - use (??{})");
8c8ad484 2185 /* FALL THROUGH*/
fac92740 2186 case '?': /* (??...) */
0f5d15d6 2187 logical = 1;
438a3801
YST
2188 if (*RExC_parse != '{')
2189 goto unknown;
830247a4 2190 paren = *RExC_parse++;
0f5d15d6 2191 /* FALL THROUGH */
fac92740 2192 case '{': /* (?{...}) */
c277df42 2193 {
c277df42
IZ
2194 I32 count = 1, n = 0;
2195 char c;
830247a4 2196 char *s = RExC_parse;
c277df42
IZ
2197 SV *sv;
2198 OP_4tree *sop, *rop;
2199
830247a4
IZ
2200 RExC_seen_zerolen++;
2201 RExC_seen |= REG_SEEN_EVAL;
2202 while (count && (c = *RExC_parse)) {
2203 if (c == '\\' && RExC_parse[1])
2204 RExC_parse++;
b81d288d 2205 else if (c == '{')
c277df42 2206 count++;
b81d288d 2207 else if (c == '}')
c277df42 2208 count--;
830247a4 2209 RExC_parse++;
c277df42 2210 }
830247a4 2211 if (*RExC_parse != ')')
b45f050a 2212 {
b81d288d 2213 RExC_parse = s;
b45f050a
JF
2214 vFAIL("Sequence (?{...}) not terminated or not {}-balanced");
2215 }
c277df42
IZ
2216 if (!SIZE_ONLY) {
2217 AV *av;
b81d288d
AB
2218
2219 if (RExC_parse - 1 - s)
830247a4 2220 sv = newSVpvn(s, RExC_parse - 1 - s);
c277df42 2221 else
79cb57f6 2222 sv = newSVpvn("", 0);
c277df42 2223
569233ed
SB
2224 ENTER;
2225 Perl_save_re_context(aTHX);
c277df42 2226 rop = sv_compile_2op(sv, &sop, "re", &av);
9b978d73
DM
2227 sop->op_private |= OPpREFCOUNTED;
2228 /* re_dup will OpREFCNT_inc */
2229 OpREFCNT_set(sop, 1);
569233ed 2230 LEAVE;
c277df42 2231
830247a4
IZ
2232 n = add_data(pRExC_state, 3, "nop");
2233 RExC_rx->data->data[n] = (void*)rop;
2234 RExC_rx->data->data[n+1] = (void*)sop;
2235 RExC_rx->data->data[n+2] = (void*)av;
c277df42 2236 SvREFCNT_dec(sv);
a0ed51b3 2237 }
e24b16f9 2238 else { /* First pass */
830247a4 2239 if (PL_reginterp_cnt < ++RExC_seen_evals
e24b16f9 2240 && PL_curcop != &PL_compiling)
2cd61cdb
IZ
2241 /* No compiled RE interpolated, has runtime
2242 components ===> unsafe. */
2243 FAIL("Eval-group not allowed at runtime, use re 'eval'");
5b61d3f7 2244 if (PL_tainting && PL_tainted)
cc6b7395 2245 FAIL("Eval-group in insecure regular expression");
c277df42
IZ
2246 }
2247
830247a4 2248 nextchar(pRExC_state);
0f5d15d6 2249 if (logical) {
830247a4 2250 ret = reg_node(pRExC_state, LOGICAL);
0f5d15d6
IZ
2251 if (!SIZE_ONLY)
2252 ret->flags = 2;
830247a4 2253 regtail(pRExC_state, ret, reganode(pRExC_state, EVAL, n));
fac92740 2254 /* deal with the length of this later - MJD */
0f5d15d6
IZ
2255 return ret;
2256 }
830247a4 2257 return reganode(pRExC_state, EVAL, n);
c277df42 2258 }
fac92740 2259 case '(': /* (?(?{...})...) and (?(?=...)...) */
c277df42 2260 {
fac92740 2261 if (RExC_parse[0] == '?') { /* (?(?...)) */
b81d288d
AB
2262 if (RExC_parse[1] == '=' || RExC_parse[1] == '!'
2263 || RExC_parse[1] == '<'
830247a4 2264 || RExC_parse[1] == '{') { /* Lookahead or eval. */
c277df42
IZ
2265 I32 flag;
2266
830247a4 2267 ret = reg_node(pRExC_state, LOGICAL);
0f5d15d6
IZ
2268 if (!SIZE_ONLY)
2269 ret->flags = 1;
830247a4 2270 regtail(pRExC_state, ret, reg(pRExC_state, 1, &flag));
c277df42 2271 goto insert_if;
b81d288d 2272 }
a0ed51b3 2273 }
830247a4 2274 else if (RExC_parse[0] >= '1' && RExC_parse[0] <= '9' ) {
fac92740 2275 /* (?(1)...) */
830247a4 2276 parno = atoi(RExC_parse++);
c277df42 2277
830247a4
IZ
2278 while (isDIGIT(*RExC_parse))
2279 RExC_parse++;
fac92740
MJD
2280 ret = reganode(pRExC_state, GROUPP, parno);
2281
830247a4 2282 if ((c = *nextchar(pRExC_state)) != ')')
b45f050a 2283 vFAIL("Switch condition not recognized");
c277df42 2284 insert_if:
830247a4
IZ
2285 regtail(pRExC_state, ret, reganode(pRExC_state, IFTHEN, 0));
2286 br = regbranch(pRExC_state, &flags, 1);
c277df42 2287 if (br == NULL)
830247a4 2288 br = reganode(pRExC_state, LONGJMP, 0);
c277df42 2289 else
830247a4
IZ
2290 regtail(pRExC_state, br, reganode(pRExC_state, LONGJMP, 0));
2291 c = *nextchar(pRExC_state);
d1b80229
IZ
2292 if (flags&HASWIDTH)
2293 *flagp |= HASWIDTH;
c277df42 2294 if (c == '|') {
830247a4
IZ
2295 lastbr = reganode(pRExC_state, IFTHEN, 0); /* Fake one for optimizer. */
2296 regbranch(pRExC_state, &flags, 1);
2297 regtail(pRExC_state, ret, lastbr);
d1b80229
IZ
2298 if (flags&HASWIDTH)
2299 *flagp |= HASWIDTH;
830247a4 2300 c = *nextchar(pRExC_state);
a0ed51b3
LW
2301 }
2302 else
c277df42
IZ
2303 lastbr = NULL;
2304 if (c != ')')
8615cb43 2305 vFAIL("Switch (?(condition)... contains too many branches");
830247a4
IZ
2306 ender = reg_node(pRExC_state, TAIL);
2307 regtail(pRExC_state, br, ender);
c277df42 2308 if (lastbr) {
830247a4
IZ
2309 regtail(pRExC_state, lastbr, ender);
2310 regtail(pRExC_state, NEXTOPER(NEXTOPER(lastbr)), ender);
a0ed51b3
LW
2311 }
2312 else
830247a4 2313 regtail(pRExC_state, ret, ender);
c277df42 2314 return ret;
a0ed51b3
LW
2315 }
2316 else {
830247a4 2317 vFAIL2("Unknown switch condition (?(%.2s", RExC_parse);
c277df42
IZ
2318 }
2319 }
1b1626e4 2320 case 0:
830247a4 2321 RExC_parse--; /* for vFAIL to print correctly */
8615cb43 2322 vFAIL("Sequence (? incomplete");
1b1626e4 2323 break;
a0d0e21e 2324 default:
830247a4 2325 --RExC_parse;
fac92740 2326 parse_flags: /* (?i) */
830247a4 2327 while (*RExC_parse && strchr("iogcmsx", *RExC_parse)) {
9d1d55b5
JP
2328 /* (?g), (?gc) and (?o) are useless here
2329 and must be globally applied -- japhy */
2330
2331 if (*RExC_parse == 'o' || *RExC_parse == 'g') {
2332 if (SIZE_ONLY && ckWARN(WARN_REGEXP)) {
2333 I32 wflagbit = *RExC_parse == 'o' ? wasted_o : wasted_g;
2334 if (! (wastedflags & wflagbit) ) {
2335 wastedflags |= wflagbit;
2336 vWARN5(
2337 RExC_parse + 1,
2338 "Useless (%s%c) - %suse /%c modifier",
2339 flagsp == &negflags ? "?-" : "?",
2340 *RExC_parse,
2341 flagsp == &negflags ? "don't " : "",
2342 *RExC_parse
2343 );
2344 }
2345 }
2346 }
2347 else if (*RExC_parse == 'c') {
2348 if (SIZE_ONLY && ckWARN(WARN_REGEXP)) {
2349 if (! (wastedflags & wasted_c) ) {
2350 wastedflags |= wasted_gc;
2351 vWARN3(
2352 RExC_parse + 1,
2353 "Useless (%sc) - %suse /gc modifier",
2354 flagsp == &negflags ? "?-" : "?",
2355 flagsp == &negflags ? "don't " : ""
2356 );
2357 }
2358 }
2359 }
2360 else { pmflag(flagsp, *RExC_parse); }
2361
830247a4 2362 ++RExC_parse;
ca9dfc88 2363 }
830247a4 2364 if (*RExC_parse == '-') {
ca9dfc88 2365 flagsp = &negflags;
9d1d55b5 2366 wastedflags = 0; /* reset so (?g-c) warns twice */
830247a4 2367 ++RExC_parse;
ca9dfc88 2368 goto parse_flags;
48c036b1 2369 }
830247a4
IZ
2370 RExC_flags16 |= posflags;
2371 RExC_flags16 &= ~negflags;
2372 if (*RExC_parse == ':') {
2373 RExC_parse++;
ca9dfc88
IZ
2374 paren = ':';
2375 break;
2376 }
c277df42 2377 unknown:
830247a4
IZ
2378 if (*RExC_parse != ')') {
2379 RExC_parse++;
2380 vFAIL3("Sequence (%.*s...) not recognized", RExC_parse-seqstart, seqstart);
b45f050a 2381 }
830247a4 2382 nextchar(pRExC_state);
a0d0e21e
LW
2383 *flagp = TRYAGAIN;
2384 return NULL;
2385 }
2386 }
fac92740 2387 else { /* (...) */
830247a4
IZ
2388 parno = RExC_npar;
2389 RExC_npar++;
2390 ret = reganode(pRExC_state, OPEN, parno);
fac92740
MJD
2391 Set_Node_Length(ret, 1); /* MJD */
2392 Set_Node_Offset(ret, RExC_parse); /* MJD */
c277df42 2393 open = 1;
a0d0e21e 2394 }
a0ed51b3 2395 }
fac92740 2396 else /* ! paren */
a0d0e21e
LW
2397 ret = NULL;
2398
2399 /* Pick up the branches, linking them together. */
fac92740 2400 parse_start = RExC_parse; /* MJD */
830247a4 2401 br = regbranch(pRExC_state, &flags, 1);
fac92740
MJD
2402 /* branch_len = (paren != 0); */
2403
a0d0e21e
LW
2404 if (br == NULL)
2405 return(NULL);
830247a4
IZ
2406 if (*RExC_parse == '|') {
2407 if (!SIZE_ONLY && RExC_extralen) {
2408 reginsert(pRExC_state, BRANCHJ, br);
a0ed51b3 2409 }
fac92740 2410 else { /* MJD */
830247a4 2411 reginsert(pRExC_state, BRANCH, br);
fac92740
MJD
2412 Set_Node_Length(br, paren != 0);
2413 Set_Node_Offset_To_R(br-RExC_emit_start, parse_start-RExC_start);
2414 }
c277df42
IZ
2415 have_branch = 1;
2416 if (SIZE_ONLY)
830247a4 2417 RExC_extralen += 1; /* For BRANCHJ-BRANCH. */
a0ed51b3
LW
2418 }
2419 else if (paren == ':') {
c277df42
IZ
2420 *flagp |= flags&SIMPLE;
2421 }
2422 if (open) { /* Starts with OPEN. */
830247a4 2423 regtail(pRExC_state, ret, br); /* OPEN -> first. */
a0ed51b3
LW
2424 }
2425 else if (paren != '?') /* Not Conditional */
a0d0e21e 2426 ret = br;
32a0ca98 2427 *flagp |= flags & (SPSTART | HASWIDTH);
c277df42 2428 lastbr = br;
830247a4
IZ
2429 while (*RExC_parse == '|') {
2430 if (!SIZE_ONLY && RExC_extralen) {
2431 ender = reganode(pRExC_state, LONGJMP,0);
2432 regtail(pRExC_state, NEXTOPER(NEXTOPER(lastbr)), ender); /* Append to the previous. */
c277df42
IZ
2433 }
2434 if (SIZE_ONLY)
830247a4
IZ
2435 RExC_extralen += 2; /* Account for LONGJMP. */
2436 nextchar(pRExC_state);
2437 br = regbranch(pRExC_state, &flags, 0);
fac92740 2438
a687059c 2439 if (br == NULL)
a0d0e21e 2440 return(NULL);
830247a4 2441 regtail(pRExC_state, lastbr, br); /* BRANCH -> BRANCH. */
c277df42 2442 lastbr = br;
821b33a5
IZ
2443 if (flags&HASWIDTH)
2444 *flagp |= HASWIDTH;
a687059c 2445 *flagp |= flags&SPSTART;
a0d0e21e
LW
2446 }
2447
c277df42
IZ
2448 if (have_branch || paren != ':') {
2449 /* Make a closing node, and hook it on the end. */
2450 switch (paren) {
2451 case ':':
830247a4 2452 ender = reg_node(pRExC_state, TAIL);
c277df42
IZ
2453 break;
2454 case 1:
830247a4 2455 ender = reganode(pRExC_state, CLOSE, parno);
fac92740
MJD
2456 Set_Node_Offset(ender,RExC_parse+1); /* MJD */
2457 Set_Node_Length(ender,1); /* MJD */
c277df42
IZ
2458 break;
2459 case '<':
c277df42
IZ
2460 case ',':
2461 case '=':
2462 case '!':
c277df42 2463 *flagp &= ~HASWIDTH;
821b33a5
IZ
2464 /* FALL THROUGH */
2465 case '>':
830247a4 2466 ender = reg_node(pRExC_state, SUCCEED);
c277df42
IZ
2467 break;
2468 case 0:
830247a4 2469 ender = reg_node(pRExC_state, END);
c277df42
IZ
2470 break;
2471 }
830247a4 2472 regtail(pRExC_state, lastbr, ender);
a0d0e21e 2473
c277df42
IZ
2474 if (have_branch) {
2475 /* Hook the tails of the branches to the closing node. */
2476 for (br = ret; br != NULL; br = regnext(br)) {
830247a4 2477 regoptail(pRExC_state, br, ender);
c277df42
IZ
2478 }
2479 }
a0d0e21e 2480 }
c277df42
IZ
2481
2482 {
2483 char *p;
2484 static char parens[] = "=!<,>";
2485
2486 if (paren && (p = strchr(parens, paren))) {
2487 int node = ((p - parens) % 2) ? UNLESSM : IFMATCH;
2488 int flag = (p - parens) > 1;
2489
2490 if (paren == '>')
2491 node = SUSPEND, flag = 0;
830247a4 2492 reginsert(pRExC_state, node,ret);
c277df42 2493 ret->flags = flag;
830247a4 2494 regtail(pRExC_state, ret, reg_node(pRExC_state, TAIL));
c277df42 2495 }
a0d0e21e
LW
2496 }
2497
2498 /* Check for proper termination. */
ce3e6498 2499 if (paren) {
830247a4
IZ
2500 RExC_flags16 = oregflags;
2501 if (RExC_parse >= RExC_end || *nextchar(pRExC_state) != ')') {
2502 RExC_parse = oregcomp_parse;
380a0633 2503 vFAIL("Unmatched (");
ce3e6498 2504 }
a0ed51b3 2505 }
830247a4
IZ
2506 else if (!paren && RExC_parse < RExC_end) {
2507 if (*RExC_parse == ')') {
2508 RExC_parse++;
380a0633 2509 vFAIL("Unmatched )");
a0ed51b3
LW
2510 }
2511 else
b45f050a 2512 FAIL("Junk on end of regexp"); /* "Can't happen". */
a0d0e21e
LW
2513 /* NOTREACHED */
2514 }
a687059c 2515
a0d0e21e 2516 return(ret);
a687059c
LW
2517}
2518
2519/*
2520 - regbranch - one alternative of an | operator
2521 *
2522 * Implements the concatenation operator.
2523 */
76e3520e 2524STATIC regnode *
830247a4 2525S_regbranch(pTHX_ RExC_state_t *pRExC_state, I32 *flagp, I32 first)
a687059c 2526{
c277df42
IZ
2527 register regnode *ret;
2528 register regnode *chain = NULL;
2529 register regnode *latest;
2530 I32 flags = 0, c = 0;
a0d0e21e 2531
b81d288d 2532 if (first)
c277df42
IZ
2533 ret = NULL;
2534 else {
b81d288d 2535 if (!SIZE_ONLY && RExC_extralen)
830247a4 2536 ret = reganode(pRExC_state, BRANCHJ,0);
fac92740 2537 else {
830247a4 2538 ret = reg_node(pRExC_state, BRANCH);
fac92740
MJD
2539 Set_Node_Length(ret, 1);
2540 }
c277df42
IZ
2541 }
2542
b81d288d 2543 if (!first && SIZE_ONLY)
830247a4 2544 RExC_extralen += 1; /* BRANCHJ */
b81d288d 2545
c277df42 2546 *flagp = WORST; /* Tentatively. */
a0d0e21e 2547
830247a4
IZ
2548 RExC_parse--;
2549 nextchar(pRExC_state);
2550 while (RExC_parse < RExC_end && *RExC_parse != '|' && *RExC_parse != ')') {
a0d0e21e 2551 flags &= ~TRYAGAIN;
830247a4 2552 latest = regpiece(pRExC_state, &flags);
a0d0e21e
LW
2553 if (latest == NULL) {
2554 if (flags & TRYAGAIN)
2555 continue;
2556 return(NULL);
a0ed51b3
LW
2557 }
2558 else if (ret == NULL)
c277df42 2559 ret = latest;
a0d0e21e 2560 *flagp |= flags&HASWIDTH;
c277df42 2561 if (chain == NULL) /* First piece. */
a0d0e21e
LW
2562 *flagp |= flags&SPSTART;
2563 else {
830247a4
IZ
2564 RExC_naughty++;
2565 regtail(pRExC_state, chain, latest);
a687059c 2566 }
a0d0e21e 2567 chain = latest;
c277df42
IZ
2568 c++;
2569 }
2570 if (chain == NULL) { /* Loop ran zero times. */
830247a4 2571 chain = reg_node(pRExC_state, NOTHING);
c277df42
IZ
2572 if (ret == NULL)
2573 ret = chain;
2574 }
2575 if (c == 1) {
2576 *flagp |= flags&SIMPLE;
a0d0e21e 2577 }
a687059c 2578
a0d0e21e 2579 return(ret);
a687059c
LW
2580}
2581
2582/*
2583 - regpiece - something followed by possible [*+?]
2584 *
2585 * Note that the branching code sequences used for ? and the general cases
2586 * of * and + are somewhat optimized: they use the same NOTHING node as
2587 * both the endmarker for their branch list and the body of the last branch.
2588 * It might seem that this node could be dispensed with entirely, but the
2589 * endmarker role is not redundant.
2590 */
76e3520e 2591STATIC regnode *
830247a4 2592S_regpiece(pTHX_ RExC_state_t *pRExC_state, I32 *flagp)
a687059c 2593{
c277df42 2594 register regnode *ret;
a0d0e21e
LW
2595 register char op;
2596 register char *next;
2597 I32 flags;
830247a4 2598 char *origparse = RExC_parse;
a0d0e21e
LW
2599 char *maxpos;
2600 I32 min;
c277df42 2601 I32 max = REG_INFTY;
fac92740 2602 char *parse_start;
a0d0e21e 2603
830247a4 2604 ret = regatom(pRExC_state, &flags);
a0d0e21e
LW
2605 if (ret == NULL) {
2606 if (flags & TRYAGAIN)
2607 *flagp |= TRYAGAIN;
2608 return(NULL);
2609 }
2610
830247a4 2611 op = *RExC_parse;
a0d0e21e 2612
830247a4 2613 if (op == '{' && regcurly(RExC_parse)) {
fac92740 2614 parse_start = RExC_parse; /* MJD */
830247a4 2615 next = RExC_parse + 1;
a0d0e21e
LW
2616 maxpos = Nullch;
2617 while (isDIGIT(*next) || *next == ',') {
2618 if (*next == ',') {
2619 if (maxpos)
2620 break;
2621 else
2622 maxpos = next;
a687059c 2623 }
a0d0e21e
LW
2624 next++;
2625 }
2626 if (*next == '}') { /* got one */
2627 if (!maxpos)
2628 maxpos = next;
830247a4
IZ
2629 RExC_parse++;
2630 min = atoi(RExC_parse);
a0d0e21e
LW
2631 if (*maxpos == ',')
2632 maxpos++;
2633 else
830247a4 2634 maxpos = RExC_parse;
a0d0e21e
LW
2635 max = atoi(maxpos);
2636 if (!max && *maxpos != '0')
c277df42
IZ
2637 max = REG_INFTY; /* meaning "infinity" */
2638 else if (max >= REG_INFTY)
8615cb43 2639 vFAIL2("Quantifier in {,} bigger than %d", REG_INFTY - 1);
830247a4
IZ
2640 RExC_parse = next;
2641 nextchar(pRExC_state);
a0d0e21e
LW
2642
2643 do_curly:
2644 if ((flags&SIMPLE)) {
830247a4
IZ
2645 RExC_naughty += 2 + RExC_naughty / 2;
2646 reginsert(pRExC_state, CURLY, ret);
fac92740
MJD
2647 Set_Node_Offset(ret, parse_start+1); /* MJD */
2648 Set_Node_Cur_Length(ret);
a0d0e21e
LW
2649 }
2650 else {
830247a4 2651 regnode *w = reg_node(pRExC_state, WHILEM);
2c2d71f5
JH
2652
2653 w->flags = 0;
830247a4
IZ
2654 regtail(pRExC_state, ret, w);
2655 if (!SIZE_ONLY && RExC_extralen) {
2656 reginsert(pRExC_state, LONGJMP,ret);
2657 reginsert(pRExC_state, NOTHING,ret);
c277df42
IZ
2658 NEXT_OFF(ret) = 3; /* Go over LONGJMP. */
2659 }
830247a4 2660 reginsert(pRExC_state, CURLYX,ret);
fac92740
MJD
2661 /* MJD hk */
2662 Set_Node_Offset(ret, parse_start+1);
2663 Set_Node_Length(ret,
2664 op == '{' ? (RExC_parse - parse_start) : 1);
2665
830247a4 2666 if (!SIZE_ONLY && RExC_extralen)
c277df42 2667 NEXT_OFF(ret) = 3; /* Go over NOTHING to LONGJMP. */
830247a4 2668 regtail(pRExC_state, ret, reg_node(pRExC_state, NOTHING));
c277df42 2669 if (SIZE_ONLY)
830247a4
IZ
2670 RExC_whilem_seen++, RExC_extralen += 3;
2671 RExC_naughty += 4 + RExC_naughty; /* compound interest */
a0d0e21e 2672 }
c277df42 2673 ret->flags = 0;
a0d0e21e
LW
2674
2675 if (min > 0)
821b33a5
IZ
2676 *flagp = WORST;
2677 if (max > 0)
2678 *flagp |= HASWIDTH;
a0d0e21e 2679 if (max && max < min)
8615cb43 2680 vFAIL("Can't do {n,m} with n > m");
c277df42
IZ
2681 if (!SIZE_ONLY) {
2682 ARG1_SET(ret, min);
2683 ARG2_SET(ret, max);
a687059c 2684 }
a687059c 2685
a0d0e21e 2686 goto nest_check;
a687059c 2687 }
a0d0e21e 2688 }
a687059c 2689
a0d0e21e
LW
2690 if (!ISMULT1(op)) {
2691 *flagp = flags;
a687059c 2692 return(ret);
a0d0e21e 2693 }
bb20fd44 2694
c277df42 2695#if 0 /* Now runtime fix should be reliable. */
b45f050a
JF
2696
2697 /* if this is reinstated, don't forget to put this back into perldiag:
2698
2699 =item Regexp *+ operand could be empty at {#} in regex m/%s/
2700
2701 (F) The part of the regexp subject to either the * or + quantifier
2702 could match an empty string. The {#} shows in the regular
2703 expression about where the problem was discovered.
2704
2705 */
2706
bb20fd44 2707 if (!(flags&HASWIDTH) && op != '?')
b45f050a 2708 vFAIL("Regexp *+ operand could be empty");
b81d288d 2709#endif
bb20fd44 2710
fac92740 2711 parse_start = RExC_parse;
830247a4 2712 nextchar(pRExC_state);
a0d0e21e 2713
821b33a5 2714 *flagp = (op != '+') ? (WORST|SPSTART|HASWIDTH) : (WORST|HASWIDTH);
a0d0e21e
LW
2715
2716 if (op == '*' && (flags&SIMPLE)) {
830247a4 2717 reginsert(pRExC_state, STAR, ret);
c277df42 2718 ret->flags = 0;
830247a4 2719 RExC_naughty += 4;
a0d0e21e
LW
2720 }
2721 else if (op == '*') {
2722 min = 0;
2723 goto do_curly;
a0ed51b3
LW
2724 }
2725 else if (op == '+' && (flags&SIMPLE)) {
830247a4 2726 reginsert(pRExC_state, PLUS, ret);
c277df42 2727 ret->flags = 0;
830247a4 2728 RExC_naughty += 3;
a0d0e21e
LW
2729 }
2730 else if (op == '+') {
2731 min = 1;
2732 goto do_curly;
a0ed51b3
LW
2733 }
2734 else if (op == '?') {
a0d0e21e
LW
2735 min = 0; max = 1;
2736 goto do_curly;
2737 }
2738 nest_check:
e476b1b5 2739 if (ckWARN(WARN_REGEXP) && !SIZE_ONLY && !(flags&HASWIDTH) && max > REG_INFTY/3) {
830247a4 2740 vWARN3(RExC_parse,
b45f050a 2741 "%.*s matches null string many times",
830247a4 2742 RExC_parse - origparse,
b45f050a 2743 origparse);
a0d0e21e
LW
2744 }
2745
830247a4
IZ
2746 if (*RExC_parse == '?') {
2747 nextchar(pRExC_state);
2748 reginsert(pRExC_state, MINMOD, ret);
2749 regtail(pRExC_state, ret, ret + NODE_STEP_REGNODE);
a0d0e21e 2750 }
830247a4
IZ
2751 if (ISMULT2(RExC_parse)) {
2752 RExC_parse++;
b45f050a
JF
2753 vFAIL("Nested quantifiers");
2754 }
a0d0e21e
LW
2755
2756 return(ret);
a687059c
LW
2757}
2758
2759/*
2760 - regatom - the lowest level
2761 *
2762 * Optimization: gobbles an entire sequence of ordinary characters so that
2763 * it can turn them into a single node, which is smaller to store and
2764 * faster to run. Backslashed characters are exceptions, each becoming a
2765 * separate node; the code is simpler that way and it's not worth fixing.
2766 *
b45f050a 2767 * [Yes, it is worth fixing, some scripts can run twice the speed.] */
76e3520e 2768STATIC regnode *
830247a4 2769S_regatom(pTHX_ RExC_state_t *pRExC_state, I32 *flagp)
a687059c 2770{
c277df42 2771 register regnode *ret = 0;
a0d0e21e 2772 I32 flags;
f06dbbb7 2773 char *parse_start = 0;
a0d0e21e
LW
2774
2775 *flagp = WORST; /* Tentatively. */
2776
2777tryagain:
830247a4 2778 switch (*RExC_parse) {
a0d0e21e 2779 case '^':
830247a4
IZ
2780 RExC_seen_zerolen++;
2781 nextchar(pRExC_state);
2782 if (RExC_flags16 & PMf_MULTILINE)
2783 ret = reg_node(pRExC_state, MBOL);
2784 else if (RExC_flags16 & PMf_SINGLELINE)
2785 ret = reg_node(pRExC_state, SBOL);
a0d0e21e 2786 else
830247a4 2787 ret = reg_node(pRExC_state, BOL);
fac92740 2788 Set_Node_Length(ret, 1); /* MJD */
a0d0e21e
LW
2789 break;
2790 case '$':
830247a4 2791 nextchar(pRExC_state);
b81d288d 2792 if (*RExC_parse)
830247a4
IZ
2793 RExC_seen_zerolen++;
2794 if (RExC_flags16 & PMf_MULTILINE)
2795 ret = reg_node(pRExC_state, MEOL);
2796 else if (RExC_flags16 & PMf_SINGLELINE)
2797 ret = reg_node(pRExC_state, SEOL);
a0d0e21e 2798 else
830247a4 2799 ret = reg_node(pRExC_state, EOL);
fac92740 2800 Set_Node_Length(ret, 1); /* MJD */
a0d0e21e
LW
2801 break;
2802 case '.':
830247a4 2803 nextchar(pRExC_state);
cce850e4 2804 if (RExC_flags16 & PMf_SINGLELINE)
ffc61ed2
JH
2805 ret = reg_node(pRExC_state, SANY);
2806 else
2807 ret = reg_node(pRExC_state, REG_ANY);
2808 *flagp |= HASWIDTH|SIMPLE;
830247a4 2809 RExC_naughty++;
fac92740 2810 Set_Node_Length(ret, 1); /* MJD */
a0d0e21e
LW
2811 break;
2812 case '[':
b45f050a 2813 {
830247a4 2814 char *oregcomp_parse = ++RExC_parse;
ffc61ed2 2815 ret = regclass(pRExC_state);
830247a4
IZ
2816 if (*RExC_parse != ']') {
2817 RExC_parse = oregcomp_parse;
b45f050a
JF
2818 vFAIL("Unmatched [");
2819 }
830247a4 2820 nextchar(pRExC_state);
a0d0e21e 2821 *flagp |= HASWIDTH|SIMPLE;
fac92740 2822 Set_Node_Length(ret, RExC_parse - oregcomp_parse + 1); /* MJD */
a0d0e21e 2823 break;
b45f050a 2824 }
a0d0e21e 2825 case '(':
830247a4
IZ
2826 nextchar(pRExC_state);
2827 ret = reg(pRExC_state, 1, &flags);
a0d0e21e 2828 if (ret == NULL) {
bf93d4cc 2829 if (flags & TRYAGAIN) {
830247a4 2830 if (RExC_parse == RExC_end) {
bf93d4cc
GS
2831 /* Make parent create an empty node if needed. */
2832 *flagp |= TRYAGAIN;
2833 return(NULL);
2834 }
a0d0e21e 2835 goto tryagain;
bf93d4cc 2836 }
a0d0e21e
LW
2837 return(NULL);
2838 }
c277df42 2839 *flagp |= flags&(HASWIDTH|SPSTART|SIMPLE);
a0d0e21e
LW
2840 break;
2841 case '|':
2842 case ')':
2843 if (flags & TRYAGAIN) {
2844 *flagp |= TRYAGAIN;
2845 return NULL;
2846 }
b45f050a 2847 vFAIL("Internal urp");
a0d0e21e
LW
2848 /* Supposed to be caught earlier. */
2849 break;
85afd4ae 2850 case '{':
830247a4
IZ
2851 if (!regcurly(RExC_parse)) {
2852 RExC_parse++;
85afd4ae
CS
2853 goto defchar;
2854 }
2855 /* FALL THROUGH */
a0d0e21e
LW
2856 case '?':
2857 case '+':
2858 case '*':
830247a4 2859 RExC_parse++;
b45f050a 2860 vFAIL("Quantifier follows nothing");
a0d0e21e
LW
2861 break;
2862 case '\\':
830247a4 2863 switch (*++RExC_parse) {
a0d0e21e 2864 case 'A':
830247a4
IZ
2865 RExC_seen_zerolen++;
2866 ret = reg_node(pRExC_state, SBOL);
a0d0e21e 2867 *flagp |= SIMPLE;
830247a4 2868 nextchar(pRExC_state);
fac92740 2869 Set_Node_Length(ret, 2); /* MJD */
a0d0e21e
LW
2870 break;
2871 case 'G':
830247a4
IZ
2872 ret = reg_node(pRExC_state, GPOS);
2873 RExC_seen |= REG_SEEN_GPOS;
a0d0e21e 2874 *flagp |= SIMPLE;
830247a4 2875 nextchar(pRExC_state);
fac92740 2876 Set_Node_Length(ret, 2); /* MJD */
a0d0e21e
LW
2877 break;
2878 case 'Z':
830247a4 2879 ret = reg_node(pRExC_state, SEOL);
a0d0e21e 2880 *flagp |= SIMPLE;
a1917ab9 2881 RExC_seen_zerolen++; /* Do not optimize RE away */
830247a4 2882 nextchar(pRExC_state);
a0d0e21e 2883 break;
b85d18e9 2884 case 'z':
830247a4 2885 ret = reg_node(pRExC_state, EOS);
b85d18e9 2886 *flagp |= SIMPLE;
830247a4
IZ
2887 RExC_seen_zerolen++; /* Do not optimize RE away */
2888 nextchar(pRExC_state);
fac92740 2889 Set_Node_Length(ret, 2); /* MJD */
b85d18e9 2890 break;
4a2d328f 2891 case 'C':
f33976b4
DB
2892 ret = reg_node(pRExC_state, CANY);
2893 RExC_seen |= REG_SEEN_CANY;
a0ed51b3 2894 *flagp |= HASWIDTH|SIMPLE;
830247a4 2895 nextchar(pRExC_state);
fac92740 2896 Set_Node_Length(ret, 2); /* MJD */
a0ed51b3
LW
2897 break;
2898 case 'X':
830247a4 2899 ret = reg_node(pRExC_state, CLUMP);
a0ed51b3 2900 *flagp |= HASWIDTH;
830247a4 2901 nextchar(pRExC_state);
fac92740 2902 Set_Node_Length(ret, 2); /* MJD */
a0ed51b3 2903 break;
a0d0e21e 2904 case 'w':
ffc61ed2 2905 ret = reg_node(pRExC_state, LOC ? ALNUML : ALNUM);
a0d0e21e 2906 *flagp |= HASWIDTH|SIMPLE;
830247a4 2907 nextchar(pRExC_state);
fac92740 2908 Set_Node_Length(ret, 2); /* MJD */
a0d0e21e
LW
2909 break;
2910 case 'W':
ffc61ed2 2911 ret = reg_node(pRExC_state, LOC ? NALNUML : NALNUM);
a0d0e21e 2912 *flagp |= HASWIDTH|SIMPLE;
830247a4 2913 nextchar(pRExC_state);
fac92740 2914 Set_Node_Length(ret, 2); /* MJD */
a0d0e21e
LW
2915 break;
2916 case 'b':
830247a4
IZ
2917 RExC_seen_zerolen++;
2918 RExC_seen |= REG_SEEN_LOOKBEHIND;
ffc61ed2 2919 ret = reg_node(pRExC_state, LOC ? BOUNDL : BOUND);
a0d0e21e 2920 *flagp |= SIMPLE;
830247a4 2921 nextchar(pRExC_state);
fac92740 2922 Set_Node_Length(ret, 2); /* MJD */
a0d0e21e
LW
2923 break;
2924 case 'B':
830247a4
IZ
2925 RExC_seen_zerolen++;
2926 RExC_seen |= REG_SEEN_LOOKBEHIND;
ffc61ed2 2927 ret = reg_node(pRExC_state, LOC ? NBOUNDL : NBOUND);
a0d0e21e 2928 *flagp |= SIMPLE;
830247a4 2929 nextchar(pRExC_state);
fac92740 2930 Set_Node_Length(ret, 2); /* MJD */
a0d0e21e
LW
2931 break;
2932 case 's':
ffc61ed2 2933 ret = reg_node(pRExC_state, LOC ? SPACEL : SPACE);
a0d0e21e 2934 *flagp |= HASWIDTH|SIMPLE;
830247a4 2935 nextchar(pRExC_state);
fac92740 2936 Set_Node_Length(ret, 2); /* MJD */
a0d0e21e
LW
2937 break;
2938 case 'S':
ffc61ed2 2939 ret = reg_node(pRExC_state, LOC ? NSPACEL : NSPACE);
a0d0e21e 2940 *flagp |= HASWIDTH|SIMPLE;
830247a4 2941 nextchar(pRExC_state);
fac92740 2942 Set_Node_Length(ret, 2); /* MJD */
a0d0e21e
LW
2943 break;
2944 case 'd':
ffc61ed2 2945 ret = reg_node(pRExC_state, DIGIT);
a0d0e21e 2946 *flagp |= HASWIDTH|SIMPLE;
830247a4 2947 nextchar(pRExC_state);
fac92740 2948 Set_Node_Length(ret, 2); /* MJD */
a0d0e21e
LW
2949 break;
2950 case 'D':
ffc61ed2 2951 ret = reg_node(pRExC_state, NDIGIT);
a0d0e21e 2952 *flagp |= HASWIDTH|SIMPLE;
830247a4 2953 nextchar(pRExC_state);
fac92740 2954 Set_Node_Length(ret, 2); /* MJD */
a0d0e21e 2955 break;
a14b48bc
LW
2956 case 'p':
2957 case 'P':
3568d838 2958 {
830247a4 2959 char* oldregxend = RExC_end;
fac92740 2960 char* parse_start = RExC_parse;
a14b48bc 2961
830247a4 2962 if (RExC_parse[1] == '{') {
3568d838 2963 /* a lovely hack--pretend we saw [\pX] instead */
830247a4
IZ
2964 RExC_end = strchr(RExC_parse, '}');
2965 if (!RExC_end) {
0da60cf5 2966 U8 c = (U8)*RExC_parse;
830247a4
IZ
2967 RExC_parse += 2;
2968 RExC_end = oldregxend;
0da60cf5 2969 vFAIL2("Missing right brace on \\%c{}", c);
b45f050a 2970 }
830247a4 2971 RExC_end++;
a14b48bc
LW
2972 }
2973 else
830247a4
IZ
2974 RExC_end = RExC_parse + 2;
2975 RExC_parse--;
a14b48bc 2976
ffc61ed2 2977 ret = regclass(pRExC_state);
a14b48bc 2978
830247a4
IZ
2979 RExC_end = oldregxend;
2980 RExC_parse--;
fac92740 2981 Set_Node_Cur_Length(ret); /* MJD */
830247a4 2982 nextchar(pRExC_state);
a14b48bc
LW
2983 *flagp |= HASWIDTH|SIMPLE;
2984 }
2985 break;
a0d0e21e
LW
2986 case 'n':
2987 case 'r':
2988 case 't':
2989 case 'f':
2990 case 'e':
2991 case 'a':
2992 case 'x':
2993 case 'c':
2994 case '0':
2995 goto defchar;
2996 case '1': case '2': case '3': case '4':
2997 case '5': case '6': case '7': case '8': case '9':
2998 {
830247a4 2999 I32 num = atoi(RExC_parse);
a0d0e21e 3000
830247a4 3001 if (num > 9 && num >= RExC_npar)
a0d0e21e
LW
3002 goto defchar;
3003 else {
fac92740 3004 char * parse_start = RExC_parse - 1; /* MJD */
830247a4
IZ
3005 while (isDIGIT(*RExC_parse))
3006 RExC_parse++;
b45f050a 3007
830247a4 3008 if (!SIZE_ONLY && num > RExC_rx->nparens)
9baa0206 3009 vFAIL("Reference to nonexistent group");
830247a4
IZ
3010 RExC_sawback = 1;
3011 ret = reganode(pRExC_state, FOLD
a0ed51b3 3012 ? (LOC ? REFFL : REFF)
c8756f30 3013 : REF, num);
a0d0e21e 3014 *flagp |= HASWIDTH;
fac92740
MJD
3015
3016 /* override incorrect value set in reganode MJD */
3017 Set_Node_Offset(ret, parse_start+1);
3018 Set_Node_Cur_Length(ret); /* MJD */
830247a4
IZ
3019 RExC_parse--;
3020 nextchar(pRExC_state);
a0d0e21e
LW
3021 }
3022 }
3023 break;
3024 case '\0':
830247a4 3025 if (RExC_parse >= RExC_end)
b45f050a 3026 FAIL("Trailing \\");
a0d0e21e
LW
3027 /* FALL THROUGH */
3028 default:
c9f97d15
IZ
3029 /* Do not generate `unrecognized' warnings here, we fall
3030 back into the quick-grab loop below */
a0d0e21e
LW
3031 goto defchar;
3032 }
3033 break;
4633a7c4
LW
3034
3035 case '#':
830247a4
IZ
3036 if (RExC_flags16 & PMf_EXTENDED) {
3037 while (RExC_parse < RExC_end && *RExC_parse != '\n') RExC_parse++;
3038 if (RExC_parse < RExC_end)
4633a7c4
LW
3039 goto tryagain;
3040 }
3041 /* FALL THROUGH */
3042
a0d0e21e 3043 default: {
ba210ebe 3044 register STRLEN len;
58ae7d3f 3045 register UV ender;
a0d0e21e 3046 register char *p;
c277df42 3047 char *oldp, *s;
ba210ebe 3048 STRLEN numlen;
80aecb99 3049 STRLEN foldlen;
60a8b682 3050 U8 tmpbuf[UTF8_MAXLEN_FOLD+1], *foldbuf;
f06dbbb7
JH
3051
3052 parse_start = RExC_parse - 1;
a0d0e21e 3053
830247a4 3054 RExC_parse++;
a0d0e21e
LW
3055
3056 defchar:
58ae7d3f 3057 ender = 0;
830247a4 3058 ret = reg_node(pRExC_state, FOLD
a0ed51b3 3059 ? (LOC ? EXACTFL : EXACTF)
bbce6d69 3060 : EXACT);
cd439c50 3061 s = STRING(ret);
830247a4
IZ
3062 for (len = 0, p = RExC_parse - 1;
3063 len < 127 && p < RExC_end;
a0d0e21e
LW
3064 len++)
3065 {
3066 oldp = p;
5b5a24f7 3067
830247a4
IZ
3068 if (RExC_flags16 & PMf_EXTENDED)
3069 p = regwhite(p, RExC_end);
a0d0e21e
LW
3070 switch (*p) {
3071 case '^':
3072 case '$':
3073 case '.':
3074 case '[':
3075 case '(':
3076 case ')':
3077 case '|':
3078 goto loopdone;
3079 case '\\':
3080 switch (*++p) {
3081 case 'A':
1ed8eac0
JF
3082 case 'C':
3083 case 'X':
a0d0e21e
LW
3084 case 'G':
3085 case 'Z':
b85d18e9 3086 case 'z':
a0d0e21e
LW
3087 case 'w':
3088 case 'W':
3089 case 'b':
3090 case 'B':
3091 case 's':
3092 case 'S':
3093 case 'd':
3094 case 'D':
a14b48bc
LW
3095 case 'p':
3096 case 'P':
a0d0e21e
LW
3097 --p;
3098 goto loopdone;
3099 case 'n':
3100 ender = '\n';
3101 p++;
a687059c 3102 break;
a0d0e21e
LW
3103 case 'r':
3104 ender = '\r';
3105 p++;
a687059c 3106 break;
a0d0e21e
LW
3107 case 't':
3108 ender = '\t';
3109 p++;
a687059c 3110 break;
a0d0e21e
LW
3111 case 'f':
3112 ender = '\f';
3113 p++;
a687059c 3114 break;
a0d0e21e 3115 case 'e':
c7f1f016 3116 ender = ASCII_TO_NATIVE('\033');
a0d0e21e 3117 p++;
a687059c 3118 break;
a0d0e21e 3119 case 'a':
c7f1f016 3120 ender = ASCII_TO_NATIVE('\007');
a0d0e21e 3121 p++;
a687059c 3122 break;
a0d0e21e 3123 case 'x':
a0ed51b3
LW
3124 if (*++p == '{') {
3125 char* e = strchr(p, '}');
b81d288d 3126
b45f050a 3127 if (!e) {
830247a4 3128 RExC_parse = p + 1;
b45f050a
JF
3129 vFAIL("Missing right brace on \\x{}");
3130 }
de5f0749 3131 else {
a4c04bdc
NC
3132 I32 flags = PERL_SCAN_ALLOW_UNDERSCORES
3133 | PERL_SCAN_DISALLOW_PREFIX;
53305cf1
NC
3134 numlen = e - p - 1;
3135 ender = grok_hex(p + 1, &numlen, &flags, NULL);
aaa80028
JH
3136 if (ender > 0xff)
3137 RExC_utf8 = 1;
b21ed0a9
GS
3138 /* numlen is generous */
3139 if (numlen + len >= 127) {
a0ed51b3
LW
3140 p--;
3141 goto loopdone;
3142 }
3143 p = e + 1;
3144 }
a0ed51b3
LW
3145 }
3146 else {
a4c04bdc 3147 I32 flags = PERL_SCAN_DISALLOW_PREFIX;
53305cf1
NC
3148 numlen = 2;
3149 ender = grok_hex(p, &numlen, &flags, NULL);
a0ed51b3
LW
3150 p += numlen;
3151 }
a687059c 3152 break;
a0d0e21e
LW
3153 case 'c':
3154 p++;
bbce6d69 3155 ender = UCHARAT(p++);
3156 ender = toCTRL(ender);
a687059c 3157 break;
a0d0e21e
LW
3158 case '0': case '1': case '2': case '3':case '4':
3159 case '5': case '6': case '7': case '8':case '9':
3160 if (*p == '0' ||
830247a4 3161 (isDIGIT(p[1]) && atoi(p) >= RExC_npar) ) {
53305cf1
NC
3162 I32 flags = 0;
3163 numlen = 3;
3164 ender = grok_oct(p, &numlen, &flags, NULL);
a0d0e21e
LW
3165 p += numlen;
3166 }
3167 else {
3168 --p;
3169 goto loopdone;
a687059c
LW
3170 }
3171 break;
a0d0e21e 3172 case '\0':
830247a4 3173 if (p >= RExC_end)
b45f050a 3174 FAIL("Trailing \\");
a687059c 3175 /* FALL THROUGH */
a0d0e21e 3176 default:
e476b1b5 3177 if (!SIZE_ONLY && ckWARN(WARN_REGEXP) && isALPHA(*p))
4193bef7 3178 vWARN2(p + 1, "Unrecognized escape \\%c passed through", UCHARAT(p));
a0ed51b3 3179 goto normal_default;
a0d0e21e
LW
3180 }
3181 break;
a687059c 3182 default:
a0ed51b3 3183 normal_default:
fd400ab9 3184 if (UTF8_IS_START(*p) && UTF) {
5e12f4fb 3185 ender = utf8n_to_uvchr((U8*)p, RExC_end - p,
ba210ebe 3186 &numlen, 0);
a0ed51b3
LW
3187 p += numlen;
3188 }
3189 else
3190 ender = *p++;
a0d0e21e 3191 break;
a687059c 3192 }
830247a4
IZ
3193 if (RExC_flags16 & PMf_EXTENDED)
3194 p = regwhite(p, RExC_end);
60a8b682
JH
3195 if (UTF && FOLD) {
3196 /* Prime the casefolded buffer. */
ac7e0132 3197 ender = toFOLD_uni(ender, tmpbuf, &foldlen);
60a8b682 3198 }
a0d0e21e
LW
3199 if (ISMULT2(p)) { /* Back off on ?+*. */
3200 if (len)
3201 p = oldp;
16ea2a2e 3202 else if (UTF) {
0ebc6274
JH
3203 STRLEN unilen;
3204
80aecb99 3205 if (FOLD) {
60a8b682 3206 /* Emit all the Unicode characters. */
80aecb99
JH
3207 for (foldbuf = tmpbuf;
3208 foldlen;
3209 foldlen -= numlen) {
3210 ender = utf8_to_uvchr(foldbuf, &numlen);
9dc45d57 3211 if (numlen > 0) {
0ebc6274
JH
3212 reguni(pRExC_state, ender, s, &unilen);
3213 s += unilen;
3214 len += unilen;
3215 /* In EBCDIC the numlen
3216 * and unilen can differ. */
9dc45d57 3217 foldbuf += numlen;
47654450
JH
3218 if (numlen >= foldlen)
3219 break;
9dc45d57
JH
3220 }
3221 else
3222 break; /* "Can't happen." */
80aecb99
JH
3223 }
3224 }
3225 else {
0ebc6274 3226 reguni(pRExC_state, ender, s, &unilen);
9ede7db1 3227 if (unilen > 0) {
0ebc6274
JH
3228 s += unilen;
3229 len += unilen;
9dc45d57 3230 }
80aecb99 3231 }
a0ed51b3 3232 }
a0d0e21e
LW
3233 else {
3234 len++;
cd439c50 3235 REGC(ender, s++);
a0d0e21e
LW
3236 }
3237 break;
a687059c 3238 }
16ea2a2e 3239 if (UTF) {
0ebc6274
JH
3240 STRLEN unilen;
3241
80aecb99 3242 if (FOLD) {
60a8b682 3243 /* Emit all the Unicode characters. */
80aecb99
JH
3244 for (foldbuf = tmpbuf;
3245 foldlen;
3246 foldlen -= numlen) {
3247 ender = utf8_to_uvchr(foldbuf, &numlen);
9dc45d57 3248 if (numlen > 0) {
0ebc6274
JH
3249 reguni(pRExC_state, ender, s, &unilen);
3250 len += unilen;
3251 s += unilen;
3252 /* In EBCDIC the numlen
3253 * and unilen can differ. */
9dc45d57 3254 foldbuf += numlen;
47654450
JH
3255 if (numlen >= foldlen)
3256 break;
9dc45d57
JH
3257 }
3258 else
3259 break;
80aecb99
JH
3260 }
3261 }
3262 else {
0ebc6274 3263 reguni(pRExC_state, ender, s, &unilen);
9ede7db1 3264 if (unilen > 0) {
0ebc6274
JH
3265 s += unilen;
3266 len += unilen;
9dc45d57 3267 }
80aecb99
JH
3268 }
3269 len--;
a0ed51b3
LW
3270 }
3271 else
cd439c50 3272 REGC(ender, s++);
a0d0e21e
LW
3273 }
3274 loopdone:
830247a4 3275 RExC_parse = p - 1;
fac92740 3276 Set_Node_Cur_Length(ret); /* MJD */
830247a4 3277 nextchar(pRExC_state);
793db0cb
JH
3278 {
3279 /* len is STRLEN which is unsigned, need to copy to signed */
3280 IV iv = len;
3281 if (iv < 0)
3282 vFAIL("Internal disaster");
3283 }
a0d0e21e
LW
3284 if (len > 0)
3285 *flagp |= HASWIDTH;
3286 if (len == 1)
3287 *flagp |= SIMPLE;
c277df42 3288 if (!SIZE_ONLY)
cd439c50
IZ
3289 STR_LEN(ret) = len;
3290 if (SIZE_ONLY)
830247a4 3291 RExC_size += STR_SZ(len);
cd439c50 3292 else
830247a4 3293 RExC_emit += STR_SZ(len);
a687059c 3294 }
a0d0e21e
LW
3295 break;
3296 }
a687059c 3297
60a8b682
JH
3298 /* If the encoding pragma is in effect recode the text of
3299 * any EXACT-kind nodes. */
22c54be3 3300 if (PL_encoding && PL_regkind[(U8)OP(ret)] == EXACT) {
a72c7584
JH
3301 STRLEN oldlen = STR_LEN(ret);
3302 SV *sv = sv_2mortal(newSVpvn(STRING(ret), oldlen));
22c54be3
JH
3303
3304 if (RExC_utf8)
3305 SvUTF8_on(sv);
3306 if (sv_utf8_downgrade(sv, TRUE)) {
799ef3cb 3307 char *s = sv_recode_to_utf8(sv, PL_encoding);
22c54be3
JH
3308 STRLEN newlen = SvCUR(sv);
3309
3310 if (!SIZE_ONLY) {
3311 DEBUG_r(PerlIO_printf(Perl_debug_log, "recode %*s to %*s\n",
3312 (int)oldlen, STRING(ret),
3313 (int)newlen, s));
3314 Copy(s, STRING(ret), newlen, char);
3315 STR_LEN(ret) += newlen - oldlen;
3316 RExC_emit += STR_SZ(newlen) - STR_SZ(oldlen);
3317 } else
3318 RExC_size += STR_SZ(newlen) - STR_SZ(oldlen);
3319 }
a72c7584
JH
3320 }
3321
a0d0e21e 3322 return(ret);
a687059c
LW
3323}
3324
873ef191 3325STATIC char *
cea2e8a9 3326S_regwhite(pTHX_ char *p, char *e)
5b5a24f7
CS
3327{
3328 while (p < e) {
3329 if (isSPACE(*p))
3330 ++p;
3331 else if (*p == '#') {
3332 do {
3333 p++;
3334 } while (p < e && *p != '\n');
3335 }
3336 else
3337 break;
3338 }
3339 return p;
3340}
3341
b8c5462f
JH
3342/* Parse POSIX character classes: [[:foo:]], [[=foo=]], [[.foo.]].
3343 Character classes ([:foo:]) can also be negated ([:^foo:]).
3344 Returns a named class id (ANYOF_XXX) if successful, -1 otherwise.
3345 Equivalence classes ([=foo=]) and composites ([.foo.]) are parsed,
beeb77fc 3346 but trigger failures because they are currently unimplemented. */
9a86a77b
JH
3347
3348#define POSIXCC_DONE(c) ((c) == ':')
3349#define POSIXCC_NOTYET(c) ((c) == '=' || (c) == '.')
3350#define POSIXCC(c) (POSIXCC_DONE(c) || POSIXCC_NOTYET(c))
3351
b8c5462f 3352STATIC I32
830247a4 3353S_regpposixcc(pTHX_ RExC_state_t *pRExC_state, I32 value)
620e46c5
JH
3354{
3355 char *posixcc = 0;
936ed897 3356 I32 namedclass = OOB_NAMEDCLASS;
620e46c5 3357
830247a4 3358 if (value == '[' && RExC_parse + 1 < RExC_end &&
620e46c5 3359 /* I smell either [: or [= or [. -- POSIX has been here, right? */
9a86a77b
JH
3360 POSIXCC(UCHARAT(RExC_parse))) {
3361 char c = UCHARAT(RExC_parse);
830247a4 3362 char* s = RExC_parse++;
b81d288d 3363
9a86a77b 3364 while (RExC_parse < RExC_end && UCHARAT(RExC_parse) != c)
830247a4
IZ
3365 RExC_parse++;
3366 if (RExC_parse == RExC_end)
620e46c5 3367 /* Grandfather lone [:, [=, [. */
830247a4 3368 RExC_parse = s;
620e46c5 3369 else {
830247a4 3370 char* t = RExC_parse++; /* skip over the c */
b8c5462f 3371
9a86a77b 3372 if (UCHARAT(RExC_parse) == ']') {
830247a4 3373 RExC_parse++; /* skip over the ending ] */
b8c5462f
JH
3374 posixcc = s + 1;
3375 if (*s == ':') {
3376 I32 complement = *posixcc == '^' ? *posixcc++ : 0;
3377 I32 skip = 5; /* the most common skip */
3378
3379 switch (*posixcc) {
3380 case 'a':
3381 if (strnEQ(posixcc, "alnum", 5))
3382 namedclass =
3383 complement ? ANYOF_NALNUMC : ANYOF_ALNUMC;
3384 else if (strnEQ(posixcc, "alpha", 5))
3385 namedclass =
3386 complement ? ANYOF_NALPHA : ANYOF_ALPHA;
3387 else if (strnEQ(posixcc, "ascii", 5))
3388 namedclass =
3389 complement ? ANYOF_NASCII : ANYOF_ASCII;
3390 break;
aaa51d5e
JF
3391 case 'b':
3392 if (strnEQ(posixcc, "blank", 5))
3393 namedclass =
3394 complement ? ANYOF_NBLANK : ANYOF_BLANK;
3395 break;
b8c5462f
JH
3396 case 'c':
3397 if (strnEQ(posixcc, "cntrl", 5))
3398 namedclass =
3399 complement ? ANYOF_NCNTRL : ANYOF_CNTRL;
3400 break;
3401 case 'd':
3402 if (strnEQ(posixcc, "digit", 5))
3403 namedclass =
3404 complement ? ANYOF_NDIGIT : ANYOF_DIGIT;
3405 break;
3406 case 'g':
3407 if (strnEQ(posixcc, "graph", 5))
3408 namedclass =
3409 complement ? ANYOF_NGRAPH : ANYOF_GRAPH;
3410 break;
3411 case 'l':
3412 if (strnEQ(posixcc, "lower", 5))
3413 namedclass =
3414 complement ? ANYOF_NLOWER : ANYOF_LOWER;
3415 break;
3416 case 'p':
3417 if (strnEQ(posixcc, "print", 5))
3418 namedclass =
3419 complement ? ANYOF_NPRINT : ANYOF_PRINT;
3420 else if (strnEQ(posixcc, "punct", 5))
3421 namedclass =
3422 complement ? ANYOF_NPUNCT : ANYOF_PUNCT;
3423 break;
3424 case 's':
3425 if (strnEQ(posixcc, "space", 5))
3426 namedclass =
aaa51d5e 3427 complement ? ANYOF_NPSXSPC : ANYOF_PSXSPC;
cc4319de 3428 break;
b8c5462f
JH
3429 case 'u':
3430 if (strnEQ(posixcc, "upper", 5))
3431 namedclass =
3432 complement ? ANYOF_NUPPER : ANYOF_UPPER;
3433 break;
3434 case 'w': /* this is not POSIX, this is the Perl \w */
3435 if (strnEQ(posixcc, "word", 4)) {
3436 namedclass =
3437 complement ? ANYOF_NALNUM : ANYOF_ALNUM;
3438 skip = 4;
3439 }
3440 break;
3441 case 'x':
3442 if (strnEQ(posixcc, "xdigit", 6)) {
3443 namedclass =
3444 complement ? ANYOF_NXDIGIT : ANYOF_XDIGIT;
3445 skip = 6;
3446 }
3447 break;
3448 }
ac561586
JH
3449 if (namedclass == OOB_NAMEDCLASS ||
3450 posixcc[skip] != ':' ||
3451 posixcc[skip+1] != ']')
b45f050a
JF
3452 {
3453 Simple_vFAIL3("POSIX class [:%.*s:] unknown",
3454 t - s - 1, s + 1);
3455 }
3456 } else if (!SIZE_ONLY) {
b8c5462f 3457 /* [[=foo=]] and [[.foo.]] are still future. */
b45f050a 3458
830247a4 3459 /* adjust RExC_parse so the warning shows after
b45f050a 3460 the class closes */
9a86a77b 3461 while (UCHARAT(RExC_parse) && UCHARAT(RExC_parse) != ']')
830247a4 3462 RExC_parse++;
b45f050a
JF
3463 Simple_vFAIL3("POSIX syntax [%c %c] is reserved for future extensions", c, c);
3464 }
b8c5462f
JH
3465 } else {
3466 /* Maternal grandfather:
3467 * "[:" ending in ":" but not in ":]" */
830247a4 3468 RExC_parse = s;
767d463e 3469 }
620e46c5
JH
3470 }
3471 }
3472
b8c5462f
JH
3473 return namedclass;
3474}
3475
3476STATIC void
830247a4 3477S_checkposixcc(pTHX_ RExC_state_t *pRExC_state)
b8c5462f 3478{
b938889d 3479 if (!SIZE_ONLY && POSIXCC(UCHARAT(RExC_parse))) {
830247a4 3480 char *s = RExC_parse;
93733859 3481 char c = *s++;
b8c5462f
JH
3482
3483 while(*s && isALNUM(*s))
3484 s++;
3485 if (*s && c == *s && s[1] == ']') {
b45f050a
JF
3486 vWARN3(s+2, "POSIX syntax [%c %c] belongs inside character classes", c, c);
3487
3488 /* [[=foo=]] and [[.foo.]] are still future. */
9a86a77b 3489 if (POSIXCC_NOTYET(c)) {
830247a4 3490 /* adjust RExC_parse so the error shows after
b45f050a 3491 the class closes */
9a86a77b 3492 while (UCHARAT(RExC_parse) && UCHARAT(RExC_parse++) != ']')
b45f050a
JF
3493 ;
3494 Simple_vFAIL3("POSIX syntax [%c %c] is reserved for future extensions", c, c);
3495 }
b8c5462f
JH
3496 }
3497 }
620e46c5
JH
3498}
3499
76e3520e 3500STATIC regnode *
830247a4 3501S_regclass(pTHX_ RExC_state_t *pRExC_state)
a687059c 3502{
ffc61ed2 3503 register UV value;
9a86a77b 3504 register UV nextvalue;
3568d838 3505 register IV prevvalue = OOB_UNICODE;
ffc61ed2 3506 register IV range = 0;
c277df42 3507 register regnode *ret;
ba210ebe 3508 STRLEN numlen;
ffc61ed2 3509 IV namedclass;
9c5ffd7c 3510 char *rangebegin = 0;
936ed897 3511 bool need_class = 0;
9c5ffd7c 3512 SV *listsv = Nullsv;
ffc61ed2
JH
3513 register char *e;
3514 UV n;
9e55ce06
JH
3515 bool optimize_invert = TRUE;
3516 AV* unicode_alternate = 0;
ffc61ed2
JH
3517
3518 ret = reganode(pRExC_state, ANYOF, 0);
3519
3520 if (!SIZE_ONLY)
3521 ANYOF_FLAGS(ret) = 0;
3522
9a86a77b 3523 if (UCHARAT(RExC_parse) == '^') { /* Complement of range. */
ffc61ed2
JH
3524 RExC_naughty++;
3525 RExC_parse++;
3526 if (!SIZE_ONLY)
3527 ANYOF_FLAGS(ret) |= ANYOF_INVERT;
3528 }
a0d0e21e 3529
936ed897 3530 if (SIZE_ONLY)
830247a4 3531 RExC_size += ANYOF_SKIP;
936ed897 3532 else {
830247a4 3533 RExC_emit += ANYOF_SKIP;
936ed897
IZ
3534 if (FOLD)
3535 ANYOF_FLAGS(ret) |= ANYOF_FOLD;
3536 if (LOC)
3537 ANYOF_FLAGS(ret) |= ANYOF_LOCALE;
ffc61ed2
JH
3538 ANYOF_BITMAP_ZERO(ret);
3539 listsv = newSVpvn("# comment\n", 10);
a0d0e21e 3540 }
b8c5462f 3541
9a86a77b
JH
3542 nextvalue = RExC_parse < RExC_end ? UCHARAT(RExC_parse) : 0;
3543
b938889d 3544 if (!SIZE_ONLY && POSIXCC(nextvalue))
830247a4 3545 checkposixcc(pRExC_state);
b8c5462f 3546
f064b6ad
HS
3547 /* allow 1st char to be ] (allowing it to be - is dealt with later) */
3548 if (UCHARAT(RExC_parse) == ']')
3549 goto charclassloop;
ffc61ed2 3550
9a86a77b 3551 while (RExC_parse < RExC_end && UCHARAT(RExC_parse) != ']') {
ffc61ed2
JH
3552
3553 charclassloop:
3554
3555 namedclass = OOB_NAMEDCLASS; /* initialize as illegal */
3556
73b437c8 3557 if (!range)
830247a4 3558 rangebegin = RExC_parse;
ffc61ed2 3559 if (UTF) {
5e12f4fb 3560 value = utf8n_to_uvchr((U8*)RExC_parse,
3568d838
JH
3561 RExC_end - RExC_parse,
3562 &numlen, 0);
ffc61ed2
JH
3563 RExC_parse += numlen;
3564 }
3565 else
3566 value = UCHARAT(RExC_parse++);
9a86a77b
JH
3567 nextvalue = RExC_parse < RExC_end ? UCHARAT(RExC_parse) : 0;
3568 if (value == '[' && POSIXCC(nextvalue))
830247a4 3569 namedclass = regpposixcc(pRExC_state, value);
620e46c5 3570 else if (value == '\\') {
ffc61ed2 3571 if (UTF) {
5e12f4fb 3572 value = utf8n_to_uvchr((U8*)RExC_parse,
ffc61ed2
JH
3573 RExC_end - RExC_parse,
3574 &numlen, 0);
3575 RExC_parse += numlen;
3576 }
3577 else
3578 value = UCHARAT(RExC_parse++);
470c3474 3579 /* Some compilers cannot handle switching on 64-bit integer
ffc61ed2 3580 * values, therefore value cannot be an UV. Yes, this will
e2962f66
JH
3581 * be a problem later if we want switch on Unicode.
3582 * A similar issue a little bit later when switching on
3583 * namedclass. --jhi */
ffc61ed2 3584 switch ((I32)value) {
b8c5462f
JH
3585 case 'w': namedclass = ANYOF_ALNUM; break;
3586 case 'W': namedclass = ANYOF_NALNUM; break;
3587 case 's': namedclass = ANYOF_SPACE; break;
3588 case 'S': namedclass = ANYOF_NSPACE; break;
3589 case 'd': namedclass = ANYOF_DIGIT; break;
3590 case 'D': namedclass = ANYOF_NDIGIT; break;
ffc61ed2
JH
3591 case 'p':
3592 case 'P':
3593 if (*RExC_parse == '{') {
0da60cf5 3594 U8 c = (U8)value;
ffc61ed2
JH
3595 e = strchr(RExC_parse++, '}');
3596 if (!e)
0da60cf5 3597 vFAIL2("Missing right brace on \\%c{}", c);
ab13f0c7
JH
3598 while (isSPACE(UCHARAT(RExC_parse)))
3599 RExC_parse++;
3600 if (e == RExC_parse)
0da60cf5 3601 vFAIL2("Empty \\%c{}", c);
ffc61ed2 3602 n = e - RExC_parse;
ab13f0c7
JH
3603 while (isSPACE(UCHARAT(RExC_parse + n - 1)))
3604 n--;
ffc61ed2
JH
3605 }
3606 else {
3607 e = RExC_parse;
3608 n = 1;
3609 }
3610 if (!SIZE_ONLY) {
ab13f0c7
JH
3611 if (UCHARAT(RExC_parse) == '^') {
3612 RExC_parse++;
3613 n--;
3614 value = value == 'p' ? 'P' : 'p'; /* toggle */
3615 while (isSPACE(UCHARAT(RExC_parse))) {
3616 RExC_parse++;
3617 n--;
3618 }
3619 }
ffc61ed2 3620 if (value == 'p')
ab13f0c7
JH
3621 Perl_sv_catpvf(aTHX_ listsv,
3622 "+utf8::%.*s\n", (int)n, RExC_parse);
ffc61ed2 3623 else
ab13f0c7
JH
3624 Perl_sv_catpvf(aTHX_ listsv,
3625 "!utf8::%.*s\n", (int)n, RExC_parse);
ffc61ed2
JH
3626 }
3627 RExC_parse = e + 1;
3628 ANYOF_FLAGS(ret) |= ANYOF_UNICODE;
3629 continue;
b8c5462f
JH
3630 case 'n': value = '\n'; break;
3631 case 'r': value = '\r'; break;
3632 case 't': value = '\t'; break;
3633 case 'f': value = '\f'; break;
3634 case 'b': value = '\b'; break;
c7f1f016
NIS
3635 case 'e': value = ASCII_TO_NATIVE('\033');break;
3636 case 'a': value = ASCII_TO_NATIVE('\007');break;
b8c5462f 3637 case 'x':
ffc61ed2 3638 if (*RExC_parse == '{') {
a4c04bdc
NC
3639 I32 flags = PERL_SCAN_ALLOW_UNDERSCORES
3640 | PERL_SCAN_DISALLOW_PREFIX;
ffc61ed2 3641 e = strchr(RExC_parse++, '}');
b81d288d 3642 if (!e)
ffc61ed2 3643 vFAIL("Missing right brace on \\x{}");
53305cf1
NC
3644
3645 numlen = e - RExC_parse;
3646 value = grok_hex(RExC_parse, &numlen, &flags, NULL);
ffc61ed2
JH
3647 RExC_parse = e + 1;
3648 }
3649 else {
a4c04bdc 3650 I32 flags = PERL_SCAN_DISALLOW_PREFIX;
53305cf1
NC
3651 numlen = 2;
3652 value = grok_hex(RExC_parse, &numlen, &flags, NULL);
ffc61ed2
JH
3653 RExC_parse += numlen;
3654 }
b8c5462f
JH
3655 break;
3656 case 'c':
830247a4 3657 value = UCHARAT(RExC_parse++);
b8c5462f
JH
3658 value = toCTRL(value);
3659 break;
3660 case '0': case '1': case '2': case '3': case '4':
3661 case '5': case '6': case '7': case '8': case '9':
53305cf1
NC
3662 {
3663 I32 flags = 0;
3664 numlen = 3;
3665 value = grok_oct(--RExC_parse, &numlen, &flags, NULL);
830247a4 3666 RExC_parse += numlen;
b8c5462f 3667 break;
53305cf1 3668 }
1028017a 3669 default:
e476b1b5 3670 if (!SIZE_ONLY && ckWARN(WARN_REGEXP) && isALPHA(value))
ffc61ed2
JH
3671 vWARN2(RExC_parse,
3672 "Unrecognized escape \\%c in character class passed through",
3673 (int)value);
1028017a 3674 break;
b8c5462f 3675 }
ffc61ed2
JH
3676 } /* end of \blah */
3677
3678 if (namedclass > OOB_NAMEDCLASS) { /* this is a named class \blah */
3679
3680 if (!SIZE_ONLY && !need_class)
936ed897 3681 ANYOF_CLASS_ZERO(ret);
ffc61ed2 3682
936ed897 3683 need_class = 1;
ffc61ed2
JH
3684
3685 /* a bad range like a-\d, a-[:digit:] ? */
3686 if (range) {
73b437c8 3687 if (!SIZE_ONLY) {
e476b1b5 3688 if (ckWARN(WARN_REGEXP))
830247a4 3689 vWARN4(RExC_parse,
b45f050a 3690 "False [] range \"%*.*s\"",
830247a4
IZ
3691 RExC_parse - rangebegin,
3692 RExC_parse - rangebegin,
b45f050a 3693 rangebegin);
3568d838
JH
3694 if (prevvalue < 256) {
3695 ANYOF_BITMAP_SET(ret, prevvalue);
ffc61ed2
JH
3696 ANYOF_BITMAP_SET(ret, '-');
3697 }
3698 else {
3699 ANYOF_FLAGS(ret) |= ANYOF_UNICODE;
3700 Perl_sv_catpvf(aTHX_ listsv,
3568d838 3701 "%04"UVxf"\n%04"UVxf"\n", (UV)prevvalue, (UV) '-');
ffc61ed2 3702 }
b8c5462f 3703 }
ffc61ed2
JH
3704
3705 range = 0; /* this was not a true range */
73b437c8 3706 }
ffc61ed2 3707
73b437c8 3708 if (!SIZE_ONLY) {
3568d838
JH
3709 if (namedclass > OOB_NAMEDCLASS)
3710 optimize_invert = FALSE;
e2962f66
JH
3711 /* Possible truncation here but in some 64-bit environments
3712 * the compiler gets heartburn about switch on 64-bit values.
3713 * A similar issue a little earlier when switching on value.
98f323fa 3714 * --jhi */
e2962f66 3715 switch ((I32)namedclass) {
73b437c8
JH
3716 case ANYOF_ALNUM:
3717 if (LOC)
936ed897 3718 ANYOF_CLASS_SET(ret, ANYOF_ALNUM);
73b437c8
JH
3719 else {
3720 for (value = 0; value < 256; value++)
3721 if (isALNUM(value))
936ed897 3722 ANYOF_BITMAP_SET(ret, value);
73b437c8 3723 }
ffc61ed2 3724 Perl_sv_catpvf(aTHX_ listsv, "+utf8::IsWord\n");
73b437c8
JH
3725 break;
3726 case ANYOF_NALNUM:
3727 if (LOC)
936ed897 3728 ANYOF_CLASS_SET(ret, ANYOF_NALNUM);
73b437c8
JH
3729 else {
3730 for (value = 0; value < 256; value++)
3731 if (!isALNUM(value))
936ed897 3732 ANYOF_BITMAP_SET(ret, value);
73b437c8 3733 }
ffc61ed2 3734 Perl_sv_catpvf(aTHX_ listsv, "!utf8::IsWord\n");
73b437c8 3735 break;
ffc61ed2 3736 case ANYOF_ALNUMC:
73b437c8 3737 if (LOC)
ffc61ed2 3738 ANYOF_CLASS_SET(ret, ANYOF_ALNUMC);
73b437c8
JH
3739 else {
3740 for (value = 0; value < 256; value++)
ffc61ed2 3741 if (isALNUMC(value))
936ed897 3742 ANYOF_BITMAP_SET(ret, value);
73b437c8 3743 }
ffc61ed2 3744 Perl_sv_catpvf(aTHX_ listsv, "+utf8::IsAlnum\n");
73b437c8
JH
3745 break;
3746 case ANYOF_NALNUMC:
3747 if (LOC)
936ed897 3748 ANYOF_CLASS_SET(ret, ANYOF_NALNUMC);
73b437c8
JH
3749 else {
3750 for (value = 0; value < 256; value++)
3751 if (!isALNUMC(value))
936ed897 3752 ANYOF_BITMAP_SET(ret, value);
73b437c8 3753 }
ffc61ed2 3754 Perl_sv_catpvf(aTHX_ listsv, "!utf8::IsAlnum\n");
73b437c8
JH
3755 break;
3756 case ANYOF_ALPHA:
3757 if (LOC)
936ed897 3758 ANYOF_CLASS_SET(ret, ANYOF_ALPHA);
73b437c8
JH
3759 else {
3760 for (value = 0; value < 256; value++)
3761 if (isALPHA(value))
936ed897 3762 ANYOF_BITMAP_SET(ret, value);
73b437c8 3763 }
ffc61ed2 3764 Perl_sv_catpvf(aTHX_ listsv, "+utf8::IsAlpha\n");
73b437c8
JH
3765 break;
3766 case ANYOF_NALPHA:
3767 if (LOC)
936ed897 3768 ANYOF_CLASS_SET(ret, ANYOF_NALPHA);
73b437c8
JH
3769 else {
3770 for (value = 0; value < 256; value++)
3771 if (!isALPHA(value))
936ed897 3772 ANYOF_BITMAP_SET(ret, value);
73b437c8 3773 }
ffc61ed2 3774 Perl_sv_catpvf(aTHX_ listsv, "!utf8::IsAlpha\n");
73b437c8
JH
3775 break;
3776 case ANYOF_ASCII:
3777 if (LOC)
936ed897 3778 ANYOF_CLASS_SET(ret, ANYOF_ASCII);
73b437c8 3779 else {
c7f1f016 3780#ifndef EBCDIC
1ba5c669
JH
3781 for (value = 0; value < 128; value++)
3782 ANYOF_BITMAP_SET(ret, value);
3783#else /* EBCDIC */
ffbc6a93 3784 for (value = 0; value < 256; value++) {
3a3c4447
JH
3785 if (isASCII(value))
3786 ANYOF_BITMAP_SET(ret, value);
ffbc6a93 3787 }
1ba5c669 3788#endif /* EBCDIC */
73b437c8 3789 }
ffc61ed2 3790 Perl_sv_catpvf(aTHX_ listsv, "+utf8::IsASCII\n");
73b437c8
JH
3791 break;
3792 case ANYOF_NASCII:
3793 if (LOC)
936ed897 3794 ANYOF_CLASS_SET(ret, ANYOF_NASCII);
73b437c8 3795 else {
c7f1f016 3796#ifndef EBCDIC
1ba5c669
JH
3797 for (value = 128; value < 256; value++)
3798 ANYOF_BITMAP_SET(ret, value);
3799#else /* EBCDIC */
ffbc6a93 3800 for (value = 0; value < 256; value++) {
3a3c4447
JH
3801 if (!isASCII(value))
3802 ANYOF_BITMAP_SET(ret, value);
ffbc6a93 3803 }
1ba5c669 3804#endif /* EBCDIC */
73b437c8 3805 }
ffc61ed2 3806 Perl_sv_catpvf(aTHX_ listsv, "!utf8::IsASCII\n");
73b437c8 3807 break;
aaa51d5e
JF
3808 case ANYOF_BLANK:
3809 if (LOC)
3810 ANYOF_CLASS_SET(ret, ANYOF_BLANK);
3811 else {
3812 for (value = 0; value < 256; value++)
3813 if (isBLANK(value))
3814 ANYOF_BITMAP_SET(ret, value);
3815 }
ffc61ed2 3816 Perl_sv_catpvf(aTHX_ listsv, "+utf8::IsBlank\n");
aaa51d5e
JF
3817 break;
3818 case ANYOF_NBLANK:
3819 if (LOC)
3820 ANYOF_CLASS_SET(ret, ANYOF_NBLANK);
3821 else {
3822 for (value = 0; value < 256; value++)
3823 if (!isBLANK(value))
3824 ANYOF_BITMAP_SET(ret, value);
3825 }
ffc61ed2 3826 Perl_sv_catpvf(aTHX_ listsv, "!utf8::IsBlank\n");
aaa51d5e 3827 break;
73b437c8
JH
3828 case ANYOF_CNTRL:
3829 if (LOC)
936ed897 3830 ANYOF_CLASS_SET(ret, ANYOF_CNTRL);
73b437c8
JH
3831 else {
3832 for (value = 0; value < 256; value++)
3833 if (isCNTRL(value))
936ed897 3834 ANYOF_BITMAP_SET(ret, value);
73b437c8 3835 }
ffc61ed2 3836 Perl_sv_catpvf(aTHX_ listsv, "+utf8::IsCntrl\n");
73b437c8
JH
3837 break;
3838 case ANYOF_NCNTRL:
3839 if (LOC)
936ed897 3840 ANYOF_CLASS_SET(ret, ANYOF_NCNTRL);
73b437c8
JH
3841 else {
3842 for (value = 0; value < 256; value++)
3843 if (!isCNTRL(value))
936ed897 3844 ANYOF_BITMAP_SET(ret, value);
73b437c8 3845 }
ffc61ed2
JH
3846 Perl_sv_catpvf(aTHX_ listsv, "!utf8::IsCntrl\n");
3847 break;
3848 case ANYOF_DIGIT:
3849 if (LOC)
3850 ANYOF_CLASS_SET(ret, ANYOF_DIGIT);
3851 else {
3852 /* consecutive digits assumed */
3853 for (value = '0'; value <= '9'; value++)
3854 ANYOF_BITMAP_SET(ret, value);
3855 }
3856 Perl_sv_catpvf(aTHX_ listsv, "+utf8::IsDigit\n");
3857 break;
3858 case ANYOF_NDIGIT:
3859 if (LOC)
3860 ANYOF_CLASS_SET(ret, ANYOF_NDIGIT);
3861 else {
3862 /* consecutive digits assumed */
3863 for (value = 0; value < '0'; value++)
3864 ANYOF_BITMAP_SET(ret, value);
3865 for (value = '9' + 1; value < 256; value++)
3866 ANYOF_BITMAP_SET(ret, value);
3867 }
3868 Perl_sv_catpvf(aTHX_ listsv, "!utf8::IsDigit\n");
73b437c8
JH
3869 break;
3870 case ANYOF_GRAPH:
3871 if (LOC)
936ed897 3872 ANYOF_CLASS_SET(ret, ANYOF_GRAPH);
73b437c8
JH
3873 else {
3874 for (value = 0; value < 256; value++)
3875 if (isGRAPH(value))
936ed897 3876 ANYOF_BITMAP_SET(ret, value);
73b437c8 3877 }
ffc61ed2 3878 Perl_sv_catpvf(aTHX_ listsv, "+utf8::IsGraph\n");
73b437c8
JH
3879 break;
3880 case ANYOF_NGRAPH:
3881 if (LOC)
936ed897 3882 ANYOF_CLASS_SET(ret, ANYOF_NGRAPH);
73b437c8
JH
3883 else {
3884 for (value = 0; value < 256; value++)
3885 if (!isGRAPH(value))
936ed897 3886 ANYOF_BITMAP_SET(ret, value);
73b437c8 3887 }
ffc61ed2 3888 Perl_sv_catpvf(aTHX_ listsv, "!utf8::IsGraph\n");
73b437c8
JH
3889 break;
3890 case ANYOF_LOWER:
3891 if (LOC)
936ed897 3892 ANYOF_CLASS_SET(ret, ANYOF_LOWER);
73b437c8
JH
3893 else {
3894 for (value = 0; value < 256; value++)
3895 if (isLOWER(value))
936ed897 3896 ANYOF_BITMAP_SET(ret, value);
73b437c8 3897 }
ffc61ed2 3898 Perl_sv_catpvf(aTHX_ listsv, "+utf8::IsLower\n");
73b437c8
JH
3899 break;
3900 case ANYOF_NLOWER:
3901 if (LOC)
936ed897 3902 ANYOF_CLASS_SET(ret, ANYOF_NLOWER);
73b437c8
JH
3903 else {
3904 for (value = 0; value < 256; value++)
3905 if (!isLOWER(value))
936ed897 3906 ANYOF_BITMAP_SET(ret, value);
73b437c8 3907 }
ffc61ed2 3908 Perl_sv_catpvf(aTHX_ listsv, "!utf8::IsLower\n");
73b437c8
JH
3909 break;
3910 case ANYOF_PRINT:
3911 if (LOC)
936ed897 3912 ANYOF_CLASS_SET(ret, ANYOF_PRINT);
73b437c8
JH
3913 else {
3914 for (value = 0; value < 256; value++)
3915 if (isPRINT(value))
936ed897 3916 ANYOF_BITMAP_SET(ret, value);
73b437c8 3917 }
ffc61ed2 3918 Perl_sv_catpvf(aTHX_ listsv, "+utf8::IsPrint\n");
73b437c8
JH
3919 break;
3920 case ANYOF_NPRINT:
3921 if (LOC)
936ed897 3922 ANYOF_CLASS_SET(ret, ANYOF_NPRINT);
73b437c8
JH
3923 else {
3924 for (value = 0; value < 256; value++)
3925 if (!isPRINT(value))
936ed897 3926 ANYOF_BITMAP_SET(ret, value);
73b437c8 3927 }
ffc61ed2 3928 Perl_sv_catpvf(aTHX_ listsv, "!utf8::IsPrint\n");
73b437c8 3929 break;
aaa51d5e
JF
3930 case ANYOF_PSXSPC:
3931 if (LOC)
3932 ANYOF_CLASS_SET(ret, ANYOF_PSXSPC);
3933 else {
3934 for (value = 0; value < 256; value++)
3935 if (isPSXSPC(value))
3936 ANYOF_BITMAP_SET(ret, value);
3937 }
ffc61ed2 3938 Perl_sv_catpvf(aTHX_ listsv, "+utf8::IsSpace\n");
aaa51d5e
JF
3939 break;
3940 case ANYOF_NPSXSPC:
3941 if (LOC)
3942 ANYOF_CLASS_SET(ret, ANYOF_NPSXSPC);
3943 else {
3944 for (value = 0; value < 256; value++)
3945 if (!isPSXSPC(value))
3946 ANYOF_BITMAP_SET(ret, value);
3947 }
ffc61ed2 3948 Perl_sv_catpvf(aTHX_ listsv, "!utf8::IsSpace\n");
aaa51d5e 3949 break;
73b437c8
JH
3950 case ANYOF_PUNCT:
3951 if (LOC)
936ed897 3952 ANYOF_CLASS_SET(ret, ANYOF_PUNCT);
73b437c8
JH
3953 else {
3954 for (value = 0; value < 256; value++)
3955 if (isPUNCT(value))
936ed897 3956 ANYOF_BITMAP_SET(ret, value);
73b437c8 3957 }
ffc61ed2 3958 Perl_sv_catpvf(aTHX_ listsv, "+utf8::IsPunct\n");
73b437c8
JH
3959 break;
3960 case ANYOF_NPUNCT:
3961 if (LOC)
936ed897 3962 ANYOF_CLASS_SET(ret, ANYOF_NPUNCT);
73b437c8
JH
3963 else {
3964 for (value = 0; value < 256; value++)
3965 if (!isPUNCT(value))
936ed897 3966 ANYOF_BITMAP_SET(ret, value);
73b437c8 3967 }
ffc61ed2
JH
3968 Perl_sv_catpvf(aTHX_ listsv, "!utf8::IsPunct\n");
3969 break;
3970 case ANYOF_SPACE:
3971 if (LOC)
3972 ANYOF_CLASS_SET(ret, ANYOF_SPACE);
3973 else {
3974 for (value = 0; value < 256; value++)
3975 if (isSPACE(value))
3976 ANYOF_BITMAP_SET(ret, value);
3977 }
3978 Perl_sv_catpvf(aTHX_ listsv, "+utf8::IsSpacePerl\n");
3979 break;
3980 case ANYOF_NSPACE:
3981 if (LOC)
3982 ANYOF_CLASS_SET(ret, ANYOF_NSPACE);
3983 else {
3984 for (value = 0; value < 256; value++)
3985 if (!isSPACE(value))
3986 ANYOF_BITMAP_SET(ret, value);
3987 }
3988 Perl_sv_catpvf(aTHX_ listsv, "!utf8::IsSpacePerl\n");
73b437c8
JH
3989 break;
3990 case ANYOF_UPPER:
3991 if (LOC)
936ed897 3992 ANYOF_CLASS_SET(ret, ANYOF_UPPER);
73b437c8
JH
3993 else {
3994 for (value = 0; value < 256; value++)
3995 if (isUPPER(value))
936ed897 3996 ANYOF_BITMAP_SET(ret, value);
73b437c8 3997 }
ffc61ed2 3998 Perl_sv_catpvf(aTHX_ listsv, "+utf8::IsUpper\n");
73b437c8
JH
3999 break;
4000 case ANYOF_NUPPER:
4001 if (LOC)
936ed897 4002 ANYOF_CLASS_SET(ret, ANYOF_NUPPER);
73b437c8
JH
4003 else {
4004 for (value = 0; value < 256; value++)
4005 if (!isUPPER(value))
936ed897 4006 ANYOF_BITMAP_SET(ret, value);
73b437c8 4007 }
ffc61ed2 4008 Perl_sv_catpvf(aTHX_ listsv, "!utf8::IsUpper\n");
73b437c8
JH
4009 break;
4010 case ANYOF_XDIGIT:
4011 if (LOC)
936ed897 4012 ANYOF_CLASS_SET(ret, ANYOF_XDIGIT);
73b437c8
JH
4013 else {
4014 for (value = 0; value < 256; value++)
4015 if (isXDIGIT(value))
936ed897 4016 ANYOF_BITMAP_SET(ret, value);
73b437c8 4017 }
ffc61ed2 4018 Perl_sv_catpvf(aTHX_ listsv, "+utf8::IsXDigit\n");
73b437c8
JH
4019 break;
4020 case ANYOF_NXDIGIT:
4021 if (LOC)
936ed897 4022 ANYOF_CLASS_SET(ret, ANYOF_NXDIGIT);
73b437c8
JH
4023 else {
4024 for (value = 0; value < 256; value++)
4025 if (!isXDIGIT(value))
936ed897 4026 ANYOF_BITMAP_SET(ret, value);
73b437c8 4027 }
ffc61ed2 4028 Perl_sv_catpvf(aTHX_ listsv, "!utf8::IsXDigit\n");
73b437c8
JH
4029 break;
4030 default:
b45f050a 4031 vFAIL("Invalid [::] class");
73b437c8 4032 break;
b8c5462f 4033 }
b8c5462f 4034 if (LOC)
936ed897 4035 ANYOF_FLAGS(ret) |= ANYOF_CLASS;
73b437c8 4036 continue;
a0d0e21e 4037 }
ffc61ed2
JH
4038 } /* end of namedclass \blah */
4039
a0d0e21e 4040 if (range) {
3a3c4447 4041 if (prevvalue > value) /* b-a */ {
b45f050a 4042 Simple_vFAIL4("Invalid [] range \"%*.*s\"",
830247a4
IZ
4043 RExC_parse - rangebegin,
4044 RExC_parse - rangebegin,
b45f050a 4045 rangebegin);
3568d838 4046 range = 0; /* not a valid range */
73b437c8 4047 }
a0d0e21e
LW
4048 }
4049 else {
3568d838 4050 prevvalue = value; /* save the beginning of the range */
830247a4
IZ
4051 if (*RExC_parse == '-' && RExC_parse+1 < RExC_end &&
4052 RExC_parse[1] != ']') {
4053 RExC_parse++;
ffc61ed2
JH
4054
4055 /* a bad range like \w-, [:word:]- ? */
4056 if (namedclass > OOB_NAMEDCLASS) {
e476b1b5 4057 if (ckWARN(WARN_REGEXP))
830247a4 4058 vWARN4(RExC_parse,
b45f050a 4059 "False [] range \"%*.*s\"",
830247a4
IZ
4060 RExC_parse - rangebegin,
4061 RExC_parse - rangebegin,
b45f050a 4062 rangebegin);
73b437c8 4063 if (!SIZE_ONLY)
936ed897 4064 ANYOF_BITMAP_SET(ret, '-');
73b437c8 4065 } else
ffc61ed2
JH
4066 range = 1; /* yeah, it's a range! */
4067 continue; /* but do it the next time */
a0d0e21e 4068 }
a687059c 4069 }
ffc61ed2 4070
93733859 4071 /* now is the next time */
ae5c130c 4072 if (!SIZE_ONLY) {
3568d838
JH
4073 IV i;
4074
4075 if (prevvalue < 256) {
4076 IV ceilvalue = value < 256 ? value : 255;
4077
4078#ifdef EBCDIC
3a3c4447
JH
4079 if ((isLOWER(prevvalue) && isLOWER(ceilvalue)) ||
4080 (isUPPER(prevvalue) && isUPPER(ceilvalue)))
ffc61ed2 4081 {
3568d838
JH
4082 if (isLOWER(prevvalue)) {
4083 for (i = prevvalue; i <= ceilvalue; i++)
ffc61ed2
JH
4084 if (isLOWER(i))
4085 ANYOF_BITMAP_SET(ret, i);
4086 } else {
3568d838 4087 for (i = prevvalue; i <= ceilvalue; i++)
ffc61ed2
JH
4088 if (isUPPER(i))
4089 ANYOF_BITMAP_SET(ret, i);
4090 }
8ada0baa 4091 }
ffc61ed2 4092 else
8ada0baa 4093#endif
a5961de5
JH
4094 for (i = prevvalue; i <= ceilvalue; i++)
4095 ANYOF_BITMAP_SET(ret, i);
3568d838 4096 }
a5961de5 4097 if (value > 255 || UTF) {
b08decb7
JH
4098 UV prevnatvalue = NATIVE_TO_UNI(prevvalue);
4099 UV natvalue = NATIVE_TO_UNI(value);
4100
ffc61ed2 4101 ANYOF_FLAGS(ret) |= ANYOF_UNICODE;
b08decb7 4102 if (prevnatvalue < natvalue) { /* what about > ? */
ffc61ed2 4103 Perl_sv_catpvf(aTHX_ listsv, "%04"UVxf"\t%04"UVxf"\n",
b08decb7
JH
4104 prevnatvalue, natvalue);
4105 }
4106 else if (prevnatvalue == natvalue) {
4107 Perl_sv_catpvf(aTHX_ listsv, "%04"UVxf"\n", natvalue);
09091399 4108 if (FOLD) {
254ba52a
JH
4109 U8 foldbuf[UTF8_MAXLEN_FOLD+1];
4110 STRLEN foldlen;
2f3bf011 4111 UV f = to_uni_fold(natvalue, foldbuf, &foldlen);
254ba52a 4112
c840d2a2
JH
4113 /* If folding and foldable and a single
4114 * character, insert also the folded version
4115 * to the charclass. */
9e55ce06
JH
4116 if (f != value) {
4117 if (foldlen == UNISKIP(f))
4118 Perl_sv_catpvf(aTHX_ listsv,
4119 "%04"UVxf"\n", f);
4120 else {
4121 /* Any multicharacter foldings
4122 * require the following transform:
4123 * [ABCDEF] -> (?:[ABCabcDEFd]|pq|rst)
4124 * where E folds into "pq" and F folds
4125 * into "rst", all other characters
4126 * fold to single characters. We save
4127 * away these multicharacter foldings,
4128 * to be later saved as part of the
4129 * additional "s" data. */
4130 SV *sv;
4131
4132 if (!unicode_alternate)
4133 unicode_alternate = newAV();
4134 sv = newSVpvn((char*)foldbuf, foldlen);
4135 SvUTF8_on(sv);
4136 av_push(unicode_alternate, sv);
4137 }
4138 }
254ba52a 4139
60a8b682
JH
4140 /* If folding and the value is one of the Greek
4141 * sigmas insert a few more sigmas to make the
4142 * folding rules of the sigmas to work right.
4143 * Note that not all the possible combinations
4144 * are handled here: some of them are handled
9e55ce06
JH
4145 * by the standard folding rules, and some of
4146 * them (literal or EXACTF cases) are handled
4147 * during runtime in regexec.c:S_find_byclass(). */
09091399
JH
4148 if (value == UNICODE_GREEK_SMALL_LETTER_FINAL_SIGMA) {
4149 Perl_sv_catpvf(aTHX_ listsv, "%04"UVxf"\n",
eb050b28 4150 (UV)UNICODE_GREEK_CAPITAL_LETTER_SIGMA);
09091399 4151 Perl_sv_catpvf(aTHX_ listsv, "%04"UVxf"\n",
eb050b28 4152 (UV)UNICODE_GREEK_SMALL_LETTER_SIGMA);
09091399
JH
4153 }
4154 else if (value == UNICODE_GREEK_CAPITAL_LETTER_SIGMA)
4155 Perl_sv_catpvf(aTHX_ listsv, "%04"UVxf"\n",
eb050b28 4156 (UV)UNICODE_GREEK_SMALL_LETTER_SIGMA);
09091399
JH
4157 }
4158 }
ffc61ed2 4159 }
8ada0baa 4160 }
ffc61ed2
JH
4161
4162 range = 0; /* this range (if it was one) is done now */
a0d0e21e 4163 }
ffc61ed2 4164
936ed897 4165 if (need_class) {
4f66b38d 4166 ANYOF_FLAGS(ret) |= ANYOF_LARGE;
936ed897 4167 if (SIZE_ONLY)
830247a4 4168 RExC_size += ANYOF_CLASS_ADD_SKIP;
936ed897 4169 else
830247a4 4170 RExC_emit += ANYOF_CLASS_ADD_SKIP;
936ed897 4171 }
ffc61ed2 4172
ae5c130c 4173 /* optimize case-insensitive simple patterns (e.g. /[a-z]/i) */
b8c5462f 4174 if (!SIZE_ONLY &&
ffc61ed2 4175 /* If the only flag is folding (plus possibly inversion). */
516a5887
JH
4176 ((ANYOF_FLAGS(ret) & (ANYOF_FLAGS_ALL ^ ANYOF_INVERT)) == ANYOF_FOLD)
4177 ) {
a0ed51b3 4178 for (value = 0; value < 256; ++value) {
936ed897 4179 if (ANYOF_BITMAP_TEST(ret, value)) {
ffc61ed2
JH
4180 IV fold = PL_fold[value];
4181
4182 if (fold != value)
4183 ANYOF_BITMAP_SET(ret, fold);
ae5c130c
GS
4184 }
4185 }
936ed897 4186 ANYOF_FLAGS(ret) &= ~ANYOF_FOLD;
ae5c130c 4187 }
ffc61ed2 4188
ae5c130c 4189 /* optimize inverted simple patterns (e.g. [^a-z]) */
3568d838 4190 if (!SIZE_ONLY && optimize_invert &&
ffc61ed2
JH
4191 /* If the only flag is inversion. */
4192 (ANYOF_FLAGS(ret) & ANYOF_FLAGS_ALL) == ANYOF_INVERT) {
b8c5462f 4193 for (value = 0; value < ANYOF_BITMAP_SIZE; ++value)
936ed897 4194 ANYOF_BITMAP(ret)[value] ^= ANYOF_FLAGS_ALL;
1aa99e6b 4195 ANYOF_FLAGS(ret) = ANYOF_UNICODE_ALL;
ae5c130c 4196 }
a0d0e21e 4197
b81d288d 4198 if (!SIZE_ONLY) {
fde631ed 4199 AV *av = newAV();
ffc61ed2
JH
4200 SV *rv;
4201
9e55ce06
JH
4202 /* The 0th element stores the character class description
4203 * in its textual form: used later (regexec.c:Perl_regclass_swatch())
4204 * to initialize the appropriate swash (which gets stored in
4205 * the 1st element), and also useful for dumping the regnode.
4206 * The 2nd element stores the multicharacter foldings,
4207 * used later (regexec.c:s_reginclasslen()). */
ffc61ed2
JH
4208 av_store(av, 0, listsv);
4209 av_store(av, 1, NULL);
9e55ce06 4210 av_store(av, 2, (SV*)unicode_alternate);
ffc61ed2 4211 rv = newRV_noinc((SV*)av);
19860706 4212 n = add_data(pRExC_state, 1, "s");
830247a4 4213 RExC_rx->data->data[n] = (void*)rv;
ffc61ed2 4214 ARG_SET(ret, n);
a0ed51b3
LW
4215 }
4216
4217 return ret;
4218}
4219
76e3520e 4220STATIC char*
830247a4 4221S_nextchar(pTHX_ RExC_state_t *pRExC_state)
a0d0e21e 4222{
830247a4 4223 char* retval = RExC_parse++;
a0d0e21e 4224
4633a7c4 4225 for (;;) {
830247a4
IZ
4226 if (*RExC_parse == '(' && RExC_parse[1] == '?' &&
4227 RExC_parse[2] == '#') {
4228 while (*RExC_parse && *RExC_parse != ')')
4229 RExC_parse++;
4230 RExC_parse++;
4633a7c4
LW
4231 continue;
4232 }
830247a4
IZ
4233 if (RExC_flags16 & PMf_EXTENDED) {
4234 if (isSPACE(*RExC_parse)) {
4235 RExC_parse++;
748a9306
LW
4236 continue;
4237 }
830247a4
IZ
4238 else if (*RExC_parse == '#') {
4239 while (*RExC_parse && *RExC_parse != '\n')
4240 RExC_parse++;
4241 RExC_parse++;
748a9306
LW
4242 continue;
4243 }
748a9306 4244 }
4633a7c4 4245 return retval;
a0d0e21e 4246 }
a687059c
LW
4247}
4248
4249/*
c277df42 4250- reg_node - emit a node
a0d0e21e 4251*/
76e3520e 4252STATIC regnode * /* Location. */
830247a4 4253S_reg_node(pTHX_ RExC_state_t *pRExC_state, U8 op)
a687059c 4254{
c277df42
IZ
4255 register regnode *ret;
4256 register regnode *ptr;
a687059c 4257
830247a4 4258 ret = RExC_emit;
c277df42 4259 if (SIZE_ONLY) {
830247a4
IZ
4260 SIZE_ALIGN(RExC_size);
4261 RExC_size += 1;
a0d0e21e
LW
4262 return(ret);
4263 }
a687059c 4264
c277df42 4265 NODE_ALIGN_FILL(ret);
a0d0e21e 4266 ptr = ret;
c277df42 4267 FILL_ADVANCE_NODE(ptr, op);
fac92740
MJD
4268 if (RExC_offsets) { /* MJD */
4269 MJD_OFFSET_DEBUG((stderr, "%s:%u: (op %s) %s %u <- %u (len %u) (max %u).\n",
4270 "reg_node", __LINE__,
4271 reg_name[op],
4272 RExC_emit - RExC_emit_start > RExC_offsets[0]
4273 ? "Overwriting end of array!\n" : "OK",
4274 RExC_emit - RExC_emit_start,
4275 RExC_parse - RExC_start,
4276 RExC_offsets[0]));
4277 Set_Node_Offset(RExC_emit, RExC_parse + (op == END));
4278 }
4279
830247a4 4280 RExC_emit = ptr;
a687059c 4281
a0d0e21e 4282 return(ret);
a687059c
LW
4283}
4284
4285/*
a0d0e21e
LW
4286- reganode - emit a node with an argument
4287*/
76e3520e 4288STATIC regnode * /* Location. */
830247a4 4289S_reganode(pTHX_ RExC_state_t *pRExC_state, U8 op, U32 arg)
fe14fcc3 4290{
c277df42
IZ
4291 register regnode *ret;
4292 register regnode *ptr;
fe14fcc3 4293
830247a4 4294 ret = RExC_emit;
c277df42 4295 if (SIZE_ONLY) {
830247a4
IZ
4296 SIZE_ALIGN(RExC_size);
4297 RExC_size += 2;
a0d0e21e
LW
4298 return(ret);
4299 }
fe14fcc3 4300
c277df42 4301 NODE_ALIGN_FILL(ret);
a0d0e21e 4302 ptr = ret;
c277df42 4303 FILL_ADVANCE_NODE_ARG(ptr, op, arg);
fac92740
MJD
4304 if (RExC_offsets) { /* MJD */
4305 MJD_OFFSET_DEBUG((stderr, "%s: %s %u <- %u (max %u).\n",
4306 "reganode",
4307 RExC_emit - RExC_emit_start > RExC_offsets[0] ?
4308 "Overwriting end of array!\n" : "OK",
4309 RExC_emit - RExC_emit_start,
4310 RExC_parse - RExC_start,
4311 RExC_offsets[0]));
4312 Set_Cur_Node_Offset;
4313 }
4314
830247a4 4315 RExC_emit = ptr;
fe14fcc3 4316
a0d0e21e 4317 return(ret);
fe14fcc3
LW
4318}
4319
4320/*
cd439c50 4321- reguni - emit (if appropriate) a Unicode character
a0ed51b3
LW
4322*/
4323STATIC void
830247a4 4324S_reguni(pTHX_ RExC_state_t *pRExC_state, UV uv, char* s, STRLEN* lenp)
a0ed51b3 4325{
5e12f4fb 4326 *lenp = SIZE_ONLY ? UNISKIP(uv) : (uvchr_to_utf8((U8*)s, uv) - (U8*)s);
a0ed51b3
LW
4327}
4328
4329/*
a0d0e21e
LW
4330- reginsert - insert an operator in front of already-emitted operand
4331*
4332* Means relocating the operand.
4333*/
76e3520e 4334STATIC void
830247a4 4335S_reginsert(pTHX_ RExC_state_t *pRExC_state, U8 op, regnode *opnd)
a687059c 4336{
c277df42
IZ
4337 register regnode *src;
4338 register regnode *dst;
4339 register regnode *place;
4340 register int offset = regarglen[(U8)op];
b81d288d 4341
22c35a8c 4342/* (PL_regkind[(U8)op] == CURLY ? EXTRA_STEP_2ARGS : 0); */
c277df42
IZ
4343
4344 if (SIZE_ONLY) {
830247a4 4345 RExC_size += NODE_STEP_REGNODE + offset;
a0d0e21e
LW
4346 return;
4347 }
a687059c 4348
830247a4
IZ
4349 src = RExC_emit;
4350 RExC_emit += NODE_STEP_REGNODE + offset;
4351 dst = RExC_emit;
fac92740 4352 while (src > opnd) {
c277df42 4353 StructCopy(--src, --dst, regnode);
fac92740
MJD
4354 if (RExC_offsets) { /* MJD 20010112 */
4355 MJD_OFFSET_DEBUG((stderr, "%s: %s copy %u -> %u (max %u).\n",
4356 "reg_insert",
4357 dst - RExC_emit_start > RExC_offsets[0]
4358 ? "Overwriting end of array!\n" : "OK",
4359 src - RExC_emit_start,
4360 dst - RExC_emit_start,
4361 RExC_offsets[0]));
4362 Set_Node_Offset_To_R(dst-RExC_emit_start, Node_Offset(src));
4363 Set_Node_Length_To_R(dst-RExC_emit_start, Node_Length(src));
4364 }
4365 }
4366
a0d0e21e
LW
4367
4368 place = opnd; /* Op node, where operand used to be. */
fac92740
MJD
4369 if (RExC_offsets) { /* MJD */
4370 MJD_OFFSET_DEBUG((stderr, "%s: %s %u <- %u (max %u).\n",
4371 "reginsert",
4372 place - RExC_emit_start > RExC_offsets[0]
4373 ? "Overwriting end of array!\n" : "OK",
4374 place - RExC_emit_start,
4375 RExC_parse - RExC_start,
4376 RExC_offsets[0]));
4377 Set_Node_Offset(place, RExC_parse);
4378 }
c277df42
IZ
4379 src = NEXTOPER(place);
4380 FILL_ADVANCE_NODE(place, op);
4381 Zero(src, offset, regnode);
a687059c
LW
4382}
4383
4384/*
c277df42 4385- regtail - set the next-pointer at the end of a node chain of p to val.
a0d0e21e 4386*/
76e3520e 4387STATIC void
830247a4 4388S_regtail(pTHX_ RExC_state_t *pRExC_state, regnode *p, regnode *val)
a687059c 4389{
c277df42
IZ
4390 register regnode *scan;
4391 register regnode *temp;
a0d0e21e 4392
c277df42 4393 if (SIZE_ONLY)
a0d0e21e
LW
4394 return;
4395
4396 /* Find last node. */
4397 scan = p;
4398 for (;;) {
4399 temp = regnext(scan);
4400 if (temp == NULL)
4401 break;
4402 scan = temp;
4403 }
a687059c 4404
c277df42
IZ
4405 if (reg_off_by_arg[OP(scan)]) {
4406 ARG_SET(scan, val - scan);
a0ed51b3
LW
4407 }
4408 else {
c277df42
IZ
4409 NEXT_OFF(scan) = val - scan;
4410 }
a687059c
LW
4411}
4412
4413/*
a0d0e21e
LW
4414- regoptail - regtail on operand of first argument; nop if operandless
4415*/
76e3520e 4416STATIC void
830247a4 4417S_regoptail(pTHX_ RExC_state_t *pRExC_state, regnode *p, regnode *val)
a687059c 4418{
a0d0e21e 4419 /* "Operandless" and "op != BRANCH" are synonymous in practice. */
c277df42
IZ
4420 if (p == NULL || SIZE_ONLY)
4421 return;
22c35a8c 4422 if (PL_regkind[(U8)OP(p)] == BRANCH) {
830247a4 4423 regtail(pRExC_state, NEXTOPER(p), val);
a0ed51b3 4424 }
22c35a8c 4425 else if ( PL_regkind[(U8)OP(p)] == BRANCHJ) {
830247a4 4426 regtail(pRExC_state, NEXTOPER(NEXTOPER(p)), val);
a0ed51b3
LW
4427 }
4428 else
a0d0e21e 4429 return;
a687059c
LW
4430}
4431
4432/*
4433 - regcurly - a little FSA that accepts {\d+,?\d*}
4434 */
79072805 4435STATIC I32
cea2e8a9 4436S_regcurly(pTHX_ register char *s)
a687059c
LW
4437{
4438 if (*s++ != '{')
4439 return FALSE;
f0fcb552 4440 if (!isDIGIT(*s))
a687059c 4441 return FALSE;
f0fcb552 4442 while (isDIGIT(*s))
a687059c
LW
4443 s++;
4444 if (*s == ',')
4445 s++;
f0fcb552 4446 while (isDIGIT(*s))
a687059c
LW
4447 s++;
4448 if (*s != '}')
4449 return FALSE;
4450 return TRUE;
4451}
4452
a687059c 4453
8fa7f367
JH
4454#ifdef DEBUGGING
4455
76e3520e 4456STATIC regnode *
cea2e8a9 4457S_dumpuntil(pTHX_ regnode *start, regnode *node, regnode *last, SV* sv, I32 l)
c277df42 4458{
f248d071 4459 register U8 op = EXACT; /* Arbitrary non-END op. */
155aba94 4460 register regnode *next;
c277df42
IZ
4461
4462 while (op != END && (!last || node < last)) {
4463 /* While that wasn't END last time... */
4464
4465 NODE_ALIGN(node);
4466 op = OP(node);
4467 if (op == CLOSE)
4468 l--;
4469 next = regnext(node);
4470 /* Where, what. */
4471 if (OP(node) == OPTIMIZED)
4472 goto after_print;
4473 regprop(sv, node);
b900a521 4474 PerlIO_printf(Perl_debug_log, "%4"IVdf":%*s%s", (IV)(node - start),
f1dbda3d 4475 (int)(2*l + 1), "", SvPVX(sv));
c277df42
IZ
4476 if (next == NULL) /* Next ptr. */
4477 PerlIO_printf(Perl_debug_log, "(0)");
b81d288d 4478 else
b900a521 4479 PerlIO_printf(Perl_debug_log, "(%"IVdf")", (IV)(next - start));
c277df42
IZ
4480 (void)PerlIO_putc(Perl_debug_log, '\n');
4481 after_print:
22c35a8c 4482 if (PL_regkind[(U8)op] == BRANCHJ) {
b81d288d
AB
4483 register regnode *nnode = (OP(next) == LONGJMP
4484 ? regnext(next)
c277df42
IZ
4485 : next);
4486 if (last && nnode > last)
4487 nnode = last;
4488 node = dumpuntil(start, NEXTOPER(NEXTOPER(node)), nnode, sv, l + 1);
a0ed51b3 4489 }
22c35a8c 4490 else if (PL_regkind[(U8)op] == BRANCH) {
c277df42 4491 node = dumpuntil(start, NEXTOPER(node), next, sv, l + 1);
a0ed51b3
LW
4492 }
4493 else if ( op == CURLY) { /* `next' might be very big: optimizer */
c277df42
IZ
4494 node = dumpuntil(start, NEXTOPER(node) + EXTRA_STEP_2ARGS,
4495 NEXTOPER(node) + EXTRA_STEP_2ARGS + 1, sv, l + 1);
a0ed51b3 4496 }
22c35a8c 4497 else if (PL_regkind[(U8)op] == CURLY && op != CURLYX) {
c277df42
IZ
4498 node = dumpuntil(start, NEXTOPER(node) + EXTRA_STEP_2ARGS,
4499 next, sv, l + 1);
a0ed51b3
LW
4500 }
4501 else if ( op == PLUS || op == STAR) {
c277df42 4502 node = dumpuntil(start, NEXTOPER(node), NEXTOPER(node) + 1, sv, l + 1);
a0ed51b3
LW
4503 }
4504 else if (op == ANYOF) {
4f66b38d
HS
4505 /* arglen 1 + class block */
4506 node += 1 + ((ANYOF_FLAGS(node) & ANYOF_LARGE)
4507 ? ANYOF_CLASS_SKIP : ANYOF_SKIP);
4508 node = NEXTOPER(node);
a0ed51b3 4509 }
22c35a8c 4510 else if (PL_regkind[(U8)op] == EXACT) {
c277df42 4511 /* Literal string, where present. */
cd439c50 4512 node += NODE_SZ_STR(node) - 1;
c277df42 4513 node = NEXTOPER(node);
a0ed51b3
LW
4514 }
4515 else {
c277df42
IZ
4516 node = NEXTOPER(node);
4517 node += regarglen[(U8)op];
4518 }
4519 if (op == CURLYX || op == OPEN)
4520 l++;
4521 else if (op == WHILEM)
4522 l--;
4523 }
4524 return node;
4525}
4526
8fa7f367
JH
4527#endif /* DEBUGGING */
4528
a687059c 4529/*
fd181c75 4530 - regdump - dump a regexp onto Perl_debug_log in vaguely comprehensible form
a687059c
LW
4531 */
4532void
864dbfa3 4533Perl_regdump(pTHX_ regexp *r)
a687059c 4534{
35ff7856 4535#ifdef DEBUGGING
46fc3d4c 4536 SV *sv = sv_newmortal();
a687059c 4537
c277df42 4538 (void)dumpuntil(r->program, r->program + 1, NULL, sv, 0);
a0d0e21e
LW
4539
4540 /* Header fields of interest. */
c277df42 4541 if (r->anchored_substr)
7b0972df 4542 PerlIO_printf(Perl_debug_log,
b81d288d 4543 "anchored `%s%.*s%s'%s at %"IVdf" ",
3280af22 4544 PL_colors[0],
7b0972df 4545 (int)(SvCUR(r->anchored_substr) - (SvTAIL(r->anchored_substr)!=0)),
b81d288d 4546 SvPVX(r->anchored_substr),
3280af22 4547 PL_colors[1],
c277df42 4548 SvTAIL(r->anchored_substr) ? "$" : "",
7b0972df 4549 (IV)r->anchored_offset);
33b8afdf
JH
4550 else if (r->anchored_utf8)
4551 PerlIO_printf(Perl_debug_log,
4552 "anchored utf8 `%s%.*s%s'%s at %"IVdf" ",
4553 PL_colors[0],
4554 (int)(SvCUR(r->anchored_utf8) - (SvTAIL(r->anchored_utf8)!=0)),
4555 SvPVX(r->anchored_utf8),
4556 PL_colors[1],
4557 SvTAIL(r->anchored_utf8) ? "$" : "",
4558 (IV)r->anchored_offset);
c277df42 4559 if (r->float_substr)
7b0972df 4560 PerlIO_printf(Perl_debug_log,
b81d288d 4561 "floating `%s%.*s%s'%s at %"IVdf"..%"UVuf" ",
3280af22 4562 PL_colors[0],
b81d288d 4563 (int)(SvCUR(r->float_substr) - (SvTAIL(r->float_substr)!=0)),
2c2d71f5 4564 SvPVX(r->float_substr),
3280af22 4565 PL_colors[1],
c277df42 4566 SvTAIL(r->float_substr) ? "$" : "",
7b0972df 4567 (IV)r->float_min_offset, (UV)r->float_max_offset);
33b8afdf
JH
4568 else if (r->float_utf8)
4569 PerlIO_printf(Perl_debug_log,
4570 "floating utf8 `%s%.*s%s'%s at %"IVdf"..%"UVuf" ",
4571 PL_colors[0],
4572 (int)(SvCUR(r->float_utf8) - (SvTAIL(r->float_utf8)!=0)),
4573 SvPVX(r->float_utf8),
4574 PL_colors[1],
4575 SvTAIL(r->float_utf8) ? "$" : "",
4576 (IV)r->float_min_offset, (UV)r->float_max_offset);
4577 if (r->check_substr || r->check_utf8)
b81d288d
AB
4578 PerlIO_printf(Perl_debug_log,
4579 r->check_substr == r->float_substr
33b8afdf 4580 && r->check_utf8 == r->float_utf8
c277df42
IZ
4581 ? "(checking floating" : "(checking anchored");
4582 if (r->reganch & ROPT_NOSCAN)
4583 PerlIO_printf(Perl_debug_log, " noscan");
4584 if (r->reganch & ROPT_CHECK_ALL)
4585 PerlIO_printf(Perl_debug_log, " isall");
33b8afdf 4586 if (r->check_substr || r->check_utf8)
c277df42
IZ
4587 PerlIO_printf(Perl_debug_log, ") ");
4588
46fc3d4c 4589 if (r->regstclass) {
4590 regprop(sv, r->regstclass);
4591 PerlIO_printf(Perl_debug_log, "stclass `%s' ", SvPVX(sv));
4592 }
774d564b 4593 if (r->reganch & ROPT_ANCH) {
4594 PerlIO_printf(Perl_debug_log, "anchored");
4595 if (r->reganch & ROPT_ANCH_BOL)
4596 PerlIO_printf(Perl_debug_log, "(BOL)");
c277df42
IZ
4597 if (r->reganch & ROPT_ANCH_MBOL)
4598 PerlIO_printf(Perl_debug_log, "(MBOL)");
cad2e5aa
JH
4599 if (r->reganch & ROPT_ANCH_SBOL)
4600 PerlIO_printf(Perl_debug_log, "(SBOL)");
774d564b 4601 if (r->reganch & ROPT_ANCH_GPOS)
4602 PerlIO_printf(Perl_debug_log, "(GPOS)");
4603 PerlIO_putc(Perl_debug_log, ' ');
4604 }
c277df42
IZ
4605 if (r->reganch & ROPT_GPOS_SEEN)
4606 PerlIO_printf(Perl_debug_log, "GPOS ");
a0d0e21e 4607 if (r->reganch & ROPT_SKIP)
760ac839 4608 PerlIO_printf(Perl_debug_log, "plus ");
a0d0e21e 4609 if (r->reganch & ROPT_IMPLICIT)
760ac839 4610 PerlIO_printf(Perl_debug_log, "implicit ");
760ac839 4611 PerlIO_printf(Perl_debug_log, "minlen %ld ", (long) r->minlen);
ce862d02
IZ
4612 if (r->reganch & ROPT_EVAL_SEEN)
4613 PerlIO_printf(Perl_debug_log, "with eval ");
760ac839 4614 PerlIO_printf(Perl_debug_log, "\n");
fac92740
MJD
4615 if (r->offsets) {
4616 U32 i;
4617 U32 len = r->offsets[0];
392fbf5d 4618 PerlIO_printf(Perl_debug_log, "Offsets: [%"UVuf"]\n\t", (UV)r->offsets[0]);
fac92740 4619 for (i = 1; i <= len; i++)
392fbf5d
RB
4620 PerlIO_printf(Perl_debug_log, "%"UVuf"[%"UVuf"] ",
4621 (UV)r->offsets[i*2-1],
4622 (UV)r->offsets[i*2]);
fac92740
MJD
4623 PerlIO_printf(Perl_debug_log, "\n");
4624 }
17c3b450 4625#endif /* DEBUGGING */
a687059c
LW
4626}
4627
8fa7f367
JH
4628#ifdef DEBUGGING
4629
653099ff
GS
4630STATIC void
4631S_put_byte(pTHX_ SV *sv, int c)
4632{
7be5a6cf 4633 if (isCNTRL(c) || c == 255 || !isPRINT(c))
653099ff
GS
4634 Perl_sv_catpvf(aTHX_ sv, "\\%o", c);
4635 else if (c == '-' || c == ']' || c == '\\' || c == '^')
4636 Perl_sv_catpvf(aTHX_ sv, "\\%c", c);
4637 else
4638 Perl_sv_catpvf(aTHX_ sv, "%c", c);
4639}
4640
8fa7f367
JH
4641#endif /* DEBUGGING */
4642
a687059c 4643/*
a0d0e21e
LW
4644- regprop - printable representation of opcode
4645*/
46fc3d4c 4646void
864dbfa3 4647Perl_regprop(pTHX_ SV *sv, regnode *o)
a687059c 4648{
35ff7856 4649#ifdef DEBUGGING
9b155405 4650 register int k;
a0d0e21e 4651
54dc92de 4652 sv_setpvn(sv, "", 0);
9b155405 4653 if (OP(o) >= reg_num) /* regnode.type is unsigned */
830247a4
IZ
4654 /* It would be nice to FAIL() here, but this may be called from
4655 regexec.c, and it would be hard to supply pRExC_state. */
4656 Perl_croak(aTHX_ "Corrupted regexp opcode");
9b155405
IZ
4657 sv_catpv(sv, (char*)reg_name[OP(o)]); /* Take off const! */
4658
4659 k = PL_regkind[(U8)OP(o)];
4660
2a782b5b
JH
4661 if (k == EXACT) {
4662 SV *dsv = sv_2mortal(newSVpvn("", 0));
c728cb41
JH
4663 /* Using is_utf8_string() is a crude hack but it may
4664 * be the best for now since we have no flag "this EXACTish
4665 * node was UTF-8" --jhi */
4666 bool do_utf8 = is_utf8_string((U8*)STRING(o), STR_LEN(o));
8a989385 4667 char *s = do_utf8 ?
c728cb41
JH
4668 pv_uni_display(dsv, (U8*)STRING(o), STR_LEN(o), 60,
4669 UNI_DISPLAY_REGEX) :
2a782b5b 4670 STRING(o);
40eddc46 4671 int len = do_utf8 ?
2a782b5b
JH
4672 strlen(s) :
4673 STR_LEN(o);
4674 Perl_sv_catpvf(aTHX_ sv, " <%s%.*s%s>",
4675 PL_colors[0],
4676 len, s,
4677 PL_colors[1]);
4678 }
9b155405 4679 else if (k == CURLY) {
cb434fcc 4680 if (OP(o) == CURLYM || OP(o) == CURLYN || OP(o) == CURLYX)
cea2e8a9
GS
4681 Perl_sv_catpvf(aTHX_ sv, "[%d]", o->flags); /* Parenth number */
4682 Perl_sv_catpvf(aTHX_ sv, " {%d,%d}", ARG1(o), ARG2(o));
a0d0e21e 4683 }
2c2d71f5
JH
4684 else if (k == WHILEM && o->flags) /* Ordinal/of */
4685 Perl_sv_catpvf(aTHX_ sv, "[%d/%d]", o->flags & 0xf, o->flags>>4);
9b155405 4686 else if (k == REF || k == OPEN || k == CLOSE || k == GROUPP )
894356b3 4687 Perl_sv_catpvf(aTHX_ sv, "%d", (int)ARG(o)); /* Parenth number */
9b155405 4688 else if (k == LOGICAL)
04ebc1ab 4689 Perl_sv_catpvf(aTHX_ sv, "[%d]", o->flags); /* 2: embedded, otherwise 1 */
653099ff
GS
4690 else if (k == ANYOF) {
4691 int i, rangestart = -1;
ffc61ed2 4692 U8 flags = ANYOF_FLAGS(o);
19860706
JH
4693 const char * const anyofs[] = { /* Should be syncronized with
4694 * ANYOF_ #xdefines in regcomp.h */
653099ff
GS
4695 "\\w",
4696 "\\W",
4697 "\\s",
4698 "\\S",
4699 "\\d",
4700 "\\D",
4701 "[:alnum:]",
4702 "[:^alnum:]",
4703 "[:alpha:]",
4704 "[:^alpha:]",
4705 "[:ascii:]",
4706 "[:^ascii:]",
4707 "[:ctrl:]",
4708 "[:^ctrl:]",
4709 "[:graph:]",
4710 "[:^graph:]",
4711 "[:lower:]",
4712 "[:^lower:]",
4713 "[:print:]",
4714 "[:^print:]",
4715 "[:punct:]",
4716 "[:^punct:]",
4717 "[:upper:]",
aaa51d5e 4718 "[:^upper:]",
653099ff 4719 "[:xdigit:]",
aaa51d5e
JF
4720 "[:^xdigit:]",
4721 "[:space:]",
4722 "[:^space:]",
4723 "[:blank:]",
4724 "[:^blank:]"
653099ff
GS
4725 };
4726
19860706 4727 if (flags & ANYOF_LOCALE)
653099ff 4728 sv_catpv(sv, "{loc}");
19860706 4729 if (flags & ANYOF_FOLD)
653099ff
GS
4730 sv_catpv(sv, "{i}");
4731 Perl_sv_catpvf(aTHX_ sv, "[%s", PL_colors[0]);
19860706 4732 if (flags & ANYOF_INVERT)
653099ff 4733 sv_catpv(sv, "^");
ffc61ed2
JH
4734 for (i = 0; i <= 256; i++) {
4735 if (i < 256 && ANYOF_BITMAP_TEST(o,i)) {
4736 if (rangestart == -1)
4737 rangestart = i;
4738 } else if (rangestart != -1) {
4739 if (i <= rangestart + 3)
4740 for (; rangestart < i; rangestart++)
653099ff 4741 put_byte(sv, rangestart);
ffc61ed2
JH
4742 else {
4743 put_byte(sv, rangestart);
4744 sv_catpv(sv, "-");
4745 put_byte(sv, i - 1);
653099ff 4746 }
ffc61ed2 4747 rangestart = -1;
653099ff 4748 }
847a199f 4749 }
ffc61ed2
JH
4750
4751 if (o->flags & ANYOF_CLASS)
4752 for (i = 0; i < sizeof(anyofs)/sizeof(char*); i++)
4753 if (ANYOF_CLASS_TEST(o,i))
4754 sv_catpv(sv, anyofs[i]);
4755
4756 if (flags & ANYOF_UNICODE)
4757 sv_catpv(sv, "{unicode}");
1aa99e6b 4758 else if (flags & ANYOF_UNICODE_ALL)
2a782b5b 4759 sv_catpv(sv, "{unicode_all}");
ffc61ed2
JH
4760
4761 {
4762 SV *lv;
9e55ce06 4763 SV *sw = regclass_swash(o, FALSE, &lv, 0);
b81d288d 4764
ffc61ed2
JH
4765 if (lv) {
4766 if (sw) {
4767 UV i;
4768 U8 s[UTF8_MAXLEN+1];
b81d288d 4769
ffc61ed2 4770 for (i = 0; i <= 256; i++) { /* just the first 256 */
2b9d42f0 4771 U8 *e = uvchr_to_utf8(s, i);
ffc61ed2 4772
3568d838 4773 if (i < 256 && swash_fetch(sw, s, TRUE)) {
ffc61ed2
JH
4774 if (rangestart == -1)
4775 rangestart = i;
4776 } else if (rangestart != -1) {
4777 U8 *p;
b81d288d 4778
ffc61ed2
JH
4779 if (i <= rangestart + 3)
4780 for (; rangestart < i; rangestart++) {
2b9d42f0 4781 for(e = uvchr_to_utf8(s, rangestart), p = s; p < e; p++)
ffc61ed2
JH
4782 put_byte(sv, *p);
4783 }
4784 else {
2b9d42f0 4785 for (e = uvchr_to_utf8(s, rangestart), p = s; p < e; p++)
ffc61ed2
JH
4786 put_byte(sv, *p);
4787 sv_catpv(sv, "-");
2b9d42f0 4788 for (e = uvchr_to_utf8(s, i - 1), p = s; p < e; p++)
ffc61ed2
JH
4789 put_byte(sv, *p);
4790 }
4791 rangestart = -1;
4792 }
19860706 4793 }
ffc61ed2
JH
4794
4795 sv_catpv(sv, "..."); /* et cetera */
19860706 4796 }
fde631ed 4797
ffc61ed2
JH
4798 {
4799 char *s = savepv(SvPVX(lv));
4800 char *origs = s;
b81d288d 4801
ffc61ed2 4802 while(*s && *s != '\n') s++;
b81d288d 4803
ffc61ed2
JH
4804 if (*s == '\n') {
4805 char *t = ++s;
4806
4807 while (*s) {
4808 if (*s == '\n')
4809 *s = ' ';
4810 s++;
4811 }
4812 if (s[-1] == ' ')
4813 s[-1] = 0;
4814
4815 sv_catpv(sv, t);
fde631ed 4816 }
b81d288d 4817
ffc61ed2 4818 Safefree(origs);
fde631ed
JH
4819 }
4820 }
653099ff 4821 }
ffc61ed2 4822
653099ff
GS
4823 Perl_sv_catpvf(aTHX_ sv, "%s]", PL_colors[1]);
4824 }
9b155405 4825 else if (k == BRANCHJ && (OP(o) == UNLESSM || OP(o) == IFMATCH))
cea2e8a9 4826 Perl_sv_catpvf(aTHX_ sv, "[-%d]", o->flags);
17c3b450 4827#endif /* DEBUGGING */
35ff7856 4828}
a687059c 4829
cad2e5aa
JH
4830SV *
4831Perl_re_intuit_string(pTHX_ regexp *prog)
4832{ /* Assume that RE_INTUIT is set */
4833 DEBUG_r(
4834 { STRLEN n_a;
33b8afdf
JH
4835 char *s = SvPV(prog->check_substr
4836 ? prog->check_substr : prog->check_utf8, n_a);
cad2e5aa
JH
4837
4838 if (!PL_colorset) reginitcolors();
4839 PerlIO_printf(Perl_debug_log,
33b8afdf
JH
4840 "%sUsing REx %ssubstr:%s `%s%.60s%s%s'\n",
4841 PL_colors[4],
4842 prog->check_substr ? "" : "utf8 ",
4843 PL_colors[5],PL_colors[0],
cad2e5aa
JH
4844 s,
4845 PL_colors[1],
4846 (strlen(s) > 60 ? "..." : ""));
4847 } );
4848
33b8afdf 4849 return prog->check_substr ? prog->check_substr : prog->check_utf8;
cad2e5aa
JH
4850}
4851
2b69d0c2 4852void
864dbfa3 4853Perl_pregfree(pTHX_ struct regexp *r)
a687059c 4854{
9e55ce06
JH
4855#ifdef DEBUGGING
4856 SV *dsv = PERL_DEBUG_PAD_ZERO(0);
4857#endif
7821416a
IZ
4858
4859 if (!r || (--r->refcnt > 0))
4860 return;
9e55ce06 4861 DEBUG_r({
9e55ce06 4862 char *s = pv_uni_display(dsv, (U8*)r->precomp, r->prelen, 60,
c728cb41 4863 UNI_DISPLAY_REGEX);
9e55ce06
JH
4864 int len = SvCUR(dsv);
4865 if (!PL_colorset)
4866 reginitcolors();
4867 PerlIO_printf(Perl_debug_log,
4868 "%sFreeing REx:%s `%s%*.*s%s%s'\n",
4869 PL_colors[4],PL_colors[5],PL_colors[0],
4870 len, len, s,
4871 PL_colors[1],
4872 len > 60 ? "..." : "");
4873 });
cad2e5aa 4874
c277df42 4875 if (r->precomp)
a0d0e21e 4876 Safefree(r->precomp);
fac92740
MJD
4877 if (r->offsets) /* 20010421 MJD */
4878 Safefree(r->offsets);
cf93c79d
IZ
4879 if (RX_MATCH_COPIED(r))
4880 Safefree(r->subbeg);
a193d654
GS
4881 if (r->substrs) {
4882 if (r->anchored_substr)
4883 SvREFCNT_dec(r->anchored_substr);
33b8afdf
JH
4884 if (r->anchored_utf8)
4885 SvREFCNT_dec(r->anchored_utf8);
a193d654
GS
4886 if (r->float_substr)
4887 SvREFCNT_dec(r->float_substr);
33b8afdf
JH
4888 if (r->float_utf8)
4889 SvREFCNT_dec(r->float_utf8);
2779dcf1 4890 Safefree(r->substrs);
a193d654 4891 }
c277df42
IZ
4892 if (r->data) {
4893 int n = r->data->count;
dfad63ad
HS
4894 AV* new_comppad = NULL;
4895 AV* old_comppad;
4896 SV** old_curpad;
4897
c277df42 4898 while (--n >= 0) {
261faec3 4899 /* If you add a ->what type here, update the comment in regcomp.h */
c277df42
IZ
4900 switch (r->data->what[n]) {
4901 case 's':
4902 SvREFCNT_dec((SV*)r->data->data[n]);
4903 break;
653099ff
GS
4904 case 'f':
4905 Safefree(r->data->data[n]);
4906 break;
dfad63ad
HS
4907 case 'p':
4908 new_comppad = (AV*)r->data->data[n];
4909 break;
c277df42 4910 case 'o':
dfad63ad 4911 if (new_comppad == NULL)
cea2e8a9 4912 Perl_croak(aTHX_ "panic: pregfree comppad");
dfad63ad
HS
4913 old_comppad = PL_comppad;
4914 old_curpad = PL_curpad;
1e6dc0b6
SB
4915 /* Watch out for global destruction's random ordering. */
4916 if (SvTYPE(new_comppad) == SVt_PVAV) {
4917 PL_comppad = new_comppad;
4918 PL_curpad = AvARRAY(new_comppad);
4919 }
4920 else
4921 PL_curpad = NULL;
9b978d73
DM
4922
4923 if (!OpREFCNT_dec((OP_4tree*)r->data->data[n])) {
4924 op_free((OP_4tree*)r->data->data[n]);
4925 }
4926
dfad63ad
HS
4927 PL_comppad = old_comppad;
4928 PL_curpad = old_curpad;
4929 SvREFCNT_dec((SV*)new_comppad);
4930 new_comppad = NULL;
c277df42
IZ
4931 break;
4932 case 'n':
9e55ce06 4933 break;
c277df42 4934 default:
830247a4 4935 Perl_croak(aTHX_ "panic: regfree data code '%c'", r->data->what[n]);
c277df42
IZ
4936 }
4937 }
4938 Safefree(r->data->what);
4939 Safefree(r->data);
a0d0e21e
LW
4940 }
4941 Safefree(r->startp);
4942 Safefree(r->endp);
4943 Safefree(r);
a687059c 4944}
c277df42
IZ
4945
4946/*
4947 - regnext - dig the "next" pointer out of a node
4948 *
4949 * [Note, when REGALIGN is defined there are two places in regmatch()
4950 * that bypass this code for speed.]
4951 */
4952regnode *
864dbfa3 4953Perl_regnext(pTHX_ register regnode *p)
c277df42
IZ
4954{
4955 register I32 offset;
4956
3280af22 4957 if (p == &PL_regdummy)
c277df42
IZ
4958 return(NULL);
4959
4960 offset = (reg_off_by_arg[OP(p)] ? ARG(p) : NEXT_OFF(p));
4961 if (offset == 0)
4962 return(NULL);
4963
c277df42 4964 return(p+offset);
c277df42
IZ
4965}
4966
01f988be 4967STATIC void
cea2e8a9 4968S_re_croak2(pTHX_ const char* pat1,const char* pat2,...)
c277df42
IZ
4969{
4970 va_list args;
4971 STRLEN l1 = strlen(pat1);
4972 STRLEN l2 = strlen(pat2);
4973 char buf[512];
06bf62c7 4974 SV *msv;
c277df42
IZ
4975 char *message;
4976
4977 if (l1 > 510)
4978 l1 = 510;
4979 if (l1 + l2 > 510)
4980 l2 = 510 - l1;
4981 Copy(pat1, buf, l1 , char);
4982 Copy(pat2, buf + l1, l2 , char);
3b818b81
GS
4983 buf[l1 + l2] = '\n';
4984 buf[l1 + l2 + 1] = '\0';
8736538c
AS
4985#ifdef I_STDARG
4986 /* ANSI variant takes additional second argument */
c277df42 4987 va_start(args, pat2);
8736538c
AS
4988#else
4989 va_start(args);
4990#endif
5a844595 4991 msv = vmess(buf, &args);
c277df42 4992 va_end(args);
06bf62c7 4993 message = SvPV(msv,l1);
c277df42
IZ
4994 if (l1 > 512)
4995 l1 = 512;
4996 Copy(message, buf, l1 , char);
4997 buf[l1] = '\0'; /* Overwrite \n */
cea2e8a9 4998 Perl_croak(aTHX_ "%s", buf);
c277df42 4999}
a0ed51b3
LW
5000
5001/* XXX Here's a total kludge. But we need to re-enter for swash routines. */
5002
5003void
864dbfa3 5004Perl_save_re_context(pTHX)
b81d288d 5005{
830247a4
IZ
5006#if 0
5007 SAVEPPTR(RExC_precomp); /* uncompiled string. */
5008 SAVEI32(RExC_npar); /* () count. */
5009 SAVEI32(RExC_size); /* Code size. */
5010 SAVEI16(RExC_flags16); /* are we folding, multilining? */
5011 SAVEVPTR(RExC_rx); /* from regcomp.c */
5012 SAVEI32(RExC_seen); /* from regcomp.c */
5013 SAVEI32(RExC_sawback); /* Did we see \1, ...? */
5014 SAVEI32(RExC_naughty); /* How bad is this pattern? */
5015 SAVEVPTR(RExC_emit); /* Code-emit pointer; &regdummy = don't */
5016 SAVEPPTR(RExC_end); /* End of input for compile */
5017 SAVEPPTR(RExC_parse); /* Input-scan pointer. */
5018#endif
5019
5020 SAVEI32(PL_reg_flags); /* from regexec.c */
a0ed51b3 5021 SAVEPPTR(PL_bostr);
a0ed51b3
LW
5022 SAVEPPTR(PL_reginput); /* String-input pointer. */
5023 SAVEPPTR(PL_regbol); /* Beginning of input, for ^ check. */
5024 SAVEPPTR(PL_regeol); /* End of input, for $ check. */
7766f137
GS
5025 SAVEVPTR(PL_regstartp); /* Pointer to startp array. */
5026 SAVEVPTR(PL_regendp); /* Ditto for endp. */
5027 SAVEVPTR(PL_reglastparen); /* Similarly for lastparen. */
a0ed51b3 5028 SAVEPPTR(PL_regtill); /* How far we are required to go. */
b81d288d 5029 SAVEGENERICPV(PL_reg_start_tmp); /* from regexec.c */
a0ed51b3 5030 PL_reg_start_tmp = 0;
a0ed51b3
LW
5031 SAVEI32(PL_reg_start_tmpl); /* from regexec.c */
5032 PL_reg_start_tmpl = 0;
7766f137 5033 SAVEVPTR(PL_regdata);
a0ed51b3
LW
5034 SAVEI32(PL_reg_eval_set); /* from regexec.c */
5035 SAVEI32(PL_regnarrate); /* from regexec.c */
7766f137 5036 SAVEVPTR(PL_regprogram); /* from regexec.c */
a0ed51b3 5037 SAVEINT(PL_regindent); /* from regexec.c */
7766f137
GS
5038 SAVEVPTR(PL_regcc); /* from regexec.c */
5039 SAVEVPTR(PL_curcop);
7766f137
GS
5040 SAVEVPTR(PL_reg_call_cc); /* from regexec.c */
5041 SAVEVPTR(PL_reg_re); /* from regexec.c */
54b6e2fa
IZ
5042 SAVEPPTR(PL_reg_ganch); /* from regexec.c */
5043 SAVESPTR(PL_reg_sv); /* from regexec.c */
53c4c00c 5044 SAVEI8(PL_reg_match_utf8); /* from regexec.c */
7766f137 5045 SAVEVPTR(PL_reg_magic); /* from regexec.c */
54b6e2fa 5046 SAVEI32(PL_reg_oldpos); /* from regexec.c */
7766f137
GS
5047 SAVEVPTR(PL_reg_oldcurpm); /* from regexec.c */
5048 SAVEVPTR(PL_reg_curpm); /* from regexec.c */
5fb7366e 5049 SAVEI32(PL_regnpar); /* () count. */
e49a9654 5050 SAVEI32(PL_regsize); /* from regexec.c */
54b6e2fa 5051#ifdef DEBUGGING
b81d288d 5052 SAVEPPTR(PL_reg_starttry); /* from regexec.c */
54b6e2fa 5053#endif
a0ed51b3 5054}
51371543 5055
51371543 5056static void
acfe0abc 5057clear_re(pTHX_ void *r)
51371543
GS
5058{
5059 ReREFCNT_dec((regexp *)r);
5060}
ffbc6a93 5061