This is a live mirror of the Perl 5 development currently hosted at https://github.com/perl/perl5
Proper skip tests and VMS unlink
[perl5.git] / regcomp.c
CommitLineData
a0d0e21e
LW
1/* regcomp.c
2 */
3
4/*
5 * "A fair jaw-cracker dwarf-language must be." --Samwise Gamgee
6 */
7
a687059c
LW
8/* NOTE: this is derived from Henry Spencer's regexp code, and should not
9 * confused with the original package (see point 3 below). Thanks, Henry!
10 */
11
12/* Additional note: this code is very heavily munged from Henry's version
13 * in places. In some spots I've traded clarity for efficiency, so don't
14 * blame Henry for some of the lack of readability.
15 */
16
e50aee73
AD
17/* The names of the functions have been changed from regcomp and
18 * regexec to pregcomp and pregexec in order to avoid conflicts
19 * with the POSIX routines of the same names.
20*/
21
b9d5759e
AD
22#ifdef PERL_EXT_RE_BUILD
23/* need to replace pregcomp et al, so enable that */
24# ifndef PERL_IN_XSUB_RE
25# define PERL_IN_XSUB_RE
26# endif
27/* need access to debugger hooks */
cad2e5aa 28# if defined(PERL_EXT_RE_DEBUG) && !defined(DEBUGGING)
b9d5759e
AD
29# define DEBUGGING
30# endif
31#endif
32
33#ifdef PERL_IN_XSUB_RE
d06ea78c 34/* We *really* need to overwrite these symbols: */
56953603
IZ
35# define Perl_pregcomp my_regcomp
36# define Perl_regdump my_regdump
37# define Perl_regprop my_regprop
d06ea78c 38# define Perl_pregfree my_regfree
cad2e5aa
JH
39# define Perl_re_intuit_string my_re_intuit_string
40/* *These* symbols are masked to allow static link. */
d06ea78c 41# define Perl_regnext my_regnext
f0b8d043 42# define Perl_save_re_context my_save_re_context
b81d288d 43# define Perl_reginitcolors my_reginitcolors
c5be433b
GS
44
45# define PERL_NO_GET_CONTEXT
b81d288d 46#endif
56953603 47
f0fcb552 48/*SUPPRESS 112*/
a687059c 49/*
e50aee73 50 * pregcomp and pregexec -- regsub and regerror are not used in perl
a687059c
LW
51 *
52 * Copyright (c) 1986 by University of Toronto.
53 * Written by Henry Spencer. Not derived from licensed software.
54 *
55 * Permission is granted to anyone to use this software for any
56 * purpose on any computer system, and to redistribute it freely,
57 * subject to the following restrictions:
58 *
59 * 1. The author is not responsible for the consequences of use of
60 * this software, no matter how awful, even if they arise
61 * from defects in it.
62 *
63 * 2. The origin of this software must not be misrepresented, either
64 * by explicit claim or by omission.
65 *
66 * 3. Altered versions must be plainly marked as such, and must not
67 * be misrepresented as being the original software.
68 *
69 *
70 **** Alterations to Henry's code are...
71 ****
bc89e66f 72 **** Copyright (c) 1991-2001, Larry Wall
a687059c 73 ****
9ef589d8
LW
74 **** You may distribute under the terms of either the GNU General Public
75 **** License or the Artistic License, as specified in the README file.
76
a687059c
LW
77 *
78 * Beware that some of this code is subtly aware of the way operator
79 * precedence is structured in regular expressions. Serious changes in
80 * regular-expression syntax might require a total rethink.
81 */
82#include "EXTERN.h"
864dbfa3 83#define PERL_IN_REGCOMP_C
a687059c 84#include "perl.h"
d06ea78c 85
acfe0abc 86#ifndef PERL_IN_XSUB_RE
d06ea78c
GS
87# include "INTERN.h"
88#endif
c277df42
IZ
89
90#define REG_COMP_C
a687059c
LW
91#include "regcomp.h"
92
d4cce5f1 93#ifdef op
11343788 94#undef op
d4cce5f1 95#endif /* op */
11343788 96
fe14fcc3
LW
97#ifdef MSDOS
98# if defined(BUGGY_MSC6)
99 /* MSC 6.00A breaks on op/regexp.t test 85 unless we turn this off */
100 # pragma optimize("a",off)
101 /* But MSC 6.00A is happy with 'w', for aliases only across function calls*/
102 # pragma optimize("w",on )
103# endif /* BUGGY_MSC6 */
104#endif /* MSDOS */
105
a687059c
LW
106#ifndef STATIC
107#define STATIC static
108#endif
109
830247a4
IZ
110typedef struct RExC_state_t {
111 U16 flags16; /* are we folding, multilining? */
112 char *precomp; /* uncompiled string. */
113 regexp *rx;
fac92740 114 char *start; /* Start of input for compile */
830247a4
IZ
115 char *end; /* End of input for compile */
116 char *parse; /* Input-scan pointer. */
117 I32 whilem_seen; /* number of WHILEM in this expr */
fac92740 118 regnode *emit_start; /* Start of emitted-code area */
ffc61ed2 119 regnode *emit; /* Code-emit pointer; &regdummy = don't = compiling */
830247a4
IZ
120 I32 naughty; /* How bad is this pattern? */
121 I32 sawback; /* Did we see \1, ...? */
122 U32 seen;
123 I32 size; /* Code size. */
124 I32 npar; /* () count. */
125 I32 extralen;
126 I32 seen_zerolen;
127 I32 seen_evals;
1aa99e6b 128 I32 utf8;
830247a4
IZ
129#if ADD_TO_REGEXEC
130 char *starttry; /* -Dr: where regtry was called. */
131#define RExC_starttry (pRExC_state->starttry)
132#endif
133} RExC_state_t;
134
135#define RExC_flags16 (pRExC_state->flags16)
136#define RExC_precomp (pRExC_state->precomp)
137#define RExC_rx (pRExC_state->rx)
fac92740 138#define RExC_start (pRExC_state->start)
830247a4
IZ
139#define RExC_end (pRExC_state->end)
140#define RExC_parse (pRExC_state->parse)
141#define RExC_whilem_seen (pRExC_state->whilem_seen)
fac92740 142#define RExC_offsets (pRExC_state->rx->offsets) /* I am not like the others */
830247a4 143#define RExC_emit (pRExC_state->emit)
fac92740 144#define RExC_emit_start (pRExC_state->emit_start)
830247a4
IZ
145#define RExC_naughty (pRExC_state->naughty)
146#define RExC_sawback (pRExC_state->sawback)
147#define RExC_seen (pRExC_state->seen)
148#define RExC_size (pRExC_state->size)
149#define RExC_npar (pRExC_state->npar)
150#define RExC_extralen (pRExC_state->extralen)
151#define RExC_seen_zerolen (pRExC_state->seen_zerolen)
152#define RExC_seen_evals (pRExC_state->seen_evals)
1aa99e6b 153#define RExC_utf8 (pRExC_state->utf8)
830247a4 154
a687059c
LW
155#define ISMULT1(c) ((c) == '*' || (c) == '+' || (c) == '?')
156#define ISMULT2(s) ((*s) == '*' || (*s) == '+' || (*s) == '?' || \
157 ((*s) == '{' && regcurly(s)))
a687059c 158
35c8bce7
LW
159#ifdef SPSTART
160#undef SPSTART /* dratted cpp namespace... */
161#endif
a687059c
LW
162/*
163 * Flags to be passed up and down.
164 */
a687059c 165#define WORST 0 /* Worst case. */
821b33a5 166#define HASWIDTH 0x1 /* Known to match non-null strings. */
a0d0e21e
LW
167#define SIMPLE 0x2 /* Simple enough to be STAR/PLUS operand. */
168#define SPSTART 0x4 /* Starts with * or +. */
169#define TRYAGAIN 0x8 /* Weeded out a declaration. */
a687059c 170
2c2d71f5
JH
171/* Length of a variant. */
172
173typedef struct scan_data_t {
174 I32 len_min;
175 I32 len_delta;
176 I32 pos_min;
177 I32 pos_delta;
178 SV *last_found;
179 I32 last_end; /* min value, <0 unless valid. */
180 I32 last_start_min;
181 I32 last_start_max;
182 SV **longest; /* Either &l_fixed, or &l_float. */
183 SV *longest_fixed;
184 I32 offset_fixed;
185 SV *longest_float;
186 I32 offset_float_min;
187 I32 offset_float_max;
188 I32 flags;
189 I32 whilem_c;
cb434fcc 190 I32 *last_closep;
653099ff 191 struct regnode_charclass_class *start_class;
2c2d71f5
JH
192} scan_data_t;
193
a687059c 194/*
e50aee73 195 * Forward declarations for pregcomp()'s friends.
a687059c 196 */
a0d0e21e 197
b81d288d 198static scan_data_t zero_scan_data = { 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
cb434fcc 199 0, 0, 0, 0, 0, 0};
c277df42
IZ
200
201#define SF_BEFORE_EOL (SF_BEFORE_SEOL|SF_BEFORE_MEOL)
202#define SF_BEFORE_SEOL 0x1
203#define SF_BEFORE_MEOL 0x2
204#define SF_FIX_BEFORE_EOL (SF_FIX_BEFORE_SEOL|SF_FIX_BEFORE_MEOL)
205#define SF_FL_BEFORE_EOL (SF_FL_BEFORE_SEOL|SF_FL_BEFORE_MEOL)
206
09b7f37c
CB
207#ifdef NO_UNARY_PLUS
208# define SF_FIX_SHIFT_EOL (0+2)
209# define SF_FL_SHIFT_EOL (0+4)
210#else
211# define SF_FIX_SHIFT_EOL (+2)
212# define SF_FL_SHIFT_EOL (+4)
213#endif
c277df42
IZ
214
215#define SF_FIX_BEFORE_SEOL (SF_BEFORE_SEOL << SF_FIX_SHIFT_EOL)
216#define SF_FIX_BEFORE_MEOL (SF_BEFORE_MEOL << SF_FIX_SHIFT_EOL)
217
218#define SF_FL_BEFORE_SEOL (SF_BEFORE_SEOL << SF_FL_SHIFT_EOL)
219#define SF_FL_BEFORE_MEOL (SF_BEFORE_MEOL << SF_FL_SHIFT_EOL) /* 0x20 */
220#define SF_IS_INF 0x40
221#define SF_HAS_PAR 0x80
222#define SF_IN_PAR 0x100
223#define SF_HAS_EVAL 0x200
4bfe0158 224#define SCF_DO_SUBSTR 0x400
653099ff
GS
225#define SCF_DO_STCLASS_AND 0x0800
226#define SCF_DO_STCLASS_OR 0x1000
227#define SCF_DO_STCLASS (SCF_DO_STCLASS_AND|SCF_DO_STCLASS_OR)
e1901655 228#define SCF_WHILEM_VISITED_POS 0x2000
c277df42 229
1aa99e6b 230#define UTF RExC_utf8
830247a4
IZ
231#define LOC (RExC_flags16 & PMf_LOCALE)
232#define FOLD (RExC_flags16 & PMf_FOLD)
a0ed51b3 233
ffc61ed2 234#define OOB_UNICODE 12345678
93733859 235#define OOB_NAMEDCLASS -1
b8c5462f 236
a0ed51b3
LW
237#define CHR_SVLEN(sv) (UTF ? sv_len_utf8(sv) : SvCUR(sv))
238#define CHR_DIST(a,b) (UTF ? utf8_distance(a,b) : a - b)
239
8615cb43 240
b45f050a
JF
241/* length of regex to show in messages that don't mark a position within */
242#define RegexLengthToShowInErrorMessages 127
243
244/*
245 * If MARKER[12] are adjusted, be sure to adjust the constants at the top
246 * of t/op/regmesg.t, the tests in t/op/re_tests, and those in
247 * op/pragma/warn/regcomp.
248 */
7253e4e3
RK
249#define MARKER1 "<-- HERE" /* marker as it appears in the description */
250#define MARKER2 " <-- HERE " /* marker as it appears within the regex */
b81d288d 251
7253e4e3 252#define REPORT_LOCATION " in regex; marked by " MARKER1 " in m/%.*s" MARKER2 "%s/"
b45f050a
JF
253
254/*
255 * Calls SAVEDESTRUCTOR_X if needed, then calls Perl_croak with the given
256 * arg. Show regex, up to a maximum length. If it's too long, chop and add
257 * "...".
258 */
a4eb266f 259#define FAIL(msg) \
8615cb43 260 STMT_START { \
a4eb266f 261 char *ellipses = ""; \
7a799e20 262 IV len = RExC_end - RExC_precomp; \
b45f050a 263 \
8615cb43 264 if (!SIZE_ONLY) \
830247a4 265 SAVEDESTRUCTOR_X(clear_re,(void*)RExC_rx); \
b45f050a
JF
266 \
267 if (len > RegexLengthToShowInErrorMessages) { \
268 /* chop 10 shorter than the max, to ensure meaning of "..." */ \
269 len = RegexLengthToShowInErrorMessages - 10; \
a4eb266f 270 ellipses = "..."; \
b45f050a
JF
271 } \
272 Perl_croak(aTHX_ "%s in regex m/%.*s%s/", \
830247a4 273 msg, (int)len, RExC_precomp, ellipses); \
8615cb43
JF
274 } STMT_END
275
b45f050a
JF
276/*
277 * Calls SAVEDESTRUCTOR_X if needed, then calls Perl_croak with the given
278 * args. Show regex, up to a maximum length. If it's too long, chop and add
279 * "...".
280 */
a4eb266f 281#define FAIL2(pat,msg) \
8615cb43 282 STMT_START { \
a4eb266f 283 char *ellipses = ""; \
7a799e20 284 IV len = RExC_end - RExC_precomp; \
b45f050a 285 \
8615cb43 286 if (!SIZE_ONLY) \
830247a4 287 SAVEDESTRUCTOR_X(clear_re,(void*)RExC_rx); \
8615cb43 288 \
b45f050a
JF
289 if (len > RegexLengthToShowInErrorMessages) { \
290 /* chop 10 shorter than the max, to ensure meaning of "..." */ \
291 len = RegexLengthToShowInErrorMessages - 10; \
a4eb266f 292 ellipses = "..."; \
b45f050a
JF
293 } \
294 S_re_croak2(aTHX_ pat, " in regex m/%.*s%s/", \
830247a4 295 msg, (int)len, RExC_precomp, ellipses); \
b45f050a
JF
296 } STMT_END
297
298
299/*
300 * Simple_vFAIL -- like FAIL, but marks the current location in the scan
301 */
302#define Simple_vFAIL(m) \
303 STMT_START { \
7a799e20 304 IV offset = RExC_parse - RExC_precomp; \
b45f050a
JF
305 \
306 Perl_croak(aTHX_ "%s" REPORT_LOCATION, \
830247a4 307 m, (int)offset, RExC_precomp, RExC_precomp + offset); \
b45f050a
JF
308 } STMT_END
309
310/*
311 * Calls SAVEDESTRUCTOR_X if needed, then Simple_vFAIL()
312 */
313#define vFAIL(m) \
314 STMT_START { \
315 if (!SIZE_ONLY) \
830247a4 316 SAVEDESTRUCTOR_X(clear_re,(void*)RExC_rx); \
b45f050a
JF
317 Simple_vFAIL(m); \
318 } STMT_END
319
320/*
321 * Like Simple_vFAIL(), but accepts two arguments.
322 */
323#define Simple_vFAIL2(m,a1) \
324 STMT_START { \
7a799e20 325 IV offset = RExC_parse - RExC_precomp; \
b45f050a
JF
326 \
327 S_re_croak2(aTHX_ m, REPORT_LOCATION, a1, \
830247a4 328 (int)offset, RExC_precomp, RExC_precomp + offset); \
b45f050a
JF
329 } STMT_END
330
331/*
332 * Calls SAVEDESTRUCTOR_X if needed, then Simple_vFAIL2().
333 */
334#define vFAIL2(m,a1) \
335 STMT_START { \
336 if (!SIZE_ONLY) \
830247a4 337 SAVEDESTRUCTOR_X(clear_re,(void*)RExC_rx); \
b45f050a
JF
338 Simple_vFAIL2(m, a1); \
339 } STMT_END
340
341
342/*
343 * Like Simple_vFAIL(), but accepts three arguments.
344 */
345#define Simple_vFAIL3(m, a1, a2) \
346 STMT_START { \
7a799e20 347 IV offset = RExC_parse - RExC_precomp; \
b45f050a
JF
348 \
349 S_re_croak2(aTHX_ m, REPORT_LOCATION, a1, a2, \
830247a4 350 (int)offset, RExC_precomp, RExC_precomp + offset); \
b45f050a
JF
351 } STMT_END
352
353/*
354 * Calls SAVEDESTRUCTOR_X if needed, then Simple_vFAIL3().
355 */
356#define vFAIL3(m,a1,a2) \
357 STMT_START { \
358 if (!SIZE_ONLY) \
830247a4 359 SAVEDESTRUCTOR_X(clear_re,(void*)RExC_rx); \
b45f050a
JF
360 Simple_vFAIL3(m, a1, a2); \
361 } STMT_END
362
363/*
364 * Like Simple_vFAIL(), but accepts four arguments.
365 */
366#define Simple_vFAIL4(m, a1, a2, a3) \
367 STMT_START { \
7a799e20 368 IV offset = RExC_parse - RExC_precomp; \
b45f050a
JF
369 \
370 S_re_croak2(aTHX_ m, REPORT_LOCATION, a1, a2, a3,\
830247a4 371 (int)offset, RExC_precomp, RExC_precomp + offset); \
b45f050a
JF
372 } STMT_END
373
374/*
375 * Like Simple_vFAIL(), but accepts five arguments.
376 */
377#define Simple_vFAIL5(m, a1, a2, a3, a4) \
378 STMT_START { \
7a799e20 379 IV offset = RExC_parse - RExC_precomp; \
b45f050a 380 S_re_croak2(aTHX_ m, REPORT_LOCATION, a1, a2, a3, a4,\
830247a4 381 (int)offset, RExC_precomp, RExC_precomp + offset); \
b45f050a
JF
382 } STMT_END
383
384
385#define vWARN(loc,m) \
386 STMT_START { \
7a799e20 387 IV offset = loc - RExC_precomp; \
b45f050a 388 Perl_warner(aTHX_ WARN_REGEXP, "%s" REPORT_LOCATION,\
830247a4 389 m, (int)offset, RExC_precomp, RExC_precomp + offset); \
b45f050a
JF
390 } STMT_END \
391
f9373011
PM
392#define vWARNdep(loc,m) \
393 STMT_START { \
7a799e20 394 IV offset = loc - RExC_precomp; \
f9373011
PM
395 int warn_cat = ckWARN(WARN_REGEXP) ? WARN_REGEXP : WARN_DEPRECATED; \
396 Perl_warner(aTHX_ warn_cat, "%s" REPORT_LOCATION,\
397 m, (int)offset, RExC_precomp, RExC_precomp + offset); \
398 } STMT_END \
399
b45f050a
JF
400
401#define vWARN2(loc, m, a1) \
402 STMT_START { \
7a799e20 403 IV offset = loc - RExC_precomp; \
b45f050a
JF
404 Perl_warner(aTHX_ WARN_REGEXP, m REPORT_LOCATION,\
405 a1, \
830247a4 406 (int)offset, RExC_precomp, RExC_precomp + offset); \
b45f050a
JF
407 } STMT_END
408
409#define vWARN3(loc, m, a1, a2) \
410 STMT_START { \
7a799e20 411 IV offset = loc - RExC_precomp; \
b45f050a
JF
412 Perl_warner(aTHX_ WARN_REGEXP, m REPORT_LOCATION, \
413 a1, a2, \
830247a4 414 (int)offset, RExC_precomp, RExC_precomp + offset); \
b45f050a
JF
415 } STMT_END
416
417#define vWARN4(loc, m, a1, a2, a3) \
418 STMT_START { \
7a799e20 419 IV offset = loc - RExC_precomp; \
b45f050a
JF
420 Perl_warner(aTHX_ WARN_REGEXP, m REPORT_LOCATION,\
421 a1, a2, a3, \
830247a4 422 (int)offset, RExC_precomp, RExC_precomp + offset); \
8615cb43
JF
423 } STMT_END
424
9d1d55b5
JP
425/* used for the parse_flags section for (?c) -- japhy */
426#define vWARN5(loc, m, a1, a2, a3, a4) \
427 STMT_START { \
7a799e20 428 IV offset = loc - RExC_precomp; \
9d1d55b5
JP
429 Perl_warner(aTHX_ WARN_REGEXP, m REPORT_LOCATION, \
430 a1, a2, a3, a4, \
431 (int)offset, RExC_precomp, RExC_precomp + offset); \
432 } STMT_END
433
8615cb43 434
cd439c50 435/* Allow for side effects in s */
fd3f0ae2 436#define REGC(c,s) STMT_START { if (!SIZE_ONLY) *(s) = (c); else (void)(s);} STMT_END
cd439c50 437
fac92740
MJD
438/* Macros for recording node offsets. 20001227 mjd@plover.com
439 * Nodes are numbered 1, 2, 3, 4. Node #n's position is recorded in
440 * element 2*n-1 of the array. Element #2n holds the byte length node #n.
441 * Element 0 holds the number n.
442 */
443
444#define MJD_OFFSET_DEBUG(x)
445/* #define MJD_OFFSET_DEBUG(x) fprintf x */
446
447
448# define Set_Node_Offset_To_R(node,byte) \
449 STMT_START { \
450 if (! SIZE_ONLY) { \
451 if((node) < 0) { \
452 Perl_croak(aTHX_ "value of node is %d in Offset macro", node); \
453 } else { \
454 RExC_offsets[2*(node)-1] = (byte); \
455 } \
456 } \
457 } STMT_END
458
459# define Set_Node_Offset(node,byte) Set_Node_Offset_To_R((node)-RExC_emit_start, (byte)-RExC_start)
460# define Set_Cur_Node_Offset Set_Node_Offset(RExC_emit, RExC_parse)
461
462# define Set_Node_Length_To_R(node,len) \
463 STMT_START { \
464 if (! SIZE_ONLY) { \
465 MJD_OFFSET_DEBUG((stderr, "** (%d) size of node %d is %d.\n", __LINE__, (node), (len))); \
466 if((node) < 0) { \
467 Perl_croak(aTHX_ "value of node is %d in Length macro", node); \
468 } else { \
469 RExC_offsets[2*(node)] = (len); \
470 } \
471 } \
472 } STMT_END
473
474# define Set_Node_Length(node,len) Set_Node_Length_To_R((node)-RExC_emit_start, len)
475# define Set_Cur_Node_Length(len) Set_Node_Length(RExC_emit, len)
476# define Set_Node_Cur_Length(node) Set_Node_Length(node, RExC_parse - parse_start)
477
478/* Get offsets and lengths */
479#define Node_Offset(n) (RExC_offsets[2*((n)-RExC_emit_start)-1])
480#define Node_Length(n) (RExC_offsets[2*((n)-RExC_emit_start)])
481
acfe0abc 482static void clear_re(pTHX_ void *r);
4327152a 483
653099ff
GS
484/* Mark that we cannot extend a found fixed substring at this point.
485 Updata the longest found anchored substring and the longest found
486 floating substrings if needed. */
487
4327152a 488STATIC void
830247a4 489S_scan_commit(pTHX_ RExC_state_t *pRExC_state, scan_data_t *data)
c277df42 490{
a0ed51b3
LW
491 STRLEN l = CHR_SVLEN(data->last_found);
492 STRLEN old_l = CHR_SVLEN(*data->longest);
b81d288d 493
c277df42
IZ
494 if ((l >= old_l) && ((l > old_l) || (data->flags & SF_BEFORE_EOL))) {
495 sv_setsv(*data->longest, data->last_found);
496 if (*data->longest == data->longest_fixed) {
497 data->offset_fixed = l ? data->last_start_min : data->pos_min;
498 if (data->flags & SF_BEFORE_EOL)
b81d288d 499 data->flags
c277df42
IZ
500 |= ((data->flags & SF_BEFORE_EOL) << SF_FIX_SHIFT_EOL);
501 else
502 data->flags &= ~SF_FIX_BEFORE_EOL;
a0ed51b3
LW
503 }
504 else {
c277df42 505 data->offset_float_min = l ? data->last_start_min : data->pos_min;
b81d288d
AB
506 data->offset_float_max = (l
507 ? data->last_start_max
c277df42
IZ
508 : data->pos_min + data->pos_delta);
509 if (data->flags & SF_BEFORE_EOL)
b81d288d 510 data->flags
c277df42
IZ
511 |= ((data->flags & SF_BEFORE_EOL) << SF_FL_SHIFT_EOL);
512 else
513 data->flags &= ~SF_FL_BEFORE_EOL;
514 }
515 }
516 SvCUR_set(data->last_found, 0);
517 data->last_end = -1;
518 data->flags &= ~SF_BEFORE_EOL;
519}
520
653099ff
GS
521/* Can match anything (initialization) */
522STATIC void
830247a4 523S_cl_anything(pTHX_ RExC_state_t *pRExC_state, struct regnode_charclass_class *cl)
653099ff 524{
653099ff 525 ANYOF_CLASS_ZERO(cl);
f8bef550 526 ANYOF_BITMAP_SETALL(cl);
1aa99e6b 527 cl->flags = ANYOF_EOS|ANYOF_UNICODE_ALL;
653099ff
GS
528 if (LOC)
529 cl->flags |= ANYOF_LOCALE;
530}
531
532/* Can match anything (initialization) */
533STATIC int
534S_cl_is_anything(pTHX_ struct regnode_charclass_class *cl)
535{
536 int value;
537
aaa51d5e 538 for (value = 0; value <= ANYOF_MAX; value += 2)
653099ff
GS
539 if (ANYOF_CLASS_TEST(cl, value) && ANYOF_CLASS_TEST(cl, value + 1))
540 return 1;
1aa99e6b
IH
541 if (!(cl->flags & ANYOF_UNICODE_ALL))
542 return 0;
f8bef550
NC
543 if (!ANYOF_BITMAP_TESTALLSET(cl))
544 return 0;
653099ff
GS
545 return 1;
546}
547
548/* Can match anything (initialization) */
549STATIC void
830247a4 550S_cl_init(pTHX_ RExC_state_t *pRExC_state, struct regnode_charclass_class *cl)
653099ff 551{
8ecf7187 552 Zero(cl, 1, struct regnode_charclass_class);
653099ff 553 cl->type = ANYOF;
830247a4 554 cl_anything(pRExC_state, cl);
653099ff
GS
555}
556
557STATIC void
830247a4 558S_cl_init_zero(pTHX_ RExC_state_t *pRExC_state, struct regnode_charclass_class *cl)
653099ff 559{
8ecf7187 560 Zero(cl, 1, struct regnode_charclass_class);
653099ff 561 cl->type = ANYOF;
830247a4 562 cl_anything(pRExC_state, cl);
653099ff
GS
563 if (LOC)
564 cl->flags |= ANYOF_LOCALE;
565}
566
567/* 'And' a given class with another one. Can create false positives */
568/* We assume that cl is not inverted */
569STATIC void
570S_cl_and(pTHX_ struct regnode_charclass_class *cl,
571 struct regnode_charclass_class *and_with)
572{
653099ff
GS
573 if (!(and_with->flags & ANYOF_CLASS)
574 && !(cl->flags & ANYOF_CLASS)
575 && (and_with->flags & ANYOF_LOCALE) == (cl->flags & ANYOF_LOCALE)
576 && !(and_with->flags & ANYOF_FOLD)
577 && !(cl->flags & ANYOF_FOLD)) {
578 int i;
579
580 if (and_with->flags & ANYOF_INVERT)
581 for (i = 0; i < ANYOF_BITMAP_SIZE; i++)
582 cl->bitmap[i] &= ~and_with->bitmap[i];
583 else
584 for (i = 0; i < ANYOF_BITMAP_SIZE; i++)
585 cl->bitmap[i] &= and_with->bitmap[i];
586 } /* XXXX: logic is complicated otherwise, leave it along for a moment. */
587 if (!(and_with->flags & ANYOF_EOS))
588 cl->flags &= ~ANYOF_EOS;
1aa99e6b
IH
589
590 if (cl->flags & ANYOF_UNICODE_ALL && and_with->flags & ANYOF_UNICODE) {
591 cl->flags &= ~ANYOF_UNICODE_ALL;
592 cl->flags |= ANYOF_UNICODE;
593 ARG_SET(cl, ARG(and_with));
594 }
595 if (!(and_with->flags & ANYOF_UNICODE_ALL))
596 cl->flags &= ~ANYOF_UNICODE_ALL;
597 if (!(and_with->flags & (ANYOF_UNICODE|ANYOF_UNICODE_ALL)))
598 cl->flags &= ~ANYOF_UNICODE;
653099ff
GS
599}
600
601/* 'OR' a given class with another one. Can create false positives */
602/* We assume that cl is not inverted */
603STATIC void
830247a4 604S_cl_or(pTHX_ RExC_state_t *pRExC_state, struct regnode_charclass_class *cl, struct regnode_charclass_class *or_with)
653099ff 605{
653099ff
GS
606 if (or_with->flags & ANYOF_INVERT) {
607 /* We do not use
608 * (B1 | CL1) | (!B2 & !CL2) = (B1 | !B2 & !CL2) | (CL1 | (!B2 & !CL2))
609 * <= (B1 | !B2) | (CL1 | !CL2)
610 * which is wasteful if CL2 is small, but we ignore CL2:
611 * (B1 | CL1) | (!B2 & !CL2) <= (B1 | CL1) | !B2 = (B1 | !B2) | CL1
612 * XXXX Can we handle case-fold? Unclear:
613 * (OK1(i) | OK1(i')) | !(OK1(i) | OK1(i')) =
614 * (OK1(i) | OK1(i')) | (!OK1(i) & !OK1(i'))
615 */
616 if ( (or_with->flags & ANYOF_LOCALE) == (cl->flags & ANYOF_LOCALE)
617 && !(or_with->flags & ANYOF_FOLD)
618 && !(cl->flags & ANYOF_FOLD) ) {
619 int i;
620
621 for (i = 0; i < ANYOF_BITMAP_SIZE; i++)
622 cl->bitmap[i] |= ~or_with->bitmap[i];
623 } /* XXXX: logic is complicated otherwise */
624 else {
830247a4 625 cl_anything(pRExC_state, cl);
653099ff
GS
626 }
627 } else {
628 /* (B1 | CL1) | (B2 | CL2) = (B1 | B2) | (CL1 | CL2)) */
629 if ( (or_with->flags & ANYOF_LOCALE) == (cl->flags & ANYOF_LOCALE)
b81d288d 630 && (!(or_with->flags & ANYOF_FOLD)
653099ff
GS
631 || (cl->flags & ANYOF_FOLD)) ) {
632 int i;
633
634 /* OR char bitmap and class bitmap separately */
635 for (i = 0; i < ANYOF_BITMAP_SIZE; i++)
636 cl->bitmap[i] |= or_with->bitmap[i];
637 if (or_with->flags & ANYOF_CLASS) {
638 for (i = 0; i < ANYOF_CLASSBITMAP_SIZE; i++)
639 cl->classflags[i] |= or_with->classflags[i];
640 cl->flags |= ANYOF_CLASS;
641 }
642 }
643 else { /* XXXX: logic is complicated, leave it along for a moment. */
830247a4 644 cl_anything(pRExC_state, cl);
653099ff
GS
645 }
646 }
647 if (or_with->flags & ANYOF_EOS)
648 cl->flags |= ANYOF_EOS;
1aa99e6b
IH
649
650 if (cl->flags & ANYOF_UNICODE && or_with->flags & ANYOF_UNICODE &&
651 ARG(cl) != ARG(or_with)) {
652 cl->flags |= ANYOF_UNICODE_ALL;
653 cl->flags &= ~ANYOF_UNICODE;
654 }
655 if (or_with->flags & ANYOF_UNICODE_ALL) {
656 cl->flags |= ANYOF_UNICODE_ALL;
657 cl->flags &= ~ANYOF_UNICODE;
658 }
653099ff
GS
659}
660
5d1c421c
JH
661/*
662 * There are strange code-generation bugs caused on sparc64 by gcc-2.95.2.
663 * These need to be revisited when a newer toolchain becomes available.
664 */
665#if defined(__sparc64__) && defined(__GNUC__)
666# if __GNUC__ < 2 || (__GNUC__ == 2 && __GNUC_MINOR__ < 96)
667# undef SPARC64_GCC_WORKAROUND
668# define SPARC64_GCC_WORKAROUND 1
669# endif
670#endif
671
653099ff
GS
672/* REx optimizer. Converts nodes into quickier variants "in place".
673 Finds fixed substrings. */
674
c277df42
IZ
675/* Stops at toplevel WHILEM as well as at `last'. At end *scanp is set
676 to the position after last scanned or to NULL. */
677
76e3520e 678STATIC I32
830247a4 679S_study_chunk(pTHX_ RExC_state_t *pRExC_state, regnode **scanp, I32 *deltap, regnode *last, scan_data_t *data, U32 flags)
c277df42
IZ
680 /* scanp: Start here (read-write). */
681 /* deltap: Write maxlen-minlen here. */
682 /* last: Stop before this one. */
683{
684 I32 min = 0, pars = 0, code;
685 regnode *scan = *scanp, *next;
686 I32 delta = 0;
687 int is_inf = (flags & SCF_DO_SUBSTR) && (data->flags & SF_IS_INF);
aca2d497 688 int is_inf_internal = 0; /* The studied chunk is infinite */
c277df42
IZ
689 I32 is_par = OP(scan) == OPEN ? ARG(scan) : 0;
690 scan_data_t data_fake;
653099ff 691 struct regnode_charclass_class and_with; /* Valid if flags & SCF_DO_STCLASS_OR */
b81d288d 692
c277df42
IZ
693 while (scan && OP(scan) != END && scan < last) {
694 /* Peephole optimizer: */
695
22c35a8c 696 if (PL_regkind[(U8)OP(scan)] == EXACT) {
653099ff 697 /* Merge several consecutive EXACTish nodes into one. */
c277df42
IZ
698 regnode *n = regnext(scan);
699 U32 stringok = 1;
700#ifdef DEBUGGING
701 regnode *stop = scan;
b81d288d 702#endif
c277df42 703
cd439c50 704 next = scan + NODE_SZ_STR(scan);
c277df42
IZ
705 /* Skip NOTHING, merge EXACT*. */
706 while (n &&
b81d288d 707 ( PL_regkind[(U8)OP(n)] == NOTHING ||
c277df42
IZ
708 (stringok && (OP(n) == OP(scan))))
709 && NEXT_OFF(n)
710 && NEXT_OFF(scan) + NEXT_OFF(n) < I16_MAX) {
711 if (OP(n) == TAIL || n > next)
712 stringok = 0;
22c35a8c 713 if (PL_regkind[(U8)OP(n)] == NOTHING) {
c277df42
IZ
714 NEXT_OFF(scan) += NEXT_OFF(n);
715 next = n + NODE_STEP_REGNODE;
716#ifdef DEBUGGING
717 if (stringok)
718 stop = n;
b81d288d 719#endif
c277df42 720 n = regnext(n);
a0ed51b3 721 }
f49d4d0f 722 else if (stringok) {
cd439c50 723 int oldl = STR_LEN(scan);
c277df42 724 regnode *nnext = regnext(n);
f49d4d0f 725
b81d288d 726 if (oldl + STR_LEN(n) > U8_MAX)
c277df42
IZ
727 break;
728 NEXT_OFF(scan) += NEXT_OFF(n);
cd439c50
IZ
729 STR_LEN(scan) += STR_LEN(n);
730 next = n + NODE_SZ_STR(n);
c277df42 731 /* Now we can overwrite *n : */
f49d4d0f 732 Move(STRING(n), STRING(scan) + oldl, STR_LEN(n), char);
c277df42 733#ifdef DEBUGGING
f49d4d0f 734 stop = next - 1;
b81d288d 735#endif
c277df42
IZ
736 n = nnext;
737 }
738 }
739#ifdef DEBUGGING
740 /* Allow dumping */
cd439c50 741 n = scan + NODE_SZ_STR(scan);
c277df42 742 while (n <= stop) {
22c35a8c 743 if (PL_regkind[(U8)OP(n)] != NOTHING || OP(n) == NOTHING) {
c277df42
IZ
744 OP(n) = OPTIMIZED;
745 NEXT_OFF(n) = 0;
746 }
747 n++;
748 }
653099ff 749#endif
c277df42 750 }
653099ff
GS
751 /* Follow the next-chain of the current node and optimize
752 away all the NOTHINGs from it. */
c277df42 753 if (OP(scan) != CURLYX) {
048cfca1
GS
754 int max = (reg_off_by_arg[OP(scan)]
755 ? I32_MAX
756 /* I32 may be smaller than U16 on CRAYs! */
757 : (I32_MAX < U16_MAX ? I32_MAX : U16_MAX));
c277df42
IZ
758 int off = (reg_off_by_arg[OP(scan)] ? ARG(scan) : NEXT_OFF(scan));
759 int noff;
760 regnode *n = scan;
b81d288d 761
c277df42
IZ
762 /* Skip NOTHING and LONGJMP. */
763 while ((n = regnext(n))
22c35a8c 764 && ((PL_regkind[(U8)OP(n)] == NOTHING && (noff = NEXT_OFF(n)))
c277df42
IZ
765 || ((OP(n) == LONGJMP) && (noff = ARG(n))))
766 && off + noff < max)
767 off += noff;
768 if (reg_off_by_arg[OP(scan)])
769 ARG(scan) = off;
b81d288d 770 else
c277df42
IZ
771 NEXT_OFF(scan) = off;
772 }
653099ff
GS
773 /* The principal pseudo-switch. Cannot be a switch, since we
774 look into several different things. */
b81d288d 775 if (OP(scan) == BRANCH || OP(scan) == BRANCHJ
c277df42
IZ
776 || OP(scan) == IFTHEN || OP(scan) == SUSPEND) {
777 next = regnext(scan);
778 code = OP(scan);
b81d288d
AB
779
780 if (OP(next) == code || code == IFTHEN || code == SUSPEND) {
c277df42 781 I32 max1 = 0, min1 = I32_MAX, num = 0;
653099ff 782 struct regnode_charclass_class accum;
c277df42 783
653099ff 784 if (flags & SCF_DO_SUBSTR) /* XXXX Add !SUSPEND? */
830247a4 785 scan_commit(pRExC_state, data); /* Cannot merge strings after this. */
653099ff 786 if (flags & SCF_DO_STCLASS)
830247a4 787 cl_init_zero(pRExC_state, &accum);
c277df42 788 while (OP(scan) == code) {
830247a4 789 I32 deltanext, minnext, f = 0, fake;
653099ff 790 struct regnode_charclass_class this_class;
c277df42
IZ
791
792 num++;
793 data_fake.flags = 0;
b81d288d 794 if (data) {
2c2d71f5 795 data_fake.whilem_c = data->whilem_c;
cb434fcc
IZ
796 data_fake.last_closep = data->last_closep;
797 }
798 else
799 data_fake.last_closep = &fake;
c277df42
IZ
800 next = regnext(scan);
801 scan = NEXTOPER(scan);
802 if (code != BRANCH)
803 scan = NEXTOPER(scan);
653099ff 804 if (flags & SCF_DO_STCLASS) {
830247a4 805 cl_init(pRExC_state, &this_class);
653099ff
GS
806 data_fake.start_class = &this_class;
807 f = SCF_DO_STCLASS_AND;
b81d288d 808 }
e1901655
IZ
809 if (flags & SCF_WHILEM_VISITED_POS)
810 f |= SCF_WHILEM_VISITED_POS;
653099ff 811 /* we suppose the run is continuous, last=next...*/
830247a4
IZ
812 minnext = study_chunk(pRExC_state, &scan, &deltanext,
813 next, &data_fake, f);
b81d288d 814 if (min1 > minnext)
c277df42
IZ
815 min1 = minnext;
816 if (max1 < minnext + deltanext)
817 max1 = minnext + deltanext;
818 if (deltanext == I32_MAX)
aca2d497 819 is_inf = is_inf_internal = 1;
c277df42
IZ
820 scan = next;
821 if (data_fake.flags & (SF_HAS_PAR|SF_IN_PAR))
822 pars++;
405ff068 823 if (data && (data_fake.flags & SF_HAS_EVAL))
c277df42 824 data->flags |= SF_HAS_EVAL;
2c2d71f5
JH
825 if (data)
826 data->whilem_c = data_fake.whilem_c;
653099ff 827 if (flags & SCF_DO_STCLASS)
830247a4 828 cl_or(pRExC_state, &accum, &this_class);
b81d288d 829 if (code == SUSPEND)
c277df42
IZ
830 break;
831 }
832 if (code == IFTHEN && num < 2) /* Empty ELSE branch */
833 min1 = 0;
834 if (flags & SCF_DO_SUBSTR) {
835 data->pos_min += min1;
836 data->pos_delta += max1 - min1;
837 if (max1 != min1 || is_inf)
838 data->longest = &(data->longest_float);
839 }
840 min += min1;
841 delta += max1 - min1;
653099ff 842 if (flags & SCF_DO_STCLASS_OR) {
830247a4 843 cl_or(pRExC_state, data->start_class, &accum);
653099ff
GS
844 if (min1) {
845 cl_and(data->start_class, &and_with);
846 flags &= ~SCF_DO_STCLASS;
847 }
848 }
849 else if (flags & SCF_DO_STCLASS_AND) {
de0c8cb8
GS
850 if (min1) {
851 cl_and(data->start_class, &accum);
653099ff 852 flags &= ~SCF_DO_STCLASS;
de0c8cb8
GS
853 }
854 else {
b81d288d 855 /* Switch to OR mode: cache the old value of
de0c8cb8
GS
856 * data->start_class */
857 StructCopy(data->start_class, &and_with,
858 struct regnode_charclass_class);
859 flags &= ~SCF_DO_STCLASS_AND;
860 StructCopy(&accum, data->start_class,
861 struct regnode_charclass_class);
862 flags |= SCF_DO_STCLASS_OR;
863 data->start_class->flags |= ANYOF_EOS;
864 }
653099ff 865 }
a0ed51b3
LW
866 }
867 else if (code == BRANCHJ) /* single branch is optimized. */
c277df42
IZ
868 scan = NEXTOPER(NEXTOPER(scan));
869 else /* single branch is optimized. */
870 scan = NEXTOPER(scan);
871 continue;
a0ed51b3
LW
872 }
873 else if (OP(scan) == EXACT) {
cd439c50 874 I32 l = STR_LEN(scan);
1aa99e6b 875 UV uc = *((U8*)STRING(scan));
a0ed51b3 876 if (UTF) {
1aa99e6b
IH
877 U8 *s = (U8*)STRING(scan);
878 l = utf8_length(s, s + l);
9041c2e3 879 uc = utf8_to_uvchr(s, NULL);
a0ed51b3
LW
880 }
881 min += l;
c277df42 882 if (flags & SCF_DO_SUBSTR) { /* Update longest substr. */
c277df42
IZ
883 /* The code below prefers earlier match for fixed
884 offset, later match for variable offset. */
885 if (data->last_end == -1) { /* Update the start info. */
886 data->last_start_min = data->pos_min;
887 data->last_start_max = is_inf
b81d288d 888 ? I32_MAX : data->pos_min + data->pos_delta;
c277df42 889 }
cd439c50 890 sv_catpvn(data->last_found, STRING(scan), STR_LEN(scan));
c277df42
IZ
891 data->last_end = data->pos_min + l;
892 data->pos_min += l; /* As in the first entry. */
893 data->flags &= ~SF_BEFORE_EOL;
894 }
653099ff
GS
895 if (flags & SCF_DO_STCLASS_AND) {
896 /* Check whether it is compatible with what we know already! */
897 int compat = 1;
898
1aa99e6b 899 if (uc >= 0x100 ||
516a5887 900 (!(data->start_class->flags & (ANYOF_CLASS | ANYOF_LOCALE))
1aa99e6b 901 && !ANYOF_BITMAP_TEST(data->start_class, uc)
653099ff 902 && (!(data->start_class->flags & ANYOF_FOLD)
1aa99e6b 903 || !ANYOF_BITMAP_TEST(data->start_class, PL_fold[uc])))
516a5887 904 )
653099ff
GS
905 compat = 0;
906 ANYOF_CLASS_ZERO(data->start_class);
907 ANYOF_BITMAP_ZERO(data->start_class);
908 if (compat)
1aa99e6b 909 ANYOF_BITMAP_SET(data->start_class, uc);
653099ff 910 data->start_class->flags &= ~ANYOF_EOS;
9b877dbb
IH
911 if (uc < 0x100)
912 data->start_class->flags &= ~ANYOF_UNICODE_ALL;
653099ff
GS
913 }
914 else if (flags & SCF_DO_STCLASS_OR) {
915 /* false positive possible if the class is case-folded */
1aa99e6b 916 if (uc < 0x100)
9b877dbb
IH
917 ANYOF_BITMAP_SET(data->start_class, uc);
918 else
919 data->start_class->flags |= ANYOF_UNICODE_ALL;
653099ff
GS
920 data->start_class->flags &= ~ANYOF_EOS;
921 cl_and(data->start_class, &and_with);
922 }
923 flags &= ~SCF_DO_STCLASS;
a0ed51b3 924 }
653099ff 925 else if (PL_regkind[(U8)OP(scan)] == EXACT) { /* But OP != EXACT! */
cd439c50 926 I32 l = STR_LEN(scan);
1aa99e6b 927 UV uc = *((U8*)STRING(scan));
653099ff
GS
928
929 /* Search for fixed substrings supports EXACT only. */
b81d288d 930 if (flags & SCF_DO_SUBSTR)
830247a4 931 scan_commit(pRExC_state, data);
a0ed51b3 932 if (UTF) {
1aa99e6b
IH
933 U8 *s = (U8 *)STRING(scan);
934 l = utf8_length(s, s + l);
9041c2e3 935 uc = utf8_to_uvchr(s, NULL);
a0ed51b3
LW
936 }
937 min += l;
c277df42 938 if (data && (flags & SCF_DO_SUBSTR))
a0ed51b3 939 data->pos_min += l;
653099ff
GS
940 if (flags & SCF_DO_STCLASS_AND) {
941 /* Check whether it is compatible with what we know already! */
942 int compat = 1;
943
1aa99e6b 944 if (uc >= 0x100 ||
516a5887 945 (!(data->start_class->flags & (ANYOF_CLASS | ANYOF_LOCALE))
1aa99e6b 946 && !ANYOF_BITMAP_TEST(data->start_class, uc)
516a5887 947 && !ANYOF_BITMAP_TEST(data->start_class, PL_fold[uc])))
653099ff
GS
948 compat = 0;
949 ANYOF_CLASS_ZERO(data->start_class);
950 ANYOF_BITMAP_ZERO(data->start_class);
951 if (compat) {
1aa99e6b 952 ANYOF_BITMAP_SET(data->start_class, uc);
653099ff
GS
953 data->start_class->flags &= ~ANYOF_EOS;
954 data->start_class->flags |= ANYOF_FOLD;
955 if (OP(scan) == EXACTFL)
956 data->start_class->flags |= ANYOF_LOCALE;
957 }
958 }
959 else if (flags & SCF_DO_STCLASS_OR) {
960 if (data->start_class->flags & ANYOF_FOLD) {
961 /* false positive possible if the class is case-folded.
962 Assume that the locale settings are the same... */
1aa99e6b
IH
963 if (uc < 0x100)
964 ANYOF_BITMAP_SET(data->start_class, uc);
653099ff
GS
965 data->start_class->flags &= ~ANYOF_EOS;
966 }
967 cl_and(data->start_class, &and_with);
968 }
969 flags &= ~SCF_DO_STCLASS;
a0ed51b3 970 }
4d61ec05 971 else if (strchr((char*)PL_varies,OP(scan))) {
9c5ffd7c 972 I32 mincount, maxcount, minnext, deltanext, fl = 0;
aa7a4b56 973 I32 f = flags, pos_before = 0;
c277df42 974 regnode *oscan = scan;
653099ff
GS
975 struct regnode_charclass_class this_class;
976 struct regnode_charclass_class *oclass = NULL;
727f22e3 977 I32 next_is_eval = 0;
653099ff 978
22c35a8c 979 switch (PL_regkind[(U8)OP(scan)]) {
653099ff 980 case WHILEM: /* End of (?:...)* . */
c277df42
IZ
981 scan = NEXTOPER(scan);
982 goto finish;
983 case PLUS:
653099ff 984 if (flags & (SCF_DO_SUBSTR | SCF_DO_STCLASS)) {
c277df42 985 next = NEXTOPER(scan);
653099ff 986 if (OP(next) == EXACT || (flags & SCF_DO_STCLASS)) {
b81d288d
AB
987 mincount = 1;
988 maxcount = REG_INFTY;
c277df42
IZ
989 next = regnext(scan);
990 scan = NEXTOPER(scan);
991 goto do_curly;
992 }
993 }
994 if (flags & SCF_DO_SUBSTR)
995 data->pos_min++;
996 min++;
997 /* Fall through. */
998 case STAR:
653099ff
GS
999 if (flags & SCF_DO_STCLASS) {
1000 mincount = 0;
b81d288d 1001 maxcount = REG_INFTY;
653099ff
GS
1002 next = regnext(scan);
1003 scan = NEXTOPER(scan);
1004 goto do_curly;
1005 }
b81d288d 1006 is_inf = is_inf_internal = 1;
c277df42
IZ
1007 scan = regnext(scan);
1008 if (flags & SCF_DO_SUBSTR) {
830247a4 1009 scan_commit(pRExC_state, data); /* Cannot extend fixed substrings */
c277df42
IZ
1010 data->longest = &(data->longest_float);
1011 }
1012 goto optimize_curly_tail;
1013 case CURLY:
b81d288d 1014 mincount = ARG1(scan);
c277df42
IZ
1015 maxcount = ARG2(scan);
1016 next = regnext(scan);
cb434fcc
IZ
1017 if (OP(scan) == CURLYX) {
1018 I32 lp = (data ? *(data->last_closep) : 0);
1019
1020 scan->flags = ((lp <= U8_MAX) ? lp : U8_MAX);
1021 }
c277df42 1022 scan = NEXTOPER(scan) + EXTRA_STEP_2ARGS;
727f22e3 1023 next_is_eval = (OP(scan) == EVAL);
c277df42
IZ
1024 do_curly:
1025 if (flags & SCF_DO_SUBSTR) {
830247a4 1026 if (mincount == 0) scan_commit(pRExC_state,data); /* Cannot extend fixed substrings */
c277df42
IZ
1027 pos_before = data->pos_min;
1028 }
1029 if (data) {
1030 fl = data->flags;
1031 data->flags &= ~(SF_HAS_PAR|SF_IN_PAR|SF_HAS_EVAL);
1032 if (is_inf)
1033 data->flags |= SF_IS_INF;
1034 }
653099ff 1035 if (flags & SCF_DO_STCLASS) {
830247a4 1036 cl_init(pRExC_state, &this_class);
653099ff
GS
1037 oclass = data->start_class;
1038 data->start_class = &this_class;
1039 f |= SCF_DO_STCLASS_AND;
1040 f &= ~SCF_DO_STCLASS_OR;
1041 }
e1901655
IZ
1042 /* These are the cases when once a subexpression
1043 fails at a particular position, it cannot succeed
1044 even after backtracking at the enclosing scope.
b81d288d 1045
e1901655
IZ
1046 XXXX what if minimal match and we are at the
1047 initial run of {n,m}? */
1048 if ((mincount != maxcount - 1) && (maxcount != REG_INFTY))
1049 f &= ~SCF_WHILEM_VISITED_POS;
653099ff 1050
c277df42 1051 /* This will finish on WHILEM, setting scan, or on NULL: */
b81d288d
AB
1052 minnext = study_chunk(pRExC_state, &scan, &deltanext, last, data,
1053 mincount == 0
653099ff
GS
1054 ? (f & ~SCF_DO_SUBSTR) : f);
1055
1056 if (flags & SCF_DO_STCLASS)
1057 data->start_class = oclass;
1058 if (mincount == 0 || minnext == 0) {
1059 if (flags & SCF_DO_STCLASS_OR) {
830247a4 1060 cl_or(pRExC_state, data->start_class, &this_class);
653099ff
GS
1061 }
1062 else if (flags & SCF_DO_STCLASS_AND) {
b81d288d 1063 /* Switch to OR mode: cache the old value of
653099ff
GS
1064 * data->start_class */
1065 StructCopy(data->start_class, &and_with,
1066 struct regnode_charclass_class);
1067 flags &= ~SCF_DO_STCLASS_AND;
1068 StructCopy(&this_class, data->start_class,
1069 struct regnode_charclass_class);
1070 flags |= SCF_DO_STCLASS_OR;
1071 data->start_class->flags |= ANYOF_EOS;
1072 }
1073 } else { /* Non-zero len */
1074 if (flags & SCF_DO_STCLASS_OR) {
830247a4 1075 cl_or(pRExC_state, data->start_class, &this_class);
653099ff
GS
1076 cl_and(data->start_class, &and_with);
1077 }
1078 else if (flags & SCF_DO_STCLASS_AND)
1079 cl_and(data->start_class, &this_class);
1080 flags &= ~SCF_DO_STCLASS;
1081 }
c277df42
IZ
1082 if (!scan) /* It was not CURLYX, but CURLY. */
1083 scan = next;
84037bb0 1084 if (ckWARN(WARN_REGEXP)
727f22e3
JP
1085 /* ? quantifier ok, except for (?{ ... }) */
1086 && (next_is_eval || !(mincount == 0 && maxcount == 1))
84037bb0 1087 && (minnext == 0) && (deltanext == 0)
99799961 1088 && data && !(data->flags & (SF_HAS_PAR|SF_IN_PAR))
17feb5d5 1089 && maxcount <= REG_INFTY/3) /* Complement check for big count */
b45f050a 1090 {
830247a4 1091 vWARN(RExC_parse,
b45f050a
JF
1092 "Quantifier unexpected on zero-length expression");
1093 }
1094
c277df42 1095 min += minnext * mincount;
b81d288d 1096 is_inf_internal |= ((maxcount == REG_INFTY
155aba94
GS
1097 && (minnext + deltanext) > 0)
1098 || deltanext == I32_MAX);
aca2d497 1099 is_inf |= is_inf_internal;
c277df42
IZ
1100 delta += (minnext + deltanext) * maxcount - minnext * mincount;
1101
1102 /* Try powerful optimization CURLYX => CURLYN. */
b81d288d 1103 if ( OP(oscan) == CURLYX && data
c277df42
IZ
1104 && data->flags & SF_IN_PAR
1105 && !(data->flags & SF_HAS_EVAL)
1106 && !deltanext && minnext == 1 ) {
1107 /* Try to optimize to CURLYN. */
1108 regnode *nxt = NEXTOPER(oscan) + EXTRA_STEP_2ARGS;
497b47a8
JH
1109 regnode *nxt1 = nxt;
1110#ifdef DEBUGGING
1111 regnode *nxt2;
1112#endif
c277df42
IZ
1113
1114 /* Skip open. */
1115 nxt = regnext(nxt);
4d61ec05 1116 if (!strchr((char*)PL_simple,OP(nxt))
22c35a8c 1117 && !(PL_regkind[(U8)OP(nxt)] == EXACT
b81d288d 1118 && STR_LEN(nxt) == 1))
c277df42 1119 goto nogo;
497b47a8 1120#ifdef DEBUGGING
c277df42 1121 nxt2 = nxt;
497b47a8 1122#endif
c277df42 1123 nxt = regnext(nxt);
b81d288d 1124 if (OP(nxt) != CLOSE)
c277df42
IZ
1125 goto nogo;
1126 /* Now we know that nxt2 is the only contents: */
1127 oscan->flags = ARG(nxt);
1128 OP(oscan) = CURLYN;
1129 OP(nxt1) = NOTHING; /* was OPEN. */
1130#ifdef DEBUGGING
1131 OP(nxt1 + 1) = OPTIMIZED; /* was count. */
1132 NEXT_OFF(nxt1+ 1) = 0; /* just for consistancy. */
1133 NEXT_OFF(nxt2) = 0; /* just for consistancy with CURLY. */
1134 OP(nxt) = OPTIMIZED; /* was CLOSE. */
1135 OP(nxt + 1) = OPTIMIZED; /* was count. */
1136 NEXT_OFF(nxt+ 1) = 0; /* just for consistancy. */
b81d288d 1137#endif
c277df42 1138 }
c277df42
IZ
1139 nogo:
1140
1141 /* Try optimization CURLYX => CURLYM. */
b81d288d 1142 if ( OP(oscan) == CURLYX && data
c277df42 1143 && !(data->flags & SF_HAS_PAR)
c277df42
IZ
1144 && !(data->flags & SF_HAS_EVAL)
1145 && !deltanext ) {
1146 /* XXXX How to optimize if data == 0? */
1147 /* Optimize to a simpler form. */
1148 regnode *nxt = NEXTOPER(oscan) + EXTRA_STEP_2ARGS; /* OPEN */
1149 regnode *nxt2;
1150
1151 OP(oscan) = CURLYM;
1152 while ( (nxt2 = regnext(nxt)) /* skip over embedded stuff*/
b81d288d 1153 && (OP(nxt2) != WHILEM))
c277df42
IZ
1154 nxt = nxt2;
1155 OP(nxt2) = SUCCEED; /* Whas WHILEM */
c277df42
IZ
1156 /* Need to optimize away parenths. */
1157 if (data->flags & SF_IN_PAR) {
1158 /* Set the parenth number. */
1159 regnode *nxt1 = NEXTOPER(oscan) + EXTRA_STEP_2ARGS; /* OPEN*/
1160
b81d288d 1161 if (OP(nxt) != CLOSE)
b45f050a 1162 FAIL("Panic opt close");
c277df42
IZ
1163 oscan->flags = ARG(nxt);
1164 OP(nxt1) = OPTIMIZED; /* was OPEN. */
1165 OP(nxt) = OPTIMIZED; /* was CLOSE. */
1166#ifdef DEBUGGING
1167 OP(nxt1 + 1) = OPTIMIZED; /* was count. */
1168 OP(nxt + 1) = OPTIMIZED; /* was count. */
1169 NEXT_OFF(nxt1 + 1) = 0; /* just for consistancy. */
1170 NEXT_OFF(nxt + 1) = 0; /* just for consistancy. */
b81d288d 1171#endif
c277df42
IZ
1172#if 0
1173 while ( nxt1 && (OP(nxt1) != WHILEM)) {
1174 regnode *nnxt = regnext(nxt1);
b81d288d 1175
c277df42
IZ
1176 if (nnxt == nxt) {
1177 if (reg_off_by_arg[OP(nxt1)])
1178 ARG_SET(nxt1, nxt2 - nxt1);
1179 else if (nxt2 - nxt1 < U16_MAX)
1180 NEXT_OFF(nxt1) = nxt2 - nxt1;
1181 else
1182 OP(nxt) = NOTHING; /* Cannot beautify */
1183 }
1184 nxt1 = nnxt;
1185 }
1186#endif
1187 /* Optimize again: */
b81d288d 1188 study_chunk(pRExC_state, &nxt1, &deltanext, nxt,
e1901655 1189 NULL, 0);
a0ed51b3
LW
1190 }
1191 else
c277df42 1192 oscan->flags = 0;
c277df42 1193 }
e1901655
IZ
1194 else if ((OP(oscan) == CURLYX)
1195 && (flags & SCF_WHILEM_VISITED_POS)
1196 /* See the comment on a similar expression above.
1197 However, this time it not a subexpression
1198 we care about, but the expression itself. */
1199 && (maxcount == REG_INFTY)
1200 && data && ++data->whilem_c < 16) {
1201 /* This stays as CURLYX, we can put the count/of pair. */
2c2d71f5
JH
1202 /* Find WHILEM (as in regexec.c) */
1203 regnode *nxt = oscan + NEXT_OFF(oscan);
1204
1205 if (OP(PREVOPER(nxt)) == NOTHING) /* LONGJMP */
1206 nxt += ARG(nxt);
1207 PREVOPER(nxt)->flags = data->whilem_c
830247a4 1208 | (RExC_whilem_seen << 4); /* On WHILEM */
2c2d71f5 1209 }
b81d288d 1210 if (data && fl & (SF_HAS_PAR|SF_IN_PAR))
c277df42
IZ
1211 pars++;
1212 if (flags & SCF_DO_SUBSTR) {
1213 SV *last_str = Nullsv;
1214 int counted = mincount != 0;
1215
1216 if (data->last_end > 0 && mincount != 0) { /* Ends with a string. */
5d1c421c
JH
1217#if defined(SPARC64_GCC_WORKAROUND)
1218 I32 b = 0;
1219 STRLEN l = 0;
1220 char *s = NULL;
1221 I32 old = 0;
1222
1223 if (pos_before >= data->last_start_min)
1224 b = pos_before;
1225 else
1226 b = data->last_start_min;
1227
1228 l = 0;
1229 s = SvPV(data->last_found, l);
1230 old = b - data->last_start_min;
1231
1232#else
b81d288d 1233 I32 b = pos_before >= data->last_start_min
c277df42
IZ
1234 ? pos_before : data->last_start_min;
1235 STRLEN l;
1236 char *s = SvPV(data->last_found, l);
a0ed51b3 1237 I32 old = b - data->last_start_min;
5d1c421c 1238#endif
a0ed51b3
LW
1239
1240 if (UTF)
1241 old = utf8_hop((U8*)s, old) - (U8*)s;
c277df42 1242
a0ed51b3 1243 l -= old;
c277df42 1244 /* Get the added string: */
79cb57f6 1245 last_str = newSVpvn(s + old, l);
c277df42
IZ
1246 if (deltanext == 0 && pos_before == b) {
1247 /* What was added is a constant string */
1248 if (mincount > 1) {
1249 SvGROW(last_str, (mincount * l) + 1);
b81d288d 1250 repeatcpy(SvPVX(last_str) + l,
c277df42
IZ
1251 SvPVX(last_str), l, mincount - 1);
1252 SvCUR(last_str) *= mincount;
1253 /* Add additional parts. */
b81d288d 1254 SvCUR_set(data->last_found,
c277df42
IZ
1255 SvCUR(data->last_found) - l);
1256 sv_catsv(data->last_found, last_str);
1257 data->last_end += l * (mincount - 1);
1258 }
2a8d9689
HS
1259 } else {
1260 /* start offset must point into the last copy */
1261 data->last_start_min += minnext * (mincount - 1);
4b2cff9a
HS
1262 data->last_start_max += is_inf ? 0 : (maxcount - 1)
1263 * (minnext + data->pos_delta);
c277df42
IZ
1264 }
1265 }
1266 /* It is counted once already... */
1267 data->pos_min += minnext * (mincount - counted);
1268 data->pos_delta += - counted * deltanext +
1269 (minnext + deltanext) * maxcount - minnext * mincount;
1270 if (mincount != maxcount) {
653099ff
GS
1271 /* Cannot extend fixed substrings found inside
1272 the group. */
830247a4 1273 scan_commit(pRExC_state,data);
c277df42
IZ
1274 if (mincount && last_str) {
1275 sv_setsv(data->last_found, last_str);
1276 data->last_end = data->pos_min;
b81d288d 1277 data->last_start_min =
a0ed51b3 1278 data->pos_min - CHR_SVLEN(last_str);
b81d288d
AB
1279 data->last_start_max = is_inf
1280 ? I32_MAX
c277df42 1281 : data->pos_min + data->pos_delta
a0ed51b3 1282 - CHR_SVLEN(last_str);
c277df42
IZ
1283 }
1284 data->longest = &(data->longest_float);
1285 }
aca2d497 1286 SvREFCNT_dec(last_str);
c277df42 1287 }
405ff068 1288 if (data && (fl & SF_HAS_EVAL))
c277df42
IZ
1289 data->flags |= SF_HAS_EVAL;
1290 optimize_curly_tail:
c277df42 1291 if (OP(oscan) != CURLYX) {
22c35a8c 1292 while (PL_regkind[(U8)OP(next = regnext(oscan))] == NOTHING
c277df42
IZ
1293 && NEXT_OFF(next))
1294 NEXT_OFF(oscan) += NEXT_OFF(next);
1295 }
c277df42 1296 continue;
653099ff 1297 default: /* REF and CLUMP only? */
c277df42 1298 if (flags & SCF_DO_SUBSTR) {
830247a4 1299 scan_commit(pRExC_state,data); /* Cannot expect anything... */
c277df42
IZ
1300 data->longest = &(data->longest_float);
1301 }
aca2d497 1302 is_inf = is_inf_internal = 1;
653099ff 1303 if (flags & SCF_DO_STCLASS_OR)
830247a4 1304 cl_anything(pRExC_state, data->start_class);
653099ff 1305 flags &= ~SCF_DO_STCLASS;
c277df42
IZ
1306 break;
1307 }
a0ed51b3 1308 }
ffc61ed2 1309 else if (strchr((char*)PL_simple,OP(scan))) {
9c5ffd7c 1310 int value = 0;
653099ff 1311
c277df42 1312 if (flags & SCF_DO_SUBSTR) {
830247a4 1313 scan_commit(pRExC_state,data);
c277df42
IZ
1314 data->pos_min++;
1315 }
1316 min++;
653099ff
GS
1317 if (flags & SCF_DO_STCLASS) {
1318 data->start_class->flags &= ~ANYOF_EOS; /* No match on empty */
1319
1320 /* Some of the logic below assumes that switching
1321 locale on will only add false positives. */
1322 switch (PL_regkind[(U8)OP(scan)]) {
653099ff 1323 case SANY:
653099ff
GS
1324 default:
1325 do_default:
1326 /* Perl_croak(aTHX_ "panic: unexpected simple REx opcode %d", OP(scan)); */
1327 if (flags & SCF_DO_STCLASS_OR) /* Allow everything */
830247a4 1328 cl_anything(pRExC_state, data->start_class);
653099ff
GS
1329 break;
1330 case REG_ANY:
1331 if (OP(scan) == SANY)
1332 goto do_default;
1333 if (flags & SCF_DO_STCLASS_OR) { /* Everything but \n */
1334 value = (ANYOF_BITMAP_TEST(data->start_class,'\n')
1335 || (data->start_class->flags & ANYOF_CLASS));
830247a4 1336 cl_anything(pRExC_state, data->start_class);
653099ff
GS
1337 }
1338 if (flags & SCF_DO_STCLASS_AND || !value)
1339 ANYOF_BITMAP_CLEAR(data->start_class,'\n');
1340 break;
1341 case ANYOF:
1342 if (flags & SCF_DO_STCLASS_AND)
1343 cl_and(data->start_class,
1344 (struct regnode_charclass_class*)scan);
1345 else
830247a4 1346 cl_or(pRExC_state, data->start_class,
653099ff
GS
1347 (struct regnode_charclass_class*)scan);
1348 break;
1349 case ALNUM:
1350 if (flags & SCF_DO_STCLASS_AND) {
1351 if (!(data->start_class->flags & ANYOF_LOCALE)) {
1352 ANYOF_CLASS_CLEAR(data->start_class,ANYOF_NALNUM);
1353 for (value = 0; value < 256; value++)
1354 if (!isALNUM(value))
1355 ANYOF_BITMAP_CLEAR(data->start_class, value);
1356 }
1357 }
1358 else {
1359 if (data->start_class->flags & ANYOF_LOCALE)
1360 ANYOF_CLASS_SET(data->start_class,ANYOF_ALNUM);
1361 else {
1362 for (value = 0; value < 256; value++)
1363 if (isALNUM(value))
b81d288d 1364 ANYOF_BITMAP_SET(data->start_class, value);
653099ff
GS
1365 }
1366 }
1367 break;
1368 case ALNUML:
1369 if (flags & SCF_DO_STCLASS_AND) {
1370 if (data->start_class->flags & ANYOF_LOCALE)
1371 ANYOF_CLASS_CLEAR(data->start_class,ANYOF_NALNUM);
1372 }
1373 else {
1374 ANYOF_CLASS_SET(data->start_class,ANYOF_ALNUM);
1375 data->start_class->flags |= ANYOF_LOCALE;
1376 }
1377 break;
1378 case NALNUM:
1379 if (flags & SCF_DO_STCLASS_AND) {
1380 if (!(data->start_class->flags & ANYOF_LOCALE)) {
1381 ANYOF_CLASS_CLEAR(data->start_class,ANYOF_ALNUM);
1382 for (value = 0; value < 256; value++)
1383 if (isALNUM(value))
1384 ANYOF_BITMAP_CLEAR(data->start_class, value);
1385 }
1386 }
1387 else {
1388 if (data->start_class->flags & ANYOF_LOCALE)
1389 ANYOF_CLASS_SET(data->start_class,ANYOF_NALNUM);
1390 else {
1391 for (value = 0; value < 256; value++)
1392 if (!isALNUM(value))
b81d288d 1393 ANYOF_BITMAP_SET(data->start_class, value);
653099ff
GS
1394 }
1395 }
1396 break;
1397 case NALNUML:
1398 if (flags & SCF_DO_STCLASS_AND) {
1399 if (data->start_class->flags & ANYOF_LOCALE)
1400 ANYOF_CLASS_CLEAR(data->start_class,ANYOF_ALNUM);
1401 }
1402 else {
1403 data->start_class->flags |= ANYOF_LOCALE;
1404 ANYOF_CLASS_SET(data->start_class,ANYOF_NALNUM);
1405 }
1406 break;
1407 case SPACE:
1408 if (flags & SCF_DO_STCLASS_AND) {
1409 if (!(data->start_class->flags & ANYOF_LOCALE)) {
1410 ANYOF_CLASS_CLEAR(data->start_class,ANYOF_NSPACE);
1411 for (value = 0; value < 256; value++)
1412 if (!isSPACE(value))
1413 ANYOF_BITMAP_CLEAR(data->start_class, value);
1414 }
1415 }
1416 else {
1417 if (data->start_class->flags & ANYOF_LOCALE)
1418 ANYOF_CLASS_SET(data->start_class,ANYOF_SPACE);
1419 else {
1420 for (value = 0; value < 256; value++)
1421 if (isSPACE(value))
b81d288d 1422 ANYOF_BITMAP_SET(data->start_class, value);
653099ff
GS
1423 }
1424 }
1425 break;
1426 case SPACEL:
1427 if (flags & SCF_DO_STCLASS_AND) {
1428 if (data->start_class->flags & ANYOF_LOCALE)
1429 ANYOF_CLASS_CLEAR(data->start_class,ANYOF_NSPACE);
1430 }
1431 else {
1432 data->start_class->flags |= ANYOF_LOCALE;
1433 ANYOF_CLASS_SET(data->start_class,ANYOF_SPACE);
1434 }
1435 break;
1436 case NSPACE:
1437 if (flags & SCF_DO_STCLASS_AND) {
1438 if (!(data->start_class->flags & ANYOF_LOCALE)) {
1439 ANYOF_CLASS_CLEAR(data->start_class,ANYOF_SPACE);
1440 for (value = 0; value < 256; value++)
1441 if (isSPACE(value))
1442 ANYOF_BITMAP_CLEAR(data->start_class, value);
1443 }
1444 }
1445 else {
1446 if (data->start_class->flags & ANYOF_LOCALE)
1447 ANYOF_CLASS_SET(data->start_class,ANYOF_NSPACE);
1448 else {
1449 for (value = 0; value < 256; value++)
1450 if (!isSPACE(value))
b81d288d 1451 ANYOF_BITMAP_SET(data->start_class, value);
653099ff
GS
1452 }
1453 }
1454 break;
1455 case NSPACEL:
1456 if (flags & SCF_DO_STCLASS_AND) {
1457 if (data->start_class->flags & ANYOF_LOCALE) {
1458 ANYOF_CLASS_CLEAR(data->start_class,ANYOF_SPACE);
1459 for (value = 0; value < 256; value++)
1460 if (!isSPACE(value))
1461 ANYOF_BITMAP_CLEAR(data->start_class, value);
1462 }
1463 }
1464 else {
1465 data->start_class->flags |= ANYOF_LOCALE;
1466 ANYOF_CLASS_SET(data->start_class,ANYOF_NSPACE);
1467 }
1468 break;
1469 case DIGIT:
1470 if (flags & SCF_DO_STCLASS_AND) {
1471 ANYOF_CLASS_CLEAR(data->start_class,ANYOF_NDIGIT);
1472 for (value = 0; value < 256; value++)
1473 if (!isDIGIT(value))
1474 ANYOF_BITMAP_CLEAR(data->start_class, value);
1475 }
1476 else {
1477 if (data->start_class->flags & ANYOF_LOCALE)
1478 ANYOF_CLASS_SET(data->start_class,ANYOF_DIGIT);
1479 else {
1480 for (value = 0; value < 256; value++)
1481 if (isDIGIT(value))
b81d288d 1482 ANYOF_BITMAP_SET(data->start_class, value);
653099ff
GS
1483 }
1484 }
1485 break;
1486 case NDIGIT:
1487 if (flags & SCF_DO_STCLASS_AND) {
1488 ANYOF_CLASS_CLEAR(data->start_class,ANYOF_DIGIT);
1489 for (value = 0; value < 256; value++)
1490 if (isDIGIT(value))
1491 ANYOF_BITMAP_CLEAR(data->start_class, value);
1492 }
1493 else {
1494 if (data->start_class->flags & ANYOF_LOCALE)
1495 ANYOF_CLASS_SET(data->start_class,ANYOF_NDIGIT);
1496 else {
1497 for (value = 0; value < 256; value++)
1498 if (!isDIGIT(value))
b81d288d 1499 ANYOF_BITMAP_SET(data->start_class, value);
653099ff
GS
1500 }
1501 }
1502 break;
1503 }
1504 if (flags & SCF_DO_STCLASS_OR)
1505 cl_and(data->start_class, &and_with);
1506 flags &= ~SCF_DO_STCLASS;
1507 }
a0ed51b3 1508 }
22c35a8c 1509 else if (PL_regkind[(U8)OP(scan)] == EOL && flags & SCF_DO_SUBSTR) {
c277df42
IZ
1510 data->flags |= (OP(scan) == MEOL
1511 ? SF_BEFORE_MEOL
1512 : SF_BEFORE_SEOL);
a0ed51b3 1513 }
653099ff
GS
1514 else if ( PL_regkind[(U8)OP(scan)] == BRANCHJ
1515 /* Lookbehind, or need to calculate parens/evals/stclass: */
1516 && (scan->flags || data || (flags & SCF_DO_STCLASS))
c277df42 1517 && (OP(scan) == IFMATCH || OP(scan) == UNLESSM)) {
653099ff 1518 /* Lookahead/lookbehind */
cb434fcc 1519 I32 deltanext, minnext, fake = 0;
c277df42 1520 regnode *nscan;
653099ff
GS
1521 struct regnode_charclass_class intrnl;
1522 int f = 0;
c277df42
IZ
1523
1524 data_fake.flags = 0;
b81d288d 1525 if (data) {
2c2d71f5 1526 data_fake.whilem_c = data->whilem_c;
cb434fcc
IZ
1527 data_fake.last_closep = data->last_closep;
1528 }
1529 else
1530 data_fake.last_closep = &fake;
653099ff
GS
1531 if ( flags & SCF_DO_STCLASS && !scan->flags
1532 && OP(scan) == IFMATCH ) { /* Lookahead */
830247a4 1533 cl_init(pRExC_state, &intrnl);
653099ff 1534 data_fake.start_class = &intrnl;
e1901655 1535 f |= SCF_DO_STCLASS_AND;
653099ff 1536 }
e1901655
IZ
1537 if (flags & SCF_WHILEM_VISITED_POS)
1538 f |= SCF_WHILEM_VISITED_POS;
c277df42
IZ
1539 next = regnext(scan);
1540 nscan = NEXTOPER(NEXTOPER(scan));
830247a4 1541 minnext = study_chunk(pRExC_state, &nscan, &deltanext, last, &data_fake, f);
c277df42
IZ
1542 if (scan->flags) {
1543 if (deltanext) {
9baa0206 1544 vFAIL("Variable length lookbehind not implemented");
a0ed51b3
LW
1545 }
1546 else if (minnext > U8_MAX) {
9baa0206 1547 vFAIL2("Lookbehind longer than %"UVuf" not implemented", (UV)U8_MAX);
c277df42
IZ
1548 }
1549 scan->flags = minnext;
1550 }
1551 if (data && data_fake.flags & (SF_HAS_PAR|SF_IN_PAR))
1552 pars++;
405ff068 1553 if (data && (data_fake.flags & SF_HAS_EVAL))
c277df42 1554 data->flags |= SF_HAS_EVAL;
2c2d71f5
JH
1555 if (data)
1556 data->whilem_c = data_fake.whilem_c;
e1901655 1557 if (f & SCF_DO_STCLASS_AND) {
653099ff
GS
1558 int was = (data->start_class->flags & ANYOF_EOS);
1559
1560 cl_and(data->start_class, &intrnl);
1561 if (was)
1562 data->start_class->flags |= ANYOF_EOS;
1563 }
a0ed51b3
LW
1564 }
1565 else if (OP(scan) == OPEN) {
c277df42 1566 pars++;
a0ed51b3 1567 }
cb434fcc
IZ
1568 else if (OP(scan) == CLOSE) {
1569 if (ARG(scan) == is_par) {
1570 next = regnext(scan);
c277df42 1571
cb434fcc
IZ
1572 if ( next && (OP(next) != WHILEM) && next < last)
1573 is_par = 0; /* Disable optimization */
1574 }
1575 if (data)
1576 *(data->last_closep) = ARG(scan);
a0ed51b3
LW
1577 }
1578 else if (OP(scan) == EVAL) {
c277df42
IZ
1579 if (data)
1580 data->flags |= SF_HAS_EVAL;
1581 }
96776eda 1582 else if (OP(scan) == LOGICAL && scan->flags == 2) { /* Embedded follows */
0f5d15d6 1583 if (flags & SCF_DO_SUBSTR) {
830247a4 1584 scan_commit(pRExC_state,data);
0f5d15d6
IZ
1585 data->longest = &(data->longest_float);
1586 }
1587 is_inf = is_inf_internal = 1;
653099ff 1588 if (flags & SCF_DO_STCLASS_OR) /* Allow everything */
830247a4 1589 cl_anything(pRExC_state, data->start_class);
96776eda 1590 flags &= ~SCF_DO_STCLASS;
0f5d15d6 1591 }
c277df42
IZ
1592 /* Else: zero-length, ignore. */
1593 scan = regnext(scan);
1594 }
1595
1596 finish:
1597 *scanp = scan;
aca2d497 1598 *deltap = is_inf_internal ? I32_MAX : delta;
b81d288d 1599 if (flags & SCF_DO_SUBSTR && is_inf)
c277df42
IZ
1600 data->pos_delta = I32_MAX - data->pos_min;
1601 if (is_par > U8_MAX)
1602 is_par = 0;
1603 if (is_par && pars==1 && data) {
1604 data->flags |= SF_IN_PAR;
1605 data->flags &= ~SF_HAS_PAR;
a0ed51b3
LW
1606 }
1607 else if (pars && data) {
c277df42
IZ
1608 data->flags |= SF_HAS_PAR;
1609 data->flags &= ~SF_IN_PAR;
1610 }
653099ff
GS
1611 if (flags & SCF_DO_STCLASS_OR)
1612 cl_and(data->start_class, &and_with);
c277df42
IZ
1613 return min;
1614}
1615
76e3520e 1616STATIC I32
830247a4 1617S_add_data(pTHX_ RExC_state_t *pRExC_state, I32 n, char *s)
c277df42 1618{
830247a4 1619 if (RExC_rx->data) {
b81d288d
AB
1620 Renewc(RExC_rx->data,
1621 sizeof(*RExC_rx->data) + sizeof(void*) * (RExC_rx->data->count + n - 1),
c277df42 1622 char, struct reg_data);
830247a4
IZ
1623 Renew(RExC_rx->data->what, RExC_rx->data->count + n, U8);
1624 RExC_rx->data->count += n;
a0ed51b3
LW
1625 }
1626 else {
830247a4 1627 Newc(1207, RExC_rx->data, sizeof(*RExC_rx->data) + sizeof(void*) * (n - 1),
c277df42 1628 char, struct reg_data);
830247a4
IZ
1629 New(1208, RExC_rx->data->what, n, U8);
1630 RExC_rx->data->count = n;
c277df42 1631 }
830247a4
IZ
1632 Copy(s, RExC_rx->data->what + RExC_rx->data->count - n, n, U8);
1633 return RExC_rx->data->count - n;
c277df42
IZ
1634}
1635
d88dccdf 1636void
864dbfa3 1637Perl_reginitcolors(pTHX)
d88dccdf 1638{
d88dccdf
IZ
1639 int i = 0;
1640 char *s = PerlEnv_getenv("PERL_RE_COLORS");
b81d288d 1641
d88dccdf
IZ
1642 if (s) {
1643 PL_colors[0] = s = savepv(s);
1644 while (++i < 6) {
1645 s = strchr(s, '\t');
1646 if (s) {
1647 *s = '\0';
1648 PL_colors[i] = ++s;
1649 }
1650 else
c712d376 1651 PL_colors[i] = s = "";
d88dccdf
IZ
1652 }
1653 } else {
b81d288d 1654 while (i < 6)
d88dccdf
IZ
1655 PL_colors[i++] = "";
1656 }
1657 PL_colorset = 1;
1658}
1659
8615cb43 1660
a687059c 1661/*
e50aee73 1662 - pregcomp - compile a regular expression into internal code
a687059c
LW
1663 *
1664 * We can't allocate space until we know how big the compiled form will be,
1665 * but we can't compile it (and thus know how big it is) until we've got a
1666 * place to put the code. So we cheat: we compile it twice, once with code
1667 * generation turned off and size counting turned on, and once "for real".
1668 * This also means that we don't allocate space until we are sure that the
1669 * thing really will compile successfully, and we never have to move the
1670 * code and thus invalidate pointers into it. (Note that it has to be in
1671 * one piece because free() must be able to free it all.) [NB: not true in perl]
1672 *
1673 * Beware that the optimization-preparation code in here knows about some
1674 * of the structure of the compiled regexp. [I'll say.]
1675 */
1676regexp *
864dbfa3 1677Perl_pregcomp(pTHX_ char *exp, char *xend, PMOP *pm)
a687059c 1678{
a0d0e21e 1679 register regexp *r;
c277df42 1680 regnode *scan;
c277df42 1681 regnode *first;
a0d0e21e 1682 I32 flags;
a0d0e21e
LW
1683 I32 minlen = 0;
1684 I32 sawplus = 0;
1685 I32 sawopen = 0;
2c2d71f5 1686 scan_data_t data;
830247a4
IZ
1687 RExC_state_t RExC_state;
1688 RExC_state_t *pRExC_state = &RExC_state;
a0d0e21e
LW
1689
1690 if (exp == NULL)
c277df42 1691 FAIL("NULL regexp argument");
a0d0e21e 1692
84e09d5e
JH
1693 /* XXXX This looks very suspicious... */
1694 if (pm->op_pmdynflags & PMdf_CMP_UTF8)
1695 RExC_utf8 = 1;
1696 else
1697 RExC_utf8 = 0;
a0ed51b3 1698
5cfc7842 1699 RExC_precomp = exp;
35ef4773 1700 DEBUG_r(if (!PL_colorset) reginitcolors());
2c2d71f5 1701 DEBUG_r(PerlIO_printf(Perl_debug_log, "%sCompiling REx%s `%s%*s%s'\n",
d88dccdf 1702 PL_colors[4],PL_colors[5],PL_colors[0],
830247a4
IZ
1703 (int)(xend - exp), RExC_precomp, PL_colors[1]));
1704 RExC_flags16 = pm->op_pmflags;
1705 RExC_sawback = 0;
bbce6d69 1706
830247a4
IZ
1707 RExC_seen = 0;
1708 RExC_seen_zerolen = *exp == '^' ? -1 : 0;
1709 RExC_seen_evals = 0;
1710 RExC_extralen = 0;
c277df42 1711
bbce6d69 1712 /* First pass: determine size, legality. */
830247a4 1713 RExC_parse = exp;
fac92740 1714 RExC_start = exp;
830247a4
IZ
1715 RExC_end = xend;
1716 RExC_naughty = 0;
1717 RExC_npar = 1;
1718 RExC_size = 0L;
1719 RExC_emit = &PL_regdummy;
1720 RExC_whilem_seen = 0;
85ddcde9
JH
1721#if 0 /* REGC() is (currently) a NOP at the first pass.
1722 * Clever compilers notice this and complain. --jhi */
830247a4 1723 REGC((U8)REG_MAGIC, (char*)RExC_emit);
85ddcde9 1724#endif
830247a4 1725 if (reg(pRExC_state, 0, &flags) == NULL) {
830247a4 1726 RExC_precomp = Nullch;
a0d0e21e
LW
1727 return(NULL);
1728 }
830247a4 1729 DEBUG_r(PerlIO_printf(Perl_debug_log, "size %"IVdf" ", (IV)RExC_size));
c277df42 1730
c277df42
IZ
1731 /* Small enough for pointer-storage convention?
1732 If extralen==0, this means that we will not need long jumps. */
830247a4
IZ
1733 if (RExC_size >= 0x10000L && RExC_extralen)
1734 RExC_size += RExC_extralen;
c277df42 1735 else
830247a4
IZ
1736 RExC_extralen = 0;
1737 if (RExC_whilem_seen > 15)
1738 RExC_whilem_seen = 15;
a0d0e21e 1739
bbce6d69 1740 /* Allocate space and initialize. */
830247a4 1741 Newc(1001, r, sizeof(regexp) + (unsigned)RExC_size * sizeof(regnode),
c277df42 1742 char, regexp);
a0d0e21e 1743 if (r == NULL)
b45f050a
JF
1744 FAIL("Regexp out of space");
1745
0f79a09d
GS
1746#ifdef DEBUGGING
1747 /* avoid reading uninitialized memory in DEBUGGING code in study_chunk() */
830247a4 1748 Zero(r, sizeof(regexp) + (unsigned)RExC_size * sizeof(regnode), char);
0f79a09d 1749#endif
c277df42 1750 r->refcnt = 1;
bbce6d69 1751 r->prelen = xend - exp;
5cfc7842 1752 r->precomp = savepvn(RExC_precomp, r->prelen);
cf93c79d
IZ
1753 r->subbeg = NULL;
1754 r->reganch = pm->op_pmflags & PMf_COMPILETIME;
830247a4 1755 r->nparens = RExC_npar - 1; /* set early to validate backrefs */
4327152a
IZ
1756
1757 r->substrs = 0; /* Useful during FAIL. */
1758 r->startp = 0; /* Useful during FAIL. */
1759 r->endp = 0; /* Useful during FAIL. */
1760
fac92740
MJD
1761 Newz(1304, r->offsets, 2*RExC_size+1, U32); /* MJD 20001228 */
1762 if (r->offsets) {
1763 r->offsets[0] = RExC_size;
1764 }
1765 DEBUG_r(PerlIO_printf(Perl_debug_log,
392fbf5d 1766 "%s %"UVuf" bytes for offset annotations.\n",
fac92740 1767 r->offsets ? "Got" : "Couldn't get",
392fbf5d 1768 (UV)((2*RExC_size+1) * sizeof(U32))));
fac92740 1769
830247a4 1770 RExC_rx = r;
bbce6d69 1771
1772 /* Second pass: emit code. */
055bb491 1773 RExC_flags16 = pm->op_pmflags; /* don't let top level (?i) bleed */
830247a4
IZ
1774 RExC_parse = exp;
1775 RExC_end = xend;
1776 RExC_naughty = 0;
1777 RExC_npar = 1;
fac92740 1778 RExC_emit_start = r->program;
830247a4 1779 RExC_emit = r->program;
2cd61cdb 1780 /* Store the count of eval-groups for security checks: */
830247a4
IZ
1781 RExC_emit->next_off = ((RExC_seen_evals > U16_MAX) ? U16_MAX : RExC_seen_evals);
1782 REGC((U8)REG_MAGIC, (char*) RExC_emit++);
c277df42 1783 r->data = 0;
830247a4 1784 if (reg(pRExC_state, 0, &flags) == NULL)
a0d0e21e
LW
1785 return(NULL);
1786
1787 /* Dig out information for optimizations. */
cf93c79d 1788 r->reganch = pm->op_pmflags & PMf_COMPILETIME; /* Again? */
830247a4 1789 pm->op_pmflags = RExC_flags16;
a0ed51b3 1790 if (UTF)
5ff6fc6d 1791 r->reganch |= ROPT_UTF8; /* Unicode in it? */
c277df42 1792 r->regstclass = NULL;
830247a4 1793 if (RExC_naughty >= 10) /* Probably an expensive pattern. */
a0ed51b3 1794 r->reganch |= ROPT_NAUGHTY;
c277df42 1795 scan = r->program + 1; /* First BRANCH. */
2779dcf1
IZ
1796
1797 /* XXXX To minimize changes to RE engine we always allocate
1798 3-units-long substrs field. */
1799 Newz(1004, r->substrs, 1, struct reg_substr_data);
1800
2c2d71f5 1801 StructCopy(&zero_scan_data, &data, scan_data_t);
653099ff 1802 /* XXXX Should not we check for something else? Usually it is OPEN1... */
c277df42 1803 if (OP(scan) != BRANCH) { /* Only one top-level choice. */
c277df42 1804 I32 fake;
c5254dd6 1805 STRLEN longest_float_length, longest_fixed_length;
653099ff
GS
1806 struct regnode_charclass_class ch_class;
1807 int stclass_flag;
cb434fcc 1808 I32 last_close = 0;
a0d0e21e
LW
1809
1810 first = scan;
c277df42 1811 /* Skip introductions and multiplicators >= 1. */
a0d0e21e 1812 while ((OP(first) == OPEN && (sawopen = 1)) ||
653099ff 1813 /* An OR of *one* alternative - should not happen now. */
a0d0e21e
LW
1814 (OP(first) == BRANCH && OP(regnext(first)) != BRANCH) ||
1815 (OP(first) == PLUS) ||
1816 (OP(first) == MINMOD) ||
653099ff 1817 /* An {n,m} with n>0 */
22c35a8c 1818 (PL_regkind[(U8)OP(first)] == CURLY && ARG1(first) > 0) ) {
a0d0e21e
LW
1819 if (OP(first) == PLUS)
1820 sawplus = 1;
1821 else
1822 first += regarglen[(U8)OP(first)];
1823 first = NEXTOPER(first);
a687059c
LW
1824 }
1825
a0d0e21e
LW
1826 /* Starting-point info. */
1827 again:
653099ff 1828 if (PL_regkind[(U8)OP(first)] == EXACT) {
1aa99e6b
IH
1829 if (OP(first) == EXACT)
1830 ; /* Empty, get anchored substr later. */
1831 else if ((OP(first) == EXACTF || OP(first) == EXACTFL))
b3c9acc1
IZ
1832 r->regstclass = first;
1833 }
653099ff 1834 else if (strchr((char*)PL_simple,OP(first)))
a0d0e21e 1835 r->regstclass = first;
22c35a8c
GS
1836 else if (PL_regkind[(U8)OP(first)] == BOUND ||
1837 PL_regkind[(U8)OP(first)] == NBOUND)
a0d0e21e 1838 r->regstclass = first;
22c35a8c 1839 else if (PL_regkind[(U8)OP(first)] == BOL) {
cad2e5aa
JH
1840 r->reganch |= (OP(first) == MBOL
1841 ? ROPT_ANCH_MBOL
1842 : (OP(first) == SBOL
1843 ? ROPT_ANCH_SBOL
1844 : ROPT_ANCH_BOL));
a0d0e21e 1845 first = NEXTOPER(first);
774d564b 1846 goto again;
1847 }
1848 else if (OP(first) == GPOS) {
1849 r->reganch |= ROPT_ANCH_GPOS;
1850 first = NEXTOPER(first);
1851 goto again;
a0d0e21e
LW
1852 }
1853 else if ((OP(first) == STAR &&
22c35a8c 1854 PL_regkind[(U8)OP(NEXTOPER(first))] == REG_ANY) &&
a0d0e21e
LW
1855 !(r->reganch & ROPT_ANCH) )
1856 {
1857 /* turn .* into ^.* with an implied $*=1 */
cad2e5aa
JH
1858 int type = OP(NEXTOPER(first));
1859
ffc61ed2 1860 if (type == REG_ANY)
cad2e5aa
JH
1861 type = ROPT_ANCH_MBOL;
1862 else
1863 type = ROPT_ANCH_SBOL;
1864
1865 r->reganch |= type | ROPT_IMPLICIT;
a0d0e21e 1866 first = NEXTOPER(first);
774d564b 1867 goto again;
a0d0e21e 1868 }
b81d288d 1869 if (sawplus && (!sawopen || !RExC_sawback)
830247a4 1870 && !(RExC_seen & REG_SEEN_EVAL)) /* May examine pos and $& */
cad2e5aa
JH
1871 /* x+ must match at the 1st pos of run of x's */
1872 r->reganch |= ROPT_SKIP;
a0d0e21e 1873
c277df42 1874 /* Scan is after the zeroth branch, first is atomic matcher. */
b81d288d 1875 DEBUG_r(PerlIO_printf(Perl_debug_log, "first at %"IVdf"\n",
b900a521 1876 (IV)(first - scan + 1)));
a0d0e21e
LW
1877 /*
1878 * If there's something expensive in the r.e., find the
1879 * longest literal string that must appear and make it the
1880 * regmust. Resolve ties in favor of later strings, since
1881 * the regstart check works with the beginning of the r.e.
1882 * and avoiding duplication strengthens checking. Not a
1883 * strong reason, but sufficient in the absence of others.
1884 * [Now we resolve ties in favor of the earlier string if
c277df42 1885 * it happens that c_offset_min has been invalidated, since the
a0d0e21e
LW
1886 * earlier string may buy us something the later one won't.]
1887 */
a0d0e21e 1888 minlen = 0;
a687059c 1889
79cb57f6
GS
1890 data.longest_fixed = newSVpvn("",0);
1891 data.longest_float = newSVpvn("",0);
1892 data.last_found = newSVpvn("",0);
c277df42
IZ
1893 data.longest = &(data.longest_fixed);
1894 first = scan;
653099ff 1895 if (!r->regstclass) {
830247a4 1896 cl_init(pRExC_state, &ch_class);
653099ff
GS
1897 data.start_class = &ch_class;
1898 stclass_flag = SCF_DO_STCLASS_AND;
1899 } else /* XXXX Check for BOUND? */
1900 stclass_flag = 0;
cb434fcc 1901 data.last_closep = &last_close;
653099ff 1902
830247a4 1903 minlen = study_chunk(pRExC_state, &first, &fake, scan + RExC_size, /* Up to end */
e1901655 1904 &data, SCF_DO_SUBSTR | SCF_WHILEM_VISITED_POS | stclass_flag);
830247a4 1905 if ( RExC_npar == 1 && data.longest == &(data.longest_fixed)
b81d288d 1906 && data.last_start_min == 0 && data.last_end > 0
830247a4
IZ
1907 && !RExC_seen_zerolen
1908 && (!(RExC_seen & REG_SEEN_GPOS) || (r->reganch & ROPT_ANCH_GPOS)))
c277df42 1909 r->reganch |= ROPT_CHECK_ALL;
830247a4 1910 scan_commit(pRExC_state, &data);
c277df42
IZ
1911 SvREFCNT_dec(data.last_found);
1912
a0ed51b3 1913 longest_float_length = CHR_SVLEN(data.longest_float);
c5254dd6 1914 if (longest_float_length
c277df42
IZ
1915 || (data.flags & SF_FL_BEFORE_EOL
1916 && (!(data.flags & SF_FL_BEFORE_MEOL)
830247a4 1917 || (RExC_flags16 & PMf_MULTILINE)))) {
cf93c79d
IZ
1918 int t;
1919
a0ed51b3 1920 if (SvCUR(data.longest_fixed) /* ok to leave SvCUR */
aca2d497
IZ
1921 && data.offset_fixed == data.offset_float_min
1922 && SvCUR(data.longest_fixed) == SvCUR(data.longest_float))
1923 goto remove_float; /* As in (a)+. */
1924
c277df42
IZ
1925 r->float_substr = data.longest_float;
1926 r->float_min_offset = data.offset_float_min;
1927 r->float_max_offset = data.offset_float_max;
cf93c79d
IZ
1928 t = (data.flags & SF_FL_BEFORE_EOL /* Can't have SEOL and MULTI */
1929 && (!(data.flags & SF_FL_BEFORE_MEOL)
830247a4 1930 || (RExC_flags16 & PMf_MULTILINE)));
cf93c79d 1931 fbm_compile(r->float_substr, t ? FBMcf_TAIL : 0);
a0ed51b3
LW
1932 }
1933 else {
aca2d497 1934 remove_float:
c277df42
IZ
1935 r->float_substr = Nullsv;
1936 SvREFCNT_dec(data.longest_float);
c5254dd6 1937 longest_float_length = 0;
a0d0e21e 1938 }
c277df42 1939
a0ed51b3 1940 longest_fixed_length = CHR_SVLEN(data.longest_fixed);
c5254dd6 1941 if (longest_fixed_length
c277df42
IZ
1942 || (data.flags & SF_FIX_BEFORE_EOL /* Cannot have SEOL and MULTI */
1943 && (!(data.flags & SF_FIX_BEFORE_MEOL)
830247a4 1944 || (RExC_flags16 & PMf_MULTILINE)))) {
cf93c79d
IZ
1945 int t;
1946
c277df42
IZ
1947 r->anchored_substr = data.longest_fixed;
1948 r->anchored_offset = data.offset_fixed;
cf93c79d
IZ
1949 t = (data.flags & SF_FIX_BEFORE_EOL /* Can't have SEOL and MULTI */
1950 && (!(data.flags & SF_FIX_BEFORE_MEOL)
830247a4 1951 || (RExC_flags16 & PMf_MULTILINE)));
cf93c79d 1952 fbm_compile(r->anchored_substr, t ? FBMcf_TAIL : 0);
a0ed51b3
LW
1953 }
1954 else {
c277df42
IZ
1955 r->anchored_substr = Nullsv;
1956 SvREFCNT_dec(data.longest_fixed);
c5254dd6 1957 longest_fixed_length = 0;
a0d0e21e 1958 }
b81d288d 1959 if (r->regstclass
ffc61ed2 1960 && (OP(r->regstclass) == REG_ANY || OP(r->regstclass) == SANY))
653099ff
GS
1961 r->regstclass = NULL;
1962 if ((!r->anchored_substr || r->anchored_offset) && stclass_flag
1963 && !(data.start_class->flags & ANYOF_EOS)
1964 && !cl_is_anything(data.start_class)) {
830247a4 1965 I32 n = add_data(pRExC_state, 1, "f");
653099ff 1966
b81d288d 1967 New(1006, RExC_rx->data->data[n], 1,
653099ff
GS
1968 struct regnode_charclass_class);
1969 StructCopy(data.start_class,
830247a4 1970 (struct regnode_charclass_class*)RExC_rx->data->data[n],
653099ff 1971 struct regnode_charclass_class);
830247a4 1972 r->regstclass = (regnode*)RExC_rx->data->data[n];
653099ff 1973 r->reganch &= ~ROPT_SKIP; /* Used in find_byclass(). */
ffc61ed2 1974 PL_regdata = r->data; /* for regprop() */
9c5ffd7c
JH
1975 DEBUG_r({ SV *sv = sv_newmortal();
1976 regprop(sv, (regnode*)data.start_class);
1977 PerlIO_printf(Perl_debug_log,
1978 "synthetic stclass `%s'.\n",
1979 SvPVX(sv));});
653099ff 1980 }
c277df42
IZ
1981
1982 /* A temporary algorithm prefers floated substr to fixed one to dig more info. */
c5254dd6 1983 if (longest_fixed_length > longest_float_length) {
c277df42
IZ
1984 r->check_substr = r->anchored_substr;
1985 r->check_offset_min = r->check_offset_max = r->anchored_offset;
1986 if (r->reganch & ROPT_ANCH_SINGLE)
1987 r->reganch |= ROPT_NOSCAN;
a0ed51b3
LW
1988 }
1989 else {
c277df42
IZ
1990 r->check_substr = r->float_substr;
1991 r->check_offset_min = data.offset_float_min;
1992 r->check_offset_max = data.offset_float_max;
a0d0e21e 1993 }
30382c73
IZ
1994 /* XXXX Currently intuiting is not compatible with ANCH_GPOS.
1995 This should be changed ASAP! */
1996 if (r->check_substr && !(r->reganch & ROPT_ANCH_GPOS)) {
cad2e5aa
JH
1997 r->reganch |= RE_USE_INTUIT;
1998 if (SvTAIL(r->check_substr))
1999 r->reganch |= RE_INTUIT_TAIL;
2000 }
a0ed51b3
LW
2001 }
2002 else {
c277df42
IZ
2003 /* Several toplevels. Best we can is to set minlen. */
2004 I32 fake;
653099ff 2005 struct regnode_charclass_class ch_class;
cb434fcc 2006 I32 last_close = 0;
c277df42
IZ
2007
2008 DEBUG_r(PerlIO_printf(Perl_debug_log, "\n"));
2009 scan = r->program + 1;
830247a4 2010 cl_init(pRExC_state, &ch_class);
653099ff 2011 data.start_class = &ch_class;
cb434fcc 2012 data.last_closep = &last_close;
e1901655 2013 minlen = study_chunk(pRExC_state, &scan, &fake, scan + RExC_size, &data, SCF_DO_STCLASS_AND|SCF_WHILEM_VISITED_POS);
c277df42 2014 r->check_substr = r->anchored_substr = r->float_substr = Nullsv;
653099ff
GS
2015 if (!(data.start_class->flags & ANYOF_EOS)
2016 && !cl_is_anything(data.start_class)) {
830247a4 2017 I32 n = add_data(pRExC_state, 1, "f");
653099ff 2018
b81d288d 2019 New(1006, RExC_rx->data->data[n], 1,
653099ff
GS
2020 struct regnode_charclass_class);
2021 StructCopy(data.start_class,
830247a4 2022 (struct regnode_charclass_class*)RExC_rx->data->data[n],
653099ff 2023 struct regnode_charclass_class);
830247a4 2024 r->regstclass = (regnode*)RExC_rx->data->data[n];
653099ff 2025 r->reganch &= ~ROPT_SKIP; /* Used in find_byclass(). */
9c5ffd7c
JH
2026 DEBUG_r({ SV* sv = sv_newmortal();
2027 regprop(sv, (regnode*)data.start_class);
2028 PerlIO_printf(Perl_debug_log,
2029 "synthetic stclass `%s'.\n",
2030 SvPVX(sv));});
653099ff 2031 }
a0d0e21e
LW
2032 }
2033
a0d0e21e 2034 r->minlen = minlen;
b81d288d 2035 if (RExC_seen & REG_SEEN_GPOS)
c277df42 2036 r->reganch |= ROPT_GPOS_SEEN;
830247a4 2037 if (RExC_seen & REG_SEEN_LOOKBEHIND)
c277df42 2038 r->reganch |= ROPT_LOOKBEHIND_SEEN;
830247a4 2039 if (RExC_seen & REG_SEEN_EVAL)
ce862d02 2040 r->reganch |= ROPT_EVAL_SEEN;
f33976b4
DB
2041 if (RExC_seen & REG_SEEN_CANY)
2042 r->reganch |= ROPT_CANY_SEEN;
830247a4
IZ
2043 Newz(1002, r->startp, RExC_npar, I32);
2044 Newz(1002, r->endp, RExC_npar, I32);
ffc61ed2 2045 PL_regdata = r->data; /* for regprop() */
a0d0e21e
LW
2046 DEBUG_r(regdump(r));
2047 return(r);
a687059c
LW
2048}
2049
2050/*
2051 - reg - regular expression, i.e. main body or parenthesized thing
2052 *
2053 * Caller must absorb opening parenthesis.
2054 *
2055 * Combining parenthesis handling with the base level of regular expression
2056 * is a trifle forced, but the need to tie the tails of the branches to what
2057 * follows makes it hard to avoid.
2058 */
76e3520e 2059STATIC regnode *
830247a4 2060S_reg(pTHX_ RExC_state_t *pRExC_state, I32 paren, I32 *flagp)
c277df42 2061 /* paren: Parenthesized? 0=top, 1=(, inside: changed to letter. */
a687059c 2062{
c277df42
IZ
2063 register regnode *ret; /* Will be the head of the group. */
2064 register regnode *br;
2065 register regnode *lastbr;
2066 register regnode *ender = 0;
a0d0e21e 2067 register I32 parno = 0;
830247a4 2068 I32 flags, oregflags = RExC_flags16, have_branch = 0, open = 0;
9d1d55b5
JP
2069
2070 /* for (?g), (?gc), and (?o) warnings; warning
2071 about (?c) will warn about (?g) -- japhy */
2072
2073 I32 wastedflags = 0x00,
2074 wasted_o = 0x01,
2075 wasted_g = 0x02,
2076 wasted_gc = 0x02 | 0x04,
2077 wasted_c = 0x04;
2078
fac92740 2079 char * parse_start = RExC_parse; /* MJD */
830247a4 2080 char *oregcomp_parse = RExC_parse;
c277df42 2081 char c;
a0d0e21e 2082
821b33a5 2083 *flagp = 0; /* Tentatively. */
a0d0e21e 2084
9d1d55b5 2085
a0d0e21e
LW
2086 /* Make an OPEN node, if parenthesized. */
2087 if (paren) {
fac92740 2088 if (*RExC_parse == '?') { /* (?...) */
ca9dfc88
IZ
2089 U16 posflags = 0, negflags = 0;
2090 U16 *flagsp = &posflags;
0f5d15d6 2091 int logical = 0;
830247a4 2092 char *seqstart = RExC_parse;
ca9dfc88 2093
830247a4
IZ
2094 RExC_parse++;
2095 paren = *RExC_parse++;
c277df42 2096 ret = NULL; /* For look-ahead/behind. */
a0d0e21e 2097 switch (paren) {
fac92740 2098 case '<': /* (?<...) */
830247a4 2099 RExC_seen |= REG_SEEN_LOOKBEHIND;
b81d288d 2100 if (*RExC_parse == '!')
c277df42 2101 paren = ',';
b81d288d 2102 if (*RExC_parse != '=' && *RExC_parse != '!')
c277df42 2103 goto unknown;
830247a4 2104 RExC_parse++;
fac92740
MJD
2105 case '=': /* (?=...) */
2106 case '!': /* (?!...) */
830247a4 2107 RExC_seen_zerolen++;
fac92740
MJD
2108 case ':': /* (?:...) */
2109 case '>': /* (?>...) */
a0d0e21e 2110 break;
fac92740
MJD
2111 case '$': /* (?$...) */
2112 case '@': /* (?@...) */
8615cb43 2113 vFAIL2("Sequence (?%c...) not implemented", (int)paren);
a0d0e21e 2114 break;
fac92740 2115 case '#': /* (?#...) */
830247a4
IZ
2116 while (*RExC_parse && *RExC_parse != ')')
2117 RExC_parse++;
2118 if (*RExC_parse != ')')
c277df42 2119 FAIL("Sequence (?#... not terminated");
830247a4 2120 nextchar(pRExC_state);
a0d0e21e
LW
2121 *flagp = TRYAGAIN;
2122 return NULL;
fac92740 2123 case 'p': /* (?p...) */
f9373011
PM
2124 if (SIZE_ONLY && ckWARN2(WARN_DEPRECATED, WARN_REGEXP))
2125 vWARNdep(RExC_parse, "(?p{}) is deprecated - use (??{})");
8c8ad484 2126 /* FALL THROUGH*/
fac92740 2127 case '?': /* (??...) */
0f5d15d6 2128 logical = 1;
830247a4 2129 paren = *RExC_parse++;
0f5d15d6 2130 /* FALL THROUGH */
fac92740 2131 case '{': /* (?{...}) */
c277df42 2132 {
c277df42
IZ
2133 I32 count = 1, n = 0;
2134 char c;
830247a4 2135 char *s = RExC_parse;
c277df42
IZ
2136 SV *sv;
2137 OP_4tree *sop, *rop;
2138
830247a4
IZ
2139 RExC_seen_zerolen++;
2140 RExC_seen |= REG_SEEN_EVAL;
2141 while (count && (c = *RExC_parse)) {
2142 if (c == '\\' && RExC_parse[1])
2143 RExC_parse++;
b81d288d 2144 else if (c == '{')
c277df42 2145 count++;
b81d288d 2146 else if (c == '}')
c277df42 2147 count--;
830247a4 2148 RExC_parse++;
c277df42 2149 }
830247a4 2150 if (*RExC_parse != ')')
b45f050a 2151 {
b81d288d 2152 RExC_parse = s;
b45f050a
JF
2153 vFAIL("Sequence (?{...}) not terminated or not {}-balanced");
2154 }
c277df42
IZ
2155 if (!SIZE_ONLY) {
2156 AV *av;
b81d288d
AB
2157
2158 if (RExC_parse - 1 - s)
830247a4 2159 sv = newSVpvn(s, RExC_parse - 1 - s);
c277df42 2160 else
79cb57f6 2161 sv = newSVpvn("", 0);
c277df42 2162
569233ed
SB
2163 ENTER;
2164 Perl_save_re_context(aTHX);
c277df42 2165 rop = sv_compile_2op(sv, &sop, "re", &av);
9b978d73
DM
2166 sop->op_private |= OPpREFCOUNTED;
2167 /* re_dup will OpREFCNT_inc */
2168 OpREFCNT_set(sop, 1);
569233ed 2169 LEAVE;
c277df42 2170
830247a4
IZ
2171 n = add_data(pRExC_state, 3, "nop");
2172 RExC_rx->data->data[n] = (void*)rop;
2173 RExC_rx->data->data[n+1] = (void*)sop;
2174 RExC_rx->data->data[n+2] = (void*)av;
c277df42 2175 SvREFCNT_dec(sv);
a0ed51b3 2176 }
e24b16f9 2177 else { /* First pass */
830247a4 2178 if (PL_reginterp_cnt < ++RExC_seen_evals
e24b16f9 2179 && PL_curcop != &PL_compiling)
2cd61cdb
IZ
2180 /* No compiled RE interpolated, has runtime
2181 components ===> unsafe. */
2182 FAIL("Eval-group not allowed at runtime, use re 'eval'");
5b61d3f7 2183 if (PL_tainting && PL_tainted)
cc6b7395 2184 FAIL("Eval-group in insecure regular expression");
c277df42
IZ
2185 }
2186
830247a4 2187 nextchar(pRExC_state);
0f5d15d6 2188 if (logical) {
830247a4 2189 ret = reg_node(pRExC_state, LOGICAL);
0f5d15d6
IZ
2190 if (!SIZE_ONLY)
2191 ret->flags = 2;
830247a4 2192 regtail(pRExC_state, ret, reganode(pRExC_state, EVAL, n));
fac92740 2193 /* deal with the length of this later - MJD */
0f5d15d6
IZ
2194 return ret;
2195 }
830247a4 2196 return reganode(pRExC_state, EVAL, n);
c277df42 2197 }
fac92740 2198 case '(': /* (?(?{...})...) and (?(?=...)...) */
c277df42 2199 {
fac92740 2200 if (RExC_parse[0] == '?') { /* (?(?...)) */
b81d288d
AB
2201 if (RExC_parse[1] == '=' || RExC_parse[1] == '!'
2202 || RExC_parse[1] == '<'
830247a4 2203 || RExC_parse[1] == '{') { /* Lookahead or eval. */
c277df42
IZ
2204 I32 flag;
2205
830247a4 2206 ret = reg_node(pRExC_state, LOGICAL);
0f5d15d6
IZ
2207 if (!SIZE_ONLY)
2208 ret->flags = 1;
830247a4 2209 regtail(pRExC_state, ret, reg(pRExC_state, 1, &flag));
c277df42 2210 goto insert_if;
b81d288d 2211 }
a0ed51b3 2212 }
830247a4 2213 else if (RExC_parse[0] >= '1' && RExC_parse[0] <= '9' ) {
fac92740 2214 /* (?(1)...) */
830247a4 2215 parno = atoi(RExC_parse++);
c277df42 2216
830247a4
IZ
2217 while (isDIGIT(*RExC_parse))
2218 RExC_parse++;
fac92740
MJD
2219 ret = reganode(pRExC_state, GROUPP, parno);
2220
830247a4 2221 if ((c = *nextchar(pRExC_state)) != ')')
b45f050a 2222 vFAIL("Switch condition not recognized");
c277df42 2223 insert_if:
830247a4
IZ
2224 regtail(pRExC_state, ret, reganode(pRExC_state, IFTHEN, 0));
2225 br = regbranch(pRExC_state, &flags, 1);
c277df42 2226 if (br == NULL)
830247a4 2227 br = reganode(pRExC_state, LONGJMP, 0);
c277df42 2228 else
830247a4
IZ
2229 regtail(pRExC_state, br, reganode(pRExC_state, LONGJMP, 0));
2230 c = *nextchar(pRExC_state);
d1b80229
IZ
2231 if (flags&HASWIDTH)
2232 *flagp |= HASWIDTH;
c277df42 2233 if (c == '|') {
830247a4
IZ
2234 lastbr = reganode(pRExC_state, IFTHEN, 0); /* Fake one for optimizer. */
2235 regbranch(pRExC_state, &flags, 1);
2236 regtail(pRExC_state, ret, lastbr);
d1b80229
IZ
2237 if (flags&HASWIDTH)
2238 *flagp |= HASWIDTH;
830247a4 2239 c = *nextchar(pRExC_state);
a0ed51b3
LW
2240 }
2241 else
c277df42
IZ
2242 lastbr = NULL;
2243 if (c != ')')
8615cb43 2244 vFAIL("Switch (?(condition)... contains too many branches");
830247a4
IZ
2245 ender = reg_node(pRExC_state, TAIL);
2246 regtail(pRExC_state, br, ender);
c277df42 2247 if (lastbr) {
830247a4
IZ
2248 regtail(pRExC_state, lastbr, ender);
2249 regtail(pRExC_state, NEXTOPER(NEXTOPER(lastbr)), ender);
a0ed51b3
LW
2250 }
2251 else
830247a4 2252 regtail(pRExC_state, ret, ender);
c277df42 2253 return ret;
a0ed51b3
LW
2254 }
2255 else {
830247a4 2256 vFAIL2("Unknown switch condition (?(%.2s", RExC_parse);
c277df42
IZ
2257 }
2258 }
1b1626e4 2259 case 0:
830247a4 2260 RExC_parse--; /* for vFAIL to print correctly */
8615cb43 2261 vFAIL("Sequence (? incomplete");
1b1626e4 2262 break;
a0d0e21e 2263 default:
830247a4 2264 --RExC_parse;
fac92740 2265 parse_flags: /* (?i) */
830247a4 2266 while (*RExC_parse && strchr("iogcmsx", *RExC_parse)) {
9d1d55b5
JP
2267 /* (?g), (?gc) and (?o) are useless here
2268 and must be globally applied -- japhy */
2269
2270 if (*RExC_parse == 'o' || *RExC_parse == 'g') {
2271 if (SIZE_ONLY && ckWARN(WARN_REGEXP)) {
2272 I32 wflagbit = *RExC_parse == 'o' ? wasted_o : wasted_g;
2273 if (! (wastedflags & wflagbit) ) {
2274 wastedflags |= wflagbit;
2275 vWARN5(
2276 RExC_parse + 1,
2277 "Useless (%s%c) - %suse /%c modifier",
2278 flagsp == &negflags ? "?-" : "?",
2279 *RExC_parse,
2280 flagsp == &negflags ? "don't " : "",
2281 *RExC_parse
2282 );
2283 }
2284 }
2285 }
2286 else if (*RExC_parse == 'c') {
2287 if (SIZE_ONLY && ckWARN(WARN_REGEXP)) {
2288 if (! (wastedflags & wasted_c) ) {
2289 wastedflags |= wasted_gc;
2290 vWARN3(
2291 RExC_parse + 1,
2292 "Useless (%sc) - %suse /gc modifier",
2293 flagsp == &negflags ? "?-" : "?",
2294 flagsp == &negflags ? "don't " : ""
2295 );
2296 }
2297 }
2298 }
2299 else { pmflag(flagsp, *RExC_parse); }
2300
830247a4 2301 ++RExC_parse;
ca9dfc88 2302 }
830247a4 2303 if (*RExC_parse == '-') {
ca9dfc88 2304 flagsp = &negflags;
9d1d55b5 2305 wastedflags = 0; /* reset so (?g-c) warns twice */
830247a4 2306 ++RExC_parse;
ca9dfc88 2307 goto parse_flags;
48c036b1 2308 }
830247a4
IZ
2309 RExC_flags16 |= posflags;
2310 RExC_flags16 &= ~negflags;
2311 if (*RExC_parse == ':') {
2312 RExC_parse++;
ca9dfc88
IZ
2313 paren = ':';
2314 break;
2315 }
c277df42 2316 unknown:
830247a4
IZ
2317 if (*RExC_parse != ')') {
2318 RExC_parse++;
2319 vFAIL3("Sequence (%.*s...) not recognized", RExC_parse-seqstart, seqstart);
b45f050a 2320 }
830247a4 2321 nextchar(pRExC_state);
a0d0e21e
LW
2322 *flagp = TRYAGAIN;
2323 return NULL;
2324 }
2325 }
fac92740 2326 else { /* (...) */
830247a4
IZ
2327 parno = RExC_npar;
2328 RExC_npar++;
2329 ret = reganode(pRExC_state, OPEN, parno);
fac92740
MJD
2330 Set_Node_Length(ret, 1); /* MJD */
2331 Set_Node_Offset(ret, RExC_parse); /* MJD */
c277df42 2332 open = 1;
a0d0e21e 2333 }
a0ed51b3 2334 }
fac92740 2335 else /* ! paren */
a0d0e21e
LW
2336 ret = NULL;
2337
2338 /* Pick up the branches, linking them together. */
fac92740 2339 parse_start = RExC_parse; /* MJD */
830247a4 2340 br = regbranch(pRExC_state, &flags, 1);
fac92740
MJD
2341 /* branch_len = (paren != 0); */
2342
a0d0e21e
LW
2343 if (br == NULL)
2344 return(NULL);
830247a4
IZ
2345 if (*RExC_parse == '|') {
2346 if (!SIZE_ONLY && RExC_extralen) {
2347 reginsert(pRExC_state, BRANCHJ, br);
a0ed51b3 2348 }
fac92740 2349 else { /* MJD */
830247a4 2350 reginsert(pRExC_state, BRANCH, br);
fac92740
MJD
2351 Set_Node_Length(br, paren != 0);
2352 Set_Node_Offset_To_R(br-RExC_emit_start, parse_start-RExC_start);
2353 }
c277df42
IZ
2354 have_branch = 1;
2355 if (SIZE_ONLY)
830247a4 2356 RExC_extralen += 1; /* For BRANCHJ-BRANCH. */
a0ed51b3
LW
2357 }
2358 else if (paren == ':') {
c277df42
IZ
2359 *flagp |= flags&SIMPLE;
2360 }
2361 if (open) { /* Starts with OPEN. */
830247a4 2362 regtail(pRExC_state, ret, br); /* OPEN -> first. */
a0ed51b3
LW
2363 }
2364 else if (paren != '?') /* Not Conditional */
a0d0e21e 2365 ret = br;
821b33a5
IZ
2366 if (flags&HASWIDTH)
2367 *flagp |= HASWIDTH;
a0d0e21e 2368 *flagp |= flags&SPSTART;
c277df42 2369 lastbr = br;
830247a4
IZ
2370 while (*RExC_parse == '|') {
2371 if (!SIZE_ONLY && RExC_extralen) {
2372 ender = reganode(pRExC_state, LONGJMP,0);
2373 regtail(pRExC_state, NEXTOPER(NEXTOPER(lastbr)), ender); /* Append to the previous. */
c277df42
IZ
2374 }
2375 if (SIZE_ONLY)
830247a4
IZ
2376 RExC_extralen += 2; /* Account for LONGJMP. */
2377 nextchar(pRExC_state);
2378 br = regbranch(pRExC_state, &flags, 0);
fac92740 2379
a687059c 2380 if (br == NULL)
a0d0e21e 2381 return(NULL);
830247a4 2382 regtail(pRExC_state, lastbr, br); /* BRANCH -> BRANCH. */
c277df42 2383 lastbr = br;
821b33a5
IZ
2384 if (flags&HASWIDTH)
2385 *flagp |= HASWIDTH;
a687059c 2386 *flagp |= flags&SPSTART;
a0d0e21e
LW
2387 }
2388
c277df42
IZ
2389 if (have_branch || paren != ':') {
2390 /* Make a closing node, and hook it on the end. */
2391 switch (paren) {
2392 case ':':
830247a4 2393 ender = reg_node(pRExC_state, TAIL);
c277df42
IZ
2394 break;
2395 case 1:
830247a4 2396 ender = reganode(pRExC_state, CLOSE, parno);
fac92740
MJD
2397 Set_Node_Offset(ender,RExC_parse+1); /* MJD */
2398 Set_Node_Length(ender,1); /* MJD */
c277df42
IZ
2399 break;
2400 case '<':
c277df42
IZ
2401 case ',':
2402 case '=':
2403 case '!':
c277df42 2404 *flagp &= ~HASWIDTH;
821b33a5
IZ
2405 /* FALL THROUGH */
2406 case '>':
830247a4 2407 ender = reg_node(pRExC_state, SUCCEED);
c277df42
IZ
2408 break;
2409 case 0:
830247a4 2410 ender = reg_node(pRExC_state, END);
c277df42
IZ
2411 break;
2412 }
830247a4 2413 regtail(pRExC_state, lastbr, ender);
a0d0e21e 2414
c277df42
IZ
2415 if (have_branch) {
2416 /* Hook the tails of the branches to the closing node. */
2417 for (br = ret; br != NULL; br = regnext(br)) {
830247a4 2418 regoptail(pRExC_state, br, ender);
c277df42
IZ
2419 }
2420 }
a0d0e21e 2421 }
c277df42
IZ
2422
2423 {
2424 char *p;
2425 static char parens[] = "=!<,>";
2426
2427 if (paren && (p = strchr(parens, paren))) {
2428 int node = ((p - parens) % 2) ? UNLESSM : IFMATCH;
2429 int flag = (p - parens) > 1;
2430
2431 if (paren == '>')
2432 node = SUSPEND, flag = 0;
830247a4 2433 reginsert(pRExC_state, node,ret);
c277df42 2434 ret->flags = flag;
830247a4 2435 regtail(pRExC_state, ret, reg_node(pRExC_state, TAIL));
c277df42 2436 }
a0d0e21e
LW
2437 }
2438
2439 /* Check for proper termination. */
ce3e6498 2440 if (paren) {
830247a4
IZ
2441 RExC_flags16 = oregflags;
2442 if (RExC_parse >= RExC_end || *nextchar(pRExC_state) != ')') {
2443 RExC_parse = oregcomp_parse;
380a0633 2444 vFAIL("Unmatched (");
ce3e6498 2445 }
a0ed51b3 2446 }
830247a4
IZ
2447 else if (!paren && RExC_parse < RExC_end) {
2448 if (*RExC_parse == ')') {
2449 RExC_parse++;
380a0633 2450 vFAIL("Unmatched )");
a0ed51b3
LW
2451 }
2452 else
b45f050a 2453 FAIL("Junk on end of regexp"); /* "Can't happen". */
a0d0e21e
LW
2454 /* NOTREACHED */
2455 }
a687059c 2456
a0d0e21e 2457 return(ret);
a687059c
LW
2458}
2459
2460/*
2461 - regbranch - one alternative of an | operator
2462 *
2463 * Implements the concatenation operator.
2464 */
76e3520e 2465STATIC regnode *
830247a4 2466S_regbranch(pTHX_ RExC_state_t *pRExC_state, I32 *flagp, I32 first)
a687059c 2467{
c277df42
IZ
2468 register regnode *ret;
2469 register regnode *chain = NULL;
2470 register regnode *latest;
2471 I32 flags = 0, c = 0;
a0d0e21e 2472
b81d288d 2473 if (first)
c277df42
IZ
2474 ret = NULL;
2475 else {
b81d288d 2476 if (!SIZE_ONLY && RExC_extralen)
830247a4 2477 ret = reganode(pRExC_state, BRANCHJ,0);
fac92740 2478 else {
830247a4 2479 ret = reg_node(pRExC_state, BRANCH);
fac92740
MJD
2480 Set_Node_Length(ret, 1);
2481 }
c277df42
IZ
2482 }
2483
b81d288d 2484 if (!first && SIZE_ONLY)
830247a4 2485 RExC_extralen += 1; /* BRANCHJ */
b81d288d 2486
c277df42 2487 *flagp = WORST; /* Tentatively. */
a0d0e21e 2488
830247a4
IZ
2489 RExC_parse--;
2490 nextchar(pRExC_state);
2491 while (RExC_parse < RExC_end && *RExC_parse != '|' && *RExC_parse != ')') {
a0d0e21e 2492 flags &= ~TRYAGAIN;
830247a4 2493 latest = regpiece(pRExC_state, &flags);
a0d0e21e
LW
2494 if (latest == NULL) {
2495 if (flags & TRYAGAIN)
2496 continue;
2497 return(NULL);
a0ed51b3
LW
2498 }
2499 else if (ret == NULL)
c277df42 2500 ret = latest;
a0d0e21e 2501 *flagp |= flags&HASWIDTH;
c277df42 2502 if (chain == NULL) /* First piece. */
a0d0e21e
LW
2503 *flagp |= flags&SPSTART;
2504 else {
830247a4
IZ
2505 RExC_naughty++;
2506 regtail(pRExC_state, chain, latest);
a687059c 2507 }
a0d0e21e 2508 chain = latest;
c277df42
IZ
2509 c++;
2510 }
2511 if (chain == NULL) { /* Loop ran zero times. */
830247a4 2512 chain = reg_node(pRExC_state, NOTHING);
c277df42
IZ
2513 if (ret == NULL)
2514 ret = chain;
2515 }
2516 if (c == 1) {
2517 *flagp |= flags&SIMPLE;
a0d0e21e 2518 }
a687059c 2519
a0d0e21e 2520 return(ret);
a687059c
LW
2521}
2522
2523/*
2524 - regpiece - something followed by possible [*+?]
2525 *
2526 * Note that the branching code sequences used for ? and the general cases
2527 * of * and + are somewhat optimized: they use the same NOTHING node as
2528 * both the endmarker for their branch list and the body of the last branch.
2529 * It might seem that this node could be dispensed with entirely, but the
2530 * endmarker role is not redundant.
2531 */
76e3520e 2532STATIC regnode *
830247a4 2533S_regpiece(pTHX_ RExC_state_t *pRExC_state, I32 *flagp)
a687059c 2534{
c277df42 2535 register regnode *ret;
a0d0e21e
LW
2536 register char op;
2537 register char *next;
2538 I32 flags;
830247a4 2539 char *origparse = RExC_parse;
a0d0e21e
LW
2540 char *maxpos;
2541 I32 min;
c277df42 2542 I32 max = REG_INFTY;
fac92740 2543 char *parse_start;
a0d0e21e 2544
830247a4 2545 ret = regatom(pRExC_state, &flags);
a0d0e21e
LW
2546 if (ret == NULL) {
2547 if (flags & TRYAGAIN)
2548 *flagp |= TRYAGAIN;
2549 return(NULL);
2550 }
2551
830247a4 2552 op = *RExC_parse;
a0d0e21e 2553
830247a4 2554 if (op == '{' && regcurly(RExC_parse)) {
fac92740 2555 parse_start = RExC_parse; /* MJD */
830247a4 2556 next = RExC_parse + 1;
a0d0e21e
LW
2557 maxpos = Nullch;
2558 while (isDIGIT(*next) || *next == ',') {
2559 if (*next == ',') {
2560 if (maxpos)
2561 break;
2562 else
2563 maxpos = next;
a687059c 2564 }
a0d0e21e
LW
2565 next++;
2566 }
2567 if (*next == '}') { /* got one */
2568 if (!maxpos)
2569 maxpos = next;
830247a4
IZ
2570 RExC_parse++;
2571 min = atoi(RExC_parse);
a0d0e21e
LW
2572 if (*maxpos == ',')
2573 maxpos++;
2574 else
830247a4 2575 maxpos = RExC_parse;
a0d0e21e
LW
2576 max = atoi(maxpos);
2577 if (!max && *maxpos != '0')
c277df42
IZ
2578 max = REG_INFTY; /* meaning "infinity" */
2579 else if (max >= REG_INFTY)
8615cb43 2580 vFAIL2("Quantifier in {,} bigger than %d", REG_INFTY - 1);
830247a4
IZ
2581 RExC_parse = next;
2582 nextchar(pRExC_state);
a0d0e21e
LW
2583
2584 do_curly:
2585 if ((flags&SIMPLE)) {
830247a4
IZ
2586 RExC_naughty += 2 + RExC_naughty / 2;
2587 reginsert(pRExC_state, CURLY, ret);
fac92740
MJD
2588 Set_Node_Offset(ret, parse_start+1); /* MJD */
2589 Set_Node_Cur_Length(ret);
a0d0e21e
LW
2590 }
2591 else {
830247a4 2592 regnode *w = reg_node(pRExC_state, WHILEM);
2c2d71f5
JH
2593
2594 w->flags = 0;
830247a4
IZ
2595 regtail(pRExC_state, ret, w);
2596 if (!SIZE_ONLY && RExC_extralen) {
2597 reginsert(pRExC_state, LONGJMP,ret);
2598 reginsert(pRExC_state, NOTHING,ret);
c277df42
IZ
2599 NEXT_OFF(ret) = 3; /* Go over LONGJMP. */
2600 }
830247a4 2601 reginsert(pRExC_state, CURLYX,ret);
fac92740
MJD
2602 /* MJD hk */
2603 Set_Node_Offset(ret, parse_start+1);
2604 Set_Node_Length(ret,
2605 op == '{' ? (RExC_parse - parse_start) : 1);
2606
830247a4 2607 if (!SIZE_ONLY && RExC_extralen)
c277df42 2608 NEXT_OFF(ret) = 3; /* Go over NOTHING to LONGJMP. */
830247a4 2609 regtail(pRExC_state, ret, reg_node(pRExC_state, NOTHING));
c277df42 2610 if (SIZE_ONLY)
830247a4
IZ
2611 RExC_whilem_seen++, RExC_extralen += 3;
2612 RExC_naughty += 4 + RExC_naughty; /* compound interest */
a0d0e21e 2613 }
c277df42 2614 ret->flags = 0;
a0d0e21e
LW
2615
2616 if (min > 0)
821b33a5
IZ
2617 *flagp = WORST;
2618 if (max > 0)
2619 *flagp |= HASWIDTH;
a0d0e21e 2620 if (max && max < min)
8615cb43 2621 vFAIL("Can't do {n,m} with n > m");
c277df42
IZ
2622 if (!SIZE_ONLY) {
2623 ARG1_SET(ret, min);
2624 ARG2_SET(ret, max);
a687059c 2625 }
a687059c 2626
a0d0e21e 2627 goto nest_check;
a687059c 2628 }
a0d0e21e 2629 }
a687059c 2630
a0d0e21e
LW
2631 if (!ISMULT1(op)) {
2632 *flagp = flags;
a687059c 2633 return(ret);
a0d0e21e 2634 }
bb20fd44 2635
c277df42 2636#if 0 /* Now runtime fix should be reliable. */
b45f050a
JF
2637
2638 /* if this is reinstated, don't forget to put this back into perldiag:
2639
2640 =item Regexp *+ operand could be empty at {#} in regex m/%s/
2641
2642 (F) The part of the regexp subject to either the * or + quantifier
2643 could match an empty string. The {#} shows in the regular
2644 expression about where the problem was discovered.
2645
2646 */
2647
bb20fd44 2648 if (!(flags&HASWIDTH) && op != '?')
b45f050a 2649 vFAIL("Regexp *+ operand could be empty");
b81d288d 2650#endif
bb20fd44 2651
fac92740 2652 parse_start = RExC_parse;
830247a4 2653 nextchar(pRExC_state);
a0d0e21e 2654
821b33a5 2655 *flagp = (op != '+') ? (WORST|SPSTART|HASWIDTH) : (WORST|HASWIDTH);
a0d0e21e
LW
2656
2657 if (op == '*' && (flags&SIMPLE)) {
830247a4 2658 reginsert(pRExC_state, STAR, ret);
c277df42 2659 ret->flags = 0;
830247a4 2660 RExC_naughty += 4;
a0d0e21e
LW
2661 }
2662 else if (op == '*') {
2663 min = 0;
2664 goto do_curly;
a0ed51b3
LW
2665 }
2666 else if (op == '+' && (flags&SIMPLE)) {
830247a4 2667 reginsert(pRExC_state, PLUS, ret);
c277df42 2668 ret->flags = 0;
830247a4 2669 RExC_naughty += 3;
a0d0e21e
LW
2670 }
2671 else if (op == '+') {
2672 min = 1;
2673 goto do_curly;
a0ed51b3
LW
2674 }
2675 else if (op == '?') {
a0d0e21e
LW
2676 min = 0; max = 1;
2677 goto do_curly;
2678 }
2679 nest_check:
e476b1b5 2680 if (ckWARN(WARN_REGEXP) && !SIZE_ONLY && !(flags&HASWIDTH) && max > REG_INFTY/3) {
830247a4 2681 vWARN3(RExC_parse,
b45f050a 2682 "%.*s matches null string many times",
830247a4 2683 RExC_parse - origparse,
b45f050a 2684 origparse);
a0d0e21e
LW
2685 }
2686
830247a4
IZ
2687 if (*RExC_parse == '?') {
2688 nextchar(pRExC_state);
2689 reginsert(pRExC_state, MINMOD, ret);
2690 regtail(pRExC_state, ret, ret + NODE_STEP_REGNODE);
a0d0e21e 2691 }
830247a4
IZ
2692 if (ISMULT2(RExC_parse)) {
2693 RExC_parse++;
b45f050a
JF
2694 vFAIL("Nested quantifiers");
2695 }
a0d0e21e
LW
2696
2697 return(ret);
a687059c
LW
2698}
2699
2700/*
2701 - regatom - the lowest level
2702 *
2703 * Optimization: gobbles an entire sequence of ordinary characters so that
2704 * it can turn them into a single node, which is smaller to store and
2705 * faster to run. Backslashed characters are exceptions, each becoming a
2706 * separate node; the code is simpler that way and it's not worth fixing.
2707 *
b45f050a 2708 * [Yes, it is worth fixing, some scripts can run twice the speed.] */
76e3520e 2709STATIC regnode *
830247a4 2710S_regatom(pTHX_ RExC_state_t *pRExC_state, I32 *flagp)
a687059c 2711{
c277df42 2712 register regnode *ret = 0;
a0d0e21e 2713 I32 flags;
f06dbbb7 2714 char *parse_start = 0;
a0d0e21e
LW
2715
2716 *flagp = WORST; /* Tentatively. */
2717
2718tryagain:
830247a4 2719 switch (*RExC_parse) {
a0d0e21e 2720 case '^':
830247a4
IZ
2721 RExC_seen_zerolen++;
2722 nextchar(pRExC_state);
2723 if (RExC_flags16 & PMf_MULTILINE)
2724 ret = reg_node(pRExC_state, MBOL);
2725 else if (RExC_flags16 & PMf_SINGLELINE)
2726 ret = reg_node(pRExC_state, SBOL);
a0d0e21e 2727 else
830247a4 2728 ret = reg_node(pRExC_state, BOL);
fac92740 2729 Set_Node_Length(ret, 1); /* MJD */
a0d0e21e
LW
2730 break;
2731 case '$':
830247a4 2732 nextchar(pRExC_state);
b81d288d 2733 if (*RExC_parse)
830247a4
IZ
2734 RExC_seen_zerolen++;
2735 if (RExC_flags16 & PMf_MULTILINE)
2736 ret = reg_node(pRExC_state, MEOL);
2737 else if (RExC_flags16 & PMf_SINGLELINE)
2738 ret = reg_node(pRExC_state, SEOL);
a0d0e21e 2739 else
830247a4 2740 ret = reg_node(pRExC_state, EOL);
fac92740 2741 Set_Node_Length(ret, 1); /* MJD */
a0d0e21e
LW
2742 break;
2743 case '.':
830247a4 2744 nextchar(pRExC_state);
cce850e4 2745 if (RExC_flags16 & PMf_SINGLELINE)
ffc61ed2
JH
2746 ret = reg_node(pRExC_state, SANY);
2747 else
2748 ret = reg_node(pRExC_state, REG_ANY);
2749 *flagp |= HASWIDTH|SIMPLE;
830247a4 2750 RExC_naughty++;
fac92740 2751 Set_Node_Length(ret, 1); /* MJD */
a0d0e21e
LW
2752 break;
2753 case '[':
b45f050a 2754 {
830247a4 2755 char *oregcomp_parse = ++RExC_parse;
ffc61ed2 2756 ret = regclass(pRExC_state);
830247a4
IZ
2757 if (*RExC_parse != ']') {
2758 RExC_parse = oregcomp_parse;
b45f050a
JF
2759 vFAIL("Unmatched [");
2760 }
830247a4 2761 nextchar(pRExC_state);
a0d0e21e 2762 *flagp |= HASWIDTH|SIMPLE;
fac92740 2763 Set_Node_Length(ret, RExC_parse - oregcomp_parse + 1); /* MJD */
a0d0e21e 2764 break;
b45f050a 2765 }
a0d0e21e 2766 case '(':
830247a4
IZ
2767 nextchar(pRExC_state);
2768 ret = reg(pRExC_state, 1, &flags);
a0d0e21e 2769 if (ret == NULL) {
bf93d4cc 2770 if (flags & TRYAGAIN) {
830247a4 2771 if (RExC_parse == RExC_end) {
bf93d4cc
GS
2772 /* Make parent create an empty node if needed. */
2773 *flagp |= TRYAGAIN;
2774 return(NULL);
2775 }
a0d0e21e 2776 goto tryagain;
bf93d4cc 2777 }
a0d0e21e
LW
2778 return(NULL);
2779 }
c277df42 2780 *flagp |= flags&(HASWIDTH|SPSTART|SIMPLE);
a0d0e21e
LW
2781 break;
2782 case '|':
2783 case ')':
2784 if (flags & TRYAGAIN) {
2785 *flagp |= TRYAGAIN;
2786 return NULL;
2787 }
b45f050a 2788 vFAIL("Internal urp");
a0d0e21e
LW
2789 /* Supposed to be caught earlier. */
2790 break;
85afd4ae 2791 case '{':
830247a4
IZ
2792 if (!regcurly(RExC_parse)) {
2793 RExC_parse++;
85afd4ae
CS
2794 goto defchar;
2795 }
2796 /* FALL THROUGH */
a0d0e21e
LW
2797 case '?':
2798 case '+':
2799 case '*':
830247a4 2800 RExC_parse++;
b45f050a 2801 vFAIL("Quantifier follows nothing");
a0d0e21e
LW
2802 break;
2803 case '\\':
830247a4 2804 switch (*++RExC_parse) {
a0d0e21e 2805 case 'A':
830247a4
IZ
2806 RExC_seen_zerolen++;
2807 ret = reg_node(pRExC_state, SBOL);
a0d0e21e 2808 *flagp |= SIMPLE;
830247a4 2809 nextchar(pRExC_state);
fac92740 2810 Set_Node_Length(ret, 2); /* MJD */
a0d0e21e
LW
2811 break;
2812 case 'G':
830247a4
IZ
2813 ret = reg_node(pRExC_state, GPOS);
2814 RExC_seen |= REG_SEEN_GPOS;
a0d0e21e 2815 *flagp |= SIMPLE;
830247a4 2816 nextchar(pRExC_state);
fac92740 2817 Set_Node_Length(ret, 2); /* MJD */
a0d0e21e
LW
2818 break;
2819 case 'Z':
830247a4 2820 ret = reg_node(pRExC_state, SEOL);
a0d0e21e 2821 *flagp |= SIMPLE;
a1917ab9 2822 RExC_seen_zerolen++; /* Do not optimize RE away */
830247a4 2823 nextchar(pRExC_state);
a0d0e21e 2824 break;
b85d18e9 2825 case 'z':
830247a4 2826 ret = reg_node(pRExC_state, EOS);
b85d18e9 2827 *flagp |= SIMPLE;
830247a4
IZ
2828 RExC_seen_zerolen++; /* Do not optimize RE away */
2829 nextchar(pRExC_state);
fac92740 2830 Set_Node_Length(ret, 2); /* MJD */
b85d18e9 2831 break;
4a2d328f 2832 case 'C':
f33976b4
DB
2833 ret = reg_node(pRExC_state, CANY);
2834 RExC_seen |= REG_SEEN_CANY;
a0ed51b3 2835 *flagp |= HASWIDTH|SIMPLE;
830247a4 2836 nextchar(pRExC_state);
fac92740 2837 Set_Node_Length(ret, 2); /* MJD */
a0ed51b3
LW
2838 break;
2839 case 'X':
830247a4 2840 ret = reg_node(pRExC_state, CLUMP);
a0ed51b3 2841 *flagp |= HASWIDTH;
830247a4 2842 nextchar(pRExC_state);
fac92740 2843 Set_Node_Length(ret, 2); /* MJD */
a0ed51b3 2844 break;
a0d0e21e 2845 case 'w':
ffc61ed2 2846 ret = reg_node(pRExC_state, LOC ? ALNUML : ALNUM);
a0d0e21e 2847 *flagp |= HASWIDTH|SIMPLE;
830247a4 2848 nextchar(pRExC_state);
fac92740 2849 Set_Node_Length(ret, 2); /* MJD */
a0d0e21e
LW
2850 break;
2851 case 'W':
ffc61ed2 2852 ret = reg_node(pRExC_state, LOC ? NALNUML : NALNUM);
a0d0e21e 2853 *flagp |= HASWIDTH|SIMPLE;
830247a4 2854 nextchar(pRExC_state);
fac92740 2855 Set_Node_Length(ret, 2); /* MJD */
a0d0e21e
LW
2856 break;
2857 case 'b':
830247a4
IZ
2858 RExC_seen_zerolen++;
2859 RExC_seen |= REG_SEEN_LOOKBEHIND;
ffc61ed2 2860 ret = reg_node(pRExC_state, LOC ? BOUNDL : BOUND);
a0d0e21e 2861 *flagp |= SIMPLE;
830247a4 2862 nextchar(pRExC_state);
fac92740 2863 Set_Node_Length(ret, 2); /* MJD */
a0d0e21e
LW
2864 break;
2865 case 'B':
830247a4
IZ
2866 RExC_seen_zerolen++;
2867 RExC_seen |= REG_SEEN_LOOKBEHIND;
ffc61ed2 2868 ret = reg_node(pRExC_state, LOC ? NBOUNDL : NBOUND);
a0d0e21e 2869 *flagp |= SIMPLE;
830247a4 2870 nextchar(pRExC_state);
fac92740 2871 Set_Node_Length(ret, 2); /* MJD */
a0d0e21e
LW
2872 break;
2873 case 's':
ffc61ed2 2874 ret = reg_node(pRExC_state, LOC ? SPACEL : SPACE);
a0d0e21e 2875 *flagp |= HASWIDTH|SIMPLE;
830247a4 2876 nextchar(pRExC_state);
fac92740 2877 Set_Node_Length(ret, 2); /* MJD */
a0d0e21e
LW
2878 break;
2879 case 'S':
ffc61ed2 2880 ret = reg_node(pRExC_state, LOC ? NSPACEL : NSPACE);
a0d0e21e 2881 *flagp |= HASWIDTH|SIMPLE;
830247a4 2882 nextchar(pRExC_state);
fac92740 2883 Set_Node_Length(ret, 2); /* MJD */
a0d0e21e
LW
2884 break;
2885 case 'd':
ffc61ed2 2886 ret = reg_node(pRExC_state, DIGIT);
a0d0e21e 2887 *flagp |= HASWIDTH|SIMPLE;
830247a4 2888 nextchar(pRExC_state);
fac92740 2889 Set_Node_Length(ret, 2); /* MJD */
a0d0e21e
LW
2890 break;
2891 case 'D':
ffc61ed2 2892 ret = reg_node(pRExC_state, NDIGIT);
a0d0e21e 2893 *flagp |= HASWIDTH|SIMPLE;
830247a4 2894 nextchar(pRExC_state);
fac92740 2895 Set_Node_Length(ret, 2); /* MJD */
a0d0e21e 2896 break;
a14b48bc
LW
2897 case 'p':
2898 case 'P':
3568d838 2899 {
830247a4 2900 char* oldregxend = RExC_end;
fac92740 2901 char* parse_start = RExC_parse;
a14b48bc 2902
830247a4 2903 if (RExC_parse[1] == '{') {
3568d838 2904 /* a lovely hack--pretend we saw [\pX] instead */
830247a4
IZ
2905 RExC_end = strchr(RExC_parse, '}');
2906 if (!RExC_end) {
0da60cf5 2907 U8 c = (U8)*RExC_parse;
830247a4
IZ
2908 RExC_parse += 2;
2909 RExC_end = oldregxend;
0da60cf5 2910 vFAIL2("Missing right brace on \\%c{}", c);
b45f050a 2911 }
830247a4 2912 RExC_end++;
a14b48bc
LW
2913 }
2914 else
830247a4
IZ
2915 RExC_end = RExC_parse + 2;
2916 RExC_parse--;
a14b48bc 2917
ffc61ed2 2918 ret = regclass(pRExC_state);
a14b48bc 2919
830247a4
IZ
2920 RExC_end = oldregxend;
2921 RExC_parse--;
fac92740 2922 Set_Node_Cur_Length(ret); /* MJD */
830247a4 2923 nextchar(pRExC_state);
a14b48bc
LW
2924 *flagp |= HASWIDTH|SIMPLE;
2925 }
2926 break;
a0d0e21e
LW
2927 case 'n':
2928 case 'r':
2929 case 't':
2930 case 'f':
2931 case 'e':
2932 case 'a':
2933 case 'x':
2934 case 'c':
2935 case '0':
2936 goto defchar;
2937 case '1': case '2': case '3': case '4':
2938 case '5': case '6': case '7': case '8': case '9':
2939 {
830247a4 2940 I32 num = atoi(RExC_parse);
a0d0e21e 2941
830247a4 2942 if (num > 9 && num >= RExC_npar)
a0d0e21e
LW
2943 goto defchar;
2944 else {
fac92740 2945 char * parse_start = RExC_parse - 1; /* MJD */
830247a4
IZ
2946 while (isDIGIT(*RExC_parse))
2947 RExC_parse++;
b45f050a 2948
830247a4 2949 if (!SIZE_ONLY && num > RExC_rx->nparens)
9baa0206 2950 vFAIL("Reference to nonexistent group");
830247a4
IZ
2951 RExC_sawback = 1;
2952 ret = reganode(pRExC_state, FOLD
a0ed51b3 2953 ? (LOC ? REFFL : REFF)
c8756f30 2954 : REF, num);
a0d0e21e 2955 *flagp |= HASWIDTH;
fac92740
MJD
2956
2957 /* override incorrect value set in reganode MJD */
2958 Set_Node_Offset(ret, parse_start+1);
2959 Set_Node_Cur_Length(ret); /* MJD */
830247a4
IZ
2960 RExC_parse--;
2961 nextchar(pRExC_state);
a0d0e21e
LW
2962 }
2963 }
2964 break;
2965 case '\0':
830247a4 2966 if (RExC_parse >= RExC_end)
b45f050a 2967 FAIL("Trailing \\");
a0d0e21e
LW
2968 /* FALL THROUGH */
2969 default:
c9f97d15
IZ
2970 /* Do not generate `unrecognized' warnings here, we fall
2971 back into the quick-grab loop below */
a0d0e21e
LW
2972 goto defchar;
2973 }
2974 break;
4633a7c4
LW
2975
2976 case '#':
830247a4
IZ
2977 if (RExC_flags16 & PMf_EXTENDED) {
2978 while (RExC_parse < RExC_end && *RExC_parse != '\n') RExC_parse++;
2979 if (RExC_parse < RExC_end)
4633a7c4
LW
2980 goto tryagain;
2981 }
2982 /* FALL THROUGH */
2983
a0d0e21e 2984 default: {
ba210ebe 2985 register STRLEN len;
a0ed51b3 2986 register UV ender;
a0d0e21e 2987 register char *p;
c277df42 2988 char *oldp, *s;
ba210ebe 2989 STRLEN numlen;
a2a2844f
JH
2990 STRLEN ulen;
2991 U8 tmpbuf[UTF8_MAXLEN*2+1];
f06dbbb7
JH
2992
2993 parse_start = RExC_parse - 1;
a0d0e21e 2994
830247a4 2995 RExC_parse++;
a0d0e21e
LW
2996
2997 defchar:
830247a4 2998 ret = reg_node(pRExC_state, FOLD
a0ed51b3 2999 ? (LOC ? EXACTFL : EXACTF)
bbce6d69 3000 : EXACT);
cd439c50 3001 s = STRING(ret);
830247a4
IZ
3002 for (len = 0, p = RExC_parse - 1;
3003 len < 127 && p < RExC_end;
a0d0e21e
LW
3004 len++)
3005 {
3006 oldp = p;
5b5a24f7 3007
830247a4
IZ
3008 if (RExC_flags16 & PMf_EXTENDED)
3009 p = regwhite(p, RExC_end);
a0d0e21e
LW
3010 switch (*p) {
3011 case '^':
3012 case '$':
3013 case '.':
3014 case '[':
3015 case '(':
3016 case ')':
3017 case '|':
3018 goto loopdone;
3019 case '\\':
3020 switch (*++p) {
3021 case 'A':
3022 case 'G':
3023 case 'Z':
b85d18e9 3024 case 'z':
a0d0e21e
LW
3025 case 'w':
3026 case 'W':
3027 case 'b':
3028 case 'B':
3029 case 's':
3030 case 'S':
3031 case 'd':
3032 case 'D':
a14b48bc
LW
3033 case 'p':
3034 case 'P':
a0d0e21e
LW
3035 --p;
3036 goto loopdone;
3037 case 'n':
3038 ender = '\n';
3039 p++;
a687059c 3040 break;
a0d0e21e
LW
3041 case 'r':
3042 ender = '\r';
3043 p++;
a687059c 3044 break;
a0d0e21e
LW
3045 case 't':
3046 ender = '\t';
3047 p++;
a687059c 3048 break;
a0d0e21e
LW
3049 case 'f':
3050 ender = '\f';
3051 p++;
a687059c 3052 break;
a0d0e21e 3053 case 'e':
c7f1f016 3054 ender = ASCII_TO_NATIVE('\033');
a0d0e21e 3055 p++;
a687059c 3056 break;
a0d0e21e 3057 case 'a':
c7f1f016 3058 ender = ASCII_TO_NATIVE('\007');
a0d0e21e 3059 p++;
a687059c 3060 break;
a0d0e21e 3061 case 'x':
a0ed51b3
LW
3062 if (*++p == '{') {
3063 char* e = strchr(p, '}');
b81d288d 3064
b45f050a 3065 if (!e) {
830247a4 3066 RExC_parse = p + 1;
b45f050a
JF
3067 vFAIL("Missing right brace on \\x{}");
3068 }
de5f0749 3069 else {
a4c04bdc
NC
3070 I32 flags = PERL_SCAN_ALLOW_UNDERSCORES
3071 | PERL_SCAN_DISALLOW_PREFIX;
53305cf1
NC
3072 numlen = e - p - 1;
3073 ender = grok_hex(p + 1, &numlen, &flags, NULL);
aaa80028
JH
3074 if (ender > 0xff)
3075 RExC_utf8 = 1;
b21ed0a9
GS
3076 /* numlen is generous */
3077 if (numlen + len >= 127) {
a0ed51b3
LW
3078 p--;
3079 goto loopdone;
3080 }
3081 p = e + 1;
3082 }
a0ed51b3
LW
3083 }
3084 else {
a4c04bdc 3085 I32 flags = PERL_SCAN_DISALLOW_PREFIX;
53305cf1
NC
3086 numlen = 2;
3087 ender = grok_hex(p, &numlen, &flags, NULL);
a0ed51b3
LW
3088 p += numlen;
3089 }
a687059c 3090 break;
a0d0e21e
LW
3091 case 'c':
3092 p++;
bbce6d69 3093 ender = UCHARAT(p++);
3094 ender = toCTRL(ender);
a687059c 3095 break;
a0d0e21e
LW
3096 case '0': case '1': case '2': case '3':case '4':
3097 case '5': case '6': case '7': case '8':case '9':
3098 if (*p == '0' ||
830247a4 3099 (isDIGIT(p[1]) && atoi(p) >= RExC_npar) ) {
53305cf1
NC
3100 I32 flags = 0;
3101 numlen = 3;
3102 ender = grok_oct(p, &numlen, &flags, NULL);
a0d0e21e
LW
3103 p += numlen;
3104 }
3105 else {
3106 --p;
3107 goto loopdone;
a687059c
LW
3108 }
3109 break;
a0d0e21e 3110 case '\0':
830247a4 3111 if (p >= RExC_end)
b45f050a 3112 FAIL("Trailing \\");
a687059c 3113 /* FALL THROUGH */
a0d0e21e 3114 default:
e476b1b5 3115 if (!SIZE_ONLY && ckWARN(WARN_REGEXP) && isALPHA(*p))
4193bef7 3116 vWARN2(p + 1, "Unrecognized escape \\%c passed through", UCHARAT(p));
a0ed51b3 3117 goto normal_default;
a0d0e21e
LW
3118 }
3119 break;
a687059c 3120 default:
a0ed51b3 3121 normal_default:
fd400ab9 3122 if (UTF8_IS_START(*p) && UTF) {
5e12f4fb 3123 ender = utf8n_to_uvchr((U8*)p, RExC_end - p,
ba210ebe 3124 &numlen, 0);
a0ed51b3
LW
3125 p += numlen;
3126 }
3127 else
3128 ender = *p++;
a0d0e21e 3129 break;
a687059c 3130 }
830247a4
IZ
3131 if (RExC_flags16 & PMf_EXTENDED)
3132 p = regwhite(p, RExC_end);
a0ed51b3 3133 if (UTF && FOLD) {
a2a2844f
JH
3134 toLOWER_uni(ender, tmpbuf, &ulen);
3135 ender = utf8_to_uvchr(tmpbuf, 0);
a0ed51b3 3136 }
a0d0e21e
LW
3137 if (ISMULT2(p)) { /* Back off on ?+*. */
3138 if (len)
3139 p = oldp;
2b9d42f0 3140 else if (!UNI_IS_INVARIANT(NATIVE_TO_UNI(ender)) && UTF) {
830247a4 3141 reguni(pRExC_state, ender, s, &numlen);
a0ed51b3
LW
3142 s += numlen;
3143 len += numlen;
3144 }
a0d0e21e
LW
3145 else {
3146 len++;
cd439c50 3147 REGC(ender, s++);
a0d0e21e
LW
3148 }
3149 break;
a687059c 3150 }
2b9d42f0 3151 if (!UNI_IS_INVARIANT(NATIVE_TO_UNI(ender)) && UTF) {
830247a4 3152 reguni(pRExC_state, ender, s, &numlen);
a0ed51b3
LW
3153 s += numlen;
3154 len += numlen - 1;
3155 }
3156 else
cd439c50 3157 REGC(ender, s++);
a0d0e21e
LW
3158 }
3159 loopdone:
830247a4 3160 RExC_parse = p - 1;
fac92740 3161 Set_Node_Cur_Length(ret); /* MJD */
830247a4 3162 nextchar(pRExC_state);
793db0cb
JH
3163 {
3164 /* len is STRLEN which is unsigned, need to copy to signed */
3165 IV iv = len;
3166 if (iv < 0)
3167 vFAIL("Internal disaster");
3168 }
a0d0e21e
LW
3169 if (len > 0)
3170 *flagp |= HASWIDTH;
3171 if (len == 1)
3172 *flagp |= SIMPLE;
c277df42 3173 if (!SIZE_ONLY)
cd439c50
IZ
3174 STR_LEN(ret) = len;
3175 if (SIZE_ONLY)
830247a4 3176 RExC_size += STR_SZ(len);
cd439c50 3177 else
830247a4 3178 RExC_emit += STR_SZ(len);
a687059c 3179 }
a0d0e21e
LW
3180 break;
3181 }
a687059c 3182
a72c7584
JH
3183 if (PL_encoding && PL_regkind[(U8)OP(ret)] == EXACT && !RExC_utf8) {
3184 STRLEN oldlen = STR_LEN(ret);
3185 SV *sv = sv_2mortal(newSVpvn(STRING(ret), oldlen));
3186 char *s = Perl_sv_recode_to_utf8(aTHX_ sv, PL_encoding);
3187 STRLEN newlen = SvCUR(sv);
3188 if (!SIZE_ONLY) {
3189 DEBUG_r(PerlIO_printf(Perl_debug_log, "recode %*s to %*s\n",
fe5a182c 3190 (int)oldlen, STRING(ret), (int)newlen, s));
a72c7584
JH
3191 Copy(s, STRING(ret), newlen, char);
3192 STR_LEN(ret) += newlen - oldlen;
3193 RExC_emit += STR_SZ(newlen) - STR_SZ(oldlen);
3194 } else
3195 RExC_size += STR_SZ(newlen) - STR_SZ(oldlen);
5ff6fc6d 3196 RExC_utf8 = 1;
a72c7584
JH
3197 }
3198
a0d0e21e 3199 return(ret);
a687059c
LW
3200}
3201
873ef191 3202STATIC char *
cea2e8a9 3203S_regwhite(pTHX_ char *p, char *e)
5b5a24f7
CS
3204{
3205 while (p < e) {
3206 if (isSPACE(*p))
3207 ++p;
3208 else if (*p == '#') {
3209 do {
3210 p++;
3211 } while (p < e && *p != '\n');
3212 }
3213 else
3214 break;
3215 }
3216 return p;
3217}
3218
b8c5462f
JH
3219/* Parse POSIX character classes: [[:foo:]], [[=foo=]], [[.foo.]].
3220 Character classes ([:foo:]) can also be negated ([:^foo:]).
3221 Returns a named class id (ANYOF_XXX) if successful, -1 otherwise.
3222 Equivalence classes ([=foo=]) and composites ([.foo.]) are parsed,
beeb77fc 3223 but trigger failures because they are currently unimplemented. */
9a86a77b
JH
3224
3225#define POSIXCC_DONE(c) ((c) == ':')
3226#define POSIXCC_NOTYET(c) ((c) == '=' || (c) == '.')
3227#define POSIXCC(c) (POSIXCC_DONE(c) || POSIXCC_NOTYET(c))
3228
b8c5462f 3229STATIC I32
830247a4 3230S_regpposixcc(pTHX_ RExC_state_t *pRExC_state, I32 value)
620e46c5
JH
3231{
3232 char *posixcc = 0;
936ed897 3233 I32 namedclass = OOB_NAMEDCLASS;
620e46c5 3234
830247a4 3235 if (value == '[' && RExC_parse + 1 < RExC_end &&
620e46c5 3236 /* I smell either [: or [= or [. -- POSIX has been here, right? */
9a86a77b
JH
3237 POSIXCC(UCHARAT(RExC_parse))) {
3238 char c = UCHARAT(RExC_parse);
830247a4 3239 char* s = RExC_parse++;
b81d288d 3240
9a86a77b 3241 while (RExC_parse < RExC_end && UCHARAT(RExC_parse) != c)
830247a4
IZ
3242 RExC_parse++;
3243 if (RExC_parse == RExC_end)
620e46c5 3244 /* Grandfather lone [:, [=, [. */
830247a4 3245 RExC_parse = s;
620e46c5 3246 else {
830247a4 3247 char* t = RExC_parse++; /* skip over the c */
b8c5462f 3248
9a86a77b 3249 if (UCHARAT(RExC_parse) == ']') {
830247a4 3250 RExC_parse++; /* skip over the ending ] */
b8c5462f
JH
3251 posixcc = s + 1;
3252 if (*s == ':') {
3253 I32 complement = *posixcc == '^' ? *posixcc++ : 0;
3254 I32 skip = 5; /* the most common skip */
3255
3256 switch (*posixcc) {
3257 case 'a':
3258 if (strnEQ(posixcc, "alnum", 5))
3259 namedclass =
3260 complement ? ANYOF_NALNUMC : ANYOF_ALNUMC;
3261 else if (strnEQ(posixcc, "alpha", 5))
3262 namedclass =
3263 complement ? ANYOF_NALPHA : ANYOF_ALPHA;
3264 else if (strnEQ(posixcc, "ascii", 5))
3265 namedclass =
3266 complement ? ANYOF_NASCII : ANYOF_ASCII;
3267 break;
aaa51d5e
JF
3268 case 'b':
3269 if (strnEQ(posixcc, "blank", 5))
3270 namedclass =
3271 complement ? ANYOF_NBLANK : ANYOF_BLANK;
3272 break;
b8c5462f
JH
3273 case 'c':
3274 if (strnEQ(posixcc, "cntrl", 5))
3275 namedclass =
3276 complement ? ANYOF_NCNTRL : ANYOF_CNTRL;
3277 break;
3278 case 'd':
3279 if (strnEQ(posixcc, "digit", 5))
3280 namedclass =
3281 complement ? ANYOF_NDIGIT : ANYOF_DIGIT;
3282 break;
3283 case 'g':
3284 if (strnEQ(posixcc, "graph", 5))
3285 namedclass =
3286 complement ? ANYOF_NGRAPH : ANYOF_GRAPH;
3287 break;
3288 case 'l':
3289 if (strnEQ(posixcc, "lower", 5))
3290 namedclass =
3291 complement ? ANYOF_NLOWER : ANYOF_LOWER;
3292 break;
3293 case 'p':
3294 if (strnEQ(posixcc, "print", 5))
3295 namedclass =
3296 complement ? ANYOF_NPRINT : ANYOF_PRINT;
3297 else if (strnEQ(posixcc, "punct", 5))
3298 namedclass =
3299 complement ? ANYOF_NPUNCT : ANYOF_PUNCT;
3300 break;
3301 case 's':
3302 if (strnEQ(posixcc, "space", 5))
3303 namedclass =
aaa51d5e 3304 complement ? ANYOF_NPSXSPC : ANYOF_PSXSPC;
cc4319de 3305 break;
b8c5462f
JH
3306 case 'u':
3307 if (strnEQ(posixcc, "upper", 5))
3308 namedclass =
3309 complement ? ANYOF_NUPPER : ANYOF_UPPER;
3310 break;
3311 case 'w': /* this is not POSIX, this is the Perl \w */
3312 if (strnEQ(posixcc, "word", 4)) {
3313 namedclass =
3314 complement ? ANYOF_NALNUM : ANYOF_ALNUM;
3315 skip = 4;
3316 }
3317 break;
3318 case 'x':
3319 if (strnEQ(posixcc, "xdigit", 6)) {
3320 namedclass =
3321 complement ? ANYOF_NXDIGIT : ANYOF_XDIGIT;
3322 skip = 6;
3323 }
3324 break;
3325 }
ac561586
JH
3326 if (namedclass == OOB_NAMEDCLASS ||
3327 posixcc[skip] != ':' ||
3328 posixcc[skip+1] != ']')
b45f050a
JF
3329 {
3330 Simple_vFAIL3("POSIX class [:%.*s:] unknown",
3331 t - s - 1, s + 1);
3332 }
3333 } else if (!SIZE_ONLY) {
b8c5462f 3334 /* [[=foo=]] and [[.foo.]] are still future. */
b45f050a 3335
830247a4 3336 /* adjust RExC_parse so the warning shows after
b45f050a 3337 the class closes */
9a86a77b 3338 while (UCHARAT(RExC_parse) && UCHARAT(RExC_parse) != ']')
830247a4 3339 RExC_parse++;
b45f050a
JF
3340 Simple_vFAIL3("POSIX syntax [%c %c] is reserved for future extensions", c, c);
3341 }
b8c5462f
JH
3342 } else {
3343 /* Maternal grandfather:
3344 * "[:" ending in ":" but not in ":]" */
830247a4 3345 RExC_parse = s;
767d463e 3346 }
620e46c5
JH
3347 }
3348 }
3349
b8c5462f
JH
3350 return namedclass;
3351}
3352
3353STATIC void
830247a4 3354S_checkposixcc(pTHX_ RExC_state_t *pRExC_state)
b8c5462f 3355{
e476b1b5 3356 if (!SIZE_ONLY && ckWARN(WARN_REGEXP) &&
9a86a77b 3357 POSIXCC(UCHARAT(RExC_parse))) {
830247a4 3358 char *s = RExC_parse;
93733859 3359 char c = *s++;
b8c5462f
JH
3360
3361 while(*s && isALNUM(*s))
3362 s++;
3363 if (*s && c == *s && s[1] == ']') {
b45f050a
JF
3364 vWARN3(s+2, "POSIX syntax [%c %c] belongs inside character classes", c, c);
3365
3366 /* [[=foo=]] and [[.foo.]] are still future. */
9a86a77b 3367 if (POSIXCC_NOTYET(c)) {
830247a4 3368 /* adjust RExC_parse so the error shows after
b45f050a 3369 the class closes */
9a86a77b 3370 while (UCHARAT(RExC_parse) && UCHARAT(RExC_parse++) != ']')
b45f050a
JF
3371 ;
3372 Simple_vFAIL3("POSIX syntax [%c %c] is reserved for future extensions", c, c);
3373 }
b8c5462f
JH
3374 }
3375 }
620e46c5
JH
3376}
3377
76e3520e 3378STATIC regnode *
830247a4 3379S_regclass(pTHX_ RExC_state_t *pRExC_state)
a687059c 3380{
ffc61ed2 3381 register UV value;
9a86a77b 3382 register UV nextvalue;
3568d838 3383 register IV prevvalue = OOB_UNICODE;
ffc61ed2 3384 register IV range = 0;
c277df42 3385 register regnode *ret;
ba210ebe 3386 STRLEN numlen;
ffc61ed2 3387 IV namedclass;
9c5ffd7c 3388 char *rangebegin = 0;
936ed897 3389 bool need_class = 0;
9c5ffd7c 3390 SV *listsv = Nullsv;
ffc61ed2
JH
3391 register char *e;
3392 UV n;
3568d838 3393 bool optimize_invert = TRUE;
ffc61ed2
JH
3394
3395 ret = reganode(pRExC_state, ANYOF, 0);
3396
3397 if (!SIZE_ONLY)
3398 ANYOF_FLAGS(ret) = 0;
3399
9a86a77b 3400 if (UCHARAT(RExC_parse) == '^') { /* Complement of range. */
ffc61ed2
JH
3401 RExC_naughty++;
3402 RExC_parse++;
3403 if (!SIZE_ONLY)
3404 ANYOF_FLAGS(ret) |= ANYOF_INVERT;
3405 }
a0d0e21e 3406
936ed897 3407 if (SIZE_ONLY)
830247a4 3408 RExC_size += ANYOF_SKIP;
936ed897 3409 else {
830247a4 3410 RExC_emit += ANYOF_SKIP;
936ed897
IZ
3411 if (FOLD)
3412 ANYOF_FLAGS(ret) |= ANYOF_FOLD;
3413 if (LOC)
3414 ANYOF_FLAGS(ret) |= ANYOF_LOCALE;
ffc61ed2
JH
3415 ANYOF_BITMAP_ZERO(ret);
3416 listsv = newSVpvn("# comment\n", 10);
a0d0e21e 3417 }
b8c5462f 3418
9a86a77b
JH
3419 nextvalue = RExC_parse < RExC_end ? UCHARAT(RExC_parse) : 0;
3420
3421 if (!SIZE_ONLY && ckWARN(WARN_REGEXP) && POSIXCC(nextvalue))
830247a4 3422 checkposixcc(pRExC_state);
b8c5462f 3423
9a86a77b 3424 if (UCHARAT(RExC_parse) == ']' || UCHARAT(RExC_parse) == '-')
ffc61ed2
JH
3425 goto charclassloop; /* allow 1st char to be ] or - */
3426
9a86a77b 3427 while (RExC_parse < RExC_end && UCHARAT(RExC_parse) != ']') {
ffc61ed2
JH
3428
3429 charclassloop:
3430
3431 namedclass = OOB_NAMEDCLASS; /* initialize as illegal */
3432
73b437c8 3433 if (!range)
830247a4 3434 rangebegin = RExC_parse;
ffc61ed2 3435 if (UTF) {
5e12f4fb 3436 value = utf8n_to_uvchr((U8*)RExC_parse,
3568d838
JH
3437 RExC_end - RExC_parse,
3438 &numlen, 0);
ffc61ed2
JH
3439 RExC_parse += numlen;
3440 }
3441 else
3442 value = UCHARAT(RExC_parse++);
9a86a77b
JH
3443 nextvalue = RExC_parse < RExC_end ? UCHARAT(RExC_parse) : 0;
3444 if (value == '[' && POSIXCC(nextvalue))
830247a4 3445 namedclass = regpposixcc(pRExC_state, value);
620e46c5 3446 else if (value == '\\') {
ffc61ed2 3447 if (UTF) {
5e12f4fb 3448 value = utf8n_to_uvchr((U8*)RExC_parse,
ffc61ed2
JH
3449 RExC_end - RExC_parse,
3450 &numlen, 0);
3451 RExC_parse += numlen;
3452 }
3453 else
3454 value = UCHARAT(RExC_parse++);
470c3474 3455 /* Some compilers cannot handle switching on 64-bit integer
ffc61ed2 3456 * values, therefore value cannot be an UV. Yes, this will
e2962f66
JH
3457 * be a problem later if we want switch on Unicode.
3458 * A similar issue a little bit later when switching on
3459 * namedclass. --jhi */
ffc61ed2 3460 switch ((I32)value) {
b8c5462f
JH
3461 case 'w': namedclass = ANYOF_ALNUM; break;
3462 case 'W': namedclass = ANYOF_NALNUM; break;
3463 case 's': namedclass = ANYOF_SPACE; break;
3464 case 'S': namedclass = ANYOF_NSPACE; break;
3465 case 'd': namedclass = ANYOF_DIGIT; break;
3466 case 'D': namedclass = ANYOF_NDIGIT; break;
ffc61ed2
JH
3467 case 'p':
3468 case 'P':
3469 if (*RExC_parse == '{') {
0da60cf5 3470 U8 c = (U8)value;
ffc61ed2
JH
3471 e = strchr(RExC_parse++, '}');
3472 if (!e)
0da60cf5 3473 vFAIL2("Missing right brace on \\%c{}", c);
ab13f0c7
JH
3474 while (isSPACE(UCHARAT(RExC_parse)))
3475 RExC_parse++;
3476 if (e == RExC_parse)
0da60cf5 3477 vFAIL2("Empty \\%c{}", c);
ffc61ed2 3478 n = e - RExC_parse;
ab13f0c7
JH
3479 while (isSPACE(UCHARAT(RExC_parse + n - 1)))
3480 n--;
ffc61ed2
JH
3481 }
3482 else {
3483 e = RExC_parse;
3484 n = 1;
3485 }
3486 if (!SIZE_ONLY) {
ab13f0c7
JH
3487 if (UCHARAT(RExC_parse) == '^') {
3488 RExC_parse++;
3489 n--;
3490 value = value == 'p' ? 'P' : 'p'; /* toggle */
3491 while (isSPACE(UCHARAT(RExC_parse))) {
3492 RExC_parse++;
3493 n--;
3494 }
3495 }
ffc61ed2 3496 if (value == 'p')
ab13f0c7
JH
3497 Perl_sv_catpvf(aTHX_ listsv,
3498 "+utf8::%.*s\n", (int)n, RExC_parse);
ffc61ed2 3499 else
ab13f0c7
JH
3500 Perl_sv_catpvf(aTHX_ listsv,
3501 "!utf8::%.*s\n", (int)n, RExC_parse);
ffc61ed2
JH
3502 }
3503 RExC_parse = e + 1;
3504 ANYOF_FLAGS(ret) |= ANYOF_UNICODE;
3505 continue;
b8c5462f
JH
3506 case 'n': value = '\n'; break;
3507 case 'r': value = '\r'; break;
3508 case 't': value = '\t'; break;
3509 case 'f': value = '\f'; break;
3510 case 'b': value = '\b'; break;
c7f1f016
NIS
3511 case 'e': value = ASCII_TO_NATIVE('\033');break;
3512 case 'a': value = ASCII_TO_NATIVE('\007');break;
b8c5462f 3513 case 'x':
ffc61ed2 3514 if (*RExC_parse == '{') {
a4c04bdc
NC
3515 I32 flags = PERL_SCAN_ALLOW_UNDERSCORES
3516 | PERL_SCAN_DISALLOW_PREFIX;
ffc61ed2 3517 e = strchr(RExC_parse++, '}');
b81d288d 3518 if (!e)
ffc61ed2 3519 vFAIL("Missing right brace on \\x{}");
53305cf1
NC
3520
3521 numlen = e - RExC_parse;
3522 value = grok_hex(RExC_parse, &numlen, &flags, NULL);
ffc61ed2
JH
3523 RExC_parse = e + 1;
3524 }
3525 else {
a4c04bdc 3526 I32 flags = PERL_SCAN_DISALLOW_PREFIX;
53305cf1
NC
3527 numlen = 2;
3528 value = grok_hex(RExC_parse, &numlen, &flags, NULL);
ffc61ed2
JH
3529 RExC_parse += numlen;
3530 }
b8c5462f
JH
3531 break;
3532 case 'c':
830247a4 3533 value = UCHARAT(RExC_parse++);
b8c5462f
JH
3534 value = toCTRL(value);
3535 break;
3536 case '0': case '1': case '2': case '3': case '4':
3537 case '5': case '6': case '7': case '8': case '9':
53305cf1
NC
3538 {
3539 I32 flags = 0;
3540 numlen = 3;
3541 value = grok_oct(--RExC_parse, &numlen, &flags, NULL);
830247a4 3542 RExC_parse += numlen;
b8c5462f 3543 break;
53305cf1 3544 }
1028017a 3545 default:
e476b1b5 3546 if (!SIZE_ONLY && ckWARN(WARN_REGEXP) && isALPHA(value))
ffc61ed2
JH
3547 vWARN2(RExC_parse,
3548 "Unrecognized escape \\%c in character class passed through",
3549 (int)value);
1028017a 3550 break;
b8c5462f 3551 }
ffc61ed2
JH
3552 } /* end of \blah */
3553
3554 if (namedclass > OOB_NAMEDCLASS) { /* this is a named class \blah */
3555
3556 if (!SIZE_ONLY && !need_class)
936ed897 3557 ANYOF_CLASS_ZERO(ret);
ffc61ed2 3558
936ed897 3559 need_class = 1;
ffc61ed2
JH
3560
3561 /* a bad range like a-\d, a-[:digit:] ? */
3562 if (range) {
73b437c8 3563 if (!SIZE_ONLY) {
e476b1b5 3564 if (ckWARN(WARN_REGEXP))
830247a4 3565 vWARN4(RExC_parse,
b45f050a 3566 "False [] range \"%*.*s\"",
830247a4
IZ
3567 RExC_parse - rangebegin,
3568 RExC_parse - rangebegin,
b45f050a 3569 rangebegin);
3568d838
JH
3570 if (prevvalue < 256) {
3571 ANYOF_BITMAP_SET(ret, prevvalue);
ffc61ed2
JH
3572 ANYOF_BITMAP_SET(ret, '-');
3573 }
3574 else {
3575 ANYOF_FLAGS(ret) |= ANYOF_UNICODE;
3576 Perl_sv_catpvf(aTHX_ listsv,
3568d838 3577 "%04"UVxf"\n%04"UVxf"\n", (UV)prevvalue, (UV) '-');
ffc61ed2 3578 }
b8c5462f 3579 }
ffc61ed2
JH
3580
3581 range = 0; /* this was not a true range */
73b437c8 3582 }
ffc61ed2 3583
73b437c8 3584 if (!SIZE_ONLY) {
3568d838
JH
3585 if (namedclass > OOB_NAMEDCLASS)
3586 optimize_invert = FALSE;
e2962f66
JH
3587 /* Possible truncation here but in some 64-bit environments
3588 * the compiler gets heartburn about switch on 64-bit values.
3589 * A similar issue a little earlier when switching on value.
98f323fa 3590 * --jhi */
e2962f66 3591 switch ((I32)namedclass) {
73b437c8
JH
3592 case ANYOF_ALNUM:
3593 if (LOC)
936ed897 3594 ANYOF_CLASS_SET(ret, ANYOF_ALNUM);
73b437c8
JH
3595 else {
3596 for (value = 0; value < 256; value++)
3597 if (isALNUM(value))
936ed897 3598 ANYOF_BITMAP_SET(ret, value);
73b437c8 3599 }
ffc61ed2 3600 Perl_sv_catpvf(aTHX_ listsv, "+utf8::IsWord\n");
73b437c8
JH
3601 break;
3602 case ANYOF_NALNUM:
3603 if (LOC)
936ed897 3604 ANYOF_CLASS_SET(ret, ANYOF_NALNUM);
73b437c8
JH
3605 else {
3606 for (value = 0; value < 256; value++)
3607 if (!isALNUM(value))
936ed897 3608 ANYOF_BITMAP_SET(ret, value);
73b437c8 3609 }
ffc61ed2 3610 Perl_sv_catpvf(aTHX_ listsv, "!utf8::IsWord\n");
73b437c8 3611 break;
ffc61ed2 3612 case ANYOF_ALNUMC:
73b437c8 3613 if (LOC)
ffc61ed2 3614 ANYOF_CLASS_SET(ret, ANYOF_ALNUMC);
73b437c8
JH
3615 else {
3616 for (value = 0; value < 256; value++)
ffc61ed2 3617 if (isALNUMC(value))
936ed897 3618 ANYOF_BITMAP_SET(ret, value);
73b437c8 3619 }
ffc61ed2 3620 Perl_sv_catpvf(aTHX_ listsv, "+utf8::IsAlnum\n");
73b437c8
JH
3621 break;
3622 case ANYOF_NALNUMC:
3623 if (LOC)
936ed897 3624 ANYOF_CLASS_SET(ret, ANYOF_NALNUMC);
73b437c8
JH
3625 else {
3626 for (value = 0; value < 256; value++)
3627 if (!isALNUMC(value))
936ed897 3628 ANYOF_BITMAP_SET(ret, value);
73b437c8 3629 }
ffc61ed2 3630 Perl_sv_catpvf(aTHX_ listsv, "!utf8::IsAlnum\n");
73b437c8
JH
3631 break;
3632 case ANYOF_ALPHA:
3633 if (LOC)
936ed897 3634 ANYOF_CLASS_SET(ret, ANYOF_ALPHA);
73b437c8
JH
3635 else {
3636 for (value = 0; value < 256; value++)
3637 if (isALPHA(value))
936ed897 3638 ANYOF_BITMAP_SET(ret, value);
73b437c8 3639 }
ffc61ed2 3640 Perl_sv_catpvf(aTHX_ listsv, "+utf8::IsAlpha\n");
73b437c8
JH
3641 break;
3642 case ANYOF_NALPHA:
3643 if (LOC)
936ed897 3644 ANYOF_CLASS_SET(ret, ANYOF_NALPHA);
73b437c8
JH
3645 else {
3646 for (value = 0; value < 256; value++)
3647 if (!isALPHA(value))
936ed897 3648 ANYOF_BITMAP_SET(ret, value);
73b437c8 3649 }
ffc61ed2 3650 Perl_sv_catpvf(aTHX_ listsv, "!utf8::IsAlpha\n");
73b437c8
JH
3651 break;
3652 case ANYOF_ASCII:
3653 if (LOC)
936ed897 3654 ANYOF_CLASS_SET(ret, ANYOF_ASCII);
73b437c8 3655 else {
c7f1f016 3656#ifndef EBCDIC
1ba5c669
JH
3657 for (value = 0; value < 128; value++)
3658 ANYOF_BITMAP_SET(ret, value);
3659#else /* EBCDIC */
ffbc6a93 3660 for (value = 0; value < 256; value++) {
3a3c4447
JH
3661 if (isASCII(value))
3662 ANYOF_BITMAP_SET(ret, value);
ffbc6a93 3663 }
1ba5c669 3664#endif /* EBCDIC */
73b437c8 3665 }
ffc61ed2 3666 Perl_sv_catpvf(aTHX_ listsv, "+utf8::IsASCII\n");
73b437c8
JH
3667 break;
3668 case ANYOF_NASCII:
3669 if (LOC)
936ed897 3670 ANYOF_CLASS_SET(ret, ANYOF_NASCII);
73b437c8 3671 else {
c7f1f016 3672#ifndef EBCDIC
1ba5c669
JH
3673 for (value = 128; value < 256; value++)
3674 ANYOF_BITMAP_SET(ret, value);
3675#else /* EBCDIC */
ffbc6a93 3676 for (value = 0; value < 256; value++) {
3a3c4447
JH
3677 if (!isASCII(value))
3678 ANYOF_BITMAP_SET(ret, value);
ffbc6a93 3679 }
1ba5c669 3680#endif /* EBCDIC */
73b437c8 3681 }
ffc61ed2 3682 Perl_sv_catpvf(aTHX_ listsv, "!utf8::IsASCII\n");
73b437c8 3683 break;
aaa51d5e
JF
3684 case ANYOF_BLANK:
3685 if (LOC)
3686 ANYOF_CLASS_SET(ret, ANYOF_BLANK);
3687 else {
3688 for (value = 0; value < 256; value++)
3689 if (isBLANK(value))
3690 ANYOF_BITMAP_SET(ret, value);
3691 }
ffc61ed2 3692 Perl_sv_catpvf(aTHX_ listsv, "+utf8::IsBlank\n");
aaa51d5e
JF
3693 break;
3694 case ANYOF_NBLANK:
3695 if (LOC)
3696 ANYOF_CLASS_SET(ret, ANYOF_NBLANK);
3697 else {
3698 for (value = 0; value < 256; value++)
3699 if (!isBLANK(value))
3700 ANYOF_BITMAP_SET(ret, value);
3701 }
ffc61ed2 3702 Perl_sv_catpvf(aTHX_ listsv, "!utf8::IsBlank\n");
aaa51d5e 3703 break;
73b437c8
JH
3704 case ANYOF_CNTRL:
3705 if (LOC)
936ed897 3706 ANYOF_CLASS_SET(ret, ANYOF_CNTRL);
73b437c8
JH
3707 else {
3708 for (value = 0; value < 256; value++)
3709 if (isCNTRL(value))
936ed897 3710 ANYOF_BITMAP_SET(ret, value);
73b437c8 3711 }
ffc61ed2 3712 Perl_sv_catpvf(aTHX_ listsv, "+utf8::IsCntrl\n");
73b437c8
JH
3713 break;
3714 case ANYOF_NCNTRL:
3715 if (LOC)
936ed897 3716 ANYOF_CLASS_SET(ret, ANYOF_NCNTRL);
73b437c8
JH
3717 else {
3718 for (value = 0; value < 256; value++)
3719 if (!isCNTRL(value))
936ed897 3720 ANYOF_BITMAP_SET(ret, value);
73b437c8 3721 }
ffc61ed2
JH
3722 Perl_sv_catpvf(aTHX_ listsv, "!utf8::IsCntrl\n");
3723 break;
3724 case ANYOF_DIGIT:
3725 if (LOC)
3726 ANYOF_CLASS_SET(ret, ANYOF_DIGIT);
3727 else {
3728 /* consecutive digits assumed */
3729 for (value = '0'; value <= '9'; value++)
3730 ANYOF_BITMAP_SET(ret, value);
3731 }
3732 Perl_sv_catpvf(aTHX_ listsv, "+utf8::IsDigit\n");
3733 break;
3734 case ANYOF_NDIGIT:
3735 if (LOC)
3736 ANYOF_CLASS_SET(ret, ANYOF_NDIGIT);
3737 else {
3738 /* consecutive digits assumed */
3739 for (value = 0; value < '0'; value++)
3740 ANYOF_BITMAP_SET(ret, value);
3741 for (value = '9' + 1; value < 256; value++)
3742 ANYOF_BITMAP_SET(ret, value);
3743 }
3744 Perl_sv_catpvf(aTHX_ listsv, "!utf8::IsDigit\n");
73b437c8
JH
3745 break;
3746 case ANYOF_GRAPH:
3747 if (LOC)
936ed897 3748 ANYOF_CLASS_SET(ret, ANYOF_GRAPH);
73b437c8
JH
3749 else {
3750 for (value = 0; value < 256; value++)
3751 if (isGRAPH(value))
936ed897 3752 ANYOF_BITMAP_SET(ret, value);
73b437c8 3753 }
ffc61ed2 3754 Perl_sv_catpvf(aTHX_ listsv, "+utf8::IsGraph\n");
73b437c8
JH
3755 break;
3756 case ANYOF_NGRAPH:
3757 if (LOC)
936ed897 3758 ANYOF_CLASS_SET(ret, ANYOF_NGRAPH);
73b437c8
JH
3759 else {
3760 for (value = 0; value < 256; value++)
3761 if (!isGRAPH(value))
936ed897 3762 ANYOF_BITMAP_SET(ret, value);
73b437c8 3763 }
ffc61ed2 3764 Perl_sv_catpvf(aTHX_ listsv, "!utf8::IsGraph\n");
73b437c8
JH
3765 break;
3766 case ANYOF_LOWER:
3767 if (LOC)
936ed897 3768 ANYOF_CLASS_SET(ret, ANYOF_LOWER);
73b437c8
JH
3769 else {
3770 for (value = 0; value < 256; value++)
3771 if (isLOWER(value))
936ed897 3772 ANYOF_BITMAP_SET(ret, value);
73b437c8 3773 }
ffc61ed2 3774 Perl_sv_catpvf(aTHX_ listsv, "+utf8::IsLower\n");
73b437c8
JH
3775 break;
3776 case ANYOF_NLOWER:
3777 if (LOC)
936ed897 3778 ANYOF_CLASS_SET(ret, ANYOF_NLOWER);
73b437c8
JH
3779 else {
3780 for (value = 0; value < 256; value++)
3781 if (!isLOWER(value))
936ed897 3782 ANYOF_BITMAP_SET(ret, value);
73b437c8 3783 }
ffc61ed2 3784 Perl_sv_catpvf(aTHX_ listsv, "!utf8::IsLower\n");
73b437c8
JH
3785 break;
3786 case ANYOF_PRINT:
3787 if (LOC)
936ed897 3788 ANYOF_CLASS_SET(ret, ANYOF_PRINT);
73b437c8
JH
3789 else {
3790 for (value = 0; value < 256; value++)
3791 if (isPRINT(value))
936ed897 3792 ANYOF_BITMAP_SET(ret, value);
73b437c8 3793 }
ffc61ed2 3794 Perl_sv_catpvf(aTHX_ listsv, "+utf8::IsPrint\n");
73b437c8
JH
3795 break;
3796 case ANYOF_NPRINT:
3797 if (LOC)
936ed897 3798 ANYOF_CLASS_SET(ret, ANYOF_NPRINT);
73b437c8
JH
3799 else {
3800 for (value = 0; value < 256; value++)
3801 if (!isPRINT(value))
936ed897 3802 ANYOF_BITMAP_SET(ret, value);
73b437c8 3803 }
ffc61ed2 3804 Perl_sv_catpvf(aTHX_ listsv, "!utf8::IsPrint\n");
73b437c8 3805 break;
aaa51d5e
JF
3806 case ANYOF_PSXSPC:
3807 if (LOC)
3808 ANYOF_CLASS_SET(ret, ANYOF_PSXSPC);
3809 else {
3810 for (value = 0; value < 256; value++)
3811 if (isPSXSPC(value))
3812 ANYOF_BITMAP_SET(ret, value);
3813 }
ffc61ed2 3814 Perl_sv_catpvf(aTHX_ listsv, "+utf8::IsSpace\n");
aaa51d5e
JF
3815 break;
3816 case ANYOF_NPSXSPC:
3817 if (LOC)
3818 ANYOF_CLASS_SET(ret, ANYOF_NPSXSPC);
3819 else {
3820 for (value = 0; value < 256; value++)
3821 if (!isPSXSPC(value))
3822 ANYOF_BITMAP_SET(ret, value);
3823 }
ffc61ed2 3824 Perl_sv_catpvf(aTHX_ listsv, "!utf8::IsSpace\n");
aaa51d5e 3825 break;
73b437c8
JH
3826 case ANYOF_PUNCT:
3827 if (LOC)
936ed897 3828 ANYOF_CLASS_SET(ret, ANYOF_PUNCT);
73b437c8
JH
3829 else {
3830 for (value = 0; value < 256; value++)
3831 if (isPUNCT(value))
936ed897 3832 ANYOF_BITMAP_SET(ret, value);
73b437c8 3833 }
ffc61ed2 3834 Perl_sv_catpvf(aTHX_ listsv, "+utf8::IsPunct\n");
73b437c8
JH
3835 break;
3836 case ANYOF_NPUNCT:
3837 if (LOC)
936ed897 3838 ANYOF_CLASS_SET(ret, ANYOF_NPUNCT);
73b437c8
JH
3839 else {
3840 for (value = 0; value < 256; value++)
3841 if (!isPUNCT(value))
936ed897 3842 ANYOF_BITMAP_SET(ret, value);
73b437c8 3843 }
ffc61ed2
JH
3844 Perl_sv_catpvf(aTHX_ listsv, "!utf8::IsPunct\n");
3845 break;
3846 case ANYOF_SPACE:
3847 if (LOC)
3848 ANYOF_CLASS_SET(ret, ANYOF_SPACE);
3849 else {
3850 for (value = 0; value < 256; value++)
3851 if (isSPACE(value))
3852 ANYOF_BITMAP_SET(ret, value);
3853 }
3854 Perl_sv_catpvf(aTHX_ listsv, "+utf8::IsSpacePerl\n");
3855 break;
3856 case ANYOF_NSPACE:
3857 if (LOC)
3858 ANYOF_CLASS_SET(ret, ANYOF_NSPACE);
3859 else {
3860 for (value = 0; value < 256; value++)
3861 if (!isSPACE(value))
3862 ANYOF_BITMAP_SET(ret, value);
3863 }
3864 Perl_sv_catpvf(aTHX_ listsv, "!utf8::IsSpacePerl\n");
73b437c8
JH
3865 break;
3866 case ANYOF_UPPER:
3867 if (LOC)
936ed897 3868 ANYOF_CLASS_SET(ret, ANYOF_UPPER);
73b437c8
JH
3869 else {
3870 for (value = 0; value < 256; value++)
3871 if (isUPPER(value))
936ed897 3872 ANYOF_BITMAP_SET(ret, value);
73b437c8 3873 }
ffc61ed2 3874 Perl_sv_catpvf(aTHX_ listsv, "+utf8::IsUpper\n");
73b437c8
JH
3875 break;
3876 case ANYOF_NUPPER:
3877 if (LOC)
936ed897 3878 ANYOF_CLASS_SET(ret, ANYOF_NUPPER);
73b437c8
JH
3879 else {
3880 for (value = 0; value < 256; value++)
3881 if (!isUPPER(value))
936ed897 3882 ANYOF_BITMAP_SET(ret, value);
73b437c8 3883 }
ffc61ed2 3884 Perl_sv_catpvf(aTHX_ listsv, "!utf8::IsUpper\n");
73b437c8
JH
3885 break;
3886 case ANYOF_XDIGIT:
3887 if (LOC)
936ed897 3888 ANYOF_CLASS_SET(ret, ANYOF_XDIGIT);
73b437c8
JH
3889 else {
3890 for (value = 0; value < 256; value++)
3891 if (isXDIGIT(value))
936ed897 3892 ANYOF_BITMAP_SET(ret, value);
73b437c8 3893 }
ffc61ed2 3894 Perl_sv_catpvf(aTHX_ listsv, "+utf8::IsXDigit\n");
73b437c8
JH
3895 break;
3896 case ANYOF_NXDIGIT:
3897 if (LOC)
936ed897 3898 ANYOF_CLASS_SET(ret, ANYOF_NXDIGIT);
73b437c8
JH
3899 else {
3900 for (value = 0; value < 256; value++)
3901 if (!isXDIGIT(value))
936ed897 3902 ANYOF_BITMAP_SET(ret, value);
73b437c8 3903 }
ffc61ed2 3904 Perl_sv_catpvf(aTHX_ listsv, "!utf8::IsXDigit\n");
73b437c8
JH
3905 break;
3906 default:
b45f050a 3907 vFAIL("Invalid [::] class");
73b437c8 3908 break;
b8c5462f 3909 }
b8c5462f 3910 if (LOC)
936ed897 3911 ANYOF_FLAGS(ret) |= ANYOF_CLASS;
73b437c8 3912 continue;
a0d0e21e 3913 }
ffc61ed2
JH
3914 } /* end of namedclass \blah */
3915
a0d0e21e 3916 if (range) {
3a3c4447 3917 if (prevvalue > value) /* b-a */ {
b45f050a 3918 Simple_vFAIL4("Invalid [] range \"%*.*s\"",
830247a4
IZ
3919 RExC_parse - rangebegin,
3920 RExC_parse - rangebegin,
b45f050a 3921 rangebegin);
3568d838 3922 range = 0; /* not a valid range */
73b437c8 3923 }
a0d0e21e
LW
3924 }
3925 else {
3568d838 3926 prevvalue = value; /* save the beginning of the range */
830247a4
IZ
3927 if (*RExC_parse == '-' && RExC_parse+1 < RExC_end &&
3928 RExC_parse[1] != ']') {
3929 RExC_parse++;
ffc61ed2
JH
3930
3931 /* a bad range like \w-, [:word:]- ? */
3932 if (namedclass > OOB_NAMEDCLASS) {
e476b1b5 3933 if (ckWARN(WARN_REGEXP))
830247a4 3934 vWARN4(RExC_parse,
b45f050a 3935 "False [] range \"%*.*s\"",
830247a4
IZ
3936 RExC_parse - rangebegin,
3937 RExC_parse - rangebegin,
b45f050a 3938 rangebegin);
73b437c8 3939 if (!SIZE_ONLY)
936ed897 3940 ANYOF_BITMAP_SET(ret, '-');
73b437c8 3941 } else
ffc61ed2
JH
3942 range = 1; /* yeah, it's a range! */
3943 continue; /* but do it the next time */
a0d0e21e 3944 }
a687059c 3945 }
ffc61ed2 3946
93733859 3947 /* now is the next time */
ae5c130c 3948 if (!SIZE_ONLY) {
3568d838
JH
3949 IV i;
3950
3951 if (prevvalue < 256) {
3952 IV ceilvalue = value < 256 ? value : 255;
3953
3954#ifdef EBCDIC
3a3c4447
JH
3955 if ((isLOWER(prevvalue) && isLOWER(ceilvalue)) ||
3956 (isUPPER(prevvalue) && isUPPER(ceilvalue)))
ffc61ed2 3957 {
3568d838
JH
3958 if (isLOWER(prevvalue)) {
3959 for (i = prevvalue; i <= ceilvalue; i++)
ffc61ed2
JH
3960 if (isLOWER(i))
3961 ANYOF_BITMAP_SET(ret, i);
3962 } else {
3568d838 3963 for (i = prevvalue; i <= ceilvalue; i++)
ffc61ed2
JH
3964 if (isUPPER(i))
3965 ANYOF_BITMAP_SET(ret, i);
3966 }
8ada0baa 3967 }
ffc61ed2 3968 else
8ada0baa 3969#endif
3568d838
JH
3970 for (i = prevvalue; i <= ceilvalue; i++)
3971 ANYOF_BITMAP_SET(ret, i);
3972 }
3973 if (value > 255) {
ffc61ed2 3974 ANYOF_FLAGS(ret) |= ANYOF_UNICODE;
3568d838 3975 if (prevvalue < value)
ffc61ed2 3976 Perl_sv_catpvf(aTHX_ listsv, "%04"UVxf"\t%04"UVxf"\n",
3568d838
JH
3977 (UV)prevvalue, (UV)value);
3978 else if (prevvalue == value)
ffc61ed2
JH
3979 Perl_sv_catpvf(aTHX_ listsv, "%04"UVxf"\n",
3980 (UV)value);
3981 }
8ada0baa 3982 }
ffc61ed2
JH
3983
3984 range = 0; /* this range (if it was one) is done now */
a0d0e21e 3985 }
ffc61ed2 3986
936ed897 3987 if (need_class) {
4f66b38d 3988 ANYOF_FLAGS(ret) |= ANYOF_LARGE;
936ed897 3989 if (SIZE_ONLY)
830247a4 3990 RExC_size += ANYOF_CLASS_ADD_SKIP;
936ed897 3991 else
830247a4 3992 RExC_emit += ANYOF_CLASS_ADD_SKIP;
936ed897 3993 }
ffc61ed2 3994
ae5c130c 3995 /* optimize case-insensitive simple patterns (e.g. /[a-z]/i) */
b8c5462f 3996 if (!SIZE_ONLY &&
ffc61ed2 3997 /* If the only flag is folding (plus possibly inversion). */
516a5887
JH
3998 ((ANYOF_FLAGS(ret) & (ANYOF_FLAGS_ALL ^ ANYOF_INVERT)) == ANYOF_FOLD)
3999 ) {
a0ed51b3 4000 for (value = 0; value < 256; ++value) {
936ed897 4001 if (ANYOF_BITMAP_TEST(ret, value)) {
ffc61ed2
JH
4002 IV fold = PL_fold[value];
4003
4004 if (fold != value)
4005 ANYOF_BITMAP_SET(ret, fold);
ae5c130c
GS
4006 }
4007 }
936ed897 4008 ANYOF_FLAGS(ret) &= ~ANYOF_FOLD;
ae5c130c 4009 }
ffc61ed2 4010
ae5c130c 4011 /* optimize inverted simple patterns (e.g. [^a-z]) */
3568d838 4012 if (!SIZE_ONLY && optimize_invert &&
ffc61ed2
JH
4013 /* If the only flag is inversion. */
4014 (ANYOF_FLAGS(ret) & ANYOF_FLAGS_ALL) == ANYOF_INVERT) {
b8c5462f 4015 for (value = 0; value < ANYOF_BITMAP_SIZE; ++value)
936ed897 4016 ANYOF_BITMAP(ret)[value] ^= ANYOF_FLAGS_ALL;
1aa99e6b 4017 ANYOF_FLAGS(ret) = ANYOF_UNICODE_ALL;
ae5c130c 4018 }
a0d0e21e 4019
b81d288d 4020 if (!SIZE_ONLY) {
fde631ed 4021 AV *av = newAV();
ffc61ed2
JH
4022 SV *rv;
4023
4024 av_store(av, 0, listsv);
4025 av_store(av, 1, NULL);
4026 rv = newRV_noinc((SV*)av);
19860706 4027 n = add_data(pRExC_state, 1, "s");
830247a4 4028 RExC_rx->data->data[n] = (void*)rv;
ffc61ed2 4029 ARG_SET(ret, n);
a0ed51b3
LW
4030 }
4031
4032 return ret;
4033}
4034
76e3520e 4035STATIC char*
830247a4 4036S_nextchar(pTHX_ RExC_state_t *pRExC_state)
a0d0e21e 4037{
830247a4 4038 char* retval = RExC_parse++;
a0d0e21e 4039
4633a7c4 4040 for (;;) {
830247a4
IZ
4041 if (*RExC_parse == '(' && RExC_parse[1] == '?' &&
4042 RExC_parse[2] == '#') {
4043 while (*RExC_parse && *RExC_parse != ')')
4044 RExC_parse++;
4045 RExC_parse++;
4633a7c4
LW
4046 continue;
4047 }
830247a4
IZ
4048 if (RExC_flags16 & PMf_EXTENDED) {
4049 if (isSPACE(*RExC_parse)) {
4050 RExC_parse++;
748a9306
LW
4051 continue;
4052 }
830247a4
IZ
4053 else if (*RExC_parse == '#') {
4054 while (*RExC_parse && *RExC_parse != '\n')
4055 RExC_parse++;
4056 RExC_parse++;
748a9306
LW
4057 continue;
4058 }
748a9306 4059 }
4633a7c4 4060 return retval;
a0d0e21e 4061 }
a687059c
LW
4062}
4063
4064/*
c277df42 4065- reg_node - emit a node
a0d0e21e 4066*/
76e3520e 4067STATIC regnode * /* Location. */
830247a4 4068S_reg_node(pTHX_ RExC_state_t *pRExC_state, U8 op)
a687059c 4069{
c277df42
IZ
4070 register regnode *ret;
4071 register regnode *ptr;
a687059c 4072
830247a4 4073 ret = RExC_emit;
c277df42 4074 if (SIZE_ONLY) {
830247a4
IZ
4075 SIZE_ALIGN(RExC_size);
4076 RExC_size += 1;
a0d0e21e
LW
4077 return(ret);
4078 }
a687059c 4079
c277df42 4080 NODE_ALIGN_FILL(ret);
a0d0e21e 4081 ptr = ret;
c277df42 4082 FILL_ADVANCE_NODE(ptr, op);
fac92740
MJD
4083 if (RExC_offsets) { /* MJD */
4084 MJD_OFFSET_DEBUG((stderr, "%s:%u: (op %s) %s %u <- %u (len %u) (max %u).\n",
4085 "reg_node", __LINE__,
4086 reg_name[op],
4087 RExC_emit - RExC_emit_start > RExC_offsets[0]
4088 ? "Overwriting end of array!\n" : "OK",
4089 RExC_emit - RExC_emit_start,
4090 RExC_parse - RExC_start,
4091 RExC_offsets[0]));
4092 Set_Node_Offset(RExC_emit, RExC_parse + (op == END));
4093 }
4094
830247a4 4095 RExC_emit = ptr;
a687059c 4096
a0d0e21e 4097 return(ret);
a687059c
LW
4098}
4099
4100/*
a0d0e21e
LW
4101- reganode - emit a node with an argument
4102*/
76e3520e 4103STATIC regnode * /* Location. */
830247a4 4104S_reganode(pTHX_ RExC_state_t *pRExC_state, U8 op, U32 arg)
fe14fcc3 4105{
c277df42
IZ
4106 register regnode *ret;
4107 register regnode *ptr;
fe14fcc3 4108
830247a4 4109 ret = RExC_emit;
c277df42 4110 if (SIZE_ONLY) {
830247a4
IZ
4111 SIZE_ALIGN(RExC_size);
4112 RExC_size += 2;
a0d0e21e
LW
4113 return(ret);
4114 }
fe14fcc3 4115
c277df42 4116 NODE_ALIGN_FILL(ret);
a0d0e21e 4117 ptr = ret;
c277df42 4118 FILL_ADVANCE_NODE_ARG(ptr, op, arg);
fac92740
MJD
4119 if (RExC_offsets) { /* MJD */
4120 MJD_OFFSET_DEBUG((stderr, "%s: %s %u <- %u (max %u).\n",
4121 "reganode",
4122 RExC_emit - RExC_emit_start > RExC_offsets[0] ?
4123 "Overwriting end of array!\n" : "OK",
4124 RExC_emit - RExC_emit_start,
4125 RExC_parse - RExC_start,
4126 RExC_offsets[0]));
4127 Set_Cur_Node_Offset;
4128 }
4129
830247a4 4130 RExC_emit = ptr;
fe14fcc3 4131
a0d0e21e 4132 return(ret);
fe14fcc3
LW
4133}
4134
4135/*
cd439c50 4136- reguni - emit (if appropriate) a Unicode character
a0ed51b3
LW
4137*/
4138STATIC void
830247a4 4139S_reguni(pTHX_ RExC_state_t *pRExC_state, UV uv, char* s, STRLEN* lenp)
a0ed51b3 4140{
5e12f4fb 4141 *lenp = SIZE_ONLY ? UNISKIP(uv) : (uvchr_to_utf8((U8*)s, uv) - (U8*)s);
a0ed51b3
LW
4142}
4143
4144/*
a0d0e21e
LW
4145- reginsert - insert an operator in front of already-emitted operand
4146*
4147* Means relocating the operand.
4148*/
76e3520e 4149STATIC void
830247a4 4150S_reginsert(pTHX_ RExC_state_t *pRExC_state, U8 op, regnode *opnd)
a687059c 4151{
c277df42
IZ
4152 register regnode *src;
4153 register regnode *dst;
4154 register regnode *place;
4155 register int offset = regarglen[(U8)op];
b81d288d 4156
22c35a8c 4157/* (PL_regkind[(U8)op] == CURLY ? EXTRA_STEP_2ARGS : 0); */
c277df42
IZ
4158
4159 if (SIZE_ONLY) {
830247a4 4160 RExC_size += NODE_STEP_REGNODE + offset;
a0d0e21e
LW
4161 return;
4162 }
a687059c 4163
830247a4
IZ
4164 src = RExC_emit;
4165 RExC_emit += NODE_STEP_REGNODE + offset;
4166 dst = RExC_emit;
fac92740 4167 while (src > opnd) {
c277df42 4168 StructCopy(--src, --dst, regnode);
fac92740
MJD
4169 if (RExC_offsets) { /* MJD 20010112 */
4170 MJD_OFFSET_DEBUG((stderr, "%s: %s copy %u -> %u (max %u).\n",
4171 "reg_insert",
4172 dst - RExC_emit_start > RExC_offsets[0]
4173 ? "Overwriting end of array!\n" : "OK",
4174 src - RExC_emit_start,
4175 dst - RExC_emit_start,
4176 RExC_offsets[0]));
4177 Set_Node_Offset_To_R(dst-RExC_emit_start, Node_Offset(src));
4178 Set_Node_Length_To_R(dst-RExC_emit_start, Node_Length(src));
4179 }
4180 }
4181
a0d0e21e
LW
4182
4183 place = opnd; /* Op node, where operand used to be. */
fac92740
MJD
4184 if (RExC_offsets) { /* MJD */
4185 MJD_OFFSET_DEBUG((stderr, "%s: %s %u <- %u (max %u).\n",
4186 "reginsert",
4187 place - RExC_emit_start > RExC_offsets[0]
4188 ? "Overwriting end of array!\n" : "OK",
4189 place - RExC_emit_start,
4190 RExC_parse - RExC_start,
4191 RExC_offsets[0]));
4192 Set_Node_Offset(place, RExC_parse);
4193 }
c277df42
IZ
4194 src = NEXTOPER(place);
4195 FILL_ADVANCE_NODE(place, op);
4196 Zero(src, offset, regnode);
a687059c
LW
4197}
4198
4199/*
c277df42 4200- regtail - set the next-pointer at the end of a node chain of p to val.
a0d0e21e 4201*/
76e3520e 4202STATIC void
830247a4 4203S_regtail(pTHX_ RExC_state_t *pRExC_state, regnode *p, regnode *val)
a687059c 4204{
c277df42
IZ
4205 register regnode *scan;
4206 register regnode *temp;
a0d0e21e 4207
c277df42 4208 if (SIZE_ONLY)
a0d0e21e
LW
4209 return;
4210
4211 /* Find last node. */
4212 scan = p;
4213 for (;;) {
4214 temp = regnext(scan);
4215 if (temp == NULL)
4216 break;
4217 scan = temp;
4218 }
a687059c 4219
c277df42
IZ
4220 if (reg_off_by_arg[OP(scan)]) {
4221 ARG_SET(scan, val - scan);
a0ed51b3
LW
4222 }
4223 else {
c277df42
IZ
4224 NEXT_OFF(scan) = val - scan;
4225 }
a687059c
LW
4226}
4227
4228/*
a0d0e21e
LW
4229- regoptail - regtail on operand of first argument; nop if operandless
4230*/
76e3520e 4231STATIC void
830247a4 4232S_regoptail(pTHX_ RExC_state_t *pRExC_state, regnode *p, regnode *val)
a687059c 4233{
a0d0e21e 4234 /* "Operandless" and "op != BRANCH" are synonymous in practice. */
c277df42
IZ
4235 if (p == NULL || SIZE_ONLY)
4236 return;
22c35a8c 4237 if (PL_regkind[(U8)OP(p)] == BRANCH) {
830247a4 4238 regtail(pRExC_state, NEXTOPER(p), val);
a0ed51b3 4239 }
22c35a8c 4240 else if ( PL_regkind[(U8)OP(p)] == BRANCHJ) {
830247a4 4241 regtail(pRExC_state, NEXTOPER(NEXTOPER(p)), val);
a0ed51b3
LW
4242 }
4243 else
a0d0e21e 4244 return;
a687059c
LW
4245}
4246
4247/*
4248 - regcurly - a little FSA that accepts {\d+,?\d*}
4249 */
79072805 4250STATIC I32
cea2e8a9 4251S_regcurly(pTHX_ register char *s)
a687059c
LW
4252{
4253 if (*s++ != '{')
4254 return FALSE;
f0fcb552 4255 if (!isDIGIT(*s))
a687059c 4256 return FALSE;
f0fcb552 4257 while (isDIGIT(*s))
a687059c
LW
4258 s++;
4259 if (*s == ',')
4260 s++;
f0fcb552 4261 while (isDIGIT(*s))
a687059c
LW
4262 s++;
4263 if (*s != '}')
4264 return FALSE;
4265 return TRUE;
4266}
4267
a687059c 4268
8fa7f367
JH
4269#ifdef DEBUGGING
4270
76e3520e 4271STATIC regnode *
cea2e8a9 4272S_dumpuntil(pTHX_ regnode *start, regnode *node, regnode *last, SV* sv, I32 l)
c277df42 4273{
f248d071 4274 register U8 op = EXACT; /* Arbitrary non-END op. */
155aba94 4275 register regnode *next;
c277df42
IZ
4276
4277 while (op != END && (!last || node < last)) {
4278 /* While that wasn't END last time... */
4279
4280 NODE_ALIGN(node);
4281 op = OP(node);
4282 if (op == CLOSE)
4283 l--;
4284 next = regnext(node);
4285 /* Where, what. */
4286 if (OP(node) == OPTIMIZED)
4287 goto after_print;
4288 regprop(sv, node);
b900a521 4289 PerlIO_printf(Perl_debug_log, "%4"IVdf":%*s%s", (IV)(node - start),
f1dbda3d 4290 (int)(2*l + 1), "", SvPVX(sv));
c277df42
IZ
4291 if (next == NULL) /* Next ptr. */
4292 PerlIO_printf(Perl_debug_log, "(0)");
b81d288d 4293 else
b900a521 4294 PerlIO_printf(Perl_debug_log, "(%"IVdf")", (IV)(next - start));
c277df42
IZ
4295 (void)PerlIO_putc(Perl_debug_log, '\n');
4296 after_print:
22c35a8c 4297 if (PL_regkind[(U8)op] == BRANCHJ) {
b81d288d
AB
4298 register regnode *nnode = (OP(next) == LONGJMP
4299 ? regnext(next)
c277df42
IZ
4300 : next);
4301 if (last && nnode > last)
4302 nnode = last;
4303 node = dumpuntil(start, NEXTOPER(NEXTOPER(node)), nnode, sv, l + 1);
a0ed51b3 4304 }
22c35a8c 4305 else if (PL_regkind[(U8)op] == BRANCH) {
c277df42 4306 node = dumpuntil(start, NEXTOPER(node), next, sv, l + 1);
a0ed51b3
LW
4307 }
4308 else if ( op == CURLY) { /* `next' might be very big: optimizer */
c277df42
IZ
4309 node = dumpuntil(start, NEXTOPER(node) + EXTRA_STEP_2ARGS,
4310 NEXTOPER(node) + EXTRA_STEP_2ARGS + 1, sv, l + 1);
a0ed51b3 4311 }
22c35a8c 4312 else if (PL_regkind[(U8)op] == CURLY && op != CURLYX) {
c277df42
IZ
4313 node = dumpuntil(start, NEXTOPER(node) + EXTRA_STEP_2ARGS,
4314 next, sv, l + 1);
a0ed51b3
LW
4315 }
4316 else if ( op == PLUS || op == STAR) {
c277df42 4317 node = dumpuntil(start, NEXTOPER(node), NEXTOPER(node) + 1, sv, l + 1);
a0ed51b3
LW
4318 }
4319 else if (op == ANYOF) {
4f66b38d
HS
4320 /* arglen 1 + class block */
4321 node += 1 + ((ANYOF_FLAGS(node) & ANYOF_LARGE)
4322 ? ANYOF_CLASS_SKIP : ANYOF_SKIP);
4323 node = NEXTOPER(node);
a0ed51b3 4324 }
22c35a8c 4325 else if (PL_regkind[(U8)op] == EXACT) {
c277df42 4326 /* Literal string, where present. */
cd439c50 4327 node += NODE_SZ_STR(node) - 1;
c277df42 4328 node = NEXTOPER(node);
a0ed51b3
LW
4329 }
4330 else {
c277df42
IZ
4331 node = NEXTOPER(node);
4332 node += regarglen[(U8)op];
4333 }
4334 if (op == CURLYX || op == OPEN)
4335 l++;
4336 else if (op == WHILEM)
4337 l--;
4338 }
4339 return node;
4340}
4341
8fa7f367
JH
4342#endif /* DEBUGGING */
4343
a687059c 4344/*
fd181c75 4345 - regdump - dump a regexp onto Perl_debug_log in vaguely comprehensible form
a687059c
LW
4346 */
4347void
864dbfa3 4348Perl_regdump(pTHX_ regexp *r)
a687059c 4349{
35ff7856 4350#ifdef DEBUGGING
46fc3d4c 4351 SV *sv = sv_newmortal();
a687059c 4352
c277df42 4353 (void)dumpuntil(r->program, r->program + 1, NULL, sv, 0);
a0d0e21e
LW
4354
4355 /* Header fields of interest. */
c277df42 4356 if (r->anchored_substr)
7b0972df 4357 PerlIO_printf(Perl_debug_log,
b81d288d 4358 "anchored `%s%.*s%s'%s at %"IVdf" ",
3280af22 4359 PL_colors[0],
7b0972df 4360 (int)(SvCUR(r->anchored_substr) - (SvTAIL(r->anchored_substr)!=0)),
b81d288d 4361 SvPVX(r->anchored_substr),
3280af22 4362 PL_colors[1],
c277df42 4363 SvTAIL(r->anchored_substr) ? "$" : "",
7b0972df 4364 (IV)r->anchored_offset);
c277df42 4365 if (r->float_substr)
7b0972df 4366 PerlIO_printf(Perl_debug_log,
b81d288d 4367 "floating `%s%.*s%s'%s at %"IVdf"..%"UVuf" ",
3280af22 4368 PL_colors[0],
b81d288d 4369 (int)(SvCUR(r->float_substr) - (SvTAIL(r->float_substr)!=0)),
2c2d71f5 4370 SvPVX(r->float_substr),
3280af22 4371 PL_colors[1],
c277df42 4372 SvTAIL(r->float_substr) ? "$" : "",
7b0972df 4373 (IV)r->float_min_offset, (UV)r->float_max_offset);
c277df42 4374 if (r->check_substr)
b81d288d
AB
4375 PerlIO_printf(Perl_debug_log,
4376 r->check_substr == r->float_substr
c277df42
IZ
4377 ? "(checking floating" : "(checking anchored");
4378 if (r->reganch & ROPT_NOSCAN)
4379 PerlIO_printf(Perl_debug_log, " noscan");
4380 if (r->reganch & ROPT_CHECK_ALL)
4381 PerlIO_printf(Perl_debug_log, " isall");
4382 if (r->check_substr)
4383 PerlIO_printf(Perl_debug_log, ") ");
4384
46fc3d4c 4385 if (r->regstclass) {
4386 regprop(sv, r->regstclass);
4387 PerlIO_printf(Perl_debug_log, "stclass `%s' ", SvPVX(sv));
4388 }
774d564b 4389 if (r->reganch & ROPT_ANCH) {
4390 PerlIO_printf(Perl_debug_log, "anchored");
4391 if (r->reganch & ROPT_ANCH_BOL)
4392 PerlIO_printf(Perl_debug_log, "(BOL)");
c277df42
IZ
4393 if (r->reganch & ROPT_ANCH_MBOL)
4394 PerlIO_printf(Perl_debug_log, "(MBOL)");
cad2e5aa
JH
4395 if (r->reganch & ROPT_ANCH_SBOL)
4396 PerlIO_printf(Perl_debug_log, "(SBOL)");
774d564b 4397 if (r->reganch & ROPT_ANCH_GPOS)
4398 PerlIO_printf(Perl_debug_log, "(GPOS)");
4399 PerlIO_putc(Perl_debug_log, ' ');
4400 }
c277df42
IZ
4401 if (r->reganch & ROPT_GPOS_SEEN)
4402 PerlIO_printf(Perl_debug_log, "GPOS ");
a0d0e21e 4403 if (r->reganch & ROPT_SKIP)
760ac839 4404 PerlIO_printf(Perl_debug_log, "plus ");
a0d0e21e 4405 if (r->reganch & ROPT_IMPLICIT)
760ac839 4406 PerlIO_printf(Perl_debug_log, "implicit ");
760ac839 4407 PerlIO_printf(Perl_debug_log, "minlen %ld ", (long) r->minlen);
ce862d02
IZ
4408 if (r->reganch & ROPT_EVAL_SEEN)
4409 PerlIO_printf(Perl_debug_log, "with eval ");
760ac839 4410 PerlIO_printf(Perl_debug_log, "\n");
fac92740
MJD
4411 if (r->offsets) {
4412 U32 i;
4413 U32 len = r->offsets[0];
392fbf5d 4414 PerlIO_printf(Perl_debug_log, "Offsets: [%"UVuf"]\n\t", (UV)r->offsets[0]);
fac92740 4415 for (i = 1; i <= len; i++)
392fbf5d
RB
4416 PerlIO_printf(Perl_debug_log, "%"UVuf"[%"UVuf"] ",
4417 (UV)r->offsets[i*2-1],
4418 (UV)r->offsets[i*2]);
fac92740
MJD
4419 PerlIO_printf(Perl_debug_log, "\n");
4420 }
17c3b450 4421#endif /* DEBUGGING */
a687059c
LW
4422}
4423
8fa7f367
JH
4424#ifdef DEBUGGING
4425
653099ff
GS
4426STATIC void
4427S_put_byte(pTHX_ SV *sv, int c)
4428{
7be5a6cf 4429 if (isCNTRL(c) || c == 255 || !isPRINT(c))
653099ff
GS
4430 Perl_sv_catpvf(aTHX_ sv, "\\%o", c);
4431 else if (c == '-' || c == ']' || c == '\\' || c == '^')
4432 Perl_sv_catpvf(aTHX_ sv, "\\%c", c);
4433 else
4434 Perl_sv_catpvf(aTHX_ sv, "%c", c);
4435}
4436
8fa7f367
JH
4437#endif /* DEBUGGING */
4438
a687059c 4439/*
a0d0e21e
LW
4440- regprop - printable representation of opcode
4441*/
46fc3d4c 4442void
864dbfa3 4443Perl_regprop(pTHX_ SV *sv, regnode *o)
a687059c 4444{
35ff7856 4445#ifdef DEBUGGING
9b155405 4446 register int k;
a0d0e21e 4447
54dc92de 4448 sv_setpvn(sv, "", 0);
9b155405 4449 if (OP(o) >= reg_num) /* regnode.type is unsigned */
830247a4
IZ
4450 /* It would be nice to FAIL() here, but this may be called from
4451 regexec.c, and it would be hard to supply pRExC_state. */
4452 Perl_croak(aTHX_ "Corrupted regexp opcode");
9b155405
IZ
4453 sv_catpv(sv, (char*)reg_name[OP(o)]); /* Take off const! */
4454
4455 k = PL_regkind[(U8)OP(o)];
4456
2a782b5b
JH
4457 if (k == EXACT) {
4458 SV *dsv = sv_2mortal(newSVpvn("", 0));
df1ffd02 4459 bool do_utf8 = DO_UTF8(sv);
8a989385 4460 char *s = do_utf8 ?
2a782b5b
JH
4461 pv_uni_display(dsv, (U8*)STRING(o), STR_LEN(o), 60, 0) :
4462 STRING(o);
40eddc46 4463 int len = do_utf8 ?
2a782b5b
JH
4464 strlen(s) :
4465 STR_LEN(o);
4466 Perl_sv_catpvf(aTHX_ sv, " <%s%.*s%s>",
4467 PL_colors[0],
4468 len, s,
4469 PL_colors[1]);
4470 }
9b155405 4471 else if (k == CURLY) {
cb434fcc 4472 if (OP(o) == CURLYM || OP(o) == CURLYN || OP(o) == CURLYX)
cea2e8a9
GS
4473 Perl_sv_catpvf(aTHX_ sv, "[%d]", o->flags); /* Parenth number */
4474 Perl_sv_catpvf(aTHX_ sv, " {%d,%d}", ARG1(o), ARG2(o));
a0d0e21e 4475 }
2c2d71f5
JH
4476 else if (k == WHILEM && o->flags) /* Ordinal/of */
4477 Perl_sv_catpvf(aTHX_ sv, "[%d/%d]", o->flags & 0xf, o->flags>>4);
9b155405 4478 else if (k == REF || k == OPEN || k == CLOSE || k == GROUPP )
894356b3 4479 Perl_sv_catpvf(aTHX_ sv, "%d", (int)ARG(o)); /* Parenth number */
9b155405 4480 else if (k == LOGICAL)
04ebc1ab 4481 Perl_sv_catpvf(aTHX_ sv, "[%d]", o->flags); /* 2: embedded, otherwise 1 */
653099ff
GS
4482 else if (k == ANYOF) {
4483 int i, rangestart = -1;
ffc61ed2 4484 U8 flags = ANYOF_FLAGS(o);
19860706
JH
4485 const char * const anyofs[] = { /* Should be syncronized with
4486 * ANYOF_ #xdefines in regcomp.h */
653099ff
GS
4487 "\\w",
4488 "\\W",
4489 "\\s",
4490 "\\S",
4491 "\\d",
4492 "\\D",
4493 "[:alnum:]",
4494 "[:^alnum:]",
4495 "[:alpha:]",
4496 "[:^alpha:]",
4497 "[:ascii:]",
4498 "[:^ascii:]",
4499 "[:ctrl:]",
4500 "[:^ctrl:]",
4501 "[:graph:]",
4502 "[:^graph:]",
4503 "[:lower:]",
4504 "[:^lower:]",
4505 "[:print:]",
4506 "[:^print:]",
4507 "[:punct:]",
4508 "[:^punct:]",
4509 "[:upper:]",
aaa51d5e 4510 "[:^upper:]",
653099ff 4511 "[:xdigit:]",
aaa51d5e
JF
4512 "[:^xdigit:]",
4513 "[:space:]",
4514 "[:^space:]",
4515 "[:blank:]",
4516 "[:^blank:]"
653099ff
GS
4517 };
4518
19860706 4519 if (flags & ANYOF_LOCALE)
653099ff 4520 sv_catpv(sv, "{loc}");
19860706 4521 if (flags & ANYOF_FOLD)
653099ff
GS
4522 sv_catpv(sv, "{i}");
4523 Perl_sv_catpvf(aTHX_ sv, "[%s", PL_colors[0]);
19860706 4524 if (flags & ANYOF_INVERT)
653099ff 4525 sv_catpv(sv, "^");
ffc61ed2
JH
4526 for (i = 0; i <= 256; i++) {
4527 if (i < 256 && ANYOF_BITMAP_TEST(o,i)) {
4528 if (rangestart == -1)
4529 rangestart = i;
4530 } else if (rangestart != -1) {
4531 if (i <= rangestart + 3)
4532 for (; rangestart < i; rangestart++)
653099ff 4533 put_byte(sv, rangestart);
ffc61ed2
JH
4534 else {
4535 put_byte(sv, rangestart);
4536 sv_catpv(sv, "-");
4537 put_byte(sv, i - 1);
653099ff 4538 }
ffc61ed2 4539 rangestart = -1;
653099ff 4540 }
847a199f 4541 }
ffc61ed2
JH
4542
4543 if (o->flags & ANYOF_CLASS)
4544 for (i = 0; i < sizeof(anyofs)/sizeof(char*); i++)
4545 if (ANYOF_CLASS_TEST(o,i))
4546 sv_catpv(sv, anyofs[i]);
4547
4548 if (flags & ANYOF_UNICODE)
4549 sv_catpv(sv, "{unicode}");
1aa99e6b 4550 else if (flags & ANYOF_UNICODE_ALL)
2a782b5b 4551 sv_catpv(sv, "{unicode_all}");
ffc61ed2
JH
4552
4553 {
4554 SV *lv;
4555 SV *sw = regclass_swash(o, FALSE, &lv);
b81d288d 4556
ffc61ed2
JH
4557 if (lv) {
4558 if (sw) {
4559 UV i;
4560 U8 s[UTF8_MAXLEN+1];
b81d288d 4561
ffc61ed2 4562 for (i = 0; i <= 256; i++) { /* just the first 256 */
2b9d42f0 4563 U8 *e = uvchr_to_utf8(s, i);
ffc61ed2 4564
3568d838 4565 if (i < 256 && swash_fetch(sw, s, TRUE)) {
ffc61ed2
JH
4566 if (rangestart == -1)
4567 rangestart = i;
4568 } else if (rangestart != -1) {
4569 U8 *p;
b81d288d 4570
ffc61ed2
JH
4571 if (i <= rangestart + 3)
4572 for (; rangestart < i; rangestart++) {
2b9d42f0 4573 for(e = uvchr_to_utf8(s, rangestart), p = s; p < e; p++)
ffc61ed2
JH
4574 put_byte(sv, *p);
4575 }
4576 else {
2b9d42f0 4577 for (e = uvchr_to_utf8(s, rangestart), p = s; p < e; p++)
ffc61ed2
JH
4578 put_byte(sv, *p);
4579 sv_catpv(sv, "-");
2b9d42f0 4580 for (e = uvchr_to_utf8(s, i - 1), p = s; p < e; p++)
ffc61ed2
JH
4581 put_byte(sv, *p);
4582 }
4583 rangestart = -1;
4584 }
19860706 4585 }
ffc61ed2
JH
4586
4587 sv_catpv(sv, "..."); /* et cetera */
19860706 4588 }
fde631ed 4589
ffc61ed2
JH
4590 {
4591 char *s = savepv(SvPVX(lv));
4592 char *origs = s;
b81d288d 4593
ffc61ed2 4594 while(*s && *s != '\n') s++;
b81d288d 4595
ffc61ed2
JH
4596 if (*s == '\n') {
4597 char *t = ++s;
4598
4599 while (*s) {
4600 if (*s == '\n')
4601 *s = ' ';
4602 s++;
4603 }
4604 if (s[-1] == ' ')
4605 s[-1] = 0;
4606
4607 sv_catpv(sv, t);
fde631ed 4608 }
b81d288d 4609
ffc61ed2 4610 Safefree(origs);
fde631ed
JH
4611 }
4612 }
653099ff 4613 }
ffc61ed2 4614
653099ff
GS
4615 Perl_sv_catpvf(aTHX_ sv, "%s]", PL_colors[1]);
4616 }
9b155405 4617 else if (k == BRANCHJ && (OP(o) == UNLESSM || OP(o) == IFMATCH))
cea2e8a9 4618 Perl_sv_catpvf(aTHX_ sv, "[-%d]", o->flags);
17c3b450 4619#endif /* DEBUGGING */
35ff7856 4620}
a687059c 4621
cad2e5aa
JH
4622SV *
4623Perl_re_intuit_string(pTHX_ regexp *prog)
4624{ /* Assume that RE_INTUIT is set */
4625 DEBUG_r(
4626 { STRLEN n_a;
4627 char *s = SvPV(prog->check_substr,n_a);
4628
4629 if (!PL_colorset) reginitcolors();
4630 PerlIO_printf(Perl_debug_log,
4631 "%sUsing REx substr:%s `%s%.60s%s%s'\n",
4632 PL_colors[4],PL_colors[5],PL_colors[0],
4633 s,
4634 PL_colors[1],
4635 (strlen(s) > 60 ? "..." : ""));
4636 } );
4637
4638 return prog->check_substr;
4639}
4640
2b69d0c2 4641void
864dbfa3 4642Perl_pregfree(pTHX_ struct regexp *r)
a687059c 4643{
adac82c7 4644 DEBUG_r(if (!PL_colorset) reginitcolors());
7821416a
IZ
4645
4646 if (!r || (--r->refcnt > 0))
4647 return;
cad2e5aa
JH
4648 DEBUG_r(PerlIO_printf(Perl_debug_log,
4649 "%sFreeing REx:%s `%s%.60s%s%s'\n",
4650 PL_colors[4],PL_colors[5],PL_colors[0],
4651 r->precomp,
4652 PL_colors[1],
4653 (strlen(r->precomp) > 60 ? "..." : "")));
4654
c277df42 4655 if (r->precomp)
a0d0e21e 4656 Safefree(r->precomp);
fac92740
MJD
4657 if (r->offsets) /* 20010421 MJD */
4658 Safefree(r->offsets);
cf93c79d
IZ
4659 if (RX_MATCH_COPIED(r))
4660 Safefree(r->subbeg);
a193d654
GS
4661 if (r->substrs) {
4662 if (r->anchored_substr)
4663 SvREFCNT_dec(r->anchored_substr);
4664 if (r->float_substr)
4665 SvREFCNT_dec(r->float_substr);
2779dcf1 4666 Safefree(r->substrs);
a193d654 4667 }
c277df42
IZ
4668 if (r->data) {
4669 int n = r->data->count;
dfad63ad
HS
4670 AV* new_comppad = NULL;
4671 AV* old_comppad;
4672 SV** old_curpad;
4673
c277df42 4674 while (--n >= 0) {
261faec3 4675 /* If you add a ->what type here, update the comment in regcomp.h */
c277df42
IZ
4676 switch (r->data->what[n]) {
4677 case 's':
4678 SvREFCNT_dec((SV*)r->data->data[n]);
4679 break;
653099ff
GS
4680 case 'f':
4681 Safefree(r->data->data[n]);
4682 break;
dfad63ad
HS
4683 case 'p':
4684 new_comppad = (AV*)r->data->data[n];
4685 break;
c277df42 4686 case 'o':
dfad63ad 4687 if (new_comppad == NULL)
cea2e8a9 4688 Perl_croak(aTHX_ "panic: pregfree comppad");
dfad63ad
HS
4689 old_comppad = PL_comppad;
4690 old_curpad = PL_curpad;
1e6dc0b6
SB
4691 /* Watch out for global destruction's random ordering. */
4692 if (SvTYPE(new_comppad) == SVt_PVAV) {
4693 PL_comppad = new_comppad;
4694 PL_curpad = AvARRAY(new_comppad);
4695 }
4696 else
4697 PL_curpad = NULL;
9b978d73
DM
4698
4699 if (!OpREFCNT_dec((OP_4tree*)r->data->data[n])) {
4700 op_free((OP_4tree*)r->data->data[n]);
4701 }
4702
dfad63ad
HS
4703 PL_comppad = old_comppad;
4704 PL_curpad = old_curpad;
4705 SvREFCNT_dec((SV*)new_comppad);
4706 new_comppad = NULL;
c277df42
IZ
4707 break;
4708 case 'n':
4709 break;
4710 default:
830247a4 4711 Perl_croak(aTHX_ "panic: regfree data code '%c'", r->data->what[n]);
c277df42
IZ
4712 }
4713 }
4714 Safefree(r->data->what);
4715 Safefree(r->data);
a0d0e21e
LW
4716 }
4717 Safefree(r->startp);
4718 Safefree(r->endp);
4719 Safefree(r);
a687059c 4720}
c277df42
IZ
4721
4722/*
4723 - regnext - dig the "next" pointer out of a node
4724 *
4725 * [Note, when REGALIGN is defined there are two places in regmatch()
4726 * that bypass this code for speed.]
4727 */
4728regnode *
864dbfa3 4729Perl_regnext(pTHX_ register regnode *p)
c277df42
IZ
4730{
4731 register I32 offset;
4732
3280af22 4733 if (p == &PL_regdummy)
c277df42
IZ
4734 return(NULL);
4735
4736 offset = (reg_off_by_arg[OP(p)] ? ARG(p) : NEXT_OFF(p));
4737 if (offset == 0)
4738 return(NULL);
4739
c277df42 4740 return(p+offset);
c277df42
IZ
4741}
4742
01f988be 4743STATIC void
cea2e8a9 4744S_re_croak2(pTHX_ const char* pat1,const char* pat2,...)
c277df42
IZ
4745{
4746 va_list args;
4747 STRLEN l1 = strlen(pat1);
4748 STRLEN l2 = strlen(pat2);
4749 char buf[512];
06bf62c7 4750 SV *msv;
c277df42
IZ
4751 char *message;
4752
4753 if (l1 > 510)
4754 l1 = 510;
4755 if (l1 + l2 > 510)
4756 l2 = 510 - l1;
4757 Copy(pat1, buf, l1 , char);
4758 Copy(pat2, buf + l1, l2 , char);
3b818b81
GS
4759 buf[l1 + l2] = '\n';
4760 buf[l1 + l2 + 1] = '\0';
8736538c
AS
4761#ifdef I_STDARG
4762 /* ANSI variant takes additional second argument */
c277df42 4763 va_start(args, pat2);
8736538c
AS
4764#else
4765 va_start(args);
4766#endif
5a844595 4767 msv = vmess(buf, &args);
c277df42 4768 va_end(args);
06bf62c7 4769 message = SvPV(msv,l1);
c277df42
IZ
4770 if (l1 > 512)
4771 l1 = 512;
4772 Copy(message, buf, l1 , char);
4773 buf[l1] = '\0'; /* Overwrite \n */
cea2e8a9 4774 Perl_croak(aTHX_ "%s", buf);
c277df42 4775}
a0ed51b3
LW
4776
4777/* XXX Here's a total kludge. But we need to re-enter for swash routines. */
4778
4779void
864dbfa3 4780Perl_save_re_context(pTHX)
b81d288d 4781{
830247a4
IZ
4782#if 0
4783 SAVEPPTR(RExC_precomp); /* uncompiled string. */
4784 SAVEI32(RExC_npar); /* () count. */
4785 SAVEI32(RExC_size); /* Code size. */
4786 SAVEI16(RExC_flags16); /* are we folding, multilining? */
4787 SAVEVPTR(RExC_rx); /* from regcomp.c */
4788 SAVEI32(RExC_seen); /* from regcomp.c */
4789 SAVEI32(RExC_sawback); /* Did we see \1, ...? */
4790 SAVEI32(RExC_naughty); /* How bad is this pattern? */
4791 SAVEVPTR(RExC_emit); /* Code-emit pointer; &regdummy = don't */
4792 SAVEPPTR(RExC_end); /* End of input for compile */
4793 SAVEPPTR(RExC_parse); /* Input-scan pointer. */
4794#endif
4795
4796 SAVEI32(PL_reg_flags); /* from regexec.c */
a0ed51b3 4797 SAVEPPTR(PL_bostr);
a0ed51b3
LW
4798 SAVEPPTR(PL_reginput); /* String-input pointer. */
4799 SAVEPPTR(PL_regbol); /* Beginning of input, for ^ check. */
4800 SAVEPPTR(PL_regeol); /* End of input, for $ check. */
7766f137
GS
4801 SAVEVPTR(PL_regstartp); /* Pointer to startp array. */
4802 SAVEVPTR(PL_regendp); /* Ditto for endp. */
4803 SAVEVPTR(PL_reglastparen); /* Similarly for lastparen. */
a0ed51b3 4804 SAVEPPTR(PL_regtill); /* How far we are required to go. */
b81d288d 4805 SAVEGENERICPV(PL_reg_start_tmp); /* from regexec.c */
a0ed51b3 4806 PL_reg_start_tmp = 0;
a0ed51b3
LW
4807 SAVEI32(PL_reg_start_tmpl); /* from regexec.c */
4808 PL_reg_start_tmpl = 0;
7766f137 4809 SAVEVPTR(PL_regdata);
a0ed51b3
LW
4810 SAVEI32(PL_reg_eval_set); /* from regexec.c */
4811 SAVEI32(PL_regnarrate); /* from regexec.c */
7766f137 4812 SAVEVPTR(PL_regprogram); /* from regexec.c */
a0ed51b3 4813 SAVEINT(PL_regindent); /* from regexec.c */
7766f137
GS
4814 SAVEVPTR(PL_regcc); /* from regexec.c */
4815 SAVEVPTR(PL_curcop);
7766f137
GS
4816 SAVEVPTR(PL_reg_call_cc); /* from regexec.c */
4817 SAVEVPTR(PL_reg_re); /* from regexec.c */
54b6e2fa
IZ
4818 SAVEPPTR(PL_reg_ganch); /* from regexec.c */
4819 SAVESPTR(PL_reg_sv); /* from regexec.c */
53c4c00c 4820 SAVEI8(PL_reg_match_utf8); /* from regexec.c */
7766f137 4821 SAVEVPTR(PL_reg_magic); /* from regexec.c */
54b6e2fa 4822 SAVEI32(PL_reg_oldpos); /* from regexec.c */
7766f137
GS
4823 SAVEVPTR(PL_reg_oldcurpm); /* from regexec.c */
4824 SAVEVPTR(PL_reg_curpm); /* from regexec.c */
5fb7366e 4825 SAVEI32(PL_regnpar); /* () count. */
e49a9654 4826 SAVEI32(PL_regsize); /* from regexec.c */
54b6e2fa 4827#ifdef DEBUGGING
b81d288d 4828 SAVEPPTR(PL_reg_starttry); /* from regexec.c */
54b6e2fa 4829#endif
a0ed51b3 4830}
51371543 4831
51371543 4832static void
acfe0abc 4833clear_re(pTHX_ void *r)
51371543
GS
4834{
4835 ReREFCNT_dec((regexp *)r);
4836}
ffbc6a93 4837