This is a live mirror of the Perl 5 development currently hosted at https://github.com/perl/perl5
glob-basic.t, runenv.t fix-ups
[perl5.git] / regcomp.c
CommitLineData
a0d0e21e
LW
1/* regcomp.c
2 */
3
4/*
5 * "A fair jaw-cracker dwarf-language must be." --Samwise Gamgee
6 */
7
a687059c
LW
8/* NOTE: this is derived from Henry Spencer's regexp code, and should not
9 * confused with the original package (see point 3 below). Thanks, Henry!
10 */
11
12/* Additional note: this code is very heavily munged from Henry's version
13 * in places. In some spots I've traded clarity for efficiency, so don't
14 * blame Henry for some of the lack of readability.
15 */
16
e50aee73
AD
17/* The names of the functions have been changed from regcomp and
18 * regexec to pregcomp and pregexec in order to avoid conflicts
19 * with the POSIX routines of the same names.
20*/
21
b9d5759e
AD
22#ifdef PERL_EXT_RE_BUILD
23/* need to replace pregcomp et al, so enable that */
24# ifndef PERL_IN_XSUB_RE
25# define PERL_IN_XSUB_RE
26# endif
27/* need access to debugger hooks */
cad2e5aa 28# if defined(PERL_EXT_RE_DEBUG) && !defined(DEBUGGING)
b9d5759e
AD
29# define DEBUGGING
30# endif
31#endif
32
33#ifdef PERL_IN_XSUB_RE
d06ea78c 34/* We *really* need to overwrite these symbols: */
56953603
IZ
35# define Perl_pregcomp my_regcomp
36# define Perl_regdump my_regdump
37# define Perl_regprop my_regprop
d06ea78c 38# define Perl_pregfree my_regfree
cad2e5aa
JH
39# define Perl_re_intuit_string my_re_intuit_string
40/* *These* symbols are masked to allow static link. */
d06ea78c 41# define Perl_regnext my_regnext
f0b8d043 42# define Perl_save_re_context my_save_re_context
b81d288d 43# define Perl_reginitcolors my_reginitcolors
c5be433b
GS
44
45# define PERL_NO_GET_CONTEXT
b81d288d 46#endif
56953603 47
f0fcb552 48/*SUPPRESS 112*/
a687059c 49/*
e50aee73 50 * pregcomp and pregexec -- regsub and regerror are not used in perl
a687059c
LW
51 *
52 * Copyright (c) 1986 by University of Toronto.
53 * Written by Henry Spencer. Not derived from licensed software.
54 *
55 * Permission is granted to anyone to use this software for any
56 * purpose on any computer system, and to redistribute it freely,
57 * subject to the following restrictions:
58 *
59 * 1. The author is not responsible for the consequences of use of
60 * this software, no matter how awful, even if they arise
61 * from defects in it.
62 *
63 * 2. The origin of this software must not be misrepresented, either
64 * by explicit claim or by omission.
65 *
66 * 3. Altered versions must be plainly marked as such, and must not
67 * be misrepresented as being the original software.
68 *
69 *
70 **** Alterations to Henry's code are...
71 ****
bc89e66f 72 **** Copyright (c) 1991-2001, Larry Wall
a687059c 73 ****
9ef589d8
LW
74 **** You may distribute under the terms of either the GNU General Public
75 **** License or the Artistic License, as specified in the README file.
76
a687059c
LW
77 *
78 * Beware that some of this code is subtly aware of the way operator
79 * precedence is structured in regular expressions. Serious changes in
80 * regular-expression syntax might require a total rethink.
81 */
82#include "EXTERN.h"
864dbfa3 83#define PERL_IN_REGCOMP_C
a687059c 84#include "perl.h"
d06ea78c 85
c5be433b
GS
86#ifdef PERL_IN_XSUB_RE
87# if defined(PERL_CAPI) || defined(PERL_OBJECT)
88# include "XSUB.h"
89# endif
90#else
d06ea78c
GS
91# include "INTERN.h"
92#endif
c277df42
IZ
93
94#define REG_COMP_C
a687059c
LW
95#include "regcomp.h"
96
d4cce5f1 97#ifdef op
11343788 98#undef op
d4cce5f1 99#endif /* op */
11343788 100
fe14fcc3
LW
101#ifdef MSDOS
102# if defined(BUGGY_MSC6)
103 /* MSC 6.00A breaks on op/regexp.t test 85 unless we turn this off */
104 # pragma optimize("a",off)
105 /* But MSC 6.00A is happy with 'w', for aliases only across function calls*/
106 # pragma optimize("w",on )
107# endif /* BUGGY_MSC6 */
108#endif /* MSDOS */
109
a687059c
LW
110#ifndef STATIC
111#define STATIC static
112#endif
113
830247a4
IZ
114typedef struct RExC_state_t {
115 U16 flags16; /* are we folding, multilining? */
116 char *precomp; /* uncompiled string. */
117 regexp *rx;
118 char *end; /* End of input for compile */
119 char *parse; /* Input-scan pointer. */
120 I32 whilem_seen; /* number of WHILEM in this expr */
ffc61ed2 121 regnode *emit; /* Code-emit pointer; &regdummy = don't = compiling */
830247a4
IZ
122 I32 naughty; /* How bad is this pattern? */
123 I32 sawback; /* Did we see \1, ...? */
124 U32 seen;
125 I32 size; /* Code size. */
126 I32 npar; /* () count. */
127 I32 extralen;
128 I32 seen_zerolen;
129 I32 seen_evals;
1aa99e6b 130 I32 utf8;
830247a4
IZ
131#if ADD_TO_REGEXEC
132 char *starttry; /* -Dr: where regtry was called. */
133#define RExC_starttry (pRExC_state->starttry)
134#endif
135} RExC_state_t;
136
137#define RExC_flags16 (pRExC_state->flags16)
138#define RExC_precomp (pRExC_state->precomp)
139#define RExC_rx (pRExC_state->rx)
140#define RExC_end (pRExC_state->end)
141#define RExC_parse (pRExC_state->parse)
142#define RExC_whilem_seen (pRExC_state->whilem_seen)
143#define RExC_emit (pRExC_state->emit)
144#define RExC_naughty (pRExC_state->naughty)
145#define RExC_sawback (pRExC_state->sawback)
146#define RExC_seen (pRExC_state->seen)
147#define RExC_size (pRExC_state->size)
148#define RExC_npar (pRExC_state->npar)
149#define RExC_extralen (pRExC_state->extralen)
150#define RExC_seen_zerolen (pRExC_state->seen_zerolen)
151#define RExC_seen_evals (pRExC_state->seen_evals)
1aa99e6b 152#define RExC_utf8 (pRExC_state->utf8)
830247a4 153
a687059c
LW
154#define ISMULT1(c) ((c) == '*' || (c) == '+' || (c) == '?')
155#define ISMULT2(s) ((*s) == '*' || (*s) == '+' || (*s) == '?' || \
156 ((*s) == '{' && regcurly(s)))
a687059c 157
35c8bce7
LW
158#ifdef SPSTART
159#undef SPSTART /* dratted cpp namespace... */
160#endif
a687059c
LW
161/*
162 * Flags to be passed up and down.
163 */
a687059c 164#define WORST 0 /* Worst case. */
821b33a5 165#define HASWIDTH 0x1 /* Known to match non-null strings. */
a0d0e21e
LW
166#define SIMPLE 0x2 /* Simple enough to be STAR/PLUS operand. */
167#define SPSTART 0x4 /* Starts with * or +. */
168#define TRYAGAIN 0x8 /* Weeded out a declaration. */
a687059c 169
2c2d71f5
JH
170/* Length of a variant. */
171
172typedef struct scan_data_t {
173 I32 len_min;
174 I32 len_delta;
175 I32 pos_min;
176 I32 pos_delta;
177 SV *last_found;
178 I32 last_end; /* min value, <0 unless valid. */
179 I32 last_start_min;
180 I32 last_start_max;
181 SV **longest; /* Either &l_fixed, or &l_float. */
182 SV *longest_fixed;
183 I32 offset_fixed;
184 SV *longest_float;
185 I32 offset_float_min;
186 I32 offset_float_max;
187 I32 flags;
188 I32 whilem_c;
cb434fcc 189 I32 *last_closep;
653099ff 190 struct regnode_charclass_class *start_class;
2c2d71f5
JH
191} scan_data_t;
192
a687059c 193/*
e50aee73 194 * Forward declarations for pregcomp()'s friends.
a687059c 195 */
a0d0e21e 196
b81d288d 197static scan_data_t zero_scan_data = { 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
cb434fcc 198 0, 0, 0, 0, 0, 0};
c277df42
IZ
199
200#define SF_BEFORE_EOL (SF_BEFORE_SEOL|SF_BEFORE_MEOL)
201#define SF_BEFORE_SEOL 0x1
202#define SF_BEFORE_MEOL 0x2
203#define SF_FIX_BEFORE_EOL (SF_FIX_BEFORE_SEOL|SF_FIX_BEFORE_MEOL)
204#define SF_FL_BEFORE_EOL (SF_FL_BEFORE_SEOL|SF_FL_BEFORE_MEOL)
205
09b7f37c
CB
206#ifdef NO_UNARY_PLUS
207# define SF_FIX_SHIFT_EOL (0+2)
208# define SF_FL_SHIFT_EOL (0+4)
209#else
210# define SF_FIX_SHIFT_EOL (+2)
211# define SF_FL_SHIFT_EOL (+4)
212#endif
c277df42
IZ
213
214#define SF_FIX_BEFORE_SEOL (SF_BEFORE_SEOL << SF_FIX_SHIFT_EOL)
215#define SF_FIX_BEFORE_MEOL (SF_BEFORE_MEOL << SF_FIX_SHIFT_EOL)
216
217#define SF_FL_BEFORE_SEOL (SF_BEFORE_SEOL << SF_FL_SHIFT_EOL)
218#define SF_FL_BEFORE_MEOL (SF_BEFORE_MEOL << SF_FL_SHIFT_EOL) /* 0x20 */
219#define SF_IS_INF 0x40
220#define SF_HAS_PAR 0x80
221#define SF_IN_PAR 0x100
222#define SF_HAS_EVAL 0x200
4bfe0158 223#define SCF_DO_SUBSTR 0x400
653099ff
GS
224#define SCF_DO_STCLASS_AND 0x0800
225#define SCF_DO_STCLASS_OR 0x1000
226#define SCF_DO_STCLASS (SCF_DO_STCLASS_AND|SCF_DO_STCLASS_OR)
e1901655 227#define SCF_WHILEM_VISITED_POS 0x2000
c277df42 228
1aa99e6b 229#define UTF RExC_utf8
830247a4
IZ
230#define LOC (RExC_flags16 & PMf_LOCALE)
231#define FOLD (RExC_flags16 & PMf_FOLD)
a0ed51b3 232
ffc61ed2 233#define OOB_UNICODE 12345678
93733859 234#define OOB_NAMEDCLASS -1
b8c5462f 235
a0ed51b3
LW
236#define CHR_SVLEN(sv) (UTF ? sv_len_utf8(sv) : SvCUR(sv))
237#define CHR_DIST(a,b) (UTF ? utf8_distance(a,b) : a - b)
238
8615cb43 239
b45f050a
JF
240/* length of regex to show in messages that don't mark a position within */
241#define RegexLengthToShowInErrorMessages 127
242
243/*
244 * If MARKER[12] are adjusted, be sure to adjust the constants at the top
245 * of t/op/regmesg.t, the tests in t/op/re_tests, and those in
246 * op/pragma/warn/regcomp.
247 */
79eeca27
JH
248#define MARKER1 "HERE" /* marker as it appears in the description */
249#define MARKER2 " << HERE " /* marker as it appears within the regex */
b81d288d 250
79eeca27 251#define REPORT_LOCATION " before " MARKER1 " mark in regex m/%.*s" MARKER2 "%s/"
b45f050a
JF
252
253/*
254 * Calls SAVEDESTRUCTOR_X if needed, then calls Perl_croak with the given
255 * arg. Show regex, up to a maximum length. If it's too long, chop and add
256 * "...".
257 */
a4eb266f 258#define FAIL(msg) \
8615cb43 259 STMT_START { \
a4eb266f 260 char *ellipses = ""; \
830247a4 261 unsigned len = strlen(RExC_precomp); \
b45f050a 262 \
8615cb43 263 if (!SIZE_ONLY) \
830247a4 264 SAVEDESTRUCTOR_X(clear_re,(void*)RExC_rx); \
b45f050a
JF
265 \
266 if (len > RegexLengthToShowInErrorMessages) { \
267 /* chop 10 shorter than the max, to ensure meaning of "..." */ \
268 len = RegexLengthToShowInErrorMessages - 10; \
a4eb266f 269 ellipses = "..."; \
b45f050a
JF
270 } \
271 Perl_croak(aTHX_ "%s in regex m/%.*s%s/", \
830247a4 272 msg, (int)len, RExC_precomp, ellipses); \
8615cb43
JF
273 } STMT_END
274
b45f050a
JF
275/*
276 * Calls SAVEDESTRUCTOR_X if needed, then calls Perl_croak with the given
277 * args. Show regex, up to a maximum length. If it's too long, chop and add
278 * "...".
279 */
a4eb266f 280#define FAIL2(pat,msg) \
8615cb43 281 STMT_START { \
a4eb266f 282 char *ellipses = ""; \
830247a4 283 unsigned len = strlen(RExC_precomp); \
b45f050a 284 \
8615cb43 285 if (!SIZE_ONLY) \
830247a4 286 SAVEDESTRUCTOR_X(clear_re,(void*)RExC_rx); \
8615cb43 287 \
b45f050a
JF
288 if (len > RegexLengthToShowInErrorMessages) { \
289 /* chop 10 shorter than the max, to ensure meaning of "..." */ \
290 len = RegexLengthToShowInErrorMessages - 10; \
a4eb266f 291 ellipses = "..."; \
b45f050a
JF
292 } \
293 S_re_croak2(aTHX_ pat, " in regex m/%.*s%s/", \
830247a4 294 msg, (int)len, RExC_precomp, ellipses); \
b45f050a
JF
295 } STMT_END
296
297
298/*
299 * Simple_vFAIL -- like FAIL, but marks the current location in the scan
300 */
301#define Simple_vFAIL(m) \
302 STMT_START { \
830247a4 303 unsigned offset = strlen(RExC_precomp)-(RExC_end-RExC_parse); \
b45f050a
JF
304 \
305 Perl_croak(aTHX_ "%s" REPORT_LOCATION, \
830247a4 306 m, (int)offset, RExC_precomp, RExC_precomp + offset); \
b45f050a
JF
307 } STMT_END
308
309/*
310 * Calls SAVEDESTRUCTOR_X if needed, then Simple_vFAIL()
311 */
312#define vFAIL(m) \
313 STMT_START { \
314 if (!SIZE_ONLY) \
830247a4 315 SAVEDESTRUCTOR_X(clear_re,(void*)RExC_rx); \
b45f050a
JF
316 Simple_vFAIL(m); \
317 } STMT_END
318
319/*
320 * Like Simple_vFAIL(), but accepts two arguments.
321 */
322#define Simple_vFAIL2(m,a1) \
323 STMT_START { \
830247a4 324 unsigned offset = strlen(RExC_precomp)-(RExC_end-RExC_parse); \
b45f050a
JF
325 \
326 S_re_croak2(aTHX_ m, REPORT_LOCATION, a1, \
830247a4 327 (int)offset, RExC_precomp, RExC_precomp + offset); \
b45f050a
JF
328 } STMT_END
329
330/*
331 * Calls SAVEDESTRUCTOR_X if needed, then Simple_vFAIL2().
332 */
333#define vFAIL2(m,a1) \
334 STMT_START { \
335 if (!SIZE_ONLY) \
830247a4 336 SAVEDESTRUCTOR_X(clear_re,(void*)RExC_rx); \
b45f050a
JF
337 Simple_vFAIL2(m, a1); \
338 } STMT_END
339
340
341/*
342 * Like Simple_vFAIL(), but accepts three arguments.
343 */
344#define Simple_vFAIL3(m, a1, a2) \
345 STMT_START { \
830247a4 346 unsigned offset = strlen(RExC_precomp)-(RExC_end-RExC_parse); \
b45f050a
JF
347 \
348 S_re_croak2(aTHX_ m, REPORT_LOCATION, a1, a2, \
830247a4 349 (int)offset, RExC_precomp, RExC_precomp + offset); \
b45f050a
JF
350 } STMT_END
351
352/*
353 * Calls SAVEDESTRUCTOR_X if needed, then Simple_vFAIL3().
354 */
355#define vFAIL3(m,a1,a2) \
356 STMT_START { \
357 if (!SIZE_ONLY) \
830247a4 358 SAVEDESTRUCTOR_X(clear_re,(void*)RExC_rx); \
b45f050a
JF
359 Simple_vFAIL3(m, a1, a2); \
360 } STMT_END
361
362/*
363 * Like Simple_vFAIL(), but accepts four arguments.
364 */
365#define Simple_vFAIL4(m, a1, a2, a3) \
366 STMT_START { \
830247a4 367 unsigned offset = strlen(RExC_precomp)-(RExC_end-RExC_parse); \
b45f050a
JF
368 \
369 S_re_croak2(aTHX_ m, REPORT_LOCATION, a1, a2, a3,\
830247a4 370 (int)offset, RExC_precomp, RExC_precomp + offset); \
b45f050a
JF
371 } STMT_END
372
373/*
374 * Like Simple_vFAIL(), but accepts five arguments.
375 */
376#define Simple_vFAIL5(m, a1, a2, a3, a4) \
377 STMT_START { \
830247a4 378 unsigned offset = strlen(RExC_precomp)-(RExC_end-RExC_parse); \
b45f050a 379 S_re_croak2(aTHX_ m, REPORT_LOCATION, a1, a2, a3, a4,\
830247a4 380 (int)offset, RExC_precomp, RExC_precomp + offset); \
b45f050a
JF
381 } STMT_END
382
383
384#define vWARN(loc,m) \
385 STMT_START { \
830247a4 386 unsigned offset = strlen(RExC_precomp)-(RExC_end-(loc)); \
b45f050a 387 Perl_warner(aTHX_ WARN_REGEXP, "%s" REPORT_LOCATION,\
830247a4 388 m, (int)offset, RExC_precomp, RExC_precomp + offset); \
b45f050a
JF
389 } STMT_END \
390
391
392#define vWARN2(loc, m, a1) \
393 STMT_START { \
830247a4 394 unsigned offset = strlen(RExC_precomp)-(RExC_end-(loc)); \
b45f050a
JF
395 Perl_warner(aTHX_ WARN_REGEXP, m REPORT_LOCATION,\
396 a1, \
830247a4 397 (int)offset, RExC_precomp, RExC_precomp + offset); \
b45f050a
JF
398 } STMT_END
399
400#define vWARN3(loc, m, a1, a2) \
401 STMT_START { \
830247a4 402 unsigned offset = strlen(RExC_precomp) - (RExC_end - (loc)); \
b45f050a
JF
403 Perl_warner(aTHX_ WARN_REGEXP, m REPORT_LOCATION, \
404 a1, a2, \
830247a4 405 (int)offset, RExC_precomp, RExC_precomp + offset); \
b45f050a
JF
406 } STMT_END
407
408#define vWARN4(loc, m, a1, a2, a3) \
409 STMT_START { \
830247a4 410 unsigned offset = strlen(RExC_precomp)-(RExC_end-(loc)); \
b45f050a
JF
411 Perl_warner(aTHX_ WARN_REGEXP, m REPORT_LOCATION,\
412 a1, a2, a3, \
830247a4 413 (int)offset, RExC_precomp, RExC_precomp + offset); \
8615cb43
JF
414 } STMT_END
415
416
cd439c50
IZ
417/* Allow for side effects in s */
418#define REGC(c,s) STMT_START { if (!SIZE_ONLY) *(s) = (c); else (s);} STMT_END
419
51371543 420static void clear_re(pTHXo_ void *r);
4327152a 421
653099ff
GS
422/* Mark that we cannot extend a found fixed substring at this point.
423 Updata the longest found anchored substring and the longest found
424 floating substrings if needed. */
425
4327152a 426STATIC void
830247a4 427S_scan_commit(pTHX_ RExC_state_t *pRExC_state, scan_data_t *data)
c277df42 428{
a0ed51b3
LW
429 STRLEN l = CHR_SVLEN(data->last_found);
430 STRLEN old_l = CHR_SVLEN(*data->longest);
b81d288d 431
c277df42
IZ
432 if ((l >= old_l) && ((l > old_l) || (data->flags & SF_BEFORE_EOL))) {
433 sv_setsv(*data->longest, data->last_found);
434 if (*data->longest == data->longest_fixed) {
435 data->offset_fixed = l ? data->last_start_min : data->pos_min;
436 if (data->flags & SF_BEFORE_EOL)
b81d288d 437 data->flags
c277df42
IZ
438 |= ((data->flags & SF_BEFORE_EOL) << SF_FIX_SHIFT_EOL);
439 else
440 data->flags &= ~SF_FIX_BEFORE_EOL;
a0ed51b3
LW
441 }
442 else {
c277df42 443 data->offset_float_min = l ? data->last_start_min : data->pos_min;
b81d288d
AB
444 data->offset_float_max = (l
445 ? data->last_start_max
c277df42
IZ
446 : data->pos_min + data->pos_delta);
447 if (data->flags & SF_BEFORE_EOL)
b81d288d 448 data->flags
c277df42
IZ
449 |= ((data->flags & SF_BEFORE_EOL) << SF_FL_SHIFT_EOL);
450 else
451 data->flags &= ~SF_FL_BEFORE_EOL;
452 }
453 }
454 SvCUR_set(data->last_found, 0);
455 data->last_end = -1;
456 data->flags &= ~SF_BEFORE_EOL;
457}
458
653099ff
GS
459/* Can match anything (initialization) */
460STATIC void
830247a4 461S_cl_anything(pTHX_ RExC_state_t *pRExC_state, struct regnode_charclass_class *cl)
653099ff 462{
9af0acdb
JH
463 int value;
464
653099ff 465 ANYOF_CLASS_ZERO(cl);
9af0acdb
JH
466 for (value = 0; value < 256; ++value)
467 ANYOF_BITMAP_SET(cl, value);
1aa99e6b 468 cl->flags = ANYOF_EOS|ANYOF_UNICODE_ALL;
653099ff
GS
469 if (LOC)
470 cl->flags |= ANYOF_LOCALE;
471}
472
473/* Can match anything (initialization) */
474STATIC int
475S_cl_is_anything(pTHX_ struct regnode_charclass_class *cl)
476{
477 int value;
478
aaa51d5e 479 for (value = 0; value <= ANYOF_MAX; value += 2)
653099ff
GS
480 if (ANYOF_CLASS_TEST(cl, value) && ANYOF_CLASS_TEST(cl, value + 1))
481 return 1;
1aa99e6b
IH
482 if (!(cl->flags & ANYOF_UNICODE_ALL))
483 return 0;
9af0acdb
JH
484 for (value = 0; value < 256; ++value)
485 if (!ANYOF_BITMAP_TEST(cl, value))
653099ff
GS
486 return 0;
487 return 1;
488}
489
490/* Can match anything (initialization) */
491STATIC void
830247a4 492S_cl_init(pTHX_ RExC_state_t *pRExC_state, struct regnode_charclass_class *cl)
653099ff 493{
8ecf7187 494 Zero(cl, 1, struct regnode_charclass_class);
653099ff 495 cl->type = ANYOF;
830247a4 496 cl_anything(pRExC_state, cl);
653099ff
GS
497}
498
499STATIC void
830247a4 500S_cl_init_zero(pTHX_ RExC_state_t *pRExC_state, struct regnode_charclass_class *cl)
653099ff 501{
8ecf7187 502 Zero(cl, 1, struct regnode_charclass_class);
653099ff 503 cl->type = ANYOF;
830247a4 504 cl_anything(pRExC_state, cl);
653099ff
GS
505 if (LOC)
506 cl->flags |= ANYOF_LOCALE;
507}
508
509/* 'And' a given class with another one. Can create false positives */
510/* We assume that cl is not inverted */
511STATIC void
512S_cl_and(pTHX_ struct regnode_charclass_class *cl,
513 struct regnode_charclass_class *and_with)
514{
653099ff
GS
515 if (!(and_with->flags & ANYOF_CLASS)
516 && !(cl->flags & ANYOF_CLASS)
517 && (and_with->flags & ANYOF_LOCALE) == (cl->flags & ANYOF_LOCALE)
518 && !(and_with->flags & ANYOF_FOLD)
519 && !(cl->flags & ANYOF_FOLD)) {
520 int i;
521
522 if (and_with->flags & ANYOF_INVERT)
523 for (i = 0; i < ANYOF_BITMAP_SIZE; i++)
524 cl->bitmap[i] &= ~and_with->bitmap[i];
525 else
526 for (i = 0; i < ANYOF_BITMAP_SIZE; i++)
527 cl->bitmap[i] &= and_with->bitmap[i];
528 } /* XXXX: logic is complicated otherwise, leave it along for a moment. */
529 if (!(and_with->flags & ANYOF_EOS))
530 cl->flags &= ~ANYOF_EOS;
1aa99e6b
IH
531
532 if (cl->flags & ANYOF_UNICODE_ALL && and_with->flags & ANYOF_UNICODE) {
533 cl->flags &= ~ANYOF_UNICODE_ALL;
534 cl->flags |= ANYOF_UNICODE;
535 ARG_SET(cl, ARG(and_with));
536 }
537 if (!(and_with->flags & ANYOF_UNICODE_ALL))
538 cl->flags &= ~ANYOF_UNICODE_ALL;
539 if (!(and_with->flags & (ANYOF_UNICODE|ANYOF_UNICODE_ALL)))
540 cl->flags &= ~ANYOF_UNICODE;
653099ff
GS
541}
542
543/* 'OR' a given class with another one. Can create false positives */
544/* We assume that cl is not inverted */
545STATIC void
830247a4 546S_cl_or(pTHX_ RExC_state_t *pRExC_state, struct regnode_charclass_class *cl, struct regnode_charclass_class *or_with)
653099ff 547{
653099ff
GS
548 if (or_with->flags & ANYOF_INVERT) {
549 /* We do not use
550 * (B1 | CL1) | (!B2 & !CL2) = (B1 | !B2 & !CL2) | (CL1 | (!B2 & !CL2))
551 * <= (B1 | !B2) | (CL1 | !CL2)
552 * which is wasteful if CL2 is small, but we ignore CL2:
553 * (B1 | CL1) | (!B2 & !CL2) <= (B1 | CL1) | !B2 = (B1 | !B2) | CL1
554 * XXXX Can we handle case-fold? Unclear:
555 * (OK1(i) | OK1(i')) | !(OK1(i) | OK1(i')) =
556 * (OK1(i) | OK1(i')) | (!OK1(i) & !OK1(i'))
557 */
558 if ( (or_with->flags & ANYOF_LOCALE) == (cl->flags & ANYOF_LOCALE)
559 && !(or_with->flags & ANYOF_FOLD)
560 && !(cl->flags & ANYOF_FOLD) ) {
561 int i;
562
563 for (i = 0; i < ANYOF_BITMAP_SIZE; i++)
564 cl->bitmap[i] |= ~or_with->bitmap[i];
565 } /* XXXX: logic is complicated otherwise */
566 else {
830247a4 567 cl_anything(pRExC_state, cl);
653099ff
GS
568 }
569 } else {
570 /* (B1 | CL1) | (B2 | CL2) = (B1 | B2) | (CL1 | CL2)) */
571 if ( (or_with->flags & ANYOF_LOCALE) == (cl->flags & ANYOF_LOCALE)
b81d288d 572 && (!(or_with->flags & ANYOF_FOLD)
653099ff
GS
573 || (cl->flags & ANYOF_FOLD)) ) {
574 int i;
575
576 /* OR char bitmap and class bitmap separately */
577 for (i = 0; i < ANYOF_BITMAP_SIZE; i++)
578 cl->bitmap[i] |= or_with->bitmap[i];
579 if (or_with->flags & ANYOF_CLASS) {
580 for (i = 0; i < ANYOF_CLASSBITMAP_SIZE; i++)
581 cl->classflags[i] |= or_with->classflags[i];
582 cl->flags |= ANYOF_CLASS;
583 }
584 }
585 else { /* XXXX: logic is complicated, leave it along for a moment. */
830247a4 586 cl_anything(pRExC_state, cl);
653099ff
GS
587 }
588 }
589 if (or_with->flags & ANYOF_EOS)
590 cl->flags |= ANYOF_EOS;
1aa99e6b
IH
591
592 if (cl->flags & ANYOF_UNICODE && or_with->flags & ANYOF_UNICODE &&
593 ARG(cl) != ARG(or_with)) {
594 cl->flags |= ANYOF_UNICODE_ALL;
595 cl->flags &= ~ANYOF_UNICODE;
596 }
597 if (or_with->flags & ANYOF_UNICODE_ALL) {
598 cl->flags |= ANYOF_UNICODE_ALL;
599 cl->flags &= ~ANYOF_UNICODE;
600 }
653099ff
GS
601}
602
603/* REx optimizer. Converts nodes into quickier variants "in place".
604 Finds fixed substrings. */
605
c277df42
IZ
606/* Stops at toplevel WHILEM as well as at `last'. At end *scanp is set
607 to the position after last scanned or to NULL. */
608
76e3520e 609STATIC I32
830247a4 610S_study_chunk(pTHX_ RExC_state_t *pRExC_state, regnode **scanp, I32 *deltap, regnode *last, scan_data_t *data, U32 flags)
c277df42
IZ
611 /* scanp: Start here (read-write). */
612 /* deltap: Write maxlen-minlen here. */
613 /* last: Stop before this one. */
614{
615 I32 min = 0, pars = 0, code;
616 regnode *scan = *scanp, *next;
617 I32 delta = 0;
618 int is_inf = (flags & SCF_DO_SUBSTR) && (data->flags & SF_IS_INF);
aca2d497 619 int is_inf_internal = 0; /* The studied chunk is infinite */
c277df42
IZ
620 I32 is_par = OP(scan) == OPEN ? ARG(scan) : 0;
621 scan_data_t data_fake;
653099ff 622 struct regnode_charclass_class and_with; /* Valid if flags & SCF_DO_STCLASS_OR */
b81d288d 623
c277df42
IZ
624 while (scan && OP(scan) != END && scan < last) {
625 /* Peephole optimizer: */
626
22c35a8c 627 if (PL_regkind[(U8)OP(scan)] == EXACT) {
653099ff 628 /* Merge several consecutive EXACTish nodes into one. */
c277df42
IZ
629 regnode *n = regnext(scan);
630 U32 stringok = 1;
631#ifdef DEBUGGING
632 regnode *stop = scan;
b81d288d 633#endif
c277df42 634
cd439c50 635 next = scan + NODE_SZ_STR(scan);
c277df42
IZ
636 /* Skip NOTHING, merge EXACT*. */
637 while (n &&
b81d288d 638 ( PL_regkind[(U8)OP(n)] == NOTHING ||
c277df42
IZ
639 (stringok && (OP(n) == OP(scan))))
640 && NEXT_OFF(n)
641 && NEXT_OFF(scan) + NEXT_OFF(n) < I16_MAX) {
642 if (OP(n) == TAIL || n > next)
643 stringok = 0;
22c35a8c 644 if (PL_regkind[(U8)OP(n)] == NOTHING) {
c277df42
IZ
645 NEXT_OFF(scan) += NEXT_OFF(n);
646 next = n + NODE_STEP_REGNODE;
647#ifdef DEBUGGING
648 if (stringok)
649 stop = n;
b81d288d 650#endif
c277df42 651 n = regnext(n);
a0ed51b3 652 }
f49d4d0f 653 else if (stringok) {
cd439c50 654 int oldl = STR_LEN(scan);
c277df42 655 regnode *nnext = regnext(n);
f49d4d0f 656
b81d288d 657 if (oldl + STR_LEN(n) > U8_MAX)
c277df42
IZ
658 break;
659 NEXT_OFF(scan) += NEXT_OFF(n);
cd439c50
IZ
660 STR_LEN(scan) += STR_LEN(n);
661 next = n + NODE_SZ_STR(n);
c277df42 662 /* Now we can overwrite *n : */
f49d4d0f 663 Move(STRING(n), STRING(scan) + oldl, STR_LEN(n), char);
c277df42 664#ifdef DEBUGGING
f49d4d0f 665 stop = next - 1;
b81d288d 666#endif
c277df42
IZ
667 n = nnext;
668 }
669 }
670#ifdef DEBUGGING
671 /* Allow dumping */
cd439c50 672 n = scan + NODE_SZ_STR(scan);
c277df42 673 while (n <= stop) {
22c35a8c 674 if (PL_regkind[(U8)OP(n)] != NOTHING || OP(n) == NOTHING) {
c277df42
IZ
675 OP(n) = OPTIMIZED;
676 NEXT_OFF(n) = 0;
677 }
678 n++;
679 }
653099ff 680#endif
c277df42 681 }
653099ff
GS
682 /* Follow the next-chain of the current node and optimize
683 away all the NOTHINGs from it. */
c277df42 684 if (OP(scan) != CURLYX) {
048cfca1
GS
685 int max = (reg_off_by_arg[OP(scan)]
686 ? I32_MAX
687 /* I32 may be smaller than U16 on CRAYs! */
688 : (I32_MAX < U16_MAX ? I32_MAX : U16_MAX));
c277df42
IZ
689 int off = (reg_off_by_arg[OP(scan)] ? ARG(scan) : NEXT_OFF(scan));
690 int noff;
691 regnode *n = scan;
b81d288d 692
c277df42
IZ
693 /* Skip NOTHING and LONGJMP. */
694 while ((n = regnext(n))
22c35a8c 695 && ((PL_regkind[(U8)OP(n)] == NOTHING && (noff = NEXT_OFF(n)))
c277df42
IZ
696 || ((OP(n) == LONGJMP) && (noff = ARG(n))))
697 && off + noff < max)
698 off += noff;
699 if (reg_off_by_arg[OP(scan)])
700 ARG(scan) = off;
b81d288d 701 else
c277df42
IZ
702 NEXT_OFF(scan) = off;
703 }
653099ff
GS
704 /* The principal pseudo-switch. Cannot be a switch, since we
705 look into several different things. */
b81d288d 706 if (OP(scan) == BRANCH || OP(scan) == BRANCHJ
c277df42
IZ
707 || OP(scan) == IFTHEN || OP(scan) == SUSPEND) {
708 next = regnext(scan);
709 code = OP(scan);
b81d288d
AB
710
711 if (OP(next) == code || code == IFTHEN || code == SUSPEND) {
c277df42 712 I32 max1 = 0, min1 = I32_MAX, num = 0;
653099ff 713 struct regnode_charclass_class accum;
c277df42 714
653099ff 715 if (flags & SCF_DO_SUBSTR) /* XXXX Add !SUSPEND? */
830247a4 716 scan_commit(pRExC_state, data); /* Cannot merge strings after this. */
653099ff 717 if (flags & SCF_DO_STCLASS)
830247a4 718 cl_init_zero(pRExC_state, &accum);
c277df42 719 while (OP(scan) == code) {
830247a4 720 I32 deltanext, minnext, f = 0, fake;
653099ff 721 struct regnode_charclass_class this_class;
c277df42
IZ
722
723 num++;
724 data_fake.flags = 0;
b81d288d 725 if (data) {
2c2d71f5 726 data_fake.whilem_c = data->whilem_c;
cb434fcc
IZ
727 data_fake.last_closep = data->last_closep;
728 }
729 else
730 data_fake.last_closep = &fake;
c277df42
IZ
731 next = regnext(scan);
732 scan = NEXTOPER(scan);
733 if (code != BRANCH)
734 scan = NEXTOPER(scan);
653099ff 735 if (flags & SCF_DO_STCLASS) {
830247a4 736 cl_init(pRExC_state, &this_class);
653099ff
GS
737 data_fake.start_class = &this_class;
738 f = SCF_DO_STCLASS_AND;
b81d288d 739 }
e1901655
IZ
740 if (flags & SCF_WHILEM_VISITED_POS)
741 f |= SCF_WHILEM_VISITED_POS;
653099ff 742 /* we suppose the run is continuous, last=next...*/
830247a4
IZ
743 minnext = study_chunk(pRExC_state, &scan, &deltanext,
744 next, &data_fake, f);
b81d288d 745 if (min1 > minnext)
c277df42
IZ
746 min1 = minnext;
747 if (max1 < minnext + deltanext)
748 max1 = minnext + deltanext;
749 if (deltanext == I32_MAX)
aca2d497 750 is_inf = is_inf_internal = 1;
c277df42
IZ
751 scan = next;
752 if (data_fake.flags & (SF_HAS_PAR|SF_IN_PAR))
753 pars++;
405ff068 754 if (data && (data_fake.flags & SF_HAS_EVAL))
c277df42 755 data->flags |= SF_HAS_EVAL;
2c2d71f5
JH
756 if (data)
757 data->whilem_c = data_fake.whilem_c;
653099ff 758 if (flags & SCF_DO_STCLASS)
830247a4 759 cl_or(pRExC_state, &accum, &this_class);
b81d288d 760 if (code == SUSPEND)
c277df42
IZ
761 break;
762 }
763 if (code == IFTHEN && num < 2) /* Empty ELSE branch */
764 min1 = 0;
765 if (flags & SCF_DO_SUBSTR) {
766 data->pos_min += min1;
767 data->pos_delta += max1 - min1;
768 if (max1 != min1 || is_inf)
769 data->longest = &(data->longest_float);
770 }
771 min += min1;
772 delta += max1 - min1;
653099ff 773 if (flags & SCF_DO_STCLASS_OR) {
830247a4 774 cl_or(pRExC_state, data->start_class, &accum);
653099ff
GS
775 if (min1) {
776 cl_and(data->start_class, &and_with);
777 flags &= ~SCF_DO_STCLASS;
778 }
779 }
780 else if (flags & SCF_DO_STCLASS_AND) {
de0c8cb8
GS
781 if (min1) {
782 cl_and(data->start_class, &accum);
653099ff 783 flags &= ~SCF_DO_STCLASS;
de0c8cb8
GS
784 }
785 else {
b81d288d 786 /* Switch to OR mode: cache the old value of
de0c8cb8
GS
787 * data->start_class */
788 StructCopy(data->start_class, &and_with,
789 struct regnode_charclass_class);
790 flags &= ~SCF_DO_STCLASS_AND;
791 StructCopy(&accum, data->start_class,
792 struct regnode_charclass_class);
793 flags |= SCF_DO_STCLASS_OR;
794 data->start_class->flags |= ANYOF_EOS;
795 }
653099ff 796 }
a0ed51b3
LW
797 }
798 else if (code == BRANCHJ) /* single branch is optimized. */
c277df42
IZ
799 scan = NEXTOPER(NEXTOPER(scan));
800 else /* single branch is optimized. */
801 scan = NEXTOPER(scan);
802 continue;
a0ed51b3
LW
803 }
804 else if (OP(scan) == EXACT) {
cd439c50 805 I32 l = STR_LEN(scan);
1aa99e6b 806 UV uc = *((U8*)STRING(scan));
a0ed51b3 807 if (UTF) {
1aa99e6b
IH
808 U8 *s = (U8*)STRING(scan);
809 l = utf8_length(s, s + l);
810 uc = utf8_to_uv_simple(s, NULL);
a0ed51b3
LW
811 }
812 min += l;
c277df42 813 if (flags & SCF_DO_SUBSTR) { /* Update longest substr. */
c277df42
IZ
814 /* The code below prefers earlier match for fixed
815 offset, later match for variable offset. */
816 if (data->last_end == -1) { /* Update the start info. */
817 data->last_start_min = data->pos_min;
818 data->last_start_max = is_inf
b81d288d 819 ? I32_MAX : data->pos_min + data->pos_delta;
c277df42 820 }
cd439c50 821 sv_catpvn(data->last_found, STRING(scan), STR_LEN(scan));
c277df42
IZ
822 data->last_end = data->pos_min + l;
823 data->pos_min += l; /* As in the first entry. */
824 data->flags &= ~SF_BEFORE_EOL;
825 }
653099ff
GS
826 if (flags & SCF_DO_STCLASS_AND) {
827 /* Check whether it is compatible with what we know already! */
828 int compat = 1;
829
1aa99e6b 830 if (uc >= 0x100 ||
b81d288d 831 !(data->start_class->flags & (ANYOF_CLASS | ANYOF_LOCALE))
1aa99e6b 832 && !ANYOF_BITMAP_TEST(data->start_class, uc)
653099ff 833 && (!(data->start_class->flags & ANYOF_FOLD)
1aa99e6b 834 || !ANYOF_BITMAP_TEST(data->start_class, PL_fold[uc])))
653099ff
GS
835 compat = 0;
836 ANYOF_CLASS_ZERO(data->start_class);
837 ANYOF_BITMAP_ZERO(data->start_class);
838 if (compat)
1aa99e6b 839 ANYOF_BITMAP_SET(data->start_class, uc);
653099ff 840 data->start_class->flags &= ~ANYOF_EOS;
9b877dbb
IH
841 if (uc < 0x100)
842 data->start_class->flags &= ~ANYOF_UNICODE_ALL;
653099ff
GS
843 }
844 else if (flags & SCF_DO_STCLASS_OR) {
845 /* false positive possible if the class is case-folded */
1aa99e6b 846 if (uc < 0x100)
9b877dbb
IH
847 ANYOF_BITMAP_SET(data->start_class, uc);
848 else
849 data->start_class->flags |= ANYOF_UNICODE_ALL;
653099ff
GS
850 data->start_class->flags &= ~ANYOF_EOS;
851 cl_and(data->start_class, &and_with);
852 }
853 flags &= ~SCF_DO_STCLASS;
a0ed51b3 854 }
653099ff 855 else if (PL_regkind[(U8)OP(scan)] == EXACT) { /* But OP != EXACT! */
cd439c50 856 I32 l = STR_LEN(scan);
1aa99e6b 857 UV uc = *((U8*)STRING(scan));
653099ff
GS
858
859 /* Search for fixed substrings supports EXACT only. */
b81d288d 860 if (flags & SCF_DO_SUBSTR)
830247a4 861 scan_commit(pRExC_state, data);
a0ed51b3 862 if (UTF) {
1aa99e6b
IH
863 U8 *s = (U8 *)STRING(scan);
864 l = utf8_length(s, s + l);
865 uc = utf8_to_uv_simple(s, NULL);
a0ed51b3
LW
866 }
867 min += l;
c277df42 868 if (data && (flags & SCF_DO_SUBSTR))
a0ed51b3 869 data->pos_min += l;
653099ff
GS
870 if (flags & SCF_DO_STCLASS_AND) {
871 /* Check whether it is compatible with what we know already! */
872 int compat = 1;
873
1aa99e6b 874 if (uc >= 0x100 ||
b81d288d 875 !(data->start_class->flags & (ANYOF_CLASS | ANYOF_LOCALE))
1aa99e6b
IH
876 && !ANYOF_BITMAP_TEST(data->start_class, uc)
877 && !ANYOF_BITMAP_TEST(data->start_class, PL_fold[uc]))
653099ff
GS
878 compat = 0;
879 ANYOF_CLASS_ZERO(data->start_class);
880 ANYOF_BITMAP_ZERO(data->start_class);
881 if (compat) {
1aa99e6b 882 ANYOF_BITMAP_SET(data->start_class, uc);
653099ff
GS
883 data->start_class->flags &= ~ANYOF_EOS;
884 data->start_class->flags |= ANYOF_FOLD;
885 if (OP(scan) == EXACTFL)
886 data->start_class->flags |= ANYOF_LOCALE;
887 }
888 }
889 else if (flags & SCF_DO_STCLASS_OR) {
890 if (data->start_class->flags & ANYOF_FOLD) {
891 /* false positive possible if the class is case-folded.
892 Assume that the locale settings are the same... */
1aa99e6b
IH
893 if (uc < 0x100)
894 ANYOF_BITMAP_SET(data->start_class, uc);
653099ff
GS
895 data->start_class->flags &= ~ANYOF_EOS;
896 }
897 cl_and(data->start_class, &and_with);
898 }
899 flags &= ~SCF_DO_STCLASS;
a0ed51b3 900 }
4d61ec05 901 else if (strchr((char*)PL_varies,OP(scan))) {
aa7a4b56
MH
902 I32 mincount, maxcount, minnext, deltanext, fl;
903 I32 f = flags, pos_before = 0;
c277df42 904 regnode *oscan = scan;
653099ff
GS
905 struct regnode_charclass_class this_class;
906 struct regnode_charclass_class *oclass = NULL;
907
22c35a8c 908 switch (PL_regkind[(U8)OP(scan)]) {
653099ff 909 case WHILEM: /* End of (?:...)* . */
c277df42
IZ
910 scan = NEXTOPER(scan);
911 goto finish;
912 case PLUS:
653099ff 913 if (flags & (SCF_DO_SUBSTR | SCF_DO_STCLASS)) {
c277df42 914 next = NEXTOPER(scan);
653099ff 915 if (OP(next) == EXACT || (flags & SCF_DO_STCLASS)) {
b81d288d
AB
916 mincount = 1;
917 maxcount = REG_INFTY;
c277df42
IZ
918 next = regnext(scan);
919 scan = NEXTOPER(scan);
920 goto do_curly;
921 }
922 }
923 if (flags & SCF_DO_SUBSTR)
924 data->pos_min++;
925 min++;
926 /* Fall through. */
927 case STAR:
653099ff
GS
928 if (flags & SCF_DO_STCLASS) {
929 mincount = 0;
b81d288d 930 maxcount = REG_INFTY;
653099ff
GS
931 next = regnext(scan);
932 scan = NEXTOPER(scan);
933 goto do_curly;
934 }
b81d288d 935 is_inf = is_inf_internal = 1;
c277df42
IZ
936 scan = regnext(scan);
937 if (flags & SCF_DO_SUBSTR) {
830247a4 938 scan_commit(pRExC_state, data); /* Cannot extend fixed substrings */
c277df42
IZ
939 data->longest = &(data->longest_float);
940 }
941 goto optimize_curly_tail;
942 case CURLY:
b81d288d 943 mincount = ARG1(scan);
c277df42
IZ
944 maxcount = ARG2(scan);
945 next = regnext(scan);
cb434fcc
IZ
946 if (OP(scan) == CURLYX) {
947 I32 lp = (data ? *(data->last_closep) : 0);
948
949 scan->flags = ((lp <= U8_MAX) ? lp : U8_MAX);
950 }
c277df42
IZ
951 scan = NEXTOPER(scan) + EXTRA_STEP_2ARGS;
952 do_curly:
953 if (flags & SCF_DO_SUBSTR) {
830247a4 954 if (mincount == 0) scan_commit(pRExC_state,data); /* Cannot extend fixed substrings */
c277df42
IZ
955 pos_before = data->pos_min;
956 }
957 if (data) {
958 fl = data->flags;
959 data->flags &= ~(SF_HAS_PAR|SF_IN_PAR|SF_HAS_EVAL);
960 if (is_inf)
961 data->flags |= SF_IS_INF;
962 }
653099ff 963 if (flags & SCF_DO_STCLASS) {
830247a4 964 cl_init(pRExC_state, &this_class);
653099ff
GS
965 oclass = data->start_class;
966 data->start_class = &this_class;
967 f |= SCF_DO_STCLASS_AND;
968 f &= ~SCF_DO_STCLASS_OR;
969 }
e1901655
IZ
970 /* These are the cases when once a subexpression
971 fails at a particular position, it cannot succeed
972 even after backtracking at the enclosing scope.
b81d288d 973
e1901655
IZ
974 XXXX what if minimal match and we are at the
975 initial run of {n,m}? */
976 if ((mincount != maxcount - 1) && (maxcount != REG_INFTY))
977 f &= ~SCF_WHILEM_VISITED_POS;
653099ff 978
c277df42 979 /* This will finish on WHILEM, setting scan, or on NULL: */
b81d288d
AB
980 minnext = study_chunk(pRExC_state, &scan, &deltanext, last, data,
981 mincount == 0
653099ff
GS
982 ? (f & ~SCF_DO_SUBSTR) : f);
983
984 if (flags & SCF_DO_STCLASS)
985 data->start_class = oclass;
986 if (mincount == 0 || minnext == 0) {
987 if (flags & SCF_DO_STCLASS_OR) {
830247a4 988 cl_or(pRExC_state, data->start_class, &this_class);
653099ff
GS
989 }
990 else if (flags & SCF_DO_STCLASS_AND) {
b81d288d 991 /* Switch to OR mode: cache the old value of
653099ff
GS
992 * data->start_class */
993 StructCopy(data->start_class, &and_with,
994 struct regnode_charclass_class);
995 flags &= ~SCF_DO_STCLASS_AND;
996 StructCopy(&this_class, data->start_class,
997 struct regnode_charclass_class);
998 flags |= SCF_DO_STCLASS_OR;
999 data->start_class->flags |= ANYOF_EOS;
1000 }
1001 } else { /* Non-zero len */
1002 if (flags & SCF_DO_STCLASS_OR) {
830247a4 1003 cl_or(pRExC_state, data->start_class, &this_class);
653099ff
GS
1004 cl_and(data->start_class, &and_with);
1005 }
1006 else if (flags & SCF_DO_STCLASS_AND)
1007 cl_and(data->start_class, &this_class);
1008 flags &= ~SCF_DO_STCLASS;
1009 }
c277df42
IZ
1010 if (!scan) /* It was not CURLYX, but CURLY. */
1011 scan = next;
b81d288d 1012 if (ckWARN(WARN_REGEXP) && (minnext + deltanext == 0)
821b33a5 1013 && !(data->flags & (SF_HAS_PAR|SF_IN_PAR))
17feb5d5 1014 && maxcount <= REG_INFTY/3) /* Complement check for big count */
b45f050a 1015 {
830247a4 1016 vWARN(RExC_parse,
b45f050a
JF
1017 "Quantifier unexpected on zero-length expression");
1018 }
1019
c277df42 1020 min += minnext * mincount;
b81d288d 1021 is_inf_internal |= ((maxcount == REG_INFTY
155aba94
GS
1022 && (minnext + deltanext) > 0)
1023 || deltanext == I32_MAX);
aca2d497 1024 is_inf |= is_inf_internal;
c277df42
IZ
1025 delta += (minnext + deltanext) * maxcount - minnext * mincount;
1026
1027 /* Try powerful optimization CURLYX => CURLYN. */
b81d288d 1028 if ( OP(oscan) == CURLYX && data
c277df42
IZ
1029 && data->flags & SF_IN_PAR
1030 && !(data->flags & SF_HAS_EVAL)
1031 && !deltanext && minnext == 1 ) {
1032 /* Try to optimize to CURLYN. */
1033 regnode *nxt = NEXTOPER(oscan) + EXTRA_STEP_2ARGS;
1034 regnode *nxt1 = nxt, *nxt2;
1035
1036 /* Skip open. */
1037 nxt = regnext(nxt);
4d61ec05 1038 if (!strchr((char*)PL_simple,OP(nxt))
22c35a8c 1039 && !(PL_regkind[(U8)OP(nxt)] == EXACT
b81d288d 1040 && STR_LEN(nxt) == 1))
c277df42
IZ
1041 goto nogo;
1042 nxt2 = nxt;
1043 nxt = regnext(nxt);
b81d288d 1044 if (OP(nxt) != CLOSE)
c277df42
IZ
1045 goto nogo;
1046 /* Now we know that nxt2 is the only contents: */
1047 oscan->flags = ARG(nxt);
1048 OP(oscan) = CURLYN;
1049 OP(nxt1) = NOTHING; /* was OPEN. */
1050#ifdef DEBUGGING
1051 OP(nxt1 + 1) = OPTIMIZED; /* was count. */
1052 NEXT_OFF(nxt1+ 1) = 0; /* just for consistancy. */
1053 NEXT_OFF(nxt2) = 0; /* just for consistancy with CURLY. */
1054 OP(nxt) = OPTIMIZED; /* was CLOSE. */
1055 OP(nxt + 1) = OPTIMIZED; /* was count. */
1056 NEXT_OFF(nxt+ 1) = 0; /* just for consistancy. */
b81d288d 1057#endif
c277df42 1058 }
c277df42
IZ
1059 nogo:
1060
1061 /* Try optimization CURLYX => CURLYM. */
b81d288d 1062 if ( OP(oscan) == CURLYX && data
c277df42 1063 && !(data->flags & SF_HAS_PAR)
c277df42
IZ
1064 && !(data->flags & SF_HAS_EVAL)
1065 && !deltanext ) {
1066 /* XXXX How to optimize if data == 0? */
1067 /* Optimize to a simpler form. */
1068 regnode *nxt = NEXTOPER(oscan) + EXTRA_STEP_2ARGS; /* OPEN */
1069 regnode *nxt2;
1070
1071 OP(oscan) = CURLYM;
1072 while ( (nxt2 = regnext(nxt)) /* skip over embedded stuff*/
b81d288d 1073 && (OP(nxt2) != WHILEM))
c277df42
IZ
1074 nxt = nxt2;
1075 OP(nxt2) = SUCCEED; /* Whas WHILEM */
c277df42
IZ
1076 /* Need to optimize away parenths. */
1077 if (data->flags & SF_IN_PAR) {
1078 /* Set the parenth number. */
1079 regnode *nxt1 = NEXTOPER(oscan) + EXTRA_STEP_2ARGS; /* OPEN*/
1080
b81d288d 1081 if (OP(nxt) != CLOSE)
b45f050a 1082 FAIL("Panic opt close");
c277df42
IZ
1083 oscan->flags = ARG(nxt);
1084 OP(nxt1) = OPTIMIZED; /* was OPEN. */
1085 OP(nxt) = OPTIMIZED; /* was CLOSE. */
1086#ifdef DEBUGGING
1087 OP(nxt1 + 1) = OPTIMIZED; /* was count. */
1088 OP(nxt + 1) = OPTIMIZED; /* was count. */
1089 NEXT_OFF(nxt1 + 1) = 0; /* just for consistancy. */
1090 NEXT_OFF(nxt + 1) = 0; /* just for consistancy. */
b81d288d 1091#endif
c277df42
IZ
1092#if 0
1093 while ( nxt1 && (OP(nxt1) != WHILEM)) {
1094 regnode *nnxt = regnext(nxt1);
b81d288d 1095
c277df42
IZ
1096 if (nnxt == nxt) {
1097 if (reg_off_by_arg[OP(nxt1)])
1098 ARG_SET(nxt1, nxt2 - nxt1);
1099 else if (nxt2 - nxt1 < U16_MAX)
1100 NEXT_OFF(nxt1) = nxt2 - nxt1;
1101 else
1102 OP(nxt) = NOTHING; /* Cannot beautify */
1103 }
1104 nxt1 = nnxt;
1105 }
1106#endif
1107 /* Optimize again: */
b81d288d 1108 study_chunk(pRExC_state, &nxt1, &deltanext, nxt,
e1901655 1109 NULL, 0);
a0ed51b3
LW
1110 }
1111 else
c277df42 1112 oscan->flags = 0;
c277df42 1113 }
e1901655
IZ
1114 else if ((OP(oscan) == CURLYX)
1115 && (flags & SCF_WHILEM_VISITED_POS)
1116 /* See the comment on a similar expression above.
1117 However, this time it not a subexpression
1118 we care about, but the expression itself. */
1119 && (maxcount == REG_INFTY)
1120 && data && ++data->whilem_c < 16) {
1121 /* This stays as CURLYX, we can put the count/of pair. */
2c2d71f5
JH
1122 /* Find WHILEM (as in regexec.c) */
1123 regnode *nxt = oscan + NEXT_OFF(oscan);
1124
1125 if (OP(PREVOPER(nxt)) == NOTHING) /* LONGJMP */
1126 nxt += ARG(nxt);
1127 PREVOPER(nxt)->flags = data->whilem_c
830247a4 1128 | (RExC_whilem_seen << 4); /* On WHILEM */
2c2d71f5 1129 }
b81d288d 1130 if (data && fl & (SF_HAS_PAR|SF_IN_PAR))
c277df42
IZ
1131 pars++;
1132 if (flags & SCF_DO_SUBSTR) {
1133 SV *last_str = Nullsv;
1134 int counted = mincount != 0;
1135
1136 if (data->last_end > 0 && mincount != 0) { /* Ends with a string. */
b81d288d 1137 I32 b = pos_before >= data->last_start_min
c277df42
IZ
1138 ? pos_before : data->last_start_min;
1139 STRLEN l;
1140 char *s = SvPV(data->last_found, l);
a0ed51b3
LW
1141 I32 old = b - data->last_start_min;
1142
1143 if (UTF)
1144 old = utf8_hop((U8*)s, old) - (U8*)s;
c277df42 1145
a0ed51b3 1146 l -= old;
c277df42 1147 /* Get the added string: */
79cb57f6 1148 last_str = newSVpvn(s + old, l);
c277df42
IZ
1149 if (deltanext == 0 && pos_before == b) {
1150 /* What was added is a constant string */
1151 if (mincount > 1) {
1152 SvGROW(last_str, (mincount * l) + 1);
b81d288d 1153 repeatcpy(SvPVX(last_str) + l,
c277df42
IZ
1154 SvPVX(last_str), l, mincount - 1);
1155 SvCUR(last_str) *= mincount;
1156 /* Add additional parts. */
b81d288d 1157 SvCUR_set(data->last_found,
c277df42
IZ
1158 SvCUR(data->last_found) - l);
1159 sv_catsv(data->last_found, last_str);
1160 data->last_end += l * (mincount - 1);
1161 }
2a8d9689
HS
1162 } else {
1163 /* start offset must point into the last copy */
1164 data->last_start_min += minnext * (mincount - 1);
4b2cff9a
HS
1165 data->last_start_max += is_inf ? 0 : (maxcount - 1)
1166 * (minnext + data->pos_delta);
c277df42
IZ
1167 }
1168 }
1169 /* It is counted once already... */
1170 data->pos_min += minnext * (mincount - counted);
1171 data->pos_delta += - counted * deltanext +
1172 (minnext + deltanext) * maxcount - minnext * mincount;
1173 if (mincount != maxcount) {
653099ff
GS
1174 /* Cannot extend fixed substrings found inside
1175 the group. */
830247a4 1176 scan_commit(pRExC_state,data);
c277df42
IZ
1177 if (mincount && last_str) {
1178 sv_setsv(data->last_found, last_str);
1179 data->last_end = data->pos_min;
b81d288d 1180 data->last_start_min =
a0ed51b3 1181 data->pos_min - CHR_SVLEN(last_str);
b81d288d
AB
1182 data->last_start_max = is_inf
1183 ? I32_MAX
c277df42 1184 : data->pos_min + data->pos_delta
a0ed51b3 1185 - CHR_SVLEN(last_str);
c277df42
IZ
1186 }
1187 data->longest = &(data->longest_float);
1188 }
aca2d497 1189 SvREFCNT_dec(last_str);
c277df42 1190 }
405ff068 1191 if (data && (fl & SF_HAS_EVAL))
c277df42
IZ
1192 data->flags |= SF_HAS_EVAL;
1193 optimize_curly_tail:
c277df42 1194 if (OP(oscan) != CURLYX) {
22c35a8c 1195 while (PL_regkind[(U8)OP(next = regnext(oscan))] == NOTHING
c277df42
IZ
1196 && NEXT_OFF(next))
1197 NEXT_OFF(oscan) += NEXT_OFF(next);
1198 }
c277df42 1199 continue;
653099ff 1200 default: /* REF and CLUMP only? */
c277df42 1201 if (flags & SCF_DO_SUBSTR) {
830247a4 1202 scan_commit(pRExC_state,data); /* Cannot expect anything... */
c277df42
IZ
1203 data->longest = &(data->longest_float);
1204 }
aca2d497 1205 is_inf = is_inf_internal = 1;
653099ff 1206 if (flags & SCF_DO_STCLASS_OR)
830247a4 1207 cl_anything(pRExC_state, data->start_class);
653099ff 1208 flags &= ~SCF_DO_STCLASS;
c277df42
IZ
1209 break;
1210 }
a0ed51b3 1211 }
ffc61ed2 1212 else if (strchr((char*)PL_simple,OP(scan))) {
653099ff
GS
1213 int value;
1214
c277df42 1215 if (flags & SCF_DO_SUBSTR) {
830247a4 1216 scan_commit(pRExC_state,data);
c277df42
IZ
1217 data->pos_min++;
1218 }
1219 min++;
653099ff
GS
1220 if (flags & SCF_DO_STCLASS) {
1221 data->start_class->flags &= ~ANYOF_EOS; /* No match on empty */
1222
1223 /* Some of the logic below assumes that switching
1224 locale on will only add false positives. */
1225 switch (PL_regkind[(U8)OP(scan)]) {
653099ff 1226 case SANY:
653099ff
GS
1227 default:
1228 do_default:
1229 /* Perl_croak(aTHX_ "panic: unexpected simple REx opcode %d", OP(scan)); */
1230 if (flags & SCF_DO_STCLASS_OR) /* Allow everything */
830247a4 1231 cl_anything(pRExC_state, data->start_class);
653099ff
GS
1232 break;
1233 case REG_ANY:
1234 if (OP(scan) == SANY)
1235 goto do_default;
1236 if (flags & SCF_DO_STCLASS_OR) { /* Everything but \n */
1237 value = (ANYOF_BITMAP_TEST(data->start_class,'\n')
1238 || (data->start_class->flags & ANYOF_CLASS));
830247a4 1239 cl_anything(pRExC_state, data->start_class);
653099ff
GS
1240 }
1241 if (flags & SCF_DO_STCLASS_AND || !value)
1242 ANYOF_BITMAP_CLEAR(data->start_class,'\n');
1243 break;
1244 case ANYOF:
1245 if (flags & SCF_DO_STCLASS_AND)
1246 cl_and(data->start_class,
1247 (struct regnode_charclass_class*)scan);
1248 else
830247a4 1249 cl_or(pRExC_state, data->start_class,
653099ff
GS
1250 (struct regnode_charclass_class*)scan);
1251 break;
1252 case ALNUM:
1253 if (flags & SCF_DO_STCLASS_AND) {
1254 if (!(data->start_class->flags & ANYOF_LOCALE)) {
1255 ANYOF_CLASS_CLEAR(data->start_class,ANYOF_NALNUM);
1256 for (value = 0; value < 256; value++)
1257 if (!isALNUM(value))
1258 ANYOF_BITMAP_CLEAR(data->start_class, value);
1259 }
1260 }
1261 else {
1262 if (data->start_class->flags & ANYOF_LOCALE)
1263 ANYOF_CLASS_SET(data->start_class,ANYOF_ALNUM);
1264 else {
1265 for (value = 0; value < 256; value++)
1266 if (isALNUM(value))
b81d288d 1267 ANYOF_BITMAP_SET(data->start_class, value);
653099ff
GS
1268 }
1269 }
1270 break;
1271 case ALNUML:
1272 if (flags & SCF_DO_STCLASS_AND) {
1273 if (data->start_class->flags & ANYOF_LOCALE)
1274 ANYOF_CLASS_CLEAR(data->start_class,ANYOF_NALNUM);
1275 }
1276 else {
1277 ANYOF_CLASS_SET(data->start_class,ANYOF_ALNUM);
1278 data->start_class->flags |= ANYOF_LOCALE;
1279 }
1280 break;
1281 case NALNUM:
1282 if (flags & SCF_DO_STCLASS_AND) {
1283 if (!(data->start_class->flags & ANYOF_LOCALE)) {
1284 ANYOF_CLASS_CLEAR(data->start_class,ANYOF_ALNUM);
1285 for (value = 0; value < 256; value++)
1286 if (isALNUM(value))
1287 ANYOF_BITMAP_CLEAR(data->start_class, value);
1288 }
1289 }
1290 else {
1291 if (data->start_class->flags & ANYOF_LOCALE)
1292 ANYOF_CLASS_SET(data->start_class,ANYOF_NALNUM);
1293 else {
1294 for (value = 0; value < 256; value++)
1295 if (!isALNUM(value))
b81d288d 1296 ANYOF_BITMAP_SET(data->start_class, value);
653099ff
GS
1297 }
1298 }
1299 break;
1300 case NALNUML:
1301 if (flags & SCF_DO_STCLASS_AND) {
1302 if (data->start_class->flags & ANYOF_LOCALE)
1303 ANYOF_CLASS_CLEAR(data->start_class,ANYOF_ALNUM);
1304 }
1305 else {
1306 data->start_class->flags |= ANYOF_LOCALE;
1307 ANYOF_CLASS_SET(data->start_class,ANYOF_NALNUM);
1308 }
1309 break;
1310 case SPACE:
1311 if (flags & SCF_DO_STCLASS_AND) {
1312 if (!(data->start_class->flags & ANYOF_LOCALE)) {
1313 ANYOF_CLASS_CLEAR(data->start_class,ANYOF_NSPACE);
1314 for (value = 0; value < 256; value++)
1315 if (!isSPACE(value))
1316 ANYOF_BITMAP_CLEAR(data->start_class, value);
1317 }
1318 }
1319 else {
1320 if (data->start_class->flags & ANYOF_LOCALE)
1321 ANYOF_CLASS_SET(data->start_class,ANYOF_SPACE);
1322 else {
1323 for (value = 0; value < 256; value++)
1324 if (isSPACE(value))
b81d288d 1325 ANYOF_BITMAP_SET(data->start_class, value);
653099ff
GS
1326 }
1327 }
1328 break;
1329 case SPACEL:
1330 if (flags & SCF_DO_STCLASS_AND) {
1331 if (data->start_class->flags & ANYOF_LOCALE)
1332 ANYOF_CLASS_CLEAR(data->start_class,ANYOF_NSPACE);
1333 }
1334 else {
1335 data->start_class->flags |= ANYOF_LOCALE;
1336 ANYOF_CLASS_SET(data->start_class,ANYOF_SPACE);
1337 }
1338 break;
1339 case NSPACE:
1340 if (flags & SCF_DO_STCLASS_AND) {
1341 if (!(data->start_class->flags & ANYOF_LOCALE)) {
1342 ANYOF_CLASS_CLEAR(data->start_class,ANYOF_SPACE);
1343 for (value = 0; value < 256; value++)
1344 if (isSPACE(value))
1345 ANYOF_BITMAP_CLEAR(data->start_class, value);
1346 }
1347 }
1348 else {
1349 if (data->start_class->flags & ANYOF_LOCALE)
1350 ANYOF_CLASS_SET(data->start_class,ANYOF_NSPACE);
1351 else {
1352 for (value = 0; value < 256; value++)
1353 if (!isSPACE(value))
b81d288d 1354 ANYOF_BITMAP_SET(data->start_class, value);
653099ff
GS
1355 }
1356 }
1357 break;
1358 case NSPACEL:
1359 if (flags & SCF_DO_STCLASS_AND) {
1360 if (data->start_class->flags & ANYOF_LOCALE) {
1361 ANYOF_CLASS_CLEAR(data->start_class,ANYOF_SPACE);
1362 for (value = 0; value < 256; value++)
1363 if (!isSPACE(value))
1364 ANYOF_BITMAP_CLEAR(data->start_class, value);
1365 }
1366 }
1367 else {
1368 data->start_class->flags |= ANYOF_LOCALE;
1369 ANYOF_CLASS_SET(data->start_class,ANYOF_NSPACE);
1370 }
1371 break;
1372 case DIGIT:
1373 if (flags & SCF_DO_STCLASS_AND) {
1374 ANYOF_CLASS_CLEAR(data->start_class,ANYOF_NDIGIT);
1375 for (value = 0; value < 256; value++)
1376 if (!isDIGIT(value))
1377 ANYOF_BITMAP_CLEAR(data->start_class, value);
1378 }
1379 else {
1380 if (data->start_class->flags & ANYOF_LOCALE)
1381 ANYOF_CLASS_SET(data->start_class,ANYOF_DIGIT);
1382 else {
1383 for (value = 0; value < 256; value++)
1384 if (isDIGIT(value))
b81d288d 1385 ANYOF_BITMAP_SET(data->start_class, value);
653099ff
GS
1386 }
1387 }
1388 break;
1389 case NDIGIT:
1390 if (flags & SCF_DO_STCLASS_AND) {
1391 ANYOF_CLASS_CLEAR(data->start_class,ANYOF_DIGIT);
1392 for (value = 0; value < 256; value++)
1393 if (isDIGIT(value))
1394 ANYOF_BITMAP_CLEAR(data->start_class, value);
1395 }
1396 else {
1397 if (data->start_class->flags & ANYOF_LOCALE)
1398 ANYOF_CLASS_SET(data->start_class,ANYOF_NDIGIT);
1399 else {
1400 for (value = 0; value < 256; value++)
1401 if (!isDIGIT(value))
b81d288d 1402 ANYOF_BITMAP_SET(data->start_class, value);
653099ff
GS
1403 }
1404 }
1405 break;
1406 }
1407 if (flags & SCF_DO_STCLASS_OR)
1408 cl_and(data->start_class, &and_with);
1409 flags &= ~SCF_DO_STCLASS;
1410 }
a0ed51b3 1411 }
22c35a8c 1412 else if (PL_regkind[(U8)OP(scan)] == EOL && flags & SCF_DO_SUBSTR) {
c277df42
IZ
1413 data->flags |= (OP(scan) == MEOL
1414 ? SF_BEFORE_MEOL
1415 : SF_BEFORE_SEOL);
a0ed51b3 1416 }
653099ff
GS
1417 else if ( PL_regkind[(U8)OP(scan)] == BRANCHJ
1418 /* Lookbehind, or need to calculate parens/evals/stclass: */
1419 && (scan->flags || data || (flags & SCF_DO_STCLASS))
c277df42 1420 && (OP(scan) == IFMATCH || OP(scan) == UNLESSM)) {
653099ff 1421 /* Lookahead/lookbehind */
cb434fcc 1422 I32 deltanext, minnext, fake = 0;
c277df42 1423 regnode *nscan;
653099ff
GS
1424 struct regnode_charclass_class intrnl;
1425 int f = 0;
c277df42
IZ
1426
1427 data_fake.flags = 0;
b81d288d 1428 if (data) {
2c2d71f5 1429 data_fake.whilem_c = data->whilem_c;
cb434fcc
IZ
1430 data_fake.last_closep = data->last_closep;
1431 }
1432 else
1433 data_fake.last_closep = &fake;
653099ff
GS
1434 if ( flags & SCF_DO_STCLASS && !scan->flags
1435 && OP(scan) == IFMATCH ) { /* Lookahead */
830247a4 1436 cl_init(pRExC_state, &intrnl);
653099ff 1437 data_fake.start_class = &intrnl;
e1901655 1438 f |= SCF_DO_STCLASS_AND;
653099ff 1439 }
e1901655
IZ
1440 if (flags & SCF_WHILEM_VISITED_POS)
1441 f |= SCF_WHILEM_VISITED_POS;
c277df42
IZ
1442 next = regnext(scan);
1443 nscan = NEXTOPER(NEXTOPER(scan));
830247a4 1444 minnext = study_chunk(pRExC_state, &nscan, &deltanext, last, &data_fake, f);
c277df42
IZ
1445 if (scan->flags) {
1446 if (deltanext) {
9baa0206 1447 vFAIL("Variable length lookbehind not implemented");
a0ed51b3
LW
1448 }
1449 else if (minnext > U8_MAX) {
9baa0206 1450 vFAIL2("Lookbehind longer than %"UVuf" not implemented", (UV)U8_MAX);
c277df42
IZ
1451 }
1452 scan->flags = minnext;
1453 }
1454 if (data && data_fake.flags & (SF_HAS_PAR|SF_IN_PAR))
1455 pars++;
405ff068 1456 if (data && (data_fake.flags & SF_HAS_EVAL))
c277df42 1457 data->flags |= SF_HAS_EVAL;
2c2d71f5
JH
1458 if (data)
1459 data->whilem_c = data_fake.whilem_c;
e1901655 1460 if (f & SCF_DO_STCLASS_AND) {
653099ff
GS
1461 int was = (data->start_class->flags & ANYOF_EOS);
1462
1463 cl_and(data->start_class, &intrnl);
1464 if (was)
1465 data->start_class->flags |= ANYOF_EOS;
1466 }
a0ed51b3
LW
1467 }
1468 else if (OP(scan) == OPEN) {
c277df42 1469 pars++;
a0ed51b3 1470 }
cb434fcc
IZ
1471 else if (OP(scan) == CLOSE) {
1472 if (ARG(scan) == is_par) {
1473 next = regnext(scan);
c277df42 1474
cb434fcc
IZ
1475 if ( next && (OP(next) != WHILEM) && next < last)
1476 is_par = 0; /* Disable optimization */
1477 }
1478 if (data)
1479 *(data->last_closep) = ARG(scan);
a0ed51b3
LW
1480 }
1481 else if (OP(scan) == EVAL) {
c277df42
IZ
1482 if (data)
1483 data->flags |= SF_HAS_EVAL;
1484 }
96776eda 1485 else if (OP(scan) == LOGICAL && scan->flags == 2) { /* Embedded follows */
0f5d15d6 1486 if (flags & SCF_DO_SUBSTR) {
830247a4 1487 scan_commit(pRExC_state,data);
0f5d15d6
IZ
1488 data->longest = &(data->longest_float);
1489 }
1490 is_inf = is_inf_internal = 1;
653099ff 1491 if (flags & SCF_DO_STCLASS_OR) /* Allow everything */
830247a4 1492 cl_anything(pRExC_state, data->start_class);
96776eda 1493 flags &= ~SCF_DO_STCLASS;
0f5d15d6 1494 }
c277df42
IZ
1495 /* Else: zero-length, ignore. */
1496 scan = regnext(scan);
1497 }
1498
1499 finish:
1500 *scanp = scan;
aca2d497 1501 *deltap = is_inf_internal ? I32_MAX : delta;
b81d288d 1502 if (flags & SCF_DO_SUBSTR && is_inf)
c277df42
IZ
1503 data->pos_delta = I32_MAX - data->pos_min;
1504 if (is_par > U8_MAX)
1505 is_par = 0;
1506 if (is_par && pars==1 && data) {
1507 data->flags |= SF_IN_PAR;
1508 data->flags &= ~SF_HAS_PAR;
a0ed51b3
LW
1509 }
1510 else if (pars && data) {
c277df42
IZ
1511 data->flags |= SF_HAS_PAR;
1512 data->flags &= ~SF_IN_PAR;
1513 }
653099ff
GS
1514 if (flags & SCF_DO_STCLASS_OR)
1515 cl_and(data->start_class, &and_with);
c277df42
IZ
1516 return min;
1517}
1518
76e3520e 1519STATIC I32
830247a4 1520S_add_data(pTHX_ RExC_state_t *pRExC_state, I32 n, char *s)
c277df42 1521{
830247a4 1522 if (RExC_rx->data) {
b81d288d
AB
1523 Renewc(RExC_rx->data,
1524 sizeof(*RExC_rx->data) + sizeof(void*) * (RExC_rx->data->count + n - 1),
c277df42 1525 char, struct reg_data);
830247a4
IZ
1526 Renew(RExC_rx->data->what, RExC_rx->data->count + n, U8);
1527 RExC_rx->data->count += n;
a0ed51b3
LW
1528 }
1529 else {
830247a4 1530 Newc(1207, RExC_rx->data, sizeof(*RExC_rx->data) + sizeof(void*) * (n - 1),
c277df42 1531 char, struct reg_data);
830247a4
IZ
1532 New(1208, RExC_rx->data->what, n, U8);
1533 RExC_rx->data->count = n;
c277df42 1534 }
830247a4
IZ
1535 Copy(s, RExC_rx->data->what + RExC_rx->data->count - n, n, U8);
1536 return RExC_rx->data->count - n;
c277df42
IZ
1537}
1538
d88dccdf 1539void
864dbfa3 1540Perl_reginitcolors(pTHX)
d88dccdf 1541{
d88dccdf
IZ
1542 int i = 0;
1543 char *s = PerlEnv_getenv("PERL_RE_COLORS");
b81d288d 1544
d88dccdf
IZ
1545 if (s) {
1546 PL_colors[0] = s = savepv(s);
1547 while (++i < 6) {
1548 s = strchr(s, '\t');
1549 if (s) {
1550 *s = '\0';
1551 PL_colors[i] = ++s;
1552 }
1553 else
c712d376 1554 PL_colors[i] = s = "";
d88dccdf
IZ
1555 }
1556 } else {
b81d288d 1557 while (i < 6)
d88dccdf
IZ
1558 PL_colors[i++] = "";
1559 }
1560 PL_colorset = 1;
1561}
1562
8615cb43 1563
a687059c 1564/*
e50aee73 1565 - pregcomp - compile a regular expression into internal code
a687059c
LW
1566 *
1567 * We can't allocate space until we know how big the compiled form will be,
1568 * but we can't compile it (and thus know how big it is) until we've got a
1569 * place to put the code. So we cheat: we compile it twice, once with code
1570 * generation turned off and size counting turned on, and once "for real".
1571 * This also means that we don't allocate space until we are sure that the
1572 * thing really will compile successfully, and we never have to move the
1573 * code and thus invalidate pointers into it. (Note that it has to be in
1574 * one piece because free() must be able to free it all.) [NB: not true in perl]
1575 *
1576 * Beware that the optimization-preparation code in here knows about some
1577 * of the structure of the compiled regexp. [I'll say.]
1578 */
1579regexp *
864dbfa3 1580Perl_pregcomp(pTHX_ char *exp, char *xend, PMOP *pm)
a687059c 1581{
a0d0e21e 1582 register regexp *r;
c277df42 1583 regnode *scan;
c277df42 1584 regnode *first;
a0d0e21e 1585 I32 flags;
a0d0e21e
LW
1586 I32 minlen = 0;
1587 I32 sawplus = 0;
1588 I32 sawopen = 0;
2c2d71f5 1589 scan_data_t data;
830247a4
IZ
1590 RExC_state_t RExC_state;
1591 RExC_state_t *pRExC_state = &RExC_state;
a0d0e21e
LW
1592
1593 if (exp == NULL)
c277df42 1594 FAIL("NULL regexp argument");
a0d0e21e 1595
830247a4 1596 /* XXXX This looks very suspicious... */
1aa99e6b
IH
1597 if (pm->op_pmdynflags & PMdf_CMP_UTF8)
1598 RExC_utf8 = 1;
a0ed51b3 1599 else
1aa99e6b 1600 RExC_utf8 = 0;
a0ed51b3 1601
830247a4 1602 RExC_precomp = savepvn(exp, xend - exp);
35ef4773 1603 DEBUG_r(if (!PL_colorset) reginitcolors());
2c2d71f5 1604 DEBUG_r(PerlIO_printf(Perl_debug_log, "%sCompiling REx%s `%s%*s%s'\n",
d88dccdf 1605 PL_colors[4],PL_colors[5],PL_colors[0],
830247a4
IZ
1606 (int)(xend - exp), RExC_precomp, PL_colors[1]));
1607 RExC_flags16 = pm->op_pmflags;
1608 RExC_sawback = 0;
bbce6d69 1609
830247a4
IZ
1610 RExC_seen = 0;
1611 RExC_seen_zerolen = *exp == '^' ? -1 : 0;
1612 RExC_seen_evals = 0;
1613 RExC_extralen = 0;
c277df42 1614
bbce6d69 1615 /* First pass: determine size, legality. */
830247a4
IZ
1616 RExC_parse = exp;
1617 RExC_end = xend;
1618 RExC_naughty = 0;
1619 RExC_npar = 1;
1620 RExC_size = 0L;
1621 RExC_emit = &PL_regdummy;
1622 RExC_whilem_seen = 0;
85ddcde9
JH
1623#if 0 /* REGC() is (currently) a NOP at the first pass.
1624 * Clever compilers notice this and complain. --jhi */
830247a4 1625 REGC((U8)REG_MAGIC, (char*)RExC_emit);
85ddcde9 1626#endif
830247a4
IZ
1627 if (reg(pRExC_state, 0, &flags) == NULL) {
1628 Safefree(RExC_precomp);
1629 RExC_precomp = Nullch;
a0d0e21e
LW
1630 return(NULL);
1631 }
830247a4 1632 DEBUG_r(PerlIO_printf(Perl_debug_log, "size %"IVdf" ", (IV)RExC_size));
c277df42 1633
c277df42
IZ
1634 /* Small enough for pointer-storage convention?
1635 If extralen==0, this means that we will not need long jumps. */
830247a4
IZ
1636 if (RExC_size >= 0x10000L && RExC_extralen)
1637 RExC_size += RExC_extralen;
c277df42 1638 else
830247a4
IZ
1639 RExC_extralen = 0;
1640 if (RExC_whilem_seen > 15)
1641 RExC_whilem_seen = 15;
a0d0e21e 1642
bbce6d69 1643 /* Allocate space and initialize. */
830247a4 1644 Newc(1001, r, sizeof(regexp) + (unsigned)RExC_size * sizeof(regnode),
c277df42 1645 char, regexp);
a0d0e21e 1646 if (r == NULL)
b45f050a
JF
1647 FAIL("Regexp out of space");
1648
0f79a09d
GS
1649#ifdef DEBUGGING
1650 /* avoid reading uninitialized memory in DEBUGGING code in study_chunk() */
830247a4 1651 Zero(r, sizeof(regexp) + (unsigned)RExC_size * sizeof(regnode), char);
0f79a09d 1652#endif
c277df42 1653 r->refcnt = 1;
bbce6d69 1654 r->prelen = xend - exp;
830247a4 1655 r->precomp = RExC_precomp;
cf93c79d
IZ
1656 r->subbeg = NULL;
1657 r->reganch = pm->op_pmflags & PMf_COMPILETIME;
830247a4 1658 r->nparens = RExC_npar - 1; /* set early to validate backrefs */
4327152a
IZ
1659
1660 r->substrs = 0; /* Useful during FAIL. */
1661 r->startp = 0; /* Useful during FAIL. */
1662 r->endp = 0; /* Useful during FAIL. */
1663
830247a4 1664 RExC_rx = r;
bbce6d69 1665
1666 /* Second pass: emit code. */
830247a4
IZ
1667 RExC_parse = exp;
1668 RExC_end = xend;
1669 RExC_naughty = 0;
1670 RExC_npar = 1;
1671 RExC_emit = r->program;
2cd61cdb 1672 /* Store the count of eval-groups for security checks: */
830247a4
IZ
1673 RExC_emit->next_off = ((RExC_seen_evals > U16_MAX) ? U16_MAX : RExC_seen_evals);
1674 REGC((U8)REG_MAGIC, (char*) RExC_emit++);
c277df42 1675 r->data = 0;
830247a4 1676 if (reg(pRExC_state, 0, &flags) == NULL)
a0d0e21e
LW
1677 return(NULL);
1678
1679 /* Dig out information for optimizations. */
cf93c79d 1680 r->reganch = pm->op_pmflags & PMf_COMPILETIME; /* Again? */
830247a4 1681 pm->op_pmflags = RExC_flags16;
a0ed51b3
LW
1682 if (UTF)
1683 r->reganch |= ROPT_UTF8;
c277df42 1684 r->regstclass = NULL;
830247a4 1685 if (RExC_naughty >= 10) /* Probably an expensive pattern. */
a0ed51b3 1686 r->reganch |= ROPT_NAUGHTY;
c277df42 1687 scan = r->program + 1; /* First BRANCH. */
2779dcf1
IZ
1688
1689 /* XXXX To minimize changes to RE engine we always allocate
1690 3-units-long substrs field. */
1691 Newz(1004, r->substrs, 1, struct reg_substr_data);
1692
2c2d71f5 1693 StructCopy(&zero_scan_data, &data, scan_data_t);
653099ff 1694 /* XXXX Should not we check for something else? Usually it is OPEN1... */
c277df42 1695 if (OP(scan) != BRANCH) { /* Only one top-level choice. */
c277df42 1696 I32 fake;
c5254dd6 1697 STRLEN longest_float_length, longest_fixed_length;
653099ff
GS
1698 struct regnode_charclass_class ch_class;
1699 int stclass_flag;
cb434fcc 1700 I32 last_close = 0;
a0d0e21e
LW
1701
1702 first = scan;
c277df42 1703 /* Skip introductions and multiplicators >= 1. */
a0d0e21e 1704 while ((OP(first) == OPEN && (sawopen = 1)) ||
653099ff 1705 /* An OR of *one* alternative - should not happen now. */
a0d0e21e
LW
1706 (OP(first) == BRANCH && OP(regnext(first)) != BRANCH) ||
1707 (OP(first) == PLUS) ||
1708 (OP(first) == MINMOD) ||
653099ff 1709 /* An {n,m} with n>0 */
22c35a8c 1710 (PL_regkind[(U8)OP(first)] == CURLY && ARG1(first) > 0) ) {
a0d0e21e
LW
1711 if (OP(first) == PLUS)
1712 sawplus = 1;
1713 else
1714 first += regarglen[(U8)OP(first)];
1715 first = NEXTOPER(first);
a687059c
LW
1716 }
1717
a0d0e21e
LW
1718 /* Starting-point info. */
1719 again:
653099ff 1720 if (PL_regkind[(U8)OP(first)] == EXACT) {
1aa99e6b
IH
1721 if (OP(first) == EXACT)
1722 ; /* Empty, get anchored substr later. */
1723 else if ((OP(first) == EXACTF || OP(first) == EXACTFL))
b3c9acc1
IZ
1724 r->regstclass = first;
1725 }
653099ff 1726 else if (strchr((char*)PL_simple,OP(first)))
a0d0e21e 1727 r->regstclass = first;
22c35a8c
GS
1728 else if (PL_regkind[(U8)OP(first)] == BOUND ||
1729 PL_regkind[(U8)OP(first)] == NBOUND)
a0d0e21e 1730 r->regstclass = first;
22c35a8c 1731 else if (PL_regkind[(U8)OP(first)] == BOL) {
cad2e5aa
JH
1732 r->reganch |= (OP(first) == MBOL
1733 ? ROPT_ANCH_MBOL
1734 : (OP(first) == SBOL
1735 ? ROPT_ANCH_SBOL
1736 : ROPT_ANCH_BOL));
a0d0e21e 1737 first = NEXTOPER(first);
774d564b 1738 goto again;
1739 }
1740 else if (OP(first) == GPOS) {
1741 r->reganch |= ROPT_ANCH_GPOS;
1742 first = NEXTOPER(first);
1743 goto again;
a0d0e21e
LW
1744 }
1745 else if ((OP(first) == STAR &&
22c35a8c 1746 PL_regkind[(U8)OP(NEXTOPER(first))] == REG_ANY) &&
a0d0e21e
LW
1747 !(r->reganch & ROPT_ANCH) )
1748 {
1749 /* turn .* into ^.* with an implied $*=1 */
cad2e5aa
JH
1750 int type = OP(NEXTOPER(first));
1751
ffc61ed2 1752 if (type == REG_ANY)
cad2e5aa
JH
1753 type = ROPT_ANCH_MBOL;
1754 else
1755 type = ROPT_ANCH_SBOL;
1756
1757 r->reganch |= type | ROPT_IMPLICIT;
a0d0e21e 1758 first = NEXTOPER(first);
774d564b 1759 goto again;
a0d0e21e 1760 }
b81d288d 1761 if (sawplus && (!sawopen || !RExC_sawback)
830247a4 1762 && !(RExC_seen & REG_SEEN_EVAL)) /* May examine pos and $& */
cad2e5aa
JH
1763 /* x+ must match at the 1st pos of run of x's */
1764 r->reganch |= ROPT_SKIP;
a0d0e21e 1765
c277df42 1766 /* Scan is after the zeroth branch, first is atomic matcher. */
b81d288d 1767 DEBUG_r(PerlIO_printf(Perl_debug_log, "first at %"IVdf"\n",
b900a521 1768 (IV)(first - scan + 1)));
a0d0e21e
LW
1769 /*
1770 * If there's something expensive in the r.e., find the
1771 * longest literal string that must appear and make it the
1772 * regmust. Resolve ties in favor of later strings, since
1773 * the regstart check works with the beginning of the r.e.
1774 * and avoiding duplication strengthens checking. Not a
1775 * strong reason, but sufficient in the absence of others.
1776 * [Now we resolve ties in favor of the earlier string if
c277df42 1777 * it happens that c_offset_min has been invalidated, since the
a0d0e21e
LW
1778 * earlier string may buy us something the later one won't.]
1779 */
a0d0e21e 1780 minlen = 0;
a687059c 1781
79cb57f6
GS
1782 data.longest_fixed = newSVpvn("",0);
1783 data.longest_float = newSVpvn("",0);
1784 data.last_found = newSVpvn("",0);
c277df42
IZ
1785 data.longest = &(data.longest_fixed);
1786 first = scan;
653099ff 1787 if (!r->regstclass) {
830247a4 1788 cl_init(pRExC_state, &ch_class);
653099ff
GS
1789 data.start_class = &ch_class;
1790 stclass_flag = SCF_DO_STCLASS_AND;
1791 } else /* XXXX Check for BOUND? */
1792 stclass_flag = 0;
cb434fcc 1793 data.last_closep = &last_close;
653099ff 1794
830247a4 1795 minlen = study_chunk(pRExC_state, &first, &fake, scan + RExC_size, /* Up to end */
e1901655 1796 &data, SCF_DO_SUBSTR | SCF_WHILEM_VISITED_POS | stclass_flag);
830247a4 1797 if ( RExC_npar == 1 && data.longest == &(data.longest_fixed)
b81d288d 1798 && data.last_start_min == 0 && data.last_end > 0
830247a4
IZ
1799 && !RExC_seen_zerolen
1800 && (!(RExC_seen & REG_SEEN_GPOS) || (r->reganch & ROPT_ANCH_GPOS)))
c277df42 1801 r->reganch |= ROPT_CHECK_ALL;
830247a4 1802 scan_commit(pRExC_state, &data);
c277df42
IZ
1803 SvREFCNT_dec(data.last_found);
1804
a0ed51b3 1805 longest_float_length = CHR_SVLEN(data.longest_float);
c5254dd6 1806 if (longest_float_length
c277df42
IZ
1807 || (data.flags & SF_FL_BEFORE_EOL
1808 && (!(data.flags & SF_FL_BEFORE_MEOL)
830247a4 1809 || (RExC_flags16 & PMf_MULTILINE)))) {
cf93c79d
IZ
1810 int t;
1811
a0ed51b3 1812 if (SvCUR(data.longest_fixed) /* ok to leave SvCUR */
aca2d497
IZ
1813 && data.offset_fixed == data.offset_float_min
1814 && SvCUR(data.longest_fixed) == SvCUR(data.longest_float))
1815 goto remove_float; /* As in (a)+. */
1816
c277df42
IZ
1817 r->float_substr = data.longest_float;
1818 r->float_min_offset = data.offset_float_min;
1819 r->float_max_offset = data.offset_float_max;
cf93c79d
IZ
1820 t = (data.flags & SF_FL_BEFORE_EOL /* Can't have SEOL and MULTI */
1821 && (!(data.flags & SF_FL_BEFORE_MEOL)
830247a4 1822 || (RExC_flags16 & PMf_MULTILINE)));
cf93c79d 1823 fbm_compile(r->float_substr, t ? FBMcf_TAIL : 0);
a0ed51b3
LW
1824 }
1825 else {
aca2d497 1826 remove_float:
c277df42
IZ
1827 r->float_substr = Nullsv;
1828 SvREFCNT_dec(data.longest_float);
c5254dd6 1829 longest_float_length = 0;
a0d0e21e 1830 }
c277df42 1831
a0ed51b3 1832 longest_fixed_length = CHR_SVLEN(data.longest_fixed);
c5254dd6 1833 if (longest_fixed_length
c277df42
IZ
1834 || (data.flags & SF_FIX_BEFORE_EOL /* Cannot have SEOL and MULTI */
1835 && (!(data.flags & SF_FIX_BEFORE_MEOL)
830247a4 1836 || (RExC_flags16 & PMf_MULTILINE)))) {
cf93c79d
IZ
1837 int t;
1838
c277df42
IZ
1839 r->anchored_substr = data.longest_fixed;
1840 r->anchored_offset = data.offset_fixed;
cf93c79d
IZ
1841 t = (data.flags & SF_FIX_BEFORE_EOL /* Can't have SEOL and MULTI */
1842 && (!(data.flags & SF_FIX_BEFORE_MEOL)
830247a4 1843 || (RExC_flags16 & PMf_MULTILINE)));
cf93c79d 1844 fbm_compile(r->anchored_substr, t ? FBMcf_TAIL : 0);
a0ed51b3
LW
1845 }
1846 else {
c277df42
IZ
1847 r->anchored_substr = Nullsv;
1848 SvREFCNT_dec(data.longest_fixed);
c5254dd6 1849 longest_fixed_length = 0;
a0d0e21e 1850 }
b81d288d 1851 if (r->regstclass
ffc61ed2 1852 && (OP(r->regstclass) == REG_ANY || OP(r->regstclass) == SANY))
653099ff
GS
1853 r->regstclass = NULL;
1854 if ((!r->anchored_substr || r->anchored_offset) && stclass_flag
1855 && !(data.start_class->flags & ANYOF_EOS)
1856 && !cl_is_anything(data.start_class)) {
1857 SV *sv;
830247a4 1858 I32 n = add_data(pRExC_state, 1, "f");
653099ff 1859
b81d288d 1860 New(1006, RExC_rx->data->data[n], 1,
653099ff
GS
1861 struct regnode_charclass_class);
1862 StructCopy(data.start_class,
830247a4 1863 (struct regnode_charclass_class*)RExC_rx->data->data[n],
653099ff 1864 struct regnode_charclass_class);
830247a4 1865 r->regstclass = (regnode*)RExC_rx->data->data[n];
653099ff 1866 r->reganch &= ~ROPT_SKIP; /* Used in find_byclass(). */
ffc61ed2 1867 PL_regdata = r->data; /* for regprop() */
653099ff
GS
1868 DEBUG_r((sv = sv_newmortal(),
1869 regprop(sv, (regnode*)data.start_class),
894356b3 1870 PerlIO_printf(Perl_debug_log, "synthetic stclass `%s'.\n",
653099ff
GS
1871 SvPVX(sv))));
1872 }
c277df42
IZ
1873
1874 /* A temporary algorithm prefers floated substr to fixed one to dig more info. */
c5254dd6 1875 if (longest_fixed_length > longest_float_length) {
c277df42
IZ
1876 r->check_substr = r->anchored_substr;
1877 r->check_offset_min = r->check_offset_max = r->anchored_offset;
1878 if (r->reganch & ROPT_ANCH_SINGLE)
1879 r->reganch |= ROPT_NOSCAN;
a0ed51b3
LW
1880 }
1881 else {
c277df42
IZ
1882 r->check_substr = r->float_substr;
1883 r->check_offset_min = data.offset_float_min;
1884 r->check_offset_max = data.offset_float_max;
a0d0e21e 1885 }
30382c73
IZ
1886 /* XXXX Currently intuiting is not compatible with ANCH_GPOS.
1887 This should be changed ASAP! */
1888 if (r->check_substr && !(r->reganch & ROPT_ANCH_GPOS)) {
cad2e5aa
JH
1889 r->reganch |= RE_USE_INTUIT;
1890 if (SvTAIL(r->check_substr))
1891 r->reganch |= RE_INTUIT_TAIL;
1892 }
a0ed51b3
LW
1893 }
1894 else {
c277df42
IZ
1895 /* Several toplevels. Best we can is to set minlen. */
1896 I32 fake;
653099ff 1897 struct regnode_charclass_class ch_class;
cb434fcc 1898 I32 last_close = 0;
c277df42
IZ
1899
1900 DEBUG_r(PerlIO_printf(Perl_debug_log, "\n"));
1901 scan = r->program + 1;
830247a4 1902 cl_init(pRExC_state, &ch_class);
653099ff 1903 data.start_class = &ch_class;
cb434fcc 1904 data.last_closep = &last_close;
e1901655 1905 minlen = study_chunk(pRExC_state, &scan, &fake, scan + RExC_size, &data, SCF_DO_STCLASS_AND|SCF_WHILEM_VISITED_POS);
c277df42 1906 r->check_substr = r->anchored_substr = r->float_substr = Nullsv;
653099ff
GS
1907 if (!(data.start_class->flags & ANYOF_EOS)
1908 && !cl_is_anything(data.start_class)) {
1909 SV *sv;
830247a4 1910 I32 n = add_data(pRExC_state, 1, "f");
653099ff 1911
b81d288d 1912 New(1006, RExC_rx->data->data[n], 1,
653099ff
GS
1913 struct regnode_charclass_class);
1914 StructCopy(data.start_class,
830247a4 1915 (struct regnode_charclass_class*)RExC_rx->data->data[n],
653099ff 1916 struct regnode_charclass_class);
830247a4 1917 r->regstclass = (regnode*)RExC_rx->data->data[n];
653099ff
GS
1918 r->reganch &= ~ROPT_SKIP; /* Used in find_byclass(). */
1919 DEBUG_r((sv = sv_newmortal(),
1920 regprop(sv, (regnode*)data.start_class),
894356b3 1921 PerlIO_printf(Perl_debug_log, "synthetic stclass `%s'.\n",
653099ff
GS
1922 SvPVX(sv))));
1923 }
a0d0e21e
LW
1924 }
1925
a0d0e21e 1926 r->minlen = minlen;
b81d288d 1927 if (RExC_seen & REG_SEEN_GPOS)
c277df42 1928 r->reganch |= ROPT_GPOS_SEEN;
830247a4 1929 if (RExC_seen & REG_SEEN_LOOKBEHIND)
c277df42 1930 r->reganch |= ROPT_LOOKBEHIND_SEEN;
830247a4 1931 if (RExC_seen & REG_SEEN_EVAL)
ce862d02 1932 r->reganch |= ROPT_EVAL_SEEN;
3baa4c62
JH
1933 if (RExC_seen & REG_SEEN_SANY)
1934 r->reganch |= ROPT_SANY_SEEN;
830247a4
IZ
1935 Newz(1002, r->startp, RExC_npar, I32);
1936 Newz(1002, r->endp, RExC_npar, I32);
ffc61ed2 1937 PL_regdata = r->data; /* for regprop() */
a0d0e21e
LW
1938 DEBUG_r(regdump(r));
1939 return(r);
a687059c
LW
1940}
1941
1942/*
1943 - reg - regular expression, i.e. main body or parenthesized thing
1944 *
1945 * Caller must absorb opening parenthesis.
1946 *
1947 * Combining parenthesis handling with the base level of regular expression
1948 * is a trifle forced, but the need to tie the tails of the branches to what
1949 * follows makes it hard to avoid.
1950 */
76e3520e 1951STATIC regnode *
830247a4 1952S_reg(pTHX_ RExC_state_t *pRExC_state, I32 paren, I32 *flagp)
c277df42 1953 /* paren: Parenthesized? 0=top, 1=(, inside: changed to letter. */
a687059c 1954{
c277df42
IZ
1955 register regnode *ret; /* Will be the head of the group. */
1956 register regnode *br;
1957 register regnode *lastbr;
1958 register regnode *ender = 0;
a0d0e21e 1959 register I32 parno = 0;
830247a4
IZ
1960 I32 flags, oregflags = RExC_flags16, have_branch = 0, open = 0;
1961 char *oregcomp_parse = RExC_parse;
c277df42 1962 char c;
a0d0e21e 1963
821b33a5 1964 *flagp = 0; /* Tentatively. */
a0d0e21e
LW
1965
1966 /* Make an OPEN node, if parenthesized. */
1967 if (paren) {
830247a4 1968 if (*RExC_parse == '?') {
ca9dfc88
IZ
1969 U16 posflags = 0, negflags = 0;
1970 U16 *flagsp = &posflags;
0f5d15d6 1971 int logical = 0;
830247a4 1972 char *seqstart = RExC_parse;
ca9dfc88 1973
830247a4
IZ
1974 RExC_parse++;
1975 paren = *RExC_parse++;
c277df42 1976 ret = NULL; /* For look-ahead/behind. */
a0d0e21e 1977 switch (paren) {
c277df42 1978 case '<':
830247a4 1979 RExC_seen |= REG_SEEN_LOOKBEHIND;
b81d288d 1980 if (*RExC_parse == '!')
c277df42 1981 paren = ',';
b81d288d 1982 if (*RExC_parse != '=' && *RExC_parse != '!')
c277df42 1983 goto unknown;
830247a4 1984 RExC_parse++;
a0d0e21e
LW
1985 case '=':
1986 case '!':
830247a4 1987 RExC_seen_zerolen++;
c277df42
IZ
1988 case ':':
1989 case '>':
a0d0e21e
LW
1990 break;
1991 case '$':
1992 case '@':
8615cb43 1993 vFAIL2("Sequence (?%c...) not implemented", (int)paren);
a0d0e21e
LW
1994 break;
1995 case '#':
830247a4
IZ
1996 while (*RExC_parse && *RExC_parse != ')')
1997 RExC_parse++;
1998 if (*RExC_parse != ')')
c277df42 1999 FAIL("Sequence (?#... not terminated");
830247a4 2000 nextchar(pRExC_state);
a0d0e21e
LW
2001 *flagp = TRYAGAIN;
2002 return NULL;
8c8ad484 2003 case 'p':
d7e9e385 2004 if (SIZE_ONLY)
830247a4 2005 vWARN(RExC_parse, "(?p{}) is deprecated - use (??{})");
8c8ad484 2006 /* FALL THROUGH*/
14455d6c 2007 case '?':
0f5d15d6 2008 logical = 1;
830247a4 2009 paren = *RExC_parse++;
0f5d15d6 2010 /* FALL THROUGH */
c277df42
IZ
2011 case '{':
2012 {
c277df42
IZ
2013 I32 count = 1, n = 0;
2014 char c;
830247a4 2015 char *s = RExC_parse;
c277df42
IZ
2016 SV *sv;
2017 OP_4tree *sop, *rop;
2018
830247a4
IZ
2019 RExC_seen_zerolen++;
2020 RExC_seen |= REG_SEEN_EVAL;
2021 while (count && (c = *RExC_parse)) {
2022 if (c == '\\' && RExC_parse[1])
2023 RExC_parse++;
b81d288d 2024 else if (c == '{')
c277df42 2025 count++;
b81d288d 2026 else if (c == '}')
c277df42 2027 count--;
830247a4 2028 RExC_parse++;
c277df42 2029 }
830247a4 2030 if (*RExC_parse != ')')
b45f050a 2031 {
b81d288d 2032 RExC_parse = s;
b45f050a
JF
2033 vFAIL("Sequence (?{...}) not terminated or not {}-balanced");
2034 }
c277df42
IZ
2035 if (!SIZE_ONLY) {
2036 AV *av;
b81d288d
AB
2037
2038 if (RExC_parse - 1 - s)
830247a4 2039 sv = newSVpvn(s, RExC_parse - 1 - s);
c277df42 2040 else
79cb57f6 2041 sv = newSVpvn("", 0);
c277df42 2042
569233ed
SB
2043 ENTER;
2044 Perl_save_re_context(aTHX);
c277df42 2045 rop = sv_compile_2op(sv, &sop, "re", &av);
569233ed 2046 LEAVE;
c277df42 2047
830247a4
IZ
2048 n = add_data(pRExC_state, 3, "nop");
2049 RExC_rx->data->data[n] = (void*)rop;
2050 RExC_rx->data->data[n+1] = (void*)sop;
2051 RExC_rx->data->data[n+2] = (void*)av;
c277df42 2052 SvREFCNT_dec(sv);
a0ed51b3 2053 }
e24b16f9 2054 else { /* First pass */
830247a4 2055 if (PL_reginterp_cnt < ++RExC_seen_evals
e24b16f9 2056 && PL_curcop != &PL_compiling)
2cd61cdb
IZ
2057 /* No compiled RE interpolated, has runtime
2058 components ===> unsafe. */
2059 FAIL("Eval-group not allowed at runtime, use re 'eval'");
3280af22 2060 if (PL_tainted)
cc6b7395 2061 FAIL("Eval-group in insecure regular expression");
c277df42
IZ
2062 }
2063
830247a4 2064 nextchar(pRExC_state);
0f5d15d6 2065 if (logical) {
830247a4 2066 ret = reg_node(pRExC_state, LOGICAL);
0f5d15d6
IZ
2067 if (!SIZE_ONLY)
2068 ret->flags = 2;
830247a4 2069 regtail(pRExC_state, ret, reganode(pRExC_state, EVAL, n));
0f5d15d6
IZ
2070 return ret;
2071 }
830247a4 2072 return reganode(pRExC_state, EVAL, n);
c277df42
IZ
2073 }
2074 case '(':
2075 {
830247a4 2076 if (RExC_parse[0] == '?') {
b81d288d
AB
2077 if (RExC_parse[1] == '=' || RExC_parse[1] == '!'
2078 || RExC_parse[1] == '<'
830247a4 2079 || RExC_parse[1] == '{') { /* Lookahead or eval. */
c277df42
IZ
2080 I32 flag;
2081
830247a4 2082 ret = reg_node(pRExC_state, LOGICAL);
0f5d15d6
IZ
2083 if (!SIZE_ONLY)
2084 ret->flags = 1;
830247a4 2085 regtail(pRExC_state, ret, reg(pRExC_state, 1, &flag));
c277df42 2086 goto insert_if;
b81d288d 2087 }
a0ed51b3 2088 }
830247a4
IZ
2089 else if (RExC_parse[0] >= '1' && RExC_parse[0] <= '9' ) {
2090 parno = atoi(RExC_parse++);
c277df42 2091
830247a4
IZ
2092 while (isDIGIT(*RExC_parse))
2093 RExC_parse++;
2094 ret = reganode(pRExC_state, GROUPP, parno);
2095 if ((c = *nextchar(pRExC_state)) != ')')
b45f050a 2096 vFAIL("Switch condition not recognized");
c277df42 2097 insert_if:
830247a4
IZ
2098 regtail(pRExC_state, ret, reganode(pRExC_state, IFTHEN, 0));
2099 br = regbranch(pRExC_state, &flags, 1);
c277df42 2100 if (br == NULL)
830247a4 2101 br = reganode(pRExC_state, LONGJMP, 0);
c277df42 2102 else
830247a4
IZ
2103 regtail(pRExC_state, br, reganode(pRExC_state, LONGJMP, 0));
2104 c = *nextchar(pRExC_state);
d1b80229
IZ
2105 if (flags&HASWIDTH)
2106 *flagp |= HASWIDTH;
c277df42 2107 if (c == '|') {
830247a4
IZ
2108 lastbr = reganode(pRExC_state, IFTHEN, 0); /* Fake one for optimizer. */
2109 regbranch(pRExC_state, &flags, 1);
2110 regtail(pRExC_state, ret, lastbr);
d1b80229
IZ
2111 if (flags&HASWIDTH)
2112 *flagp |= HASWIDTH;
830247a4 2113 c = *nextchar(pRExC_state);
a0ed51b3
LW
2114 }
2115 else
c277df42
IZ
2116 lastbr = NULL;
2117 if (c != ')')
8615cb43 2118 vFAIL("Switch (?(condition)... contains too many branches");
830247a4
IZ
2119 ender = reg_node(pRExC_state, TAIL);
2120 regtail(pRExC_state, br, ender);
c277df42 2121 if (lastbr) {
830247a4
IZ
2122 regtail(pRExC_state, lastbr, ender);
2123 regtail(pRExC_state, NEXTOPER(NEXTOPER(lastbr)), ender);
a0ed51b3
LW
2124 }
2125 else
830247a4 2126 regtail(pRExC_state, ret, ender);
c277df42 2127 return ret;
a0ed51b3
LW
2128 }
2129 else {
830247a4 2130 vFAIL2("Unknown switch condition (?(%.2s", RExC_parse);
c277df42
IZ
2131 }
2132 }
1b1626e4 2133 case 0:
830247a4 2134 RExC_parse--; /* for vFAIL to print correctly */
8615cb43 2135 vFAIL("Sequence (? incomplete");
1b1626e4 2136 break;
a0d0e21e 2137 default:
830247a4 2138 --RExC_parse;
ca9dfc88 2139 parse_flags:
830247a4
IZ
2140 while (*RExC_parse && strchr("iogcmsx", *RExC_parse)) {
2141 if (*RExC_parse != 'o')
2142 pmflag(flagsp, *RExC_parse);
2143 ++RExC_parse;
ca9dfc88 2144 }
830247a4 2145 if (*RExC_parse == '-') {
ca9dfc88 2146 flagsp = &negflags;
830247a4 2147 ++RExC_parse;
ca9dfc88 2148 goto parse_flags;
48c036b1 2149 }
830247a4
IZ
2150 RExC_flags16 |= posflags;
2151 RExC_flags16 &= ~negflags;
2152 if (*RExC_parse == ':') {
2153 RExC_parse++;
ca9dfc88
IZ
2154 paren = ':';
2155 break;
2156 }
c277df42 2157 unknown:
830247a4
IZ
2158 if (*RExC_parse != ')') {
2159 RExC_parse++;
2160 vFAIL3("Sequence (%.*s...) not recognized", RExC_parse-seqstart, seqstart);
b45f050a 2161 }
830247a4 2162 nextchar(pRExC_state);
a0d0e21e
LW
2163 *flagp = TRYAGAIN;
2164 return NULL;
2165 }
2166 }
2167 else {
830247a4
IZ
2168 parno = RExC_npar;
2169 RExC_npar++;
2170 ret = reganode(pRExC_state, OPEN, parno);
c277df42 2171 open = 1;
a0d0e21e 2172 }
a0ed51b3
LW
2173 }
2174 else
a0d0e21e
LW
2175 ret = NULL;
2176
2177 /* Pick up the branches, linking them together. */
830247a4 2178 br = regbranch(pRExC_state, &flags, 1);
a0d0e21e
LW
2179 if (br == NULL)
2180 return(NULL);
830247a4
IZ
2181 if (*RExC_parse == '|') {
2182 if (!SIZE_ONLY && RExC_extralen) {
2183 reginsert(pRExC_state, BRANCHJ, br);
a0ed51b3
LW
2184 }
2185 else
830247a4 2186 reginsert(pRExC_state, BRANCH, br);
c277df42
IZ
2187 have_branch = 1;
2188 if (SIZE_ONLY)
830247a4 2189 RExC_extralen += 1; /* For BRANCHJ-BRANCH. */
a0ed51b3
LW
2190 }
2191 else if (paren == ':') {
c277df42
IZ
2192 *flagp |= flags&SIMPLE;
2193 }
2194 if (open) { /* Starts with OPEN. */
830247a4 2195 regtail(pRExC_state, ret, br); /* OPEN -> first. */
a0ed51b3
LW
2196 }
2197 else if (paren != '?') /* Not Conditional */
a0d0e21e 2198 ret = br;
821b33a5
IZ
2199 if (flags&HASWIDTH)
2200 *flagp |= HASWIDTH;
a0d0e21e 2201 *flagp |= flags&SPSTART;
c277df42 2202 lastbr = br;
830247a4
IZ
2203 while (*RExC_parse == '|') {
2204 if (!SIZE_ONLY && RExC_extralen) {
2205 ender = reganode(pRExC_state, LONGJMP,0);
2206 regtail(pRExC_state, NEXTOPER(NEXTOPER(lastbr)), ender); /* Append to the previous. */
c277df42
IZ
2207 }
2208 if (SIZE_ONLY)
830247a4
IZ
2209 RExC_extralen += 2; /* Account for LONGJMP. */
2210 nextchar(pRExC_state);
2211 br = regbranch(pRExC_state, &flags, 0);
a687059c 2212 if (br == NULL)
a0d0e21e 2213 return(NULL);
830247a4 2214 regtail(pRExC_state, lastbr, br); /* BRANCH -> BRANCH. */
c277df42 2215 lastbr = br;
821b33a5
IZ
2216 if (flags&HASWIDTH)
2217 *flagp |= HASWIDTH;
a687059c 2218 *flagp |= flags&SPSTART;
a0d0e21e
LW
2219 }
2220
c277df42
IZ
2221 if (have_branch || paren != ':') {
2222 /* Make a closing node, and hook it on the end. */
2223 switch (paren) {
2224 case ':':
830247a4 2225 ender = reg_node(pRExC_state, TAIL);
c277df42
IZ
2226 break;
2227 case 1:
830247a4 2228 ender = reganode(pRExC_state, CLOSE, parno);
c277df42
IZ
2229 break;
2230 case '<':
c277df42
IZ
2231 case ',':
2232 case '=':
2233 case '!':
c277df42 2234 *flagp &= ~HASWIDTH;
821b33a5
IZ
2235 /* FALL THROUGH */
2236 case '>':
830247a4 2237 ender = reg_node(pRExC_state, SUCCEED);
c277df42
IZ
2238 break;
2239 case 0:
830247a4 2240 ender = reg_node(pRExC_state, END);
c277df42
IZ
2241 break;
2242 }
830247a4 2243 regtail(pRExC_state, lastbr, ender);
a0d0e21e 2244
c277df42
IZ
2245 if (have_branch) {
2246 /* Hook the tails of the branches to the closing node. */
2247 for (br = ret; br != NULL; br = regnext(br)) {
830247a4 2248 regoptail(pRExC_state, br, ender);
c277df42
IZ
2249 }
2250 }
a0d0e21e 2251 }
c277df42
IZ
2252
2253 {
2254 char *p;
2255 static char parens[] = "=!<,>";
2256
2257 if (paren && (p = strchr(parens, paren))) {
2258 int node = ((p - parens) % 2) ? UNLESSM : IFMATCH;
2259 int flag = (p - parens) > 1;
2260
2261 if (paren == '>')
2262 node = SUSPEND, flag = 0;
830247a4 2263 reginsert(pRExC_state, node,ret);
c277df42 2264 ret->flags = flag;
830247a4 2265 regtail(pRExC_state, ret, reg_node(pRExC_state, TAIL));
c277df42 2266 }
a0d0e21e
LW
2267 }
2268
2269 /* Check for proper termination. */
ce3e6498 2270 if (paren) {
830247a4
IZ
2271 RExC_flags16 = oregflags;
2272 if (RExC_parse >= RExC_end || *nextchar(pRExC_state) != ')') {
2273 RExC_parse = oregcomp_parse;
380a0633 2274 vFAIL("Unmatched (");
ce3e6498 2275 }
a0ed51b3 2276 }
830247a4
IZ
2277 else if (!paren && RExC_parse < RExC_end) {
2278 if (*RExC_parse == ')') {
2279 RExC_parse++;
380a0633 2280 vFAIL("Unmatched )");
a0ed51b3
LW
2281 }
2282 else
b45f050a 2283 FAIL("Junk on end of regexp"); /* "Can't happen". */
a0d0e21e
LW
2284 /* NOTREACHED */
2285 }
a687059c 2286
a0d0e21e 2287 return(ret);
a687059c
LW
2288}
2289
2290/*
2291 - regbranch - one alternative of an | operator
2292 *
2293 * Implements the concatenation operator.
2294 */
76e3520e 2295STATIC regnode *
830247a4 2296S_regbranch(pTHX_ RExC_state_t *pRExC_state, I32 *flagp, I32 first)
a687059c 2297{
c277df42
IZ
2298 register regnode *ret;
2299 register regnode *chain = NULL;
2300 register regnode *latest;
2301 I32 flags = 0, c = 0;
a0d0e21e 2302
b81d288d 2303 if (first)
c277df42
IZ
2304 ret = NULL;
2305 else {
b81d288d 2306 if (!SIZE_ONLY && RExC_extralen)
830247a4 2307 ret = reganode(pRExC_state, BRANCHJ,0);
c277df42 2308 else
830247a4 2309 ret = reg_node(pRExC_state, BRANCH);
c277df42
IZ
2310 }
2311
b81d288d 2312 if (!first && SIZE_ONLY)
830247a4 2313 RExC_extralen += 1; /* BRANCHJ */
b81d288d 2314
c277df42 2315 *flagp = WORST; /* Tentatively. */
a0d0e21e 2316
830247a4
IZ
2317 RExC_parse--;
2318 nextchar(pRExC_state);
2319 while (RExC_parse < RExC_end && *RExC_parse != '|' && *RExC_parse != ')') {
a0d0e21e 2320 flags &= ~TRYAGAIN;
830247a4 2321 latest = regpiece(pRExC_state, &flags);
a0d0e21e
LW
2322 if (latest == NULL) {
2323 if (flags & TRYAGAIN)
2324 continue;
2325 return(NULL);
a0ed51b3
LW
2326 }
2327 else if (ret == NULL)
c277df42 2328 ret = latest;
a0d0e21e 2329 *flagp |= flags&HASWIDTH;
c277df42 2330 if (chain == NULL) /* First piece. */
a0d0e21e
LW
2331 *flagp |= flags&SPSTART;
2332 else {
830247a4
IZ
2333 RExC_naughty++;
2334 regtail(pRExC_state, chain, latest);
a687059c 2335 }
a0d0e21e 2336 chain = latest;
c277df42
IZ
2337 c++;
2338 }
2339 if (chain == NULL) { /* Loop ran zero times. */
830247a4 2340 chain = reg_node(pRExC_state, NOTHING);
c277df42
IZ
2341 if (ret == NULL)
2342 ret = chain;
2343 }
2344 if (c == 1) {
2345 *flagp |= flags&SIMPLE;
a0d0e21e 2346 }
a687059c 2347
a0d0e21e 2348 return(ret);
a687059c
LW
2349}
2350
2351/*
2352 - regpiece - something followed by possible [*+?]
2353 *
2354 * Note that the branching code sequences used for ? and the general cases
2355 * of * and + are somewhat optimized: they use the same NOTHING node as
2356 * both the endmarker for their branch list and the body of the last branch.
2357 * It might seem that this node could be dispensed with entirely, but the
2358 * endmarker role is not redundant.
2359 */
76e3520e 2360STATIC regnode *
830247a4 2361S_regpiece(pTHX_ RExC_state_t *pRExC_state, I32 *flagp)
a687059c 2362{
c277df42 2363 register regnode *ret;
a0d0e21e
LW
2364 register char op;
2365 register char *next;
2366 I32 flags;
830247a4 2367 char *origparse = RExC_parse;
a0d0e21e
LW
2368 char *maxpos;
2369 I32 min;
c277df42 2370 I32 max = REG_INFTY;
a0d0e21e 2371
830247a4 2372 ret = regatom(pRExC_state, &flags);
a0d0e21e
LW
2373 if (ret == NULL) {
2374 if (flags & TRYAGAIN)
2375 *flagp |= TRYAGAIN;
2376 return(NULL);
2377 }
2378
830247a4 2379 op = *RExC_parse;
a0d0e21e 2380
830247a4
IZ
2381 if (op == '{' && regcurly(RExC_parse)) {
2382 next = RExC_parse + 1;
a0d0e21e
LW
2383 maxpos = Nullch;
2384 while (isDIGIT(*next) || *next == ',') {
2385 if (*next == ',') {
2386 if (maxpos)
2387 break;
2388 else
2389 maxpos = next;
a687059c 2390 }
a0d0e21e
LW
2391 next++;
2392 }
2393 if (*next == '}') { /* got one */
2394 if (!maxpos)
2395 maxpos = next;
830247a4
IZ
2396 RExC_parse++;
2397 min = atoi(RExC_parse);
a0d0e21e
LW
2398 if (*maxpos == ',')
2399 maxpos++;
2400 else
830247a4 2401 maxpos = RExC_parse;
a0d0e21e
LW
2402 max = atoi(maxpos);
2403 if (!max && *maxpos != '0')
c277df42
IZ
2404 max = REG_INFTY; /* meaning "infinity" */
2405 else if (max >= REG_INFTY)
8615cb43 2406 vFAIL2("Quantifier in {,} bigger than %d", REG_INFTY - 1);
830247a4
IZ
2407 RExC_parse = next;
2408 nextchar(pRExC_state);
a0d0e21e
LW
2409
2410 do_curly:
2411 if ((flags&SIMPLE)) {
830247a4
IZ
2412 RExC_naughty += 2 + RExC_naughty / 2;
2413 reginsert(pRExC_state, CURLY, ret);
a0d0e21e
LW
2414 }
2415 else {
830247a4 2416 regnode *w = reg_node(pRExC_state, WHILEM);
2c2d71f5
JH
2417
2418 w->flags = 0;
830247a4
IZ
2419 regtail(pRExC_state, ret, w);
2420 if (!SIZE_ONLY && RExC_extralen) {
2421 reginsert(pRExC_state, LONGJMP,ret);
2422 reginsert(pRExC_state, NOTHING,ret);
c277df42
IZ
2423 NEXT_OFF(ret) = 3; /* Go over LONGJMP. */
2424 }
830247a4
IZ
2425 reginsert(pRExC_state, CURLYX,ret);
2426 if (!SIZE_ONLY && RExC_extralen)
c277df42 2427 NEXT_OFF(ret) = 3; /* Go over NOTHING to LONGJMP. */
830247a4 2428 regtail(pRExC_state, ret, reg_node(pRExC_state, NOTHING));
c277df42 2429 if (SIZE_ONLY)
830247a4
IZ
2430 RExC_whilem_seen++, RExC_extralen += 3;
2431 RExC_naughty += 4 + RExC_naughty; /* compound interest */
a0d0e21e 2432 }
c277df42 2433 ret->flags = 0;
a0d0e21e
LW
2434
2435 if (min > 0)
821b33a5
IZ
2436 *flagp = WORST;
2437 if (max > 0)
2438 *flagp |= HASWIDTH;
a0d0e21e 2439 if (max && max < min)
8615cb43 2440 vFAIL("Can't do {n,m} with n > m");
c277df42
IZ
2441 if (!SIZE_ONLY) {
2442 ARG1_SET(ret, min);
2443 ARG2_SET(ret, max);
a687059c 2444 }
a687059c 2445
a0d0e21e 2446 goto nest_check;
a687059c 2447 }
a0d0e21e 2448 }
a687059c 2449
a0d0e21e
LW
2450 if (!ISMULT1(op)) {
2451 *flagp = flags;
a687059c 2452 return(ret);
a0d0e21e 2453 }
bb20fd44 2454
c277df42 2455#if 0 /* Now runtime fix should be reliable. */
b45f050a
JF
2456
2457 /* if this is reinstated, don't forget to put this back into perldiag:
2458
2459 =item Regexp *+ operand could be empty at {#} in regex m/%s/
2460
2461 (F) The part of the regexp subject to either the * or + quantifier
2462 could match an empty string. The {#} shows in the regular
2463 expression about where the problem was discovered.
2464
2465 */
2466
bb20fd44 2467 if (!(flags&HASWIDTH) && op != '?')
b45f050a 2468 vFAIL("Regexp *+ operand could be empty");
b81d288d 2469#endif
bb20fd44 2470
830247a4 2471 nextchar(pRExC_state);
a0d0e21e 2472
821b33a5 2473 *flagp = (op != '+') ? (WORST|SPSTART|HASWIDTH) : (WORST|HASWIDTH);
a0d0e21e
LW
2474
2475 if (op == '*' && (flags&SIMPLE)) {
830247a4 2476 reginsert(pRExC_state, STAR, ret);
c277df42 2477 ret->flags = 0;
830247a4 2478 RExC_naughty += 4;
a0d0e21e
LW
2479 }
2480 else if (op == '*') {
2481 min = 0;
2482 goto do_curly;
a0ed51b3
LW
2483 }
2484 else if (op == '+' && (flags&SIMPLE)) {
830247a4 2485 reginsert(pRExC_state, PLUS, ret);
c277df42 2486 ret->flags = 0;
830247a4 2487 RExC_naughty += 3;
a0d0e21e
LW
2488 }
2489 else if (op == '+') {
2490 min = 1;
2491 goto do_curly;
a0ed51b3
LW
2492 }
2493 else if (op == '?') {
a0d0e21e
LW
2494 min = 0; max = 1;
2495 goto do_curly;
2496 }
2497 nest_check:
e476b1b5 2498 if (ckWARN(WARN_REGEXP) && !SIZE_ONLY && !(flags&HASWIDTH) && max > REG_INFTY/3) {
830247a4 2499 vWARN3(RExC_parse,
b45f050a 2500 "%.*s matches null string many times",
830247a4 2501 RExC_parse - origparse,
b45f050a 2502 origparse);
a0d0e21e
LW
2503 }
2504
830247a4
IZ
2505 if (*RExC_parse == '?') {
2506 nextchar(pRExC_state);
2507 reginsert(pRExC_state, MINMOD, ret);
2508 regtail(pRExC_state, ret, ret + NODE_STEP_REGNODE);
a0d0e21e 2509 }
830247a4
IZ
2510 if (ISMULT2(RExC_parse)) {
2511 RExC_parse++;
b45f050a
JF
2512 vFAIL("Nested quantifiers");
2513 }
a0d0e21e
LW
2514
2515 return(ret);
a687059c
LW
2516}
2517
2518/*
2519 - regatom - the lowest level
2520 *
2521 * Optimization: gobbles an entire sequence of ordinary characters so that
2522 * it can turn them into a single node, which is smaller to store and
2523 * faster to run. Backslashed characters are exceptions, each becoming a
2524 * separate node; the code is simpler that way and it's not worth fixing.
2525 *
b45f050a 2526 * [Yes, it is worth fixing, some scripts can run twice the speed.] */
76e3520e 2527STATIC regnode *
830247a4 2528S_regatom(pTHX_ RExC_state_t *pRExC_state, I32 *flagp)
a687059c 2529{
c277df42 2530 register regnode *ret = 0;
a0d0e21e
LW
2531 I32 flags;
2532
2533 *flagp = WORST; /* Tentatively. */
2534
2535tryagain:
830247a4 2536 switch (*RExC_parse) {
a0d0e21e 2537 case '^':
830247a4
IZ
2538 RExC_seen_zerolen++;
2539 nextchar(pRExC_state);
2540 if (RExC_flags16 & PMf_MULTILINE)
2541 ret = reg_node(pRExC_state, MBOL);
2542 else if (RExC_flags16 & PMf_SINGLELINE)
2543 ret = reg_node(pRExC_state, SBOL);
a0d0e21e 2544 else
830247a4 2545 ret = reg_node(pRExC_state, BOL);
a0d0e21e
LW
2546 break;
2547 case '$':
830247a4 2548 nextchar(pRExC_state);
b81d288d 2549 if (*RExC_parse)
830247a4
IZ
2550 RExC_seen_zerolen++;
2551 if (RExC_flags16 & PMf_MULTILINE)
2552 ret = reg_node(pRExC_state, MEOL);
2553 else if (RExC_flags16 & PMf_SINGLELINE)
2554 ret = reg_node(pRExC_state, SEOL);
a0d0e21e 2555 else
830247a4 2556 ret = reg_node(pRExC_state, EOL);
a0d0e21e
LW
2557 break;
2558 case '.':
830247a4 2559 nextchar(pRExC_state);
ffc61ed2
JH
2560 if (RExC_flags16 & PMf_SINGLELINE)
2561 ret = reg_node(pRExC_state, SANY);
2562 else
2563 ret = reg_node(pRExC_state, REG_ANY);
2564 *flagp |= HASWIDTH|SIMPLE;
830247a4 2565 RExC_naughty++;
a0d0e21e
LW
2566 break;
2567 case '[':
b45f050a 2568 {
830247a4 2569 char *oregcomp_parse = ++RExC_parse;
ffc61ed2 2570 ret = regclass(pRExC_state);
830247a4
IZ
2571 if (*RExC_parse != ']') {
2572 RExC_parse = oregcomp_parse;
b45f050a
JF
2573 vFAIL("Unmatched [");
2574 }
830247a4 2575 nextchar(pRExC_state);
a0d0e21e
LW
2576 *flagp |= HASWIDTH|SIMPLE;
2577 break;
b45f050a 2578 }
a0d0e21e 2579 case '(':
830247a4
IZ
2580 nextchar(pRExC_state);
2581 ret = reg(pRExC_state, 1, &flags);
a0d0e21e 2582 if (ret == NULL) {
bf93d4cc 2583 if (flags & TRYAGAIN) {
830247a4 2584 if (RExC_parse == RExC_end) {
bf93d4cc
GS
2585 /* Make parent create an empty node if needed. */
2586 *flagp |= TRYAGAIN;
2587 return(NULL);
2588 }
a0d0e21e 2589 goto tryagain;
bf93d4cc 2590 }
a0d0e21e
LW
2591 return(NULL);
2592 }
c277df42 2593 *flagp |= flags&(HASWIDTH|SPSTART|SIMPLE);
a0d0e21e
LW
2594 break;
2595 case '|':
2596 case ')':
2597 if (flags & TRYAGAIN) {
2598 *flagp |= TRYAGAIN;
2599 return NULL;
2600 }
b45f050a 2601 vFAIL("Internal urp");
a0d0e21e
LW
2602 /* Supposed to be caught earlier. */
2603 break;
85afd4ae 2604 case '{':
830247a4
IZ
2605 if (!regcurly(RExC_parse)) {
2606 RExC_parse++;
85afd4ae
CS
2607 goto defchar;
2608 }
2609 /* FALL THROUGH */
a0d0e21e
LW
2610 case '?':
2611 case '+':
2612 case '*':
830247a4 2613 RExC_parse++;
b45f050a 2614 vFAIL("Quantifier follows nothing");
a0d0e21e
LW
2615 break;
2616 case '\\':
830247a4 2617 switch (*++RExC_parse) {
a0d0e21e 2618 case 'A':
830247a4
IZ
2619 RExC_seen_zerolen++;
2620 ret = reg_node(pRExC_state, SBOL);
a0d0e21e 2621 *flagp |= SIMPLE;
830247a4 2622 nextchar(pRExC_state);
a0d0e21e
LW
2623 break;
2624 case 'G':
830247a4
IZ
2625 ret = reg_node(pRExC_state, GPOS);
2626 RExC_seen |= REG_SEEN_GPOS;
a0d0e21e 2627 *flagp |= SIMPLE;
830247a4 2628 nextchar(pRExC_state);
a0d0e21e
LW
2629 break;
2630 case 'Z':
830247a4 2631 ret = reg_node(pRExC_state, SEOL);
a0d0e21e 2632 *flagp |= SIMPLE;
830247a4 2633 nextchar(pRExC_state);
a0d0e21e 2634 break;
b85d18e9 2635 case 'z':
830247a4 2636 ret = reg_node(pRExC_state, EOS);
b85d18e9 2637 *flagp |= SIMPLE;
830247a4
IZ
2638 RExC_seen_zerolen++; /* Do not optimize RE away */
2639 nextchar(pRExC_state);
b85d18e9 2640 break;
4a2d328f 2641 case 'C':
830247a4 2642 ret = reg_node(pRExC_state, SANY);
3baa4c62 2643 RExC_seen |= REG_SEEN_SANY;
a0ed51b3 2644 *flagp |= HASWIDTH|SIMPLE;
830247a4 2645 nextchar(pRExC_state);
a0ed51b3
LW
2646 break;
2647 case 'X':
830247a4 2648 ret = reg_node(pRExC_state, CLUMP);
a0ed51b3 2649 *flagp |= HASWIDTH;
830247a4 2650 nextchar(pRExC_state);
a0ed51b3 2651 if (UTF && !PL_utf8_mark)
dfe13c55 2652 is_utf8_mark((U8*)"~"); /* preload table */
a0ed51b3 2653 break;
a0d0e21e 2654 case 'w':
ffc61ed2 2655 ret = reg_node(pRExC_state, LOC ? ALNUML : ALNUM);
a0d0e21e 2656 *flagp |= HASWIDTH|SIMPLE;
830247a4 2657 nextchar(pRExC_state);
a0ed51b3 2658 if (UTF && !PL_utf8_alnum)
dfe13c55 2659 is_utf8_alnum((U8*)"a"); /* preload table */
a0d0e21e
LW
2660 break;
2661 case 'W':
ffc61ed2 2662 ret = reg_node(pRExC_state, LOC ? NALNUML : NALNUM);
a0d0e21e 2663 *flagp |= HASWIDTH|SIMPLE;
830247a4 2664 nextchar(pRExC_state);
a0ed51b3 2665 if (UTF && !PL_utf8_alnum)
dfe13c55 2666 is_utf8_alnum((U8*)"a"); /* preload table */
a0d0e21e
LW
2667 break;
2668 case 'b':
830247a4
IZ
2669 RExC_seen_zerolen++;
2670 RExC_seen |= REG_SEEN_LOOKBEHIND;
ffc61ed2 2671 ret = reg_node(pRExC_state, LOC ? BOUNDL : BOUND);
a0d0e21e 2672 *flagp |= SIMPLE;
830247a4 2673 nextchar(pRExC_state);
a0ed51b3 2674 if (UTF && !PL_utf8_alnum)
dfe13c55 2675 is_utf8_alnum((U8*)"a"); /* preload table */
a0d0e21e
LW
2676 break;
2677 case 'B':
830247a4
IZ
2678 RExC_seen_zerolen++;
2679 RExC_seen |= REG_SEEN_LOOKBEHIND;
ffc61ed2 2680 ret = reg_node(pRExC_state, LOC ? NBOUNDL : NBOUND);
a0d0e21e 2681 *flagp |= SIMPLE;
830247a4 2682 nextchar(pRExC_state);
a0ed51b3 2683 if (UTF && !PL_utf8_alnum)
dfe13c55 2684 is_utf8_alnum((U8*)"a"); /* preload table */
a0d0e21e
LW
2685 break;
2686 case 's':
ffc61ed2 2687 ret = reg_node(pRExC_state, LOC ? SPACEL : SPACE);
a0d0e21e 2688 *flagp |= HASWIDTH|SIMPLE;
830247a4 2689 nextchar(pRExC_state);
a0ed51b3 2690 if (UTF && !PL_utf8_space)
dfe13c55 2691 is_utf8_space((U8*)" "); /* preload table */
a0d0e21e
LW
2692 break;
2693 case 'S':
ffc61ed2 2694 ret = reg_node(pRExC_state, LOC ? NSPACEL : NSPACE);
a0d0e21e 2695 *flagp |= HASWIDTH|SIMPLE;
830247a4 2696 nextchar(pRExC_state);
a0ed51b3 2697 if (UTF && !PL_utf8_space)
dfe13c55 2698 is_utf8_space((U8*)" "); /* preload table */
a0d0e21e
LW
2699 break;
2700 case 'd':
ffc61ed2 2701 ret = reg_node(pRExC_state, DIGIT);
a0d0e21e 2702 *flagp |= HASWIDTH|SIMPLE;
830247a4 2703 nextchar(pRExC_state);
a0ed51b3 2704 if (UTF && !PL_utf8_digit)
dfe13c55 2705 is_utf8_digit((U8*)"1"); /* preload table */
a0d0e21e
LW
2706 break;
2707 case 'D':
ffc61ed2 2708 ret = reg_node(pRExC_state, NDIGIT);
a0d0e21e 2709 *flagp |= HASWIDTH|SIMPLE;
830247a4 2710 nextchar(pRExC_state);
a0ed51b3 2711 if (UTF && !PL_utf8_digit)
dfe13c55 2712 is_utf8_digit((U8*)"1"); /* preload table */
a0d0e21e 2713 break;
a14b48bc
LW
2714 case 'p':
2715 case 'P':
2716 { /* a lovely hack--pretend we saw [\pX] instead */
830247a4 2717 char* oldregxend = RExC_end;
a14b48bc 2718
830247a4
IZ
2719 if (RExC_parse[1] == '{') {
2720 RExC_end = strchr(RExC_parse, '}');
2721 if (!RExC_end) {
2722 RExC_parse += 2;
2723 RExC_end = oldregxend;
b45f050a
JF
2724 vFAIL("Missing right brace on \\p{}");
2725 }
830247a4 2726 RExC_end++;
a14b48bc
LW
2727 }
2728 else
830247a4
IZ
2729 RExC_end = RExC_parse + 2;
2730 RExC_parse--;
a14b48bc 2731
ffc61ed2 2732 ret = regclass(pRExC_state);
a14b48bc 2733
830247a4
IZ
2734 RExC_end = oldregxend;
2735 RExC_parse--;
2736 nextchar(pRExC_state);
a14b48bc
LW
2737 *flagp |= HASWIDTH|SIMPLE;
2738 }
2739 break;
a0d0e21e
LW
2740 case 'n':
2741 case 'r':
2742 case 't':
2743 case 'f':
2744 case 'e':
2745 case 'a':
2746 case 'x':
2747 case 'c':
2748 case '0':
2749 goto defchar;
2750 case '1': case '2': case '3': case '4':
2751 case '5': case '6': case '7': case '8': case '9':
2752 {
830247a4 2753 I32 num = atoi(RExC_parse);
a0d0e21e 2754
830247a4 2755 if (num > 9 && num >= RExC_npar)
a0d0e21e
LW
2756 goto defchar;
2757 else {
830247a4
IZ
2758 while (isDIGIT(*RExC_parse))
2759 RExC_parse++;
b45f050a 2760
830247a4 2761 if (!SIZE_ONLY && num > RExC_rx->nparens)
9baa0206 2762 vFAIL("Reference to nonexistent group");
830247a4
IZ
2763 RExC_sawback = 1;
2764 ret = reganode(pRExC_state, FOLD
a0ed51b3 2765 ? (LOC ? REFFL : REFF)
c8756f30 2766 : REF, num);
a0d0e21e 2767 *flagp |= HASWIDTH;
830247a4
IZ
2768 RExC_parse--;
2769 nextchar(pRExC_state);
a0d0e21e
LW
2770 }
2771 }
2772 break;
2773 case '\0':
830247a4 2774 if (RExC_parse >= RExC_end)
b45f050a 2775 FAIL("Trailing \\");
a0d0e21e
LW
2776 /* FALL THROUGH */
2777 default:
c9f97d15
IZ
2778 /* Do not generate `unrecognized' warnings here, we fall
2779 back into the quick-grab loop below */
a0d0e21e
LW
2780 goto defchar;
2781 }
2782 break;
4633a7c4
LW
2783
2784 case '#':
830247a4
IZ
2785 if (RExC_flags16 & PMf_EXTENDED) {
2786 while (RExC_parse < RExC_end && *RExC_parse != '\n') RExC_parse++;
2787 if (RExC_parse < RExC_end)
4633a7c4
LW
2788 goto tryagain;
2789 }
2790 /* FALL THROUGH */
2791
a0d0e21e 2792 default: {
ba210ebe 2793 register STRLEN len;
a0ed51b3 2794 register UV ender;
a0d0e21e 2795 register char *p;
c277df42 2796 char *oldp, *s;
ba210ebe 2797 STRLEN numlen;
a0d0e21e 2798
830247a4 2799 RExC_parse++;
a0d0e21e
LW
2800
2801 defchar:
830247a4 2802 ret = reg_node(pRExC_state, FOLD
a0ed51b3 2803 ? (LOC ? EXACTFL : EXACTF)
bbce6d69 2804 : EXACT);
cd439c50 2805 s = STRING(ret);
830247a4
IZ
2806 for (len = 0, p = RExC_parse - 1;
2807 len < 127 && p < RExC_end;
a0d0e21e
LW
2808 len++)
2809 {
2810 oldp = p;
5b5a24f7 2811
830247a4
IZ
2812 if (RExC_flags16 & PMf_EXTENDED)
2813 p = regwhite(p, RExC_end);
a0d0e21e
LW
2814 switch (*p) {
2815 case '^':
2816 case '$':
2817 case '.':
2818 case '[':
2819 case '(':
2820 case ')':
2821 case '|':
2822 goto loopdone;
2823 case '\\':
2824 switch (*++p) {
2825 case 'A':
2826 case 'G':
2827 case 'Z':
b85d18e9 2828 case 'z':
a0d0e21e
LW
2829 case 'w':
2830 case 'W':
2831 case 'b':
2832 case 'B':
2833 case 's':
2834 case 'S':
2835 case 'd':
2836 case 'D':
a14b48bc
LW
2837 case 'p':
2838 case 'P':
a0d0e21e
LW
2839 --p;
2840 goto loopdone;
2841 case 'n':
2842 ender = '\n';
2843 p++;
a687059c 2844 break;
a0d0e21e
LW
2845 case 'r':
2846 ender = '\r';
2847 p++;
a687059c 2848 break;
a0d0e21e
LW
2849 case 't':
2850 ender = '\t';
2851 p++;
a687059c 2852 break;
a0d0e21e
LW
2853 case 'f':
2854 ender = '\f';
2855 p++;
a687059c 2856 break;
a0d0e21e 2857 case 'e':
0407a77b
GS
2858#ifdef ASCIIish
2859 ender = '\033';
2860#else
2861 ender = '\047';
2862#endif
a0d0e21e 2863 p++;
a687059c 2864 break;
a0d0e21e 2865 case 'a':
0407a77b
GS
2866#ifdef ASCIIish
2867 ender = '\007';
2868#else
2869 ender = '\057';
2870#endif
a0d0e21e 2871 p++;
a687059c 2872 break;
a0d0e21e 2873 case 'x':
a0ed51b3
LW
2874 if (*++p == '{') {
2875 char* e = strchr(p, '}');
b81d288d 2876
b45f050a 2877 if (!e) {
830247a4 2878 RExC_parse = p + 1;
b45f050a
JF
2879 vFAIL("Missing right brace on \\x{}");
2880 }
de5f0749 2881 else {
b21ed0a9 2882 numlen = 1; /* allow underscores */
de35ba6f 2883 ender = (UV)scan_hex(p + 1, e - p - 1, &numlen);
b21ed0a9
GS
2884 /* numlen is generous */
2885 if (numlen + len >= 127) {
a0ed51b3
LW
2886 p--;
2887 goto loopdone;
2888 }
2889 p = e + 1;
2890 }
a0ed51b3
LW
2891 }
2892 else {
b21ed0a9 2893 numlen = 0; /* disallow underscores */
dff6d3cd 2894 ender = (UV)scan_hex(p, 2, &numlen);
a0ed51b3
LW
2895 p += numlen;
2896 }
a687059c 2897 break;
a0d0e21e
LW
2898 case 'c':
2899 p++;
bbce6d69 2900 ender = UCHARAT(p++);
2901 ender = toCTRL(ender);
a687059c 2902 break;
a0d0e21e
LW
2903 case '0': case '1': case '2': case '3':case '4':
2904 case '5': case '6': case '7': case '8':case '9':
2905 if (*p == '0' ||
830247a4 2906 (isDIGIT(p[1]) && atoi(p) >= RExC_npar) ) {
b21ed0a9 2907 numlen = 0; /* disallow underscores */
dff6d3cd 2908 ender = (UV)scan_oct(p, 3, &numlen);
a0d0e21e
LW
2909 p += numlen;
2910 }
2911 else {
2912 --p;
2913 goto loopdone;
a687059c
LW
2914 }
2915 break;
a0d0e21e 2916 case '\0':
830247a4 2917 if (p >= RExC_end)
b45f050a 2918 FAIL("Trailing \\");
a687059c 2919 /* FALL THROUGH */
a0d0e21e 2920 default:
e476b1b5 2921 if (!SIZE_ONLY && ckWARN(WARN_REGEXP) && isALPHA(*p))
b45f050a 2922 vWARN2(p +1, "Unrecognized escape \\%c passed through", *p);
a0ed51b3 2923 goto normal_default;
a0d0e21e
LW
2924 }
2925 break;
a687059c 2926 default:
a0ed51b3 2927 normal_default:
fd400ab9 2928 if (UTF8_IS_START(*p) && UTF) {
830247a4 2929 ender = utf8_to_uv((U8*)p, RExC_end - p,
ba210ebe 2930 &numlen, 0);
a0ed51b3
LW
2931 p += numlen;
2932 }
2933 else
2934 ender = *p++;
a0d0e21e 2935 break;
a687059c 2936 }
830247a4
IZ
2937 if (RExC_flags16 & PMf_EXTENDED)
2938 p = regwhite(p, RExC_end);
a0ed51b3
LW
2939 if (UTF && FOLD) {
2940 if (LOC)
2941 ender = toLOWER_LC_uni(ender);
2942 else
2943 ender = toLOWER_uni(ender);
2944 }
a0d0e21e
LW
2945 if (ISMULT2(p)) { /* Back off on ?+*. */
2946 if (len)
2947 p = oldp;
fd400ab9
JH
2948 /* ender is a Unicode value so it can be > 0xff --
2949 * in other words, do not use UTF8_IS_CONTINUED(). */
a0ed51b3 2950 else if (ender >= 0x80 && UTF) {
830247a4 2951 reguni(pRExC_state, ender, s, &numlen);
a0ed51b3
LW
2952 s += numlen;
2953 len += numlen;
2954 }
a0d0e21e
LW
2955 else {
2956 len++;
cd439c50 2957 REGC(ender, s++);
a0d0e21e
LW
2958 }
2959 break;
a687059c 2960 }
fd400ab9
JH
2961 /* ender is a Unicode value so it can be > 0xff --
2962 * in other words, do not use UTF8_IS_CONTINUED(). */
a0ed51b3 2963 if (ender >= 0x80 && UTF) {
830247a4 2964 reguni(pRExC_state, ender, s, &numlen);
a0ed51b3
LW
2965 s += numlen;
2966 len += numlen - 1;
2967 }
2968 else
cd439c50 2969 REGC(ender, s++);
a0d0e21e
LW
2970 }
2971 loopdone:
830247a4
IZ
2972 RExC_parse = p - 1;
2973 nextchar(pRExC_state);
793db0cb
JH
2974 {
2975 /* len is STRLEN which is unsigned, need to copy to signed */
2976 IV iv = len;
2977 if (iv < 0)
2978 vFAIL("Internal disaster");
2979 }
a0d0e21e
LW
2980 if (len > 0)
2981 *flagp |= HASWIDTH;
2982 if (len == 1)
2983 *flagp |= SIMPLE;
c277df42 2984 if (!SIZE_ONLY)
cd439c50
IZ
2985 STR_LEN(ret) = len;
2986 if (SIZE_ONLY)
830247a4 2987 RExC_size += STR_SZ(len);
cd439c50 2988 else
830247a4 2989 RExC_emit += STR_SZ(len);
a687059c 2990 }
a0d0e21e
LW
2991 break;
2992 }
a687059c 2993
a0d0e21e 2994 return(ret);
a687059c
LW
2995}
2996
873ef191 2997STATIC char *
cea2e8a9 2998S_regwhite(pTHX_ char *p, char *e)
5b5a24f7
CS
2999{
3000 while (p < e) {
3001 if (isSPACE(*p))
3002 ++p;
3003 else if (*p == '#') {
3004 do {
3005 p++;
3006 } while (p < e && *p != '\n');
3007 }
3008 else
3009 break;
3010 }
3011 return p;
3012}
3013
b8c5462f
JH
3014/* Parse POSIX character classes: [[:foo:]], [[=foo=]], [[.foo.]].
3015 Character classes ([:foo:]) can also be negated ([:^foo:]).
3016 Returns a named class id (ANYOF_XXX) if successful, -1 otherwise.
3017 Equivalence classes ([=foo=]) and composites ([.foo.]) are parsed,
3018 but trigger warnings because they are currently unimplemented. */
3019STATIC I32
830247a4 3020S_regpposixcc(pTHX_ RExC_state_t *pRExC_state, I32 value)
620e46c5
JH
3021{
3022 char *posixcc = 0;
936ed897 3023 I32 namedclass = OOB_NAMEDCLASS;
620e46c5 3024
830247a4 3025 if (value == '[' && RExC_parse + 1 < RExC_end &&
620e46c5 3026 /* I smell either [: or [= or [. -- POSIX has been here, right? */
830247a4
IZ
3027 (*RExC_parse == ':' ||
3028 *RExC_parse == '=' ||
3029 *RExC_parse == '.')) {
3030 char c = *RExC_parse;
3031 char* s = RExC_parse++;
b81d288d 3032
830247a4
IZ
3033 while (RExC_parse < RExC_end && *RExC_parse != c)
3034 RExC_parse++;
3035 if (RExC_parse == RExC_end)
620e46c5 3036 /* Grandfather lone [:, [=, [. */
830247a4 3037 RExC_parse = s;
620e46c5 3038 else {
830247a4 3039 char* t = RExC_parse++; /* skip over the c */
b8c5462f 3040
830247a4
IZ
3041 if (*RExC_parse == ']') {
3042 RExC_parse++; /* skip over the ending ] */
b8c5462f
JH
3043 posixcc = s + 1;
3044 if (*s == ':') {
3045 I32 complement = *posixcc == '^' ? *posixcc++ : 0;
3046 I32 skip = 5; /* the most common skip */
3047
3048 switch (*posixcc) {
3049 case 'a':
3050 if (strnEQ(posixcc, "alnum", 5))
3051 namedclass =
3052 complement ? ANYOF_NALNUMC : ANYOF_ALNUMC;
3053 else if (strnEQ(posixcc, "alpha", 5))
3054 namedclass =
3055 complement ? ANYOF_NALPHA : ANYOF_ALPHA;
3056 else if (strnEQ(posixcc, "ascii", 5))
3057 namedclass =
3058 complement ? ANYOF_NASCII : ANYOF_ASCII;
3059 break;
aaa51d5e
JF
3060 case 'b':
3061 if (strnEQ(posixcc, "blank", 5))
3062 namedclass =
3063 complement ? ANYOF_NBLANK : ANYOF_BLANK;
3064 break;
b8c5462f
JH
3065 case 'c':
3066 if (strnEQ(posixcc, "cntrl", 5))
3067 namedclass =
3068 complement ? ANYOF_NCNTRL : ANYOF_CNTRL;
3069 break;
3070 case 'd':
3071 if (strnEQ(posixcc, "digit", 5))
3072 namedclass =
3073 complement ? ANYOF_NDIGIT : ANYOF_DIGIT;
3074 break;
3075 case 'g':
3076 if (strnEQ(posixcc, "graph", 5))
3077 namedclass =
3078 complement ? ANYOF_NGRAPH : ANYOF_GRAPH;
3079 break;
3080 case 'l':
3081 if (strnEQ(posixcc, "lower", 5))
3082 namedclass =
3083 complement ? ANYOF_NLOWER : ANYOF_LOWER;
3084 break;
3085 case 'p':
3086 if (strnEQ(posixcc, "print", 5))
3087 namedclass =
3088 complement ? ANYOF_NPRINT : ANYOF_PRINT;
3089 else if (strnEQ(posixcc, "punct", 5))
3090 namedclass =
3091 complement ? ANYOF_NPUNCT : ANYOF_PUNCT;
3092 break;
3093 case 's':
3094 if (strnEQ(posixcc, "space", 5))
3095 namedclass =
aaa51d5e 3096 complement ? ANYOF_NPSXSPC : ANYOF_PSXSPC;
cc4319de 3097 break;
b8c5462f
JH
3098 case 'u':
3099 if (strnEQ(posixcc, "upper", 5))
3100 namedclass =
3101 complement ? ANYOF_NUPPER : ANYOF_UPPER;
3102 break;
3103 case 'w': /* this is not POSIX, this is the Perl \w */
3104 if (strnEQ(posixcc, "word", 4)) {
3105 namedclass =
3106 complement ? ANYOF_NALNUM : ANYOF_ALNUM;
3107 skip = 4;
3108 }
3109 break;
3110 case 'x':
3111 if (strnEQ(posixcc, "xdigit", 6)) {
3112 namedclass =
3113 complement ? ANYOF_NXDIGIT : ANYOF_XDIGIT;
3114 skip = 6;
3115 }
3116 break;
3117 }
ac561586
JH
3118 if (namedclass == OOB_NAMEDCLASS ||
3119 posixcc[skip] != ':' ||
3120 posixcc[skip+1] != ']')
b45f050a
JF
3121 {
3122 Simple_vFAIL3("POSIX class [:%.*s:] unknown",
3123 t - s - 1, s + 1);
3124 }
3125 } else if (!SIZE_ONLY) {
b8c5462f 3126 /* [[=foo=]] and [[.foo.]] are still future. */
b45f050a 3127
830247a4 3128 /* adjust RExC_parse so the warning shows after
b45f050a 3129 the class closes */
830247a4
IZ
3130 while (*RExC_parse && *RExC_parse != ']')
3131 RExC_parse++;
b45f050a
JF
3132 Simple_vFAIL3("POSIX syntax [%c %c] is reserved for future extensions", c, c);
3133 }
b8c5462f
JH
3134 } else {
3135 /* Maternal grandfather:
3136 * "[:" ending in ":" but not in ":]" */
830247a4 3137 RExC_parse = s;
767d463e 3138 }
620e46c5
JH
3139 }
3140 }
3141
b8c5462f
JH
3142 return namedclass;
3143}
3144
3145STATIC void
830247a4 3146S_checkposixcc(pTHX_ RExC_state_t *pRExC_state)
b8c5462f 3147{
e476b1b5 3148 if (!SIZE_ONLY && ckWARN(WARN_REGEXP) &&
830247a4
IZ
3149 (*RExC_parse == ':' ||
3150 *RExC_parse == '=' ||
3151 *RExC_parse == '.')) {
3152 char *s = RExC_parse;
93733859 3153 char c = *s++;
b8c5462f
JH
3154
3155 while(*s && isALNUM(*s))
3156 s++;
3157 if (*s && c == *s && s[1] == ']') {
b45f050a
JF
3158 vWARN3(s+2, "POSIX syntax [%c %c] belongs inside character classes", c, c);
3159
3160 /* [[=foo=]] and [[.foo.]] are still future. */
b8c5462f 3161 if (c == '=' || c == '.')
b45f050a 3162 {
830247a4 3163 /* adjust RExC_parse so the error shows after
b45f050a 3164 the class closes */
830247a4 3165 while (*RExC_parse && *RExC_parse++ != ']')
b45f050a
JF
3166 ;
3167 Simple_vFAIL3("POSIX syntax [%c %c] is reserved for future extensions", c, c);
3168 }
b8c5462f
JH
3169 }
3170 }
620e46c5
JH
3171}
3172
76e3520e 3173STATIC regnode *
830247a4 3174S_regclass(pTHX_ RExC_state_t *pRExC_state)
a687059c 3175{
ffc61ed2
JH
3176 register UV value;
3177 register IV lastvalue = OOB_UNICODE;
3178 register IV range = 0;
c277df42 3179 register regnode *ret;
ba210ebe 3180 STRLEN numlen;
ffc61ed2 3181 IV namedclass;
73b437c8 3182 char *rangebegin;
936ed897 3183 bool need_class = 0;
ffc61ed2
JH
3184 SV *listsv;
3185 register char *e;
3186 UV n;
1aa99e6b 3187 bool dont_optimize_invert = FALSE;
ffc61ed2
JH
3188
3189 ret = reganode(pRExC_state, ANYOF, 0);
3190
3191 if (!SIZE_ONLY)
3192 ANYOF_FLAGS(ret) = 0;
3193
3194 if (*RExC_parse == '^') { /* Complement of range. */
3195 RExC_naughty++;
3196 RExC_parse++;
3197 if (!SIZE_ONLY)
3198 ANYOF_FLAGS(ret) |= ANYOF_INVERT;
3199 }
a0d0e21e 3200
936ed897 3201 if (SIZE_ONLY)
830247a4 3202 RExC_size += ANYOF_SKIP;
936ed897 3203 else {
830247a4 3204 RExC_emit += ANYOF_SKIP;
936ed897
IZ
3205 if (FOLD)
3206 ANYOF_FLAGS(ret) |= ANYOF_FOLD;
3207 if (LOC)
3208 ANYOF_FLAGS(ret) |= ANYOF_LOCALE;
ffc61ed2
JH
3209 ANYOF_BITMAP_ZERO(ret);
3210 listsv = newSVpvn("# comment\n", 10);
a0d0e21e 3211 }
b8c5462f 3212
e476b1b5 3213 if (!SIZE_ONLY && ckWARN(WARN_REGEXP))
830247a4 3214 checkposixcc(pRExC_state);
b8c5462f 3215
830247a4 3216 if (*RExC_parse == ']' || *RExC_parse == '-')
ffc61ed2
JH
3217 goto charclassloop; /* allow 1st char to be ] or - */
3218
830247a4 3219 while (RExC_parse < RExC_end && *RExC_parse != ']') {
ffc61ed2
JH
3220
3221 charclassloop:
3222
3223 namedclass = OOB_NAMEDCLASS; /* initialize as illegal */
3224
73b437c8 3225 if (!range)
830247a4 3226 rangebegin = RExC_parse;
ffc61ed2
JH
3227 if (UTF) {
3228 value = utf8_to_uv((U8*)RExC_parse,
3229 RExC_end - RExC_parse,
3230 &numlen, 0);
3231 RExC_parse += numlen;
3232 }
3233 else
3234 value = UCHARAT(RExC_parse++);
620e46c5 3235 if (value == '[')
830247a4 3236 namedclass = regpposixcc(pRExC_state, value);
620e46c5 3237 else if (value == '\\') {
ffc61ed2
JH
3238 if (UTF) {
3239 value = utf8_to_uv((U8*)RExC_parse,
3240 RExC_end - RExC_parse,
3241 &numlen, 0);
3242 RExC_parse += numlen;
3243 }
3244 else
3245 value = UCHARAT(RExC_parse++);
470c3474 3246 /* Some compilers cannot handle switching on 64-bit integer
ffc61ed2 3247 * values, therefore value cannot be an UV. Yes, this will
e2962f66
JH
3248 * be a problem later if we want switch on Unicode.
3249 * A similar issue a little bit later when switching on
3250 * namedclass. --jhi */
ffc61ed2 3251 switch ((I32)value) {
b8c5462f
JH
3252 case 'w': namedclass = ANYOF_ALNUM; break;
3253 case 'W': namedclass = ANYOF_NALNUM; break;
3254 case 's': namedclass = ANYOF_SPACE; break;
3255 case 'S': namedclass = ANYOF_NSPACE; break;
3256 case 'd': namedclass = ANYOF_DIGIT; break;
3257 case 'D': namedclass = ANYOF_NDIGIT; break;
ffc61ed2
JH
3258 case 'p':
3259 case 'P':
3260 if (*RExC_parse == '{') {
3261 e = strchr(RExC_parse++, '}');
3262 if (!e)
3263 vFAIL("Missing right brace on \\p{}");
3264 n = e - RExC_parse;
3265 }
3266 else {
3267 e = RExC_parse;
3268 n = 1;
3269 }
3270 if (!SIZE_ONLY) {
3271 if (value == 'p')
3272 Perl_sv_catpvf(aTHX_ listsv,
3273 "+utf8::%.*s\n", (int)n, RExC_parse);
3274 else
3275 Perl_sv_catpvf(aTHX_ listsv,
3276 "!utf8::%.*s\n", (int)n, RExC_parse);
3277 }
3278 RExC_parse = e + 1;
3279 ANYOF_FLAGS(ret) |= ANYOF_UNICODE;
3280 continue;
b8c5462f
JH
3281 case 'n': value = '\n'; break;
3282 case 'r': value = '\r'; break;
3283 case 't': value = '\t'; break;
3284 case 'f': value = '\f'; break;
3285 case 'b': value = '\b'; break;
0407a77b 3286#ifdef ASCIIish
b8c5462f
JH
3287 case 'e': value = '\033'; break;
3288 case 'a': value = '\007'; break;
0407a77b
GS
3289#else
3290 case 'e': value = '\047'; break;
3291 case 'a': value = '\057'; break;
3292#endif
b8c5462f 3293 case 'x':
ffc61ed2
JH
3294 if (*RExC_parse == '{') {
3295 e = strchr(RExC_parse++, '}');
b81d288d 3296 if (!e)
ffc61ed2
JH
3297 vFAIL("Missing right brace on \\x{}");
3298 numlen = 1; /* allow underscores */
3299 value = (UV)scan_hex(RExC_parse,
3300 e - RExC_parse,
3301 &numlen);
3302 RExC_parse = e + 1;
3303 }
3304 else {
3305 numlen = 0; /* disallow underscores */
3306 value = (UV)scan_hex(RExC_parse, 2, &numlen);
3307 RExC_parse += numlen;
3308 }
b8c5462f
JH
3309 break;
3310 case 'c':
830247a4 3311 value = UCHARAT(RExC_parse++);
b8c5462f
JH
3312 value = toCTRL(value);
3313 break;
3314 case '0': case '1': case '2': case '3': case '4':
3315 case '5': case '6': case '7': case '8': case '9':
b21ed0a9 3316 numlen = 0; /* disallow underscores */
830247a4
IZ
3317 value = (UV)scan_oct(--RExC_parse, 3, &numlen);
3318 RExC_parse += numlen;
b8c5462f 3319 break;
1028017a 3320 default:
e476b1b5 3321 if (!SIZE_ONLY && ckWARN(WARN_REGEXP) && isALPHA(value))
ffc61ed2
JH
3322 vWARN2(RExC_parse,
3323 "Unrecognized escape \\%c in character class passed through",
3324 (int)value);
1028017a 3325 break;
b8c5462f 3326 }
ffc61ed2
JH
3327 } /* end of \blah */
3328
3329 if (namedclass > OOB_NAMEDCLASS) { /* this is a named class \blah */
3330
3331 if (!SIZE_ONLY && !need_class)
936ed897 3332 ANYOF_CLASS_ZERO(ret);
ffc61ed2 3333
936ed897 3334 need_class = 1;
ffc61ed2
JH
3335
3336 /* a bad range like a-\d, a-[:digit:] ? */
3337 if (range) {
73b437c8 3338 if (!SIZE_ONLY) {
e476b1b5 3339 if (ckWARN(WARN_REGEXP))
830247a4 3340 vWARN4(RExC_parse,
b45f050a 3341 "False [] range \"%*.*s\"",
830247a4
IZ
3342 RExC_parse - rangebegin,
3343 RExC_parse - rangebegin,
b45f050a 3344 rangebegin);
ffc61ed2
JH
3345 if (lastvalue < 256) {
3346 ANYOF_BITMAP_SET(ret, lastvalue);
3347 ANYOF_BITMAP_SET(ret, '-');
3348 }
3349 else {
3350 ANYOF_FLAGS(ret) |= ANYOF_UNICODE;
3351 Perl_sv_catpvf(aTHX_ listsv,
3352 /* 0x002D is Unicode for '-' */
3353 "%04"UVxf"\n002D\n", (UV)lastvalue);
3354 }
b8c5462f 3355 }
ffc61ed2
JH
3356
3357 range = 0; /* this was not a true range */
73b437c8 3358 }
ffc61ed2 3359
73b437c8 3360 if (!SIZE_ONLY) {
e2962f66
JH
3361 /* Possible truncation here but in some 64-bit environments
3362 * the compiler gets heartburn about switch on 64-bit values.
3363 * A similar issue a little earlier when switching on value.
98f323fa 3364 * --jhi */
e2962f66 3365 switch ((I32)namedclass) {
73b437c8
JH
3366 case ANYOF_ALNUM:
3367 if (LOC)
936ed897 3368 ANYOF_CLASS_SET(ret, ANYOF_ALNUM);
73b437c8
JH
3369 else {
3370 for (value = 0; value < 256; value++)
3371 if (isALNUM(value))
936ed897 3372 ANYOF_BITMAP_SET(ret, value);
73b437c8 3373 }
1aa99e6b 3374 dont_optimize_invert = TRUE;
ffc61ed2 3375 Perl_sv_catpvf(aTHX_ listsv, "+utf8::IsWord\n");
73b437c8
JH
3376 break;
3377 case ANYOF_NALNUM:
3378 if (LOC)
936ed897 3379 ANYOF_CLASS_SET(ret, ANYOF_NALNUM);
73b437c8
JH
3380 else {
3381 for (value = 0; value < 256; value++)
3382 if (!isALNUM(value))
936ed897 3383 ANYOF_BITMAP_SET(ret, value);
73b437c8 3384 }
1aa99e6b 3385 dont_optimize_invert = TRUE;
ffc61ed2 3386 Perl_sv_catpvf(aTHX_ listsv, "!utf8::IsWord\n");
73b437c8 3387 break;
ffc61ed2 3388 case ANYOF_ALNUMC:
73b437c8 3389 if (LOC)
ffc61ed2 3390 ANYOF_CLASS_SET(ret, ANYOF_ALNUMC);
73b437c8
JH
3391 else {
3392 for (value = 0; value < 256; value++)
ffc61ed2 3393 if (isALNUMC(value))
936ed897 3394 ANYOF_BITMAP_SET(ret, value);
73b437c8 3395 }
1aa99e6b 3396 dont_optimize_invert = TRUE;
ffc61ed2 3397 Perl_sv_catpvf(aTHX_ listsv, "+utf8::IsAlnum\n");
73b437c8
JH
3398 break;
3399 case ANYOF_NALNUMC:
3400 if (LOC)
936ed897 3401 ANYOF_CLASS_SET(ret, ANYOF_NALNUMC);
73b437c8
JH
3402 else {
3403 for (value = 0; value < 256; value++)
3404 if (!isALNUMC(value))
936ed897 3405 ANYOF_BITMAP_SET(ret, value);
73b437c8 3406 }
1aa99e6b 3407 dont_optimize_invert = TRUE;
ffc61ed2 3408 Perl_sv_catpvf(aTHX_ listsv, "!utf8::IsAlnum\n");
73b437c8
JH
3409 break;
3410 case ANYOF_ALPHA:
3411 if (LOC)
936ed897 3412 ANYOF_CLASS_SET(ret, ANYOF_ALPHA);
73b437c8
JH
3413 else {
3414 for (value = 0; value < 256; value++)
3415 if (isALPHA(value))
936ed897 3416 ANYOF_BITMAP_SET(ret, value);
73b437c8 3417 }
1aa99e6b 3418 dont_optimize_invert = TRUE;
ffc61ed2 3419 Perl_sv_catpvf(aTHX_ listsv, "+utf8::IsAlpha\n");
73b437c8
JH
3420 break;
3421 case ANYOF_NALPHA:
3422 if (LOC)
936ed897 3423 ANYOF_CLASS_SET(ret, ANYOF_NALPHA);
73b437c8
JH
3424 else {
3425 for (value = 0; value < 256; value++)
3426 if (!isALPHA(value))
936ed897 3427 ANYOF_BITMAP_SET(ret, value);
73b437c8 3428 }
1aa99e6b 3429 dont_optimize_invert = TRUE;
ffc61ed2 3430 Perl_sv_catpvf(aTHX_ listsv, "!utf8::IsAlpha\n");
73b437c8
JH
3431 break;
3432 case ANYOF_ASCII:
3433 if (LOC)
936ed897 3434 ANYOF_CLASS_SET(ret, ANYOF_ASCII);
73b437c8 3435 else {
a5993822 3436#ifdef ASCIIish
73b437c8 3437 for (value = 0; value < 128; value++)
936ed897 3438 ANYOF_BITMAP_SET(ret, value);
a5993822
PP
3439#else /* EBCDIC */
3440 for (value = 0; value < 256; value++)
3441 if (isASCII(value))
3442 ANYOF_BITMAP_SET(ret, value);
3443#endif /* EBCDIC */
73b437c8 3444 }
1aa99e6b 3445 dont_optimize_invert = TRUE;
ffc61ed2 3446 Perl_sv_catpvf(aTHX_ listsv, "+utf8::IsASCII\n");
73b437c8
JH
3447 break;
3448 case ANYOF_NASCII:
3449 if (LOC)
936ed897 3450 ANYOF_CLASS_SET(ret, ANYOF_NASCII);
73b437c8 3451 else {
a5993822 3452#ifdef ASCIIish
73b437c8 3453 for (value = 128; value < 256; value++)
936ed897 3454 ANYOF_BITMAP_SET(ret, value);
a5993822
PP
3455#else /* EBCDIC */
3456 for (value = 0; value < 256; value++)
3457 if (!isASCII(value))
3458 ANYOF_BITMAP_SET(ret, value);
3459#endif /* EBCDIC */
73b437c8 3460 }
1aa99e6b 3461 dont_optimize_invert = TRUE;
ffc61ed2 3462 Perl_sv_catpvf(aTHX_ listsv, "!utf8::IsASCII\n");
73b437c8 3463 break;
aaa51d5e
JF
3464 case ANYOF_BLANK:
3465 if (LOC)
3466 ANYOF_CLASS_SET(ret, ANYOF_BLANK);
3467 else {
3468 for (value = 0; value < 256; value++)
3469 if (isBLANK(value))
3470 ANYOF_BITMAP_SET(ret, value);
3471 }
1aa99e6b 3472 dont_optimize_invert = TRUE;
ffc61ed2 3473 Perl_sv_catpvf(aTHX_ listsv, "+utf8::IsBlank\n");
aaa51d5e
JF
3474 break;
3475 case ANYOF_NBLANK:
3476 if (LOC)
3477 ANYOF_CLASS_SET(ret, ANYOF_NBLANK);
3478 else {
3479 for (value = 0; value < 256; value++)
3480 if (!isBLANK(value))
3481 ANYOF_BITMAP_SET(ret, value);
3482 }
1aa99e6b 3483 dont_optimize_invert = TRUE;
ffc61ed2 3484 Perl_sv_catpvf(aTHX_ listsv, "!utf8::IsBlank\n");
aaa51d5e 3485 break;
73b437c8
JH
3486 case ANYOF_CNTRL:
3487 if (LOC)
936ed897 3488 ANYOF_CLASS_SET(ret, ANYOF_CNTRL);
73b437c8
JH
3489 else {
3490 for (value = 0; value < 256; value++)
3491 if (isCNTRL(value))
936ed897 3492 ANYOF_BITMAP_SET(ret, value);
73b437c8 3493 }
1aa99e6b 3494 dont_optimize_invert = TRUE;
ffc61ed2 3495 Perl_sv_catpvf(aTHX_ listsv, "+utf8::IsCntrl\n");
73b437c8
JH
3496 break;
3497 case ANYOF_NCNTRL:
3498 if (LOC)
936ed897 3499 ANYOF_CLASS_SET(ret, ANYOF_NCNTRL);
73b437c8
JH
3500 else {
3501 for (value = 0; value < 256; value++)
3502 if (!isCNTRL(value))
936ed897 3503 ANYOF_BITMAP_SET(ret, value);
73b437c8 3504 }
1aa99e6b 3505 dont_optimize_invert = TRUE;
ffc61ed2
JH
3506 Perl_sv_catpvf(aTHX_ listsv, "!utf8::IsCntrl\n");
3507 break;
3508 case ANYOF_DIGIT:
3509 if (LOC)
3510 ANYOF_CLASS_SET(ret, ANYOF_DIGIT);
3511 else {
3512 /* consecutive digits assumed */
3513 for (value = '0'; value <= '9'; value++)
3514 ANYOF_BITMAP_SET(ret, value);
3515 }
1aa99e6b 3516 dont_optimize_invert = TRUE;
ffc61ed2
JH
3517 Perl_sv_catpvf(aTHX_ listsv, "+utf8::IsDigit\n");
3518 break;
3519 case ANYOF_NDIGIT:
3520 if (LOC)
3521 ANYOF_CLASS_SET(ret, ANYOF_NDIGIT);
3522 else {
3523 /* consecutive digits assumed */
3524 for (value = 0; value < '0'; value++)
3525 ANYOF_BITMAP_SET(ret, value);
3526 for (value = '9' + 1; value < 256; value++)
3527 ANYOF_BITMAP_SET(ret, value);
3528 }
1aa99e6b 3529 dont_optimize_invert = TRUE;
ffc61ed2 3530 Perl_sv_catpvf(aTHX_ listsv, "!utf8::IsDigit\n");
73b437c8
JH
3531 break;
3532 case ANYOF_GRAPH:
3533 if (LOC)
936ed897 3534 ANYOF_CLASS_SET(ret, ANYOF_GRAPH);
73b437c8
JH
3535 else {
3536 for (value = 0; value < 256; value++)
3537 if (isGRAPH(value))
936ed897 3538 ANYOF_BITMAP_SET(ret, value);
73b437c8 3539 }
1aa99e6b 3540 dont_optimize_invert = TRUE;
ffc61ed2 3541 Perl_sv_catpvf(aTHX_ listsv, "+utf8::IsGraph\n");
73b437c8
JH
3542 break;
3543 case ANYOF_NGRAPH:
3544 if (LOC)
936ed897 3545 ANYOF_CLASS_SET(ret, ANYOF_NGRAPH);
73b437c8
JH
3546 else {
3547 for (value = 0; value < 256; value++)
3548 if (!isGRAPH(value))
936ed897 3549 ANYOF_BITMAP_SET(ret, value);
73b437c8 3550 }
1aa99e6b 3551 dont_optimize_invert = TRUE;
ffc61ed2 3552 Perl_sv_catpvf(aTHX_ listsv, "!utf8::IsGraph\n");
73b437c8
JH
3553 break;
3554 case ANYOF_LOWER:
3555 if (LOC)
936ed897 3556 ANYOF_CLASS_SET(ret, ANYOF_LOWER);
73b437c8
JH
3557 else {
3558 for (value = 0; value < 256; value++)
3559 if (isLOWER(value))
936ed897 3560 ANYOF_BITMAP_SET(ret, value);
73b437c8 3561 }
1aa99e6b 3562 dont_optimize_invert = TRUE;
ffc61ed2 3563 Perl_sv_catpvf(aTHX_ listsv, "+utf8::IsLower\n");
73b437c8
JH
3564 break;
3565 case ANYOF_NLOWER:
3566 if (LOC)
936ed897 3567 ANYOF_CLASS_SET(ret, ANYOF_NLOWER);
73b437c8
JH
3568 else {
3569 for (value = 0; value < 256; value++)
3570 if (!isLOWER(value))
936ed897 3571 ANYOF_BITMAP_SET(ret, value);
73b437c8 3572 }
1aa99e6b 3573 dont_optimize_invert = TRUE;
ffc61ed2 3574 Perl_sv_catpvf(aTHX_ listsv, "!utf8::IsLower\n");
73b437c8
JH
3575 break;
3576 case ANYOF_PRINT:
3577 if (LOC)
936ed897 3578 ANYOF_CLASS_SET(ret, ANYOF_PRINT);
73b437c8
JH
3579 else {
3580 for (value = 0; value < 256; value++)
3581 if (isPRINT(value))
936ed897 3582 ANYOF_BITMAP_SET(ret, value);
73b437c8 3583 }
1aa99e6b 3584 dont_optimize_invert = TRUE;
ffc61ed2 3585 Perl_sv_catpvf(aTHX_ listsv, "+utf8::IsPrint\n");
73b437c8
JH
3586 break;
3587 case ANYOF_NPRINT:
3588 if (LOC)
936ed897 3589 ANYOF_CLASS_SET(ret, ANYOF_NPRINT);
73b437c8
JH
3590 else {
3591 for (value = 0; value < 256; value++)
3592 if (!isPRINT(value))
936ed897 3593 ANYOF_BITMAP_SET(ret, value);
73b437c8 3594 }
1aa99e6b 3595 dont_optimize_invert = TRUE;
ffc61ed2 3596 Perl_sv_catpvf(aTHX_ listsv, "!utf8::IsPrint\n");
73b437c8 3597 break;
aaa51d5e
JF
3598 case ANYOF_PSXSPC:
3599 if (LOC)
3600 ANYOF_CLASS_SET(ret, ANYOF_PSXSPC);
3601 else {
3602 for (value = 0; value < 256; value++)
3603 if (isPSXSPC(value))
3604 ANYOF_BITMAP_SET(ret, value);
3605 }
1aa99e6b 3606 dont_optimize_invert = TRUE;
ffc61ed2 3607 Perl_sv_catpvf(aTHX_ listsv, "+utf8::IsSpace\n");
aaa51d5e
JF
3608 break;
3609 case ANYOF_NPSXSPC:
3610 if (LOC)
3611 ANYOF_CLASS_SET(ret, ANYOF_NPSXSPC);
3612 else {
3613 for (value = 0; value < 256; value++)
3614 if (!isPSXSPC(value))
3615 ANYOF_BITMAP_SET(ret, value);
3616 }
1aa99e6b 3617 dont_optimize_invert = TRUE;
ffc61ed2 3618 Perl_sv_catpvf(aTHX_ listsv, "!utf8::IsSpace\n");
aaa51d5e 3619 break;
73b437c8
JH
3620 case ANYOF_PUNCT:
3621 if (LOC)
936ed897 3622 ANYOF_CLASS_SET(ret, ANYOF_PUNCT);
73b437c8
JH
3623 else {
3624 for (value = 0; value < 256; value++)
3625 if (isPUNCT(value))
936ed897 3626 ANYOF_BITMAP_SET(ret, value);
73b437c8 3627 }
1aa99e6b 3628 dont_optimize_invert = TRUE;
ffc61ed2 3629 Perl_sv_catpvf(aTHX_ listsv, "+utf8::IsPunct\n");
73b437c8
JH
3630 break;
3631 case ANYOF_NPUNCT:
3632 if (LOC)
936ed897 3633 ANYOF_CLASS_SET(ret, ANYOF_NPUNCT);
73b437c8
JH
3634 else {
3635 for (value = 0; value < 256; value++)
3636 if (!isPUNCT(value))
936ed897 3637 ANYOF_BITMAP_SET(ret, value);
73b437c8 3638 }
1aa99e6b 3639 dont_optimize_invert = TRUE;
ffc61ed2
JH
3640 Perl_sv_catpvf(aTHX_ listsv, "!utf8::IsPunct\n");
3641 break;
3642 case ANYOF_SPACE:
3643 if (LOC)
3644 ANYOF_CLASS_SET(ret, ANYOF_SPACE);
3645 else {
3646 for (value = 0; value < 256; value++)
3647 if (isSPACE(value))
3648 ANYOF_BITMAP_SET(ret, value);
3649 }
1aa99e6b 3650 dont_optimize_invert = TRUE;
ffc61ed2
JH
3651 Perl_sv_catpvf(aTHX_ listsv, "+utf8::IsSpacePerl\n");
3652 break;
3653 case ANYOF_NSPACE:
3654 if (LOC)
3655 ANYOF_CLASS_SET(ret, ANYOF_NSPACE);
3656 else {
3657 for (value = 0; value < 256; value++)
3658 if (!isSPACE(value))
3659 ANYOF_BITMAP_SET(ret, value);
3660 }
1aa99e6b 3661 dont_optimize_invert = TRUE;
ffc61ed2 3662 Perl_sv_catpvf(aTHX_ listsv, "!utf8::IsSpacePerl\n");
73b437c8
JH
3663 break;
3664 case ANYOF_UPPER:
3665 if (LOC)
936ed897 3666 ANYOF_CLASS_SET(ret, ANYOF_UPPER);
73b437c8
JH
3667 else {
3668 for (value = 0; value < 256; value++)
3669 if (isUPPER(value))
936ed897 3670 ANYOF_BITMAP_SET(ret, value);
73b437c8 3671 }
1aa99e6b 3672 dont_optimize_invert = TRUE;
ffc61ed2 3673 Perl_sv_catpvf(aTHX_ listsv, "+utf8::IsUpper\n");
73b437c8
JH
3674 break;
3675 case ANYOF_NUPPER:
3676 if (LOC)
936ed897 3677 ANYOF_CLASS_SET(ret, ANYOF_NUPPER);
73b437c8
JH
3678 else {
3679 for (value = 0; value < 256; value++)
3680 if (!isUPPER(value))
936ed897 3681 ANYOF_BITMAP_SET(ret, value);
73b437c8 3682 }
1aa99e6b 3683 dont_optimize_invert = TRUE;
ffc61ed2 3684 Perl_sv_catpvf(aTHX_ listsv, "!utf8::IsUpper\n");
73b437c8
JH
3685 break;
3686 case ANYOF_XDIGIT:
3687 if (LOC)
936ed897 3688 ANYOF_CLASS_SET(ret, ANYOF_XDIGIT);
73b437c8
JH
3689 else {
3690 for (value = 0; value < 256; value++)
3691 if (isXDIGIT(value))
936ed897 3692 ANYOF_BITMAP_SET(ret, value);
73b437c8 3693 }
1aa99e6b 3694 dont_optimize_invert = TRUE;
ffc61ed2 3695 Perl_sv_catpvf(aTHX_ listsv, "+utf8::IsXDigit\n");
73b437c8
JH
3696 break;
3697 case ANYOF_NXDIGIT:
3698 if (LOC)
936ed897 3699 ANYOF_CLASS_SET(ret, ANYOF_NXDIGIT);
73b437c8
JH
3700 else {
3701 for (value = 0; value < 256; value++)
3702 if (!isXDIGIT(value))
936ed897 3703 ANYOF_BITMAP_SET(ret, value);
73b437c8 3704 }
1aa99e6b 3705 dont_optimize_invert = TRUE;
ffc61ed2 3706 Perl_sv_catpvf(aTHX_ listsv, "!utf8::IsXDigit\n");
73b437c8
JH
3707 break;
3708 default:
b45f050a 3709 vFAIL("Invalid [::] class");
73b437c8 3710 break;
b8c5462f 3711 }
b8c5462f 3712 if (LOC)
936ed897 3713 ANYOF_FLAGS(ret) |= ANYOF_CLASS;
73b437c8 3714 continue;
a0d0e21e 3715 }
ffc61ed2
JH
3716 } /* end of namedclass \blah */
3717
a0d0e21e 3718 if (range) {
73b437c8 3719 if (lastvalue > value) /* b-a */ {
b45f050a 3720 Simple_vFAIL4("Invalid [] range \"%*.*s\"",
830247a4
IZ
3721 RExC_parse - rangebegin,
3722 RExC_parse - rangebegin,
b45f050a 3723 rangebegin);
73b437c8 3724 }
ffc61ed2 3725 range = 0; /* not a true range */
a0d0e21e
LW
3726 }
3727 else {
ffc61ed2 3728 lastvalue = value; /* save the beginning of the range */
830247a4
IZ
3729 if (*RExC_parse == '-' && RExC_parse+1 < RExC_end &&
3730 RExC_parse[1] != ']') {
3731 RExC_parse++;
ffc61ed2
JH
3732
3733 /* a bad range like \w-, [:word:]- ? */
3734 if (namedclass > OOB_NAMEDCLASS) {
e476b1b5 3735 if (ckWARN(WARN_REGEXP))
830247a4 3736 vWARN4(RExC_parse,
b45f050a 3737 "False [] range \"%*.*s\"",
830247a4
IZ
3738 RExC_parse - rangebegin,
3739 RExC_parse - rangebegin,
b45f050a 3740 rangebegin);
73b437c8 3741 if (!SIZE_ONLY)
936ed897 3742 ANYOF_BITMAP_SET(ret, '-');
73b437c8 3743 } else
ffc61ed2
JH
3744 range = 1; /* yeah, it's a range! */
3745 continue; /* but do it the next time */
a0d0e21e 3746 }
a687059c 3747 }
ffc61ed2 3748
93733859 3749 /* now is the next time */
ae5c130c 3750 if (!SIZE_ONLY) {
ffc61ed2 3751 if (lastvalue < 256 && value < 256) {
93733859 3752#ifndef ASCIIish /* EBCDIC, for example. */
ffc61ed2
JH
3753 if ((isLOWER(lastvalue) && isLOWER(value)) ||
3754 (isUPPER(lastvalue) && isUPPER(value)))
3755 {
3756 IV i;
3757 if (isLOWER(lastvalue)) {
3758 for (i = lastvalue; i <= value; i++)
3759 if (isLOWER(i))
3760 ANYOF_BITMAP_SET(ret, i);
3761 } else {
3762 for (i = lastvalue; i <= value; i++)
3763 if (isUPPER(i))
3764 ANYOF_BITMAP_SET(ret, i);
3765 }
8ada0baa 3766 }
ffc61ed2 3767 else
8ada0baa 3768#endif
ffc61ed2
JH
3769 for ( ; lastvalue <= value; lastvalue++)
3770 ANYOF_BITMAP_SET(ret, lastvalue);
3771 } else {
3772 ANYOF_FLAGS(ret) |= ANYOF_UNICODE;
3773 if (lastvalue < value)
3774 Perl_sv_catpvf(aTHX_ listsv, "%04"UVxf"\t%04"UVxf"\n",
3775 (UV)lastvalue, (UV)value);
3776 else
3777 Perl_sv_catpvf(aTHX_ listsv, "%04"UVxf"\n",
3778 (UV)value);
3779 }
8ada0baa 3780 }
ffc61ed2
JH
3781
3782 range = 0; /* this range (if it was one) is done now */
a0d0e21e 3783 }
ffc61ed2 3784
936ed897
IZ
3785 if (need_class) {
3786 if (SIZE_ONLY)
830247a4 3787 RExC_size += ANYOF_CLASS_ADD_SKIP;
936ed897 3788 else
830247a4 3789 RExC_emit += ANYOF_CLASS_ADD_SKIP;
936ed897 3790 }
ffc61ed2 3791
ae5c130c 3792 /* optimize case-insensitive simple patterns (e.g. /[a-z]/i) */
b8c5462f 3793 if (!SIZE_ONLY &&
ffc61ed2
JH
3794 (ANYOF_FLAGS(ret) &
3795 /* If the only flag is folding (plus possibly inversion). */
3796 (ANYOF_FLAGS_ALL ^ ANYOF_INVERT) == ANYOF_FOLD)) {
a0ed51b3 3797 for (value = 0; value < 256; ++value) {
936ed897 3798 if (ANYOF_BITMAP_TEST(ret, value)) {
ffc61ed2
JH
3799 IV fold = PL_fold[value];
3800
3801 if (fold != value)
3802 ANYOF_BITMAP_SET(ret, fold);
ae5c130c
GS
3803 }
3804 }
936ed897 3805 ANYOF_FLAGS(ret) &= ~ANYOF_FOLD;
ae5c130c 3806 }
ffc61ed2 3807
ae5c130c 3808 /* optimize inverted simple patterns (e.g. [^a-z]) */
1aa99e6b 3809 if (!SIZE_ONLY && !dont_optimize_invert &&
ffc61ed2
JH
3810 /* If the only flag is inversion. */
3811 (ANYOF_FLAGS(ret) & ANYOF_FLAGS_ALL) == ANYOF_INVERT) {
b8c5462f 3812 for (value = 0; value < ANYOF_BITMAP_SIZE; ++value)
936ed897 3813 ANYOF_BITMAP(ret)[value] ^= ANYOF_FLAGS_ALL;
1aa99e6b 3814 ANYOF_FLAGS(ret) = ANYOF_UNICODE_ALL;
ae5c130c 3815 }
a0d0e21e 3816
b81d288d 3817 if (!SIZE_ONLY) {
fde631ed 3818 AV *av = newAV();
ffc61ed2
JH
3819 SV *rv;
3820
3821 av_store(av, 0, listsv);
3822 av_store(av, 1, NULL);
3823 rv = newRV_noinc((SV*)av);
19860706 3824 n = add_data(pRExC_state, 1, "s");
830247a4 3825 RExC_rx->data->data[n] = (void*)rv;
ffc61ed2 3826 ARG_SET(ret, n);
a0ed51b3
LW
3827 }
3828
3829 return ret;
3830}
3831
76e3520e 3832STATIC char*
830247a4 3833S_nextchar(pTHX_ RExC_state_t *pRExC_state)
a0d0e21e 3834{
830247a4 3835 char* retval = RExC_parse++;
a0d0e21e 3836
4633a7c4 3837 for (;;) {
830247a4
IZ
3838 if (*RExC_parse == '(' && RExC_parse[1] == '?' &&
3839 RExC_parse[2] == '#') {
3840 while (*RExC_parse && *RExC_parse != ')')
3841 RExC_parse++;
3842 RExC_parse++;
4633a7c4
LW
3843 continue;
3844 }
830247a4
IZ
3845 if (RExC_flags16 & PMf_EXTENDED) {
3846 if (isSPACE(*RExC_parse)) {
3847 RExC_parse++;
748a9306
LW
3848 continue;
3849 }
830247a4
IZ
3850 else if (*RExC_parse == '#') {
3851 while (*RExC_parse && *RExC_parse != '\n')
3852 RExC_parse++;
3853 RExC_parse++;
748a9306
LW
3854 continue;
3855 }
748a9306 3856 }
4633a7c4 3857 return retval;
a0d0e21e 3858 }
a687059c
LW
3859}
3860
3861/*
c277df42 3862- reg_node - emit a node
a0d0e21e 3863*/
76e3520e 3864STATIC regnode * /* Location. */
830247a4 3865S_reg_node(pTHX_ RExC_state_t *pRExC_state, U8 op)
a687059c 3866{
c277df42
IZ
3867 register regnode *ret;
3868 register regnode *ptr;
a687059c 3869
830247a4 3870 ret = RExC_emit;
c277df42 3871 if (SIZE_ONLY) {
830247a4
IZ
3872 SIZE_ALIGN(RExC_size);
3873 RExC_size += 1;
a0d0e21e
LW
3874 return(ret);
3875 }
a687059c 3876
c277df42 3877 NODE_ALIGN_FILL(ret);
a0d0e21e 3878 ptr = ret;
c277df42 3879 FILL_ADVANCE_NODE(ptr, op);
830247a4 3880 RExC_emit = ptr;
a687059c 3881
a0d0e21e 3882 return(ret);
a687059c
LW
3883}
3884
3885/*
a0d0e21e
LW
3886- reganode - emit a node with an argument
3887*/
76e3520e 3888STATIC regnode * /* Location. */
830247a4 3889S_reganode(pTHX_ RExC_state_t *pRExC_state, U8 op, U32 arg)
fe14fcc3 3890{
c277df42
IZ
3891 register regnode *ret;
3892 register regnode *ptr;
fe14fcc3 3893
830247a4 3894 ret = RExC_emit;
c277df42 3895 if (SIZE_ONLY) {
830247a4
IZ
3896 SIZE_ALIGN(RExC_size);
3897 RExC_size += 2;
a0d0e21e
LW
3898 return(ret);
3899 }
fe14fcc3 3900
c277df42 3901 NODE_ALIGN_FILL(ret);
a0d0e21e 3902 ptr = ret;
c277df42 3903 FILL_ADVANCE_NODE_ARG(ptr, op, arg);
830247a4 3904 RExC_emit = ptr;
fe14fcc3 3905
a0d0e21e 3906 return(ret);
fe14fcc3
LW
3907}
3908
3909/*
cd439c50 3910- reguni - emit (if appropriate) a Unicode character
a0ed51b3
LW
3911*/
3912STATIC void
830247a4 3913S_reguni(pTHX_ RExC_state_t *pRExC_state, UV uv, char* s, STRLEN* lenp)
a0ed51b3 3914{
ad391ad9 3915 *lenp = SIZE_ONLY ? UNISKIP(uv) : (uv_to_utf8((U8*)s, uv) - (U8*)s);
a0ed51b3
LW
3916}
3917
3918/*
a0d0e21e
LW
3919- reginsert - insert an operator in front of already-emitted operand
3920*
3921* Means relocating the operand.
3922*/
76e3520e 3923STATIC void
830247a4 3924S_reginsert(pTHX_ RExC_state_t *pRExC_state, U8 op, regnode *opnd)
a687059c 3925{
c277df42
IZ
3926 register regnode *src;
3927 register regnode *dst;
3928 register regnode *place;
3929 register int offset = regarglen[(U8)op];
b81d288d 3930
22c35a8c 3931/* (PL_regkind[(U8)op] == CURLY ? EXTRA_STEP_2ARGS : 0); */
c277df42
IZ
3932
3933 if (SIZE_ONLY) {
830247a4 3934 RExC_size += NODE_STEP_REGNODE + offset;
a0d0e21e
LW
3935 return;
3936 }
a687059c 3937
830247a4
IZ
3938 src = RExC_emit;
3939 RExC_emit += NODE_STEP_REGNODE + offset;
3940 dst = RExC_emit;
a0d0e21e 3941 while (src > opnd)
c277df42 3942 StructCopy(--src, --dst, regnode);
a0d0e21e
LW
3943
3944 place = opnd; /* Op node, where operand used to be. */
c277df42
IZ
3945 src = NEXTOPER(place);
3946 FILL_ADVANCE_NODE(place, op);
3947 Zero(src, offset, regnode);
a687059c
LW
3948}
3949
3950/*
c277df42 3951- regtail - set the next-pointer at the end of a node chain of p to val.
a0d0e21e 3952*/
76e3520e 3953STATIC void
830247a4 3954S_regtail(pTHX_ RExC_state_t *pRExC_state, regnode *p, regnode *val)
a687059c 3955{
c277df42
IZ
3956 register regnode *scan;
3957 register regnode *temp;
a0d0e21e 3958
c277df42 3959 if (SIZE_ONLY)
a0d0e21e
LW
3960 return;
3961
3962 /* Find last node. */
3963 scan = p;
3964 for (;;) {
3965 temp = regnext(scan);
3966 if (temp == NULL)
3967 break;
3968 scan = temp;
3969 }
a687059c 3970
c277df42
IZ
3971 if (reg_off_by_arg[OP(scan)]) {
3972 ARG_SET(scan, val - scan);
a0ed51b3
LW
3973 }
3974 else {
c277df42
IZ
3975 NEXT_OFF(scan) = val - scan;
3976 }
a687059c
LW
3977}
3978
3979/*
a0d0e21e
LW
3980- regoptail - regtail on operand of first argument; nop if operandless
3981*/
76e3520e 3982STATIC void
830247a4 3983S_regoptail(pTHX_ RExC_state_t *pRExC_state, regnode *p, regnode *val)
a687059c 3984{
a0d0e21e 3985 /* "Operandless" and "op != BRANCH" are synonymous in practice. */
c277df42
IZ
3986 if (p == NULL || SIZE_ONLY)
3987 return;
22c35a8c 3988 if (PL_regkind[(U8)OP(p)] == BRANCH) {
830247a4 3989 regtail(pRExC_state, NEXTOPER(p), val);
a0ed51b3 3990 }
22c35a8c 3991 else if ( PL_regkind[(U8)OP(p)] == BRANCHJ) {
830247a4 3992 regtail(pRExC_state, NEXTOPER(NEXTOPER(p)), val);
a0ed51b3
LW
3993 }
3994 else
a0d0e21e 3995 return;
a687059c
LW
3996}
3997
3998/*
3999 - regcurly - a little FSA that accepts {\d+,?\d*}
4000 */
79072805 4001STATIC I32
cea2e8a9 4002S_regcurly(pTHX_ register char *s)
a687059c
LW
4003{
4004 if (*s++ != '{')
4005 return FALSE;
f0fcb552 4006 if (!isDIGIT(*s))
a687059c 4007 return FALSE;
f0fcb552 4008 while (isDIGIT(*s))
a687059c
LW
4009 s++;
4010 if (*s == ',')
4011 s++;
f0fcb552 4012 while (isDIGIT(*s))
a687059c
LW
4013 s++;
4014 if (*s != '}')
4015 return FALSE;
4016 return TRUE;
4017}
4018
a687059c 4019
76e3520e 4020STATIC regnode *
cea2e8a9 4021S_dumpuntil(pTHX_ regnode *start, regnode *node, regnode *last, SV* sv, I32 l)
c277df42 4022{
35ff7856 4023#ifdef DEBUGGING
f248d071 4024 register U8 op = EXACT; /* Arbitrary non-END op. */
155aba94 4025 register regnode *next;
c277df42
IZ
4026
4027 while (op != END && (!last || node < last)) {
4028 /* While that wasn't END last time... */
4029
4030 NODE_ALIGN(node);
4031 op = OP(node);
4032 if (op == CLOSE)
4033 l--;
4034 next = regnext(node);
4035 /* Where, what. */
4036 if (OP(node) == OPTIMIZED)
4037 goto after_print;
4038 regprop(sv, node);
b900a521 4039 PerlIO_printf(Perl_debug_log, "%4"IVdf":%*s%s", (IV)(node - start),
f1dbda3d 4040 (int)(2*l + 1), "", SvPVX(sv));
c277df42
IZ
4041 if (next == NULL) /* Next ptr. */
4042 PerlIO_printf(Perl_debug_log, "(0)");
b81d288d 4043 else
b900a521 4044 PerlIO_printf(Perl_debug_log, "(%"IVdf")", (IV)(next - start));
c277df42
IZ
4045 (void)PerlIO_putc(Perl_debug_log, '\n');
4046 after_print:
22c35a8c 4047 if (PL_regkind[(U8)op] == BRANCHJ) {
b81d288d
AB
4048 register regnode *nnode = (OP(next) == LONGJMP
4049 ? regnext(next)
c277df42
IZ
4050 : next);
4051 if (last && nnode > last)
4052 nnode = last;
4053 node = dumpuntil(start, NEXTOPER(NEXTOPER(node)), nnode, sv, l + 1);
a0ed51b3 4054 }
22c35a8c 4055 else if (PL_regkind[(U8)op] == BRANCH) {
c277df42 4056 node = dumpuntil(start, NEXTOPER(node), next, sv, l + 1);
a0ed51b3
LW
4057 }
4058 else if ( op == CURLY) { /* `next' might be very big: optimizer */
c277df42
IZ
4059 node = dumpuntil(start, NEXTOPER(node) + EXTRA_STEP_2ARGS,
4060 NEXTOPER(node) + EXTRA_STEP_2ARGS + 1, sv, l + 1);
a0ed51b3 4061 }
22c35a8c 4062 else if (PL_regkind[(U8)op] == CURLY && op != CURLYX) {
c277df42
IZ
4063 node = dumpuntil(start, NEXTOPER(node) + EXTRA_STEP_2ARGS,
4064 next, sv, l + 1);
a0ed51b3
LW
4065 }
4066 else if ( op == PLUS || op == STAR) {
c277df42 4067 node = dumpuntil(start, NEXTOPER(node), NEXTOPER(node) + 1, sv, l + 1);
a0ed51b3
LW
4068 }
4069 else if (op == ANYOF) {
c277df42 4070 node = NEXTOPER(node);
936ed897 4071 node += ANYOF_SKIP;
a0ed51b3 4072 }
22c35a8c 4073 else if (PL_regkind[(U8)op] == EXACT) {
c277df42 4074 /* Literal string, where present. */
cd439c50 4075 node += NODE_SZ_STR(node) - 1;
c277df42 4076 node = NEXTOPER(node);
a0ed51b3
LW
4077 }
4078 else {
c277df42
IZ
4079 node = NEXTOPER(node);
4080 node += regarglen[(U8)op];
4081 }
4082 if (op == CURLYX || op == OPEN)
4083 l++;
4084 else if (op == WHILEM)
4085 l--;
4086 }
17c3b450 4087#endif /* DEBUGGING */
c277df42
IZ
4088 return node;
4089}
4090
a687059c 4091/*
fd181c75 4092 - regdump - dump a regexp onto Perl_debug_log in vaguely comprehensible form
a687059c
LW
4093 */
4094void
864dbfa3 4095Perl_regdump(pTHX_ regexp *r)
a687059c 4096{
35ff7856 4097#ifdef DEBUGGING
46fc3d4c 4098 SV *sv = sv_newmortal();
a687059c 4099
c277df42 4100 (void)dumpuntil(r->program, r->program + 1, NULL, sv, 0);
a0d0e21e
LW
4101
4102 /* Header fields of interest. */
c277df42 4103 if (r->anchored_substr)
7b0972df 4104 PerlIO_printf(Perl_debug_log,
b81d288d 4105 "anchored `%s%.*s%s'%s at %"IVdf" ",
3280af22 4106 PL_colors[0],
7b0972df 4107 (int)(SvCUR(r->anchored_substr) - (SvTAIL(r->anchored_substr)!=0)),
b81d288d 4108 SvPVX(r->anchored_substr),
3280af22 4109 PL_colors[1],
c277df42 4110 SvTAIL(r->anchored_substr) ? "$" : "",
7b0972df 4111 (IV)r->anchored_offset);
c277df42 4112 if (r->float_substr)
7b0972df 4113 PerlIO_printf(Perl_debug_log,
b81d288d 4114 "floating `%s%.*s%s'%s at %"IVdf"..%"UVuf" ",
3280af22 4115 PL_colors[0],
b81d288d 4116 (int)(SvCUR(r->float_substr) - (SvTAIL(r->float_substr)!=0)),
2c2d71f5 4117 SvPVX(r->float_substr),
3280af22 4118 PL_colors[1],
c277df42 4119 SvTAIL(r->float_substr) ? "$" : "",
7b0972df 4120 (IV)r->float_min_offset, (UV)r->float_max_offset);
c277df42 4121 if (r->check_substr)
b81d288d
AB
4122 PerlIO_printf(Perl_debug_log,
4123 r->check_substr == r->float_substr
c277df42
IZ
4124 ? "(checking floating" : "(checking anchored");
4125 if (r->reganch & ROPT_NOSCAN)
4126 PerlIO_printf(Perl_debug_log, " noscan");
4127 if (r->reganch & ROPT_CHECK_ALL)
4128 PerlIO_printf(Perl_debug_log, " isall");
4129 if (r->check_substr)
4130 PerlIO_printf(Perl_debug_log, ") ");
4131
46fc3d4c 4132 if (r->regstclass) {
4133 regprop(sv, r->regstclass);
4134 PerlIO_printf(Perl_debug_log, "stclass `%s' ", SvPVX(sv));
4135 }
774d564b 4136 if (r->reganch & ROPT_ANCH) {
4137 PerlIO_printf(Perl_debug_log, "anchored");
4138 if (r->reganch & ROPT_ANCH_BOL)
4139 PerlIO_printf(Perl_debug_log, "(BOL)");
c277df42
IZ
4140 if (r->reganch & ROPT_ANCH_MBOL)
4141 PerlIO_printf(Perl_debug_log, "(MBOL)");
cad2e5aa
JH
4142 if (r->reganch & ROPT_ANCH_SBOL)
4143 PerlIO_printf(Perl_debug_log, "(SBOL)");
774d564b 4144 if (r->reganch & ROPT_ANCH_GPOS)
4145 PerlIO_printf(Perl_debug_log, "(GPOS)");
4146 PerlIO_putc(Perl_debug_log, ' ');
4147 }
c277df42
IZ
4148 if (r->reganch & ROPT_GPOS_SEEN)
4149 PerlIO_printf(Perl_debug_log, "GPOS ");
a0d0e21e 4150 if (r->reganch & ROPT_SKIP)
760ac839 4151 PerlIO_printf(Perl_debug_log, "plus ");
a0d0e21e 4152 if (r->reganch & ROPT_IMPLICIT)
760ac839 4153 PerlIO_printf(Perl_debug_log, "implicit ");
760ac839 4154 PerlIO_printf(Perl_debug_log, "minlen %ld ", (long) r->minlen);
ce862d02
IZ
4155 if (r->reganch & ROPT_EVAL_SEEN)
4156 PerlIO_printf(Perl_debug_log, "with eval ");
760ac839 4157 PerlIO_printf(Perl_debug_log, "\n");
17c3b450 4158#endif /* DEBUGGING */
a687059c
LW
4159}
4160
653099ff
GS
4161STATIC void
4162S_put_byte(pTHX_ SV *sv, int c)
4163{
ffc61ed2 4164 if (isCNTRL(c) || c == 127 || c == 255 || !isPRINT(c))
653099ff
GS
4165 Perl_sv_catpvf(aTHX_ sv, "\\%o", c);
4166 else if (c == '-' || c == ']' || c == '\\' || c == '^')
4167 Perl_sv_catpvf(aTHX_ sv, "\\%c", c);
4168 else
4169 Perl_sv_catpvf(aTHX_ sv, "%c", c);
4170}
4171
a687059c 4172/*
a0d0e21e
LW
4173- regprop - printable representation of opcode
4174*/
46fc3d4c 4175void
864dbfa3 4176Perl_regprop(pTHX_ SV *sv, regnode *o)
a687059c 4177{
35ff7856 4178#ifdef DEBUGGING
9b155405 4179 register int k;
a0d0e21e 4180
54dc92de 4181 sv_setpvn(sv, "", 0);
9b155405 4182 if (OP(o) >= reg_num) /* regnode.type is unsigned */
830247a4
IZ
4183 /* It would be nice to FAIL() here, but this may be called from
4184 regexec.c, and it would be hard to supply pRExC_state. */
4185 Perl_croak(aTHX_ "Corrupted regexp opcode");
9b155405
IZ
4186 sv_catpv(sv, (char*)reg_name[OP(o)]); /* Take off const! */
4187
4188 k = PL_regkind[(U8)OP(o)];
4189
4190 if (k == EXACT)
7821416a 4191 Perl_sv_catpvf(aTHX_ sv, " <%s%.*s%s>", PL_colors[0],
cd439c50 4192 STR_LEN(o), STRING(o), PL_colors[1]);
9b155405 4193 else if (k == CURLY) {
cb434fcc 4194 if (OP(o) == CURLYM || OP(o) == CURLYN || OP(o) == CURLYX)
cea2e8a9
GS
4195 Perl_sv_catpvf(aTHX_ sv, "[%d]", o->flags); /* Parenth number */
4196 Perl_sv_catpvf(aTHX_ sv, " {%d,%d}", ARG1(o), ARG2(o));
a0d0e21e 4197 }
2c2d71f5
JH
4198 else if (k == WHILEM && o->flags) /* Ordinal/of */
4199 Perl_sv_catpvf(aTHX_ sv, "[%d/%d]", o->flags & 0xf, o->flags>>4);
9b155405 4200 else if (k == REF || k == OPEN || k == CLOSE || k == GROUPP )
894356b3 4201 Perl_sv_catpvf(aTHX_ sv, "%d", (int)ARG(o)); /* Parenth number */
9b155405 4202 else if (k == LOGICAL)
04ebc1ab 4203 Perl_sv_catpvf(aTHX_ sv, "[%d]", o->flags); /* 2: embedded, otherwise 1 */
653099ff
GS
4204 else if (k == ANYOF) {
4205 int i, rangestart = -1;
ffc61ed2 4206 U8 flags = ANYOF_FLAGS(o);
19860706
JH
4207 const char * const anyofs[] = { /* Should be syncronized with
4208 * ANYOF_ #xdefines in regcomp.h */
653099ff
GS
4209 "\\w",
4210 "\\W",
4211 "\\s",
4212 "\\S",
4213 "\\d",
4214 "\\D",
4215 "[:alnum:]",
4216 "[:^alnum:]",
4217 "[:alpha:]",
4218 "[:^alpha:]",
4219 "[:ascii:]",
4220 "[:^ascii:]",
4221 "[:ctrl:]",
4222 "[:^ctrl:]",
4223 "[:graph:]",
4224 "[:^graph:]",
4225 "[:lower:]",
4226 "[:^lower:]",
4227 "[:print:]",
4228 "[:^print:]",
4229 "[:punct:]",
4230 "[:^punct:]",
4231 "[:upper:]",
aaa51d5e 4232 "[:^upper:]",
653099ff 4233 "[:xdigit:]",
aaa51d5e
JF
4234 "[:^xdigit:]",
4235 "[:space:]",
4236 "[:^space:]",
4237 "[:blank:]",
4238 "[:^blank:]"
653099ff
GS
4239 };
4240
19860706 4241 if (flags & ANYOF_LOCALE)
653099ff 4242 sv_catpv(sv, "{loc}");
19860706 4243 if (flags & ANYOF_FOLD)
653099ff
GS
4244 sv_catpv(sv, "{i}");
4245 Perl_sv_catpvf(aTHX_ sv, "[%s", PL_colors[0]);
19860706 4246 if (flags & ANYOF_INVERT)
653099ff 4247 sv_catpv(sv, "^");
ffc61ed2
JH
4248 for (i = 0; i <= 256; i++) {
4249 if (i < 256 && ANYOF_BITMAP_TEST(o,i)) {
4250 if (rangestart == -1)
4251 rangestart = i;
4252 } else if (rangestart != -1) {
4253 if (i <= rangestart + 3)
4254 for (; rangestart < i; rangestart++)
653099ff 4255 put_byte(sv, rangestart);
ffc61ed2
JH
4256 else {
4257 put_byte(sv, rangestart);
4258 sv_catpv(sv, "-");
4259 put_byte(sv, i - 1);
653099ff 4260 }
ffc61ed2 4261 rangestart = -1;
653099ff 4262 }
847a199f 4263 }
ffc61ed2
JH
4264
4265 if (o->flags & ANYOF_CLASS)
4266 for (i = 0; i < sizeof(anyofs)/sizeof(char*); i++)
4267 if (ANYOF_CLASS_TEST(o,i))
4268 sv_catpv(sv, anyofs[i]);
4269
4270 if (flags & ANYOF_UNICODE)
4271 sv_catpv(sv, "{unicode}");
1aa99e6b
IH
4272 else if (flags & ANYOF_UNICODE_ALL)
4273 sv_catpv(sv, "{all-unicode}");
ffc61ed2
JH
4274
4275 {
4276 SV *lv;
4277 SV *sw = regclass_swash(o, FALSE, &lv);
b81d288d 4278
ffc61ed2
JH
4279 if (lv) {
4280 if (sw) {
4281 UV i;
4282 U8 s[UTF8_MAXLEN+1];
b81d288d 4283
ffc61ed2
JH
4284 for (i = 0; i <= 256; i++) { /* just the first 256 */
4285 U8 *e = uv_to_utf8(s, i);
4286
4287 if (i < 256 && swash_fetch(sw, s)) {
4288 if (rangestart == -1)
4289 rangestart = i;
4290 } else if (rangestart != -1) {
4291 U8 *p;
b81d288d 4292
ffc61ed2
JH
4293 if (i <= rangestart + 3)
4294 for (; rangestart < i; rangestart++) {
4295 for(e = uv_to_utf8(s, rangestart), p = s; p < e; p++)
4296 put_byte(sv, *p);
4297 }
4298 else {
4299 for (e = uv_to_utf8(s, rangestart), p = s; p < e; p++)
4300 put_byte(sv, *p);
4301 sv_catpv(sv, "-");
4302 for (e = uv_to_utf8(s, i - 1), p = s; p < e; p++)
4303 put_byte(sv, *p);
4304 }
4305 rangestart = -1;
4306 }
19860706 4307 }
ffc61ed2
JH
4308
4309 sv_catpv(sv, "..."); /* et cetera */
19860706 4310 }
fde631ed 4311
ffc61ed2
JH
4312 {
4313 char *s = savepv(SvPVX(lv));
4314 char *origs = s;
b81d288d 4315
ffc61ed2 4316 while(*s && *s != '\n') s++;
b81d288d 4317
ffc61ed2
JH
4318 if (*s == '\n') {
4319 char *t = ++s;
4320
4321 while (*s) {
4322 if (*s == '\n')
4323 *s = ' ';
4324 s++;
4325 }
4326 if (s[-1] == ' ')
4327 s[-1] = 0;
4328
4329 sv_catpv(sv, t);
fde631ed 4330 }
b81d288d 4331
ffc61ed2 4332 Safefree(origs);
fde631ed
JH
4333 }
4334 }
653099ff 4335 }
ffc61ed2 4336
653099ff
GS
4337 Perl_sv_catpvf(aTHX_ sv, "%s]", PL_colors[1]);
4338 }
9b155405 4339 else if (k == BRANCHJ && (OP(o) == UNLESSM || OP(o) == IFMATCH))
cea2e8a9 4340 Perl_sv_catpvf(aTHX_ sv, "[-%d]", o->flags);
17c3b450 4341#endif /* DEBUGGING */
35ff7856 4342}
a687059c 4343
cad2e5aa
JH
4344SV *
4345Perl_re_intuit_string(pTHX_ regexp *prog)
4346{ /* Assume that RE_INTUIT is set */
4347 DEBUG_r(
4348 { STRLEN n_a;
4349 char *s = SvPV(prog->check_substr,n_a);
4350
4351 if (!PL_colorset) reginitcolors();
4352 PerlIO_printf(Perl_debug_log,
4353 "%sUsing REx substr:%s `%s%.60s%s%s'\n",
4354 PL_colors[4],PL_colors[5],PL_colors[0],
4355 s,
4356 PL_colors[1],
4357 (strlen(s) > 60 ? "..." : ""));
4358 } );
4359
4360 return prog->check_substr;
4361}
4362
2b69d0c2 4363void
864dbfa3 4364Perl_pregfree(pTHX_ struct regexp *r)
a687059c 4365{
adac82c7 4366 DEBUG_r(if (!PL_colorset) reginitcolors());
7821416a
IZ
4367
4368 if (!r || (--r->refcnt > 0))
4369 return;
cad2e5aa
JH
4370 DEBUG_r(PerlIO_printf(Perl_debug_log,
4371 "%sFreeing REx:%s `%s%.60s%s%s'\n",
4372 PL_colors[4],PL_colors[5],PL_colors[0],
4373 r->precomp,
4374 PL_colors[1],
4375 (strlen(r->precomp) > 60 ? "..." : "")));
4376
c277df42 4377 if (r->precomp)
a0d0e21e 4378 Safefree(r->precomp);
cf93c79d
IZ
4379 if (RX_MATCH_COPIED(r))
4380 Safefree(r->subbeg);
a193d654
GS
4381 if (r->substrs) {
4382 if (r->anchored_substr)
4383 SvREFCNT_dec(r->anchored_substr);
4384 if (r->float_substr)
4385 SvREFCNT_dec(r->float_substr);
2779dcf1 4386 Safefree(r->substrs);
a193d654 4387 }
c277df42
IZ
4388 if (r->data) {
4389 int n = r->data->count;
dfad63ad
HS
4390 AV* new_comppad = NULL;
4391 AV* old_comppad;
4392 SV** old_curpad;
4393
c277df42
IZ
4394 while (--n >= 0) {
4395 switch (r->data->what[n]) {
4396 case 's':
4397 SvREFCNT_dec((SV*)r->data->data[n]);
4398 break;
653099ff
GS
4399 case 'f':
4400 Safefree(r->data->data[n]);
4401 break;
dfad63ad
HS
4402 case 'p':
4403 new_comppad = (AV*)r->data->data[n];
4404 break;
c277df42 4405 case 'o':
dfad63ad 4406 if (new_comppad == NULL)
cea2e8a9 4407 Perl_croak(aTHX_ "panic: pregfree comppad");
dfad63ad
HS
4408 old_comppad = PL_comppad;
4409 old_curpad = PL_curpad;
1e6dc0b6
SB
4410 /* Watch out for global destruction's random ordering. */
4411 if (SvTYPE(new_comppad) == SVt_PVAV) {
4412 PL_comppad = new_comppad;
4413 PL_curpad = AvARRAY(new_comppad);
4414 }
4415 else
4416 PL_curpad = NULL;
c277df42 4417 op_free((OP_4tree*)r->data->data[n]);
dfad63ad
HS
4418 PL_comppad = old_comppad;
4419 PL_curpad = old_curpad;
4420 SvREFCNT_dec((SV*)new_comppad);
4421 new_comppad = NULL;
c277df42
IZ
4422 break;
4423 case 'n':
4424 break;
4425 default:
830247a4 4426 Perl_croak(aTHX_ "panic: regfree data code '%c'", r->data->what[n]);
c277df42
IZ
4427 }
4428 }
4429 Safefree(r->data->what);
4430 Safefree(r->data);
a0d0e21e
LW
4431 }
4432 Safefree(r->startp);
4433 Safefree(r->endp);
4434 Safefree(r);
a687059c 4435}
c277df42
IZ
4436
4437/*
4438 - regnext - dig the "next" pointer out of a node
4439 *
4440 * [Note, when REGALIGN is defined there are two places in regmatch()
4441 * that bypass this code for speed.]
4442 */
4443regnode *
864dbfa3 4444Perl_regnext(pTHX_ register regnode *p)
c277df42
IZ
4445{
4446 register I32 offset;
4447
3280af22 4448 if (p == &PL_regdummy)
c277df42
IZ
4449 return(NULL);
4450
4451 offset = (reg_off_by_arg[OP(p)] ? ARG(p) : NEXT_OFF(p));
4452 if (offset == 0)
4453 return(NULL);
4454
c277df42 4455 return(p+offset);
c277df42
IZ
4456}
4457
01f988be 4458STATIC void
cea2e8a9 4459S_re_croak2(pTHX_ const char* pat1,const char* pat2,...)
c277df42
IZ
4460{
4461 va_list args;
4462 STRLEN l1 = strlen(pat1);
4463 STRLEN l2 = strlen(pat2);
4464 char buf[512];
06bf62c7 4465 SV *msv;
c277df42
IZ
4466 char *message;
4467
4468 if (l1 > 510)
4469 l1 = 510;
4470 if (l1 + l2 > 510)
4471 l2 = 510 - l1;
4472 Copy(pat1, buf, l1 , char);
4473 Copy(pat2, buf + l1, l2 , char);
3b818b81
GS
4474 buf[l1 + l2] = '\n';
4475 buf[l1 + l2 + 1] = '\0';
8736538c
AS
4476#ifdef I_STDARG
4477 /* ANSI variant takes additional second argument */
c277df42 4478 va_start(args, pat2);
8736538c
AS
4479#else
4480 va_start(args);
4481#endif
5a844595 4482 msv = vmess(buf, &args);
c277df42 4483 va_end(args);
06bf62c7 4484 message = SvPV(msv,l1);
c277df42
IZ
4485 if (l1 > 512)
4486 l1 = 512;
4487 Copy(message, buf, l1 , char);
4488 buf[l1] = '\0'; /* Overwrite \n */
cea2e8a9 4489 Perl_croak(aTHX_ "%s", buf);
c277df42 4490}
a0ed51b3
LW
4491
4492/* XXX Here's a total kludge. But we need to re-enter for swash routines. */
4493
4494void
864dbfa3 4495Perl_save_re_context(pTHX)
b81d288d 4496{
830247a4
IZ
4497#if 0
4498 SAVEPPTR(RExC_precomp); /* uncompiled string. */
4499 SAVEI32(RExC_npar); /* () count. */
4500 SAVEI32(RExC_size); /* Code size. */
4501 SAVEI16(RExC_flags16); /* are we folding, multilining? */
4502 SAVEVPTR(RExC_rx); /* from regcomp.c */
4503 SAVEI32(RExC_seen); /* from regcomp.c */
4504 SAVEI32(RExC_sawback); /* Did we see \1, ...? */
4505 SAVEI32(RExC_naughty); /* How bad is this pattern? */
4506 SAVEVPTR(RExC_emit); /* Code-emit pointer; &regdummy = don't */
4507 SAVEPPTR(RExC_end); /* End of input for compile */
4508 SAVEPPTR(RExC_parse); /* Input-scan pointer. */
4509#endif
4510
4511 SAVEI32(PL_reg_flags); /* from regexec.c */
a0ed51b3 4512 SAVEPPTR(PL_bostr);
a0ed51b3
LW
4513 SAVEPPTR(PL_reginput); /* String-input pointer. */
4514 SAVEPPTR(PL_regbol); /* Beginning of input, for ^ check. */
4515 SAVEPPTR(PL_regeol); /* End of input, for $ check. */
7766f137
GS
4516 SAVEVPTR(PL_regstartp); /* Pointer to startp array. */
4517 SAVEVPTR(PL_regendp); /* Ditto for endp. */
4518 SAVEVPTR(PL_reglastparen); /* Similarly for lastparen. */
a0ed51b3 4519 SAVEPPTR(PL_regtill); /* How far we are required to go. */
e8347627 4520 SAVEI8(PL_regprev); /* char before regbol, \n if none */
b81d288d 4521 SAVEGENERICPV(PL_reg_start_tmp); /* from regexec.c */
a0ed51b3 4522 PL_reg_start_tmp = 0;
a0ed51b3
LW
4523 SAVEI32(PL_reg_start_tmpl); /* from regexec.c */
4524 PL_reg_start_tmpl = 0;
7766f137 4525 SAVEVPTR(PL_regdata);
a0ed51b3
LW
4526 SAVEI32(PL_reg_eval_set); /* from regexec.c */
4527 SAVEI32(PL_regnarrate); /* from regexec.c */
7766f137 4528 SAVEVPTR(PL_regprogram); /* from regexec.c */
a0ed51b3 4529 SAVEINT(PL_regindent); /* from regexec.c */
7766f137
GS
4530 SAVEVPTR(PL_regcc); /* from regexec.c */
4531 SAVEVPTR(PL_curcop);
7766f137
GS
4532 SAVEVPTR(PL_reg_call_cc); /* from regexec.c */
4533 SAVEVPTR(PL_reg_re); /* from regexec.c */
54b6e2fa
IZ
4534 SAVEPPTR(PL_reg_ganch); /* from regexec.c */
4535 SAVESPTR(PL_reg_sv); /* from regexec.c */
7766f137 4536 SAVEVPTR(PL_reg_magic); /* from regexec.c */
54b6e2fa 4537 SAVEI32(PL_reg_oldpos); /* from regexec.c */
7766f137
GS
4538 SAVEVPTR(PL_reg_oldcurpm); /* from regexec.c */
4539 SAVEVPTR(PL_reg_curpm); /* from regexec.c */
5fb7366e 4540 SAVEI32(PL_regnpar); /* () count. */
54b6e2fa 4541#ifdef DEBUGGING
b81d288d 4542 SAVEPPTR(PL_reg_starttry); /* from regexec.c */
54b6e2fa 4543#endif
a0ed51b3 4544}
51371543
GS
4545
4546#ifdef PERL_OBJECT
51371543
GS
4547#include "XSUB.h"
4548#undef this
4549#define this pPerl
4550#endif
4551
4552static void
4553clear_re(pTHXo_ void *r)
4554{
4555 ReREFCNT_dec((regexp *)r);
4556}