This is a live mirror of the Perl 5 development currently hosted at https://github.com/perl/perl5
Integrate perlio:
[perl5.git] / regcomp.c
CommitLineData
a0d0e21e
LW
1/* regcomp.c
2 */
3
4/*
5 * "A fair jaw-cracker dwarf-language must be." --Samwise Gamgee
6 */
7
a687059c
LW
8/* NOTE: this is derived from Henry Spencer's regexp code, and should not
9 * confused with the original package (see point 3 below). Thanks, Henry!
10 */
11
12/* Additional note: this code is very heavily munged from Henry's version
13 * in places. In some spots I've traded clarity for efficiency, so don't
14 * blame Henry for some of the lack of readability.
15 */
16
e50aee73
AD
17/* The names of the functions have been changed from regcomp and
18 * regexec to pregcomp and pregexec in order to avoid conflicts
19 * with the POSIX routines of the same names.
20*/
21
b9d5759e
AD
22#ifdef PERL_EXT_RE_BUILD
23/* need to replace pregcomp et al, so enable that */
24# ifndef PERL_IN_XSUB_RE
25# define PERL_IN_XSUB_RE
26# endif
27/* need access to debugger hooks */
cad2e5aa 28# if defined(PERL_EXT_RE_DEBUG) && !defined(DEBUGGING)
b9d5759e
AD
29# define DEBUGGING
30# endif
31#endif
32
33#ifdef PERL_IN_XSUB_RE
d06ea78c 34/* We *really* need to overwrite these symbols: */
56953603
IZ
35# define Perl_pregcomp my_regcomp
36# define Perl_regdump my_regdump
37# define Perl_regprop my_regprop
d06ea78c 38# define Perl_pregfree my_regfree
cad2e5aa
JH
39# define Perl_re_intuit_string my_re_intuit_string
40/* *These* symbols are masked to allow static link. */
d06ea78c 41# define Perl_regnext my_regnext
f0b8d043 42# define Perl_save_re_context my_save_re_context
b81d288d 43# define Perl_reginitcolors my_reginitcolors
c5be433b
GS
44
45# define PERL_NO_GET_CONTEXT
b81d288d 46#endif
56953603 47
f0fcb552 48/*SUPPRESS 112*/
a687059c 49/*
e50aee73 50 * pregcomp and pregexec -- regsub and regerror are not used in perl
a687059c
LW
51 *
52 * Copyright (c) 1986 by University of Toronto.
53 * Written by Henry Spencer. Not derived from licensed software.
54 *
55 * Permission is granted to anyone to use this software for any
56 * purpose on any computer system, and to redistribute it freely,
57 * subject to the following restrictions:
58 *
59 * 1. The author is not responsible for the consequences of use of
60 * this software, no matter how awful, even if they arise
61 * from defects in it.
62 *
63 * 2. The origin of this software must not be misrepresented, either
64 * by explicit claim or by omission.
65 *
66 * 3. Altered versions must be plainly marked as such, and must not
67 * be misrepresented as being the original software.
68 *
69 *
70 **** Alterations to Henry's code are...
71 ****
bc89e66f 72 **** Copyright (c) 1991-2001, Larry Wall
a687059c 73 ****
9ef589d8
LW
74 **** You may distribute under the terms of either the GNU General Public
75 **** License or the Artistic License, as specified in the README file.
76
a687059c
LW
77 *
78 * Beware that some of this code is subtly aware of the way operator
79 * precedence is structured in regular expressions. Serious changes in
80 * regular-expression syntax might require a total rethink.
81 */
82#include "EXTERN.h"
864dbfa3 83#define PERL_IN_REGCOMP_C
a687059c 84#include "perl.h"
d06ea78c 85
c5be433b
GS
86#ifdef PERL_IN_XSUB_RE
87# if defined(PERL_CAPI) || defined(PERL_OBJECT)
88# include "XSUB.h"
89# endif
90#else
d06ea78c
GS
91# include "INTERN.h"
92#endif
c277df42
IZ
93
94#define REG_COMP_C
a687059c
LW
95#include "regcomp.h"
96
d4cce5f1 97#ifdef op
11343788 98#undef op
d4cce5f1 99#endif /* op */
11343788 100
fe14fcc3
LW
101#ifdef MSDOS
102# if defined(BUGGY_MSC6)
103 /* MSC 6.00A breaks on op/regexp.t test 85 unless we turn this off */
104 # pragma optimize("a",off)
105 /* But MSC 6.00A is happy with 'w', for aliases only across function calls*/
106 # pragma optimize("w",on )
107# endif /* BUGGY_MSC6 */
108#endif /* MSDOS */
109
a687059c
LW
110#ifndef STATIC
111#define STATIC static
112#endif
113
830247a4
IZ
114typedef struct RExC_state_t {
115 U16 flags16; /* are we folding, multilining? */
116 char *precomp; /* uncompiled string. */
117 regexp *rx;
118 char *end; /* End of input for compile */
119 char *parse; /* Input-scan pointer. */
120 I32 whilem_seen; /* number of WHILEM in this expr */
ffc61ed2 121 regnode *emit; /* Code-emit pointer; &regdummy = don't = compiling */
830247a4
IZ
122 I32 naughty; /* How bad is this pattern? */
123 I32 sawback; /* Did we see \1, ...? */
124 U32 seen;
125 I32 size; /* Code size. */
126 I32 npar; /* () count. */
127 I32 extralen;
128 I32 seen_zerolen;
129 I32 seen_evals;
1aa99e6b 130 I32 utf8;
830247a4
IZ
131#if ADD_TO_REGEXEC
132 char *starttry; /* -Dr: where regtry was called. */
133#define RExC_starttry (pRExC_state->starttry)
134#endif
135} RExC_state_t;
136
137#define RExC_flags16 (pRExC_state->flags16)
138#define RExC_precomp (pRExC_state->precomp)
139#define RExC_rx (pRExC_state->rx)
140#define RExC_end (pRExC_state->end)
141#define RExC_parse (pRExC_state->parse)
142#define RExC_whilem_seen (pRExC_state->whilem_seen)
143#define RExC_emit (pRExC_state->emit)
144#define RExC_naughty (pRExC_state->naughty)
145#define RExC_sawback (pRExC_state->sawback)
146#define RExC_seen (pRExC_state->seen)
147#define RExC_size (pRExC_state->size)
148#define RExC_npar (pRExC_state->npar)
149#define RExC_extralen (pRExC_state->extralen)
150#define RExC_seen_zerolen (pRExC_state->seen_zerolen)
151#define RExC_seen_evals (pRExC_state->seen_evals)
1aa99e6b 152#define RExC_utf8 (pRExC_state->utf8)
830247a4 153
a687059c
LW
154#define ISMULT1(c) ((c) == '*' || (c) == '+' || (c) == '?')
155#define ISMULT2(s) ((*s) == '*' || (*s) == '+' || (*s) == '?' || \
156 ((*s) == '{' && regcurly(s)))
a687059c 157
35c8bce7
LW
158#ifdef SPSTART
159#undef SPSTART /* dratted cpp namespace... */
160#endif
a687059c
LW
161/*
162 * Flags to be passed up and down.
163 */
a687059c 164#define WORST 0 /* Worst case. */
821b33a5 165#define HASWIDTH 0x1 /* Known to match non-null strings. */
a0d0e21e
LW
166#define SIMPLE 0x2 /* Simple enough to be STAR/PLUS operand. */
167#define SPSTART 0x4 /* Starts with * or +. */
168#define TRYAGAIN 0x8 /* Weeded out a declaration. */
a687059c 169
2c2d71f5
JH
170/* Length of a variant. */
171
172typedef struct scan_data_t {
173 I32 len_min;
174 I32 len_delta;
175 I32 pos_min;
176 I32 pos_delta;
177 SV *last_found;
178 I32 last_end; /* min value, <0 unless valid. */
179 I32 last_start_min;
180 I32 last_start_max;
181 SV **longest; /* Either &l_fixed, or &l_float. */
182 SV *longest_fixed;
183 I32 offset_fixed;
184 SV *longest_float;
185 I32 offset_float_min;
186 I32 offset_float_max;
187 I32 flags;
188 I32 whilem_c;
cb434fcc 189 I32 *last_closep;
653099ff 190 struct regnode_charclass_class *start_class;
2c2d71f5
JH
191} scan_data_t;
192
a687059c 193/*
e50aee73 194 * Forward declarations for pregcomp()'s friends.
a687059c 195 */
a0d0e21e 196
b81d288d 197static scan_data_t zero_scan_data = { 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
cb434fcc 198 0, 0, 0, 0, 0, 0};
c277df42
IZ
199
200#define SF_BEFORE_EOL (SF_BEFORE_SEOL|SF_BEFORE_MEOL)
201#define SF_BEFORE_SEOL 0x1
202#define SF_BEFORE_MEOL 0x2
203#define SF_FIX_BEFORE_EOL (SF_FIX_BEFORE_SEOL|SF_FIX_BEFORE_MEOL)
204#define SF_FL_BEFORE_EOL (SF_FL_BEFORE_SEOL|SF_FL_BEFORE_MEOL)
205
09b7f37c
CB
206#ifdef NO_UNARY_PLUS
207# define SF_FIX_SHIFT_EOL (0+2)
208# define SF_FL_SHIFT_EOL (0+4)
209#else
210# define SF_FIX_SHIFT_EOL (+2)
211# define SF_FL_SHIFT_EOL (+4)
212#endif
c277df42
IZ
213
214#define SF_FIX_BEFORE_SEOL (SF_BEFORE_SEOL << SF_FIX_SHIFT_EOL)
215#define SF_FIX_BEFORE_MEOL (SF_BEFORE_MEOL << SF_FIX_SHIFT_EOL)
216
217#define SF_FL_BEFORE_SEOL (SF_BEFORE_SEOL << SF_FL_SHIFT_EOL)
218#define SF_FL_BEFORE_MEOL (SF_BEFORE_MEOL << SF_FL_SHIFT_EOL) /* 0x20 */
219#define SF_IS_INF 0x40
220#define SF_HAS_PAR 0x80
221#define SF_IN_PAR 0x100
222#define SF_HAS_EVAL 0x200
4bfe0158 223#define SCF_DO_SUBSTR 0x400
653099ff
GS
224#define SCF_DO_STCLASS_AND 0x0800
225#define SCF_DO_STCLASS_OR 0x1000
226#define SCF_DO_STCLASS (SCF_DO_STCLASS_AND|SCF_DO_STCLASS_OR)
e1901655 227#define SCF_WHILEM_VISITED_POS 0x2000
c277df42 228
1aa99e6b 229#define UTF RExC_utf8
830247a4
IZ
230#define LOC (RExC_flags16 & PMf_LOCALE)
231#define FOLD (RExC_flags16 & PMf_FOLD)
a0ed51b3 232
ffc61ed2 233#define OOB_UNICODE 12345678
93733859 234#define OOB_NAMEDCLASS -1
b8c5462f 235
a0ed51b3
LW
236#define CHR_SVLEN(sv) (UTF ? sv_len_utf8(sv) : SvCUR(sv))
237#define CHR_DIST(a,b) (UTF ? utf8_distance(a,b) : a - b)
238
8615cb43 239
b45f050a
JF
240/* length of regex to show in messages that don't mark a position within */
241#define RegexLengthToShowInErrorMessages 127
242
243/*
244 * If MARKER[12] are adjusted, be sure to adjust the constants at the top
245 * of t/op/regmesg.t, the tests in t/op/re_tests, and those in
246 * op/pragma/warn/regcomp.
247 */
79eeca27
JH
248#define MARKER1 "HERE" /* marker as it appears in the description */
249#define MARKER2 " << HERE " /* marker as it appears within the regex */
b81d288d 250
79eeca27 251#define REPORT_LOCATION " before " MARKER1 " mark in regex m/%.*s" MARKER2 "%s/"
b45f050a
JF
252
253/*
254 * Calls SAVEDESTRUCTOR_X if needed, then calls Perl_croak with the given
255 * arg. Show regex, up to a maximum length. If it's too long, chop and add
256 * "...".
257 */
a4eb266f 258#define FAIL(msg) \
8615cb43 259 STMT_START { \
a4eb266f 260 char *ellipses = ""; \
830247a4 261 unsigned len = strlen(RExC_precomp); \
b45f050a 262 \
8615cb43 263 if (!SIZE_ONLY) \
830247a4 264 SAVEDESTRUCTOR_X(clear_re,(void*)RExC_rx); \
b45f050a
JF
265 \
266 if (len > RegexLengthToShowInErrorMessages) { \
267 /* chop 10 shorter than the max, to ensure meaning of "..." */ \
268 len = RegexLengthToShowInErrorMessages - 10; \
a4eb266f 269 ellipses = "..."; \
b45f050a
JF
270 } \
271 Perl_croak(aTHX_ "%s in regex m/%.*s%s/", \
830247a4 272 msg, (int)len, RExC_precomp, ellipses); \
8615cb43
JF
273 } STMT_END
274
b45f050a
JF
275/*
276 * Calls SAVEDESTRUCTOR_X if needed, then calls Perl_croak with the given
277 * args. Show regex, up to a maximum length. If it's too long, chop and add
278 * "...".
279 */
a4eb266f 280#define FAIL2(pat,msg) \
8615cb43 281 STMT_START { \
a4eb266f 282 char *ellipses = ""; \
830247a4 283 unsigned len = strlen(RExC_precomp); \
b45f050a 284 \
8615cb43 285 if (!SIZE_ONLY) \
830247a4 286 SAVEDESTRUCTOR_X(clear_re,(void*)RExC_rx); \
8615cb43 287 \
b45f050a
JF
288 if (len > RegexLengthToShowInErrorMessages) { \
289 /* chop 10 shorter than the max, to ensure meaning of "..." */ \
290 len = RegexLengthToShowInErrorMessages - 10; \
a4eb266f 291 ellipses = "..."; \
b45f050a
JF
292 } \
293 S_re_croak2(aTHX_ pat, " in regex m/%.*s%s/", \
830247a4 294 msg, (int)len, RExC_precomp, ellipses); \
b45f050a
JF
295 } STMT_END
296
297
298/*
299 * Simple_vFAIL -- like FAIL, but marks the current location in the scan
300 */
301#define Simple_vFAIL(m) \
302 STMT_START { \
830247a4 303 unsigned offset = strlen(RExC_precomp)-(RExC_end-RExC_parse); \
b45f050a
JF
304 \
305 Perl_croak(aTHX_ "%s" REPORT_LOCATION, \
830247a4 306 m, (int)offset, RExC_precomp, RExC_precomp + offset); \
b45f050a
JF
307 } STMT_END
308
309/*
310 * Calls SAVEDESTRUCTOR_X if needed, then Simple_vFAIL()
311 */
312#define vFAIL(m) \
313 STMT_START { \
314 if (!SIZE_ONLY) \
830247a4 315 SAVEDESTRUCTOR_X(clear_re,(void*)RExC_rx); \
b45f050a
JF
316 Simple_vFAIL(m); \
317 } STMT_END
318
319/*
320 * Like Simple_vFAIL(), but accepts two arguments.
321 */
322#define Simple_vFAIL2(m,a1) \
323 STMT_START { \
830247a4 324 unsigned offset = strlen(RExC_precomp)-(RExC_end-RExC_parse); \
b45f050a
JF
325 \
326 S_re_croak2(aTHX_ m, REPORT_LOCATION, a1, \
830247a4 327 (int)offset, RExC_precomp, RExC_precomp + offset); \
b45f050a
JF
328 } STMT_END
329
330/*
331 * Calls SAVEDESTRUCTOR_X if needed, then Simple_vFAIL2().
332 */
333#define vFAIL2(m,a1) \
334 STMT_START { \
335 if (!SIZE_ONLY) \
830247a4 336 SAVEDESTRUCTOR_X(clear_re,(void*)RExC_rx); \
b45f050a
JF
337 Simple_vFAIL2(m, a1); \
338 } STMT_END
339
340
341/*
342 * Like Simple_vFAIL(), but accepts three arguments.
343 */
344#define Simple_vFAIL3(m, a1, a2) \
345 STMT_START { \
830247a4 346 unsigned offset = strlen(RExC_precomp)-(RExC_end-RExC_parse); \
b45f050a
JF
347 \
348 S_re_croak2(aTHX_ m, REPORT_LOCATION, a1, a2, \
830247a4 349 (int)offset, RExC_precomp, RExC_precomp + offset); \
b45f050a
JF
350 } STMT_END
351
352/*
353 * Calls SAVEDESTRUCTOR_X if needed, then Simple_vFAIL3().
354 */
355#define vFAIL3(m,a1,a2) \
356 STMT_START { \
357 if (!SIZE_ONLY) \
830247a4 358 SAVEDESTRUCTOR_X(clear_re,(void*)RExC_rx); \
b45f050a
JF
359 Simple_vFAIL3(m, a1, a2); \
360 } STMT_END
361
362/*
363 * Like Simple_vFAIL(), but accepts four arguments.
364 */
365#define Simple_vFAIL4(m, a1, a2, a3) \
366 STMT_START { \
830247a4 367 unsigned offset = strlen(RExC_precomp)-(RExC_end-RExC_parse); \
b45f050a
JF
368 \
369 S_re_croak2(aTHX_ m, REPORT_LOCATION, a1, a2, a3,\
830247a4 370 (int)offset, RExC_precomp, RExC_precomp + offset); \
b45f050a
JF
371 } STMT_END
372
373/*
374 * Like Simple_vFAIL(), but accepts five arguments.
375 */
376#define Simple_vFAIL5(m, a1, a2, a3, a4) \
377 STMT_START { \
830247a4 378 unsigned offset = strlen(RExC_precomp)-(RExC_end-RExC_parse); \
b45f050a 379 S_re_croak2(aTHX_ m, REPORT_LOCATION, a1, a2, a3, a4,\
830247a4 380 (int)offset, RExC_precomp, RExC_precomp + offset); \
b45f050a
JF
381 } STMT_END
382
383
384#define vWARN(loc,m) \
385 STMT_START { \
830247a4 386 unsigned offset = strlen(RExC_precomp)-(RExC_end-(loc)); \
b45f050a 387 Perl_warner(aTHX_ WARN_REGEXP, "%s" REPORT_LOCATION,\
830247a4 388 m, (int)offset, RExC_precomp, RExC_precomp + offset); \
b45f050a
JF
389 } STMT_END \
390
391
392#define vWARN2(loc, m, a1) \
393 STMT_START { \
830247a4 394 unsigned offset = strlen(RExC_precomp)-(RExC_end-(loc)); \
b45f050a
JF
395 Perl_warner(aTHX_ WARN_REGEXP, m REPORT_LOCATION,\
396 a1, \
830247a4 397 (int)offset, RExC_precomp, RExC_precomp + offset); \
b45f050a
JF
398 } STMT_END
399
400#define vWARN3(loc, m, a1, a2) \
401 STMT_START { \
830247a4 402 unsigned offset = strlen(RExC_precomp) - (RExC_end - (loc)); \
b45f050a
JF
403 Perl_warner(aTHX_ WARN_REGEXP, m REPORT_LOCATION, \
404 a1, a2, \
830247a4 405 (int)offset, RExC_precomp, RExC_precomp + offset); \
b45f050a
JF
406 } STMT_END
407
408#define vWARN4(loc, m, a1, a2, a3) \
409 STMT_START { \
830247a4 410 unsigned offset = strlen(RExC_precomp)-(RExC_end-(loc)); \
b45f050a
JF
411 Perl_warner(aTHX_ WARN_REGEXP, m REPORT_LOCATION,\
412 a1, a2, a3, \
830247a4 413 (int)offset, RExC_precomp, RExC_precomp + offset); \
8615cb43
JF
414 } STMT_END
415
416
cd439c50
IZ
417/* Allow for side effects in s */
418#define REGC(c,s) STMT_START { if (!SIZE_ONLY) *(s) = (c); else (s);} STMT_END
419
51371543 420static void clear_re(pTHXo_ void *r);
4327152a 421
653099ff
GS
422/* Mark that we cannot extend a found fixed substring at this point.
423 Updata the longest found anchored substring and the longest found
424 floating substrings if needed. */
425
4327152a 426STATIC void
830247a4 427S_scan_commit(pTHX_ RExC_state_t *pRExC_state, scan_data_t *data)
c277df42 428{
a0ed51b3
LW
429 STRLEN l = CHR_SVLEN(data->last_found);
430 STRLEN old_l = CHR_SVLEN(*data->longest);
b81d288d 431
c277df42
IZ
432 if ((l >= old_l) && ((l > old_l) || (data->flags & SF_BEFORE_EOL))) {
433 sv_setsv(*data->longest, data->last_found);
434 if (*data->longest == data->longest_fixed) {
435 data->offset_fixed = l ? data->last_start_min : data->pos_min;
436 if (data->flags & SF_BEFORE_EOL)
b81d288d 437 data->flags
c277df42
IZ
438 |= ((data->flags & SF_BEFORE_EOL) << SF_FIX_SHIFT_EOL);
439 else
440 data->flags &= ~SF_FIX_BEFORE_EOL;
a0ed51b3
LW
441 }
442 else {
c277df42 443 data->offset_float_min = l ? data->last_start_min : data->pos_min;
b81d288d
AB
444 data->offset_float_max = (l
445 ? data->last_start_max
c277df42
IZ
446 : data->pos_min + data->pos_delta);
447 if (data->flags & SF_BEFORE_EOL)
b81d288d 448 data->flags
c277df42
IZ
449 |= ((data->flags & SF_BEFORE_EOL) << SF_FL_SHIFT_EOL);
450 else
451 data->flags &= ~SF_FL_BEFORE_EOL;
452 }
453 }
454 SvCUR_set(data->last_found, 0);
455 data->last_end = -1;
456 data->flags &= ~SF_BEFORE_EOL;
457}
458
653099ff
GS
459/* Can match anything (initialization) */
460STATIC void
830247a4 461S_cl_anything(pTHX_ RExC_state_t *pRExC_state, struct regnode_charclass_class *cl)
653099ff 462{
9af0acdb
JH
463 int value;
464
653099ff 465 ANYOF_CLASS_ZERO(cl);
9af0acdb
JH
466 for (value = 0; value < 256; ++value)
467 ANYOF_BITMAP_SET(cl, value);
1aa99e6b 468 cl->flags = ANYOF_EOS|ANYOF_UNICODE_ALL;
653099ff
GS
469 if (LOC)
470 cl->flags |= ANYOF_LOCALE;
471}
472
473/* Can match anything (initialization) */
474STATIC int
475S_cl_is_anything(pTHX_ struct regnode_charclass_class *cl)
476{
477 int value;
478
aaa51d5e 479 for (value = 0; value <= ANYOF_MAX; value += 2)
653099ff
GS
480 if (ANYOF_CLASS_TEST(cl, value) && ANYOF_CLASS_TEST(cl, value + 1))
481 return 1;
1aa99e6b
IH
482 if (!(cl->flags & ANYOF_UNICODE_ALL))
483 return 0;
9af0acdb
JH
484 for (value = 0; value < 256; ++value)
485 if (!ANYOF_BITMAP_TEST(cl, value))
653099ff
GS
486 return 0;
487 return 1;
488}
489
490/* Can match anything (initialization) */
491STATIC void
830247a4 492S_cl_init(pTHX_ RExC_state_t *pRExC_state, struct regnode_charclass_class *cl)
653099ff 493{
8ecf7187 494 Zero(cl, 1, struct regnode_charclass_class);
653099ff 495 cl->type = ANYOF;
830247a4 496 cl_anything(pRExC_state, cl);
653099ff
GS
497}
498
499STATIC void
830247a4 500S_cl_init_zero(pTHX_ RExC_state_t *pRExC_state, struct regnode_charclass_class *cl)
653099ff 501{
8ecf7187 502 Zero(cl, 1, struct regnode_charclass_class);
653099ff 503 cl->type = ANYOF;
830247a4 504 cl_anything(pRExC_state, cl);
653099ff
GS
505 if (LOC)
506 cl->flags |= ANYOF_LOCALE;
507}
508
509/* 'And' a given class with another one. Can create false positives */
510/* We assume that cl is not inverted */
511STATIC void
512S_cl_and(pTHX_ struct regnode_charclass_class *cl,
513 struct regnode_charclass_class *and_with)
514{
653099ff
GS
515 if (!(and_with->flags & ANYOF_CLASS)
516 && !(cl->flags & ANYOF_CLASS)
517 && (and_with->flags & ANYOF_LOCALE) == (cl->flags & ANYOF_LOCALE)
518 && !(and_with->flags & ANYOF_FOLD)
519 && !(cl->flags & ANYOF_FOLD)) {
520 int i;
521
522 if (and_with->flags & ANYOF_INVERT)
523 for (i = 0; i < ANYOF_BITMAP_SIZE; i++)
524 cl->bitmap[i] &= ~and_with->bitmap[i];
525 else
526 for (i = 0; i < ANYOF_BITMAP_SIZE; i++)
527 cl->bitmap[i] &= and_with->bitmap[i];
528 } /* XXXX: logic is complicated otherwise, leave it along for a moment. */
529 if (!(and_with->flags & ANYOF_EOS))
530 cl->flags &= ~ANYOF_EOS;
1aa99e6b
IH
531
532 if (cl->flags & ANYOF_UNICODE_ALL && and_with->flags & ANYOF_UNICODE) {
533 cl->flags &= ~ANYOF_UNICODE_ALL;
534 cl->flags |= ANYOF_UNICODE;
535 ARG_SET(cl, ARG(and_with));
536 }
537 if (!(and_with->flags & ANYOF_UNICODE_ALL))
538 cl->flags &= ~ANYOF_UNICODE_ALL;
539 if (!(and_with->flags & (ANYOF_UNICODE|ANYOF_UNICODE_ALL)))
540 cl->flags &= ~ANYOF_UNICODE;
653099ff
GS
541}
542
543/* 'OR' a given class with another one. Can create false positives */
544/* We assume that cl is not inverted */
545STATIC void
830247a4 546S_cl_or(pTHX_ RExC_state_t *pRExC_state, struct regnode_charclass_class *cl, struct regnode_charclass_class *or_with)
653099ff 547{
653099ff
GS
548 if (or_with->flags & ANYOF_INVERT) {
549 /* We do not use
550 * (B1 | CL1) | (!B2 & !CL2) = (B1 | !B2 & !CL2) | (CL1 | (!B2 & !CL2))
551 * <= (B1 | !B2) | (CL1 | !CL2)
552 * which is wasteful if CL2 is small, but we ignore CL2:
553 * (B1 | CL1) | (!B2 & !CL2) <= (B1 | CL1) | !B2 = (B1 | !B2) | CL1
554 * XXXX Can we handle case-fold? Unclear:
555 * (OK1(i) | OK1(i')) | !(OK1(i) | OK1(i')) =
556 * (OK1(i) | OK1(i')) | (!OK1(i) & !OK1(i'))
557 */
558 if ( (or_with->flags & ANYOF_LOCALE) == (cl->flags & ANYOF_LOCALE)
559 && !(or_with->flags & ANYOF_FOLD)
560 && !(cl->flags & ANYOF_FOLD) ) {
561 int i;
562
563 for (i = 0; i < ANYOF_BITMAP_SIZE; i++)
564 cl->bitmap[i] |= ~or_with->bitmap[i];
565 } /* XXXX: logic is complicated otherwise */
566 else {
830247a4 567 cl_anything(pRExC_state, cl);
653099ff
GS
568 }
569 } else {
570 /* (B1 | CL1) | (B2 | CL2) = (B1 | B2) | (CL1 | CL2)) */
571 if ( (or_with->flags & ANYOF_LOCALE) == (cl->flags & ANYOF_LOCALE)
b81d288d 572 && (!(or_with->flags & ANYOF_FOLD)
653099ff
GS
573 || (cl->flags & ANYOF_FOLD)) ) {
574 int i;
575
576 /* OR char bitmap and class bitmap separately */
577 for (i = 0; i < ANYOF_BITMAP_SIZE; i++)
578 cl->bitmap[i] |= or_with->bitmap[i];
579 if (or_with->flags & ANYOF_CLASS) {
580 for (i = 0; i < ANYOF_CLASSBITMAP_SIZE; i++)
581 cl->classflags[i] |= or_with->classflags[i];
582 cl->flags |= ANYOF_CLASS;
583 }
584 }
585 else { /* XXXX: logic is complicated, leave it along for a moment. */
830247a4 586 cl_anything(pRExC_state, cl);
653099ff
GS
587 }
588 }
589 if (or_with->flags & ANYOF_EOS)
590 cl->flags |= ANYOF_EOS;
1aa99e6b
IH
591
592 if (cl->flags & ANYOF_UNICODE && or_with->flags & ANYOF_UNICODE &&
593 ARG(cl) != ARG(or_with)) {
594 cl->flags |= ANYOF_UNICODE_ALL;
595 cl->flags &= ~ANYOF_UNICODE;
596 }
597 if (or_with->flags & ANYOF_UNICODE_ALL) {
598 cl->flags |= ANYOF_UNICODE_ALL;
599 cl->flags &= ~ANYOF_UNICODE;
600 }
653099ff
GS
601}
602
603/* REx optimizer. Converts nodes into quickier variants "in place".
604 Finds fixed substrings. */
605
c277df42
IZ
606/* Stops at toplevel WHILEM as well as at `last'. At end *scanp is set
607 to the position after last scanned or to NULL. */
608
76e3520e 609STATIC I32
830247a4 610S_study_chunk(pTHX_ RExC_state_t *pRExC_state, regnode **scanp, I32 *deltap, regnode *last, scan_data_t *data, U32 flags)
c277df42
IZ
611 /* scanp: Start here (read-write). */
612 /* deltap: Write maxlen-minlen here. */
613 /* last: Stop before this one. */
614{
615 I32 min = 0, pars = 0, code;
616 regnode *scan = *scanp, *next;
617 I32 delta = 0;
618 int is_inf = (flags & SCF_DO_SUBSTR) && (data->flags & SF_IS_INF);
aca2d497 619 int is_inf_internal = 0; /* The studied chunk is infinite */
c277df42
IZ
620 I32 is_par = OP(scan) == OPEN ? ARG(scan) : 0;
621 scan_data_t data_fake;
653099ff 622 struct regnode_charclass_class and_with; /* Valid if flags & SCF_DO_STCLASS_OR */
b81d288d 623
c277df42
IZ
624 while (scan && OP(scan) != END && scan < last) {
625 /* Peephole optimizer: */
626
22c35a8c 627 if (PL_regkind[(U8)OP(scan)] == EXACT) {
653099ff 628 /* Merge several consecutive EXACTish nodes into one. */
c277df42
IZ
629 regnode *n = regnext(scan);
630 U32 stringok = 1;
631#ifdef DEBUGGING
632 regnode *stop = scan;
b81d288d 633#endif
c277df42 634
cd439c50 635 next = scan + NODE_SZ_STR(scan);
c277df42
IZ
636 /* Skip NOTHING, merge EXACT*. */
637 while (n &&
b81d288d 638 ( PL_regkind[(U8)OP(n)] == NOTHING ||
c277df42
IZ
639 (stringok && (OP(n) == OP(scan))))
640 && NEXT_OFF(n)
641 && NEXT_OFF(scan) + NEXT_OFF(n) < I16_MAX) {
642 if (OP(n) == TAIL || n > next)
643 stringok = 0;
22c35a8c 644 if (PL_regkind[(U8)OP(n)] == NOTHING) {
c277df42
IZ
645 NEXT_OFF(scan) += NEXT_OFF(n);
646 next = n + NODE_STEP_REGNODE;
647#ifdef DEBUGGING
648 if (stringok)
649 stop = n;
b81d288d 650#endif
c277df42 651 n = regnext(n);
a0ed51b3 652 }
f49d4d0f 653 else if (stringok) {
cd439c50 654 int oldl = STR_LEN(scan);
c277df42 655 regnode *nnext = regnext(n);
f49d4d0f 656
b81d288d 657 if (oldl + STR_LEN(n) > U8_MAX)
c277df42
IZ
658 break;
659 NEXT_OFF(scan) += NEXT_OFF(n);
cd439c50
IZ
660 STR_LEN(scan) += STR_LEN(n);
661 next = n + NODE_SZ_STR(n);
c277df42 662 /* Now we can overwrite *n : */
f49d4d0f 663 Move(STRING(n), STRING(scan) + oldl, STR_LEN(n), char);
c277df42 664#ifdef DEBUGGING
f49d4d0f 665 stop = next - 1;
b81d288d 666#endif
c277df42
IZ
667 n = nnext;
668 }
669 }
670#ifdef DEBUGGING
671 /* Allow dumping */
cd439c50 672 n = scan + NODE_SZ_STR(scan);
c277df42 673 while (n <= stop) {
22c35a8c 674 if (PL_regkind[(U8)OP(n)] != NOTHING || OP(n) == NOTHING) {
c277df42
IZ
675 OP(n) = OPTIMIZED;
676 NEXT_OFF(n) = 0;
677 }
678 n++;
679 }
653099ff 680#endif
c277df42 681 }
653099ff
GS
682 /* Follow the next-chain of the current node and optimize
683 away all the NOTHINGs from it. */
c277df42 684 if (OP(scan) != CURLYX) {
048cfca1
GS
685 int max = (reg_off_by_arg[OP(scan)]
686 ? I32_MAX
687 /* I32 may be smaller than U16 on CRAYs! */
688 : (I32_MAX < U16_MAX ? I32_MAX : U16_MAX));
c277df42
IZ
689 int off = (reg_off_by_arg[OP(scan)] ? ARG(scan) : NEXT_OFF(scan));
690 int noff;
691 regnode *n = scan;
b81d288d 692
c277df42
IZ
693 /* Skip NOTHING and LONGJMP. */
694 while ((n = regnext(n))
22c35a8c 695 && ((PL_regkind[(U8)OP(n)] == NOTHING && (noff = NEXT_OFF(n)))
c277df42
IZ
696 || ((OP(n) == LONGJMP) && (noff = ARG(n))))
697 && off + noff < max)
698 off += noff;
699 if (reg_off_by_arg[OP(scan)])
700 ARG(scan) = off;
b81d288d 701 else
c277df42
IZ
702 NEXT_OFF(scan) = off;
703 }
653099ff
GS
704 /* The principal pseudo-switch. Cannot be a switch, since we
705 look into several different things. */
b81d288d 706 if (OP(scan) == BRANCH || OP(scan) == BRANCHJ
c277df42
IZ
707 || OP(scan) == IFTHEN || OP(scan) == SUSPEND) {
708 next = regnext(scan);
709 code = OP(scan);
b81d288d
AB
710
711 if (OP(next) == code || code == IFTHEN || code == SUSPEND) {
c277df42 712 I32 max1 = 0, min1 = I32_MAX, num = 0;
653099ff 713 struct regnode_charclass_class accum;
c277df42 714
653099ff 715 if (flags & SCF_DO_SUBSTR) /* XXXX Add !SUSPEND? */
830247a4 716 scan_commit(pRExC_state, data); /* Cannot merge strings after this. */
653099ff 717 if (flags & SCF_DO_STCLASS)
830247a4 718 cl_init_zero(pRExC_state, &accum);
c277df42 719 while (OP(scan) == code) {
830247a4 720 I32 deltanext, minnext, f = 0, fake;
653099ff 721 struct regnode_charclass_class this_class;
c277df42
IZ
722
723 num++;
724 data_fake.flags = 0;
b81d288d 725 if (data) {
2c2d71f5 726 data_fake.whilem_c = data->whilem_c;
cb434fcc
IZ
727 data_fake.last_closep = data->last_closep;
728 }
729 else
730 data_fake.last_closep = &fake;
c277df42
IZ
731 next = regnext(scan);
732 scan = NEXTOPER(scan);
733 if (code != BRANCH)
734 scan = NEXTOPER(scan);
653099ff 735 if (flags & SCF_DO_STCLASS) {
830247a4 736 cl_init(pRExC_state, &this_class);
653099ff
GS
737 data_fake.start_class = &this_class;
738 f = SCF_DO_STCLASS_AND;
b81d288d 739 }
e1901655
IZ
740 if (flags & SCF_WHILEM_VISITED_POS)
741 f |= SCF_WHILEM_VISITED_POS;
653099ff 742 /* we suppose the run is continuous, last=next...*/
830247a4
IZ
743 minnext = study_chunk(pRExC_state, &scan, &deltanext,
744 next, &data_fake, f);
b81d288d 745 if (min1 > minnext)
c277df42
IZ
746 min1 = minnext;
747 if (max1 < minnext + deltanext)
748 max1 = minnext + deltanext;
749 if (deltanext == I32_MAX)
aca2d497 750 is_inf = is_inf_internal = 1;
c277df42
IZ
751 scan = next;
752 if (data_fake.flags & (SF_HAS_PAR|SF_IN_PAR))
753 pars++;
405ff068 754 if (data && (data_fake.flags & SF_HAS_EVAL))
c277df42 755 data->flags |= SF_HAS_EVAL;
2c2d71f5
JH
756 if (data)
757 data->whilem_c = data_fake.whilem_c;
653099ff 758 if (flags & SCF_DO_STCLASS)
830247a4 759 cl_or(pRExC_state, &accum, &this_class);
b81d288d 760 if (code == SUSPEND)
c277df42
IZ
761 break;
762 }
763 if (code == IFTHEN && num < 2) /* Empty ELSE branch */
764 min1 = 0;
765 if (flags & SCF_DO_SUBSTR) {
766 data->pos_min += min1;
767 data->pos_delta += max1 - min1;
768 if (max1 != min1 || is_inf)
769 data->longest = &(data->longest_float);
770 }
771 min += min1;
772 delta += max1 - min1;
653099ff 773 if (flags & SCF_DO_STCLASS_OR) {
830247a4 774 cl_or(pRExC_state, data->start_class, &accum);
653099ff
GS
775 if (min1) {
776 cl_and(data->start_class, &and_with);
777 flags &= ~SCF_DO_STCLASS;
778 }
779 }
780 else if (flags & SCF_DO_STCLASS_AND) {
de0c8cb8
GS
781 if (min1) {
782 cl_and(data->start_class, &accum);
653099ff 783 flags &= ~SCF_DO_STCLASS;
de0c8cb8
GS
784 }
785 else {
b81d288d 786 /* Switch to OR mode: cache the old value of
de0c8cb8
GS
787 * data->start_class */
788 StructCopy(data->start_class, &and_with,
789 struct regnode_charclass_class);
790 flags &= ~SCF_DO_STCLASS_AND;
791 StructCopy(&accum, data->start_class,
792 struct regnode_charclass_class);
793 flags |= SCF_DO_STCLASS_OR;
794 data->start_class->flags |= ANYOF_EOS;
795 }
653099ff 796 }
a0ed51b3
LW
797 }
798 else if (code == BRANCHJ) /* single branch is optimized. */
c277df42
IZ
799 scan = NEXTOPER(NEXTOPER(scan));
800 else /* single branch is optimized. */
801 scan = NEXTOPER(scan);
802 continue;
a0ed51b3
LW
803 }
804 else if (OP(scan) == EXACT) {
cd439c50 805 I32 l = STR_LEN(scan);
1aa99e6b 806 UV uc = *((U8*)STRING(scan));
a0ed51b3 807 if (UTF) {
1aa99e6b
IH
808 U8 *s = (U8*)STRING(scan);
809 l = utf8_length(s, s + l);
810 uc = utf8_to_uv_simple(s, NULL);
a0ed51b3
LW
811 }
812 min += l;
c277df42 813 if (flags & SCF_DO_SUBSTR) { /* Update longest substr. */
c277df42
IZ
814 /* The code below prefers earlier match for fixed
815 offset, later match for variable offset. */
816 if (data->last_end == -1) { /* Update the start info. */
817 data->last_start_min = data->pos_min;
818 data->last_start_max = is_inf
b81d288d 819 ? I32_MAX : data->pos_min + data->pos_delta;
c277df42 820 }
cd439c50 821 sv_catpvn(data->last_found, STRING(scan), STR_LEN(scan));
c277df42
IZ
822 data->last_end = data->pos_min + l;
823 data->pos_min += l; /* As in the first entry. */
824 data->flags &= ~SF_BEFORE_EOL;
825 }
653099ff
GS
826 if (flags & SCF_DO_STCLASS_AND) {
827 /* Check whether it is compatible with what we know already! */
828 int compat = 1;
829
1aa99e6b 830 if (uc >= 0x100 ||
b81d288d 831 !(data->start_class->flags & (ANYOF_CLASS | ANYOF_LOCALE))
1aa99e6b 832 && !ANYOF_BITMAP_TEST(data->start_class, uc)
653099ff 833 && (!(data->start_class->flags & ANYOF_FOLD)
1aa99e6b 834 || !ANYOF_BITMAP_TEST(data->start_class, PL_fold[uc])))
653099ff
GS
835 compat = 0;
836 ANYOF_CLASS_ZERO(data->start_class);
837 ANYOF_BITMAP_ZERO(data->start_class);
838 if (compat)
1aa99e6b 839 ANYOF_BITMAP_SET(data->start_class, uc);
653099ff 840 data->start_class->flags &= ~ANYOF_EOS;
9b877dbb
IH
841 if (uc < 0x100)
842 data->start_class->flags &= ~ANYOF_UNICODE_ALL;
653099ff
GS
843 }
844 else if (flags & SCF_DO_STCLASS_OR) {
845 /* false positive possible if the class is case-folded */
1aa99e6b 846 if (uc < 0x100)
9b877dbb
IH
847 ANYOF_BITMAP_SET(data->start_class, uc);
848 else
849 data->start_class->flags |= ANYOF_UNICODE_ALL;
653099ff
GS
850 data->start_class->flags &= ~ANYOF_EOS;
851 cl_and(data->start_class, &and_with);
852 }
853 flags &= ~SCF_DO_STCLASS;
a0ed51b3 854 }
653099ff 855 else if (PL_regkind[(U8)OP(scan)] == EXACT) { /* But OP != EXACT! */
cd439c50 856 I32 l = STR_LEN(scan);
1aa99e6b 857 UV uc = *((U8*)STRING(scan));
653099ff
GS
858
859 /* Search for fixed substrings supports EXACT only. */
b81d288d 860 if (flags & SCF_DO_SUBSTR)
830247a4 861 scan_commit(pRExC_state, data);
a0ed51b3 862 if (UTF) {
1aa99e6b
IH
863 U8 *s = (U8 *)STRING(scan);
864 l = utf8_length(s, s + l);
865 uc = utf8_to_uv_simple(s, NULL);
a0ed51b3
LW
866 }
867 min += l;
c277df42 868 if (data && (flags & SCF_DO_SUBSTR))
a0ed51b3 869 data->pos_min += l;
653099ff
GS
870 if (flags & SCF_DO_STCLASS_AND) {
871 /* Check whether it is compatible with what we know already! */
872 int compat = 1;
873
1aa99e6b 874 if (uc >= 0x100 ||
b81d288d 875 !(data->start_class->flags & (ANYOF_CLASS | ANYOF_LOCALE))
1aa99e6b
IH
876 && !ANYOF_BITMAP_TEST(data->start_class, uc)
877 && !ANYOF_BITMAP_TEST(data->start_class, PL_fold[uc]))
653099ff
GS
878 compat = 0;
879 ANYOF_CLASS_ZERO(data->start_class);
880 ANYOF_BITMAP_ZERO(data->start_class);
881 if (compat) {
1aa99e6b 882 ANYOF_BITMAP_SET(data->start_class, uc);
653099ff
GS
883 data->start_class->flags &= ~ANYOF_EOS;
884 data->start_class->flags |= ANYOF_FOLD;
885 if (OP(scan) == EXACTFL)
886 data->start_class->flags |= ANYOF_LOCALE;
887 }
888 }
889 else if (flags & SCF_DO_STCLASS_OR) {
890 if (data->start_class->flags & ANYOF_FOLD) {
891 /* false positive possible if the class is case-folded.
892 Assume that the locale settings are the same... */
1aa99e6b
IH
893 if (uc < 0x100)
894 ANYOF_BITMAP_SET(data->start_class, uc);
653099ff
GS
895 data->start_class->flags &= ~ANYOF_EOS;
896 }
897 cl_and(data->start_class, &and_with);
898 }
899 flags &= ~SCF_DO_STCLASS;
a0ed51b3 900 }
4d61ec05 901 else if (strchr((char*)PL_varies,OP(scan))) {
aa7a4b56
MH
902 I32 mincount, maxcount, minnext, deltanext, fl;
903 I32 f = flags, pos_before = 0;
c277df42 904 regnode *oscan = scan;
653099ff
GS
905 struct regnode_charclass_class this_class;
906 struct regnode_charclass_class *oclass = NULL;
907
22c35a8c 908 switch (PL_regkind[(U8)OP(scan)]) {
653099ff 909 case WHILEM: /* End of (?:...)* . */
c277df42
IZ
910 scan = NEXTOPER(scan);
911 goto finish;
912 case PLUS:
653099ff 913 if (flags & (SCF_DO_SUBSTR | SCF_DO_STCLASS)) {
c277df42 914 next = NEXTOPER(scan);
653099ff 915 if (OP(next) == EXACT || (flags & SCF_DO_STCLASS)) {
b81d288d
AB
916 mincount = 1;
917 maxcount = REG_INFTY;
c277df42
IZ
918 next = regnext(scan);
919 scan = NEXTOPER(scan);
920 goto do_curly;
921 }
922 }
923 if (flags & SCF_DO_SUBSTR)
924 data->pos_min++;
925 min++;
926 /* Fall through. */
927 case STAR:
653099ff
GS
928 if (flags & SCF_DO_STCLASS) {
929 mincount = 0;
b81d288d 930 maxcount = REG_INFTY;
653099ff
GS
931 next = regnext(scan);
932 scan = NEXTOPER(scan);
933 goto do_curly;
934 }
b81d288d 935 is_inf = is_inf_internal = 1;
c277df42
IZ
936 scan = regnext(scan);
937 if (flags & SCF_DO_SUBSTR) {
830247a4 938 scan_commit(pRExC_state, data); /* Cannot extend fixed substrings */
c277df42
IZ
939 data->longest = &(data->longest_float);
940 }
941 goto optimize_curly_tail;
942 case CURLY:
b81d288d 943 mincount = ARG1(scan);
c277df42
IZ
944 maxcount = ARG2(scan);
945 next = regnext(scan);
cb434fcc
IZ
946 if (OP(scan) == CURLYX) {
947 I32 lp = (data ? *(data->last_closep) : 0);
948
949 scan->flags = ((lp <= U8_MAX) ? lp : U8_MAX);
950 }
c277df42
IZ
951 scan = NEXTOPER(scan) + EXTRA_STEP_2ARGS;
952 do_curly:
953 if (flags & SCF_DO_SUBSTR) {
830247a4 954 if (mincount == 0) scan_commit(pRExC_state,data); /* Cannot extend fixed substrings */
c277df42
IZ
955 pos_before = data->pos_min;
956 }
957 if (data) {
958 fl = data->flags;
959 data->flags &= ~(SF_HAS_PAR|SF_IN_PAR|SF_HAS_EVAL);
960 if (is_inf)
961 data->flags |= SF_IS_INF;
962 }
653099ff 963 if (flags & SCF_DO_STCLASS) {
830247a4 964 cl_init(pRExC_state, &this_class);
653099ff
GS
965 oclass = data->start_class;
966 data->start_class = &this_class;
967 f |= SCF_DO_STCLASS_AND;
968 f &= ~SCF_DO_STCLASS_OR;
969 }
e1901655
IZ
970 /* These are the cases when once a subexpression
971 fails at a particular position, it cannot succeed
972 even after backtracking at the enclosing scope.
b81d288d 973
e1901655
IZ
974 XXXX what if minimal match and we are at the
975 initial run of {n,m}? */
976 if ((mincount != maxcount - 1) && (maxcount != REG_INFTY))
977 f &= ~SCF_WHILEM_VISITED_POS;
653099ff 978
c277df42 979 /* This will finish on WHILEM, setting scan, or on NULL: */
b81d288d
AB
980 minnext = study_chunk(pRExC_state, &scan, &deltanext, last, data,
981 mincount == 0
653099ff
GS
982 ? (f & ~SCF_DO_SUBSTR) : f);
983
984 if (flags & SCF_DO_STCLASS)
985 data->start_class = oclass;
986 if (mincount == 0 || minnext == 0) {
987 if (flags & SCF_DO_STCLASS_OR) {
830247a4 988 cl_or(pRExC_state, data->start_class, &this_class);
653099ff
GS
989 }
990 else if (flags & SCF_DO_STCLASS_AND) {
b81d288d 991 /* Switch to OR mode: cache the old value of
653099ff
GS
992 * data->start_class */
993 StructCopy(data->start_class, &and_with,
994 struct regnode_charclass_class);
995 flags &= ~SCF_DO_STCLASS_AND;
996 StructCopy(&this_class, data->start_class,
997 struct regnode_charclass_class);
998 flags |= SCF_DO_STCLASS_OR;
999 data->start_class->flags |= ANYOF_EOS;
1000 }
1001 } else { /* Non-zero len */
1002 if (flags & SCF_DO_STCLASS_OR) {
830247a4 1003 cl_or(pRExC_state, data->start_class, &this_class);
653099ff
GS
1004 cl_and(data->start_class, &and_with);
1005 }
1006 else if (flags & SCF_DO_STCLASS_AND)
1007 cl_and(data->start_class, &this_class);
1008 flags &= ~SCF_DO_STCLASS;
1009 }
c277df42
IZ
1010 if (!scan) /* It was not CURLYX, but CURLY. */
1011 scan = next;
b81d288d 1012 if (ckWARN(WARN_REGEXP) && (minnext + deltanext == 0)
821b33a5 1013 && !(data->flags & (SF_HAS_PAR|SF_IN_PAR))
17feb5d5 1014 && maxcount <= REG_INFTY/3) /* Complement check for big count */
b45f050a 1015 {
830247a4 1016 vWARN(RExC_parse,
b45f050a
JF
1017 "Quantifier unexpected on zero-length expression");
1018 }
1019
c277df42 1020 min += minnext * mincount;
b81d288d 1021 is_inf_internal |= ((maxcount == REG_INFTY
155aba94
GS
1022 && (minnext + deltanext) > 0)
1023 || deltanext == I32_MAX);
aca2d497 1024 is_inf |= is_inf_internal;
c277df42
IZ
1025 delta += (minnext + deltanext) * maxcount - minnext * mincount;
1026
1027 /* Try powerful optimization CURLYX => CURLYN. */
b81d288d 1028 if ( OP(oscan) == CURLYX && data
c277df42
IZ
1029 && data->flags & SF_IN_PAR
1030 && !(data->flags & SF_HAS_EVAL)
1031 && !deltanext && minnext == 1 ) {
1032 /* Try to optimize to CURLYN. */
1033 regnode *nxt = NEXTOPER(oscan) + EXTRA_STEP_2ARGS;
1034 regnode *nxt1 = nxt, *nxt2;
1035
1036 /* Skip open. */
1037 nxt = regnext(nxt);
4d61ec05 1038 if (!strchr((char*)PL_simple,OP(nxt))
22c35a8c 1039 && !(PL_regkind[(U8)OP(nxt)] == EXACT
b81d288d 1040 && STR_LEN(nxt) == 1))
c277df42
IZ
1041 goto nogo;
1042 nxt2 = nxt;
1043 nxt = regnext(nxt);
b81d288d 1044 if (OP(nxt) != CLOSE)
c277df42
IZ
1045 goto nogo;
1046 /* Now we know that nxt2 is the only contents: */
1047 oscan->flags = ARG(nxt);
1048 OP(oscan) = CURLYN;
1049 OP(nxt1) = NOTHING; /* was OPEN. */
1050#ifdef DEBUGGING
1051 OP(nxt1 + 1) = OPTIMIZED; /* was count. */
1052 NEXT_OFF(nxt1+ 1) = 0; /* just for consistancy. */
1053 NEXT_OFF(nxt2) = 0; /* just for consistancy with CURLY. */
1054 OP(nxt) = OPTIMIZED; /* was CLOSE. */
1055 OP(nxt + 1) = OPTIMIZED; /* was count. */
1056 NEXT_OFF(nxt+ 1) = 0; /* just for consistancy. */
b81d288d 1057#endif
c277df42 1058 }
c277df42
IZ
1059 nogo:
1060
1061 /* Try optimization CURLYX => CURLYM. */
b81d288d 1062 if ( OP(oscan) == CURLYX && data
c277df42 1063 && !(data->flags & SF_HAS_PAR)
c277df42
IZ
1064 && !(data->flags & SF_HAS_EVAL)
1065 && !deltanext ) {
1066 /* XXXX How to optimize if data == 0? */
1067 /* Optimize to a simpler form. */
1068 regnode *nxt = NEXTOPER(oscan) + EXTRA_STEP_2ARGS; /* OPEN */
1069 regnode *nxt2;
1070
1071 OP(oscan) = CURLYM;
1072 while ( (nxt2 = regnext(nxt)) /* skip over embedded stuff*/
b81d288d 1073 && (OP(nxt2) != WHILEM))
c277df42
IZ
1074 nxt = nxt2;
1075 OP(nxt2) = SUCCEED; /* Whas WHILEM */
c277df42
IZ
1076 /* Need to optimize away parenths. */
1077 if (data->flags & SF_IN_PAR) {
1078 /* Set the parenth number. */
1079 regnode *nxt1 = NEXTOPER(oscan) + EXTRA_STEP_2ARGS; /* OPEN*/
1080
b81d288d 1081 if (OP(nxt) != CLOSE)
b45f050a 1082 FAIL("Panic opt close");
c277df42
IZ
1083 oscan->flags = ARG(nxt);
1084 OP(nxt1) = OPTIMIZED; /* was OPEN. */
1085 OP(nxt) = OPTIMIZED; /* was CLOSE. */
1086#ifdef DEBUGGING
1087 OP(nxt1 + 1) = OPTIMIZED; /* was count. */
1088 OP(nxt + 1) = OPTIMIZED; /* was count. */
1089 NEXT_OFF(nxt1 + 1) = 0; /* just for consistancy. */
1090 NEXT_OFF(nxt + 1) = 0; /* just for consistancy. */
b81d288d 1091#endif
c277df42
IZ
1092#if 0
1093 while ( nxt1 && (OP(nxt1) != WHILEM)) {
1094 regnode *nnxt = regnext(nxt1);
b81d288d 1095
c277df42
IZ
1096 if (nnxt == nxt) {
1097 if (reg_off_by_arg[OP(nxt1)])
1098 ARG_SET(nxt1, nxt2 - nxt1);
1099 else if (nxt2 - nxt1 < U16_MAX)
1100 NEXT_OFF(nxt1) = nxt2 - nxt1;
1101 else
1102 OP(nxt) = NOTHING; /* Cannot beautify */
1103 }
1104 nxt1 = nnxt;
1105 }
1106#endif
1107 /* Optimize again: */
b81d288d 1108 study_chunk(pRExC_state, &nxt1, &deltanext, nxt,
e1901655 1109 NULL, 0);
a0ed51b3
LW
1110 }
1111 else
c277df42 1112 oscan->flags = 0;
c277df42 1113 }
e1901655
IZ
1114 else if ((OP(oscan) == CURLYX)
1115 && (flags & SCF_WHILEM_VISITED_POS)
1116 /* See the comment on a similar expression above.
1117 However, this time it not a subexpression
1118 we care about, but the expression itself. */
1119 && (maxcount == REG_INFTY)
1120 && data && ++data->whilem_c < 16) {
1121 /* This stays as CURLYX, we can put the count/of pair. */
2c2d71f5
JH
1122 /* Find WHILEM (as in regexec.c) */
1123 regnode *nxt = oscan + NEXT_OFF(oscan);
1124
1125 if (OP(PREVOPER(nxt)) == NOTHING) /* LONGJMP */
1126 nxt += ARG(nxt);
1127 PREVOPER(nxt)->flags = data->whilem_c
830247a4 1128 | (RExC_whilem_seen << 4); /* On WHILEM */
2c2d71f5 1129 }
b81d288d 1130 if (data && fl & (SF_HAS_PAR|SF_IN_PAR))
c277df42
IZ
1131 pars++;
1132 if (flags & SCF_DO_SUBSTR) {
1133 SV *last_str = Nullsv;
1134 int counted = mincount != 0;
1135
1136 if (data->last_end > 0 && mincount != 0) { /* Ends with a string. */
b81d288d 1137 I32 b = pos_before >= data->last_start_min
c277df42
IZ
1138 ? pos_before : data->last_start_min;
1139 STRLEN l;
1140 char *s = SvPV(data->last_found, l);
a0ed51b3
LW
1141 I32 old = b - data->last_start_min;
1142
1143 if (UTF)
1144 old = utf8_hop((U8*)s, old) - (U8*)s;
c277df42 1145
a0ed51b3 1146 l -= old;
c277df42 1147 /* Get the added string: */
79cb57f6 1148 last_str = newSVpvn(s + old, l);
c277df42
IZ
1149 if (deltanext == 0 && pos_before == b) {
1150 /* What was added is a constant string */
1151 if (mincount > 1) {
1152 SvGROW(last_str, (mincount * l) + 1);
b81d288d 1153 repeatcpy(SvPVX(last_str) + l,
c277df42
IZ
1154 SvPVX(last_str), l, mincount - 1);
1155 SvCUR(last_str) *= mincount;
1156 /* Add additional parts. */
b81d288d 1157 SvCUR_set(data->last_found,
c277df42
IZ
1158 SvCUR(data->last_found) - l);
1159 sv_catsv(data->last_found, last_str);
1160 data->last_end += l * (mincount - 1);
1161 }
2a8d9689
HS
1162 } else {
1163 /* start offset must point into the last copy */
1164 data->last_start_min += minnext * (mincount - 1);
4b2cff9a
HS
1165 data->last_start_max += is_inf ? 0 : (maxcount - 1)
1166 * (minnext + data->pos_delta);
c277df42
IZ
1167 }
1168 }
1169 /* It is counted once already... */
1170 data->pos_min += minnext * (mincount - counted);
1171 data->pos_delta += - counted * deltanext +
1172 (minnext + deltanext) * maxcount - minnext * mincount;
1173 if (mincount != maxcount) {
653099ff
GS
1174 /* Cannot extend fixed substrings found inside
1175 the group. */
830247a4 1176 scan_commit(pRExC_state,data);
c277df42
IZ
1177 if (mincount && last_str) {
1178 sv_setsv(data->last_found, last_str);
1179 data->last_end = data->pos_min;
b81d288d 1180 data->last_start_min =
a0ed51b3 1181 data->pos_min - CHR_SVLEN(last_str);
b81d288d
AB
1182 data->last_start_max = is_inf
1183 ? I32_MAX
c277df42 1184 : data->pos_min + data->pos_delta
a0ed51b3 1185 - CHR_SVLEN(last_str);
c277df42
IZ
1186 }
1187 data->longest = &(data->longest_float);
1188 }
aca2d497 1189 SvREFCNT_dec(last_str);
c277df42 1190 }
405ff068 1191 if (data && (fl & SF_HAS_EVAL))
c277df42
IZ
1192 data->flags |= SF_HAS_EVAL;
1193 optimize_curly_tail:
c277df42 1194 if (OP(oscan) != CURLYX) {
22c35a8c 1195 while (PL_regkind[(U8)OP(next = regnext(oscan))] == NOTHING
c277df42
IZ
1196 && NEXT_OFF(next))
1197 NEXT_OFF(oscan) += NEXT_OFF(next);
1198 }
c277df42 1199 continue;
653099ff 1200 default: /* REF and CLUMP only? */
c277df42 1201 if (flags & SCF_DO_SUBSTR) {
830247a4 1202 scan_commit(pRExC_state,data); /* Cannot expect anything... */
c277df42
IZ
1203 data->longest = &(data->longest_float);
1204 }
aca2d497 1205 is_inf = is_inf_internal = 1;
653099ff 1206 if (flags & SCF_DO_STCLASS_OR)
830247a4 1207 cl_anything(pRExC_state, data->start_class);
653099ff 1208 flags &= ~SCF_DO_STCLASS;
c277df42
IZ
1209 break;
1210 }
a0ed51b3 1211 }
ffc61ed2 1212 else if (strchr((char*)PL_simple,OP(scan))) {
653099ff
GS
1213 int value;
1214
c277df42 1215 if (flags & SCF_DO_SUBSTR) {
830247a4 1216 scan_commit(pRExC_state,data);
c277df42
IZ
1217 data->pos_min++;
1218 }
1219 min++;
653099ff
GS
1220 if (flags & SCF_DO_STCLASS) {
1221 data->start_class->flags &= ~ANYOF_EOS; /* No match on empty */
1222
1223 /* Some of the logic below assumes that switching
1224 locale on will only add false positives. */
1225 switch (PL_regkind[(U8)OP(scan)]) {
653099ff 1226 case SANY:
653099ff
GS
1227 default:
1228 do_default:
1229 /* Perl_croak(aTHX_ "panic: unexpected simple REx opcode %d", OP(scan)); */
1230 if (flags & SCF_DO_STCLASS_OR) /* Allow everything */
830247a4 1231 cl_anything(pRExC_state, data->start_class);
653099ff
GS
1232 break;
1233 case REG_ANY:
1234 if (OP(scan) == SANY)
1235 goto do_default;
1236 if (flags & SCF_DO_STCLASS_OR) { /* Everything but \n */
1237 value = (ANYOF_BITMAP_TEST(data->start_class,'\n')
1238 || (data->start_class->flags & ANYOF_CLASS));
830247a4 1239 cl_anything(pRExC_state, data->start_class);
653099ff
GS
1240 }
1241 if (flags & SCF_DO_STCLASS_AND || !value)
1242 ANYOF_BITMAP_CLEAR(data->start_class,'\n');
1243 break;
1244 case ANYOF:
1245 if (flags & SCF_DO_STCLASS_AND)
1246 cl_and(data->start_class,
1247 (struct regnode_charclass_class*)scan);
1248 else
830247a4 1249 cl_or(pRExC_state, data->start_class,
653099ff
GS
1250 (struct regnode_charclass_class*)scan);
1251 break;
1252 case ALNUM:
1253 if (flags & SCF_DO_STCLASS_AND) {
1254 if (!(data->start_class->flags & ANYOF_LOCALE)) {
1255 ANYOF_CLASS_CLEAR(data->start_class,ANYOF_NALNUM);
1256 for (value = 0; value < 256; value++)
1257 if (!isALNUM(value))
1258 ANYOF_BITMAP_CLEAR(data->start_class, value);
1259 }
1260 }
1261 else {
1262 if (data->start_class->flags & ANYOF_LOCALE)
1263 ANYOF_CLASS_SET(data->start_class,ANYOF_ALNUM);
1264 else {
1265 for (value = 0; value < 256; value++)
1266 if (isALNUM(value))
b81d288d 1267 ANYOF_BITMAP_SET(data->start_class, value);
653099ff
GS
1268 }
1269 }
1270 break;
1271 case ALNUML:
1272 if (flags & SCF_DO_STCLASS_AND) {
1273 if (data->start_class->flags & ANYOF_LOCALE)
1274 ANYOF_CLASS_CLEAR(data->start_class,ANYOF_NALNUM);
1275 }
1276 else {
1277 ANYOF_CLASS_SET(data->start_class,ANYOF_ALNUM);
1278 data->start_class->flags |= ANYOF_LOCALE;
1279 }
1280 break;
1281 case NALNUM:
1282 if (flags & SCF_DO_STCLASS_AND) {
1283 if (!(data->start_class->flags & ANYOF_LOCALE)) {
1284 ANYOF_CLASS_CLEAR(data->start_class,ANYOF_ALNUM);
1285 for (value = 0; value < 256; value++)
1286 if (isALNUM(value))
1287 ANYOF_BITMAP_CLEAR(data->start_class, value);
1288 }
1289 }
1290 else {
1291 if (data->start_class->flags & ANYOF_LOCALE)
1292 ANYOF_CLASS_SET(data->start_class,ANYOF_NALNUM);
1293 else {
1294 for (value = 0; value < 256; value++)
1295 if (!isALNUM(value))
b81d288d 1296 ANYOF_BITMAP_SET(data->start_class, value);
653099ff
GS
1297 }
1298 }
1299 break;
1300 case NALNUML:
1301 if (flags & SCF_DO_STCLASS_AND) {
1302 if (data->start_class->flags & ANYOF_LOCALE)
1303 ANYOF_CLASS_CLEAR(data->start_class,ANYOF_ALNUM);
1304 }
1305 else {
1306 data->start_class->flags |= ANYOF_LOCALE;
1307 ANYOF_CLASS_SET(data->start_class,ANYOF_NALNUM);
1308 }
1309 break;
1310 case SPACE:
1311 if (flags & SCF_DO_STCLASS_AND) {
1312 if (!(data->start_class->flags & ANYOF_LOCALE)) {
1313 ANYOF_CLASS_CLEAR(data->start_class,ANYOF_NSPACE);
1314 for (value = 0; value < 256; value++)
1315 if (!isSPACE(value))
1316 ANYOF_BITMAP_CLEAR(data->start_class, value);
1317 }
1318 }
1319 else {
1320 if (data->start_class->flags & ANYOF_LOCALE)
1321 ANYOF_CLASS_SET(data->start_class,ANYOF_SPACE);
1322 else {
1323 for (value = 0; value < 256; value++)
1324 if (isSPACE(value))
b81d288d 1325 ANYOF_BITMAP_SET(data->start_class, value);
653099ff
GS
1326 }
1327 }
1328 break;
1329 case SPACEL:
1330 if (flags & SCF_DO_STCLASS_AND) {
1331 if (data->start_class->flags & ANYOF_LOCALE)
1332 ANYOF_CLASS_CLEAR(data->start_class,ANYOF_NSPACE);
1333 }
1334 else {
1335 data->start_class->flags |= ANYOF_LOCALE;
1336 ANYOF_CLASS_SET(data->start_class,ANYOF_SPACE);
1337 }
1338 break;
1339 case NSPACE:
1340 if (flags & SCF_DO_STCLASS_AND) {
1341 if (!(data->start_class->flags & ANYOF_LOCALE)) {
1342 ANYOF_CLASS_CLEAR(data->start_class,ANYOF_SPACE);
1343 for (value = 0; value < 256; value++)
1344 if (isSPACE(value))
1345 ANYOF_BITMAP_CLEAR(data->start_class, value);
1346 }
1347 }
1348 else {
1349 if (data->start_class->flags & ANYOF_LOCALE)
1350 ANYOF_CLASS_SET(data->start_class,ANYOF_NSPACE);
1351 else {
1352 for (value = 0; value < 256; value++)
1353 if (!isSPACE(value))
b81d288d 1354 ANYOF_BITMAP_SET(data->start_class, value);
653099ff
GS
1355 }
1356 }
1357 break;
1358 case NSPACEL:
1359 if (flags & SCF_DO_STCLASS_AND) {
1360 if (data->start_class->flags & ANYOF_LOCALE) {
1361 ANYOF_CLASS_CLEAR(data->start_class,ANYOF_SPACE);
1362 for (value = 0; value < 256; value++)
1363 if (!isSPACE(value))
1364 ANYOF_BITMAP_CLEAR(data->start_class, value);
1365 }
1366 }
1367 else {
1368 data->start_class->flags |= ANYOF_LOCALE;
1369 ANYOF_CLASS_SET(data->start_class,ANYOF_NSPACE);
1370 }
1371 break;
1372 case DIGIT:
1373 if (flags & SCF_DO_STCLASS_AND) {
1374 ANYOF_CLASS_CLEAR(data->start_class,ANYOF_NDIGIT);
1375 for (value = 0; value < 256; value++)
1376 if (!isDIGIT(value))
1377 ANYOF_BITMAP_CLEAR(data->start_class, value);
1378 }
1379 else {
1380 if (data->start_class->flags & ANYOF_LOCALE)
1381 ANYOF_CLASS_SET(data->start_class,ANYOF_DIGIT);
1382 else {
1383 for (value = 0; value < 256; value++)
1384 if (isDIGIT(value))
b81d288d 1385 ANYOF_BITMAP_SET(data->start_class, value);
653099ff
GS
1386 }
1387 }
1388 break;
1389 case NDIGIT:
1390 if (flags & SCF_DO_STCLASS_AND) {
1391 ANYOF_CLASS_CLEAR(data->start_class,ANYOF_DIGIT);
1392 for (value = 0; value < 256; value++)
1393 if (isDIGIT(value))
1394 ANYOF_BITMAP_CLEAR(data->start_class, value);
1395 }
1396 else {
1397 if (data->start_class->flags & ANYOF_LOCALE)
1398 ANYOF_CLASS_SET(data->start_class,ANYOF_NDIGIT);
1399 else {
1400 for (value = 0; value < 256; value++)
1401 if (!isDIGIT(value))
b81d288d 1402 ANYOF_BITMAP_SET(data->start_class, value);
653099ff
GS
1403 }
1404 }
1405 break;
1406 }
1407 if (flags & SCF_DO_STCLASS_OR)
1408 cl_and(data->start_class, &and_with);
1409 flags &= ~SCF_DO_STCLASS;
1410 }
a0ed51b3 1411 }
22c35a8c 1412 else if (PL_regkind[(U8)OP(scan)] == EOL && flags & SCF_DO_SUBSTR) {
c277df42
IZ
1413 data->flags |= (OP(scan) == MEOL
1414 ? SF_BEFORE_MEOL
1415 : SF_BEFORE_SEOL);
a0ed51b3 1416 }
653099ff
GS
1417 else if ( PL_regkind[(U8)OP(scan)] == BRANCHJ
1418 /* Lookbehind, or need to calculate parens/evals/stclass: */
1419 && (scan->flags || data || (flags & SCF_DO_STCLASS))
c277df42 1420 && (OP(scan) == IFMATCH || OP(scan) == UNLESSM)) {
653099ff 1421 /* Lookahead/lookbehind */
cb434fcc 1422 I32 deltanext, minnext, fake = 0;
c277df42 1423 regnode *nscan;
653099ff
GS
1424 struct regnode_charclass_class intrnl;
1425 int f = 0;
c277df42
IZ
1426
1427 data_fake.flags = 0;
b81d288d 1428 if (data) {
2c2d71f5 1429 data_fake.whilem_c = data->whilem_c;
cb434fcc
IZ
1430 data_fake.last_closep = data->last_closep;
1431 }
1432 else
1433 data_fake.last_closep = &fake;
653099ff
GS
1434 if ( flags & SCF_DO_STCLASS && !scan->flags
1435 && OP(scan) == IFMATCH ) { /* Lookahead */
830247a4 1436 cl_init(pRExC_state, &intrnl);
653099ff 1437 data_fake.start_class = &intrnl;
e1901655 1438 f |= SCF_DO_STCLASS_AND;
653099ff 1439 }
e1901655
IZ
1440 if (flags & SCF_WHILEM_VISITED_POS)
1441 f |= SCF_WHILEM_VISITED_POS;
c277df42
IZ
1442 next = regnext(scan);
1443 nscan = NEXTOPER(NEXTOPER(scan));
830247a4 1444 minnext = study_chunk(pRExC_state, &nscan, &deltanext, last, &data_fake, f);
c277df42
IZ
1445 if (scan->flags) {
1446 if (deltanext) {
9baa0206 1447 vFAIL("Variable length lookbehind not implemented");
a0ed51b3
LW
1448 }
1449 else if (minnext > U8_MAX) {
9baa0206 1450 vFAIL2("Lookbehind longer than %"UVuf" not implemented", (UV)U8_MAX);
c277df42
IZ
1451 }
1452 scan->flags = minnext;
1453 }
1454 if (data && data_fake.flags & (SF_HAS_PAR|SF_IN_PAR))
1455 pars++;
405ff068 1456 if (data && (data_fake.flags & SF_HAS_EVAL))
c277df42 1457 data->flags |= SF_HAS_EVAL;
2c2d71f5
JH
1458 if (data)
1459 data->whilem_c = data_fake.whilem_c;
e1901655 1460 if (f & SCF_DO_STCLASS_AND) {
653099ff
GS
1461 int was = (data->start_class->flags & ANYOF_EOS);
1462
1463 cl_and(data->start_class, &intrnl);
1464 if (was)
1465 data->start_class->flags |= ANYOF_EOS;
1466 }
a0ed51b3
LW
1467 }
1468 else if (OP(scan) == OPEN) {
c277df42 1469 pars++;
a0ed51b3 1470 }
cb434fcc
IZ
1471 else if (OP(scan) == CLOSE) {
1472 if (ARG(scan) == is_par) {
1473 next = regnext(scan);
c277df42 1474
cb434fcc
IZ
1475 if ( next && (OP(next) != WHILEM) && next < last)
1476 is_par = 0; /* Disable optimization */
1477 }
1478 if (data)
1479 *(data->last_closep) = ARG(scan);
a0ed51b3
LW
1480 }
1481 else if (OP(scan) == EVAL) {
c277df42
IZ
1482 if (data)
1483 data->flags |= SF_HAS_EVAL;
1484 }
96776eda 1485 else if (OP(scan) == LOGICAL && scan->flags == 2) { /* Embedded follows */
0f5d15d6 1486 if (flags & SCF_DO_SUBSTR) {
830247a4 1487 scan_commit(pRExC_state,data);
0f5d15d6
IZ
1488 data->longest = &(data->longest_float);
1489 }
1490 is_inf = is_inf_internal = 1;
653099ff 1491 if (flags & SCF_DO_STCLASS_OR) /* Allow everything */
830247a4 1492 cl_anything(pRExC_state, data->start_class);
96776eda 1493 flags &= ~SCF_DO_STCLASS;
0f5d15d6 1494 }
c277df42
IZ
1495 /* Else: zero-length, ignore. */
1496 scan = regnext(scan);
1497 }
1498
1499 finish:
1500 *scanp = scan;
aca2d497 1501 *deltap = is_inf_internal ? I32_MAX : delta;
b81d288d 1502 if (flags & SCF_DO_SUBSTR && is_inf)
c277df42
IZ
1503 data->pos_delta = I32_MAX - data->pos_min;
1504 if (is_par > U8_MAX)
1505 is_par = 0;
1506 if (is_par && pars==1 && data) {
1507 data->flags |= SF_IN_PAR;
1508 data->flags &= ~SF_HAS_PAR;
a0ed51b3
LW
1509 }
1510 else if (pars && data) {
c277df42
IZ
1511 data->flags |= SF_HAS_PAR;
1512 data->flags &= ~SF_IN_PAR;
1513 }
653099ff
GS
1514 if (flags & SCF_DO_STCLASS_OR)
1515 cl_and(data->start_class, &and_with);
c277df42
IZ
1516 return min;
1517}
1518
76e3520e 1519STATIC I32
830247a4 1520S_add_data(pTHX_ RExC_state_t *pRExC_state, I32 n, char *s)
c277df42 1521{
830247a4 1522 if (RExC_rx->data) {
b81d288d
AB
1523 Renewc(RExC_rx->data,
1524 sizeof(*RExC_rx->data) + sizeof(void*) * (RExC_rx->data->count + n - 1),
c277df42 1525 char, struct reg_data);
830247a4
IZ
1526 Renew(RExC_rx->data->what, RExC_rx->data->count + n, U8);
1527 RExC_rx->data->count += n;
a0ed51b3
LW
1528 }
1529 else {
830247a4 1530 Newc(1207, RExC_rx->data, sizeof(*RExC_rx->data) + sizeof(void*) * (n - 1),
c277df42 1531 char, struct reg_data);
830247a4
IZ
1532 New(1208, RExC_rx->data->what, n, U8);
1533 RExC_rx->data->count = n;
c277df42 1534 }
830247a4
IZ
1535 Copy(s, RExC_rx->data->what + RExC_rx->data->count - n, n, U8);
1536 return RExC_rx->data->count - n;
c277df42
IZ
1537}
1538
d88dccdf 1539void
864dbfa3 1540Perl_reginitcolors(pTHX)
d88dccdf 1541{
d88dccdf
IZ
1542 int i = 0;
1543 char *s = PerlEnv_getenv("PERL_RE_COLORS");
b81d288d 1544
d88dccdf
IZ
1545 if (s) {
1546 PL_colors[0] = s = savepv(s);
1547 while (++i < 6) {
1548 s = strchr(s, '\t');
1549 if (s) {
1550 *s = '\0';
1551 PL_colors[i] = ++s;
1552 }
1553 else
c712d376 1554 PL_colors[i] = s = "";
d88dccdf
IZ
1555 }
1556 } else {
b81d288d 1557 while (i < 6)
d88dccdf
IZ
1558 PL_colors[i++] = "";
1559 }
1560 PL_colorset = 1;
1561}
1562
8615cb43 1563
a687059c 1564/*
e50aee73 1565 - pregcomp - compile a regular expression into internal code
a687059c
LW
1566 *
1567 * We can't allocate space until we know how big the compiled form will be,
1568 * but we can't compile it (and thus know how big it is) until we've got a
1569 * place to put the code. So we cheat: we compile it twice, once with code
1570 * generation turned off and size counting turned on, and once "for real".
1571 * This also means that we don't allocate space until we are sure that the
1572 * thing really will compile successfully, and we never have to move the
1573 * code and thus invalidate pointers into it. (Note that it has to be in
1574 * one piece because free() must be able to free it all.) [NB: not true in perl]
1575 *
1576 * Beware that the optimization-preparation code in here knows about some
1577 * of the structure of the compiled regexp. [I'll say.]
1578 */
1579regexp *
864dbfa3 1580Perl_pregcomp(pTHX_ char *exp, char *xend, PMOP *pm)
a687059c 1581{
a0d0e21e 1582 register regexp *r;
c277df42 1583 regnode *scan;
c277df42 1584 regnode *first;
a0d0e21e 1585 I32 flags;
a0d0e21e
LW
1586 I32 minlen = 0;
1587 I32 sawplus = 0;
1588 I32 sawopen = 0;
2c2d71f5 1589 scan_data_t data;
830247a4
IZ
1590 RExC_state_t RExC_state;
1591 RExC_state_t *pRExC_state = &RExC_state;
a0d0e21e
LW
1592
1593 if (exp == NULL)
c277df42 1594 FAIL("NULL regexp argument");
a0d0e21e 1595
830247a4 1596 /* XXXX This looks very suspicious... */
1aa99e6b
IH
1597 if (pm->op_pmdynflags & PMdf_CMP_UTF8)
1598 RExC_utf8 = 1;
a0ed51b3 1599 else
1aa99e6b 1600 RExC_utf8 = 0;
a0ed51b3 1601
830247a4 1602 RExC_precomp = savepvn(exp, xend - exp);
35ef4773 1603 DEBUG_r(if (!PL_colorset) reginitcolors());
2c2d71f5 1604 DEBUG_r(PerlIO_printf(Perl_debug_log, "%sCompiling REx%s `%s%*s%s'\n",
d88dccdf 1605 PL_colors[4],PL_colors[5],PL_colors[0],
830247a4
IZ
1606 (int)(xend - exp), RExC_precomp, PL_colors[1]));
1607 RExC_flags16 = pm->op_pmflags;
1608 RExC_sawback = 0;
bbce6d69 1609
830247a4
IZ
1610 RExC_seen = 0;
1611 RExC_seen_zerolen = *exp == '^' ? -1 : 0;
1612 RExC_seen_evals = 0;
1613 RExC_extralen = 0;
c277df42 1614
bbce6d69 1615 /* First pass: determine size, legality. */
830247a4
IZ
1616 RExC_parse = exp;
1617 RExC_end = xend;
1618 RExC_naughty = 0;
1619 RExC_npar = 1;
1620 RExC_size = 0L;
1621 RExC_emit = &PL_regdummy;
1622 RExC_whilem_seen = 0;
85ddcde9
JH
1623#if 0 /* REGC() is (currently) a NOP at the first pass.
1624 * Clever compilers notice this and complain. --jhi */
830247a4 1625 REGC((U8)REG_MAGIC, (char*)RExC_emit);
85ddcde9 1626#endif
830247a4
IZ
1627 if (reg(pRExC_state, 0, &flags) == NULL) {
1628 Safefree(RExC_precomp);
1629 RExC_precomp = Nullch;
a0d0e21e
LW
1630 return(NULL);
1631 }
830247a4 1632 DEBUG_r(PerlIO_printf(Perl_debug_log, "size %"IVdf" ", (IV)RExC_size));
c277df42 1633
c277df42
IZ
1634 /* Small enough for pointer-storage convention?
1635 If extralen==0, this means that we will not need long jumps. */
830247a4
IZ
1636 if (RExC_size >= 0x10000L && RExC_extralen)
1637 RExC_size += RExC_extralen;
c277df42 1638 else
830247a4
IZ
1639 RExC_extralen = 0;
1640 if (RExC_whilem_seen > 15)
1641 RExC_whilem_seen = 15;
a0d0e21e 1642
bbce6d69 1643 /* Allocate space and initialize. */
830247a4 1644 Newc(1001, r, sizeof(regexp) + (unsigned)RExC_size * sizeof(regnode),
c277df42 1645 char, regexp);
a0d0e21e 1646 if (r == NULL)
b45f050a
JF
1647 FAIL("Regexp out of space");
1648
0f79a09d
GS
1649#ifdef DEBUGGING
1650 /* avoid reading uninitialized memory in DEBUGGING code in study_chunk() */
830247a4 1651 Zero(r, sizeof(regexp) + (unsigned)RExC_size * sizeof(regnode), char);
0f79a09d 1652#endif
c277df42 1653 r->refcnt = 1;
bbce6d69 1654 r->prelen = xend - exp;
830247a4 1655 r->precomp = RExC_precomp;
cf93c79d
IZ
1656 r->subbeg = NULL;
1657 r->reganch = pm->op_pmflags & PMf_COMPILETIME;
830247a4 1658 r->nparens = RExC_npar - 1; /* set early to validate backrefs */
4327152a
IZ
1659
1660 r->substrs = 0; /* Useful during FAIL. */
1661 r->startp = 0; /* Useful during FAIL. */
1662 r->endp = 0; /* Useful during FAIL. */
1663
830247a4 1664 RExC_rx = r;
bbce6d69 1665
1666 /* Second pass: emit code. */
830247a4
IZ
1667 RExC_parse = exp;
1668 RExC_end = xend;
1669 RExC_naughty = 0;
1670 RExC_npar = 1;
1671 RExC_emit = r->program;
2cd61cdb 1672 /* Store the count of eval-groups for security checks: */
830247a4
IZ
1673 RExC_emit->next_off = ((RExC_seen_evals > U16_MAX) ? U16_MAX : RExC_seen_evals);
1674 REGC((U8)REG_MAGIC, (char*) RExC_emit++);
c277df42 1675 r->data = 0;
830247a4 1676 if (reg(pRExC_state, 0, &flags) == NULL)
a0d0e21e
LW
1677 return(NULL);
1678
1679 /* Dig out information for optimizations. */
cf93c79d 1680 r->reganch = pm->op_pmflags & PMf_COMPILETIME; /* Again? */
830247a4 1681 pm->op_pmflags = RExC_flags16;
a0ed51b3
LW
1682 if (UTF)
1683 r->reganch |= ROPT_UTF8;
c277df42 1684 r->regstclass = NULL;
830247a4 1685 if (RExC_naughty >= 10) /* Probably an expensive pattern. */
a0ed51b3 1686 r->reganch |= ROPT_NAUGHTY;
c277df42 1687 scan = r->program + 1; /* First BRANCH. */
2779dcf1
IZ
1688
1689 /* XXXX To minimize changes to RE engine we always allocate
1690 3-units-long substrs field. */
1691 Newz(1004, r->substrs, 1, struct reg_substr_data);
1692
2c2d71f5 1693 StructCopy(&zero_scan_data, &data, scan_data_t);
653099ff 1694 /* XXXX Should not we check for something else? Usually it is OPEN1... */
c277df42 1695 if (OP(scan) != BRANCH) { /* Only one top-level choice. */
c277df42 1696 I32 fake;
c5254dd6 1697 STRLEN longest_float_length, longest_fixed_length;
653099ff
GS
1698 struct regnode_charclass_class ch_class;
1699 int stclass_flag;
cb434fcc 1700 I32 last_close = 0;
a0d0e21e
LW
1701
1702 first = scan;
c277df42 1703 /* Skip introductions and multiplicators >= 1. */
a0d0e21e 1704 while ((OP(first) == OPEN && (sawopen = 1)) ||
653099ff 1705 /* An OR of *one* alternative - should not happen now. */
a0d0e21e
LW
1706 (OP(first) == BRANCH && OP(regnext(first)) != BRANCH) ||
1707 (OP(first) == PLUS) ||
1708 (OP(first) == MINMOD) ||
653099ff 1709 /* An {n,m} with n>0 */
22c35a8c 1710 (PL_regkind[(U8)OP(first)] == CURLY && ARG1(first) > 0) ) {
a0d0e21e
LW
1711 if (OP(first) == PLUS)
1712 sawplus = 1;
1713 else
1714 first += regarglen[(U8)OP(first)];
1715 first = NEXTOPER(first);
a687059c
LW
1716 }
1717
a0d0e21e
LW
1718 /* Starting-point info. */
1719 again:
653099ff 1720 if (PL_regkind[(U8)OP(first)] == EXACT) {
1aa99e6b
IH
1721 if (OP(first) == EXACT)
1722 ; /* Empty, get anchored substr later. */
1723 else if ((OP(first) == EXACTF || OP(first) == EXACTFL))
b3c9acc1
IZ
1724 r->regstclass = first;
1725 }
653099ff 1726 else if (strchr((char*)PL_simple,OP(first)))
a0d0e21e 1727 r->regstclass = first;
22c35a8c
GS
1728 else if (PL_regkind[(U8)OP(first)] == BOUND ||
1729 PL_regkind[(U8)OP(first)] == NBOUND)
a0d0e21e 1730 r->regstclass = first;
22c35a8c 1731 else if (PL_regkind[(U8)OP(first)] == BOL) {
cad2e5aa
JH
1732 r->reganch |= (OP(first) == MBOL
1733 ? ROPT_ANCH_MBOL
1734 : (OP(first) == SBOL
1735 ? ROPT_ANCH_SBOL
1736 : ROPT_ANCH_BOL));
a0d0e21e 1737 first = NEXTOPER(first);
774d564b 1738 goto again;
1739 }
1740 else if (OP(first) == GPOS) {
1741 r->reganch |= ROPT_ANCH_GPOS;
1742 first = NEXTOPER(first);
1743 goto again;
a0d0e21e
LW
1744 }
1745 else if ((OP(first) == STAR &&
22c35a8c 1746 PL_regkind[(U8)OP(NEXTOPER(first))] == REG_ANY) &&
a0d0e21e
LW
1747 !(r->reganch & ROPT_ANCH) )
1748 {
1749 /* turn .* into ^.* with an implied $*=1 */
cad2e5aa
JH
1750 int type = OP(NEXTOPER(first));
1751
ffc61ed2 1752 if (type == REG_ANY)
cad2e5aa
JH
1753 type = ROPT_ANCH_MBOL;
1754 else
1755 type = ROPT_ANCH_SBOL;
1756
1757 r->reganch |= type | ROPT_IMPLICIT;
a0d0e21e 1758 first = NEXTOPER(first);
774d564b 1759 goto again;
a0d0e21e 1760 }
b81d288d 1761 if (sawplus && (!sawopen || !RExC_sawback)
830247a4 1762 && !(RExC_seen & REG_SEEN_EVAL)) /* May examine pos and $& */
cad2e5aa
JH
1763 /* x+ must match at the 1st pos of run of x's */
1764 r->reganch |= ROPT_SKIP;
a0d0e21e 1765
c277df42 1766 /* Scan is after the zeroth branch, first is atomic matcher. */
b81d288d 1767 DEBUG_r(PerlIO_printf(Perl_debug_log, "first at %"IVdf"\n",
b900a521 1768 (IV)(first - scan + 1)));
a0d0e21e
LW
1769 /*
1770 * If there's something expensive in the r.e., find the
1771 * longest literal string that must appear and make it the
1772 * regmust. Resolve ties in favor of later strings, since
1773 * the regstart check works with the beginning of the r.e.
1774 * and avoiding duplication strengthens checking. Not a
1775 * strong reason, but sufficient in the absence of others.
1776 * [Now we resolve ties in favor of the earlier string if
c277df42 1777 * it happens that c_offset_min has been invalidated, since the
a0d0e21e
LW
1778 * earlier string may buy us something the later one won't.]
1779 */
a0d0e21e 1780 minlen = 0;
a687059c 1781
79cb57f6
GS
1782 data.longest_fixed = newSVpvn("",0);
1783 data.longest_float = newSVpvn("",0);
1784 data.last_found = newSVpvn("",0);
c277df42
IZ
1785 data.longest = &(data.longest_fixed);
1786 first = scan;
653099ff 1787 if (!r->regstclass) {
830247a4 1788 cl_init(pRExC_state, &ch_class);
653099ff
GS
1789 data.start_class = &ch_class;
1790 stclass_flag = SCF_DO_STCLASS_AND;
1791 } else /* XXXX Check for BOUND? */
1792 stclass_flag = 0;
cb434fcc 1793 data.last_closep = &last_close;
653099ff 1794
830247a4 1795 minlen = study_chunk(pRExC_state, &first, &fake, scan + RExC_size, /* Up to end */
e1901655 1796 &data, SCF_DO_SUBSTR | SCF_WHILEM_VISITED_POS | stclass_flag);
830247a4 1797 if ( RExC_npar == 1 && data.longest == &(data.longest_fixed)
b81d288d 1798 && data.last_start_min == 0 && data.last_end > 0
830247a4
IZ
1799 && !RExC_seen_zerolen
1800 && (!(RExC_seen & REG_SEEN_GPOS) || (r->reganch & ROPT_ANCH_GPOS)))
c277df42 1801 r->reganch |= ROPT_CHECK_ALL;
830247a4 1802 scan_commit(pRExC_state, &data);
c277df42
IZ
1803 SvREFCNT_dec(data.last_found);
1804
a0ed51b3 1805 longest_float_length = CHR_SVLEN(data.longest_float);
c5254dd6 1806 if (longest_float_length
c277df42
IZ
1807 || (data.flags & SF_FL_BEFORE_EOL
1808 && (!(data.flags & SF_FL_BEFORE_MEOL)
830247a4 1809 || (RExC_flags16 & PMf_MULTILINE)))) {
cf93c79d
IZ
1810 int t;
1811
a0ed51b3 1812 if (SvCUR(data.longest_fixed) /* ok to leave SvCUR */
aca2d497
IZ
1813 && data.offset_fixed == data.offset_float_min
1814 && SvCUR(data.longest_fixed) == SvCUR(data.longest_float))
1815 goto remove_float; /* As in (a)+. */
1816
c277df42
IZ
1817 r->float_substr = data.longest_float;
1818 r->float_min_offset = data.offset_float_min;
1819 r->float_max_offset = data.offset_float_max;
cf93c79d
IZ
1820 t = (data.flags & SF_FL_BEFORE_EOL /* Can't have SEOL and MULTI */
1821 && (!(data.flags & SF_FL_BEFORE_MEOL)
830247a4 1822 || (RExC_flags16 & PMf_MULTILINE)));
cf93c79d 1823 fbm_compile(r->float_substr, t ? FBMcf_TAIL : 0);
a0ed51b3
LW
1824 }
1825 else {
aca2d497 1826 remove_float:
c277df42
IZ
1827 r->float_substr = Nullsv;
1828 SvREFCNT_dec(data.longest_float);
c5254dd6 1829 longest_float_length = 0;
a0d0e21e 1830 }
c277df42 1831
a0ed51b3 1832 longest_fixed_length = CHR_SVLEN(data.longest_fixed);
c5254dd6 1833 if (longest_fixed_length
c277df42
IZ
1834 || (data.flags & SF_FIX_BEFORE_EOL /* Cannot have SEOL and MULTI */
1835 && (!(data.flags & SF_FIX_BEFORE_MEOL)
830247a4 1836 || (RExC_flags16 & PMf_MULTILINE)))) {
cf93c79d
IZ
1837 int t;
1838
c277df42
IZ
1839 r->anchored_substr = data.longest_fixed;
1840 r->anchored_offset = data.offset_fixed;
cf93c79d
IZ
1841 t = (data.flags & SF_FIX_BEFORE_EOL /* Can't have SEOL and MULTI */
1842 && (!(data.flags & SF_FIX_BEFORE_MEOL)
830247a4 1843 || (RExC_flags16 & PMf_MULTILINE)));
cf93c79d 1844 fbm_compile(r->anchored_substr, t ? FBMcf_TAIL : 0);
a0ed51b3
LW
1845 }
1846 else {
c277df42
IZ
1847 r->anchored_substr = Nullsv;
1848 SvREFCNT_dec(data.longest_fixed);
c5254dd6 1849 longest_fixed_length = 0;
a0d0e21e 1850 }
b81d288d 1851 if (r->regstclass
ffc61ed2 1852 && (OP(r->regstclass) == REG_ANY || OP(r->regstclass) == SANY))
653099ff
GS
1853 r->regstclass = NULL;
1854 if ((!r->anchored_substr || r->anchored_offset) && stclass_flag
1855 && !(data.start_class->flags & ANYOF_EOS)
1856 && !cl_is_anything(data.start_class)) {
1857 SV *sv;
830247a4 1858 I32 n = add_data(pRExC_state, 1, "f");
653099ff 1859
b81d288d 1860 New(1006, RExC_rx->data->data[n], 1,
653099ff
GS
1861 struct regnode_charclass_class);
1862 StructCopy(data.start_class,
830247a4 1863 (struct regnode_charclass_class*)RExC_rx->data->data[n],
653099ff 1864 struct regnode_charclass_class);
830247a4 1865 r->regstclass = (regnode*)RExC_rx->data->data[n];
653099ff 1866 r->reganch &= ~ROPT_SKIP; /* Used in find_byclass(). */
ffc61ed2 1867 PL_regdata = r->data; /* for regprop() */
653099ff
GS
1868 DEBUG_r((sv = sv_newmortal(),
1869 regprop(sv, (regnode*)data.start_class),
894356b3 1870 PerlIO_printf(Perl_debug_log, "synthetic stclass `%s'.\n",
653099ff
GS
1871 SvPVX(sv))));
1872 }
c277df42
IZ
1873
1874 /* A temporary algorithm prefers floated substr to fixed one to dig more info. */
c5254dd6 1875 if (longest_fixed_length > longest_float_length) {
c277df42
IZ
1876 r->check_substr = r->anchored_substr;
1877 r->check_offset_min = r->check_offset_max = r->anchored_offset;
1878 if (r->reganch & ROPT_ANCH_SINGLE)
1879 r->reganch |= ROPT_NOSCAN;
a0ed51b3
LW
1880 }
1881 else {
c277df42
IZ
1882 r->check_substr = r->float_substr;
1883 r->check_offset_min = data.offset_float_min;
1884 r->check_offset_max = data.offset_float_max;
a0d0e21e 1885 }
30382c73
IZ
1886 /* XXXX Currently intuiting is not compatible with ANCH_GPOS.
1887 This should be changed ASAP! */
1888 if (r->check_substr && !(r->reganch & ROPT_ANCH_GPOS)) {
cad2e5aa
JH
1889 r->reganch |= RE_USE_INTUIT;
1890 if (SvTAIL(r->check_substr))
1891 r->reganch |= RE_INTUIT_TAIL;
1892 }
a0ed51b3
LW
1893 }
1894 else {
c277df42
IZ
1895 /* Several toplevels. Best we can is to set minlen. */
1896 I32 fake;
653099ff 1897 struct regnode_charclass_class ch_class;
cb434fcc 1898 I32 last_close = 0;
c277df42
IZ
1899
1900 DEBUG_r(PerlIO_printf(Perl_debug_log, "\n"));
1901 scan = r->program + 1;
830247a4 1902 cl_init(pRExC_state, &ch_class);
653099ff 1903 data.start_class = &ch_class;
cb434fcc 1904 data.last_closep = &last_close;
e1901655 1905 minlen = study_chunk(pRExC_state, &scan, &fake, scan + RExC_size, &data, SCF_DO_STCLASS_AND|SCF_WHILEM_VISITED_POS);
c277df42 1906 r->check_substr = r->anchored_substr = r->float_substr = Nullsv;
653099ff
GS
1907 if (!(data.start_class->flags & ANYOF_EOS)
1908 && !cl_is_anything(data.start_class)) {
1909 SV *sv;
830247a4 1910 I32 n = add_data(pRExC_state, 1, "f");
653099ff 1911
b81d288d 1912 New(1006, RExC_rx->data->data[n], 1,
653099ff
GS
1913 struct regnode_charclass_class);
1914 StructCopy(data.start_class,
830247a4 1915 (struct regnode_charclass_class*)RExC_rx->data->data[n],
653099ff 1916 struct regnode_charclass_class);
830247a4 1917 r->regstclass = (regnode*)RExC_rx->data->data[n];
653099ff
GS
1918 r->reganch &= ~ROPT_SKIP; /* Used in find_byclass(). */
1919 DEBUG_r((sv = sv_newmortal(),
1920 regprop(sv, (regnode*)data.start_class),
894356b3 1921 PerlIO_printf(Perl_debug_log, "synthetic stclass `%s'.\n",
653099ff
GS
1922 SvPVX(sv))));
1923 }
a0d0e21e
LW
1924 }
1925
a0d0e21e 1926 r->minlen = minlen;
b81d288d 1927 if (RExC_seen & REG_SEEN_GPOS)
c277df42 1928 r->reganch |= ROPT_GPOS_SEEN;
830247a4 1929 if (RExC_seen & REG_SEEN_LOOKBEHIND)
c277df42 1930 r->reganch |= ROPT_LOOKBEHIND_SEEN;
830247a4 1931 if (RExC_seen & REG_SEEN_EVAL)
ce862d02 1932 r->reganch |= ROPT_EVAL_SEEN;
830247a4
IZ
1933 Newz(1002, r->startp, RExC_npar, I32);
1934 Newz(1002, r->endp, RExC_npar, I32);
ffc61ed2 1935 PL_regdata = r->data; /* for regprop() */
a0d0e21e
LW
1936 DEBUG_r(regdump(r));
1937 return(r);
a687059c
LW
1938}
1939
1940/*
1941 - reg - regular expression, i.e. main body or parenthesized thing
1942 *
1943 * Caller must absorb opening parenthesis.
1944 *
1945 * Combining parenthesis handling with the base level of regular expression
1946 * is a trifle forced, but the need to tie the tails of the branches to what
1947 * follows makes it hard to avoid.
1948 */
76e3520e 1949STATIC regnode *
830247a4 1950S_reg(pTHX_ RExC_state_t *pRExC_state, I32 paren, I32 *flagp)
c277df42 1951 /* paren: Parenthesized? 0=top, 1=(, inside: changed to letter. */
a687059c 1952{
c277df42
IZ
1953 register regnode *ret; /* Will be the head of the group. */
1954 register regnode *br;
1955 register regnode *lastbr;
1956 register regnode *ender = 0;
a0d0e21e 1957 register I32 parno = 0;
830247a4
IZ
1958 I32 flags, oregflags = RExC_flags16, have_branch = 0, open = 0;
1959 char *oregcomp_parse = RExC_parse;
c277df42 1960 char c;
a0d0e21e 1961
821b33a5 1962 *flagp = 0; /* Tentatively. */
a0d0e21e
LW
1963
1964 /* Make an OPEN node, if parenthesized. */
1965 if (paren) {
830247a4 1966 if (*RExC_parse == '?') {
ca9dfc88
IZ
1967 U16 posflags = 0, negflags = 0;
1968 U16 *flagsp = &posflags;
0f5d15d6 1969 int logical = 0;
830247a4 1970 char *seqstart = RExC_parse;
ca9dfc88 1971
830247a4
IZ
1972 RExC_parse++;
1973 paren = *RExC_parse++;
c277df42 1974 ret = NULL; /* For look-ahead/behind. */
a0d0e21e 1975 switch (paren) {
c277df42 1976 case '<':
830247a4 1977 RExC_seen |= REG_SEEN_LOOKBEHIND;
b81d288d 1978 if (*RExC_parse == '!')
c277df42 1979 paren = ',';
b81d288d 1980 if (*RExC_parse != '=' && *RExC_parse != '!')
c277df42 1981 goto unknown;
830247a4 1982 RExC_parse++;
a0d0e21e
LW
1983 case '=':
1984 case '!':
830247a4 1985 RExC_seen_zerolen++;
c277df42
IZ
1986 case ':':
1987 case '>':
a0d0e21e
LW
1988 break;
1989 case '$':
1990 case '@':
8615cb43 1991 vFAIL2("Sequence (?%c...) not implemented", (int)paren);
a0d0e21e
LW
1992 break;
1993 case '#':
830247a4
IZ
1994 while (*RExC_parse && *RExC_parse != ')')
1995 RExC_parse++;
1996 if (*RExC_parse != ')')
c277df42 1997 FAIL("Sequence (?#... not terminated");
830247a4 1998 nextchar(pRExC_state);
a0d0e21e
LW
1999 *flagp = TRYAGAIN;
2000 return NULL;
8c8ad484 2001 case 'p':
d7e9e385 2002 if (SIZE_ONLY)
830247a4 2003 vWARN(RExC_parse, "(?p{}) is deprecated - use (??{})");
8c8ad484 2004 /* FALL THROUGH*/
14455d6c 2005 case '?':
0f5d15d6 2006 logical = 1;
830247a4 2007 paren = *RExC_parse++;
0f5d15d6 2008 /* FALL THROUGH */
c277df42
IZ
2009 case '{':
2010 {
c277df42
IZ
2011 I32 count = 1, n = 0;
2012 char c;
830247a4 2013 char *s = RExC_parse;
c277df42
IZ
2014 SV *sv;
2015 OP_4tree *sop, *rop;
2016
830247a4
IZ
2017 RExC_seen_zerolen++;
2018 RExC_seen |= REG_SEEN_EVAL;
2019 while (count && (c = *RExC_parse)) {
2020 if (c == '\\' && RExC_parse[1])
2021 RExC_parse++;
b81d288d 2022 else if (c == '{')
c277df42 2023 count++;
b81d288d 2024 else if (c == '}')
c277df42 2025 count--;
830247a4 2026 RExC_parse++;
c277df42 2027 }
830247a4 2028 if (*RExC_parse != ')')
b45f050a 2029 {
b81d288d 2030 RExC_parse = s;
b45f050a
JF
2031 vFAIL("Sequence (?{...}) not terminated or not {}-balanced");
2032 }
c277df42
IZ
2033 if (!SIZE_ONLY) {
2034 AV *av;
b81d288d
AB
2035
2036 if (RExC_parse - 1 - s)
830247a4 2037 sv = newSVpvn(s, RExC_parse - 1 - s);
c277df42 2038 else
79cb57f6 2039 sv = newSVpvn("", 0);
c277df42 2040
569233ed
SB
2041 ENTER;
2042 Perl_save_re_context(aTHX);
c277df42 2043 rop = sv_compile_2op(sv, &sop, "re", &av);
569233ed 2044 LEAVE;
c277df42 2045
830247a4
IZ
2046 n = add_data(pRExC_state, 3, "nop");
2047 RExC_rx->data->data[n] = (void*)rop;
2048 RExC_rx->data->data[n+1] = (void*)sop;
2049 RExC_rx->data->data[n+2] = (void*)av;
c277df42 2050 SvREFCNT_dec(sv);
a0ed51b3 2051 }
e24b16f9 2052 else { /* First pass */
830247a4 2053 if (PL_reginterp_cnt < ++RExC_seen_evals
e24b16f9 2054 && PL_curcop != &PL_compiling)
2cd61cdb
IZ
2055 /* No compiled RE interpolated, has runtime
2056 components ===> unsafe. */
2057 FAIL("Eval-group not allowed at runtime, use re 'eval'");
3280af22 2058 if (PL_tainted)
cc6b7395 2059 FAIL("Eval-group in insecure regular expression");
c277df42
IZ
2060 }
2061
830247a4 2062 nextchar(pRExC_state);
0f5d15d6 2063 if (logical) {
830247a4 2064 ret = reg_node(pRExC_state, LOGICAL);
0f5d15d6
IZ
2065 if (!SIZE_ONLY)
2066 ret->flags = 2;
830247a4 2067 regtail(pRExC_state, ret, reganode(pRExC_state, EVAL, n));
0f5d15d6
IZ
2068 return ret;
2069 }
830247a4 2070 return reganode(pRExC_state, EVAL, n);
c277df42
IZ
2071 }
2072 case '(':
2073 {
830247a4 2074 if (RExC_parse[0] == '?') {
b81d288d
AB
2075 if (RExC_parse[1] == '=' || RExC_parse[1] == '!'
2076 || RExC_parse[1] == '<'
830247a4 2077 || RExC_parse[1] == '{') { /* Lookahead or eval. */
c277df42
IZ
2078 I32 flag;
2079
830247a4 2080 ret = reg_node(pRExC_state, LOGICAL);
0f5d15d6
IZ
2081 if (!SIZE_ONLY)
2082 ret->flags = 1;
830247a4 2083 regtail(pRExC_state, ret, reg(pRExC_state, 1, &flag));
c277df42 2084 goto insert_if;
b81d288d 2085 }
a0ed51b3 2086 }
830247a4
IZ
2087 else if (RExC_parse[0] >= '1' && RExC_parse[0] <= '9' ) {
2088 parno = atoi(RExC_parse++);
c277df42 2089
830247a4
IZ
2090 while (isDIGIT(*RExC_parse))
2091 RExC_parse++;
2092 ret = reganode(pRExC_state, GROUPP, parno);
2093 if ((c = *nextchar(pRExC_state)) != ')')
b45f050a 2094 vFAIL("Switch condition not recognized");
c277df42 2095 insert_if:
830247a4
IZ
2096 regtail(pRExC_state, ret, reganode(pRExC_state, IFTHEN, 0));
2097 br = regbranch(pRExC_state, &flags, 1);
c277df42 2098 if (br == NULL)
830247a4 2099 br = reganode(pRExC_state, LONGJMP, 0);
c277df42 2100 else
830247a4
IZ
2101 regtail(pRExC_state, br, reganode(pRExC_state, LONGJMP, 0));
2102 c = *nextchar(pRExC_state);
d1b80229
IZ
2103 if (flags&HASWIDTH)
2104 *flagp |= HASWIDTH;
c277df42 2105 if (c == '|') {
830247a4
IZ
2106 lastbr = reganode(pRExC_state, IFTHEN, 0); /* Fake one for optimizer. */
2107 regbranch(pRExC_state, &flags, 1);
2108 regtail(pRExC_state, ret, lastbr);
d1b80229
IZ
2109 if (flags&HASWIDTH)
2110 *flagp |= HASWIDTH;
830247a4 2111 c = *nextchar(pRExC_state);
a0ed51b3
LW
2112 }
2113 else
c277df42
IZ
2114 lastbr = NULL;
2115 if (c != ')')
8615cb43 2116 vFAIL("Switch (?(condition)... contains too many branches");
830247a4
IZ
2117 ender = reg_node(pRExC_state, TAIL);
2118 regtail(pRExC_state, br, ender);
c277df42 2119 if (lastbr) {
830247a4
IZ
2120 regtail(pRExC_state, lastbr, ender);
2121 regtail(pRExC_state, NEXTOPER(NEXTOPER(lastbr)), ender);
a0ed51b3
LW
2122 }
2123 else
830247a4 2124 regtail(pRExC_state, ret, ender);
c277df42 2125 return ret;
a0ed51b3
LW
2126 }
2127 else {
830247a4 2128 vFAIL2("Unknown switch condition (?(%.2s", RExC_parse);
c277df42
IZ
2129 }
2130 }
1b1626e4 2131 case 0:
830247a4 2132 RExC_parse--; /* for vFAIL to print correctly */
8615cb43 2133 vFAIL("Sequence (? incomplete");
1b1626e4 2134 break;
a0d0e21e 2135 default:
830247a4 2136 --RExC_parse;
ca9dfc88 2137 parse_flags:
830247a4
IZ
2138 while (*RExC_parse && strchr("iogcmsx", *RExC_parse)) {
2139 if (*RExC_parse != 'o')
2140 pmflag(flagsp, *RExC_parse);
2141 ++RExC_parse;
ca9dfc88 2142 }
830247a4 2143 if (*RExC_parse == '-') {
ca9dfc88 2144 flagsp = &negflags;
830247a4 2145 ++RExC_parse;
ca9dfc88 2146 goto parse_flags;
48c036b1 2147 }
830247a4
IZ
2148 RExC_flags16 |= posflags;
2149 RExC_flags16 &= ~negflags;
2150 if (*RExC_parse == ':') {
2151 RExC_parse++;
ca9dfc88
IZ
2152 paren = ':';
2153 break;
2154 }
c277df42 2155 unknown:
830247a4
IZ
2156 if (*RExC_parse != ')') {
2157 RExC_parse++;
2158 vFAIL3("Sequence (%.*s...) not recognized", RExC_parse-seqstart, seqstart);
b45f050a 2159 }
830247a4 2160 nextchar(pRExC_state);
a0d0e21e
LW
2161 *flagp = TRYAGAIN;
2162 return NULL;
2163 }
2164 }
2165 else {
830247a4
IZ
2166 parno = RExC_npar;
2167 RExC_npar++;
2168 ret = reganode(pRExC_state, OPEN, parno);
c277df42 2169 open = 1;
a0d0e21e 2170 }
a0ed51b3
LW
2171 }
2172 else
a0d0e21e
LW
2173 ret = NULL;
2174
2175 /* Pick up the branches, linking them together. */
830247a4 2176 br = regbranch(pRExC_state, &flags, 1);
a0d0e21e
LW
2177 if (br == NULL)
2178 return(NULL);
830247a4
IZ
2179 if (*RExC_parse == '|') {
2180 if (!SIZE_ONLY && RExC_extralen) {
2181 reginsert(pRExC_state, BRANCHJ, br);
a0ed51b3
LW
2182 }
2183 else
830247a4 2184 reginsert(pRExC_state, BRANCH, br);
c277df42
IZ
2185 have_branch = 1;
2186 if (SIZE_ONLY)
830247a4 2187 RExC_extralen += 1; /* For BRANCHJ-BRANCH. */
a0ed51b3
LW
2188 }
2189 else if (paren == ':') {
c277df42
IZ
2190 *flagp |= flags&SIMPLE;
2191 }
2192 if (open) { /* Starts with OPEN. */
830247a4 2193 regtail(pRExC_state, ret, br); /* OPEN -> first. */
a0ed51b3
LW
2194 }
2195 else if (paren != '?') /* Not Conditional */
a0d0e21e 2196 ret = br;
821b33a5
IZ
2197 if (flags&HASWIDTH)
2198 *flagp |= HASWIDTH;
a0d0e21e 2199 *flagp |= flags&SPSTART;
c277df42 2200 lastbr = br;
830247a4
IZ
2201 while (*RExC_parse == '|') {
2202 if (!SIZE_ONLY && RExC_extralen) {
2203 ender = reganode(pRExC_state, LONGJMP,0);
2204 regtail(pRExC_state, NEXTOPER(NEXTOPER(lastbr)), ender); /* Append to the previous. */
c277df42
IZ
2205 }
2206 if (SIZE_ONLY)
830247a4
IZ
2207 RExC_extralen += 2; /* Account for LONGJMP. */
2208 nextchar(pRExC_state);
2209 br = regbranch(pRExC_state, &flags, 0);
a687059c 2210 if (br == NULL)
a0d0e21e 2211 return(NULL);
830247a4 2212 regtail(pRExC_state, lastbr, br); /* BRANCH -> BRANCH. */
c277df42 2213 lastbr = br;
821b33a5
IZ
2214 if (flags&HASWIDTH)
2215 *flagp |= HASWIDTH;
a687059c 2216 *flagp |= flags&SPSTART;
a0d0e21e
LW
2217 }
2218
c277df42
IZ
2219 if (have_branch || paren != ':') {
2220 /* Make a closing node, and hook it on the end. */
2221 switch (paren) {
2222 case ':':
830247a4 2223 ender = reg_node(pRExC_state, TAIL);
c277df42
IZ
2224 break;
2225 case 1:
830247a4 2226 ender = reganode(pRExC_state, CLOSE, parno);
c277df42
IZ
2227 break;
2228 case '<':
c277df42
IZ
2229 case ',':
2230 case '=':
2231 case '!':
c277df42 2232 *flagp &= ~HASWIDTH;
821b33a5
IZ
2233 /* FALL THROUGH */
2234 case '>':
830247a4 2235 ender = reg_node(pRExC_state, SUCCEED);
c277df42
IZ
2236 break;
2237 case 0:
830247a4 2238 ender = reg_node(pRExC_state, END);
c277df42
IZ
2239 break;
2240 }
830247a4 2241 regtail(pRExC_state, lastbr, ender);
a0d0e21e 2242
c277df42
IZ
2243 if (have_branch) {
2244 /* Hook the tails of the branches to the closing node. */
2245 for (br = ret; br != NULL; br = regnext(br)) {
830247a4 2246 regoptail(pRExC_state, br, ender);
c277df42
IZ
2247 }
2248 }
a0d0e21e 2249 }
c277df42
IZ
2250
2251 {
2252 char *p;
2253 static char parens[] = "=!<,>";
2254
2255 if (paren && (p = strchr(parens, paren))) {
2256 int node = ((p - parens) % 2) ? UNLESSM : IFMATCH;
2257 int flag = (p - parens) > 1;
2258
2259 if (paren == '>')
2260 node = SUSPEND, flag = 0;
830247a4 2261 reginsert(pRExC_state, node,ret);
c277df42 2262 ret->flags = flag;
830247a4 2263 regtail(pRExC_state, ret, reg_node(pRExC_state, TAIL));
c277df42 2264 }
a0d0e21e
LW
2265 }
2266
2267 /* Check for proper termination. */
ce3e6498 2268 if (paren) {
830247a4
IZ
2269 RExC_flags16 = oregflags;
2270 if (RExC_parse >= RExC_end || *nextchar(pRExC_state) != ')') {
2271 RExC_parse = oregcomp_parse;
380a0633 2272 vFAIL("Unmatched (");
ce3e6498 2273 }
a0ed51b3 2274 }
830247a4
IZ
2275 else if (!paren && RExC_parse < RExC_end) {
2276 if (*RExC_parse == ')') {
2277 RExC_parse++;
380a0633 2278 vFAIL("Unmatched )");
a0ed51b3
LW
2279 }
2280 else
b45f050a 2281 FAIL("Junk on end of regexp"); /* "Can't happen". */
a0d0e21e
LW
2282 /* NOTREACHED */
2283 }
a687059c 2284
a0d0e21e 2285 return(ret);
a687059c
LW
2286}
2287
2288/*
2289 - regbranch - one alternative of an | operator
2290 *
2291 * Implements the concatenation operator.
2292 */
76e3520e 2293STATIC regnode *
830247a4 2294S_regbranch(pTHX_ RExC_state_t *pRExC_state, I32 *flagp, I32 first)
a687059c 2295{
c277df42
IZ
2296 register regnode *ret;
2297 register regnode *chain = NULL;
2298 register regnode *latest;
2299 I32 flags = 0, c = 0;
a0d0e21e 2300
b81d288d 2301 if (first)
c277df42
IZ
2302 ret = NULL;
2303 else {
b81d288d 2304 if (!SIZE_ONLY && RExC_extralen)
830247a4 2305 ret = reganode(pRExC_state, BRANCHJ,0);
c277df42 2306 else
830247a4 2307 ret = reg_node(pRExC_state, BRANCH);
c277df42
IZ
2308 }
2309
b81d288d 2310 if (!first && SIZE_ONLY)
830247a4 2311 RExC_extralen += 1; /* BRANCHJ */
b81d288d 2312
c277df42 2313 *flagp = WORST; /* Tentatively. */
a0d0e21e 2314
830247a4
IZ
2315 RExC_parse--;
2316 nextchar(pRExC_state);
2317 while (RExC_parse < RExC_end && *RExC_parse != '|' && *RExC_parse != ')') {
a0d0e21e 2318 flags &= ~TRYAGAIN;
830247a4 2319 latest = regpiece(pRExC_state, &flags);
a0d0e21e
LW
2320 if (latest == NULL) {
2321 if (flags & TRYAGAIN)
2322 continue;
2323 return(NULL);
a0ed51b3
LW
2324 }
2325 else if (ret == NULL)
c277df42 2326 ret = latest;
a0d0e21e 2327 *flagp |= flags&HASWIDTH;
c277df42 2328 if (chain == NULL) /* First piece. */
a0d0e21e
LW
2329 *flagp |= flags&SPSTART;
2330 else {
830247a4
IZ
2331 RExC_naughty++;
2332 regtail(pRExC_state, chain, latest);
a687059c 2333 }
a0d0e21e 2334 chain = latest;
c277df42
IZ
2335 c++;
2336 }
2337 if (chain == NULL) { /* Loop ran zero times. */
830247a4 2338 chain = reg_node(pRExC_state, NOTHING);
c277df42
IZ
2339 if (ret == NULL)
2340 ret = chain;
2341 }
2342 if (c == 1) {
2343 *flagp |= flags&SIMPLE;
a0d0e21e 2344 }
a687059c 2345
a0d0e21e 2346 return(ret);
a687059c
LW
2347}
2348
2349/*
2350 - regpiece - something followed by possible [*+?]
2351 *
2352 * Note that the branching code sequences used for ? and the general cases
2353 * of * and + are somewhat optimized: they use the same NOTHING node as
2354 * both the endmarker for their branch list and the body of the last branch.
2355 * It might seem that this node could be dispensed with entirely, but the
2356 * endmarker role is not redundant.
2357 */
76e3520e 2358STATIC regnode *
830247a4 2359S_regpiece(pTHX_ RExC_state_t *pRExC_state, I32 *flagp)
a687059c 2360{
c277df42 2361 register regnode *ret;
a0d0e21e
LW
2362 register char op;
2363 register char *next;
2364 I32 flags;
830247a4 2365 char *origparse = RExC_parse;
a0d0e21e
LW
2366 char *maxpos;
2367 I32 min;
c277df42 2368 I32 max = REG_INFTY;
a0d0e21e 2369
830247a4 2370 ret = regatom(pRExC_state, &flags);
a0d0e21e
LW
2371 if (ret == NULL) {
2372 if (flags & TRYAGAIN)
2373 *flagp |= TRYAGAIN;
2374 return(NULL);
2375 }
2376
830247a4 2377 op = *RExC_parse;
a0d0e21e 2378
830247a4
IZ
2379 if (op == '{' && regcurly(RExC_parse)) {
2380 next = RExC_parse + 1;
a0d0e21e
LW
2381 maxpos = Nullch;
2382 while (isDIGIT(*next) || *next == ',') {
2383 if (*next == ',') {
2384 if (maxpos)
2385 break;
2386 else
2387 maxpos = next;
a687059c 2388 }
a0d0e21e
LW
2389 next++;
2390 }
2391 if (*next == '}') { /* got one */
2392 if (!maxpos)
2393 maxpos = next;
830247a4
IZ
2394 RExC_parse++;
2395 min = atoi(RExC_parse);
a0d0e21e
LW
2396 if (*maxpos == ',')
2397 maxpos++;
2398 else
830247a4 2399 maxpos = RExC_parse;
a0d0e21e
LW
2400 max = atoi(maxpos);
2401 if (!max && *maxpos != '0')
c277df42
IZ
2402 max = REG_INFTY; /* meaning "infinity" */
2403 else if (max >= REG_INFTY)
8615cb43 2404 vFAIL2("Quantifier in {,} bigger than %d", REG_INFTY - 1);
830247a4
IZ
2405 RExC_parse = next;
2406 nextchar(pRExC_state);
a0d0e21e
LW
2407
2408 do_curly:
2409 if ((flags&SIMPLE)) {
830247a4
IZ
2410 RExC_naughty += 2 + RExC_naughty / 2;
2411 reginsert(pRExC_state, CURLY, ret);
a0d0e21e
LW
2412 }
2413 else {
830247a4 2414 regnode *w = reg_node(pRExC_state, WHILEM);
2c2d71f5
JH
2415
2416 w->flags = 0;
830247a4
IZ
2417 regtail(pRExC_state, ret, w);
2418 if (!SIZE_ONLY && RExC_extralen) {
2419 reginsert(pRExC_state, LONGJMP,ret);
2420 reginsert(pRExC_state, NOTHING,ret);
c277df42
IZ
2421 NEXT_OFF(ret) = 3; /* Go over LONGJMP. */
2422 }
830247a4
IZ
2423 reginsert(pRExC_state, CURLYX,ret);
2424 if (!SIZE_ONLY && RExC_extralen)
c277df42 2425 NEXT_OFF(ret) = 3; /* Go over NOTHING to LONGJMP. */
830247a4 2426 regtail(pRExC_state, ret, reg_node(pRExC_state, NOTHING));
c277df42 2427 if (SIZE_ONLY)
830247a4
IZ
2428 RExC_whilem_seen++, RExC_extralen += 3;
2429 RExC_naughty += 4 + RExC_naughty; /* compound interest */
a0d0e21e 2430 }
c277df42 2431 ret->flags = 0;
a0d0e21e
LW
2432
2433 if (min > 0)
821b33a5
IZ
2434 *flagp = WORST;
2435 if (max > 0)
2436 *flagp |= HASWIDTH;
a0d0e21e 2437 if (max && max < min)
8615cb43 2438 vFAIL("Can't do {n,m} with n > m");
c277df42
IZ
2439 if (!SIZE_ONLY) {
2440 ARG1_SET(ret, min);
2441 ARG2_SET(ret, max);
a687059c 2442 }
a687059c 2443
a0d0e21e 2444 goto nest_check;
a687059c 2445 }
a0d0e21e 2446 }
a687059c 2447
a0d0e21e
LW
2448 if (!ISMULT1(op)) {
2449 *flagp = flags;
a687059c 2450 return(ret);
a0d0e21e 2451 }
bb20fd44 2452
c277df42 2453#if 0 /* Now runtime fix should be reliable. */
b45f050a
JF
2454
2455 /* if this is reinstated, don't forget to put this back into perldiag:
2456
2457 =item Regexp *+ operand could be empty at {#} in regex m/%s/
2458
2459 (F) The part of the regexp subject to either the * or + quantifier
2460 could match an empty string. The {#} shows in the regular
2461 expression about where the problem was discovered.
2462
2463 */
2464
bb20fd44 2465 if (!(flags&HASWIDTH) && op != '?')
b45f050a 2466 vFAIL("Regexp *+ operand could be empty");
b81d288d 2467#endif
bb20fd44 2468
830247a4 2469 nextchar(pRExC_state);
a0d0e21e 2470
821b33a5 2471 *flagp = (op != '+') ? (WORST|SPSTART|HASWIDTH) : (WORST|HASWIDTH);
a0d0e21e
LW
2472
2473 if (op == '*' && (flags&SIMPLE)) {
830247a4 2474 reginsert(pRExC_state, STAR, ret);
c277df42 2475 ret->flags = 0;
830247a4 2476 RExC_naughty += 4;
a0d0e21e
LW
2477 }
2478 else if (op == '*') {
2479 min = 0;
2480 goto do_curly;
a0ed51b3
LW
2481 }
2482 else if (op == '+' && (flags&SIMPLE)) {
830247a4 2483 reginsert(pRExC_state, PLUS, ret);
c277df42 2484 ret->flags = 0;
830247a4 2485 RExC_naughty += 3;
a0d0e21e
LW
2486 }
2487 else if (op == '+') {
2488 min = 1;
2489 goto do_curly;
a0ed51b3
LW
2490 }
2491 else if (op == '?') {
a0d0e21e
LW
2492 min = 0; max = 1;
2493 goto do_curly;
2494 }
2495 nest_check:
e476b1b5 2496 if (ckWARN(WARN_REGEXP) && !SIZE_ONLY && !(flags&HASWIDTH) && max > REG_INFTY/3) {
830247a4 2497 vWARN3(RExC_parse,
b45f050a 2498 "%.*s matches null string many times",
830247a4 2499 RExC_parse - origparse,
b45f050a 2500 origparse);
a0d0e21e
LW
2501 }
2502
830247a4
IZ
2503 if (*RExC_parse == '?') {
2504 nextchar(pRExC_state);
2505 reginsert(pRExC_state, MINMOD, ret);
2506 regtail(pRExC_state, ret, ret + NODE_STEP_REGNODE);
a0d0e21e 2507 }
830247a4
IZ
2508 if (ISMULT2(RExC_parse)) {
2509 RExC_parse++;
b45f050a
JF
2510 vFAIL("Nested quantifiers");
2511 }
a0d0e21e
LW
2512
2513 return(ret);
a687059c
LW
2514}
2515
2516/*
2517 - regatom - the lowest level
2518 *
2519 * Optimization: gobbles an entire sequence of ordinary characters so that
2520 * it can turn them into a single node, which is smaller to store and
2521 * faster to run. Backslashed characters are exceptions, each becoming a
2522 * separate node; the code is simpler that way and it's not worth fixing.
2523 *
b45f050a 2524 * [Yes, it is worth fixing, some scripts can run twice the speed.] */
76e3520e 2525STATIC regnode *
830247a4 2526S_regatom(pTHX_ RExC_state_t *pRExC_state, I32 *flagp)
a687059c 2527{
c277df42 2528 register regnode *ret = 0;
a0d0e21e
LW
2529 I32 flags;
2530
2531 *flagp = WORST; /* Tentatively. */
2532
2533tryagain:
830247a4 2534 switch (*RExC_parse) {
a0d0e21e 2535 case '^':
830247a4
IZ
2536 RExC_seen_zerolen++;
2537 nextchar(pRExC_state);
2538 if (RExC_flags16 & PMf_MULTILINE)
2539 ret = reg_node(pRExC_state, MBOL);
2540 else if (RExC_flags16 & PMf_SINGLELINE)
2541 ret = reg_node(pRExC_state, SBOL);
a0d0e21e 2542 else
830247a4 2543 ret = reg_node(pRExC_state, BOL);
a0d0e21e
LW
2544 break;
2545 case '$':
830247a4 2546 nextchar(pRExC_state);
b81d288d 2547 if (*RExC_parse)
830247a4
IZ
2548 RExC_seen_zerolen++;
2549 if (RExC_flags16 & PMf_MULTILINE)
2550 ret = reg_node(pRExC_state, MEOL);
2551 else if (RExC_flags16 & PMf_SINGLELINE)
2552 ret = reg_node(pRExC_state, SEOL);
a0d0e21e 2553 else
830247a4 2554 ret = reg_node(pRExC_state, EOL);
a0d0e21e
LW
2555 break;
2556 case '.':
830247a4 2557 nextchar(pRExC_state);
ffc61ed2
JH
2558 if (RExC_flags16 & PMf_SINGLELINE)
2559 ret = reg_node(pRExC_state, SANY);
2560 else
2561 ret = reg_node(pRExC_state, REG_ANY);
2562 *flagp |= HASWIDTH|SIMPLE;
830247a4 2563 RExC_naughty++;
a0d0e21e
LW
2564 break;
2565 case '[':
b45f050a 2566 {
830247a4 2567 char *oregcomp_parse = ++RExC_parse;
ffc61ed2 2568 ret = regclass(pRExC_state);
830247a4
IZ
2569 if (*RExC_parse != ']') {
2570 RExC_parse = oregcomp_parse;
b45f050a
JF
2571 vFAIL("Unmatched [");
2572 }
830247a4 2573 nextchar(pRExC_state);
a0d0e21e
LW
2574 *flagp |= HASWIDTH|SIMPLE;
2575 break;
b45f050a 2576 }
a0d0e21e 2577 case '(':
830247a4
IZ
2578 nextchar(pRExC_state);
2579 ret = reg(pRExC_state, 1, &flags);
a0d0e21e 2580 if (ret == NULL) {
bf93d4cc 2581 if (flags & TRYAGAIN) {
830247a4 2582 if (RExC_parse == RExC_end) {
bf93d4cc
GS
2583 /* Make parent create an empty node if needed. */
2584 *flagp |= TRYAGAIN;
2585 return(NULL);
2586 }
a0d0e21e 2587 goto tryagain;
bf93d4cc 2588 }
a0d0e21e
LW
2589 return(NULL);
2590 }
c277df42 2591 *flagp |= flags&(HASWIDTH|SPSTART|SIMPLE);
a0d0e21e
LW
2592 break;
2593 case '|':
2594 case ')':
2595 if (flags & TRYAGAIN) {
2596 *flagp |= TRYAGAIN;
2597 return NULL;
2598 }
b45f050a 2599 vFAIL("Internal urp");
a0d0e21e
LW
2600 /* Supposed to be caught earlier. */
2601 break;
85afd4ae 2602 case '{':
830247a4
IZ
2603 if (!regcurly(RExC_parse)) {
2604 RExC_parse++;
85afd4ae
CS
2605 goto defchar;
2606 }
2607 /* FALL THROUGH */
a0d0e21e
LW
2608 case '?':
2609 case '+':
2610 case '*':
830247a4 2611 RExC_parse++;
b45f050a 2612 vFAIL("Quantifier follows nothing");
a0d0e21e
LW
2613 break;
2614 case '\\':
830247a4 2615 switch (*++RExC_parse) {
a0d0e21e 2616 case 'A':
830247a4
IZ
2617 RExC_seen_zerolen++;
2618 ret = reg_node(pRExC_state, SBOL);
a0d0e21e 2619 *flagp |= SIMPLE;
830247a4 2620 nextchar(pRExC_state);
a0d0e21e
LW
2621 break;
2622 case 'G':
830247a4
IZ
2623 ret = reg_node(pRExC_state, GPOS);
2624 RExC_seen |= REG_SEEN_GPOS;
a0d0e21e 2625 *flagp |= SIMPLE;
830247a4 2626 nextchar(pRExC_state);
a0d0e21e
LW
2627 break;
2628 case 'Z':
830247a4 2629 ret = reg_node(pRExC_state, SEOL);
a0d0e21e 2630 *flagp |= SIMPLE;
830247a4 2631 nextchar(pRExC_state);
a0d0e21e 2632 break;
b85d18e9 2633 case 'z':
830247a4 2634 ret = reg_node(pRExC_state, EOS);
b85d18e9 2635 *flagp |= SIMPLE;
830247a4
IZ
2636 RExC_seen_zerolen++; /* Do not optimize RE away */
2637 nextchar(pRExC_state);
b85d18e9 2638 break;
4a2d328f 2639 case 'C':
830247a4 2640 ret = reg_node(pRExC_state, SANY);
a0ed51b3 2641 *flagp |= HASWIDTH|SIMPLE;
830247a4 2642 nextchar(pRExC_state);
a0ed51b3
LW
2643 break;
2644 case 'X':
830247a4 2645 ret = reg_node(pRExC_state, CLUMP);
a0ed51b3 2646 *flagp |= HASWIDTH;
830247a4 2647 nextchar(pRExC_state);
a0ed51b3 2648 if (UTF && !PL_utf8_mark)
dfe13c55 2649 is_utf8_mark((U8*)"~"); /* preload table */
a0ed51b3 2650 break;
a0d0e21e 2651 case 'w':
ffc61ed2 2652 ret = reg_node(pRExC_state, LOC ? ALNUML : ALNUM);
a0d0e21e 2653 *flagp |= HASWIDTH|SIMPLE;
830247a4 2654 nextchar(pRExC_state);
a0ed51b3 2655 if (UTF && !PL_utf8_alnum)
dfe13c55 2656 is_utf8_alnum((U8*)"a"); /* preload table */
a0d0e21e
LW
2657 break;
2658 case 'W':
ffc61ed2 2659 ret = reg_node(pRExC_state, LOC ? NALNUML : NALNUM);
a0d0e21e 2660 *flagp |= HASWIDTH|SIMPLE;
830247a4 2661 nextchar(pRExC_state);
a0ed51b3 2662 if (UTF && !PL_utf8_alnum)
dfe13c55 2663 is_utf8_alnum((U8*)"a"); /* preload table */
a0d0e21e
LW
2664 break;
2665 case 'b':
830247a4
IZ
2666 RExC_seen_zerolen++;
2667 RExC_seen |= REG_SEEN_LOOKBEHIND;
ffc61ed2 2668 ret = reg_node(pRExC_state, LOC ? BOUNDL : BOUND);
a0d0e21e 2669 *flagp |= SIMPLE;
830247a4 2670 nextchar(pRExC_state);
a0ed51b3 2671 if (UTF && !PL_utf8_alnum)
dfe13c55 2672 is_utf8_alnum((U8*)"a"); /* preload table */
a0d0e21e
LW
2673 break;
2674 case 'B':
830247a4
IZ
2675 RExC_seen_zerolen++;
2676 RExC_seen |= REG_SEEN_LOOKBEHIND;
ffc61ed2 2677 ret = reg_node(pRExC_state, LOC ? NBOUNDL : NBOUND);
a0d0e21e 2678 *flagp |= SIMPLE;
830247a4 2679 nextchar(pRExC_state);
a0ed51b3 2680 if (UTF && !PL_utf8_alnum)
dfe13c55 2681 is_utf8_alnum((U8*)"a"); /* preload table */
a0d0e21e
LW
2682 break;
2683 case 's':
ffc61ed2 2684 ret = reg_node(pRExC_state, LOC ? SPACEL : SPACE);
a0d0e21e 2685 *flagp |= HASWIDTH|SIMPLE;
830247a4 2686 nextchar(pRExC_state);
a0ed51b3 2687 if (UTF && !PL_utf8_space)
dfe13c55 2688 is_utf8_space((U8*)" "); /* preload table */
a0d0e21e
LW
2689 break;
2690 case 'S':
ffc61ed2 2691 ret = reg_node(pRExC_state, LOC ? NSPACEL : NSPACE);
a0d0e21e 2692 *flagp |= HASWIDTH|SIMPLE;
830247a4 2693 nextchar(pRExC_state);
a0ed51b3 2694 if (UTF && !PL_utf8_space)
dfe13c55 2695 is_utf8_space((U8*)" "); /* preload table */
a0d0e21e
LW
2696 break;
2697 case 'd':
ffc61ed2 2698 ret = reg_node(pRExC_state, DIGIT);
a0d0e21e 2699 *flagp |= HASWIDTH|SIMPLE;
830247a4 2700 nextchar(pRExC_state);
a0ed51b3 2701 if (UTF && !PL_utf8_digit)
dfe13c55 2702 is_utf8_digit((U8*)"1"); /* preload table */
a0d0e21e
LW
2703 break;
2704 case 'D':
ffc61ed2 2705 ret = reg_node(pRExC_state, NDIGIT);
a0d0e21e 2706 *flagp |= HASWIDTH|SIMPLE;
830247a4 2707 nextchar(pRExC_state);
a0ed51b3 2708 if (UTF && !PL_utf8_digit)
dfe13c55 2709 is_utf8_digit((U8*)"1"); /* preload table */
a0d0e21e 2710 break;
a14b48bc
LW
2711 case 'p':
2712 case 'P':
2713 { /* a lovely hack--pretend we saw [\pX] instead */
830247a4 2714 char* oldregxend = RExC_end;
a14b48bc 2715
830247a4
IZ
2716 if (RExC_parse[1] == '{') {
2717 RExC_end = strchr(RExC_parse, '}');
2718 if (!RExC_end) {
2719 RExC_parse += 2;
2720 RExC_end = oldregxend;
b45f050a
JF
2721 vFAIL("Missing right brace on \\p{}");
2722 }
830247a4 2723 RExC_end++;
a14b48bc
LW
2724 }
2725 else
830247a4
IZ
2726 RExC_end = RExC_parse + 2;
2727 RExC_parse--;
a14b48bc 2728
ffc61ed2 2729 ret = regclass(pRExC_state);
a14b48bc 2730
830247a4
IZ
2731 RExC_end = oldregxend;
2732 RExC_parse--;
2733 nextchar(pRExC_state);
a14b48bc
LW
2734 *flagp |= HASWIDTH|SIMPLE;
2735 }
2736 break;
a0d0e21e
LW
2737 case 'n':
2738 case 'r':
2739 case 't':
2740 case 'f':
2741 case 'e':
2742 case 'a':
2743 case 'x':
2744 case 'c':
2745 case '0':
2746 goto defchar;
2747 case '1': case '2': case '3': case '4':
2748 case '5': case '6': case '7': case '8': case '9':
2749 {
830247a4 2750 I32 num = atoi(RExC_parse);
a0d0e21e 2751
830247a4 2752 if (num > 9 && num >= RExC_npar)
a0d0e21e
LW
2753 goto defchar;
2754 else {
830247a4
IZ
2755 while (isDIGIT(*RExC_parse))
2756 RExC_parse++;
b45f050a 2757
830247a4 2758 if (!SIZE_ONLY && num > RExC_rx->nparens)
9baa0206 2759 vFAIL("Reference to nonexistent group");
830247a4
IZ
2760 RExC_sawback = 1;
2761 ret = reganode(pRExC_state, FOLD
a0ed51b3 2762 ? (LOC ? REFFL : REFF)
c8756f30 2763 : REF, num);
a0d0e21e 2764 *flagp |= HASWIDTH;
830247a4
IZ
2765 RExC_parse--;
2766 nextchar(pRExC_state);
a0d0e21e
LW
2767 }
2768 }
2769 break;
2770 case '\0':
830247a4 2771 if (RExC_parse >= RExC_end)
b45f050a 2772 FAIL("Trailing \\");
a0d0e21e
LW
2773 /* FALL THROUGH */
2774 default:
c9f97d15
IZ
2775 /* Do not generate `unrecognized' warnings here, we fall
2776 back into the quick-grab loop below */
a0d0e21e
LW
2777 goto defchar;
2778 }
2779 break;
4633a7c4
LW
2780
2781 case '#':
830247a4
IZ
2782 if (RExC_flags16 & PMf_EXTENDED) {
2783 while (RExC_parse < RExC_end && *RExC_parse != '\n') RExC_parse++;
2784 if (RExC_parse < RExC_end)
4633a7c4
LW
2785 goto tryagain;
2786 }
2787 /* FALL THROUGH */
2788
a0d0e21e 2789 default: {
ba210ebe 2790 register STRLEN len;
a0ed51b3 2791 register UV ender;
a0d0e21e 2792 register char *p;
c277df42 2793 char *oldp, *s;
ba210ebe 2794 STRLEN numlen;
a0d0e21e 2795
830247a4 2796 RExC_parse++;
a0d0e21e
LW
2797
2798 defchar:
830247a4 2799 ret = reg_node(pRExC_state, FOLD
a0ed51b3 2800 ? (LOC ? EXACTFL : EXACTF)
bbce6d69 2801 : EXACT);
cd439c50 2802 s = STRING(ret);
830247a4
IZ
2803 for (len = 0, p = RExC_parse - 1;
2804 len < 127 && p < RExC_end;
a0d0e21e
LW
2805 len++)
2806 {
2807 oldp = p;
5b5a24f7 2808
830247a4
IZ
2809 if (RExC_flags16 & PMf_EXTENDED)
2810 p = regwhite(p, RExC_end);
a0d0e21e
LW
2811 switch (*p) {
2812 case '^':
2813 case '$':
2814 case '.':
2815 case '[':
2816 case '(':
2817 case ')':
2818 case '|':
2819 goto loopdone;
2820 case '\\':
2821 switch (*++p) {
2822 case 'A':
2823 case 'G':
2824 case 'Z':
b85d18e9 2825 case 'z':
a0d0e21e
LW
2826 case 'w':
2827 case 'W':
2828 case 'b':
2829 case 'B':
2830 case 's':
2831 case 'S':
2832 case 'd':
2833 case 'D':
a14b48bc
LW
2834 case 'p':
2835 case 'P':
a0d0e21e
LW
2836 --p;
2837 goto loopdone;
2838 case 'n':
2839 ender = '\n';
2840 p++;
a687059c 2841 break;
a0d0e21e
LW
2842 case 'r':
2843 ender = '\r';
2844 p++;
a687059c 2845 break;
a0d0e21e
LW
2846 case 't':
2847 ender = '\t';
2848 p++;
a687059c 2849 break;
a0d0e21e
LW
2850 case 'f':
2851 ender = '\f';
2852 p++;
a687059c 2853 break;
a0d0e21e 2854 case 'e':
0407a77b
GS
2855#ifdef ASCIIish
2856 ender = '\033';
2857#else
2858 ender = '\047';
2859#endif
a0d0e21e 2860 p++;
a687059c 2861 break;
a0d0e21e 2862 case 'a':
0407a77b
GS
2863#ifdef ASCIIish
2864 ender = '\007';
2865#else
2866 ender = '\057';
2867#endif
a0d0e21e 2868 p++;
a687059c 2869 break;
a0d0e21e 2870 case 'x':
a0ed51b3
LW
2871 if (*++p == '{') {
2872 char* e = strchr(p, '}');
b81d288d 2873
b45f050a 2874 if (!e) {
830247a4 2875 RExC_parse = p + 1;
b45f050a
JF
2876 vFAIL("Missing right brace on \\x{}");
2877 }
de5f0749 2878 else {
b21ed0a9 2879 numlen = 1; /* allow underscores */
de35ba6f 2880 ender = (UV)scan_hex(p + 1, e - p - 1, &numlen);
b21ed0a9
GS
2881 /* numlen is generous */
2882 if (numlen + len >= 127) {
a0ed51b3
LW
2883 p--;
2884 goto loopdone;
2885 }
2886 p = e + 1;
2887 }
a0ed51b3
LW
2888 }
2889 else {
b21ed0a9 2890 numlen = 0; /* disallow underscores */
dff6d3cd 2891 ender = (UV)scan_hex(p, 2, &numlen);
a0ed51b3
LW
2892 p += numlen;
2893 }
a687059c 2894 break;
a0d0e21e
LW
2895 case 'c':
2896 p++;
bbce6d69 2897 ender = UCHARAT(p++);
2898 ender = toCTRL(ender);
a687059c 2899 break;
a0d0e21e
LW
2900 case '0': case '1': case '2': case '3':case '4':
2901 case '5': case '6': case '7': case '8':case '9':
2902 if (*p == '0' ||
830247a4 2903 (isDIGIT(p[1]) && atoi(p) >= RExC_npar) ) {
b21ed0a9 2904 numlen = 0; /* disallow underscores */
dff6d3cd 2905 ender = (UV)scan_oct(p, 3, &numlen);
a0d0e21e
LW
2906 p += numlen;
2907 }
2908 else {
2909 --p;
2910 goto loopdone;
a687059c
LW
2911 }
2912 break;
a0d0e21e 2913 case '\0':
830247a4 2914 if (p >= RExC_end)
b45f050a 2915 FAIL("Trailing \\");
a687059c 2916 /* FALL THROUGH */
a0d0e21e 2917 default:
e476b1b5 2918 if (!SIZE_ONLY && ckWARN(WARN_REGEXP) && isALPHA(*p))
b45f050a 2919 vWARN2(p +1, "Unrecognized escape \\%c passed through", *p);
a0ed51b3 2920 goto normal_default;
a0d0e21e
LW
2921 }
2922 break;
a687059c 2923 default:
a0ed51b3 2924 normal_default:
fd400ab9 2925 if (UTF8_IS_START(*p) && UTF) {
830247a4 2926 ender = utf8_to_uv((U8*)p, RExC_end - p,
ba210ebe 2927 &numlen, 0);
a0ed51b3
LW
2928 p += numlen;
2929 }
2930 else
2931 ender = *p++;
a0d0e21e 2932 break;
a687059c 2933 }
830247a4
IZ
2934 if (RExC_flags16 & PMf_EXTENDED)
2935 p = regwhite(p, RExC_end);
a0ed51b3
LW
2936 if (UTF && FOLD) {
2937 if (LOC)
2938 ender = toLOWER_LC_uni(ender);
2939 else
2940 ender = toLOWER_uni(ender);
2941 }
a0d0e21e
LW
2942 if (ISMULT2(p)) { /* Back off on ?+*. */
2943 if (len)
2944 p = oldp;
fd400ab9
JH
2945 /* ender is a Unicode value so it can be > 0xff --
2946 * in other words, do not use UTF8_IS_CONTINUED(). */
a0ed51b3 2947 else if (ender >= 0x80 && UTF) {
830247a4 2948 reguni(pRExC_state, ender, s, &numlen);
a0ed51b3
LW
2949 s += numlen;
2950 len += numlen;
2951 }
a0d0e21e
LW
2952 else {
2953 len++;
cd439c50 2954 REGC(ender, s++);
a0d0e21e
LW
2955 }
2956 break;
a687059c 2957 }
fd400ab9
JH
2958 /* ender is a Unicode value so it can be > 0xff --
2959 * in other words, do not use UTF8_IS_CONTINUED(). */
a0ed51b3 2960 if (ender >= 0x80 && UTF) {
830247a4 2961 reguni(pRExC_state, ender, s, &numlen);
a0ed51b3
LW
2962 s += numlen;
2963 len += numlen - 1;
2964 }
2965 else
cd439c50 2966 REGC(ender, s++);
a0d0e21e
LW
2967 }
2968 loopdone:
830247a4
IZ
2969 RExC_parse = p - 1;
2970 nextchar(pRExC_state);
793db0cb
JH
2971 {
2972 /* len is STRLEN which is unsigned, need to copy to signed */
2973 IV iv = len;
2974 if (iv < 0)
2975 vFAIL("Internal disaster");
2976 }
a0d0e21e
LW
2977 if (len > 0)
2978 *flagp |= HASWIDTH;
2979 if (len == 1)
2980 *flagp |= SIMPLE;
c277df42 2981 if (!SIZE_ONLY)
cd439c50
IZ
2982 STR_LEN(ret) = len;
2983 if (SIZE_ONLY)
830247a4 2984 RExC_size += STR_SZ(len);
cd439c50 2985 else
830247a4 2986 RExC_emit += STR_SZ(len);
a687059c 2987 }
a0d0e21e
LW
2988 break;
2989 }
a687059c 2990
a0d0e21e 2991 return(ret);
a687059c
LW
2992}
2993
873ef191 2994STATIC char *
cea2e8a9 2995S_regwhite(pTHX_ char *p, char *e)
5b5a24f7
CS
2996{
2997 while (p < e) {
2998 if (isSPACE(*p))
2999 ++p;
3000 else if (*p == '#') {
3001 do {
3002 p++;
3003 } while (p < e && *p != '\n');
3004 }
3005 else
3006 break;
3007 }
3008 return p;
3009}
3010
b8c5462f
JH
3011/* Parse POSIX character classes: [[:foo:]], [[=foo=]], [[.foo.]].
3012 Character classes ([:foo:]) can also be negated ([:^foo:]).
3013 Returns a named class id (ANYOF_XXX) if successful, -1 otherwise.
3014 Equivalence classes ([=foo=]) and composites ([.foo.]) are parsed,
3015 but trigger warnings because they are currently unimplemented. */
3016STATIC I32
830247a4 3017S_regpposixcc(pTHX_ RExC_state_t *pRExC_state, I32 value)
620e46c5
JH
3018{
3019 char *posixcc = 0;
936ed897 3020 I32 namedclass = OOB_NAMEDCLASS;
620e46c5 3021
830247a4 3022 if (value == '[' && RExC_parse + 1 < RExC_end &&
620e46c5 3023 /* I smell either [: or [= or [. -- POSIX has been here, right? */
830247a4
IZ
3024 (*RExC_parse == ':' ||
3025 *RExC_parse == '=' ||
3026 *RExC_parse == '.')) {
3027 char c = *RExC_parse;
3028 char* s = RExC_parse++;
b81d288d 3029
830247a4
IZ
3030 while (RExC_parse < RExC_end && *RExC_parse != c)
3031 RExC_parse++;
3032 if (RExC_parse == RExC_end)
620e46c5 3033 /* Grandfather lone [:, [=, [. */
830247a4 3034 RExC_parse = s;
620e46c5 3035 else {
830247a4 3036 char* t = RExC_parse++; /* skip over the c */
b8c5462f 3037
830247a4
IZ
3038 if (*RExC_parse == ']') {
3039 RExC_parse++; /* skip over the ending ] */
b8c5462f
JH
3040 posixcc = s + 1;
3041 if (*s == ':') {
3042 I32 complement = *posixcc == '^' ? *posixcc++ : 0;
3043 I32 skip = 5; /* the most common skip */
3044
3045 switch (*posixcc) {
3046 case 'a':
3047 if (strnEQ(posixcc, "alnum", 5))
3048 namedclass =
3049 complement ? ANYOF_NALNUMC : ANYOF_ALNUMC;
3050 else if (strnEQ(posixcc, "alpha", 5))
3051 namedclass =
3052 complement ? ANYOF_NALPHA : ANYOF_ALPHA;
3053 else if (strnEQ(posixcc, "ascii", 5))
3054 namedclass =
3055 complement ? ANYOF_NASCII : ANYOF_ASCII;
3056 break;
aaa51d5e
JF
3057 case 'b':
3058 if (strnEQ(posixcc, "blank", 5))
3059 namedclass =
3060 complement ? ANYOF_NBLANK : ANYOF_BLANK;
3061 break;
b8c5462f
JH
3062 case 'c':
3063 if (strnEQ(posixcc, "cntrl", 5))
3064 namedclass =
3065 complement ? ANYOF_NCNTRL : ANYOF_CNTRL;
3066 break;
3067 case 'd':
3068 if (strnEQ(posixcc, "digit", 5))
3069 namedclass =
3070 complement ? ANYOF_NDIGIT : ANYOF_DIGIT;
3071 break;
3072 case 'g':
3073 if (strnEQ(posixcc, "graph", 5))
3074 namedclass =
3075 complement ? ANYOF_NGRAPH : ANYOF_GRAPH;
3076 break;
3077 case 'l':
3078 if (strnEQ(posixcc, "lower", 5))
3079 namedclass =
3080 complement ? ANYOF_NLOWER : ANYOF_LOWER;
3081 break;
3082 case 'p':
3083 if (strnEQ(posixcc, "print", 5))
3084 namedclass =
3085 complement ? ANYOF_NPRINT : ANYOF_PRINT;
3086 else if (strnEQ(posixcc, "punct", 5))
3087 namedclass =
3088 complement ? ANYOF_NPUNCT : ANYOF_PUNCT;
3089 break;
3090 case 's':
3091 if (strnEQ(posixcc, "space", 5))
3092 namedclass =
aaa51d5e 3093 complement ? ANYOF_NPSXSPC : ANYOF_PSXSPC;
cc4319de 3094 break;
b8c5462f
JH
3095 case 'u':
3096 if (strnEQ(posixcc, "upper", 5))
3097 namedclass =
3098 complement ? ANYOF_NUPPER : ANYOF_UPPER;
3099 break;
3100 case 'w': /* this is not POSIX, this is the Perl \w */
3101 if (strnEQ(posixcc, "word", 4)) {
3102 namedclass =
3103 complement ? ANYOF_NALNUM : ANYOF_ALNUM;
3104 skip = 4;
3105 }
3106 break;
3107 case 'x':
3108 if (strnEQ(posixcc, "xdigit", 6)) {
3109 namedclass =
3110 complement ? ANYOF_NXDIGIT : ANYOF_XDIGIT;
3111 skip = 6;
3112 }
3113 break;
3114 }
ac561586
JH
3115 if (namedclass == OOB_NAMEDCLASS ||
3116 posixcc[skip] != ':' ||
3117 posixcc[skip+1] != ']')
b45f050a
JF
3118 {
3119 Simple_vFAIL3("POSIX class [:%.*s:] unknown",
3120 t - s - 1, s + 1);
3121 }
3122 } else if (!SIZE_ONLY) {
b8c5462f 3123 /* [[=foo=]] and [[.foo.]] are still future. */
b45f050a 3124
830247a4 3125 /* adjust RExC_parse so the warning shows after
b45f050a 3126 the class closes */
830247a4
IZ
3127 while (*RExC_parse && *RExC_parse != ']')
3128 RExC_parse++;
b45f050a
JF
3129 Simple_vFAIL3("POSIX syntax [%c %c] is reserved for future extensions", c, c);
3130 }
b8c5462f
JH
3131 } else {
3132 /* Maternal grandfather:
3133 * "[:" ending in ":" but not in ":]" */
830247a4 3134 RExC_parse = s;
767d463e 3135 }
620e46c5
JH
3136 }
3137 }
3138
b8c5462f
JH
3139 return namedclass;
3140}
3141
3142STATIC void
830247a4 3143S_checkposixcc(pTHX_ RExC_state_t *pRExC_state)
b8c5462f 3144{
e476b1b5 3145 if (!SIZE_ONLY && ckWARN(WARN_REGEXP) &&
830247a4
IZ
3146 (*RExC_parse == ':' ||
3147 *RExC_parse == '=' ||
3148 *RExC_parse == '.')) {
3149 char *s = RExC_parse;
93733859 3150 char c = *s++;
b8c5462f
JH
3151
3152 while(*s && isALNUM(*s))
3153 s++;
3154 if (*s && c == *s && s[1] == ']') {
b45f050a
JF
3155 vWARN3(s+2, "POSIX syntax [%c %c] belongs inside character classes", c, c);
3156
3157 /* [[=foo=]] and [[.foo.]] are still future. */
b8c5462f 3158 if (c == '=' || c == '.')
b45f050a 3159 {
830247a4 3160 /* adjust RExC_parse so the error shows after
b45f050a 3161 the class closes */
830247a4 3162 while (*RExC_parse && *RExC_parse++ != ']')
b45f050a
JF
3163 ;
3164 Simple_vFAIL3("POSIX syntax [%c %c] is reserved for future extensions", c, c);
3165 }
b8c5462f
JH
3166 }
3167 }
620e46c5
JH
3168}
3169
76e3520e 3170STATIC regnode *
830247a4 3171S_regclass(pTHX_ RExC_state_t *pRExC_state)
a687059c 3172{
ffc61ed2
JH
3173 register UV value;
3174 register IV lastvalue = OOB_UNICODE;
3175 register IV range = 0;
c277df42 3176 register regnode *ret;
ba210ebe 3177 STRLEN numlen;
ffc61ed2 3178 IV namedclass;
73b437c8 3179 char *rangebegin;
936ed897 3180 bool need_class = 0;
ffc61ed2
JH
3181 SV *listsv;
3182 register char *e;
3183 UV n;
1aa99e6b 3184 bool dont_optimize_invert = FALSE;
ffc61ed2
JH
3185
3186 ret = reganode(pRExC_state, ANYOF, 0);
3187
3188 if (!SIZE_ONLY)
3189 ANYOF_FLAGS(ret) = 0;
3190
3191 if (*RExC_parse == '^') { /* Complement of range. */
3192 RExC_naughty++;
3193 RExC_parse++;
3194 if (!SIZE_ONLY)
3195 ANYOF_FLAGS(ret) |= ANYOF_INVERT;
3196 }
a0d0e21e 3197
936ed897 3198 if (SIZE_ONLY)
830247a4 3199 RExC_size += ANYOF_SKIP;
936ed897 3200 else {
830247a4 3201 RExC_emit += ANYOF_SKIP;
936ed897
IZ
3202 if (FOLD)
3203 ANYOF_FLAGS(ret) |= ANYOF_FOLD;
3204 if (LOC)
3205 ANYOF_FLAGS(ret) |= ANYOF_LOCALE;
ffc61ed2
JH
3206 ANYOF_BITMAP_ZERO(ret);
3207 listsv = newSVpvn("# comment\n", 10);
a0d0e21e 3208 }
b8c5462f 3209
e476b1b5 3210 if (!SIZE_ONLY && ckWARN(WARN_REGEXP))
830247a4 3211 checkposixcc(pRExC_state);
b8c5462f 3212
830247a4 3213 if (*RExC_parse == ']' || *RExC_parse == '-')
ffc61ed2
JH
3214 goto charclassloop; /* allow 1st char to be ] or - */
3215
830247a4 3216 while (RExC_parse < RExC_end && *RExC_parse != ']') {
ffc61ed2
JH
3217
3218 charclassloop:
3219
3220 namedclass = OOB_NAMEDCLASS; /* initialize as illegal */
3221
73b437c8 3222 if (!range)
830247a4 3223 rangebegin = RExC_parse;
ffc61ed2
JH
3224 if (UTF) {
3225 value = utf8_to_uv((U8*)RExC_parse,
3226 RExC_end - RExC_parse,
3227 &numlen, 0);
3228 RExC_parse += numlen;
3229 }
3230 else
3231 value = UCHARAT(RExC_parse++);
620e46c5 3232 if (value == '[')
830247a4 3233 namedclass = regpposixcc(pRExC_state, value);
620e46c5 3234 else if (value == '\\') {
ffc61ed2
JH
3235 if (UTF) {
3236 value = utf8_to_uv((U8*)RExC_parse,
3237 RExC_end - RExC_parse,
3238 &numlen, 0);
3239 RExC_parse += numlen;
3240 }
3241 else
3242 value = UCHARAT(RExC_parse++);
470c3474 3243 /* Some compilers cannot handle switching on 64-bit integer
ffc61ed2 3244 * values, therefore value cannot be an UV. Yes, this will
e2962f66
JH
3245 * be a problem later if we want switch on Unicode.
3246 * A similar issue a little bit later when switching on
3247 * namedclass. --jhi */
ffc61ed2 3248 switch ((I32)value) {
b8c5462f
JH
3249 case 'w': namedclass = ANYOF_ALNUM; break;
3250 case 'W': namedclass = ANYOF_NALNUM; break;
3251 case 's': namedclass = ANYOF_SPACE; break;
3252 case 'S': namedclass = ANYOF_NSPACE; break;
3253 case 'd': namedclass = ANYOF_DIGIT; break;
3254 case 'D': namedclass = ANYOF_NDIGIT; break;
ffc61ed2
JH
3255 case 'p':
3256 case 'P':
3257 if (*RExC_parse == '{') {
3258 e = strchr(RExC_parse++, '}');
3259 if (!e)
3260 vFAIL("Missing right brace on \\p{}");
3261 n = e - RExC_parse;
3262 }
3263 else {
3264 e = RExC_parse;
3265 n = 1;
3266 }
3267 if (!SIZE_ONLY) {
3268 if (value == 'p')
3269 Perl_sv_catpvf(aTHX_ listsv,
3270 "+utf8::%.*s\n", (int)n, RExC_parse);
3271 else
3272 Perl_sv_catpvf(aTHX_ listsv,
3273 "!utf8::%.*s\n", (int)n, RExC_parse);
3274 }
3275 RExC_parse = e + 1;
3276 ANYOF_FLAGS(ret) |= ANYOF_UNICODE;
3277 continue;
b8c5462f
JH
3278 case 'n': value = '\n'; break;
3279 case 'r': value = '\r'; break;
3280 case 't': value = '\t'; break;
3281 case 'f': value = '\f'; break;
3282 case 'b': value = '\b'; break;
0407a77b 3283#ifdef ASCIIish
b8c5462f
JH
3284 case 'e': value = '\033'; break;
3285 case 'a': value = '\007'; break;
0407a77b
GS
3286#else
3287 case 'e': value = '\047'; break;
3288 case 'a': value = '\057'; break;
3289#endif
b8c5462f 3290 case 'x':
ffc61ed2
JH
3291 if (*RExC_parse == '{') {
3292 e = strchr(RExC_parse++, '}');
b81d288d 3293 if (!e)
ffc61ed2
JH
3294 vFAIL("Missing right brace on \\x{}");
3295 numlen = 1; /* allow underscores */
3296 value = (UV)scan_hex(RExC_parse,
3297 e - RExC_parse,
3298 &numlen);
3299 RExC_parse = e + 1;
3300 }
3301 else {
3302 numlen = 0; /* disallow underscores */
3303 value = (UV)scan_hex(RExC_parse, 2, &numlen);
3304 RExC_parse += numlen;
3305 }
b8c5462f
JH
3306 break;
3307 case 'c':
830247a4 3308 value = UCHARAT(RExC_parse++);
b8c5462f
JH
3309 value = toCTRL(value);
3310 break;
3311 case '0': case '1': case '2': case '3': case '4':
3312 case '5': case '6': case '7': case '8': case '9':
b21ed0a9 3313 numlen = 0; /* disallow underscores */
830247a4
IZ
3314 value = (UV)scan_oct(--RExC_parse, 3, &numlen);
3315 RExC_parse += numlen;
b8c5462f 3316 break;
1028017a 3317 default:
e476b1b5 3318 if (!SIZE_ONLY && ckWARN(WARN_REGEXP) && isALPHA(value))
ffc61ed2
JH
3319 vWARN2(RExC_parse,
3320 "Unrecognized escape \\%c in character class passed through",
3321 (int)value);
1028017a 3322 break;
b8c5462f 3323 }
ffc61ed2
JH
3324 } /* end of \blah */
3325
3326 if (namedclass > OOB_NAMEDCLASS) { /* this is a named class \blah */
3327
3328 if (!SIZE_ONLY && !need_class)
936ed897 3329 ANYOF_CLASS_ZERO(ret);
ffc61ed2 3330
936ed897 3331 need_class = 1;
ffc61ed2
JH
3332
3333 /* a bad range like a-\d, a-[:digit:] ? */
3334 if (range) {
73b437c8 3335 if (!SIZE_ONLY) {
e476b1b5 3336 if (ckWARN(WARN_REGEXP))
830247a4 3337 vWARN4(RExC_parse,
b45f050a 3338 "False [] range \"%*.*s\"",
830247a4
IZ
3339 RExC_parse - rangebegin,
3340 RExC_parse - rangebegin,
b45f050a 3341 rangebegin);
ffc61ed2
JH
3342 if (lastvalue < 256) {
3343 ANYOF_BITMAP_SET(ret, lastvalue);
3344 ANYOF_BITMAP_SET(ret, '-');
3345 }
3346 else {
3347 ANYOF_FLAGS(ret) |= ANYOF_UNICODE;
3348 Perl_sv_catpvf(aTHX_ listsv,
3349 /* 0x002D is Unicode for '-' */
3350 "%04"UVxf"\n002D\n", (UV)lastvalue);
3351 }
b8c5462f 3352 }
ffc61ed2
JH
3353
3354 range = 0; /* this was not a true range */
73b437c8 3355 }
ffc61ed2 3356
73b437c8 3357 if (!SIZE_ONLY) {
e2962f66
JH
3358 /* Possible truncation here but in some 64-bit environments
3359 * the compiler gets heartburn about switch on 64-bit values.
3360 * A similar issue a little earlier when switching on value.
98f323fa 3361 * --jhi */
e2962f66 3362 switch ((I32)namedclass) {
73b437c8
JH
3363 case ANYOF_ALNUM:
3364 if (LOC)
936ed897 3365 ANYOF_CLASS_SET(ret, ANYOF_ALNUM);
73b437c8
JH
3366 else {
3367 for (value = 0; value < 256; value++)
3368 if (isALNUM(value))
936ed897 3369 ANYOF_BITMAP_SET(ret, value);
73b437c8 3370 }
1aa99e6b 3371 dont_optimize_invert = TRUE;
ffc61ed2 3372 Perl_sv_catpvf(aTHX_ listsv, "+utf8::IsWord\n");
73b437c8
JH
3373 break;
3374 case ANYOF_NALNUM:
3375 if (LOC)
936ed897 3376 ANYOF_CLASS_SET(ret, ANYOF_NALNUM);
73b437c8
JH
3377 else {
3378 for (value = 0; value < 256; value++)
3379 if (!isALNUM(value))
936ed897 3380 ANYOF_BITMAP_SET(ret, value);
73b437c8 3381 }
1aa99e6b 3382 dont_optimize_invert = TRUE;
ffc61ed2 3383 Perl_sv_catpvf(aTHX_ listsv, "!utf8::IsWord\n");
73b437c8 3384 break;
ffc61ed2 3385 case ANYOF_ALNUMC:
73b437c8 3386 if (LOC)
ffc61ed2 3387 ANYOF_CLASS_SET(ret, ANYOF_ALNUMC);
73b437c8
JH
3388 else {
3389 for (value = 0; value < 256; value++)
ffc61ed2 3390 if (isALNUMC(value))
936ed897 3391 ANYOF_BITMAP_SET(ret, value);
73b437c8 3392 }
1aa99e6b 3393 dont_optimize_invert = TRUE;
ffc61ed2 3394 Perl_sv_catpvf(aTHX_ listsv, "+utf8::IsAlnum\n");
73b437c8
JH
3395 break;
3396 case ANYOF_NALNUMC:
3397 if (LOC)
936ed897 3398 ANYOF_CLASS_SET(ret, ANYOF_NALNUMC);
73b437c8
JH
3399 else {
3400 for (value = 0; value < 256; value++)
3401 if (!isALNUMC(value))
936ed897 3402 ANYOF_BITMAP_SET(ret, value);
73b437c8 3403 }
1aa99e6b 3404 dont_optimize_invert = TRUE;
ffc61ed2 3405 Perl_sv_catpvf(aTHX_ listsv, "!utf8::IsAlnum\n");
73b437c8
JH
3406 break;
3407 case ANYOF_ALPHA:
3408 if (LOC)
936ed897 3409 ANYOF_CLASS_SET(ret, ANYOF_ALPHA);
73b437c8
JH
3410 else {
3411 for (value = 0; value < 256; value++)
3412 if (isALPHA(value))
936ed897 3413 ANYOF_BITMAP_SET(ret, value);
73b437c8 3414 }
1aa99e6b 3415 dont_optimize_invert = TRUE;
ffc61ed2 3416 Perl_sv_catpvf(aTHX_ listsv, "+utf8::IsAlpha\n");
73b437c8
JH
3417 break;
3418 case ANYOF_NALPHA:
3419 if (LOC)
936ed897 3420 ANYOF_CLASS_SET(ret, ANYOF_NALPHA);
73b437c8
JH
3421 else {
3422 for (value = 0; value < 256; value++)
3423 if (!isALPHA(value))
936ed897 3424 ANYOF_BITMAP_SET(ret, value);
73b437c8 3425 }
1aa99e6b 3426 dont_optimize_invert = TRUE;
ffc61ed2 3427 Perl_sv_catpvf(aTHX_ listsv, "!utf8::IsAlpha\n");
73b437c8
JH
3428 break;
3429 case ANYOF_ASCII:
3430 if (LOC)
936ed897 3431 ANYOF_CLASS_SET(ret, ANYOF_ASCII);
73b437c8 3432 else {
a5993822 3433#ifdef ASCIIish
73b437c8 3434 for (value = 0; value < 128; value++)
936ed897 3435 ANYOF_BITMAP_SET(ret, value);
a5993822
PP
3436#else /* EBCDIC */
3437 for (value = 0; value < 256; value++)
3438 if (isASCII(value))
3439 ANYOF_BITMAP_SET(ret, value);
3440#endif /* EBCDIC */
73b437c8 3441 }
1aa99e6b 3442 dont_optimize_invert = TRUE;
ffc61ed2 3443 Perl_sv_catpvf(aTHX_ listsv, "+utf8::IsASCII\n");
73b437c8
JH
3444 break;
3445 case ANYOF_NASCII:
3446 if (LOC)
936ed897 3447 ANYOF_CLASS_SET(ret, ANYOF_NASCII);
73b437c8 3448 else {
a5993822 3449#ifdef ASCIIish
73b437c8 3450 for (value = 128; value < 256; value++)
936ed897 3451 ANYOF_BITMAP_SET(ret, value);
a5993822
PP
3452#else /* EBCDIC */
3453 for (value = 0; value < 256; value++)
3454 if (!isASCII(value))
3455 ANYOF_BITMAP_SET(ret, value);
3456#endif /* EBCDIC */
73b437c8 3457 }
1aa99e6b 3458 dont_optimize_invert = TRUE;
ffc61ed2 3459 Perl_sv_catpvf(aTHX_ listsv, "!utf8::IsASCII\n");
73b437c8 3460 break;
aaa51d5e
JF
3461 case ANYOF_BLANK:
3462 if (LOC)
3463 ANYOF_CLASS_SET(ret, ANYOF_BLANK);
3464 else {
3465 for (value = 0; value < 256; value++)
3466 if (isBLANK(value))
3467 ANYOF_BITMAP_SET(ret, value);
3468 }
1aa99e6b 3469 dont_optimize_invert = TRUE;
ffc61ed2 3470 Perl_sv_catpvf(aTHX_ listsv, "+utf8::IsBlank\n");
aaa51d5e
JF
3471 break;
3472 case ANYOF_NBLANK:
3473 if (LOC)
3474 ANYOF_CLASS_SET(ret, ANYOF_NBLANK);
3475 else {
3476 for (value = 0; value < 256; value++)
3477 if (!isBLANK(value))
3478 ANYOF_BITMAP_SET(ret, value);
3479 }
1aa99e6b 3480 dont_optimize_invert = TRUE;
ffc61ed2 3481 Perl_sv_catpvf(aTHX_ listsv, "!utf8::IsBlank\n");
aaa51d5e 3482 break;
73b437c8
JH
3483 case ANYOF_CNTRL:
3484 if (LOC)
936ed897 3485 ANYOF_CLASS_SET(ret, ANYOF_CNTRL);
73b437c8
JH
3486 else {
3487 for (value = 0; value < 256; value++)
3488 if (isCNTRL(value))
936ed897 3489 ANYOF_BITMAP_SET(ret, value);
73b437c8 3490 }
1aa99e6b 3491 dont_optimize_invert = TRUE;
ffc61ed2 3492 Perl_sv_catpvf(aTHX_ listsv, "+utf8::IsCntrl\n");
73b437c8
JH
3493 break;
3494 case ANYOF_NCNTRL:
3495 if (LOC)
936ed897 3496 ANYOF_CLASS_SET(ret, ANYOF_NCNTRL);
73b437c8
JH
3497 else {
3498 for (value = 0; value < 256; value++)
3499 if (!isCNTRL(value))
936ed897 3500 ANYOF_BITMAP_SET(ret, value);
73b437c8 3501 }
1aa99e6b 3502 dont_optimize_invert = TRUE;
ffc61ed2
JH
3503 Perl_sv_catpvf(aTHX_ listsv, "!utf8::IsCntrl\n");
3504 break;
3505 case ANYOF_DIGIT:
3506 if (LOC)
3507 ANYOF_CLASS_SET(ret, ANYOF_DIGIT);
3508 else {
3509 /* consecutive digits assumed */
3510 for (value = '0'; value <= '9'; value++)
3511 ANYOF_BITMAP_SET(ret, value);
3512 }
1aa99e6b 3513 dont_optimize_invert = TRUE;
ffc61ed2
JH
3514 Perl_sv_catpvf(aTHX_ listsv, "+utf8::IsDigit\n");
3515 break;
3516 case ANYOF_NDIGIT:
3517 if (LOC)
3518 ANYOF_CLASS_SET(ret, ANYOF_NDIGIT);
3519 else {
3520 /* consecutive digits assumed */
3521 for (value = 0; value < '0'; value++)
3522 ANYOF_BITMAP_SET(ret, value);
3523 for (value = '9' + 1; value < 256; value++)
3524 ANYOF_BITMAP_SET(ret, value);
3525 }
1aa99e6b 3526 dont_optimize_invert = TRUE;
ffc61ed2 3527 Perl_sv_catpvf(aTHX_ listsv, "!utf8::IsDigit\n");
73b437c8
JH
3528 break;
3529 case ANYOF_GRAPH:
3530 if (LOC)
936ed897 3531 ANYOF_CLASS_SET(ret, ANYOF_GRAPH);
73b437c8
JH
3532 else {
3533 for (value = 0; value < 256; value++)
3534 if (isGRAPH(value))
936ed897 3535 ANYOF_BITMAP_SET(ret, value);
73b437c8 3536 }
1aa99e6b 3537 dont_optimize_invert = TRUE;
ffc61ed2 3538 Perl_sv_catpvf(aTHX_ listsv, "+utf8::IsGraph\n");
73b437c8
JH
3539 break;
3540 case ANYOF_NGRAPH:
3541 if (LOC)
936ed897 3542 ANYOF_CLASS_SET(ret, ANYOF_NGRAPH);
73b437c8
JH
3543 else {
3544 for (value = 0; value < 256; value++)
3545 if (!isGRAPH(value))
936ed897 3546 ANYOF_BITMAP_SET(ret, value);
73b437c8 3547 }
1aa99e6b 3548 dont_optimize_invert = TRUE;
ffc61ed2 3549 Perl_sv_catpvf(aTHX_ listsv, "!utf8::IsGraph\n");
73b437c8
JH
3550 break;
3551 case ANYOF_LOWER:
3552 if (LOC)
936ed897 3553 ANYOF_CLASS_SET(ret, ANYOF_LOWER);
73b437c8
JH
3554 else {
3555 for (value = 0; value < 256; value++)
3556 if (isLOWER(value))
936ed897 3557 ANYOF_BITMAP_SET(ret, value);
73b437c8 3558 }
1aa99e6b 3559 dont_optimize_invert = TRUE;
ffc61ed2 3560 Perl_sv_catpvf(aTHX_ listsv, "+utf8::IsLower\n");
73b437c8
JH
3561 break;
3562 case ANYOF_NLOWER:
3563 if (LOC)
936ed897 3564 ANYOF_CLASS_SET(ret, ANYOF_NLOWER);
73b437c8
JH
3565 else {
3566 for (value = 0; value < 256; value++)
3567 if (!isLOWER(value))
936ed897 3568 ANYOF_BITMAP_SET(ret, value);
73b437c8 3569 }
1aa99e6b 3570 dont_optimize_invert = TRUE;
ffc61ed2 3571 Perl_sv_catpvf(aTHX_ listsv, "!utf8::IsLower\n");
73b437c8
JH
3572 break;
3573 case ANYOF_PRINT:
3574 if (LOC)
936ed897 3575 ANYOF_CLASS_SET(ret, ANYOF_PRINT);
73b437c8
JH
3576 else {
3577 for (value = 0; value < 256; value++)
3578 if (isPRINT(value))
936ed897 3579 ANYOF_BITMAP_SET(ret, value);
73b437c8 3580 }
1aa99e6b 3581 dont_optimize_invert = TRUE;
ffc61ed2 3582 Perl_sv_catpvf(aTHX_ listsv, "+utf8::IsPrint\n");
73b437c8
JH
3583 break;
3584 case ANYOF_NPRINT:
3585 if (LOC)
936ed897 3586 ANYOF_CLASS_SET(ret, ANYOF_NPRINT);
73b437c8
JH
3587 else {
3588 for (value = 0; value < 256; value++)
3589 if (!isPRINT(value))
936ed897 3590 ANYOF_BITMAP_SET(ret, value);
73b437c8 3591 }
1aa99e6b 3592 dont_optimize_invert = TRUE;
ffc61ed2 3593 Perl_sv_catpvf(aTHX_ listsv, "!utf8::IsPrint\n");
73b437c8 3594 break;
aaa51d5e
JF
3595 case ANYOF_PSXSPC:
3596 if (LOC)
3597 ANYOF_CLASS_SET(ret, ANYOF_PSXSPC);
3598 else {
3599 for (value = 0; value < 256; value++)
3600 if (isPSXSPC(value))
3601 ANYOF_BITMAP_SET(ret, value);
3602 }
1aa99e6b 3603 dont_optimize_invert = TRUE;
ffc61ed2 3604 Perl_sv_catpvf(aTHX_ listsv, "+utf8::IsSpace\n");
aaa51d5e
JF
3605 break;
3606 case ANYOF_NPSXSPC:
3607 if (LOC)
3608 ANYOF_CLASS_SET(ret, ANYOF_NPSXSPC);
3609 else {
3610 for (value = 0; value < 256; value++)
3611 if (!isPSXSPC(value))
3612 ANYOF_BITMAP_SET(ret, value);
3613 }
1aa99e6b 3614 dont_optimize_invert = TRUE;
ffc61ed2 3615 Perl_sv_catpvf(aTHX_ listsv, "!utf8::IsSpace\n");
aaa51d5e 3616 break;
73b437c8
JH
3617 case ANYOF_PUNCT:
3618 if (LOC)
936ed897 3619 ANYOF_CLASS_SET(ret, ANYOF_PUNCT);
73b437c8
JH
3620 else {
3621 for (value = 0; value < 256; value++)
3622 if (isPUNCT(value))
936ed897 3623 ANYOF_BITMAP_SET(ret, value);
73b437c8 3624 }
1aa99e6b 3625 dont_optimize_invert = TRUE;
ffc61ed2 3626 Perl_sv_catpvf(aTHX_ listsv, "+utf8::IsPunct\n");
73b437c8
JH
3627 break;
3628 case ANYOF_NPUNCT:
3629 if (LOC)
936ed897 3630 ANYOF_CLASS_SET(ret, ANYOF_NPUNCT);
73b437c8
JH
3631 else {
3632 for (value = 0; value < 256; value++)
3633 if (!isPUNCT(value))
936ed897 3634 ANYOF_BITMAP_SET(ret, value);
73b437c8 3635 }
1aa99e6b 3636 dont_optimize_invert = TRUE;
ffc61ed2
JH
3637 Perl_sv_catpvf(aTHX_ listsv, "!utf8::IsPunct\n");
3638 break;
3639 case ANYOF_SPACE:
3640 if (LOC)
3641 ANYOF_CLASS_SET(ret, ANYOF_SPACE);
3642 else {
3643 for (value = 0; value < 256; value++)
3644 if (isSPACE(value))
3645 ANYOF_BITMAP_SET(ret, value);
3646 }
1aa99e6b 3647 dont_optimize_invert = TRUE;
ffc61ed2
JH
3648 Perl_sv_catpvf(aTHX_ listsv, "+utf8::IsSpacePerl\n");
3649 break;
3650 case ANYOF_NSPACE:
3651 if (LOC)
3652 ANYOF_CLASS_SET(ret, ANYOF_NSPACE);
3653 else {
3654 for (value = 0; value < 256; value++)
3655 if (!isSPACE(value))
3656 ANYOF_BITMAP_SET(ret, value);
3657 }
1aa99e6b 3658 dont_optimize_invert = TRUE;
ffc61ed2 3659 Perl_sv_catpvf(aTHX_ listsv, "!utf8::IsSpacePerl\n");
73b437c8
JH
3660 break;
3661 case ANYOF_UPPER:
3662 if (LOC)
936ed897 3663 ANYOF_CLASS_SET(ret, ANYOF_UPPER);
73b437c8
JH
3664 else {
3665 for (value = 0; value < 256; value++)
3666 if (isUPPER(value))
936ed897 3667 ANYOF_BITMAP_SET(ret, value);
73b437c8 3668 }
1aa99e6b 3669 dont_optimize_invert = TRUE;
ffc61ed2 3670 Perl_sv_catpvf(aTHX_ listsv, "+utf8::IsUpper\n");
73b437c8
JH
3671 break;
3672 case ANYOF_NUPPER:
3673 if (LOC)
936ed897 3674 ANYOF_CLASS_SET(ret, ANYOF_NUPPER);
73b437c8
JH
3675 else {
3676 for (value = 0; value < 256; value++)
3677 if (!isUPPER(value))
936ed897 3678 ANYOF_BITMAP_SET(ret, value);
73b437c8 3679 }
1aa99e6b 3680 dont_optimize_invert = TRUE;
ffc61ed2 3681 Perl_sv_catpvf(aTHX_ listsv, "!utf8::IsUpper\n");
73b437c8
JH
3682 break;
3683 case ANYOF_XDIGIT:
3684 if (LOC)
936ed897 3685 ANYOF_CLASS_SET(ret, ANYOF_XDIGIT);
73b437c8
JH
3686 else {
3687 for (value = 0; value < 256; value++)
3688 if (isXDIGIT(value))
936ed897 3689 ANYOF_BITMAP_SET(ret, value);
73b437c8 3690 }
1aa99e6b 3691 dont_optimize_invert = TRUE;
ffc61ed2 3692 Perl_sv_catpvf(aTHX_ listsv, "+utf8::IsXDigit\n");
73b437c8
JH
3693 break;
3694 case ANYOF_NXDIGIT:
3695 if (LOC)
936ed897 3696 ANYOF_CLASS_SET(ret, ANYOF_NXDIGIT);
73b437c8
JH
3697 else {
3698 for (value = 0; value < 256; value++)
3699 if (!isXDIGIT(value))
936ed897 3700 ANYOF_BITMAP_SET(ret, value);
73b437c8 3701 }
1aa99e6b 3702 dont_optimize_invert = TRUE;
ffc61ed2 3703 Perl_sv_catpvf(aTHX_ listsv, "!utf8::IsXDigit\n");
73b437c8
JH
3704 break;
3705 default:
b45f050a 3706 vFAIL("Invalid [::] class");
73b437c8 3707 break;
b8c5462f 3708 }
b8c5462f 3709 if (LOC)
936ed897 3710 ANYOF_FLAGS(ret) |= ANYOF_CLASS;
73b437c8 3711 continue;
a0d0e21e 3712 }
ffc61ed2
JH
3713 } /* end of namedclass \blah */
3714
a0d0e21e 3715 if (range) {
73b437c8 3716 if (lastvalue > value) /* b-a */ {
b45f050a 3717 Simple_vFAIL4("Invalid [] range \"%*.*s\"",
830247a4
IZ
3718 RExC_parse - rangebegin,
3719 RExC_parse - rangebegin,
b45f050a 3720 rangebegin);
73b437c8 3721 }
ffc61ed2 3722 range = 0; /* not a true range */
a0d0e21e
LW
3723 }
3724 else {
ffc61ed2 3725 lastvalue = value; /* save the beginning of the range */
830247a4
IZ
3726 if (*RExC_parse == '-' && RExC_parse+1 < RExC_end &&
3727 RExC_parse[1] != ']') {
3728 RExC_parse++;
ffc61ed2
JH
3729
3730 /* a bad range like \w-, [:word:]- ? */
3731 if (namedclass > OOB_NAMEDCLASS) {
e476b1b5 3732 if (ckWARN(WARN_REGEXP))
830247a4 3733 vWARN4(RExC_parse,
b45f050a 3734 "False [] range \"%*.*s\"",
830247a4
IZ
3735 RExC_parse - rangebegin,
3736 RExC_parse - rangebegin,
b45f050a 3737 rangebegin);
73b437c8 3738 if (!SIZE_ONLY)
936ed897 3739 ANYOF_BITMAP_SET(ret, '-');
73b437c8 3740 } else
ffc61ed2
JH
3741 range = 1; /* yeah, it's a range! */
3742 continue; /* but do it the next time */
a0d0e21e 3743 }
a687059c 3744 }
ffc61ed2 3745
93733859 3746 /* now is the next time */
ae5c130c 3747 if (!SIZE_ONLY) {
ffc61ed2 3748 if (lastvalue < 256 && value < 256) {
93733859 3749#ifndef ASCIIish /* EBCDIC, for example. */
ffc61ed2
JH
3750 if ((isLOWER(lastvalue) && isLOWER(value)) ||
3751 (isUPPER(lastvalue) && isUPPER(value)))
3752 {
3753 IV i;
3754 if (isLOWER(lastvalue)) {
3755 for (i = lastvalue; i <= value; i++)
3756 if (isLOWER(i))
3757 ANYOF_BITMAP_SET(ret, i);
3758 } else {
3759 for (i = lastvalue; i <= value; i++)
3760 if (isUPPER(i))
3761 ANYOF_BITMAP_SET(ret, i);
3762 }
8ada0baa 3763 }
ffc61ed2 3764 else
8ada0baa 3765#endif
ffc61ed2
JH
3766 for ( ; lastvalue <= value; lastvalue++)
3767 ANYOF_BITMAP_SET(ret, lastvalue);
3768 } else {
3769 ANYOF_FLAGS(ret) |= ANYOF_UNICODE;
3770 if (lastvalue < value)
3771 Perl_sv_catpvf(aTHX_ listsv, "%04"UVxf"\t%04"UVxf"\n",
3772 (UV)lastvalue, (UV)value);
3773 else
3774 Perl_sv_catpvf(aTHX_ listsv, "%04"UVxf"\n",
3775 (UV)value);
3776 }
8ada0baa 3777 }
ffc61ed2
JH
3778
3779 range = 0; /* this range (if it was one) is done now */
a0d0e21e 3780 }
ffc61ed2 3781
936ed897
IZ
3782 if (need_class) {
3783 if (SIZE_ONLY)
830247a4 3784 RExC_size += ANYOF_CLASS_ADD_SKIP;
936ed897 3785 else
830247a4 3786 RExC_emit += ANYOF_CLASS_ADD_SKIP;
936ed897 3787 }
ffc61ed2 3788
ae5c130c 3789 /* optimize case-insensitive simple patterns (e.g. /[a-z]/i) */
b8c5462f 3790 if (!SIZE_ONLY &&
ffc61ed2
JH
3791 (ANYOF_FLAGS(ret) &
3792 /* If the only flag is folding (plus possibly inversion). */
3793 (ANYOF_FLAGS_ALL ^ ANYOF_INVERT) == ANYOF_FOLD)) {
a0ed51b3 3794 for (value = 0; value < 256; ++value) {
936ed897 3795 if (ANYOF_BITMAP_TEST(ret, value)) {
ffc61ed2
JH
3796 IV fold = PL_fold[value];
3797
3798 if (fold != value)
3799 ANYOF_BITMAP_SET(ret, fold);
ae5c130c
GS
3800 }
3801 }
936ed897 3802 ANYOF_FLAGS(ret) &= ~ANYOF_FOLD;
ae5c130c 3803 }
ffc61ed2 3804
ae5c130c 3805 /* optimize inverted simple patterns (e.g. [^a-z]) */
1aa99e6b 3806 if (!SIZE_ONLY && !dont_optimize_invert &&
ffc61ed2
JH
3807 /* If the only flag is inversion. */
3808 (ANYOF_FLAGS(ret) & ANYOF_FLAGS_ALL) == ANYOF_INVERT) {
b8c5462f 3809 for (value = 0; value < ANYOF_BITMAP_SIZE; ++value)
936ed897 3810 ANYOF_BITMAP(ret)[value] ^= ANYOF_FLAGS_ALL;
1aa99e6b 3811 ANYOF_FLAGS(ret) = ANYOF_UNICODE_ALL;
ae5c130c 3812 }
a0d0e21e 3813
b81d288d 3814 if (!SIZE_ONLY) {
fde631ed 3815 AV *av = newAV();
ffc61ed2
JH
3816 SV *rv;
3817
3818 av_store(av, 0, listsv);
3819 av_store(av, 1, NULL);
3820 rv = newRV_noinc((SV*)av);
19860706 3821 n = add_data(pRExC_state, 1, "s");
830247a4 3822 RExC_rx->data->data[n] = (void*)rv;
ffc61ed2 3823 ARG_SET(ret, n);
a0ed51b3
LW
3824 }
3825
3826 return ret;
3827}
3828
76e3520e 3829STATIC char*
830247a4 3830S_nextchar(pTHX_ RExC_state_t *pRExC_state)
a0d0e21e 3831{
830247a4 3832 char* retval = RExC_parse++;
a0d0e21e 3833
4633a7c4 3834 for (;;) {
830247a4
IZ
3835 if (*RExC_parse == '(' && RExC_parse[1] == '?' &&
3836 RExC_parse[2] == '#') {
3837 while (*RExC_parse && *RExC_parse != ')')
3838 RExC_parse++;
3839 RExC_parse++;
4633a7c4
LW
3840 continue;
3841 }
830247a4
IZ
3842 if (RExC_flags16 & PMf_EXTENDED) {
3843 if (isSPACE(*RExC_parse)) {
3844 RExC_parse++;
748a9306
LW
3845 continue;
3846 }
830247a4
IZ
3847 else if (*RExC_parse == '#') {
3848 while (*RExC_parse && *RExC_parse != '\n')
3849 RExC_parse++;
3850 RExC_parse++;
748a9306
LW
3851 continue;
3852 }
748a9306 3853 }
4633a7c4 3854 return retval;
a0d0e21e 3855 }
a687059c
LW
3856}
3857
3858/*
c277df42 3859- reg_node - emit a node
a0d0e21e 3860*/
76e3520e 3861STATIC regnode * /* Location. */
830247a4 3862S_reg_node(pTHX_ RExC_state_t *pRExC_state, U8 op)
a687059c 3863{
c277df42
IZ
3864 register regnode *ret;
3865 register regnode *ptr;
a687059c 3866
830247a4 3867 ret = RExC_emit;
c277df42 3868 if (SIZE_ONLY) {
830247a4
IZ
3869 SIZE_ALIGN(RExC_size);
3870 RExC_size += 1;
a0d0e21e
LW
3871 return(ret);
3872 }
a687059c 3873
c277df42 3874 NODE_ALIGN_FILL(ret);
a0d0e21e 3875 ptr = ret;
c277df42 3876 FILL_ADVANCE_NODE(ptr, op);
830247a4 3877 RExC_emit = ptr;
a687059c 3878
a0d0e21e 3879 return(ret);
a687059c
LW
3880}
3881
3882/*
a0d0e21e
LW
3883- reganode - emit a node with an argument
3884*/
76e3520e 3885STATIC regnode * /* Location. */
830247a4 3886S_reganode(pTHX_ RExC_state_t *pRExC_state, U8 op, U32 arg)
fe14fcc3 3887{
c277df42
IZ
3888 register regnode *ret;
3889 register regnode *ptr;
fe14fcc3 3890
830247a4 3891 ret = RExC_emit;
c277df42 3892 if (SIZE_ONLY) {
830247a4
IZ
3893 SIZE_ALIGN(RExC_size);
3894 RExC_size += 2;
a0d0e21e
LW
3895 return(ret);
3896 }
fe14fcc3 3897
c277df42 3898 NODE_ALIGN_FILL(ret);
a0d0e21e 3899 ptr = ret;
c277df42 3900 FILL_ADVANCE_NODE_ARG(ptr, op, arg);
830247a4 3901 RExC_emit = ptr;
fe14fcc3 3902
a0d0e21e 3903 return(ret);
fe14fcc3
LW
3904}
3905
3906/*
cd439c50 3907- reguni - emit (if appropriate) a Unicode character
a0ed51b3
LW
3908*/
3909STATIC void
830247a4 3910S_reguni(pTHX_ RExC_state_t *pRExC_state, UV uv, char* s, STRLEN* lenp)
a0ed51b3 3911{
ad391ad9 3912 *lenp = SIZE_ONLY ? UNISKIP(uv) : (uv_to_utf8((U8*)s, uv) - (U8*)s);
a0ed51b3
LW
3913}
3914
3915/*
a0d0e21e
LW
3916- reginsert - insert an operator in front of already-emitted operand
3917*
3918* Means relocating the operand.
3919*/
76e3520e 3920STATIC void
830247a4 3921S_reginsert(pTHX_ RExC_state_t *pRExC_state, U8 op, regnode *opnd)
a687059c 3922{
c277df42
IZ
3923 register regnode *src;
3924 register regnode *dst;
3925 register regnode *place;
3926 register int offset = regarglen[(U8)op];
b81d288d 3927
22c35a8c 3928/* (PL_regkind[(U8)op] == CURLY ? EXTRA_STEP_2ARGS : 0); */
c277df42
IZ
3929
3930 if (SIZE_ONLY) {
830247a4 3931 RExC_size += NODE_STEP_REGNODE + offset;
a0d0e21e
LW
3932 return;
3933 }
a687059c 3934
830247a4
IZ
3935 src = RExC_emit;
3936 RExC_emit += NODE_STEP_REGNODE + offset;
3937 dst = RExC_emit;
a0d0e21e 3938 while (src > opnd)
c277df42 3939 StructCopy(--src, --dst, regnode);
a0d0e21e
LW
3940
3941 place = opnd; /* Op node, where operand used to be. */
c277df42
IZ
3942 src = NEXTOPER(place);
3943 FILL_ADVANCE_NODE(place, op);
3944 Zero(src, offset, regnode);
a687059c
LW
3945}
3946
3947/*
c277df42 3948- regtail - set the next-pointer at the end of a node chain of p to val.
a0d0e21e 3949*/
76e3520e 3950STATIC void
830247a4 3951S_regtail(pTHX_ RExC_state_t *pRExC_state, regnode *p, regnode *val)
a687059c 3952{
c277df42
IZ
3953 register regnode *scan;
3954 register regnode *temp;
a0d0e21e 3955
c277df42 3956 if (SIZE_ONLY)
a0d0e21e
LW
3957 return;
3958
3959 /* Find last node. */
3960 scan = p;
3961 for (;;) {
3962 temp = regnext(scan);
3963 if (temp == NULL)
3964 break;
3965 scan = temp;
3966 }
a687059c 3967
c277df42
IZ
3968 if (reg_off_by_arg[OP(scan)]) {
3969 ARG_SET(scan, val - scan);
a0ed51b3
LW
3970 }
3971 else {
c277df42
IZ
3972 NEXT_OFF(scan) = val - scan;
3973 }
a687059c
LW
3974}
3975
3976/*
a0d0e21e
LW
3977- regoptail - regtail on operand of first argument; nop if operandless
3978*/
76e3520e 3979STATIC void
830247a4 3980S_regoptail(pTHX_ RExC_state_t *pRExC_state, regnode *p, regnode *val)
a687059c 3981{
a0d0e21e 3982 /* "Operandless" and "op != BRANCH" are synonymous in practice. */
c277df42
IZ
3983 if (p == NULL || SIZE_ONLY)
3984 return;
22c35a8c 3985 if (PL_regkind[(U8)OP(p)] == BRANCH) {
830247a4 3986 regtail(pRExC_state, NEXTOPER(p), val);
a0ed51b3 3987 }
22c35a8c 3988 else if ( PL_regkind[(U8)OP(p)] == BRANCHJ) {
830247a4 3989 regtail(pRExC_state, NEXTOPER(NEXTOPER(p)), val);
a0ed51b3
LW
3990 }
3991 else
a0d0e21e 3992 return;
a687059c
LW
3993}
3994
3995/*
3996 - regcurly - a little FSA that accepts {\d+,?\d*}
3997 */
79072805 3998STATIC I32
cea2e8a9 3999S_regcurly(pTHX_ register char *s)
a687059c
LW
4000{
4001 if (*s++ != '{')
4002 return FALSE;
f0fcb552 4003 if (!isDIGIT(*s))
a687059c 4004 return FALSE;
f0fcb552 4005 while (isDIGIT(*s))
a687059c
LW
4006 s++;
4007 if (*s == ',')
4008 s++;
f0fcb552 4009 while (isDIGIT(*s))
a687059c
LW
4010 s++;
4011 if (*s != '}')
4012 return FALSE;
4013 return TRUE;
4014}
4015
a687059c 4016
76e3520e 4017STATIC regnode *
cea2e8a9 4018S_dumpuntil(pTHX_ regnode *start, regnode *node, regnode *last, SV* sv, I32 l)
c277df42 4019{
35ff7856 4020#ifdef DEBUGGING
f248d071 4021 register U8 op = EXACT; /* Arbitrary non-END op. */
155aba94 4022 register regnode *next;
c277df42
IZ
4023
4024 while (op != END && (!last || node < last)) {
4025 /* While that wasn't END last time... */
4026
4027 NODE_ALIGN(node);
4028 op = OP(node);
4029 if (op == CLOSE)
4030 l--;
4031 next = regnext(node);
4032 /* Where, what. */
4033 if (OP(node) == OPTIMIZED)
4034 goto after_print;
4035 regprop(sv, node);
b900a521 4036 PerlIO_printf(Perl_debug_log, "%4"IVdf":%*s%s", (IV)(node - start),
f1dbda3d 4037 (int)(2*l + 1), "", SvPVX(sv));
c277df42
IZ
4038 if (next == NULL) /* Next ptr. */
4039 PerlIO_printf(Perl_debug_log, "(0)");
b81d288d 4040 else
b900a521 4041 PerlIO_printf(Perl_debug_log, "(%"IVdf")", (IV)(next - start));
c277df42
IZ
4042 (void)PerlIO_putc(Perl_debug_log, '\n');
4043 after_print:
22c35a8c 4044 if (PL_regkind[(U8)op] == BRANCHJ) {
b81d288d
AB
4045 register regnode *nnode = (OP(next) == LONGJMP
4046 ? regnext(next)
c277df42
IZ
4047 : next);
4048 if (last && nnode > last)
4049 nnode = last;
4050 node = dumpuntil(start, NEXTOPER(NEXTOPER(node)), nnode, sv, l + 1);
a0ed51b3 4051 }
22c35a8c 4052 else if (PL_regkind[(U8)op] == BRANCH) {
c277df42 4053 node = dumpuntil(start, NEXTOPER(node), next, sv, l + 1);
a0ed51b3
LW
4054 }
4055 else if ( op == CURLY) { /* `next' might be very big: optimizer */
c277df42
IZ
4056 node = dumpuntil(start, NEXTOPER(node) + EXTRA_STEP_2ARGS,
4057 NEXTOPER(node) + EXTRA_STEP_2ARGS + 1, sv, l + 1);
a0ed51b3 4058 }
22c35a8c 4059 else if (PL_regkind[(U8)op] == CURLY && op != CURLYX) {
c277df42
IZ
4060 node = dumpuntil(start, NEXTOPER(node) + EXTRA_STEP_2ARGS,
4061 next, sv, l + 1);
a0ed51b3
LW
4062 }
4063 else if ( op == PLUS || op == STAR) {
c277df42 4064 node = dumpuntil(start, NEXTOPER(node), NEXTOPER(node) + 1, sv, l + 1);
a0ed51b3
LW
4065 }
4066 else if (op == ANYOF) {
c277df42 4067 node = NEXTOPER(node);
936ed897 4068 node += ANYOF_SKIP;
a0ed51b3 4069 }
22c35a8c 4070 else if (PL_regkind[(U8)op] == EXACT) {
c277df42 4071 /* Literal string, where present. */
cd439c50 4072 node += NODE_SZ_STR(node) - 1;
c277df42 4073 node = NEXTOPER(node);
a0ed51b3
LW
4074 }
4075 else {
c277df42
IZ
4076 node = NEXTOPER(node);
4077 node += regarglen[(U8)op];
4078 }
4079 if (op == CURLYX || op == OPEN)
4080 l++;
4081 else if (op == WHILEM)
4082 l--;
4083 }
17c3b450 4084#endif /* DEBUGGING */
c277df42
IZ
4085 return node;
4086}
4087
a687059c 4088/*
fd181c75 4089 - regdump - dump a regexp onto Perl_debug_log in vaguely comprehensible form
a687059c
LW
4090 */
4091void
864dbfa3 4092Perl_regdump(pTHX_ regexp *r)
a687059c 4093{
35ff7856 4094#ifdef DEBUGGING
46fc3d4c 4095 SV *sv = sv_newmortal();
a687059c 4096
c277df42 4097 (void)dumpuntil(r->program, r->program + 1, NULL, sv, 0);
a0d0e21e
LW
4098
4099 /* Header fields of interest. */
c277df42 4100 if (r->anchored_substr)
7b0972df 4101 PerlIO_printf(Perl_debug_log,
b81d288d 4102 "anchored `%s%.*s%s'%s at %"IVdf" ",
3280af22 4103 PL_colors[0],
7b0972df 4104 (int)(SvCUR(r->anchored_substr) - (SvTAIL(r->anchored_substr)!=0)),
b81d288d 4105 SvPVX(r->anchored_substr),
3280af22 4106 PL_colors[1],
c277df42 4107 SvTAIL(r->anchored_substr) ? "$" : "",
7b0972df 4108 (IV)r->anchored_offset);
c277df42 4109 if (r->float_substr)
7b0972df 4110 PerlIO_printf(Perl_debug_log,
b81d288d 4111 "floating `%s%.*s%s'%s at %"IVdf"..%"UVuf" ",
3280af22 4112 PL_colors[0],
b81d288d 4113 (int)(SvCUR(r->float_substr) - (SvTAIL(r->float_substr)!=0)),
2c2d71f5 4114 SvPVX(r->float_substr),
3280af22 4115 PL_colors[1],
c277df42 4116 SvTAIL(r->float_substr) ? "$" : "",
7b0972df 4117 (IV)r->float_min_offset, (UV)r->float_max_offset);
c277df42 4118 if (r->check_substr)
b81d288d
AB
4119 PerlIO_printf(Perl_debug_log,
4120 r->check_substr == r->float_substr
c277df42
IZ
4121 ? "(checking floating" : "(checking anchored");
4122 if (r->reganch & ROPT_NOSCAN)
4123 PerlIO_printf(Perl_debug_log, " noscan");
4124 if (r->reganch & ROPT_CHECK_ALL)
4125 PerlIO_printf(Perl_debug_log, " isall");
4126 if (r->check_substr)
4127 PerlIO_printf(Perl_debug_log, ") ");
4128
46fc3d4c 4129 if (r->regstclass) {
4130 regprop(sv, r->regstclass);
4131 PerlIO_printf(Perl_debug_log, "stclass `%s' ", SvPVX(sv));
4132 }
774d564b 4133 if (r->reganch & ROPT_ANCH) {
4134 PerlIO_printf(Perl_debug_log, "anchored");
4135 if (r->reganch & ROPT_ANCH_BOL)
4136 PerlIO_printf(Perl_debug_log, "(BOL)");
c277df42
IZ
4137 if (r->reganch & ROPT_ANCH_MBOL)
4138 PerlIO_printf(Perl_debug_log, "(MBOL)");
cad2e5aa
JH
4139 if (r->reganch & ROPT_ANCH_SBOL)
4140 PerlIO_printf(Perl_debug_log, "(SBOL)");
774d564b 4141 if (r->reganch & ROPT_ANCH_GPOS)
4142 PerlIO_printf(Perl_debug_log, "(GPOS)");
4143 PerlIO_putc(Perl_debug_log, ' ');
4144 }
c277df42
IZ
4145 if (r->reganch & ROPT_GPOS_SEEN)
4146 PerlIO_printf(Perl_debug_log, "GPOS ");
a0d0e21e 4147 if (r->reganch & ROPT_SKIP)
760ac839 4148 PerlIO_printf(Perl_debug_log, "plus ");
a0d0e21e 4149 if (r->reganch & ROPT_IMPLICIT)
760ac839 4150 PerlIO_printf(Perl_debug_log, "implicit ");
760ac839 4151 PerlIO_printf(Perl_debug_log, "minlen %ld ", (long) r->minlen);
ce862d02
IZ
4152 if (r->reganch & ROPT_EVAL_SEEN)
4153 PerlIO_printf(Perl_debug_log, "with eval ");
760ac839 4154 PerlIO_printf(Perl_debug_log, "\n");
17c3b450 4155#endif /* DEBUGGING */
a687059c
LW
4156}
4157
653099ff
GS
4158STATIC void
4159S_put_byte(pTHX_ SV *sv, int c)
4160{
ffc61ed2 4161 if (isCNTRL(c) || c == 127 || c == 255 || !isPRINT(c))
653099ff
GS
4162 Perl_sv_catpvf(aTHX_ sv, "\\%o", c);
4163 else if (c == '-' || c == ']' || c == '\\' || c == '^')
4164 Perl_sv_catpvf(aTHX_ sv, "\\%c", c);
4165 else
4166 Perl_sv_catpvf(aTHX_ sv, "%c", c);
4167}
4168
a687059c 4169/*
a0d0e21e
LW
4170- regprop - printable representation of opcode
4171*/
46fc3d4c 4172void
864dbfa3 4173Perl_regprop(pTHX_ SV *sv, regnode *o)
a687059c 4174{
35ff7856 4175#ifdef DEBUGGING
9b155405 4176 register int k;
a0d0e21e 4177
54dc92de 4178 sv_setpvn(sv, "", 0);
9b155405 4179 if (OP(o) >= reg_num) /* regnode.type is unsigned */
830247a4
IZ
4180 /* It would be nice to FAIL() here, but this may be called from
4181 regexec.c, and it would be hard to supply pRExC_state. */
4182 Perl_croak(aTHX_ "Corrupted regexp opcode");
9b155405
IZ
4183 sv_catpv(sv, (char*)reg_name[OP(o)]); /* Take off const! */
4184
4185 k = PL_regkind[(U8)OP(o)];
4186
4187 if (k == EXACT)
7821416a 4188 Perl_sv_catpvf(aTHX_ sv, " <%s%.*s%s>", PL_colors[0],
cd439c50 4189 STR_LEN(o), STRING(o), PL_colors[1]);
9b155405 4190 else if (k == CURLY) {
cb434fcc 4191 if (OP(o) == CURLYM || OP(o) == CURLYN || OP(o) == CURLYX)
cea2e8a9
GS
4192 Perl_sv_catpvf(aTHX_ sv, "[%d]", o->flags); /* Parenth number */
4193 Perl_sv_catpvf(aTHX_ sv, " {%d,%d}", ARG1(o), ARG2(o));
a0d0e21e 4194 }
2c2d71f5
JH
4195 else if (k == WHILEM && o->flags) /* Ordinal/of */
4196 Perl_sv_catpvf(aTHX_ sv, "[%d/%d]", o->flags & 0xf, o->flags>>4);
9b155405 4197 else if (k == REF || k == OPEN || k == CLOSE || k == GROUPP )
894356b3 4198 Perl_sv_catpvf(aTHX_ sv, "%d", (int)ARG(o)); /* Parenth number */
9b155405 4199 else if (k == LOGICAL)
04ebc1ab 4200 Perl_sv_catpvf(aTHX_ sv, "[%d]", o->flags); /* 2: embedded, otherwise 1 */
653099ff
GS
4201 else if (k == ANYOF) {
4202 int i, rangestart = -1;
ffc61ed2 4203 U8 flags = ANYOF_FLAGS(o);
19860706
JH
4204 const char * const anyofs[] = { /* Should be syncronized with
4205 * ANYOF_ #xdefines in regcomp.h */
653099ff
GS
4206 "\\w",
4207 "\\W",
4208 "\\s",
4209 "\\S",
4210 "\\d",
4211 "\\D",
4212 "[:alnum:]",
4213 "[:^alnum:]",
4214 "[:alpha:]",
4215 "[:^alpha:]",
4216 "[:ascii:]",
4217 "[:^ascii:]",
4218 "[:ctrl:]",
4219 "[:^ctrl:]",
4220 "[:graph:]",
4221 "[:^graph:]",
4222 "[:lower:]",
4223 "[:^lower:]",
4224 "[:print:]",
4225 "[:^print:]",
4226 "[:punct:]",
4227 "[:^punct:]",
4228 "[:upper:]",
aaa51d5e 4229 "[:^upper:]",
653099ff 4230 "[:xdigit:]",
aaa51d5e
JF
4231 "[:^xdigit:]",
4232 "[:space:]",
4233 "[:^space:]",
4234 "[:blank:]",
4235 "[:^blank:]"
653099ff
GS
4236 };
4237
19860706 4238 if (flags & ANYOF_LOCALE)
653099ff 4239 sv_catpv(sv, "{loc}");
19860706 4240 if (flags & ANYOF_FOLD)
653099ff
GS
4241 sv_catpv(sv, "{i}");
4242 Perl_sv_catpvf(aTHX_ sv, "[%s", PL_colors[0]);
19860706 4243 if (flags & ANYOF_INVERT)
653099ff 4244 sv_catpv(sv, "^");
ffc61ed2
JH
4245 for (i = 0; i <= 256; i++) {
4246 if (i < 256 && ANYOF_BITMAP_TEST(o,i)) {
4247 if (rangestart == -1)
4248 rangestart = i;
4249 } else if (rangestart != -1) {
4250 if (i <= rangestart + 3)
4251 for (; rangestart < i; rangestart++)
653099ff 4252 put_byte(sv, rangestart);
ffc61ed2
JH
4253 else {
4254 put_byte(sv, rangestart);
4255 sv_catpv(sv, "-");
4256 put_byte(sv, i - 1);
653099ff 4257 }
ffc61ed2 4258 rangestart = -1;
653099ff 4259 }
847a199f 4260 }
ffc61ed2
JH
4261
4262 if (o->flags & ANYOF_CLASS)
4263 for (i = 0; i < sizeof(anyofs)/sizeof(char*); i++)
4264 if (ANYOF_CLASS_TEST(o,i))
4265 sv_catpv(sv, anyofs[i]);
4266
4267 if (flags & ANYOF_UNICODE)
4268 sv_catpv(sv, "{unicode}");
1aa99e6b
IH
4269 else if (flags & ANYOF_UNICODE_ALL)
4270 sv_catpv(sv, "{all-unicode}");
ffc61ed2
JH
4271
4272 {
4273 SV *lv;
4274 SV *sw = regclass_swash(o, FALSE, &lv);
b81d288d 4275
ffc61ed2
JH
4276 if (lv) {
4277 if (sw) {
4278 UV i;
4279 U8 s[UTF8_MAXLEN+1];
b81d288d 4280
ffc61ed2
JH
4281 for (i = 0; i <= 256; i++) { /* just the first 256 */
4282 U8 *e = uv_to_utf8(s, i);
4283
4284 if (i < 256 && swash_fetch(sw, s)) {
4285 if (rangestart == -1)
4286 rangestart = i;
4287 } else if (rangestart != -1) {
4288 U8 *p;
b81d288d 4289
ffc61ed2
JH
4290 if (i <= rangestart + 3)
4291 for (; rangestart < i; rangestart++) {
4292 for(e = uv_to_utf8(s, rangestart), p = s; p < e; p++)
4293 put_byte(sv, *p);
4294 }
4295 else {
4296 for (e = uv_to_utf8(s, rangestart), p = s; p < e; p++)
4297 put_byte(sv, *p);
4298 sv_catpv(sv, "-");
4299 for (e = uv_to_utf8(s, i - 1), p = s; p < e; p++)
4300 put_byte(sv, *p);
4301 }
4302 rangestart = -1;
4303 }
19860706 4304 }
ffc61ed2
JH
4305
4306 sv_catpv(sv, "..."); /* et cetera */
19860706 4307 }
fde631ed 4308
ffc61ed2
JH
4309 {
4310 char *s = savepv(SvPVX(lv));
4311 char *origs = s;
b81d288d 4312
ffc61ed2 4313 while(*s && *s != '\n') s++;
b81d288d 4314
ffc61ed2
JH
4315 if (*s == '\n') {
4316 char *t = ++s;
4317
4318 while (*s) {
4319 if (*s == '\n')
4320 *s = ' ';
4321 s++;
4322 }
4323 if (s[-1] == ' ')
4324 s[-1] = 0;
4325
4326 sv_catpv(sv, t);
fde631ed 4327 }
b81d288d 4328
ffc61ed2 4329 Safefree(origs);
fde631ed
JH
4330 }
4331 }
653099ff 4332 }
ffc61ed2 4333
653099ff
GS
4334 Perl_sv_catpvf(aTHX_ sv, "%s]", PL_colors[1]);
4335 }
9b155405 4336 else if (k == BRANCHJ && (OP(o) == UNLESSM || OP(o) == IFMATCH))
cea2e8a9 4337 Perl_sv_catpvf(aTHX_ sv, "[-%d]", o->flags);
17c3b450 4338#endif /* DEBUGGING */
35ff7856 4339}
a687059c 4340
cad2e5aa
JH
4341SV *
4342Perl_re_intuit_string(pTHX_ regexp *prog)
4343{ /* Assume that RE_INTUIT is set */
4344 DEBUG_r(
4345 { STRLEN n_a;
4346 char *s = SvPV(prog->check_substr,n_a);
4347
4348 if (!PL_colorset) reginitcolors();
4349 PerlIO_printf(Perl_debug_log,
4350 "%sUsing REx substr:%s `%s%.60s%s%s'\n",
4351 PL_colors[4],PL_colors[5],PL_colors[0],
4352 s,
4353 PL_colors[1],
4354 (strlen(s) > 60 ? "..." : ""));
4355 } );
4356
4357 return prog->check_substr;
4358}
4359
2b69d0c2 4360void
864dbfa3 4361Perl_pregfree(pTHX_ struct regexp *r)
a687059c 4362{
adac82c7 4363 DEBUG_r(if (!PL_colorset) reginitcolors());
7821416a
IZ
4364
4365 if (!r || (--r->refcnt > 0))
4366 return;
cad2e5aa
JH
4367 DEBUG_r(PerlIO_printf(Perl_debug_log,
4368 "%sFreeing REx:%s `%s%.60s%s%s'\n",
4369 PL_colors[4],PL_colors[5],PL_colors[0],
4370 r->precomp,
4371 PL_colors[1],
4372 (strlen(r->precomp) > 60 ? "..." : "")));
4373
c277df42 4374 if (r->precomp)
a0d0e21e 4375 Safefree(r->precomp);
cf93c79d
IZ
4376 if (RX_MATCH_COPIED(r))
4377 Safefree(r->subbeg);
a193d654
GS
4378 if (r->substrs) {
4379 if (r->anchored_substr)
4380 SvREFCNT_dec(r->anchored_substr);
4381 if (r->float_substr)
4382 SvREFCNT_dec(r->float_substr);
2779dcf1 4383 Safefree(r->substrs);
a193d654 4384 }
c277df42
IZ
4385 if (r->data) {
4386 int n = r->data->count;
dfad63ad
HS
4387 AV* new_comppad = NULL;
4388 AV* old_comppad;
4389 SV** old_curpad;
4390
c277df42
IZ
4391 while (--n >= 0) {
4392 switch (r->data->what[n]) {
4393 case 's':
4394 SvREFCNT_dec((SV*)r->data->data[n]);
4395 break;
653099ff
GS
4396 case 'f':
4397 Safefree(r->data->data[n]);
4398 break;
dfad63ad
HS
4399 case 'p':
4400 new_comppad = (AV*)r->data->data[n];
4401 break;
c277df42 4402 case 'o':
dfad63ad 4403 if (new_comppad == NULL)
cea2e8a9 4404 Perl_croak(aTHX_ "panic: pregfree comppad");
dfad63ad
HS
4405 old_comppad = PL_comppad;
4406 old_curpad = PL_curpad;
1e6dc0b6
SB
4407 /* Watch out for global destruction's random ordering. */
4408 if (SvTYPE(new_comppad) == SVt_PVAV) {
4409 PL_comppad = new_comppad;
4410 PL_curpad = AvARRAY(new_comppad);
4411 }
4412 else
4413 PL_curpad = NULL;
c277df42 4414 op_free((OP_4tree*)r->data->data[n]);
dfad63ad
HS
4415 PL_comppad = old_comppad;
4416 PL_curpad = old_curpad;
4417 SvREFCNT_dec((SV*)new_comppad);
4418 new_comppad = NULL;
c277df42
IZ
4419 break;
4420 case 'n':
4421 break;
4422 default:
830247a4 4423 Perl_croak(aTHX_ "panic: regfree data code '%c'", r->data->what[n]);
c277df42
IZ
4424 }
4425 }
4426 Safefree(r->data->what);
4427 Safefree(r->data);
a0d0e21e
LW
4428 }
4429 Safefree(r->startp);
4430 Safefree(r->endp);
4431 Safefree(r);
a687059c 4432}
c277df42
IZ
4433
4434/*
4435 - regnext - dig the "next" pointer out of a node
4436 *
4437 * [Note, when REGALIGN is defined there are two places in regmatch()
4438 * that bypass this code for speed.]
4439 */
4440regnode *
864dbfa3 4441Perl_regnext(pTHX_ register regnode *p)
c277df42
IZ
4442{
4443 register I32 offset;
4444
3280af22 4445 if (p == &PL_regdummy)
c277df42
IZ
4446 return(NULL);
4447
4448 offset = (reg_off_by_arg[OP(p)] ? ARG(p) : NEXT_OFF(p));
4449 if (offset == 0)
4450 return(NULL);
4451
c277df42 4452 return(p+offset);
c277df42
IZ
4453}
4454
01f988be 4455STATIC void
cea2e8a9 4456S_re_croak2(pTHX_ const char* pat1,const char* pat2,...)
c277df42
IZ
4457{
4458 va_list args;
4459 STRLEN l1 = strlen(pat1);
4460 STRLEN l2 = strlen(pat2);
4461 char buf[512];
06bf62c7 4462 SV *msv;
c277df42
IZ
4463 char *message;
4464
4465 if (l1 > 510)
4466 l1 = 510;
4467 if (l1 + l2 > 510)
4468 l2 = 510 - l1;
4469 Copy(pat1, buf, l1 , char);
4470 Copy(pat2, buf + l1, l2 , char);
3b818b81
GS
4471 buf[l1 + l2] = '\n';
4472 buf[l1 + l2 + 1] = '\0';
8736538c
AS
4473#ifdef I_STDARG
4474 /* ANSI variant takes additional second argument */
c277df42 4475 va_start(args, pat2);
8736538c
AS
4476#else
4477 va_start(args);
4478#endif
5a844595 4479 msv = vmess(buf, &args);
c277df42 4480 va_end(args);
06bf62c7 4481 message = SvPV(msv,l1);
c277df42
IZ
4482 if (l1 > 512)
4483 l1 = 512;
4484 Copy(message, buf, l1 , char);
4485 buf[l1] = '\0'; /* Overwrite \n */
cea2e8a9 4486 Perl_croak(aTHX_ "%s", buf);
c277df42 4487}
a0ed51b3
LW
4488
4489/* XXX Here's a total kludge. But we need to re-enter for swash routines. */
4490
4491void
864dbfa3 4492Perl_save_re_context(pTHX)
b81d288d 4493{
830247a4
IZ
4494#if 0
4495 SAVEPPTR(RExC_precomp); /* uncompiled string. */
4496 SAVEI32(RExC_npar); /* () count. */
4497 SAVEI32(RExC_size); /* Code size. */
4498 SAVEI16(RExC_flags16); /* are we folding, multilining? */
4499 SAVEVPTR(RExC_rx); /* from regcomp.c */
4500 SAVEI32(RExC_seen); /* from regcomp.c */
4501 SAVEI32(RExC_sawback); /* Did we see \1, ...? */
4502 SAVEI32(RExC_naughty); /* How bad is this pattern? */
4503 SAVEVPTR(RExC_emit); /* Code-emit pointer; &regdummy = don't */
4504 SAVEPPTR(RExC_end); /* End of input for compile */
4505 SAVEPPTR(RExC_parse); /* Input-scan pointer. */
4506#endif
4507
4508 SAVEI32(PL_reg_flags); /* from regexec.c */
a0ed51b3 4509 SAVEPPTR(PL_bostr);
a0ed51b3
LW
4510 SAVEPPTR(PL_reginput); /* String-input pointer. */
4511 SAVEPPTR(PL_regbol); /* Beginning of input, for ^ check. */
4512 SAVEPPTR(PL_regeol); /* End of input, for $ check. */
7766f137
GS
4513 SAVEVPTR(PL_regstartp); /* Pointer to startp array. */
4514 SAVEVPTR(PL_regendp); /* Ditto for endp. */
4515 SAVEVPTR(PL_reglastparen); /* Similarly for lastparen. */
a0ed51b3 4516 SAVEPPTR(PL_regtill); /* How far we are required to go. */
e8347627 4517 SAVEI8(PL_regprev); /* char before regbol, \n if none */
b81d288d 4518 SAVEGENERICPV(PL_reg_start_tmp); /* from regexec.c */
a0ed51b3 4519 PL_reg_start_tmp = 0;
a0ed51b3
LW
4520 SAVEI32(PL_reg_start_tmpl); /* from regexec.c */
4521 PL_reg_start_tmpl = 0;
7766f137 4522 SAVEVPTR(PL_regdata);
a0ed51b3
LW
4523 SAVEI32(PL_reg_eval_set); /* from regexec.c */
4524 SAVEI32(PL_regnarrate); /* from regexec.c */
7766f137 4525 SAVEVPTR(PL_regprogram); /* from regexec.c */
a0ed51b3 4526 SAVEINT(PL_regindent); /* from regexec.c */
7766f137
GS
4527 SAVEVPTR(PL_regcc); /* from regexec.c */
4528 SAVEVPTR(PL_curcop);
7766f137
GS
4529 SAVEVPTR(PL_reg_call_cc); /* from regexec.c */
4530 SAVEVPTR(PL_reg_re); /* from regexec.c */
54b6e2fa
IZ
4531 SAVEPPTR(PL_reg_ganch); /* from regexec.c */
4532 SAVESPTR(PL_reg_sv); /* from regexec.c */
7766f137 4533 SAVEVPTR(PL_reg_magic); /* from regexec.c */
54b6e2fa 4534 SAVEI32(PL_reg_oldpos); /* from regexec.c */
7766f137
GS
4535 SAVEVPTR(PL_reg_oldcurpm); /* from regexec.c */
4536 SAVEVPTR(PL_reg_curpm); /* from regexec.c */
5fb7366e 4537 SAVEI32(PL_regnpar); /* () count. */
54b6e2fa 4538#ifdef DEBUGGING
b81d288d 4539 SAVEPPTR(PL_reg_starttry); /* from regexec.c */
54b6e2fa 4540#endif
a0ed51b3 4541}
51371543
GS
4542
4543#ifdef PERL_OBJECT
51371543
GS
4544#include "XSUB.h"
4545#undef this
4546#define this pPerl
4547#endif
4548
4549static void
4550clear_re(pTHXo_ void *r)
4551{
4552 ReREFCNT_dec((regexp *)r);
4553}