This is a live mirror of the Perl 5 development currently hosted at https://github.com/perl/perl5
Typo in utf8.h
[perl5.git] / regcomp.c
CommitLineData
a0d0e21e
LW
1/* regcomp.c
2 */
3
4/*
5 * "A fair jaw-cracker dwarf-language must be." --Samwise Gamgee
6 */
7
a687059c
LW
8/* NOTE: this is derived from Henry Spencer's regexp code, and should not
9 * confused with the original package (see point 3 below). Thanks, Henry!
10 */
11
12/* Additional note: this code is very heavily munged from Henry's version
13 * in places. In some spots I've traded clarity for efficiency, so don't
14 * blame Henry for some of the lack of readability.
15 */
16
e50aee73
AD
17/* The names of the functions have been changed from regcomp and
18 * regexec to pregcomp and pregexec in order to avoid conflicts
19 * with the POSIX routines of the same names.
20*/
21
b9d5759e
AD
22#ifdef PERL_EXT_RE_BUILD
23/* need to replace pregcomp et al, so enable that */
24# ifndef PERL_IN_XSUB_RE
25# define PERL_IN_XSUB_RE
26# endif
27/* need access to debugger hooks */
cad2e5aa 28# if defined(PERL_EXT_RE_DEBUG) && !defined(DEBUGGING)
b9d5759e
AD
29# define DEBUGGING
30# endif
31#endif
32
33#ifdef PERL_IN_XSUB_RE
d06ea78c 34/* We *really* need to overwrite these symbols: */
56953603
IZ
35# define Perl_pregcomp my_regcomp
36# define Perl_regdump my_regdump
37# define Perl_regprop my_regprop
d06ea78c 38# define Perl_pregfree my_regfree
cad2e5aa
JH
39# define Perl_re_intuit_string my_re_intuit_string
40/* *These* symbols are masked to allow static link. */
d06ea78c 41# define Perl_regnext my_regnext
f0b8d043 42# define Perl_save_re_context my_save_re_context
b81d288d 43# define Perl_reginitcolors my_reginitcolors
c5be433b
GS
44
45# define PERL_NO_GET_CONTEXT
b81d288d 46#endif
56953603 47
f0fcb552 48/*SUPPRESS 112*/
a687059c 49/*
e50aee73 50 * pregcomp and pregexec -- regsub and regerror are not used in perl
a687059c
LW
51 *
52 * Copyright (c) 1986 by University of Toronto.
53 * Written by Henry Spencer. Not derived from licensed software.
54 *
55 * Permission is granted to anyone to use this software for any
56 * purpose on any computer system, and to redistribute it freely,
57 * subject to the following restrictions:
58 *
59 * 1. The author is not responsible for the consequences of use of
60 * this software, no matter how awful, even if they arise
61 * from defects in it.
62 *
63 * 2. The origin of this software must not be misrepresented, either
64 * by explicit claim or by omission.
65 *
66 * 3. Altered versions must be plainly marked as such, and must not
67 * be misrepresented as being the original software.
68 *
69 *
70 **** Alterations to Henry's code are...
71 ****
bc89e66f 72 **** Copyright (c) 1991-2001, Larry Wall
a687059c 73 ****
9ef589d8
LW
74 **** You may distribute under the terms of either the GNU General Public
75 **** License or the Artistic License, as specified in the README file.
76
a687059c
LW
77 *
78 * Beware that some of this code is subtly aware of the way operator
79 * precedence is structured in regular expressions. Serious changes in
80 * regular-expression syntax might require a total rethink.
81 */
82#include "EXTERN.h"
864dbfa3 83#define PERL_IN_REGCOMP_C
a687059c 84#include "perl.h"
d06ea78c 85
c5be433b
GS
86#ifdef PERL_IN_XSUB_RE
87# if defined(PERL_CAPI) || defined(PERL_OBJECT)
88# include "XSUB.h"
89# endif
90#else
d06ea78c
GS
91# include "INTERN.h"
92#endif
c277df42
IZ
93
94#define REG_COMP_C
a687059c
LW
95#include "regcomp.h"
96
d4cce5f1 97#ifdef op
11343788 98#undef op
d4cce5f1 99#endif /* op */
11343788 100
fe14fcc3
LW
101#ifdef MSDOS
102# if defined(BUGGY_MSC6)
103 /* MSC 6.00A breaks on op/regexp.t test 85 unless we turn this off */
104 # pragma optimize("a",off)
105 /* But MSC 6.00A is happy with 'w', for aliases only across function calls*/
106 # pragma optimize("w",on )
107# endif /* BUGGY_MSC6 */
108#endif /* MSDOS */
109
a687059c
LW
110#ifndef STATIC
111#define STATIC static
112#endif
113
830247a4
IZ
114typedef struct RExC_state_t {
115 U16 flags16; /* are we folding, multilining? */
116 char *precomp; /* uncompiled string. */
117 regexp *rx;
118 char *end; /* End of input for compile */
119 char *parse; /* Input-scan pointer. */
120 I32 whilem_seen; /* number of WHILEM in this expr */
ffc61ed2 121 regnode *emit; /* Code-emit pointer; &regdummy = don't = compiling */
830247a4
IZ
122 I32 naughty; /* How bad is this pattern? */
123 I32 sawback; /* Did we see \1, ...? */
124 U32 seen;
125 I32 size; /* Code size. */
126 I32 npar; /* () count. */
127 I32 extralen;
128 I32 seen_zerolen;
129 I32 seen_evals;
1aa99e6b 130 I32 utf8;
830247a4
IZ
131#if ADD_TO_REGEXEC
132 char *starttry; /* -Dr: where regtry was called. */
133#define RExC_starttry (pRExC_state->starttry)
134#endif
135} RExC_state_t;
136
137#define RExC_flags16 (pRExC_state->flags16)
138#define RExC_precomp (pRExC_state->precomp)
139#define RExC_rx (pRExC_state->rx)
140#define RExC_end (pRExC_state->end)
141#define RExC_parse (pRExC_state->parse)
142#define RExC_whilem_seen (pRExC_state->whilem_seen)
143#define RExC_emit (pRExC_state->emit)
144#define RExC_naughty (pRExC_state->naughty)
145#define RExC_sawback (pRExC_state->sawback)
146#define RExC_seen (pRExC_state->seen)
147#define RExC_size (pRExC_state->size)
148#define RExC_npar (pRExC_state->npar)
149#define RExC_extralen (pRExC_state->extralen)
150#define RExC_seen_zerolen (pRExC_state->seen_zerolen)
151#define RExC_seen_evals (pRExC_state->seen_evals)
1aa99e6b 152#define RExC_utf8 (pRExC_state->utf8)
830247a4 153
a687059c
LW
154#define ISMULT1(c) ((c) == '*' || (c) == '+' || (c) == '?')
155#define ISMULT2(s) ((*s) == '*' || (*s) == '+' || (*s) == '?' || \
156 ((*s) == '{' && regcurly(s)))
a687059c 157
35c8bce7
LW
158#ifdef SPSTART
159#undef SPSTART /* dratted cpp namespace... */
160#endif
a687059c
LW
161/*
162 * Flags to be passed up and down.
163 */
a687059c 164#define WORST 0 /* Worst case. */
821b33a5 165#define HASWIDTH 0x1 /* Known to match non-null strings. */
a0d0e21e
LW
166#define SIMPLE 0x2 /* Simple enough to be STAR/PLUS operand. */
167#define SPSTART 0x4 /* Starts with * or +. */
168#define TRYAGAIN 0x8 /* Weeded out a declaration. */
a687059c 169
2c2d71f5
JH
170/* Length of a variant. */
171
172typedef struct scan_data_t {
173 I32 len_min;
174 I32 len_delta;
175 I32 pos_min;
176 I32 pos_delta;
177 SV *last_found;
178 I32 last_end; /* min value, <0 unless valid. */
179 I32 last_start_min;
180 I32 last_start_max;
181 SV **longest; /* Either &l_fixed, or &l_float. */
182 SV *longest_fixed;
183 I32 offset_fixed;
184 SV *longest_float;
185 I32 offset_float_min;
186 I32 offset_float_max;
187 I32 flags;
188 I32 whilem_c;
cb434fcc 189 I32 *last_closep;
653099ff 190 struct regnode_charclass_class *start_class;
2c2d71f5
JH
191} scan_data_t;
192
a687059c 193/*
e50aee73 194 * Forward declarations for pregcomp()'s friends.
a687059c 195 */
a0d0e21e 196
b81d288d 197static scan_data_t zero_scan_data = { 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
cb434fcc 198 0, 0, 0, 0, 0, 0};
c277df42
IZ
199
200#define SF_BEFORE_EOL (SF_BEFORE_SEOL|SF_BEFORE_MEOL)
201#define SF_BEFORE_SEOL 0x1
202#define SF_BEFORE_MEOL 0x2
203#define SF_FIX_BEFORE_EOL (SF_FIX_BEFORE_SEOL|SF_FIX_BEFORE_MEOL)
204#define SF_FL_BEFORE_EOL (SF_FL_BEFORE_SEOL|SF_FL_BEFORE_MEOL)
205
09b7f37c
CB
206#ifdef NO_UNARY_PLUS
207# define SF_FIX_SHIFT_EOL (0+2)
208# define SF_FL_SHIFT_EOL (0+4)
209#else
210# define SF_FIX_SHIFT_EOL (+2)
211# define SF_FL_SHIFT_EOL (+4)
212#endif
c277df42
IZ
213
214#define SF_FIX_BEFORE_SEOL (SF_BEFORE_SEOL << SF_FIX_SHIFT_EOL)
215#define SF_FIX_BEFORE_MEOL (SF_BEFORE_MEOL << SF_FIX_SHIFT_EOL)
216
217#define SF_FL_BEFORE_SEOL (SF_BEFORE_SEOL << SF_FL_SHIFT_EOL)
218#define SF_FL_BEFORE_MEOL (SF_BEFORE_MEOL << SF_FL_SHIFT_EOL) /* 0x20 */
219#define SF_IS_INF 0x40
220#define SF_HAS_PAR 0x80
221#define SF_IN_PAR 0x100
222#define SF_HAS_EVAL 0x200
4bfe0158 223#define SCF_DO_SUBSTR 0x400
653099ff
GS
224#define SCF_DO_STCLASS_AND 0x0800
225#define SCF_DO_STCLASS_OR 0x1000
226#define SCF_DO_STCLASS (SCF_DO_STCLASS_AND|SCF_DO_STCLASS_OR)
e1901655 227#define SCF_WHILEM_VISITED_POS 0x2000
c277df42 228
1aa99e6b 229#define UTF RExC_utf8
830247a4
IZ
230#define LOC (RExC_flags16 & PMf_LOCALE)
231#define FOLD (RExC_flags16 & PMf_FOLD)
a0ed51b3 232
ffc61ed2 233#define OOB_UNICODE 12345678
93733859 234#define OOB_NAMEDCLASS -1
b8c5462f 235
a0ed51b3
LW
236#define CHR_SVLEN(sv) (UTF ? sv_len_utf8(sv) : SvCUR(sv))
237#define CHR_DIST(a,b) (UTF ? utf8_distance(a,b) : a - b)
238
8615cb43 239
b45f050a
JF
240/* length of regex to show in messages that don't mark a position within */
241#define RegexLengthToShowInErrorMessages 127
242
243/*
244 * If MARKER[12] are adjusted, be sure to adjust the constants at the top
245 * of t/op/regmesg.t, the tests in t/op/re_tests, and those in
246 * op/pragma/warn/regcomp.
247 */
79eeca27
JH
248#define MARKER1 "HERE" /* marker as it appears in the description */
249#define MARKER2 " << HERE " /* marker as it appears within the regex */
b81d288d 250
79eeca27 251#define REPORT_LOCATION " before " MARKER1 " mark in regex m/%.*s" MARKER2 "%s/"
b45f050a
JF
252
253/*
254 * Calls SAVEDESTRUCTOR_X if needed, then calls Perl_croak with the given
255 * arg. Show regex, up to a maximum length. If it's too long, chop and add
256 * "...".
257 */
a4eb266f 258#define FAIL(msg) \
8615cb43 259 STMT_START { \
a4eb266f 260 char *ellipses = ""; \
830247a4 261 unsigned len = strlen(RExC_precomp); \
b45f050a 262 \
8615cb43 263 if (!SIZE_ONLY) \
830247a4 264 SAVEDESTRUCTOR_X(clear_re,(void*)RExC_rx); \
b45f050a
JF
265 \
266 if (len > RegexLengthToShowInErrorMessages) { \
267 /* chop 10 shorter than the max, to ensure meaning of "..." */ \
268 len = RegexLengthToShowInErrorMessages - 10; \
a4eb266f 269 ellipses = "..."; \
b45f050a
JF
270 } \
271 Perl_croak(aTHX_ "%s in regex m/%.*s%s/", \
830247a4 272 msg, (int)len, RExC_precomp, ellipses); \
8615cb43
JF
273 } STMT_END
274
b45f050a
JF
275/*
276 * Calls SAVEDESTRUCTOR_X if needed, then calls Perl_croak with the given
277 * args. Show regex, up to a maximum length. If it's too long, chop and add
278 * "...".
279 */
a4eb266f 280#define FAIL2(pat,msg) \
8615cb43 281 STMT_START { \
a4eb266f 282 char *ellipses = ""; \
830247a4 283 unsigned len = strlen(RExC_precomp); \
b45f050a 284 \
8615cb43 285 if (!SIZE_ONLY) \
830247a4 286 SAVEDESTRUCTOR_X(clear_re,(void*)RExC_rx); \
8615cb43 287 \
b45f050a
JF
288 if (len > RegexLengthToShowInErrorMessages) { \
289 /* chop 10 shorter than the max, to ensure meaning of "..." */ \
290 len = RegexLengthToShowInErrorMessages - 10; \
a4eb266f 291 ellipses = "..."; \
b45f050a
JF
292 } \
293 S_re_croak2(aTHX_ pat, " in regex m/%.*s%s/", \
830247a4 294 msg, (int)len, RExC_precomp, ellipses); \
b45f050a
JF
295 } STMT_END
296
297
298/*
299 * Simple_vFAIL -- like FAIL, but marks the current location in the scan
300 */
301#define Simple_vFAIL(m) \
302 STMT_START { \
830247a4 303 unsigned offset = strlen(RExC_precomp)-(RExC_end-RExC_parse); \
b45f050a
JF
304 \
305 Perl_croak(aTHX_ "%s" REPORT_LOCATION, \
830247a4 306 m, (int)offset, RExC_precomp, RExC_precomp + offset); \
b45f050a
JF
307 } STMT_END
308
309/*
310 * Calls SAVEDESTRUCTOR_X if needed, then Simple_vFAIL()
311 */
312#define vFAIL(m) \
313 STMT_START { \
314 if (!SIZE_ONLY) \
830247a4 315 SAVEDESTRUCTOR_X(clear_re,(void*)RExC_rx); \
b45f050a
JF
316 Simple_vFAIL(m); \
317 } STMT_END
318
319/*
320 * Like Simple_vFAIL(), but accepts two arguments.
321 */
322#define Simple_vFAIL2(m,a1) \
323 STMT_START { \
830247a4 324 unsigned offset = strlen(RExC_precomp)-(RExC_end-RExC_parse); \
b45f050a
JF
325 \
326 S_re_croak2(aTHX_ m, REPORT_LOCATION, a1, \
830247a4 327 (int)offset, RExC_precomp, RExC_precomp + offset); \
b45f050a
JF
328 } STMT_END
329
330/*
331 * Calls SAVEDESTRUCTOR_X if needed, then Simple_vFAIL2().
332 */
333#define vFAIL2(m,a1) \
334 STMT_START { \
335 if (!SIZE_ONLY) \
830247a4 336 SAVEDESTRUCTOR_X(clear_re,(void*)RExC_rx); \
b45f050a
JF
337 Simple_vFAIL2(m, a1); \
338 } STMT_END
339
340
341/*
342 * Like Simple_vFAIL(), but accepts three arguments.
343 */
344#define Simple_vFAIL3(m, a1, a2) \
345 STMT_START { \
830247a4 346 unsigned offset = strlen(RExC_precomp)-(RExC_end-RExC_parse); \
b45f050a
JF
347 \
348 S_re_croak2(aTHX_ m, REPORT_LOCATION, a1, a2, \
830247a4 349 (int)offset, RExC_precomp, RExC_precomp + offset); \
b45f050a
JF
350 } STMT_END
351
352/*
353 * Calls SAVEDESTRUCTOR_X if needed, then Simple_vFAIL3().
354 */
355#define vFAIL3(m,a1,a2) \
356 STMT_START { \
357 if (!SIZE_ONLY) \
830247a4 358 SAVEDESTRUCTOR_X(clear_re,(void*)RExC_rx); \
b45f050a
JF
359 Simple_vFAIL3(m, a1, a2); \
360 } STMT_END
361
362/*
363 * Like Simple_vFAIL(), but accepts four arguments.
364 */
365#define Simple_vFAIL4(m, a1, a2, a3) \
366 STMT_START { \
830247a4 367 unsigned offset = strlen(RExC_precomp)-(RExC_end-RExC_parse); \
b45f050a
JF
368 \
369 S_re_croak2(aTHX_ m, REPORT_LOCATION, a1, a2, a3,\
830247a4 370 (int)offset, RExC_precomp, RExC_precomp + offset); \
b45f050a
JF
371 } STMT_END
372
373/*
374 * Like Simple_vFAIL(), but accepts five arguments.
375 */
376#define Simple_vFAIL5(m, a1, a2, a3, a4) \
377 STMT_START { \
830247a4 378 unsigned offset = strlen(RExC_precomp)-(RExC_end-RExC_parse); \
b45f050a 379 S_re_croak2(aTHX_ m, REPORT_LOCATION, a1, a2, a3, a4,\
830247a4 380 (int)offset, RExC_precomp, RExC_precomp + offset); \
b45f050a
JF
381 } STMT_END
382
383
384#define vWARN(loc,m) \
385 STMT_START { \
830247a4 386 unsigned offset = strlen(RExC_precomp)-(RExC_end-(loc)); \
b45f050a 387 Perl_warner(aTHX_ WARN_REGEXP, "%s" REPORT_LOCATION,\
830247a4 388 m, (int)offset, RExC_precomp, RExC_precomp + offset); \
b45f050a
JF
389 } STMT_END \
390
391
392#define vWARN2(loc, m, a1) \
393 STMT_START { \
830247a4 394 unsigned offset = strlen(RExC_precomp)-(RExC_end-(loc)); \
b45f050a
JF
395 Perl_warner(aTHX_ WARN_REGEXP, m REPORT_LOCATION,\
396 a1, \
830247a4 397 (int)offset, RExC_precomp, RExC_precomp + offset); \
b45f050a
JF
398 } STMT_END
399
400#define vWARN3(loc, m, a1, a2) \
401 STMT_START { \
830247a4 402 unsigned offset = strlen(RExC_precomp) - (RExC_end - (loc)); \
b45f050a
JF
403 Perl_warner(aTHX_ WARN_REGEXP, m REPORT_LOCATION, \
404 a1, a2, \
830247a4 405 (int)offset, RExC_precomp, RExC_precomp + offset); \
b45f050a
JF
406 } STMT_END
407
408#define vWARN4(loc, m, a1, a2, a3) \
409 STMT_START { \
830247a4 410 unsigned offset = strlen(RExC_precomp)-(RExC_end-(loc)); \
b45f050a
JF
411 Perl_warner(aTHX_ WARN_REGEXP, m REPORT_LOCATION,\
412 a1, a2, a3, \
830247a4 413 (int)offset, RExC_precomp, RExC_precomp + offset); \
8615cb43
JF
414 } STMT_END
415
416
cd439c50
IZ
417/* Allow for side effects in s */
418#define REGC(c,s) STMT_START { if (!SIZE_ONLY) *(s) = (c); else (s);} STMT_END
419
51371543 420static void clear_re(pTHXo_ void *r);
4327152a 421
653099ff
GS
422/* Mark that we cannot extend a found fixed substring at this point.
423 Updata the longest found anchored substring and the longest found
424 floating substrings if needed. */
425
4327152a 426STATIC void
830247a4 427S_scan_commit(pTHX_ RExC_state_t *pRExC_state, scan_data_t *data)
c277df42 428{
a0ed51b3
LW
429 STRLEN l = CHR_SVLEN(data->last_found);
430 STRLEN old_l = CHR_SVLEN(*data->longest);
b81d288d 431
c277df42
IZ
432 if ((l >= old_l) && ((l > old_l) || (data->flags & SF_BEFORE_EOL))) {
433 sv_setsv(*data->longest, data->last_found);
434 if (*data->longest == data->longest_fixed) {
435 data->offset_fixed = l ? data->last_start_min : data->pos_min;
436 if (data->flags & SF_BEFORE_EOL)
b81d288d 437 data->flags
c277df42
IZ
438 |= ((data->flags & SF_BEFORE_EOL) << SF_FIX_SHIFT_EOL);
439 else
440 data->flags &= ~SF_FIX_BEFORE_EOL;
a0ed51b3
LW
441 }
442 else {
c277df42 443 data->offset_float_min = l ? data->last_start_min : data->pos_min;
b81d288d
AB
444 data->offset_float_max = (l
445 ? data->last_start_max
c277df42
IZ
446 : data->pos_min + data->pos_delta);
447 if (data->flags & SF_BEFORE_EOL)
b81d288d 448 data->flags
c277df42
IZ
449 |= ((data->flags & SF_BEFORE_EOL) << SF_FL_SHIFT_EOL);
450 else
451 data->flags &= ~SF_FL_BEFORE_EOL;
452 }
453 }
454 SvCUR_set(data->last_found, 0);
455 data->last_end = -1;
456 data->flags &= ~SF_BEFORE_EOL;
457}
458
653099ff
GS
459/* Can match anything (initialization) */
460STATIC void
830247a4 461S_cl_anything(pTHX_ RExC_state_t *pRExC_state, struct regnode_charclass_class *cl)
653099ff 462{
9af0acdb
JH
463 int value;
464
653099ff 465 ANYOF_CLASS_ZERO(cl);
9af0acdb
JH
466 for (value = 0; value < 256; ++value)
467 ANYOF_BITMAP_SET(cl, value);
1aa99e6b 468 cl->flags = ANYOF_EOS|ANYOF_UNICODE_ALL;
653099ff
GS
469 if (LOC)
470 cl->flags |= ANYOF_LOCALE;
471}
472
473/* Can match anything (initialization) */
474STATIC int
475S_cl_is_anything(pTHX_ struct regnode_charclass_class *cl)
476{
477 int value;
478
aaa51d5e 479 for (value = 0; value <= ANYOF_MAX; value += 2)
653099ff
GS
480 if (ANYOF_CLASS_TEST(cl, value) && ANYOF_CLASS_TEST(cl, value + 1))
481 return 1;
1aa99e6b
IH
482 if (!(cl->flags & ANYOF_UNICODE_ALL))
483 return 0;
9af0acdb
JH
484 for (value = 0; value < 256; ++value)
485 if (!ANYOF_BITMAP_TEST(cl, value))
653099ff
GS
486 return 0;
487 return 1;
488}
489
490/* Can match anything (initialization) */
491STATIC void
830247a4 492S_cl_init(pTHX_ RExC_state_t *pRExC_state, struct regnode_charclass_class *cl)
653099ff 493{
8ecf7187 494 Zero(cl, 1, struct regnode_charclass_class);
653099ff 495 cl->type = ANYOF;
830247a4 496 cl_anything(pRExC_state, cl);
653099ff
GS
497}
498
499STATIC void
830247a4 500S_cl_init_zero(pTHX_ RExC_state_t *pRExC_state, struct regnode_charclass_class *cl)
653099ff 501{
8ecf7187 502 Zero(cl, 1, struct regnode_charclass_class);
653099ff 503 cl->type = ANYOF;
830247a4 504 cl_anything(pRExC_state, cl);
653099ff
GS
505 if (LOC)
506 cl->flags |= ANYOF_LOCALE;
507}
508
509/* 'And' a given class with another one. Can create false positives */
510/* We assume that cl is not inverted */
511STATIC void
512S_cl_and(pTHX_ struct regnode_charclass_class *cl,
513 struct regnode_charclass_class *and_with)
514{
653099ff
GS
515 if (!(and_with->flags & ANYOF_CLASS)
516 && !(cl->flags & ANYOF_CLASS)
517 && (and_with->flags & ANYOF_LOCALE) == (cl->flags & ANYOF_LOCALE)
518 && !(and_with->flags & ANYOF_FOLD)
519 && !(cl->flags & ANYOF_FOLD)) {
520 int i;
521
522 if (and_with->flags & ANYOF_INVERT)
523 for (i = 0; i < ANYOF_BITMAP_SIZE; i++)
524 cl->bitmap[i] &= ~and_with->bitmap[i];
525 else
526 for (i = 0; i < ANYOF_BITMAP_SIZE; i++)
527 cl->bitmap[i] &= and_with->bitmap[i];
528 } /* XXXX: logic is complicated otherwise, leave it along for a moment. */
529 if (!(and_with->flags & ANYOF_EOS))
530 cl->flags &= ~ANYOF_EOS;
1aa99e6b
IH
531
532 if (cl->flags & ANYOF_UNICODE_ALL && and_with->flags & ANYOF_UNICODE) {
533 cl->flags &= ~ANYOF_UNICODE_ALL;
534 cl->flags |= ANYOF_UNICODE;
535 ARG_SET(cl, ARG(and_with));
536 }
537 if (!(and_with->flags & ANYOF_UNICODE_ALL))
538 cl->flags &= ~ANYOF_UNICODE_ALL;
539 if (!(and_with->flags & (ANYOF_UNICODE|ANYOF_UNICODE_ALL)))
540 cl->flags &= ~ANYOF_UNICODE;
653099ff
GS
541}
542
543/* 'OR' a given class with another one. Can create false positives */
544/* We assume that cl is not inverted */
545STATIC void
830247a4 546S_cl_or(pTHX_ RExC_state_t *pRExC_state, struct regnode_charclass_class *cl, struct regnode_charclass_class *or_with)
653099ff 547{
653099ff
GS
548 if (or_with->flags & ANYOF_INVERT) {
549 /* We do not use
550 * (B1 | CL1) | (!B2 & !CL2) = (B1 | !B2 & !CL2) | (CL1 | (!B2 & !CL2))
551 * <= (B1 | !B2) | (CL1 | !CL2)
552 * which is wasteful if CL2 is small, but we ignore CL2:
553 * (B1 | CL1) | (!B2 & !CL2) <= (B1 | CL1) | !B2 = (B1 | !B2) | CL1
554 * XXXX Can we handle case-fold? Unclear:
555 * (OK1(i) | OK1(i')) | !(OK1(i) | OK1(i')) =
556 * (OK1(i) | OK1(i')) | (!OK1(i) & !OK1(i'))
557 */
558 if ( (or_with->flags & ANYOF_LOCALE) == (cl->flags & ANYOF_LOCALE)
559 && !(or_with->flags & ANYOF_FOLD)
560 && !(cl->flags & ANYOF_FOLD) ) {
561 int i;
562
563 for (i = 0; i < ANYOF_BITMAP_SIZE; i++)
564 cl->bitmap[i] |= ~or_with->bitmap[i];
565 } /* XXXX: logic is complicated otherwise */
566 else {
830247a4 567 cl_anything(pRExC_state, cl);
653099ff
GS
568 }
569 } else {
570 /* (B1 | CL1) | (B2 | CL2) = (B1 | B2) | (CL1 | CL2)) */
571 if ( (or_with->flags & ANYOF_LOCALE) == (cl->flags & ANYOF_LOCALE)
b81d288d 572 && (!(or_with->flags & ANYOF_FOLD)
653099ff
GS
573 || (cl->flags & ANYOF_FOLD)) ) {
574 int i;
575
576 /* OR char bitmap and class bitmap separately */
577 for (i = 0; i < ANYOF_BITMAP_SIZE; i++)
578 cl->bitmap[i] |= or_with->bitmap[i];
579 if (or_with->flags & ANYOF_CLASS) {
580 for (i = 0; i < ANYOF_CLASSBITMAP_SIZE; i++)
581 cl->classflags[i] |= or_with->classflags[i];
582 cl->flags |= ANYOF_CLASS;
583 }
584 }
585 else { /* XXXX: logic is complicated, leave it along for a moment. */
830247a4 586 cl_anything(pRExC_state, cl);
653099ff
GS
587 }
588 }
589 if (or_with->flags & ANYOF_EOS)
590 cl->flags |= ANYOF_EOS;
1aa99e6b
IH
591
592 if (cl->flags & ANYOF_UNICODE && or_with->flags & ANYOF_UNICODE &&
593 ARG(cl) != ARG(or_with)) {
594 cl->flags |= ANYOF_UNICODE_ALL;
595 cl->flags &= ~ANYOF_UNICODE;
596 }
597 if (or_with->flags & ANYOF_UNICODE_ALL) {
598 cl->flags |= ANYOF_UNICODE_ALL;
599 cl->flags &= ~ANYOF_UNICODE;
600 }
653099ff
GS
601}
602
603/* REx optimizer. Converts nodes into quickier variants "in place".
604 Finds fixed substrings. */
605
c277df42
IZ
606/* Stops at toplevel WHILEM as well as at `last'. At end *scanp is set
607 to the position after last scanned or to NULL. */
608
76e3520e 609STATIC I32
830247a4 610S_study_chunk(pTHX_ RExC_state_t *pRExC_state, regnode **scanp, I32 *deltap, regnode *last, scan_data_t *data, U32 flags)
c277df42
IZ
611 /* scanp: Start here (read-write). */
612 /* deltap: Write maxlen-minlen here. */
613 /* last: Stop before this one. */
614{
615 I32 min = 0, pars = 0, code;
616 regnode *scan = *scanp, *next;
617 I32 delta = 0;
618 int is_inf = (flags & SCF_DO_SUBSTR) && (data->flags & SF_IS_INF);
aca2d497 619 int is_inf_internal = 0; /* The studied chunk is infinite */
c277df42
IZ
620 I32 is_par = OP(scan) == OPEN ? ARG(scan) : 0;
621 scan_data_t data_fake;
653099ff 622 struct regnode_charclass_class and_with; /* Valid if flags & SCF_DO_STCLASS_OR */
b81d288d 623
c277df42
IZ
624 while (scan && OP(scan) != END && scan < last) {
625 /* Peephole optimizer: */
626
22c35a8c 627 if (PL_regkind[(U8)OP(scan)] == EXACT) {
653099ff 628 /* Merge several consecutive EXACTish nodes into one. */
c277df42
IZ
629 regnode *n = regnext(scan);
630 U32 stringok = 1;
631#ifdef DEBUGGING
632 regnode *stop = scan;
b81d288d 633#endif
c277df42 634
cd439c50 635 next = scan + NODE_SZ_STR(scan);
c277df42
IZ
636 /* Skip NOTHING, merge EXACT*. */
637 while (n &&
b81d288d 638 ( PL_regkind[(U8)OP(n)] == NOTHING ||
c277df42
IZ
639 (stringok && (OP(n) == OP(scan))))
640 && NEXT_OFF(n)
641 && NEXT_OFF(scan) + NEXT_OFF(n) < I16_MAX) {
642 if (OP(n) == TAIL || n > next)
643 stringok = 0;
22c35a8c 644 if (PL_regkind[(U8)OP(n)] == NOTHING) {
c277df42
IZ
645 NEXT_OFF(scan) += NEXT_OFF(n);
646 next = n + NODE_STEP_REGNODE;
647#ifdef DEBUGGING
648 if (stringok)
649 stop = n;
b81d288d 650#endif
c277df42 651 n = regnext(n);
a0ed51b3 652 }
f49d4d0f 653 else if (stringok) {
cd439c50 654 int oldl = STR_LEN(scan);
c277df42 655 regnode *nnext = regnext(n);
f49d4d0f 656
b81d288d 657 if (oldl + STR_LEN(n) > U8_MAX)
c277df42
IZ
658 break;
659 NEXT_OFF(scan) += NEXT_OFF(n);
cd439c50
IZ
660 STR_LEN(scan) += STR_LEN(n);
661 next = n + NODE_SZ_STR(n);
c277df42 662 /* Now we can overwrite *n : */
f49d4d0f 663 Move(STRING(n), STRING(scan) + oldl, STR_LEN(n), char);
c277df42 664#ifdef DEBUGGING
f49d4d0f 665 stop = next - 1;
b81d288d 666#endif
c277df42
IZ
667 n = nnext;
668 }
669 }
670#ifdef DEBUGGING
671 /* Allow dumping */
cd439c50 672 n = scan + NODE_SZ_STR(scan);
c277df42 673 while (n <= stop) {
22c35a8c 674 if (PL_regkind[(U8)OP(n)] != NOTHING || OP(n) == NOTHING) {
c277df42
IZ
675 OP(n) = OPTIMIZED;
676 NEXT_OFF(n) = 0;
677 }
678 n++;
679 }
653099ff 680#endif
c277df42 681 }
653099ff
GS
682 /* Follow the next-chain of the current node and optimize
683 away all the NOTHINGs from it. */
c277df42 684 if (OP(scan) != CURLYX) {
048cfca1
GS
685 int max = (reg_off_by_arg[OP(scan)]
686 ? I32_MAX
687 /* I32 may be smaller than U16 on CRAYs! */
688 : (I32_MAX < U16_MAX ? I32_MAX : U16_MAX));
c277df42
IZ
689 int off = (reg_off_by_arg[OP(scan)] ? ARG(scan) : NEXT_OFF(scan));
690 int noff;
691 regnode *n = scan;
b81d288d 692
c277df42
IZ
693 /* Skip NOTHING and LONGJMP. */
694 while ((n = regnext(n))
22c35a8c 695 && ((PL_regkind[(U8)OP(n)] == NOTHING && (noff = NEXT_OFF(n)))
c277df42
IZ
696 || ((OP(n) == LONGJMP) && (noff = ARG(n))))
697 && off + noff < max)
698 off += noff;
699 if (reg_off_by_arg[OP(scan)])
700 ARG(scan) = off;
b81d288d 701 else
c277df42
IZ
702 NEXT_OFF(scan) = off;
703 }
653099ff
GS
704 /* The principal pseudo-switch. Cannot be a switch, since we
705 look into several different things. */
b81d288d 706 if (OP(scan) == BRANCH || OP(scan) == BRANCHJ
c277df42
IZ
707 || OP(scan) == IFTHEN || OP(scan) == SUSPEND) {
708 next = regnext(scan);
709 code = OP(scan);
b81d288d
AB
710
711 if (OP(next) == code || code == IFTHEN || code == SUSPEND) {
c277df42 712 I32 max1 = 0, min1 = I32_MAX, num = 0;
653099ff 713 struct regnode_charclass_class accum;
c277df42 714
653099ff 715 if (flags & SCF_DO_SUBSTR) /* XXXX Add !SUSPEND? */
830247a4 716 scan_commit(pRExC_state, data); /* Cannot merge strings after this. */
653099ff 717 if (flags & SCF_DO_STCLASS)
830247a4 718 cl_init_zero(pRExC_state, &accum);
c277df42 719 while (OP(scan) == code) {
830247a4 720 I32 deltanext, minnext, f = 0, fake;
653099ff 721 struct regnode_charclass_class this_class;
c277df42
IZ
722
723 num++;
724 data_fake.flags = 0;
b81d288d 725 if (data) {
2c2d71f5 726 data_fake.whilem_c = data->whilem_c;
cb434fcc
IZ
727 data_fake.last_closep = data->last_closep;
728 }
729 else
730 data_fake.last_closep = &fake;
c277df42
IZ
731 next = regnext(scan);
732 scan = NEXTOPER(scan);
733 if (code != BRANCH)
734 scan = NEXTOPER(scan);
653099ff 735 if (flags & SCF_DO_STCLASS) {
830247a4 736 cl_init(pRExC_state, &this_class);
653099ff
GS
737 data_fake.start_class = &this_class;
738 f = SCF_DO_STCLASS_AND;
b81d288d 739 }
e1901655
IZ
740 if (flags & SCF_WHILEM_VISITED_POS)
741 f |= SCF_WHILEM_VISITED_POS;
653099ff 742 /* we suppose the run is continuous, last=next...*/
830247a4
IZ
743 minnext = study_chunk(pRExC_state, &scan, &deltanext,
744 next, &data_fake, f);
b81d288d 745 if (min1 > minnext)
c277df42
IZ
746 min1 = minnext;
747 if (max1 < minnext + deltanext)
748 max1 = minnext + deltanext;
749 if (deltanext == I32_MAX)
aca2d497 750 is_inf = is_inf_internal = 1;
c277df42
IZ
751 scan = next;
752 if (data_fake.flags & (SF_HAS_PAR|SF_IN_PAR))
753 pars++;
405ff068 754 if (data && (data_fake.flags & SF_HAS_EVAL))
c277df42 755 data->flags |= SF_HAS_EVAL;
2c2d71f5
JH
756 if (data)
757 data->whilem_c = data_fake.whilem_c;
653099ff 758 if (flags & SCF_DO_STCLASS)
830247a4 759 cl_or(pRExC_state, &accum, &this_class);
b81d288d 760 if (code == SUSPEND)
c277df42
IZ
761 break;
762 }
763 if (code == IFTHEN && num < 2) /* Empty ELSE branch */
764 min1 = 0;
765 if (flags & SCF_DO_SUBSTR) {
766 data->pos_min += min1;
767 data->pos_delta += max1 - min1;
768 if (max1 != min1 || is_inf)
769 data->longest = &(data->longest_float);
770 }
771 min += min1;
772 delta += max1 - min1;
653099ff 773 if (flags & SCF_DO_STCLASS_OR) {
830247a4 774 cl_or(pRExC_state, data->start_class, &accum);
653099ff
GS
775 if (min1) {
776 cl_and(data->start_class, &and_with);
777 flags &= ~SCF_DO_STCLASS;
778 }
779 }
780 else if (flags & SCF_DO_STCLASS_AND) {
de0c8cb8
GS
781 if (min1) {
782 cl_and(data->start_class, &accum);
653099ff 783 flags &= ~SCF_DO_STCLASS;
de0c8cb8
GS
784 }
785 else {
b81d288d 786 /* Switch to OR mode: cache the old value of
de0c8cb8
GS
787 * data->start_class */
788 StructCopy(data->start_class, &and_with,
789 struct regnode_charclass_class);
790 flags &= ~SCF_DO_STCLASS_AND;
791 StructCopy(&accum, data->start_class,
792 struct regnode_charclass_class);
793 flags |= SCF_DO_STCLASS_OR;
794 data->start_class->flags |= ANYOF_EOS;
795 }
653099ff 796 }
a0ed51b3
LW
797 }
798 else if (code == BRANCHJ) /* single branch is optimized. */
c277df42
IZ
799 scan = NEXTOPER(NEXTOPER(scan));
800 else /* single branch is optimized. */
801 scan = NEXTOPER(scan);
802 continue;
a0ed51b3
LW
803 }
804 else if (OP(scan) == EXACT) {
cd439c50 805 I32 l = STR_LEN(scan);
1aa99e6b 806 UV uc = *((U8*)STRING(scan));
a0ed51b3 807 if (UTF) {
1aa99e6b
IH
808 U8 *s = (U8*)STRING(scan);
809 l = utf8_length(s, s + l);
9041c2e3 810 uc = utf8_to_uvchr(s, NULL);
a0ed51b3
LW
811 }
812 min += l;
c277df42 813 if (flags & SCF_DO_SUBSTR) { /* Update longest substr. */
c277df42
IZ
814 /* The code below prefers earlier match for fixed
815 offset, later match for variable offset. */
816 if (data->last_end == -1) { /* Update the start info. */
817 data->last_start_min = data->pos_min;
818 data->last_start_max = is_inf
b81d288d 819 ? I32_MAX : data->pos_min + data->pos_delta;
c277df42 820 }
cd439c50 821 sv_catpvn(data->last_found, STRING(scan), STR_LEN(scan));
c277df42
IZ
822 data->last_end = data->pos_min + l;
823 data->pos_min += l; /* As in the first entry. */
824 data->flags &= ~SF_BEFORE_EOL;
825 }
653099ff
GS
826 if (flags & SCF_DO_STCLASS_AND) {
827 /* Check whether it is compatible with what we know already! */
828 int compat = 1;
829
1aa99e6b 830 if (uc >= 0x100 ||
b81d288d 831 !(data->start_class->flags & (ANYOF_CLASS | ANYOF_LOCALE))
1aa99e6b 832 && !ANYOF_BITMAP_TEST(data->start_class, uc)
653099ff 833 && (!(data->start_class->flags & ANYOF_FOLD)
1aa99e6b 834 || !ANYOF_BITMAP_TEST(data->start_class, PL_fold[uc])))
653099ff
GS
835 compat = 0;
836 ANYOF_CLASS_ZERO(data->start_class);
837 ANYOF_BITMAP_ZERO(data->start_class);
838 if (compat)
1aa99e6b 839 ANYOF_BITMAP_SET(data->start_class, uc);
653099ff 840 data->start_class->flags &= ~ANYOF_EOS;
9b877dbb
IH
841 if (uc < 0x100)
842 data->start_class->flags &= ~ANYOF_UNICODE_ALL;
653099ff
GS
843 }
844 else if (flags & SCF_DO_STCLASS_OR) {
845 /* false positive possible if the class is case-folded */
1aa99e6b 846 if (uc < 0x100)
9b877dbb
IH
847 ANYOF_BITMAP_SET(data->start_class, uc);
848 else
849 data->start_class->flags |= ANYOF_UNICODE_ALL;
653099ff
GS
850 data->start_class->flags &= ~ANYOF_EOS;
851 cl_and(data->start_class, &and_with);
852 }
853 flags &= ~SCF_DO_STCLASS;
a0ed51b3 854 }
653099ff 855 else if (PL_regkind[(U8)OP(scan)] == EXACT) { /* But OP != EXACT! */
cd439c50 856 I32 l = STR_LEN(scan);
1aa99e6b 857 UV uc = *((U8*)STRING(scan));
653099ff
GS
858
859 /* Search for fixed substrings supports EXACT only. */
b81d288d 860 if (flags & SCF_DO_SUBSTR)
830247a4 861 scan_commit(pRExC_state, data);
a0ed51b3 862 if (UTF) {
1aa99e6b
IH
863 U8 *s = (U8 *)STRING(scan);
864 l = utf8_length(s, s + l);
9041c2e3 865 uc = utf8_to_uvchr(s, NULL);
a0ed51b3
LW
866 }
867 min += l;
c277df42 868 if (data && (flags & SCF_DO_SUBSTR))
a0ed51b3 869 data->pos_min += l;
653099ff
GS
870 if (flags & SCF_DO_STCLASS_AND) {
871 /* Check whether it is compatible with what we know already! */
872 int compat = 1;
873
1aa99e6b 874 if (uc >= 0x100 ||
b81d288d 875 !(data->start_class->flags & (ANYOF_CLASS | ANYOF_LOCALE))
1aa99e6b
IH
876 && !ANYOF_BITMAP_TEST(data->start_class, uc)
877 && !ANYOF_BITMAP_TEST(data->start_class, PL_fold[uc]))
653099ff
GS
878 compat = 0;
879 ANYOF_CLASS_ZERO(data->start_class);
880 ANYOF_BITMAP_ZERO(data->start_class);
881 if (compat) {
1aa99e6b 882 ANYOF_BITMAP_SET(data->start_class, uc);
653099ff
GS
883 data->start_class->flags &= ~ANYOF_EOS;
884 data->start_class->flags |= ANYOF_FOLD;
885 if (OP(scan) == EXACTFL)
886 data->start_class->flags |= ANYOF_LOCALE;
887 }
888 }
889 else if (flags & SCF_DO_STCLASS_OR) {
890 if (data->start_class->flags & ANYOF_FOLD) {
891 /* false positive possible if the class is case-folded.
892 Assume that the locale settings are the same... */
1aa99e6b
IH
893 if (uc < 0x100)
894 ANYOF_BITMAP_SET(data->start_class, uc);
653099ff
GS
895 data->start_class->flags &= ~ANYOF_EOS;
896 }
897 cl_and(data->start_class, &and_with);
898 }
899 flags &= ~SCF_DO_STCLASS;
a0ed51b3 900 }
4d61ec05 901 else if (strchr((char*)PL_varies,OP(scan))) {
aa7a4b56
MH
902 I32 mincount, maxcount, minnext, deltanext, fl;
903 I32 f = flags, pos_before = 0;
c277df42 904 regnode *oscan = scan;
653099ff
GS
905 struct regnode_charclass_class this_class;
906 struct regnode_charclass_class *oclass = NULL;
907
22c35a8c 908 switch (PL_regkind[(U8)OP(scan)]) {
653099ff 909 case WHILEM: /* End of (?:...)* . */
c277df42
IZ
910 scan = NEXTOPER(scan);
911 goto finish;
912 case PLUS:
653099ff 913 if (flags & (SCF_DO_SUBSTR | SCF_DO_STCLASS)) {
c277df42 914 next = NEXTOPER(scan);
653099ff 915 if (OP(next) == EXACT || (flags & SCF_DO_STCLASS)) {
b81d288d
AB
916 mincount = 1;
917 maxcount = REG_INFTY;
c277df42
IZ
918 next = regnext(scan);
919 scan = NEXTOPER(scan);
920 goto do_curly;
921 }
922 }
923 if (flags & SCF_DO_SUBSTR)
924 data->pos_min++;
925 min++;
926 /* Fall through. */
927 case STAR:
653099ff
GS
928 if (flags & SCF_DO_STCLASS) {
929 mincount = 0;
b81d288d 930 maxcount = REG_INFTY;
653099ff
GS
931 next = regnext(scan);
932 scan = NEXTOPER(scan);
933 goto do_curly;
934 }
b81d288d 935 is_inf = is_inf_internal = 1;
c277df42
IZ
936 scan = regnext(scan);
937 if (flags & SCF_DO_SUBSTR) {
830247a4 938 scan_commit(pRExC_state, data); /* Cannot extend fixed substrings */
c277df42
IZ
939 data->longest = &(data->longest_float);
940 }
941 goto optimize_curly_tail;
942 case CURLY:
b81d288d 943 mincount = ARG1(scan);
c277df42
IZ
944 maxcount = ARG2(scan);
945 next = regnext(scan);
cb434fcc
IZ
946 if (OP(scan) == CURLYX) {
947 I32 lp = (data ? *(data->last_closep) : 0);
948
949 scan->flags = ((lp <= U8_MAX) ? lp : U8_MAX);
950 }
c277df42
IZ
951 scan = NEXTOPER(scan) + EXTRA_STEP_2ARGS;
952 do_curly:
953 if (flags & SCF_DO_SUBSTR) {
830247a4 954 if (mincount == 0) scan_commit(pRExC_state,data); /* Cannot extend fixed substrings */
c277df42
IZ
955 pos_before = data->pos_min;
956 }
957 if (data) {
958 fl = data->flags;
959 data->flags &= ~(SF_HAS_PAR|SF_IN_PAR|SF_HAS_EVAL);
960 if (is_inf)
961 data->flags |= SF_IS_INF;
962 }
653099ff 963 if (flags & SCF_DO_STCLASS) {
830247a4 964 cl_init(pRExC_state, &this_class);
653099ff
GS
965 oclass = data->start_class;
966 data->start_class = &this_class;
967 f |= SCF_DO_STCLASS_AND;
968 f &= ~SCF_DO_STCLASS_OR;
969 }
e1901655
IZ
970 /* These are the cases when once a subexpression
971 fails at a particular position, it cannot succeed
972 even after backtracking at the enclosing scope.
b81d288d 973
e1901655
IZ
974 XXXX what if minimal match and we are at the
975 initial run of {n,m}? */
976 if ((mincount != maxcount - 1) && (maxcount != REG_INFTY))
977 f &= ~SCF_WHILEM_VISITED_POS;
653099ff 978
c277df42 979 /* This will finish on WHILEM, setting scan, or on NULL: */
b81d288d
AB
980 minnext = study_chunk(pRExC_state, &scan, &deltanext, last, data,
981 mincount == 0
653099ff
GS
982 ? (f & ~SCF_DO_SUBSTR) : f);
983
984 if (flags & SCF_DO_STCLASS)
985 data->start_class = oclass;
986 if (mincount == 0 || minnext == 0) {
987 if (flags & SCF_DO_STCLASS_OR) {
830247a4 988 cl_or(pRExC_state, data->start_class, &this_class);
653099ff
GS
989 }
990 else if (flags & SCF_DO_STCLASS_AND) {
b81d288d 991 /* Switch to OR mode: cache the old value of
653099ff
GS
992 * data->start_class */
993 StructCopy(data->start_class, &and_with,
994 struct regnode_charclass_class);
995 flags &= ~SCF_DO_STCLASS_AND;
996 StructCopy(&this_class, data->start_class,
997 struct regnode_charclass_class);
998 flags |= SCF_DO_STCLASS_OR;
999 data->start_class->flags |= ANYOF_EOS;
1000 }
1001 } else { /* Non-zero len */
1002 if (flags & SCF_DO_STCLASS_OR) {
830247a4 1003 cl_or(pRExC_state, data->start_class, &this_class);
653099ff
GS
1004 cl_and(data->start_class, &and_with);
1005 }
1006 else if (flags & SCF_DO_STCLASS_AND)
1007 cl_and(data->start_class, &this_class);
1008 flags &= ~SCF_DO_STCLASS;
1009 }
c277df42
IZ
1010 if (!scan) /* It was not CURLYX, but CURLY. */
1011 scan = next;
b81d288d 1012 if (ckWARN(WARN_REGEXP) && (minnext + deltanext == 0)
821b33a5 1013 && !(data->flags & (SF_HAS_PAR|SF_IN_PAR))
17feb5d5 1014 && maxcount <= REG_INFTY/3) /* Complement check for big count */
b45f050a 1015 {
830247a4 1016 vWARN(RExC_parse,
b45f050a
JF
1017 "Quantifier unexpected on zero-length expression");
1018 }
1019
c277df42 1020 min += minnext * mincount;
b81d288d 1021 is_inf_internal |= ((maxcount == REG_INFTY
155aba94
GS
1022 && (minnext + deltanext) > 0)
1023 || deltanext == I32_MAX);
aca2d497 1024 is_inf |= is_inf_internal;
c277df42
IZ
1025 delta += (minnext + deltanext) * maxcount - minnext * mincount;
1026
1027 /* Try powerful optimization CURLYX => CURLYN. */
b81d288d 1028 if ( OP(oscan) == CURLYX && data
c277df42
IZ
1029 && data->flags & SF_IN_PAR
1030 && !(data->flags & SF_HAS_EVAL)
1031 && !deltanext && minnext == 1 ) {
1032 /* Try to optimize to CURLYN. */
1033 regnode *nxt = NEXTOPER(oscan) + EXTRA_STEP_2ARGS;
1034 regnode *nxt1 = nxt, *nxt2;
1035
1036 /* Skip open. */
1037 nxt = regnext(nxt);
4d61ec05 1038 if (!strchr((char*)PL_simple,OP(nxt))
22c35a8c 1039 && !(PL_regkind[(U8)OP(nxt)] == EXACT
b81d288d 1040 && STR_LEN(nxt) == 1))
c277df42
IZ
1041 goto nogo;
1042 nxt2 = nxt;
1043 nxt = regnext(nxt);
b81d288d 1044 if (OP(nxt) != CLOSE)
c277df42
IZ
1045 goto nogo;
1046 /* Now we know that nxt2 is the only contents: */
1047 oscan->flags = ARG(nxt);
1048 OP(oscan) = CURLYN;
1049 OP(nxt1) = NOTHING; /* was OPEN. */
1050#ifdef DEBUGGING
1051 OP(nxt1 + 1) = OPTIMIZED; /* was count. */
1052 NEXT_OFF(nxt1+ 1) = 0; /* just for consistancy. */
1053 NEXT_OFF(nxt2) = 0; /* just for consistancy with CURLY. */
1054 OP(nxt) = OPTIMIZED; /* was CLOSE. */
1055 OP(nxt + 1) = OPTIMIZED; /* was count. */
1056 NEXT_OFF(nxt+ 1) = 0; /* just for consistancy. */
b81d288d 1057#endif
c277df42 1058 }
c277df42
IZ
1059 nogo:
1060
1061 /* Try optimization CURLYX => CURLYM. */
b81d288d 1062 if ( OP(oscan) == CURLYX && data
c277df42 1063 && !(data->flags & SF_HAS_PAR)
c277df42
IZ
1064 && !(data->flags & SF_HAS_EVAL)
1065 && !deltanext ) {
1066 /* XXXX How to optimize if data == 0? */
1067 /* Optimize to a simpler form. */
1068 regnode *nxt = NEXTOPER(oscan) + EXTRA_STEP_2ARGS; /* OPEN */
1069 regnode *nxt2;
1070
1071 OP(oscan) = CURLYM;
1072 while ( (nxt2 = regnext(nxt)) /* skip over embedded stuff*/
b81d288d 1073 && (OP(nxt2) != WHILEM))
c277df42
IZ
1074 nxt = nxt2;
1075 OP(nxt2) = SUCCEED; /* Whas WHILEM */
c277df42
IZ
1076 /* Need to optimize away parenths. */
1077 if (data->flags & SF_IN_PAR) {
1078 /* Set the parenth number. */
1079 regnode *nxt1 = NEXTOPER(oscan) + EXTRA_STEP_2ARGS; /* OPEN*/
1080
b81d288d 1081 if (OP(nxt) != CLOSE)
b45f050a 1082 FAIL("Panic opt close");
c277df42
IZ
1083 oscan->flags = ARG(nxt);
1084 OP(nxt1) = OPTIMIZED; /* was OPEN. */
1085 OP(nxt) = OPTIMIZED; /* was CLOSE. */
1086#ifdef DEBUGGING
1087 OP(nxt1 + 1) = OPTIMIZED; /* was count. */
1088 OP(nxt + 1) = OPTIMIZED; /* was count. */
1089 NEXT_OFF(nxt1 + 1) = 0; /* just for consistancy. */
1090 NEXT_OFF(nxt + 1) = 0; /* just for consistancy. */
b81d288d 1091#endif
c277df42
IZ
1092#if 0
1093 while ( nxt1 && (OP(nxt1) != WHILEM)) {
1094 regnode *nnxt = regnext(nxt1);
b81d288d 1095
c277df42
IZ
1096 if (nnxt == nxt) {
1097 if (reg_off_by_arg[OP(nxt1)])
1098 ARG_SET(nxt1, nxt2 - nxt1);
1099 else if (nxt2 - nxt1 < U16_MAX)
1100 NEXT_OFF(nxt1) = nxt2 - nxt1;
1101 else
1102 OP(nxt) = NOTHING; /* Cannot beautify */
1103 }
1104 nxt1 = nnxt;
1105 }
1106#endif
1107 /* Optimize again: */
b81d288d 1108 study_chunk(pRExC_state, &nxt1, &deltanext, nxt,
e1901655 1109 NULL, 0);
a0ed51b3
LW
1110 }
1111 else
c277df42 1112 oscan->flags = 0;
c277df42 1113 }
e1901655
IZ
1114 else if ((OP(oscan) == CURLYX)
1115 && (flags & SCF_WHILEM_VISITED_POS)
1116 /* See the comment on a similar expression above.
1117 However, this time it not a subexpression
1118 we care about, but the expression itself. */
1119 && (maxcount == REG_INFTY)
1120 && data && ++data->whilem_c < 16) {
1121 /* This stays as CURLYX, we can put the count/of pair. */
2c2d71f5
JH
1122 /* Find WHILEM (as in regexec.c) */
1123 regnode *nxt = oscan + NEXT_OFF(oscan);
1124
1125 if (OP(PREVOPER(nxt)) == NOTHING) /* LONGJMP */
1126 nxt += ARG(nxt);
1127 PREVOPER(nxt)->flags = data->whilem_c
830247a4 1128 | (RExC_whilem_seen << 4); /* On WHILEM */
2c2d71f5 1129 }
b81d288d 1130 if (data && fl & (SF_HAS_PAR|SF_IN_PAR))
c277df42
IZ
1131 pars++;
1132 if (flags & SCF_DO_SUBSTR) {
1133 SV *last_str = Nullsv;
1134 int counted = mincount != 0;
1135
1136 if (data->last_end > 0 && mincount != 0) { /* Ends with a string. */
b81d288d 1137 I32 b = pos_before >= data->last_start_min
c277df42
IZ
1138 ? pos_before : data->last_start_min;
1139 STRLEN l;
1140 char *s = SvPV(data->last_found, l);
a0ed51b3
LW
1141 I32 old = b - data->last_start_min;
1142
1143 if (UTF)
1144 old = utf8_hop((U8*)s, old) - (U8*)s;
c277df42 1145
a0ed51b3 1146 l -= old;
c277df42 1147 /* Get the added string: */
79cb57f6 1148 last_str = newSVpvn(s + old, l);
c277df42
IZ
1149 if (deltanext == 0 && pos_before == b) {
1150 /* What was added is a constant string */
1151 if (mincount > 1) {
1152 SvGROW(last_str, (mincount * l) + 1);
b81d288d 1153 repeatcpy(SvPVX(last_str) + l,
c277df42
IZ
1154 SvPVX(last_str), l, mincount - 1);
1155 SvCUR(last_str) *= mincount;
1156 /* Add additional parts. */
b81d288d 1157 SvCUR_set(data->last_found,
c277df42
IZ
1158 SvCUR(data->last_found) - l);
1159 sv_catsv(data->last_found, last_str);
1160 data->last_end += l * (mincount - 1);
1161 }
2a8d9689
HS
1162 } else {
1163 /* start offset must point into the last copy */
1164 data->last_start_min += minnext * (mincount - 1);
4b2cff9a
HS
1165 data->last_start_max += is_inf ? 0 : (maxcount - 1)
1166 * (minnext + data->pos_delta);
c277df42
IZ
1167 }
1168 }
1169 /* It is counted once already... */
1170 data->pos_min += minnext * (mincount - counted);
1171 data->pos_delta += - counted * deltanext +
1172 (minnext + deltanext) * maxcount - minnext * mincount;
1173 if (mincount != maxcount) {
653099ff
GS
1174 /* Cannot extend fixed substrings found inside
1175 the group. */
830247a4 1176 scan_commit(pRExC_state,data);
c277df42
IZ
1177 if (mincount && last_str) {
1178 sv_setsv(data->last_found, last_str);
1179 data->last_end = data->pos_min;
b81d288d 1180 data->last_start_min =
a0ed51b3 1181 data->pos_min - CHR_SVLEN(last_str);
b81d288d
AB
1182 data->last_start_max = is_inf
1183 ? I32_MAX
c277df42 1184 : data->pos_min + data->pos_delta
a0ed51b3 1185 - CHR_SVLEN(last_str);
c277df42
IZ
1186 }
1187 data->longest = &(data->longest_float);
1188 }
aca2d497 1189 SvREFCNT_dec(last_str);
c277df42 1190 }
405ff068 1191 if (data && (fl & SF_HAS_EVAL))
c277df42
IZ
1192 data->flags |= SF_HAS_EVAL;
1193 optimize_curly_tail:
c277df42 1194 if (OP(oscan) != CURLYX) {
22c35a8c 1195 while (PL_regkind[(U8)OP(next = regnext(oscan))] == NOTHING
c277df42
IZ
1196 && NEXT_OFF(next))
1197 NEXT_OFF(oscan) += NEXT_OFF(next);
1198 }
c277df42 1199 continue;
653099ff 1200 default: /* REF and CLUMP only? */
c277df42 1201 if (flags & SCF_DO_SUBSTR) {
830247a4 1202 scan_commit(pRExC_state,data); /* Cannot expect anything... */
c277df42
IZ
1203 data->longest = &(data->longest_float);
1204 }
aca2d497 1205 is_inf = is_inf_internal = 1;
653099ff 1206 if (flags & SCF_DO_STCLASS_OR)
830247a4 1207 cl_anything(pRExC_state, data->start_class);
653099ff 1208 flags &= ~SCF_DO_STCLASS;
c277df42
IZ
1209 break;
1210 }
a0ed51b3 1211 }
ffc61ed2 1212 else if (strchr((char*)PL_simple,OP(scan))) {
653099ff
GS
1213 int value;
1214
c277df42 1215 if (flags & SCF_DO_SUBSTR) {
830247a4 1216 scan_commit(pRExC_state,data);
c277df42
IZ
1217 data->pos_min++;
1218 }
1219 min++;
653099ff
GS
1220 if (flags & SCF_DO_STCLASS) {
1221 data->start_class->flags &= ~ANYOF_EOS; /* No match on empty */
1222
1223 /* Some of the logic below assumes that switching
1224 locale on will only add false positives. */
1225 switch (PL_regkind[(U8)OP(scan)]) {
653099ff 1226 case SANY:
653099ff
GS
1227 default:
1228 do_default:
1229 /* Perl_croak(aTHX_ "panic: unexpected simple REx opcode %d", OP(scan)); */
1230 if (flags & SCF_DO_STCLASS_OR) /* Allow everything */
830247a4 1231 cl_anything(pRExC_state, data->start_class);
653099ff
GS
1232 break;
1233 case REG_ANY:
1234 if (OP(scan) == SANY)
1235 goto do_default;
1236 if (flags & SCF_DO_STCLASS_OR) { /* Everything but \n */
1237 value = (ANYOF_BITMAP_TEST(data->start_class,'\n')
1238 || (data->start_class->flags & ANYOF_CLASS));
830247a4 1239 cl_anything(pRExC_state, data->start_class);
653099ff
GS
1240 }
1241 if (flags & SCF_DO_STCLASS_AND || !value)
1242 ANYOF_BITMAP_CLEAR(data->start_class,'\n');
1243 break;
1244 case ANYOF:
1245 if (flags & SCF_DO_STCLASS_AND)
1246 cl_and(data->start_class,
1247 (struct regnode_charclass_class*)scan);
1248 else
830247a4 1249 cl_or(pRExC_state, data->start_class,
653099ff
GS
1250 (struct regnode_charclass_class*)scan);
1251 break;
1252 case ALNUM:
1253 if (flags & SCF_DO_STCLASS_AND) {
1254 if (!(data->start_class->flags & ANYOF_LOCALE)) {
1255 ANYOF_CLASS_CLEAR(data->start_class,ANYOF_NALNUM);
1256 for (value = 0; value < 256; value++)
1257 if (!isALNUM(value))
1258 ANYOF_BITMAP_CLEAR(data->start_class, value);
1259 }
1260 }
1261 else {
1262 if (data->start_class->flags & ANYOF_LOCALE)
1263 ANYOF_CLASS_SET(data->start_class,ANYOF_ALNUM);
1264 else {
1265 for (value = 0; value < 256; value++)
1266 if (isALNUM(value))
b81d288d 1267 ANYOF_BITMAP_SET(data->start_class, value);
653099ff
GS
1268 }
1269 }
1270 break;
1271 case ALNUML:
1272 if (flags & SCF_DO_STCLASS_AND) {
1273 if (data->start_class->flags & ANYOF_LOCALE)
1274 ANYOF_CLASS_CLEAR(data->start_class,ANYOF_NALNUM);
1275 }
1276 else {
1277 ANYOF_CLASS_SET(data->start_class,ANYOF_ALNUM);
1278 data->start_class->flags |= ANYOF_LOCALE;
1279 }
1280 break;
1281 case NALNUM:
1282 if (flags & SCF_DO_STCLASS_AND) {
1283 if (!(data->start_class->flags & ANYOF_LOCALE)) {
1284 ANYOF_CLASS_CLEAR(data->start_class,ANYOF_ALNUM);
1285 for (value = 0; value < 256; value++)
1286 if (isALNUM(value))
1287 ANYOF_BITMAP_CLEAR(data->start_class, value);
1288 }
1289 }
1290 else {
1291 if (data->start_class->flags & ANYOF_LOCALE)
1292 ANYOF_CLASS_SET(data->start_class,ANYOF_NALNUM);
1293 else {
1294 for (value = 0; value < 256; value++)
1295 if (!isALNUM(value))
b81d288d 1296 ANYOF_BITMAP_SET(data->start_class, value);
653099ff
GS
1297 }
1298 }
1299 break;
1300 case NALNUML:
1301 if (flags & SCF_DO_STCLASS_AND) {
1302 if (data->start_class->flags & ANYOF_LOCALE)
1303 ANYOF_CLASS_CLEAR(data->start_class,ANYOF_ALNUM);
1304 }
1305 else {
1306 data->start_class->flags |= ANYOF_LOCALE;
1307 ANYOF_CLASS_SET(data->start_class,ANYOF_NALNUM);
1308 }
1309 break;
1310 case SPACE:
1311 if (flags & SCF_DO_STCLASS_AND) {
1312 if (!(data->start_class->flags & ANYOF_LOCALE)) {
1313 ANYOF_CLASS_CLEAR(data->start_class,ANYOF_NSPACE);
1314 for (value = 0; value < 256; value++)
1315 if (!isSPACE(value))
1316 ANYOF_BITMAP_CLEAR(data->start_class, value);
1317 }
1318 }
1319 else {
1320 if (data->start_class->flags & ANYOF_LOCALE)
1321 ANYOF_CLASS_SET(data->start_class,ANYOF_SPACE);
1322 else {
1323 for (value = 0; value < 256; value++)
1324 if (isSPACE(value))
b81d288d 1325 ANYOF_BITMAP_SET(data->start_class, value);
653099ff
GS
1326 }
1327 }
1328 break;
1329 case SPACEL:
1330 if (flags & SCF_DO_STCLASS_AND) {
1331 if (data->start_class->flags & ANYOF_LOCALE)
1332 ANYOF_CLASS_CLEAR(data->start_class,ANYOF_NSPACE);
1333 }
1334 else {
1335 data->start_class->flags |= ANYOF_LOCALE;
1336 ANYOF_CLASS_SET(data->start_class,ANYOF_SPACE);
1337 }
1338 break;
1339 case NSPACE:
1340 if (flags & SCF_DO_STCLASS_AND) {
1341 if (!(data->start_class->flags & ANYOF_LOCALE)) {
1342 ANYOF_CLASS_CLEAR(data->start_class,ANYOF_SPACE);
1343 for (value = 0; value < 256; value++)
1344 if (isSPACE(value))
1345 ANYOF_BITMAP_CLEAR(data->start_class, value);
1346 }
1347 }
1348 else {
1349 if (data->start_class->flags & ANYOF_LOCALE)
1350 ANYOF_CLASS_SET(data->start_class,ANYOF_NSPACE);
1351 else {
1352 for (value = 0; value < 256; value++)
1353 if (!isSPACE(value))
b81d288d 1354 ANYOF_BITMAP_SET(data->start_class, value);
653099ff
GS
1355 }
1356 }
1357 break;
1358 case NSPACEL:
1359 if (flags & SCF_DO_STCLASS_AND) {
1360 if (data->start_class->flags & ANYOF_LOCALE) {
1361 ANYOF_CLASS_CLEAR(data->start_class,ANYOF_SPACE);
1362 for (value = 0; value < 256; value++)
1363 if (!isSPACE(value))
1364 ANYOF_BITMAP_CLEAR(data->start_class, value);
1365 }
1366 }
1367 else {
1368 data->start_class->flags |= ANYOF_LOCALE;
1369 ANYOF_CLASS_SET(data->start_class,ANYOF_NSPACE);
1370 }
1371 break;
1372 case DIGIT:
1373 if (flags & SCF_DO_STCLASS_AND) {
1374 ANYOF_CLASS_CLEAR(data->start_class,ANYOF_NDIGIT);
1375 for (value = 0; value < 256; value++)
1376 if (!isDIGIT(value))
1377 ANYOF_BITMAP_CLEAR(data->start_class, value);
1378 }
1379 else {
1380 if (data->start_class->flags & ANYOF_LOCALE)
1381 ANYOF_CLASS_SET(data->start_class,ANYOF_DIGIT);
1382 else {
1383 for (value = 0; value < 256; value++)
1384 if (isDIGIT(value))
b81d288d 1385 ANYOF_BITMAP_SET(data->start_class, value);
653099ff
GS
1386 }
1387 }
1388 break;
1389 case NDIGIT:
1390 if (flags & SCF_DO_STCLASS_AND) {
1391 ANYOF_CLASS_CLEAR(data->start_class,ANYOF_DIGIT);
1392 for (value = 0; value < 256; value++)
1393 if (isDIGIT(value))
1394 ANYOF_BITMAP_CLEAR(data->start_class, value);
1395 }
1396 else {
1397 if (data->start_class->flags & ANYOF_LOCALE)
1398 ANYOF_CLASS_SET(data->start_class,ANYOF_NDIGIT);
1399 else {
1400 for (value = 0; value < 256; value++)
1401 if (!isDIGIT(value))
b81d288d 1402 ANYOF_BITMAP_SET(data->start_class, value);
653099ff
GS
1403 }
1404 }
1405 break;
1406 }
1407 if (flags & SCF_DO_STCLASS_OR)
1408 cl_and(data->start_class, &and_with);
1409 flags &= ~SCF_DO_STCLASS;
1410 }
a0ed51b3 1411 }
22c35a8c 1412 else if (PL_regkind[(U8)OP(scan)] == EOL && flags & SCF_DO_SUBSTR) {
c277df42
IZ
1413 data->flags |= (OP(scan) == MEOL
1414 ? SF_BEFORE_MEOL
1415 : SF_BEFORE_SEOL);
a0ed51b3 1416 }
653099ff
GS
1417 else if ( PL_regkind[(U8)OP(scan)] == BRANCHJ
1418 /* Lookbehind, or need to calculate parens/evals/stclass: */
1419 && (scan->flags || data || (flags & SCF_DO_STCLASS))
c277df42 1420 && (OP(scan) == IFMATCH || OP(scan) == UNLESSM)) {
653099ff 1421 /* Lookahead/lookbehind */
cb434fcc 1422 I32 deltanext, minnext, fake = 0;
c277df42 1423 regnode *nscan;
653099ff
GS
1424 struct regnode_charclass_class intrnl;
1425 int f = 0;
c277df42
IZ
1426
1427 data_fake.flags = 0;
b81d288d 1428 if (data) {
2c2d71f5 1429 data_fake.whilem_c = data->whilem_c;
cb434fcc
IZ
1430 data_fake.last_closep = data->last_closep;
1431 }
1432 else
1433 data_fake.last_closep = &fake;
653099ff
GS
1434 if ( flags & SCF_DO_STCLASS && !scan->flags
1435 && OP(scan) == IFMATCH ) { /* Lookahead */
830247a4 1436 cl_init(pRExC_state, &intrnl);
653099ff 1437 data_fake.start_class = &intrnl;
e1901655 1438 f |= SCF_DO_STCLASS_AND;
653099ff 1439 }
e1901655
IZ
1440 if (flags & SCF_WHILEM_VISITED_POS)
1441 f |= SCF_WHILEM_VISITED_POS;
c277df42
IZ
1442 next = regnext(scan);
1443 nscan = NEXTOPER(NEXTOPER(scan));
830247a4 1444 minnext = study_chunk(pRExC_state, &nscan, &deltanext, last, &data_fake, f);
c277df42
IZ
1445 if (scan->flags) {
1446 if (deltanext) {
9baa0206 1447 vFAIL("Variable length lookbehind not implemented");
a0ed51b3
LW
1448 }
1449 else if (minnext > U8_MAX) {
9baa0206 1450 vFAIL2("Lookbehind longer than %"UVuf" not implemented", (UV)U8_MAX);
c277df42
IZ
1451 }
1452 scan->flags = minnext;
1453 }
1454 if (data && data_fake.flags & (SF_HAS_PAR|SF_IN_PAR))
1455 pars++;
405ff068 1456 if (data && (data_fake.flags & SF_HAS_EVAL))
c277df42 1457 data->flags |= SF_HAS_EVAL;
2c2d71f5
JH
1458 if (data)
1459 data->whilem_c = data_fake.whilem_c;
e1901655 1460 if (f & SCF_DO_STCLASS_AND) {
653099ff
GS
1461 int was = (data->start_class->flags & ANYOF_EOS);
1462
1463 cl_and(data->start_class, &intrnl);
1464 if (was)
1465 data->start_class->flags |= ANYOF_EOS;
1466 }
a0ed51b3
LW
1467 }
1468 else if (OP(scan) == OPEN) {
c277df42 1469 pars++;
a0ed51b3 1470 }
cb434fcc
IZ
1471 else if (OP(scan) == CLOSE) {
1472 if (ARG(scan) == is_par) {
1473 next = regnext(scan);
c277df42 1474
cb434fcc
IZ
1475 if ( next && (OP(next) != WHILEM) && next < last)
1476 is_par = 0; /* Disable optimization */
1477 }
1478 if (data)
1479 *(data->last_closep) = ARG(scan);
a0ed51b3
LW
1480 }
1481 else if (OP(scan) == EVAL) {
c277df42
IZ
1482 if (data)
1483 data->flags |= SF_HAS_EVAL;
1484 }
96776eda 1485 else if (OP(scan) == LOGICAL && scan->flags == 2) { /* Embedded follows */
0f5d15d6 1486 if (flags & SCF_DO_SUBSTR) {
830247a4 1487 scan_commit(pRExC_state,data);
0f5d15d6
IZ
1488 data->longest = &(data->longest_float);
1489 }
1490 is_inf = is_inf_internal = 1;
653099ff 1491 if (flags & SCF_DO_STCLASS_OR) /* Allow everything */
830247a4 1492 cl_anything(pRExC_state, data->start_class);
96776eda 1493 flags &= ~SCF_DO_STCLASS;
0f5d15d6 1494 }
c277df42
IZ
1495 /* Else: zero-length, ignore. */
1496 scan = regnext(scan);
1497 }
1498
1499 finish:
1500 *scanp = scan;
aca2d497 1501 *deltap = is_inf_internal ? I32_MAX : delta;
b81d288d 1502 if (flags & SCF_DO_SUBSTR && is_inf)
c277df42
IZ
1503 data->pos_delta = I32_MAX - data->pos_min;
1504 if (is_par > U8_MAX)
1505 is_par = 0;
1506 if (is_par && pars==1 && data) {
1507 data->flags |= SF_IN_PAR;
1508 data->flags &= ~SF_HAS_PAR;
a0ed51b3
LW
1509 }
1510 else if (pars && data) {
c277df42
IZ
1511 data->flags |= SF_HAS_PAR;
1512 data->flags &= ~SF_IN_PAR;
1513 }
653099ff
GS
1514 if (flags & SCF_DO_STCLASS_OR)
1515 cl_and(data->start_class, &and_with);
c277df42
IZ
1516 return min;
1517}
1518
76e3520e 1519STATIC I32
830247a4 1520S_add_data(pTHX_ RExC_state_t *pRExC_state, I32 n, char *s)
c277df42 1521{
830247a4 1522 if (RExC_rx->data) {
b81d288d
AB
1523 Renewc(RExC_rx->data,
1524 sizeof(*RExC_rx->data) + sizeof(void*) * (RExC_rx->data->count + n - 1),
c277df42 1525 char, struct reg_data);
830247a4
IZ
1526 Renew(RExC_rx->data->what, RExC_rx->data->count + n, U8);
1527 RExC_rx->data->count += n;
a0ed51b3
LW
1528 }
1529 else {
830247a4 1530 Newc(1207, RExC_rx->data, sizeof(*RExC_rx->data) + sizeof(void*) * (n - 1),
c277df42 1531 char, struct reg_data);
830247a4
IZ
1532 New(1208, RExC_rx->data->what, n, U8);
1533 RExC_rx->data->count = n;
c277df42 1534 }
830247a4
IZ
1535 Copy(s, RExC_rx->data->what + RExC_rx->data->count - n, n, U8);
1536 return RExC_rx->data->count - n;
c277df42
IZ
1537}
1538
d88dccdf 1539void
864dbfa3 1540Perl_reginitcolors(pTHX)
d88dccdf 1541{
d88dccdf
IZ
1542 int i = 0;
1543 char *s = PerlEnv_getenv("PERL_RE_COLORS");
b81d288d 1544
d88dccdf
IZ
1545 if (s) {
1546 PL_colors[0] = s = savepv(s);
1547 while (++i < 6) {
1548 s = strchr(s, '\t');
1549 if (s) {
1550 *s = '\0';
1551 PL_colors[i] = ++s;
1552 }
1553 else
c712d376 1554 PL_colors[i] = s = "";
d88dccdf
IZ
1555 }
1556 } else {
b81d288d 1557 while (i < 6)
d88dccdf
IZ
1558 PL_colors[i++] = "";
1559 }
1560 PL_colorset = 1;
1561}
1562
8615cb43 1563
a687059c 1564/*
e50aee73 1565 - pregcomp - compile a regular expression into internal code
a687059c
LW
1566 *
1567 * We can't allocate space until we know how big the compiled form will be,
1568 * but we can't compile it (and thus know how big it is) until we've got a
1569 * place to put the code. So we cheat: we compile it twice, once with code
1570 * generation turned off and size counting turned on, and once "for real".
1571 * This also means that we don't allocate space until we are sure that the
1572 * thing really will compile successfully, and we never have to move the
1573 * code and thus invalidate pointers into it. (Note that it has to be in
1574 * one piece because free() must be able to free it all.) [NB: not true in perl]
1575 *
1576 * Beware that the optimization-preparation code in here knows about some
1577 * of the structure of the compiled regexp. [I'll say.]
1578 */
1579regexp *
864dbfa3 1580Perl_pregcomp(pTHX_ char *exp, char *xend, PMOP *pm)
a687059c 1581{
a0d0e21e 1582 register regexp *r;
c277df42 1583 regnode *scan;
c277df42 1584 regnode *first;
a0d0e21e 1585 I32 flags;
a0d0e21e
LW
1586 I32 minlen = 0;
1587 I32 sawplus = 0;
1588 I32 sawopen = 0;
2c2d71f5 1589 scan_data_t data;
830247a4
IZ
1590 RExC_state_t RExC_state;
1591 RExC_state_t *pRExC_state = &RExC_state;
a0d0e21e
LW
1592
1593 if (exp == NULL)
c277df42 1594 FAIL("NULL regexp argument");
a0d0e21e 1595
830247a4 1596 /* XXXX This looks very suspicious... */
1aa99e6b
IH
1597 if (pm->op_pmdynflags & PMdf_CMP_UTF8)
1598 RExC_utf8 = 1;
a0ed51b3 1599 else
1aa99e6b 1600 RExC_utf8 = 0;
a0ed51b3 1601
5cfc7842 1602 RExC_precomp = exp;
35ef4773 1603 DEBUG_r(if (!PL_colorset) reginitcolors());
2c2d71f5 1604 DEBUG_r(PerlIO_printf(Perl_debug_log, "%sCompiling REx%s `%s%*s%s'\n",
d88dccdf 1605 PL_colors[4],PL_colors[5],PL_colors[0],
830247a4
IZ
1606 (int)(xend - exp), RExC_precomp, PL_colors[1]));
1607 RExC_flags16 = pm->op_pmflags;
1608 RExC_sawback = 0;
bbce6d69 1609
830247a4
IZ
1610 RExC_seen = 0;
1611 RExC_seen_zerolen = *exp == '^' ? -1 : 0;
1612 RExC_seen_evals = 0;
1613 RExC_extralen = 0;
c277df42 1614
bbce6d69 1615 /* First pass: determine size, legality. */
830247a4
IZ
1616 RExC_parse = exp;
1617 RExC_end = xend;
1618 RExC_naughty = 0;
1619 RExC_npar = 1;
1620 RExC_size = 0L;
1621 RExC_emit = &PL_regdummy;
1622 RExC_whilem_seen = 0;
85ddcde9
JH
1623#if 0 /* REGC() is (currently) a NOP at the first pass.
1624 * Clever compilers notice this and complain. --jhi */
830247a4 1625 REGC((U8)REG_MAGIC, (char*)RExC_emit);
85ddcde9 1626#endif
830247a4 1627 if (reg(pRExC_state, 0, &flags) == NULL) {
830247a4 1628 RExC_precomp = Nullch;
a0d0e21e
LW
1629 return(NULL);
1630 }
830247a4 1631 DEBUG_r(PerlIO_printf(Perl_debug_log, "size %"IVdf" ", (IV)RExC_size));
c277df42 1632
c277df42
IZ
1633 /* Small enough for pointer-storage convention?
1634 If extralen==0, this means that we will not need long jumps. */
830247a4
IZ
1635 if (RExC_size >= 0x10000L && RExC_extralen)
1636 RExC_size += RExC_extralen;
c277df42 1637 else
830247a4
IZ
1638 RExC_extralen = 0;
1639 if (RExC_whilem_seen > 15)
1640 RExC_whilem_seen = 15;
a0d0e21e 1641
bbce6d69 1642 /* Allocate space and initialize. */
830247a4 1643 Newc(1001, r, sizeof(regexp) + (unsigned)RExC_size * sizeof(regnode),
c277df42 1644 char, regexp);
a0d0e21e 1645 if (r == NULL)
b45f050a
JF
1646 FAIL("Regexp out of space");
1647
0f79a09d
GS
1648#ifdef DEBUGGING
1649 /* avoid reading uninitialized memory in DEBUGGING code in study_chunk() */
830247a4 1650 Zero(r, sizeof(regexp) + (unsigned)RExC_size * sizeof(regnode), char);
0f79a09d 1651#endif
c277df42 1652 r->refcnt = 1;
bbce6d69 1653 r->prelen = xend - exp;
5cfc7842 1654 r->precomp = savepvn(RExC_precomp, r->prelen);
cf93c79d
IZ
1655 r->subbeg = NULL;
1656 r->reganch = pm->op_pmflags & PMf_COMPILETIME;
830247a4 1657 r->nparens = RExC_npar - 1; /* set early to validate backrefs */
4327152a
IZ
1658
1659 r->substrs = 0; /* Useful during FAIL. */
1660 r->startp = 0; /* Useful during FAIL. */
1661 r->endp = 0; /* Useful during FAIL. */
1662
830247a4 1663 RExC_rx = r;
bbce6d69 1664
1665 /* Second pass: emit code. */
830247a4
IZ
1666 RExC_parse = exp;
1667 RExC_end = xend;
1668 RExC_naughty = 0;
1669 RExC_npar = 1;
1670 RExC_emit = r->program;
2cd61cdb 1671 /* Store the count of eval-groups for security checks: */
830247a4
IZ
1672 RExC_emit->next_off = ((RExC_seen_evals > U16_MAX) ? U16_MAX : RExC_seen_evals);
1673 REGC((U8)REG_MAGIC, (char*) RExC_emit++);
c277df42 1674 r->data = 0;
830247a4 1675 if (reg(pRExC_state, 0, &flags) == NULL)
a0d0e21e
LW
1676 return(NULL);
1677
1678 /* Dig out information for optimizations. */
cf93c79d 1679 r->reganch = pm->op_pmflags & PMf_COMPILETIME; /* Again? */
830247a4 1680 pm->op_pmflags = RExC_flags16;
a0ed51b3
LW
1681 if (UTF)
1682 r->reganch |= ROPT_UTF8;
c277df42 1683 r->regstclass = NULL;
830247a4 1684 if (RExC_naughty >= 10) /* Probably an expensive pattern. */
a0ed51b3 1685 r->reganch |= ROPT_NAUGHTY;
c277df42 1686 scan = r->program + 1; /* First BRANCH. */
2779dcf1
IZ
1687
1688 /* XXXX To minimize changes to RE engine we always allocate
1689 3-units-long substrs field. */
1690 Newz(1004, r->substrs, 1, struct reg_substr_data);
1691
2c2d71f5 1692 StructCopy(&zero_scan_data, &data, scan_data_t);
653099ff 1693 /* XXXX Should not we check for something else? Usually it is OPEN1... */
c277df42 1694 if (OP(scan) != BRANCH) { /* Only one top-level choice. */
c277df42 1695 I32 fake;
c5254dd6 1696 STRLEN longest_float_length, longest_fixed_length;
653099ff
GS
1697 struct regnode_charclass_class ch_class;
1698 int stclass_flag;
cb434fcc 1699 I32 last_close = 0;
a0d0e21e
LW
1700
1701 first = scan;
c277df42 1702 /* Skip introductions and multiplicators >= 1. */
a0d0e21e 1703 while ((OP(first) == OPEN && (sawopen = 1)) ||
653099ff 1704 /* An OR of *one* alternative - should not happen now. */
a0d0e21e
LW
1705 (OP(first) == BRANCH && OP(regnext(first)) != BRANCH) ||
1706 (OP(first) == PLUS) ||
1707 (OP(first) == MINMOD) ||
653099ff 1708 /* An {n,m} with n>0 */
22c35a8c 1709 (PL_regkind[(U8)OP(first)] == CURLY && ARG1(first) > 0) ) {
a0d0e21e
LW
1710 if (OP(first) == PLUS)
1711 sawplus = 1;
1712 else
1713 first += regarglen[(U8)OP(first)];
1714 first = NEXTOPER(first);
a687059c
LW
1715 }
1716
a0d0e21e
LW
1717 /* Starting-point info. */
1718 again:
653099ff 1719 if (PL_regkind[(U8)OP(first)] == EXACT) {
1aa99e6b
IH
1720 if (OP(first) == EXACT)
1721 ; /* Empty, get anchored substr later. */
1722 else if ((OP(first) == EXACTF || OP(first) == EXACTFL))
b3c9acc1
IZ
1723 r->regstclass = first;
1724 }
653099ff 1725 else if (strchr((char*)PL_simple,OP(first)))
a0d0e21e 1726 r->regstclass = first;
22c35a8c
GS
1727 else if (PL_regkind[(U8)OP(first)] == BOUND ||
1728 PL_regkind[(U8)OP(first)] == NBOUND)
a0d0e21e 1729 r->regstclass = first;
22c35a8c 1730 else if (PL_regkind[(U8)OP(first)] == BOL) {
cad2e5aa
JH
1731 r->reganch |= (OP(first) == MBOL
1732 ? ROPT_ANCH_MBOL
1733 : (OP(first) == SBOL
1734 ? ROPT_ANCH_SBOL
1735 : ROPT_ANCH_BOL));
a0d0e21e 1736 first = NEXTOPER(first);
774d564b 1737 goto again;
1738 }
1739 else if (OP(first) == GPOS) {
1740 r->reganch |= ROPT_ANCH_GPOS;
1741 first = NEXTOPER(first);
1742 goto again;
a0d0e21e
LW
1743 }
1744 else if ((OP(first) == STAR &&
22c35a8c 1745 PL_regkind[(U8)OP(NEXTOPER(first))] == REG_ANY) &&
a0d0e21e
LW
1746 !(r->reganch & ROPT_ANCH) )
1747 {
1748 /* turn .* into ^.* with an implied $*=1 */
cad2e5aa
JH
1749 int type = OP(NEXTOPER(first));
1750
ffc61ed2 1751 if (type == REG_ANY)
cad2e5aa
JH
1752 type = ROPT_ANCH_MBOL;
1753 else
1754 type = ROPT_ANCH_SBOL;
1755
1756 r->reganch |= type | ROPT_IMPLICIT;
a0d0e21e 1757 first = NEXTOPER(first);
774d564b 1758 goto again;
a0d0e21e 1759 }
b81d288d 1760 if (sawplus && (!sawopen || !RExC_sawback)
830247a4 1761 && !(RExC_seen & REG_SEEN_EVAL)) /* May examine pos and $& */
cad2e5aa
JH
1762 /* x+ must match at the 1st pos of run of x's */
1763 r->reganch |= ROPT_SKIP;
a0d0e21e 1764
c277df42 1765 /* Scan is after the zeroth branch, first is atomic matcher. */
b81d288d 1766 DEBUG_r(PerlIO_printf(Perl_debug_log, "first at %"IVdf"\n",
b900a521 1767 (IV)(first - scan + 1)));
a0d0e21e
LW
1768 /*
1769 * If there's something expensive in the r.e., find the
1770 * longest literal string that must appear and make it the
1771 * regmust. Resolve ties in favor of later strings, since
1772 * the regstart check works with the beginning of the r.e.
1773 * and avoiding duplication strengthens checking. Not a
1774 * strong reason, but sufficient in the absence of others.
1775 * [Now we resolve ties in favor of the earlier string if
c277df42 1776 * it happens that c_offset_min has been invalidated, since the
a0d0e21e
LW
1777 * earlier string may buy us something the later one won't.]
1778 */
a0d0e21e 1779 minlen = 0;
a687059c 1780
79cb57f6
GS
1781 data.longest_fixed = newSVpvn("",0);
1782 data.longest_float = newSVpvn("",0);
1783 data.last_found = newSVpvn("",0);
c277df42
IZ
1784 data.longest = &(data.longest_fixed);
1785 first = scan;
653099ff 1786 if (!r->regstclass) {
830247a4 1787 cl_init(pRExC_state, &ch_class);
653099ff
GS
1788 data.start_class = &ch_class;
1789 stclass_flag = SCF_DO_STCLASS_AND;
1790 } else /* XXXX Check for BOUND? */
1791 stclass_flag = 0;
cb434fcc 1792 data.last_closep = &last_close;
653099ff 1793
830247a4 1794 minlen = study_chunk(pRExC_state, &first, &fake, scan + RExC_size, /* Up to end */
e1901655 1795 &data, SCF_DO_SUBSTR | SCF_WHILEM_VISITED_POS | stclass_flag);
830247a4 1796 if ( RExC_npar == 1 && data.longest == &(data.longest_fixed)
b81d288d 1797 && data.last_start_min == 0 && data.last_end > 0
830247a4
IZ
1798 && !RExC_seen_zerolen
1799 && (!(RExC_seen & REG_SEEN_GPOS) || (r->reganch & ROPT_ANCH_GPOS)))
c277df42 1800 r->reganch |= ROPT_CHECK_ALL;
830247a4 1801 scan_commit(pRExC_state, &data);
c277df42
IZ
1802 SvREFCNT_dec(data.last_found);
1803
a0ed51b3 1804 longest_float_length = CHR_SVLEN(data.longest_float);
c5254dd6 1805 if (longest_float_length
c277df42
IZ
1806 || (data.flags & SF_FL_BEFORE_EOL
1807 && (!(data.flags & SF_FL_BEFORE_MEOL)
830247a4 1808 || (RExC_flags16 & PMf_MULTILINE)))) {
cf93c79d
IZ
1809 int t;
1810
a0ed51b3 1811 if (SvCUR(data.longest_fixed) /* ok to leave SvCUR */
aca2d497
IZ
1812 && data.offset_fixed == data.offset_float_min
1813 && SvCUR(data.longest_fixed) == SvCUR(data.longest_float))
1814 goto remove_float; /* As in (a)+. */
1815
c277df42
IZ
1816 r->float_substr = data.longest_float;
1817 r->float_min_offset = data.offset_float_min;
1818 r->float_max_offset = data.offset_float_max;
cf93c79d
IZ
1819 t = (data.flags & SF_FL_BEFORE_EOL /* Can't have SEOL and MULTI */
1820 && (!(data.flags & SF_FL_BEFORE_MEOL)
830247a4 1821 || (RExC_flags16 & PMf_MULTILINE)));
cf93c79d 1822 fbm_compile(r->float_substr, t ? FBMcf_TAIL : 0);
a0ed51b3
LW
1823 }
1824 else {
aca2d497 1825 remove_float:
c277df42
IZ
1826 r->float_substr = Nullsv;
1827 SvREFCNT_dec(data.longest_float);
c5254dd6 1828 longest_float_length = 0;
a0d0e21e 1829 }
c277df42 1830
a0ed51b3 1831 longest_fixed_length = CHR_SVLEN(data.longest_fixed);
c5254dd6 1832 if (longest_fixed_length
c277df42
IZ
1833 || (data.flags & SF_FIX_BEFORE_EOL /* Cannot have SEOL and MULTI */
1834 && (!(data.flags & SF_FIX_BEFORE_MEOL)
830247a4 1835 || (RExC_flags16 & PMf_MULTILINE)))) {
cf93c79d
IZ
1836 int t;
1837
c277df42
IZ
1838 r->anchored_substr = data.longest_fixed;
1839 r->anchored_offset = data.offset_fixed;
cf93c79d
IZ
1840 t = (data.flags & SF_FIX_BEFORE_EOL /* Can't have SEOL and MULTI */
1841 && (!(data.flags & SF_FIX_BEFORE_MEOL)
830247a4 1842 || (RExC_flags16 & PMf_MULTILINE)));
cf93c79d 1843 fbm_compile(r->anchored_substr, t ? FBMcf_TAIL : 0);
a0ed51b3
LW
1844 }
1845 else {
c277df42
IZ
1846 r->anchored_substr = Nullsv;
1847 SvREFCNT_dec(data.longest_fixed);
c5254dd6 1848 longest_fixed_length = 0;
a0d0e21e 1849 }
b81d288d 1850 if (r->regstclass
ffc61ed2 1851 && (OP(r->regstclass) == REG_ANY || OP(r->regstclass) == SANY))
653099ff
GS
1852 r->regstclass = NULL;
1853 if ((!r->anchored_substr || r->anchored_offset) && stclass_flag
1854 && !(data.start_class->flags & ANYOF_EOS)
1855 && !cl_is_anything(data.start_class)) {
1856 SV *sv;
830247a4 1857 I32 n = add_data(pRExC_state, 1, "f");
653099ff 1858
b81d288d 1859 New(1006, RExC_rx->data->data[n], 1,
653099ff
GS
1860 struct regnode_charclass_class);
1861 StructCopy(data.start_class,
830247a4 1862 (struct regnode_charclass_class*)RExC_rx->data->data[n],
653099ff 1863 struct regnode_charclass_class);
830247a4 1864 r->regstclass = (regnode*)RExC_rx->data->data[n];
653099ff 1865 r->reganch &= ~ROPT_SKIP; /* Used in find_byclass(). */
ffc61ed2 1866 PL_regdata = r->data; /* for regprop() */
653099ff
GS
1867 DEBUG_r((sv = sv_newmortal(),
1868 regprop(sv, (regnode*)data.start_class),
894356b3 1869 PerlIO_printf(Perl_debug_log, "synthetic stclass `%s'.\n",
653099ff
GS
1870 SvPVX(sv))));
1871 }
c277df42
IZ
1872
1873 /* A temporary algorithm prefers floated substr to fixed one to dig more info. */
c5254dd6 1874 if (longest_fixed_length > longest_float_length) {
c277df42
IZ
1875 r->check_substr = r->anchored_substr;
1876 r->check_offset_min = r->check_offset_max = r->anchored_offset;
1877 if (r->reganch & ROPT_ANCH_SINGLE)
1878 r->reganch |= ROPT_NOSCAN;
a0ed51b3
LW
1879 }
1880 else {
c277df42
IZ
1881 r->check_substr = r->float_substr;
1882 r->check_offset_min = data.offset_float_min;
1883 r->check_offset_max = data.offset_float_max;
a0d0e21e 1884 }
30382c73
IZ
1885 /* XXXX Currently intuiting is not compatible with ANCH_GPOS.
1886 This should be changed ASAP! */
1887 if (r->check_substr && !(r->reganch & ROPT_ANCH_GPOS)) {
cad2e5aa
JH
1888 r->reganch |= RE_USE_INTUIT;
1889 if (SvTAIL(r->check_substr))
1890 r->reganch |= RE_INTUIT_TAIL;
1891 }
a0ed51b3
LW
1892 }
1893 else {
c277df42
IZ
1894 /* Several toplevels. Best we can is to set minlen. */
1895 I32 fake;
653099ff 1896 struct regnode_charclass_class ch_class;
cb434fcc 1897 I32 last_close = 0;
c277df42
IZ
1898
1899 DEBUG_r(PerlIO_printf(Perl_debug_log, "\n"));
1900 scan = r->program + 1;
830247a4 1901 cl_init(pRExC_state, &ch_class);
653099ff 1902 data.start_class = &ch_class;
cb434fcc 1903 data.last_closep = &last_close;
e1901655 1904 minlen = study_chunk(pRExC_state, &scan, &fake, scan + RExC_size, &data, SCF_DO_STCLASS_AND|SCF_WHILEM_VISITED_POS);
c277df42 1905 r->check_substr = r->anchored_substr = r->float_substr = Nullsv;
653099ff
GS
1906 if (!(data.start_class->flags & ANYOF_EOS)
1907 && !cl_is_anything(data.start_class)) {
1908 SV *sv;
830247a4 1909 I32 n = add_data(pRExC_state, 1, "f");
653099ff 1910
b81d288d 1911 New(1006, RExC_rx->data->data[n], 1,
653099ff
GS
1912 struct regnode_charclass_class);
1913 StructCopy(data.start_class,
830247a4 1914 (struct regnode_charclass_class*)RExC_rx->data->data[n],
653099ff 1915 struct regnode_charclass_class);
830247a4 1916 r->regstclass = (regnode*)RExC_rx->data->data[n];
653099ff
GS
1917 r->reganch &= ~ROPT_SKIP; /* Used in find_byclass(). */
1918 DEBUG_r((sv = sv_newmortal(),
1919 regprop(sv, (regnode*)data.start_class),
894356b3 1920 PerlIO_printf(Perl_debug_log, "synthetic stclass `%s'.\n",
653099ff
GS
1921 SvPVX(sv))));
1922 }
a0d0e21e
LW
1923 }
1924
a0d0e21e 1925 r->minlen = minlen;
b81d288d 1926 if (RExC_seen & REG_SEEN_GPOS)
c277df42 1927 r->reganch |= ROPT_GPOS_SEEN;
830247a4 1928 if (RExC_seen & REG_SEEN_LOOKBEHIND)
c277df42 1929 r->reganch |= ROPT_LOOKBEHIND_SEEN;
830247a4 1930 if (RExC_seen & REG_SEEN_EVAL)
ce862d02 1931 r->reganch |= ROPT_EVAL_SEEN;
3baa4c62
JH
1932 if (RExC_seen & REG_SEEN_SANY)
1933 r->reganch |= ROPT_SANY_SEEN;
830247a4
IZ
1934 Newz(1002, r->startp, RExC_npar, I32);
1935 Newz(1002, r->endp, RExC_npar, I32);
ffc61ed2 1936 PL_regdata = r->data; /* for regprop() */
a0d0e21e
LW
1937 DEBUG_r(regdump(r));
1938 return(r);
a687059c
LW
1939}
1940
1941/*
1942 - reg - regular expression, i.e. main body or parenthesized thing
1943 *
1944 * Caller must absorb opening parenthesis.
1945 *
1946 * Combining parenthesis handling with the base level of regular expression
1947 * is a trifle forced, but the need to tie the tails of the branches to what
1948 * follows makes it hard to avoid.
1949 */
76e3520e 1950STATIC regnode *
830247a4 1951S_reg(pTHX_ RExC_state_t *pRExC_state, I32 paren, I32 *flagp)
c277df42 1952 /* paren: Parenthesized? 0=top, 1=(, inside: changed to letter. */
a687059c 1953{
c277df42
IZ
1954 register regnode *ret; /* Will be the head of the group. */
1955 register regnode *br;
1956 register regnode *lastbr;
1957 register regnode *ender = 0;
a0d0e21e 1958 register I32 parno = 0;
830247a4
IZ
1959 I32 flags, oregflags = RExC_flags16, have_branch = 0, open = 0;
1960 char *oregcomp_parse = RExC_parse;
c277df42 1961 char c;
a0d0e21e 1962
821b33a5 1963 *flagp = 0; /* Tentatively. */
a0d0e21e
LW
1964
1965 /* Make an OPEN node, if parenthesized. */
1966 if (paren) {
830247a4 1967 if (*RExC_parse == '?') {
ca9dfc88
IZ
1968 U16 posflags = 0, negflags = 0;
1969 U16 *flagsp = &posflags;
0f5d15d6 1970 int logical = 0;
830247a4 1971 char *seqstart = RExC_parse;
ca9dfc88 1972
830247a4
IZ
1973 RExC_parse++;
1974 paren = *RExC_parse++;
c277df42 1975 ret = NULL; /* For look-ahead/behind. */
a0d0e21e 1976 switch (paren) {
c277df42 1977 case '<':
830247a4 1978 RExC_seen |= REG_SEEN_LOOKBEHIND;
b81d288d 1979 if (*RExC_parse == '!')
c277df42 1980 paren = ',';
b81d288d 1981 if (*RExC_parse != '=' && *RExC_parse != '!')
c277df42 1982 goto unknown;
830247a4 1983 RExC_parse++;
a0d0e21e
LW
1984 case '=':
1985 case '!':
830247a4 1986 RExC_seen_zerolen++;
c277df42
IZ
1987 case ':':
1988 case '>':
a0d0e21e
LW
1989 break;
1990 case '$':
1991 case '@':
8615cb43 1992 vFAIL2("Sequence (?%c...) not implemented", (int)paren);
a0d0e21e
LW
1993 break;
1994 case '#':
830247a4
IZ
1995 while (*RExC_parse && *RExC_parse != ')')
1996 RExC_parse++;
1997 if (*RExC_parse != ')')
c277df42 1998 FAIL("Sequence (?#... not terminated");
830247a4 1999 nextchar(pRExC_state);
a0d0e21e
LW
2000 *flagp = TRYAGAIN;
2001 return NULL;
8c8ad484 2002 case 'p':
d7e9e385 2003 if (SIZE_ONLY)
830247a4 2004 vWARN(RExC_parse, "(?p{}) is deprecated - use (??{})");
8c8ad484 2005 /* FALL THROUGH*/
14455d6c 2006 case '?':
0f5d15d6 2007 logical = 1;
830247a4 2008 paren = *RExC_parse++;
0f5d15d6 2009 /* FALL THROUGH */
c277df42
IZ
2010 case '{':
2011 {
c277df42
IZ
2012 I32 count = 1, n = 0;
2013 char c;
830247a4 2014 char *s = RExC_parse;
c277df42
IZ
2015 SV *sv;
2016 OP_4tree *sop, *rop;
2017
830247a4
IZ
2018 RExC_seen_zerolen++;
2019 RExC_seen |= REG_SEEN_EVAL;
2020 while (count && (c = *RExC_parse)) {
2021 if (c == '\\' && RExC_parse[1])
2022 RExC_parse++;
b81d288d 2023 else if (c == '{')
c277df42 2024 count++;
b81d288d 2025 else if (c == '}')
c277df42 2026 count--;
830247a4 2027 RExC_parse++;
c277df42 2028 }
830247a4 2029 if (*RExC_parse != ')')
b45f050a 2030 {
b81d288d 2031 RExC_parse = s;
b45f050a
JF
2032 vFAIL("Sequence (?{...}) not terminated or not {}-balanced");
2033 }
c277df42
IZ
2034 if (!SIZE_ONLY) {
2035 AV *av;
b81d288d
AB
2036
2037 if (RExC_parse - 1 - s)
830247a4 2038 sv = newSVpvn(s, RExC_parse - 1 - s);
c277df42 2039 else
79cb57f6 2040 sv = newSVpvn("", 0);
c277df42 2041
569233ed
SB
2042 ENTER;
2043 Perl_save_re_context(aTHX);
c277df42 2044 rop = sv_compile_2op(sv, &sop, "re", &av);
569233ed 2045 LEAVE;
c277df42 2046
830247a4
IZ
2047 n = add_data(pRExC_state, 3, "nop");
2048 RExC_rx->data->data[n] = (void*)rop;
2049 RExC_rx->data->data[n+1] = (void*)sop;
2050 RExC_rx->data->data[n+2] = (void*)av;
c277df42 2051 SvREFCNT_dec(sv);
a0ed51b3 2052 }
e24b16f9 2053 else { /* First pass */
830247a4 2054 if (PL_reginterp_cnt < ++RExC_seen_evals
e24b16f9 2055 && PL_curcop != &PL_compiling)
2cd61cdb
IZ
2056 /* No compiled RE interpolated, has runtime
2057 components ===> unsafe. */
2058 FAIL("Eval-group not allowed at runtime, use re 'eval'");
3280af22 2059 if (PL_tainted)
cc6b7395 2060 FAIL("Eval-group in insecure regular expression");
c277df42
IZ
2061 }
2062
830247a4 2063 nextchar(pRExC_state);
0f5d15d6 2064 if (logical) {
830247a4 2065 ret = reg_node(pRExC_state, LOGICAL);
0f5d15d6
IZ
2066 if (!SIZE_ONLY)
2067 ret->flags = 2;
830247a4 2068 regtail(pRExC_state, ret, reganode(pRExC_state, EVAL, n));
0f5d15d6
IZ
2069 return ret;
2070 }
830247a4 2071 return reganode(pRExC_state, EVAL, n);
c277df42
IZ
2072 }
2073 case '(':
2074 {
830247a4 2075 if (RExC_parse[0] == '?') {
b81d288d
AB
2076 if (RExC_parse[1] == '=' || RExC_parse[1] == '!'
2077 || RExC_parse[1] == '<'
830247a4 2078 || RExC_parse[1] == '{') { /* Lookahead or eval. */
c277df42
IZ
2079 I32 flag;
2080
830247a4 2081 ret = reg_node(pRExC_state, LOGICAL);
0f5d15d6
IZ
2082 if (!SIZE_ONLY)
2083 ret->flags = 1;
830247a4 2084 regtail(pRExC_state, ret, reg(pRExC_state, 1, &flag));
c277df42 2085 goto insert_if;
b81d288d 2086 }
a0ed51b3 2087 }
830247a4
IZ
2088 else if (RExC_parse[0] >= '1' && RExC_parse[0] <= '9' ) {
2089 parno = atoi(RExC_parse++);
c277df42 2090
830247a4
IZ
2091 while (isDIGIT(*RExC_parse))
2092 RExC_parse++;
2093 ret = reganode(pRExC_state, GROUPP, parno);
2094 if ((c = *nextchar(pRExC_state)) != ')')
b45f050a 2095 vFAIL("Switch condition not recognized");
c277df42 2096 insert_if:
830247a4
IZ
2097 regtail(pRExC_state, ret, reganode(pRExC_state, IFTHEN, 0));
2098 br = regbranch(pRExC_state, &flags, 1);
c277df42 2099 if (br == NULL)
830247a4 2100 br = reganode(pRExC_state, LONGJMP, 0);
c277df42 2101 else
830247a4
IZ
2102 regtail(pRExC_state, br, reganode(pRExC_state, LONGJMP, 0));
2103 c = *nextchar(pRExC_state);
d1b80229
IZ
2104 if (flags&HASWIDTH)
2105 *flagp |= HASWIDTH;
c277df42 2106 if (c == '|') {
830247a4
IZ
2107 lastbr = reganode(pRExC_state, IFTHEN, 0); /* Fake one for optimizer. */
2108 regbranch(pRExC_state, &flags, 1);
2109 regtail(pRExC_state, ret, lastbr);
d1b80229
IZ
2110 if (flags&HASWIDTH)
2111 *flagp |= HASWIDTH;
830247a4 2112 c = *nextchar(pRExC_state);
a0ed51b3
LW
2113 }
2114 else
c277df42
IZ
2115 lastbr = NULL;
2116 if (c != ')')
8615cb43 2117 vFAIL("Switch (?(condition)... contains too many branches");
830247a4
IZ
2118 ender = reg_node(pRExC_state, TAIL);
2119 regtail(pRExC_state, br, ender);
c277df42 2120 if (lastbr) {
830247a4
IZ
2121 regtail(pRExC_state, lastbr, ender);
2122 regtail(pRExC_state, NEXTOPER(NEXTOPER(lastbr)), ender);
a0ed51b3
LW
2123 }
2124 else
830247a4 2125 regtail(pRExC_state, ret, ender);
c277df42 2126 return ret;
a0ed51b3
LW
2127 }
2128 else {
830247a4 2129 vFAIL2("Unknown switch condition (?(%.2s", RExC_parse);
c277df42
IZ
2130 }
2131 }
1b1626e4 2132 case 0:
830247a4 2133 RExC_parse--; /* for vFAIL to print correctly */
8615cb43 2134 vFAIL("Sequence (? incomplete");
1b1626e4 2135 break;
a0d0e21e 2136 default:
830247a4 2137 --RExC_parse;
ca9dfc88 2138 parse_flags:
830247a4
IZ
2139 while (*RExC_parse && strchr("iogcmsx", *RExC_parse)) {
2140 if (*RExC_parse != 'o')
2141 pmflag(flagsp, *RExC_parse);
2142 ++RExC_parse;
ca9dfc88 2143 }
830247a4 2144 if (*RExC_parse == '-') {
ca9dfc88 2145 flagsp = &negflags;
830247a4 2146 ++RExC_parse;
ca9dfc88 2147 goto parse_flags;
48c036b1 2148 }
830247a4
IZ
2149 RExC_flags16 |= posflags;
2150 RExC_flags16 &= ~negflags;
2151 if (*RExC_parse == ':') {
2152 RExC_parse++;
ca9dfc88
IZ
2153 paren = ':';
2154 break;
2155 }
c277df42 2156 unknown:
830247a4
IZ
2157 if (*RExC_parse != ')') {
2158 RExC_parse++;
2159 vFAIL3("Sequence (%.*s...) not recognized", RExC_parse-seqstart, seqstart);
b45f050a 2160 }
830247a4 2161 nextchar(pRExC_state);
a0d0e21e
LW
2162 *flagp = TRYAGAIN;
2163 return NULL;
2164 }
2165 }
2166 else {
830247a4
IZ
2167 parno = RExC_npar;
2168 RExC_npar++;
2169 ret = reganode(pRExC_state, OPEN, parno);
c277df42 2170 open = 1;
a0d0e21e 2171 }
a0ed51b3
LW
2172 }
2173 else
a0d0e21e
LW
2174 ret = NULL;
2175
2176 /* Pick up the branches, linking them together. */
830247a4 2177 br = regbranch(pRExC_state, &flags, 1);
a0d0e21e
LW
2178 if (br == NULL)
2179 return(NULL);
830247a4
IZ
2180 if (*RExC_parse == '|') {
2181 if (!SIZE_ONLY && RExC_extralen) {
2182 reginsert(pRExC_state, BRANCHJ, br);
a0ed51b3
LW
2183 }
2184 else
830247a4 2185 reginsert(pRExC_state, BRANCH, br);
c277df42
IZ
2186 have_branch = 1;
2187 if (SIZE_ONLY)
830247a4 2188 RExC_extralen += 1; /* For BRANCHJ-BRANCH. */
a0ed51b3
LW
2189 }
2190 else if (paren == ':') {
c277df42
IZ
2191 *flagp |= flags&SIMPLE;
2192 }
2193 if (open) { /* Starts with OPEN. */
830247a4 2194 regtail(pRExC_state, ret, br); /* OPEN -> first. */
a0ed51b3
LW
2195 }
2196 else if (paren != '?') /* Not Conditional */
a0d0e21e 2197 ret = br;
821b33a5
IZ
2198 if (flags&HASWIDTH)
2199 *flagp |= HASWIDTH;
a0d0e21e 2200 *flagp |= flags&SPSTART;
c277df42 2201 lastbr = br;
830247a4
IZ
2202 while (*RExC_parse == '|') {
2203 if (!SIZE_ONLY && RExC_extralen) {
2204 ender = reganode(pRExC_state, LONGJMP,0);
2205 regtail(pRExC_state, NEXTOPER(NEXTOPER(lastbr)), ender); /* Append to the previous. */
c277df42
IZ
2206 }
2207 if (SIZE_ONLY)
830247a4
IZ
2208 RExC_extralen += 2; /* Account for LONGJMP. */
2209 nextchar(pRExC_state);
2210 br = regbranch(pRExC_state, &flags, 0);
a687059c 2211 if (br == NULL)
a0d0e21e 2212 return(NULL);
830247a4 2213 regtail(pRExC_state, lastbr, br); /* BRANCH -> BRANCH. */
c277df42 2214 lastbr = br;
821b33a5
IZ
2215 if (flags&HASWIDTH)
2216 *flagp |= HASWIDTH;
a687059c 2217 *flagp |= flags&SPSTART;
a0d0e21e
LW
2218 }
2219
c277df42
IZ
2220 if (have_branch || paren != ':') {
2221 /* Make a closing node, and hook it on the end. */
2222 switch (paren) {
2223 case ':':
830247a4 2224 ender = reg_node(pRExC_state, TAIL);
c277df42
IZ
2225 break;
2226 case 1:
830247a4 2227 ender = reganode(pRExC_state, CLOSE, parno);
c277df42
IZ
2228 break;
2229 case '<':
c277df42
IZ
2230 case ',':
2231 case '=':
2232 case '!':
c277df42 2233 *flagp &= ~HASWIDTH;
821b33a5
IZ
2234 /* FALL THROUGH */
2235 case '>':
830247a4 2236 ender = reg_node(pRExC_state, SUCCEED);
c277df42
IZ
2237 break;
2238 case 0:
830247a4 2239 ender = reg_node(pRExC_state, END);
c277df42
IZ
2240 break;
2241 }
830247a4 2242 regtail(pRExC_state, lastbr, ender);
a0d0e21e 2243
c277df42
IZ
2244 if (have_branch) {
2245 /* Hook the tails of the branches to the closing node. */
2246 for (br = ret; br != NULL; br = regnext(br)) {
830247a4 2247 regoptail(pRExC_state, br, ender);
c277df42
IZ
2248 }
2249 }
a0d0e21e 2250 }
c277df42
IZ
2251
2252 {
2253 char *p;
2254 static char parens[] = "=!<,>";
2255
2256 if (paren && (p = strchr(parens, paren))) {
2257 int node = ((p - parens) % 2) ? UNLESSM : IFMATCH;
2258 int flag = (p - parens) > 1;
2259
2260 if (paren == '>')
2261 node = SUSPEND, flag = 0;
830247a4 2262 reginsert(pRExC_state, node,ret);
c277df42 2263 ret->flags = flag;
830247a4 2264 regtail(pRExC_state, ret, reg_node(pRExC_state, TAIL));
c277df42 2265 }
a0d0e21e
LW
2266 }
2267
2268 /* Check for proper termination. */
ce3e6498 2269 if (paren) {
830247a4
IZ
2270 RExC_flags16 = oregflags;
2271 if (RExC_parse >= RExC_end || *nextchar(pRExC_state) != ')') {
2272 RExC_parse = oregcomp_parse;
380a0633 2273 vFAIL("Unmatched (");
ce3e6498 2274 }
a0ed51b3 2275 }
830247a4
IZ
2276 else if (!paren && RExC_parse < RExC_end) {
2277 if (*RExC_parse == ')') {
2278 RExC_parse++;
380a0633 2279 vFAIL("Unmatched )");
a0ed51b3
LW
2280 }
2281 else
b45f050a 2282 FAIL("Junk on end of regexp"); /* "Can't happen". */
a0d0e21e
LW
2283 /* NOTREACHED */
2284 }
a687059c 2285
a0d0e21e 2286 return(ret);
a687059c
LW
2287}
2288
2289/*
2290 - regbranch - one alternative of an | operator
2291 *
2292 * Implements the concatenation operator.
2293 */
76e3520e 2294STATIC regnode *
830247a4 2295S_regbranch(pTHX_ RExC_state_t *pRExC_state, I32 *flagp, I32 first)
a687059c 2296{
c277df42
IZ
2297 register regnode *ret;
2298 register regnode *chain = NULL;
2299 register regnode *latest;
2300 I32 flags = 0, c = 0;
a0d0e21e 2301
b81d288d 2302 if (first)
c277df42
IZ
2303 ret = NULL;
2304 else {
b81d288d 2305 if (!SIZE_ONLY && RExC_extralen)
830247a4 2306 ret = reganode(pRExC_state, BRANCHJ,0);
c277df42 2307 else
830247a4 2308 ret = reg_node(pRExC_state, BRANCH);
c277df42
IZ
2309 }
2310
b81d288d 2311 if (!first && SIZE_ONLY)
830247a4 2312 RExC_extralen += 1; /* BRANCHJ */
b81d288d 2313
c277df42 2314 *flagp = WORST; /* Tentatively. */
a0d0e21e 2315
830247a4
IZ
2316 RExC_parse--;
2317 nextchar(pRExC_state);
2318 while (RExC_parse < RExC_end && *RExC_parse != '|' && *RExC_parse != ')') {
a0d0e21e 2319 flags &= ~TRYAGAIN;
830247a4 2320 latest = regpiece(pRExC_state, &flags);
a0d0e21e
LW
2321 if (latest == NULL) {
2322 if (flags & TRYAGAIN)
2323 continue;
2324 return(NULL);
a0ed51b3
LW
2325 }
2326 else if (ret == NULL)
c277df42 2327 ret = latest;
a0d0e21e 2328 *flagp |= flags&HASWIDTH;
c277df42 2329 if (chain == NULL) /* First piece. */
a0d0e21e
LW
2330 *flagp |= flags&SPSTART;
2331 else {
830247a4
IZ
2332 RExC_naughty++;
2333 regtail(pRExC_state, chain, latest);
a687059c 2334 }
a0d0e21e 2335 chain = latest;
c277df42
IZ
2336 c++;
2337 }
2338 if (chain == NULL) { /* Loop ran zero times. */
830247a4 2339 chain = reg_node(pRExC_state, NOTHING);
c277df42
IZ
2340 if (ret == NULL)
2341 ret = chain;
2342 }
2343 if (c == 1) {
2344 *flagp |= flags&SIMPLE;
a0d0e21e 2345 }
a687059c 2346
a0d0e21e 2347 return(ret);
a687059c
LW
2348}
2349
2350/*
2351 - regpiece - something followed by possible [*+?]
2352 *
2353 * Note that the branching code sequences used for ? and the general cases
2354 * of * and + are somewhat optimized: they use the same NOTHING node as
2355 * both the endmarker for their branch list and the body of the last branch.
2356 * It might seem that this node could be dispensed with entirely, but the
2357 * endmarker role is not redundant.
2358 */
76e3520e 2359STATIC regnode *
830247a4 2360S_regpiece(pTHX_ RExC_state_t *pRExC_state, I32 *flagp)
a687059c 2361{
c277df42 2362 register regnode *ret;
a0d0e21e
LW
2363 register char op;
2364 register char *next;
2365 I32 flags;
830247a4 2366 char *origparse = RExC_parse;
a0d0e21e
LW
2367 char *maxpos;
2368 I32 min;
c277df42 2369 I32 max = REG_INFTY;
a0d0e21e 2370
830247a4 2371 ret = regatom(pRExC_state, &flags);
a0d0e21e
LW
2372 if (ret == NULL) {
2373 if (flags & TRYAGAIN)
2374 *flagp |= TRYAGAIN;
2375 return(NULL);
2376 }
2377
830247a4 2378 op = *RExC_parse;
a0d0e21e 2379
830247a4
IZ
2380 if (op == '{' && regcurly(RExC_parse)) {
2381 next = RExC_parse + 1;
a0d0e21e
LW
2382 maxpos = Nullch;
2383 while (isDIGIT(*next) || *next == ',') {
2384 if (*next == ',') {
2385 if (maxpos)
2386 break;
2387 else
2388 maxpos = next;
a687059c 2389 }
a0d0e21e
LW
2390 next++;
2391 }
2392 if (*next == '}') { /* got one */
2393 if (!maxpos)
2394 maxpos = next;
830247a4
IZ
2395 RExC_parse++;
2396 min = atoi(RExC_parse);
a0d0e21e
LW
2397 if (*maxpos == ',')
2398 maxpos++;
2399 else
830247a4 2400 maxpos = RExC_parse;
a0d0e21e
LW
2401 max = atoi(maxpos);
2402 if (!max && *maxpos != '0')
c277df42
IZ
2403 max = REG_INFTY; /* meaning "infinity" */
2404 else if (max >= REG_INFTY)
8615cb43 2405 vFAIL2("Quantifier in {,} bigger than %d", REG_INFTY - 1);
830247a4
IZ
2406 RExC_parse = next;
2407 nextchar(pRExC_state);
a0d0e21e
LW
2408
2409 do_curly:
2410 if ((flags&SIMPLE)) {
830247a4
IZ
2411 RExC_naughty += 2 + RExC_naughty / 2;
2412 reginsert(pRExC_state, CURLY, ret);
a0d0e21e
LW
2413 }
2414 else {
830247a4 2415 regnode *w = reg_node(pRExC_state, WHILEM);
2c2d71f5
JH
2416
2417 w->flags = 0;
830247a4
IZ
2418 regtail(pRExC_state, ret, w);
2419 if (!SIZE_ONLY && RExC_extralen) {
2420 reginsert(pRExC_state, LONGJMP,ret);
2421 reginsert(pRExC_state, NOTHING,ret);
c277df42
IZ
2422 NEXT_OFF(ret) = 3; /* Go over LONGJMP. */
2423 }
830247a4
IZ
2424 reginsert(pRExC_state, CURLYX,ret);
2425 if (!SIZE_ONLY && RExC_extralen)
c277df42 2426 NEXT_OFF(ret) = 3; /* Go over NOTHING to LONGJMP. */
830247a4 2427 regtail(pRExC_state, ret, reg_node(pRExC_state, NOTHING));
c277df42 2428 if (SIZE_ONLY)
830247a4
IZ
2429 RExC_whilem_seen++, RExC_extralen += 3;
2430 RExC_naughty += 4 + RExC_naughty; /* compound interest */
a0d0e21e 2431 }
c277df42 2432 ret->flags = 0;
a0d0e21e
LW
2433
2434 if (min > 0)
821b33a5
IZ
2435 *flagp = WORST;
2436 if (max > 0)
2437 *flagp |= HASWIDTH;
a0d0e21e 2438 if (max && max < min)
8615cb43 2439 vFAIL("Can't do {n,m} with n > m");
c277df42
IZ
2440 if (!SIZE_ONLY) {
2441 ARG1_SET(ret, min);
2442 ARG2_SET(ret, max);
a687059c 2443 }
a687059c 2444
a0d0e21e 2445 goto nest_check;
a687059c 2446 }
a0d0e21e 2447 }
a687059c 2448
a0d0e21e
LW
2449 if (!ISMULT1(op)) {
2450 *flagp = flags;
a687059c 2451 return(ret);
a0d0e21e 2452 }
bb20fd44 2453
c277df42 2454#if 0 /* Now runtime fix should be reliable. */
b45f050a
JF
2455
2456 /* if this is reinstated, don't forget to put this back into perldiag:
2457
2458 =item Regexp *+ operand could be empty at {#} in regex m/%s/
2459
2460 (F) The part of the regexp subject to either the * or + quantifier
2461 could match an empty string. The {#} shows in the regular
2462 expression about where the problem was discovered.
2463
2464 */
2465
bb20fd44 2466 if (!(flags&HASWIDTH) && op != '?')
b45f050a 2467 vFAIL("Regexp *+ operand could be empty");
b81d288d 2468#endif
bb20fd44 2469
830247a4 2470 nextchar(pRExC_state);
a0d0e21e 2471
821b33a5 2472 *flagp = (op != '+') ? (WORST|SPSTART|HASWIDTH) : (WORST|HASWIDTH);
a0d0e21e
LW
2473
2474 if (op == '*' && (flags&SIMPLE)) {
830247a4 2475 reginsert(pRExC_state, STAR, ret);
c277df42 2476 ret->flags = 0;
830247a4 2477 RExC_naughty += 4;
a0d0e21e
LW
2478 }
2479 else if (op == '*') {
2480 min = 0;
2481 goto do_curly;
a0ed51b3
LW
2482 }
2483 else if (op == '+' && (flags&SIMPLE)) {
830247a4 2484 reginsert(pRExC_state, PLUS, ret);
c277df42 2485 ret->flags = 0;
830247a4 2486 RExC_naughty += 3;
a0d0e21e
LW
2487 }
2488 else if (op == '+') {
2489 min = 1;
2490 goto do_curly;
a0ed51b3
LW
2491 }
2492 else if (op == '?') {
a0d0e21e
LW
2493 min = 0; max = 1;
2494 goto do_curly;
2495 }
2496 nest_check:
e476b1b5 2497 if (ckWARN(WARN_REGEXP) && !SIZE_ONLY && !(flags&HASWIDTH) && max > REG_INFTY/3) {
830247a4 2498 vWARN3(RExC_parse,
b45f050a 2499 "%.*s matches null string many times",
830247a4 2500 RExC_parse - origparse,
b45f050a 2501 origparse);
a0d0e21e
LW
2502 }
2503
830247a4
IZ
2504 if (*RExC_parse == '?') {
2505 nextchar(pRExC_state);
2506 reginsert(pRExC_state, MINMOD, ret);
2507 regtail(pRExC_state, ret, ret + NODE_STEP_REGNODE);
a0d0e21e 2508 }
830247a4
IZ
2509 if (ISMULT2(RExC_parse)) {
2510 RExC_parse++;
b45f050a
JF
2511 vFAIL("Nested quantifiers");
2512 }
a0d0e21e
LW
2513
2514 return(ret);
a687059c
LW
2515}
2516
2517/*
2518 - regatom - the lowest level
2519 *
2520 * Optimization: gobbles an entire sequence of ordinary characters so that
2521 * it can turn them into a single node, which is smaller to store and
2522 * faster to run. Backslashed characters are exceptions, each becoming a
2523 * separate node; the code is simpler that way and it's not worth fixing.
2524 *
b45f050a 2525 * [Yes, it is worth fixing, some scripts can run twice the speed.] */
76e3520e 2526STATIC regnode *
830247a4 2527S_regatom(pTHX_ RExC_state_t *pRExC_state, I32 *flagp)
a687059c 2528{
c277df42 2529 register regnode *ret = 0;
a0d0e21e
LW
2530 I32 flags;
2531
2532 *flagp = WORST; /* Tentatively. */
2533
2534tryagain:
830247a4 2535 switch (*RExC_parse) {
a0d0e21e 2536 case '^':
830247a4
IZ
2537 RExC_seen_zerolen++;
2538 nextchar(pRExC_state);
2539 if (RExC_flags16 & PMf_MULTILINE)
2540 ret = reg_node(pRExC_state, MBOL);
2541 else if (RExC_flags16 & PMf_SINGLELINE)
2542 ret = reg_node(pRExC_state, SBOL);
a0d0e21e 2543 else
830247a4 2544 ret = reg_node(pRExC_state, BOL);
a0d0e21e
LW
2545 break;
2546 case '$':
830247a4 2547 nextchar(pRExC_state);
b81d288d 2548 if (*RExC_parse)
830247a4
IZ
2549 RExC_seen_zerolen++;
2550 if (RExC_flags16 & PMf_MULTILINE)
2551 ret = reg_node(pRExC_state, MEOL);
2552 else if (RExC_flags16 & PMf_SINGLELINE)
2553 ret = reg_node(pRExC_state, SEOL);
a0d0e21e 2554 else
830247a4 2555 ret = reg_node(pRExC_state, EOL);
a0d0e21e
LW
2556 break;
2557 case '.':
830247a4 2558 nextchar(pRExC_state);
ffc61ed2
JH
2559 if (RExC_flags16 & PMf_SINGLELINE)
2560 ret = reg_node(pRExC_state, SANY);
2561 else
2562 ret = reg_node(pRExC_state, REG_ANY);
2563 *flagp |= HASWIDTH|SIMPLE;
830247a4 2564 RExC_naughty++;
a0d0e21e
LW
2565 break;
2566 case '[':
b45f050a 2567 {
830247a4 2568 char *oregcomp_parse = ++RExC_parse;
ffc61ed2 2569 ret = regclass(pRExC_state);
830247a4
IZ
2570 if (*RExC_parse != ']') {
2571 RExC_parse = oregcomp_parse;
b45f050a
JF
2572 vFAIL("Unmatched [");
2573 }
830247a4 2574 nextchar(pRExC_state);
a0d0e21e
LW
2575 *flagp |= HASWIDTH|SIMPLE;
2576 break;
b45f050a 2577 }
a0d0e21e 2578 case '(':
830247a4
IZ
2579 nextchar(pRExC_state);
2580 ret = reg(pRExC_state, 1, &flags);
a0d0e21e 2581 if (ret == NULL) {
bf93d4cc 2582 if (flags & TRYAGAIN) {
830247a4 2583 if (RExC_parse == RExC_end) {
bf93d4cc
GS
2584 /* Make parent create an empty node if needed. */
2585 *flagp |= TRYAGAIN;
2586 return(NULL);
2587 }
a0d0e21e 2588 goto tryagain;
bf93d4cc 2589 }
a0d0e21e
LW
2590 return(NULL);
2591 }
c277df42 2592 *flagp |= flags&(HASWIDTH|SPSTART|SIMPLE);
a0d0e21e
LW
2593 break;
2594 case '|':
2595 case ')':
2596 if (flags & TRYAGAIN) {
2597 *flagp |= TRYAGAIN;
2598 return NULL;
2599 }
b45f050a 2600 vFAIL("Internal urp");
a0d0e21e
LW
2601 /* Supposed to be caught earlier. */
2602 break;
85afd4ae 2603 case '{':
830247a4
IZ
2604 if (!regcurly(RExC_parse)) {
2605 RExC_parse++;
85afd4ae
CS
2606 goto defchar;
2607 }
2608 /* FALL THROUGH */
a0d0e21e
LW
2609 case '?':
2610 case '+':
2611 case '*':
830247a4 2612 RExC_parse++;
b45f050a 2613 vFAIL("Quantifier follows nothing");
a0d0e21e
LW
2614 break;
2615 case '\\':
830247a4 2616 switch (*++RExC_parse) {
a0d0e21e 2617 case 'A':
830247a4
IZ
2618 RExC_seen_zerolen++;
2619 ret = reg_node(pRExC_state, SBOL);
a0d0e21e 2620 *flagp |= SIMPLE;
830247a4 2621 nextchar(pRExC_state);
a0d0e21e
LW
2622 break;
2623 case 'G':
830247a4
IZ
2624 ret = reg_node(pRExC_state, GPOS);
2625 RExC_seen |= REG_SEEN_GPOS;
a0d0e21e 2626 *flagp |= SIMPLE;
830247a4 2627 nextchar(pRExC_state);
a0d0e21e
LW
2628 break;
2629 case 'Z':
830247a4 2630 ret = reg_node(pRExC_state, SEOL);
a0d0e21e 2631 *flagp |= SIMPLE;
830247a4 2632 nextchar(pRExC_state);
a0d0e21e 2633 break;
b85d18e9 2634 case 'z':
830247a4 2635 ret = reg_node(pRExC_state, EOS);
b85d18e9 2636 *flagp |= SIMPLE;
830247a4
IZ
2637 RExC_seen_zerolen++; /* Do not optimize RE away */
2638 nextchar(pRExC_state);
b85d18e9 2639 break;
4a2d328f 2640 case 'C':
830247a4 2641 ret = reg_node(pRExC_state, SANY);
3baa4c62 2642 RExC_seen |= REG_SEEN_SANY;
a0ed51b3 2643 *flagp |= HASWIDTH|SIMPLE;
830247a4 2644 nextchar(pRExC_state);
a0ed51b3
LW
2645 break;
2646 case 'X':
830247a4 2647 ret = reg_node(pRExC_state, CLUMP);
a0ed51b3 2648 *flagp |= HASWIDTH;
830247a4 2649 nextchar(pRExC_state);
a0ed51b3 2650 break;
a0d0e21e 2651 case 'w':
ffc61ed2 2652 ret = reg_node(pRExC_state, LOC ? ALNUML : ALNUM);
a0d0e21e 2653 *flagp |= HASWIDTH|SIMPLE;
830247a4 2654 nextchar(pRExC_state);
a0d0e21e
LW
2655 break;
2656 case 'W':
ffc61ed2 2657 ret = reg_node(pRExC_state, LOC ? NALNUML : NALNUM);
a0d0e21e 2658 *flagp |= HASWIDTH|SIMPLE;
830247a4 2659 nextchar(pRExC_state);
a0d0e21e
LW
2660 break;
2661 case 'b':
830247a4
IZ
2662 RExC_seen_zerolen++;
2663 RExC_seen |= REG_SEEN_LOOKBEHIND;
ffc61ed2 2664 ret = reg_node(pRExC_state, LOC ? BOUNDL : BOUND);
a0d0e21e 2665 *flagp |= SIMPLE;
830247a4 2666 nextchar(pRExC_state);
a0d0e21e
LW
2667 break;
2668 case 'B':
830247a4
IZ
2669 RExC_seen_zerolen++;
2670 RExC_seen |= REG_SEEN_LOOKBEHIND;
ffc61ed2 2671 ret = reg_node(pRExC_state, LOC ? NBOUNDL : NBOUND);
a0d0e21e 2672 *flagp |= SIMPLE;
830247a4 2673 nextchar(pRExC_state);
a0d0e21e
LW
2674 break;
2675 case 's':
ffc61ed2 2676 ret = reg_node(pRExC_state, LOC ? SPACEL : SPACE);
a0d0e21e 2677 *flagp |= HASWIDTH|SIMPLE;
830247a4 2678 nextchar(pRExC_state);
a0d0e21e
LW
2679 break;
2680 case 'S':
ffc61ed2 2681 ret = reg_node(pRExC_state, LOC ? NSPACEL : NSPACE);
a0d0e21e 2682 *flagp |= HASWIDTH|SIMPLE;
830247a4 2683 nextchar(pRExC_state);
a0d0e21e
LW
2684 break;
2685 case 'd':
ffc61ed2 2686 ret = reg_node(pRExC_state, DIGIT);
a0d0e21e 2687 *flagp |= HASWIDTH|SIMPLE;
830247a4 2688 nextchar(pRExC_state);
a0d0e21e
LW
2689 break;
2690 case 'D':
ffc61ed2 2691 ret = reg_node(pRExC_state, NDIGIT);
a0d0e21e 2692 *flagp |= HASWIDTH|SIMPLE;
830247a4 2693 nextchar(pRExC_state);
a0d0e21e 2694 break;
a14b48bc
LW
2695 case 'p':
2696 case 'P':
2697 { /* a lovely hack--pretend we saw [\pX] instead */
830247a4 2698 char* oldregxend = RExC_end;
a14b48bc 2699
830247a4
IZ
2700 if (RExC_parse[1] == '{') {
2701 RExC_end = strchr(RExC_parse, '}');
2702 if (!RExC_end) {
2703 RExC_parse += 2;
2704 RExC_end = oldregxend;
b45f050a
JF
2705 vFAIL("Missing right brace on \\p{}");
2706 }
830247a4 2707 RExC_end++;
a14b48bc
LW
2708 }
2709 else
830247a4
IZ
2710 RExC_end = RExC_parse + 2;
2711 RExC_parse--;
a14b48bc 2712
ffc61ed2 2713 ret = regclass(pRExC_state);
a14b48bc 2714
830247a4
IZ
2715 RExC_end = oldregxend;
2716 RExC_parse--;
2717 nextchar(pRExC_state);
a14b48bc
LW
2718 *flagp |= HASWIDTH|SIMPLE;
2719 }
2720 break;
a0d0e21e
LW
2721 case 'n':
2722 case 'r':
2723 case 't':
2724 case 'f':
2725 case 'e':
2726 case 'a':
2727 case 'x':
2728 case 'c':
2729 case '0':
2730 goto defchar;
2731 case '1': case '2': case '3': case '4':
2732 case '5': case '6': case '7': case '8': case '9':
2733 {
830247a4 2734 I32 num = atoi(RExC_parse);
a0d0e21e 2735
830247a4 2736 if (num > 9 && num >= RExC_npar)
a0d0e21e
LW
2737 goto defchar;
2738 else {
830247a4
IZ
2739 while (isDIGIT(*RExC_parse))
2740 RExC_parse++;
b45f050a 2741
830247a4 2742 if (!SIZE_ONLY && num > RExC_rx->nparens)
9baa0206 2743 vFAIL("Reference to nonexistent group");
830247a4
IZ
2744 RExC_sawback = 1;
2745 ret = reganode(pRExC_state, FOLD
a0ed51b3 2746 ? (LOC ? REFFL : REFF)
c8756f30 2747 : REF, num);
a0d0e21e 2748 *flagp |= HASWIDTH;
830247a4
IZ
2749 RExC_parse--;
2750 nextchar(pRExC_state);
a0d0e21e
LW
2751 }
2752 }
2753 break;
2754 case '\0':
830247a4 2755 if (RExC_parse >= RExC_end)
b45f050a 2756 FAIL("Trailing \\");
a0d0e21e
LW
2757 /* FALL THROUGH */
2758 default:
c9f97d15
IZ
2759 /* Do not generate `unrecognized' warnings here, we fall
2760 back into the quick-grab loop below */
a0d0e21e
LW
2761 goto defchar;
2762 }
2763 break;
4633a7c4
LW
2764
2765 case '#':
830247a4
IZ
2766 if (RExC_flags16 & PMf_EXTENDED) {
2767 while (RExC_parse < RExC_end && *RExC_parse != '\n') RExC_parse++;
2768 if (RExC_parse < RExC_end)
4633a7c4
LW
2769 goto tryagain;
2770 }
2771 /* FALL THROUGH */
2772
a0d0e21e 2773 default: {
ba210ebe 2774 register STRLEN len;
a0ed51b3 2775 register UV ender;
a0d0e21e 2776 register char *p;
c277df42 2777 char *oldp, *s;
ba210ebe 2778 STRLEN numlen;
a0d0e21e 2779
830247a4 2780 RExC_parse++;
a0d0e21e
LW
2781
2782 defchar:
830247a4 2783 ret = reg_node(pRExC_state, FOLD
a0ed51b3 2784 ? (LOC ? EXACTFL : EXACTF)
bbce6d69 2785 : EXACT);
cd439c50 2786 s = STRING(ret);
830247a4
IZ
2787 for (len = 0, p = RExC_parse - 1;
2788 len < 127 && p < RExC_end;
a0d0e21e
LW
2789 len++)
2790 {
2791 oldp = p;
5b5a24f7 2792
830247a4
IZ
2793 if (RExC_flags16 & PMf_EXTENDED)
2794 p = regwhite(p, RExC_end);
a0d0e21e
LW
2795 switch (*p) {
2796 case '^':
2797 case '$':
2798 case '.':
2799 case '[':
2800 case '(':
2801 case ')':
2802 case '|':
2803 goto loopdone;
2804 case '\\':
2805 switch (*++p) {
2806 case 'A':
2807 case 'G':
2808 case 'Z':
b85d18e9 2809 case 'z':
a0d0e21e
LW
2810 case 'w':
2811 case 'W':
2812 case 'b':
2813 case 'B':
2814 case 's':
2815 case 'S':
2816 case 'd':
2817 case 'D':
a14b48bc
LW
2818 case 'p':
2819 case 'P':
a0d0e21e
LW
2820 --p;
2821 goto loopdone;
2822 case 'n':
2823 ender = '\n';
2824 p++;
a687059c 2825 break;
a0d0e21e
LW
2826 case 'r':
2827 ender = '\r';
2828 p++;
a687059c 2829 break;
a0d0e21e
LW
2830 case 't':
2831 ender = '\t';
2832 p++;
a687059c 2833 break;
a0d0e21e
LW
2834 case 'f':
2835 ender = '\f';
2836 p++;
a687059c 2837 break;
a0d0e21e 2838 case 'e':
c7f1f016 2839 ender = ASCII_TO_NATIVE('\033');
a0d0e21e 2840 p++;
a687059c 2841 break;
a0d0e21e 2842 case 'a':
c7f1f016 2843 ender = ASCII_TO_NATIVE('\007');
a0d0e21e 2844 p++;
a687059c 2845 break;
a0d0e21e 2846 case 'x':
a0ed51b3
LW
2847 if (*++p == '{') {
2848 char* e = strchr(p, '}');
b81d288d 2849
b45f050a 2850 if (!e) {
830247a4 2851 RExC_parse = p + 1;
b45f050a
JF
2852 vFAIL("Missing right brace on \\x{}");
2853 }
de5f0749 2854 else {
b21ed0a9 2855 numlen = 1; /* allow underscores */
de35ba6f 2856 ender = (UV)scan_hex(p + 1, e - p - 1, &numlen);
aaa80028
JH
2857 if (ender > 0xff)
2858 RExC_utf8 = 1;
b21ed0a9
GS
2859 /* numlen is generous */
2860 if (numlen + len >= 127) {
a0ed51b3
LW
2861 p--;
2862 goto loopdone;
2863 }
2864 p = e + 1;
2865 }
a0ed51b3
LW
2866 }
2867 else {
b21ed0a9 2868 numlen = 0; /* disallow underscores */
dff6d3cd 2869 ender = (UV)scan_hex(p, 2, &numlen);
a0ed51b3
LW
2870 p += numlen;
2871 }
a687059c 2872 break;
a0d0e21e
LW
2873 case 'c':
2874 p++;
bbce6d69 2875 ender = UCHARAT(p++);
2876 ender = toCTRL(ender);
a687059c 2877 break;
a0d0e21e
LW
2878 case '0': case '1': case '2': case '3':case '4':
2879 case '5': case '6': case '7': case '8':case '9':
2880 if (*p == '0' ||
830247a4 2881 (isDIGIT(p[1]) && atoi(p) >= RExC_npar) ) {
b21ed0a9 2882 numlen = 0; /* disallow underscores */
dff6d3cd 2883 ender = (UV)scan_oct(p, 3, &numlen);
a0d0e21e
LW
2884 p += numlen;
2885 }
2886 else {
2887 --p;
2888 goto loopdone;
a687059c
LW
2889 }
2890 break;
a0d0e21e 2891 case '\0':
830247a4 2892 if (p >= RExC_end)
b45f050a 2893 FAIL("Trailing \\");
a687059c 2894 /* FALL THROUGH */
a0d0e21e 2895 default:
e476b1b5 2896 if (!SIZE_ONLY && ckWARN(WARN_REGEXP) && isALPHA(*p))
b45f050a 2897 vWARN2(p +1, "Unrecognized escape \\%c passed through", *p);
a0ed51b3 2898 goto normal_default;
a0d0e21e
LW
2899 }
2900 break;
a687059c 2901 default:
a0ed51b3 2902 normal_default:
fd400ab9 2903 if (UTF8_IS_START(*p) && UTF) {
5e12f4fb 2904 ender = utf8n_to_uvchr((U8*)p, RExC_end - p,
ba210ebe 2905 &numlen, 0);
a0ed51b3
LW
2906 p += numlen;
2907 }
2908 else
2909 ender = *p++;
a0d0e21e 2910 break;
a687059c 2911 }
830247a4
IZ
2912 if (RExC_flags16 & PMf_EXTENDED)
2913 p = regwhite(p, RExC_end);
a0ed51b3
LW
2914 if (UTF && FOLD) {
2915 if (LOC)
5e12f4fb 2916 ender = toLOWER_LC_uvchr(ender);
a0ed51b3
LW
2917 else
2918 ender = toLOWER_uni(ender);
2919 }
a0d0e21e
LW
2920 if (ISMULT2(p)) { /* Back off on ?+*. */
2921 if (len)
2922 p = oldp;
2b9d42f0 2923 else if (!UNI_IS_INVARIANT(NATIVE_TO_UNI(ender)) && UTF) {
830247a4 2924 reguni(pRExC_state, ender, s, &numlen);
a0ed51b3
LW
2925 s += numlen;
2926 len += numlen;
2927 }
a0d0e21e
LW
2928 else {
2929 len++;
cd439c50 2930 REGC(ender, s++);
a0d0e21e
LW
2931 }
2932 break;
a687059c 2933 }
2b9d42f0 2934 if (!UNI_IS_INVARIANT(NATIVE_TO_UNI(ender)) && UTF) {
830247a4 2935 reguni(pRExC_state, ender, s, &numlen);
a0ed51b3
LW
2936 s += numlen;
2937 len += numlen - 1;
2938 }
2939 else
cd439c50 2940 REGC(ender, s++);
a0d0e21e
LW
2941 }
2942 loopdone:
830247a4
IZ
2943 RExC_parse = p - 1;
2944 nextchar(pRExC_state);
793db0cb
JH
2945 {
2946 /* len is STRLEN which is unsigned, need to copy to signed */
2947 IV iv = len;
2948 if (iv < 0)
2949 vFAIL("Internal disaster");
2950 }
a0d0e21e
LW
2951 if (len > 0)
2952 *flagp |= HASWIDTH;
2953 if (len == 1)
2954 *flagp |= SIMPLE;
c277df42 2955 if (!SIZE_ONLY)
cd439c50
IZ
2956 STR_LEN(ret) = len;
2957 if (SIZE_ONLY)
830247a4 2958 RExC_size += STR_SZ(len);
cd439c50 2959 else
830247a4 2960 RExC_emit += STR_SZ(len);
a687059c 2961 }
a0d0e21e
LW
2962 break;
2963 }
a687059c 2964
a0d0e21e 2965 return(ret);
a687059c
LW
2966}
2967
873ef191 2968STATIC char *
cea2e8a9 2969S_regwhite(pTHX_ char *p, char *e)
5b5a24f7
CS
2970{
2971 while (p < e) {
2972 if (isSPACE(*p))
2973 ++p;
2974 else if (*p == '#') {
2975 do {
2976 p++;
2977 } while (p < e && *p != '\n');
2978 }
2979 else
2980 break;
2981 }
2982 return p;
2983}
2984
b8c5462f
JH
2985/* Parse POSIX character classes: [[:foo:]], [[=foo=]], [[.foo.]].
2986 Character classes ([:foo:]) can also be negated ([:^foo:]).
2987 Returns a named class id (ANYOF_XXX) if successful, -1 otherwise.
2988 Equivalence classes ([=foo=]) and composites ([.foo.]) are parsed,
2989 but trigger warnings because they are currently unimplemented. */
2990STATIC I32
830247a4 2991S_regpposixcc(pTHX_ RExC_state_t *pRExC_state, I32 value)
620e46c5
JH
2992{
2993 char *posixcc = 0;
936ed897 2994 I32 namedclass = OOB_NAMEDCLASS;
620e46c5 2995
830247a4 2996 if (value == '[' && RExC_parse + 1 < RExC_end &&
620e46c5 2997 /* I smell either [: or [= or [. -- POSIX has been here, right? */
830247a4
IZ
2998 (*RExC_parse == ':' ||
2999 *RExC_parse == '=' ||
3000 *RExC_parse == '.')) {
3001 char c = *RExC_parse;
3002 char* s = RExC_parse++;
b81d288d 3003
830247a4
IZ
3004 while (RExC_parse < RExC_end && *RExC_parse != c)
3005 RExC_parse++;
3006 if (RExC_parse == RExC_end)
620e46c5 3007 /* Grandfather lone [:, [=, [. */
830247a4 3008 RExC_parse = s;
620e46c5 3009 else {
830247a4 3010 char* t = RExC_parse++; /* skip over the c */
b8c5462f 3011
830247a4
IZ
3012 if (*RExC_parse == ']') {
3013 RExC_parse++; /* skip over the ending ] */
b8c5462f
JH
3014 posixcc = s + 1;
3015 if (*s == ':') {
3016 I32 complement = *posixcc == '^' ? *posixcc++ : 0;
3017 I32 skip = 5; /* the most common skip */
3018
3019 switch (*posixcc) {
3020 case 'a':
3021 if (strnEQ(posixcc, "alnum", 5))
3022 namedclass =
3023 complement ? ANYOF_NALNUMC : ANYOF_ALNUMC;
3024 else if (strnEQ(posixcc, "alpha", 5))
3025 namedclass =
3026 complement ? ANYOF_NALPHA : ANYOF_ALPHA;
3027 else if (strnEQ(posixcc, "ascii", 5))
3028 namedclass =
3029 complement ? ANYOF_NASCII : ANYOF_ASCII;
3030 break;
aaa51d5e
JF
3031 case 'b':
3032 if (strnEQ(posixcc, "blank", 5))
3033 namedclass =
3034 complement ? ANYOF_NBLANK : ANYOF_BLANK;
3035 break;
b8c5462f
JH
3036 case 'c':
3037 if (strnEQ(posixcc, "cntrl", 5))
3038 namedclass =
3039 complement ? ANYOF_NCNTRL : ANYOF_CNTRL;
3040 break;
3041 case 'd':
3042 if (strnEQ(posixcc, "digit", 5))
3043 namedclass =
3044 complement ? ANYOF_NDIGIT : ANYOF_DIGIT;
3045 break;
3046 case 'g':
3047 if (strnEQ(posixcc, "graph", 5))
3048 namedclass =
3049 complement ? ANYOF_NGRAPH : ANYOF_GRAPH;
3050 break;
3051 case 'l':
3052 if (strnEQ(posixcc, "lower", 5))
3053 namedclass =
3054 complement ? ANYOF_NLOWER : ANYOF_LOWER;
3055 break;
3056 case 'p':
3057 if (strnEQ(posixcc, "print", 5))
3058 namedclass =
3059 complement ? ANYOF_NPRINT : ANYOF_PRINT;
3060 else if (strnEQ(posixcc, "punct", 5))
3061 namedclass =
3062 complement ? ANYOF_NPUNCT : ANYOF_PUNCT;
3063 break;
3064 case 's':
3065 if (strnEQ(posixcc, "space", 5))
3066 namedclass =
aaa51d5e 3067 complement ? ANYOF_NPSXSPC : ANYOF_PSXSPC;
cc4319de 3068 break;
b8c5462f
JH
3069 case 'u':
3070 if (strnEQ(posixcc, "upper", 5))
3071 namedclass =
3072 complement ? ANYOF_NUPPER : ANYOF_UPPER;
3073 break;
3074 case 'w': /* this is not POSIX, this is the Perl \w */
3075 if (strnEQ(posixcc, "word", 4)) {
3076 namedclass =
3077 complement ? ANYOF_NALNUM : ANYOF_ALNUM;
3078 skip = 4;
3079 }
3080 break;
3081 case 'x':
3082 if (strnEQ(posixcc, "xdigit", 6)) {
3083 namedclass =
3084 complement ? ANYOF_NXDIGIT : ANYOF_XDIGIT;
3085 skip = 6;
3086 }
3087 break;
3088 }
ac561586
JH
3089 if (namedclass == OOB_NAMEDCLASS ||
3090 posixcc[skip] != ':' ||
3091 posixcc[skip+1] != ']')
b45f050a
JF
3092 {
3093 Simple_vFAIL3("POSIX class [:%.*s:] unknown",
3094 t - s - 1, s + 1);
3095 }
3096 } else if (!SIZE_ONLY) {
b8c5462f 3097 /* [[=foo=]] and [[.foo.]] are still future. */
b45f050a 3098
830247a4 3099 /* adjust RExC_parse so the warning shows after
b45f050a 3100 the class closes */
830247a4
IZ
3101 while (*RExC_parse && *RExC_parse != ']')
3102 RExC_parse++;
b45f050a
JF
3103 Simple_vFAIL3("POSIX syntax [%c %c] is reserved for future extensions", c, c);
3104 }
b8c5462f
JH
3105 } else {
3106 /* Maternal grandfather:
3107 * "[:" ending in ":" but not in ":]" */
830247a4 3108 RExC_parse = s;
767d463e 3109 }
620e46c5
JH
3110 }
3111 }
3112
b8c5462f
JH
3113 return namedclass;
3114}
3115
3116STATIC void
830247a4 3117S_checkposixcc(pTHX_ RExC_state_t *pRExC_state)
b8c5462f 3118{
e476b1b5 3119 if (!SIZE_ONLY && ckWARN(WARN_REGEXP) &&
830247a4
IZ
3120 (*RExC_parse == ':' ||
3121 *RExC_parse == '=' ||
3122 *RExC_parse == '.')) {
3123 char *s = RExC_parse;
93733859 3124 char c = *s++;
b8c5462f
JH
3125
3126 while(*s && isALNUM(*s))
3127 s++;
3128 if (*s && c == *s && s[1] == ']') {
b45f050a
JF
3129 vWARN3(s+2, "POSIX syntax [%c %c] belongs inside character classes", c, c);
3130
3131 /* [[=foo=]] and [[.foo.]] are still future. */
b8c5462f 3132 if (c == '=' || c == '.')
b45f050a 3133 {
830247a4 3134 /* adjust RExC_parse so the error shows after
b45f050a 3135 the class closes */
830247a4 3136 while (*RExC_parse && *RExC_parse++ != ']')
b45f050a
JF
3137 ;
3138 Simple_vFAIL3("POSIX syntax [%c %c] is reserved for future extensions", c, c);
3139 }
b8c5462f
JH
3140 }
3141 }
620e46c5
JH
3142}
3143
76e3520e 3144STATIC regnode *
830247a4 3145S_regclass(pTHX_ RExC_state_t *pRExC_state)
a687059c 3146{
ffc61ed2
JH
3147 register UV value;
3148 register IV lastvalue = OOB_UNICODE;
3149 register IV range = 0;
c277df42 3150 register regnode *ret;
ba210ebe 3151 STRLEN numlen;
ffc61ed2 3152 IV namedclass;
73b437c8 3153 char *rangebegin;
936ed897 3154 bool need_class = 0;
ffc61ed2
JH
3155 SV *listsv;
3156 register char *e;
3157 UV n;
1aa99e6b 3158 bool dont_optimize_invert = FALSE;
ffc61ed2
JH
3159
3160 ret = reganode(pRExC_state, ANYOF, 0);
3161
3162 if (!SIZE_ONLY)
3163 ANYOF_FLAGS(ret) = 0;
3164
3165 if (*RExC_parse == '^') { /* Complement of range. */
3166 RExC_naughty++;
3167 RExC_parse++;
3168 if (!SIZE_ONLY)
3169 ANYOF_FLAGS(ret) |= ANYOF_INVERT;
3170 }
a0d0e21e 3171
936ed897 3172 if (SIZE_ONLY)
830247a4 3173 RExC_size += ANYOF_SKIP;
936ed897 3174 else {
830247a4 3175 RExC_emit += ANYOF_SKIP;
936ed897
IZ
3176 if (FOLD)
3177 ANYOF_FLAGS(ret) |= ANYOF_FOLD;
3178 if (LOC)
3179 ANYOF_FLAGS(ret) |= ANYOF_LOCALE;
ffc61ed2
JH
3180 ANYOF_BITMAP_ZERO(ret);
3181 listsv = newSVpvn("# comment\n", 10);
a0d0e21e 3182 }
b8c5462f 3183
e476b1b5 3184 if (!SIZE_ONLY && ckWARN(WARN_REGEXP))
830247a4 3185 checkposixcc(pRExC_state);
b8c5462f 3186
830247a4 3187 if (*RExC_parse == ']' || *RExC_parse == '-')
ffc61ed2
JH
3188 goto charclassloop; /* allow 1st char to be ] or - */
3189
830247a4 3190 while (RExC_parse < RExC_end && *RExC_parse != ']') {
ffc61ed2
JH
3191
3192 charclassloop:
3193
3194 namedclass = OOB_NAMEDCLASS; /* initialize as illegal */
3195
73b437c8 3196 if (!range)
830247a4 3197 rangebegin = RExC_parse;
ffc61ed2 3198 if (UTF) {
5e12f4fb 3199 value = utf8n_to_uvchr((U8*)RExC_parse,
ffc61ed2
JH
3200 RExC_end - RExC_parse,
3201 &numlen, 0);
3202 RExC_parse += numlen;
3203 }
3204 else
3205 value = UCHARAT(RExC_parse++);
620e46c5 3206 if (value == '[')
830247a4 3207 namedclass = regpposixcc(pRExC_state, value);
620e46c5 3208 else if (value == '\\') {
ffc61ed2 3209 if (UTF) {
5e12f4fb 3210 value = utf8n_to_uvchr((U8*)RExC_parse,
ffc61ed2
JH
3211 RExC_end - RExC_parse,
3212 &numlen, 0);
3213 RExC_parse += numlen;
3214 }
3215 else
3216 value = UCHARAT(RExC_parse++);
470c3474 3217 /* Some compilers cannot handle switching on 64-bit integer
ffc61ed2 3218 * values, therefore value cannot be an UV. Yes, this will
e2962f66
JH
3219 * be a problem later if we want switch on Unicode.
3220 * A similar issue a little bit later when switching on
3221 * namedclass. --jhi */
ffc61ed2 3222 switch ((I32)value) {
b8c5462f
JH
3223 case 'w': namedclass = ANYOF_ALNUM; break;
3224 case 'W': namedclass = ANYOF_NALNUM; break;
3225 case 's': namedclass = ANYOF_SPACE; break;
3226 case 'S': namedclass = ANYOF_NSPACE; break;
3227 case 'd': namedclass = ANYOF_DIGIT; break;
3228 case 'D': namedclass = ANYOF_NDIGIT; break;
ffc61ed2
JH
3229 case 'p':
3230 case 'P':
3231 if (*RExC_parse == '{') {
3232 e = strchr(RExC_parse++, '}');
3233 if (!e)
3234 vFAIL("Missing right brace on \\p{}");
3235 n = e - RExC_parse;
3236 }
3237 else {
3238 e = RExC_parse;
3239 n = 1;
3240 }
3241 if (!SIZE_ONLY) {
3242 if (value == 'p')
3243 Perl_sv_catpvf(aTHX_ listsv,
3244 "+utf8::%.*s\n", (int)n, RExC_parse);
3245 else
3246 Perl_sv_catpvf(aTHX_ listsv,
3247 "!utf8::%.*s\n", (int)n, RExC_parse);
3248 }
3249 RExC_parse = e + 1;
3250 ANYOF_FLAGS(ret) |= ANYOF_UNICODE;
3251 continue;
b8c5462f
JH
3252 case 'n': value = '\n'; break;
3253 case 'r': value = '\r'; break;
3254 case 't': value = '\t'; break;
3255 case 'f': value = '\f'; break;
3256 case 'b': value = '\b'; break;
c7f1f016
NIS
3257 case 'e': value = ASCII_TO_NATIVE('\033');break;
3258 case 'a': value = ASCII_TO_NATIVE('\007');break;
b8c5462f 3259 case 'x':
ffc61ed2
JH
3260 if (*RExC_parse == '{') {
3261 e = strchr(RExC_parse++, '}');
b81d288d 3262 if (!e)
ffc61ed2
JH
3263 vFAIL("Missing right brace on \\x{}");
3264 numlen = 1; /* allow underscores */
3265 value = (UV)scan_hex(RExC_parse,
3266 e - RExC_parse,
3267 &numlen);
3268 RExC_parse = e + 1;
3269 }
3270 else {
3271 numlen = 0; /* disallow underscores */
3272 value = (UV)scan_hex(RExC_parse, 2, &numlen);
3273 RExC_parse += numlen;
3274 }
b8c5462f
JH
3275 break;
3276 case 'c':
830247a4 3277 value = UCHARAT(RExC_parse++);
b8c5462f
JH
3278 value = toCTRL(value);
3279 break;
3280 case '0': case '1': case '2': case '3': case '4':
3281 case '5': case '6': case '7': case '8': case '9':
b21ed0a9 3282 numlen = 0; /* disallow underscores */
830247a4
IZ
3283 value = (UV)scan_oct(--RExC_parse, 3, &numlen);
3284 RExC_parse += numlen;
b8c5462f 3285 break;
1028017a 3286 default:
e476b1b5 3287 if (!SIZE_ONLY && ckWARN(WARN_REGEXP) && isALPHA(value))
ffc61ed2
JH
3288 vWARN2(RExC_parse,
3289 "Unrecognized escape \\%c in character class passed through",
3290 (int)value);
1028017a 3291 break;
b8c5462f 3292 }
ffc61ed2
JH
3293 } /* end of \blah */
3294
3295 if (namedclass > OOB_NAMEDCLASS) { /* this is a named class \blah */
3296
3297 if (!SIZE_ONLY && !need_class)
936ed897 3298 ANYOF_CLASS_ZERO(ret);
ffc61ed2 3299
936ed897 3300 need_class = 1;
ffc61ed2
JH
3301
3302 /* a bad range like a-\d, a-[:digit:] ? */
3303 if (range) {
73b437c8 3304 if (!SIZE_ONLY) {
e476b1b5 3305 if (ckWARN(WARN_REGEXP))
830247a4 3306 vWARN4(RExC_parse,
b45f050a 3307 "False [] range \"%*.*s\"",
830247a4
IZ
3308 RExC_parse - rangebegin,
3309 RExC_parse - rangebegin,
b45f050a 3310 rangebegin);
ffc61ed2
JH
3311 if (lastvalue < 256) {
3312 ANYOF_BITMAP_SET(ret, lastvalue);
3313 ANYOF_BITMAP_SET(ret, '-');
3314 }
3315 else {
3316 ANYOF_FLAGS(ret) |= ANYOF_UNICODE;
3317 Perl_sv_catpvf(aTHX_ listsv,
5e12f4fb 3318 "%04"UVxf"\n%04"UVxf"\n", (UV)lastvalue, (UV) '-');
ffc61ed2 3319 }
b8c5462f 3320 }
ffc61ed2
JH
3321
3322 range = 0; /* this was not a true range */
73b437c8 3323 }
ffc61ed2 3324
73b437c8 3325 if (!SIZE_ONLY) {
e2962f66
JH
3326 /* Possible truncation here but in some 64-bit environments
3327 * the compiler gets heartburn about switch on 64-bit values.
3328 * A similar issue a little earlier when switching on value.
98f323fa 3329 * --jhi */
e2962f66 3330 switch ((I32)namedclass) {
73b437c8
JH
3331 case ANYOF_ALNUM:
3332 if (LOC)
936ed897 3333 ANYOF_CLASS_SET(ret, ANYOF_ALNUM);
73b437c8
JH
3334 else {
3335 for (value = 0; value < 256; value++)
3336 if (isALNUM(value))
936ed897 3337 ANYOF_BITMAP_SET(ret, value);
73b437c8 3338 }
1ba5c669 3339 dont_optimize_invert = TRUE;
ffc61ed2 3340 Perl_sv_catpvf(aTHX_ listsv, "+utf8::IsWord\n");
73b437c8
JH
3341 break;
3342 case ANYOF_NALNUM:
3343 if (LOC)
936ed897 3344 ANYOF_CLASS_SET(ret, ANYOF_NALNUM);
73b437c8
JH
3345 else {
3346 for (value = 0; value < 256; value++)
3347 if (!isALNUM(value))
936ed897 3348 ANYOF_BITMAP_SET(ret, value);
73b437c8 3349 }
1ba5c669 3350 dont_optimize_invert = TRUE;
ffc61ed2 3351 Perl_sv_catpvf(aTHX_ listsv, "!utf8::IsWord\n");
73b437c8 3352 break;
ffc61ed2 3353 case ANYOF_ALNUMC:
73b437c8 3354 if (LOC)
ffc61ed2 3355 ANYOF_CLASS_SET(ret, ANYOF_ALNUMC);
73b437c8
JH
3356 else {
3357 for (value = 0; value < 256; value++)
ffc61ed2 3358 if (isALNUMC(value))
936ed897 3359 ANYOF_BITMAP_SET(ret, value);
73b437c8 3360 }
1ba5c669 3361 dont_optimize_invert = TRUE;
ffc61ed2 3362 Perl_sv_catpvf(aTHX_ listsv, "+utf8::IsAlnum\n");
73b437c8
JH
3363 break;
3364 case ANYOF_NALNUMC:
3365 if (LOC)
936ed897 3366 ANYOF_CLASS_SET(ret, ANYOF_NALNUMC);
73b437c8
JH
3367 else {
3368 for (value = 0; value < 256; value++)
3369 if (!isALNUMC(value))
936ed897 3370 ANYOF_BITMAP_SET(ret, value);
73b437c8 3371 }
1ba5c669 3372 dont_optimize_invert = TRUE;
ffc61ed2 3373 Perl_sv_catpvf(aTHX_ listsv, "!utf8::IsAlnum\n");
73b437c8
JH
3374 break;
3375 case ANYOF_ALPHA:
3376 if (LOC)
936ed897 3377 ANYOF_CLASS_SET(ret, ANYOF_ALPHA);
73b437c8
JH
3378 else {
3379 for (value = 0; value < 256; value++)
3380 if (isALPHA(value))
936ed897 3381 ANYOF_BITMAP_SET(ret, value);
73b437c8 3382 }
1ba5c669 3383 dont_optimize_invert = TRUE;
ffc61ed2 3384 Perl_sv_catpvf(aTHX_ listsv, "+utf8::IsAlpha\n");
73b437c8
JH
3385 break;
3386 case ANYOF_NALPHA:
3387 if (LOC)
936ed897 3388 ANYOF_CLASS_SET(ret, ANYOF_NALPHA);
73b437c8
JH
3389 else {
3390 for (value = 0; value < 256; value++)
3391 if (!isALPHA(value))
936ed897 3392 ANYOF_BITMAP_SET(ret, value);
73b437c8 3393 }
1ba5c669 3394 dont_optimize_invert = TRUE;
ffc61ed2 3395 Perl_sv_catpvf(aTHX_ listsv, "!utf8::IsAlpha\n");
73b437c8
JH
3396 break;
3397 case ANYOF_ASCII:
3398 if (LOC)
936ed897 3399 ANYOF_CLASS_SET(ret, ANYOF_ASCII);
73b437c8 3400 else {
c7f1f016 3401#ifndef EBCDIC
1ba5c669
JH
3402 for (value = 0; value < 128; value++)
3403 ANYOF_BITMAP_SET(ret, value);
3404#else /* EBCDIC */
ffbc6a93
JH
3405 for (value = 0; value < 256; value++) {
3406 if (PL_hints & HINT_RE_ASCIIR) {
3407 if (NATIVE_TO_ASCII(value) < 128)
3408 ANYOF_BITMAP_SET(ret, value);
3409 }
3410 else {
3411 if (isASCII(value))
3412 ANYOF_BITMAP_SET(ret, value);
3413 }
3414 }
1ba5c669 3415#endif /* EBCDIC */
73b437c8 3416 }
1ba5c669 3417 dont_optimize_invert = TRUE;
ffc61ed2 3418 Perl_sv_catpvf(aTHX_ listsv, "+utf8::IsASCII\n");
73b437c8
JH
3419 break;
3420 case ANYOF_NASCII:
3421 if (LOC)
936ed897 3422 ANYOF_CLASS_SET(ret, ANYOF_NASCII);
73b437c8 3423 else {
c7f1f016 3424#ifndef EBCDIC
1ba5c669
JH
3425 for (value = 128; value < 256; value++)
3426 ANYOF_BITMAP_SET(ret, value);
3427#else /* EBCDIC */
ffbc6a93
JH
3428 for (value = 0; value < 256; value++) {
3429 if (PL_hints & HINT_RE_ASCIIR) {
3430 if (NATIVE_TO_ASCII(value) >= 128)
3431 ANYOF_BITMAP_SET(ret, value);
3432 }
3433 else {
3434 if (!isASCII(value))
3435 ANYOF_BITMAP_SET(ret, value);
3436 }
3437 }
1ba5c669 3438#endif /* EBCDIC */
73b437c8 3439 }
1ba5c669 3440 dont_optimize_invert = TRUE;
ffc61ed2 3441 Perl_sv_catpvf(aTHX_ listsv, "!utf8::IsASCII\n");
73b437c8 3442 break;
aaa51d5e
JF
3443 case ANYOF_BLANK:
3444 if (LOC)
3445 ANYOF_CLASS_SET(ret, ANYOF_BLANK);
3446 else {
3447 for (value = 0; value < 256; value++)
3448 if (isBLANK(value))
3449 ANYOF_BITMAP_SET(ret, value);
3450 }
1ba5c669 3451 dont_optimize_invert = TRUE;
ffc61ed2 3452 Perl_sv_catpvf(aTHX_ listsv, "+utf8::IsBlank\n");
aaa51d5e
JF
3453 break;
3454 case ANYOF_NBLANK:
3455 if (LOC)
3456 ANYOF_CLASS_SET(ret, ANYOF_NBLANK);
3457 else {
3458 for (value = 0; value < 256; value++)
3459 if (!isBLANK(value))
3460 ANYOF_BITMAP_SET(ret, value);
3461 }
1ba5c669 3462 dont_optimize_invert = TRUE;
ffc61ed2 3463 Perl_sv_catpvf(aTHX_ listsv, "!utf8::IsBlank\n");
aaa51d5e 3464 break;
73b437c8
JH
3465 case ANYOF_CNTRL:
3466 if (LOC)
936ed897 3467 ANYOF_CLASS_SET(ret, ANYOF_CNTRL);
73b437c8
JH
3468 else {
3469 for (value = 0; value < 256; value++)
3470 if (isCNTRL(value))
936ed897 3471 ANYOF_BITMAP_SET(ret, value);
73b437c8 3472 }
1ba5c669 3473 dont_optimize_invert = TRUE;
ffc61ed2 3474 Perl_sv_catpvf(aTHX_ listsv, "+utf8::IsCntrl\n");
73b437c8
JH
3475 break;
3476 case ANYOF_NCNTRL:
3477 if (LOC)
936ed897 3478 ANYOF_CLASS_SET(ret, ANYOF_NCNTRL);
73b437c8
JH
3479 else {
3480 for (value = 0; value < 256; value++)
3481 if (!isCNTRL(value))
936ed897 3482 ANYOF_BITMAP_SET(ret, value);
73b437c8 3483 }
1ba5c669 3484 dont_optimize_invert = TRUE;
ffc61ed2
JH
3485 Perl_sv_catpvf(aTHX_ listsv, "!utf8::IsCntrl\n");
3486 break;
3487 case ANYOF_DIGIT:
3488 if (LOC)
3489 ANYOF_CLASS_SET(ret, ANYOF_DIGIT);
3490 else {
3491 /* consecutive digits assumed */
3492 for (value = '0'; value <= '9'; value++)
3493 ANYOF_BITMAP_SET(ret, value);
3494 }
1ba5c669 3495 dont_optimize_invert = TRUE;
ffc61ed2
JH
3496 Perl_sv_catpvf(aTHX_ listsv, "+utf8::IsDigit\n");
3497 break;
3498 case ANYOF_NDIGIT:
3499 if (LOC)
3500 ANYOF_CLASS_SET(ret, ANYOF_NDIGIT);
3501 else {
3502 /* consecutive digits assumed */
3503 for (value = 0; value < '0'; value++)
3504 ANYOF_BITMAP_SET(ret, value);
3505 for (value = '9' + 1; value < 256; value++)
3506 ANYOF_BITMAP_SET(ret, value);
3507 }
1ba5c669 3508 dont_optimize_invert = TRUE;
ffc61ed2 3509 Perl_sv_catpvf(aTHX_ listsv, "!utf8::IsDigit\n");
73b437c8
JH
3510 break;
3511 case ANYOF_GRAPH:
3512 if (LOC)
936ed897 3513 ANYOF_CLASS_SET(ret, ANYOF_GRAPH);
73b437c8
JH
3514 else {
3515 for (value = 0; value < 256; value++)
3516 if (isGRAPH(value))
936ed897 3517 ANYOF_BITMAP_SET(ret, value);
73b437c8 3518 }
1ba5c669 3519 dont_optimize_invert = TRUE;
ffc61ed2 3520 Perl_sv_catpvf(aTHX_ listsv, "+utf8::IsGraph\n");
73b437c8
JH
3521 break;
3522 case ANYOF_NGRAPH:
3523 if (LOC)
936ed897 3524 ANYOF_CLASS_SET(ret, ANYOF_NGRAPH);
73b437c8
JH
3525 else {
3526 for (value = 0; value < 256; value++)
3527 if (!isGRAPH(value))
936ed897 3528 ANYOF_BITMAP_SET(ret, value);
73b437c8 3529 }
1ba5c669 3530 dont_optimize_invert = TRUE;
ffc61ed2 3531 Perl_sv_catpvf(aTHX_ listsv, "!utf8::IsGraph\n");
73b437c8
JH
3532 break;
3533 case ANYOF_LOWER:
3534 if (LOC)
936ed897 3535 ANYOF_CLASS_SET(ret, ANYOF_LOWER);
73b437c8
JH
3536 else {
3537 for (value = 0; value < 256; value++)
3538 if (isLOWER(value))
936ed897 3539 ANYOF_BITMAP_SET(ret, value);
73b437c8 3540 }
1ba5c669 3541 dont_optimize_invert = TRUE;
ffc61ed2 3542 Perl_sv_catpvf(aTHX_ listsv, "+utf8::IsLower\n");
73b437c8
JH
3543 break;
3544 case ANYOF_NLOWER:
3545 if (LOC)
936ed897 3546 ANYOF_CLASS_SET(ret, ANYOF_NLOWER);
73b437c8
JH
3547 else {
3548 for (value = 0; value < 256; value++)
3549 if (!isLOWER(value))
936ed897 3550 ANYOF_BITMAP_SET(ret, value);
73b437c8 3551 }
1ba5c669 3552 dont_optimize_invert = TRUE;
ffc61ed2 3553 Perl_sv_catpvf(aTHX_ listsv, "!utf8::IsLower\n");
73b437c8
JH
3554 break;
3555 case ANYOF_PRINT:
3556 if (LOC)
936ed897 3557 ANYOF_CLASS_SET(ret, ANYOF_PRINT);
73b437c8
JH
3558 else {
3559 for (value = 0; value < 256; value++)
3560 if (isPRINT(value))
936ed897 3561 ANYOF_BITMAP_SET(ret, value);
73b437c8 3562 }
1ba5c669 3563 dont_optimize_invert = TRUE;
ffc61ed2 3564 Perl_sv_catpvf(aTHX_ listsv, "+utf8::IsPrint\n");
73b437c8
JH
3565 break;
3566 case ANYOF_NPRINT:
3567 if (LOC)
936ed897 3568 ANYOF_CLASS_SET(ret, ANYOF_NPRINT);
73b437c8
JH
3569 else {
3570 for (value = 0; value < 256; value++)
3571 if (!isPRINT(value))
936ed897 3572 ANYOF_BITMAP_SET(ret, value);
73b437c8 3573 }
1ba5c669 3574 dont_optimize_invert = TRUE;
ffc61ed2 3575 Perl_sv_catpvf(aTHX_ listsv, "!utf8::IsPrint\n");
73b437c8 3576 break;
aaa51d5e
JF
3577 case ANYOF_PSXSPC:
3578 if (LOC)
3579 ANYOF_CLASS_SET(ret, ANYOF_PSXSPC);
3580 else {
3581 for (value = 0; value < 256; value++)
3582 if (isPSXSPC(value))
3583 ANYOF_BITMAP_SET(ret, value);
3584 }
1ba5c669 3585 dont_optimize_invert = TRUE;
ffc61ed2 3586 Perl_sv_catpvf(aTHX_ listsv, "+utf8::IsSpace\n");
aaa51d5e
JF
3587 break;
3588 case ANYOF_NPSXSPC:
3589 if (LOC)
3590 ANYOF_CLASS_SET(ret, ANYOF_NPSXSPC);
3591 else {
3592 for (value = 0; value < 256; value++)
3593 if (!isPSXSPC(value))
3594 ANYOF_BITMAP_SET(ret, value);
3595 }
1ba5c669 3596 dont_optimize_invert = TRUE;
ffc61ed2 3597 Perl_sv_catpvf(aTHX_ listsv, "!utf8::IsSpace\n");
aaa51d5e 3598 break;
73b437c8
JH
3599 case ANYOF_PUNCT:
3600 if (LOC)
936ed897 3601 ANYOF_CLASS_SET(ret, ANYOF_PUNCT);
73b437c8
JH
3602 else {
3603 for (value = 0; value < 256; value++)
3604 if (isPUNCT(value))
936ed897 3605 ANYOF_BITMAP_SET(ret, value);
73b437c8 3606 }
1ba5c669 3607 dont_optimize_invert = TRUE;
ffc61ed2 3608 Perl_sv_catpvf(aTHX_ listsv, "+utf8::IsPunct\n");
73b437c8
JH
3609 break;
3610 case ANYOF_NPUNCT:
3611 if (LOC)
936ed897 3612 ANYOF_CLASS_SET(ret, ANYOF_NPUNCT);
73b437c8
JH
3613 else {
3614 for (value = 0; value < 256; value++)
3615 if (!isPUNCT(value))
936ed897 3616 ANYOF_BITMAP_SET(ret, value);
73b437c8 3617 }
1ba5c669 3618 dont_optimize_invert = TRUE;
ffc61ed2
JH
3619 Perl_sv_catpvf(aTHX_ listsv, "!utf8::IsPunct\n");
3620 break;
3621 case ANYOF_SPACE:
3622 if (LOC)
3623 ANYOF_CLASS_SET(ret, ANYOF_SPACE);
3624 else {
3625 for (value = 0; value < 256; value++)
3626 if (isSPACE(value))
3627 ANYOF_BITMAP_SET(ret, value);
3628 }
1ba5c669 3629 dont_optimize_invert = TRUE;
ffc61ed2
JH
3630 Perl_sv_catpvf(aTHX_ listsv, "+utf8::IsSpacePerl\n");
3631 break;
3632 case ANYOF_NSPACE:
3633 if (LOC)
3634 ANYOF_CLASS_SET(ret, ANYOF_NSPACE);
3635 else {
3636 for (value = 0; value < 256; value++)
3637 if (!isSPACE(value))
3638 ANYOF_BITMAP_SET(ret, value);
3639 }
1ba5c669 3640 dont_optimize_invert = TRUE;
ffc61ed2 3641 Perl_sv_catpvf(aTHX_ listsv, "!utf8::IsSpacePerl\n");
73b437c8
JH
3642 break;
3643 case ANYOF_UPPER:
3644 if (LOC)
936ed897 3645 ANYOF_CLASS_SET(ret, ANYOF_UPPER);
73b437c8
JH
3646 else {
3647 for (value = 0; value < 256; value++)
3648 if (isUPPER(value))
936ed897 3649 ANYOF_BITMAP_SET(ret, value);
73b437c8 3650 }
1ba5c669 3651 dont_optimize_invert = TRUE;
ffc61ed2 3652 Perl_sv_catpvf(aTHX_ listsv, "+utf8::IsUpper\n");
73b437c8
JH
3653 break;
3654 case ANYOF_NUPPER:
3655 if (LOC)
936ed897 3656 ANYOF_CLASS_SET(ret, ANYOF_NUPPER);
73b437c8
JH
3657 else {
3658 for (value = 0; value < 256; value++)
3659 if (!isUPPER(value))
936ed897 3660 ANYOF_BITMAP_SET(ret, value);
73b437c8 3661 }
1ba5c669 3662 dont_optimize_invert = TRUE;
ffc61ed2 3663 Perl_sv_catpvf(aTHX_ listsv, "!utf8::IsUpper\n");
73b437c8
JH
3664 break;
3665 case ANYOF_XDIGIT:
3666 if (LOC)
936ed897 3667 ANYOF_CLASS_SET(ret, ANYOF_XDIGIT);
73b437c8
JH
3668 else {
3669 for (value = 0; value < 256; value++)
3670 if (isXDIGIT(value))
936ed897 3671 ANYOF_BITMAP_SET(ret, value);
73b437c8 3672 }
1ba5c669 3673 dont_optimize_invert = TRUE;
ffc61ed2 3674 Perl_sv_catpvf(aTHX_ listsv, "+utf8::IsXDigit\n");
73b437c8
JH
3675 break;
3676 case ANYOF_NXDIGIT:
3677 if (LOC)
936ed897 3678 ANYOF_CLASS_SET(ret, ANYOF_NXDIGIT);
73b437c8
JH
3679 else {
3680 for (value = 0; value < 256; value++)
3681 if (!isXDIGIT(value))
936ed897 3682 ANYOF_BITMAP_SET(ret, value);
73b437c8 3683 }
1ba5c669 3684 dont_optimize_invert = TRUE;
ffc61ed2 3685 Perl_sv_catpvf(aTHX_ listsv, "!utf8::IsXDigit\n");
73b437c8
JH
3686 break;
3687 default:
b45f050a 3688 vFAIL("Invalid [::] class");
73b437c8 3689 break;
b8c5462f 3690 }
b8c5462f 3691 if (LOC)
936ed897 3692 ANYOF_FLAGS(ret) |= ANYOF_CLASS;
73b437c8 3693 continue;
a0d0e21e 3694 }
ffc61ed2
JH
3695 } /* end of namedclass \blah */
3696
a0d0e21e 3697 if (range) {
ffbc6a93
JH
3698 if (((lastvalue > value) && !(PL_hints & HINT_RE_ASCIIR)) ||
3699 ((NATIVE_TO_UNI(lastvalue) > NATIVE_TO_UNI(value)) && (PL_hints & HINT_RE_ASCIIR))) /* b-a */ {
b45f050a 3700 Simple_vFAIL4("Invalid [] range \"%*.*s\"",
830247a4
IZ
3701 RExC_parse - rangebegin,
3702 RExC_parse - rangebegin,
b45f050a 3703 rangebegin);
73b437c8 3704 }
ffc61ed2 3705 range = 0; /* not a true range */
a0d0e21e
LW
3706 }
3707 else {
ffc61ed2 3708 lastvalue = value; /* save the beginning of the range */
830247a4
IZ
3709 if (*RExC_parse == '-' && RExC_parse+1 < RExC_end &&
3710 RExC_parse[1] != ']') {
3711 RExC_parse++;
ffc61ed2
JH
3712
3713 /* a bad range like \w-, [:word:]- ? */
3714 if (namedclass > OOB_NAMEDCLASS) {
e476b1b5 3715 if (ckWARN(WARN_REGEXP))
830247a4 3716 vWARN4(RExC_parse,
b45f050a 3717 "False [] range \"%*.*s\"",
830247a4
IZ
3718 RExC_parse - rangebegin,
3719 RExC_parse - rangebegin,
b45f050a 3720 rangebegin);
73b437c8 3721 if (!SIZE_ONLY)
936ed897 3722 ANYOF_BITMAP_SET(ret, '-');
73b437c8 3723 } else
ffc61ed2
JH
3724 range = 1; /* yeah, it's a range! */
3725 continue; /* but do it the next time */
a0d0e21e 3726 }
a687059c 3727 }
ffc61ed2 3728
93733859 3729 /* now is the next time */
ae5c130c 3730 if (!SIZE_ONLY) {
ffc61ed2 3731 if (lastvalue < 256 && value < 256) {
c7f1f016 3732#ifdef EBCDIC /* EBCDIC, for example. */
ffbc6a93
JH
3733 if (PL_hints & HINT_RE_ASCIIR) {
3734 IV i;
3735 /* New style scheme for ranges:
3736 * after :
3737 * use re 'asciir';
3738 * do ranges in ASCII/Unicode space
3739 */
3740 for (i = NATIVE_TO_ASCII(lastvalue) ; i <= NATIVE_TO_ASCII(value); i++)
3741 ANYOF_BITMAP_SET(ret, ASCII_TO_NATIVE(i));
3742 }
3743 else if ((isLOWER(lastvalue) && isLOWER(value)) ||
1ba5c669 3744 (isUPPER(lastvalue) && isUPPER(value)))
ffc61ed2
JH
3745 {
3746 IV i;
3747 if (isLOWER(lastvalue)) {
3748 for (i = lastvalue; i <= value; i++)
3749 if (isLOWER(i))
3750 ANYOF_BITMAP_SET(ret, i);
3751 } else {
3752 for (i = lastvalue; i <= value; i++)
3753 if (isUPPER(i))
3754 ANYOF_BITMAP_SET(ret, i);
3755 }
8ada0baa 3756 }
ffc61ed2 3757 else
8ada0baa 3758#endif
ffc61ed2
JH
3759 for ( ; lastvalue <= value; lastvalue++)
3760 ANYOF_BITMAP_SET(ret, lastvalue);
3761 } else {
3762 ANYOF_FLAGS(ret) |= ANYOF_UNICODE;
3763 if (lastvalue < value)
3764 Perl_sv_catpvf(aTHX_ listsv, "%04"UVxf"\t%04"UVxf"\n",
3765 (UV)lastvalue, (UV)value);
3766 else
3767 Perl_sv_catpvf(aTHX_ listsv, "%04"UVxf"\n",
3768 (UV)value);
3769 }
8ada0baa 3770 }
ffc61ed2
JH
3771
3772 range = 0; /* this range (if it was one) is done now */
a0d0e21e 3773 }
ffc61ed2 3774
936ed897
IZ
3775 if (need_class) {
3776 if (SIZE_ONLY)
830247a4 3777 RExC_size += ANYOF_CLASS_ADD_SKIP;
936ed897 3778 else
830247a4 3779 RExC_emit += ANYOF_CLASS_ADD_SKIP;
936ed897 3780 }
ffc61ed2 3781
ae5c130c 3782 /* optimize case-insensitive simple patterns (e.g. /[a-z]/i) */
b8c5462f 3783 if (!SIZE_ONLY &&
ffc61ed2
JH
3784 (ANYOF_FLAGS(ret) &
3785 /* If the only flag is folding (plus possibly inversion). */
3786 (ANYOF_FLAGS_ALL ^ ANYOF_INVERT) == ANYOF_FOLD)) {
a0ed51b3 3787 for (value = 0; value < 256; ++value) {
936ed897 3788 if (ANYOF_BITMAP_TEST(ret, value)) {
ffc61ed2
JH
3789 IV fold = PL_fold[value];
3790
3791 if (fold != value)
3792 ANYOF_BITMAP_SET(ret, fold);
ae5c130c
GS
3793 }
3794 }
936ed897 3795 ANYOF_FLAGS(ret) &= ~ANYOF_FOLD;
ae5c130c 3796 }
ffc61ed2 3797
ae5c130c 3798 /* optimize inverted simple patterns (e.g. [^a-z]) */
1aa99e6b 3799 if (!SIZE_ONLY && !dont_optimize_invert &&
ffc61ed2
JH
3800 /* If the only flag is inversion. */
3801 (ANYOF_FLAGS(ret) & ANYOF_FLAGS_ALL) == ANYOF_INVERT) {
b8c5462f 3802 for (value = 0; value < ANYOF_BITMAP_SIZE; ++value)
936ed897 3803 ANYOF_BITMAP(ret)[value] ^= ANYOF_FLAGS_ALL;
1aa99e6b 3804 ANYOF_FLAGS(ret) = ANYOF_UNICODE_ALL;
ae5c130c 3805 }
a0d0e21e 3806
b81d288d 3807 if (!SIZE_ONLY) {
fde631ed 3808 AV *av = newAV();
ffc61ed2
JH
3809 SV *rv;
3810
3811 av_store(av, 0, listsv);
3812 av_store(av, 1, NULL);
3813 rv = newRV_noinc((SV*)av);
19860706 3814 n = add_data(pRExC_state, 1, "s");
830247a4 3815 RExC_rx->data->data[n] = (void*)rv;
ffc61ed2 3816 ARG_SET(ret, n);
a0ed51b3
LW
3817 }
3818
3819 return ret;
3820}
3821
76e3520e 3822STATIC char*
830247a4 3823S_nextchar(pTHX_ RExC_state_t *pRExC_state)
a0d0e21e 3824{
830247a4 3825 char* retval = RExC_parse++;
a0d0e21e 3826
4633a7c4 3827 for (;;) {
830247a4
IZ
3828 if (*RExC_parse == '(' && RExC_parse[1] == '?' &&
3829 RExC_parse[2] == '#') {
3830 while (*RExC_parse && *RExC_parse != ')')
3831 RExC_parse++;
3832 RExC_parse++;
4633a7c4
LW
3833 continue;
3834 }
830247a4
IZ
3835 if (RExC_flags16 & PMf_EXTENDED) {
3836 if (isSPACE(*RExC_parse)) {
3837 RExC_parse++;
748a9306
LW
3838 continue;
3839 }
830247a4
IZ
3840 else if (*RExC_parse == '#') {
3841 while (*RExC_parse && *RExC_parse != '\n')
3842 RExC_parse++;
3843 RExC_parse++;
748a9306
LW
3844 continue;
3845 }
748a9306 3846 }
4633a7c4 3847 return retval;
a0d0e21e 3848 }
a687059c
LW
3849}
3850
3851/*
c277df42 3852- reg_node - emit a node
a0d0e21e 3853*/
76e3520e 3854STATIC regnode * /* Location. */
830247a4 3855S_reg_node(pTHX_ RExC_state_t *pRExC_state, U8 op)
a687059c 3856{
c277df42
IZ
3857 register regnode *ret;
3858 register regnode *ptr;
a687059c 3859
830247a4 3860 ret = RExC_emit;
c277df42 3861 if (SIZE_ONLY) {
830247a4
IZ
3862 SIZE_ALIGN(RExC_size);
3863 RExC_size += 1;
a0d0e21e
LW
3864 return(ret);
3865 }
a687059c 3866
c277df42 3867 NODE_ALIGN_FILL(ret);
a0d0e21e 3868 ptr = ret;
c277df42 3869 FILL_ADVANCE_NODE(ptr, op);
830247a4 3870 RExC_emit = ptr;
a687059c 3871
a0d0e21e 3872 return(ret);
a687059c
LW
3873}
3874
3875/*
a0d0e21e
LW
3876- reganode - emit a node with an argument
3877*/
76e3520e 3878STATIC regnode * /* Location. */
830247a4 3879S_reganode(pTHX_ RExC_state_t *pRExC_state, U8 op, U32 arg)
fe14fcc3 3880{
c277df42
IZ
3881 register regnode *ret;
3882 register regnode *ptr;
fe14fcc3 3883
830247a4 3884 ret = RExC_emit;
c277df42 3885 if (SIZE_ONLY) {
830247a4
IZ
3886 SIZE_ALIGN(RExC_size);
3887 RExC_size += 2;
a0d0e21e
LW
3888 return(ret);
3889 }
fe14fcc3 3890
c277df42 3891 NODE_ALIGN_FILL(ret);
a0d0e21e 3892 ptr = ret;
c277df42 3893 FILL_ADVANCE_NODE_ARG(ptr, op, arg);
830247a4 3894 RExC_emit = ptr;
fe14fcc3 3895
a0d0e21e 3896 return(ret);
fe14fcc3
LW
3897}
3898
3899/*
cd439c50 3900- reguni - emit (if appropriate) a Unicode character
a0ed51b3
LW
3901*/
3902STATIC void
830247a4 3903S_reguni(pTHX_ RExC_state_t *pRExC_state, UV uv, char* s, STRLEN* lenp)
a0ed51b3 3904{
5e12f4fb 3905 *lenp = SIZE_ONLY ? UNISKIP(uv) : (uvchr_to_utf8((U8*)s, uv) - (U8*)s);
a0ed51b3
LW
3906}
3907
3908/*
a0d0e21e
LW
3909- reginsert - insert an operator in front of already-emitted operand
3910*
3911* Means relocating the operand.
3912*/
76e3520e 3913STATIC void
830247a4 3914S_reginsert(pTHX_ RExC_state_t *pRExC_state, U8 op, regnode *opnd)
a687059c 3915{
c277df42
IZ
3916 register regnode *src;
3917 register regnode *dst;
3918 register regnode *place;
3919 register int offset = regarglen[(U8)op];
b81d288d 3920
22c35a8c 3921/* (PL_regkind[(U8)op] == CURLY ? EXTRA_STEP_2ARGS : 0); */
c277df42
IZ
3922
3923 if (SIZE_ONLY) {
830247a4 3924 RExC_size += NODE_STEP_REGNODE + offset;
a0d0e21e
LW
3925 return;
3926 }
a687059c 3927
830247a4
IZ
3928 src = RExC_emit;
3929 RExC_emit += NODE_STEP_REGNODE + offset;
3930 dst = RExC_emit;
a0d0e21e 3931 while (src > opnd)
c277df42 3932 StructCopy(--src, --dst, regnode);
a0d0e21e
LW
3933
3934 place = opnd; /* Op node, where operand used to be. */
c277df42
IZ
3935 src = NEXTOPER(place);
3936 FILL_ADVANCE_NODE(place, op);
3937 Zero(src, offset, regnode);
a687059c
LW
3938}
3939
3940/*
c277df42 3941- regtail - set the next-pointer at the end of a node chain of p to val.
a0d0e21e 3942*/
76e3520e 3943STATIC void
830247a4 3944S_regtail(pTHX_ RExC_state_t *pRExC_state, regnode *p, regnode *val)
a687059c 3945{
c277df42
IZ
3946 register regnode *scan;
3947 register regnode *temp;
a0d0e21e 3948
c277df42 3949 if (SIZE_ONLY)
a0d0e21e
LW
3950 return;
3951
3952 /* Find last node. */
3953 scan = p;
3954 for (;;) {
3955 temp = regnext(scan);
3956 if (temp == NULL)
3957 break;
3958 scan = temp;
3959 }
a687059c 3960
c277df42
IZ
3961 if (reg_off_by_arg[OP(scan)]) {
3962 ARG_SET(scan, val - scan);
a0ed51b3
LW
3963 }
3964 else {
c277df42
IZ
3965 NEXT_OFF(scan) = val - scan;
3966 }
a687059c
LW
3967}
3968
3969/*
a0d0e21e
LW
3970- regoptail - regtail on operand of first argument; nop if operandless
3971*/
76e3520e 3972STATIC void
830247a4 3973S_regoptail(pTHX_ RExC_state_t *pRExC_state, regnode *p, regnode *val)
a687059c 3974{
a0d0e21e 3975 /* "Operandless" and "op != BRANCH" are synonymous in practice. */
c277df42
IZ
3976 if (p == NULL || SIZE_ONLY)
3977 return;
22c35a8c 3978 if (PL_regkind[(U8)OP(p)] == BRANCH) {
830247a4 3979 regtail(pRExC_state, NEXTOPER(p), val);
a0ed51b3 3980 }
22c35a8c 3981 else if ( PL_regkind[(U8)OP(p)] == BRANCHJ) {
830247a4 3982 regtail(pRExC_state, NEXTOPER(NEXTOPER(p)), val);
a0ed51b3
LW
3983 }
3984 else
a0d0e21e 3985 return;
a687059c
LW
3986}
3987
3988/*
3989 - regcurly - a little FSA that accepts {\d+,?\d*}
3990 */
79072805 3991STATIC I32
cea2e8a9 3992S_regcurly(pTHX_ register char *s)
a687059c
LW
3993{
3994 if (*s++ != '{')
3995 return FALSE;
f0fcb552 3996 if (!isDIGIT(*s))
a687059c 3997 return FALSE;
f0fcb552 3998 while (isDIGIT(*s))
a687059c
LW
3999 s++;
4000 if (*s == ',')
4001 s++;
f0fcb552 4002 while (isDIGIT(*s))
a687059c
LW
4003 s++;
4004 if (*s != '}')
4005 return FALSE;
4006 return TRUE;
4007}
4008
a687059c 4009
76e3520e 4010STATIC regnode *
cea2e8a9 4011S_dumpuntil(pTHX_ regnode *start, regnode *node, regnode *last, SV* sv, I32 l)
c277df42 4012{
35ff7856 4013#ifdef DEBUGGING
f248d071 4014 register U8 op = EXACT; /* Arbitrary non-END op. */
155aba94 4015 register regnode *next;
c277df42
IZ
4016
4017 while (op != END && (!last || node < last)) {
4018 /* While that wasn't END last time... */
4019
4020 NODE_ALIGN(node);
4021 op = OP(node);
4022 if (op == CLOSE)
4023 l--;
4024 next = regnext(node);
4025 /* Where, what. */
4026 if (OP(node) == OPTIMIZED)
4027 goto after_print;
4028 regprop(sv, node);
b900a521 4029 PerlIO_printf(Perl_debug_log, "%4"IVdf":%*s%s", (IV)(node - start),
f1dbda3d 4030 (int)(2*l + 1), "", SvPVX(sv));
c277df42
IZ
4031 if (next == NULL) /* Next ptr. */
4032 PerlIO_printf(Perl_debug_log, "(0)");
b81d288d 4033 else
b900a521 4034 PerlIO_printf(Perl_debug_log, "(%"IVdf")", (IV)(next - start));
c277df42
IZ
4035 (void)PerlIO_putc(Perl_debug_log, '\n');
4036 after_print:
22c35a8c 4037 if (PL_regkind[(U8)op] == BRANCHJ) {
b81d288d
AB
4038 register regnode *nnode = (OP(next) == LONGJMP
4039 ? regnext(next)
c277df42
IZ
4040 : next);
4041 if (last && nnode > last)
4042 nnode = last;
4043 node = dumpuntil(start, NEXTOPER(NEXTOPER(node)), nnode, sv, l + 1);
a0ed51b3 4044 }
22c35a8c 4045 else if (PL_regkind[(U8)op] == BRANCH) {
c277df42 4046 node = dumpuntil(start, NEXTOPER(node), next, sv, l + 1);
a0ed51b3
LW
4047 }
4048 else if ( op == CURLY) { /* `next' might be very big: optimizer */
c277df42
IZ
4049 node = dumpuntil(start, NEXTOPER(node) + EXTRA_STEP_2ARGS,
4050 NEXTOPER(node) + EXTRA_STEP_2ARGS + 1, sv, l + 1);
a0ed51b3 4051 }
22c35a8c 4052 else if (PL_regkind[(U8)op] == CURLY && op != CURLYX) {
c277df42
IZ
4053 node = dumpuntil(start, NEXTOPER(node) + EXTRA_STEP_2ARGS,
4054 next, sv, l + 1);
a0ed51b3
LW
4055 }
4056 else if ( op == PLUS || op == STAR) {
c277df42 4057 node = dumpuntil(start, NEXTOPER(node), NEXTOPER(node) + 1, sv, l + 1);
a0ed51b3
LW
4058 }
4059 else if (op == ANYOF) {
c277df42 4060 node = NEXTOPER(node);
936ed897 4061 node += ANYOF_SKIP;
a0ed51b3 4062 }
22c35a8c 4063 else if (PL_regkind[(U8)op] == EXACT) {
c277df42 4064 /* Literal string, where present. */
cd439c50 4065 node += NODE_SZ_STR(node) - 1;
c277df42 4066 node = NEXTOPER(node);
a0ed51b3
LW
4067 }
4068 else {
c277df42
IZ
4069 node = NEXTOPER(node);
4070 node += regarglen[(U8)op];
4071 }
4072 if (op == CURLYX || op == OPEN)
4073 l++;
4074 else if (op == WHILEM)
4075 l--;
4076 }
17c3b450 4077#endif /* DEBUGGING */
c277df42
IZ
4078 return node;
4079}
4080
a687059c 4081/*
fd181c75 4082 - regdump - dump a regexp onto Perl_debug_log in vaguely comprehensible form
a687059c
LW
4083 */
4084void
864dbfa3 4085Perl_regdump(pTHX_ regexp *r)
a687059c 4086{
35ff7856 4087#ifdef DEBUGGING
46fc3d4c 4088 SV *sv = sv_newmortal();
a687059c 4089
c277df42 4090 (void)dumpuntil(r->program, r->program + 1, NULL, sv, 0);
a0d0e21e
LW
4091
4092 /* Header fields of interest. */
c277df42 4093 if (r->anchored_substr)
7b0972df 4094 PerlIO_printf(Perl_debug_log,
b81d288d 4095 "anchored `%s%.*s%s'%s at %"IVdf" ",
3280af22 4096 PL_colors[0],
7b0972df 4097 (int)(SvCUR(r->anchored_substr) - (SvTAIL(r->anchored_substr)!=0)),
b81d288d 4098 SvPVX(r->anchored_substr),
3280af22 4099 PL_colors[1],
c277df42 4100 SvTAIL(r->anchored_substr) ? "$" : "",
7b0972df 4101 (IV)r->anchored_offset);
c277df42 4102 if (r->float_substr)
7b0972df 4103 PerlIO_printf(Perl_debug_log,
b81d288d 4104 "floating `%s%.*s%s'%s at %"IVdf"..%"UVuf" ",
3280af22 4105 PL_colors[0],
b81d288d 4106 (int)(SvCUR(r->float_substr) - (SvTAIL(r->float_substr)!=0)),
2c2d71f5 4107 SvPVX(r->float_substr),
3280af22 4108 PL_colors[1],
c277df42 4109 SvTAIL(r->float_substr) ? "$" : "",
7b0972df 4110 (IV)r->float_min_offset, (UV)r->float_max_offset);
c277df42 4111 if (r->check_substr)
b81d288d
AB
4112 PerlIO_printf(Perl_debug_log,
4113 r->check_substr == r->float_substr
c277df42
IZ
4114 ? "(checking floating" : "(checking anchored");
4115 if (r->reganch & ROPT_NOSCAN)
4116 PerlIO_printf(Perl_debug_log, " noscan");
4117 if (r->reganch & ROPT_CHECK_ALL)
4118 PerlIO_printf(Perl_debug_log, " isall");
4119 if (r->check_substr)
4120 PerlIO_printf(Perl_debug_log, ") ");
4121
46fc3d4c 4122 if (r->regstclass) {
4123 regprop(sv, r->regstclass);
4124 PerlIO_printf(Perl_debug_log, "stclass `%s' ", SvPVX(sv));
4125 }
774d564b 4126 if (r->reganch & ROPT_ANCH) {
4127 PerlIO_printf(Perl_debug_log, "anchored");
4128 if (r->reganch & ROPT_ANCH_BOL)
4129 PerlIO_printf(Perl_debug_log, "(BOL)");
c277df42
IZ
4130 if (r->reganch & ROPT_ANCH_MBOL)
4131 PerlIO_printf(Perl_debug_log, "(MBOL)");
cad2e5aa
JH
4132 if (r->reganch & ROPT_ANCH_SBOL)
4133 PerlIO_printf(Perl_debug_log, "(SBOL)");
774d564b 4134 if (r->reganch & ROPT_ANCH_GPOS)
4135 PerlIO_printf(Perl_debug_log, "(GPOS)");
4136 PerlIO_putc(Perl_debug_log, ' ');
4137 }
c277df42
IZ
4138 if (r->reganch & ROPT_GPOS_SEEN)
4139 PerlIO_printf(Perl_debug_log, "GPOS ");
a0d0e21e 4140 if (r->reganch & ROPT_SKIP)
760ac839 4141 PerlIO_printf(Perl_debug_log, "plus ");
a0d0e21e 4142 if (r->reganch & ROPT_IMPLICIT)
760ac839 4143 PerlIO_printf(Perl_debug_log, "implicit ");
760ac839 4144 PerlIO_printf(Perl_debug_log, "minlen %ld ", (long) r->minlen);
ce862d02
IZ
4145 if (r->reganch & ROPT_EVAL_SEEN)
4146 PerlIO_printf(Perl_debug_log, "with eval ");
760ac839 4147 PerlIO_printf(Perl_debug_log, "\n");
17c3b450 4148#endif /* DEBUGGING */
a687059c
LW
4149}
4150
653099ff
GS
4151STATIC void
4152S_put_byte(pTHX_ SV *sv, int c)
4153{
ffc61ed2 4154 if (isCNTRL(c) || c == 127 || c == 255 || !isPRINT(c))
653099ff
GS
4155 Perl_sv_catpvf(aTHX_ sv, "\\%o", c);
4156 else if (c == '-' || c == ']' || c == '\\' || c == '^')
4157 Perl_sv_catpvf(aTHX_ sv, "\\%c", c);
4158 else
4159 Perl_sv_catpvf(aTHX_ sv, "%c", c);
4160}
4161
a687059c 4162/*
a0d0e21e
LW
4163- regprop - printable representation of opcode
4164*/
46fc3d4c 4165void
864dbfa3 4166Perl_regprop(pTHX_ SV *sv, regnode *o)
a687059c 4167{
35ff7856 4168#ifdef DEBUGGING
9b155405 4169 register int k;
a0d0e21e 4170
54dc92de 4171 sv_setpvn(sv, "", 0);
9b155405 4172 if (OP(o) >= reg_num) /* regnode.type is unsigned */
830247a4
IZ
4173 /* It would be nice to FAIL() here, but this may be called from
4174 regexec.c, and it would be hard to supply pRExC_state. */
4175 Perl_croak(aTHX_ "Corrupted regexp opcode");
9b155405
IZ
4176 sv_catpv(sv, (char*)reg_name[OP(o)]); /* Take off const! */
4177
4178 k = PL_regkind[(U8)OP(o)];
4179
4180 if (k == EXACT)
7821416a 4181 Perl_sv_catpvf(aTHX_ sv, " <%s%.*s%s>", PL_colors[0],
cd439c50 4182 STR_LEN(o), STRING(o), PL_colors[1]);
9b155405 4183 else if (k == CURLY) {
cb434fcc 4184 if (OP(o) == CURLYM || OP(o) == CURLYN || OP(o) == CURLYX)
cea2e8a9
GS
4185 Perl_sv_catpvf(aTHX_ sv, "[%d]", o->flags); /* Parenth number */
4186 Perl_sv_catpvf(aTHX_ sv, " {%d,%d}", ARG1(o), ARG2(o));
a0d0e21e 4187 }
2c2d71f5
JH
4188 else if (k == WHILEM && o->flags) /* Ordinal/of */
4189 Perl_sv_catpvf(aTHX_ sv, "[%d/%d]", o->flags & 0xf, o->flags>>4);
9b155405 4190 else if (k == REF || k == OPEN || k == CLOSE || k == GROUPP )
894356b3 4191 Perl_sv_catpvf(aTHX_ sv, "%d", (int)ARG(o)); /* Parenth number */
9b155405 4192 else if (k == LOGICAL)
04ebc1ab 4193 Perl_sv_catpvf(aTHX_ sv, "[%d]", o->flags); /* 2: embedded, otherwise 1 */
653099ff
GS
4194 else if (k == ANYOF) {
4195 int i, rangestart = -1;
ffc61ed2 4196 U8 flags = ANYOF_FLAGS(o);
19860706
JH
4197 const char * const anyofs[] = { /* Should be syncronized with
4198 * ANYOF_ #xdefines in regcomp.h */
653099ff
GS
4199 "\\w",
4200 "\\W",
4201 "\\s",
4202 "\\S",
4203 "\\d",
4204 "\\D",
4205 "[:alnum:]",
4206 "[:^alnum:]",
4207 "[:alpha:]",
4208 "[:^alpha:]",
4209 "[:ascii:]",
4210 "[:^ascii:]",
4211 "[:ctrl:]",
4212 "[:^ctrl:]",
4213 "[:graph:]",
4214 "[:^graph:]",
4215 "[:lower:]",
4216 "[:^lower:]",
4217 "[:print:]",
4218 "[:^print:]",
4219 "[:punct:]",
4220 "[:^punct:]",
4221 "[:upper:]",
aaa51d5e 4222 "[:^upper:]",
653099ff 4223 "[:xdigit:]",
aaa51d5e
JF
4224 "[:^xdigit:]",
4225 "[:space:]",
4226 "[:^space:]",
4227 "[:blank:]",
4228 "[:^blank:]"
653099ff
GS
4229 };
4230
19860706 4231 if (flags & ANYOF_LOCALE)
653099ff 4232 sv_catpv(sv, "{loc}");
19860706 4233 if (flags & ANYOF_FOLD)
653099ff
GS
4234 sv_catpv(sv, "{i}");
4235 Perl_sv_catpvf(aTHX_ sv, "[%s", PL_colors[0]);
19860706 4236 if (flags & ANYOF_INVERT)
653099ff 4237 sv_catpv(sv, "^");
ffc61ed2
JH
4238 for (i = 0; i <= 256; i++) {
4239 if (i < 256 && ANYOF_BITMAP_TEST(o,i)) {
4240 if (rangestart == -1)
4241 rangestart = i;
4242 } else if (rangestart != -1) {
4243 if (i <= rangestart + 3)
4244 for (; rangestart < i; rangestart++)
653099ff 4245 put_byte(sv, rangestart);
ffc61ed2
JH
4246 else {
4247 put_byte(sv, rangestart);
4248 sv_catpv(sv, "-");
4249 put_byte(sv, i - 1);
653099ff 4250 }
ffc61ed2 4251 rangestart = -1;
653099ff 4252 }
847a199f 4253 }
ffc61ed2
JH
4254
4255 if (o->flags & ANYOF_CLASS)
4256 for (i = 0; i < sizeof(anyofs)/sizeof(char*); i++)
4257 if (ANYOF_CLASS_TEST(o,i))
4258 sv_catpv(sv, anyofs[i]);
4259
4260 if (flags & ANYOF_UNICODE)
4261 sv_catpv(sv, "{unicode}");
1aa99e6b
IH
4262 else if (flags & ANYOF_UNICODE_ALL)
4263 sv_catpv(sv, "{all-unicode}");
ffc61ed2
JH
4264
4265 {
4266 SV *lv;
4267 SV *sw = regclass_swash(o, FALSE, &lv);
b81d288d 4268
ffc61ed2
JH
4269 if (lv) {
4270 if (sw) {
4271 UV i;
4272 U8 s[UTF8_MAXLEN+1];
b81d288d 4273
ffc61ed2 4274 for (i = 0; i <= 256; i++) { /* just the first 256 */
2b9d42f0 4275 U8 *e = uvchr_to_utf8(s, i);
ffc61ed2
JH
4276
4277 if (i < 256 && swash_fetch(sw, s)) {
4278 if (rangestart == -1)
4279 rangestart = i;
4280 } else if (rangestart != -1) {
4281 U8 *p;
b81d288d 4282
ffc61ed2
JH
4283 if (i <= rangestart + 3)
4284 for (; rangestart < i; rangestart++) {
2b9d42f0 4285 for(e = uvchr_to_utf8(s, rangestart), p = s; p < e; p++)
ffc61ed2
JH
4286 put_byte(sv, *p);
4287 }
4288 else {
2b9d42f0 4289 for (e = uvchr_to_utf8(s, rangestart), p = s; p < e; p++)
ffc61ed2
JH
4290 put_byte(sv, *p);
4291 sv_catpv(sv, "-");
2b9d42f0 4292 for (e = uvchr_to_utf8(s, i - 1), p = s; p < e; p++)
ffc61ed2
JH
4293 put_byte(sv, *p);
4294 }
4295 rangestart = -1;
4296 }
19860706 4297 }
ffc61ed2
JH
4298
4299 sv_catpv(sv, "..."); /* et cetera */
19860706 4300 }
fde631ed 4301
ffc61ed2
JH
4302 {
4303 char *s = savepv(SvPVX(lv));
4304 char *origs = s;
b81d288d 4305
ffc61ed2 4306 while(*s && *s != '\n') s++;
b81d288d 4307
ffc61ed2
JH
4308 if (*s == '\n') {
4309 char *t = ++s;
4310
4311 while (*s) {
4312 if (*s == '\n')
4313 *s = ' ';
4314 s++;
4315 }
4316 if (s[-1] == ' ')
4317 s[-1] = 0;
4318
4319 sv_catpv(sv, t);
fde631ed 4320 }
b81d288d 4321
ffc61ed2 4322 Safefree(origs);
fde631ed
JH
4323 }
4324 }
653099ff 4325 }
ffc61ed2 4326
653099ff
GS
4327 Perl_sv_catpvf(aTHX_ sv, "%s]", PL_colors[1]);
4328 }
9b155405 4329 else if (k == BRANCHJ && (OP(o) == UNLESSM || OP(o) == IFMATCH))
cea2e8a9 4330 Perl_sv_catpvf(aTHX_ sv, "[-%d]", o->flags);
17c3b450 4331#endif /* DEBUGGING */
35ff7856 4332}
a687059c 4333
cad2e5aa
JH
4334SV *
4335Perl_re_intuit_string(pTHX_ regexp *prog)
4336{ /* Assume that RE_INTUIT is set */
4337 DEBUG_r(
4338 { STRLEN n_a;
4339 char *s = SvPV(prog->check_substr,n_a);
4340
4341 if (!PL_colorset) reginitcolors();
4342 PerlIO_printf(Perl_debug_log,
4343 "%sUsing REx substr:%s `%s%.60s%s%s'\n",
4344 PL_colors[4],PL_colors[5],PL_colors[0],
4345 s,
4346 PL_colors[1],
4347 (strlen(s) > 60 ? "..." : ""));
4348 } );
4349
4350 return prog->check_substr;
4351}
4352
2b69d0c2 4353void
864dbfa3 4354Perl_pregfree(pTHX_ struct regexp *r)
a687059c 4355{
adac82c7 4356 DEBUG_r(if (!PL_colorset) reginitcolors());
7821416a
IZ
4357
4358 if (!r || (--r->refcnt > 0))
4359 return;
cad2e5aa
JH
4360 DEBUG_r(PerlIO_printf(Perl_debug_log,
4361 "%sFreeing REx:%s `%s%.60s%s%s'\n",
4362 PL_colors[4],PL_colors[5],PL_colors[0],
4363 r->precomp,
4364 PL_colors[1],
4365 (strlen(r->precomp) > 60 ? "..." : "")));
4366
c277df42 4367 if (r->precomp)
a0d0e21e 4368 Safefree(r->precomp);
cf93c79d
IZ
4369 if (RX_MATCH_COPIED(r))
4370 Safefree(r->subbeg);
a193d654
GS
4371 if (r->substrs) {
4372 if (r->anchored_substr)
4373 SvREFCNT_dec(r->anchored_substr);
4374 if (r->float_substr)
4375 SvREFCNT_dec(r->float_substr);
2779dcf1 4376 Safefree(r->substrs);
a193d654 4377 }
c277df42
IZ
4378 if (r->data) {
4379 int n = r->data->count;
dfad63ad
HS
4380 AV* new_comppad = NULL;
4381 AV* old_comppad;
4382 SV** old_curpad;
4383
c277df42
IZ
4384 while (--n >= 0) {
4385 switch (r->data->what[n]) {
4386 case 's':
4387 SvREFCNT_dec((SV*)r->data->data[n]);
4388 break;
653099ff
GS
4389 case 'f':
4390 Safefree(r->data->data[n]);
4391 break;
dfad63ad
HS
4392 case 'p':
4393 new_comppad = (AV*)r->data->data[n];
4394 break;
c277df42 4395 case 'o':
dfad63ad 4396 if (new_comppad == NULL)
cea2e8a9 4397 Perl_croak(aTHX_ "panic: pregfree comppad");
dfad63ad
HS
4398 old_comppad = PL_comppad;
4399 old_curpad = PL_curpad;
1e6dc0b6
SB
4400 /* Watch out for global destruction's random ordering. */
4401 if (SvTYPE(new_comppad) == SVt_PVAV) {
4402 PL_comppad = new_comppad;
4403 PL_curpad = AvARRAY(new_comppad);
4404 }
4405 else
4406 PL_curpad = NULL;
c277df42 4407 op_free((OP_4tree*)r->data->data[n]);
dfad63ad
HS
4408 PL_comppad = old_comppad;
4409 PL_curpad = old_curpad;
4410 SvREFCNT_dec((SV*)new_comppad);
4411 new_comppad = NULL;
c277df42
IZ
4412 break;
4413 case 'n':
4414 break;
4415 default:
830247a4 4416 Perl_croak(aTHX_ "panic: regfree data code '%c'", r->data->what[n]);
c277df42
IZ
4417 }
4418 }
4419 Safefree(r->data->what);
4420 Safefree(r->data);
a0d0e21e
LW
4421 }
4422 Safefree(r->startp);
4423 Safefree(r->endp);
4424 Safefree(r);
a687059c 4425}
c277df42
IZ
4426
4427/*
4428 - regnext - dig the "next" pointer out of a node
4429 *
4430 * [Note, when REGALIGN is defined there are two places in regmatch()
4431 * that bypass this code for speed.]
4432 */
4433regnode *
864dbfa3 4434Perl_regnext(pTHX_ register regnode *p)
c277df42
IZ
4435{
4436 register I32 offset;
4437
3280af22 4438 if (p == &PL_regdummy)
c277df42
IZ
4439 return(NULL);
4440
4441 offset = (reg_off_by_arg[OP(p)] ? ARG(p) : NEXT_OFF(p));
4442 if (offset == 0)
4443 return(NULL);
4444
c277df42 4445 return(p+offset);
c277df42
IZ
4446}
4447
01f988be 4448STATIC void
cea2e8a9 4449S_re_croak2(pTHX_ const char* pat1,const char* pat2,...)
c277df42
IZ
4450{
4451 va_list args;
4452 STRLEN l1 = strlen(pat1);
4453 STRLEN l2 = strlen(pat2);
4454 char buf[512];
06bf62c7 4455 SV *msv;
c277df42
IZ
4456 char *message;
4457
4458 if (l1 > 510)
4459 l1 = 510;
4460 if (l1 + l2 > 510)
4461 l2 = 510 - l1;
4462 Copy(pat1, buf, l1 , char);
4463 Copy(pat2, buf + l1, l2 , char);
3b818b81
GS
4464 buf[l1 + l2] = '\n';
4465 buf[l1 + l2 + 1] = '\0';
8736538c
AS
4466#ifdef I_STDARG
4467 /* ANSI variant takes additional second argument */
c277df42 4468 va_start(args, pat2);
8736538c
AS
4469#else
4470 va_start(args);
4471#endif
5a844595 4472 msv = vmess(buf, &args);
c277df42 4473 va_end(args);
06bf62c7 4474 message = SvPV(msv,l1);
c277df42
IZ
4475 if (l1 > 512)
4476 l1 = 512;
4477 Copy(message, buf, l1 , char);
4478 buf[l1] = '\0'; /* Overwrite \n */
cea2e8a9 4479 Perl_croak(aTHX_ "%s", buf);
c277df42 4480}
a0ed51b3
LW
4481
4482/* XXX Here's a total kludge. But we need to re-enter for swash routines. */
4483
4484void
864dbfa3 4485Perl_save_re_context(pTHX)
b81d288d 4486{
830247a4
IZ
4487#if 0
4488 SAVEPPTR(RExC_precomp); /* uncompiled string. */
4489 SAVEI32(RExC_npar); /* () count. */
4490 SAVEI32(RExC_size); /* Code size. */
4491 SAVEI16(RExC_flags16); /* are we folding, multilining? */
4492 SAVEVPTR(RExC_rx); /* from regcomp.c */
4493 SAVEI32(RExC_seen); /* from regcomp.c */
4494 SAVEI32(RExC_sawback); /* Did we see \1, ...? */
4495 SAVEI32(RExC_naughty); /* How bad is this pattern? */
4496 SAVEVPTR(RExC_emit); /* Code-emit pointer; &regdummy = don't */
4497 SAVEPPTR(RExC_end); /* End of input for compile */
4498 SAVEPPTR(RExC_parse); /* Input-scan pointer. */
4499#endif
4500
4501 SAVEI32(PL_reg_flags); /* from regexec.c */
a0ed51b3 4502 SAVEPPTR(PL_bostr);
a0ed51b3
LW
4503 SAVEPPTR(PL_reginput); /* String-input pointer. */
4504 SAVEPPTR(PL_regbol); /* Beginning of input, for ^ check. */
4505 SAVEPPTR(PL_regeol); /* End of input, for $ check. */
7766f137
GS
4506 SAVEVPTR(PL_regstartp); /* Pointer to startp array. */
4507 SAVEVPTR(PL_regendp); /* Ditto for endp. */
4508 SAVEVPTR(PL_reglastparen); /* Similarly for lastparen. */
a0ed51b3 4509 SAVEPPTR(PL_regtill); /* How far we are required to go. */
e8347627 4510 SAVEI8(PL_regprev); /* char before regbol, \n if none */
b81d288d 4511 SAVEGENERICPV(PL_reg_start_tmp); /* from regexec.c */
a0ed51b3 4512 PL_reg_start_tmp = 0;
a0ed51b3
LW
4513 SAVEI32(PL_reg_start_tmpl); /* from regexec.c */
4514 PL_reg_start_tmpl = 0;
7766f137 4515 SAVEVPTR(PL_regdata);
a0ed51b3
LW
4516 SAVEI32(PL_reg_eval_set); /* from regexec.c */
4517 SAVEI32(PL_regnarrate); /* from regexec.c */
7766f137 4518 SAVEVPTR(PL_regprogram); /* from regexec.c */
a0ed51b3 4519 SAVEINT(PL_regindent); /* from regexec.c */
7766f137
GS
4520 SAVEVPTR(PL_regcc); /* from regexec.c */
4521 SAVEVPTR(PL_curcop);
7766f137
GS
4522 SAVEVPTR(PL_reg_call_cc); /* from regexec.c */
4523 SAVEVPTR(PL_reg_re); /* from regexec.c */
54b6e2fa
IZ
4524 SAVEPPTR(PL_reg_ganch); /* from regexec.c */
4525 SAVESPTR(PL_reg_sv); /* from regexec.c */
7766f137 4526 SAVEVPTR(PL_reg_magic); /* from regexec.c */
54b6e2fa 4527 SAVEI32(PL_reg_oldpos); /* from regexec.c */
7766f137
GS
4528 SAVEVPTR(PL_reg_oldcurpm); /* from regexec.c */
4529 SAVEVPTR(PL_reg_curpm); /* from regexec.c */
5fb7366e 4530 SAVEI32(PL_regnpar); /* () count. */
54b6e2fa 4531#ifdef DEBUGGING
b81d288d 4532 SAVEPPTR(PL_reg_starttry); /* from regexec.c */
54b6e2fa 4533#endif
a0ed51b3 4534}
51371543
GS
4535
4536#ifdef PERL_OBJECT
51371543
GS
4537#include "XSUB.h"
4538#undef this
4539#define this pPerl
4540#endif
4541
4542static void
4543clear_re(pTHXo_ void *r)
4544{
4545 ReREFCNT_dec((regexp *)r);
4546}
ffbc6a93 4547