This is a live mirror of the Perl 5 development currently hosted at https://github.com/perl/perl5
Re: Smoke 13191 /pro/3gl/CPAN/perl-current
[perl5.git] / regcomp.c
CommitLineData
a0d0e21e
LW
1/* regcomp.c
2 */
3
4/*
5 * "A fair jaw-cracker dwarf-language must be." --Samwise Gamgee
6 */
7
a687059c
LW
8/* NOTE: this is derived from Henry Spencer's regexp code, and should not
9 * confused with the original package (see point 3 below). Thanks, Henry!
10 */
11
12/* Additional note: this code is very heavily munged from Henry's version
13 * in places. In some spots I've traded clarity for efficiency, so don't
14 * blame Henry for some of the lack of readability.
15 */
16
e50aee73
AD
17/* The names of the functions have been changed from regcomp and
18 * regexec to pregcomp and pregexec in order to avoid conflicts
19 * with the POSIX routines of the same names.
20*/
21
b9d5759e
AD
22#ifdef PERL_EXT_RE_BUILD
23/* need to replace pregcomp et al, so enable that */
24# ifndef PERL_IN_XSUB_RE
25# define PERL_IN_XSUB_RE
26# endif
27/* need access to debugger hooks */
cad2e5aa 28# if defined(PERL_EXT_RE_DEBUG) && !defined(DEBUGGING)
b9d5759e
AD
29# define DEBUGGING
30# endif
31#endif
32
33#ifdef PERL_IN_XSUB_RE
d06ea78c 34/* We *really* need to overwrite these symbols: */
56953603
IZ
35# define Perl_pregcomp my_regcomp
36# define Perl_regdump my_regdump
37# define Perl_regprop my_regprop
d06ea78c 38# define Perl_pregfree my_regfree
cad2e5aa
JH
39# define Perl_re_intuit_string my_re_intuit_string
40/* *These* symbols are masked to allow static link. */
d06ea78c 41# define Perl_regnext my_regnext
f0b8d043 42# define Perl_save_re_context my_save_re_context
b81d288d 43# define Perl_reginitcolors my_reginitcolors
c5be433b
GS
44
45# define PERL_NO_GET_CONTEXT
b81d288d 46#endif
56953603 47
f0fcb552 48/*SUPPRESS 112*/
a687059c 49/*
e50aee73 50 * pregcomp and pregexec -- regsub and regerror are not used in perl
a687059c
LW
51 *
52 * Copyright (c) 1986 by University of Toronto.
53 * Written by Henry Spencer. Not derived from licensed software.
54 *
55 * Permission is granted to anyone to use this software for any
56 * purpose on any computer system, and to redistribute it freely,
57 * subject to the following restrictions:
58 *
59 * 1. The author is not responsible for the consequences of use of
60 * this software, no matter how awful, even if they arise
61 * from defects in it.
62 *
63 * 2. The origin of this software must not be misrepresented, either
64 * by explicit claim or by omission.
65 *
66 * 3. Altered versions must be plainly marked as such, and must not
67 * be misrepresented as being the original software.
68 *
69 *
70 **** Alterations to Henry's code are...
71 ****
bc89e66f 72 **** Copyright (c) 1991-2001, Larry Wall
a687059c 73 ****
9ef589d8
LW
74 **** You may distribute under the terms of either the GNU General Public
75 **** License or the Artistic License, as specified in the README file.
76
a687059c
LW
77 *
78 * Beware that some of this code is subtly aware of the way operator
79 * precedence is structured in regular expressions. Serious changes in
80 * regular-expression syntax might require a total rethink.
81 */
82#include "EXTERN.h"
864dbfa3 83#define PERL_IN_REGCOMP_C
a687059c 84#include "perl.h"
d06ea78c 85
acfe0abc 86#ifndef PERL_IN_XSUB_RE
d06ea78c
GS
87# include "INTERN.h"
88#endif
c277df42
IZ
89
90#define REG_COMP_C
a687059c
LW
91#include "regcomp.h"
92
d4cce5f1 93#ifdef op
11343788 94#undef op
d4cce5f1 95#endif /* op */
11343788 96
fe14fcc3
LW
97#ifdef MSDOS
98# if defined(BUGGY_MSC6)
99 /* MSC 6.00A breaks on op/regexp.t test 85 unless we turn this off */
100 # pragma optimize("a",off)
101 /* But MSC 6.00A is happy with 'w', for aliases only across function calls*/
102 # pragma optimize("w",on )
103# endif /* BUGGY_MSC6 */
104#endif /* MSDOS */
105
a687059c
LW
106#ifndef STATIC
107#define STATIC static
108#endif
109
830247a4
IZ
110typedef struct RExC_state_t {
111 U16 flags16; /* are we folding, multilining? */
112 char *precomp; /* uncompiled string. */
113 regexp *rx;
fac92740 114 char *start; /* Start of input for compile */
830247a4
IZ
115 char *end; /* End of input for compile */
116 char *parse; /* Input-scan pointer. */
117 I32 whilem_seen; /* number of WHILEM in this expr */
fac92740 118 regnode *emit_start; /* Start of emitted-code area */
ffc61ed2 119 regnode *emit; /* Code-emit pointer; &regdummy = don't = compiling */
830247a4
IZ
120 I32 naughty; /* How bad is this pattern? */
121 I32 sawback; /* Did we see \1, ...? */
122 U32 seen;
123 I32 size; /* Code size. */
124 I32 npar; /* () count. */
125 I32 extralen;
126 I32 seen_zerolen;
127 I32 seen_evals;
1aa99e6b 128 I32 utf8;
830247a4
IZ
129#if ADD_TO_REGEXEC
130 char *starttry; /* -Dr: where regtry was called. */
131#define RExC_starttry (pRExC_state->starttry)
132#endif
133} RExC_state_t;
134
135#define RExC_flags16 (pRExC_state->flags16)
136#define RExC_precomp (pRExC_state->precomp)
137#define RExC_rx (pRExC_state->rx)
fac92740 138#define RExC_start (pRExC_state->start)
830247a4
IZ
139#define RExC_end (pRExC_state->end)
140#define RExC_parse (pRExC_state->parse)
141#define RExC_whilem_seen (pRExC_state->whilem_seen)
fac92740 142#define RExC_offsets (pRExC_state->rx->offsets) /* I am not like the others */
830247a4 143#define RExC_emit (pRExC_state->emit)
fac92740 144#define RExC_emit_start (pRExC_state->emit_start)
830247a4
IZ
145#define RExC_naughty (pRExC_state->naughty)
146#define RExC_sawback (pRExC_state->sawback)
147#define RExC_seen (pRExC_state->seen)
148#define RExC_size (pRExC_state->size)
149#define RExC_npar (pRExC_state->npar)
150#define RExC_extralen (pRExC_state->extralen)
151#define RExC_seen_zerolen (pRExC_state->seen_zerolen)
152#define RExC_seen_evals (pRExC_state->seen_evals)
1aa99e6b 153#define RExC_utf8 (pRExC_state->utf8)
830247a4 154
a687059c
LW
155#define ISMULT1(c) ((c) == '*' || (c) == '+' || (c) == '?')
156#define ISMULT2(s) ((*s) == '*' || (*s) == '+' || (*s) == '?' || \
157 ((*s) == '{' && regcurly(s)))
a687059c 158
35c8bce7
LW
159#ifdef SPSTART
160#undef SPSTART /* dratted cpp namespace... */
161#endif
a687059c
LW
162/*
163 * Flags to be passed up and down.
164 */
a687059c 165#define WORST 0 /* Worst case. */
821b33a5 166#define HASWIDTH 0x1 /* Known to match non-null strings. */
a0d0e21e
LW
167#define SIMPLE 0x2 /* Simple enough to be STAR/PLUS operand. */
168#define SPSTART 0x4 /* Starts with * or +. */
169#define TRYAGAIN 0x8 /* Weeded out a declaration. */
a687059c 170
2c2d71f5
JH
171/* Length of a variant. */
172
173typedef struct scan_data_t {
174 I32 len_min;
175 I32 len_delta;
176 I32 pos_min;
177 I32 pos_delta;
178 SV *last_found;
179 I32 last_end; /* min value, <0 unless valid. */
180 I32 last_start_min;
181 I32 last_start_max;
182 SV **longest; /* Either &l_fixed, or &l_float. */
183 SV *longest_fixed;
184 I32 offset_fixed;
185 SV *longest_float;
186 I32 offset_float_min;
187 I32 offset_float_max;
188 I32 flags;
189 I32 whilem_c;
cb434fcc 190 I32 *last_closep;
653099ff 191 struct regnode_charclass_class *start_class;
2c2d71f5
JH
192} scan_data_t;
193
a687059c 194/*
e50aee73 195 * Forward declarations for pregcomp()'s friends.
a687059c 196 */
a0d0e21e 197
b81d288d 198static scan_data_t zero_scan_data = { 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
cb434fcc 199 0, 0, 0, 0, 0, 0};
c277df42
IZ
200
201#define SF_BEFORE_EOL (SF_BEFORE_SEOL|SF_BEFORE_MEOL)
202#define SF_BEFORE_SEOL 0x1
203#define SF_BEFORE_MEOL 0x2
204#define SF_FIX_BEFORE_EOL (SF_FIX_BEFORE_SEOL|SF_FIX_BEFORE_MEOL)
205#define SF_FL_BEFORE_EOL (SF_FL_BEFORE_SEOL|SF_FL_BEFORE_MEOL)
206
09b7f37c
CB
207#ifdef NO_UNARY_PLUS
208# define SF_FIX_SHIFT_EOL (0+2)
209# define SF_FL_SHIFT_EOL (0+4)
210#else
211# define SF_FIX_SHIFT_EOL (+2)
212# define SF_FL_SHIFT_EOL (+4)
213#endif
c277df42
IZ
214
215#define SF_FIX_BEFORE_SEOL (SF_BEFORE_SEOL << SF_FIX_SHIFT_EOL)
216#define SF_FIX_BEFORE_MEOL (SF_BEFORE_MEOL << SF_FIX_SHIFT_EOL)
217
218#define SF_FL_BEFORE_SEOL (SF_BEFORE_SEOL << SF_FL_SHIFT_EOL)
219#define SF_FL_BEFORE_MEOL (SF_BEFORE_MEOL << SF_FL_SHIFT_EOL) /* 0x20 */
220#define SF_IS_INF 0x40
221#define SF_HAS_PAR 0x80
222#define SF_IN_PAR 0x100
223#define SF_HAS_EVAL 0x200
4bfe0158 224#define SCF_DO_SUBSTR 0x400
653099ff
GS
225#define SCF_DO_STCLASS_AND 0x0800
226#define SCF_DO_STCLASS_OR 0x1000
227#define SCF_DO_STCLASS (SCF_DO_STCLASS_AND|SCF_DO_STCLASS_OR)
e1901655 228#define SCF_WHILEM_VISITED_POS 0x2000
c277df42 229
1aa99e6b 230#define UTF RExC_utf8
830247a4
IZ
231#define LOC (RExC_flags16 & PMf_LOCALE)
232#define FOLD (RExC_flags16 & PMf_FOLD)
a0ed51b3 233
ffc61ed2 234#define OOB_UNICODE 12345678
93733859 235#define OOB_NAMEDCLASS -1
b8c5462f 236
a0ed51b3
LW
237#define CHR_SVLEN(sv) (UTF ? sv_len_utf8(sv) : SvCUR(sv))
238#define CHR_DIST(a,b) (UTF ? utf8_distance(a,b) : a - b)
239
8615cb43 240
b45f050a
JF
241/* length of regex to show in messages that don't mark a position within */
242#define RegexLengthToShowInErrorMessages 127
243
244/*
245 * If MARKER[12] are adjusted, be sure to adjust the constants at the top
246 * of t/op/regmesg.t, the tests in t/op/re_tests, and those in
247 * op/pragma/warn/regcomp.
248 */
7253e4e3
RK
249#define MARKER1 "<-- HERE" /* marker as it appears in the description */
250#define MARKER2 " <-- HERE " /* marker as it appears within the regex */
b81d288d 251
7253e4e3 252#define REPORT_LOCATION " in regex; marked by " MARKER1 " in m/%.*s" MARKER2 "%s/"
b45f050a
JF
253
254/*
255 * Calls SAVEDESTRUCTOR_X if needed, then calls Perl_croak with the given
256 * arg. Show regex, up to a maximum length. If it's too long, chop and add
257 * "...".
258 */
a4eb266f 259#define FAIL(msg) \
8615cb43 260 STMT_START { \
a4eb266f 261 char *ellipses = ""; \
7a799e20 262 IV len = RExC_end - RExC_precomp; \
b45f050a 263 \
8615cb43 264 if (!SIZE_ONLY) \
830247a4 265 SAVEDESTRUCTOR_X(clear_re,(void*)RExC_rx); \
b45f050a
JF
266 \
267 if (len > RegexLengthToShowInErrorMessages) { \
268 /* chop 10 shorter than the max, to ensure meaning of "..." */ \
269 len = RegexLengthToShowInErrorMessages - 10; \
a4eb266f 270 ellipses = "..."; \
b45f050a
JF
271 } \
272 Perl_croak(aTHX_ "%s in regex m/%.*s%s/", \
830247a4 273 msg, (int)len, RExC_precomp, ellipses); \
8615cb43
JF
274 } STMT_END
275
b45f050a
JF
276/*
277 * Calls SAVEDESTRUCTOR_X if needed, then calls Perl_croak with the given
278 * args. Show regex, up to a maximum length. If it's too long, chop and add
279 * "...".
280 */
a4eb266f 281#define FAIL2(pat,msg) \
8615cb43 282 STMT_START { \
a4eb266f 283 char *ellipses = ""; \
7a799e20 284 IV len = RExC_end - RExC_precomp; \
b45f050a 285 \
8615cb43 286 if (!SIZE_ONLY) \
830247a4 287 SAVEDESTRUCTOR_X(clear_re,(void*)RExC_rx); \
8615cb43 288 \
b45f050a
JF
289 if (len > RegexLengthToShowInErrorMessages) { \
290 /* chop 10 shorter than the max, to ensure meaning of "..." */ \
291 len = RegexLengthToShowInErrorMessages - 10; \
a4eb266f 292 ellipses = "..."; \
b45f050a
JF
293 } \
294 S_re_croak2(aTHX_ pat, " in regex m/%.*s%s/", \
830247a4 295 msg, (int)len, RExC_precomp, ellipses); \
b45f050a
JF
296 } STMT_END
297
298
299/*
300 * Simple_vFAIL -- like FAIL, but marks the current location in the scan
301 */
302#define Simple_vFAIL(m) \
303 STMT_START { \
7a799e20 304 IV offset = RExC_parse - RExC_precomp; \
b45f050a
JF
305 \
306 Perl_croak(aTHX_ "%s" REPORT_LOCATION, \
830247a4 307 m, (int)offset, RExC_precomp, RExC_precomp + offset); \
b45f050a
JF
308 } STMT_END
309
310/*
311 * Calls SAVEDESTRUCTOR_X if needed, then Simple_vFAIL()
312 */
313#define vFAIL(m) \
314 STMT_START { \
315 if (!SIZE_ONLY) \
830247a4 316 SAVEDESTRUCTOR_X(clear_re,(void*)RExC_rx); \
b45f050a
JF
317 Simple_vFAIL(m); \
318 } STMT_END
319
320/*
321 * Like Simple_vFAIL(), but accepts two arguments.
322 */
323#define Simple_vFAIL2(m,a1) \
324 STMT_START { \
7a799e20 325 IV offset = RExC_parse - RExC_precomp; \
b45f050a
JF
326 \
327 S_re_croak2(aTHX_ m, REPORT_LOCATION, a1, \
830247a4 328 (int)offset, RExC_precomp, RExC_precomp + offset); \
b45f050a
JF
329 } STMT_END
330
331/*
332 * Calls SAVEDESTRUCTOR_X if needed, then Simple_vFAIL2().
333 */
334#define vFAIL2(m,a1) \
335 STMT_START { \
336 if (!SIZE_ONLY) \
830247a4 337 SAVEDESTRUCTOR_X(clear_re,(void*)RExC_rx); \
b45f050a
JF
338 Simple_vFAIL2(m, a1); \
339 } STMT_END
340
341
342/*
343 * Like Simple_vFAIL(), but accepts three arguments.
344 */
345#define Simple_vFAIL3(m, a1, a2) \
346 STMT_START { \
7a799e20 347 IV offset = RExC_parse - RExC_precomp; \
b45f050a
JF
348 \
349 S_re_croak2(aTHX_ m, REPORT_LOCATION, a1, a2, \
830247a4 350 (int)offset, RExC_precomp, RExC_precomp + offset); \
b45f050a
JF
351 } STMT_END
352
353/*
354 * Calls SAVEDESTRUCTOR_X if needed, then Simple_vFAIL3().
355 */
356#define vFAIL3(m,a1,a2) \
357 STMT_START { \
358 if (!SIZE_ONLY) \
830247a4 359 SAVEDESTRUCTOR_X(clear_re,(void*)RExC_rx); \
b45f050a
JF
360 Simple_vFAIL3(m, a1, a2); \
361 } STMT_END
362
363/*
364 * Like Simple_vFAIL(), but accepts four arguments.
365 */
366#define Simple_vFAIL4(m, a1, a2, a3) \
367 STMT_START { \
7a799e20 368 IV offset = RExC_parse - RExC_precomp; \
b45f050a
JF
369 \
370 S_re_croak2(aTHX_ m, REPORT_LOCATION, a1, a2, a3,\
830247a4 371 (int)offset, RExC_precomp, RExC_precomp + offset); \
b45f050a
JF
372 } STMT_END
373
374/*
375 * Like Simple_vFAIL(), but accepts five arguments.
376 */
377#define Simple_vFAIL5(m, a1, a2, a3, a4) \
378 STMT_START { \
7a799e20 379 IV offset = RExC_parse - RExC_precomp; \
b45f050a 380 S_re_croak2(aTHX_ m, REPORT_LOCATION, a1, a2, a3, a4,\
830247a4 381 (int)offset, RExC_precomp, RExC_precomp + offset); \
b45f050a
JF
382 } STMT_END
383
384
385#define vWARN(loc,m) \
386 STMT_START { \
7a799e20 387 IV offset = loc - RExC_precomp; \
b45f050a 388 Perl_warner(aTHX_ WARN_REGEXP, "%s" REPORT_LOCATION,\
830247a4 389 m, (int)offset, RExC_precomp, RExC_precomp + offset); \
b45f050a
JF
390 } STMT_END \
391
f9373011
PM
392#define vWARNdep(loc,m) \
393 STMT_START { \
7a799e20 394 IV offset = loc - RExC_precomp; \
f9373011
PM
395 int warn_cat = ckWARN(WARN_REGEXP) ? WARN_REGEXP : WARN_DEPRECATED; \
396 Perl_warner(aTHX_ warn_cat, "%s" REPORT_LOCATION,\
397 m, (int)offset, RExC_precomp, RExC_precomp + offset); \
398 } STMT_END \
399
b45f050a
JF
400
401#define vWARN2(loc, m, a1) \
402 STMT_START { \
7a799e20 403 IV offset = loc - RExC_precomp; \
b45f050a
JF
404 Perl_warner(aTHX_ WARN_REGEXP, m REPORT_LOCATION,\
405 a1, \
830247a4 406 (int)offset, RExC_precomp, RExC_precomp + offset); \
b45f050a
JF
407 } STMT_END
408
409#define vWARN3(loc, m, a1, a2) \
410 STMT_START { \
7a799e20 411 IV offset = loc - RExC_precomp; \
b45f050a
JF
412 Perl_warner(aTHX_ WARN_REGEXP, m REPORT_LOCATION, \
413 a1, a2, \
830247a4 414 (int)offset, RExC_precomp, RExC_precomp + offset); \
b45f050a
JF
415 } STMT_END
416
417#define vWARN4(loc, m, a1, a2, a3) \
418 STMT_START { \
7a799e20 419 IV offset = loc - RExC_precomp; \
b45f050a
JF
420 Perl_warner(aTHX_ WARN_REGEXP, m REPORT_LOCATION,\
421 a1, a2, a3, \
830247a4 422 (int)offset, RExC_precomp, RExC_precomp + offset); \
8615cb43
JF
423 } STMT_END
424
9d1d55b5
JP
425/* used for the parse_flags section for (?c) -- japhy */
426#define vWARN5(loc, m, a1, a2, a3, a4) \
427 STMT_START { \
7a799e20 428 IV offset = loc - RExC_precomp; \
9d1d55b5
JP
429 Perl_warner(aTHX_ WARN_REGEXP, m REPORT_LOCATION, \
430 a1, a2, a3, a4, \
431 (int)offset, RExC_precomp, RExC_precomp + offset); \
432 } STMT_END
433
8615cb43 434
cd439c50 435/* Allow for side effects in s */
fd3f0ae2 436#define REGC(c,s) STMT_START { if (!SIZE_ONLY) *(s) = (c); else (void)(s);} STMT_END
cd439c50 437
fac92740
MJD
438/* Macros for recording node offsets. 20001227 mjd@plover.com
439 * Nodes are numbered 1, 2, 3, 4. Node #n's position is recorded in
440 * element 2*n-1 of the array. Element #2n holds the byte length node #n.
441 * Element 0 holds the number n.
442 */
443
444#define MJD_OFFSET_DEBUG(x)
445/* #define MJD_OFFSET_DEBUG(x) fprintf x */
446
447
448# define Set_Node_Offset_To_R(node,byte) \
449 STMT_START { \
450 if (! SIZE_ONLY) { \
451 if((node) < 0) { \
452 Perl_croak(aTHX_ "value of node is %d in Offset macro", node); \
453 } else { \
454 RExC_offsets[2*(node)-1] = (byte); \
455 } \
456 } \
457 } STMT_END
458
459# define Set_Node_Offset(node,byte) Set_Node_Offset_To_R((node)-RExC_emit_start, (byte)-RExC_start)
460# define Set_Cur_Node_Offset Set_Node_Offset(RExC_emit, RExC_parse)
461
462# define Set_Node_Length_To_R(node,len) \
463 STMT_START { \
464 if (! SIZE_ONLY) { \
465 MJD_OFFSET_DEBUG((stderr, "** (%d) size of node %d is %d.\n", __LINE__, (node), (len))); \
466 if((node) < 0) { \
467 Perl_croak(aTHX_ "value of node is %d in Length macro", node); \
468 } else { \
469 RExC_offsets[2*(node)] = (len); \
470 } \
471 } \
472 } STMT_END
473
474# define Set_Node_Length(node,len) Set_Node_Length_To_R((node)-RExC_emit_start, len)
475# define Set_Cur_Node_Length(len) Set_Node_Length(RExC_emit, len)
476# define Set_Node_Cur_Length(node) Set_Node_Length(node, RExC_parse - parse_start)
477
478/* Get offsets and lengths */
479#define Node_Offset(n) (RExC_offsets[2*((n)-RExC_emit_start)-1])
480#define Node_Length(n) (RExC_offsets[2*((n)-RExC_emit_start)])
481
acfe0abc 482static void clear_re(pTHX_ void *r);
4327152a 483
653099ff
GS
484/* Mark that we cannot extend a found fixed substring at this point.
485 Updata the longest found anchored substring and the longest found
486 floating substrings if needed. */
487
4327152a 488STATIC void
830247a4 489S_scan_commit(pTHX_ RExC_state_t *pRExC_state, scan_data_t *data)
c277df42 490{
a0ed51b3
LW
491 STRLEN l = CHR_SVLEN(data->last_found);
492 STRLEN old_l = CHR_SVLEN(*data->longest);
b81d288d 493
c277df42
IZ
494 if ((l >= old_l) && ((l > old_l) || (data->flags & SF_BEFORE_EOL))) {
495 sv_setsv(*data->longest, data->last_found);
496 if (*data->longest == data->longest_fixed) {
497 data->offset_fixed = l ? data->last_start_min : data->pos_min;
498 if (data->flags & SF_BEFORE_EOL)
b81d288d 499 data->flags
c277df42
IZ
500 |= ((data->flags & SF_BEFORE_EOL) << SF_FIX_SHIFT_EOL);
501 else
502 data->flags &= ~SF_FIX_BEFORE_EOL;
a0ed51b3
LW
503 }
504 else {
c277df42 505 data->offset_float_min = l ? data->last_start_min : data->pos_min;
b81d288d
AB
506 data->offset_float_max = (l
507 ? data->last_start_max
c277df42
IZ
508 : data->pos_min + data->pos_delta);
509 if (data->flags & SF_BEFORE_EOL)
b81d288d 510 data->flags
c277df42
IZ
511 |= ((data->flags & SF_BEFORE_EOL) << SF_FL_SHIFT_EOL);
512 else
513 data->flags &= ~SF_FL_BEFORE_EOL;
514 }
515 }
516 SvCUR_set(data->last_found, 0);
517 data->last_end = -1;
518 data->flags &= ~SF_BEFORE_EOL;
519}
520
653099ff
GS
521/* Can match anything (initialization) */
522STATIC void
830247a4 523S_cl_anything(pTHX_ RExC_state_t *pRExC_state, struct regnode_charclass_class *cl)
653099ff 524{
9af0acdb
JH
525 int value;
526
653099ff 527 ANYOF_CLASS_ZERO(cl);
9af0acdb
JH
528 for (value = 0; value < 256; ++value)
529 ANYOF_BITMAP_SET(cl, value);
1aa99e6b 530 cl->flags = ANYOF_EOS|ANYOF_UNICODE_ALL;
653099ff
GS
531 if (LOC)
532 cl->flags |= ANYOF_LOCALE;
533}
534
535/* Can match anything (initialization) */
536STATIC int
537S_cl_is_anything(pTHX_ struct regnode_charclass_class *cl)
538{
539 int value;
540
aaa51d5e 541 for (value = 0; value <= ANYOF_MAX; value += 2)
653099ff
GS
542 if (ANYOF_CLASS_TEST(cl, value) && ANYOF_CLASS_TEST(cl, value + 1))
543 return 1;
1aa99e6b
IH
544 if (!(cl->flags & ANYOF_UNICODE_ALL))
545 return 0;
9af0acdb
JH
546 for (value = 0; value < 256; ++value)
547 if (!ANYOF_BITMAP_TEST(cl, value))
653099ff
GS
548 return 0;
549 return 1;
550}
551
552/* Can match anything (initialization) */
553STATIC void
830247a4 554S_cl_init(pTHX_ RExC_state_t *pRExC_state, struct regnode_charclass_class *cl)
653099ff 555{
8ecf7187 556 Zero(cl, 1, struct regnode_charclass_class);
653099ff 557 cl->type = ANYOF;
830247a4 558 cl_anything(pRExC_state, cl);
653099ff
GS
559}
560
561STATIC void
830247a4 562S_cl_init_zero(pTHX_ RExC_state_t *pRExC_state, struct regnode_charclass_class *cl)
653099ff 563{
8ecf7187 564 Zero(cl, 1, struct regnode_charclass_class);
653099ff 565 cl->type = ANYOF;
830247a4 566 cl_anything(pRExC_state, cl);
653099ff
GS
567 if (LOC)
568 cl->flags |= ANYOF_LOCALE;
569}
570
571/* 'And' a given class with another one. Can create false positives */
572/* We assume that cl is not inverted */
573STATIC void
574S_cl_and(pTHX_ struct regnode_charclass_class *cl,
575 struct regnode_charclass_class *and_with)
576{
653099ff
GS
577 if (!(and_with->flags & ANYOF_CLASS)
578 && !(cl->flags & ANYOF_CLASS)
579 && (and_with->flags & ANYOF_LOCALE) == (cl->flags & ANYOF_LOCALE)
580 && !(and_with->flags & ANYOF_FOLD)
581 && !(cl->flags & ANYOF_FOLD)) {
582 int i;
583
584 if (and_with->flags & ANYOF_INVERT)
585 for (i = 0; i < ANYOF_BITMAP_SIZE; i++)
586 cl->bitmap[i] &= ~and_with->bitmap[i];
587 else
588 for (i = 0; i < ANYOF_BITMAP_SIZE; i++)
589 cl->bitmap[i] &= and_with->bitmap[i];
590 } /* XXXX: logic is complicated otherwise, leave it along for a moment. */
591 if (!(and_with->flags & ANYOF_EOS))
592 cl->flags &= ~ANYOF_EOS;
1aa99e6b
IH
593
594 if (cl->flags & ANYOF_UNICODE_ALL && and_with->flags & ANYOF_UNICODE) {
595 cl->flags &= ~ANYOF_UNICODE_ALL;
596 cl->flags |= ANYOF_UNICODE;
597 ARG_SET(cl, ARG(and_with));
598 }
599 if (!(and_with->flags & ANYOF_UNICODE_ALL))
600 cl->flags &= ~ANYOF_UNICODE_ALL;
601 if (!(and_with->flags & (ANYOF_UNICODE|ANYOF_UNICODE_ALL)))
602 cl->flags &= ~ANYOF_UNICODE;
653099ff
GS
603}
604
605/* 'OR' a given class with another one. Can create false positives */
606/* We assume that cl is not inverted */
607STATIC void
830247a4 608S_cl_or(pTHX_ RExC_state_t *pRExC_state, struct regnode_charclass_class *cl, struct regnode_charclass_class *or_with)
653099ff 609{
653099ff
GS
610 if (or_with->flags & ANYOF_INVERT) {
611 /* We do not use
612 * (B1 | CL1) | (!B2 & !CL2) = (B1 | !B2 & !CL2) | (CL1 | (!B2 & !CL2))
613 * <= (B1 | !B2) | (CL1 | !CL2)
614 * which is wasteful if CL2 is small, but we ignore CL2:
615 * (B1 | CL1) | (!B2 & !CL2) <= (B1 | CL1) | !B2 = (B1 | !B2) | CL1
616 * XXXX Can we handle case-fold? Unclear:
617 * (OK1(i) | OK1(i')) | !(OK1(i) | OK1(i')) =
618 * (OK1(i) | OK1(i')) | (!OK1(i) & !OK1(i'))
619 */
620 if ( (or_with->flags & ANYOF_LOCALE) == (cl->flags & ANYOF_LOCALE)
621 && !(or_with->flags & ANYOF_FOLD)
622 && !(cl->flags & ANYOF_FOLD) ) {
623 int i;
624
625 for (i = 0; i < ANYOF_BITMAP_SIZE; i++)
626 cl->bitmap[i] |= ~or_with->bitmap[i];
627 } /* XXXX: logic is complicated otherwise */
628 else {
830247a4 629 cl_anything(pRExC_state, cl);
653099ff
GS
630 }
631 } else {
632 /* (B1 | CL1) | (B2 | CL2) = (B1 | B2) | (CL1 | CL2)) */
633 if ( (or_with->flags & ANYOF_LOCALE) == (cl->flags & ANYOF_LOCALE)
b81d288d 634 && (!(or_with->flags & ANYOF_FOLD)
653099ff
GS
635 || (cl->flags & ANYOF_FOLD)) ) {
636 int i;
637
638 /* OR char bitmap and class bitmap separately */
639 for (i = 0; i < ANYOF_BITMAP_SIZE; i++)
640 cl->bitmap[i] |= or_with->bitmap[i];
641 if (or_with->flags & ANYOF_CLASS) {
642 for (i = 0; i < ANYOF_CLASSBITMAP_SIZE; i++)
643 cl->classflags[i] |= or_with->classflags[i];
644 cl->flags |= ANYOF_CLASS;
645 }
646 }
647 else { /* XXXX: logic is complicated, leave it along for a moment. */
830247a4 648 cl_anything(pRExC_state, cl);
653099ff
GS
649 }
650 }
651 if (or_with->flags & ANYOF_EOS)
652 cl->flags |= ANYOF_EOS;
1aa99e6b
IH
653
654 if (cl->flags & ANYOF_UNICODE && or_with->flags & ANYOF_UNICODE &&
655 ARG(cl) != ARG(or_with)) {
656 cl->flags |= ANYOF_UNICODE_ALL;
657 cl->flags &= ~ANYOF_UNICODE;
658 }
659 if (or_with->flags & ANYOF_UNICODE_ALL) {
660 cl->flags |= ANYOF_UNICODE_ALL;
661 cl->flags &= ~ANYOF_UNICODE;
662 }
653099ff
GS
663}
664
665/* REx optimizer. Converts nodes into quickier variants "in place".
666 Finds fixed substrings. */
667
c277df42
IZ
668/* Stops at toplevel WHILEM as well as at `last'. At end *scanp is set
669 to the position after last scanned or to NULL. */
670
76e3520e 671STATIC I32
830247a4 672S_study_chunk(pTHX_ RExC_state_t *pRExC_state, regnode **scanp, I32 *deltap, regnode *last, scan_data_t *data, U32 flags)
c277df42
IZ
673 /* scanp: Start here (read-write). */
674 /* deltap: Write maxlen-minlen here. */
675 /* last: Stop before this one. */
676{
677 I32 min = 0, pars = 0, code;
678 regnode *scan = *scanp, *next;
679 I32 delta = 0;
680 int is_inf = (flags & SCF_DO_SUBSTR) && (data->flags & SF_IS_INF);
aca2d497 681 int is_inf_internal = 0; /* The studied chunk is infinite */
c277df42
IZ
682 I32 is_par = OP(scan) == OPEN ? ARG(scan) : 0;
683 scan_data_t data_fake;
653099ff 684 struct regnode_charclass_class and_with; /* Valid if flags & SCF_DO_STCLASS_OR */
b81d288d 685
c277df42
IZ
686 while (scan && OP(scan) != END && scan < last) {
687 /* Peephole optimizer: */
688
22c35a8c 689 if (PL_regkind[(U8)OP(scan)] == EXACT) {
653099ff 690 /* Merge several consecutive EXACTish nodes into one. */
c277df42
IZ
691 regnode *n = regnext(scan);
692 U32 stringok = 1;
693#ifdef DEBUGGING
694 regnode *stop = scan;
b81d288d 695#endif
c277df42 696
cd439c50 697 next = scan + NODE_SZ_STR(scan);
c277df42
IZ
698 /* Skip NOTHING, merge EXACT*. */
699 while (n &&
b81d288d 700 ( PL_regkind[(U8)OP(n)] == NOTHING ||
c277df42
IZ
701 (stringok && (OP(n) == OP(scan))))
702 && NEXT_OFF(n)
703 && NEXT_OFF(scan) + NEXT_OFF(n) < I16_MAX) {
704 if (OP(n) == TAIL || n > next)
705 stringok = 0;
22c35a8c 706 if (PL_regkind[(U8)OP(n)] == NOTHING) {
c277df42
IZ
707 NEXT_OFF(scan) += NEXT_OFF(n);
708 next = n + NODE_STEP_REGNODE;
709#ifdef DEBUGGING
710 if (stringok)
711 stop = n;
b81d288d 712#endif
c277df42 713 n = regnext(n);
a0ed51b3 714 }
f49d4d0f 715 else if (stringok) {
cd439c50 716 int oldl = STR_LEN(scan);
c277df42 717 regnode *nnext = regnext(n);
f49d4d0f 718
b81d288d 719 if (oldl + STR_LEN(n) > U8_MAX)
c277df42
IZ
720 break;
721 NEXT_OFF(scan) += NEXT_OFF(n);
cd439c50
IZ
722 STR_LEN(scan) += STR_LEN(n);
723 next = n + NODE_SZ_STR(n);
c277df42 724 /* Now we can overwrite *n : */
f49d4d0f 725 Move(STRING(n), STRING(scan) + oldl, STR_LEN(n), char);
c277df42 726#ifdef DEBUGGING
f49d4d0f 727 stop = next - 1;
b81d288d 728#endif
c277df42
IZ
729 n = nnext;
730 }
731 }
732#ifdef DEBUGGING
733 /* Allow dumping */
cd439c50 734 n = scan + NODE_SZ_STR(scan);
c277df42 735 while (n <= stop) {
22c35a8c 736 if (PL_regkind[(U8)OP(n)] != NOTHING || OP(n) == NOTHING) {
c277df42
IZ
737 OP(n) = OPTIMIZED;
738 NEXT_OFF(n) = 0;
739 }
740 n++;
741 }
653099ff 742#endif
c277df42 743 }
653099ff
GS
744 /* Follow the next-chain of the current node and optimize
745 away all the NOTHINGs from it. */
c277df42 746 if (OP(scan) != CURLYX) {
048cfca1
GS
747 int max = (reg_off_by_arg[OP(scan)]
748 ? I32_MAX
749 /* I32 may be smaller than U16 on CRAYs! */
750 : (I32_MAX < U16_MAX ? I32_MAX : U16_MAX));
c277df42
IZ
751 int off = (reg_off_by_arg[OP(scan)] ? ARG(scan) : NEXT_OFF(scan));
752 int noff;
753 regnode *n = scan;
b81d288d 754
c277df42
IZ
755 /* Skip NOTHING and LONGJMP. */
756 while ((n = regnext(n))
22c35a8c 757 && ((PL_regkind[(U8)OP(n)] == NOTHING && (noff = NEXT_OFF(n)))
c277df42
IZ
758 || ((OP(n) == LONGJMP) && (noff = ARG(n))))
759 && off + noff < max)
760 off += noff;
761 if (reg_off_by_arg[OP(scan)])
762 ARG(scan) = off;
b81d288d 763 else
c277df42
IZ
764 NEXT_OFF(scan) = off;
765 }
653099ff
GS
766 /* The principal pseudo-switch. Cannot be a switch, since we
767 look into several different things. */
b81d288d 768 if (OP(scan) == BRANCH || OP(scan) == BRANCHJ
c277df42
IZ
769 || OP(scan) == IFTHEN || OP(scan) == SUSPEND) {
770 next = regnext(scan);
771 code = OP(scan);
b81d288d
AB
772
773 if (OP(next) == code || code == IFTHEN || code == SUSPEND) {
c277df42 774 I32 max1 = 0, min1 = I32_MAX, num = 0;
653099ff 775 struct regnode_charclass_class accum;
c277df42 776
653099ff 777 if (flags & SCF_DO_SUBSTR) /* XXXX Add !SUSPEND? */
830247a4 778 scan_commit(pRExC_state, data); /* Cannot merge strings after this. */
653099ff 779 if (flags & SCF_DO_STCLASS)
830247a4 780 cl_init_zero(pRExC_state, &accum);
c277df42 781 while (OP(scan) == code) {
830247a4 782 I32 deltanext, minnext, f = 0, fake;
653099ff 783 struct regnode_charclass_class this_class;
c277df42
IZ
784
785 num++;
786 data_fake.flags = 0;
b81d288d 787 if (data) {
2c2d71f5 788 data_fake.whilem_c = data->whilem_c;
cb434fcc
IZ
789 data_fake.last_closep = data->last_closep;
790 }
791 else
792 data_fake.last_closep = &fake;
c277df42
IZ
793 next = regnext(scan);
794 scan = NEXTOPER(scan);
795 if (code != BRANCH)
796 scan = NEXTOPER(scan);
653099ff 797 if (flags & SCF_DO_STCLASS) {
830247a4 798 cl_init(pRExC_state, &this_class);
653099ff
GS
799 data_fake.start_class = &this_class;
800 f = SCF_DO_STCLASS_AND;
b81d288d 801 }
e1901655
IZ
802 if (flags & SCF_WHILEM_VISITED_POS)
803 f |= SCF_WHILEM_VISITED_POS;
653099ff 804 /* we suppose the run is continuous, last=next...*/
830247a4
IZ
805 minnext = study_chunk(pRExC_state, &scan, &deltanext,
806 next, &data_fake, f);
b81d288d 807 if (min1 > minnext)
c277df42
IZ
808 min1 = minnext;
809 if (max1 < minnext + deltanext)
810 max1 = minnext + deltanext;
811 if (deltanext == I32_MAX)
aca2d497 812 is_inf = is_inf_internal = 1;
c277df42
IZ
813 scan = next;
814 if (data_fake.flags & (SF_HAS_PAR|SF_IN_PAR))
815 pars++;
405ff068 816 if (data && (data_fake.flags & SF_HAS_EVAL))
c277df42 817 data->flags |= SF_HAS_EVAL;
2c2d71f5
JH
818 if (data)
819 data->whilem_c = data_fake.whilem_c;
653099ff 820 if (flags & SCF_DO_STCLASS)
830247a4 821 cl_or(pRExC_state, &accum, &this_class);
b81d288d 822 if (code == SUSPEND)
c277df42
IZ
823 break;
824 }
825 if (code == IFTHEN && num < 2) /* Empty ELSE branch */
826 min1 = 0;
827 if (flags & SCF_DO_SUBSTR) {
828 data->pos_min += min1;
829 data->pos_delta += max1 - min1;
830 if (max1 != min1 || is_inf)
831 data->longest = &(data->longest_float);
832 }
833 min += min1;
834 delta += max1 - min1;
653099ff 835 if (flags & SCF_DO_STCLASS_OR) {
830247a4 836 cl_or(pRExC_state, data->start_class, &accum);
653099ff
GS
837 if (min1) {
838 cl_and(data->start_class, &and_with);
839 flags &= ~SCF_DO_STCLASS;
840 }
841 }
842 else if (flags & SCF_DO_STCLASS_AND) {
de0c8cb8
GS
843 if (min1) {
844 cl_and(data->start_class, &accum);
653099ff 845 flags &= ~SCF_DO_STCLASS;
de0c8cb8
GS
846 }
847 else {
b81d288d 848 /* Switch to OR mode: cache the old value of
de0c8cb8
GS
849 * data->start_class */
850 StructCopy(data->start_class, &and_with,
851 struct regnode_charclass_class);
852 flags &= ~SCF_DO_STCLASS_AND;
853 StructCopy(&accum, data->start_class,
854 struct regnode_charclass_class);
855 flags |= SCF_DO_STCLASS_OR;
856 data->start_class->flags |= ANYOF_EOS;
857 }
653099ff 858 }
a0ed51b3
LW
859 }
860 else if (code == BRANCHJ) /* single branch is optimized. */
c277df42
IZ
861 scan = NEXTOPER(NEXTOPER(scan));
862 else /* single branch is optimized. */
863 scan = NEXTOPER(scan);
864 continue;
a0ed51b3
LW
865 }
866 else if (OP(scan) == EXACT) {
cd439c50 867 I32 l = STR_LEN(scan);
1aa99e6b 868 UV uc = *((U8*)STRING(scan));
a0ed51b3 869 if (UTF) {
1aa99e6b
IH
870 U8 *s = (U8*)STRING(scan);
871 l = utf8_length(s, s + l);
9041c2e3 872 uc = utf8_to_uvchr(s, NULL);
a0ed51b3
LW
873 }
874 min += l;
c277df42 875 if (flags & SCF_DO_SUBSTR) { /* Update longest substr. */
c277df42
IZ
876 /* The code below prefers earlier match for fixed
877 offset, later match for variable offset. */
878 if (data->last_end == -1) { /* Update the start info. */
879 data->last_start_min = data->pos_min;
880 data->last_start_max = is_inf
b81d288d 881 ? I32_MAX : data->pos_min + data->pos_delta;
c277df42 882 }
cd439c50 883 sv_catpvn(data->last_found, STRING(scan), STR_LEN(scan));
c277df42
IZ
884 data->last_end = data->pos_min + l;
885 data->pos_min += l; /* As in the first entry. */
886 data->flags &= ~SF_BEFORE_EOL;
887 }
653099ff
GS
888 if (flags & SCF_DO_STCLASS_AND) {
889 /* Check whether it is compatible with what we know already! */
890 int compat = 1;
891
1aa99e6b 892 if (uc >= 0x100 ||
516a5887 893 (!(data->start_class->flags & (ANYOF_CLASS | ANYOF_LOCALE))
1aa99e6b 894 && !ANYOF_BITMAP_TEST(data->start_class, uc)
653099ff 895 && (!(data->start_class->flags & ANYOF_FOLD)
1aa99e6b 896 || !ANYOF_BITMAP_TEST(data->start_class, PL_fold[uc])))
516a5887 897 )
653099ff
GS
898 compat = 0;
899 ANYOF_CLASS_ZERO(data->start_class);
900 ANYOF_BITMAP_ZERO(data->start_class);
901 if (compat)
1aa99e6b 902 ANYOF_BITMAP_SET(data->start_class, uc);
653099ff 903 data->start_class->flags &= ~ANYOF_EOS;
9b877dbb
IH
904 if (uc < 0x100)
905 data->start_class->flags &= ~ANYOF_UNICODE_ALL;
653099ff
GS
906 }
907 else if (flags & SCF_DO_STCLASS_OR) {
908 /* false positive possible if the class is case-folded */
1aa99e6b 909 if (uc < 0x100)
9b877dbb
IH
910 ANYOF_BITMAP_SET(data->start_class, uc);
911 else
912 data->start_class->flags |= ANYOF_UNICODE_ALL;
653099ff
GS
913 data->start_class->flags &= ~ANYOF_EOS;
914 cl_and(data->start_class, &and_with);
915 }
916 flags &= ~SCF_DO_STCLASS;
a0ed51b3 917 }
653099ff 918 else if (PL_regkind[(U8)OP(scan)] == EXACT) { /* But OP != EXACT! */
cd439c50 919 I32 l = STR_LEN(scan);
1aa99e6b 920 UV uc = *((U8*)STRING(scan));
653099ff
GS
921
922 /* Search for fixed substrings supports EXACT only. */
b81d288d 923 if (flags & SCF_DO_SUBSTR)
830247a4 924 scan_commit(pRExC_state, data);
a0ed51b3 925 if (UTF) {
1aa99e6b
IH
926 U8 *s = (U8 *)STRING(scan);
927 l = utf8_length(s, s + l);
9041c2e3 928 uc = utf8_to_uvchr(s, NULL);
a0ed51b3
LW
929 }
930 min += l;
c277df42 931 if (data && (flags & SCF_DO_SUBSTR))
a0ed51b3 932 data->pos_min += l;
653099ff
GS
933 if (flags & SCF_DO_STCLASS_AND) {
934 /* Check whether it is compatible with what we know already! */
935 int compat = 1;
936
1aa99e6b 937 if (uc >= 0x100 ||
516a5887 938 (!(data->start_class->flags & (ANYOF_CLASS | ANYOF_LOCALE))
1aa99e6b 939 && !ANYOF_BITMAP_TEST(data->start_class, uc)
516a5887 940 && !ANYOF_BITMAP_TEST(data->start_class, PL_fold[uc])))
653099ff
GS
941 compat = 0;
942 ANYOF_CLASS_ZERO(data->start_class);
943 ANYOF_BITMAP_ZERO(data->start_class);
944 if (compat) {
1aa99e6b 945 ANYOF_BITMAP_SET(data->start_class, uc);
653099ff
GS
946 data->start_class->flags &= ~ANYOF_EOS;
947 data->start_class->flags |= ANYOF_FOLD;
948 if (OP(scan) == EXACTFL)
949 data->start_class->flags |= ANYOF_LOCALE;
950 }
951 }
952 else if (flags & SCF_DO_STCLASS_OR) {
953 if (data->start_class->flags & ANYOF_FOLD) {
954 /* false positive possible if the class is case-folded.
955 Assume that the locale settings are the same... */
1aa99e6b
IH
956 if (uc < 0x100)
957 ANYOF_BITMAP_SET(data->start_class, uc);
653099ff
GS
958 data->start_class->flags &= ~ANYOF_EOS;
959 }
960 cl_and(data->start_class, &and_with);
961 }
962 flags &= ~SCF_DO_STCLASS;
a0ed51b3 963 }
4d61ec05 964 else if (strchr((char*)PL_varies,OP(scan))) {
9c5ffd7c 965 I32 mincount, maxcount, minnext, deltanext, fl = 0;
aa7a4b56 966 I32 f = flags, pos_before = 0;
c277df42 967 regnode *oscan = scan;
653099ff
GS
968 struct regnode_charclass_class this_class;
969 struct regnode_charclass_class *oclass = NULL;
727f22e3 970 I32 next_is_eval = 0;
653099ff 971
22c35a8c 972 switch (PL_regkind[(U8)OP(scan)]) {
653099ff 973 case WHILEM: /* End of (?:...)* . */
c277df42
IZ
974 scan = NEXTOPER(scan);
975 goto finish;
976 case PLUS:
653099ff 977 if (flags & (SCF_DO_SUBSTR | SCF_DO_STCLASS)) {
c277df42 978 next = NEXTOPER(scan);
653099ff 979 if (OP(next) == EXACT || (flags & SCF_DO_STCLASS)) {
b81d288d
AB
980 mincount = 1;
981 maxcount = REG_INFTY;
c277df42
IZ
982 next = regnext(scan);
983 scan = NEXTOPER(scan);
984 goto do_curly;
985 }
986 }
987 if (flags & SCF_DO_SUBSTR)
988 data->pos_min++;
989 min++;
990 /* Fall through. */
991 case STAR:
653099ff
GS
992 if (flags & SCF_DO_STCLASS) {
993 mincount = 0;
b81d288d 994 maxcount = REG_INFTY;
653099ff
GS
995 next = regnext(scan);
996 scan = NEXTOPER(scan);
997 goto do_curly;
998 }
b81d288d 999 is_inf = is_inf_internal = 1;
c277df42
IZ
1000 scan = regnext(scan);
1001 if (flags & SCF_DO_SUBSTR) {
830247a4 1002 scan_commit(pRExC_state, data); /* Cannot extend fixed substrings */
c277df42
IZ
1003 data->longest = &(data->longest_float);
1004 }
1005 goto optimize_curly_tail;
1006 case CURLY:
b81d288d 1007 mincount = ARG1(scan);
c277df42
IZ
1008 maxcount = ARG2(scan);
1009 next = regnext(scan);
cb434fcc
IZ
1010 if (OP(scan) == CURLYX) {
1011 I32 lp = (data ? *(data->last_closep) : 0);
1012
1013 scan->flags = ((lp <= U8_MAX) ? lp : U8_MAX);
1014 }
c277df42 1015 scan = NEXTOPER(scan) + EXTRA_STEP_2ARGS;
727f22e3 1016 next_is_eval = (OP(scan) == EVAL);
c277df42
IZ
1017 do_curly:
1018 if (flags & SCF_DO_SUBSTR) {
830247a4 1019 if (mincount == 0) scan_commit(pRExC_state,data); /* Cannot extend fixed substrings */
c277df42
IZ
1020 pos_before = data->pos_min;
1021 }
1022 if (data) {
1023 fl = data->flags;
1024 data->flags &= ~(SF_HAS_PAR|SF_IN_PAR|SF_HAS_EVAL);
1025 if (is_inf)
1026 data->flags |= SF_IS_INF;
1027 }
653099ff 1028 if (flags & SCF_DO_STCLASS) {
830247a4 1029 cl_init(pRExC_state, &this_class);
653099ff
GS
1030 oclass = data->start_class;
1031 data->start_class = &this_class;
1032 f |= SCF_DO_STCLASS_AND;
1033 f &= ~SCF_DO_STCLASS_OR;
1034 }
e1901655
IZ
1035 /* These are the cases when once a subexpression
1036 fails at a particular position, it cannot succeed
1037 even after backtracking at the enclosing scope.
b81d288d 1038
e1901655
IZ
1039 XXXX what if minimal match and we are at the
1040 initial run of {n,m}? */
1041 if ((mincount != maxcount - 1) && (maxcount != REG_INFTY))
1042 f &= ~SCF_WHILEM_VISITED_POS;
653099ff 1043
c277df42 1044 /* This will finish on WHILEM, setting scan, or on NULL: */
b81d288d
AB
1045 minnext = study_chunk(pRExC_state, &scan, &deltanext, last, data,
1046 mincount == 0
653099ff
GS
1047 ? (f & ~SCF_DO_SUBSTR) : f);
1048
1049 if (flags & SCF_DO_STCLASS)
1050 data->start_class = oclass;
1051 if (mincount == 0 || minnext == 0) {
1052 if (flags & SCF_DO_STCLASS_OR) {
830247a4 1053 cl_or(pRExC_state, data->start_class, &this_class);
653099ff
GS
1054 }
1055 else if (flags & SCF_DO_STCLASS_AND) {
b81d288d 1056 /* Switch to OR mode: cache the old value of
653099ff
GS
1057 * data->start_class */
1058 StructCopy(data->start_class, &and_with,
1059 struct regnode_charclass_class);
1060 flags &= ~SCF_DO_STCLASS_AND;
1061 StructCopy(&this_class, data->start_class,
1062 struct regnode_charclass_class);
1063 flags |= SCF_DO_STCLASS_OR;
1064 data->start_class->flags |= ANYOF_EOS;
1065 }
1066 } else { /* Non-zero len */
1067 if (flags & SCF_DO_STCLASS_OR) {
830247a4 1068 cl_or(pRExC_state, data->start_class, &this_class);
653099ff
GS
1069 cl_and(data->start_class, &and_with);
1070 }
1071 else if (flags & SCF_DO_STCLASS_AND)
1072 cl_and(data->start_class, &this_class);
1073 flags &= ~SCF_DO_STCLASS;
1074 }
c277df42
IZ
1075 if (!scan) /* It was not CURLYX, but CURLY. */
1076 scan = next;
84037bb0 1077 if (ckWARN(WARN_REGEXP)
727f22e3
JP
1078 /* ? quantifier ok, except for (?{ ... }) */
1079 && (next_is_eval || !(mincount == 0 && maxcount == 1))
84037bb0 1080 && (minnext == 0) && (deltanext == 0)
99799961 1081 && data && !(data->flags & (SF_HAS_PAR|SF_IN_PAR))
17feb5d5 1082 && maxcount <= REG_INFTY/3) /* Complement check for big count */
b45f050a 1083 {
830247a4 1084 vWARN(RExC_parse,
b45f050a
JF
1085 "Quantifier unexpected on zero-length expression");
1086 }
1087
c277df42 1088 min += minnext * mincount;
b81d288d 1089 is_inf_internal |= ((maxcount == REG_INFTY
155aba94
GS
1090 && (minnext + deltanext) > 0)
1091 || deltanext == I32_MAX);
aca2d497 1092 is_inf |= is_inf_internal;
c277df42
IZ
1093 delta += (minnext + deltanext) * maxcount - minnext * mincount;
1094
1095 /* Try powerful optimization CURLYX => CURLYN. */
b81d288d 1096 if ( OP(oscan) == CURLYX && data
c277df42
IZ
1097 && data->flags & SF_IN_PAR
1098 && !(data->flags & SF_HAS_EVAL)
1099 && !deltanext && minnext == 1 ) {
1100 /* Try to optimize to CURLYN. */
1101 regnode *nxt = NEXTOPER(oscan) + EXTRA_STEP_2ARGS;
497b47a8
JH
1102 regnode *nxt1 = nxt;
1103#ifdef DEBUGGING
1104 regnode *nxt2;
1105#endif
c277df42
IZ
1106
1107 /* Skip open. */
1108 nxt = regnext(nxt);
4d61ec05 1109 if (!strchr((char*)PL_simple,OP(nxt))
22c35a8c 1110 && !(PL_regkind[(U8)OP(nxt)] == EXACT
b81d288d 1111 && STR_LEN(nxt) == 1))
c277df42 1112 goto nogo;
497b47a8 1113#ifdef DEBUGGING
c277df42 1114 nxt2 = nxt;
497b47a8 1115#endif
c277df42 1116 nxt = regnext(nxt);
b81d288d 1117 if (OP(nxt) != CLOSE)
c277df42
IZ
1118 goto nogo;
1119 /* Now we know that nxt2 is the only contents: */
1120 oscan->flags = ARG(nxt);
1121 OP(oscan) = CURLYN;
1122 OP(nxt1) = NOTHING; /* was OPEN. */
1123#ifdef DEBUGGING
1124 OP(nxt1 + 1) = OPTIMIZED; /* was count. */
1125 NEXT_OFF(nxt1+ 1) = 0; /* just for consistancy. */
1126 NEXT_OFF(nxt2) = 0; /* just for consistancy with CURLY. */
1127 OP(nxt) = OPTIMIZED; /* was CLOSE. */
1128 OP(nxt + 1) = OPTIMIZED; /* was count. */
1129 NEXT_OFF(nxt+ 1) = 0; /* just for consistancy. */
b81d288d 1130#endif
c277df42 1131 }
c277df42
IZ
1132 nogo:
1133
1134 /* Try optimization CURLYX => CURLYM. */
b81d288d 1135 if ( OP(oscan) == CURLYX && data
c277df42 1136 && !(data->flags & SF_HAS_PAR)
c277df42
IZ
1137 && !(data->flags & SF_HAS_EVAL)
1138 && !deltanext ) {
1139 /* XXXX How to optimize if data == 0? */
1140 /* Optimize to a simpler form. */
1141 regnode *nxt = NEXTOPER(oscan) + EXTRA_STEP_2ARGS; /* OPEN */
1142 regnode *nxt2;
1143
1144 OP(oscan) = CURLYM;
1145 while ( (nxt2 = regnext(nxt)) /* skip over embedded stuff*/
b81d288d 1146 && (OP(nxt2) != WHILEM))
c277df42
IZ
1147 nxt = nxt2;
1148 OP(nxt2) = SUCCEED; /* Whas WHILEM */
c277df42
IZ
1149 /* Need to optimize away parenths. */
1150 if (data->flags & SF_IN_PAR) {
1151 /* Set the parenth number. */
1152 regnode *nxt1 = NEXTOPER(oscan) + EXTRA_STEP_2ARGS; /* OPEN*/
1153
b81d288d 1154 if (OP(nxt) != CLOSE)
b45f050a 1155 FAIL("Panic opt close");
c277df42
IZ
1156 oscan->flags = ARG(nxt);
1157 OP(nxt1) = OPTIMIZED; /* was OPEN. */
1158 OP(nxt) = OPTIMIZED; /* was CLOSE. */
1159#ifdef DEBUGGING
1160 OP(nxt1 + 1) = OPTIMIZED; /* was count. */
1161 OP(nxt + 1) = OPTIMIZED; /* was count. */
1162 NEXT_OFF(nxt1 + 1) = 0; /* just for consistancy. */
1163 NEXT_OFF(nxt + 1) = 0; /* just for consistancy. */
b81d288d 1164#endif
c277df42
IZ
1165#if 0
1166 while ( nxt1 && (OP(nxt1) != WHILEM)) {
1167 regnode *nnxt = regnext(nxt1);
b81d288d 1168
c277df42
IZ
1169 if (nnxt == nxt) {
1170 if (reg_off_by_arg[OP(nxt1)])
1171 ARG_SET(nxt1, nxt2 - nxt1);
1172 else if (nxt2 - nxt1 < U16_MAX)
1173 NEXT_OFF(nxt1) = nxt2 - nxt1;
1174 else
1175 OP(nxt) = NOTHING; /* Cannot beautify */
1176 }
1177 nxt1 = nnxt;
1178 }
1179#endif
1180 /* Optimize again: */
b81d288d 1181 study_chunk(pRExC_state, &nxt1, &deltanext, nxt,
e1901655 1182 NULL, 0);
a0ed51b3
LW
1183 }
1184 else
c277df42 1185 oscan->flags = 0;
c277df42 1186 }
e1901655
IZ
1187 else if ((OP(oscan) == CURLYX)
1188 && (flags & SCF_WHILEM_VISITED_POS)
1189 /* See the comment on a similar expression above.
1190 However, this time it not a subexpression
1191 we care about, but the expression itself. */
1192 && (maxcount == REG_INFTY)
1193 && data && ++data->whilem_c < 16) {
1194 /* This stays as CURLYX, we can put the count/of pair. */
2c2d71f5
JH
1195 /* Find WHILEM (as in regexec.c) */
1196 regnode *nxt = oscan + NEXT_OFF(oscan);
1197
1198 if (OP(PREVOPER(nxt)) == NOTHING) /* LONGJMP */
1199 nxt += ARG(nxt);
1200 PREVOPER(nxt)->flags = data->whilem_c
830247a4 1201 | (RExC_whilem_seen << 4); /* On WHILEM */
2c2d71f5 1202 }
b81d288d 1203 if (data && fl & (SF_HAS_PAR|SF_IN_PAR))
c277df42
IZ
1204 pars++;
1205 if (flags & SCF_DO_SUBSTR) {
1206 SV *last_str = Nullsv;
1207 int counted = mincount != 0;
1208
1209 if (data->last_end > 0 && mincount != 0) { /* Ends with a string. */
b81d288d 1210 I32 b = pos_before >= data->last_start_min
c277df42
IZ
1211 ? pos_before : data->last_start_min;
1212 STRLEN l;
1213 char *s = SvPV(data->last_found, l);
a0ed51b3
LW
1214 I32 old = b - data->last_start_min;
1215
1216 if (UTF)
1217 old = utf8_hop((U8*)s, old) - (U8*)s;
c277df42 1218
a0ed51b3 1219 l -= old;
c277df42 1220 /* Get the added string: */
79cb57f6 1221 last_str = newSVpvn(s + old, l);
c277df42
IZ
1222 if (deltanext == 0 && pos_before == b) {
1223 /* What was added is a constant string */
1224 if (mincount > 1) {
1225 SvGROW(last_str, (mincount * l) + 1);
b81d288d 1226 repeatcpy(SvPVX(last_str) + l,
c277df42
IZ
1227 SvPVX(last_str), l, mincount - 1);
1228 SvCUR(last_str) *= mincount;
1229 /* Add additional parts. */
b81d288d 1230 SvCUR_set(data->last_found,
c277df42
IZ
1231 SvCUR(data->last_found) - l);
1232 sv_catsv(data->last_found, last_str);
1233 data->last_end += l * (mincount - 1);
1234 }
2a8d9689
HS
1235 } else {
1236 /* start offset must point into the last copy */
1237 data->last_start_min += minnext * (mincount - 1);
4b2cff9a
HS
1238 data->last_start_max += is_inf ? 0 : (maxcount - 1)
1239 * (minnext + data->pos_delta);
c277df42
IZ
1240 }
1241 }
1242 /* It is counted once already... */
1243 data->pos_min += minnext * (mincount - counted);
1244 data->pos_delta += - counted * deltanext +
1245 (minnext + deltanext) * maxcount - minnext * mincount;
1246 if (mincount != maxcount) {
653099ff
GS
1247 /* Cannot extend fixed substrings found inside
1248 the group. */
830247a4 1249 scan_commit(pRExC_state,data);
c277df42
IZ
1250 if (mincount && last_str) {
1251 sv_setsv(data->last_found, last_str);
1252 data->last_end = data->pos_min;
b81d288d 1253 data->last_start_min =
a0ed51b3 1254 data->pos_min - CHR_SVLEN(last_str);
b81d288d
AB
1255 data->last_start_max = is_inf
1256 ? I32_MAX
c277df42 1257 : data->pos_min + data->pos_delta
a0ed51b3 1258 - CHR_SVLEN(last_str);
c277df42
IZ
1259 }
1260 data->longest = &(data->longest_float);
1261 }
aca2d497 1262 SvREFCNT_dec(last_str);
c277df42 1263 }
405ff068 1264 if (data && (fl & SF_HAS_EVAL))
c277df42
IZ
1265 data->flags |= SF_HAS_EVAL;
1266 optimize_curly_tail:
c277df42 1267 if (OP(oscan) != CURLYX) {
22c35a8c 1268 while (PL_regkind[(U8)OP(next = regnext(oscan))] == NOTHING
c277df42
IZ
1269 && NEXT_OFF(next))
1270 NEXT_OFF(oscan) += NEXT_OFF(next);
1271 }
c277df42 1272 continue;
653099ff 1273 default: /* REF and CLUMP only? */
c277df42 1274 if (flags & SCF_DO_SUBSTR) {
830247a4 1275 scan_commit(pRExC_state,data); /* Cannot expect anything... */
c277df42
IZ
1276 data->longest = &(data->longest_float);
1277 }
aca2d497 1278 is_inf = is_inf_internal = 1;
653099ff 1279 if (flags & SCF_DO_STCLASS_OR)
830247a4 1280 cl_anything(pRExC_state, data->start_class);
653099ff 1281 flags &= ~SCF_DO_STCLASS;
c277df42
IZ
1282 break;
1283 }
a0ed51b3 1284 }
ffc61ed2 1285 else if (strchr((char*)PL_simple,OP(scan))) {
9c5ffd7c 1286 int value = 0;
653099ff 1287
c277df42 1288 if (flags & SCF_DO_SUBSTR) {
830247a4 1289 scan_commit(pRExC_state,data);
c277df42
IZ
1290 data->pos_min++;
1291 }
1292 min++;
653099ff
GS
1293 if (flags & SCF_DO_STCLASS) {
1294 data->start_class->flags &= ~ANYOF_EOS; /* No match on empty */
1295
1296 /* Some of the logic below assumes that switching
1297 locale on will only add false positives. */
1298 switch (PL_regkind[(U8)OP(scan)]) {
653099ff 1299 case SANY:
653099ff
GS
1300 default:
1301 do_default:
1302 /* Perl_croak(aTHX_ "panic: unexpected simple REx opcode %d", OP(scan)); */
1303 if (flags & SCF_DO_STCLASS_OR) /* Allow everything */
830247a4 1304 cl_anything(pRExC_state, data->start_class);
653099ff
GS
1305 break;
1306 case REG_ANY:
1307 if (OP(scan) == SANY)
1308 goto do_default;
1309 if (flags & SCF_DO_STCLASS_OR) { /* Everything but \n */
1310 value = (ANYOF_BITMAP_TEST(data->start_class,'\n')
1311 || (data->start_class->flags & ANYOF_CLASS));
830247a4 1312 cl_anything(pRExC_state, data->start_class);
653099ff
GS
1313 }
1314 if (flags & SCF_DO_STCLASS_AND || !value)
1315 ANYOF_BITMAP_CLEAR(data->start_class,'\n');
1316 break;
1317 case ANYOF:
1318 if (flags & SCF_DO_STCLASS_AND)
1319 cl_and(data->start_class,
1320 (struct regnode_charclass_class*)scan);
1321 else
830247a4 1322 cl_or(pRExC_state, data->start_class,
653099ff
GS
1323 (struct regnode_charclass_class*)scan);
1324 break;
1325 case ALNUM:
1326 if (flags & SCF_DO_STCLASS_AND) {
1327 if (!(data->start_class->flags & ANYOF_LOCALE)) {
1328 ANYOF_CLASS_CLEAR(data->start_class,ANYOF_NALNUM);
1329 for (value = 0; value < 256; value++)
1330 if (!isALNUM(value))
1331 ANYOF_BITMAP_CLEAR(data->start_class, value);
1332 }
1333 }
1334 else {
1335 if (data->start_class->flags & ANYOF_LOCALE)
1336 ANYOF_CLASS_SET(data->start_class,ANYOF_ALNUM);
1337 else {
1338 for (value = 0; value < 256; value++)
1339 if (isALNUM(value))
b81d288d 1340 ANYOF_BITMAP_SET(data->start_class, value);
653099ff
GS
1341 }
1342 }
1343 break;
1344 case ALNUML:
1345 if (flags & SCF_DO_STCLASS_AND) {
1346 if (data->start_class->flags & ANYOF_LOCALE)
1347 ANYOF_CLASS_CLEAR(data->start_class,ANYOF_NALNUM);
1348 }
1349 else {
1350 ANYOF_CLASS_SET(data->start_class,ANYOF_ALNUM);
1351 data->start_class->flags |= ANYOF_LOCALE;
1352 }
1353 break;
1354 case NALNUM:
1355 if (flags & SCF_DO_STCLASS_AND) {
1356 if (!(data->start_class->flags & ANYOF_LOCALE)) {
1357 ANYOF_CLASS_CLEAR(data->start_class,ANYOF_ALNUM);
1358 for (value = 0; value < 256; value++)
1359 if (isALNUM(value))
1360 ANYOF_BITMAP_CLEAR(data->start_class, value);
1361 }
1362 }
1363 else {
1364 if (data->start_class->flags & ANYOF_LOCALE)
1365 ANYOF_CLASS_SET(data->start_class,ANYOF_NALNUM);
1366 else {
1367 for (value = 0; value < 256; value++)
1368 if (!isALNUM(value))
b81d288d 1369 ANYOF_BITMAP_SET(data->start_class, value);
653099ff
GS
1370 }
1371 }
1372 break;
1373 case NALNUML:
1374 if (flags & SCF_DO_STCLASS_AND) {
1375 if (data->start_class->flags & ANYOF_LOCALE)
1376 ANYOF_CLASS_CLEAR(data->start_class,ANYOF_ALNUM);
1377 }
1378 else {
1379 data->start_class->flags |= ANYOF_LOCALE;
1380 ANYOF_CLASS_SET(data->start_class,ANYOF_NALNUM);
1381 }
1382 break;
1383 case SPACE:
1384 if (flags & SCF_DO_STCLASS_AND) {
1385 if (!(data->start_class->flags & ANYOF_LOCALE)) {
1386 ANYOF_CLASS_CLEAR(data->start_class,ANYOF_NSPACE);
1387 for (value = 0; value < 256; value++)
1388 if (!isSPACE(value))
1389 ANYOF_BITMAP_CLEAR(data->start_class, value);
1390 }
1391 }
1392 else {
1393 if (data->start_class->flags & ANYOF_LOCALE)
1394 ANYOF_CLASS_SET(data->start_class,ANYOF_SPACE);
1395 else {
1396 for (value = 0; value < 256; value++)
1397 if (isSPACE(value))
b81d288d 1398 ANYOF_BITMAP_SET(data->start_class, value);
653099ff
GS
1399 }
1400 }
1401 break;
1402 case SPACEL:
1403 if (flags & SCF_DO_STCLASS_AND) {
1404 if (data->start_class->flags & ANYOF_LOCALE)
1405 ANYOF_CLASS_CLEAR(data->start_class,ANYOF_NSPACE);
1406 }
1407 else {
1408 data->start_class->flags |= ANYOF_LOCALE;
1409 ANYOF_CLASS_SET(data->start_class,ANYOF_SPACE);
1410 }
1411 break;
1412 case NSPACE:
1413 if (flags & SCF_DO_STCLASS_AND) {
1414 if (!(data->start_class->flags & ANYOF_LOCALE)) {
1415 ANYOF_CLASS_CLEAR(data->start_class,ANYOF_SPACE);
1416 for (value = 0; value < 256; value++)
1417 if (isSPACE(value))
1418 ANYOF_BITMAP_CLEAR(data->start_class, value);
1419 }
1420 }
1421 else {
1422 if (data->start_class->flags & ANYOF_LOCALE)
1423 ANYOF_CLASS_SET(data->start_class,ANYOF_NSPACE);
1424 else {
1425 for (value = 0; value < 256; value++)
1426 if (!isSPACE(value))
b81d288d 1427 ANYOF_BITMAP_SET(data->start_class, value);
653099ff
GS
1428 }
1429 }
1430 break;
1431 case NSPACEL:
1432 if (flags & SCF_DO_STCLASS_AND) {
1433 if (data->start_class->flags & ANYOF_LOCALE) {
1434 ANYOF_CLASS_CLEAR(data->start_class,ANYOF_SPACE);
1435 for (value = 0; value < 256; value++)
1436 if (!isSPACE(value))
1437 ANYOF_BITMAP_CLEAR(data->start_class, value);
1438 }
1439 }
1440 else {
1441 data->start_class->flags |= ANYOF_LOCALE;
1442 ANYOF_CLASS_SET(data->start_class,ANYOF_NSPACE);
1443 }
1444 break;
1445 case DIGIT:
1446 if (flags & SCF_DO_STCLASS_AND) {
1447 ANYOF_CLASS_CLEAR(data->start_class,ANYOF_NDIGIT);
1448 for (value = 0; value < 256; value++)
1449 if (!isDIGIT(value))
1450 ANYOF_BITMAP_CLEAR(data->start_class, value);
1451 }
1452 else {
1453 if (data->start_class->flags & ANYOF_LOCALE)
1454 ANYOF_CLASS_SET(data->start_class,ANYOF_DIGIT);
1455 else {
1456 for (value = 0; value < 256; value++)
1457 if (isDIGIT(value))
b81d288d 1458 ANYOF_BITMAP_SET(data->start_class, value);
653099ff
GS
1459 }
1460 }
1461 break;
1462 case NDIGIT:
1463 if (flags & SCF_DO_STCLASS_AND) {
1464 ANYOF_CLASS_CLEAR(data->start_class,ANYOF_DIGIT);
1465 for (value = 0; value < 256; value++)
1466 if (isDIGIT(value))
1467 ANYOF_BITMAP_CLEAR(data->start_class, value);
1468 }
1469 else {
1470 if (data->start_class->flags & ANYOF_LOCALE)
1471 ANYOF_CLASS_SET(data->start_class,ANYOF_NDIGIT);
1472 else {
1473 for (value = 0; value < 256; value++)
1474 if (!isDIGIT(value))
b81d288d 1475 ANYOF_BITMAP_SET(data->start_class, value);
653099ff
GS
1476 }
1477 }
1478 break;
1479 }
1480 if (flags & SCF_DO_STCLASS_OR)
1481 cl_and(data->start_class, &and_with);
1482 flags &= ~SCF_DO_STCLASS;
1483 }
a0ed51b3 1484 }
22c35a8c 1485 else if (PL_regkind[(U8)OP(scan)] == EOL && flags & SCF_DO_SUBSTR) {
c277df42
IZ
1486 data->flags |= (OP(scan) == MEOL
1487 ? SF_BEFORE_MEOL
1488 : SF_BEFORE_SEOL);
a0ed51b3 1489 }
653099ff
GS
1490 else if ( PL_regkind[(U8)OP(scan)] == BRANCHJ
1491 /* Lookbehind, or need to calculate parens/evals/stclass: */
1492 && (scan->flags || data || (flags & SCF_DO_STCLASS))
c277df42 1493 && (OP(scan) == IFMATCH || OP(scan) == UNLESSM)) {
653099ff 1494 /* Lookahead/lookbehind */
cb434fcc 1495 I32 deltanext, minnext, fake = 0;
c277df42 1496 regnode *nscan;
653099ff
GS
1497 struct regnode_charclass_class intrnl;
1498 int f = 0;
c277df42
IZ
1499
1500 data_fake.flags = 0;
b81d288d 1501 if (data) {
2c2d71f5 1502 data_fake.whilem_c = data->whilem_c;
cb434fcc
IZ
1503 data_fake.last_closep = data->last_closep;
1504 }
1505 else
1506 data_fake.last_closep = &fake;
653099ff
GS
1507 if ( flags & SCF_DO_STCLASS && !scan->flags
1508 && OP(scan) == IFMATCH ) { /* Lookahead */
830247a4 1509 cl_init(pRExC_state, &intrnl);
653099ff 1510 data_fake.start_class = &intrnl;
e1901655 1511 f |= SCF_DO_STCLASS_AND;
653099ff 1512 }
e1901655
IZ
1513 if (flags & SCF_WHILEM_VISITED_POS)
1514 f |= SCF_WHILEM_VISITED_POS;
c277df42
IZ
1515 next = regnext(scan);
1516 nscan = NEXTOPER(NEXTOPER(scan));
830247a4 1517 minnext = study_chunk(pRExC_state, &nscan, &deltanext, last, &data_fake, f);
c277df42
IZ
1518 if (scan->flags) {
1519 if (deltanext) {
9baa0206 1520 vFAIL("Variable length lookbehind not implemented");
a0ed51b3
LW
1521 }
1522 else if (minnext > U8_MAX) {
9baa0206 1523 vFAIL2("Lookbehind longer than %"UVuf" not implemented", (UV)U8_MAX);
c277df42
IZ
1524 }
1525 scan->flags = minnext;
1526 }
1527 if (data && data_fake.flags & (SF_HAS_PAR|SF_IN_PAR))
1528 pars++;
405ff068 1529 if (data && (data_fake.flags & SF_HAS_EVAL))
c277df42 1530 data->flags |= SF_HAS_EVAL;
2c2d71f5
JH
1531 if (data)
1532 data->whilem_c = data_fake.whilem_c;
e1901655 1533 if (f & SCF_DO_STCLASS_AND) {
653099ff
GS
1534 int was = (data->start_class->flags & ANYOF_EOS);
1535
1536 cl_and(data->start_class, &intrnl);
1537 if (was)
1538 data->start_class->flags |= ANYOF_EOS;
1539 }
a0ed51b3
LW
1540 }
1541 else if (OP(scan) == OPEN) {
c277df42 1542 pars++;
a0ed51b3 1543 }
cb434fcc
IZ
1544 else if (OP(scan) == CLOSE) {
1545 if (ARG(scan) == is_par) {
1546 next = regnext(scan);
c277df42 1547
cb434fcc
IZ
1548 if ( next && (OP(next) != WHILEM) && next < last)
1549 is_par = 0; /* Disable optimization */
1550 }
1551 if (data)
1552 *(data->last_closep) = ARG(scan);
a0ed51b3
LW
1553 }
1554 else if (OP(scan) == EVAL) {
c277df42
IZ
1555 if (data)
1556 data->flags |= SF_HAS_EVAL;
1557 }
96776eda 1558 else if (OP(scan) == LOGICAL && scan->flags == 2) { /* Embedded follows */
0f5d15d6 1559 if (flags & SCF_DO_SUBSTR) {
830247a4 1560 scan_commit(pRExC_state,data);
0f5d15d6
IZ
1561 data->longest = &(data->longest_float);
1562 }
1563 is_inf = is_inf_internal = 1;
653099ff 1564 if (flags & SCF_DO_STCLASS_OR) /* Allow everything */
830247a4 1565 cl_anything(pRExC_state, data->start_class);
96776eda 1566 flags &= ~SCF_DO_STCLASS;
0f5d15d6 1567 }
c277df42
IZ
1568 /* Else: zero-length, ignore. */
1569 scan = regnext(scan);
1570 }
1571
1572 finish:
1573 *scanp = scan;
aca2d497 1574 *deltap = is_inf_internal ? I32_MAX : delta;
b81d288d 1575 if (flags & SCF_DO_SUBSTR && is_inf)
c277df42
IZ
1576 data->pos_delta = I32_MAX - data->pos_min;
1577 if (is_par > U8_MAX)
1578 is_par = 0;
1579 if (is_par && pars==1 && data) {
1580 data->flags |= SF_IN_PAR;
1581 data->flags &= ~SF_HAS_PAR;
a0ed51b3
LW
1582 }
1583 else if (pars && data) {
c277df42
IZ
1584 data->flags |= SF_HAS_PAR;
1585 data->flags &= ~SF_IN_PAR;
1586 }
653099ff
GS
1587 if (flags & SCF_DO_STCLASS_OR)
1588 cl_and(data->start_class, &and_with);
c277df42
IZ
1589 return min;
1590}
1591
76e3520e 1592STATIC I32
830247a4 1593S_add_data(pTHX_ RExC_state_t *pRExC_state, I32 n, char *s)
c277df42 1594{
830247a4 1595 if (RExC_rx->data) {
b81d288d
AB
1596 Renewc(RExC_rx->data,
1597 sizeof(*RExC_rx->data) + sizeof(void*) * (RExC_rx->data->count + n - 1),
c277df42 1598 char, struct reg_data);
830247a4
IZ
1599 Renew(RExC_rx->data->what, RExC_rx->data->count + n, U8);
1600 RExC_rx->data->count += n;
a0ed51b3
LW
1601 }
1602 else {
830247a4 1603 Newc(1207, RExC_rx->data, sizeof(*RExC_rx->data) + sizeof(void*) * (n - 1),
c277df42 1604 char, struct reg_data);
830247a4
IZ
1605 New(1208, RExC_rx->data->what, n, U8);
1606 RExC_rx->data->count = n;
c277df42 1607 }
830247a4
IZ
1608 Copy(s, RExC_rx->data->what + RExC_rx->data->count - n, n, U8);
1609 return RExC_rx->data->count - n;
c277df42
IZ
1610}
1611
d88dccdf 1612void
864dbfa3 1613Perl_reginitcolors(pTHX)
d88dccdf 1614{
d88dccdf
IZ
1615 int i = 0;
1616 char *s = PerlEnv_getenv("PERL_RE_COLORS");
b81d288d 1617
d88dccdf
IZ
1618 if (s) {
1619 PL_colors[0] = s = savepv(s);
1620 while (++i < 6) {
1621 s = strchr(s, '\t');
1622 if (s) {
1623 *s = '\0';
1624 PL_colors[i] = ++s;
1625 }
1626 else
c712d376 1627 PL_colors[i] = s = "";
d88dccdf
IZ
1628 }
1629 } else {
b81d288d 1630 while (i < 6)
d88dccdf
IZ
1631 PL_colors[i++] = "";
1632 }
1633 PL_colorset = 1;
1634}
1635
8615cb43 1636
a687059c 1637/*
e50aee73 1638 - pregcomp - compile a regular expression into internal code
a687059c
LW
1639 *
1640 * We can't allocate space until we know how big the compiled form will be,
1641 * but we can't compile it (and thus know how big it is) until we've got a
1642 * place to put the code. So we cheat: we compile it twice, once with code
1643 * generation turned off and size counting turned on, and once "for real".
1644 * This also means that we don't allocate space until we are sure that the
1645 * thing really will compile successfully, and we never have to move the
1646 * code and thus invalidate pointers into it. (Note that it has to be in
1647 * one piece because free() must be able to free it all.) [NB: not true in perl]
1648 *
1649 * Beware that the optimization-preparation code in here knows about some
1650 * of the structure of the compiled regexp. [I'll say.]
1651 */
1652regexp *
864dbfa3 1653Perl_pregcomp(pTHX_ char *exp, char *xend, PMOP *pm)
a687059c 1654{
a0d0e21e 1655 register regexp *r;
c277df42 1656 regnode *scan;
c277df42 1657 regnode *first;
a0d0e21e 1658 I32 flags;
a0d0e21e
LW
1659 I32 minlen = 0;
1660 I32 sawplus = 0;
1661 I32 sawopen = 0;
2c2d71f5 1662 scan_data_t data;
830247a4
IZ
1663 RExC_state_t RExC_state;
1664 RExC_state_t *pRExC_state = &RExC_state;
a0d0e21e
LW
1665
1666 if (exp == NULL)
c277df42 1667 FAIL("NULL regexp argument");
a0d0e21e 1668
84e09d5e
JH
1669 /* XXXX This looks very suspicious... */
1670 if (pm->op_pmdynflags & PMdf_CMP_UTF8)
1671 RExC_utf8 = 1;
1672 else
1673 RExC_utf8 = 0;
a0ed51b3 1674
5cfc7842 1675 RExC_precomp = exp;
35ef4773 1676 DEBUG_r(if (!PL_colorset) reginitcolors());
2c2d71f5 1677 DEBUG_r(PerlIO_printf(Perl_debug_log, "%sCompiling REx%s `%s%*s%s'\n",
d88dccdf 1678 PL_colors[4],PL_colors[5],PL_colors[0],
830247a4
IZ
1679 (int)(xend - exp), RExC_precomp, PL_colors[1]));
1680 RExC_flags16 = pm->op_pmflags;
1681 RExC_sawback = 0;
bbce6d69 1682
830247a4
IZ
1683 RExC_seen = 0;
1684 RExC_seen_zerolen = *exp == '^' ? -1 : 0;
1685 RExC_seen_evals = 0;
1686 RExC_extralen = 0;
c277df42 1687
bbce6d69 1688 /* First pass: determine size, legality. */
830247a4 1689 RExC_parse = exp;
fac92740 1690 RExC_start = exp;
830247a4
IZ
1691 RExC_end = xend;
1692 RExC_naughty = 0;
1693 RExC_npar = 1;
1694 RExC_size = 0L;
1695 RExC_emit = &PL_regdummy;
1696 RExC_whilem_seen = 0;
85ddcde9
JH
1697#if 0 /* REGC() is (currently) a NOP at the first pass.
1698 * Clever compilers notice this and complain. --jhi */
830247a4 1699 REGC((U8)REG_MAGIC, (char*)RExC_emit);
85ddcde9 1700#endif
830247a4 1701 if (reg(pRExC_state, 0, &flags) == NULL) {
830247a4 1702 RExC_precomp = Nullch;
a0d0e21e
LW
1703 return(NULL);
1704 }
830247a4 1705 DEBUG_r(PerlIO_printf(Perl_debug_log, "size %"IVdf" ", (IV)RExC_size));
c277df42 1706
c277df42
IZ
1707 /* Small enough for pointer-storage convention?
1708 If extralen==0, this means that we will not need long jumps. */
830247a4
IZ
1709 if (RExC_size >= 0x10000L && RExC_extralen)
1710 RExC_size += RExC_extralen;
c277df42 1711 else
830247a4
IZ
1712 RExC_extralen = 0;
1713 if (RExC_whilem_seen > 15)
1714 RExC_whilem_seen = 15;
a0d0e21e 1715
bbce6d69 1716 /* Allocate space and initialize. */
830247a4 1717 Newc(1001, r, sizeof(regexp) + (unsigned)RExC_size * sizeof(regnode),
c277df42 1718 char, regexp);
a0d0e21e 1719 if (r == NULL)
b45f050a
JF
1720 FAIL("Regexp out of space");
1721
0f79a09d
GS
1722#ifdef DEBUGGING
1723 /* avoid reading uninitialized memory in DEBUGGING code in study_chunk() */
830247a4 1724 Zero(r, sizeof(regexp) + (unsigned)RExC_size * sizeof(regnode), char);
0f79a09d 1725#endif
c277df42 1726 r->refcnt = 1;
bbce6d69 1727 r->prelen = xend - exp;
5cfc7842 1728 r->precomp = savepvn(RExC_precomp, r->prelen);
cf93c79d
IZ
1729 r->subbeg = NULL;
1730 r->reganch = pm->op_pmflags & PMf_COMPILETIME;
830247a4 1731 r->nparens = RExC_npar - 1; /* set early to validate backrefs */
4327152a
IZ
1732
1733 r->substrs = 0; /* Useful during FAIL. */
1734 r->startp = 0; /* Useful during FAIL. */
1735 r->endp = 0; /* Useful during FAIL. */
1736
fac92740
MJD
1737 Newz(1304, r->offsets, 2*RExC_size+1, U32); /* MJD 20001228 */
1738 if (r->offsets) {
1739 r->offsets[0] = RExC_size;
1740 }
1741 DEBUG_r(PerlIO_printf(Perl_debug_log,
392fbf5d 1742 "%s %"UVuf" bytes for offset annotations.\n",
fac92740 1743 r->offsets ? "Got" : "Couldn't get",
392fbf5d 1744 (UV)((2*RExC_size+1) * sizeof(U32))));
fac92740 1745
830247a4 1746 RExC_rx = r;
bbce6d69 1747
1748 /* Second pass: emit code. */
055bb491 1749 RExC_flags16 = pm->op_pmflags; /* don't let top level (?i) bleed */
830247a4
IZ
1750 RExC_parse = exp;
1751 RExC_end = xend;
1752 RExC_naughty = 0;
1753 RExC_npar = 1;
fac92740 1754 RExC_emit_start = r->program;
830247a4 1755 RExC_emit = r->program;
2cd61cdb 1756 /* Store the count of eval-groups for security checks: */
830247a4
IZ
1757 RExC_emit->next_off = ((RExC_seen_evals > U16_MAX) ? U16_MAX : RExC_seen_evals);
1758 REGC((U8)REG_MAGIC, (char*) RExC_emit++);
c277df42 1759 r->data = 0;
830247a4 1760 if (reg(pRExC_state, 0, &flags) == NULL)
a0d0e21e
LW
1761 return(NULL);
1762
1763 /* Dig out information for optimizations. */
cf93c79d 1764 r->reganch = pm->op_pmflags & PMf_COMPILETIME; /* Again? */
830247a4 1765 pm->op_pmflags = RExC_flags16;
a0ed51b3 1766 if (UTF)
5ff6fc6d 1767 r->reganch |= ROPT_UTF8; /* Unicode in it? */
c277df42 1768 r->regstclass = NULL;
830247a4 1769 if (RExC_naughty >= 10) /* Probably an expensive pattern. */
a0ed51b3 1770 r->reganch |= ROPT_NAUGHTY;
c277df42 1771 scan = r->program + 1; /* First BRANCH. */
2779dcf1
IZ
1772
1773 /* XXXX To minimize changes to RE engine we always allocate
1774 3-units-long substrs field. */
1775 Newz(1004, r->substrs, 1, struct reg_substr_data);
1776
2c2d71f5 1777 StructCopy(&zero_scan_data, &data, scan_data_t);
653099ff 1778 /* XXXX Should not we check for something else? Usually it is OPEN1... */
c277df42 1779 if (OP(scan) != BRANCH) { /* Only one top-level choice. */
c277df42 1780 I32 fake;
c5254dd6 1781 STRLEN longest_float_length, longest_fixed_length;
653099ff
GS
1782 struct regnode_charclass_class ch_class;
1783 int stclass_flag;
cb434fcc 1784 I32 last_close = 0;
a0d0e21e
LW
1785
1786 first = scan;
c277df42 1787 /* Skip introductions and multiplicators >= 1. */
a0d0e21e 1788 while ((OP(first) == OPEN && (sawopen = 1)) ||
653099ff 1789 /* An OR of *one* alternative - should not happen now. */
a0d0e21e
LW
1790 (OP(first) == BRANCH && OP(regnext(first)) != BRANCH) ||
1791 (OP(first) == PLUS) ||
1792 (OP(first) == MINMOD) ||
653099ff 1793 /* An {n,m} with n>0 */
22c35a8c 1794 (PL_regkind[(U8)OP(first)] == CURLY && ARG1(first) > 0) ) {
a0d0e21e
LW
1795 if (OP(first) == PLUS)
1796 sawplus = 1;
1797 else
1798 first += regarglen[(U8)OP(first)];
1799 first = NEXTOPER(first);
a687059c
LW
1800 }
1801
a0d0e21e
LW
1802 /* Starting-point info. */
1803 again:
653099ff 1804 if (PL_regkind[(U8)OP(first)] == EXACT) {
1aa99e6b
IH
1805 if (OP(first) == EXACT)
1806 ; /* Empty, get anchored substr later. */
1807 else if ((OP(first) == EXACTF || OP(first) == EXACTFL))
b3c9acc1
IZ
1808 r->regstclass = first;
1809 }
653099ff 1810 else if (strchr((char*)PL_simple,OP(first)))
a0d0e21e 1811 r->regstclass = first;
22c35a8c
GS
1812 else if (PL_regkind[(U8)OP(first)] == BOUND ||
1813 PL_regkind[(U8)OP(first)] == NBOUND)
a0d0e21e 1814 r->regstclass = first;
22c35a8c 1815 else if (PL_regkind[(U8)OP(first)] == BOL) {
cad2e5aa
JH
1816 r->reganch |= (OP(first) == MBOL
1817 ? ROPT_ANCH_MBOL
1818 : (OP(first) == SBOL
1819 ? ROPT_ANCH_SBOL
1820 : ROPT_ANCH_BOL));
a0d0e21e 1821 first = NEXTOPER(first);
774d564b 1822 goto again;
1823 }
1824 else if (OP(first) == GPOS) {
1825 r->reganch |= ROPT_ANCH_GPOS;
1826 first = NEXTOPER(first);
1827 goto again;
a0d0e21e
LW
1828 }
1829 else if ((OP(first) == STAR &&
22c35a8c 1830 PL_regkind[(U8)OP(NEXTOPER(first))] == REG_ANY) &&
a0d0e21e
LW
1831 !(r->reganch & ROPT_ANCH) )
1832 {
1833 /* turn .* into ^.* with an implied $*=1 */
cad2e5aa
JH
1834 int type = OP(NEXTOPER(first));
1835
ffc61ed2 1836 if (type == REG_ANY)
cad2e5aa
JH
1837 type = ROPT_ANCH_MBOL;
1838 else
1839 type = ROPT_ANCH_SBOL;
1840
1841 r->reganch |= type | ROPT_IMPLICIT;
a0d0e21e 1842 first = NEXTOPER(first);
774d564b 1843 goto again;
a0d0e21e 1844 }
b81d288d 1845 if (sawplus && (!sawopen || !RExC_sawback)
830247a4 1846 && !(RExC_seen & REG_SEEN_EVAL)) /* May examine pos and $& */
cad2e5aa
JH
1847 /* x+ must match at the 1st pos of run of x's */
1848 r->reganch |= ROPT_SKIP;
a0d0e21e 1849
c277df42 1850 /* Scan is after the zeroth branch, first is atomic matcher. */
b81d288d 1851 DEBUG_r(PerlIO_printf(Perl_debug_log, "first at %"IVdf"\n",
b900a521 1852 (IV)(first - scan + 1)));
a0d0e21e
LW
1853 /*
1854 * If there's something expensive in the r.e., find the
1855 * longest literal string that must appear and make it the
1856 * regmust. Resolve ties in favor of later strings, since
1857 * the regstart check works with the beginning of the r.e.
1858 * and avoiding duplication strengthens checking. Not a
1859 * strong reason, but sufficient in the absence of others.
1860 * [Now we resolve ties in favor of the earlier string if
c277df42 1861 * it happens that c_offset_min has been invalidated, since the
a0d0e21e
LW
1862 * earlier string may buy us something the later one won't.]
1863 */
a0d0e21e 1864 minlen = 0;
a687059c 1865
79cb57f6
GS
1866 data.longest_fixed = newSVpvn("",0);
1867 data.longest_float = newSVpvn("",0);
1868 data.last_found = newSVpvn("",0);
c277df42
IZ
1869 data.longest = &(data.longest_fixed);
1870 first = scan;
653099ff 1871 if (!r->regstclass) {
830247a4 1872 cl_init(pRExC_state, &ch_class);
653099ff
GS
1873 data.start_class = &ch_class;
1874 stclass_flag = SCF_DO_STCLASS_AND;
1875 } else /* XXXX Check for BOUND? */
1876 stclass_flag = 0;
cb434fcc 1877 data.last_closep = &last_close;
653099ff 1878
830247a4 1879 minlen = study_chunk(pRExC_state, &first, &fake, scan + RExC_size, /* Up to end */
e1901655 1880 &data, SCF_DO_SUBSTR | SCF_WHILEM_VISITED_POS | stclass_flag);
830247a4 1881 if ( RExC_npar == 1 && data.longest == &(data.longest_fixed)
b81d288d 1882 && data.last_start_min == 0 && data.last_end > 0
830247a4
IZ
1883 && !RExC_seen_zerolen
1884 && (!(RExC_seen & REG_SEEN_GPOS) || (r->reganch & ROPT_ANCH_GPOS)))
c277df42 1885 r->reganch |= ROPT_CHECK_ALL;
830247a4 1886 scan_commit(pRExC_state, &data);
c277df42
IZ
1887 SvREFCNT_dec(data.last_found);
1888
a0ed51b3 1889 longest_float_length = CHR_SVLEN(data.longest_float);
c5254dd6 1890 if (longest_float_length
c277df42
IZ
1891 || (data.flags & SF_FL_BEFORE_EOL
1892 && (!(data.flags & SF_FL_BEFORE_MEOL)
830247a4 1893 || (RExC_flags16 & PMf_MULTILINE)))) {
cf93c79d
IZ
1894 int t;
1895
a0ed51b3 1896 if (SvCUR(data.longest_fixed) /* ok to leave SvCUR */
aca2d497
IZ
1897 && data.offset_fixed == data.offset_float_min
1898 && SvCUR(data.longest_fixed) == SvCUR(data.longest_float))
1899 goto remove_float; /* As in (a)+. */
1900
c277df42
IZ
1901 r->float_substr = data.longest_float;
1902 r->float_min_offset = data.offset_float_min;
1903 r->float_max_offset = data.offset_float_max;
cf93c79d
IZ
1904 t = (data.flags & SF_FL_BEFORE_EOL /* Can't have SEOL and MULTI */
1905 && (!(data.flags & SF_FL_BEFORE_MEOL)
830247a4 1906 || (RExC_flags16 & PMf_MULTILINE)));
cf93c79d 1907 fbm_compile(r->float_substr, t ? FBMcf_TAIL : 0);
a0ed51b3
LW
1908 }
1909 else {
aca2d497 1910 remove_float:
c277df42
IZ
1911 r->float_substr = Nullsv;
1912 SvREFCNT_dec(data.longest_float);
c5254dd6 1913 longest_float_length = 0;
a0d0e21e 1914 }
c277df42 1915
a0ed51b3 1916 longest_fixed_length = CHR_SVLEN(data.longest_fixed);
c5254dd6 1917 if (longest_fixed_length
c277df42
IZ
1918 || (data.flags & SF_FIX_BEFORE_EOL /* Cannot have SEOL and MULTI */
1919 && (!(data.flags & SF_FIX_BEFORE_MEOL)
830247a4 1920 || (RExC_flags16 & PMf_MULTILINE)))) {
cf93c79d
IZ
1921 int t;
1922
c277df42
IZ
1923 r->anchored_substr = data.longest_fixed;
1924 r->anchored_offset = data.offset_fixed;
cf93c79d
IZ
1925 t = (data.flags & SF_FIX_BEFORE_EOL /* Can't have SEOL and MULTI */
1926 && (!(data.flags & SF_FIX_BEFORE_MEOL)
830247a4 1927 || (RExC_flags16 & PMf_MULTILINE)));
cf93c79d 1928 fbm_compile(r->anchored_substr, t ? FBMcf_TAIL : 0);
a0ed51b3
LW
1929 }
1930 else {
c277df42
IZ
1931 r->anchored_substr = Nullsv;
1932 SvREFCNT_dec(data.longest_fixed);
c5254dd6 1933 longest_fixed_length = 0;
a0d0e21e 1934 }
b81d288d 1935 if (r->regstclass
ffc61ed2 1936 && (OP(r->regstclass) == REG_ANY || OP(r->regstclass) == SANY))
653099ff
GS
1937 r->regstclass = NULL;
1938 if ((!r->anchored_substr || r->anchored_offset) && stclass_flag
1939 && !(data.start_class->flags & ANYOF_EOS)
1940 && !cl_is_anything(data.start_class)) {
830247a4 1941 I32 n = add_data(pRExC_state, 1, "f");
653099ff 1942
b81d288d 1943 New(1006, RExC_rx->data->data[n], 1,
653099ff
GS
1944 struct regnode_charclass_class);
1945 StructCopy(data.start_class,
830247a4 1946 (struct regnode_charclass_class*)RExC_rx->data->data[n],
653099ff 1947 struct regnode_charclass_class);
830247a4 1948 r->regstclass = (regnode*)RExC_rx->data->data[n];
653099ff 1949 r->reganch &= ~ROPT_SKIP; /* Used in find_byclass(). */
ffc61ed2 1950 PL_regdata = r->data; /* for regprop() */
9c5ffd7c
JH
1951 DEBUG_r({ SV *sv = sv_newmortal();
1952 regprop(sv, (regnode*)data.start_class);
1953 PerlIO_printf(Perl_debug_log,
1954 "synthetic stclass `%s'.\n",
1955 SvPVX(sv));});
653099ff 1956 }
c277df42
IZ
1957
1958 /* A temporary algorithm prefers floated substr to fixed one to dig more info. */
c5254dd6 1959 if (longest_fixed_length > longest_float_length) {
c277df42
IZ
1960 r->check_substr = r->anchored_substr;
1961 r->check_offset_min = r->check_offset_max = r->anchored_offset;
1962 if (r->reganch & ROPT_ANCH_SINGLE)
1963 r->reganch |= ROPT_NOSCAN;
a0ed51b3
LW
1964 }
1965 else {
c277df42
IZ
1966 r->check_substr = r->float_substr;
1967 r->check_offset_min = data.offset_float_min;
1968 r->check_offset_max = data.offset_float_max;
a0d0e21e 1969 }
30382c73
IZ
1970 /* XXXX Currently intuiting is not compatible with ANCH_GPOS.
1971 This should be changed ASAP! */
1972 if (r->check_substr && !(r->reganch & ROPT_ANCH_GPOS)) {
cad2e5aa
JH
1973 r->reganch |= RE_USE_INTUIT;
1974 if (SvTAIL(r->check_substr))
1975 r->reganch |= RE_INTUIT_TAIL;
1976 }
a0ed51b3
LW
1977 }
1978 else {
c277df42
IZ
1979 /* Several toplevels. Best we can is to set minlen. */
1980 I32 fake;
653099ff 1981 struct regnode_charclass_class ch_class;
cb434fcc 1982 I32 last_close = 0;
c277df42
IZ
1983
1984 DEBUG_r(PerlIO_printf(Perl_debug_log, "\n"));
1985 scan = r->program + 1;
830247a4 1986 cl_init(pRExC_state, &ch_class);
653099ff 1987 data.start_class = &ch_class;
cb434fcc 1988 data.last_closep = &last_close;
e1901655 1989 minlen = study_chunk(pRExC_state, &scan, &fake, scan + RExC_size, &data, SCF_DO_STCLASS_AND|SCF_WHILEM_VISITED_POS);
c277df42 1990 r->check_substr = r->anchored_substr = r->float_substr = Nullsv;
653099ff
GS
1991 if (!(data.start_class->flags & ANYOF_EOS)
1992 && !cl_is_anything(data.start_class)) {
830247a4 1993 I32 n = add_data(pRExC_state, 1, "f");
653099ff 1994
b81d288d 1995 New(1006, RExC_rx->data->data[n], 1,
653099ff
GS
1996 struct regnode_charclass_class);
1997 StructCopy(data.start_class,
830247a4 1998 (struct regnode_charclass_class*)RExC_rx->data->data[n],
653099ff 1999 struct regnode_charclass_class);
830247a4 2000 r->regstclass = (regnode*)RExC_rx->data->data[n];
653099ff 2001 r->reganch &= ~ROPT_SKIP; /* Used in find_byclass(). */
9c5ffd7c
JH
2002 DEBUG_r({ SV* sv = sv_newmortal();
2003 regprop(sv, (regnode*)data.start_class);
2004 PerlIO_printf(Perl_debug_log,
2005 "synthetic stclass `%s'.\n",
2006 SvPVX(sv));});
653099ff 2007 }
a0d0e21e
LW
2008 }
2009
a0d0e21e 2010 r->minlen = minlen;
b81d288d 2011 if (RExC_seen & REG_SEEN_GPOS)
c277df42 2012 r->reganch |= ROPT_GPOS_SEEN;
830247a4 2013 if (RExC_seen & REG_SEEN_LOOKBEHIND)
c277df42 2014 r->reganch |= ROPT_LOOKBEHIND_SEEN;
830247a4 2015 if (RExC_seen & REG_SEEN_EVAL)
ce862d02 2016 r->reganch |= ROPT_EVAL_SEEN;
f33976b4
DB
2017 if (RExC_seen & REG_SEEN_CANY)
2018 r->reganch |= ROPT_CANY_SEEN;
830247a4
IZ
2019 Newz(1002, r->startp, RExC_npar, I32);
2020 Newz(1002, r->endp, RExC_npar, I32);
ffc61ed2 2021 PL_regdata = r->data; /* for regprop() */
a0d0e21e
LW
2022 DEBUG_r(regdump(r));
2023 return(r);
a687059c
LW
2024}
2025
2026/*
2027 - reg - regular expression, i.e. main body or parenthesized thing
2028 *
2029 * Caller must absorb opening parenthesis.
2030 *
2031 * Combining parenthesis handling with the base level of regular expression
2032 * is a trifle forced, but the need to tie the tails of the branches to what
2033 * follows makes it hard to avoid.
2034 */
76e3520e 2035STATIC regnode *
830247a4 2036S_reg(pTHX_ RExC_state_t *pRExC_state, I32 paren, I32 *flagp)
c277df42 2037 /* paren: Parenthesized? 0=top, 1=(, inside: changed to letter. */
a687059c 2038{
c277df42
IZ
2039 register regnode *ret; /* Will be the head of the group. */
2040 register regnode *br;
2041 register regnode *lastbr;
2042 register regnode *ender = 0;
a0d0e21e 2043 register I32 parno = 0;
830247a4 2044 I32 flags, oregflags = RExC_flags16, have_branch = 0, open = 0;
9d1d55b5
JP
2045
2046 /* for (?g), (?gc), and (?o) warnings; warning
2047 about (?c) will warn about (?g) -- japhy */
2048
2049 I32 wastedflags = 0x00,
2050 wasted_o = 0x01,
2051 wasted_g = 0x02,
2052 wasted_gc = 0x02 | 0x04,
2053 wasted_c = 0x04;
2054
fac92740 2055 char * parse_start = RExC_parse; /* MJD */
830247a4 2056 char *oregcomp_parse = RExC_parse;
c277df42 2057 char c;
a0d0e21e 2058
821b33a5 2059 *flagp = 0; /* Tentatively. */
a0d0e21e 2060
9d1d55b5 2061
a0d0e21e
LW
2062 /* Make an OPEN node, if parenthesized. */
2063 if (paren) {
fac92740 2064 if (*RExC_parse == '?') { /* (?...) */
ca9dfc88
IZ
2065 U16 posflags = 0, negflags = 0;
2066 U16 *flagsp = &posflags;
0f5d15d6 2067 int logical = 0;
830247a4 2068 char *seqstart = RExC_parse;
ca9dfc88 2069
830247a4
IZ
2070 RExC_parse++;
2071 paren = *RExC_parse++;
c277df42 2072 ret = NULL; /* For look-ahead/behind. */
a0d0e21e 2073 switch (paren) {
fac92740 2074 case '<': /* (?<...) */
830247a4 2075 RExC_seen |= REG_SEEN_LOOKBEHIND;
b81d288d 2076 if (*RExC_parse == '!')
c277df42 2077 paren = ',';
b81d288d 2078 if (*RExC_parse != '=' && *RExC_parse != '!')
c277df42 2079 goto unknown;
830247a4 2080 RExC_parse++;
fac92740
MJD
2081 case '=': /* (?=...) */
2082 case '!': /* (?!...) */
830247a4 2083 RExC_seen_zerolen++;
fac92740
MJD
2084 case ':': /* (?:...) */
2085 case '>': /* (?>...) */
a0d0e21e 2086 break;
fac92740
MJD
2087 case '$': /* (?$...) */
2088 case '@': /* (?@...) */
8615cb43 2089 vFAIL2("Sequence (?%c...) not implemented", (int)paren);
a0d0e21e 2090 break;
fac92740 2091 case '#': /* (?#...) */
830247a4
IZ
2092 while (*RExC_parse && *RExC_parse != ')')
2093 RExC_parse++;
2094 if (*RExC_parse != ')')
c277df42 2095 FAIL("Sequence (?#... not terminated");
830247a4 2096 nextchar(pRExC_state);
a0d0e21e
LW
2097 *flagp = TRYAGAIN;
2098 return NULL;
fac92740 2099 case 'p': /* (?p...) */
f9373011
PM
2100 if (SIZE_ONLY && ckWARN2(WARN_DEPRECATED, WARN_REGEXP))
2101 vWARNdep(RExC_parse, "(?p{}) is deprecated - use (??{})");
8c8ad484 2102 /* FALL THROUGH*/
fac92740 2103 case '?': /* (??...) */
0f5d15d6 2104 logical = 1;
830247a4 2105 paren = *RExC_parse++;
0f5d15d6 2106 /* FALL THROUGH */
fac92740 2107 case '{': /* (?{...}) */
c277df42 2108 {
c277df42
IZ
2109 I32 count = 1, n = 0;
2110 char c;
830247a4 2111 char *s = RExC_parse;
c277df42
IZ
2112 SV *sv;
2113 OP_4tree *sop, *rop;
2114
830247a4
IZ
2115 RExC_seen_zerolen++;
2116 RExC_seen |= REG_SEEN_EVAL;
2117 while (count && (c = *RExC_parse)) {
2118 if (c == '\\' && RExC_parse[1])
2119 RExC_parse++;
b81d288d 2120 else if (c == '{')
c277df42 2121 count++;
b81d288d 2122 else if (c == '}')
c277df42 2123 count--;
830247a4 2124 RExC_parse++;
c277df42 2125 }
830247a4 2126 if (*RExC_parse != ')')
b45f050a 2127 {
b81d288d 2128 RExC_parse = s;
b45f050a
JF
2129 vFAIL("Sequence (?{...}) not terminated or not {}-balanced");
2130 }
c277df42
IZ
2131 if (!SIZE_ONLY) {
2132 AV *av;
b81d288d
AB
2133
2134 if (RExC_parse - 1 - s)
830247a4 2135 sv = newSVpvn(s, RExC_parse - 1 - s);
c277df42 2136 else
79cb57f6 2137 sv = newSVpvn("", 0);
c277df42 2138
569233ed
SB
2139 ENTER;
2140 Perl_save_re_context(aTHX);
c277df42 2141 rop = sv_compile_2op(sv, &sop, "re", &av);
9b978d73
DM
2142 sop->op_private |= OPpREFCOUNTED;
2143 /* re_dup will OpREFCNT_inc */
2144 OpREFCNT_set(sop, 1);
569233ed 2145 LEAVE;
c277df42 2146
830247a4
IZ
2147 n = add_data(pRExC_state, 3, "nop");
2148 RExC_rx->data->data[n] = (void*)rop;
2149 RExC_rx->data->data[n+1] = (void*)sop;
2150 RExC_rx->data->data[n+2] = (void*)av;
c277df42 2151 SvREFCNT_dec(sv);
a0ed51b3 2152 }
e24b16f9 2153 else { /* First pass */
830247a4 2154 if (PL_reginterp_cnt < ++RExC_seen_evals
e24b16f9 2155 && PL_curcop != &PL_compiling)
2cd61cdb
IZ
2156 /* No compiled RE interpolated, has runtime
2157 components ===> unsafe. */
2158 FAIL("Eval-group not allowed at runtime, use re 'eval'");
5b61d3f7 2159 if (PL_tainting && PL_tainted)
cc6b7395 2160 FAIL("Eval-group in insecure regular expression");
c277df42
IZ
2161 }
2162
830247a4 2163 nextchar(pRExC_state);
0f5d15d6 2164 if (logical) {
830247a4 2165 ret = reg_node(pRExC_state, LOGICAL);
0f5d15d6
IZ
2166 if (!SIZE_ONLY)
2167 ret->flags = 2;
830247a4 2168 regtail(pRExC_state, ret, reganode(pRExC_state, EVAL, n));
fac92740 2169 /* deal with the length of this later - MJD */
0f5d15d6
IZ
2170 return ret;
2171 }
830247a4 2172 return reganode(pRExC_state, EVAL, n);
c277df42 2173 }
fac92740 2174 case '(': /* (?(?{...})...) and (?(?=...)...) */
c277df42 2175 {
fac92740 2176 if (RExC_parse[0] == '?') { /* (?(?...)) */
b81d288d
AB
2177 if (RExC_parse[1] == '=' || RExC_parse[1] == '!'
2178 || RExC_parse[1] == '<'
830247a4 2179 || RExC_parse[1] == '{') { /* Lookahead or eval. */
c277df42
IZ
2180 I32 flag;
2181
830247a4 2182 ret = reg_node(pRExC_state, LOGICAL);
0f5d15d6
IZ
2183 if (!SIZE_ONLY)
2184 ret->flags = 1;
830247a4 2185 regtail(pRExC_state, ret, reg(pRExC_state, 1, &flag));
c277df42 2186 goto insert_if;
b81d288d 2187 }
a0ed51b3 2188 }
830247a4 2189 else if (RExC_parse[0] >= '1' && RExC_parse[0] <= '9' ) {
fac92740 2190 /* (?(1)...) */
830247a4 2191 parno = atoi(RExC_parse++);
c277df42 2192
830247a4
IZ
2193 while (isDIGIT(*RExC_parse))
2194 RExC_parse++;
fac92740
MJD
2195 ret = reganode(pRExC_state, GROUPP, parno);
2196
830247a4 2197 if ((c = *nextchar(pRExC_state)) != ')')
b45f050a 2198 vFAIL("Switch condition not recognized");
c277df42 2199 insert_if:
830247a4
IZ
2200 regtail(pRExC_state, ret, reganode(pRExC_state, IFTHEN, 0));
2201 br = regbranch(pRExC_state, &flags, 1);
c277df42 2202 if (br == NULL)
830247a4 2203 br = reganode(pRExC_state, LONGJMP, 0);
c277df42 2204 else
830247a4
IZ
2205 regtail(pRExC_state, br, reganode(pRExC_state, LONGJMP, 0));
2206 c = *nextchar(pRExC_state);
d1b80229
IZ
2207 if (flags&HASWIDTH)
2208 *flagp |= HASWIDTH;
c277df42 2209 if (c == '|') {
830247a4
IZ
2210 lastbr = reganode(pRExC_state, IFTHEN, 0); /* Fake one for optimizer. */
2211 regbranch(pRExC_state, &flags, 1);
2212 regtail(pRExC_state, ret, lastbr);
d1b80229
IZ
2213 if (flags&HASWIDTH)
2214 *flagp |= HASWIDTH;
830247a4 2215 c = *nextchar(pRExC_state);
a0ed51b3
LW
2216 }
2217 else
c277df42
IZ
2218 lastbr = NULL;
2219 if (c != ')')
8615cb43 2220 vFAIL("Switch (?(condition)... contains too many branches");
830247a4
IZ
2221 ender = reg_node(pRExC_state, TAIL);
2222 regtail(pRExC_state, br, ender);
c277df42 2223 if (lastbr) {
830247a4
IZ
2224 regtail(pRExC_state, lastbr, ender);
2225 regtail(pRExC_state, NEXTOPER(NEXTOPER(lastbr)), ender);
a0ed51b3
LW
2226 }
2227 else
830247a4 2228 regtail(pRExC_state, ret, ender);
c277df42 2229 return ret;
a0ed51b3
LW
2230 }
2231 else {
830247a4 2232 vFAIL2("Unknown switch condition (?(%.2s", RExC_parse);
c277df42
IZ
2233 }
2234 }
1b1626e4 2235 case 0:
830247a4 2236 RExC_parse--; /* for vFAIL to print correctly */
8615cb43 2237 vFAIL("Sequence (? incomplete");
1b1626e4 2238 break;
a0d0e21e 2239 default:
830247a4 2240 --RExC_parse;
fac92740 2241 parse_flags: /* (?i) */
830247a4 2242 while (*RExC_parse && strchr("iogcmsx", *RExC_parse)) {
9d1d55b5
JP
2243 /* (?g), (?gc) and (?o) are useless here
2244 and must be globally applied -- japhy */
2245
2246 if (*RExC_parse == 'o' || *RExC_parse == 'g') {
2247 if (SIZE_ONLY && ckWARN(WARN_REGEXP)) {
2248 I32 wflagbit = *RExC_parse == 'o' ? wasted_o : wasted_g;
2249 if (! (wastedflags & wflagbit) ) {
2250 wastedflags |= wflagbit;
2251 vWARN5(
2252 RExC_parse + 1,
2253 "Useless (%s%c) - %suse /%c modifier",
2254 flagsp == &negflags ? "?-" : "?",
2255 *RExC_parse,
2256 flagsp == &negflags ? "don't " : "",
2257 *RExC_parse
2258 );
2259 }
2260 }
2261 }
2262 else if (*RExC_parse == 'c') {
2263 if (SIZE_ONLY && ckWARN(WARN_REGEXP)) {
2264 if (! (wastedflags & wasted_c) ) {
2265 wastedflags |= wasted_gc;
2266 vWARN3(
2267 RExC_parse + 1,
2268 "Useless (%sc) - %suse /gc modifier",
2269 flagsp == &negflags ? "?-" : "?",
2270 flagsp == &negflags ? "don't " : ""
2271 );
2272 }
2273 }
2274 }
2275 else { pmflag(flagsp, *RExC_parse); }
2276
830247a4 2277 ++RExC_parse;
ca9dfc88 2278 }
830247a4 2279 if (*RExC_parse == '-') {
ca9dfc88 2280 flagsp = &negflags;
9d1d55b5 2281 wastedflags = 0; /* reset so (?g-c) warns twice */
830247a4 2282 ++RExC_parse;
ca9dfc88 2283 goto parse_flags;
48c036b1 2284 }
830247a4
IZ
2285 RExC_flags16 |= posflags;
2286 RExC_flags16 &= ~negflags;
2287 if (*RExC_parse == ':') {
2288 RExC_parse++;
ca9dfc88
IZ
2289 paren = ':';
2290 break;
2291 }
c277df42 2292 unknown:
830247a4
IZ
2293 if (*RExC_parse != ')') {
2294 RExC_parse++;
2295 vFAIL3("Sequence (%.*s...) not recognized", RExC_parse-seqstart, seqstart);
b45f050a 2296 }
830247a4 2297 nextchar(pRExC_state);
a0d0e21e
LW
2298 *flagp = TRYAGAIN;
2299 return NULL;
2300 }
2301 }
fac92740 2302 else { /* (...) */
830247a4
IZ
2303 parno = RExC_npar;
2304 RExC_npar++;
2305 ret = reganode(pRExC_state, OPEN, parno);
fac92740
MJD
2306 Set_Node_Length(ret, 1); /* MJD */
2307 Set_Node_Offset(ret, RExC_parse); /* MJD */
c277df42 2308 open = 1;
a0d0e21e 2309 }
a0ed51b3 2310 }
fac92740 2311 else /* ! paren */
a0d0e21e
LW
2312 ret = NULL;
2313
2314 /* Pick up the branches, linking them together. */
fac92740 2315 parse_start = RExC_parse; /* MJD */
830247a4 2316 br = regbranch(pRExC_state, &flags, 1);
fac92740
MJD
2317 /* branch_len = (paren != 0); */
2318
a0d0e21e
LW
2319 if (br == NULL)
2320 return(NULL);
830247a4
IZ
2321 if (*RExC_parse == '|') {
2322 if (!SIZE_ONLY && RExC_extralen) {
2323 reginsert(pRExC_state, BRANCHJ, br);
a0ed51b3 2324 }
fac92740 2325 else { /* MJD */
830247a4 2326 reginsert(pRExC_state, BRANCH, br);
fac92740
MJD
2327 Set_Node_Length(br, paren != 0);
2328 Set_Node_Offset_To_R(br-RExC_emit_start, parse_start-RExC_start);
2329 }
c277df42
IZ
2330 have_branch = 1;
2331 if (SIZE_ONLY)
830247a4 2332 RExC_extralen += 1; /* For BRANCHJ-BRANCH. */
a0ed51b3
LW
2333 }
2334 else if (paren == ':') {
c277df42
IZ
2335 *flagp |= flags&SIMPLE;
2336 }
2337 if (open) { /* Starts with OPEN. */
830247a4 2338 regtail(pRExC_state, ret, br); /* OPEN -> first. */
a0ed51b3
LW
2339 }
2340 else if (paren != '?') /* Not Conditional */
a0d0e21e 2341 ret = br;
821b33a5
IZ
2342 if (flags&HASWIDTH)
2343 *flagp |= HASWIDTH;
a0d0e21e 2344 *flagp |= flags&SPSTART;
c277df42 2345 lastbr = br;
830247a4
IZ
2346 while (*RExC_parse == '|') {
2347 if (!SIZE_ONLY && RExC_extralen) {
2348 ender = reganode(pRExC_state, LONGJMP,0);
2349 regtail(pRExC_state, NEXTOPER(NEXTOPER(lastbr)), ender); /* Append to the previous. */
c277df42
IZ
2350 }
2351 if (SIZE_ONLY)
830247a4
IZ
2352 RExC_extralen += 2; /* Account for LONGJMP. */
2353 nextchar(pRExC_state);
2354 br = regbranch(pRExC_state, &flags, 0);
fac92740 2355
a687059c 2356 if (br == NULL)
a0d0e21e 2357 return(NULL);
830247a4 2358 regtail(pRExC_state, lastbr, br); /* BRANCH -> BRANCH. */
c277df42 2359 lastbr = br;
821b33a5
IZ
2360 if (flags&HASWIDTH)
2361 *flagp |= HASWIDTH;
a687059c 2362 *flagp |= flags&SPSTART;
a0d0e21e
LW
2363 }
2364
c277df42
IZ
2365 if (have_branch || paren != ':') {
2366 /* Make a closing node, and hook it on the end. */
2367 switch (paren) {
2368 case ':':
830247a4 2369 ender = reg_node(pRExC_state, TAIL);
c277df42
IZ
2370 break;
2371 case 1:
830247a4 2372 ender = reganode(pRExC_state, CLOSE, parno);
fac92740
MJD
2373 Set_Node_Offset(ender,RExC_parse+1); /* MJD */
2374 Set_Node_Length(ender,1); /* MJD */
c277df42
IZ
2375 break;
2376 case '<':
c277df42
IZ
2377 case ',':
2378 case '=':
2379 case '!':
c277df42 2380 *flagp &= ~HASWIDTH;
821b33a5
IZ
2381 /* FALL THROUGH */
2382 case '>':
830247a4 2383 ender = reg_node(pRExC_state, SUCCEED);
c277df42
IZ
2384 break;
2385 case 0:
830247a4 2386 ender = reg_node(pRExC_state, END);
c277df42
IZ
2387 break;
2388 }
830247a4 2389 regtail(pRExC_state, lastbr, ender);
a0d0e21e 2390
c277df42
IZ
2391 if (have_branch) {
2392 /* Hook the tails of the branches to the closing node. */
2393 for (br = ret; br != NULL; br = regnext(br)) {
830247a4 2394 regoptail(pRExC_state, br, ender);
c277df42
IZ
2395 }
2396 }
a0d0e21e 2397 }
c277df42
IZ
2398
2399 {
2400 char *p;
2401 static char parens[] = "=!<,>";
2402
2403 if (paren && (p = strchr(parens, paren))) {
2404 int node = ((p - parens) % 2) ? UNLESSM : IFMATCH;
2405 int flag = (p - parens) > 1;
2406
2407 if (paren == '>')
2408 node = SUSPEND, flag = 0;
830247a4 2409 reginsert(pRExC_state, node,ret);
c277df42 2410 ret->flags = flag;
830247a4 2411 regtail(pRExC_state, ret, reg_node(pRExC_state, TAIL));
c277df42 2412 }
a0d0e21e
LW
2413 }
2414
2415 /* Check for proper termination. */
ce3e6498 2416 if (paren) {
830247a4
IZ
2417 RExC_flags16 = oregflags;
2418 if (RExC_parse >= RExC_end || *nextchar(pRExC_state) != ')') {
2419 RExC_parse = oregcomp_parse;
380a0633 2420 vFAIL("Unmatched (");
ce3e6498 2421 }
a0ed51b3 2422 }
830247a4
IZ
2423 else if (!paren && RExC_parse < RExC_end) {
2424 if (*RExC_parse == ')') {
2425 RExC_parse++;
380a0633 2426 vFAIL("Unmatched )");
a0ed51b3
LW
2427 }
2428 else
b45f050a 2429 FAIL("Junk on end of regexp"); /* "Can't happen". */
a0d0e21e
LW
2430 /* NOTREACHED */
2431 }
a687059c 2432
a0d0e21e 2433 return(ret);
a687059c
LW
2434}
2435
2436/*
2437 - regbranch - one alternative of an | operator
2438 *
2439 * Implements the concatenation operator.
2440 */
76e3520e 2441STATIC regnode *
830247a4 2442S_regbranch(pTHX_ RExC_state_t *pRExC_state, I32 *flagp, I32 first)
a687059c 2443{
c277df42
IZ
2444 register regnode *ret;
2445 register regnode *chain = NULL;
2446 register regnode *latest;
2447 I32 flags = 0, c = 0;
a0d0e21e 2448
b81d288d 2449 if (first)
c277df42
IZ
2450 ret = NULL;
2451 else {
b81d288d 2452 if (!SIZE_ONLY && RExC_extralen)
830247a4 2453 ret = reganode(pRExC_state, BRANCHJ,0);
fac92740 2454 else {
830247a4 2455 ret = reg_node(pRExC_state, BRANCH);
fac92740
MJD
2456 Set_Node_Length(ret, 1);
2457 }
c277df42
IZ
2458 }
2459
b81d288d 2460 if (!first && SIZE_ONLY)
830247a4 2461 RExC_extralen += 1; /* BRANCHJ */
b81d288d 2462
c277df42 2463 *flagp = WORST; /* Tentatively. */
a0d0e21e 2464
830247a4
IZ
2465 RExC_parse--;
2466 nextchar(pRExC_state);
2467 while (RExC_parse < RExC_end && *RExC_parse != '|' && *RExC_parse != ')') {
a0d0e21e 2468 flags &= ~TRYAGAIN;
830247a4 2469 latest = regpiece(pRExC_state, &flags);
a0d0e21e
LW
2470 if (latest == NULL) {
2471 if (flags & TRYAGAIN)
2472 continue;
2473 return(NULL);
a0ed51b3
LW
2474 }
2475 else if (ret == NULL)
c277df42 2476 ret = latest;
a0d0e21e 2477 *flagp |= flags&HASWIDTH;
c277df42 2478 if (chain == NULL) /* First piece. */
a0d0e21e
LW
2479 *flagp |= flags&SPSTART;
2480 else {
830247a4
IZ
2481 RExC_naughty++;
2482 regtail(pRExC_state, chain, latest);
a687059c 2483 }
a0d0e21e 2484 chain = latest;
c277df42
IZ
2485 c++;
2486 }
2487 if (chain == NULL) { /* Loop ran zero times. */
830247a4 2488 chain = reg_node(pRExC_state, NOTHING);
c277df42
IZ
2489 if (ret == NULL)
2490 ret = chain;
2491 }
2492 if (c == 1) {
2493 *flagp |= flags&SIMPLE;
a0d0e21e 2494 }
a687059c 2495
a0d0e21e 2496 return(ret);
a687059c
LW
2497}
2498
2499/*
2500 - regpiece - something followed by possible [*+?]
2501 *
2502 * Note that the branching code sequences used for ? and the general cases
2503 * of * and + are somewhat optimized: they use the same NOTHING node as
2504 * both the endmarker for their branch list and the body of the last branch.
2505 * It might seem that this node could be dispensed with entirely, but the
2506 * endmarker role is not redundant.
2507 */
76e3520e 2508STATIC regnode *
830247a4 2509S_regpiece(pTHX_ RExC_state_t *pRExC_state, I32 *flagp)
a687059c 2510{
c277df42 2511 register regnode *ret;
a0d0e21e
LW
2512 register char op;
2513 register char *next;
2514 I32 flags;
830247a4 2515 char *origparse = RExC_parse;
a0d0e21e
LW
2516 char *maxpos;
2517 I32 min;
c277df42 2518 I32 max = REG_INFTY;
fac92740 2519 char *parse_start;
a0d0e21e 2520
830247a4 2521 ret = regatom(pRExC_state, &flags);
a0d0e21e
LW
2522 if (ret == NULL) {
2523 if (flags & TRYAGAIN)
2524 *flagp |= TRYAGAIN;
2525 return(NULL);
2526 }
2527
830247a4 2528 op = *RExC_parse;
a0d0e21e 2529
830247a4 2530 if (op == '{' && regcurly(RExC_parse)) {
fac92740 2531 parse_start = RExC_parse; /* MJD */
830247a4 2532 next = RExC_parse + 1;
a0d0e21e
LW
2533 maxpos = Nullch;
2534 while (isDIGIT(*next) || *next == ',') {
2535 if (*next == ',') {
2536 if (maxpos)
2537 break;
2538 else
2539 maxpos = next;
a687059c 2540 }
a0d0e21e
LW
2541 next++;
2542 }
2543 if (*next == '}') { /* got one */
2544 if (!maxpos)
2545 maxpos = next;
830247a4
IZ
2546 RExC_parse++;
2547 min = atoi(RExC_parse);
a0d0e21e
LW
2548 if (*maxpos == ',')
2549 maxpos++;
2550 else
830247a4 2551 maxpos = RExC_parse;
a0d0e21e
LW
2552 max = atoi(maxpos);
2553 if (!max && *maxpos != '0')
c277df42
IZ
2554 max = REG_INFTY; /* meaning "infinity" */
2555 else if (max >= REG_INFTY)
8615cb43 2556 vFAIL2("Quantifier in {,} bigger than %d", REG_INFTY - 1);
830247a4
IZ
2557 RExC_parse = next;
2558 nextchar(pRExC_state);
a0d0e21e
LW
2559
2560 do_curly:
2561 if ((flags&SIMPLE)) {
830247a4
IZ
2562 RExC_naughty += 2 + RExC_naughty / 2;
2563 reginsert(pRExC_state, CURLY, ret);
fac92740
MJD
2564 Set_Node_Offset(ret, parse_start+1); /* MJD */
2565 Set_Node_Cur_Length(ret);
a0d0e21e
LW
2566 }
2567 else {
830247a4 2568 regnode *w = reg_node(pRExC_state, WHILEM);
2c2d71f5
JH
2569
2570 w->flags = 0;
830247a4
IZ
2571 regtail(pRExC_state, ret, w);
2572 if (!SIZE_ONLY && RExC_extralen) {
2573 reginsert(pRExC_state, LONGJMP,ret);
2574 reginsert(pRExC_state, NOTHING,ret);
c277df42
IZ
2575 NEXT_OFF(ret) = 3; /* Go over LONGJMP. */
2576 }
830247a4 2577 reginsert(pRExC_state, CURLYX,ret);
fac92740
MJD
2578 /* MJD hk */
2579 Set_Node_Offset(ret, parse_start+1);
2580 Set_Node_Length(ret,
2581 op == '{' ? (RExC_parse - parse_start) : 1);
2582
830247a4 2583 if (!SIZE_ONLY && RExC_extralen)
c277df42 2584 NEXT_OFF(ret) = 3; /* Go over NOTHING to LONGJMP. */
830247a4 2585 regtail(pRExC_state, ret, reg_node(pRExC_state, NOTHING));
c277df42 2586 if (SIZE_ONLY)
830247a4
IZ
2587 RExC_whilem_seen++, RExC_extralen += 3;
2588 RExC_naughty += 4 + RExC_naughty; /* compound interest */
a0d0e21e 2589 }
c277df42 2590 ret->flags = 0;
a0d0e21e
LW
2591
2592 if (min > 0)
821b33a5
IZ
2593 *flagp = WORST;
2594 if (max > 0)
2595 *flagp |= HASWIDTH;
a0d0e21e 2596 if (max && max < min)
8615cb43 2597 vFAIL("Can't do {n,m} with n > m");
c277df42
IZ
2598 if (!SIZE_ONLY) {
2599 ARG1_SET(ret, min);
2600 ARG2_SET(ret, max);
a687059c 2601 }
a687059c 2602
a0d0e21e 2603 goto nest_check;
a687059c 2604 }
a0d0e21e 2605 }
a687059c 2606
a0d0e21e
LW
2607 if (!ISMULT1(op)) {
2608 *flagp = flags;
a687059c 2609 return(ret);
a0d0e21e 2610 }
bb20fd44 2611
c277df42 2612#if 0 /* Now runtime fix should be reliable. */
b45f050a
JF
2613
2614 /* if this is reinstated, don't forget to put this back into perldiag:
2615
2616 =item Regexp *+ operand could be empty at {#} in regex m/%s/
2617
2618 (F) The part of the regexp subject to either the * or + quantifier
2619 could match an empty string. The {#} shows in the regular
2620 expression about where the problem was discovered.
2621
2622 */
2623
bb20fd44 2624 if (!(flags&HASWIDTH) && op != '?')
b45f050a 2625 vFAIL("Regexp *+ operand could be empty");
b81d288d 2626#endif
bb20fd44 2627
fac92740 2628 parse_start = RExC_parse;
830247a4 2629 nextchar(pRExC_state);
a0d0e21e 2630
821b33a5 2631 *flagp = (op != '+') ? (WORST|SPSTART|HASWIDTH) : (WORST|HASWIDTH);
a0d0e21e
LW
2632
2633 if (op == '*' && (flags&SIMPLE)) {
830247a4 2634 reginsert(pRExC_state, STAR, ret);
c277df42 2635 ret->flags = 0;
830247a4 2636 RExC_naughty += 4;
a0d0e21e
LW
2637 }
2638 else if (op == '*') {
2639 min = 0;
2640 goto do_curly;
a0ed51b3
LW
2641 }
2642 else if (op == '+' && (flags&SIMPLE)) {
830247a4 2643 reginsert(pRExC_state, PLUS, ret);
c277df42 2644 ret->flags = 0;
830247a4 2645 RExC_naughty += 3;
a0d0e21e
LW
2646 }
2647 else if (op == '+') {
2648 min = 1;
2649 goto do_curly;
a0ed51b3
LW
2650 }
2651 else if (op == '?') {
a0d0e21e
LW
2652 min = 0; max = 1;
2653 goto do_curly;
2654 }
2655 nest_check:
e476b1b5 2656 if (ckWARN(WARN_REGEXP) && !SIZE_ONLY && !(flags&HASWIDTH) && max > REG_INFTY/3) {
830247a4 2657 vWARN3(RExC_parse,
b45f050a 2658 "%.*s matches null string many times",
830247a4 2659 RExC_parse - origparse,
b45f050a 2660 origparse);
a0d0e21e
LW
2661 }
2662
830247a4
IZ
2663 if (*RExC_parse == '?') {
2664 nextchar(pRExC_state);
2665 reginsert(pRExC_state, MINMOD, ret);
2666 regtail(pRExC_state, ret, ret + NODE_STEP_REGNODE);
a0d0e21e 2667 }
830247a4
IZ
2668 if (ISMULT2(RExC_parse)) {
2669 RExC_parse++;
b45f050a
JF
2670 vFAIL("Nested quantifiers");
2671 }
a0d0e21e
LW
2672
2673 return(ret);
a687059c
LW
2674}
2675
2676/*
2677 - regatom - the lowest level
2678 *
2679 * Optimization: gobbles an entire sequence of ordinary characters so that
2680 * it can turn them into a single node, which is smaller to store and
2681 * faster to run. Backslashed characters are exceptions, each becoming a
2682 * separate node; the code is simpler that way and it's not worth fixing.
2683 *
b45f050a 2684 * [Yes, it is worth fixing, some scripts can run twice the speed.] */
76e3520e 2685STATIC regnode *
830247a4 2686S_regatom(pTHX_ RExC_state_t *pRExC_state, I32 *flagp)
a687059c 2687{
c277df42 2688 register regnode *ret = 0;
a0d0e21e 2689 I32 flags;
f06dbbb7 2690 char *parse_start = 0;
a0d0e21e
LW
2691
2692 *flagp = WORST; /* Tentatively. */
2693
2694tryagain:
830247a4 2695 switch (*RExC_parse) {
a0d0e21e 2696 case '^':
830247a4
IZ
2697 RExC_seen_zerolen++;
2698 nextchar(pRExC_state);
2699 if (RExC_flags16 & PMf_MULTILINE)
2700 ret = reg_node(pRExC_state, MBOL);
2701 else if (RExC_flags16 & PMf_SINGLELINE)
2702 ret = reg_node(pRExC_state, SBOL);
a0d0e21e 2703 else
830247a4 2704 ret = reg_node(pRExC_state, BOL);
fac92740 2705 Set_Node_Length(ret, 1); /* MJD */
a0d0e21e
LW
2706 break;
2707 case '$':
830247a4 2708 nextchar(pRExC_state);
b81d288d 2709 if (*RExC_parse)
830247a4
IZ
2710 RExC_seen_zerolen++;
2711 if (RExC_flags16 & PMf_MULTILINE)
2712 ret = reg_node(pRExC_state, MEOL);
2713 else if (RExC_flags16 & PMf_SINGLELINE)
2714 ret = reg_node(pRExC_state, SEOL);
a0d0e21e 2715 else
830247a4 2716 ret = reg_node(pRExC_state, EOL);
fac92740 2717 Set_Node_Length(ret, 1); /* MJD */
a0d0e21e
LW
2718 break;
2719 case '.':
830247a4 2720 nextchar(pRExC_state);
cce850e4 2721 if (RExC_flags16 & PMf_SINGLELINE)
ffc61ed2
JH
2722 ret = reg_node(pRExC_state, SANY);
2723 else
2724 ret = reg_node(pRExC_state, REG_ANY);
2725 *flagp |= HASWIDTH|SIMPLE;
830247a4 2726 RExC_naughty++;
fac92740 2727 Set_Node_Length(ret, 1); /* MJD */
a0d0e21e
LW
2728 break;
2729 case '[':
b45f050a 2730 {
830247a4 2731 char *oregcomp_parse = ++RExC_parse;
ffc61ed2 2732 ret = regclass(pRExC_state);
830247a4
IZ
2733 if (*RExC_parse != ']') {
2734 RExC_parse = oregcomp_parse;
b45f050a
JF
2735 vFAIL("Unmatched [");
2736 }
830247a4 2737 nextchar(pRExC_state);
a0d0e21e 2738 *flagp |= HASWIDTH|SIMPLE;
fac92740 2739 Set_Node_Length(ret, RExC_parse - oregcomp_parse + 1); /* MJD */
a0d0e21e 2740 break;
b45f050a 2741 }
a0d0e21e 2742 case '(':
830247a4
IZ
2743 nextchar(pRExC_state);
2744 ret = reg(pRExC_state, 1, &flags);
a0d0e21e 2745 if (ret == NULL) {
bf93d4cc 2746 if (flags & TRYAGAIN) {
830247a4 2747 if (RExC_parse == RExC_end) {
bf93d4cc
GS
2748 /* Make parent create an empty node if needed. */
2749 *flagp |= TRYAGAIN;
2750 return(NULL);
2751 }
a0d0e21e 2752 goto tryagain;
bf93d4cc 2753 }
a0d0e21e
LW
2754 return(NULL);
2755 }
c277df42 2756 *flagp |= flags&(HASWIDTH|SPSTART|SIMPLE);
a0d0e21e
LW
2757 break;
2758 case '|':
2759 case ')':
2760 if (flags & TRYAGAIN) {
2761 *flagp |= TRYAGAIN;
2762 return NULL;
2763 }
b45f050a 2764 vFAIL("Internal urp");
a0d0e21e
LW
2765 /* Supposed to be caught earlier. */
2766 break;
85afd4ae 2767 case '{':
830247a4
IZ
2768 if (!regcurly(RExC_parse)) {
2769 RExC_parse++;
85afd4ae
CS
2770 goto defchar;
2771 }
2772 /* FALL THROUGH */
a0d0e21e
LW
2773 case '?':
2774 case '+':
2775 case '*':
830247a4 2776 RExC_parse++;
b45f050a 2777 vFAIL("Quantifier follows nothing");
a0d0e21e
LW
2778 break;
2779 case '\\':
830247a4 2780 switch (*++RExC_parse) {
a0d0e21e 2781 case 'A':
830247a4
IZ
2782 RExC_seen_zerolen++;
2783 ret = reg_node(pRExC_state, SBOL);
a0d0e21e 2784 *flagp |= SIMPLE;
830247a4 2785 nextchar(pRExC_state);
fac92740 2786 Set_Node_Length(ret, 2); /* MJD */
a0d0e21e
LW
2787 break;
2788 case 'G':
830247a4
IZ
2789 ret = reg_node(pRExC_state, GPOS);
2790 RExC_seen |= REG_SEEN_GPOS;
a0d0e21e 2791 *flagp |= SIMPLE;
830247a4 2792 nextchar(pRExC_state);
fac92740 2793 Set_Node_Length(ret, 2); /* MJD */
a0d0e21e
LW
2794 break;
2795 case 'Z':
830247a4 2796 ret = reg_node(pRExC_state, SEOL);
a0d0e21e 2797 *flagp |= SIMPLE;
a1917ab9 2798 RExC_seen_zerolen++; /* Do not optimize RE away */
830247a4 2799 nextchar(pRExC_state);
a0d0e21e 2800 break;
b85d18e9 2801 case 'z':
830247a4 2802 ret = reg_node(pRExC_state, EOS);
b85d18e9 2803 *flagp |= SIMPLE;
830247a4
IZ
2804 RExC_seen_zerolen++; /* Do not optimize RE away */
2805 nextchar(pRExC_state);
fac92740 2806 Set_Node_Length(ret, 2); /* MJD */
b85d18e9 2807 break;
4a2d328f 2808 case 'C':
f33976b4
DB
2809 ret = reg_node(pRExC_state, CANY);
2810 RExC_seen |= REG_SEEN_CANY;
a0ed51b3 2811 *flagp |= HASWIDTH|SIMPLE;
830247a4 2812 nextchar(pRExC_state);
fac92740 2813 Set_Node_Length(ret, 2); /* MJD */
a0ed51b3
LW
2814 break;
2815 case 'X':
830247a4 2816 ret = reg_node(pRExC_state, CLUMP);
a0ed51b3 2817 *flagp |= HASWIDTH;
830247a4 2818 nextchar(pRExC_state);
fac92740 2819 Set_Node_Length(ret, 2); /* MJD */
a0ed51b3 2820 break;
a0d0e21e 2821 case 'w':
ffc61ed2 2822 ret = reg_node(pRExC_state, LOC ? ALNUML : ALNUM);
a0d0e21e 2823 *flagp |= HASWIDTH|SIMPLE;
830247a4 2824 nextchar(pRExC_state);
fac92740 2825 Set_Node_Length(ret, 2); /* MJD */
a0d0e21e
LW
2826 break;
2827 case 'W':
ffc61ed2 2828 ret = reg_node(pRExC_state, LOC ? NALNUML : NALNUM);
a0d0e21e 2829 *flagp |= HASWIDTH|SIMPLE;
830247a4 2830 nextchar(pRExC_state);
fac92740 2831 Set_Node_Length(ret, 2); /* MJD */
a0d0e21e
LW
2832 break;
2833 case 'b':
830247a4
IZ
2834 RExC_seen_zerolen++;
2835 RExC_seen |= REG_SEEN_LOOKBEHIND;
ffc61ed2 2836 ret = reg_node(pRExC_state, LOC ? BOUNDL : BOUND);
a0d0e21e 2837 *flagp |= SIMPLE;
830247a4 2838 nextchar(pRExC_state);
fac92740 2839 Set_Node_Length(ret, 2); /* MJD */
a0d0e21e
LW
2840 break;
2841 case 'B':
830247a4
IZ
2842 RExC_seen_zerolen++;
2843 RExC_seen |= REG_SEEN_LOOKBEHIND;
ffc61ed2 2844 ret = reg_node(pRExC_state, LOC ? NBOUNDL : NBOUND);
a0d0e21e 2845 *flagp |= SIMPLE;
830247a4 2846 nextchar(pRExC_state);
fac92740 2847 Set_Node_Length(ret, 2); /* MJD */
a0d0e21e
LW
2848 break;
2849 case 's':
ffc61ed2 2850 ret = reg_node(pRExC_state, LOC ? SPACEL : SPACE);
a0d0e21e 2851 *flagp |= HASWIDTH|SIMPLE;
830247a4 2852 nextchar(pRExC_state);
fac92740 2853 Set_Node_Length(ret, 2); /* MJD */
a0d0e21e
LW
2854 break;
2855 case 'S':
ffc61ed2 2856 ret = reg_node(pRExC_state, LOC ? NSPACEL : NSPACE);
a0d0e21e 2857 *flagp |= HASWIDTH|SIMPLE;
830247a4 2858 nextchar(pRExC_state);
fac92740 2859 Set_Node_Length(ret, 2); /* MJD */
a0d0e21e
LW
2860 break;
2861 case 'd':
ffc61ed2 2862 ret = reg_node(pRExC_state, DIGIT);
a0d0e21e 2863 *flagp |= HASWIDTH|SIMPLE;
830247a4 2864 nextchar(pRExC_state);
fac92740 2865 Set_Node_Length(ret, 2); /* MJD */
a0d0e21e
LW
2866 break;
2867 case 'D':
ffc61ed2 2868 ret = reg_node(pRExC_state, NDIGIT);
a0d0e21e 2869 *flagp |= HASWIDTH|SIMPLE;
830247a4 2870 nextchar(pRExC_state);
fac92740 2871 Set_Node_Length(ret, 2); /* MJD */
a0d0e21e 2872 break;
a14b48bc
LW
2873 case 'p':
2874 case 'P':
3568d838 2875 {
830247a4 2876 char* oldregxend = RExC_end;
fac92740 2877 char* parse_start = RExC_parse;
a14b48bc 2878
830247a4 2879 if (RExC_parse[1] == '{') {
3568d838 2880 /* a lovely hack--pretend we saw [\pX] instead */
830247a4
IZ
2881 RExC_end = strchr(RExC_parse, '}');
2882 if (!RExC_end) {
0da60cf5 2883 U8 c = (U8)*RExC_parse;
830247a4
IZ
2884 RExC_parse += 2;
2885 RExC_end = oldregxend;
0da60cf5 2886 vFAIL2("Missing right brace on \\%c{}", c);
b45f050a 2887 }
830247a4 2888 RExC_end++;
a14b48bc
LW
2889 }
2890 else
830247a4
IZ
2891 RExC_end = RExC_parse + 2;
2892 RExC_parse--;
a14b48bc 2893
ffc61ed2 2894 ret = regclass(pRExC_state);
a14b48bc 2895
830247a4
IZ
2896 RExC_end = oldregxend;
2897 RExC_parse--;
fac92740 2898 Set_Node_Cur_Length(ret); /* MJD */
830247a4 2899 nextchar(pRExC_state);
a14b48bc
LW
2900 *flagp |= HASWIDTH|SIMPLE;
2901 }
2902 break;
a0d0e21e
LW
2903 case 'n':
2904 case 'r':
2905 case 't':
2906 case 'f':
2907 case 'e':
2908 case 'a':
2909 case 'x':
2910 case 'c':
2911 case '0':
2912 goto defchar;
2913 case '1': case '2': case '3': case '4':
2914 case '5': case '6': case '7': case '8': case '9':
2915 {
830247a4 2916 I32 num = atoi(RExC_parse);
a0d0e21e 2917
830247a4 2918 if (num > 9 && num >= RExC_npar)
a0d0e21e
LW
2919 goto defchar;
2920 else {
fac92740 2921 char * parse_start = RExC_parse - 1; /* MJD */
830247a4
IZ
2922 while (isDIGIT(*RExC_parse))
2923 RExC_parse++;
b45f050a 2924
830247a4 2925 if (!SIZE_ONLY && num > RExC_rx->nparens)
9baa0206 2926 vFAIL("Reference to nonexistent group");
830247a4
IZ
2927 RExC_sawback = 1;
2928 ret = reganode(pRExC_state, FOLD
a0ed51b3 2929 ? (LOC ? REFFL : REFF)
c8756f30 2930 : REF, num);
a0d0e21e 2931 *flagp |= HASWIDTH;
fac92740
MJD
2932
2933 /* override incorrect value set in reganode MJD */
2934 Set_Node_Offset(ret, parse_start+1);
2935 Set_Node_Cur_Length(ret); /* MJD */
830247a4
IZ
2936 RExC_parse--;
2937 nextchar(pRExC_state);
a0d0e21e
LW
2938 }
2939 }
2940 break;
2941 case '\0':
830247a4 2942 if (RExC_parse >= RExC_end)
b45f050a 2943 FAIL("Trailing \\");
a0d0e21e
LW
2944 /* FALL THROUGH */
2945 default:
c9f97d15
IZ
2946 /* Do not generate `unrecognized' warnings here, we fall
2947 back into the quick-grab loop below */
a0d0e21e
LW
2948 goto defchar;
2949 }
2950 break;
4633a7c4
LW
2951
2952 case '#':
830247a4
IZ
2953 if (RExC_flags16 & PMf_EXTENDED) {
2954 while (RExC_parse < RExC_end && *RExC_parse != '\n') RExC_parse++;
2955 if (RExC_parse < RExC_end)
4633a7c4
LW
2956 goto tryagain;
2957 }
2958 /* FALL THROUGH */
2959
a0d0e21e 2960 default: {
ba210ebe 2961 register STRLEN len;
a0ed51b3 2962 register UV ender;
a0d0e21e 2963 register char *p;
c277df42 2964 char *oldp, *s;
ba210ebe 2965 STRLEN numlen;
a2a2844f
JH
2966 STRLEN ulen;
2967 U8 tmpbuf[UTF8_MAXLEN*2+1];
f06dbbb7
JH
2968
2969 parse_start = RExC_parse - 1;
a0d0e21e 2970
830247a4 2971 RExC_parse++;
a0d0e21e
LW
2972
2973 defchar:
830247a4 2974 ret = reg_node(pRExC_state, FOLD
a0ed51b3 2975 ? (LOC ? EXACTFL : EXACTF)
bbce6d69 2976 : EXACT);
cd439c50 2977 s = STRING(ret);
830247a4
IZ
2978 for (len = 0, p = RExC_parse - 1;
2979 len < 127 && p < RExC_end;
a0d0e21e
LW
2980 len++)
2981 {
2982 oldp = p;
5b5a24f7 2983
830247a4
IZ
2984 if (RExC_flags16 & PMf_EXTENDED)
2985 p = regwhite(p, RExC_end);
a0d0e21e
LW
2986 switch (*p) {
2987 case '^':
2988 case '$':
2989 case '.':
2990 case '[':
2991 case '(':
2992 case ')':
2993 case '|':
2994 goto loopdone;
2995 case '\\':
2996 switch (*++p) {
2997 case 'A':
2998 case 'G':
2999 case 'Z':
b85d18e9 3000 case 'z':
a0d0e21e
LW
3001 case 'w':
3002 case 'W':
3003 case 'b':
3004 case 'B':
3005 case 's':
3006 case 'S':
3007 case 'd':
3008 case 'D':
a14b48bc
LW
3009 case 'p':
3010 case 'P':
a0d0e21e
LW
3011 --p;
3012 goto loopdone;
3013 case 'n':
3014 ender = '\n';
3015 p++;
a687059c 3016 break;
a0d0e21e
LW
3017 case 'r':
3018 ender = '\r';
3019 p++;
a687059c 3020 break;
a0d0e21e
LW
3021 case 't':
3022 ender = '\t';
3023 p++;
a687059c 3024 break;
a0d0e21e
LW
3025 case 'f':
3026 ender = '\f';
3027 p++;
a687059c 3028 break;
a0d0e21e 3029 case 'e':
c7f1f016 3030 ender = ASCII_TO_NATIVE('\033');
a0d0e21e 3031 p++;
a687059c 3032 break;
a0d0e21e 3033 case 'a':
c7f1f016 3034 ender = ASCII_TO_NATIVE('\007');
a0d0e21e 3035 p++;
a687059c 3036 break;
a0d0e21e 3037 case 'x':
a0ed51b3
LW
3038 if (*++p == '{') {
3039 char* e = strchr(p, '}');
b81d288d 3040
b45f050a 3041 if (!e) {
830247a4 3042 RExC_parse = p + 1;
b45f050a
JF
3043 vFAIL("Missing right brace on \\x{}");
3044 }
de5f0749 3045 else {
a4c04bdc
NC
3046 I32 flags = PERL_SCAN_ALLOW_UNDERSCORES
3047 | PERL_SCAN_DISALLOW_PREFIX;
53305cf1
NC
3048 numlen = e - p - 1;
3049 ender = grok_hex(p + 1, &numlen, &flags, NULL);
aaa80028
JH
3050 if (ender > 0xff)
3051 RExC_utf8 = 1;
b21ed0a9
GS
3052 /* numlen is generous */
3053 if (numlen + len >= 127) {
a0ed51b3
LW
3054 p--;
3055 goto loopdone;
3056 }
3057 p = e + 1;
3058 }
a0ed51b3
LW
3059 }
3060 else {
a4c04bdc 3061 I32 flags = PERL_SCAN_DISALLOW_PREFIX;
53305cf1
NC
3062 numlen = 2;
3063 ender = grok_hex(p, &numlen, &flags, NULL);
a0ed51b3
LW
3064 p += numlen;
3065 }
a687059c 3066 break;
a0d0e21e
LW
3067 case 'c':
3068 p++;
bbce6d69 3069 ender = UCHARAT(p++);
3070 ender = toCTRL(ender);
a687059c 3071 break;
a0d0e21e
LW
3072 case '0': case '1': case '2': case '3':case '4':
3073 case '5': case '6': case '7': case '8':case '9':
3074 if (*p == '0' ||
830247a4 3075 (isDIGIT(p[1]) && atoi(p) >= RExC_npar) ) {
53305cf1
NC
3076 I32 flags = 0;
3077 numlen = 3;
3078 ender = grok_oct(p, &numlen, &flags, NULL);
a0d0e21e
LW
3079 p += numlen;
3080 }
3081 else {
3082 --p;
3083 goto loopdone;
a687059c
LW
3084 }
3085 break;
a0d0e21e 3086 case '\0':
830247a4 3087 if (p >= RExC_end)
b45f050a 3088 FAIL("Trailing \\");
a687059c 3089 /* FALL THROUGH */
a0d0e21e 3090 default:
e476b1b5 3091 if (!SIZE_ONLY && ckWARN(WARN_REGEXP) && isALPHA(*p))
4193bef7 3092 vWARN2(p + 1, "Unrecognized escape \\%c passed through", UCHARAT(p));
a0ed51b3 3093 goto normal_default;
a0d0e21e
LW
3094 }
3095 break;
a687059c 3096 default:
a0ed51b3 3097 normal_default:
fd400ab9 3098 if (UTF8_IS_START(*p) && UTF) {
5e12f4fb 3099 ender = utf8n_to_uvchr((U8*)p, RExC_end - p,
ba210ebe 3100 &numlen, 0);
a0ed51b3
LW
3101 p += numlen;
3102 }
3103 else
3104 ender = *p++;
a0d0e21e 3105 break;
a687059c 3106 }
830247a4
IZ
3107 if (RExC_flags16 & PMf_EXTENDED)
3108 p = regwhite(p, RExC_end);
a0ed51b3 3109 if (UTF && FOLD) {
a2a2844f
JH
3110 toLOWER_uni(ender, tmpbuf, &ulen);
3111 ender = utf8_to_uvchr(tmpbuf, 0);
a0ed51b3 3112 }
a0d0e21e
LW
3113 if (ISMULT2(p)) { /* Back off on ?+*. */
3114 if (len)
3115 p = oldp;
2b9d42f0 3116 else if (!UNI_IS_INVARIANT(NATIVE_TO_UNI(ender)) && UTF) {
830247a4 3117 reguni(pRExC_state, ender, s, &numlen);
a0ed51b3
LW
3118 s += numlen;
3119 len += numlen;
3120 }
a0d0e21e
LW
3121 else {
3122 len++;
cd439c50 3123 REGC(ender, s++);
a0d0e21e
LW
3124 }
3125 break;
a687059c 3126 }
2b9d42f0 3127 if (!UNI_IS_INVARIANT(NATIVE_TO_UNI(ender)) && UTF) {
830247a4 3128 reguni(pRExC_state, ender, s, &numlen);
a0ed51b3
LW
3129 s += numlen;
3130 len += numlen - 1;
3131 }
3132 else
cd439c50 3133 REGC(ender, s++);
a0d0e21e
LW
3134 }
3135 loopdone:
830247a4 3136 RExC_parse = p - 1;
fac92740 3137 Set_Node_Cur_Length(ret); /* MJD */
830247a4 3138 nextchar(pRExC_state);
793db0cb
JH
3139 {
3140 /* len is STRLEN which is unsigned, need to copy to signed */
3141 IV iv = len;
3142 if (iv < 0)
3143 vFAIL("Internal disaster");
3144 }
a0d0e21e
LW
3145 if (len > 0)
3146 *flagp |= HASWIDTH;
3147 if (len == 1)
3148 *flagp |= SIMPLE;
c277df42 3149 if (!SIZE_ONLY)
cd439c50
IZ
3150 STR_LEN(ret) = len;
3151 if (SIZE_ONLY)
830247a4 3152 RExC_size += STR_SZ(len);
cd439c50 3153 else
830247a4 3154 RExC_emit += STR_SZ(len);
a687059c 3155 }
a0d0e21e
LW
3156 break;
3157 }
a687059c 3158
a72c7584
JH
3159 if (PL_encoding && PL_regkind[(U8)OP(ret)] == EXACT && !RExC_utf8) {
3160 STRLEN oldlen = STR_LEN(ret);
3161 SV *sv = sv_2mortal(newSVpvn(STRING(ret), oldlen));
3162 char *s = Perl_sv_recode_to_utf8(aTHX_ sv, PL_encoding);
3163 STRLEN newlen = SvCUR(sv);
3164 if (!SIZE_ONLY) {
3165 DEBUG_r(PerlIO_printf(Perl_debug_log, "recode %*s to %*s\n",
3166 oldlen, STRING(ret), newlen, s));
3167 Copy(s, STRING(ret), newlen, char);
3168 STR_LEN(ret) += newlen - oldlen;
3169 RExC_emit += STR_SZ(newlen) - STR_SZ(oldlen);
3170 } else
3171 RExC_size += STR_SZ(newlen) - STR_SZ(oldlen);
5ff6fc6d 3172 RExC_utf8 = 1;
a72c7584
JH
3173 }
3174
a0d0e21e 3175 return(ret);
a687059c
LW
3176}
3177
873ef191 3178STATIC char *
cea2e8a9 3179S_regwhite(pTHX_ char *p, char *e)
5b5a24f7
CS
3180{
3181 while (p < e) {
3182 if (isSPACE(*p))
3183 ++p;
3184 else if (*p == '#') {
3185 do {
3186 p++;
3187 } while (p < e && *p != '\n');
3188 }
3189 else
3190 break;
3191 }
3192 return p;
3193}
3194
b8c5462f
JH
3195/* Parse POSIX character classes: [[:foo:]], [[=foo=]], [[.foo.]].
3196 Character classes ([:foo:]) can also be negated ([:^foo:]).
3197 Returns a named class id (ANYOF_XXX) if successful, -1 otherwise.
3198 Equivalence classes ([=foo=]) and composites ([.foo.]) are parsed,
beeb77fc 3199 but trigger failures because they are currently unimplemented. */
9a86a77b
JH
3200
3201#define POSIXCC_DONE(c) ((c) == ':')
3202#define POSIXCC_NOTYET(c) ((c) == '=' || (c) == '.')
3203#define POSIXCC(c) (POSIXCC_DONE(c) || POSIXCC_NOTYET(c))
3204
b8c5462f 3205STATIC I32
830247a4 3206S_regpposixcc(pTHX_ RExC_state_t *pRExC_state, I32 value)
620e46c5
JH
3207{
3208 char *posixcc = 0;
936ed897 3209 I32 namedclass = OOB_NAMEDCLASS;
620e46c5 3210
830247a4 3211 if (value == '[' && RExC_parse + 1 < RExC_end &&
620e46c5 3212 /* I smell either [: or [= or [. -- POSIX has been here, right? */
9a86a77b
JH
3213 POSIXCC(UCHARAT(RExC_parse))) {
3214 char c = UCHARAT(RExC_parse);
830247a4 3215 char* s = RExC_parse++;
b81d288d 3216
9a86a77b 3217 while (RExC_parse < RExC_end && UCHARAT(RExC_parse) != c)
830247a4
IZ
3218 RExC_parse++;
3219 if (RExC_parse == RExC_end)
620e46c5 3220 /* Grandfather lone [:, [=, [. */
830247a4 3221 RExC_parse = s;
620e46c5 3222 else {
830247a4 3223 char* t = RExC_parse++; /* skip over the c */
b8c5462f 3224
9a86a77b 3225 if (UCHARAT(RExC_parse) == ']') {
830247a4 3226 RExC_parse++; /* skip over the ending ] */
b8c5462f
JH
3227 posixcc = s + 1;
3228 if (*s == ':') {
3229 I32 complement = *posixcc == '^' ? *posixcc++ : 0;
3230 I32 skip = 5; /* the most common skip */
3231
3232 switch (*posixcc) {
3233 case 'a':
3234 if (strnEQ(posixcc, "alnum", 5))
3235 namedclass =
3236 complement ? ANYOF_NALNUMC : ANYOF_ALNUMC;
3237 else if (strnEQ(posixcc, "alpha", 5))
3238 namedclass =
3239 complement ? ANYOF_NALPHA : ANYOF_ALPHA;
3240 else if (strnEQ(posixcc, "ascii", 5))
3241 namedclass =
3242 complement ? ANYOF_NASCII : ANYOF_ASCII;
3243 break;
aaa51d5e
JF
3244 case 'b':
3245 if (strnEQ(posixcc, "blank", 5))
3246 namedclass =
3247 complement ? ANYOF_NBLANK : ANYOF_BLANK;
3248 break;
b8c5462f
JH
3249 case 'c':
3250 if (strnEQ(posixcc, "cntrl", 5))
3251 namedclass =
3252 complement ? ANYOF_NCNTRL : ANYOF_CNTRL;
3253 break;
3254 case 'd':
3255 if (strnEQ(posixcc, "digit", 5))
3256 namedclass =
3257 complement ? ANYOF_NDIGIT : ANYOF_DIGIT;
3258 break;
3259 case 'g':
3260 if (strnEQ(posixcc, "graph", 5))
3261 namedclass =
3262 complement ? ANYOF_NGRAPH : ANYOF_GRAPH;
3263 break;
3264 case 'l':
3265 if (strnEQ(posixcc, "lower", 5))
3266 namedclass =
3267 complement ? ANYOF_NLOWER : ANYOF_LOWER;
3268 break;
3269 case 'p':
3270 if (strnEQ(posixcc, "print", 5))
3271 namedclass =
3272 complement ? ANYOF_NPRINT : ANYOF_PRINT;
3273 else if (strnEQ(posixcc, "punct", 5))
3274 namedclass =
3275 complement ? ANYOF_NPUNCT : ANYOF_PUNCT;
3276 break;
3277 case 's':
3278 if (strnEQ(posixcc, "space", 5))
3279 namedclass =
aaa51d5e 3280 complement ? ANYOF_NPSXSPC : ANYOF_PSXSPC;
cc4319de 3281 break;
b8c5462f
JH
3282 case 'u':
3283 if (strnEQ(posixcc, "upper", 5))
3284 namedclass =
3285 complement ? ANYOF_NUPPER : ANYOF_UPPER;
3286 break;
3287 case 'w': /* this is not POSIX, this is the Perl \w */
3288 if (strnEQ(posixcc, "word", 4)) {
3289 namedclass =
3290 complement ? ANYOF_NALNUM : ANYOF_ALNUM;
3291 skip = 4;
3292 }
3293 break;
3294 case 'x':
3295 if (strnEQ(posixcc, "xdigit", 6)) {
3296 namedclass =
3297 complement ? ANYOF_NXDIGIT : ANYOF_XDIGIT;
3298 skip = 6;
3299 }
3300 break;
3301 }
ac561586
JH
3302 if (namedclass == OOB_NAMEDCLASS ||
3303 posixcc[skip] != ':' ||
3304 posixcc[skip+1] != ']')
b45f050a
JF
3305 {
3306 Simple_vFAIL3("POSIX class [:%.*s:] unknown",
3307 t - s - 1, s + 1);
3308 }
3309 } else if (!SIZE_ONLY) {
b8c5462f 3310 /* [[=foo=]] and [[.foo.]] are still future. */
b45f050a 3311
830247a4 3312 /* adjust RExC_parse so the warning shows after
b45f050a 3313 the class closes */
9a86a77b 3314 while (UCHARAT(RExC_parse) && UCHARAT(RExC_parse) != ']')
830247a4 3315 RExC_parse++;
b45f050a
JF
3316 Simple_vFAIL3("POSIX syntax [%c %c] is reserved for future extensions", c, c);
3317 }
b8c5462f
JH
3318 } else {
3319 /* Maternal grandfather:
3320 * "[:" ending in ":" but not in ":]" */
830247a4 3321 RExC_parse = s;
767d463e 3322 }
620e46c5
JH
3323 }
3324 }
3325
b8c5462f
JH
3326 return namedclass;
3327}
3328
3329STATIC void
830247a4 3330S_checkposixcc(pTHX_ RExC_state_t *pRExC_state)
b8c5462f 3331{
e476b1b5 3332 if (!SIZE_ONLY && ckWARN(WARN_REGEXP) &&
9a86a77b 3333 POSIXCC(UCHARAT(RExC_parse))) {
830247a4 3334 char *s = RExC_parse;
93733859 3335 char c = *s++;
b8c5462f
JH
3336
3337 while(*s && isALNUM(*s))
3338 s++;
3339 if (*s && c == *s && s[1] == ']') {
b45f050a
JF
3340 vWARN3(s+2, "POSIX syntax [%c %c] belongs inside character classes", c, c);
3341
3342 /* [[=foo=]] and [[.foo.]] are still future. */
9a86a77b 3343 if (POSIXCC_NOTYET(c)) {
830247a4 3344 /* adjust RExC_parse so the error shows after
b45f050a 3345 the class closes */
9a86a77b 3346 while (UCHARAT(RExC_parse) && UCHARAT(RExC_parse++) != ']')
b45f050a
JF
3347 ;
3348 Simple_vFAIL3("POSIX syntax [%c %c] is reserved for future extensions", c, c);
3349 }
b8c5462f
JH
3350 }
3351 }
620e46c5
JH
3352}
3353
76e3520e 3354STATIC regnode *
830247a4 3355S_regclass(pTHX_ RExC_state_t *pRExC_state)
a687059c 3356{
ffc61ed2 3357 register UV value;
9a86a77b 3358 register UV nextvalue;
3568d838 3359 register IV prevvalue = OOB_UNICODE;
ffc61ed2 3360 register IV range = 0;
c277df42 3361 register regnode *ret;
ba210ebe 3362 STRLEN numlen;
ffc61ed2 3363 IV namedclass;
9c5ffd7c 3364 char *rangebegin = 0;
936ed897 3365 bool need_class = 0;
9c5ffd7c 3366 SV *listsv = Nullsv;
ffc61ed2
JH
3367 register char *e;
3368 UV n;
3568d838 3369 bool optimize_invert = TRUE;
ffc61ed2
JH
3370
3371 ret = reganode(pRExC_state, ANYOF, 0);
3372
3373 if (!SIZE_ONLY)
3374 ANYOF_FLAGS(ret) = 0;
3375
9a86a77b 3376 if (UCHARAT(RExC_parse) == '^') { /* Complement of range. */
ffc61ed2
JH
3377 RExC_naughty++;
3378 RExC_parse++;
3379 if (!SIZE_ONLY)
3380 ANYOF_FLAGS(ret) |= ANYOF_INVERT;
3381 }
a0d0e21e 3382
936ed897 3383 if (SIZE_ONLY)
830247a4 3384 RExC_size += ANYOF_SKIP;
936ed897 3385 else {
830247a4 3386 RExC_emit += ANYOF_SKIP;
936ed897
IZ
3387 if (FOLD)
3388 ANYOF_FLAGS(ret) |= ANYOF_FOLD;
3389 if (LOC)
3390 ANYOF_FLAGS(ret) |= ANYOF_LOCALE;
ffc61ed2
JH
3391 ANYOF_BITMAP_ZERO(ret);
3392 listsv = newSVpvn("# comment\n", 10);
a0d0e21e 3393 }
b8c5462f 3394
9a86a77b
JH
3395 nextvalue = RExC_parse < RExC_end ? UCHARAT(RExC_parse) : 0;
3396
3397 if (!SIZE_ONLY && ckWARN(WARN_REGEXP) && POSIXCC(nextvalue))
830247a4 3398 checkposixcc(pRExC_state);
b8c5462f 3399
9a86a77b 3400 if (UCHARAT(RExC_parse) == ']' || UCHARAT(RExC_parse) == '-')
ffc61ed2
JH
3401 goto charclassloop; /* allow 1st char to be ] or - */
3402
9a86a77b 3403 while (RExC_parse < RExC_end && UCHARAT(RExC_parse) != ']') {
ffc61ed2
JH
3404
3405 charclassloop:
3406
3407 namedclass = OOB_NAMEDCLASS; /* initialize as illegal */
3408
73b437c8 3409 if (!range)
830247a4 3410 rangebegin = RExC_parse;
ffc61ed2 3411 if (UTF) {
5e12f4fb 3412 value = utf8n_to_uvchr((U8*)RExC_parse,
3568d838
JH
3413 RExC_end - RExC_parse,
3414 &numlen, 0);
ffc61ed2
JH
3415 RExC_parse += numlen;
3416 }
3417 else
3418 value = UCHARAT(RExC_parse++);
9a86a77b
JH
3419 nextvalue = RExC_parse < RExC_end ? UCHARAT(RExC_parse) : 0;
3420 if (value == '[' && POSIXCC(nextvalue))
830247a4 3421 namedclass = regpposixcc(pRExC_state, value);
620e46c5 3422 else if (value == '\\') {
ffc61ed2 3423 if (UTF) {
5e12f4fb 3424 value = utf8n_to_uvchr((U8*)RExC_parse,
ffc61ed2
JH
3425 RExC_end - RExC_parse,
3426 &numlen, 0);
3427 RExC_parse += numlen;
3428 }
3429 else
3430 value = UCHARAT(RExC_parse++);
470c3474 3431 /* Some compilers cannot handle switching on 64-bit integer
ffc61ed2 3432 * values, therefore value cannot be an UV. Yes, this will
e2962f66
JH
3433 * be a problem later if we want switch on Unicode.
3434 * A similar issue a little bit later when switching on
3435 * namedclass. --jhi */
ffc61ed2 3436 switch ((I32)value) {
b8c5462f
JH
3437 case 'w': namedclass = ANYOF_ALNUM; break;
3438 case 'W': namedclass = ANYOF_NALNUM; break;
3439 case 's': namedclass = ANYOF_SPACE; break;
3440 case 'S': namedclass = ANYOF_NSPACE; break;
3441 case 'd': namedclass = ANYOF_DIGIT; break;
3442 case 'D': namedclass = ANYOF_NDIGIT; break;
ffc61ed2
JH
3443 case 'p':
3444 case 'P':
3445 if (*RExC_parse == '{') {
0da60cf5 3446 U8 c = (U8)value;
ffc61ed2
JH
3447 e = strchr(RExC_parse++, '}');
3448 if (!e)
0da60cf5 3449 vFAIL2("Missing right brace on \\%c{}", c);
ab13f0c7
JH
3450 while (isSPACE(UCHARAT(RExC_parse)))
3451 RExC_parse++;
3452 if (e == RExC_parse)
0da60cf5 3453 vFAIL2("Empty \\%c{}", c);
ffc61ed2 3454 n = e - RExC_parse;
ab13f0c7
JH
3455 while (isSPACE(UCHARAT(RExC_parse + n - 1)))
3456 n--;
ffc61ed2
JH
3457 }
3458 else {
3459 e = RExC_parse;
3460 n = 1;
3461 }
3462 if (!SIZE_ONLY) {
ab13f0c7
JH
3463 if (UCHARAT(RExC_parse) == '^') {
3464 RExC_parse++;
3465 n--;
3466 value = value == 'p' ? 'P' : 'p'; /* toggle */
3467 while (isSPACE(UCHARAT(RExC_parse))) {
3468 RExC_parse++;
3469 n--;
3470 }
3471 }
ffc61ed2 3472 if (value == 'p')
ab13f0c7
JH
3473 Perl_sv_catpvf(aTHX_ listsv,
3474 "+utf8::%.*s\n", (int)n, RExC_parse);
ffc61ed2 3475 else
ab13f0c7
JH
3476 Perl_sv_catpvf(aTHX_ listsv,
3477 "!utf8::%.*s\n", (int)n, RExC_parse);
ffc61ed2
JH
3478 }
3479 RExC_parse = e + 1;
3480 ANYOF_FLAGS(ret) |= ANYOF_UNICODE;
3481 continue;
b8c5462f
JH
3482 case 'n': value = '\n'; break;
3483 case 'r': value = '\r'; break;
3484 case 't': value = '\t'; break;
3485 case 'f': value = '\f'; break;
3486 case 'b': value = '\b'; break;
c7f1f016
NIS
3487 case 'e': value = ASCII_TO_NATIVE('\033');break;
3488 case 'a': value = ASCII_TO_NATIVE('\007');break;
b8c5462f 3489 case 'x':
ffc61ed2 3490 if (*RExC_parse == '{') {
a4c04bdc
NC
3491 I32 flags = PERL_SCAN_ALLOW_UNDERSCORES
3492 | PERL_SCAN_DISALLOW_PREFIX;
ffc61ed2 3493 e = strchr(RExC_parse++, '}');
b81d288d 3494 if (!e)
ffc61ed2 3495 vFAIL("Missing right brace on \\x{}");
53305cf1
NC
3496
3497 numlen = e - RExC_parse;
3498 value = grok_hex(RExC_parse, &numlen, &flags, NULL);
ffc61ed2
JH
3499 RExC_parse = e + 1;
3500 }
3501 else {
a4c04bdc 3502 I32 flags = PERL_SCAN_DISALLOW_PREFIX;
53305cf1
NC
3503 numlen = 2;
3504 value = grok_hex(RExC_parse, &numlen, &flags, NULL);
ffc61ed2
JH
3505 RExC_parse += numlen;
3506 }
b8c5462f
JH
3507 break;
3508 case 'c':
830247a4 3509 value = UCHARAT(RExC_parse++);
b8c5462f
JH
3510 value = toCTRL(value);
3511 break;
3512 case '0': case '1': case '2': case '3': case '4':
3513 case '5': case '6': case '7': case '8': case '9':
53305cf1
NC
3514 {
3515 I32 flags = 0;
3516 numlen = 3;
3517 value = grok_oct(--RExC_parse, &numlen, &flags, NULL);
830247a4 3518 RExC_parse += numlen;
b8c5462f 3519 break;
53305cf1 3520 }
1028017a 3521 default:
e476b1b5 3522 if (!SIZE_ONLY && ckWARN(WARN_REGEXP) && isALPHA(value))
ffc61ed2
JH
3523 vWARN2(RExC_parse,
3524 "Unrecognized escape \\%c in character class passed through",
3525 (int)value);
1028017a 3526 break;
b8c5462f 3527 }
ffc61ed2
JH
3528 } /* end of \blah */
3529
3530 if (namedclass > OOB_NAMEDCLASS) { /* this is a named class \blah */
3531
3532 if (!SIZE_ONLY && !need_class)
936ed897 3533 ANYOF_CLASS_ZERO(ret);
ffc61ed2 3534
936ed897 3535 need_class = 1;
ffc61ed2
JH
3536
3537 /* a bad range like a-\d, a-[:digit:] ? */
3538 if (range) {
73b437c8 3539 if (!SIZE_ONLY) {
e476b1b5 3540 if (ckWARN(WARN_REGEXP))
830247a4 3541 vWARN4(RExC_parse,
b45f050a 3542 "False [] range \"%*.*s\"",
830247a4
IZ
3543 RExC_parse - rangebegin,
3544 RExC_parse - rangebegin,
b45f050a 3545 rangebegin);
3568d838
JH
3546 if (prevvalue < 256) {
3547 ANYOF_BITMAP_SET(ret, prevvalue);
ffc61ed2
JH
3548 ANYOF_BITMAP_SET(ret, '-');
3549 }
3550 else {
3551 ANYOF_FLAGS(ret) |= ANYOF_UNICODE;
3552 Perl_sv_catpvf(aTHX_ listsv,
3568d838 3553 "%04"UVxf"\n%04"UVxf"\n", (UV)prevvalue, (UV) '-');
ffc61ed2 3554 }
b8c5462f 3555 }
ffc61ed2
JH
3556
3557 range = 0; /* this was not a true range */
73b437c8 3558 }
ffc61ed2 3559
73b437c8 3560 if (!SIZE_ONLY) {
3568d838
JH
3561 if (namedclass > OOB_NAMEDCLASS)
3562 optimize_invert = FALSE;
e2962f66
JH
3563 /* Possible truncation here but in some 64-bit environments
3564 * the compiler gets heartburn about switch on 64-bit values.
3565 * A similar issue a little earlier when switching on value.
98f323fa 3566 * --jhi */
e2962f66 3567 switch ((I32)namedclass) {
73b437c8
JH
3568 case ANYOF_ALNUM:
3569 if (LOC)
936ed897 3570 ANYOF_CLASS_SET(ret, ANYOF_ALNUM);
73b437c8
JH
3571 else {
3572 for (value = 0; value < 256; value++)
3573 if (isALNUM(value))
936ed897 3574 ANYOF_BITMAP_SET(ret, value);
73b437c8 3575 }
ffc61ed2 3576 Perl_sv_catpvf(aTHX_ listsv, "+utf8::IsWord\n");
73b437c8
JH
3577 break;
3578 case ANYOF_NALNUM:
3579 if (LOC)
936ed897 3580 ANYOF_CLASS_SET(ret, ANYOF_NALNUM);
73b437c8
JH
3581 else {
3582 for (value = 0; value < 256; value++)
3583 if (!isALNUM(value))
936ed897 3584 ANYOF_BITMAP_SET(ret, value);
73b437c8 3585 }
ffc61ed2 3586 Perl_sv_catpvf(aTHX_ listsv, "!utf8::IsWord\n");
73b437c8 3587 break;
ffc61ed2 3588 case ANYOF_ALNUMC:
73b437c8 3589 if (LOC)
ffc61ed2 3590 ANYOF_CLASS_SET(ret, ANYOF_ALNUMC);
73b437c8
JH
3591 else {
3592 for (value = 0; value < 256; value++)
ffc61ed2 3593 if (isALNUMC(value))
936ed897 3594 ANYOF_BITMAP_SET(ret, value);
73b437c8 3595 }
ffc61ed2 3596 Perl_sv_catpvf(aTHX_ listsv, "+utf8::IsAlnum\n");
73b437c8
JH
3597 break;
3598 case ANYOF_NALNUMC:
3599 if (LOC)
936ed897 3600 ANYOF_CLASS_SET(ret, ANYOF_NALNUMC);
73b437c8
JH
3601 else {
3602 for (value = 0; value < 256; value++)
3603 if (!isALNUMC(value))
936ed897 3604 ANYOF_BITMAP_SET(ret, value);
73b437c8 3605 }
ffc61ed2 3606 Perl_sv_catpvf(aTHX_ listsv, "!utf8::IsAlnum\n");
73b437c8
JH
3607 break;
3608 case ANYOF_ALPHA:
3609 if (LOC)
936ed897 3610 ANYOF_CLASS_SET(ret, ANYOF_ALPHA);
73b437c8
JH
3611 else {
3612 for (value = 0; value < 256; value++)
3613 if (isALPHA(value))
936ed897 3614 ANYOF_BITMAP_SET(ret, value);
73b437c8 3615 }
ffc61ed2 3616 Perl_sv_catpvf(aTHX_ listsv, "+utf8::IsAlpha\n");
73b437c8
JH
3617 break;
3618 case ANYOF_NALPHA:
3619 if (LOC)
936ed897 3620 ANYOF_CLASS_SET(ret, ANYOF_NALPHA);
73b437c8
JH
3621 else {
3622 for (value = 0; value < 256; value++)
3623 if (!isALPHA(value))
936ed897 3624 ANYOF_BITMAP_SET(ret, value);
73b437c8 3625 }
ffc61ed2 3626 Perl_sv_catpvf(aTHX_ listsv, "!utf8::IsAlpha\n");
73b437c8
JH
3627 break;
3628 case ANYOF_ASCII:
3629 if (LOC)
936ed897 3630 ANYOF_CLASS_SET(ret, ANYOF_ASCII);
73b437c8 3631 else {
c7f1f016 3632#ifndef EBCDIC
1ba5c669
JH
3633 for (value = 0; value < 128; value++)
3634 ANYOF_BITMAP_SET(ret, value);
3635#else /* EBCDIC */
ffbc6a93 3636 for (value = 0; value < 256; value++) {
3a3c4447
JH
3637 if (isASCII(value))
3638 ANYOF_BITMAP_SET(ret, value);
ffbc6a93 3639 }
1ba5c669 3640#endif /* EBCDIC */
73b437c8 3641 }
ffc61ed2 3642 Perl_sv_catpvf(aTHX_ listsv, "+utf8::IsASCII\n");
73b437c8
JH
3643 break;
3644 case ANYOF_NASCII:
3645 if (LOC)
936ed897 3646 ANYOF_CLASS_SET(ret, ANYOF_NASCII);
73b437c8 3647 else {
c7f1f016 3648#ifndef EBCDIC
1ba5c669
JH
3649 for (value = 128; value < 256; value++)
3650 ANYOF_BITMAP_SET(ret, value);
3651#else /* EBCDIC */
ffbc6a93 3652 for (value = 0; value < 256; value++) {
3a3c4447
JH
3653 if (!isASCII(value))
3654 ANYOF_BITMAP_SET(ret, value);
ffbc6a93 3655 }
1ba5c669 3656#endif /* EBCDIC */
73b437c8 3657 }
ffc61ed2 3658 Perl_sv_catpvf(aTHX_ listsv, "!utf8::IsASCII\n");
73b437c8 3659 break;
aaa51d5e
JF
3660 case ANYOF_BLANK:
3661 if (LOC)
3662 ANYOF_CLASS_SET(ret, ANYOF_BLANK);
3663 else {
3664 for (value = 0; value < 256; value++)
3665 if (isBLANK(value))
3666 ANYOF_BITMAP_SET(ret, value);
3667 }
ffc61ed2 3668 Perl_sv_catpvf(aTHX_ listsv, "+utf8::IsBlank\n");
aaa51d5e
JF
3669 break;
3670 case ANYOF_NBLANK:
3671 if (LOC)
3672 ANYOF_CLASS_SET(ret, ANYOF_NBLANK);
3673 else {
3674 for (value = 0; value < 256; value++)
3675 if (!isBLANK(value))
3676 ANYOF_BITMAP_SET(ret, value);
3677 }
ffc61ed2 3678 Perl_sv_catpvf(aTHX_ listsv, "!utf8::IsBlank\n");
aaa51d5e 3679 break;
73b437c8
JH
3680 case ANYOF_CNTRL:
3681 if (LOC)
936ed897 3682 ANYOF_CLASS_SET(ret, ANYOF_CNTRL);
73b437c8
JH
3683 else {
3684 for (value = 0; value < 256; value++)
3685 if (isCNTRL(value))
936ed897 3686 ANYOF_BITMAP_SET(ret, value);
73b437c8 3687 }
ffc61ed2 3688 Perl_sv_catpvf(aTHX_ listsv, "+utf8::IsCntrl\n");
73b437c8
JH
3689 break;
3690 case ANYOF_NCNTRL:
3691 if (LOC)
936ed897 3692 ANYOF_CLASS_SET(ret, ANYOF_NCNTRL);
73b437c8
JH
3693 else {
3694 for (value = 0; value < 256; value++)
3695 if (!isCNTRL(value))
936ed897 3696 ANYOF_BITMAP_SET(ret, value);
73b437c8 3697 }
ffc61ed2
JH
3698 Perl_sv_catpvf(aTHX_ listsv, "!utf8::IsCntrl\n");
3699 break;
3700 case ANYOF_DIGIT:
3701 if (LOC)
3702 ANYOF_CLASS_SET(ret, ANYOF_DIGIT);
3703 else {
3704 /* consecutive digits assumed */
3705 for (value = '0'; value <= '9'; value++)
3706 ANYOF_BITMAP_SET(ret, value);
3707 }
3708 Perl_sv_catpvf(aTHX_ listsv, "+utf8::IsDigit\n");
3709 break;
3710 case ANYOF_NDIGIT:
3711 if (LOC)
3712 ANYOF_CLASS_SET(ret, ANYOF_NDIGIT);
3713 else {
3714 /* consecutive digits assumed */
3715 for (value = 0; value < '0'; value++)
3716 ANYOF_BITMAP_SET(ret, value);
3717 for (value = '9' + 1; value < 256; value++)
3718 ANYOF_BITMAP_SET(ret, value);
3719 }
3720 Perl_sv_catpvf(aTHX_ listsv, "!utf8::IsDigit\n");
73b437c8
JH
3721 break;
3722 case ANYOF_GRAPH:
3723 if (LOC)
936ed897 3724 ANYOF_CLASS_SET(ret, ANYOF_GRAPH);
73b437c8
JH
3725 else {
3726 for (value = 0; value < 256; value++)
3727 if (isGRAPH(value))
936ed897 3728 ANYOF_BITMAP_SET(ret, value);
73b437c8 3729 }
ffc61ed2 3730 Perl_sv_catpvf(aTHX_ listsv, "+utf8::IsGraph\n");
73b437c8
JH
3731 break;
3732 case ANYOF_NGRAPH:
3733 if (LOC)
936ed897 3734 ANYOF_CLASS_SET(ret, ANYOF_NGRAPH);
73b437c8
JH
3735 else {
3736 for (value = 0; value < 256; value++)
3737 if (!isGRAPH(value))
936ed897 3738 ANYOF_BITMAP_SET(ret, value);
73b437c8 3739 }
ffc61ed2 3740 Perl_sv_catpvf(aTHX_ listsv, "!utf8::IsGraph\n");
73b437c8
JH
3741 break;
3742 case ANYOF_LOWER:
3743 if (LOC)
936ed897 3744 ANYOF_CLASS_SET(ret, ANYOF_LOWER);
73b437c8
JH
3745 else {
3746 for (value = 0; value < 256; value++)
3747 if (isLOWER(value))
936ed897 3748 ANYOF_BITMAP_SET(ret, value);
73b437c8 3749 }
ffc61ed2 3750 Perl_sv_catpvf(aTHX_ listsv, "+utf8::IsLower\n");
73b437c8
JH
3751 break;
3752 case ANYOF_NLOWER:
3753 if (LOC)
936ed897 3754 ANYOF_CLASS_SET(ret, ANYOF_NLOWER);
73b437c8
JH
3755 else {
3756 for (value = 0; value < 256; value++)
3757 if (!isLOWER(value))
936ed897 3758 ANYOF_BITMAP_SET(ret, value);
73b437c8 3759 }
ffc61ed2 3760 Perl_sv_catpvf(aTHX_ listsv, "!utf8::IsLower\n");
73b437c8
JH
3761 break;
3762 case ANYOF_PRINT:
3763 if (LOC)
936ed897 3764 ANYOF_CLASS_SET(ret, ANYOF_PRINT);
73b437c8
JH
3765 else {
3766 for (value = 0; value < 256; value++)
3767 if (isPRINT(value))
936ed897 3768 ANYOF_BITMAP_SET(ret, value);
73b437c8 3769 }
ffc61ed2 3770 Perl_sv_catpvf(aTHX_ listsv, "+utf8::IsPrint\n");
73b437c8
JH
3771 break;
3772 case ANYOF_NPRINT:
3773 if (LOC)
936ed897 3774 ANYOF_CLASS_SET(ret, ANYOF_NPRINT);
73b437c8
JH
3775 else {
3776 for (value = 0; value < 256; value++)
3777 if (!isPRINT(value))
936ed897 3778 ANYOF_BITMAP_SET(ret, value);
73b437c8 3779 }
ffc61ed2 3780 Perl_sv_catpvf(aTHX_ listsv, "!utf8::IsPrint\n");
73b437c8 3781 break;
aaa51d5e
JF
3782 case ANYOF_PSXSPC:
3783 if (LOC)
3784 ANYOF_CLASS_SET(ret, ANYOF_PSXSPC);
3785 else {
3786 for (value = 0; value < 256; value++)
3787 if (isPSXSPC(value))
3788 ANYOF_BITMAP_SET(ret, value);
3789 }
ffc61ed2 3790 Perl_sv_catpvf(aTHX_ listsv, "+utf8::IsSpace\n");
aaa51d5e
JF
3791 break;
3792 case ANYOF_NPSXSPC:
3793 if (LOC)
3794 ANYOF_CLASS_SET(ret, ANYOF_NPSXSPC);
3795 else {
3796 for (value = 0; value < 256; value++)
3797 if (!isPSXSPC(value))
3798 ANYOF_BITMAP_SET(ret, value);
3799 }
ffc61ed2 3800 Perl_sv_catpvf(aTHX_ listsv, "!utf8::IsSpace\n");
aaa51d5e 3801 break;
73b437c8
JH
3802 case ANYOF_PUNCT:
3803 if (LOC)
936ed897 3804 ANYOF_CLASS_SET(ret, ANYOF_PUNCT);
73b437c8
JH
3805 else {
3806 for (value = 0; value < 256; value++)
3807 if (isPUNCT(value))
936ed897 3808 ANYOF_BITMAP_SET(ret, value);
73b437c8 3809 }
ffc61ed2 3810 Perl_sv_catpvf(aTHX_ listsv, "+utf8::IsPunct\n");
73b437c8
JH
3811 break;
3812 case ANYOF_NPUNCT:
3813 if (LOC)
936ed897 3814 ANYOF_CLASS_SET(ret, ANYOF_NPUNCT);
73b437c8
JH
3815 else {
3816 for (value = 0; value < 256; value++)
3817 if (!isPUNCT(value))
936ed897 3818 ANYOF_BITMAP_SET(ret, value);
73b437c8 3819 }
ffc61ed2
JH
3820 Perl_sv_catpvf(aTHX_ listsv, "!utf8::IsPunct\n");
3821 break;
3822 case ANYOF_SPACE:
3823 if (LOC)
3824 ANYOF_CLASS_SET(ret, ANYOF_SPACE);
3825 else {
3826 for (value = 0; value < 256; value++)
3827 if (isSPACE(value))
3828 ANYOF_BITMAP_SET(ret, value);
3829 }
3830 Perl_sv_catpvf(aTHX_ listsv, "+utf8::IsSpacePerl\n");
3831 break;
3832 case ANYOF_NSPACE:
3833 if (LOC)
3834 ANYOF_CLASS_SET(ret, ANYOF_NSPACE);
3835 else {
3836 for (value = 0; value < 256; value++)
3837 if (!isSPACE(value))
3838 ANYOF_BITMAP_SET(ret, value);
3839 }
3840 Perl_sv_catpvf(aTHX_ listsv, "!utf8::IsSpacePerl\n");
73b437c8
JH
3841 break;
3842 case ANYOF_UPPER:
3843 if (LOC)
936ed897 3844 ANYOF_CLASS_SET(ret, ANYOF_UPPER);
73b437c8
JH
3845 else {
3846 for (value = 0; value < 256; value++)
3847 if (isUPPER(value))
936ed897 3848 ANYOF_BITMAP_SET(ret, value);
73b437c8 3849 }
ffc61ed2 3850 Perl_sv_catpvf(aTHX_ listsv, "+utf8::IsUpper\n");
73b437c8
JH
3851 break;
3852 case ANYOF_NUPPER:
3853 if (LOC)
936ed897 3854 ANYOF_CLASS_SET(ret, ANYOF_NUPPER);
73b437c8
JH
3855 else {
3856 for (value = 0; value < 256; value++)
3857 if (!isUPPER(value))
936ed897 3858 ANYOF_BITMAP_SET(ret, value);
73b437c8 3859 }
ffc61ed2 3860 Perl_sv_catpvf(aTHX_ listsv, "!utf8::IsUpper\n");
73b437c8
JH
3861 break;
3862 case ANYOF_XDIGIT:
3863 if (LOC)
936ed897 3864 ANYOF_CLASS_SET(ret, ANYOF_XDIGIT);
73b437c8
JH
3865 else {
3866 for (value = 0; value < 256; value++)
3867 if (isXDIGIT(value))
936ed897 3868 ANYOF_BITMAP_SET(ret, value);
73b437c8 3869 }
ffc61ed2 3870 Perl_sv_catpvf(aTHX_ listsv, "+utf8::IsXDigit\n");
73b437c8
JH
3871 break;
3872 case ANYOF_NXDIGIT:
3873 if (LOC)
936ed897 3874 ANYOF_CLASS_SET(ret, ANYOF_NXDIGIT);
73b437c8
JH
3875 else {
3876 for (value = 0; value < 256; value++)
3877 if (!isXDIGIT(value))
936ed897 3878 ANYOF_BITMAP_SET(ret, value);
73b437c8 3879 }
ffc61ed2 3880 Perl_sv_catpvf(aTHX_ listsv, "!utf8::IsXDigit\n");
73b437c8
JH
3881 break;
3882 default:
b45f050a 3883 vFAIL("Invalid [::] class");
73b437c8 3884 break;
b8c5462f 3885 }
b8c5462f 3886 if (LOC)
936ed897 3887 ANYOF_FLAGS(ret) |= ANYOF_CLASS;
73b437c8 3888 continue;
a0d0e21e 3889 }
ffc61ed2
JH
3890 } /* end of namedclass \blah */
3891
a0d0e21e 3892 if (range) {
3a3c4447 3893 if (prevvalue > value) /* b-a */ {
b45f050a 3894 Simple_vFAIL4("Invalid [] range \"%*.*s\"",
830247a4
IZ
3895 RExC_parse - rangebegin,
3896 RExC_parse - rangebegin,
b45f050a 3897 rangebegin);
3568d838 3898 range = 0; /* not a valid range */
73b437c8 3899 }
a0d0e21e
LW
3900 }
3901 else {
3568d838 3902 prevvalue = value; /* save the beginning of the range */
830247a4
IZ
3903 if (*RExC_parse == '-' && RExC_parse+1 < RExC_end &&
3904 RExC_parse[1] != ']') {
3905 RExC_parse++;
ffc61ed2
JH
3906
3907 /* a bad range like \w-, [:word:]- ? */
3908 if (namedclass > OOB_NAMEDCLASS) {
e476b1b5 3909 if (ckWARN(WARN_REGEXP))
830247a4 3910 vWARN4(RExC_parse,
b45f050a 3911 "False [] range \"%*.*s\"",
830247a4
IZ
3912 RExC_parse - rangebegin,
3913 RExC_parse - rangebegin,
b45f050a 3914 rangebegin);
73b437c8 3915 if (!SIZE_ONLY)
936ed897 3916 ANYOF_BITMAP_SET(ret, '-');
73b437c8 3917 } else
ffc61ed2
JH
3918 range = 1; /* yeah, it's a range! */
3919 continue; /* but do it the next time */
a0d0e21e 3920 }
a687059c 3921 }
ffc61ed2 3922
93733859 3923 /* now is the next time */
ae5c130c 3924 if (!SIZE_ONLY) {
3568d838
JH
3925 IV i;
3926
3927 if (prevvalue < 256) {
3928 IV ceilvalue = value < 256 ? value : 255;
3929
3930#ifdef EBCDIC
3a3c4447
JH
3931 if ((isLOWER(prevvalue) && isLOWER(ceilvalue)) ||
3932 (isUPPER(prevvalue) && isUPPER(ceilvalue)))
ffc61ed2 3933 {
3568d838
JH
3934 if (isLOWER(prevvalue)) {
3935 for (i = prevvalue; i <= ceilvalue; i++)
ffc61ed2
JH
3936 if (isLOWER(i))
3937 ANYOF_BITMAP_SET(ret, i);
3938 } else {
3568d838 3939 for (i = prevvalue; i <= ceilvalue; i++)
ffc61ed2
JH
3940 if (isUPPER(i))
3941 ANYOF_BITMAP_SET(ret, i);
3942 }
8ada0baa 3943 }
ffc61ed2 3944 else
8ada0baa 3945#endif
3568d838
JH
3946 for (i = prevvalue; i <= ceilvalue; i++)
3947 ANYOF_BITMAP_SET(ret, i);
3948 }
3949 if (value > 255) {
ffc61ed2 3950 ANYOF_FLAGS(ret) |= ANYOF_UNICODE;
3568d838 3951 if (prevvalue < value)
ffc61ed2 3952 Perl_sv_catpvf(aTHX_ listsv, "%04"UVxf"\t%04"UVxf"\n",
3568d838
JH
3953 (UV)prevvalue, (UV)value);
3954 else if (prevvalue == value)
ffc61ed2
JH
3955 Perl_sv_catpvf(aTHX_ listsv, "%04"UVxf"\n",
3956 (UV)value);
3957 }
8ada0baa 3958 }
ffc61ed2
JH
3959
3960 range = 0; /* this range (if it was one) is done now */
a0d0e21e 3961 }
ffc61ed2 3962
936ed897 3963 if (need_class) {
4f66b38d 3964 ANYOF_FLAGS(ret) |= ANYOF_LARGE;
936ed897 3965 if (SIZE_ONLY)
830247a4 3966 RExC_size += ANYOF_CLASS_ADD_SKIP;
936ed897 3967 else
830247a4 3968 RExC_emit += ANYOF_CLASS_ADD_SKIP;
936ed897 3969 }
ffc61ed2 3970
ae5c130c 3971 /* optimize case-insensitive simple patterns (e.g. /[a-z]/i) */
b8c5462f 3972 if (!SIZE_ONLY &&
ffc61ed2 3973 /* If the only flag is folding (plus possibly inversion). */
516a5887
JH
3974 ((ANYOF_FLAGS(ret) & (ANYOF_FLAGS_ALL ^ ANYOF_INVERT)) == ANYOF_FOLD)
3975 ) {
a0ed51b3 3976 for (value = 0; value < 256; ++value) {
936ed897 3977 if (ANYOF_BITMAP_TEST(ret, value)) {
ffc61ed2
JH
3978 IV fold = PL_fold[value];
3979
3980 if (fold != value)
3981 ANYOF_BITMAP_SET(ret, fold);
ae5c130c
GS
3982 }
3983 }
936ed897 3984 ANYOF_FLAGS(ret) &= ~ANYOF_FOLD;
ae5c130c 3985 }
ffc61ed2 3986
ae5c130c 3987 /* optimize inverted simple patterns (e.g. [^a-z]) */
3568d838 3988 if (!SIZE_ONLY && optimize_invert &&
ffc61ed2
JH
3989 /* If the only flag is inversion. */
3990 (ANYOF_FLAGS(ret) & ANYOF_FLAGS_ALL) == ANYOF_INVERT) {
b8c5462f 3991 for (value = 0; value < ANYOF_BITMAP_SIZE; ++value)
936ed897 3992 ANYOF_BITMAP(ret)[value] ^= ANYOF_FLAGS_ALL;
1aa99e6b 3993 ANYOF_FLAGS(ret) = ANYOF_UNICODE_ALL;
ae5c130c 3994 }
a0d0e21e 3995
b81d288d 3996 if (!SIZE_ONLY) {
fde631ed 3997 AV *av = newAV();
ffc61ed2
JH
3998 SV *rv;
3999
4000 av_store(av, 0, listsv);
4001 av_store(av, 1, NULL);
4002 rv = newRV_noinc((SV*)av);
19860706 4003 n = add_data(pRExC_state, 1, "s");
830247a4 4004 RExC_rx->data->data[n] = (void*)rv;
ffc61ed2 4005 ARG_SET(ret, n);
a0ed51b3
LW
4006 }
4007
4008 return ret;
4009}
4010
76e3520e 4011STATIC char*
830247a4 4012S_nextchar(pTHX_ RExC_state_t *pRExC_state)
a0d0e21e 4013{
830247a4 4014 char* retval = RExC_parse++;
a0d0e21e 4015
4633a7c4 4016 for (;;) {
830247a4
IZ
4017 if (*RExC_parse == '(' && RExC_parse[1] == '?' &&
4018 RExC_parse[2] == '#') {
4019 while (*RExC_parse && *RExC_parse != ')')
4020 RExC_parse++;
4021 RExC_parse++;
4633a7c4
LW
4022 continue;
4023 }
830247a4
IZ
4024 if (RExC_flags16 & PMf_EXTENDED) {
4025 if (isSPACE(*RExC_parse)) {
4026 RExC_parse++;
748a9306
LW
4027 continue;
4028 }
830247a4
IZ
4029 else if (*RExC_parse == '#') {
4030 while (*RExC_parse && *RExC_parse != '\n')
4031 RExC_parse++;
4032 RExC_parse++;
748a9306
LW
4033 continue;
4034 }
748a9306 4035 }
4633a7c4 4036 return retval;
a0d0e21e 4037 }
a687059c
LW
4038}
4039
4040/*
c277df42 4041- reg_node - emit a node
a0d0e21e 4042*/
76e3520e 4043STATIC regnode * /* Location. */
830247a4 4044S_reg_node(pTHX_ RExC_state_t *pRExC_state, U8 op)
a687059c 4045{
c277df42
IZ
4046 register regnode *ret;
4047 register regnode *ptr;
a687059c 4048
830247a4 4049 ret = RExC_emit;
c277df42 4050 if (SIZE_ONLY) {
830247a4
IZ
4051 SIZE_ALIGN(RExC_size);
4052 RExC_size += 1;
a0d0e21e
LW
4053 return(ret);
4054 }
a687059c 4055
c277df42 4056 NODE_ALIGN_FILL(ret);
a0d0e21e 4057 ptr = ret;
c277df42 4058 FILL_ADVANCE_NODE(ptr, op);
fac92740
MJD
4059 if (RExC_offsets) { /* MJD */
4060 MJD_OFFSET_DEBUG((stderr, "%s:%u: (op %s) %s %u <- %u (len %u) (max %u).\n",
4061 "reg_node", __LINE__,
4062 reg_name[op],
4063 RExC_emit - RExC_emit_start > RExC_offsets[0]
4064 ? "Overwriting end of array!\n" : "OK",
4065 RExC_emit - RExC_emit_start,
4066 RExC_parse - RExC_start,
4067 RExC_offsets[0]));
4068 Set_Node_Offset(RExC_emit, RExC_parse + (op == END));
4069 }
4070
830247a4 4071 RExC_emit = ptr;
a687059c 4072
a0d0e21e 4073 return(ret);
a687059c
LW
4074}
4075
4076/*
a0d0e21e
LW
4077- reganode - emit a node with an argument
4078*/
76e3520e 4079STATIC regnode * /* Location. */
830247a4 4080S_reganode(pTHX_ RExC_state_t *pRExC_state, U8 op, U32 arg)
fe14fcc3 4081{
c277df42
IZ
4082 register regnode *ret;
4083 register regnode *ptr;
fe14fcc3 4084
830247a4 4085 ret = RExC_emit;
c277df42 4086 if (SIZE_ONLY) {
830247a4
IZ
4087 SIZE_ALIGN(RExC_size);
4088 RExC_size += 2;
a0d0e21e
LW
4089 return(ret);
4090 }
fe14fcc3 4091
c277df42 4092 NODE_ALIGN_FILL(ret);
a0d0e21e 4093 ptr = ret;
c277df42 4094 FILL_ADVANCE_NODE_ARG(ptr, op, arg);
fac92740
MJD
4095 if (RExC_offsets) { /* MJD */
4096 MJD_OFFSET_DEBUG((stderr, "%s: %s %u <- %u (max %u).\n",
4097 "reganode",
4098 RExC_emit - RExC_emit_start > RExC_offsets[0] ?
4099 "Overwriting end of array!\n" : "OK",
4100 RExC_emit - RExC_emit_start,
4101 RExC_parse - RExC_start,
4102 RExC_offsets[0]));
4103 Set_Cur_Node_Offset;
4104 }
4105
830247a4 4106 RExC_emit = ptr;
fe14fcc3 4107
a0d0e21e 4108 return(ret);
fe14fcc3
LW
4109}
4110
4111/*
cd439c50 4112- reguni - emit (if appropriate) a Unicode character
a0ed51b3
LW
4113*/
4114STATIC void
830247a4 4115S_reguni(pTHX_ RExC_state_t *pRExC_state, UV uv, char* s, STRLEN* lenp)
a0ed51b3 4116{
5e12f4fb 4117 *lenp = SIZE_ONLY ? UNISKIP(uv) : (uvchr_to_utf8((U8*)s, uv) - (U8*)s);
a0ed51b3
LW
4118}
4119
4120/*
a0d0e21e
LW
4121- reginsert - insert an operator in front of already-emitted operand
4122*
4123* Means relocating the operand.
4124*/
76e3520e 4125STATIC void
830247a4 4126S_reginsert(pTHX_ RExC_state_t *pRExC_state, U8 op, regnode *opnd)
a687059c 4127{
c277df42
IZ
4128 register regnode *src;
4129 register regnode *dst;
4130 register regnode *place;
4131 register int offset = regarglen[(U8)op];
b81d288d 4132
22c35a8c 4133/* (PL_regkind[(U8)op] == CURLY ? EXTRA_STEP_2ARGS : 0); */
c277df42
IZ
4134
4135 if (SIZE_ONLY) {
830247a4 4136 RExC_size += NODE_STEP_REGNODE + offset;
a0d0e21e
LW
4137 return;
4138 }
a687059c 4139
830247a4
IZ
4140 src = RExC_emit;
4141 RExC_emit += NODE_STEP_REGNODE + offset;
4142 dst = RExC_emit;
fac92740 4143 while (src > opnd) {
c277df42 4144 StructCopy(--src, --dst, regnode);
fac92740
MJD
4145 if (RExC_offsets) { /* MJD 20010112 */
4146 MJD_OFFSET_DEBUG((stderr, "%s: %s copy %u -> %u (max %u).\n",
4147 "reg_insert",
4148 dst - RExC_emit_start > RExC_offsets[0]
4149 ? "Overwriting end of array!\n" : "OK",
4150 src - RExC_emit_start,
4151 dst - RExC_emit_start,
4152 RExC_offsets[0]));
4153 Set_Node_Offset_To_R(dst-RExC_emit_start, Node_Offset(src));
4154 Set_Node_Length_To_R(dst-RExC_emit_start, Node_Length(src));
4155 }
4156 }
4157
a0d0e21e
LW
4158
4159 place = opnd; /* Op node, where operand used to be. */
fac92740
MJD
4160 if (RExC_offsets) { /* MJD */
4161 MJD_OFFSET_DEBUG((stderr, "%s: %s %u <- %u (max %u).\n",
4162 "reginsert",
4163 place - RExC_emit_start > RExC_offsets[0]
4164 ? "Overwriting end of array!\n" : "OK",
4165 place - RExC_emit_start,
4166 RExC_parse - RExC_start,
4167 RExC_offsets[0]));
4168 Set_Node_Offset(place, RExC_parse);
4169 }
c277df42
IZ
4170 src = NEXTOPER(place);
4171 FILL_ADVANCE_NODE(place, op);
4172 Zero(src, offset, regnode);
a687059c
LW
4173}
4174
4175/*
c277df42 4176- regtail - set the next-pointer at the end of a node chain of p to val.
a0d0e21e 4177*/
76e3520e 4178STATIC void
830247a4 4179S_regtail(pTHX_ RExC_state_t *pRExC_state, regnode *p, regnode *val)
a687059c 4180{
c277df42
IZ
4181 register regnode *scan;
4182 register regnode *temp;
a0d0e21e 4183
c277df42 4184 if (SIZE_ONLY)
a0d0e21e
LW
4185 return;
4186
4187 /* Find last node. */
4188 scan = p;
4189 for (;;) {
4190 temp = regnext(scan);
4191 if (temp == NULL)
4192 break;
4193 scan = temp;
4194 }
a687059c 4195
c277df42
IZ
4196 if (reg_off_by_arg[OP(scan)]) {
4197 ARG_SET(scan, val - scan);
a0ed51b3
LW
4198 }
4199 else {
c277df42
IZ
4200 NEXT_OFF(scan) = val - scan;
4201 }
a687059c
LW
4202}
4203
4204/*
a0d0e21e
LW
4205- regoptail - regtail on operand of first argument; nop if operandless
4206*/
76e3520e 4207STATIC void
830247a4 4208S_regoptail(pTHX_ RExC_state_t *pRExC_state, regnode *p, regnode *val)
a687059c 4209{
a0d0e21e 4210 /* "Operandless" and "op != BRANCH" are synonymous in practice. */
c277df42
IZ
4211 if (p == NULL || SIZE_ONLY)
4212 return;
22c35a8c 4213 if (PL_regkind[(U8)OP(p)] == BRANCH) {
830247a4 4214 regtail(pRExC_state, NEXTOPER(p), val);
a0ed51b3 4215 }
22c35a8c 4216 else if ( PL_regkind[(U8)OP(p)] == BRANCHJ) {
830247a4 4217 regtail(pRExC_state, NEXTOPER(NEXTOPER(p)), val);
a0ed51b3
LW
4218 }
4219 else
a0d0e21e 4220 return;
a687059c
LW
4221}
4222
4223/*
4224 - regcurly - a little FSA that accepts {\d+,?\d*}
4225 */
79072805 4226STATIC I32
cea2e8a9 4227S_regcurly(pTHX_ register char *s)
a687059c
LW
4228{
4229 if (*s++ != '{')
4230 return FALSE;
f0fcb552 4231 if (!isDIGIT(*s))
a687059c 4232 return FALSE;
f0fcb552 4233 while (isDIGIT(*s))
a687059c
LW
4234 s++;
4235 if (*s == ',')
4236 s++;
f0fcb552 4237 while (isDIGIT(*s))
a687059c
LW
4238 s++;
4239 if (*s != '}')
4240 return FALSE;
4241 return TRUE;
4242}
4243
a687059c 4244
8fa7f367
JH
4245#ifdef DEBUGGING
4246
76e3520e 4247STATIC regnode *
cea2e8a9 4248S_dumpuntil(pTHX_ regnode *start, regnode *node, regnode *last, SV* sv, I32 l)
c277df42 4249{
f248d071 4250 register U8 op = EXACT; /* Arbitrary non-END op. */
155aba94 4251 register regnode *next;
c277df42
IZ
4252
4253 while (op != END && (!last || node < last)) {
4254 /* While that wasn't END last time... */
4255
4256 NODE_ALIGN(node);
4257 op = OP(node);
4258 if (op == CLOSE)
4259 l--;
4260 next = regnext(node);
4261 /* Where, what. */
4262 if (OP(node) == OPTIMIZED)
4263 goto after_print;
4264 regprop(sv, node);
b900a521 4265 PerlIO_printf(Perl_debug_log, "%4"IVdf":%*s%s", (IV)(node - start),
f1dbda3d 4266 (int)(2*l + 1), "", SvPVX(sv));
c277df42
IZ
4267 if (next == NULL) /* Next ptr. */
4268 PerlIO_printf(Perl_debug_log, "(0)");
b81d288d 4269 else
b900a521 4270 PerlIO_printf(Perl_debug_log, "(%"IVdf")", (IV)(next - start));
c277df42
IZ
4271 (void)PerlIO_putc(Perl_debug_log, '\n');
4272 after_print:
22c35a8c 4273 if (PL_regkind[(U8)op] == BRANCHJ) {
b81d288d
AB
4274 register regnode *nnode = (OP(next) == LONGJMP
4275 ? regnext(next)
c277df42
IZ
4276 : next);
4277 if (last && nnode > last)
4278 nnode = last;
4279 node = dumpuntil(start, NEXTOPER(NEXTOPER(node)), nnode, sv, l + 1);
a0ed51b3 4280 }
22c35a8c 4281 else if (PL_regkind[(U8)op] == BRANCH) {
c277df42 4282 node = dumpuntil(start, NEXTOPER(node), next, sv, l + 1);
a0ed51b3
LW
4283 }
4284 else if ( op == CURLY) { /* `next' might be very big: optimizer */
c277df42
IZ
4285 node = dumpuntil(start, NEXTOPER(node) + EXTRA_STEP_2ARGS,
4286 NEXTOPER(node) + EXTRA_STEP_2ARGS + 1, sv, l + 1);
a0ed51b3 4287 }
22c35a8c 4288 else if (PL_regkind[(U8)op] == CURLY && op != CURLYX) {
c277df42
IZ
4289 node = dumpuntil(start, NEXTOPER(node) + EXTRA_STEP_2ARGS,
4290 next, sv, l + 1);
a0ed51b3
LW
4291 }
4292 else if ( op == PLUS || op == STAR) {
c277df42 4293 node = dumpuntil(start, NEXTOPER(node), NEXTOPER(node) + 1, sv, l + 1);
a0ed51b3
LW
4294 }
4295 else if (op == ANYOF) {
4f66b38d
HS
4296 /* arglen 1 + class block */
4297 node += 1 + ((ANYOF_FLAGS(node) & ANYOF_LARGE)
4298 ? ANYOF_CLASS_SKIP : ANYOF_SKIP);
4299 node = NEXTOPER(node);
a0ed51b3 4300 }
22c35a8c 4301 else if (PL_regkind[(U8)op] == EXACT) {
c277df42 4302 /* Literal string, where present. */
cd439c50 4303 node += NODE_SZ_STR(node) - 1;
c277df42 4304 node = NEXTOPER(node);
a0ed51b3
LW
4305 }
4306 else {
c277df42
IZ
4307 node = NEXTOPER(node);
4308 node += regarglen[(U8)op];
4309 }
4310 if (op == CURLYX || op == OPEN)
4311 l++;
4312 else if (op == WHILEM)
4313 l--;
4314 }
4315 return node;
4316}
4317
8fa7f367
JH
4318#endif /* DEBUGGING */
4319
a687059c 4320/*
fd181c75 4321 - regdump - dump a regexp onto Perl_debug_log in vaguely comprehensible form
a687059c
LW
4322 */
4323void
864dbfa3 4324Perl_regdump(pTHX_ regexp *r)
a687059c 4325{
35ff7856 4326#ifdef DEBUGGING
46fc3d4c 4327 SV *sv = sv_newmortal();
a687059c 4328
c277df42 4329 (void)dumpuntil(r->program, r->program + 1, NULL, sv, 0);
a0d0e21e
LW
4330
4331 /* Header fields of interest. */
c277df42 4332 if (r->anchored_substr)
7b0972df 4333 PerlIO_printf(Perl_debug_log,
b81d288d 4334 "anchored `%s%.*s%s'%s at %"IVdf" ",
3280af22 4335 PL_colors[0],
7b0972df 4336 (int)(SvCUR(r->anchored_substr) - (SvTAIL(r->anchored_substr)!=0)),
b81d288d 4337 SvPVX(r->anchored_substr),
3280af22 4338 PL_colors[1],
c277df42 4339 SvTAIL(r->anchored_substr) ? "$" : "",
7b0972df 4340 (IV)r->anchored_offset);
c277df42 4341 if (r->float_substr)
7b0972df 4342 PerlIO_printf(Perl_debug_log,
b81d288d 4343 "floating `%s%.*s%s'%s at %"IVdf"..%"UVuf" ",
3280af22 4344 PL_colors[0],
b81d288d 4345 (int)(SvCUR(r->float_substr) - (SvTAIL(r->float_substr)!=0)),
2c2d71f5 4346 SvPVX(r->float_substr),
3280af22 4347 PL_colors[1],
c277df42 4348 SvTAIL(r->float_substr) ? "$" : "",
7b0972df 4349 (IV)r->float_min_offset, (UV)r->float_max_offset);
c277df42 4350 if (r->check_substr)
b81d288d
AB
4351 PerlIO_printf(Perl_debug_log,
4352 r->check_substr == r->float_substr
c277df42
IZ
4353 ? "(checking floating" : "(checking anchored");
4354 if (r->reganch & ROPT_NOSCAN)
4355 PerlIO_printf(Perl_debug_log, " noscan");
4356 if (r->reganch & ROPT_CHECK_ALL)
4357 PerlIO_printf(Perl_debug_log, " isall");
4358 if (r->check_substr)
4359 PerlIO_printf(Perl_debug_log, ") ");
4360
46fc3d4c 4361 if (r->regstclass) {
4362 regprop(sv, r->regstclass);
4363 PerlIO_printf(Perl_debug_log, "stclass `%s' ", SvPVX(sv));
4364 }
774d564b 4365 if (r->reganch & ROPT_ANCH) {
4366 PerlIO_printf(Perl_debug_log, "anchored");
4367 if (r->reganch & ROPT_ANCH_BOL)
4368 PerlIO_printf(Perl_debug_log, "(BOL)");
c277df42
IZ
4369 if (r->reganch & ROPT_ANCH_MBOL)
4370 PerlIO_printf(Perl_debug_log, "(MBOL)");
cad2e5aa
JH
4371 if (r->reganch & ROPT_ANCH_SBOL)
4372 PerlIO_printf(Perl_debug_log, "(SBOL)");
774d564b 4373 if (r->reganch & ROPT_ANCH_GPOS)
4374 PerlIO_printf(Perl_debug_log, "(GPOS)");
4375 PerlIO_putc(Perl_debug_log, ' ');
4376 }
c277df42
IZ
4377 if (r->reganch & ROPT_GPOS_SEEN)
4378 PerlIO_printf(Perl_debug_log, "GPOS ");
a0d0e21e 4379 if (r->reganch & ROPT_SKIP)
760ac839 4380 PerlIO_printf(Perl_debug_log, "plus ");
a0d0e21e 4381 if (r->reganch & ROPT_IMPLICIT)
760ac839 4382 PerlIO_printf(Perl_debug_log, "implicit ");
760ac839 4383 PerlIO_printf(Perl_debug_log, "minlen %ld ", (long) r->minlen);
ce862d02
IZ
4384 if (r->reganch & ROPT_EVAL_SEEN)
4385 PerlIO_printf(Perl_debug_log, "with eval ");
760ac839 4386 PerlIO_printf(Perl_debug_log, "\n");
fac92740
MJD
4387 if (r->offsets) {
4388 U32 i;
4389 U32 len = r->offsets[0];
392fbf5d 4390 PerlIO_printf(Perl_debug_log, "Offsets: [%"UVuf"]\n\t", (UV)r->offsets[0]);
fac92740 4391 for (i = 1; i <= len; i++)
392fbf5d
RB
4392 PerlIO_printf(Perl_debug_log, "%"UVuf"[%"UVuf"] ",
4393 (UV)r->offsets[i*2-1],
4394 (UV)r->offsets[i*2]);
fac92740
MJD
4395 PerlIO_printf(Perl_debug_log, "\n");
4396 }
17c3b450 4397#endif /* DEBUGGING */
a687059c
LW
4398}
4399
8fa7f367
JH
4400#ifdef DEBUGGING
4401
653099ff
GS
4402STATIC void
4403S_put_byte(pTHX_ SV *sv, int c)
4404{
7be5a6cf 4405 if (isCNTRL(c) || c == 255 || !isPRINT(c))
653099ff
GS
4406 Perl_sv_catpvf(aTHX_ sv, "\\%o", c);
4407 else if (c == '-' || c == ']' || c == '\\' || c == '^')
4408 Perl_sv_catpvf(aTHX_ sv, "\\%c", c);
4409 else
4410 Perl_sv_catpvf(aTHX_ sv, "%c", c);
4411}
4412
8fa7f367
JH
4413#endif /* DEBUGGING */
4414
a687059c 4415/*
a0d0e21e
LW
4416- regprop - printable representation of opcode
4417*/
46fc3d4c 4418void
864dbfa3 4419Perl_regprop(pTHX_ SV *sv, regnode *o)
a687059c 4420{
35ff7856 4421#ifdef DEBUGGING
9b155405 4422 register int k;
a0d0e21e 4423
54dc92de 4424 sv_setpvn(sv, "", 0);
9b155405 4425 if (OP(o) >= reg_num) /* regnode.type is unsigned */
830247a4
IZ
4426 /* It would be nice to FAIL() here, but this may be called from
4427 regexec.c, and it would be hard to supply pRExC_state. */
4428 Perl_croak(aTHX_ "Corrupted regexp opcode");
9b155405
IZ
4429 sv_catpv(sv, (char*)reg_name[OP(o)]); /* Take off const! */
4430
4431 k = PL_regkind[(U8)OP(o)];
4432
2a782b5b
JH
4433 if (k == EXACT) {
4434 SV *dsv = sv_2mortal(newSVpvn("", 0));
df1ffd02 4435 bool do_utf8 = DO_UTF8(sv);
8a989385 4436 char *s = do_utf8 ?
2a782b5b
JH
4437 pv_uni_display(dsv, (U8*)STRING(o), STR_LEN(o), 60, 0) :
4438 STRING(o);
40eddc46 4439 int len = do_utf8 ?
2a782b5b
JH
4440 strlen(s) :
4441 STR_LEN(o);
4442 Perl_sv_catpvf(aTHX_ sv, " <%s%.*s%s>",
4443 PL_colors[0],
4444 len, s,
4445 PL_colors[1]);
4446 }
9b155405 4447 else if (k == CURLY) {
cb434fcc 4448 if (OP(o) == CURLYM || OP(o) == CURLYN || OP(o) == CURLYX)
cea2e8a9
GS
4449 Perl_sv_catpvf(aTHX_ sv, "[%d]", o->flags); /* Parenth number */
4450 Perl_sv_catpvf(aTHX_ sv, " {%d,%d}", ARG1(o), ARG2(o));
a0d0e21e 4451 }
2c2d71f5
JH
4452 else if (k == WHILEM && o->flags) /* Ordinal/of */
4453 Perl_sv_catpvf(aTHX_ sv, "[%d/%d]", o->flags & 0xf, o->flags>>4);
9b155405 4454 else if (k == REF || k == OPEN || k == CLOSE || k == GROUPP )
894356b3 4455 Perl_sv_catpvf(aTHX_ sv, "%d", (int)ARG(o)); /* Parenth number */
9b155405 4456 else if (k == LOGICAL)
04ebc1ab 4457 Perl_sv_catpvf(aTHX_ sv, "[%d]", o->flags); /* 2: embedded, otherwise 1 */
653099ff
GS
4458 else if (k == ANYOF) {
4459 int i, rangestart = -1;
ffc61ed2 4460 U8 flags = ANYOF_FLAGS(o);
19860706
JH
4461 const char * const anyofs[] = { /* Should be syncronized with
4462 * ANYOF_ #xdefines in regcomp.h */
653099ff
GS
4463 "\\w",
4464 "\\W",
4465 "\\s",
4466 "\\S",
4467 "\\d",
4468 "\\D",
4469 "[:alnum:]",
4470 "[:^alnum:]",
4471 "[:alpha:]",
4472 "[:^alpha:]",
4473 "[:ascii:]",
4474 "[:^ascii:]",
4475 "[:ctrl:]",
4476 "[:^ctrl:]",
4477 "[:graph:]",
4478 "[:^graph:]",
4479 "[:lower:]",
4480 "[:^lower:]",
4481 "[:print:]",
4482 "[:^print:]",
4483 "[:punct:]",
4484 "[:^punct:]",
4485 "[:upper:]",
aaa51d5e 4486 "[:^upper:]",
653099ff 4487 "[:xdigit:]",
aaa51d5e
JF
4488 "[:^xdigit:]",
4489 "[:space:]",
4490 "[:^space:]",
4491 "[:blank:]",
4492 "[:^blank:]"
653099ff
GS
4493 };
4494
19860706 4495 if (flags & ANYOF_LOCALE)
653099ff 4496 sv_catpv(sv, "{loc}");
19860706 4497 if (flags & ANYOF_FOLD)
653099ff
GS
4498 sv_catpv(sv, "{i}");
4499 Perl_sv_catpvf(aTHX_ sv, "[%s", PL_colors[0]);
19860706 4500 if (flags & ANYOF_INVERT)
653099ff 4501 sv_catpv(sv, "^");
ffc61ed2
JH
4502 for (i = 0; i <= 256; i++) {
4503 if (i < 256 && ANYOF_BITMAP_TEST(o,i)) {
4504 if (rangestart == -1)
4505 rangestart = i;
4506 } else if (rangestart != -1) {
4507 if (i <= rangestart + 3)
4508 for (; rangestart < i; rangestart++)
653099ff 4509 put_byte(sv, rangestart);
ffc61ed2
JH
4510 else {
4511 put_byte(sv, rangestart);
4512 sv_catpv(sv, "-");
4513 put_byte(sv, i - 1);
653099ff 4514 }
ffc61ed2 4515 rangestart = -1;
653099ff 4516 }
847a199f 4517 }
ffc61ed2
JH
4518
4519 if (o->flags & ANYOF_CLASS)
4520 for (i = 0; i < sizeof(anyofs)/sizeof(char*); i++)
4521 if (ANYOF_CLASS_TEST(o,i))
4522 sv_catpv(sv, anyofs[i]);
4523
4524 if (flags & ANYOF_UNICODE)
4525 sv_catpv(sv, "{unicode}");
1aa99e6b 4526 else if (flags & ANYOF_UNICODE_ALL)
2a782b5b 4527 sv_catpv(sv, "{unicode_all}");
ffc61ed2
JH
4528
4529 {
4530 SV *lv;
4531 SV *sw = regclass_swash(o, FALSE, &lv);
b81d288d 4532
ffc61ed2
JH
4533 if (lv) {
4534 if (sw) {
4535 UV i;
4536 U8 s[UTF8_MAXLEN+1];
b81d288d 4537
ffc61ed2 4538 for (i = 0; i <= 256; i++) { /* just the first 256 */
2b9d42f0 4539 U8 *e = uvchr_to_utf8(s, i);
ffc61ed2 4540
3568d838 4541 if (i < 256 && swash_fetch(sw, s, TRUE)) {
ffc61ed2
JH
4542 if (rangestart == -1)
4543 rangestart = i;
4544 } else if (rangestart != -1) {
4545 U8 *p;
b81d288d 4546
ffc61ed2
JH
4547 if (i <= rangestart + 3)
4548 for (; rangestart < i; rangestart++) {
2b9d42f0 4549 for(e = uvchr_to_utf8(s, rangestart), p = s; p < e; p++)
ffc61ed2
JH
4550 put_byte(sv, *p);
4551 }
4552 else {
2b9d42f0 4553 for (e = uvchr_to_utf8(s, rangestart), p = s; p < e; p++)
ffc61ed2
JH
4554 put_byte(sv, *p);
4555 sv_catpv(sv, "-");
2b9d42f0 4556 for (e = uvchr_to_utf8(s, i - 1), p = s; p < e; p++)
ffc61ed2
JH
4557 put_byte(sv, *p);
4558 }
4559 rangestart = -1;
4560 }
19860706 4561 }
ffc61ed2
JH
4562
4563 sv_catpv(sv, "..."); /* et cetera */
19860706 4564 }
fde631ed 4565
ffc61ed2
JH
4566 {
4567 char *s = savepv(SvPVX(lv));
4568 char *origs = s;
b81d288d 4569
ffc61ed2 4570 while(*s && *s != '\n') s++;
b81d288d 4571
ffc61ed2
JH
4572 if (*s == '\n') {
4573 char *t = ++s;
4574
4575 while (*s) {
4576 if (*s == '\n')
4577 *s = ' ';
4578 s++;
4579 }
4580 if (s[-1] == ' ')
4581 s[-1] = 0;
4582
4583 sv_catpv(sv, t);
fde631ed 4584 }
b81d288d 4585
ffc61ed2 4586 Safefree(origs);
fde631ed
JH
4587 }
4588 }
653099ff 4589 }
ffc61ed2 4590
653099ff
GS
4591 Perl_sv_catpvf(aTHX_ sv, "%s]", PL_colors[1]);
4592 }
9b155405 4593 else if (k == BRANCHJ && (OP(o) == UNLESSM || OP(o) == IFMATCH))
cea2e8a9 4594 Perl_sv_catpvf(aTHX_ sv, "[-%d]", o->flags);
17c3b450 4595#endif /* DEBUGGING */
35ff7856 4596}
a687059c 4597
cad2e5aa
JH
4598SV *
4599Perl_re_intuit_string(pTHX_ regexp *prog)
4600{ /* Assume that RE_INTUIT is set */
4601 DEBUG_r(
4602 { STRLEN n_a;
4603 char *s = SvPV(prog->check_substr,n_a);
4604
4605 if (!PL_colorset) reginitcolors();
4606 PerlIO_printf(Perl_debug_log,
4607 "%sUsing REx substr:%s `%s%.60s%s%s'\n",
4608 PL_colors[4],PL_colors[5],PL_colors[0],
4609 s,
4610 PL_colors[1],
4611 (strlen(s) > 60 ? "..." : ""));
4612 } );
4613
4614 return prog->check_substr;
4615}
4616
2b69d0c2 4617void
864dbfa3 4618Perl_pregfree(pTHX_ struct regexp *r)
a687059c 4619{
adac82c7 4620 DEBUG_r(if (!PL_colorset) reginitcolors());
7821416a
IZ
4621
4622 if (!r || (--r->refcnt > 0))
4623 return;
cad2e5aa
JH
4624 DEBUG_r(PerlIO_printf(Perl_debug_log,
4625 "%sFreeing REx:%s `%s%.60s%s%s'\n",
4626 PL_colors[4],PL_colors[5],PL_colors[0],
4627 r->precomp,
4628 PL_colors[1],
4629 (strlen(r->precomp) > 60 ? "..." : "")));
4630
c277df42 4631 if (r->precomp)
a0d0e21e 4632 Safefree(r->precomp);
fac92740
MJD
4633 if (r->offsets) /* 20010421 MJD */
4634 Safefree(r->offsets);
cf93c79d
IZ
4635 if (RX_MATCH_COPIED(r))
4636 Safefree(r->subbeg);
a193d654
GS
4637 if (r->substrs) {
4638 if (r->anchored_substr)
4639 SvREFCNT_dec(r->anchored_substr);
4640 if (r->float_substr)
4641 SvREFCNT_dec(r->float_substr);
2779dcf1 4642 Safefree(r->substrs);
a193d654 4643 }
c277df42
IZ
4644 if (r->data) {
4645 int n = r->data->count;
dfad63ad
HS
4646 AV* new_comppad = NULL;
4647 AV* old_comppad;
4648 SV** old_curpad;
4649
c277df42 4650 while (--n >= 0) {
261faec3 4651 /* If you add a ->what type here, update the comment in regcomp.h */
c277df42
IZ
4652 switch (r->data->what[n]) {
4653 case 's':
4654 SvREFCNT_dec((SV*)r->data->data[n]);
4655 break;
653099ff
GS
4656 case 'f':
4657 Safefree(r->data->data[n]);
4658 break;
dfad63ad
HS
4659 case 'p':
4660 new_comppad = (AV*)r->data->data[n];
4661 break;
c277df42 4662 case 'o':
dfad63ad 4663 if (new_comppad == NULL)
cea2e8a9 4664 Perl_croak(aTHX_ "panic: pregfree comppad");
dfad63ad
HS
4665 old_comppad = PL_comppad;
4666 old_curpad = PL_curpad;
1e6dc0b6
SB
4667 /* Watch out for global destruction's random ordering. */
4668 if (SvTYPE(new_comppad) == SVt_PVAV) {
4669 PL_comppad = new_comppad;
4670 PL_curpad = AvARRAY(new_comppad);
4671 }
4672 else
4673 PL_curpad = NULL;
9b978d73
DM
4674
4675 if (!OpREFCNT_dec((OP_4tree*)r->data->data[n])) {
4676 op_free((OP_4tree*)r->data->data[n]);
4677 }
4678
dfad63ad
HS
4679 PL_comppad = old_comppad;
4680 PL_curpad = old_curpad;
4681 SvREFCNT_dec((SV*)new_comppad);
4682 new_comppad = NULL;
c277df42
IZ
4683 break;
4684 case 'n':
4685 break;
4686 default:
830247a4 4687 Perl_croak(aTHX_ "panic: regfree data code '%c'", r->data->what[n]);
c277df42
IZ
4688 }
4689 }
4690 Safefree(r->data->what);
4691 Safefree(r->data);
a0d0e21e
LW
4692 }
4693 Safefree(r->startp);
4694 Safefree(r->endp);
4695 Safefree(r);
a687059c 4696}
c277df42
IZ
4697
4698/*
4699 - regnext - dig the "next" pointer out of a node
4700 *
4701 * [Note, when REGALIGN is defined there are two places in regmatch()
4702 * that bypass this code for speed.]
4703 */
4704regnode *
864dbfa3 4705Perl_regnext(pTHX_ register regnode *p)
c277df42
IZ
4706{
4707 register I32 offset;
4708
3280af22 4709 if (p == &PL_regdummy)
c277df42
IZ
4710 return(NULL);
4711
4712 offset = (reg_off_by_arg[OP(p)] ? ARG(p) : NEXT_OFF(p));
4713 if (offset == 0)
4714 return(NULL);
4715
c277df42 4716 return(p+offset);
c277df42
IZ
4717}
4718
01f988be 4719STATIC void
cea2e8a9 4720S_re_croak2(pTHX_ const char* pat1,const char* pat2,...)
c277df42
IZ
4721{
4722 va_list args;
4723 STRLEN l1 = strlen(pat1);
4724 STRLEN l2 = strlen(pat2);
4725 char buf[512];
06bf62c7 4726 SV *msv;
c277df42
IZ
4727 char *message;
4728
4729 if (l1 > 510)
4730 l1 = 510;
4731 if (l1 + l2 > 510)
4732 l2 = 510 - l1;
4733 Copy(pat1, buf, l1 , char);
4734 Copy(pat2, buf + l1, l2 , char);
3b818b81
GS
4735 buf[l1 + l2] = '\n';
4736 buf[l1 + l2 + 1] = '\0';
8736538c
AS
4737#ifdef I_STDARG
4738 /* ANSI variant takes additional second argument */
c277df42 4739 va_start(args, pat2);
8736538c
AS
4740#else
4741 va_start(args);
4742#endif
5a844595 4743 msv = vmess(buf, &args);
c277df42 4744 va_end(args);
06bf62c7 4745 message = SvPV(msv,l1);
c277df42
IZ
4746 if (l1 > 512)
4747 l1 = 512;
4748 Copy(message, buf, l1 , char);
4749 buf[l1] = '\0'; /* Overwrite \n */
cea2e8a9 4750 Perl_croak(aTHX_ "%s", buf);
c277df42 4751}
a0ed51b3
LW
4752
4753/* XXX Here's a total kludge. But we need to re-enter for swash routines. */
4754
4755void
864dbfa3 4756Perl_save_re_context(pTHX)
b81d288d 4757{
830247a4
IZ
4758#if 0
4759 SAVEPPTR(RExC_precomp); /* uncompiled string. */
4760 SAVEI32(RExC_npar); /* () count. */
4761 SAVEI32(RExC_size); /* Code size. */
4762 SAVEI16(RExC_flags16); /* are we folding, multilining? */
4763 SAVEVPTR(RExC_rx); /* from regcomp.c */
4764 SAVEI32(RExC_seen); /* from regcomp.c */
4765 SAVEI32(RExC_sawback); /* Did we see \1, ...? */
4766 SAVEI32(RExC_naughty); /* How bad is this pattern? */
4767 SAVEVPTR(RExC_emit); /* Code-emit pointer; &regdummy = don't */
4768 SAVEPPTR(RExC_end); /* End of input for compile */
4769 SAVEPPTR(RExC_parse); /* Input-scan pointer. */
4770#endif
4771
4772 SAVEI32(PL_reg_flags); /* from regexec.c */
a0ed51b3 4773 SAVEPPTR(PL_bostr);
a0ed51b3
LW
4774 SAVEPPTR(PL_reginput); /* String-input pointer. */
4775 SAVEPPTR(PL_regbol); /* Beginning of input, for ^ check. */
4776 SAVEPPTR(PL_regeol); /* End of input, for $ check. */
7766f137
GS
4777 SAVEVPTR(PL_regstartp); /* Pointer to startp array. */
4778 SAVEVPTR(PL_regendp); /* Ditto for endp. */
4779 SAVEVPTR(PL_reglastparen); /* Similarly for lastparen. */
a0ed51b3 4780 SAVEPPTR(PL_regtill); /* How far we are required to go. */
b81d288d 4781 SAVEGENERICPV(PL_reg_start_tmp); /* from regexec.c */
a0ed51b3 4782 PL_reg_start_tmp = 0;
a0ed51b3
LW
4783 SAVEI32(PL_reg_start_tmpl); /* from regexec.c */
4784 PL_reg_start_tmpl = 0;
7766f137 4785 SAVEVPTR(PL_regdata);
a0ed51b3
LW
4786 SAVEI32(PL_reg_eval_set); /* from regexec.c */
4787 SAVEI32(PL_regnarrate); /* from regexec.c */
7766f137 4788 SAVEVPTR(PL_regprogram); /* from regexec.c */
a0ed51b3 4789 SAVEINT(PL_regindent); /* from regexec.c */
7766f137
GS
4790 SAVEVPTR(PL_regcc); /* from regexec.c */
4791 SAVEVPTR(PL_curcop);
7766f137
GS
4792 SAVEVPTR(PL_reg_call_cc); /* from regexec.c */
4793 SAVEVPTR(PL_reg_re); /* from regexec.c */
54b6e2fa
IZ
4794 SAVEPPTR(PL_reg_ganch); /* from regexec.c */
4795 SAVESPTR(PL_reg_sv); /* from regexec.c */
53c4c00c 4796 SAVEI8(PL_reg_match_utf8); /* from regexec.c */
7766f137 4797 SAVEVPTR(PL_reg_magic); /* from regexec.c */
54b6e2fa 4798 SAVEI32(PL_reg_oldpos); /* from regexec.c */
7766f137
GS
4799 SAVEVPTR(PL_reg_oldcurpm); /* from regexec.c */
4800 SAVEVPTR(PL_reg_curpm); /* from regexec.c */
5fb7366e 4801 SAVEI32(PL_regnpar); /* () count. */
e49a9654 4802 SAVEI32(PL_regsize); /* from regexec.c */
54b6e2fa 4803#ifdef DEBUGGING
b81d288d 4804 SAVEPPTR(PL_reg_starttry); /* from regexec.c */
54b6e2fa 4805#endif
a0ed51b3 4806}
51371543 4807
51371543 4808static void
acfe0abc 4809clear_re(pTHX_ void *r)
51371543
GS
4810{
4811 ReREFCNT_dec((regexp *)r);
4812}
ffbc6a93 4813