This is a live mirror of the Perl 5 development currently hosted at https://github.com/perl/perl5
Can't count parentheses.
[perl5.git] / regcomp.c
CommitLineData
a0d0e21e
LW
1/* regcomp.c
2 */
3
4/*
5 * "A fair jaw-cracker dwarf-language must be." --Samwise Gamgee
6 */
7
a687059c
LW
8/* NOTE: this is derived from Henry Spencer's regexp code, and should not
9 * confused with the original package (see point 3 below). Thanks, Henry!
10 */
11
12/* Additional note: this code is very heavily munged from Henry's version
13 * in places. In some spots I've traded clarity for efficiency, so don't
14 * blame Henry for some of the lack of readability.
15 */
16
e50aee73
AD
17/* The names of the functions have been changed from regcomp and
18 * regexec to pregcomp and pregexec in order to avoid conflicts
19 * with the POSIX routines of the same names.
20*/
21
b9d5759e
AD
22#ifdef PERL_EXT_RE_BUILD
23/* need to replace pregcomp et al, so enable that */
24# ifndef PERL_IN_XSUB_RE
25# define PERL_IN_XSUB_RE
26# endif
27/* need access to debugger hooks */
cad2e5aa 28# if defined(PERL_EXT_RE_DEBUG) && !defined(DEBUGGING)
b9d5759e
AD
29# define DEBUGGING
30# endif
31#endif
32
33#ifdef PERL_IN_XSUB_RE
d06ea78c 34/* We *really* need to overwrite these symbols: */
56953603
IZ
35# define Perl_pregcomp my_regcomp
36# define Perl_regdump my_regdump
37# define Perl_regprop my_regprop
d06ea78c 38# define Perl_pregfree my_regfree
cad2e5aa
JH
39# define Perl_re_intuit_string my_re_intuit_string
40/* *These* symbols are masked to allow static link. */
d06ea78c 41# define Perl_regnext my_regnext
f0b8d043 42# define Perl_save_re_context my_save_re_context
b81d288d 43# define Perl_reginitcolors my_reginitcolors
c5be433b
GS
44
45# define PERL_NO_GET_CONTEXT
b81d288d 46#endif
56953603 47
f0fcb552 48/*SUPPRESS 112*/
a687059c 49/*
e50aee73 50 * pregcomp and pregexec -- regsub and regerror are not used in perl
a687059c
LW
51 *
52 * Copyright (c) 1986 by University of Toronto.
53 * Written by Henry Spencer. Not derived from licensed software.
54 *
55 * Permission is granted to anyone to use this software for any
56 * purpose on any computer system, and to redistribute it freely,
57 * subject to the following restrictions:
58 *
59 * 1. The author is not responsible for the consequences of use of
60 * this software, no matter how awful, even if they arise
61 * from defects in it.
62 *
63 * 2. The origin of this software must not be misrepresented, either
64 * by explicit claim or by omission.
65 *
66 * 3. Altered versions must be plainly marked as such, and must not
67 * be misrepresented as being the original software.
68 *
69 *
70 **** Alterations to Henry's code are...
71 ****
bc89e66f 72 **** Copyright (c) 1991-2001, Larry Wall
a687059c 73 ****
9ef589d8
LW
74 **** You may distribute under the terms of either the GNU General Public
75 **** License or the Artistic License, as specified in the README file.
76
a687059c
LW
77 *
78 * Beware that some of this code is subtly aware of the way operator
79 * precedence is structured in regular expressions. Serious changes in
80 * regular-expression syntax might require a total rethink.
81 */
82#include "EXTERN.h"
864dbfa3 83#define PERL_IN_REGCOMP_C
a687059c 84#include "perl.h"
d06ea78c 85
acfe0abc 86#ifndef PERL_IN_XSUB_RE
d06ea78c
GS
87# include "INTERN.h"
88#endif
c277df42
IZ
89
90#define REG_COMP_C
a687059c
LW
91#include "regcomp.h"
92
d4cce5f1 93#ifdef op
11343788 94#undef op
d4cce5f1 95#endif /* op */
11343788 96
fe14fcc3
LW
97#ifdef MSDOS
98# if defined(BUGGY_MSC6)
99 /* MSC 6.00A breaks on op/regexp.t test 85 unless we turn this off */
100 # pragma optimize("a",off)
101 /* But MSC 6.00A is happy with 'w', for aliases only across function calls*/
102 # pragma optimize("w",on )
103# endif /* BUGGY_MSC6 */
104#endif /* MSDOS */
105
a687059c
LW
106#ifndef STATIC
107#define STATIC static
108#endif
109
830247a4
IZ
110typedef struct RExC_state_t {
111 U16 flags16; /* are we folding, multilining? */
112 char *precomp; /* uncompiled string. */
113 regexp *rx;
fac92740 114 char *start; /* Start of input for compile */
830247a4
IZ
115 char *end; /* End of input for compile */
116 char *parse; /* Input-scan pointer. */
117 I32 whilem_seen; /* number of WHILEM in this expr */
fac92740 118 regnode *emit_start; /* Start of emitted-code area */
ffc61ed2 119 regnode *emit; /* Code-emit pointer; &regdummy = don't = compiling */
830247a4
IZ
120 I32 naughty; /* How bad is this pattern? */
121 I32 sawback; /* Did we see \1, ...? */
122 U32 seen;
123 I32 size; /* Code size. */
124 I32 npar; /* () count. */
125 I32 extralen;
126 I32 seen_zerolen;
127 I32 seen_evals;
1aa99e6b 128 I32 utf8;
830247a4
IZ
129#if ADD_TO_REGEXEC
130 char *starttry; /* -Dr: where regtry was called. */
131#define RExC_starttry (pRExC_state->starttry)
132#endif
133} RExC_state_t;
134
135#define RExC_flags16 (pRExC_state->flags16)
136#define RExC_precomp (pRExC_state->precomp)
137#define RExC_rx (pRExC_state->rx)
fac92740 138#define RExC_start (pRExC_state->start)
830247a4
IZ
139#define RExC_end (pRExC_state->end)
140#define RExC_parse (pRExC_state->parse)
141#define RExC_whilem_seen (pRExC_state->whilem_seen)
fac92740 142#define RExC_offsets (pRExC_state->rx->offsets) /* I am not like the others */
830247a4 143#define RExC_emit (pRExC_state->emit)
fac92740 144#define RExC_emit_start (pRExC_state->emit_start)
830247a4
IZ
145#define RExC_naughty (pRExC_state->naughty)
146#define RExC_sawback (pRExC_state->sawback)
147#define RExC_seen (pRExC_state->seen)
148#define RExC_size (pRExC_state->size)
149#define RExC_npar (pRExC_state->npar)
150#define RExC_extralen (pRExC_state->extralen)
151#define RExC_seen_zerolen (pRExC_state->seen_zerolen)
152#define RExC_seen_evals (pRExC_state->seen_evals)
1aa99e6b 153#define RExC_utf8 (pRExC_state->utf8)
830247a4 154
a687059c
LW
155#define ISMULT1(c) ((c) == '*' || (c) == '+' || (c) == '?')
156#define ISMULT2(s) ((*s) == '*' || (*s) == '+' || (*s) == '?' || \
157 ((*s) == '{' && regcurly(s)))
a687059c 158
35c8bce7
LW
159#ifdef SPSTART
160#undef SPSTART /* dratted cpp namespace... */
161#endif
a687059c
LW
162/*
163 * Flags to be passed up and down.
164 */
a687059c 165#define WORST 0 /* Worst case. */
821b33a5 166#define HASWIDTH 0x1 /* Known to match non-null strings. */
a0d0e21e
LW
167#define SIMPLE 0x2 /* Simple enough to be STAR/PLUS operand. */
168#define SPSTART 0x4 /* Starts with * or +. */
169#define TRYAGAIN 0x8 /* Weeded out a declaration. */
a687059c 170
2c2d71f5
JH
171/* Length of a variant. */
172
173typedef struct scan_data_t {
174 I32 len_min;
175 I32 len_delta;
176 I32 pos_min;
177 I32 pos_delta;
178 SV *last_found;
179 I32 last_end; /* min value, <0 unless valid. */
180 I32 last_start_min;
181 I32 last_start_max;
182 SV **longest; /* Either &l_fixed, or &l_float. */
183 SV *longest_fixed;
184 I32 offset_fixed;
185 SV *longest_float;
186 I32 offset_float_min;
187 I32 offset_float_max;
188 I32 flags;
189 I32 whilem_c;
cb434fcc 190 I32 *last_closep;
653099ff 191 struct regnode_charclass_class *start_class;
2c2d71f5
JH
192} scan_data_t;
193
a687059c 194/*
e50aee73 195 * Forward declarations for pregcomp()'s friends.
a687059c 196 */
a0d0e21e 197
b81d288d 198static scan_data_t zero_scan_data = { 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
cb434fcc 199 0, 0, 0, 0, 0, 0};
c277df42
IZ
200
201#define SF_BEFORE_EOL (SF_BEFORE_SEOL|SF_BEFORE_MEOL)
202#define SF_BEFORE_SEOL 0x1
203#define SF_BEFORE_MEOL 0x2
204#define SF_FIX_BEFORE_EOL (SF_FIX_BEFORE_SEOL|SF_FIX_BEFORE_MEOL)
205#define SF_FL_BEFORE_EOL (SF_FL_BEFORE_SEOL|SF_FL_BEFORE_MEOL)
206
09b7f37c
CB
207#ifdef NO_UNARY_PLUS
208# define SF_FIX_SHIFT_EOL (0+2)
209# define SF_FL_SHIFT_EOL (0+4)
210#else
211# define SF_FIX_SHIFT_EOL (+2)
212# define SF_FL_SHIFT_EOL (+4)
213#endif
c277df42
IZ
214
215#define SF_FIX_BEFORE_SEOL (SF_BEFORE_SEOL << SF_FIX_SHIFT_EOL)
216#define SF_FIX_BEFORE_MEOL (SF_BEFORE_MEOL << SF_FIX_SHIFT_EOL)
217
218#define SF_FL_BEFORE_SEOL (SF_BEFORE_SEOL << SF_FL_SHIFT_EOL)
219#define SF_FL_BEFORE_MEOL (SF_BEFORE_MEOL << SF_FL_SHIFT_EOL) /* 0x20 */
220#define SF_IS_INF 0x40
221#define SF_HAS_PAR 0x80
222#define SF_IN_PAR 0x100
223#define SF_HAS_EVAL 0x200
4bfe0158 224#define SCF_DO_SUBSTR 0x400
653099ff
GS
225#define SCF_DO_STCLASS_AND 0x0800
226#define SCF_DO_STCLASS_OR 0x1000
227#define SCF_DO_STCLASS (SCF_DO_STCLASS_AND|SCF_DO_STCLASS_OR)
e1901655 228#define SCF_WHILEM_VISITED_POS 0x2000
c277df42 229
1aa99e6b 230#define UTF RExC_utf8
830247a4
IZ
231#define LOC (RExC_flags16 & PMf_LOCALE)
232#define FOLD (RExC_flags16 & PMf_FOLD)
a0ed51b3 233
ffc61ed2 234#define OOB_UNICODE 12345678
93733859 235#define OOB_NAMEDCLASS -1
b8c5462f 236
a0ed51b3
LW
237#define CHR_SVLEN(sv) (UTF ? sv_len_utf8(sv) : SvCUR(sv))
238#define CHR_DIST(a,b) (UTF ? utf8_distance(a,b) : a - b)
239
8615cb43 240
b45f050a
JF
241/* length of regex to show in messages that don't mark a position within */
242#define RegexLengthToShowInErrorMessages 127
243
244/*
245 * If MARKER[12] are adjusted, be sure to adjust the constants at the top
246 * of t/op/regmesg.t, the tests in t/op/re_tests, and those in
247 * op/pragma/warn/regcomp.
248 */
7253e4e3
RK
249#define MARKER1 "<-- HERE" /* marker as it appears in the description */
250#define MARKER2 " <-- HERE " /* marker as it appears within the regex */
b81d288d 251
7253e4e3 252#define REPORT_LOCATION " in regex; marked by " MARKER1 " in m/%.*s" MARKER2 "%s/"
b45f050a
JF
253
254/*
255 * Calls SAVEDESTRUCTOR_X if needed, then calls Perl_croak with the given
256 * arg. Show regex, up to a maximum length. If it's too long, chop and add
257 * "...".
258 */
a4eb266f 259#define FAIL(msg) \
8615cb43 260 STMT_START { \
a4eb266f 261 char *ellipses = ""; \
830247a4 262 unsigned len = strlen(RExC_precomp); \
b45f050a 263 \
8615cb43 264 if (!SIZE_ONLY) \
830247a4 265 SAVEDESTRUCTOR_X(clear_re,(void*)RExC_rx); \
b45f050a
JF
266 \
267 if (len > RegexLengthToShowInErrorMessages) { \
268 /* chop 10 shorter than the max, to ensure meaning of "..." */ \
269 len = RegexLengthToShowInErrorMessages - 10; \
a4eb266f 270 ellipses = "..."; \
b45f050a
JF
271 } \
272 Perl_croak(aTHX_ "%s in regex m/%.*s%s/", \
830247a4 273 msg, (int)len, RExC_precomp, ellipses); \
8615cb43
JF
274 } STMT_END
275
b45f050a
JF
276/*
277 * Calls SAVEDESTRUCTOR_X if needed, then calls Perl_croak with the given
278 * args. Show regex, up to a maximum length. If it's too long, chop and add
279 * "...".
280 */
a4eb266f 281#define FAIL2(pat,msg) \
8615cb43 282 STMT_START { \
a4eb266f 283 char *ellipses = ""; \
830247a4 284 unsigned len = strlen(RExC_precomp); \
b45f050a 285 \
8615cb43 286 if (!SIZE_ONLY) \
830247a4 287 SAVEDESTRUCTOR_X(clear_re,(void*)RExC_rx); \
8615cb43 288 \
b45f050a
JF
289 if (len > RegexLengthToShowInErrorMessages) { \
290 /* chop 10 shorter than the max, to ensure meaning of "..." */ \
291 len = RegexLengthToShowInErrorMessages - 10; \
a4eb266f 292 ellipses = "..."; \
b45f050a
JF
293 } \
294 S_re_croak2(aTHX_ pat, " in regex m/%.*s%s/", \
830247a4 295 msg, (int)len, RExC_precomp, ellipses); \
b45f050a
JF
296 } STMT_END
297
298
299/*
300 * Simple_vFAIL -- like FAIL, but marks the current location in the scan
301 */
302#define Simple_vFAIL(m) \
303 STMT_START { \
830247a4 304 unsigned offset = strlen(RExC_precomp)-(RExC_end-RExC_parse); \
b45f050a
JF
305 \
306 Perl_croak(aTHX_ "%s" REPORT_LOCATION, \
830247a4 307 m, (int)offset, RExC_precomp, RExC_precomp + offset); \
b45f050a
JF
308 } STMT_END
309
310/*
311 * Calls SAVEDESTRUCTOR_X if needed, then Simple_vFAIL()
312 */
313#define vFAIL(m) \
314 STMT_START { \
315 if (!SIZE_ONLY) \
830247a4 316 SAVEDESTRUCTOR_X(clear_re,(void*)RExC_rx); \
b45f050a
JF
317 Simple_vFAIL(m); \
318 } STMT_END
319
320/*
321 * Like Simple_vFAIL(), but accepts two arguments.
322 */
323#define Simple_vFAIL2(m,a1) \
324 STMT_START { \
830247a4 325 unsigned offset = strlen(RExC_precomp)-(RExC_end-RExC_parse); \
b45f050a
JF
326 \
327 S_re_croak2(aTHX_ m, REPORT_LOCATION, a1, \
830247a4 328 (int)offset, RExC_precomp, RExC_precomp + offset); \
b45f050a
JF
329 } STMT_END
330
331/*
332 * Calls SAVEDESTRUCTOR_X if needed, then Simple_vFAIL2().
333 */
334#define vFAIL2(m,a1) \
335 STMT_START { \
336 if (!SIZE_ONLY) \
830247a4 337 SAVEDESTRUCTOR_X(clear_re,(void*)RExC_rx); \
b45f050a
JF
338 Simple_vFAIL2(m, a1); \
339 } STMT_END
340
341
342/*
343 * Like Simple_vFAIL(), but accepts three arguments.
344 */
345#define Simple_vFAIL3(m, a1, a2) \
346 STMT_START { \
830247a4 347 unsigned offset = strlen(RExC_precomp)-(RExC_end-RExC_parse); \
b45f050a
JF
348 \
349 S_re_croak2(aTHX_ m, REPORT_LOCATION, a1, a2, \
830247a4 350 (int)offset, RExC_precomp, RExC_precomp + offset); \
b45f050a
JF
351 } STMT_END
352
353/*
354 * Calls SAVEDESTRUCTOR_X if needed, then Simple_vFAIL3().
355 */
356#define vFAIL3(m,a1,a2) \
357 STMT_START { \
358 if (!SIZE_ONLY) \
830247a4 359 SAVEDESTRUCTOR_X(clear_re,(void*)RExC_rx); \
b45f050a
JF
360 Simple_vFAIL3(m, a1, a2); \
361 } STMT_END
362
363/*
364 * Like Simple_vFAIL(), but accepts four arguments.
365 */
366#define Simple_vFAIL4(m, a1, a2, a3) \
367 STMT_START { \
830247a4 368 unsigned offset = strlen(RExC_precomp)-(RExC_end-RExC_parse); \
b45f050a
JF
369 \
370 S_re_croak2(aTHX_ m, REPORT_LOCATION, a1, a2, a3,\
830247a4 371 (int)offset, RExC_precomp, RExC_precomp + offset); \
b45f050a
JF
372 } STMT_END
373
374/*
375 * Like Simple_vFAIL(), but accepts five arguments.
376 */
377#define Simple_vFAIL5(m, a1, a2, a3, a4) \
378 STMT_START { \
830247a4 379 unsigned offset = strlen(RExC_precomp)-(RExC_end-RExC_parse); \
b45f050a 380 S_re_croak2(aTHX_ m, REPORT_LOCATION, a1, a2, a3, a4,\
830247a4 381 (int)offset, RExC_precomp, RExC_precomp + offset); \
b45f050a
JF
382 } STMT_END
383
384
385#define vWARN(loc,m) \
386 STMT_START { \
830247a4 387 unsigned offset = strlen(RExC_precomp)-(RExC_end-(loc)); \
b45f050a 388 Perl_warner(aTHX_ WARN_REGEXP, "%s" REPORT_LOCATION,\
830247a4 389 m, (int)offset, RExC_precomp, RExC_precomp + offset); \
b45f050a
JF
390 } STMT_END \
391
f9373011
PM
392#define vWARNdep(loc,m) \
393 STMT_START { \
394 unsigned offset = strlen(RExC_precomp)-(RExC_end-(loc)); \
395 int warn_cat = ckWARN(WARN_REGEXP) ? WARN_REGEXP : WARN_DEPRECATED; \
396 Perl_warner(aTHX_ warn_cat, "%s" REPORT_LOCATION,\
397 m, (int)offset, RExC_precomp, RExC_precomp + offset); \
398 } STMT_END \
399
b45f050a
JF
400
401#define vWARN2(loc, m, a1) \
402 STMT_START { \
830247a4 403 unsigned offset = strlen(RExC_precomp)-(RExC_end-(loc)); \
b45f050a
JF
404 Perl_warner(aTHX_ WARN_REGEXP, m REPORT_LOCATION,\
405 a1, \
830247a4 406 (int)offset, RExC_precomp, RExC_precomp + offset); \
b45f050a
JF
407 } STMT_END
408
409#define vWARN3(loc, m, a1, a2) \
410 STMT_START { \
830247a4 411 unsigned offset = strlen(RExC_precomp) - (RExC_end - (loc)); \
b45f050a
JF
412 Perl_warner(aTHX_ WARN_REGEXP, m REPORT_LOCATION, \
413 a1, a2, \
830247a4 414 (int)offset, RExC_precomp, RExC_precomp + offset); \
b45f050a
JF
415 } STMT_END
416
417#define vWARN4(loc, m, a1, a2, a3) \
418 STMT_START { \
830247a4 419 unsigned offset = strlen(RExC_precomp)-(RExC_end-(loc)); \
b45f050a
JF
420 Perl_warner(aTHX_ WARN_REGEXP, m REPORT_LOCATION,\
421 a1, a2, a3, \
830247a4 422 (int)offset, RExC_precomp, RExC_precomp + offset); \
8615cb43
JF
423 } STMT_END
424
9d1d55b5
JP
425/* used for the parse_flags section for (?c) -- japhy */
426#define vWARN5(loc, m, a1, a2, a3, a4) \
427 STMT_START { \
428 unsigned offset = strlen(RExC_precomp)-(RExC_end-(loc)); \
429 Perl_warner(aTHX_ WARN_REGEXP, m REPORT_LOCATION, \
430 a1, a2, a3, a4, \
431 (int)offset, RExC_precomp, RExC_precomp + offset); \
432 } STMT_END
433
8615cb43 434
cd439c50 435/* Allow for side effects in s */
fd3f0ae2 436#define REGC(c,s) STMT_START { if (!SIZE_ONLY) *(s) = (c); else (void)(s);} STMT_END
cd439c50 437
fac92740
MJD
438/* Macros for recording node offsets. 20001227 mjd@plover.com
439 * Nodes are numbered 1, 2, 3, 4. Node #n's position is recorded in
440 * element 2*n-1 of the array. Element #2n holds the byte length node #n.
441 * Element 0 holds the number n.
442 */
443
444#define MJD_OFFSET_DEBUG(x)
445/* #define MJD_OFFSET_DEBUG(x) fprintf x */
446
447
448# define Set_Node_Offset_To_R(node,byte) \
449 STMT_START { \
450 if (! SIZE_ONLY) { \
451 if((node) < 0) { \
452 Perl_croak(aTHX_ "value of node is %d in Offset macro", node); \
453 } else { \
454 RExC_offsets[2*(node)-1] = (byte); \
455 } \
456 } \
457 } STMT_END
458
459# define Set_Node_Offset(node,byte) Set_Node_Offset_To_R((node)-RExC_emit_start, (byte)-RExC_start)
460# define Set_Cur_Node_Offset Set_Node_Offset(RExC_emit, RExC_parse)
461
462# define Set_Node_Length_To_R(node,len) \
463 STMT_START { \
464 if (! SIZE_ONLY) { \
465 MJD_OFFSET_DEBUG((stderr, "** (%d) size of node %d is %d.\n", __LINE__, (node), (len))); \
466 if((node) < 0) { \
467 Perl_croak(aTHX_ "value of node is %d in Length macro", node); \
468 } else { \
469 RExC_offsets[2*(node)] = (len); \
470 } \
471 } \
472 } STMT_END
473
474# define Set_Node_Length(node,len) Set_Node_Length_To_R((node)-RExC_emit_start, len)
475# define Set_Cur_Node_Length(len) Set_Node_Length(RExC_emit, len)
476# define Set_Node_Cur_Length(node) Set_Node_Length(node, RExC_parse - parse_start)
477
478/* Get offsets and lengths */
479#define Node_Offset(n) (RExC_offsets[2*((n)-RExC_emit_start)-1])
480#define Node_Length(n) (RExC_offsets[2*((n)-RExC_emit_start)])
481
acfe0abc 482static void clear_re(pTHX_ void *r);
4327152a 483
653099ff
GS
484/* Mark that we cannot extend a found fixed substring at this point.
485 Updata the longest found anchored substring and the longest found
486 floating substrings if needed. */
487
4327152a 488STATIC void
830247a4 489S_scan_commit(pTHX_ RExC_state_t *pRExC_state, scan_data_t *data)
c277df42 490{
a0ed51b3
LW
491 STRLEN l = CHR_SVLEN(data->last_found);
492 STRLEN old_l = CHR_SVLEN(*data->longest);
b81d288d 493
c277df42
IZ
494 if ((l >= old_l) && ((l > old_l) || (data->flags & SF_BEFORE_EOL))) {
495 sv_setsv(*data->longest, data->last_found);
496 if (*data->longest == data->longest_fixed) {
497 data->offset_fixed = l ? data->last_start_min : data->pos_min;
498 if (data->flags & SF_BEFORE_EOL)
b81d288d 499 data->flags
c277df42
IZ
500 |= ((data->flags & SF_BEFORE_EOL) << SF_FIX_SHIFT_EOL);
501 else
502 data->flags &= ~SF_FIX_BEFORE_EOL;
a0ed51b3
LW
503 }
504 else {
c277df42 505 data->offset_float_min = l ? data->last_start_min : data->pos_min;
b81d288d
AB
506 data->offset_float_max = (l
507 ? data->last_start_max
c277df42
IZ
508 : data->pos_min + data->pos_delta);
509 if (data->flags & SF_BEFORE_EOL)
b81d288d 510 data->flags
c277df42
IZ
511 |= ((data->flags & SF_BEFORE_EOL) << SF_FL_SHIFT_EOL);
512 else
513 data->flags &= ~SF_FL_BEFORE_EOL;
514 }
515 }
516 SvCUR_set(data->last_found, 0);
517 data->last_end = -1;
518 data->flags &= ~SF_BEFORE_EOL;
519}
520
653099ff
GS
521/* Can match anything (initialization) */
522STATIC void
830247a4 523S_cl_anything(pTHX_ RExC_state_t *pRExC_state, struct regnode_charclass_class *cl)
653099ff 524{
9af0acdb
JH
525 int value;
526
653099ff 527 ANYOF_CLASS_ZERO(cl);
9af0acdb
JH
528 for (value = 0; value < 256; ++value)
529 ANYOF_BITMAP_SET(cl, value);
1aa99e6b 530 cl->flags = ANYOF_EOS|ANYOF_UNICODE_ALL;
653099ff
GS
531 if (LOC)
532 cl->flags |= ANYOF_LOCALE;
533}
534
535/* Can match anything (initialization) */
536STATIC int
537S_cl_is_anything(pTHX_ struct regnode_charclass_class *cl)
538{
539 int value;
540
aaa51d5e 541 for (value = 0; value <= ANYOF_MAX; value += 2)
653099ff
GS
542 if (ANYOF_CLASS_TEST(cl, value) && ANYOF_CLASS_TEST(cl, value + 1))
543 return 1;
1aa99e6b
IH
544 if (!(cl->flags & ANYOF_UNICODE_ALL))
545 return 0;
9af0acdb
JH
546 for (value = 0; value < 256; ++value)
547 if (!ANYOF_BITMAP_TEST(cl, value))
653099ff
GS
548 return 0;
549 return 1;
550}
551
552/* Can match anything (initialization) */
553STATIC void
830247a4 554S_cl_init(pTHX_ RExC_state_t *pRExC_state, struct regnode_charclass_class *cl)
653099ff 555{
8ecf7187 556 Zero(cl, 1, struct regnode_charclass_class);
653099ff 557 cl->type = ANYOF;
830247a4 558 cl_anything(pRExC_state, cl);
653099ff
GS
559}
560
561STATIC void
830247a4 562S_cl_init_zero(pTHX_ RExC_state_t *pRExC_state, struct regnode_charclass_class *cl)
653099ff 563{
8ecf7187 564 Zero(cl, 1, struct regnode_charclass_class);
653099ff 565 cl->type = ANYOF;
830247a4 566 cl_anything(pRExC_state, cl);
653099ff
GS
567 if (LOC)
568 cl->flags |= ANYOF_LOCALE;
569}
570
571/* 'And' a given class with another one. Can create false positives */
572/* We assume that cl is not inverted */
573STATIC void
574S_cl_and(pTHX_ struct regnode_charclass_class *cl,
575 struct regnode_charclass_class *and_with)
576{
653099ff
GS
577 if (!(and_with->flags & ANYOF_CLASS)
578 && !(cl->flags & ANYOF_CLASS)
579 && (and_with->flags & ANYOF_LOCALE) == (cl->flags & ANYOF_LOCALE)
580 && !(and_with->flags & ANYOF_FOLD)
581 && !(cl->flags & ANYOF_FOLD)) {
582 int i;
583
584 if (and_with->flags & ANYOF_INVERT)
585 for (i = 0; i < ANYOF_BITMAP_SIZE; i++)
586 cl->bitmap[i] &= ~and_with->bitmap[i];
587 else
588 for (i = 0; i < ANYOF_BITMAP_SIZE; i++)
589 cl->bitmap[i] &= and_with->bitmap[i];
590 } /* XXXX: logic is complicated otherwise, leave it along for a moment. */
591 if (!(and_with->flags & ANYOF_EOS))
592 cl->flags &= ~ANYOF_EOS;
1aa99e6b
IH
593
594 if (cl->flags & ANYOF_UNICODE_ALL && and_with->flags & ANYOF_UNICODE) {
595 cl->flags &= ~ANYOF_UNICODE_ALL;
596 cl->flags |= ANYOF_UNICODE;
597 ARG_SET(cl, ARG(and_with));
598 }
599 if (!(and_with->flags & ANYOF_UNICODE_ALL))
600 cl->flags &= ~ANYOF_UNICODE_ALL;
601 if (!(and_with->flags & (ANYOF_UNICODE|ANYOF_UNICODE_ALL)))
602 cl->flags &= ~ANYOF_UNICODE;
653099ff
GS
603}
604
605/* 'OR' a given class with another one. Can create false positives */
606/* We assume that cl is not inverted */
607STATIC void
830247a4 608S_cl_or(pTHX_ RExC_state_t *pRExC_state, struct regnode_charclass_class *cl, struct regnode_charclass_class *or_with)
653099ff 609{
653099ff
GS
610 if (or_with->flags & ANYOF_INVERT) {
611 /* We do not use
612 * (B1 | CL1) | (!B2 & !CL2) = (B1 | !B2 & !CL2) | (CL1 | (!B2 & !CL2))
613 * <= (B1 | !B2) | (CL1 | !CL2)
614 * which is wasteful if CL2 is small, but we ignore CL2:
615 * (B1 | CL1) | (!B2 & !CL2) <= (B1 | CL1) | !B2 = (B1 | !B2) | CL1
616 * XXXX Can we handle case-fold? Unclear:
617 * (OK1(i) | OK1(i')) | !(OK1(i) | OK1(i')) =
618 * (OK1(i) | OK1(i')) | (!OK1(i) & !OK1(i'))
619 */
620 if ( (or_with->flags & ANYOF_LOCALE) == (cl->flags & ANYOF_LOCALE)
621 && !(or_with->flags & ANYOF_FOLD)
622 && !(cl->flags & ANYOF_FOLD) ) {
623 int i;
624
625 for (i = 0; i < ANYOF_BITMAP_SIZE; i++)
626 cl->bitmap[i] |= ~or_with->bitmap[i];
627 } /* XXXX: logic is complicated otherwise */
628 else {
830247a4 629 cl_anything(pRExC_state, cl);
653099ff
GS
630 }
631 } else {
632 /* (B1 | CL1) | (B2 | CL2) = (B1 | B2) | (CL1 | CL2)) */
633 if ( (or_with->flags & ANYOF_LOCALE) == (cl->flags & ANYOF_LOCALE)
b81d288d 634 && (!(or_with->flags & ANYOF_FOLD)
653099ff
GS
635 || (cl->flags & ANYOF_FOLD)) ) {
636 int i;
637
638 /* OR char bitmap and class bitmap separately */
639 for (i = 0; i < ANYOF_BITMAP_SIZE; i++)
640 cl->bitmap[i] |= or_with->bitmap[i];
641 if (or_with->flags & ANYOF_CLASS) {
642 for (i = 0; i < ANYOF_CLASSBITMAP_SIZE; i++)
643 cl->classflags[i] |= or_with->classflags[i];
644 cl->flags |= ANYOF_CLASS;
645 }
646 }
647 else { /* XXXX: logic is complicated, leave it along for a moment. */
830247a4 648 cl_anything(pRExC_state, cl);
653099ff
GS
649 }
650 }
651 if (or_with->flags & ANYOF_EOS)
652 cl->flags |= ANYOF_EOS;
1aa99e6b
IH
653
654 if (cl->flags & ANYOF_UNICODE && or_with->flags & ANYOF_UNICODE &&
655 ARG(cl) != ARG(or_with)) {
656 cl->flags |= ANYOF_UNICODE_ALL;
657 cl->flags &= ~ANYOF_UNICODE;
658 }
659 if (or_with->flags & ANYOF_UNICODE_ALL) {
660 cl->flags |= ANYOF_UNICODE_ALL;
661 cl->flags &= ~ANYOF_UNICODE;
662 }
653099ff
GS
663}
664
665/* REx optimizer. Converts nodes into quickier variants "in place".
666 Finds fixed substrings. */
667
c277df42
IZ
668/* Stops at toplevel WHILEM as well as at `last'. At end *scanp is set
669 to the position after last scanned or to NULL. */
670
76e3520e 671STATIC I32
830247a4 672S_study_chunk(pTHX_ RExC_state_t *pRExC_state, regnode **scanp, I32 *deltap, regnode *last, scan_data_t *data, U32 flags)
c277df42
IZ
673 /* scanp: Start here (read-write). */
674 /* deltap: Write maxlen-minlen here. */
675 /* last: Stop before this one. */
676{
677 I32 min = 0, pars = 0, code;
678 regnode *scan = *scanp, *next;
679 I32 delta = 0;
680 int is_inf = (flags & SCF_DO_SUBSTR) && (data->flags & SF_IS_INF);
aca2d497 681 int is_inf_internal = 0; /* The studied chunk is infinite */
c277df42
IZ
682 I32 is_par = OP(scan) == OPEN ? ARG(scan) : 0;
683 scan_data_t data_fake;
653099ff 684 struct regnode_charclass_class and_with; /* Valid if flags & SCF_DO_STCLASS_OR */
b81d288d 685
c277df42
IZ
686 while (scan && OP(scan) != END && scan < last) {
687 /* Peephole optimizer: */
688
22c35a8c 689 if (PL_regkind[(U8)OP(scan)] == EXACT) {
653099ff 690 /* Merge several consecutive EXACTish nodes into one. */
c277df42
IZ
691 regnode *n = regnext(scan);
692 U32 stringok = 1;
693#ifdef DEBUGGING
694 regnode *stop = scan;
b81d288d 695#endif
c277df42 696
cd439c50 697 next = scan + NODE_SZ_STR(scan);
c277df42
IZ
698 /* Skip NOTHING, merge EXACT*. */
699 while (n &&
b81d288d 700 ( PL_regkind[(U8)OP(n)] == NOTHING ||
c277df42
IZ
701 (stringok && (OP(n) == OP(scan))))
702 && NEXT_OFF(n)
703 && NEXT_OFF(scan) + NEXT_OFF(n) < I16_MAX) {
704 if (OP(n) == TAIL || n > next)
705 stringok = 0;
22c35a8c 706 if (PL_regkind[(U8)OP(n)] == NOTHING) {
c277df42
IZ
707 NEXT_OFF(scan) += NEXT_OFF(n);
708 next = n + NODE_STEP_REGNODE;
709#ifdef DEBUGGING
710 if (stringok)
711 stop = n;
b81d288d 712#endif
c277df42 713 n = regnext(n);
a0ed51b3 714 }
f49d4d0f 715 else if (stringok) {
cd439c50 716 int oldl = STR_LEN(scan);
c277df42 717 regnode *nnext = regnext(n);
f49d4d0f 718
b81d288d 719 if (oldl + STR_LEN(n) > U8_MAX)
c277df42
IZ
720 break;
721 NEXT_OFF(scan) += NEXT_OFF(n);
cd439c50
IZ
722 STR_LEN(scan) += STR_LEN(n);
723 next = n + NODE_SZ_STR(n);
c277df42 724 /* Now we can overwrite *n : */
f49d4d0f 725 Move(STRING(n), STRING(scan) + oldl, STR_LEN(n), char);
c277df42 726#ifdef DEBUGGING
f49d4d0f 727 stop = next - 1;
b81d288d 728#endif
c277df42
IZ
729 n = nnext;
730 }
731 }
732#ifdef DEBUGGING
733 /* Allow dumping */
cd439c50 734 n = scan + NODE_SZ_STR(scan);
c277df42 735 while (n <= stop) {
22c35a8c 736 if (PL_regkind[(U8)OP(n)] != NOTHING || OP(n) == NOTHING) {
c277df42
IZ
737 OP(n) = OPTIMIZED;
738 NEXT_OFF(n) = 0;
739 }
740 n++;
741 }
653099ff 742#endif
c277df42 743 }
653099ff
GS
744 /* Follow the next-chain of the current node and optimize
745 away all the NOTHINGs from it. */
c277df42 746 if (OP(scan) != CURLYX) {
048cfca1
GS
747 int max = (reg_off_by_arg[OP(scan)]
748 ? I32_MAX
749 /* I32 may be smaller than U16 on CRAYs! */
750 : (I32_MAX < U16_MAX ? I32_MAX : U16_MAX));
c277df42
IZ
751 int off = (reg_off_by_arg[OP(scan)] ? ARG(scan) : NEXT_OFF(scan));
752 int noff;
753 regnode *n = scan;
b81d288d 754
c277df42
IZ
755 /* Skip NOTHING and LONGJMP. */
756 while ((n = regnext(n))
22c35a8c 757 && ((PL_regkind[(U8)OP(n)] == NOTHING && (noff = NEXT_OFF(n)))
c277df42
IZ
758 || ((OP(n) == LONGJMP) && (noff = ARG(n))))
759 && off + noff < max)
760 off += noff;
761 if (reg_off_by_arg[OP(scan)])
762 ARG(scan) = off;
b81d288d 763 else
c277df42
IZ
764 NEXT_OFF(scan) = off;
765 }
653099ff
GS
766 /* The principal pseudo-switch. Cannot be a switch, since we
767 look into several different things. */
b81d288d 768 if (OP(scan) == BRANCH || OP(scan) == BRANCHJ
c277df42
IZ
769 || OP(scan) == IFTHEN || OP(scan) == SUSPEND) {
770 next = regnext(scan);
771 code = OP(scan);
b81d288d
AB
772
773 if (OP(next) == code || code == IFTHEN || code == SUSPEND) {
c277df42 774 I32 max1 = 0, min1 = I32_MAX, num = 0;
653099ff 775 struct regnode_charclass_class accum;
c277df42 776
653099ff 777 if (flags & SCF_DO_SUBSTR) /* XXXX Add !SUSPEND? */
830247a4 778 scan_commit(pRExC_state, data); /* Cannot merge strings after this. */
653099ff 779 if (flags & SCF_DO_STCLASS)
830247a4 780 cl_init_zero(pRExC_state, &accum);
c277df42 781 while (OP(scan) == code) {
830247a4 782 I32 deltanext, minnext, f = 0, fake;
653099ff 783 struct regnode_charclass_class this_class;
c277df42
IZ
784
785 num++;
786 data_fake.flags = 0;
b81d288d 787 if (data) {
2c2d71f5 788 data_fake.whilem_c = data->whilem_c;
cb434fcc
IZ
789 data_fake.last_closep = data->last_closep;
790 }
791 else
792 data_fake.last_closep = &fake;
c277df42
IZ
793 next = regnext(scan);
794 scan = NEXTOPER(scan);
795 if (code != BRANCH)
796 scan = NEXTOPER(scan);
653099ff 797 if (flags & SCF_DO_STCLASS) {
830247a4 798 cl_init(pRExC_state, &this_class);
653099ff
GS
799 data_fake.start_class = &this_class;
800 f = SCF_DO_STCLASS_AND;
b81d288d 801 }
e1901655
IZ
802 if (flags & SCF_WHILEM_VISITED_POS)
803 f |= SCF_WHILEM_VISITED_POS;
653099ff 804 /* we suppose the run is continuous, last=next...*/
830247a4
IZ
805 minnext = study_chunk(pRExC_state, &scan, &deltanext,
806 next, &data_fake, f);
b81d288d 807 if (min1 > minnext)
c277df42
IZ
808 min1 = minnext;
809 if (max1 < minnext + deltanext)
810 max1 = minnext + deltanext;
811 if (deltanext == I32_MAX)
aca2d497 812 is_inf = is_inf_internal = 1;
c277df42
IZ
813 scan = next;
814 if (data_fake.flags & (SF_HAS_PAR|SF_IN_PAR))
815 pars++;
405ff068 816 if (data && (data_fake.flags & SF_HAS_EVAL))
c277df42 817 data->flags |= SF_HAS_EVAL;
2c2d71f5
JH
818 if (data)
819 data->whilem_c = data_fake.whilem_c;
653099ff 820 if (flags & SCF_DO_STCLASS)
830247a4 821 cl_or(pRExC_state, &accum, &this_class);
b81d288d 822 if (code == SUSPEND)
c277df42
IZ
823 break;
824 }
825 if (code == IFTHEN && num < 2) /* Empty ELSE branch */
826 min1 = 0;
827 if (flags & SCF_DO_SUBSTR) {
828 data->pos_min += min1;
829 data->pos_delta += max1 - min1;
830 if (max1 != min1 || is_inf)
831 data->longest = &(data->longest_float);
832 }
833 min += min1;
834 delta += max1 - min1;
653099ff 835 if (flags & SCF_DO_STCLASS_OR) {
830247a4 836 cl_or(pRExC_state, data->start_class, &accum);
653099ff
GS
837 if (min1) {
838 cl_and(data->start_class, &and_with);
839 flags &= ~SCF_DO_STCLASS;
840 }
841 }
842 else if (flags & SCF_DO_STCLASS_AND) {
de0c8cb8
GS
843 if (min1) {
844 cl_and(data->start_class, &accum);
653099ff 845 flags &= ~SCF_DO_STCLASS;
de0c8cb8
GS
846 }
847 else {
b81d288d 848 /* Switch to OR mode: cache the old value of
de0c8cb8
GS
849 * data->start_class */
850 StructCopy(data->start_class, &and_with,
851 struct regnode_charclass_class);
852 flags &= ~SCF_DO_STCLASS_AND;
853 StructCopy(&accum, data->start_class,
854 struct regnode_charclass_class);
855 flags |= SCF_DO_STCLASS_OR;
856 data->start_class->flags |= ANYOF_EOS;
857 }
653099ff 858 }
a0ed51b3
LW
859 }
860 else if (code == BRANCHJ) /* single branch is optimized. */
c277df42
IZ
861 scan = NEXTOPER(NEXTOPER(scan));
862 else /* single branch is optimized. */
863 scan = NEXTOPER(scan);
864 continue;
a0ed51b3
LW
865 }
866 else if (OP(scan) == EXACT) {
cd439c50 867 I32 l = STR_LEN(scan);
1aa99e6b 868 UV uc = *((U8*)STRING(scan));
a0ed51b3 869 if (UTF) {
1aa99e6b
IH
870 U8 *s = (U8*)STRING(scan);
871 l = utf8_length(s, s + l);
9041c2e3 872 uc = utf8_to_uvchr(s, NULL);
a0ed51b3
LW
873 }
874 min += l;
c277df42 875 if (flags & SCF_DO_SUBSTR) { /* Update longest substr. */
c277df42
IZ
876 /* The code below prefers earlier match for fixed
877 offset, later match for variable offset. */
878 if (data->last_end == -1) { /* Update the start info. */
879 data->last_start_min = data->pos_min;
880 data->last_start_max = is_inf
b81d288d 881 ? I32_MAX : data->pos_min + data->pos_delta;
c277df42 882 }
cd439c50 883 sv_catpvn(data->last_found, STRING(scan), STR_LEN(scan));
c277df42
IZ
884 data->last_end = data->pos_min + l;
885 data->pos_min += l; /* As in the first entry. */
886 data->flags &= ~SF_BEFORE_EOL;
887 }
653099ff
GS
888 if (flags & SCF_DO_STCLASS_AND) {
889 /* Check whether it is compatible with what we know already! */
890 int compat = 1;
891
1aa99e6b 892 if (uc >= 0x100 ||
516a5887 893 (!(data->start_class->flags & (ANYOF_CLASS | ANYOF_LOCALE))
1aa99e6b 894 && !ANYOF_BITMAP_TEST(data->start_class, uc)
653099ff 895 && (!(data->start_class->flags & ANYOF_FOLD)
1aa99e6b 896 || !ANYOF_BITMAP_TEST(data->start_class, PL_fold[uc])))
516a5887 897 )
653099ff
GS
898 compat = 0;
899 ANYOF_CLASS_ZERO(data->start_class);
900 ANYOF_BITMAP_ZERO(data->start_class);
901 if (compat)
1aa99e6b 902 ANYOF_BITMAP_SET(data->start_class, uc);
653099ff 903 data->start_class->flags &= ~ANYOF_EOS;
9b877dbb
IH
904 if (uc < 0x100)
905 data->start_class->flags &= ~ANYOF_UNICODE_ALL;
653099ff
GS
906 }
907 else if (flags & SCF_DO_STCLASS_OR) {
908 /* false positive possible if the class is case-folded */
1aa99e6b 909 if (uc < 0x100)
9b877dbb
IH
910 ANYOF_BITMAP_SET(data->start_class, uc);
911 else
912 data->start_class->flags |= ANYOF_UNICODE_ALL;
653099ff
GS
913 data->start_class->flags &= ~ANYOF_EOS;
914 cl_and(data->start_class, &and_with);
915 }
916 flags &= ~SCF_DO_STCLASS;
a0ed51b3 917 }
653099ff 918 else if (PL_regkind[(U8)OP(scan)] == EXACT) { /* But OP != EXACT! */
cd439c50 919 I32 l = STR_LEN(scan);
1aa99e6b 920 UV uc = *((U8*)STRING(scan));
653099ff
GS
921
922 /* Search for fixed substrings supports EXACT only. */
b81d288d 923 if (flags & SCF_DO_SUBSTR)
830247a4 924 scan_commit(pRExC_state, data);
a0ed51b3 925 if (UTF) {
1aa99e6b
IH
926 U8 *s = (U8 *)STRING(scan);
927 l = utf8_length(s, s + l);
9041c2e3 928 uc = utf8_to_uvchr(s, NULL);
a0ed51b3
LW
929 }
930 min += l;
c277df42 931 if (data && (flags & SCF_DO_SUBSTR))
a0ed51b3 932 data->pos_min += l;
653099ff
GS
933 if (flags & SCF_DO_STCLASS_AND) {
934 /* Check whether it is compatible with what we know already! */
935 int compat = 1;
936
1aa99e6b 937 if (uc >= 0x100 ||
516a5887 938 (!(data->start_class->flags & (ANYOF_CLASS | ANYOF_LOCALE))
1aa99e6b 939 && !ANYOF_BITMAP_TEST(data->start_class, uc)
516a5887 940 && !ANYOF_BITMAP_TEST(data->start_class, PL_fold[uc])))
653099ff
GS
941 compat = 0;
942 ANYOF_CLASS_ZERO(data->start_class);
943 ANYOF_BITMAP_ZERO(data->start_class);
944 if (compat) {
1aa99e6b 945 ANYOF_BITMAP_SET(data->start_class, uc);
653099ff
GS
946 data->start_class->flags &= ~ANYOF_EOS;
947 data->start_class->flags |= ANYOF_FOLD;
948 if (OP(scan) == EXACTFL)
949 data->start_class->flags |= ANYOF_LOCALE;
950 }
951 }
952 else if (flags & SCF_DO_STCLASS_OR) {
953 if (data->start_class->flags & ANYOF_FOLD) {
954 /* false positive possible if the class is case-folded.
955 Assume that the locale settings are the same... */
1aa99e6b
IH
956 if (uc < 0x100)
957 ANYOF_BITMAP_SET(data->start_class, uc);
653099ff
GS
958 data->start_class->flags &= ~ANYOF_EOS;
959 }
960 cl_and(data->start_class, &and_with);
961 }
962 flags &= ~SCF_DO_STCLASS;
a0ed51b3 963 }
4d61ec05 964 else if (strchr((char*)PL_varies,OP(scan))) {
9c5ffd7c 965 I32 mincount, maxcount, minnext, deltanext, fl = 0;
aa7a4b56 966 I32 f = flags, pos_before = 0;
c277df42 967 regnode *oscan = scan;
653099ff
GS
968 struct regnode_charclass_class this_class;
969 struct regnode_charclass_class *oclass = NULL;
727f22e3 970 I32 next_is_eval = 0;
653099ff 971
22c35a8c 972 switch (PL_regkind[(U8)OP(scan)]) {
653099ff 973 case WHILEM: /* End of (?:...)* . */
c277df42
IZ
974 scan = NEXTOPER(scan);
975 goto finish;
976 case PLUS:
653099ff 977 if (flags & (SCF_DO_SUBSTR | SCF_DO_STCLASS)) {
c277df42 978 next = NEXTOPER(scan);
653099ff 979 if (OP(next) == EXACT || (flags & SCF_DO_STCLASS)) {
b81d288d
AB
980 mincount = 1;
981 maxcount = REG_INFTY;
c277df42
IZ
982 next = regnext(scan);
983 scan = NEXTOPER(scan);
984 goto do_curly;
985 }
986 }
987 if (flags & SCF_DO_SUBSTR)
988 data->pos_min++;
989 min++;
990 /* Fall through. */
991 case STAR:
653099ff
GS
992 if (flags & SCF_DO_STCLASS) {
993 mincount = 0;
b81d288d 994 maxcount = REG_INFTY;
653099ff
GS
995 next = regnext(scan);
996 scan = NEXTOPER(scan);
997 goto do_curly;
998 }
b81d288d 999 is_inf = is_inf_internal = 1;
c277df42
IZ
1000 scan = regnext(scan);
1001 if (flags & SCF_DO_SUBSTR) {
830247a4 1002 scan_commit(pRExC_state, data); /* Cannot extend fixed substrings */
c277df42
IZ
1003 data->longest = &(data->longest_float);
1004 }
1005 goto optimize_curly_tail;
1006 case CURLY:
b81d288d 1007 mincount = ARG1(scan);
c277df42
IZ
1008 maxcount = ARG2(scan);
1009 next = regnext(scan);
cb434fcc
IZ
1010 if (OP(scan) == CURLYX) {
1011 I32 lp = (data ? *(data->last_closep) : 0);
1012
1013 scan->flags = ((lp <= U8_MAX) ? lp : U8_MAX);
1014 }
c277df42 1015 scan = NEXTOPER(scan) + EXTRA_STEP_2ARGS;
727f22e3 1016 next_is_eval = (OP(scan) == EVAL);
c277df42
IZ
1017 do_curly:
1018 if (flags & SCF_DO_SUBSTR) {
830247a4 1019 if (mincount == 0) scan_commit(pRExC_state,data); /* Cannot extend fixed substrings */
c277df42
IZ
1020 pos_before = data->pos_min;
1021 }
1022 if (data) {
1023 fl = data->flags;
1024 data->flags &= ~(SF_HAS_PAR|SF_IN_PAR|SF_HAS_EVAL);
1025 if (is_inf)
1026 data->flags |= SF_IS_INF;
1027 }
653099ff 1028 if (flags & SCF_DO_STCLASS) {
830247a4 1029 cl_init(pRExC_state, &this_class);
653099ff
GS
1030 oclass = data->start_class;
1031 data->start_class = &this_class;
1032 f |= SCF_DO_STCLASS_AND;
1033 f &= ~SCF_DO_STCLASS_OR;
1034 }
e1901655
IZ
1035 /* These are the cases when once a subexpression
1036 fails at a particular position, it cannot succeed
1037 even after backtracking at the enclosing scope.
b81d288d 1038
e1901655
IZ
1039 XXXX what if minimal match and we are at the
1040 initial run of {n,m}? */
1041 if ((mincount != maxcount - 1) && (maxcount != REG_INFTY))
1042 f &= ~SCF_WHILEM_VISITED_POS;
653099ff 1043
c277df42 1044 /* This will finish on WHILEM, setting scan, or on NULL: */
b81d288d
AB
1045 minnext = study_chunk(pRExC_state, &scan, &deltanext, last, data,
1046 mincount == 0
653099ff
GS
1047 ? (f & ~SCF_DO_SUBSTR) : f);
1048
1049 if (flags & SCF_DO_STCLASS)
1050 data->start_class = oclass;
1051 if (mincount == 0 || minnext == 0) {
1052 if (flags & SCF_DO_STCLASS_OR) {
830247a4 1053 cl_or(pRExC_state, data->start_class, &this_class);
653099ff
GS
1054 }
1055 else if (flags & SCF_DO_STCLASS_AND) {
b81d288d 1056 /* Switch to OR mode: cache the old value of
653099ff
GS
1057 * data->start_class */
1058 StructCopy(data->start_class, &and_with,
1059 struct regnode_charclass_class);
1060 flags &= ~SCF_DO_STCLASS_AND;
1061 StructCopy(&this_class, data->start_class,
1062 struct regnode_charclass_class);
1063 flags |= SCF_DO_STCLASS_OR;
1064 data->start_class->flags |= ANYOF_EOS;
1065 }
1066 } else { /* Non-zero len */
1067 if (flags & SCF_DO_STCLASS_OR) {
830247a4 1068 cl_or(pRExC_state, data->start_class, &this_class);
653099ff
GS
1069 cl_and(data->start_class, &and_with);
1070 }
1071 else if (flags & SCF_DO_STCLASS_AND)
1072 cl_and(data->start_class, &this_class);
1073 flags &= ~SCF_DO_STCLASS;
1074 }
c277df42
IZ
1075 if (!scan) /* It was not CURLYX, but CURLY. */
1076 scan = next;
84037bb0 1077 if (ckWARN(WARN_REGEXP)
727f22e3
JP
1078 /* ? quantifier ok, except for (?{ ... }) */
1079 && (next_is_eval || !(mincount == 0 && maxcount == 1))
84037bb0 1080 && (minnext == 0) && (deltanext == 0)
99799961 1081 && data && !(data->flags & (SF_HAS_PAR|SF_IN_PAR))
17feb5d5 1082 && maxcount <= REG_INFTY/3) /* Complement check for big count */
b45f050a 1083 {
830247a4 1084 vWARN(RExC_parse,
b45f050a
JF
1085 "Quantifier unexpected on zero-length expression");
1086 }
1087
c277df42 1088 min += minnext * mincount;
b81d288d 1089 is_inf_internal |= ((maxcount == REG_INFTY
155aba94
GS
1090 && (minnext + deltanext) > 0)
1091 || deltanext == I32_MAX);
aca2d497 1092 is_inf |= is_inf_internal;
c277df42
IZ
1093 delta += (minnext + deltanext) * maxcount - minnext * mincount;
1094
1095 /* Try powerful optimization CURLYX => CURLYN. */
b81d288d 1096 if ( OP(oscan) == CURLYX && data
c277df42
IZ
1097 && data->flags & SF_IN_PAR
1098 && !(data->flags & SF_HAS_EVAL)
1099 && !deltanext && minnext == 1 ) {
1100 /* Try to optimize to CURLYN. */
1101 regnode *nxt = NEXTOPER(oscan) + EXTRA_STEP_2ARGS;
497b47a8
JH
1102 regnode *nxt1 = nxt;
1103#ifdef DEBUGGING
1104 regnode *nxt2;
1105#endif
c277df42
IZ
1106
1107 /* Skip open. */
1108 nxt = regnext(nxt);
4d61ec05 1109 if (!strchr((char*)PL_simple,OP(nxt))
22c35a8c 1110 && !(PL_regkind[(U8)OP(nxt)] == EXACT
b81d288d 1111 && STR_LEN(nxt) == 1))
c277df42 1112 goto nogo;
497b47a8 1113#ifdef DEBUGGING
c277df42 1114 nxt2 = nxt;
497b47a8 1115#endif
c277df42 1116 nxt = regnext(nxt);
b81d288d 1117 if (OP(nxt) != CLOSE)
c277df42
IZ
1118 goto nogo;
1119 /* Now we know that nxt2 is the only contents: */
1120 oscan->flags = ARG(nxt);
1121 OP(oscan) = CURLYN;
1122 OP(nxt1) = NOTHING; /* was OPEN. */
1123#ifdef DEBUGGING
1124 OP(nxt1 + 1) = OPTIMIZED; /* was count. */
1125 NEXT_OFF(nxt1+ 1) = 0; /* just for consistancy. */
1126 NEXT_OFF(nxt2) = 0; /* just for consistancy with CURLY. */
1127 OP(nxt) = OPTIMIZED; /* was CLOSE. */
1128 OP(nxt + 1) = OPTIMIZED; /* was count. */
1129 NEXT_OFF(nxt+ 1) = 0; /* just for consistancy. */
b81d288d 1130#endif
c277df42 1131 }
c277df42
IZ
1132 nogo:
1133
1134 /* Try optimization CURLYX => CURLYM. */
b81d288d 1135 if ( OP(oscan) == CURLYX && data
c277df42 1136 && !(data->flags & SF_HAS_PAR)
c277df42
IZ
1137 && !(data->flags & SF_HAS_EVAL)
1138 && !deltanext ) {
1139 /* XXXX How to optimize if data == 0? */
1140 /* Optimize to a simpler form. */
1141 regnode *nxt = NEXTOPER(oscan) + EXTRA_STEP_2ARGS; /* OPEN */
1142 regnode *nxt2;
1143
1144 OP(oscan) = CURLYM;
1145 while ( (nxt2 = regnext(nxt)) /* skip over embedded stuff*/
b81d288d 1146 && (OP(nxt2) != WHILEM))
c277df42
IZ
1147 nxt = nxt2;
1148 OP(nxt2) = SUCCEED; /* Whas WHILEM */
c277df42
IZ
1149 /* Need to optimize away parenths. */
1150 if (data->flags & SF_IN_PAR) {
1151 /* Set the parenth number. */
1152 regnode *nxt1 = NEXTOPER(oscan) + EXTRA_STEP_2ARGS; /* OPEN*/
1153
b81d288d 1154 if (OP(nxt) != CLOSE)
b45f050a 1155 FAIL("Panic opt close");
c277df42
IZ
1156 oscan->flags = ARG(nxt);
1157 OP(nxt1) = OPTIMIZED; /* was OPEN. */
1158 OP(nxt) = OPTIMIZED; /* was CLOSE. */
1159#ifdef DEBUGGING
1160 OP(nxt1 + 1) = OPTIMIZED; /* was count. */
1161 OP(nxt + 1) = OPTIMIZED; /* was count. */
1162 NEXT_OFF(nxt1 + 1) = 0; /* just for consistancy. */
1163 NEXT_OFF(nxt + 1) = 0; /* just for consistancy. */
b81d288d 1164#endif
c277df42
IZ
1165#if 0
1166 while ( nxt1 && (OP(nxt1) != WHILEM)) {
1167 regnode *nnxt = regnext(nxt1);
b81d288d 1168
c277df42
IZ
1169 if (nnxt == nxt) {
1170 if (reg_off_by_arg[OP(nxt1)])
1171 ARG_SET(nxt1, nxt2 - nxt1);
1172 else if (nxt2 - nxt1 < U16_MAX)
1173 NEXT_OFF(nxt1) = nxt2 - nxt1;
1174 else
1175 OP(nxt) = NOTHING; /* Cannot beautify */
1176 }
1177 nxt1 = nnxt;
1178 }
1179#endif
1180 /* Optimize again: */
b81d288d 1181 study_chunk(pRExC_state, &nxt1, &deltanext, nxt,
e1901655 1182 NULL, 0);
a0ed51b3
LW
1183 }
1184 else
c277df42 1185 oscan->flags = 0;
c277df42 1186 }
e1901655
IZ
1187 else if ((OP(oscan) == CURLYX)
1188 && (flags & SCF_WHILEM_VISITED_POS)
1189 /* See the comment on a similar expression above.
1190 However, this time it not a subexpression
1191 we care about, but the expression itself. */
1192 && (maxcount == REG_INFTY)
1193 && data && ++data->whilem_c < 16) {
1194 /* This stays as CURLYX, we can put the count/of pair. */
2c2d71f5
JH
1195 /* Find WHILEM (as in regexec.c) */
1196 regnode *nxt = oscan + NEXT_OFF(oscan);
1197
1198 if (OP(PREVOPER(nxt)) == NOTHING) /* LONGJMP */
1199 nxt += ARG(nxt);
1200 PREVOPER(nxt)->flags = data->whilem_c
830247a4 1201 | (RExC_whilem_seen << 4); /* On WHILEM */
2c2d71f5 1202 }
b81d288d 1203 if (data && fl & (SF_HAS_PAR|SF_IN_PAR))
c277df42
IZ
1204 pars++;
1205 if (flags & SCF_DO_SUBSTR) {
1206 SV *last_str = Nullsv;
1207 int counted = mincount != 0;
1208
1209 if (data->last_end > 0 && mincount != 0) { /* Ends with a string. */
b81d288d 1210 I32 b = pos_before >= data->last_start_min
c277df42
IZ
1211 ? pos_before : data->last_start_min;
1212 STRLEN l;
1213 char *s = SvPV(data->last_found, l);
a0ed51b3
LW
1214 I32 old = b - data->last_start_min;
1215
1216 if (UTF)
1217 old = utf8_hop((U8*)s, old) - (U8*)s;
c277df42 1218
a0ed51b3 1219 l -= old;
c277df42 1220 /* Get the added string: */
79cb57f6 1221 last_str = newSVpvn(s + old, l);
c277df42
IZ
1222 if (deltanext == 0 && pos_before == b) {
1223 /* What was added is a constant string */
1224 if (mincount > 1) {
1225 SvGROW(last_str, (mincount * l) + 1);
b81d288d 1226 repeatcpy(SvPVX(last_str) + l,
c277df42
IZ
1227 SvPVX(last_str), l, mincount - 1);
1228 SvCUR(last_str) *= mincount;
1229 /* Add additional parts. */
b81d288d 1230 SvCUR_set(data->last_found,
c277df42
IZ
1231 SvCUR(data->last_found) - l);
1232 sv_catsv(data->last_found, last_str);
1233 data->last_end += l * (mincount - 1);
1234 }
2a8d9689
HS
1235 } else {
1236 /* start offset must point into the last copy */
1237 data->last_start_min += minnext * (mincount - 1);
4b2cff9a
HS
1238 data->last_start_max += is_inf ? 0 : (maxcount - 1)
1239 * (minnext + data->pos_delta);
c277df42
IZ
1240 }
1241 }
1242 /* It is counted once already... */
1243 data->pos_min += minnext * (mincount - counted);
1244 data->pos_delta += - counted * deltanext +
1245 (minnext + deltanext) * maxcount - minnext * mincount;
1246 if (mincount != maxcount) {
653099ff
GS
1247 /* Cannot extend fixed substrings found inside
1248 the group. */
830247a4 1249 scan_commit(pRExC_state,data);
c277df42
IZ
1250 if (mincount && last_str) {
1251 sv_setsv(data->last_found, last_str);
1252 data->last_end = data->pos_min;
b81d288d 1253 data->last_start_min =
a0ed51b3 1254 data->pos_min - CHR_SVLEN(last_str);
b81d288d
AB
1255 data->last_start_max = is_inf
1256 ? I32_MAX
c277df42 1257 : data->pos_min + data->pos_delta
a0ed51b3 1258 - CHR_SVLEN(last_str);
c277df42
IZ
1259 }
1260 data->longest = &(data->longest_float);
1261 }
aca2d497 1262 SvREFCNT_dec(last_str);
c277df42 1263 }
405ff068 1264 if (data && (fl & SF_HAS_EVAL))
c277df42
IZ
1265 data->flags |= SF_HAS_EVAL;
1266 optimize_curly_tail:
c277df42 1267 if (OP(oscan) != CURLYX) {
22c35a8c 1268 while (PL_regkind[(U8)OP(next = regnext(oscan))] == NOTHING
c277df42
IZ
1269 && NEXT_OFF(next))
1270 NEXT_OFF(oscan) += NEXT_OFF(next);
1271 }
c277df42 1272 continue;
653099ff 1273 default: /* REF and CLUMP only? */
c277df42 1274 if (flags & SCF_DO_SUBSTR) {
830247a4 1275 scan_commit(pRExC_state,data); /* Cannot expect anything... */
c277df42
IZ
1276 data->longest = &(data->longest_float);
1277 }
aca2d497 1278 is_inf = is_inf_internal = 1;
653099ff 1279 if (flags & SCF_DO_STCLASS_OR)
830247a4 1280 cl_anything(pRExC_state, data->start_class);
653099ff 1281 flags &= ~SCF_DO_STCLASS;
c277df42
IZ
1282 break;
1283 }
a0ed51b3 1284 }
ffc61ed2 1285 else if (strchr((char*)PL_simple,OP(scan))) {
9c5ffd7c 1286 int value = 0;
653099ff 1287
c277df42 1288 if (flags & SCF_DO_SUBSTR) {
830247a4 1289 scan_commit(pRExC_state,data);
c277df42
IZ
1290 data->pos_min++;
1291 }
1292 min++;
653099ff
GS
1293 if (flags & SCF_DO_STCLASS) {
1294 data->start_class->flags &= ~ANYOF_EOS; /* No match on empty */
1295
1296 /* Some of the logic below assumes that switching
1297 locale on will only add false positives. */
1298 switch (PL_regkind[(U8)OP(scan)]) {
653099ff 1299 case SANY:
653099ff
GS
1300 default:
1301 do_default:
1302 /* Perl_croak(aTHX_ "panic: unexpected simple REx opcode %d", OP(scan)); */
1303 if (flags & SCF_DO_STCLASS_OR) /* Allow everything */
830247a4 1304 cl_anything(pRExC_state, data->start_class);
653099ff
GS
1305 break;
1306 case REG_ANY:
1307 if (OP(scan) == SANY)
1308 goto do_default;
1309 if (flags & SCF_DO_STCLASS_OR) { /* Everything but \n */
1310 value = (ANYOF_BITMAP_TEST(data->start_class,'\n')
1311 || (data->start_class->flags & ANYOF_CLASS));
830247a4 1312 cl_anything(pRExC_state, data->start_class);
653099ff
GS
1313 }
1314 if (flags & SCF_DO_STCLASS_AND || !value)
1315 ANYOF_BITMAP_CLEAR(data->start_class,'\n');
1316 break;
1317 case ANYOF:
1318 if (flags & SCF_DO_STCLASS_AND)
1319 cl_and(data->start_class,
1320 (struct regnode_charclass_class*)scan);
1321 else
830247a4 1322 cl_or(pRExC_state, data->start_class,
653099ff
GS
1323 (struct regnode_charclass_class*)scan);
1324 break;
1325 case ALNUM:
1326 if (flags & SCF_DO_STCLASS_AND) {
1327 if (!(data->start_class->flags & ANYOF_LOCALE)) {
1328 ANYOF_CLASS_CLEAR(data->start_class,ANYOF_NALNUM);
1329 for (value = 0; value < 256; value++)
1330 if (!isALNUM(value))
1331 ANYOF_BITMAP_CLEAR(data->start_class, value);
1332 }
1333 }
1334 else {
1335 if (data->start_class->flags & ANYOF_LOCALE)
1336 ANYOF_CLASS_SET(data->start_class,ANYOF_ALNUM);
1337 else {
1338 for (value = 0; value < 256; value++)
1339 if (isALNUM(value))
b81d288d 1340 ANYOF_BITMAP_SET(data->start_class, value);
653099ff
GS
1341 }
1342 }
1343 break;
1344 case ALNUML:
1345 if (flags & SCF_DO_STCLASS_AND) {
1346 if (data->start_class->flags & ANYOF_LOCALE)
1347 ANYOF_CLASS_CLEAR(data->start_class,ANYOF_NALNUM);
1348 }
1349 else {
1350 ANYOF_CLASS_SET(data->start_class,ANYOF_ALNUM);
1351 data->start_class->flags |= ANYOF_LOCALE;
1352 }
1353 break;
1354 case NALNUM:
1355 if (flags & SCF_DO_STCLASS_AND) {
1356 if (!(data->start_class->flags & ANYOF_LOCALE)) {
1357 ANYOF_CLASS_CLEAR(data->start_class,ANYOF_ALNUM);
1358 for (value = 0; value < 256; value++)
1359 if (isALNUM(value))
1360 ANYOF_BITMAP_CLEAR(data->start_class, value);
1361 }
1362 }
1363 else {
1364 if (data->start_class->flags & ANYOF_LOCALE)
1365 ANYOF_CLASS_SET(data->start_class,ANYOF_NALNUM);
1366 else {
1367 for (value = 0; value < 256; value++)
1368 if (!isALNUM(value))
b81d288d 1369 ANYOF_BITMAP_SET(data->start_class, value);
653099ff
GS
1370 }
1371 }
1372 break;
1373 case NALNUML:
1374 if (flags & SCF_DO_STCLASS_AND) {
1375 if (data->start_class->flags & ANYOF_LOCALE)
1376 ANYOF_CLASS_CLEAR(data->start_class,ANYOF_ALNUM);
1377 }
1378 else {
1379 data->start_class->flags |= ANYOF_LOCALE;
1380 ANYOF_CLASS_SET(data->start_class,ANYOF_NALNUM);
1381 }
1382 break;
1383 case SPACE:
1384 if (flags & SCF_DO_STCLASS_AND) {
1385 if (!(data->start_class->flags & ANYOF_LOCALE)) {
1386 ANYOF_CLASS_CLEAR(data->start_class,ANYOF_NSPACE);
1387 for (value = 0; value < 256; value++)
1388 if (!isSPACE(value))
1389 ANYOF_BITMAP_CLEAR(data->start_class, value);
1390 }
1391 }
1392 else {
1393 if (data->start_class->flags & ANYOF_LOCALE)
1394 ANYOF_CLASS_SET(data->start_class,ANYOF_SPACE);
1395 else {
1396 for (value = 0; value < 256; value++)
1397 if (isSPACE(value))
b81d288d 1398 ANYOF_BITMAP_SET(data->start_class, value);
653099ff
GS
1399 }
1400 }
1401 break;
1402 case SPACEL:
1403 if (flags & SCF_DO_STCLASS_AND) {
1404 if (data->start_class->flags & ANYOF_LOCALE)
1405 ANYOF_CLASS_CLEAR(data->start_class,ANYOF_NSPACE);
1406 }
1407 else {
1408 data->start_class->flags |= ANYOF_LOCALE;
1409 ANYOF_CLASS_SET(data->start_class,ANYOF_SPACE);
1410 }
1411 break;
1412 case NSPACE:
1413 if (flags & SCF_DO_STCLASS_AND) {
1414 if (!(data->start_class->flags & ANYOF_LOCALE)) {
1415 ANYOF_CLASS_CLEAR(data->start_class,ANYOF_SPACE);
1416 for (value = 0; value < 256; value++)
1417 if (isSPACE(value))
1418 ANYOF_BITMAP_CLEAR(data->start_class, value);
1419 }
1420 }
1421 else {
1422 if (data->start_class->flags & ANYOF_LOCALE)
1423 ANYOF_CLASS_SET(data->start_class,ANYOF_NSPACE);
1424 else {
1425 for (value = 0; value < 256; value++)
1426 if (!isSPACE(value))
b81d288d 1427 ANYOF_BITMAP_SET(data->start_class, value);
653099ff
GS
1428 }
1429 }
1430 break;
1431 case NSPACEL:
1432 if (flags & SCF_DO_STCLASS_AND) {
1433 if (data->start_class->flags & ANYOF_LOCALE) {
1434 ANYOF_CLASS_CLEAR(data->start_class,ANYOF_SPACE);
1435 for (value = 0; value < 256; value++)
1436 if (!isSPACE(value))
1437 ANYOF_BITMAP_CLEAR(data->start_class, value);
1438 }
1439 }
1440 else {
1441 data->start_class->flags |= ANYOF_LOCALE;
1442 ANYOF_CLASS_SET(data->start_class,ANYOF_NSPACE);
1443 }
1444 break;
1445 case DIGIT:
1446 if (flags & SCF_DO_STCLASS_AND) {
1447 ANYOF_CLASS_CLEAR(data->start_class,ANYOF_NDIGIT);
1448 for (value = 0; value < 256; value++)
1449 if (!isDIGIT(value))
1450 ANYOF_BITMAP_CLEAR(data->start_class, value);
1451 }
1452 else {
1453 if (data->start_class->flags & ANYOF_LOCALE)
1454 ANYOF_CLASS_SET(data->start_class,ANYOF_DIGIT);
1455 else {
1456 for (value = 0; value < 256; value++)
1457 if (isDIGIT(value))
b81d288d 1458 ANYOF_BITMAP_SET(data->start_class, value);
653099ff
GS
1459 }
1460 }
1461 break;
1462 case NDIGIT:
1463 if (flags & SCF_DO_STCLASS_AND) {
1464 ANYOF_CLASS_CLEAR(data->start_class,ANYOF_DIGIT);
1465 for (value = 0; value < 256; value++)
1466 if (isDIGIT(value))
1467 ANYOF_BITMAP_CLEAR(data->start_class, value);
1468 }
1469 else {
1470 if (data->start_class->flags & ANYOF_LOCALE)
1471 ANYOF_CLASS_SET(data->start_class,ANYOF_NDIGIT);
1472 else {
1473 for (value = 0; value < 256; value++)
1474 if (!isDIGIT(value))
b81d288d 1475 ANYOF_BITMAP_SET(data->start_class, value);
653099ff
GS
1476 }
1477 }
1478 break;
1479 }
1480 if (flags & SCF_DO_STCLASS_OR)
1481 cl_and(data->start_class, &and_with);
1482 flags &= ~SCF_DO_STCLASS;
1483 }
a0ed51b3 1484 }
22c35a8c 1485 else if (PL_regkind[(U8)OP(scan)] == EOL && flags & SCF_DO_SUBSTR) {
c277df42
IZ
1486 data->flags |= (OP(scan) == MEOL
1487 ? SF_BEFORE_MEOL
1488 : SF_BEFORE_SEOL);
a0ed51b3 1489 }
653099ff
GS
1490 else if ( PL_regkind[(U8)OP(scan)] == BRANCHJ
1491 /* Lookbehind, or need to calculate parens/evals/stclass: */
1492 && (scan->flags || data || (flags & SCF_DO_STCLASS))
c277df42 1493 && (OP(scan) == IFMATCH || OP(scan) == UNLESSM)) {
653099ff 1494 /* Lookahead/lookbehind */
cb434fcc 1495 I32 deltanext, minnext, fake = 0;
c277df42 1496 regnode *nscan;
653099ff
GS
1497 struct regnode_charclass_class intrnl;
1498 int f = 0;
c277df42
IZ
1499
1500 data_fake.flags = 0;
b81d288d 1501 if (data) {
2c2d71f5 1502 data_fake.whilem_c = data->whilem_c;
cb434fcc
IZ
1503 data_fake.last_closep = data->last_closep;
1504 }
1505 else
1506 data_fake.last_closep = &fake;
653099ff
GS
1507 if ( flags & SCF_DO_STCLASS && !scan->flags
1508 && OP(scan) == IFMATCH ) { /* Lookahead */
830247a4 1509 cl_init(pRExC_state, &intrnl);
653099ff 1510 data_fake.start_class = &intrnl;
e1901655 1511 f |= SCF_DO_STCLASS_AND;
653099ff 1512 }
e1901655
IZ
1513 if (flags & SCF_WHILEM_VISITED_POS)
1514 f |= SCF_WHILEM_VISITED_POS;
c277df42
IZ
1515 next = regnext(scan);
1516 nscan = NEXTOPER(NEXTOPER(scan));
830247a4 1517 minnext = study_chunk(pRExC_state, &nscan, &deltanext, last, &data_fake, f);
c277df42
IZ
1518 if (scan->flags) {
1519 if (deltanext) {
9baa0206 1520 vFAIL("Variable length lookbehind not implemented");
a0ed51b3
LW
1521 }
1522 else if (minnext > U8_MAX) {
9baa0206 1523 vFAIL2("Lookbehind longer than %"UVuf" not implemented", (UV)U8_MAX);
c277df42
IZ
1524 }
1525 scan->flags = minnext;
1526 }
1527 if (data && data_fake.flags & (SF_HAS_PAR|SF_IN_PAR))
1528 pars++;
405ff068 1529 if (data && (data_fake.flags & SF_HAS_EVAL))
c277df42 1530 data->flags |= SF_HAS_EVAL;
2c2d71f5
JH
1531 if (data)
1532 data->whilem_c = data_fake.whilem_c;
e1901655 1533 if (f & SCF_DO_STCLASS_AND) {
653099ff
GS
1534 int was = (data->start_class->flags & ANYOF_EOS);
1535
1536 cl_and(data->start_class, &intrnl);
1537 if (was)
1538 data->start_class->flags |= ANYOF_EOS;
1539 }
a0ed51b3
LW
1540 }
1541 else if (OP(scan) == OPEN) {
c277df42 1542 pars++;
a0ed51b3 1543 }
cb434fcc
IZ
1544 else if (OP(scan) == CLOSE) {
1545 if (ARG(scan) == is_par) {
1546 next = regnext(scan);
c277df42 1547
cb434fcc
IZ
1548 if ( next && (OP(next) != WHILEM) && next < last)
1549 is_par = 0; /* Disable optimization */
1550 }
1551 if (data)
1552 *(data->last_closep) = ARG(scan);
a0ed51b3
LW
1553 }
1554 else if (OP(scan) == EVAL) {
c277df42
IZ
1555 if (data)
1556 data->flags |= SF_HAS_EVAL;
1557 }
96776eda 1558 else if (OP(scan) == LOGICAL && scan->flags == 2) { /* Embedded follows */
0f5d15d6 1559 if (flags & SCF_DO_SUBSTR) {
830247a4 1560 scan_commit(pRExC_state,data);
0f5d15d6
IZ
1561 data->longest = &(data->longest_float);
1562 }
1563 is_inf = is_inf_internal = 1;
653099ff 1564 if (flags & SCF_DO_STCLASS_OR) /* Allow everything */
830247a4 1565 cl_anything(pRExC_state, data->start_class);
96776eda 1566 flags &= ~SCF_DO_STCLASS;
0f5d15d6 1567 }
c277df42
IZ
1568 /* Else: zero-length, ignore. */
1569 scan = regnext(scan);
1570 }
1571
1572 finish:
1573 *scanp = scan;
aca2d497 1574 *deltap = is_inf_internal ? I32_MAX : delta;
b81d288d 1575 if (flags & SCF_DO_SUBSTR && is_inf)
c277df42
IZ
1576 data->pos_delta = I32_MAX - data->pos_min;
1577 if (is_par > U8_MAX)
1578 is_par = 0;
1579 if (is_par && pars==1 && data) {
1580 data->flags |= SF_IN_PAR;
1581 data->flags &= ~SF_HAS_PAR;
a0ed51b3
LW
1582 }
1583 else if (pars && data) {
c277df42
IZ
1584 data->flags |= SF_HAS_PAR;
1585 data->flags &= ~SF_IN_PAR;
1586 }
653099ff
GS
1587 if (flags & SCF_DO_STCLASS_OR)
1588 cl_and(data->start_class, &and_with);
c277df42
IZ
1589 return min;
1590}
1591
76e3520e 1592STATIC I32
830247a4 1593S_add_data(pTHX_ RExC_state_t *pRExC_state, I32 n, char *s)
c277df42 1594{
830247a4 1595 if (RExC_rx->data) {
b81d288d
AB
1596 Renewc(RExC_rx->data,
1597 sizeof(*RExC_rx->data) + sizeof(void*) * (RExC_rx->data->count + n - 1),
c277df42 1598 char, struct reg_data);
830247a4
IZ
1599 Renew(RExC_rx->data->what, RExC_rx->data->count + n, U8);
1600 RExC_rx->data->count += n;
a0ed51b3
LW
1601 }
1602 else {
830247a4 1603 Newc(1207, RExC_rx->data, sizeof(*RExC_rx->data) + sizeof(void*) * (n - 1),
c277df42 1604 char, struct reg_data);
830247a4
IZ
1605 New(1208, RExC_rx->data->what, n, U8);
1606 RExC_rx->data->count = n;
c277df42 1607 }
830247a4
IZ
1608 Copy(s, RExC_rx->data->what + RExC_rx->data->count - n, n, U8);
1609 return RExC_rx->data->count - n;
c277df42
IZ
1610}
1611
d88dccdf 1612void
864dbfa3 1613Perl_reginitcolors(pTHX)
d88dccdf 1614{
d88dccdf
IZ
1615 int i = 0;
1616 char *s = PerlEnv_getenv("PERL_RE_COLORS");
b81d288d 1617
d88dccdf
IZ
1618 if (s) {
1619 PL_colors[0] = s = savepv(s);
1620 while (++i < 6) {
1621 s = strchr(s, '\t');
1622 if (s) {
1623 *s = '\0';
1624 PL_colors[i] = ++s;
1625 }
1626 else
c712d376 1627 PL_colors[i] = s = "";
d88dccdf
IZ
1628 }
1629 } else {
b81d288d 1630 while (i < 6)
d88dccdf
IZ
1631 PL_colors[i++] = "";
1632 }
1633 PL_colorset = 1;
1634}
1635
8615cb43 1636
a687059c 1637/*
e50aee73 1638 - pregcomp - compile a regular expression into internal code
a687059c
LW
1639 *
1640 * We can't allocate space until we know how big the compiled form will be,
1641 * but we can't compile it (and thus know how big it is) until we've got a
1642 * place to put the code. So we cheat: we compile it twice, once with code
1643 * generation turned off and size counting turned on, and once "for real".
1644 * This also means that we don't allocate space until we are sure that the
1645 * thing really will compile successfully, and we never have to move the
1646 * code and thus invalidate pointers into it. (Note that it has to be in
1647 * one piece because free() must be able to free it all.) [NB: not true in perl]
1648 *
1649 * Beware that the optimization-preparation code in here knows about some
1650 * of the structure of the compiled regexp. [I'll say.]
1651 */
1652regexp *
864dbfa3 1653Perl_pregcomp(pTHX_ char *exp, char *xend, PMOP *pm)
a687059c 1654{
a0d0e21e 1655 register regexp *r;
c277df42 1656 regnode *scan;
c277df42 1657 regnode *first;
a0d0e21e 1658 I32 flags;
a0d0e21e
LW
1659 I32 minlen = 0;
1660 I32 sawplus = 0;
1661 I32 sawopen = 0;
2c2d71f5 1662 scan_data_t data;
830247a4
IZ
1663 RExC_state_t RExC_state;
1664 RExC_state_t *pRExC_state = &RExC_state;
a0d0e21e
LW
1665
1666 if (exp == NULL)
c277df42 1667 FAIL("NULL regexp argument");
a0d0e21e 1668
84e09d5e
JH
1669 /* XXXX This looks very suspicious... */
1670 if (pm->op_pmdynflags & PMdf_CMP_UTF8)
1671 RExC_utf8 = 1;
1672 else
1673 RExC_utf8 = 0;
a0ed51b3 1674
5cfc7842 1675 RExC_precomp = exp;
35ef4773 1676 DEBUG_r(if (!PL_colorset) reginitcolors());
2c2d71f5 1677 DEBUG_r(PerlIO_printf(Perl_debug_log, "%sCompiling REx%s `%s%*s%s'\n",
d88dccdf 1678 PL_colors[4],PL_colors[5],PL_colors[0],
830247a4
IZ
1679 (int)(xend - exp), RExC_precomp, PL_colors[1]));
1680 RExC_flags16 = pm->op_pmflags;
1681 RExC_sawback = 0;
bbce6d69 1682
830247a4
IZ
1683 RExC_seen = 0;
1684 RExC_seen_zerolen = *exp == '^' ? -1 : 0;
1685 RExC_seen_evals = 0;
1686 RExC_extralen = 0;
c277df42 1687
bbce6d69 1688 /* First pass: determine size, legality. */
830247a4 1689 RExC_parse = exp;
fac92740 1690 RExC_start = exp;
830247a4
IZ
1691 RExC_end = xend;
1692 RExC_naughty = 0;
1693 RExC_npar = 1;
1694 RExC_size = 0L;
1695 RExC_emit = &PL_regdummy;
1696 RExC_whilem_seen = 0;
85ddcde9
JH
1697#if 0 /* REGC() is (currently) a NOP at the first pass.
1698 * Clever compilers notice this and complain. --jhi */
830247a4 1699 REGC((U8)REG_MAGIC, (char*)RExC_emit);
85ddcde9 1700#endif
830247a4 1701 if (reg(pRExC_state, 0, &flags) == NULL) {
830247a4 1702 RExC_precomp = Nullch;
a0d0e21e
LW
1703 return(NULL);
1704 }
830247a4 1705 DEBUG_r(PerlIO_printf(Perl_debug_log, "size %"IVdf" ", (IV)RExC_size));
c277df42 1706
c277df42
IZ
1707 /* Small enough for pointer-storage convention?
1708 If extralen==0, this means that we will not need long jumps. */
830247a4
IZ
1709 if (RExC_size >= 0x10000L && RExC_extralen)
1710 RExC_size += RExC_extralen;
c277df42 1711 else
830247a4
IZ
1712 RExC_extralen = 0;
1713 if (RExC_whilem_seen > 15)
1714 RExC_whilem_seen = 15;
a0d0e21e 1715
bbce6d69 1716 /* Allocate space and initialize. */
830247a4 1717 Newc(1001, r, sizeof(regexp) + (unsigned)RExC_size * sizeof(regnode),
c277df42 1718 char, regexp);
a0d0e21e 1719 if (r == NULL)
b45f050a
JF
1720 FAIL("Regexp out of space");
1721
0f79a09d
GS
1722#ifdef DEBUGGING
1723 /* avoid reading uninitialized memory in DEBUGGING code in study_chunk() */
830247a4 1724 Zero(r, sizeof(regexp) + (unsigned)RExC_size * sizeof(regnode), char);
0f79a09d 1725#endif
c277df42 1726 r->refcnt = 1;
bbce6d69 1727 r->prelen = xend - exp;
5cfc7842 1728 r->precomp = savepvn(RExC_precomp, r->prelen);
cf93c79d
IZ
1729 r->subbeg = NULL;
1730 r->reganch = pm->op_pmflags & PMf_COMPILETIME;
830247a4 1731 r->nparens = RExC_npar - 1; /* set early to validate backrefs */
4327152a
IZ
1732
1733 r->substrs = 0; /* Useful during FAIL. */
1734 r->startp = 0; /* Useful during FAIL. */
1735 r->endp = 0; /* Useful during FAIL. */
1736
fac92740
MJD
1737 Newz(1304, r->offsets, 2*RExC_size+1, U32); /* MJD 20001228 */
1738 if (r->offsets) {
1739 r->offsets[0] = RExC_size;
1740 }
1741 DEBUG_r(PerlIO_printf(Perl_debug_log,
392fbf5d 1742 "%s %"UVuf" bytes for offset annotations.\n",
fac92740 1743 r->offsets ? "Got" : "Couldn't get",
392fbf5d 1744 (UV)((2*RExC_size+1) * sizeof(U32))));
fac92740 1745
830247a4 1746 RExC_rx = r;
bbce6d69 1747
1748 /* Second pass: emit code. */
055bb491 1749 RExC_flags16 = pm->op_pmflags; /* don't let top level (?i) bleed */
830247a4
IZ
1750 RExC_parse = exp;
1751 RExC_end = xend;
1752 RExC_naughty = 0;
1753 RExC_npar = 1;
fac92740 1754 RExC_emit_start = r->program;
830247a4 1755 RExC_emit = r->program;
2cd61cdb 1756 /* Store the count of eval-groups for security checks: */
830247a4
IZ
1757 RExC_emit->next_off = ((RExC_seen_evals > U16_MAX) ? U16_MAX : RExC_seen_evals);
1758 REGC((U8)REG_MAGIC, (char*) RExC_emit++);
c277df42 1759 r->data = 0;
830247a4 1760 if (reg(pRExC_state, 0, &flags) == NULL)
a0d0e21e
LW
1761 return(NULL);
1762
1763 /* Dig out information for optimizations. */
cf93c79d 1764 r->reganch = pm->op_pmflags & PMf_COMPILETIME; /* Again? */
830247a4 1765 pm->op_pmflags = RExC_flags16;
a0ed51b3
LW
1766 if (UTF)
1767 r->reganch |= ROPT_UTF8;
c277df42 1768 r->regstclass = NULL;
830247a4 1769 if (RExC_naughty >= 10) /* Probably an expensive pattern. */
a0ed51b3 1770 r->reganch |= ROPT_NAUGHTY;
c277df42 1771 scan = r->program + 1; /* First BRANCH. */
2779dcf1
IZ
1772
1773 /* XXXX To minimize changes to RE engine we always allocate
1774 3-units-long substrs field. */
1775 Newz(1004, r->substrs, 1, struct reg_substr_data);
1776
2c2d71f5 1777 StructCopy(&zero_scan_data, &data, scan_data_t);
653099ff 1778 /* XXXX Should not we check for something else? Usually it is OPEN1... */
c277df42 1779 if (OP(scan) != BRANCH) { /* Only one top-level choice. */
c277df42 1780 I32 fake;
c5254dd6 1781 STRLEN longest_float_length, longest_fixed_length;
653099ff
GS
1782 struct regnode_charclass_class ch_class;
1783 int stclass_flag;
cb434fcc 1784 I32 last_close = 0;
a0d0e21e
LW
1785
1786 first = scan;
c277df42 1787 /* Skip introductions and multiplicators >= 1. */
a0d0e21e 1788 while ((OP(first) == OPEN && (sawopen = 1)) ||
653099ff 1789 /* An OR of *one* alternative - should not happen now. */
a0d0e21e
LW
1790 (OP(first) == BRANCH && OP(regnext(first)) != BRANCH) ||
1791 (OP(first) == PLUS) ||
1792 (OP(first) == MINMOD) ||
653099ff 1793 /* An {n,m} with n>0 */
22c35a8c 1794 (PL_regkind[(U8)OP(first)] == CURLY && ARG1(first) > 0) ) {
a0d0e21e
LW
1795 if (OP(first) == PLUS)
1796 sawplus = 1;
1797 else
1798 first += regarglen[(U8)OP(first)];
1799 first = NEXTOPER(first);
a687059c
LW
1800 }
1801
a0d0e21e
LW
1802 /* Starting-point info. */
1803 again:
653099ff 1804 if (PL_regkind[(U8)OP(first)] == EXACT) {
1aa99e6b
IH
1805 if (OP(first) == EXACT)
1806 ; /* Empty, get anchored substr later. */
1807 else if ((OP(first) == EXACTF || OP(first) == EXACTFL))
b3c9acc1
IZ
1808 r->regstclass = first;
1809 }
653099ff 1810 else if (strchr((char*)PL_simple,OP(first)))
a0d0e21e 1811 r->regstclass = first;
22c35a8c
GS
1812 else if (PL_regkind[(U8)OP(first)] == BOUND ||
1813 PL_regkind[(U8)OP(first)] == NBOUND)
a0d0e21e 1814 r->regstclass = first;
22c35a8c 1815 else if (PL_regkind[(U8)OP(first)] == BOL) {
cad2e5aa
JH
1816 r->reganch |= (OP(first) == MBOL
1817 ? ROPT_ANCH_MBOL
1818 : (OP(first) == SBOL
1819 ? ROPT_ANCH_SBOL
1820 : ROPT_ANCH_BOL));
a0d0e21e 1821 first = NEXTOPER(first);
774d564b 1822 goto again;
1823 }
1824 else if (OP(first) == GPOS) {
1825 r->reganch |= ROPT_ANCH_GPOS;
1826 first = NEXTOPER(first);
1827 goto again;
a0d0e21e
LW
1828 }
1829 else if ((OP(first) == STAR &&
22c35a8c 1830 PL_regkind[(U8)OP(NEXTOPER(first))] == REG_ANY) &&
a0d0e21e
LW
1831 !(r->reganch & ROPT_ANCH) )
1832 {
1833 /* turn .* into ^.* with an implied $*=1 */
cad2e5aa
JH
1834 int type = OP(NEXTOPER(first));
1835
ffc61ed2 1836 if (type == REG_ANY)
cad2e5aa
JH
1837 type = ROPT_ANCH_MBOL;
1838 else
1839 type = ROPT_ANCH_SBOL;
1840
1841 r->reganch |= type | ROPT_IMPLICIT;
a0d0e21e 1842 first = NEXTOPER(first);
774d564b 1843 goto again;
a0d0e21e 1844 }
b81d288d 1845 if (sawplus && (!sawopen || !RExC_sawback)
830247a4 1846 && !(RExC_seen & REG_SEEN_EVAL)) /* May examine pos and $& */
cad2e5aa
JH
1847 /* x+ must match at the 1st pos of run of x's */
1848 r->reganch |= ROPT_SKIP;
a0d0e21e 1849
c277df42 1850 /* Scan is after the zeroth branch, first is atomic matcher. */
b81d288d 1851 DEBUG_r(PerlIO_printf(Perl_debug_log, "first at %"IVdf"\n",
b900a521 1852 (IV)(first - scan + 1)));
a0d0e21e
LW
1853 /*
1854 * If there's something expensive in the r.e., find the
1855 * longest literal string that must appear and make it the
1856 * regmust. Resolve ties in favor of later strings, since
1857 * the regstart check works with the beginning of the r.e.
1858 * and avoiding duplication strengthens checking. Not a
1859 * strong reason, but sufficient in the absence of others.
1860 * [Now we resolve ties in favor of the earlier string if
c277df42 1861 * it happens that c_offset_min has been invalidated, since the
a0d0e21e
LW
1862 * earlier string may buy us something the later one won't.]
1863 */
a0d0e21e 1864 minlen = 0;
a687059c 1865
79cb57f6
GS
1866 data.longest_fixed = newSVpvn("",0);
1867 data.longest_float = newSVpvn("",0);
1868 data.last_found = newSVpvn("",0);
c277df42
IZ
1869 data.longest = &(data.longest_fixed);
1870 first = scan;
653099ff 1871 if (!r->regstclass) {
830247a4 1872 cl_init(pRExC_state, &ch_class);
653099ff
GS
1873 data.start_class = &ch_class;
1874 stclass_flag = SCF_DO_STCLASS_AND;
1875 } else /* XXXX Check for BOUND? */
1876 stclass_flag = 0;
cb434fcc 1877 data.last_closep = &last_close;
653099ff 1878
830247a4 1879 minlen = study_chunk(pRExC_state, &first, &fake, scan + RExC_size, /* Up to end */
e1901655 1880 &data, SCF_DO_SUBSTR | SCF_WHILEM_VISITED_POS | stclass_flag);
830247a4 1881 if ( RExC_npar == 1 && data.longest == &(data.longest_fixed)
b81d288d 1882 && data.last_start_min == 0 && data.last_end > 0
830247a4
IZ
1883 && !RExC_seen_zerolen
1884 && (!(RExC_seen & REG_SEEN_GPOS) || (r->reganch & ROPT_ANCH_GPOS)))
c277df42 1885 r->reganch |= ROPT_CHECK_ALL;
830247a4 1886 scan_commit(pRExC_state, &data);
c277df42
IZ
1887 SvREFCNT_dec(data.last_found);
1888
a0ed51b3 1889 longest_float_length = CHR_SVLEN(data.longest_float);
c5254dd6 1890 if (longest_float_length
c277df42
IZ
1891 || (data.flags & SF_FL_BEFORE_EOL
1892 && (!(data.flags & SF_FL_BEFORE_MEOL)
830247a4 1893 || (RExC_flags16 & PMf_MULTILINE)))) {
cf93c79d
IZ
1894 int t;
1895
a0ed51b3 1896 if (SvCUR(data.longest_fixed) /* ok to leave SvCUR */
aca2d497
IZ
1897 && data.offset_fixed == data.offset_float_min
1898 && SvCUR(data.longest_fixed) == SvCUR(data.longest_float))
1899 goto remove_float; /* As in (a)+. */
1900
c277df42
IZ
1901 r->float_substr = data.longest_float;
1902 r->float_min_offset = data.offset_float_min;
1903 r->float_max_offset = data.offset_float_max;
cf93c79d
IZ
1904 t = (data.flags & SF_FL_BEFORE_EOL /* Can't have SEOL and MULTI */
1905 && (!(data.flags & SF_FL_BEFORE_MEOL)
830247a4 1906 || (RExC_flags16 & PMf_MULTILINE)));
cf93c79d 1907 fbm_compile(r->float_substr, t ? FBMcf_TAIL : 0);
a0ed51b3
LW
1908 }
1909 else {
aca2d497 1910 remove_float:
c277df42
IZ
1911 r->float_substr = Nullsv;
1912 SvREFCNT_dec(data.longest_float);
c5254dd6 1913 longest_float_length = 0;
a0d0e21e 1914 }
c277df42 1915
a0ed51b3 1916 longest_fixed_length = CHR_SVLEN(data.longest_fixed);
c5254dd6 1917 if (longest_fixed_length
c277df42
IZ
1918 || (data.flags & SF_FIX_BEFORE_EOL /* Cannot have SEOL and MULTI */
1919 && (!(data.flags & SF_FIX_BEFORE_MEOL)
830247a4 1920 || (RExC_flags16 & PMf_MULTILINE)))) {
cf93c79d
IZ
1921 int t;
1922
c277df42
IZ
1923 r->anchored_substr = data.longest_fixed;
1924 r->anchored_offset = data.offset_fixed;
cf93c79d
IZ
1925 t = (data.flags & SF_FIX_BEFORE_EOL /* Can't have SEOL and MULTI */
1926 && (!(data.flags & SF_FIX_BEFORE_MEOL)
830247a4 1927 || (RExC_flags16 & PMf_MULTILINE)));
cf93c79d 1928 fbm_compile(r->anchored_substr, t ? FBMcf_TAIL : 0);
a0ed51b3
LW
1929 }
1930 else {
c277df42
IZ
1931 r->anchored_substr = Nullsv;
1932 SvREFCNT_dec(data.longest_fixed);
c5254dd6 1933 longest_fixed_length = 0;
a0d0e21e 1934 }
b81d288d 1935 if (r->regstclass
ffc61ed2 1936 && (OP(r->regstclass) == REG_ANY || OP(r->regstclass) == SANY))
653099ff
GS
1937 r->regstclass = NULL;
1938 if ((!r->anchored_substr || r->anchored_offset) && stclass_flag
1939 && !(data.start_class->flags & ANYOF_EOS)
1940 && !cl_is_anything(data.start_class)) {
830247a4 1941 I32 n = add_data(pRExC_state, 1, "f");
653099ff 1942
b81d288d 1943 New(1006, RExC_rx->data->data[n], 1,
653099ff
GS
1944 struct regnode_charclass_class);
1945 StructCopy(data.start_class,
830247a4 1946 (struct regnode_charclass_class*)RExC_rx->data->data[n],
653099ff 1947 struct regnode_charclass_class);
830247a4 1948 r->regstclass = (regnode*)RExC_rx->data->data[n];
653099ff 1949 r->reganch &= ~ROPT_SKIP; /* Used in find_byclass(). */
ffc61ed2 1950 PL_regdata = r->data; /* for regprop() */
9c5ffd7c
JH
1951 DEBUG_r({ SV *sv = sv_newmortal();
1952 regprop(sv, (regnode*)data.start_class);
1953 PerlIO_printf(Perl_debug_log,
1954 "synthetic stclass `%s'.\n",
1955 SvPVX(sv));});
653099ff 1956 }
c277df42
IZ
1957
1958 /* A temporary algorithm prefers floated substr to fixed one to dig more info. */
c5254dd6 1959 if (longest_fixed_length > longest_float_length) {
c277df42
IZ
1960 r->check_substr = r->anchored_substr;
1961 r->check_offset_min = r->check_offset_max = r->anchored_offset;
1962 if (r->reganch & ROPT_ANCH_SINGLE)
1963 r->reganch |= ROPT_NOSCAN;
a0ed51b3
LW
1964 }
1965 else {
c277df42
IZ
1966 r->check_substr = r->float_substr;
1967 r->check_offset_min = data.offset_float_min;
1968 r->check_offset_max = data.offset_float_max;
a0d0e21e 1969 }
30382c73
IZ
1970 /* XXXX Currently intuiting is not compatible with ANCH_GPOS.
1971 This should be changed ASAP! */
1972 if (r->check_substr && !(r->reganch & ROPT_ANCH_GPOS)) {
cad2e5aa
JH
1973 r->reganch |= RE_USE_INTUIT;
1974 if (SvTAIL(r->check_substr))
1975 r->reganch |= RE_INTUIT_TAIL;
1976 }
a0ed51b3
LW
1977 }
1978 else {
c277df42
IZ
1979 /* Several toplevels. Best we can is to set minlen. */
1980 I32 fake;
653099ff 1981 struct regnode_charclass_class ch_class;
cb434fcc 1982 I32 last_close = 0;
c277df42
IZ
1983
1984 DEBUG_r(PerlIO_printf(Perl_debug_log, "\n"));
1985 scan = r->program + 1;
830247a4 1986 cl_init(pRExC_state, &ch_class);
653099ff 1987 data.start_class = &ch_class;
cb434fcc 1988 data.last_closep = &last_close;
e1901655 1989 minlen = study_chunk(pRExC_state, &scan, &fake, scan + RExC_size, &data, SCF_DO_STCLASS_AND|SCF_WHILEM_VISITED_POS);
c277df42 1990 r->check_substr = r->anchored_substr = r->float_substr = Nullsv;
653099ff
GS
1991 if (!(data.start_class->flags & ANYOF_EOS)
1992 && !cl_is_anything(data.start_class)) {
830247a4 1993 I32 n = add_data(pRExC_state, 1, "f");
653099ff 1994
b81d288d 1995 New(1006, RExC_rx->data->data[n], 1,
653099ff
GS
1996 struct regnode_charclass_class);
1997 StructCopy(data.start_class,
830247a4 1998 (struct regnode_charclass_class*)RExC_rx->data->data[n],
653099ff 1999 struct regnode_charclass_class);
830247a4 2000 r->regstclass = (regnode*)RExC_rx->data->data[n];
653099ff 2001 r->reganch &= ~ROPT_SKIP; /* Used in find_byclass(). */
9c5ffd7c
JH
2002 DEBUG_r({ SV* sv = sv_newmortal();
2003 regprop(sv, (regnode*)data.start_class);
2004 PerlIO_printf(Perl_debug_log,
2005 "synthetic stclass `%s'.\n",
2006 SvPVX(sv));});
653099ff 2007 }
a0d0e21e
LW
2008 }
2009
a0d0e21e 2010 r->minlen = minlen;
b81d288d 2011 if (RExC_seen & REG_SEEN_GPOS)
c277df42 2012 r->reganch |= ROPT_GPOS_SEEN;
830247a4 2013 if (RExC_seen & REG_SEEN_LOOKBEHIND)
c277df42 2014 r->reganch |= ROPT_LOOKBEHIND_SEEN;
830247a4 2015 if (RExC_seen & REG_SEEN_EVAL)
ce862d02 2016 r->reganch |= ROPT_EVAL_SEEN;
f33976b4
DB
2017 if (RExC_seen & REG_SEEN_CANY)
2018 r->reganch |= ROPT_CANY_SEEN;
830247a4
IZ
2019 Newz(1002, r->startp, RExC_npar, I32);
2020 Newz(1002, r->endp, RExC_npar, I32);
ffc61ed2 2021 PL_regdata = r->data; /* for regprop() */
a0d0e21e
LW
2022 DEBUG_r(regdump(r));
2023 return(r);
a687059c
LW
2024}
2025
2026/*
2027 - reg - regular expression, i.e. main body or parenthesized thing
2028 *
2029 * Caller must absorb opening parenthesis.
2030 *
2031 * Combining parenthesis handling with the base level of regular expression
2032 * is a trifle forced, but the need to tie the tails of the branches to what
2033 * follows makes it hard to avoid.
2034 */
76e3520e 2035STATIC regnode *
830247a4 2036S_reg(pTHX_ RExC_state_t *pRExC_state, I32 paren, I32 *flagp)
c277df42 2037 /* paren: Parenthesized? 0=top, 1=(, inside: changed to letter. */
a687059c 2038{
c277df42
IZ
2039 register regnode *ret; /* Will be the head of the group. */
2040 register regnode *br;
2041 register regnode *lastbr;
2042 register regnode *ender = 0;
a0d0e21e 2043 register I32 parno = 0;
830247a4 2044 I32 flags, oregflags = RExC_flags16, have_branch = 0, open = 0;
9d1d55b5
JP
2045
2046 /* for (?g), (?gc), and (?o) warnings; warning
2047 about (?c) will warn about (?g) -- japhy */
2048
2049 I32 wastedflags = 0x00,
2050 wasted_o = 0x01,
2051 wasted_g = 0x02,
2052 wasted_gc = 0x02 | 0x04,
2053 wasted_c = 0x04;
2054
fac92740 2055 char * parse_start = RExC_parse; /* MJD */
830247a4 2056 char *oregcomp_parse = RExC_parse;
c277df42 2057 char c;
a0d0e21e 2058
821b33a5 2059 *flagp = 0; /* Tentatively. */
a0d0e21e 2060
9d1d55b5 2061
a0d0e21e
LW
2062 /* Make an OPEN node, if parenthesized. */
2063 if (paren) {
fac92740 2064 if (*RExC_parse == '?') { /* (?...) */
ca9dfc88
IZ
2065 U16 posflags = 0, negflags = 0;
2066 U16 *flagsp = &posflags;
0f5d15d6 2067 int logical = 0;
830247a4 2068 char *seqstart = RExC_parse;
ca9dfc88 2069
830247a4
IZ
2070 RExC_parse++;
2071 paren = *RExC_parse++;
c277df42 2072 ret = NULL; /* For look-ahead/behind. */
a0d0e21e 2073 switch (paren) {
fac92740 2074 case '<': /* (?<...) */
830247a4 2075 RExC_seen |= REG_SEEN_LOOKBEHIND;
b81d288d 2076 if (*RExC_parse == '!')
c277df42 2077 paren = ',';
b81d288d 2078 if (*RExC_parse != '=' && *RExC_parse != '!')
c277df42 2079 goto unknown;
830247a4 2080 RExC_parse++;
fac92740
MJD
2081 case '=': /* (?=...) */
2082 case '!': /* (?!...) */
830247a4 2083 RExC_seen_zerolen++;
fac92740
MJD
2084 case ':': /* (?:...) */
2085 case '>': /* (?>...) */
a0d0e21e 2086 break;
fac92740
MJD
2087 case '$': /* (?$...) */
2088 case '@': /* (?@...) */
8615cb43 2089 vFAIL2("Sequence (?%c...) not implemented", (int)paren);
a0d0e21e 2090 break;
fac92740 2091 case '#': /* (?#...) */
830247a4
IZ
2092 while (*RExC_parse && *RExC_parse != ')')
2093 RExC_parse++;
2094 if (*RExC_parse != ')')
c277df42 2095 FAIL("Sequence (?#... not terminated");
830247a4 2096 nextchar(pRExC_state);
a0d0e21e
LW
2097 *flagp = TRYAGAIN;
2098 return NULL;
fac92740 2099 case 'p': /* (?p...) */
f9373011
PM
2100 if (SIZE_ONLY && ckWARN2(WARN_DEPRECATED, WARN_REGEXP))
2101 vWARNdep(RExC_parse, "(?p{}) is deprecated - use (??{})");
8c8ad484 2102 /* FALL THROUGH*/
fac92740 2103 case '?': /* (??...) */
0f5d15d6 2104 logical = 1;
830247a4 2105 paren = *RExC_parse++;
0f5d15d6 2106 /* FALL THROUGH */
fac92740 2107 case '{': /* (?{...}) */
c277df42 2108 {
c277df42
IZ
2109 I32 count = 1, n = 0;
2110 char c;
830247a4 2111 char *s = RExC_parse;
c277df42
IZ
2112 SV *sv;
2113 OP_4tree *sop, *rop;
2114
830247a4
IZ
2115 RExC_seen_zerolen++;
2116 RExC_seen |= REG_SEEN_EVAL;
2117 while (count && (c = *RExC_parse)) {
2118 if (c == '\\' && RExC_parse[1])
2119 RExC_parse++;
b81d288d 2120 else if (c == '{')
c277df42 2121 count++;
b81d288d 2122 else if (c == '}')
c277df42 2123 count--;
830247a4 2124 RExC_parse++;
c277df42 2125 }
830247a4 2126 if (*RExC_parse != ')')
b45f050a 2127 {
b81d288d 2128 RExC_parse = s;
b45f050a
JF
2129 vFAIL("Sequence (?{...}) not terminated or not {}-balanced");
2130 }
c277df42
IZ
2131 if (!SIZE_ONLY) {
2132 AV *av;
b81d288d
AB
2133
2134 if (RExC_parse - 1 - s)
830247a4 2135 sv = newSVpvn(s, RExC_parse - 1 - s);
c277df42 2136 else
79cb57f6 2137 sv = newSVpvn("", 0);
c277df42 2138
569233ed
SB
2139 ENTER;
2140 Perl_save_re_context(aTHX);
c277df42 2141 rop = sv_compile_2op(sv, &sop, "re", &av);
9b978d73
DM
2142 sop->op_private |= OPpREFCOUNTED;
2143 /* re_dup will OpREFCNT_inc */
2144 OpREFCNT_set(sop, 1);
569233ed 2145 LEAVE;
c277df42 2146
830247a4
IZ
2147 n = add_data(pRExC_state, 3, "nop");
2148 RExC_rx->data->data[n] = (void*)rop;
2149 RExC_rx->data->data[n+1] = (void*)sop;
2150 RExC_rx->data->data[n+2] = (void*)av;
c277df42 2151 SvREFCNT_dec(sv);
a0ed51b3 2152 }
e24b16f9 2153 else { /* First pass */
830247a4 2154 if (PL_reginterp_cnt < ++RExC_seen_evals
e24b16f9 2155 && PL_curcop != &PL_compiling)
2cd61cdb
IZ
2156 /* No compiled RE interpolated, has runtime
2157 components ===> unsafe. */
2158 FAIL("Eval-group not allowed at runtime, use re 'eval'");
5b61d3f7 2159 if (PL_tainting && PL_tainted)
cc6b7395 2160 FAIL("Eval-group in insecure regular expression");
c277df42
IZ
2161 }
2162
830247a4 2163 nextchar(pRExC_state);
0f5d15d6 2164 if (logical) {
830247a4 2165 ret = reg_node(pRExC_state, LOGICAL);
0f5d15d6
IZ
2166 if (!SIZE_ONLY)
2167 ret->flags = 2;
830247a4 2168 regtail(pRExC_state, ret, reganode(pRExC_state, EVAL, n));
fac92740 2169 /* deal with the length of this later - MJD */
0f5d15d6
IZ
2170 return ret;
2171 }
830247a4 2172 return reganode(pRExC_state, EVAL, n);
c277df42 2173 }
fac92740 2174 case '(': /* (?(?{...})...) and (?(?=...)...) */
c277df42 2175 {
fac92740 2176 if (RExC_parse[0] == '?') { /* (?(?...)) */
b81d288d
AB
2177 if (RExC_parse[1] == '=' || RExC_parse[1] == '!'
2178 || RExC_parse[1] == '<'
830247a4 2179 || RExC_parse[1] == '{') { /* Lookahead or eval. */
c277df42
IZ
2180 I32 flag;
2181
830247a4 2182 ret = reg_node(pRExC_state, LOGICAL);
0f5d15d6
IZ
2183 if (!SIZE_ONLY)
2184 ret->flags = 1;
830247a4 2185 regtail(pRExC_state, ret, reg(pRExC_state, 1, &flag));
c277df42 2186 goto insert_if;
b81d288d 2187 }
a0ed51b3 2188 }
830247a4 2189 else if (RExC_parse[0] >= '1' && RExC_parse[0] <= '9' ) {
fac92740 2190 /* (?(1)...) */
830247a4 2191 parno = atoi(RExC_parse++);
c277df42 2192
830247a4
IZ
2193 while (isDIGIT(*RExC_parse))
2194 RExC_parse++;
fac92740
MJD
2195 ret = reganode(pRExC_state, GROUPP, parno);
2196
830247a4 2197 if ((c = *nextchar(pRExC_state)) != ')')
b45f050a 2198 vFAIL("Switch condition not recognized");
c277df42 2199 insert_if:
830247a4
IZ
2200 regtail(pRExC_state, ret, reganode(pRExC_state, IFTHEN, 0));
2201 br = regbranch(pRExC_state, &flags, 1);
c277df42 2202 if (br == NULL)
830247a4 2203 br = reganode(pRExC_state, LONGJMP, 0);
c277df42 2204 else
830247a4
IZ
2205 regtail(pRExC_state, br, reganode(pRExC_state, LONGJMP, 0));
2206 c = *nextchar(pRExC_state);
d1b80229
IZ
2207 if (flags&HASWIDTH)
2208 *flagp |= HASWIDTH;
c277df42 2209 if (c == '|') {
830247a4
IZ
2210 lastbr = reganode(pRExC_state, IFTHEN, 0); /* Fake one for optimizer. */
2211 regbranch(pRExC_state, &flags, 1);
2212 regtail(pRExC_state, ret, lastbr);
d1b80229
IZ
2213 if (flags&HASWIDTH)
2214 *flagp |= HASWIDTH;
830247a4 2215 c = *nextchar(pRExC_state);
a0ed51b3
LW
2216 }
2217 else
c277df42
IZ
2218 lastbr = NULL;
2219 if (c != ')')
8615cb43 2220 vFAIL("Switch (?(condition)... contains too many branches");
830247a4
IZ
2221 ender = reg_node(pRExC_state, TAIL);
2222 regtail(pRExC_state, br, ender);
c277df42 2223 if (lastbr) {
830247a4
IZ
2224 regtail(pRExC_state, lastbr, ender);
2225 regtail(pRExC_state, NEXTOPER(NEXTOPER(lastbr)), ender);
a0ed51b3
LW
2226 }
2227 else
830247a4 2228 regtail(pRExC_state, ret, ender);
c277df42 2229 return ret;
a0ed51b3
LW
2230 }
2231 else {
830247a4 2232 vFAIL2("Unknown switch condition (?(%.2s", RExC_parse);
c277df42
IZ
2233 }
2234 }
1b1626e4 2235 case 0:
830247a4 2236 RExC_parse--; /* for vFAIL to print correctly */
8615cb43 2237 vFAIL("Sequence (? incomplete");
1b1626e4 2238 break;
a0d0e21e 2239 default:
830247a4 2240 --RExC_parse;
fac92740 2241 parse_flags: /* (?i) */
830247a4 2242 while (*RExC_parse && strchr("iogcmsx", *RExC_parse)) {
9d1d55b5
JP
2243 /* (?g), (?gc) and (?o) are useless here
2244 and must be globally applied -- japhy */
2245
2246 if (*RExC_parse == 'o' || *RExC_parse == 'g') {
2247 if (SIZE_ONLY && ckWARN(WARN_REGEXP)) {
2248 I32 wflagbit = *RExC_parse == 'o' ? wasted_o : wasted_g;
2249 if (! (wastedflags & wflagbit) ) {
2250 wastedflags |= wflagbit;
2251 vWARN5(
2252 RExC_parse + 1,
2253 "Useless (%s%c) - %suse /%c modifier",
2254 flagsp == &negflags ? "?-" : "?",
2255 *RExC_parse,
2256 flagsp == &negflags ? "don't " : "",
2257 *RExC_parse
2258 );
2259 }
2260 }
2261 }
2262 else if (*RExC_parse == 'c') {
2263 if (SIZE_ONLY && ckWARN(WARN_REGEXP)) {
2264 if (! (wastedflags & wasted_c) ) {
2265 wastedflags |= wasted_gc;
2266 vWARN3(
2267 RExC_parse + 1,
2268 "Useless (%sc) - %suse /gc modifier",
2269 flagsp == &negflags ? "?-" : "?",
2270 flagsp == &negflags ? "don't " : ""
2271 );
2272 }
2273 }
2274 }
2275 else { pmflag(flagsp, *RExC_parse); }
2276
830247a4 2277 ++RExC_parse;
ca9dfc88 2278 }
830247a4 2279 if (*RExC_parse == '-') {
ca9dfc88 2280 flagsp = &negflags;
9d1d55b5 2281 wastedflags = 0; /* reset so (?g-c) warns twice */
830247a4 2282 ++RExC_parse;
ca9dfc88 2283 goto parse_flags;
48c036b1 2284 }
830247a4
IZ
2285 RExC_flags16 |= posflags;
2286 RExC_flags16 &= ~negflags;
2287 if (*RExC_parse == ':') {
2288 RExC_parse++;
ca9dfc88
IZ
2289 paren = ':';
2290 break;
2291 }
c277df42 2292 unknown:
830247a4
IZ
2293 if (*RExC_parse != ')') {
2294 RExC_parse++;
2295 vFAIL3("Sequence (%.*s...) not recognized", RExC_parse-seqstart, seqstart);
b45f050a 2296 }
830247a4 2297 nextchar(pRExC_state);
a0d0e21e
LW
2298 *flagp = TRYAGAIN;
2299 return NULL;
2300 }
2301 }
fac92740 2302 else { /* (...) */
830247a4
IZ
2303 parno = RExC_npar;
2304 RExC_npar++;
2305 ret = reganode(pRExC_state, OPEN, parno);
fac92740
MJD
2306 Set_Node_Length(ret, 1); /* MJD */
2307 Set_Node_Offset(ret, RExC_parse); /* MJD */
c277df42 2308 open = 1;
a0d0e21e 2309 }
a0ed51b3 2310 }
fac92740 2311 else /* ! paren */
a0d0e21e
LW
2312 ret = NULL;
2313
2314 /* Pick up the branches, linking them together. */
fac92740 2315 parse_start = RExC_parse; /* MJD */
830247a4 2316 br = regbranch(pRExC_state, &flags, 1);
fac92740
MJD
2317 /* branch_len = (paren != 0); */
2318
a0d0e21e
LW
2319 if (br == NULL)
2320 return(NULL);
830247a4
IZ
2321 if (*RExC_parse == '|') {
2322 if (!SIZE_ONLY && RExC_extralen) {
2323 reginsert(pRExC_state, BRANCHJ, br);
a0ed51b3 2324 }
fac92740 2325 else { /* MJD */
830247a4 2326 reginsert(pRExC_state, BRANCH, br);
fac92740
MJD
2327 Set_Node_Length(br, paren != 0);
2328 Set_Node_Offset_To_R(br-RExC_emit_start, parse_start-RExC_start);
2329 }
c277df42
IZ
2330 have_branch = 1;
2331 if (SIZE_ONLY)
830247a4 2332 RExC_extralen += 1; /* For BRANCHJ-BRANCH. */
a0ed51b3
LW
2333 }
2334 else if (paren == ':') {
c277df42
IZ
2335 *flagp |= flags&SIMPLE;
2336 }
2337 if (open) { /* Starts with OPEN. */
830247a4 2338 regtail(pRExC_state, ret, br); /* OPEN -> first. */
a0ed51b3
LW
2339 }
2340 else if (paren != '?') /* Not Conditional */
a0d0e21e 2341 ret = br;
821b33a5
IZ
2342 if (flags&HASWIDTH)
2343 *flagp |= HASWIDTH;
a0d0e21e 2344 *flagp |= flags&SPSTART;
c277df42 2345 lastbr = br;
830247a4
IZ
2346 while (*RExC_parse == '|') {
2347 if (!SIZE_ONLY && RExC_extralen) {
2348 ender = reganode(pRExC_state, LONGJMP,0);
2349 regtail(pRExC_state, NEXTOPER(NEXTOPER(lastbr)), ender); /* Append to the previous. */
c277df42
IZ
2350 }
2351 if (SIZE_ONLY)
830247a4
IZ
2352 RExC_extralen += 2; /* Account for LONGJMP. */
2353 nextchar(pRExC_state);
2354 br = regbranch(pRExC_state, &flags, 0);
fac92740 2355
a687059c 2356 if (br == NULL)
a0d0e21e 2357 return(NULL);
830247a4 2358 regtail(pRExC_state, lastbr, br); /* BRANCH -> BRANCH. */
c277df42 2359 lastbr = br;
821b33a5
IZ
2360 if (flags&HASWIDTH)
2361 *flagp |= HASWIDTH;
a687059c 2362 *flagp |= flags&SPSTART;
a0d0e21e
LW
2363 }
2364
c277df42
IZ
2365 if (have_branch || paren != ':') {
2366 /* Make a closing node, and hook it on the end. */
2367 switch (paren) {
2368 case ':':
830247a4 2369 ender = reg_node(pRExC_state, TAIL);
c277df42
IZ
2370 break;
2371 case 1:
830247a4 2372 ender = reganode(pRExC_state, CLOSE, parno);
fac92740
MJD
2373 Set_Node_Offset(ender,RExC_parse+1); /* MJD */
2374 Set_Node_Length(ender,1); /* MJD */
c277df42
IZ
2375 break;
2376 case '<':
c277df42
IZ
2377 case ',':
2378 case '=':
2379 case '!':
c277df42 2380 *flagp &= ~HASWIDTH;
821b33a5
IZ
2381 /* FALL THROUGH */
2382 case '>':
830247a4 2383 ender = reg_node(pRExC_state, SUCCEED);
c277df42
IZ
2384 break;
2385 case 0:
830247a4 2386 ender = reg_node(pRExC_state, END);
c277df42
IZ
2387 break;
2388 }
830247a4 2389 regtail(pRExC_state, lastbr, ender);
a0d0e21e 2390
c277df42
IZ
2391 if (have_branch) {
2392 /* Hook the tails of the branches to the closing node. */
2393 for (br = ret; br != NULL; br = regnext(br)) {
830247a4 2394 regoptail(pRExC_state, br, ender);
c277df42
IZ
2395 }
2396 }
a0d0e21e 2397 }
c277df42
IZ
2398
2399 {
2400 char *p;
2401 static char parens[] = "=!<,>";
2402
2403 if (paren && (p = strchr(parens, paren))) {
2404 int node = ((p - parens) % 2) ? UNLESSM : IFMATCH;
2405 int flag = (p - parens) > 1;
2406
2407 if (paren == '>')
2408 node = SUSPEND, flag = 0;
830247a4 2409 reginsert(pRExC_state, node,ret);
c277df42 2410 ret->flags = flag;
830247a4 2411 regtail(pRExC_state, ret, reg_node(pRExC_state, TAIL));
c277df42 2412 }
a0d0e21e
LW
2413 }
2414
2415 /* Check for proper termination. */
ce3e6498 2416 if (paren) {
830247a4
IZ
2417 RExC_flags16 = oregflags;
2418 if (RExC_parse >= RExC_end || *nextchar(pRExC_state) != ')') {
2419 RExC_parse = oregcomp_parse;
380a0633 2420 vFAIL("Unmatched (");
ce3e6498 2421 }
a0ed51b3 2422 }
830247a4
IZ
2423 else if (!paren && RExC_parse < RExC_end) {
2424 if (*RExC_parse == ')') {
2425 RExC_parse++;
380a0633 2426 vFAIL("Unmatched )");
a0ed51b3
LW
2427 }
2428 else
b45f050a 2429 FAIL("Junk on end of regexp"); /* "Can't happen". */
a0d0e21e
LW
2430 /* NOTREACHED */
2431 }
a687059c 2432
a0d0e21e 2433 return(ret);
a687059c
LW
2434}
2435
2436/*
2437 - regbranch - one alternative of an | operator
2438 *
2439 * Implements the concatenation operator.
2440 */
76e3520e 2441STATIC regnode *
830247a4 2442S_regbranch(pTHX_ RExC_state_t *pRExC_state, I32 *flagp, I32 first)
a687059c 2443{
c277df42
IZ
2444 register regnode *ret;
2445 register regnode *chain = NULL;
2446 register regnode *latest;
2447 I32 flags = 0, c = 0;
a0d0e21e 2448
b81d288d 2449 if (first)
c277df42
IZ
2450 ret = NULL;
2451 else {
b81d288d 2452 if (!SIZE_ONLY && RExC_extralen)
830247a4 2453 ret = reganode(pRExC_state, BRANCHJ,0);
fac92740 2454 else {
830247a4 2455 ret = reg_node(pRExC_state, BRANCH);
fac92740
MJD
2456 Set_Node_Length(ret, 1);
2457 }
c277df42
IZ
2458 }
2459
b81d288d 2460 if (!first && SIZE_ONLY)
830247a4 2461 RExC_extralen += 1; /* BRANCHJ */
b81d288d 2462
c277df42 2463 *flagp = WORST; /* Tentatively. */
a0d0e21e 2464
830247a4
IZ
2465 RExC_parse--;
2466 nextchar(pRExC_state);
2467 while (RExC_parse < RExC_end && *RExC_parse != '|' && *RExC_parse != ')') {
a0d0e21e 2468 flags &= ~TRYAGAIN;
830247a4 2469 latest = regpiece(pRExC_state, &flags);
a0d0e21e
LW
2470 if (latest == NULL) {
2471 if (flags & TRYAGAIN)
2472 continue;
2473 return(NULL);
a0ed51b3
LW
2474 }
2475 else if (ret == NULL)
c277df42 2476 ret = latest;
a0d0e21e 2477 *flagp |= flags&HASWIDTH;
c277df42 2478 if (chain == NULL) /* First piece. */
a0d0e21e
LW
2479 *flagp |= flags&SPSTART;
2480 else {
830247a4
IZ
2481 RExC_naughty++;
2482 regtail(pRExC_state, chain, latest);
a687059c 2483 }
a0d0e21e 2484 chain = latest;
c277df42
IZ
2485 c++;
2486 }
2487 if (chain == NULL) { /* Loop ran zero times. */
830247a4 2488 chain = reg_node(pRExC_state, NOTHING);
c277df42
IZ
2489 if (ret == NULL)
2490 ret = chain;
2491 }
2492 if (c == 1) {
2493 *flagp |= flags&SIMPLE;
a0d0e21e 2494 }
a687059c 2495
a0d0e21e 2496 return(ret);
a687059c
LW
2497}
2498
2499/*
2500 - regpiece - something followed by possible [*+?]
2501 *
2502 * Note that the branching code sequences used for ? and the general cases
2503 * of * and + are somewhat optimized: they use the same NOTHING node as
2504 * both the endmarker for their branch list and the body of the last branch.
2505 * It might seem that this node could be dispensed with entirely, but the
2506 * endmarker role is not redundant.
2507 */
76e3520e 2508STATIC regnode *
830247a4 2509S_regpiece(pTHX_ RExC_state_t *pRExC_state, I32 *flagp)
a687059c 2510{
c277df42 2511 register regnode *ret;
a0d0e21e
LW
2512 register char op;
2513 register char *next;
2514 I32 flags;
830247a4 2515 char *origparse = RExC_parse;
a0d0e21e
LW
2516 char *maxpos;
2517 I32 min;
c277df42 2518 I32 max = REG_INFTY;
fac92740 2519 char *parse_start;
a0d0e21e 2520
830247a4 2521 ret = regatom(pRExC_state, &flags);
a0d0e21e
LW
2522 if (ret == NULL) {
2523 if (flags & TRYAGAIN)
2524 *flagp |= TRYAGAIN;
2525 return(NULL);
2526 }
2527
830247a4 2528 op = *RExC_parse;
a0d0e21e 2529
830247a4 2530 if (op == '{' && regcurly(RExC_parse)) {
fac92740 2531 parse_start = RExC_parse; /* MJD */
830247a4 2532 next = RExC_parse + 1;
a0d0e21e
LW
2533 maxpos = Nullch;
2534 while (isDIGIT(*next) || *next == ',') {
2535 if (*next == ',') {
2536 if (maxpos)
2537 break;
2538 else
2539 maxpos = next;
a687059c 2540 }
a0d0e21e
LW
2541 next++;
2542 }
2543 if (*next == '}') { /* got one */
2544 if (!maxpos)
2545 maxpos = next;
830247a4
IZ
2546 RExC_parse++;
2547 min = atoi(RExC_parse);
a0d0e21e
LW
2548 if (*maxpos == ',')
2549 maxpos++;
2550 else
830247a4 2551 maxpos = RExC_parse;
a0d0e21e
LW
2552 max = atoi(maxpos);
2553 if (!max && *maxpos != '0')
c277df42
IZ
2554 max = REG_INFTY; /* meaning "infinity" */
2555 else if (max >= REG_INFTY)
8615cb43 2556 vFAIL2("Quantifier in {,} bigger than %d", REG_INFTY - 1);
830247a4
IZ
2557 RExC_parse = next;
2558 nextchar(pRExC_state);
a0d0e21e
LW
2559
2560 do_curly:
2561 if ((flags&SIMPLE)) {
830247a4
IZ
2562 RExC_naughty += 2 + RExC_naughty / 2;
2563 reginsert(pRExC_state, CURLY, ret);
fac92740
MJD
2564 Set_Node_Offset(ret, parse_start+1); /* MJD */
2565 Set_Node_Cur_Length(ret);
a0d0e21e
LW
2566 }
2567 else {
830247a4 2568 regnode *w = reg_node(pRExC_state, WHILEM);
2c2d71f5
JH
2569
2570 w->flags = 0;
830247a4
IZ
2571 regtail(pRExC_state, ret, w);
2572 if (!SIZE_ONLY && RExC_extralen) {
2573 reginsert(pRExC_state, LONGJMP,ret);
2574 reginsert(pRExC_state, NOTHING,ret);
c277df42
IZ
2575 NEXT_OFF(ret) = 3; /* Go over LONGJMP. */
2576 }
830247a4 2577 reginsert(pRExC_state, CURLYX,ret);
fac92740
MJD
2578 /* MJD hk */
2579 Set_Node_Offset(ret, parse_start+1);
2580 Set_Node_Length(ret,
2581 op == '{' ? (RExC_parse - parse_start) : 1);
2582
830247a4 2583 if (!SIZE_ONLY && RExC_extralen)
c277df42 2584 NEXT_OFF(ret) = 3; /* Go over NOTHING to LONGJMP. */
830247a4 2585 regtail(pRExC_state, ret, reg_node(pRExC_state, NOTHING));
c277df42 2586 if (SIZE_ONLY)
830247a4
IZ
2587 RExC_whilem_seen++, RExC_extralen += 3;
2588 RExC_naughty += 4 + RExC_naughty; /* compound interest */
a0d0e21e 2589 }
c277df42 2590 ret->flags = 0;
a0d0e21e
LW
2591
2592 if (min > 0)
821b33a5
IZ
2593 *flagp = WORST;
2594 if (max > 0)
2595 *flagp |= HASWIDTH;
a0d0e21e 2596 if (max && max < min)
8615cb43 2597 vFAIL("Can't do {n,m} with n > m");
c277df42
IZ
2598 if (!SIZE_ONLY) {
2599 ARG1_SET(ret, min);
2600 ARG2_SET(ret, max);
a687059c 2601 }
a687059c 2602
a0d0e21e 2603 goto nest_check;
a687059c 2604 }
a0d0e21e 2605 }
a687059c 2606
a0d0e21e
LW
2607 if (!ISMULT1(op)) {
2608 *flagp = flags;
a687059c 2609 return(ret);
a0d0e21e 2610 }
bb20fd44 2611
c277df42 2612#if 0 /* Now runtime fix should be reliable. */
b45f050a
JF
2613
2614 /* if this is reinstated, don't forget to put this back into perldiag:
2615
2616 =item Regexp *+ operand could be empty at {#} in regex m/%s/
2617
2618 (F) The part of the regexp subject to either the * or + quantifier
2619 could match an empty string. The {#} shows in the regular
2620 expression about where the problem was discovered.
2621
2622 */
2623
bb20fd44 2624 if (!(flags&HASWIDTH) && op != '?')
b45f050a 2625 vFAIL("Regexp *+ operand could be empty");
b81d288d 2626#endif
bb20fd44 2627
fac92740 2628 parse_start = RExC_parse;
830247a4 2629 nextchar(pRExC_state);
a0d0e21e 2630
821b33a5 2631 *flagp = (op != '+') ? (WORST|SPSTART|HASWIDTH) : (WORST|HASWIDTH);
a0d0e21e
LW
2632
2633 if (op == '*' && (flags&SIMPLE)) {
830247a4 2634 reginsert(pRExC_state, STAR, ret);
c277df42 2635 ret->flags = 0;
830247a4 2636 RExC_naughty += 4;
a0d0e21e
LW
2637 }
2638 else if (op == '*') {
2639 min = 0;
2640 goto do_curly;
a0ed51b3
LW
2641 }
2642 else if (op == '+' && (flags&SIMPLE)) {
830247a4 2643 reginsert(pRExC_state, PLUS, ret);
c277df42 2644 ret->flags = 0;
830247a4 2645 RExC_naughty += 3;
a0d0e21e
LW
2646 }
2647 else if (op == '+') {
2648 min = 1;
2649 goto do_curly;
a0ed51b3
LW
2650 }
2651 else if (op == '?') {
a0d0e21e
LW
2652 min = 0; max = 1;
2653 goto do_curly;
2654 }
2655 nest_check:
e476b1b5 2656 if (ckWARN(WARN_REGEXP) && !SIZE_ONLY && !(flags&HASWIDTH) && max > REG_INFTY/3) {
830247a4 2657 vWARN3(RExC_parse,
b45f050a 2658 "%.*s matches null string many times",
830247a4 2659 RExC_parse - origparse,
b45f050a 2660 origparse);
a0d0e21e
LW
2661 }
2662
830247a4
IZ
2663 if (*RExC_parse == '?') {
2664 nextchar(pRExC_state);
2665 reginsert(pRExC_state, MINMOD, ret);
2666 regtail(pRExC_state, ret, ret + NODE_STEP_REGNODE);
a0d0e21e 2667 }
830247a4
IZ
2668 if (ISMULT2(RExC_parse)) {
2669 RExC_parse++;
b45f050a
JF
2670 vFAIL("Nested quantifiers");
2671 }
a0d0e21e
LW
2672
2673 return(ret);
a687059c
LW
2674}
2675
2676/*
2677 - regatom - the lowest level
2678 *
2679 * Optimization: gobbles an entire sequence of ordinary characters so that
2680 * it can turn them into a single node, which is smaller to store and
2681 * faster to run. Backslashed characters are exceptions, each becoming a
2682 * separate node; the code is simpler that way and it's not worth fixing.
2683 *
b45f050a 2684 * [Yes, it is worth fixing, some scripts can run twice the speed.] */
76e3520e 2685STATIC regnode *
830247a4 2686S_regatom(pTHX_ RExC_state_t *pRExC_state, I32 *flagp)
a687059c 2687{
c277df42 2688 register regnode *ret = 0;
a0d0e21e 2689 I32 flags;
f06dbbb7 2690 char *parse_start = 0;
a0d0e21e
LW
2691
2692 *flagp = WORST; /* Tentatively. */
2693
2694tryagain:
830247a4 2695 switch (*RExC_parse) {
a0d0e21e 2696 case '^':
830247a4
IZ
2697 RExC_seen_zerolen++;
2698 nextchar(pRExC_state);
2699 if (RExC_flags16 & PMf_MULTILINE)
2700 ret = reg_node(pRExC_state, MBOL);
2701 else if (RExC_flags16 & PMf_SINGLELINE)
2702 ret = reg_node(pRExC_state, SBOL);
a0d0e21e 2703 else
830247a4 2704 ret = reg_node(pRExC_state, BOL);
fac92740 2705 Set_Node_Length(ret, 1); /* MJD */
a0d0e21e
LW
2706 break;
2707 case '$':
830247a4 2708 nextchar(pRExC_state);
b81d288d 2709 if (*RExC_parse)
830247a4
IZ
2710 RExC_seen_zerolen++;
2711 if (RExC_flags16 & PMf_MULTILINE)
2712 ret = reg_node(pRExC_state, MEOL);
2713 else if (RExC_flags16 & PMf_SINGLELINE)
2714 ret = reg_node(pRExC_state, SEOL);
a0d0e21e 2715 else
830247a4 2716 ret = reg_node(pRExC_state, EOL);
fac92740 2717 Set_Node_Length(ret, 1); /* MJD */
a0d0e21e
LW
2718 break;
2719 case '.':
830247a4 2720 nextchar(pRExC_state);
cce850e4 2721 if (RExC_flags16 & PMf_SINGLELINE)
ffc61ed2
JH
2722 ret = reg_node(pRExC_state, SANY);
2723 else
2724 ret = reg_node(pRExC_state, REG_ANY);
2725 *flagp |= HASWIDTH|SIMPLE;
830247a4 2726 RExC_naughty++;
fac92740 2727 Set_Node_Length(ret, 1); /* MJD */
a0d0e21e
LW
2728 break;
2729 case '[':
b45f050a 2730 {
830247a4 2731 char *oregcomp_parse = ++RExC_parse;
ffc61ed2 2732 ret = regclass(pRExC_state);
830247a4
IZ
2733 if (*RExC_parse != ']') {
2734 RExC_parse = oregcomp_parse;
b45f050a
JF
2735 vFAIL("Unmatched [");
2736 }
830247a4 2737 nextchar(pRExC_state);
a0d0e21e 2738 *flagp |= HASWIDTH|SIMPLE;
fac92740 2739 Set_Node_Length(ret, RExC_parse - oregcomp_parse + 1); /* MJD */
a0d0e21e 2740 break;
b45f050a 2741 }
a0d0e21e 2742 case '(':
830247a4
IZ
2743 nextchar(pRExC_state);
2744 ret = reg(pRExC_state, 1, &flags);
a0d0e21e 2745 if (ret == NULL) {
bf93d4cc 2746 if (flags & TRYAGAIN) {
830247a4 2747 if (RExC_parse == RExC_end) {
bf93d4cc
GS
2748 /* Make parent create an empty node if needed. */
2749 *flagp |= TRYAGAIN;
2750 return(NULL);
2751 }
a0d0e21e 2752 goto tryagain;
bf93d4cc 2753 }
a0d0e21e
LW
2754 return(NULL);
2755 }
c277df42 2756 *flagp |= flags&(HASWIDTH|SPSTART|SIMPLE);
a0d0e21e
LW
2757 break;
2758 case '|':
2759 case ')':
2760 if (flags & TRYAGAIN) {
2761 *flagp |= TRYAGAIN;
2762 return NULL;
2763 }
b45f050a 2764 vFAIL("Internal urp");
a0d0e21e
LW
2765 /* Supposed to be caught earlier. */
2766 break;
85afd4ae 2767 case '{':
830247a4
IZ
2768 if (!regcurly(RExC_parse)) {
2769 RExC_parse++;
85afd4ae
CS
2770 goto defchar;
2771 }
2772 /* FALL THROUGH */
a0d0e21e
LW
2773 case '?':
2774 case '+':
2775 case '*':
830247a4 2776 RExC_parse++;
b45f050a 2777 vFAIL("Quantifier follows nothing");
a0d0e21e
LW
2778 break;
2779 case '\\':
830247a4 2780 switch (*++RExC_parse) {
a0d0e21e 2781 case 'A':
830247a4
IZ
2782 RExC_seen_zerolen++;
2783 ret = reg_node(pRExC_state, SBOL);
a0d0e21e 2784 *flagp |= SIMPLE;
830247a4 2785 nextchar(pRExC_state);
fac92740 2786 Set_Node_Length(ret, 2); /* MJD */
a0d0e21e
LW
2787 break;
2788 case 'G':
830247a4
IZ
2789 ret = reg_node(pRExC_state, GPOS);
2790 RExC_seen |= REG_SEEN_GPOS;
a0d0e21e 2791 *flagp |= SIMPLE;
830247a4 2792 nextchar(pRExC_state);
fac92740 2793 Set_Node_Length(ret, 2); /* MJD */
a0d0e21e
LW
2794 break;
2795 case 'Z':
830247a4 2796 ret = reg_node(pRExC_state, SEOL);
a0d0e21e 2797 *flagp |= SIMPLE;
830247a4 2798 nextchar(pRExC_state);
a0d0e21e 2799 break;
b85d18e9 2800 case 'z':
830247a4 2801 ret = reg_node(pRExC_state, EOS);
b85d18e9 2802 *flagp |= SIMPLE;
830247a4
IZ
2803 RExC_seen_zerolen++; /* Do not optimize RE away */
2804 nextchar(pRExC_state);
fac92740 2805 Set_Node_Length(ret, 2); /* MJD */
b85d18e9 2806 break;
4a2d328f 2807 case 'C':
f33976b4
DB
2808 ret = reg_node(pRExC_state, CANY);
2809 RExC_seen |= REG_SEEN_CANY;
a0ed51b3 2810 *flagp |= HASWIDTH|SIMPLE;
830247a4 2811 nextchar(pRExC_state);
fac92740 2812 Set_Node_Length(ret, 2); /* MJD */
a0ed51b3
LW
2813 break;
2814 case 'X':
830247a4 2815 ret = reg_node(pRExC_state, CLUMP);
a0ed51b3 2816 *flagp |= HASWIDTH;
830247a4 2817 nextchar(pRExC_state);
fac92740 2818 Set_Node_Length(ret, 2); /* MJD */
a0ed51b3 2819 break;
a0d0e21e 2820 case 'w':
ffc61ed2 2821 ret = reg_node(pRExC_state, LOC ? ALNUML : ALNUM);
a0d0e21e 2822 *flagp |= HASWIDTH|SIMPLE;
830247a4 2823 nextchar(pRExC_state);
fac92740 2824 Set_Node_Length(ret, 2); /* MJD */
a0d0e21e
LW
2825 break;
2826 case 'W':
ffc61ed2 2827 ret = reg_node(pRExC_state, LOC ? NALNUML : NALNUM);
a0d0e21e 2828 *flagp |= HASWIDTH|SIMPLE;
830247a4 2829 nextchar(pRExC_state);
fac92740 2830 Set_Node_Length(ret, 2); /* MJD */
a0d0e21e
LW
2831 break;
2832 case 'b':
830247a4
IZ
2833 RExC_seen_zerolen++;
2834 RExC_seen |= REG_SEEN_LOOKBEHIND;
ffc61ed2 2835 ret = reg_node(pRExC_state, LOC ? BOUNDL : BOUND);
a0d0e21e 2836 *flagp |= SIMPLE;
830247a4 2837 nextchar(pRExC_state);
fac92740 2838 Set_Node_Length(ret, 2); /* MJD */
a0d0e21e
LW
2839 break;
2840 case 'B':
830247a4
IZ
2841 RExC_seen_zerolen++;
2842 RExC_seen |= REG_SEEN_LOOKBEHIND;
ffc61ed2 2843 ret = reg_node(pRExC_state, LOC ? NBOUNDL : NBOUND);
a0d0e21e 2844 *flagp |= SIMPLE;
830247a4 2845 nextchar(pRExC_state);
fac92740 2846 Set_Node_Length(ret, 2); /* MJD */
a0d0e21e
LW
2847 break;
2848 case 's':
ffc61ed2 2849 ret = reg_node(pRExC_state, LOC ? SPACEL : SPACE);
a0d0e21e 2850 *flagp |= HASWIDTH|SIMPLE;
830247a4 2851 nextchar(pRExC_state);
fac92740 2852 Set_Node_Length(ret, 2); /* MJD */
a0d0e21e
LW
2853 break;
2854 case 'S':
ffc61ed2 2855 ret = reg_node(pRExC_state, LOC ? NSPACEL : NSPACE);
a0d0e21e 2856 *flagp |= HASWIDTH|SIMPLE;
830247a4 2857 nextchar(pRExC_state);
fac92740 2858 Set_Node_Length(ret, 2); /* MJD */
a0d0e21e
LW
2859 break;
2860 case 'd':
ffc61ed2 2861 ret = reg_node(pRExC_state, DIGIT);
a0d0e21e 2862 *flagp |= HASWIDTH|SIMPLE;
830247a4 2863 nextchar(pRExC_state);
fac92740 2864 Set_Node_Length(ret, 2); /* MJD */
a0d0e21e
LW
2865 break;
2866 case 'D':
ffc61ed2 2867 ret = reg_node(pRExC_state, NDIGIT);
a0d0e21e 2868 *flagp |= HASWIDTH|SIMPLE;
830247a4 2869 nextchar(pRExC_state);
fac92740 2870 Set_Node_Length(ret, 2); /* MJD */
a0d0e21e 2871 break;
a14b48bc
LW
2872 case 'p':
2873 case 'P':
3568d838 2874 {
830247a4 2875 char* oldregxend = RExC_end;
fac92740 2876 char* parse_start = RExC_parse;
a14b48bc 2877
830247a4 2878 if (RExC_parse[1] == '{') {
3568d838 2879 /* a lovely hack--pretend we saw [\pX] instead */
830247a4
IZ
2880 RExC_end = strchr(RExC_parse, '}');
2881 if (!RExC_end) {
2882 RExC_parse += 2;
2883 RExC_end = oldregxend;
b45f050a
JF
2884 vFAIL("Missing right brace on \\p{}");
2885 }
830247a4 2886 RExC_end++;
a14b48bc
LW
2887 }
2888 else
830247a4
IZ
2889 RExC_end = RExC_parse + 2;
2890 RExC_parse--;
a14b48bc 2891
ffc61ed2 2892 ret = regclass(pRExC_state);
a14b48bc 2893
830247a4
IZ
2894 RExC_end = oldregxend;
2895 RExC_parse--;
fac92740 2896 Set_Node_Cur_Length(ret); /* MJD */
830247a4 2897 nextchar(pRExC_state);
a14b48bc
LW
2898 *flagp |= HASWIDTH|SIMPLE;
2899 }
2900 break;
a0d0e21e
LW
2901 case 'n':
2902 case 'r':
2903 case 't':
2904 case 'f':
2905 case 'e':
2906 case 'a':
2907 case 'x':
2908 case 'c':
2909 case '0':
2910 goto defchar;
2911 case '1': case '2': case '3': case '4':
2912 case '5': case '6': case '7': case '8': case '9':
2913 {
830247a4 2914 I32 num = atoi(RExC_parse);
a0d0e21e 2915
830247a4 2916 if (num > 9 && num >= RExC_npar)
a0d0e21e
LW
2917 goto defchar;
2918 else {
fac92740 2919 char * parse_start = RExC_parse - 1; /* MJD */
830247a4
IZ
2920 while (isDIGIT(*RExC_parse))
2921 RExC_parse++;
b45f050a 2922
830247a4 2923 if (!SIZE_ONLY && num > RExC_rx->nparens)
9baa0206 2924 vFAIL("Reference to nonexistent group");
830247a4
IZ
2925 RExC_sawback = 1;
2926 ret = reganode(pRExC_state, FOLD
a0ed51b3 2927 ? (LOC ? REFFL : REFF)
c8756f30 2928 : REF, num);
a0d0e21e 2929 *flagp |= HASWIDTH;
fac92740
MJD
2930
2931 /* override incorrect value set in reganode MJD */
2932 Set_Node_Offset(ret, parse_start+1);
2933 Set_Node_Cur_Length(ret); /* MJD */
830247a4
IZ
2934 RExC_parse--;
2935 nextchar(pRExC_state);
a0d0e21e
LW
2936 }
2937 }
2938 break;
2939 case '\0':
830247a4 2940 if (RExC_parse >= RExC_end)
b45f050a 2941 FAIL("Trailing \\");
a0d0e21e
LW
2942 /* FALL THROUGH */
2943 default:
c9f97d15
IZ
2944 /* Do not generate `unrecognized' warnings here, we fall
2945 back into the quick-grab loop below */
a0d0e21e
LW
2946 goto defchar;
2947 }
2948 break;
4633a7c4
LW
2949
2950 case '#':
830247a4
IZ
2951 if (RExC_flags16 & PMf_EXTENDED) {
2952 while (RExC_parse < RExC_end && *RExC_parse != '\n') RExC_parse++;
2953 if (RExC_parse < RExC_end)
4633a7c4
LW
2954 goto tryagain;
2955 }
2956 /* FALL THROUGH */
2957
a0d0e21e 2958 default: {
ba210ebe 2959 register STRLEN len;
a0ed51b3 2960 register UV ender;
a0d0e21e 2961 register char *p;
c277df42 2962 char *oldp, *s;
ba210ebe 2963 STRLEN numlen;
f06dbbb7
JH
2964
2965 parse_start = RExC_parse - 1;
a0d0e21e 2966
830247a4 2967 RExC_parse++;
a0d0e21e
LW
2968
2969 defchar:
830247a4 2970 ret = reg_node(pRExC_state, FOLD
a0ed51b3 2971 ? (LOC ? EXACTFL : EXACTF)
bbce6d69 2972 : EXACT);
cd439c50 2973 s = STRING(ret);
830247a4
IZ
2974 for (len = 0, p = RExC_parse - 1;
2975 len < 127 && p < RExC_end;
a0d0e21e
LW
2976 len++)
2977 {
2978 oldp = p;
5b5a24f7 2979
830247a4
IZ
2980 if (RExC_flags16 & PMf_EXTENDED)
2981 p = regwhite(p, RExC_end);
a0d0e21e
LW
2982 switch (*p) {
2983 case '^':
2984 case '$':
2985 case '.':
2986 case '[':
2987 case '(':
2988 case ')':
2989 case '|':
2990 goto loopdone;
2991 case '\\':
2992 switch (*++p) {
2993 case 'A':
2994 case 'G':
2995 case 'Z':
b85d18e9 2996 case 'z':
a0d0e21e
LW
2997 case 'w':
2998 case 'W':
2999 case 'b':
3000 case 'B':
3001 case 's':
3002 case 'S':
3003 case 'd':
3004 case 'D':
a14b48bc
LW
3005 case 'p':
3006 case 'P':
a0d0e21e
LW
3007 --p;
3008 goto loopdone;
3009 case 'n':
3010 ender = '\n';
3011 p++;
a687059c 3012 break;
a0d0e21e
LW
3013 case 'r':
3014 ender = '\r';
3015 p++;
a687059c 3016 break;
a0d0e21e
LW
3017 case 't':
3018 ender = '\t';
3019 p++;
a687059c 3020 break;
a0d0e21e
LW
3021 case 'f':
3022 ender = '\f';
3023 p++;
a687059c 3024 break;
a0d0e21e 3025 case 'e':
c7f1f016 3026 ender = ASCII_TO_NATIVE('\033');
a0d0e21e 3027 p++;
a687059c 3028 break;
a0d0e21e 3029 case 'a':
c7f1f016 3030 ender = ASCII_TO_NATIVE('\007');
a0d0e21e 3031 p++;
a687059c 3032 break;
a0d0e21e 3033 case 'x':
a0ed51b3
LW
3034 if (*++p == '{') {
3035 char* e = strchr(p, '}');
b81d288d 3036
b45f050a 3037 if (!e) {
830247a4 3038 RExC_parse = p + 1;
b45f050a
JF
3039 vFAIL("Missing right brace on \\x{}");
3040 }
de5f0749 3041 else {
53305cf1
NC
3042 I32 flags = PERL_SCAN_ALLOW_UNDERSCORES;
3043 numlen = e - p - 1;
3044 ender = grok_hex(p + 1, &numlen, &flags, NULL);
aaa80028
JH
3045 if (ender > 0xff)
3046 RExC_utf8 = 1;
b21ed0a9
GS
3047 /* numlen is generous */
3048 if (numlen + len >= 127) {
a0ed51b3
LW
3049 p--;
3050 goto loopdone;
3051 }
3052 p = e + 1;
3053 }
a0ed51b3
LW
3054 }
3055 else {
53305cf1
NC
3056 I32 flags = 0;
3057 numlen = 2;
3058 ender = grok_hex(p, &numlen, &flags, NULL);
a0ed51b3
LW
3059 p += numlen;
3060 }
a687059c 3061 break;
a0d0e21e
LW
3062 case 'c':
3063 p++;
bbce6d69 3064 ender = UCHARAT(p++);
3065 ender = toCTRL(ender);
a687059c 3066 break;
a0d0e21e
LW
3067 case '0': case '1': case '2': case '3':case '4':
3068 case '5': case '6': case '7': case '8':case '9':
3069 if (*p == '0' ||
830247a4 3070 (isDIGIT(p[1]) && atoi(p) >= RExC_npar) ) {
53305cf1
NC
3071 I32 flags = 0;
3072 numlen = 3;
3073 ender = grok_oct(p, &numlen, &flags, NULL);
a0d0e21e
LW
3074 p += numlen;
3075 }
3076 else {
3077 --p;
3078 goto loopdone;
a687059c
LW
3079 }
3080 break;
a0d0e21e 3081 case '\0':
830247a4 3082 if (p >= RExC_end)
b45f050a 3083 FAIL("Trailing \\");
a687059c 3084 /* FALL THROUGH */
a0d0e21e 3085 default:
e476b1b5 3086 if (!SIZE_ONLY && ckWARN(WARN_REGEXP) && isALPHA(*p))
b45f050a 3087 vWARN2(p +1, "Unrecognized escape \\%c passed through", *p);
a0ed51b3 3088 goto normal_default;
a0d0e21e
LW
3089 }
3090 break;
a687059c 3091 default:
a0ed51b3 3092 normal_default:
fd400ab9 3093 if (UTF8_IS_START(*p) && UTF) {
5e12f4fb 3094 ender = utf8n_to_uvchr((U8*)p, RExC_end - p,
ba210ebe 3095 &numlen, 0);
a0ed51b3
LW
3096 p += numlen;
3097 }
3098 else
3099 ender = *p++;
a0d0e21e 3100 break;
a687059c 3101 }
830247a4
IZ
3102 if (RExC_flags16 & PMf_EXTENDED)
3103 p = regwhite(p, RExC_end);
a0ed51b3
LW
3104 if (UTF && FOLD) {
3105 if (LOC)
5e12f4fb 3106 ender = toLOWER_LC_uvchr(ender);
a0ed51b3
LW
3107 else
3108 ender = toLOWER_uni(ender);
3109 }
a0d0e21e
LW
3110 if (ISMULT2(p)) { /* Back off on ?+*. */
3111 if (len)
3112 p = oldp;
2b9d42f0 3113 else if (!UNI_IS_INVARIANT(NATIVE_TO_UNI(ender)) && UTF) {
830247a4 3114 reguni(pRExC_state, ender, s, &numlen);
a0ed51b3
LW
3115 s += numlen;
3116 len += numlen;
3117 }
a0d0e21e
LW
3118 else {
3119 len++;
cd439c50 3120 REGC(ender, s++);
a0d0e21e
LW
3121 }
3122 break;
a687059c 3123 }
2b9d42f0 3124 if (!UNI_IS_INVARIANT(NATIVE_TO_UNI(ender)) && UTF) {
830247a4 3125 reguni(pRExC_state, ender, s, &numlen);
a0ed51b3
LW
3126 s += numlen;
3127 len += numlen - 1;
3128 }
3129 else
cd439c50 3130 REGC(ender, s++);
a0d0e21e
LW
3131 }
3132 loopdone:
830247a4 3133 RExC_parse = p - 1;
fac92740 3134 Set_Node_Cur_Length(ret); /* MJD */
830247a4 3135 nextchar(pRExC_state);
793db0cb
JH
3136 {
3137 /* len is STRLEN which is unsigned, need to copy to signed */
3138 IV iv = len;
3139 if (iv < 0)
3140 vFAIL("Internal disaster");
3141 }
a0d0e21e
LW
3142 if (len > 0)
3143 *flagp |= HASWIDTH;
3144 if (len == 1)
3145 *flagp |= SIMPLE;
c277df42 3146 if (!SIZE_ONLY)
cd439c50
IZ
3147 STR_LEN(ret) = len;
3148 if (SIZE_ONLY)
830247a4 3149 RExC_size += STR_SZ(len);
cd439c50 3150 else
830247a4 3151 RExC_emit += STR_SZ(len);
a687059c 3152 }
a0d0e21e
LW
3153 break;
3154 }
a687059c 3155
a0d0e21e 3156 return(ret);
a687059c
LW
3157}
3158
873ef191 3159STATIC char *
cea2e8a9 3160S_regwhite(pTHX_ char *p, char *e)
5b5a24f7
CS
3161{
3162 while (p < e) {
3163 if (isSPACE(*p))
3164 ++p;
3165 else if (*p == '#') {
3166 do {
3167 p++;
3168 } while (p < e && *p != '\n');
3169 }
3170 else
3171 break;
3172 }
3173 return p;
3174}
3175
b8c5462f
JH
3176/* Parse POSIX character classes: [[:foo:]], [[=foo=]], [[.foo.]].
3177 Character classes ([:foo:]) can also be negated ([:^foo:]).
3178 Returns a named class id (ANYOF_XXX) if successful, -1 otherwise.
3179 Equivalence classes ([=foo=]) and composites ([.foo.]) are parsed,
3180 but trigger warnings because they are currently unimplemented. */
3181STATIC I32
830247a4 3182S_regpposixcc(pTHX_ RExC_state_t *pRExC_state, I32 value)
620e46c5
JH
3183{
3184 char *posixcc = 0;
936ed897 3185 I32 namedclass = OOB_NAMEDCLASS;
620e46c5 3186
830247a4 3187 if (value == '[' && RExC_parse + 1 < RExC_end &&
620e46c5 3188 /* I smell either [: or [= or [. -- POSIX has been here, right? */
830247a4
IZ
3189 (*RExC_parse == ':' ||
3190 *RExC_parse == '=' ||
3191 *RExC_parse == '.')) {
3192 char c = *RExC_parse;
3193 char* s = RExC_parse++;
b81d288d 3194
830247a4
IZ
3195 while (RExC_parse < RExC_end && *RExC_parse != c)
3196 RExC_parse++;
3197 if (RExC_parse == RExC_end)
620e46c5 3198 /* Grandfather lone [:, [=, [. */
830247a4 3199 RExC_parse = s;
620e46c5 3200 else {
830247a4 3201 char* t = RExC_parse++; /* skip over the c */
b8c5462f 3202
830247a4
IZ
3203 if (*RExC_parse == ']') {
3204 RExC_parse++; /* skip over the ending ] */
b8c5462f
JH
3205 posixcc = s + 1;
3206 if (*s == ':') {
3207 I32 complement = *posixcc == '^' ? *posixcc++ : 0;
3208 I32 skip = 5; /* the most common skip */
3209
3210 switch (*posixcc) {
3211 case 'a':
3212 if (strnEQ(posixcc, "alnum", 5))
3213 namedclass =
3214 complement ? ANYOF_NALNUMC : ANYOF_ALNUMC;
3215 else if (strnEQ(posixcc, "alpha", 5))
3216 namedclass =
3217 complement ? ANYOF_NALPHA : ANYOF_ALPHA;
3218 else if (strnEQ(posixcc, "ascii", 5))
3219 namedclass =
3220 complement ? ANYOF_NASCII : ANYOF_ASCII;
3221 break;
aaa51d5e
JF
3222 case 'b':
3223 if (strnEQ(posixcc, "blank", 5))
3224 namedclass =
3225 complement ? ANYOF_NBLANK : ANYOF_BLANK;
3226 break;
b8c5462f
JH
3227 case 'c':
3228 if (strnEQ(posixcc, "cntrl", 5))
3229 namedclass =
3230 complement ? ANYOF_NCNTRL : ANYOF_CNTRL;
3231 break;
3232 case 'd':
3233 if (strnEQ(posixcc, "digit", 5))
3234 namedclass =
3235 complement ? ANYOF_NDIGIT : ANYOF_DIGIT;
3236 break;
3237 case 'g':
3238 if (strnEQ(posixcc, "graph", 5))
3239 namedclass =
3240 complement ? ANYOF_NGRAPH : ANYOF_GRAPH;
3241 break;
3242 case 'l':
3243 if (strnEQ(posixcc, "lower", 5))
3244 namedclass =
3245 complement ? ANYOF_NLOWER : ANYOF_LOWER;
3246 break;
3247 case 'p':
3248 if (strnEQ(posixcc, "print", 5))
3249 namedclass =
3250 complement ? ANYOF_NPRINT : ANYOF_PRINT;
3251 else if (strnEQ(posixcc, "punct", 5))
3252 namedclass =
3253 complement ? ANYOF_NPUNCT : ANYOF_PUNCT;
3254 break;
3255 case 's':
3256 if (strnEQ(posixcc, "space", 5))
3257 namedclass =
aaa51d5e 3258 complement ? ANYOF_NPSXSPC : ANYOF_PSXSPC;
cc4319de 3259 break;
b8c5462f
JH
3260 case 'u':
3261 if (strnEQ(posixcc, "upper", 5))
3262 namedclass =
3263 complement ? ANYOF_NUPPER : ANYOF_UPPER;
3264 break;
3265 case 'w': /* this is not POSIX, this is the Perl \w */
3266 if (strnEQ(posixcc, "word", 4)) {
3267 namedclass =
3268 complement ? ANYOF_NALNUM : ANYOF_ALNUM;
3269 skip = 4;
3270 }
3271 break;
3272 case 'x':
3273 if (strnEQ(posixcc, "xdigit", 6)) {
3274 namedclass =
3275 complement ? ANYOF_NXDIGIT : ANYOF_XDIGIT;
3276 skip = 6;
3277 }
3278 break;
3279 }
ac561586
JH
3280 if (namedclass == OOB_NAMEDCLASS ||
3281 posixcc[skip] != ':' ||
3282 posixcc[skip+1] != ']')
b45f050a
JF
3283 {
3284 Simple_vFAIL3("POSIX class [:%.*s:] unknown",
3285 t - s - 1, s + 1);
3286 }
3287 } else if (!SIZE_ONLY) {
b8c5462f 3288 /* [[=foo=]] and [[.foo.]] are still future. */
b45f050a 3289
830247a4 3290 /* adjust RExC_parse so the warning shows after
b45f050a 3291 the class closes */
830247a4
IZ
3292 while (*RExC_parse && *RExC_parse != ']')
3293 RExC_parse++;
b45f050a
JF
3294 Simple_vFAIL3("POSIX syntax [%c %c] is reserved for future extensions", c, c);
3295 }
b8c5462f
JH
3296 } else {
3297 /* Maternal grandfather:
3298 * "[:" ending in ":" but not in ":]" */
830247a4 3299 RExC_parse = s;
767d463e 3300 }
620e46c5
JH
3301 }
3302 }
3303
b8c5462f
JH
3304 return namedclass;
3305}
3306
3307STATIC void
830247a4 3308S_checkposixcc(pTHX_ RExC_state_t *pRExC_state)
b8c5462f 3309{
e476b1b5 3310 if (!SIZE_ONLY && ckWARN(WARN_REGEXP) &&
830247a4
IZ
3311 (*RExC_parse == ':' ||
3312 *RExC_parse == '=' ||
3313 *RExC_parse == '.')) {
3314 char *s = RExC_parse;
93733859 3315 char c = *s++;
b8c5462f
JH
3316
3317 while(*s && isALNUM(*s))
3318 s++;
3319 if (*s && c == *s && s[1] == ']') {
b45f050a
JF
3320 vWARN3(s+2, "POSIX syntax [%c %c] belongs inside character classes", c, c);
3321
3322 /* [[=foo=]] and [[.foo.]] are still future. */
b8c5462f 3323 if (c == '=' || c == '.')
b45f050a 3324 {
830247a4 3325 /* adjust RExC_parse so the error shows after
b45f050a 3326 the class closes */
830247a4 3327 while (*RExC_parse && *RExC_parse++ != ']')
b45f050a
JF
3328 ;
3329 Simple_vFAIL3("POSIX syntax [%c %c] is reserved for future extensions", c, c);
3330 }
b8c5462f
JH
3331 }
3332 }
620e46c5
JH
3333}
3334
76e3520e 3335STATIC regnode *
830247a4 3336S_regclass(pTHX_ RExC_state_t *pRExC_state)
a687059c 3337{
ffc61ed2 3338 register UV value;
3568d838 3339 register IV prevvalue = OOB_UNICODE;
ffc61ed2 3340 register IV range = 0;
c277df42 3341 register regnode *ret;
ba210ebe 3342 STRLEN numlen;
ffc61ed2 3343 IV namedclass;
9c5ffd7c 3344 char *rangebegin = 0;
936ed897 3345 bool need_class = 0;
9c5ffd7c 3346 SV *listsv = Nullsv;
ffc61ed2
JH
3347 register char *e;
3348 UV n;
3568d838 3349 bool optimize_invert = TRUE;
ffc61ed2
JH
3350
3351 ret = reganode(pRExC_state, ANYOF, 0);
3352
3353 if (!SIZE_ONLY)
3354 ANYOF_FLAGS(ret) = 0;
3355
3356 if (*RExC_parse == '^') { /* Complement of range. */
3357 RExC_naughty++;
3358 RExC_parse++;
3359 if (!SIZE_ONLY)
3360 ANYOF_FLAGS(ret) |= ANYOF_INVERT;
3361 }
a0d0e21e 3362
936ed897 3363 if (SIZE_ONLY)
830247a4 3364 RExC_size += ANYOF_SKIP;
936ed897 3365 else {
830247a4 3366 RExC_emit += ANYOF_SKIP;
936ed897
IZ
3367 if (FOLD)
3368 ANYOF_FLAGS(ret) |= ANYOF_FOLD;
3369 if (LOC)
3370 ANYOF_FLAGS(ret) |= ANYOF_LOCALE;
ffc61ed2
JH
3371 ANYOF_BITMAP_ZERO(ret);
3372 listsv = newSVpvn("# comment\n", 10);
a0d0e21e 3373 }
b8c5462f 3374
e476b1b5 3375 if (!SIZE_ONLY && ckWARN(WARN_REGEXP))
830247a4 3376 checkposixcc(pRExC_state);
b8c5462f 3377
830247a4 3378 if (*RExC_parse == ']' || *RExC_parse == '-')
ffc61ed2
JH
3379 goto charclassloop; /* allow 1st char to be ] or - */
3380
830247a4 3381 while (RExC_parse < RExC_end && *RExC_parse != ']') {
ffc61ed2
JH
3382
3383 charclassloop:
3384
3385 namedclass = OOB_NAMEDCLASS; /* initialize as illegal */
3386
73b437c8 3387 if (!range)
830247a4 3388 rangebegin = RExC_parse;
ffc61ed2 3389 if (UTF) {
5e12f4fb 3390 value = utf8n_to_uvchr((U8*)RExC_parse,
3568d838
JH
3391 RExC_end - RExC_parse,
3392 &numlen, 0);
ffc61ed2
JH
3393 RExC_parse += numlen;
3394 }
3395 else
3396 value = UCHARAT(RExC_parse++);
620e46c5 3397 if (value == '[')
830247a4 3398 namedclass = regpposixcc(pRExC_state, value);
620e46c5 3399 else if (value == '\\') {
ffc61ed2 3400 if (UTF) {
5e12f4fb 3401 value = utf8n_to_uvchr((U8*)RExC_parse,
ffc61ed2
JH
3402 RExC_end - RExC_parse,
3403 &numlen, 0);
3404 RExC_parse += numlen;
3405 }
3406 else
3407 value = UCHARAT(RExC_parse++);
470c3474 3408 /* Some compilers cannot handle switching on 64-bit integer
ffc61ed2 3409 * values, therefore value cannot be an UV. Yes, this will
e2962f66
JH
3410 * be a problem later if we want switch on Unicode.
3411 * A similar issue a little bit later when switching on
3412 * namedclass. --jhi */
ffc61ed2 3413 switch ((I32)value) {
b8c5462f
JH
3414 case 'w': namedclass = ANYOF_ALNUM; break;
3415 case 'W': namedclass = ANYOF_NALNUM; break;
3416 case 's': namedclass = ANYOF_SPACE; break;
3417 case 'S': namedclass = ANYOF_NSPACE; break;
3418 case 'd': namedclass = ANYOF_DIGIT; break;
3419 case 'D': namedclass = ANYOF_NDIGIT; break;
ffc61ed2
JH
3420 case 'p':
3421 case 'P':
3422 if (*RExC_parse == '{') {
3423 e = strchr(RExC_parse++, '}');
3424 if (!e)
3425 vFAIL("Missing right brace on \\p{}");
3426 n = e - RExC_parse;
3427 }
3428 else {
3429 e = RExC_parse;
3430 n = 1;
3431 }
3432 if (!SIZE_ONLY) {
3433 if (value == 'p')
3434 Perl_sv_catpvf(aTHX_ listsv,
3435 "+utf8::%.*s\n", (int)n, RExC_parse);
3436 else
3437 Perl_sv_catpvf(aTHX_ listsv,
3438 "!utf8::%.*s\n", (int)n, RExC_parse);
3439 }
3440 RExC_parse = e + 1;
3441 ANYOF_FLAGS(ret) |= ANYOF_UNICODE;
3442 continue;
b8c5462f
JH
3443 case 'n': value = '\n'; break;
3444 case 'r': value = '\r'; break;
3445 case 't': value = '\t'; break;
3446 case 'f': value = '\f'; break;
3447 case 'b': value = '\b'; break;
c7f1f016
NIS
3448 case 'e': value = ASCII_TO_NATIVE('\033');break;
3449 case 'a': value = ASCII_TO_NATIVE('\007');break;
b8c5462f 3450 case 'x':
ffc61ed2 3451 if (*RExC_parse == '{') {
53305cf1 3452 I32 flags = PERL_SCAN_ALLOW_UNDERSCORES;
ffc61ed2 3453 e = strchr(RExC_parse++, '}');
b81d288d 3454 if (!e)
ffc61ed2 3455 vFAIL("Missing right brace on \\x{}");
53305cf1
NC
3456
3457 numlen = e - RExC_parse;
3458 value = grok_hex(RExC_parse, &numlen, &flags, NULL);
ffc61ed2
JH
3459 RExC_parse = e + 1;
3460 }
3461 else {
53305cf1
NC
3462 I32 flags = 0;
3463 numlen = 2;
3464 value = grok_hex(RExC_parse, &numlen, &flags, NULL);
ffc61ed2
JH
3465 RExC_parse += numlen;
3466 }
b8c5462f
JH
3467 break;
3468 case 'c':
830247a4 3469 value = UCHARAT(RExC_parse++);
b8c5462f
JH
3470 value = toCTRL(value);
3471 break;
3472 case '0': case '1': case '2': case '3': case '4':
3473 case '5': case '6': case '7': case '8': case '9':
53305cf1
NC
3474 {
3475 I32 flags = 0;
3476 numlen = 3;
3477 value = grok_oct(--RExC_parse, &numlen, &flags, NULL);
830247a4 3478 RExC_parse += numlen;
b8c5462f 3479 break;
53305cf1 3480 }
1028017a 3481 default:
e476b1b5 3482 if (!SIZE_ONLY && ckWARN(WARN_REGEXP) && isALPHA(value))
ffc61ed2
JH
3483 vWARN2(RExC_parse,
3484 "Unrecognized escape \\%c in character class passed through",
3485 (int)value);
1028017a 3486 break;
b8c5462f 3487 }
ffc61ed2
JH
3488 } /* end of \blah */
3489
3490 if (namedclass > OOB_NAMEDCLASS) { /* this is a named class \blah */
3491
3492 if (!SIZE_ONLY && !need_class)
936ed897 3493 ANYOF_CLASS_ZERO(ret);
ffc61ed2 3494
936ed897 3495 need_class = 1;
ffc61ed2
JH
3496
3497 /* a bad range like a-\d, a-[:digit:] ? */
3498 if (range) {
73b437c8 3499 if (!SIZE_ONLY) {
e476b1b5 3500 if (ckWARN(WARN_REGEXP))
830247a4 3501 vWARN4(RExC_parse,
b45f050a 3502 "False [] range \"%*.*s\"",
830247a4
IZ
3503 RExC_parse - rangebegin,
3504 RExC_parse - rangebegin,
b45f050a 3505 rangebegin);
3568d838
JH
3506 if (prevvalue < 256) {
3507 ANYOF_BITMAP_SET(ret, prevvalue);
ffc61ed2
JH
3508 ANYOF_BITMAP_SET(ret, '-');
3509 }
3510 else {
3511 ANYOF_FLAGS(ret) |= ANYOF_UNICODE;
3512 Perl_sv_catpvf(aTHX_ listsv,
3568d838 3513 "%04"UVxf"\n%04"UVxf"\n", (UV)prevvalue, (UV) '-');
ffc61ed2 3514 }
b8c5462f 3515 }
ffc61ed2
JH
3516
3517 range = 0; /* this was not a true range */
73b437c8 3518 }
ffc61ed2 3519
73b437c8 3520 if (!SIZE_ONLY) {
3568d838
JH
3521 if (namedclass > OOB_NAMEDCLASS)
3522 optimize_invert = FALSE;
e2962f66
JH
3523 /* Possible truncation here but in some 64-bit environments
3524 * the compiler gets heartburn about switch on 64-bit values.
3525 * A similar issue a little earlier when switching on value.
98f323fa 3526 * --jhi */
e2962f66 3527 switch ((I32)namedclass) {
73b437c8
JH
3528 case ANYOF_ALNUM:
3529 if (LOC)
936ed897 3530 ANYOF_CLASS_SET(ret, ANYOF_ALNUM);
73b437c8
JH
3531 else {
3532 for (value = 0; value < 256; value++)
3533 if (isALNUM(value))
936ed897 3534 ANYOF_BITMAP_SET(ret, value);
73b437c8 3535 }
ffc61ed2 3536 Perl_sv_catpvf(aTHX_ listsv, "+utf8::IsWord\n");
73b437c8
JH
3537 break;
3538 case ANYOF_NALNUM:
3539 if (LOC)
936ed897 3540 ANYOF_CLASS_SET(ret, ANYOF_NALNUM);
73b437c8
JH
3541 else {
3542 for (value = 0; value < 256; value++)
3543 if (!isALNUM(value))
936ed897 3544 ANYOF_BITMAP_SET(ret, value);
73b437c8 3545 }
ffc61ed2 3546 Perl_sv_catpvf(aTHX_ listsv, "!utf8::IsWord\n");
73b437c8 3547 break;
ffc61ed2 3548 case ANYOF_ALNUMC:
73b437c8 3549 if (LOC)
ffc61ed2 3550 ANYOF_CLASS_SET(ret, ANYOF_ALNUMC);
73b437c8
JH
3551 else {
3552 for (value = 0; value < 256; value++)
ffc61ed2 3553 if (isALNUMC(value))
936ed897 3554 ANYOF_BITMAP_SET(ret, value);
73b437c8 3555 }
ffc61ed2 3556 Perl_sv_catpvf(aTHX_ listsv, "+utf8::IsAlnum\n");
73b437c8
JH
3557 break;
3558 case ANYOF_NALNUMC:
3559 if (LOC)
936ed897 3560 ANYOF_CLASS_SET(ret, ANYOF_NALNUMC);
73b437c8
JH
3561 else {
3562 for (value = 0; value < 256; value++)
3563 if (!isALNUMC(value))
936ed897 3564 ANYOF_BITMAP_SET(ret, value);
73b437c8 3565 }
ffc61ed2 3566 Perl_sv_catpvf(aTHX_ listsv, "!utf8::IsAlnum\n");
73b437c8
JH
3567 break;
3568 case ANYOF_ALPHA:
3569 if (LOC)
936ed897 3570 ANYOF_CLASS_SET(ret, ANYOF_ALPHA);
73b437c8
JH
3571 else {
3572 for (value = 0; value < 256; value++)
3573 if (isALPHA(value))
936ed897 3574 ANYOF_BITMAP_SET(ret, value);
73b437c8 3575 }
ffc61ed2 3576 Perl_sv_catpvf(aTHX_ listsv, "+utf8::IsAlpha\n");
73b437c8
JH
3577 break;
3578 case ANYOF_NALPHA:
3579 if (LOC)
936ed897 3580 ANYOF_CLASS_SET(ret, ANYOF_NALPHA);
73b437c8
JH
3581 else {
3582 for (value = 0; value < 256; value++)
3583 if (!isALPHA(value))
936ed897 3584 ANYOF_BITMAP_SET(ret, value);
73b437c8 3585 }
ffc61ed2 3586 Perl_sv_catpvf(aTHX_ listsv, "!utf8::IsAlpha\n");
73b437c8
JH
3587 break;
3588 case ANYOF_ASCII:
3589 if (LOC)
936ed897 3590 ANYOF_CLASS_SET(ret, ANYOF_ASCII);
73b437c8 3591 else {
c7f1f016 3592#ifndef EBCDIC
1ba5c669
JH
3593 for (value = 0; value < 128; value++)
3594 ANYOF_BITMAP_SET(ret, value);
3595#else /* EBCDIC */
ffbc6a93 3596 for (value = 0; value < 256; value++) {
3a3c4447
JH
3597 if (isASCII(value))
3598 ANYOF_BITMAP_SET(ret, value);
ffbc6a93 3599 }
1ba5c669 3600#endif /* EBCDIC */
73b437c8 3601 }
ffc61ed2 3602 Perl_sv_catpvf(aTHX_ listsv, "+utf8::IsASCII\n");
73b437c8
JH
3603 break;
3604 case ANYOF_NASCII:
3605 if (LOC)
936ed897 3606 ANYOF_CLASS_SET(ret, ANYOF_NASCII);
73b437c8 3607 else {
c7f1f016 3608#ifndef EBCDIC
1ba5c669
JH
3609 for (value = 128; value < 256; value++)
3610 ANYOF_BITMAP_SET(ret, value);
3611#else /* EBCDIC */
ffbc6a93 3612 for (value = 0; value < 256; value++) {
3a3c4447
JH
3613 if (!isASCII(value))
3614 ANYOF_BITMAP_SET(ret, value);
ffbc6a93 3615 }
1ba5c669 3616#endif /* EBCDIC */
73b437c8 3617 }
ffc61ed2 3618 Perl_sv_catpvf(aTHX_ listsv, "!utf8::IsASCII\n");
73b437c8 3619 break;
aaa51d5e
JF
3620 case ANYOF_BLANK:
3621 if (LOC)
3622 ANYOF_CLASS_SET(ret, ANYOF_BLANK);
3623 else {
3624 for (value = 0; value < 256; value++)
3625 if (isBLANK(value))
3626 ANYOF_BITMAP_SET(ret, value);
3627 }
ffc61ed2 3628 Perl_sv_catpvf(aTHX_ listsv, "+utf8::IsBlank\n");
aaa51d5e
JF
3629 break;
3630 case ANYOF_NBLANK:
3631 if (LOC)
3632 ANYOF_CLASS_SET(ret, ANYOF_NBLANK);
3633 else {
3634 for (value = 0; value < 256; value++)
3635 if (!isBLANK(value))
3636 ANYOF_BITMAP_SET(ret, value);
3637 }
ffc61ed2 3638 Perl_sv_catpvf(aTHX_ listsv, "!utf8::IsBlank\n");
aaa51d5e 3639 break;
73b437c8
JH
3640 case ANYOF_CNTRL:
3641 if (LOC)
936ed897 3642 ANYOF_CLASS_SET(ret, ANYOF_CNTRL);
73b437c8
JH
3643 else {
3644 for (value = 0; value < 256; value++)
3645 if (isCNTRL(value))
936ed897 3646 ANYOF_BITMAP_SET(ret, value);
73b437c8 3647 }
ffc61ed2 3648 Perl_sv_catpvf(aTHX_ listsv, "+utf8::IsCntrl\n");
73b437c8
JH
3649 break;
3650 case ANYOF_NCNTRL:
3651 if (LOC)
936ed897 3652 ANYOF_CLASS_SET(ret, ANYOF_NCNTRL);
73b437c8
JH
3653 else {
3654 for (value = 0; value < 256; value++)
3655 if (!isCNTRL(value))
936ed897 3656 ANYOF_BITMAP_SET(ret, value);
73b437c8 3657 }
ffc61ed2
JH
3658 Perl_sv_catpvf(aTHX_ listsv, "!utf8::IsCntrl\n");
3659 break;
3660 case ANYOF_DIGIT:
3661 if (LOC)
3662 ANYOF_CLASS_SET(ret, ANYOF_DIGIT);
3663 else {
3664 /* consecutive digits assumed */
3665 for (value = '0'; value <= '9'; value++)
3666 ANYOF_BITMAP_SET(ret, value);
3667 }
3668 Perl_sv_catpvf(aTHX_ listsv, "+utf8::IsDigit\n");
3669 break;
3670 case ANYOF_NDIGIT:
3671 if (LOC)
3672 ANYOF_CLASS_SET(ret, ANYOF_NDIGIT);
3673 else {
3674 /* consecutive digits assumed */
3675 for (value = 0; value < '0'; value++)
3676 ANYOF_BITMAP_SET(ret, value);
3677 for (value = '9' + 1; value < 256; value++)
3678 ANYOF_BITMAP_SET(ret, value);
3679 }
3680 Perl_sv_catpvf(aTHX_ listsv, "!utf8::IsDigit\n");
73b437c8
JH
3681 break;
3682 case ANYOF_GRAPH:
3683 if (LOC)
936ed897 3684 ANYOF_CLASS_SET(ret, ANYOF_GRAPH);
73b437c8
JH
3685 else {
3686 for (value = 0; value < 256; value++)
3687 if (isGRAPH(value))
936ed897 3688 ANYOF_BITMAP_SET(ret, value);
73b437c8 3689 }
ffc61ed2 3690 Perl_sv_catpvf(aTHX_ listsv, "+utf8::IsGraph\n");
73b437c8
JH
3691 break;
3692 case ANYOF_NGRAPH:
3693 if (LOC)
936ed897 3694 ANYOF_CLASS_SET(ret, ANYOF_NGRAPH);
73b437c8
JH
3695 else {
3696 for (value = 0; value < 256; value++)
3697 if (!isGRAPH(value))
936ed897 3698 ANYOF_BITMAP_SET(ret, value);
73b437c8 3699 }
ffc61ed2 3700 Perl_sv_catpvf(aTHX_ listsv, "!utf8::IsGraph\n");
73b437c8
JH
3701 break;
3702 case ANYOF_LOWER:
3703 if (LOC)
936ed897 3704 ANYOF_CLASS_SET(ret, ANYOF_LOWER);
73b437c8
JH
3705 else {
3706 for (value = 0; value < 256; value++)
3707 if (isLOWER(value))
936ed897 3708 ANYOF_BITMAP_SET(ret, value);
73b437c8 3709 }
ffc61ed2 3710 Perl_sv_catpvf(aTHX_ listsv, "+utf8::IsLower\n");
73b437c8
JH
3711 break;
3712 case ANYOF_NLOWER:
3713 if (LOC)
936ed897 3714 ANYOF_CLASS_SET(ret, ANYOF_NLOWER);
73b437c8
JH
3715 else {
3716 for (value = 0; value < 256; value++)
3717 if (!isLOWER(value))
936ed897 3718 ANYOF_BITMAP_SET(ret, value);
73b437c8 3719 }
ffc61ed2 3720 Perl_sv_catpvf(aTHX_ listsv, "!utf8::IsLower\n");
73b437c8
JH
3721 break;
3722 case ANYOF_PRINT:
3723 if (LOC)
936ed897 3724 ANYOF_CLASS_SET(ret, ANYOF_PRINT);
73b437c8
JH
3725 else {
3726 for (value = 0; value < 256; value++)
3727 if (isPRINT(value))
936ed897 3728 ANYOF_BITMAP_SET(ret, value);
73b437c8 3729 }
ffc61ed2 3730 Perl_sv_catpvf(aTHX_ listsv, "+utf8::IsPrint\n");
73b437c8
JH
3731 break;
3732 case ANYOF_NPRINT:
3733 if (LOC)
936ed897 3734 ANYOF_CLASS_SET(ret, ANYOF_NPRINT);
73b437c8
JH
3735 else {
3736 for (value = 0; value < 256; value++)
3737 if (!isPRINT(value))
936ed897 3738 ANYOF_BITMAP_SET(ret, value);
73b437c8 3739 }
ffc61ed2 3740 Perl_sv_catpvf(aTHX_ listsv, "!utf8::IsPrint\n");
73b437c8 3741 break;
aaa51d5e
JF
3742 case ANYOF_PSXSPC:
3743 if (LOC)
3744 ANYOF_CLASS_SET(ret, ANYOF_PSXSPC);
3745 else {
3746 for (value = 0; value < 256; value++)
3747 if (isPSXSPC(value))
3748 ANYOF_BITMAP_SET(ret, value);
3749 }
ffc61ed2 3750 Perl_sv_catpvf(aTHX_ listsv, "+utf8::IsSpace\n");
aaa51d5e
JF
3751 break;
3752 case ANYOF_NPSXSPC:
3753 if (LOC)
3754 ANYOF_CLASS_SET(ret, ANYOF_NPSXSPC);
3755 else {
3756 for (value = 0; value < 256; value++)
3757 if (!isPSXSPC(value))
3758 ANYOF_BITMAP_SET(ret, value);
3759 }
ffc61ed2 3760 Perl_sv_catpvf(aTHX_ listsv, "!utf8::IsSpace\n");
aaa51d5e 3761 break;
73b437c8
JH
3762 case ANYOF_PUNCT:
3763 if (LOC)
936ed897 3764 ANYOF_CLASS_SET(ret, ANYOF_PUNCT);
73b437c8
JH
3765 else {
3766 for (value = 0; value < 256; value++)
3767 if (isPUNCT(value))
936ed897 3768 ANYOF_BITMAP_SET(ret, value);
73b437c8 3769 }
ffc61ed2 3770 Perl_sv_catpvf(aTHX_ listsv, "+utf8::IsPunct\n");
73b437c8
JH
3771 break;
3772 case ANYOF_NPUNCT:
3773 if (LOC)
936ed897 3774 ANYOF_CLASS_SET(ret, ANYOF_NPUNCT);
73b437c8
JH
3775 else {
3776 for (value = 0; value < 256; value++)
3777 if (!isPUNCT(value))
936ed897 3778 ANYOF_BITMAP_SET(ret, value);
73b437c8 3779 }
ffc61ed2
JH
3780 Perl_sv_catpvf(aTHX_ listsv, "!utf8::IsPunct\n");
3781 break;
3782 case ANYOF_SPACE:
3783 if (LOC)
3784 ANYOF_CLASS_SET(ret, ANYOF_SPACE);
3785 else {
3786 for (value = 0; value < 256; value++)
3787 if (isSPACE(value))
3788 ANYOF_BITMAP_SET(ret, value);
3789 }
3790 Perl_sv_catpvf(aTHX_ listsv, "+utf8::IsSpacePerl\n");
3791 break;
3792 case ANYOF_NSPACE:
3793 if (LOC)
3794 ANYOF_CLASS_SET(ret, ANYOF_NSPACE);
3795 else {
3796 for (value = 0; value < 256; value++)
3797 if (!isSPACE(value))
3798 ANYOF_BITMAP_SET(ret, value);
3799 }
3800 Perl_sv_catpvf(aTHX_ listsv, "!utf8::IsSpacePerl\n");
73b437c8
JH
3801 break;
3802 case ANYOF_UPPER:
3803 if (LOC)
936ed897 3804 ANYOF_CLASS_SET(ret, ANYOF_UPPER);
73b437c8
JH
3805 else {
3806 for (value = 0; value < 256; value++)
3807 if (isUPPER(value))
936ed897 3808 ANYOF_BITMAP_SET(ret, value);
73b437c8 3809 }
ffc61ed2 3810 Perl_sv_catpvf(aTHX_ listsv, "+utf8::IsUpper\n");
73b437c8
JH
3811 break;
3812 case ANYOF_NUPPER:
3813 if (LOC)
936ed897 3814 ANYOF_CLASS_SET(ret, ANYOF_NUPPER);
73b437c8
JH
3815 else {
3816 for (value = 0; value < 256; value++)
3817 if (!isUPPER(value))
936ed897 3818 ANYOF_BITMAP_SET(ret, value);
73b437c8 3819 }
ffc61ed2 3820 Perl_sv_catpvf(aTHX_ listsv, "!utf8::IsUpper\n");
73b437c8
JH
3821 break;
3822 case ANYOF_XDIGIT:
3823 if (LOC)
936ed897 3824 ANYOF_CLASS_SET(ret, ANYOF_XDIGIT);
73b437c8
JH
3825 else {
3826 for (value = 0; value < 256; value++)
3827 if (isXDIGIT(value))
936ed897 3828 ANYOF_BITMAP_SET(ret, value);
73b437c8 3829 }
ffc61ed2 3830 Perl_sv_catpvf(aTHX_ listsv, "+utf8::IsXDigit\n");
73b437c8
JH
3831 break;
3832 case ANYOF_NXDIGIT:
3833 if (LOC)
936ed897 3834 ANYOF_CLASS_SET(ret, ANYOF_NXDIGIT);
73b437c8
JH
3835 else {
3836 for (value = 0; value < 256; value++)
3837 if (!isXDIGIT(value))
936ed897 3838 ANYOF_BITMAP_SET(ret, value);
73b437c8 3839 }
ffc61ed2 3840 Perl_sv_catpvf(aTHX_ listsv, "!utf8::IsXDigit\n");
73b437c8
JH
3841 break;
3842 default:
b45f050a 3843 vFAIL("Invalid [::] class");
73b437c8 3844 break;
b8c5462f 3845 }
b8c5462f 3846 if (LOC)
936ed897 3847 ANYOF_FLAGS(ret) |= ANYOF_CLASS;
73b437c8 3848 continue;
a0d0e21e 3849 }
ffc61ed2
JH
3850 } /* end of namedclass \blah */
3851
a0d0e21e 3852 if (range) {
3a3c4447 3853 if (prevvalue > value) /* b-a */ {
b45f050a 3854 Simple_vFAIL4("Invalid [] range \"%*.*s\"",
830247a4
IZ
3855 RExC_parse - rangebegin,
3856 RExC_parse - rangebegin,
b45f050a 3857 rangebegin);
3568d838 3858 range = 0; /* not a valid range */
73b437c8 3859 }
a0d0e21e
LW
3860 }
3861 else {
3568d838 3862 prevvalue = value; /* save the beginning of the range */
830247a4
IZ
3863 if (*RExC_parse == '-' && RExC_parse+1 < RExC_end &&
3864 RExC_parse[1] != ']') {
3865 RExC_parse++;
ffc61ed2
JH
3866
3867 /* a bad range like \w-, [:word:]- ? */
3868 if (namedclass > OOB_NAMEDCLASS) {
e476b1b5 3869 if (ckWARN(WARN_REGEXP))
830247a4 3870 vWARN4(RExC_parse,
b45f050a 3871 "False [] range \"%*.*s\"",
830247a4
IZ
3872 RExC_parse - rangebegin,
3873 RExC_parse - rangebegin,
b45f050a 3874 rangebegin);
73b437c8 3875 if (!SIZE_ONLY)
936ed897 3876 ANYOF_BITMAP_SET(ret, '-');
73b437c8 3877 } else
ffc61ed2
JH
3878 range = 1; /* yeah, it's a range! */
3879 continue; /* but do it the next time */
a0d0e21e 3880 }
a687059c 3881 }
ffc61ed2 3882
93733859 3883 /* now is the next time */
ae5c130c 3884 if (!SIZE_ONLY) {
3568d838
JH
3885 IV i;
3886
3887 if (prevvalue < 256) {
3888 IV ceilvalue = value < 256 ? value : 255;
3889
3890#ifdef EBCDIC
3a3c4447
JH
3891 if ((isLOWER(prevvalue) && isLOWER(ceilvalue)) ||
3892 (isUPPER(prevvalue) && isUPPER(ceilvalue)))
ffc61ed2 3893 {
3568d838
JH
3894 if (isLOWER(prevvalue)) {
3895 for (i = prevvalue; i <= ceilvalue; i++)
ffc61ed2
JH
3896 if (isLOWER(i))
3897 ANYOF_BITMAP_SET(ret, i);
3898 } else {
3568d838 3899 for (i = prevvalue; i <= ceilvalue; i++)
ffc61ed2
JH
3900 if (isUPPER(i))
3901 ANYOF_BITMAP_SET(ret, i);
3902 }
8ada0baa 3903 }
ffc61ed2 3904 else
8ada0baa 3905#endif
3568d838
JH
3906 for (i = prevvalue; i <= ceilvalue; i++)
3907 ANYOF_BITMAP_SET(ret, i);
3908 }
3909 if (value > 255) {
ffc61ed2 3910 ANYOF_FLAGS(ret) |= ANYOF_UNICODE;
3568d838 3911 if (prevvalue < value)
ffc61ed2 3912 Perl_sv_catpvf(aTHX_ listsv, "%04"UVxf"\t%04"UVxf"\n",
3568d838
JH
3913 (UV)prevvalue, (UV)value);
3914 else if (prevvalue == value)
ffc61ed2
JH
3915 Perl_sv_catpvf(aTHX_ listsv, "%04"UVxf"\n",
3916 (UV)value);
3917 }
8ada0baa 3918 }
ffc61ed2
JH
3919
3920 range = 0; /* this range (if it was one) is done now */
a0d0e21e 3921 }
ffc61ed2 3922
936ed897 3923 if (need_class) {
4f66b38d 3924 ANYOF_FLAGS(ret) |= ANYOF_LARGE;
936ed897 3925 if (SIZE_ONLY)
830247a4 3926 RExC_size += ANYOF_CLASS_ADD_SKIP;
936ed897 3927 else
830247a4 3928 RExC_emit += ANYOF_CLASS_ADD_SKIP;
936ed897 3929 }
ffc61ed2 3930
ae5c130c 3931 /* optimize case-insensitive simple patterns (e.g. /[a-z]/i) */
b8c5462f 3932 if (!SIZE_ONLY &&
ffc61ed2 3933 /* If the only flag is folding (plus possibly inversion). */
516a5887
JH
3934 ((ANYOF_FLAGS(ret) & (ANYOF_FLAGS_ALL ^ ANYOF_INVERT)) == ANYOF_FOLD)
3935 ) {
a0ed51b3 3936 for (value = 0; value < 256; ++value) {
936ed897 3937 if (ANYOF_BITMAP_TEST(ret, value)) {
ffc61ed2
JH
3938 IV fold = PL_fold[value];
3939
3940 if (fold != value)
3941 ANYOF_BITMAP_SET(ret, fold);
ae5c130c
GS
3942 }
3943 }
936ed897 3944 ANYOF_FLAGS(ret) &= ~ANYOF_FOLD;
ae5c130c 3945 }
ffc61ed2 3946
ae5c130c 3947 /* optimize inverted simple patterns (e.g. [^a-z]) */
3568d838 3948 if (!SIZE_ONLY && optimize_invert &&
ffc61ed2
JH
3949 /* If the only flag is inversion. */
3950 (ANYOF_FLAGS(ret) & ANYOF_FLAGS_ALL) == ANYOF_INVERT) {
b8c5462f 3951 for (value = 0; value < ANYOF_BITMAP_SIZE; ++value)
936ed897 3952 ANYOF_BITMAP(ret)[value] ^= ANYOF_FLAGS_ALL;
1aa99e6b 3953 ANYOF_FLAGS(ret) = ANYOF_UNICODE_ALL;
ae5c130c 3954 }
a0d0e21e 3955
b81d288d 3956 if (!SIZE_ONLY) {
fde631ed 3957 AV *av = newAV();
ffc61ed2
JH
3958 SV *rv;
3959
3960 av_store(av, 0, listsv);
3961 av_store(av, 1, NULL);
3962 rv = newRV_noinc((SV*)av);
19860706 3963 n = add_data(pRExC_state, 1, "s");
830247a4 3964 RExC_rx->data->data[n] = (void*)rv;
ffc61ed2 3965 ARG_SET(ret, n);
a0ed51b3
LW
3966 }
3967
3968 return ret;
3969}
3970
76e3520e 3971STATIC char*
830247a4 3972S_nextchar(pTHX_ RExC_state_t *pRExC_state)
a0d0e21e 3973{
830247a4 3974 char* retval = RExC_parse++;
a0d0e21e 3975
4633a7c4 3976 for (;;) {
830247a4
IZ
3977 if (*RExC_parse == '(' && RExC_parse[1] == '?' &&
3978 RExC_parse[2] == '#') {
3979 while (*RExC_parse && *RExC_parse != ')')
3980 RExC_parse++;
3981 RExC_parse++;
4633a7c4
LW
3982 continue;
3983 }
830247a4
IZ
3984 if (RExC_flags16 & PMf_EXTENDED) {
3985 if (isSPACE(*RExC_parse)) {
3986 RExC_parse++;
748a9306
LW
3987 continue;
3988 }
830247a4
IZ
3989 else if (*RExC_parse == '#') {
3990 while (*RExC_parse && *RExC_parse != '\n')
3991 RExC_parse++;
3992 RExC_parse++;
748a9306
LW
3993 continue;
3994 }
748a9306 3995 }
4633a7c4 3996 return retval;
a0d0e21e 3997 }
a687059c
LW
3998}
3999
4000/*
c277df42 4001- reg_node - emit a node
a0d0e21e 4002*/
76e3520e 4003STATIC regnode * /* Location. */
830247a4 4004S_reg_node(pTHX_ RExC_state_t *pRExC_state, U8 op)
a687059c 4005{
c277df42
IZ
4006 register regnode *ret;
4007 register regnode *ptr;
a687059c 4008
830247a4 4009 ret = RExC_emit;
c277df42 4010 if (SIZE_ONLY) {
830247a4
IZ
4011 SIZE_ALIGN(RExC_size);
4012 RExC_size += 1;
a0d0e21e
LW
4013 return(ret);
4014 }
a687059c 4015
c277df42 4016 NODE_ALIGN_FILL(ret);
a0d0e21e 4017 ptr = ret;
c277df42 4018 FILL_ADVANCE_NODE(ptr, op);
fac92740
MJD
4019 if (RExC_offsets) { /* MJD */
4020 MJD_OFFSET_DEBUG((stderr, "%s:%u: (op %s) %s %u <- %u (len %u) (max %u).\n",
4021 "reg_node", __LINE__,
4022 reg_name[op],
4023 RExC_emit - RExC_emit_start > RExC_offsets[0]
4024 ? "Overwriting end of array!\n" : "OK",
4025 RExC_emit - RExC_emit_start,
4026 RExC_parse - RExC_start,
4027 RExC_offsets[0]));
4028 Set_Node_Offset(RExC_emit, RExC_parse + (op == END));
4029 }
4030
830247a4 4031 RExC_emit = ptr;
a687059c 4032
a0d0e21e 4033 return(ret);
a687059c
LW
4034}
4035
4036/*
a0d0e21e
LW
4037- reganode - emit a node with an argument
4038*/
76e3520e 4039STATIC regnode * /* Location. */
830247a4 4040S_reganode(pTHX_ RExC_state_t *pRExC_state, U8 op, U32 arg)
fe14fcc3 4041{
c277df42
IZ
4042 register regnode *ret;
4043 register regnode *ptr;
fe14fcc3 4044
830247a4 4045 ret = RExC_emit;
c277df42 4046 if (SIZE_ONLY) {
830247a4
IZ
4047 SIZE_ALIGN(RExC_size);
4048 RExC_size += 2;
a0d0e21e
LW
4049 return(ret);
4050 }
fe14fcc3 4051
c277df42 4052 NODE_ALIGN_FILL(ret);
a0d0e21e 4053 ptr = ret;
c277df42 4054 FILL_ADVANCE_NODE_ARG(ptr, op, arg);
fac92740
MJD
4055 if (RExC_offsets) { /* MJD */
4056 MJD_OFFSET_DEBUG((stderr, "%s: %s %u <- %u (max %u).\n",
4057 "reganode",
4058 RExC_emit - RExC_emit_start > RExC_offsets[0] ?
4059 "Overwriting end of array!\n" : "OK",
4060 RExC_emit - RExC_emit_start,
4061 RExC_parse - RExC_start,
4062 RExC_offsets[0]));
4063 Set_Cur_Node_Offset;
4064 }
4065
830247a4 4066 RExC_emit = ptr;
fe14fcc3 4067
a0d0e21e 4068 return(ret);
fe14fcc3
LW
4069}
4070
4071/*
cd439c50 4072- reguni - emit (if appropriate) a Unicode character
a0ed51b3
LW
4073*/
4074STATIC void
830247a4 4075S_reguni(pTHX_ RExC_state_t *pRExC_state, UV uv, char* s, STRLEN* lenp)
a0ed51b3 4076{
5e12f4fb 4077 *lenp = SIZE_ONLY ? UNISKIP(uv) : (uvchr_to_utf8((U8*)s, uv) - (U8*)s);
a0ed51b3
LW
4078}
4079
4080/*
a0d0e21e
LW
4081- reginsert - insert an operator in front of already-emitted operand
4082*
4083* Means relocating the operand.
4084*/
76e3520e 4085STATIC void
830247a4 4086S_reginsert(pTHX_ RExC_state_t *pRExC_state, U8 op, regnode *opnd)
a687059c 4087{
c277df42
IZ
4088 register regnode *src;
4089 register regnode *dst;
4090 register regnode *place;
4091 register int offset = regarglen[(U8)op];
b81d288d 4092
22c35a8c 4093/* (PL_regkind[(U8)op] == CURLY ? EXTRA_STEP_2ARGS : 0); */
c277df42
IZ
4094
4095 if (SIZE_ONLY) {
830247a4 4096 RExC_size += NODE_STEP_REGNODE + offset;
a0d0e21e
LW
4097 return;
4098 }
a687059c 4099
830247a4
IZ
4100 src = RExC_emit;
4101 RExC_emit += NODE_STEP_REGNODE + offset;
4102 dst = RExC_emit;
fac92740 4103 while (src > opnd) {
c277df42 4104 StructCopy(--src, --dst, regnode);
fac92740
MJD
4105 if (RExC_offsets) { /* MJD 20010112 */
4106 MJD_OFFSET_DEBUG((stderr, "%s: %s copy %u -> %u (max %u).\n",
4107 "reg_insert",
4108 dst - RExC_emit_start > RExC_offsets[0]
4109 ? "Overwriting end of array!\n" : "OK",
4110 src - RExC_emit_start,
4111 dst - RExC_emit_start,
4112 RExC_offsets[0]));
4113 Set_Node_Offset_To_R(dst-RExC_emit_start, Node_Offset(src));
4114 Set_Node_Length_To_R(dst-RExC_emit_start, Node_Length(src));
4115 }
4116 }
4117
a0d0e21e
LW
4118
4119 place = opnd; /* Op node, where operand used to be. */
fac92740
MJD
4120 if (RExC_offsets) { /* MJD */
4121 MJD_OFFSET_DEBUG((stderr, "%s: %s %u <- %u (max %u).\n",
4122 "reginsert",
4123 place - RExC_emit_start > RExC_offsets[0]
4124 ? "Overwriting end of array!\n" : "OK",
4125 place - RExC_emit_start,
4126 RExC_parse - RExC_start,
4127 RExC_offsets[0]));
4128 Set_Node_Offset(place, RExC_parse);
4129 }
c277df42
IZ
4130 src = NEXTOPER(place);
4131 FILL_ADVANCE_NODE(place, op);
4132 Zero(src, offset, regnode);
a687059c
LW
4133}
4134
4135/*
c277df42 4136- regtail - set the next-pointer at the end of a node chain of p to val.
a0d0e21e 4137*/
76e3520e 4138STATIC void
830247a4 4139S_regtail(pTHX_ RExC_state_t *pRExC_state, regnode *p, regnode *val)
a687059c 4140{
c277df42
IZ
4141 register regnode *scan;
4142 register regnode *temp;
a0d0e21e 4143
c277df42 4144 if (SIZE_ONLY)
a0d0e21e
LW
4145 return;
4146
4147 /* Find last node. */
4148 scan = p;
4149 for (;;) {
4150 temp = regnext(scan);
4151 if (temp == NULL)
4152 break;
4153 scan = temp;
4154 }
a687059c 4155
c277df42
IZ
4156 if (reg_off_by_arg[OP(scan)]) {
4157 ARG_SET(scan, val - scan);
a0ed51b3
LW
4158 }
4159 else {
c277df42
IZ
4160 NEXT_OFF(scan) = val - scan;
4161 }
a687059c
LW
4162}
4163
4164/*
a0d0e21e
LW
4165- regoptail - regtail on operand of first argument; nop if operandless
4166*/
76e3520e 4167STATIC void
830247a4 4168S_regoptail(pTHX_ RExC_state_t *pRExC_state, regnode *p, regnode *val)
a687059c 4169{
a0d0e21e 4170 /* "Operandless" and "op != BRANCH" are synonymous in practice. */
c277df42
IZ
4171 if (p == NULL || SIZE_ONLY)
4172 return;
22c35a8c 4173 if (PL_regkind[(U8)OP(p)] == BRANCH) {
830247a4 4174 regtail(pRExC_state, NEXTOPER(p), val);
a0ed51b3 4175 }
22c35a8c 4176 else if ( PL_regkind[(U8)OP(p)] == BRANCHJ) {
830247a4 4177 regtail(pRExC_state, NEXTOPER(NEXTOPER(p)), val);
a0ed51b3
LW
4178 }
4179 else
a0d0e21e 4180 return;
a687059c
LW
4181}
4182
4183/*
4184 - regcurly - a little FSA that accepts {\d+,?\d*}
4185 */
79072805 4186STATIC I32
cea2e8a9 4187S_regcurly(pTHX_ register char *s)
a687059c
LW
4188{
4189 if (*s++ != '{')
4190 return FALSE;
f0fcb552 4191 if (!isDIGIT(*s))
a687059c 4192 return FALSE;
f0fcb552 4193 while (isDIGIT(*s))
a687059c
LW
4194 s++;
4195 if (*s == ',')
4196 s++;
f0fcb552 4197 while (isDIGIT(*s))
a687059c
LW
4198 s++;
4199 if (*s != '}')
4200 return FALSE;
4201 return TRUE;
4202}
4203
a687059c 4204
8fa7f367
JH
4205#ifdef DEBUGGING
4206
76e3520e 4207STATIC regnode *
cea2e8a9 4208S_dumpuntil(pTHX_ regnode *start, regnode *node, regnode *last, SV* sv, I32 l)
c277df42 4209{
f248d071 4210 register U8 op = EXACT; /* Arbitrary non-END op. */
155aba94 4211 register regnode *next;
c277df42
IZ
4212
4213 while (op != END && (!last || node < last)) {
4214 /* While that wasn't END last time... */
4215
4216 NODE_ALIGN(node);
4217 op = OP(node);
4218 if (op == CLOSE)
4219 l--;
4220 next = regnext(node);
4221 /* Where, what. */
4222 if (OP(node) == OPTIMIZED)
4223 goto after_print;
4224 regprop(sv, node);
b900a521 4225 PerlIO_printf(Perl_debug_log, "%4"IVdf":%*s%s", (IV)(node - start),
f1dbda3d 4226 (int)(2*l + 1), "", SvPVX(sv));
c277df42
IZ
4227 if (next == NULL) /* Next ptr. */
4228 PerlIO_printf(Perl_debug_log, "(0)");
b81d288d 4229 else
b900a521 4230 PerlIO_printf(Perl_debug_log, "(%"IVdf")", (IV)(next - start));
c277df42
IZ
4231 (void)PerlIO_putc(Perl_debug_log, '\n');
4232 after_print:
22c35a8c 4233 if (PL_regkind[(U8)op] == BRANCHJ) {
b81d288d
AB
4234 register regnode *nnode = (OP(next) == LONGJMP
4235 ? regnext(next)
c277df42
IZ
4236 : next);
4237 if (last && nnode > last)
4238 nnode = last;
4239 node = dumpuntil(start, NEXTOPER(NEXTOPER(node)), nnode, sv, l + 1);
a0ed51b3 4240 }
22c35a8c 4241 else if (PL_regkind[(U8)op] == BRANCH) {
c277df42 4242 node = dumpuntil(start, NEXTOPER(node), next, sv, l + 1);
a0ed51b3
LW
4243 }
4244 else if ( op == CURLY) { /* `next' might be very big: optimizer */
c277df42
IZ
4245 node = dumpuntil(start, NEXTOPER(node) + EXTRA_STEP_2ARGS,
4246 NEXTOPER(node) + EXTRA_STEP_2ARGS + 1, sv, l + 1);
a0ed51b3 4247 }
22c35a8c 4248 else if (PL_regkind[(U8)op] == CURLY && op != CURLYX) {
c277df42
IZ
4249 node = dumpuntil(start, NEXTOPER(node) + EXTRA_STEP_2ARGS,
4250 next, sv, l + 1);
a0ed51b3
LW
4251 }
4252 else if ( op == PLUS || op == STAR) {
c277df42 4253 node = dumpuntil(start, NEXTOPER(node), NEXTOPER(node) + 1, sv, l + 1);
a0ed51b3
LW
4254 }
4255 else if (op == ANYOF) {
4f66b38d
HS
4256 /* arglen 1 + class block */
4257 node += 1 + ((ANYOF_FLAGS(node) & ANYOF_LARGE)
4258 ? ANYOF_CLASS_SKIP : ANYOF_SKIP);
4259 node = NEXTOPER(node);
a0ed51b3 4260 }
22c35a8c 4261 else if (PL_regkind[(U8)op] == EXACT) {
c277df42 4262 /* Literal string, where present. */
cd439c50 4263 node += NODE_SZ_STR(node) - 1;
c277df42 4264 node = NEXTOPER(node);
a0ed51b3
LW
4265 }
4266 else {
c277df42
IZ
4267 node = NEXTOPER(node);
4268 node += regarglen[(U8)op];
4269 }
4270 if (op == CURLYX || op == OPEN)
4271 l++;
4272 else if (op == WHILEM)
4273 l--;
4274 }
4275 return node;
4276}
4277
8fa7f367
JH
4278#endif /* DEBUGGING */
4279
a687059c 4280/*
fd181c75 4281 - regdump - dump a regexp onto Perl_debug_log in vaguely comprehensible form
a687059c
LW
4282 */
4283void
864dbfa3 4284Perl_regdump(pTHX_ regexp *r)
a687059c 4285{
35ff7856 4286#ifdef DEBUGGING
46fc3d4c 4287 SV *sv = sv_newmortal();
a687059c 4288
c277df42 4289 (void)dumpuntil(r->program, r->program + 1, NULL, sv, 0);
a0d0e21e
LW
4290
4291 /* Header fields of interest. */
c277df42 4292 if (r->anchored_substr)
7b0972df 4293 PerlIO_printf(Perl_debug_log,
b81d288d 4294 "anchored `%s%.*s%s'%s at %"IVdf" ",
3280af22 4295 PL_colors[0],
7b0972df 4296 (int)(SvCUR(r->anchored_substr) - (SvTAIL(r->anchored_substr)!=0)),
b81d288d 4297 SvPVX(r->anchored_substr),
3280af22 4298 PL_colors[1],
c277df42 4299 SvTAIL(r->anchored_substr) ? "$" : "",
7b0972df 4300 (IV)r->anchored_offset);
c277df42 4301 if (r->float_substr)
7b0972df 4302 PerlIO_printf(Perl_debug_log,
b81d288d 4303 "floating `%s%.*s%s'%s at %"IVdf"..%"UVuf" ",
3280af22 4304 PL_colors[0],
b81d288d 4305 (int)(SvCUR(r->float_substr) - (SvTAIL(r->float_substr)!=0)),
2c2d71f5 4306 SvPVX(r->float_substr),
3280af22 4307 PL_colors[1],
c277df42 4308 SvTAIL(r->float_substr) ? "$" : "",
7b0972df 4309 (IV)r->float_min_offset, (UV)r->float_max_offset);
c277df42 4310 if (r->check_substr)
b81d288d
AB
4311 PerlIO_printf(Perl_debug_log,
4312 r->check_substr == r->float_substr
c277df42
IZ
4313 ? "(checking floating" : "(checking anchored");
4314 if (r->reganch & ROPT_NOSCAN)
4315 PerlIO_printf(Perl_debug_log, " noscan");
4316 if (r->reganch & ROPT_CHECK_ALL)
4317 PerlIO_printf(Perl_debug_log, " isall");
4318 if (r->check_substr)
4319 PerlIO_printf(Perl_debug_log, ") ");
4320
46fc3d4c 4321 if (r->regstclass) {
4322 regprop(sv, r->regstclass);
4323 PerlIO_printf(Perl_debug_log, "stclass `%s' ", SvPVX(sv));
4324 }
774d564b 4325 if (r->reganch & ROPT_ANCH) {
4326 PerlIO_printf(Perl_debug_log, "anchored");
4327 if (r->reganch & ROPT_ANCH_BOL)
4328 PerlIO_printf(Perl_debug_log, "(BOL)");
c277df42
IZ
4329 if (r->reganch & ROPT_ANCH_MBOL)
4330 PerlIO_printf(Perl_debug_log, "(MBOL)");
cad2e5aa
JH
4331 if (r->reganch & ROPT_ANCH_SBOL)
4332 PerlIO_printf(Perl_debug_log, "(SBOL)");
774d564b 4333 if (r->reganch & ROPT_ANCH_GPOS)
4334 PerlIO_printf(Perl_debug_log, "(GPOS)");
4335 PerlIO_putc(Perl_debug_log, ' ');
4336 }
c277df42
IZ
4337 if (r->reganch & ROPT_GPOS_SEEN)
4338 PerlIO_printf(Perl_debug_log, "GPOS ");
a0d0e21e 4339 if (r->reganch & ROPT_SKIP)
760ac839 4340 PerlIO_printf(Perl_debug_log, "plus ");
a0d0e21e 4341 if (r->reganch & ROPT_IMPLICIT)
760ac839 4342 PerlIO_printf(Perl_debug_log, "implicit ");
760ac839 4343 PerlIO_printf(Perl_debug_log, "minlen %ld ", (long) r->minlen);
ce862d02
IZ
4344 if (r->reganch & ROPT_EVAL_SEEN)
4345 PerlIO_printf(Perl_debug_log, "with eval ");
760ac839 4346 PerlIO_printf(Perl_debug_log, "\n");
fac92740
MJD
4347 if (r->offsets) {
4348 U32 i;
4349 U32 len = r->offsets[0];
392fbf5d 4350 PerlIO_printf(Perl_debug_log, "Offsets: [%"UVuf"]\n\t", (UV)r->offsets[0]);
fac92740 4351 for (i = 1; i <= len; i++)
392fbf5d
RB
4352 PerlIO_printf(Perl_debug_log, "%"UVuf"[%"UVuf"] ",
4353 (UV)r->offsets[i*2-1],
4354 (UV)r->offsets[i*2]);
fac92740
MJD
4355 PerlIO_printf(Perl_debug_log, "\n");
4356 }
17c3b450 4357#endif /* DEBUGGING */
a687059c
LW
4358}
4359
8fa7f367
JH
4360#ifdef DEBUGGING
4361
653099ff
GS
4362STATIC void
4363S_put_byte(pTHX_ SV *sv, int c)
4364{
7be5a6cf 4365 if (isCNTRL(c) || c == 255 || !isPRINT(c))
653099ff
GS
4366 Perl_sv_catpvf(aTHX_ sv, "\\%o", c);
4367 else if (c == '-' || c == ']' || c == '\\' || c == '^')
4368 Perl_sv_catpvf(aTHX_ sv, "\\%c", c);
4369 else
4370 Perl_sv_catpvf(aTHX_ sv, "%c", c);
4371}
4372
8fa7f367
JH
4373#endif /* DEBUGGING */
4374
a687059c 4375/*
a0d0e21e
LW
4376- regprop - printable representation of opcode
4377*/
46fc3d4c 4378void
864dbfa3 4379Perl_regprop(pTHX_ SV *sv, regnode *o)
a687059c 4380{
35ff7856 4381#ifdef DEBUGGING
9b155405 4382 register int k;
a0d0e21e 4383
54dc92de 4384 sv_setpvn(sv, "", 0);
9b155405 4385 if (OP(o) >= reg_num) /* regnode.type is unsigned */
830247a4
IZ
4386 /* It would be nice to FAIL() here, but this may be called from
4387 regexec.c, and it would be hard to supply pRExC_state. */
4388 Perl_croak(aTHX_ "Corrupted regexp opcode");
9b155405
IZ
4389 sv_catpv(sv, (char*)reg_name[OP(o)]); /* Take off const! */
4390
4391 k = PL_regkind[(U8)OP(o)];
4392
4393 if (k == EXACT)
7821416a 4394 Perl_sv_catpvf(aTHX_ sv, " <%s%.*s%s>", PL_colors[0],
cd439c50 4395 STR_LEN(o), STRING(o), PL_colors[1]);
9b155405 4396 else if (k == CURLY) {
cb434fcc 4397 if (OP(o) == CURLYM || OP(o) == CURLYN || OP(o) == CURLYX)
cea2e8a9
GS
4398 Perl_sv_catpvf(aTHX_ sv, "[%d]", o->flags); /* Parenth number */
4399 Perl_sv_catpvf(aTHX_ sv, " {%d,%d}", ARG1(o), ARG2(o));
a0d0e21e 4400 }
2c2d71f5
JH
4401 else if (k == WHILEM && o->flags) /* Ordinal/of */
4402 Perl_sv_catpvf(aTHX_ sv, "[%d/%d]", o->flags & 0xf, o->flags>>4);
9b155405 4403 else if (k == REF || k == OPEN || k == CLOSE || k == GROUPP )
894356b3 4404 Perl_sv_catpvf(aTHX_ sv, "%d", (int)ARG(o)); /* Parenth number */
9b155405 4405 else if (k == LOGICAL)
04ebc1ab 4406 Perl_sv_catpvf(aTHX_ sv, "[%d]", o->flags); /* 2: embedded, otherwise 1 */
653099ff
GS
4407 else if (k == ANYOF) {
4408 int i, rangestart = -1;
ffc61ed2 4409 U8 flags = ANYOF_FLAGS(o);
19860706
JH
4410 const char * const anyofs[] = { /* Should be syncronized with
4411 * ANYOF_ #xdefines in regcomp.h */
653099ff
GS
4412 "\\w",
4413 "\\W",
4414 "\\s",
4415 "\\S",
4416 "\\d",
4417 "\\D",
4418 "[:alnum:]",
4419 "[:^alnum:]",
4420 "[:alpha:]",
4421 "[:^alpha:]",
4422 "[:ascii:]",
4423 "[:^ascii:]",
4424 "[:ctrl:]",
4425 "[:^ctrl:]",
4426 "[:graph:]",
4427 "[:^graph:]",
4428 "[:lower:]",
4429 "[:^lower:]",
4430 "[:print:]",
4431 "[:^print:]",
4432 "[:punct:]",
4433 "[:^punct:]",
4434 "[:upper:]",
aaa51d5e 4435 "[:^upper:]",
653099ff 4436 "[:xdigit:]",
aaa51d5e
JF
4437 "[:^xdigit:]",
4438 "[:space:]",
4439 "[:^space:]",
4440 "[:blank:]",
4441 "[:^blank:]"
653099ff
GS
4442 };
4443
19860706 4444 if (flags & ANYOF_LOCALE)
653099ff 4445 sv_catpv(sv, "{loc}");
19860706 4446 if (flags & ANYOF_FOLD)
653099ff
GS
4447 sv_catpv(sv, "{i}");
4448 Perl_sv_catpvf(aTHX_ sv, "[%s", PL_colors[0]);
19860706 4449 if (flags & ANYOF_INVERT)
653099ff 4450 sv_catpv(sv, "^");
ffc61ed2
JH
4451 for (i = 0; i <= 256; i++) {
4452 if (i < 256 && ANYOF_BITMAP_TEST(o,i)) {
4453 if (rangestart == -1)
4454 rangestart = i;
4455 } else if (rangestart != -1) {
4456 if (i <= rangestart + 3)
4457 for (; rangestart < i; rangestart++)
653099ff 4458 put_byte(sv, rangestart);
ffc61ed2
JH
4459 else {
4460 put_byte(sv, rangestart);
4461 sv_catpv(sv, "-");
4462 put_byte(sv, i - 1);
653099ff 4463 }
ffc61ed2 4464 rangestart = -1;
653099ff 4465 }
847a199f 4466 }
ffc61ed2
JH
4467
4468 if (o->flags & ANYOF_CLASS)
4469 for (i = 0; i < sizeof(anyofs)/sizeof(char*); i++)
4470 if (ANYOF_CLASS_TEST(o,i))
4471 sv_catpv(sv, anyofs[i]);
4472
4473 if (flags & ANYOF_UNICODE)
4474 sv_catpv(sv, "{unicode}");
1aa99e6b
IH
4475 else if (flags & ANYOF_UNICODE_ALL)
4476 sv_catpv(sv, "{all-unicode}");
ffc61ed2
JH
4477
4478 {
4479 SV *lv;
4480 SV *sw = regclass_swash(o, FALSE, &lv);
b81d288d 4481
ffc61ed2
JH
4482 if (lv) {
4483 if (sw) {
4484 UV i;
4485 U8 s[UTF8_MAXLEN+1];
b81d288d 4486
ffc61ed2 4487 for (i = 0; i <= 256; i++) { /* just the first 256 */
2b9d42f0 4488 U8 *e = uvchr_to_utf8(s, i);
ffc61ed2 4489
3568d838 4490 if (i < 256 && swash_fetch(sw, s, TRUE)) {
ffc61ed2
JH
4491 if (rangestart == -1)
4492 rangestart = i;
4493 } else if (rangestart != -1) {
4494 U8 *p;
b81d288d 4495
ffc61ed2
JH
4496 if (i <= rangestart + 3)
4497 for (; rangestart < i; rangestart++) {
2b9d42f0 4498 for(e = uvchr_to_utf8(s, rangestart), p = s; p < e; p++)
ffc61ed2
JH
4499 put_byte(sv, *p);
4500 }
4501 else {
2b9d42f0 4502 for (e = uvchr_to_utf8(s, rangestart), p = s; p < e; p++)
ffc61ed2
JH
4503 put_byte(sv, *p);
4504 sv_catpv(sv, "-");
2b9d42f0 4505 for (e = uvchr_to_utf8(s, i - 1), p = s; p < e; p++)
ffc61ed2
JH
4506 put_byte(sv, *p);
4507 }
4508 rangestart = -1;
4509 }
19860706 4510 }
ffc61ed2
JH
4511
4512 sv_catpv(sv, "..."); /* et cetera */
19860706 4513 }
fde631ed 4514
ffc61ed2
JH
4515 {
4516 char *s = savepv(SvPVX(lv));
4517 char *origs = s;
b81d288d 4518
ffc61ed2 4519 while(*s && *s != '\n') s++;
b81d288d 4520
ffc61ed2
JH
4521 if (*s == '\n') {
4522 char *t = ++s;
4523
4524 while (*s) {
4525 if (*s == '\n')
4526 *s = ' ';
4527 s++;
4528 }
4529 if (s[-1] == ' ')
4530 s[-1] = 0;
4531
4532 sv_catpv(sv, t);
fde631ed 4533 }
b81d288d 4534
ffc61ed2 4535 Safefree(origs);
fde631ed
JH
4536 }
4537 }
653099ff 4538 }
ffc61ed2 4539
653099ff
GS
4540 Perl_sv_catpvf(aTHX_ sv, "%s]", PL_colors[1]);
4541 }
9b155405 4542 else if (k == BRANCHJ && (OP(o) == UNLESSM || OP(o) == IFMATCH))
cea2e8a9 4543 Perl_sv_catpvf(aTHX_ sv, "[-%d]", o->flags);
17c3b450 4544#endif /* DEBUGGING */
35ff7856 4545}
a687059c 4546
cad2e5aa
JH
4547SV *
4548Perl_re_intuit_string(pTHX_ regexp *prog)
4549{ /* Assume that RE_INTUIT is set */
4550 DEBUG_r(
4551 { STRLEN n_a;
4552 char *s = SvPV(prog->check_substr,n_a);
4553
4554 if (!PL_colorset) reginitcolors();
4555 PerlIO_printf(Perl_debug_log,
4556 "%sUsing REx substr:%s `%s%.60s%s%s'\n",
4557 PL_colors[4],PL_colors[5],PL_colors[0],
4558 s,
4559 PL_colors[1],
4560 (strlen(s) > 60 ? "..." : ""));
4561 } );
4562
4563 return prog->check_substr;
4564}
4565
2b69d0c2 4566void
864dbfa3 4567Perl_pregfree(pTHX_ struct regexp *r)
a687059c 4568{
adac82c7 4569 DEBUG_r(if (!PL_colorset) reginitcolors());
7821416a
IZ
4570
4571 if (!r || (--r->refcnt > 0))
4572 return;
cad2e5aa
JH
4573 DEBUG_r(PerlIO_printf(Perl_debug_log,
4574 "%sFreeing REx:%s `%s%.60s%s%s'\n",
4575 PL_colors[4],PL_colors[5],PL_colors[0],
4576 r->precomp,
4577 PL_colors[1],
4578 (strlen(r->precomp) > 60 ? "..." : "")));
4579
c277df42 4580 if (r->precomp)
a0d0e21e 4581 Safefree(r->precomp);
fac92740
MJD
4582 if (r->offsets) /* 20010421 MJD */
4583 Safefree(r->offsets);
cf93c79d
IZ
4584 if (RX_MATCH_COPIED(r))
4585 Safefree(r->subbeg);
a193d654
GS
4586 if (r->substrs) {
4587 if (r->anchored_substr)
4588 SvREFCNT_dec(r->anchored_substr);
4589 if (r->float_substr)
4590 SvREFCNT_dec(r->float_substr);
2779dcf1 4591 Safefree(r->substrs);
a193d654 4592 }
c277df42
IZ
4593 if (r->data) {
4594 int n = r->data->count;
dfad63ad
HS
4595 AV* new_comppad = NULL;
4596 AV* old_comppad;
4597 SV** old_curpad;
4598
c277df42 4599 while (--n >= 0) {
261faec3 4600 /* If you add a ->what type here, update the comment in regcomp.h */
c277df42
IZ
4601 switch (r->data->what[n]) {
4602 case 's':
4603 SvREFCNT_dec((SV*)r->data->data[n]);
4604 break;
653099ff
GS
4605 case 'f':
4606 Safefree(r->data->data[n]);
4607 break;
dfad63ad
HS
4608 case 'p':
4609 new_comppad = (AV*)r->data->data[n];
4610 break;
c277df42 4611 case 'o':
dfad63ad 4612 if (new_comppad == NULL)
cea2e8a9 4613 Perl_croak(aTHX_ "panic: pregfree comppad");
dfad63ad
HS
4614 old_comppad = PL_comppad;
4615 old_curpad = PL_curpad;
1e6dc0b6
SB
4616 /* Watch out for global destruction's random ordering. */
4617 if (SvTYPE(new_comppad) == SVt_PVAV) {
4618 PL_comppad = new_comppad;
4619 PL_curpad = AvARRAY(new_comppad);
4620 }
4621 else
4622 PL_curpad = NULL;
9b978d73
DM
4623
4624 if (!OpREFCNT_dec((OP_4tree*)r->data->data[n])) {
4625 op_free((OP_4tree*)r->data->data[n]);
4626 }
4627
dfad63ad
HS
4628 PL_comppad = old_comppad;
4629 PL_curpad = old_curpad;
4630 SvREFCNT_dec((SV*)new_comppad);
4631 new_comppad = NULL;
c277df42
IZ
4632 break;
4633 case 'n':
4634 break;
4635 default:
830247a4 4636 Perl_croak(aTHX_ "panic: regfree data code '%c'", r->data->what[n]);
c277df42
IZ
4637 }
4638 }
4639 Safefree(r->data->what);
4640 Safefree(r->data);
a0d0e21e
LW
4641 }
4642 Safefree(r->startp);
4643 Safefree(r->endp);
4644 Safefree(r);
a687059c 4645}
c277df42
IZ
4646
4647/*
4648 - regnext - dig the "next" pointer out of a node
4649 *
4650 * [Note, when REGALIGN is defined there are two places in regmatch()
4651 * that bypass this code for speed.]
4652 */
4653regnode *
864dbfa3 4654Perl_regnext(pTHX_ register regnode *p)
c277df42
IZ
4655{
4656 register I32 offset;
4657
3280af22 4658 if (p == &PL_regdummy)
c277df42
IZ
4659 return(NULL);
4660
4661 offset = (reg_off_by_arg[OP(p)] ? ARG(p) : NEXT_OFF(p));
4662 if (offset == 0)
4663 return(NULL);
4664
c277df42 4665 return(p+offset);
c277df42
IZ
4666}
4667
01f988be 4668STATIC void
cea2e8a9 4669S_re_croak2(pTHX_ const char* pat1,const char* pat2,...)
c277df42
IZ
4670{
4671 va_list args;
4672 STRLEN l1 = strlen(pat1);
4673 STRLEN l2 = strlen(pat2);
4674 char buf[512];
06bf62c7 4675 SV *msv;
c277df42
IZ
4676 char *message;
4677
4678 if (l1 > 510)
4679 l1 = 510;
4680 if (l1 + l2 > 510)
4681 l2 = 510 - l1;
4682 Copy(pat1, buf, l1 , char);
4683 Copy(pat2, buf + l1, l2 , char);
3b818b81
GS
4684 buf[l1 + l2] = '\n';
4685 buf[l1 + l2 + 1] = '\0';
8736538c
AS
4686#ifdef I_STDARG
4687 /* ANSI variant takes additional second argument */
c277df42 4688 va_start(args, pat2);
8736538c
AS
4689#else
4690 va_start(args);
4691#endif
5a844595 4692 msv = vmess(buf, &args);
c277df42 4693 va_end(args);
06bf62c7 4694 message = SvPV(msv,l1);
c277df42
IZ
4695 if (l1 > 512)
4696 l1 = 512;
4697 Copy(message, buf, l1 , char);
4698 buf[l1] = '\0'; /* Overwrite \n */
cea2e8a9 4699 Perl_croak(aTHX_ "%s", buf);
c277df42 4700}
a0ed51b3
LW
4701
4702/* XXX Here's a total kludge. But we need to re-enter for swash routines. */
4703
4704void
864dbfa3 4705Perl_save_re_context(pTHX)
b81d288d 4706{
830247a4
IZ
4707#if 0
4708 SAVEPPTR(RExC_precomp); /* uncompiled string. */
4709 SAVEI32(RExC_npar); /* () count. */
4710 SAVEI32(RExC_size); /* Code size. */
4711 SAVEI16(RExC_flags16); /* are we folding, multilining? */
4712 SAVEVPTR(RExC_rx); /* from regcomp.c */
4713 SAVEI32(RExC_seen); /* from regcomp.c */
4714 SAVEI32(RExC_sawback); /* Did we see \1, ...? */
4715 SAVEI32(RExC_naughty); /* How bad is this pattern? */
4716 SAVEVPTR(RExC_emit); /* Code-emit pointer; &regdummy = don't */
4717 SAVEPPTR(RExC_end); /* End of input for compile */
4718 SAVEPPTR(RExC_parse); /* Input-scan pointer. */
4719#endif
4720
4721 SAVEI32(PL_reg_flags); /* from regexec.c */
a0ed51b3 4722 SAVEPPTR(PL_bostr);
a0ed51b3
LW
4723 SAVEPPTR(PL_reginput); /* String-input pointer. */
4724 SAVEPPTR(PL_regbol); /* Beginning of input, for ^ check. */
4725 SAVEPPTR(PL_regeol); /* End of input, for $ check. */
7766f137
GS
4726 SAVEVPTR(PL_regstartp); /* Pointer to startp array. */
4727 SAVEVPTR(PL_regendp); /* Ditto for endp. */
4728 SAVEVPTR(PL_reglastparen); /* Similarly for lastparen. */
a0ed51b3 4729 SAVEPPTR(PL_regtill); /* How far we are required to go. */
b81d288d 4730 SAVEGENERICPV(PL_reg_start_tmp); /* from regexec.c */
a0ed51b3 4731 PL_reg_start_tmp = 0;
a0ed51b3
LW
4732 SAVEI32(PL_reg_start_tmpl); /* from regexec.c */
4733 PL_reg_start_tmpl = 0;
7766f137 4734 SAVEVPTR(PL_regdata);
a0ed51b3
LW
4735 SAVEI32(PL_reg_eval_set); /* from regexec.c */
4736 SAVEI32(PL_regnarrate); /* from regexec.c */
7766f137 4737 SAVEVPTR(PL_regprogram); /* from regexec.c */
a0ed51b3 4738 SAVEINT(PL_regindent); /* from regexec.c */
7766f137
GS
4739 SAVEVPTR(PL_regcc); /* from regexec.c */
4740 SAVEVPTR(PL_curcop);
7766f137
GS
4741 SAVEVPTR(PL_reg_call_cc); /* from regexec.c */
4742 SAVEVPTR(PL_reg_re); /* from regexec.c */
54b6e2fa
IZ
4743 SAVEPPTR(PL_reg_ganch); /* from regexec.c */
4744 SAVESPTR(PL_reg_sv); /* from regexec.c */
53c4c00c 4745 SAVEI8(PL_reg_match_utf8); /* from regexec.c */
7766f137 4746 SAVEVPTR(PL_reg_magic); /* from regexec.c */
54b6e2fa 4747 SAVEI32(PL_reg_oldpos); /* from regexec.c */
7766f137
GS
4748 SAVEVPTR(PL_reg_oldcurpm); /* from regexec.c */
4749 SAVEVPTR(PL_reg_curpm); /* from regexec.c */
5fb7366e 4750 SAVEI32(PL_regnpar); /* () count. */
e49a9654 4751 SAVEI32(PL_regsize); /* from regexec.c */
54b6e2fa 4752#ifdef DEBUGGING
b81d288d 4753 SAVEPPTR(PL_reg_starttry); /* from regexec.c */
54b6e2fa 4754#endif
a0ed51b3 4755}
51371543 4756
51371543 4757static void
acfe0abc 4758clear_re(pTHX_ void *r)
51371543
GS
4759{
4760 ReREFCNT_dec((regexp *)r);
4761}
ffbc6a93 4762