This is a live mirror of the Perl 5 development currently hosted at https://github.com/perl/perl5
Promote the e/uid to wide enough un/signed integers
[perl5.git] / regcomp.c
CommitLineData
a0d0e21e
LW
1/* regcomp.c
2 */
3
4/*
5 * "A fair jaw-cracker dwarf-language must be." --Samwise Gamgee
6 */
7
a687059c
LW
8/* NOTE: this is derived from Henry Spencer's regexp code, and should not
9 * confused with the original package (see point 3 below). Thanks, Henry!
10 */
11
12/* Additional note: this code is very heavily munged from Henry's version
13 * in places. In some spots I've traded clarity for efficiency, so don't
14 * blame Henry for some of the lack of readability.
15 */
16
e50aee73
AD
17/* The names of the functions have been changed from regcomp and
18 * regexec to pregcomp and pregexec in order to avoid conflicts
19 * with the POSIX routines of the same names.
20*/
21
b9d5759e
AD
22#ifdef PERL_EXT_RE_BUILD
23/* need to replace pregcomp et al, so enable that */
24# ifndef PERL_IN_XSUB_RE
25# define PERL_IN_XSUB_RE
26# endif
27/* need access to debugger hooks */
cad2e5aa 28# if defined(PERL_EXT_RE_DEBUG) && !defined(DEBUGGING)
b9d5759e
AD
29# define DEBUGGING
30# endif
31#endif
32
33#ifdef PERL_IN_XSUB_RE
d06ea78c 34/* We *really* need to overwrite these symbols: */
56953603
IZ
35# define Perl_pregcomp my_regcomp
36# define Perl_regdump my_regdump
37# define Perl_regprop my_regprop
d06ea78c 38# define Perl_pregfree my_regfree
cad2e5aa
JH
39# define Perl_re_intuit_string my_re_intuit_string
40/* *These* symbols are masked to allow static link. */
d06ea78c 41# define Perl_regnext my_regnext
f0b8d043 42# define Perl_save_re_context my_save_re_context
b81d288d 43# define Perl_reginitcolors my_reginitcolors
c5be433b
GS
44
45# define PERL_NO_GET_CONTEXT
b81d288d 46#endif
56953603 47
f0fcb552 48/*SUPPRESS 112*/
a687059c 49/*
e50aee73 50 * pregcomp and pregexec -- regsub and regerror are not used in perl
a687059c
LW
51 *
52 * Copyright (c) 1986 by University of Toronto.
53 * Written by Henry Spencer. Not derived from licensed software.
54 *
55 * Permission is granted to anyone to use this software for any
56 * purpose on any computer system, and to redistribute it freely,
57 * subject to the following restrictions:
58 *
59 * 1. The author is not responsible for the consequences of use of
60 * this software, no matter how awful, even if they arise
61 * from defects in it.
62 *
63 * 2. The origin of this software must not be misrepresented, either
64 * by explicit claim or by omission.
65 *
66 * 3. Altered versions must be plainly marked as such, and must not
67 * be misrepresented as being the original software.
68 *
69 *
70 **** Alterations to Henry's code are...
71 ****
bc89e66f 72 **** Copyright (c) 1991-2001, Larry Wall
a687059c 73 ****
9ef589d8
LW
74 **** You may distribute under the terms of either the GNU General Public
75 **** License or the Artistic License, as specified in the README file.
76
a687059c
LW
77 *
78 * Beware that some of this code is subtly aware of the way operator
79 * precedence is structured in regular expressions. Serious changes in
80 * regular-expression syntax might require a total rethink.
81 */
82#include "EXTERN.h"
864dbfa3 83#define PERL_IN_REGCOMP_C
a687059c 84#include "perl.h"
d06ea78c 85
acfe0abc 86#ifndef PERL_IN_XSUB_RE
d06ea78c
GS
87# include "INTERN.h"
88#endif
c277df42
IZ
89
90#define REG_COMP_C
a687059c
LW
91#include "regcomp.h"
92
d4cce5f1 93#ifdef op
11343788 94#undef op
d4cce5f1 95#endif /* op */
11343788 96
fe14fcc3
LW
97#ifdef MSDOS
98# if defined(BUGGY_MSC6)
99 /* MSC 6.00A breaks on op/regexp.t test 85 unless we turn this off */
100 # pragma optimize("a",off)
101 /* But MSC 6.00A is happy with 'w', for aliases only across function calls*/
102 # pragma optimize("w",on )
103# endif /* BUGGY_MSC6 */
104#endif /* MSDOS */
105
a687059c
LW
106#ifndef STATIC
107#define STATIC static
108#endif
109
830247a4
IZ
110typedef struct RExC_state_t {
111 U16 flags16; /* are we folding, multilining? */
112 char *precomp; /* uncompiled string. */
113 regexp *rx;
fac92740 114 char *start; /* Start of input for compile */
830247a4
IZ
115 char *end; /* End of input for compile */
116 char *parse; /* Input-scan pointer. */
117 I32 whilem_seen; /* number of WHILEM in this expr */
fac92740 118 regnode *emit_start; /* Start of emitted-code area */
ffc61ed2 119 regnode *emit; /* Code-emit pointer; &regdummy = don't = compiling */
830247a4
IZ
120 I32 naughty; /* How bad is this pattern? */
121 I32 sawback; /* Did we see \1, ...? */
122 U32 seen;
123 I32 size; /* Code size. */
124 I32 npar; /* () count. */
125 I32 extralen;
126 I32 seen_zerolen;
127 I32 seen_evals;
1aa99e6b 128 I32 utf8;
830247a4
IZ
129#if ADD_TO_REGEXEC
130 char *starttry; /* -Dr: where regtry was called. */
131#define RExC_starttry (pRExC_state->starttry)
132#endif
133} RExC_state_t;
134
135#define RExC_flags16 (pRExC_state->flags16)
136#define RExC_precomp (pRExC_state->precomp)
137#define RExC_rx (pRExC_state->rx)
fac92740 138#define RExC_start (pRExC_state->start)
830247a4
IZ
139#define RExC_end (pRExC_state->end)
140#define RExC_parse (pRExC_state->parse)
141#define RExC_whilem_seen (pRExC_state->whilem_seen)
fac92740 142#define RExC_offsets (pRExC_state->rx->offsets) /* I am not like the others */
830247a4 143#define RExC_emit (pRExC_state->emit)
fac92740 144#define RExC_emit_start (pRExC_state->emit_start)
830247a4
IZ
145#define RExC_naughty (pRExC_state->naughty)
146#define RExC_sawback (pRExC_state->sawback)
147#define RExC_seen (pRExC_state->seen)
148#define RExC_size (pRExC_state->size)
149#define RExC_npar (pRExC_state->npar)
150#define RExC_extralen (pRExC_state->extralen)
151#define RExC_seen_zerolen (pRExC_state->seen_zerolen)
152#define RExC_seen_evals (pRExC_state->seen_evals)
1aa99e6b 153#define RExC_utf8 (pRExC_state->utf8)
830247a4 154
a687059c
LW
155#define ISMULT1(c) ((c) == '*' || (c) == '+' || (c) == '?')
156#define ISMULT2(s) ((*s) == '*' || (*s) == '+' || (*s) == '?' || \
157 ((*s) == '{' && regcurly(s)))
a687059c 158
35c8bce7
LW
159#ifdef SPSTART
160#undef SPSTART /* dratted cpp namespace... */
161#endif
a687059c
LW
162/*
163 * Flags to be passed up and down.
164 */
a687059c 165#define WORST 0 /* Worst case. */
821b33a5 166#define HASWIDTH 0x1 /* Known to match non-null strings. */
a0d0e21e
LW
167#define SIMPLE 0x2 /* Simple enough to be STAR/PLUS operand. */
168#define SPSTART 0x4 /* Starts with * or +. */
169#define TRYAGAIN 0x8 /* Weeded out a declaration. */
a687059c 170
2c2d71f5
JH
171/* Length of a variant. */
172
173typedef struct scan_data_t {
174 I32 len_min;
175 I32 len_delta;
176 I32 pos_min;
177 I32 pos_delta;
178 SV *last_found;
179 I32 last_end; /* min value, <0 unless valid. */
180 I32 last_start_min;
181 I32 last_start_max;
182 SV **longest; /* Either &l_fixed, or &l_float. */
183 SV *longest_fixed;
184 I32 offset_fixed;
185 SV *longest_float;
186 I32 offset_float_min;
187 I32 offset_float_max;
188 I32 flags;
189 I32 whilem_c;
cb434fcc 190 I32 *last_closep;
653099ff 191 struct regnode_charclass_class *start_class;
2c2d71f5
JH
192} scan_data_t;
193
a687059c 194/*
e50aee73 195 * Forward declarations for pregcomp()'s friends.
a687059c 196 */
a0d0e21e 197
b81d288d 198static scan_data_t zero_scan_data = { 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
cb434fcc 199 0, 0, 0, 0, 0, 0};
c277df42
IZ
200
201#define SF_BEFORE_EOL (SF_BEFORE_SEOL|SF_BEFORE_MEOL)
202#define SF_BEFORE_SEOL 0x1
203#define SF_BEFORE_MEOL 0x2
204#define SF_FIX_BEFORE_EOL (SF_FIX_BEFORE_SEOL|SF_FIX_BEFORE_MEOL)
205#define SF_FL_BEFORE_EOL (SF_FL_BEFORE_SEOL|SF_FL_BEFORE_MEOL)
206
09b7f37c
CB
207#ifdef NO_UNARY_PLUS
208# define SF_FIX_SHIFT_EOL (0+2)
209# define SF_FL_SHIFT_EOL (0+4)
210#else
211# define SF_FIX_SHIFT_EOL (+2)
212# define SF_FL_SHIFT_EOL (+4)
213#endif
c277df42
IZ
214
215#define SF_FIX_BEFORE_SEOL (SF_BEFORE_SEOL << SF_FIX_SHIFT_EOL)
216#define SF_FIX_BEFORE_MEOL (SF_BEFORE_MEOL << SF_FIX_SHIFT_EOL)
217
218#define SF_FL_BEFORE_SEOL (SF_BEFORE_SEOL << SF_FL_SHIFT_EOL)
219#define SF_FL_BEFORE_MEOL (SF_BEFORE_MEOL << SF_FL_SHIFT_EOL) /* 0x20 */
220#define SF_IS_INF 0x40
221#define SF_HAS_PAR 0x80
222#define SF_IN_PAR 0x100
223#define SF_HAS_EVAL 0x200
4bfe0158 224#define SCF_DO_SUBSTR 0x400
653099ff
GS
225#define SCF_DO_STCLASS_AND 0x0800
226#define SCF_DO_STCLASS_OR 0x1000
227#define SCF_DO_STCLASS (SCF_DO_STCLASS_AND|SCF_DO_STCLASS_OR)
e1901655 228#define SCF_WHILEM_VISITED_POS 0x2000
c277df42 229
1aa99e6b 230#define UTF RExC_utf8
830247a4
IZ
231#define LOC (RExC_flags16 & PMf_LOCALE)
232#define FOLD (RExC_flags16 & PMf_FOLD)
a0ed51b3 233
ffc61ed2 234#define OOB_UNICODE 12345678
93733859 235#define OOB_NAMEDCLASS -1
b8c5462f 236
a0ed51b3
LW
237#define CHR_SVLEN(sv) (UTF ? sv_len_utf8(sv) : SvCUR(sv))
238#define CHR_DIST(a,b) (UTF ? utf8_distance(a,b) : a - b)
239
8615cb43 240
b45f050a
JF
241/* length of regex to show in messages that don't mark a position within */
242#define RegexLengthToShowInErrorMessages 127
243
244/*
245 * If MARKER[12] are adjusted, be sure to adjust the constants at the top
246 * of t/op/regmesg.t, the tests in t/op/re_tests, and those in
247 * op/pragma/warn/regcomp.
248 */
7253e4e3
RK
249#define MARKER1 "<-- HERE" /* marker as it appears in the description */
250#define MARKER2 " <-- HERE " /* marker as it appears within the regex */
b81d288d 251
7253e4e3 252#define REPORT_LOCATION " in regex; marked by " MARKER1 " in m/%.*s" MARKER2 "%s/"
b45f050a
JF
253
254/*
255 * Calls SAVEDESTRUCTOR_X if needed, then calls Perl_croak with the given
256 * arg. Show regex, up to a maximum length. If it's too long, chop and add
257 * "...".
258 */
a4eb266f 259#define FAIL(msg) \
8615cb43 260 STMT_START { \
a4eb266f 261 char *ellipses = ""; \
7a799e20 262 IV len = RExC_end - RExC_precomp; \
b45f050a 263 \
8615cb43 264 if (!SIZE_ONLY) \
830247a4 265 SAVEDESTRUCTOR_X(clear_re,(void*)RExC_rx); \
b45f050a
JF
266 \
267 if (len > RegexLengthToShowInErrorMessages) { \
268 /* chop 10 shorter than the max, to ensure meaning of "..." */ \
269 len = RegexLengthToShowInErrorMessages - 10; \
a4eb266f 270 ellipses = "..."; \
b45f050a
JF
271 } \
272 Perl_croak(aTHX_ "%s in regex m/%.*s%s/", \
830247a4 273 msg, (int)len, RExC_precomp, ellipses); \
8615cb43
JF
274 } STMT_END
275
b45f050a
JF
276/*
277 * Calls SAVEDESTRUCTOR_X if needed, then calls Perl_croak with the given
278 * args. Show regex, up to a maximum length. If it's too long, chop and add
279 * "...".
280 */
a4eb266f 281#define FAIL2(pat,msg) \
8615cb43 282 STMT_START { \
a4eb266f 283 char *ellipses = ""; \
7a799e20 284 IV len = RExC_end - RExC_precomp; \
b45f050a 285 \
8615cb43 286 if (!SIZE_ONLY) \
830247a4 287 SAVEDESTRUCTOR_X(clear_re,(void*)RExC_rx); \
8615cb43 288 \
b45f050a
JF
289 if (len > RegexLengthToShowInErrorMessages) { \
290 /* chop 10 shorter than the max, to ensure meaning of "..." */ \
291 len = RegexLengthToShowInErrorMessages - 10; \
a4eb266f 292 ellipses = "..."; \
b45f050a
JF
293 } \
294 S_re_croak2(aTHX_ pat, " in regex m/%.*s%s/", \
830247a4 295 msg, (int)len, RExC_precomp, ellipses); \
b45f050a
JF
296 } STMT_END
297
298
299/*
300 * Simple_vFAIL -- like FAIL, but marks the current location in the scan
301 */
302#define Simple_vFAIL(m) \
303 STMT_START { \
7a799e20 304 IV offset = RExC_parse - RExC_precomp; \
b45f050a
JF
305 \
306 Perl_croak(aTHX_ "%s" REPORT_LOCATION, \
830247a4 307 m, (int)offset, RExC_precomp, RExC_precomp + offset); \
b45f050a
JF
308 } STMT_END
309
310/*
311 * Calls SAVEDESTRUCTOR_X if needed, then Simple_vFAIL()
312 */
313#define vFAIL(m) \
314 STMT_START { \
315 if (!SIZE_ONLY) \
830247a4 316 SAVEDESTRUCTOR_X(clear_re,(void*)RExC_rx); \
b45f050a
JF
317 Simple_vFAIL(m); \
318 } STMT_END
319
320/*
321 * Like Simple_vFAIL(), but accepts two arguments.
322 */
323#define Simple_vFAIL2(m,a1) \
324 STMT_START { \
7a799e20 325 IV offset = RExC_parse - RExC_precomp; \
b45f050a
JF
326 \
327 S_re_croak2(aTHX_ m, REPORT_LOCATION, a1, \
830247a4 328 (int)offset, RExC_precomp, RExC_precomp + offset); \
b45f050a
JF
329 } STMT_END
330
331/*
332 * Calls SAVEDESTRUCTOR_X if needed, then Simple_vFAIL2().
333 */
334#define vFAIL2(m,a1) \
335 STMT_START { \
336 if (!SIZE_ONLY) \
830247a4 337 SAVEDESTRUCTOR_X(clear_re,(void*)RExC_rx); \
b45f050a
JF
338 Simple_vFAIL2(m, a1); \
339 } STMT_END
340
341
342/*
343 * Like Simple_vFAIL(), but accepts three arguments.
344 */
345#define Simple_vFAIL3(m, a1, a2) \
346 STMT_START { \
7a799e20 347 IV offset = RExC_parse - RExC_precomp; \
b45f050a
JF
348 \
349 S_re_croak2(aTHX_ m, REPORT_LOCATION, a1, a2, \
830247a4 350 (int)offset, RExC_precomp, RExC_precomp + offset); \
b45f050a
JF
351 } STMT_END
352
353/*
354 * Calls SAVEDESTRUCTOR_X if needed, then Simple_vFAIL3().
355 */
356#define vFAIL3(m,a1,a2) \
357 STMT_START { \
358 if (!SIZE_ONLY) \
830247a4 359 SAVEDESTRUCTOR_X(clear_re,(void*)RExC_rx); \
b45f050a
JF
360 Simple_vFAIL3(m, a1, a2); \
361 } STMT_END
362
363/*
364 * Like Simple_vFAIL(), but accepts four arguments.
365 */
366#define Simple_vFAIL4(m, a1, a2, a3) \
367 STMT_START { \
7a799e20 368 IV offset = RExC_parse - RExC_precomp; \
b45f050a
JF
369 \
370 S_re_croak2(aTHX_ m, REPORT_LOCATION, a1, a2, a3,\
830247a4 371 (int)offset, RExC_precomp, RExC_precomp + offset); \
b45f050a
JF
372 } STMT_END
373
374/*
375 * Like Simple_vFAIL(), but accepts five arguments.
376 */
377#define Simple_vFAIL5(m, a1, a2, a3, a4) \
378 STMT_START { \
7a799e20 379 IV offset = RExC_parse - RExC_precomp; \
b45f050a 380 S_re_croak2(aTHX_ m, REPORT_LOCATION, a1, a2, a3, a4,\
830247a4 381 (int)offset, RExC_precomp, RExC_precomp + offset); \
b45f050a
JF
382 } STMT_END
383
384
385#define vWARN(loc,m) \
386 STMT_START { \
7a799e20 387 IV offset = loc - RExC_precomp; \
b45f050a 388 Perl_warner(aTHX_ WARN_REGEXP, "%s" REPORT_LOCATION,\
830247a4 389 m, (int)offset, RExC_precomp, RExC_precomp + offset); \
b45f050a
JF
390 } STMT_END \
391
f9373011
PM
392#define vWARNdep(loc,m) \
393 STMT_START { \
7a799e20 394 IV offset = loc - RExC_precomp; \
f9373011
PM
395 int warn_cat = ckWARN(WARN_REGEXP) ? WARN_REGEXP : WARN_DEPRECATED; \
396 Perl_warner(aTHX_ warn_cat, "%s" REPORT_LOCATION,\
397 m, (int)offset, RExC_precomp, RExC_precomp + offset); \
398 } STMT_END \
399
b45f050a
JF
400
401#define vWARN2(loc, m, a1) \
402 STMT_START { \
7a799e20 403 IV offset = loc - RExC_precomp; \
b45f050a
JF
404 Perl_warner(aTHX_ WARN_REGEXP, m REPORT_LOCATION,\
405 a1, \
830247a4 406 (int)offset, RExC_precomp, RExC_precomp + offset); \
b45f050a
JF
407 } STMT_END
408
409#define vWARN3(loc, m, a1, a2) \
410 STMT_START { \
7a799e20 411 IV offset = loc - RExC_precomp; \
b45f050a
JF
412 Perl_warner(aTHX_ WARN_REGEXP, m REPORT_LOCATION, \
413 a1, a2, \
830247a4 414 (int)offset, RExC_precomp, RExC_precomp + offset); \
b45f050a
JF
415 } STMT_END
416
417#define vWARN4(loc, m, a1, a2, a3) \
418 STMT_START { \
7a799e20 419 IV offset = loc - RExC_precomp; \
b45f050a
JF
420 Perl_warner(aTHX_ WARN_REGEXP, m REPORT_LOCATION,\
421 a1, a2, a3, \
830247a4 422 (int)offset, RExC_precomp, RExC_precomp + offset); \
8615cb43
JF
423 } STMT_END
424
9d1d55b5
JP
425/* used for the parse_flags section for (?c) -- japhy */
426#define vWARN5(loc, m, a1, a2, a3, a4) \
427 STMT_START { \
7a799e20 428 IV offset = loc - RExC_precomp; \
9d1d55b5
JP
429 Perl_warner(aTHX_ WARN_REGEXP, m REPORT_LOCATION, \
430 a1, a2, a3, a4, \
431 (int)offset, RExC_precomp, RExC_precomp + offset); \
432 } STMT_END
433
8615cb43 434
cd439c50 435/* Allow for side effects in s */
fd3f0ae2 436#define REGC(c,s) STMT_START { if (!SIZE_ONLY) *(s) = (c); else (void)(s);} STMT_END
cd439c50 437
fac92740
MJD
438/* Macros for recording node offsets. 20001227 mjd@plover.com
439 * Nodes are numbered 1, 2, 3, 4. Node #n's position is recorded in
440 * element 2*n-1 of the array. Element #2n holds the byte length node #n.
441 * Element 0 holds the number n.
442 */
443
444#define MJD_OFFSET_DEBUG(x)
445/* #define MJD_OFFSET_DEBUG(x) fprintf x */
446
447
448# define Set_Node_Offset_To_R(node,byte) \
449 STMT_START { \
450 if (! SIZE_ONLY) { \
451 if((node) < 0) { \
452 Perl_croak(aTHX_ "value of node is %d in Offset macro", node); \
453 } else { \
454 RExC_offsets[2*(node)-1] = (byte); \
455 } \
456 } \
457 } STMT_END
458
459# define Set_Node_Offset(node,byte) Set_Node_Offset_To_R((node)-RExC_emit_start, (byte)-RExC_start)
460# define Set_Cur_Node_Offset Set_Node_Offset(RExC_emit, RExC_parse)
461
462# define Set_Node_Length_To_R(node,len) \
463 STMT_START { \
464 if (! SIZE_ONLY) { \
465 MJD_OFFSET_DEBUG((stderr, "** (%d) size of node %d is %d.\n", __LINE__, (node), (len))); \
466 if((node) < 0) { \
467 Perl_croak(aTHX_ "value of node is %d in Length macro", node); \
468 } else { \
469 RExC_offsets[2*(node)] = (len); \
470 } \
471 } \
472 } STMT_END
473
474# define Set_Node_Length(node,len) Set_Node_Length_To_R((node)-RExC_emit_start, len)
475# define Set_Cur_Node_Length(len) Set_Node_Length(RExC_emit, len)
476# define Set_Node_Cur_Length(node) Set_Node_Length(node, RExC_parse - parse_start)
477
478/* Get offsets and lengths */
479#define Node_Offset(n) (RExC_offsets[2*((n)-RExC_emit_start)-1])
480#define Node_Length(n) (RExC_offsets[2*((n)-RExC_emit_start)])
481
acfe0abc 482static void clear_re(pTHX_ void *r);
4327152a 483
653099ff
GS
484/* Mark that we cannot extend a found fixed substring at this point.
485 Updata the longest found anchored substring and the longest found
486 floating substrings if needed. */
487
4327152a 488STATIC void
830247a4 489S_scan_commit(pTHX_ RExC_state_t *pRExC_state, scan_data_t *data)
c277df42 490{
a0ed51b3
LW
491 STRLEN l = CHR_SVLEN(data->last_found);
492 STRLEN old_l = CHR_SVLEN(*data->longest);
b81d288d 493
c277df42
IZ
494 if ((l >= old_l) && ((l > old_l) || (data->flags & SF_BEFORE_EOL))) {
495 sv_setsv(*data->longest, data->last_found);
496 if (*data->longest == data->longest_fixed) {
497 data->offset_fixed = l ? data->last_start_min : data->pos_min;
498 if (data->flags & SF_BEFORE_EOL)
b81d288d 499 data->flags
c277df42
IZ
500 |= ((data->flags & SF_BEFORE_EOL) << SF_FIX_SHIFT_EOL);
501 else
502 data->flags &= ~SF_FIX_BEFORE_EOL;
a0ed51b3
LW
503 }
504 else {
c277df42 505 data->offset_float_min = l ? data->last_start_min : data->pos_min;
b81d288d
AB
506 data->offset_float_max = (l
507 ? data->last_start_max
c277df42
IZ
508 : data->pos_min + data->pos_delta);
509 if (data->flags & SF_BEFORE_EOL)
b81d288d 510 data->flags
c277df42
IZ
511 |= ((data->flags & SF_BEFORE_EOL) << SF_FL_SHIFT_EOL);
512 else
513 data->flags &= ~SF_FL_BEFORE_EOL;
514 }
515 }
516 SvCUR_set(data->last_found, 0);
517 data->last_end = -1;
518 data->flags &= ~SF_BEFORE_EOL;
519}
520
653099ff
GS
521/* Can match anything (initialization) */
522STATIC void
830247a4 523S_cl_anything(pTHX_ RExC_state_t *pRExC_state, struct regnode_charclass_class *cl)
653099ff 524{
653099ff 525 ANYOF_CLASS_ZERO(cl);
f8bef550 526 ANYOF_BITMAP_SETALL(cl);
1aa99e6b 527 cl->flags = ANYOF_EOS|ANYOF_UNICODE_ALL;
653099ff
GS
528 if (LOC)
529 cl->flags |= ANYOF_LOCALE;
530}
531
532/* Can match anything (initialization) */
533STATIC int
534S_cl_is_anything(pTHX_ struct regnode_charclass_class *cl)
535{
536 int value;
537
aaa51d5e 538 for (value = 0; value <= ANYOF_MAX; value += 2)
653099ff
GS
539 if (ANYOF_CLASS_TEST(cl, value) && ANYOF_CLASS_TEST(cl, value + 1))
540 return 1;
1aa99e6b
IH
541 if (!(cl->flags & ANYOF_UNICODE_ALL))
542 return 0;
f8bef550
NC
543 if (!ANYOF_BITMAP_TESTALLSET(cl))
544 return 0;
653099ff
GS
545 return 1;
546}
547
548/* Can match anything (initialization) */
549STATIC void
830247a4 550S_cl_init(pTHX_ RExC_state_t *pRExC_state, struct regnode_charclass_class *cl)
653099ff 551{
8ecf7187 552 Zero(cl, 1, struct regnode_charclass_class);
653099ff 553 cl->type = ANYOF;
830247a4 554 cl_anything(pRExC_state, cl);
653099ff
GS
555}
556
557STATIC void
830247a4 558S_cl_init_zero(pTHX_ RExC_state_t *pRExC_state, struct regnode_charclass_class *cl)
653099ff 559{
8ecf7187 560 Zero(cl, 1, struct regnode_charclass_class);
653099ff 561 cl->type = ANYOF;
830247a4 562 cl_anything(pRExC_state, cl);
653099ff
GS
563 if (LOC)
564 cl->flags |= ANYOF_LOCALE;
565}
566
567/* 'And' a given class with another one. Can create false positives */
568/* We assume that cl is not inverted */
569STATIC void
570S_cl_and(pTHX_ struct regnode_charclass_class *cl,
571 struct regnode_charclass_class *and_with)
572{
653099ff
GS
573 if (!(and_with->flags & ANYOF_CLASS)
574 && !(cl->flags & ANYOF_CLASS)
575 && (and_with->flags & ANYOF_LOCALE) == (cl->flags & ANYOF_LOCALE)
576 && !(and_with->flags & ANYOF_FOLD)
577 && !(cl->flags & ANYOF_FOLD)) {
578 int i;
579
580 if (and_with->flags & ANYOF_INVERT)
581 for (i = 0; i < ANYOF_BITMAP_SIZE; i++)
582 cl->bitmap[i] &= ~and_with->bitmap[i];
583 else
584 for (i = 0; i < ANYOF_BITMAP_SIZE; i++)
585 cl->bitmap[i] &= and_with->bitmap[i];
586 } /* XXXX: logic is complicated otherwise, leave it along for a moment. */
587 if (!(and_with->flags & ANYOF_EOS))
588 cl->flags &= ~ANYOF_EOS;
1aa99e6b
IH
589
590 if (cl->flags & ANYOF_UNICODE_ALL && and_with->flags & ANYOF_UNICODE) {
591 cl->flags &= ~ANYOF_UNICODE_ALL;
592 cl->flags |= ANYOF_UNICODE;
593 ARG_SET(cl, ARG(and_with));
594 }
595 if (!(and_with->flags & ANYOF_UNICODE_ALL))
596 cl->flags &= ~ANYOF_UNICODE_ALL;
597 if (!(and_with->flags & (ANYOF_UNICODE|ANYOF_UNICODE_ALL)))
598 cl->flags &= ~ANYOF_UNICODE;
653099ff
GS
599}
600
601/* 'OR' a given class with another one. Can create false positives */
602/* We assume that cl is not inverted */
603STATIC void
830247a4 604S_cl_or(pTHX_ RExC_state_t *pRExC_state, struct regnode_charclass_class *cl, struct regnode_charclass_class *or_with)
653099ff 605{
653099ff
GS
606 if (or_with->flags & ANYOF_INVERT) {
607 /* We do not use
608 * (B1 | CL1) | (!B2 & !CL2) = (B1 | !B2 & !CL2) | (CL1 | (!B2 & !CL2))
609 * <= (B1 | !B2) | (CL1 | !CL2)
610 * which is wasteful if CL2 is small, but we ignore CL2:
611 * (B1 | CL1) | (!B2 & !CL2) <= (B1 | CL1) | !B2 = (B1 | !B2) | CL1
612 * XXXX Can we handle case-fold? Unclear:
613 * (OK1(i) | OK1(i')) | !(OK1(i) | OK1(i')) =
614 * (OK1(i) | OK1(i')) | (!OK1(i) & !OK1(i'))
615 */
616 if ( (or_with->flags & ANYOF_LOCALE) == (cl->flags & ANYOF_LOCALE)
617 && !(or_with->flags & ANYOF_FOLD)
618 && !(cl->flags & ANYOF_FOLD) ) {
619 int i;
620
621 for (i = 0; i < ANYOF_BITMAP_SIZE; i++)
622 cl->bitmap[i] |= ~or_with->bitmap[i];
623 } /* XXXX: logic is complicated otherwise */
624 else {
830247a4 625 cl_anything(pRExC_state, cl);
653099ff
GS
626 }
627 } else {
628 /* (B1 | CL1) | (B2 | CL2) = (B1 | B2) | (CL1 | CL2)) */
629 if ( (or_with->flags & ANYOF_LOCALE) == (cl->flags & ANYOF_LOCALE)
b81d288d 630 && (!(or_with->flags & ANYOF_FOLD)
653099ff
GS
631 || (cl->flags & ANYOF_FOLD)) ) {
632 int i;
633
634 /* OR char bitmap and class bitmap separately */
635 for (i = 0; i < ANYOF_BITMAP_SIZE; i++)
636 cl->bitmap[i] |= or_with->bitmap[i];
637 if (or_with->flags & ANYOF_CLASS) {
638 for (i = 0; i < ANYOF_CLASSBITMAP_SIZE; i++)
639 cl->classflags[i] |= or_with->classflags[i];
640 cl->flags |= ANYOF_CLASS;
641 }
642 }
643 else { /* XXXX: logic is complicated, leave it along for a moment. */
830247a4 644 cl_anything(pRExC_state, cl);
653099ff
GS
645 }
646 }
647 if (or_with->flags & ANYOF_EOS)
648 cl->flags |= ANYOF_EOS;
1aa99e6b
IH
649
650 if (cl->flags & ANYOF_UNICODE && or_with->flags & ANYOF_UNICODE &&
651 ARG(cl) != ARG(or_with)) {
652 cl->flags |= ANYOF_UNICODE_ALL;
653 cl->flags &= ~ANYOF_UNICODE;
654 }
655 if (or_with->flags & ANYOF_UNICODE_ALL) {
656 cl->flags |= ANYOF_UNICODE_ALL;
657 cl->flags &= ~ANYOF_UNICODE;
658 }
653099ff
GS
659}
660
5d1c421c
JH
661/*
662 * There are strange code-generation bugs caused on sparc64 by gcc-2.95.2.
663 * These need to be revisited when a newer toolchain becomes available.
664 */
665#if defined(__sparc64__) && defined(__GNUC__)
666# if __GNUC__ < 2 || (__GNUC__ == 2 && __GNUC_MINOR__ < 96)
667# undef SPARC64_GCC_WORKAROUND
668# define SPARC64_GCC_WORKAROUND 1
669# endif
670#endif
671
653099ff
GS
672/* REx optimizer. Converts nodes into quickier variants "in place".
673 Finds fixed substrings. */
674
c277df42
IZ
675/* Stops at toplevel WHILEM as well as at `last'. At end *scanp is set
676 to the position after last scanned or to NULL. */
677
76e3520e 678STATIC I32
830247a4 679S_study_chunk(pTHX_ RExC_state_t *pRExC_state, regnode **scanp, I32 *deltap, regnode *last, scan_data_t *data, U32 flags)
c277df42
IZ
680 /* scanp: Start here (read-write). */
681 /* deltap: Write maxlen-minlen here. */
682 /* last: Stop before this one. */
683{
684 I32 min = 0, pars = 0, code;
685 regnode *scan = *scanp, *next;
686 I32 delta = 0;
687 int is_inf = (flags & SCF_DO_SUBSTR) && (data->flags & SF_IS_INF);
aca2d497 688 int is_inf_internal = 0; /* The studied chunk is infinite */
c277df42
IZ
689 I32 is_par = OP(scan) == OPEN ? ARG(scan) : 0;
690 scan_data_t data_fake;
653099ff 691 struct regnode_charclass_class and_with; /* Valid if flags & SCF_DO_STCLASS_OR */
b81d288d 692
c277df42
IZ
693 while (scan && OP(scan) != END && scan < last) {
694 /* Peephole optimizer: */
695
22c35a8c 696 if (PL_regkind[(U8)OP(scan)] == EXACT) {
653099ff 697 /* Merge several consecutive EXACTish nodes into one. */
c277df42
IZ
698 regnode *n = regnext(scan);
699 U32 stringok = 1;
700#ifdef DEBUGGING
701 regnode *stop = scan;
b81d288d 702#endif
c277df42 703
cd439c50 704 next = scan + NODE_SZ_STR(scan);
c277df42
IZ
705 /* Skip NOTHING, merge EXACT*. */
706 while (n &&
b81d288d 707 ( PL_regkind[(U8)OP(n)] == NOTHING ||
c277df42
IZ
708 (stringok && (OP(n) == OP(scan))))
709 && NEXT_OFF(n)
710 && NEXT_OFF(scan) + NEXT_OFF(n) < I16_MAX) {
711 if (OP(n) == TAIL || n > next)
712 stringok = 0;
22c35a8c 713 if (PL_regkind[(U8)OP(n)] == NOTHING) {
c277df42
IZ
714 NEXT_OFF(scan) += NEXT_OFF(n);
715 next = n + NODE_STEP_REGNODE;
716#ifdef DEBUGGING
717 if (stringok)
718 stop = n;
b81d288d 719#endif
c277df42 720 n = regnext(n);
a0ed51b3 721 }
f49d4d0f 722 else if (stringok) {
cd439c50 723 int oldl = STR_LEN(scan);
c277df42 724 regnode *nnext = regnext(n);
f49d4d0f 725
b81d288d 726 if (oldl + STR_LEN(n) > U8_MAX)
c277df42
IZ
727 break;
728 NEXT_OFF(scan) += NEXT_OFF(n);
cd439c50
IZ
729 STR_LEN(scan) += STR_LEN(n);
730 next = n + NODE_SZ_STR(n);
c277df42 731 /* Now we can overwrite *n : */
f49d4d0f 732 Move(STRING(n), STRING(scan) + oldl, STR_LEN(n), char);
c277df42 733#ifdef DEBUGGING
f49d4d0f 734 stop = next - 1;
b81d288d 735#endif
c277df42
IZ
736 n = nnext;
737 }
738 }
739#ifdef DEBUGGING
740 /* Allow dumping */
cd439c50 741 n = scan + NODE_SZ_STR(scan);
c277df42 742 while (n <= stop) {
22c35a8c 743 if (PL_regkind[(U8)OP(n)] != NOTHING || OP(n) == NOTHING) {
c277df42
IZ
744 OP(n) = OPTIMIZED;
745 NEXT_OFF(n) = 0;
746 }
747 n++;
748 }
653099ff 749#endif
c277df42 750 }
653099ff
GS
751 /* Follow the next-chain of the current node and optimize
752 away all the NOTHINGs from it. */
c277df42 753 if (OP(scan) != CURLYX) {
048cfca1
GS
754 int max = (reg_off_by_arg[OP(scan)]
755 ? I32_MAX
756 /* I32 may be smaller than U16 on CRAYs! */
757 : (I32_MAX < U16_MAX ? I32_MAX : U16_MAX));
c277df42
IZ
758 int off = (reg_off_by_arg[OP(scan)] ? ARG(scan) : NEXT_OFF(scan));
759 int noff;
760 regnode *n = scan;
b81d288d 761
c277df42
IZ
762 /* Skip NOTHING and LONGJMP. */
763 while ((n = regnext(n))
22c35a8c 764 && ((PL_regkind[(U8)OP(n)] == NOTHING && (noff = NEXT_OFF(n)))
c277df42
IZ
765 || ((OP(n) == LONGJMP) && (noff = ARG(n))))
766 && off + noff < max)
767 off += noff;
768 if (reg_off_by_arg[OP(scan)])
769 ARG(scan) = off;
b81d288d 770 else
c277df42
IZ
771 NEXT_OFF(scan) = off;
772 }
653099ff
GS
773 /* The principal pseudo-switch. Cannot be a switch, since we
774 look into several different things. */
b81d288d 775 if (OP(scan) == BRANCH || OP(scan) == BRANCHJ
c277df42
IZ
776 || OP(scan) == IFTHEN || OP(scan) == SUSPEND) {
777 next = regnext(scan);
778 code = OP(scan);
b81d288d
AB
779
780 if (OP(next) == code || code == IFTHEN || code == SUSPEND) {
c277df42 781 I32 max1 = 0, min1 = I32_MAX, num = 0;
653099ff 782 struct regnode_charclass_class accum;
c277df42 783
653099ff 784 if (flags & SCF_DO_SUBSTR) /* XXXX Add !SUSPEND? */
830247a4 785 scan_commit(pRExC_state, data); /* Cannot merge strings after this. */
653099ff 786 if (flags & SCF_DO_STCLASS)
830247a4 787 cl_init_zero(pRExC_state, &accum);
c277df42 788 while (OP(scan) == code) {
830247a4 789 I32 deltanext, minnext, f = 0, fake;
653099ff 790 struct regnode_charclass_class this_class;
c277df42
IZ
791
792 num++;
793 data_fake.flags = 0;
b81d288d 794 if (data) {
2c2d71f5 795 data_fake.whilem_c = data->whilem_c;
cb434fcc
IZ
796 data_fake.last_closep = data->last_closep;
797 }
798 else
799 data_fake.last_closep = &fake;
c277df42
IZ
800 next = regnext(scan);
801 scan = NEXTOPER(scan);
802 if (code != BRANCH)
803 scan = NEXTOPER(scan);
653099ff 804 if (flags & SCF_DO_STCLASS) {
830247a4 805 cl_init(pRExC_state, &this_class);
653099ff
GS
806 data_fake.start_class = &this_class;
807 f = SCF_DO_STCLASS_AND;
b81d288d 808 }
e1901655
IZ
809 if (flags & SCF_WHILEM_VISITED_POS)
810 f |= SCF_WHILEM_VISITED_POS;
653099ff 811 /* we suppose the run is continuous, last=next...*/
830247a4
IZ
812 minnext = study_chunk(pRExC_state, &scan, &deltanext,
813 next, &data_fake, f);
b81d288d 814 if (min1 > minnext)
c277df42
IZ
815 min1 = minnext;
816 if (max1 < minnext + deltanext)
817 max1 = minnext + deltanext;
818 if (deltanext == I32_MAX)
aca2d497 819 is_inf = is_inf_internal = 1;
c277df42
IZ
820 scan = next;
821 if (data_fake.flags & (SF_HAS_PAR|SF_IN_PAR))
822 pars++;
405ff068 823 if (data && (data_fake.flags & SF_HAS_EVAL))
c277df42 824 data->flags |= SF_HAS_EVAL;
2c2d71f5
JH
825 if (data)
826 data->whilem_c = data_fake.whilem_c;
653099ff 827 if (flags & SCF_DO_STCLASS)
830247a4 828 cl_or(pRExC_state, &accum, &this_class);
b81d288d 829 if (code == SUSPEND)
c277df42
IZ
830 break;
831 }
832 if (code == IFTHEN && num < 2) /* Empty ELSE branch */
833 min1 = 0;
834 if (flags & SCF_DO_SUBSTR) {
835 data->pos_min += min1;
836 data->pos_delta += max1 - min1;
837 if (max1 != min1 || is_inf)
838 data->longest = &(data->longest_float);
839 }
840 min += min1;
841 delta += max1 - min1;
653099ff 842 if (flags & SCF_DO_STCLASS_OR) {
830247a4 843 cl_or(pRExC_state, data->start_class, &accum);
653099ff
GS
844 if (min1) {
845 cl_and(data->start_class, &and_with);
846 flags &= ~SCF_DO_STCLASS;
847 }
848 }
849 else if (flags & SCF_DO_STCLASS_AND) {
de0c8cb8
GS
850 if (min1) {
851 cl_and(data->start_class, &accum);
653099ff 852 flags &= ~SCF_DO_STCLASS;
de0c8cb8
GS
853 }
854 else {
b81d288d 855 /* Switch to OR mode: cache the old value of
de0c8cb8
GS
856 * data->start_class */
857 StructCopy(data->start_class, &and_with,
858 struct regnode_charclass_class);
859 flags &= ~SCF_DO_STCLASS_AND;
860 StructCopy(&accum, data->start_class,
861 struct regnode_charclass_class);
862 flags |= SCF_DO_STCLASS_OR;
863 data->start_class->flags |= ANYOF_EOS;
864 }
653099ff 865 }
a0ed51b3
LW
866 }
867 else if (code == BRANCHJ) /* single branch is optimized. */
c277df42
IZ
868 scan = NEXTOPER(NEXTOPER(scan));
869 else /* single branch is optimized. */
870 scan = NEXTOPER(scan);
871 continue;
a0ed51b3
LW
872 }
873 else if (OP(scan) == EXACT) {
cd439c50 874 I32 l = STR_LEN(scan);
1aa99e6b 875 UV uc = *((U8*)STRING(scan));
a0ed51b3 876 if (UTF) {
1aa99e6b
IH
877 U8 *s = (U8*)STRING(scan);
878 l = utf8_length(s, s + l);
9041c2e3 879 uc = utf8_to_uvchr(s, NULL);
a0ed51b3
LW
880 }
881 min += l;
c277df42 882 if (flags & SCF_DO_SUBSTR) { /* Update longest substr. */
c277df42
IZ
883 /* The code below prefers earlier match for fixed
884 offset, later match for variable offset. */
885 if (data->last_end == -1) { /* Update the start info. */
886 data->last_start_min = data->pos_min;
887 data->last_start_max = is_inf
b81d288d 888 ? I32_MAX : data->pos_min + data->pos_delta;
c277df42 889 }
cd439c50 890 sv_catpvn(data->last_found, STRING(scan), STR_LEN(scan));
c277df42
IZ
891 data->last_end = data->pos_min + l;
892 data->pos_min += l; /* As in the first entry. */
893 data->flags &= ~SF_BEFORE_EOL;
894 }
653099ff
GS
895 if (flags & SCF_DO_STCLASS_AND) {
896 /* Check whether it is compatible with what we know already! */
897 int compat = 1;
898
1aa99e6b 899 if (uc >= 0x100 ||
516a5887 900 (!(data->start_class->flags & (ANYOF_CLASS | ANYOF_LOCALE))
1aa99e6b 901 && !ANYOF_BITMAP_TEST(data->start_class, uc)
653099ff 902 && (!(data->start_class->flags & ANYOF_FOLD)
1aa99e6b 903 || !ANYOF_BITMAP_TEST(data->start_class, PL_fold[uc])))
516a5887 904 )
653099ff
GS
905 compat = 0;
906 ANYOF_CLASS_ZERO(data->start_class);
907 ANYOF_BITMAP_ZERO(data->start_class);
908 if (compat)
1aa99e6b 909 ANYOF_BITMAP_SET(data->start_class, uc);
653099ff 910 data->start_class->flags &= ~ANYOF_EOS;
9b877dbb
IH
911 if (uc < 0x100)
912 data->start_class->flags &= ~ANYOF_UNICODE_ALL;
653099ff
GS
913 }
914 else if (flags & SCF_DO_STCLASS_OR) {
915 /* false positive possible if the class is case-folded */
1aa99e6b 916 if (uc < 0x100)
9b877dbb
IH
917 ANYOF_BITMAP_SET(data->start_class, uc);
918 else
919 data->start_class->flags |= ANYOF_UNICODE_ALL;
653099ff
GS
920 data->start_class->flags &= ~ANYOF_EOS;
921 cl_and(data->start_class, &and_with);
922 }
923 flags &= ~SCF_DO_STCLASS;
a0ed51b3 924 }
653099ff 925 else if (PL_regkind[(U8)OP(scan)] == EXACT) { /* But OP != EXACT! */
cd439c50 926 I32 l = STR_LEN(scan);
1aa99e6b 927 UV uc = *((U8*)STRING(scan));
653099ff
GS
928
929 /* Search for fixed substrings supports EXACT only. */
b81d288d 930 if (flags & SCF_DO_SUBSTR)
830247a4 931 scan_commit(pRExC_state, data);
a0ed51b3 932 if (UTF) {
1aa99e6b
IH
933 U8 *s = (U8 *)STRING(scan);
934 l = utf8_length(s, s + l);
9041c2e3 935 uc = utf8_to_uvchr(s, NULL);
a0ed51b3
LW
936 }
937 min += l;
c277df42 938 if (data && (flags & SCF_DO_SUBSTR))
a0ed51b3 939 data->pos_min += l;
653099ff
GS
940 if (flags & SCF_DO_STCLASS_AND) {
941 /* Check whether it is compatible with what we know already! */
942 int compat = 1;
943
1aa99e6b 944 if (uc >= 0x100 ||
516a5887 945 (!(data->start_class->flags & (ANYOF_CLASS | ANYOF_LOCALE))
1aa99e6b 946 && !ANYOF_BITMAP_TEST(data->start_class, uc)
516a5887 947 && !ANYOF_BITMAP_TEST(data->start_class, PL_fold[uc])))
653099ff
GS
948 compat = 0;
949 ANYOF_CLASS_ZERO(data->start_class);
950 ANYOF_BITMAP_ZERO(data->start_class);
951 if (compat) {
1aa99e6b 952 ANYOF_BITMAP_SET(data->start_class, uc);
653099ff
GS
953 data->start_class->flags &= ~ANYOF_EOS;
954 data->start_class->flags |= ANYOF_FOLD;
955 if (OP(scan) == EXACTFL)
956 data->start_class->flags |= ANYOF_LOCALE;
957 }
958 }
959 else if (flags & SCF_DO_STCLASS_OR) {
960 if (data->start_class->flags & ANYOF_FOLD) {
961 /* false positive possible if the class is case-folded.
962 Assume that the locale settings are the same... */
1aa99e6b
IH
963 if (uc < 0x100)
964 ANYOF_BITMAP_SET(data->start_class, uc);
653099ff
GS
965 data->start_class->flags &= ~ANYOF_EOS;
966 }
967 cl_and(data->start_class, &and_with);
968 }
969 flags &= ~SCF_DO_STCLASS;
a0ed51b3 970 }
4d61ec05 971 else if (strchr((char*)PL_varies,OP(scan))) {
9c5ffd7c 972 I32 mincount, maxcount, minnext, deltanext, fl = 0;
aa7a4b56 973 I32 f = flags, pos_before = 0;
c277df42 974 regnode *oscan = scan;
653099ff
GS
975 struct regnode_charclass_class this_class;
976 struct regnode_charclass_class *oclass = NULL;
727f22e3 977 I32 next_is_eval = 0;
653099ff 978
22c35a8c 979 switch (PL_regkind[(U8)OP(scan)]) {
653099ff 980 case WHILEM: /* End of (?:...)* . */
c277df42
IZ
981 scan = NEXTOPER(scan);
982 goto finish;
983 case PLUS:
653099ff 984 if (flags & (SCF_DO_SUBSTR | SCF_DO_STCLASS)) {
c277df42 985 next = NEXTOPER(scan);
653099ff 986 if (OP(next) == EXACT || (flags & SCF_DO_STCLASS)) {
b81d288d
AB
987 mincount = 1;
988 maxcount = REG_INFTY;
c277df42
IZ
989 next = regnext(scan);
990 scan = NEXTOPER(scan);
991 goto do_curly;
992 }
993 }
994 if (flags & SCF_DO_SUBSTR)
995 data->pos_min++;
996 min++;
997 /* Fall through. */
998 case STAR:
653099ff
GS
999 if (flags & SCF_DO_STCLASS) {
1000 mincount = 0;
b81d288d 1001 maxcount = REG_INFTY;
653099ff
GS
1002 next = regnext(scan);
1003 scan = NEXTOPER(scan);
1004 goto do_curly;
1005 }
b81d288d 1006 is_inf = is_inf_internal = 1;
c277df42
IZ
1007 scan = regnext(scan);
1008 if (flags & SCF_DO_SUBSTR) {
830247a4 1009 scan_commit(pRExC_state, data); /* Cannot extend fixed substrings */
c277df42
IZ
1010 data->longest = &(data->longest_float);
1011 }
1012 goto optimize_curly_tail;
1013 case CURLY:
b81d288d 1014 mincount = ARG1(scan);
c277df42
IZ
1015 maxcount = ARG2(scan);
1016 next = regnext(scan);
cb434fcc
IZ
1017 if (OP(scan) == CURLYX) {
1018 I32 lp = (data ? *(data->last_closep) : 0);
1019
1020 scan->flags = ((lp <= U8_MAX) ? lp : U8_MAX);
1021 }
c277df42 1022 scan = NEXTOPER(scan) + EXTRA_STEP_2ARGS;
727f22e3 1023 next_is_eval = (OP(scan) == EVAL);
c277df42
IZ
1024 do_curly:
1025 if (flags & SCF_DO_SUBSTR) {
830247a4 1026 if (mincount == 0) scan_commit(pRExC_state,data); /* Cannot extend fixed substrings */
c277df42
IZ
1027 pos_before = data->pos_min;
1028 }
1029 if (data) {
1030 fl = data->flags;
1031 data->flags &= ~(SF_HAS_PAR|SF_IN_PAR|SF_HAS_EVAL);
1032 if (is_inf)
1033 data->flags |= SF_IS_INF;
1034 }
653099ff 1035 if (flags & SCF_DO_STCLASS) {
830247a4 1036 cl_init(pRExC_state, &this_class);
653099ff
GS
1037 oclass = data->start_class;
1038 data->start_class = &this_class;
1039 f |= SCF_DO_STCLASS_AND;
1040 f &= ~SCF_DO_STCLASS_OR;
1041 }
e1901655
IZ
1042 /* These are the cases when once a subexpression
1043 fails at a particular position, it cannot succeed
1044 even after backtracking at the enclosing scope.
b81d288d 1045
e1901655
IZ
1046 XXXX what if minimal match and we are at the
1047 initial run of {n,m}? */
1048 if ((mincount != maxcount - 1) && (maxcount != REG_INFTY))
1049 f &= ~SCF_WHILEM_VISITED_POS;
653099ff 1050
c277df42 1051 /* This will finish on WHILEM, setting scan, or on NULL: */
b81d288d
AB
1052 minnext = study_chunk(pRExC_state, &scan, &deltanext, last, data,
1053 mincount == 0
653099ff
GS
1054 ? (f & ~SCF_DO_SUBSTR) : f);
1055
1056 if (flags & SCF_DO_STCLASS)
1057 data->start_class = oclass;
1058 if (mincount == 0 || minnext == 0) {
1059 if (flags & SCF_DO_STCLASS_OR) {
830247a4 1060 cl_or(pRExC_state, data->start_class, &this_class);
653099ff
GS
1061 }
1062 else if (flags & SCF_DO_STCLASS_AND) {
b81d288d 1063 /* Switch to OR mode: cache the old value of
653099ff
GS
1064 * data->start_class */
1065 StructCopy(data->start_class, &and_with,
1066 struct regnode_charclass_class);
1067 flags &= ~SCF_DO_STCLASS_AND;
1068 StructCopy(&this_class, data->start_class,
1069 struct regnode_charclass_class);
1070 flags |= SCF_DO_STCLASS_OR;
1071 data->start_class->flags |= ANYOF_EOS;
1072 }
1073 } else { /* Non-zero len */
1074 if (flags & SCF_DO_STCLASS_OR) {
830247a4 1075 cl_or(pRExC_state, data->start_class, &this_class);
653099ff
GS
1076 cl_and(data->start_class, &and_with);
1077 }
1078 else if (flags & SCF_DO_STCLASS_AND)
1079 cl_and(data->start_class, &this_class);
1080 flags &= ~SCF_DO_STCLASS;
1081 }
c277df42
IZ
1082 if (!scan) /* It was not CURLYX, but CURLY. */
1083 scan = next;
84037bb0 1084 if (ckWARN(WARN_REGEXP)
727f22e3
JP
1085 /* ? quantifier ok, except for (?{ ... }) */
1086 && (next_is_eval || !(mincount == 0 && maxcount == 1))
84037bb0 1087 && (minnext == 0) && (deltanext == 0)
99799961 1088 && data && !(data->flags & (SF_HAS_PAR|SF_IN_PAR))
17feb5d5 1089 && maxcount <= REG_INFTY/3) /* Complement check for big count */
b45f050a 1090 {
830247a4 1091 vWARN(RExC_parse,
b45f050a
JF
1092 "Quantifier unexpected on zero-length expression");
1093 }
1094
c277df42 1095 min += minnext * mincount;
b81d288d 1096 is_inf_internal |= ((maxcount == REG_INFTY
155aba94
GS
1097 && (minnext + deltanext) > 0)
1098 || deltanext == I32_MAX);
aca2d497 1099 is_inf |= is_inf_internal;
c277df42
IZ
1100 delta += (minnext + deltanext) * maxcount - minnext * mincount;
1101
1102 /* Try powerful optimization CURLYX => CURLYN. */
b81d288d 1103 if ( OP(oscan) == CURLYX && data
c277df42
IZ
1104 && data->flags & SF_IN_PAR
1105 && !(data->flags & SF_HAS_EVAL)
1106 && !deltanext && minnext == 1 ) {
1107 /* Try to optimize to CURLYN. */
1108 regnode *nxt = NEXTOPER(oscan) + EXTRA_STEP_2ARGS;
497b47a8
JH
1109 regnode *nxt1 = nxt;
1110#ifdef DEBUGGING
1111 regnode *nxt2;
1112#endif
c277df42
IZ
1113
1114 /* Skip open. */
1115 nxt = regnext(nxt);
4d61ec05 1116 if (!strchr((char*)PL_simple,OP(nxt))
22c35a8c 1117 && !(PL_regkind[(U8)OP(nxt)] == EXACT
b81d288d 1118 && STR_LEN(nxt) == 1))
c277df42 1119 goto nogo;
497b47a8 1120#ifdef DEBUGGING
c277df42 1121 nxt2 = nxt;
497b47a8 1122#endif
c277df42 1123 nxt = regnext(nxt);
b81d288d 1124 if (OP(nxt) != CLOSE)
c277df42
IZ
1125 goto nogo;
1126 /* Now we know that nxt2 is the only contents: */
1127 oscan->flags = ARG(nxt);
1128 OP(oscan) = CURLYN;
1129 OP(nxt1) = NOTHING; /* was OPEN. */
1130#ifdef DEBUGGING
1131 OP(nxt1 + 1) = OPTIMIZED; /* was count. */
1132 NEXT_OFF(nxt1+ 1) = 0; /* just for consistancy. */
1133 NEXT_OFF(nxt2) = 0; /* just for consistancy with CURLY. */
1134 OP(nxt) = OPTIMIZED; /* was CLOSE. */
1135 OP(nxt + 1) = OPTIMIZED; /* was count. */
1136 NEXT_OFF(nxt+ 1) = 0; /* just for consistancy. */
b81d288d 1137#endif
c277df42 1138 }
c277df42
IZ
1139 nogo:
1140
1141 /* Try optimization CURLYX => CURLYM. */
b81d288d 1142 if ( OP(oscan) == CURLYX && data
c277df42 1143 && !(data->flags & SF_HAS_PAR)
c277df42
IZ
1144 && !(data->flags & SF_HAS_EVAL)
1145 && !deltanext ) {
1146 /* XXXX How to optimize if data == 0? */
1147 /* Optimize to a simpler form. */
1148 regnode *nxt = NEXTOPER(oscan) + EXTRA_STEP_2ARGS; /* OPEN */
1149 regnode *nxt2;
1150
1151 OP(oscan) = CURLYM;
1152 while ( (nxt2 = regnext(nxt)) /* skip over embedded stuff*/
b81d288d 1153 && (OP(nxt2) != WHILEM))
c277df42
IZ
1154 nxt = nxt2;
1155 OP(nxt2) = SUCCEED; /* Whas WHILEM */
c277df42
IZ
1156 /* Need to optimize away parenths. */
1157 if (data->flags & SF_IN_PAR) {
1158 /* Set the parenth number. */
1159 regnode *nxt1 = NEXTOPER(oscan) + EXTRA_STEP_2ARGS; /* OPEN*/
1160
b81d288d 1161 if (OP(nxt) != CLOSE)
b45f050a 1162 FAIL("Panic opt close");
c277df42
IZ
1163 oscan->flags = ARG(nxt);
1164 OP(nxt1) = OPTIMIZED; /* was OPEN. */
1165 OP(nxt) = OPTIMIZED; /* was CLOSE. */
1166#ifdef DEBUGGING
1167 OP(nxt1 + 1) = OPTIMIZED; /* was count. */
1168 OP(nxt + 1) = OPTIMIZED; /* was count. */
1169 NEXT_OFF(nxt1 + 1) = 0; /* just for consistancy. */
1170 NEXT_OFF(nxt + 1) = 0; /* just for consistancy. */
b81d288d 1171#endif
c277df42
IZ
1172#if 0
1173 while ( nxt1 && (OP(nxt1) != WHILEM)) {
1174 regnode *nnxt = regnext(nxt1);
b81d288d 1175
c277df42
IZ
1176 if (nnxt == nxt) {
1177 if (reg_off_by_arg[OP(nxt1)])
1178 ARG_SET(nxt1, nxt2 - nxt1);
1179 else if (nxt2 - nxt1 < U16_MAX)
1180 NEXT_OFF(nxt1) = nxt2 - nxt1;
1181 else
1182 OP(nxt) = NOTHING; /* Cannot beautify */
1183 }
1184 nxt1 = nnxt;
1185 }
1186#endif
1187 /* Optimize again: */
b81d288d 1188 study_chunk(pRExC_state, &nxt1, &deltanext, nxt,
e1901655 1189 NULL, 0);
a0ed51b3
LW
1190 }
1191 else
c277df42 1192 oscan->flags = 0;
c277df42 1193 }
e1901655
IZ
1194 else if ((OP(oscan) == CURLYX)
1195 && (flags & SCF_WHILEM_VISITED_POS)
1196 /* See the comment on a similar expression above.
1197 However, this time it not a subexpression
1198 we care about, but the expression itself. */
1199 && (maxcount == REG_INFTY)
1200 && data && ++data->whilem_c < 16) {
1201 /* This stays as CURLYX, we can put the count/of pair. */
2c2d71f5
JH
1202 /* Find WHILEM (as in regexec.c) */
1203 regnode *nxt = oscan + NEXT_OFF(oscan);
1204
1205 if (OP(PREVOPER(nxt)) == NOTHING) /* LONGJMP */
1206 nxt += ARG(nxt);
1207 PREVOPER(nxt)->flags = data->whilem_c
830247a4 1208 | (RExC_whilem_seen << 4); /* On WHILEM */
2c2d71f5 1209 }
b81d288d 1210 if (data && fl & (SF_HAS_PAR|SF_IN_PAR))
c277df42
IZ
1211 pars++;
1212 if (flags & SCF_DO_SUBSTR) {
1213 SV *last_str = Nullsv;
1214 int counted = mincount != 0;
1215
1216 if (data->last_end > 0 && mincount != 0) { /* Ends with a string. */
5d1c421c
JH
1217#if defined(SPARC64_GCC_WORKAROUND)
1218 I32 b = 0;
1219 STRLEN l = 0;
1220 char *s = NULL;
1221 I32 old = 0;
1222
1223 if (pos_before >= data->last_start_min)
1224 b = pos_before;
1225 else
1226 b = data->last_start_min;
1227
1228 l = 0;
1229 s = SvPV(data->last_found, l);
1230 old = b - data->last_start_min;
1231
1232#else
b81d288d 1233 I32 b = pos_before >= data->last_start_min
c277df42
IZ
1234 ? pos_before : data->last_start_min;
1235 STRLEN l;
1236 char *s = SvPV(data->last_found, l);
a0ed51b3 1237 I32 old = b - data->last_start_min;
5d1c421c 1238#endif
a0ed51b3
LW
1239
1240 if (UTF)
1241 old = utf8_hop((U8*)s, old) - (U8*)s;
c277df42 1242
a0ed51b3 1243 l -= old;
c277df42 1244 /* Get the added string: */
79cb57f6 1245 last_str = newSVpvn(s + old, l);
c277df42
IZ
1246 if (deltanext == 0 && pos_before == b) {
1247 /* What was added is a constant string */
1248 if (mincount > 1) {
1249 SvGROW(last_str, (mincount * l) + 1);
b81d288d 1250 repeatcpy(SvPVX(last_str) + l,
c277df42
IZ
1251 SvPVX(last_str), l, mincount - 1);
1252 SvCUR(last_str) *= mincount;
1253 /* Add additional parts. */
b81d288d 1254 SvCUR_set(data->last_found,
c277df42
IZ
1255 SvCUR(data->last_found) - l);
1256 sv_catsv(data->last_found, last_str);
1257 data->last_end += l * (mincount - 1);
1258 }
2a8d9689
HS
1259 } else {
1260 /* start offset must point into the last copy */
1261 data->last_start_min += minnext * (mincount - 1);
4b2cff9a
HS
1262 data->last_start_max += is_inf ? 0 : (maxcount - 1)
1263 * (minnext + data->pos_delta);
c277df42
IZ
1264 }
1265 }
1266 /* It is counted once already... */
1267 data->pos_min += minnext * (mincount - counted);
1268 data->pos_delta += - counted * deltanext +
1269 (minnext + deltanext) * maxcount - minnext * mincount;
1270 if (mincount != maxcount) {
653099ff
GS
1271 /* Cannot extend fixed substrings found inside
1272 the group. */
830247a4 1273 scan_commit(pRExC_state,data);
c277df42
IZ
1274 if (mincount && last_str) {
1275 sv_setsv(data->last_found, last_str);
1276 data->last_end = data->pos_min;
b81d288d 1277 data->last_start_min =
a0ed51b3 1278 data->pos_min - CHR_SVLEN(last_str);
b81d288d
AB
1279 data->last_start_max = is_inf
1280 ? I32_MAX
c277df42 1281 : data->pos_min + data->pos_delta
a0ed51b3 1282 - CHR_SVLEN(last_str);
c277df42
IZ
1283 }
1284 data->longest = &(data->longest_float);
1285 }
aca2d497 1286 SvREFCNT_dec(last_str);
c277df42 1287 }
405ff068 1288 if (data && (fl & SF_HAS_EVAL))
c277df42
IZ
1289 data->flags |= SF_HAS_EVAL;
1290 optimize_curly_tail:
c277df42 1291 if (OP(oscan) != CURLYX) {
22c35a8c 1292 while (PL_regkind[(U8)OP(next = regnext(oscan))] == NOTHING
c277df42
IZ
1293 && NEXT_OFF(next))
1294 NEXT_OFF(oscan) += NEXT_OFF(next);
1295 }
c277df42 1296 continue;
653099ff 1297 default: /* REF and CLUMP only? */
c277df42 1298 if (flags & SCF_DO_SUBSTR) {
830247a4 1299 scan_commit(pRExC_state,data); /* Cannot expect anything... */
c277df42
IZ
1300 data->longest = &(data->longest_float);
1301 }
aca2d497 1302 is_inf = is_inf_internal = 1;
653099ff 1303 if (flags & SCF_DO_STCLASS_OR)
830247a4 1304 cl_anything(pRExC_state, data->start_class);
653099ff 1305 flags &= ~SCF_DO_STCLASS;
c277df42
IZ
1306 break;
1307 }
a0ed51b3 1308 }
ffc61ed2 1309 else if (strchr((char*)PL_simple,OP(scan))) {
9c5ffd7c 1310 int value = 0;
653099ff 1311
c277df42 1312 if (flags & SCF_DO_SUBSTR) {
830247a4 1313 scan_commit(pRExC_state,data);
c277df42
IZ
1314 data->pos_min++;
1315 }
1316 min++;
653099ff
GS
1317 if (flags & SCF_DO_STCLASS) {
1318 data->start_class->flags &= ~ANYOF_EOS; /* No match on empty */
1319
1320 /* Some of the logic below assumes that switching
1321 locale on will only add false positives. */
1322 switch (PL_regkind[(U8)OP(scan)]) {
653099ff 1323 case SANY:
653099ff
GS
1324 default:
1325 do_default:
1326 /* Perl_croak(aTHX_ "panic: unexpected simple REx opcode %d", OP(scan)); */
1327 if (flags & SCF_DO_STCLASS_OR) /* Allow everything */
830247a4 1328 cl_anything(pRExC_state, data->start_class);
653099ff
GS
1329 break;
1330 case REG_ANY:
1331 if (OP(scan) == SANY)
1332 goto do_default;
1333 if (flags & SCF_DO_STCLASS_OR) { /* Everything but \n */
1334 value = (ANYOF_BITMAP_TEST(data->start_class,'\n')
1335 || (data->start_class->flags & ANYOF_CLASS));
830247a4 1336 cl_anything(pRExC_state, data->start_class);
653099ff
GS
1337 }
1338 if (flags & SCF_DO_STCLASS_AND || !value)
1339 ANYOF_BITMAP_CLEAR(data->start_class,'\n');
1340 break;
1341 case ANYOF:
1342 if (flags & SCF_DO_STCLASS_AND)
1343 cl_and(data->start_class,
1344 (struct regnode_charclass_class*)scan);
1345 else
830247a4 1346 cl_or(pRExC_state, data->start_class,
653099ff
GS
1347 (struct regnode_charclass_class*)scan);
1348 break;
1349 case ALNUM:
1350 if (flags & SCF_DO_STCLASS_AND) {
1351 if (!(data->start_class->flags & ANYOF_LOCALE)) {
1352 ANYOF_CLASS_CLEAR(data->start_class,ANYOF_NALNUM);
1353 for (value = 0; value < 256; value++)
1354 if (!isALNUM(value))
1355 ANYOF_BITMAP_CLEAR(data->start_class, value);
1356 }
1357 }
1358 else {
1359 if (data->start_class->flags & ANYOF_LOCALE)
1360 ANYOF_CLASS_SET(data->start_class,ANYOF_ALNUM);
1361 else {
1362 for (value = 0; value < 256; value++)
1363 if (isALNUM(value))
b81d288d 1364 ANYOF_BITMAP_SET(data->start_class, value);
653099ff
GS
1365 }
1366 }
1367 break;
1368 case ALNUML:
1369 if (flags & SCF_DO_STCLASS_AND) {
1370 if (data->start_class->flags & ANYOF_LOCALE)
1371 ANYOF_CLASS_CLEAR(data->start_class,ANYOF_NALNUM);
1372 }
1373 else {
1374 ANYOF_CLASS_SET(data->start_class,ANYOF_ALNUM);
1375 data->start_class->flags |= ANYOF_LOCALE;
1376 }
1377 break;
1378 case NALNUM:
1379 if (flags & SCF_DO_STCLASS_AND) {
1380 if (!(data->start_class->flags & ANYOF_LOCALE)) {
1381 ANYOF_CLASS_CLEAR(data->start_class,ANYOF_ALNUM);
1382 for (value = 0; value < 256; value++)
1383 if (isALNUM(value))
1384 ANYOF_BITMAP_CLEAR(data->start_class, value);
1385 }
1386 }
1387 else {
1388 if (data->start_class->flags & ANYOF_LOCALE)
1389 ANYOF_CLASS_SET(data->start_class,ANYOF_NALNUM);
1390 else {
1391 for (value = 0; value < 256; value++)
1392 if (!isALNUM(value))
b81d288d 1393 ANYOF_BITMAP_SET(data->start_class, value);
653099ff
GS
1394 }
1395 }
1396 break;
1397 case NALNUML:
1398 if (flags & SCF_DO_STCLASS_AND) {
1399 if (data->start_class->flags & ANYOF_LOCALE)
1400 ANYOF_CLASS_CLEAR(data->start_class,ANYOF_ALNUM);
1401 }
1402 else {
1403 data->start_class->flags |= ANYOF_LOCALE;
1404 ANYOF_CLASS_SET(data->start_class,ANYOF_NALNUM);
1405 }
1406 break;
1407 case SPACE:
1408 if (flags & SCF_DO_STCLASS_AND) {
1409 if (!(data->start_class->flags & ANYOF_LOCALE)) {
1410 ANYOF_CLASS_CLEAR(data->start_class,ANYOF_NSPACE);
1411 for (value = 0; value < 256; value++)
1412 if (!isSPACE(value))
1413 ANYOF_BITMAP_CLEAR(data->start_class, value);
1414 }
1415 }
1416 else {
1417 if (data->start_class->flags & ANYOF_LOCALE)
1418 ANYOF_CLASS_SET(data->start_class,ANYOF_SPACE);
1419 else {
1420 for (value = 0; value < 256; value++)
1421 if (isSPACE(value))
b81d288d 1422 ANYOF_BITMAP_SET(data->start_class, value);
653099ff
GS
1423 }
1424 }
1425 break;
1426 case SPACEL:
1427 if (flags & SCF_DO_STCLASS_AND) {
1428 if (data->start_class->flags & ANYOF_LOCALE)
1429 ANYOF_CLASS_CLEAR(data->start_class,ANYOF_NSPACE);
1430 }
1431 else {
1432 data->start_class->flags |= ANYOF_LOCALE;
1433 ANYOF_CLASS_SET(data->start_class,ANYOF_SPACE);
1434 }
1435 break;
1436 case NSPACE:
1437 if (flags & SCF_DO_STCLASS_AND) {
1438 if (!(data->start_class->flags & ANYOF_LOCALE)) {
1439 ANYOF_CLASS_CLEAR(data->start_class,ANYOF_SPACE);
1440 for (value = 0; value < 256; value++)
1441 if (isSPACE(value))
1442 ANYOF_BITMAP_CLEAR(data->start_class, value);
1443 }
1444 }
1445 else {
1446 if (data->start_class->flags & ANYOF_LOCALE)
1447 ANYOF_CLASS_SET(data->start_class,ANYOF_NSPACE);
1448 else {
1449 for (value = 0; value < 256; value++)
1450 if (!isSPACE(value))
b81d288d 1451 ANYOF_BITMAP_SET(data->start_class, value);
653099ff
GS
1452 }
1453 }
1454 break;
1455 case NSPACEL:
1456 if (flags & SCF_DO_STCLASS_AND) {
1457 if (data->start_class->flags & ANYOF_LOCALE) {
1458 ANYOF_CLASS_CLEAR(data->start_class,ANYOF_SPACE);
1459 for (value = 0; value < 256; value++)
1460 if (!isSPACE(value))
1461 ANYOF_BITMAP_CLEAR(data->start_class, value);
1462 }
1463 }
1464 else {
1465 data->start_class->flags |= ANYOF_LOCALE;
1466 ANYOF_CLASS_SET(data->start_class,ANYOF_NSPACE);
1467 }
1468 break;
1469 case DIGIT:
1470 if (flags & SCF_DO_STCLASS_AND) {
1471 ANYOF_CLASS_CLEAR(data->start_class,ANYOF_NDIGIT);
1472 for (value = 0; value < 256; value++)
1473 if (!isDIGIT(value))
1474 ANYOF_BITMAP_CLEAR(data->start_class, value);
1475 }
1476 else {
1477 if (data->start_class->flags & ANYOF_LOCALE)
1478 ANYOF_CLASS_SET(data->start_class,ANYOF_DIGIT);
1479 else {
1480 for (value = 0; value < 256; value++)
1481 if (isDIGIT(value))
b81d288d 1482 ANYOF_BITMAP_SET(data->start_class, value);
653099ff
GS
1483 }
1484 }
1485 break;
1486 case NDIGIT:
1487 if (flags & SCF_DO_STCLASS_AND) {
1488 ANYOF_CLASS_CLEAR(data->start_class,ANYOF_DIGIT);
1489 for (value = 0; value < 256; value++)
1490 if (isDIGIT(value))
1491 ANYOF_BITMAP_CLEAR(data->start_class, value);
1492 }
1493 else {
1494 if (data->start_class->flags & ANYOF_LOCALE)
1495 ANYOF_CLASS_SET(data->start_class,ANYOF_NDIGIT);
1496 else {
1497 for (value = 0; value < 256; value++)
1498 if (!isDIGIT(value))
b81d288d 1499 ANYOF_BITMAP_SET(data->start_class, value);
653099ff
GS
1500 }
1501 }
1502 break;
1503 }
1504 if (flags & SCF_DO_STCLASS_OR)
1505 cl_and(data->start_class, &and_with);
1506 flags &= ~SCF_DO_STCLASS;
1507 }
a0ed51b3 1508 }
22c35a8c 1509 else if (PL_regkind[(U8)OP(scan)] == EOL && flags & SCF_DO_SUBSTR) {
c277df42
IZ
1510 data->flags |= (OP(scan) == MEOL
1511 ? SF_BEFORE_MEOL
1512 : SF_BEFORE_SEOL);
a0ed51b3 1513 }
653099ff
GS
1514 else if ( PL_regkind[(U8)OP(scan)] == BRANCHJ
1515 /* Lookbehind, or need to calculate parens/evals/stclass: */
1516 && (scan->flags || data || (flags & SCF_DO_STCLASS))
c277df42 1517 && (OP(scan) == IFMATCH || OP(scan) == UNLESSM)) {
653099ff 1518 /* Lookahead/lookbehind */
cb434fcc 1519 I32 deltanext, minnext, fake = 0;
c277df42 1520 regnode *nscan;
653099ff
GS
1521 struct regnode_charclass_class intrnl;
1522 int f = 0;
c277df42
IZ
1523
1524 data_fake.flags = 0;
b81d288d 1525 if (data) {
2c2d71f5 1526 data_fake.whilem_c = data->whilem_c;
cb434fcc
IZ
1527 data_fake.last_closep = data->last_closep;
1528 }
1529 else
1530 data_fake.last_closep = &fake;
653099ff
GS
1531 if ( flags & SCF_DO_STCLASS && !scan->flags
1532 && OP(scan) == IFMATCH ) { /* Lookahead */
830247a4 1533 cl_init(pRExC_state, &intrnl);
653099ff 1534 data_fake.start_class = &intrnl;
e1901655 1535 f |= SCF_DO_STCLASS_AND;
653099ff 1536 }
e1901655
IZ
1537 if (flags & SCF_WHILEM_VISITED_POS)
1538 f |= SCF_WHILEM_VISITED_POS;
c277df42
IZ
1539 next = regnext(scan);
1540 nscan = NEXTOPER(NEXTOPER(scan));
830247a4 1541 minnext = study_chunk(pRExC_state, &nscan, &deltanext, last, &data_fake, f);
c277df42
IZ
1542 if (scan->flags) {
1543 if (deltanext) {
9baa0206 1544 vFAIL("Variable length lookbehind not implemented");
a0ed51b3
LW
1545 }
1546 else if (minnext > U8_MAX) {
9baa0206 1547 vFAIL2("Lookbehind longer than %"UVuf" not implemented", (UV)U8_MAX);
c277df42
IZ
1548 }
1549 scan->flags = minnext;
1550 }
1551 if (data && data_fake.flags & (SF_HAS_PAR|SF_IN_PAR))
1552 pars++;
405ff068 1553 if (data && (data_fake.flags & SF_HAS_EVAL))
c277df42 1554 data->flags |= SF_HAS_EVAL;
2c2d71f5
JH
1555 if (data)
1556 data->whilem_c = data_fake.whilem_c;
e1901655 1557 if (f & SCF_DO_STCLASS_AND) {
653099ff
GS
1558 int was = (data->start_class->flags & ANYOF_EOS);
1559
1560 cl_and(data->start_class, &intrnl);
1561 if (was)
1562 data->start_class->flags |= ANYOF_EOS;
1563 }
a0ed51b3
LW
1564 }
1565 else if (OP(scan) == OPEN) {
c277df42 1566 pars++;
a0ed51b3 1567 }
cb434fcc
IZ
1568 else if (OP(scan) == CLOSE) {
1569 if (ARG(scan) == is_par) {
1570 next = regnext(scan);
c277df42 1571
cb434fcc
IZ
1572 if ( next && (OP(next) != WHILEM) && next < last)
1573 is_par = 0; /* Disable optimization */
1574 }
1575 if (data)
1576 *(data->last_closep) = ARG(scan);
a0ed51b3
LW
1577 }
1578 else if (OP(scan) == EVAL) {
c277df42
IZ
1579 if (data)
1580 data->flags |= SF_HAS_EVAL;
1581 }
96776eda 1582 else if (OP(scan) == LOGICAL && scan->flags == 2) { /* Embedded follows */
0f5d15d6 1583 if (flags & SCF_DO_SUBSTR) {
830247a4 1584 scan_commit(pRExC_state,data);
0f5d15d6
IZ
1585 data->longest = &(data->longest_float);
1586 }
1587 is_inf = is_inf_internal = 1;
653099ff 1588 if (flags & SCF_DO_STCLASS_OR) /* Allow everything */
830247a4 1589 cl_anything(pRExC_state, data->start_class);
96776eda 1590 flags &= ~SCF_DO_STCLASS;
0f5d15d6 1591 }
c277df42
IZ
1592 /* Else: zero-length, ignore. */
1593 scan = regnext(scan);
1594 }
1595
1596 finish:
1597 *scanp = scan;
aca2d497 1598 *deltap = is_inf_internal ? I32_MAX : delta;
b81d288d 1599 if (flags & SCF_DO_SUBSTR && is_inf)
c277df42
IZ
1600 data->pos_delta = I32_MAX - data->pos_min;
1601 if (is_par > U8_MAX)
1602 is_par = 0;
1603 if (is_par && pars==1 && data) {
1604 data->flags |= SF_IN_PAR;
1605 data->flags &= ~SF_HAS_PAR;
a0ed51b3
LW
1606 }
1607 else if (pars && data) {
c277df42
IZ
1608 data->flags |= SF_HAS_PAR;
1609 data->flags &= ~SF_IN_PAR;
1610 }
653099ff
GS
1611 if (flags & SCF_DO_STCLASS_OR)
1612 cl_and(data->start_class, &and_with);
c277df42
IZ
1613 return min;
1614}
1615
76e3520e 1616STATIC I32
830247a4 1617S_add_data(pTHX_ RExC_state_t *pRExC_state, I32 n, char *s)
c277df42 1618{
830247a4 1619 if (RExC_rx->data) {
b81d288d
AB
1620 Renewc(RExC_rx->data,
1621 sizeof(*RExC_rx->data) + sizeof(void*) * (RExC_rx->data->count + n - 1),
c277df42 1622 char, struct reg_data);
830247a4
IZ
1623 Renew(RExC_rx->data->what, RExC_rx->data->count + n, U8);
1624 RExC_rx->data->count += n;
a0ed51b3
LW
1625 }
1626 else {
830247a4 1627 Newc(1207, RExC_rx->data, sizeof(*RExC_rx->data) + sizeof(void*) * (n - 1),
c277df42 1628 char, struct reg_data);
830247a4
IZ
1629 New(1208, RExC_rx->data->what, n, U8);
1630 RExC_rx->data->count = n;
c277df42 1631 }
830247a4
IZ
1632 Copy(s, RExC_rx->data->what + RExC_rx->data->count - n, n, U8);
1633 return RExC_rx->data->count - n;
c277df42
IZ
1634}
1635
d88dccdf 1636void
864dbfa3 1637Perl_reginitcolors(pTHX)
d88dccdf 1638{
d88dccdf
IZ
1639 int i = 0;
1640 char *s = PerlEnv_getenv("PERL_RE_COLORS");
b81d288d 1641
d88dccdf
IZ
1642 if (s) {
1643 PL_colors[0] = s = savepv(s);
1644 while (++i < 6) {
1645 s = strchr(s, '\t');
1646 if (s) {
1647 *s = '\0';
1648 PL_colors[i] = ++s;
1649 }
1650 else
c712d376 1651 PL_colors[i] = s = "";
d88dccdf
IZ
1652 }
1653 } else {
b81d288d 1654 while (i < 6)
d88dccdf
IZ
1655 PL_colors[i++] = "";
1656 }
1657 PL_colorset = 1;
1658}
1659
8615cb43 1660
a687059c 1661/*
e50aee73 1662 - pregcomp - compile a regular expression into internal code
a687059c
LW
1663 *
1664 * We can't allocate space until we know how big the compiled form will be,
1665 * but we can't compile it (and thus know how big it is) until we've got a
1666 * place to put the code. So we cheat: we compile it twice, once with code
1667 * generation turned off and size counting turned on, and once "for real".
1668 * This also means that we don't allocate space until we are sure that the
1669 * thing really will compile successfully, and we never have to move the
1670 * code and thus invalidate pointers into it. (Note that it has to be in
1671 * one piece because free() must be able to free it all.) [NB: not true in perl]
1672 *
1673 * Beware that the optimization-preparation code in here knows about some
1674 * of the structure of the compiled regexp. [I'll say.]
1675 */
1676regexp *
864dbfa3 1677Perl_pregcomp(pTHX_ char *exp, char *xend, PMOP *pm)
a687059c 1678{
a0d0e21e 1679 register regexp *r;
c277df42 1680 regnode *scan;
c277df42 1681 regnode *first;
a0d0e21e 1682 I32 flags;
a0d0e21e
LW
1683 I32 minlen = 0;
1684 I32 sawplus = 0;
1685 I32 sawopen = 0;
2c2d71f5 1686 scan_data_t data;
830247a4
IZ
1687 RExC_state_t RExC_state;
1688 RExC_state_t *pRExC_state = &RExC_state;
a0d0e21e
LW
1689
1690 if (exp == NULL)
c277df42 1691 FAIL("NULL regexp argument");
a0d0e21e 1692
a5961de5 1693 RExC_utf8 = pm->op_pmdynflags & PMdf_CMP_UTF8;
a0ed51b3 1694
5cfc7842 1695 RExC_precomp = exp;
a5961de5
JH
1696 DEBUG_r({
1697 if (!PL_colorset) reginitcolors();
1698 PerlIO_printf(Perl_debug_log, "%sCompiling REx%s `%s%*s%s'\n",
1699 PL_colors[4],PL_colors[5],PL_colors[0],
1700 (int)(xend - exp), RExC_precomp, PL_colors[1]);
1701 });
830247a4
IZ
1702 RExC_flags16 = pm->op_pmflags;
1703 RExC_sawback = 0;
bbce6d69 1704
830247a4
IZ
1705 RExC_seen = 0;
1706 RExC_seen_zerolen = *exp == '^' ? -1 : 0;
1707 RExC_seen_evals = 0;
1708 RExC_extralen = 0;
c277df42 1709
bbce6d69 1710 /* First pass: determine size, legality. */
830247a4 1711 RExC_parse = exp;
fac92740 1712 RExC_start = exp;
830247a4
IZ
1713 RExC_end = xend;
1714 RExC_naughty = 0;
1715 RExC_npar = 1;
1716 RExC_size = 0L;
1717 RExC_emit = &PL_regdummy;
1718 RExC_whilem_seen = 0;
85ddcde9
JH
1719#if 0 /* REGC() is (currently) a NOP at the first pass.
1720 * Clever compilers notice this and complain. --jhi */
830247a4 1721 REGC((U8)REG_MAGIC, (char*)RExC_emit);
85ddcde9 1722#endif
830247a4 1723 if (reg(pRExC_state, 0, &flags) == NULL) {
830247a4 1724 RExC_precomp = Nullch;
a0d0e21e
LW
1725 return(NULL);
1726 }
830247a4 1727 DEBUG_r(PerlIO_printf(Perl_debug_log, "size %"IVdf" ", (IV)RExC_size));
c277df42 1728
c277df42
IZ
1729 /* Small enough for pointer-storage convention?
1730 If extralen==0, this means that we will not need long jumps. */
830247a4
IZ
1731 if (RExC_size >= 0x10000L && RExC_extralen)
1732 RExC_size += RExC_extralen;
c277df42 1733 else
830247a4
IZ
1734 RExC_extralen = 0;
1735 if (RExC_whilem_seen > 15)
1736 RExC_whilem_seen = 15;
a0d0e21e 1737
bbce6d69 1738 /* Allocate space and initialize. */
830247a4 1739 Newc(1001, r, sizeof(regexp) + (unsigned)RExC_size * sizeof(regnode),
c277df42 1740 char, regexp);
a0d0e21e 1741 if (r == NULL)
b45f050a
JF
1742 FAIL("Regexp out of space");
1743
0f79a09d
GS
1744#ifdef DEBUGGING
1745 /* avoid reading uninitialized memory in DEBUGGING code in study_chunk() */
830247a4 1746 Zero(r, sizeof(regexp) + (unsigned)RExC_size * sizeof(regnode), char);
0f79a09d 1747#endif
c277df42 1748 r->refcnt = 1;
bbce6d69 1749 r->prelen = xend - exp;
5cfc7842 1750 r->precomp = savepvn(RExC_precomp, r->prelen);
cf93c79d
IZ
1751 r->subbeg = NULL;
1752 r->reganch = pm->op_pmflags & PMf_COMPILETIME;
830247a4 1753 r->nparens = RExC_npar - 1; /* set early to validate backrefs */
4327152a
IZ
1754
1755 r->substrs = 0; /* Useful during FAIL. */
1756 r->startp = 0; /* Useful during FAIL. */
1757 r->endp = 0; /* Useful during FAIL. */
1758
fac92740
MJD
1759 Newz(1304, r->offsets, 2*RExC_size+1, U32); /* MJD 20001228 */
1760 if (r->offsets) {
1761 r->offsets[0] = RExC_size;
1762 }
1763 DEBUG_r(PerlIO_printf(Perl_debug_log,
392fbf5d 1764 "%s %"UVuf" bytes for offset annotations.\n",
fac92740 1765 r->offsets ? "Got" : "Couldn't get",
392fbf5d 1766 (UV)((2*RExC_size+1) * sizeof(U32))));
fac92740 1767
830247a4 1768 RExC_rx = r;
bbce6d69 1769
1770 /* Second pass: emit code. */
055bb491 1771 RExC_flags16 = pm->op_pmflags; /* don't let top level (?i) bleed */
830247a4
IZ
1772 RExC_parse = exp;
1773 RExC_end = xend;
1774 RExC_naughty = 0;
1775 RExC_npar = 1;
fac92740 1776 RExC_emit_start = r->program;
830247a4 1777 RExC_emit = r->program;
2cd61cdb 1778 /* Store the count of eval-groups for security checks: */
830247a4
IZ
1779 RExC_emit->next_off = ((RExC_seen_evals > U16_MAX) ? U16_MAX : RExC_seen_evals);
1780 REGC((U8)REG_MAGIC, (char*) RExC_emit++);
c277df42 1781 r->data = 0;
830247a4 1782 if (reg(pRExC_state, 0, &flags) == NULL)
a0d0e21e
LW
1783 return(NULL);
1784
1785 /* Dig out information for optimizations. */
cf93c79d 1786 r->reganch = pm->op_pmflags & PMf_COMPILETIME; /* Again? */
830247a4 1787 pm->op_pmflags = RExC_flags16;
a0ed51b3 1788 if (UTF)
5ff6fc6d 1789 r->reganch |= ROPT_UTF8; /* Unicode in it? */
c277df42 1790 r->regstclass = NULL;
830247a4 1791 if (RExC_naughty >= 10) /* Probably an expensive pattern. */
a0ed51b3 1792 r->reganch |= ROPT_NAUGHTY;
c277df42 1793 scan = r->program + 1; /* First BRANCH. */
2779dcf1
IZ
1794
1795 /* XXXX To minimize changes to RE engine we always allocate
1796 3-units-long substrs field. */
1797 Newz(1004, r->substrs, 1, struct reg_substr_data);
1798
2c2d71f5 1799 StructCopy(&zero_scan_data, &data, scan_data_t);
653099ff 1800 /* XXXX Should not we check for something else? Usually it is OPEN1... */
c277df42 1801 if (OP(scan) != BRANCH) { /* Only one top-level choice. */
c277df42 1802 I32 fake;
c5254dd6 1803 STRLEN longest_float_length, longest_fixed_length;
653099ff
GS
1804 struct regnode_charclass_class ch_class;
1805 int stclass_flag;
cb434fcc 1806 I32 last_close = 0;
a0d0e21e
LW
1807
1808 first = scan;
c277df42 1809 /* Skip introductions and multiplicators >= 1. */
a0d0e21e 1810 while ((OP(first) == OPEN && (sawopen = 1)) ||
653099ff 1811 /* An OR of *one* alternative - should not happen now. */
a0d0e21e
LW
1812 (OP(first) == BRANCH && OP(regnext(first)) != BRANCH) ||
1813 (OP(first) == PLUS) ||
1814 (OP(first) == MINMOD) ||
653099ff 1815 /* An {n,m} with n>0 */
22c35a8c 1816 (PL_regkind[(U8)OP(first)] == CURLY && ARG1(first) > 0) ) {
a0d0e21e
LW
1817 if (OP(first) == PLUS)
1818 sawplus = 1;
1819 else
1820 first += regarglen[(U8)OP(first)];
1821 first = NEXTOPER(first);
a687059c
LW
1822 }
1823
a0d0e21e
LW
1824 /* Starting-point info. */
1825 again:
653099ff 1826 if (PL_regkind[(U8)OP(first)] == EXACT) {
1aa99e6b
IH
1827 if (OP(first) == EXACT)
1828 ; /* Empty, get anchored substr later. */
1829 else if ((OP(first) == EXACTF || OP(first) == EXACTFL))
b3c9acc1
IZ
1830 r->regstclass = first;
1831 }
653099ff 1832 else if (strchr((char*)PL_simple,OP(first)))
a0d0e21e 1833 r->regstclass = first;
22c35a8c
GS
1834 else if (PL_regkind[(U8)OP(first)] == BOUND ||
1835 PL_regkind[(U8)OP(first)] == NBOUND)
a0d0e21e 1836 r->regstclass = first;
22c35a8c 1837 else if (PL_regkind[(U8)OP(first)] == BOL) {
cad2e5aa
JH
1838 r->reganch |= (OP(first) == MBOL
1839 ? ROPT_ANCH_MBOL
1840 : (OP(first) == SBOL
1841 ? ROPT_ANCH_SBOL
1842 : ROPT_ANCH_BOL));
a0d0e21e 1843 first = NEXTOPER(first);
774d564b 1844 goto again;
1845 }
1846 else if (OP(first) == GPOS) {
1847 r->reganch |= ROPT_ANCH_GPOS;
1848 first = NEXTOPER(first);
1849 goto again;
a0d0e21e
LW
1850 }
1851 else if ((OP(first) == STAR &&
22c35a8c 1852 PL_regkind[(U8)OP(NEXTOPER(first))] == REG_ANY) &&
a0d0e21e
LW
1853 !(r->reganch & ROPT_ANCH) )
1854 {
1855 /* turn .* into ^.* with an implied $*=1 */
cad2e5aa
JH
1856 int type = OP(NEXTOPER(first));
1857
ffc61ed2 1858 if (type == REG_ANY)
cad2e5aa
JH
1859 type = ROPT_ANCH_MBOL;
1860 else
1861 type = ROPT_ANCH_SBOL;
1862
1863 r->reganch |= type | ROPT_IMPLICIT;
a0d0e21e 1864 first = NEXTOPER(first);
774d564b 1865 goto again;
a0d0e21e 1866 }
b81d288d 1867 if (sawplus && (!sawopen || !RExC_sawback)
830247a4 1868 && !(RExC_seen & REG_SEEN_EVAL)) /* May examine pos and $& */
cad2e5aa
JH
1869 /* x+ must match at the 1st pos of run of x's */
1870 r->reganch |= ROPT_SKIP;
a0d0e21e 1871
c277df42 1872 /* Scan is after the zeroth branch, first is atomic matcher. */
b81d288d 1873 DEBUG_r(PerlIO_printf(Perl_debug_log, "first at %"IVdf"\n",
b900a521 1874 (IV)(first - scan + 1)));
a0d0e21e
LW
1875 /*
1876 * If there's something expensive in the r.e., find the
1877 * longest literal string that must appear and make it the
1878 * regmust. Resolve ties in favor of later strings, since
1879 * the regstart check works with the beginning of the r.e.
1880 * and avoiding duplication strengthens checking. Not a
1881 * strong reason, but sufficient in the absence of others.
1882 * [Now we resolve ties in favor of the earlier string if
c277df42 1883 * it happens that c_offset_min has been invalidated, since the
a0d0e21e
LW
1884 * earlier string may buy us something the later one won't.]
1885 */
a0d0e21e 1886 minlen = 0;
a687059c 1887
79cb57f6
GS
1888 data.longest_fixed = newSVpvn("",0);
1889 data.longest_float = newSVpvn("",0);
1890 data.last_found = newSVpvn("",0);
c277df42
IZ
1891 data.longest = &(data.longest_fixed);
1892 first = scan;
653099ff 1893 if (!r->regstclass) {
830247a4 1894 cl_init(pRExC_state, &ch_class);
653099ff
GS
1895 data.start_class = &ch_class;
1896 stclass_flag = SCF_DO_STCLASS_AND;
1897 } else /* XXXX Check for BOUND? */
1898 stclass_flag = 0;
cb434fcc 1899 data.last_closep = &last_close;
653099ff 1900
830247a4 1901 minlen = study_chunk(pRExC_state, &first, &fake, scan + RExC_size, /* Up to end */
e1901655 1902 &data, SCF_DO_SUBSTR | SCF_WHILEM_VISITED_POS | stclass_flag);
830247a4 1903 if ( RExC_npar == 1 && data.longest == &(data.longest_fixed)
b81d288d 1904 && data.last_start_min == 0 && data.last_end > 0
830247a4
IZ
1905 && !RExC_seen_zerolen
1906 && (!(RExC_seen & REG_SEEN_GPOS) || (r->reganch & ROPT_ANCH_GPOS)))
c277df42 1907 r->reganch |= ROPT_CHECK_ALL;
830247a4 1908 scan_commit(pRExC_state, &data);
c277df42
IZ
1909 SvREFCNT_dec(data.last_found);
1910
a0ed51b3 1911 longest_float_length = CHR_SVLEN(data.longest_float);
c5254dd6 1912 if (longest_float_length
c277df42
IZ
1913 || (data.flags & SF_FL_BEFORE_EOL
1914 && (!(data.flags & SF_FL_BEFORE_MEOL)
830247a4 1915 || (RExC_flags16 & PMf_MULTILINE)))) {
cf93c79d
IZ
1916 int t;
1917
a0ed51b3 1918 if (SvCUR(data.longest_fixed) /* ok to leave SvCUR */
aca2d497
IZ
1919 && data.offset_fixed == data.offset_float_min
1920 && SvCUR(data.longest_fixed) == SvCUR(data.longest_float))
1921 goto remove_float; /* As in (a)+. */
1922
c277df42
IZ
1923 r->float_substr = data.longest_float;
1924 r->float_min_offset = data.offset_float_min;
1925 r->float_max_offset = data.offset_float_max;
cf93c79d
IZ
1926 t = (data.flags & SF_FL_BEFORE_EOL /* Can't have SEOL and MULTI */
1927 && (!(data.flags & SF_FL_BEFORE_MEOL)
830247a4 1928 || (RExC_flags16 & PMf_MULTILINE)));
cf93c79d 1929 fbm_compile(r->float_substr, t ? FBMcf_TAIL : 0);
a0ed51b3
LW
1930 }
1931 else {
aca2d497 1932 remove_float:
c277df42
IZ
1933 r->float_substr = Nullsv;
1934 SvREFCNT_dec(data.longest_float);
c5254dd6 1935 longest_float_length = 0;
a0d0e21e 1936 }
c277df42 1937
a0ed51b3 1938 longest_fixed_length = CHR_SVLEN(data.longest_fixed);
c5254dd6 1939 if (longest_fixed_length
c277df42
IZ
1940 || (data.flags & SF_FIX_BEFORE_EOL /* Cannot have SEOL and MULTI */
1941 && (!(data.flags & SF_FIX_BEFORE_MEOL)
830247a4 1942 || (RExC_flags16 & PMf_MULTILINE)))) {
cf93c79d
IZ
1943 int t;
1944
c277df42
IZ
1945 r->anchored_substr = data.longest_fixed;
1946 r->anchored_offset = data.offset_fixed;
cf93c79d
IZ
1947 t = (data.flags & SF_FIX_BEFORE_EOL /* Can't have SEOL and MULTI */
1948 && (!(data.flags & SF_FIX_BEFORE_MEOL)
830247a4 1949 || (RExC_flags16 & PMf_MULTILINE)));
cf93c79d 1950 fbm_compile(r->anchored_substr, t ? FBMcf_TAIL : 0);
a0ed51b3
LW
1951 }
1952 else {
c277df42
IZ
1953 r->anchored_substr = Nullsv;
1954 SvREFCNT_dec(data.longest_fixed);
c5254dd6 1955 longest_fixed_length = 0;
a0d0e21e 1956 }
b81d288d 1957 if (r->regstclass
ffc61ed2 1958 && (OP(r->regstclass) == REG_ANY || OP(r->regstclass) == SANY))
653099ff
GS
1959 r->regstclass = NULL;
1960 if ((!r->anchored_substr || r->anchored_offset) && stclass_flag
1961 && !(data.start_class->flags & ANYOF_EOS)
1962 && !cl_is_anything(data.start_class)) {
830247a4 1963 I32 n = add_data(pRExC_state, 1, "f");
653099ff 1964
b81d288d 1965 New(1006, RExC_rx->data->data[n], 1,
653099ff
GS
1966 struct regnode_charclass_class);
1967 StructCopy(data.start_class,
830247a4 1968 (struct regnode_charclass_class*)RExC_rx->data->data[n],
653099ff 1969 struct regnode_charclass_class);
830247a4 1970 r->regstclass = (regnode*)RExC_rx->data->data[n];
653099ff 1971 r->reganch &= ~ROPT_SKIP; /* Used in find_byclass(). */
ffc61ed2 1972 PL_regdata = r->data; /* for regprop() */
9c5ffd7c
JH
1973 DEBUG_r({ SV *sv = sv_newmortal();
1974 regprop(sv, (regnode*)data.start_class);
1975 PerlIO_printf(Perl_debug_log,
1976 "synthetic stclass `%s'.\n",
1977 SvPVX(sv));});
653099ff 1978 }
c277df42
IZ
1979
1980 /* A temporary algorithm prefers floated substr to fixed one to dig more info. */
c5254dd6 1981 if (longest_fixed_length > longest_float_length) {
c277df42
IZ
1982 r->check_substr = r->anchored_substr;
1983 r->check_offset_min = r->check_offset_max = r->anchored_offset;
1984 if (r->reganch & ROPT_ANCH_SINGLE)
1985 r->reganch |= ROPT_NOSCAN;
a0ed51b3
LW
1986 }
1987 else {
c277df42
IZ
1988 r->check_substr = r->float_substr;
1989 r->check_offset_min = data.offset_float_min;
1990 r->check_offset_max = data.offset_float_max;
a0d0e21e 1991 }
30382c73
IZ
1992 /* XXXX Currently intuiting is not compatible with ANCH_GPOS.
1993 This should be changed ASAP! */
1994 if (r->check_substr && !(r->reganch & ROPT_ANCH_GPOS)) {
cad2e5aa
JH
1995 r->reganch |= RE_USE_INTUIT;
1996 if (SvTAIL(r->check_substr))
1997 r->reganch |= RE_INTUIT_TAIL;
1998 }
a0ed51b3
LW
1999 }
2000 else {
c277df42
IZ
2001 /* Several toplevels. Best we can is to set minlen. */
2002 I32 fake;
653099ff 2003 struct regnode_charclass_class ch_class;
cb434fcc 2004 I32 last_close = 0;
c277df42
IZ
2005
2006 DEBUG_r(PerlIO_printf(Perl_debug_log, "\n"));
2007 scan = r->program + 1;
830247a4 2008 cl_init(pRExC_state, &ch_class);
653099ff 2009 data.start_class = &ch_class;
cb434fcc 2010 data.last_closep = &last_close;
e1901655 2011 minlen = study_chunk(pRExC_state, &scan, &fake, scan + RExC_size, &data, SCF_DO_STCLASS_AND|SCF_WHILEM_VISITED_POS);
c277df42 2012 r->check_substr = r->anchored_substr = r->float_substr = Nullsv;
653099ff
GS
2013 if (!(data.start_class->flags & ANYOF_EOS)
2014 && !cl_is_anything(data.start_class)) {
830247a4 2015 I32 n = add_data(pRExC_state, 1, "f");
653099ff 2016
b81d288d 2017 New(1006, RExC_rx->data->data[n], 1,
653099ff
GS
2018 struct regnode_charclass_class);
2019 StructCopy(data.start_class,
830247a4 2020 (struct regnode_charclass_class*)RExC_rx->data->data[n],
653099ff 2021 struct regnode_charclass_class);
830247a4 2022 r->regstclass = (regnode*)RExC_rx->data->data[n];
653099ff 2023 r->reganch &= ~ROPT_SKIP; /* Used in find_byclass(). */
9c5ffd7c
JH
2024 DEBUG_r({ SV* sv = sv_newmortal();
2025 regprop(sv, (regnode*)data.start_class);
2026 PerlIO_printf(Perl_debug_log,
2027 "synthetic stclass `%s'.\n",
2028 SvPVX(sv));});
653099ff 2029 }
a0d0e21e
LW
2030 }
2031
a0d0e21e 2032 r->minlen = minlen;
b81d288d 2033 if (RExC_seen & REG_SEEN_GPOS)
c277df42 2034 r->reganch |= ROPT_GPOS_SEEN;
830247a4 2035 if (RExC_seen & REG_SEEN_LOOKBEHIND)
c277df42 2036 r->reganch |= ROPT_LOOKBEHIND_SEEN;
830247a4 2037 if (RExC_seen & REG_SEEN_EVAL)
ce862d02 2038 r->reganch |= ROPT_EVAL_SEEN;
f33976b4
DB
2039 if (RExC_seen & REG_SEEN_CANY)
2040 r->reganch |= ROPT_CANY_SEEN;
830247a4
IZ
2041 Newz(1002, r->startp, RExC_npar, I32);
2042 Newz(1002, r->endp, RExC_npar, I32);
ffc61ed2 2043 PL_regdata = r->data; /* for regprop() */
a0d0e21e
LW
2044 DEBUG_r(regdump(r));
2045 return(r);
a687059c
LW
2046}
2047
2048/*
2049 - reg - regular expression, i.e. main body or parenthesized thing
2050 *
2051 * Caller must absorb opening parenthesis.
2052 *
2053 * Combining parenthesis handling with the base level of regular expression
2054 * is a trifle forced, but the need to tie the tails of the branches to what
2055 * follows makes it hard to avoid.
2056 */
76e3520e 2057STATIC regnode *
830247a4 2058S_reg(pTHX_ RExC_state_t *pRExC_state, I32 paren, I32 *flagp)
c277df42 2059 /* paren: Parenthesized? 0=top, 1=(, inside: changed to letter. */
a687059c 2060{
c277df42
IZ
2061 register regnode *ret; /* Will be the head of the group. */
2062 register regnode *br;
2063 register regnode *lastbr;
2064 register regnode *ender = 0;
a0d0e21e 2065 register I32 parno = 0;
830247a4 2066 I32 flags, oregflags = RExC_flags16, have_branch = 0, open = 0;
9d1d55b5
JP
2067
2068 /* for (?g), (?gc), and (?o) warnings; warning
2069 about (?c) will warn about (?g) -- japhy */
2070
2071 I32 wastedflags = 0x00,
2072 wasted_o = 0x01,
2073 wasted_g = 0x02,
2074 wasted_gc = 0x02 | 0x04,
2075 wasted_c = 0x04;
2076
fac92740 2077 char * parse_start = RExC_parse; /* MJD */
830247a4 2078 char *oregcomp_parse = RExC_parse;
c277df42 2079 char c;
a0d0e21e 2080
821b33a5 2081 *flagp = 0; /* Tentatively. */
a0d0e21e 2082
9d1d55b5 2083
a0d0e21e
LW
2084 /* Make an OPEN node, if parenthesized. */
2085 if (paren) {
fac92740 2086 if (*RExC_parse == '?') { /* (?...) */
ca9dfc88
IZ
2087 U16 posflags = 0, negflags = 0;
2088 U16 *flagsp = &posflags;
0f5d15d6 2089 int logical = 0;
830247a4 2090 char *seqstart = RExC_parse;
ca9dfc88 2091
830247a4
IZ
2092 RExC_parse++;
2093 paren = *RExC_parse++;
c277df42 2094 ret = NULL; /* For look-ahead/behind. */
a0d0e21e 2095 switch (paren) {
fac92740 2096 case '<': /* (?<...) */
830247a4 2097 RExC_seen |= REG_SEEN_LOOKBEHIND;
b81d288d 2098 if (*RExC_parse == '!')
c277df42 2099 paren = ',';
b81d288d 2100 if (*RExC_parse != '=' && *RExC_parse != '!')
c277df42 2101 goto unknown;
830247a4 2102 RExC_parse++;
fac92740
MJD
2103 case '=': /* (?=...) */
2104 case '!': /* (?!...) */
830247a4 2105 RExC_seen_zerolen++;
fac92740
MJD
2106 case ':': /* (?:...) */
2107 case '>': /* (?>...) */
a0d0e21e 2108 break;
fac92740
MJD
2109 case '$': /* (?$...) */
2110 case '@': /* (?@...) */
8615cb43 2111 vFAIL2("Sequence (?%c...) not implemented", (int)paren);
a0d0e21e 2112 break;
fac92740 2113 case '#': /* (?#...) */
830247a4
IZ
2114 while (*RExC_parse && *RExC_parse != ')')
2115 RExC_parse++;
2116 if (*RExC_parse != ')')
c277df42 2117 FAIL("Sequence (?#... not terminated");
830247a4 2118 nextchar(pRExC_state);
a0d0e21e
LW
2119 *flagp = TRYAGAIN;
2120 return NULL;
fac92740 2121 case 'p': /* (?p...) */
f9373011
PM
2122 if (SIZE_ONLY && ckWARN2(WARN_DEPRECATED, WARN_REGEXP))
2123 vWARNdep(RExC_parse, "(?p{}) is deprecated - use (??{})");
8c8ad484 2124 /* FALL THROUGH*/
fac92740 2125 case '?': /* (??...) */
0f5d15d6 2126 logical = 1;
830247a4 2127 paren = *RExC_parse++;
0f5d15d6 2128 /* FALL THROUGH */
fac92740 2129 case '{': /* (?{...}) */
c277df42 2130 {
c277df42
IZ
2131 I32 count = 1, n = 0;
2132 char c;
830247a4 2133 char *s = RExC_parse;
c277df42
IZ
2134 SV *sv;
2135 OP_4tree *sop, *rop;
2136
830247a4
IZ
2137 RExC_seen_zerolen++;
2138 RExC_seen |= REG_SEEN_EVAL;
2139 while (count && (c = *RExC_parse)) {
2140 if (c == '\\' && RExC_parse[1])
2141 RExC_parse++;
b81d288d 2142 else if (c == '{')
c277df42 2143 count++;
b81d288d 2144 else if (c == '}')
c277df42 2145 count--;
830247a4 2146 RExC_parse++;
c277df42 2147 }
830247a4 2148 if (*RExC_parse != ')')
b45f050a 2149 {
b81d288d 2150 RExC_parse = s;
b45f050a
JF
2151 vFAIL("Sequence (?{...}) not terminated or not {}-balanced");
2152 }
c277df42
IZ
2153 if (!SIZE_ONLY) {
2154 AV *av;
b81d288d
AB
2155
2156 if (RExC_parse - 1 - s)
830247a4 2157 sv = newSVpvn(s, RExC_parse - 1 - s);
c277df42 2158 else
79cb57f6 2159 sv = newSVpvn("", 0);
c277df42 2160
569233ed
SB
2161 ENTER;
2162 Perl_save_re_context(aTHX);
c277df42 2163 rop = sv_compile_2op(sv, &sop, "re", &av);
9b978d73
DM
2164 sop->op_private |= OPpREFCOUNTED;
2165 /* re_dup will OpREFCNT_inc */
2166 OpREFCNT_set(sop, 1);
569233ed 2167 LEAVE;
c277df42 2168
830247a4
IZ
2169 n = add_data(pRExC_state, 3, "nop");
2170 RExC_rx->data->data[n] = (void*)rop;
2171 RExC_rx->data->data[n+1] = (void*)sop;
2172 RExC_rx->data->data[n+2] = (void*)av;
c277df42 2173 SvREFCNT_dec(sv);
a0ed51b3 2174 }
e24b16f9 2175 else { /* First pass */
830247a4 2176 if (PL_reginterp_cnt < ++RExC_seen_evals
e24b16f9 2177 && PL_curcop != &PL_compiling)
2cd61cdb
IZ
2178 /* No compiled RE interpolated, has runtime
2179 components ===> unsafe. */
2180 FAIL("Eval-group not allowed at runtime, use re 'eval'");
5b61d3f7 2181 if (PL_tainting && PL_tainted)
cc6b7395 2182 FAIL("Eval-group in insecure regular expression");
c277df42
IZ
2183 }
2184
830247a4 2185 nextchar(pRExC_state);
0f5d15d6 2186 if (logical) {
830247a4 2187 ret = reg_node(pRExC_state, LOGICAL);
0f5d15d6
IZ
2188 if (!SIZE_ONLY)
2189 ret->flags = 2;
830247a4 2190 regtail(pRExC_state, ret, reganode(pRExC_state, EVAL, n));
fac92740 2191 /* deal with the length of this later - MJD */
0f5d15d6
IZ
2192 return ret;
2193 }
830247a4 2194 return reganode(pRExC_state, EVAL, n);
c277df42 2195 }
fac92740 2196 case '(': /* (?(?{...})...) and (?(?=...)...) */
c277df42 2197 {
fac92740 2198 if (RExC_parse[0] == '?') { /* (?(?...)) */
b81d288d
AB
2199 if (RExC_parse[1] == '=' || RExC_parse[1] == '!'
2200 || RExC_parse[1] == '<'
830247a4 2201 || RExC_parse[1] == '{') { /* Lookahead or eval. */
c277df42
IZ
2202 I32 flag;
2203
830247a4 2204 ret = reg_node(pRExC_state, LOGICAL);
0f5d15d6
IZ
2205 if (!SIZE_ONLY)
2206 ret->flags = 1;
830247a4 2207 regtail(pRExC_state, ret, reg(pRExC_state, 1, &flag));
c277df42 2208 goto insert_if;
b81d288d 2209 }
a0ed51b3 2210 }
830247a4 2211 else if (RExC_parse[0] >= '1' && RExC_parse[0] <= '9' ) {
fac92740 2212 /* (?(1)...) */
830247a4 2213 parno = atoi(RExC_parse++);
c277df42 2214
830247a4
IZ
2215 while (isDIGIT(*RExC_parse))
2216 RExC_parse++;
fac92740
MJD
2217 ret = reganode(pRExC_state, GROUPP, parno);
2218
830247a4 2219 if ((c = *nextchar(pRExC_state)) != ')')
b45f050a 2220 vFAIL("Switch condition not recognized");
c277df42 2221 insert_if:
830247a4
IZ
2222 regtail(pRExC_state, ret, reganode(pRExC_state, IFTHEN, 0));
2223 br = regbranch(pRExC_state, &flags, 1);
c277df42 2224 if (br == NULL)
830247a4 2225 br = reganode(pRExC_state, LONGJMP, 0);
c277df42 2226 else
830247a4
IZ
2227 regtail(pRExC_state, br, reganode(pRExC_state, LONGJMP, 0));
2228 c = *nextchar(pRExC_state);
d1b80229
IZ
2229 if (flags&HASWIDTH)
2230 *flagp |= HASWIDTH;
c277df42 2231 if (c == '|') {
830247a4
IZ
2232 lastbr = reganode(pRExC_state, IFTHEN, 0); /* Fake one for optimizer. */
2233 regbranch(pRExC_state, &flags, 1);
2234 regtail(pRExC_state, ret, lastbr);
d1b80229
IZ
2235 if (flags&HASWIDTH)
2236 *flagp |= HASWIDTH;
830247a4 2237 c = *nextchar(pRExC_state);
a0ed51b3
LW
2238 }
2239 else
c277df42
IZ
2240 lastbr = NULL;
2241 if (c != ')')
8615cb43 2242 vFAIL("Switch (?(condition)... contains too many branches");
830247a4
IZ
2243 ender = reg_node(pRExC_state, TAIL);
2244 regtail(pRExC_state, br, ender);
c277df42 2245 if (lastbr) {
830247a4
IZ
2246 regtail(pRExC_state, lastbr, ender);
2247 regtail(pRExC_state, NEXTOPER(NEXTOPER(lastbr)), ender);
a0ed51b3
LW
2248 }
2249 else
830247a4 2250 regtail(pRExC_state, ret, ender);
c277df42 2251 return ret;
a0ed51b3
LW
2252 }
2253 else {
830247a4 2254 vFAIL2("Unknown switch condition (?(%.2s", RExC_parse);
c277df42
IZ
2255 }
2256 }
1b1626e4 2257 case 0:
830247a4 2258 RExC_parse--; /* for vFAIL to print correctly */
8615cb43 2259 vFAIL("Sequence (? incomplete");
1b1626e4 2260 break;
a0d0e21e 2261 default:
830247a4 2262 --RExC_parse;
fac92740 2263 parse_flags: /* (?i) */
830247a4 2264 while (*RExC_parse && strchr("iogcmsx", *RExC_parse)) {
9d1d55b5
JP
2265 /* (?g), (?gc) and (?o) are useless here
2266 and must be globally applied -- japhy */
2267
2268 if (*RExC_parse == 'o' || *RExC_parse == 'g') {
2269 if (SIZE_ONLY && ckWARN(WARN_REGEXP)) {
2270 I32 wflagbit = *RExC_parse == 'o' ? wasted_o : wasted_g;
2271 if (! (wastedflags & wflagbit) ) {
2272 wastedflags |= wflagbit;
2273 vWARN5(
2274 RExC_parse + 1,
2275 "Useless (%s%c) - %suse /%c modifier",
2276 flagsp == &negflags ? "?-" : "?",
2277 *RExC_parse,
2278 flagsp == &negflags ? "don't " : "",
2279 *RExC_parse
2280 );
2281 }
2282 }
2283 }
2284 else if (*RExC_parse == 'c') {
2285 if (SIZE_ONLY && ckWARN(WARN_REGEXP)) {
2286 if (! (wastedflags & wasted_c) ) {
2287 wastedflags |= wasted_gc;
2288 vWARN3(
2289 RExC_parse + 1,
2290 "Useless (%sc) - %suse /gc modifier",
2291 flagsp == &negflags ? "?-" : "?",
2292 flagsp == &negflags ? "don't " : ""
2293 );
2294 }
2295 }
2296 }
2297 else { pmflag(flagsp, *RExC_parse); }
2298
830247a4 2299 ++RExC_parse;
ca9dfc88 2300 }
830247a4 2301 if (*RExC_parse == '-') {
ca9dfc88 2302 flagsp = &negflags;
9d1d55b5 2303 wastedflags = 0; /* reset so (?g-c) warns twice */
830247a4 2304 ++RExC_parse;
ca9dfc88 2305 goto parse_flags;
48c036b1 2306 }
830247a4
IZ
2307 RExC_flags16 |= posflags;
2308 RExC_flags16 &= ~negflags;
2309 if (*RExC_parse == ':') {
2310 RExC_parse++;
ca9dfc88
IZ
2311 paren = ':';
2312 break;
2313 }
c277df42 2314 unknown:
830247a4
IZ
2315 if (*RExC_parse != ')') {
2316 RExC_parse++;
2317 vFAIL3("Sequence (%.*s...) not recognized", RExC_parse-seqstart, seqstart);
b45f050a 2318 }
830247a4 2319 nextchar(pRExC_state);
a0d0e21e
LW
2320 *flagp = TRYAGAIN;
2321 return NULL;
2322 }
2323 }
fac92740 2324 else { /* (...) */
830247a4
IZ
2325 parno = RExC_npar;
2326 RExC_npar++;
2327 ret = reganode(pRExC_state, OPEN, parno);
fac92740
MJD
2328 Set_Node_Length(ret, 1); /* MJD */
2329 Set_Node_Offset(ret, RExC_parse); /* MJD */
c277df42 2330 open = 1;
a0d0e21e 2331 }
a0ed51b3 2332 }
fac92740 2333 else /* ! paren */
a0d0e21e
LW
2334 ret = NULL;
2335
2336 /* Pick up the branches, linking them together. */
fac92740 2337 parse_start = RExC_parse; /* MJD */
830247a4 2338 br = regbranch(pRExC_state, &flags, 1);
fac92740
MJD
2339 /* branch_len = (paren != 0); */
2340
a0d0e21e
LW
2341 if (br == NULL)
2342 return(NULL);
830247a4
IZ
2343 if (*RExC_parse == '|') {
2344 if (!SIZE_ONLY && RExC_extralen) {
2345 reginsert(pRExC_state, BRANCHJ, br);
a0ed51b3 2346 }
fac92740 2347 else { /* MJD */
830247a4 2348 reginsert(pRExC_state, BRANCH, br);
fac92740
MJD
2349 Set_Node_Length(br, paren != 0);
2350 Set_Node_Offset_To_R(br-RExC_emit_start, parse_start-RExC_start);
2351 }
c277df42
IZ
2352 have_branch = 1;
2353 if (SIZE_ONLY)
830247a4 2354 RExC_extralen += 1; /* For BRANCHJ-BRANCH. */
a0ed51b3
LW
2355 }
2356 else if (paren == ':') {
c277df42
IZ
2357 *flagp |= flags&SIMPLE;
2358 }
2359 if (open) { /* Starts with OPEN. */
830247a4 2360 regtail(pRExC_state, ret, br); /* OPEN -> first. */
a0ed51b3
LW
2361 }
2362 else if (paren != '?') /* Not Conditional */
a0d0e21e 2363 ret = br;
821b33a5
IZ
2364 if (flags&HASWIDTH)
2365 *flagp |= HASWIDTH;
a0d0e21e 2366 *flagp |= flags&SPSTART;
c277df42 2367 lastbr = br;
830247a4
IZ
2368 while (*RExC_parse == '|') {
2369 if (!SIZE_ONLY && RExC_extralen) {
2370 ender = reganode(pRExC_state, LONGJMP,0);
2371 regtail(pRExC_state, NEXTOPER(NEXTOPER(lastbr)), ender); /* Append to the previous. */
c277df42
IZ
2372 }
2373 if (SIZE_ONLY)
830247a4
IZ
2374 RExC_extralen += 2; /* Account for LONGJMP. */
2375 nextchar(pRExC_state);
2376 br = regbranch(pRExC_state, &flags, 0);
fac92740 2377
a687059c 2378 if (br == NULL)
a0d0e21e 2379 return(NULL);
830247a4 2380 regtail(pRExC_state, lastbr, br); /* BRANCH -> BRANCH. */
c277df42 2381 lastbr = br;
821b33a5
IZ
2382 if (flags&HASWIDTH)
2383 *flagp |= HASWIDTH;
a687059c 2384 *flagp |= flags&SPSTART;
a0d0e21e
LW
2385 }
2386
c277df42
IZ
2387 if (have_branch || paren != ':') {
2388 /* Make a closing node, and hook it on the end. */
2389 switch (paren) {
2390 case ':':
830247a4 2391 ender = reg_node(pRExC_state, TAIL);
c277df42
IZ
2392 break;
2393 case 1:
830247a4 2394 ender = reganode(pRExC_state, CLOSE, parno);
fac92740
MJD
2395 Set_Node_Offset(ender,RExC_parse+1); /* MJD */
2396 Set_Node_Length(ender,1); /* MJD */
c277df42
IZ
2397 break;
2398 case '<':
c277df42
IZ
2399 case ',':
2400 case '=':
2401 case '!':
c277df42 2402 *flagp &= ~HASWIDTH;
821b33a5
IZ
2403 /* FALL THROUGH */
2404 case '>':
830247a4 2405 ender = reg_node(pRExC_state, SUCCEED);
c277df42
IZ
2406 break;
2407 case 0:
830247a4 2408 ender = reg_node(pRExC_state, END);
c277df42
IZ
2409 break;
2410 }
830247a4 2411 regtail(pRExC_state, lastbr, ender);
a0d0e21e 2412
c277df42
IZ
2413 if (have_branch) {
2414 /* Hook the tails of the branches to the closing node. */
2415 for (br = ret; br != NULL; br = regnext(br)) {
830247a4 2416 regoptail(pRExC_state, br, ender);
c277df42
IZ
2417 }
2418 }
a0d0e21e 2419 }
c277df42
IZ
2420
2421 {
2422 char *p;
2423 static char parens[] = "=!<,>";
2424
2425 if (paren && (p = strchr(parens, paren))) {
2426 int node = ((p - parens) % 2) ? UNLESSM : IFMATCH;
2427 int flag = (p - parens) > 1;
2428
2429 if (paren == '>')
2430 node = SUSPEND, flag = 0;
830247a4 2431 reginsert(pRExC_state, node,ret);
c277df42 2432 ret->flags = flag;
830247a4 2433 regtail(pRExC_state, ret, reg_node(pRExC_state, TAIL));
c277df42 2434 }
a0d0e21e
LW
2435 }
2436
2437 /* Check for proper termination. */
ce3e6498 2438 if (paren) {
830247a4
IZ
2439 RExC_flags16 = oregflags;
2440 if (RExC_parse >= RExC_end || *nextchar(pRExC_state) != ')') {
2441 RExC_parse = oregcomp_parse;
380a0633 2442 vFAIL("Unmatched (");
ce3e6498 2443 }
a0ed51b3 2444 }
830247a4
IZ
2445 else if (!paren && RExC_parse < RExC_end) {
2446 if (*RExC_parse == ')') {
2447 RExC_parse++;
380a0633 2448 vFAIL("Unmatched )");
a0ed51b3
LW
2449 }
2450 else
b45f050a 2451 FAIL("Junk on end of regexp"); /* "Can't happen". */
a0d0e21e
LW
2452 /* NOTREACHED */
2453 }
a687059c 2454
a0d0e21e 2455 return(ret);
a687059c
LW
2456}
2457
2458/*
2459 - regbranch - one alternative of an | operator
2460 *
2461 * Implements the concatenation operator.
2462 */
76e3520e 2463STATIC regnode *
830247a4 2464S_regbranch(pTHX_ RExC_state_t *pRExC_state, I32 *flagp, I32 first)
a687059c 2465{
c277df42
IZ
2466 register regnode *ret;
2467 register regnode *chain = NULL;
2468 register regnode *latest;
2469 I32 flags = 0, c = 0;
a0d0e21e 2470
b81d288d 2471 if (first)
c277df42
IZ
2472 ret = NULL;
2473 else {
b81d288d 2474 if (!SIZE_ONLY && RExC_extralen)
830247a4 2475 ret = reganode(pRExC_state, BRANCHJ,0);
fac92740 2476 else {
830247a4 2477 ret = reg_node(pRExC_state, BRANCH);
fac92740
MJD
2478 Set_Node_Length(ret, 1);
2479 }
c277df42
IZ
2480 }
2481
b81d288d 2482 if (!first && SIZE_ONLY)
830247a4 2483 RExC_extralen += 1; /* BRANCHJ */
b81d288d 2484
c277df42 2485 *flagp = WORST; /* Tentatively. */
a0d0e21e 2486
830247a4
IZ
2487 RExC_parse--;
2488 nextchar(pRExC_state);
2489 while (RExC_parse < RExC_end && *RExC_parse != '|' && *RExC_parse != ')') {
a0d0e21e 2490 flags &= ~TRYAGAIN;
830247a4 2491 latest = regpiece(pRExC_state, &flags);
a0d0e21e
LW
2492 if (latest == NULL) {
2493 if (flags & TRYAGAIN)
2494 continue;
2495 return(NULL);
a0ed51b3
LW
2496 }
2497 else if (ret == NULL)
c277df42 2498 ret = latest;
a0d0e21e 2499 *flagp |= flags&HASWIDTH;
c277df42 2500 if (chain == NULL) /* First piece. */
a0d0e21e
LW
2501 *flagp |= flags&SPSTART;
2502 else {
830247a4
IZ
2503 RExC_naughty++;
2504 regtail(pRExC_state, chain, latest);
a687059c 2505 }
a0d0e21e 2506 chain = latest;
c277df42
IZ
2507 c++;
2508 }
2509 if (chain == NULL) { /* Loop ran zero times. */
830247a4 2510 chain = reg_node(pRExC_state, NOTHING);
c277df42
IZ
2511 if (ret == NULL)
2512 ret = chain;
2513 }
2514 if (c == 1) {
2515 *flagp |= flags&SIMPLE;
a0d0e21e 2516 }
a687059c 2517
a0d0e21e 2518 return(ret);
a687059c
LW
2519}
2520
2521/*
2522 - regpiece - something followed by possible [*+?]
2523 *
2524 * Note that the branching code sequences used for ? and the general cases
2525 * of * and + are somewhat optimized: they use the same NOTHING node as
2526 * both the endmarker for their branch list and the body of the last branch.
2527 * It might seem that this node could be dispensed with entirely, but the
2528 * endmarker role is not redundant.
2529 */
76e3520e 2530STATIC regnode *
830247a4 2531S_regpiece(pTHX_ RExC_state_t *pRExC_state, I32 *flagp)
a687059c 2532{
c277df42 2533 register regnode *ret;
a0d0e21e
LW
2534 register char op;
2535 register char *next;
2536 I32 flags;
830247a4 2537 char *origparse = RExC_parse;
a0d0e21e
LW
2538 char *maxpos;
2539 I32 min;
c277df42 2540 I32 max = REG_INFTY;
fac92740 2541 char *parse_start;
a0d0e21e 2542
830247a4 2543 ret = regatom(pRExC_state, &flags);
a0d0e21e
LW
2544 if (ret == NULL) {
2545 if (flags & TRYAGAIN)
2546 *flagp |= TRYAGAIN;
2547 return(NULL);
2548 }
2549
830247a4 2550 op = *RExC_parse;
a0d0e21e 2551
830247a4 2552 if (op == '{' && regcurly(RExC_parse)) {
fac92740 2553 parse_start = RExC_parse; /* MJD */
830247a4 2554 next = RExC_parse + 1;
a0d0e21e
LW
2555 maxpos = Nullch;
2556 while (isDIGIT(*next) || *next == ',') {
2557 if (*next == ',') {
2558 if (maxpos)
2559 break;
2560 else
2561 maxpos = next;
a687059c 2562 }
a0d0e21e
LW
2563 next++;
2564 }
2565 if (*next == '}') { /* got one */
2566 if (!maxpos)
2567 maxpos = next;
830247a4
IZ
2568 RExC_parse++;
2569 min = atoi(RExC_parse);
a0d0e21e
LW
2570 if (*maxpos == ',')
2571 maxpos++;
2572 else
830247a4 2573 maxpos = RExC_parse;
a0d0e21e
LW
2574 max = atoi(maxpos);
2575 if (!max && *maxpos != '0')
c277df42
IZ
2576 max = REG_INFTY; /* meaning "infinity" */
2577 else if (max >= REG_INFTY)
8615cb43 2578 vFAIL2("Quantifier in {,} bigger than %d", REG_INFTY - 1);
830247a4
IZ
2579 RExC_parse = next;
2580 nextchar(pRExC_state);
a0d0e21e
LW
2581
2582 do_curly:
2583 if ((flags&SIMPLE)) {
830247a4
IZ
2584 RExC_naughty += 2 + RExC_naughty / 2;
2585 reginsert(pRExC_state, CURLY, ret);
fac92740
MJD
2586 Set_Node_Offset(ret, parse_start+1); /* MJD */
2587 Set_Node_Cur_Length(ret);
a0d0e21e
LW
2588 }
2589 else {
830247a4 2590 regnode *w = reg_node(pRExC_state, WHILEM);
2c2d71f5
JH
2591
2592 w->flags = 0;
830247a4
IZ
2593 regtail(pRExC_state, ret, w);
2594 if (!SIZE_ONLY && RExC_extralen) {
2595 reginsert(pRExC_state, LONGJMP,ret);
2596 reginsert(pRExC_state, NOTHING,ret);
c277df42
IZ
2597 NEXT_OFF(ret) = 3; /* Go over LONGJMP. */
2598 }
830247a4 2599 reginsert(pRExC_state, CURLYX,ret);
fac92740
MJD
2600 /* MJD hk */
2601 Set_Node_Offset(ret, parse_start+1);
2602 Set_Node_Length(ret,
2603 op == '{' ? (RExC_parse - parse_start) : 1);
2604
830247a4 2605 if (!SIZE_ONLY && RExC_extralen)
c277df42 2606 NEXT_OFF(ret) = 3; /* Go over NOTHING to LONGJMP. */
830247a4 2607 regtail(pRExC_state, ret, reg_node(pRExC_state, NOTHING));
c277df42 2608 if (SIZE_ONLY)
830247a4
IZ
2609 RExC_whilem_seen++, RExC_extralen += 3;
2610 RExC_naughty += 4 + RExC_naughty; /* compound interest */
a0d0e21e 2611 }
c277df42 2612 ret->flags = 0;
a0d0e21e
LW
2613
2614 if (min > 0)
821b33a5
IZ
2615 *flagp = WORST;
2616 if (max > 0)
2617 *flagp |= HASWIDTH;
a0d0e21e 2618 if (max && max < min)
8615cb43 2619 vFAIL("Can't do {n,m} with n > m");
c277df42
IZ
2620 if (!SIZE_ONLY) {
2621 ARG1_SET(ret, min);
2622 ARG2_SET(ret, max);
a687059c 2623 }
a687059c 2624
a0d0e21e 2625 goto nest_check;
a687059c 2626 }
a0d0e21e 2627 }
a687059c 2628
a0d0e21e
LW
2629 if (!ISMULT1(op)) {
2630 *flagp = flags;
a687059c 2631 return(ret);
a0d0e21e 2632 }
bb20fd44 2633
c277df42 2634#if 0 /* Now runtime fix should be reliable. */
b45f050a
JF
2635
2636 /* if this is reinstated, don't forget to put this back into perldiag:
2637
2638 =item Regexp *+ operand could be empty at {#} in regex m/%s/
2639
2640 (F) The part of the regexp subject to either the * or + quantifier
2641 could match an empty string. The {#} shows in the regular
2642 expression about where the problem was discovered.
2643
2644 */
2645
bb20fd44 2646 if (!(flags&HASWIDTH) && op != '?')
b45f050a 2647 vFAIL("Regexp *+ operand could be empty");
b81d288d 2648#endif
bb20fd44 2649
fac92740 2650 parse_start = RExC_parse;
830247a4 2651 nextchar(pRExC_state);
a0d0e21e 2652
821b33a5 2653 *flagp = (op != '+') ? (WORST|SPSTART|HASWIDTH) : (WORST|HASWIDTH);
a0d0e21e
LW
2654
2655 if (op == '*' && (flags&SIMPLE)) {
830247a4 2656 reginsert(pRExC_state, STAR, ret);
c277df42 2657 ret->flags = 0;
830247a4 2658 RExC_naughty += 4;
a0d0e21e
LW
2659 }
2660 else if (op == '*') {
2661 min = 0;
2662 goto do_curly;
a0ed51b3
LW
2663 }
2664 else if (op == '+' && (flags&SIMPLE)) {
830247a4 2665 reginsert(pRExC_state, PLUS, ret);
c277df42 2666 ret->flags = 0;
830247a4 2667 RExC_naughty += 3;
a0d0e21e
LW
2668 }
2669 else if (op == '+') {
2670 min = 1;
2671 goto do_curly;
a0ed51b3
LW
2672 }
2673 else if (op == '?') {
a0d0e21e
LW
2674 min = 0; max = 1;
2675 goto do_curly;
2676 }
2677 nest_check:
e476b1b5 2678 if (ckWARN(WARN_REGEXP) && !SIZE_ONLY && !(flags&HASWIDTH) && max > REG_INFTY/3) {
830247a4 2679 vWARN3(RExC_parse,
b45f050a 2680 "%.*s matches null string many times",
830247a4 2681 RExC_parse - origparse,
b45f050a 2682 origparse);
a0d0e21e
LW
2683 }
2684
830247a4
IZ
2685 if (*RExC_parse == '?') {
2686 nextchar(pRExC_state);
2687 reginsert(pRExC_state, MINMOD, ret);
2688 regtail(pRExC_state, ret, ret + NODE_STEP_REGNODE);
a0d0e21e 2689 }
830247a4
IZ
2690 if (ISMULT2(RExC_parse)) {
2691 RExC_parse++;
b45f050a
JF
2692 vFAIL("Nested quantifiers");
2693 }
a0d0e21e
LW
2694
2695 return(ret);
a687059c
LW
2696}
2697
2698/*
2699 - regatom - the lowest level
2700 *
2701 * Optimization: gobbles an entire sequence of ordinary characters so that
2702 * it can turn them into a single node, which is smaller to store and
2703 * faster to run. Backslashed characters are exceptions, each becoming a
2704 * separate node; the code is simpler that way and it's not worth fixing.
2705 *
b45f050a 2706 * [Yes, it is worth fixing, some scripts can run twice the speed.] */
76e3520e 2707STATIC regnode *
830247a4 2708S_regatom(pTHX_ RExC_state_t *pRExC_state, I32 *flagp)
a687059c 2709{
c277df42 2710 register regnode *ret = 0;
a0d0e21e 2711 I32 flags;
f06dbbb7 2712 char *parse_start = 0;
a0d0e21e
LW
2713
2714 *flagp = WORST; /* Tentatively. */
2715
2716tryagain:
830247a4 2717 switch (*RExC_parse) {
a0d0e21e 2718 case '^':
830247a4
IZ
2719 RExC_seen_zerolen++;
2720 nextchar(pRExC_state);
2721 if (RExC_flags16 & PMf_MULTILINE)
2722 ret = reg_node(pRExC_state, MBOL);
2723 else if (RExC_flags16 & PMf_SINGLELINE)
2724 ret = reg_node(pRExC_state, SBOL);
a0d0e21e 2725 else
830247a4 2726 ret = reg_node(pRExC_state, BOL);
fac92740 2727 Set_Node_Length(ret, 1); /* MJD */
a0d0e21e
LW
2728 break;
2729 case '$':
830247a4 2730 nextchar(pRExC_state);
b81d288d 2731 if (*RExC_parse)
830247a4
IZ
2732 RExC_seen_zerolen++;
2733 if (RExC_flags16 & PMf_MULTILINE)
2734 ret = reg_node(pRExC_state, MEOL);
2735 else if (RExC_flags16 & PMf_SINGLELINE)
2736 ret = reg_node(pRExC_state, SEOL);
a0d0e21e 2737 else
830247a4 2738 ret = reg_node(pRExC_state, EOL);
fac92740 2739 Set_Node_Length(ret, 1); /* MJD */
a0d0e21e
LW
2740 break;
2741 case '.':
830247a4 2742 nextchar(pRExC_state);
cce850e4 2743 if (RExC_flags16 & PMf_SINGLELINE)
ffc61ed2
JH
2744 ret = reg_node(pRExC_state, SANY);
2745 else
2746 ret = reg_node(pRExC_state, REG_ANY);
2747 *flagp |= HASWIDTH|SIMPLE;
830247a4 2748 RExC_naughty++;
fac92740 2749 Set_Node_Length(ret, 1); /* MJD */
a0d0e21e
LW
2750 break;
2751 case '[':
b45f050a 2752 {
830247a4 2753 char *oregcomp_parse = ++RExC_parse;
ffc61ed2 2754 ret = regclass(pRExC_state);
830247a4
IZ
2755 if (*RExC_parse != ']') {
2756 RExC_parse = oregcomp_parse;
b45f050a
JF
2757 vFAIL("Unmatched [");
2758 }
830247a4 2759 nextchar(pRExC_state);
a0d0e21e 2760 *flagp |= HASWIDTH|SIMPLE;
fac92740 2761 Set_Node_Length(ret, RExC_parse - oregcomp_parse + 1); /* MJD */
a0d0e21e 2762 break;
b45f050a 2763 }
a0d0e21e 2764 case '(':
830247a4
IZ
2765 nextchar(pRExC_state);
2766 ret = reg(pRExC_state, 1, &flags);
a0d0e21e 2767 if (ret == NULL) {
bf93d4cc 2768 if (flags & TRYAGAIN) {
830247a4 2769 if (RExC_parse == RExC_end) {
bf93d4cc
GS
2770 /* Make parent create an empty node if needed. */
2771 *flagp |= TRYAGAIN;
2772 return(NULL);
2773 }
a0d0e21e 2774 goto tryagain;
bf93d4cc 2775 }
a0d0e21e
LW
2776 return(NULL);
2777 }
c277df42 2778 *flagp |= flags&(HASWIDTH|SPSTART|SIMPLE);
a0d0e21e
LW
2779 break;
2780 case '|':
2781 case ')':
2782 if (flags & TRYAGAIN) {
2783 *flagp |= TRYAGAIN;
2784 return NULL;
2785 }
b45f050a 2786 vFAIL("Internal urp");
a0d0e21e
LW
2787 /* Supposed to be caught earlier. */
2788 break;
85afd4ae 2789 case '{':
830247a4
IZ
2790 if (!regcurly(RExC_parse)) {
2791 RExC_parse++;
85afd4ae
CS
2792 goto defchar;
2793 }
2794 /* FALL THROUGH */
a0d0e21e
LW
2795 case '?':
2796 case '+':
2797 case '*':
830247a4 2798 RExC_parse++;
b45f050a 2799 vFAIL("Quantifier follows nothing");
a0d0e21e
LW
2800 break;
2801 case '\\':
830247a4 2802 switch (*++RExC_parse) {
a0d0e21e 2803 case 'A':
830247a4
IZ
2804 RExC_seen_zerolen++;
2805 ret = reg_node(pRExC_state, SBOL);
a0d0e21e 2806 *flagp |= SIMPLE;
830247a4 2807 nextchar(pRExC_state);
fac92740 2808 Set_Node_Length(ret, 2); /* MJD */
a0d0e21e
LW
2809 break;
2810 case 'G':
830247a4
IZ
2811 ret = reg_node(pRExC_state, GPOS);
2812 RExC_seen |= REG_SEEN_GPOS;
a0d0e21e 2813 *flagp |= SIMPLE;
830247a4 2814 nextchar(pRExC_state);
fac92740 2815 Set_Node_Length(ret, 2); /* MJD */
a0d0e21e
LW
2816 break;
2817 case 'Z':
830247a4 2818 ret = reg_node(pRExC_state, SEOL);
a0d0e21e 2819 *flagp |= SIMPLE;
a1917ab9 2820 RExC_seen_zerolen++; /* Do not optimize RE away */
830247a4 2821 nextchar(pRExC_state);
a0d0e21e 2822 break;
b85d18e9 2823 case 'z':
830247a4 2824 ret = reg_node(pRExC_state, EOS);
b85d18e9 2825 *flagp |= SIMPLE;
830247a4
IZ
2826 RExC_seen_zerolen++; /* Do not optimize RE away */
2827 nextchar(pRExC_state);
fac92740 2828 Set_Node_Length(ret, 2); /* MJD */
b85d18e9 2829 break;
4a2d328f 2830 case 'C':
f33976b4
DB
2831 ret = reg_node(pRExC_state, CANY);
2832 RExC_seen |= REG_SEEN_CANY;
a0ed51b3 2833 *flagp |= HASWIDTH|SIMPLE;
830247a4 2834 nextchar(pRExC_state);
fac92740 2835 Set_Node_Length(ret, 2); /* MJD */
a0ed51b3
LW
2836 break;
2837 case 'X':
830247a4 2838 ret = reg_node(pRExC_state, CLUMP);
a0ed51b3 2839 *flagp |= HASWIDTH;
830247a4 2840 nextchar(pRExC_state);
fac92740 2841 Set_Node_Length(ret, 2); /* MJD */
a0ed51b3 2842 break;
a0d0e21e 2843 case 'w':
ffc61ed2 2844 ret = reg_node(pRExC_state, LOC ? ALNUML : ALNUM);
a0d0e21e 2845 *flagp |= HASWIDTH|SIMPLE;
830247a4 2846 nextchar(pRExC_state);
fac92740 2847 Set_Node_Length(ret, 2); /* MJD */
a0d0e21e
LW
2848 break;
2849 case 'W':
ffc61ed2 2850 ret = reg_node(pRExC_state, LOC ? NALNUML : NALNUM);
a0d0e21e 2851 *flagp |= HASWIDTH|SIMPLE;
830247a4 2852 nextchar(pRExC_state);
fac92740 2853 Set_Node_Length(ret, 2); /* MJD */
a0d0e21e
LW
2854 break;
2855 case 'b':
830247a4
IZ
2856 RExC_seen_zerolen++;
2857 RExC_seen |= REG_SEEN_LOOKBEHIND;
ffc61ed2 2858 ret = reg_node(pRExC_state, LOC ? BOUNDL : BOUND);
a0d0e21e 2859 *flagp |= SIMPLE;
830247a4 2860 nextchar(pRExC_state);
fac92740 2861 Set_Node_Length(ret, 2); /* MJD */
a0d0e21e
LW
2862 break;
2863 case 'B':
830247a4
IZ
2864 RExC_seen_zerolen++;
2865 RExC_seen |= REG_SEEN_LOOKBEHIND;
ffc61ed2 2866 ret = reg_node(pRExC_state, LOC ? NBOUNDL : NBOUND);
a0d0e21e 2867 *flagp |= SIMPLE;
830247a4 2868 nextchar(pRExC_state);
fac92740 2869 Set_Node_Length(ret, 2); /* MJD */
a0d0e21e
LW
2870 break;
2871 case 's':
ffc61ed2 2872 ret = reg_node(pRExC_state, LOC ? SPACEL : SPACE);
a0d0e21e 2873 *flagp |= HASWIDTH|SIMPLE;
830247a4 2874 nextchar(pRExC_state);
fac92740 2875 Set_Node_Length(ret, 2); /* MJD */
a0d0e21e
LW
2876 break;
2877 case 'S':
ffc61ed2 2878 ret = reg_node(pRExC_state, LOC ? NSPACEL : NSPACE);
a0d0e21e 2879 *flagp |= HASWIDTH|SIMPLE;
830247a4 2880 nextchar(pRExC_state);
fac92740 2881 Set_Node_Length(ret, 2); /* MJD */
a0d0e21e
LW
2882 break;
2883 case 'd':
ffc61ed2 2884 ret = reg_node(pRExC_state, DIGIT);
a0d0e21e 2885 *flagp |= HASWIDTH|SIMPLE;
830247a4 2886 nextchar(pRExC_state);
fac92740 2887 Set_Node_Length(ret, 2); /* MJD */
a0d0e21e
LW
2888 break;
2889 case 'D':
ffc61ed2 2890 ret = reg_node(pRExC_state, NDIGIT);
a0d0e21e 2891 *flagp |= HASWIDTH|SIMPLE;
830247a4 2892 nextchar(pRExC_state);
fac92740 2893 Set_Node_Length(ret, 2); /* MJD */
a0d0e21e 2894 break;
a14b48bc
LW
2895 case 'p':
2896 case 'P':
3568d838 2897 {
830247a4 2898 char* oldregxend = RExC_end;
fac92740 2899 char* parse_start = RExC_parse;
a14b48bc 2900
830247a4 2901 if (RExC_parse[1] == '{') {
3568d838 2902 /* a lovely hack--pretend we saw [\pX] instead */
830247a4
IZ
2903 RExC_end = strchr(RExC_parse, '}');
2904 if (!RExC_end) {
0da60cf5 2905 U8 c = (U8)*RExC_parse;
830247a4
IZ
2906 RExC_parse += 2;
2907 RExC_end = oldregxend;
0da60cf5 2908 vFAIL2("Missing right brace on \\%c{}", c);
b45f050a 2909 }
830247a4 2910 RExC_end++;
a14b48bc
LW
2911 }
2912 else
830247a4
IZ
2913 RExC_end = RExC_parse + 2;
2914 RExC_parse--;
a14b48bc 2915
ffc61ed2 2916 ret = regclass(pRExC_state);
a14b48bc 2917
830247a4
IZ
2918 RExC_end = oldregxend;
2919 RExC_parse--;
fac92740 2920 Set_Node_Cur_Length(ret); /* MJD */
830247a4 2921 nextchar(pRExC_state);
a14b48bc
LW
2922 *flagp |= HASWIDTH|SIMPLE;
2923 }
2924 break;
a0d0e21e
LW
2925 case 'n':
2926 case 'r':
2927 case 't':
2928 case 'f':
2929 case 'e':
2930 case 'a':
2931 case 'x':
2932 case 'c':
2933 case '0':
2934 goto defchar;
2935 case '1': case '2': case '3': case '4':
2936 case '5': case '6': case '7': case '8': case '9':
2937 {
830247a4 2938 I32 num = atoi(RExC_parse);
a0d0e21e 2939
830247a4 2940 if (num > 9 && num >= RExC_npar)
a0d0e21e
LW
2941 goto defchar;
2942 else {
fac92740 2943 char * parse_start = RExC_parse - 1; /* MJD */
830247a4
IZ
2944 while (isDIGIT(*RExC_parse))
2945 RExC_parse++;
b45f050a 2946
830247a4 2947 if (!SIZE_ONLY && num > RExC_rx->nparens)
9baa0206 2948 vFAIL("Reference to nonexistent group");
830247a4
IZ
2949 RExC_sawback = 1;
2950 ret = reganode(pRExC_state, FOLD
a0ed51b3 2951 ? (LOC ? REFFL : REFF)
c8756f30 2952 : REF, num);
a0d0e21e 2953 *flagp |= HASWIDTH;
fac92740
MJD
2954
2955 /* override incorrect value set in reganode MJD */
2956 Set_Node_Offset(ret, parse_start+1);
2957 Set_Node_Cur_Length(ret); /* MJD */
830247a4
IZ
2958 RExC_parse--;
2959 nextchar(pRExC_state);
a0d0e21e
LW
2960 }
2961 }
2962 break;
2963 case '\0':
830247a4 2964 if (RExC_parse >= RExC_end)
b45f050a 2965 FAIL("Trailing \\");
a0d0e21e
LW
2966 /* FALL THROUGH */
2967 default:
c9f97d15
IZ
2968 /* Do not generate `unrecognized' warnings here, we fall
2969 back into the quick-grab loop below */
a0d0e21e
LW
2970 goto defchar;
2971 }
2972 break;
4633a7c4
LW
2973
2974 case '#':
830247a4
IZ
2975 if (RExC_flags16 & PMf_EXTENDED) {
2976 while (RExC_parse < RExC_end && *RExC_parse != '\n') RExC_parse++;
2977 if (RExC_parse < RExC_end)
4633a7c4
LW
2978 goto tryagain;
2979 }
2980 /* FALL THROUGH */
2981
a0d0e21e 2982 default: {
ba210ebe 2983 register STRLEN len;
a0ed51b3 2984 register UV ender;
a0d0e21e 2985 register char *p;
c277df42 2986 char *oldp, *s;
ba210ebe 2987 STRLEN numlen;
a2a2844f 2988 STRLEN ulen;
e7ae6809 2989 U8 tmpbuf[UTF8_MAXLEN_UCLC+1];
f06dbbb7
JH
2990
2991 parse_start = RExC_parse - 1;
a0d0e21e 2992
830247a4 2993 RExC_parse++;
a0d0e21e
LW
2994
2995 defchar:
830247a4 2996 ret = reg_node(pRExC_state, FOLD
a0ed51b3 2997 ? (LOC ? EXACTFL : EXACTF)
bbce6d69 2998 : EXACT);
cd439c50 2999 s = STRING(ret);
830247a4
IZ
3000 for (len = 0, p = RExC_parse - 1;
3001 len < 127 && p < RExC_end;
a0d0e21e
LW
3002 len++)
3003 {
3004 oldp = p;
5b5a24f7 3005
830247a4
IZ
3006 if (RExC_flags16 & PMf_EXTENDED)
3007 p = regwhite(p, RExC_end);
a0d0e21e
LW
3008 switch (*p) {
3009 case '^':
3010 case '$':
3011 case '.':
3012 case '[':
3013 case '(':
3014 case ')':
3015 case '|':
3016 goto loopdone;
3017 case '\\':
3018 switch (*++p) {
3019 case 'A':
1ed8eac0
JF
3020 case 'C':
3021 case 'X':
a0d0e21e
LW
3022 case 'G':
3023 case 'Z':
b85d18e9 3024 case 'z':
a0d0e21e
LW
3025 case 'w':
3026 case 'W':
3027 case 'b':
3028 case 'B':
3029 case 's':
3030 case 'S':
3031 case 'd':
3032 case 'D':
a14b48bc
LW
3033 case 'p':
3034 case 'P':
a0d0e21e
LW
3035 --p;
3036 goto loopdone;
3037 case 'n':
3038 ender = '\n';
3039 p++;
a687059c 3040 break;
a0d0e21e
LW
3041 case 'r':
3042 ender = '\r';
3043 p++;
a687059c 3044 break;
a0d0e21e
LW
3045 case 't':
3046 ender = '\t';
3047 p++;
a687059c 3048 break;
a0d0e21e
LW
3049 case 'f':
3050 ender = '\f';
3051 p++;
a687059c 3052 break;
a0d0e21e 3053 case 'e':
c7f1f016 3054 ender = ASCII_TO_NATIVE('\033');
a0d0e21e 3055 p++;
a687059c 3056 break;
a0d0e21e 3057 case 'a':
c7f1f016 3058 ender = ASCII_TO_NATIVE('\007');
a0d0e21e 3059 p++;
a687059c 3060 break;
a0d0e21e 3061 case 'x':
a0ed51b3
LW
3062 if (*++p == '{') {
3063 char* e = strchr(p, '}');
b81d288d 3064
b45f050a 3065 if (!e) {
830247a4 3066 RExC_parse = p + 1;
b45f050a
JF
3067 vFAIL("Missing right brace on \\x{}");
3068 }
de5f0749 3069 else {
a4c04bdc
NC
3070 I32 flags = PERL_SCAN_ALLOW_UNDERSCORES
3071 | PERL_SCAN_DISALLOW_PREFIX;
53305cf1
NC
3072 numlen = e - p - 1;
3073 ender = grok_hex(p + 1, &numlen, &flags, NULL);
aaa80028
JH
3074 if (ender > 0xff)
3075 RExC_utf8 = 1;
b21ed0a9
GS
3076 /* numlen is generous */
3077 if (numlen + len >= 127) {
a0ed51b3
LW
3078 p--;
3079 goto loopdone;
3080 }
3081 p = e + 1;
3082 }
a0ed51b3
LW
3083 }
3084 else {
a4c04bdc 3085 I32 flags = PERL_SCAN_DISALLOW_PREFIX;
53305cf1
NC
3086 numlen = 2;
3087 ender = grok_hex(p, &numlen, &flags, NULL);
a0ed51b3
LW
3088 p += numlen;
3089 }
a687059c 3090 break;
a0d0e21e
LW
3091 case 'c':
3092 p++;
bbce6d69 3093 ender = UCHARAT(p++);
3094 ender = toCTRL(ender);
a687059c 3095 break;
a0d0e21e
LW
3096 case '0': case '1': case '2': case '3':case '4':
3097 case '5': case '6': case '7': case '8':case '9':
3098 if (*p == '0' ||
830247a4 3099 (isDIGIT(p[1]) && atoi(p) >= RExC_npar) ) {
53305cf1
NC
3100 I32 flags = 0;
3101 numlen = 3;
3102 ender = grok_oct(p, &numlen, &flags, NULL);
a0d0e21e
LW
3103 p += numlen;
3104 }
3105 else {
3106 --p;
3107 goto loopdone;
a687059c
LW
3108 }
3109 break;
a0d0e21e 3110 case '\0':
830247a4 3111 if (p >= RExC_end)
b45f050a 3112 FAIL("Trailing \\");
a687059c 3113 /* FALL THROUGH */
a0d0e21e 3114 default:
e476b1b5 3115 if (!SIZE_ONLY && ckWARN(WARN_REGEXP) && isALPHA(*p))
4193bef7 3116 vWARN2(p + 1, "Unrecognized escape \\%c passed through", UCHARAT(p));
a0ed51b3 3117 goto normal_default;
a0d0e21e
LW
3118 }
3119 break;
a687059c 3120 default:
a0ed51b3 3121 normal_default:
fd400ab9 3122 if (UTF8_IS_START(*p) && UTF) {
5e12f4fb 3123 ender = utf8n_to_uvchr((U8*)p, RExC_end - p,
ba210ebe 3124 &numlen, 0);
a0ed51b3
LW
3125 p += numlen;
3126 }
3127 else
3128 ender = *p++;
a0d0e21e 3129 break;
a687059c 3130 }
830247a4
IZ
3131 if (RExC_flags16 & PMf_EXTENDED)
3132 p = regwhite(p, RExC_end);
a0ed51b3 3133 if (UTF && FOLD) {
22c54be3 3134 toFOLD_uni(ender, tmpbuf, &ulen);
a2a2844f 3135 ender = utf8_to_uvchr(tmpbuf, 0);
a0ed51b3 3136 }
a0d0e21e
LW
3137 if (ISMULT2(p)) { /* Back off on ?+*. */
3138 if (len)
3139 p = oldp;
2b9d42f0 3140 else if (!UNI_IS_INVARIANT(NATIVE_TO_UNI(ender)) && UTF) {
830247a4 3141 reguni(pRExC_state, ender, s, &numlen);
a0ed51b3
LW
3142 s += numlen;
3143 len += numlen;
3144 }
a0d0e21e
LW
3145 else {
3146 len++;
cd439c50 3147 REGC(ender, s++);
a0d0e21e
LW
3148 }
3149 break;
a687059c 3150 }
2b9d42f0 3151 if (!UNI_IS_INVARIANT(NATIVE_TO_UNI(ender)) && UTF) {
830247a4 3152 reguni(pRExC_state, ender, s, &numlen);
a0ed51b3
LW
3153 s += numlen;
3154 len += numlen - 1;
3155 }
3156 else
cd439c50 3157 REGC(ender, s++);
a0d0e21e
LW
3158 }
3159 loopdone:
830247a4 3160 RExC_parse = p - 1;
fac92740 3161 Set_Node_Cur_Length(ret); /* MJD */
830247a4 3162 nextchar(pRExC_state);
793db0cb
JH
3163 {
3164 /* len is STRLEN which is unsigned, need to copy to signed */
3165 IV iv = len;
3166 if (iv < 0)
3167 vFAIL("Internal disaster");
3168 }
a0d0e21e
LW
3169 if (len > 0)
3170 *flagp |= HASWIDTH;
3171 if (len == 1)
3172 *flagp |= SIMPLE;
c277df42 3173 if (!SIZE_ONLY)
cd439c50
IZ
3174 STR_LEN(ret) = len;
3175 if (SIZE_ONLY)
830247a4 3176 RExC_size += STR_SZ(len);
cd439c50 3177 else
830247a4 3178 RExC_emit += STR_SZ(len);
a687059c 3179 }
a0d0e21e
LW
3180 break;
3181 }
a687059c 3182
22c54be3 3183 if (PL_encoding && PL_regkind[(U8)OP(ret)] == EXACT) {
a72c7584
JH
3184 STRLEN oldlen = STR_LEN(ret);
3185 SV *sv = sv_2mortal(newSVpvn(STRING(ret), oldlen));
22c54be3
JH
3186
3187 if (RExC_utf8)
3188 SvUTF8_on(sv);
3189 if (sv_utf8_downgrade(sv, TRUE)) {
3190 char *s = Perl_sv_recode_to_utf8(aTHX_ sv, PL_encoding);
3191 STRLEN newlen = SvCUR(sv);
3192
3193 if (!SIZE_ONLY) {
3194 DEBUG_r(PerlIO_printf(Perl_debug_log, "recode %*s to %*s\n",
3195 (int)oldlen, STRING(ret),
3196 (int)newlen, s));
3197 Copy(s, STRING(ret), newlen, char);
3198 STR_LEN(ret) += newlen - oldlen;
3199 RExC_emit += STR_SZ(newlen) - STR_SZ(oldlen);
3200 } else
3201 RExC_size += STR_SZ(newlen) - STR_SZ(oldlen);
3202 }
a72c7584
JH
3203 }
3204
a0d0e21e 3205 return(ret);
a687059c
LW
3206}
3207
873ef191 3208STATIC char *
cea2e8a9 3209S_regwhite(pTHX_ char *p, char *e)
5b5a24f7
CS
3210{
3211 while (p < e) {
3212 if (isSPACE(*p))
3213 ++p;
3214 else if (*p == '#') {
3215 do {
3216 p++;
3217 } while (p < e && *p != '\n');
3218 }
3219 else
3220 break;
3221 }
3222 return p;
3223}
3224
b8c5462f
JH
3225/* Parse POSIX character classes: [[:foo:]], [[=foo=]], [[.foo.]].
3226 Character classes ([:foo:]) can also be negated ([:^foo:]).
3227 Returns a named class id (ANYOF_XXX) if successful, -1 otherwise.
3228 Equivalence classes ([=foo=]) and composites ([.foo.]) are parsed,
beeb77fc 3229 but trigger failures because they are currently unimplemented. */
9a86a77b
JH
3230
3231#define POSIXCC_DONE(c) ((c) == ':')
3232#define POSIXCC_NOTYET(c) ((c) == '=' || (c) == '.')
3233#define POSIXCC(c) (POSIXCC_DONE(c) || POSIXCC_NOTYET(c))
3234
b8c5462f 3235STATIC I32
830247a4 3236S_regpposixcc(pTHX_ RExC_state_t *pRExC_state, I32 value)
620e46c5
JH
3237{
3238 char *posixcc = 0;
936ed897 3239 I32 namedclass = OOB_NAMEDCLASS;
620e46c5 3240
830247a4 3241 if (value == '[' && RExC_parse + 1 < RExC_end &&
620e46c5 3242 /* I smell either [: or [= or [. -- POSIX has been here, right? */
9a86a77b
JH
3243 POSIXCC(UCHARAT(RExC_parse))) {
3244 char c = UCHARAT(RExC_parse);
830247a4 3245 char* s = RExC_parse++;
b81d288d 3246
9a86a77b 3247 while (RExC_parse < RExC_end && UCHARAT(RExC_parse) != c)
830247a4
IZ
3248 RExC_parse++;
3249 if (RExC_parse == RExC_end)
620e46c5 3250 /* Grandfather lone [:, [=, [. */
830247a4 3251 RExC_parse = s;
620e46c5 3252 else {
830247a4 3253 char* t = RExC_parse++; /* skip over the c */
b8c5462f 3254
9a86a77b 3255 if (UCHARAT(RExC_parse) == ']') {
830247a4 3256 RExC_parse++; /* skip over the ending ] */
b8c5462f
JH
3257 posixcc = s + 1;
3258 if (*s == ':') {
3259 I32 complement = *posixcc == '^' ? *posixcc++ : 0;
3260 I32 skip = 5; /* the most common skip */
3261
3262 switch (*posixcc) {
3263 case 'a':
3264 if (strnEQ(posixcc, "alnum", 5))
3265 namedclass =
3266 complement ? ANYOF_NALNUMC : ANYOF_ALNUMC;
3267 else if (strnEQ(posixcc, "alpha", 5))
3268 namedclass =
3269 complement ? ANYOF_NALPHA : ANYOF_ALPHA;
3270 else if (strnEQ(posixcc, "ascii", 5))
3271 namedclass =
3272 complement ? ANYOF_NASCII : ANYOF_ASCII;
3273 break;
aaa51d5e
JF
3274 case 'b':
3275 if (strnEQ(posixcc, "blank", 5))
3276 namedclass =
3277 complement ? ANYOF_NBLANK : ANYOF_BLANK;
3278 break;
b8c5462f
JH
3279 case 'c':
3280 if (strnEQ(posixcc, "cntrl", 5))
3281 namedclass =
3282 complement ? ANYOF_NCNTRL : ANYOF_CNTRL;
3283 break;
3284 case 'd':
3285 if (strnEQ(posixcc, "digit", 5))
3286 namedclass =
3287 complement ? ANYOF_NDIGIT : ANYOF_DIGIT;
3288 break;
3289 case 'g':
3290 if (strnEQ(posixcc, "graph", 5))
3291 namedclass =
3292 complement ? ANYOF_NGRAPH : ANYOF_GRAPH;
3293 break;
3294 case 'l':
3295 if (strnEQ(posixcc, "lower", 5))
3296 namedclass =
3297 complement ? ANYOF_NLOWER : ANYOF_LOWER;
3298 break;
3299 case 'p':
3300 if (strnEQ(posixcc, "print", 5))
3301 namedclass =
3302 complement ? ANYOF_NPRINT : ANYOF_PRINT;
3303 else if (strnEQ(posixcc, "punct", 5))
3304 namedclass =
3305 complement ? ANYOF_NPUNCT : ANYOF_PUNCT;
3306 break;
3307 case 's':
3308 if (strnEQ(posixcc, "space", 5))
3309 namedclass =
aaa51d5e 3310 complement ? ANYOF_NPSXSPC : ANYOF_PSXSPC;
cc4319de 3311 break;
b8c5462f
JH
3312 case 'u':
3313 if (strnEQ(posixcc, "upper", 5))
3314 namedclass =
3315 complement ? ANYOF_NUPPER : ANYOF_UPPER;
3316 break;
3317 case 'w': /* this is not POSIX, this is the Perl \w */
3318 if (strnEQ(posixcc, "word", 4)) {
3319 namedclass =
3320 complement ? ANYOF_NALNUM : ANYOF_ALNUM;
3321 skip = 4;
3322 }
3323 break;
3324 case 'x':
3325 if (strnEQ(posixcc, "xdigit", 6)) {
3326 namedclass =
3327 complement ? ANYOF_NXDIGIT : ANYOF_XDIGIT;
3328 skip = 6;
3329 }
3330 break;
3331 }
ac561586
JH
3332 if (namedclass == OOB_NAMEDCLASS ||
3333 posixcc[skip] != ':' ||
3334 posixcc[skip+1] != ']')
b45f050a
JF
3335 {
3336 Simple_vFAIL3("POSIX class [:%.*s:] unknown",
3337 t - s - 1, s + 1);
3338 }
3339 } else if (!SIZE_ONLY) {
b8c5462f 3340 /* [[=foo=]] and [[.foo.]] are still future. */
b45f050a 3341
830247a4 3342 /* adjust RExC_parse so the warning shows after
b45f050a 3343 the class closes */
9a86a77b 3344 while (UCHARAT(RExC_parse) && UCHARAT(RExC_parse) != ']')
830247a4 3345 RExC_parse++;
b45f050a
JF
3346 Simple_vFAIL3("POSIX syntax [%c %c] is reserved for future extensions", c, c);
3347 }
b8c5462f
JH
3348 } else {
3349 /* Maternal grandfather:
3350 * "[:" ending in ":" but not in ":]" */
830247a4 3351 RExC_parse = s;
767d463e 3352 }
620e46c5
JH
3353 }
3354 }
3355
b8c5462f
JH
3356 return namedclass;
3357}
3358
3359STATIC void
830247a4 3360S_checkposixcc(pTHX_ RExC_state_t *pRExC_state)
b8c5462f 3361{
e476b1b5 3362 if (!SIZE_ONLY && ckWARN(WARN_REGEXP) &&
9a86a77b 3363 POSIXCC(UCHARAT(RExC_parse))) {
830247a4 3364 char *s = RExC_parse;
93733859 3365 char c = *s++;
b8c5462f
JH
3366
3367 while(*s && isALNUM(*s))
3368 s++;
3369 if (*s && c == *s && s[1] == ']') {
b45f050a
JF
3370 vWARN3(s+2, "POSIX syntax [%c %c] belongs inside character classes", c, c);
3371
3372 /* [[=foo=]] and [[.foo.]] are still future. */
9a86a77b 3373 if (POSIXCC_NOTYET(c)) {
830247a4 3374 /* adjust RExC_parse so the error shows after
b45f050a 3375 the class closes */
9a86a77b 3376 while (UCHARAT(RExC_parse) && UCHARAT(RExC_parse++) != ']')
b45f050a
JF
3377 ;
3378 Simple_vFAIL3("POSIX syntax [%c %c] is reserved for future extensions", c, c);
3379 }
b8c5462f
JH
3380 }
3381 }
620e46c5
JH
3382}
3383
76e3520e 3384STATIC regnode *
830247a4 3385S_regclass(pTHX_ RExC_state_t *pRExC_state)
a687059c 3386{
ffc61ed2 3387 register UV value;
9a86a77b 3388 register UV nextvalue;
3568d838 3389 register IV prevvalue = OOB_UNICODE;
ffc61ed2 3390 register IV range = 0;
c277df42 3391 register regnode *ret;
ba210ebe 3392 STRLEN numlen;
ffc61ed2 3393 IV namedclass;
9c5ffd7c 3394 char *rangebegin = 0;
936ed897 3395 bool need_class = 0;
9c5ffd7c 3396 SV *listsv = Nullsv;
ffc61ed2
JH
3397 register char *e;
3398 UV n;
3568d838 3399 bool optimize_invert = TRUE;
ffc61ed2
JH
3400
3401 ret = reganode(pRExC_state, ANYOF, 0);
3402
3403 if (!SIZE_ONLY)
3404 ANYOF_FLAGS(ret) = 0;
3405
9a86a77b 3406 if (UCHARAT(RExC_parse) == '^') { /* Complement of range. */
ffc61ed2
JH
3407 RExC_naughty++;
3408 RExC_parse++;
3409 if (!SIZE_ONLY)
3410 ANYOF_FLAGS(ret) |= ANYOF_INVERT;
3411 }
a0d0e21e 3412
936ed897 3413 if (SIZE_ONLY)
830247a4 3414 RExC_size += ANYOF_SKIP;
936ed897 3415 else {
830247a4 3416 RExC_emit += ANYOF_SKIP;
936ed897
IZ
3417 if (FOLD)
3418 ANYOF_FLAGS(ret) |= ANYOF_FOLD;
3419 if (LOC)
3420 ANYOF_FLAGS(ret) |= ANYOF_LOCALE;
ffc61ed2
JH
3421 ANYOF_BITMAP_ZERO(ret);
3422 listsv = newSVpvn("# comment\n", 10);
a0d0e21e 3423 }
b8c5462f 3424
9a86a77b
JH
3425 nextvalue = RExC_parse < RExC_end ? UCHARAT(RExC_parse) : 0;
3426
3427 if (!SIZE_ONLY && ckWARN(WARN_REGEXP) && POSIXCC(nextvalue))
830247a4 3428 checkposixcc(pRExC_state);
b8c5462f 3429
9a86a77b 3430 if (UCHARAT(RExC_parse) == ']' || UCHARAT(RExC_parse) == '-')
ffc61ed2
JH
3431 goto charclassloop; /* allow 1st char to be ] or - */
3432
9a86a77b 3433 while (RExC_parse < RExC_end && UCHARAT(RExC_parse) != ']') {
ffc61ed2
JH
3434
3435 charclassloop:
3436
3437 namedclass = OOB_NAMEDCLASS; /* initialize as illegal */
3438
73b437c8 3439 if (!range)
830247a4 3440 rangebegin = RExC_parse;
ffc61ed2 3441 if (UTF) {
5e12f4fb 3442 value = utf8n_to_uvchr((U8*)RExC_parse,
3568d838
JH
3443 RExC_end - RExC_parse,
3444 &numlen, 0);
ffc61ed2
JH
3445 RExC_parse += numlen;
3446 }
3447 else
3448 value = UCHARAT(RExC_parse++);
9a86a77b
JH
3449 nextvalue = RExC_parse < RExC_end ? UCHARAT(RExC_parse) : 0;
3450 if (value == '[' && POSIXCC(nextvalue))
830247a4 3451 namedclass = regpposixcc(pRExC_state, value);
620e46c5 3452 else if (value == '\\') {
ffc61ed2 3453 if (UTF) {
5e12f4fb 3454 value = utf8n_to_uvchr((U8*)RExC_parse,
ffc61ed2
JH
3455 RExC_end - RExC_parse,
3456 &numlen, 0);
3457 RExC_parse += numlen;
3458 }
3459 else
3460 value = UCHARAT(RExC_parse++);
470c3474 3461 /* Some compilers cannot handle switching on 64-bit integer
ffc61ed2 3462 * values, therefore value cannot be an UV. Yes, this will
e2962f66
JH
3463 * be a problem later if we want switch on Unicode.
3464 * A similar issue a little bit later when switching on
3465 * namedclass. --jhi */
ffc61ed2 3466 switch ((I32)value) {
b8c5462f
JH
3467 case 'w': namedclass = ANYOF_ALNUM; break;
3468 case 'W': namedclass = ANYOF_NALNUM; break;
3469 case 's': namedclass = ANYOF_SPACE; break;
3470 case 'S': namedclass = ANYOF_NSPACE; break;
3471 case 'd': namedclass = ANYOF_DIGIT; break;
3472 case 'D': namedclass = ANYOF_NDIGIT; break;
ffc61ed2
JH
3473 case 'p':
3474 case 'P':
3475 if (*RExC_parse == '{') {
0da60cf5 3476 U8 c = (U8)value;
ffc61ed2
JH
3477 e = strchr(RExC_parse++, '}');
3478 if (!e)
0da60cf5 3479 vFAIL2("Missing right brace on \\%c{}", c);
ab13f0c7
JH
3480 while (isSPACE(UCHARAT(RExC_parse)))
3481 RExC_parse++;
3482 if (e == RExC_parse)
0da60cf5 3483 vFAIL2("Empty \\%c{}", c);
ffc61ed2 3484 n = e - RExC_parse;
ab13f0c7
JH
3485 while (isSPACE(UCHARAT(RExC_parse + n - 1)))
3486 n--;
ffc61ed2
JH
3487 }
3488 else {
3489 e = RExC_parse;
3490 n = 1;
3491 }
3492 if (!SIZE_ONLY) {
ab13f0c7
JH
3493 if (UCHARAT(RExC_parse) == '^') {
3494 RExC_parse++;
3495 n--;
3496 value = value == 'p' ? 'P' : 'p'; /* toggle */
3497 while (isSPACE(UCHARAT(RExC_parse))) {
3498 RExC_parse++;
3499 n--;
3500 }
3501 }
ffc61ed2 3502 if (value == 'p')
ab13f0c7
JH
3503 Perl_sv_catpvf(aTHX_ listsv,
3504 "+utf8::%.*s\n", (int)n, RExC_parse);
ffc61ed2 3505 else
ab13f0c7
JH
3506 Perl_sv_catpvf(aTHX_ listsv,
3507 "!utf8::%.*s\n", (int)n, RExC_parse);
ffc61ed2
JH
3508 }
3509 RExC_parse = e + 1;
3510 ANYOF_FLAGS(ret) |= ANYOF_UNICODE;
3511 continue;
b8c5462f
JH
3512 case 'n': value = '\n'; break;
3513 case 'r': value = '\r'; break;
3514 case 't': value = '\t'; break;
3515 case 'f': value = '\f'; break;
3516 case 'b': value = '\b'; break;
c7f1f016
NIS
3517 case 'e': value = ASCII_TO_NATIVE('\033');break;
3518 case 'a': value = ASCII_TO_NATIVE('\007');break;
b8c5462f 3519 case 'x':
ffc61ed2 3520 if (*RExC_parse == '{') {
a4c04bdc
NC
3521 I32 flags = PERL_SCAN_ALLOW_UNDERSCORES
3522 | PERL_SCAN_DISALLOW_PREFIX;
ffc61ed2 3523 e = strchr(RExC_parse++, '}');
b81d288d 3524 if (!e)
ffc61ed2 3525 vFAIL("Missing right brace on \\x{}");
53305cf1
NC
3526
3527 numlen = e - RExC_parse;
3528 value = grok_hex(RExC_parse, &numlen, &flags, NULL);
ffc61ed2
JH
3529 RExC_parse = e + 1;
3530 }
3531 else {
a4c04bdc 3532 I32 flags = PERL_SCAN_DISALLOW_PREFIX;
53305cf1
NC
3533 numlen = 2;
3534 value = grok_hex(RExC_parse, &numlen, &flags, NULL);
ffc61ed2
JH
3535 RExC_parse += numlen;
3536 }
b8c5462f
JH
3537 break;
3538 case 'c':
830247a4 3539 value = UCHARAT(RExC_parse++);
b8c5462f
JH
3540 value = toCTRL(value);
3541 break;
3542 case '0': case '1': case '2': case '3': case '4':
3543 case '5': case '6': case '7': case '8': case '9':
53305cf1
NC
3544 {
3545 I32 flags = 0;
3546 numlen = 3;
3547 value = grok_oct(--RExC_parse, &numlen, &flags, NULL);
830247a4 3548 RExC_parse += numlen;
b8c5462f 3549 break;
53305cf1 3550 }
1028017a 3551 default:
e476b1b5 3552 if (!SIZE_ONLY && ckWARN(WARN_REGEXP) && isALPHA(value))
ffc61ed2
JH
3553 vWARN2(RExC_parse,
3554 "Unrecognized escape \\%c in character class passed through",
3555 (int)value);
1028017a 3556 break;
b8c5462f 3557 }
ffc61ed2
JH
3558 } /* end of \blah */
3559
3560 if (namedclass > OOB_NAMEDCLASS) { /* this is a named class \blah */
3561
3562 if (!SIZE_ONLY && !need_class)
936ed897 3563 ANYOF_CLASS_ZERO(ret);
ffc61ed2 3564
936ed897 3565 need_class = 1;
ffc61ed2
JH
3566
3567 /* a bad range like a-\d, a-[:digit:] ? */
3568 if (range) {
73b437c8 3569 if (!SIZE_ONLY) {
e476b1b5 3570 if (ckWARN(WARN_REGEXP))
830247a4 3571 vWARN4(RExC_parse,
b45f050a 3572 "False [] range \"%*.*s\"",
830247a4
IZ
3573 RExC_parse - rangebegin,
3574 RExC_parse - rangebegin,
b45f050a 3575 rangebegin);
3568d838
JH
3576 if (prevvalue < 256) {
3577 ANYOF_BITMAP_SET(ret, prevvalue);
ffc61ed2
JH
3578 ANYOF_BITMAP_SET(ret, '-');
3579 }
3580 else {
3581 ANYOF_FLAGS(ret) |= ANYOF_UNICODE;
3582 Perl_sv_catpvf(aTHX_ listsv,
3568d838 3583 "%04"UVxf"\n%04"UVxf"\n", (UV)prevvalue, (UV) '-');
ffc61ed2 3584 }
b8c5462f 3585 }
ffc61ed2
JH
3586
3587 range = 0; /* this was not a true range */
73b437c8 3588 }
ffc61ed2 3589
73b437c8 3590 if (!SIZE_ONLY) {
3568d838
JH
3591 if (namedclass > OOB_NAMEDCLASS)
3592 optimize_invert = FALSE;
e2962f66
JH
3593 /* Possible truncation here but in some 64-bit environments
3594 * the compiler gets heartburn about switch on 64-bit values.
3595 * A similar issue a little earlier when switching on value.
98f323fa 3596 * --jhi */
e2962f66 3597 switch ((I32)namedclass) {
73b437c8
JH
3598 case ANYOF_ALNUM:
3599 if (LOC)
936ed897 3600 ANYOF_CLASS_SET(ret, ANYOF_ALNUM);
73b437c8
JH
3601 else {
3602 for (value = 0; value < 256; value++)
3603 if (isALNUM(value))
936ed897 3604 ANYOF_BITMAP_SET(ret, value);
73b437c8 3605 }
ffc61ed2 3606 Perl_sv_catpvf(aTHX_ listsv, "+utf8::IsWord\n");
73b437c8
JH
3607 break;
3608 case ANYOF_NALNUM:
3609 if (LOC)
936ed897 3610 ANYOF_CLASS_SET(ret, ANYOF_NALNUM);
73b437c8
JH
3611 else {
3612 for (value = 0; value < 256; value++)
3613 if (!isALNUM(value))
936ed897 3614 ANYOF_BITMAP_SET(ret, value);
73b437c8 3615 }
ffc61ed2 3616 Perl_sv_catpvf(aTHX_ listsv, "!utf8::IsWord\n");
73b437c8 3617 break;
ffc61ed2 3618 case ANYOF_ALNUMC:
73b437c8 3619 if (LOC)
ffc61ed2 3620 ANYOF_CLASS_SET(ret, ANYOF_ALNUMC);
73b437c8
JH
3621 else {
3622 for (value = 0; value < 256; value++)
ffc61ed2 3623 if (isALNUMC(value))
936ed897 3624 ANYOF_BITMAP_SET(ret, value);
73b437c8 3625 }
ffc61ed2 3626 Perl_sv_catpvf(aTHX_ listsv, "+utf8::IsAlnum\n");
73b437c8
JH
3627 break;
3628 case ANYOF_NALNUMC:
3629 if (LOC)
936ed897 3630 ANYOF_CLASS_SET(ret, ANYOF_NALNUMC);
73b437c8
JH
3631 else {
3632 for (value = 0; value < 256; value++)
3633 if (!isALNUMC(value))
936ed897 3634 ANYOF_BITMAP_SET(ret, value);
73b437c8 3635 }
ffc61ed2 3636 Perl_sv_catpvf(aTHX_ listsv, "!utf8::IsAlnum\n");
73b437c8
JH
3637 break;
3638 case ANYOF_ALPHA:
3639 if (LOC)
936ed897 3640 ANYOF_CLASS_SET(ret, ANYOF_ALPHA);
73b437c8
JH
3641 else {
3642 for (value = 0; value < 256; value++)
3643 if (isALPHA(value))
936ed897 3644 ANYOF_BITMAP_SET(ret, value);
73b437c8 3645 }
ffc61ed2 3646 Perl_sv_catpvf(aTHX_ listsv, "+utf8::IsAlpha\n");
73b437c8
JH
3647 break;
3648 case ANYOF_NALPHA:
3649 if (LOC)
936ed897 3650 ANYOF_CLASS_SET(ret, ANYOF_NALPHA);
73b437c8
JH
3651 else {
3652 for (value = 0; value < 256; value++)
3653 if (!isALPHA(value))
936ed897 3654 ANYOF_BITMAP_SET(ret, value);
73b437c8 3655 }
ffc61ed2 3656 Perl_sv_catpvf(aTHX_ listsv, "!utf8::IsAlpha\n");
73b437c8
JH
3657 break;
3658 case ANYOF_ASCII:
3659 if (LOC)
936ed897 3660 ANYOF_CLASS_SET(ret, ANYOF_ASCII);
73b437c8 3661 else {
c7f1f016 3662#ifndef EBCDIC
1ba5c669
JH
3663 for (value = 0; value < 128; value++)
3664 ANYOF_BITMAP_SET(ret, value);
3665#else /* EBCDIC */
ffbc6a93 3666 for (value = 0; value < 256; value++) {
3a3c4447
JH
3667 if (isASCII(value))
3668 ANYOF_BITMAP_SET(ret, value);
ffbc6a93 3669 }
1ba5c669 3670#endif /* EBCDIC */
73b437c8 3671 }
ffc61ed2 3672 Perl_sv_catpvf(aTHX_ listsv, "+utf8::IsASCII\n");
73b437c8
JH
3673 break;
3674 case ANYOF_NASCII:
3675 if (LOC)
936ed897 3676 ANYOF_CLASS_SET(ret, ANYOF_NASCII);
73b437c8 3677 else {
c7f1f016 3678#ifndef EBCDIC
1ba5c669
JH
3679 for (value = 128; value < 256; value++)
3680 ANYOF_BITMAP_SET(ret, value);
3681#else /* EBCDIC */
ffbc6a93 3682 for (value = 0; value < 256; value++) {
3a3c4447
JH
3683 if (!isASCII(value))
3684 ANYOF_BITMAP_SET(ret, value);
ffbc6a93 3685 }
1ba5c669 3686#endif /* EBCDIC */
73b437c8 3687 }
ffc61ed2 3688 Perl_sv_catpvf(aTHX_ listsv, "!utf8::IsASCII\n");
73b437c8 3689 break;
aaa51d5e
JF
3690 case ANYOF_BLANK:
3691 if (LOC)
3692 ANYOF_CLASS_SET(ret, ANYOF_BLANK);
3693 else {
3694 for (value = 0; value < 256; value++)
3695 if (isBLANK(value))
3696 ANYOF_BITMAP_SET(ret, value);
3697 }
ffc61ed2 3698 Perl_sv_catpvf(aTHX_ listsv, "+utf8::IsBlank\n");
aaa51d5e
JF
3699 break;
3700 case ANYOF_NBLANK:
3701 if (LOC)
3702 ANYOF_CLASS_SET(ret, ANYOF_NBLANK);
3703 else {
3704 for (value = 0; value < 256; value++)
3705 if (!isBLANK(value))
3706 ANYOF_BITMAP_SET(ret, value);
3707 }
ffc61ed2 3708 Perl_sv_catpvf(aTHX_ listsv, "!utf8::IsBlank\n");
aaa51d5e 3709 break;
73b437c8
JH
3710 case ANYOF_CNTRL:
3711 if (LOC)
936ed897 3712 ANYOF_CLASS_SET(ret, ANYOF_CNTRL);
73b437c8
JH
3713 else {
3714 for (value = 0; value < 256; value++)
3715 if (isCNTRL(value))
936ed897 3716 ANYOF_BITMAP_SET(ret, value);
73b437c8 3717 }
ffc61ed2 3718 Perl_sv_catpvf(aTHX_ listsv, "+utf8::IsCntrl\n");
73b437c8
JH
3719 break;
3720 case ANYOF_NCNTRL:
3721 if (LOC)
936ed897 3722 ANYOF_CLASS_SET(ret, ANYOF_NCNTRL);
73b437c8
JH
3723 else {
3724 for (value = 0; value < 256; value++)
3725 if (!isCNTRL(value))
936ed897 3726 ANYOF_BITMAP_SET(ret, value);
73b437c8 3727 }
ffc61ed2
JH
3728 Perl_sv_catpvf(aTHX_ listsv, "!utf8::IsCntrl\n");
3729 break;
3730 case ANYOF_DIGIT:
3731 if (LOC)
3732 ANYOF_CLASS_SET(ret, ANYOF_DIGIT);
3733 else {
3734 /* consecutive digits assumed */
3735 for (value = '0'; value <= '9'; value++)
3736 ANYOF_BITMAP_SET(ret, value);
3737 }
3738 Perl_sv_catpvf(aTHX_ listsv, "+utf8::IsDigit\n");
3739 break;
3740 case ANYOF_NDIGIT:
3741 if (LOC)
3742 ANYOF_CLASS_SET(ret, ANYOF_NDIGIT);
3743 else {
3744 /* consecutive digits assumed */
3745 for (value = 0; value < '0'; value++)
3746 ANYOF_BITMAP_SET(ret, value);
3747 for (value = '9' + 1; value < 256; value++)
3748 ANYOF_BITMAP_SET(ret, value);
3749 }
3750 Perl_sv_catpvf(aTHX_ listsv, "!utf8::IsDigit\n");
73b437c8
JH
3751 break;
3752 case ANYOF_GRAPH:
3753 if (LOC)
936ed897 3754 ANYOF_CLASS_SET(ret, ANYOF_GRAPH);
73b437c8
JH
3755 else {
3756 for (value = 0; value < 256; value++)
3757 if (isGRAPH(value))
936ed897 3758 ANYOF_BITMAP_SET(ret, value);
73b437c8 3759 }
ffc61ed2 3760 Perl_sv_catpvf(aTHX_ listsv, "+utf8::IsGraph\n");
73b437c8
JH
3761 break;
3762 case ANYOF_NGRAPH:
3763 if (LOC)
936ed897 3764 ANYOF_CLASS_SET(ret, ANYOF_NGRAPH);
73b437c8
JH
3765 else {
3766 for (value = 0; value < 256; value++)
3767 if (!isGRAPH(value))
936ed897 3768 ANYOF_BITMAP_SET(ret, value);
73b437c8 3769 }
ffc61ed2 3770 Perl_sv_catpvf(aTHX_ listsv, "!utf8::IsGraph\n");
73b437c8
JH
3771 break;
3772 case ANYOF_LOWER:
3773 if (LOC)
936ed897 3774 ANYOF_CLASS_SET(ret, ANYOF_LOWER);
73b437c8
JH
3775 else {
3776 for (value = 0; value < 256; value++)
3777 if (isLOWER(value))
936ed897 3778 ANYOF_BITMAP_SET(ret, value);
73b437c8 3779 }
ffc61ed2 3780 Perl_sv_catpvf(aTHX_ listsv, "+utf8::IsLower\n");
73b437c8
JH
3781 break;
3782 case ANYOF_NLOWER:
3783 if (LOC)
936ed897 3784 ANYOF_CLASS_SET(ret, ANYOF_NLOWER);
73b437c8
JH
3785 else {
3786 for (value = 0; value < 256; value++)
3787 if (!isLOWER(value))
936ed897 3788 ANYOF_BITMAP_SET(ret, value);
73b437c8 3789 }
ffc61ed2 3790 Perl_sv_catpvf(aTHX_ listsv, "!utf8::IsLower\n");
73b437c8
JH
3791 break;
3792 case ANYOF_PRINT:
3793 if (LOC)
936ed897 3794 ANYOF_CLASS_SET(ret, ANYOF_PRINT);
73b437c8
JH
3795 else {
3796 for (value = 0; value < 256; value++)
3797 if (isPRINT(value))
936ed897 3798 ANYOF_BITMAP_SET(ret, value);
73b437c8 3799 }
ffc61ed2 3800 Perl_sv_catpvf(aTHX_ listsv, "+utf8::IsPrint\n");
73b437c8
JH
3801 break;
3802 case ANYOF_NPRINT:
3803 if (LOC)
936ed897 3804 ANYOF_CLASS_SET(ret, ANYOF_NPRINT);
73b437c8
JH
3805 else {
3806 for (value = 0; value < 256; value++)
3807 if (!isPRINT(value))
936ed897 3808 ANYOF_BITMAP_SET(ret, value);
73b437c8 3809 }
ffc61ed2 3810 Perl_sv_catpvf(aTHX_ listsv, "!utf8::IsPrint\n");
73b437c8 3811 break;
aaa51d5e
JF
3812 case ANYOF_PSXSPC:
3813 if (LOC)
3814 ANYOF_CLASS_SET(ret, ANYOF_PSXSPC);
3815 else {
3816 for (value = 0; value < 256; value++)
3817 if (isPSXSPC(value))
3818 ANYOF_BITMAP_SET(ret, value);
3819 }
ffc61ed2 3820 Perl_sv_catpvf(aTHX_ listsv, "+utf8::IsSpace\n");
aaa51d5e
JF
3821 break;
3822 case ANYOF_NPSXSPC:
3823 if (LOC)
3824 ANYOF_CLASS_SET(ret, ANYOF_NPSXSPC);
3825 else {
3826 for (value = 0; value < 256; value++)
3827 if (!isPSXSPC(value))
3828 ANYOF_BITMAP_SET(ret, value);
3829 }
ffc61ed2 3830 Perl_sv_catpvf(aTHX_ listsv, "!utf8::IsSpace\n");
aaa51d5e 3831 break;
73b437c8
JH
3832 case ANYOF_PUNCT:
3833 if (LOC)
936ed897 3834 ANYOF_CLASS_SET(ret, ANYOF_PUNCT);
73b437c8
JH
3835 else {
3836 for (value = 0; value < 256; value++)
3837 if (isPUNCT(value))
936ed897 3838 ANYOF_BITMAP_SET(ret, value);
73b437c8 3839 }
ffc61ed2 3840 Perl_sv_catpvf(aTHX_ listsv, "+utf8::IsPunct\n");
73b437c8
JH
3841 break;
3842 case ANYOF_NPUNCT:
3843 if (LOC)
936ed897 3844 ANYOF_CLASS_SET(ret, ANYOF_NPUNCT);
73b437c8
JH
3845 else {
3846 for (value = 0; value < 256; value++)
3847 if (!isPUNCT(value))
936ed897 3848 ANYOF_BITMAP_SET(ret, value);
73b437c8 3849 }
ffc61ed2
JH
3850 Perl_sv_catpvf(aTHX_ listsv, "!utf8::IsPunct\n");
3851 break;
3852 case ANYOF_SPACE:
3853 if (LOC)
3854 ANYOF_CLASS_SET(ret, ANYOF_SPACE);
3855 else {
3856 for (value = 0; value < 256; value++)
3857 if (isSPACE(value))
3858 ANYOF_BITMAP_SET(ret, value);
3859 }
3860 Perl_sv_catpvf(aTHX_ listsv, "+utf8::IsSpacePerl\n");
3861 break;
3862 case ANYOF_NSPACE:
3863 if (LOC)
3864 ANYOF_CLASS_SET(ret, ANYOF_NSPACE);
3865 else {
3866 for (value = 0; value < 256; value++)
3867 if (!isSPACE(value))
3868 ANYOF_BITMAP_SET(ret, value);
3869 }
3870 Perl_sv_catpvf(aTHX_ listsv, "!utf8::IsSpacePerl\n");
73b437c8
JH
3871 break;
3872 case ANYOF_UPPER:
3873 if (LOC)
936ed897 3874 ANYOF_CLASS_SET(ret, ANYOF_UPPER);
73b437c8
JH
3875 else {
3876 for (value = 0; value < 256; value++)
3877 if (isUPPER(value))
936ed897 3878 ANYOF_BITMAP_SET(ret, value);
73b437c8 3879 }
ffc61ed2 3880 Perl_sv_catpvf(aTHX_ listsv, "+utf8::IsUpper\n");
73b437c8
JH
3881 break;
3882 case ANYOF_NUPPER:
3883 if (LOC)
936ed897 3884 ANYOF_CLASS_SET(ret, ANYOF_NUPPER);
73b437c8
JH
3885 else {
3886 for (value = 0; value < 256; value++)
3887 if (!isUPPER(value))
936ed897 3888 ANYOF_BITMAP_SET(ret, value);
73b437c8 3889 }
ffc61ed2 3890 Perl_sv_catpvf(aTHX_ listsv, "!utf8::IsUpper\n");
73b437c8
JH
3891 break;
3892 case ANYOF_XDIGIT:
3893 if (LOC)
936ed897 3894 ANYOF_CLASS_SET(ret, ANYOF_XDIGIT);
73b437c8
JH
3895 else {
3896 for (value = 0; value < 256; value++)
3897 if (isXDIGIT(value))
936ed897 3898 ANYOF_BITMAP_SET(ret, value);
73b437c8 3899 }
ffc61ed2 3900 Perl_sv_catpvf(aTHX_ listsv, "+utf8::IsXDigit\n");
73b437c8
JH
3901 break;
3902 case ANYOF_NXDIGIT:
3903 if (LOC)
936ed897 3904 ANYOF_CLASS_SET(ret, ANYOF_NXDIGIT);
73b437c8
JH
3905 else {
3906 for (value = 0; value < 256; value++)
3907 if (!isXDIGIT(value))
936ed897 3908 ANYOF_BITMAP_SET(ret, value);
73b437c8 3909 }
ffc61ed2 3910 Perl_sv_catpvf(aTHX_ listsv, "!utf8::IsXDigit\n");
73b437c8
JH
3911 break;
3912 default:
b45f050a 3913 vFAIL("Invalid [::] class");
73b437c8 3914 break;
b8c5462f 3915 }
b8c5462f 3916 if (LOC)
936ed897 3917 ANYOF_FLAGS(ret) |= ANYOF_CLASS;
73b437c8 3918 continue;
a0d0e21e 3919 }
ffc61ed2
JH
3920 } /* end of namedclass \blah */
3921
a0d0e21e 3922 if (range) {
3a3c4447 3923 if (prevvalue > value) /* b-a */ {
b45f050a 3924 Simple_vFAIL4("Invalid [] range \"%*.*s\"",
830247a4
IZ
3925 RExC_parse - rangebegin,
3926 RExC_parse - rangebegin,
b45f050a 3927 rangebegin);
3568d838 3928 range = 0; /* not a valid range */
73b437c8 3929 }
a0d0e21e
LW
3930 }
3931 else {
3568d838 3932 prevvalue = value; /* save the beginning of the range */
830247a4
IZ
3933 if (*RExC_parse == '-' && RExC_parse+1 < RExC_end &&
3934 RExC_parse[1] != ']') {
3935 RExC_parse++;
ffc61ed2
JH
3936
3937 /* a bad range like \w-, [:word:]- ? */
3938 if (namedclass > OOB_NAMEDCLASS) {
e476b1b5 3939 if (ckWARN(WARN_REGEXP))
830247a4 3940 vWARN4(RExC_parse,
b45f050a 3941 "False [] range \"%*.*s\"",
830247a4
IZ
3942 RExC_parse - rangebegin,
3943 RExC_parse - rangebegin,
b45f050a 3944 rangebegin);
73b437c8 3945 if (!SIZE_ONLY)
936ed897 3946 ANYOF_BITMAP_SET(ret, '-');
73b437c8 3947 } else
ffc61ed2
JH
3948 range = 1; /* yeah, it's a range! */
3949 continue; /* but do it the next time */
a0d0e21e 3950 }
a687059c 3951 }
ffc61ed2 3952
93733859 3953 /* now is the next time */
ae5c130c 3954 if (!SIZE_ONLY) {
3568d838
JH
3955 IV i;
3956
3957 if (prevvalue < 256) {
3958 IV ceilvalue = value < 256 ? value : 255;
3959
3960#ifdef EBCDIC
3a3c4447
JH
3961 if ((isLOWER(prevvalue) && isLOWER(ceilvalue)) ||
3962 (isUPPER(prevvalue) && isUPPER(ceilvalue)))
ffc61ed2 3963 {
3568d838
JH
3964 if (isLOWER(prevvalue)) {
3965 for (i = prevvalue; i <= ceilvalue; i++)
ffc61ed2
JH
3966 if (isLOWER(i))
3967 ANYOF_BITMAP_SET(ret, i);
3968 } else {
3568d838 3969 for (i = prevvalue; i <= ceilvalue; i++)
ffc61ed2
JH
3970 if (isUPPER(i))
3971 ANYOF_BITMAP_SET(ret, i);
3972 }
8ada0baa 3973 }
ffc61ed2 3974 else
8ada0baa 3975#endif
a5961de5
JH
3976 for (i = prevvalue; i <= ceilvalue; i++)
3977 ANYOF_BITMAP_SET(ret, i);
3568d838 3978 }
a5961de5 3979 if (value > 255 || UTF) {
ffc61ed2 3980 ANYOF_FLAGS(ret) |= ANYOF_UNICODE;
3568d838 3981 if (prevvalue < value)
ffc61ed2 3982 Perl_sv_catpvf(aTHX_ listsv, "%04"UVxf"\t%04"UVxf"\n",
3568d838 3983 (UV)prevvalue, (UV)value);
09091399 3984 else if (prevvalue == value) {
ffc61ed2
JH
3985 Perl_sv_catpvf(aTHX_ listsv, "%04"UVxf"\n",
3986 (UV)value);
09091399
JH
3987 if (FOLD) {
3988 if (value == UNICODE_GREEK_SMALL_LETTER_FINAL_SIGMA) {
3989 Perl_sv_catpvf(aTHX_ listsv, "%04"UVxf"\n",
3990 UNICODE_GREEK_CAPITAL_LETTER_SIGMA);
3991 Perl_sv_catpvf(aTHX_ listsv, "%04"UVxf"\n",
3992 UNICODE_GREEK_SMALL_LETTER_SIGMA);
3993 }
3994 else if (value == UNICODE_GREEK_CAPITAL_LETTER_SIGMA)
3995 Perl_sv_catpvf(aTHX_ listsv, "%04"UVxf"\n",
3996 UNICODE_GREEK_SMALL_LETTER_SIGMA);
3997 }
3998 }
ffc61ed2 3999 }
8ada0baa 4000 }
ffc61ed2
JH
4001
4002 range = 0; /* this range (if it was one) is done now */
a0d0e21e 4003 }
ffc61ed2 4004
936ed897 4005 if (need_class) {
4f66b38d 4006 ANYOF_FLAGS(ret) |= ANYOF_LARGE;
936ed897 4007 if (SIZE_ONLY)
830247a4 4008 RExC_size += ANYOF_CLASS_ADD_SKIP;
936ed897 4009 else
830247a4 4010 RExC_emit += ANYOF_CLASS_ADD_SKIP;
936ed897 4011 }
ffc61ed2 4012
ae5c130c 4013 /* optimize case-insensitive simple patterns (e.g. /[a-z]/i) */
b8c5462f 4014 if (!SIZE_ONLY &&
ffc61ed2 4015 /* If the only flag is folding (plus possibly inversion). */
516a5887
JH
4016 ((ANYOF_FLAGS(ret) & (ANYOF_FLAGS_ALL ^ ANYOF_INVERT)) == ANYOF_FOLD)
4017 ) {
a0ed51b3 4018 for (value = 0; value < 256; ++value) {
936ed897 4019 if (ANYOF_BITMAP_TEST(ret, value)) {
ffc61ed2
JH
4020 IV fold = PL_fold[value];
4021
4022 if (fold != value)
4023 ANYOF_BITMAP_SET(ret, fold);
ae5c130c
GS
4024 }
4025 }
936ed897 4026 ANYOF_FLAGS(ret) &= ~ANYOF_FOLD;
ae5c130c 4027 }
ffc61ed2 4028
ae5c130c 4029 /* optimize inverted simple patterns (e.g. [^a-z]) */
3568d838 4030 if (!SIZE_ONLY && optimize_invert &&
ffc61ed2
JH
4031 /* If the only flag is inversion. */
4032 (ANYOF_FLAGS(ret) & ANYOF_FLAGS_ALL) == ANYOF_INVERT) {
b8c5462f 4033 for (value = 0; value < ANYOF_BITMAP_SIZE; ++value)
936ed897 4034 ANYOF_BITMAP(ret)[value] ^= ANYOF_FLAGS_ALL;
1aa99e6b 4035 ANYOF_FLAGS(ret) = ANYOF_UNICODE_ALL;
ae5c130c 4036 }
a0d0e21e 4037
b81d288d 4038 if (!SIZE_ONLY) {
fde631ed 4039 AV *av = newAV();
ffc61ed2
JH
4040 SV *rv;
4041
4042 av_store(av, 0, listsv);
4043 av_store(av, 1, NULL);
4044 rv = newRV_noinc((SV*)av);
19860706 4045 n = add_data(pRExC_state, 1, "s");
830247a4 4046 RExC_rx->data->data[n] = (void*)rv;
ffc61ed2 4047 ARG_SET(ret, n);
a0ed51b3
LW
4048 }
4049
4050 return ret;
4051}
4052
76e3520e 4053STATIC char*
830247a4 4054S_nextchar(pTHX_ RExC_state_t *pRExC_state)
a0d0e21e 4055{
830247a4 4056 char* retval = RExC_parse++;
a0d0e21e 4057
4633a7c4 4058 for (;;) {
830247a4
IZ
4059 if (*RExC_parse == '(' && RExC_parse[1] == '?' &&
4060 RExC_parse[2] == '#') {
4061 while (*RExC_parse && *RExC_parse != ')')
4062 RExC_parse++;
4063 RExC_parse++;
4633a7c4
LW
4064 continue;
4065 }
830247a4
IZ
4066 if (RExC_flags16 & PMf_EXTENDED) {
4067 if (isSPACE(*RExC_parse)) {
4068 RExC_parse++;
748a9306
LW
4069 continue;
4070 }
830247a4
IZ
4071 else if (*RExC_parse == '#') {
4072 while (*RExC_parse && *RExC_parse != '\n')
4073 RExC_parse++;
4074 RExC_parse++;
748a9306
LW
4075 continue;
4076 }
748a9306 4077 }
4633a7c4 4078 return retval;
a0d0e21e 4079 }
a687059c
LW
4080}
4081
4082/*
c277df42 4083- reg_node - emit a node
a0d0e21e 4084*/
76e3520e 4085STATIC regnode * /* Location. */
830247a4 4086S_reg_node(pTHX_ RExC_state_t *pRExC_state, U8 op)
a687059c 4087{
c277df42
IZ
4088 register regnode *ret;
4089 register regnode *ptr;
a687059c 4090
830247a4 4091 ret = RExC_emit;
c277df42 4092 if (SIZE_ONLY) {
830247a4
IZ
4093 SIZE_ALIGN(RExC_size);
4094 RExC_size += 1;
a0d0e21e
LW
4095 return(ret);
4096 }
a687059c 4097
c277df42 4098 NODE_ALIGN_FILL(ret);
a0d0e21e 4099 ptr = ret;
c277df42 4100 FILL_ADVANCE_NODE(ptr, op);
fac92740
MJD
4101 if (RExC_offsets) { /* MJD */
4102 MJD_OFFSET_DEBUG((stderr, "%s:%u: (op %s) %s %u <- %u (len %u) (max %u).\n",
4103 "reg_node", __LINE__,
4104 reg_name[op],
4105 RExC_emit - RExC_emit_start > RExC_offsets[0]
4106 ? "Overwriting end of array!\n" : "OK",
4107 RExC_emit - RExC_emit_start,
4108 RExC_parse - RExC_start,
4109 RExC_offsets[0]));
4110 Set_Node_Offset(RExC_emit, RExC_parse + (op == END));
4111 }
4112
830247a4 4113 RExC_emit = ptr;
a687059c 4114
a0d0e21e 4115 return(ret);
a687059c
LW
4116}
4117
4118/*
a0d0e21e
LW
4119- reganode - emit a node with an argument
4120*/
76e3520e 4121STATIC regnode * /* Location. */
830247a4 4122S_reganode(pTHX_ RExC_state_t *pRExC_state, U8 op, U32 arg)
fe14fcc3 4123{
c277df42
IZ
4124 register regnode *ret;
4125 register regnode *ptr;
fe14fcc3 4126
830247a4 4127 ret = RExC_emit;
c277df42 4128 if (SIZE_ONLY) {
830247a4
IZ
4129 SIZE_ALIGN(RExC_size);
4130 RExC_size += 2;
a0d0e21e
LW
4131 return(ret);
4132 }
fe14fcc3 4133
c277df42 4134 NODE_ALIGN_FILL(ret);
a0d0e21e 4135 ptr = ret;
c277df42 4136 FILL_ADVANCE_NODE_ARG(ptr, op, arg);
fac92740
MJD
4137 if (RExC_offsets) { /* MJD */
4138 MJD_OFFSET_DEBUG((stderr, "%s: %s %u <- %u (max %u).\n",
4139 "reganode",
4140 RExC_emit - RExC_emit_start > RExC_offsets[0] ?
4141 "Overwriting end of array!\n" : "OK",
4142 RExC_emit - RExC_emit_start,
4143 RExC_parse - RExC_start,
4144 RExC_offsets[0]));
4145 Set_Cur_Node_Offset;
4146 }
4147
830247a4 4148 RExC_emit = ptr;
fe14fcc3 4149
a0d0e21e 4150 return(ret);
fe14fcc3
LW
4151}
4152
4153/*
cd439c50 4154- reguni - emit (if appropriate) a Unicode character
a0ed51b3
LW
4155*/
4156STATIC void
830247a4 4157S_reguni(pTHX_ RExC_state_t *pRExC_state, UV uv, char* s, STRLEN* lenp)
a0ed51b3 4158{
5e12f4fb 4159 *lenp = SIZE_ONLY ? UNISKIP(uv) : (uvchr_to_utf8((U8*)s, uv) - (U8*)s);
a0ed51b3
LW
4160}
4161
4162/*
a0d0e21e
LW
4163- reginsert - insert an operator in front of already-emitted operand
4164*
4165* Means relocating the operand.
4166*/
76e3520e 4167STATIC void
830247a4 4168S_reginsert(pTHX_ RExC_state_t *pRExC_state, U8 op, regnode *opnd)
a687059c 4169{
c277df42
IZ
4170 register regnode *src;
4171 register regnode *dst;
4172 register regnode *place;
4173 register int offset = regarglen[(U8)op];
b81d288d 4174
22c35a8c 4175/* (PL_regkind[(U8)op] == CURLY ? EXTRA_STEP_2ARGS : 0); */
c277df42
IZ
4176
4177 if (SIZE_ONLY) {
830247a4 4178 RExC_size += NODE_STEP_REGNODE + offset;
a0d0e21e
LW
4179 return;
4180 }
a687059c 4181
830247a4
IZ
4182 src = RExC_emit;
4183 RExC_emit += NODE_STEP_REGNODE + offset;
4184 dst = RExC_emit;
fac92740 4185 while (src > opnd) {
c277df42 4186 StructCopy(--src, --dst, regnode);
fac92740
MJD
4187 if (RExC_offsets) { /* MJD 20010112 */
4188 MJD_OFFSET_DEBUG((stderr, "%s: %s copy %u -> %u (max %u).\n",
4189 "reg_insert",
4190 dst - RExC_emit_start > RExC_offsets[0]
4191 ? "Overwriting end of array!\n" : "OK",
4192 src - RExC_emit_start,
4193 dst - RExC_emit_start,
4194 RExC_offsets[0]));
4195 Set_Node_Offset_To_R(dst-RExC_emit_start, Node_Offset(src));
4196 Set_Node_Length_To_R(dst-RExC_emit_start, Node_Length(src));
4197 }
4198 }
4199
a0d0e21e
LW
4200
4201 place = opnd; /* Op node, where operand used to be. */
fac92740
MJD
4202 if (RExC_offsets) { /* MJD */
4203 MJD_OFFSET_DEBUG((stderr, "%s: %s %u <- %u (max %u).\n",
4204 "reginsert",
4205 place - RExC_emit_start > RExC_offsets[0]
4206 ? "Overwriting end of array!\n" : "OK",
4207 place - RExC_emit_start,
4208 RExC_parse - RExC_start,
4209 RExC_offsets[0]));
4210 Set_Node_Offset(place, RExC_parse);
4211 }
c277df42
IZ
4212 src = NEXTOPER(place);
4213 FILL_ADVANCE_NODE(place, op);
4214 Zero(src, offset, regnode);
a687059c
LW
4215}
4216
4217/*
c277df42 4218- regtail - set the next-pointer at the end of a node chain of p to val.
a0d0e21e 4219*/
76e3520e 4220STATIC void
830247a4 4221S_regtail(pTHX_ RExC_state_t *pRExC_state, regnode *p, regnode *val)
a687059c 4222{
c277df42
IZ
4223 register regnode *scan;
4224 register regnode *temp;
a0d0e21e 4225
c277df42 4226 if (SIZE_ONLY)
a0d0e21e
LW
4227 return;
4228
4229 /* Find last node. */
4230 scan = p;
4231 for (;;) {
4232 temp = regnext(scan);
4233 if (temp == NULL)
4234 break;
4235 scan = temp;
4236 }
a687059c 4237
c277df42
IZ
4238 if (reg_off_by_arg[OP(scan)]) {
4239 ARG_SET(scan, val - scan);
a0ed51b3
LW
4240 }
4241 else {
c277df42
IZ
4242 NEXT_OFF(scan) = val - scan;
4243 }
a687059c
LW
4244}
4245
4246/*
a0d0e21e
LW
4247- regoptail - regtail on operand of first argument; nop if operandless
4248*/
76e3520e 4249STATIC void
830247a4 4250S_regoptail(pTHX_ RExC_state_t *pRExC_state, regnode *p, regnode *val)
a687059c 4251{
a0d0e21e 4252 /* "Operandless" and "op != BRANCH" are synonymous in practice. */
c277df42
IZ
4253 if (p == NULL || SIZE_ONLY)
4254 return;
22c35a8c 4255 if (PL_regkind[(U8)OP(p)] == BRANCH) {
830247a4 4256 regtail(pRExC_state, NEXTOPER(p), val);
a0ed51b3 4257 }
22c35a8c 4258 else if ( PL_regkind[(U8)OP(p)] == BRANCHJ) {
830247a4 4259 regtail(pRExC_state, NEXTOPER(NEXTOPER(p)), val);
a0ed51b3
LW
4260 }
4261 else
a0d0e21e 4262 return;
a687059c
LW
4263}
4264
4265/*
4266 - regcurly - a little FSA that accepts {\d+,?\d*}
4267 */
79072805 4268STATIC I32
cea2e8a9 4269S_regcurly(pTHX_ register char *s)
a687059c
LW
4270{
4271 if (*s++ != '{')
4272 return FALSE;
f0fcb552 4273 if (!isDIGIT(*s))
a687059c 4274 return FALSE;
f0fcb552 4275 while (isDIGIT(*s))
a687059c
LW
4276 s++;
4277 if (*s == ',')
4278 s++;
f0fcb552 4279 while (isDIGIT(*s))
a687059c
LW
4280 s++;
4281 if (*s != '}')
4282 return FALSE;
4283 return TRUE;
4284}
4285
a687059c 4286
8fa7f367
JH
4287#ifdef DEBUGGING
4288
76e3520e 4289STATIC regnode *
cea2e8a9 4290S_dumpuntil(pTHX_ regnode *start, regnode *node, regnode *last, SV* sv, I32 l)
c277df42 4291{
f248d071 4292 register U8 op = EXACT; /* Arbitrary non-END op. */
155aba94 4293 register regnode *next;
c277df42
IZ
4294
4295 while (op != END && (!last || node < last)) {
4296 /* While that wasn't END last time... */
4297
4298 NODE_ALIGN(node);
4299 op = OP(node);
4300 if (op == CLOSE)
4301 l--;
4302 next = regnext(node);
4303 /* Where, what. */
4304 if (OP(node) == OPTIMIZED)
4305 goto after_print;
4306 regprop(sv, node);
b900a521 4307 PerlIO_printf(Perl_debug_log, "%4"IVdf":%*s%s", (IV)(node - start),
f1dbda3d 4308 (int)(2*l + 1), "", SvPVX(sv));
c277df42
IZ
4309 if (next == NULL) /* Next ptr. */
4310 PerlIO_printf(Perl_debug_log, "(0)");
b81d288d 4311 else
b900a521 4312 PerlIO_printf(Perl_debug_log, "(%"IVdf")", (IV)(next - start));
c277df42
IZ
4313 (void)PerlIO_putc(Perl_debug_log, '\n');
4314 after_print:
22c35a8c 4315 if (PL_regkind[(U8)op] == BRANCHJ) {
b81d288d
AB
4316 register regnode *nnode = (OP(next) == LONGJMP
4317 ? regnext(next)
c277df42
IZ
4318 : next);
4319 if (last && nnode > last)
4320 nnode = last;
4321 node = dumpuntil(start, NEXTOPER(NEXTOPER(node)), nnode, sv, l + 1);
a0ed51b3 4322 }
22c35a8c 4323 else if (PL_regkind[(U8)op] == BRANCH) {
c277df42 4324 node = dumpuntil(start, NEXTOPER(node), next, sv, l + 1);
a0ed51b3
LW
4325 }
4326 else if ( op == CURLY) { /* `next' might be very big: optimizer */
c277df42
IZ
4327 node = dumpuntil(start, NEXTOPER(node) + EXTRA_STEP_2ARGS,
4328 NEXTOPER(node) + EXTRA_STEP_2ARGS + 1, sv, l + 1);
a0ed51b3 4329 }
22c35a8c 4330 else if (PL_regkind[(U8)op] == CURLY && op != CURLYX) {
c277df42
IZ
4331 node = dumpuntil(start, NEXTOPER(node) + EXTRA_STEP_2ARGS,
4332 next, sv, l + 1);
a0ed51b3
LW
4333 }
4334 else if ( op == PLUS || op == STAR) {
c277df42 4335 node = dumpuntil(start, NEXTOPER(node), NEXTOPER(node) + 1, sv, l + 1);
a0ed51b3
LW
4336 }
4337 else if (op == ANYOF) {
4f66b38d
HS
4338 /* arglen 1 + class block */
4339 node += 1 + ((ANYOF_FLAGS(node) & ANYOF_LARGE)
4340 ? ANYOF_CLASS_SKIP : ANYOF_SKIP);
4341 node = NEXTOPER(node);
a0ed51b3 4342 }
22c35a8c 4343 else if (PL_regkind[(U8)op] == EXACT) {
c277df42 4344 /* Literal string, where present. */
cd439c50 4345 node += NODE_SZ_STR(node) - 1;
c277df42 4346 node = NEXTOPER(node);
a0ed51b3
LW
4347 }
4348 else {
c277df42
IZ
4349 node = NEXTOPER(node);
4350 node += regarglen[(U8)op];
4351 }
4352 if (op == CURLYX || op == OPEN)
4353 l++;
4354 else if (op == WHILEM)
4355 l--;
4356 }
4357 return node;
4358}
4359
8fa7f367
JH
4360#endif /* DEBUGGING */
4361
a687059c 4362/*
fd181c75 4363 - regdump - dump a regexp onto Perl_debug_log in vaguely comprehensible form
a687059c
LW
4364 */
4365void
864dbfa3 4366Perl_regdump(pTHX_ regexp *r)
a687059c 4367{
35ff7856 4368#ifdef DEBUGGING
46fc3d4c 4369 SV *sv = sv_newmortal();
a687059c 4370
c277df42 4371 (void)dumpuntil(r->program, r->program + 1, NULL, sv, 0);
a0d0e21e
LW
4372
4373 /* Header fields of interest. */
c277df42 4374 if (r->anchored_substr)
7b0972df 4375 PerlIO_printf(Perl_debug_log,
b81d288d 4376 "anchored `%s%.*s%s'%s at %"IVdf" ",
3280af22 4377 PL_colors[0],
7b0972df 4378 (int)(SvCUR(r->anchored_substr) - (SvTAIL(r->anchored_substr)!=0)),
b81d288d 4379 SvPVX(r->anchored_substr),
3280af22 4380 PL_colors[1],
c277df42 4381 SvTAIL(r->anchored_substr) ? "$" : "",
7b0972df 4382 (IV)r->anchored_offset);
c277df42 4383 if (r->float_substr)
7b0972df 4384 PerlIO_printf(Perl_debug_log,
b81d288d 4385 "floating `%s%.*s%s'%s at %"IVdf"..%"UVuf" ",
3280af22 4386 PL_colors[0],
b81d288d 4387 (int)(SvCUR(r->float_substr) - (SvTAIL(r->float_substr)!=0)),
2c2d71f5 4388 SvPVX(r->float_substr),
3280af22 4389 PL_colors[1],
c277df42 4390 SvTAIL(r->float_substr) ? "$" : "",
7b0972df 4391 (IV)r->float_min_offset, (UV)r->float_max_offset);
c277df42 4392 if (r->check_substr)
b81d288d
AB
4393 PerlIO_printf(Perl_debug_log,
4394 r->check_substr == r->float_substr
c277df42
IZ
4395 ? "(checking floating" : "(checking anchored");
4396 if (r->reganch & ROPT_NOSCAN)
4397 PerlIO_printf(Perl_debug_log, " noscan");
4398 if (r->reganch & ROPT_CHECK_ALL)
4399 PerlIO_printf(Perl_debug_log, " isall");
4400 if (r->check_substr)
4401 PerlIO_printf(Perl_debug_log, ") ");
4402
46fc3d4c 4403 if (r->regstclass) {
4404 regprop(sv, r->regstclass);
4405 PerlIO_printf(Perl_debug_log, "stclass `%s' ", SvPVX(sv));
4406 }
774d564b 4407 if (r->reganch & ROPT_ANCH) {
4408 PerlIO_printf(Perl_debug_log, "anchored");
4409 if (r->reganch & ROPT_ANCH_BOL)
4410 PerlIO_printf(Perl_debug_log, "(BOL)");
c277df42
IZ
4411 if (r->reganch & ROPT_ANCH_MBOL)
4412 PerlIO_printf(Perl_debug_log, "(MBOL)");
cad2e5aa
JH
4413 if (r->reganch & ROPT_ANCH_SBOL)
4414 PerlIO_printf(Perl_debug_log, "(SBOL)");
774d564b 4415 if (r->reganch & ROPT_ANCH_GPOS)
4416 PerlIO_printf(Perl_debug_log, "(GPOS)");
4417 PerlIO_putc(Perl_debug_log, ' ');
4418 }
c277df42
IZ
4419 if (r->reganch & ROPT_GPOS_SEEN)
4420 PerlIO_printf(Perl_debug_log, "GPOS ");
a0d0e21e 4421 if (r->reganch & ROPT_SKIP)
760ac839 4422 PerlIO_printf(Perl_debug_log, "plus ");
a0d0e21e 4423 if (r->reganch & ROPT_IMPLICIT)
760ac839 4424 PerlIO_printf(Perl_debug_log, "implicit ");
760ac839 4425 PerlIO_printf(Perl_debug_log, "minlen %ld ", (long) r->minlen);
ce862d02
IZ
4426 if (r->reganch & ROPT_EVAL_SEEN)
4427 PerlIO_printf(Perl_debug_log, "with eval ");
760ac839 4428 PerlIO_printf(Perl_debug_log, "\n");
fac92740
MJD
4429 if (r->offsets) {
4430 U32 i;
4431 U32 len = r->offsets[0];
392fbf5d 4432 PerlIO_printf(Perl_debug_log, "Offsets: [%"UVuf"]\n\t", (UV)r->offsets[0]);
fac92740 4433 for (i = 1; i <= len; i++)
392fbf5d
RB
4434 PerlIO_printf(Perl_debug_log, "%"UVuf"[%"UVuf"] ",
4435 (UV)r->offsets[i*2-1],
4436 (UV)r->offsets[i*2]);
fac92740
MJD
4437 PerlIO_printf(Perl_debug_log, "\n");
4438 }
17c3b450 4439#endif /* DEBUGGING */
a687059c
LW
4440}
4441
8fa7f367
JH
4442#ifdef DEBUGGING
4443
653099ff
GS
4444STATIC void
4445S_put_byte(pTHX_ SV *sv, int c)
4446{
7be5a6cf 4447 if (isCNTRL(c) || c == 255 || !isPRINT(c))
653099ff
GS
4448 Perl_sv_catpvf(aTHX_ sv, "\\%o", c);
4449 else if (c == '-' || c == ']' || c == '\\' || c == '^')
4450 Perl_sv_catpvf(aTHX_ sv, "\\%c", c);
4451 else
4452 Perl_sv_catpvf(aTHX_ sv, "%c", c);
4453}
4454
8fa7f367
JH
4455#endif /* DEBUGGING */
4456
a687059c 4457/*
a0d0e21e
LW
4458- regprop - printable representation of opcode
4459*/
46fc3d4c 4460void
864dbfa3 4461Perl_regprop(pTHX_ SV *sv, regnode *o)
a687059c 4462{
35ff7856 4463#ifdef DEBUGGING
9b155405 4464 register int k;
a0d0e21e 4465
54dc92de 4466 sv_setpvn(sv, "", 0);
9b155405 4467 if (OP(o) >= reg_num) /* regnode.type is unsigned */
830247a4
IZ
4468 /* It would be nice to FAIL() here, but this may be called from
4469 regexec.c, and it would be hard to supply pRExC_state. */
4470 Perl_croak(aTHX_ "Corrupted regexp opcode");
9b155405
IZ
4471 sv_catpv(sv, (char*)reg_name[OP(o)]); /* Take off const! */
4472
4473 k = PL_regkind[(U8)OP(o)];
4474
2a782b5b
JH
4475 if (k == EXACT) {
4476 SV *dsv = sv_2mortal(newSVpvn("", 0));
df1ffd02 4477 bool do_utf8 = DO_UTF8(sv);
8a989385 4478 char *s = do_utf8 ?
2a782b5b
JH
4479 pv_uni_display(dsv, (U8*)STRING(o), STR_LEN(o), 60, 0) :
4480 STRING(o);
40eddc46 4481 int len = do_utf8 ?
2a782b5b
JH
4482 strlen(s) :
4483 STR_LEN(o);
4484 Perl_sv_catpvf(aTHX_ sv, " <%s%.*s%s>",
4485 PL_colors[0],
4486 len, s,
4487 PL_colors[1]);
4488 }
9b155405 4489 else if (k == CURLY) {
cb434fcc 4490 if (OP(o) == CURLYM || OP(o) == CURLYN || OP(o) == CURLYX)
cea2e8a9
GS
4491 Perl_sv_catpvf(aTHX_ sv, "[%d]", o->flags); /* Parenth number */
4492 Perl_sv_catpvf(aTHX_ sv, " {%d,%d}", ARG1(o), ARG2(o));
a0d0e21e 4493 }
2c2d71f5
JH
4494 else if (k == WHILEM && o->flags) /* Ordinal/of */
4495 Perl_sv_catpvf(aTHX_ sv, "[%d/%d]", o->flags & 0xf, o->flags>>4);
9b155405 4496 else if (k == REF || k == OPEN || k == CLOSE || k == GROUPP )
894356b3 4497 Perl_sv_catpvf(aTHX_ sv, "%d", (int)ARG(o)); /* Parenth number */
9b155405 4498 else if (k == LOGICAL)
04ebc1ab 4499 Perl_sv_catpvf(aTHX_ sv, "[%d]", o->flags); /* 2: embedded, otherwise 1 */
653099ff
GS
4500 else if (k == ANYOF) {
4501 int i, rangestart = -1;
ffc61ed2 4502 U8 flags = ANYOF_FLAGS(o);
19860706
JH
4503 const char * const anyofs[] = { /* Should be syncronized with
4504 * ANYOF_ #xdefines in regcomp.h */
653099ff
GS
4505 "\\w",
4506 "\\W",
4507 "\\s",
4508 "\\S",
4509 "\\d",
4510 "\\D",
4511 "[:alnum:]",
4512 "[:^alnum:]",
4513 "[:alpha:]",
4514 "[:^alpha:]",
4515 "[:ascii:]",
4516 "[:^ascii:]",
4517 "[:ctrl:]",
4518 "[:^ctrl:]",
4519 "[:graph:]",
4520 "[:^graph:]",
4521 "[:lower:]",
4522 "[:^lower:]",
4523 "[:print:]",
4524 "[:^print:]",
4525 "[:punct:]",
4526 "[:^punct:]",
4527 "[:upper:]",
aaa51d5e 4528 "[:^upper:]",
653099ff 4529 "[:xdigit:]",
aaa51d5e
JF
4530 "[:^xdigit:]",
4531 "[:space:]",
4532 "[:^space:]",
4533 "[:blank:]",
4534 "[:^blank:]"
653099ff
GS
4535 };
4536
19860706 4537 if (flags & ANYOF_LOCALE)
653099ff 4538 sv_catpv(sv, "{loc}");
19860706 4539 if (flags & ANYOF_FOLD)
653099ff
GS
4540 sv_catpv(sv, "{i}");
4541 Perl_sv_catpvf(aTHX_ sv, "[%s", PL_colors[0]);
19860706 4542 if (flags & ANYOF_INVERT)
653099ff 4543 sv_catpv(sv, "^");
ffc61ed2
JH
4544 for (i = 0; i <= 256; i++) {
4545 if (i < 256 && ANYOF_BITMAP_TEST(o,i)) {
4546 if (rangestart == -1)
4547 rangestart = i;
4548 } else if (rangestart != -1) {
4549 if (i <= rangestart + 3)
4550 for (; rangestart < i; rangestart++)
653099ff 4551 put_byte(sv, rangestart);
ffc61ed2
JH
4552 else {
4553 put_byte(sv, rangestart);
4554 sv_catpv(sv, "-");
4555 put_byte(sv, i - 1);
653099ff 4556 }
ffc61ed2 4557 rangestart = -1;
653099ff 4558 }
847a199f 4559 }
ffc61ed2
JH
4560
4561 if (o->flags & ANYOF_CLASS)
4562 for (i = 0; i < sizeof(anyofs)/sizeof(char*); i++)
4563 if (ANYOF_CLASS_TEST(o,i))
4564 sv_catpv(sv, anyofs[i]);
4565
4566 if (flags & ANYOF_UNICODE)
4567 sv_catpv(sv, "{unicode}");
1aa99e6b 4568 else if (flags & ANYOF_UNICODE_ALL)
2a782b5b 4569 sv_catpv(sv, "{unicode_all}");
ffc61ed2
JH
4570
4571 {
4572 SV *lv;
4573 SV *sw = regclass_swash(o, FALSE, &lv);
b81d288d 4574
ffc61ed2
JH
4575 if (lv) {
4576 if (sw) {
4577 UV i;
4578 U8 s[UTF8_MAXLEN+1];
b81d288d 4579
ffc61ed2 4580 for (i = 0; i <= 256; i++) { /* just the first 256 */
2b9d42f0 4581 U8 *e = uvchr_to_utf8(s, i);
ffc61ed2 4582
3568d838 4583 if (i < 256 && swash_fetch(sw, s, TRUE)) {
ffc61ed2
JH
4584 if (rangestart == -1)
4585 rangestart = i;
4586 } else if (rangestart != -1) {
4587 U8 *p;
b81d288d 4588
ffc61ed2
JH
4589 if (i <= rangestart + 3)
4590 for (; rangestart < i; rangestart++) {
2b9d42f0 4591 for(e = uvchr_to_utf8(s, rangestart), p = s; p < e; p++)
ffc61ed2
JH
4592 put_byte(sv, *p);
4593 }
4594 else {
2b9d42f0 4595 for (e = uvchr_to_utf8(s, rangestart), p = s; p < e; p++)
ffc61ed2
JH
4596 put_byte(sv, *p);
4597 sv_catpv(sv, "-");
2b9d42f0 4598 for (e = uvchr_to_utf8(s, i - 1), p = s; p < e; p++)
ffc61ed2
JH
4599 put_byte(sv, *p);
4600 }
4601 rangestart = -1;
4602 }
19860706 4603 }
ffc61ed2
JH
4604
4605 sv_catpv(sv, "..."); /* et cetera */
19860706 4606 }
fde631ed 4607
ffc61ed2
JH
4608 {
4609 char *s = savepv(SvPVX(lv));
4610 char *origs = s;
b81d288d 4611
ffc61ed2 4612 while(*s && *s != '\n') s++;
b81d288d 4613
ffc61ed2
JH
4614 if (*s == '\n') {
4615 char *t = ++s;
4616
4617 while (*s) {
4618 if (*s == '\n')
4619 *s = ' ';
4620 s++;
4621 }
4622 if (s[-1] == ' ')
4623 s[-1] = 0;
4624
4625 sv_catpv(sv, t);
fde631ed 4626 }
b81d288d 4627
ffc61ed2 4628 Safefree(origs);
fde631ed
JH
4629 }
4630 }
653099ff 4631 }
ffc61ed2 4632
653099ff
GS
4633 Perl_sv_catpvf(aTHX_ sv, "%s]", PL_colors[1]);
4634 }
9b155405 4635 else if (k == BRANCHJ && (OP(o) == UNLESSM || OP(o) == IFMATCH))
cea2e8a9 4636 Perl_sv_catpvf(aTHX_ sv, "[-%d]", o->flags);
17c3b450 4637#endif /* DEBUGGING */
35ff7856 4638}
a687059c 4639
cad2e5aa
JH
4640SV *
4641Perl_re_intuit_string(pTHX_ regexp *prog)
4642{ /* Assume that RE_INTUIT is set */
4643 DEBUG_r(
4644 { STRLEN n_a;
4645 char *s = SvPV(prog->check_substr,n_a);
4646
4647 if (!PL_colorset) reginitcolors();
4648 PerlIO_printf(Perl_debug_log,
4649 "%sUsing REx substr:%s `%s%.60s%s%s'\n",
4650 PL_colors[4],PL_colors[5],PL_colors[0],
4651 s,
4652 PL_colors[1],
4653 (strlen(s) > 60 ? "..." : ""));
4654 } );
4655
4656 return prog->check_substr;
4657}
4658
2b69d0c2 4659void
864dbfa3 4660Perl_pregfree(pTHX_ struct regexp *r)
a687059c 4661{
adac82c7 4662 DEBUG_r(if (!PL_colorset) reginitcolors());
7821416a
IZ
4663
4664 if (!r || (--r->refcnt > 0))
4665 return;
cad2e5aa
JH
4666 DEBUG_r(PerlIO_printf(Perl_debug_log,
4667 "%sFreeing REx:%s `%s%.60s%s%s'\n",
4668 PL_colors[4],PL_colors[5],PL_colors[0],
4669 r->precomp,
4670 PL_colors[1],
4671 (strlen(r->precomp) > 60 ? "..." : "")));
4672
c277df42 4673 if (r->precomp)
a0d0e21e 4674 Safefree(r->precomp);
fac92740
MJD
4675 if (r->offsets) /* 20010421 MJD */
4676 Safefree(r->offsets);
cf93c79d
IZ
4677 if (RX_MATCH_COPIED(r))
4678 Safefree(r->subbeg);
a193d654
GS
4679 if (r->substrs) {
4680 if (r->anchored_substr)
4681 SvREFCNT_dec(r->anchored_substr);
4682 if (r->float_substr)
4683 SvREFCNT_dec(r->float_substr);
2779dcf1 4684 Safefree(r->substrs);
a193d654 4685 }
c277df42
IZ
4686 if (r->data) {
4687 int n = r->data->count;
dfad63ad
HS
4688 AV* new_comppad = NULL;
4689 AV* old_comppad;
4690 SV** old_curpad;
4691
c277df42 4692 while (--n >= 0) {
261faec3 4693 /* If you add a ->what type here, update the comment in regcomp.h */
c277df42
IZ
4694 switch (r->data->what[n]) {
4695 case 's':
4696 SvREFCNT_dec((SV*)r->data->data[n]);
4697 break;
653099ff
GS
4698 case 'f':
4699 Safefree(r->data->data[n]);
4700 break;
dfad63ad
HS
4701 case 'p':
4702 new_comppad = (AV*)r->data->data[n];
4703 break;
c277df42 4704 case 'o':
dfad63ad 4705 if (new_comppad == NULL)
cea2e8a9 4706 Perl_croak(aTHX_ "panic: pregfree comppad");
dfad63ad
HS
4707 old_comppad = PL_comppad;
4708 old_curpad = PL_curpad;
1e6dc0b6
SB
4709 /* Watch out for global destruction's random ordering. */
4710 if (SvTYPE(new_comppad) == SVt_PVAV) {
4711 PL_comppad = new_comppad;
4712 PL_curpad = AvARRAY(new_comppad);
4713 }
4714 else
4715 PL_curpad = NULL;
9b978d73
DM
4716
4717 if (!OpREFCNT_dec((OP_4tree*)r->data->data[n])) {
4718 op_free((OP_4tree*)r->data->data[n]);
4719 }
4720
dfad63ad
HS
4721 PL_comppad = old_comppad;
4722 PL_curpad = old_curpad;
4723 SvREFCNT_dec((SV*)new_comppad);
4724 new_comppad = NULL;
c277df42
IZ
4725 break;
4726 case 'n':
4727 break;
4728 default:
830247a4 4729 Perl_croak(aTHX_ "panic: regfree data code '%c'", r->data->what[n]);
c277df42
IZ
4730 }
4731 }
4732 Safefree(r->data->what);
4733 Safefree(r->data);
a0d0e21e
LW
4734 }
4735 Safefree(r->startp);
4736 Safefree(r->endp);
4737 Safefree(r);
a687059c 4738}
c277df42
IZ
4739
4740/*
4741 - regnext - dig the "next" pointer out of a node
4742 *
4743 * [Note, when REGALIGN is defined there are two places in regmatch()
4744 * that bypass this code for speed.]
4745 */
4746regnode *
864dbfa3 4747Perl_regnext(pTHX_ register regnode *p)
c277df42
IZ
4748{
4749 register I32 offset;
4750
3280af22 4751 if (p == &PL_regdummy)
c277df42
IZ
4752 return(NULL);
4753
4754 offset = (reg_off_by_arg[OP(p)] ? ARG(p) : NEXT_OFF(p));
4755 if (offset == 0)
4756 return(NULL);
4757
c277df42 4758 return(p+offset);
c277df42
IZ
4759}
4760
01f988be 4761STATIC void
cea2e8a9 4762S_re_croak2(pTHX_ const char* pat1,const char* pat2,...)
c277df42
IZ
4763{
4764 va_list args;
4765 STRLEN l1 = strlen(pat1);
4766 STRLEN l2 = strlen(pat2);
4767 char buf[512];
06bf62c7 4768 SV *msv;
c277df42
IZ
4769 char *message;
4770
4771 if (l1 > 510)
4772 l1 = 510;
4773 if (l1 + l2 > 510)
4774 l2 = 510 - l1;
4775 Copy(pat1, buf, l1 , char);
4776 Copy(pat2, buf + l1, l2 , char);
3b818b81
GS
4777 buf[l1 + l2] = '\n';
4778 buf[l1 + l2 + 1] = '\0';
8736538c
AS
4779#ifdef I_STDARG
4780 /* ANSI variant takes additional second argument */
c277df42 4781 va_start(args, pat2);
8736538c
AS
4782#else
4783 va_start(args);
4784#endif
5a844595 4785 msv = vmess(buf, &args);
c277df42 4786 va_end(args);
06bf62c7 4787 message = SvPV(msv,l1);
c277df42
IZ
4788 if (l1 > 512)
4789 l1 = 512;
4790 Copy(message, buf, l1 , char);
4791 buf[l1] = '\0'; /* Overwrite \n */
cea2e8a9 4792 Perl_croak(aTHX_ "%s", buf);
c277df42 4793}
a0ed51b3
LW
4794
4795/* XXX Here's a total kludge. But we need to re-enter for swash routines. */
4796
4797void
864dbfa3 4798Perl_save_re_context(pTHX)
b81d288d 4799{
830247a4
IZ
4800#if 0
4801 SAVEPPTR(RExC_precomp); /* uncompiled string. */
4802 SAVEI32(RExC_npar); /* () count. */
4803 SAVEI32(RExC_size); /* Code size. */
4804 SAVEI16(RExC_flags16); /* are we folding, multilining? */
4805 SAVEVPTR(RExC_rx); /* from regcomp.c */
4806 SAVEI32(RExC_seen); /* from regcomp.c */
4807 SAVEI32(RExC_sawback); /* Did we see \1, ...? */
4808 SAVEI32(RExC_naughty); /* How bad is this pattern? */
4809 SAVEVPTR(RExC_emit); /* Code-emit pointer; &regdummy = don't */
4810 SAVEPPTR(RExC_end); /* End of input for compile */
4811 SAVEPPTR(RExC_parse); /* Input-scan pointer. */
4812#endif
4813
4814 SAVEI32(PL_reg_flags); /* from regexec.c */
a0ed51b3 4815 SAVEPPTR(PL_bostr);
a0ed51b3
LW
4816 SAVEPPTR(PL_reginput); /* String-input pointer. */
4817 SAVEPPTR(PL_regbol); /* Beginning of input, for ^ check. */
4818 SAVEPPTR(PL_regeol); /* End of input, for $ check. */
7766f137
GS
4819 SAVEVPTR(PL_regstartp); /* Pointer to startp array. */
4820 SAVEVPTR(PL_regendp); /* Ditto for endp. */
4821 SAVEVPTR(PL_reglastparen); /* Similarly for lastparen. */
a0ed51b3 4822 SAVEPPTR(PL_regtill); /* How far we are required to go. */
b81d288d 4823 SAVEGENERICPV(PL_reg_start_tmp); /* from regexec.c */
a0ed51b3 4824 PL_reg_start_tmp = 0;
a0ed51b3
LW
4825 SAVEI32(PL_reg_start_tmpl); /* from regexec.c */
4826 PL_reg_start_tmpl = 0;
7766f137 4827 SAVEVPTR(PL_regdata);
a0ed51b3
LW
4828 SAVEI32(PL_reg_eval_set); /* from regexec.c */
4829 SAVEI32(PL_regnarrate); /* from regexec.c */
7766f137 4830 SAVEVPTR(PL_regprogram); /* from regexec.c */
a0ed51b3 4831 SAVEINT(PL_regindent); /* from regexec.c */
7766f137
GS
4832 SAVEVPTR(PL_regcc); /* from regexec.c */
4833 SAVEVPTR(PL_curcop);
7766f137
GS
4834 SAVEVPTR(PL_reg_call_cc); /* from regexec.c */
4835 SAVEVPTR(PL_reg_re); /* from regexec.c */
54b6e2fa
IZ
4836 SAVEPPTR(PL_reg_ganch); /* from regexec.c */
4837 SAVESPTR(PL_reg_sv); /* from regexec.c */
53c4c00c 4838 SAVEI8(PL_reg_match_utf8); /* from regexec.c */
7766f137 4839 SAVEVPTR(PL_reg_magic); /* from regexec.c */
54b6e2fa 4840 SAVEI32(PL_reg_oldpos); /* from regexec.c */
7766f137
GS
4841 SAVEVPTR(PL_reg_oldcurpm); /* from regexec.c */
4842 SAVEVPTR(PL_reg_curpm); /* from regexec.c */
5fb7366e 4843 SAVEI32(PL_regnpar); /* () count. */
e49a9654 4844 SAVEI32(PL_regsize); /* from regexec.c */
54b6e2fa 4845#ifdef DEBUGGING
b81d288d 4846 SAVEPPTR(PL_reg_starttry); /* from regexec.c */
54b6e2fa 4847#endif
a0ed51b3 4848}
51371543 4849
51371543 4850static void
acfe0abc 4851clear_re(pTHX_ void *r)
51371543
GS
4852{
4853 ReREFCNT_dec((regexp *)r);
4854}
ffbc6a93 4855