This is a live mirror of the Perl 5 development currently hosted at https://github.com/perl/perl5
Nice porting script from Sarathy. Well, less nice
[perl5.git] / regcomp.c
CommitLineData
a0d0e21e
LW
1/* regcomp.c
2 */
3
4/*
5 * "A fair jaw-cracker dwarf-language must be." --Samwise Gamgee
6 */
7
a687059c
LW
8/* NOTE: this is derived from Henry Spencer's regexp code, and should not
9 * confused with the original package (see point 3 below). Thanks, Henry!
10 */
11
12/* Additional note: this code is very heavily munged from Henry's version
13 * in places. In some spots I've traded clarity for efficiency, so don't
14 * blame Henry for some of the lack of readability.
15 */
16
e50aee73
AD
17/* The names of the functions have been changed from regcomp and
18 * regexec to pregcomp and pregexec in order to avoid conflicts
19 * with the POSIX routines of the same names.
20*/
21
b9d5759e
AD
22#ifdef PERL_EXT_RE_BUILD
23/* need to replace pregcomp et al, so enable that */
24# ifndef PERL_IN_XSUB_RE
25# define PERL_IN_XSUB_RE
26# endif
27/* need access to debugger hooks */
cad2e5aa 28# if defined(PERL_EXT_RE_DEBUG) && !defined(DEBUGGING)
b9d5759e
AD
29# define DEBUGGING
30# endif
31#endif
32
33#ifdef PERL_IN_XSUB_RE
d06ea78c 34/* We *really* need to overwrite these symbols: */
56953603
IZ
35# define Perl_pregcomp my_regcomp
36# define Perl_regdump my_regdump
37# define Perl_regprop my_regprop
d06ea78c 38# define Perl_pregfree my_regfree
cad2e5aa
JH
39# define Perl_re_intuit_string my_re_intuit_string
40/* *These* symbols are masked to allow static link. */
d06ea78c 41# define Perl_regnext my_regnext
f0b8d043 42# define Perl_save_re_context my_save_re_context
b81d288d 43# define Perl_reginitcolors my_reginitcolors
c5be433b
GS
44
45# define PERL_NO_GET_CONTEXT
b81d288d 46#endif
56953603 47
f0fcb552 48/*SUPPRESS 112*/
a687059c 49/*
e50aee73 50 * pregcomp and pregexec -- regsub and regerror are not used in perl
a687059c
LW
51 *
52 * Copyright (c) 1986 by University of Toronto.
53 * Written by Henry Spencer. Not derived from licensed software.
54 *
55 * Permission is granted to anyone to use this software for any
56 * purpose on any computer system, and to redistribute it freely,
57 * subject to the following restrictions:
58 *
59 * 1. The author is not responsible for the consequences of use of
60 * this software, no matter how awful, even if they arise
61 * from defects in it.
62 *
63 * 2. The origin of this software must not be misrepresented, either
64 * by explicit claim or by omission.
65 *
66 * 3. Altered versions must be plainly marked as such, and must not
67 * be misrepresented as being the original software.
68 *
69 *
70 **** Alterations to Henry's code are...
71 ****
be3c0a43 72 **** Copyright (c) 1991-2002, Larry Wall
a687059c 73 ****
9ef589d8
LW
74 **** You may distribute under the terms of either the GNU General Public
75 **** License or the Artistic License, as specified in the README file.
76
a687059c
LW
77 *
78 * Beware that some of this code is subtly aware of the way operator
79 * precedence is structured in regular expressions. Serious changes in
80 * regular-expression syntax might require a total rethink.
81 */
82#include "EXTERN.h"
864dbfa3 83#define PERL_IN_REGCOMP_C
a687059c 84#include "perl.h"
d06ea78c 85
acfe0abc 86#ifndef PERL_IN_XSUB_RE
d06ea78c
GS
87# include "INTERN.h"
88#endif
c277df42
IZ
89
90#define REG_COMP_C
a687059c
LW
91#include "regcomp.h"
92
d4cce5f1 93#ifdef op
11343788 94#undef op
d4cce5f1 95#endif /* op */
11343788 96
fe14fcc3
LW
97#ifdef MSDOS
98# if defined(BUGGY_MSC6)
99 /* MSC 6.00A breaks on op/regexp.t test 85 unless we turn this off */
100 # pragma optimize("a",off)
101 /* But MSC 6.00A is happy with 'w', for aliases only across function calls*/
102 # pragma optimize("w",on )
103# endif /* BUGGY_MSC6 */
104#endif /* MSDOS */
105
a687059c
LW
106#ifndef STATIC
107#define STATIC static
108#endif
109
830247a4
IZ
110typedef struct RExC_state_t {
111 U16 flags16; /* are we folding, multilining? */
112 char *precomp; /* uncompiled string. */
113 regexp *rx;
fac92740 114 char *start; /* Start of input for compile */
830247a4
IZ
115 char *end; /* End of input for compile */
116 char *parse; /* Input-scan pointer. */
117 I32 whilem_seen; /* number of WHILEM in this expr */
fac92740 118 regnode *emit_start; /* Start of emitted-code area */
ffc61ed2 119 regnode *emit; /* Code-emit pointer; &regdummy = don't = compiling */
830247a4
IZ
120 I32 naughty; /* How bad is this pattern? */
121 I32 sawback; /* Did we see \1, ...? */
122 U32 seen;
123 I32 size; /* Code size. */
124 I32 npar; /* () count. */
125 I32 extralen;
126 I32 seen_zerolen;
127 I32 seen_evals;
1aa99e6b 128 I32 utf8;
830247a4
IZ
129#if ADD_TO_REGEXEC
130 char *starttry; /* -Dr: where regtry was called. */
131#define RExC_starttry (pRExC_state->starttry)
132#endif
133} RExC_state_t;
134
135#define RExC_flags16 (pRExC_state->flags16)
136#define RExC_precomp (pRExC_state->precomp)
137#define RExC_rx (pRExC_state->rx)
fac92740 138#define RExC_start (pRExC_state->start)
830247a4
IZ
139#define RExC_end (pRExC_state->end)
140#define RExC_parse (pRExC_state->parse)
141#define RExC_whilem_seen (pRExC_state->whilem_seen)
fac92740 142#define RExC_offsets (pRExC_state->rx->offsets) /* I am not like the others */
830247a4 143#define RExC_emit (pRExC_state->emit)
fac92740 144#define RExC_emit_start (pRExC_state->emit_start)
830247a4
IZ
145#define RExC_naughty (pRExC_state->naughty)
146#define RExC_sawback (pRExC_state->sawback)
147#define RExC_seen (pRExC_state->seen)
148#define RExC_size (pRExC_state->size)
149#define RExC_npar (pRExC_state->npar)
150#define RExC_extralen (pRExC_state->extralen)
151#define RExC_seen_zerolen (pRExC_state->seen_zerolen)
152#define RExC_seen_evals (pRExC_state->seen_evals)
1aa99e6b 153#define RExC_utf8 (pRExC_state->utf8)
830247a4 154
a687059c
LW
155#define ISMULT1(c) ((c) == '*' || (c) == '+' || (c) == '?')
156#define ISMULT2(s) ((*s) == '*' || (*s) == '+' || (*s) == '?' || \
157 ((*s) == '{' && regcurly(s)))
a687059c 158
35c8bce7
LW
159#ifdef SPSTART
160#undef SPSTART /* dratted cpp namespace... */
161#endif
a687059c
LW
162/*
163 * Flags to be passed up and down.
164 */
a687059c 165#define WORST 0 /* Worst case. */
821b33a5 166#define HASWIDTH 0x1 /* Known to match non-null strings. */
a0d0e21e
LW
167#define SIMPLE 0x2 /* Simple enough to be STAR/PLUS operand. */
168#define SPSTART 0x4 /* Starts with * or +. */
169#define TRYAGAIN 0x8 /* Weeded out a declaration. */
a687059c 170
2c2d71f5
JH
171/* Length of a variant. */
172
173typedef struct scan_data_t {
174 I32 len_min;
175 I32 len_delta;
176 I32 pos_min;
177 I32 pos_delta;
178 SV *last_found;
179 I32 last_end; /* min value, <0 unless valid. */
180 I32 last_start_min;
181 I32 last_start_max;
182 SV **longest; /* Either &l_fixed, or &l_float. */
183 SV *longest_fixed;
184 I32 offset_fixed;
185 SV *longest_float;
186 I32 offset_float_min;
187 I32 offset_float_max;
188 I32 flags;
189 I32 whilem_c;
cb434fcc 190 I32 *last_closep;
653099ff 191 struct regnode_charclass_class *start_class;
2c2d71f5
JH
192} scan_data_t;
193
a687059c 194/*
e50aee73 195 * Forward declarations for pregcomp()'s friends.
a687059c 196 */
a0d0e21e 197
b81d288d 198static scan_data_t zero_scan_data = { 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
cb434fcc 199 0, 0, 0, 0, 0, 0};
c277df42
IZ
200
201#define SF_BEFORE_EOL (SF_BEFORE_SEOL|SF_BEFORE_MEOL)
202#define SF_BEFORE_SEOL 0x1
203#define SF_BEFORE_MEOL 0x2
204#define SF_FIX_BEFORE_EOL (SF_FIX_BEFORE_SEOL|SF_FIX_BEFORE_MEOL)
205#define SF_FL_BEFORE_EOL (SF_FL_BEFORE_SEOL|SF_FL_BEFORE_MEOL)
206
09b7f37c
CB
207#ifdef NO_UNARY_PLUS
208# define SF_FIX_SHIFT_EOL (0+2)
209# define SF_FL_SHIFT_EOL (0+4)
210#else
211# define SF_FIX_SHIFT_EOL (+2)
212# define SF_FL_SHIFT_EOL (+4)
213#endif
c277df42
IZ
214
215#define SF_FIX_BEFORE_SEOL (SF_BEFORE_SEOL << SF_FIX_SHIFT_EOL)
216#define SF_FIX_BEFORE_MEOL (SF_BEFORE_MEOL << SF_FIX_SHIFT_EOL)
217
218#define SF_FL_BEFORE_SEOL (SF_BEFORE_SEOL << SF_FL_SHIFT_EOL)
219#define SF_FL_BEFORE_MEOL (SF_BEFORE_MEOL << SF_FL_SHIFT_EOL) /* 0x20 */
220#define SF_IS_INF 0x40
221#define SF_HAS_PAR 0x80
222#define SF_IN_PAR 0x100
223#define SF_HAS_EVAL 0x200
4bfe0158 224#define SCF_DO_SUBSTR 0x400
653099ff
GS
225#define SCF_DO_STCLASS_AND 0x0800
226#define SCF_DO_STCLASS_OR 0x1000
227#define SCF_DO_STCLASS (SCF_DO_STCLASS_AND|SCF_DO_STCLASS_OR)
e1901655 228#define SCF_WHILEM_VISITED_POS 0x2000
c277df42 229
1aa99e6b 230#define UTF RExC_utf8
830247a4
IZ
231#define LOC (RExC_flags16 & PMf_LOCALE)
232#define FOLD (RExC_flags16 & PMf_FOLD)
a0ed51b3 233
ffc61ed2 234#define OOB_UNICODE 12345678
93733859 235#define OOB_NAMEDCLASS -1
b8c5462f 236
a0ed51b3
LW
237#define CHR_SVLEN(sv) (UTF ? sv_len_utf8(sv) : SvCUR(sv))
238#define CHR_DIST(a,b) (UTF ? utf8_distance(a,b) : a - b)
239
8615cb43 240
b45f050a
JF
241/* length of regex to show in messages that don't mark a position within */
242#define RegexLengthToShowInErrorMessages 127
243
244/*
245 * If MARKER[12] are adjusted, be sure to adjust the constants at the top
246 * of t/op/regmesg.t, the tests in t/op/re_tests, and those in
247 * op/pragma/warn/regcomp.
248 */
7253e4e3
RK
249#define MARKER1 "<-- HERE" /* marker as it appears in the description */
250#define MARKER2 " <-- HERE " /* marker as it appears within the regex */
b81d288d 251
7253e4e3 252#define REPORT_LOCATION " in regex; marked by " MARKER1 " in m/%.*s" MARKER2 "%s/"
b45f050a
JF
253
254/*
255 * Calls SAVEDESTRUCTOR_X if needed, then calls Perl_croak with the given
256 * arg. Show regex, up to a maximum length. If it's too long, chop and add
257 * "...".
258 */
a4eb266f 259#define FAIL(msg) \
8615cb43 260 STMT_START { \
a4eb266f 261 char *ellipses = ""; \
7a799e20 262 IV len = RExC_end - RExC_precomp; \
b45f050a 263 \
8615cb43 264 if (!SIZE_ONLY) \
830247a4 265 SAVEDESTRUCTOR_X(clear_re,(void*)RExC_rx); \
b45f050a
JF
266 \
267 if (len > RegexLengthToShowInErrorMessages) { \
268 /* chop 10 shorter than the max, to ensure meaning of "..." */ \
269 len = RegexLengthToShowInErrorMessages - 10; \
a4eb266f 270 ellipses = "..."; \
b45f050a
JF
271 } \
272 Perl_croak(aTHX_ "%s in regex m/%.*s%s/", \
830247a4 273 msg, (int)len, RExC_precomp, ellipses); \
8615cb43
JF
274 } STMT_END
275
b45f050a
JF
276/*
277 * Calls SAVEDESTRUCTOR_X if needed, then calls Perl_croak with the given
278 * args. Show regex, up to a maximum length. If it's too long, chop and add
279 * "...".
280 */
a4eb266f 281#define FAIL2(pat,msg) \
8615cb43 282 STMT_START { \
a4eb266f 283 char *ellipses = ""; \
7a799e20 284 IV len = RExC_end - RExC_precomp; \
b45f050a 285 \
8615cb43 286 if (!SIZE_ONLY) \
830247a4 287 SAVEDESTRUCTOR_X(clear_re,(void*)RExC_rx); \
8615cb43 288 \
b45f050a
JF
289 if (len > RegexLengthToShowInErrorMessages) { \
290 /* chop 10 shorter than the max, to ensure meaning of "..." */ \
291 len = RegexLengthToShowInErrorMessages - 10; \
a4eb266f 292 ellipses = "..."; \
b45f050a
JF
293 } \
294 S_re_croak2(aTHX_ pat, " in regex m/%.*s%s/", \
830247a4 295 msg, (int)len, RExC_precomp, ellipses); \
b45f050a
JF
296 } STMT_END
297
298
299/*
300 * Simple_vFAIL -- like FAIL, but marks the current location in the scan
301 */
302#define Simple_vFAIL(m) \
303 STMT_START { \
7a799e20 304 IV offset = RExC_parse - RExC_precomp; \
b45f050a
JF
305 \
306 Perl_croak(aTHX_ "%s" REPORT_LOCATION, \
830247a4 307 m, (int)offset, RExC_precomp, RExC_precomp + offset); \
b45f050a
JF
308 } STMT_END
309
310/*
311 * Calls SAVEDESTRUCTOR_X if needed, then Simple_vFAIL()
312 */
313#define vFAIL(m) \
314 STMT_START { \
315 if (!SIZE_ONLY) \
830247a4 316 SAVEDESTRUCTOR_X(clear_re,(void*)RExC_rx); \
b45f050a
JF
317 Simple_vFAIL(m); \
318 } STMT_END
319
320/*
321 * Like Simple_vFAIL(), but accepts two arguments.
322 */
323#define Simple_vFAIL2(m,a1) \
324 STMT_START { \
7a799e20 325 IV offset = RExC_parse - RExC_precomp; \
b45f050a
JF
326 \
327 S_re_croak2(aTHX_ m, REPORT_LOCATION, a1, \
830247a4 328 (int)offset, RExC_precomp, RExC_precomp + offset); \
b45f050a
JF
329 } STMT_END
330
331/*
332 * Calls SAVEDESTRUCTOR_X if needed, then Simple_vFAIL2().
333 */
334#define vFAIL2(m,a1) \
335 STMT_START { \
336 if (!SIZE_ONLY) \
830247a4 337 SAVEDESTRUCTOR_X(clear_re,(void*)RExC_rx); \
b45f050a
JF
338 Simple_vFAIL2(m, a1); \
339 } STMT_END
340
341
342/*
343 * Like Simple_vFAIL(), but accepts three arguments.
344 */
345#define Simple_vFAIL3(m, a1, a2) \
346 STMT_START { \
7a799e20 347 IV offset = RExC_parse - RExC_precomp; \
b45f050a
JF
348 \
349 S_re_croak2(aTHX_ m, REPORT_LOCATION, a1, a2, \
830247a4 350 (int)offset, RExC_precomp, RExC_precomp + offset); \
b45f050a
JF
351 } STMT_END
352
353/*
354 * Calls SAVEDESTRUCTOR_X if needed, then Simple_vFAIL3().
355 */
356#define vFAIL3(m,a1,a2) \
357 STMT_START { \
358 if (!SIZE_ONLY) \
830247a4 359 SAVEDESTRUCTOR_X(clear_re,(void*)RExC_rx); \
b45f050a
JF
360 Simple_vFAIL3(m, a1, a2); \
361 } STMT_END
362
363/*
364 * Like Simple_vFAIL(), but accepts four arguments.
365 */
366#define Simple_vFAIL4(m, a1, a2, a3) \
367 STMT_START { \
7a799e20 368 IV offset = RExC_parse - RExC_precomp; \
b45f050a
JF
369 \
370 S_re_croak2(aTHX_ m, REPORT_LOCATION, a1, a2, a3,\
830247a4 371 (int)offset, RExC_precomp, RExC_precomp + offset); \
b45f050a
JF
372 } STMT_END
373
374/*
375 * Like Simple_vFAIL(), but accepts five arguments.
376 */
377#define Simple_vFAIL5(m, a1, a2, a3, a4) \
378 STMT_START { \
7a799e20 379 IV offset = RExC_parse - RExC_precomp; \
b45f050a 380 S_re_croak2(aTHX_ m, REPORT_LOCATION, a1, a2, a3, a4,\
830247a4 381 (int)offset, RExC_precomp, RExC_precomp + offset); \
b45f050a
JF
382 } STMT_END
383
384
385#define vWARN(loc,m) \
386 STMT_START { \
7a799e20 387 IV offset = loc - RExC_precomp; \
b45f050a 388 Perl_warner(aTHX_ WARN_REGEXP, "%s" REPORT_LOCATION,\
830247a4 389 m, (int)offset, RExC_precomp, RExC_precomp + offset); \
b45f050a
JF
390 } STMT_END \
391
f9373011
PM
392#define vWARNdep(loc,m) \
393 STMT_START { \
7a799e20 394 IV offset = loc - RExC_precomp; \
12bcd1a6 395 Perl_warner(aTHX_ packWARN3(WARN_DEPRECATED, WARN_REGEXP, WARN_SYNTAX), "%s" REPORT_LOCATION,\
f9373011
PM
396 m, (int)offset, RExC_precomp, RExC_precomp + offset); \
397 } STMT_END \
398
b45f050a
JF
399
400#define vWARN2(loc, m, a1) \
401 STMT_START { \
7a799e20 402 IV offset = loc - RExC_precomp; \
b45f050a
JF
403 Perl_warner(aTHX_ WARN_REGEXP, m REPORT_LOCATION,\
404 a1, \
830247a4 405 (int)offset, RExC_precomp, RExC_precomp + offset); \
b45f050a
JF
406 } STMT_END
407
408#define vWARN3(loc, m, a1, a2) \
409 STMT_START { \
7a799e20 410 IV offset = loc - RExC_precomp; \
b45f050a
JF
411 Perl_warner(aTHX_ WARN_REGEXP, m REPORT_LOCATION, \
412 a1, a2, \
830247a4 413 (int)offset, RExC_precomp, RExC_precomp + offset); \
b45f050a
JF
414 } STMT_END
415
416#define vWARN4(loc, m, a1, a2, a3) \
417 STMT_START { \
7a799e20 418 IV offset = loc - RExC_precomp; \
b45f050a
JF
419 Perl_warner(aTHX_ WARN_REGEXP, m REPORT_LOCATION,\
420 a1, a2, a3, \
830247a4 421 (int)offset, RExC_precomp, RExC_precomp + offset); \
8615cb43
JF
422 } STMT_END
423
9d1d55b5
JP
424/* used for the parse_flags section for (?c) -- japhy */
425#define vWARN5(loc, m, a1, a2, a3, a4) \
426 STMT_START { \
7a799e20 427 IV offset = loc - RExC_precomp; \
9d1d55b5
JP
428 Perl_warner(aTHX_ WARN_REGEXP, m REPORT_LOCATION, \
429 a1, a2, a3, a4, \
430 (int)offset, RExC_precomp, RExC_precomp + offset); \
431 } STMT_END
432
8615cb43 433
cd439c50 434/* Allow for side effects in s */
fd3f0ae2 435#define REGC(c,s) STMT_START { if (!SIZE_ONLY) *(s) = (c); else (void)(s);} STMT_END
cd439c50 436
fac92740
MJD
437/* Macros for recording node offsets. 20001227 mjd@plover.com
438 * Nodes are numbered 1, 2, 3, 4. Node #n's position is recorded in
439 * element 2*n-1 of the array. Element #2n holds the byte length node #n.
440 * Element 0 holds the number n.
441 */
442
443#define MJD_OFFSET_DEBUG(x)
444/* #define MJD_OFFSET_DEBUG(x) fprintf x */
445
446
447# define Set_Node_Offset_To_R(node,byte) \
448 STMT_START { \
449 if (! SIZE_ONLY) { \
450 if((node) < 0) { \
451 Perl_croak(aTHX_ "value of node is %d in Offset macro", node); \
452 } else { \
453 RExC_offsets[2*(node)-1] = (byte); \
454 } \
455 } \
456 } STMT_END
457
458# define Set_Node_Offset(node,byte) Set_Node_Offset_To_R((node)-RExC_emit_start, (byte)-RExC_start)
459# define Set_Cur_Node_Offset Set_Node_Offset(RExC_emit, RExC_parse)
460
461# define Set_Node_Length_To_R(node,len) \
462 STMT_START { \
463 if (! SIZE_ONLY) { \
464 MJD_OFFSET_DEBUG((stderr, "** (%d) size of node %d is %d.\n", __LINE__, (node), (len))); \
465 if((node) < 0) { \
466 Perl_croak(aTHX_ "value of node is %d in Length macro", node); \
467 } else { \
468 RExC_offsets[2*(node)] = (len); \
469 } \
470 } \
471 } STMT_END
472
473# define Set_Node_Length(node,len) Set_Node_Length_To_R((node)-RExC_emit_start, len)
474# define Set_Cur_Node_Length(len) Set_Node_Length(RExC_emit, len)
475# define Set_Node_Cur_Length(node) Set_Node_Length(node, RExC_parse - parse_start)
476
477/* Get offsets and lengths */
478#define Node_Offset(n) (RExC_offsets[2*((n)-RExC_emit_start)-1])
479#define Node_Length(n) (RExC_offsets[2*((n)-RExC_emit_start)])
480
acfe0abc 481static void clear_re(pTHX_ void *r);
4327152a 482
653099ff
GS
483/* Mark that we cannot extend a found fixed substring at this point.
484 Updata the longest found anchored substring and the longest found
485 floating substrings if needed. */
486
4327152a 487STATIC void
830247a4 488S_scan_commit(pTHX_ RExC_state_t *pRExC_state, scan_data_t *data)
c277df42 489{
a0ed51b3
LW
490 STRLEN l = CHR_SVLEN(data->last_found);
491 STRLEN old_l = CHR_SVLEN(*data->longest);
b81d288d 492
c277df42
IZ
493 if ((l >= old_l) && ((l > old_l) || (data->flags & SF_BEFORE_EOL))) {
494 sv_setsv(*data->longest, data->last_found);
495 if (*data->longest == data->longest_fixed) {
496 data->offset_fixed = l ? data->last_start_min : data->pos_min;
497 if (data->flags & SF_BEFORE_EOL)
b81d288d 498 data->flags
c277df42
IZ
499 |= ((data->flags & SF_BEFORE_EOL) << SF_FIX_SHIFT_EOL);
500 else
501 data->flags &= ~SF_FIX_BEFORE_EOL;
a0ed51b3
LW
502 }
503 else {
c277df42 504 data->offset_float_min = l ? data->last_start_min : data->pos_min;
b81d288d
AB
505 data->offset_float_max = (l
506 ? data->last_start_max
c277df42
IZ
507 : data->pos_min + data->pos_delta);
508 if (data->flags & SF_BEFORE_EOL)
b81d288d 509 data->flags
c277df42
IZ
510 |= ((data->flags & SF_BEFORE_EOL) << SF_FL_SHIFT_EOL);
511 else
512 data->flags &= ~SF_FL_BEFORE_EOL;
513 }
514 }
515 SvCUR_set(data->last_found, 0);
516 data->last_end = -1;
517 data->flags &= ~SF_BEFORE_EOL;
518}
519
653099ff
GS
520/* Can match anything (initialization) */
521STATIC void
830247a4 522S_cl_anything(pTHX_ RExC_state_t *pRExC_state, struct regnode_charclass_class *cl)
653099ff 523{
653099ff 524 ANYOF_CLASS_ZERO(cl);
f8bef550 525 ANYOF_BITMAP_SETALL(cl);
1aa99e6b 526 cl->flags = ANYOF_EOS|ANYOF_UNICODE_ALL;
653099ff
GS
527 if (LOC)
528 cl->flags |= ANYOF_LOCALE;
529}
530
531/* Can match anything (initialization) */
532STATIC int
533S_cl_is_anything(pTHX_ struct regnode_charclass_class *cl)
534{
535 int value;
536
aaa51d5e 537 for (value = 0; value <= ANYOF_MAX; value += 2)
653099ff
GS
538 if (ANYOF_CLASS_TEST(cl, value) && ANYOF_CLASS_TEST(cl, value + 1))
539 return 1;
1aa99e6b
IH
540 if (!(cl->flags & ANYOF_UNICODE_ALL))
541 return 0;
f8bef550
NC
542 if (!ANYOF_BITMAP_TESTALLSET(cl))
543 return 0;
653099ff
GS
544 return 1;
545}
546
547/* Can match anything (initialization) */
548STATIC void
830247a4 549S_cl_init(pTHX_ RExC_state_t *pRExC_state, struct regnode_charclass_class *cl)
653099ff 550{
8ecf7187 551 Zero(cl, 1, struct regnode_charclass_class);
653099ff 552 cl->type = ANYOF;
830247a4 553 cl_anything(pRExC_state, cl);
653099ff
GS
554}
555
556STATIC void
830247a4 557S_cl_init_zero(pTHX_ RExC_state_t *pRExC_state, struct regnode_charclass_class *cl)
653099ff 558{
8ecf7187 559 Zero(cl, 1, struct regnode_charclass_class);
653099ff 560 cl->type = ANYOF;
830247a4 561 cl_anything(pRExC_state, cl);
653099ff
GS
562 if (LOC)
563 cl->flags |= ANYOF_LOCALE;
564}
565
566/* 'And' a given class with another one. Can create false positives */
567/* We assume that cl is not inverted */
568STATIC void
569S_cl_and(pTHX_ struct regnode_charclass_class *cl,
570 struct regnode_charclass_class *and_with)
571{
653099ff
GS
572 if (!(and_with->flags & ANYOF_CLASS)
573 && !(cl->flags & ANYOF_CLASS)
574 && (and_with->flags & ANYOF_LOCALE) == (cl->flags & ANYOF_LOCALE)
575 && !(and_with->flags & ANYOF_FOLD)
576 && !(cl->flags & ANYOF_FOLD)) {
577 int i;
578
579 if (and_with->flags & ANYOF_INVERT)
580 for (i = 0; i < ANYOF_BITMAP_SIZE; i++)
581 cl->bitmap[i] &= ~and_with->bitmap[i];
582 else
583 for (i = 0; i < ANYOF_BITMAP_SIZE; i++)
584 cl->bitmap[i] &= and_with->bitmap[i];
585 } /* XXXX: logic is complicated otherwise, leave it along for a moment. */
586 if (!(and_with->flags & ANYOF_EOS))
587 cl->flags &= ~ANYOF_EOS;
1aa99e6b
IH
588
589 if (cl->flags & ANYOF_UNICODE_ALL && and_with->flags & ANYOF_UNICODE) {
590 cl->flags &= ~ANYOF_UNICODE_ALL;
591 cl->flags |= ANYOF_UNICODE;
592 ARG_SET(cl, ARG(and_with));
593 }
594 if (!(and_with->flags & ANYOF_UNICODE_ALL))
595 cl->flags &= ~ANYOF_UNICODE_ALL;
596 if (!(and_with->flags & (ANYOF_UNICODE|ANYOF_UNICODE_ALL)))
597 cl->flags &= ~ANYOF_UNICODE;
653099ff
GS
598}
599
600/* 'OR' a given class with another one. Can create false positives */
601/* We assume that cl is not inverted */
602STATIC void
830247a4 603S_cl_or(pTHX_ RExC_state_t *pRExC_state, struct regnode_charclass_class *cl, struct regnode_charclass_class *or_with)
653099ff 604{
653099ff
GS
605 if (or_with->flags & ANYOF_INVERT) {
606 /* We do not use
607 * (B1 | CL1) | (!B2 & !CL2) = (B1 | !B2 & !CL2) | (CL1 | (!B2 & !CL2))
608 * <= (B1 | !B2) | (CL1 | !CL2)
609 * which is wasteful if CL2 is small, but we ignore CL2:
610 * (B1 | CL1) | (!B2 & !CL2) <= (B1 | CL1) | !B2 = (B1 | !B2) | CL1
611 * XXXX Can we handle case-fold? Unclear:
612 * (OK1(i) | OK1(i')) | !(OK1(i) | OK1(i')) =
613 * (OK1(i) | OK1(i')) | (!OK1(i) & !OK1(i'))
614 */
615 if ( (or_with->flags & ANYOF_LOCALE) == (cl->flags & ANYOF_LOCALE)
616 && !(or_with->flags & ANYOF_FOLD)
617 && !(cl->flags & ANYOF_FOLD) ) {
618 int i;
619
620 for (i = 0; i < ANYOF_BITMAP_SIZE; i++)
621 cl->bitmap[i] |= ~or_with->bitmap[i];
622 } /* XXXX: logic is complicated otherwise */
623 else {
830247a4 624 cl_anything(pRExC_state, cl);
653099ff
GS
625 }
626 } else {
627 /* (B1 | CL1) | (B2 | CL2) = (B1 | B2) | (CL1 | CL2)) */
628 if ( (or_with->flags & ANYOF_LOCALE) == (cl->flags & ANYOF_LOCALE)
b81d288d 629 && (!(or_with->flags & ANYOF_FOLD)
653099ff
GS
630 || (cl->flags & ANYOF_FOLD)) ) {
631 int i;
632
633 /* OR char bitmap and class bitmap separately */
634 for (i = 0; i < ANYOF_BITMAP_SIZE; i++)
635 cl->bitmap[i] |= or_with->bitmap[i];
636 if (or_with->flags & ANYOF_CLASS) {
637 for (i = 0; i < ANYOF_CLASSBITMAP_SIZE; i++)
638 cl->classflags[i] |= or_with->classflags[i];
639 cl->flags |= ANYOF_CLASS;
640 }
641 }
642 else { /* XXXX: logic is complicated, leave it along for a moment. */
830247a4 643 cl_anything(pRExC_state, cl);
653099ff
GS
644 }
645 }
646 if (or_with->flags & ANYOF_EOS)
647 cl->flags |= ANYOF_EOS;
1aa99e6b
IH
648
649 if (cl->flags & ANYOF_UNICODE && or_with->flags & ANYOF_UNICODE &&
650 ARG(cl) != ARG(or_with)) {
651 cl->flags |= ANYOF_UNICODE_ALL;
652 cl->flags &= ~ANYOF_UNICODE;
653 }
654 if (or_with->flags & ANYOF_UNICODE_ALL) {
655 cl->flags |= ANYOF_UNICODE_ALL;
656 cl->flags &= ~ANYOF_UNICODE;
657 }
653099ff
GS
658}
659
5d1c421c
JH
660/*
661 * There are strange code-generation bugs caused on sparc64 by gcc-2.95.2.
662 * These need to be revisited when a newer toolchain becomes available.
663 */
664#if defined(__sparc64__) && defined(__GNUC__)
665# if __GNUC__ < 2 || (__GNUC__ == 2 && __GNUC_MINOR__ < 96)
666# undef SPARC64_GCC_WORKAROUND
667# define SPARC64_GCC_WORKAROUND 1
668# endif
669#endif
670
653099ff
GS
671/* REx optimizer. Converts nodes into quickier variants "in place".
672 Finds fixed substrings. */
673
c277df42
IZ
674/* Stops at toplevel WHILEM as well as at `last'. At end *scanp is set
675 to the position after last scanned or to NULL. */
676
76e3520e 677STATIC I32
830247a4 678S_study_chunk(pTHX_ RExC_state_t *pRExC_state, regnode **scanp, I32 *deltap, regnode *last, scan_data_t *data, U32 flags)
c277df42
IZ
679 /* scanp: Start here (read-write). */
680 /* deltap: Write maxlen-minlen here. */
681 /* last: Stop before this one. */
682{
683 I32 min = 0, pars = 0, code;
684 regnode *scan = *scanp, *next;
685 I32 delta = 0;
686 int is_inf = (flags & SCF_DO_SUBSTR) && (data->flags & SF_IS_INF);
aca2d497 687 int is_inf_internal = 0; /* The studied chunk is infinite */
c277df42
IZ
688 I32 is_par = OP(scan) == OPEN ? ARG(scan) : 0;
689 scan_data_t data_fake;
653099ff 690 struct regnode_charclass_class and_with; /* Valid if flags & SCF_DO_STCLASS_OR */
b81d288d 691
c277df42
IZ
692 while (scan && OP(scan) != END && scan < last) {
693 /* Peephole optimizer: */
694
22c35a8c 695 if (PL_regkind[(U8)OP(scan)] == EXACT) {
653099ff 696 /* Merge several consecutive EXACTish nodes into one. */
c277df42
IZ
697 regnode *n = regnext(scan);
698 U32 stringok = 1;
699#ifdef DEBUGGING
700 regnode *stop = scan;
b81d288d 701#endif
c277df42 702
cd439c50 703 next = scan + NODE_SZ_STR(scan);
c277df42
IZ
704 /* Skip NOTHING, merge EXACT*. */
705 while (n &&
b81d288d 706 ( PL_regkind[(U8)OP(n)] == NOTHING ||
c277df42
IZ
707 (stringok && (OP(n) == OP(scan))))
708 && NEXT_OFF(n)
709 && NEXT_OFF(scan) + NEXT_OFF(n) < I16_MAX) {
710 if (OP(n) == TAIL || n > next)
711 stringok = 0;
22c35a8c 712 if (PL_regkind[(U8)OP(n)] == NOTHING) {
c277df42
IZ
713 NEXT_OFF(scan) += NEXT_OFF(n);
714 next = n + NODE_STEP_REGNODE;
715#ifdef DEBUGGING
716 if (stringok)
717 stop = n;
b81d288d 718#endif
c277df42 719 n = regnext(n);
a0ed51b3 720 }
f49d4d0f 721 else if (stringok) {
cd439c50 722 int oldl = STR_LEN(scan);
c277df42 723 regnode *nnext = regnext(n);
f49d4d0f 724
b81d288d 725 if (oldl + STR_LEN(n) > U8_MAX)
c277df42
IZ
726 break;
727 NEXT_OFF(scan) += NEXT_OFF(n);
cd439c50
IZ
728 STR_LEN(scan) += STR_LEN(n);
729 next = n + NODE_SZ_STR(n);
c277df42 730 /* Now we can overwrite *n : */
f49d4d0f 731 Move(STRING(n), STRING(scan) + oldl, STR_LEN(n), char);
c277df42 732#ifdef DEBUGGING
f49d4d0f 733 stop = next - 1;
b81d288d 734#endif
c277df42
IZ
735 n = nnext;
736 }
737 }
61a36c01 738
d65e4eab 739 if (UTF && OP(scan) == EXACTF && STR_LEN(scan) >= 6) {
61a36c01
JH
740/*
741 Two problematic code points in Unicode casefolding of EXACT nodes:
742
743 U+0390 - GREEK SMALL LETTER IOTA WITH DIALYTIKA AND TONOS
744 U+03B0 - GREEK SMALL LETTER UPSILON WITH DIALYTIKA AND TONOS
745
746 which casefold to
747
748 Unicode UTF-8
749
750 U+03B9 U+0308 U+0301 0xCE 0xB9 0xCC 0x88 0xCC 0x81
751 U+03C5 U+0308 U+0301 0xCF 0x85 0xCC 0x88 0xCC 0x81
752
753 This means that in case-insensitive matching (or "loose matching",
754 as Unicode calls it), an EXACTF of length six (the UTF-8 encoded byte
755 length of the above casefolded versions) can match a target string
756 of length two (the byte length of UTF-8 encoded U+0390 or U+03B0).
757 This would rather mess up the minimum length computation.
758
759 What we'll do is to look for the tail four bytes, and then peek
760 at the preceding two bytes to see whether we need to decrease
761 the minimum length by four (six minus two).
762
763 Thanks to the design of UTF-8, there cannot be false matches:
764 A sequence of valid UTF-8 bytes cannot be a subsequence of
765 another valid sequence of UTF-8 bytes.
766
767*/
768 char *s0 = STRING(scan), *s, *t;
769 char *s1 = s0 + STR_LEN(scan) - 1, *s2 = s1 - 4;
770 char *t0 = "\xcc\x88\xcc\x81";
771 char *t1 = t0 + 3;
772
773 for (s = s0 + 2;
774 s < s2 && (t = ninstr(s, s1, t0, t1));
775 s = t + 4) {
776 if (((U8)t[-1] == 0xB9 && (U8)t[-2] == 0xCE) ||
777 ((U8)t[-1] == 0x85 && (U8)t[-2] == 0xCF))
778 min -= 4;
779 }
780 }
781
c277df42
IZ
782#ifdef DEBUGGING
783 /* Allow dumping */
cd439c50 784 n = scan + NODE_SZ_STR(scan);
c277df42 785 while (n <= stop) {
22c35a8c 786 if (PL_regkind[(U8)OP(n)] != NOTHING || OP(n) == NOTHING) {
c277df42
IZ
787 OP(n) = OPTIMIZED;
788 NEXT_OFF(n) = 0;
789 }
790 n++;
791 }
653099ff 792#endif
c277df42 793 }
653099ff
GS
794 /* Follow the next-chain of the current node and optimize
795 away all the NOTHINGs from it. */
c277df42 796 if (OP(scan) != CURLYX) {
048cfca1
GS
797 int max = (reg_off_by_arg[OP(scan)]
798 ? I32_MAX
799 /* I32 may be smaller than U16 on CRAYs! */
800 : (I32_MAX < U16_MAX ? I32_MAX : U16_MAX));
c277df42
IZ
801 int off = (reg_off_by_arg[OP(scan)] ? ARG(scan) : NEXT_OFF(scan));
802 int noff;
803 regnode *n = scan;
b81d288d 804
c277df42
IZ
805 /* Skip NOTHING and LONGJMP. */
806 while ((n = regnext(n))
22c35a8c 807 && ((PL_regkind[(U8)OP(n)] == NOTHING && (noff = NEXT_OFF(n)))
c277df42
IZ
808 || ((OP(n) == LONGJMP) && (noff = ARG(n))))
809 && off + noff < max)
810 off += noff;
811 if (reg_off_by_arg[OP(scan)])
812 ARG(scan) = off;
b81d288d 813 else
c277df42
IZ
814 NEXT_OFF(scan) = off;
815 }
653099ff
GS
816 /* The principal pseudo-switch. Cannot be a switch, since we
817 look into several different things. */
b81d288d 818 if (OP(scan) == BRANCH || OP(scan) == BRANCHJ
c277df42
IZ
819 || OP(scan) == IFTHEN || OP(scan) == SUSPEND) {
820 next = regnext(scan);
821 code = OP(scan);
b81d288d
AB
822
823 if (OP(next) == code || code == IFTHEN || code == SUSPEND) {
c277df42 824 I32 max1 = 0, min1 = I32_MAX, num = 0;
653099ff 825 struct regnode_charclass_class accum;
c277df42 826
653099ff 827 if (flags & SCF_DO_SUBSTR) /* XXXX Add !SUSPEND? */
830247a4 828 scan_commit(pRExC_state, data); /* Cannot merge strings after this. */
653099ff 829 if (flags & SCF_DO_STCLASS)
830247a4 830 cl_init_zero(pRExC_state, &accum);
c277df42 831 while (OP(scan) == code) {
830247a4 832 I32 deltanext, minnext, f = 0, fake;
653099ff 833 struct regnode_charclass_class this_class;
c277df42
IZ
834
835 num++;
836 data_fake.flags = 0;
b81d288d 837 if (data) {
2c2d71f5 838 data_fake.whilem_c = data->whilem_c;
cb434fcc
IZ
839 data_fake.last_closep = data->last_closep;
840 }
841 else
842 data_fake.last_closep = &fake;
c277df42
IZ
843 next = regnext(scan);
844 scan = NEXTOPER(scan);
845 if (code != BRANCH)
846 scan = NEXTOPER(scan);
653099ff 847 if (flags & SCF_DO_STCLASS) {
830247a4 848 cl_init(pRExC_state, &this_class);
653099ff
GS
849 data_fake.start_class = &this_class;
850 f = SCF_DO_STCLASS_AND;
b81d288d 851 }
e1901655
IZ
852 if (flags & SCF_WHILEM_VISITED_POS)
853 f |= SCF_WHILEM_VISITED_POS;
653099ff 854 /* we suppose the run is continuous, last=next...*/
830247a4
IZ
855 minnext = study_chunk(pRExC_state, &scan, &deltanext,
856 next, &data_fake, f);
b81d288d 857 if (min1 > minnext)
c277df42
IZ
858 min1 = minnext;
859 if (max1 < minnext + deltanext)
860 max1 = minnext + deltanext;
861 if (deltanext == I32_MAX)
aca2d497 862 is_inf = is_inf_internal = 1;
c277df42
IZ
863 scan = next;
864 if (data_fake.flags & (SF_HAS_PAR|SF_IN_PAR))
865 pars++;
405ff068 866 if (data && (data_fake.flags & SF_HAS_EVAL))
c277df42 867 data->flags |= SF_HAS_EVAL;
2c2d71f5
JH
868 if (data)
869 data->whilem_c = data_fake.whilem_c;
653099ff 870 if (flags & SCF_DO_STCLASS)
830247a4 871 cl_or(pRExC_state, &accum, &this_class);
b81d288d 872 if (code == SUSPEND)
c277df42
IZ
873 break;
874 }
875 if (code == IFTHEN && num < 2) /* Empty ELSE branch */
876 min1 = 0;
877 if (flags & SCF_DO_SUBSTR) {
878 data->pos_min += min1;
879 data->pos_delta += max1 - min1;
880 if (max1 != min1 || is_inf)
881 data->longest = &(data->longest_float);
882 }
883 min += min1;
884 delta += max1 - min1;
653099ff 885 if (flags & SCF_DO_STCLASS_OR) {
830247a4 886 cl_or(pRExC_state, data->start_class, &accum);
653099ff
GS
887 if (min1) {
888 cl_and(data->start_class, &and_with);
889 flags &= ~SCF_DO_STCLASS;
890 }
891 }
892 else if (flags & SCF_DO_STCLASS_AND) {
de0c8cb8
GS
893 if (min1) {
894 cl_and(data->start_class, &accum);
653099ff 895 flags &= ~SCF_DO_STCLASS;
de0c8cb8
GS
896 }
897 else {
b81d288d 898 /* Switch to OR mode: cache the old value of
de0c8cb8
GS
899 * data->start_class */
900 StructCopy(data->start_class, &and_with,
901 struct regnode_charclass_class);
902 flags &= ~SCF_DO_STCLASS_AND;
903 StructCopy(&accum, data->start_class,
904 struct regnode_charclass_class);
905 flags |= SCF_DO_STCLASS_OR;
906 data->start_class->flags |= ANYOF_EOS;
907 }
653099ff 908 }
a0ed51b3
LW
909 }
910 else if (code == BRANCHJ) /* single branch is optimized. */
c277df42
IZ
911 scan = NEXTOPER(NEXTOPER(scan));
912 else /* single branch is optimized. */
913 scan = NEXTOPER(scan);
914 continue;
a0ed51b3
LW
915 }
916 else if (OP(scan) == EXACT) {
cd439c50 917 I32 l = STR_LEN(scan);
1aa99e6b 918 UV uc = *((U8*)STRING(scan));
a0ed51b3 919 if (UTF) {
1aa99e6b
IH
920 U8 *s = (U8*)STRING(scan);
921 l = utf8_length(s, s + l);
9041c2e3 922 uc = utf8_to_uvchr(s, NULL);
a0ed51b3
LW
923 }
924 min += l;
c277df42 925 if (flags & SCF_DO_SUBSTR) { /* Update longest substr. */
c277df42
IZ
926 /* The code below prefers earlier match for fixed
927 offset, later match for variable offset. */
928 if (data->last_end == -1) { /* Update the start info. */
929 data->last_start_min = data->pos_min;
930 data->last_start_max = is_inf
b81d288d 931 ? I32_MAX : data->pos_min + data->pos_delta;
c277df42 932 }
cd439c50 933 sv_catpvn(data->last_found, STRING(scan), STR_LEN(scan));
c277df42
IZ
934 data->last_end = data->pos_min + l;
935 data->pos_min += l; /* As in the first entry. */
936 data->flags &= ~SF_BEFORE_EOL;
937 }
653099ff
GS
938 if (flags & SCF_DO_STCLASS_AND) {
939 /* Check whether it is compatible with what we know already! */
940 int compat = 1;
941
1aa99e6b 942 if (uc >= 0x100 ||
516a5887 943 (!(data->start_class->flags & (ANYOF_CLASS | ANYOF_LOCALE))
1aa99e6b 944 && !ANYOF_BITMAP_TEST(data->start_class, uc)
653099ff 945 && (!(data->start_class->flags & ANYOF_FOLD)
1aa99e6b 946 || !ANYOF_BITMAP_TEST(data->start_class, PL_fold[uc])))
516a5887 947 )
653099ff
GS
948 compat = 0;
949 ANYOF_CLASS_ZERO(data->start_class);
950 ANYOF_BITMAP_ZERO(data->start_class);
951 if (compat)
1aa99e6b 952 ANYOF_BITMAP_SET(data->start_class, uc);
653099ff 953 data->start_class->flags &= ~ANYOF_EOS;
9b877dbb
IH
954 if (uc < 0x100)
955 data->start_class->flags &= ~ANYOF_UNICODE_ALL;
653099ff
GS
956 }
957 else if (flags & SCF_DO_STCLASS_OR) {
958 /* false positive possible if the class is case-folded */
1aa99e6b 959 if (uc < 0x100)
9b877dbb
IH
960 ANYOF_BITMAP_SET(data->start_class, uc);
961 else
962 data->start_class->flags |= ANYOF_UNICODE_ALL;
653099ff
GS
963 data->start_class->flags &= ~ANYOF_EOS;
964 cl_and(data->start_class, &and_with);
965 }
966 flags &= ~SCF_DO_STCLASS;
a0ed51b3 967 }
653099ff 968 else if (PL_regkind[(U8)OP(scan)] == EXACT) { /* But OP != EXACT! */
cd439c50 969 I32 l = STR_LEN(scan);
1aa99e6b 970 UV uc = *((U8*)STRING(scan));
653099ff
GS
971
972 /* Search for fixed substrings supports EXACT only. */
b81d288d 973 if (flags & SCF_DO_SUBSTR)
830247a4 974 scan_commit(pRExC_state, data);
a0ed51b3 975 if (UTF) {
1aa99e6b
IH
976 U8 *s = (U8 *)STRING(scan);
977 l = utf8_length(s, s + l);
9041c2e3 978 uc = utf8_to_uvchr(s, NULL);
a0ed51b3
LW
979 }
980 min += l;
c277df42 981 if (data && (flags & SCF_DO_SUBSTR))
a0ed51b3 982 data->pos_min += l;
653099ff
GS
983 if (flags & SCF_DO_STCLASS_AND) {
984 /* Check whether it is compatible with what we know already! */
985 int compat = 1;
986
1aa99e6b 987 if (uc >= 0x100 ||
516a5887 988 (!(data->start_class->flags & (ANYOF_CLASS | ANYOF_LOCALE))
1aa99e6b 989 && !ANYOF_BITMAP_TEST(data->start_class, uc)
516a5887 990 && !ANYOF_BITMAP_TEST(data->start_class, PL_fold[uc])))
653099ff
GS
991 compat = 0;
992 ANYOF_CLASS_ZERO(data->start_class);
993 ANYOF_BITMAP_ZERO(data->start_class);
994 if (compat) {
1aa99e6b 995 ANYOF_BITMAP_SET(data->start_class, uc);
653099ff
GS
996 data->start_class->flags &= ~ANYOF_EOS;
997 data->start_class->flags |= ANYOF_FOLD;
998 if (OP(scan) == EXACTFL)
999 data->start_class->flags |= ANYOF_LOCALE;
1000 }
1001 }
1002 else if (flags & SCF_DO_STCLASS_OR) {
1003 if (data->start_class->flags & ANYOF_FOLD) {
1004 /* false positive possible if the class is case-folded.
1005 Assume that the locale settings are the same... */
1aa99e6b
IH
1006 if (uc < 0x100)
1007 ANYOF_BITMAP_SET(data->start_class, uc);
653099ff
GS
1008 data->start_class->flags &= ~ANYOF_EOS;
1009 }
1010 cl_and(data->start_class, &and_with);
1011 }
1012 flags &= ~SCF_DO_STCLASS;
a0ed51b3 1013 }
4d61ec05 1014 else if (strchr((char*)PL_varies,OP(scan))) {
9c5ffd7c 1015 I32 mincount, maxcount, minnext, deltanext, fl = 0;
aa7a4b56 1016 I32 f = flags, pos_before = 0;
c277df42 1017 regnode *oscan = scan;
653099ff
GS
1018 struct regnode_charclass_class this_class;
1019 struct regnode_charclass_class *oclass = NULL;
727f22e3 1020 I32 next_is_eval = 0;
653099ff 1021
22c35a8c 1022 switch (PL_regkind[(U8)OP(scan)]) {
653099ff 1023 case WHILEM: /* End of (?:...)* . */
c277df42
IZ
1024 scan = NEXTOPER(scan);
1025 goto finish;
1026 case PLUS:
653099ff 1027 if (flags & (SCF_DO_SUBSTR | SCF_DO_STCLASS)) {
c277df42 1028 next = NEXTOPER(scan);
653099ff 1029 if (OP(next) == EXACT || (flags & SCF_DO_STCLASS)) {
b81d288d
AB
1030 mincount = 1;
1031 maxcount = REG_INFTY;
c277df42
IZ
1032 next = regnext(scan);
1033 scan = NEXTOPER(scan);
1034 goto do_curly;
1035 }
1036 }
1037 if (flags & SCF_DO_SUBSTR)
1038 data->pos_min++;
1039 min++;
1040 /* Fall through. */
1041 case STAR:
653099ff
GS
1042 if (flags & SCF_DO_STCLASS) {
1043 mincount = 0;
b81d288d 1044 maxcount = REG_INFTY;
653099ff
GS
1045 next = regnext(scan);
1046 scan = NEXTOPER(scan);
1047 goto do_curly;
1048 }
b81d288d 1049 is_inf = is_inf_internal = 1;
c277df42
IZ
1050 scan = regnext(scan);
1051 if (flags & SCF_DO_SUBSTR) {
830247a4 1052 scan_commit(pRExC_state, data); /* Cannot extend fixed substrings */
c277df42
IZ
1053 data->longest = &(data->longest_float);
1054 }
1055 goto optimize_curly_tail;
1056 case CURLY:
b81d288d 1057 mincount = ARG1(scan);
c277df42
IZ
1058 maxcount = ARG2(scan);
1059 next = regnext(scan);
cb434fcc
IZ
1060 if (OP(scan) == CURLYX) {
1061 I32 lp = (data ? *(data->last_closep) : 0);
1062
1063 scan->flags = ((lp <= U8_MAX) ? lp : U8_MAX);
1064 }
c277df42 1065 scan = NEXTOPER(scan) + EXTRA_STEP_2ARGS;
727f22e3 1066 next_is_eval = (OP(scan) == EVAL);
c277df42
IZ
1067 do_curly:
1068 if (flags & SCF_DO_SUBSTR) {
830247a4 1069 if (mincount == 0) scan_commit(pRExC_state,data); /* Cannot extend fixed substrings */
c277df42
IZ
1070 pos_before = data->pos_min;
1071 }
1072 if (data) {
1073 fl = data->flags;
1074 data->flags &= ~(SF_HAS_PAR|SF_IN_PAR|SF_HAS_EVAL);
1075 if (is_inf)
1076 data->flags |= SF_IS_INF;
1077 }
653099ff 1078 if (flags & SCF_DO_STCLASS) {
830247a4 1079 cl_init(pRExC_state, &this_class);
653099ff
GS
1080 oclass = data->start_class;
1081 data->start_class = &this_class;
1082 f |= SCF_DO_STCLASS_AND;
1083 f &= ~SCF_DO_STCLASS_OR;
1084 }
e1901655
IZ
1085 /* These are the cases when once a subexpression
1086 fails at a particular position, it cannot succeed
1087 even after backtracking at the enclosing scope.
b81d288d 1088
e1901655
IZ
1089 XXXX what if minimal match and we are at the
1090 initial run of {n,m}? */
1091 if ((mincount != maxcount - 1) && (maxcount != REG_INFTY))
1092 f &= ~SCF_WHILEM_VISITED_POS;
653099ff 1093
c277df42 1094 /* This will finish on WHILEM, setting scan, or on NULL: */
b81d288d
AB
1095 minnext = study_chunk(pRExC_state, &scan, &deltanext, last, data,
1096 mincount == 0
653099ff
GS
1097 ? (f & ~SCF_DO_SUBSTR) : f);
1098
1099 if (flags & SCF_DO_STCLASS)
1100 data->start_class = oclass;
1101 if (mincount == 0 || minnext == 0) {
1102 if (flags & SCF_DO_STCLASS_OR) {
830247a4 1103 cl_or(pRExC_state, data->start_class, &this_class);
653099ff
GS
1104 }
1105 else if (flags & SCF_DO_STCLASS_AND) {
b81d288d 1106 /* Switch to OR mode: cache the old value of
653099ff
GS
1107 * data->start_class */
1108 StructCopy(data->start_class, &and_with,
1109 struct regnode_charclass_class);
1110 flags &= ~SCF_DO_STCLASS_AND;
1111 StructCopy(&this_class, data->start_class,
1112 struct regnode_charclass_class);
1113 flags |= SCF_DO_STCLASS_OR;
1114 data->start_class->flags |= ANYOF_EOS;
1115 }
1116 } else { /* Non-zero len */
1117 if (flags & SCF_DO_STCLASS_OR) {
830247a4 1118 cl_or(pRExC_state, data->start_class, &this_class);
653099ff
GS
1119 cl_and(data->start_class, &and_with);
1120 }
1121 else if (flags & SCF_DO_STCLASS_AND)
1122 cl_and(data->start_class, &this_class);
1123 flags &= ~SCF_DO_STCLASS;
1124 }
c277df42
IZ
1125 if (!scan) /* It was not CURLYX, but CURLY. */
1126 scan = next;
84037bb0 1127 if (ckWARN(WARN_REGEXP)
727f22e3
JP
1128 /* ? quantifier ok, except for (?{ ... }) */
1129 && (next_is_eval || !(mincount == 0 && maxcount == 1))
84037bb0 1130 && (minnext == 0) && (deltanext == 0)
99799961 1131 && data && !(data->flags & (SF_HAS_PAR|SF_IN_PAR))
17feb5d5 1132 && maxcount <= REG_INFTY/3) /* Complement check for big count */
b45f050a 1133 {
830247a4 1134 vWARN(RExC_parse,
b45f050a
JF
1135 "Quantifier unexpected on zero-length expression");
1136 }
1137
c277df42 1138 min += minnext * mincount;
b81d288d 1139 is_inf_internal |= ((maxcount == REG_INFTY
155aba94
GS
1140 && (minnext + deltanext) > 0)
1141 || deltanext == I32_MAX);
aca2d497 1142 is_inf |= is_inf_internal;
c277df42
IZ
1143 delta += (minnext + deltanext) * maxcount - minnext * mincount;
1144
1145 /* Try powerful optimization CURLYX => CURLYN. */
b81d288d 1146 if ( OP(oscan) == CURLYX && data
c277df42
IZ
1147 && data->flags & SF_IN_PAR
1148 && !(data->flags & SF_HAS_EVAL)
1149 && !deltanext && minnext == 1 ) {
1150 /* Try to optimize to CURLYN. */
1151 regnode *nxt = NEXTOPER(oscan) + EXTRA_STEP_2ARGS;
497b47a8
JH
1152 regnode *nxt1 = nxt;
1153#ifdef DEBUGGING
1154 regnode *nxt2;
1155#endif
c277df42
IZ
1156
1157 /* Skip open. */
1158 nxt = regnext(nxt);
4d61ec05 1159 if (!strchr((char*)PL_simple,OP(nxt))
22c35a8c 1160 && !(PL_regkind[(U8)OP(nxt)] == EXACT
b81d288d 1161 && STR_LEN(nxt) == 1))
c277df42 1162 goto nogo;
497b47a8 1163#ifdef DEBUGGING
c277df42 1164 nxt2 = nxt;
497b47a8 1165#endif
c277df42 1166 nxt = regnext(nxt);
b81d288d 1167 if (OP(nxt) != CLOSE)
c277df42
IZ
1168 goto nogo;
1169 /* Now we know that nxt2 is the only contents: */
1170 oscan->flags = ARG(nxt);
1171 OP(oscan) = CURLYN;
1172 OP(nxt1) = NOTHING; /* was OPEN. */
1173#ifdef DEBUGGING
1174 OP(nxt1 + 1) = OPTIMIZED; /* was count. */
1175 NEXT_OFF(nxt1+ 1) = 0; /* just for consistancy. */
1176 NEXT_OFF(nxt2) = 0; /* just for consistancy with CURLY. */
1177 OP(nxt) = OPTIMIZED; /* was CLOSE. */
1178 OP(nxt + 1) = OPTIMIZED; /* was count. */
1179 NEXT_OFF(nxt+ 1) = 0; /* just for consistancy. */
b81d288d 1180#endif
c277df42 1181 }
c277df42
IZ
1182 nogo:
1183
1184 /* Try optimization CURLYX => CURLYM. */
b81d288d 1185 if ( OP(oscan) == CURLYX && data
c277df42 1186 && !(data->flags & SF_HAS_PAR)
c277df42
IZ
1187 && !(data->flags & SF_HAS_EVAL)
1188 && !deltanext ) {
1189 /* XXXX How to optimize if data == 0? */
1190 /* Optimize to a simpler form. */
1191 regnode *nxt = NEXTOPER(oscan) + EXTRA_STEP_2ARGS; /* OPEN */
1192 regnode *nxt2;
1193
1194 OP(oscan) = CURLYM;
1195 while ( (nxt2 = regnext(nxt)) /* skip over embedded stuff*/
b81d288d 1196 && (OP(nxt2) != WHILEM))
c277df42
IZ
1197 nxt = nxt2;
1198 OP(nxt2) = SUCCEED; /* Whas WHILEM */
c277df42
IZ
1199 /* Need to optimize away parenths. */
1200 if (data->flags & SF_IN_PAR) {
1201 /* Set the parenth number. */
1202 regnode *nxt1 = NEXTOPER(oscan) + EXTRA_STEP_2ARGS; /* OPEN*/
1203
b81d288d 1204 if (OP(nxt) != CLOSE)
b45f050a 1205 FAIL("Panic opt close");
c277df42
IZ
1206 oscan->flags = ARG(nxt);
1207 OP(nxt1) = OPTIMIZED; /* was OPEN. */
1208 OP(nxt) = OPTIMIZED; /* was CLOSE. */
1209#ifdef DEBUGGING
1210 OP(nxt1 + 1) = OPTIMIZED; /* was count. */
1211 OP(nxt + 1) = OPTIMIZED; /* was count. */
1212 NEXT_OFF(nxt1 + 1) = 0; /* just for consistancy. */
1213 NEXT_OFF(nxt + 1) = 0; /* just for consistancy. */
b81d288d 1214#endif
c277df42
IZ
1215#if 0
1216 while ( nxt1 && (OP(nxt1) != WHILEM)) {
1217 regnode *nnxt = regnext(nxt1);
b81d288d 1218
c277df42
IZ
1219 if (nnxt == nxt) {
1220 if (reg_off_by_arg[OP(nxt1)])
1221 ARG_SET(nxt1, nxt2 - nxt1);
1222 else if (nxt2 - nxt1 < U16_MAX)
1223 NEXT_OFF(nxt1) = nxt2 - nxt1;
1224 else
1225 OP(nxt) = NOTHING; /* Cannot beautify */
1226 }
1227 nxt1 = nnxt;
1228 }
1229#endif
1230 /* Optimize again: */
b81d288d 1231 study_chunk(pRExC_state, &nxt1, &deltanext, nxt,
e1901655 1232 NULL, 0);
a0ed51b3
LW
1233 }
1234 else
c277df42 1235 oscan->flags = 0;
c277df42 1236 }
e1901655
IZ
1237 else if ((OP(oscan) == CURLYX)
1238 && (flags & SCF_WHILEM_VISITED_POS)
1239 /* See the comment on a similar expression above.
1240 However, this time it not a subexpression
1241 we care about, but the expression itself. */
1242 && (maxcount == REG_INFTY)
1243 && data && ++data->whilem_c < 16) {
1244 /* This stays as CURLYX, we can put the count/of pair. */
2c2d71f5
JH
1245 /* Find WHILEM (as in regexec.c) */
1246 regnode *nxt = oscan + NEXT_OFF(oscan);
1247
1248 if (OP(PREVOPER(nxt)) == NOTHING) /* LONGJMP */
1249 nxt += ARG(nxt);
1250 PREVOPER(nxt)->flags = data->whilem_c
830247a4 1251 | (RExC_whilem_seen << 4); /* On WHILEM */
2c2d71f5 1252 }
b81d288d 1253 if (data && fl & (SF_HAS_PAR|SF_IN_PAR))
c277df42
IZ
1254 pars++;
1255 if (flags & SCF_DO_SUBSTR) {
1256 SV *last_str = Nullsv;
1257 int counted = mincount != 0;
1258
1259 if (data->last_end > 0 && mincount != 0) { /* Ends with a string. */
5d1c421c
JH
1260#if defined(SPARC64_GCC_WORKAROUND)
1261 I32 b = 0;
1262 STRLEN l = 0;
1263 char *s = NULL;
1264 I32 old = 0;
1265
1266 if (pos_before >= data->last_start_min)
1267 b = pos_before;
1268 else
1269 b = data->last_start_min;
1270
1271 l = 0;
1272 s = SvPV(data->last_found, l);
1273 old = b - data->last_start_min;
1274
1275#else
b81d288d 1276 I32 b = pos_before >= data->last_start_min
c277df42
IZ
1277 ? pos_before : data->last_start_min;
1278 STRLEN l;
1279 char *s = SvPV(data->last_found, l);
a0ed51b3 1280 I32 old = b - data->last_start_min;
5d1c421c 1281#endif
a0ed51b3
LW
1282
1283 if (UTF)
1284 old = utf8_hop((U8*)s, old) - (U8*)s;
c277df42 1285
a0ed51b3 1286 l -= old;
c277df42 1287 /* Get the added string: */
79cb57f6 1288 last_str = newSVpvn(s + old, l);
c277df42
IZ
1289 if (deltanext == 0 && pos_before == b) {
1290 /* What was added is a constant string */
1291 if (mincount > 1) {
1292 SvGROW(last_str, (mincount * l) + 1);
b81d288d 1293 repeatcpy(SvPVX(last_str) + l,
c277df42
IZ
1294 SvPVX(last_str), l, mincount - 1);
1295 SvCUR(last_str) *= mincount;
1296 /* Add additional parts. */
b81d288d 1297 SvCUR_set(data->last_found,
c277df42
IZ
1298 SvCUR(data->last_found) - l);
1299 sv_catsv(data->last_found, last_str);
1300 data->last_end += l * (mincount - 1);
1301 }
2a8d9689
HS
1302 } else {
1303 /* start offset must point into the last copy */
1304 data->last_start_min += minnext * (mincount - 1);
4b2cff9a
HS
1305 data->last_start_max += is_inf ? 0 : (maxcount - 1)
1306 * (minnext + data->pos_delta);
c277df42
IZ
1307 }
1308 }
1309 /* It is counted once already... */
1310 data->pos_min += minnext * (mincount - counted);
1311 data->pos_delta += - counted * deltanext +
1312 (minnext + deltanext) * maxcount - minnext * mincount;
1313 if (mincount != maxcount) {
653099ff
GS
1314 /* Cannot extend fixed substrings found inside
1315 the group. */
830247a4 1316 scan_commit(pRExC_state,data);
c277df42
IZ
1317 if (mincount && last_str) {
1318 sv_setsv(data->last_found, last_str);
1319 data->last_end = data->pos_min;
b81d288d 1320 data->last_start_min =
a0ed51b3 1321 data->pos_min - CHR_SVLEN(last_str);
b81d288d
AB
1322 data->last_start_max = is_inf
1323 ? I32_MAX
c277df42 1324 : data->pos_min + data->pos_delta
a0ed51b3 1325 - CHR_SVLEN(last_str);
c277df42
IZ
1326 }
1327 data->longest = &(data->longest_float);
1328 }
aca2d497 1329 SvREFCNT_dec(last_str);
c277df42 1330 }
405ff068 1331 if (data && (fl & SF_HAS_EVAL))
c277df42
IZ
1332 data->flags |= SF_HAS_EVAL;
1333 optimize_curly_tail:
c277df42 1334 if (OP(oscan) != CURLYX) {
22c35a8c 1335 while (PL_regkind[(U8)OP(next = regnext(oscan))] == NOTHING
c277df42
IZ
1336 && NEXT_OFF(next))
1337 NEXT_OFF(oscan) += NEXT_OFF(next);
1338 }
c277df42 1339 continue;
653099ff 1340 default: /* REF and CLUMP only? */
c277df42 1341 if (flags & SCF_DO_SUBSTR) {
830247a4 1342 scan_commit(pRExC_state,data); /* Cannot expect anything... */
c277df42
IZ
1343 data->longest = &(data->longest_float);
1344 }
aca2d497 1345 is_inf = is_inf_internal = 1;
653099ff 1346 if (flags & SCF_DO_STCLASS_OR)
830247a4 1347 cl_anything(pRExC_state, data->start_class);
653099ff 1348 flags &= ~SCF_DO_STCLASS;
c277df42
IZ
1349 break;
1350 }
a0ed51b3 1351 }
ffc61ed2 1352 else if (strchr((char*)PL_simple,OP(scan))) {
9c5ffd7c 1353 int value = 0;
653099ff 1354
c277df42 1355 if (flags & SCF_DO_SUBSTR) {
830247a4 1356 scan_commit(pRExC_state,data);
c277df42
IZ
1357 data->pos_min++;
1358 }
1359 min++;
653099ff
GS
1360 if (flags & SCF_DO_STCLASS) {
1361 data->start_class->flags &= ~ANYOF_EOS; /* No match on empty */
1362
1363 /* Some of the logic below assumes that switching
1364 locale on will only add false positives. */
1365 switch (PL_regkind[(U8)OP(scan)]) {
653099ff 1366 case SANY:
653099ff
GS
1367 default:
1368 do_default:
1369 /* Perl_croak(aTHX_ "panic: unexpected simple REx opcode %d", OP(scan)); */
1370 if (flags & SCF_DO_STCLASS_OR) /* Allow everything */
830247a4 1371 cl_anything(pRExC_state, data->start_class);
653099ff
GS
1372 break;
1373 case REG_ANY:
1374 if (OP(scan) == SANY)
1375 goto do_default;
1376 if (flags & SCF_DO_STCLASS_OR) { /* Everything but \n */
1377 value = (ANYOF_BITMAP_TEST(data->start_class,'\n')
1378 || (data->start_class->flags & ANYOF_CLASS));
830247a4 1379 cl_anything(pRExC_state, data->start_class);
653099ff
GS
1380 }
1381 if (flags & SCF_DO_STCLASS_AND || !value)
1382 ANYOF_BITMAP_CLEAR(data->start_class,'\n');
1383 break;
1384 case ANYOF:
1385 if (flags & SCF_DO_STCLASS_AND)
1386 cl_and(data->start_class,
1387 (struct regnode_charclass_class*)scan);
1388 else
830247a4 1389 cl_or(pRExC_state, data->start_class,
653099ff
GS
1390 (struct regnode_charclass_class*)scan);
1391 break;
1392 case ALNUM:
1393 if (flags & SCF_DO_STCLASS_AND) {
1394 if (!(data->start_class->flags & ANYOF_LOCALE)) {
1395 ANYOF_CLASS_CLEAR(data->start_class,ANYOF_NALNUM);
1396 for (value = 0; value < 256; value++)
1397 if (!isALNUM(value))
1398 ANYOF_BITMAP_CLEAR(data->start_class, value);
1399 }
1400 }
1401 else {
1402 if (data->start_class->flags & ANYOF_LOCALE)
1403 ANYOF_CLASS_SET(data->start_class,ANYOF_ALNUM);
1404 else {
1405 for (value = 0; value < 256; value++)
1406 if (isALNUM(value))
b81d288d 1407 ANYOF_BITMAP_SET(data->start_class, value);
653099ff
GS
1408 }
1409 }
1410 break;
1411 case ALNUML:
1412 if (flags & SCF_DO_STCLASS_AND) {
1413 if (data->start_class->flags & ANYOF_LOCALE)
1414 ANYOF_CLASS_CLEAR(data->start_class,ANYOF_NALNUM);
1415 }
1416 else {
1417 ANYOF_CLASS_SET(data->start_class,ANYOF_ALNUM);
1418 data->start_class->flags |= ANYOF_LOCALE;
1419 }
1420 break;
1421 case NALNUM:
1422 if (flags & SCF_DO_STCLASS_AND) {
1423 if (!(data->start_class->flags & ANYOF_LOCALE)) {
1424 ANYOF_CLASS_CLEAR(data->start_class,ANYOF_ALNUM);
1425 for (value = 0; value < 256; value++)
1426 if (isALNUM(value))
1427 ANYOF_BITMAP_CLEAR(data->start_class, value);
1428 }
1429 }
1430 else {
1431 if (data->start_class->flags & ANYOF_LOCALE)
1432 ANYOF_CLASS_SET(data->start_class,ANYOF_NALNUM);
1433 else {
1434 for (value = 0; value < 256; value++)
1435 if (!isALNUM(value))
b81d288d 1436 ANYOF_BITMAP_SET(data->start_class, value);
653099ff
GS
1437 }
1438 }
1439 break;
1440 case NALNUML:
1441 if (flags & SCF_DO_STCLASS_AND) {
1442 if (data->start_class->flags & ANYOF_LOCALE)
1443 ANYOF_CLASS_CLEAR(data->start_class,ANYOF_ALNUM);
1444 }
1445 else {
1446 data->start_class->flags |= ANYOF_LOCALE;
1447 ANYOF_CLASS_SET(data->start_class,ANYOF_NALNUM);
1448 }
1449 break;
1450 case SPACE:
1451 if (flags & SCF_DO_STCLASS_AND) {
1452 if (!(data->start_class->flags & ANYOF_LOCALE)) {
1453 ANYOF_CLASS_CLEAR(data->start_class,ANYOF_NSPACE);
1454 for (value = 0; value < 256; value++)
1455 if (!isSPACE(value))
1456 ANYOF_BITMAP_CLEAR(data->start_class, value);
1457 }
1458 }
1459 else {
1460 if (data->start_class->flags & ANYOF_LOCALE)
1461 ANYOF_CLASS_SET(data->start_class,ANYOF_SPACE);
1462 else {
1463 for (value = 0; value < 256; value++)
1464 if (isSPACE(value))
b81d288d 1465 ANYOF_BITMAP_SET(data->start_class, value);
653099ff
GS
1466 }
1467 }
1468 break;
1469 case SPACEL:
1470 if (flags & SCF_DO_STCLASS_AND) {
1471 if (data->start_class->flags & ANYOF_LOCALE)
1472 ANYOF_CLASS_CLEAR(data->start_class,ANYOF_NSPACE);
1473 }
1474 else {
1475 data->start_class->flags |= ANYOF_LOCALE;
1476 ANYOF_CLASS_SET(data->start_class,ANYOF_SPACE);
1477 }
1478 break;
1479 case NSPACE:
1480 if (flags & SCF_DO_STCLASS_AND) {
1481 if (!(data->start_class->flags & ANYOF_LOCALE)) {
1482 ANYOF_CLASS_CLEAR(data->start_class,ANYOF_SPACE);
1483 for (value = 0; value < 256; value++)
1484 if (isSPACE(value))
1485 ANYOF_BITMAP_CLEAR(data->start_class, value);
1486 }
1487 }
1488 else {
1489 if (data->start_class->flags & ANYOF_LOCALE)
1490 ANYOF_CLASS_SET(data->start_class,ANYOF_NSPACE);
1491 else {
1492 for (value = 0; value < 256; value++)
1493 if (!isSPACE(value))
b81d288d 1494 ANYOF_BITMAP_SET(data->start_class, value);
653099ff
GS
1495 }
1496 }
1497 break;
1498 case NSPACEL:
1499 if (flags & SCF_DO_STCLASS_AND) {
1500 if (data->start_class->flags & ANYOF_LOCALE) {
1501 ANYOF_CLASS_CLEAR(data->start_class,ANYOF_SPACE);
1502 for (value = 0; value < 256; value++)
1503 if (!isSPACE(value))
1504 ANYOF_BITMAP_CLEAR(data->start_class, value);
1505 }
1506 }
1507 else {
1508 data->start_class->flags |= ANYOF_LOCALE;
1509 ANYOF_CLASS_SET(data->start_class,ANYOF_NSPACE);
1510 }
1511 break;
1512 case DIGIT:
1513 if (flags & SCF_DO_STCLASS_AND) {
1514 ANYOF_CLASS_CLEAR(data->start_class,ANYOF_NDIGIT);
1515 for (value = 0; value < 256; value++)
1516 if (!isDIGIT(value))
1517 ANYOF_BITMAP_CLEAR(data->start_class, value);
1518 }
1519 else {
1520 if (data->start_class->flags & ANYOF_LOCALE)
1521 ANYOF_CLASS_SET(data->start_class,ANYOF_DIGIT);
1522 else {
1523 for (value = 0; value < 256; value++)
1524 if (isDIGIT(value))
b81d288d 1525 ANYOF_BITMAP_SET(data->start_class, value);
653099ff
GS
1526 }
1527 }
1528 break;
1529 case NDIGIT:
1530 if (flags & SCF_DO_STCLASS_AND) {
1531 ANYOF_CLASS_CLEAR(data->start_class,ANYOF_DIGIT);
1532 for (value = 0; value < 256; value++)
1533 if (isDIGIT(value))
1534 ANYOF_BITMAP_CLEAR(data->start_class, value);
1535 }
1536 else {
1537 if (data->start_class->flags & ANYOF_LOCALE)
1538 ANYOF_CLASS_SET(data->start_class,ANYOF_NDIGIT);
1539 else {
1540 for (value = 0; value < 256; value++)
1541 if (!isDIGIT(value))
b81d288d 1542 ANYOF_BITMAP_SET(data->start_class, value);
653099ff
GS
1543 }
1544 }
1545 break;
1546 }
1547 if (flags & SCF_DO_STCLASS_OR)
1548 cl_and(data->start_class, &and_with);
1549 flags &= ~SCF_DO_STCLASS;
1550 }
a0ed51b3 1551 }
22c35a8c 1552 else if (PL_regkind[(U8)OP(scan)] == EOL && flags & SCF_DO_SUBSTR) {
c277df42
IZ
1553 data->flags |= (OP(scan) == MEOL
1554 ? SF_BEFORE_MEOL
1555 : SF_BEFORE_SEOL);
a0ed51b3 1556 }
653099ff
GS
1557 else if ( PL_regkind[(U8)OP(scan)] == BRANCHJ
1558 /* Lookbehind, or need to calculate parens/evals/stclass: */
1559 && (scan->flags || data || (flags & SCF_DO_STCLASS))
c277df42 1560 && (OP(scan) == IFMATCH || OP(scan) == UNLESSM)) {
653099ff 1561 /* Lookahead/lookbehind */
cb434fcc 1562 I32 deltanext, minnext, fake = 0;
c277df42 1563 regnode *nscan;
653099ff
GS
1564 struct regnode_charclass_class intrnl;
1565 int f = 0;
c277df42
IZ
1566
1567 data_fake.flags = 0;
b81d288d 1568 if (data) {
2c2d71f5 1569 data_fake.whilem_c = data->whilem_c;
cb434fcc
IZ
1570 data_fake.last_closep = data->last_closep;
1571 }
1572 else
1573 data_fake.last_closep = &fake;
653099ff
GS
1574 if ( flags & SCF_DO_STCLASS && !scan->flags
1575 && OP(scan) == IFMATCH ) { /* Lookahead */
830247a4 1576 cl_init(pRExC_state, &intrnl);
653099ff 1577 data_fake.start_class = &intrnl;
e1901655 1578 f |= SCF_DO_STCLASS_AND;
653099ff 1579 }
e1901655
IZ
1580 if (flags & SCF_WHILEM_VISITED_POS)
1581 f |= SCF_WHILEM_VISITED_POS;
c277df42
IZ
1582 next = regnext(scan);
1583 nscan = NEXTOPER(NEXTOPER(scan));
830247a4 1584 minnext = study_chunk(pRExC_state, &nscan, &deltanext, last, &data_fake, f);
c277df42
IZ
1585 if (scan->flags) {
1586 if (deltanext) {
9baa0206 1587 vFAIL("Variable length lookbehind not implemented");
a0ed51b3
LW
1588 }
1589 else if (minnext > U8_MAX) {
9baa0206 1590 vFAIL2("Lookbehind longer than %"UVuf" not implemented", (UV)U8_MAX);
c277df42
IZ
1591 }
1592 scan->flags = minnext;
1593 }
1594 if (data && data_fake.flags & (SF_HAS_PAR|SF_IN_PAR))
1595 pars++;
405ff068 1596 if (data && (data_fake.flags & SF_HAS_EVAL))
c277df42 1597 data->flags |= SF_HAS_EVAL;
2c2d71f5
JH
1598 if (data)
1599 data->whilem_c = data_fake.whilem_c;
e1901655 1600 if (f & SCF_DO_STCLASS_AND) {
653099ff
GS
1601 int was = (data->start_class->flags & ANYOF_EOS);
1602
1603 cl_and(data->start_class, &intrnl);
1604 if (was)
1605 data->start_class->flags |= ANYOF_EOS;
1606 }
a0ed51b3
LW
1607 }
1608 else if (OP(scan) == OPEN) {
c277df42 1609 pars++;
a0ed51b3 1610 }
cb434fcc
IZ
1611 else if (OP(scan) == CLOSE) {
1612 if (ARG(scan) == is_par) {
1613 next = regnext(scan);
c277df42 1614
cb434fcc
IZ
1615 if ( next && (OP(next) != WHILEM) && next < last)
1616 is_par = 0; /* Disable optimization */
1617 }
1618 if (data)
1619 *(data->last_closep) = ARG(scan);
a0ed51b3
LW
1620 }
1621 else if (OP(scan) == EVAL) {
c277df42
IZ
1622 if (data)
1623 data->flags |= SF_HAS_EVAL;
1624 }
96776eda 1625 else if (OP(scan) == LOGICAL && scan->flags == 2) { /* Embedded follows */
0f5d15d6 1626 if (flags & SCF_DO_SUBSTR) {
830247a4 1627 scan_commit(pRExC_state,data);
0f5d15d6
IZ
1628 data->longest = &(data->longest_float);
1629 }
1630 is_inf = is_inf_internal = 1;
653099ff 1631 if (flags & SCF_DO_STCLASS_OR) /* Allow everything */
830247a4 1632 cl_anything(pRExC_state, data->start_class);
96776eda 1633 flags &= ~SCF_DO_STCLASS;
0f5d15d6 1634 }
c277df42
IZ
1635 /* Else: zero-length, ignore. */
1636 scan = regnext(scan);
1637 }
1638
1639 finish:
1640 *scanp = scan;
aca2d497 1641 *deltap = is_inf_internal ? I32_MAX : delta;
b81d288d 1642 if (flags & SCF_DO_SUBSTR && is_inf)
c277df42
IZ
1643 data->pos_delta = I32_MAX - data->pos_min;
1644 if (is_par > U8_MAX)
1645 is_par = 0;
1646 if (is_par && pars==1 && data) {
1647 data->flags |= SF_IN_PAR;
1648 data->flags &= ~SF_HAS_PAR;
a0ed51b3
LW
1649 }
1650 else if (pars && data) {
c277df42
IZ
1651 data->flags |= SF_HAS_PAR;
1652 data->flags &= ~SF_IN_PAR;
1653 }
653099ff
GS
1654 if (flags & SCF_DO_STCLASS_OR)
1655 cl_and(data->start_class, &and_with);
c277df42
IZ
1656 return min;
1657}
1658
76e3520e 1659STATIC I32
830247a4 1660S_add_data(pTHX_ RExC_state_t *pRExC_state, I32 n, char *s)
c277df42 1661{
830247a4 1662 if (RExC_rx->data) {
b81d288d
AB
1663 Renewc(RExC_rx->data,
1664 sizeof(*RExC_rx->data) + sizeof(void*) * (RExC_rx->data->count + n - 1),
c277df42 1665 char, struct reg_data);
830247a4
IZ
1666 Renew(RExC_rx->data->what, RExC_rx->data->count + n, U8);
1667 RExC_rx->data->count += n;
a0ed51b3
LW
1668 }
1669 else {
830247a4 1670 Newc(1207, RExC_rx->data, sizeof(*RExC_rx->data) + sizeof(void*) * (n - 1),
c277df42 1671 char, struct reg_data);
830247a4
IZ
1672 New(1208, RExC_rx->data->what, n, U8);
1673 RExC_rx->data->count = n;
c277df42 1674 }
830247a4
IZ
1675 Copy(s, RExC_rx->data->what + RExC_rx->data->count - n, n, U8);
1676 return RExC_rx->data->count - n;
c277df42
IZ
1677}
1678
d88dccdf 1679void
864dbfa3 1680Perl_reginitcolors(pTHX)
d88dccdf 1681{
d88dccdf
IZ
1682 int i = 0;
1683 char *s = PerlEnv_getenv("PERL_RE_COLORS");
b81d288d 1684
d88dccdf
IZ
1685 if (s) {
1686 PL_colors[0] = s = savepv(s);
1687 while (++i < 6) {
1688 s = strchr(s, '\t');
1689 if (s) {
1690 *s = '\0';
1691 PL_colors[i] = ++s;
1692 }
1693 else
c712d376 1694 PL_colors[i] = s = "";
d88dccdf
IZ
1695 }
1696 } else {
b81d288d 1697 while (i < 6)
d88dccdf
IZ
1698 PL_colors[i++] = "";
1699 }
1700 PL_colorset = 1;
1701}
1702
8615cb43 1703
a687059c 1704/*
e50aee73 1705 - pregcomp - compile a regular expression into internal code
a687059c
LW
1706 *
1707 * We can't allocate space until we know how big the compiled form will be,
1708 * but we can't compile it (and thus know how big it is) until we've got a
1709 * place to put the code. So we cheat: we compile it twice, once with code
1710 * generation turned off and size counting turned on, and once "for real".
1711 * This also means that we don't allocate space until we are sure that the
1712 * thing really will compile successfully, and we never have to move the
1713 * code and thus invalidate pointers into it. (Note that it has to be in
1714 * one piece because free() must be able to free it all.) [NB: not true in perl]
1715 *
1716 * Beware that the optimization-preparation code in here knows about some
1717 * of the structure of the compiled regexp. [I'll say.]
1718 */
1719regexp *
864dbfa3 1720Perl_pregcomp(pTHX_ char *exp, char *xend, PMOP *pm)
a687059c 1721{
a0d0e21e 1722 register regexp *r;
c277df42 1723 regnode *scan;
c277df42 1724 regnode *first;
a0d0e21e 1725 I32 flags;
a0d0e21e
LW
1726 I32 minlen = 0;
1727 I32 sawplus = 0;
1728 I32 sawopen = 0;
2c2d71f5 1729 scan_data_t data;
830247a4
IZ
1730 RExC_state_t RExC_state;
1731 RExC_state_t *pRExC_state = &RExC_state;
a0d0e21e
LW
1732
1733 if (exp == NULL)
c277df42 1734 FAIL("NULL regexp argument");
a0d0e21e 1735
a5961de5 1736 RExC_utf8 = pm->op_pmdynflags & PMdf_CMP_UTF8;
a0ed51b3 1737
5cfc7842 1738 RExC_precomp = exp;
a5961de5
JH
1739 DEBUG_r({
1740 if (!PL_colorset) reginitcolors();
1741 PerlIO_printf(Perl_debug_log, "%sCompiling REx%s `%s%*s%s'\n",
1742 PL_colors[4],PL_colors[5],PL_colors[0],
1743 (int)(xend - exp), RExC_precomp, PL_colors[1]);
1744 });
830247a4
IZ
1745 RExC_flags16 = pm->op_pmflags;
1746 RExC_sawback = 0;
bbce6d69 1747
830247a4
IZ
1748 RExC_seen = 0;
1749 RExC_seen_zerolen = *exp == '^' ? -1 : 0;
1750 RExC_seen_evals = 0;
1751 RExC_extralen = 0;
c277df42 1752
bbce6d69 1753 /* First pass: determine size, legality. */
830247a4 1754 RExC_parse = exp;
fac92740 1755 RExC_start = exp;
830247a4
IZ
1756 RExC_end = xend;
1757 RExC_naughty = 0;
1758 RExC_npar = 1;
1759 RExC_size = 0L;
1760 RExC_emit = &PL_regdummy;
1761 RExC_whilem_seen = 0;
85ddcde9
JH
1762#if 0 /* REGC() is (currently) a NOP at the first pass.
1763 * Clever compilers notice this and complain. --jhi */
830247a4 1764 REGC((U8)REG_MAGIC, (char*)RExC_emit);
85ddcde9 1765#endif
830247a4 1766 if (reg(pRExC_state, 0, &flags) == NULL) {
830247a4 1767 RExC_precomp = Nullch;
a0d0e21e
LW
1768 return(NULL);
1769 }
830247a4 1770 DEBUG_r(PerlIO_printf(Perl_debug_log, "size %"IVdf" ", (IV)RExC_size));
c277df42 1771
c277df42
IZ
1772 /* Small enough for pointer-storage convention?
1773 If extralen==0, this means that we will not need long jumps. */
830247a4
IZ
1774 if (RExC_size >= 0x10000L && RExC_extralen)
1775 RExC_size += RExC_extralen;
c277df42 1776 else
830247a4
IZ
1777 RExC_extralen = 0;
1778 if (RExC_whilem_seen > 15)
1779 RExC_whilem_seen = 15;
a0d0e21e 1780
bbce6d69 1781 /* Allocate space and initialize. */
830247a4 1782 Newc(1001, r, sizeof(regexp) + (unsigned)RExC_size * sizeof(regnode),
c277df42 1783 char, regexp);
a0d0e21e 1784 if (r == NULL)
b45f050a
JF
1785 FAIL("Regexp out of space");
1786
0f79a09d
GS
1787#ifdef DEBUGGING
1788 /* avoid reading uninitialized memory in DEBUGGING code in study_chunk() */
830247a4 1789 Zero(r, sizeof(regexp) + (unsigned)RExC_size * sizeof(regnode), char);
0f79a09d 1790#endif
c277df42 1791 r->refcnt = 1;
bbce6d69 1792 r->prelen = xend - exp;
5cfc7842 1793 r->precomp = savepvn(RExC_precomp, r->prelen);
cf93c79d
IZ
1794 r->subbeg = NULL;
1795 r->reganch = pm->op_pmflags & PMf_COMPILETIME;
830247a4 1796 r->nparens = RExC_npar - 1; /* set early to validate backrefs */
4327152a
IZ
1797
1798 r->substrs = 0; /* Useful during FAIL. */
1799 r->startp = 0; /* Useful during FAIL. */
1800 r->endp = 0; /* Useful during FAIL. */
1801
fac92740
MJD
1802 Newz(1304, r->offsets, 2*RExC_size+1, U32); /* MJD 20001228 */
1803 if (r->offsets) {
1804 r->offsets[0] = RExC_size;
1805 }
1806 DEBUG_r(PerlIO_printf(Perl_debug_log,
392fbf5d 1807 "%s %"UVuf" bytes for offset annotations.\n",
fac92740 1808 r->offsets ? "Got" : "Couldn't get",
392fbf5d 1809 (UV)((2*RExC_size+1) * sizeof(U32))));
fac92740 1810
830247a4 1811 RExC_rx = r;
bbce6d69 1812
1813 /* Second pass: emit code. */
055bb491 1814 RExC_flags16 = pm->op_pmflags; /* don't let top level (?i) bleed */
830247a4
IZ
1815 RExC_parse = exp;
1816 RExC_end = xend;
1817 RExC_naughty = 0;
1818 RExC_npar = 1;
fac92740 1819 RExC_emit_start = r->program;
830247a4 1820 RExC_emit = r->program;
2cd61cdb 1821 /* Store the count of eval-groups for security checks: */
830247a4
IZ
1822 RExC_emit->next_off = ((RExC_seen_evals > U16_MAX) ? U16_MAX : RExC_seen_evals);
1823 REGC((U8)REG_MAGIC, (char*) RExC_emit++);
c277df42 1824 r->data = 0;
830247a4 1825 if (reg(pRExC_state, 0, &flags) == NULL)
a0d0e21e
LW
1826 return(NULL);
1827
1828 /* Dig out information for optimizations. */
cf93c79d 1829 r->reganch = pm->op_pmflags & PMf_COMPILETIME; /* Again? */
830247a4 1830 pm->op_pmflags = RExC_flags16;
a0ed51b3 1831 if (UTF)
5ff6fc6d 1832 r->reganch |= ROPT_UTF8; /* Unicode in it? */
c277df42 1833 r->regstclass = NULL;
830247a4 1834 if (RExC_naughty >= 10) /* Probably an expensive pattern. */
a0ed51b3 1835 r->reganch |= ROPT_NAUGHTY;
c277df42 1836 scan = r->program + 1; /* First BRANCH. */
2779dcf1
IZ
1837
1838 /* XXXX To minimize changes to RE engine we always allocate
1839 3-units-long substrs field. */
1840 Newz(1004, r->substrs, 1, struct reg_substr_data);
1841
2c2d71f5 1842 StructCopy(&zero_scan_data, &data, scan_data_t);
653099ff 1843 /* XXXX Should not we check for something else? Usually it is OPEN1... */
c277df42 1844 if (OP(scan) != BRANCH) { /* Only one top-level choice. */
c277df42 1845 I32 fake;
c5254dd6 1846 STRLEN longest_float_length, longest_fixed_length;
653099ff
GS
1847 struct regnode_charclass_class ch_class;
1848 int stclass_flag;
cb434fcc 1849 I32 last_close = 0;
a0d0e21e
LW
1850
1851 first = scan;
c277df42 1852 /* Skip introductions and multiplicators >= 1. */
a0d0e21e 1853 while ((OP(first) == OPEN && (sawopen = 1)) ||
653099ff 1854 /* An OR of *one* alternative - should not happen now. */
a0d0e21e
LW
1855 (OP(first) == BRANCH && OP(regnext(first)) != BRANCH) ||
1856 (OP(first) == PLUS) ||
1857 (OP(first) == MINMOD) ||
653099ff 1858 /* An {n,m} with n>0 */
22c35a8c 1859 (PL_regkind[(U8)OP(first)] == CURLY && ARG1(first) > 0) ) {
a0d0e21e
LW
1860 if (OP(first) == PLUS)
1861 sawplus = 1;
1862 else
1863 first += regarglen[(U8)OP(first)];
1864 first = NEXTOPER(first);
a687059c
LW
1865 }
1866
a0d0e21e
LW
1867 /* Starting-point info. */
1868 again:
653099ff 1869 if (PL_regkind[(U8)OP(first)] == EXACT) {
1aa99e6b
IH
1870 if (OP(first) == EXACT)
1871 ; /* Empty, get anchored substr later. */
1872 else if ((OP(first) == EXACTF || OP(first) == EXACTFL))
b3c9acc1
IZ
1873 r->regstclass = first;
1874 }
653099ff 1875 else if (strchr((char*)PL_simple,OP(first)))
a0d0e21e 1876 r->regstclass = first;
22c35a8c
GS
1877 else if (PL_regkind[(U8)OP(first)] == BOUND ||
1878 PL_regkind[(U8)OP(first)] == NBOUND)
a0d0e21e 1879 r->regstclass = first;
22c35a8c 1880 else if (PL_regkind[(U8)OP(first)] == BOL) {
cad2e5aa
JH
1881 r->reganch |= (OP(first) == MBOL
1882 ? ROPT_ANCH_MBOL
1883 : (OP(first) == SBOL
1884 ? ROPT_ANCH_SBOL
1885 : ROPT_ANCH_BOL));
a0d0e21e 1886 first = NEXTOPER(first);
774d564b 1887 goto again;
1888 }
1889 else if (OP(first) == GPOS) {
1890 r->reganch |= ROPT_ANCH_GPOS;
1891 first = NEXTOPER(first);
1892 goto again;
a0d0e21e 1893 }
e09294f4 1894 else if (!sawopen && (OP(first) == STAR &&
22c35a8c 1895 PL_regkind[(U8)OP(NEXTOPER(first))] == REG_ANY) &&
a0d0e21e
LW
1896 !(r->reganch & ROPT_ANCH) )
1897 {
1898 /* turn .* into ^.* with an implied $*=1 */
cad2e5aa
JH
1899 int type = OP(NEXTOPER(first));
1900
ffc61ed2 1901 if (type == REG_ANY)
cad2e5aa
JH
1902 type = ROPT_ANCH_MBOL;
1903 else
1904 type = ROPT_ANCH_SBOL;
1905
1906 r->reganch |= type | ROPT_IMPLICIT;
a0d0e21e 1907 first = NEXTOPER(first);
774d564b 1908 goto again;
a0d0e21e 1909 }
b81d288d 1910 if (sawplus && (!sawopen || !RExC_sawback)
830247a4 1911 && !(RExC_seen & REG_SEEN_EVAL)) /* May examine pos and $& */
cad2e5aa
JH
1912 /* x+ must match at the 1st pos of run of x's */
1913 r->reganch |= ROPT_SKIP;
a0d0e21e 1914
c277df42 1915 /* Scan is after the zeroth branch, first is atomic matcher. */
b81d288d 1916 DEBUG_r(PerlIO_printf(Perl_debug_log, "first at %"IVdf"\n",
b900a521 1917 (IV)(first - scan + 1)));
a0d0e21e
LW
1918 /*
1919 * If there's something expensive in the r.e., find the
1920 * longest literal string that must appear and make it the
1921 * regmust. Resolve ties in favor of later strings, since
1922 * the regstart check works with the beginning of the r.e.
1923 * and avoiding duplication strengthens checking. Not a
1924 * strong reason, but sufficient in the absence of others.
1925 * [Now we resolve ties in favor of the earlier string if
c277df42 1926 * it happens that c_offset_min has been invalidated, since the
a0d0e21e
LW
1927 * earlier string may buy us something the later one won't.]
1928 */
a0d0e21e 1929 minlen = 0;
a687059c 1930
79cb57f6
GS
1931 data.longest_fixed = newSVpvn("",0);
1932 data.longest_float = newSVpvn("",0);
1933 data.last_found = newSVpvn("",0);
c277df42
IZ
1934 data.longest = &(data.longest_fixed);
1935 first = scan;
653099ff 1936 if (!r->regstclass) {
830247a4 1937 cl_init(pRExC_state, &ch_class);
653099ff
GS
1938 data.start_class = &ch_class;
1939 stclass_flag = SCF_DO_STCLASS_AND;
1940 } else /* XXXX Check for BOUND? */
1941 stclass_flag = 0;
cb434fcc 1942 data.last_closep = &last_close;
653099ff 1943
830247a4 1944 minlen = study_chunk(pRExC_state, &first, &fake, scan + RExC_size, /* Up to end */
e1901655 1945 &data, SCF_DO_SUBSTR | SCF_WHILEM_VISITED_POS | stclass_flag);
830247a4 1946 if ( RExC_npar == 1 && data.longest == &(data.longest_fixed)
b81d288d 1947 && data.last_start_min == 0 && data.last_end > 0
830247a4
IZ
1948 && !RExC_seen_zerolen
1949 && (!(RExC_seen & REG_SEEN_GPOS) || (r->reganch & ROPT_ANCH_GPOS)))
c277df42 1950 r->reganch |= ROPT_CHECK_ALL;
830247a4 1951 scan_commit(pRExC_state, &data);
c277df42
IZ
1952 SvREFCNT_dec(data.last_found);
1953
a0ed51b3 1954 longest_float_length = CHR_SVLEN(data.longest_float);
c5254dd6 1955 if (longest_float_length
c277df42
IZ
1956 || (data.flags & SF_FL_BEFORE_EOL
1957 && (!(data.flags & SF_FL_BEFORE_MEOL)
830247a4 1958 || (RExC_flags16 & PMf_MULTILINE)))) {
cf93c79d
IZ
1959 int t;
1960
a0ed51b3 1961 if (SvCUR(data.longest_fixed) /* ok to leave SvCUR */
aca2d497
IZ
1962 && data.offset_fixed == data.offset_float_min
1963 && SvCUR(data.longest_fixed) == SvCUR(data.longest_float))
1964 goto remove_float; /* As in (a)+. */
1965
c277df42
IZ
1966 r->float_substr = data.longest_float;
1967 r->float_min_offset = data.offset_float_min;
1968 r->float_max_offset = data.offset_float_max;
cf93c79d
IZ
1969 t = (data.flags & SF_FL_BEFORE_EOL /* Can't have SEOL and MULTI */
1970 && (!(data.flags & SF_FL_BEFORE_MEOL)
830247a4 1971 || (RExC_flags16 & PMf_MULTILINE)));
cf93c79d 1972 fbm_compile(r->float_substr, t ? FBMcf_TAIL : 0);
a0ed51b3
LW
1973 }
1974 else {
aca2d497 1975 remove_float:
c277df42
IZ
1976 r->float_substr = Nullsv;
1977 SvREFCNT_dec(data.longest_float);
c5254dd6 1978 longest_float_length = 0;
a0d0e21e 1979 }
c277df42 1980
a0ed51b3 1981 longest_fixed_length = CHR_SVLEN(data.longest_fixed);
c5254dd6 1982 if (longest_fixed_length
c277df42
IZ
1983 || (data.flags & SF_FIX_BEFORE_EOL /* Cannot have SEOL and MULTI */
1984 && (!(data.flags & SF_FIX_BEFORE_MEOL)
830247a4 1985 || (RExC_flags16 & PMf_MULTILINE)))) {
cf93c79d
IZ
1986 int t;
1987
c277df42
IZ
1988 r->anchored_substr = data.longest_fixed;
1989 r->anchored_offset = data.offset_fixed;
cf93c79d
IZ
1990 t = (data.flags & SF_FIX_BEFORE_EOL /* Can't have SEOL and MULTI */
1991 && (!(data.flags & SF_FIX_BEFORE_MEOL)
830247a4 1992 || (RExC_flags16 & PMf_MULTILINE)));
cf93c79d 1993 fbm_compile(r->anchored_substr, t ? FBMcf_TAIL : 0);
a0ed51b3
LW
1994 }
1995 else {
c277df42
IZ
1996 r->anchored_substr = Nullsv;
1997 SvREFCNT_dec(data.longest_fixed);
c5254dd6 1998 longest_fixed_length = 0;
a0d0e21e 1999 }
b81d288d 2000 if (r->regstclass
ffc61ed2 2001 && (OP(r->regstclass) == REG_ANY || OP(r->regstclass) == SANY))
653099ff
GS
2002 r->regstclass = NULL;
2003 if ((!r->anchored_substr || r->anchored_offset) && stclass_flag
2004 && !(data.start_class->flags & ANYOF_EOS)
2005 && !cl_is_anything(data.start_class)) {
830247a4 2006 I32 n = add_data(pRExC_state, 1, "f");
653099ff 2007
b81d288d 2008 New(1006, RExC_rx->data->data[n], 1,
653099ff
GS
2009 struct regnode_charclass_class);
2010 StructCopy(data.start_class,
830247a4 2011 (struct regnode_charclass_class*)RExC_rx->data->data[n],
653099ff 2012 struct regnode_charclass_class);
830247a4 2013 r->regstclass = (regnode*)RExC_rx->data->data[n];
653099ff 2014 r->reganch &= ~ROPT_SKIP; /* Used in find_byclass(). */
ffc61ed2 2015 PL_regdata = r->data; /* for regprop() */
9c5ffd7c
JH
2016 DEBUG_r({ SV *sv = sv_newmortal();
2017 regprop(sv, (regnode*)data.start_class);
2018 PerlIO_printf(Perl_debug_log,
2019 "synthetic stclass `%s'.\n",
2020 SvPVX(sv));});
653099ff 2021 }
c277df42
IZ
2022
2023 /* A temporary algorithm prefers floated substr to fixed one to dig more info. */
c5254dd6 2024 if (longest_fixed_length > longest_float_length) {
c277df42
IZ
2025 r->check_substr = r->anchored_substr;
2026 r->check_offset_min = r->check_offset_max = r->anchored_offset;
2027 if (r->reganch & ROPT_ANCH_SINGLE)
2028 r->reganch |= ROPT_NOSCAN;
a0ed51b3
LW
2029 }
2030 else {
c277df42
IZ
2031 r->check_substr = r->float_substr;
2032 r->check_offset_min = data.offset_float_min;
2033 r->check_offset_max = data.offset_float_max;
a0d0e21e 2034 }
30382c73
IZ
2035 /* XXXX Currently intuiting is not compatible with ANCH_GPOS.
2036 This should be changed ASAP! */
2037 if (r->check_substr && !(r->reganch & ROPT_ANCH_GPOS)) {
cad2e5aa
JH
2038 r->reganch |= RE_USE_INTUIT;
2039 if (SvTAIL(r->check_substr))
2040 r->reganch |= RE_INTUIT_TAIL;
2041 }
a0ed51b3
LW
2042 }
2043 else {
c277df42
IZ
2044 /* Several toplevels. Best we can is to set minlen. */
2045 I32 fake;
653099ff 2046 struct regnode_charclass_class ch_class;
cb434fcc 2047 I32 last_close = 0;
c277df42
IZ
2048
2049 DEBUG_r(PerlIO_printf(Perl_debug_log, "\n"));
2050 scan = r->program + 1;
830247a4 2051 cl_init(pRExC_state, &ch_class);
653099ff 2052 data.start_class = &ch_class;
cb434fcc 2053 data.last_closep = &last_close;
e1901655 2054 minlen = study_chunk(pRExC_state, &scan, &fake, scan + RExC_size, &data, SCF_DO_STCLASS_AND|SCF_WHILEM_VISITED_POS);
c277df42 2055 r->check_substr = r->anchored_substr = r->float_substr = Nullsv;
653099ff
GS
2056 if (!(data.start_class->flags & ANYOF_EOS)
2057 && !cl_is_anything(data.start_class)) {
830247a4 2058 I32 n = add_data(pRExC_state, 1, "f");
653099ff 2059
b81d288d 2060 New(1006, RExC_rx->data->data[n], 1,
653099ff
GS
2061 struct regnode_charclass_class);
2062 StructCopy(data.start_class,
830247a4 2063 (struct regnode_charclass_class*)RExC_rx->data->data[n],
653099ff 2064 struct regnode_charclass_class);
830247a4 2065 r->regstclass = (regnode*)RExC_rx->data->data[n];
653099ff 2066 r->reganch &= ~ROPT_SKIP; /* Used in find_byclass(). */
9c5ffd7c
JH
2067 DEBUG_r({ SV* sv = sv_newmortal();
2068 regprop(sv, (regnode*)data.start_class);
2069 PerlIO_printf(Perl_debug_log,
2070 "synthetic stclass `%s'.\n",
2071 SvPVX(sv));});
653099ff 2072 }
a0d0e21e
LW
2073 }
2074
a0d0e21e 2075 r->minlen = minlen;
b81d288d 2076 if (RExC_seen & REG_SEEN_GPOS)
c277df42 2077 r->reganch |= ROPT_GPOS_SEEN;
830247a4 2078 if (RExC_seen & REG_SEEN_LOOKBEHIND)
c277df42 2079 r->reganch |= ROPT_LOOKBEHIND_SEEN;
830247a4 2080 if (RExC_seen & REG_SEEN_EVAL)
ce862d02 2081 r->reganch |= ROPT_EVAL_SEEN;
f33976b4
DB
2082 if (RExC_seen & REG_SEEN_CANY)
2083 r->reganch |= ROPT_CANY_SEEN;
830247a4
IZ
2084 Newz(1002, r->startp, RExC_npar, I32);
2085 Newz(1002, r->endp, RExC_npar, I32);
ffc61ed2 2086 PL_regdata = r->data; /* for regprop() */
a0d0e21e
LW
2087 DEBUG_r(regdump(r));
2088 return(r);
a687059c
LW
2089}
2090
2091/*
2092 - reg - regular expression, i.e. main body or parenthesized thing
2093 *
2094 * Caller must absorb opening parenthesis.
2095 *
2096 * Combining parenthesis handling with the base level of regular expression
2097 * is a trifle forced, but the need to tie the tails of the branches to what
2098 * follows makes it hard to avoid.
2099 */
76e3520e 2100STATIC regnode *
830247a4 2101S_reg(pTHX_ RExC_state_t *pRExC_state, I32 paren, I32 *flagp)
c277df42 2102 /* paren: Parenthesized? 0=top, 1=(, inside: changed to letter. */
a687059c 2103{
c277df42
IZ
2104 register regnode *ret; /* Will be the head of the group. */
2105 register regnode *br;
2106 register regnode *lastbr;
2107 register regnode *ender = 0;
a0d0e21e 2108 register I32 parno = 0;
830247a4 2109 I32 flags, oregflags = RExC_flags16, have_branch = 0, open = 0;
9d1d55b5
JP
2110
2111 /* for (?g), (?gc), and (?o) warnings; warning
2112 about (?c) will warn about (?g) -- japhy */
2113
2114 I32 wastedflags = 0x00,
2115 wasted_o = 0x01,
2116 wasted_g = 0x02,
2117 wasted_gc = 0x02 | 0x04,
2118 wasted_c = 0x04;
2119
fac92740 2120 char * parse_start = RExC_parse; /* MJD */
830247a4 2121 char *oregcomp_parse = RExC_parse;
c277df42 2122 char c;
a0d0e21e 2123
821b33a5 2124 *flagp = 0; /* Tentatively. */
a0d0e21e 2125
9d1d55b5 2126
a0d0e21e
LW
2127 /* Make an OPEN node, if parenthesized. */
2128 if (paren) {
fac92740 2129 if (*RExC_parse == '?') { /* (?...) */
ca9dfc88
IZ
2130 U16 posflags = 0, negflags = 0;
2131 U16 *flagsp = &posflags;
0f5d15d6 2132 int logical = 0;
830247a4 2133 char *seqstart = RExC_parse;
ca9dfc88 2134
830247a4
IZ
2135 RExC_parse++;
2136 paren = *RExC_parse++;
c277df42 2137 ret = NULL; /* For look-ahead/behind. */
a0d0e21e 2138 switch (paren) {
fac92740 2139 case '<': /* (?<...) */
830247a4 2140 RExC_seen |= REG_SEEN_LOOKBEHIND;
b81d288d 2141 if (*RExC_parse == '!')
c277df42 2142 paren = ',';
b81d288d 2143 if (*RExC_parse != '=' && *RExC_parse != '!')
c277df42 2144 goto unknown;
830247a4 2145 RExC_parse++;
fac92740
MJD
2146 case '=': /* (?=...) */
2147 case '!': /* (?!...) */
830247a4 2148 RExC_seen_zerolen++;
fac92740
MJD
2149 case ':': /* (?:...) */
2150 case '>': /* (?>...) */
a0d0e21e 2151 break;
fac92740
MJD
2152 case '$': /* (?$...) */
2153 case '@': /* (?@...) */
8615cb43 2154 vFAIL2("Sequence (?%c...) not implemented", (int)paren);
a0d0e21e 2155 break;
fac92740 2156 case '#': /* (?#...) */
830247a4
IZ
2157 while (*RExC_parse && *RExC_parse != ')')
2158 RExC_parse++;
2159 if (*RExC_parse != ')')
c277df42 2160 FAIL("Sequence (?#... not terminated");
830247a4 2161 nextchar(pRExC_state);
a0d0e21e
LW
2162 *flagp = TRYAGAIN;
2163 return NULL;
fac92740 2164 case 'p': /* (?p...) */
12bcd1a6 2165 if (SIZE_ONLY && ckWARN3(WARN_DEPRECATED, WARN_REGEXP, WARN_SYNTAX))
f9373011 2166 vWARNdep(RExC_parse, "(?p{}) is deprecated - use (??{})");
8c8ad484 2167 /* FALL THROUGH*/
fac92740 2168 case '?': /* (??...) */
0f5d15d6 2169 logical = 1;
438a3801
YST
2170 if (*RExC_parse != '{')
2171 goto unknown;
830247a4 2172 paren = *RExC_parse++;
0f5d15d6 2173 /* FALL THROUGH */
fac92740 2174 case '{': /* (?{...}) */
c277df42 2175 {
c277df42
IZ
2176 I32 count = 1, n = 0;
2177 char c;
830247a4 2178 char *s = RExC_parse;
c277df42
IZ
2179 SV *sv;
2180 OP_4tree *sop, *rop;
2181
830247a4
IZ
2182 RExC_seen_zerolen++;
2183 RExC_seen |= REG_SEEN_EVAL;
2184 while (count && (c = *RExC_parse)) {
2185 if (c == '\\' && RExC_parse[1])
2186 RExC_parse++;
b81d288d 2187 else if (c == '{')
c277df42 2188 count++;
b81d288d 2189 else if (c == '}')
c277df42 2190 count--;
830247a4 2191 RExC_parse++;
c277df42 2192 }
830247a4 2193 if (*RExC_parse != ')')
b45f050a 2194 {
b81d288d 2195 RExC_parse = s;
b45f050a
JF
2196 vFAIL("Sequence (?{...}) not terminated or not {}-balanced");
2197 }
c277df42
IZ
2198 if (!SIZE_ONLY) {
2199 AV *av;
b81d288d
AB
2200
2201 if (RExC_parse - 1 - s)
830247a4 2202 sv = newSVpvn(s, RExC_parse - 1 - s);
c277df42 2203 else
79cb57f6 2204 sv = newSVpvn("", 0);
c277df42 2205
569233ed
SB
2206 ENTER;
2207 Perl_save_re_context(aTHX);
c277df42 2208 rop = sv_compile_2op(sv, &sop, "re", &av);
9b978d73
DM
2209 sop->op_private |= OPpREFCOUNTED;
2210 /* re_dup will OpREFCNT_inc */
2211 OpREFCNT_set(sop, 1);
569233ed 2212 LEAVE;
c277df42 2213
830247a4
IZ
2214 n = add_data(pRExC_state, 3, "nop");
2215 RExC_rx->data->data[n] = (void*)rop;
2216 RExC_rx->data->data[n+1] = (void*)sop;
2217 RExC_rx->data->data[n+2] = (void*)av;
c277df42 2218 SvREFCNT_dec(sv);
a0ed51b3 2219 }
e24b16f9 2220 else { /* First pass */
830247a4 2221 if (PL_reginterp_cnt < ++RExC_seen_evals
e24b16f9 2222 && PL_curcop != &PL_compiling)
2cd61cdb
IZ
2223 /* No compiled RE interpolated, has runtime
2224 components ===> unsafe. */
2225 FAIL("Eval-group not allowed at runtime, use re 'eval'");
5b61d3f7 2226 if (PL_tainting && PL_tainted)
cc6b7395 2227 FAIL("Eval-group in insecure regular expression");
c277df42
IZ
2228 }
2229
830247a4 2230 nextchar(pRExC_state);
0f5d15d6 2231 if (logical) {
830247a4 2232 ret = reg_node(pRExC_state, LOGICAL);
0f5d15d6
IZ
2233 if (!SIZE_ONLY)
2234 ret->flags = 2;
830247a4 2235 regtail(pRExC_state, ret, reganode(pRExC_state, EVAL, n));
fac92740 2236 /* deal with the length of this later - MJD */
0f5d15d6
IZ
2237 return ret;
2238 }
830247a4 2239 return reganode(pRExC_state, EVAL, n);
c277df42 2240 }
fac92740 2241 case '(': /* (?(?{...})...) and (?(?=...)...) */
c277df42 2242 {
fac92740 2243 if (RExC_parse[0] == '?') { /* (?(?...)) */
b81d288d
AB
2244 if (RExC_parse[1] == '=' || RExC_parse[1] == '!'
2245 || RExC_parse[1] == '<'
830247a4 2246 || RExC_parse[1] == '{') { /* Lookahead or eval. */
c277df42
IZ
2247 I32 flag;
2248
830247a4 2249 ret = reg_node(pRExC_state, LOGICAL);
0f5d15d6
IZ
2250 if (!SIZE_ONLY)
2251 ret->flags = 1;
830247a4 2252 regtail(pRExC_state, ret, reg(pRExC_state, 1, &flag));
c277df42 2253 goto insert_if;
b81d288d 2254 }
a0ed51b3 2255 }
830247a4 2256 else if (RExC_parse[0] >= '1' && RExC_parse[0] <= '9' ) {
fac92740 2257 /* (?(1)...) */
830247a4 2258 parno = atoi(RExC_parse++);
c277df42 2259
830247a4
IZ
2260 while (isDIGIT(*RExC_parse))
2261 RExC_parse++;
fac92740
MJD
2262 ret = reganode(pRExC_state, GROUPP, parno);
2263
830247a4 2264 if ((c = *nextchar(pRExC_state)) != ')')
b45f050a 2265 vFAIL("Switch condition not recognized");
c277df42 2266 insert_if:
830247a4
IZ
2267 regtail(pRExC_state, ret, reganode(pRExC_state, IFTHEN, 0));
2268 br = regbranch(pRExC_state, &flags, 1);
c277df42 2269 if (br == NULL)
830247a4 2270 br = reganode(pRExC_state, LONGJMP, 0);
c277df42 2271 else
830247a4
IZ
2272 regtail(pRExC_state, br, reganode(pRExC_state, LONGJMP, 0));
2273 c = *nextchar(pRExC_state);
d1b80229
IZ
2274 if (flags&HASWIDTH)
2275 *flagp |= HASWIDTH;
c277df42 2276 if (c == '|') {
830247a4
IZ
2277 lastbr = reganode(pRExC_state, IFTHEN, 0); /* Fake one for optimizer. */
2278 regbranch(pRExC_state, &flags, 1);
2279 regtail(pRExC_state, ret, lastbr);
d1b80229
IZ
2280 if (flags&HASWIDTH)
2281 *flagp |= HASWIDTH;
830247a4 2282 c = *nextchar(pRExC_state);
a0ed51b3
LW
2283 }
2284 else
c277df42
IZ
2285 lastbr = NULL;
2286 if (c != ')')
8615cb43 2287 vFAIL("Switch (?(condition)... contains too many branches");
830247a4
IZ
2288 ender = reg_node(pRExC_state, TAIL);
2289 regtail(pRExC_state, br, ender);
c277df42 2290 if (lastbr) {
830247a4
IZ
2291 regtail(pRExC_state, lastbr, ender);
2292 regtail(pRExC_state, NEXTOPER(NEXTOPER(lastbr)), ender);
a0ed51b3
LW
2293 }
2294 else
830247a4 2295 regtail(pRExC_state, ret, ender);
c277df42 2296 return ret;
a0ed51b3
LW
2297 }
2298 else {
830247a4 2299 vFAIL2("Unknown switch condition (?(%.2s", RExC_parse);
c277df42
IZ
2300 }
2301 }
1b1626e4 2302 case 0:
830247a4 2303 RExC_parse--; /* for vFAIL to print correctly */
8615cb43 2304 vFAIL("Sequence (? incomplete");
1b1626e4 2305 break;
a0d0e21e 2306 default:
830247a4 2307 --RExC_parse;
fac92740 2308 parse_flags: /* (?i) */
830247a4 2309 while (*RExC_parse && strchr("iogcmsx", *RExC_parse)) {
9d1d55b5
JP
2310 /* (?g), (?gc) and (?o) are useless here
2311 and must be globally applied -- japhy */
2312
2313 if (*RExC_parse == 'o' || *RExC_parse == 'g') {
2314 if (SIZE_ONLY && ckWARN(WARN_REGEXP)) {
2315 I32 wflagbit = *RExC_parse == 'o' ? wasted_o : wasted_g;
2316 if (! (wastedflags & wflagbit) ) {
2317 wastedflags |= wflagbit;
2318 vWARN5(
2319 RExC_parse + 1,
2320 "Useless (%s%c) - %suse /%c modifier",
2321 flagsp == &negflags ? "?-" : "?",
2322 *RExC_parse,
2323 flagsp == &negflags ? "don't " : "",
2324 *RExC_parse
2325 );
2326 }
2327 }
2328 }
2329 else if (*RExC_parse == 'c') {
2330 if (SIZE_ONLY && ckWARN(WARN_REGEXP)) {
2331 if (! (wastedflags & wasted_c) ) {
2332 wastedflags |= wasted_gc;
2333 vWARN3(
2334 RExC_parse + 1,
2335 "Useless (%sc) - %suse /gc modifier",
2336 flagsp == &negflags ? "?-" : "?",
2337 flagsp == &negflags ? "don't " : ""
2338 );
2339 }
2340 }
2341 }
2342 else { pmflag(flagsp, *RExC_parse); }
2343
830247a4 2344 ++RExC_parse;
ca9dfc88 2345 }
830247a4 2346 if (*RExC_parse == '-') {
ca9dfc88 2347 flagsp = &negflags;
9d1d55b5 2348 wastedflags = 0; /* reset so (?g-c) warns twice */
830247a4 2349 ++RExC_parse;
ca9dfc88 2350 goto parse_flags;
48c036b1 2351 }
830247a4
IZ
2352 RExC_flags16 |= posflags;
2353 RExC_flags16 &= ~negflags;
2354 if (*RExC_parse == ':') {
2355 RExC_parse++;
ca9dfc88
IZ
2356 paren = ':';
2357 break;
2358 }
c277df42 2359 unknown:
830247a4
IZ
2360 if (*RExC_parse != ')') {
2361 RExC_parse++;
2362 vFAIL3("Sequence (%.*s...) not recognized", RExC_parse-seqstart, seqstart);
b45f050a 2363 }
830247a4 2364 nextchar(pRExC_state);
a0d0e21e
LW
2365 *flagp = TRYAGAIN;
2366 return NULL;
2367 }
2368 }
fac92740 2369 else { /* (...) */
830247a4
IZ
2370 parno = RExC_npar;
2371 RExC_npar++;
2372 ret = reganode(pRExC_state, OPEN, parno);
fac92740
MJD
2373 Set_Node_Length(ret, 1); /* MJD */
2374 Set_Node_Offset(ret, RExC_parse); /* MJD */
c277df42 2375 open = 1;
a0d0e21e 2376 }
a0ed51b3 2377 }
fac92740 2378 else /* ! paren */
a0d0e21e
LW
2379 ret = NULL;
2380
2381 /* Pick up the branches, linking them together. */
fac92740 2382 parse_start = RExC_parse; /* MJD */
830247a4 2383 br = regbranch(pRExC_state, &flags, 1);
fac92740
MJD
2384 /* branch_len = (paren != 0); */
2385
a0d0e21e
LW
2386 if (br == NULL)
2387 return(NULL);
830247a4
IZ
2388 if (*RExC_parse == '|') {
2389 if (!SIZE_ONLY && RExC_extralen) {
2390 reginsert(pRExC_state, BRANCHJ, br);
a0ed51b3 2391 }
fac92740 2392 else { /* MJD */
830247a4 2393 reginsert(pRExC_state, BRANCH, br);
fac92740
MJD
2394 Set_Node_Length(br, paren != 0);
2395 Set_Node_Offset_To_R(br-RExC_emit_start, parse_start-RExC_start);
2396 }
c277df42
IZ
2397 have_branch = 1;
2398 if (SIZE_ONLY)
830247a4 2399 RExC_extralen += 1; /* For BRANCHJ-BRANCH. */
a0ed51b3
LW
2400 }
2401 else if (paren == ':') {
c277df42
IZ
2402 *flagp |= flags&SIMPLE;
2403 }
2404 if (open) { /* Starts with OPEN. */
830247a4 2405 regtail(pRExC_state, ret, br); /* OPEN -> first. */
a0ed51b3
LW
2406 }
2407 else if (paren != '?') /* Not Conditional */
a0d0e21e 2408 ret = br;
32a0ca98 2409 *flagp |= flags & (SPSTART | HASWIDTH);
c277df42 2410 lastbr = br;
830247a4
IZ
2411 while (*RExC_parse == '|') {
2412 if (!SIZE_ONLY && RExC_extralen) {
2413 ender = reganode(pRExC_state, LONGJMP,0);
2414 regtail(pRExC_state, NEXTOPER(NEXTOPER(lastbr)), ender); /* Append to the previous. */
c277df42
IZ
2415 }
2416 if (SIZE_ONLY)
830247a4
IZ
2417 RExC_extralen += 2; /* Account for LONGJMP. */
2418 nextchar(pRExC_state);
2419 br = regbranch(pRExC_state, &flags, 0);
fac92740 2420
a687059c 2421 if (br == NULL)
a0d0e21e 2422 return(NULL);
830247a4 2423 regtail(pRExC_state, lastbr, br); /* BRANCH -> BRANCH. */
c277df42 2424 lastbr = br;
821b33a5
IZ
2425 if (flags&HASWIDTH)
2426 *flagp |= HASWIDTH;
a687059c 2427 *flagp |= flags&SPSTART;
a0d0e21e
LW
2428 }
2429
c277df42
IZ
2430 if (have_branch || paren != ':') {
2431 /* Make a closing node, and hook it on the end. */
2432 switch (paren) {
2433 case ':':
830247a4 2434 ender = reg_node(pRExC_state, TAIL);
c277df42
IZ
2435 break;
2436 case 1:
830247a4 2437 ender = reganode(pRExC_state, CLOSE, parno);
fac92740
MJD
2438 Set_Node_Offset(ender,RExC_parse+1); /* MJD */
2439 Set_Node_Length(ender,1); /* MJD */
c277df42
IZ
2440 break;
2441 case '<':
c277df42
IZ
2442 case ',':
2443 case '=':
2444 case '!':
c277df42 2445 *flagp &= ~HASWIDTH;
821b33a5
IZ
2446 /* FALL THROUGH */
2447 case '>':
830247a4 2448 ender = reg_node(pRExC_state, SUCCEED);
c277df42
IZ
2449 break;
2450 case 0:
830247a4 2451 ender = reg_node(pRExC_state, END);
c277df42
IZ
2452 break;
2453 }
830247a4 2454 regtail(pRExC_state, lastbr, ender);
a0d0e21e 2455
c277df42
IZ
2456 if (have_branch) {
2457 /* Hook the tails of the branches to the closing node. */
2458 for (br = ret; br != NULL; br = regnext(br)) {
830247a4 2459 regoptail(pRExC_state, br, ender);
c277df42
IZ
2460 }
2461 }
a0d0e21e 2462 }
c277df42
IZ
2463
2464 {
2465 char *p;
2466 static char parens[] = "=!<,>";
2467
2468 if (paren && (p = strchr(parens, paren))) {
2469 int node = ((p - parens) % 2) ? UNLESSM : IFMATCH;
2470 int flag = (p - parens) > 1;
2471
2472 if (paren == '>')
2473 node = SUSPEND, flag = 0;
830247a4 2474 reginsert(pRExC_state, node,ret);
c277df42 2475 ret->flags = flag;
830247a4 2476 regtail(pRExC_state, ret, reg_node(pRExC_state, TAIL));
c277df42 2477 }
a0d0e21e
LW
2478 }
2479
2480 /* Check for proper termination. */
ce3e6498 2481 if (paren) {
830247a4
IZ
2482 RExC_flags16 = oregflags;
2483 if (RExC_parse >= RExC_end || *nextchar(pRExC_state) != ')') {
2484 RExC_parse = oregcomp_parse;
380a0633 2485 vFAIL("Unmatched (");
ce3e6498 2486 }
a0ed51b3 2487 }
830247a4
IZ
2488 else if (!paren && RExC_parse < RExC_end) {
2489 if (*RExC_parse == ')') {
2490 RExC_parse++;
380a0633 2491 vFAIL("Unmatched )");
a0ed51b3
LW
2492 }
2493 else
b45f050a 2494 FAIL("Junk on end of regexp"); /* "Can't happen". */
a0d0e21e
LW
2495 /* NOTREACHED */
2496 }
a687059c 2497
a0d0e21e 2498 return(ret);
a687059c
LW
2499}
2500
2501/*
2502 - regbranch - one alternative of an | operator
2503 *
2504 * Implements the concatenation operator.
2505 */
76e3520e 2506STATIC regnode *
830247a4 2507S_regbranch(pTHX_ RExC_state_t *pRExC_state, I32 *flagp, I32 first)
a687059c 2508{
c277df42
IZ
2509 register regnode *ret;
2510 register regnode *chain = NULL;
2511 register regnode *latest;
2512 I32 flags = 0, c = 0;
a0d0e21e 2513
b81d288d 2514 if (first)
c277df42
IZ
2515 ret = NULL;
2516 else {
b81d288d 2517 if (!SIZE_ONLY && RExC_extralen)
830247a4 2518 ret = reganode(pRExC_state, BRANCHJ,0);
fac92740 2519 else {
830247a4 2520 ret = reg_node(pRExC_state, BRANCH);
fac92740
MJD
2521 Set_Node_Length(ret, 1);
2522 }
c277df42
IZ
2523 }
2524
b81d288d 2525 if (!first && SIZE_ONLY)
830247a4 2526 RExC_extralen += 1; /* BRANCHJ */
b81d288d 2527
c277df42 2528 *flagp = WORST; /* Tentatively. */
a0d0e21e 2529
830247a4
IZ
2530 RExC_parse--;
2531 nextchar(pRExC_state);
2532 while (RExC_parse < RExC_end && *RExC_parse != '|' && *RExC_parse != ')') {
a0d0e21e 2533 flags &= ~TRYAGAIN;
830247a4 2534 latest = regpiece(pRExC_state, &flags);
a0d0e21e
LW
2535 if (latest == NULL) {
2536 if (flags & TRYAGAIN)
2537 continue;
2538 return(NULL);
a0ed51b3
LW
2539 }
2540 else if (ret == NULL)
c277df42 2541 ret = latest;
a0d0e21e 2542 *flagp |= flags&HASWIDTH;
c277df42 2543 if (chain == NULL) /* First piece. */
a0d0e21e
LW
2544 *flagp |= flags&SPSTART;
2545 else {
830247a4
IZ
2546 RExC_naughty++;
2547 regtail(pRExC_state, chain, latest);
a687059c 2548 }
a0d0e21e 2549 chain = latest;
c277df42
IZ
2550 c++;
2551 }
2552 if (chain == NULL) { /* Loop ran zero times. */
830247a4 2553 chain = reg_node(pRExC_state, NOTHING);
c277df42
IZ
2554 if (ret == NULL)
2555 ret = chain;
2556 }
2557 if (c == 1) {
2558 *flagp |= flags&SIMPLE;
a0d0e21e 2559 }
a687059c 2560
a0d0e21e 2561 return(ret);
a687059c
LW
2562}
2563
2564/*
2565 - regpiece - something followed by possible [*+?]
2566 *
2567 * Note that the branching code sequences used for ? and the general cases
2568 * of * and + are somewhat optimized: they use the same NOTHING node as
2569 * both the endmarker for their branch list and the body of the last branch.
2570 * It might seem that this node could be dispensed with entirely, but the
2571 * endmarker role is not redundant.
2572 */
76e3520e 2573STATIC regnode *
830247a4 2574S_regpiece(pTHX_ RExC_state_t *pRExC_state, I32 *flagp)
a687059c 2575{
c277df42 2576 register regnode *ret;
a0d0e21e
LW
2577 register char op;
2578 register char *next;
2579 I32 flags;
830247a4 2580 char *origparse = RExC_parse;
a0d0e21e
LW
2581 char *maxpos;
2582 I32 min;
c277df42 2583 I32 max = REG_INFTY;
fac92740 2584 char *parse_start;
a0d0e21e 2585
830247a4 2586 ret = regatom(pRExC_state, &flags);
a0d0e21e
LW
2587 if (ret == NULL) {
2588 if (flags & TRYAGAIN)
2589 *flagp |= TRYAGAIN;
2590 return(NULL);
2591 }
2592
830247a4 2593 op = *RExC_parse;
a0d0e21e 2594
830247a4 2595 if (op == '{' && regcurly(RExC_parse)) {
fac92740 2596 parse_start = RExC_parse; /* MJD */
830247a4 2597 next = RExC_parse + 1;
a0d0e21e
LW
2598 maxpos = Nullch;
2599 while (isDIGIT(*next) || *next == ',') {
2600 if (*next == ',') {
2601 if (maxpos)
2602 break;
2603 else
2604 maxpos = next;
a687059c 2605 }
a0d0e21e
LW
2606 next++;
2607 }
2608 if (*next == '}') { /* got one */
2609 if (!maxpos)
2610 maxpos = next;
830247a4
IZ
2611 RExC_parse++;
2612 min = atoi(RExC_parse);
a0d0e21e
LW
2613 if (*maxpos == ',')
2614 maxpos++;
2615 else
830247a4 2616 maxpos = RExC_parse;
a0d0e21e
LW
2617 max = atoi(maxpos);
2618 if (!max && *maxpos != '0')
c277df42
IZ
2619 max = REG_INFTY; /* meaning "infinity" */
2620 else if (max >= REG_INFTY)
8615cb43 2621 vFAIL2("Quantifier in {,} bigger than %d", REG_INFTY - 1);
830247a4
IZ
2622 RExC_parse = next;
2623 nextchar(pRExC_state);
a0d0e21e
LW
2624
2625 do_curly:
2626 if ((flags&SIMPLE)) {
830247a4
IZ
2627 RExC_naughty += 2 + RExC_naughty / 2;
2628 reginsert(pRExC_state, CURLY, ret);
fac92740
MJD
2629 Set_Node_Offset(ret, parse_start+1); /* MJD */
2630 Set_Node_Cur_Length(ret);
a0d0e21e
LW
2631 }
2632 else {
830247a4 2633 regnode *w = reg_node(pRExC_state, WHILEM);
2c2d71f5
JH
2634
2635 w->flags = 0;
830247a4
IZ
2636 regtail(pRExC_state, ret, w);
2637 if (!SIZE_ONLY && RExC_extralen) {
2638 reginsert(pRExC_state, LONGJMP,ret);
2639 reginsert(pRExC_state, NOTHING,ret);
c277df42
IZ
2640 NEXT_OFF(ret) = 3; /* Go over LONGJMP. */
2641 }
830247a4 2642 reginsert(pRExC_state, CURLYX,ret);
fac92740
MJD
2643 /* MJD hk */
2644 Set_Node_Offset(ret, parse_start+1);
2645 Set_Node_Length(ret,
2646 op == '{' ? (RExC_parse - parse_start) : 1);
2647
830247a4 2648 if (!SIZE_ONLY && RExC_extralen)
c277df42 2649 NEXT_OFF(ret) = 3; /* Go over NOTHING to LONGJMP. */
830247a4 2650 regtail(pRExC_state, ret, reg_node(pRExC_state, NOTHING));
c277df42 2651 if (SIZE_ONLY)
830247a4
IZ
2652 RExC_whilem_seen++, RExC_extralen += 3;
2653 RExC_naughty += 4 + RExC_naughty; /* compound interest */
a0d0e21e 2654 }
c277df42 2655 ret->flags = 0;
a0d0e21e
LW
2656
2657 if (min > 0)
821b33a5
IZ
2658 *flagp = WORST;
2659 if (max > 0)
2660 *flagp |= HASWIDTH;
a0d0e21e 2661 if (max && max < min)
8615cb43 2662 vFAIL("Can't do {n,m} with n > m");
c277df42
IZ
2663 if (!SIZE_ONLY) {
2664 ARG1_SET(ret, min);
2665 ARG2_SET(ret, max);
a687059c 2666 }
a687059c 2667
a0d0e21e 2668 goto nest_check;
a687059c 2669 }
a0d0e21e 2670 }
a687059c 2671
a0d0e21e
LW
2672 if (!ISMULT1(op)) {
2673 *flagp = flags;
a687059c 2674 return(ret);
a0d0e21e 2675 }
bb20fd44 2676
c277df42 2677#if 0 /* Now runtime fix should be reliable. */
b45f050a
JF
2678
2679 /* if this is reinstated, don't forget to put this back into perldiag:
2680
2681 =item Regexp *+ operand could be empty at {#} in regex m/%s/
2682
2683 (F) The part of the regexp subject to either the * or + quantifier
2684 could match an empty string. The {#} shows in the regular
2685 expression about where the problem was discovered.
2686
2687 */
2688
bb20fd44 2689 if (!(flags&HASWIDTH) && op != '?')
b45f050a 2690 vFAIL("Regexp *+ operand could be empty");
b81d288d 2691#endif
bb20fd44 2692
fac92740 2693 parse_start = RExC_parse;
830247a4 2694 nextchar(pRExC_state);
a0d0e21e 2695
821b33a5 2696 *flagp = (op != '+') ? (WORST|SPSTART|HASWIDTH) : (WORST|HASWIDTH);
a0d0e21e
LW
2697
2698 if (op == '*' && (flags&SIMPLE)) {
830247a4 2699 reginsert(pRExC_state, STAR, ret);
c277df42 2700 ret->flags = 0;
830247a4 2701 RExC_naughty += 4;
a0d0e21e
LW
2702 }
2703 else if (op == '*') {
2704 min = 0;
2705 goto do_curly;
a0ed51b3
LW
2706 }
2707 else if (op == '+' && (flags&SIMPLE)) {
830247a4 2708 reginsert(pRExC_state, PLUS, ret);
c277df42 2709 ret->flags = 0;
830247a4 2710 RExC_naughty += 3;
a0d0e21e
LW
2711 }
2712 else if (op == '+') {
2713 min = 1;
2714 goto do_curly;
a0ed51b3
LW
2715 }
2716 else if (op == '?') {
a0d0e21e
LW
2717 min = 0; max = 1;
2718 goto do_curly;
2719 }
2720 nest_check:
e476b1b5 2721 if (ckWARN(WARN_REGEXP) && !SIZE_ONLY && !(flags&HASWIDTH) && max > REG_INFTY/3) {
830247a4 2722 vWARN3(RExC_parse,
b45f050a 2723 "%.*s matches null string many times",
830247a4 2724 RExC_parse - origparse,
b45f050a 2725 origparse);
a0d0e21e
LW
2726 }
2727
830247a4
IZ
2728 if (*RExC_parse == '?') {
2729 nextchar(pRExC_state);
2730 reginsert(pRExC_state, MINMOD, ret);
2731 regtail(pRExC_state, ret, ret + NODE_STEP_REGNODE);
a0d0e21e 2732 }
830247a4
IZ
2733 if (ISMULT2(RExC_parse)) {
2734 RExC_parse++;
b45f050a
JF
2735 vFAIL("Nested quantifiers");
2736 }
a0d0e21e
LW
2737
2738 return(ret);
a687059c
LW
2739}
2740
2741/*
2742 - regatom - the lowest level
2743 *
2744 * Optimization: gobbles an entire sequence of ordinary characters so that
2745 * it can turn them into a single node, which is smaller to store and
2746 * faster to run. Backslashed characters are exceptions, each becoming a
2747 * separate node; the code is simpler that way and it's not worth fixing.
2748 *
b45f050a 2749 * [Yes, it is worth fixing, some scripts can run twice the speed.] */
76e3520e 2750STATIC regnode *
830247a4 2751S_regatom(pTHX_ RExC_state_t *pRExC_state, I32 *flagp)
a687059c 2752{
c277df42 2753 register regnode *ret = 0;
a0d0e21e 2754 I32 flags;
f06dbbb7 2755 char *parse_start = 0;
a0d0e21e
LW
2756
2757 *flagp = WORST; /* Tentatively. */
2758
2759tryagain:
830247a4 2760 switch (*RExC_parse) {
a0d0e21e 2761 case '^':
830247a4
IZ
2762 RExC_seen_zerolen++;
2763 nextchar(pRExC_state);
2764 if (RExC_flags16 & PMf_MULTILINE)
2765 ret = reg_node(pRExC_state, MBOL);
2766 else if (RExC_flags16 & PMf_SINGLELINE)
2767 ret = reg_node(pRExC_state, SBOL);
a0d0e21e 2768 else
830247a4 2769 ret = reg_node(pRExC_state, BOL);
fac92740 2770 Set_Node_Length(ret, 1); /* MJD */
a0d0e21e
LW
2771 break;
2772 case '$':
830247a4 2773 nextchar(pRExC_state);
b81d288d 2774 if (*RExC_parse)
830247a4
IZ
2775 RExC_seen_zerolen++;
2776 if (RExC_flags16 & PMf_MULTILINE)
2777 ret = reg_node(pRExC_state, MEOL);
2778 else if (RExC_flags16 & PMf_SINGLELINE)
2779 ret = reg_node(pRExC_state, SEOL);
a0d0e21e 2780 else
830247a4 2781 ret = reg_node(pRExC_state, EOL);
fac92740 2782 Set_Node_Length(ret, 1); /* MJD */
a0d0e21e
LW
2783 break;
2784 case '.':
830247a4 2785 nextchar(pRExC_state);
cce850e4 2786 if (RExC_flags16 & PMf_SINGLELINE)
ffc61ed2
JH
2787 ret = reg_node(pRExC_state, SANY);
2788 else
2789 ret = reg_node(pRExC_state, REG_ANY);
2790 *flagp |= HASWIDTH|SIMPLE;
830247a4 2791 RExC_naughty++;
fac92740 2792 Set_Node_Length(ret, 1); /* MJD */
a0d0e21e
LW
2793 break;
2794 case '[':
b45f050a 2795 {
830247a4 2796 char *oregcomp_parse = ++RExC_parse;
ffc61ed2 2797 ret = regclass(pRExC_state);
830247a4
IZ
2798 if (*RExC_parse != ']') {
2799 RExC_parse = oregcomp_parse;
b45f050a
JF
2800 vFAIL("Unmatched [");
2801 }
830247a4 2802 nextchar(pRExC_state);
a0d0e21e 2803 *flagp |= HASWIDTH|SIMPLE;
fac92740 2804 Set_Node_Length(ret, RExC_parse - oregcomp_parse + 1); /* MJD */
a0d0e21e 2805 break;
b45f050a 2806 }
a0d0e21e 2807 case '(':
830247a4
IZ
2808 nextchar(pRExC_state);
2809 ret = reg(pRExC_state, 1, &flags);
a0d0e21e 2810 if (ret == NULL) {
bf93d4cc 2811 if (flags & TRYAGAIN) {
830247a4 2812 if (RExC_parse == RExC_end) {
bf93d4cc
GS
2813 /* Make parent create an empty node if needed. */
2814 *flagp |= TRYAGAIN;
2815 return(NULL);
2816 }
a0d0e21e 2817 goto tryagain;
bf93d4cc 2818 }
a0d0e21e
LW
2819 return(NULL);
2820 }
c277df42 2821 *flagp |= flags&(HASWIDTH|SPSTART|SIMPLE);
a0d0e21e
LW
2822 break;
2823 case '|':
2824 case ')':
2825 if (flags & TRYAGAIN) {
2826 *flagp |= TRYAGAIN;
2827 return NULL;
2828 }
b45f050a 2829 vFAIL("Internal urp");
a0d0e21e
LW
2830 /* Supposed to be caught earlier. */
2831 break;
85afd4ae 2832 case '{':
830247a4
IZ
2833 if (!regcurly(RExC_parse)) {
2834 RExC_parse++;
85afd4ae
CS
2835 goto defchar;
2836 }
2837 /* FALL THROUGH */
a0d0e21e
LW
2838 case '?':
2839 case '+':
2840 case '*':
830247a4 2841 RExC_parse++;
b45f050a 2842 vFAIL("Quantifier follows nothing");
a0d0e21e
LW
2843 break;
2844 case '\\':
830247a4 2845 switch (*++RExC_parse) {
a0d0e21e 2846 case 'A':
830247a4
IZ
2847 RExC_seen_zerolen++;
2848 ret = reg_node(pRExC_state, SBOL);
a0d0e21e 2849 *flagp |= SIMPLE;
830247a4 2850 nextchar(pRExC_state);
fac92740 2851 Set_Node_Length(ret, 2); /* MJD */
a0d0e21e
LW
2852 break;
2853 case 'G':
830247a4
IZ
2854 ret = reg_node(pRExC_state, GPOS);
2855 RExC_seen |= REG_SEEN_GPOS;
a0d0e21e 2856 *flagp |= SIMPLE;
830247a4 2857 nextchar(pRExC_state);
fac92740 2858 Set_Node_Length(ret, 2); /* MJD */
a0d0e21e
LW
2859 break;
2860 case 'Z':
830247a4 2861 ret = reg_node(pRExC_state, SEOL);
a0d0e21e 2862 *flagp |= SIMPLE;
a1917ab9 2863 RExC_seen_zerolen++; /* Do not optimize RE away */
830247a4 2864 nextchar(pRExC_state);
a0d0e21e 2865 break;
b85d18e9 2866 case 'z':
830247a4 2867 ret = reg_node(pRExC_state, EOS);
b85d18e9 2868 *flagp |= SIMPLE;
830247a4
IZ
2869 RExC_seen_zerolen++; /* Do not optimize RE away */
2870 nextchar(pRExC_state);
fac92740 2871 Set_Node_Length(ret, 2); /* MJD */
b85d18e9 2872 break;
4a2d328f 2873 case 'C':
f33976b4
DB
2874 ret = reg_node(pRExC_state, CANY);
2875 RExC_seen |= REG_SEEN_CANY;
a0ed51b3 2876 *flagp |= HASWIDTH|SIMPLE;
830247a4 2877 nextchar(pRExC_state);
fac92740 2878 Set_Node_Length(ret, 2); /* MJD */
a0ed51b3
LW
2879 break;
2880 case 'X':
830247a4 2881 ret = reg_node(pRExC_state, CLUMP);
a0ed51b3 2882 *flagp |= HASWIDTH;
830247a4 2883 nextchar(pRExC_state);
fac92740 2884 Set_Node_Length(ret, 2); /* MJD */
a0ed51b3 2885 break;
a0d0e21e 2886 case 'w':
ffc61ed2 2887 ret = reg_node(pRExC_state, LOC ? ALNUML : ALNUM);
a0d0e21e 2888 *flagp |= HASWIDTH|SIMPLE;
830247a4 2889 nextchar(pRExC_state);
fac92740 2890 Set_Node_Length(ret, 2); /* MJD */
a0d0e21e
LW
2891 break;
2892 case 'W':
ffc61ed2 2893 ret = reg_node(pRExC_state, LOC ? NALNUML : NALNUM);
a0d0e21e 2894 *flagp |= HASWIDTH|SIMPLE;
830247a4 2895 nextchar(pRExC_state);
fac92740 2896 Set_Node_Length(ret, 2); /* MJD */
a0d0e21e
LW
2897 break;
2898 case 'b':
830247a4
IZ
2899 RExC_seen_zerolen++;
2900 RExC_seen |= REG_SEEN_LOOKBEHIND;
ffc61ed2 2901 ret = reg_node(pRExC_state, LOC ? BOUNDL : BOUND);
a0d0e21e 2902 *flagp |= SIMPLE;
830247a4 2903 nextchar(pRExC_state);
fac92740 2904 Set_Node_Length(ret, 2); /* MJD */
a0d0e21e
LW
2905 break;
2906 case 'B':
830247a4
IZ
2907 RExC_seen_zerolen++;
2908 RExC_seen |= REG_SEEN_LOOKBEHIND;
ffc61ed2 2909 ret = reg_node(pRExC_state, LOC ? NBOUNDL : NBOUND);
a0d0e21e 2910 *flagp |= SIMPLE;
830247a4 2911 nextchar(pRExC_state);
fac92740 2912 Set_Node_Length(ret, 2); /* MJD */
a0d0e21e
LW
2913 break;
2914 case 's':
ffc61ed2 2915 ret = reg_node(pRExC_state, LOC ? SPACEL : SPACE);
a0d0e21e 2916 *flagp |= HASWIDTH|SIMPLE;
830247a4 2917 nextchar(pRExC_state);
fac92740 2918 Set_Node_Length(ret, 2); /* MJD */
a0d0e21e
LW
2919 break;
2920 case 'S':
ffc61ed2 2921 ret = reg_node(pRExC_state, LOC ? NSPACEL : NSPACE);
a0d0e21e 2922 *flagp |= HASWIDTH|SIMPLE;
830247a4 2923 nextchar(pRExC_state);
fac92740 2924 Set_Node_Length(ret, 2); /* MJD */
a0d0e21e
LW
2925 break;
2926 case 'd':
ffc61ed2 2927 ret = reg_node(pRExC_state, DIGIT);
a0d0e21e 2928 *flagp |= HASWIDTH|SIMPLE;
830247a4 2929 nextchar(pRExC_state);
fac92740 2930 Set_Node_Length(ret, 2); /* MJD */
a0d0e21e
LW
2931 break;
2932 case 'D':
ffc61ed2 2933 ret = reg_node(pRExC_state, NDIGIT);
a0d0e21e 2934 *flagp |= HASWIDTH|SIMPLE;
830247a4 2935 nextchar(pRExC_state);
fac92740 2936 Set_Node_Length(ret, 2); /* MJD */
a0d0e21e 2937 break;
a14b48bc
LW
2938 case 'p':
2939 case 'P':
3568d838 2940 {
830247a4 2941 char* oldregxend = RExC_end;
fac92740 2942 char* parse_start = RExC_parse;
a14b48bc 2943
830247a4 2944 if (RExC_parse[1] == '{') {
3568d838 2945 /* a lovely hack--pretend we saw [\pX] instead */
830247a4
IZ
2946 RExC_end = strchr(RExC_parse, '}');
2947 if (!RExC_end) {
0da60cf5 2948 U8 c = (U8)*RExC_parse;
830247a4
IZ
2949 RExC_parse += 2;
2950 RExC_end = oldregxend;
0da60cf5 2951 vFAIL2("Missing right brace on \\%c{}", c);
b45f050a 2952 }
830247a4 2953 RExC_end++;
a14b48bc
LW
2954 }
2955 else
830247a4
IZ
2956 RExC_end = RExC_parse + 2;
2957 RExC_parse--;
a14b48bc 2958
ffc61ed2 2959 ret = regclass(pRExC_state);
a14b48bc 2960
830247a4
IZ
2961 RExC_end = oldregxend;
2962 RExC_parse--;
fac92740 2963 Set_Node_Cur_Length(ret); /* MJD */
830247a4 2964 nextchar(pRExC_state);
a14b48bc
LW
2965 *flagp |= HASWIDTH|SIMPLE;
2966 }
2967 break;
a0d0e21e
LW
2968 case 'n':
2969 case 'r':
2970 case 't':
2971 case 'f':
2972 case 'e':
2973 case 'a':
2974 case 'x':
2975 case 'c':
2976 case '0':
2977 goto defchar;
2978 case '1': case '2': case '3': case '4':
2979 case '5': case '6': case '7': case '8': case '9':
2980 {
830247a4 2981 I32 num = atoi(RExC_parse);
a0d0e21e 2982
830247a4 2983 if (num > 9 && num >= RExC_npar)
a0d0e21e
LW
2984 goto defchar;
2985 else {
fac92740 2986 char * parse_start = RExC_parse - 1; /* MJD */
830247a4
IZ
2987 while (isDIGIT(*RExC_parse))
2988 RExC_parse++;
b45f050a 2989
830247a4 2990 if (!SIZE_ONLY && num > RExC_rx->nparens)
9baa0206 2991 vFAIL("Reference to nonexistent group");
830247a4
IZ
2992 RExC_sawback = 1;
2993 ret = reganode(pRExC_state, FOLD
a0ed51b3 2994 ? (LOC ? REFFL : REFF)
c8756f30 2995 : REF, num);
a0d0e21e 2996 *flagp |= HASWIDTH;
fac92740
MJD
2997
2998 /* override incorrect value set in reganode MJD */
2999 Set_Node_Offset(ret, parse_start+1);
3000 Set_Node_Cur_Length(ret); /* MJD */
830247a4
IZ
3001 RExC_parse--;
3002 nextchar(pRExC_state);
a0d0e21e
LW
3003 }
3004 }
3005 break;
3006 case '\0':
830247a4 3007 if (RExC_parse >= RExC_end)
b45f050a 3008 FAIL("Trailing \\");
a0d0e21e
LW
3009 /* FALL THROUGH */
3010 default:
c9f97d15
IZ
3011 /* Do not generate `unrecognized' warnings here, we fall
3012 back into the quick-grab loop below */
a0d0e21e
LW
3013 goto defchar;
3014 }
3015 break;
4633a7c4
LW
3016
3017 case '#':
830247a4
IZ
3018 if (RExC_flags16 & PMf_EXTENDED) {
3019 while (RExC_parse < RExC_end && *RExC_parse != '\n') RExC_parse++;
3020 if (RExC_parse < RExC_end)
4633a7c4
LW
3021 goto tryagain;
3022 }
3023 /* FALL THROUGH */
3024
a0d0e21e 3025 default: {
ba210ebe 3026 register STRLEN len;
a0ed51b3 3027 register UV ender;
a0d0e21e 3028 register char *p;
c277df42 3029 char *oldp, *s;
ba210ebe 3030 STRLEN numlen;
80aecb99 3031 STRLEN foldlen;
60a8b682 3032 U8 tmpbuf[UTF8_MAXLEN_FOLD+1], *foldbuf;
f06dbbb7
JH
3033
3034 parse_start = RExC_parse - 1;
a0d0e21e 3035
830247a4 3036 RExC_parse++;
a0d0e21e
LW
3037
3038 defchar:
830247a4 3039 ret = reg_node(pRExC_state, FOLD
a0ed51b3 3040 ? (LOC ? EXACTFL : EXACTF)
bbce6d69 3041 : EXACT);
cd439c50 3042 s = STRING(ret);
830247a4
IZ
3043 for (len = 0, p = RExC_parse - 1;
3044 len < 127 && p < RExC_end;
a0d0e21e
LW
3045 len++)
3046 {
3047 oldp = p;
5b5a24f7 3048
830247a4
IZ
3049 if (RExC_flags16 & PMf_EXTENDED)
3050 p = regwhite(p, RExC_end);
a0d0e21e
LW
3051 switch (*p) {
3052 case '^':
3053 case '$':
3054 case '.':
3055 case '[':
3056 case '(':
3057 case ')':
3058 case '|':
3059 goto loopdone;
3060 case '\\':
3061 switch (*++p) {
3062 case 'A':
1ed8eac0
JF
3063 case 'C':
3064 case 'X':
a0d0e21e
LW
3065 case 'G':
3066 case 'Z':
b85d18e9 3067 case 'z':
a0d0e21e
LW
3068 case 'w':
3069 case 'W':
3070 case 'b':
3071 case 'B':
3072 case 's':
3073 case 'S':
3074 case 'd':
3075 case 'D':
a14b48bc
LW
3076 case 'p':
3077 case 'P':
a0d0e21e
LW
3078 --p;
3079 goto loopdone;
3080 case 'n':
3081 ender = '\n';
3082 p++;
a687059c 3083 break;
a0d0e21e
LW
3084 case 'r':
3085 ender = '\r';
3086 p++;
a687059c 3087 break;
a0d0e21e
LW
3088 case 't':
3089 ender = '\t';
3090 p++;
a687059c 3091 break;
a0d0e21e
LW
3092 case 'f':
3093 ender = '\f';
3094 p++;
a687059c 3095 break;
a0d0e21e 3096 case 'e':
c7f1f016 3097 ender = ASCII_TO_NATIVE('\033');
a0d0e21e 3098 p++;
a687059c 3099 break;
a0d0e21e 3100 case 'a':
c7f1f016 3101 ender = ASCII_TO_NATIVE('\007');
a0d0e21e 3102 p++;
a687059c 3103 break;
a0d0e21e 3104 case 'x':
a0ed51b3
LW
3105 if (*++p == '{') {
3106 char* e = strchr(p, '}');
b81d288d 3107
b45f050a 3108 if (!e) {
830247a4 3109 RExC_parse = p + 1;
b45f050a
JF
3110 vFAIL("Missing right brace on \\x{}");
3111 }
de5f0749 3112 else {
a4c04bdc
NC
3113 I32 flags = PERL_SCAN_ALLOW_UNDERSCORES
3114 | PERL_SCAN_DISALLOW_PREFIX;
53305cf1
NC
3115 numlen = e - p - 1;
3116 ender = grok_hex(p + 1, &numlen, &flags, NULL);
aaa80028
JH
3117 if (ender > 0xff)
3118 RExC_utf8 = 1;
b21ed0a9
GS
3119 /* numlen is generous */
3120 if (numlen + len >= 127) {
a0ed51b3
LW
3121 p--;
3122 goto loopdone;
3123 }
3124 p = e + 1;
3125 }
a0ed51b3
LW
3126 }
3127 else {
a4c04bdc 3128 I32 flags = PERL_SCAN_DISALLOW_PREFIX;
53305cf1
NC
3129 numlen = 2;
3130 ender = grok_hex(p, &numlen, &flags, NULL);
a0ed51b3
LW
3131 p += numlen;
3132 }
a687059c 3133 break;
a0d0e21e
LW
3134 case 'c':
3135 p++;
bbce6d69 3136 ender = UCHARAT(p++);
3137 ender = toCTRL(ender);
a687059c 3138 break;
a0d0e21e
LW
3139 case '0': case '1': case '2': case '3':case '4':
3140 case '5': case '6': case '7': case '8':case '9':
3141 if (*p == '0' ||
830247a4 3142 (isDIGIT(p[1]) && atoi(p) >= RExC_npar) ) {
53305cf1
NC
3143 I32 flags = 0;
3144 numlen = 3;
3145 ender = grok_oct(p, &numlen, &flags, NULL);
a0d0e21e
LW
3146 p += numlen;
3147 }
3148 else {
3149 --p;
3150 goto loopdone;
a687059c
LW
3151 }
3152 break;
a0d0e21e 3153 case '\0':
830247a4 3154 if (p >= RExC_end)
b45f050a 3155 FAIL("Trailing \\");
a687059c 3156 /* FALL THROUGH */
a0d0e21e 3157 default:
e476b1b5 3158 if (!SIZE_ONLY && ckWARN(WARN_REGEXP) && isALPHA(*p))
4193bef7 3159 vWARN2(p + 1, "Unrecognized escape \\%c passed through", UCHARAT(p));
a0ed51b3 3160 goto normal_default;
a0d0e21e
LW
3161 }
3162 break;
a687059c 3163 default:
a0ed51b3 3164 normal_default:
fd400ab9 3165 if (UTF8_IS_START(*p) && UTF) {
5e12f4fb 3166 ender = utf8n_to_uvchr((U8*)p, RExC_end - p,
ba210ebe 3167 &numlen, 0);
a0ed51b3
LW
3168 p += numlen;
3169 }
3170 else
3171 ender = *p++;
a0d0e21e 3172 break;
a687059c 3173 }
830247a4
IZ
3174 if (RExC_flags16 & PMf_EXTENDED)
3175 p = regwhite(p, RExC_end);
60a8b682
JH
3176 if (UTF && FOLD) {
3177 /* Prime the casefolded buffer. */
ac7e0132 3178 ender = toFOLD_uni(ender, tmpbuf, &foldlen);
60a8b682 3179 }
a0d0e21e
LW
3180 if (ISMULT2(p)) { /* Back off on ?+*. */
3181 if (len)
3182 p = oldp;
16ea2a2e 3183 else if (UTF) {
0ebc6274
JH
3184 STRLEN unilen;
3185
80aecb99 3186 if (FOLD) {
60a8b682 3187 /* Emit all the Unicode characters. */
80aecb99
JH
3188 for (foldbuf = tmpbuf;
3189 foldlen;
3190 foldlen -= numlen) {
3191 ender = utf8_to_uvchr(foldbuf, &numlen);
9dc45d57 3192 if (numlen > 0) {
0ebc6274
JH
3193 reguni(pRExC_state, ender, s, &unilen);
3194 s += unilen;
3195 len += unilen;
3196 /* In EBCDIC the numlen
3197 * and unilen can differ. */
9dc45d57 3198 foldbuf += numlen;
47654450
JH
3199 if (numlen >= foldlen)
3200 break;
9dc45d57
JH
3201 }
3202 else
3203 break; /* "Can't happen." */
80aecb99
JH
3204 }
3205 }
3206 else {
0ebc6274 3207 reguni(pRExC_state, ender, s, &unilen);
9ede7db1 3208 if (unilen > 0) {
0ebc6274
JH
3209 s += unilen;
3210 len += unilen;
9dc45d57 3211 }
80aecb99 3212 }
a0ed51b3 3213 }
a0d0e21e
LW
3214 else {
3215 len++;
cd439c50 3216 REGC(ender, s++);
a0d0e21e
LW
3217 }
3218 break;
a687059c 3219 }
16ea2a2e 3220 if (UTF) {
0ebc6274
JH
3221 STRLEN unilen;
3222
80aecb99 3223 if (FOLD) {
60a8b682 3224 /* Emit all the Unicode characters. */
80aecb99
JH
3225 for (foldbuf = tmpbuf;
3226 foldlen;
3227 foldlen -= numlen) {
3228 ender = utf8_to_uvchr(foldbuf, &numlen);
9dc45d57 3229 if (numlen > 0) {
0ebc6274
JH
3230 reguni(pRExC_state, ender, s, &unilen);
3231 len += unilen;
3232 s += unilen;
3233 /* In EBCDIC the numlen
3234 * and unilen can differ. */
9dc45d57 3235 foldbuf += numlen;
47654450
JH
3236 if (numlen >= foldlen)
3237 break;
9dc45d57
JH
3238 }
3239 else
3240 break;
80aecb99
JH
3241 }
3242 }
3243 else {
0ebc6274 3244 reguni(pRExC_state, ender, s, &unilen);
9ede7db1 3245 if (unilen > 0) {
0ebc6274
JH
3246 s += unilen;
3247 len += unilen;
9dc45d57 3248 }
80aecb99
JH
3249 }
3250 len--;
a0ed51b3
LW
3251 }
3252 else
cd439c50 3253 REGC(ender, s++);
a0d0e21e
LW
3254 }
3255 loopdone:
830247a4 3256 RExC_parse = p - 1;
fac92740 3257 Set_Node_Cur_Length(ret); /* MJD */
830247a4 3258 nextchar(pRExC_state);
793db0cb
JH
3259 {
3260 /* len is STRLEN which is unsigned, need to copy to signed */
3261 IV iv = len;
3262 if (iv < 0)
3263 vFAIL("Internal disaster");
3264 }
a0d0e21e
LW
3265 if (len > 0)
3266 *flagp |= HASWIDTH;
3267 if (len == 1)
3268 *flagp |= SIMPLE;
c277df42 3269 if (!SIZE_ONLY)
cd439c50
IZ
3270 STR_LEN(ret) = len;
3271 if (SIZE_ONLY)
830247a4 3272 RExC_size += STR_SZ(len);
cd439c50 3273 else
830247a4 3274 RExC_emit += STR_SZ(len);
a687059c 3275 }
a0d0e21e
LW
3276 break;
3277 }
a687059c 3278
60a8b682
JH
3279 /* If the encoding pragma is in effect recode the text of
3280 * any EXACT-kind nodes. */
22c54be3 3281 if (PL_encoding && PL_regkind[(U8)OP(ret)] == EXACT) {
a72c7584
JH
3282 STRLEN oldlen = STR_LEN(ret);
3283 SV *sv = sv_2mortal(newSVpvn(STRING(ret), oldlen));
22c54be3
JH
3284
3285 if (RExC_utf8)
3286 SvUTF8_on(sv);
3287 if (sv_utf8_downgrade(sv, TRUE)) {
799ef3cb 3288 char *s = sv_recode_to_utf8(sv, PL_encoding);
22c54be3
JH
3289 STRLEN newlen = SvCUR(sv);
3290
3291 if (!SIZE_ONLY) {
3292 DEBUG_r(PerlIO_printf(Perl_debug_log, "recode %*s to %*s\n",
3293 (int)oldlen, STRING(ret),
3294 (int)newlen, s));
3295 Copy(s, STRING(ret), newlen, char);
3296 STR_LEN(ret) += newlen - oldlen;
3297 RExC_emit += STR_SZ(newlen) - STR_SZ(oldlen);
3298 } else
3299 RExC_size += STR_SZ(newlen) - STR_SZ(oldlen);
3300 }
a72c7584
JH
3301 }
3302
a0d0e21e 3303 return(ret);
a687059c
LW
3304}
3305
873ef191 3306STATIC char *
cea2e8a9 3307S_regwhite(pTHX_ char *p, char *e)
5b5a24f7
CS
3308{
3309 while (p < e) {
3310 if (isSPACE(*p))
3311 ++p;
3312 else if (*p == '#') {
3313 do {
3314 p++;
3315 } while (p < e && *p != '\n');
3316 }
3317 else
3318 break;
3319 }
3320 return p;
3321}
3322
b8c5462f
JH
3323/* Parse POSIX character classes: [[:foo:]], [[=foo=]], [[.foo.]].
3324 Character classes ([:foo:]) can also be negated ([:^foo:]).
3325 Returns a named class id (ANYOF_XXX) if successful, -1 otherwise.
3326 Equivalence classes ([=foo=]) and composites ([.foo.]) are parsed,
beeb77fc 3327 but trigger failures because they are currently unimplemented. */
9a86a77b
JH
3328
3329#define POSIXCC_DONE(c) ((c) == ':')
3330#define POSIXCC_NOTYET(c) ((c) == '=' || (c) == '.')
3331#define POSIXCC(c) (POSIXCC_DONE(c) || POSIXCC_NOTYET(c))
3332
b8c5462f 3333STATIC I32
830247a4 3334S_regpposixcc(pTHX_ RExC_state_t *pRExC_state, I32 value)
620e46c5
JH
3335{
3336 char *posixcc = 0;
936ed897 3337 I32 namedclass = OOB_NAMEDCLASS;
620e46c5 3338
830247a4 3339 if (value == '[' && RExC_parse + 1 < RExC_end &&
620e46c5 3340 /* I smell either [: or [= or [. -- POSIX has been here, right? */
9a86a77b
JH
3341 POSIXCC(UCHARAT(RExC_parse))) {
3342 char c = UCHARAT(RExC_parse);
830247a4 3343 char* s = RExC_parse++;
b81d288d 3344
9a86a77b 3345 while (RExC_parse < RExC_end && UCHARAT(RExC_parse) != c)
830247a4
IZ
3346 RExC_parse++;
3347 if (RExC_parse == RExC_end)
620e46c5 3348 /* Grandfather lone [:, [=, [. */
830247a4 3349 RExC_parse = s;
620e46c5 3350 else {
830247a4 3351 char* t = RExC_parse++; /* skip over the c */
b8c5462f 3352
9a86a77b 3353 if (UCHARAT(RExC_parse) == ']') {
830247a4 3354 RExC_parse++; /* skip over the ending ] */
b8c5462f
JH
3355 posixcc = s + 1;
3356 if (*s == ':') {
3357 I32 complement = *posixcc == '^' ? *posixcc++ : 0;
3358 I32 skip = 5; /* the most common skip */
3359
3360 switch (*posixcc) {
3361 case 'a':
3362 if (strnEQ(posixcc, "alnum", 5))
3363 namedclass =
3364 complement ? ANYOF_NALNUMC : ANYOF_ALNUMC;
3365 else if (strnEQ(posixcc, "alpha", 5))
3366 namedclass =
3367 complement ? ANYOF_NALPHA : ANYOF_ALPHA;
3368 else if (strnEQ(posixcc, "ascii", 5))
3369 namedclass =
3370 complement ? ANYOF_NASCII : ANYOF_ASCII;
3371 break;
aaa51d5e
JF
3372 case 'b':
3373 if (strnEQ(posixcc, "blank", 5))
3374 namedclass =
3375 complement ? ANYOF_NBLANK : ANYOF_BLANK;
3376 break;
b8c5462f
JH
3377 case 'c':
3378 if (strnEQ(posixcc, "cntrl", 5))
3379 namedclass =
3380 complement ? ANYOF_NCNTRL : ANYOF_CNTRL;
3381 break;
3382 case 'd':
3383 if (strnEQ(posixcc, "digit", 5))
3384 namedclass =
3385 complement ? ANYOF_NDIGIT : ANYOF_DIGIT;
3386 break;
3387 case 'g':
3388 if (strnEQ(posixcc, "graph", 5))
3389 namedclass =
3390 complement ? ANYOF_NGRAPH : ANYOF_GRAPH;
3391 break;
3392 case 'l':
3393 if (strnEQ(posixcc, "lower", 5))
3394 namedclass =
3395 complement ? ANYOF_NLOWER : ANYOF_LOWER;
3396 break;
3397 case 'p':
3398 if (strnEQ(posixcc, "print", 5))
3399 namedclass =
3400 complement ? ANYOF_NPRINT : ANYOF_PRINT;
3401 else if (strnEQ(posixcc, "punct", 5))
3402 namedclass =
3403 complement ? ANYOF_NPUNCT : ANYOF_PUNCT;
3404 break;
3405 case 's':
3406 if (strnEQ(posixcc, "space", 5))
3407 namedclass =
aaa51d5e 3408 complement ? ANYOF_NPSXSPC : ANYOF_PSXSPC;
cc4319de 3409 break;
b8c5462f
JH
3410 case 'u':
3411 if (strnEQ(posixcc, "upper", 5))
3412 namedclass =
3413 complement ? ANYOF_NUPPER : ANYOF_UPPER;
3414 break;
3415 case 'w': /* this is not POSIX, this is the Perl \w */
3416 if (strnEQ(posixcc, "word", 4)) {
3417 namedclass =
3418 complement ? ANYOF_NALNUM : ANYOF_ALNUM;
3419 skip = 4;
3420 }
3421 break;
3422 case 'x':
3423 if (strnEQ(posixcc, "xdigit", 6)) {
3424 namedclass =
3425 complement ? ANYOF_NXDIGIT : ANYOF_XDIGIT;
3426 skip = 6;
3427 }
3428 break;
3429 }
ac561586
JH
3430 if (namedclass == OOB_NAMEDCLASS ||
3431 posixcc[skip] != ':' ||
3432 posixcc[skip+1] != ']')
b45f050a
JF
3433 {
3434 Simple_vFAIL3("POSIX class [:%.*s:] unknown",
3435 t - s - 1, s + 1);
3436 }
3437 } else if (!SIZE_ONLY) {
b8c5462f 3438 /* [[=foo=]] and [[.foo.]] are still future. */
b45f050a 3439
830247a4 3440 /* adjust RExC_parse so the warning shows after
b45f050a 3441 the class closes */
9a86a77b 3442 while (UCHARAT(RExC_parse) && UCHARAT(RExC_parse) != ']')
830247a4 3443 RExC_parse++;
b45f050a
JF
3444 Simple_vFAIL3("POSIX syntax [%c %c] is reserved for future extensions", c, c);
3445 }
b8c5462f
JH
3446 } else {
3447 /* Maternal grandfather:
3448 * "[:" ending in ":" but not in ":]" */
830247a4 3449 RExC_parse = s;
767d463e 3450 }
620e46c5
JH
3451 }
3452 }
3453
b8c5462f
JH
3454 return namedclass;
3455}
3456
3457STATIC void
830247a4 3458S_checkposixcc(pTHX_ RExC_state_t *pRExC_state)
b8c5462f 3459{
b938889d 3460 if (!SIZE_ONLY && POSIXCC(UCHARAT(RExC_parse))) {
830247a4 3461 char *s = RExC_parse;
93733859 3462 char c = *s++;
b8c5462f
JH
3463
3464 while(*s && isALNUM(*s))
3465 s++;
3466 if (*s && c == *s && s[1] == ']') {
b45f050a
JF
3467 vWARN3(s+2, "POSIX syntax [%c %c] belongs inside character classes", c, c);
3468
3469 /* [[=foo=]] and [[.foo.]] are still future. */
9a86a77b 3470 if (POSIXCC_NOTYET(c)) {
830247a4 3471 /* adjust RExC_parse so the error shows after
b45f050a 3472 the class closes */
9a86a77b 3473 while (UCHARAT(RExC_parse) && UCHARAT(RExC_parse++) != ']')
b45f050a
JF
3474 ;
3475 Simple_vFAIL3("POSIX syntax [%c %c] is reserved for future extensions", c, c);
3476 }
b8c5462f
JH
3477 }
3478 }
620e46c5
JH
3479}
3480
76e3520e 3481STATIC regnode *
830247a4 3482S_regclass(pTHX_ RExC_state_t *pRExC_state)
a687059c 3483{
ffc61ed2 3484 register UV value;
9a86a77b 3485 register UV nextvalue;
3568d838 3486 register IV prevvalue = OOB_UNICODE;
ffc61ed2 3487 register IV range = 0;
c277df42 3488 register regnode *ret;
ba210ebe 3489 STRLEN numlen;
ffc61ed2 3490 IV namedclass;
9c5ffd7c 3491 char *rangebegin = 0;
936ed897 3492 bool need_class = 0;
9c5ffd7c 3493 SV *listsv = Nullsv;
ffc61ed2
JH
3494 register char *e;
3495 UV n;
9e55ce06
JH
3496 bool optimize_invert = TRUE;
3497 AV* unicode_alternate = 0;
ffc61ed2
JH
3498
3499 ret = reganode(pRExC_state, ANYOF, 0);
3500
3501 if (!SIZE_ONLY)
3502 ANYOF_FLAGS(ret) = 0;
3503
9a86a77b 3504 if (UCHARAT(RExC_parse) == '^') { /* Complement of range. */
ffc61ed2
JH
3505 RExC_naughty++;
3506 RExC_parse++;
3507 if (!SIZE_ONLY)
3508 ANYOF_FLAGS(ret) |= ANYOF_INVERT;
3509 }
a0d0e21e 3510
936ed897 3511 if (SIZE_ONLY)
830247a4 3512 RExC_size += ANYOF_SKIP;
936ed897 3513 else {
830247a4 3514 RExC_emit += ANYOF_SKIP;
936ed897
IZ
3515 if (FOLD)
3516 ANYOF_FLAGS(ret) |= ANYOF_FOLD;
3517 if (LOC)
3518 ANYOF_FLAGS(ret) |= ANYOF_LOCALE;
ffc61ed2
JH
3519 ANYOF_BITMAP_ZERO(ret);
3520 listsv = newSVpvn("# comment\n", 10);
a0d0e21e 3521 }
b8c5462f 3522
9a86a77b
JH
3523 nextvalue = RExC_parse < RExC_end ? UCHARAT(RExC_parse) : 0;
3524
b938889d 3525 if (!SIZE_ONLY && POSIXCC(nextvalue))
830247a4 3526 checkposixcc(pRExC_state);
b8c5462f 3527
f064b6ad
HS
3528 /* allow 1st char to be ] (allowing it to be - is dealt with later) */
3529 if (UCHARAT(RExC_parse) == ']')
3530 goto charclassloop;
ffc61ed2 3531
9a86a77b 3532 while (RExC_parse < RExC_end && UCHARAT(RExC_parse) != ']') {
ffc61ed2
JH
3533
3534 charclassloop:
3535
3536 namedclass = OOB_NAMEDCLASS; /* initialize as illegal */
3537
73b437c8 3538 if (!range)
830247a4 3539 rangebegin = RExC_parse;
ffc61ed2 3540 if (UTF) {
5e12f4fb 3541 value = utf8n_to_uvchr((U8*)RExC_parse,
3568d838
JH
3542 RExC_end - RExC_parse,
3543 &numlen, 0);
ffc61ed2
JH
3544 RExC_parse += numlen;
3545 }
3546 else
3547 value = UCHARAT(RExC_parse++);
9a86a77b
JH
3548 nextvalue = RExC_parse < RExC_end ? UCHARAT(RExC_parse) : 0;
3549 if (value == '[' && POSIXCC(nextvalue))
830247a4 3550 namedclass = regpposixcc(pRExC_state, value);
620e46c5 3551 else if (value == '\\') {
ffc61ed2 3552 if (UTF) {
5e12f4fb 3553 value = utf8n_to_uvchr((U8*)RExC_parse,
ffc61ed2
JH
3554 RExC_end - RExC_parse,
3555 &numlen, 0);
3556 RExC_parse += numlen;
3557 }
3558 else
3559 value = UCHARAT(RExC_parse++);
470c3474 3560 /* Some compilers cannot handle switching on 64-bit integer
ffc61ed2 3561 * values, therefore value cannot be an UV. Yes, this will
e2962f66
JH
3562 * be a problem later if we want switch on Unicode.
3563 * A similar issue a little bit later when switching on
3564 * namedclass. --jhi */
ffc61ed2 3565 switch ((I32)value) {
b8c5462f
JH
3566 case 'w': namedclass = ANYOF_ALNUM; break;
3567 case 'W': namedclass = ANYOF_NALNUM; break;
3568 case 's': namedclass = ANYOF_SPACE; break;
3569 case 'S': namedclass = ANYOF_NSPACE; break;
3570 case 'd': namedclass = ANYOF_DIGIT; break;
3571 case 'D': namedclass = ANYOF_NDIGIT; break;
ffc61ed2
JH
3572 case 'p':
3573 case 'P':
3574 if (*RExC_parse == '{') {
0da60cf5 3575 U8 c = (U8)value;
ffc61ed2
JH
3576 e = strchr(RExC_parse++, '}');
3577 if (!e)
0da60cf5 3578 vFAIL2("Missing right brace on \\%c{}", c);
ab13f0c7
JH
3579 while (isSPACE(UCHARAT(RExC_parse)))
3580 RExC_parse++;
3581 if (e == RExC_parse)
0da60cf5 3582 vFAIL2("Empty \\%c{}", c);
ffc61ed2 3583 n = e - RExC_parse;
ab13f0c7
JH
3584 while (isSPACE(UCHARAT(RExC_parse + n - 1)))
3585 n--;
ffc61ed2
JH
3586 }
3587 else {
3588 e = RExC_parse;
3589 n = 1;
3590 }
3591 if (!SIZE_ONLY) {
ab13f0c7
JH
3592 if (UCHARAT(RExC_parse) == '^') {
3593 RExC_parse++;
3594 n--;
3595 value = value == 'p' ? 'P' : 'p'; /* toggle */
3596 while (isSPACE(UCHARAT(RExC_parse))) {
3597 RExC_parse++;
3598 n--;
3599 }
3600 }
ffc61ed2 3601 if (value == 'p')
ab13f0c7
JH
3602 Perl_sv_catpvf(aTHX_ listsv,
3603 "+utf8::%.*s\n", (int)n, RExC_parse);
ffc61ed2 3604 else
ab13f0c7
JH
3605 Perl_sv_catpvf(aTHX_ listsv,
3606 "!utf8::%.*s\n", (int)n, RExC_parse);
ffc61ed2
JH
3607 }
3608 RExC_parse = e + 1;
3609 ANYOF_FLAGS(ret) |= ANYOF_UNICODE;
3610 continue;
b8c5462f
JH
3611 case 'n': value = '\n'; break;
3612 case 'r': value = '\r'; break;
3613 case 't': value = '\t'; break;
3614 case 'f': value = '\f'; break;
3615 case 'b': value = '\b'; break;
c7f1f016
NIS
3616 case 'e': value = ASCII_TO_NATIVE('\033');break;
3617 case 'a': value = ASCII_TO_NATIVE('\007');break;
b8c5462f 3618 case 'x':
ffc61ed2 3619 if (*RExC_parse == '{') {
a4c04bdc
NC
3620 I32 flags = PERL_SCAN_ALLOW_UNDERSCORES
3621 | PERL_SCAN_DISALLOW_PREFIX;
ffc61ed2 3622 e = strchr(RExC_parse++, '}');
b81d288d 3623 if (!e)
ffc61ed2 3624 vFAIL("Missing right brace on \\x{}");
53305cf1
NC
3625
3626 numlen = e - RExC_parse;
3627 value = grok_hex(RExC_parse, &numlen, &flags, NULL);
ffc61ed2
JH
3628 RExC_parse = e + 1;
3629 }
3630 else {
a4c04bdc 3631 I32 flags = PERL_SCAN_DISALLOW_PREFIX;
53305cf1
NC
3632 numlen = 2;
3633 value = grok_hex(RExC_parse, &numlen, &flags, NULL);
ffc61ed2
JH
3634 RExC_parse += numlen;
3635 }
b8c5462f
JH
3636 break;
3637 case 'c':
830247a4 3638 value = UCHARAT(RExC_parse++);
b8c5462f
JH
3639 value = toCTRL(value);
3640 break;
3641 case '0': case '1': case '2': case '3': case '4':
3642 case '5': case '6': case '7': case '8': case '9':
53305cf1
NC
3643 {
3644 I32 flags = 0;
3645 numlen = 3;
3646 value = grok_oct(--RExC_parse, &numlen, &flags, NULL);
830247a4 3647 RExC_parse += numlen;
b8c5462f 3648 break;
53305cf1 3649 }
1028017a 3650 default:
e476b1b5 3651 if (!SIZE_ONLY && ckWARN(WARN_REGEXP) && isALPHA(value))
ffc61ed2
JH
3652 vWARN2(RExC_parse,
3653 "Unrecognized escape \\%c in character class passed through",
3654 (int)value);
1028017a 3655 break;
b8c5462f 3656 }
ffc61ed2
JH
3657 } /* end of \blah */
3658
3659 if (namedclass > OOB_NAMEDCLASS) { /* this is a named class \blah */
3660
3661 if (!SIZE_ONLY && !need_class)
936ed897 3662 ANYOF_CLASS_ZERO(ret);
ffc61ed2 3663
936ed897 3664 need_class = 1;
ffc61ed2
JH
3665
3666 /* a bad range like a-\d, a-[:digit:] ? */
3667 if (range) {
73b437c8 3668 if (!SIZE_ONLY) {
e476b1b5 3669 if (ckWARN(WARN_REGEXP))
830247a4 3670 vWARN4(RExC_parse,
b45f050a 3671 "False [] range \"%*.*s\"",
830247a4
IZ
3672 RExC_parse - rangebegin,
3673 RExC_parse - rangebegin,
b45f050a 3674 rangebegin);
3568d838
JH
3675 if (prevvalue < 256) {
3676 ANYOF_BITMAP_SET(ret, prevvalue);
ffc61ed2
JH
3677 ANYOF_BITMAP_SET(ret, '-');
3678 }
3679 else {
3680 ANYOF_FLAGS(ret) |= ANYOF_UNICODE;
3681 Perl_sv_catpvf(aTHX_ listsv,
3568d838 3682 "%04"UVxf"\n%04"UVxf"\n", (UV)prevvalue, (UV) '-');
ffc61ed2 3683 }
b8c5462f 3684 }
ffc61ed2
JH
3685
3686 range = 0; /* this was not a true range */
73b437c8 3687 }
ffc61ed2 3688
73b437c8 3689 if (!SIZE_ONLY) {
3568d838
JH
3690 if (namedclass > OOB_NAMEDCLASS)
3691 optimize_invert = FALSE;
e2962f66
JH
3692 /* Possible truncation here but in some 64-bit environments
3693 * the compiler gets heartburn about switch on 64-bit values.
3694 * A similar issue a little earlier when switching on value.
98f323fa 3695 * --jhi */
e2962f66 3696 switch ((I32)namedclass) {
73b437c8
JH
3697 case ANYOF_ALNUM:
3698 if (LOC)
936ed897 3699 ANYOF_CLASS_SET(ret, ANYOF_ALNUM);
73b437c8
JH
3700 else {
3701 for (value = 0; value < 256; value++)
3702 if (isALNUM(value))
936ed897 3703 ANYOF_BITMAP_SET(ret, value);
73b437c8 3704 }
ffc61ed2 3705 Perl_sv_catpvf(aTHX_ listsv, "+utf8::IsWord\n");
73b437c8
JH
3706 break;
3707 case ANYOF_NALNUM:
3708 if (LOC)
936ed897 3709 ANYOF_CLASS_SET(ret, ANYOF_NALNUM);
73b437c8
JH
3710 else {
3711 for (value = 0; value < 256; value++)
3712 if (!isALNUM(value))
936ed897 3713 ANYOF_BITMAP_SET(ret, value);
73b437c8 3714 }
ffc61ed2 3715 Perl_sv_catpvf(aTHX_ listsv, "!utf8::IsWord\n");
73b437c8 3716 break;
ffc61ed2 3717 case ANYOF_ALNUMC:
73b437c8 3718 if (LOC)
ffc61ed2 3719 ANYOF_CLASS_SET(ret, ANYOF_ALNUMC);
73b437c8
JH
3720 else {
3721 for (value = 0; value < 256; value++)
ffc61ed2 3722 if (isALNUMC(value))
936ed897 3723 ANYOF_BITMAP_SET(ret, value);
73b437c8 3724 }
ffc61ed2 3725 Perl_sv_catpvf(aTHX_ listsv, "+utf8::IsAlnum\n");
73b437c8
JH
3726 break;
3727 case ANYOF_NALNUMC:
3728 if (LOC)
936ed897 3729 ANYOF_CLASS_SET(ret, ANYOF_NALNUMC);
73b437c8
JH
3730 else {
3731 for (value = 0; value < 256; value++)
3732 if (!isALNUMC(value))
936ed897 3733 ANYOF_BITMAP_SET(ret, value);
73b437c8 3734 }
ffc61ed2 3735 Perl_sv_catpvf(aTHX_ listsv, "!utf8::IsAlnum\n");
73b437c8
JH
3736 break;
3737 case ANYOF_ALPHA:
3738 if (LOC)
936ed897 3739 ANYOF_CLASS_SET(ret, ANYOF_ALPHA);
73b437c8
JH
3740 else {
3741 for (value = 0; value < 256; value++)
3742 if (isALPHA(value))
936ed897 3743 ANYOF_BITMAP_SET(ret, value);
73b437c8 3744 }
ffc61ed2 3745 Perl_sv_catpvf(aTHX_ listsv, "+utf8::IsAlpha\n");
73b437c8
JH
3746 break;
3747 case ANYOF_NALPHA:
3748 if (LOC)
936ed897 3749 ANYOF_CLASS_SET(ret, ANYOF_NALPHA);
73b437c8
JH
3750 else {
3751 for (value = 0; value < 256; value++)
3752 if (!isALPHA(value))
936ed897 3753 ANYOF_BITMAP_SET(ret, value);
73b437c8 3754 }
ffc61ed2 3755 Perl_sv_catpvf(aTHX_ listsv, "!utf8::IsAlpha\n");
73b437c8
JH
3756 break;
3757 case ANYOF_ASCII:
3758 if (LOC)
936ed897 3759 ANYOF_CLASS_SET(ret, ANYOF_ASCII);
73b437c8 3760 else {
c7f1f016 3761#ifndef EBCDIC
1ba5c669
JH
3762 for (value = 0; value < 128; value++)
3763 ANYOF_BITMAP_SET(ret, value);
3764#else /* EBCDIC */
ffbc6a93 3765 for (value = 0; value < 256; value++) {
3a3c4447
JH
3766 if (isASCII(value))
3767 ANYOF_BITMAP_SET(ret, value);
ffbc6a93 3768 }
1ba5c669 3769#endif /* EBCDIC */
73b437c8 3770 }
ffc61ed2 3771 Perl_sv_catpvf(aTHX_ listsv, "+utf8::IsASCII\n");
73b437c8
JH
3772 break;
3773 case ANYOF_NASCII:
3774 if (LOC)
936ed897 3775 ANYOF_CLASS_SET(ret, ANYOF_NASCII);
73b437c8 3776 else {
c7f1f016 3777#ifndef EBCDIC
1ba5c669
JH
3778 for (value = 128; value < 256; value++)
3779 ANYOF_BITMAP_SET(ret, value);
3780#else /* EBCDIC */
ffbc6a93 3781 for (value = 0; value < 256; value++) {
3a3c4447
JH
3782 if (!isASCII(value))
3783 ANYOF_BITMAP_SET(ret, value);
ffbc6a93 3784 }
1ba5c669 3785#endif /* EBCDIC */
73b437c8 3786 }
ffc61ed2 3787 Perl_sv_catpvf(aTHX_ listsv, "!utf8::IsASCII\n");
73b437c8 3788 break;
aaa51d5e
JF
3789 case ANYOF_BLANK:
3790 if (LOC)
3791 ANYOF_CLASS_SET(ret, ANYOF_BLANK);
3792 else {
3793 for (value = 0; value < 256; value++)
3794 if (isBLANK(value))
3795 ANYOF_BITMAP_SET(ret, value);
3796 }
ffc61ed2 3797 Perl_sv_catpvf(aTHX_ listsv, "+utf8::IsBlank\n");
aaa51d5e
JF
3798 break;
3799 case ANYOF_NBLANK:
3800 if (LOC)
3801 ANYOF_CLASS_SET(ret, ANYOF_NBLANK);
3802 else {
3803 for (value = 0; value < 256; value++)
3804 if (!isBLANK(value))
3805 ANYOF_BITMAP_SET(ret, value);
3806 }
ffc61ed2 3807 Perl_sv_catpvf(aTHX_ listsv, "!utf8::IsBlank\n");
aaa51d5e 3808 break;
73b437c8
JH
3809 case ANYOF_CNTRL:
3810 if (LOC)
936ed897 3811 ANYOF_CLASS_SET(ret, ANYOF_CNTRL);
73b437c8
JH
3812 else {
3813 for (value = 0; value < 256; value++)
3814 if (isCNTRL(value))
936ed897 3815 ANYOF_BITMAP_SET(ret, value);
73b437c8 3816 }
ffc61ed2 3817 Perl_sv_catpvf(aTHX_ listsv, "+utf8::IsCntrl\n");
73b437c8
JH
3818 break;
3819 case ANYOF_NCNTRL:
3820 if (LOC)
936ed897 3821 ANYOF_CLASS_SET(ret, ANYOF_NCNTRL);
73b437c8
JH
3822 else {
3823 for (value = 0; value < 256; value++)
3824 if (!isCNTRL(value))
936ed897 3825 ANYOF_BITMAP_SET(ret, value);
73b437c8 3826 }
ffc61ed2
JH
3827 Perl_sv_catpvf(aTHX_ listsv, "!utf8::IsCntrl\n");
3828 break;
3829 case ANYOF_DIGIT:
3830 if (LOC)
3831 ANYOF_CLASS_SET(ret, ANYOF_DIGIT);
3832 else {
3833 /* consecutive digits assumed */
3834 for (value = '0'; value <= '9'; value++)
3835 ANYOF_BITMAP_SET(ret, value);
3836 }
3837 Perl_sv_catpvf(aTHX_ listsv, "+utf8::IsDigit\n");
3838 break;
3839 case ANYOF_NDIGIT:
3840 if (LOC)
3841 ANYOF_CLASS_SET(ret, ANYOF_NDIGIT);
3842 else {
3843 /* consecutive digits assumed */
3844 for (value = 0; value < '0'; value++)
3845 ANYOF_BITMAP_SET(ret, value);
3846 for (value = '9' + 1; value < 256; value++)
3847 ANYOF_BITMAP_SET(ret, value);
3848 }
3849 Perl_sv_catpvf(aTHX_ listsv, "!utf8::IsDigit\n");
73b437c8
JH
3850 break;
3851 case ANYOF_GRAPH:
3852 if (LOC)
936ed897 3853 ANYOF_CLASS_SET(ret, ANYOF_GRAPH);
73b437c8
JH
3854 else {
3855 for (value = 0; value < 256; value++)
3856 if (isGRAPH(value))
936ed897 3857 ANYOF_BITMAP_SET(ret, value);
73b437c8 3858 }
ffc61ed2 3859 Perl_sv_catpvf(aTHX_ listsv, "+utf8::IsGraph\n");
73b437c8
JH
3860 break;
3861 case ANYOF_NGRAPH:
3862 if (LOC)
936ed897 3863 ANYOF_CLASS_SET(ret, ANYOF_NGRAPH);
73b437c8
JH
3864 else {
3865 for (value = 0; value < 256; value++)
3866 if (!isGRAPH(value))
936ed897 3867 ANYOF_BITMAP_SET(ret, value);
73b437c8 3868 }
ffc61ed2 3869 Perl_sv_catpvf(aTHX_ listsv, "!utf8::IsGraph\n");
73b437c8
JH
3870 break;
3871 case ANYOF_LOWER:
3872 if (LOC)
936ed897 3873 ANYOF_CLASS_SET(ret, ANYOF_LOWER);
73b437c8
JH
3874 else {
3875 for (value = 0; value < 256; value++)
3876 if (isLOWER(value))
936ed897 3877 ANYOF_BITMAP_SET(ret, value);
73b437c8 3878 }
ffc61ed2 3879 Perl_sv_catpvf(aTHX_ listsv, "+utf8::IsLower\n");
73b437c8
JH
3880 break;
3881 case ANYOF_NLOWER:
3882 if (LOC)
936ed897 3883 ANYOF_CLASS_SET(ret, ANYOF_NLOWER);
73b437c8
JH
3884 else {
3885 for (value = 0; value < 256; value++)
3886 if (!isLOWER(value))
936ed897 3887 ANYOF_BITMAP_SET(ret, value);
73b437c8 3888 }
ffc61ed2 3889 Perl_sv_catpvf(aTHX_ listsv, "!utf8::IsLower\n");
73b437c8
JH
3890 break;
3891 case ANYOF_PRINT:
3892 if (LOC)
936ed897 3893 ANYOF_CLASS_SET(ret, ANYOF_PRINT);
73b437c8
JH
3894 else {
3895 for (value = 0; value < 256; value++)
3896 if (isPRINT(value))
936ed897 3897 ANYOF_BITMAP_SET(ret, value);
73b437c8 3898 }
ffc61ed2 3899 Perl_sv_catpvf(aTHX_ listsv, "+utf8::IsPrint\n");
73b437c8
JH
3900 break;
3901 case ANYOF_NPRINT:
3902 if (LOC)
936ed897 3903 ANYOF_CLASS_SET(ret, ANYOF_NPRINT);
73b437c8
JH
3904 else {
3905 for (value = 0; value < 256; value++)
3906 if (!isPRINT(value))
936ed897 3907 ANYOF_BITMAP_SET(ret, value);
73b437c8 3908 }
ffc61ed2 3909 Perl_sv_catpvf(aTHX_ listsv, "!utf8::IsPrint\n");
73b437c8 3910 break;
aaa51d5e
JF
3911 case ANYOF_PSXSPC:
3912 if (LOC)
3913 ANYOF_CLASS_SET(ret, ANYOF_PSXSPC);
3914 else {
3915 for (value = 0; value < 256; value++)
3916 if (isPSXSPC(value))
3917 ANYOF_BITMAP_SET(ret, value);
3918 }
ffc61ed2 3919 Perl_sv_catpvf(aTHX_ listsv, "+utf8::IsSpace\n");
aaa51d5e
JF
3920 break;
3921 case ANYOF_NPSXSPC:
3922 if (LOC)
3923 ANYOF_CLASS_SET(ret, ANYOF_NPSXSPC);
3924 else {
3925 for (value = 0; value < 256; value++)
3926 if (!isPSXSPC(value))
3927 ANYOF_BITMAP_SET(ret, value);
3928 }
ffc61ed2 3929 Perl_sv_catpvf(aTHX_ listsv, "!utf8::IsSpace\n");
aaa51d5e 3930 break;
73b437c8
JH
3931 case ANYOF_PUNCT:
3932 if (LOC)
936ed897 3933 ANYOF_CLASS_SET(ret, ANYOF_PUNCT);
73b437c8
JH
3934 else {
3935 for (value = 0; value < 256; value++)
3936 if (isPUNCT(value))
936ed897 3937 ANYOF_BITMAP_SET(ret, value);
73b437c8 3938 }
ffc61ed2 3939 Perl_sv_catpvf(aTHX_ listsv, "+utf8::IsPunct\n");
73b437c8
JH
3940 break;
3941 case ANYOF_NPUNCT:
3942 if (LOC)
936ed897 3943 ANYOF_CLASS_SET(ret, ANYOF_NPUNCT);
73b437c8
JH
3944 else {
3945 for (value = 0; value < 256; value++)
3946 if (!isPUNCT(value))
936ed897 3947 ANYOF_BITMAP_SET(ret, value);
73b437c8 3948 }
ffc61ed2
JH
3949 Perl_sv_catpvf(aTHX_ listsv, "!utf8::IsPunct\n");
3950 break;
3951 case ANYOF_SPACE:
3952 if (LOC)
3953 ANYOF_CLASS_SET(ret, ANYOF_SPACE);
3954 else {
3955 for (value = 0; value < 256; value++)
3956 if (isSPACE(value))
3957 ANYOF_BITMAP_SET(ret, value);
3958 }
3959 Perl_sv_catpvf(aTHX_ listsv, "+utf8::IsSpacePerl\n");
3960 break;
3961 case ANYOF_NSPACE:
3962 if (LOC)
3963 ANYOF_CLASS_SET(ret, ANYOF_NSPACE);
3964 else {
3965 for (value = 0; value < 256; value++)
3966 if (!isSPACE(value))
3967 ANYOF_BITMAP_SET(ret, value);
3968 }
3969 Perl_sv_catpvf(aTHX_ listsv, "!utf8::IsSpacePerl\n");
73b437c8
JH
3970 break;
3971 case ANYOF_UPPER:
3972 if (LOC)
936ed897 3973 ANYOF_CLASS_SET(ret, ANYOF_UPPER);
73b437c8
JH
3974 else {
3975 for (value = 0; value < 256; value++)
3976 if (isUPPER(value))
936ed897 3977 ANYOF_BITMAP_SET(ret, value);
73b437c8 3978 }
ffc61ed2 3979 Perl_sv_catpvf(aTHX_ listsv, "+utf8::IsUpper\n");
73b437c8
JH
3980 break;
3981 case ANYOF_NUPPER:
3982 if (LOC)
936ed897 3983 ANYOF_CLASS_SET(ret, ANYOF_NUPPER);
73b437c8
JH
3984 else {
3985 for (value = 0; value < 256; value++)
3986 if (!isUPPER(value))
936ed897 3987 ANYOF_BITMAP_SET(ret, value);
73b437c8 3988 }
ffc61ed2 3989 Perl_sv_catpvf(aTHX_ listsv, "!utf8::IsUpper\n");
73b437c8
JH
3990 break;
3991 case ANYOF_XDIGIT:
3992 if (LOC)
936ed897 3993 ANYOF_CLASS_SET(ret, ANYOF_XDIGIT);
73b437c8
JH
3994 else {
3995 for (value = 0; value < 256; value++)
3996 if (isXDIGIT(value))
936ed897 3997 ANYOF_BITMAP_SET(ret, value);
73b437c8 3998 }
ffc61ed2 3999 Perl_sv_catpvf(aTHX_ listsv, "+utf8::IsXDigit\n");
73b437c8
JH
4000 break;
4001 case ANYOF_NXDIGIT:
4002 if (LOC)
936ed897 4003 ANYOF_CLASS_SET(ret, ANYOF_NXDIGIT);
73b437c8
JH
4004 else {
4005 for (value = 0; value < 256; value++)
4006 if (!isXDIGIT(value))
936ed897 4007 ANYOF_BITMAP_SET(ret, value);
73b437c8 4008 }
ffc61ed2 4009 Perl_sv_catpvf(aTHX_ listsv, "!utf8::IsXDigit\n");
73b437c8
JH
4010 break;
4011 default:
b45f050a 4012 vFAIL("Invalid [::] class");
73b437c8 4013 break;
b8c5462f 4014 }
b8c5462f 4015 if (LOC)
936ed897 4016 ANYOF_FLAGS(ret) |= ANYOF_CLASS;
73b437c8 4017 continue;
a0d0e21e 4018 }
ffc61ed2
JH
4019 } /* end of namedclass \blah */
4020
a0d0e21e 4021 if (range) {
3a3c4447 4022 if (prevvalue > value) /* b-a */ {
b45f050a 4023 Simple_vFAIL4("Invalid [] range \"%*.*s\"",
830247a4
IZ
4024 RExC_parse - rangebegin,
4025 RExC_parse - rangebegin,
b45f050a 4026 rangebegin);
3568d838 4027 range = 0; /* not a valid range */
73b437c8 4028 }
a0d0e21e
LW
4029 }
4030 else {
3568d838 4031 prevvalue = value; /* save the beginning of the range */
830247a4
IZ
4032 if (*RExC_parse == '-' && RExC_parse+1 < RExC_end &&
4033 RExC_parse[1] != ']') {
4034 RExC_parse++;
ffc61ed2
JH
4035
4036 /* a bad range like \w-, [:word:]- ? */
4037 if (namedclass > OOB_NAMEDCLASS) {
e476b1b5 4038 if (ckWARN(WARN_REGEXP))
830247a4 4039 vWARN4(RExC_parse,
b45f050a 4040 "False [] range \"%*.*s\"",
830247a4
IZ
4041 RExC_parse - rangebegin,
4042 RExC_parse - rangebegin,
b45f050a 4043 rangebegin);
73b437c8 4044 if (!SIZE_ONLY)
936ed897 4045 ANYOF_BITMAP_SET(ret, '-');
73b437c8 4046 } else
ffc61ed2
JH
4047 range = 1; /* yeah, it's a range! */
4048 continue; /* but do it the next time */
a0d0e21e 4049 }
a687059c 4050 }
ffc61ed2 4051
93733859 4052 /* now is the next time */
ae5c130c 4053 if (!SIZE_ONLY) {
3568d838
JH
4054 IV i;
4055
4056 if (prevvalue < 256) {
4057 IV ceilvalue = value < 256 ? value : 255;
4058
4059#ifdef EBCDIC
3a3c4447
JH
4060 if ((isLOWER(prevvalue) && isLOWER(ceilvalue)) ||
4061 (isUPPER(prevvalue) && isUPPER(ceilvalue)))
ffc61ed2 4062 {
3568d838
JH
4063 if (isLOWER(prevvalue)) {
4064 for (i = prevvalue; i <= ceilvalue; i++)
ffc61ed2
JH
4065 if (isLOWER(i))
4066 ANYOF_BITMAP_SET(ret, i);
4067 } else {
3568d838 4068 for (i = prevvalue; i <= ceilvalue; i++)
ffc61ed2
JH
4069 if (isUPPER(i))
4070 ANYOF_BITMAP_SET(ret, i);
4071 }
8ada0baa 4072 }
ffc61ed2 4073 else
8ada0baa 4074#endif
a5961de5
JH
4075 for (i = prevvalue; i <= ceilvalue; i++)
4076 ANYOF_BITMAP_SET(ret, i);
3568d838 4077 }
a5961de5 4078 if (value > 255 || UTF) {
b08decb7
JH
4079 UV prevnatvalue = NATIVE_TO_UNI(prevvalue);
4080 UV natvalue = NATIVE_TO_UNI(value);
4081
ffc61ed2 4082 ANYOF_FLAGS(ret) |= ANYOF_UNICODE;
b08decb7 4083 if (prevnatvalue < natvalue) { /* what about > ? */
ffc61ed2 4084 Perl_sv_catpvf(aTHX_ listsv, "%04"UVxf"\t%04"UVxf"\n",
b08decb7
JH
4085 prevnatvalue, natvalue);
4086 }
4087 else if (prevnatvalue == natvalue) {
4088 Perl_sv_catpvf(aTHX_ listsv, "%04"UVxf"\n", natvalue);
09091399 4089 if (FOLD) {
254ba52a
JH
4090 U8 foldbuf[UTF8_MAXLEN_FOLD+1];
4091 STRLEN foldlen;
2f3bf011 4092 UV f = to_uni_fold(natvalue, foldbuf, &foldlen);
254ba52a 4093
c840d2a2
JH
4094 /* If folding and foldable and a single
4095 * character, insert also the folded version
4096 * to the charclass. */
9e55ce06
JH
4097 if (f != value) {
4098 if (foldlen == UNISKIP(f))
4099 Perl_sv_catpvf(aTHX_ listsv,
4100 "%04"UVxf"\n", f);
4101 else {
4102 /* Any multicharacter foldings
4103 * require the following transform:
4104 * [ABCDEF] -> (?:[ABCabcDEFd]|pq|rst)
4105 * where E folds into "pq" and F folds
4106 * into "rst", all other characters
4107 * fold to single characters. We save
4108 * away these multicharacter foldings,
4109 * to be later saved as part of the
4110 * additional "s" data. */
4111 SV *sv;
4112
4113 if (!unicode_alternate)
4114 unicode_alternate = newAV();
4115 sv = newSVpvn((char*)foldbuf, foldlen);
4116 SvUTF8_on(sv);
4117 av_push(unicode_alternate, sv);
4118 }
4119 }
254ba52a 4120
60a8b682
JH
4121 /* If folding and the value is one of the Greek
4122 * sigmas insert a few more sigmas to make the
4123 * folding rules of the sigmas to work right.
4124 * Note that not all the possible combinations
4125 * are handled here: some of them are handled
9e55ce06
JH
4126 * by the standard folding rules, and some of
4127 * them (literal or EXACTF cases) are handled
4128 * during runtime in regexec.c:S_find_byclass(). */
09091399
JH
4129 if (value == UNICODE_GREEK_SMALL_LETTER_FINAL_SIGMA) {
4130 Perl_sv_catpvf(aTHX_ listsv, "%04"UVxf"\n",
eb050b28 4131 (UV)UNICODE_GREEK_CAPITAL_LETTER_SIGMA);
09091399 4132 Perl_sv_catpvf(aTHX_ listsv, "%04"UVxf"\n",
eb050b28 4133 (UV)UNICODE_GREEK_SMALL_LETTER_SIGMA);
09091399
JH
4134 }
4135 else if (value == UNICODE_GREEK_CAPITAL_LETTER_SIGMA)
4136 Perl_sv_catpvf(aTHX_ listsv, "%04"UVxf"\n",
eb050b28 4137 (UV)UNICODE_GREEK_SMALL_LETTER_SIGMA);
09091399
JH
4138 }
4139 }
ffc61ed2 4140 }
8ada0baa 4141 }
ffc61ed2
JH
4142
4143 range = 0; /* this range (if it was one) is done now */
a0d0e21e 4144 }
ffc61ed2 4145
936ed897 4146 if (need_class) {
4f66b38d 4147 ANYOF_FLAGS(ret) |= ANYOF_LARGE;
936ed897 4148 if (SIZE_ONLY)
830247a4 4149 RExC_size += ANYOF_CLASS_ADD_SKIP;
936ed897 4150 else
830247a4 4151 RExC_emit += ANYOF_CLASS_ADD_SKIP;
936ed897 4152 }
ffc61ed2 4153
ae5c130c 4154 /* optimize case-insensitive simple patterns (e.g. /[a-z]/i) */
b8c5462f 4155 if (!SIZE_ONLY &&
ffc61ed2 4156 /* If the only flag is folding (plus possibly inversion). */
516a5887
JH
4157 ((ANYOF_FLAGS(ret) & (ANYOF_FLAGS_ALL ^ ANYOF_INVERT)) == ANYOF_FOLD)
4158 ) {
a0ed51b3 4159 for (value = 0; value < 256; ++value) {
936ed897 4160 if (ANYOF_BITMAP_TEST(ret, value)) {
ffc61ed2
JH
4161 IV fold = PL_fold[value];
4162
4163 if (fold != value)
4164 ANYOF_BITMAP_SET(ret, fold);
ae5c130c
GS
4165 }
4166 }
936ed897 4167 ANYOF_FLAGS(ret) &= ~ANYOF_FOLD;
ae5c130c 4168 }
ffc61ed2 4169
ae5c130c 4170 /* optimize inverted simple patterns (e.g. [^a-z]) */
3568d838 4171 if (!SIZE_ONLY && optimize_invert &&
ffc61ed2
JH
4172 /* If the only flag is inversion. */
4173 (ANYOF_FLAGS(ret) & ANYOF_FLAGS_ALL) == ANYOF_INVERT) {
b8c5462f 4174 for (value = 0; value < ANYOF_BITMAP_SIZE; ++value)
936ed897 4175 ANYOF_BITMAP(ret)[value] ^= ANYOF_FLAGS_ALL;
1aa99e6b 4176 ANYOF_FLAGS(ret) = ANYOF_UNICODE_ALL;
ae5c130c 4177 }
a0d0e21e 4178
b81d288d 4179 if (!SIZE_ONLY) {
fde631ed 4180 AV *av = newAV();
ffc61ed2
JH
4181 SV *rv;
4182
9e55ce06
JH
4183 /* The 0th element stores the character class description
4184 * in its textual form: used later (regexec.c:Perl_regclass_swatch())
4185 * to initialize the appropriate swash (which gets stored in
4186 * the 1st element), and also useful for dumping the regnode.
4187 * The 2nd element stores the multicharacter foldings,
4188 * used later (regexec.c:s_reginclasslen()). */
ffc61ed2
JH
4189 av_store(av, 0, listsv);
4190 av_store(av, 1, NULL);
9e55ce06 4191 av_store(av, 2, (SV*)unicode_alternate);
ffc61ed2 4192 rv = newRV_noinc((SV*)av);
19860706 4193 n = add_data(pRExC_state, 1, "s");
830247a4 4194 RExC_rx->data->data[n] = (void*)rv;
ffc61ed2 4195 ARG_SET(ret, n);
a0ed51b3
LW
4196 }
4197
4198 return ret;
4199}
4200
76e3520e 4201STATIC char*
830247a4 4202S_nextchar(pTHX_ RExC_state_t *pRExC_state)
a0d0e21e 4203{
830247a4 4204 char* retval = RExC_parse++;
a0d0e21e 4205
4633a7c4 4206 for (;;) {
830247a4
IZ
4207 if (*RExC_parse == '(' && RExC_parse[1] == '?' &&
4208 RExC_parse[2] == '#') {
4209 while (*RExC_parse && *RExC_parse != ')')
4210 RExC_parse++;
4211 RExC_parse++;
4633a7c4
LW
4212 continue;
4213 }
830247a4
IZ
4214 if (RExC_flags16 & PMf_EXTENDED) {
4215 if (isSPACE(*RExC_parse)) {
4216 RExC_parse++;
748a9306
LW
4217 continue;
4218 }
830247a4
IZ
4219 else if (*RExC_parse == '#') {
4220 while (*RExC_parse && *RExC_parse != '\n')
4221 RExC_parse++;
4222 RExC_parse++;
748a9306
LW
4223 continue;
4224 }
748a9306 4225 }
4633a7c4 4226 return retval;
a0d0e21e 4227 }
a687059c
LW
4228}
4229
4230/*
c277df42 4231- reg_node - emit a node
a0d0e21e 4232*/
76e3520e 4233STATIC regnode * /* Location. */
830247a4 4234S_reg_node(pTHX_ RExC_state_t *pRExC_state, U8 op)
a687059c 4235{
c277df42
IZ
4236 register regnode *ret;
4237 register regnode *ptr;
a687059c 4238
830247a4 4239 ret = RExC_emit;
c277df42 4240 if (SIZE_ONLY) {
830247a4
IZ
4241 SIZE_ALIGN(RExC_size);
4242 RExC_size += 1;
a0d0e21e
LW
4243 return(ret);
4244 }
a687059c 4245
c277df42 4246 NODE_ALIGN_FILL(ret);
a0d0e21e 4247 ptr = ret;
c277df42 4248 FILL_ADVANCE_NODE(ptr, op);
fac92740
MJD
4249 if (RExC_offsets) { /* MJD */
4250 MJD_OFFSET_DEBUG((stderr, "%s:%u: (op %s) %s %u <- %u (len %u) (max %u).\n",
4251 "reg_node", __LINE__,
4252 reg_name[op],
4253 RExC_emit - RExC_emit_start > RExC_offsets[0]
4254 ? "Overwriting end of array!\n" : "OK",
4255 RExC_emit - RExC_emit_start,
4256 RExC_parse - RExC_start,
4257 RExC_offsets[0]));
4258 Set_Node_Offset(RExC_emit, RExC_parse + (op == END));
4259 }
4260
830247a4 4261 RExC_emit = ptr;
a687059c 4262
a0d0e21e 4263 return(ret);
a687059c
LW
4264}
4265
4266/*
a0d0e21e
LW
4267- reganode - emit a node with an argument
4268*/
76e3520e 4269STATIC regnode * /* Location. */
830247a4 4270S_reganode(pTHX_ RExC_state_t *pRExC_state, U8 op, U32 arg)
fe14fcc3 4271{
c277df42
IZ
4272 register regnode *ret;
4273 register regnode *ptr;
fe14fcc3 4274
830247a4 4275 ret = RExC_emit;
c277df42 4276 if (SIZE_ONLY) {
830247a4
IZ
4277 SIZE_ALIGN(RExC_size);
4278 RExC_size += 2;
a0d0e21e
LW
4279 return(ret);
4280 }
fe14fcc3 4281
c277df42 4282 NODE_ALIGN_FILL(ret);
a0d0e21e 4283 ptr = ret;
c277df42 4284 FILL_ADVANCE_NODE_ARG(ptr, op, arg);
fac92740
MJD
4285 if (RExC_offsets) { /* MJD */
4286 MJD_OFFSET_DEBUG((stderr, "%s: %s %u <- %u (max %u).\n",
4287 "reganode",
4288 RExC_emit - RExC_emit_start > RExC_offsets[0] ?
4289 "Overwriting end of array!\n" : "OK",
4290 RExC_emit - RExC_emit_start,
4291 RExC_parse - RExC_start,
4292 RExC_offsets[0]));
4293 Set_Cur_Node_Offset;
4294 }
4295
830247a4 4296 RExC_emit = ptr;
fe14fcc3 4297
a0d0e21e 4298 return(ret);
fe14fcc3
LW
4299}
4300
4301/*
cd439c50 4302- reguni - emit (if appropriate) a Unicode character
a0ed51b3
LW
4303*/
4304STATIC void
830247a4 4305S_reguni(pTHX_ RExC_state_t *pRExC_state, UV uv, char* s, STRLEN* lenp)
a0ed51b3 4306{
5e12f4fb 4307 *lenp = SIZE_ONLY ? UNISKIP(uv) : (uvchr_to_utf8((U8*)s, uv) - (U8*)s);
a0ed51b3
LW
4308}
4309
4310/*
a0d0e21e
LW
4311- reginsert - insert an operator in front of already-emitted operand
4312*
4313* Means relocating the operand.
4314*/
76e3520e 4315STATIC void
830247a4 4316S_reginsert(pTHX_ RExC_state_t *pRExC_state, U8 op, regnode *opnd)
a687059c 4317{
c277df42
IZ
4318 register regnode *src;
4319 register regnode *dst;
4320 register regnode *place;
4321 register int offset = regarglen[(U8)op];
b81d288d 4322
22c35a8c 4323/* (PL_regkind[(U8)op] == CURLY ? EXTRA_STEP_2ARGS : 0); */
c277df42
IZ
4324
4325 if (SIZE_ONLY) {
830247a4 4326 RExC_size += NODE_STEP_REGNODE + offset;
a0d0e21e
LW
4327 return;
4328 }
a687059c 4329
830247a4
IZ
4330 src = RExC_emit;
4331 RExC_emit += NODE_STEP_REGNODE + offset;
4332 dst = RExC_emit;
fac92740 4333 while (src > opnd) {
c277df42 4334 StructCopy(--src, --dst, regnode);
fac92740
MJD
4335 if (RExC_offsets) { /* MJD 20010112 */
4336 MJD_OFFSET_DEBUG((stderr, "%s: %s copy %u -> %u (max %u).\n",
4337 "reg_insert",
4338 dst - RExC_emit_start > RExC_offsets[0]
4339 ? "Overwriting end of array!\n" : "OK",
4340 src - RExC_emit_start,
4341 dst - RExC_emit_start,
4342 RExC_offsets[0]));
4343 Set_Node_Offset_To_R(dst-RExC_emit_start, Node_Offset(src));
4344 Set_Node_Length_To_R(dst-RExC_emit_start, Node_Length(src));
4345 }
4346 }
4347
a0d0e21e
LW
4348
4349 place = opnd; /* Op node, where operand used to be. */
fac92740
MJD
4350 if (RExC_offsets) { /* MJD */
4351 MJD_OFFSET_DEBUG((stderr, "%s: %s %u <- %u (max %u).\n",
4352 "reginsert",
4353 place - RExC_emit_start > RExC_offsets[0]
4354 ? "Overwriting end of array!\n" : "OK",
4355 place - RExC_emit_start,
4356 RExC_parse - RExC_start,
4357 RExC_offsets[0]));
4358 Set_Node_Offset(place, RExC_parse);
4359 }
c277df42
IZ
4360 src = NEXTOPER(place);
4361 FILL_ADVANCE_NODE(place, op);
4362 Zero(src, offset, regnode);
a687059c
LW
4363}
4364
4365/*
c277df42 4366- regtail - set the next-pointer at the end of a node chain of p to val.
a0d0e21e 4367*/
76e3520e 4368STATIC void
830247a4 4369S_regtail(pTHX_ RExC_state_t *pRExC_state, regnode *p, regnode *val)
a687059c 4370{
c277df42
IZ
4371 register regnode *scan;
4372 register regnode *temp;
a0d0e21e 4373
c277df42 4374 if (SIZE_ONLY)
a0d0e21e
LW
4375 return;
4376
4377 /* Find last node. */
4378 scan = p;
4379 for (;;) {
4380 temp = regnext(scan);
4381 if (temp == NULL)
4382 break;
4383 scan = temp;
4384 }
a687059c 4385
c277df42
IZ
4386 if (reg_off_by_arg[OP(scan)]) {
4387 ARG_SET(scan, val - scan);
a0ed51b3
LW
4388 }
4389 else {
c277df42
IZ
4390 NEXT_OFF(scan) = val - scan;
4391 }
a687059c
LW
4392}
4393
4394/*
a0d0e21e
LW
4395- regoptail - regtail on operand of first argument; nop if operandless
4396*/
76e3520e 4397STATIC void
830247a4 4398S_regoptail(pTHX_ RExC_state_t *pRExC_state, regnode *p, regnode *val)
a687059c 4399{
a0d0e21e 4400 /* "Operandless" and "op != BRANCH" are synonymous in practice. */
c277df42
IZ
4401 if (p == NULL || SIZE_ONLY)
4402 return;
22c35a8c 4403 if (PL_regkind[(U8)OP(p)] == BRANCH) {
830247a4 4404 regtail(pRExC_state, NEXTOPER(p), val);
a0ed51b3 4405 }
22c35a8c 4406 else if ( PL_regkind[(U8)OP(p)] == BRANCHJ) {
830247a4 4407 regtail(pRExC_state, NEXTOPER(NEXTOPER(p)), val);
a0ed51b3
LW
4408 }
4409 else
a0d0e21e 4410 return;
a687059c
LW
4411}
4412
4413/*
4414 - regcurly - a little FSA that accepts {\d+,?\d*}
4415 */
79072805 4416STATIC I32
cea2e8a9 4417S_regcurly(pTHX_ register char *s)
a687059c
LW
4418{
4419 if (*s++ != '{')
4420 return FALSE;
f0fcb552 4421 if (!isDIGIT(*s))
a687059c 4422 return FALSE;
f0fcb552 4423 while (isDIGIT(*s))
a687059c
LW
4424 s++;
4425 if (*s == ',')
4426 s++;
f0fcb552 4427 while (isDIGIT(*s))
a687059c
LW
4428 s++;
4429 if (*s != '}')
4430 return FALSE;
4431 return TRUE;
4432}
4433
a687059c 4434
8fa7f367
JH
4435#ifdef DEBUGGING
4436
76e3520e 4437STATIC regnode *
cea2e8a9 4438S_dumpuntil(pTHX_ regnode *start, regnode *node, regnode *last, SV* sv, I32 l)
c277df42 4439{
f248d071 4440 register U8 op = EXACT; /* Arbitrary non-END op. */
155aba94 4441 register regnode *next;
c277df42
IZ
4442
4443 while (op != END && (!last || node < last)) {
4444 /* While that wasn't END last time... */
4445
4446 NODE_ALIGN(node);
4447 op = OP(node);
4448 if (op == CLOSE)
4449 l--;
4450 next = regnext(node);
4451 /* Where, what. */
4452 if (OP(node) == OPTIMIZED)
4453 goto after_print;
4454 regprop(sv, node);
b900a521 4455 PerlIO_printf(Perl_debug_log, "%4"IVdf":%*s%s", (IV)(node - start),
f1dbda3d 4456 (int)(2*l + 1), "", SvPVX(sv));
c277df42
IZ
4457 if (next == NULL) /* Next ptr. */
4458 PerlIO_printf(Perl_debug_log, "(0)");
b81d288d 4459 else
b900a521 4460 PerlIO_printf(Perl_debug_log, "(%"IVdf")", (IV)(next - start));
c277df42
IZ
4461 (void)PerlIO_putc(Perl_debug_log, '\n');
4462 after_print:
22c35a8c 4463 if (PL_regkind[(U8)op] == BRANCHJ) {
b81d288d
AB
4464 register regnode *nnode = (OP(next) == LONGJMP
4465 ? regnext(next)
c277df42
IZ
4466 : next);
4467 if (last && nnode > last)
4468 nnode = last;
4469 node = dumpuntil(start, NEXTOPER(NEXTOPER(node)), nnode, sv, l + 1);
a0ed51b3 4470 }
22c35a8c 4471 else if (PL_regkind[(U8)op] == BRANCH) {
c277df42 4472 node = dumpuntil(start, NEXTOPER(node), next, sv, l + 1);
a0ed51b3
LW
4473 }
4474 else if ( op == CURLY) { /* `next' might be very big: optimizer */
c277df42
IZ
4475 node = dumpuntil(start, NEXTOPER(node) + EXTRA_STEP_2ARGS,
4476 NEXTOPER(node) + EXTRA_STEP_2ARGS + 1, sv, l + 1);
a0ed51b3 4477 }
22c35a8c 4478 else if (PL_regkind[(U8)op] == CURLY && op != CURLYX) {
c277df42
IZ
4479 node = dumpuntil(start, NEXTOPER(node) + EXTRA_STEP_2ARGS,
4480 next, sv, l + 1);
a0ed51b3
LW
4481 }
4482 else if ( op == PLUS || op == STAR) {
c277df42 4483 node = dumpuntil(start, NEXTOPER(node), NEXTOPER(node) + 1, sv, l + 1);
a0ed51b3
LW
4484 }
4485 else if (op == ANYOF) {
4f66b38d
HS
4486 /* arglen 1 + class block */
4487 node += 1 + ((ANYOF_FLAGS(node) & ANYOF_LARGE)
4488 ? ANYOF_CLASS_SKIP : ANYOF_SKIP);
4489 node = NEXTOPER(node);
a0ed51b3 4490 }
22c35a8c 4491 else if (PL_regkind[(U8)op] == EXACT) {
c277df42 4492 /* Literal string, where present. */
cd439c50 4493 node += NODE_SZ_STR(node) - 1;
c277df42 4494 node = NEXTOPER(node);
a0ed51b3
LW
4495 }
4496 else {
c277df42
IZ
4497 node = NEXTOPER(node);
4498 node += regarglen[(U8)op];
4499 }
4500 if (op == CURLYX || op == OPEN)
4501 l++;
4502 else if (op == WHILEM)
4503 l--;
4504 }
4505 return node;
4506}
4507
8fa7f367
JH
4508#endif /* DEBUGGING */
4509
a687059c 4510/*
fd181c75 4511 - regdump - dump a regexp onto Perl_debug_log in vaguely comprehensible form
a687059c
LW
4512 */
4513void
864dbfa3 4514Perl_regdump(pTHX_ regexp *r)
a687059c 4515{
35ff7856 4516#ifdef DEBUGGING
46fc3d4c 4517 SV *sv = sv_newmortal();
a687059c 4518
c277df42 4519 (void)dumpuntil(r->program, r->program + 1, NULL, sv, 0);
a0d0e21e
LW
4520
4521 /* Header fields of interest. */
c277df42 4522 if (r->anchored_substr)
7b0972df 4523 PerlIO_printf(Perl_debug_log,
b81d288d 4524 "anchored `%s%.*s%s'%s at %"IVdf" ",
3280af22 4525 PL_colors[0],
7b0972df 4526 (int)(SvCUR(r->anchored_substr) - (SvTAIL(r->anchored_substr)!=0)),
b81d288d 4527 SvPVX(r->anchored_substr),
3280af22 4528 PL_colors[1],
c277df42 4529 SvTAIL(r->anchored_substr) ? "$" : "",
7b0972df 4530 (IV)r->anchored_offset);
c277df42 4531 if (r->float_substr)
7b0972df 4532 PerlIO_printf(Perl_debug_log,
b81d288d 4533 "floating `%s%.*s%s'%s at %"IVdf"..%"UVuf" ",
3280af22 4534 PL_colors[0],
b81d288d 4535 (int)(SvCUR(r->float_substr) - (SvTAIL(r->float_substr)!=0)),
2c2d71f5 4536 SvPVX(r->float_substr),
3280af22 4537 PL_colors[1],
c277df42 4538 SvTAIL(r->float_substr) ? "$" : "",
7b0972df 4539 (IV)r->float_min_offset, (UV)r->float_max_offset);
c277df42 4540 if (r->check_substr)
b81d288d
AB
4541 PerlIO_printf(Perl_debug_log,
4542 r->check_substr == r->float_substr
c277df42
IZ
4543 ? "(checking floating" : "(checking anchored");
4544 if (r->reganch & ROPT_NOSCAN)
4545 PerlIO_printf(Perl_debug_log, " noscan");
4546 if (r->reganch & ROPT_CHECK_ALL)
4547 PerlIO_printf(Perl_debug_log, " isall");
4548 if (r->check_substr)
4549 PerlIO_printf(Perl_debug_log, ") ");
4550
46fc3d4c 4551 if (r->regstclass) {
4552 regprop(sv, r->regstclass);
4553 PerlIO_printf(Perl_debug_log, "stclass `%s' ", SvPVX(sv));
4554 }
774d564b 4555 if (r->reganch & ROPT_ANCH) {
4556 PerlIO_printf(Perl_debug_log, "anchored");
4557 if (r->reganch & ROPT_ANCH_BOL)
4558 PerlIO_printf(Perl_debug_log, "(BOL)");
c277df42
IZ
4559 if (r->reganch & ROPT_ANCH_MBOL)
4560 PerlIO_printf(Perl_debug_log, "(MBOL)");
cad2e5aa
JH
4561 if (r->reganch & ROPT_ANCH_SBOL)
4562 PerlIO_printf(Perl_debug_log, "(SBOL)");
774d564b 4563 if (r->reganch & ROPT_ANCH_GPOS)
4564 PerlIO_printf(Perl_debug_log, "(GPOS)");
4565 PerlIO_putc(Perl_debug_log, ' ');
4566 }
c277df42
IZ
4567 if (r->reganch & ROPT_GPOS_SEEN)
4568 PerlIO_printf(Perl_debug_log, "GPOS ");
a0d0e21e 4569 if (r->reganch & ROPT_SKIP)
760ac839 4570 PerlIO_printf(Perl_debug_log, "plus ");
a0d0e21e 4571 if (r->reganch & ROPT_IMPLICIT)
760ac839 4572 PerlIO_printf(Perl_debug_log, "implicit ");
760ac839 4573 PerlIO_printf(Perl_debug_log, "minlen %ld ", (long) r->minlen);
ce862d02
IZ
4574 if (r->reganch & ROPT_EVAL_SEEN)
4575 PerlIO_printf(Perl_debug_log, "with eval ");
760ac839 4576 PerlIO_printf(Perl_debug_log, "\n");
fac92740
MJD
4577 if (r->offsets) {
4578 U32 i;
4579 U32 len = r->offsets[0];
392fbf5d 4580 PerlIO_printf(Perl_debug_log, "Offsets: [%"UVuf"]\n\t", (UV)r->offsets[0]);
fac92740 4581 for (i = 1; i <= len; i++)
392fbf5d
RB
4582 PerlIO_printf(Perl_debug_log, "%"UVuf"[%"UVuf"] ",
4583 (UV)r->offsets[i*2-1],
4584 (UV)r->offsets[i*2]);
fac92740
MJD
4585 PerlIO_printf(Perl_debug_log, "\n");
4586 }
17c3b450 4587#endif /* DEBUGGING */
a687059c
LW
4588}
4589
8fa7f367
JH
4590#ifdef DEBUGGING
4591
653099ff
GS
4592STATIC void
4593S_put_byte(pTHX_ SV *sv, int c)
4594{
7be5a6cf 4595 if (isCNTRL(c) || c == 255 || !isPRINT(c))
653099ff
GS
4596 Perl_sv_catpvf(aTHX_ sv, "\\%o", c);
4597 else if (c == '-' || c == ']' || c == '\\' || c == '^')
4598 Perl_sv_catpvf(aTHX_ sv, "\\%c", c);
4599 else
4600 Perl_sv_catpvf(aTHX_ sv, "%c", c);
4601}
4602
8fa7f367
JH
4603#endif /* DEBUGGING */
4604
a687059c 4605/*
a0d0e21e
LW
4606- regprop - printable representation of opcode
4607*/
46fc3d4c 4608void
864dbfa3 4609Perl_regprop(pTHX_ SV *sv, regnode *o)
a687059c 4610{
35ff7856 4611#ifdef DEBUGGING
9b155405 4612 register int k;
a0d0e21e 4613
54dc92de 4614 sv_setpvn(sv, "", 0);
9b155405 4615 if (OP(o) >= reg_num) /* regnode.type is unsigned */
830247a4
IZ
4616 /* It would be nice to FAIL() here, but this may be called from
4617 regexec.c, and it would be hard to supply pRExC_state. */
4618 Perl_croak(aTHX_ "Corrupted regexp opcode");
9b155405
IZ
4619 sv_catpv(sv, (char*)reg_name[OP(o)]); /* Take off const! */
4620
4621 k = PL_regkind[(U8)OP(o)];
4622
2a782b5b
JH
4623 if (k == EXACT) {
4624 SV *dsv = sv_2mortal(newSVpvn("", 0));
c728cb41
JH
4625 /* Using is_utf8_string() is a crude hack but it may
4626 * be the best for now since we have no flag "this EXACTish
4627 * node was UTF-8" --jhi */
4628 bool do_utf8 = is_utf8_string((U8*)STRING(o), STR_LEN(o));
8a989385 4629 char *s = do_utf8 ?
c728cb41
JH
4630 pv_uni_display(dsv, (U8*)STRING(o), STR_LEN(o), 60,
4631 UNI_DISPLAY_REGEX) :
2a782b5b 4632 STRING(o);
40eddc46 4633 int len = do_utf8 ?
2a782b5b
JH
4634 strlen(s) :
4635 STR_LEN(o);
4636 Perl_sv_catpvf(aTHX_ sv, " <%s%.*s%s>",
4637 PL_colors[0],
4638 len, s,
4639 PL_colors[1]);
4640 }
9b155405 4641 else if (k == CURLY) {
cb434fcc 4642 if (OP(o) == CURLYM || OP(o) == CURLYN || OP(o) == CURLYX)
cea2e8a9
GS
4643 Perl_sv_catpvf(aTHX_ sv, "[%d]", o->flags); /* Parenth number */
4644 Perl_sv_catpvf(aTHX_ sv, " {%d,%d}", ARG1(o), ARG2(o));
a0d0e21e 4645 }
2c2d71f5
JH
4646 else if (k == WHILEM && o->flags) /* Ordinal/of */
4647 Perl_sv_catpvf(aTHX_ sv, "[%d/%d]", o->flags & 0xf, o->flags>>4);
9b155405 4648 else if (k == REF || k == OPEN || k == CLOSE || k == GROUPP )
894356b3 4649 Perl_sv_catpvf(aTHX_ sv, "%d", (int)ARG(o)); /* Parenth number */
9b155405 4650 else if (k == LOGICAL)
04ebc1ab 4651 Perl_sv_catpvf(aTHX_ sv, "[%d]", o->flags); /* 2: embedded, otherwise 1 */
653099ff
GS
4652 else if (k == ANYOF) {
4653 int i, rangestart = -1;
ffc61ed2 4654 U8 flags = ANYOF_FLAGS(o);
19860706
JH
4655 const char * const anyofs[] = { /* Should be syncronized with
4656 * ANYOF_ #xdefines in regcomp.h */
653099ff
GS
4657 "\\w",
4658 "\\W",
4659 "\\s",
4660 "\\S",
4661 "\\d",
4662 "\\D",
4663 "[:alnum:]",
4664 "[:^alnum:]",
4665 "[:alpha:]",
4666 "[:^alpha:]",
4667 "[:ascii:]",
4668 "[:^ascii:]",
4669 "[:ctrl:]",
4670 "[:^ctrl:]",
4671 "[:graph:]",
4672 "[:^graph:]",
4673 "[:lower:]",
4674 "[:^lower:]",
4675 "[:print:]",
4676 "[:^print:]",
4677 "[:punct:]",
4678 "[:^punct:]",
4679 "[:upper:]",
aaa51d5e 4680 "[:^upper:]",
653099ff 4681 "[:xdigit:]",
aaa51d5e
JF
4682 "[:^xdigit:]",
4683 "[:space:]",
4684 "[:^space:]",
4685 "[:blank:]",
4686 "[:^blank:]"
653099ff
GS
4687 };
4688
19860706 4689 if (flags & ANYOF_LOCALE)
653099ff 4690 sv_catpv(sv, "{loc}");
19860706 4691 if (flags & ANYOF_FOLD)
653099ff
GS
4692 sv_catpv(sv, "{i}");
4693 Perl_sv_catpvf(aTHX_ sv, "[%s", PL_colors[0]);
19860706 4694 if (flags & ANYOF_INVERT)
653099ff 4695 sv_catpv(sv, "^");
ffc61ed2
JH
4696 for (i = 0; i <= 256; i++) {
4697 if (i < 256 && ANYOF_BITMAP_TEST(o,i)) {
4698 if (rangestart == -1)
4699 rangestart = i;
4700 } else if (rangestart != -1) {
4701 if (i <= rangestart + 3)
4702 for (; rangestart < i; rangestart++)
653099ff 4703 put_byte(sv, rangestart);
ffc61ed2
JH
4704 else {
4705 put_byte(sv, rangestart);
4706 sv_catpv(sv, "-");
4707 put_byte(sv, i - 1);
653099ff 4708 }
ffc61ed2 4709 rangestart = -1;
653099ff 4710 }
847a199f 4711 }
ffc61ed2
JH
4712
4713 if (o->flags & ANYOF_CLASS)
4714 for (i = 0; i < sizeof(anyofs)/sizeof(char*); i++)
4715 if (ANYOF_CLASS_TEST(o,i))
4716 sv_catpv(sv, anyofs[i]);
4717
4718 if (flags & ANYOF_UNICODE)
4719 sv_catpv(sv, "{unicode}");
1aa99e6b 4720 else if (flags & ANYOF_UNICODE_ALL)
2a782b5b 4721 sv_catpv(sv, "{unicode_all}");
ffc61ed2
JH
4722
4723 {
4724 SV *lv;
9e55ce06 4725 SV *sw = regclass_swash(o, FALSE, &lv, 0);
b81d288d 4726
ffc61ed2
JH
4727 if (lv) {
4728 if (sw) {
4729 UV i;
4730 U8 s[UTF8_MAXLEN+1];
b81d288d 4731
ffc61ed2 4732 for (i = 0; i <= 256; i++) { /* just the first 256 */
2b9d42f0 4733 U8 *e = uvchr_to_utf8(s, i);
ffc61ed2 4734
3568d838 4735 if (i < 256 && swash_fetch(sw, s, TRUE)) {
ffc61ed2
JH
4736 if (rangestart == -1)
4737 rangestart = i;
4738 } else if (rangestart != -1) {
4739 U8 *p;
b81d288d 4740
ffc61ed2
JH
4741 if (i <= rangestart + 3)
4742 for (; rangestart < i; rangestart++) {
2b9d42f0 4743 for(e = uvchr_to_utf8(s, rangestart), p = s; p < e; p++)
ffc61ed2
JH
4744 put_byte(sv, *p);
4745 }
4746 else {
2b9d42f0 4747 for (e = uvchr_to_utf8(s, rangestart), p = s; p < e; p++)
ffc61ed2
JH
4748 put_byte(sv, *p);
4749 sv_catpv(sv, "-");
2b9d42f0 4750 for (e = uvchr_to_utf8(s, i - 1), p = s; p < e; p++)
ffc61ed2
JH
4751 put_byte(sv, *p);
4752 }
4753 rangestart = -1;
4754 }
19860706 4755 }
ffc61ed2
JH
4756
4757 sv_catpv(sv, "..."); /* et cetera */
19860706 4758 }
fde631ed 4759
ffc61ed2
JH
4760 {
4761 char *s = savepv(SvPVX(lv));
4762 char *origs = s;
b81d288d 4763
ffc61ed2 4764 while(*s && *s != '\n') s++;
b81d288d 4765
ffc61ed2
JH
4766 if (*s == '\n') {
4767 char *t = ++s;
4768
4769 while (*s) {
4770 if (*s == '\n')
4771 *s = ' ';
4772 s++;
4773 }
4774 if (s[-1] == ' ')
4775 s[-1] = 0;
4776
4777 sv_catpv(sv, t);
fde631ed 4778 }
b81d288d 4779
ffc61ed2 4780 Safefree(origs);
fde631ed
JH
4781 }
4782 }
653099ff 4783 }
ffc61ed2 4784
653099ff
GS
4785 Perl_sv_catpvf(aTHX_ sv, "%s]", PL_colors[1]);
4786 }
9b155405 4787 else if (k == BRANCHJ && (OP(o) == UNLESSM || OP(o) == IFMATCH))
cea2e8a9 4788 Perl_sv_catpvf(aTHX_ sv, "[-%d]", o->flags);
17c3b450 4789#endif /* DEBUGGING */
35ff7856 4790}
a687059c 4791
cad2e5aa
JH
4792SV *
4793Perl_re_intuit_string(pTHX_ regexp *prog)
4794{ /* Assume that RE_INTUIT is set */
4795 DEBUG_r(
4796 { STRLEN n_a;
4797 char *s = SvPV(prog->check_substr,n_a);
4798
4799 if (!PL_colorset) reginitcolors();
4800 PerlIO_printf(Perl_debug_log,
4801 "%sUsing REx substr:%s `%s%.60s%s%s'\n",
4802 PL_colors[4],PL_colors[5],PL_colors[0],
4803 s,
4804 PL_colors[1],
4805 (strlen(s) > 60 ? "..." : ""));
4806 } );
4807
4808 return prog->check_substr;
4809}
4810
2b69d0c2 4811void
864dbfa3 4812Perl_pregfree(pTHX_ struct regexp *r)
a687059c 4813{
9e55ce06
JH
4814#ifdef DEBUGGING
4815 SV *dsv = PERL_DEBUG_PAD_ZERO(0);
4816#endif
7821416a
IZ
4817
4818 if (!r || (--r->refcnt > 0))
4819 return;
9e55ce06 4820 DEBUG_r({
9e55ce06 4821 char *s = pv_uni_display(dsv, (U8*)r->precomp, r->prelen, 60,
c728cb41 4822 UNI_DISPLAY_REGEX);
9e55ce06
JH
4823 int len = SvCUR(dsv);
4824 if (!PL_colorset)
4825 reginitcolors();
4826 PerlIO_printf(Perl_debug_log,
4827 "%sFreeing REx:%s `%s%*.*s%s%s'\n",
4828 PL_colors[4],PL_colors[5],PL_colors[0],
4829 len, len, s,
4830 PL_colors[1],
4831 len > 60 ? "..." : "");
4832 });
cad2e5aa 4833
c277df42 4834 if (r->precomp)
a0d0e21e 4835 Safefree(r->precomp);
fac92740
MJD
4836 if (r->offsets) /* 20010421 MJD */
4837 Safefree(r->offsets);
cf93c79d
IZ
4838 if (RX_MATCH_COPIED(r))
4839 Safefree(r->subbeg);
a193d654
GS
4840 if (r->substrs) {
4841 if (r->anchored_substr)
4842 SvREFCNT_dec(r->anchored_substr);
4843 if (r->float_substr)
4844 SvREFCNT_dec(r->float_substr);
2779dcf1 4845 Safefree(r->substrs);
a193d654 4846 }
c277df42
IZ
4847 if (r->data) {
4848 int n = r->data->count;
dfad63ad
HS
4849 AV* new_comppad = NULL;
4850 AV* old_comppad;
4851 SV** old_curpad;
4852
c277df42 4853 while (--n >= 0) {
261faec3 4854 /* If you add a ->what type here, update the comment in regcomp.h */
c277df42
IZ
4855 switch (r->data->what[n]) {
4856 case 's':
4857 SvREFCNT_dec((SV*)r->data->data[n]);
4858 break;
653099ff
GS
4859 case 'f':
4860 Safefree(r->data->data[n]);
4861 break;
dfad63ad
HS
4862 case 'p':
4863 new_comppad = (AV*)r->data->data[n];
4864 break;
c277df42 4865 case 'o':
dfad63ad 4866 if (new_comppad == NULL)
cea2e8a9 4867 Perl_croak(aTHX_ "panic: pregfree comppad");
dfad63ad
HS
4868 old_comppad = PL_comppad;
4869 old_curpad = PL_curpad;
1e6dc0b6
SB
4870 /* Watch out for global destruction's random ordering. */
4871 if (SvTYPE(new_comppad) == SVt_PVAV) {
4872 PL_comppad = new_comppad;
4873 PL_curpad = AvARRAY(new_comppad);
4874 }
4875 else
4876 PL_curpad = NULL;
9b978d73
DM
4877
4878 if (!OpREFCNT_dec((OP_4tree*)r->data->data[n])) {
4879 op_free((OP_4tree*)r->data->data[n]);
4880 }
4881
dfad63ad
HS
4882 PL_comppad = old_comppad;
4883 PL_curpad = old_curpad;
4884 SvREFCNT_dec((SV*)new_comppad);
4885 new_comppad = NULL;
c277df42
IZ
4886 break;
4887 case 'n':
9e55ce06 4888 break;
c277df42 4889 default:
830247a4 4890 Perl_croak(aTHX_ "panic: regfree data code '%c'", r->data->what[n]);
c277df42
IZ
4891 }
4892 }
4893 Safefree(r->data->what);
4894 Safefree(r->data);
a0d0e21e
LW
4895 }
4896 Safefree(r->startp);
4897 Safefree(r->endp);
4898 Safefree(r);
a687059c 4899}
c277df42
IZ
4900
4901/*
4902 - regnext - dig the "next" pointer out of a node
4903 *
4904 * [Note, when REGALIGN is defined there are two places in regmatch()
4905 * that bypass this code for speed.]
4906 */
4907regnode *
864dbfa3 4908Perl_regnext(pTHX_ register regnode *p)
c277df42
IZ
4909{
4910 register I32 offset;
4911
3280af22 4912 if (p == &PL_regdummy)
c277df42
IZ
4913 return(NULL);
4914
4915 offset = (reg_off_by_arg[OP(p)] ? ARG(p) : NEXT_OFF(p));
4916 if (offset == 0)
4917 return(NULL);
4918
c277df42 4919 return(p+offset);
c277df42
IZ
4920}
4921
01f988be 4922STATIC void
cea2e8a9 4923S_re_croak2(pTHX_ const char* pat1,const char* pat2,...)
c277df42
IZ
4924{
4925 va_list args;
4926 STRLEN l1 = strlen(pat1);
4927 STRLEN l2 = strlen(pat2);
4928 char buf[512];
06bf62c7 4929 SV *msv;
c277df42
IZ
4930 char *message;
4931
4932 if (l1 > 510)
4933 l1 = 510;
4934 if (l1 + l2 > 510)
4935 l2 = 510 - l1;
4936 Copy(pat1, buf, l1 , char);
4937 Copy(pat2, buf + l1, l2 , char);
3b818b81
GS
4938 buf[l1 + l2] = '\n';
4939 buf[l1 + l2 + 1] = '\0';
8736538c
AS
4940#ifdef I_STDARG
4941 /* ANSI variant takes additional second argument */
c277df42 4942 va_start(args, pat2);
8736538c
AS
4943#else
4944 va_start(args);
4945#endif
5a844595 4946 msv = vmess(buf, &args);
c277df42 4947 va_end(args);
06bf62c7 4948 message = SvPV(msv,l1);
c277df42
IZ
4949 if (l1 > 512)
4950 l1 = 512;
4951 Copy(message, buf, l1 , char);
4952 buf[l1] = '\0'; /* Overwrite \n */
cea2e8a9 4953 Perl_croak(aTHX_ "%s", buf);
c277df42 4954}
a0ed51b3
LW
4955
4956/* XXX Here's a total kludge. But we need to re-enter for swash routines. */
4957
4958void
864dbfa3 4959Perl_save_re_context(pTHX)
b81d288d 4960{
830247a4
IZ
4961#if 0
4962 SAVEPPTR(RExC_precomp); /* uncompiled string. */
4963 SAVEI32(RExC_npar); /* () count. */
4964 SAVEI32(RExC_size); /* Code size. */
4965 SAVEI16(RExC_flags16); /* are we folding, multilining? */
4966 SAVEVPTR(RExC_rx); /* from regcomp.c */
4967 SAVEI32(RExC_seen); /* from regcomp.c */
4968 SAVEI32(RExC_sawback); /* Did we see \1, ...? */
4969 SAVEI32(RExC_naughty); /* How bad is this pattern? */
4970 SAVEVPTR(RExC_emit); /* Code-emit pointer; &regdummy = don't */
4971 SAVEPPTR(RExC_end); /* End of input for compile */
4972 SAVEPPTR(RExC_parse); /* Input-scan pointer. */
4973#endif
4974
4975 SAVEI32(PL_reg_flags); /* from regexec.c */
a0ed51b3 4976 SAVEPPTR(PL_bostr);
a0ed51b3
LW
4977 SAVEPPTR(PL_reginput); /* String-input pointer. */
4978 SAVEPPTR(PL_regbol); /* Beginning of input, for ^ check. */
4979 SAVEPPTR(PL_regeol); /* End of input, for $ check. */
7766f137
GS
4980 SAVEVPTR(PL_regstartp); /* Pointer to startp array. */
4981 SAVEVPTR(PL_regendp); /* Ditto for endp. */
4982 SAVEVPTR(PL_reglastparen); /* Similarly for lastparen. */
a0ed51b3 4983 SAVEPPTR(PL_regtill); /* How far we are required to go. */
b81d288d 4984 SAVEGENERICPV(PL_reg_start_tmp); /* from regexec.c */
a0ed51b3 4985 PL_reg_start_tmp = 0;
a0ed51b3
LW
4986 SAVEI32(PL_reg_start_tmpl); /* from regexec.c */
4987 PL_reg_start_tmpl = 0;
7766f137 4988 SAVEVPTR(PL_regdata);
a0ed51b3
LW
4989 SAVEI32(PL_reg_eval_set); /* from regexec.c */
4990 SAVEI32(PL_regnarrate); /* from regexec.c */
7766f137 4991 SAVEVPTR(PL_regprogram); /* from regexec.c */
a0ed51b3 4992 SAVEINT(PL_regindent); /* from regexec.c */
7766f137
GS
4993 SAVEVPTR(PL_regcc); /* from regexec.c */
4994 SAVEVPTR(PL_curcop);
7766f137
GS
4995 SAVEVPTR(PL_reg_call_cc); /* from regexec.c */
4996 SAVEVPTR(PL_reg_re); /* from regexec.c */
54b6e2fa
IZ
4997 SAVEPPTR(PL_reg_ganch); /* from regexec.c */
4998 SAVESPTR(PL_reg_sv); /* from regexec.c */
53c4c00c 4999 SAVEI8(PL_reg_match_utf8); /* from regexec.c */
7766f137 5000 SAVEVPTR(PL_reg_magic); /* from regexec.c */
54b6e2fa 5001 SAVEI32(PL_reg_oldpos); /* from regexec.c */
7766f137
GS
5002 SAVEVPTR(PL_reg_oldcurpm); /* from regexec.c */
5003 SAVEVPTR(PL_reg_curpm); /* from regexec.c */
5fb7366e 5004 SAVEI32(PL_regnpar); /* () count. */
e49a9654 5005 SAVEI32(PL_regsize); /* from regexec.c */
54b6e2fa 5006#ifdef DEBUGGING
b81d288d 5007 SAVEPPTR(PL_reg_starttry); /* from regexec.c */
54b6e2fa 5008#endif
a0ed51b3 5009}
51371543 5010
51371543 5011static void
acfe0abc 5012clear_re(pTHX_ void *r)
51371543
GS
5013{
5014 ReREFCNT_dec((regexp *)r);
5015}
ffbc6a93 5016