This is a live mirror of the Perl 5 development currently hosted at https://github.com/perl/perl5
Update Changes.
[perl5.git] / regcomp.c
CommitLineData
a0d0e21e
LW
1/* regcomp.c
2 */
3
4/*
5 * "A fair jaw-cracker dwarf-language must be." --Samwise Gamgee
6 */
7
a687059c
LW
8/* NOTE: this is derived from Henry Spencer's regexp code, and should not
9 * confused with the original package (see point 3 below). Thanks, Henry!
10 */
11
12/* Additional note: this code is very heavily munged from Henry's version
13 * in places. In some spots I've traded clarity for efficiency, so don't
14 * blame Henry for some of the lack of readability.
15 */
16
e50aee73
AD
17/* The names of the functions have been changed from regcomp and
18 * regexec to pregcomp and pregexec in order to avoid conflicts
19 * with the POSIX routines of the same names.
20*/
21
b9d5759e
AD
22#ifdef PERL_EXT_RE_BUILD
23/* need to replace pregcomp et al, so enable that */
24# ifndef PERL_IN_XSUB_RE
25# define PERL_IN_XSUB_RE
26# endif
27/* need access to debugger hooks */
cad2e5aa 28# if defined(PERL_EXT_RE_DEBUG) && !defined(DEBUGGING)
b9d5759e
AD
29# define DEBUGGING
30# endif
31#endif
32
33#ifdef PERL_IN_XSUB_RE
d06ea78c 34/* We *really* need to overwrite these symbols: */
56953603
IZ
35# define Perl_pregcomp my_regcomp
36# define Perl_regdump my_regdump
37# define Perl_regprop my_regprop
d06ea78c 38# define Perl_pregfree my_regfree
cad2e5aa
JH
39# define Perl_re_intuit_string my_re_intuit_string
40/* *These* symbols are masked to allow static link. */
d06ea78c 41# define Perl_regnext my_regnext
f0b8d043 42# define Perl_save_re_context my_save_re_context
b81d288d 43# define Perl_reginitcolors my_reginitcolors
c5be433b
GS
44
45# define PERL_NO_GET_CONTEXT
b81d288d 46#endif
56953603 47
f0fcb552 48/*SUPPRESS 112*/
a687059c 49/*
e50aee73 50 * pregcomp and pregexec -- regsub and regerror are not used in perl
a687059c
LW
51 *
52 * Copyright (c) 1986 by University of Toronto.
53 * Written by Henry Spencer. Not derived from licensed software.
54 *
55 * Permission is granted to anyone to use this software for any
56 * purpose on any computer system, and to redistribute it freely,
57 * subject to the following restrictions:
58 *
59 * 1. The author is not responsible for the consequences of use of
60 * this software, no matter how awful, even if they arise
61 * from defects in it.
62 *
63 * 2. The origin of this software must not be misrepresented, either
64 * by explicit claim or by omission.
65 *
66 * 3. Altered versions must be plainly marked as such, and must not
67 * be misrepresented as being the original software.
68 *
69 *
70 **** Alterations to Henry's code are...
71 ****
be3c0a43 72 **** Copyright (c) 1991-2002, Larry Wall
a687059c 73 ****
9ef589d8
LW
74 **** You may distribute under the terms of either the GNU General Public
75 **** License or the Artistic License, as specified in the README file.
76
a687059c
LW
77 *
78 * Beware that some of this code is subtly aware of the way operator
79 * precedence is structured in regular expressions. Serious changes in
80 * regular-expression syntax might require a total rethink.
81 */
82#include "EXTERN.h"
864dbfa3 83#define PERL_IN_REGCOMP_C
a687059c 84#include "perl.h"
d06ea78c 85
acfe0abc 86#ifndef PERL_IN_XSUB_RE
d06ea78c
GS
87# include "INTERN.h"
88#endif
c277df42
IZ
89
90#define REG_COMP_C
a687059c
LW
91#include "regcomp.h"
92
d4cce5f1 93#ifdef op
11343788 94#undef op
d4cce5f1 95#endif /* op */
11343788 96
fe14fcc3
LW
97#ifdef MSDOS
98# if defined(BUGGY_MSC6)
99 /* MSC 6.00A breaks on op/regexp.t test 85 unless we turn this off */
100 # pragma optimize("a",off)
101 /* But MSC 6.00A is happy with 'w', for aliases only across function calls*/
102 # pragma optimize("w",on )
103# endif /* BUGGY_MSC6 */
104#endif /* MSDOS */
105
a687059c
LW
106#ifndef STATIC
107#define STATIC static
108#endif
109
830247a4
IZ
110typedef struct RExC_state_t {
111 U16 flags16; /* are we folding, multilining? */
112 char *precomp; /* uncompiled string. */
113 regexp *rx;
fac92740 114 char *start; /* Start of input for compile */
830247a4
IZ
115 char *end; /* End of input for compile */
116 char *parse; /* Input-scan pointer. */
117 I32 whilem_seen; /* number of WHILEM in this expr */
fac92740 118 regnode *emit_start; /* Start of emitted-code area */
ffc61ed2 119 regnode *emit; /* Code-emit pointer; &regdummy = don't = compiling */
830247a4
IZ
120 I32 naughty; /* How bad is this pattern? */
121 I32 sawback; /* Did we see \1, ...? */
122 U32 seen;
123 I32 size; /* Code size. */
124 I32 npar; /* () count. */
125 I32 extralen;
126 I32 seen_zerolen;
127 I32 seen_evals;
1aa99e6b 128 I32 utf8;
830247a4
IZ
129#if ADD_TO_REGEXEC
130 char *starttry; /* -Dr: where regtry was called. */
131#define RExC_starttry (pRExC_state->starttry)
132#endif
133} RExC_state_t;
134
135#define RExC_flags16 (pRExC_state->flags16)
136#define RExC_precomp (pRExC_state->precomp)
137#define RExC_rx (pRExC_state->rx)
fac92740 138#define RExC_start (pRExC_state->start)
830247a4
IZ
139#define RExC_end (pRExC_state->end)
140#define RExC_parse (pRExC_state->parse)
141#define RExC_whilem_seen (pRExC_state->whilem_seen)
fac92740 142#define RExC_offsets (pRExC_state->rx->offsets) /* I am not like the others */
830247a4 143#define RExC_emit (pRExC_state->emit)
fac92740 144#define RExC_emit_start (pRExC_state->emit_start)
830247a4
IZ
145#define RExC_naughty (pRExC_state->naughty)
146#define RExC_sawback (pRExC_state->sawback)
147#define RExC_seen (pRExC_state->seen)
148#define RExC_size (pRExC_state->size)
149#define RExC_npar (pRExC_state->npar)
150#define RExC_extralen (pRExC_state->extralen)
151#define RExC_seen_zerolen (pRExC_state->seen_zerolen)
152#define RExC_seen_evals (pRExC_state->seen_evals)
1aa99e6b 153#define RExC_utf8 (pRExC_state->utf8)
830247a4 154
a687059c
LW
155#define ISMULT1(c) ((c) == '*' || (c) == '+' || (c) == '?')
156#define ISMULT2(s) ((*s) == '*' || (*s) == '+' || (*s) == '?' || \
157 ((*s) == '{' && regcurly(s)))
a687059c 158
35c8bce7
LW
159#ifdef SPSTART
160#undef SPSTART /* dratted cpp namespace... */
161#endif
a687059c
LW
162/*
163 * Flags to be passed up and down.
164 */
a687059c 165#define WORST 0 /* Worst case. */
821b33a5 166#define HASWIDTH 0x1 /* Known to match non-null strings. */
a0d0e21e
LW
167#define SIMPLE 0x2 /* Simple enough to be STAR/PLUS operand. */
168#define SPSTART 0x4 /* Starts with * or +. */
169#define TRYAGAIN 0x8 /* Weeded out a declaration. */
a687059c 170
2c2d71f5
JH
171/* Length of a variant. */
172
173typedef struct scan_data_t {
174 I32 len_min;
175 I32 len_delta;
176 I32 pos_min;
177 I32 pos_delta;
178 SV *last_found;
179 I32 last_end; /* min value, <0 unless valid. */
180 I32 last_start_min;
181 I32 last_start_max;
182 SV **longest; /* Either &l_fixed, or &l_float. */
183 SV *longest_fixed;
184 I32 offset_fixed;
185 SV *longest_float;
186 I32 offset_float_min;
187 I32 offset_float_max;
188 I32 flags;
189 I32 whilem_c;
cb434fcc 190 I32 *last_closep;
653099ff 191 struct regnode_charclass_class *start_class;
2c2d71f5
JH
192} scan_data_t;
193
a687059c 194/*
e50aee73 195 * Forward declarations for pregcomp()'s friends.
a687059c 196 */
a0d0e21e 197
b81d288d 198static scan_data_t zero_scan_data = { 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
cb434fcc 199 0, 0, 0, 0, 0, 0};
c277df42
IZ
200
201#define SF_BEFORE_EOL (SF_BEFORE_SEOL|SF_BEFORE_MEOL)
202#define SF_BEFORE_SEOL 0x1
203#define SF_BEFORE_MEOL 0x2
204#define SF_FIX_BEFORE_EOL (SF_FIX_BEFORE_SEOL|SF_FIX_BEFORE_MEOL)
205#define SF_FL_BEFORE_EOL (SF_FL_BEFORE_SEOL|SF_FL_BEFORE_MEOL)
206
09b7f37c
CB
207#ifdef NO_UNARY_PLUS
208# define SF_FIX_SHIFT_EOL (0+2)
209# define SF_FL_SHIFT_EOL (0+4)
210#else
211# define SF_FIX_SHIFT_EOL (+2)
212# define SF_FL_SHIFT_EOL (+4)
213#endif
c277df42
IZ
214
215#define SF_FIX_BEFORE_SEOL (SF_BEFORE_SEOL << SF_FIX_SHIFT_EOL)
216#define SF_FIX_BEFORE_MEOL (SF_BEFORE_MEOL << SF_FIX_SHIFT_EOL)
217
218#define SF_FL_BEFORE_SEOL (SF_BEFORE_SEOL << SF_FL_SHIFT_EOL)
219#define SF_FL_BEFORE_MEOL (SF_BEFORE_MEOL << SF_FL_SHIFT_EOL) /* 0x20 */
220#define SF_IS_INF 0x40
221#define SF_HAS_PAR 0x80
222#define SF_IN_PAR 0x100
223#define SF_HAS_EVAL 0x200
4bfe0158 224#define SCF_DO_SUBSTR 0x400
653099ff
GS
225#define SCF_DO_STCLASS_AND 0x0800
226#define SCF_DO_STCLASS_OR 0x1000
227#define SCF_DO_STCLASS (SCF_DO_STCLASS_AND|SCF_DO_STCLASS_OR)
e1901655 228#define SCF_WHILEM_VISITED_POS 0x2000
c277df42 229
1aa99e6b 230#define UTF RExC_utf8
830247a4
IZ
231#define LOC (RExC_flags16 & PMf_LOCALE)
232#define FOLD (RExC_flags16 & PMf_FOLD)
a0ed51b3 233
ffc61ed2 234#define OOB_UNICODE 12345678
93733859 235#define OOB_NAMEDCLASS -1
b8c5462f 236
a0ed51b3
LW
237#define CHR_SVLEN(sv) (UTF ? sv_len_utf8(sv) : SvCUR(sv))
238#define CHR_DIST(a,b) (UTF ? utf8_distance(a,b) : a - b)
239
8615cb43 240
b45f050a
JF
241/* length of regex to show in messages that don't mark a position within */
242#define RegexLengthToShowInErrorMessages 127
243
244/*
245 * If MARKER[12] are adjusted, be sure to adjust the constants at the top
246 * of t/op/regmesg.t, the tests in t/op/re_tests, and those in
247 * op/pragma/warn/regcomp.
248 */
7253e4e3
RK
249#define MARKER1 "<-- HERE" /* marker as it appears in the description */
250#define MARKER2 " <-- HERE " /* marker as it appears within the regex */
b81d288d 251
7253e4e3 252#define REPORT_LOCATION " in regex; marked by " MARKER1 " in m/%.*s" MARKER2 "%s/"
b45f050a
JF
253
254/*
255 * Calls SAVEDESTRUCTOR_X if needed, then calls Perl_croak with the given
256 * arg. Show regex, up to a maximum length. If it's too long, chop and add
257 * "...".
258 */
a4eb266f 259#define FAIL(msg) \
8615cb43 260 STMT_START { \
a4eb266f 261 char *ellipses = ""; \
7a799e20 262 IV len = RExC_end - RExC_precomp; \
b45f050a 263 \
8615cb43 264 if (!SIZE_ONLY) \
830247a4 265 SAVEDESTRUCTOR_X(clear_re,(void*)RExC_rx); \
b45f050a
JF
266 \
267 if (len > RegexLengthToShowInErrorMessages) { \
268 /* chop 10 shorter than the max, to ensure meaning of "..." */ \
269 len = RegexLengthToShowInErrorMessages - 10; \
a4eb266f 270 ellipses = "..."; \
b45f050a
JF
271 } \
272 Perl_croak(aTHX_ "%s in regex m/%.*s%s/", \
830247a4 273 msg, (int)len, RExC_precomp, ellipses); \
8615cb43
JF
274 } STMT_END
275
b45f050a
JF
276/*
277 * Calls SAVEDESTRUCTOR_X if needed, then calls Perl_croak with the given
278 * args. Show regex, up to a maximum length. If it's too long, chop and add
279 * "...".
280 */
a4eb266f 281#define FAIL2(pat,msg) \
8615cb43 282 STMT_START { \
a4eb266f 283 char *ellipses = ""; \
7a799e20 284 IV len = RExC_end - RExC_precomp; \
b45f050a 285 \
8615cb43 286 if (!SIZE_ONLY) \
830247a4 287 SAVEDESTRUCTOR_X(clear_re,(void*)RExC_rx); \
8615cb43 288 \
b45f050a
JF
289 if (len > RegexLengthToShowInErrorMessages) { \
290 /* chop 10 shorter than the max, to ensure meaning of "..." */ \
291 len = RegexLengthToShowInErrorMessages - 10; \
a4eb266f 292 ellipses = "..."; \
b45f050a
JF
293 } \
294 S_re_croak2(aTHX_ pat, " in regex m/%.*s%s/", \
830247a4 295 msg, (int)len, RExC_precomp, ellipses); \
b45f050a
JF
296 } STMT_END
297
298
299/*
300 * Simple_vFAIL -- like FAIL, but marks the current location in the scan
301 */
302#define Simple_vFAIL(m) \
303 STMT_START { \
7a799e20 304 IV offset = RExC_parse - RExC_precomp; \
b45f050a
JF
305 \
306 Perl_croak(aTHX_ "%s" REPORT_LOCATION, \
830247a4 307 m, (int)offset, RExC_precomp, RExC_precomp + offset); \
b45f050a
JF
308 } STMT_END
309
310/*
311 * Calls SAVEDESTRUCTOR_X if needed, then Simple_vFAIL()
312 */
313#define vFAIL(m) \
314 STMT_START { \
315 if (!SIZE_ONLY) \
830247a4 316 SAVEDESTRUCTOR_X(clear_re,(void*)RExC_rx); \
b45f050a
JF
317 Simple_vFAIL(m); \
318 } STMT_END
319
320/*
321 * Like Simple_vFAIL(), but accepts two arguments.
322 */
323#define Simple_vFAIL2(m,a1) \
324 STMT_START { \
7a799e20 325 IV offset = RExC_parse - RExC_precomp; \
b45f050a
JF
326 \
327 S_re_croak2(aTHX_ m, REPORT_LOCATION, a1, \
830247a4 328 (int)offset, RExC_precomp, RExC_precomp + offset); \
b45f050a
JF
329 } STMT_END
330
331/*
332 * Calls SAVEDESTRUCTOR_X if needed, then Simple_vFAIL2().
333 */
334#define vFAIL2(m,a1) \
335 STMT_START { \
336 if (!SIZE_ONLY) \
830247a4 337 SAVEDESTRUCTOR_X(clear_re,(void*)RExC_rx); \
b45f050a
JF
338 Simple_vFAIL2(m, a1); \
339 } STMT_END
340
341
342/*
343 * Like Simple_vFAIL(), but accepts three arguments.
344 */
345#define Simple_vFAIL3(m, a1, a2) \
346 STMT_START { \
7a799e20 347 IV offset = RExC_parse - RExC_precomp; \
b45f050a
JF
348 \
349 S_re_croak2(aTHX_ m, REPORT_LOCATION, a1, a2, \
830247a4 350 (int)offset, RExC_precomp, RExC_precomp + offset); \
b45f050a
JF
351 } STMT_END
352
353/*
354 * Calls SAVEDESTRUCTOR_X if needed, then Simple_vFAIL3().
355 */
356#define vFAIL3(m,a1,a2) \
357 STMT_START { \
358 if (!SIZE_ONLY) \
830247a4 359 SAVEDESTRUCTOR_X(clear_re,(void*)RExC_rx); \
b45f050a
JF
360 Simple_vFAIL3(m, a1, a2); \
361 } STMT_END
362
363/*
364 * Like Simple_vFAIL(), but accepts four arguments.
365 */
366#define Simple_vFAIL4(m, a1, a2, a3) \
367 STMT_START { \
7a799e20 368 IV offset = RExC_parse - RExC_precomp; \
b45f050a
JF
369 \
370 S_re_croak2(aTHX_ m, REPORT_LOCATION, a1, a2, a3,\
830247a4 371 (int)offset, RExC_precomp, RExC_precomp + offset); \
b45f050a
JF
372 } STMT_END
373
374/*
375 * Like Simple_vFAIL(), but accepts five arguments.
376 */
377#define Simple_vFAIL5(m, a1, a2, a3, a4) \
378 STMT_START { \
7a799e20 379 IV offset = RExC_parse - RExC_precomp; \
b45f050a 380 S_re_croak2(aTHX_ m, REPORT_LOCATION, a1, a2, a3, a4,\
830247a4 381 (int)offset, RExC_precomp, RExC_precomp + offset); \
b45f050a
JF
382 } STMT_END
383
384
385#define vWARN(loc,m) \
386 STMT_START { \
7a799e20 387 IV offset = loc - RExC_precomp; \
b45f050a 388 Perl_warner(aTHX_ WARN_REGEXP, "%s" REPORT_LOCATION,\
830247a4 389 m, (int)offset, RExC_precomp, RExC_precomp + offset); \
b45f050a
JF
390 } STMT_END \
391
f9373011
PM
392#define vWARNdep(loc,m) \
393 STMT_START { \
7a799e20 394 IV offset = loc - RExC_precomp; \
f9373011
PM
395 int warn_cat = ckWARN(WARN_REGEXP) ? WARN_REGEXP : WARN_DEPRECATED; \
396 Perl_warner(aTHX_ warn_cat, "%s" REPORT_LOCATION,\
397 m, (int)offset, RExC_precomp, RExC_precomp + offset); \
398 } STMT_END \
399
b45f050a
JF
400
401#define vWARN2(loc, m, a1) \
402 STMT_START { \
7a799e20 403 IV offset = loc - RExC_precomp; \
b45f050a
JF
404 Perl_warner(aTHX_ WARN_REGEXP, m REPORT_LOCATION,\
405 a1, \
830247a4 406 (int)offset, RExC_precomp, RExC_precomp + offset); \
b45f050a
JF
407 } STMT_END
408
409#define vWARN3(loc, m, a1, a2) \
410 STMT_START { \
7a799e20 411 IV offset = loc - RExC_precomp; \
b45f050a
JF
412 Perl_warner(aTHX_ WARN_REGEXP, m REPORT_LOCATION, \
413 a1, a2, \
830247a4 414 (int)offset, RExC_precomp, RExC_precomp + offset); \
b45f050a
JF
415 } STMT_END
416
417#define vWARN4(loc, m, a1, a2, a3) \
418 STMT_START { \
7a799e20 419 IV offset = loc - RExC_precomp; \
b45f050a
JF
420 Perl_warner(aTHX_ WARN_REGEXP, m REPORT_LOCATION,\
421 a1, a2, a3, \
830247a4 422 (int)offset, RExC_precomp, RExC_precomp + offset); \
8615cb43
JF
423 } STMT_END
424
9d1d55b5
JP
425/* used for the parse_flags section for (?c) -- japhy */
426#define vWARN5(loc, m, a1, a2, a3, a4) \
427 STMT_START { \
7a799e20 428 IV offset = loc - RExC_precomp; \
9d1d55b5
JP
429 Perl_warner(aTHX_ WARN_REGEXP, m REPORT_LOCATION, \
430 a1, a2, a3, a4, \
431 (int)offset, RExC_precomp, RExC_precomp + offset); \
432 } STMT_END
433
8615cb43 434
cd439c50 435/* Allow for side effects in s */
fd3f0ae2 436#define REGC(c,s) STMT_START { if (!SIZE_ONLY) *(s) = (c); else (void)(s);} STMT_END
cd439c50 437
fac92740
MJD
438/* Macros for recording node offsets. 20001227 mjd@plover.com
439 * Nodes are numbered 1, 2, 3, 4. Node #n's position is recorded in
440 * element 2*n-1 of the array. Element #2n holds the byte length node #n.
441 * Element 0 holds the number n.
442 */
443
444#define MJD_OFFSET_DEBUG(x)
445/* #define MJD_OFFSET_DEBUG(x) fprintf x */
446
447
448# define Set_Node_Offset_To_R(node,byte) \
449 STMT_START { \
450 if (! SIZE_ONLY) { \
451 if((node) < 0) { \
452 Perl_croak(aTHX_ "value of node is %d in Offset macro", node); \
453 } else { \
454 RExC_offsets[2*(node)-1] = (byte); \
455 } \
456 } \
457 } STMT_END
458
459# define Set_Node_Offset(node,byte) Set_Node_Offset_To_R((node)-RExC_emit_start, (byte)-RExC_start)
460# define Set_Cur_Node_Offset Set_Node_Offset(RExC_emit, RExC_parse)
461
462# define Set_Node_Length_To_R(node,len) \
463 STMT_START { \
464 if (! SIZE_ONLY) { \
465 MJD_OFFSET_DEBUG((stderr, "** (%d) size of node %d is %d.\n", __LINE__, (node), (len))); \
466 if((node) < 0) { \
467 Perl_croak(aTHX_ "value of node is %d in Length macro", node); \
468 } else { \
469 RExC_offsets[2*(node)] = (len); \
470 } \
471 } \
472 } STMT_END
473
474# define Set_Node_Length(node,len) Set_Node_Length_To_R((node)-RExC_emit_start, len)
475# define Set_Cur_Node_Length(len) Set_Node_Length(RExC_emit, len)
476# define Set_Node_Cur_Length(node) Set_Node_Length(node, RExC_parse - parse_start)
477
478/* Get offsets and lengths */
479#define Node_Offset(n) (RExC_offsets[2*((n)-RExC_emit_start)-1])
480#define Node_Length(n) (RExC_offsets[2*((n)-RExC_emit_start)])
481
acfe0abc 482static void clear_re(pTHX_ void *r);
4327152a 483
653099ff
GS
484/* Mark that we cannot extend a found fixed substring at this point.
485 Updata the longest found anchored substring and the longest found
486 floating substrings if needed. */
487
4327152a 488STATIC void
830247a4 489S_scan_commit(pTHX_ RExC_state_t *pRExC_state, scan_data_t *data)
c277df42 490{
a0ed51b3
LW
491 STRLEN l = CHR_SVLEN(data->last_found);
492 STRLEN old_l = CHR_SVLEN(*data->longest);
b81d288d 493
c277df42
IZ
494 if ((l >= old_l) && ((l > old_l) || (data->flags & SF_BEFORE_EOL))) {
495 sv_setsv(*data->longest, data->last_found);
496 if (*data->longest == data->longest_fixed) {
497 data->offset_fixed = l ? data->last_start_min : data->pos_min;
498 if (data->flags & SF_BEFORE_EOL)
b81d288d 499 data->flags
c277df42
IZ
500 |= ((data->flags & SF_BEFORE_EOL) << SF_FIX_SHIFT_EOL);
501 else
502 data->flags &= ~SF_FIX_BEFORE_EOL;
a0ed51b3
LW
503 }
504 else {
c277df42 505 data->offset_float_min = l ? data->last_start_min : data->pos_min;
b81d288d
AB
506 data->offset_float_max = (l
507 ? data->last_start_max
c277df42
IZ
508 : data->pos_min + data->pos_delta);
509 if (data->flags & SF_BEFORE_EOL)
b81d288d 510 data->flags
c277df42
IZ
511 |= ((data->flags & SF_BEFORE_EOL) << SF_FL_SHIFT_EOL);
512 else
513 data->flags &= ~SF_FL_BEFORE_EOL;
514 }
515 }
516 SvCUR_set(data->last_found, 0);
517 data->last_end = -1;
518 data->flags &= ~SF_BEFORE_EOL;
519}
520
653099ff
GS
521/* Can match anything (initialization) */
522STATIC void
830247a4 523S_cl_anything(pTHX_ RExC_state_t *pRExC_state, struct regnode_charclass_class *cl)
653099ff 524{
653099ff 525 ANYOF_CLASS_ZERO(cl);
f8bef550 526 ANYOF_BITMAP_SETALL(cl);
1aa99e6b 527 cl->flags = ANYOF_EOS|ANYOF_UNICODE_ALL;
653099ff
GS
528 if (LOC)
529 cl->flags |= ANYOF_LOCALE;
530}
531
532/* Can match anything (initialization) */
533STATIC int
534S_cl_is_anything(pTHX_ struct regnode_charclass_class *cl)
535{
536 int value;
537
aaa51d5e 538 for (value = 0; value <= ANYOF_MAX; value += 2)
653099ff
GS
539 if (ANYOF_CLASS_TEST(cl, value) && ANYOF_CLASS_TEST(cl, value + 1))
540 return 1;
1aa99e6b
IH
541 if (!(cl->flags & ANYOF_UNICODE_ALL))
542 return 0;
f8bef550
NC
543 if (!ANYOF_BITMAP_TESTALLSET(cl))
544 return 0;
653099ff
GS
545 return 1;
546}
547
548/* Can match anything (initialization) */
549STATIC void
830247a4 550S_cl_init(pTHX_ RExC_state_t *pRExC_state, struct regnode_charclass_class *cl)
653099ff 551{
8ecf7187 552 Zero(cl, 1, struct regnode_charclass_class);
653099ff 553 cl->type = ANYOF;
830247a4 554 cl_anything(pRExC_state, cl);
653099ff
GS
555}
556
557STATIC void
830247a4 558S_cl_init_zero(pTHX_ RExC_state_t *pRExC_state, struct regnode_charclass_class *cl)
653099ff 559{
8ecf7187 560 Zero(cl, 1, struct regnode_charclass_class);
653099ff 561 cl->type = ANYOF;
830247a4 562 cl_anything(pRExC_state, cl);
653099ff
GS
563 if (LOC)
564 cl->flags |= ANYOF_LOCALE;
565}
566
567/* 'And' a given class with another one. Can create false positives */
568/* We assume that cl is not inverted */
569STATIC void
570S_cl_and(pTHX_ struct regnode_charclass_class *cl,
571 struct regnode_charclass_class *and_with)
572{
653099ff
GS
573 if (!(and_with->flags & ANYOF_CLASS)
574 && !(cl->flags & ANYOF_CLASS)
575 && (and_with->flags & ANYOF_LOCALE) == (cl->flags & ANYOF_LOCALE)
576 && !(and_with->flags & ANYOF_FOLD)
577 && !(cl->flags & ANYOF_FOLD)) {
578 int i;
579
580 if (and_with->flags & ANYOF_INVERT)
581 for (i = 0; i < ANYOF_BITMAP_SIZE; i++)
582 cl->bitmap[i] &= ~and_with->bitmap[i];
583 else
584 for (i = 0; i < ANYOF_BITMAP_SIZE; i++)
585 cl->bitmap[i] &= and_with->bitmap[i];
586 } /* XXXX: logic is complicated otherwise, leave it along for a moment. */
587 if (!(and_with->flags & ANYOF_EOS))
588 cl->flags &= ~ANYOF_EOS;
1aa99e6b
IH
589
590 if (cl->flags & ANYOF_UNICODE_ALL && and_with->flags & ANYOF_UNICODE) {
591 cl->flags &= ~ANYOF_UNICODE_ALL;
592 cl->flags |= ANYOF_UNICODE;
593 ARG_SET(cl, ARG(and_with));
594 }
595 if (!(and_with->flags & ANYOF_UNICODE_ALL))
596 cl->flags &= ~ANYOF_UNICODE_ALL;
597 if (!(and_with->flags & (ANYOF_UNICODE|ANYOF_UNICODE_ALL)))
598 cl->flags &= ~ANYOF_UNICODE;
653099ff
GS
599}
600
601/* 'OR' a given class with another one. Can create false positives */
602/* We assume that cl is not inverted */
603STATIC void
830247a4 604S_cl_or(pTHX_ RExC_state_t *pRExC_state, struct regnode_charclass_class *cl, struct regnode_charclass_class *or_with)
653099ff 605{
653099ff
GS
606 if (or_with->flags & ANYOF_INVERT) {
607 /* We do not use
608 * (B1 | CL1) | (!B2 & !CL2) = (B1 | !B2 & !CL2) | (CL1 | (!B2 & !CL2))
609 * <= (B1 | !B2) | (CL1 | !CL2)
610 * which is wasteful if CL2 is small, but we ignore CL2:
611 * (B1 | CL1) | (!B2 & !CL2) <= (B1 | CL1) | !B2 = (B1 | !B2) | CL1
612 * XXXX Can we handle case-fold? Unclear:
613 * (OK1(i) | OK1(i')) | !(OK1(i) | OK1(i')) =
614 * (OK1(i) | OK1(i')) | (!OK1(i) & !OK1(i'))
615 */
616 if ( (or_with->flags & ANYOF_LOCALE) == (cl->flags & ANYOF_LOCALE)
617 && !(or_with->flags & ANYOF_FOLD)
618 && !(cl->flags & ANYOF_FOLD) ) {
619 int i;
620
621 for (i = 0; i < ANYOF_BITMAP_SIZE; i++)
622 cl->bitmap[i] |= ~or_with->bitmap[i];
623 } /* XXXX: logic is complicated otherwise */
624 else {
830247a4 625 cl_anything(pRExC_state, cl);
653099ff
GS
626 }
627 } else {
628 /* (B1 | CL1) | (B2 | CL2) = (B1 | B2) | (CL1 | CL2)) */
629 if ( (or_with->flags & ANYOF_LOCALE) == (cl->flags & ANYOF_LOCALE)
b81d288d 630 && (!(or_with->flags & ANYOF_FOLD)
653099ff
GS
631 || (cl->flags & ANYOF_FOLD)) ) {
632 int i;
633
634 /* OR char bitmap and class bitmap separately */
635 for (i = 0; i < ANYOF_BITMAP_SIZE; i++)
636 cl->bitmap[i] |= or_with->bitmap[i];
637 if (or_with->flags & ANYOF_CLASS) {
638 for (i = 0; i < ANYOF_CLASSBITMAP_SIZE; i++)
639 cl->classflags[i] |= or_with->classflags[i];
640 cl->flags |= ANYOF_CLASS;
641 }
642 }
643 else { /* XXXX: logic is complicated, leave it along for a moment. */
830247a4 644 cl_anything(pRExC_state, cl);
653099ff
GS
645 }
646 }
647 if (or_with->flags & ANYOF_EOS)
648 cl->flags |= ANYOF_EOS;
1aa99e6b
IH
649
650 if (cl->flags & ANYOF_UNICODE && or_with->flags & ANYOF_UNICODE &&
651 ARG(cl) != ARG(or_with)) {
652 cl->flags |= ANYOF_UNICODE_ALL;
653 cl->flags &= ~ANYOF_UNICODE;
654 }
655 if (or_with->flags & ANYOF_UNICODE_ALL) {
656 cl->flags |= ANYOF_UNICODE_ALL;
657 cl->flags &= ~ANYOF_UNICODE;
658 }
653099ff
GS
659}
660
5d1c421c
JH
661/*
662 * There are strange code-generation bugs caused on sparc64 by gcc-2.95.2.
663 * These need to be revisited when a newer toolchain becomes available.
664 */
665#if defined(__sparc64__) && defined(__GNUC__)
666# if __GNUC__ < 2 || (__GNUC__ == 2 && __GNUC_MINOR__ < 96)
667# undef SPARC64_GCC_WORKAROUND
668# define SPARC64_GCC_WORKAROUND 1
669# endif
670#endif
671
653099ff
GS
672/* REx optimizer. Converts nodes into quickier variants "in place".
673 Finds fixed substrings. */
674
c277df42
IZ
675/* Stops at toplevel WHILEM as well as at `last'. At end *scanp is set
676 to the position after last scanned or to NULL. */
677
76e3520e 678STATIC I32
830247a4 679S_study_chunk(pTHX_ RExC_state_t *pRExC_state, regnode **scanp, I32 *deltap, regnode *last, scan_data_t *data, U32 flags)
c277df42
IZ
680 /* scanp: Start here (read-write). */
681 /* deltap: Write maxlen-minlen here. */
682 /* last: Stop before this one. */
683{
684 I32 min = 0, pars = 0, code;
685 regnode *scan = *scanp, *next;
686 I32 delta = 0;
687 int is_inf = (flags & SCF_DO_SUBSTR) && (data->flags & SF_IS_INF);
aca2d497 688 int is_inf_internal = 0; /* The studied chunk is infinite */
c277df42
IZ
689 I32 is_par = OP(scan) == OPEN ? ARG(scan) : 0;
690 scan_data_t data_fake;
653099ff 691 struct regnode_charclass_class and_with; /* Valid if flags & SCF_DO_STCLASS_OR */
b81d288d 692
c277df42
IZ
693 while (scan && OP(scan) != END && scan < last) {
694 /* Peephole optimizer: */
695
22c35a8c 696 if (PL_regkind[(U8)OP(scan)] == EXACT) {
653099ff 697 /* Merge several consecutive EXACTish nodes into one. */
c277df42
IZ
698 regnode *n = regnext(scan);
699 U32 stringok = 1;
700#ifdef DEBUGGING
701 regnode *stop = scan;
b81d288d 702#endif
c277df42 703
cd439c50 704 next = scan + NODE_SZ_STR(scan);
c277df42
IZ
705 /* Skip NOTHING, merge EXACT*. */
706 while (n &&
b81d288d 707 ( PL_regkind[(U8)OP(n)] == NOTHING ||
c277df42
IZ
708 (stringok && (OP(n) == OP(scan))))
709 && NEXT_OFF(n)
710 && NEXT_OFF(scan) + NEXT_OFF(n) < I16_MAX) {
711 if (OP(n) == TAIL || n > next)
712 stringok = 0;
22c35a8c 713 if (PL_regkind[(U8)OP(n)] == NOTHING) {
c277df42
IZ
714 NEXT_OFF(scan) += NEXT_OFF(n);
715 next = n + NODE_STEP_REGNODE;
716#ifdef DEBUGGING
717 if (stringok)
718 stop = n;
b81d288d 719#endif
c277df42 720 n = regnext(n);
a0ed51b3 721 }
f49d4d0f 722 else if (stringok) {
cd439c50 723 int oldl = STR_LEN(scan);
c277df42 724 regnode *nnext = regnext(n);
f49d4d0f 725
b81d288d 726 if (oldl + STR_LEN(n) > U8_MAX)
c277df42
IZ
727 break;
728 NEXT_OFF(scan) += NEXT_OFF(n);
cd439c50
IZ
729 STR_LEN(scan) += STR_LEN(n);
730 next = n + NODE_SZ_STR(n);
c277df42 731 /* Now we can overwrite *n : */
f49d4d0f 732 Move(STRING(n), STRING(scan) + oldl, STR_LEN(n), char);
c277df42 733#ifdef DEBUGGING
f49d4d0f 734 stop = next - 1;
b81d288d 735#endif
c277df42
IZ
736 n = nnext;
737 }
738 }
61a36c01 739
d65e4eab 740 if (UTF && OP(scan) == EXACTF && STR_LEN(scan) >= 6) {
61a36c01
JH
741/*
742 Two problematic code points in Unicode casefolding of EXACT nodes:
743
744 U+0390 - GREEK SMALL LETTER IOTA WITH DIALYTIKA AND TONOS
745 U+03B0 - GREEK SMALL LETTER UPSILON WITH DIALYTIKA AND TONOS
746
747 which casefold to
748
749 Unicode UTF-8
750
751 U+03B9 U+0308 U+0301 0xCE 0xB9 0xCC 0x88 0xCC 0x81
752 U+03C5 U+0308 U+0301 0xCF 0x85 0xCC 0x88 0xCC 0x81
753
754 This means that in case-insensitive matching (or "loose matching",
755 as Unicode calls it), an EXACTF of length six (the UTF-8 encoded byte
756 length of the above casefolded versions) can match a target string
757 of length two (the byte length of UTF-8 encoded U+0390 or U+03B0).
758 This would rather mess up the minimum length computation.
759
760 What we'll do is to look for the tail four bytes, and then peek
761 at the preceding two bytes to see whether we need to decrease
762 the minimum length by four (six minus two).
763
764 Thanks to the design of UTF-8, there cannot be false matches:
765 A sequence of valid UTF-8 bytes cannot be a subsequence of
766 another valid sequence of UTF-8 bytes.
767
768*/
769 char *s0 = STRING(scan), *s, *t;
770 char *s1 = s0 + STR_LEN(scan) - 1, *s2 = s1 - 4;
771 char *t0 = "\xcc\x88\xcc\x81";
772 char *t1 = t0 + 3;
773
774 for (s = s0 + 2;
775 s < s2 && (t = ninstr(s, s1, t0, t1));
776 s = t + 4) {
777 if (((U8)t[-1] == 0xB9 && (U8)t[-2] == 0xCE) ||
778 ((U8)t[-1] == 0x85 && (U8)t[-2] == 0xCF))
779 min -= 4;
780 }
781 }
782
c277df42
IZ
783#ifdef DEBUGGING
784 /* Allow dumping */
cd439c50 785 n = scan + NODE_SZ_STR(scan);
c277df42 786 while (n <= stop) {
22c35a8c 787 if (PL_regkind[(U8)OP(n)] != NOTHING || OP(n) == NOTHING) {
c277df42
IZ
788 OP(n) = OPTIMIZED;
789 NEXT_OFF(n) = 0;
790 }
791 n++;
792 }
653099ff 793#endif
c277df42 794 }
653099ff
GS
795 /* Follow the next-chain of the current node and optimize
796 away all the NOTHINGs from it. */
c277df42 797 if (OP(scan) != CURLYX) {
048cfca1
GS
798 int max = (reg_off_by_arg[OP(scan)]
799 ? I32_MAX
800 /* I32 may be smaller than U16 on CRAYs! */
801 : (I32_MAX < U16_MAX ? I32_MAX : U16_MAX));
c277df42
IZ
802 int off = (reg_off_by_arg[OP(scan)] ? ARG(scan) : NEXT_OFF(scan));
803 int noff;
804 regnode *n = scan;
b81d288d 805
c277df42
IZ
806 /* Skip NOTHING and LONGJMP. */
807 while ((n = regnext(n))
22c35a8c 808 && ((PL_regkind[(U8)OP(n)] == NOTHING && (noff = NEXT_OFF(n)))
c277df42
IZ
809 || ((OP(n) == LONGJMP) && (noff = ARG(n))))
810 && off + noff < max)
811 off += noff;
812 if (reg_off_by_arg[OP(scan)])
813 ARG(scan) = off;
b81d288d 814 else
c277df42
IZ
815 NEXT_OFF(scan) = off;
816 }
653099ff
GS
817 /* The principal pseudo-switch. Cannot be a switch, since we
818 look into several different things. */
b81d288d 819 if (OP(scan) == BRANCH || OP(scan) == BRANCHJ
c277df42
IZ
820 || OP(scan) == IFTHEN || OP(scan) == SUSPEND) {
821 next = regnext(scan);
822 code = OP(scan);
b81d288d
AB
823
824 if (OP(next) == code || code == IFTHEN || code == SUSPEND) {
c277df42 825 I32 max1 = 0, min1 = I32_MAX, num = 0;
653099ff 826 struct regnode_charclass_class accum;
c277df42 827
653099ff 828 if (flags & SCF_DO_SUBSTR) /* XXXX Add !SUSPEND? */
830247a4 829 scan_commit(pRExC_state, data); /* Cannot merge strings after this. */
653099ff 830 if (flags & SCF_DO_STCLASS)
830247a4 831 cl_init_zero(pRExC_state, &accum);
c277df42 832 while (OP(scan) == code) {
830247a4 833 I32 deltanext, minnext, f = 0, fake;
653099ff 834 struct regnode_charclass_class this_class;
c277df42
IZ
835
836 num++;
837 data_fake.flags = 0;
b81d288d 838 if (data) {
2c2d71f5 839 data_fake.whilem_c = data->whilem_c;
cb434fcc
IZ
840 data_fake.last_closep = data->last_closep;
841 }
842 else
843 data_fake.last_closep = &fake;
c277df42
IZ
844 next = regnext(scan);
845 scan = NEXTOPER(scan);
846 if (code != BRANCH)
847 scan = NEXTOPER(scan);
653099ff 848 if (flags & SCF_DO_STCLASS) {
830247a4 849 cl_init(pRExC_state, &this_class);
653099ff
GS
850 data_fake.start_class = &this_class;
851 f = SCF_DO_STCLASS_AND;
b81d288d 852 }
e1901655
IZ
853 if (flags & SCF_WHILEM_VISITED_POS)
854 f |= SCF_WHILEM_VISITED_POS;
653099ff 855 /* we suppose the run is continuous, last=next...*/
830247a4
IZ
856 minnext = study_chunk(pRExC_state, &scan, &deltanext,
857 next, &data_fake, f);
b81d288d 858 if (min1 > minnext)
c277df42
IZ
859 min1 = minnext;
860 if (max1 < minnext + deltanext)
861 max1 = minnext + deltanext;
862 if (deltanext == I32_MAX)
aca2d497 863 is_inf = is_inf_internal = 1;
c277df42
IZ
864 scan = next;
865 if (data_fake.flags & (SF_HAS_PAR|SF_IN_PAR))
866 pars++;
405ff068 867 if (data && (data_fake.flags & SF_HAS_EVAL))
c277df42 868 data->flags |= SF_HAS_EVAL;
2c2d71f5
JH
869 if (data)
870 data->whilem_c = data_fake.whilem_c;
653099ff 871 if (flags & SCF_DO_STCLASS)
830247a4 872 cl_or(pRExC_state, &accum, &this_class);
b81d288d 873 if (code == SUSPEND)
c277df42
IZ
874 break;
875 }
876 if (code == IFTHEN && num < 2) /* Empty ELSE branch */
877 min1 = 0;
878 if (flags & SCF_DO_SUBSTR) {
879 data->pos_min += min1;
880 data->pos_delta += max1 - min1;
881 if (max1 != min1 || is_inf)
882 data->longest = &(data->longest_float);
883 }
884 min += min1;
885 delta += max1 - min1;
653099ff 886 if (flags & SCF_DO_STCLASS_OR) {
830247a4 887 cl_or(pRExC_state, data->start_class, &accum);
653099ff
GS
888 if (min1) {
889 cl_and(data->start_class, &and_with);
890 flags &= ~SCF_DO_STCLASS;
891 }
892 }
893 else if (flags & SCF_DO_STCLASS_AND) {
de0c8cb8
GS
894 if (min1) {
895 cl_and(data->start_class, &accum);
653099ff 896 flags &= ~SCF_DO_STCLASS;
de0c8cb8
GS
897 }
898 else {
b81d288d 899 /* Switch to OR mode: cache the old value of
de0c8cb8
GS
900 * data->start_class */
901 StructCopy(data->start_class, &and_with,
902 struct regnode_charclass_class);
903 flags &= ~SCF_DO_STCLASS_AND;
904 StructCopy(&accum, data->start_class,
905 struct regnode_charclass_class);
906 flags |= SCF_DO_STCLASS_OR;
907 data->start_class->flags |= ANYOF_EOS;
908 }
653099ff 909 }
a0ed51b3
LW
910 }
911 else if (code == BRANCHJ) /* single branch is optimized. */
c277df42
IZ
912 scan = NEXTOPER(NEXTOPER(scan));
913 else /* single branch is optimized. */
914 scan = NEXTOPER(scan);
915 continue;
a0ed51b3
LW
916 }
917 else if (OP(scan) == EXACT) {
cd439c50 918 I32 l = STR_LEN(scan);
1aa99e6b 919 UV uc = *((U8*)STRING(scan));
a0ed51b3 920 if (UTF) {
1aa99e6b
IH
921 U8 *s = (U8*)STRING(scan);
922 l = utf8_length(s, s + l);
9041c2e3 923 uc = utf8_to_uvchr(s, NULL);
a0ed51b3
LW
924 }
925 min += l;
c277df42 926 if (flags & SCF_DO_SUBSTR) { /* Update longest substr. */
c277df42
IZ
927 /* The code below prefers earlier match for fixed
928 offset, later match for variable offset. */
929 if (data->last_end == -1) { /* Update the start info. */
930 data->last_start_min = data->pos_min;
931 data->last_start_max = is_inf
b81d288d 932 ? I32_MAX : data->pos_min + data->pos_delta;
c277df42 933 }
cd439c50 934 sv_catpvn(data->last_found, STRING(scan), STR_LEN(scan));
c277df42
IZ
935 data->last_end = data->pos_min + l;
936 data->pos_min += l; /* As in the first entry. */
937 data->flags &= ~SF_BEFORE_EOL;
938 }
653099ff
GS
939 if (flags & SCF_DO_STCLASS_AND) {
940 /* Check whether it is compatible with what we know already! */
941 int compat = 1;
942
1aa99e6b 943 if (uc >= 0x100 ||
516a5887 944 (!(data->start_class->flags & (ANYOF_CLASS | ANYOF_LOCALE))
1aa99e6b 945 && !ANYOF_BITMAP_TEST(data->start_class, uc)
653099ff 946 && (!(data->start_class->flags & ANYOF_FOLD)
1aa99e6b 947 || !ANYOF_BITMAP_TEST(data->start_class, PL_fold[uc])))
516a5887 948 )
653099ff
GS
949 compat = 0;
950 ANYOF_CLASS_ZERO(data->start_class);
951 ANYOF_BITMAP_ZERO(data->start_class);
952 if (compat)
1aa99e6b 953 ANYOF_BITMAP_SET(data->start_class, uc);
653099ff 954 data->start_class->flags &= ~ANYOF_EOS;
9b877dbb
IH
955 if (uc < 0x100)
956 data->start_class->flags &= ~ANYOF_UNICODE_ALL;
653099ff
GS
957 }
958 else if (flags & SCF_DO_STCLASS_OR) {
959 /* false positive possible if the class is case-folded */
1aa99e6b 960 if (uc < 0x100)
9b877dbb
IH
961 ANYOF_BITMAP_SET(data->start_class, uc);
962 else
963 data->start_class->flags |= ANYOF_UNICODE_ALL;
653099ff
GS
964 data->start_class->flags &= ~ANYOF_EOS;
965 cl_and(data->start_class, &and_with);
966 }
967 flags &= ~SCF_DO_STCLASS;
a0ed51b3 968 }
653099ff 969 else if (PL_regkind[(U8)OP(scan)] == EXACT) { /* But OP != EXACT! */
cd439c50 970 I32 l = STR_LEN(scan);
1aa99e6b 971 UV uc = *((U8*)STRING(scan));
653099ff
GS
972
973 /* Search for fixed substrings supports EXACT only. */
b81d288d 974 if (flags & SCF_DO_SUBSTR)
830247a4 975 scan_commit(pRExC_state, data);
a0ed51b3 976 if (UTF) {
1aa99e6b
IH
977 U8 *s = (U8 *)STRING(scan);
978 l = utf8_length(s, s + l);
9041c2e3 979 uc = utf8_to_uvchr(s, NULL);
a0ed51b3
LW
980 }
981 min += l;
c277df42 982 if (data && (flags & SCF_DO_SUBSTR))
a0ed51b3 983 data->pos_min += l;
653099ff
GS
984 if (flags & SCF_DO_STCLASS_AND) {
985 /* Check whether it is compatible with what we know already! */
986 int compat = 1;
987
1aa99e6b 988 if (uc >= 0x100 ||
516a5887 989 (!(data->start_class->flags & (ANYOF_CLASS | ANYOF_LOCALE))
1aa99e6b 990 && !ANYOF_BITMAP_TEST(data->start_class, uc)
516a5887 991 && !ANYOF_BITMAP_TEST(data->start_class, PL_fold[uc])))
653099ff
GS
992 compat = 0;
993 ANYOF_CLASS_ZERO(data->start_class);
994 ANYOF_BITMAP_ZERO(data->start_class);
995 if (compat) {
1aa99e6b 996 ANYOF_BITMAP_SET(data->start_class, uc);
653099ff
GS
997 data->start_class->flags &= ~ANYOF_EOS;
998 data->start_class->flags |= ANYOF_FOLD;
999 if (OP(scan) == EXACTFL)
1000 data->start_class->flags |= ANYOF_LOCALE;
1001 }
1002 }
1003 else if (flags & SCF_DO_STCLASS_OR) {
1004 if (data->start_class->flags & ANYOF_FOLD) {
1005 /* false positive possible if the class is case-folded.
1006 Assume that the locale settings are the same... */
1aa99e6b
IH
1007 if (uc < 0x100)
1008 ANYOF_BITMAP_SET(data->start_class, uc);
653099ff
GS
1009 data->start_class->flags &= ~ANYOF_EOS;
1010 }
1011 cl_and(data->start_class, &and_with);
1012 }
1013 flags &= ~SCF_DO_STCLASS;
a0ed51b3 1014 }
4d61ec05 1015 else if (strchr((char*)PL_varies,OP(scan))) {
9c5ffd7c 1016 I32 mincount, maxcount, minnext, deltanext, fl = 0;
aa7a4b56 1017 I32 f = flags, pos_before = 0;
c277df42 1018 regnode *oscan = scan;
653099ff
GS
1019 struct regnode_charclass_class this_class;
1020 struct regnode_charclass_class *oclass = NULL;
727f22e3 1021 I32 next_is_eval = 0;
653099ff 1022
22c35a8c 1023 switch (PL_regkind[(U8)OP(scan)]) {
653099ff 1024 case WHILEM: /* End of (?:...)* . */
c277df42
IZ
1025 scan = NEXTOPER(scan);
1026 goto finish;
1027 case PLUS:
653099ff 1028 if (flags & (SCF_DO_SUBSTR | SCF_DO_STCLASS)) {
c277df42 1029 next = NEXTOPER(scan);
653099ff 1030 if (OP(next) == EXACT || (flags & SCF_DO_STCLASS)) {
b81d288d
AB
1031 mincount = 1;
1032 maxcount = REG_INFTY;
c277df42
IZ
1033 next = regnext(scan);
1034 scan = NEXTOPER(scan);
1035 goto do_curly;
1036 }
1037 }
1038 if (flags & SCF_DO_SUBSTR)
1039 data->pos_min++;
1040 min++;
1041 /* Fall through. */
1042 case STAR:
653099ff
GS
1043 if (flags & SCF_DO_STCLASS) {
1044 mincount = 0;
b81d288d 1045 maxcount = REG_INFTY;
653099ff
GS
1046 next = regnext(scan);
1047 scan = NEXTOPER(scan);
1048 goto do_curly;
1049 }
b81d288d 1050 is_inf = is_inf_internal = 1;
c277df42
IZ
1051 scan = regnext(scan);
1052 if (flags & SCF_DO_SUBSTR) {
830247a4 1053 scan_commit(pRExC_state, data); /* Cannot extend fixed substrings */
c277df42
IZ
1054 data->longest = &(data->longest_float);
1055 }
1056 goto optimize_curly_tail;
1057 case CURLY:
b81d288d 1058 mincount = ARG1(scan);
c277df42
IZ
1059 maxcount = ARG2(scan);
1060 next = regnext(scan);
cb434fcc
IZ
1061 if (OP(scan) == CURLYX) {
1062 I32 lp = (data ? *(data->last_closep) : 0);
1063
1064 scan->flags = ((lp <= U8_MAX) ? lp : U8_MAX);
1065 }
c277df42 1066 scan = NEXTOPER(scan) + EXTRA_STEP_2ARGS;
727f22e3 1067 next_is_eval = (OP(scan) == EVAL);
c277df42
IZ
1068 do_curly:
1069 if (flags & SCF_DO_SUBSTR) {
830247a4 1070 if (mincount == 0) scan_commit(pRExC_state,data); /* Cannot extend fixed substrings */
c277df42
IZ
1071 pos_before = data->pos_min;
1072 }
1073 if (data) {
1074 fl = data->flags;
1075 data->flags &= ~(SF_HAS_PAR|SF_IN_PAR|SF_HAS_EVAL);
1076 if (is_inf)
1077 data->flags |= SF_IS_INF;
1078 }
653099ff 1079 if (flags & SCF_DO_STCLASS) {
830247a4 1080 cl_init(pRExC_state, &this_class);
653099ff
GS
1081 oclass = data->start_class;
1082 data->start_class = &this_class;
1083 f |= SCF_DO_STCLASS_AND;
1084 f &= ~SCF_DO_STCLASS_OR;
1085 }
e1901655
IZ
1086 /* These are the cases when once a subexpression
1087 fails at a particular position, it cannot succeed
1088 even after backtracking at the enclosing scope.
b81d288d 1089
e1901655
IZ
1090 XXXX what if minimal match and we are at the
1091 initial run of {n,m}? */
1092 if ((mincount != maxcount - 1) && (maxcount != REG_INFTY))
1093 f &= ~SCF_WHILEM_VISITED_POS;
653099ff 1094
c277df42 1095 /* This will finish on WHILEM, setting scan, or on NULL: */
b81d288d
AB
1096 minnext = study_chunk(pRExC_state, &scan, &deltanext, last, data,
1097 mincount == 0
653099ff
GS
1098 ? (f & ~SCF_DO_SUBSTR) : f);
1099
1100 if (flags & SCF_DO_STCLASS)
1101 data->start_class = oclass;
1102 if (mincount == 0 || minnext == 0) {
1103 if (flags & SCF_DO_STCLASS_OR) {
830247a4 1104 cl_or(pRExC_state, data->start_class, &this_class);
653099ff
GS
1105 }
1106 else if (flags & SCF_DO_STCLASS_AND) {
b81d288d 1107 /* Switch to OR mode: cache the old value of
653099ff
GS
1108 * data->start_class */
1109 StructCopy(data->start_class, &and_with,
1110 struct regnode_charclass_class);
1111 flags &= ~SCF_DO_STCLASS_AND;
1112 StructCopy(&this_class, data->start_class,
1113 struct regnode_charclass_class);
1114 flags |= SCF_DO_STCLASS_OR;
1115 data->start_class->flags |= ANYOF_EOS;
1116 }
1117 } else { /* Non-zero len */
1118 if (flags & SCF_DO_STCLASS_OR) {
830247a4 1119 cl_or(pRExC_state, data->start_class, &this_class);
653099ff
GS
1120 cl_and(data->start_class, &and_with);
1121 }
1122 else if (flags & SCF_DO_STCLASS_AND)
1123 cl_and(data->start_class, &this_class);
1124 flags &= ~SCF_DO_STCLASS;
1125 }
c277df42
IZ
1126 if (!scan) /* It was not CURLYX, but CURLY. */
1127 scan = next;
84037bb0 1128 if (ckWARN(WARN_REGEXP)
727f22e3
JP
1129 /* ? quantifier ok, except for (?{ ... }) */
1130 && (next_is_eval || !(mincount == 0 && maxcount == 1))
84037bb0 1131 && (minnext == 0) && (deltanext == 0)
99799961 1132 && data && !(data->flags & (SF_HAS_PAR|SF_IN_PAR))
17feb5d5 1133 && maxcount <= REG_INFTY/3) /* Complement check for big count */
b45f050a 1134 {
830247a4 1135 vWARN(RExC_parse,
b45f050a
JF
1136 "Quantifier unexpected on zero-length expression");
1137 }
1138
c277df42 1139 min += minnext * mincount;
b81d288d 1140 is_inf_internal |= ((maxcount == REG_INFTY
155aba94
GS
1141 && (minnext + deltanext) > 0)
1142 || deltanext == I32_MAX);
aca2d497 1143 is_inf |= is_inf_internal;
c277df42
IZ
1144 delta += (minnext + deltanext) * maxcount - minnext * mincount;
1145
1146 /* Try powerful optimization CURLYX => CURLYN. */
b81d288d 1147 if ( OP(oscan) == CURLYX && data
c277df42
IZ
1148 && data->flags & SF_IN_PAR
1149 && !(data->flags & SF_HAS_EVAL)
1150 && !deltanext && minnext == 1 ) {
1151 /* Try to optimize to CURLYN. */
1152 regnode *nxt = NEXTOPER(oscan) + EXTRA_STEP_2ARGS;
497b47a8
JH
1153 regnode *nxt1 = nxt;
1154#ifdef DEBUGGING
1155 regnode *nxt2;
1156#endif
c277df42
IZ
1157
1158 /* Skip open. */
1159 nxt = regnext(nxt);
4d61ec05 1160 if (!strchr((char*)PL_simple,OP(nxt))
22c35a8c 1161 && !(PL_regkind[(U8)OP(nxt)] == EXACT
b81d288d 1162 && STR_LEN(nxt) == 1))
c277df42 1163 goto nogo;
497b47a8 1164#ifdef DEBUGGING
c277df42 1165 nxt2 = nxt;
497b47a8 1166#endif
c277df42 1167 nxt = regnext(nxt);
b81d288d 1168 if (OP(nxt) != CLOSE)
c277df42
IZ
1169 goto nogo;
1170 /* Now we know that nxt2 is the only contents: */
1171 oscan->flags = ARG(nxt);
1172 OP(oscan) = CURLYN;
1173 OP(nxt1) = NOTHING; /* was OPEN. */
1174#ifdef DEBUGGING
1175 OP(nxt1 + 1) = OPTIMIZED; /* was count. */
1176 NEXT_OFF(nxt1+ 1) = 0; /* just for consistancy. */
1177 NEXT_OFF(nxt2) = 0; /* just for consistancy with CURLY. */
1178 OP(nxt) = OPTIMIZED; /* was CLOSE. */
1179 OP(nxt + 1) = OPTIMIZED; /* was count. */
1180 NEXT_OFF(nxt+ 1) = 0; /* just for consistancy. */
b81d288d 1181#endif
c277df42 1182 }
c277df42
IZ
1183 nogo:
1184
1185 /* Try optimization CURLYX => CURLYM. */
b81d288d 1186 if ( OP(oscan) == CURLYX && data
c277df42 1187 && !(data->flags & SF_HAS_PAR)
c277df42
IZ
1188 && !(data->flags & SF_HAS_EVAL)
1189 && !deltanext ) {
1190 /* XXXX How to optimize if data == 0? */
1191 /* Optimize to a simpler form. */
1192 regnode *nxt = NEXTOPER(oscan) + EXTRA_STEP_2ARGS; /* OPEN */
1193 regnode *nxt2;
1194
1195 OP(oscan) = CURLYM;
1196 while ( (nxt2 = regnext(nxt)) /* skip over embedded stuff*/
b81d288d 1197 && (OP(nxt2) != WHILEM))
c277df42
IZ
1198 nxt = nxt2;
1199 OP(nxt2) = SUCCEED; /* Whas WHILEM */
c277df42
IZ
1200 /* Need to optimize away parenths. */
1201 if (data->flags & SF_IN_PAR) {
1202 /* Set the parenth number. */
1203 regnode *nxt1 = NEXTOPER(oscan) + EXTRA_STEP_2ARGS; /* OPEN*/
1204
b81d288d 1205 if (OP(nxt) != CLOSE)
b45f050a 1206 FAIL("Panic opt close");
c277df42
IZ
1207 oscan->flags = ARG(nxt);
1208 OP(nxt1) = OPTIMIZED; /* was OPEN. */
1209 OP(nxt) = OPTIMIZED; /* was CLOSE. */
1210#ifdef DEBUGGING
1211 OP(nxt1 + 1) = OPTIMIZED; /* was count. */
1212 OP(nxt + 1) = OPTIMIZED; /* was count. */
1213 NEXT_OFF(nxt1 + 1) = 0; /* just for consistancy. */
1214 NEXT_OFF(nxt + 1) = 0; /* just for consistancy. */
b81d288d 1215#endif
c277df42
IZ
1216#if 0
1217 while ( nxt1 && (OP(nxt1) != WHILEM)) {
1218 regnode *nnxt = regnext(nxt1);
b81d288d 1219
c277df42
IZ
1220 if (nnxt == nxt) {
1221 if (reg_off_by_arg[OP(nxt1)])
1222 ARG_SET(nxt1, nxt2 - nxt1);
1223 else if (nxt2 - nxt1 < U16_MAX)
1224 NEXT_OFF(nxt1) = nxt2 - nxt1;
1225 else
1226 OP(nxt) = NOTHING; /* Cannot beautify */
1227 }
1228 nxt1 = nnxt;
1229 }
1230#endif
1231 /* Optimize again: */
b81d288d 1232 study_chunk(pRExC_state, &nxt1, &deltanext, nxt,
e1901655 1233 NULL, 0);
a0ed51b3
LW
1234 }
1235 else
c277df42 1236 oscan->flags = 0;
c277df42 1237 }
e1901655
IZ
1238 else if ((OP(oscan) == CURLYX)
1239 && (flags & SCF_WHILEM_VISITED_POS)
1240 /* See the comment on a similar expression above.
1241 However, this time it not a subexpression
1242 we care about, but the expression itself. */
1243 && (maxcount == REG_INFTY)
1244 && data && ++data->whilem_c < 16) {
1245 /* This stays as CURLYX, we can put the count/of pair. */
2c2d71f5
JH
1246 /* Find WHILEM (as in regexec.c) */
1247 regnode *nxt = oscan + NEXT_OFF(oscan);
1248
1249 if (OP(PREVOPER(nxt)) == NOTHING) /* LONGJMP */
1250 nxt += ARG(nxt);
1251 PREVOPER(nxt)->flags = data->whilem_c
830247a4 1252 | (RExC_whilem_seen << 4); /* On WHILEM */
2c2d71f5 1253 }
b81d288d 1254 if (data && fl & (SF_HAS_PAR|SF_IN_PAR))
c277df42
IZ
1255 pars++;
1256 if (flags & SCF_DO_SUBSTR) {
1257 SV *last_str = Nullsv;
1258 int counted = mincount != 0;
1259
1260 if (data->last_end > 0 && mincount != 0) { /* Ends with a string. */
5d1c421c
JH
1261#if defined(SPARC64_GCC_WORKAROUND)
1262 I32 b = 0;
1263 STRLEN l = 0;
1264 char *s = NULL;
1265 I32 old = 0;
1266
1267 if (pos_before >= data->last_start_min)
1268 b = pos_before;
1269 else
1270 b = data->last_start_min;
1271
1272 l = 0;
1273 s = SvPV(data->last_found, l);
1274 old = b - data->last_start_min;
1275
1276#else
b81d288d 1277 I32 b = pos_before >= data->last_start_min
c277df42
IZ
1278 ? pos_before : data->last_start_min;
1279 STRLEN l;
1280 char *s = SvPV(data->last_found, l);
a0ed51b3 1281 I32 old = b - data->last_start_min;
5d1c421c 1282#endif
a0ed51b3
LW
1283
1284 if (UTF)
1285 old = utf8_hop((U8*)s, old) - (U8*)s;
c277df42 1286
a0ed51b3 1287 l -= old;
c277df42 1288 /* Get the added string: */
79cb57f6 1289 last_str = newSVpvn(s + old, l);
c277df42
IZ
1290 if (deltanext == 0 && pos_before == b) {
1291 /* What was added is a constant string */
1292 if (mincount > 1) {
1293 SvGROW(last_str, (mincount * l) + 1);
b81d288d 1294 repeatcpy(SvPVX(last_str) + l,
c277df42
IZ
1295 SvPVX(last_str), l, mincount - 1);
1296 SvCUR(last_str) *= mincount;
1297 /* Add additional parts. */
b81d288d 1298 SvCUR_set(data->last_found,
c277df42
IZ
1299 SvCUR(data->last_found) - l);
1300 sv_catsv(data->last_found, last_str);
1301 data->last_end += l * (mincount - 1);
1302 }
2a8d9689
HS
1303 } else {
1304 /* start offset must point into the last copy */
1305 data->last_start_min += minnext * (mincount - 1);
4b2cff9a
HS
1306 data->last_start_max += is_inf ? 0 : (maxcount - 1)
1307 * (minnext + data->pos_delta);
c277df42
IZ
1308 }
1309 }
1310 /* It is counted once already... */
1311 data->pos_min += minnext * (mincount - counted);
1312 data->pos_delta += - counted * deltanext +
1313 (minnext + deltanext) * maxcount - minnext * mincount;
1314 if (mincount != maxcount) {
653099ff
GS
1315 /* Cannot extend fixed substrings found inside
1316 the group. */
830247a4 1317 scan_commit(pRExC_state,data);
c277df42
IZ
1318 if (mincount && last_str) {
1319 sv_setsv(data->last_found, last_str);
1320 data->last_end = data->pos_min;
b81d288d 1321 data->last_start_min =
a0ed51b3 1322 data->pos_min - CHR_SVLEN(last_str);
b81d288d
AB
1323 data->last_start_max = is_inf
1324 ? I32_MAX
c277df42 1325 : data->pos_min + data->pos_delta
a0ed51b3 1326 - CHR_SVLEN(last_str);
c277df42
IZ
1327 }
1328 data->longest = &(data->longest_float);
1329 }
aca2d497 1330 SvREFCNT_dec(last_str);
c277df42 1331 }
405ff068 1332 if (data && (fl & SF_HAS_EVAL))
c277df42
IZ
1333 data->flags |= SF_HAS_EVAL;
1334 optimize_curly_tail:
c277df42 1335 if (OP(oscan) != CURLYX) {
22c35a8c 1336 while (PL_regkind[(U8)OP(next = regnext(oscan))] == NOTHING
c277df42
IZ
1337 && NEXT_OFF(next))
1338 NEXT_OFF(oscan) += NEXT_OFF(next);
1339 }
c277df42 1340 continue;
653099ff 1341 default: /* REF and CLUMP only? */
c277df42 1342 if (flags & SCF_DO_SUBSTR) {
830247a4 1343 scan_commit(pRExC_state,data); /* Cannot expect anything... */
c277df42
IZ
1344 data->longest = &(data->longest_float);
1345 }
aca2d497 1346 is_inf = is_inf_internal = 1;
653099ff 1347 if (flags & SCF_DO_STCLASS_OR)
830247a4 1348 cl_anything(pRExC_state, data->start_class);
653099ff 1349 flags &= ~SCF_DO_STCLASS;
c277df42
IZ
1350 break;
1351 }
a0ed51b3 1352 }
ffc61ed2 1353 else if (strchr((char*)PL_simple,OP(scan))) {
9c5ffd7c 1354 int value = 0;
653099ff 1355
c277df42 1356 if (flags & SCF_DO_SUBSTR) {
830247a4 1357 scan_commit(pRExC_state,data);
c277df42
IZ
1358 data->pos_min++;
1359 }
1360 min++;
653099ff
GS
1361 if (flags & SCF_DO_STCLASS) {
1362 data->start_class->flags &= ~ANYOF_EOS; /* No match on empty */
1363
1364 /* Some of the logic below assumes that switching
1365 locale on will only add false positives. */
1366 switch (PL_regkind[(U8)OP(scan)]) {
653099ff 1367 case SANY:
653099ff
GS
1368 default:
1369 do_default:
1370 /* Perl_croak(aTHX_ "panic: unexpected simple REx opcode %d", OP(scan)); */
1371 if (flags & SCF_DO_STCLASS_OR) /* Allow everything */
830247a4 1372 cl_anything(pRExC_state, data->start_class);
653099ff
GS
1373 break;
1374 case REG_ANY:
1375 if (OP(scan) == SANY)
1376 goto do_default;
1377 if (flags & SCF_DO_STCLASS_OR) { /* Everything but \n */
1378 value = (ANYOF_BITMAP_TEST(data->start_class,'\n')
1379 || (data->start_class->flags & ANYOF_CLASS));
830247a4 1380 cl_anything(pRExC_state, data->start_class);
653099ff
GS
1381 }
1382 if (flags & SCF_DO_STCLASS_AND || !value)
1383 ANYOF_BITMAP_CLEAR(data->start_class,'\n');
1384 break;
1385 case ANYOF:
1386 if (flags & SCF_DO_STCLASS_AND)
1387 cl_and(data->start_class,
1388 (struct regnode_charclass_class*)scan);
1389 else
830247a4 1390 cl_or(pRExC_state, data->start_class,
653099ff
GS
1391 (struct regnode_charclass_class*)scan);
1392 break;
1393 case ALNUM:
1394 if (flags & SCF_DO_STCLASS_AND) {
1395 if (!(data->start_class->flags & ANYOF_LOCALE)) {
1396 ANYOF_CLASS_CLEAR(data->start_class,ANYOF_NALNUM);
1397 for (value = 0; value < 256; value++)
1398 if (!isALNUM(value))
1399 ANYOF_BITMAP_CLEAR(data->start_class, value);
1400 }
1401 }
1402 else {
1403 if (data->start_class->flags & ANYOF_LOCALE)
1404 ANYOF_CLASS_SET(data->start_class,ANYOF_ALNUM);
1405 else {
1406 for (value = 0; value < 256; value++)
1407 if (isALNUM(value))
b81d288d 1408 ANYOF_BITMAP_SET(data->start_class, value);
653099ff
GS
1409 }
1410 }
1411 break;
1412 case ALNUML:
1413 if (flags & SCF_DO_STCLASS_AND) {
1414 if (data->start_class->flags & ANYOF_LOCALE)
1415 ANYOF_CLASS_CLEAR(data->start_class,ANYOF_NALNUM);
1416 }
1417 else {
1418 ANYOF_CLASS_SET(data->start_class,ANYOF_ALNUM);
1419 data->start_class->flags |= ANYOF_LOCALE;
1420 }
1421 break;
1422 case NALNUM:
1423 if (flags & SCF_DO_STCLASS_AND) {
1424 if (!(data->start_class->flags & ANYOF_LOCALE)) {
1425 ANYOF_CLASS_CLEAR(data->start_class,ANYOF_ALNUM);
1426 for (value = 0; value < 256; value++)
1427 if (isALNUM(value))
1428 ANYOF_BITMAP_CLEAR(data->start_class, value);
1429 }
1430 }
1431 else {
1432 if (data->start_class->flags & ANYOF_LOCALE)
1433 ANYOF_CLASS_SET(data->start_class,ANYOF_NALNUM);
1434 else {
1435 for (value = 0; value < 256; value++)
1436 if (!isALNUM(value))
b81d288d 1437 ANYOF_BITMAP_SET(data->start_class, value);
653099ff
GS
1438 }
1439 }
1440 break;
1441 case NALNUML:
1442 if (flags & SCF_DO_STCLASS_AND) {
1443 if (data->start_class->flags & ANYOF_LOCALE)
1444 ANYOF_CLASS_CLEAR(data->start_class,ANYOF_ALNUM);
1445 }
1446 else {
1447 data->start_class->flags |= ANYOF_LOCALE;
1448 ANYOF_CLASS_SET(data->start_class,ANYOF_NALNUM);
1449 }
1450 break;
1451 case SPACE:
1452 if (flags & SCF_DO_STCLASS_AND) {
1453 if (!(data->start_class->flags & ANYOF_LOCALE)) {
1454 ANYOF_CLASS_CLEAR(data->start_class,ANYOF_NSPACE);
1455 for (value = 0; value < 256; value++)
1456 if (!isSPACE(value))
1457 ANYOF_BITMAP_CLEAR(data->start_class, value);
1458 }
1459 }
1460 else {
1461 if (data->start_class->flags & ANYOF_LOCALE)
1462 ANYOF_CLASS_SET(data->start_class,ANYOF_SPACE);
1463 else {
1464 for (value = 0; value < 256; value++)
1465 if (isSPACE(value))
b81d288d 1466 ANYOF_BITMAP_SET(data->start_class, value);
653099ff
GS
1467 }
1468 }
1469 break;
1470 case SPACEL:
1471 if (flags & SCF_DO_STCLASS_AND) {
1472 if (data->start_class->flags & ANYOF_LOCALE)
1473 ANYOF_CLASS_CLEAR(data->start_class,ANYOF_NSPACE);
1474 }
1475 else {
1476 data->start_class->flags |= ANYOF_LOCALE;
1477 ANYOF_CLASS_SET(data->start_class,ANYOF_SPACE);
1478 }
1479 break;
1480 case NSPACE:
1481 if (flags & SCF_DO_STCLASS_AND) {
1482 if (!(data->start_class->flags & ANYOF_LOCALE)) {
1483 ANYOF_CLASS_CLEAR(data->start_class,ANYOF_SPACE);
1484 for (value = 0; value < 256; value++)
1485 if (isSPACE(value))
1486 ANYOF_BITMAP_CLEAR(data->start_class, value);
1487 }
1488 }
1489 else {
1490 if (data->start_class->flags & ANYOF_LOCALE)
1491 ANYOF_CLASS_SET(data->start_class,ANYOF_NSPACE);
1492 else {
1493 for (value = 0; value < 256; value++)
1494 if (!isSPACE(value))
b81d288d 1495 ANYOF_BITMAP_SET(data->start_class, value);
653099ff
GS
1496 }
1497 }
1498 break;
1499 case NSPACEL:
1500 if (flags & SCF_DO_STCLASS_AND) {
1501 if (data->start_class->flags & ANYOF_LOCALE) {
1502 ANYOF_CLASS_CLEAR(data->start_class,ANYOF_SPACE);
1503 for (value = 0; value < 256; value++)
1504 if (!isSPACE(value))
1505 ANYOF_BITMAP_CLEAR(data->start_class, value);
1506 }
1507 }
1508 else {
1509 data->start_class->flags |= ANYOF_LOCALE;
1510 ANYOF_CLASS_SET(data->start_class,ANYOF_NSPACE);
1511 }
1512 break;
1513 case DIGIT:
1514 if (flags & SCF_DO_STCLASS_AND) {
1515 ANYOF_CLASS_CLEAR(data->start_class,ANYOF_NDIGIT);
1516 for (value = 0; value < 256; value++)
1517 if (!isDIGIT(value))
1518 ANYOF_BITMAP_CLEAR(data->start_class, value);
1519 }
1520 else {
1521 if (data->start_class->flags & ANYOF_LOCALE)
1522 ANYOF_CLASS_SET(data->start_class,ANYOF_DIGIT);
1523 else {
1524 for (value = 0; value < 256; value++)
1525 if (isDIGIT(value))
b81d288d 1526 ANYOF_BITMAP_SET(data->start_class, value);
653099ff
GS
1527 }
1528 }
1529 break;
1530 case NDIGIT:
1531 if (flags & SCF_DO_STCLASS_AND) {
1532 ANYOF_CLASS_CLEAR(data->start_class,ANYOF_DIGIT);
1533 for (value = 0; value < 256; value++)
1534 if (isDIGIT(value))
1535 ANYOF_BITMAP_CLEAR(data->start_class, value);
1536 }
1537 else {
1538 if (data->start_class->flags & ANYOF_LOCALE)
1539 ANYOF_CLASS_SET(data->start_class,ANYOF_NDIGIT);
1540 else {
1541 for (value = 0; value < 256; value++)
1542 if (!isDIGIT(value))
b81d288d 1543 ANYOF_BITMAP_SET(data->start_class, value);
653099ff
GS
1544 }
1545 }
1546 break;
1547 }
1548 if (flags & SCF_DO_STCLASS_OR)
1549 cl_and(data->start_class, &and_with);
1550 flags &= ~SCF_DO_STCLASS;
1551 }
a0ed51b3 1552 }
22c35a8c 1553 else if (PL_regkind[(U8)OP(scan)] == EOL && flags & SCF_DO_SUBSTR) {
c277df42
IZ
1554 data->flags |= (OP(scan) == MEOL
1555 ? SF_BEFORE_MEOL
1556 : SF_BEFORE_SEOL);
a0ed51b3 1557 }
653099ff
GS
1558 else if ( PL_regkind[(U8)OP(scan)] == BRANCHJ
1559 /* Lookbehind, or need to calculate parens/evals/stclass: */
1560 && (scan->flags || data || (flags & SCF_DO_STCLASS))
c277df42 1561 && (OP(scan) == IFMATCH || OP(scan) == UNLESSM)) {
653099ff 1562 /* Lookahead/lookbehind */
cb434fcc 1563 I32 deltanext, minnext, fake = 0;
c277df42 1564 regnode *nscan;
653099ff
GS
1565 struct regnode_charclass_class intrnl;
1566 int f = 0;
c277df42
IZ
1567
1568 data_fake.flags = 0;
b81d288d 1569 if (data) {
2c2d71f5 1570 data_fake.whilem_c = data->whilem_c;
cb434fcc
IZ
1571 data_fake.last_closep = data->last_closep;
1572 }
1573 else
1574 data_fake.last_closep = &fake;
653099ff
GS
1575 if ( flags & SCF_DO_STCLASS && !scan->flags
1576 && OP(scan) == IFMATCH ) { /* Lookahead */
830247a4 1577 cl_init(pRExC_state, &intrnl);
653099ff 1578 data_fake.start_class = &intrnl;
e1901655 1579 f |= SCF_DO_STCLASS_AND;
653099ff 1580 }
e1901655
IZ
1581 if (flags & SCF_WHILEM_VISITED_POS)
1582 f |= SCF_WHILEM_VISITED_POS;
c277df42
IZ
1583 next = regnext(scan);
1584 nscan = NEXTOPER(NEXTOPER(scan));
830247a4 1585 minnext = study_chunk(pRExC_state, &nscan, &deltanext, last, &data_fake, f);
c277df42
IZ
1586 if (scan->flags) {
1587 if (deltanext) {
9baa0206 1588 vFAIL("Variable length lookbehind not implemented");
a0ed51b3
LW
1589 }
1590 else if (minnext > U8_MAX) {
9baa0206 1591 vFAIL2("Lookbehind longer than %"UVuf" not implemented", (UV)U8_MAX);
c277df42
IZ
1592 }
1593 scan->flags = minnext;
1594 }
1595 if (data && data_fake.flags & (SF_HAS_PAR|SF_IN_PAR))
1596 pars++;
405ff068 1597 if (data && (data_fake.flags & SF_HAS_EVAL))
c277df42 1598 data->flags |= SF_HAS_EVAL;
2c2d71f5
JH
1599 if (data)
1600 data->whilem_c = data_fake.whilem_c;
e1901655 1601 if (f & SCF_DO_STCLASS_AND) {
653099ff
GS
1602 int was = (data->start_class->flags & ANYOF_EOS);
1603
1604 cl_and(data->start_class, &intrnl);
1605 if (was)
1606 data->start_class->flags |= ANYOF_EOS;
1607 }
a0ed51b3
LW
1608 }
1609 else if (OP(scan) == OPEN) {
c277df42 1610 pars++;
a0ed51b3 1611 }
cb434fcc
IZ
1612 else if (OP(scan) == CLOSE) {
1613 if (ARG(scan) == is_par) {
1614 next = regnext(scan);
c277df42 1615
cb434fcc
IZ
1616 if ( next && (OP(next) != WHILEM) && next < last)
1617 is_par = 0; /* Disable optimization */
1618 }
1619 if (data)
1620 *(data->last_closep) = ARG(scan);
a0ed51b3
LW
1621 }
1622 else if (OP(scan) == EVAL) {
c277df42
IZ
1623 if (data)
1624 data->flags |= SF_HAS_EVAL;
1625 }
96776eda 1626 else if (OP(scan) == LOGICAL && scan->flags == 2) { /* Embedded follows */
0f5d15d6 1627 if (flags & SCF_DO_SUBSTR) {
830247a4 1628 scan_commit(pRExC_state,data);
0f5d15d6
IZ
1629 data->longest = &(data->longest_float);
1630 }
1631 is_inf = is_inf_internal = 1;
653099ff 1632 if (flags & SCF_DO_STCLASS_OR) /* Allow everything */
830247a4 1633 cl_anything(pRExC_state, data->start_class);
96776eda 1634 flags &= ~SCF_DO_STCLASS;
0f5d15d6 1635 }
c277df42
IZ
1636 /* Else: zero-length, ignore. */
1637 scan = regnext(scan);
1638 }
1639
1640 finish:
1641 *scanp = scan;
aca2d497 1642 *deltap = is_inf_internal ? I32_MAX : delta;
b81d288d 1643 if (flags & SCF_DO_SUBSTR && is_inf)
c277df42
IZ
1644 data->pos_delta = I32_MAX - data->pos_min;
1645 if (is_par > U8_MAX)
1646 is_par = 0;
1647 if (is_par && pars==1 && data) {
1648 data->flags |= SF_IN_PAR;
1649 data->flags &= ~SF_HAS_PAR;
a0ed51b3
LW
1650 }
1651 else if (pars && data) {
c277df42
IZ
1652 data->flags |= SF_HAS_PAR;
1653 data->flags &= ~SF_IN_PAR;
1654 }
653099ff
GS
1655 if (flags & SCF_DO_STCLASS_OR)
1656 cl_and(data->start_class, &and_with);
c277df42
IZ
1657 return min;
1658}
1659
76e3520e 1660STATIC I32
830247a4 1661S_add_data(pTHX_ RExC_state_t *pRExC_state, I32 n, char *s)
c277df42 1662{
830247a4 1663 if (RExC_rx->data) {
b81d288d
AB
1664 Renewc(RExC_rx->data,
1665 sizeof(*RExC_rx->data) + sizeof(void*) * (RExC_rx->data->count + n - 1),
c277df42 1666 char, struct reg_data);
830247a4
IZ
1667 Renew(RExC_rx->data->what, RExC_rx->data->count + n, U8);
1668 RExC_rx->data->count += n;
a0ed51b3
LW
1669 }
1670 else {
830247a4 1671 Newc(1207, RExC_rx->data, sizeof(*RExC_rx->data) + sizeof(void*) * (n - 1),
c277df42 1672 char, struct reg_data);
830247a4
IZ
1673 New(1208, RExC_rx->data->what, n, U8);
1674 RExC_rx->data->count = n;
c277df42 1675 }
830247a4
IZ
1676 Copy(s, RExC_rx->data->what + RExC_rx->data->count - n, n, U8);
1677 return RExC_rx->data->count - n;
c277df42
IZ
1678}
1679
d88dccdf 1680void
864dbfa3 1681Perl_reginitcolors(pTHX)
d88dccdf 1682{
d88dccdf
IZ
1683 int i = 0;
1684 char *s = PerlEnv_getenv("PERL_RE_COLORS");
b81d288d 1685
d88dccdf
IZ
1686 if (s) {
1687 PL_colors[0] = s = savepv(s);
1688 while (++i < 6) {
1689 s = strchr(s, '\t');
1690 if (s) {
1691 *s = '\0';
1692 PL_colors[i] = ++s;
1693 }
1694 else
c712d376 1695 PL_colors[i] = s = "";
d88dccdf
IZ
1696 }
1697 } else {
b81d288d 1698 while (i < 6)
d88dccdf
IZ
1699 PL_colors[i++] = "";
1700 }
1701 PL_colorset = 1;
1702}
1703
8615cb43 1704
a687059c 1705/*
e50aee73 1706 - pregcomp - compile a regular expression into internal code
a687059c
LW
1707 *
1708 * We can't allocate space until we know how big the compiled form will be,
1709 * but we can't compile it (and thus know how big it is) until we've got a
1710 * place to put the code. So we cheat: we compile it twice, once with code
1711 * generation turned off and size counting turned on, and once "for real".
1712 * This also means that we don't allocate space until we are sure that the
1713 * thing really will compile successfully, and we never have to move the
1714 * code and thus invalidate pointers into it. (Note that it has to be in
1715 * one piece because free() must be able to free it all.) [NB: not true in perl]
1716 *
1717 * Beware that the optimization-preparation code in here knows about some
1718 * of the structure of the compiled regexp. [I'll say.]
1719 */
1720regexp *
864dbfa3 1721Perl_pregcomp(pTHX_ char *exp, char *xend, PMOP *pm)
a687059c 1722{
a0d0e21e 1723 register regexp *r;
c277df42 1724 regnode *scan;
c277df42 1725 regnode *first;
a0d0e21e 1726 I32 flags;
a0d0e21e
LW
1727 I32 minlen = 0;
1728 I32 sawplus = 0;
1729 I32 sawopen = 0;
2c2d71f5 1730 scan_data_t data;
830247a4
IZ
1731 RExC_state_t RExC_state;
1732 RExC_state_t *pRExC_state = &RExC_state;
a0d0e21e
LW
1733
1734 if (exp == NULL)
c277df42 1735 FAIL("NULL regexp argument");
a0d0e21e 1736
a5961de5 1737 RExC_utf8 = pm->op_pmdynflags & PMdf_CMP_UTF8;
a0ed51b3 1738
5cfc7842 1739 RExC_precomp = exp;
a5961de5
JH
1740 DEBUG_r({
1741 if (!PL_colorset) reginitcolors();
1742 PerlIO_printf(Perl_debug_log, "%sCompiling REx%s `%s%*s%s'\n",
1743 PL_colors[4],PL_colors[5],PL_colors[0],
1744 (int)(xend - exp), RExC_precomp, PL_colors[1]);
1745 });
830247a4
IZ
1746 RExC_flags16 = pm->op_pmflags;
1747 RExC_sawback = 0;
bbce6d69 1748
830247a4
IZ
1749 RExC_seen = 0;
1750 RExC_seen_zerolen = *exp == '^' ? -1 : 0;
1751 RExC_seen_evals = 0;
1752 RExC_extralen = 0;
c277df42 1753
bbce6d69 1754 /* First pass: determine size, legality. */
830247a4 1755 RExC_parse = exp;
fac92740 1756 RExC_start = exp;
830247a4
IZ
1757 RExC_end = xend;
1758 RExC_naughty = 0;
1759 RExC_npar = 1;
1760 RExC_size = 0L;
1761 RExC_emit = &PL_regdummy;
1762 RExC_whilem_seen = 0;
85ddcde9
JH
1763#if 0 /* REGC() is (currently) a NOP at the first pass.
1764 * Clever compilers notice this and complain. --jhi */
830247a4 1765 REGC((U8)REG_MAGIC, (char*)RExC_emit);
85ddcde9 1766#endif
830247a4 1767 if (reg(pRExC_state, 0, &flags) == NULL) {
830247a4 1768 RExC_precomp = Nullch;
a0d0e21e
LW
1769 return(NULL);
1770 }
830247a4 1771 DEBUG_r(PerlIO_printf(Perl_debug_log, "size %"IVdf" ", (IV)RExC_size));
c277df42 1772
c277df42
IZ
1773 /* Small enough for pointer-storage convention?
1774 If extralen==0, this means that we will not need long jumps. */
830247a4
IZ
1775 if (RExC_size >= 0x10000L && RExC_extralen)
1776 RExC_size += RExC_extralen;
c277df42 1777 else
830247a4
IZ
1778 RExC_extralen = 0;
1779 if (RExC_whilem_seen > 15)
1780 RExC_whilem_seen = 15;
a0d0e21e 1781
bbce6d69 1782 /* Allocate space and initialize. */
830247a4 1783 Newc(1001, r, sizeof(regexp) + (unsigned)RExC_size * sizeof(regnode),
c277df42 1784 char, regexp);
a0d0e21e 1785 if (r == NULL)
b45f050a
JF
1786 FAIL("Regexp out of space");
1787
0f79a09d
GS
1788#ifdef DEBUGGING
1789 /* avoid reading uninitialized memory in DEBUGGING code in study_chunk() */
830247a4 1790 Zero(r, sizeof(regexp) + (unsigned)RExC_size * sizeof(regnode), char);
0f79a09d 1791#endif
c277df42 1792 r->refcnt = 1;
bbce6d69 1793 r->prelen = xend - exp;
5cfc7842 1794 r->precomp = savepvn(RExC_precomp, r->prelen);
cf93c79d
IZ
1795 r->subbeg = NULL;
1796 r->reganch = pm->op_pmflags & PMf_COMPILETIME;
830247a4 1797 r->nparens = RExC_npar - 1; /* set early to validate backrefs */
4327152a
IZ
1798
1799 r->substrs = 0; /* Useful during FAIL. */
1800 r->startp = 0; /* Useful during FAIL. */
1801 r->endp = 0; /* Useful during FAIL. */
1802
fac92740
MJD
1803 Newz(1304, r->offsets, 2*RExC_size+1, U32); /* MJD 20001228 */
1804 if (r->offsets) {
1805 r->offsets[0] = RExC_size;
1806 }
1807 DEBUG_r(PerlIO_printf(Perl_debug_log,
392fbf5d 1808 "%s %"UVuf" bytes for offset annotations.\n",
fac92740 1809 r->offsets ? "Got" : "Couldn't get",
392fbf5d 1810 (UV)((2*RExC_size+1) * sizeof(U32))));
fac92740 1811
830247a4 1812 RExC_rx = r;
bbce6d69 1813
1814 /* Second pass: emit code. */
055bb491 1815 RExC_flags16 = pm->op_pmflags; /* don't let top level (?i) bleed */
830247a4
IZ
1816 RExC_parse = exp;
1817 RExC_end = xend;
1818 RExC_naughty = 0;
1819 RExC_npar = 1;
fac92740 1820 RExC_emit_start = r->program;
830247a4 1821 RExC_emit = r->program;
2cd61cdb 1822 /* Store the count of eval-groups for security checks: */
830247a4
IZ
1823 RExC_emit->next_off = ((RExC_seen_evals > U16_MAX) ? U16_MAX : RExC_seen_evals);
1824 REGC((U8)REG_MAGIC, (char*) RExC_emit++);
c277df42 1825 r->data = 0;
830247a4 1826 if (reg(pRExC_state, 0, &flags) == NULL)
a0d0e21e
LW
1827 return(NULL);
1828
1829 /* Dig out information for optimizations. */
cf93c79d 1830 r->reganch = pm->op_pmflags & PMf_COMPILETIME; /* Again? */
830247a4 1831 pm->op_pmflags = RExC_flags16;
a0ed51b3 1832 if (UTF)
5ff6fc6d 1833 r->reganch |= ROPT_UTF8; /* Unicode in it? */
c277df42 1834 r->regstclass = NULL;
830247a4 1835 if (RExC_naughty >= 10) /* Probably an expensive pattern. */
a0ed51b3 1836 r->reganch |= ROPT_NAUGHTY;
c277df42 1837 scan = r->program + 1; /* First BRANCH. */
2779dcf1
IZ
1838
1839 /* XXXX To minimize changes to RE engine we always allocate
1840 3-units-long substrs field. */
1841 Newz(1004, r->substrs, 1, struct reg_substr_data);
1842
2c2d71f5 1843 StructCopy(&zero_scan_data, &data, scan_data_t);
653099ff 1844 /* XXXX Should not we check for something else? Usually it is OPEN1... */
c277df42 1845 if (OP(scan) != BRANCH) { /* Only one top-level choice. */
c277df42 1846 I32 fake;
c5254dd6 1847 STRLEN longest_float_length, longest_fixed_length;
653099ff
GS
1848 struct regnode_charclass_class ch_class;
1849 int stclass_flag;
cb434fcc 1850 I32 last_close = 0;
a0d0e21e
LW
1851
1852 first = scan;
c277df42 1853 /* Skip introductions and multiplicators >= 1. */
a0d0e21e 1854 while ((OP(first) == OPEN && (sawopen = 1)) ||
653099ff 1855 /* An OR of *one* alternative - should not happen now. */
a0d0e21e
LW
1856 (OP(first) == BRANCH && OP(regnext(first)) != BRANCH) ||
1857 (OP(first) == PLUS) ||
1858 (OP(first) == MINMOD) ||
653099ff 1859 /* An {n,m} with n>0 */
22c35a8c 1860 (PL_regkind[(U8)OP(first)] == CURLY && ARG1(first) > 0) ) {
a0d0e21e
LW
1861 if (OP(first) == PLUS)
1862 sawplus = 1;
1863 else
1864 first += regarglen[(U8)OP(first)];
1865 first = NEXTOPER(first);
a687059c
LW
1866 }
1867
a0d0e21e
LW
1868 /* Starting-point info. */
1869 again:
653099ff 1870 if (PL_regkind[(U8)OP(first)] == EXACT) {
1aa99e6b
IH
1871 if (OP(first) == EXACT)
1872 ; /* Empty, get anchored substr later. */
1873 else if ((OP(first) == EXACTF || OP(first) == EXACTFL))
b3c9acc1
IZ
1874 r->regstclass = first;
1875 }
653099ff 1876 else if (strchr((char*)PL_simple,OP(first)))
a0d0e21e 1877 r->regstclass = first;
22c35a8c
GS
1878 else if (PL_regkind[(U8)OP(first)] == BOUND ||
1879 PL_regkind[(U8)OP(first)] == NBOUND)
a0d0e21e 1880 r->regstclass = first;
22c35a8c 1881 else if (PL_regkind[(U8)OP(first)] == BOL) {
cad2e5aa
JH
1882 r->reganch |= (OP(first) == MBOL
1883 ? ROPT_ANCH_MBOL
1884 : (OP(first) == SBOL
1885 ? ROPT_ANCH_SBOL
1886 : ROPT_ANCH_BOL));
a0d0e21e 1887 first = NEXTOPER(first);
774d564b 1888 goto again;
1889 }
1890 else if (OP(first) == GPOS) {
1891 r->reganch |= ROPT_ANCH_GPOS;
1892 first = NEXTOPER(first);
1893 goto again;
a0d0e21e 1894 }
e09294f4 1895 else if (!sawopen && (OP(first) == STAR &&
22c35a8c 1896 PL_regkind[(U8)OP(NEXTOPER(first))] == REG_ANY) &&
a0d0e21e
LW
1897 !(r->reganch & ROPT_ANCH) )
1898 {
1899 /* turn .* into ^.* with an implied $*=1 */
cad2e5aa
JH
1900 int type = OP(NEXTOPER(first));
1901
ffc61ed2 1902 if (type == REG_ANY)
cad2e5aa
JH
1903 type = ROPT_ANCH_MBOL;
1904 else
1905 type = ROPT_ANCH_SBOL;
1906
1907 r->reganch |= type | ROPT_IMPLICIT;
a0d0e21e 1908 first = NEXTOPER(first);
774d564b 1909 goto again;
a0d0e21e 1910 }
b81d288d 1911 if (sawplus && (!sawopen || !RExC_sawback)
830247a4 1912 && !(RExC_seen & REG_SEEN_EVAL)) /* May examine pos and $& */
cad2e5aa
JH
1913 /* x+ must match at the 1st pos of run of x's */
1914 r->reganch |= ROPT_SKIP;
a0d0e21e 1915
c277df42 1916 /* Scan is after the zeroth branch, first is atomic matcher. */
b81d288d 1917 DEBUG_r(PerlIO_printf(Perl_debug_log, "first at %"IVdf"\n",
b900a521 1918 (IV)(first - scan + 1)));
a0d0e21e
LW
1919 /*
1920 * If there's something expensive in the r.e., find the
1921 * longest literal string that must appear and make it the
1922 * regmust. Resolve ties in favor of later strings, since
1923 * the regstart check works with the beginning of the r.e.
1924 * and avoiding duplication strengthens checking. Not a
1925 * strong reason, but sufficient in the absence of others.
1926 * [Now we resolve ties in favor of the earlier string if
c277df42 1927 * it happens that c_offset_min has been invalidated, since the
a0d0e21e
LW
1928 * earlier string may buy us something the later one won't.]
1929 */
a0d0e21e 1930 minlen = 0;
a687059c 1931
79cb57f6
GS
1932 data.longest_fixed = newSVpvn("",0);
1933 data.longest_float = newSVpvn("",0);
1934 data.last_found = newSVpvn("",0);
c277df42
IZ
1935 data.longest = &(data.longest_fixed);
1936 first = scan;
653099ff 1937 if (!r->regstclass) {
830247a4 1938 cl_init(pRExC_state, &ch_class);
653099ff
GS
1939 data.start_class = &ch_class;
1940 stclass_flag = SCF_DO_STCLASS_AND;
1941 } else /* XXXX Check for BOUND? */
1942 stclass_flag = 0;
cb434fcc 1943 data.last_closep = &last_close;
653099ff 1944
830247a4 1945 minlen = study_chunk(pRExC_state, &first, &fake, scan + RExC_size, /* Up to end */
e1901655 1946 &data, SCF_DO_SUBSTR | SCF_WHILEM_VISITED_POS | stclass_flag);
830247a4 1947 if ( RExC_npar == 1 && data.longest == &(data.longest_fixed)
b81d288d 1948 && data.last_start_min == 0 && data.last_end > 0
830247a4
IZ
1949 && !RExC_seen_zerolen
1950 && (!(RExC_seen & REG_SEEN_GPOS) || (r->reganch & ROPT_ANCH_GPOS)))
c277df42 1951 r->reganch |= ROPT_CHECK_ALL;
830247a4 1952 scan_commit(pRExC_state, &data);
c277df42
IZ
1953 SvREFCNT_dec(data.last_found);
1954
a0ed51b3 1955 longest_float_length = CHR_SVLEN(data.longest_float);
c5254dd6 1956 if (longest_float_length
c277df42
IZ
1957 || (data.flags & SF_FL_BEFORE_EOL
1958 && (!(data.flags & SF_FL_BEFORE_MEOL)
830247a4 1959 || (RExC_flags16 & PMf_MULTILINE)))) {
cf93c79d
IZ
1960 int t;
1961
a0ed51b3 1962 if (SvCUR(data.longest_fixed) /* ok to leave SvCUR */
aca2d497
IZ
1963 && data.offset_fixed == data.offset_float_min
1964 && SvCUR(data.longest_fixed) == SvCUR(data.longest_float))
1965 goto remove_float; /* As in (a)+. */
1966
c277df42
IZ
1967 r->float_substr = data.longest_float;
1968 r->float_min_offset = data.offset_float_min;
1969 r->float_max_offset = data.offset_float_max;
cf93c79d
IZ
1970 t = (data.flags & SF_FL_BEFORE_EOL /* Can't have SEOL and MULTI */
1971 && (!(data.flags & SF_FL_BEFORE_MEOL)
830247a4 1972 || (RExC_flags16 & PMf_MULTILINE)));
cf93c79d 1973 fbm_compile(r->float_substr, t ? FBMcf_TAIL : 0);
a0ed51b3
LW
1974 }
1975 else {
aca2d497 1976 remove_float:
c277df42
IZ
1977 r->float_substr = Nullsv;
1978 SvREFCNT_dec(data.longest_float);
c5254dd6 1979 longest_float_length = 0;
a0d0e21e 1980 }
c277df42 1981
a0ed51b3 1982 longest_fixed_length = CHR_SVLEN(data.longest_fixed);
c5254dd6 1983 if (longest_fixed_length
c277df42
IZ
1984 || (data.flags & SF_FIX_BEFORE_EOL /* Cannot have SEOL and MULTI */
1985 && (!(data.flags & SF_FIX_BEFORE_MEOL)
830247a4 1986 || (RExC_flags16 & PMf_MULTILINE)))) {
cf93c79d
IZ
1987 int t;
1988
c277df42
IZ
1989 r->anchored_substr = data.longest_fixed;
1990 r->anchored_offset = data.offset_fixed;
cf93c79d
IZ
1991 t = (data.flags & SF_FIX_BEFORE_EOL /* Can't have SEOL and MULTI */
1992 && (!(data.flags & SF_FIX_BEFORE_MEOL)
830247a4 1993 || (RExC_flags16 & PMf_MULTILINE)));
cf93c79d 1994 fbm_compile(r->anchored_substr, t ? FBMcf_TAIL : 0);
a0ed51b3
LW
1995 }
1996 else {
c277df42
IZ
1997 r->anchored_substr = Nullsv;
1998 SvREFCNT_dec(data.longest_fixed);
c5254dd6 1999 longest_fixed_length = 0;
a0d0e21e 2000 }
b81d288d 2001 if (r->regstclass
ffc61ed2 2002 && (OP(r->regstclass) == REG_ANY || OP(r->regstclass) == SANY))
653099ff
GS
2003 r->regstclass = NULL;
2004 if ((!r->anchored_substr || r->anchored_offset) && stclass_flag
2005 && !(data.start_class->flags & ANYOF_EOS)
2006 && !cl_is_anything(data.start_class)) {
830247a4 2007 I32 n = add_data(pRExC_state, 1, "f");
653099ff 2008
b81d288d 2009 New(1006, RExC_rx->data->data[n], 1,
653099ff
GS
2010 struct regnode_charclass_class);
2011 StructCopy(data.start_class,
830247a4 2012 (struct regnode_charclass_class*)RExC_rx->data->data[n],
653099ff 2013 struct regnode_charclass_class);
830247a4 2014 r->regstclass = (regnode*)RExC_rx->data->data[n];
653099ff 2015 r->reganch &= ~ROPT_SKIP; /* Used in find_byclass(). */
ffc61ed2 2016 PL_regdata = r->data; /* for regprop() */
9c5ffd7c
JH
2017 DEBUG_r({ SV *sv = sv_newmortal();
2018 regprop(sv, (regnode*)data.start_class);
2019 PerlIO_printf(Perl_debug_log,
2020 "synthetic stclass `%s'.\n",
2021 SvPVX(sv));});
653099ff 2022 }
c277df42
IZ
2023
2024 /* A temporary algorithm prefers floated substr to fixed one to dig more info. */
c5254dd6 2025 if (longest_fixed_length > longest_float_length) {
c277df42
IZ
2026 r->check_substr = r->anchored_substr;
2027 r->check_offset_min = r->check_offset_max = r->anchored_offset;
2028 if (r->reganch & ROPT_ANCH_SINGLE)
2029 r->reganch |= ROPT_NOSCAN;
a0ed51b3
LW
2030 }
2031 else {
c277df42
IZ
2032 r->check_substr = r->float_substr;
2033 r->check_offset_min = data.offset_float_min;
2034 r->check_offset_max = data.offset_float_max;
a0d0e21e 2035 }
30382c73
IZ
2036 /* XXXX Currently intuiting is not compatible with ANCH_GPOS.
2037 This should be changed ASAP! */
2038 if (r->check_substr && !(r->reganch & ROPT_ANCH_GPOS)) {
cad2e5aa
JH
2039 r->reganch |= RE_USE_INTUIT;
2040 if (SvTAIL(r->check_substr))
2041 r->reganch |= RE_INTUIT_TAIL;
2042 }
a0ed51b3
LW
2043 }
2044 else {
c277df42
IZ
2045 /* Several toplevels. Best we can is to set minlen. */
2046 I32 fake;
653099ff 2047 struct regnode_charclass_class ch_class;
cb434fcc 2048 I32 last_close = 0;
c277df42
IZ
2049
2050 DEBUG_r(PerlIO_printf(Perl_debug_log, "\n"));
2051 scan = r->program + 1;
830247a4 2052 cl_init(pRExC_state, &ch_class);
653099ff 2053 data.start_class = &ch_class;
cb434fcc 2054 data.last_closep = &last_close;
e1901655 2055 minlen = study_chunk(pRExC_state, &scan, &fake, scan + RExC_size, &data, SCF_DO_STCLASS_AND|SCF_WHILEM_VISITED_POS);
c277df42 2056 r->check_substr = r->anchored_substr = r->float_substr = Nullsv;
653099ff
GS
2057 if (!(data.start_class->flags & ANYOF_EOS)
2058 && !cl_is_anything(data.start_class)) {
830247a4 2059 I32 n = add_data(pRExC_state, 1, "f");
653099ff 2060
b81d288d 2061 New(1006, RExC_rx->data->data[n], 1,
653099ff
GS
2062 struct regnode_charclass_class);
2063 StructCopy(data.start_class,
830247a4 2064 (struct regnode_charclass_class*)RExC_rx->data->data[n],
653099ff 2065 struct regnode_charclass_class);
830247a4 2066 r->regstclass = (regnode*)RExC_rx->data->data[n];
653099ff 2067 r->reganch &= ~ROPT_SKIP; /* Used in find_byclass(). */
9c5ffd7c
JH
2068 DEBUG_r({ SV* sv = sv_newmortal();
2069 regprop(sv, (regnode*)data.start_class);
2070 PerlIO_printf(Perl_debug_log,
2071 "synthetic stclass `%s'.\n",
2072 SvPVX(sv));});
653099ff 2073 }
a0d0e21e
LW
2074 }
2075
a0d0e21e 2076 r->minlen = minlen;
b81d288d 2077 if (RExC_seen & REG_SEEN_GPOS)
c277df42 2078 r->reganch |= ROPT_GPOS_SEEN;
830247a4 2079 if (RExC_seen & REG_SEEN_LOOKBEHIND)
c277df42 2080 r->reganch |= ROPT_LOOKBEHIND_SEEN;
830247a4 2081 if (RExC_seen & REG_SEEN_EVAL)
ce862d02 2082 r->reganch |= ROPT_EVAL_SEEN;
f33976b4
DB
2083 if (RExC_seen & REG_SEEN_CANY)
2084 r->reganch |= ROPT_CANY_SEEN;
830247a4
IZ
2085 Newz(1002, r->startp, RExC_npar, I32);
2086 Newz(1002, r->endp, RExC_npar, I32);
ffc61ed2 2087 PL_regdata = r->data; /* for regprop() */
a0d0e21e
LW
2088 DEBUG_r(regdump(r));
2089 return(r);
a687059c
LW
2090}
2091
2092/*
2093 - reg - regular expression, i.e. main body or parenthesized thing
2094 *
2095 * Caller must absorb opening parenthesis.
2096 *
2097 * Combining parenthesis handling with the base level of regular expression
2098 * is a trifle forced, but the need to tie the tails of the branches to what
2099 * follows makes it hard to avoid.
2100 */
76e3520e 2101STATIC regnode *
830247a4 2102S_reg(pTHX_ RExC_state_t *pRExC_state, I32 paren, I32 *flagp)
c277df42 2103 /* paren: Parenthesized? 0=top, 1=(, inside: changed to letter. */
a687059c 2104{
c277df42
IZ
2105 register regnode *ret; /* Will be the head of the group. */
2106 register regnode *br;
2107 register regnode *lastbr;
2108 register regnode *ender = 0;
a0d0e21e 2109 register I32 parno = 0;
830247a4 2110 I32 flags, oregflags = RExC_flags16, have_branch = 0, open = 0;
9d1d55b5
JP
2111
2112 /* for (?g), (?gc), and (?o) warnings; warning
2113 about (?c) will warn about (?g) -- japhy */
2114
2115 I32 wastedflags = 0x00,
2116 wasted_o = 0x01,
2117 wasted_g = 0x02,
2118 wasted_gc = 0x02 | 0x04,
2119 wasted_c = 0x04;
2120
fac92740 2121 char * parse_start = RExC_parse; /* MJD */
830247a4 2122 char *oregcomp_parse = RExC_parse;
c277df42 2123 char c;
a0d0e21e 2124
821b33a5 2125 *flagp = 0; /* Tentatively. */
a0d0e21e 2126
9d1d55b5 2127
a0d0e21e
LW
2128 /* Make an OPEN node, if parenthesized. */
2129 if (paren) {
fac92740 2130 if (*RExC_parse == '?') { /* (?...) */
ca9dfc88
IZ
2131 U16 posflags = 0, negflags = 0;
2132 U16 *flagsp = &posflags;
0f5d15d6 2133 int logical = 0;
830247a4 2134 char *seqstart = RExC_parse;
ca9dfc88 2135
830247a4
IZ
2136 RExC_parse++;
2137 paren = *RExC_parse++;
c277df42 2138 ret = NULL; /* For look-ahead/behind. */
a0d0e21e 2139 switch (paren) {
fac92740 2140 case '<': /* (?<...) */
830247a4 2141 RExC_seen |= REG_SEEN_LOOKBEHIND;
b81d288d 2142 if (*RExC_parse == '!')
c277df42 2143 paren = ',';
b81d288d 2144 if (*RExC_parse != '=' && *RExC_parse != '!')
c277df42 2145 goto unknown;
830247a4 2146 RExC_parse++;
fac92740
MJD
2147 case '=': /* (?=...) */
2148 case '!': /* (?!...) */
830247a4 2149 RExC_seen_zerolen++;
fac92740
MJD
2150 case ':': /* (?:...) */
2151 case '>': /* (?>...) */
a0d0e21e 2152 break;
fac92740
MJD
2153 case '$': /* (?$...) */
2154 case '@': /* (?@...) */
8615cb43 2155 vFAIL2("Sequence (?%c...) not implemented", (int)paren);
a0d0e21e 2156 break;
fac92740 2157 case '#': /* (?#...) */
830247a4
IZ
2158 while (*RExC_parse && *RExC_parse != ')')
2159 RExC_parse++;
2160 if (*RExC_parse != ')')
c277df42 2161 FAIL("Sequence (?#... not terminated");
830247a4 2162 nextchar(pRExC_state);
a0d0e21e
LW
2163 *flagp = TRYAGAIN;
2164 return NULL;
fac92740 2165 case 'p': /* (?p...) */
f9373011
PM
2166 if (SIZE_ONLY && ckWARN2(WARN_DEPRECATED, WARN_REGEXP))
2167 vWARNdep(RExC_parse, "(?p{}) is deprecated - use (??{})");
8c8ad484 2168 /* FALL THROUGH*/
fac92740 2169 case '?': /* (??...) */
0f5d15d6 2170 logical = 1;
438a3801
YST
2171 if (*RExC_parse != '{')
2172 goto unknown;
830247a4 2173 paren = *RExC_parse++;
0f5d15d6 2174 /* FALL THROUGH */
fac92740 2175 case '{': /* (?{...}) */
c277df42 2176 {
c277df42
IZ
2177 I32 count = 1, n = 0;
2178 char c;
830247a4 2179 char *s = RExC_parse;
c277df42
IZ
2180 SV *sv;
2181 OP_4tree *sop, *rop;
2182
830247a4
IZ
2183 RExC_seen_zerolen++;
2184 RExC_seen |= REG_SEEN_EVAL;
2185 while (count && (c = *RExC_parse)) {
2186 if (c == '\\' && RExC_parse[1])
2187 RExC_parse++;
b81d288d 2188 else if (c == '{')
c277df42 2189 count++;
b81d288d 2190 else if (c == '}')
c277df42 2191 count--;
830247a4 2192 RExC_parse++;
c277df42 2193 }
830247a4 2194 if (*RExC_parse != ')')
b45f050a 2195 {
b81d288d 2196 RExC_parse = s;
b45f050a
JF
2197 vFAIL("Sequence (?{...}) not terminated or not {}-balanced");
2198 }
c277df42
IZ
2199 if (!SIZE_ONLY) {
2200 AV *av;
b81d288d
AB
2201
2202 if (RExC_parse - 1 - s)
830247a4 2203 sv = newSVpvn(s, RExC_parse - 1 - s);
c277df42 2204 else
79cb57f6 2205 sv = newSVpvn("", 0);
c277df42 2206
569233ed
SB
2207 ENTER;
2208 Perl_save_re_context(aTHX);
c277df42 2209 rop = sv_compile_2op(sv, &sop, "re", &av);
9b978d73
DM
2210 sop->op_private |= OPpREFCOUNTED;
2211 /* re_dup will OpREFCNT_inc */
2212 OpREFCNT_set(sop, 1);
569233ed 2213 LEAVE;
c277df42 2214
830247a4
IZ
2215 n = add_data(pRExC_state, 3, "nop");
2216 RExC_rx->data->data[n] = (void*)rop;
2217 RExC_rx->data->data[n+1] = (void*)sop;
2218 RExC_rx->data->data[n+2] = (void*)av;
c277df42 2219 SvREFCNT_dec(sv);
a0ed51b3 2220 }
e24b16f9 2221 else { /* First pass */
830247a4 2222 if (PL_reginterp_cnt < ++RExC_seen_evals
e24b16f9 2223 && PL_curcop != &PL_compiling)
2cd61cdb
IZ
2224 /* No compiled RE interpolated, has runtime
2225 components ===> unsafe. */
2226 FAIL("Eval-group not allowed at runtime, use re 'eval'");
5b61d3f7 2227 if (PL_tainting && PL_tainted)
cc6b7395 2228 FAIL("Eval-group in insecure regular expression");
c277df42
IZ
2229 }
2230
830247a4 2231 nextchar(pRExC_state);
0f5d15d6 2232 if (logical) {
830247a4 2233 ret = reg_node(pRExC_state, LOGICAL);
0f5d15d6
IZ
2234 if (!SIZE_ONLY)
2235 ret->flags = 2;
830247a4 2236 regtail(pRExC_state, ret, reganode(pRExC_state, EVAL, n));
fac92740 2237 /* deal with the length of this later - MJD */
0f5d15d6
IZ
2238 return ret;
2239 }
830247a4 2240 return reganode(pRExC_state, EVAL, n);
c277df42 2241 }
fac92740 2242 case '(': /* (?(?{...})...) and (?(?=...)...) */
c277df42 2243 {
fac92740 2244 if (RExC_parse[0] == '?') { /* (?(?...)) */
b81d288d
AB
2245 if (RExC_parse[1] == '=' || RExC_parse[1] == '!'
2246 || RExC_parse[1] == '<'
830247a4 2247 || RExC_parse[1] == '{') { /* Lookahead or eval. */
c277df42
IZ
2248 I32 flag;
2249
830247a4 2250 ret = reg_node(pRExC_state, LOGICAL);
0f5d15d6
IZ
2251 if (!SIZE_ONLY)
2252 ret->flags = 1;
830247a4 2253 regtail(pRExC_state, ret, reg(pRExC_state, 1, &flag));
c277df42 2254 goto insert_if;
b81d288d 2255 }
a0ed51b3 2256 }
830247a4 2257 else if (RExC_parse[0] >= '1' && RExC_parse[0] <= '9' ) {
fac92740 2258 /* (?(1)...) */
830247a4 2259 parno = atoi(RExC_parse++);
c277df42 2260
830247a4
IZ
2261 while (isDIGIT(*RExC_parse))
2262 RExC_parse++;
fac92740
MJD
2263 ret = reganode(pRExC_state, GROUPP, parno);
2264
830247a4 2265 if ((c = *nextchar(pRExC_state)) != ')')
b45f050a 2266 vFAIL("Switch condition not recognized");
c277df42 2267 insert_if:
830247a4
IZ
2268 regtail(pRExC_state, ret, reganode(pRExC_state, IFTHEN, 0));
2269 br = regbranch(pRExC_state, &flags, 1);
c277df42 2270 if (br == NULL)
830247a4 2271 br = reganode(pRExC_state, LONGJMP, 0);
c277df42 2272 else
830247a4
IZ
2273 regtail(pRExC_state, br, reganode(pRExC_state, LONGJMP, 0));
2274 c = *nextchar(pRExC_state);
d1b80229
IZ
2275 if (flags&HASWIDTH)
2276 *flagp |= HASWIDTH;
c277df42 2277 if (c == '|') {
830247a4
IZ
2278 lastbr = reganode(pRExC_state, IFTHEN, 0); /* Fake one for optimizer. */
2279 regbranch(pRExC_state, &flags, 1);
2280 regtail(pRExC_state, ret, lastbr);
d1b80229
IZ
2281 if (flags&HASWIDTH)
2282 *flagp |= HASWIDTH;
830247a4 2283 c = *nextchar(pRExC_state);
a0ed51b3
LW
2284 }
2285 else
c277df42
IZ
2286 lastbr = NULL;
2287 if (c != ')')
8615cb43 2288 vFAIL("Switch (?(condition)... contains too many branches");
830247a4
IZ
2289 ender = reg_node(pRExC_state, TAIL);
2290 regtail(pRExC_state, br, ender);
c277df42 2291 if (lastbr) {
830247a4
IZ
2292 regtail(pRExC_state, lastbr, ender);
2293 regtail(pRExC_state, NEXTOPER(NEXTOPER(lastbr)), ender);
a0ed51b3
LW
2294 }
2295 else
830247a4 2296 regtail(pRExC_state, ret, ender);
c277df42 2297 return ret;
a0ed51b3
LW
2298 }
2299 else {
830247a4 2300 vFAIL2("Unknown switch condition (?(%.2s", RExC_parse);
c277df42
IZ
2301 }
2302 }
1b1626e4 2303 case 0:
830247a4 2304 RExC_parse--; /* for vFAIL to print correctly */
8615cb43 2305 vFAIL("Sequence (? incomplete");
1b1626e4 2306 break;
a0d0e21e 2307 default:
830247a4 2308 --RExC_parse;
fac92740 2309 parse_flags: /* (?i) */
830247a4 2310 while (*RExC_parse && strchr("iogcmsx", *RExC_parse)) {
9d1d55b5
JP
2311 /* (?g), (?gc) and (?o) are useless here
2312 and must be globally applied -- japhy */
2313
2314 if (*RExC_parse == 'o' || *RExC_parse == 'g') {
2315 if (SIZE_ONLY && ckWARN(WARN_REGEXP)) {
2316 I32 wflagbit = *RExC_parse == 'o' ? wasted_o : wasted_g;
2317 if (! (wastedflags & wflagbit) ) {
2318 wastedflags |= wflagbit;
2319 vWARN5(
2320 RExC_parse + 1,
2321 "Useless (%s%c) - %suse /%c modifier",
2322 flagsp == &negflags ? "?-" : "?",
2323 *RExC_parse,
2324 flagsp == &negflags ? "don't " : "",
2325 *RExC_parse
2326 );
2327 }
2328 }
2329 }
2330 else if (*RExC_parse == 'c') {
2331 if (SIZE_ONLY && ckWARN(WARN_REGEXP)) {
2332 if (! (wastedflags & wasted_c) ) {
2333 wastedflags |= wasted_gc;
2334 vWARN3(
2335 RExC_parse + 1,
2336 "Useless (%sc) - %suse /gc modifier",
2337 flagsp == &negflags ? "?-" : "?",
2338 flagsp == &negflags ? "don't " : ""
2339 );
2340 }
2341 }
2342 }
2343 else { pmflag(flagsp, *RExC_parse); }
2344
830247a4 2345 ++RExC_parse;
ca9dfc88 2346 }
830247a4 2347 if (*RExC_parse == '-') {
ca9dfc88 2348 flagsp = &negflags;
9d1d55b5 2349 wastedflags = 0; /* reset so (?g-c) warns twice */
830247a4 2350 ++RExC_parse;
ca9dfc88 2351 goto parse_flags;
48c036b1 2352 }
830247a4
IZ
2353 RExC_flags16 |= posflags;
2354 RExC_flags16 &= ~negflags;
2355 if (*RExC_parse == ':') {
2356 RExC_parse++;
ca9dfc88
IZ
2357 paren = ':';
2358 break;
2359 }
c277df42 2360 unknown:
830247a4
IZ
2361 if (*RExC_parse != ')') {
2362 RExC_parse++;
2363 vFAIL3("Sequence (%.*s...) not recognized", RExC_parse-seqstart, seqstart);
b45f050a 2364 }
830247a4 2365 nextchar(pRExC_state);
a0d0e21e
LW
2366 *flagp = TRYAGAIN;
2367 return NULL;
2368 }
2369 }
fac92740 2370 else { /* (...) */
830247a4
IZ
2371 parno = RExC_npar;
2372 RExC_npar++;
2373 ret = reganode(pRExC_state, OPEN, parno);
fac92740
MJD
2374 Set_Node_Length(ret, 1); /* MJD */
2375 Set_Node_Offset(ret, RExC_parse); /* MJD */
c277df42 2376 open = 1;
a0d0e21e 2377 }
a0ed51b3 2378 }
fac92740 2379 else /* ! paren */
a0d0e21e
LW
2380 ret = NULL;
2381
2382 /* Pick up the branches, linking them together. */
fac92740 2383 parse_start = RExC_parse; /* MJD */
830247a4 2384 br = regbranch(pRExC_state, &flags, 1);
fac92740
MJD
2385 /* branch_len = (paren != 0); */
2386
a0d0e21e
LW
2387 if (br == NULL)
2388 return(NULL);
830247a4
IZ
2389 if (*RExC_parse == '|') {
2390 if (!SIZE_ONLY && RExC_extralen) {
2391 reginsert(pRExC_state, BRANCHJ, br);
a0ed51b3 2392 }
fac92740 2393 else { /* MJD */
830247a4 2394 reginsert(pRExC_state, BRANCH, br);
fac92740
MJD
2395 Set_Node_Length(br, paren != 0);
2396 Set_Node_Offset_To_R(br-RExC_emit_start, parse_start-RExC_start);
2397 }
c277df42
IZ
2398 have_branch = 1;
2399 if (SIZE_ONLY)
830247a4 2400 RExC_extralen += 1; /* For BRANCHJ-BRANCH. */
a0ed51b3
LW
2401 }
2402 else if (paren == ':') {
c277df42
IZ
2403 *flagp |= flags&SIMPLE;
2404 }
2405 if (open) { /* Starts with OPEN. */
830247a4 2406 regtail(pRExC_state, ret, br); /* OPEN -> first. */
a0ed51b3
LW
2407 }
2408 else if (paren != '?') /* Not Conditional */
a0d0e21e 2409 ret = br;
32a0ca98 2410 *flagp |= flags & (SPSTART | HASWIDTH);
c277df42 2411 lastbr = br;
830247a4
IZ
2412 while (*RExC_parse == '|') {
2413 if (!SIZE_ONLY && RExC_extralen) {
2414 ender = reganode(pRExC_state, LONGJMP,0);
2415 regtail(pRExC_state, NEXTOPER(NEXTOPER(lastbr)), ender); /* Append to the previous. */
c277df42
IZ
2416 }
2417 if (SIZE_ONLY)
830247a4
IZ
2418 RExC_extralen += 2; /* Account for LONGJMP. */
2419 nextchar(pRExC_state);
2420 br = regbranch(pRExC_state, &flags, 0);
fac92740 2421
a687059c 2422 if (br == NULL)
a0d0e21e 2423 return(NULL);
830247a4 2424 regtail(pRExC_state, lastbr, br); /* BRANCH -> BRANCH. */
c277df42 2425 lastbr = br;
821b33a5
IZ
2426 if (flags&HASWIDTH)
2427 *flagp |= HASWIDTH;
a687059c 2428 *flagp |= flags&SPSTART;
a0d0e21e
LW
2429 }
2430
c277df42
IZ
2431 if (have_branch || paren != ':') {
2432 /* Make a closing node, and hook it on the end. */
2433 switch (paren) {
2434 case ':':
830247a4 2435 ender = reg_node(pRExC_state, TAIL);
c277df42
IZ
2436 break;
2437 case 1:
830247a4 2438 ender = reganode(pRExC_state, CLOSE, parno);
fac92740
MJD
2439 Set_Node_Offset(ender,RExC_parse+1); /* MJD */
2440 Set_Node_Length(ender,1); /* MJD */
c277df42
IZ
2441 break;
2442 case '<':
c277df42
IZ
2443 case ',':
2444 case '=':
2445 case '!':
c277df42 2446 *flagp &= ~HASWIDTH;
821b33a5
IZ
2447 /* FALL THROUGH */
2448 case '>':
830247a4 2449 ender = reg_node(pRExC_state, SUCCEED);
c277df42
IZ
2450 break;
2451 case 0:
830247a4 2452 ender = reg_node(pRExC_state, END);
c277df42
IZ
2453 break;
2454 }
830247a4 2455 regtail(pRExC_state, lastbr, ender);
a0d0e21e 2456
c277df42
IZ
2457 if (have_branch) {
2458 /* Hook the tails of the branches to the closing node. */
2459 for (br = ret; br != NULL; br = regnext(br)) {
830247a4 2460 regoptail(pRExC_state, br, ender);
c277df42
IZ
2461 }
2462 }
a0d0e21e 2463 }
c277df42
IZ
2464
2465 {
2466 char *p;
2467 static char parens[] = "=!<,>";
2468
2469 if (paren && (p = strchr(parens, paren))) {
2470 int node = ((p - parens) % 2) ? UNLESSM : IFMATCH;
2471 int flag = (p - parens) > 1;
2472
2473 if (paren == '>')
2474 node = SUSPEND, flag = 0;
830247a4 2475 reginsert(pRExC_state, node,ret);
c277df42 2476 ret->flags = flag;
830247a4 2477 regtail(pRExC_state, ret, reg_node(pRExC_state, TAIL));
c277df42 2478 }
a0d0e21e
LW
2479 }
2480
2481 /* Check for proper termination. */
ce3e6498 2482 if (paren) {
830247a4
IZ
2483 RExC_flags16 = oregflags;
2484 if (RExC_parse >= RExC_end || *nextchar(pRExC_state) != ')') {
2485 RExC_parse = oregcomp_parse;
380a0633 2486 vFAIL("Unmatched (");
ce3e6498 2487 }
a0ed51b3 2488 }
830247a4
IZ
2489 else if (!paren && RExC_parse < RExC_end) {
2490 if (*RExC_parse == ')') {
2491 RExC_parse++;
380a0633 2492 vFAIL("Unmatched )");
a0ed51b3
LW
2493 }
2494 else
b45f050a 2495 FAIL("Junk on end of regexp"); /* "Can't happen". */
a0d0e21e
LW
2496 /* NOTREACHED */
2497 }
a687059c 2498
a0d0e21e 2499 return(ret);
a687059c
LW
2500}
2501
2502/*
2503 - regbranch - one alternative of an | operator
2504 *
2505 * Implements the concatenation operator.
2506 */
76e3520e 2507STATIC regnode *
830247a4 2508S_regbranch(pTHX_ RExC_state_t *pRExC_state, I32 *flagp, I32 first)
a687059c 2509{
c277df42
IZ
2510 register regnode *ret;
2511 register regnode *chain = NULL;
2512 register regnode *latest;
2513 I32 flags = 0, c = 0;
a0d0e21e 2514
b81d288d 2515 if (first)
c277df42
IZ
2516 ret = NULL;
2517 else {
b81d288d 2518 if (!SIZE_ONLY && RExC_extralen)
830247a4 2519 ret = reganode(pRExC_state, BRANCHJ,0);
fac92740 2520 else {
830247a4 2521 ret = reg_node(pRExC_state, BRANCH);
fac92740
MJD
2522 Set_Node_Length(ret, 1);
2523 }
c277df42
IZ
2524 }
2525
b81d288d 2526 if (!first && SIZE_ONLY)
830247a4 2527 RExC_extralen += 1; /* BRANCHJ */
b81d288d 2528
c277df42 2529 *flagp = WORST; /* Tentatively. */
a0d0e21e 2530
830247a4
IZ
2531 RExC_parse--;
2532 nextchar(pRExC_state);
2533 while (RExC_parse < RExC_end && *RExC_parse != '|' && *RExC_parse != ')') {
a0d0e21e 2534 flags &= ~TRYAGAIN;
830247a4 2535 latest = regpiece(pRExC_state, &flags);
a0d0e21e
LW
2536 if (latest == NULL) {
2537 if (flags & TRYAGAIN)
2538 continue;
2539 return(NULL);
a0ed51b3
LW
2540 }
2541 else if (ret == NULL)
c277df42 2542 ret = latest;
a0d0e21e 2543 *flagp |= flags&HASWIDTH;
c277df42 2544 if (chain == NULL) /* First piece. */
a0d0e21e
LW
2545 *flagp |= flags&SPSTART;
2546 else {
830247a4
IZ
2547 RExC_naughty++;
2548 regtail(pRExC_state, chain, latest);
a687059c 2549 }
a0d0e21e 2550 chain = latest;
c277df42
IZ
2551 c++;
2552 }
2553 if (chain == NULL) { /* Loop ran zero times. */
830247a4 2554 chain = reg_node(pRExC_state, NOTHING);
c277df42
IZ
2555 if (ret == NULL)
2556 ret = chain;
2557 }
2558 if (c == 1) {
2559 *flagp |= flags&SIMPLE;
a0d0e21e 2560 }
a687059c 2561
a0d0e21e 2562 return(ret);
a687059c
LW
2563}
2564
2565/*
2566 - regpiece - something followed by possible [*+?]
2567 *
2568 * Note that the branching code sequences used for ? and the general cases
2569 * of * and + are somewhat optimized: they use the same NOTHING node as
2570 * both the endmarker for their branch list and the body of the last branch.
2571 * It might seem that this node could be dispensed with entirely, but the
2572 * endmarker role is not redundant.
2573 */
76e3520e 2574STATIC regnode *
830247a4 2575S_regpiece(pTHX_ RExC_state_t *pRExC_state, I32 *flagp)
a687059c 2576{
c277df42 2577 register regnode *ret;
a0d0e21e
LW
2578 register char op;
2579 register char *next;
2580 I32 flags;
830247a4 2581 char *origparse = RExC_parse;
a0d0e21e
LW
2582 char *maxpos;
2583 I32 min;
c277df42 2584 I32 max = REG_INFTY;
fac92740 2585 char *parse_start;
a0d0e21e 2586
830247a4 2587 ret = regatom(pRExC_state, &flags);
a0d0e21e
LW
2588 if (ret == NULL) {
2589 if (flags & TRYAGAIN)
2590 *flagp |= TRYAGAIN;
2591 return(NULL);
2592 }
2593
830247a4 2594 op = *RExC_parse;
a0d0e21e 2595
830247a4 2596 if (op == '{' && regcurly(RExC_parse)) {
fac92740 2597 parse_start = RExC_parse; /* MJD */
830247a4 2598 next = RExC_parse + 1;
a0d0e21e
LW
2599 maxpos = Nullch;
2600 while (isDIGIT(*next) || *next == ',') {
2601 if (*next == ',') {
2602 if (maxpos)
2603 break;
2604 else
2605 maxpos = next;
a687059c 2606 }
a0d0e21e
LW
2607 next++;
2608 }
2609 if (*next == '}') { /* got one */
2610 if (!maxpos)
2611 maxpos = next;
830247a4
IZ
2612 RExC_parse++;
2613 min = atoi(RExC_parse);
a0d0e21e
LW
2614 if (*maxpos == ',')
2615 maxpos++;
2616 else
830247a4 2617 maxpos = RExC_parse;
a0d0e21e
LW
2618 max = atoi(maxpos);
2619 if (!max && *maxpos != '0')
c277df42
IZ
2620 max = REG_INFTY; /* meaning "infinity" */
2621 else if (max >= REG_INFTY)
8615cb43 2622 vFAIL2("Quantifier in {,} bigger than %d", REG_INFTY - 1);
830247a4
IZ
2623 RExC_parse = next;
2624 nextchar(pRExC_state);
a0d0e21e
LW
2625
2626 do_curly:
2627 if ((flags&SIMPLE)) {
830247a4
IZ
2628 RExC_naughty += 2 + RExC_naughty / 2;
2629 reginsert(pRExC_state, CURLY, ret);
fac92740
MJD
2630 Set_Node_Offset(ret, parse_start+1); /* MJD */
2631 Set_Node_Cur_Length(ret);
a0d0e21e
LW
2632 }
2633 else {
830247a4 2634 regnode *w = reg_node(pRExC_state, WHILEM);
2c2d71f5
JH
2635
2636 w->flags = 0;
830247a4
IZ
2637 regtail(pRExC_state, ret, w);
2638 if (!SIZE_ONLY && RExC_extralen) {
2639 reginsert(pRExC_state, LONGJMP,ret);
2640 reginsert(pRExC_state, NOTHING,ret);
c277df42
IZ
2641 NEXT_OFF(ret) = 3; /* Go over LONGJMP. */
2642 }
830247a4 2643 reginsert(pRExC_state, CURLYX,ret);
fac92740
MJD
2644 /* MJD hk */
2645 Set_Node_Offset(ret, parse_start+1);
2646 Set_Node_Length(ret,
2647 op == '{' ? (RExC_parse - parse_start) : 1);
2648
830247a4 2649 if (!SIZE_ONLY && RExC_extralen)
c277df42 2650 NEXT_OFF(ret) = 3; /* Go over NOTHING to LONGJMP. */
830247a4 2651 regtail(pRExC_state, ret, reg_node(pRExC_state, NOTHING));
c277df42 2652 if (SIZE_ONLY)
830247a4
IZ
2653 RExC_whilem_seen++, RExC_extralen += 3;
2654 RExC_naughty += 4 + RExC_naughty; /* compound interest */
a0d0e21e 2655 }
c277df42 2656 ret->flags = 0;
a0d0e21e
LW
2657
2658 if (min > 0)
821b33a5
IZ
2659 *flagp = WORST;
2660 if (max > 0)
2661 *flagp |= HASWIDTH;
a0d0e21e 2662 if (max && max < min)
8615cb43 2663 vFAIL("Can't do {n,m} with n > m");
c277df42
IZ
2664 if (!SIZE_ONLY) {
2665 ARG1_SET(ret, min);
2666 ARG2_SET(ret, max);
a687059c 2667 }
a687059c 2668
a0d0e21e 2669 goto nest_check;
a687059c 2670 }
a0d0e21e 2671 }
a687059c 2672
a0d0e21e
LW
2673 if (!ISMULT1(op)) {
2674 *flagp = flags;
a687059c 2675 return(ret);
a0d0e21e 2676 }
bb20fd44 2677
c277df42 2678#if 0 /* Now runtime fix should be reliable. */
b45f050a
JF
2679
2680 /* if this is reinstated, don't forget to put this back into perldiag:
2681
2682 =item Regexp *+ operand could be empty at {#} in regex m/%s/
2683
2684 (F) The part of the regexp subject to either the * or + quantifier
2685 could match an empty string. The {#} shows in the regular
2686 expression about where the problem was discovered.
2687
2688 */
2689
bb20fd44 2690 if (!(flags&HASWIDTH) && op != '?')
b45f050a 2691 vFAIL("Regexp *+ operand could be empty");
b81d288d 2692#endif
bb20fd44 2693
fac92740 2694 parse_start = RExC_parse;
830247a4 2695 nextchar(pRExC_state);
a0d0e21e 2696
821b33a5 2697 *flagp = (op != '+') ? (WORST|SPSTART|HASWIDTH) : (WORST|HASWIDTH);
a0d0e21e
LW
2698
2699 if (op == '*' && (flags&SIMPLE)) {
830247a4 2700 reginsert(pRExC_state, STAR, ret);
c277df42 2701 ret->flags = 0;
830247a4 2702 RExC_naughty += 4;
a0d0e21e
LW
2703 }
2704 else if (op == '*') {
2705 min = 0;
2706 goto do_curly;
a0ed51b3
LW
2707 }
2708 else if (op == '+' && (flags&SIMPLE)) {
830247a4 2709 reginsert(pRExC_state, PLUS, ret);
c277df42 2710 ret->flags = 0;
830247a4 2711 RExC_naughty += 3;
a0d0e21e
LW
2712 }
2713 else if (op == '+') {
2714 min = 1;
2715 goto do_curly;
a0ed51b3
LW
2716 }
2717 else if (op == '?') {
a0d0e21e
LW
2718 min = 0; max = 1;
2719 goto do_curly;
2720 }
2721 nest_check:
e476b1b5 2722 if (ckWARN(WARN_REGEXP) && !SIZE_ONLY && !(flags&HASWIDTH) && max > REG_INFTY/3) {
830247a4 2723 vWARN3(RExC_parse,
b45f050a 2724 "%.*s matches null string many times",
830247a4 2725 RExC_parse - origparse,
b45f050a 2726 origparse);
a0d0e21e
LW
2727 }
2728
830247a4
IZ
2729 if (*RExC_parse == '?') {
2730 nextchar(pRExC_state);
2731 reginsert(pRExC_state, MINMOD, ret);
2732 regtail(pRExC_state, ret, ret + NODE_STEP_REGNODE);
a0d0e21e 2733 }
830247a4
IZ
2734 if (ISMULT2(RExC_parse)) {
2735 RExC_parse++;
b45f050a
JF
2736 vFAIL("Nested quantifiers");
2737 }
a0d0e21e
LW
2738
2739 return(ret);
a687059c
LW
2740}
2741
2742/*
2743 - regatom - the lowest level
2744 *
2745 * Optimization: gobbles an entire sequence of ordinary characters so that
2746 * it can turn them into a single node, which is smaller to store and
2747 * faster to run. Backslashed characters are exceptions, each becoming a
2748 * separate node; the code is simpler that way and it's not worth fixing.
2749 *
b45f050a 2750 * [Yes, it is worth fixing, some scripts can run twice the speed.] */
76e3520e 2751STATIC regnode *
830247a4 2752S_regatom(pTHX_ RExC_state_t *pRExC_state, I32 *flagp)
a687059c 2753{
c277df42 2754 register regnode *ret = 0;
a0d0e21e 2755 I32 flags;
f06dbbb7 2756 char *parse_start = 0;
a0d0e21e
LW
2757
2758 *flagp = WORST; /* Tentatively. */
2759
2760tryagain:
830247a4 2761 switch (*RExC_parse) {
a0d0e21e 2762 case '^':
830247a4
IZ
2763 RExC_seen_zerolen++;
2764 nextchar(pRExC_state);
2765 if (RExC_flags16 & PMf_MULTILINE)
2766 ret = reg_node(pRExC_state, MBOL);
2767 else if (RExC_flags16 & PMf_SINGLELINE)
2768 ret = reg_node(pRExC_state, SBOL);
a0d0e21e 2769 else
830247a4 2770 ret = reg_node(pRExC_state, BOL);
fac92740 2771 Set_Node_Length(ret, 1); /* MJD */
a0d0e21e
LW
2772 break;
2773 case '$':
830247a4 2774 nextchar(pRExC_state);
b81d288d 2775 if (*RExC_parse)
830247a4
IZ
2776 RExC_seen_zerolen++;
2777 if (RExC_flags16 & PMf_MULTILINE)
2778 ret = reg_node(pRExC_state, MEOL);
2779 else if (RExC_flags16 & PMf_SINGLELINE)
2780 ret = reg_node(pRExC_state, SEOL);
a0d0e21e 2781 else
830247a4 2782 ret = reg_node(pRExC_state, EOL);
fac92740 2783 Set_Node_Length(ret, 1); /* MJD */
a0d0e21e
LW
2784 break;
2785 case '.':
830247a4 2786 nextchar(pRExC_state);
cce850e4 2787 if (RExC_flags16 & PMf_SINGLELINE)
ffc61ed2
JH
2788 ret = reg_node(pRExC_state, SANY);
2789 else
2790 ret = reg_node(pRExC_state, REG_ANY);
2791 *flagp |= HASWIDTH|SIMPLE;
830247a4 2792 RExC_naughty++;
fac92740 2793 Set_Node_Length(ret, 1); /* MJD */
a0d0e21e
LW
2794 break;
2795 case '[':
b45f050a 2796 {
830247a4 2797 char *oregcomp_parse = ++RExC_parse;
ffc61ed2 2798 ret = regclass(pRExC_state);
830247a4
IZ
2799 if (*RExC_parse != ']') {
2800 RExC_parse = oregcomp_parse;
b45f050a
JF
2801 vFAIL("Unmatched [");
2802 }
830247a4 2803 nextchar(pRExC_state);
a0d0e21e 2804 *flagp |= HASWIDTH|SIMPLE;
fac92740 2805 Set_Node_Length(ret, RExC_parse - oregcomp_parse + 1); /* MJD */
a0d0e21e 2806 break;
b45f050a 2807 }
a0d0e21e 2808 case '(':
830247a4
IZ
2809 nextchar(pRExC_state);
2810 ret = reg(pRExC_state, 1, &flags);
a0d0e21e 2811 if (ret == NULL) {
bf93d4cc 2812 if (flags & TRYAGAIN) {
830247a4 2813 if (RExC_parse == RExC_end) {
bf93d4cc
GS
2814 /* Make parent create an empty node if needed. */
2815 *flagp |= TRYAGAIN;
2816 return(NULL);
2817 }
a0d0e21e 2818 goto tryagain;
bf93d4cc 2819 }
a0d0e21e
LW
2820 return(NULL);
2821 }
c277df42 2822 *flagp |= flags&(HASWIDTH|SPSTART|SIMPLE);
a0d0e21e
LW
2823 break;
2824 case '|':
2825 case ')':
2826 if (flags & TRYAGAIN) {
2827 *flagp |= TRYAGAIN;
2828 return NULL;
2829 }
b45f050a 2830 vFAIL("Internal urp");
a0d0e21e
LW
2831 /* Supposed to be caught earlier. */
2832 break;
85afd4ae 2833 case '{':
830247a4
IZ
2834 if (!regcurly(RExC_parse)) {
2835 RExC_parse++;
85afd4ae
CS
2836 goto defchar;
2837 }
2838 /* FALL THROUGH */
a0d0e21e
LW
2839 case '?':
2840 case '+':
2841 case '*':
830247a4 2842 RExC_parse++;
b45f050a 2843 vFAIL("Quantifier follows nothing");
a0d0e21e
LW
2844 break;
2845 case '\\':
830247a4 2846 switch (*++RExC_parse) {
a0d0e21e 2847 case 'A':
830247a4
IZ
2848 RExC_seen_zerolen++;
2849 ret = reg_node(pRExC_state, SBOL);
a0d0e21e 2850 *flagp |= SIMPLE;
830247a4 2851 nextchar(pRExC_state);
fac92740 2852 Set_Node_Length(ret, 2); /* MJD */
a0d0e21e
LW
2853 break;
2854 case 'G':
830247a4
IZ
2855 ret = reg_node(pRExC_state, GPOS);
2856 RExC_seen |= REG_SEEN_GPOS;
a0d0e21e 2857 *flagp |= SIMPLE;
830247a4 2858 nextchar(pRExC_state);
fac92740 2859 Set_Node_Length(ret, 2); /* MJD */
a0d0e21e
LW
2860 break;
2861 case 'Z':
830247a4 2862 ret = reg_node(pRExC_state, SEOL);
a0d0e21e 2863 *flagp |= SIMPLE;
a1917ab9 2864 RExC_seen_zerolen++; /* Do not optimize RE away */
830247a4 2865 nextchar(pRExC_state);
a0d0e21e 2866 break;
b85d18e9 2867 case 'z':
830247a4 2868 ret = reg_node(pRExC_state, EOS);
b85d18e9 2869 *flagp |= SIMPLE;
830247a4
IZ
2870 RExC_seen_zerolen++; /* Do not optimize RE away */
2871 nextchar(pRExC_state);
fac92740 2872 Set_Node_Length(ret, 2); /* MJD */
b85d18e9 2873 break;
4a2d328f 2874 case 'C':
f33976b4
DB
2875 ret = reg_node(pRExC_state, CANY);
2876 RExC_seen |= REG_SEEN_CANY;
a0ed51b3 2877 *flagp |= HASWIDTH|SIMPLE;
830247a4 2878 nextchar(pRExC_state);
fac92740 2879 Set_Node_Length(ret, 2); /* MJD */
a0ed51b3
LW
2880 break;
2881 case 'X':
830247a4 2882 ret = reg_node(pRExC_state, CLUMP);
a0ed51b3 2883 *flagp |= HASWIDTH;
830247a4 2884 nextchar(pRExC_state);
fac92740 2885 Set_Node_Length(ret, 2); /* MJD */
a0ed51b3 2886 break;
a0d0e21e 2887 case 'w':
ffc61ed2 2888 ret = reg_node(pRExC_state, LOC ? ALNUML : ALNUM);
a0d0e21e 2889 *flagp |= HASWIDTH|SIMPLE;
830247a4 2890 nextchar(pRExC_state);
fac92740 2891 Set_Node_Length(ret, 2); /* MJD */
a0d0e21e
LW
2892 break;
2893 case 'W':
ffc61ed2 2894 ret = reg_node(pRExC_state, LOC ? NALNUML : NALNUM);
a0d0e21e 2895 *flagp |= HASWIDTH|SIMPLE;
830247a4 2896 nextchar(pRExC_state);
fac92740 2897 Set_Node_Length(ret, 2); /* MJD */
a0d0e21e
LW
2898 break;
2899 case 'b':
830247a4
IZ
2900 RExC_seen_zerolen++;
2901 RExC_seen |= REG_SEEN_LOOKBEHIND;
ffc61ed2 2902 ret = reg_node(pRExC_state, LOC ? BOUNDL : BOUND);
a0d0e21e 2903 *flagp |= SIMPLE;
830247a4 2904 nextchar(pRExC_state);
fac92740 2905 Set_Node_Length(ret, 2); /* MJD */
a0d0e21e
LW
2906 break;
2907 case 'B':
830247a4
IZ
2908 RExC_seen_zerolen++;
2909 RExC_seen |= REG_SEEN_LOOKBEHIND;
ffc61ed2 2910 ret = reg_node(pRExC_state, LOC ? NBOUNDL : NBOUND);
a0d0e21e 2911 *flagp |= SIMPLE;
830247a4 2912 nextchar(pRExC_state);
fac92740 2913 Set_Node_Length(ret, 2); /* MJD */
a0d0e21e
LW
2914 break;
2915 case 's':
ffc61ed2 2916 ret = reg_node(pRExC_state, LOC ? SPACEL : SPACE);
a0d0e21e 2917 *flagp |= HASWIDTH|SIMPLE;
830247a4 2918 nextchar(pRExC_state);
fac92740 2919 Set_Node_Length(ret, 2); /* MJD */
a0d0e21e
LW
2920 break;
2921 case 'S':
ffc61ed2 2922 ret = reg_node(pRExC_state, LOC ? NSPACEL : NSPACE);
a0d0e21e 2923 *flagp |= HASWIDTH|SIMPLE;
830247a4 2924 nextchar(pRExC_state);
fac92740 2925 Set_Node_Length(ret, 2); /* MJD */
a0d0e21e
LW
2926 break;
2927 case 'd':
ffc61ed2 2928 ret = reg_node(pRExC_state, DIGIT);
a0d0e21e 2929 *flagp |= HASWIDTH|SIMPLE;
830247a4 2930 nextchar(pRExC_state);
fac92740 2931 Set_Node_Length(ret, 2); /* MJD */
a0d0e21e
LW
2932 break;
2933 case 'D':
ffc61ed2 2934 ret = reg_node(pRExC_state, NDIGIT);
a0d0e21e 2935 *flagp |= HASWIDTH|SIMPLE;
830247a4 2936 nextchar(pRExC_state);
fac92740 2937 Set_Node_Length(ret, 2); /* MJD */
a0d0e21e 2938 break;
a14b48bc
LW
2939 case 'p':
2940 case 'P':
3568d838 2941 {
830247a4 2942 char* oldregxend = RExC_end;
fac92740 2943 char* parse_start = RExC_parse;
a14b48bc 2944
830247a4 2945 if (RExC_parse[1] == '{') {
3568d838 2946 /* a lovely hack--pretend we saw [\pX] instead */
830247a4
IZ
2947 RExC_end = strchr(RExC_parse, '}');
2948 if (!RExC_end) {
0da60cf5 2949 U8 c = (U8)*RExC_parse;
830247a4
IZ
2950 RExC_parse += 2;
2951 RExC_end = oldregxend;
0da60cf5 2952 vFAIL2("Missing right brace on \\%c{}", c);
b45f050a 2953 }
830247a4 2954 RExC_end++;
a14b48bc
LW
2955 }
2956 else
830247a4
IZ
2957 RExC_end = RExC_parse + 2;
2958 RExC_parse--;
a14b48bc 2959
ffc61ed2 2960 ret = regclass(pRExC_state);
a14b48bc 2961
830247a4
IZ
2962 RExC_end = oldregxend;
2963 RExC_parse--;
fac92740 2964 Set_Node_Cur_Length(ret); /* MJD */
830247a4 2965 nextchar(pRExC_state);
a14b48bc
LW
2966 *flagp |= HASWIDTH|SIMPLE;
2967 }
2968 break;
a0d0e21e
LW
2969 case 'n':
2970 case 'r':
2971 case 't':
2972 case 'f':
2973 case 'e':
2974 case 'a':
2975 case 'x':
2976 case 'c':
2977 case '0':
2978 goto defchar;
2979 case '1': case '2': case '3': case '4':
2980 case '5': case '6': case '7': case '8': case '9':
2981 {
830247a4 2982 I32 num = atoi(RExC_parse);
a0d0e21e 2983
830247a4 2984 if (num > 9 && num >= RExC_npar)
a0d0e21e
LW
2985 goto defchar;
2986 else {
fac92740 2987 char * parse_start = RExC_parse - 1; /* MJD */
830247a4
IZ
2988 while (isDIGIT(*RExC_parse))
2989 RExC_parse++;
b45f050a 2990
830247a4 2991 if (!SIZE_ONLY && num > RExC_rx->nparens)
9baa0206 2992 vFAIL("Reference to nonexistent group");
830247a4
IZ
2993 RExC_sawback = 1;
2994 ret = reganode(pRExC_state, FOLD
a0ed51b3 2995 ? (LOC ? REFFL : REFF)
c8756f30 2996 : REF, num);
a0d0e21e 2997 *flagp |= HASWIDTH;
fac92740
MJD
2998
2999 /* override incorrect value set in reganode MJD */
3000 Set_Node_Offset(ret, parse_start+1);
3001 Set_Node_Cur_Length(ret); /* MJD */
830247a4
IZ
3002 RExC_parse--;
3003 nextchar(pRExC_state);
a0d0e21e
LW
3004 }
3005 }
3006 break;
3007 case '\0':
830247a4 3008 if (RExC_parse >= RExC_end)
b45f050a 3009 FAIL("Trailing \\");
a0d0e21e
LW
3010 /* FALL THROUGH */
3011 default:
c9f97d15
IZ
3012 /* Do not generate `unrecognized' warnings here, we fall
3013 back into the quick-grab loop below */
a0d0e21e
LW
3014 goto defchar;
3015 }
3016 break;
4633a7c4
LW
3017
3018 case '#':
830247a4
IZ
3019 if (RExC_flags16 & PMf_EXTENDED) {
3020 while (RExC_parse < RExC_end && *RExC_parse != '\n') RExC_parse++;
3021 if (RExC_parse < RExC_end)
4633a7c4
LW
3022 goto tryagain;
3023 }
3024 /* FALL THROUGH */
3025
a0d0e21e 3026 default: {
ba210ebe 3027 register STRLEN len;
a0ed51b3 3028 register UV ender;
a0d0e21e 3029 register char *p;
c277df42 3030 char *oldp, *s;
ba210ebe 3031 STRLEN numlen;
80aecb99 3032 STRLEN foldlen;
60a8b682 3033 U8 tmpbuf[UTF8_MAXLEN_FOLD+1], *foldbuf;
f06dbbb7
JH
3034
3035 parse_start = RExC_parse - 1;
a0d0e21e 3036
830247a4 3037 RExC_parse++;
a0d0e21e
LW
3038
3039 defchar:
830247a4 3040 ret = reg_node(pRExC_state, FOLD
a0ed51b3 3041 ? (LOC ? EXACTFL : EXACTF)
bbce6d69 3042 : EXACT);
cd439c50 3043 s = STRING(ret);
830247a4
IZ
3044 for (len = 0, p = RExC_parse - 1;
3045 len < 127 && p < RExC_end;
a0d0e21e
LW
3046 len++)
3047 {
3048 oldp = p;
5b5a24f7 3049
830247a4
IZ
3050 if (RExC_flags16 & PMf_EXTENDED)
3051 p = regwhite(p, RExC_end);
a0d0e21e
LW
3052 switch (*p) {
3053 case '^':
3054 case '$':
3055 case '.':
3056 case '[':
3057 case '(':
3058 case ')':
3059 case '|':
3060 goto loopdone;
3061 case '\\':
3062 switch (*++p) {
3063 case 'A':
1ed8eac0
JF
3064 case 'C':
3065 case 'X':
a0d0e21e
LW
3066 case 'G':
3067 case 'Z':
b85d18e9 3068 case 'z':
a0d0e21e
LW
3069 case 'w':
3070 case 'W':
3071 case 'b':
3072 case 'B':
3073 case 's':
3074 case 'S':
3075 case 'd':
3076 case 'D':
a14b48bc
LW
3077 case 'p':
3078 case 'P':
a0d0e21e
LW
3079 --p;
3080 goto loopdone;
3081 case 'n':
3082 ender = '\n';
3083 p++;
a687059c 3084 break;
a0d0e21e
LW
3085 case 'r':
3086 ender = '\r';
3087 p++;
a687059c 3088 break;
a0d0e21e
LW
3089 case 't':
3090 ender = '\t';
3091 p++;
a687059c 3092 break;
a0d0e21e
LW
3093 case 'f':
3094 ender = '\f';
3095 p++;
a687059c 3096 break;
a0d0e21e 3097 case 'e':
c7f1f016 3098 ender = ASCII_TO_NATIVE('\033');
a0d0e21e 3099 p++;
a687059c 3100 break;
a0d0e21e 3101 case 'a':
c7f1f016 3102 ender = ASCII_TO_NATIVE('\007');
a0d0e21e 3103 p++;
a687059c 3104 break;
a0d0e21e 3105 case 'x':
a0ed51b3
LW
3106 if (*++p == '{') {
3107 char* e = strchr(p, '}');
b81d288d 3108
b45f050a 3109 if (!e) {
830247a4 3110 RExC_parse = p + 1;
b45f050a
JF
3111 vFAIL("Missing right brace on \\x{}");
3112 }
de5f0749 3113 else {
a4c04bdc
NC
3114 I32 flags = PERL_SCAN_ALLOW_UNDERSCORES
3115 | PERL_SCAN_DISALLOW_PREFIX;
53305cf1
NC
3116 numlen = e - p - 1;
3117 ender = grok_hex(p + 1, &numlen, &flags, NULL);
aaa80028
JH
3118 if (ender > 0xff)
3119 RExC_utf8 = 1;
b21ed0a9
GS
3120 /* numlen is generous */
3121 if (numlen + len >= 127) {
a0ed51b3
LW
3122 p--;
3123 goto loopdone;
3124 }
3125 p = e + 1;
3126 }
a0ed51b3
LW
3127 }
3128 else {
a4c04bdc 3129 I32 flags = PERL_SCAN_DISALLOW_PREFIX;
53305cf1
NC
3130 numlen = 2;
3131 ender = grok_hex(p, &numlen, &flags, NULL);
a0ed51b3
LW
3132 p += numlen;
3133 }
a687059c 3134 break;
a0d0e21e
LW
3135 case 'c':
3136 p++;
bbce6d69 3137 ender = UCHARAT(p++);
3138 ender = toCTRL(ender);
a687059c 3139 break;
a0d0e21e
LW
3140 case '0': case '1': case '2': case '3':case '4':
3141 case '5': case '6': case '7': case '8':case '9':
3142 if (*p == '0' ||
830247a4 3143 (isDIGIT(p[1]) && atoi(p) >= RExC_npar) ) {
53305cf1
NC
3144 I32 flags = 0;
3145 numlen = 3;
3146 ender = grok_oct(p, &numlen, &flags, NULL);
a0d0e21e
LW
3147 p += numlen;
3148 }
3149 else {
3150 --p;
3151 goto loopdone;
a687059c
LW
3152 }
3153 break;
a0d0e21e 3154 case '\0':
830247a4 3155 if (p >= RExC_end)
b45f050a 3156 FAIL("Trailing \\");
a687059c 3157 /* FALL THROUGH */
a0d0e21e 3158 default:
e476b1b5 3159 if (!SIZE_ONLY && ckWARN(WARN_REGEXP) && isALPHA(*p))
4193bef7 3160 vWARN2(p + 1, "Unrecognized escape \\%c passed through", UCHARAT(p));
a0ed51b3 3161 goto normal_default;
a0d0e21e
LW
3162 }
3163 break;
a687059c 3164 default:
a0ed51b3 3165 normal_default:
fd400ab9 3166 if (UTF8_IS_START(*p) && UTF) {
5e12f4fb 3167 ender = utf8n_to_uvchr((U8*)p, RExC_end - p,
ba210ebe 3168 &numlen, 0);
a0ed51b3
LW
3169 p += numlen;
3170 }
3171 else
3172 ender = *p++;
a0d0e21e 3173 break;
a687059c 3174 }
830247a4
IZ
3175 if (RExC_flags16 & PMf_EXTENDED)
3176 p = regwhite(p, RExC_end);
60a8b682
JH
3177 if (UTF && FOLD) {
3178 /* Prime the casefolded buffer. */
ac7e0132 3179 ender = toFOLD_uni(ender, tmpbuf, &foldlen);
60a8b682 3180 }
a0d0e21e
LW
3181 if (ISMULT2(p)) { /* Back off on ?+*. */
3182 if (len)
3183 p = oldp;
16ea2a2e 3184 else if (UTF) {
80aecb99 3185 if (FOLD) {
60a8b682 3186 /* Emit all the Unicode characters. */
80aecb99
JH
3187 for (foldbuf = tmpbuf;
3188 foldlen;
3189 foldlen -= numlen) {
3190 ender = utf8_to_uvchr(foldbuf, &numlen);
9dc45d57
JH
3191 if (numlen > 0) {
3192 reguni(pRExC_state, ender, s, &numlen);
3193 s += numlen;
3194 len += numlen;
3195 foldbuf += numlen;
47654450
JH
3196 if (numlen >= foldlen)
3197 break;
9dc45d57
JH
3198 }
3199 else
3200 break; /* "Can't happen." */
80aecb99
JH
3201 }
3202 }
3203 else {
3204 reguni(pRExC_state, ender, s, &numlen);
9dc45d57
JH
3205 if (numlen > 0) {
3206 s += numlen;
3207 len += numlen;
3208 }
80aecb99 3209 }
a0ed51b3 3210 }
a0d0e21e
LW
3211 else {
3212 len++;
cd439c50 3213 REGC(ender, s++);
a0d0e21e
LW
3214 }
3215 break;
a687059c 3216 }
16ea2a2e 3217 if (UTF) {
80aecb99 3218 if (FOLD) {
60a8b682 3219 /* Emit all the Unicode characters. */
80aecb99
JH
3220 for (foldbuf = tmpbuf;
3221 foldlen;
3222 foldlen -= numlen) {
3223 ender = utf8_to_uvchr(foldbuf, &numlen);
9dc45d57
JH
3224 if (numlen > 0) {
3225 reguni(pRExC_state, ender, s, &numlen);
9dc45d57 3226 len += numlen;
47654450 3227 s += numlen;
9dc45d57 3228 foldbuf += numlen;
47654450
JH
3229 if (numlen >= foldlen)
3230 break;
9dc45d57
JH
3231 }
3232 else
3233 break;
80aecb99
JH
3234 }
3235 }
3236 else {
3237 reguni(pRExC_state, ender, s, &numlen);
9dc45d57
JH
3238 if (numlen > 0) {
3239 s += numlen;
3240 len += numlen;
3241 }
80aecb99
JH
3242 }
3243 len--;
a0ed51b3
LW
3244 }
3245 else
cd439c50 3246 REGC(ender, s++);
a0d0e21e
LW
3247 }
3248 loopdone:
830247a4 3249 RExC_parse = p - 1;
fac92740 3250 Set_Node_Cur_Length(ret); /* MJD */
830247a4 3251 nextchar(pRExC_state);
793db0cb
JH
3252 {
3253 /* len is STRLEN which is unsigned, need to copy to signed */
3254 IV iv = len;
3255 if (iv < 0)
3256 vFAIL("Internal disaster");
3257 }
a0d0e21e
LW
3258 if (len > 0)
3259 *flagp |= HASWIDTH;
3260 if (len == 1)
3261 *flagp |= SIMPLE;
c277df42 3262 if (!SIZE_ONLY)
cd439c50
IZ
3263 STR_LEN(ret) = len;
3264 if (SIZE_ONLY)
830247a4 3265 RExC_size += STR_SZ(len);
cd439c50 3266 else
830247a4 3267 RExC_emit += STR_SZ(len);
a687059c 3268 }
a0d0e21e
LW
3269 break;
3270 }
a687059c 3271
60a8b682
JH
3272 /* If the encoding pragma is in effect recode the text of
3273 * any EXACT-kind nodes. */
22c54be3 3274 if (PL_encoding && PL_regkind[(U8)OP(ret)] == EXACT) {
a72c7584
JH
3275 STRLEN oldlen = STR_LEN(ret);
3276 SV *sv = sv_2mortal(newSVpvn(STRING(ret), oldlen));
22c54be3
JH
3277
3278 if (RExC_utf8)
3279 SvUTF8_on(sv);
3280 if (sv_utf8_downgrade(sv, TRUE)) {
3281 char *s = Perl_sv_recode_to_utf8(aTHX_ sv, PL_encoding);
3282 STRLEN newlen = SvCUR(sv);
3283
3284 if (!SIZE_ONLY) {
3285 DEBUG_r(PerlIO_printf(Perl_debug_log, "recode %*s to %*s\n",
3286 (int)oldlen, STRING(ret),
3287 (int)newlen, s));
3288 Copy(s, STRING(ret), newlen, char);
3289 STR_LEN(ret) += newlen - oldlen;
3290 RExC_emit += STR_SZ(newlen) - STR_SZ(oldlen);
3291 } else
3292 RExC_size += STR_SZ(newlen) - STR_SZ(oldlen);
3293 }
a72c7584
JH
3294 }
3295
a0d0e21e 3296 return(ret);
a687059c
LW
3297}
3298
873ef191 3299STATIC char *
cea2e8a9 3300S_regwhite(pTHX_ char *p, char *e)
5b5a24f7
CS
3301{
3302 while (p < e) {
3303 if (isSPACE(*p))
3304 ++p;
3305 else if (*p == '#') {
3306 do {
3307 p++;
3308 } while (p < e && *p != '\n');
3309 }
3310 else
3311 break;
3312 }
3313 return p;
3314}
3315
b8c5462f
JH
3316/* Parse POSIX character classes: [[:foo:]], [[=foo=]], [[.foo.]].
3317 Character classes ([:foo:]) can also be negated ([:^foo:]).
3318 Returns a named class id (ANYOF_XXX) if successful, -1 otherwise.
3319 Equivalence classes ([=foo=]) and composites ([.foo.]) are parsed,
beeb77fc 3320 but trigger failures because they are currently unimplemented. */
9a86a77b
JH
3321
3322#define POSIXCC_DONE(c) ((c) == ':')
3323#define POSIXCC_NOTYET(c) ((c) == '=' || (c) == '.')
3324#define POSIXCC(c) (POSIXCC_DONE(c) || POSIXCC_NOTYET(c))
3325
b8c5462f 3326STATIC I32
830247a4 3327S_regpposixcc(pTHX_ RExC_state_t *pRExC_state, I32 value)
620e46c5
JH
3328{
3329 char *posixcc = 0;
936ed897 3330 I32 namedclass = OOB_NAMEDCLASS;
620e46c5 3331
830247a4 3332 if (value == '[' && RExC_parse + 1 < RExC_end &&
620e46c5 3333 /* I smell either [: or [= or [. -- POSIX has been here, right? */
9a86a77b
JH
3334 POSIXCC(UCHARAT(RExC_parse))) {
3335 char c = UCHARAT(RExC_parse);
830247a4 3336 char* s = RExC_parse++;
b81d288d 3337
9a86a77b 3338 while (RExC_parse < RExC_end && UCHARAT(RExC_parse) != c)
830247a4
IZ
3339 RExC_parse++;
3340 if (RExC_parse == RExC_end)
620e46c5 3341 /* Grandfather lone [:, [=, [. */
830247a4 3342 RExC_parse = s;
620e46c5 3343 else {
830247a4 3344 char* t = RExC_parse++; /* skip over the c */
b8c5462f 3345
9a86a77b 3346 if (UCHARAT(RExC_parse) == ']') {
830247a4 3347 RExC_parse++; /* skip over the ending ] */
b8c5462f
JH
3348 posixcc = s + 1;
3349 if (*s == ':') {
3350 I32 complement = *posixcc == '^' ? *posixcc++ : 0;
3351 I32 skip = 5; /* the most common skip */
3352
3353 switch (*posixcc) {
3354 case 'a':
3355 if (strnEQ(posixcc, "alnum", 5))
3356 namedclass =
3357 complement ? ANYOF_NALNUMC : ANYOF_ALNUMC;
3358 else if (strnEQ(posixcc, "alpha", 5))
3359 namedclass =
3360 complement ? ANYOF_NALPHA : ANYOF_ALPHA;
3361 else if (strnEQ(posixcc, "ascii", 5))
3362 namedclass =
3363 complement ? ANYOF_NASCII : ANYOF_ASCII;
3364 break;
aaa51d5e
JF
3365 case 'b':
3366 if (strnEQ(posixcc, "blank", 5))
3367 namedclass =
3368 complement ? ANYOF_NBLANK : ANYOF_BLANK;
3369 break;
b8c5462f
JH
3370 case 'c':
3371 if (strnEQ(posixcc, "cntrl", 5))
3372 namedclass =
3373 complement ? ANYOF_NCNTRL : ANYOF_CNTRL;
3374 break;
3375 case 'd':
3376 if (strnEQ(posixcc, "digit", 5))
3377 namedclass =
3378 complement ? ANYOF_NDIGIT : ANYOF_DIGIT;
3379 break;
3380 case 'g':
3381 if (strnEQ(posixcc, "graph", 5))
3382 namedclass =
3383 complement ? ANYOF_NGRAPH : ANYOF_GRAPH;
3384 break;
3385 case 'l':
3386 if (strnEQ(posixcc, "lower", 5))
3387 namedclass =
3388 complement ? ANYOF_NLOWER : ANYOF_LOWER;
3389 break;
3390 case 'p':
3391 if (strnEQ(posixcc, "print", 5))
3392 namedclass =
3393 complement ? ANYOF_NPRINT : ANYOF_PRINT;
3394 else if (strnEQ(posixcc, "punct", 5))
3395 namedclass =
3396 complement ? ANYOF_NPUNCT : ANYOF_PUNCT;
3397 break;
3398 case 's':
3399 if (strnEQ(posixcc, "space", 5))
3400 namedclass =
aaa51d5e 3401 complement ? ANYOF_NPSXSPC : ANYOF_PSXSPC;
cc4319de 3402 break;
b8c5462f
JH
3403 case 'u':
3404 if (strnEQ(posixcc, "upper", 5))
3405 namedclass =
3406 complement ? ANYOF_NUPPER : ANYOF_UPPER;
3407 break;
3408 case 'w': /* this is not POSIX, this is the Perl \w */
3409 if (strnEQ(posixcc, "word", 4)) {
3410 namedclass =
3411 complement ? ANYOF_NALNUM : ANYOF_ALNUM;
3412 skip = 4;
3413 }
3414 break;
3415 case 'x':
3416 if (strnEQ(posixcc, "xdigit", 6)) {
3417 namedclass =
3418 complement ? ANYOF_NXDIGIT : ANYOF_XDIGIT;
3419 skip = 6;
3420 }
3421 break;
3422 }
ac561586
JH
3423 if (namedclass == OOB_NAMEDCLASS ||
3424 posixcc[skip] != ':' ||
3425 posixcc[skip+1] != ']')
b45f050a
JF
3426 {
3427 Simple_vFAIL3("POSIX class [:%.*s:] unknown",
3428 t - s - 1, s + 1);
3429 }
3430 } else if (!SIZE_ONLY) {
b8c5462f 3431 /* [[=foo=]] and [[.foo.]] are still future. */
b45f050a 3432
830247a4 3433 /* adjust RExC_parse so the warning shows after
b45f050a 3434 the class closes */
9a86a77b 3435 while (UCHARAT(RExC_parse) && UCHARAT(RExC_parse) != ']')
830247a4 3436 RExC_parse++;
b45f050a
JF
3437 Simple_vFAIL3("POSIX syntax [%c %c] is reserved for future extensions", c, c);
3438 }
b8c5462f
JH
3439 } else {
3440 /* Maternal grandfather:
3441 * "[:" ending in ":" but not in ":]" */
830247a4 3442 RExC_parse = s;
767d463e 3443 }
620e46c5
JH
3444 }
3445 }
3446
b8c5462f
JH
3447 return namedclass;
3448}
3449
3450STATIC void
830247a4 3451S_checkposixcc(pTHX_ RExC_state_t *pRExC_state)
b8c5462f 3452{
b938889d 3453 if (!SIZE_ONLY && POSIXCC(UCHARAT(RExC_parse))) {
830247a4 3454 char *s = RExC_parse;
93733859 3455 char c = *s++;
b8c5462f
JH
3456
3457 while(*s && isALNUM(*s))
3458 s++;
3459 if (*s && c == *s && s[1] == ']') {
b45f050a
JF
3460 vWARN3(s+2, "POSIX syntax [%c %c] belongs inside character classes", c, c);
3461
3462 /* [[=foo=]] and [[.foo.]] are still future. */
9a86a77b 3463 if (POSIXCC_NOTYET(c)) {
830247a4 3464 /* adjust RExC_parse so the error shows after
b45f050a 3465 the class closes */
9a86a77b 3466 while (UCHARAT(RExC_parse) && UCHARAT(RExC_parse++) != ']')
b45f050a
JF
3467 ;
3468 Simple_vFAIL3("POSIX syntax [%c %c] is reserved for future extensions", c, c);
3469 }
b8c5462f
JH
3470 }
3471 }
620e46c5
JH
3472}
3473
76e3520e 3474STATIC regnode *
830247a4 3475S_regclass(pTHX_ RExC_state_t *pRExC_state)
a687059c 3476{
ffc61ed2 3477 register UV value;
9a86a77b 3478 register UV nextvalue;
3568d838 3479 register IV prevvalue = OOB_UNICODE;
ffc61ed2 3480 register IV range = 0;
c277df42 3481 register regnode *ret;
ba210ebe 3482 STRLEN numlen;
ffc61ed2 3483 IV namedclass;
9c5ffd7c 3484 char *rangebegin = 0;
936ed897 3485 bool need_class = 0;
9c5ffd7c 3486 SV *listsv = Nullsv;
ffc61ed2
JH
3487 register char *e;
3488 UV n;
9e55ce06
JH
3489 bool optimize_invert = TRUE;
3490 AV* unicode_alternate = 0;
ffc61ed2
JH
3491
3492 ret = reganode(pRExC_state, ANYOF, 0);
3493
3494 if (!SIZE_ONLY)
3495 ANYOF_FLAGS(ret) = 0;
3496
9a86a77b 3497 if (UCHARAT(RExC_parse) == '^') { /* Complement of range. */
ffc61ed2
JH
3498 RExC_naughty++;
3499 RExC_parse++;
3500 if (!SIZE_ONLY)
3501 ANYOF_FLAGS(ret) |= ANYOF_INVERT;
3502 }
a0d0e21e 3503
936ed897 3504 if (SIZE_ONLY)
830247a4 3505 RExC_size += ANYOF_SKIP;
936ed897 3506 else {
830247a4 3507 RExC_emit += ANYOF_SKIP;
936ed897
IZ
3508 if (FOLD)
3509 ANYOF_FLAGS(ret) |= ANYOF_FOLD;
3510 if (LOC)
3511 ANYOF_FLAGS(ret) |= ANYOF_LOCALE;
ffc61ed2
JH
3512 ANYOF_BITMAP_ZERO(ret);
3513 listsv = newSVpvn("# comment\n", 10);
a0d0e21e 3514 }
b8c5462f 3515
9a86a77b
JH
3516 nextvalue = RExC_parse < RExC_end ? UCHARAT(RExC_parse) : 0;
3517
b938889d 3518 if (!SIZE_ONLY && POSIXCC(nextvalue))
830247a4 3519 checkposixcc(pRExC_state);
b8c5462f 3520
f064b6ad
HS
3521 /* allow 1st char to be ] (allowing it to be - is dealt with later) */
3522 if (UCHARAT(RExC_parse) == ']')
3523 goto charclassloop;
ffc61ed2 3524
9a86a77b 3525 while (RExC_parse < RExC_end && UCHARAT(RExC_parse) != ']') {
ffc61ed2
JH
3526
3527 charclassloop:
3528
3529 namedclass = OOB_NAMEDCLASS; /* initialize as illegal */
3530
73b437c8 3531 if (!range)
830247a4 3532 rangebegin = RExC_parse;
ffc61ed2 3533 if (UTF) {
5e12f4fb 3534 value = utf8n_to_uvchr((U8*)RExC_parse,
3568d838
JH
3535 RExC_end - RExC_parse,
3536 &numlen, 0);
ffc61ed2
JH
3537 RExC_parse += numlen;
3538 }
3539 else
3540 value = UCHARAT(RExC_parse++);
9a86a77b
JH
3541 nextvalue = RExC_parse < RExC_end ? UCHARAT(RExC_parse) : 0;
3542 if (value == '[' && POSIXCC(nextvalue))
830247a4 3543 namedclass = regpposixcc(pRExC_state, value);
620e46c5 3544 else if (value == '\\') {
ffc61ed2 3545 if (UTF) {
5e12f4fb 3546 value = utf8n_to_uvchr((U8*)RExC_parse,
ffc61ed2
JH
3547 RExC_end - RExC_parse,
3548 &numlen, 0);
3549 RExC_parse += numlen;
3550 }
3551 else
3552 value = UCHARAT(RExC_parse++);
470c3474 3553 /* Some compilers cannot handle switching on 64-bit integer
ffc61ed2 3554 * values, therefore value cannot be an UV. Yes, this will
e2962f66
JH
3555 * be a problem later if we want switch on Unicode.
3556 * A similar issue a little bit later when switching on
3557 * namedclass. --jhi */
ffc61ed2 3558 switch ((I32)value) {
b8c5462f
JH
3559 case 'w': namedclass = ANYOF_ALNUM; break;
3560 case 'W': namedclass = ANYOF_NALNUM; break;
3561 case 's': namedclass = ANYOF_SPACE; break;
3562 case 'S': namedclass = ANYOF_NSPACE; break;
3563 case 'd': namedclass = ANYOF_DIGIT; break;
3564 case 'D': namedclass = ANYOF_NDIGIT; break;
ffc61ed2
JH
3565 case 'p':
3566 case 'P':
3567 if (*RExC_parse == '{') {
0da60cf5 3568 U8 c = (U8)value;
ffc61ed2
JH
3569 e = strchr(RExC_parse++, '}');
3570 if (!e)
0da60cf5 3571 vFAIL2("Missing right brace on \\%c{}", c);
ab13f0c7
JH
3572 while (isSPACE(UCHARAT(RExC_parse)))
3573 RExC_parse++;
3574 if (e == RExC_parse)
0da60cf5 3575 vFAIL2("Empty \\%c{}", c);
ffc61ed2 3576 n = e - RExC_parse;
ab13f0c7
JH
3577 while (isSPACE(UCHARAT(RExC_parse + n - 1)))
3578 n--;
ffc61ed2
JH
3579 }
3580 else {
3581 e = RExC_parse;
3582 n = 1;
3583 }
3584 if (!SIZE_ONLY) {
ab13f0c7
JH
3585 if (UCHARAT(RExC_parse) == '^') {
3586 RExC_parse++;
3587 n--;
3588 value = value == 'p' ? 'P' : 'p'; /* toggle */
3589 while (isSPACE(UCHARAT(RExC_parse))) {
3590 RExC_parse++;
3591 n--;
3592 }
3593 }
ffc61ed2 3594 if (value == 'p')
ab13f0c7
JH
3595 Perl_sv_catpvf(aTHX_ listsv,
3596 "+utf8::%.*s\n", (int)n, RExC_parse);
ffc61ed2 3597 else
ab13f0c7
JH
3598 Perl_sv_catpvf(aTHX_ listsv,
3599 "!utf8::%.*s\n", (int)n, RExC_parse);
ffc61ed2
JH
3600 }
3601 RExC_parse = e + 1;
3602 ANYOF_FLAGS(ret) |= ANYOF_UNICODE;
3603 continue;
b8c5462f
JH
3604 case 'n': value = '\n'; break;
3605 case 'r': value = '\r'; break;
3606 case 't': value = '\t'; break;
3607 case 'f': value = '\f'; break;
3608 case 'b': value = '\b'; break;
c7f1f016
NIS
3609 case 'e': value = ASCII_TO_NATIVE('\033');break;
3610 case 'a': value = ASCII_TO_NATIVE('\007');break;
b8c5462f 3611 case 'x':
ffc61ed2 3612 if (*RExC_parse == '{') {
a4c04bdc
NC
3613 I32 flags = PERL_SCAN_ALLOW_UNDERSCORES
3614 | PERL_SCAN_DISALLOW_PREFIX;
ffc61ed2 3615 e = strchr(RExC_parse++, '}');
b81d288d 3616 if (!e)
ffc61ed2 3617 vFAIL("Missing right brace on \\x{}");
53305cf1
NC
3618
3619 numlen = e - RExC_parse;
3620 value = grok_hex(RExC_parse, &numlen, &flags, NULL);
ffc61ed2
JH
3621 RExC_parse = e + 1;
3622 }
3623 else {
a4c04bdc 3624 I32 flags = PERL_SCAN_DISALLOW_PREFIX;
53305cf1
NC
3625 numlen = 2;
3626 value = grok_hex(RExC_parse, &numlen, &flags, NULL);
ffc61ed2
JH
3627 RExC_parse += numlen;
3628 }
b8c5462f
JH
3629 break;
3630 case 'c':
830247a4 3631 value = UCHARAT(RExC_parse++);
b8c5462f
JH
3632 value = toCTRL(value);
3633 break;
3634 case '0': case '1': case '2': case '3': case '4':
3635 case '5': case '6': case '7': case '8': case '9':
53305cf1
NC
3636 {
3637 I32 flags = 0;
3638 numlen = 3;
3639 value = grok_oct(--RExC_parse, &numlen, &flags, NULL);
830247a4 3640 RExC_parse += numlen;
b8c5462f 3641 break;
53305cf1 3642 }
1028017a 3643 default:
e476b1b5 3644 if (!SIZE_ONLY && ckWARN(WARN_REGEXP) && isALPHA(value))
ffc61ed2
JH
3645 vWARN2(RExC_parse,
3646 "Unrecognized escape \\%c in character class passed through",
3647 (int)value);
1028017a 3648 break;
b8c5462f 3649 }
ffc61ed2
JH
3650 } /* end of \blah */
3651
3652 if (namedclass > OOB_NAMEDCLASS) { /* this is a named class \blah */
3653
3654 if (!SIZE_ONLY && !need_class)
936ed897 3655 ANYOF_CLASS_ZERO(ret);
ffc61ed2 3656
936ed897 3657 need_class = 1;
ffc61ed2
JH
3658
3659 /* a bad range like a-\d, a-[:digit:] ? */
3660 if (range) {
73b437c8 3661 if (!SIZE_ONLY) {
e476b1b5 3662 if (ckWARN(WARN_REGEXP))
830247a4 3663 vWARN4(RExC_parse,
b45f050a 3664 "False [] range \"%*.*s\"",
830247a4
IZ
3665 RExC_parse - rangebegin,
3666 RExC_parse - rangebegin,
b45f050a 3667 rangebegin);
3568d838
JH
3668 if (prevvalue < 256) {
3669 ANYOF_BITMAP_SET(ret, prevvalue);
ffc61ed2
JH
3670 ANYOF_BITMAP_SET(ret, '-');
3671 }
3672 else {
3673 ANYOF_FLAGS(ret) |= ANYOF_UNICODE;
3674 Perl_sv_catpvf(aTHX_ listsv,
3568d838 3675 "%04"UVxf"\n%04"UVxf"\n", (UV)prevvalue, (UV) '-');
ffc61ed2 3676 }
b8c5462f 3677 }
ffc61ed2
JH
3678
3679 range = 0; /* this was not a true range */
73b437c8 3680 }
ffc61ed2 3681
73b437c8 3682 if (!SIZE_ONLY) {
3568d838
JH
3683 if (namedclass > OOB_NAMEDCLASS)
3684 optimize_invert = FALSE;
e2962f66
JH
3685 /* Possible truncation here but in some 64-bit environments
3686 * the compiler gets heartburn about switch on 64-bit values.
3687 * A similar issue a little earlier when switching on value.
98f323fa 3688 * --jhi */
e2962f66 3689 switch ((I32)namedclass) {
73b437c8
JH
3690 case ANYOF_ALNUM:
3691 if (LOC)
936ed897 3692 ANYOF_CLASS_SET(ret, ANYOF_ALNUM);
73b437c8
JH
3693 else {
3694 for (value = 0; value < 256; value++)
3695 if (isALNUM(value))
936ed897 3696 ANYOF_BITMAP_SET(ret, value);
73b437c8 3697 }
ffc61ed2 3698 Perl_sv_catpvf(aTHX_ listsv, "+utf8::IsWord\n");
73b437c8
JH
3699 break;
3700 case ANYOF_NALNUM:
3701 if (LOC)
936ed897 3702 ANYOF_CLASS_SET(ret, ANYOF_NALNUM);
73b437c8
JH
3703 else {
3704 for (value = 0; value < 256; value++)
3705 if (!isALNUM(value))
936ed897 3706 ANYOF_BITMAP_SET(ret, value);
73b437c8 3707 }
ffc61ed2 3708 Perl_sv_catpvf(aTHX_ listsv, "!utf8::IsWord\n");
73b437c8 3709 break;
ffc61ed2 3710 case ANYOF_ALNUMC:
73b437c8 3711 if (LOC)
ffc61ed2 3712 ANYOF_CLASS_SET(ret, ANYOF_ALNUMC);
73b437c8
JH
3713 else {
3714 for (value = 0; value < 256; value++)
ffc61ed2 3715 if (isALNUMC(value))
936ed897 3716 ANYOF_BITMAP_SET(ret, value);
73b437c8 3717 }
ffc61ed2 3718 Perl_sv_catpvf(aTHX_ listsv, "+utf8::IsAlnum\n");
73b437c8
JH
3719 break;
3720 case ANYOF_NALNUMC:
3721 if (LOC)
936ed897 3722 ANYOF_CLASS_SET(ret, ANYOF_NALNUMC);
73b437c8
JH
3723 else {
3724 for (value = 0; value < 256; value++)
3725 if (!isALNUMC(value))
936ed897 3726 ANYOF_BITMAP_SET(ret, value);
73b437c8 3727 }
ffc61ed2 3728 Perl_sv_catpvf(aTHX_ listsv, "!utf8::IsAlnum\n");
73b437c8
JH
3729 break;
3730 case ANYOF_ALPHA:
3731 if (LOC)
936ed897 3732 ANYOF_CLASS_SET(ret, ANYOF_ALPHA);
73b437c8
JH
3733 else {
3734 for (value = 0; value < 256; value++)
3735 if (isALPHA(value))
936ed897 3736 ANYOF_BITMAP_SET(ret, value);
73b437c8 3737 }
ffc61ed2 3738 Perl_sv_catpvf(aTHX_ listsv, "+utf8::IsAlpha\n");
73b437c8
JH
3739 break;
3740 case ANYOF_NALPHA:
3741 if (LOC)
936ed897 3742 ANYOF_CLASS_SET(ret, ANYOF_NALPHA);
73b437c8
JH
3743 else {
3744 for (value = 0; value < 256; value++)
3745 if (!isALPHA(value))
936ed897 3746 ANYOF_BITMAP_SET(ret, value);
73b437c8 3747 }
ffc61ed2 3748 Perl_sv_catpvf(aTHX_ listsv, "!utf8::IsAlpha\n");
73b437c8
JH
3749 break;
3750 case ANYOF_ASCII:
3751 if (LOC)
936ed897 3752 ANYOF_CLASS_SET(ret, ANYOF_ASCII);
73b437c8 3753 else {
c7f1f016 3754#ifndef EBCDIC
1ba5c669
JH
3755 for (value = 0; value < 128; value++)
3756 ANYOF_BITMAP_SET(ret, value);
3757#else /* EBCDIC */
ffbc6a93 3758 for (value = 0; value < 256; value++) {
3a3c4447
JH
3759 if (isASCII(value))
3760 ANYOF_BITMAP_SET(ret, value);
ffbc6a93 3761 }
1ba5c669 3762#endif /* EBCDIC */
73b437c8 3763 }
ffc61ed2 3764 Perl_sv_catpvf(aTHX_ listsv, "+utf8::IsASCII\n");
73b437c8
JH
3765 break;
3766 case ANYOF_NASCII:
3767 if (LOC)
936ed897 3768 ANYOF_CLASS_SET(ret, ANYOF_NASCII);
73b437c8 3769 else {
c7f1f016 3770#ifndef EBCDIC
1ba5c669
JH
3771 for (value = 128; value < 256; value++)
3772 ANYOF_BITMAP_SET(ret, value);
3773#else /* EBCDIC */
ffbc6a93 3774 for (value = 0; value < 256; value++) {
3a3c4447
JH
3775 if (!isASCII(value))
3776 ANYOF_BITMAP_SET(ret, value);
ffbc6a93 3777 }
1ba5c669 3778#endif /* EBCDIC */
73b437c8 3779 }
ffc61ed2 3780 Perl_sv_catpvf(aTHX_ listsv, "!utf8::IsASCII\n");
73b437c8 3781 break;
aaa51d5e
JF
3782 case ANYOF_BLANK:
3783 if (LOC)
3784 ANYOF_CLASS_SET(ret, ANYOF_BLANK);
3785 else {
3786 for (value = 0; value < 256; value++)
3787 if (isBLANK(value))
3788 ANYOF_BITMAP_SET(ret, value);
3789 }
ffc61ed2 3790 Perl_sv_catpvf(aTHX_ listsv, "+utf8::IsBlank\n");
aaa51d5e
JF
3791 break;
3792 case ANYOF_NBLANK:
3793 if (LOC)
3794 ANYOF_CLASS_SET(ret, ANYOF_NBLANK);
3795 else {
3796 for (value = 0; value < 256; value++)
3797 if (!isBLANK(value))
3798 ANYOF_BITMAP_SET(ret, value);
3799 }
ffc61ed2 3800 Perl_sv_catpvf(aTHX_ listsv, "!utf8::IsBlank\n");
aaa51d5e 3801 break;
73b437c8
JH
3802 case ANYOF_CNTRL:
3803 if (LOC)
936ed897 3804 ANYOF_CLASS_SET(ret, ANYOF_CNTRL);
73b437c8
JH
3805 else {
3806 for (value = 0; value < 256; value++)
3807 if (isCNTRL(value))
936ed897 3808 ANYOF_BITMAP_SET(ret, value);
73b437c8 3809 }
ffc61ed2 3810 Perl_sv_catpvf(aTHX_ listsv, "+utf8::IsCntrl\n");
73b437c8
JH
3811 break;
3812 case ANYOF_NCNTRL:
3813 if (LOC)
936ed897 3814 ANYOF_CLASS_SET(ret, ANYOF_NCNTRL);
73b437c8
JH
3815 else {
3816 for (value = 0; value < 256; value++)
3817 if (!isCNTRL(value))
936ed897 3818 ANYOF_BITMAP_SET(ret, value);
73b437c8 3819 }
ffc61ed2
JH
3820 Perl_sv_catpvf(aTHX_ listsv, "!utf8::IsCntrl\n");
3821 break;
3822 case ANYOF_DIGIT:
3823 if (LOC)
3824 ANYOF_CLASS_SET(ret, ANYOF_DIGIT);
3825 else {
3826 /* consecutive digits assumed */
3827 for (value = '0'; value <= '9'; value++)
3828 ANYOF_BITMAP_SET(ret, value);
3829 }
3830 Perl_sv_catpvf(aTHX_ listsv, "+utf8::IsDigit\n");
3831 break;
3832 case ANYOF_NDIGIT:
3833 if (LOC)
3834 ANYOF_CLASS_SET(ret, ANYOF_NDIGIT);
3835 else {
3836 /* consecutive digits assumed */
3837 for (value = 0; value < '0'; value++)
3838 ANYOF_BITMAP_SET(ret, value);
3839 for (value = '9' + 1; value < 256; value++)
3840 ANYOF_BITMAP_SET(ret, value);
3841 }
3842 Perl_sv_catpvf(aTHX_ listsv, "!utf8::IsDigit\n");
73b437c8
JH
3843 break;
3844 case ANYOF_GRAPH:
3845 if (LOC)
936ed897 3846 ANYOF_CLASS_SET(ret, ANYOF_GRAPH);
73b437c8
JH
3847 else {
3848 for (value = 0; value < 256; value++)
3849 if (isGRAPH(value))
936ed897 3850 ANYOF_BITMAP_SET(ret, value);
73b437c8 3851 }
ffc61ed2 3852 Perl_sv_catpvf(aTHX_ listsv, "+utf8::IsGraph\n");
73b437c8
JH
3853 break;
3854 case ANYOF_NGRAPH:
3855 if (LOC)
936ed897 3856 ANYOF_CLASS_SET(ret, ANYOF_NGRAPH);
73b437c8
JH
3857 else {
3858 for (value = 0; value < 256; value++)
3859 if (!isGRAPH(value))
936ed897 3860 ANYOF_BITMAP_SET(ret, value);
73b437c8 3861 }
ffc61ed2 3862 Perl_sv_catpvf(aTHX_ listsv, "!utf8::IsGraph\n");
73b437c8
JH
3863 break;
3864 case ANYOF_LOWER:
3865 if (LOC)
936ed897 3866 ANYOF_CLASS_SET(ret, ANYOF_LOWER);
73b437c8
JH
3867 else {
3868 for (value = 0; value < 256; value++)
3869 if (isLOWER(value))
936ed897 3870 ANYOF_BITMAP_SET(ret, value);
73b437c8 3871 }
ffc61ed2 3872 Perl_sv_catpvf(aTHX_ listsv, "+utf8::IsLower\n");
73b437c8
JH
3873 break;
3874 case ANYOF_NLOWER:
3875 if (LOC)
936ed897 3876 ANYOF_CLASS_SET(ret, ANYOF_NLOWER);
73b437c8
JH
3877 else {
3878 for (value = 0; value < 256; value++)
3879 if (!isLOWER(value))
936ed897 3880 ANYOF_BITMAP_SET(ret, value);
73b437c8 3881 }
ffc61ed2 3882 Perl_sv_catpvf(aTHX_ listsv, "!utf8::IsLower\n");
73b437c8
JH
3883 break;
3884 case ANYOF_PRINT:
3885 if (LOC)
936ed897 3886 ANYOF_CLASS_SET(ret, ANYOF_PRINT);
73b437c8
JH
3887 else {
3888 for (value = 0; value < 256; value++)
3889 if (isPRINT(value))
936ed897 3890 ANYOF_BITMAP_SET(ret, value);
73b437c8 3891 }
ffc61ed2 3892 Perl_sv_catpvf(aTHX_ listsv, "+utf8::IsPrint\n");
73b437c8
JH
3893 break;
3894 case ANYOF_NPRINT:
3895 if (LOC)
936ed897 3896 ANYOF_CLASS_SET(ret, ANYOF_NPRINT);
73b437c8
JH
3897 else {
3898 for (value = 0; value < 256; value++)
3899 if (!isPRINT(value))
936ed897 3900 ANYOF_BITMAP_SET(ret, value);
73b437c8 3901 }
ffc61ed2 3902 Perl_sv_catpvf(aTHX_ listsv, "!utf8::IsPrint\n");
73b437c8 3903 break;
aaa51d5e
JF
3904 case ANYOF_PSXSPC:
3905 if (LOC)
3906 ANYOF_CLASS_SET(ret, ANYOF_PSXSPC);
3907 else {
3908 for (value = 0; value < 256; value++)
3909 if (isPSXSPC(value))
3910 ANYOF_BITMAP_SET(ret, value);
3911 }
ffc61ed2 3912 Perl_sv_catpvf(aTHX_ listsv, "+utf8::IsSpace\n");
aaa51d5e
JF
3913 break;
3914 case ANYOF_NPSXSPC:
3915 if (LOC)
3916 ANYOF_CLASS_SET(ret, ANYOF_NPSXSPC);
3917 else {
3918 for (value = 0; value < 256; value++)
3919 if (!isPSXSPC(value))
3920 ANYOF_BITMAP_SET(ret, value);
3921 }
ffc61ed2 3922 Perl_sv_catpvf(aTHX_ listsv, "!utf8::IsSpace\n");
aaa51d5e 3923 break;
73b437c8
JH
3924 case ANYOF_PUNCT:
3925 if (LOC)
936ed897 3926 ANYOF_CLASS_SET(ret, ANYOF_PUNCT);
73b437c8
JH
3927 else {
3928 for (value = 0; value < 256; value++)
3929 if (isPUNCT(value))
936ed897 3930 ANYOF_BITMAP_SET(ret, value);
73b437c8 3931 }
ffc61ed2 3932 Perl_sv_catpvf(aTHX_ listsv, "+utf8::IsPunct\n");
73b437c8
JH
3933 break;
3934 case ANYOF_NPUNCT:
3935 if (LOC)
936ed897 3936 ANYOF_CLASS_SET(ret, ANYOF_NPUNCT);
73b437c8
JH
3937 else {
3938 for (value = 0; value < 256; value++)
3939 if (!isPUNCT(value))
936ed897 3940 ANYOF_BITMAP_SET(ret, value);
73b437c8 3941 }
ffc61ed2
JH
3942 Perl_sv_catpvf(aTHX_ listsv, "!utf8::IsPunct\n");
3943 break;
3944 case ANYOF_SPACE:
3945 if (LOC)
3946 ANYOF_CLASS_SET(ret, ANYOF_SPACE);
3947 else {
3948 for (value = 0; value < 256; value++)
3949 if (isSPACE(value))
3950 ANYOF_BITMAP_SET(ret, value);
3951 }
3952 Perl_sv_catpvf(aTHX_ listsv, "+utf8::IsSpacePerl\n");
3953 break;
3954 case ANYOF_NSPACE:
3955 if (LOC)
3956 ANYOF_CLASS_SET(ret, ANYOF_NSPACE);
3957 else {
3958 for (value = 0; value < 256; value++)
3959 if (!isSPACE(value))
3960 ANYOF_BITMAP_SET(ret, value);
3961 }
3962 Perl_sv_catpvf(aTHX_ listsv, "!utf8::IsSpacePerl\n");
73b437c8
JH
3963 break;
3964 case ANYOF_UPPER:
3965 if (LOC)
936ed897 3966 ANYOF_CLASS_SET(ret, ANYOF_UPPER);
73b437c8
JH
3967 else {
3968 for (value = 0; value < 256; value++)
3969 if (isUPPER(value))
936ed897 3970 ANYOF_BITMAP_SET(ret, value);
73b437c8 3971 }
ffc61ed2 3972 Perl_sv_catpvf(aTHX_ listsv, "+utf8::IsUpper\n");
73b437c8
JH
3973 break;
3974 case ANYOF_NUPPER:
3975 if (LOC)
936ed897 3976 ANYOF_CLASS_SET(ret, ANYOF_NUPPER);
73b437c8
JH
3977 else {
3978 for (value = 0; value < 256; value++)
3979 if (!isUPPER(value))
936ed897 3980 ANYOF_BITMAP_SET(ret, value);
73b437c8 3981 }
ffc61ed2 3982 Perl_sv_catpvf(aTHX_ listsv, "!utf8::IsUpper\n");
73b437c8
JH
3983 break;
3984 case ANYOF_XDIGIT:
3985 if (LOC)
936ed897 3986 ANYOF_CLASS_SET(ret, ANYOF_XDIGIT);
73b437c8
JH
3987 else {
3988 for (value = 0; value < 256; value++)
3989 if (isXDIGIT(value))
936ed897 3990 ANYOF_BITMAP_SET(ret, value);
73b437c8 3991 }
ffc61ed2 3992 Perl_sv_catpvf(aTHX_ listsv, "+utf8::IsXDigit\n");
73b437c8
JH
3993 break;
3994 case ANYOF_NXDIGIT:
3995 if (LOC)
936ed897 3996 ANYOF_CLASS_SET(ret, ANYOF_NXDIGIT);
73b437c8
JH
3997 else {
3998 for (value = 0; value < 256; value++)
3999 if (!isXDIGIT(value))
936ed897 4000 ANYOF_BITMAP_SET(ret, value);
73b437c8 4001 }
ffc61ed2 4002 Perl_sv_catpvf(aTHX_ listsv, "!utf8::IsXDigit\n");
73b437c8
JH
4003 break;
4004 default:
b45f050a 4005 vFAIL("Invalid [::] class");
73b437c8 4006 break;
b8c5462f 4007 }
b8c5462f 4008 if (LOC)
936ed897 4009 ANYOF_FLAGS(ret) |= ANYOF_CLASS;
73b437c8 4010 continue;
a0d0e21e 4011 }
ffc61ed2
JH
4012 } /* end of namedclass \blah */
4013
a0d0e21e 4014 if (range) {
3a3c4447 4015 if (prevvalue > value) /* b-a */ {
b45f050a 4016 Simple_vFAIL4("Invalid [] range \"%*.*s\"",
830247a4
IZ
4017 RExC_parse - rangebegin,
4018 RExC_parse - rangebegin,
b45f050a 4019 rangebegin);
3568d838 4020 range = 0; /* not a valid range */
73b437c8 4021 }
a0d0e21e
LW
4022 }
4023 else {
3568d838 4024 prevvalue = value; /* save the beginning of the range */
830247a4
IZ
4025 if (*RExC_parse == '-' && RExC_parse+1 < RExC_end &&
4026 RExC_parse[1] != ']') {
4027 RExC_parse++;
ffc61ed2
JH
4028
4029 /* a bad range like \w-, [:word:]- ? */
4030 if (namedclass > OOB_NAMEDCLASS) {
e476b1b5 4031 if (ckWARN(WARN_REGEXP))
830247a4 4032 vWARN4(RExC_parse,
b45f050a 4033 "False [] range \"%*.*s\"",
830247a4
IZ
4034 RExC_parse - rangebegin,
4035 RExC_parse - rangebegin,
b45f050a 4036 rangebegin);
73b437c8 4037 if (!SIZE_ONLY)
936ed897 4038 ANYOF_BITMAP_SET(ret, '-');
73b437c8 4039 } else
ffc61ed2
JH
4040 range = 1; /* yeah, it's a range! */
4041 continue; /* but do it the next time */
a0d0e21e 4042 }
a687059c 4043 }
ffc61ed2 4044
93733859 4045 /* now is the next time */
ae5c130c 4046 if (!SIZE_ONLY) {
3568d838
JH
4047 IV i;
4048
4049 if (prevvalue < 256) {
4050 IV ceilvalue = value < 256 ? value : 255;
4051
4052#ifdef EBCDIC
3a3c4447
JH
4053 if ((isLOWER(prevvalue) && isLOWER(ceilvalue)) ||
4054 (isUPPER(prevvalue) && isUPPER(ceilvalue)))
ffc61ed2 4055 {
3568d838
JH
4056 if (isLOWER(prevvalue)) {
4057 for (i = prevvalue; i <= ceilvalue; i++)
ffc61ed2
JH
4058 if (isLOWER(i))
4059 ANYOF_BITMAP_SET(ret, i);
4060 } else {
3568d838 4061 for (i = prevvalue; i <= ceilvalue; i++)
ffc61ed2
JH
4062 if (isUPPER(i))
4063 ANYOF_BITMAP_SET(ret, i);
4064 }
8ada0baa 4065 }
ffc61ed2 4066 else
8ada0baa 4067#endif
a5961de5
JH
4068 for (i = prevvalue; i <= ceilvalue; i++)
4069 ANYOF_BITMAP_SET(ret, i);
3568d838 4070 }
a5961de5 4071 if (value > 255 || UTF) {
b08decb7
JH
4072 UV prevnatvalue = NATIVE_TO_UNI(prevvalue);
4073 UV natvalue = NATIVE_TO_UNI(value);
4074
ffc61ed2 4075 ANYOF_FLAGS(ret) |= ANYOF_UNICODE;
b08decb7 4076 if (prevnatvalue < natvalue) { /* what about > ? */
ffc61ed2 4077 Perl_sv_catpvf(aTHX_ listsv, "%04"UVxf"\t%04"UVxf"\n",
b08decb7
JH
4078 prevnatvalue, natvalue);
4079 }
4080 else if (prevnatvalue == natvalue) {
4081 Perl_sv_catpvf(aTHX_ listsv, "%04"UVxf"\n", natvalue);
09091399 4082 if (FOLD) {
254ba52a
JH
4083 U8 tmpbuf [UTF8_MAXLEN+1];
4084 U8 foldbuf[UTF8_MAXLEN_FOLD+1];
4085 STRLEN foldlen;
4086 UV f;
4087
b08decb7 4088 uvchr_to_utf8(tmpbuf, natvalue);
2ddfca77 4089 to_utf8_fold(tmpbuf, foldbuf, &foldlen);
9e8f1952 4090 f = UNI_TO_NATIVE(utf8_to_uvchr(foldbuf, 0));
254ba52a 4091
c840d2a2
JH
4092 /* If folding and foldable and a single
4093 * character, insert also the folded version
4094 * to the charclass. */
9e55ce06
JH
4095 if (f != value) {
4096 if (foldlen == UNISKIP(f))
4097 Perl_sv_catpvf(aTHX_ listsv,
4098 "%04"UVxf"\n", f);
4099 else {
4100 /* Any multicharacter foldings
4101 * require the following transform:
4102 * [ABCDEF] -> (?:[ABCabcDEFd]|pq|rst)
4103 * where E folds into "pq" and F folds
4104 * into "rst", all other characters
4105 * fold to single characters. We save
4106 * away these multicharacter foldings,
4107 * to be later saved as part of the
4108 * additional "s" data. */
4109 SV *sv;
4110
4111 if (!unicode_alternate)
4112 unicode_alternate = newAV();
4113 sv = newSVpvn((char*)foldbuf, foldlen);
4114 SvUTF8_on(sv);
4115 av_push(unicode_alternate, sv);
4116 }
4117 }
254ba52a 4118
60a8b682
JH
4119 /* If folding and the value is one of the Greek
4120 * sigmas insert a few more sigmas to make the
4121 * folding rules of the sigmas to work right.
4122 * Note that not all the possible combinations
4123 * are handled here: some of them are handled
9e55ce06
JH
4124 * by the standard folding rules, and some of
4125 * them (literal or EXACTF cases) are handled
4126 * during runtime in regexec.c:S_find_byclass(). */
09091399
JH
4127 if (value == UNICODE_GREEK_SMALL_LETTER_FINAL_SIGMA) {
4128 Perl_sv_catpvf(aTHX_ listsv, "%04"UVxf"\n",
eb050b28 4129 (UV)UNICODE_GREEK_CAPITAL_LETTER_SIGMA);
09091399 4130 Perl_sv_catpvf(aTHX_ listsv, "%04"UVxf"\n",
eb050b28 4131 (UV)UNICODE_GREEK_SMALL_LETTER_SIGMA);
09091399
JH
4132 }
4133 else if (value == UNICODE_GREEK_CAPITAL_LETTER_SIGMA)
4134 Perl_sv_catpvf(aTHX_ listsv, "%04"UVxf"\n",
eb050b28 4135 (UV)UNICODE_GREEK_SMALL_LETTER_SIGMA);
09091399
JH
4136 }
4137 }
ffc61ed2 4138 }
8ada0baa 4139 }
ffc61ed2
JH
4140
4141 range = 0; /* this range (if it was one) is done now */
a0d0e21e 4142 }
ffc61ed2 4143
936ed897 4144 if (need_class) {
4f66b38d 4145 ANYOF_FLAGS(ret) |= ANYOF_LARGE;
936ed897 4146 if (SIZE_ONLY)
830247a4 4147 RExC_size += ANYOF_CLASS_ADD_SKIP;
936ed897 4148 else
830247a4 4149 RExC_emit += ANYOF_CLASS_ADD_SKIP;
936ed897 4150 }
ffc61ed2 4151
ae5c130c 4152 /* optimize case-insensitive simple patterns (e.g. /[a-z]/i) */
b8c5462f 4153 if (!SIZE_ONLY &&
ffc61ed2 4154 /* If the only flag is folding (plus possibly inversion). */
516a5887
JH
4155 ((ANYOF_FLAGS(ret) & (ANYOF_FLAGS_ALL ^ ANYOF_INVERT)) == ANYOF_FOLD)
4156 ) {
a0ed51b3 4157 for (value = 0; value < 256; ++value) {
936ed897 4158 if (ANYOF_BITMAP_TEST(ret, value)) {
ffc61ed2
JH
4159 IV fold = PL_fold[value];
4160
4161 if (fold != value)
4162 ANYOF_BITMAP_SET(ret, fold);
ae5c130c
GS
4163 }
4164 }
936ed897 4165 ANYOF_FLAGS(ret) &= ~ANYOF_FOLD;
ae5c130c 4166 }
ffc61ed2 4167
ae5c130c 4168 /* optimize inverted simple patterns (e.g. [^a-z]) */
3568d838 4169 if (!SIZE_ONLY && optimize_invert &&
ffc61ed2
JH
4170 /* If the only flag is inversion. */
4171 (ANYOF_FLAGS(ret) & ANYOF_FLAGS_ALL) == ANYOF_INVERT) {
b8c5462f 4172 for (value = 0; value < ANYOF_BITMAP_SIZE; ++value)
936ed897 4173 ANYOF_BITMAP(ret)[value] ^= ANYOF_FLAGS_ALL;
1aa99e6b 4174 ANYOF_FLAGS(ret) = ANYOF_UNICODE_ALL;
ae5c130c 4175 }
a0d0e21e 4176
b81d288d 4177 if (!SIZE_ONLY) {
fde631ed 4178 AV *av = newAV();
ffc61ed2
JH
4179 SV *rv;
4180
9e55ce06
JH
4181 /* The 0th element stores the character class description
4182 * in its textual form: used later (regexec.c:Perl_regclass_swatch())
4183 * to initialize the appropriate swash (which gets stored in
4184 * the 1st element), and also useful for dumping the regnode.
4185 * The 2nd element stores the multicharacter foldings,
4186 * used later (regexec.c:s_reginclasslen()). */
ffc61ed2
JH
4187 av_store(av, 0, listsv);
4188 av_store(av, 1, NULL);
9e55ce06 4189 av_store(av, 2, (SV*)unicode_alternate);
ffc61ed2 4190 rv = newRV_noinc((SV*)av);
19860706 4191 n = add_data(pRExC_state, 1, "s");
830247a4 4192 RExC_rx->data->data[n] = (void*)rv;
ffc61ed2 4193 ARG_SET(ret, n);
a0ed51b3
LW
4194 }
4195
4196 return ret;
4197}
4198
76e3520e 4199STATIC char*
830247a4 4200S_nextchar(pTHX_ RExC_state_t *pRExC_state)
a0d0e21e 4201{
830247a4 4202 char* retval = RExC_parse++;
a0d0e21e 4203
4633a7c4 4204 for (;;) {
830247a4
IZ
4205 if (*RExC_parse == '(' && RExC_parse[1] == '?' &&
4206 RExC_parse[2] == '#') {
4207 while (*RExC_parse && *RExC_parse != ')')
4208 RExC_parse++;
4209 RExC_parse++;
4633a7c4
LW
4210 continue;
4211 }
830247a4
IZ
4212 if (RExC_flags16 & PMf_EXTENDED) {
4213 if (isSPACE(*RExC_parse)) {
4214 RExC_parse++;
748a9306
LW
4215 continue;
4216 }
830247a4
IZ
4217 else if (*RExC_parse == '#') {
4218 while (*RExC_parse && *RExC_parse != '\n')
4219 RExC_parse++;
4220 RExC_parse++;
748a9306
LW
4221 continue;
4222 }
748a9306 4223 }
4633a7c4 4224 return retval;
a0d0e21e 4225 }
a687059c
LW
4226}
4227
4228/*
c277df42 4229- reg_node - emit a node
a0d0e21e 4230*/
76e3520e 4231STATIC regnode * /* Location. */
830247a4 4232S_reg_node(pTHX_ RExC_state_t *pRExC_state, U8 op)
a687059c 4233{
c277df42
IZ
4234 register regnode *ret;
4235 register regnode *ptr;
a687059c 4236
830247a4 4237 ret = RExC_emit;
c277df42 4238 if (SIZE_ONLY) {
830247a4
IZ
4239 SIZE_ALIGN(RExC_size);
4240 RExC_size += 1;
a0d0e21e
LW
4241 return(ret);
4242 }
a687059c 4243
c277df42 4244 NODE_ALIGN_FILL(ret);
a0d0e21e 4245 ptr = ret;
c277df42 4246 FILL_ADVANCE_NODE(ptr, op);
fac92740
MJD
4247 if (RExC_offsets) { /* MJD */
4248 MJD_OFFSET_DEBUG((stderr, "%s:%u: (op %s) %s %u <- %u (len %u) (max %u).\n",
4249 "reg_node", __LINE__,
4250 reg_name[op],
4251 RExC_emit - RExC_emit_start > RExC_offsets[0]
4252 ? "Overwriting end of array!\n" : "OK",
4253 RExC_emit - RExC_emit_start,
4254 RExC_parse - RExC_start,
4255 RExC_offsets[0]));
4256 Set_Node_Offset(RExC_emit, RExC_parse + (op == END));
4257 }
4258
830247a4 4259 RExC_emit = ptr;
a687059c 4260
a0d0e21e 4261 return(ret);
a687059c
LW
4262}
4263
4264/*
a0d0e21e
LW
4265- reganode - emit a node with an argument
4266*/
76e3520e 4267STATIC regnode * /* Location. */
830247a4 4268S_reganode(pTHX_ RExC_state_t *pRExC_state, U8 op, U32 arg)
fe14fcc3 4269{
c277df42
IZ
4270 register regnode *ret;
4271 register regnode *ptr;
fe14fcc3 4272
830247a4 4273 ret = RExC_emit;
c277df42 4274 if (SIZE_ONLY) {
830247a4
IZ
4275 SIZE_ALIGN(RExC_size);
4276 RExC_size += 2;
a0d0e21e
LW
4277 return(ret);
4278 }
fe14fcc3 4279
c277df42 4280 NODE_ALIGN_FILL(ret);
a0d0e21e 4281 ptr = ret;
c277df42 4282 FILL_ADVANCE_NODE_ARG(ptr, op, arg);
fac92740
MJD
4283 if (RExC_offsets) { /* MJD */
4284 MJD_OFFSET_DEBUG((stderr, "%s: %s %u <- %u (max %u).\n",
4285 "reganode",
4286 RExC_emit - RExC_emit_start > RExC_offsets[0] ?
4287 "Overwriting end of array!\n" : "OK",
4288 RExC_emit - RExC_emit_start,
4289 RExC_parse - RExC_start,
4290 RExC_offsets[0]));
4291 Set_Cur_Node_Offset;
4292 }
4293
830247a4 4294 RExC_emit = ptr;
fe14fcc3 4295
a0d0e21e 4296 return(ret);
fe14fcc3
LW
4297}
4298
4299/*
cd439c50 4300- reguni - emit (if appropriate) a Unicode character
a0ed51b3
LW
4301*/
4302STATIC void
830247a4 4303S_reguni(pTHX_ RExC_state_t *pRExC_state, UV uv, char* s, STRLEN* lenp)
a0ed51b3 4304{
5e12f4fb 4305 *lenp = SIZE_ONLY ? UNISKIP(uv) : (uvchr_to_utf8((U8*)s, uv) - (U8*)s);
a0ed51b3
LW
4306}
4307
4308/*
a0d0e21e
LW
4309- reginsert - insert an operator in front of already-emitted operand
4310*
4311* Means relocating the operand.
4312*/
76e3520e 4313STATIC void
830247a4 4314S_reginsert(pTHX_ RExC_state_t *pRExC_state, U8 op, regnode *opnd)
a687059c 4315{
c277df42
IZ
4316 register regnode *src;
4317 register regnode *dst;
4318 register regnode *place;
4319 register int offset = regarglen[(U8)op];
b81d288d 4320
22c35a8c 4321/* (PL_regkind[(U8)op] == CURLY ? EXTRA_STEP_2ARGS : 0); */
c277df42
IZ
4322
4323 if (SIZE_ONLY) {
830247a4 4324 RExC_size += NODE_STEP_REGNODE + offset;
a0d0e21e
LW
4325 return;
4326 }
a687059c 4327
830247a4
IZ
4328 src = RExC_emit;
4329 RExC_emit += NODE_STEP_REGNODE + offset;
4330 dst = RExC_emit;
fac92740 4331 while (src > opnd) {
c277df42 4332 StructCopy(--src, --dst, regnode);
fac92740
MJD
4333 if (RExC_offsets) { /* MJD 20010112 */
4334 MJD_OFFSET_DEBUG((stderr, "%s: %s copy %u -> %u (max %u).\n",
4335 "reg_insert",
4336 dst - RExC_emit_start > RExC_offsets[0]
4337 ? "Overwriting end of array!\n" : "OK",
4338 src - RExC_emit_start,
4339 dst - RExC_emit_start,
4340 RExC_offsets[0]));
4341 Set_Node_Offset_To_R(dst-RExC_emit_start, Node_Offset(src));
4342 Set_Node_Length_To_R(dst-RExC_emit_start, Node_Length(src));
4343 }
4344 }
4345
a0d0e21e
LW
4346
4347 place = opnd; /* Op node, where operand used to be. */
fac92740
MJD
4348 if (RExC_offsets) { /* MJD */
4349 MJD_OFFSET_DEBUG((stderr, "%s: %s %u <- %u (max %u).\n",
4350 "reginsert",
4351 place - RExC_emit_start > RExC_offsets[0]
4352 ? "Overwriting end of array!\n" : "OK",
4353 place - RExC_emit_start,
4354 RExC_parse - RExC_start,
4355 RExC_offsets[0]));
4356 Set_Node_Offset(place, RExC_parse);
4357 }
c277df42
IZ
4358 src = NEXTOPER(place);
4359 FILL_ADVANCE_NODE(place, op);
4360 Zero(src, offset, regnode);
a687059c
LW
4361}
4362
4363/*
c277df42 4364- regtail - set the next-pointer at the end of a node chain of p to val.
a0d0e21e 4365*/
76e3520e 4366STATIC void
830247a4 4367S_regtail(pTHX_ RExC_state_t *pRExC_state, regnode *p, regnode *val)
a687059c 4368{
c277df42
IZ
4369 register regnode *scan;
4370 register regnode *temp;
a0d0e21e 4371
c277df42 4372 if (SIZE_ONLY)
a0d0e21e
LW
4373 return;
4374
4375 /* Find last node. */
4376 scan = p;
4377 for (;;) {
4378 temp = regnext(scan);
4379 if (temp == NULL)
4380 break;
4381 scan = temp;
4382 }
a687059c 4383
c277df42
IZ
4384 if (reg_off_by_arg[OP(scan)]) {
4385 ARG_SET(scan, val - scan);
a0ed51b3
LW
4386 }
4387 else {
c277df42
IZ
4388 NEXT_OFF(scan) = val - scan;
4389 }
a687059c
LW
4390}
4391
4392/*
a0d0e21e
LW
4393- regoptail - regtail on operand of first argument; nop if operandless
4394*/
76e3520e 4395STATIC void
830247a4 4396S_regoptail(pTHX_ RExC_state_t *pRExC_state, regnode *p, regnode *val)
a687059c 4397{
a0d0e21e 4398 /* "Operandless" and "op != BRANCH" are synonymous in practice. */
c277df42
IZ
4399 if (p == NULL || SIZE_ONLY)
4400 return;
22c35a8c 4401 if (PL_regkind[(U8)OP(p)] == BRANCH) {
830247a4 4402 regtail(pRExC_state, NEXTOPER(p), val);
a0ed51b3 4403 }
22c35a8c 4404 else if ( PL_regkind[(U8)OP(p)] == BRANCHJ) {
830247a4 4405 regtail(pRExC_state, NEXTOPER(NEXTOPER(p)), val);
a0ed51b3
LW
4406 }
4407 else
a0d0e21e 4408 return;
a687059c
LW
4409}
4410
4411/*
4412 - regcurly - a little FSA that accepts {\d+,?\d*}
4413 */
79072805 4414STATIC I32
cea2e8a9 4415S_regcurly(pTHX_ register char *s)
a687059c
LW
4416{
4417 if (*s++ != '{')
4418 return FALSE;
f0fcb552 4419 if (!isDIGIT(*s))
a687059c 4420 return FALSE;
f0fcb552 4421 while (isDIGIT(*s))
a687059c
LW
4422 s++;
4423 if (*s == ',')
4424 s++;
f0fcb552 4425 while (isDIGIT(*s))
a687059c
LW
4426 s++;
4427 if (*s != '}')
4428 return FALSE;
4429 return TRUE;
4430}
4431
a687059c 4432
8fa7f367
JH
4433#ifdef DEBUGGING
4434
76e3520e 4435STATIC regnode *
cea2e8a9 4436S_dumpuntil(pTHX_ regnode *start, regnode *node, regnode *last, SV* sv, I32 l)
c277df42 4437{
f248d071 4438 register U8 op = EXACT; /* Arbitrary non-END op. */
155aba94 4439 register regnode *next;
c277df42
IZ
4440
4441 while (op != END && (!last || node < last)) {
4442 /* While that wasn't END last time... */
4443
4444 NODE_ALIGN(node);
4445 op = OP(node);
4446 if (op == CLOSE)
4447 l--;
4448 next = regnext(node);
4449 /* Where, what. */
4450 if (OP(node) == OPTIMIZED)
4451 goto after_print;
4452 regprop(sv, node);
b900a521 4453 PerlIO_printf(Perl_debug_log, "%4"IVdf":%*s%s", (IV)(node - start),
f1dbda3d 4454 (int)(2*l + 1), "", SvPVX(sv));
c277df42
IZ
4455 if (next == NULL) /* Next ptr. */
4456 PerlIO_printf(Perl_debug_log, "(0)");
b81d288d 4457 else
b900a521 4458 PerlIO_printf(Perl_debug_log, "(%"IVdf")", (IV)(next - start));
c277df42
IZ
4459 (void)PerlIO_putc(Perl_debug_log, '\n');
4460 after_print:
22c35a8c 4461 if (PL_regkind[(U8)op] == BRANCHJ) {
b81d288d
AB
4462 register regnode *nnode = (OP(next) == LONGJMP
4463 ? regnext(next)
c277df42
IZ
4464 : next);
4465 if (last && nnode > last)
4466 nnode = last;
4467 node = dumpuntil(start, NEXTOPER(NEXTOPER(node)), nnode, sv, l + 1);
a0ed51b3 4468 }
22c35a8c 4469 else if (PL_regkind[(U8)op] == BRANCH) {
c277df42 4470 node = dumpuntil(start, NEXTOPER(node), next, sv, l + 1);
a0ed51b3
LW
4471 }
4472 else if ( op == CURLY) { /* `next' might be very big: optimizer */
c277df42
IZ
4473 node = dumpuntil(start, NEXTOPER(node) + EXTRA_STEP_2ARGS,
4474 NEXTOPER(node) + EXTRA_STEP_2ARGS + 1, sv, l + 1);
a0ed51b3 4475 }
22c35a8c 4476 else if (PL_regkind[(U8)op] == CURLY && op != CURLYX) {
c277df42
IZ
4477 node = dumpuntil(start, NEXTOPER(node) + EXTRA_STEP_2ARGS,
4478 next, sv, l + 1);
a0ed51b3
LW
4479 }
4480 else if ( op == PLUS || op == STAR) {
c277df42 4481 node = dumpuntil(start, NEXTOPER(node), NEXTOPER(node) + 1, sv, l + 1);
a0ed51b3
LW
4482 }
4483 else if (op == ANYOF) {
4f66b38d
HS
4484 /* arglen 1 + class block */
4485 node += 1 + ((ANYOF_FLAGS(node) & ANYOF_LARGE)
4486 ? ANYOF_CLASS_SKIP : ANYOF_SKIP);
4487 node = NEXTOPER(node);
a0ed51b3 4488 }
22c35a8c 4489 else if (PL_regkind[(U8)op] == EXACT) {
c277df42 4490 /* Literal string, where present. */
cd439c50 4491 node += NODE_SZ_STR(node) - 1;
c277df42 4492 node = NEXTOPER(node);
a0ed51b3
LW
4493 }
4494 else {
c277df42
IZ
4495 node = NEXTOPER(node);
4496 node += regarglen[(U8)op];
4497 }
4498 if (op == CURLYX || op == OPEN)
4499 l++;
4500 else if (op == WHILEM)
4501 l--;
4502 }
4503 return node;
4504}
4505
8fa7f367
JH
4506#endif /* DEBUGGING */
4507
a687059c 4508/*
fd181c75 4509 - regdump - dump a regexp onto Perl_debug_log in vaguely comprehensible form
a687059c
LW
4510 */
4511void
864dbfa3 4512Perl_regdump(pTHX_ regexp *r)
a687059c 4513{
35ff7856 4514#ifdef DEBUGGING
46fc3d4c 4515 SV *sv = sv_newmortal();
a687059c 4516
c277df42 4517 (void)dumpuntil(r->program, r->program + 1, NULL, sv, 0);
a0d0e21e
LW
4518
4519 /* Header fields of interest. */
c277df42 4520 if (r->anchored_substr)
7b0972df 4521 PerlIO_printf(Perl_debug_log,
b81d288d 4522 "anchored `%s%.*s%s'%s at %"IVdf" ",
3280af22 4523 PL_colors[0],
7b0972df 4524 (int)(SvCUR(r->anchored_substr) - (SvTAIL(r->anchored_substr)!=0)),
b81d288d 4525 SvPVX(r->anchored_substr),
3280af22 4526 PL_colors[1],
c277df42 4527 SvTAIL(r->anchored_substr) ? "$" : "",
7b0972df 4528 (IV)r->anchored_offset);
c277df42 4529 if (r->float_substr)
7b0972df 4530 PerlIO_printf(Perl_debug_log,
b81d288d 4531 "floating `%s%.*s%s'%s at %"IVdf"..%"UVuf" ",
3280af22 4532 PL_colors[0],
b81d288d 4533 (int)(SvCUR(r->float_substr) - (SvTAIL(r->float_substr)!=0)),
2c2d71f5 4534 SvPVX(r->float_substr),
3280af22 4535 PL_colors[1],
c277df42 4536 SvTAIL(r->float_substr) ? "$" : "",
7b0972df 4537 (IV)r->float_min_offset, (UV)r->float_max_offset);
c277df42 4538 if (r->check_substr)
b81d288d
AB
4539 PerlIO_printf(Perl_debug_log,
4540 r->check_substr == r->float_substr
c277df42
IZ
4541 ? "(checking floating" : "(checking anchored");
4542 if (r->reganch & ROPT_NOSCAN)
4543 PerlIO_printf(Perl_debug_log, " noscan");
4544 if (r->reganch & ROPT_CHECK_ALL)
4545 PerlIO_printf(Perl_debug_log, " isall");
4546 if (r->check_substr)
4547 PerlIO_printf(Perl_debug_log, ") ");
4548
46fc3d4c 4549 if (r->regstclass) {
4550 regprop(sv, r->regstclass);
4551 PerlIO_printf(Perl_debug_log, "stclass `%s' ", SvPVX(sv));
4552 }
774d564b 4553 if (r->reganch & ROPT_ANCH) {
4554 PerlIO_printf(Perl_debug_log, "anchored");
4555 if (r->reganch & ROPT_ANCH_BOL)
4556 PerlIO_printf(Perl_debug_log, "(BOL)");
c277df42
IZ
4557 if (r->reganch & ROPT_ANCH_MBOL)
4558 PerlIO_printf(Perl_debug_log, "(MBOL)");
cad2e5aa
JH
4559 if (r->reganch & ROPT_ANCH_SBOL)
4560 PerlIO_printf(Perl_debug_log, "(SBOL)");
774d564b 4561 if (r->reganch & ROPT_ANCH_GPOS)
4562 PerlIO_printf(Perl_debug_log, "(GPOS)");
4563 PerlIO_putc(Perl_debug_log, ' ');
4564 }
c277df42
IZ
4565 if (r->reganch & ROPT_GPOS_SEEN)
4566 PerlIO_printf(Perl_debug_log, "GPOS ");
a0d0e21e 4567 if (r->reganch & ROPT_SKIP)
760ac839 4568 PerlIO_printf(Perl_debug_log, "plus ");
a0d0e21e 4569 if (r->reganch & ROPT_IMPLICIT)
760ac839 4570 PerlIO_printf(Perl_debug_log, "implicit ");
760ac839 4571 PerlIO_printf(Perl_debug_log, "minlen %ld ", (long) r->minlen);
ce862d02
IZ
4572 if (r->reganch & ROPT_EVAL_SEEN)
4573 PerlIO_printf(Perl_debug_log, "with eval ");
760ac839 4574 PerlIO_printf(Perl_debug_log, "\n");
fac92740
MJD
4575 if (r->offsets) {
4576 U32 i;
4577 U32 len = r->offsets[0];
392fbf5d 4578 PerlIO_printf(Perl_debug_log, "Offsets: [%"UVuf"]\n\t", (UV)r->offsets[0]);
fac92740 4579 for (i = 1; i <= len; i++)
392fbf5d
RB
4580 PerlIO_printf(Perl_debug_log, "%"UVuf"[%"UVuf"] ",
4581 (UV)r->offsets[i*2-1],
4582 (UV)r->offsets[i*2]);
fac92740
MJD
4583 PerlIO_printf(Perl_debug_log, "\n");
4584 }
17c3b450 4585#endif /* DEBUGGING */
a687059c
LW
4586}
4587
8fa7f367
JH
4588#ifdef DEBUGGING
4589
653099ff
GS
4590STATIC void
4591S_put_byte(pTHX_ SV *sv, int c)
4592{
7be5a6cf 4593 if (isCNTRL(c) || c == 255 || !isPRINT(c))
653099ff
GS
4594 Perl_sv_catpvf(aTHX_ sv, "\\%o", c);
4595 else if (c == '-' || c == ']' || c == '\\' || c == '^')
4596 Perl_sv_catpvf(aTHX_ sv, "\\%c", c);
4597 else
4598 Perl_sv_catpvf(aTHX_ sv, "%c", c);
4599}
4600
8fa7f367
JH
4601#endif /* DEBUGGING */
4602
a687059c 4603/*
a0d0e21e
LW
4604- regprop - printable representation of opcode
4605*/
46fc3d4c 4606void
864dbfa3 4607Perl_regprop(pTHX_ SV *sv, regnode *o)
a687059c 4608{
35ff7856 4609#ifdef DEBUGGING
9b155405 4610 register int k;
a0d0e21e 4611
54dc92de 4612 sv_setpvn(sv, "", 0);
9b155405 4613 if (OP(o) >= reg_num) /* regnode.type is unsigned */
830247a4
IZ
4614 /* It would be nice to FAIL() here, but this may be called from
4615 regexec.c, and it would be hard to supply pRExC_state. */
4616 Perl_croak(aTHX_ "Corrupted regexp opcode");
9b155405
IZ
4617 sv_catpv(sv, (char*)reg_name[OP(o)]); /* Take off const! */
4618
4619 k = PL_regkind[(U8)OP(o)];
4620
2a782b5b
JH
4621 if (k == EXACT) {
4622 SV *dsv = sv_2mortal(newSVpvn("", 0));
c728cb41
JH
4623 /* Using is_utf8_string() is a crude hack but it may
4624 * be the best for now since we have no flag "this EXACTish
4625 * node was UTF-8" --jhi */
4626 bool do_utf8 = is_utf8_string((U8*)STRING(o), STR_LEN(o));
8a989385 4627 char *s = do_utf8 ?
c728cb41
JH
4628 pv_uni_display(dsv, (U8*)STRING(o), STR_LEN(o), 60,
4629 UNI_DISPLAY_REGEX) :
2a782b5b 4630 STRING(o);
40eddc46 4631 int len = do_utf8 ?
2a782b5b
JH
4632 strlen(s) :
4633 STR_LEN(o);
4634 Perl_sv_catpvf(aTHX_ sv, " <%s%.*s%s>",
4635 PL_colors[0],
4636 len, s,
4637 PL_colors[1]);
4638 }
9b155405 4639 else if (k == CURLY) {
cb434fcc 4640 if (OP(o) == CURLYM || OP(o) == CURLYN || OP(o) == CURLYX)
cea2e8a9
GS
4641 Perl_sv_catpvf(aTHX_ sv, "[%d]", o->flags); /* Parenth number */
4642 Perl_sv_catpvf(aTHX_ sv, " {%d,%d}", ARG1(o), ARG2(o));
a0d0e21e 4643 }
2c2d71f5
JH
4644 else if (k == WHILEM && o->flags) /* Ordinal/of */
4645 Perl_sv_catpvf(aTHX_ sv, "[%d/%d]", o->flags & 0xf, o->flags>>4);
9b155405 4646 else if (k == REF || k == OPEN || k == CLOSE || k == GROUPP )
894356b3 4647 Perl_sv_catpvf(aTHX_ sv, "%d", (int)ARG(o)); /* Parenth number */
9b155405 4648 else if (k == LOGICAL)
04ebc1ab 4649 Perl_sv_catpvf(aTHX_ sv, "[%d]", o->flags); /* 2: embedded, otherwise 1 */
653099ff
GS
4650 else if (k == ANYOF) {
4651 int i, rangestart = -1;
ffc61ed2 4652 U8 flags = ANYOF_FLAGS(o);
19860706
JH
4653 const char * const anyofs[] = { /* Should be syncronized with
4654 * ANYOF_ #xdefines in regcomp.h */
653099ff
GS
4655 "\\w",
4656 "\\W",
4657 "\\s",
4658 "\\S",
4659 "\\d",
4660 "\\D",
4661 "[:alnum:]",
4662 "[:^alnum:]",
4663 "[:alpha:]",
4664 "[:^alpha:]",
4665 "[:ascii:]",
4666 "[:^ascii:]",
4667 "[:ctrl:]",
4668 "[:^ctrl:]",
4669 "[:graph:]",
4670 "[:^graph:]",
4671 "[:lower:]",
4672 "[:^lower:]",
4673 "[:print:]",
4674 "[:^print:]",
4675 "[:punct:]",
4676 "[:^punct:]",
4677 "[:upper:]",
aaa51d5e 4678 "[:^upper:]",
653099ff 4679 "[:xdigit:]",
aaa51d5e
JF
4680 "[:^xdigit:]",
4681 "[:space:]",
4682 "[:^space:]",
4683 "[:blank:]",
4684 "[:^blank:]"
653099ff
GS
4685 };
4686
19860706 4687 if (flags & ANYOF_LOCALE)
653099ff 4688 sv_catpv(sv, "{loc}");
19860706 4689 if (flags & ANYOF_FOLD)
653099ff
GS
4690 sv_catpv(sv, "{i}");
4691 Perl_sv_catpvf(aTHX_ sv, "[%s", PL_colors[0]);
19860706 4692 if (flags & ANYOF_INVERT)
653099ff 4693 sv_catpv(sv, "^");
ffc61ed2
JH
4694 for (i = 0; i <= 256; i++) {
4695 if (i < 256 && ANYOF_BITMAP_TEST(o,i)) {
4696 if (rangestart == -1)
4697 rangestart = i;
4698 } else if (rangestart != -1) {
4699 if (i <= rangestart + 3)
4700 for (; rangestart < i; rangestart++)
653099ff 4701 put_byte(sv, rangestart);
ffc61ed2
JH
4702 else {
4703 put_byte(sv, rangestart);
4704 sv_catpv(sv, "-");
4705 put_byte(sv, i - 1);
653099ff 4706 }
ffc61ed2 4707 rangestart = -1;
653099ff 4708 }
847a199f 4709 }
ffc61ed2
JH
4710
4711 if (o->flags & ANYOF_CLASS)
4712 for (i = 0; i < sizeof(anyofs)/sizeof(char*); i++)
4713 if (ANYOF_CLASS_TEST(o,i))
4714 sv_catpv(sv, anyofs[i]);
4715
4716 if (flags & ANYOF_UNICODE)
4717 sv_catpv(sv, "{unicode}");
1aa99e6b 4718 else if (flags & ANYOF_UNICODE_ALL)
2a782b5b 4719 sv_catpv(sv, "{unicode_all}");
ffc61ed2
JH
4720
4721 {
4722 SV *lv;
9e55ce06 4723 SV *sw = regclass_swash(o, FALSE, &lv, 0);
b81d288d 4724
ffc61ed2
JH
4725 if (lv) {
4726 if (sw) {
4727 UV i;
4728 U8 s[UTF8_MAXLEN+1];
b81d288d 4729
ffc61ed2 4730 for (i = 0; i <= 256; i++) { /* just the first 256 */
2b9d42f0 4731 U8 *e = uvchr_to_utf8(s, i);
ffc61ed2 4732
3568d838 4733 if (i < 256 && swash_fetch(sw, s, TRUE)) {
ffc61ed2
JH
4734 if (rangestart == -1)
4735 rangestart = i;
4736 } else if (rangestart != -1) {
4737 U8 *p;
b81d288d 4738
ffc61ed2
JH
4739 if (i <= rangestart + 3)
4740 for (; rangestart < i; rangestart++) {
2b9d42f0 4741 for(e = uvchr_to_utf8(s, rangestart), p = s; p < e; p++)
ffc61ed2
JH
4742 put_byte(sv, *p);
4743 }
4744 else {
2b9d42f0 4745 for (e = uvchr_to_utf8(s, rangestart), p = s; p < e; p++)
ffc61ed2
JH
4746 put_byte(sv, *p);
4747 sv_catpv(sv, "-");
2b9d42f0 4748 for (e = uvchr_to_utf8(s, i - 1), p = s; p < e; p++)
ffc61ed2
JH
4749 put_byte(sv, *p);
4750 }
4751 rangestart = -1;
4752 }
19860706 4753 }
ffc61ed2
JH
4754
4755 sv_catpv(sv, "..."); /* et cetera */
19860706 4756 }
fde631ed 4757
ffc61ed2
JH
4758 {
4759 char *s = savepv(SvPVX(lv));
4760 char *origs = s;
b81d288d 4761
ffc61ed2 4762 while(*s && *s != '\n') s++;
b81d288d 4763
ffc61ed2
JH
4764 if (*s == '\n') {
4765 char *t = ++s;
4766
4767 while (*s) {
4768 if (*s == '\n')
4769 *s = ' ';
4770 s++;
4771 }
4772 if (s[-1] == ' ')
4773 s[-1] = 0;
4774
4775 sv_catpv(sv, t);
fde631ed 4776 }
b81d288d 4777
ffc61ed2 4778 Safefree(origs);
fde631ed
JH
4779 }
4780 }
653099ff 4781 }
ffc61ed2 4782
653099ff
GS
4783 Perl_sv_catpvf(aTHX_ sv, "%s]", PL_colors[1]);
4784 }
9b155405 4785 else if (k == BRANCHJ && (OP(o) == UNLESSM || OP(o) == IFMATCH))
cea2e8a9 4786 Perl_sv_catpvf(aTHX_ sv, "[-%d]", o->flags);
17c3b450 4787#endif /* DEBUGGING */
35ff7856 4788}
a687059c 4789
cad2e5aa
JH
4790SV *
4791Perl_re_intuit_string(pTHX_ regexp *prog)
4792{ /* Assume that RE_INTUIT is set */
4793 DEBUG_r(
4794 { STRLEN n_a;
4795 char *s = SvPV(prog->check_substr,n_a);
4796
4797 if (!PL_colorset) reginitcolors();
4798 PerlIO_printf(Perl_debug_log,
4799 "%sUsing REx substr:%s `%s%.60s%s%s'\n",
4800 PL_colors[4],PL_colors[5],PL_colors[0],
4801 s,
4802 PL_colors[1],
4803 (strlen(s) > 60 ? "..." : ""));
4804 } );
4805
4806 return prog->check_substr;
4807}
4808
2b69d0c2 4809void
864dbfa3 4810Perl_pregfree(pTHX_ struct regexp *r)
a687059c 4811{
9e55ce06
JH
4812#ifdef DEBUGGING
4813 SV *dsv = PERL_DEBUG_PAD_ZERO(0);
4814#endif
7821416a
IZ
4815
4816 if (!r || (--r->refcnt > 0))
4817 return;
9e55ce06 4818 DEBUG_r({
9e55ce06 4819 char *s = pv_uni_display(dsv, (U8*)r->precomp, r->prelen, 60,
c728cb41 4820 UNI_DISPLAY_REGEX);
9e55ce06
JH
4821 int len = SvCUR(dsv);
4822 if (!PL_colorset)
4823 reginitcolors();
4824 PerlIO_printf(Perl_debug_log,
4825 "%sFreeing REx:%s `%s%*.*s%s%s'\n",
4826 PL_colors[4],PL_colors[5],PL_colors[0],
4827 len, len, s,
4828 PL_colors[1],
4829 len > 60 ? "..." : "");
4830 });
cad2e5aa 4831
c277df42 4832 if (r->precomp)
a0d0e21e 4833 Safefree(r->precomp);
fac92740
MJD
4834 if (r->offsets) /* 20010421 MJD */
4835 Safefree(r->offsets);
cf93c79d
IZ
4836 if (RX_MATCH_COPIED(r))
4837 Safefree(r->subbeg);
a193d654
GS
4838 if (r->substrs) {
4839 if (r->anchored_substr)
4840 SvREFCNT_dec(r->anchored_substr);
4841 if (r->float_substr)
4842 SvREFCNT_dec(r->float_substr);
2779dcf1 4843 Safefree(r->substrs);
a193d654 4844 }
c277df42
IZ
4845 if (r->data) {
4846 int n = r->data->count;
dfad63ad
HS
4847 AV* new_comppad = NULL;
4848 AV* old_comppad;
4849 SV** old_curpad;
4850
c277df42 4851 while (--n >= 0) {
261faec3 4852 /* If you add a ->what type here, update the comment in regcomp.h */
c277df42
IZ
4853 switch (r->data->what[n]) {
4854 case 's':
4855 SvREFCNT_dec((SV*)r->data->data[n]);
4856 break;
653099ff
GS
4857 case 'f':
4858 Safefree(r->data->data[n]);
4859 break;
dfad63ad
HS
4860 case 'p':
4861 new_comppad = (AV*)r->data->data[n];
4862 break;
c277df42 4863 case 'o':
dfad63ad 4864 if (new_comppad == NULL)
cea2e8a9 4865 Perl_croak(aTHX_ "panic: pregfree comppad");
dfad63ad
HS
4866 old_comppad = PL_comppad;
4867 old_curpad = PL_curpad;
1e6dc0b6
SB
4868 /* Watch out for global destruction's random ordering. */
4869 if (SvTYPE(new_comppad) == SVt_PVAV) {
4870 PL_comppad = new_comppad;
4871 PL_curpad = AvARRAY(new_comppad);
4872 }
4873 else
4874 PL_curpad = NULL;
9b978d73
DM
4875
4876 if (!OpREFCNT_dec((OP_4tree*)r->data->data[n])) {
4877 op_free((OP_4tree*)r->data->data[n]);
4878 }
4879
dfad63ad
HS
4880 PL_comppad = old_comppad;
4881 PL_curpad = old_curpad;
4882 SvREFCNT_dec((SV*)new_comppad);
4883 new_comppad = NULL;
c277df42
IZ
4884 break;
4885 case 'n':
9e55ce06 4886 break;
c277df42 4887 default:
830247a4 4888 Perl_croak(aTHX_ "panic: regfree data code '%c'", r->data->what[n]);
c277df42
IZ
4889 }
4890 }
4891 Safefree(r->data->what);
4892 Safefree(r->data);
a0d0e21e
LW
4893 }
4894 Safefree(r->startp);
4895 Safefree(r->endp);
4896 Safefree(r);
a687059c 4897}
c277df42
IZ
4898
4899/*
4900 - regnext - dig the "next" pointer out of a node
4901 *
4902 * [Note, when REGALIGN is defined there are two places in regmatch()
4903 * that bypass this code for speed.]
4904 */
4905regnode *
864dbfa3 4906Perl_regnext(pTHX_ register regnode *p)
c277df42
IZ
4907{
4908 register I32 offset;
4909
3280af22 4910 if (p == &PL_regdummy)
c277df42
IZ
4911 return(NULL);
4912
4913 offset = (reg_off_by_arg[OP(p)] ? ARG(p) : NEXT_OFF(p));
4914 if (offset == 0)
4915 return(NULL);
4916
c277df42 4917 return(p+offset);
c277df42
IZ
4918}
4919
01f988be 4920STATIC void
cea2e8a9 4921S_re_croak2(pTHX_ const char* pat1,const char* pat2,...)
c277df42
IZ
4922{
4923 va_list args;
4924 STRLEN l1 = strlen(pat1);
4925 STRLEN l2 = strlen(pat2);
4926 char buf[512];
06bf62c7 4927 SV *msv;
c277df42
IZ
4928 char *message;
4929
4930 if (l1 > 510)
4931 l1 = 510;
4932 if (l1 + l2 > 510)
4933 l2 = 510 - l1;
4934 Copy(pat1, buf, l1 , char);
4935 Copy(pat2, buf + l1, l2 , char);
3b818b81
GS
4936 buf[l1 + l2] = '\n';
4937 buf[l1 + l2 + 1] = '\0';
8736538c
AS
4938#ifdef I_STDARG
4939 /* ANSI variant takes additional second argument */
c277df42 4940 va_start(args, pat2);
8736538c
AS
4941#else
4942 va_start(args);
4943#endif
5a844595 4944 msv = vmess(buf, &args);
c277df42 4945 va_end(args);
06bf62c7 4946 message = SvPV(msv,l1);
c277df42
IZ
4947 if (l1 > 512)
4948 l1 = 512;
4949 Copy(message, buf, l1 , char);
4950 buf[l1] = '\0'; /* Overwrite \n */
cea2e8a9 4951 Perl_croak(aTHX_ "%s", buf);
c277df42 4952}
a0ed51b3
LW
4953
4954/* XXX Here's a total kludge. But we need to re-enter for swash routines. */
4955
4956void
864dbfa3 4957Perl_save_re_context(pTHX)
b81d288d 4958{
830247a4
IZ
4959#if 0
4960 SAVEPPTR(RExC_precomp); /* uncompiled string. */
4961 SAVEI32(RExC_npar); /* () count. */
4962 SAVEI32(RExC_size); /* Code size. */
4963 SAVEI16(RExC_flags16); /* are we folding, multilining? */
4964 SAVEVPTR(RExC_rx); /* from regcomp.c */
4965 SAVEI32(RExC_seen); /* from regcomp.c */
4966 SAVEI32(RExC_sawback); /* Did we see \1, ...? */
4967 SAVEI32(RExC_naughty); /* How bad is this pattern? */
4968 SAVEVPTR(RExC_emit); /* Code-emit pointer; &regdummy = don't */
4969 SAVEPPTR(RExC_end); /* End of input for compile */
4970 SAVEPPTR(RExC_parse); /* Input-scan pointer. */
4971#endif
4972
4973 SAVEI32(PL_reg_flags); /* from regexec.c */
a0ed51b3 4974 SAVEPPTR(PL_bostr);
a0ed51b3
LW
4975 SAVEPPTR(PL_reginput); /* String-input pointer. */
4976 SAVEPPTR(PL_regbol); /* Beginning of input, for ^ check. */
4977 SAVEPPTR(PL_regeol); /* End of input, for $ check. */
7766f137
GS
4978 SAVEVPTR(PL_regstartp); /* Pointer to startp array. */
4979 SAVEVPTR(PL_regendp); /* Ditto for endp. */
4980 SAVEVPTR(PL_reglastparen); /* Similarly for lastparen. */
a0ed51b3 4981 SAVEPPTR(PL_regtill); /* How far we are required to go. */
b81d288d 4982 SAVEGENERICPV(PL_reg_start_tmp); /* from regexec.c */
a0ed51b3 4983 PL_reg_start_tmp = 0;
a0ed51b3
LW
4984 SAVEI32(PL_reg_start_tmpl); /* from regexec.c */
4985 PL_reg_start_tmpl = 0;
7766f137 4986 SAVEVPTR(PL_regdata);
a0ed51b3
LW
4987 SAVEI32(PL_reg_eval_set); /* from regexec.c */
4988 SAVEI32(PL_regnarrate); /* from regexec.c */
7766f137 4989 SAVEVPTR(PL_regprogram); /* from regexec.c */
a0ed51b3 4990 SAVEINT(PL_regindent); /* from regexec.c */
7766f137
GS
4991 SAVEVPTR(PL_regcc); /* from regexec.c */
4992 SAVEVPTR(PL_curcop);
7766f137
GS
4993 SAVEVPTR(PL_reg_call_cc); /* from regexec.c */
4994 SAVEVPTR(PL_reg_re); /* from regexec.c */
54b6e2fa
IZ
4995 SAVEPPTR(PL_reg_ganch); /* from regexec.c */
4996 SAVESPTR(PL_reg_sv); /* from regexec.c */
53c4c00c 4997 SAVEI8(PL_reg_match_utf8); /* from regexec.c */
7766f137 4998 SAVEVPTR(PL_reg_magic); /* from regexec.c */
54b6e2fa 4999 SAVEI32(PL_reg_oldpos); /* from regexec.c */
7766f137
GS
5000 SAVEVPTR(PL_reg_oldcurpm); /* from regexec.c */
5001 SAVEVPTR(PL_reg_curpm); /* from regexec.c */
5fb7366e 5002 SAVEI32(PL_regnpar); /* () count. */
e49a9654 5003 SAVEI32(PL_regsize); /* from regexec.c */
54b6e2fa 5004#ifdef DEBUGGING
b81d288d 5005 SAVEPPTR(PL_reg_starttry); /* from regexec.c */
54b6e2fa 5006#endif
a0ed51b3 5007}
51371543 5008
51371543 5009static void
acfe0abc 5010clear_re(pTHX_ void *r)
51371543
GS
5011{
5012 ReREFCNT_dec((regexp *)r);
5013}
ffbc6a93 5014