This is a live mirror of the Perl 5 development currently hosted at https://github.com/perl/perl5
Handle PERLIO= and document a bit.
[perl5.git] / regcomp.c
CommitLineData
a0d0e21e
LW
1/* regcomp.c
2 */
3
4/*
5 * "A fair jaw-cracker dwarf-language must be." --Samwise Gamgee
6 */
7
a687059c
LW
8/* NOTE: this is derived from Henry Spencer's regexp code, and should not
9 * confused with the original package (see point 3 below). Thanks, Henry!
10 */
11
12/* Additional note: this code is very heavily munged from Henry's version
13 * in places. In some spots I've traded clarity for efficiency, so don't
14 * blame Henry for some of the lack of readability.
15 */
16
e50aee73
AD
17/* The names of the functions have been changed from regcomp and
18 * regexec to pregcomp and pregexec in order to avoid conflicts
19 * with the POSIX routines of the same names.
20*/
21
b9d5759e
AD
22#ifdef PERL_EXT_RE_BUILD
23/* need to replace pregcomp et al, so enable that */
24# ifndef PERL_IN_XSUB_RE
25# define PERL_IN_XSUB_RE
26# endif
27/* need access to debugger hooks */
cad2e5aa 28# if defined(PERL_EXT_RE_DEBUG) && !defined(DEBUGGING)
b9d5759e
AD
29# define DEBUGGING
30# endif
31#endif
32
33#ifdef PERL_IN_XSUB_RE
d06ea78c 34/* We *really* need to overwrite these symbols: */
56953603
IZ
35# define Perl_pregcomp my_regcomp
36# define Perl_regdump my_regdump
37# define Perl_regprop my_regprop
d06ea78c 38# define Perl_pregfree my_regfree
cad2e5aa
JH
39# define Perl_re_intuit_string my_re_intuit_string
40/* *These* symbols are masked to allow static link. */
d06ea78c 41# define Perl_regnext my_regnext
f0b8d043 42# define Perl_save_re_context my_save_re_context
b81d288d 43# define Perl_reginitcolors my_reginitcolors
c5be433b
GS
44
45# define PERL_NO_GET_CONTEXT
b81d288d 46#endif
56953603 47
f0fcb552 48/*SUPPRESS 112*/
a687059c 49/*
e50aee73 50 * pregcomp and pregexec -- regsub and regerror are not used in perl
a687059c
LW
51 *
52 * Copyright (c) 1986 by University of Toronto.
53 * Written by Henry Spencer. Not derived from licensed software.
54 *
55 * Permission is granted to anyone to use this software for any
56 * purpose on any computer system, and to redistribute it freely,
57 * subject to the following restrictions:
58 *
59 * 1. The author is not responsible for the consequences of use of
60 * this software, no matter how awful, even if they arise
61 * from defects in it.
62 *
63 * 2. The origin of this software must not be misrepresented, either
64 * by explicit claim or by omission.
65 *
66 * 3. Altered versions must be plainly marked as such, and must not
67 * be misrepresented as being the original software.
68 *
69 *
70 **** Alterations to Henry's code are...
71 ****
4c79ee7a 72 **** Copyright (c) 1991-2003, Larry Wall
a687059c 73 ****
9ef589d8
LW
74 **** You may distribute under the terms of either the GNU General Public
75 **** License or the Artistic License, as specified in the README file.
76
a687059c
LW
77 *
78 * Beware that some of this code is subtly aware of the way operator
79 * precedence is structured in regular expressions. Serious changes in
80 * regular-expression syntax might require a total rethink.
81 */
82#include "EXTERN.h"
864dbfa3 83#define PERL_IN_REGCOMP_C
a687059c 84#include "perl.h"
d06ea78c 85
acfe0abc 86#ifndef PERL_IN_XSUB_RE
d06ea78c
GS
87# include "INTERN.h"
88#endif
c277df42
IZ
89
90#define REG_COMP_C
a687059c
LW
91#include "regcomp.h"
92
d4cce5f1 93#ifdef op
11343788 94#undef op
d4cce5f1 95#endif /* op */
11343788 96
fe14fcc3
LW
97#ifdef MSDOS
98# if defined(BUGGY_MSC6)
99 /* MSC 6.00A breaks on op/regexp.t test 85 unless we turn this off */
100 # pragma optimize("a",off)
101 /* But MSC 6.00A is happy with 'w', for aliases only across function calls*/
102 # pragma optimize("w",on )
103# endif /* BUGGY_MSC6 */
104#endif /* MSDOS */
105
a687059c
LW
106#ifndef STATIC
107#define STATIC static
108#endif
109
830247a4 110typedef struct RExC_state_t {
e2509266 111 U32 flags; /* are we folding, multilining? */
830247a4
IZ
112 char *precomp; /* uncompiled string. */
113 regexp *rx;
fac92740 114 char *start; /* Start of input for compile */
830247a4
IZ
115 char *end; /* End of input for compile */
116 char *parse; /* Input-scan pointer. */
117 I32 whilem_seen; /* number of WHILEM in this expr */
fac92740 118 regnode *emit_start; /* Start of emitted-code area */
ffc61ed2 119 regnode *emit; /* Code-emit pointer; &regdummy = don't = compiling */
830247a4
IZ
120 I32 naughty; /* How bad is this pattern? */
121 I32 sawback; /* Did we see \1, ...? */
122 U32 seen;
123 I32 size; /* Code size. */
124 I32 npar; /* () count. */
125 I32 extralen;
126 I32 seen_zerolen;
127 I32 seen_evals;
1aa99e6b 128 I32 utf8;
830247a4
IZ
129#if ADD_TO_REGEXEC
130 char *starttry; /* -Dr: where regtry was called. */
131#define RExC_starttry (pRExC_state->starttry)
132#endif
133} RExC_state_t;
134
e2509266 135#define RExC_flags (pRExC_state->flags)
830247a4
IZ
136#define RExC_precomp (pRExC_state->precomp)
137#define RExC_rx (pRExC_state->rx)
fac92740 138#define RExC_start (pRExC_state->start)
830247a4
IZ
139#define RExC_end (pRExC_state->end)
140#define RExC_parse (pRExC_state->parse)
141#define RExC_whilem_seen (pRExC_state->whilem_seen)
fac92740 142#define RExC_offsets (pRExC_state->rx->offsets) /* I am not like the others */
830247a4 143#define RExC_emit (pRExC_state->emit)
fac92740 144#define RExC_emit_start (pRExC_state->emit_start)
830247a4
IZ
145#define RExC_naughty (pRExC_state->naughty)
146#define RExC_sawback (pRExC_state->sawback)
147#define RExC_seen (pRExC_state->seen)
148#define RExC_size (pRExC_state->size)
149#define RExC_npar (pRExC_state->npar)
150#define RExC_extralen (pRExC_state->extralen)
151#define RExC_seen_zerolen (pRExC_state->seen_zerolen)
152#define RExC_seen_evals (pRExC_state->seen_evals)
1aa99e6b 153#define RExC_utf8 (pRExC_state->utf8)
830247a4 154
a687059c
LW
155#define ISMULT1(c) ((c) == '*' || (c) == '+' || (c) == '?')
156#define ISMULT2(s) ((*s) == '*' || (*s) == '+' || (*s) == '?' || \
157 ((*s) == '{' && regcurly(s)))
a687059c 158
35c8bce7
LW
159#ifdef SPSTART
160#undef SPSTART /* dratted cpp namespace... */
161#endif
a687059c
LW
162/*
163 * Flags to be passed up and down.
164 */
a687059c 165#define WORST 0 /* Worst case. */
821b33a5 166#define HASWIDTH 0x1 /* Known to match non-null strings. */
a0d0e21e
LW
167#define SIMPLE 0x2 /* Simple enough to be STAR/PLUS operand. */
168#define SPSTART 0x4 /* Starts with * or +. */
169#define TRYAGAIN 0x8 /* Weeded out a declaration. */
a687059c 170
2c2d71f5
JH
171/* Length of a variant. */
172
173typedef struct scan_data_t {
174 I32 len_min;
175 I32 len_delta;
176 I32 pos_min;
177 I32 pos_delta;
178 SV *last_found;
179 I32 last_end; /* min value, <0 unless valid. */
180 I32 last_start_min;
181 I32 last_start_max;
182 SV **longest; /* Either &l_fixed, or &l_float. */
183 SV *longest_fixed;
184 I32 offset_fixed;
185 SV *longest_float;
186 I32 offset_float_min;
187 I32 offset_float_max;
188 I32 flags;
189 I32 whilem_c;
cb434fcc 190 I32 *last_closep;
653099ff 191 struct regnode_charclass_class *start_class;
2c2d71f5
JH
192} scan_data_t;
193
a687059c 194/*
e50aee73 195 * Forward declarations for pregcomp()'s friends.
a687059c 196 */
a0d0e21e 197
b81d288d 198static scan_data_t zero_scan_data = { 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
cb434fcc 199 0, 0, 0, 0, 0, 0};
c277df42
IZ
200
201#define SF_BEFORE_EOL (SF_BEFORE_SEOL|SF_BEFORE_MEOL)
202#define SF_BEFORE_SEOL 0x1
203#define SF_BEFORE_MEOL 0x2
204#define SF_FIX_BEFORE_EOL (SF_FIX_BEFORE_SEOL|SF_FIX_BEFORE_MEOL)
205#define SF_FL_BEFORE_EOL (SF_FL_BEFORE_SEOL|SF_FL_BEFORE_MEOL)
206
09b7f37c
CB
207#ifdef NO_UNARY_PLUS
208# define SF_FIX_SHIFT_EOL (0+2)
209# define SF_FL_SHIFT_EOL (0+4)
210#else
211# define SF_FIX_SHIFT_EOL (+2)
212# define SF_FL_SHIFT_EOL (+4)
213#endif
c277df42
IZ
214
215#define SF_FIX_BEFORE_SEOL (SF_BEFORE_SEOL << SF_FIX_SHIFT_EOL)
216#define SF_FIX_BEFORE_MEOL (SF_BEFORE_MEOL << SF_FIX_SHIFT_EOL)
217
218#define SF_FL_BEFORE_SEOL (SF_BEFORE_SEOL << SF_FL_SHIFT_EOL)
219#define SF_FL_BEFORE_MEOL (SF_BEFORE_MEOL << SF_FL_SHIFT_EOL) /* 0x20 */
220#define SF_IS_INF 0x40
221#define SF_HAS_PAR 0x80
222#define SF_IN_PAR 0x100
223#define SF_HAS_EVAL 0x200
4bfe0158 224#define SCF_DO_SUBSTR 0x400
653099ff
GS
225#define SCF_DO_STCLASS_AND 0x0800
226#define SCF_DO_STCLASS_OR 0x1000
227#define SCF_DO_STCLASS (SCF_DO_STCLASS_AND|SCF_DO_STCLASS_OR)
e1901655 228#define SCF_WHILEM_VISITED_POS 0x2000
c277df42 229
eb160463 230#define UTF (RExC_utf8 != 0)
e2509266
JH
231#define LOC ((RExC_flags & PMf_LOCALE) != 0)
232#define FOLD ((RExC_flags & PMf_FOLD) != 0)
a0ed51b3 233
ffc61ed2 234#define OOB_UNICODE 12345678
93733859 235#define OOB_NAMEDCLASS -1
b8c5462f 236
a0ed51b3
LW
237#define CHR_SVLEN(sv) (UTF ? sv_len_utf8(sv) : SvCUR(sv))
238#define CHR_DIST(a,b) (UTF ? utf8_distance(a,b) : a - b)
239
8615cb43 240
b45f050a
JF
241/* length of regex to show in messages that don't mark a position within */
242#define RegexLengthToShowInErrorMessages 127
243
244/*
245 * If MARKER[12] are adjusted, be sure to adjust the constants at the top
246 * of t/op/regmesg.t, the tests in t/op/re_tests, and those in
247 * op/pragma/warn/regcomp.
248 */
7253e4e3
RK
249#define MARKER1 "<-- HERE" /* marker as it appears in the description */
250#define MARKER2 " <-- HERE " /* marker as it appears within the regex */
b81d288d 251
7253e4e3 252#define REPORT_LOCATION " in regex; marked by " MARKER1 " in m/%.*s" MARKER2 "%s/"
b45f050a
JF
253
254/*
255 * Calls SAVEDESTRUCTOR_X if needed, then calls Perl_croak with the given
256 * arg. Show regex, up to a maximum length. If it's too long, chop and add
257 * "...".
258 */
ccb2c380
MP
259#define FAIL(msg) STMT_START { \
260 char *ellipses = ""; \
261 IV len = RExC_end - RExC_precomp; \
262 \
263 if (!SIZE_ONLY) \
264 SAVEDESTRUCTOR_X(clear_re,(void*)RExC_rx); \
265 if (len > RegexLengthToShowInErrorMessages) { \
266 /* chop 10 shorter than the max, to ensure meaning of "..." */ \
267 len = RegexLengthToShowInErrorMessages - 10; \
268 ellipses = "..."; \
269 } \
270 Perl_croak(aTHX_ "%s in regex m/%.*s%s/", \
271 msg, (int)len, RExC_precomp, ellipses); \
272} STMT_END
8615cb43 273
b45f050a
JF
274/*
275 * Calls SAVEDESTRUCTOR_X if needed, then calls Perl_croak with the given
276 * args. Show regex, up to a maximum length. If it's too long, chop and add
277 * "...".
278 */
ccb2c380
MP
279#define FAIL2(pat,msg) STMT_START { \
280 char *ellipses = ""; \
281 IV len = RExC_end - RExC_precomp; \
282 \
283 if (!SIZE_ONLY) \
284 SAVEDESTRUCTOR_X(clear_re,(void*)RExC_rx); \
285 if (len > RegexLengthToShowInErrorMessages) { \
286 /* chop 10 shorter than the max, to ensure meaning of "..." */ \
287 len = RegexLengthToShowInErrorMessages - 10; \
288 ellipses = "..."; \
289 } \
290 S_re_croak2(aTHX_ pat, " in regex m/%.*s%s/", \
291 msg, (int)len, RExC_precomp, ellipses); \
292} STMT_END
b45f050a
JF
293
294
295/*
296 * Simple_vFAIL -- like FAIL, but marks the current location in the scan
297 */
ccb2c380
MP
298#define Simple_vFAIL(m) STMT_START { \
299 IV offset = RExC_parse - RExC_precomp; \
300 Perl_croak(aTHX_ "%s" REPORT_LOCATION, \
301 m, (int)offset, RExC_precomp, RExC_precomp + offset); \
302} STMT_END
b45f050a
JF
303
304/*
305 * Calls SAVEDESTRUCTOR_X if needed, then Simple_vFAIL()
306 */
ccb2c380
MP
307#define vFAIL(m) STMT_START { \
308 if (!SIZE_ONLY) \
309 SAVEDESTRUCTOR_X(clear_re,(void*)RExC_rx); \
310 Simple_vFAIL(m); \
311} STMT_END
b45f050a
JF
312
313/*
314 * Like Simple_vFAIL(), but accepts two arguments.
315 */
ccb2c380
MP
316#define Simple_vFAIL2(m,a1) STMT_START { \
317 IV offset = RExC_parse - RExC_precomp; \
318 S_re_croak2(aTHX_ m, REPORT_LOCATION, a1, \
319 (int)offset, RExC_precomp, RExC_precomp + offset); \
320} STMT_END
b45f050a
JF
321
322/*
323 * Calls SAVEDESTRUCTOR_X if needed, then Simple_vFAIL2().
324 */
ccb2c380
MP
325#define vFAIL2(m,a1) STMT_START { \
326 if (!SIZE_ONLY) \
327 SAVEDESTRUCTOR_X(clear_re,(void*)RExC_rx); \
328 Simple_vFAIL2(m, a1); \
329} STMT_END
b45f050a
JF
330
331
332/*
333 * Like Simple_vFAIL(), but accepts three arguments.
334 */
ccb2c380
MP
335#define Simple_vFAIL3(m, a1, a2) STMT_START { \
336 IV offset = RExC_parse - RExC_precomp; \
337 S_re_croak2(aTHX_ m, REPORT_LOCATION, a1, a2, \
338 (int)offset, RExC_precomp, RExC_precomp + offset); \
339} STMT_END
b45f050a
JF
340
341/*
342 * Calls SAVEDESTRUCTOR_X if needed, then Simple_vFAIL3().
343 */
ccb2c380
MP
344#define vFAIL3(m,a1,a2) STMT_START { \
345 if (!SIZE_ONLY) \
346 SAVEDESTRUCTOR_X(clear_re,(void*)RExC_rx); \
347 Simple_vFAIL3(m, a1, a2); \
348} STMT_END
b45f050a
JF
349
350/*
351 * Like Simple_vFAIL(), but accepts four arguments.
352 */
ccb2c380
MP
353#define Simple_vFAIL4(m, a1, a2, a3) STMT_START { \
354 IV offset = RExC_parse - RExC_precomp; \
355 S_re_croak2(aTHX_ m, REPORT_LOCATION, a1, a2, a3, \
356 (int)offset, RExC_precomp, RExC_precomp + offset); \
357} STMT_END
b45f050a
JF
358
359/*
360 * Like Simple_vFAIL(), but accepts five arguments.
361 */
ccb2c380
MP
362#define Simple_vFAIL5(m, a1, a2, a3, a4) STMT_START { \
363 IV offset = RExC_parse - RExC_precomp; \
364 S_re_croak2(aTHX_ m, REPORT_LOCATION, a1, a2, a3, a4, \
365 (int)offset, RExC_precomp, RExC_precomp + offset); \
366} STMT_END
367
368
369#define vWARN(loc,m) STMT_START { \
370 IV offset = loc - RExC_precomp; \
371 Perl_warner(aTHX_ packWARN(WARN_REGEXP), "%s" REPORT_LOCATION, \
372 m, (int)offset, RExC_precomp, RExC_precomp + offset); \
373} STMT_END
374
375#define vWARNdep(loc,m) STMT_START { \
376 IV offset = loc - RExC_precomp; \
377 Perl_warner(aTHX_ packWARN2(WARN_DEPRECATED, WARN_REGEXP), \
378 "%s" REPORT_LOCATION, \
379 m, (int)offset, RExC_precomp, RExC_precomp + offset); \
380} STMT_END
381
382
383#define vWARN2(loc, m, a1) STMT_START { \
384 IV offset = loc - RExC_precomp; \
385 Perl_warner(aTHX_ packWARN(WARN_REGEXP), m REPORT_LOCATION, \
386 a1, (int)offset, RExC_precomp, RExC_precomp + offset); \
387} STMT_END
388
389#define vWARN3(loc, m, a1, a2) STMT_START { \
390 IV offset = loc - RExC_precomp; \
391 Perl_warner(aTHX_ packWARN(WARN_REGEXP), m REPORT_LOCATION, \
392 a1, a2, (int)offset, RExC_precomp, RExC_precomp + offset); \
393} STMT_END
394
395#define vWARN4(loc, m, a1, a2, a3) STMT_START { \
396 IV offset = loc - RExC_precomp; \
397 Perl_warner(aTHX_ packWARN(WARN_REGEXP), m REPORT_LOCATION, \
398 a1, a2, a3, (int)offset, RExC_precomp, RExC_precomp + offset); \
399} STMT_END
400
401#define vWARN5(loc, m, a1, a2, a3, a4) STMT_START { \
402 IV offset = loc - RExC_precomp; \
403 Perl_warner(aTHX_ packWARN(WARN_REGEXP), m REPORT_LOCATION, \
404 a1, a2, a3, a4, (int)offset, RExC_precomp, RExC_precomp + offset); \
405} STMT_END
9d1d55b5 406
8615cb43 407
cd439c50 408/* Allow for side effects in s */
ccb2c380
MP
409#define REGC(c,s) STMT_START { \
410 if (!SIZE_ONLY) *(s) = (c); else (void)(s); \
411} STMT_END
cd439c50 412
fac92740
MJD
413/* Macros for recording node offsets. 20001227 mjd@plover.com
414 * Nodes are numbered 1, 2, 3, 4. Node #n's position is recorded in
415 * element 2*n-1 of the array. Element #2n holds the byte length node #n.
416 * Element 0 holds the number n.
417 */
418
419#define MJD_OFFSET_DEBUG(x)
ccb2c380
MP
420/* #define MJD_OFFSET_DEBUG(x) Perl_warn_nocontext x */
421
422
423#define Set_Node_Offset_To_R(node,byte) STMT_START { \
424 if (! SIZE_ONLY) { \
425 MJD_OFFSET_DEBUG(("** (%d) offset of node %d is %d.\n", \
426 __LINE__, (node), (byte))); \
427 if((node) < 0) { \
428 Perl_croak(aTHX_ "value of node is %d in Offset macro", node); \
429 } else { \
430 RExC_offsets[2*(node)-1] = (byte); \
431 } \
432 } \
433} STMT_END
434
435#define Set_Node_Offset(node,byte) \
436 Set_Node_Offset_To_R((node)-RExC_emit_start, (byte)-RExC_start)
437#define Set_Cur_Node_Offset Set_Node_Offset(RExC_emit, RExC_parse)
438
439#define Set_Node_Length_To_R(node,len) STMT_START { \
440 if (! SIZE_ONLY) { \
441 MJD_OFFSET_DEBUG(("** (%d) size of node %d is %d.\n", \
442 __LINE__, (node), (len))); \
443 if((node) < 0) { \
444 Perl_croak(aTHX_ "value of node is %d in Length macro", node); \
445 } else { \
446 RExC_offsets[2*(node)] = (len); \
447 } \
448 } \
449} STMT_END
450
451#define Set_Node_Length(node,len) \
452 Set_Node_Length_To_R((node)-RExC_emit_start, len)
453#define Set_Cur_Node_Length(len) Set_Node_Length(RExC_emit, len)
454#define Set_Node_Cur_Length(node) \
455 Set_Node_Length(node, RExC_parse - parse_start)
fac92740
MJD
456
457/* Get offsets and lengths */
458#define Node_Offset(n) (RExC_offsets[2*((n)-RExC_emit_start)-1])
459#define Node_Length(n) (RExC_offsets[2*((n)-RExC_emit_start)])
460
acfe0abc 461static void clear_re(pTHX_ void *r);
4327152a 462
653099ff
GS
463/* Mark that we cannot extend a found fixed substring at this point.
464 Updata the longest found anchored substring and the longest found
465 floating substrings if needed. */
466
4327152a 467STATIC void
830247a4 468S_scan_commit(pTHX_ RExC_state_t *pRExC_state, scan_data_t *data)
c277df42 469{
a0ed51b3
LW
470 STRLEN l = CHR_SVLEN(data->last_found);
471 STRLEN old_l = CHR_SVLEN(*data->longest);
b81d288d 472
c277df42
IZ
473 if ((l >= old_l) && ((l > old_l) || (data->flags & SF_BEFORE_EOL))) {
474 sv_setsv(*data->longest, data->last_found);
475 if (*data->longest == data->longest_fixed) {
476 data->offset_fixed = l ? data->last_start_min : data->pos_min;
477 if (data->flags & SF_BEFORE_EOL)
b81d288d 478 data->flags
c277df42
IZ
479 |= ((data->flags & SF_BEFORE_EOL) << SF_FIX_SHIFT_EOL);
480 else
481 data->flags &= ~SF_FIX_BEFORE_EOL;
a0ed51b3
LW
482 }
483 else {
c277df42 484 data->offset_float_min = l ? data->last_start_min : data->pos_min;
b81d288d
AB
485 data->offset_float_max = (l
486 ? data->last_start_max
c277df42 487 : data->pos_min + data->pos_delta);
9051bda5
HS
488 if ((U32)data->offset_float_max > (U32)I32_MAX)
489 data->offset_float_max = I32_MAX;
c277df42 490 if (data->flags & SF_BEFORE_EOL)
b81d288d 491 data->flags
c277df42
IZ
492 |= ((data->flags & SF_BEFORE_EOL) << SF_FL_SHIFT_EOL);
493 else
494 data->flags &= ~SF_FL_BEFORE_EOL;
495 }
496 }
497 SvCUR_set(data->last_found, 0);
0eda9292
JH
498 {
499 SV * sv = data->last_found;
500 MAGIC *mg =
501 SvUTF8(sv) && SvMAGICAL(sv) ? mg_find(sv, PERL_MAGIC_utf8) : NULL;
502 if (mg && mg->mg_len > 0)
503 mg->mg_len = 0;
504 }
c277df42
IZ
505 data->last_end = -1;
506 data->flags &= ~SF_BEFORE_EOL;
507}
508
653099ff
GS
509/* Can match anything (initialization) */
510STATIC void
830247a4 511S_cl_anything(pTHX_ RExC_state_t *pRExC_state, struct regnode_charclass_class *cl)
653099ff 512{
653099ff 513 ANYOF_CLASS_ZERO(cl);
f8bef550 514 ANYOF_BITMAP_SETALL(cl);
1aa99e6b 515 cl->flags = ANYOF_EOS|ANYOF_UNICODE_ALL;
653099ff
GS
516 if (LOC)
517 cl->flags |= ANYOF_LOCALE;
518}
519
520/* Can match anything (initialization) */
521STATIC int
522S_cl_is_anything(pTHX_ struct regnode_charclass_class *cl)
523{
524 int value;
525
aaa51d5e 526 for (value = 0; value <= ANYOF_MAX; value += 2)
653099ff
GS
527 if (ANYOF_CLASS_TEST(cl, value) && ANYOF_CLASS_TEST(cl, value + 1))
528 return 1;
1aa99e6b
IH
529 if (!(cl->flags & ANYOF_UNICODE_ALL))
530 return 0;
f8bef550
NC
531 if (!ANYOF_BITMAP_TESTALLSET(cl))
532 return 0;
653099ff
GS
533 return 1;
534}
535
536/* Can match anything (initialization) */
537STATIC void
830247a4 538S_cl_init(pTHX_ RExC_state_t *pRExC_state, struct regnode_charclass_class *cl)
653099ff 539{
8ecf7187 540 Zero(cl, 1, struct regnode_charclass_class);
653099ff 541 cl->type = ANYOF;
830247a4 542 cl_anything(pRExC_state, cl);
653099ff
GS
543}
544
545STATIC void
830247a4 546S_cl_init_zero(pTHX_ RExC_state_t *pRExC_state, struct regnode_charclass_class *cl)
653099ff 547{
8ecf7187 548 Zero(cl, 1, struct regnode_charclass_class);
653099ff 549 cl->type = ANYOF;
830247a4 550 cl_anything(pRExC_state, cl);
653099ff
GS
551 if (LOC)
552 cl->flags |= ANYOF_LOCALE;
553}
554
555/* 'And' a given class with another one. Can create false positives */
556/* We assume that cl is not inverted */
557STATIC void
558S_cl_and(pTHX_ struct regnode_charclass_class *cl,
559 struct regnode_charclass_class *and_with)
560{
653099ff
GS
561 if (!(and_with->flags & ANYOF_CLASS)
562 && !(cl->flags & ANYOF_CLASS)
563 && (and_with->flags & ANYOF_LOCALE) == (cl->flags & ANYOF_LOCALE)
564 && !(and_with->flags & ANYOF_FOLD)
565 && !(cl->flags & ANYOF_FOLD)) {
566 int i;
567
568 if (and_with->flags & ANYOF_INVERT)
569 for (i = 0; i < ANYOF_BITMAP_SIZE; i++)
570 cl->bitmap[i] &= ~and_with->bitmap[i];
571 else
572 for (i = 0; i < ANYOF_BITMAP_SIZE; i++)
573 cl->bitmap[i] &= and_with->bitmap[i];
574 } /* XXXX: logic is complicated otherwise, leave it along for a moment. */
575 if (!(and_with->flags & ANYOF_EOS))
576 cl->flags &= ~ANYOF_EOS;
1aa99e6b
IH
577
578 if (cl->flags & ANYOF_UNICODE_ALL && and_with->flags & ANYOF_UNICODE) {
579 cl->flags &= ~ANYOF_UNICODE_ALL;
580 cl->flags |= ANYOF_UNICODE;
581 ARG_SET(cl, ARG(and_with));
582 }
583 if (!(and_with->flags & ANYOF_UNICODE_ALL))
584 cl->flags &= ~ANYOF_UNICODE_ALL;
585 if (!(and_with->flags & (ANYOF_UNICODE|ANYOF_UNICODE_ALL)))
586 cl->flags &= ~ANYOF_UNICODE;
653099ff
GS
587}
588
589/* 'OR' a given class with another one. Can create false positives */
590/* We assume that cl is not inverted */
591STATIC void
830247a4 592S_cl_or(pTHX_ RExC_state_t *pRExC_state, struct regnode_charclass_class *cl, struct regnode_charclass_class *or_with)
653099ff 593{
653099ff
GS
594 if (or_with->flags & ANYOF_INVERT) {
595 /* We do not use
596 * (B1 | CL1) | (!B2 & !CL2) = (B1 | !B2 & !CL2) | (CL1 | (!B2 & !CL2))
597 * <= (B1 | !B2) | (CL1 | !CL2)
598 * which is wasteful if CL2 is small, but we ignore CL2:
599 * (B1 | CL1) | (!B2 & !CL2) <= (B1 | CL1) | !B2 = (B1 | !B2) | CL1
600 * XXXX Can we handle case-fold? Unclear:
601 * (OK1(i) | OK1(i')) | !(OK1(i) | OK1(i')) =
602 * (OK1(i) | OK1(i')) | (!OK1(i) & !OK1(i'))
603 */
604 if ( (or_with->flags & ANYOF_LOCALE) == (cl->flags & ANYOF_LOCALE)
605 && !(or_with->flags & ANYOF_FOLD)
606 && !(cl->flags & ANYOF_FOLD) ) {
607 int i;
608
609 for (i = 0; i < ANYOF_BITMAP_SIZE; i++)
610 cl->bitmap[i] |= ~or_with->bitmap[i];
611 } /* XXXX: logic is complicated otherwise */
612 else {
830247a4 613 cl_anything(pRExC_state, cl);
653099ff
GS
614 }
615 } else {
616 /* (B1 | CL1) | (B2 | CL2) = (B1 | B2) | (CL1 | CL2)) */
617 if ( (or_with->flags & ANYOF_LOCALE) == (cl->flags & ANYOF_LOCALE)
b81d288d 618 && (!(or_with->flags & ANYOF_FOLD)
653099ff
GS
619 || (cl->flags & ANYOF_FOLD)) ) {
620 int i;
621
622 /* OR char bitmap and class bitmap separately */
623 for (i = 0; i < ANYOF_BITMAP_SIZE; i++)
624 cl->bitmap[i] |= or_with->bitmap[i];
625 if (or_with->flags & ANYOF_CLASS) {
626 for (i = 0; i < ANYOF_CLASSBITMAP_SIZE; i++)
627 cl->classflags[i] |= or_with->classflags[i];
628 cl->flags |= ANYOF_CLASS;
629 }
630 }
631 else { /* XXXX: logic is complicated, leave it along for a moment. */
830247a4 632 cl_anything(pRExC_state, cl);
653099ff
GS
633 }
634 }
635 if (or_with->flags & ANYOF_EOS)
636 cl->flags |= ANYOF_EOS;
1aa99e6b
IH
637
638 if (cl->flags & ANYOF_UNICODE && or_with->flags & ANYOF_UNICODE &&
639 ARG(cl) != ARG(or_with)) {
640 cl->flags |= ANYOF_UNICODE_ALL;
641 cl->flags &= ~ANYOF_UNICODE;
642 }
643 if (or_with->flags & ANYOF_UNICODE_ALL) {
644 cl->flags |= ANYOF_UNICODE_ALL;
645 cl->flags &= ~ANYOF_UNICODE;
646 }
653099ff
GS
647}
648
5d1c421c
JH
649/*
650 * There are strange code-generation bugs caused on sparc64 by gcc-2.95.2.
651 * These need to be revisited when a newer toolchain becomes available.
652 */
653#if defined(__sparc64__) && defined(__GNUC__)
654# if __GNUC__ < 2 || (__GNUC__ == 2 && __GNUC_MINOR__ < 96)
655# undef SPARC64_GCC_WORKAROUND
656# define SPARC64_GCC_WORKAROUND 1
657# endif
658#endif
659
653099ff
GS
660/* REx optimizer. Converts nodes into quickier variants "in place".
661 Finds fixed substrings. */
662
c277df42
IZ
663/* Stops at toplevel WHILEM as well as at `last'. At end *scanp is set
664 to the position after last scanned or to NULL. */
665
76e3520e 666STATIC I32
830247a4 667S_study_chunk(pTHX_ RExC_state_t *pRExC_state, regnode **scanp, I32 *deltap, regnode *last, scan_data_t *data, U32 flags)
c277df42
IZ
668 /* scanp: Start here (read-write). */
669 /* deltap: Write maxlen-minlen here. */
670 /* last: Stop before this one. */
671{
672 I32 min = 0, pars = 0, code;
673 regnode *scan = *scanp, *next;
674 I32 delta = 0;
675 int is_inf = (flags & SCF_DO_SUBSTR) && (data->flags & SF_IS_INF);
aca2d497 676 int is_inf_internal = 0; /* The studied chunk is infinite */
c277df42
IZ
677 I32 is_par = OP(scan) == OPEN ? ARG(scan) : 0;
678 scan_data_t data_fake;
653099ff 679 struct regnode_charclass_class and_with; /* Valid if flags & SCF_DO_STCLASS_OR */
b81d288d 680
c277df42
IZ
681 while (scan && OP(scan) != END && scan < last) {
682 /* Peephole optimizer: */
683
22c35a8c 684 if (PL_regkind[(U8)OP(scan)] == EXACT) {
653099ff 685 /* Merge several consecutive EXACTish nodes into one. */
c277df42
IZ
686 regnode *n = regnext(scan);
687 U32 stringok = 1;
688#ifdef DEBUGGING
689 regnode *stop = scan;
b81d288d 690#endif
c277df42 691
cd439c50 692 next = scan + NODE_SZ_STR(scan);
c277df42
IZ
693 /* Skip NOTHING, merge EXACT*. */
694 while (n &&
b81d288d 695 ( PL_regkind[(U8)OP(n)] == NOTHING ||
c277df42
IZ
696 (stringok && (OP(n) == OP(scan))))
697 && NEXT_OFF(n)
698 && NEXT_OFF(scan) + NEXT_OFF(n) < I16_MAX) {
699 if (OP(n) == TAIL || n > next)
700 stringok = 0;
22c35a8c 701 if (PL_regkind[(U8)OP(n)] == NOTHING) {
c277df42
IZ
702 NEXT_OFF(scan) += NEXT_OFF(n);
703 next = n + NODE_STEP_REGNODE;
704#ifdef DEBUGGING
705 if (stringok)
706 stop = n;
b81d288d 707#endif
c277df42 708 n = regnext(n);
a0ed51b3 709 }
f49d4d0f 710 else if (stringok) {
cd439c50 711 int oldl = STR_LEN(scan);
c277df42 712 regnode *nnext = regnext(n);
f49d4d0f 713
b81d288d 714 if (oldl + STR_LEN(n) > U8_MAX)
c277df42
IZ
715 break;
716 NEXT_OFF(scan) += NEXT_OFF(n);
cd439c50
IZ
717 STR_LEN(scan) += STR_LEN(n);
718 next = n + NODE_SZ_STR(n);
c277df42 719 /* Now we can overwrite *n : */
f49d4d0f 720 Move(STRING(n), STRING(scan) + oldl, STR_LEN(n), char);
c277df42 721#ifdef DEBUGGING
f49d4d0f 722 stop = next - 1;
b81d288d 723#endif
c277df42
IZ
724 n = nnext;
725 }
726 }
61a36c01 727
d65e4eab 728 if (UTF && OP(scan) == EXACTF && STR_LEN(scan) >= 6) {
61a36c01
JH
729/*
730 Two problematic code points in Unicode casefolding of EXACT nodes:
731
732 U+0390 - GREEK SMALL LETTER IOTA WITH DIALYTIKA AND TONOS
733 U+03B0 - GREEK SMALL LETTER UPSILON WITH DIALYTIKA AND TONOS
734
735 which casefold to
736
737 Unicode UTF-8
738
739 U+03B9 U+0308 U+0301 0xCE 0xB9 0xCC 0x88 0xCC 0x81
740 U+03C5 U+0308 U+0301 0xCF 0x85 0xCC 0x88 0xCC 0x81
741
742 This means that in case-insensitive matching (or "loose matching",
743 as Unicode calls it), an EXACTF of length six (the UTF-8 encoded byte
744 length of the above casefolded versions) can match a target string
745 of length two (the byte length of UTF-8 encoded U+0390 or U+03B0).
746 This would rather mess up the minimum length computation.
747
748 What we'll do is to look for the tail four bytes, and then peek
749 at the preceding two bytes to see whether we need to decrease
750 the minimum length by four (six minus two).
751
752 Thanks to the design of UTF-8, there cannot be false matches:
753 A sequence of valid UTF-8 bytes cannot be a subsequence of
754 another valid sequence of UTF-8 bytes.
755
756*/
757 char *s0 = STRING(scan), *s, *t;
758 char *s1 = s0 + STR_LEN(scan) - 1, *s2 = s1 - 4;
759 char *t0 = "\xcc\x88\xcc\x81";
760 char *t1 = t0 + 3;
761
762 for (s = s0 + 2;
763 s < s2 && (t = ninstr(s, s1, t0, t1));
764 s = t + 4) {
765 if (((U8)t[-1] == 0xB9 && (U8)t[-2] == 0xCE) ||
766 ((U8)t[-1] == 0x85 && (U8)t[-2] == 0xCF))
767 min -= 4;
768 }
769 }
770
c277df42
IZ
771#ifdef DEBUGGING
772 /* Allow dumping */
cd439c50 773 n = scan + NODE_SZ_STR(scan);
c277df42 774 while (n <= stop) {
22c35a8c 775 if (PL_regkind[(U8)OP(n)] != NOTHING || OP(n) == NOTHING) {
c277df42
IZ
776 OP(n) = OPTIMIZED;
777 NEXT_OFF(n) = 0;
778 }
779 n++;
780 }
653099ff 781#endif
c277df42 782 }
653099ff
GS
783 /* Follow the next-chain of the current node and optimize
784 away all the NOTHINGs from it. */
c277df42 785 if (OP(scan) != CURLYX) {
048cfca1
GS
786 int max = (reg_off_by_arg[OP(scan)]
787 ? I32_MAX
788 /* I32 may be smaller than U16 on CRAYs! */
789 : (I32_MAX < U16_MAX ? I32_MAX : U16_MAX));
c277df42
IZ
790 int off = (reg_off_by_arg[OP(scan)] ? ARG(scan) : NEXT_OFF(scan));
791 int noff;
792 regnode *n = scan;
b81d288d 793
c277df42
IZ
794 /* Skip NOTHING and LONGJMP. */
795 while ((n = regnext(n))
22c35a8c 796 && ((PL_regkind[(U8)OP(n)] == NOTHING && (noff = NEXT_OFF(n)))
c277df42
IZ
797 || ((OP(n) == LONGJMP) && (noff = ARG(n))))
798 && off + noff < max)
799 off += noff;
800 if (reg_off_by_arg[OP(scan)])
801 ARG(scan) = off;
b81d288d 802 else
c277df42
IZ
803 NEXT_OFF(scan) = off;
804 }
653099ff
GS
805 /* The principal pseudo-switch. Cannot be a switch, since we
806 look into several different things. */
b81d288d 807 if (OP(scan) == BRANCH || OP(scan) == BRANCHJ
c277df42
IZ
808 || OP(scan) == IFTHEN || OP(scan) == SUSPEND) {
809 next = regnext(scan);
810 code = OP(scan);
b81d288d
AB
811
812 if (OP(next) == code || code == IFTHEN || code == SUSPEND) {
c277df42 813 I32 max1 = 0, min1 = I32_MAX, num = 0;
653099ff 814 struct regnode_charclass_class accum;
c277df42 815
653099ff 816 if (flags & SCF_DO_SUBSTR) /* XXXX Add !SUSPEND? */
830247a4 817 scan_commit(pRExC_state, data); /* Cannot merge strings after this. */
653099ff 818 if (flags & SCF_DO_STCLASS)
830247a4 819 cl_init_zero(pRExC_state, &accum);
c277df42 820 while (OP(scan) == code) {
830247a4 821 I32 deltanext, minnext, f = 0, fake;
653099ff 822 struct regnode_charclass_class this_class;
c277df42
IZ
823
824 num++;
825 data_fake.flags = 0;
b81d288d 826 if (data) {
2c2d71f5 827 data_fake.whilem_c = data->whilem_c;
cb434fcc
IZ
828 data_fake.last_closep = data->last_closep;
829 }
830 else
831 data_fake.last_closep = &fake;
c277df42
IZ
832 next = regnext(scan);
833 scan = NEXTOPER(scan);
834 if (code != BRANCH)
835 scan = NEXTOPER(scan);
653099ff 836 if (flags & SCF_DO_STCLASS) {
830247a4 837 cl_init(pRExC_state, &this_class);
653099ff
GS
838 data_fake.start_class = &this_class;
839 f = SCF_DO_STCLASS_AND;
b81d288d 840 }
e1901655
IZ
841 if (flags & SCF_WHILEM_VISITED_POS)
842 f |= SCF_WHILEM_VISITED_POS;
653099ff 843 /* we suppose the run is continuous, last=next...*/
830247a4
IZ
844 minnext = study_chunk(pRExC_state, &scan, &deltanext,
845 next, &data_fake, f);
b81d288d 846 if (min1 > minnext)
c277df42
IZ
847 min1 = minnext;
848 if (max1 < minnext + deltanext)
849 max1 = minnext + deltanext;
850 if (deltanext == I32_MAX)
aca2d497 851 is_inf = is_inf_internal = 1;
c277df42
IZ
852 scan = next;
853 if (data_fake.flags & (SF_HAS_PAR|SF_IN_PAR))
854 pars++;
405ff068 855 if (data && (data_fake.flags & SF_HAS_EVAL))
c277df42 856 data->flags |= SF_HAS_EVAL;
2c2d71f5
JH
857 if (data)
858 data->whilem_c = data_fake.whilem_c;
653099ff 859 if (flags & SCF_DO_STCLASS)
830247a4 860 cl_or(pRExC_state, &accum, &this_class);
b81d288d 861 if (code == SUSPEND)
c277df42
IZ
862 break;
863 }
864 if (code == IFTHEN && num < 2) /* Empty ELSE branch */
865 min1 = 0;
866 if (flags & SCF_DO_SUBSTR) {
867 data->pos_min += min1;
868 data->pos_delta += max1 - min1;
869 if (max1 != min1 || is_inf)
870 data->longest = &(data->longest_float);
871 }
872 min += min1;
873 delta += max1 - min1;
653099ff 874 if (flags & SCF_DO_STCLASS_OR) {
830247a4 875 cl_or(pRExC_state, data->start_class, &accum);
653099ff
GS
876 if (min1) {
877 cl_and(data->start_class, &and_with);
878 flags &= ~SCF_DO_STCLASS;
879 }
880 }
881 else if (flags & SCF_DO_STCLASS_AND) {
de0c8cb8
GS
882 if (min1) {
883 cl_and(data->start_class, &accum);
653099ff 884 flags &= ~SCF_DO_STCLASS;
de0c8cb8
GS
885 }
886 else {
b81d288d 887 /* Switch to OR mode: cache the old value of
de0c8cb8
GS
888 * data->start_class */
889 StructCopy(data->start_class, &and_with,
890 struct regnode_charclass_class);
891 flags &= ~SCF_DO_STCLASS_AND;
892 StructCopy(&accum, data->start_class,
893 struct regnode_charclass_class);
894 flags |= SCF_DO_STCLASS_OR;
895 data->start_class->flags |= ANYOF_EOS;
896 }
653099ff 897 }
a0ed51b3
LW
898 }
899 else if (code == BRANCHJ) /* single branch is optimized. */
c277df42
IZ
900 scan = NEXTOPER(NEXTOPER(scan));
901 else /* single branch is optimized. */
902 scan = NEXTOPER(scan);
903 continue;
a0ed51b3
LW
904 }
905 else if (OP(scan) == EXACT) {
cd439c50 906 I32 l = STR_LEN(scan);
1aa99e6b 907 UV uc = *((U8*)STRING(scan));
a0ed51b3 908 if (UTF) {
1aa99e6b
IH
909 U8 *s = (U8*)STRING(scan);
910 l = utf8_length(s, s + l);
9041c2e3 911 uc = utf8_to_uvchr(s, NULL);
a0ed51b3
LW
912 }
913 min += l;
c277df42 914 if (flags & SCF_DO_SUBSTR) { /* Update longest substr. */
c277df42
IZ
915 /* The code below prefers earlier match for fixed
916 offset, later match for variable offset. */
917 if (data->last_end == -1) { /* Update the start info. */
918 data->last_start_min = data->pos_min;
919 data->last_start_max = is_inf
b81d288d 920 ? I32_MAX : data->pos_min + data->pos_delta;
c277df42 921 }
cd439c50 922 sv_catpvn(data->last_found, STRING(scan), STR_LEN(scan));
0eda9292
JH
923 {
924 SV * sv = data->last_found;
925 MAGIC *mg = SvUTF8(sv) && SvMAGICAL(sv) ?
926 mg_find(sv, PERL_MAGIC_utf8) : NULL;
927 if (mg && mg->mg_len >= 0)
5e43f467
JH
928 mg->mg_len += utf8_length((U8*)STRING(scan),
929 (U8*)STRING(scan)+STR_LEN(scan));
0eda9292 930 }
33b8afdf
JH
931 if (UTF)
932 SvUTF8_on(data->last_found);
c277df42
IZ
933 data->last_end = data->pos_min + l;
934 data->pos_min += l; /* As in the first entry. */
935 data->flags &= ~SF_BEFORE_EOL;
936 }
653099ff
GS
937 if (flags & SCF_DO_STCLASS_AND) {
938 /* Check whether it is compatible with what we know already! */
939 int compat = 1;
940
1aa99e6b 941 if (uc >= 0x100 ||
516a5887 942 (!(data->start_class->flags & (ANYOF_CLASS | ANYOF_LOCALE))
1aa99e6b 943 && !ANYOF_BITMAP_TEST(data->start_class, uc)
653099ff 944 && (!(data->start_class->flags & ANYOF_FOLD)
1aa99e6b 945 || !ANYOF_BITMAP_TEST(data->start_class, PL_fold[uc])))
516a5887 946 )
653099ff
GS
947 compat = 0;
948 ANYOF_CLASS_ZERO(data->start_class);
949 ANYOF_BITMAP_ZERO(data->start_class);
950 if (compat)
1aa99e6b 951 ANYOF_BITMAP_SET(data->start_class, uc);
653099ff 952 data->start_class->flags &= ~ANYOF_EOS;
9b877dbb
IH
953 if (uc < 0x100)
954 data->start_class->flags &= ~ANYOF_UNICODE_ALL;
653099ff
GS
955 }
956 else if (flags & SCF_DO_STCLASS_OR) {
957 /* false positive possible if the class is case-folded */
1aa99e6b 958 if (uc < 0x100)
9b877dbb
IH
959 ANYOF_BITMAP_SET(data->start_class, uc);
960 else
961 data->start_class->flags |= ANYOF_UNICODE_ALL;
653099ff
GS
962 data->start_class->flags &= ~ANYOF_EOS;
963 cl_and(data->start_class, &and_with);
964 }
965 flags &= ~SCF_DO_STCLASS;
a0ed51b3 966 }
653099ff 967 else if (PL_regkind[(U8)OP(scan)] == EXACT) { /* But OP != EXACT! */
cd439c50 968 I32 l = STR_LEN(scan);
1aa99e6b 969 UV uc = *((U8*)STRING(scan));
653099ff
GS
970
971 /* Search for fixed substrings supports EXACT only. */
b81d288d 972 if (flags & SCF_DO_SUBSTR)
830247a4 973 scan_commit(pRExC_state, data);
a0ed51b3 974 if (UTF) {
1aa99e6b
IH
975 U8 *s = (U8 *)STRING(scan);
976 l = utf8_length(s, s + l);
9041c2e3 977 uc = utf8_to_uvchr(s, NULL);
a0ed51b3
LW
978 }
979 min += l;
c277df42 980 if (data && (flags & SCF_DO_SUBSTR))
a0ed51b3 981 data->pos_min += l;
653099ff
GS
982 if (flags & SCF_DO_STCLASS_AND) {
983 /* Check whether it is compatible with what we know already! */
984 int compat = 1;
985
1aa99e6b 986 if (uc >= 0x100 ||
516a5887 987 (!(data->start_class->flags & (ANYOF_CLASS | ANYOF_LOCALE))
1aa99e6b 988 && !ANYOF_BITMAP_TEST(data->start_class, uc)
516a5887 989 && !ANYOF_BITMAP_TEST(data->start_class, PL_fold[uc])))
653099ff
GS
990 compat = 0;
991 ANYOF_CLASS_ZERO(data->start_class);
992 ANYOF_BITMAP_ZERO(data->start_class);
993 if (compat) {
1aa99e6b 994 ANYOF_BITMAP_SET(data->start_class, uc);
653099ff
GS
995 data->start_class->flags &= ~ANYOF_EOS;
996 data->start_class->flags |= ANYOF_FOLD;
997 if (OP(scan) == EXACTFL)
998 data->start_class->flags |= ANYOF_LOCALE;
999 }
1000 }
1001 else if (flags & SCF_DO_STCLASS_OR) {
1002 if (data->start_class->flags & ANYOF_FOLD) {
1003 /* false positive possible if the class is case-folded.
1004 Assume that the locale settings are the same... */
1aa99e6b
IH
1005 if (uc < 0x100)
1006 ANYOF_BITMAP_SET(data->start_class, uc);
653099ff
GS
1007 data->start_class->flags &= ~ANYOF_EOS;
1008 }
1009 cl_and(data->start_class, &and_with);
1010 }
1011 flags &= ~SCF_DO_STCLASS;
a0ed51b3 1012 }
4d61ec05 1013 else if (strchr((char*)PL_varies,OP(scan))) {
9c5ffd7c 1014 I32 mincount, maxcount, minnext, deltanext, fl = 0;
aa7a4b56 1015 I32 f = flags, pos_before = 0;
c277df42 1016 regnode *oscan = scan;
653099ff
GS
1017 struct regnode_charclass_class this_class;
1018 struct regnode_charclass_class *oclass = NULL;
727f22e3 1019 I32 next_is_eval = 0;
653099ff 1020
22c35a8c 1021 switch (PL_regkind[(U8)OP(scan)]) {
653099ff 1022 case WHILEM: /* End of (?:...)* . */
c277df42
IZ
1023 scan = NEXTOPER(scan);
1024 goto finish;
1025 case PLUS:
653099ff 1026 if (flags & (SCF_DO_SUBSTR | SCF_DO_STCLASS)) {
c277df42 1027 next = NEXTOPER(scan);
653099ff 1028 if (OP(next) == EXACT || (flags & SCF_DO_STCLASS)) {
b81d288d
AB
1029 mincount = 1;
1030 maxcount = REG_INFTY;
c277df42
IZ
1031 next = regnext(scan);
1032 scan = NEXTOPER(scan);
1033 goto do_curly;
1034 }
1035 }
1036 if (flags & SCF_DO_SUBSTR)
1037 data->pos_min++;
1038 min++;
1039 /* Fall through. */
1040 case STAR:
653099ff
GS
1041 if (flags & SCF_DO_STCLASS) {
1042 mincount = 0;
b81d288d 1043 maxcount = REG_INFTY;
653099ff
GS
1044 next = regnext(scan);
1045 scan = NEXTOPER(scan);
1046 goto do_curly;
1047 }
b81d288d 1048 is_inf = is_inf_internal = 1;
c277df42
IZ
1049 scan = regnext(scan);
1050 if (flags & SCF_DO_SUBSTR) {
830247a4 1051 scan_commit(pRExC_state, data); /* Cannot extend fixed substrings */
c277df42
IZ
1052 data->longest = &(data->longest_float);
1053 }
1054 goto optimize_curly_tail;
1055 case CURLY:
b81d288d 1056 mincount = ARG1(scan);
c277df42
IZ
1057 maxcount = ARG2(scan);
1058 next = regnext(scan);
cb434fcc
IZ
1059 if (OP(scan) == CURLYX) {
1060 I32 lp = (data ? *(data->last_closep) : 0);
1061
1062 scan->flags = ((lp <= U8_MAX) ? lp : U8_MAX);
1063 }
c277df42 1064 scan = NEXTOPER(scan) + EXTRA_STEP_2ARGS;
727f22e3 1065 next_is_eval = (OP(scan) == EVAL);
c277df42
IZ
1066 do_curly:
1067 if (flags & SCF_DO_SUBSTR) {
830247a4 1068 if (mincount == 0) scan_commit(pRExC_state,data); /* Cannot extend fixed substrings */
c277df42
IZ
1069 pos_before = data->pos_min;
1070 }
1071 if (data) {
1072 fl = data->flags;
1073 data->flags &= ~(SF_HAS_PAR|SF_IN_PAR|SF_HAS_EVAL);
1074 if (is_inf)
1075 data->flags |= SF_IS_INF;
1076 }
653099ff 1077 if (flags & SCF_DO_STCLASS) {
830247a4 1078 cl_init(pRExC_state, &this_class);
653099ff
GS
1079 oclass = data->start_class;
1080 data->start_class = &this_class;
1081 f |= SCF_DO_STCLASS_AND;
1082 f &= ~SCF_DO_STCLASS_OR;
1083 }
e1901655
IZ
1084 /* These are the cases when once a subexpression
1085 fails at a particular position, it cannot succeed
1086 even after backtracking at the enclosing scope.
b81d288d 1087
e1901655
IZ
1088 XXXX what if minimal match and we are at the
1089 initial run of {n,m}? */
1090 if ((mincount != maxcount - 1) && (maxcount != REG_INFTY))
1091 f &= ~SCF_WHILEM_VISITED_POS;
653099ff 1092
c277df42 1093 /* This will finish on WHILEM, setting scan, or on NULL: */
b81d288d
AB
1094 minnext = study_chunk(pRExC_state, &scan, &deltanext, last, data,
1095 mincount == 0
653099ff
GS
1096 ? (f & ~SCF_DO_SUBSTR) : f);
1097
1098 if (flags & SCF_DO_STCLASS)
1099 data->start_class = oclass;
1100 if (mincount == 0 || minnext == 0) {
1101 if (flags & SCF_DO_STCLASS_OR) {
830247a4 1102 cl_or(pRExC_state, data->start_class, &this_class);
653099ff
GS
1103 }
1104 else if (flags & SCF_DO_STCLASS_AND) {
b81d288d 1105 /* Switch to OR mode: cache the old value of
653099ff
GS
1106 * data->start_class */
1107 StructCopy(data->start_class, &and_with,
1108 struct regnode_charclass_class);
1109 flags &= ~SCF_DO_STCLASS_AND;
1110 StructCopy(&this_class, data->start_class,
1111 struct regnode_charclass_class);
1112 flags |= SCF_DO_STCLASS_OR;
1113 data->start_class->flags |= ANYOF_EOS;
1114 }
1115 } else { /* Non-zero len */
1116 if (flags & SCF_DO_STCLASS_OR) {
830247a4 1117 cl_or(pRExC_state, data->start_class, &this_class);
653099ff
GS
1118 cl_and(data->start_class, &and_with);
1119 }
1120 else if (flags & SCF_DO_STCLASS_AND)
1121 cl_and(data->start_class, &this_class);
1122 flags &= ~SCF_DO_STCLASS;
1123 }
c277df42
IZ
1124 if (!scan) /* It was not CURLYX, but CURLY. */
1125 scan = next;
84037bb0 1126 if (ckWARN(WARN_REGEXP)
727f22e3
JP
1127 /* ? quantifier ok, except for (?{ ... }) */
1128 && (next_is_eval || !(mincount == 0 && maxcount == 1))
84037bb0 1129 && (minnext == 0) && (deltanext == 0)
99799961 1130 && data && !(data->flags & (SF_HAS_PAR|SF_IN_PAR))
17feb5d5 1131 && maxcount <= REG_INFTY/3) /* Complement check for big count */
b45f050a 1132 {
830247a4 1133 vWARN(RExC_parse,
b45f050a
JF
1134 "Quantifier unexpected on zero-length expression");
1135 }
1136
c277df42 1137 min += minnext * mincount;
b81d288d 1138 is_inf_internal |= ((maxcount == REG_INFTY
155aba94
GS
1139 && (minnext + deltanext) > 0)
1140 || deltanext == I32_MAX);
aca2d497 1141 is_inf |= is_inf_internal;
c277df42
IZ
1142 delta += (minnext + deltanext) * maxcount - minnext * mincount;
1143
1144 /* Try powerful optimization CURLYX => CURLYN. */
b81d288d 1145 if ( OP(oscan) == CURLYX && data
c277df42
IZ
1146 && data->flags & SF_IN_PAR
1147 && !(data->flags & SF_HAS_EVAL)
1148 && !deltanext && minnext == 1 ) {
1149 /* Try to optimize to CURLYN. */
1150 regnode *nxt = NEXTOPER(oscan) + EXTRA_STEP_2ARGS;
497b47a8
JH
1151 regnode *nxt1 = nxt;
1152#ifdef DEBUGGING
1153 regnode *nxt2;
1154#endif
c277df42
IZ
1155
1156 /* Skip open. */
1157 nxt = regnext(nxt);
4d61ec05 1158 if (!strchr((char*)PL_simple,OP(nxt))
22c35a8c 1159 && !(PL_regkind[(U8)OP(nxt)] == EXACT
b81d288d 1160 && STR_LEN(nxt) == 1))
c277df42 1161 goto nogo;
497b47a8 1162#ifdef DEBUGGING
c277df42 1163 nxt2 = nxt;
497b47a8 1164#endif
c277df42 1165 nxt = regnext(nxt);
b81d288d 1166 if (OP(nxt) != CLOSE)
c277df42
IZ
1167 goto nogo;
1168 /* Now we know that nxt2 is the only contents: */
eb160463 1169 oscan->flags = (U8)ARG(nxt);
c277df42
IZ
1170 OP(oscan) = CURLYN;
1171 OP(nxt1) = NOTHING; /* was OPEN. */
1172#ifdef DEBUGGING
1173 OP(nxt1 + 1) = OPTIMIZED; /* was count. */
1174 NEXT_OFF(nxt1+ 1) = 0; /* just for consistancy. */
1175 NEXT_OFF(nxt2) = 0; /* just for consistancy with CURLY. */
1176 OP(nxt) = OPTIMIZED; /* was CLOSE. */
1177 OP(nxt + 1) = OPTIMIZED; /* was count. */
1178 NEXT_OFF(nxt+ 1) = 0; /* just for consistancy. */
b81d288d 1179#endif
c277df42 1180 }
c277df42
IZ
1181 nogo:
1182
1183 /* Try optimization CURLYX => CURLYM. */
b81d288d 1184 if ( OP(oscan) == CURLYX && data
c277df42 1185 && !(data->flags & SF_HAS_PAR)
c277df42
IZ
1186 && !(data->flags & SF_HAS_EVAL)
1187 && !deltanext ) {
1188 /* XXXX How to optimize if data == 0? */
1189 /* Optimize to a simpler form. */
1190 regnode *nxt = NEXTOPER(oscan) + EXTRA_STEP_2ARGS; /* OPEN */
1191 regnode *nxt2;
1192
1193 OP(oscan) = CURLYM;
1194 while ( (nxt2 = regnext(nxt)) /* skip over embedded stuff*/
b81d288d 1195 && (OP(nxt2) != WHILEM))
c277df42
IZ
1196 nxt = nxt2;
1197 OP(nxt2) = SUCCEED; /* Whas WHILEM */
c277df42
IZ
1198 /* Need to optimize away parenths. */
1199 if (data->flags & SF_IN_PAR) {
1200 /* Set the parenth number. */
1201 regnode *nxt1 = NEXTOPER(oscan) + EXTRA_STEP_2ARGS; /* OPEN*/
1202
b81d288d 1203 if (OP(nxt) != CLOSE)
b45f050a 1204 FAIL("Panic opt close");
eb160463 1205 oscan->flags = (U8)ARG(nxt);
c277df42
IZ
1206 OP(nxt1) = OPTIMIZED; /* was OPEN. */
1207 OP(nxt) = OPTIMIZED; /* was CLOSE. */
1208#ifdef DEBUGGING
1209 OP(nxt1 + 1) = OPTIMIZED; /* was count. */
1210 OP(nxt + 1) = OPTIMIZED; /* was count. */
1211 NEXT_OFF(nxt1 + 1) = 0; /* just for consistancy. */
1212 NEXT_OFF(nxt + 1) = 0; /* just for consistancy. */
b81d288d 1213#endif
c277df42
IZ
1214#if 0
1215 while ( nxt1 && (OP(nxt1) != WHILEM)) {
1216 regnode *nnxt = regnext(nxt1);
b81d288d 1217
c277df42
IZ
1218 if (nnxt == nxt) {
1219 if (reg_off_by_arg[OP(nxt1)])
1220 ARG_SET(nxt1, nxt2 - nxt1);
1221 else if (nxt2 - nxt1 < U16_MAX)
1222 NEXT_OFF(nxt1) = nxt2 - nxt1;
1223 else
1224 OP(nxt) = NOTHING; /* Cannot beautify */
1225 }
1226 nxt1 = nnxt;
1227 }
1228#endif
1229 /* Optimize again: */
b81d288d 1230 study_chunk(pRExC_state, &nxt1, &deltanext, nxt,
e1901655 1231 NULL, 0);
a0ed51b3
LW
1232 }
1233 else
c277df42 1234 oscan->flags = 0;
c277df42 1235 }
e1901655
IZ
1236 else if ((OP(oscan) == CURLYX)
1237 && (flags & SCF_WHILEM_VISITED_POS)
1238 /* See the comment on a similar expression above.
1239 However, this time it not a subexpression
1240 we care about, but the expression itself. */
1241 && (maxcount == REG_INFTY)
1242 && data && ++data->whilem_c < 16) {
1243 /* This stays as CURLYX, we can put the count/of pair. */
2c2d71f5
JH
1244 /* Find WHILEM (as in regexec.c) */
1245 regnode *nxt = oscan + NEXT_OFF(oscan);
1246
1247 if (OP(PREVOPER(nxt)) == NOTHING) /* LONGJMP */
1248 nxt += ARG(nxt);
eb160463
GS
1249 PREVOPER(nxt)->flags = (U8)(data->whilem_c
1250 | (RExC_whilem_seen << 4)); /* On WHILEM */
2c2d71f5 1251 }
b81d288d 1252 if (data && fl & (SF_HAS_PAR|SF_IN_PAR))
c277df42
IZ
1253 pars++;
1254 if (flags & SCF_DO_SUBSTR) {
1255 SV *last_str = Nullsv;
1256 int counted = mincount != 0;
1257
1258 if (data->last_end > 0 && mincount != 0) { /* Ends with a string. */
5d1c421c
JH
1259#if defined(SPARC64_GCC_WORKAROUND)
1260 I32 b = 0;
1261 STRLEN l = 0;
1262 char *s = NULL;
1263 I32 old = 0;
1264
1265 if (pos_before >= data->last_start_min)
1266 b = pos_before;
1267 else
1268 b = data->last_start_min;
1269
1270 l = 0;
1271 s = SvPV(data->last_found, l);
1272 old = b - data->last_start_min;
1273
1274#else
b81d288d 1275 I32 b = pos_before >= data->last_start_min
c277df42
IZ
1276 ? pos_before : data->last_start_min;
1277 STRLEN l;
1278 char *s = SvPV(data->last_found, l);
a0ed51b3 1279 I32 old = b - data->last_start_min;
5d1c421c 1280#endif
a0ed51b3
LW
1281
1282 if (UTF)
1283 old = utf8_hop((U8*)s, old) - (U8*)s;
c277df42 1284
a0ed51b3 1285 l -= old;
c277df42 1286 /* Get the added string: */
79cb57f6 1287 last_str = newSVpvn(s + old, l);
0e933229
IH
1288 if (UTF)
1289 SvUTF8_on(last_str);
c277df42
IZ
1290 if (deltanext == 0 && pos_before == b) {
1291 /* What was added is a constant string */
1292 if (mincount > 1) {
1293 SvGROW(last_str, (mincount * l) + 1);
b81d288d 1294 repeatcpy(SvPVX(last_str) + l,
c277df42
IZ
1295 SvPVX(last_str), l, mincount - 1);
1296 SvCUR(last_str) *= mincount;
1297 /* Add additional parts. */
b81d288d 1298 SvCUR_set(data->last_found,
c277df42
IZ
1299 SvCUR(data->last_found) - l);
1300 sv_catsv(data->last_found, last_str);
0eda9292
JH
1301 {
1302 SV * sv = data->last_found;
1303 MAGIC *mg =
1304 SvUTF8(sv) && SvMAGICAL(sv) ?
1305 mg_find(sv, PERL_MAGIC_utf8) : NULL;
1306 if (mg && mg->mg_len >= 0)
1307 mg->mg_len += CHR_SVLEN(last_str);
1308 }
c277df42
IZ
1309 data->last_end += l * (mincount - 1);
1310 }
2a8d9689
HS
1311 } else {
1312 /* start offset must point into the last copy */
1313 data->last_start_min += minnext * (mincount - 1);
c152dc43
HS
1314 data->last_start_max += is_inf ? I32_MAX
1315 : (maxcount - 1) * (minnext + data->pos_delta);
c277df42
IZ
1316 }
1317 }
1318 /* It is counted once already... */
1319 data->pos_min += minnext * (mincount - counted);
1320 data->pos_delta += - counted * deltanext +
1321 (minnext + deltanext) * maxcount - minnext * mincount;
1322 if (mincount != maxcount) {
653099ff
GS
1323 /* Cannot extend fixed substrings found inside
1324 the group. */
830247a4 1325 scan_commit(pRExC_state,data);
c277df42
IZ
1326 if (mincount && last_str) {
1327 sv_setsv(data->last_found, last_str);
1328 data->last_end = data->pos_min;
b81d288d 1329 data->last_start_min =
a0ed51b3 1330 data->pos_min - CHR_SVLEN(last_str);
b81d288d
AB
1331 data->last_start_max = is_inf
1332 ? I32_MAX
c277df42 1333 : data->pos_min + data->pos_delta
a0ed51b3 1334 - CHR_SVLEN(last_str);
c277df42
IZ
1335 }
1336 data->longest = &(data->longest_float);
1337 }
aca2d497 1338 SvREFCNT_dec(last_str);
c277df42 1339 }
405ff068 1340 if (data && (fl & SF_HAS_EVAL))
c277df42
IZ
1341 data->flags |= SF_HAS_EVAL;
1342 optimize_curly_tail:
c277df42 1343 if (OP(oscan) != CURLYX) {
22c35a8c 1344 while (PL_regkind[(U8)OP(next = regnext(oscan))] == NOTHING
c277df42
IZ
1345 && NEXT_OFF(next))
1346 NEXT_OFF(oscan) += NEXT_OFF(next);
1347 }
c277df42 1348 continue;
653099ff 1349 default: /* REF and CLUMP only? */
c277df42 1350 if (flags & SCF_DO_SUBSTR) {
830247a4 1351 scan_commit(pRExC_state,data); /* Cannot expect anything... */
c277df42
IZ
1352 data->longest = &(data->longest_float);
1353 }
aca2d497 1354 is_inf = is_inf_internal = 1;
653099ff 1355 if (flags & SCF_DO_STCLASS_OR)
830247a4 1356 cl_anything(pRExC_state, data->start_class);
653099ff 1357 flags &= ~SCF_DO_STCLASS;
c277df42
IZ
1358 break;
1359 }
a0ed51b3 1360 }
ffc61ed2 1361 else if (strchr((char*)PL_simple,OP(scan))) {
9c5ffd7c 1362 int value = 0;
653099ff 1363
c277df42 1364 if (flags & SCF_DO_SUBSTR) {
830247a4 1365 scan_commit(pRExC_state,data);
c277df42
IZ
1366 data->pos_min++;
1367 }
1368 min++;
653099ff
GS
1369 if (flags & SCF_DO_STCLASS) {
1370 data->start_class->flags &= ~ANYOF_EOS; /* No match on empty */
1371
1372 /* Some of the logic below assumes that switching
1373 locale on will only add false positives. */
1374 switch (PL_regkind[(U8)OP(scan)]) {
653099ff 1375 case SANY:
653099ff
GS
1376 default:
1377 do_default:
1378 /* Perl_croak(aTHX_ "panic: unexpected simple REx opcode %d", OP(scan)); */
1379 if (flags & SCF_DO_STCLASS_OR) /* Allow everything */
830247a4 1380 cl_anything(pRExC_state, data->start_class);
653099ff
GS
1381 break;
1382 case REG_ANY:
1383 if (OP(scan) == SANY)
1384 goto do_default;
1385 if (flags & SCF_DO_STCLASS_OR) { /* Everything but \n */
1386 value = (ANYOF_BITMAP_TEST(data->start_class,'\n')
1387 || (data->start_class->flags & ANYOF_CLASS));
830247a4 1388 cl_anything(pRExC_state, data->start_class);
653099ff
GS
1389 }
1390 if (flags & SCF_DO_STCLASS_AND || !value)
1391 ANYOF_BITMAP_CLEAR(data->start_class,'\n');
1392 break;
1393 case ANYOF:
1394 if (flags & SCF_DO_STCLASS_AND)
1395 cl_and(data->start_class,
1396 (struct regnode_charclass_class*)scan);
1397 else
830247a4 1398 cl_or(pRExC_state, data->start_class,
653099ff
GS
1399 (struct regnode_charclass_class*)scan);
1400 break;
1401 case ALNUM:
1402 if (flags & SCF_DO_STCLASS_AND) {
1403 if (!(data->start_class->flags & ANYOF_LOCALE)) {
1404 ANYOF_CLASS_CLEAR(data->start_class,ANYOF_NALNUM);
1405 for (value = 0; value < 256; value++)
1406 if (!isALNUM(value))
1407 ANYOF_BITMAP_CLEAR(data->start_class, value);
1408 }
1409 }
1410 else {
1411 if (data->start_class->flags & ANYOF_LOCALE)
1412 ANYOF_CLASS_SET(data->start_class,ANYOF_ALNUM);
1413 else {
1414 for (value = 0; value < 256; value++)
1415 if (isALNUM(value))
b81d288d 1416 ANYOF_BITMAP_SET(data->start_class, value);
653099ff
GS
1417 }
1418 }
1419 break;
1420 case ALNUML:
1421 if (flags & SCF_DO_STCLASS_AND) {
1422 if (data->start_class->flags & ANYOF_LOCALE)
1423 ANYOF_CLASS_CLEAR(data->start_class,ANYOF_NALNUM);
1424 }
1425 else {
1426 ANYOF_CLASS_SET(data->start_class,ANYOF_ALNUM);
1427 data->start_class->flags |= ANYOF_LOCALE;
1428 }
1429 break;
1430 case NALNUM:
1431 if (flags & SCF_DO_STCLASS_AND) {
1432 if (!(data->start_class->flags & ANYOF_LOCALE)) {
1433 ANYOF_CLASS_CLEAR(data->start_class,ANYOF_ALNUM);
1434 for (value = 0; value < 256; value++)
1435 if (isALNUM(value))
1436 ANYOF_BITMAP_CLEAR(data->start_class, value);
1437 }
1438 }
1439 else {
1440 if (data->start_class->flags & ANYOF_LOCALE)
1441 ANYOF_CLASS_SET(data->start_class,ANYOF_NALNUM);
1442 else {
1443 for (value = 0; value < 256; value++)
1444 if (!isALNUM(value))
b81d288d 1445 ANYOF_BITMAP_SET(data->start_class, value);
653099ff
GS
1446 }
1447 }
1448 break;
1449 case NALNUML:
1450 if (flags & SCF_DO_STCLASS_AND) {
1451 if (data->start_class->flags & ANYOF_LOCALE)
1452 ANYOF_CLASS_CLEAR(data->start_class,ANYOF_ALNUM);
1453 }
1454 else {
1455 data->start_class->flags |= ANYOF_LOCALE;
1456 ANYOF_CLASS_SET(data->start_class,ANYOF_NALNUM);
1457 }
1458 break;
1459 case SPACE:
1460 if (flags & SCF_DO_STCLASS_AND) {
1461 if (!(data->start_class->flags & ANYOF_LOCALE)) {
1462 ANYOF_CLASS_CLEAR(data->start_class,ANYOF_NSPACE);
1463 for (value = 0; value < 256; value++)
1464 if (!isSPACE(value))
1465 ANYOF_BITMAP_CLEAR(data->start_class, value);
1466 }
1467 }
1468 else {
1469 if (data->start_class->flags & ANYOF_LOCALE)
1470 ANYOF_CLASS_SET(data->start_class,ANYOF_SPACE);
1471 else {
1472 for (value = 0; value < 256; value++)
1473 if (isSPACE(value))
b81d288d 1474 ANYOF_BITMAP_SET(data->start_class, value);
653099ff
GS
1475 }
1476 }
1477 break;
1478 case SPACEL:
1479 if (flags & SCF_DO_STCLASS_AND) {
1480 if (data->start_class->flags & ANYOF_LOCALE)
1481 ANYOF_CLASS_CLEAR(data->start_class,ANYOF_NSPACE);
1482 }
1483 else {
1484 data->start_class->flags |= ANYOF_LOCALE;
1485 ANYOF_CLASS_SET(data->start_class,ANYOF_SPACE);
1486 }
1487 break;
1488 case NSPACE:
1489 if (flags & SCF_DO_STCLASS_AND) {
1490 if (!(data->start_class->flags & ANYOF_LOCALE)) {
1491 ANYOF_CLASS_CLEAR(data->start_class,ANYOF_SPACE);
1492 for (value = 0; value < 256; value++)
1493 if (isSPACE(value))
1494 ANYOF_BITMAP_CLEAR(data->start_class, value);
1495 }
1496 }
1497 else {
1498 if (data->start_class->flags & ANYOF_LOCALE)
1499 ANYOF_CLASS_SET(data->start_class,ANYOF_NSPACE);
1500 else {
1501 for (value = 0; value < 256; value++)
1502 if (!isSPACE(value))
b81d288d 1503 ANYOF_BITMAP_SET(data->start_class, value);
653099ff
GS
1504 }
1505 }
1506 break;
1507 case NSPACEL:
1508 if (flags & SCF_DO_STCLASS_AND) {
1509 if (data->start_class->flags & ANYOF_LOCALE) {
1510 ANYOF_CLASS_CLEAR(data->start_class,ANYOF_SPACE);
1511 for (value = 0; value < 256; value++)
1512 if (!isSPACE(value))
1513 ANYOF_BITMAP_CLEAR(data->start_class, value);
1514 }
1515 }
1516 else {
1517 data->start_class->flags |= ANYOF_LOCALE;
1518 ANYOF_CLASS_SET(data->start_class,ANYOF_NSPACE);
1519 }
1520 break;
1521 case DIGIT:
1522 if (flags & SCF_DO_STCLASS_AND) {
1523 ANYOF_CLASS_CLEAR(data->start_class,ANYOF_NDIGIT);
1524 for (value = 0; value < 256; value++)
1525 if (!isDIGIT(value))
1526 ANYOF_BITMAP_CLEAR(data->start_class, value);
1527 }
1528 else {
1529 if (data->start_class->flags & ANYOF_LOCALE)
1530 ANYOF_CLASS_SET(data->start_class,ANYOF_DIGIT);
1531 else {
1532 for (value = 0; value < 256; value++)
1533 if (isDIGIT(value))
b81d288d 1534 ANYOF_BITMAP_SET(data->start_class, value);
653099ff
GS
1535 }
1536 }
1537 break;
1538 case NDIGIT:
1539 if (flags & SCF_DO_STCLASS_AND) {
1540 ANYOF_CLASS_CLEAR(data->start_class,ANYOF_DIGIT);
1541 for (value = 0; value < 256; value++)
1542 if (isDIGIT(value))
1543 ANYOF_BITMAP_CLEAR(data->start_class, value);
1544 }
1545 else {
1546 if (data->start_class->flags & ANYOF_LOCALE)
1547 ANYOF_CLASS_SET(data->start_class,ANYOF_NDIGIT);
1548 else {
1549 for (value = 0; value < 256; value++)
1550 if (!isDIGIT(value))
b81d288d 1551 ANYOF_BITMAP_SET(data->start_class, value);
653099ff
GS
1552 }
1553 }
1554 break;
1555 }
1556 if (flags & SCF_DO_STCLASS_OR)
1557 cl_and(data->start_class, &and_with);
1558 flags &= ~SCF_DO_STCLASS;
1559 }
a0ed51b3 1560 }
22c35a8c 1561 else if (PL_regkind[(U8)OP(scan)] == EOL && flags & SCF_DO_SUBSTR) {
c277df42
IZ
1562 data->flags |= (OP(scan) == MEOL
1563 ? SF_BEFORE_MEOL
1564 : SF_BEFORE_SEOL);
a0ed51b3 1565 }
653099ff
GS
1566 else if ( PL_regkind[(U8)OP(scan)] == BRANCHJ
1567 /* Lookbehind, or need to calculate parens/evals/stclass: */
1568 && (scan->flags || data || (flags & SCF_DO_STCLASS))
c277df42 1569 && (OP(scan) == IFMATCH || OP(scan) == UNLESSM)) {
653099ff 1570 /* Lookahead/lookbehind */
cb434fcc 1571 I32 deltanext, minnext, fake = 0;
c277df42 1572 regnode *nscan;
653099ff
GS
1573 struct regnode_charclass_class intrnl;
1574 int f = 0;
c277df42
IZ
1575
1576 data_fake.flags = 0;
b81d288d 1577 if (data) {
2c2d71f5 1578 data_fake.whilem_c = data->whilem_c;
cb434fcc
IZ
1579 data_fake.last_closep = data->last_closep;
1580 }
1581 else
1582 data_fake.last_closep = &fake;
653099ff
GS
1583 if ( flags & SCF_DO_STCLASS && !scan->flags
1584 && OP(scan) == IFMATCH ) { /* Lookahead */
830247a4 1585 cl_init(pRExC_state, &intrnl);
653099ff 1586 data_fake.start_class = &intrnl;
e1901655 1587 f |= SCF_DO_STCLASS_AND;
653099ff 1588 }
e1901655
IZ
1589 if (flags & SCF_WHILEM_VISITED_POS)
1590 f |= SCF_WHILEM_VISITED_POS;
c277df42
IZ
1591 next = regnext(scan);
1592 nscan = NEXTOPER(NEXTOPER(scan));
830247a4 1593 minnext = study_chunk(pRExC_state, &nscan, &deltanext, last, &data_fake, f);
c277df42
IZ
1594 if (scan->flags) {
1595 if (deltanext) {
9baa0206 1596 vFAIL("Variable length lookbehind not implemented");
a0ed51b3
LW
1597 }
1598 else if (minnext > U8_MAX) {
9baa0206 1599 vFAIL2("Lookbehind longer than %"UVuf" not implemented", (UV)U8_MAX);
c277df42 1600 }
eb160463 1601 scan->flags = (U8)minnext;
c277df42
IZ
1602 }
1603 if (data && data_fake.flags & (SF_HAS_PAR|SF_IN_PAR))
1604 pars++;
405ff068 1605 if (data && (data_fake.flags & SF_HAS_EVAL))
c277df42 1606 data->flags |= SF_HAS_EVAL;
2c2d71f5
JH
1607 if (data)
1608 data->whilem_c = data_fake.whilem_c;
e1901655 1609 if (f & SCF_DO_STCLASS_AND) {
653099ff
GS
1610 int was = (data->start_class->flags & ANYOF_EOS);
1611
1612 cl_and(data->start_class, &intrnl);
1613 if (was)
1614 data->start_class->flags |= ANYOF_EOS;
1615 }
a0ed51b3
LW
1616 }
1617 else if (OP(scan) == OPEN) {
c277df42 1618 pars++;
a0ed51b3 1619 }
cb434fcc 1620 else if (OP(scan) == CLOSE) {
eb160463 1621 if ((I32)ARG(scan) == is_par) {
cb434fcc 1622 next = regnext(scan);
c277df42 1623
cb434fcc
IZ
1624 if ( next && (OP(next) != WHILEM) && next < last)
1625 is_par = 0; /* Disable optimization */
1626 }
1627 if (data)
1628 *(data->last_closep) = ARG(scan);
a0ed51b3
LW
1629 }
1630 else if (OP(scan) == EVAL) {
c277df42
IZ
1631 if (data)
1632 data->flags |= SF_HAS_EVAL;
1633 }
96776eda 1634 else if (OP(scan) == LOGICAL && scan->flags == 2) { /* Embedded follows */
0f5d15d6 1635 if (flags & SCF_DO_SUBSTR) {
830247a4 1636 scan_commit(pRExC_state,data);
0f5d15d6
IZ
1637 data->longest = &(data->longest_float);
1638 }
1639 is_inf = is_inf_internal = 1;
653099ff 1640 if (flags & SCF_DO_STCLASS_OR) /* Allow everything */
830247a4 1641 cl_anything(pRExC_state, data->start_class);
96776eda 1642 flags &= ~SCF_DO_STCLASS;
0f5d15d6 1643 }
c277df42
IZ
1644 /* Else: zero-length, ignore. */
1645 scan = regnext(scan);
1646 }
1647
1648 finish:
1649 *scanp = scan;
aca2d497 1650 *deltap = is_inf_internal ? I32_MAX : delta;
b81d288d 1651 if (flags & SCF_DO_SUBSTR && is_inf)
c277df42
IZ
1652 data->pos_delta = I32_MAX - data->pos_min;
1653 if (is_par > U8_MAX)
1654 is_par = 0;
1655 if (is_par && pars==1 && data) {
1656 data->flags |= SF_IN_PAR;
1657 data->flags &= ~SF_HAS_PAR;
a0ed51b3
LW
1658 }
1659 else if (pars && data) {
c277df42
IZ
1660 data->flags |= SF_HAS_PAR;
1661 data->flags &= ~SF_IN_PAR;
1662 }
653099ff
GS
1663 if (flags & SCF_DO_STCLASS_OR)
1664 cl_and(data->start_class, &and_with);
c277df42
IZ
1665 return min;
1666}
1667
76e3520e 1668STATIC I32
830247a4 1669S_add_data(pTHX_ RExC_state_t *pRExC_state, I32 n, char *s)
c277df42 1670{
830247a4 1671 if (RExC_rx->data) {
b81d288d
AB
1672 Renewc(RExC_rx->data,
1673 sizeof(*RExC_rx->data) + sizeof(void*) * (RExC_rx->data->count + n - 1),
c277df42 1674 char, struct reg_data);
830247a4
IZ
1675 Renew(RExC_rx->data->what, RExC_rx->data->count + n, U8);
1676 RExC_rx->data->count += n;
a0ed51b3
LW
1677 }
1678 else {
830247a4 1679 Newc(1207, RExC_rx->data, sizeof(*RExC_rx->data) + sizeof(void*) * (n - 1),
c277df42 1680 char, struct reg_data);
830247a4
IZ
1681 New(1208, RExC_rx->data->what, n, U8);
1682 RExC_rx->data->count = n;
c277df42 1683 }
830247a4
IZ
1684 Copy(s, RExC_rx->data->what + RExC_rx->data->count - n, n, U8);
1685 return RExC_rx->data->count - n;
c277df42
IZ
1686}
1687
d88dccdf 1688void
864dbfa3 1689Perl_reginitcolors(pTHX)
d88dccdf 1690{
d88dccdf
IZ
1691 int i = 0;
1692 char *s = PerlEnv_getenv("PERL_RE_COLORS");
b81d288d 1693
d88dccdf
IZ
1694 if (s) {
1695 PL_colors[0] = s = savepv(s);
1696 while (++i < 6) {
1697 s = strchr(s, '\t');
1698 if (s) {
1699 *s = '\0';
1700 PL_colors[i] = ++s;
1701 }
1702 else
c712d376 1703 PL_colors[i] = s = "";
d88dccdf
IZ
1704 }
1705 } else {
b81d288d 1706 while (i < 6)
d88dccdf
IZ
1707 PL_colors[i++] = "";
1708 }
1709 PL_colorset = 1;
1710}
1711
8615cb43 1712
a687059c 1713/*
e50aee73 1714 - pregcomp - compile a regular expression into internal code
a687059c
LW
1715 *
1716 * We can't allocate space until we know how big the compiled form will be,
1717 * but we can't compile it (and thus know how big it is) until we've got a
1718 * place to put the code. So we cheat: we compile it twice, once with code
1719 * generation turned off and size counting turned on, and once "for real".
1720 * This also means that we don't allocate space until we are sure that the
1721 * thing really will compile successfully, and we never have to move the
1722 * code and thus invalidate pointers into it. (Note that it has to be in
1723 * one piece because free() must be able to free it all.) [NB: not true in perl]
1724 *
1725 * Beware that the optimization-preparation code in here knows about some
1726 * of the structure of the compiled regexp. [I'll say.]
1727 */
1728regexp *
864dbfa3 1729Perl_pregcomp(pTHX_ char *exp, char *xend, PMOP *pm)
a687059c 1730{
a0d0e21e 1731 register regexp *r;
c277df42 1732 regnode *scan;
c277df42 1733 regnode *first;
a0d0e21e 1734 I32 flags;
a0d0e21e
LW
1735 I32 minlen = 0;
1736 I32 sawplus = 0;
1737 I32 sawopen = 0;
2c2d71f5 1738 scan_data_t data;
830247a4
IZ
1739 RExC_state_t RExC_state;
1740 RExC_state_t *pRExC_state = &RExC_state;
a0d0e21e
LW
1741
1742 if (exp == NULL)
c277df42 1743 FAIL("NULL regexp argument");
a0d0e21e 1744
a5961de5 1745 RExC_utf8 = pm->op_pmdynflags & PMdf_CMP_UTF8;
a0ed51b3 1746
5cfc7842 1747 RExC_precomp = exp;
a5961de5
JH
1748 DEBUG_r({
1749 if (!PL_colorset) reginitcolors();
1750 PerlIO_printf(Perl_debug_log, "%sCompiling REx%s `%s%*s%s'\n",
1751 PL_colors[4],PL_colors[5],PL_colors[0],
1752 (int)(xend - exp), RExC_precomp, PL_colors[1]);
1753 });
e2509266 1754 RExC_flags = pm->op_pmflags;
830247a4 1755 RExC_sawback = 0;
bbce6d69 1756
830247a4
IZ
1757 RExC_seen = 0;
1758 RExC_seen_zerolen = *exp == '^' ? -1 : 0;
1759 RExC_seen_evals = 0;
1760 RExC_extralen = 0;
c277df42 1761
bbce6d69 1762 /* First pass: determine size, legality. */
830247a4 1763 RExC_parse = exp;
fac92740 1764 RExC_start = exp;
830247a4
IZ
1765 RExC_end = xend;
1766 RExC_naughty = 0;
1767 RExC_npar = 1;
1768 RExC_size = 0L;
1769 RExC_emit = &PL_regdummy;
1770 RExC_whilem_seen = 0;
85ddcde9
JH
1771#if 0 /* REGC() is (currently) a NOP at the first pass.
1772 * Clever compilers notice this and complain. --jhi */
830247a4 1773 REGC((U8)REG_MAGIC, (char*)RExC_emit);
85ddcde9 1774#endif
830247a4 1775 if (reg(pRExC_state, 0, &flags) == NULL) {
830247a4 1776 RExC_precomp = Nullch;
a0d0e21e
LW
1777 return(NULL);
1778 }
830247a4 1779 DEBUG_r(PerlIO_printf(Perl_debug_log, "size %"IVdf" ", (IV)RExC_size));
c277df42 1780
c277df42
IZ
1781 /* Small enough for pointer-storage convention?
1782 If extralen==0, this means that we will not need long jumps. */
830247a4
IZ
1783 if (RExC_size >= 0x10000L && RExC_extralen)
1784 RExC_size += RExC_extralen;
c277df42 1785 else
830247a4
IZ
1786 RExC_extralen = 0;
1787 if (RExC_whilem_seen > 15)
1788 RExC_whilem_seen = 15;
a0d0e21e 1789
bbce6d69 1790 /* Allocate space and initialize. */
830247a4 1791 Newc(1001, r, sizeof(regexp) + (unsigned)RExC_size * sizeof(regnode),
c277df42 1792 char, regexp);
a0d0e21e 1793 if (r == NULL)
b45f050a
JF
1794 FAIL("Regexp out of space");
1795
0f79a09d
GS
1796#ifdef DEBUGGING
1797 /* avoid reading uninitialized memory in DEBUGGING code in study_chunk() */
830247a4 1798 Zero(r, sizeof(regexp) + (unsigned)RExC_size * sizeof(regnode), char);
0f79a09d 1799#endif
c277df42 1800 r->refcnt = 1;
bbce6d69 1801 r->prelen = xend - exp;
5cfc7842 1802 r->precomp = savepvn(RExC_precomp, r->prelen);
cf93c79d 1803 r->subbeg = NULL;
ed252734
NC
1804#ifdef PERL_COPY_ON_WRITE
1805 r->saved_copy = Nullsv;
1806#endif
cf93c79d 1807 r->reganch = pm->op_pmflags & PMf_COMPILETIME;
830247a4 1808 r->nparens = RExC_npar - 1; /* set early to validate backrefs */
4327152a
IZ
1809
1810 r->substrs = 0; /* Useful during FAIL. */
1811 r->startp = 0; /* Useful during FAIL. */
1812 r->endp = 0; /* Useful during FAIL. */
1813
fac92740
MJD
1814 Newz(1304, r->offsets, 2*RExC_size+1, U32); /* MJD 20001228 */
1815 if (r->offsets) {
1816 r->offsets[0] = RExC_size;
1817 }
1818 DEBUG_r(PerlIO_printf(Perl_debug_log,
392fbf5d 1819 "%s %"UVuf" bytes for offset annotations.\n",
fac92740 1820 r->offsets ? "Got" : "Couldn't get",
392fbf5d 1821 (UV)((2*RExC_size+1) * sizeof(U32))));
fac92740 1822
830247a4 1823 RExC_rx = r;
bbce6d69 1824
1825 /* Second pass: emit code. */
e2509266 1826 RExC_flags = pm->op_pmflags; /* don't let top level (?i) bleed */
830247a4
IZ
1827 RExC_parse = exp;
1828 RExC_end = xend;
1829 RExC_naughty = 0;
1830 RExC_npar = 1;
fac92740 1831 RExC_emit_start = r->program;
830247a4 1832 RExC_emit = r->program;
2cd61cdb 1833 /* Store the count of eval-groups for security checks: */
eb160463 1834 RExC_emit->next_off = (U16)((RExC_seen_evals > U16_MAX) ? U16_MAX : RExC_seen_evals);
830247a4 1835 REGC((U8)REG_MAGIC, (char*) RExC_emit++);
c277df42 1836 r->data = 0;
830247a4 1837 if (reg(pRExC_state, 0, &flags) == NULL)
a0d0e21e
LW
1838 return(NULL);
1839
1840 /* Dig out information for optimizations. */
cf93c79d 1841 r->reganch = pm->op_pmflags & PMf_COMPILETIME; /* Again? */
e2509266 1842 pm->op_pmflags = RExC_flags;
a0ed51b3 1843 if (UTF)
5ff6fc6d 1844 r->reganch |= ROPT_UTF8; /* Unicode in it? */
c277df42 1845 r->regstclass = NULL;
830247a4 1846 if (RExC_naughty >= 10) /* Probably an expensive pattern. */
a0ed51b3 1847 r->reganch |= ROPT_NAUGHTY;
c277df42 1848 scan = r->program + 1; /* First BRANCH. */
2779dcf1
IZ
1849
1850 /* XXXX To minimize changes to RE engine we always allocate
1851 3-units-long substrs field. */
1852 Newz(1004, r->substrs, 1, struct reg_substr_data);
1853
2c2d71f5 1854 StructCopy(&zero_scan_data, &data, scan_data_t);
653099ff 1855 /* XXXX Should not we check for something else? Usually it is OPEN1... */
c277df42 1856 if (OP(scan) != BRANCH) { /* Only one top-level choice. */
c277df42 1857 I32 fake;
c5254dd6 1858 STRLEN longest_float_length, longest_fixed_length;
653099ff
GS
1859 struct regnode_charclass_class ch_class;
1860 int stclass_flag;
cb434fcc 1861 I32 last_close = 0;
a0d0e21e
LW
1862
1863 first = scan;
c277df42 1864 /* Skip introductions and multiplicators >= 1. */
a0d0e21e 1865 while ((OP(first) == OPEN && (sawopen = 1)) ||
653099ff 1866 /* An OR of *one* alternative - should not happen now. */
a0d0e21e
LW
1867 (OP(first) == BRANCH && OP(regnext(first)) != BRANCH) ||
1868 (OP(first) == PLUS) ||
1869 (OP(first) == MINMOD) ||
653099ff 1870 /* An {n,m} with n>0 */
22c35a8c 1871 (PL_regkind[(U8)OP(first)] == CURLY && ARG1(first) > 0) ) {
a0d0e21e
LW
1872 if (OP(first) == PLUS)
1873 sawplus = 1;
1874 else
1875 first += regarglen[(U8)OP(first)];
1876 first = NEXTOPER(first);
a687059c
LW
1877 }
1878
a0d0e21e
LW
1879 /* Starting-point info. */
1880 again:
653099ff 1881 if (PL_regkind[(U8)OP(first)] == EXACT) {
1aa99e6b
IH
1882 if (OP(first) == EXACT)
1883 ; /* Empty, get anchored substr later. */
1884 else if ((OP(first) == EXACTF || OP(first) == EXACTFL))
b3c9acc1
IZ
1885 r->regstclass = first;
1886 }
653099ff 1887 else if (strchr((char*)PL_simple,OP(first)))
a0d0e21e 1888 r->regstclass = first;
22c35a8c
GS
1889 else if (PL_regkind[(U8)OP(first)] == BOUND ||
1890 PL_regkind[(U8)OP(first)] == NBOUND)
a0d0e21e 1891 r->regstclass = first;
22c35a8c 1892 else if (PL_regkind[(U8)OP(first)] == BOL) {
cad2e5aa
JH
1893 r->reganch |= (OP(first) == MBOL
1894 ? ROPT_ANCH_MBOL
1895 : (OP(first) == SBOL
1896 ? ROPT_ANCH_SBOL
1897 : ROPT_ANCH_BOL));
a0d0e21e 1898 first = NEXTOPER(first);
774d564b 1899 goto again;
1900 }
1901 else if (OP(first) == GPOS) {
1902 r->reganch |= ROPT_ANCH_GPOS;
1903 first = NEXTOPER(first);
1904 goto again;
a0d0e21e 1905 }
e09294f4 1906 else if (!sawopen && (OP(first) == STAR &&
22c35a8c 1907 PL_regkind[(U8)OP(NEXTOPER(first))] == REG_ANY) &&
a0d0e21e
LW
1908 !(r->reganch & ROPT_ANCH) )
1909 {
1910 /* turn .* into ^.* with an implied $*=1 */
cad2e5aa
JH
1911 int type = OP(NEXTOPER(first));
1912
ffc61ed2 1913 if (type == REG_ANY)
cad2e5aa
JH
1914 type = ROPT_ANCH_MBOL;
1915 else
1916 type = ROPT_ANCH_SBOL;
1917
1918 r->reganch |= type | ROPT_IMPLICIT;
a0d0e21e 1919 first = NEXTOPER(first);
774d564b 1920 goto again;
a0d0e21e 1921 }
b81d288d 1922 if (sawplus && (!sawopen || !RExC_sawback)
830247a4 1923 && !(RExC_seen & REG_SEEN_EVAL)) /* May examine pos and $& */
cad2e5aa
JH
1924 /* x+ must match at the 1st pos of run of x's */
1925 r->reganch |= ROPT_SKIP;
a0d0e21e 1926
c277df42 1927 /* Scan is after the zeroth branch, first is atomic matcher. */
b81d288d 1928 DEBUG_r(PerlIO_printf(Perl_debug_log, "first at %"IVdf"\n",
b900a521 1929 (IV)(first - scan + 1)));
a0d0e21e
LW
1930 /*
1931 * If there's something expensive in the r.e., find the
1932 * longest literal string that must appear and make it the
1933 * regmust. Resolve ties in favor of later strings, since
1934 * the regstart check works with the beginning of the r.e.
1935 * and avoiding duplication strengthens checking. Not a
1936 * strong reason, but sufficient in the absence of others.
1937 * [Now we resolve ties in favor of the earlier string if
c277df42 1938 * it happens that c_offset_min has been invalidated, since the
a0d0e21e
LW
1939 * earlier string may buy us something the later one won't.]
1940 */
a0d0e21e 1941 minlen = 0;
a687059c 1942
79cb57f6
GS
1943 data.longest_fixed = newSVpvn("",0);
1944 data.longest_float = newSVpvn("",0);
1945 data.last_found = newSVpvn("",0);
c277df42
IZ
1946 data.longest = &(data.longest_fixed);
1947 first = scan;
653099ff 1948 if (!r->regstclass) {
830247a4 1949 cl_init(pRExC_state, &ch_class);
653099ff
GS
1950 data.start_class = &ch_class;
1951 stclass_flag = SCF_DO_STCLASS_AND;
1952 } else /* XXXX Check for BOUND? */
1953 stclass_flag = 0;
cb434fcc 1954 data.last_closep = &last_close;
653099ff 1955
830247a4 1956 minlen = study_chunk(pRExC_state, &first, &fake, scan + RExC_size, /* Up to end */
e1901655 1957 &data, SCF_DO_SUBSTR | SCF_WHILEM_VISITED_POS | stclass_flag);
830247a4 1958 if ( RExC_npar == 1 && data.longest == &(data.longest_fixed)
b81d288d 1959 && data.last_start_min == 0 && data.last_end > 0
830247a4
IZ
1960 && !RExC_seen_zerolen
1961 && (!(RExC_seen & REG_SEEN_GPOS) || (r->reganch & ROPT_ANCH_GPOS)))
c277df42 1962 r->reganch |= ROPT_CHECK_ALL;
830247a4 1963 scan_commit(pRExC_state, &data);
c277df42
IZ
1964 SvREFCNT_dec(data.last_found);
1965
a0ed51b3 1966 longest_float_length = CHR_SVLEN(data.longest_float);
c5254dd6 1967 if (longest_float_length
c277df42
IZ
1968 || (data.flags & SF_FL_BEFORE_EOL
1969 && (!(data.flags & SF_FL_BEFORE_MEOL)
e2509266 1970 || (RExC_flags & PMf_MULTILINE)))) {
cf93c79d
IZ
1971 int t;
1972
a0ed51b3 1973 if (SvCUR(data.longest_fixed) /* ok to leave SvCUR */
aca2d497
IZ
1974 && data.offset_fixed == data.offset_float_min
1975 && SvCUR(data.longest_fixed) == SvCUR(data.longest_float))
1976 goto remove_float; /* As in (a)+. */
1977
33b8afdf
JH
1978 if (SvUTF8(data.longest_float)) {
1979 r->float_utf8 = data.longest_float;
1980 r->float_substr = Nullsv;
1981 } else {
1982 r->float_substr = data.longest_float;
1983 r->float_utf8 = Nullsv;
1984 }
c277df42
IZ
1985 r->float_min_offset = data.offset_float_min;
1986 r->float_max_offset = data.offset_float_max;
cf93c79d
IZ
1987 t = (data.flags & SF_FL_BEFORE_EOL /* Can't have SEOL and MULTI */
1988 && (!(data.flags & SF_FL_BEFORE_MEOL)
e2509266 1989 || (RExC_flags & PMf_MULTILINE)));
33b8afdf 1990 fbm_compile(data.longest_float, t ? FBMcf_TAIL : 0);
a0ed51b3
LW
1991 }
1992 else {
aca2d497 1993 remove_float:
33b8afdf 1994 r->float_substr = r->float_utf8 = Nullsv;
c277df42 1995 SvREFCNT_dec(data.longest_float);
c5254dd6 1996 longest_float_length = 0;
a0d0e21e 1997 }
c277df42 1998
a0ed51b3 1999 longest_fixed_length = CHR_SVLEN(data.longest_fixed);
c5254dd6 2000 if (longest_fixed_length
c277df42
IZ
2001 || (data.flags & SF_FIX_BEFORE_EOL /* Cannot have SEOL and MULTI */
2002 && (!(data.flags & SF_FIX_BEFORE_MEOL)
e2509266 2003 || (RExC_flags & PMf_MULTILINE)))) {
cf93c79d
IZ
2004 int t;
2005
33b8afdf
JH
2006 if (SvUTF8(data.longest_fixed)) {
2007 r->anchored_utf8 = data.longest_fixed;
2008 r->anchored_substr = Nullsv;
2009 } else {
2010 r->anchored_substr = data.longest_fixed;
2011 r->anchored_utf8 = Nullsv;
2012 }
c277df42 2013 r->anchored_offset = data.offset_fixed;
cf93c79d
IZ
2014 t = (data.flags & SF_FIX_BEFORE_EOL /* Can't have SEOL and MULTI */
2015 && (!(data.flags & SF_FIX_BEFORE_MEOL)
e2509266 2016 || (RExC_flags & PMf_MULTILINE)));
33b8afdf 2017 fbm_compile(data.longest_fixed, t ? FBMcf_TAIL : 0);
a0ed51b3
LW
2018 }
2019 else {
33b8afdf 2020 r->anchored_substr = r->anchored_utf8 = Nullsv;
c277df42 2021 SvREFCNT_dec(data.longest_fixed);
c5254dd6 2022 longest_fixed_length = 0;
a0d0e21e 2023 }
b81d288d 2024 if (r->regstclass
ffc61ed2 2025 && (OP(r->regstclass) == REG_ANY || OP(r->regstclass) == SANY))
653099ff 2026 r->regstclass = NULL;
33b8afdf
JH
2027 if ((!(r->anchored_substr || r->anchored_utf8) || r->anchored_offset)
2028 && stclass_flag
653099ff 2029 && !(data.start_class->flags & ANYOF_EOS)
eb160463
GS
2030 && !cl_is_anything(data.start_class))
2031 {
830247a4 2032 I32 n = add_data(pRExC_state, 1, "f");
653099ff 2033
b81d288d 2034 New(1006, RExC_rx->data->data[n], 1,
653099ff
GS
2035 struct regnode_charclass_class);
2036 StructCopy(data.start_class,
830247a4 2037 (struct regnode_charclass_class*)RExC_rx->data->data[n],
653099ff 2038 struct regnode_charclass_class);
830247a4 2039 r->regstclass = (regnode*)RExC_rx->data->data[n];
653099ff 2040 r->reganch &= ~ROPT_SKIP; /* Used in find_byclass(). */
ffc61ed2 2041 PL_regdata = r->data; /* for regprop() */
9c5ffd7c
JH
2042 DEBUG_r({ SV *sv = sv_newmortal();
2043 regprop(sv, (regnode*)data.start_class);
2044 PerlIO_printf(Perl_debug_log,
2045 "synthetic stclass `%s'.\n",
2046 SvPVX(sv));});
653099ff 2047 }
c277df42
IZ
2048
2049 /* A temporary algorithm prefers floated substr to fixed one to dig more info. */
c5254dd6 2050 if (longest_fixed_length > longest_float_length) {
c277df42 2051 r->check_substr = r->anchored_substr;
33b8afdf 2052 r->check_utf8 = r->anchored_utf8;
c277df42
IZ
2053 r->check_offset_min = r->check_offset_max = r->anchored_offset;
2054 if (r->reganch & ROPT_ANCH_SINGLE)
2055 r->reganch |= ROPT_NOSCAN;
a0ed51b3
LW
2056 }
2057 else {
c277df42 2058 r->check_substr = r->float_substr;
33b8afdf 2059 r->check_utf8 = r->float_utf8;
c277df42
IZ
2060 r->check_offset_min = data.offset_float_min;
2061 r->check_offset_max = data.offset_float_max;
a0d0e21e 2062 }
30382c73
IZ
2063 /* XXXX Currently intuiting is not compatible with ANCH_GPOS.
2064 This should be changed ASAP! */
33b8afdf 2065 if ((r->check_substr || r->check_utf8) && !(r->reganch & ROPT_ANCH_GPOS)) {
cad2e5aa 2066 r->reganch |= RE_USE_INTUIT;
33b8afdf 2067 if (SvTAIL(r->check_substr ? r->check_substr : r->check_utf8))
cad2e5aa
JH
2068 r->reganch |= RE_INTUIT_TAIL;
2069 }
a0ed51b3
LW
2070 }
2071 else {
c277df42
IZ
2072 /* Several toplevels. Best we can is to set minlen. */
2073 I32 fake;
653099ff 2074 struct regnode_charclass_class ch_class;
cb434fcc 2075 I32 last_close = 0;
c277df42
IZ
2076
2077 DEBUG_r(PerlIO_printf(Perl_debug_log, "\n"));
2078 scan = r->program + 1;
830247a4 2079 cl_init(pRExC_state, &ch_class);
653099ff 2080 data.start_class = &ch_class;
cb434fcc 2081 data.last_closep = &last_close;
e1901655 2082 minlen = study_chunk(pRExC_state, &scan, &fake, scan + RExC_size, &data, SCF_DO_STCLASS_AND|SCF_WHILEM_VISITED_POS);
33b8afdf
JH
2083 r->check_substr = r->check_utf8 = r->anchored_substr = r->anchored_utf8
2084 = r->float_substr = r->float_utf8 = Nullsv;
653099ff 2085 if (!(data.start_class->flags & ANYOF_EOS)
eb160463
GS
2086 && !cl_is_anything(data.start_class))
2087 {
830247a4 2088 I32 n = add_data(pRExC_state, 1, "f");
653099ff 2089
b81d288d 2090 New(1006, RExC_rx->data->data[n], 1,
653099ff
GS
2091 struct regnode_charclass_class);
2092 StructCopy(data.start_class,
830247a4 2093 (struct regnode_charclass_class*)RExC_rx->data->data[n],
653099ff 2094 struct regnode_charclass_class);
830247a4 2095 r->regstclass = (regnode*)RExC_rx->data->data[n];
653099ff 2096 r->reganch &= ~ROPT_SKIP; /* Used in find_byclass(). */
9c5ffd7c
JH
2097 DEBUG_r({ SV* sv = sv_newmortal();
2098 regprop(sv, (regnode*)data.start_class);
2099 PerlIO_printf(Perl_debug_log,
2100 "synthetic stclass `%s'.\n",
2101 SvPVX(sv));});
653099ff 2102 }
a0d0e21e
LW
2103 }
2104
a0d0e21e 2105 r->minlen = minlen;
b81d288d 2106 if (RExC_seen & REG_SEEN_GPOS)
c277df42 2107 r->reganch |= ROPT_GPOS_SEEN;
830247a4 2108 if (RExC_seen & REG_SEEN_LOOKBEHIND)
c277df42 2109 r->reganch |= ROPT_LOOKBEHIND_SEEN;
830247a4 2110 if (RExC_seen & REG_SEEN_EVAL)
ce862d02 2111 r->reganch |= ROPT_EVAL_SEEN;
f33976b4
DB
2112 if (RExC_seen & REG_SEEN_CANY)
2113 r->reganch |= ROPT_CANY_SEEN;
830247a4
IZ
2114 Newz(1002, r->startp, RExC_npar, I32);
2115 Newz(1002, r->endp, RExC_npar, I32);
ffc61ed2 2116 PL_regdata = r->data; /* for regprop() */
a0d0e21e
LW
2117 DEBUG_r(regdump(r));
2118 return(r);
a687059c
LW
2119}
2120
2121/*
2122 - reg - regular expression, i.e. main body or parenthesized thing
2123 *
2124 * Caller must absorb opening parenthesis.
2125 *
2126 * Combining parenthesis handling with the base level of regular expression
2127 * is a trifle forced, but the need to tie the tails of the branches to what
2128 * follows makes it hard to avoid.
2129 */
76e3520e 2130STATIC regnode *
830247a4 2131S_reg(pTHX_ RExC_state_t *pRExC_state, I32 paren, I32 *flagp)
c277df42 2132 /* paren: Parenthesized? 0=top, 1=(, inside: changed to letter. */
a687059c 2133{
c277df42
IZ
2134 register regnode *ret; /* Will be the head of the group. */
2135 register regnode *br;
2136 register regnode *lastbr;
2137 register regnode *ender = 0;
a0d0e21e 2138 register I32 parno = 0;
e2509266 2139 I32 flags, oregflags = RExC_flags, have_branch = 0, open = 0;
9d1d55b5
JP
2140
2141 /* for (?g), (?gc), and (?o) warnings; warning
2142 about (?c) will warn about (?g) -- japhy */
2143
2144 I32 wastedflags = 0x00,
2145 wasted_o = 0x01,
2146 wasted_g = 0x02,
2147 wasted_gc = 0x02 | 0x04,
2148 wasted_c = 0x04;
2149
fac92740 2150 char * parse_start = RExC_parse; /* MJD */
830247a4 2151 char *oregcomp_parse = RExC_parse;
c277df42 2152 char c;
a0d0e21e 2153
821b33a5 2154 *flagp = 0; /* Tentatively. */
a0d0e21e 2155
9d1d55b5 2156
a0d0e21e
LW
2157 /* Make an OPEN node, if parenthesized. */
2158 if (paren) {
fac92740 2159 if (*RExC_parse == '?') { /* (?...) */
2b36a5a0
JH
2160 U32 posflags = 0, negflags = 0;
2161 U32 *flagsp = &posflags;
0f5d15d6 2162 int logical = 0;
830247a4 2163 char *seqstart = RExC_parse;
ca9dfc88 2164
830247a4
IZ
2165 RExC_parse++;
2166 paren = *RExC_parse++;
c277df42 2167 ret = NULL; /* For look-ahead/behind. */
a0d0e21e 2168 switch (paren) {
fac92740 2169 case '<': /* (?<...) */
830247a4 2170 RExC_seen |= REG_SEEN_LOOKBEHIND;
b81d288d 2171 if (*RExC_parse == '!')
c277df42 2172 paren = ',';
b81d288d 2173 if (*RExC_parse != '=' && *RExC_parse != '!')
c277df42 2174 goto unknown;
830247a4 2175 RExC_parse++;
fac92740
MJD
2176 case '=': /* (?=...) */
2177 case '!': /* (?!...) */
830247a4 2178 RExC_seen_zerolen++;
fac92740
MJD
2179 case ':': /* (?:...) */
2180 case '>': /* (?>...) */
a0d0e21e 2181 break;
fac92740
MJD
2182 case '$': /* (?$...) */
2183 case '@': /* (?@...) */
8615cb43 2184 vFAIL2("Sequence (?%c...) not implemented", (int)paren);
a0d0e21e 2185 break;
fac92740 2186 case '#': /* (?#...) */
830247a4
IZ
2187 while (*RExC_parse && *RExC_parse != ')')
2188 RExC_parse++;
2189 if (*RExC_parse != ')')
c277df42 2190 FAIL("Sequence (?#... not terminated");
830247a4 2191 nextchar(pRExC_state);
a0d0e21e
LW
2192 *flagp = TRYAGAIN;
2193 return NULL;
fac92740 2194 case 'p': /* (?p...) */
9014280d 2195 if (SIZE_ONLY && ckWARN2(WARN_DEPRECATED, WARN_REGEXP))
f9373011 2196 vWARNdep(RExC_parse, "(?p{}) is deprecated - use (??{})");
8c8ad484 2197 /* FALL THROUGH*/
fac92740 2198 case '?': /* (??...) */
0f5d15d6 2199 logical = 1;
438a3801
YST
2200 if (*RExC_parse != '{')
2201 goto unknown;
830247a4 2202 paren = *RExC_parse++;
0f5d15d6 2203 /* FALL THROUGH */
fac92740 2204 case '{': /* (?{...}) */
c277df42 2205 {
c277df42
IZ
2206 I32 count = 1, n = 0;
2207 char c;
830247a4 2208 char *s = RExC_parse;
c277df42
IZ
2209 SV *sv;
2210 OP_4tree *sop, *rop;
2211
830247a4
IZ
2212 RExC_seen_zerolen++;
2213 RExC_seen |= REG_SEEN_EVAL;
2214 while (count && (c = *RExC_parse)) {
2215 if (c == '\\' && RExC_parse[1])
2216 RExC_parse++;
b81d288d 2217 else if (c == '{')
c277df42 2218 count++;
b81d288d 2219 else if (c == '}')
c277df42 2220 count--;
830247a4 2221 RExC_parse++;
c277df42 2222 }
830247a4 2223 if (*RExC_parse != ')')
b45f050a 2224 {
b81d288d 2225 RExC_parse = s;
b45f050a
JF
2226 vFAIL("Sequence (?{...}) not terminated or not {}-balanced");
2227 }
c277df42 2228 if (!SIZE_ONLY) {
f3548bdc 2229 PAD *pad;
b81d288d
AB
2230
2231 if (RExC_parse - 1 - s)
830247a4 2232 sv = newSVpvn(s, RExC_parse - 1 - s);
c277df42 2233 else
79cb57f6 2234 sv = newSVpvn("", 0);
c277df42 2235
569233ed
SB
2236 ENTER;
2237 Perl_save_re_context(aTHX);
f3548bdc 2238 rop = sv_compile_2op(sv, &sop, "re", &pad);
9b978d73
DM
2239 sop->op_private |= OPpREFCOUNTED;
2240 /* re_dup will OpREFCNT_inc */
2241 OpREFCNT_set(sop, 1);
569233ed 2242 LEAVE;
c277df42 2243
830247a4
IZ
2244 n = add_data(pRExC_state, 3, "nop");
2245 RExC_rx->data->data[n] = (void*)rop;
2246 RExC_rx->data->data[n+1] = (void*)sop;
f3548bdc 2247 RExC_rx->data->data[n+2] = (void*)pad;
c277df42 2248 SvREFCNT_dec(sv);
a0ed51b3 2249 }
e24b16f9 2250 else { /* First pass */
830247a4 2251 if (PL_reginterp_cnt < ++RExC_seen_evals
e24b16f9 2252 && PL_curcop != &PL_compiling)
2cd61cdb
IZ
2253 /* No compiled RE interpolated, has runtime
2254 components ===> unsafe. */
2255 FAIL("Eval-group not allowed at runtime, use re 'eval'");
5b61d3f7 2256 if (PL_tainting && PL_tainted)
cc6b7395 2257 FAIL("Eval-group in insecure regular expression");
c277df42
IZ
2258 }
2259
830247a4 2260 nextchar(pRExC_state);
0f5d15d6 2261 if (logical) {
830247a4 2262 ret = reg_node(pRExC_state, LOGICAL);
0f5d15d6
IZ
2263 if (!SIZE_ONLY)
2264 ret->flags = 2;
830247a4 2265 regtail(pRExC_state, ret, reganode(pRExC_state, EVAL, n));
fac92740 2266 /* deal with the length of this later - MJD */
0f5d15d6
IZ
2267 return ret;
2268 }
ccb2c380
MP
2269 ret = reganode(pRExC_state, EVAL, n);
2270 Set_Node_Length(ret, RExC_parse - parse_start + 1);
2271 Set_Node_Offset(ret, parse_start);
2272 return ret;
c277df42 2273 }
fac92740 2274 case '(': /* (?(?{...})...) and (?(?=...)...) */
c277df42 2275 {
fac92740 2276 if (RExC_parse[0] == '?') { /* (?(?...)) */
b81d288d
AB
2277 if (RExC_parse[1] == '=' || RExC_parse[1] == '!'
2278 || RExC_parse[1] == '<'
830247a4 2279 || RExC_parse[1] == '{') { /* Lookahead or eval. */
c277df42
IZ
2280 I32 flag;
2281
830247a4 2282 ret = reg_node(pRExC_state, LOGICAL);
0f5d15d6
IZ
2283 if (!SIZE_ONLY)
2284 ret->flags = 1;
830247a4 2285 regtail(pRExC_state, ret, reg(pRExC_state, 1, &flag));
c277df42 2286 goto insert_if;
b81d288d 2287 }
a0ed51b3 2288 }
830247a4 2289 else if (RExC_parse[0] >= '1' && RExC_parse[0] <= '9' ) {
fac92740 2290 /* (?(1)...) */
830247a4 2291 parno = atoi(RExC_parse++);
c277df42 2292
830247a4
IZ
2293 while (isDIGIT(*RExC_parse))
2294 RExC_parse++;
fac92740
MJD
2295 ret = reganode(pRExC_state, GROUPP, parno);
2296
830247a4 2297 if ((c = *nextchar(pRExC_state)) != ')')
b45f050a 2298 vFAIL("Switch condition not recognized");
c277df42 2299 insert_if:
830247a4
IZ
2300 regtail(pRExC_state, ret, reganode(pRExC_state, IFTHEN, 0));
2301 br = regbranch(pRExC_state, &flags, 1);
c277df42 2302 if (br == NULL)
830247a4 2303 br = reganode(pRExC_state, LONGJMP, 0);
c277df42 2304 else
830247a4
IZ
2305 regtail(pRExC_state, br, reganode(pRExC_state, LONGJMP, 0));
2306 c = *nextchar(pRExC_state);
d1b80229
IZ
2307 if (flags&HASWIDTH)
2308 *flagp |= HASWIDTH;
c277df42 2309 if (c == '|') {
830247a4
IZ
2310 lastbr = reganode(pRExC_state, IFTHEN, 0); /* Fake one for optimizer. */
2311 regbranch(pRExC_state, &flags, 1);
2312 regtail(pRExC_state, ret, lastbr);
d1b80229
IZ
2313 if (flags&HASWIDTH)
2314 *flagp |= HASWIDTH;
830247a4 2315 c = *nextchar(pRExC_state);
a0ed51b3
LW
2316 }
2317 else
c277df42
IZ
2318 lastbr = NULL;
2319 if (c != ')')
8615cb43 2320 vFAIL("Switch (?(condition)... contains too many branches");
830247a4
IZ
2321 ender = reg_node(pRExC_state, TAIL);
2322 regtail(pRExC_state, br, ender);
c277df42 2323 if (lastbr) {
830247a4
IZ
2324 regtail(pRExC_state, lastbr, ender);
2325 regtail(pRExC_state, NEXTOPER(NEXTOPER(lastbr)), ender);
a0ed51b3
LW
2326 }
2327 else
830247a4 2328 regtail(pRExC_state, ret, ender);
c277df42 2329 return ret;
a0ed51b3
LW
2330 }
2331 else {
830247a4 2332 vFAIL2("Unknown switch condition (?(%.2s", RExC_parse);
c277df42
IZ
2333 }
2334 }
1b1626e4 2335 case 0:
830247a4 2336 RExC_parse--; /* for vFAIL to print correctly */
8615cb43 2337 vFAIL("Sequence (? incomplete");
1b1626e4 2338 break;
a0d0e21e 2339 default:
830247a4 2340 --RExC_parse;
fac92740 2341 parse_flags: /* (?i) */
830247a4 2342 while (*RExC_parse && strchr("iogcmsx", *RExC_parse)) {
9d1d55b5
JP
2343 /* (?g), (?gc) and (?o) are useless here
2344 and must be globally applied -- japhy */
2345
2346 if (*RExC_parse == 'o' || *RExC_parse == 'g') {
2347 if (SIZE_ONLY && ckWARN(WARN_REGEXP)) {
2348 I32 wflagbit = *RExC_parse == 'o' ? wasted_o : wasted_g;
2349 if (! (wastedflags & wflagbit) ) {
2350 wastedflags |= wflagbit;
2351 vWARN5(
2352 RExC_parse + 1,
2353 "Useless (%s%c) - %suse /%c modifier",
2354 flagsp == &negflags ? "?-" : "?",
2355 *RExC_parse,
2356 flagsp == &negflags ? "don't " : "",
2357 *RExC_parse
2358 );
2359 }
2360 }
2361 }
2362 else if (*RExC_parse == 'c') {
2363 if (SIZE_ONLY && ckWARN(WARN_REGEXP)) {
2364 if (! (wastedflags & wasted_c) ) {
2365 wastedflags |= wasted_gc;
2366 vWARN3(
2367 RExC_parse + 1,
2368 "Useless (%sc) - %suse /gc modifier",
2369 flagsp == &negflags ? "?-" : "?",
2370 flagsp == &negflags ? "don't " : ""
2371 );
2372 }
2373 }
2374 }
2375 else { pmflag(flagsp, *RExC_parse); }
2376
830247a4 2377 ++RExC_parse;
ca9dfc88 2378 }
830247a4 2379 if (*RExC_parse == '-') {
ca9dfc88 2380 flagsp = &negflags;
9d1d55b5 2381 wastedflags = 0; /* reset so (?g-c) warns twice */
830247a4 2382 ++RExC_parse;
ca9dfc88 2383 goto parse_flags;
48c036b1 2384 }
e2509266
JH
2385 RExC_flags |= posflags;
2386 RExC_flags &= ~negflags;
830247a4
IZ
2387 if (*RExC_parse == ':') {
2388 RExC_parse++;
ca9dfc88
IZ
2389 paren = ':';
2390 break;
2391 }
c277df42 2392 unknown:
830247a4
IZ
2393 if (*RExC_parse != ')') {
2394 RExC_parse++;
2395 vFAIL3("Sequence (%.*s...) not recognized", RExC_parse-seqstart, seqstart);
b45f050a 2396 }
830247a4 2397 nextchar(pRExC_state);
a0d0e21e
LW
2398 *flagp = TRYAGAIN;
2399 return NULL;
2400 }
2401 }
fac92740 2402 else { /* (...) */
830247a4
IZ
2403 parno = RExC_npar;
2404 RExC_npar++;
2405 ret = reganode(pRExC_state, OPEN, parno);
fac92740
MJD
2406 Set_Node_Length(ret, 1); /* MJD */
2407 Set_Node_Offset(ret, RExC_parse); /* MJD */
c277df42 2408 open = 1;
a0d0e21e 2409 }
a0ed51b3 2410 }
fac92740 2411 else /* ! paren */
a0d0e21e
LW
2412 ret = NULL;
2413
2414 /* Pick up the branches, linking them together. */
fac92740 2415 parse_start = RExC_parse; /* MJD */
830247a4 2416 br = regbranch(pRExC_state, &flags, 1);
fac92740
MJD
2417 /* branch_len = (paren != 0); */
2418
a0d0e21e
LW
2419 if (br == NULL)
2420 return(NULL);
830247a4
IZ
2421 if (*RExC_parse == '|') {
2422 if (!SIZE_ONLY && RExC_extralen) {
2423 reginsert(pRExC_state, BRANCHJ, br);
a0ed51b3 2424 }
fac92740 2425 else { /* MJD */
830247a4 2426 reginsert(pRExC_state, BRANCH, br);
fac92740
MJD
2427 Set_Node_Length(br, paren != 0);
2428 Set_Node_Offset_To_R(br-RExC_emit_start, parse_start-RExC_start);
2429 }
c277df42
IZ
2430 have_branch = 1;
2431 if (SIZE_ONLY)
830247a4 2432 RExC_extralen += 1; /* For BRANCHJ-BRANCH. */
a0ed51b3
LW
2433 }
2434 else if (paren == ':') {
c277df42
IZ
2435 *flagp |= flags&SIMPLE;
2436 }
2437 if (open) { /* Starts with OPEN. */
830247a4 2438 regtail(pRExC_state, ret, br); /* OPEN -> first. */
a0ed51b3
LW
2439 }
2440 else if (paren != '?') /* Not Conditional */
a0d0e21e 2441 ret = br;
32a0ca98 2442 *flagp |= flags & (SPSTART | HASWIDTH);
c277df42 2443 lastbr = br;
830247a4
IZ
2444 while (*RExC_parse == '|') {
2445 if (!SIZE_ONLY && RExC_extralen) {
2446 ender = reganode(pRExC_state, LONGJMP,0);
2447 regtail(pRExC_state, NEXTOPER(NEXTOPER(lastbr)), ender); /* Append to the previous. */
c277df42
IZ
2448 }
2449 if (SIZE_ONLY)
830247a4
IZ
2450 RExC_extralen += 2; /* Account for LONGJMP. */
2451 nextchar(pRExC_state);
2452 br = regbranch(pRExC_state, &flags, 0);
fac92740 2453
a687059c 2454 if (br == NULL)
a0d0e21e 2455 return(NULL);
830247a4 2456 regtail(pRExC_state, lastbr, br); /* BRANCH -> BRANCH. */
c277df42 2457 lastbr = br;
821b33a5
IZ
2458 if (flags&HASWIDTH)
2459 *flagp |= HASWIDTH;
a687059c 2460 *flagp |= flags&SPSTART;
a0d0e21e
LW
2461 }
2462
c277df42
IZ
2463 if (have_branch || paren != ':') {
2464 /* Make a closing node, and hook it on the end. */
2465 switch (paren) {
2466 case ':':
830247a4 2467 ender = reg_node(pRExC_state, TAIL);
c277df42
IZ
2468 break;
2469 case 1:
830247a4 2470 ender = reganode(pRExC_state, CLOSE, parno);
fac92740
MJD
2471 Set_Node_Offset(ender,RExC_parse+1); /* MJD */
2472 Set_Node_Length(ender,1); /* MJD */
c277df42
IZ
2473 break;
2474 case '<':
c277df42
IZ
2475 case ',':
2476 case '=':
2477 case '!':
c277df42 2478 *flagp &= ~HASWIDTH;
821b33a5
IZ
2479 /* FALL THROUGH */
2480 case '>':
830247a4 2481 ender = reg_node(pRExC_state, SUCCEED);
c277df42
IZ
2482 break;
2483 case 0:
830247a4 2484 ender = reg_node(pRExC_state, END);
c277df42
IZ
2485 break;
2486 }
830247a4 2487 regtail(pRExC_state, lastbr, ender);
a0d0e21e 2488
c277df42
IZ
2489 if (have_branch) {
2490 /* Hook the tails of the branches to the closing node. */
2491 for (br = ret; br != NULL; br = regnext(br)) {
830247a4 2492 regoptail(pRExC_state, br, ender);
c277df42
IZ
2493 }
2494 }
a0d0e21e 2495 }
c277df42
IZ
2496
2497 {
2498 char *p;
2499 static char parens[] = "=!<,>";
2500
2501 if (paren && (p = strchr(parens, paren))) {
eb160463 2502 U8 node = ((p - parens) % 2) ? UNLESSM : IFMATCH;
c277df42
IZ
2503 int flag = (p - parens) > 1;
2504
2505 if (paren == '>')
2506 node = SUSPEND, flag = 0;
830247a4 2507 reginsert(pRExC_state, node,ret);
ccb2c380
MP
2508 Set_Node_Offset(ret, oregcomp_parse);
2509 Set_Node_Length(ret, RExC_parse - oregcomp_parse + 2);
c277df42 2510 ret->flags = flag;
830247a4 2511 regtail(pRExC_state, ret, reg_node(pRExC_state, TAIL));
c277df42 2512 }
a0d0e21e
LW
2513 }
2514
2515 /* Check for proper termination. */
ce3e6498 2516 if (paren) {
e2509266 2517 RExC_flags = oregflags;
830247a4
IZ
2518 if (RExC_parse >= RExC_end || *nextchar(pRExC_state) != ')') {
2519 RExC_parse = oregcomp_parse;
380a0633 2520 vFAIL("Unmatched (");
ce3e6498 2521 }
a0ed51b3 2522 }
830247a4
IZ
2523 else if (!paren && RExC_parse < RExC_end) {
2524 if (*RExC_parse == ')') {
2525 RExC_parse++;
380a0633 2526 vFAIL("Unmatched )");
a0ed51b3
LW
2527 }
2528 else
b45f050a 2529 FAIL("Junk on end of regexp"); /* "Can't happen". */
a0d0e21e
LW
2530 /* NOTREACHED */
2531 }
a687059c 2532
a0d0e21e 2533 return(ret);
a687059c
LW
2534}
2535
2536/*
2537 - regbranch - one alternative of an | operator
2538 *
2539 * Implements the concatenation operator.
2540 */
76e3520e 2541STATIC regnode *
830247a4 2542S_regbranch(pTHX_ RExC_state_t *pRExC_state, I32 *flagp, I32 first)
a687059c 2543{
c277df42
IZ
2544 register regnode *ret;
2545 register regnode *chain = NULL;
2546 register regnode *latest;
2547 I32 flags = 0, c = 0;
a0d0e21e 2548
b81d288d 2549 if (first)
c277df42
IZ
2550 ret = NULL;
2551 else {
b81d288d 2552 if (!SIZE_ONLY && RExC_extralen)
830247a4 2553 ret = reganode(pRExC_state, BRANCHJ,0);
fac92740 2554 else {
830247a4 2555 ret = reg_node(pRExC_state, BRANCH);
fac92740
MJD
2556 Set_Node_Length(ret, 1);
2557 }
c277df42
IZ
2558 }
2559
b81d288d 2560 if (!first && SIZE_ONLY)
830247a4 2561 RExC_extralen += 1; /* BRANCHJ */
b81d288d 2562
c277df42 2563 *flagp = WORST; /* Tentatively. */
a0d0e21e 2564
830247a4
IZ
2565 RExC_parse--;
2566 nextchar(pRExC_state);
2567 while (RExC_parse < RExC_end && *RExC_parse != '|' && *RExC_parse != ')') {
a0d0e21e 2568 flags &= ~TRYAGAIN;
830247a4 2569 latest = regpiece(pRExC_state, &flags);
a0d0e21e
LW
2570 if (latest == NULL) {
2571 if (flags & TRYAGAIN)
2572 continue;
2573 return(NULL);
a0ed51b3
LW
2574 }
2575 else if (ret == NULL)
c277df42 2576 ret = latest;
a0d0e21e 2577 *flagp |= flags&HASWIDTH;
c277df42 2578 if (chain == NULL) /* First piece. */
a0d0e21e
LW
2579 *flagp |= flags&SPSTART;
2580 else {
830247a4
IZ
2581 RExC_naughty++;
2582 regtail(pRExC_state, chain, latest);
a687059c 2583 }
a0d0e21e 2584 chain = latest;
c277df42
IZ
2585 c++;
2586 }
2587 if (chain == NULL) { /* Loop ran zero times. */
830247a4 2588 chain = reg_node(pRExC_state, NOTHING);
c277df42
IZ
2589 if (ret == NULL)
2590 ret = chain;
2591 }
2592 if (c == 1) {
2593 *flagp |= flags&SIMPLE;
a0d0e21e 2594 }
a687059c 2595
a0d0e21e 2596 return(ret);
a687059c
LW
2597}
2598
2599/*
2600 - regpiece - something followed by possible [*+?]
2601 *
2602 * Note that the branching code sequences used for ? and the general cases
2603 * of * and + are somewhat optimized: they use the same NOTHING node as
2604 * both the endmarker for their branch list and the body of the last branch.
2605 * It might seem that this node could be dispensed with entirely, but the
2606 * endmarker role is not redundant.
2607 */
76e3520e 2608STATIC regnode *
830247a4 2609S_regpiece(pTHX_ RExC_state_t *pRExC_state, I32 *flagp)
a687059c 2610{
c277df42 2611 register regnode *ret;
a0d0e21e
LW
2612 register char op;
2613 register char *next;
2614 I32 flags;
830247a4 2615 char *origparse = RExC_parse;
a0d0e21e
LW
2616 char *maxpos;
2617 I32 min;
c277df42 2618 I32 max = REG_INFTY;
fac92740 2619 char *parse_start;
a0d0e21e 2620
830247a4 2621 ret = regatom(pRExC_state, &flags);
a0d0e21e
LW
2622 if (ret == NULL) {
2623 if (flags & TRYAGAIN)
2624 *flagp |= TRYAGAIN;
2625 return(NULL);
2626 }
2627
830247a4 2628 op = *RExC_parse;
a0d0e21e 2629
830247a4 2630 if (op == '{' && regcurly(RExC_parse)) {
fac92740 2631 parse_start = RExC_parse; /* MJD */
830247a4 2632 next = RExC_parse + 1;
a0d0e21e
LW
2633 maxpos = Nullch;
2634 while (isDIGIT(*next) || *next == ',') {
2635 if (*next == ',') {
2636 if (maxpos)
2637 break;
2638 else
2639 maxpos = next;
a687059c 2640 }
a0d0e21e
LW
2641 next++;
2642 }
2643 if (*next == '}') { /* got one */
2644 if (!maxpos)
2645 maxpos = next;
830247a4
IZ
2646 RExC_parse++;
2647 min = atoi(RExC_parse);
a0d0e21e
LW
2648 if (*maxpos == ',')
2649 maxpos++;
2650 else
830247a4 2651 maxpos = RExC_parse;
a0d0e21e
LW
2652 max = atoi(maxpos);
2653 if (!max && *maxpos != '0')
c277df42
IZ
2654 max = REG_INFTY; /* meaning "infinity" */
2655 else if (max >= REG_INFTY)
8615cb43 2656 vFAIL2("Quantifier in {,} bigger than %d", REG_INFTY - 1);
830247a4
IZ
2657 RExC_parse = next;
2658 nextchar(pRExC_state);
a0d0e21e
LW
2659
2660 do_curly:
2661 if ((flags&SIMPLE)) {
830247a4
IZ
2662 RExC_naughty += 2 + RExC_naughty / 2;
2663 reginsert(pRExC_state, CURLY, ret);
fac92740
MJD
2664 Set_Node_Offset(ret, parse_start+1); /* MJD */
2665 Set_Node_Cur_Length(ret);
a0d0e21e
LW
2666 }
2667 else {
830247a4 2668 regnode *w = reg_node(pRExC_state, WHILEM);
2c2d71f5
JH
2669
2670 w->flags = 0;
830247a4
IZ
2671 regtail(pRExC_state, ret, w);
2672 if (!SIZE_ONLY && RExC_extralen) {
2673 reginsert(pRExC_state, LONGJMP,ret);
2674 reginsert(pRExC_state, NOTHING,ret);
c277df42
IZ
2675 NEXT_OFF(ret) = 3; /* Go over LONGJMP. */
2676 }
830247a4 2677 reginsert(pRExC_state, CURLYX,ret);
fac92740
MJD
2678 /* MJD hk */
2679 Set_Node_Offset(ret, parse_start+1);
2680 Set_Node_Length(ret,
2681 op == '{' ? (RExC_parse - parse_start) : 1);
2682
830247a4 2683 if (!SIZE_ONLY && RExC_extralen)
c277df42 2684 NEXT_OFF(ret) = 3; /* Go over NOTHING to LONGJMP. */
830247a4 2685 regtail(pRExC_state, ret, reg_node(pRExC_state, NOTHING));
c277df42 2686 if (SIZE_ONLY)
830247a4
IZ
2687 RExC_whilem_seen++, RExC_extralen += 3;
2688 RExC_naughty += 4 + RExC_naughty; /* compound interest */
a0d0e21e 2689 }
c277df42 2690 ret->flags = 0;
a0d0e21e
LW
2691
2692 if (min > 0)
821b33a5
IZ
2693 *flagp = WORST;
2694 if (max > 0)
2695 *flagp |= HASWIDTH;
a0d0e21e 2696 if (max && max < min)
8615cb43 2697 vFAIL("Can't do {n,m} with n > m");
c277df42 2698 if (!SIZE_ONLY) {
eb160463
GS
2699 ARG1_SET(ret, (U16)min);
2700 ARG2_SET(ret, (U16)max);
a687059c 2701 }
a687059c 2702
a0d0e21e 2703 goto nest_check;
a687059c 2704 }
a0d0e21e 2705 }
a687059c 2706
a0d0e21e
LW
2707 if (!ISMULT1(op)) {
2708 *flagp = flags;
a687059c 2709 return(ret);
a0d0e21e 2710 }
bb20fd44 2711
c277df42 2712#if 0 /* Now runtime fix should be reliable. */
b45f050a
JF
2713
2714 /* if this is reinstated, don't forget to put this back into perldiag:
2715
2716 =item Regexp *+ operand could be empty at {#} in regex m/%s/
2717
2718 (F) The part of the regexp subject to either the * or + quantifier
2719 could match an empty string. The {#} shows in the regular
2720 expression about where the problem was discovered.
2721
2722 */
2723
bb20fd44 2724 if (!(flags&HASWIDTH) && op != '?')
b45f050a 2725 vFAIL("Regexp *+ operand could be empty");
b81d288d 2726#endif
bb20fd44 2727
fac92740 2728 parse_start = RExC_parse;
830247a4 2729 nextchar(pRExC_state);
a0d0e21e 2730
821b33a5 2731 *flagp = (op != '+') ? (WORST|SPSTART|HASWIDTH) : (WORST|HASWIDTH);
a0d0e21e
LW
2732
2733 if (op == '*' && (flags&SIMPLE)) {
830247a4 2734 reginsert(pRExC_state, STAR, ret);
c277df42 2735 ret->flags = 0;
830247a4 2736 RExC_naughty += 4;
a0d0e21e
LW
2737 }
2738 else if (op == '*') {
2739 min = 0;
2740 goto do_curly;
a0ed51b3
LW
2741 }
2742 else if (op == '+' && (flags&SIMPLE)) {
830247a4 2743 reginsert(pRExC_state, PLUS, ret);
c277df42 2744 ret->flags = 0;
830247a4 2745 RExC_naughty += 3;
a0d0e21e
LW
2746 }
2747 else if (op == '+') {
2748 min = 1;
2749 goto do_curly;
a0ed51b3
LW
2750 }
2751 else if (op == '?') {
a0d0e21e
LW
2752 min = 0; max = 1;
2753 goto do_curly;
2754 }
2755 nest_check:
e476b1b5 2756 if (ckWARN(WARN_REGEXP) && !SIZE_ONLY && !(flags&HASWIDTH) && max > REG_INFTY/3) {
830247a4 2757 vWARN3(RExC_parse,
b45f050a 2758 "%.*s matches null string many times",
830247a4 2759 RExC_parse - origparse,
b45f050a 2760 origparse);
a0d0e21e
LW
2761 }
2762
830247a4
IZ
2763 if (*RExC_parse == '?') {
2764 nextchar(pRExC_state);
2765 reginsert(pRExC_state, MINMOD, ret);
2766 regtail(pRExC_state, ret, ret + NODE_STEP_REGNODE);
a0d0e21e 2767 }
830247a4
IZ
2768 if (ISMULT2(RExC_parse)) {
2769 RExC_parse++;
b45f050a
JF
2770 vFAIL("Nested quantifiers");
2771 }
a0d0e21e
LW
2772
2773 return(ret);
a687059c
LW
2774}
2775
2776/*
2777 - regatom - the lowest level
2778 *
2779 * Optimization: gobbles an entire sequence of ordinary characters so that
2780 * it can turn them into a single node, which is smaller to store and
2781 * faster to run. Backslashed characters are exceptions, each becoming a
2782 * separate node; the code is simpler that way and it's not worth fixing.
2783 *
b45f050a 2784 * [Yes, it is worth fixing, some scripts can run twice the speed.] */
76e3520e 2785STATIC regnode *
830247a4 2786S_regatom(pTHX_ RExC_state_t *pRExC_state, I32 *flagp)
a687059c 2787{
c277df42 2788 register regnode *ret = 0;
a0d0e21e 2789 I32 flags;
f06dbbb7 2790 char *parse_start = 0;
a0d0e21e
LW
2791
2792 *flagp = WORST; /* Tentatively. */
2793
2794tryagain:
830247a4 2795 switch (*RExC_parse) {
a0d0e21e 2796 case '^':
830247a4
IZ
2797 RExC_seen_zerolen++;
2798 nextchar(pRExC_state);
e2509266 2799 if (RExC_flags & PMf_MULTILINE)
830247a4 2800 ret = reg_node(pRExC_state, MBOL);
e2509266 2801 else if (RExC_flags & PMf_SINGLELINE)
830247a4 2802 ret = reg_node(pRExC_state, SBOL);
a0d0e21e 2803 else
830247a4 2804 ret = reg_node(pRExC_state, BOL);
fac92740 2805 Set_Node_Length(ret, 1); /* MJD */
a0d0e21e
LW
2806 break;
2807 case '$':
830247a4 2808 nextchar(pRExC_state);
b81d288d 2809 if (*RExC_parse)
830247a4 2810 RExC_seen_zerolen++;
e2509266 2811 if (RExC_flags & PMf_MULTILINE)
830247a4 2812 ret = reg_node(pRExC_state, MEOL);
e2509266 2813 else if (RExC_flags & PMf_SINGLELINE)
830247a4 2814 ret = reg_node(pRExC_state, SEOL);
a0d0e21e 2815 else
830247a4 2816 ret = reg_node(pRExC_state, EOL);
fac92740 2817 Set_Node_Length(ret, 1); /* MJD */
a0d0e21e
LW
2818 break;
2819 case '.':
830247a4 2820 nextchar(pRExC_state);
e2509266 2821 if (RExC_flags & PMf_SINGLELINE)
ffc61ed2
JH
2822 ret = reg_node(pRExC_state, SANY);
2823 else
2824 ret = reg_node(pRExC_state, REG_ANY);
2825 *flagp |= HASWIDTH|SIMPLE;
830247a4 2826 RExC_naughty++;
fac92740 2827 Set_Node_Length(ret, 1); /* MJD */
a0d0e21e
LW
2828 break;
2829 case '[':
b45f050a 2830 {
830247a4 2831 char *oregcomp_parse = ++RExC_parse;
ffc61ed2 2832 ret = regclass(pRExC_state);
830247a4
IZ
2833 if (*RExC_parse != ']') {
2834 RExC_parse = oregcomp_parse;
b45f050a
JF
2835 vFAIL("Unmatched [");
2836 }
830247a4 2837 nextchar(pRExC_state);
a0d0e21e 2838 *flagp |= HASWIDTH|SIMPLE;
fac92740 2839 Set_Node_Length(ret, RExC_parse - oregcomp_parse + 1); /* MJD */
a0d0e21e 2840 break;
b45f050a 2841 }
a0d0e21e 2842 case '(':
830247a4
IZ
2843 nextchar(pRExC_state);
2844 ret = reg(pRExC_state, 1, &flags);
a0d0e21e 2845 if (ret == NULL) {
bf93d4cc 2846 if (flags & TRYAGAIN) {
830247a4 2847 if (RExC_parse == RExC_end) {
bf93d4cc
GS
2848 /* Make parent create an empty node if needed. */
2849 *flagp |= TRYAGAIN;
2850 return(NULL);
2851 }
a0d0e21e 2852 goto tryagain;
bf93d4cc 2853 }
a0d0e21e
LW
2854 return(NULL);
2855 }
c277df42 2856 *flagp |= flags&(HASWIDTH|SPSTART|SIMPLE);
a0d0e21e
LW
2857 break;
2858 case '|':
2859 case ')':
2860 if (flags & TRYAGAIN) {
2861 *flagp |= TRYAGAIN;
2862 return NULL;
2863 }
b45f050a 2864 vFAIL("Internal urp");
a0d0e21e
LW
2865 /* Supposed to be caught earlier. */
2866 break;
85afd4ae 2867 case '{':
830247a4
IZ
2868 if (!regcurly(RExC_parse)) {
2869 RExC_parse++;
85afd4ae
CS
2870 goto defchar;
2871 }
2872 /* FALL THROUGH */
a0d0e21e
LW
2873 case '?':
2874 case '+':
2875 case '*':
830247a4 2876 RExC_parse++;
b45f050a 2877 vFAIL("Quantifier follows nothing");
a0d0e21e
LW
2878 break;
2879 case '\\':
830247a4 2880 switch (*++RExC_parse) {
a0d0e21e 2881 case 'A':
830247a4
IZ
2882 RExC_seen_zerolen++;
2883 ret = reg_node(pRExC_state, SBOL);
a0d0e21e 2884 *flagp |= SIMPLE;
830247a4 2885 nextchar(pRExC_state);
fac92740 2886 Set_Node_Length(ret, 2); /* MJD */
a0d0e21e
LW
2887 break;
2888 case 'G':
830247a4
IZ
2889 ret = reg_node(pRExC_state, GPOS);
2890 RExC_seen |= REG_SEEN_GPOS;
a0d0e21e 2891 *flagp |= SIMPLE;
830247a4 2892 nextchar(pRExC_state);
fac92740 2893 Set_Node_Length(ret, 2); /* MJD */
a0d0e21e
LW
2894 break;
2895 case 'Z':
830247a4 2896 ret = reg_node(pRExC_state, SEOL);
a0d0e21e 2897 *flagp |= SIMPLE;
a1917ab9 2898 RExC_seen_zerolen++; /* Do not optimize RE away */
830247a4 2899 nextchar(pRExC_state);
a0d0e21e 2900 break;
b85d18e9 2901 case 'z':
830247a4 2902 ret = reg_node(pRExC_state, EOS);
b85d18e9 2903 *flagp |= SIMPLE;
830247a4
IZ
2904 RExC_seen_zerolen++; /* Do not optimize RE away */
2905 nextchar(pRExC_state);
fac92740 2906 Set_Node_Length(ret, 2); /* MJD */
b85d18e9 2907 break;
4a2d328f 2908 case 'C':
f33976b4
DB
2909 ret = reg_node(pRExC_state, CANY);
2910 RExC_seen |= REG_SEEN_CANY;
a0ed51b3 2911 *flagp |= HASWIDTH|SIMPLE;
830247a4 2912 nextchar(pRExC_state);
fac92740 2913 Set_Node_Length(ret, 2); /* MJD */
a0ed51b3
LW
2914 break;
2915 case 'X':
830247a4 2916 ret = reg_node(pRExC_state, CLUMP);
a0ed51b3 2917 *flagp |= HASWIDTH;
830247a4 2918 nextchar(pRExC_state);
fac92740 2919 Set_Node_Length(ret, 2); /* MJD */
a0ed51b3 2920 break;
a0d0e21e 2921 case 'w':
eb160463 2922 ret = reg_node(pRExC_state, (U8)(LOC ? ALNUML : ALNUM));
a0d0e21e 2923 *flagp |= HASWIDTH|SIMPLE;
830247a4 2924 nextchar(pRExC_state);
fac92740 2925 Set_Node_Length(ret, 2); /* MJD */
a0d0e21e
LW
2926 break;
2927 case 'W':
eb160463 2928 ret = reg_node(pRExC_state, (U8)(LOC ? NALNUML : NALNUM));
a0d0e21e 2929 *flagp |= HASWIDTH|SIMPLE;
830247a4 2930 nextchar(pRExC_state);
fac92740 2931 Set_Node_Length(ret, 2); /* MJD */
a0d0e21e
LW
2932 break;
2933 case 'b':
830247a4
IZ
2934 RExC_seen_zerolen++;
2935 RExC_seen |= REG_SEEN_LOOKBEHIND;
eb160463 2936 ret = reg_node(pRExC_state, (U8)(LOC ? BOUNDL : BOUND));
a0d0e21e 2937 *flagp |= SIMPLE;
830247a4 2938 nextchar(pRExC_state);
fac92740 2939 Set_Node_Length(ret, 2); /* MJD */
a0d0e21e
LW
2940 break;
2941 case 'B':
830247a4
IZ
2942 RExC_seen_zerolen++;
2943 RExC_seen |= REG_SEEN_LOOKBEHIND;
eb160463 2944 ret = reg_node(pRExC_state, (U8)(LOC ? NBOUNDL : NBOUND));
a0d0e21e 2945 *flagp |= SIMPLE;
830247a4 2946 nextchar(pRExC_state);
fac92740 2947 Set_Node_Length(ret, 2); /* MJD */
a0d0e21e
LW
2948 break;
2949 case 's':
eb160463 2950 ret = reg_node(pRExC_state, (U8)(LOC ? SPACEL : SPACE));
a0d0e21e 2951 *flagp |= HASWIDTH|SIMPLE;
830247a4 2952 nextchar(pRExC_state);
fac92740 2953 Set_Node_Length(ret, 2); /* MJD */
a0d0e21e
LW
2954 break;
2955 case 'S':
eb160463 2956 ret = reg_node(pRExC_state, (U8)(LOC ? NSPACEL : NSPACE));
a0d0e21e 2957 *flagp |= HASWIDTH|SIMPLE;
830247a4 2958 nextchar(pRExC_state);
fac92740 2959 Set_Node_Length(ret, 2); /* MJD */
a0d0e21e
LW
2960 break;
2961 case 'd':
ffc61ed2 2962 ret = reg_node(pRExC_state, DIGIT);
a0d0e21e 2963 *flagp |= HASWIDTH|SIMPLE;
830247a4 2964 nextchar(pRExC_state);
fac92740 2965 Set_Node_Length(ret, 2); /* MJD */
a0d0e21e
LW
2966 break;
2967 case 'D':
ffc61ed2 2968 ret = reg_node(pRExC_state, NDIGIT);
a0d0e21e 2969 *flagp |= HASWIDTH|SIMPLE;
830247a4 2970 nextchar(pRExC_state);
fac92740 2971 Set_Node_Length(ret, 2); /* MJD */
a0d0e21e 2972 break;
a14b48bc
LW
2973 case 'p':
2974 case 'P':
3568d838 2975 {
830247a4 2976 char* oldregxend = RExC_end;
ccb2c380 2977 char* parse_start = RExC_parse - 2;
a14b48bc 2978
830247a4 2979 if (RExC_parse[1] == '{') {
3568d838 2980 /* a lovely hack--pretend we saw [\pX] instead */
830247a4
IZ
2981 RExC_end = strchr(RExC_parse, '}');
2982 if (!RExC_end) {
0da60cf5 2983 U8 c = (U8)*RExC_parse;
830247a4
IZ
2984 RExC_parse += 2;
2985 RExC_end = oldregxend;
0da60cf5 2986 vFAIL2("Missing right brace on \\%c{}", c);
b45f050a 2987 }
830247a4 2988 RExC_end++;
a14b48bc 2989 }
af6f566e 2990 else {
830247a4 2991 RExC_end = RExC_parse + 2;
af6f566e
HS
2992 if (RExC_end > oldregxend)
2993 RExC_end = oldregxend;
2994 }
830247a4 2995 RExC_parse--;
a14b48bc 2996
ffc61ed2 2997 ret = regclass(pRExC_state);
a14b48bc 2998
830247a4
IZ
2999 RExC_end = oldregxend;
3000 RExC_parse--;
ccb2c380
MP
3001
3002 Set_Node_Offset(ret, parse_start + 2);
3003 Set_Node_Cur_Length(ret);
830247a4 3004 nextchar(pRExC_state);
a14b48bc
LW
3005 *flagp |= HASWIDTH|SIMPLE;
3006 }
3007 break;
a0d0e21e
LW
3008 case 'n':
3009 case 'r':
3010 case 't':
3011 case 'f':
3012 case 'e':
3013 case 'a':
3014 case 'x':
3015 case 'c':
3016 case '0':
3017 goto defchar;
3018 case '1': case '2': case '3': case '4':
3019 case '5': case '6': case '7': case '8': case '9':
3020 {
830247a4 3021 I32 num = atoi(RExC_parse);
a0d0e21e 3022
830247a4 3023 if (num > 9 && num >= RExC_npar)
a0d0e21e
LW
3024 goto defchar;
3025 else {
fac92740 3026 char * parse_start = RExC_parse - 1; /* MJD */
830247a4
IZ
3027 while (isDIGIT(*RExC_parse))
3028 RExC_parse++;
b45f050a 3029
eb160463 3030 if (!SIZE_ONLY && num > (I32)RExC_rx->nparens)
9baa0206 3031 vFAIL("Reference to nonexistent group");
830247a4 3032 RExC_sawback = 1;
eb160463
GS
3033 ret = reganode(pRExC_state,
3034 (U8)(FOLD ? (LOC ? REFFL : REFF) : REF),
3035 num);
a0d0e21e 3036 *flagp |= HASWIDTH;
fac92740
MJD
3037
3038 /* override incorrect value set in reganode MJD */
3039 Set_Node_Offset(ret, parse_start+1);
3040 Set_Node_Cur_Length(ret); /* MJD */
830247a4
IZ
3041 RExC_parse--;
3042 nextchar(pRExC_state);
a0d0e21e
LW
3043 }
3044 }
3045 break;
3046 case '\0':
830247a4 3047 if (RExC_parse >= RExC_end)
b45f050a 3048 FAIL("Trailing \\");
a0d0e21e
LW
3049 /* FALL THROUGH */
3050 default:
c9f97d15
IZ
3051 /* Do not generate `unrecognized' warnings here, we fall
3052 back into the quick-grab loop below */
a0d0e21e
LW
3053 goto defchar;
3054 }
3055 break;
4633a7c4
LW
3056
3057 case '#':
e2509266 3058 if (RExC_flags & PMf_EXTENDED) {
830247a4
IZ
3059 while (RExC_parse < RExC_end && *RExC_parse != '\n') RExC_parse++;
3060 if (RExC_parse < RExC_end)
4633a7c4
LW
3061 goto tryagain;
3062 }
3063 /* FALL THROUGH */
3064
a0d0e21e 3065 default: {
ba210ebe 3066 register STRLEN len;
58ae7d3f 3067 register UV ender;
a0d0e21e 3068 register char *p;
c277df42 3069 char *oldp, *s;
ba210ebe 3070 STRLEN numlen;
80aecb99 3071 STRLEN foldlen;
60a8b682 3072 U8 tmpbuf[UTF8_MAXLEN_FOLD+1], *foldbuf;
f06dbbb7
JH
3073
3074 parse_start = RExC_parse - 1;
a0d0e21e 3075
830247a4 3076 RExC_parse++;
a0d0e21e
LW
3077
3078 defchar:
58ae7d3f 3079 ender = 0;
eb160463
GS
3080 ret = reg_node(pRExC_state,
3081 (U8)(FOLD ? (LOC ? EXACTFL : EXACTF) : EXACT));
cd439c50 3082 s = STRING(ret);
830247a4
IZ
3083 for (len = 0, p = RExC_parse - 1;
3084 len < 127 && p < RExC_end;
a0d0e21e
LW
3085 len++)
3086 {
3087 oldp = p;
5b5a24f7 3088
e2509266 3089 if (RExC_flags & PMf_EXTENDED)
830247a4 3090 p = regwhite(p, RExC_end);
a0d0e21e
LW
3091 switch (*p) {
3092 case '^':
3093 case '$':
3094 case '.':
3095 case '[':
3096 case '(':
3097 case ')':
3098 case '|':
3099 goto loopdone;
3100 case '\\':
3101 switch (*++p) {
3102 case 'A':
1ed8eac0
JF
3103 case 'C':
3104 case 'X':
a0d0e21e
LW
3105 case 'G':
3106 case 'Z':
b85d18e9 3107 case 'z':
a0d0e21e
LW
3108 case 'w':
3109 case 'W':
3110 case 'b':
3111 case 'B':
3112 case 's':
3113 case 'S':
3114 case 'd':
3115 case 'D':
a14b48bc
LW
3116 case 'p':
3117 case 'P':
a0d0e21e
LW
3118 --p;
3119 goto loopdone;
3120 case 'n':
3121 ender = '\n';
3122 p++;
a687059c 3123 break;
a0d0e21e
LW
3124 case 'r':
3125 ender = '\r';
3126 p++;
a687059c 3127 break;
a0d0e21e
LW
3128 case 't':
3129 ender = '\t';
3130 p++;
a687059c 3131 break;
a0d0e21e
LW
3132 case 'f':
3133 ender = '\f';
3134 p++;
a687059c 3135 break;
a0d0e21e 3136 case 'e':
c7f1f016 3137 ender = ASCII_TO_NATIVE('\033');
a0d0e21e 3138 p++;
a687059c 3139 break;
a0d0e21e 3140 case 'a':
c7f1f016 3141 ender = ASCII_TO_NATIVE('\007');
a0d0e21e 3142 p++;
a687059c 3143 break;
a0d0e21e 3144 case 'x':
a0ed51b3
LW
3145 if (*++p == '{') {
3146 char* e = strchr(p, '}');
b81d288d 3147
b45f050a 3148 if (!e) {
830247a4 3149 RExC_parse = p + 1;
b45f050a
JF
3150 vFAIL("Missing right brace on \\x{}");
3151 }
de5f0749 3152 else {
a4c04bdc
NC
3153 I32 flags = PERL_SCAN_ALLOW_UNDERSCORES
3154 | PERL_SCAN_DISALLOW_PREFIX;
53305cf1
NC
3155 numlen = e - p - 1;
3156 ender = grok_hex(p + 1, &numlen, &flags, NULL);
aaa80028
JH
3157 if (ender > 0xff)
3158 RExC_utf8 = 1;
b21ed0a9
GS
3159 /* numlen is generous */
3160 if (numlen + len >= 127) {
a0ed51b3
LW
3161 p--;
3162 goto loopdone;
3163 }
3164 p = e + 1;
3165 }
a0ed51b3
LW
3166 }
3167 else {
a4c04bdc 3168 I32 flags = PERL_SCAN_DISALLOW_PREFIX;
53305cf1
NC
3169 numlen = 2;
3170 ender = grok_hex(p, &numlen, &flags, NULL);
a0ed51b3
LW
3171 p += numlen;
3172 }
a687059c 3173 break;
a0d0e21e
LW
3174 case 'c':
3175 p++;
bbce6d69 3176 ender = UCHARAT(p++);
3177 ender = toCTRL(ender);
a687059c 3178 break;
a0d0e21e
LW
3179 case '0': case '1': case '2': case '3':case '4':
3180 case '5': case '6': case '7': case '8':case '9':
3181 if (*p == '0' ||
830247a4 3182 (isDIGIT(p[1]) && atoi(p) >= RExC_npar) ) {
53305cf1
NC
3183 I32 flags = 0;
3184 numlen = 3;
3185 ender = grok_oct(p, &numlen, &flags, NULL);
a0d0e21e
LW
3186 p += numlen;
3187 }
3188 else {
3189 --p;
3190 goto loopdone;
a687059c
LW
3191 }
3192 break;
a0d0e21e 3193 case '\0':
830247a4 3194 if (p >= RExC_end)
b45f050a 3195 FAIL("Trailing \\");
a687059c 3196 /* FALL THROUGH */
a0d0e21e 3197 default:
e476b1b5 3198 if (!SIZE_ONLY && ckWARN(WARN_REGEXP) && isALPHA(*p))
4193bef7 3199 vWARN2(p + 1, "Unrecognized escape \\%c passed through", UCHARAT(p));
a0ed51b3 3200 goto normal_default;
a0d0e21e
LW
3201 }
3202 break;
a687059c 3203 default:
a0ed51b3 3204 normal_default:
fd400ab9 3205 if (UTF8_IS_START(*p) && UTF) {
5e12f4fb 3206 ender = utf8n_to_uvchr((U8*)p, RExC_end - p,
ba210ebe 3207 &numlen, 0);
a0ed51b3
LW
3208 p += numlen;
3209 }
3210 else
3211 ender = *p++;
a0d0e21e 3212 break;
a687059c 3213 }
e2509266 3214 if (RExC_flags & PMf_EXTENDED)
830247a4 3215 p = regwhite(p, RExC_end);
60a8b682
JH
3216 if (UTF && FOLD) {
3217 /* Prime the casefolded buffer. */
ac7e0132 3218 ender = toFOLD_uni(ender, tmpbuf, &foldlen);
60a8b682 3219 }
a0d0e21e
LW
3220 if (ISMULT2(p)) { /* Back off on ?+*. */
3221 if (len)
3222 p = oldp;
16ea2a2e 3223 else if (UTF) {
0ebc6274
JH
3224 STRLEN unilen;
3225
80aecb99 3226 if (FOLD) {
60a8b682 3227 /* Emit all the Unicode characters. */
80aecb99
JH
3228 for (foldbuf = tmpbuf;
3229 foldlen;
3230 foldlen -= numlen) {
3231 ender = utf8_to_uvchr(foldbuf, &numlen);
9dc45d57 3232 if (numlen > 0) {
0ebc6274
JH
3233 reguni(pRExC_state, ender, s, &unilen);
3234 s += unilen;
3235 len += unilen;
3236 /* In EBCDIC the numlen
3237 * and unilen can differ. */
9dc45d57 3238 foldbuf += numlen;
47654450
JH
3239 if (numlen >= foldlen)
3240 break;
9dc45d57
JH
3241 }
3242 else
3243 break; /* "Can't happen." */
80aecb99
JH
3244 }
3245 }
3246 else {
0ebc6274 3247 reguni(pRExC_state, ender, s, &unilen);
9ede7db1 3248 if (unilen > 0) {
0ebc6274
JH
3249 s += unilen;
3250 len += unilen;
9dc45d57 3251 }
80aecb99 3252 }
a0ed51b3 3253 }
a0d0e21e
LW
3254 else {
3255 len++;
eb160463 3256 REGC((char)ender, s++);
a0d0e21e
LW
3257 }
3258 break;
a687059c 3259 }
16ea2a2e 3260 if (UTF) {
0ebc6274
JH
3261 STRLEN unilen;
3262
80aecb99 3263 if (FOLD) {
60a8b682 3264 /* Emit all the Unicode characters. */
80aecb99
JH
3265 for (foldbuf = tmpbuf;
3266 foldlen;
3267 foldlen -= numlen) {
3268 ender = utf8_to_uvchr(foldbuf, &numlen);
9dc45d57 3269 if (numlen > 0) {
0ebc6274
JH
3270 reguni(pRExC_state, ender, s, &unilen);
3271 len += unilen;
3272 s += unilen;
3273 /* In EBCDIC the numlen
3274 * and unilen can differ. */
9dc45d57 3275 foldbuf += numlen;
47654450
JH
3276 if (numlen >= foldlen)
3277 break;
9dc45d57
JH
3278 }
3279 else
3280 break;
80aecb99
JH
3281 }
3282 }
3283 else {
0ebc6274 3284 reguni(pRExC_state, ender, s, &unilen);
9ede7db1 3285 if (unilen > 0) {
0ebc6274
JH
3286 s += unilen;
3287 len += unilen;
9dc45d57 3288 }
80aecb99
JH
3289 }
3290 len--;
a0ed51b3
LW
3291 }
3292 else
eb160463 3293 REGC((char)ender, s++);
a0d0e21e
LW
3294 }
3295 loopdone:
830247a4 3296 RExC_parse = p - 1;
fac92740 3297 Set_Node_Cur_Length(ret); /* MJD */
830247a4 3298 nextchar(pRExC_state);
793db0cb
JH
3299 {
3300 /* len is STRLEN which is unsigned, need to copy to signed */
3301 IV iv = len;
3302 if (iv < 0)
3303 vFAIL("Internal disaster");
3304 }
a0d0e21e
LW
3305 if (len > 0)
3306 *flagp |= HASWIDTH;
3307 if (len == 1)
3308 *flagp |= SIMPLE;
c277df42 3309 if (!SIZE_ONLY)
cd439c50
IZ
3310 STR_LEN(ret) = len;
3311 if (SIZE_ONLY)
830247a4 3312 RExC_size += STR_SZ(len);
cd439c50 3313 else
830247a4 3314 RExC_emit += STR_SZ(len);
a687059c 3315 }
a0d0e21e
LW
3316 break;
3317 }
a687059c 3318
60a8b682
JH
3319 /* If the encoding pragma is in effect recode the text of
3320 * any EXACT-kind nodes. */
22c54be3 3321 if (PL_encoding && PL_regkind[(U8)OP(ret)] == EXACT) {
d0063567
DK
3322 STRLEN oldlen = STR_LEN(ret);
3323 SV *sv = sv_2mortal(newSVpvn(STRING(ret), oldlen));
3324
3325 if (RExC_utf8)
3326 SvUTF8_on(sv);
3327 if (sv_utf8_downgrade(sv, TRUE)) {
3328 char *s = sv_recode_to_utf8(sv, PL_encoding);
3329 STRLEN newlen = SvCUR(sv);
3330
3331 if (SvUTF8(sv))
3332 RExC_utf8 = 1;
3333 if (!SIZE_ONLY) {
3334 DEBUG_r(PerlIO_printf(Perl_debug_log, "recode %*s to %*s\n",
3335 (int)oldlen, STRING(ret),
3336 (int)newlen, s));
3337 Copy(s, STRING(ret), newlen, char);
3338 STR_LEN(ret) += newlen - oldlen;
3339 RExC_emit += STR_SZ(newlen) - STR_SZ(oldlen);
3340 } else
3341 RExC_size += STR_SZ(newlen) - STR_SZ(oldlen);
3342 }
a72c7584
JH
3343 }
3344
a0d0e21e 3345 return(ret);
a687059c
LW
3346}
3347
873ef191 3348STATIC char *
cea2e8a9 3349S_regwhite(pTHX_ char *p, char *e)
5b5a24f7
CS
3350{
3351 while (p < e) {
3352 if (isSPACE(*p))
3353 ++p;
3354 else if (*p == '#') {
3355 do {
3356 p++;
3357 } while (p < e && *p != '\n');
3358 }
3359 else
3360 break;
3361 }
3362 return p;
3363}
3364
b8c5462f
JH
3365/* Parse POSIX character classes: [[:foo:]], [[=foo=]], [[.foo.]].
3366 Character classes ([:foo:]) can also be negated ([:^foo:]).
3367 Returns a named class id (ANYOF_XXX) if successful, -1 otherwise.
3368 Equivalence classes ([=foo=]) and composites ([.foo.]) are parsed,
beeb77fc 3369 but trigger failures because they are currently unimplemented. */
9a86a77b
JH
3370
3371#define POSIXCC_DONE(c) ((c) == ':')
3372#define POSIXCC_NOTYET(c) ((c) == '=' || (c) == '.')
3373#define POSIXCC(c) (POSIXCC_DONE(c) || POSIXCC_NOTYET(c))
3374
b8c5462f 3375STATIC I32
830247a4 3376S_regpposixcc(pTHX_ RExC_state_t *pRExC_state, I32 value)
620e46c5
JH
3377{
3378 char *posixcc = 0;
936ed897 3379 I32 namedclass = OOB_NAMEDCLASS;
620e46c5 3380
830247a4 3381 if (value == '[' && RExC_parse + 1 < RExC_end &&
620e46c5 3382 /* I smell either [: or [= or [. -- POSIX has been here, right? */
9a86a77b
JH
3383 POSIXCC(UCHARAT(RExC_parse))) {
3384 char c = UCHARAT(RExC_parse);
830247a4 3385 char* s = RExC_parse++;
b81d288d 3386
9a86a77b 3387 while (RExC_parse < RExC_end && UCHARAT(RExC_parse) != c)
830247a4
IZ
3388 RExC_parse++;
3389 if (RExC_parse == RExC_end)
620e46c5 3390 /* Grandfather lone [:, [=, [. */
830247a4 3391 RExC_parse = s;
620e46c5 3392 else {
830247a4 3393 char* t = RExC_parse++; /* skip over the c */
b8c5462f 3394
9a86a77b 3395 if (UCHARAT(RExC_parse) == ']') {
830247a4 3396 RExC_parse++; /* skip over the ending ] */
b8c5462f
JH
3397 posixcc = s + 1;
3398 if (*s == ':') {
3399 I32 complement = *posixcc == '^' ? *posixcc++ : 0;
3400 I32 skip = 5; /* the most common skip */
3401
3402 switch (*posixcc) {
3403 case 'a':
3404 if (strnEQ(posixcc, "alnum", 5))
3405 namedclass =
3406 complement ? ANYOF_NALNUMC : ANYOF_ALNUMC;
3407 else if (strnEQ(posixcc, "alpha", 5))
3408 namedclass =
3409 complement ? ANYOF_NALPHA : ANYOF_ALPHA;
3410 else if (strnEQ(posixcc, "ascii", 5))
3411 namedclass =
3412 complement ? ANYOF_NASCII : ANYOF_ASCII;
3413 break;
aaa51d5e
JF
3414 case 'b':
3415 if (strnEQ(posixcc, "blank", 5))
3416 namedclass =
3417 complement ? ANYOF_NBLANK : ANYOF_BLANK;
3418 break;
b8c5462f
JH
3419 case 'c':
3420 if (strnEQ(posixcc, "cntrl", 5))
3421 namedclass =
3422 complement ? ANYOF_NCNTRL : ANYOF_CNTRL;
3423 break;
3424 case 'd':
3425 if (strnEQ(posixcc, "digit", 5))
3426 namedclass =
3427 complement ? ANYOF_NDIGIT : ANYOF_DIGIT;
3428 break;
3429 case 'g':
3430 if (strnEQ(posixcc, "graph", 5))
3431 namedclass =
3432 complement ? ANYOF_NGRAPH : ANYOF_GRAPH;
3433 break;
3434 case 'l':
3435 if (strnEQ(posixcc, "lower", 5))
3436 namedclass =
3437 complement ? ANYOF_NLOWER : ANYOF_LOWER;
3438 break;
3439 case 'p':
3440 if (strnEQ(posixcc, "print", 5))
3441 namedclass =
3442 complement ? ANYOF_NPRINT : ANYOF_PRINT;
3443 else if (strnEQ(posixcc, "punct", 5))
3444 namedclass =
3445 complement ? ANYOF_NPUNCT : ANYOF_PUNCT;
3446 break;
3447 case 's':
3448 if (strnEQ(posixcc, "space", 5))
3449 namedclass =
aaa51d5e 3450 complement ? ANYOF_NPSXSPC : ANYOF_PSXSPC;
cc4319de 3451 break;
b8c5462f
JH
3452 case 'u':
3453 if (strnEQ(posixcc, "upper", 5))
3454 namedclass =
3455 complement ? ANYOF_NUPPER : ANYOF_UPPER;
3456 break;
3457 case 'w': /* this is not POSIX, this is the Perl \w */
3458 if (strnEQ(posixcc, "word", 4)) {
3459 namedclass =
3460 complement ? ANYOF_NALNUM : ANYOF_ALNUM;
3461 skip = 4;
3462 }
3463 break;
3464 case 'x':
3465 if (strnEQ(posixcc, "xdigit", 6)) {
3466 namedclass =
3467 complement ? ANYOF_NXDIGIT : ANYOF_XDIGIT;
3468 skip = 6;
3469 }
3470 break;
3471 }
ac561586
JH
3472 if (namedclass == OOB_NAMEDCLASS ||
3473 posixcc[skip] != ':' ||
3474 posixcc[skip+1] != ']')
b45f050a
JF
3475 {
3476 Simple_vFAIL3("POSIX class [:%.*s:] unknown",
3477 t - s - 1, s + 1);
3478 }
3479 } else if (!SIZE_ONLY) {
b8c5462f 3480 /* [[=foo=]] and [[.foo.]] are still future. */
b45f050a 3481
830247a4 3482 /* adjust RExC_parse so the warning shows after
b45f050a 3483 the class closes */
9a86a77b 3484 while (UCHARAT(RExC_parse) && UCHARAT(RExC_parse) != ']')
830247a4 3485 RExC_parse++;
b45f050a
JF
3486 Simple_vFAIL3("POSIX syntax [%c %c] is reserved for future extensions", c, c);
3487 }
b8c5462f
JH
3488 } else {
3489 /* Maternal grandfather:
3490 * "[:" ending in ":" but not in ":]" */
830247a4 3491 RExC_parse = s;
767d463e 3492 }
620e46c5
JH
3493 }
3494 }
3495
b8c5462f
JH
3496 return namedclass;
3497}
3498
3499STATIC void
830247a4 3500S_checkposixcc(pTHX_ RExC_state_t *pRExC_state)
b8c5462f 3501{
b938889d 3502 if (!SIZE_ONLY && POSIXCC(UCHARAT(RExC_parse))) {
830247a4 3503 char *s = RExC_parse;
93733859 3504 char c = *s++;
b8c5462f
JH
3505
3506 while(*s && isALNUM(*s))
3507 s++;
3508 if (*s && c == *s && s[1] == ']') {
cd84f5b2
RGS
3509 if (ckWARN(WARN_REGEXP))
3510 vWARN3(s+2,
3511 "POSIX syntax [%c %c] belongs inside character classes",
3512 c, c);
b45f050a
JF
3513
3514 /* [[=foo=]] and [[.foo.]] are still future. */
9a86a77b 3515 if (POSIXCC_NOTYET(c)) {
830247a4 3516 /* adjust RExC_parse so the error shows after
b45f050a 3517 the class closes */
9a86a77b 3518 while (UCHARAT(RExC_parse) && UCHARAT(RExC_parse++) != ']')
b45f050a
JF
3519 ;
3520 Simple_vFAIL3("POSIX syntax [%c %c] is reserved for future extensions", c, c);
3521 }
b8c5462f
JH
3522 }
3523 }
620e46c5
JH
3524}
3525
76e3520e 3526STATIC regnode *
830247a4 3527S_regclass(pTHX_ RExC_state_t *pRExC_state)
a687059c 3528{
ffc61ed2 3529 register UV value;
9a86a77b 3530 register UV nextvalue;
3568d838 3531 register IV prevvalue = OOB_UNICODE;
ffc61ed2 3532 register IV range = 0;
c277df42 3533 register regnode *ret;
ba210ebe 3534 STRLEN numlen;
ffc61ed2 3535 IV namedclass;
9c5ffd7c 3536 char *rangebegin = 0;
936ed897 3537 bool need_class = 0;
9c5ffd7c 3538 SV *listsv = Nullsv;
ffc61ed2
JH
3539 register char *e;
3540 UV n;
9e55ce06
JH
3541 bool optimize_invert = TRUE;
3542 AV* unicode_alternate = 0;
1b2d223b
JH
3543#ifdef EBCDIC
3544 UV literal_endpoint = 0;
3545#endif
ffc61ed2
JH
3546
3547 ret = reganode(pRExC_state, ANYOF, 0);
3548
3549 if (!SIZE_ONLY)
3550 ANYOF_FLAGS(ret) = 0;
3551
9a86a77b 3552 if (UCHARAT(RExC_parse) == '^') { /* Complement of range. */
ffc61ed2
JH
3553 RExC_naughty++;
3554 RExC_parse++;
3555 if (!SIZE_ONLY)
3556 ANYOF_FLAGS(ret) |= ANYOF_INVERT;
3557 }
a0d0e21e 3558
936ed897 3559 if (SIZE_ONLY)
830247a4 3560 RExC_size += ANYOF_SKIP;
936ed897 3561 else {
830247a4 3562 RExC_emit += ANYOF_SKIP;
936ed897
IZ
3563 if (FOLD)
3564 ANYOF_FLAGS(ret) |= ANYOF_FOLD;
3565 if (LOC)
3566 ANYOF_FLAGS(ret) |= ANYOF_LOCALE;
ffc61ed2
JH
3567 ANYOF_BITMAP_ZERO(ret);
3568 listsv = newSVpvn("# comment\n", 10);
a0d0e21e 3569 }
b8c5462f 3570
9a86a77b
JH
3571 nextvalue = RExC_parse < RExC_end ? UCHARAT(RExC_parse) : 0;
3572
b938889d 3573 if (!SIZE_ONLY && POSIXCC(nextvalue))
830247a4 3574 checkposixcc(pRExC_state);
b8c5462f 3575
f064b6ad
HS
3576 /* allow 1st char to be ] (allowing it to be - is dealt with later) */
3577 if (UCHARAT(RExC_parse) == ']')
3578 goto charclassloop;
ffc61ed2 3579
9a86a77b 3580 while (RExC_parse < RExC_end && UCHARAT(RExC_parse) != ']') {
ffc61ed2
JH
3581
3582 charclassloop:
3583
3584 namedclass = OOB_NAMEDCLASS; /* initialize as illegal */
3585
73b437c8 3586 if (!range)
830247a4 3587 rangebegin = RExC_parse;
ffc61ed2 3588 if (UTF) {
5e12f4fb 3589 value = utf8n_to_uvchr((U8*)RExC_parse,
3568d838
JH
3590 RExC_end - RExC_parse,
3591 &numlen, 0);
ffc61ed2
JH
3592 RExC_parse += numlen;
3593 }
3594 else
3595 value = UCHARAT(RExC_parse++);
9a86a77b
JH
3596 nextvalue = RExC_parse < RExC_end ? UCHARAT(RExC_parse) : 0;
3597 if (value == '[' && POSIXCC(nextvalue))
830247a4 3598 namedclass = regpposixcc(pRExC_state, value);
620e46c5 3599 else if (value == '\\') {
ffc61ed2 3600 if (UTF) {
5e12f4fb 3601 value = utf8n_to_uvchr((U8*)RExC_parse,
ffc61ed2
JH
3602 RExC_end - RExC_parse,
3603 &numlen, 0);
3604 RExC_parse += numlen;
3605 }
3606 else
3607 value = UCHARAT(RExC_parse++);
470c3474 3608 /* Some compilers cannot handle switching on 64-bit integer
ffc61ed2 3609 * values, therefore value cannot be an UV. Yes, this will
e2962f66
JH
3610 * be a problem later if we want switch on Unicode.
3611 * A similar issue a little bit later when switching on
3612 * namedclass. --jhi */
ffc61ed2 3613 switch ((I32)value) {
b8c5462f
JH
3614 case 'w': namedclass = ANYOF_ALNUM; break;
3615 case 'W': namedclass = ANYOF_NALNUM; break;
3616 case 's': namedclass = ANYOF_SPACE; break;
3617 case 'S': namedclass = ANYOF_NSPACE; break;
3618 case 'd': namedclass = ANYOF_DIGIT; break;
3619 case 'D': namedclass = ANYOF_NDIGIT; break;
ffc61ed2
JH
3620 case 'p':
3621 case 'P':
af6f566e 3622 if (RExC_parse >= RExC_end)
2a4859cd 3623 vFAIL2("Empty \\%c{}", (U8)value);
ffc61ed2 3624 if (*RExC_parse == '{') {
0da60cf5 3625 U8 c = (U8)value;
ffc61ed2
JH
3626 e = strchr(RExC_parse++, '}');
3627 if (!e)
0da60cf5 3628 vFAIL2("Missing right brace on \\%c{}", c);
ab13f0c7
JH
3629 while (isSPACE(UCHARAT(RExC_parse)))
3630 RExC_parse++;
3631 if (e == RExC_parse)
0da60cf5 3632 vFAIL2("Empty \\%c{}", c);
ffc61ed2 3633 n = e - RExC_parse;
ab13f0c7
JH
3634 while (isSPACE(UCHARAT(RExC_parse + n - 1)))
3635 n--;
ffc61ed2
JH
3636 }
3637 else {
3638 e = RExC_parse;
3639 n = 1;
3640 }
3641 if (!SIZE_ONLY) {
ab13f0c7
JH
3642 if (UCHARAT(RExC_parse) == '^') {
3643 RExC_parse++;
3644 n--;
3645 value = value == 'p' ? 'P' : 'p'; /* toggle */
3646 while (isSPACE(UCHARAT(RExC_parse))) {
3647 RExC_parse++;
3648 n--;
3649 }
3650 }
ffc61ed2 3651 if (value == 'p')
ab13f0c7
JH
3652 Perl_sv_catpvf(aTHX_ listsv,
3653 "+utf8::%.*s\n", (int)n, RExC_parse);
ffc61ed2 3654 else
ab13f0c7
JH
3655 Perl_sv_catpvf(aTHX_ listsv,
3656 "!utf8::%.*s\n", (int)n, RExC_parse);
ffc61ed2
JH
3657 }
3658 RExC_parse = e + 1;
3659 ANYOF_FLAGS(ret) |= ANYOF_UNICODE;
3660 continue;
b8c5462f
JH
3661 case 'n': value = '\n'; break;
3662 case 'r': value = '\r'; break;
3663 case 't': value = '\t'; break;
3664 case 'f': value = '\f'; break;
3665 case 'b': value = '\b'; break;
c7f1f016
NIS
3666 case 'e': value = ASCII_TO_NATIVE('\033');break;
3667 case 'a': value = ASCII_TO_NATIVE('\007');break;
b8c5462f 3668 case 'x':
ffc61ed2 3669 if (*RExC_parse == '{') {
a4c04bdc
NC
3670 I32 flags = PERL_SCAN_ALLOW_UNDERSCORES
3671 | PERL_SCAN_DISALLOW_PREFIX;
ffc61ed2 3672 e = strchr(RExC_parse++, '}');
b81d288d 3673 if (!e)
ffc61ed2 3674 vFAIL("Missing right brace on \\x{}");
53305cf1
NC
3675
3676 numlen = e - RExC_parse;
3677 value = grok_hex(RExC_parse, &numlen, &flags, NULL);
ffc61ed2
JH
3678 RExC_parse = e + 1;
3679 }
3680 else {
a4c04bdc 3681 I32 flags = PERL_SCAN_DISALLOW_PREFIX;
53305cf1
NC
3682 numlen = 2;
3683 value = grok_hex(RExC_parse, &numlen, &flags, NULL);
ffc61ed2
JH
3684 RExC_parse += numlen;
3685 }
b8c5462f
JH
3686 break;
3687 case 'c':
830247a4 3688 value = UCHARAT(RExC_parse++);
b8c5462f
JH
3689 value = toCTRL(value);
3690 break;
3691 case '0': case '1': case '2': case '3': case '4':
3692 case '5': case '6': case '7': case '8': case '9':
53305cf1
NC
3693 {
3694 I32 flags = 0;
3695 numlen = 3;
3696 value = grok_oct(--RExC_parse, &numlen, &flags, NULL);
830247a4 3697 RExC_parse += numlen;
b8c5462f 3698 break;
53305cf1 3699 }
1028017a 3700 default:
e476b1b5 3701 if (!SIZE_ONLY && ckWARN(WARN_REGEXP) && isALPHA(value))
ffc61ed2
JH
3702 vWARN2(RExC_parse,
3703 "Unrecognized escape \\%c in character class passed through",
3704 (int)value);
1028017a 3705 break;
b8c5462f 3706 }
ffc61ed2 3707 } /* end of \blah */
1b2d223b
JH
3708#ifdef EBCDIC
3709 else
3710 literal_endpoint++;
3711#endif
ffc61ed2
JH
3712
3713 if (namedclass > OOB_NAMEDCLASS) { /* this is a named class \blah */
3714
3715 if (!SIZE_ONLY && !need_class)
936ed897 3716 ANYOF_CLASS_ZERO(ret);
ffc61ed2 3717
936ed897 3718 need_class = 1;
ffc61ed2
JH
3719
3720 /* a bad range like a-\d, a-[:digit:] ? */
3721 if (range) {
73b437c8 3722 if (!SIZE_ONLY) {
e476b1b5 3723 if (ckWARN(WARN_REGEXP))
830247a4 3724 vWARN4(RExC_parse,
b45f050a 3725 "False [] range \"%*.*s\"",
830247a4
IZ
3726 RExC_parse - rangebegin,
3727 RExC_parse - rangebegin,
b45f050a 3728 rangebegin);
3568d838
JH
3729 if (prevvalue < 256) {
3730 ANYOF_BITMAP_SET(ret, prevvalue);
ffc61ed2
JH
3731 ANYOF_BITMAP_SET(ret, '-');
3732 }
3733 else {
3734 ANYOF_FLAGS(ret) |= ANYOF_UNICODE;
3735 Perl_sv_catpvf(aTHX_ listsv,
3568d838 3736 "%04"UVxf"\n%04"UVxf"\n", (UV)prevvalue, (UV) '-');
ffc61ed2 3737 }
b8c5462f 3738 }
ffc61ed2
JH
3739
3740 range = 0; /* this was not a true range */
73b437c8 3741 }
ffc61ed2 3742
73b437c8 3743 if (!SIZE_ONLY) {
3568d838
JH
3744 if (namedclass > OOB_NAMEDCLASS)
3745 optimize_invert = FALSE;
e2962f66
JH
3746 /* Possible truncation here but in some 64-bit environments
3747 * the compiler gets heartburn about switch on 64-bit values.
3748 * A similar issue a little earlier when switching on value.
98f323fa 3749 * --jhi */
e2962f66 3750 switch ((I32)namedclass) {
73b437c8
JH
3751 case ANYOF_ALNUM:
3752 if (LOC)
936ed897 3753 ANYOF_CLASS_SET(ret, ANYOF_ALNUM);
73b437c8
JH
3754 else {
3755 for (value = 0; value < 256; value++)
3756 if (isALNUM(value))
936ed897 3757 ANYOF_BITMAP_SET(ret, value);
73b437c8 3758 }
ffc61ed2 3759 Perl_sv_catpvf(aTHX_ listsv, "+utf8::IsWord\n");
73b437c8
JH
3760 break;
3761 case ANYOF_NALNUM:
3762 if (LOC)
936ed897 3763 ANYOF_CLASS_SET(ret, ANYOF_NALNUM);
73b437c8
JH
3764 else {
3765 for (value = 0; value < 256; value++)
3766 if (!isALNUM(value))
936ed897 3767 ANYOF_BITMAP_SET(ret, value);
73b437c8 3768 }
ffc61ed2 3769 Perl_sv_catpvf(aTHX_ listsv, "!utf8::IsWord\n");
73b437c8 3770 break;
ffc61ed2 3771 case ANYOF_ALNUMC:
73b437c8 3772 if (LOC)
ffc61ed2 3773 ANYOF_CLASS_SET(ret, ANYOF_ALNUMC);
73b437c8
JH
3774 else {
3775 for (value = 0; value < 256; value++)
ffc61ed2 3776 if (isALNUMC(value))
936ed897 3777 ANYOF_BITMAP_SET(ret, value);
73b437c8 3778 }
ffc61ed2 3779 Perl_sv_catpvf(aTHX_ listsv, "+utf8::IsAlnum\n");
73b437c8
JH
3780 break;
3781 case ANYOF_NALNUMC:
3782 if (LOC)
936ed897 3783 ANYOF_CLASS_SET(ret, ANYOF_NALNUMC);
73b437c8
JH
3784 else {
3785 for (value = 0; value < 256; value++)
3786 if (!isALNUMC(value))
936ed897 3787 ANYOF_BITMAP_SET(ret, value);
73b437c8 3788 }
ffc61ed2 3789 Perl_sv_catpvf(aTHX_ listsv, "!utf8::IsAlnum\n");
73b437c8
JH
3790 break;
3791 case ANYOF_ALPHA:
3792 if (LOC)
936ed897 3793 ANYOF_CLASS_SET(ret, ANYOF_ALPHA);
73b437c8
JH
3794 else {
3795 for (value = 0; value < 256; value++)
3796 if (isALPHA(value))
936ed897 3797 ANYOF_BITMAP_SET(ret, value);
73b437c8 3798 }
ffc61ed2 3799 Perl_sv_catpvf(aTHX_ listsv, "+utf8::IsAlpha\n");
73b437c8
JH
3800 break;
3801 case ANYOF_NALPHA:
3802 if (LOC)
936ed897 3803 ANYOF_CLASS_SET(ret, ANYOF_NALPHA);
73b437c8
JH
3804 else {
3805 for (value = 0; value < 256; value++)
3806 if (!isALPHA(value))
936ed897 3807 ANYOF_BITMAP_SET(ret, value);
73b437c8 3808 }
ffc61ed2 3809 Perl_sv_catpvf(aTHX_ listsv, "!utf8::IsAlpha\n");
73b437c8
JH
3810 break;
3811 case ANYOF_ASCII:
3812 if (LOC)
936ed897 3813 ANYOF_CLASS_SET(ret, ANYOF_ASCII);
73b437c8 3814 else {
c7f1f016 3815#ifndef EBCDIC
1ba5c669
JH
3816 for (value = 0; value < 128; value++)
3817 ANYOF_BITMAP_SET(ret, value);
3818#else /* EBCDIC */
ffbc6a93 3819 for (value = 0; value < 256; value++) {
3a3c4447
JH
3820 if (isASCII(value))
3821 ANYOF_BITMAP_SET(ret, value);
ffbc6a93 3822 }
1ba5c669 3823#endif /* EBCDIC */
73b437c8 3824 }
ffc61ed2 3825 Perl_sv_catpvf(aTHX_ listsv, "+utf8::IsASCII\n");
73b437c8
JH
3826 break;
3827 case ANYOF_NASCII:
3828 if (LOC)
936ed897 3829 ANYOF_CLASS_SET(ret, ANYOF_NASCII);
73b437c8 3830 else {
c7f1f016 3831#ifndef EBCDIC
1ba5c669
JH
3832 for (value = 128; value < 256; value++)
3833 ANYOF_BITMAP_SET(ret, value);
3834#else /* EBCDIC */
ffbc6a93 3835 for (value = 0; value < 256; value++) {
3a3c4447
JH
3836 if (!isASCII(value))
3837 ANYOF_BITMAP_SET(ret, value);
ffbc6a93 3838 }
1ba5c669 3839#endif /* EBCDIC */
73b437c8 3840 }
ffc61ed2 3841 Perl_sv_catpvf(aTHX_ listsv, "!utf8::IsASCII\n");
73b437c8 3842 break;
aaa51d5e
JF
3843 case ANYOF_BLANK:
3844 if (LOC)
3845 ANYOF_CLASS_SET(ret, ANYOF_BLANK);
3846 else {
3847 for (value = 0; value < 256; value++)
3848 if (isBLANK(value))
3849 ANYOF_BITMAP_SET(ret, value);
3850 }
ffc61ed2 3851 Perl_sv_catpvf(aTHX_ listsv, "+utf8::IsBlank\n");
aaa51d5e
JF
3852 break;
3853 case ANYOF_NBLANK:
3854 if (LOC)
3855 ANYOF_CLASS_SET(ret, ANYOF_NBLANK);
3856 else {
3857 for (value = 0; value < 256; value++)
3858 if (!isBLANK(value))
3859 ANYOF_BITMAP_SET(ret, value);
3860 }
ffc61ed2 3861 Perl_sv_catpvf(aTHX_ listsv, "!utf8::IsBlank\n");
aaa51d5e 3862 break;
73b437c8
JH
3863 case ANYOF_CNTRL:
3864 if (LOC)
936ed897 3865 ANYOF_CLASS_SET(ret, ANYOF_CNTRL);
73b437c8
JH
3866 else {
3867 for (value = 0; value < 256; value++)
3868 if (isCNTRL(value))
936ed897 3869 ANYOF_BITMAP_SET(ret, value);
73b437c8 3870 }
ffc61ed2 3871 Perl_sv_catpvf(aTHX_ listsv, "+utf8::IsCntrl\n");
73b437c8
JH
3872 break;
3873 case ANYOF_NCNTRL:
3874 if (LOC)
936ed897 3875 ANYOF_CLASS_SET(ret, ANYOF_NCNTRL);
73b437c8
JH
3876 else {
3877 for (value = 0; value < 256; value++)
3878 if (!isCNTRL(value))
936ed897 3879 ANYOF_BITMAP_SET(ret, value);
73b437c8 3880 }
ffc61ed2
JH
3881 Perl_sv_catpvf(aTHX_ listsv, "!utf8::IsCntrl\n");
3882 break;
3883 case ANYOF_DIGIT:
3884 if (LOC)
3885 ANYOF_CLASS_SET(ret, ANYOF_DIGIT);
3886 else {
3887 /* consecutive digits assumed */
3888 for (value = '0'; value <= '9'; value++)
3889 ANYOF_BITMAP_SET(ret, value);
3890 }
3891 Perl_sv_catpvf(aTHX_ listsv, "+utf8::IsDigit\n");
3892 break;
3893 case ANYOF_NDIGIT:
3894 if (LOC)
3895 ANYOF_CLASS_SET(ret, ANYOF_NDIGIT);
3896 else {
3897 /* consecutive digits assumed */
3898 for (value = 0; value < '0'; value++)
3899 ANYOF_BITMAP_SET(ret, value);
3900 for (value = '9' + 1; value < 256; value++)
3901 ANYOF_BITMAP_SET(ret, value);
3902 }
3903 Perl_sv_catpvf(aTHX_ listsv, "!utf8::IsDigit\n");
73b437c8
JH
3904 break;
3905 case ANYOF_GRAPH:
3906 if (LOC)
936ed897 3907 ANYOF_CLASS_SET(ret, ANYOF_GRAPH);
73b437c8
JH
3908 else {
3909 for (value = 0; value < 256; value++)
3910 if (isGRAPH(value))
936ed897 3911 ANYOF_BITMAP_SET(ret, value);
73b437c8 3912 }
ffc61ed2 3913 Perl_sv_catpvf(aTHX_ listsv, "+utf8::IsGraph\n");
73b437c8
JH
3914 break;
3915 case ANYOF_NGRAPH:
3916 if (LOC)
936ed897 3917 ANYOF_CLASS_SET(ret, ANYOF_NGRAPH);
73b437c8
JH
3918 else {
3919 for (value = 0; value < 256; value++)
3920 if (!isGRAPH(value))
936ed897 3921 ANYOF_BITMAP_SET(ret, value);
73b437c8 3922 }
ffc61ed2 3923 Perl_sv_catpvf(aTHX_ listsv, "!utf8::IsGraph\n");
73b437c8
JH
3924 break;
3925 case ANYOF_LOWER:
3926 if (LOC)
936ed897 3927 ANYOF_CLASS_SET(ret, ANYOF_LOWER);
73b437c8
JH
3928 else {
3929 for (value = 0; value < 256; value++)
3930 if (isLOWER(value))
936ed897 3931 ANYOF_BITMAP_SET(ret, value);
73b437c8 3932 }
ffc61ed2 3933 Perl_sv_catpvf(aTHX_ listsv, "+utf8::IsLower\n");
73b437c8
JH
3934 break;
3935 case ANYOF_NLOWER:
3936 if (LOC)
936ed897 3937 ANYOF_CLASS_SET(ret, ANYOF_NLOWER);
73b437c8
JH
3938 else {
3939 for (value = 0; value < 256; value++)
3940 if (!isLOWER(value))
936ed897 3941 ANYOF_BITMAP_SET(ret, value);
73b437c8 3942 }
ffc61ed2 3943 Perl_sv_catpvf(aTHX_ listsv, "!utf8::IsLower\n");
73b437c8
JH
3944 break;
3945 case ANYOF_PRINT:
3946 if (LOC)
936ed897 3947 ANYOF_CLASS_SET(ret, ANYOF_PRINT);
73b437c8
JH
3948 else {
3949 for (value = 0; value < 256; value++)
3950 if (isPRINT(value))
936ed897 3951 ANYOF_BITMAP_SET(ret, value);
73b437c8 3952 }
ffc61ed2 3953 Perl_sv_catpvf(aTHX_ listsv, "+utf8::IsPrint\n");
73b437c8
JH
3954 break;
3955 case ANYOF_NPRINT:
3956 if (LOC)
936ed897 3957 ANYOF_CLASS_SET(ret, ANYOF_NPRINT);
73b437c8
JH
3958 else {
3959 for (value = 0; value < 256; value++)
3960 if (!isPRINT(value))
936ed897 3961 ANYOF_BITMAP_SET(ret, value);
73b437c8 3962 }
ffc61ed2 3963 Perl_sv_catpvf(aTHX_ listsv, "!utf8::IsPrint\n");
73b437c8 3964 break;
aaa51d5e
JF
3965 case ANYOF_PSXSPC:
3966 if (LOC)
3967 ANYOF_CLASS_SET(ret, ANYOF_PSXSPC);
3968 else {
3969 for (value = 0; value < 256; value++)
3970 if (isPSXSPC(value))
3971 ANYOF_BITMAP_SET(ret, value);
3972 }
ffc61ed2 3973 Perl_sv_catpvf(aTHX_ listsv, "+utf8::IsSpace\n");
aaa51d5e
JF
3974 break;
3975 case ANYOF_NPSXSPC:
3976 if (LOC)
3977 ANYOF_CLASS_SET(ret, ANYOF_NPSXSPC);
3978 else {
3979 for (value = 0; value < 256; value++)
3980 if (!isPSXSPC(value))
3981 ANYOF_BITMAP_SET(ret, value);
3982 }
ffc61ed2 3983 Perl_sv_catpvf(aTHX_ listsv, "!utf8::IsSpace\n");
aaa51d5e 3984 break;
73b437c8
JH
3985 case ANYOF_PUNCT:
3986 if (LOC)
936ed897 3987 ANYOF_CLASS_SET(ret, ANYOF_PUNCT);
73b437c8
JH
3988 else {
3989 for (value = 0; value < 256; value++)
3990 if (isPUNCT(value))
936ed897 3991 ANYOF_BITMAP_SET(ret, value);
73b437c8 3992 }
ffc61ed2 3993 Perl_sv_catpvf(aTHX_ listsv, "+utf8::IsPunct\n");
73b437c8
JH
3994 break;
3995 case ANYOF_NPUNCT:
3996 if (LOC)
936ed897 3997 ANYOF_CLASS_SET(ret, ANYOF_NPUNCT);
73b437c8
JH
3998 else {
3999 for (value = 0; value < 256; value++)
4000 if (!isPUNCT(value))
936ed897 4001 ANYOF_BITMAP_SET(ret, value);
73b437c8 4002 }
ffc61ed2
JH
4003 Perl_sv_catpvf(aTHX_ listsv, "!utf8::IsPunct\n");
4004 break;
4005 case ANYOF_SPACE:
4006 if (LOC)
4007 ANYOF_CLASS_SET(ret, ANYOF_SPACE);
4008 else {
4009 for (value = 0; value < 256; value++)
4010 if (isSPACE(value))
4011 ANYOF_BITMAP_SET(ret, value);
4012 }
4013 Perl_sv_catpvf(aTHX_ listsv, "+utf8::IsSpacePerl\n");
4014 break;
4015 case ANYOF_NSPACE:
4016 if (LOC)
4017 ANYOF_CLASS_SET(ret, ANYOF_NSPACE);
4018 else {
4019 for (value = 0; value < 256; value++)
4020 if (!isSPACE(value))
4021 ANYOF_BITMAP_SET(ret, value);
4022 }
4023 Perl_sv_catpvf(aTHX_ listsv, "!utf8::IsSpacePerl\n");
73b437c8
JH
4024 break;
4025 case ANYOF_UPPER:
4026 if (LOC)
936ed897 4027 ANYOF_CLASS_SET(ret, ANYOF_UPPER);
73b437c8
JH
4028 else {
4029 for (value = 0; value < 256; value++)
4030 if (isUPPER(value))
936ed897 4031 ANYOF_BITMAP_SET(ret, value);
73b437c8 4032 }
ffc61ed2 4033 Perl_sv_catpvf(aTHX_ listsv, "+utf8::IsUpper\n");
73b437c8
JH
4034 break;
4035 case ANYOF_NUPPER:
4036 if (LOC)
936ed897 4037 ANYOF_CLASS_SET(ret, ANYOF_NUPPER);
73b437c8
JH
4038 else {
4039 for (value = 0; value < 256; value++)
4040 if (!isUPPER(value))
936ed897 4041 ANYOF_BITMAP_SET(ret, value);
73b437c8 4042 }
ffc61ed2 4043 Perl_sv_catpvf(aTHX_ listsv, "!utf8::IsUpper\n");
73b437c8
JH
4044 break;
4045 case ANYOF_XDIGIT:
4046 if (LOC)
936ed897 4047 ANYOF_CLASS_SET(ret, ANYOF_XDIGIT);
73b437c8
JH
4048 else {
4049 for (value = 0; value < 256; value++)
4050 if (isXDIGIT(value))
936ed897 4051 ANYOF_BITMAP_SET(ret, value);
73b437c8 4052 }
ffc61ed2 4053 Perl_sv_catpvf(aTHX_ listsv, "+utf8::IsXDigit\n");
73b437c8
JH
4054 break;
4055 case ANYOF_NXDIGIT:
4056 if (LOC)
936ed897 4057 ANYOF_CLASS_SET(ret, ANYOF_NXDIGIT);
73b437c8
JH
4058 else {
4059 for (value = 0; value < 256; value++)
4060 if (!isXDIGIT(value))
936ed897 4061 ANYOF_BITMAP_SET(ret, value);
73b437c8 4062 }
ffc61ed2 4063 Perl_sv_catpvf(aTHX_ listsv, "!utf8::IsXDigit\n");
73b437c8
JH
4064 break;
4065 default:
b45f050a 4066 vFAIL("Invalid [::] class");
73b437c8 4067 break;
b8c5462f 4068 }
b8c5462f 4069 if (LOC)
936ed897 4070 ANYOF_FLAGS(ret) |= ANYOF_CLASS;
73b437c8 4071 continue;
a0d0e21e 4072 }
ffc61ed2
JH
4073 } /* end of namedclass \blah */
4074
a0d0e21e 4075 if (range) {
eb160463 4076 if (prevvalue > (IV)value) /* b-a */ {
b45f050a 4077 Simple_vFAIL4("Invalid [] range \"%*.*s\"",
830247a4
IZ
4078 RExC_parse - rangebegin,
4079 RExC_parse - rangebegin,
b45f050a 4080 rangebegin);
3568d838 4081 range = 0; /* not a valid range */
73b437c8 4082 }
a0d0e21e
LW
4083 }
4084 else {
3568d838 4085 prevvalue = value; /* save the beginning of the range */
830247a4
IZ
4086 if (*RExC_parse == '-' && RExC_parse+1 < RExC_end &&
4087 RExC_parse[1] != ']') {
4088 RExC_parse++;
ffc61ed2
JH
4089
4090 /* a bad range like \w-, [:word:]- ? */
4091 if (namedclass > OOB_NAMEDCLASS) {
e476b1b5 4092 if (ckWARN(WARN_REGEXP))
830247a4 4093 vWARN4(RExC_parse,
b45f050a 4094 "False [] range \"%*.*s\"",
830247a4
IZ
4095 RExC_parse - rangebegin,
4096 RExC_parse - rangebegin,
b45f050a 4097 rangebegin);
73b437c8 4098 if (!SIZE_ONLY)
936ed897 4099 ANYOF_BITMAP_SET(ret, '-');
73b437c8 4100 } else
ffc61ed2
JH
4101 range = 1; /* yeah, it's a range! */
4102 continue; /* but do it the next time */
a0d0e21e 4103 }
a687059c 4104 }
ffc61ed2 4105
93733859 4106 /* now is the next time */
ae5c130c 4107 if (!SIZE_ONLY) {
3568d838
JH
4108 IV i;
4109
4110 if (prevvalue < 256) {
4111 IV ceilvalue = value < 256 ? value : 255;
4112
4113#ifdef EBCDIC
1b2d223b
JH
4114 /* In EBCDIC [\x89-\x91] should include
4115 * the \x8e but [i-j] should not. */
4116 if (literal_endpoint == 2 &&
4117 ((isLOWER(prevvalue) && isLOWER(ceilvalue)) ||
4118 (isUPPER(prevvalue) && isUPPER(ceilvalue))))
ffc61ed2 4119 {
3568d838
JH
4120 if (isLOWER(prevvalue)) {
4121 for (i = prevvalue; i <= ceilvalue; i++)
ffc61ed2
JH
4122 if (isLOWER(i))
4123 ANYOF_BITMAP_SET(ret, i);
4124 } else {
3568d838 4125 for (i = prevvalue; i <= ceilvalue; i++)
ffc61ed2
JH
4126 if (isUPPER(i))
4127 ANYOF_BITMAP_SET(ret, i);
4128 }
8ada0baa 4129 }
ffc61ed2 4130 else
8ada0baa 4131#endif
a5961de5
JH
4132 for (i = prevvalue; i <= ceilvalue; i++)
4133 ANYOF_BITMAP_SET(ret, i);
3568d838 4134 }
a5961de5 4135 if (value > 255 || UTF) {
b08decb7
JH
4136 UV prevnatvalue = NATIVE_TO_UNI(prevvalue);
4137 UV natvalue = NATIVE_TO_UNI(value);
4138
ffc61ed2 4139 ANYOF_FLAGS(ret) |= ANYOF_UNICODE;
b08decb7 4140 if (prevnatvalue < natvalue) { /* what about > ? */
ffc61ed2 4141 Perl_sv_catpvf(aTHX_ listsv, "%04"UVxf"\t%04"UVxf"\n",
b08decb7
JH
4142 prevnatvalue, natvalue);
4143 }
4144 else if (prevnatvalue == natvalue) {
4145 Perl_sv_catpvf(aTHX_ listsv, "%04"UVxf"\n", natvalue);
09091399 4146 if (FOLD) {
254ba52a
JH
4147 U8 foldbuf[UTF8_MAXLEN_FOLD+1];
4148 STRLEN foldlen;
2f3bf011 4149 UV f = to_uni_fold(natvalue, foldbuf, &foldlen);
254ba52a 4150
c840d2a2
JH
4151 /* If folding and foldable and a single
4152 * character, insert also the folded version
4153 * to the charclass. */
9e55ce06 4154 if (f != value) {
eb160463 4155 if (foldlen == (STRLEN)UNISKIP(f))
9e55ce06
JH
4156 Perl_sv_catpvf(aTHX_ listsv,
4157 "%04"UVxf"\n", f);
4158 else {
4159 /* Any multicharacter foldings
4160 * require the following transform:
4161 * [ABCDEF] -> (?:[ABCabcDEFd]|pq|rst)
4162 * where E folds into "pq" and F folds
4163 * into "rst", all other characters
4164 * fold to single characters. We save
4165 * away these multicharacter foldings,
4166 * to be later saved as part of the
4167 * additional "s" data. */
4168 SV *sv;
4169
4170 if (!unicode_alternate)
4171 unicode_alternate = newAV();
4172 sv = newSVpvn((char*)foldbuf, foldlen);
4173 SvUTF8_on(sv);
4174 av_push(unicode_alternate, sv);
4175 }
4176 }
254ba52a 4177
60a8b682
JH
4178 /* If folding and the value is one of the Greek
4179 * sigmas insert a few more sigmas to make the
4180 * folding rules of the sigmas to work right.
4181 * Note that not all the possible combinations
4182 * are handled here: some of them are handled
9e55ce06
JH
4183 * by the standard folding rules, and some of
4184 * them (literal or EXACTF cases) are handled
4185 * during runtime in regexec.c:S_find_byclass(). */
09091399
JH
4186 if (value == UNICODE_GREEK_SMALL_LETTER_FINAL_SIGMA) {
4187 Perl_sv_catpvf(aTHX_ listsv, "%04"UVxf"\n",
eb050b28 4188 (UV)UNICODE_GREEK_CAPITAL_LETTER_SIGMA);
09091399 4189 Perl_sv_catpvf(aTHX_ listsv, "%04"UVxf"\n",
eb050b28 4190 (UV)UNICODE_GREEK_SMALL_LETTER_SIGMA);
09091399
JH
4191 }
4192 else if (value == UNICODE_GREEK_CAPITAL_LETTER_SIGMA)
4193 Perl_sv_catpvf(aTHX_ listsv, "%04"UVxf"\n",
eb050b28 4194 (UV)UNICODE_GREEK_SMALL_LETTER_SIGMA);
09091399
JH
4195 }
4196 }
ffc61ed2 4197 }
1b2d223b
JH
4198#ifdef EBCDIC
4199 literal_endpoint = 0;
4200#endif
8ada0baa 4201 }
ffc61ed2
JH
4202
4203 range = 0; /* this range (if it was one) is done now */
a0d0e21e 4204 }
ffc61ed2 4205
936ed897 4206 if (need_class) {
4f66b38d 4207 ANYOF_FLAGS(ret) |= ANYOF_LARGE;
936ed897 4208 if (SIZE_ONLY)
830247a4 4209 RExC_size += ANYOF_CLASS_ADD_SKIP;
936ed897 4210 else
830247a4 4211 RExC_emit += ANYOF_CLASS_ADD_SKIP;
936ed897 4212 }
ffc61ed2 4213
ae5c130c 4214 /* optimize case-insensitive simple patterns (e.g. /[a-z]/i) */
b8c5462f 4215 if (!SIZE_ONLY &&
ffc61ed2 4216 /* If the only flag is folding (plus possibly inversion). */
516a5887
JH
4217 ((ANYOF_FLAGS(ret) & (ANYOF_FLAGS_ALL ^ ANYOF_INVERT)) == ANYOF_FOLD)
4218 ) {
a0ed51b3 4219 for (value = 0; value < 256; ++value) {
936ed897 4220 if (ANYOF_BITMAP_TEST(ret, value)) {
eb160463 4221 UV fold = PL_fold[value];
ffc61ed2
JH
4222
4223 if (fold != value)
4224 ANYOF_BITMAP_SET(ret, fold);
ae5c130c
GS
4225 }
4226 }
936ed897 4227 ANYOF_FLAGS(ret) &= ~ANYOF_FOLD;
ae5c130c 4228 }
ffc61ed2 4229
ae5c130c 4230 /* optimize inverted simple patterns (e.g. [^a-z]) */
3568d838 4231 if (!SIZE_ONLY && optimize_invert &&
ffc61ed2
JH
4232 /* If the only flag is inversion. */
4233 (ANYOF_FLAGS(ret) & ANYOF_FLAGS_ALL) == ANYOF_INVERT) {
b8c5462f 4234 for (value = 0; value < ANYOF_BITMAP_SIZE; ++value)
936ed897 4235 ANYOF_BITMAP(ret)[value] ^= ANYOF_FLAGS_ALL;
1aa99e6b 4236 ANYOF_FLAGS(ret) = ANYOF_UNICODE_ALL;
ae5c130c 4237 }
a0d0e21e 4238
b81d288d 4239 if (!SIZE_ONLY) {
fde631ed 4240 AV *av = newAV();
ffc61ed2
JH
4241 SV *rv;
4242
9e55ce06 4243 /* The 0th element stores the character class description
6a0407ee 4244 * in its textual form: used later (regexec.c:Perl_regclass_swash())
9e55ce06
JH
4245 * to initialize the appropriate swash (which gets stored in
4246 * the 1st element), and also useful for dumping the regnode.
4247 * The 2nd element stores the multicharacter foldings,
6a0407ee 4248 * used later (regexec.c:S_reginclass()). */
ffc61ed2
JH
4249 av_store(av, 0, listsv);
4250 av_store(av, 1, NULL);
9e55ce06 4251 av_store(av, 2, (SV*)unicode_alternate);
ffc61ed2 4252 rv = newRV_noinc((SV*)av);
19860706 4253 n = add_data(pRExC_state, 1, "s");
830247a4 4254 RExC_rx->data->data[n] = (void*)rv;
ffc61ed2 4255 ARG_SET(ret, n);
a0ed51b3
LW
4256 }
4257
4258 return ret;
4259}
4260
76e3520e 4261STATIC char*
830247a4 4262S_nextchar(pTHX_ RExC_state_t *pRExC_state)
a0d0e21e 4263{
830247a4 4264 char* retval = RExC_parse++;
a0d0e21e 4265
4633a7c4 4266 for (;;) {
830247a4
IZ
4267 if (*RExC_parse == '(' && RExC_parse[1] == '?' &&
4268 RExC_parse[2] == '#') {
4269 while (*RExC_parse && *RExC_parse != ')')
4270 RExC_parse++;
4271 RExC_parse++;
4633a7c4
LW
4272 continue;
4273 }
e2509266 4274 if (RExC_flags & PMf_EXTENDED) {
830247a4
IZ
4275 if (isSPACE(*RExC_parse)) {
4276 RExC_parse++;
748a9306
LW
4277 continue;
4278 }
830247a4
IZ
4279 else if (*RExC_parse == '#') {
4280 while (*RExC_parse && *RExC_parse != '\n')
4281 RExC_parse++;
4282 RExC_parse++;
748a9306
LW
4283 continue;
4284 }
748a9306 4285 }
4633a7c4 4286 return retval;
a0d0e21e 4287 }
a687059c
LW
4288}
4289
4290/*
c277df42 4291- reg_node - emit a node
a0d0e21e 4292*/
76e3520e 4293STATIC regnode * /* Location. */
830247a4 4294S_reg_node(pTHX_ RExC_state_t *pRExC_state, U8 op)
a687059c 4295{
c277df42
IZ
4296 register regnode *ret;
4297 register regnode *ptr;
a687059c 4298
830247a4 4299 ret = RExC_emit;
c277df42 4300 if (SIZE_ONLY) {
830247a4
IZ
4301 SIZE_ALIGN(RExC_size);
4302 RExC_size += 1;
a0d0e21e
LW
4303 return(ret);
4304 }
a687059c 4305
c277df42 4306 NODE_ALIGN_FILL(ret);
a0d0e21e 4307 ptr = ret;
c277df42 4308 FILL_ADVANCE_NODE(ptr, op);
fac92740 4309 if (RExC_offsets) { /* MJD */
ccb2c380 4310 MJD_OFFSET_DEBUG(("%s:%u: (op %s) %s %u <- %u (len %u) (max %u).\n",
fac92740
MJD
4311 "reg_node", __LINE__,
4312 reg_name[op],
4313 RExC_emit - RExC_emit_start > RExC_offsets[0]
4314 ? "Overwriting end of array!\n" : "OK",
4315 RExC_emit - RExC_emit_start,
4316 RExC_parse - RExC_start,
4317 RExC_offsets[0]));
ccb2c380 4318 Set_Node_Offset(RExC_emit, RExC_parse + (op == END));
fac92740
MJD
4319 }
4320
830247a4 4321 RExC_emit = ptr;
a687059c 4322
a0d0e21e 4323 return(ret);
a687059c
LW
4324}
4325
4326/*
a0d0e21e
LW
4327- reganode - emit a node with an argument
4328*/
76e3520e 4329STATIC regnode * /* Location. */
830247a4 4330S_reganode(pTHX_ RExC_state_t *pRExC_state, U8 op, U32 arg)
fe14fcc3 4331{
c277df42
IZ
4332 register regnode *ret;
4333 register regnode *ptr;
fe14fcc3 4334
830247a4 4335 ret = RExC_emit;
c277df42 4336 if (SIZE_ONLY) {
830247a4
IZ
4337 SIZE_ALIGN(RExC_size);
4338 RExC_size += 2;
a0d0e21e
LW
4339 return(ret);
4340 }
fe14fcc3 4341
c277df42 4342 NODE_ALIGN_FILL(ret);
a0d0e21e 4343 ptr = ret;
c277df42 4344 FILL_ADVANCE_NODE_ARG(ptr, op, arg);
fac92740 4345 if (RExC_offsets) { /* MJD */
ccb2c380 4346 MJD_OFFSET_DEBUG(("%s(%d): (op %s) %s %u <- %u (max %u).\n",
fac92740 4347 "reganode",
ccb2c380
MP
4348 __LINE__,
4349 reg_name[op],
fac92740
MJD
4350 RExC_emit - RExC_emit_start > RExC_offsets[0] ?
4351 "Overwriting end of array!\n" : "OK",
4352 RExC_emit - RExC_emit_start,
4353 RExC_parse - RExC_start,
4354 RExC_offsets[0]));
ccb2c380 4355 Set_Cur_Node_Offset;
fac92740
MJD
4356 }
4357
830247a4 4358 RExC_emit = ptr;
fe14fcc3 4359
a0d0e21e 4360 return(ret);
fe14fcc3
LW
4361}
4362
4363/*
cd439c50 4364- reguni - emit (if appropriate) a Unicode character
a0ed51b3
LW
4365*/
4366STATIC void
830247a4 4367S_reguni(pTHX_ RExC_state_t *pRExC_state, UV uv, char* s, STRLEN* lenp)
a0ed51b3 4368{
5e12f4fb 4369 *lenp = SIZE_ONLY ? UNISKIP(uv) : (uvchr_to_utf8((U8*)s, uv) - (U8*)s);
a0ed51b3
LW
4370}
4371
4372/*
a0d0e21e
LW
4373- reginsert - insert an operator in front of already-emitted operand
4374*
4375* Means relocating the operand.
4376*/
76e3520e 4377STATIC void
830247a4 4378S_reginsert(pTHX_ RExC_state_t *pRExC_state, U8 op, regnode *opnd)
a687059c 4379{
c277df42
IZ
4380 register regnode *src;
4381 register regnode *dst;
4382 register regnode *place;
4383 register int offset = regarglen[(U8)op];
b81d288d 4384
22c35a8c 4385/* (PL_regkind[(U8)op] == CURLY ? EXTRA_STEP_2ARGS : 0); */
c277df42
IZ
4386
4387 if (SIZE_ONLY) {
830247a4 4388 RExC_size += NODE_STEP_REGNODE + offset;
a0d0e21e
LW
4389 return;
4390 }
a687059c 4391
830247a4
IZ
4392 src = RExC_emit;
4393 RExC_emit += NODE_STEP_REGNODE + offset;
4394 dst = RExC_emit;
fac92740 4395 while (src > opnd) {
c277df42 4396 StructCopy(--src, --dst, regnode);
fac92740 4397 if (RExC_offsets) { /* MJD 20010112 */
ccb2c380 4398 MJD_OFFSET_DEBUG(("%s(%d): (op %s) %s copy %u -> %u (max %u).\n",
fac92740 4399 "reg_insert",
ccb2c380
MP
4400 __LINE__,
4401 reg_name[op],
fac92740
MJD
4402 dst - RExC_emit_start > RExC_offsets[0]
4403 ? "Overwriting end of array!\n" : "OK",
4404 src - RExC_emit_start,
4405 dst - RExC_emit_start,
4406 RExC_offsets[0]));
ccb2c380
MP
4407 Set_Node_Offset_To_R(dst-RExC_emit_start, Node_Offset(src));
4408 Set_Node_Length_To_R(dst-RExC_emit_start, Node_Length(src));
fac92740
MJD
4409 }
4410 }
4411
a0d0e21e
LW
4412
4413 place = opnd; /* Op node, where operand used to be. */
fac92740 4414 if (RExC_offsets) { /* MJD */
ccb2c380 4415 MJD_OFFSET_DEBUG(("%s(%d): (op %s) %s %u <- %u (max %u).\n",
fac92740 4416 "reginsert",
ccb2c380
MP
4417 __LINE__,
4418 reg_name[op],
fac92740
MJD
4419 place - RExC_emit_start > RExC_offsets[0]
4420 ? "Overwriting end of array!\n" : "OK",
4421 place - RExC_emit_start,
4422 RExC_parse - RExC_start,
4423 RExC_offsets[0]));
ccb2c380 4424 Set_Node_Offset(place, RExC_parse);
fac92740 4425 }
c277df42
IZ
4426 src = NEXTOPER(place);
4427 FILL_ADVANCE_NODE(place, op);
4428 Zero(src, offset, regnode);
a687059c
LW
4429}
4430
4431/*
c277df42 4432- regtail - set the next-pointer at the end of a node chain of p to val.
a0d0e21e 4433*/
76e3520e 4434STATIC void
830247a4 4435S_regtail(pTHX_ RExC_state_t *pRExC_state, regnode *p, regnode *val)
a687059c 4436{
c277df42
IZ
4437 register regnode *scan;
4438 register regnode *temp;
a0d0e21e 4439
c277df42 4440 if (SIZE_ONLY)
a0d0e21e
LW
4441 return;
4442
4443 /* Find last node. */
4444 scan = p;
4445 for (;;) {
4446 temp = regnext(scan);
4447 if (temp == NULL)
4448 break;
4449 scan = temp;
4450 }
a687059c 4451
c277df42
IZ
4452 if (reg_off_by_arg[OP(scan)]) {
4453 ARG_SET(scan, val - scan);
a0ed51b3
LW
4454 }
4455 else {
c277df42
IZ
4456 NEXT_OFF(scan) = val - scan;
4457 }
a687059c
LW
4458}
4459
4460/*
a0d0e21e
LW
4461- regoptail - regtail on operand of first argument; nop if operandless
4462*/
76e3520e 4463STATIC void
830247a4 4464S_regoptail(pTHX_ RExC_state_t *pRExC_state, regnode *p, regnode *val)
a687059c 4465{
a0d0e21e 4466 /* "Operandless" and "op != BRANCH" are synonymous in practice. */
c277df42
IZ
4467 if (p == NULL || SIZE_ONLY)
4468 return;
22c35a8c 4469 if (PL_regkind[(U8)OP(p)] == BRANCH) {
830247a4 4470 regtail(pRExC_state, NEXTOPER(p), val);
a0ed51b3 4471 }
22c35a8c 4472 else if ( PL_regkind[(U8)OP(p)] == BRANCHJ) {
830247a4 4473 regtail(pRExC_state, NEXTOPER(NEXTOPER(p)), val);
a0ed51b3
LW
4474 }
4475 else
a0d0e21e 4476 return;
a687059c
LW
4477}
4478
4479/*
4480 - regcurly - a little FSA that accepts {\d+,?\d*}
4481 */
79072805 4482STATIC I32
cea2e8a9 4483S_regcurly(pTHX_ register char *s)
a687059c
LW
4484{
4485 if (*s++ != '{')
4486 return FALSE;
f0fcb552 4487 if (!isDIGIT(*s))
a687059c 4488 return FALSE;
f0fcb552 4489 while (isDIGIT(*s))
a687059c
LW
4490 s++;
4491 if (*s == ',')
4492 s++;
f0fcb552 4493 while (isDIGIT(*s))
a687059c
LW
4494 s++;
4495 if (*s != '}')
4496 return FALSE;
4497 return TRUE;
4498}
4499
a687059c 4500
8fa7f367
JH
4501#ifdef DEBUGGING
4502
76e3520e 4503STATIC regnode *
cea2e8a9 4504S_dumpuntil(pTHX_ regnode *start, regnode *node, regnode *last, SV* sv, I32 l)
c277df42 4505{
f248d071 4506 register U8 op = EXACT; /* Arbitrary non-END op. */
155aba94 4507 register regnode *next;
c277df42
IZ
4508
4509 while (op != END && (!last || node < last)) {
4510 /* While that wasn't END last time... */
4511
4512 NODE_ALIGN(node);
4513 op = OP(node);
4514 if (op == CLOSE)
4515 l--;
4516 next = regnext(node);
4517 /* Where, what. */
4518 if (OP(node) == OPTIMIZED)
4519 goto after_print;
4520 regprop(sv, node);
b900a521 4521 PerlIO_printf(Perl_debug_log, "%4"IVdf":%*s%s", (IV)(node - start),
f1dbda3d 4522 (int)(2*l + 1), "", SvPVX(sv));
c277df42
IZ
4523 if (next == NULL) /* Next ptr. */
4524 PerlIO_printf(Perl_debug_log, "(0)");
b81d288d 4525 else
b900a521 4526 PerlIO_printf(Perl_debug_log, "(%"IVdf")", (IV)(next - start));
c277df42
IZ
4527 (void)PerlIO_putc(Perl_debug_log, '\n');
4528 after_print:
22c35a8c 4529 if (PL_regkind[(U8)op] == BRANCHJ) {
b81d288d
AB
4530 register regnode *nnode = (OP(next) == LONGJMP
4531 ? regnext(next)
c277df42
IZ
4532 : next);
4533 if (last && nnode > last)
4534 nnode = last;
4535 node = dumpuntil(start, NEXTOPER(NEXTOPER(node)), nnode, sv, l + 1);
a0ed51b3 4536 }
22c35a8c 4537 else if (PL_regkind[(U8)op] == BRANCH) {
c277df42 4538 node = dumpuntil(start, NEXTOPER(node), next, sv, l + 1);
a0ed51b3
LW
4539 }
4540 else if ( op == CURLY) { /* `next' might be very big: optimizer */
c277df42
IZ
4541 node = dumpuntil(start, NEXTOPER(node) + EXTRA_STEP_2ARGS,
4542 NEXTOPER(node) + EXTRA_STEP_2ARGS + 1, sv, l + 1);
a0ed51b3 4543 }
22c35a8c 4544 else if (PL_regkind[(U8)op] == CURLY && op != CURLYX) {
c277df42
IZ
4545 node = dumpuntil(start, NEXTOPER(node) + EXTRA_STEP_2ARGS,
4546 next, sv, l + 1);
a0ed51b3
LW
4547 }
4548 else if ( op == PLUS || op == STAR) {
c277df42 4549 node = dumpuntil(start, NEXTOPER(node), NEXTOPER(node) + 1, sv, l + 1);
a0ed51b3
LW
4550 }
4551 else if (op == ANYOF) {
4f66b38d
HS
4552 /* arglen 1 + class block */
4553 node += 1 + ((ANYOF_FLAGS(node) & ANYOF_LARGE)
4554 ? ANYOF_CLASS_SKIP : ANYOF_SKIP);
4555 node = NEXTOPER(node);
a0ed51b3 4556 }
22c35a8c 4557 else if (PL_regkind[(U8)op] == EXACT) {
c277df42 4558 /* Literal string, where present. */
cd439c50 4559 node += NODE_SZ_STR(node) - 1;
c277df42 4560 node = NEXTOPER(node);
a0ed51b3
LW
4561 }
4562 else {
c277df42
IZ
4563 node = NEXTOPER(node);
4564 node += regarglen[(U8)op];
4565 }
4566 if (op == CURLYX || op == OPEN)
4567 l++;
4568 else if (op == WHILEM)
4569 l--;
4570 }
4571 return node;
4572}
4573
8fa7f367
JH
4574#endif /* DEBUGGING */
4575
a687059c 4576/*
fd181c75 4577 - regdump - dump a regexp onto Perl_debug_log in vaguely comprehensible form
a687059c
LW
4578 */
4579void
864dbfa3 4580Perl_regdump(pTHX_ regexp *r)
a687059c 4581{
35ff7856 4582#ifdef DEBUGGING
46fc3d4c 4583 SV *sv = sv_newmortal();
a687059c 4584
c277df42 4585 (void)dumpuntil(r->program, r->program + 1, NULL, sv, 0);
a0d0e21e
LW
4586
4587 /* Header fields of interest. */
c277df42 4588 if (r->anchored_substr)
7b0972df 4589 PerlIO_printf(Perl_debug_log,
b81d288d 4590 "anchored `%s%.*s%s'%s at %"IVdf" ",
3280af22 4591 PL_colors[0],
7b0972df 4592 (int)(SvCUR(r->anchored_substr) - (SvTAIL(r->anchored_substr)!=0)),
b81d288d 4593 SvPVX(r->anchored_substr),
3280af22 4594 PL_colors[1],
c277df42 4595 SvTAIL(r->anchored_substr) ? "$" : "",
7b0972df 4596 (IV)r->anchored_offset);
33b8afdf
JH
4597 else if (r->anchored_utf8)
4598 PerlIO_printf(Perl_debug_log,
4599 "anchored utf8 `%s%.*s%s'%s at %"IVdf" ",
4600 PL_colors[0],
4601 (int)(SvCUR(r->anchored_utf8) - (SvTAIL(r->anchored_utf8)!=0)),
4602 SvPVX(r->anchored_utf8),
4603 PL_colors[1],
4604 SvTAIL(r->anchored_utf8) ? "$" : "",
4605 (IV)r->anchored_offset);
c277df42 4606 if (r->float_substr)
7b0972df 4607 PerlIO_printf(Perl_debug_log,
b81d288d 4608 "floating `%s%.*s%s'%s at %"IVdf"..%"UVuf" ",
3280af22 4609 PL_colors[0],
b81d288d 4610 (int)(SvCUR(r->float_substr) - (SvTAIL(r->float_substr)!=0)),
2c2d71f5 4611 SvPVX(r->float_substr),
3280af22 4612 PL_colors[1],
c277df42 4613 SvTAIL(r->float_substr) ? "$" : "",
7b0972df 4614 (IV)r->float_min_offset, (UV)r->float_max_offset);
33b8afdf
JH
4615 else if (r->float_utf8)
4616 PerlIO_printf(Perl_debug_log,
4617 "floating utf8 `%s%.*s%s'%s at %"IVdf"..%"UVuf" ",
4618 PL_colors[0],
4619 (int)(SvCUR(r->float_utf8) - (SvTAIL(r->float_utf8)!=0)),
4620 SvPVX(r->float_utf8),
4621 PL_colors[1],
4622 SvTAIL(r->float_utf8) ? "$" : "",
4623 (IV)r->float_min_offset, (UV)r->float_max_offset);
4624 if (r->check_substr || r->check_utf8)
b81d288d
AB
4625 PerlIO_printf(Perl_debug_log,
4626 r->check_substr == r->float_substr
33b8afdf 4627 && r->check_utf8 == r->float_utf8
c277df42
IZ
4628 ? "(checking floating" : "(checking anchored");
4629 if (r->reganch & ROPT_NOSCAN)
4630 PerlIO_printf(Perl_debug_log, " noscan");
4631 if (r->reganch & ROPT_CHECK_ALL)
4632 PerlIO_printf(Perl_debug_log, " isall");
33b8afdf 4633 if (r->check_substr || r->check_utf8)
c277df42
IZ
4634 PerlIO_printf(Perl_debug_log, ") ");
4635
46fc3d4c 4636 if (r->regstclass) {
4637 regprop(sv, r->regstclass);
4638 PerlIO_printf(Perl_debug_log, "stclass `%s' ", SvPVX(sv));
4639 }
774d564b 4640 if (r->reganch & ROPT_ANCH) {
4641 PerlIO_printf(Perl_debug_log, "anchored");
4642 if (r->reganch & ROPT_ANCH_BOL)
4643 PerlIO_printf(Perl_debug_log, "(BOL)");
c277df42
IZ
4644 if (r->reganch & ROPT_ANCH_MBOL)
4645 PerlIO_printf(Perl_debug_log, "(MBOL)");
cad2e5aa
JH
4646 if (r->reganch & ROPT_ANCH_SBOL)
4647 PerlIO_printf(Perl_debug_log, "(SBOL)");
774d564b 4648 if (r->reganch & ROPT_ANCH_GPOS)
4649 PerlIO_printf(Perl_debug_log, "(GPOS)");
4650 PerlIO_putc(Perl_debug_log, ' ');
4651 }
c277df42
IZ
4652 if (r->reganch & ROPT_GPOS_SEEN)
4653 PerlIO_printf(Perl_debug_log, "GPOS ");
a0d0e21e 4654 if (r->reganch & ROPT_SKIP)
760ac839 4655 PerlIO_printf(Perl_debug_log, "plus ");
a0d0e21e 4656 if (r->reganch & ROPT_IMPLICIT)
760ac839 4657 PerlIO_printf(Perl_debug_log, "implicit ");
760ac839 4658 PerlIO_printf(Perl_debug_log, "minlen %ld ", (long) r->minlen);
ce862d02
IZ
4659 if (r->reganch & ROPT_EVAL_SEEN)
4660 PerlIO_printf(Perl_debug_log, "with eval ");
760ac839 4661 PerlIO_printf(Perl_debug_log, "\n");
fac92740
MJD
4662 if (r->offsets) {
4663 U32 i;
4664 U32 len = r->offsets[0];
392fbf5d 4665 PerlIO_printf(Perl_debug_log, "Offsets: [%"UVuf"]\n\t", (UV)r->offsets[0]);
fac92740 4666 for (i = 1; i <= len; i++)
392fbf5d
RB
4667 PerlIO_printf(Perl_debug_log, "%"UVuf"[%"UVuf"] ",
4668 (UV)r->offsets[i*2-1],
4669 (UV)r->offsets[i*2]);
fac92740
MJD
4670 PerlIO_printf(Perl_debug_log, "\n");
4671 }
17c3b450 4672#endif /* DEBUGGING */
a687059c
LW
4673}
4674
8fa7f367
JH
4675#ifdef DEBUGGING
4676
653099ff
GS
4677STATIC void
4678S_put_byte(pTHX_ SV *sv, int c)
4679{
7be5a6cf 4680 if (isCNTRL(c) || c == 255 || !isPRINT(c))
653099ff
GS
4681 Perl_sv_catpvf(aTHX_ sv, "\\%o", c);
4682 else if (c == '-' || c == ']' || c == '\\' || c == '^')
4683 Perl_sv_catpvf(aTHX_ sv, "\\%c", c);
4684 else
4685 Perl_sv_catpvf(aTHX_ sv, "%c", c);
4686}
4687
8fa7f367
JH
4688#endif /* DEBUGGING */
4689
a687059c 4690/*
a0d0e21e
LW
4691- regprop - printable representation of opcode
4692*/
46fc3d4c 4693void
864dbfa3 4694Perl_regprop(pTHX_ SV *sv, regnode *o)
a687059c 4695{
35ff7856 4696#ifdef DEBUGGING
9b155405 4697 register int k;
a0d0e21e 4698
54dc92de 4699 sv_setpvn(sv, "", 0);
9b155405 4700 if (OP(o) >= reg_num) /* regnode.type is unsigned */
830247a4
IZ
4701 /* It would be nice to FAIL() here, but this may be called from
4702 regexec.c, and it would be hard to supply pRExC_state. */
4703 Perl_croak(aTHX_ "Corrupted regexp opcode");
9b155405
IZ
4704 sv_catpv(sv, (char*)reg_name[OP(o)]); /* Take off const! */
4705
4706 k = PL_regkind[(U8)OP(o)];
4707
2a782b5b
JH
4708 if (k == EXACT) {
4709 SV *dsv = sv_2mortal(newSVpvn("", 0));
c728cb41
JH
4710 /* Using is_utf8_string() is a crude hack but it may
4711 * be the best for now since we have no flag "this EXACTish
4712 * node was UTF-8" --jhi */
4713 bool do_utf8 = is_utf8_string((U8*)STRING(o), STR_LEN(o));
8a989385 4714 char *s = do_utf8 ?
c728cb41
JH
4715 pv_uni_display(dsv, (U8*)STRING(o), STR_LEN(o), 60,
4716 UNI_DISPLAY_REGEX) :
2a782b5b 4717 STRING(o);
40eddc46 4718 int len = do_utf8 ?
2a782b5b
JH
4719 strlen(s) :
4720 STR_LEN(o);
4721 Perl_sv_catpvf(aTHX_ sv, " <%s%.*s%s>",
4722 PL_colors[0],
4723 len, s,
4724 PL_colors[1]);
4725 }
9b155405 4726 else if (k == CURLY) {
cb434fcc 4727 if (OP(o) == CURLYM || OP(o) == CURLYN || OP(o) == CURLYX)
cea2e8a9
GS
4728 Perl_sv_catpvf(aTHX_ sv, "[%d]", o->flags); /* Parenth number */
4729 Perl_sv_catpvf(aTHX_ sv, " {%d,%d}", ARG1(o), ARG2(o));
a0d0e21e 4730 }
2c2d71f5
JH
4731 else if (k == WHILEM && o->flags) /* Ordinal/of */
4732 Perl_sv_catpvf(aTHX_ sv, "[%d/%d]", o->flags & 0xf, o->flags>>4);
9b155405 4733 else if (k == REF || k == OPEN || k == CLOSE || k == GROUPP )
894356b3 4734 Perl_sv_catpvf(aTHX_ sv, "%d", (int)ARG(o)); /* Parenth number */
9b155405 4735 else if (k == LOGICAL)
04ebc1ab 4736 Perl_sv_catpvf(aTHX_ sv, "[%d]", o->flags); /* 2: embedded, otherwise 1 */
653099ff
GS
4737 else if (k == ANYOF) {
4738 int i, rangestart = -1;
ffc61ed2 4739 U8 flags = ANYOF_FLAGS(o);
a6d05634 4740 const char * const anyofs[] = { /* Should be synchronized with
19860706 4741 * ANYOF_ #xdefines in regcomp.h */
653099ff
GS
4742 "\\w",
4743 "\\W",
4744 "\\s",
4745 "\\S",
4746 "\\d",
4747 "\\D",
4748 "[:alnum:]",
4749 "[:^alnum:]",
4750 "[:alpha:]",
4751 "[:^alpha:]",
4752 "[:ascii:]",
4753 "[:^ascii:]",
4754 "[:ctrl:]",
4755 "[:^ctrl:]",
4756 "[:graph:]",
4757 "[:^graph:]",
4758 "[:lower:]",
4759 "[:^lower:]",
4760 "[:print:]",
4761 "[:^print:]",
4762 "[:punct:]",
4763 "[:^punct:]",
4764 "[:upper:]",
aaa51d5e 4765 "[:^upper:]",
653099ff 4766 "[:xdigit:]",
aaa51d5e
JF
4767 "[:^xdigit:]",
4768 "[:space:]",
4769 "[:^space:]",
4770 "[:blank:]",
4771 "[:^blank:]"
653099ff
GS
4772 };
4773
19860706 4774 if (flags & ANYOF_LOCALE)
653099ff 4775 sv_catpv(sv, "{loc}");
19860706 4776 if (flags & ANYOF_FOLD)
653099ff
GS
4777 sv_catpv(sv, "{i}");
4778 Perl_sv_catpvf(aTHX_ sv, "[%s", PL_colors[0]);
19860706 4779 if (flags & ANYOF_INVERT)
653099ff 4780 sv_catpv(sv, "^");
ffc61ed2
JH
4781 for (i = 0; i <= 256; i++) {
4782 if (i < 256 && ANYOF_BITMAP_TEST(o,i)) {
4783 if (rangestart == -1)
4784 rangestart = i;
4785 } else if (rangestart != -1) {
4786 if (i <= rangestart + 3)
4787 for (; rangestart < i; rangestart++)
653099ff 4788 put_byte(sv, rangestart);
ffc61ed2
JH
4789 else {
4790 put_byte(sv, rangestart);
4791 sv_catpv(sv, "-");
4792 put_byte(sv, i - 1);
653099ff 4793 }
ffc61ed2 4794 rangestart = -1;
653099ff 4795 }
847a199f 4796 }
ffc61ed2
JH
4797
4798 if (o->flags & ANYOF_CLASS)
4799 for (i = 0; i < sizeof(anyofs)/sizeof(char*); i++)
4800 if (ANYOF_CLASS_TEST(o,i))
4801 sv_catpv(sv, anyofs[i]);
4802
4803 if (flags & ANYOF_UNICODE)
4804 sv_catpv(sv, "{unicode}");
1aa99e6b 4805 else if (flags & ANYOF_UNICODE_ALL)
2a782b5b 4806 sv_catpv(sv, "{unicode_all}");
ffc61ed2
JH
4807
4808 {
4809 SV *lv;
9e55ce06 4810 SV *sw = regclass_swash(o, FALSE, &lv, 0);
b81d288d 4811
ffc61ed2
JH
4812 if (lv) {
4813 if (sw) {
ffc61ed2 4814 U8 s[UTF8_MAXLEN+1];
b81d288d 4815
ffc61ed2 4816 for (i = 0; i <= 256; i++) { /* just the first 256 */
2b9d42f0 4817 U8 *e = uvchr_to_utf8(s, i);
ffc61ed2 4818
3568d838 4819 if (i < 256 && swash_fetch(sw, s, TRUE)) {
ffc61ed2
JH
4820 if (rangestart == -1)
4821 rangestart = i;
4822 } else if (rangestart != -1) {
4823 U8 *p;
b81d288d 4824
ffc61ed2
JH
4825 if (i <= rangestart + 3)
4826 for (; rangestart < i; rangestart++) {
2b9d42f0 4827 for(e = uvchr_to_utf8(s, rangestart), p = s; p < e; p++)
ffc61ed2
JH
4828 put_byte(sv, *p);
4829 }
4830 else {
2b9d42f0 4831 for (e = uvchr_to_utf8(s, rangestart), p = s; p < e; p++)
ffc61ed2
JH
4832 put_byte(sv, *p);
4833 sv_catpv(sv, "-");
2b9d42f0 4834 for (e = uvchr_to_utf8(s, i - 1), p = s; p < e; p++)
ffc61ed2
JH
4835 put_byte(sv, *p);
4836 }
4837 rangestart = -1;
4838 }
19860706 4839 }
ffc61ed2
JH
4840
4841 sv_catpv(sv, "..."); /* et cetera */
19860706 4842 }
fde631ed 4843
ffc61ed2
JH
4844 {
4845 char *s = savepv(SvPVX(lv));
4846 char *origs = s;
b81d288d 4847
ffc61ed2 4848 while(*s && *s != '\n') s++;
b81d288d 4849
ffc61ed2
JH
4850 if (*s == '\n') {
4851 char *t = ++s;
4852
4853 while (*s) {
4854 if (*s == '\n')
4855 *s = ' ';
4856 s++;
4857 }
4858 if (s[-1] == ' ')
4859 s[-1] = 0;
4860
4861 sv_catpv(sv, t);
fde631ed 4862 }
b81d288d 4863
ffc61ed2 4864 Safefree(origs);
fde631ed
JH
4865 }
4866 }
653099ff 4867 }
ffc61ed2 4868
653099ff
GS
4869 Perl_sv_catpvf(aTHX_ sv, "%s]", PL_colors[1]);
4870 }
9b155405 4871 else if (k == BRANCHJ && (OP(o) == UNLESSM || OP(o) == IFMATCH))
cea2e8a9 4872 Perl_sv_catpvf(aTHX_ sv, "[-%d]", o->flags);
17c3b450 4873#endif /* DEBUGGING */
35ff7856 4874}
a687059c 4875
cad2e5aa
JH
4876SV *
4877Perl_re_intuit_string(pTHX_ regexp *prog)
4878{ /* Assume that RE_INTUIT is set */
4879 DEBUG_r(
4880 { STRLEN n_a;
33b8afdf
JH
4881 char *s = SvPV(prog->check_substr
4882 ? prog->check_substr : prog->check_utf8, n_a);
cad2e5aa
JH
4883
4884 if (!PL_colorset) reginitcolors();
4885 PerlIO_printf(Perl_debug_log,
33b8afdf
JH
4886 "%sUsing REx %ssubstr:%s `%s%.60s%s%s'\n",
4887 PL_colors[4],
4888 prog->check_substr ? "" : "utf8 ",
4889 PL_colors[5],PL_colors[0],
cad2e5aa
JH
4890 s,
4891 PL_colors[1],
4892 (strlen(s) > 60 ? "..." : ""));
4893 } );
4894
33b8afdf 4895 return prog->check_substr ? prog->check_substr : prog->check_utf8;
cad2e5aa
JH
4896}
4897
2b69d0c2 4898void
864dbfa3 4899Perl_pregfree(pTHX_ struct regexp *r)
a687059c 4900{
9e55ce06
JH
4901#ifdef DEBUGGING
4902 SV *dsv = PERL_DEBUG_PAD_ZERO(0);
4903#endif
7821416a
IZ
4904
4905 if (!r || (--r->refcnt > 0))
4906 return;
9e55ce06 4907 DEBUG_r({
d103360b
HS
4908 int len;
4909 char *s;
4910
4911 s = (r->reganch & ROPT_UTF8) ? pv_uni_display(dsv, (U8*)r->precomp,
4912 r->prelen, 60, UNI_DISPLAY_REGEX)
9f369894 4913 : pv_display(dsv, r->precomp, r->prelen, 0, 60);
d103360b 4914 len = SvCUR(dsv);
9e55ce06
JH
4915 if (!PL_colorset)
4916 reginitcolors();
4917 PerlIO_printf(Perl_debug_log,
4918 "%sFreeing REx:%s `%s%*.*s%s%s'\n",
4919 PL_colors[4],PL_colors[5],PL_colors[0],
4920 len, len, s,
4921 PL_colors[1],
4922 len > 60 ? "..." : "");
4923 });
cad2e5aa 4924
c277df42 4925 if (r->precomp)
a0d0e21e 4926 Safefree(r->precomp);
fac92740
MJD
4927 if (r->offsets) /* 20010421 MJD */
4928 Safefree(r->offsets);
ed252734
NC
4929 RX_MATCH_COPY_FREE(r);
4930#ifdef PERL_COPY_ON_WRITE
4931 if (r->saved_copy)
4932 SvREFCNT_dec(r->saved_copy);
4933#endif
a193d654
GS
4934 if (r->substrs) {
4935 if (r->anchored_substr)
4936 SvREFCNT_dec(r->anchored_substr);
33b8afdf
JH
4937 if (r->anchored_utf8)
4938 SvREFCNT_dec(r->anchored_utf8);
a193d654
GS
4939 if (r->float_substr)
4940 SvREFCNT_dec(r->float_substr);
33b8afdf
JH
4941 if (r->float_utf8)
4942 SvREFCNT_dec(r->float_utf8);
2779dcf1 4943 Safefree(r->substrs);
a193d654 4944 }
c277df42
IZ
4945 if (r->data) {
4946 int n = r->data->count;
f3548bdc
DM
4947 PAD* new_comppad = NULL;
4948 PAD* old_comppad;
dfad63ad 4949
c277df42 4950 while (--n >= 0) {
261faec3 4951 /* If you add a ->what type here, update the comment in regcomp.h */
c277df42
IZ
4952 switch (r->data->what[n]) {
4953 case 's':
4954 SvREFCNT_dec((SV*)r->data->data[n]);
4955 break;
653099ff
GS
4956 case 'f':
4957 Safefree(r->data->data[n]);
4958 break;
dfad63ad
HS
4959 case 'p':
4960 new_comppad = (AV*)r->data->data[n];
4961 break;
c277df42 4962 case 'o':
dfad63ad 4963 if (new_comppad == NULL)
cea2e8a9 4964 Perl_croak(aTHX_ "panic: pregfree comppad");
f3548bdc
DM
4965 PAD_SAVE_LOCAL(old_comppad,
4966 /* Watch out for global destruction's random ordering. */
4967 (SvTYPE(new_comppad) == SVt_PVAV) ?
4968 new_comppad : Null(PAD *)
4969 );
9b978d73
DM
4970 if (!OpREFCNT_dec((OP_4tree*)r->data->data[n])) {
4971 op_free((OP_4tree*)r->data->data[n]);
4972 }
4973
f3548bdc 4974 PAD_RESTORE_LOCAL(old_comppad);
dfad63ad
HS
4975 SvREFCNT_dec((SV*)new_comppad);
4976 new_comppad = NULL;
c277df42
IZ
4977 break;
4978 case 'n':
9e55ce06 4979 break;
c277df42 4980 default:
830247a4 4981 Perl_croak(aTHX_ "panic: regfree data code '%c'", r->data->what[n]);
c277df42
IZ
4982 }
4983 }
4984 Safefree(r->data->what);
4985 Safefree(r->data);
a0d0e21e
LW
4986 }
4987 Safefree(r->startp);
4988 Safefree(r->endp);
4989 Safefree(r);
a687059c 4990}
c277df42
IZ
4991
4992/*
4993 - regnext - dig the "next" pointer out of a node
4994 *
4995 * [Note, when REGALIGN is defined there are two places in regmatch()
4996 * that bypass this code for speed.]
4997 */
4998regnode *
864dbfa3 4999Perl_regnext(pTHX_ register regnode *p)
c277df42
IZ
5000{
5001 register I32 offset;
5002
3280af22 5003 if (p == &PL_regdummy)
c277df42
IZ
5004 return(NULL);
5005
5006 offset = (reg_off_by_arg[OP(p)] ? ARG(p) : NEXT_OFF(p));
5007 if (offset == 0)
5008 return(NULL);
5009
c277df42 5010 return(p+offset);
c277df42
IZ
5011}
5012
01f988be 5013STATIC void
cea2e8a9 5014S_re_croak2(pTHX_ const char* pat1,const char* pat2,...)
c277df42
IZ
5015{
5016 va_list args;
5017 STRLEN l1 = strlen(pat1);
5018 STRLEN l2 = strlen(pat2);
5019 char buf[512];
06bf62c7 5020 SV *msv;
c277df42
IZ
5021 char *message;
5022
5023 if (l1 > 510)
5024 l1 = 510;
5025 if (l1 + l2 > 510)
5026 l2 = 510 - l1;
5027 Copy(pat1, buf, l1 , char);
5028 Copy(pat2, buf + l1, l2 , char);
3b818b81
GS
5029 buf[l1 + l2] = '\n';
5030 buf[l1 + l2 + 1] = '\0';
8736538c
AS
5031#ifdef I_STDARG
5032 /* ANSI variant takes additional second argument */
c277df42 5033 va_start(args, pat2);
8736538c
AS
5034#else
5035 va_start(args);
5036#endif
5a844595 5037 msv = vmess(buf, &args);
c277df42 5038 va_end(args);
06bf62c7 5039 message = SvPV(msv,l1);
c277df42
IZ
5040 if (l1 > 512)
5041 l1 = 512;
5042 Copy(message, buf, l1 , char);
5043 buf[l1] = '\0'; /* Overwrite \n */
cea2e8a9 5044 Perl_croak(aTHX_ "%s", buf);
c277df42 5045}
a0ed51b3
LW
5046
5047/* XXX Here's a total kludge. But we need to re-enter for swash routines. */
5048
5049void
864dbfa3 5050Perl_save_re_context(pTHX)
b81d288d 5051{
830247a4 5052 SAVEI32(PL_reg_flags); /* from regexec.c */
a0ed51b3 5053 SAVEPPTR(PL_bostr);
a0ed51b3
LW
5054 SAVEPPTR(PL_reginput); /* String-input pointer. */
5055 SAVEPPTR(PL_regbol); /* Beginning of input, for ^ check. */
5056 SAVEPPTR(PL_regeol); /* End of input, for $ check. */
7766f137
GS
5057 SAVEVPTR(PL_regstartp); /* Pointer to startp array. */
5058 SAVEVPTR(PL_regendp); /* Ditto for endp. */
5059 SAVEVPTR(PL_reglastparen); /* Similarly for lastparen. */
a5db57d6 5060 SAVEVPTR(PL_reglastcloseparen); /* Similarly for lastcloseparen. */
a0ed51b3 5061 SAVEPPTR(PL_regtill); /* How far we are required to go. */
b81d288d 5062 SAVEGENERICPV(PL_reg_start_tmp); /* from regexec.c */
a0ed51b3 5063 PL_reg_start_tmp = 0;
a0ed51b3
LW
5064 SAVEI32(PL_reg_start_tmpl); /* from regexec.c */
5065 PL_reg_start_tmpl = 0;
7766f137 5066 SAVEVPTR(PL_regdata);
a0ed51b3
LW
5067 SAVEI32(PL_reg_eval_set); /* from regexec.c */
5068 SAVEI32(PL_regnarrate); /* from regexec.c */
7766f137 5069 SAVEVPTR(PL_regprogram); /* from regexec.c */
a0ed51b3 5070 SAVEINT(PL_regindent); /* from regexec.c */
7766f137
GS
5071 SAVEVPTR(PL_regcc); /* from regexec.c */
5072 SAVEVPTR(PL_curcop);
7766f137
GS
5073 SAVEVPTR(PL_reg_call_cc); /* from regexec.c */
5074 SAVEVPTR(PL_reg_re); /* from regexec.c */
54b6e2fa
IZ
5075 SAVEPPTR(PL_reg_ganch); /* from regexec.c */
5076 SAVESPTR(PL_reg_sv); /* from regexec.c */
9febdf04 5077 SAVEBOOL(PL_reg_match_utf8); /* from regexec.c */
7766f137 5078 SAVEVPTR(PL_reg_magic); /* from regexec.c */
54b6e2fa 5079 SAVEI32(PL_reg_oldpos); /* from regexec.c */
7766f137
GS
5080 SAVEVPTR(PL_reg_oldcurpm); /* from regexec.c */
5081 SAVEVPTR(PL_reg_curpm); /* from regexec.c */
a5db57d6
GS
5082 SAVEPPTR(PL_reg_oldsaved); /* old saved substr during match */
5083 PL_reg_oldsaved = Nullch;
5084 SAVEI32(PL_reg_oldsavedlen); /* old length of saved substr during match */
5085 PL_reg_oldsavedlen = 0;
ed252734
NC
5086#ifdef PERL_COPY_ON_WRITE
5087 SAVESPTR(PL_nrs);
5088 PL_nrs = Nullsv;
5089#endif
a5db57d6
GS
5090 SAVEI32(PL_reg_maxiter); /* max wait until caching pos */
5091 PL_reg_maxiter = 0;
5092 SAVEI32(PL_reg_leftiter); /* wait until caching pos */
5093 PL_reg_leftiter = 0;
5094 SAVEGENERICPV(PL_reg_poscache); /* cache of pos of WHILEM */
5095 PL_reg_poscache = Nullch;
5096 SAVEI32(PL_reg_poscache_size); /* size of pos cache of WHILEM */
5097 PL_reg_poscache_size = 0;
5098 SAVEPPTR(PL_regprecomp); /* uncompiled string. */
5fb7366e 5099 SAVEI32(PL_regnpar); /* () count. */
e49a9654 5100 SAVEI32(PL_regsize); /* from regexec.c */
ada6e8a9
AMS
5101
5102 {
5103 /* Save $1..$n (#18107: UTF-8 s/(\w+)/uc($1)/e); AMS 20021106. */
a8514157 5104 U32 i;
ada6e8a9
AMS
5105 GV *mgv;
5106 REGEXP *rx;
5107 char digits[16];
5108
5109 if (PL_curpm && (rx = PM_GETRE(PL_curpm))) {
5110 for (i = 1; i <= rx->nparens; i++) {
d994d9a1 5111 sprintf(digits, "%lu", (long)i);
ada6e8a9
AMS
5112 if ((mgv = gv_fetchpv(digits, FALSE, SVt_PV)))
5113 save_scalar(mgv);
5114 }
5115 }
5116 }
5117
54b6e2fa 5118#ifdef DEBUGGING
b81d288d 5119 SAVEPPTR(PL_reg_starttry); /* from regexec.c */
54b6e2fa 5120#endif
a0ed51b3 5121}
51371543 5122
51371543 5123static void
acfe0abc 5124clear_re(pTHX_ void *r)
51371543
GS
5125{
5126 ReREFCNT_dec((regexp *)r);
5127}
ffbc6a93 5128