This is a live mirror of the Perl 5 development currently hosted at https://github.com/perl/perl5
Unused variables.
[perl5.git] / regcomp.c
<
CommitLineData
a0d0e21e
LW
1/* regcomp.c
2 */
3
4/*
5 * "A fair jaw-cracker dwarf-language must be." --Samwise Gamgee
6 */
7
a687059c
LW
8/* NOTE: this is derived from Henry Spencer's regexp code, and should not
9 * confused with the original package (see point 3 below). Thanks, Henry!
10 */
11
12/* Additional note: this code is very heavily munged from Henry's version
13 * in places. In some spots I've traded clarity for efficiency, so don't
14 * blame Henry for some of the lack of readability.
15 */
16
e50aee73
AD
17/* The names of the functions have been changed from regcomp and
18 * regexec to pregcomp and pregexec in order to avoid conflicts
19 * with the POSIX routines of the same names.
20*/
21
b9d5759e
AD
22#ifdef PERL_EXT_RE_BUILD
23/* need to replace pregcomp et al, so enable that */
24# ifndef PERL_IN_XSUB_RE
25# define PERL_IN_XSUB_RE
26# endif
27/* need access to debugger hooks */
cad2e5aa 28# if defined(PERL_EXT_RE_DEBUG) && !defined(DEBUGGING)
b9d5759e
AD
29# define DEBUGGING
30# endif
31#endif
32
33#ifdef PERL_IN_XSUB_RE
d06ea78c 34/* We *really* need to overwrite these symbols: */
56953603
IZ
35# define Perl_pregcomp my_regcomp
36# define Perl_regdump my_regdump
37# define Perl_regprop my_regprop
d06ea78c 38# define Perl_pregfree my_regfree
cad2e5aa
JH
39# define Perl_re_intuit_string my_re_intuit_string
40/* *These* symbols are masked to allow static link. */
d06ea78c 41# define Perl_regnext my_regnext
f0b8d043 42# define Perl_save_re_context my_save_re_context
b81d288d 43# define Perl_reginitcolors my_reginitcolors
c5be433b
GS
44
45# define PERL_NO_GET_CONTEXT
b81d288d 46#endif
56953603 47
f0fcb552 48/*SUPPRESS 112*/
a687059c 49/*
e50aee73 50 * pregcomp and pregexec -- regsub and regerror are not used in perl
a687059c
LW
51 *
52 * Copyright (c) 1986 by University of Toronto.
53 * Written by Henry Spencer. Not derived from licensed software.
54 *
55 * Permission is granted to anyone to use this software for any
56 * purpose on any computer system, and to redistribute it freely,
57 * subject to the following restrictions:
58 *
59 * 1. The author is not responsible for the consequences of use of
60 * this software, no matter how awful, even if they arise
61 * from defects in it.
62 *
63 * 2. The origin of this software must not be misrepresented, either
64 * by explicit claim or by omission.
65 *
66 * 3. Altered versions must be plainly marked as such, and must not
67 * be misrepresented as being the original software.
68 *
69 *
70 **** Alterations to Henry's code are...
71 ****
4bb101f2
JH
72 **** Copyright (C) 1991, 1992, 1993, 1994, 1995, 1996, 1997, 1998, 1999,
73 **** 2000, 2001, 2002, 2003, by Larry Wall and others
a687059c 74 ****
9ef589d8
LW
75 **** You may distribute under the terms of either the GNU General Public
76 **** License or the Artistic License, as specified in the README file.
77
a687059c
LW
78 *
79 * Beware that some of this code is subtly aware of the way operator
80 * precedence is structured in regular expressions. Serious changes in
81 * regular-expression syntax might require a total rethink.
82 */
83#include "EXTERN.h"
864dbfa3 84#define PERL_IN_REGCOMP_C
a687059c 85#include "perl.h"
d06ea78c 86
acfe0abc 87#ifndef PERL_IN_XSUB_RE
d06ea78c
GS
88# include "INTERN.h"
89#endif
c277df42
IZ
90
91#define REG_COMP_C
a687059c
LW
92#include "regcomp.h"
93
d4cce5f1 94#ifdef op
11343788 95#undef op
d4cce5f1 96#endif /* op */
11343788 97
fe14fcc3
LW
98#ifdef MSDOS
99# if defined(BUGGY_MSC6)
100 /* MSC 6.00A breaks on op/regexp.t test 85 unless we turn this off */
101 # pragma optimize("a",off)
102 /* But MSC 6.00A is happy with 'w', for aliases only across function calls*/
103 # pragma optimize("w",on )
104# endif /* BUGGY_MSC6 */
105#endif /* MSDOS */
106
a687059c
LW
107#ifndef STATIC
108#define STATIC static
109#endif
110
830247a4 111typedef struct RExC_state_t {
e2509266 112 U32 flags; /* are we folding, multilining? */
830247a4
IZ
113 char *precomp; /* uncompiled string. */
114 regexp *rx;
fac92740 115 char *start; /* Start of input for compile */
830247a4
IZ
116 char *end; /* End of input for compile */
117 char *parse; /* Input-scan pointer. */
118 I32 whilem_seen; /* number of WHILEM in this expr */
fac92740 119 regnode *emit_start; /* Start of emitted-code area */
ffc61ed2 120 regnode *emit; /* Code-emit pointer; &regdummy = don't = compiling */
830247a4
IZ
121 I32 naughty; /* How bad is this pattern? */
122 I32 sawback; /* Did we see \1, ...? */
123 U32 seen;
124 I32 size; /* Code size. */
125 I32 npar; /* () count. */
126 I32 extralen;
127 I32 seen_zerolen;
128 I32 seen_evals;
1aa99e6b 129 I32 utf8;
830247a4
IZ
130#if ADD_TO_REGEXEC
131 char *starttry; /* -Dr: where regtry was called. */
132#define RExC_starttry (pRExC_state->starttry)
133#endif
134} RExC_state_t;
135
e2509266 136#define RExC_flags (pRExC_state->flags)
830247a4
IZ
137#define RExC_precomp (pRExC_state->precomp)
138#define RExC_rx (pRExC_state->rx)
fac92740 139#define RExC_start (pRExC_state->start)
830247a4
IZ
140#define RExC_end (pRExC_state->end)
141#define RExC_parse (pRExC_state->parse)
142#define RExC_whilem_seen (pRExC_state->whilem_seen)
fac92740 143#define RExC_offsets (pRExC_state->rx->offsets) /* I am not like the others */
830247a4 144#define RExC_emit (pRExC_state->emit)
fac92740 145#define RExC_emit_start (pRExC_state->emit_start)
830247a4
IZ
146#define RExC_naughty (pRExC_state->naughty)
147#define RExC_sawback (pRExC_state->sawback)
148#define RExC_seen (pRExC_state->seen)
149#define RExC_size (pRExC_state->size)
150#define RExC_npar (pRExC_state->npar)
151#define RExC_extralen (pRExC_state->extralen)
152#define RExC_seen_zerolen (pRExC_state->seen_zerolen)
153#define RExC_seen_evals (pRExC_state->seen_evals)
1aa99e6b 154#define RExC_utf8 (pRExC_state->utf8)
830247a4 155
a687059c
LW
156#define ISMULT1(c) ((c) == '*' || (c) == '+' || (c) == '?')
157#define ISMULT2(s) ((*s) == '*' || (*s) == '+' || (*s) == '?' || \
158 ((*s) == '{' && regcurly(s)))
a687059c 159
35c8bce7
LW
160#ifdef SPSTART
161#undef SPSTART /* dratted cpp namespace... */
162#endif
a687059c
LW
163/*
164 * Flags to be passed up and down.
165 */
a687059c 166#define WORST 0 /* Worst case. */
821b33a5 167#define HASWIDTH 0x1 /* Known to match non-null strings. */
a0d0e21e
LW
168#define SIMPLE 0x2 /* Simple enough to be STAR/PLUS operand. */
169#define SPSTART 0x4 /* Starts with * or +. */
170#define TRYAGAIN 0x8 /* Weeded out a declaration. */
a687059c 171
2c2d71f5
JH
172/* Length of a variant. */
173
174typedef struct scan_data_t {
175 I32 len_min;
176 I32 len_delta;
177 I32 pos_min;
178 I32 pos_delta;
179 SV *last_found;
180 I32 last_end; /* min value, <0 unless valid. */
181 I32 last_start_min;
182 I32 last_start_max;
183 SV **longest; /* Either &l_fixed, or &l_float. */
184 SV *longest_fixed;
185 I32 offset_fixed;
186 SV *longest_float;
187 I32 offset_float_min;
188 I32 offset_float_max;
189 I32 flags;
190 I32 whilem_c;
cb434fcc 191 I32 *last_closep;
653099ff 192 struct regnode_charclass_class *start_class;
2c2d71f5
JH
193} scan_data_t;
194
a687059c 195/*
e50aee73 196 * Forward declarations for pregcomp()'s friends.
a687059c 197 */
a0d0e21e 198
b81d288d 199static scan_data_t zero_scan_data = { 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
cb434fcc 200 0, 0, 0, 0, 0, 0};
c277df42
IZ
201
202#define SF_BEFORE_EOL (SF_BEFORE_SEOL|SF_BEFORE_MEOL)
203#define SF_BEFORE_SEOL 0x1
204#define SF_BEFORE_MEOL 0x2
205#define SF_FIX_BEFORE_EOL (SF_FIX_BEFORE_SEOL|SF_FIX_BEFORE_MEOL)
206#define SF_FL_BEFORE_EOL (SF_FL_BEFORE_SEOL|SF_FL_BEFORE_MEOL)
207
09b7f37c
CB
208#ifdef NO_UNARY_PLUS
209# define SF_FIX_SHIFT_EOL (0+2)
210# define SF_FL_SHIFT_EOL (0+4)
211#else
212# define SF_FIX_SHIFT_EOL (+2)
213# define SF_FL_SHIFT_EOL (+4)
214#endif
c277df42
IZ
215
216#define SF_FIX_BEFORE_SEOL (SF_BEFORE_SEOL << SF_FIX_SHIFT_EOL)
217#define SF_FIX_BEFORE_MEOL (SF_BEFORE_MEOL << SF_FIX_SHIFT_EOL)
218
219#define SF_FL_BEFORE_SEOL (SF_BEFORE_SEOL << SF_FL_SHIFT_EOL)
220#define SF_FL_BEFORE_MEOL (SF_BEFORE_MEOL << SF_FL_SHIFT_EOL) /* 0x20 */
221#define SF_IS_INF 0x40
222#define SF_HAS_PAR 0x80
223#define SF_IN_PAR 0x100
224#define SF_HAS_EVAL 0x200
4bfe0158 225#define SCF_DO_SUBSTR 0x400
653099ff
GS
226#define SCF_DO_STCLASS_AND 0x0800
227#define SCF_DO_STCLASS_OR 0x1000
228#define SCF_DO_STCLASS (SCF_DO_STCLASS_AND|SCF_DO_STCLASS_OR)
e1901655 229#define SCF_WHILEM_VISITED_POS 0x2000
c277df42 230
eb160463 231#define UTF (RExC_utf8 != 0)
e2509266
JH
232#define LOC ((RExC_flags & PMf_LOCALE) != 0)
233#define FOLD ((RExC_flags & PMf_FOLD) != 0)
a0ed51b3 234
ffc61ed2 235#define OOB_UNICODE 12345678
93733859 236#define OOB_NAMEDCLASS -1
b8c5462f 237
a0ed51b3
LW
238#define CHR_SVLEN(sv) (UTF ? sv_len_utf8(sv) : SvCUR(sv))
239#define CHR_DIST(a,b) (UTF ? utf8_distance(a,b) : a - b)
240
8615cb43 241
b45f050a
JF
242/* length of regex to show in messages that don't mark a position within */
243#define RegexLengthToShowInErrorMessages 127
244
245/*
246 * If MARKER[12] are adjusted, be sure to adjust the constants at the top
247 * of t/op/regmesg.t, the tests in t/op/re_tests, and those in
248 * op/pragma/warn/regcomp.
249 */
7253e4e3
RK
250#define MARKER1 "<-- HERE" /* marker as it appears in the description */
251#define MARKER2 " <-- HERE " /* marker as it appears within the regex */
b81d288d 252
7253e4e3 253#define REPORT_LOCATION " in regex; marked by " MARKER1 " in m/%.*s" MARKER2 "%s/"
b45f050a
JF
254
255/*
256 * Calls SAVEDESTRUCTOR_X if needed, then calls Perl_croak with the given
257 * arg. Show regex, up to a maximum length. If it's too long, chop and add
258 * "...".
259 */
ccb2c380
MP
260#define FAIL(msg) STMT_START { \
261 char *ellipses = ""; \
262 IV len = RExC_end - RExC_precomp; \
263 \
264 if (!SIZE_ONLY) \
265 SAVEDESTRUCTOR_X(clear_re,(void*)RExC_rx); \
266 if (len > RegexLengthToShowInErrorMessages) { \
267 /* chop 10 shorter than the max, to ensure meaning of "..." */ \
268 len = RegexLengthToShowInErrorMessages - 10; \
269 ellipses = "..."; \
270 } \
271 Perl_croak(aTHX_ "%s in regex m/%.*s%s/", \
272 msg, (int)len, RExC_precomp, ellipses); \
273} STMT_END
8615cb43 274
b45f050a
JF
275/*
276 * Calls SAVEDESTRUCTOR_X if needed, then calls Perl_croak with the given
277 * args. Show regex, up to a maximum length. If it's too long, chop and add
278 * "...".
279 */
ccb2c380
MP
280#define FAIL2(pat,msg) STMT_START { \
281 char *ellipses = ""; \
282 IV len = RExC_end - RExC_precomp; \
283 \
284 if (!SIZE_ONLY) \
285 SAVEDESTRUCTOR_X(clear_re,(void*)RExC_rx); \
286 if (len > RegexLengthToShowInErrorMessages) { \
287 /* chop 10 shorter than the max, to ensure meaning of "..." */ \
288 len = RegexLengthToShowInErrorMessages - 10; \
289 ellipses = "..."; \
290 } \
291 S_re_croak2(aTHX_ pat, " in regex m/%.*s%s/", \
292 msg, (int)len, RExC_precomp, ellipses); \
293} STMT_END
b45f050a
JF
294
295
296/*
297 * Simple_vFAIL -- like FAIL, but marks the current location in the scan
298 */
ccb2c380
MP
299#define Simple_vFAIL(m) STMT_START { \
300 IV offset = RExC_parse - RExC_precomp; \
301 Perl_croak(aTHX_ "%s" REPORT_LOCATION, \
302 m, (int)offset, RExC_precomp, RExC_precomp + offset); \
303} STMT_END
b45f050a
JF
304
305/*
306 * Calls SAVEDESTRUCTOR_X if needed, then Simple_vFAIL()
307 */
ccb2c380
MP
308#define vFAIL(m) STMT_START { \
309 if (!SIZE_ONLY) \
310 SAVEDESTRUCTOR_X(clear_re,(void*)RExC_rx); \
311 Simple_vFAIL(m); \
312} STMT_END
b45f050a
JF
313
314/*
315 * Like Simple_vFAIL(), but accepts two arguments.
316 */
ccb2c380
MP
317#define Simple_vFAIL2(m,a1) STMT_START { \
318 IV offset = RExC_parse - RExC_precomp; \
319 S_re_croak2(aTHX_ m, REPORT_LOCATION, a1, \
320 (int)offset, RExC_precomp, RExC_precomp + offset); \
321} STMT_END
b45f050a
JF
322
323/*
324 * Calls SAVEDESTRUCTOR_X if needed, then Simple_vFAIL2().
325 */
ccb2c380
MP
326#define vFAIL2(m,a1) STMT_START { \
327 if (!SIZE_ONLY) \
328 SAVEDESTRUCTOR_X(clear_re,(void*)RExC_rx); \
329 Simple_vFAIL2(m, a1); \
330} STMT_END
b45f050a
JF
331
332
333/*
334 * Like Simple_vFAIL(), but accepts three arguments.
335 */
ccb2c380
MP
336#define Simple_vFAIL3(m, a1, a2) STMT_START { \
337 IV offset = RExC_parse - RExC_precomp; \
338 S_re_croak2(aTHX_ m, REPORT_LOCATION, a1, a2, \
339 (int)offset, RExC_precomp, RExC_precomp + offset); \
340} STMT_END
b45f050a
JF
341
342/*
343 * Calls SAVEDESTRUCTOR_X if needed, then Simple_vFAIL3().
344 */
ccb2c380
MP
345#define vFAIL3(m,a1,a2) STMT_START { \
346 if (!SIZE_ONLY) \
347 SAVEDESTRUCTOR_X(clear_re,(void*)RExC_rx); \
348 Simple_vFAIL3(m, a1, a2); \
349} STMT_END
b45f050a
JF
350
351/*
352 * Like Simple_vFAIL(), but accepts four arguments.
353 */
ccb2c380
MP
354#define Simple_vFAIL4(m, a1, a2, a3) STMT_START { \
355 IV offset = RExC_parse - RExC_precomp; \
356 S_re_croak2(aTHX_ m, REPORT_LOCATION, a1, a2, a3, \
357 (int)offset, RExC_precomp, RExC_precomp + offset); \
358} STMT_END
b45f050a
JF
359
360/*
361 * Like Simple_vFAIL(), but accepts five arguments.
362 */
ccb2c380
MP
363#define Simple_vFAIL5(m, a1, a2, a3, a4) STMT_START { \
364 IV offset = RExC_parse - RExC_precomp; \
365 S_re_croak2(aTHX_ m, REPORT_LOCATION, a1, a2, a3, a4, \
366 (int)offset, RExC_precomp, RExC_precomp + offset); \
367} STMT_END
368
369
370#define vWARN(loc,m) STMT_START { \
371 IV offset = loc - RExC_precomp; \
372 Perl_warner(aTHX_ packWARN(WARN_REGEXP), "%s" REPORT_LOCATION, \
373 m, (int)offset, RExC_precomp, RExC_precomp + offset); \
374} STMT_END
375
376#define vWARNdep(loc,m) STMT_START { \
377 IV offset = loc - RExC_precomp; \
378 Perl_warner(aTHX_ packWARN2(WARN_DEPRECATED, WARN_REGEXP), \
379 "%s" REPORT_LOCATION, \
380 m, (int)offset, RExC_precomp, RExC_precomp + offset); \
381} STMT_END
382
383
384#define vWARN2(loc, m, a1) STMT_START { \
385 IV offset = loc - RExC_precomp; \
386 Perl_warner(aTHX_ packWARN(WARN_REGEXP), m REPORT_LOCATION, \
387 a1, (int)offset, RExC_precomp, RExC_precomp + offset); \
388} STMT_END
389
390#define vWARN3(loc, m, a1, a2) STMT_START { \
391 IV offset = loc - RExC_precomp; \
392 Perl_warner(aTHX_ packWARN(WARN_REGEXP), m REPORT_LOCATION, \
393 a1, a2, (int)offset, RExC_precomp, RExC_precomp + offset); \
394} STMT_END
395
396#define vWARN4(loc, m, a1, a2, a3) STMT_START { \
397 IV offset = loc - RExC_precomp; \
398 Perl_warner(aTHX_ packWARN(WARN_REGEXP), m REPORT_LOCATION, \
399 a1, a2, a3, (int)offset, RExC_precomp, RExC_precomp + offset); \
400} STMT_END
401
402#define vWARN5(loc, m, a1, a2, a3, a4) STMT_START { \
403 IV offset = loc - RExC_precomp; \
404 Perl_warner(aTHX_ packWARN(WARN_REGEXP), m REPORT_LOCATION, \
405 a1, a2, a3, a4, (int)offset, RExC_precomp, RExC_precomp + offset); \
406} STMT_END
9d1d55b5 407
8615cb43 408
cd439c50 409/* Allow for side effects in s */
ccb2c380
MP
410#define REGC(c,s) STMT_START { \
411 if (!SIZE_ONLY) *(s) = (c); else (void)(s); \
412} STMT_END
cd439c50 413
fac92740
MJD
414/* Macros for recording node offsets. 20001227 mjd@plover.com
415 * Nodes are numbered 1, 2, 3, 4. Node #n's position is recorded in
416 * element 2*n-1 of the array. Element #2n holds the byte length node #n.
417 * Element 0 holds the number n.
418 */
419
420#define MJD_OFFSET_DEBUG(x)
ccb2c380
MP
421/* #define MJD_OFFSET_DEBUG(x) Perl_warn_nocontext x */
422
423
424#define Set_Node_Offset_To_R(node,byte) STMT_START { \
425 if (! SIZE_ONLY) { \
426 MJD_OFFSET_DEBUG(("** (%d) offset of node %d is %d.\n", \
427 __LINE__, (node), (byte))); \
428 if((node) < 0) { \
429 Perl_croak(aTHX_ "value of node is %d in Offset macro", node); \
430 } else { \
431 RExC_offsets[2*(node)-1] = (byte); \
432 } \
433 } \
434} STMT_END
435
436#define Set_Node_Offset(node,byte) \
437 Set_Node_Offset_To_R((node)-RExC_emit_start, (byte)-RExC_start)
438#define Set_Cur_Node_Offset Set_Node_Offset(RExC_emit, RExC_parse)
439
440#define Set_Node_Length_To_R(node,len) STMT_START { \
441 if (! SIZE_ONLY) { \
442 MJD_OFFSET_DEBUG(("** (%d) size of node %d is %d.\n", \
443 __LINE__, (node), (len))); \
444 if((node) < 0) { \
445 Perl_croak(aTHX_ "value of node is %d in Length macro", node); \
446 } else { \
447 RExC_offsets[2*(node)] = (len); \
448 } \
449 } \
450} STMT_END
451
452#define Set_Node_Length(node,len) \
453 Set_Node_Length_To_R((node)-RExC_emit_start, len)
454#define Set_Cur_Node_Length(len) Set_Node_Length(RExC_emit, len)
455#define Set_Node_Cur_Length(node) \
456 Set_Node_Length(node, RExC_parse - parse_start)
fac92740
MJD
457
458/* Get offsets and lengths */
459#define Node_Offset(n) (RExC_offsets[2*((n)-RExC_emit_start)-1])
460#define Node_Length(n) (RExC_offsets[2*((n)-RExC_emit_start)])
461
acfe0abc 462static void clear_re(pTHX_ void *r);
4327152a 463
653099ff
GS
464/* Mark that we cannot extend a found fixed substring at this point.
465 Updata the longest found anchored substring and the longest found
466 floating substrings if needed. */
467
4327152a 468STATIC void
830247a4 469S_scan_commit(pTHX_ RExC_state_t *pRExC_state, scan_data_t *data)
c277df42 470{
a0ed51b3
LW
471 STRLEN l = CHR_SVLEN(data->last_found);
472 STRLEN old_l = CHR_SVLEN(*data->longest);
b81d288d 473
c277df42
IZ
474 if ((l >= old_l) && ((l > old_l) || (data->flags & SF_BEFORE_EOL))) {
475 sv_setsv(*data->longest, data->last_found);
476 if (*data->longest == data->longest_fixed) {
477 data->offset_fixed = l ? data->last_start_min : data->pos_min;
478 if (data->flags & SF_BEFORE_EOL)
b81d288d 479 data->flags
c277df42
IZ
480 |= ((data->flags & SF_BEFORE_EOL) << SF_FIX_SHIFT_EOL);
481 else
482 data->flags &= ~SF_FIX_BEFORE_EOL;
a0ed51b3
LW
483 }
484 else {
c277df42 485 data->offset_float_min = l ? data->last_start_min : data->pos_min;
b81d288d
AB
486 data->offset_float_max = (l
487 ? data->last_start_max
c277df42 488 : data->pos_min + data->pos_delta);
9051bda5
HS
489 if ((U32)data->offset_float_max > (U32)I32_MAX)
490 data->offset_float_max = I32_MAX;
c277df42 491 if (data->flags & SF_BEFORE_EOL)
b81d288d 492 data->flags
c277df42
IZ
493 |= ((data->flags & SF_BEFORE_EOL) << SF_FL_SHIFT_EOL);
494 else
495 data->flags &= ~SF_FL_BEFORE_EOL;
496 }
497 }
498 SvCUR_set(data->last_found, 0);
0eda9292
JH
499 {
500 SV * sv = data->last_found;
501 MAGIC *mg =
502 SvUTF8(sv) && SvMAGICAL(sv) ? mg_find(sv, PERL_MAGIC_utf8) : NULL;
503 if (mg && mg->mg_len > 0)
504 mg->mg_len = 0;
505 }
c277df42
IZ
506 data->last_end = -1;
507 data->flags &= ~SF_BEFORE_EOL;
508}
509
653099ff
GS
510/* Can match anything (initialization) */
511STATIC void
830247a4 512S_cl_anything(pTHX_ RExC_state_t *pRExC_state, struct regnode_charclass_class *cl)
653099ff 513{
653099ff 514 ANYOF_CLASS_ZERO(cl);
f8bef550 515 ANYOF_BITMAP_SETALL(cl);
1aa99e6b 516 cl->flags = ANYOF_EOS|ANYOF_UNICODE_ALL;
653099ff
GS
517 if (LOC)
518 cl->flags |= ANYOF_LOCALE;
519}
520
521/* Can match anything (initialization) */
522STATIC int
523S_cl_is_anything(pTHX_ struct regnode_charclass_class *cl)
524{
525 int value;
526
aaa51d5e 527 for (value = 0; value <= ANYOF_MAX; value += 2)
653099ff
GS
528 if (ANYOF_CLASS_TEST(cl, value) && ANYOF_CLASS_TEST(cl, value + 1))
529 return 1;
1aa99e6b
IH
530 if (!(cl->flags & ANYOF_UNICODE_ALL))
531 return 0;
f8bef550
NC
532 if (!ANYOF_BITMAP_TESTALLSET(cl))
533 return 0;
653099ff
GS
534 return 1;
535}
536
537/* Can match anything (initialization) */
538STATIC void
830247a4 539S_cl_init(pTHX_ RExC_state_t *pRExC_state, struct regnode_charclass_class *cl)
653099ff 540{
8ecf7187 541 Zero(cl, 1, struct regnode_charclass_class);
653099ff 542 cl->type = ANYOF;
830247a4 543 cl_anything(pRExC_state, cl);
653099ff
GS
544}
545
546STATIC void
830247a4 547S_cl_init_zero(pTHX_ RExC_state_t *pRExC_state, struct regnode_charclass_class *cl)
653099ff 548{
8ecf7187 549 Zero(cl, 1, struct regnode_charclass_class);
653099ff 550 cl->type = ANYOF;
830247a4 551 cl_anything(pRExC_state, cl);
653099ff
GS
552 if (LOC)
553 cl->flags |= ANYOF_LOCALE;
554}
555
556/* 'And' a given class with another one. Can create false positives */
557/* We assume that cl is not inverted */
558STATIC void
559S_cl_and(pTHX_ struct regnode_charclass_class *cl,
560 struct regnode_charclass_class *and_with)
561{
653099ff
GS
562 if (!(and_with->flags & ANYOF_CLASS)
563 && !(cl->flags & ANYOF_CLASS)
564 && (and_with->flags & ANYOF_LOCALE) == (cl->flags & ANYOF_LOCALE)
565 && !(and_with->flags & ANYOF_FOLD)
566 && !(cl->flags & ANYOF_FOLD)) {
567 int i;
568
569 if (and_with->flags & ANYOF_INVERT)
570 for (i = 0; i < ANYOF_BITMAP_SIZE; i++)
571 cl->bitmap[i] &= ~and_with->bitmap[i];
572 else
573 for (i = 0; i < ANYOF_BITMAP_SIZE; i++)
574 cl->bitmap[i] &= and_with->bitmap[i];
575 } /* XXXX: logic is complicated otherwise, leave it along for a moment. */
576 if (!(and_with->flags & ANYOF_EOS))
577 cl->flags &= ~ANYOF_EOS;
1aa99e6b
IH
578
579 if (cl->flags & ANYOF_UNICODE_ALL && and_with->flags & ANYOF_UNICODE) {
580 cl->flags &= ~ANYOF_UNICODE_ALL;
581 cl->flags |= ANYOF_UNICODE;
582 ARG_SET(cl, ARG(and_with));
583 }
584 if (!(and_with->flags & ANYOF_UNICODE_ALL))
585 cl->flags &= ~ANYOF_UNICODE_ALL;
586 if (!(and_with->flags & (ANYOF_UNICODE|ANYOF_UNICODE_ALL)))
587 cl->flags &= ~ANYOF_UNICODE;
653099ff
GS
588}
589
590/* 'OR' a given class with another one. Can create false positives */
591/* We assume that cl is not inverted */
592STATIC void
830247a4 593S_cl_or(pTHX_ RExC_state_t *pRExC_state, struct regnode_charclass_class *cl, struct regnode_charclass_class *or_with)
653099ff 594{
653099ff
GS
595 if (or_with->flags & ANYOF_INVERT) {
596 /* We do not use
597 * (B1 | CL1) | (!B2 & !CL2) = (B1 | !B2 & !CL2) | (CL1 | (!B2 & !CL2))
598 * <= (B1 | !B2) | (CL1 | !CL2)
599 * which is wasteful if CL2 is small, but we ignore CL2:
600 * (B1 | CL1) | (!B2 & !CL2) <= (B1 | CL1) | !B2 = (B1 | !B2) | CL1
601 * XXXX Can we handle case-fold? Unclear:
602 * (OK1(i) | OK1(i')) | !(OK1(i) | OK1(i')) =
603 * (OK1(i) | OK1(i')) | (!OK1(i) & !OK1(i'))
604 */
605 if ( (or_with->flags & ANYOF_LOCALE) == (cl->flags & ANYOF_LOCALE)
606 && !(or_with->flags & ANYOF_FOLD)
607 && !(cl->flags & ANYOF_FOLD) ) {
608 int i;
609
610 for (i = 0; i < ANYOF_BITMAP_SIZE; i++)
611 cl->bitmap[i] |= ~or_with->bitmap[i];
612 } /* XXXX: logic is complicated otherwise */
613 else {
830247a4 614 cl_anything(pRExC_state, cl);
653099ff
GS
615 }
616 } else {
617 /* (B1 | CL1) | (B2 | CL2) = (B1 | B2) | (CL1 | CL2)) */
618 if ( (or_with->flags & ANYOF_LOCALE) == (cl->flags & ANYOF_LOCALE)
b81d288d 619 && (!(or_with->flags & ANYOF_FOLD)
653099ff
GS
620 || (cl->flags & ANYOF_FOLD)) ) {
621 int i;
622
623 /* OR char bitmap and class bitmap separately */
624 for (i = 0; i < ANYOF_BITMAP_SIZE; i++)
625 cl->bitmap[i] |= or_with->bitmap[i];
626 if (or_with->flags & ANYOF_CLASS) {
627 for (i = 0; i < ANYOF_CLASSBITMAP_SIZE; i++)
628 cl->classflags[i] |= or_with->classflags[i];
629 cl->flags |= ANYOF_CLASS;
630 }
631 }
632 else { /* XXXX: logic is complicated, leave it along for a moment. */
830247a4 633 cl_anything(pRExC_state, cl);
653099ff
GS
634 }
635 }
636 if (or_with->flags & ANYOF_EOS)
637 cl->flags |= ANYOF_EOS;
1aa99e6b
IH
638
639 if (cl->flags & ANYOF_UNICODE && or_with->flags & ANYOF_UNICODE &&
640 ARG(cl) != ARG(or_with)) {
641 cl->flags |= ANYOF_UNICODE_ALL;
642 cl->flags &= ~ANYOF_UNICODE;
643 }
644 if (or_with->flags & ANYOF_UNICODE_ALL) {
645 cl->flags |= ANYOF_UNICODE_ALL;
646 cl->flags &= ~ANYOF_UNICODE;
647 }
653099ff
GS
648}
649
5d1c421c
JH
650/*
651 * There are strange code-generation bugs caused on sparc64 by gcc-2.95.2.
652 * These need to be revisited when a newer toolchain becomes available.
653 */
654#if defined(__sparc64__) && defined(__GNUC__)
655# if __GNUC__ < 2 || (__GNUC__ == 2 && __GNUC_MINOR__ < 96)
656# undef SPARC64_GCC_WORKAROUND
657# define SPARC64_GCC_WORKAROUND 1
658# endif
659#endif
660
653099ff
GS
661/* REx optimizer. Converts nodes into quickier variants "in place".
662 Finds fixed substrings. */
663
c277df42
IZ
664/* Stops at toplevel WHILEM as well as at `last'. At end *scanp is set
665 to the position after last scanned or to NULL. */
666
76e3520e 667STATIC I32
830247a4 668S_study_chunk(pTHX_ RExC_state_t *pRExC_state, regnode **scanp, I32 *deltap, regnode *last, scan_data_t *data, U32 flags)
c277df42
IZ
669 /* scanp: Start here (read-write). */
670 /* deltap: Write maxlen-minlen here. */
671 /* last: Stop before this one. */
672{
673 I32 min = 0, pars = 0, code;
674 regnode *scan = *scanp, *next;
675 I32 delta = 0;
676 int is_inf = (flags & SCF_DO_SUBSTR) && (data->flags & SF_IS_INF);
aca2d497 677 int is_inf_internal = 0; /* The studied chunk is infinite */
c277df42
IZ
678 I32 is_par = OP(scan) == OPEN ? ARG(scan) : 0;
679 scan_data_t data_fake;
653099ff 680 struct regnode_charclass_class and_with; /* Valid if flags & SCF_DO_STCLASS_OR */
b81d288d 681
c277df42
IZ
682 while (scan && OP(scan) != END && scan < last) {
683 /* Peephole optimizer: */
684
22c35a8c 685 if (PL_regkind[(U8)OP(scan)] == EXACT) {
653099ff 686 /* Merge several consecutive EXACTish nodes into one. */
c277df42
IZ
687 regnode *n = regnext(scan);
688 U32 stringok = 1;
689#ifdef DEBUGGING
690 regnode *stop = scan;
b81d288d 691#endif
c277df42 692
cd439c50 693 next = scan + NODE_SZ_STR(scan);
c277df42
IZ
694 /* Skip NOTHING, merge EXACT*. */
695 while (n &&
b81d288d 696 ( PL_regkind[(U8)OP(n)] == NOTHING ||
c277df42
IZ
697 (stringok && (OP(n) == OP(scan))))
698 && NEXT_OFF(n)
699 && NEXT_OFF(scan) + NEXT_OFF(n) < I16_MAX) {
700 if (OP(n) == TAIL || n > next)
701 stringok = 0;
22c35a8c 702 if (PL_regkind[(U8)OP(n)] == NOTHING) {
c277df42
IZ
703 NEXT_OFF(scan) += NEXT_OFF(n);
704 next = n + NODE_STEP_REGNODE;
705#ifdef DEBUGGING
706 if (stringok)
707 stop = n;
b81d288d 708#endif
c277df42 709 n = regnext(n);
a0ed51b3 710 }
f49d4d0f 711 else if (stringok) {
cd439c50 712 int oldl = STR_LEN(scan);
c277df42 713 regnode *nnext = regnext(n);
f49d4d0f 714
b81d288d 715 if (oldl + STR_LEN(n) > U8_MAX)
c277df42
IZ
716 break;
717 NEXT_OFF(scan) += NEXT_OFF(n);
cd439c50
IZ
718 STR_LEN(scan) += STR_LEN(n);
719 next = n + NODE_SZ_STR(n);
c277df42 720 /* Now we can overwrite *n : */
f49d4d0f 721 Move(STRING(n), STRING(scan) + oldl, STR_LEN(n), char);
c277df42 722#ifdef DEBUGGING
f49d4d0f 723 stop = next - 1;
b81d288d 724#endif
c277df42
IZ
725 n = nnext;
726 }
727 }
61a36c01 728
d65e4eab 729 if (UTF && OP(scan) == EXACTF && STR_LEN(scan) >= 6) {
61a36c01
JH
730/*
731 Two problematic code points in Unicode casefolding of EXACT nodes:
732
733 U+0390 - GREEK SMALL LETTER IOTA WITH DIALYTIKA AND TONOS
734 U+03B0 - GREEK SMALL LETTER UPSILON WITH DIALYTIKA AND TONOS
735
736 which casefold to
737
738 Unicode UTF-8
739
740 U+03B9 U+0308 U+0301 0xCE 0xB9 0xCC 0x88 0xCC 0x81
741 U+03C5 U+0308 U+0301 0xCF 0x85 0xCC 0x88 0xCC 0x81
742
743 This means that in case-insensitive matching (or "loose matching",
744 as Unicode calls it), an EXACTF of length six (the UTF-8 encoded byte
745 length of the above casefolded versions) can match a target string
746 of length two (the byte length of UTF-8 encoded U+0390 or U+03B0).
747 This would rather mess up the minimum length computation.
748
749 What we'll do is to look for the tail four bytes, and then peek
750 at the preceding two bytes to see whether we need to decrease
751 the minimum length by four (six minus two).
752
753 Thanks to the design of UTF-8, there cannot be false matches:
754 A sequence of valid UTF-8 bytes cannot be a subsequence of
755 another valid sequence of UTF-8 bytes.
756
757*/
758 char *s0 = STRING(scan), *s, *t;
759 char *s1 = s0 + STR_LEN(scan) - 1, *s2 = s1 - 4;
760 char *t0 = "\xcc\x88\xcc\x81";
761 char *t1 = t0 + 3;
762
763 for (s = s0 + 2;
764 s < s2 && (t = ninstr(s, s1, t0, t1));
765 s = t + 4) {
766 if (((U8)t[-1] == 0xB9 && (U8)t[-2] == 0xCE) ||
767 ((U8)t[-1] == 0x85 && (U8)t[-2] == 0xCF))
768 min -= 4;
769 }
770 }
771
c277df42
IZ
772#ifdef DEBUGGING
773 /* Allow dumping */
cd439c50 774 n = scan + NODE_SZ_STR(scan);
c277df42 775 while (n <= stop) {
22c35a8c 776 if (PL_regkind[(U8)OP(n)] != NOTHING || OP(n) == NOTHING) {
c277df42
IZ
777 OP(n) = OPTIMIZED;
778 NEXT_OFF(n) = 0;
779 }
780 n++;
781 }
653099ff 782#endif
c277df42 783 }
653099ff
GS
784 /* Follow the next-chain of the current node and optimize
785 away all the NOTHINGs from it. */
c277df42 786 if (OP(scan) != CURLYX) {
048cfca1
GS
787 int max = (reg_off_by_arg[OP(scan)]
788 ? I32_MAX
789 /* I32 may be smaller than U16 on CRAYs! */
790 : (I32_MAX < U16_MAX ? I32_MAX : U16_MAX));
c277df42
IZ
791 int off = (reg_off_by_arg[OP(scan)] ? ARG(scan) : NEXT_OFF(scan));
792 int noff;
793 regnode *n = scan;
b81d288d 794
c277df42
IZ
795 /* Skip NOTHING and LONGJMP. */
796 while ((n = regnext(n))
22c35a8c 797 && ((PL_regkind[(U8)OP(n)] == NOTHING && (noff = NEXT_OFF(n)))
c277df42
IZ
798 || ((OP(n) == LONGJMP) && (noff = ARG(n))))
799 && off + noff < max)
800 off += noff;
801 if (reg_off_by_arg[OP(scan)])
802 ARG(scan) = off;
b81d288d 803 else
c277df42
IZ
804 NEXT_OFF(scan) = off;
805 }
653099ff
GS
806 /* The principal pseudo-switch. Cannot be a switch, since we
807 look into several different things. */
b81d288d 808 if (OP(scan) == BRANCH || OP(scan) == BRANCHJ
c277df42
IZ
809 || OP(scan) == IFTHEN || OP(scan) == SUSPEND) {
810 next = regnext(scan);
811 code = OP(scan);
b81d288d
AB
812
813 if (OP(next) == code || code == IFTHEN || code == SUSPEND) {
c277df42 814 I32 max1 = 0, min1 = I32_MAX, num = 0;
653099ff 815 struct regnode_charclass_class accum;
c277df42 816
653099ff 817 if (flags & SCF_DO_SUBSTR) /* XXXX Add !SUSPEND? */
830247a4 818 scan_commit(pRExC_state, data); /* Cannot merge strings after this. */
653099ff 819 if (flags & SCF_DO_STCLASS)
830247a4 820 cl_init_zero(pRExC_state, &accum);
c277df42 821 while (OP(scan) == code) {
830247a4 822 I32 deltanext, minnext, f = 0, fake;
653099ff 823 struct regnode_charclass_class this_class;
c277df42
IZ
824
825 num++;
826 data_fake.flags = 0;
b81d288d 827 if (data) {
2c2d71f5 828 data_fake.whilem_c = data->whilem_c;
cb434fcc
IZ
829 data_fake.last_closep = data->last_closep;
830 }
831 else
832 data_fake.last_closep = &fake;
c277df42
IZ
833 next = regnext(scan);
834 scan = NEXTOPER(scan);
835 if (code != BRANCH)
836 scan = NEXTOPER(scan);
653099ff 837 if (flags & SCF_DO_STCLASS) {
830247a4 838 cl_init(pRExC_state, &this_class);
653099ff
GS
839 data_fake.start_class = &this_class;
840 f = SCF_DO_STCLASS_AND;
b81d288d 841 }
e1901655
IZ
842 if (flags & SCF_WHILEM_VISITED_POS)
843 f |= SCF_WHILEM_VISITED_POS;
653099ff 844 /* we suppose the run is continuous, last=next...*/
830247a4
IZ
845 minnext = study_chunk(pRExC_state, &scan, &deltanext,
846 next, &data_fake, f);
b81d288d 847 if (min1 > minnext)
c277df42
IZ
848 min1 = minnext;
849 if (max1 < minnext + deltanext)
850 max1 = minnext + deltanext;
851 if (deltanext == I32_MAX)
aca2d497 852 is_inf = is_inf_internal = 1;
c277df42
IZ
853 scan = next;
854 if (data_fake.flags & (SF_HAS_PAR|SF_IN_PAR))
855 pars++;
405ff068 856 if (data && (data_fake.flags & SF_HAS_EVAL))
c277df42 857 data->flags |= SF_HAS_EVAL;
2c2d71f5
JH
858 if (data)
859 data->whilem_c = data_fake.whilem_c;
653099ff 860 if (flags & SCF_DO_STCLASS)
830247a4 861 cl_or(pRExC_state, &accum, &this_class);
b81d288d 862 if (code == SUSPEND)
c277df42
IZ
863 break;
864 }
865 if (code == IFTHEN && num < 2) /* Empty ELSE branch */
866 min1 = 0;
867 if (flags & SCF_DO_SUBSTR) {
868 data->pos_min += min1;
869 data->pos_delta += max1 - min1;
870 if (max1 != min1 || is_inf)
871 data->longest = &(data->longest_float);
872 }
873 min += min1;
874 delta += max1 - min1;
653099ff 875 if (flags & SCF_DO_STCLASS_OR) {
830247a4 876 cl_or(pRExC_state, data->start_class, &accum);
653099ff
GS
877 if (min1) {
878 cl_and(data->start_class, &and_with);
879 flags &= ~SCF_DO_STCLASS;
880 }
881 }
882 else if (flags & SCF_DO_STCLASS_AND) {
de0c8cb8
GS
883 if (min1) {
884 cl_and(data->start_class, &accum);
653099ff 885 flags &= ~SCF_DO_STCLASS;
de0c8cb8
GS
886 }
887 else {
b81d288d 888 /* Switch to OR mode: cache the old value of
de0c8cb8
GS
889 * data->start_class */
890 StructCopy(data->start_class, &and_with,
891 struct regnode_charclass_class);
892 flags &= ~SCF_DO_STCLASS_AND;
893 StructCopy(&accum, data->start_class,
894 struct regnode_charclass_class);
895 flags |= SCF_DO_STCLASS_OR;
896 data->start_class->flags |= ANYOF_EOS;
897 }
653099ff 898 }
a0ed51b3
LW
899 }
900 else if (code == BRANCHJ) /* single branch is optimized. */
c277df42
IZ
901 scan = NEXTOPER(NEXTOPER(scan));
902 else /* single branch is optimized. */
903 scan = NEXTOPER(scan);
904 continue;
a0ed51b3
LW
905 }
906 else if (OP(scan) == EXACT) {
cd439c50 907 I32 l = STR_LEN(scan);
1aa99e6b 908 UV uc = *((U8*)STRING(scan));
a0ed51b3 909 if (UTF) {
1aa99e6b
IH
910 U8 *s = (U8*)STRING(scan);
911 l = utf8_length(s, s + l);
9041c2e3 912 uc = utf8_to_uvchr(s, NULL);
a0ed51b3
LW
913 }
914 min += l;
c277df42 915 if (flags & SCF_DO_SUBSTR) { /* Update longest substr. */
c277df42
IZ
916 /* The code below prefers earlier match for fixed
917 offset, later match for variable offset. */
918 if (data->last_end == -1) { /* Update the start info. */
919 data->last_start_min = data->pos_min;
920 data->last_start_max = is_inf
b81d288d 921 ? I32_MAX : data->pos_min + data->pos_delta;
c277df42 922 }
cd439c50 923 sv_catpvn(data->last_found, STRING(scan), STR_LEN(scan));
0eda9292
JH
924 {
925 SV * sv = data->last_found;
926 MAGIC *mg = SvUTF8(sv) && SvMAGICAL(sv) ?
927 mg_find(sv, PERL_MAGIC_utf8) : NULL;
928 if (mg && mg->mg_len >= 0)
5e43f467
JH
929 mg->mg_len += utf8_length((U8*)STRING(scan),
930 (U8*)STRING(scan)+STR_LEN(scan));
0eda9292 931 }
33b8afdf
JH
932 if (UTF)
933 SvUTF8_on(data->last_found);
c277df42
IZ
934 data->last_end = data->pos_min + l;
935 data->pos_min += l; /* As in the first entry. */
936 data->flags &= ~SF_BEFORE_EOL;
937 }
653099ff
GS
938 if (flags & SCF_DO_STCLASS_AND) {
939 /* Check whether it is compatible with what we know already! */
940 int compat = 1;
941
1aa99e6b 942 if (uc >= 0x100 ||
516a5887 943 (!(data->start_class->flags & (ANYOF_CLASS | ANYOF_LOCALE))
1aa99e6b 944 && !ANYOF_BITMAP_TEST(data->start_class, uc)
653099ff 945 && (!(data->start_class->flags & ANYOF_FOLD)
1aa99e6b 946 || !ANYOF_BITMAP_TEST(data->start_class, PL_fold[uc])))
516a5887 947 )
653099ff
GS
948 compat = 0;
949 ANYOF_CLASS_ZERO(data->start_class);
950 ANYOF_BITMAP_ZERO(data->start_class);
951 if (compat)
1aa99e6b 952 ANYOF_BITMAP_SET(data->start_class, uc);
653099ff 953 data->start_class->flags &= ~ANYOF_EOS;
9b877dbb
IH
954 if (uc < 0x100)
955 data->start_class->flags &= ~ANYOF_UNICODE_ALL;
653099ff
GS
956 }
957 else if (flags & SCF_DO_STCLASS_OR) {
958 /* false positive possible if the class is case-folded */
1aa99e6b 959 if (uc < 0x100)
9b877dbb
IH
960 ANYOF_BITMAP_SET(data->start_class, uc);
961 else
962 data->start_class->flags |= ANYOF_UNICODE_ALL;
653099ff
GS
963 data->start_class->flags &= ~ANYOF_EOS;
964 cl_and(data->start_class, &and_with);
965 }
966 flags &= ~SCF_DO_STCLASS;
a0ed51b3 967 }
653099ff 968 else if (PL_regkind[(U8)OP(scan)] == EXACT) { /* But OP != EXACT! */
cd439c50 969 I32 l = STR_LEN(scan);
1aa99e6b 970 UV uc = *((U8*)STRING(scan));
653099ff
GS
971
972 /* Search for fixed substrings supports EXACT only. */
b81d288d 973 if (flags & SCF_DO_SUBSTR)
830247a4 974 scan_commit(pRExC_state, data);
a0ed51b3 975 if (UTF) {
1aa99e6b
IH
976 U8 *s = (U8 *)STRING(scan);
977 l = utf8_length(s, s + l);
9041c2e3 978 uc = utf8_to_uvchr(s, NULL);
a0ed51b3
LW
979 }
980 min += l;
c277df42 981 if (data && (flags & SCF_DO_SUBSTR))
a0ed51b3 982 data->pos_min += l;
653099ff
GS
983 if (flags & SCF_DO_STCLASS_AND) {
984 /* Check whether it is compatible with what we know already! */
985 int compat = 1;
986
1aa99e6b 987 if (uc >= 0x100 ||
516a5887 988 (!(data->start_class->flags & (ANYOF_CLASS | ANYOF_LOCALE))
1aa99e6b 989 && !ANYOF_BITMAP_TEST(data->start_class, uc)
516a5887 990 && !ANYOF_BITMAP_TEST(data->start_class, PL_fold[uc])))
653099ff
GS
991 compat = 0;
992 ANYOF_CLASS_ZERO(data->start_class);
993 ANYOF_BITMAP_ZERO(data->start_class);
994 if (compat) {
1aa99e6b 995 ANYOF_BITMAP_SET(data->start_class, uc);
653099ff
GS
996 data->start_class->flags &= ~ANYOF_EOS;
997 data->start_class->flags |= ANYOF_FOLD;
998 if (OP(scan) == EXACTFL)
999 data->start_class->flags |= ANYOF_LOCALE;
1000 }
1001 }
1002 else if (flags & SCF_DO_STCLASS_OR) {
1003 if (data->start_class->flags & ANYOF_FOLD) {
1004 /* false positive possible if the class is case-folded.
1005 Assume that the locale settings are the same... */
1aa99e6b
IH
1006 if (uc < 0x100)
1007 ANYOF_BITMAP_SET(data->start_class, uc);
653099ff
GS
1008 data->start_class->flags &= ~ANYOF_EOS;
1009 }
1010 cl_and(data->start_class, &and_with);
1011 }
1012 flags &= ~SCF_DO_STCLASS;
a0ed51b3 1013 }
4d61ec05 1014 else if (strchr((char*)PL_varies,OP(scan))) {
9c5ffd7c 1015 I32 mincount, maxcount, minnext, deltanext, fl = 0;
aa7a4b56 1016 I32 f = flags, pos_before = 0;
c277df42 1017 regnode *oscan = scan;
653099ff
GS
1018 struct regnode_charclass_class this_class;
1019 struct regnode_charclass_class *oclass = NULL;
727f22e3 1020 I32 next_is_eval = 0;
653099ff 1021
22c35a8c 1022 switch (PL_regkind[(U8)OP(scan)]) {
653099ff 1023 case WHILEM: /* End of (?:...)* . */
c277df42
IZ
1024 scan = NEXTOPER(scan);
1025 goto finish;
1026 case PLUS:
653099ff 1027 if (flags & (SCF_DO_SUBSTR | SCF_DO_STCLASS)) {
c277df42 1028 next = NEXTOPER(scan);
653099ff 1029 if (OP(next) == EXACT || (flags & SCF_DO_STCLASS)) {
b81d288d
AB
1030 mincount = 1;
1031 maxcount = REG_INFTY;
c277df42
IZ
1032 next = regnext(scan);
1033 scan = NEXTOPER(scan);
1034 goto do_curly;
1035 }
1036 }
1037 if (flags & SCF_DO_SUBSTR)
1038 data->pos_min++;
1039 min++;
1040 /* Fall through. */
1041 case STAR:
653099ff
GS
1042 if (flags & SCF_DO_STCLASS) {
1043 mincount = 0;
b81d288d 1044 maxcount = REG_INFTY;
653099ff
GS
1045 next = regnext(scan);
1046 scan = NEXTOPER(scan);
1047 goto do_curly;
1048 }
b81d288d 1049 is_inf = is_inf_internal = 1;
c277df42
IZ
1050 scan = regnext(scan);
1051 if (flags & SCF_DO_SUBSTR) {
830247a4 1052 scan_commit(pRExC_state, data); /* Cannot extend fixed substrings */
c277df42
IZ
1053 data->longest = &(data->longest_float);
1054 }
1055 goto optimize_curly_tail;
1056 case CURLY:
b81d288d 1057 mincount = ARG1(scan);
c277df42
IZ
1058 maxcount = ARG2(scan);
1059 next = regnext(scan);
cb434fcc
IZ
1060 if (OP(scan) == CURLYX) {
1061 I32 lp = (data ? *(data->last_closep) : 0);
1062
1063 scan->flags = ((lp <= U8_MAX) ? lp : U8_MAX);
1064 }
c277df42 1065 scan = NEXTOPER(scan) + EXTRA_STEP_2ARGS;
727f22e3 1066 next_is_eval = (OP(scan) == EVAL);
c277df42
IZ
1067 do_curly:
1068 if (flags & SCF_DO_SUBSTR) {
830247a4 1069 if (mincount == 0) scan_commit(pRExC_state,data); /* Cannot extend fixed substrings */
c277df42
IZ
1070 pos_before = data->pos_min;
1071 }
1072 if (data) {
1073 fl = data->flags;
1074 data->flags &= ~(SF_HAS_PAR|SF_IN_PAR|SF_HAS_EVAL);
1075 if (is_inf)
1076 data->flags |= SF_IS_INF;
1077 }
653099ff 1078 if (flags & SCF_DO_STCLASS) {
830247a4 1079 cl_init(pRExC_state, &this_class);
653099ff
GS
1080 oclass = data->start_class;
1081 data->start_class = &this_class;
1082 f |= SCF_DO_STCLASS_AND;
1083 f &= ~SCF_DO_STCLASS_OR;
1084 }
e1901655
IZ
1085 /* These are the cases when once a subexpression
1086 fails at a particular position, it cannot succeed
1087 even after backtracking at the enclosing scope.
b81d288d 1088
e1901655
IZ
1089 XXXX what if minimal match and we are at the
1090 initial run of {n,m}? */
1091 if ((mincount != maxcount - 1) && (maxcount != REG_INFTY))
1092 f &= ~SCF_WHILEM_VISITED_POS;
653099ff 1093
c277df42 1094 /* This will finish on WHILEM, setting scan, or on NULL: */
b81d288d
AB
1095 minnext = study_chunk(pRExC_state, &scan, &deltanext, last, data,
1096 mincount == 0
653099ff
GS
1097 ? (f & ~SCF_DO_SUBSTR) : f);
1098
1099 if (flags & SCF_DO_STCLASS)
1100 data->start_class = oclass;
1101 if (mincount == 0 || minnext == 0) {
1102 if (flags & SCF_DO_STCLASS_OR) {
830247a4 1103 cl_or(pRExC_state, data->start_class, &this_class);
653099ff
GS
1104 }
1105 else if (flags & SCF_DO_STCLASS_AND) {
b81d288d 1106 /* Switch to OR mode: cache the old value of
653099ff
GS
1107 * data->start_class */
1108 StructCopy(data->start_class, &and_with,
1109 struct regnode_charclass_class);
1110 flags &= ~SCF_DO_STCLASS_AND;
1111 StructCopy(&this_class, data->start_class,
1112 struct regnode_charclass_class);
1113 flags |= SCF_DO_STCLASS_OR;
1114 data->start_class->flags |= ANYOF_EOS;
1115 }
1116 } else { /* Non-zero len */
1117 if (flags & SCF_DO_STCLASS_OR) {
830247a4 1118 cl_or(pRExC_state, data->start_class, &this_class);
653099ff
GS
1119 cl_and(data->start_class, &and_with);
1120 }
1121 else if (flags & SCF_DO_STCLASS_AND)
1122 cl_and(data->start_class, &this_class);
1123 flags &= ~SCF_DO_STCLASS;
1124 }
c277df42
IZ
1125 if (!scan) /* It was not CURLYX, but CURLY. */
1126 scan = next;
84037bb0 1127 if (ckWARN(WARN_REGEXP)
727f22e3
JP
1128 /* ? quantifier ok, except for (?{ ... }) */
1129 && (next_is_eval || !(mincount == 0 && maxcount == 1))
84037bb0 1130 && (minnext == 0) && (deltanext == 0)
99799961 1131 && data && !(data->flags & (SF_HAS_PAR|SF_IN_PAR))
17feb5d5 1132 && maxcount <= REG_INFTY/3) /* Complement check for big count */
b45f050a 1133 {
830247a4 1134 vWARN(RExC_parse,
b45f050a
JF
1135 "Quantifier unexpected on zero-length expression");
1136 }
1137
c277df42 1138 min += minnext * mincount;
b81d288d 1139 is_inf_internal |= ((maxcount == REG_INFTY
155aba94
GS
1140 && (minnext + deltanext) > 0)
1141 || deltanext == I32_MAX);
aca2d497 1142 is_inf |= is_inf_internal;
c277df42
IZ
1143 delta += (minnext + deltanext) * maxcount - minnext * mincount;
1144
1145 /* Try powerful optimization CURLYX => CURLYN. */
b81d288d 1146 if ( OP(oscan) == CURLYX && data
c277df42
IZ
1147 && data->flags & SF_IN_PAR
1148 && !(data->flags & SF_HAS_EVAL)
1149 && !deltanext && minnext == 1 ) {
1150 /* Try to optimize to CURLYN. */
1151 regnode *nxt = NEXTOPER(oscan) + EXTRA_STEP_2ARGS;
497b47a8
JH
1152 regnode *nxt1 = nxt;
1153#ifdef DEBUGGING
1154 regnode *nxt2;
1155#endif
c277df42
IZ
1156
1157 /* Skip open. */
1158 nxt = regnext(nxt);
4d61ec05 1159 if (!strchr((char*)PL_simple,OP(nxt))
22c35a8c 1160 && !(PL_regkind[(U8)OP(nxt)] == EXACT
b81d288d 1161 && STR_LEN(nxt) == 1))
c277df42 1162 goto nogo;
497b47a8 1163#ifdef DEBUGGING
c277df42 1164 nxt2 = nxt;
497b47a8 1165#endif
c277df42 1166 nxt = regnext(nxt);
b81d288d 1167 if (OP(nxt) != CLOSE)
c277df42
IZ
1168 goto nogo;
1169 /* Now we know that nxt2 is the only contents: */
eb160463 1170 oscan->flags = (U8)ARG(nxt);
c277df42
IZ
1171 OP(oscan) = CURLYN;
1172 OP(nxt1) = NOTHING; /* was OPEN. */
1173#ifdef DEBUGGING
1174 OP(nxt1 + 1) = OPTIMIZED; /* was count. */
1175 NEXT_OFF(nxt1+ 1) = 0; /* just for consistancy. */
1176 NEXT_OFF(nxt2) = 0; /* just for consistancy with CURLY. */
1177 OP(nxt) = OPTIMIZED; /* was CLOSE. */
1178 OP(nxt + 1) = OPTIMIZED; /* was count. */
1179 NEXT_OFF(nxt+ 1) = 0; /* just for consistancy. */
b81d288d 1180#endif
c277df42 1181 }
c277df42
IZ
1182 nogo:
1183
1184 /* Try optimization CURLYX => CURLYM. */
b81d288d 1185 if ( OP(oscan) == CURLYX && data
c277df42 1186 && !(data->flags & SF_HAS_PAR)
c277df42
IZ
1187 && !(data->flags & SF_HAS_EVAL)
1188 && !deltanext ) {
1189 /* XXXX How to optimize if data == 0? */
1190 /* Optimize to a simpler form. */
1191 regnode *nxt = NEXTOPER(oscan) + EXTRA_STEP_2ARGS; /* OPEN */
1192 regnode *nxt2;
1193
1194 OP(oscan) = CURLYM;
1195 while ( (nxt2 = regnext(nxt)) /* skip over embedded stuff*/
b81d288d 1196 && (OP(nxt2) != WHILEM))
c277df42
IZ
1197 nxt = nxt2;
1198 OP(nxt2) = SUCCEED; /* Whas WHILEM */
c277df42
IZ
1199 /* Need to optimize away parenths. */
1200 if (data->flags & SF_IN_PAR) {
1201 /* Set the parenth number. */
1202 regnode *nxt1 = NEXTOPER(oscan) + EXTRA_STEP_2ARGS; /* OPEN*/
1203
b81d288d 1204 if (OP(nxt) != CLOSE)
b45f050a 1205 FAIL("Panic opt close");
eb160463 1206 oscan->flags = (U8)ARG(nxt);
c277df42
IZ
1207 OP(nxt1) = OPTIMIZED; /* was OPEN. */
1208 OP(nxt) = OPTIMIZED; /* was CLOSE. */
1209#ifdef DEBUGGING
1210 OP(nxt1 + 1) = OPTIMIZED; /* was count. */
1211 OP(nxt + 1) = OPTIMIZED; /* was count. */
1212 NEXT_OFF(nxt1 + 1) = 0; /* just for consistancy. */
1213 NEXT_OFF(nxt + 1) = 0; /* just for consistancy. */
b81d288d 1214#endif
c277df42
IZ
1215#if 0
1216 while ( nxt1 && (OP(nxt1) != WHILEM)) {
1217 regnode *nnxt = regnext(nxt1);
b81d288d 1218
c277df42
IZ
1219 if (nnxt == nxt) {
1220 if (reg_off_by_arg[OP(nxt1)])
1221 ARG_SET(nxt1, nxt2 - nxt1);
1222 else if (nxt2 - nxt1 < U16_MAX)
1223 NEXT_OFF(nxt1) = nxt2 - nxt1;
1224 else
1225 OP(nxt) = NOTHING; /* Cannot beautify */
1226 }
1227 nxt1 = nnxt;
1228 }
1229#endif
1230 /* Optimize again: */
b81d288d 1231 study_chunk(pRExC_state, &nxt1, &deltanext, nxt,
e1901655 1232 NULL, 0);
a0ed51b3
LW
1233 }
1234 else
c277df42 1235 oscan->flags = 0;
c277df42 1236 }
e1901655
IZ
1237 else if ((OP(oscan) == CURLYX)
1238 && (flags & SCF_WHILEM_VISITED_POS)
1239 /* See the comment on a similar expression above.
1240 However, this time it not a subexpression
1241 we care about, but the expression itself. */
1242 && (maxcount == REG_INFTY)
1243 && data && ++data->whilem_c < 16) {
1244 /* This stays as CURLYX, we can put the count/of pair. */
2c2d71f5
JH
1245 /* Find WHILEM (as in regexec.c) */
1246 regnode *nxt = oscan + NEXT_OFF(oscan);
1247
1248 if (OP(PREVOPER(nxt)) == NOTHING) /* LONGJMP */
1249 nxt += ARG(nxt);
eb160463
GS
1250 PREVOPER(nxt)->flags = (U8)(data->whilem_c
1251 | (RExC_whilem_seen << 4)); /* On WHILEM */
2c2d71f5 1252 }
b81d288d 1253 if (data && fl & (SF_HAS_PAR|SF_IN_PAR))
c277df42
IZ
1254 pars++;
1255 if (flags & SCF_DO_SUBSTR) {
1256 SV *last_str = Nullsv;
1257 int counted = mincount != 0;
1258
1259 if (data->last_end > 0 && mincount != 0) { /* Ends with a string. */
5d1c421c
JH
1260#if defined(SPARC64_GCC_WORKAROUND)
1261 I32 b = 0;
1262 STRLEN l = 0;
1263 char *s = NULL;
1264 I32 old = 0;
1265
1266 if (pos_before >= data->last_start_min)
1267 b = pos_before;
1268 else
1269 b = data->last_start_min;
1270
1271 l = 0;
1272 s = SvPV(data->last_found, l);
1273 old = b - data->last_start_min;
1274
1275#else
b81d288d 1276 I32 b = pos_before >= data->last_start_min
c277df42
IZ
1277 ? pos_before : data->last_start_min;
1278 STRLEN l;
1279 char *s = SvPV(data->last_found, l);
a0ed51b3 1280 I32 old = b - data->last_start_min;
5d1c421c 1281#endif
a0ed51b3
LW
1282
1283 if (UTF)
1284 old = utf8_hop((U8*)s, old) - (U8*)s;
c277df42 1285
a0ed51b3 1286 l -= old;
c277df42 1287 /* Get the added string: */
79cb57f6 1288 last_str = newSVpvn(s + old, l);
0e933229
IH
1289 if (UTF)
1290 SvUTF8_on(last_str);
c277df42
IZ
1291 if (deltanext == 0 && pos_before == b) {
1292 /* What was added is a constant string */
1293 if (mincount > 1) {
1294 SvGROW(last_str, (mincount * l) + 1);
b81d288d 1295 repeatcpy(SvPVX(last_str) + l,
c277df42
IZ
1296 SvPVX(last_str), l, mincount - 1);
1297 SvCUR(last_str) *= mincount;
1298 /* Add additional parts. */
b81d288d 1299 SvCUR_set(data->last_found,
c277df42
IZ
1300 SvCUR(data->last_found) - l);
1301 sv_catsv(data->last_found, last_str);
0eda9292
JH
1302 {
1303 SV * sv = data->last_found;
1304 MAGIC *mg =
1305 SvUTF8(sv) && SvMAGICAL(sv) ?
1306 mg_find(sv, PERL_MAGIC_utf8) : NULL;
1307 if (mg && mg->mg_len >= 0)
1308 mg->mg_len += CHR_SVLEN(last_str);
1309 }
c277df42
IZ
1310 data->last_end += l * (mincount - 1);
1311 }
2a8d9689
HS
1312 } else {
1313 /* start offset must point into the last copy */
1314 data->last_start_min += minnext * (mincount - 1);
c152dc43
HS
1315 data->last_start_max += is_inf ? I32_MAX
1316 : (maxcount - 1) * (minnext + data->pos_delta);
c277df42
IZ
1317 }
1318 }
1319 /* It is counted once already... */
1320 data->pos_min += minnext * (mincount - counted);
1321 data->pos_delta += - counted * deltanext +
1322 (minnext + deltanext) * maxcount - minnext * mincount;
1323 if (mincount != maxcount) {
653099ff
GS
1324 /* Cannot extend fixed substrings found inside
1325 the group. */
830247a4 1326 scan_commit(pRExC_state,data);
c277df42
IZ
1327 if (mincount && last_str) {
1328 sv_setsv(data->last_found, last_str);
1329 data->last_end = data->pos_min;
b81d288d 1330 data->last_start_min =
a0ed51b3 1331 data->pos_min - CHR_SVLEN(last_str);
b81d288d
AB
1332 data->last_start_max = is_inf
1333 ? I32_MAX
c277df42 1334 : data->pos_min + data->pos_delta
a0ed51b3 1335 - CHR_SVLEN(last_str);
c277df42
IZ
1336 }
1337 data->longest = &(data->longest_float);
1338 }
aca2d497 1339 SvREFCNT_dec(last_str);
c277df42 1340 }
405ff068 1341 if (data && (fl & SF_HAS_EVAL))
c277df42
IZ
1342 data->flags |= SF_HAS_EVAL;
1343 optimize_curly_tail:
c277df42 1344 if (OP(oscan) != CURLYX) {
22c35a8c 1345 while (PL_regkind[(U8)OP(next = regnext(oscan))] == NOTHING
c277df42
IZ
1346 && NEXT_OFF(next))
1347 NEXT_OFF(oscan) += NEXT_OFF(next);
1348 }
c277df42 1349 continue;
653099ff 1350 default: /* REF and CLUMP only? */
c277df42 1351 if (flags & SCF_DO_SUBSTR) {
830247a4 1352 scan_commit(pRExC_state,data); /* Cannot expect anything... */
c277df42
IZ
1353 data->longest = &(data->longest_float);
1354 }
aca2d497 1355 is_inf = is_inf_internal = 1;
653099ff 1356 if (flags & SCF_DO_STCLASS_OR)
830247a4 1357 cl_anything(pRExC_state, data->start_class);
653099ff 1358 flags &= ~SCF_DO_STCLASS;
c277df42
IZ
1359 break;
1360 }
a0ed51b3 1361 }
ffc61ed2 1362 else if (strchr((char*)PL_simple,OP(scan))) {
9c5ffd7c 1363 int value = 0;
653099ff 1364
c277df42 1365 if (flags & SCF_DO_SUBSTR) {
830247a4 1366 scan_commit(pRExC_state,data);
c277df42
IZ
1367 data->pos_min++;
1368 }
1369 min++;
653099ff
GS
1370 if (flags & SCF_DO_STCLASS) {
1371 data->start_class->flags &= ~ANYOF_EOS; /* No match on empty */
1372
1373 /* Some of the logic below assumes that switching
1374 locale on will only add false positives. */
1375 switch (PL_regkind[(U8)OP(scan)]) {
653099ff 1376 case SANY:
653099ff
GS
1377 default:
1378 do_default:
1379 /* Perl_croak(aTHX_ "panic: unexpected simple REx opcode %d", OP(scan)); */
1380 if (flags & SCF_DO_STCLASS_OR) /* Allow everything */
830247a4 1381 cl_anything(pRExC_state, data->start_class);
653099ff
GS
1382 break;
1383 case REG_ANY:
1384 if (OP(scan) == SANY)
1385 goto do_default;
1386 if (flags & SCF_DO_STCLASS_OR) { /* Everything but \n */
1387 value = (ANYOF_BITMAP_TEST(data->start_class,'\n')
1388 || (data->start_class->flags & ANYOF_CLASS));
830247a4 1389 cl_anything(pRExC_state, data->start_class);
653099ff
GS
1390 }
1391 if (flags & SCF_DO_STCLASS_AND || !value)
1392 ANYOF_BITMAP_CLEAR(data->start_class,'\n');
1393 break;
1394 case ANYOF:
1395 if (flags & SCF_DO_STCLASS_AND)
1396 cl_and(data->start_class,
1397 (struct regnode_charclass_class*)scan);
1398 else
830247a4 1399 cl_or(pRExC_state, data->start_class,
653099ff
GS
1400 (struct regnode_charclass_class*)scan);
1401 break;
1402 case ALNUM:
1403 if (flags & SCF_DO_STCLASS_AND) {
1404 if (!(data->start_class->flags & ANYOF_LOCALE)) {
1405 ANYOF_CLASS_CLEAR(data->start_class,ANYOF_NALNUM);
1406 for (value = 0; value < 256; value++)
1407 if (!isALNUM(value))
1408 ANYOF_BITMAP_CLEAR(data->start_class, value);
1409 }
1410 }
1411 else {
1412 if (data->start_class->flags & ANYOF_LOCALE)
1413 ANYOF_CLASS_SET(data->start_class,ANYOF_ALNUM);
1414 else {
1415 for (value = 0; value < 256; value++)
1416 if (isALNUM(value))
b81d288d 1417 ANYOF_BITMAP_SET(data->start_class, value);
653099ff
GS
1418 }
1419 }
1420 break;
1421 case ALNUML:
1422 if (flags & SCF_DO_STCLASS_AND) {
1423 if (data->start_class->flags & ANYOF_LOCALE)
1424 ANYOF_CLASS_CLEAR(data->start_class,ANYOF_NALNUM);
1425 }
1426 else {
1427 ANYOF_CLASS_SET(data->start_class,ANYOF_ALNUM);
1428 data->start_class->flags |= ANYOF_LOCALE;
1429 }
1430 break;
1431 case NALNUM:
1432 if (flags & SCF_DO_STCLASS_AND) {
1433 if (!(data->start_class->flags & ANYOF_LOCALE)) {
1434 ANYOF_CLASS_CLEAR(data->start_class,ANYOF_ALNUM);
1435 for (value = 0; value < 256; value++)
1436 if (isALNUM(value))
1437 ANYOF_BITMAP_CLEAR(data->start_class, value);
1438 }
1439 }
1440 else {
1441 if (data->start_class->flags & ANYOF_LOCALE)
1442 ANYOF_CLASS_SET(data->start_class,ANYOF_NALNUM);
1443 else {
1444 for (value = 0; value < 256; value++)
1445 if (!isALNUM(value))
b81d288d 1446 ANYOF_BITMAP_SET(data->start_class, value);
653099ff
GS
1447 }
1448 }
1449 break;
1450 case NALNUML:
1451 if (flags & SCF_DO_STCLASS_AND) {
1452 if (data->start_class->flags & ANYOF_LOCALE)
1453 ANYOF_CLASS_CLEAR(data->start_class,ANYOF_ALNUM);
1454 }
1455 else {
1456 data->start_class->flags |= ANYOF_LOCALE;
1457 ANYOF_CLASS_SET(data->start_class,ANYOF_NALNUM);
1458 }
1459 break;
1460 case SPACE:
1461 if (flags & SCF_DO_STCLASS_AND) {
1462 if (!(data->start_class->flags & ANYOF_LOCALE)) {
1463 ANYOF_CLASS_CLEAR(data->start_class,ANYOF_NSPACE);
1464 for (value = 0; value < 256; value++)
1465 if (!isSPACE(value))
1466 ANYOF_BITMAP_CLEAR(data->start_class, value);
1467 }
1468 }
1469 else {
1470 if (data->start_class->flags & ANYOF_LOCALE)
1471 ANYOF_CLASS_SET(data->start_class,ANYOF_SPACE);
1472 else {
1473 for (value = 0; value < 256; value++)
1474 if (isSPACE(value))
b81d288d 1475 ANYOF_BITMAP_SET(data->start_class, value);
653099ff
GS
1476 }
1477 }
1478 break;
1479 case SPACEL:
1480 if (flags & SCF_DO_STCLASS_AND) {
1481 if (data->start_class->flags & ANYOF_LOCALE)
1482 ANYOF_CLASS_CLEAR(data->start_class,ANYOF_NSPACE);
1483 }
1484 else {
1485 data->start_class->flags |= ANYOF_LOCALE;
1486 ANYOF_CLASS_SET(data->start_class,ANYOF_SPACE);
1487 }
1488 break;
1489 case NSPACE:
1490 if (flags & SCF_DO_STCLASS_AND) {
1491 if (!(data->start_class->flags & ANYOF_LOCALE)) {
1492 ANYOF_CLASS_CLEAR(data->start_class,ANYOF_SPACE);
1493 for (value = 0; value < 256; value++)
1494 if (isSPACE(value))
1495 ANYOF_BITMAP_CLEAR(data->start_class, value);
1496 }
1497 }
1498 else {
1499 if (data->start_class->flags & ANYOF_LOCALE)
1500 ANYOF_CLASS_SET(data->start_class,ANYOF_NSPACE);
1501 else {
1502 for (value = 0; value < 256; value++)
1503 if (!isSPACE(value))
b81d288d 1504 ANYOF_BITMAP_SET(data->start_class, value);
653099ff
GS
1505 }
1506 }
1507 break;
1508 case NSPACEL:
1509 if (flags & SCF_DO_STCLASS_AND) {
1510 if (data->start_class->flags & ANYOF_LOCALE) {
1511 ANYOF_CLASS_CLEAR(data->start_class,ANYOF_SPACE);
1512 for (value = 0; value < 256; value++)
1513 if (!isSPACE(value))
1514 ANYOF_BITMAP_CLEAR(data->start_class, value);
1515 }
1516 }
1517 else {
1518 data->start_class->flags |= ANYOF_LOCALE;
1519 ANYOF_CLASS_SET(data->start_class,ANYOF_NSPACE);
1520 }
1521 break;
1522 case DIGIT:
1523 if (flags & SCF_DO_STCLASS_AND) {
1524 ANYOF_CLASS_CLEAR(data->start_class,ANYOF_NDIGIT);
1525 for (value = 0; value < 256; value++)
1526 if (!isDIGIT(value))
1527 ANYOF_BITMAP_CLEAR(data->start_class, value);
1528 }
1529 else {
1530 if (data->start_class->flags & ANYOF_LOCALE)
1531 ANYOF_CLASS_SET(data->start_class,ANYOF_DIGIT);
1532 else {
1533 for (value = 0; value < 256; value++)
1534 if (isDIGIT(value))
b81d288d 1535 ANYOF_BITMAP_SET(data->start_class, value);
653099ff
GS
1536 }
1537 }
1538 break;
1539 case NDIGIT:
1540 if (flags & SCF_DO_STCLASS_AND) {
1541 ANYOF_CLASS_CLEAR(data->start_class,ANYOF_DIGIT);
1542 for (value = 0; value < 256; value++)
1543 if (isDIGIT(value))
1544 ANYOF_BITMAP_CLEAR(data->start_class, value);
1545 }
1546 else {
1547 if (data->start_class->flags & ANYOF_LOCALE)
1548 ANYOF_CLASS_SET(data->start_class,ANYOF_NDIGIT);
1549 else {
1550 for (value = 0; value < 256; value++)
1551 if (!isDIGIT(value))
b81d288d 1552 ANYOF_BITMAP_SET(data->start_class, value);
653099ff
GS
1553 }
1554 }
1555 break;
1556 }
1557 if (flags & SCF_DO_STCLASS_OR)
1558 cl_and(data->start_class, &and_with);
1559 flags &= ~SCF_DO_STCLASS;
1560 }
a0ed51b3 1561 }
22c35a8c 1562 else if (PL_regkind[(U8)OP(scan)] == EOL && flags & SCF_DO_SUBSTR) {
c277df42
IZ
1563 data->flags |= (OP(scan) == MEOL
1564 ? SF_BEFORE_MEOL
1565 : SF_BEFORE_SEOL);
a0ed51b3 1566 }
653099ff
GS
1567 else if ( PL_regkind[(U8)OP(scan)] == BRANCHJ
1568 /* Lookbehind, or need to calculate parens/evals/stclass: */
1569 && (scan->flags || data || (flags & SCF_DO_STCLASS))
c277df42 1570 && (OP(scan) == IFMATCH || OP(scan) == UNLESSM)) {
653099ff 1571 /* Lookahead/lookbehind */
cb434fcc 1572 I32 deltanext, minnext, fake = 0;
c277df42 1573 regnode *nscan;
653099ff
GS
1574 struct regnode_charclass_class intrnl;
1575 int f = 0;
c277df42
IZ
1576
1577 data_fake.flags = 0;
b81d288d 1578 if (data) {
2c2d71f5 1579 data_fake.whilem_c = data->whilem_c;
cb434fcc
IZ
1580 data_fake.last_closep = data->last_closep;
1581 }
1582 else
1583 data_fake.last_closep = &fake;
653099ff
GS
1584 if ( flags & SCF_DO_STCLASS && !scan->flags
1585 && OP(scan) == IFMATCH ) { /* Lookahead */
830247a4 1586 cl_init(pRExC_state, &intrnl);
653099ff 1587 data_fake.start_class = &intrnl;
e1901655 1588 f |= SCF_DO_STCLASS_AND;
653099ff 1589 }
e1901655
IZ
1590 if (flags & SCF_WHILEM_VISITED_POS)
1591 f |= SCF_WHILEM_VISITED_POS;
c277df42
IZ
1592 next = regnext(scan);
1593 nscan = NEXTOPER(NEXTOPER(scan));
830247a4 1594 minnext = study_chunk(pRExC_state, &nscan, &deltanext, last, &data_fake, f);
c277df42
IZ
1595 if (scan->flags) {
1596 if (deltanext) {
9baa0206 1597 vFAIL("Variable length lookbehind not implemented");
a0ed51b3
LW
1598 }
1599 else if (minnext > U8_MAX) {
9baa0206 1600 vFAIL2("Lookbehind longer than %"UVuf" not implemented", (UV)U8_MAX);
c277df42 1601 }
eb160463 1602 scan->flags = (U8)minnext;
c277df42
IZ
1603 }
1604 if (data && data_fake.flags & (SF_HAS_PAR|SF_IN_PAR))
1605 pars++;
405ff068 1606 if (data && (data_fake.flags & SF_HAS_EVAL))
c277df42 1607 data->flags |= SF_HAS_EVAL;
2c2d71f5
JH
1608 if (data)
1609 data->whilem_c = data_fake.whilem_c;
e1901655 1610 if (f & SCF_DO_STCLASS_AND) {
653099ff
GS
1611 int was = (data->start_class->flags & ANYOF_EOS);
1612
1613 cl_and(data->start_class, &intrnl);
1614 if (was)
1615 data->start_class->flags |= ANYOF_EOS;
1616 }
a0ed51b3
LW
1617 }
1618 else if (OP(scan) == OPEN) {
c277df42 1619 pars++;
a0ed51b3 1620 }
cb434fcc 1621 else if (OP(scan) == CLOSE) {
eb160463 1622 if ((I32)ARG(scan) == is_par) {
cb434fcc 1623 next = regnext(scan);
c277df42 1624
cb434fcc
IZ
1625 if ( next && (OP(next) != WHILEM) && next < last)
1626 is_par = 0; /* Disable optimization */
1627 }
1628 if (data)
1629 *(data->last_closep) = ARG(scan);
a0ed51b3
LW
1630 }
1631 else if (OP(scan) == EVAL) {
c277df42
IZ
1632 if (data)
1633 data->flags |= SF_HAS_EVAL;
1634 }
96776eda 1635 else if (OP(scan) == LOGICAL && scan->flags == 2) { /* Embedded follows */
0f5d15d6 1636 if (flags & SCF_DO_SUBSTR) {
830247a4 1637 scan_commit(pRExC_state,data);
0f5d15d6
IZ
1638 data->longest = &(data->longest_float);
1639 }
1640 is_inf = is_inf_internal = 1;
653099ff 1641 if (flags & SCF_DO_STCLASS_OR) /* Allow everything */
830247a4 1642 cl_anything(pRExC_state, data->start_class);
96776eda 1643 flags &= ~SCF_DO_STCLASS;
0f5d15d6 1644 }
c277df42
IZ
1645 /* Else: zero-length, ignore. */
1646 scan = regnext(scan);
1647 }
1648
1649 finish:
1650 *scanp = scan;
aca2d497 1651 *deltap = is_inf_internal ? I32_MAX : delta;
b81d288d 1652 if (flags & SCF_DO_SUBSTR && is_inf)
c277df42
IZ
1653 data->pos_delta = I32_MAX - data->pos_min;
1654 if (is_par > U8_MAX)
1655 is_par = 0;
1656 if (is_par && pars==1 && data) {
1657 data->flags |= SF_IN_PAR;
1658 data->flags &= ~SF_HAS_PAR;
a0ed51b3
LW
1659 }
1660 else if (pars && data) {
c277df42
IZ
1661 data->flags |= SF_HAS_PAR;
1662 data->flags &= ~SF_IN_PAR;
1663 }
653099ff
GS
1664 if (flags & SCF_DO_STCLASS_OR)
1665 cl_and(data->start_class, &and_with);
c277df42
IZ
1666 return min;
1667}
1668
76e3520e 1669STATIC I32
830247a4 1670S_add_data(pTHX_ RExC_state_t *pRExC_state, I32 n, char *s)
c277df42 1671{
830247a4 1672 if (RExC_rx->data) {
b81d288d
AB
1673 Renewc(RExC_rx->data,
1674 sizeof(*RExC_rx->data) + sizeof(void*) * (RExC_rx->data->count + n - 1),
c277df42 1675 char, struct reg_data);
830247a4
IZ
1676 Renew(RExC_rx->data->what, RExC_rx->data->count + n, U8);
1677 RExC_rx->data->count += n;
a0ed51b3
LW
1678 }
1679 else {
830247a4 1680 Newc(1207, RExC_rx->data, sizeof(*RExC_rx->data) + sizeof(void*) * (n - 1),
c277df42 1681 char, struct reg_data);
830247a4
IZ
1682 New(1208, RExC_rx->data->what, n, U8);
1683 RExC_rx->data->count = n;
c277df42 1684 }
830247a4
IZ
1685 Copy(s, RExC_rx->data->what + RExC_rx->data->count - n, n, U8);
1686 return RExC_rx->data->count - n;
c277df42
IZ
1687}
1688
d88dccdf 1689void
864dbfa3 1690Perl_reginitcolors(pTHX)
d88dccdf 1691{
d88dccdf
IZ
1692 int i = 0;
1693 char *s = PerlEnv_getenv("PERL_RE_COLORS");
b81d288d 1694
d88dccdf
IZ
1695 if (s) {
1696 PL_colors[0] = s = savepv(s);
1697 while (++i < 6) {
1698 s = strchr(s, '\t');
1699 if (s) {
1700 *s = '\0';
1701 PL_colors[i] = ++s;
1702 }
1703 else
c712d376 1704 PL_colors[i] = s = "";
d88dccdf
IZ
1705 }
1706 } else {
b81d288d 1707 while (i < 6)
d88dccdf
IZ
1708 PL_colors[i++] = "";
1709 }
1710 PL_colorset = 1;
1711}
1712
8615cb43 1713
a687059c 1714/*
e50aee73 1715 - pregcomp - compile a regular expression into internal code
a687059c
LW
1716 *
1717 * We can't allocate space until we know how big the compiled form will be,
1718 * but we can't compile it (and thus know how big it is) until we've got a
1719 * place to put the code. So we cheat: we compile it twice, once with code
1720 * generation turned off and size counting turned on, and once "for real".
1721 * This also means that we don't allocate space until we are sure that the
1722 * thing really will compile successfully, and we never have to move the
1723 * code and thus invalidate pointers into it. (Note that it has to be in
1724 * one piece because free() must be able to free it all.) [NB: not true in perl]
1725 *
1726 * Beware that the optimization-preparation code in here knows about some
1727 * of the structure of the compiled regexp. [I'll say.]
1728 */
1729regexp *
864dbfa3 1730Perl_pregcomp(pTHX_ char *exp, char *xend, PMOP *pm)
a687059c 1731{
a0d0e21e 1732 register regexp *r;
c277df42 1733 regnode *scan;
c277df42 1734 regnode *first;
a0d0e21e 1735 I32 flags;
a0d0e21e
LW
1736 I32 minlen = 0;
1737 I32 sawplus = 0;
1738 I32 sawopen = 0;
2c2d71f5 1739 scan_data_t data;
830247a4
IZ
1740 RExC_state_t RExC_state;
1741 RExC_state_t *pRExC_state = &RExC_state;
a0d0e21e
LW
1742
1743 if (exp == NULL)
c277df42 1744 FAIL("NULL regexp argument");
a0d0e21e 1745
a5961de5 1746 RExC_utf8 = pm->op_pmdynflags & PMdf_CMP_UTF8;
a0ed51b3 1747
5cfc7842 1748 RExC_precomp = exp;
a5961de5
JH
1749 DEBUG_r({
1750 if (!PL_colorset) reginitcolors();
1751 PerlIO_printf(Perl_debug_log, "%sCompiling REx%s `%s%*s%s'\n",
1752 PL_colors[4],PL_colors[5],PL_colors[0],
1753 (int)(xend - exp), RExC_precomp, PL_colors[1]);
1754 });
e2509266 1755 RExC_flags = pm->op_pmflags;
830247a4 1756 RExC_sawback = 0;
bbce6d69 1757
830247a4
IZ
1758 RExC_seen = 0;
1759 RExC_seen_zerolen = *exp == '^' ? -1 : 0;
1760 RExC_seen_evals = 0;
1761 RExC_extralen = 0;
c277df42 1762
bbce6d69 1763 /* First pass: determine size, legality. */
830247a4 1764 RExC_parse = exp;
fac92740 1765 RExC_start = exp;
830247a4
IZ
1766 RExC_end = xend;
1767 RExC_naughty = 0;
1768 RExC_npar = 1;
1769 RExC_size = 0L;
1770 RExC_emit = &PL_regdummy;
1771 RExC_whilem_seen = 0;
85ddcde9
JH
1772#if 0 /* REGC() is (currently) a NOP at the first pass.
1773 * Clever compilers notice this and complain. --jhi */
830247a4 1774 REGC((U8)REG_MAGIC, (char*)RExC_emit);
85ddcde9 1775#endif
830247a4 1776 if (reg(pRExC_state, 0, &flags) == NULL) {
830247a4 1777 RExC_precomp = Nullch;
a0d0e21e
LW
1778 return(NULL);
1779 }
830247a4 1780 DEBUG_r(PerlIO_printf(Perl_debug_log, "size %"IVdf" ", (IV)RExC_size));
c277df42 1781
c277df42
IZ
1782 /* Small enough for pointer-storage convention?
1783 If extralen==0, this means that we will not need long jumps. */
830247a4
IZ
1784 if (RExC_size >= 0x10000L && RExC_extralen)
1785 RExC_size += RExC_extralen;
c277df42 1786 else
830247a4
IZ
1787 RExC_extralen = 0;
1788 if (RExC_whilem_seen > 15)
1789 RExC_whilem_seen = 15;
a0d0e21e 1790
bbce6d69 1791 /* Allocate space and initialize. */
830247a4 1792 Newc(1001, r, sizeof(regexp) + (unsigned)RExC_size * sizeof(regnode),
c277df42 1793 char, regexp);
a0d0e21e 1794 if (r == NULL)
b45f050a
JF
1795 FAIL("Regexp out of space");
1796
0f79a09d
GS
1797#ifdef DEBUGGING
1798 /* avoid reading uninitialized memory in DEBUGGING code in study_chunk() */
830247a4 1799 Zero(r, sizeof(regexp) + (unsigned)RExC_size * sizeof(regnode), char);
0f79a09d 1800#endif
c277df42 1801 r->refcnt = 1;
bbce6d69 1802 r->prelen = xend - exp;
5cfc7842 1803 r->precomp = savepvn(RExC_precomp, r->prelen);
cf93c79d 1804 r->subbeg = NULL;
ed252734
NC
1805#ifdef PERL_COPY_ON_WRITE
1806 r->saved_copy = Nullsv;
1807#endif
cf93c79d 1808 r->reganch = pm->op_pmflags & PMf_COMPILETIME;
830247a4 1809 r->nparens = RExC_npar - 1; /* set early to validate backrefs */
4327152a
IZ
1810
1811 r->substrs = 0; /* Useful during FAIL. */
1812 r->startp = 0; /* Useful during FAIL. */
1813 r->endp = 0; /* Useful during FAIL. */
1814
fac92740
MJD
1815 Newz(1304, r->offsets, 2*RExC_size+1, U32); /* MJD 20001228 */
1816 if (r->offsets) {
1817 r->offsets[0] = RExC_size;
1818 }
1819 DEBUG_r(PerlIO_printf(Perl_debug_log,
392fbf5d 1820 "%s %"UVuf" bytes for offset annotations.\n",
fac92740 1821 r->offsets ? "Got" : "Couldn't get",
392fbf5d 1822 (UV)((2*RExC_size+1) * sizeof(U32))));
fac92740 1823
830247a4 1824 RExC_rx = r;
bbce6d69 1825
1826 /* Second pass: emit code. */
e2509266 1827 RExC_flags = pm->op_pmflags; /* don't let top level (?i) bleed */
830247a4
IZ
1828 RExC_parse = exp;
1829 RExC_end = xend;
1830 RExC_naughty = 0;
1831 RExC_npar = 1;
fac92740 1832 RExC_emit_start = r->program;
830247a4 1833 RExC_emit = r->program;
2cd61cdb 1834 /* Store the count of eval-groups for security checks: */
eb160463 1835 RExC_emit->next_off = (U16)((RExC_seen_evals > U16_MAX) ? U16_MAX : RExC_seen_evals);
830247a4 1836 REGC((U8)REG_MAGIC, (char*) RExC_emit++);
c277df42 1837 r->data = 0;
830247a4 1838 if (reg(pRExC_state, 0, &flags) == NULL)
a0d0e21e
LW
1839 return(NULL);
1840
1841 /* Dig out information for optimizations. */
cf93c79d 1842 r->reganch = pm->op_pmflags & PMf_COMPILETIME; /* Again? */
e2509266 1843 pm->op_pmflags = RExC_flags;
a0ed51b3 1844 if (UTF)
5ff6fc6d 1845 r->reganch |= ROPT_UTF8; /* Unicode in it? */
c277df42 1846 r->regstclass = NULL;
830247a4 1847 if (RExC_naughty >= 10) /* Probably an expensive pattern. */
a0ed51b3 1848 r->reganch |= ROPT_NAUGHTY;
c277df42 1849 scan = r->program + 1; /* First BRANCH. */
2779dcf1
IZ
1850
1851 /* XXXX To minimize changes to RE engine we always allocate
1852 3-units-long substrs field. */
1853 Newz(1004, r->substrs, 1, struct reg_substr_data);
1854
2c2d71f5 1855 StructCopy(&zero_scan_data, &data, scan_data_t);
653099ff 1856 /* XXXX Should not we check for something else? Usually it is OPEN1... */
c277df42 1857 if (OP(scan) != BRANCH) { /* Only one top-level choice. */
c277df42 1858 I32 fake;
c5254dd6 1859 STRLEN longest_float_length, longest_fixed_length;
653099ff
GS
1860 struct regnode_charclass_class ch_class;
1861 int stclass_flag;
cb434fcc 1862 I32 last_close = 0;
a0d0e21e
LW
1863
1864 first = scan;
c277df42 1865 /* Skip introductions and multiplicators >= 1. */
a0d0e21e 1866 while ((OP(first) == OPEN && (sawopen = 1)) ||
653099ff 1867 /* An OR of *one* alternative - should not happen now. */
a0d0e21e
LW
1868 (OP(first) == BRANCH && OP(regnext(first)) != BRANCH) ||
1869 (OP(first) == PLUS) ||
1870 (OP(first) == MINMOD) ||
653099ff 1871 /* An {n,m} with n>0 */
22c35a8c 1872 (PL_regkind[(U8)OP(first)] == CURLY && ARG1(first) > 0) ) {
a0d0e21e
LW
1873 if (OP(first) == PLUS)
1874 sawplus = 1;
1875 else
1876 first += regarglen[(U8)OP(first)];
1877 first = NEXTOPER(first);
a687059c
LW
1878 }
1879
a0d0e21e
LW
1880 /* Starting-point info. */
1881 again:
653099ff 1882 if (PL_regkind[(U8)OP(first)] == EXACT) {
1aa99e6b
IH
1883 if (OP(first) == EXACT)
1884 ; /* Empty, get anchored substr later. */
1885 else if ((OP(first) == EXACTF || OP(first) == EXACTFL))
b3c9acc1
IZ
1886 r->regstclass = first;
1887 }
653099ff 1888 else if (strchr((char*)PL_simple,OP(first)))
a0d0e21e 1889 r->regstclass = first;
22c35a8c
GS
1890 else if (PL_regkind[(U8)OP(first)] == BOUND ||
1891 PL_regkind[(U8)OP(first)] == NBOUND)
a0d0e21e 1892 r->regstclass = first;
22c35a8c 1893 else if (PL_regkind[(U8)OP(first)] == BOL) {
cad2e5aa
JH
1894 r->reganch |= (OP(first) == MBOL
1895 ? ROPT_ANCH_MBOL
1896 : (OP(first) == SBOL
1897 ? ROPT_ANCH_SBOL
1898 : ROPT_ANCH_BOL));
a0d0e21e 1899 first = NEXTOPER(first);
774d564b 1900 goto again;
1901 }
1902 else if (OP(first) == GPOS) {
1903 r->reganch |= ROPT_ANCH_GPOS;
1904 first = NEXTOPER(first);
1905 goto again;
a0d0e21e 1906 }
e09294f4 1907 else if (!sawopen && (OP(first) == STAR &&
22c35a8c 1908 PL_regkind[(U8)OP(NEXTOPER(first))] == REG_ANY) &&
a0d0e21e
LW
1909 !(r->reganch & ROPT_ANCH) )
1910 {
1911 /* turn .* into ^.* with an implied $*=1 */
cad2e5aa
JH
1912 int type = OP(NEXTOPER(first));
1913
ffc61ed2 1914 if (type == REG_ANY)
cad2e5aa
JH
1915 type = ROPT_ANCH_MBOL;
1916 else
1917 type = ROPT_ANCH_SBOL;
1918
1919 r->reganch |= type | ROPT_IMPLICIT;
a0d0e21e 1920 first = NEXTOPER(first);
774d564b 1921 goto again;
a0d0e21e 1922 }
b81d288d 1923 if (sawplus && (!sawopen || !RExC_sawback)
830247a4 1924 && !(RExC_seen & REG_SEEN_EVAL)) /* May examine pos and $& */
cad2e5aa
JH
1925 /* x+ must match at the 1st pos of run of x's */
1926 r->reganch |= ROPT_SKIP;
a0d0e21e 1927
c277df42 1928 /* Scan is after the zeroth branch, first is atomic matcher. */
b81d288d 1929 DEBUG_r(PerlIO_printf(Perl_debug_log, "first at %"IVdf"\n",
b900a521 1930 (IV)(first - scan + 1)));
a0d0e21e
LW
1931 /*
1932 * If there's something expensive in the r.e., find the
1933 * longest literal string that must appear and make it the
1934 * regmust. Resolve ties in favor of later strings, since
1935 * the regstart check works with the beginning of the r.e.
1936 * and avoiding duplication strengthens checking. Not a
1937 * strong reason, but sufficient in the absence of others.
1938 * [Now we resolve ties in favor of the earlier string if
c277df42 1939 * it happens that c_offset_min has been invalidated, since the
a0d0e21e
LW
1940 * earlier string may buy us something the later one won't.]
1941 */
a0d0e21e 1942 minlen = 0;
a687059c 1943
79cb57f6
GS
1944 data.longest_fixed = newSVpvn("",0);
1945 data.longest_float = newSVpvn("",0);
1946 data.last_found = newSVpvn("",0);
c277df42
IZ
1947 data.longest = &(data.longest_fixed);
1948 first = scan;
653099ff 1949 if (!r->regstclass) {
830247a4 1950 cl_init(pRExC_state, &ch_class);
653099ff
GS
1951 data.start_class = &ch_class;
1952 stclass_flag = SCF_DO_STCLASS_AND;
1953 } else /* XXXX Check for BOUND? */
1954 stclass_flag = 0;
cb434fcc 1955 data.last_closep = &last_close;
653099ff 1956
830247a4 1957 minlen = study_chunk(pRExC_state, &first, &fake, scan + RExC_size, /* Up to end */
e1901655 1958 &data, SCF_DO_SUBSTR | SCF_WHILEM_VISITED_POS | stclass_flag);
830247a4 1959 if ( RExC_npar == 1 && data.longest == &(data.longest_fixed)
b81d288d 1960 && data.last_start_min == 0 && data.last_end > 0
830247a4
IZ
1961 && !RExC_seen_zerolen
1962 && (!(RExC_seen & REG_SEEN_GPOS) || (r->reganch & ROPT_ANCH_GPOS)))
c277df42 1963 r->reganch |= ROPT_CHECK_ALL;
830247a4 1964 scan_commit(pRExC_state, &data);
c277df42
IZ
1965 SvREFCNT_dec(data.last_found);
1966
a0ed51b3 1967 longest_float_length = CHR_SVLEN(data.longest_float);
c5254dd6 1968 if (longest_float_length
c277df42
IZ
1969 || (data.flags & SF_FL_BEFORE_EOL
1970 && (!(data.flags & SF_FL_BEFORE_MEOL)
e2509266 1971 || (RExC_flags & PMf_MULTILINE)))) {
cf93c79d
IZ
1972 int t;
1973
a0ed51b3 1974 if (SvCUR(data.longest_fixed) /* ok to leave SvCUR */
aca2d497
IZ
1975 && data.offset_fixed == data.offset_float_min
1976 && SvCUR(data.longest_fixed) == SvCUR(data.longest_float))
1977 goto remove_float; /* As in (a)+. */
1978
33b8afdf
JH
1979 if (SvUTF8(data.longest_float)) {
1980 r->float_utf8 = data.longest_float;
1981 r->float_substr = Nullsv;
1982 } else {
1983 r->float_substr = data.longest_float;
1984 r->float_utf8 = Nullsv;
1985 }
c277df42
IZ
1986 r->float_min_offset = data.offset_float_min;
1987 r->float_max_offset = data.offset_float_max;
cf93c79d
IZ
1988 t = (data.flags & SF_FL_BEFORE_EOL /* Can't have SEOL and MULTI */
1989 && (!(data.flags & SF_FL_BEFORE_MEOL)
e2509266 1990 || (RExC_flags & PMf_MULTILINE)));
33b8afdf 1991 fbm_compile(data.longest_float, t ? FBMcf_TAIL : 0);
a0ed51b3
LW
1992 }
1993 else {
aca2d497 1994 remove_float:
33b8afdf 1995 r->float_substr = r->float_utf8 = Nullsv;
c277df42 1996 SvREFCNT_dec(data.longest_float);
c5254dd6 1997 longest_float_length = 0;
a0d0e21e 1998 }
c277df42 1999
a0ed51b3 2000 longest_fixed_length = CHR_SVLEN(data.longest_fixed);
c5254dd6 2001 if (longest_fixed_length
c277df42
IZ
2002 || (data.flags & SF_FIX_BEFORE_EOL /* Cannot have SEOL and MULTI */
2003 && (!(data.flags & SF_FIX_BEFORE_MEOL)
e2509266 2004 || (RExC_flags & PMf_MULTILINE)))) {
cf93c79d
IZ
2005 int t;
2006
33b8afdf
JH
2007 if (SvUTF8(data.longest_fixed)) {
2008 r->anchored_utf8 = data.longest_fixed;
2009 r->anchored_substr = Nullsv;
2010 } else {
2011 r->anchored_substr = data.longest_fixed;
2012 r->anchored_utf8 = Nullsv;
2013 }
c277df42 2014 r->anchored_offset = data.offset_fixed;
cf93c79d
IZ
2015 t = (data.flags & SF_FIX_BEFORE_EOL /* Can't have SEOL and MULTI */
2016 && (!(data.flags & SF_FIX_BEFORE_MEOL)
e2509266 2017 || (RExC_flags & PMf_MULTILINE)));
33b8afdf 2018 fbm_compile(data.longest_fixed, t ? FBMcf_TAIL : 0);
a0ed51b3
LW
2019 }
2020 else {
33b8afdf 2021 r->anchored_substr = r->anchored_utf8 = Nullsv;
c277df42 2022 SvREFCNT_dec(data.longest_fixed);
c5254dd6 2023 longest_fixed_length = 0;
a0d0e21e 2024 }
b81d288d 2025 if (r->regstclass
ffc61ed2 2026 && (OP(r->regstclass) == REG_ANY || OP(r->regstclass) == SANY))
653099ff 2027 r->regstclass = NULL;
33b8afdf
JH
2028 if ((!(r->anchored_substr || r->anchored_utf8) || r->anchored_offset)
2029 && stclass_flag
653099ff 2030 && !(data.start_class->flags & ANYOF_EOS)
eb160463
GS
2031 && !cl_is_anything(data.start_class))
2032 {
830247a4 2033 I32 n = add_data(pRExC_state, 1, "f");
653099ff 2034
b81d288d 2035 New(1006, RExC_rx->data->data[n], 1,
653099ff
GS
2036 struct regnode_charclass_class);
2037 StructCopy(data.start_class,
830247a4 2038 (struct regnode_charclass_class*)RExC_rx->data->data[n],
653099ff 2039 struct regnode_charclass_class);
830247a4 2040 r->regstclass = (regnode*)RExC_rx->data->data[n];
653099ff 2041 r->reganch &= ~ROPT_SKIP; /* Used in find_byclass(). */
ffc61ed2 2042 PL_regdata = r->data; /* for regprop() */
9c5ffd7c
JH
2043 DEBUG_r({ SV *sv = sv_newmortal();
2044 regprop(sv, (regnode*)data.start_class);
2045 PerlIO_printf(Perl_debug_log,
2046 "synthetic stclass `%s'.\n",
2047 SvPVX(sv));});
653099ff 2048 }
c277df42
IZ
2049
2050 /* A temporary algorithm prefers floated substr to fixed one to dig more info. */
c5254dd6 2051 if (longest_fixed_length > longest_float_length) {
c277df42 2052 r->check_substr = r->anchored_substr;
33b8afdf 2053 r->check_utf8 = r->anchored_utf8;
c277df42
IZ
2054 r->check_offset_min = r->check_offset_max = r->anchored_offset;
2055 if (r->reganch & ROPT_ANCH_SINGLE)
2056 r->reganch |= ROPT_NOSCAN;
a0ed51b3
LW
2057 }
2058 else {
c277df42 2059 r->check_substr = r->float_substr;
33b8afdf 2060 r->check_utf8 = r->float_utf8;
c277df42
IZ
2061 r->check_offset_min = data.offset_float_min;
2062 r->check_offset_max = data.offset_float_max;
a0d0e21e 2063 }
30382c73
IZ
2064 /* XXXX Currently intuiting is not compatible with ANCH_GPOS.
2065 This should be changed ASAP! */
33b8afdf 2066 if ((r->check_substr || r->check_utf8) && !(r->reganch & ROPT_ANCH_GPOS)) {
cad2e5aa 2067 r->reganch |= RE_USE_INTUIT;
33b8afdf 2068 if (SvTAIL(r->check_substr ? r->check_substr : r->check_utf8))
cad2e5aa
JH
2069 r->reganch |= RE_INTUIT_TAIL;
2070 }
a0ed51b3
LW
2071 }
2072 else {
c277df42
IZ
2073 /* Several toplevels. Best we can is to set minlen. */
2074 I32 fake;
653099ff 2075 struct regnode_charclass_class ch_class;
cb434fcc 2076 I32 last_close = 0;
c277df42
IZ
2077
2078 DEBUG_r(PerlIO_printf(Perl_debug_log, "\n"));
2079 scan = r->program + 1;
830247a4 2080 cl_init(pRExC_state, &ch_class);
653099ff 2081 data.start_class = &ch_class;
cb434fcc 2082 data.last_closep = &last_close;
e1901655 2083 minlen = study_chunk(pRExC_state, &scan, &fake, scan + RExC_size, &data, SCF_DO_STCLASS_AND|SCF_WHILEM_VISITED_POS);
33b8afdf
JH
2084 r->check_substr = r->check_utf8 = r->anchored_substr = r->anchored_utf8
2085 = r->float_substr = r->float_utf8 = Nullsv;
653099ff 2086 if (!(data.start_class->flags & ANYOF_EOS)
eb160463
GS
2087 && !cl_is_anything(data.start_class))
2088 {
830247a4 2089 I32 n = add_data(pRExC_state, 1, "f");
653099ff 2090
b81d288d 2091 New(1006, RExC_rx->data->data[n], 1,
653099ff
GS
2092 struct regnode_charclass_class);
2093 StructCopy(data.start_class,
830247a4 2094 (struct regnode_charclass_class*)RExC_rx->data->data[n],
653099ff 2095 struct regnode_charclass_class);
830247a4 2096 r->regstclass = (regnode*)RExC_rx->data->data[n];
653099ff 2097 r->reganch &= ~ROPT_SKIP; /* Used in find_byclass(). */
9c5ffd7c
JH
2098 DEBUG_r({ SV* sv = sv_newmortal();
2099 regprop(sv, (regnode*)data.start_class);
2100 PerlIO_printf(Perl_debug_log,
2101 "synthetic stclass `%s'.\n",
2102 SvPVX(sv));});
653099ff 2103 }
a0d0e21e
LW
2104 }
2105
a0d0e21e 2106 r->minlen = minlen;
b81d288d 2107 if (RExC_seen & REG_SEEN_GPOS)
c277df42 2108 r->reganch |= ROPT_GPOS_SEEN;
830247a4 2109 if (RExC_seen & REG_SEEN_LOOKBEHIND)
c277df42 2110 r->reganch |= ROPT_LOOKBEHIND_SEEN;
830247a4 2111 if (RExC_seen & REG_SEEN_EVAL)
ce862d02 2112 r->reganch |= ROPT_EVAL_SEEN;
f33976b4
DB
2113 if (RExC_seen & REG_SEEN_CANY)
2114 r->reganch |= ROPT_CANY_SEEN;
830247a4
IZ
2115 Newz(1002, r->startp, RExC_npar, I32);
2116 Newz(1002, r->endp, RExC_npar, I32);
ffc61ed2 2117 PL_regdata = r->data; /* for regprop() */
a0d0e21e
LW
2118 DEBUG_r(regdump(r));
2119 return(r);
a687059c
LW
2120}
2121
2122/*
2123 - reg - regular expression, i.e. main body or parenthesized thing
2124 *
2125 * Caller must absorb opening parenthesis.
2126 *
2127 * Combining parenthesis handling with the base level of regular expression
2128 * is a trifle forced, but the need to tie the tails of the branches to what
2129 * follows makes it hard to avoid.
2130 */
76e3520e 2131STATIC regnode *
830247a4 2132S_reg(pTHX_ RExC_state_t *pRExC_state, I32 paren, I32 *flagp)
c277df42 2133 /* paren: Parenthesized? 0=top, 1=(, inside: changed to letter. */
a687059c 2134{
c277df42
IZ
2135 register regnode *ret; /* Will be the head of the group. */
2136 register regnode *br;
2137 register regnode *lastbr;
2138 register regnode *ender = 0;
a0d0e21e 2139 register I32 parno = 0;
e2509266 2140 I32 flags, oregflags = RExC_flags, have_branch = 0, open = 0;
9d1d55b5
JP
2141
2142 /* for (?g), (?gc), and (?o) warnings; warning
2143 about (?c) will warn about (?g) -- japhy */
2144
2145 I32 wastedflags = 0x00,
2146 wasted_o = 0x01,
2147 wasted_g = 0x02,
2148 wasted_gc = 0x02 | 0x04,
2149 wasted_c = 0x04;
2150
fac92740 2151 char * parse_start = RExC_parse; /* MJD */
830247a4 2152 char *oregcomp_parse = RExC_parse;
c277df42 2153 char c;
a0d0e21e 2154
821b33a5 2155 *flagp = 0; /* Tentatively. */
a0d0e21e 2156
9d1d55b5 2157
a0d0e21e
LW
2158 /* Make an OPEN node, if parenthesized. */
2159 if (paren) {
fac92740 2160 if (*RExC_parse == '?') { /* (?...) */
2b36a5a0
JH
2161 U32 posflags = 0, negflags = 0;
2162 U32 *flagsp = &posflags;
0f5d15d6 2163 int logical = 0;
830247a4 2164 char *seqstart = RExC_parse;
ca9dfc88 2165
830247a4
IZ
2166 RExC_parse++;
2167 paren = *RExC_parse++;
c277df42 2168 ret = NULL; /* For look-ahead/behind. */
a0d0e21e 2169 switch (paren) {
fac92740 2170 case '<': /* (?<...) */
830247a4 2171 RExC_seen |= REG_SEEN_LOOKBEHIND;
b81d288d 2172 if (*RExC_parse == '!')
c277df42 2173 paren = ',';
b81d288d 2174 if (*RExC_parse != '=' && *RExC_parse != '!')
c277df42 2175 goto unknown;
830247a4 2176 RExC_parse++;
fac92740
MJD
2177 case '=': /* (?=...) */
2178 case '!': /* (?!...) */
830247a4 2179 RExC_seen_zerolen++;
fac92740
MJD
2180 case ':': /* (?:...) */
2181 case '>': /* (?>...) */
a0d0e21e 2182 break;
fac92740
MJD
2183 case '$': /* (?$...) */
2184 case '@': /* (?@...) */
8615cb43 2185 vFAIL2("Sequence (?%c...) not implemented", (int)paren);
a0d0e21e 2186 break;
fac92740 2187 case '#': /* (?#...) */
830247a4
IZ
2188 while (*RExC_parse && *RExC_parse != ')')
2189 RExC_parse++;
2190 if (*RExC_parse != ')')
c277df42 2191 FAIL("Sequence (?#... not terminated");
830247a4 2192 nextchar(pRExC_state);
a0d0e21e
LW
2193 *flagp = TRYAGAIN;
2194 return NULL;
fac92740 2195 case 'p': /* (?p...) */
9014280d 2196 if (SIZE_ONLY && ckWARN2(WARN_DEPRECATED, WARN_REGEXP))
f9373011 2197 vWARNdep(RExC_parse, "(?p{}) is deprecated - use (??{})");
8c8ad484 2198 /* FALL THROUGH*/
fac92740 2199 case '?': /* (??...) */
0f5d15d6 2200 logical = 1;
438a3801
YST
2201 if (*RExC_parse != '{')
2202 goto unknown;
830247a4 2203 paren = *RExC_parse++;
0f5d15d6 2204 /* FALL THROUGH */
fac92740 2205 case '{': /* (?{...}) */
c277df42 2206 {
c277df42
IZ
2207 I32 count = 1, n = 0;
2208 char c;
830247a4 2209 char *s = RExC_parse;
c277df42
IZ
2210 SV *sv;
2211 OP_4tree *sop, *rop;
2212
830247a4
IZ
2213 RExC_seen_zerolen++;
2214 RExC_seen |= REG_SEEN_EVAL;
2215 while (count && (c = *RExC_parse)) {
2216 if (c == '\\' && RExC_parse[1])
2217 RExC_parse++;
b81d288d 2218 else if (c == '{')
c277df42 2219 count++;
b81d288d 2220 else if (c == '}')
c277df42 2221 count--;
830247a4 2222 RExC_parse++;
c277df42 2223 }
830247a4 2224 if (*RExC_parse != ')')
b45f050a 2225 {
b81d288d 2226 RExC_parse = s;
b45f050a
JF
2227 vFAIL("Sequence (?{...}) not terminated or not {}-balanced");
2228 }
c277df42 2229 if (!SIZE_ONLY) {
f3548bdc 2230 PAD *pad;
b81d288d
AB
2231
2232 if (RExC_parse - 1 - s)
830247a4 2233 sv = newSVpvn(s, RExC_parse - 1 - s);
c277df42 2234 else
79cb57f6 2235 sv = newSVpvn("", 0);
c277df42 2236
569233ed
SB
2237 ENTER;
2238 Perl_save_re_context(aTHX);
f3548bdc 2239 rop = sv_compile_2op(sv, &sop, "re", &pad);
9b978d73
DM
2240 sop->op_private |= OPpREFCOUNTED;
2241 /* re_dup will OpREFCNT_inc */
2242 OpREFCNT_set(sop, 1);
569233ed 2243 LEAVE;
c277df42 2244
830247a4
IZ
2245 n = add_data(pRExC_state, 3, "nop");
2246 RExC_rx->data->data[n] = (void*)rop;
2247 RExC_rx->data->data[n+1] = (void*)sop;
f3548bdc 2248 RExC_rx->data->data[n+2] = (void*)pad;
c277df42 2249 SvREFCNT_dec(sv);
a0ed51b3 2250 }
e24b16f9 2251 else { /* First pass */
830247a4 2252 if (PL_reginterp_cnt < ++RExC_seen_evals
e24b16f9 2253 && PL_curcop != &PL_compiling)
2cd61cdb
IZ
2254 /* No compiled RE interpolated, has runtime
2255 components ===> unsafe. */
2256 FAIL("Eval-group not allowed at runtime, use re 'eval'");
5b61d3f7 2257 if (PL_tainting && PL_tainted)
cc6b7395 2258 FAIL("Eval-group in insecure regular expression");
c277df42
IZ
2259 }
2260
830247a4 2261 nextchar(pRExC_state);
0f5d15d6 2262 if (logical) {
830247a4 2263 ret = reg_node(pRExC_state, LOGICAL);
0f5d15d6
IZ
2264 if (!SIZE_ONLY)
2265 ret->flags = 2;
830247a4 2266 regtail(pRExC_state, ret, reganode(pRExC_state, EVAL, n));
fac92740 2267 /* deal with the length of this later - MJD */
0f5d15d6
IZ
2268 return ret;
2269 }
ccb2c380
MP
2270 ret = reganode(pRExC_state, EVAL, n);
2271 Set_Node_Length(ret, RExC_parse - parse_start + 1);
2272 Set_Node_Offset(ret, parse_start);
2273 return ret;
c277df42 2274 }
fac92740 2275 case '(': /* (?(?{...})...) and (?(?=...)...) */
c277df42 2276 {
fac92740 2277 if (RExC_parse[0] == '?') { /* (?(?...)) */
b81d288d
AB
2278 if (RExC_parse[1] == '=' || RExC_parse[1] == '!'
2279 || RExC_parse[1] == '<'
830247a4 2280 || RExC_parse[1] == '{') { /* Lookahead or eval. */
c277df42
IZ
2281 I32 flag;
2282
830247a4 2283 ret = reg_node(pRExC_state, LOGICAL);
0f5d15d6
IZ
2284 if (!SIZE_ONLY)
2285 ret->flags = 1;
830247a4 2286 regtail(pRExC_state, ret, reg(pRExC_state, 1, &flag));
c277df42 2287 goto insert_if;
b81d288d 2288 }
a0ed51b3 2289 }
830247a4 2290 else if (RExC_parse[0] >= '1' && RExC_parse[0] <= '9' ) {
fac92740 2291 /* (?(1)...) */
830247a4 2292 parno = atoi(RExC_parse++);
c277df42 2293
830247a4
IZ
2294 while (isDIGIT(*RExC_parse))
2295 RExC_parse++;
fac92740
MJD
2296 ret = reganode(pRExC_state, GROUPP, parno);
2297
830247a4 2298 if ((c = *nextchar(pRExC_state)) != ')')
b45f050a 2299 vFAIL("Switch condition not recognized");
c277df42 2300 insert_if:
830247a4
IZ
2301 regtail(pRExC_state, ret, reganode(pRExC_state, IFTHEN, 0));
2302 br = regbranch(pRExC_state, &flags, 1);
c277df42 2303 if (br == NULL)
830247a4 2304 br = reganode(pRExC_state, LONGJMP, 0);
c277df42 2305 else
830247a4
IZ
2306 regtail(pRExC_state, br, reganode(pRExC_state, LONGJMP, 0));
2307 c = *nextchar(pRExC_state);
d1b80229
IZ
2308 if (flags&HASWIDTH)
2309 *flagp |= HASWIDTH;
c277df42 2310 if (c == '|') {
830247a4
IZ
2311 lastbr = reganode(pRExC_state, IFTHEN, 0); /* Fake one for optimizer. */
2312 regbranch(pRExC_state, &flags, 1);
2313 regtail(pRExC_state, ret, lastbr);
d1b80229
IZ
2314 if (flags&HASWIDTH)
2315 *flagp |= HASWIDTH;
830247a4 2316 c = *nextchar(pRExC_state);
a0ed51b3
LW
2317 }
2318 else
c277df42
IZ
2319 lastbr = NULL;
2320 if (c != ')')
8615cb43 2321 vFAIL("Switch (?(condition)... contains too many branches");
830247a4
IZ
2322 ender = reg_node(pRExC_state, TAIL);
2323 regtail(pRExC_state, br, ender);
c277df42 2324 if (lastbr) {
830247a4
IZ
2325 regtail(pRExC_state, lastbr, ender);
2326 regtail(pRExC_state, NEXTOPER(NEXTOPER(lastbr)), ender);
a0ed51b3
LW
2327 }
2328 else
830247a4 2329 regtail(pRExC_state, ret, ender);
c277df42 2330 return ret;
a0ed51b3
LW
2331 }
2332 else {
830247a4 2333 vFAIL2("Unknown switch condition (?(%.2s", RExC_parse);
c277df42
IZ
2334 }
2335 }
1b1626e4 2336 case 0:
830247a4 2337 RExC_parse--; /* for vFAIL to print correctly */
8615cb43 2338 vFAIL("Sequence (? incomplete");
1b1626e4 2339 break;
a0d0e21e 2340 default:
830247a4 2341 --RExC_parse;
fac92740 2342 parse_flags: /* (?i) */
830247a4 2343 while (*RExC_parse && strchr("iogcmsx", *RExC_parse)) {
9d1d55b5
JP
2344 /* (?g), (?gc) and (?o) are useless here
2345 and must be globally applied -- japhy */
2346
2347 if (*RExC_parse == 'o' || *RExC_parse == 'g') {
2348 if (SIZE_ONLY && ckWARN(WARN_REGEXP)) {
2349 I32 wflagbit = *RExC_parse == 'o' ? wasted_o : wasted_g;
2350 if (! (wastedflags & wflagbit) ) {
2351 wastedflags |= wflagbit;
2352 vWARN5(
2353 RExC_parse + 1,
2354 "Useless (%s%c) - %suse /%c modifier",
2355 flagsp == &negflags ? "?-" : "?",
2356 *RExC_parse,
2357 flagsp == &negflags ? "don't " : "",
2358 *RExC_parse
2359 );
2360 }
2361 }
2362 }
2363 else if (*RExC_parse == 'c') {
2364 if (SIZE_ONLY && ckWARN(WARN_REGEXP)) {
2365 if (! (wastedflags & wasted_c) ) {
2366 wastedflags |= wasted_gc;
2367 vWARN3(
2368 RExC_parse + 1,
2369 "Useless (%sc) - %suse /gc modifier",
2370 flagsp == &negflags ? "?-" : "?",
2371 flagsp == &negflags ? "don't " : ""
2372 );
2373 }
2374 }
2375 }
2376 else { pmflag(flagsp, *RExC_parse); }
2377
830247a4 2378 ++RExC_parse;
ca9dfc88 2379 }
830247a4 2380 if (*RExC_parse == '-') {
ca9dfc88 2381 flagsp = &negflags;
9d1d55b5 2382 wastedflags = 0; /* reset so (?g-c) warns twice */
830247a4 2383 ++RExC_parse;
ca9dfc88 2384 goto parse_flags;
48c036b1 2385 }
e2509266
JH
2386 RExC_flags |= posflags;
2387 RExC_flags &= ~negflags;
830247a4
IZ
2388 if (*RExC_parse == ':') {
2389 RExC_parse++;
ca9dfc88
IZ
2390 paren = ':';
2391 break;
2392 }
c277df42 2393 unknown:
830247a4
IZ
2394 if (*RExC_parse != ')') {
2395 RExC_parse++;
2396 vFAIL3("Sequence (%.*s...) not recognized", RExC_parse-seqstart, seqstart);
b45f050a 2397 }
830247a4 2398 nextchar(pRExC_state);
a0d0e21e
LW
2399 *flagp = TRYAGAIN;
2400 return NULL;
2401 }
2402 }
fac92740 2403 else { /* (...) */
830247a4
IZ
2404 parno = RExC_npar;
2405 RExC_npar++;
2406 ret = reganode(pRExC_state, OPEN, parno);
fac92740
MJD
2407 Set_Node_Length(ret, 1); /* MJD */
2408 Set_Node_Offset(ret, RExC_parse); /* MJD */
c277df42 2409 open = 1;
a0d0e21e 2410 }
a0ed51b3 2411 }
fac92740 2412 else /* ! paren */
a0d0e21e
LW
2413 ret = NULL;
2414
2415 /* Pick up the branches, linking them together. */
fac92740 2416 parse_start = RExC_parse; /* MJD */
830247a4 2417 br = regbranch(pRExC_state, &flags, 1);
fac92740
MJD
2418 /* branch_len = (paren != 0); */
2419
a0d0e21e
LW
2420 if (br == NULL)
2421 return(NULL);
830247a4
IZ
2422 if (*RExC_parse == '|') {
2423 if (!SIZE_ONLY && RExC_extralen) {
2424 reginsert(pRExC_state, BRANCHJ, br);
a0ed51b3 2425 }
fac92740 2426 else { /* MJD */
830247a4 2427 reginsert(pRExC_state, BRANCH, br);
fac92740
MJD
2428 Set_Node_Length(br, paren != 0);
2429 Set_Node_Offset_To_R(br-RExC_emit_start, parse_start-RExC_start);
2430 }
c277df42
IZ
2431 have_branch = 1;
2432 if (SIZE_ONLY)
830247a4 2433 RExC_extralen += 1; /* For BRANCHJ-BRANCH. */
a0ed51b3
LW
2434 }
2435 else if (paren == ':') {
c277df42
IZ
2436 *flagp |= flags&SIMPLE;
2437 }
2438 if (open) { /* Starts with OPEN. */
830247a4 2439 regtail(pRExC_state, ret, br); /* OPEN -> first. */
a0ed51b3
LW
2440 }
2441 else if (paren != '?') /* Not Conditional */
a0d0e21e 2442 ret = br;
32a0ca98 2443 *flagp |= flags & (SPSTART | HASWIDTH);
c277df42 2444 lastbr = br;
830247a4
IZ
2445 while (*RExC_parse == '|') {
2446 if (!SIZE_ONLY && RExC_extralen) {
2447 ender = reganode(pRExC_state, LONGJMP,0);
2448 regtail(pRExC_state, NEXTOPER(NEXTOPER(lastbr)), ender); /* Append to the previous. */
c277df42
IZ
2449 }
2450 if (SIZE_ONLY)
830247a4
IZ
2451 RExC_extralen += 2; /* Account for LONGJMP. */
2452 nextchar(pRExC_state);
2453 br = regbranch(pRExC_state, &flags, 0);
fac92740 2454
a687059c 2455 if (br == NULL)
a0d0e21e 2456 return(NULL);
830247a4 2457 regtail(pRExC_state, lastbr, br); /* BRANCH -> BRANCH. */
c277df42 2458 lastbr = br;
821b33a5
IZ
2459 if (flags&HASWIDTH)
2460 *flagp |= HASWIDTH;
a687059c 2461 *flagp |= flags&SPSTART;
a0d0e21e
LW
2462 }
2463
c277df42
IZ
2464 if (have_branch || paren != ':') {
2465 /* Make a closing node, and hook it on the end. */
2466 switch (paren) {
2467 case ':':
830247a4 2468 ender = reg_node(pRExC_state, TAIL);
c277df42
IZ
2469 break;
2470 case 1:
830247a4 2471 ender = reganode(pRExC_state, CLOSE, parno);
fac92740
MJD
2472 Set_Node_Offset(ender,RExC_parse+1); /* MJD */
2473 Set_Node_Length(ender,1); /* MJD */
c277df42
IZ
2474 break;
2475 case '<':
c277df42
IZ
2476 case ',':
2477 case '=':
2478 case '!':
c277df42 2479 *flagp &= ~HASWIDTH;
821b33a5
IZ
2480 /* FALL THROUGH */
2481 case '>':
830247a4 2482 ender = reg_node(pRExC_state, SUCCEED);
c277df42
IZ
2483 break;
2484 case 0:
830247a4 2485 ender = reg_node(pRExC_state, END);
c277df42
IZ
2486 break;
2487 }
830247a4 2488 regtail(pRExC_state, lastbr, ender);
a0d0e21e 2489
c277df42
IZ
2490 if (have_branch) {
2491 /* Hook the tails of the branches to the closing node. */
2492 for (br = ret; br != NULL; br = regnext(br)) {
830247a4 2493 regoptail(pRExC_state, br, ender);
c277df42
IZ
2494 }
2495 }
a0d0e21e 2496 }
c277df42
IZ
2497
2498 {
2499 char *p;
2500 static char parens[] = "=!<,>";
2501
2502 if (paren && (p = strchr(parens, paren))) {
eb160463 2503 U8 node = ((p - parens) % 2) ? UNLESSM : IFMATCH;
c277df42
IZ
2504 int flag = (p - parens) > 1;
2505
2506 if (paren == '>')
2507 node = SUSPEND, flag = 0;
830247a4 2508 reginsert(pRExC_state, node,ret);
ccb2c380
MP
2509 Set_Node_Offset(ret, oregcomp_parse);
2510 Set_Node_Length(ret, RExC_parse - oregcomp_parse + 2);
c277df42 2511 ret->flags = flag;
830247a4 2512 regtail(pRExC_state, ret, reg_node(pRExC_state, TAIL));
c277df42 2513 }
a0d0e21e
LW
2514 }
2515
2516 /* Check for proper termination. */
ce3e6498 2517 if (paren) {
e2509266 2518 RExC_flags = oregflags;
830247a4
IZ
2519 if (RExC_parse >= RExC_end || *nextchar(pRExC_state) != ')') {
2520 RExC_parse = oregcomp_parse;
380a0633 2521 vFAIL("Unmatched (");
ce3e6498 2522 }
a0ed51b3 2523 }
830247a4
IZ
2524 else if (!paren && RExC_parse < RExC_end) {
2525 if (*RExC_parse == ')') {
2526 RExC_parse++;
380a0633 2527 vFAIL("Unmatched )");
a0ed51b3
LW
2528 }
2529 else
b45f050a 2530 FAIL("Junk on end of regexp"); /* "Can't happen". */
a0d0e21e
LW
2531 /* NOTREACHED */
2532 }
a687059c 2533
a0d0e21e 2534 return(ret);
a687059c
LW
2535}
2536
2537/*
2538 - regbranch - one alternative of an | operator
2539 *
2540 * Implements the concatenation operator.
2541 */
76e3520e 2542STATIC regnode *
830247a4 2543S_regbranch(pTHX_ RExC_state_t *pRExC_state, I32 *flagp, I32 first)
a687059c 2544{
c277df42
IZ
2545 register regnode *ret;
2546 register regnode *chain = NULL;
2547 register regnode *latest;
2548 I32 flags = 0, c = 0;
a0d0e21e 2549
b81d288d 2550 if (first)
c277df42
IZ
2551 ret = NULL;
2552 else {
b81d288d 2553 if (!SIZE_ONLY && RExC_extralen)
830247a4 2554 ret = reganode(pRExC_state, BRANCHJ,0);
fac92740 2555 else {
830247a4 2556 ret = reg_node(pRExC_state, BRANCH);
fac92740
MJD
2557 Set_Node_Length(ret, 1);
2558 }
c277df42
IZ
2559 }
2560
b81d288d 2561 if (!first && SIZE_ONLY)
830247a4 2562 RExC_extralen += 1; /* BRANCHJ */
b81d288d 2563
c277df42 2564 *flagp = WORST; /* Tentatively. */
a0d0e21e 2565
830247a4
IZ
2566 RExC_parse--;
2567 nextchar(pRExC_state);
2568 while (RExC_parse < RExC_end && *RExC_parse != '|' && *RExC_parse != ')') {
a0d0e21e 2569 flags &= ~TRYAGAIN;
830247a4 2570 latest = regpiece(pRExC_state, &flags);
a0d0e21e
LW
2571 if (latest == NULL) {
2572 if (flags & TRYAGAIN)
2573 continue;
2574 return(NULL);
a0ed51b3
LW
2575 }
2576 else if (ret == NULL)
c277df42 2577 ret = latest;
a0d0e21e 2578 *flagp |= flags&HASWIDTH;
c277df42 2579 if (chain == NULL) /* First piece. */
a0d0e21e
LW
2580 *flagp |= flags&SPSTART;
2581 else {
830247a4
IZ
2582 RExC_naughty++;
2583 regtail(pRExC_state, chain, latest);
a687059c 2584 }
a0d0e21e 2585 chain = latest;
c277df42
IZ
2586 c++;
2587 }
2588 if (chain == NULL) { /* Loop ran zero times. */
830247a4 2589 chain = reg_node(pRExC_state, NOTHING);
c277df42
IZ
2590 if (ret == NULL)
2591 ret = chain;
2592 }
2593 if (c == 1) {
2594 *flagp |= flags&SIMPLE;
a0d0e21e 2595 }
a687059c 2596
a0d0e21e 2597 return(ret);
a687059c
LW
2598}
2599
2600/*
2601 - regpiece - something followed by possible [*+?]
2602 *
2603 * Note that the branching code sequences used for ? and the general cases
2604 * of * and + are somewhat optimized: they use the same NOTHING node as
2605 * both the endmarker for their branch list and the body of the last branch.
2606 * It might seem that this node could be dispensed with entirely, but the
2607 * endmarker role is not redundant.
2608 */
76e3520e 2609STATIC regnode *
830247a4 2610S_regpiece(pTHX_ RExC_state_t *pRExC_state, I32 *flagp)
a687059c 2611{
c277df42 2612 register regnode *ret;
a0d0e21e
LW
2613 register char op;
2614 register char *next;
2615 I32 flags;
830247a4 2616 char *origparse = RExC_parse;
a0d0e21e
LW
2617 char *maxpos;
2618 I32 min;
c277df42 2619 I32 max = REG_INFTY;
fac92740 2620 char *parse_start;
a0d0e21e 2621
830247a4 2622 ret = regatom(pRExC_state, &flags);
a0d0e21e
LW
2623 if (ret == NULL) {
2624 if (flags & TRYAGAIN)
2625 *flagp |= TRYAGAIN;
2626 return(NULL);
2627 }
2628
830247a4 2629 op = *RExC_parse;
a0d0e21e 2630
830247a4 2631 if (op == '{' && regcurly(RExC_parse)) {
fac92740 2632 parse_start = RExC_parse; /* MJD */
830247a4 2633 next = RExC_parse + 1;
a0d0e21e
LW
2634 maxpos = Nullch;
2635 while (isDIGIT(*next) || *next == ',') {
2636 if (*next == ',') {
2637 if (maxpos)
2638 break;
2639 else
2640 maxpos = next;
a687059c 2641 }
a0d0e21e
LW
2642 next++;
2643 }
2644 if (*next == '}') { /* got one */
2645 if (!maxpos)
2646 maxpos = next;
830247a4
IZ
2647 RExC_parse++;
2648 min = atoi(RExC_parse);
a0d0e21e
LW
2649 if (*maxpos == ',')
2650 maxpos++;
2651 else
830247a4 2652 maxpos = RExC_parse;
a0d0e21e
LW
2653 max = atoi(maxpos);
2654 if (!max && *maxpos != '0')
c277df42
IZ
2655 max = REG_INFTY; /* meaning "infinity" */
2656 else if (max >= REG_INFTY)
8615cb43 2657 vFAIL2("Quantifier in {,} bigger than %d", REG_INFTY - 1);
830247a4
IZ
2658 RExC_parse = next;
2659 nextchar(pRExC_state);
a0d0e21e
LW
2660
2661 do_curly:
2662 if ((flags&SIMPLE)) {
830247a4
IZ
2663 RExC_naughty += 2 + RExC_naughty / 2;
2664 reginsert(pRExC_state, CURLY, ret);
fac92740
MJD
2665 Set_Node_Offset(ret, parse_start+1); /* MJD */
2666 Set_Node_Cur_Length(ret);
a0d0e21e
LW
2667 }
2668 else {
830247a4 2669 regnode *w = reg_node(pRExC_state, WHILEM);
2c2d71f5
JH
2670
2671 w->flags = 0;
830247a4
IZ
2672 regtail(pRExC_state, ret, w);
2673 if (!SIZE_ONLY && RExC_extralen) {
2674 reginsert(pRExC_state, LONGJMP,ret);
2675 reginsert(pRExC_state, NOTHING,ret);
c277df42
IZ
2676 NEXT_OFF(ret) = 3; /* Go over LONGJMP. */
2677 }
830247a4 2678 reginsert(pRExC_state, CURLYX,ret);
fac92740
MJD
2679 /* MJD hk */
2680 Set_Node_Offset(ret, parse_start+1);
2681 Set_Node_Length(ret,
2682 op == '{' ? (RExC_parse - parse_start) : 1);
2683
830247a4 2684 if (!SIZE_ONLY && RExC_extralen)
c277df42 2685 NEXT_OFF(ret) = 3; /* Go over NOTHING to LONGJMP. */
830247a4 2686 regtail(pRExC_state, ret, reg_node(pRExC_state, NOTHING));
c277df42 2687 if (SIZE_ONLY)
830247a4
IZ
2688 RExC_whilem_seen++, RExC_extralen += 3;
2689 RExC_naughty += 4 + RExC_naughty; /* compound interest */
a0d0e21e 2690 }
c277df42 2691 ret->flags = 0;
a0d0e21e
LW
2692
2693 if (min > 0)
821b33a5
IZ
2694 *flagp = WORST;
2695 if (max > 0)
2696 *flagp |= HASWIDTH;
a0d0e21e 2697 if (max && max < min)
8615cb43 2698 vFAIL("Can't do {n,m} with n > m");
c277df42 2699 if (!SIZE_ONLY) {
eb160463
GS
2700 ARG1_SET(ret, (U16)min);
2701 ARG2_SET(ret, (U16)max);
a687059c 2702 }
a687059c 2703
a0d0e21e 2704 goto nest_check;
a687059c 2705 }
a0d0e21e 2706 }
a687059c 2707
a0d0e21e
LW
2708 if (!ISMULT1(op)) {
2709 *flagp = flags;
a687059c 2710 return(ret);
a0d0e21e 2711 }
bb20fd44 2712
c277df42 2713#if 0 /* Now runtime fix should be reliable. */
b45f050a
JF
2714
2715 /* if this is reinstated, don't forget to put this back into perldiag:
2716
2717 =item Regexp *+ operand could be empty at {#} in regex m/%s/
2718
2719 (F) The part of the regexp subject to either the * or + quantifier
2720 could match an empty string. The {#} shows in the regular
2721 expression about where the problem was discovered.
2722
2723 */
2724
bb20fd44 2725 if (!(flags&HASWIDTH) && op != '?')
b45f050a 2726 vFAIL("Regexp *+ operand could be empty");
b81d288d 2727#endif
bb20fd44 2728
fac92740 2729 parse_start = RExC_parse;
830247a4 2730 nextchar(pRExC_state);
a0d0e21e 2731
821b33a5 2732 *flagp = (op != '+') ? (WORST|SPSTART|HASWIDTH) : (WORST|HASWIDTH);
a0d0e21e
LW
2733
2734 if (op == '*' && (flags&SIMPLE)) {
830247a4 2735 reginsert(pRExC_state, STAR, ret);
c277df42 2736 ret->flags = 0;
830247a4 2737 RExC_naughty += 4;
a0d0e21e
LW
2738 }
2739 else if (op == '*') {
2740 min = 0;
2741 goto do_curly;
a0ed51b3
LW
2742 }
2743 else if (op == '+' && (flags&SIMPLE)) {
830247a4 2744 reginsert(pRExC_state, PLUS, ret);
c277df42 2745 ret->flags = 0;
830247a4 2746 RExC_naughty += 3;
a0d0e21e
LW
2747 }
2748 else if (op == '+') {
2749 min = 1;
2750 goto do_curly;
a0ed51b3
LW
2751 }
2752 else if (op == '?') {
a0d0e21e
LW
2753 min = 0; max = 1;
2754 goto do_curly;
2755 }
2756 nest_check:
e476b1b5 2757 if (ckWARN(WARN_REGEXP) && !SIZE_ONLY && !(flags&HASWIDTH) && max > REG_INFTY/3) {
830247a4 2758 vWARN3(RExC_parse,
b45f050a 2759 "%.*s matches null string many times",
830247a4 2760 RExC_parse - origparse,
b45f050a 2761 origparse);
a0d0e21e
LW
2762 }
2763
830247a4
IZ
2764 if (*RExC_parse == '?') {
2765 nextchar(pRExC_state);
2766 reginsert(pRExC_state, MINMOD, ret);
2767 regtail(pRExC_state, ret, ret + NODE_STEP_REGNODE);
a0d0e21e 2768 }
830247a4
IZ
2769 if (ISMULT2(RExC_parse)) {
2770 RExC_parse++;
b45f050a
JF
2771 vFAIL("Nested quantifiers");
2772 }
a0d0e21e
LW
2773
2774 return(ret);
a687059c
LW
2775}
2776
2777/*
2778 - regatom - the lowest level
2779 *
2780 * Optimization: gobbles an entire sequence of ordinary characters so that
2781 * it can turn them into a single node, which is smaller to store and
2782 * faster to run. Backslashed characters are exceptions, each becoming a
2783 * separate node; the code is simpler that way and it's not worth fixing.
2784 *
b45f050a 2785 * [Yes, it is worth fixing, some scripts can run twice the speed.] */
76e3520e 2786STATIC regnode *
830247a4 2787S_regatom(pTHX_ RExC_state_t *pRExC_state, I32 *flagp)
a687059c 2788{
c277df42 2789 register regnode *ret = 0;
a0d0e21e 2790 I32 flags;
f06dbbb7 2791 char *parse_start = 0;
a0d0e21e
LW
2792
2793 *flagp = WORST; /* Tentatively. */
2794
2795tryagain:
830247a4 2796 switch (*RExC_parse) {
a0d0e21e 2797 case '^':
830247a4
IZ
2798 RExC_seen_zerolen++;
2799 nextchar(pRExC_state);
e2509266 2800 if (RExC_flags & PMf_MULTILINE)
830247a4 2801 ret = reg_node(pRExC_state, MBOL);
e2509266 2802 else if (RExC_flags & PMf_SINGLELINE)
830247a4 2803 ret = reg_node(pRExC_state, SBOL);
a0d0e21e 2804 else
830247a4 2805 ret = reg_node(pRExC_state, BOL);
fac92740 2806 Set_Node_Length(ret, 1); /* MJD */
a0d0e21e
LW
2807 break;
2808 case '$':
830247a4 2809 nextchar(pRExC_state);
b81d288d 2810 if (*RExC_parse)
830247a4 2811 RExC_seen_zerolen++;
e2509266 2812 if (RExC_flags & PMf_MULTILINE)
830247a4 2813 ret = reg_node(pRExC_state, MEOL);
e2509266 2814 else if (RExC_flags & PMf_SINGLELINE)
830247a4 2815 ret = reg_node(pRExC_state, SEOL);
a0d0e21e 2816 else
830247a4 2817 ret = reg_node(pRExC_state, EOL);
fac92740 2818 Set_Node_Length(ret, 1); /* MJD */
a0d0e21e
LW
2819 break;
2820 case '.':
830247a4 2821 nextchar(pRExC_state);
e2509266 2822 if (RExC_flags & PMf_SINGLELINE)
ffc61ed2
JH
2823 ret = reg_node(pRExC_state, SANY);
2824 else
2825 ret = reg_node(pRExC_state, REG_ANY);
2826 *flagp |= HASWIDTH|SIMPLE;
830247a4 2827 RExC_naughty++;
fac92740 2828 Set_Node_Length(ret, 1); /* MJD */
a0d0e21e
LW
2829 break;
2830 case '[':
b45f050a 2831 {
830247a4 2832 char *oregcomp_parse = ++RExC_parse;
ffc61ed2 2833 ret = regclass(pRExC_state);
830247a4
IZ
2834 if (*RExC_parse != ']') {
2835 RExC_parse = oregcomp_parse;
b45f050a
JF
2836 vFAIL("Unmatched [");
2837 }
830247a4 2838 nextchar(pRExC_state);
a0d0e21e 2839 *flagp |= HASWIDTH|SIMPLE;
fac92740 2840 Set_Node_Length(ret, RExC_parse - oregcomp_parse + 1); /* MJD */
a0d0e21e 2841 break;
b45f050a 2842 }
a0d0e21e 2843 case '(':
830247a4
IZ
2844 nextchar(pRExC_state);
2845 ret = reg(pRExC_state, 1, &flags);
a0d0e21e 2846 if (ret == NULL) {
bf93d4cc 2847 if (flags & TRYAGAIN) {
830247a4 2848 if (RExC_parse == RExC_end) {
bf93d4cc
GS
2849 /* Make parent create an empty node if needed. */
2850 *flagp |= TRYAGAIN;
2851 return(NULL);
2852 }
a0d0e21e 2853 goto tryagain;
bf93d4cc 2854 }
a0d0e21e
LW
2855 return(NULL);
2856 }
c277df42 2857 *flagp |= flags&(HASWIDTH|SPSTART|SIMPLE);
a0d0e21e
LW
2858 break;
2859 case '|':
2860 case ')':
2861 if (flags & TRYAGAIN) {
2862 *flagp |= TRYAGAIN;
2863 return NULL;
2864 }
b45f050a 2865 vFAIL("Internal urp");
a0d0e21e
LW
2866 /* Supposed to be caught earlier. */
2867 break;
85afd4ae 2868 case '{':
830247a4
IZ
2869 if (!regcurly(RExC_parse)) {
2870 RExC_parse++;
85afd4ae
CS
2871 goto defchar;
2872 }
2873 /* FALL THROUGH */
a0d0e21e
LW
2874 case '?':
2875 case '+':
2876 case '*':
830247a4 2877 RExC_parse++;
b45f050a 2878 vFAIL("Quantifier follows nothing");
a0d0e21e
LW
2879 break;
2880 case '\\':
830247a4 2881 switch (*++RExC_parse) {
a0d0e21e 2882 case 'A':
830247a4
IZ
2883 RExC_seen_zerolen++;
2884 ret = reg_node(pRExC_state, SBOL);
a0d0e21e 2885 *flagp |= SIMPLE;
830247a4 2886 nextchar(pRExC_state);
fac92740 2887 Set_Node_Length(ret, 2); /* MJD */
a0d0e21e
LW
2888 break;
2889 case 'G':
830247a4
IZ
2890 ret = reg_node(pRExC_state, GPOS);
2891 RExC_seen |= REG_SEEN_GPOS;
a0d0e21e 2892 *flagp |= SIMPLE;
830247a4 2893 nextchar(pRExC_state);
fac92740 2894 Set_Node_Length(ret, 2); /* MJD */
a0d0e21e
LW
2895 break;
2896 case 'Z':
830247a4 2897 ret = reg_node(pRExC_state, SEOL);
a0d0e21e 2898 *flagp |= SIMPLE;
a1917ab9 2899 RExC_seen_zerolen++; /* Do not optimize RE away */
830247a4 2900 nextchar(pRExC_state);
a0d0e21e 2901 break;
b85d18e9 2902 case 'z':
830247a4 2903 ret = reg_node(pRExC_state, EOS);
b85d18e9 2904 *flagp |= SIMPLE;
830247a4
IZ
2905 RExC_seen_zerolen++; /* Do not optimize RE away */
2906 nextchar(pRExC_state);
fac92740 2907 Set_Node_Length(ret, 2); /* MJD */
b85d18e9 2908 break;
4a2d328f 2909 case 'C':
f33976b4
DB
2910 ret = reg_node(pRExC_state, CANY);
2911 RExC_seen |= REG_SEEN_CANY;
a0ed51b3 2912 *flagp |= HASWIDTH|SIMPLE;
830247a4 2913 nextchar(pRExC_state);
fac92740 2914 Set_Node_Length(ret, 2); /* MJD */
a0ed51b3
LW
2915 break;
2916 case 'X':
830247a4 2917 ret = reg_node(pRExC_state, CLUMP);
a0ed51b3 2918 *flagp |= HASWIDTH;
830247a4 2919 nextchar(pRExC_state);
fac92740 2920 Set_Node_Length(ret, 2); /* MJD */
a0ed51b3 2921 break;
a0d0e21e 2922 case 'w':
eb160463 2923 ret = reg_node(pRExC_state, (U8)(LOC ? ALNUML : ALNUM));
a0d0e21e 2924 *flagp |= HASWIDTH|SIMPLE;
830247a4 2925 nextchar(pRExC_state);
fac92740 2926 Set_Node_Length(ret, 2); /* MJD */
a0d0e21e
LW
2927 break;
2928 case 'W':
eb160463 2929 ret = reg_node(pRExC_state, (U8)(LOC ? NALNUML : NALNUM));
a0d0e21e 2930 *flagp |= HASWIDTH|SIMPLE;
830247a4 2931 nextchar(pRExC_state);
fac92740 2932 Set_Node_Length(ret, 2); /* MJD */
a0d0e21e
LW
2933 break;
2934 case 'b':
830247a4
IZ
2935 RExC_seen_zerolen++;
2936 RExC_seen |= REG_SEEN_LOOKBEHIND;
eb160463 2937 ret = reg_node(pRExC_state, (U8)(LOC ? BOUNDL : BOUND));
a0d0e21e 2938 *flagp |= SIMPLE;
830247a4 2939 nextchar(pRExC_state);
fac92740 2940 Set_Node_Length(ret, 2); /* MJD */
a0d0e21e
LW
2941 break;
2942 case 'B':
830247a4
IZ
2943 RExC_seen_zerolen++;
2944 RExC_seen |= REG_SEEN_LOOKBEHIND;
eb160463 2945 ret = reg_node(pRExC_state, (U8)(LOC ? NBOUNDL : NBOUND));
a0d0e21e 2946 *flagp |= SIMPLE;
830247a4 2947 nextchar(pRExC_state);
fac92740 2948 Set_Node_Length(ret, 2); /* MJD */
a0d0e21e
LW
2949 break;
2950 case 's':
eb160463 2951 ret = reg_node(pRExC_state, (U8)(LOC ? SPACEL : SPACE));
a0d0e21e 2952 *flagp |= HASWIDTH|SIMPLE;
830247a4 2953 nextchar(pRExC_state);
fac92740 2954 Set_Node_Length(ret, 2); /* MJD */
a0d0e21e
LW
2955 break;
2956 case 'S':
eb160463 2957 ret = reg_node(pRExC_state, (U8)(LOC ? NSPACEL : NSPACE));
a0d0e21e 2958 *flagp |= HASWIDTH|SIMPLE;
830247a4 2959 nextchar(pRExC_state);
fac92740 2960 Set_Node_Length(ret, 2); /* MJD */
a0d0e21e
LW
2961 break;
2962 case 'd':
ffc61ed2 2963 ret = reg_node(pRExC_state, DIGIT);
a0d0e21e 2964 *flagp |= HASWIDTH|SIMPLE;
830247a4 2965 nextchar(pRExC_state);
fac92740 2966 Set_Node_Length(ret, 2); /* MJD */
a0d0e21e
LW
2967 break;
2968 case 'D':
ffc61ed2 2969 ret = reg_node(pRExC_state, NDIGIT);
a0d0e21e 2970 *flagp |= HASWIDTH|SIMPLE;
830247a4 2971 nextchar(pRExC_state);
fac92740 2972 Set_Node_Length(ret, 2); /* MJD */
a0d0e21e 2973 break;
a14b48bc
LW
2974 case 'p':
2975 case 'P':
3568d838 2976 {
830247a4 2977 char* oldregxend = RExC_end;
ccb2c380 2978 char* parse_start = RExC_parse - 2;
a14b48bc 2979
830247a4 2980 if (RExC_parse[1] == '{') {
3568d838 2981 /* a lovely hack--pretend we saw [\pX] instead */
830247a4
IZ
2982 RExC_end = strchr(RExC_parse, '}');
2983 if (!RExC_end) {
0da60cf5 2984 U8 c = (U8)*RExC_parse;
830247a4
IZ
2985 RExC_parse += 2;
2986 RExC_end = oldregxend;
0da60cf5 2987 vFAIL2("Missing right brace on \\%c{}", c);
b45f050a 2988 }
830247a4 2989 RExC_end++;
a14b48bc 2990 }
af6f566e 2991 else {
830247a4 2992 RExC_end = RExC_parse + 2;
af6f566e
HS
2993 if (RExC_end > oldregxend)
2994 RExC_end = oldregxend;
2995 }
830247a4 2996 RExC_parse--;
a14b48bc 2997
ffc61ed2 2998 ret = regclass(pRExC_state);
a14b48bc 2999
830247a4
IZ
3000 RExC_end = oldregxend;
3001 RExC_parse--;
ccb2c380
MP
3002
3003 Set_Node_Offset(ret, parse_start + 2);
3004 Set_Node_Cur_Length(ret);
830247a4 3005 nextchar(pRExC_state);
a14b48bc
LW
3006 *flagp |= HASWIDTH|SIMPLE;
3007 }
3008 break;
a0d0e21e
LW
3009 case 'n':
3010 case 'r':
3011 case 't':
3012 case 'f':
3013 case 'e':
3014 case 'a':
3015 case 'x':
3016 case 'c':
3017 case '0':
3018 goto defchar;
3019 case '1': case '2': case '3': case '4':
3020 case '5': case '6': case '7': case '8': case '9':
3021 {
830247a4 3022 I32 num = atoi(RExC_parse);
a0d0e21e 3023
830247a4 3024 if (num > 9 && num >= RExC_npar)
a0d0e21e
LW
3025 goto defchar;
3026 else {
fac92740 3027 char * parse_start = RExC_parse - 1; /* MJD */
830247a4
IZ
3028 while (isDIGIT(*RExC_parse))
3029 RExC_parse++;
b45f050a 3030
eb160463 3031 if (!SIZE_ONLY && num > (I32)RExC_rx->nparens)
9baa0206 3032 vFAIL("Reference to nonexistent group");
830247a4 3033 RExC_sawback = 1;
eb160463
GS
3034 ret = reganode(pRExC_state,
3035 (U8)(FOLD ? (LOC ? REFFL : REFF) : REF),
3036 num);
a0d0e21e 3037 *flagp |= HASWIDTH;
fac92740
MJD
3038
3039 /* override incorrect value set in reganode MJD */
3040 Set_Node_Offset(ret, parse_start+1);
3041 Set_Node_Cur_Length(ret); /* MJD */
830247a4
IZ
3042 RExC_parse--;
3043 nextchar(pRExC_state);
a0d0e21e
LW
3044 }
3045 }
3046 break;
3047 case '\0':
830247a4 3048 if (RExC_parse >= RExC_end)
b45f050a 3049 FAIL("Trailing \\");
a0d0e21e
LW
3050 /* FALL THROUGH */
3051 default:
c9f97d15
IZ
3052 /* Do not generate `unrecognized' warnings here, we fall
3053 back into the quick-grab loop below */
a0d0e21e
LW
3054 goto defchar;
3055 }
3056 break;
4633a7c4
LW
3057
3058 case '#':
e2509266 3059 if (RExC_flags & PMf_EXTENDED) {
830247a4
IZ
3060 while (RExC_parse < RExC_end && *RExC_parse != '\n') RExC_parse++;
3061 if (RExC_parse < RExC_end)
4633a7c4
LW
3062 goto tryagain;
3063 }
3064 /* FALL THROUGH */
3065
a0d0e21e 3066 default: {
ba210ebe 3067 register STRLEN len;
58ae7d3f 3068 register UV ender;
a0d0e21e 3069 register char *p;
c277df42 3070 char *oldp, *s;
ba210ebe 3071 STRLEN numlen;
80aecb99 3072 STRLEN foldlen;
60a8b682 3073 U8 tmpbuf[UTF8_MAXLEN_FOLD+1], *foldbuf;
f06dbbb7
JH
3074
3075 parse_start = RExC_parse - 1;
a0d0e21e 3076
830247a4 3077 RExC_parse++;
a0d0e21e
LW
3078
3079 defchar:
58ae7d3f 3080 ender = 0;
eb160463
GS
3081 ret = reg_node(pRExC_state,
3082 (U8)(FOLD ? (LOC ? EXACTFL : EXACTF) : EXACT));
cd439c50 3083 s = STRING(ret);
830247a4
IZ
3084 for (len = 0, p = RExC_parse - 1;
3085 len < 127 && p < RExC_end;
a0d0e21e
LW
3086 len++)
3087 {
3088 oldp = p;
5b5a24f7 3089
e2509266 3090 if (RExC_flags & PMf_EXTENDED)
830247a4 3091 p = regwhite(p, RExC_end);
a0d0e21e
LW
3092 switch (*p) {
3093 case '^':
3094 case '$':
3095 case '.':
3096 case '[':
3097 case '(':
3098 case ')':
3099 case '|':
3100 goto loopdone;
3101 case '\\':
3102 switch (*++p) {
3103 case 'A':
1ed8eac0
JF
3104 case 'C':
3105 case 'X':
a0d0e21e
LW
3106 case 'G':
3107 case 'Z':
b85d18e9 3108 case 'z':
a0d0e21e
LW
3109 case 'w':
3110 case 'W':
3111 case 'b':
3112 case 'B':
3113 case 's':
3114 case 'S':
3115 case 'd':
3116 case 'D':
a14b48bc
LW
3117 case 'p':
3118 case 'P':
a0d0e21e
LW
3119 --p;
3120 goto loopdone;
3121 case 'n':
3122 ender = '\n';
3123 p++;
a687059c 3124 break;
a0d0e21e
LW
3125 case 'r':
3126 ender = '\r';
3127 p++;
a687059c 3128 break;
a0d0e21e
LW
3129 case 't':
3130 ender = '\t';
3131 p++;
a687059c 3132 break;
a0d0e21e
LW
3133 case 'f':
3134 ender = '\f';
3135 p++;
a687059c 3136 break;
a0d0e21e 3137 case 'e':
c7f1f016 3138 ender = ASCII_TO_NATIVE('\033');
a0d0e21e 3139 p++;
a687059c 3140 break;
a0d0e21e 3141 case 'a':
c7f1f016 3142 ender = ASCII_TO_NATIVE('\007');
a0d0e21e 3143 p++;
a687059c 3144 break;
a0d0e21e 3145 case 'x':
a0ed51b3
LW
3146 if (*++p == '{') {
3147 char* e = strchr(p, '}');
b81d288d 3148
b45f050a 3149 if (!e) {
830247a4 3150 RExC_parse = p + 1;
b45f050a
JF
3151 vFAIL("Missing right brace on \\x{}");
3152 }
de5f0749 3153 else {
a4c04bdc
NC
3154 I32 flags = PERL_SCAN_ALLOW_UNDERSCORES
3155 | PERL_SCAN_DISALLOW_PREFIX;
53305cf1
NC
3156 numlen = e - p - 1;
3157 ender = grok_hex(p + 1, &numlen, &flags, NULL);
aaa80028
JH
3158 if (ender > 0xff)
3159 RExC_utf8 = 1;
b21ed0a9
GS
3160 /* numlen is generous */
3161 if (numlen + len >= 127) {
a0ed51b3
LW
3162 p--;
3163 goto loopdone;
3164 }
3165 p = e + 1;
3166 }
a0ed51b3
LW
3167 }
3168 else {
a4c04bdc 3169 I32 flags = PERL_SCAN_DISALLOW_PREFIX;
53305cf1
NC
3170 numlen = 2;
3171 ender = grok_hex(p, &numlen, &flags, NULL);
a0ed51b3
LW
3172 p += numlen;
3173 }
a687059c 3174 break;
a0d0e21e
LW
3175 case 'c':
3176 p++;
bbce6d69 3177 ender = UCHARAT(p++);
3178 ender = toCTRL(ender);
a687059c 3179 break;
a0d0e21e
LW
3180 case '0': case '1': case '2': case '3':case '4':
3181 case '5': case '6': case '7': case '8':case '9':
3182 if (*p == '0' ||
830247a4 3183 (isDIGIT(p[1]) && atoi(p) >= RExC_npar) ) {
53305cf1
NC
3184 I32 flags = 0;
3185 numlen = 3;
3186 ender = grok_oct(p, &numlen, &flags, NULL);
a0d0e21e
LW
3187 p += numlen;
3188 }
3189 else {
3190 --p;
3191 goto loopdone;
a687059c
LW
3192 }
3193 break;
a0d0e21e 3194 case '\0':
830247a4 3195 if (p >= RExC_end)
b45f050a 3196 FAIL("Trailing \\");
a687059c 3197 /* FALL THROUGH */
a0d0e21e 3198 default:
e476b1b5 3199 if (!SIZE_ONLY && ckWARN(WARN_REGEXP) && isALPHA(*p))
4193bef7 3200 vWARN2(p + 1, "Unrecognized escape \\%c passed through", UCHARAT(p));
a0ed51b3 3201 goto normal_default;
a0d0e21e
LW
3202 }
3203 break;
a687059c 3204 default:
a0ed51b3 3205 normal_default:
fd400ab9 3206 if (UTF8_IS_START(*p) && UTF) {
5e12f4fb 3207 ender = utf8n_to_uvchr((U8*)p, RExC_end - p,
ba210ebe 3208 &numlen, 0);
a0ed51b3
LW
3209 p += numlen;
3210 }
3211 else
3212 ender = *p++;
a0d0e21e 3213 break;
a687059c 3214 }
e2509266 3215 if (RExC_flags & PMf_EXTENDED)
830247a4 3216 p = regwhite(p, RExC_end);
60a8b682
JH
3217 if (UTF && FOLD) {
3218 /* Prime the casefolded buffer. */
ac7e0132 3219 ender = toFOLD_uni(ender, tmpbuf, &foldlen);
60a8b682 3220 }
a0d0e21e
LW
3221 if (ISMULT2(p)) { /* Back off on ?+*. */
3222 if (len)
3223 p = oldp;
16ea2a2e 3224 else if (UTF) {
0ebc6274
JH
3225 STRLEN unilen;
3226
80aecb99 3227 if (FOLD) {
60a8b682 3228 /* Emit all the Unicode characters. */
80aecb99
JH
3229 for (foldbuf = tmpbuf;
3230 foldlen;
3231 foldlen -= numlen) {
3232 ender = utf8_to_uvchr(foldbuf, &numlen);
9dc45d57 3233 if (numlen > 0) {
0ebc6274
JH
3234 reguni(pRExC_state, ender, s, &unilen);
3235 s += unilen;
3236 len += unilen;
3237 /* In EBCDIC the numlen
3238 * and unilen can differ. */
9dc45d57 3239 foldbuf += numlen;
47654450
JH
3240 if (numlen >= foldlen)
3241 break;
9dc45d57
JH
3242 }
3243 else
3244 break; /* "Can't happen." */
80aecb99
JH
3245 }
3246 }
3247 else {
0ebc6274 3248 reguni(pRExC_state, ender, s, &unilen);
9ede7db1 3249 if (unilen > 0) {
0ebc6274
JH
3250 s += unilen;
3251 len += unilen;
9dc45d57 3252 }
80aecb99 3253 }
a0ed51b3 3254 }
a0d0e21e
LW
3255 else {
3256 len++;
eb160463 3257 REGC((char)ender, s++);
a0d0e21e
LW
3258 }
3259 break;
a687059c 3260 }
16ea2a2e 3261 if (UTF) {
0ebc6274
JH
3262 STRLEN unilen;
3263
80aecb99 3264 if (FOLD) {
60a8b682 3265 /* Emit all the Unicode characters. */
80aecb99
JH
3266 for (foldbuf = tmpbuf;
3267 foldlen;
3268 foldlen -= numlen) {
3269 ender = utf8_to_uvchr(foldbuf, &numlen);
9dc45d57 3270 if (numlen > 0) {
0ebc6274
JH
3271 reguni(pRExC_state, ender, s, &unilen);
3272 len += unilen;
3273 s += unilen;
3274 /* In EBCDIC the numlen
3275 * and unilen can differ. */
9dc45d57 3276 foldbuf += numlen;
47654450
JH
3277 if (numlen >= foldlen)
3278 break;
9dc45d57
JH
3279 }
3280 else
3281 break;
80aecb99
JH
3282 }
3283 }
3284 else {
0ebc6274 3285 reguni(pRExC_state, ender, s, &unilen);
9ede7db1 3286 if (unilen > 0) {
0ebc6274
JH
3287 s += unilen;
3288 len += unilen;
9dc45d57 3289 }
80aecb99
JH
3290 }
3291 len--;
a0ed51b3
LW
3292 }
3293 else
eb160463 3294 REGC((char)ender, s++);
a0d0e21e
LW
3295 }
3296 loopdone:
830247a4 3297 RExC_parse = p - 1;
fac92740 3298 Set_Node_Cur_Length(ret); /* MJD */
830247a4 3299 nextchar(pRExC_state);
793db0cb
JH
3300 {
3301 /* len is STRLEN which is unsigned, need to copy to signed */
3302 IV iv = len;
3303 if (iv < 0)
3304 vFAIL("Internal disaster");
3305 }
a0d0e21e
LW
3306 if (len > 0)
3307 *flagp |= HASWIDTH;
3308 if (len == 1)
3309 *flagp |= SIMPLE;
c277df42 3310 if (!SIZE_ONLY)
cd439c50
IZ
3311 STR_LEN(ret) = len;
3312 if (SIZE_ONLY)
830247a4 3313 RExC_size += STR_SZ(len);
cd439c50 3314 else
830247a4 3315 RExC_emit += STR_SZ(len);
a687059c 3316 }
a0d0e21e
LW
3317 break;
3318 }
a687059c 3319
60a8b682
JH
3320 /* If the encoding pragma is in effect recode the text of
3321 * any EXACT-kind nodes. */
22c54be3 3322 if (PL_encoding && PL_regkind[(U8)OP(ret)] == EXACT) {
d0063567
DK
3323 STRLEN oldlen = STR_LEN(ret);
3324 SV *sv = sv_2mortal(newSVpvn(STRING(ret), oldlen));
3325
3326 if (RExC_utf8)
3327 SvUTF8_on(sv);
3328 if (sv_utf8_downgrade(sv, TRUE)) {
3329 char *s = sv_recode_to_utf8(sv, PL_encoding);
3330 STRLEN newlen = SvCUR(sv);
3331
3332 if (SvUTF8(sv))
3333 RExC_utf8 = 1;
3334 if (!SIZE_ONLY) {
3335 DEBUG_r(PerlIO_printf(Perl_debug_log, "recode %*s to %*s\n",
3336 (int)oldlen, STRING(ret),
3337 (int)newlen, s));
3338 Copy(s, STRING(ret), newlen, char);
3339 STR_LEN(ret) += newlen - oldlen;
3340 RExC_emit += STR_SZ(newlen) - STR_SZ(oldlen);
3341 } else
3342 RExC_size += STR_SZ(newlen) - STR_SZ(oldlen);
3343 }
a72c7584
JH
3344 }
3345
a0d0e21e 3346 return(ret);
a687059c
LW
3347}
3348
873ef191 3349STATIC char *
cea2e8a9 3350S_regwhite(pTHX_ char *p, char *e)
5b5a24f7
CS
3351{
3352 while (p < e) {
3353 if (isSPACE(*p))
3354 ++p;
3355 else if (*p == '#') {
3356 do {
3357 p++;
3358 } while (p < e && *p != '\n');
3359 }
3360 else
3361 break;
3362 }
3363 return p;
3364}
3365
b8c5462f
JH
3366/* Parse POSIX character classes: [[:foo:]], [[=foo=]], [[.foo.]].
3367 Character classes ([:foo:]) can also be negated ([:^foo:]).
3368 Returns a named class id (ANYOF_XXX) if successful, -1 otherwise.
3369 Equivalence classes ([=foo=]) and composites ([.foo.]) are parsed,
beeb77fc 3370 but trigger failures because they are currently unimplemented. */
9a86a77b
JH
3371
3372#define POSIXCC_DONE(c) ((c) == ':')
3373#define POSIXCC_NOTYET(c) ((c) == '=' || (c) == '.')
3374#define POSIXCC(c) (POSIXCC_DONE(c) || POSIXCC_NOTYET(c))
3375
b8c5462f 3376STATIC I32
830247a4 3377S_regpposixcc(pTHX_ RExC_state_t *pRExC_state, I32 value)
620e46c5
JH
3378{
3379 char *posixcc = 0;
936ed897 3380 I32 namedclass = OOB_NAMEDCLASS;
620e46c5 3381
830247a4 3382 if (value == '[' && RExC_parse + 1 < RExC_end &&
620e46c5 3383 /* I smell either [: or [= or [. -- POSIX has been here, right? */
9a86a77b
JH
3384 POSIXCC(UCHARAT(RExC_parse))) {
3385 char c = UCHARAT(RExC_parse);
830247a4 3386 char* s = RExC_parse++;
b81d288d 3387
9a86a77b 3388 while (RExC_parse < RExC_end && UCHARAT(RExC_parse) != c)
830247a4
IZ
3389 RExC_parse++;
3390 if (RExC_parse == RExC_end)
620e46c5 3391 /* Grandfather lone [:, [=, [. */
830247a4 3392 RExC_parse = s;
620e46c5 3393 else {
830247a4 3394 char* t = RExC_parse++; /* skip over the c */
b8c5462f 3395
9a86a77b 3396 if (UCHARAT(RExC_parse) == ']') {
830247a4 3397 RExC_parse++; /* skip over the ending ] */
b8c5462f
JH
3398 posixcc = s + 1;
3399 if (*s == ':') {
3400 I32 complement = *posixcc == '^' ? *posixcc++ : 0;
3401 I32 skip = 5; /* the most common skip */
3402
3403 switch (*posixcc) {
3404 case 'a':
3405 if (strnEQ(posixcc, "alnum", 5))
3406 namedclass =
3407 complement ? ANYOF_NALNUMC : ANYOF_ALNUMC;
3408 else if (strnEQ(posixcc, "alpha", 5))
3409 namedclass =
3410 complement ? ANYOF_NALPHA : ANYOF_ALPHA;
3411 else if (strnEQ(posixcc, "ascii", 5))
3412 namedclass =
3413 complement ? ANYOF_NASCII : ANYOF_ASCII;
3414 break;
aaa51d5e
JF
3415 case 'b':
3416 if (strnEQ(posixcc, "blank", 5))
3417 namedclass =
3418 complement ? ANYOF_NBLANK : ANYOF_BLANK;
3419 break;
b8c5462f
JH
3420 case 'c':
3421 if (strnEQ(posixcc, "cntrl", 5))
3422 namedclass =
3423 complement ? ANYOF_NCNTRL : ANYOF_CNTRL;
3424 break;
3425 case 'd':
3426 if (strnEQ(posixcc, "digit", 5))
3427 namedclass =
3428 complement ? ANYOF_NDIGIT : ANYOF_DIGIT;
3429 break;
3430 case 'g':
3431 if (strnEQ(posixcc, "graph", 5))
3432 namedclass =
3433 complement ? ANYOF_NGRAPH : ANYOF_GRAPH;
3434 break;
3435 case 'l':
3436 if (strnEQ(posixcc, "lower", 5))
3437 namedclass =
3438 complement ? ANYOF_NLOWER : ANYOF_LOWER;
3439 break;
3440 case 'p':
3441 if (strnEQ(posixcc, "print", 5))
3442 namedclass =
3443 complement ? ANYOF_NPRINT : ANYOF_PRINT;
3444 else if (strnEQ(posixcc, "punct", 5))
3445 namedclass =
3446 complement ? ANYOF_NPUNCT : ANYOF_PUNCT;
3447 break;
3448 case 's':
3449 if (strnEQ(posixcc, "space", 5))
3450 namedclass =
aaa51d5e 3451 complement ? ANYOF_NPSXSPC : ANYOF_PSXSPC;
cc4319de 3452 break;
b8c5462f
JH
3453 case 'u':
3454 if (strnEQ(posixcc, "upper", 5))
3455 namedclass =
3456 complement ? ANYOF_NUPPER : ANYOF_UPPER;
3457 break;
3458 case 'w': /* this is not POSIX, this is the Perl \w */
3459 if (strnEQ(posixcc, "word", 4)) {
3460 namedclass =
3461 complement ? ANYOF_NALNUM : ANYOF_ALNUM;
3462 skip = 4;
3463 }
3464 break;
3465 case 'x':
3466 if (strnEQ(posixcc, "xdigit", 6)) {
3467 namedclass =
3468 complement ? ANYOF_NXDIGIT : ANYOF_XDIGIT;
3469 skip = 6;
3470 }
3471 break;
3472 }
ac561586
JH
3473 if (namedclass == OOB_NAMEDCLASS ||
3474 posixcc[skip] != ':' ||
3475 posixcc[skip+1] != ']')
b45f050a
JF
3476 {
3477 Simple_vFAIL3("POSIX class [:%.*s:] unknown",
3478 t - s - 1, s + 1);
3479 }
3480 } else if (!SIZE_ONLY) {
b8c5462f 3481 /* [[=foo=]] and [[.foo.]] are still future. */
b45f050a 3482
830247a4 3483 /* adjust RExC_parse so the warning shows after
b45f050a 3484 the class closes */
9a86a77b 3485 while (UCHARAT(RExC_parse) && UCHARAT(RExC_parse) != ']')
830247a4 3486 RExC_parse++;
b45f050a
JF
3487 Simple_vFAIL3("POSIX syntax [%c %c] is reserved for future extensions", c, c);
3488 }
b8c5462f
JH
3489 } else {
3490 /* Maternal grandfather: