This is a live mirror of the Perl 5 development currently hosted at https://github.com/perl/perl5
Re: sitecustomize.pl [PATCH]
[perl5.git] / regcomp.c
CommitLineData
a0d0e21e
LW
1/* regcomp.c
2 */
3
4/*
5 * "A fair jaw-cracker dwarf-language must be." --Samwise Gamgee
6 */
7
61296642
DM
8/* This file contains functions for compiling a regular expression. See
9 * also regexec.c which funnily enough, contains functions for executing
166f8a29 10 * a regular expression.
e4a054ea
DM
11 *
12 * This file is also copied at build time to ext/re/re_comp.c, where
13 * it's built with -DPERL_EXT_RE_BUILD -DPERL_EXT_RE_DEBUG -DPERL_EXT.
14 * This causes the main functions to be compiled under new names and with
15 * debugging support added, which makes "use re 'debug'" work.
166f8a29
DM
16 */
17
a687059c
LW
18/* NOTE: this is derived from Henry Spencer's regexp code, and should not
19 * confused with the original package (see point 3 below). Thanks, Henry!
20 */
21
22/* Additional note: this code is very heavily munged from Henry's version
23 * in places. In some spots I've traded clarity for efficiency, so don't
24 * blame Henry for some of the lack of readability.
25 */
26
e50aee73
AD
27/* The names of the functions have been changed from regcomp and
28 * regexec to pregcomp and pregexec in order to avoid conflicts
29 * with the POSIX routines of the same names.
30*/
31
b9d5759e
AD
32#ifdef PERL_EXT_RE_BUILD
33/* need to replace pregcomp et al, so enable that */
34# ifndef PERL_IN_XSUB_RE
35# define PERL_IN_XSUB_RE
36# endif
37/* need access to debugger hooks */
cad2e5aa 38# if defined(PERL_EXT_RE_DEBUG) && !defined(DEBUGGING)
b9d5759e
AD
39# define DEBUGGING
40# endif
41#endif
42
43#ifdef PERL_IN_XSUB_RE
d06ea78c 44/* We *really* need to overwrite these symbols: */
56953603
IZ
45# define Perl_pregcomp my_regcomp
46# define Perl_regdump my_regdump
47# define Perl_regprop my_regprop
d06ea78c 48# define Perl_pregfree my_regfree
cad2e5aa
JH
49# define Perl_re_intuit_string my_re_intuit_string
50/* *These* symbols are masked to allow static link. */
d06ea78c 51# define Perl_regnext my_regnext
f0b8d043 52# define Perl_save_re_context my_save_re_context
b81d288d 53# define Perl_reginitcolors my_reginitcolors
c5be433b
GS
54
55# define PERL_NO_GET_CONTEXT
b81d288d 56#endif
56953603 57
f0fcb552 58/*SUPPRESS 112*/
a687059c 59/*
e50aee73 60 * pregcomp and pregexec -- regsub and regerror are not used in perl
a687059c
LW
61 *
62 * Copyright (c) 1986 by University of Toronto.
63 * Written by Henry Spencer. Not derived from licensed software.
64 *
65 * Permission is granted to anyone to use this software for any
66 * purpose on any computer system, and to redistribute it freely,
67 * subject to the following restrictions:
68 *
69 * 1. The author is not responsible for the consequences of use of
70 * this software, no matter how awful, even if they arise
71 * from defects in it.
72 *
73 * 2. The origin of this software must not be misrepresented, either
74 * by explicit claim or by omission.
75 *
76 * 3. Altered versions must be plainly marked as such, and must not
77 * be misrepresented as being the original software.
78 *
79 *
80 **** Alterations to Henry's code are...
81 ****
4bb101f2 82 **** Copyright (C) 1991, 1992, 1993, 1994, 1995, 1996, 1997, 1998, 1999,
241d1a3b 83 **** 2000, 2001, 2002, 2003, 2004, 2005, by Larry Wall and others
a687059c 84 ****
9ef589d8
LW
85 **** You may distribute under the terms of either the GNU General Public
86 **** License or the Artistic License, as specified in the README file.
87
a687059c
LW
88 *
89 * Beware that some of this code is subtly aware of the way operator
90 * precedence is structured in regular expressions. Serious changes in
91 * regular-expression syntax might require a total rethink.
92 */
93#include "EXTERN.h"
864dbfa3 94#define PERL_IN_REGCOMP_C
a687059c 95#include "perl.h"
d06ea78c 96
acfe0abc 97#ifndef PERL_IN_XSUB_RE
d06ea78c
GS
98# include "INTERN.h"
99#endif
c277df42
IZ
100
101#define REG_COMP_C
a687059c
LW
102#include "regcomp.h"
103
d4cce5f1 104#ifdef op
11343788 105#undef op
d4cce5f1 106#endif /* op */
11343788 107
fe14fcc3 108#ifdef MSDOS
7e4e8c89 109# if defined(BUGGY_MSC6)
fe14fcc3 110 /* MSC 6.00A breaks on op/regexp.t test 85 unless we turn this off */
7e4e8c89 111# pragma optimize("a",off)
fe14fcc3 112 /* But MSC 6.00A is happy with 'w', for aliases only across function calls*/
7e4e8c89
NC
113# pragma optimize("w",on )
114# endif /* BUGGY_MSC6 */
fe14fcc3
LW
115#endif /* MSDOS */
116
a687059c
LW
117#ifndef STATIC
118#define STATIC static
119#endif
120
830247a4 121typedef struct RExC_state_t {
e2509266 122 U32 flags; /* are we folding, multilining? */
830247a4
IZ
123 char *precomp; /* uncompiled string. */
124 regexp *rx;
fac92740 125 char *start; /* Start of input for compile */
830247a4
IZ
126 char *end; /* End of input for compile */
127 char *parse; /* Input-scan pointer. */
128 I32 whilem_seen; /* number of WHILEM in this expr */
fac92740 129 regnode *emit_start; /* Start of emitted-code area */
ffc61ed2 130 regnode *emit; /* Code-emit pointer; &regdummy = don't = compiling */
830247a4
IZ
131 I32 naughty; /* How bad is this pattern? */
132 I32 sawback; /* Did we see \1, ...? */
133 U32 seen;
134 I32 size; /* Code size. */
135 I32 npar; /* () count. */
136 I32 extralen;
137 I32 seen_zerolen;
138 I32 seen_evals;
1aa99e6b 139 I32 utf8;
830247a4
IZ
140#if ADD_TO_REGEXEC
141 char *starttry; /* -Dr: where regtry was called. */
142#define RExC_starttry (pRExC_state->starttry)
143#endif
144} RExC_state_t;
145
e2509266 146#define RExC_flags (pRExC_state->flags)
830247a4
IZ
147#define RExC_precomp (pRExC_state->precomp)
148#define RExC_rx (pRExC_state->rx)
fac92740 149#define RExC_start (pRExC_state->start)
830247a4
IZ
150#define RExC_end (pRExC_state->end)
151#define RExC_parse (pRExC_state->parse)
152#define RExC_whilem_seen (pRExC_state->whilem_seen)
fac92740 153#define RExC_offsets (pRExC_state->rx->offsets) /* I am not like the others */
830247a4 154#define RExC_emit (pRExC_state->emit)
fac92740 155#define RExC_emit_start (pRExC_state->emit_start)
830247a4
IZ
156#define RExC_naughty (pRExC_state->naughty)
157#define RExC_sawback (pRExC_state->sawback)
158#define RExC_seen (pRExC_state->seen)
159#define RExC_size (pRExC_state->size)
160#define RExC_npar (pRExC_state->npar)
161#define RExC_extralen (pRExC_state->extralen)
162#define RExC_seen_zerolen (pRExC_state->seen_zerolen)
163#define RExC_seen_evals (pRExC_state->seen_evals)
1aa99e6b 164#define RExC_utf8 (pRExC_state->utf8)
830247a4 165
a687059c
LW
166#define ISMULT1(c) ((c) == '*' || (c) == '+' || (c) == '?')
167#define ISMULT2(s) ((*s) == '*' || (*s) == '+' || (*s) == '?' || \
168 ((*s) == '{' && regcurly(s)))
a687059c 169
35c8bce7
LW
170#ifdef SPSTART
171#undef SPSTART /* dratted cpp namespace... */
172#endif
a687059c
LW
173/*
174 * Flags to be passed up and down.
175 */
a687059c 176#define WORST 0 /* Worst case. */
821b33a5 177#define HASWIDTH 0x1 /* Known to match non-null strings. */
a0d0e21e
LW
178#define SIMPLE 0x2 /* Simple enough to be STAR/PLUS operand. */
179#define SPSTART 0x4 /* Starts with * or +. */
180#define TRYAGAIN 0x8 /* Weeded out a declaration. */
a687059c 181
2c2d71f5
JH
182/* Length of a variant. */
183
184typedef struct scan_data_t {
185 I32 len_min;
186 I32 len_delta;
187 I32 pos_min;
188 I32 pos_delta;
189 SV *last_found;
190 I32 last_end; /* min value, <0 unless valid. */
191 I32 last_start_min;
192 I32 last_start_max;
193 SV **longest; /* Either &l_fixed, or &l_float. */
194 SV *longest_fixed;
195 I32 offset_fixed;
196 SV *longest_float;
197 I32 offset_float_min;
198 I32 offset_float_max;
199 I32 flags;
200 I32 whilem_c;
cb434fcc 201 I32 *last_closep;
653099ff 202 struct regnode_charclass_class *start_class;
2c2d71f5
JH
203} scan_data_t;
204
a687059c 205/*
e50aee73 206 * Forward declarations for pregcomp()'s friends.
a687059c 207 */
a0d0e21e 208
b81d288d 209static scan_data_t zero_scan_data = { 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
cb434fcc 210 0, 0, 0, 0, 0, 0};
c277df42
IZ
211
212#define SF_BEFORE_EOL (SF_BEFORE_SEOL|SF_BEFORE_MEOL)
213#define SF_BEFORE_SEOL 0x1
214#define SF_BEFORE_MEOL 0x2
215#define SF_FIX_BEFORE_EOL (SF_FIX_BEFORE_SEOL|SF_FIX_BEFORE_MEOL)
216#define SF_FL_BEFORE_EOL (SF_FL_BEFORE_SEOL|SF_FL_BEFORE_MEOL)
217
09b7f37c
CB
218#ifdef NO_UNARY_PLUS
219# define SF_FIX_SHIFT_EOL (0+2)
220# define SF_FL_SHIFT_EOL (0+4)
221#else
222# define SF_FIX_SHIFT_EOL (+2)
223# define SF_FL_SHIFT_EOL (+4)
224#endif
c277df42
IZ
225
226#define SF_FIX_BEFORE_SEOL (SF_BEFORE_SEOL << SF_FIX_SHIFT_EOL)
227#define SF_FIX_BEFORE_MEOL (SF_BEFORE_MEOL << SF_FIX_SHIFT_EOL)
228
229#define SF_FL_BEFORE_SEOL (SF_BEFORE_SEOL << SF_FL_SHIFT_EOL)
230#define SF_FL_BEFORE_MEOL (SF_BEFORE_MEOL << SF_FL_SHIFT_EOL) /* 0x20 */
231#define SF_IS_INF 0x40
232#define SF_HAS_PAR 0x80
233#define SF_IN_PAR 0x100
234#define SF_HAS_EVAL 0x200
4bfe0158 235#define SCF_DO_SUBSTR 0x400
653099ff
GS
236#define SCF_DO_STCLASS_AND 0x0800
237#define SCF_DO_STCLASS_OR 0x1000
238#define SCF_DO_STCLASS (SCF_DO_STCLASS_AND|SCF_DO_STCLASS_OR)
e1901655 239#define SCF_WHILEM_VISITED_POS 0x2000
c277df42 240
eb160463 241#define UTF (RExC_utf8 != 0)
e2509266
JH
242#define LOC ((RExC_flags & PMf_LOCALE) != 0)
243#define FOLD ((RExC_flags & PMf_FOLD) != 0)
a0ed51b3 244
ffc61ed2 245#define OOB_UNICODE 12345678
93733859 246#define OOB_NAMEDCLASS -1
b8c5462f 247
a0ed51b3
LW
248#define CHR_SVLEN(sv) (UTF ? sv_len_utf8(sv) : SvCUR(sv))
249#define CHR_DIST(a,b) (UTF ? utf8_distance(a,b) : a - b)
250
8615cb43 251
b45f050a
JF
252/* length of regex to show in messages that don't mark a position within */
253#define RegexLengthToShowInErrorMessages 127
254
255/*
256 * If MARKER[12] are adjusted, be sure to adjust the constants at the top
257 * of t/op/regmesg.t, the tests in t/op/re_tests, and those in
258 * op/pragma/warn/regcomp.
259 */
7253e4e3
RK
260#define MARKER1 "<-- HERE" /* marker as it appears in the description */
261#define MARKER2 " <-- HERE " /* marker as it appears within the regex */
b81d288d 262
7253e4e3 263#define REPORT_LOCATION " in regex; marked by " MARKER1 " in m/%.*s" MARKER2 "%s/"
b45f050a
JF
264
265/*
266 * Calls SAVEDESTRUCTOR_X if needed, then calls Perl_croak with the given
267 * arg. Show regex, up to a maximum length. If it's too long, chop and add
268 * "...".
269 */
ccb2c380
MP
270#define FAIL(msg) STMT_START { \
271 char *ellipses = ""; \
272 IV len = RExC_end - RExC_precomp; \
273 \
274 if (!SIZE_ONLY) \
275 SAVEDESTRUCTOR_X(clear_re,(void*)RExC_rx); \
276 if (len > RegexLengthToShowInErrorMessages) { \
277 /* chop 10 shorter than the max, to ensure meaning of "..." */ \
278 len = RegexLengthToShowInErrorMessages - 10; \
279 ellipses = "..."; \
280 } \
281 Perl_croak(aTHX_ "%s in regex m/%.*s%s/", \
282 msg, (int)len, RExC_precomp, ellipses); \
283} STMT_END
8615cb43 284
b45f050a
JF
285/*
286 * Calls SAVEDESTRUCTOR_X if needed, then calls Perl_croak with the given
287 * args. Show regex, up to a maximum length. If it's too long, chop and add
288 * "...".
289 */
ccb2c380
MP
290#define FAIL2(pat,msg) STMT_START { \
291 char *ellipses = ""; \
292 IV len = RExC_end - RExC_precomp; \
293 \
294 if (!SIZE_ONLY) \
295 SAVEDESTRUCTOR_X(clear_re,(void*)RExC_rx); \
296 if (len > RegexLengthToShowInErrorMessages) { \
297 /* chop 10 shorter than the max, to ensure meaning of "..." */ \
298 len = RegexLengthToShowInErrorMessages - 10; \
299 ellipses = "..."; \
300 } \
301 S_re_croak2(aTHX_ pat, " in regex m/%.*s%s/", \
302 msg, (int)len, RExC_precomp, ellipses); \
303} STMT_END
b45f050a
JF
304
305
306/*
307 * Simple_vFAIL -- like FAIL, but marks the current location in the scan
308 */
ccb2c380
MP
309#define Simple_vFAIL(m) STMT_START { \
310 IV offset = RExC_parse - RExC_precomp; \
311 Perl_croak(aTHX_ "%s" REPORT_LOCATION, \
312 m, (int)offset, RExC_precomp, RExC_precomp + offset); \
313} STMT_END
b45f050a
JF
314
315/*
316 * Calls SAVEDESTRUCTOR_X if needed, then Simple_vFAIL()
317 */
ccb2c380
MP
318#define vFAIL(m) STMT_START { \
319 if (!SIZE_ONLY) \
320 SAVEDESTRUCTOR_X(clear_re,(void*)RExC_rx); \
321 Simple_vFAIL(m); \
322} STMT_END
b45f050a
JF
323
324/*
325 * Like Simple_vFAIL(), but accepts two arguments.
326 */
ccb2c380
MP
327#define Simple_vFAIL2(m,a1) STMT_START { \
328 IV offset = RExC_parse - RExC_precomp; \
329 S_re_croak2(aTHX_ m, REPORT_LOCATION, a1, \
330 (int)offset, RExC_precomp, RExC_precomp + offset); \
331} STMT_END
b45f050a
JF
332
333/*
334 * Calls SAVEDESTRUCTOR_X if needed, then Simple_vFAIL2().
335 */
ccb2c380
MP
336#define vFAIL2(m,a1) STMT_START { \
337 if (!SIZE_ONLY) \
338 SAVEDESTRUCTOR_X(clear_re,(void*)RExC_rx); \
339 Simple_vFAIL2(m, a1); \
340} STMT_END
b45f050a
JF
341
342
343/*
344 * Like Simple_vFAIL(), but accepts three arguments.
345 */
ccb2c380
MP
346#define Simple_vFAIL3(m, a1, a2) STMT_START { \
347 IV offset = RExC_parse - RExC_precomp; \
348 S_re_croak2(aTHX_ m, REPORT_LOCATION, a1, a2, \
349 (int)offset, RExC_precomp, RExC_precomp + offset); \
350} STMT_END
b45f050a
JF
351
352/*
353 * Calls SAVEDESTRUCTOR_X if needed, then Simple_vFAIL3().
354 */
ccb2c380
MP
355#define vFAIL3(m,a1,a2) STMT_START { \
356 if (!SIZE_ONLY) \
357 SAVEDESTRUCTOR_X(clear_re,(void*)RExC_rx); \
358 Simple_vFAIL3(m, a1, a2); \
359} STMT_END
b45f050a
JF
360
361/*
362 * Like Simple_vFAIL(), but accepts four arguments.
363 */
ccb2c380
MP
364#define Simple_vFAIL4(m, a1, a2, a3) STMT_START { \
365 IV offset = RExC_parse - RExC_precomp; \
366 S_re_croak2(aTHX_ m, REPORT_LOCATION, a1, a2, a3, \
367 (int)offset, RExC_precomp, RExC_precomp + offset); \
368} STMT_END
b45f050a
JF
369
370/*
371 * Like Simple_vFAIL(), but accepts five arguments.
372 */
ccb2c380
MP
373#define Simple_vFAIL5(m, a1, a2, a3, a4) STMT_START { \
374 IV offset = RExC_parse - RExC_precomp; \
375 S_re_croak2(aTHX_ m, REPORT_LOCATION, a1, a2, a3, a4, \
376 (int)offset, RExC_precomp, RExC_precomp + offset); \
377} STMT_END
378
379
380#define vWARN(loc,m) STMT_START { \
381 IV offset = loc - RExC_precomp; \
382 Perl_warner(aTHX_ packWARN(WARN_REGEXP), "%s" REPORT_LOCATION, \
383 m, (int)offset, RExC_precomp, RExC_precomp + offset); \
384} STMT_END
385
386#define vWARNdep(loc,m) STMT_START { \
387 IV offset = loc - RExC_precomp; \
388 Perl_warner(aTHX_ packWARN2(WARN_DEPRECATED, WARN_REGEXP), \
389 "%s" REPORT_LOCATION, \
390 m, (int)offset, RExC_precomp, RExC_precomp + offset); \
391} STMT_END
392
393
394#define vWARN2(loc, m, a1) STMT_START { \
395 IV offset = loc - RExC_precomp; \
396 Perl_warner(aTHX_ packWARN(WARN_REGEXP), m REPORT_LOCATION, \
397 a1, (int)offset, RExC_precomp, RExC_precomp + offset); \
398} STMT_END
399
400#define vWARN3(loc, m, a1, a2) STMT_START { \
401 IV offset = loc - RExC_precomp; \
402 Perl_warner(aTHX_ packWARN(WARN_REGEXP), m REPORT_LOCATION, \
403 a1, a2, (int)offset, RExC_precomp, RExC_precomp + offset); \
404} STMT_END
405
406#define vWARN4(loc, m, a1, a2, a3) STMT_START { \
407 IV offset = loc - RExC_precomp; \
408 Perl_warner(aTHX_ packWARN(WARN_REGEXP), m REPORT_LOCATION, \
409 a1, a2, a3, (int)offset, RExC_precomp, RExC_precomp + offset); \
410} STMT_END
411
412#define vWARN5(loc, m, a1, a2, a3, a4) STMT_START { \
413 IV offset = loc - RExC_precomp; \
414 Perl_warner(aTHX_ packWARN(WARN_REGEXP), m REPORT_LOCATION, \
415 a1, a2, a3, a4, (int)offset, RExC_precomp, RExC_precomp + offset); \
416} STMT_END
9d1d55b5 417
8615cb43 418
cd439c50 419/* Allow for side effects in s */
ccb2c380
MP
420#define REGC(c,s) STMT_START { \
421 if (!SIZE_ONLY) *(s) = (c); else (void)(s); \
422} STMT_END
cd439c50 423
fac92740
MJD
424/* Macros for recording node offsets. 20001227 mjd@plover.com
425 * Nodes are numbered 1, 2, 3, 4. Node #n's position is recorded in
426 * element 2*n-1 of the array. Element #2n holds the byte length node #n.
427 * Element 0 holds the number n.
428 */
429
430#define MJD_OFFSET_DEBUG(x)
ccb2c380
MP
431/* #define MJD_OFFSET_DEBUG(x) Perl_warn_nocontext x */
432
433
434#define Set_Node_Offset_To_R(node,byte) STMT_START { \
435 if (! SIZE_ONLY) { \
436 MJD_OFFSET_DEBUG(("** (%d) offset of node %d is %d.\n", \
437 __LINE__, (node), (byte))); \
438 if((node) < 0) { \
439 Perl_croak(aTHX_ "value of node is %d in Offset macro", node); \
440 } else { \
441 RExC_offsets[2*(node)-1] = (byte); \
442 } \
443 } \
444} STMT_END
445
446#define Set_Node_Offset(node,byte) \
447 Set_Node_Offset_To_R((node)-RExC_emit_start, (byte)-RExC_start)
448#define Set_Cur_Node_Offset Set_Node_Offset(RExC_emit, RExC_parse)
449
450#define Set_Node_Length_To_R(node,len) STMT_START { \
451 if (! SIZE_ONLY) { \
452 MJD_OFFSET_DEBUG(("** (%d) size of node %d is %d.\n", \
453 __LINE__, (node), (len))); \
454 if((node) < 0) { \
455 Perl_croak(aTHX_ "value of node is %d in Length macro", node); \
456 } else { \
457 RExC_offsets[2*(node)] = (len); \
458 } \
459 } \
460} STMT_END
461
462#define Set_Node_Length(node,len) \
463 Set_Node_Length_To_R((node)-RExC_emit_start, len)
464#define Set_Cur_Node_Length(len) Set_Node_Length(RExC_emit, len)
465#define Set_Node_Cur_Length(node) \
466 Set_Node_Length(node, RExC_parse - parse_start)
fac92740
MJD
467
468/* Get offsets and lengths */
469#define Node_Offset(n) (RExC_offsets[2*((n)-RExC_emit_start)-1])
470#define Node_Length(n) (RExC_offsets[2*((n)-RExC_emit_start)])
471
acfe0abc 472static void clear_re(pTHX_ void *r);
4327152a 473
653099ff
GS
474/* Mark that we cannot extend a found fixed substring at this point.
475 Updata the longest found anchored substring and the longest found
476 floating substrings if needed. */
477
4327152a 478STATIC void
830247a4 479S_scan_commit(pTHX_ RExC_state_t *pRExC_state, scan_data_t *data)
c277df42 480{
a0ed51b3
LW
481 STRLEN l = CHR_SVLEN(data->last_found);
482 STRLEN old_l = CHR_SVLEN(*data->longest);
b81d288d 483
c277df42 484 if ((l >= old_l) && ((l > old_l) || (data->flags & SF_BEFORE_EOL))) {
6b43b216 485 SvSetMagicSV(*data->longest, data->last_found);
c277df42
IZ
486 if (*data->longest == data->longest_fixed) {
487 data->offset_fixed = l ? data->last_start_min : data->pos_min;
488 if (data->flags & SF_BEFORE_EOL)
b81d288d 489 data->flags
c277df42
IZ
490 |= ((data->flags & SF_BEFORE_EOL) << SF_FIX_SHIFT_EOL);
491 else
492 data->flags &= ~SF_FIX_BEFORE_EOL;
a0ed51b3
LW
493 }
494 else {
c277df42 495 data->offset_float_min = l ? data->last_start_min : data->pos_min;
b81d288d
AB
496 data->offset_float_max = (l
497 ? data->last_start_max
c277df42 498 : data->pos_min + data->pos_delta);
9051bda5
HS
499 if ((U32)data->offset_float_max > (U32)I32_MAX)
500 data->offset_float_max = I32_MAX;
c277df42 501 if (data->flags & SF_BEFORE_EOL)
b81d288d 502 data->flags
c277df42
IZ
503 |= ((data->flags & SF_BEFORE_EOL) << SF_FL_SHIFT_EOL);
504 else
505 data->flags &= ~SF_FL_BEFORE_EOL;
506 }
507 }
508 SvCUR_set(data->last_found, 0);
0eda9292
JH
509 {
510 SV * sv = data->last_found;
511 MAGIC *mg =
512 SvUTF8(sv) && SvMAGICAL(sv) ? mg_find(sv, PERL_MAGIC_utf8) : NULL;
513 if (mg && mg->mg_len > 0)
514 mg->mg_len = 0;
515 }
c277df42
IZ
516 data->last_end = -1;
517 data->flags &= ~SF_BEFORE_EOL;
518}
519
653099ff
GS
520/* Can match anything (initialization) */
521STATIC void
830247a4 522S_cl_anything(pTHX_ RExC_state_t *pRExC_state, struct regnode_charclass_class *cl)
653099ff 523{
653099ff 524 ANYOF_CLASS_ZERO(cl);
f8bef550 525 ANYOF_BITMAP_SETALL(cl);
1aa99e6b 526 cl->flags = ANYOF_EOS|ANYOF_UNICODE_ALL;
653099ff
GS
527 if (LOC)
528 cl->flags |= ANYOF_LOCALE;
529}
530
531/* Can match anything (initialization) */
532STATIC int
533S_cl_is_anything(pTHX_ struct regnode_charclass_class *cl)
534{
535 int value;
536
aaa51d5e 537 for (value = 0; value <= ANYOF_MAX; value += 2)
653099ff
GS
538 if (ANYOF_CLASS_TEST(cl, value) && ANYOF_CLASS_TEST(cl, value + 1))
539 return 1;
1aa99e6b
IH
540 if (!(cl->flags & ANYOF_UNICODE_ALL))
541 return 0;
f8bef550
NC
542 if (!ANYOF_BITMAP_TESTALLSET(cl))
543 return 0;
653099ff
GS
544 return 1;
545}
546
547/* Can match anything (initialization) */
548STATIC void
830247a4 549S_cl_init(pTHX_ RExC_state_t *pRExC_state, struct regnode_charclass_class *cl)
653099ff 550{
8ecf7187 551 Zero(cl, 1, struct regnode_charclass_class);
653099ff 552 cl->type = ANYOF;
830247a4 553 cl_anything(pRExC_state, cl);
653099ff
GS
554}
555
556STATIC void
830247a4 557S_cl_init_zero(pTHX_ RExC_state_t *pRExC_state, struct regnode_charclass_class *cl)
653099ff 558{
8ecf7187 559 Zero(cl, 1, struct regnode_charclass_class);
653099ff 560 cl->type = ANYOF;
830247a4 561 cl_anything(pRExC_state, cl);
653099ff
GS
562 if (LOC)
563 cl->flags |= ANYOF_LOCALE;
564}
565
566/* 'And' a given class with another one. Can create false positives */
567/* We assume that cl is not inverted */
568STATIC void
569S_cl_and(pTHX_ struct regnode_charclass_class *cl,
570 struct regnode_charclass_class *and_with)
571{
653099ff
GS
572 if (!(and_with->flags & ANYOF_CLASS)
573 && !(cl->flags & ANYOF_CLASS)
574 && (and_with->flags & ANYOF_LOCALE) == (cl->flags & ANYOF_LOCALE)
575 && !(and_with->flags & ANYOF_FOLD)
576 && !(cl->flags & ANYOF_FOLD)) {
577 int i;
578
579 if (and_with->flags & ANYOF_INVERT)
580 for (i = 0; i < ANYOF_BITMAP_SIZE; i++)
581 cl->bitmap[i] &= ~and_with->bitmap[i];
582 else
583 for (i = 0; i < ANYOF_BITMAP_SIZE; i++)
584 cl->bitmap[i] &= and_with->bitmap[i];
585 } /* XXXX: logic is complicated otherwise, leave it along for a moment. */
586 if (!(and_with->flags & ANYOF_EOS))
587 cl->flags &= ~ANYOF_EOS;
1aa99e6b 588
14ebb1a2
JH
589 if (cl->flags & ANYOF_UNICODE_ALL && and_with->flags & ANYOF_UNICODE &&
590 !(and_with->flags & ANYOF_INVERT)) {
1aa99e6b
IH
591 cl->flags &= ~ANYOF_UNICODE_ALL;
592 cl->flags |= ANYOF_UNICODE;
593 ARG_SET(cl, ARG(and_with));
594 }
14ebb1a2
JH
595 if (!(and_with->flags & ANYOF_UNICODE_ALL) &&
596 !(and_with->flags & ANYOF_INVERT))
1aa99e6b 597 cl->flags &= ~ANYOF_UNICODE_ALL;
14ebb1a2
JH
598 if (!(and_with->flags & (ANYOF_UNICODE|ANYOF_UNICODE_ALL)) &&
599 !(and_with->flags & ANYOF_INVERT))
1aa99e6b 600 cl->flags &= ~ANYOF_UNICODE;
653099ff
GS
601}
602
603/* 'OR' a given class with another one. Can create false positives */
604/* We assume that cl is not inverted */
605STATIC void
830247a4 606S_cl_or(pTHX_ RExC_state_t *pRExC_state, struct regnode_charclass_class *cl, struct regnode_charclass_class *or_with)
653099ff 607{
653099ff
GS
608 if (or_with->flags & ANYOF_INVERT) {
609 /* We do not use
610 * (B1 | CL1) | (!B2 & !CL2) = (B1 | !B2 & !CL2) | (CL1 | (!B2 & !CL2))
611 * <= (B1 | !B2) | (CL1 | !CL2)
612 * which is wasteful if CL2 is small, but we ignore CL2:
613 * (B1 | CL1) | (!B2 & !CL2) <= (B1 | CL1) | !B2 = (B1 | !B2) | CL1
614 * XXXX Can we handle case-fold? Unclear:
615 * (OK1(i) | OK1(i')) | !(OK1(i) | OK1(i')) =
616 * (OK1(i) | OK1(i')) | (!OK1(i) & !OK1(i'))
617 */
618 if ( (or_with->flags & ANYOF_LOCALE) == (cl->flags & ANYOF_LOCALE)
619 && !(or_with->flags & ANYOF_FOLD)
620 && !(cl->flags & ANYOF_FOLD) ) {
621 int i;
622
623 for (i = 0; i < ANYOF_BITMAP_SIZE; i++)
624 cl->bitmap[i] |= ~or_with->bitmap[i];
625 } /* XXXX: logic is complicated otherwise */
626 else {
830247a4 627 cl_anything(pRExC_state, cl);
653099ff
GS
628 }
629 } else {
630 /* (B1 | CL1) | (B2 | CL2) = (B1 | B2) | (CL1 | CL2)) */
631 if ( (or_with->flags & ANYOF_LOCALE) == (cl->flags & ANYOF_LOCALE)
b81d288d 632 && (!(or_with->flags & ANYOF_FOLD)
653099ff
GS
633 || (cl->flags & ANYOF_FOLD)) ) {
634 int i;
635
636 /* OR char bitmap and class bitmap separately */
637 for (i = 0; i < ANYOF_BITMAP_SIZE; i++)
638 cl->bitmap[i] |= or_with->bitmap[i];
639 if (or_with->flags & ANYOF_CLASS) {
640 for (i = 0; i < ANYOF_CLASSBITMAP_SIZE; i++)
641 cl->classflags[i] |= or_with->classflags[i];
642 cl->flags |= ANYOF_CLASS;
643 }
644 }
645 else { /* XXXX: logic is complicated, leave it along for a moment. */
830247a4 646 cl_anything(pRExC_state, cl);
653099ff
GS
647 }
648 }
649 if (or_with->flags & ANYOF_EOS)
650 cl->flags |= ANYOF_EOS;
1aa99e6b
IH
651
652 if (cl->flags & ANYOF_UNICODE && or_with->flags & ANYOF_UNICODE &&
653 ARG(cl) != ARG(or_with)) {
654 cl->flags |= ANYOF_UNICODE_ALL;
655 cl->flags &= ~ANYOF_UNICODE;
656 }
657 if (or_with->flags & ANYOF_UNICODE_ALL) {
658 cl->flags |= ANYOF_UNICODE_ALL;
659 cl->flags &= ~ANYOF_UNICODE;
660 }
653099ff
GS
661}
662
5d1c421c
JH
663/*
664 * There are strange code-generation bugs caused on sparc64 by gcc-2.95.2.
665 * These need to be revisited when a newer toolchain becomes available.
666 */
667#if defined(__sparc64__) && defined(__GNUC__)
668# if __GNUC__ < 2 || (__GNUC__ == 2 && __GNUC_MINOR__ < 96)
669# undef SPARC64_GCC_WORKAROUND
670# define SPARC64_GCC_WORKAROUND 1
671# endif
672#endif
673
653099ff
GS
674/* REx optimizer. Converts nodes into quickier variants "in place".
675 Finds fixed substrings. */
676
c277df42
IZ
677/* Stops at toplevel WHILEM as well as at `last'. At end *scanp is set
678 to the position after last scanned or to NULL. */
679
76e3520e 680STATIC I32
830247a4 681S_study_chunk(pTHX_ RExC_state_t *pRExC_state, regnode **scanp, I32 *deltap, regnode *last, scan_data_t *data, U32 flags)
c277df42
IZ
682 /* scanp: Start here (read-write). */
683 /* deltap: Write maxlen-minlen here. */
684 /* last: Stop before this one. */
685{
686 I32 min = 0, pars = 0, code;
687 regnode *scan = *scanp, *next;
688 I32 delta = 0;
689 int is_inf = (flags & SCF_DO_SUBSTR) && (data->flags & SF_IS_INF);
aca2d497 690 int is_inf_internal = 0; /* The studied chunk is infinite */
c277df42
IZ
691 I32 is_par = OP(scan) == OPEN ? ARG(scan) : 0;
692 scan_data_t data_fake;
653099ff 693 struct regnode_charclass_class and_with; /* Valid if flags & SCF_DO_STCLASS_OR */
b81d288d 694
c277df42
IZ
695 while (scan && OP(scan) != END && scan < last) {
696 /* Peephole optimizer: */
697
22c35a8c 698 if (PL_regkind[(U8)OP(scan)] == EXACT) {
653099ff 699 /* Merge several consecutive EXACTish nodes into one. */
c277df42
IZ
700 regnode *n = regnext(scan);
701 U32 stringok = 1;
702#ifdef DEBUGGING
703 regnode *stop = scan;
b81d288d 704#endif
c277df42 705
cd439c50 706 next = scan + NODE_SZ_STR(scan);
c277df42
IZ
707 /* Skip NOTHING, merge EXACT*. */
708 while (n &&
b81d288d 709 ( PL_regkind[(U8)OP(n)] == NOTHING ||
c277df42
IZ
710 (stringok && (OP(n) == OP(scan))))
711 && NEXT_OFF(n)
712 && NEXT_OFF(scan) + NEXT_OFF(n) < I16_MAX) {
713 if (OP(n) == TAIL || n > next)
714 stringok = 0;
22c35a8c 715 if (PL_regkind[(U8)OP(n)] == NOTHING) {
c277df42
IZ
716 NEXT_OFF(scan) += NEXT_OFF(n);
717 next = n + NODE_STEP_REGNODE;
718#ifdef DEBUGGING
719 if (stringok)
720 stop = n;
b81d288d 721#endif
c277df42 722 n = regnext(n);
a0ed51b3 723 }
f49d4d0f 724 else if (stringok) {
cd439c50 725 int oldl = STR_LEN(scan);
c277df42 726 regnode *nnext = regnext(n);
f49d4d0f 727
b81d288d 728 if (oldl + STR_LEN(n) > U8_MAX)
c277df42
IZ
729 break;
730 NEXT_OFF(scan) += NEXT_OFF(n);
cd439c50
IZ
731 STR_LEN(scan) += STR_LEN(n);
732 next = n + NODE_SZ_STR(n);
c277df42 733 /* Now we can overwrite *n : */
f49d4d0f 734 Move(STRING(n), STRING(scan) + oldl, STR_LEN(n), char);
c277df42 735#ifdef DEBUGGING
f49d4d0f 736 stop = next - 1;
b81d288d 737#endif
c277df42
IZ
738 n = nnext;
739 }
740 }
61a36c01 741
d65e4eab 742 if (UTF && OP(scan) == EXACTF && STR_LEN(scan) >= 6) {
61a36c01
JH
743/*
744 Two problematic code points in Unicode casefolding of EXACT nodes:
745
746 U+0390 - GREEK SMALL LETTER IOTA WITH DIALYTIKA AND TONOS
747 U+03B0 - GREEK SMALL LETTER UPSILON WITH DIALYTIKA AND TONOS
748
749 which casefold to
750
751 Unicode UTF-8
752
753 U+03B9 U+0308 U+0301 0xCE 0xB9 0xCC 0x88 0xCC 0x81
754 U+03C5 U+0308 U+0301 0xCF 0x85 0xCC 0x88 0xCC 0x81
755
756 This means that in case-insensitive matching (or "loose matching",
757 as Unicode calls it), an EXACTF of length six (the UTF-8 encoded byte
758 length of the above casefolded versions) can match a target string
759 of length two (the byte length of UTF-8 encoded U+0390 or U+03B0).
760 This would rather mess up the minimum length computation.
761
762 What we'll do is to look for the tail four bytes, and then peek
763 at the preceding two bytes to see whether we need to decrease
764 the minimum length by four (six minus two).
765
766 Thanks to the design of UTF-8, there cannot be false matches:
767 A sequence of valid UTF-8 bytes cannot be a subsequence of
768 another valid sequence of UTF-8 bytes.
769
770*/
771 char *s0 = STRING(scan), *s, *t;
772 char *s1 = s0 + STR_LEN(scan) - 1, *s2 = s1 - 4;
773 char *t0 = "\xcc\x88\xcc\x81";
774 char *t1 = t0 + 3;
775
776 for (s = s0 + 2;
777 s < s2 && (t = ninstr(s, s1, t0, t1));
778 s = t + 4) {
779 if (((U8)t[-1] == 0xB9 && (U8)t[-2] == 0xCE) ||
780 ((U8)t[-1] == 0x85 && (U8)t[-2] == 0xCF))
781 min -= 4;
782 }
783 }
784
c277df42
IZ
785#ifdef DEBUGGING
786 /* Allow dumping */
cd439c50 787 n = scan + NODE_SZ_STR(scan);
c277df42 788 while (n <= stop) {
22c35a8c 789 if (PL_regkind[(U8)OP(n)] != NOTHING || OP(n) == NOTHING) {
c277df42
IZ
790 OP(n) = OPTIMIZED;
791 NEXT_OFF(n) = 0;
792 }
793 n++;
794 }
653099ff 795#endif
c277df42 796 }
653099ff
GS
797 /* Follow the next-chain of the current node and optimize
798 away all the NOTHINGs from it. */
c277df42 799 if (OP(scan) != CURLYX) {
048cfca1
GS
800 int max = (reg_off_by_arg[OP(scan)]
801 ? I32_MAX
802 /* I32 may be smaller than U16 on CRAYs! */
803 : (I32_MAX < U16_MAX ? I32_MAX : U16_MAX));
c277df42
IZ
804 int off = (reg_off_by_arg[OP(scan)] ? ARG(scan) : NEXT_OFF(scan));
805 int noff;
806 regnode *n = scan;
b81d288d 807
c277df42
IZ
808 /* Skip NOTHING and LONGJMP. */
809 while ((n = regnext(n))
22c35a8c 810 && ((PL_regkind[(U8)OP(n)] == NOTHING && (noff = NEXT_OFF(n)))
c277df42
IZ
811 || ((OP(n) == LONGJMP) && (noff = ARG(n))))
812 && off + noff < max)
813 off += noff;
814 if (reg_off_by_arg[OP(scan)])
815 ARG(scan) = off;
b81d288d 816 else
c277df42
IZ
817 NEXT_OFF(scan) = off;
818 }
653099ff
GS
819 /* The principal pseudo-switch. Cannot be a switch, since we
820 look into several different things. */
b81d288d 821 if (OP(scan) == BRANCH || OP(scan) == BRANCHJ
c277df42
IZ
822 || OP(scan) == IFTHEN || OP(scan) == SUSPEND) {
823 next = regnext(scan);
824 code = OP(scan);
b81d288d
AB
825
826 if (OP(next) == code || code == IFTHEN || code == SUSPEND) {
c277df42 827 I32 max1 = 0, min1 = I32_MAX, num = 0;
653099ff 828 struct regnode_charclass_class accum;
c277df42 829
653099ff 830 if (flags & SCF_DO_SUBSTR) /* XXXX Add !SUSPEND? */
830247a4 831 scan_commit(pRExC_state, data); /* Cannot merge strings after this. */
653099ff 832 if (flags & SCF_DO_STCLASS)
830247a4 833 cl_init_zero(pRExC_state, &accum);
c277df42 834 while (OP(scan) == code) {
830247a4 835 I32 deltanext, minnext, f = 0, fake;
653099ff 836 struct regnode_charclass_class this_class;
c277df42
IZ
837
838 num++;
839 data_fake.flags = 0;
b81d288d 840 if (data) {
2c2d71f5 841 data_fake.whilem_c = data->whilem_c;
cb434fcc
IZ
842 data_fake.last_closep = data->last_closep;
843 }
844 else
845 data_fake.last_closep = &fake;
c277df42
IZ
846 next = regnext(scan);
847 scan = NEXTOPER(scan);
848 if (code != BRANCH)
849 scan = NEXTOPER(scan);
653099ff 850 if (flags & SCF_DO_STCLASS) {
830247a4 851 cl_init(pRExC_state, &this_class);
653099ff
GS
852 data_fake.start_class = &this_class;
853 f = SCF_DO_STCLASS_AND;
b81d288d 854 }
e1901655
IZ
855 if (flags & SCF_WHILEM_VISITED_POS)
856 f |= SCF_WHILEM_VISITED_POS;
653099ff 857 /* we suppose the run is continuous, last=next...*/
830247a4
IZ
858 minnext = study_chunk(pRExC_state, &scan, &deltanext,
859 next, &data_fake, f);
b81d288d 860 if (min1 > minnext)
c277df42
IZ
861 min1 = minnext;
862 if (max1 < minnext + deltanext)
863 max1 = minnext + deltanext;
864 if (deltanext == I32_MAX)
aca2d497 865 is_inf = is_inf_internal = 1;
c277df42
IZ
866 scan = next;
867 if (data_fake.flags & (SF_HAS_PAR|SF_IN_PAR))
868 pars++;
405ff068 869 if (data && (data_fake.flags & SF_HAS_EVAL))
c277df42 870 data->flags |= SF_HAS_EVAL;
2c2d71f5
JH
871 if (data)
872 data->whilem_c = data_fake.whilem_c;
653099ff 873 if (flags & SCF_DO_STCLASS)
830247a4 874 cl_or(pRExC_state, &accum, &this_class);
b81d288d 875 if (code == SUSPEND)
c277df42
IZ
876 break;
877 }
878 if (code == IFTHEN && num < 2) /* Empty ELSE branch */
879 min1 = 0;
880 if (flags & SCF_DO_SUBSTR) {
881 data->pos_min += min1;
882 data->pos_delta += max1 - min1;
883 if (max1 != min1 || is_inf)
884 data->longest = &(data->longest_float);
885 }
886 min += min1;
887 delta += max1 - min1;
653099ff 888 if (flags & SCF_DO_STCLASS_OR) {
830247a4 889 cl_or(pRExC_state, data->start_class, &accum);
653099ff
GS
890 if (min1) {
891 cl_and(data->start_class, &and_with);
892 flags &= ~SCF_DO_STCLASS;
893 }
894 }
895 else if (flags & SCF_DO_STCLASS_AND) {
de0c8cb8
GS
896 if (min1) {
897 cl_and(data->start_class, &accum);
653099ff 898 flags &= ~SCF_DO_STCLASS;
de0c8cb8
GS
899 }
900 else {
b81d288d 901 /* Switch to OR mode: cache the old value of
de0c8cb8
GS
902 * data->start_class */
903 StructCopy(data->start_class, &and_with,
904 struct regnode_charclass_class);
905 flags &= ~SCF_DO_STCLASS_AND;
906 StructCopy(&accum, data->start_class,
907 struct regnode_charclass_class);
908 flags |= SCF_DO_STCLASS_OR;
909 data->start_class->flags |= ANYOF_EOS;
910 }
653099ff 911 }
a0ed51b3
LW
912 }
913 else if (code == BRANCHJ) /* single branch is optimized. */
c277df42
IZ
914 scan = NEXTOPER(NEXTOPER(scan));
915 else /* single branch is optimized. */
916 scan = NEXTOPER(scan);
917 continue;
a0ed51b3
LW
918 }
919 else if (OP(scan) == EXACT) {
cd439c50 920 I32 l = STR_LEN(scan);
1aa99e6b 921 UV uc = *((U8*)STRING(scan));
a0ed51b3 922 if (UTF) {
1aa99e6b
IH
923 U8 *s = (U8*)STRING(scan);
924 l = utf8_length(s, s + l);
9041c2e3 925 uc = utf8_to_uvchr(s, NULL);
a0ed51b3
LW
926 }
927 min += l;
c277df42 928 if (flags & SCF_DO_SUBSTR) { /* Update longest substr. */
c277df42
IZ
929 /* The code below prefers earlier match for fixed
930 offset, later match for variable offset. */
931 if (data->last_end == -1) { /* Update the start info. */
932 data->last_start_min = data->pos_min;
933 data->last_start_max = is_inf
b81d288d 934 ? I32_MAX : data->pos_min + data->pos_delta;
c277df42 935 }
cd439c50 936 sv_catpvn(data->last_found, STRING(scan), STR_LEN(scan));
0eda9292
JH
937 {
938 SV * sv = data->last_found;
939 MAGIC *mg = SvUTF8(sv) && SvMAGICAL(sv) ?
940 mg_find(sv, PERL_MAGIC_utf8) : NULL;
941 if (mg && mg->mg_len >= 0)
5e43f467
JH
942 mg->mg_len += utf8_length((U8*)STRING(scan),
943 (U8*)STRING(scan)+STR_LEN(scan));
0eda9292 944 }
33b8afdf
JH
945 if (UTF)
946 SvUTF8_on(data->last_found);
c277df42
IZ
947 data->last_end = data->pos_min + l;
948 data->pos_min += l; /* As in the first entry. */
949 data->flags &= ~SF_BEFORE_EOL;
950 }
653099ff
GS
951 if (flags & SCF_DO_STCLASS_AND) {
952 /* Check whether it is compatible with what we know already! */
953 int compat = 1;
954
1aa99e6b 955 if (uc >= 0x100 ||
516a5887 956 (!(data->start_class->flags & (ANYOF_CLASS | ANYOF_LOCALE))
1aa99e6b 957 && !ANYOF_BITMAP_TEST(data->start_class, uc)
653099ff 958 && (!(data->start_class->flags & ANYOF_FOLD)
1aa99e6b 959 || !ANYOF_BITMAP_TEST(data->start_class, PL_fold[uc])))
516a5887 960 )
653099ff
GS
961 compat = 0;
962 ANYOF_CLASS_ZERO(data->start_class);
963 ANYOF_BITMAP_ZERO(data->start_class);
964 if (compat)
1aa99e6b 965 ANYOF_BITMAP_SET(data->start_class, uc);
653099ff 966 data->start_class->flags &= ~ANYOF_EOS;
9b877dbb
IH
967 if (uc < 0x100)
968 data->start_class->flags &= ~ANYOF_UNICODE_ALL;
653099ff
GS
969 }
970 else if (flags & SCF_DO_STCLASS_OR) {
971 /* false positive possible if the class is case-folded */
1aa99e6b 972 if (uc < 0x100)
9b877dbb
IH
973 ANYOF_BITMAP_SET(data->start_class, uc);
974 else
975 data->start_class->flags |= ANYOF_UNICODE_ALL;
653099ff
GS
976 data->start_class->flags &= ~ANYOF_EOS;
977 cl_and(data->start_class, &and_with);
978 }
979 flags &= ~SCF_DO_STCLASS;
a0ed51b3 980 }
653099ff 981 else if (PL_regkind[(U8)OP(scan)] == EXACT) { /* But OP != EXACT! */
cd439c50 982 I32 l = STR_LEN(scan);
1aa99e6b 983 UV uc = *((U8*)STRING(scan));
653099ff
GS
984
985 /* Search for fixed substrings supports EXACT only. */
b81d288d 986 if (flags & SCF_DO_SUBSTR)
830247a4 987 scan_commit(pRExC_state, data);
a0ed51b3 988 if (UTF) {
1aa99e6b
IH
989 U8 *s = (U8 *)STRING(scan);
990 l = utf8_length(s, s + l);
9041c2e3 991 uc = utf8_to_uvchr(s, NULL);
a0ed51b3
LW
992 }
993 min += l;
c277df42 994 if (data && (flags & SCF_DO_SUBSTR))
a0ed51b3 995 data->pos_min += l;
653099ff
GS
996 if (flags & SCF_DO_STCLASS_AND) {
997 /* Check whether it is compatible with what we know already! */
998 int compat = 1;
999
1aa99e6b 1000 if (uc >= 0x100 ||
516a5887 1001 (!(data->start_class->flags & (ANYOF_CLASS | ANYOF_LOCALE))
1aa99e6b 1002 && !ANYOF_BITMAP_TEST(data->start_class, uc)
516a5887 1003 && !ANYOF_BITMAP_TEST(data->start_class, PL_fold[uc])))
653099ff
GS
1004 compat = 0;
1005 ANYOF_CLASS_ZERO(data->start_class);
1006 ANYOF_BITMAP_ZERO(data->start_class);
1007 if (compat) {
1aa99e6b 1008 ANYOF_BITMAP_SET(data->start_class, uc);
653099ff
GS
1009 data->start_class->flags &= ~ANYOF_EOS;
1010 data->start_class->flags |= ANYOF_FOLD;
1011 if (OP(scan) == EXACTFL)
1012 data->start_class->flags |= ANYOF_LOCALE;
1013 }
1014 }
1015 else if (flags & SCF_DO_STCLASS_OR) {
1016 if (data->start_class->flags & ANYOF_FOLD) {
1017 /* false positive possible if the class is case-folded.
1018 Assume that the locale settings are the same... */
1aa99e6b
IH
1019 if (uc < 0x100)
1020 ANYOF_BITMAP_SET(data->start_class, uc);
653099ff
GS
1021 data->start_class->flags &= ~ANYOF_EOS;
1022 }
1023 cl_and(data->start_class, &and_with);
1024 }
1025 flags &= ~SCF_DO_STCLASS;
a0ed51b3 1026 }
4d61ec05 1027 else if (strchr((char*)PL_varies,OP(scan))) {
9c5ffd7c 1028 I32 mincount, maxcount, minnext, deltanext, fl = 0;
aa7a4b56 1029 I32 f = flags, pos_before = 0;
c277df42 1030 regnode *oscan = scan;
653099ff
GS
1031 struct regnode_charclass_class this_class;
1032 struct regnode_charclass_class *oclass = NULL;
727f22e3 1033 I32 next_is_eval = 0;
653099ff 1034
22c35a8c 1035 switch (PL_regkind[(U8)OP(scan)]) {
653099ff 1036 case WHILEM: /* End of (?:...)* . */
c277df42
IZ
1037 scan = NEXTOPER(scan);
1038 goto finish;
1039 case PLUS:
653099ff 1040 if (flags & (SCF_DO_SUBSTR | SCF_DO_STCLASS)) {
c277df42 1041 next = NEXTOPER(scan);
653099ff 1042 if (OP(next) == EXACT || (flags & SCF_DO_STCLASS)) {
b81d288d
AB
1043 mincount = 1;
1044 maxcount = REG_INFTY;
c277df42
IZ
1045 next = regnext(scan);
1046 scan = NEXTOPER(scan);
1047 goto do_curly;
1048 }
1049 }
1050 if (flags & SCF_DO_SUBSTR)
1051 data->pos_min++;
1052 min++;
1053 /* Fall through. */
1054 case STAR:
653099ff
GS
1055 if (flags & SCF_DO_STCLASS) {
1056 mincount = 0;
b81d288d 1057 maxcount = REG_INFTY;
653099ff
GS
1058 next = regnext(scan);
1059 scan = NEXTOPER(scan);
1060 goto do_curly;
1061 }
b81d288d 1062 is_inf = is_inf_internal = 1;
c277df42
IZ
1063 scan = regnext(scan);
1064 if (flags & SCF_DO_SUBSTR) {
830247a4 1065 scan_commit(pRExC_state, data); /* Cannot extend fixed substrings */
c277df42
IZ
1066 data->longest = &(data->longest_float);
1067 }
1068 goto optimize_curly_tail;
1069 case CURLY:
b81d288d 1070 mincount = ARG1(scan);
c277df42
IZ
1071 maxcount = ARG2(scan);
1072 next = regnext(scan);
cb434fcc
IZ
1073 if (OP(scan) == CURLYX) {
1074 I32 lp = (data ? *(data->last_closep) : 0);
1075
1076 scan->flags = ((lp <= U8_MAX) ? lp : U8_MAX);
1077 }
c277df42 1078 scan = NEXTOPER(scan) + EXTRA_STEP_2ARGS;
727f22e3 1079 next_is_eval = (OP(scan) == EVAL);
c277df42
IZ
1080 do_curly:
1081 if (flags & SCF_DO_SUBSTR) {
830247a4 1082 if (mincount == 0) scan_commit(pRExC_state,data); /* Cannot extend fixed substrings */
c277df42
IZ
1083 pos_before = data->pos_min;
1084 }
1085 if (data) {
1086 fl = data->flags;
1087 data->flags &= ~(SF_HAS_PAR|SF_IN_PAR|SF_HAS_EVAL);
1088 if (is_inf)
1089 data->flags |= SF_IS_INF;
1090 }
653099ff 1091 if (flags & SCF_DO_STCLASS) {
830247a4 1092 cl_init(pRExC_state, &this_class);
653099ff
GS
1093 oclass = data->start_class;
1094 data->start_class = &this_class;
1095 f |= SCF_DO_STCLASS_AND;
1096 f &= ~SCF_DO_STCLASS_OR;
1097 }
e1901655
IZ
1098 /* These are the cases when once a subexpression
1099 fails at a particular position, it cannot succeed
1100 even after backtracking at the enclosing scope.
b81d288d 1101
e1901655
IZ
1102 XXXX what if minimal match and we are at the
1103 initial run of {n,m}? */
1104 if ((mincount != maxcount - 1) && (maxcount != REG_INFTY))
1105 f &= ~SCF_WHILEM_VISITED_POS;
653099ff 1106
c277df42 1107 /* This will finish on WHILEM, setting scan, or on NULL: */
b81d288d
AB
1108 minnext = study_chunk(pRExC_state, &scan, &deltanext, last, data,
1109 mincount == 0
653099ff
GS
1110 ? (f & ~SCF_DO_SUBSTR) : f);
1111
1112 if (flags & SCF_DO_STCLASS)
1113 data->start_class = oclass;
1114 if (mincount == 0 || minnext == 0) {
1115 if (flags & SCF_DO_STCLASS_OR) {
830247a4 1116 cl_or(pRExC_state, data->start_class, &this_class);
653099ff
GS
1117 }
1118 else if (flags & SCF_DO_STCLASS_AND) {
b81d288d 1119 /* Switch to OR mode: cache the old value of
653099ff
GS
1120 * data->start_class */
1121 StructCopy(data->start_class, &and_with,
1122 struct regnode_charclass_class);
1123 flags &= ~SCF_DO_STCLASS_AND;
1124 StructCopy(&this_class, data->start_class,
1125 struct regnode_charclass_class);
1126 flags |= SCF_DO_STCLASS_OR;
1127 data->start_class->flags |= ANYOF_EOS;
1128 }
1129 } else { /* Non-zero len */
1130 if (flags & SCF_DO_STCLASS_OR) {
830247a4 1131 cl_or(pRExC_state, data->start_class, &this_class);
653099ff
GS
1132 cl_and(data->start_class, &and_with);
1133 }
1134 else if (flags & SCF_DO_STCLASS_AND)
1135 cl_and(data->start_class, &this_class);
1136 flags &= ~SCF_DO_STCLASS;
1137 }
c277df42
IZ
1138 if (!scan) /* It was not CURLYX, but CURLY. */
1139 scan = next;
84037bb0 1140 if (ckWARN(WARN_REGEXP)
727f22e3
JP
1141 /* ? quantifier ok, except for (?{ ... }) */
1142 && (next_is_eval || !(mincount == 0 && maxcount == 1))
84037bb0 1143 && (minnext == 0) && (deltanext == 0)
99799961 1144 && data && !(data->flags & (SF_HAS_PAR|SF_IN_PAR))
17feb5d5 1145 && maxcount <= REG_INFTY/3) /* Complement check for big count */
b45f050a 1146 {
830247a4 1147 vWARN(RExC_parse,
b45f050a
JF
1148 "Quantifier unexpected on zero-length expression");
1149 }
1150
c277df42 1151 min += minnext * mincount;
b81d288d 1152 is_inf_internal |= ((maxcount == REG_INFTY
155aba94
GS
1153 && (minnext + deltanext) > 0)
1154 || deltanext == I32_MAX);
aca2d497 1155 is_inf |= is_inf_internal;
c277df42
IZ
1156 delta += (minnext + deltanext) * maxcount - minnext * mincount;
1157
1158 /* Try powerful optimization CURLYX => CURLYN. */
b81d288d 1159 if ( OP(oscan) == CURLYX && data
c277df42
IZ
1160 && data->flags & SF_IN_PAR
1161 && !(data->flags & SF_HAS_EVAL)
1162 && !deltanext && minnext == 1 ) {
1163 /* Try to optimize to CURLYN. */
1164 regnode *nxt = NEXTOPER(oscan) + EXTRA_STEP_2ARGS;
497b47a8
JH
1165 regnode *nxt1 = nxt;
1166#ifdef DEBUGGING
1167 regnode *nxt2;
1168#endif
c277df42
IZ
1169
1170 /* Skip open. */
1171 nxt = regnext(nxt);
4d61ec05 1172 if (!strchr((char*)PL_simple,OP(nxt))
22c35a8c 1173 && !(PL_regkind[(U8)OP(nxt)] == EXACT
b81d288d 1174 && STR_LEN(nxt) == 1))
c277df42 1175 goto nogo;
497b47a8 1176#ifdef DEBUGGING
c277df42 1177 nxt2 = nxt;
497b47a8 1178#endif
c277df42 1179 nxt = regnext(nxt);
b81d288d 1180 if (OP(nxt) != CLOSE)
c277df42
IZ
1181 goto nogo;
1182 /* Now we know that nxt2 is the only contents: */
eb160463 1183 oscan->flags = (U8)ARG(nxt);
c277df42
IZ
1184 OP(oscan) = CURLYN;
1185 OP(nxt1) = NOTHING; /* was OPEN. */
1186#ifdef DEBUGGING
1187 OP(nxt1 + 1) = OPTIMIZED; /* was count. */
1188 NEXT_OFF(nxt1+ 1) = 0; /* just for consistancy. */
1189 NEXT_OFF(nxt2) = 0; /* just for consistancy with CURLY. */
1190 OP(nxt) = OPTIMIZED; /* was CLOSE. */
1191 OP(nxt + 1) = OPTIMIZED; /* was count. */
1192 NEXT_OFF(nxt+ 1) = 0; /* just for consistancy. */
b81d288d 1193#endif
c277df42 1194 }
c277df42
IZ
1195 nogo:
1196
1197 /* Try optimization CURLYX => CURLYM. */
b81d288d 1198 if ( OP(oscan) == CURLYX && data
c277df42 1199 && !(data->flags & SF_HAS_PAR)
c277df42 1200 && !(data->flags & SF_HAS_EVAL)
0e788c72
HS
1201 && !deltanext /* atom is fixed width */
1202 && minnext != 0 /* CURLYM can't handle zero width */
1203 ) {
c277df42
IZ
1204 /* XXXX How to optimize if data == 0? */
1205 /* Optimize to a simpler form. */
1206 regnode *nxt = NEXTOPER(oscan) + EXTRA_STEP_2ARGS; /* OPEN */
1207 regnode *nxt2;
1208
1209 OP(oscan) = CURLYM;
1210 while ( (nxt2 = regnext(nxt)) /* skip over embedded stuff*/
b81d288d 1211 && (OP(nxt2) != WHILEM))
c277df42
IZ
1212 nxt = nxt2;
1213 OP(nxt2) = SUCCEED; /* Whas WHILEM */
c277df42
IZ
1214 /* Need to optimize away parenths. */
1215 if (data->flags & SF_IN_PAR) {
1216 /* Set the parenth number. */
1217 regnode *nxt1 = NEXTOPER(oscan) + EXTRA_STEP_2ARGS; /* OPEN*/
1218
b81d288d 1219 if (OP(nxt) != CLOSE)
b45f050a 1220 FAIL("Panic opt close");
eb160463 1221 oscan->flags = (U8)ARG(nxt);
c277df42
IZ
1222 OP(nxt1) = OPTIMIZED; /* was OPEN. */
1223 OP(nxt) = OPTIMIZED; /* was CLOSE. */
1224#ifdef DEBUGGING
1225 OP(nxt1 + 1) = OPTIMIZED; /* was count. */
1226 OP(nxt + 1) = OPTIMIZED; /* was count. */
1227 NEXT_OFF(nxt1 + 1) = 0; /* just for consistancy. */
1228 NEXT_OFF(nxt + 1) = 0; /* just for consistancy. */
b81d288d 1229#endif
c277df42
IZ
1230#if 0
1231 while ( nxt1 && (OP(nxt1) != WHILEM)) {
1232 regnode *nnxt = regnext(nxt1);
b81d288d 1233
c277df42
IZ
1234 if (nnxt == nxt) {
1235 if (reg_off_by_arg[OP(nxt1)])
1236 ARG_SET(nxt1, nxt2 - nxt1);
1237 else if (nxt2 - nxt1 < U16_MAX)
1238 NEXT_OFF(nxt1) = nxt2 - nxt1;
1239 else
1240 OP(nxt) = NOTHING; /* Cannot beautify */
1241 }
1242 nxt1 = nnxt;
1243 }
1244#endif
1245 /* Optimize again: */
b81d288d 1246 study_chunk(pRExC_state, &nxt1, &deltanext, nxt,
e1901655 1247 NULL, 0);
a0ed51b3
LW
1248 }
1249 else
c277df42 1250 oscan->flags = 0;
c277df42 1251 }
e1901655
IZ
1252 else if ((OP(oscan) == CURLYX)
1253 && (flags & SCF_WHILEM_VISITED_POS)
1254 /* See the comment on a similar expression above.
1255 However, this time it not a subexpression
1256 we care about, but the expression itself. */
1257 && (maxcount == REG_INFTY)
1258 && data && ++data->whilem_c < 16) {
1259 /* This stays as CURLYX, we can put the count/of pair. */
2c2d71f5
JH
1260 /* Find WHILEM (as in regexec.c) */
1261 regnode *nxt = oscan + NEXT_OFF(oscan);
1262
1263 if (OP(PREVOPER(nxt)) == NOTHING) /* LONGJMP */
1264 nxt += ARG(nxt);
eb160463
GS
1265 PREVOPER(nxt)->flags = (U8)(data->whilem_c
1266 | (RExC_whilem_seen << 4)); /* On WHILEM */
2c2d71f5 1267 }
b81d288d 1268 if (data && fl & (SF_HAS_PAR|SF_IN_PAR))
c277df42
IZ
1269 pars++;
1270 if (flags & SCF_DO_SUBSTR) {
1271 SV *last_str = Nullsv;
1272 int counted = mincount != 0;
1273
1274 if (data->last_end > 0 && mincount != 0) { /* Ends with a string. */
5d1c421c
JH
1275#if defined(SPARC64_GCC_WORKAROUND)
1276 I32 b = 0;
1277 STRLEN l = 0;
1278 char *s = NULL;
1279 I32 old = 0;
1280
1281 if (pos_before >= data->last_start_min)
1282 b = pos_before;
1283 else
1284 b = data->last_start_min;
1285
1286 l = 0;
1287 s = SvPV(data->last_found, l);
1288 old = b - data->last_start_min;
1289
1290#else
b81d288d 1291 I32 b = pos_before >= data->last_start_min
c277df42
IZ
1292 ? pos_before : data->last_start_min;
1293 STRLEN l;
1294 char *s = SvPV(data->last_found, l);
a0ed51b3 1295 I32 old = b - data->last_start_min;
5d1c421c 1296#endif
a0ed51b3
LW
1297
1298 if (UTF)
1299 old = utf8_hop((U8*)s, old) - (U8*)s;
c277df42 1300
a0ed51b3 1301 l -= old;
c277df42 1302 /* Get the added string: */
79cb57f6 1303 last_str = newSVpvn(s + old, l);
0e933229
IH
1304 if (UTF)
1305 SvUTF8_on(last_str);
c277df42
IZ
1306 if (deltanext == 0 && pos_before == b) {
1307 /* What was added is a constant string */
1308 if (mincount > 1) {
1309 SvGROW(last_str, (mincount * l) + 1);
b81d288d 1310 repeatcpy(SvPVX(last_str) + l,
c277df42
IZ
1311 SvPVX(last_str), l, mincount - 1);
1312 SvCUR(last_str) *= mincount;
1313 /* Add additional parts. */
b81d288d 1314 SvCUR_set(data->last_found,
c277df42
IZ
1315 SvCUR(data->last_found) - l);
1316 sv_catsv(data->last_found, last_str);
0eda9292
JH
1317 {
1318 SV * sv = data->last_found;
1319 MAGIC *mg =
1320 SvUTF8(sv) && SvMAGICAL(sv) ?
1321 mg_find(sv, PERL_MAGIC_utf8) : NULL;
1322 if (mg && mg->mg_len >= 0)
1323 mg->mg_len += CHR_SVLEN(last_str);
1324 }
c277df42
IZ
1325 data->last_end += l * (mincount - 1);
1326 }
2a8d9689
HS
1327 } else {
1328 /* start offset must point into the last copy */
1329 data->last_start_min += minnext * (mincount - 1);
c152dc43
HS
1330 data->last_start_max += is_inf ? I32_MAX
1331 : (maxcount - 1) * (minnext + data->pos_delta);
c277df42
IZ
1332 }
1333 }
1334 /* It is counted once already... */
1335 data->pos_min += minnext * (mincount - counted);
1336 data->pos_delta += - counted * deltanext +
1337 (minnext + deltanext) * maxcount - minnext * mincount;
1338 if (mincount != maxcount) {
653099ff
GS
1339 /* Cannot extend fixed substrings found inside
1340 the group. */
830247a4 1341 scan_commit(pRExC_state,data);
c277df42
IZ
1342 if (mincount && last_str) {
1343 sv_setsv(data->last_found, last_str);
1344 data->last_end = data->pos_min;
b81d288d 1345 data->last_start_min =
a0ed51b3 1346 data->pos_min - CHR_SVLEN(last_str);
b81d288d
AB
1347 data->last_start_max = is_inf
1348 ? I32_MAX
c277df42 1349 : data->pos_min + data->pos_delta
a0ed51b3 1350 - CHR_SVLEN(last_str);
c277df42
IZ
1351 }
1352 data->longest = &(data->longest_float);
1353 }
aca2d497 1354 SvREFCNT_dec(last_str);
c277df42 1355 }
405ff068 1356 if (data && (fl & SF_HAS_EVAL))
c277df42
IZ
1357 data->flags |= SF_HAS_EVAL;
1358 optimize_curly_tail:
c277df42 1359 if (OP(oscan) != CURLYX) {
22c35a8c 1360 while (PL_regkind[(U8)OP(next = regnext(oscan))] == NOTHING
c277df42
IZ
1361 && NEXT_OFF(next))
1362 NEXT_OFF(oscan) += NEXT_OFF(next);
1363 }
c277df42 1364 continue;
653099ff 1365 default: /* REF and CLUMP only? */
c277df42 1366 if (flags & SCF_DO_SUBSTR) {
830247a4 1367 scan_commit(pRExC_state,data); /* Cannot expect anything... */
c277df42
IZ
1368 data->longest = &(data->longest_float);
1369 }
aca2d497 1370 is_inf = is_inf_internal = 1;
653099ff 1371 if (flags & SCF_DO_STCLASS_OR)
830247a4 1372 cl_anything(pRExC_state, data->start_class);
653099ff 1373 flags &= ~SCF_DO_STCLASS;
c277df42
IZ
1374 break;
1375 }
a0ed51b3 1376 }
ffc61ed2 1377 else if (strchr((char*)PL_simple,OP(scan))) {
9c5ffd7c 1378 int value = 0;
653099ff 1379
c277df42 1380 if (flags & SCF_DO_SUBSTR) {
830247a4 1381 scan_commit(pRExC_state,data);
c277df42
IZ
1382 data->pos_min++;
1383 }
1384 min++;
653099ff
GS
1385 if (flags & SCF_DO_STCLASS) {
1386 data->start_class->flags &= ~ANYOF_EOS; /* No match on empty */
1387
1388 /* Some of the logic below assumes that switching
1389 locale on will only add false positives. */
1390 switch (PL_regkind[(U8)OP(scan)]) {
653099ff 1391 case SANY:
653099ff
GS
1392 default:
1393 do_default:
1394 /* Perl_croak(aTHX_ "panic: unexpected simple REx opcode %d", OP(scan)); */
1395 if (flags & SCF_DO_STCLASS_OR) /* Allow everything */
830247a4 1396 cl_anything(pRExC_state, data->start_class);
653099ff
GS
1397 break;
1398 case REG_ANY:
1399 if (OP(scan) == SANY)
1400 goto do_default;
1401 if (flags & SCF_DO_STCLASS_OR) { /* Everything but \n */
1402 value = (ANYOF_BITMAP_TEST(data->start_class,'\n')
1403 || (data->start_class->flags & ANYOF_CLASS));
830247a4 1404 cl_anything(pRExC_state, data->start_class);
653099ff
GS
1405 }
1406 if (flags & SCF_DO_STCLASS_AND || !value)
1407 ANYOF_BITMAP_CLEAR(data->start_class,'\n');
1408 break;
1409 case ANYOF:
1410 if (flags & SCF_DO_STCLASS_AND)
1411 cl_and(data->start_class,
1412 (struct regnode_charclass_class*)scan);
1413 else
830247a4 1414 cl_or(pRExC_state, data->start_class,
653099ff
GS
1415 (struct regnode_charclass_class*)scan);
1416 break;
1417 case ALNUM:
1418 if (flags & SCF_DO_STCLASS_AND) {
1419 if (!(data->start_class->flags & ANYOF_LOCALE)) {
1420 ANYOF_CLASS_CLEAR(data->start_class,ANYOF_NALNUM);
1421 for (value = 0; value < 256; value++)
1422 if (!isALNUM(value))
1423 ANYOF_BITMAP_CLEAR(data->start_class, value);
1424 }
1425 }
1426 else {
1427 if (data->start_class->flags & ANYOF_LOCALE)
1428 ANYOF_CLASS_SET(data->start_class,ANYOF_ALNUM);
1429 else {
1430 for (value = 0; value < 256; value++)
1431 if (isALNUM(value))
b81d288d 1432 ANYOF_BITMAP_SET(data->start_class, value);
653099ff
GS
1433 }
1434 }
1435 break;
1436 case ALNUML:
1437 if (flags & SCF_DO_STCLASS_AND) {
1438 if (data->start_class->flags & ANYOF_LOCALE)
1439 ANYOF_CLASS_CLEAR(data->start_class,ANYOF_NALNUM);
1440 }
1441 else {
1442 ANYOF_CLASS_SET(data->start_class,ANYOF_ALNUM);
1443 data->start_class->flags |= ANYOF_LOCALE;
1444 }
1445 break;
1446 case NALNUM:
1447 if (flags & SCF_DO_STCLASS_AND) {
1448 if (!(data->start_class->flags & ANYOF_LOCALE)) {
1449 ANYOF_CLASS_CLEAR(data->start_class,ANYOF_ALNUM);
1450 for (value = 0; value < 256; value++)
1451 if (isALNUM(value))
1452 ANYOF_BITMAP_CLEAR(data->start_class, value);
1453 }
1454 }
1455 else {
1456 if (data->start_class->flags & ANYOF_LOCALE)
1457 ANYOF_CLASS_SET(data->start_class,ANYOF_NALNUM);
1458 else {
1459 for (value = 0; value < 256; value++)
1460 if (!isALNUM(value))
b81d288d 1461 ANYOF_BITMAP_SET(data->start_class, value);
653099ff
GS
1462 }
1463 }
1464 break;
1465 case NALNUML:
1466 if (flags & SCF_DO_STCLASS_AND) {
1467 if (data->start_class->flags & ANYOF_LOCALE)
1468 ANYOF_CLASS_CLEAR(data->start_class,ANYOF_ALNUM);
1469 }
1470 else {
1471 data->start_class->flags |= ANYOF_LOCALE;
1472 ANYOF_CLASS_SET(data->start_class,ANYOF_NALNUM);
1473 }
1474 break;
1475 case SPACE:
1476 if (flags & SCF_DO_STCLASS_AND) {
1477 if (!(data->start_class->flags & ANYOF_LOCALE)) {
1478 ANYOF_CLASS_CLEAR(data->start_class,ANYOF_NSPACE);
1479 for (value = 0; value < 256; value++)
1480 if (!isSPACE(value))
1481 ANYOF_BITMAP_CLEAR(data->start_class, value);
1482 }
1483 }
1484 else {
1485 if (data->start_class->flags & ANYOF_LOCALE)
1486 ANYOF_CLASS_SET(data->start_class,ANYOF_SPACE);
1487 else {
1488 for (value = 0; value < 256; value++)
1489 if (isSPACE(value))
b81d288d 1490 ANYOF_BITMAP_SET(data->start_class, value);
653099ff
GS
1491 }
1492 }
1493 break;
1494 case SPACEL:
1495 if (flags & SCF_DO_STCLASS_AND) {
1496 if (data->start_class->flags & ANYOF_LOCALE)
1497 ANYOF_CLASS_CLEAR(data->start_class,ANYOF_NSPACE);
1498 }
1499 else {
1500 data->start_class->flags |= ANYOF_LOCALE;
1501 ANYOF_CLASS_SET(data->start_class,ANYOF_SPACE);
1502 }
1503 break;
1504 case NSPACE:
1505 if (flags & SCF_DO_STCLASS_AND) {
1506 if (!(data->start_class->flags & ANYOF_LOCALE)) {
1507 ANYOF_CLASS_CLEAR(data->start_class,ANYOF_SPACE);
1508 for (value = 0; value < 256; value++)
1509 if (isSPACE(value))
1510 ANYOF_BITMAP_CLEAR(data->start_class, value);
1511 }
1512 }
1513 else {
1514 if (data->start_class->flags & ANYOF_LOCALE)
1515 ANYOF_CLASS_SET(data->start_class,ANYOF_NSPACE);
1516 else {
1517 for (value = 0; value < 256; value++)
1518 if (!isSPACE(value))
b81d288d 1519 ANYOF_BITMAP_SET(data->start_class, value);
653099ff
GS
1520 }
1521 }
1522 break;
1523 case NSPACEL:
1524 if (flags & SCF_DO_STCLASS_AND) {
1525 if (data->start_class->flags & ANYOF_LOCALE) {
1526 ANYOF_CLASS_CLEAR(data->start_class,ANYOF_SPACE);
1527 for (value = 0; value < 256; value++)
1528 if (!isSPACE(value))
1529 ANYOF_BITMAP_CLEAR(data->start_class, value);
1530 }
1531 }
1532 else {
1533 data->start_class->flags |= ANYOF_LOCALE;
1534 ANYOF_CLASS_SET(data->start_class,ANYOF_NSPACE);
1535 }
1536 break;
1537 case DIGIT:
1538 if (flags & SCF_DO_STCLASS_AND) {
1539 ANYOF_CLASS_CLEAR(data->start_class,ANYOF_NDIGIT);
1540 for (value = 0; value < 256; value++)
1541 if (!isDIGIT(value))
1542 ANYOF_BITMAP_CLEAR(data->start_class, value);
1543 }
1544 else {
1545 if (data->start_class->flags & ANYOF_LOCALE)
1546 ANYOF_CLASS_SET(data->start_class,ANYOF_DIGIT);
1547 else {
1548 for (value = 0; value < 256; value++)
1549 if (isDIGIT(value))
b81d288d 1550 ANYOF_BITMAP_SET(data->start_class, value);
653099ff
GS
1551 }
1552 }
1553 break;
1554 case NDIGIT:
1555 if (flags & SCF_DO_STCLASS_AND) {
1556 ANYOF_CLASS_CLEAR(data->start_class,ANYOF_DIGIT);
1557 for (value = 0; value < 256; value++)
1558 if (isDIGIT(value))
1559 ANYOF_BITMAP_CLEAR(data->start_class, value);
1560 }
1561 else {
1562 if (data->start_class->flags & ANYOF_LOCALE)
1563 ANYOF_CLASS_SET(data->start_class,ANYOF_NDIGIT);
1564 else {
1565 for (value = 0; value < 256; value++)
1566 if (!isDIGIT(value))
b81d288d 1567 ANYOF_BITMAP_SET(data->start_class, value);
653099ff
GS
1568 }
1569 }
1570 break;
1571 }
1572 if (flags & SCF_DO_STCLASS_OR)
1573 cl_and(data->start_class, &and_with);
1574 flags &= ~SCF_DO_STCLASS;
1575 }
a0ed51b3 1576 }
22c35a8c 1577 else if (PL_regkind[(U8)OP(scan)] == EOL && flags & SCF_DO_SUBSTR) {
c277df42
IZ
1578 data->flags |= (OP(scan) == MEOL
1579 ? SF_BEFORE_MEOL
1580 : SF_BEFORE_SEOL);
a0ed51b3 1581 }
653099ff
GS
1582 else if ( PL_regkind[(U8)OP(scan)] == BRANCHJ
1583 /* Lookbehind, or need to calculate parens/evals/stclass: */
1584 && (scan->flags || data || (flags & SCF_DO_STCLASS))
c277df42 1585 && (OP(scan) == IFMATCH || OP(scan) == UNLESSM)) {
653099ff 1586 /* Lookahead/lookbehind */
cb434fcc 1587 I32 deltanext, minnext, fake = 0;
c277df42 1588 regnode *nscan;
653099ff
GS
1589 struct regnode_charclass_class intrnl;
1590 int f = 0;
c277df42
IZ
1591
1592 data_fake.flags = 0;
b81d288d 1593 if (data) {
2c2d71f5 1594 data_fake.whilem_c = data->whilem_c;
cb434fcc
IZ
1595 data_fake.last_closep = data->last_closep;
1596 }
1597 else
1598 data_fake.last_closep = &fake;
653099ff
GS
1599 if ( flags & SCF_DO_STCLASS && !scan->flags
1600 && OP(scan) == IFMATCH ) { /* Lookahead */
830247a4 1601 cl_init(pRExC_state, &intrnl);
653099ff 1602 data_fake.start_class = &intrnl;
e1901655 1603 f |= SCF_DO_STCLASS_AND;
653099ff 1604 }
e1901655
IZ
1605 if (flags & SCF_WHILEM_VISITED_POS)
1606 f |= SCF_WHILEM_VISITED_POS;
c277df42
IZ
1607 next = regnext(scan);
1608 nscan = NEXTOPER(NEXTOPER(scan));
830247a4 1609 minnext = study_chunk(pRExC_state, &nscan, &deltanext, last, &data_fake, f);
c277df42
IZ
1610 if (scan->flags) {
1611 if (deltanext) {
9baa0206 1612 vFAIL("Variable length lookbehind not implemented");
a0ed51b3
LW
1613 }
1614 else if (minnext > U8_MAX) {
9baa0206 1615 vFAIL2("Lookbehind longer than %"UVuf" not implemented", (UV)U8_MAX);
c277df42 1616 }
eb160463 1617 scan->flags = (U8)minnext;
c277df42
IZ
1618 }
1619 if (data && data_fake.flags & (SF_HAS_PAR|SF_IN_PAR))
1620 pars++;
405ff068 1621 if (data && (data_fake.flags & SF_HAS_EVAL))
c277df42 1622 data->flags |= SF_HAS_EVAL;
2c2d71f5
JH
1623 if (data)
1624 data->whilem_c = data_fake.whilem_c;
e1901655 1625 if (f & SCF_DO_STCLASS_AND) {
653099ff
GS
1626 int was = (data->start_class->flags & ANYOF_EOS);
1627
1628 cl_and(data->start_class, &intrnl);
1629 if (was)
1630 data->start_class->flags |= ANYOF_EOS;
1631 }
a0ed51b3
LW
1632 }
1633 else if (OP(scan) == OPEN) {
c277df42 1634 pars++;
a0ed51b3 1635 }
cb434fcc 1636 else if (OP(scan) == CLOSE) {
eb160463 1637 if ((I32)ARG(scan) == is_par) {
cb434fcc 1638 next = regnext(scan);
c277df42 1639
cb434fcc
IZ
1640 if ( next && (OP(next) != WHILEM) && next < last)
1641 is_par = 0; /* Disable optimization */
1642 }
1643 if (data)
1644 *(data->last_closep) = ARG(scan);
a0ed51b3
LW
1645 }
1646 else if (OP(scan) == EVAL) {
c277df42
IZ
1647 if (data)
1648 data->flags |= SF_HAS_EVAL;
1649 }
96776eda 1650 else if (OP(scan) == LOGICAL && scan->flags == 2) { /* Embedded follows */
0f5d15d6 1651 if (flags & SCF_DO_SUBSTR) {
830247a4 1652 scan_commit(pRExC_state,data);
0f5d15d6
IZ
1653 data->longest = &(data->longest_float);
1654 }
1655 is_inf = is_inf_internal = 1;
653099ff 1656 if (flags & SCF_DO_STCLASS_OR) /* Allow everything */
830247a4 1657 cl_anything(pRExC_state, data->start_class);
96776eda 1658 flags &= ~SCF_DO_STCLASS;
0f5d15d6 1659 }
c277df42
IZ
1660 /* Else: zero-length, ignore. */
1661 scan = regnext(scan);
1662 }
1663
1664 finish:
1665 *scanp = scan;
aca2d497 1666 *deltap = is_inf_internal ? I32_MAX : delta;
b81d288d 1667 if (flags & SCF_DO_SUBSTR && is_inf)
c277df42
IZ
1668 data->pos_delta = I32_MAX - data->pos_min;
1669 if (is_par > U8_MAX)
1670 is_par = 0;
1671 if (is_par && pars==1 && data) {
1672 data->flags |= SF_IN_PAR;
1673 data->flags &= ~SF_HAS_PAR;
a0ed51b3
LW
1674 }
1675 else if (pars && data) {
c277df42
IZ
1676 data->flags |= SF_HAS_PAR;
1677 data->flags &= ~SF_IN_PAR;
1678 }
653099ff
GS
1679 if (flags & SCF_DO_STCLASS_OR)
1680 cl_and(data->start_class, &and_with);
c277df42
IZ
1681 return min;
1682}
1683
76e3520e 1684STATIC I32
830247a4 1685S_add_data(pTHX_ RExC_state_t *pRExC_state, I32 n, char *s)
c277df42 1686{
830247a4 1687 if (RExC_rx->data) {
b81d288d
AB
1688 Renewc(RExC_rx->data,
1689 sizeof(*RExC_rx->data) + sizeof(void*) * (RExC_rx->data->count + n - 1),
c277df42 1690 char, struct reg_data);
830247a4
IZ
1691 Renew(RExC_rx->data->what, RExC_rx->data->count + n, U8);
1692 RExC_rx->data->count += n;
a0ed51b3
LW
1693 }
1694 else {
830247a4 1695 Newc(1207, RExC_rx->data, sizeof(*RExC_rx->data) + sizeof(void*) * (n - 1),
c277df42 1696 char, struct reg_data);
830247a4
IZ
1697 New(1208, RExC_rx->data->what, n, U8);
1698 RExC_rx->data->count = n;
c277df42 1699 }
830247a4
IZ
1700 Copy(s, RExC_rx->data->what + RExC_rx->data->count - n, n, U8);
1701 return RExC_rx->data->count - n;
c277df42
IZ
1702}
1703
d88dccdf 1704void
864dbfa3 1705Perl_reginitcolors(pTHX)
d88dccdf 1706{
d88dccdf
IZ
1707 int i = 0;
1708 char *s = PerlEnv_getenv("PERL_RE_COLORS");
b81d288d 1709
d88dccdf
IZ
1710 if (s) {
1711 PL_colors[0] = s = savepv(s);
1712 while (++i < 6) {
1713 s = strchr(s, '\t');
1714 if (s) {
1715 *s = '\0';
1716 PL_colors[i] = ++s;
1717 }
1718 else
c712d376 1719 PL_colors[i] = s = "";
d88dccdf
IZ
1720 }
1721 } else {
b81d288d 1722 while (i < 6)
d88dccdf
IZ
1723 PL_colors[i++] = "";
1724 }
1725 PL_colorset = 1;
1726}
1727
8615cb43 1728
a687059c 1729/*
e50aee73 1730 - pregcomp - compile a regular expression into internal code
a687059c
LW
1731 *
1732 * We can't allocate space until we know how big the compiled form will be,
1733 * but we can't compile it (and thus know how big it is) until we've got a
1734 * place to put the code. So we cheat: we compile it twice, once with code
1735 * generation turned off and size counting turned on, and once "for real".
1736 * This also means that we don't allocate space until we are sure that the
1737 * thing really will compile successfully, and we never have to move the
1738 * code and thus invalidate pointers into it. (Note that it has to be in
1739 * one piece because free() must be able to free it all.) [NB: not true in perl]
1740 *
1741 * Beware that the optimization-preparation code in here knows about some
1742 * of the structure of the compiled regexp. [I'll say.]
1743 */
1744regexp *
864dbfa3 1745Perl_pregcomp(pTHX_ char *exp, char *xend, PMOP *pm)
a687059c 1746{
a0d0e21e 1747 register regexp *r;
c277df42 1748 regnode *scan;
c277df42 1749 regnode *first;
a0d0e21e 1750 I32 flags;
a0d0e21e
LW
1751 I32 minlen = 0;
1752 I32 sawplus = 0;
1753 I32 sawopen = 0;
2c2d71f5 1754 scan_data_t data;
830247a4
IZ
1755 RExC_state_t RExC_state;
1756 RExC_state_t *pRExC_state = &RExC_state;
a0d0e21e
LW
1757
1758 if (exp == NULL)
c277df42 1759 FAIL("NULL regexp argument");
a0d0e21e 1760
a5961de5 1761 RExC_utf8 = pm->op_pmdynflags & PMdf_CMP_UTF8;
a0ed51b3 1762
5cfc7842 1763 RExC_precomp = exp;
a5961de5
JH
1764 DEBUG_r({
1765 if (!PL_colorset) reginitcolors();
1766 PerlIO_printf(Perl_debug_log, "%sCompiling REx%s `%s%*s%s'\n",
1767 PL_colors[4],PL_colors[5],PL_colors[0],
1768 (int)(xend - exp), RExC_precomp, PL_colors[1]);
1769 });
e2509266 1770 RExC_flags = pm->op_pmflags;
830247a4 1771 RExC_sawback = 0;
bbce6d69 1772
830247a4
IZ
1773 RExC_seen = 0;
1774 RExC_seen_zerolen = *exp == '^' ? -1 : 0;
1775 RExC_seen_evals = 0;
1776 RExC_extralen = 0;
c277df42 1777
bbce6d69 1778 /* First pass: determine size, legality. */
830247a4 1779 RExC_parse = exp;
fac92740 1780 RExC_start = exp;
830247a4
IZ
1781 RExC_end = xend;
1782 RExC_naughty = 0;
1783 RExC_npar = 1;
1784 RExC_size = 0L;
1785 RExC_emit = &PL_regdummy;
1786 RExC_whilem_seen = 0;
85ddcde9
JH
1787#if 0 /* REGC() is (currently) a NOP at the first pass.
1788 * Clever compilers notice this and complain. --jhi */
830247a4 1789 REGC((U8)REG_MAGIC, (char*)RExC_emit);
85ddcde9 1790#endif
830247a4 1791 if (reg(pRExC_state, 0, &flags) == NULL) {
830247a4 1792 RExC_precomp = Nullch;
a0d0e21e
LW
1793 return(NULL);
1794 }
830247a4 1795 DEBUG_r(PerlIO_printf(Perl_debug_log, "size %"IVdf" ", (IV)RExC_size));
c277df42 1796
c277df42
IZ
1797 /* Small enough for pointer-storage convention?
1798 If extralen==0, this means that we will not need long jumps. */
830247a4
IZ
1799 if (RExC_size >= 0x10000L && RExC_extralen)
1800 RExC_size += RExC_extralen;
c277df42 1801 else
830247a4
IZ
1802 RExC_extralen = 0;
1803 if (RExC_whilem_seen > 15)
1804 RExC_whilem_seen = 15;
a0d0e21e 1805
bbce6d69 1806 /* Allocate space and initialize. */
830247a4 1807 Newc(1001, r, sizeof(regexp) + (unsigned)RExC_size * sizeof(regnode),
c277df42 1808 char, regexp);
a0d0e21e 1809 if (r == NULL)
b45f050a
JF
1810 FAIL("Regexp out of space");
1811
0f79a09d
GS
1812#ifdef DEBUGGING
1813 /* avoid reading uninitialized memory in DEBUGGING code in study_chunk() */
830247a4 1814 Zero(r, sizeof(regexp) + (unsigned)RExC_size * sizeof(regnode), char);
0f79a09d 1815#endif
c277df42 1816 r->refcnt = 1;
bbce6d69 1817 r->prelen = xend - exp;
5cfc7842 1818 r->precomp = savepvn(RExC_precomp, r->prelen);
cf93c79d 1819 r->subbeg = NULL;
ed252734
NC
1820#ifdef PERL_COPY_ON_WRITE
1821 r->saved_copy = Nullsv;
1822#endif
cf93c79d 1823 r->reganch = pm->op_pmflags & PMf_COMPILETIME;
830247a4 1824 r->nparens = RExC_npar - 1; /* set early to validate backrefs */
4327152a
IZ
1825
1826 r->substrs = 0; /* Useful during FAIL. */
1827 r->startp = 0; /* Useful during FAIL. */
1828 r->endp = 0; /* Useful during FAIL. */
1829
fac92740
MJD
1830 Newz(1304, r->offsets, 2*RExC_size+1, U32); /* MJD 20001228 */
1831 if (r->offsets) {
1832 r->offsets[0] = RExC_size;
1833 }
1834 DEBUG_r(PerlIO_printf(Perl_debug_log,
392fbf5d 1835 "%s %"UVuf" bytes for offset annotations.\n",
fac92740 1836 r->offsets ? "Got" : "Couldn't get",
392fbf5d 1837 (UV)((2*RExC_size+1) * sizeof(U32))));
fac92740 1838
830247a4 1839 RExC_rx = r;
bbce6d69 1840
1841 /* Second pass: emit code. */
e2509266 1842 RExC_flags = pm->op_pmflags; /* don't let top level (?i) bleed */
830247a4
IZ
1843 RExC_parse = exp;
1844 RExC_end = xend;
1845 RExC_naughty = 0;
1846 RExC_npar = 1;
fac92740 1847 RExC_emit_start = r->program;
830247a4 1848 RExC_emit = r->program;
2cd61cdb 1849 /* Store the count of eval-groups for security checks: */
eb160463 1850 RExC_emit->next_off = (U16)((RExC_seen_evals > U16_MAX) ? U16_MAX : RExC_seen_evals);
830247a4 1851 REGC((U8)REG_MAGIC, (char*) RExC_emit++);
c277df42 1852 r->data = 0;
830247a4 1853 if (reg(pRExC_state, 0, &flags) == NULL)
a0d0e21e
LW
1854 return(NULL);
1855
1856 /* Dig out information for optimizations. */
cf93c79d 1857 r->reganch = pm->op_pmflags & PMf_COMPILETIME; /* Again? */
e2509266 1858 pm->op_pmflags = RExC_flags;
a0ed51b3 1859 if (UTF)
5ff6fc6d 1860 r->reganch |= ROPT_UTF8; /* Unicode in it? */
c277df42 1861 r->regstclass = NULL;
830247a4 1862 if (RExC_naughty >= 10) /* Probably an expensive pattern. */
a0ed51b3 1863 r->reganch |= ROPT_NAUGHTY;
c277df42 1864 scan = r->program + 1; /* First BRANCH. */
2779dcf1
IZ
1865
1866 /* XXXX To minimize changes to RE engine we always allocate
1867 3-units-long substrs field. */
1868 Newz(1004, r->substrs, 1, struct reg_substr_data);
1869
2c2d71f5 1870 StructCopy(&zero_scan_data, &data, scan_data_t);
653099ff 1871 /* XXXX Should not we check for something else? Usually it is OPEN1... */
c277df42 1872 if (OP(scan) != BRANCH) { /* Only one top-level choice. */
c277df42 1873 I32 fake;
c5254dd6 1874 STRLEN longest_float_length, longest_fixed_length;
653099ff
GS
1875 struct regnode_charclass_class ch_class;
1876 int stclass_flag;
cb434fcc 1877 I32 last_close = 0;
a0d0e21e
LW
1878
1879 first = scan;
c277df42 1880 /* Skip introductions and multiplicators >= 1. */
a0d0e21e 1881 while ((OP(first) == OPEN && (sawopen = 1)) ||
653099ff 1882 /* An OR of *one* alternative - should not happen now. */
a0d0e21e
LW
1883 (OP(first) == BRANCH && OP(regnext(first)) != BRANCH) ||
1884 (OP(first) == PLUS) ||
1885 (OP(first) == MINMOD) ||
653099ff 1886 /* An {n,m} with n>0 */
22c35a8c 1887 (PL_regkind[(U8)OP(first)] == CURLY && ARG1(first) > 0) ) {
a0d0e21e
LW
1888 if (OP(first) == PLUS)
1889 sawplus = 1;
1890 else
1891 first += regarglen[(U8)OP(first)];
1892 first = NEXTOPER(first);
a687059c
LW
1893 }
1894
a0d0e21e
LW
1895 /* Starting-point info. */
1896 again:
653099ff 1897 if (PL_regkind[(U8)OP(first)] == EXACT) {
1aa99e6b
IH
1898 if (OP(first) == EXACT)
1899 ; /* Empty, get anchored substr later. */
1900 else if ((OP(first) == EXACTF || OP(first) == EXACTFL))
b3c9acc1
IZ
1901 r->regstclass = first;
1902 }
653099ff 1903 else if (strchr((char*)PL_simple,OP(first)))
a0d0e21e 1904 r->regstclass = first;
22c35a8c
GS
1905 else if (PL_regkind[(U8)OP(first)] == BOUND ||
1906 PL_regkind[(U8)OP(first)] == NBOUND)
a0d0e21e 1907 r->regstclass = first;
22c35a8c 1908 else if (PL_regkind[(U8)OP(first)] == BOL) {
cad2e5aa
JH
1909 r->reganch |= (OP(first) == MBOL
1910 ? ROPT_ANCH_MBOL
1911 : (OP(first) == SBOL
1912 ? ROPT_ANCH_SBOL
1913 : ROPT_ANCH_BOL));
a0d0e21e 1914 first = NEXTOPER(first);
774d564b 1915 goto again;
1916 }
1917 else if (OP(first) == GPOS) {
1918 r->reganch |= ROPT_ANCH_GPOS;
1919 first = NEXTOPER(first);
1920 goto again;
a0d0e21e 1921 }
e09294f4 1922 else if (!sawopen && (OP(first) == STAR &&
22c35a8c 1923 PL_regkind[(U8)OP(NEXTOPER(first))] == REG_ANY) &&
a0d0e21e
LW
1924 !(r->reganch & ROPT_ANCH) )
1925 {
1926 /* turn .* into ^.* with an implied $*=1 */
cad2e5aa
JH
1927 int type = OP(NEXTOPER(first));
1928
ffc61ed2 1929 if (type == REG_ANY)
cad2e5aa
JH
1930 type = ROPT_ANCH_MBOL;
1931 else
1932 type = ROPT_ANCH_SBOL;
1933
1934 r->reganch |= type | ROPT_IMPLICIT;
a0d0e21e 1935 first = NEXTOPER(first);
774d564b 1936 goto again;
a0d0e21e 1937 }
b81d288d 1938 if (sawplus && (!sawopen || !RExC_sawback)
830247a4 1939 && !(RExC_seen & REG_SEEN_EVAL)) /* May examine pos and $& */
cad2e5aa
JH
1940 /* x+ must match at the 1st pos of run of x's */
1941 r->reganch |= ROPT_SKIP;
a0d0e21e 1942
c277df42 1943 /* Scan is after the zeroth branch, first is atomic matcher. */
b81d288d 1944 DEBUG_r(PerlIO_printf(Perl_debug_log, "first at %"IVdf"\n",
b900a521 1945 (IV)(first - scan + 1)));
a0d0e21e
LW
1946 /*
1947 * If there's something expensive in the r.e., find the
1948 * longest literal string that must appear and make it the
1949 * regmust. Resolve ties in favor of later strings, since
1950 * the regstart check works with the beginning of the r.e.
1951 * and avoiding duplication strengthens checking. Not a
1952 * strong reason, but sufficient in the absence of others.
1953 * [Now we resolve ties in favor of the earlier string if
c277df42 1954 * it happens that c_offset_min has been invalidated, since the
a0d0e21e
LW
1955 * earlier string may buy us something the later one won't.]
1956 */
a0d0e21e 1957 minlen = 0;
a687059c 1958
79cb57f6
GS
1959 data.longest_fixed = newSVpvn("",0);
1960 data.longest_float = newSVpvn("",0);
1961 data.last_found = newSVpvn("",0);
c277df42
IZ
1962 data.longest = &(data.longest_fixed);
1963 first = scan;
653099ff 1964 if (!r->regstclass) {
830247a4 1965 cl_init(pRExC_state, &ch_class);
653099ff
GS
1966 data.start_class = &ch_class;
1967 stclass_flag = SCF_DO_STCLASS_AND;
1968 } else /* XXXX Check for BOUND? */
1969 stclass_flag = 0;
cb434fcc 1970 data.last_closep = &last_close;
653099ff 1971
830247a4 1972 minlen = study_chunk(pRExC_state, &first, &fake, scan + RExC_size, /* Up to end */
e1901655 1973 &data, SCF_DO_SUBSTR | SCF_WHILEM_VISITED_POS | stclass_flag);
830247a4 1974 if ( RExC_npar == 1 && data.longest == &(data.longest_fixed)
b81d288d 1975 && data.last_start_min == 0 && data.last_end > 0
830247a4
IZ
1976 && !RExC_seen_zerolen
1977 && (!(RExC_seen & REG_SEEN_GPOS) || (r->reganch & ROPT_ANCH_GPOS)))
c277df42 1978 r->reganch |= ROPT_CHECK_ALL;
830247a4 1979 scan_commit(pRExC_state, &data);
c277df42
IZ
1980 SvREFCNT_dec(data.last_found);
1981
a0ed51b3 1982 longest_float_length = CHR_SVLEN(data.longest_float);
c5254dd6 1983 if (longest_float_length
c277df42
IZ
1984 || (data.flags & SF_FL_BEFORE_EOL
1985 && (!(data.flags & SF_FL_BEFORE_MEOL)
e2509266 1986 || (RExC_flags & PMf_MULTILINE)))) {
cf93c79d
IZ
1987 int t;
1988
a0ed51b3 1989 if (SvCUR(data.longest_fixed) /* ok to leave SvCUR */
aca2d497
IZ
1990 && data.offset_fixed == data.offset_float_min
1991 && SvCUR(data.longest_fixed) == SvCUR(data.longest_float))
1992 goto remove_float; /* As in (a)+. */
1993
33b8afdf
JH
1994 if (SvUTF8(data.longest_float)) {
1995 r->float_utf8 = data.longest_float;
1996 r->float_substr = Nullsv;
1997 } else {
1998 r->float_substr = data.longest_float;
1999 r->float_utf8 = Nullsv;
2000 }
c277df42
IZ
2001 r->float_min_offset = data.offset_float_min;
2002 r->float_max_offset = data.offset_float_max;
cf93c79d
IZ
2003 t = (data.flags & SF_FL_BEFORE_EOL /* Can't have SEOL and MULTI */
2004 && (!(data.flags & SF_FL_BEFORE_MEOL)
e2509266 2005 || (RExC_flags & PMf_MULTILINE)));
33b8afdf 2006 fbm_compile(data.longest_float, t ? FBMcf_TAIL : 0);
a0ed51b3
LW
2007 }
2008 else {
aca2d497 2009 remove_float:
33b8afdf 2010 r->float_substr = r->float_utf8 = Nullsv;
c277df42 2011 SvREFCNT_dec(data.longest_float);
c5254dd6 2012 longest_float_length = 0;
a0d0e21e 2013 }
c277df42 2014
a0ed51b3 2015 longest_fixed_length = CHR_SVLEN(data.longest_fixed);
c5254dd6 2016 if (longest_fixed_length
c277df42
IZ
2017 || (data.flags & SF_FIX_BEFORE_EOL /* Cannot have SEOL and MULTI */
2018 && (!(data.flags & SF_FIX_BEFORE_MEOL)
e2509266 2019 || (RExC_flags & PMf_MULTILINE)))) {
cf93c79d
IZ
2020 int t;
2021
33b8afdf
JH
2022 if (SvUTF8(data.longest_fixed)) {
2023 r->anchored_utf8 = data.longest_fixed;
2024 r->anchored_substr = Nullsv;
2025 } else {
2026 r->anchored_substr = data.longest_fixed;
2027 r->anchored_utf8 = Nullsv;
2028 }
c277df42 2029 r->anchored_offset = data.offset_fixed;
cf93c79d
IZ
2030 t = (data.flags & SF_FIX_BEFORE_EOL /* Can't have SEOL and MULTI */
2031 && (!(data.flags & SF_FIX_BEFORE_MEOL)
e2509266 2032 || (RExC_flags & PMf_MULTILINE)));
33b8afdf 2033 fbm_compile(data.longest_fixed, t ? FBMcf_TAIL : 0);
a0ed51b3
LW
2034 }
2035 else {
33b8afdf 2036 r->anchored_substr = r->anchored_utf8 = Nullsv;
c277df42 2037 SvREFCNT_dec(data.longest_fixed);
c5254dd6 2038 longest_fixed_length = 0;
a0d0e21e 2039 }
b81d288d 2040 if (r->regstclass
ffc61ed2 2041 && (OP(r->regstclass) == REG_ANY || OP(r->regstclass) == SANY))
653099ff 2042 r->regstclass = NULL;
33b8afdf
JH
2043 if ((!(r->anchored_substr || r->anchored_utf8) || r->anchored_offset)
2044 && stclass_flag
653099ff 2045 && !(data.start_class->flags & ANYOF_EOS)
eb160463
GS
2046 && !cl_is_anything(data.start_class))
2047 {
830247a4 2048 I32 n = add_data(pRExC_state, 1, "f");
653099ff 2049
b81d288d 2050 New(1006, RExC_rx->data->data[n], 1,
653099ff
GS
2051 struct regnode_charclass_class);
2052 StructCopy(data.start_class,
830247a4 2053 (struct regnode_charclass_class*)RExC_rx->data->data[n],
653099ff 2054 struct regnode_charclass_class);
830247a4 2055 r->regstclass = (regnode*)RExC_rx->data->data[n];
653099ff 2056 r->reganch &= ~ROPT_SKIP; /* Used in find_byclass(). */
ffc61ed2 2057 PL_regdata = r->data; /* for regprop() */
9c5ffd7c
JH
2058 DEBUG_r({ SV *sv = sv_newmortal();
2059 regprop(sv, (regnode*)data.start_class);
2060 PerlIO_printf(Perl_debug_log,
2061 "synthetic stclass `%s'.\n",
2062 SvPVX(sv));});
653099ff 2063 }
c277df42
IZ
2064
2065 /* A temporary algorithm prefers floated substr to fixed one to dig more info. */
c5254dd6 2066 if (longest_fixed_length > longest_float_length) {
c277df42 2067 r->check_substr = r->anchored_substr;
33b8afdf 2068 r->check_utf8 = r->anchored_utf8;
c277df42
IZ
2069 r->check_offset_min = r->check_offset_max = r->anchored_offset;
2070 if (r->reganch & ROPT_ANCH_SINGLE)
2071 r->reganch |= ROPT_NOSCAN;
a0ed51b3
LW
2072 }
2073 else {
c277df42 2074 r->check_substr = r->float_substr;
33b8afdf 2075 r->check_utf8 = r->float_utf8;
c277df42
IZ
2076 r->check_offset_min = data.offset_float_min;
2077 r->check_offset_max = data.offset_float_max;
a0d0e21e 2078 }
30382c73
IZ
2079 /* XXXX Currently intuiting is not compatible with ANCH_GPOS.
2080 This should be changed ASAP! */
33b8afdf 2081 if ((r->check_substr || r->check_utf8) && !(r->reganch & ROPT_ANCH_GPOS)) {
cad2e5aa 2082 r->reganch |= RE_USE_INTUIT;
33b8afdf 2083 if (SvTAIL(r->check_substr ? r->check_substr : r->check_utf8))
cad2e5aa
JH
2084 r->reganch |= RE_INTUIT_TAIL;
2085 }
a0ed51b3
LW
2086 }
2087 else {
c277df42
IZ
2088 /* Several toplevels. Best we can is to set minlen. */
2089 I32 fake;
653099ff 2090 struct regnode_charclass_class ch_class;
cb434fcc 2091 I32 last_close = 0;
c277df42
IZ
2092
2093 DEBUG_r(PerlIO_printf(Perl_debug_log, "\n"));
2094 scan = r->program + 1;
830247a4 2095 cl_init(pRExC_state, &ch_class);
653099ff 2096 data.start_class = &ch_class;
cb434fcc 2097 data.last_closep = &last_close;
e1901655 2098 minlen = study_chunk(pRExC_state, &scan, &fake, scan + RExC_size, &data, SCF_DO_STCLASS_AND|SCF_WHILEM_VISITED_POS);
33b8afdf
JH
2099 r->check_substr = r->check_utf8 = r->anchored_substr = r->anchored_utf8
2100 = r->float_substr = r->float_utf8 = Nullsv;
653099ff 2101 if (!(data.start_class->flags & ANYOF_EOS)
eb160463
GS
2102 && !cl_is_anything(data.start_class))
2103 {
830247a4 2104 I32 n = add_data(pRExC_state, 1, "f");
653099ff 2105
b81d288d 2106 New(1006, RExC_rx->data->data[n], 1,
653099ff
GS
2107 struct regnode_charclass_class);
2108 StructCopy(data.start_class,
830247a4 2109 (struct regnode_charclass_class*)RExC_rx->data->data[n],
653099ff 2110 struct regnode_charclass_class);
830247a4 2111 r->regstclass = (regnode*)RExC_rx->data->data[n];
653099ff 2112 r->reganch &= ~ROPT_SKIP; /* Used in find_byclass(). */
9c5ffd7c
JH
2113 DEBUG_r({ SV* sv = sv_newmortal();
2114 regprop(sv, (regnode*)data.start_class);
2115 PerlIO_printf(Perl_debug_log,
2116 "synthetic stclass `%s'.\n",
2117 SvPVX(sv));});
653099ff 2118 }
a0d0e21e
LW
2119 }
2120
a0d0e21e 2121 r->minlen = minlen;
b81d288d 2122 if (RExC_seen & REG_SEEN_GPOS)
c277df42 2123 r->reganch |= ROPT_GPOS_SEEN;
830247a4 2124 if (RExC_seen & REG_SEEN_LOOKBEHIND)
c277df42 2125 r->reganch |= ROPT_LOOKBEHIND_SEEN;
830247a4 2126 if (RExC_seen & REG_SEEN_EVAL)
ce862d02 2127 r->reganch |= ROPT_EVAL_SEEN;
f33976b4
DB
2128 if (RExC_seen & REG_SEEN_CANY)
2129 r->reganch |= ROPT_CANY_SEEN;
830247a4
IZ
2130 Newz(1002, r->startp, RExC_npar, I32);
2131 Newz(1002, r->endp, RExC_npar, I32);
ffc61ed2 2132 PL_regdata = r->data; /* for regprop() */
a0d0e21e
LW
2133 DEBUG_r(regdump(r));
2134 return(r);
a687059c
LW
2135}
2136
2137/*
2138 - reg - regular expression, i.e. main body or parenthesized thing
2139 *
2140 * Caller must absorb opening parenthesis.
2141 *
2142 * Combining parenthesis handling with the base level of regular expression
2143 * is a trifle forced, but the need to tie the tails of the branches to what
2144 * follows makes it hard to avoid.
2145 */
76e3520e 2146STATIC regnode *
830247a4 2147S_reg(pTHX_ RExC_state_t *pRExC_state, I32 paren, I32 *flagp)
c277df42 2148 /* paren: Parenthesized? 0=top, 1=(, inside: changed to letter. */
a687059c 2149{
c277df42
IZ
2150 register regnode *ret; /* Will be the head of the group. */
2151 register regnode *br;
2152 register regnode *lastbr;
2153 register regnode *ender = 0;
a0d0e21e 2154 register I32 parno = 0;
e2509266 2155 I32 flags, oregflags = RExC_flags, have_branch = 0, open = 0;
9d1d55b5
JP
2156
2157 /* for (?g), (?gc), and (?o) warnings; warning
2158 about (?c) will warn about (?g) -- japhy */
2159
2160 I32 wastedflags = 0x00,
2161 wasted_o = 0x01,
2162 wasted_g = 0x02,
2163 wasted_gc = 0x02 | 0x04,
2164 wasted_c = 0x04;
2165
fac92740 2166 char * parse_start = RExC_parse; /* MJD */
830247a4 2167 char *oregcomp_parse = RExC_parse;
c277df42 2168 char c;
a0d0e21e 2169
821b33a5 2170 *flagp = 0; /* Tentatively. */
a0d0e21e 2171
9d1d55b5 2172
a0d0e21e
LW
2173 /* Make an OPEN node, if parenthesized. */
2174 if (paren) {
fac92740 2175 if (*RExC_parse == '?') { /* (?...) */
2b36a5a0
JH
2176 U32 posflags = 0, negflags = 0;
2177 U32 *flagsp = &posflags;
0f5d15d6 2178 int logical = 0;
830247a4 2179 char *seqstart = RExC_parse;
ca9dfc88 2180
830247a4
IZ
2181 RExC_parse++;
2182 paren = *RExC_parse++;
c277df42 2183 ret = NULL; /* For look-ahead/behind. */
a0d0e21e 2184 switch (paren) {
fac92740 2185 case '<': /* (?<...) */
830247a4 2186 RExC_seen |= REG_SEEN_LOOKBEHIND;
b81d288d 2187 if (*RExC_parse == '!')
c277df42 2188 paren = ',';
b81d288d 2189 if (*RExC_parse != '=' && *RExC_parse != '!')
c277df42 2190 goto unknown;
830247a4 2191 RExC_parse++;
fac92740
MJD
2192 case '=': /* (?=...) */
2193 case '!': /* (?!...) */
830247a4 2194 RExC_seen_zerolen++;
fac92740
MJD
2195 case ':': /* (?:...) */
2196 case '>': /* (?>...) */
a0d0e21e 2197 break;
fac92740
MJD
2198 case '$': /* (?$...) */
2199 case '@': /* (?@...) */
8615cb43 2200 vFAIL2("Sequence (?%c...) not implemented", (int)paren);
a0d0e21e 2201 break;
fac92740 2202 case '#': /* (?#...) */
830247a4
IZ
2203 while (*RExC_parse && *RExC_parse != ')')
2204 RExC_parse++;
2205 if (*RExC_parse != ')')
c277df42 2206 FAIL("Sequence (?#... not terminated");
830247a4 2207 nextchar(pRExC_state);
a0d0e21e
LW
2208 *flagp = TRYAGAIN;
2209 return NULL;
fac92740 2210 case 'p': /* (?p...) */
9014280d 2211 if (SIZE_ONLY && ckWARN2(WARN_DEPRECATED, WARN_REGEXP))
f9373011 2212 vWARNdep(RExC_parse, "(?p{}) is deprecated - use (??{})");
8c8ad484 2213 /* FALL THROUGH*/
fac92740 2214 case '?': /* (??...) */
0f5d15d6 2215 logical = 1;
438a3801
YST
2216 if (*RExC_parse != '{')
2217 goto unknown;
830247a4 2218 paren = *RExC_parse++;
0f5d15d6 2219 /* FALL THROUGH */
fac92740 2220 case '{': /* (?{...}) */
c277df42 2221 {
c277df42
IZ
2222 I32 count = 1, n = 0;
2223 char c;
830247a4 2224 char *s = RExC_parse;
c277df42
IZ
2225 SV *sv;
2226 OP_4tree *sop, *rop;
2227
830247a4
IZ
2228 RExC_seen_zerolen++;
2229 RExC_seen |= REG_SEEN_EVAL;
2230 while (count && (c = *RExC_parse)) {
2231 if (c == '\\' && RExC_parse[1])
2232 RExC_parse++;
b81d288d 2233 else if (c == '{')
c277df42 2234 count++;
b81d288d 2235 else if (c == '}')
c277df42 2236 count--;
830247a4 2237 RExC_parse++;
c277df42 2238 }
830247a4 2239 if (*RExC_parse != ')')
b45f050a 2240 {
b81d288d 2241 RExC_parse = s;
b45f050a
JF
2242 vFAIL("Sequence (?{...}) not terminated or not {}-balanced");
2243 }
c277df42 2244 if (!SIZE_ONLY) {
f3548bdc 2245 PAD *pad;
b81d288d
AB
2246
2247 if (RExC_parse - 1 - s)
830247a4 2248 sv = newSVpvn(s, RExC_parse - 1 - s);
c277df42 2249 else
79cb57f6 2250 sv = newSVpvn("", 0);
c277df42 2251
569233ed
SB
2252 ENTER;
2253 Perl_save_re_context(aTHX);
f3548bdc 2254 rop = sv_compile_2op(sv, &sop, "re", &pad);
9b978d73
DM
2255 sop->op_private |= OPpREFCOUNTED;
2256 /* re_dup will OpREFCNT_inc */
2257 OpREFCNT_set(sop, 1);
569233ed 2258 LEAVE;
c277df42 2259
830247a4
IZ
2260 n = add_data(pRExC_state, 3, "nop");
2261 RExC_rx->data->data[n] = (void*)rop;
2262 RExC_rx->data->data[n+1] = (void*)sop;
f3548bdc 2263 RExC_rx->data->data[n+2] = (void*)pad;
c277df42 2264 SvREFCNT_dec(sv);
a0ed51b3 2265 }
e24b16f9 2266 else { /* First pass */
830247a4 2267 if (PL_reginterp_cnt < ++RExC_seen_evals
923e4eb5 2268 && IN_PERL_RUNTIME)
2cd61cdb
IZ
2269 /* No compiled RE interpolated, has runtime
2270 components ===> unsafe. */
2271 FAIL("Eval-group not allowed at runtime, use re 'eval'");
5b61d3f7 2272 if (PL_tainting && PL_tainted)
cc6b7395 2273 FAIL("Eval-group in insecure regular expression");
923e4eb5 2274 if (IN_PERL_COMPILETIME)
b5c19bd7 2275 PL_cv_has_eval = 1;
c277df42 2276 }
b5c19bd7 2277
830247a4 2278 nextchar(pRExC_state);
0f5d15d6 2279 if (logical) {
830247a4 2280 ret = reg_node(pRExC_state, LOGICAL);
0f5d15d6
IZ
2281 if (!SIZE_ONLY)
2282 ret->flags = 2;
830247a4 2283 regtail(pRExC_state, ret, reganode(pRExC_state, EVAL, n));
fac92740 2284 /* deal with the length of this later - MJD */
0f5d15d6
IZ
2285 return ret;
2286 }
ccb2c380
MP
2287 ret = reganode(pRExC_state, EVAL, n);
2288 Set_Node_Length(ret, RExC_parse - parse_start + 1);
2289 Set_Node_Offset(ret, parse_start);
2290 return ret;
c277df42 2291 }
fac92740 2292 case '(': /* (?(?{...})...) and (?(?=...)...) */
c277df42 2293 {
fac92740 2294 if (RExC_parse[0] == '?') { /* (?(?...)) */
b81d288d
AB
2295 if (RExC_parse[1] == '=' || RExC_parse[1] == '!'
2296 || RExC_parse[1] == '<'
830247a4 2297 || RExC_parse[1] == '{') { /* Lookahead or eval. */
c277df42
IZ
2298 I32 flag;
2299
830247a4 2300 ret = reg_node(pRExC_state, LOGICAL);
0f5d15d6
IZ
2301 if (!SIZE_ONLY)
2302 ret->flags = 1;
830247a4 2303 regtail(pRExC_state, ret, reg(pRExC_state, 1, &flag));
c277df42 2304 goto insert_if;
b81d288d 2305 }
a0ed51b3 2306 }
830247a4 2307 else if (RExC_parse[0] >= '1' && RExC_parse[0] <= '9' ) {
fac92740 2308 /* (?(1)...) */
830247a4 2309 parno = atoi(RExC_parse++);
c277df42 2310
830247a4
IZ
2311 while (isDIGIT(*RExC_parse))
2312 RExC_parse++;
fac92740
MJD
2313 ret = reganode(pRExC_state, GROUPP, parno);
2314
830247a4 2315 if ((c = *nextchar(pRExC_state)) != ')')
b45f050a 2316 vFAIL("Switch condition not recognized");
c277df42 2317 insert_if:
830247a4
IZ
2318 regtail(pRExC_state, ret, reganode(pRExC_state, IFTHEN, 0));
2319 br = regbranch(pRExC_state, &flags, 1);
c277df42 2320 if (br == NULL)
830247a4 2321 br = reganode(pRExC_state, LONGJMP, 0);
c277df42 2322 else
830247a4
IZ
2323 regtail(pRExC_state, br, reganode(pRExC_state, LONGJMP, 0));
2324 c = *nextchar(pRExC_state);
d1b80229
IZ
2325 if (flags&HASWIDTH)
2326 *flagp |= HASWIDTH;
c277df42 2327 if (c == '|') {
830247a4
IZ
2328 lastbr = reganode(pRExC_state, IFTHEN, 0); /* Fake one for optimizer. */
2329 regbranch(pRExC_state, &flags, 1);
2330 regtail(pRExC_state, ret, lastbr);
d1b80229
IZ
2331 if (flags&HASWIDTH)
2332 *flagp |= HASWIDTH;
830247a4 2333 c = *nextchar(pRExC_state);
a0ed51b3
LW
2334 }
2335 else
c277df42
IZ
2336 lastbr = NULL;
2337 if (c != ')')
8615cb43 2338 vFAIL("Switch (?(condition)... contains too many branches");
830247a4
IZ
2339 ender = reg_node(pRExC_state, TAIL);
2340 regtail(pRExC_state, br, ender);
c277df42 2341 if (lastbr) {
830247a4
IZ
2342 regtail(pRExC_state, lastbr, ender);
2343 regtail(pRExC_state, NEXTOPER(NEXTOPER(lastbr)), ender);
a0ed51b3
LW
2344 }
2345 else
830247a4 2346 regtail(pRExC_state, ret, ender);
c277df42 2347 return ret;
a0ed51b3
LW
2348 }
2349 else {
830247a4 2350 vFAIL2("Unknown switch condition (?(%.2s", RExC_parse);
c277df42
IZ
2351 }
2352 }
1b1626e4 2353 case 0:
830247a4 2354 RExC_parse--; /* for vFAIL to print correctly */
8615cb43 2355 vFAIL("Sequence (? incomplete");
1b1626e4 2356 break;
a0d0e21e 2357 default:
830247a4 2358 --RExC_parse;
fac92740 2359 parse_flags: /* (?i) */
830247a4 2360 while (*RExC_parse && strchr("iogcmsx", *RExC_parse)) {
9d1d55b5
JP
2361 /* (?g), (?gc) and (?o) are useless here
2362 and must be globally applied -- japhy */
2363
2364 if (*RExC_parse == 'o' || *RExC_parse == 'g') {
2365 if (SIZE_ONLY && ckWARN(WARN_REGEXP)) {
2366 I32 wflagbit = *RExC_parse == 'o' ? wasted_o : wasted_g;
2367 if (! (wastedflags & wflagbit) ) {
2368 wastedflags |= wflagbit;
2369 vWARN5(
2370 RExC_parse + 1,
2371 "Useless (%s%c) - %suse /%c modifier",
2372 flagsp == &negflags ? "?-" : "?",
2373 *RExC_parse,
2374 flagsp == &negflags ? "don't " : "",
2375 *RExC_parse
2376 );
2377 }
2378 }
2379 }
2380 else if (*RExC_parse == 'c') {
2381 if (SIZE_ONLY && ckWARN(WARN_REGEXP)) {
2382 if (! (wastedflags & wasted_c) ) {
2383 wastedflags |= wasted_gc;
2384 vWARN3(
2385 RExC_parse + 1,
2386 "Useless (%sc) - %suse /gc modifier",
2387 flagsp == &negflags ? "?-" : "?",
2388 flagsp == &negflags ? "don't " : ""
2389 );
2390 }
2391 }
2392 }
2393 else { pmflag(flagsp, *RExC_parse); }
2394
830247a4 2395 ++RExC_parse;
ca9dfc88 2396 }
830247a4 2397 if (*RExC_parse == '-') {
ca9dfc88 2398 flagsp = &negflags;
9d1d55b5 2399 wastedflags = 0; /* reset so (?g-c) warns twice */
830247a4 2400 ++RExC_parse;
ca9dfc88 2401 goto parse_flags;
48c036b1 2402 }
e2509266
JH
2403 RExC_flags |= posflags;
2404 RExC_flags &= ~negflags;
830247a4
IZ
2405 if (*RExC_parse == ':') {
2406 RExC_parse++;
ca9dfc88
IZ
2407 paren = ':';
2408 break;
2409 }
c277df42 2410 unknown:
830247a4
IZ
2411 if (*RExC_parse != ')') {
2412 RExC_parse++;
2413 vFAIL3("Sequence (%.*s...) not recognized", RExC_parse-seqstart, seqstart);
b45f050a 2414 }
830247a4 2415 nextchar(pRExC_state);
a0d0e21e
LW
2416 *flagp = TRYAGAIN;
2417 return NULL;
2418 }
2419 }
fac92740 2420 else { /* (...) */
830247a4
IZ
2421 parno = RExC_npar;
2422 RExC_npar++;
2423 ret = reganode(pRExC_state, OPEN, parno);
fac92740
MJD
2424 Set_Node_Length(ret, 1); /* MJD */
2425 Set_Node_Offset(ret, RExC_parse); /* MJD */
c277df42 2426 open = 1;
a0d0e21e 2427 }
a0ed51b3 2428 }
fac92740 2429 else /* ! paren */
a0d0e21e
LW
2430 ret = NULL;
2431
2432 /* Pick up the branches, linking them together. */
fac92740 2433 parse_start = RExC_parse; /* MJD */
830247a4 2434 br = regbranch(pRExC_state, &flags, 1);
fac92740
MJD
2435 /* branch_len = (paren != 0); */
2436
a0d0e21e
LW
2437 if (br == NULL)
2438 return(NULL);
830247a4
IZ
2439 if (*RExC_parse == '|') {
2440 if (!SIZE_ONLY && RExC_extralen) {
2441 reginsert(pRExC_state, BRANCHJ, br);
a0ed51b3 2442 }
fac92740 2443 else { /* MJD */
830247a4 2444 reginsert(pRExC_state, BRANCH, br);
fac92740
MJD
2445 Set_Node_Length(br, paren != 0);
2446 Set_Node_Offset_To_R(br-RExC_emit_start, parse_start-RExC_start);
2447 }
c277df42
IZ
2448 have_branch = 1;
2449 if (SIZE_ONLY)
830247a4 2450 RExC_extralen += 1; /* For BRANCHJ-BRANCH. */
a0ed51b3
LW
2451 }
2452 else if (paren == ':') {
c277df42
IZ
2453 *flagp |= flags&SIMPLE;
2454 }
2455 if (open) { /* Starts with OPEN. */
830247a4 2456 regtail(pRExC_state, ret, br); /* OPEN -> first. */
a0ed51b3
LW
2457 }
2458 else if (paren != '?') /* Not Conditional */
a0d0e21e 2459 ret = br;
32a0ca98 2460 *flagp |= flags & (SPSTART | HASWIDTH);
c277df42 2461 lastbr = br;
830247a4
IZ
2462 while (*RExC_parse == '|') {
2463 if (!SIZE_ONLY && RExC_extralen) {
2464 ender = reganode(pRExC_state, LONGJMP,0);
2465 regtail(pRExC_state, NEXTOPER(NEXTOPER(lastbr)), ender); /* Append to the previous. */
c277df42
IZ
2466 }
2467 if (SIZE_ONLY)
830247a4
IZ
2468 RExC_extralen += 2; /* Account for LONGJMP. */
2469 nextchar(pRExC_state);
2470 br = regbranch(pRExC_state, &flags, 0);
fac92740 2471
a687059c 2472 if (br == NULL)
a0d0e21e 2473 return(NULL);
830247a4 2474 regtail(pRExC_state, lastbr, br); /* BRANCH -> BRANCH. */
c277df42 2475 lastbr = br;
821b33a5
IZ
2476 if (flags&HASWIDTH)
2477 *flagp |= HASWIDTH;
a687059c 2478 *flagp |= flags&SPSTART;
a0d0e21e
LW
2479 }
2480
c277df42
IZ
2481 if (have_branch || paren != ':') {
2482 /* Make a closing node, and hook it on the end. */
2483 switch (paren) {
2484 case ':':
830247a4 2485 ender = reg_node(pRExC_state, TAIL);
c277df42
IZ
2486 break;
2487 case 1:
830247a4 2488 ender = reganode(pRExC_state, CLOSE, parno);
fac92740
MJD
2489 Set_Node_Offset(ender,RExC_parse+1); /* MJD */
2490 Set_Node_Length(ender,1); /* MJD */
c277df42
IZ
2491 break;
2492 case '<':
c277df42
IZ
2493 case ',':
2494 case '=':
2495 case '!':
c277df42 2496 *flagp &= ~HASWIDTH;
821b33a5
IZ
2497 /* FALL THROUGH */
2498 case '>':
830247a4 2499 ender = reg_node(pRExC_state, SUCCEED);
c277df42
IZ
2500 break;
2501 case 0:
830247a4 2502 ender = reg_node(pRExC_state, END);
c277df42
IZ
2503 break;
2504 }
830247a4 2505 regtail(pRExC_state, lastbr, ender);
a0d0e21e 2506
c277df42
IZ
2507 if (have_branch) {
2508 /* Hook the tails of the branches to the closing node. */
2509 for (br = ret; br != NULL; br = regnext(br)) {
830247a4 2510 regoptail(pRExC_state, br, ender);
c277df42
IZ
2511 }
2512 }
a0d0e21e 2513 }
c277df42
IZ
2514
2515 {
2516 char *p;
2517 static char parens[] = "=!<,>";
2518
2519 if (paren && (p = strchr(parens, paren))) {
eb160463 2520 U8 node = ((p - parens) % 2) ? UNLESSM : IFMATCH;
c277df42
IZ
2521 int flag = (p - parens) > 1;
2522
2523 if (paren == '>')
2524 node = SUSPEND, flag = 0;
830247a4 2525 reginsert(pRExC_state, node,ret);
45948336
EP
2526 Set_Node_Cur_Length(ret);
2527 Set_Node_Offset(ret, parse_start + 1);
c277df42 2528 ret->flags = flag;
830247a4 2529 regtail(pRExC_state, ret, reg_node(pRExC_state, TAIL));
c277df42 2530 }
a0d0e21e
LW
2531 }
2532
2533 /* Check for proper termination. */
ce3e6498 2534 if (paren) {
e2509266 2535 RExC_flags = oregflags;
830247a4
IZ
2536 if (RExC_parse >= RExC_end || *nextchar(pRExC_state) != ')') {
2537 RExC_parse = oregcomp_parse;
380a0633 2538 vFAIL("Unmatched (");
ce3e6498 2539 }
a0ed51b3 2540 }
830247a4
IZ
2541 else if (!paren && RExC_parse < RExC_end) {
2542 if (*RExC_parse == ')') {
2543 RExC_parse++;
380a0633 2544 vFAIL("Unmatched )");
a0ed51b3
LW
2545 }
2546 else
b45f050a 2547 FAIL("Junk on end of regexp"); /* "Can't happen". */
a0d0e21e
LW
2548 /* NOTREACHED */
2549 }
a687059c 2550
a0d0e21e 2551 return(ret);
a687059c
LW
2552}
2553
2554/*
2555 - regbranch - one alternative of an | operator
2556 *
2557 * Implements the concatenation operator.
2558 */
76e3520e 2559STATIC regnode *
830247a4 2560S_regbranch(pTHX_ RExC_state_t *pRExC_state, I32 *flagp, I32 first)
a687059c 2561{
c277df42
IZ
2562 register regnode *ret;
2563 register regnode *chain = NULL;
2564 register regnode *latest;
2565 I32 flags = 0, c = 0;
a0d0e21e 2566
b81d288d 2567 if (first)
c277df42
IZ
2568 ret = NULL;
2569 else {
b81d288d 2570 if (!SIZE_ONLY && RExC_extralen)
830247a4 2571 ret = reganode(pRExC_state, BRANCHJ,0);
fac92740 2572 else {
830247a4 2573 ret = reg_node(pRExC_state, BRANCH);
fac92740
MJD
2574 Set_Node_Length(ret, 1);
2575 }
c277df42
IZ
2576 }
2577
b81d288d 2578 if (!first && SIZE_ONLY)
830247a4 2579 RExC_extralen += 1; /* BRANCHJ */
b81d288d 2580
c277df42 2581 *flagp = WORST; /* Tentatively. */
a0d0e21e 2582
830247a4
IZ
2583 RExC_parse--;
2584 nextchar(pRExC_state);
2585 while (RExC_parse < RExC_end && *RExC_parse != '|' && *RExC_parse != ')') {
a0d0e21e 2586 flags &= ~TRYAGAIN;
830247a4 2587 latest = regpiece(pRExC_state, &flags);
a0d0e21e
LW
2588 if (latest == NULL) {
2589 if (flags & TRYAGAIN)
2590 continue;
2591 return(NULL);
a0ed51b3
LW
2592 }
2593 else if (ret == NULL)
c277df42 2594 ret = latest;
a0d0e21e 2595 *flagp |= flags&HASWIDTH;
c277df42 2596 if (chain == NULL) /* First piece. */
a0d0e21e
LW
2597 *flagp |= flags&SPSTART;
2598 else {
830247a4
IZ
2599 RExC_naughty++;
2600 regtail(pRExC_state, chain, latest);
a687059c 2601 }
a0d0e21e 2602 chain = latest;
c277df42
IZ
2603 c++;
2604 }
2605 if (chain == NULL) { /* Loop ran zero times. */
830247a4 2606 chain = reg_node(pRExC_state, NOTHING);
c277df42
IZ
2607 if (ret == NULL)
2608 ret = chain;
2609 }
2610 if (c == 1) {
2611 *flagp |= flags&SIMPLE;
a0d0e21e 2612 }
a687059c 2613
a0d0e21e 2614 return(ret);
a687059c
LW
2615}
2616
2617/*
2618 - regpiece - something followed by possible [*+?]
2619 *
2620 * Note that the branching code sequences used for ? and the general cases
2621 * of * and + are somewhat optimized: they use the same NOTHING node as
2622 * both the endmarker for their branch list and the body of the last branch.
2623 * It might seem that this node could be dispensed with entirely, but the
2624 * endmarker role is not redundant.
2625 */
76e3520e 2626STATIC regnode *
830247a4 2627S_regpiece(pTHX_ RExC_state_t *pRExC_state, I32 *flagp)
a687059c 2628{
c277df42 2629 register regnode *ret;
a0d0e21e
LW
2630 register char op;
2631 register char *next;
2632 I32 flags;
830247a4 2633 char *origparse = RExC_parse;
a0d0e21e
LW
2634 char *maxpos;
2635 I32 min;
c277df42 2636 I32 max = REG_INFTY;
fac92740 2637 char *parse_start;
a0d0e21e 2638
830247a4 2639 ret = regatom(pRExC_state, &flags);
a0d0e21e
LW
2640 if (ret == NULL) {
2641 if (flags & TRYAGAIN)
2642 *flagp |= TRYAGAIN;
2643 return(NULL);
2644 }
2645
830247a4 2646 op = *RExC_parse;
a0d0e21e 2647
830247a4 2648 if (op == '{' && regcurly(RExC_parse)) {
fac92740 2649 parse_start = RExC_parse; /* MJD */
830247a4 2650 next = RExC_parse + 1;
a0d0e21e
LW
2651 maxpos = Nullch;
2652 while (isDIGIT(*next) || *next == ',') {
2653 if (*next == ',') {
2654 if (maxpos)
2655 break;
2656 else
2657 maxpos = next;
a687059c 2658 }
a0d0e21e
LW
2659 next++;
2660 }
2661 if (*next == '}') { /* got one */
2662 if (!maxpos)
2663 maxpos = next;
830247a4
IZ
2664 RExC_parse++;
2665 min = atoi(RExC_parse);
a0d0e21e
LW
2666 if (*maxpos == ',')
2667 maxpos++;
2668 else
830247a4 2669 maxpos = RExC_parse;
a0d0e21e
LW
2670 max = atoi(maxpos);
2671 if (!max && *maxpos != '0')
c277df42
IZ
2672 max = REG_INFTY; /* meaning "infinity" */
2673 else if (max >= REG_INFTY)
8615cb43 2674 vFAIL2("Quantifier in {,} bigger than %d", REG_INFTY - 1);
830247a4
IZ
2675 RExC_parse = next;
2676 nextchar(pRExC_state);
a0d0e21e
LW
2677
2678 do_curly:
2679 if ((flags&SIMPLE)) {
830247a4
IZ
2680 RExC_naughty += 2 + RExC_naughty / 2;
2681 reginsert(pRExC_state, CURLY, ret);
fac92740
MJD
2682 Set_Node_Offset(ret, parse_start+1); /* MJD */
2683 Set_Node_Cur_Length(ret);
a0d0e21e
LW
2684 }
2685 else {
830247a4 2686 regnode *w = reg_node(pRExC_state, WHILEM);
2c2d71f5
JH
2687
2688 w->flags = 0;
830247a4
IZ
2689 regtail(pRExC_state, ret, w);
2690 if (!SIZE_ONLY && RExC_extralen) {
2691 reginsert(pRExC_state, LONGJMP,ret);
2692 reginsert(pRExC_state, NOTHING,ret);
c277df42
IZ
2693 NEXT_OFF(ret) = 3; /* Go over LONGJMP. */
2694 }
830247a4 2695 reginsert(pRExC_state, CURLYX,ret);
fac92740
MJD
2696 /* MJD hk */
2697 Set_Node_Offset(ret, parse_start+1);
2698 Set_Node_Length(ret,
2699 op == '{' ? (RExC_parse - parse_start) : 1);
2700
830247a4 2701 if (!SIZE_ONLY && RExC_extralen)
c277df42 2702 NEXT_OFF(ret) = 3; /* Go over NOTHING to LONGJMP. */
830247a4 2703 regtail(pRExC_state, ret, reg_node(pRExC_state, NOTHING));
c277df42 2704 if (SIZE_ONLY)
830247a4
IZ
2705 RExC_whilem_seen++, RExC_extralen += 3;
2706 RExC_naughty += 4 + RExC_naughty; /* compound interest */
a0d0e21e 2707 }
c277df42 2708 ret->flags = 0;
a0d0e21e
LW
2709
2710 if (min > 0)
821b33a5
IZ
2711 *flagp = WORST;
2712 if (max > 0)
2713 *flagp |= HASWIDTH;
a0d0e21e 2714 if (max && max < min)
8615cb43 2715 vFAIL("Can't do {n,m} with n > m");
c277df42 2716 if (!SIZE_ONLY) {
eb160463
GS
2717 ARG1_SET(ret, (U16)min);
2718 ARG2_SET(ret, (U16)max);
a687059c 2719 }
a687059c 2720
a0d0e21e 2721 goto nest_check;
a687059c 2722 }
a0d0e21e 2723 }
a687059c 2724
a0d0e21e
LW
2725 if (!ISMULT1(op)) {
2726 *flagp = flags;
a687059c 2727 return(ret);
a0d0e21e 2728 }
bb20fd44 2729
c277df42 2730#if 0 /* Now runtime fix should be reliable. */
b45f050a
JF
2731
2732 /* if this is reinstated, don't forget to put this back into perldiag:
2733
2734 =item Regexp *+ operand could be empty at {#} in regex m/%s/
2735
2736 (F) The part of the regexp subject to either the * or + quantifier
2737 could match an empty string. The {#} shows in the regular
2738 expression about where the problem was discovered.
2739
2740 */
2741
bb20fd44 2742 if (!(flags&HASWIDTH) && op != '?')
b45f050a 2743 vFAIL("Regexp *+ operand could be empty");
b81d288d 2744#endif
bb20fd44 2745
fac92740 2746 parse_start = RExC_parse;
830247a4 2747 nextchar(pRExC_state);
a0d0e21e 2748
821b33a5 2749 *flagp = (op != '+') ? (WORST|SPSTART|HASWIDTH) : (WORST|HASWIDTH);
a0d0e21e
LW
2750
2751 if (op == '*' && (flags&SIMPLE)) {
830247a4 2752 reginsert(pRExC_state, STAR, ret);
c277df42 2753 ret->flags = 0;
830247a4 2754 RExC_naughty += 4;
a0d0e21e
LW
2755 }
2756 else if (op == '*') {
2757 min = 0;
2758 goto do_curly;
a0ed51b3
LW
2759 }
2760 else if (op == '+' && (flags&SIMPLE)) {
830247a4 2761 reginsert(pRExC_state, PLUS, ret);
c277df42 2762 ret->flags = 0;
830247a4 2763 RExC_naughty += 3;
a0d0e21e
LW
2764 }
2765 else if (op == '+') {
2766 min = 1;
2767 goto do_curly;
a0ed51b3
LW
2768 }
2769 else if (op == '?') {
a0d0e21e
LW
2770 min = 0; max = 1;
2771 goto do_curly;
2772 }
2773 nest_check:
e476b1b5 2774 if (ckWARN(WARN_REGEXP) && !SIZE_ONLY && !(flags&HASWIDTH) && max > REG_INFTY/3) {
830247a4 2775 vWARN3(RExC_parse,
b45f050a 2776 "%.*s matches null string many times",
830247a4 2777 RExC_parse - origparse,
b45f050a 2778 origparse);
a0d0e21e
LW
2779 }
2780
830247a4
IZ
2781 if (*RExC_parse == '?') {
2782 nextchar(pRExC_state);
2783 reginsert(pRExC_state, MINMOD, ret);
2784 regtail(pRExC_state, ret, ret + NODE_STEP_REGNODE);
a0d0e21e 2785 }
830247a4
IZ
2786 if (ISMULT2(RExC_parse)) {
2787 RExC_parse++;
b45f050a
JF
2788 vFAIL("Nested quantifiers");
2789 }
a0d0e21e
LW
2790
2791 return(ret);
a687059c
LW
2792}
2793
2794/*
2795 - regatom - the lowest level
2796 *
2797 * Optimization: gobbles an entire sequence of ordinary characters so that
2798 * it can turn them into a single node, which is smaller to store and
2799 * faster to run. Backslashed characters are exceptions, each becoming a
2800 * separate node; the code is simpler that way and it's not worth fixing.
2801 *
b45f050a 2802 * [Yes, it is worth fixing, some scripts can run twice the speed.] */
76e3520e 2803STATIC regnode *
830247a4 2804S_regatom(pTHX_ RExC_state_t *pRExC_state, I32 *flagp)
a687059c 2805{
c277df42 2806 register regnode *ret = 0;
a0d0e21e 2807 I32 flags;
45948336 2808 char *parse_start = RExC_parse;
a0d0e21e
LW
2809
2810 *flagp = WORST; /* Tentatively. */
2811
2812tryagain:
830247a4 2813 switch (*RExC_parse) {
a0d0e21e 2814 case '^':
830247a4
IZ
2815 RExC_seen_zerolen++;
2816 nextchar(pRExC_state);
e2509266 2817 if (RExC_flags & PMf_MULTILINE)
830247a4 2818 ret = reg_node(pRExC_state, MBOL);
e2509266 2819 else if (RExC_flags & PMf_SINGLELINE)
830247a4 2820 ret = reg_node(pRExC_state, SBOL);
a0d0e21e 2821 else
830247a4 2822 ret = reg_node(pRExC_state, BOL);
fac92740 2823 Set_Node_Length(ret, 1); /* MJD */
a0d0e21e
LW
2824 break;
2825 case '$':
830247a4 2826 nextchar(pRExC_state);
b81d288d 2827 if (*RExC_parse)
830247a4 2828 RExC_seen_zerolen++;
e2509266 2829 if (RExC_flags & PMf_MULTILINE)
830247a4 2830 ret = reg_node(pRExC_state, MEOL);
e2509266 2831 else if (RExC_flags & PMf_SINGLELINE)
830247a4 2832 ret = reg_node(pRExC_state, SEOL);
a0d0e21e 2833 else
830247a4 2834 ret = reg_node(pRExC_state, EOL);
fac92740 2835 Set_Node_Length(ret, 1); /* MJD */
a0d0e21e
LW
2836 break;
2837 case '.':
830247a4 2838 nextchar(pRExC_state);
e2509266 2839 if (RExC_flags & PMf_SINGLELINE)
ffc61ed2
JH
2840 ret = reg_node(pRExC_state, SANY);
2841 else
2842 ret = reg_node(pRExC_state, REG_ANY);
2843 *flagp |= HASWIDTH|SIMPLE;
830247a4 2844 RExC_naughty++;
fac92740 2845 Set_Node_Length(ret, 1); /* MJD */
a0d0e21e
LW
2846 break;
2847 case '[':
b45f050a 2848 {
830247a4 2849 char *oregcomp_parse = ++RExC_parse;
ffc61ed2 2850 ret = regclass(pRExC_state);
830247a4
IZ
2851 if (*RExC_parse != ']') {
2852 RExC_parse = oregcomp_parse;
b45f050a
JF
2853 vFAIL("Unmatched [");
2854 }
830247a4 2855 nextchar(pRExC_state);
a0d0e21e 2856 *flagp |= HASWIDTH|SIMPLE;
fac92740 2857 Set_Node_Length(ret, RExC_parse - oregcomp_parse + 1); /* MJD */
a0d0e21e 2858 break;
b45f050a 2859 }
a0d0e21e 2860 case '(':
830247a4
IZ
2861 nextchar(pRExC_state);
2862 ret = reg(pRExC_state, 1, &flags);
a0d0e21e 2863 if (ret == NULL) {
bf93d4cc 2864 if (flags & TRYAGAIN) {
830247a4 2865 if (RExC_parse == RExC_end) {
bf93d4cc
GS
2866 /* Make parent create an empty node if needed. */
2867 *flagp |= TRYAGAIN;
2868 return(NULL);
2869 }
a0d0e21e 2870 goto tryagain;
bf93d4cc 2871 }
a0d0e21e
LW
2872 return(NULL);
2873 }
c277df42 2874 *flagp |= flags&(HASWIDTH|SPSTART|SIMPLE);
a0d0e21e
LW
2875 break;
2876 case '|':
2877 case ')':
2878 if (flags & TRYAGAIN) {
2879 *flagp |= TRYAGAIN;
2880 return NULL;
2881 }
b45f050a 2882 vFAIL("Internal urp");
a0d0e21e
LW
2883 /* Supposed to be caught earlier. */
2884 break;
85afd4ae 2885 case '{':
830247a4
IZ
2886 if (!regcurly(RExC_parse)) {
2887 RExC_parse++;
85afd4ae
CS
2888 goto defchar;
2889 }
2890 /* FALL THROUGH */
a0d0e21e
LW
2891 case '?':
2892 case '+':
2893 case '*':
830247a4 2894 RExC_parse++;
b45f050a 2895 vFAIL("Quantifier follows nothing");
a0d0e21e
LW
2896 break;
2897 case '\\':
830247a4 2898 switch (*++RExC_parse) {
a0d0e21e 2899 case 'A':
830247a4
IZ
2900 RExC_seen_zerolen++;
2901 ret = reg_node(pRExC_state, SBOL);
a0d0e21e 2902 *flagp |= SIMPLE;
830247a4 2903 nextchar(pRExC_state);
fac92740 2904 Set_Node_Length(ret, 2); /* MJD */
a0d0e21e
LW
2905 break;
2906 case 'G':
830247a4
IZ
2907 ret = reg_node(pRExC_state, GPOS);
2908 RExC_seen |= REG_SEEN_GPOS;
a0d0e21e 2909 *flagp |= SIMPLE;
830247a4 2910 nextchar(pRExC_state);
fac92740 2911 Set_Node_Length(ret, 2); /* MJD */
a0d0e21e
LW
2912 break;
2913 case 'Z':
830247a4 2914 ret = reg_node(pRExC_state, SEOL);
a0d0e21e 2915 *flagp |= SIMPLE;
a1917ab9 2916 RExC_seen_zerolen++; /* Do not optimize RE away */
830247a4 2917 nextchar(pRExC_state);
a0d0e21e 2918 break;
b85d18e9 2919 case 'z':
830247a4 2920 ret = reg_node(pRExC_state, EOS);
b85d18e9 2921 *flagp |= SIMPLE;
830247a4
IZ
2922 RExC_seen_zerolen++; /* Do not optimize RE away */
2923 nextchar(pRExC_state);
fac92740 2924 Set_Node_Length(ret, 2); /* MJD */
b85d18e9 2925 break;
4a2d328f 2926 case 'C':
f33976b4
DB
2927 ret = reg_node(pRExC_state, CANY);
2928 RExC_seen |= REG_SEEN_CANY;
a0ed51b3 2929 *flagp |= HASWIDTH|SIMPLE;
830247a4 2930 nextchar(pRExC_state);
fac92740 2931 Set_Node_Length(ret, 2); /* MJD */
a0ed51b3
LW
2932 break;
2933 case 'X':
830247a4 2934 ret = reg_node(pRExC_state, CLUMP);
a0ed51b3 2935 *flagp |= HASWIDTH;
830247a4 2936 nextchar(pRExC_state);
fac92740 2937 Set_Node_Length(ret, 2); /* MJD */
a0ed51b3 2938 break;
a0d0e21e 2939 case 'w':
eb160463 2940 ret = reg_node(pRExC_state, (U8)(LOC ? ALNUML : ALNUM));
a0d0e21e 2941 *flagp |= HASWIDTH|SIMPLE;
830247a4 2942 nextchar(pRExC_state);
fac92740 2943 Set_Node_Length(ret, 2); /* MJD */
a0d0e21e
LW
2944 break;
2945 case 'W':
eb160463 2946 ret = reg_node(pRExC_state, (U8)(LOC ? NALNUML : NALNUM));
a0d0e21e 2947 *flagp |= HASWIDTH|SIMPLE;
830247a4 2948 nextchar(pRExC_state);
fac92740 2949 Set_Node_Length(ret, 2); /* MJD */
a0d0e21e
LW
2950 break;
2951 case 'b':
830247a4
IZ
2952 RExC_seen_zerolen++;
2953 RExC_seen |= REG_SEEN_LOOKBEHIND;
eb160463 2954 ret = reg_node(pRExC_state, (U8)(LOC ? BOUNDL : BOUND));
a0d0e21e 2955 *flagp |= SIMPLE;
830247a4 2956 nextchar(pRExC_state);
fac92740 2957 Set_Node_Length(ret, 2); /* MJD */
a0d0e21e
LW
2958 break;
2959 case 'B':
830247a4
IZ
2960 RExC_seen_zerolen++;
2961 RExC_seen |= REG_SEEN_LOOKBEHIND;
eb160463 2962 ret = reg_node(pRExC_state, (U8)(LOC ? NBOUNDL : NBOUND));
a0d0e21e 2963 *flagp |= SIMPLE;
830247a4 2964 nextchar(pRExC_state);
fac92740 2965 Set_Node_Length(ret, 2); /* MJD */
a0d0e21e
LW
2966 break;
2967 case 's':
eb160463 2968 ret = reg_node(pRExC_state, (U8)(LOC ? SPACEL : SPACE));
a0d0e21e 2969 *flagp |= HASWIDTH|SIMPLE;
830247a4 2970 nextchar(pRExC_state);
fac92740 2971 Set_Node_Length(ret, 2); /* MJD */
a0d0e21e
LW
2972 break;
2973 case 'S':
eb160463 2974 ret = reg_node(pRExC_state, (U8)(LOC ? NSPACEL : NSPACE));
a0d0e21e 2975 *flagp |= HASWIDTH|SIMPLE;
830247a4 2976 nextchar(pRExC_state);
fac92740 2977 Set_Node_Length(ret, 2); /* MJD */
a0d0e21e
LW
2978 break;
2979 case 'd':
ffc61ed2 2980 ret = reg_node(pRExC_state, DIGIT);
a0d0e21e 2981 *flagp |= HASWIDTH|SIMPLE;
830247a4 2982 nextchar(pRExC_state);
fac92740 2983 Set_Node_Length(ret, 2); /* MJD */
a0d0e21e
LW
2984 break;
2985 case 'D':
ffc61ed2 2986 ret = reg_node(pRExC_state, NDIGIT);
a0d0e21e 2987 *flagp |= HASWIDTH|SIMPLE;
830247a4 2988 nextchar(pRExC_state);
fac92740 2989 Set_Node_Length(ret, 2); /* MJD */
a0d0e21e 2990 break;
a14b48bc
LW
2991 case 'p':
2992 case 'P':
3568d838 2993 {
830247a4 2994 char* oldregxend = RExC_end;
ccb2c380 2995 char* parse_start = RExC_parse - 2;
a14b48bc 2996
830247a4 2997 if (RExC_parse[1] == '{') {
3568d838 2998 /* a lovely hack--pretend we saw [\pX] instead */
830247a4
IZ
2999 RExC_end = strchr(RExC_parse, '}');
3000 if (!RExC_end) {
0da60cf5 3001 U8 c = (U8)*RExC_parse;
830247a4
IZ
3002 RExC_parse += 2;
3003 RExC_end = oldregxend;
0da60cf5 3004 vFAIL2("Missing right brace on \\%c{}", c);
b45f050a 3005 }
830247a4 3006 RExC_end++;
a14b48bc 3007 }
af6f566e 3008 else {
830247a4 3009 RExC_end = RExC_parse + 2;
af6f566e
HS
3010 if (RExC_end > oldregxend)
3011 RExC_end = oldregxend;
3012 }
830247a4 3013 RExC_parse--;
a14b48bc 3014
ffc61ed2 3015 ret = regclass(pRExC_state);
a14b48bc 3016
830247a4
IZ
3017 RExC_end = oldregxend;
3018 RExC_parse--;
ccb2c380
MP
3019
3020 Set_Node_Offset(ret, parse_start + 2);
3021 Set_Node_Cur_Length(ret);
830247a4 3022 nextchar(pRExC_state);
a14b48bc
LW
3023 *flagp |= HASWIDTH|SIMPLE;
3024 }
3025 break;
a0d0e21e
LW
3026 case 'n':
3027 case 'r':
3028 case 't':
3029 case 'f':
3030 case 'e':
3031 case 'a':
3032 case 'x':
3033 case 'c':
3034 case '0':
3035 goto defchar;
3036 case '1': case '2': case '3': case '4':
3037 case '5': case '6': case '7': case '8': case '9':
3038 {
830247a4 3039 I32 num = atoi(RExC_parse);
a0d0e21e 3040
830247a4 3041 if (num > 9 && num >= RExC_npar)
a0d0e21e
LW
3042 goto defchar;
3043 else {
fac92740 3044 char * parse_start = RExC_parse - 1; /* MJD */
830247a4
IZ
3045 while (isDIGIT(*RExC_parse))
3046 RExC_parse++;
b45f050a 3047
eb160463 3048 if (!SIZE_ONLY && num > (I32)RExC_rx->nparens)
9baa0206 3049 vFAIL("Reference to nonexistent group");
830247a4 3050 RExC_sawback = 1;
eb160463
GS
3051 ret = reganode(pRExC_state,
3052 (U8)(FOLD ? (LOC ? REFFL : REFF) : REF),
3053 num);
a0d0e21e 3054 *flagp |= HASWIDTH;
fac92740
MJD
3055
3056 /* override incorrect value set in reganode MJD */
3057 Set_Node_Offset(ret, parse_start+1);
3058 Set_Node_Cur_Length(ret); /* MJD */
830247a4
IZ
3059 RExC_parse--;
3060 nextchar(pRExC_state);
a0d0e21e
LW
3061 }
3062 }
3063 break;
3064 case '\0':
830247a4 3065 if (RExC_parse >= RExC_end)
b45f050a 3066 FAIL("Trailing \\");
a0d0e21e
LW
3067 /* FALL THROUGH */
3068 default:
c9f97d15
IZ
3069 /* Do not generate `unrecognized' warnings here, we fall
3070 back into the quick-grab loop below */
45948336 3071 parse_start--;
a0d0e21e
LW
3072 goto defchar;
3073 }
3074 break;
4633a7c4
LW
3075
3076 case '#':
e2509266 3077 if (RExC_flags & PMf_EXTENDED) {
830247a4
IZ
3078 while (RExC_parse < RExC_end && *RExC_parse != '\n') RExC_parse++;
3079 if (RExC_parse < RExC_end)
4633a7c4
LW
3080 goto tryagain;
3081 }
3082 /* FALL THROUGH */
3083
a0d0e21e 3084 default: {
ba210ebe 3085 register STRLEN len;
58ae7d3f 3086 register UV ender;
a0d0e21e 3087 register char *p;
c277df42 3088 char *oldp, *s;
ba210ebe 3089 STRLEN numlen;
80aecb99 3090 STRLEN foldlen;
89ebb4a3 3091 U8 tmpbuf[UTF8_MAXBYTES_CASE+1], *foldbuf;
f06dbbb7
JH
3092
3093 parse_start = RExC_parse - 1;
a0d0e21e 3094
830247a4 3095 RExC_parse++;
a0d0e21e
LW
3096
3097 defchar:
58ae7d3f 3098 ender = 0;
eb160463
GS
3099 ret = reg_node(pRExC_state,
3100 (U8)(FOLD ? (LOC ? EXACTFL : EXACTF) : EXACT));
cd439c50 3101 s = STRING(ret);
830247a4
IZ
3102 for (len = 0, p = RExC_parse - 1;
3103 len < 127 && p < RExC_end;
a0d0e21e
LW
3104 len++)
3105 {
3106 oldp = p;
5b5a24f7 3107
e2509266 3108 if (RExC_flags & PMf_EXTENDED)
830247a4 3109 p = regwhite(p, RExC_end);
a0d0e21e
LW
3110 switch (*p) {
3111 case '^':
3112 case '$':
3113 case '.':
3114 case '[':
3115 case '(':
3116 case ')':
3117 case '|':
3118 goto loopdone;
3119 case '\\':
3120 switch (*++p) {
3121 case 'A':
1ed8eac0
JF
3122 case 'C':
3123 case 'X':
a0d0e21e
LW
3124 case 'G':
3125 case 'Z':
b85d18e9 3126 case 'z':
a0d0e21e
LW
3127 case 'w':
3128 case 'W':
3129 case 'b':
3130 case 'B':
3131 case 's':
3132 case 'S':
3133 case 'd':
3134 case 'D':
a14b48bc
LW
3135 case 'p':
3136 case 'P':
a0d0e21e
LW
3137 --p;
3138 goto loopdone;
3139 case 'n':
3140 ender = '\n';
3141 p++;
a687059c 3142 break;
a0d0e21e
LW
3143 case 'r':
3144 ender = '\r';
3145 p++;
a687059c 3146 break;
a0d0e21e
LW
3147 case 't':
3148 ender = '\t';
3149 p++;
a687059c 3150 break;
a0d0e21e
LW
3151 case 'f':
3152 ender = '\f';
3153 p++;
a687059c 3154 break;
a0d0e21e 3155 case 'e':
c7f1f016 3156 ender = ASCII_TO_NATIVE('\033');
a0d0e21e 3157 p++;
a687059c 3158 break;
a0d0e21e 3159 case 'a':
c7f1f016 3160 ender = ASCII_TO_NATIVE('\007');
a0d0e21e 3161 p++;
a687059c 3162 break;
a0d0e21e 3163 case 'x':
a0ed51b3
LW
3164 if (*++p == '{') {
3165 char* e = strchr(p, '}');
b81d288d 3166
b45f050a 3167 if (!e) {
830247a4 3168 RExC_parse = p + 1;
b45f050a
JF
3169 vFAIL("Missing right brace on \\x{}");
3170 }
de5f0749 3171 else {
a4c04bdc
NC
3172 I32 flags = PERL_SCAN_ALLOW_UNDERSCORES
3173 | PERL_SCAN_DISALLOW_PREFIX;
53305cf1
NC
3174 numlen = e - p - 1;
3175 ender = grok_hex(p + 1, &numlen, &flags, NULL);
aaa80028
JH
3176 if (ender > 0xff)
3177 RExC_utf8 = 1;
a0ed51b3
LW
3178 p = e + 1;
3179 }
a0ed51b3
LW
3180 }
3181 else {
a4c04bdc 3182 I32 flags = PERL_SCAN_DISALLOW_PREFIX;
53305cf1
NC
3183 numlen = 2;
3184 ender = grok_hex(p, &numlen, &flags, NULL);
a0ed51b3
LW
3185 p += numlen;
3186 }
a687059c 3187 break;
a0d0e21e
LW
3188 case 'c':
3189 p++;
bbce6d69 3190 ender = UCHARAT(p++);
3191 ender = toCTRL(ender);
a687059c 3192 break;
a0d0e21e
LW
3193 case '0': case '1': case '2': case '3':case '4':
3194 case '5': case '6': case '7': case '8':case '9':
3195 if (*p == '0' ||
830247a4 3196 (isDIGIT(p[1]) && atoi(p) >= RExC_npar) ) {
53305cf1
NC
3197 I32 flags = 0;
3198 numlen = 3;
3199 ender = grok_oct(p, &numlen, &flags, NULL);
a0d0e21e
LW
3200 p += numlen;
3201 }
3202 else {
3203 --p;
3204 goto loopdone;
a687059c
LW
3205 }
3206 break;
a0d0e21e 3207 case '\0':
830247a4 3208 if (p >= RExC_end)
b45f050a 3209 FAIL("Trailing \\");
a687059c 3210 /* FALL THROUGH */
a0d0e21e 3211 default:
e476b1b5 3212 if (!SIZE_ONLY && ckWARN(WARN_REGEXP) && isALPHA(*p))
4193bef7 3213 vWARN2(p + 1, "Unrecognized escape \\%c passed through", UCHARAT(p));
a0ed51b3 3214 goto normal_default;
a0d0e21e
LW
3215 }
3216 break;
a687059c 3217 default:
a0ed51b3 3218 normal_default:
fd400ab9 3219 if (UTF8_IS_START(*p) && UTF) {
5e12f4fb 3220 ender = utf8n_to_uvchr((U8*)p, RExC_end - p,
ba210ebe 3221 &numlen, 0);
a0ed51b3
LW
3222 p += numlen;
3223 }
3224 else
3225 ender = *p++;
a0d0e21e 3226 break;
a687059c 3227 }
e2509266 3228 if (RExC_flags & PMf_EXTENDED)
830247a4 3229 p = regwhite(p, RExC_end);
60a8b682
JH
3230 if (UTF && FOLD) {
3231 /* Prime the casefolded buffer. */
ac7e0132 3232 ender = toFOLD_uni(ender, tmpbuf, &foldlen);
60a8b682 3233 }
a0d0e21e
LW
3234 if (ISMULT2(p)) { /* Back off on ?+*. */
3235 if (len)
3236 p = oldp;
16ea2a2e 3237 else if (UTF) {
0ebc6274
JH
3238 STRLEN unilen;
3239
80aecb99 3240 if (FOLD) {
60a8b682 3241 /* Emit all the Unicode characters. */
80aecb99
JH
3242 for (foldbuf = tmpbuf;
3243 foldlen;
3244 foldlen -= numlen) {
3245 ender = utf8_to_uvchr(foldbuf, &numlen);
9dc45d57 3246 if (numlen > 0) {
0ebc6274
JH
3247 reguni(pRExC_state, ender, s, &unilen);
3248 s += unilen;
3249 len += unilen;
3250 /* In EBCDIC the numlen
3251 * and unilen can differ. */
9dc45d57 3252 foldbuf += numlen;
47654450
JH
3253 if (numlen >= foldlen)
3254 break;
9dc45d57
JH
3255 }
3256 else
3257 break; /* "Can't happen." */
80aecb99
JH
3258 }
3259 }
3260 else {
0ebc6274 3261 reguni(pRExC_state, ender, s, &unilen);
9ede7db1 3262 if (unilen > 0) {
0ebc6274
JH
3263 s += unilen;
3264 len += unilen;
9dc45d57 3265 }
80aecb99 3266 }
a0ed51b3 3267 }
a0d0e21e
LW
3268 else {
3269 len++;
eb160463 3270 REGC((char)ender, s++);
a0d0e21e
LW
3271 }
3272 break;
a687059c 3273 }
16ea2a2e 3274 if (UTF) {
0ebc6274
JH
3275 STRLEN unilen;
3276
80aecb99 3277 if (FOLD) {
60a8b682 3278 /* Emit all the Unicode characters. */
80aecb99
JH
3279 for (foldbuf = tmpbuf;
3280 foldlen;
3281 foldlen -= numlen) {
3282 ender = utf8_to_uvchr(foldbuf, &numlen);
9dc45d57 3283 if (numlen > 0) {
0ebc6274
JH
3284 reguni(pRExC_state, ender, s, &unilen);
3285 len += unilen;
3286 s += unilen;
3287 /* In EBCDIC the numlen
3288 * and unilen can differ. */
9dc45d57 3289 foldbuf += numlen;
47654450
JH
3290 if (numlen >= foldlen)
3291 break;
9dc45d57
JH
3292 }
3293 else
3294 break;
80aecb99
JH
3295 }
3296 }
3297 else {
0ebc6274 3298 reguni(pRExC_state, ender, s, &unilen);
9ede7db1 3299 if (unilen > 0) {
0ebc6274
JH
3300 s += unilen;
3301 len += unilen;
9dc45d57 3302 }
80aecb99
JH
3303 }
3304 len--;
a0ed51b3
LW
3305 }
3306 else
eb160463 3307 REGC((char)ender, s++);
a0d0e21e
LW
3308 }
3309 loopdone:
830247a4 3310 RExC_parse = p - 1;
fac92740 3311 Set_Node_Cur_Length(ret); /* MJD */
830247a4 3312 nextchar(pRExC_state);
793db0cb
JH
3313 {
3314 /* len is STRLEN which is unsigned, need to copy to signed */
3315 IV iv = len;
3316 if (iv < 0)
3317 vFAIL("Internal disaster");
3318 }
a0d0e21e
LW
3319 if (len > 0)
3320 *flagp |= HASWIDTH;
090f7165 3321 if (len == 1 && UNI_IS_INVARIANT(ender))
a0d0e21e 3322 *flagp |= SIMPLE;
c277df42 3323 if (!SIZE_ONLY)
cd439c50
IZ
3324 STR_LEN(ret) = len;
3325 if (SIZE_ONLY)
830247a4 3326 RExC_size += STR_SZ(len);
cd439c50 3327 else
830247a4 3328 RExC_emit += STR_SZ(len);
a687059c 3329 }
a0d0e21e
LW
3330 break;
3331 }
a687059c 3332
60a8b682
JH
3333 /* If the encoding pragma is in effect recode the text of
3334 * any EXACT-kind nodes. */
22c54be3 3335 if (PL_encoding && PL_regkind[(U8)OP(ret)] == EXACT) {
d0063567
DK
3336 STRLEN oldlen = STR_LEN(ret);
3337 SV *sv = sv_2mortal(newSVpvn(STRING(ret), oldlen));
3338
3339 if (RExC_utf8)
3340 SvUTF8_on(sv);
3341 if (sv_utf8_downgrade(sv, TRUE)) {
3342 char *s = sv_recode_to_utf8(sv, PL_encoding);
3343 STRLEN newlen = SvCUR(sv);
3344
3345 if (SvUTF8(sv))
3346 RExC_utf8 = 1;
3347 if (!SIZE_ONLY) {
3348 DEBUG_r(PerlIO_printf(Perl_debug_log, "recode %*s to %*s\n",
3349 (int)oldlen, STRING(ret),
3350 (int)newlen, s));
3351 Copy(s, STRING(ret), newlen, char);
3352 STR_LEN(ret) += newlen - oldlen;
3353 RExC_emit += STR_SZ(newlen) - STR_SZ(oldlen);
3354 } else
3355 RExC_size += STR_SZ(newlen) - STR_SZ(oldlen);
3356 }
a72c7584
JH
3357 }
3358
a0d0e21e 3359 return(ret);
a687059c
LW
3360}
3361
873ef191 3362STATIC char *
cea2e8a9 3363S_regwhite(pTHX_ char *p, char *e)
5b5a24f7
CS
3364{
3365 while (p < e) {
3366 if (isSPACE(*p))
3367 ++p;
3368 else if (*p == '#') {
3369 do {
3370 p++;
3371 } while (p < e && *p != '\n');
3372 }
3373 else
3374 break;
3375 }
3376 return p;
3377}
3378
b8c5462f
JH
3379/* Parse POSIX character classes: [[:foo:]], [[=foo=]], [[.foo.]].
3380 Character classes ([:foo:]) can also be negated ([:^foo:]).
3381 Returns a named class id (ANYOF_XXX) if successful, -1 otherwise.
3382 Equivalence classes ([=foo=]) and composites ([.foo.]) are parsed,
beeb77fc 3383 but trigger failures because they are currently unimplemented. */
9a86a77b
JH
3384
3385#define POSIXCC_DONE(c) ((c) == ':')
3386#define POSIXCC_NOTYET(c) ((c) == '=' || (c) == '.')
3387#define POSIXCC(c) (POSIXCC_DONE(c) || POSIXCC_NOTYET(c))
3388
b8c5462f 3389STATIC I32
830247a4 3390S_regpposixcc(pTHX_ RExC_state_t *pRExC_state, I32 value)
620e46c5
JH
3391{
3392 char *posixcc = 0;
936ed897 3393 I32 namedclass = OOB_NAMEDCLASS;
620e46c5 3394
830247a4 3395 if (value == '[' && RExC_parse + 1 < RExC_end &&
620e46c5 3396 /* I smell either [: or [= or [. -- POSIX has been here, right? */
9a86a77b
JH
3397 POSIXCC(UCHARAT(RExC_parse))) {
3398 char c = UCHARAT(RExC_parse);
830247a4 3399 char* s = RExC_parse++;
b81d288d 3400
9a86a77b 3401 while (RExC_parse < RExC_end && UCHARAT(RExC_parse) != c)
830247a4
IZ
3402 RExC_parse++;
3403 if (RExC_parse == RExC_end)
620e46c5 3404 /* Grandfather lone [:, [=, [. */
830247a4 3405 RExC_parse = s;
620e46c5 3406 else {
830247a4 3407 char* t = RExC_parse++; /* skip over the c */
b8c5462f 3408
80916619
NC
3409 assert(*t == c);
3410
9a86a77b 3411 if (UCHARAT(RExC_parse) == ']') {
830247a4 3412 RExC_parse++; /* skip over the ending ] */
b8c5462f
JH
3413 posixcc = s + 1;
3414 if (*s == ':') {
3415 I32 complement = *posixcc == '^' ? *posixcc++ : 0;
80916619
NC
3416 I32 skip = t - posixcc;
3417
3418 /* Initially switch on the length of the name. */
3419 switch (skip) {
3420 case 4:
3421 if (memEQ(posixcc, "word", 4)) {
3422 /* this is not POSIX, this is the Perl \w */;
3423 namedclass
3424 = complement ? ANYOF_NALNUM : ANYOF_ALNUM;
3425 }
cc4319de 3426 break;
80916619
NC
3427 case 5:
3428 /* Names all of length 5. */
3429 /* alnum alpha ascii blank cntrl digit graph lower
3430 print punct space upper */
3431 /* Offset 4 gives the best switch position. */
3432 switch (posixcc[4]) {
3433 case 'a':
3434 if (memEQ(posixcc, "alph", 4)) {
3435 /* a */
3436 namedclass
3437 = complement ? ANYOF_NALPHA : ANYOF_ALPHA;
3438 }
3439 break;
3440 case 'e':
3441 if (memEQ(posixcc, "spac", 4)) {
3442 /* e */
3443 namedclass
3444 = complement ? ANYOF_NPSXSPC : ANYOF_PSXSPC;
3445 }
3446 break;
3447 case 'h':
3448 if (memEQ(posixcc, "grap", 4)) {
3449 /* h */
3450 namedclass
3451 = complement ? ANYOF_NGRAPH : ANYOF_GRAPH;
3452 }
3453 break;
3454 case 'i':
3455 if (memEQ(posixcc, "asci", 4)) {
3456 /* i */
3457 namedclass
3458 = complement ? ANYOF_NASCII : ANYOF_ASCII;
3459 }
3460 break;
3461 case 'k':
3462 if (memEQ(posixcc, "blan", 4)) {
3463 /* k */
3464 namedclass
3465 = complement ? ANYOF_NBLANK : ANYOF_BLANK;
3466 }
3467 break;
3468 case 'l':
3469 if (memEQ(posixcc, "cntr", 4)) {
3470 /* l */
3471 namedclass
3472 = complement ? ANYOF_NCNTRL : ANYOF_CNTRL;
3473 }
3474 break;
3475 case 'm':
3476 if (memEQ(posixcc, "alnu", 4)) {
3477 /* m */
3478 namedclass
3479 = complement ? ANYOF_NALNUMC : ANYOF_ALNUMC;
3480 }
3481 break;
3482 case 'r':
3483 if (memEQ(posixcc, "lowe", 4)) {
3484 /* r */
3485 namedclass
3486 = complement ? ANYOF_NLOWER : ANYOF_LOWER;
3487 }
3488 if (memEQ(posixcc, "uppe", 4)) {
8fdec511 3489 /* r */
80916619
NC
3490 namedclass
3491 = complement ? ANYOF_NUPPER : ANYOF_UPPER;
3492 }
3493 break;
3494 case 't':
3495 if (memEQ(posixcc, "digi", 4)) {
3496 /* t */
3497 namedclass
3498 = complement ? ANYOF_NDIGIT : ANYOF_DIGIT;
3499 }
3500 if (memEQ(posixcc, "prin", 4)) {
8fdec511 3501 /* t */
80916619
NC
3502 namedclass
3503 = complement ? ANYOF_NPRINT : ANYOF_PRINT;
3504 }
3505 if (memEQ(posixcc, "punc", 4)) {
3506 /* t */
3507 namedclass
3508 = complement ? ANYOF_NPUNCT : ANYOF_PUNCT;
3509 }
3510 break;
b8c5462f
JH
3511 }
3512 break;
80916619
NC
3513 case 6:
3514 if (memEQ(posixcc, "xdigit", 6)) {
3515 namedclass
3516 = complement ? ANYOF_NXDIGIT : ANYOF_XDIGIT;
b8c5462f
JH
3517 }
3518 break;
3519 }
80916619
NC
3520
3521 if (namedclass == OOB_NAMEDCLASS)
b45f050a
JF
3522 {
3523 Simple_vFAIL3("POSIX class [:%.*s:] unknown",
3524 t - s - 1, s + 1);
3525 }
80916619
NC
3526 assert (posixcc[skip] == ':');
3527 assert (posixcc[skip+1] == ']');
b45f050a 3528 } else if (!SIZE_ONLY) {
b8c5462f 3529 /* [[=foo=]] and [[.foo.]] are still future. */
b45f050a 3530
830247a4 3531 /* adjust RExC_parse so the warning shows after
b45f050a 3532 the class closes */
9a86a77b 3533 while (UCHARAT(RExC_parse) && UCHARAT(RExC_parse) != ']')
830247a4 3534 RExC_parse++;
b45f050a
JF
3535 Simple_vFAIL3("POSIX syntax [%c %c] is reserved for future extensions", c, c);
3536 }
b8c5462f
JH
3537 } else {
3538 /* Maternal grandfather:
3539 * "[:" ending in ":" but not in ":]" */
830247a4 3540 RExC_parse = s;
767d463e 3541 }
620e46c5
JH
3542 }
3543 }
3544
b8c5462f
JH
3545 return namedclass;
3546}
3547
3548STATIC void
830247a4 3549S_checkposixcc(pTHX_ RExC_state_t *pRExC_state)
b8c5462f 3550{
b938889d 3551 if (!SIZE_ONLY && POSIXCC(UCHARAT(RExC_parse))) {
830247a4 3552 char *s = RExC_parse;
93733859 3553 char c = *s++;
b8c5462f
JH
3554
3555 while(*s && isALNUM(*s))
3556 s++;
3557 if (*s && c == *s && s[1] == ']') {
cd84f5b2
RGS
3558 if (ckWARN(WARN_REGEXP))
3559 vWARN3(s+2,
3560 "POSIX syntax [%c %c] belongs inside character classes",
3561 c, c);
b45f050a
JF
3562
3563 /* [[=foo=]] and [[.foo.]] are still future. */
9a86a77b 3564 if (POSIXCC_NOTYET(c)) {
830247a4 3565 /* adjust RExC_parse so the error shows after
b45f050a 3566 the class closes */
9a86a77b 3567 while (UCHARAT(RExC_parse) && UCHARAT(RExC_parse++) != ']')
b45f050a
JF
3568 ;
3569 Simple_vFAIL3("POSIX syntax [%c %c] is reserved for future extensions", c, c);
3570 }
b8c5462f
JH
3571 }
3572 }
620e46c5
JH
3573}
3574
76e3520e 3575STATIC regnode *
830247a4 3576S_regclass(pTHX_ RExC_state_t *pRExC_state)
a687059c 3577{
ffc61ed2 3578 register UV value;
9a86a77b 3579 register UV nextvalue;
3568d838 3580 register IV prevvalue = OOB_UNICODE;
ffc61ed2 3581 register IV range = 0;
c277df42 3582 register regnode *ret;
ba210ebe 3583 STRLEN numlen;
ffc61ed2 3584 IV namedclass;
9c5ffd7c 3585 char *rangebegin = 0;
936ed897 3586 bool need_class = 0;
9c5ffd7c 3587 SV *listsv = Nullsv;
ffc61ed2
JH
3588 register char *e;
3589 UV n;
9e55ce06
JH
3590 bool optimize_invert = TRUE;
3591 AV* unicode_alternate = 0;
1b2d223b
JH
3592#ifdef EBCDIC
3593 UV literal_endpoint = 0;
3594#endif
ffc61ed2
JH
3595
3596 ret = reganode(pRExC_state, ANYOF, 0);
3597
3598 if (!SIZE_ONLY)
3599 ANYOF_FLAGS(ret) = 0;
3600
9a86a77b 3601 if (UCHARAT(RExC_parse) == '^') { /* Complement of range. */
ffc61ed2
JH
3602 RExC_naughty++;
3603 RExC_parse++;
3604 if (!SIZE_ONLY)
3605 ANYOF_FLAGS(ret) |= ANYOF_INVERT;
3606 }
a0d0e21e 3607
936ed897 3608 if (SIZE_ONLY)
830247a4 3609 RExC_size += ANYOF_SKIP;
936ed897 3610 else {
830247a4 3611 RExC_emit += ANYOF_SKIP;
936ed897
IZ
3612 if (FOLD)
3613 ANYOF_FLAGS(ret) |= ANYOF_FOLD;
3614 if (LOC)
3615 ANYOF_FLAGS(ret) |= ANYOF_LOCALE;
ffc61ed2
JH
3616 ANYOF_BITMAP_ZERO(ret);
3617 listsv = newSVpvn("# comment\n", 10);
a0d0e21e 3618 }
b8c5462f 3619
9a86a77b
JH
3620 nextvalue = RExC_parse < RExC_end ? UCHARAT(RExC_parse) : 0;
3621
b938889d 3622 if (!SIZE_ONLY && POSIXCC(nextvalue))
830247a4 3623 checkposixcc(pRExC_state);
b8c5462f 3624
f064b6ad
HS
3625 /* allow 1st char to be ] (allowing it to be - is dealt with later) */
3626 if (UCHARAT(RExC_parse) == ']')
3627 goto charclassloop;
ffc61ed2 3628
9a86a77b 3629 while (RExC_parse < RExC_end && UCHARAT(RExC_parse) != ']') {
ffc61ed2
JH
3630
3631 charclassloop:
3632
3633 namedclass = OOB_NAMEDCLASS; /* initialize as illegal */
3634
73b437c8 3635 if (!range)
830247a4 3636 rangebegin = RExC_parse;
ffc61ed2 3637 if (UTF) {
5e12f4fb 3638 value = utf8n_to_uvchr((U8*)RExC_parse,
3568d838
JH
3639 RExC_end - RExC_parse,
3640 &numlen, 0);
ffc61ed2
JH
3641 RExC_parse += numlen;
3642 }
3643 else
3644 value = UCHARAT(RExC_parse++);
9a86a77b
JH
3645 nextvalue = RExC_parse < RExC_end ? UCHARAT(RExC_parse) : 0;
3646 if (value == '[' && POSIXCC(nextvalue))
830247a4 3647 namedclass = regpposixcc(pRExC_state, value);
620e46c5 3648 else if (value == '\\') {
ffc61ed2 3649 if (UTF) {
5e12f4fb 3650 value = utf8n_to_uvchr((U8*)RExC_parse,
ffc61ed2
JH
3651 RExC_end - RExC_parse,
3652 &numlen, 0);
3653 RExC_parse += numlen;
3654 }
3655 else
3656 value = UCHARAT(RExC_parse++);
470c3474 3657 /* Some compilers cannot handle switching on 64-bit integer
ffc61ed2 3658 * values, therefore value cannot be an UV. Yes, this will
e2962f66
JH
3659 * be a problem later if we want switch on Unicode.
3660 * A similar issue a little bit later when switching on
3661 * namedclass. --jhi */
ffc61ed2 3662 switch ((I32)value) {
b8c5462f
JH
3663 case 'w': namedclass = ANYOF_ALNUM; break;
3664 case 'W': namedclass = ANYOF_NALNUM; break;
3665 case 's': namedclass = ANYOF_SPACE; break;
3666 case 'S': namedclass = ANYOF_NSPACE; break;
3667 case 'd': namedclass = ANYOF_DIGIT; break;
3668 case 'D': namedclass = ANYOF_NDIGIT; break;
ffc61ed2
JH
3669 case 'p':
3670 case 'P':
af6f566e 3671 if (RExC_parse >= RExC_end)
2a4859cd 3672 vFAIL2("Empty \\%c{}", (U8)value);
ffc61ed2 3673 if (*RExC_parse == '{') {
0da60cf5 3674 U8 c = (U8)value;
ffc61ed2
JH
3675 e = strchr(RExC_parse++, '}');
3676 if (!e)
0da60cf5 3677 vFAIL2("Missing right brace on \\%c{}", c);
ab13f0c7
JH
3678 while (isSPACE(UCHARAT(RExC_parse)))
3679 RExC_parse++;
3680 if (e == RExC_parse)
0da60cf5 3681 vFAIL2("Empty \\%c{}", c);
ffc61ed2 3682 n = e - RExC_parse;
ab13f0c7
JH
3683 while (isSPACE(UCHARAT(RExC_parse + n - 1)))
3684 n--;
ffc61ed2
JH
3685 }
3686 else {
3687 e = RExC_parse;
3688 n = 1;
3689 }
3690 if (!SIZE_ONLY) {
ab13f0c7
JH
3691 if (UCHARAT(RExC_parse) == '^') {
3692 RExC_parse++;
3693 n--;
3694 value = value == 'p' ? 'P' : 'p'; /* toggle */
3695 while (isSPACE(UCHARAT(RExC_parse))) {
3696 RExC_parse++;
3697 n--;
3698 }
3699 }
ffc61ed2 3700 if (value == 'p')
ab13f0c7
JH
3701 Perl_sv_catpvf(aTHX_ listsv,
3702 "+utf8::%.*s\n", (int)n, RExC_parse);
ffc61ed2 3703 else
ab13f0c7
JH
3704 Perl_sv_catpvf(aTHX_ listsv,
3705 "!utf8::%.*s\n", (int)n, RExC_parse);
ffc61ed2
JH
3706 }
3707 RExC_parse = e + 1;
3708 ANYOF_FLAGS(ret) |= ANYOF_UNICODE;
f81125e2
JP
3709 namedclass = ANYOF_MAX; /* no official name, but it's named */
3710 break;
b8c5462f
JH
3711 case 'n': value = '\n'; break;
3712 case 'r': value = '\r'; break;
3713 case 't': value = '\t'; break;
3714 case 'f': value = '\f'; break;
3715 case 'b': value = '\b'; break;
c7f1f016
NIS
3716 case 'e': value = ASCII_TO_NATIVE('\033');break;
3717 case 'a': value = ASCII_TO_NATIVE('\007');break;
b8c5462f 3718 case 'x':
ffc61ed2 3719 if (*RExC_parse == '{') {
a4c04bdc
NC
3720 I32 flags = PERL_SCAN_ALLOW_UNDERSCORES
3721 | PERL_SCAN_DISALLOW_PREFIX;
ffc61ed2 3722 e = strchr(RExC_parse++, '}');
b81d288d 3723 if (!e)
ffc61ed2 3724 vFAIL("Missing right brace on \\x{}");
53305cf1
NC
3725
3726 numlen = e - RExC_parse;
3727 value = grok_hex(RExC_parse, &numlen, &flags, NULL);
ffc61ed2
JH
3728 RExC_parse = e + 1;
3729 }
3730 else {
a4c04bdc 3731 I32 flags = PERL_SCAN_DISALLOW_PREFIX;
53305cf1
NC
3732 numlen = 2;
3733 value = grok_hex(RExC_parse, &numlen, &flags, NULL);
ffc61ed2
JH
3734 RExC_parse += numlen;
3735 }
b8c5462f
JH
3736 break;
3737 case 'c':
830247a4 3738 value = UCHARAT(RExC_parse++);
b8c5462f
JH
3739 value = toCTRL(value);
3740 break;
3741 case '0': case '1': case '2': case '3': case '4':
3742 case '5': case '6': case '7': case '8': case '9':
53305cf1
NC
3743 {
3744 I32 flags = 0;
3745 numlen = 3;
3746 value = grok_oct(--RExC_parse, &numlen, &flags, NULL);
830247a4 3747 RExC_parse += numlen;
b8c5462f 3748 break;
53305cf1 3749 }
1028017a 3750 default:
e476b1b5 3751 if (!SIZE_ONLY && ckWARN(WARN_REGEXP) && isALPHA(value))
ffc61ed2
JH
3752 vWARN2(RExC_parse,
3753 "Unrecognized escape \\%c in character class passed through",
3754 (int)value);
1028017a 3755 break;
b8c5462f 3756 }
ffc61ed2 3757 } /* end of \blah */
1b2d223b
JH
3758#ifdef EBCDIC
3759 else
3760 literal_endpoint++;
3761#endif
ffc61ed2
JH
3762
3763 if (namedclass > OOB_NAMEDCLASS) { /* this is a named class \blah */
3764
3765 if (!SIZE_ONLY && !need_class)
936ed897 3766 ANYOF_CLASS_ZERO(ret);
ffc61ed2 3767
936ed897 3768 need_class = 1;
ffc61ed2
JH
3769
3770 /* a bad range like a-\d, a-[:digit:] ? */
3771 if (range) {
73b437c8 3772 if (!SIZE_ONLY) {
e476b1b5 3773 if (ckWARN(WARN_REGEXP))
830247a4 3774 vWARN4(RExC_parse,
b45f050a 3775 "False [] range \"%*.*s\"",
830247a4
IZ
3776 RExC_parse - rangebegin,
3777 RExC_parse - rangebegin,
b45f050a 3778 rangebegin);
3568d838
JH
3779 if (prevvalue < 256) {
3780 ANYOF_BITMAP_SET(ret, prevvalue);
ffc61ed2
JH
3781 ANYOF_BITMAP_SET(ret, '-');
3782 }
3783 else {
3784 ANYOF_FLAGS(ret) |= ANYOF_UNICODE;
3785 Perl_sv_catpvf(aTHX_ listsv,
3568d838 3786 "%04"UVxf"\n%04"UVxf"\n", (UV)prevvalue, (UV) '-');
ffc61ed2 3787 }
b8c5462f 3788 }
ffc61ed2
JH
3789
3790 range = 0; /* this was not a true range */
73b437c8 3791 }
ffc61ed2 3792
73b437c8 3793 if (!SIZE_ONLY) {
c49a72a9
NC
3794 const char *what = NULL;
3795 char yesno = 0;
3796
3568d838
JH
3797 if (namedclass > OOB_NAMEDCLASS)
3798 optimize_invert = FALSE;
e2962f66
JH
3799 /* Possible truncation here but in some 64-bit environments
3800 * the compiler gets heartburn about switch on 64-bit values.
3801 * A similar issue a little earlier when switching on value.
98f323fa 3802 * --jhi */
e2962f66 3803 switch ((I32)namedclass) {
73b437c8
JH
3804 case ANYOF_ALNUM:
3805 if (LOC)
936ed897 3806 ANYOF_CLASS_SET(ret, ANYOF_ALNUM);
73b437c8
JH
3807 else {
3808 for (value = 0; value < 256; value++)
3809 if (isALNUM(value))
936ed897 3810 ANYOF_BITMAP_SET(ret, value);
73b437c8 3811 }
c49a72a9
NC
3812 yesno = '+';
3813 what = "Word";
73b437c8
JH
3814 break;
3815 case ANYOF_NALNUM:
3816 if (LOC)
936ed897 3817 ANYOF_CLASS_SET(ret, ANYOF_NALNUM);
73b437c8
JH
3818 else {
3819 for (value = 0; value < 256; value++)
3820 if (!isALNUM(value))
936ed897 3821 ANYOF_BITMAP_SET(ret, value);
73b437c8 3822 }
c49a72a9
NC
3823 yesno = '!';
3824 what = "Word";
73b437c8 3825 break;
ffc61ed2 3826 case ANYOF_ALNUMC:
73b437c8 3827 if (LOC)
ffc61ed2 3828 ANYOF_CLASS_SET(ret, ANYOF_ALNUMC);
73b437c8
JH
3829 else {
3830 for (value = 0; value < 256; value++)
ffc61ed2 3831 if (isALNUMC(value))
936ed897 3832 ANYOF_BITMAP_SET(ret, value);
73b437c8 3833 }
c49a72a9
NC
3834 yesno = '+';
3835 what = "Alnum";
73b437c8
JH
3836 break;
3837 case ANYOF_NALNUMC:
3838 if (LOC)
936ed897 3839 ANYOF_CLASS_SET(ret, ANYOF_NALNUMC);
73b437c8
JH
3840 else {
3841 for (value = 0; value < 256; value++)
3842 if (!isALNUMC(value))
936ed897 3843 ANYOF_BITMAP_SET(ret, value);
73b437c8 3844 }
c49a72a9
NC
3845 yesno = '!';
3846 what = "Alnum";
73b437c8
JH
3847 break;
3848 case ANYOF_ALPHA:
3849 if (LOC)
936ed897 3850 ANYOF_CLASS_SET(ret, ANYOF_ALPHA);
73b437c8
JH
3851 else {
3852 for (value = 0; value < 256; value++)
3853 if (isALPHA(value))
936ed897 3854 ANYOF_BITMAP_SET(ret, value);
73b437c8 3855 }
c49a72a9
NC
3856 yesno = '+';
3857 what = "Alpha";
73b437c8
JH
3858 break;
3859 case ANYOF_NALPHA:
3860 if (LOC)
936ed897 3861 ANYOF_CLASS_SET(ret, ANYOF_NALPHA);
73b437c8
JH
3862 else {
3863 for (value = 0; value < 256; value++)
3864 if (!isALPHA(value))
936ed897 3865 ANYOF_BITMAP_SET(ret, value);
73b437c8 3866 }
c49a72a9
NC
3867 yesno = '!';
3868 what = "Alpha";
73b437c8
JH
3869 break;
3870 case ANYOF_ASCII:
3871 if (LOC)
936ed897 3872 ANYOF_CLASS_SET(ret, ANYOF_ASCII);
73b437c8 3873 else {
c7f1f016 3874#ifndef EBCDIC
1ba5c669
JH
3875 for (value = 0; value < 128; value++)
3876 ANYOF_BITMAP_SET(ret, value);
3877#else /* EBCDIC */
ffbc6a93 3878 for (value = 0; value < 256; value++) {
3a3c4447
JH
3879 if (isASCII(value))
3880 ANYOF_BITMAP_SET(ret, value);
ffbc6a93 3881 }
1ba5c669 3882#endif /* EBCDIC */
73b437c8 3883 }
c49a72a9
NC
3884 yesno = '+';
3885 what = "ASCII";
73b437c8
JH
3886 break;
3887 case ANYOF_NASCII:
3888 if (LOC)
936ed897 3889 ANYOF_CLASS_SET(ret, ANYOF_NASCII);
73b437c8 3890 else {
c7f1f016 3891#ifndef EBCDIC
1ba5c669
JH
3892 for (value = 128; value < 256; value++)
3893 ANYOF_BITMAP_SET(ret, value);
3894#else /* EBCDIC */
ffbc6a93 3895 for (value = 0; value < 256; value++) {
3a3c4447
JH
3896 if (!isASCII(value))
3897 ANYOF_BITMAP_SET(ret, value);
ffbc6a93 3898 }
1ba5c669 3899#endif /* EBCDIC */
73b437c8 3900 }
c49a72a9
NC
3901 yesno = '!';
3902 what = "ASCII";
73b437c8 3903 break;
aaa51d5e
JF
3904 case ANYOF_BLANK:
3905 if (LOC)
3906 ANYOF_CLASS_SET(ret, ANYOF_BLANK);
3907 else {
3908 for (value = 0; value < 256; value++)
3909 if (isBLANK(value))
3910 ANYOF_BITMAP_SET(ret, value);
3911 }
c49a72a9
NC
3912 yesno = '+';
3913 what = "Blank";
aaa51d5e
JF
3914 break;
3915 case ANYOF_NBLANK:
3916 if (LOC)
3917 ANYOF_CLASS_SET(ret, ANYOF_NBLANK);
3918 else {
3919 for (value = 0; value < 256; value++)
3920 if (!isBLANK(value))
3921 ANYOF_BITMAP_SET(ret, value);
3922 }
c49a72a9
NC
3923 yesno = '!';
3924 what = "Blank";
aaa51d5e 3925 break;
73b437c8
JH
3926 case ANYOF_CNTRL:
3927 if (LOC)
936ed897 3928 ANYOF_CLASS_SET(ret, ANYOF_CNTRL);
73b437c8
JH
3929 else {
3930 for (value = 0; value < 256; value++)
3931 if (isCNTRL(value))
936ed897 3932 ANYOF_BITMAP_SET(ret, value);
73b437c8 3933 }
c49a72a9
NC
3934 yesno = '+';
3935 what = "Cntrl";
73b437c8
JH
3936 break;
3937 case ANYOF_NCNTRL:
3938 if (LOC)
936ed897 3939 ANYOF_CLASS_SET(ret, ANYOF_NCNTRL);
73b437c8
JH
3940 else {
3941 for (value = 0; value < 256; value++)
3942 if (!isCNTRL(value))
936ed897 3943 ANYOF_BITMAP_SET(ret, value);
73b437c8 3944 }
c49a72a9
NC
3945 yesno = '!';
3946 what = "Cntrl";
ffc61ed2
JH
3947 break;
3948 case ANYOF_DIGIT:
3949 if (LOC)
3950 ANYOF_CLASS_SET(ret, ANYOF_DIGIT);
3951 else {
3952 /* consecutive digits assumed */
3953 for (value = '0'; value <= '9'; value++)
3954 ANYOF_BITMAP_SET(ret, value);
3955 }
c49a72a9
NC
3956 yesno = '+';
3957 what = "Digit";
ffc61ed2
JH
3958 break;
3959 case ANYOF_NDIGIT:
3960 if (LOC)
3961 ANYOF_CLASS_SET(ret, ANYOF_NDIGIT);
3962 else {
3963 /* consecutive digits assumed */
3964 for (value = 0; value < '0'; value++)
3965 ANYOF_BITMAP_SET(ret, value);
3966 for (value = '9' + 1; value < 256; value++)
3967 ANYOF_BITMAP_SET(ret, value);
3968 }
c49a72a9
NC
3969 yesno = '!';
3970 what = "Digit";
73b437c8
JH
3971 break;
3972 case ANYOF_GRAPH:
3973 if (LOC)
936ed897 3974 ANYOF_CLASS_SET(ret, ANYOF_GRAPH);
73b437c8
JH
3975 else {
3976 for (value = 0; value < 256; value++)
3977 if (isGRAPH(value))
936ed897 3978 ANYOF_BITMAP_SET(ret, value);
73b437c8 3979 }
c49a72a9
NC
3980 yesno = '+';
3981 what = "Graph";
73b437c8
JH
3982 break;
3983 case ANYOF_NGRAPH:
3984 if (LOC)
936ed897 3985 ANYOF_CLASS_SET(ret, ANYOF_NGRAPH);
73b437c8
JH
3986 else {
3987 for (value = 0; value < 256; value++)
3988 if (!isGRAPH(value))
936ed897 3989 ANYOF_BITMAP_SET(ret, value);
73b437c8 3990 }
c49a72a9
NC
3991 yesno = '!';
3992 what = "Graph";
73b437c8
JH
3993 break;
3994 case ANYOF_LOWER:
3995 if (LOC)
936ed897 3996 ANYOF_CLASS_SET(ret, ANYOF_LOWER);
73b437c8
JH
3997 else {
3998 for (value = 0; value < 256; value++)
3999 if (isLOWER(value))
936ed897 4000 ANYOF_BITMAP_SET(ret, value);
73b437c8 4001 }
c49a72a9
NC
4002 yesno = '+';
4003 what = "Lower";
73b437c8
JH
4004 break;
4005 case ANYOF_NLOWER:
4006 if (LOC)
936ed897 4007 ANYOF_CLASS_SET(ret, ANYOF_NLOWER);
73b437c8
JH
4008 else {
4009 for (value = 0; value < 256; value++)
4010 if (!isLOWER(value))
936ed897 4011 ANYOF_BITMAP_SET(ret, value);
73b437c8 4012 }
c49a72a9
NC
4013 yesno = '!';
4014 what = "Lower";
73b437c8
JH
4015 break;
4016 case ANYOF_PRINT:
4017 if (LOC)
936ed897 4018 ANYOF_CLASS_SET(ret, ANYOF_PRINT);
73b437c8
JH
4019 else {
4020 for (value = 0; value < 256; value++)
4021 if (isPRINT(value))
936ed897 4022 ANYOF_BITMAP_SET(ret, value);
73b437c8 4023 }
c49a72a9
NC
4024 yesno = '+';
4025 what = "Print";
73b437c8
JH
4026 break;
4027 case ANYOF_NPRINT:
4028 if (LOC)
936ed897 4029 ANYOF_CLASS_SET(ret, ANYOF_NPRINT);
73b437c8
JH
4030 else {
4031 for (value = 0; value < 256; value++)
4032 if (!isPRINT(value))
936ed897 4033 ANYOF_BITMAP_SET(ret, value);
73b437c8 4034 }
c49a72a9
NC
4035 yesno = '!';
4036 what = "Print";
73b437c8 4037 break;
aaa51d5e
JF
4038 case ANYOF_PSXSPC:
4039 if (LOC)
4040 ANYOF_CLASS_SET(ret, ANYOF_PSXSPC);
4041 else {
4042 for (value = 0; value < 256; value++)
4043 if (isPSXSPC(value))
4044 ANYOF_BITMAP_SET(ret, value);
4045 }
c49a72a9
NC
4046 yesno = '+';
4047 what = "Space";
aaa51d5e
JF
4048 break;
4049 case ANYOF_NPSXSPC:
4050 if (LOC)
4051 ANYOF_CLASS_SET(ret, ANYOF_NPSXSPC);
4052 else {
4053 for (value = 0; value < 256; value++)
4054 if (!isPSXSPC(value))
4055 ANYOF_BITMAP_SET(ret, value);
4056 }
c49a72a9
NC
4057 yesno = '!';
4058 what = "Space";
aaa51d5e 4059 break;
73b437c8
JH
4060 case ANYOF_PUNCT:
4061 if (LOC)
936ed897 4062 ANYOF_CLASS_SET(ret, ANYOF_PUNCT);
73b437c8
JH
4063 else {
4064 for (value = 0; value < 256; value++)
4065 if (isPUNCT(value))
936ed897 4066 ANYOF_BITMAP_SET(ret, value);
73b437c8 4067 }
c49a72a9
NC
4068 yesno = '+';
4069 what = "Punct";
73b437c8
JH
4070 break;
4071 case ANYOF_NPUNCT:
4072 if (LOC)
936ed897 4073 ANYOF_CLASS_SET(ret, ANYOF_NPUNCT);
73b437c8
JH
4074 else {
4075 for (value = 0; value < 256; value++)
4076 if (!isPUNCT(value))
936ed897 4077 ANYOF_BITMAP_SET(ret, value);
73b437c8 4078 }
c49a72a9
NC
4079 yesno = '!';
4080 what = "Punct";
ffc61ed2
JH
4081 break;
4082 case ANYOF_SPACE:
4083 if (LOC)
4084 ANYOF_CLASS_SET(ret, ANYOF_SPACE);
4085 else {
4086 for (value = 0; value < 256; value++)
4087 if (isSPACE(value))
4088 ANYOF_BITMAP_SET(ret, value);
4089 }
c49a72a9
NC
4090 yesno = '+';
4091 what = "SpacePerl";
ffc61ed2
JH
4092 break;
4093 case ANYOF_NSPACE:
4094 if (LOC)
4095 ANYOF_CLASS_SET(ret, ANYOF_NSPACE);
4096 else {
4097 for (value = 0; value < 256; value++)
4098 if (!isSPACE(value))
4099 ANYOF_BITMAP_SET(ret, value);
4100 }
c49a72a9
NC
4101 yesno = '!';
4102 what = "SpacePerl";
73b437c8
JH
4103 break;
4104 case ANYOF_UPPER:
4105 if (LOC)
936ed897 4106 ANYOF_CLASS_SET(ret, ANYOF_UPPER);
73b437c8
JH
4107 else {
4108 for (value = 0; value < 256; value++)
4109 if (isUPPER(value))
936ed897 4110 ANYOF_BITMAP_SET(ret, value);
73b437c8 4111 }
c49a72a9
NC
4112 yesno = '+';
4113 what = "Upper";
73b437c8
JH
4114 break;
4115 case ANYOF_NUPPER:
4116 if (LOC)
936ed897 4117 ANYOF_CLASS_SET(ret, ANYOF_NUPPER);
73b437c8
JH
4118 else {
4119 for (value = 0; value < 256; value++)
4120 if (!isUPPER(value))
936ed897 4121 ANYOF_BITMAP_SET(ret, value);
73b437c8 4122 }
c49a72a9
NC
4123 yesno = '!';
4124 what = "Upper";
73b437c8
JH
4125 break;
4126 case ANYOF_XDIGIT:
4127 if (LOC)
936ed897 4128 ANYOF_CLASS_SET(ret, ANYOF_XDIGIT);
73b437c8
JH
4129 else {
4130 for (value = 0; value < 256; value++)
4131 if (isXDIGIT(value))
936ed897 4132 ANYOF_BITMAP_SET(ret, value);
73b437c8 4133 }
c49a72a9
NC
4134 yesno = '+';
4135 what = "XDigit";
73b437c8
JH
4136 break;
4137 case ANYOF_NXDIGIT:
4138 if (LOC)
936ed897 4139 ANYOF_CLASS_SET(ret, ANYOF_NXDIGIT);
73b437c8
JH
4140 else {
4141 for (value = 0; value < 256; value++)
4142 if (!isXDIGIT(value))
936ed897 4143 ANYOF_BITMAP_SET(ret, value);
73b437c8 4144 }
c49a72a9
NC
4145 yesno = '!';
4146 what = "XDigit";
73b437c8 4147 break;
f81125e2
JP
4148 case ANYOF_MAX:
4149 /* this is to handle \p and \P */
4150 break;
73b437c8 4151 default:
b45f050a 4152 vFAIL("Invalid [::] class");
73b437c8 4153 break;
b8c5462f 4154 }
c49a72a9
NC
4155 if (what) {
4156 /* Strings such as "+utf8::isWord\n" */
4157 Perl_sv_catpvf(aTHX_ listsv, "%cutf8::Is%s\n", yesno, what);
4158 }
b8c5462f 4159 if (LOC)
936ed897 4160 ANYOF_FLAGS(ret) |= ANYOF_CLASS;
73b437c8 4161 continue;
a0d0e21e 4162 }
ffc61ed2
JH
4163 } /* end of namedclass \blah */
4164
a0d0e21e 4165 if (range) {
eb160463 4166 if (prevvalue > (IV)value) /* b-a */ {
b45f050a 4167 Simple_vFAIL4("Invalid [] range \"%*.*s\"",
830247a4
IZ
4168 RExC_parse - rangebegin,
4169 RExC_parse - rangebegin,
b45f050a 4170 rangebegin);
3568d838 4171 range = 0; /* not a valid range */
73b437c8 4172 }
a0d0e21e
LW
4173 }
4174 else {
3568d838 4175 prevvalue = value; /* save the beginning of the range */
830247a4
IZ
4176 if (*RExC_parse == '-' && RExC_parse+1 < RExC_end &&
4177 RExC_parse[1] != ']') {
4178 RExC_parse++;
ffc61ed2
JH
4179
4180 /* a bad range like \w-, [:word:]- ? */
4181 if (namedclass > OOB_NAMEDCLASS) {
e476b1b5 4182 if (ckWARN(WARN_REGEXP))
830247a4 4183 vWARN4(RExC_parse,
b45f050a 4184 "False [] range \"%*.*s\"",
830247a4
IZ
4185 RExC_parse - rangebegin,
4186 RExC_parse - rangebegin,
b45f050a 4187 rangebegin);
73b437c8 4188 if (!SIZE_ONLY)
936ed897 4189 ANYOF_BITMAP_SET(ret, '-');
73b437c8 4190 } else
ffc61ed2
JH
4191 range = 1; /* yeah, it's a range! */
4192 continue; /* but do it the next time */
a0d0e21e 4193 }
a687059c 4194 }
ffc61ed2 4195
93733859 4196 /* now is the next time */
ae5c130c 4197 if (!SIZE_ONLY) {
3568d838
JH
4198 IV i;
4199
4200 if (prevvalue < 256) {
4201 IV ceilvalue = value < 256 ? value : 255;
4202
4203#ifdef EBCDIC
1b2d223b
JH
4204 /* In EBCDIC [\x89-\x91] should include
4205 * the \x8e but [i-j] should not. */
4206 if (literal_endpoint == 2 &&
4207 ((isLOWER(prevvalue) && isLOWER(ceilvalue)) ||
4208 (isUPPER(prevvalue) && isUPPER(ceilvalue))))
ffc61ed2 4209 {
3568d838
JH
4210 if (isLOWER(prevvalue)) {
4211 for (i = prevvalue; i <= ceilvalue; i++)
ffc61ed2
JH
4212 if (isLOWER(i))
4213 ANYOF_BITMAP_SET(ret, i);
4214 } else {
3568d838 4215 for (i = prevvalue; i <= ceilvalue; i++)
ffc61ed2
JH
4216 if (isUPPER(i))
4217 ANYOF_BITMAP_SET(ret, i);
4218 }
8ada0baa 4219 }
ffc61ed2 4220 else
8ada0baa 4221#endif
a5961de5
JH
4222 for (i = prevvalue; i <= ceilvalue; i++)
4223 ANYOF_BITMAP_SET(ret, i);
3568d838 4224 }
a5961de5 4225 if (value > 255 || UTF) {
b08decb7
JH
4226 UV prevnatvalue = NATIVE_TO_UNI(prevvalue);
4227 UV natvalue = NATIVE_TO_UNI(value);
4228
ffc61ed2 4229 ANYOF_FLAGS(ret) |= ANYOF_UNICODE;
b08decb7 4230 if (prevnatvalue < natvalue) { /* what about > ? */
ffc61ed2 4231 Perl_sv_catpvf(aTHX_ listsv, "%04"UVxf"\t%04"UVxf"\n",
b08decb7
JH
4232 prevnatvalue, natvalue);
4233 }
4234 else if (prevnatvalue == natvalue) {
4235 Perl_sv_catpvf(aTHX_ listsv, "%04"UVxf"\n", natvalue);
09091399 4236 if (FOLD) {
89ebb4a3 4237 U8 foldbuf[UTF8_MAXBYTES_CASE+1];
254ba52a 4238 STRLEN foldlen;
2f3bf011 4239 UV f = to_uni_fold(natvalue, foldbuf, &foldlen);
254ba52a 4240
c840d2a2
JH
4241 /* If folding and foldable and a single
4242 * character, insert also the folded version
4243 * to the charclass. */
9e55ce06 4244 if (f != value) {
eb160463 4245 if (foldlen == (STRLEN)UNISKIP(f))
9e55ce06
JH
4246 Perl_sv_catpvf(aTHX_ listsv,
4247 "%04"UVxf"\n", f);
4248 else {
4249 /* Any multicharacter foldings
4250 * require the following transform:
4251 * [ABCDEF] -> (?:[ABCabcDEFd]|pq|rst)
4252 * where E folds into "pq" and F folds
4253 * into "rst", all other characters
4254 * fold to single characters. We save
4255 * away these multicharacter foldings,
4256 * to be later saved as part of the
4257 * additional "s" data. */
4258 SV *sv;
4259
4260 if (!unicode_alternate)
4261 unicode_alternate = newAV();
4262 sv = newSVpvn((char*)foldbuf, foldlen);
4263 SvUTF8_on(sv);
4264 av_push(unicode_alternate, sv);
4265 }
4266 }
254ba52a 4267
60a8b682
JH
4268 /* If folding and the value is one of the Greek
4269 * sigmas insert a few more sigmas to make the
4270 * folding rules of the sigmas to work right.
4271 * Note that not all the possible combinations
4272 * are handled here: some of them are handled
9e55ce06
JH
4273 * by the standard folding rules, and some of
4274 * them (literal or EXACTF cases) are handled
4275 * during runtime in regexec.c:S_find_byclass(). */
09091399
JH
4276 if (value == UNICODE_GREEK_SMALL_LETTER_FINAL_SIGMA) {
4277 Perl_sv_catpvf(aTHX_ listsv, "%04"UVxf"\n",
eb050b28 4278 (UV)UNICODE_GREEK_CAPITAL_LETTER_SIGMA);
09091399 4279 Perl_sv_catpvf(aTHX_ listsv, "%04"UVxf"\n",
eb050b28 4280 (UV)UNICODE_GREEK_SMALL_LETTER_SIGMA);
09091399
JH
4281 }
4282 else if (value == UNICODE_GREEK_CAPITAL_LETTER_SIGMA)
4283 Perl_sv_catpvf(aTHX_ listsv, "%04"UVxf"\n",
eb050b28 4284 (UV)UNICODE_GREEK_SMALL_LETTER_SIGMA);
09091399
JH
4285 }
4286 }
ffc61ed2 4287 }
1b2d223b
JH
4288#ifdef EBCDIC
4289 literal_endpoint = 0;
4290#endif
8ada0baa 4291 }
ffc61ed2
JH
4292
4293 range = 0; /* this range (if it was one) is done now */
a0d0e21e 4294 }
ffc61ed2 4295
936ed897 4296 if (need_class) {
4f66b38d 4297 ANYOF_FLAGS(ret) |= ANYOF_LARGE;
936ed897 4298 if (SIZE_ONLY)
830247a4 4299 RExC_size += ANYOF_CLASS_ADD_SKIP;
936ed897 4300 else
830247a4 4301 RExC_emit += ANYOF_CLASS_ADD_SKIP;
936ed897 4302 }
ffc61ed2 4303
ae5c130c 4304 /* optimize case-insensitive simple patterns (e.g. /[a-z]/i) */
b8c5462f 4305 if (!SIZE_ONLY &&
ffc61ed2 4306 /* If the only flag is folding (plus possibly inversion). */
516a5887
JH
4307 ((ANYOF_FLAGS(ret) & (ANYOF_FLAGS_ALL ^ ANYOF_INVERT)) == ANYOF_FOLD)
4308 ) {
a0ed51b3 4309 for (value = 0; value < 256; ++value) {
936ed897 4310 if (ANYOF_BITMAP_TEST(ret, value)) {
eb160463 4311 UV fold = PL_fold[value];
ffc61ed2
JH
4312
4313 if (fold != value)
4314 ANYOF_BITMAP_SET(ret, fold);
ae5c130c
GS
4315 }
4316 }
936ed897 4317 ANYOF_FLAGS(ret) &= ~ANYOF_FOLD;
ae5c130c 4318 }
ffc61ed2 4319
ae5c130c 4320 /* optimize inverted simple patterns (e.g. [^a-z]) */
3568d838 4321 if (!SIZE_ONLY && optimize_invert &&
ffc61ed2
JH
4322 /* If the only flag is inversion. */
4323 (ANYOF_FLAGS(ret) & ANYOF_FLAGS_ALL) == ANYOF_INVERT) {
b8c5462f 4324 for (value = 0; value < ANYOF_BITMAP_SIZE; ++value)
936ed897 4325 ANYOF_BITMAP(ret)[value] ^= ANYOF_FLAGS_ALL;
1aa99e6b 4326 ANYOF_FLAGS(ret) = ANYOF_UNICODE_ALL;
ae5c130c 4327 }
a0d0e21e 4328
b81d288d 4329 if (!SIZE_ONLY) {
fde631ed 4330 AV *av = newAV();
ffc61ed2
JH
4331 SV *rv;
4332
9e55ce06 4333 /* The 0th element stores the character class description
6a0407ee 4334 * in its textual form: used later (regexec.c:Perl_regclass_swash())
9e55ce06
JH
4335 * to initialize the appropriate swash (which gets stored in
4336 * the 1st element), and also useful for dumping the regnode.
4337 * The 2nd element stores the multicharacter foldings,
6a0407ee 4338 * used later (regexec.c:S_reginclass()). */
ffc61ed2
JH
4339 av_store(av, 0, listsv);
4340 av_store(av, 1, NULL);
9e55ce06 4341 av_store(av, 2, (SV*)unicode_alternate);
ffc61ed2 4342 rv = newRV_noinc((SV*)av);
19860706 4343 n = add_data(pRExC_state, 1, "s");
830247a4 4344 RExC_rx->data->data[n] = (void*)rv;
ffc61ed2 4345 ARG_SET(ret, n);
a0ed51b3
LW
4346 }
4347
4348 return ret;
4349}
4350
76e3520e 4351STATIC char*
830247a4 4352S_nextchar(pTHX_ RExC_state_t *pRExC_state)
a0d0e21e 4353{
830247a4 4354 char* retval = RExC_parse++;
a0d0e21e 4355
4633a7c4 4356 for (;;) {
830247a4
IZ
4357 if (*RExC_parse == '(' && RExC_parse[1] == '?' &&
4358 RExC_parse[2] == '#') {
e994fd66
AE
4359 while (*RExC_parse != ')') {
4360 if (RExC_parse == RExC_end)
4361 FAIL("Sequence (?#... not terminated");
830247a4 4362 RExC_parse++;
e994fd66 4363 }
830247a4 4364 RExC_parse++;
4633a7c4
LW
4365 continue;
4366 }
e2509266 4367 if (RExC_flags & PMf_EXTENDED) {
830247a4
IZ
4368 if (isSPACE(*RExC_parse)) {
4369 RExC_parse++;
748a9306
LW
4370 continue;
4371 }
830247a4 4372 else if (*RExC_parse == '#') {
e994fd66
AE
4373 while (RExC_parse < RExC_end)
4374 if (*RExC_parse++ == '\n') break;
748a9306
LW
4375 continue;
4376 }
748a9306 4377 }
4633a7c4 4378 return retval;
a0d0e21e 4379 }
a687059c
LW
4380}
4381
4382/*
c277df42 4383- reg_node - emit a node
a0d0e21e 4384*/
76e3520e 4385STATIC regnode * /* Location. */
830247a4 4386S_reg_node(pTHX_ RExC_state_t *pRExC_state, U8 op)
a687059c 4387{
c277df42
IZ
4388 register regnode *ret;
4389 register regnode *ptr;
a687059c 4390
830247a4 4391 ret = RExC_emit;
c277df42 4392 if (SIZE_ONLY) {
830247a4
IZ
4393 SIZE_ALIGN(RExC_size);
4394 RExC_size += 1;
a0d0e21e
LW
4395 return(ret);
4396 }
a687059c 4397
c277df42 4398 NODE_ALIGN_FILL(ret);
a0d0e21e 4399 ptr = ret;
c277df42 4400 FILL_ADVANCE_NODE(ptr, op);
fac92740 4401 if (RExC_offsets) { /* MJD */
ccb2c380 4402 MJD_OFFSET_DEBUG(("%s:%u: (op %s) %s %u <- %u (len %u) (max %u).\n",
fac92740
MJD
4403 "reg_node", __LINE__,
4404 reg_name[op],
4405 RExC_emit - RExC_emit_start > RExC_offsets[0]
4406 ? "Overwriting end of array!\n" : "OK",
4407 RExC_emit - RExC_emit_start,
4408 RExC_parse - RExC_start,
4409 RExC_offsets[0]));
ccb2c380 4410 Set_Node_Offset(RExC_emit, RExC_parse + (op == END));
fac92740
MJD
4411 }
4412
830247a4 4413 RExC_emit = ptr;
a687059c 4414
a0d0e21e 4415 return(ret);
a687059c
LW
4416}
4417
4418/*
a0d0e21e
LW
4419- reganode - emit a node with an argument
4420*/
76e3520e 4421STATIC regnode * /* Location. */
830247a4 4422S_reganode(pTHX_ RExC_state_t *pRExC_state, U8 op, U32 arg)
fe14fcc3 4423{
c277df42
IZ
4424 register regnode *ret;
4425 register regnode *ptr;
fe14fcc3 4426
830247a4 4427 ret = RExC_emit;
c277df42 4428 if (SIZE_ONLY) {
830247a4
IZ
4429 SIZE_ALIGN(RExC_size);
4430 RExC_size += 2;
a0d0e21e
LW
4431 return(ret);
4432 }
fe14fcc3 4433
c277df42 4434 NODE_ALIGN_FILL(ret);
a0d0e21e 4435 ptr = ret;
c277df42 4436 FILL_ADVANCE_NODE_ARG(ptr, op, arg);
fac92740 4437 if (RExC_offsets) { /* MJD */
ccb2c380 4438 MJD_OFFSET_DEBUG(("%s(%d): (op %s) %s %u <- %u (max %u).\n",
fac92740 4439 "reganode",
ccb2c380
MP
4440 __LINE__,
4441 reg_name[op],
fac92740
MJD
4442 RExC_emit - RExC_emit_start > RExC_offsets[0] ?
4443 "Overwriting end of array!\n" : "OK",
4444 RExC_emit - RExC_emit_start,
4445 RExC_parse - RExC_start,
4446 RExC_offsets[0]));
ccb2c380 4447 Set_Cur_Node_Offset;
fac92740
MJD
4448 }
4449
830247a4 4450 RExC_emit = ptr;
fe14fcc3 4451
a0d0e21e 4452 return(ret);
fe14fcc3
LW
4453}
4454
4455/*
cd439c50 4456- reguni - emit (if appropriate) a Unicode character
a0ed51b3
LW
4457*/
4458STATIC void
830247a4 4459S_reguni(pTHX_ RExC_state_t *pRExC_state, UV uv, char* s, STRLEN* lenp)
a0ed51b3 4460{
5e12f4fb 4461 *lenp = SIZE_ONLY ? UNISKIP(uv) : (uvchr_to_utf8((U8*)s, uv) - (U8*)s);
a0ed51b3
LW
4462}
4463
4464/*
a0d0e21e
LW
4465- reginsert - insert an operator in front of already-emitted operand
4466*
4467* Means relocating the operand.
4468*/
76e3520e 4469STATIC void
830247a4 4470S_reginsert(pTHX_ RExC_state_t *pRExC_state, U8 op, regnode *opnd)
a687059c 4471{
c277df42
IZ
4472 register regnode *src;
4473 register regnode *dst;
4474 register regnode *place;
4475 register int offset = regarglen[(U8)op];
b81d288d 4476
22c35a8c 4477/* (PL_regkind[(U8)op] == CURLY ? EXTRA_STEP_2ARGS : 0); */
c277df42
IZ
4478
4479 if (SIZE_ONLY) {
830247a4 4480 RExC_size += NODE_STEP_REGNODE + offset;
a0d0e21e
LW
4481 return;
4482 }
a687059c 4483
830247a4
IZ
4484 src = RExC_emit;
4485 RExC_emit += NODE_STEP_REGNODE + offset;
4486 dst = RExC_emit;
fac92740 4487 while (src > opnd) {
c277df42 4488 StructCopy(--src, --dst, regnode);
fac92740 4489 if (RExC_offsets) { /* MJD 20010112 */
ccb2c380 4490 MJD_OFFSET_DEBUG(("%s(%d): (op %s) %s copy %u -> %u (max %u).\n",
fac92740 4491 "reg_insert",
ccb2c380
MP
4492 __LINE__,
4493 reg_name[op],
fac92740
MJD
4494 dst - RExC_emit_start > RExC_offsets[0]
4495 ? "Overwriting end of array!\n" : "OK",
4496 src - RExC_emit_start,
4497 dst - RExC_emit_start,
4498 RExC_offsets[0]));
ccb2c380
MP
4499 Set_Node_Offset_To_R(dst-RExC_emit_start, Node_Offset(src));
4500 Set_Node_Length_To_R(dst-RExC_emit_start, Node_Length(src));
fac92740
MJD
4501 }
4502 }
4503
a0d0e21e
LW
4504
4505 place = opnd; /* Op node, where operand used to be. */
fac92740 4506 if (RExC_offsets) { /* MJD */
ccb2c380 4507 MJD_OFFSET_DEBUG(("%s(%d): (op %s) %s %u <- %u (max %u).\n",
fac92740 4508 "reginsert",
ccb2c380
MP
4509 __LINE__,
4510 reg_name[op],
fac92740
MJD
4511 place - RExC_emit_start > RExC_offsets[0]
4512 ? "Overwriting end of array!\n" : "OK",
4513 place - RExC_emit_start,
4514 RExC_parse - RExC_start,
4515 RExC_offsets[0]));
ccb2c380 4516 Set_Node_Offset(place, RExC_parse);
45948336 4517 Set_Node_Length(place, 1);
fac92740 4518 }
c277df42
IZ
4519 src = NEXTOPER(place);
4520 FILL_ADVANCE_NODE(place, op);
4521 Zero(src, offset, regnode);
a687059c
LW
4522}
4523
4524/*
c277df42 4525- regtail - set the next-pointer at the end of a node chain of p to val.
a0d0e21e 4526*/
76e3520e 4527STATIC void
830247a4 4528S_regtail(pTHX_ RExC_state_t *pRExC_state, regnode *p, regnode *val)
a687059c 4529{
c277df42
IZ
4530 register regnode *scan;
4531 register regnode *temp;
a0d0e21e 4532
c277df42 4533 if (SIZE_ONLY)
a0d0e21e
LW
4534 return;
4535
4536 /* Find last node. */
4537 scan = p;
4538 for (;;) {
4539 temp = regnext(scan);
4540 if (temp == NULL)
4541 break;
4542 scan = temp;
4543 }
a687059c 4544
c277df42
IZ
4545 if (reg_off_by_arg[OP(scan)]) {
4546 ARG_SET(scan, val - scan);
a0ed51b3
LW
4547 }
4548 else {
c277df42
IZ
4549 NEXT_OFF(scan) = val - scan;
4550 }
a687059c
LW
4551}
4552
4553/*
a0d0e21e
LW
4554- regoptail - regtail on operand of first argument; nop if operandless
4555*/
76e3520e 4556STATIC void
830247a4 4557S_regoptail(pTHX_ RExC_state_t *pRExC_state, regnode *p, regnode *val)
a687059c 4558{
a0d0e21e 4559 /* "Operandless" and "op != BRANCH" are synonymous in practice. */
c277df42
IZ
4560 if (p == NULL || SIZE_ONLY)
4561 return;
22c35a8c 4562 if (PL_regkind[(U8)OP(p)] == BRANCH) {
830247a4 4563 regtail(pRExC_state, NEXTOPER(p), val);
a0ed51b3 4564 }
22c35a8c 4565 else if ( PL_regkind[(U8)OP(p)] == BRANCHJ) {
830247a4 4566 regtail(pRExC_state, NEXTOPER(NEXTOPER(p)), val);
a0ed51b3
LW
4567 }
4568 else
a0d0e21e 4569 return;
a687059c
LW
4570}
4571
4572/*
4573 - regcurly - a little FSA that accepts {\d+,?\d*}
4574 */
79072805 4575STATIC I32
cea2e8a9 4576S_regcurly(pTHX_ register char *s)
a687059c
LW
4577{
4578 if (*s++ != '{')
4579 return FALSE;
f0fcb552 4580 if (!isDIGIT(*s))
a687059c 4581 return FALSE;
f0fcb552 4582 while (isDIGIT(*s))
a687059c
LW
4583 s++;
4584 if (*s == ',')
4585 s++;
f0fcb552 4586 while (isDIGIT(*s))
a687059c
LW
4587 s++;
4588 if (*s != '}')
4589 return FALSE;
4590 return TRUE;
4591}
4592
a687059c 4593
8fa7f367
JH
4594#ifdef DEBUGGING
4595
76e3520e 4596STATIC regnode *
cea2e8a9 4597S_dumpuntil(pTHX_ regnode *start, regnode *node, regnode *last, SV* sv, I32 l)
c277df42 4598{
f248d071 4599 register U8 op = EXACT; /* Arbitrary non-END op. */
155aba94 4600 register regnode *next;
c277df42
IZ
4601
4602 while (op != END && (!last || node < last)) {
4603 /* While that wasn't END last time... */
4604
4605 NODE_ALIGN(node);
4606 op = OP(node);
4607 if (op == CLOSE)
4608 l--;
4609 next = regnext(node);
4610 /* Where, what. */
4611 if (OP(node) == OPTIMIZED)
4612 goto after_print;
4613 regprop(sv, node);
b900a521 4614 PerlIO_printf(Perl_debug_log, "%4"IVdf":%*s%s", (IV)(node - start),
f1dbda3d 4615 (int)(2*l + 1), "", SvPVX(sv));
c277df42
IZ
4616 if (next == NULL) /* Next ptr. */
4617 PerlIO_printf(Perl_debug_log, "(0)");
b81d288d 4618 else
b900a521 4619 PerlIO_printf(Perl_debug_log, "(%"IVdf")", (IV)(next - start));
c277df42
IZ
4620 (void)PerlIO_putc(Perl_debug_log, '\n');
4621 after_print:
22c35a8c 4622 if (PL_regkind[(U8)op] == BRANCHJ) {
b81d288d
AB
4623 register regnode *nnode = (OP(next) == LONGJMP
4624 ? regnext(next)
c277df42
IZ
4625 : next);
4626 if (last && nnode > last)
4627 nnode = last;
4628 node = dumpuntil(start, NEXTOPER(NEXTOPER(node)), nnode, sv, l + 1);
a0ed51b3 4629 }
22c35a8c 4630 else if (PL_regkind[(U8)op] == BRANCH) {
c277df42 4631 node = dumpuntil(start, NEXTOPER(node), next, sv, l + 1);
a0ed51b3
LW
4632 }
4633 else if ( op == CURLY) { /* `next' might be very big: optimizer */
c277df42
IZ
4634 node = dumpuntil(start, NEXTOPER(node) + EXTRA_STEP_2ARGS,
4635 NEXTOPER(node) + EXTRA_STEP_2ARGS + 1, sv, l + 1);
a0ed51b3 4636 }
22c35a8c 4637 else if (PL_regkind[(U8)op] == CURLY && op != CURLYX) {
c277df42
IZ
4638 node = dumpuntil(start, NEXTOPER(node) + EXTRA_STEP_2ARGS,
4639 next, sv, l + 1);
a0ed51b3
LW
4640 }
4641 else if ( op == PLUS || op == STAR) {
c277df42 4642 node = dumpuntil(start, NEXTOPER(node), NEXTOPER(node) + 1, sv, l + 1);
a0ed51b3
LW
4643 }
4644 else if (op == ANYOF) {
4f66b38d
HS
4645 /* arglen 1 + class block */
4646 node += 1 + ((ANYOF_FLAGS(node) & ANYOF_LARGE)
4647 ? ANYOF_CLASS_SKIP : ANYOF_SKIP);
4648 node = NEXTOPER(node);
a0ed51b3 4649 }
22c35a8c 4650 else if (PL_regkind[(U8)op] == EXACT) {
c277df42 4651 /* Literal string, where present. */
cd439c50 4652 node += NODE_SZ_STR(node) - 1;
c277df42 4653 node = NEXTOPER(node);
a0ed51b3
LW
4654 }
4655 else {
c277df42
IZ
4656 node = NEXTOPER(node);
4657 node += regarglen[(U8)op];
4658 }
4659 if (op == CURLYX || op == OPEN)
4660 l++;
4661 else if (op == WHILEM)
4662 l--;
4663 }
4664 return node;
4665}
4666
8fa7f367
JH
4667#endif /* DEBUGGING */
4668
a687059c 4669/*
fd181c75 4670 - regdump - dump a regexp onto Perl_debug_log in vaguely comprehensible form
a687059c
LW
4671 */
4672void
864dbfa3 4673Perl_regdump(pTHX_ regexp *r)
a687059c 4674{
35ff7856 4675#ifdef DEBUGGING
46fc3d4c 4676 SV *sv = sv_newmortal();
a687059c 4677
c277df42 4678 (void)dumpuntil(r->program, r->program + 1, NULL, sv, 0);
a0d0e21e
LW
4679
4680 /* Header fields of interest. */
c277df42 4681 if (r->anchored_substr)
7b0972df 4682 PerlIO_printf(Perl_debug_log,
b81d288d 4683 "anchored `%s%.*s%s'%s at %"IVdf" ",
3280af22 4684 PL_colors[0],
7b0972df 4685 (int)(SvCUR(r->anchored_substr) - (SvTAIL(r->anchored_substr)!=0)),
b81d288d 4686 SvPVX(r->anchored_substr),
3280af22 4687 PL_colors[1],
c277df42 4688 SvTAIL(r->anchored_substr) ? "$" : "",
7b0972df 4689 (IV)r->anchored_offset);
33b8afdf
JH
4690 else if (r->anchored_utf8)
4691 PerlIO_printf(Perl_debug_log,
4692 "anchored utf8 `%s%.*s%s'%s at %"IVdf" ",
4693 PL_colors[0],
4694 (int)(SvCUR(r->anchored_utf8) - (SvTAIL(r->anchored_utf8)!=0)),
4695 SvPVX(r->anchored_utf8),
4696 PL_colors[1],
4697 SvTAIL(r->anchored_utf8) ? "$" : "",
4698 (IV)r->anchored_offset);
c277df42 4699 if (r->float_substr)
7b0972df 4700 PerlIO_printf(Perl_debug_log,
b81d288d 4701 "floating `%s%.*s%s'%s at %"IVdf"..%"UVuf" ",
3280af22 4702 PL_colors[0],
b81d288d 4703 (int)(SvCUR(r->float_substr) - (SvTAIL(r->float_substr)!=0)),
2c2d71f5 4704 SvPVX(r->float_substr),
3280af22 4705 PL_colors[1],
c277df42 4706 SvTAIL(r->float_substr) ? "$" : "",
7b0972df 4707 (IV)r->float_min_offset, (UV)r->float_max_offset);
33b8afdf
JH
4708 else if (r->float_utf8)
4709 PerlIO_printf(Perl_debug_log,
4710 "floating utf8 `%s%.*s%s'%s at %"IVdf"..%"UVuf" ",
4711 PL_colors[0],
4712 (int)(SvCUR(r->float_utf8) - (SvTAIL(r->float_utf8)!=0)),
4713 SvPVX(r->float_utf8),
4714 PL_colors[1],
4715 SvTAIL(r->float_utf8) ? "$" : "",
4716 (IV)r->float_min_offset, (UV)r->float_max_offset);
4717 if (r->check_substr || r->check_utf8)
b81d288d
AB
4718 PerlIO_printf(Perl_debug_log,
4719 r->check_substr == r->float_substr
33b8afdf 4720 && r->check_utf8 == r->float_utf8
c277df42
IZ
4721 ? "(checking floating" : "(checking anchored");
4722 if (r->reganch & ROPT_NOSCAN)
4723 PerlIO_printf(Perl_debug_log, " noscan");
4724 if (r->reganch & ROPT_CHECK_ALL)
4725 PerlIO_printf(Perl_debug_log, " isall");
33b8afdf 4726 if (r->check_substr || r->check_utf8)
c277df42
IZ
4727 PerlIO_printf(Perl_debug_log, ") ");
4728
46fc3d4c 4729 if (r->regstclass) {
4730 regprop(sv, r->regstclass);
4731 PerlIO_printf(Perl_debug_log, "stclass `%s' ", SvPVX(sv));
4732 }
774d564b 4733 if (r->reganch & ROPT_ANCH) {
4734 PerlIO_printf(Perl_debug_log, "anchored");
4735 if (r->reganch & ROPT_ANCH_BOL)
4736 PerlIO_printf(Perl_debug_log, "(BOL)");
c277df42
IZ
4737 if (r->reganch & ROPT_ANCH_MBOL)
4738 PerlIO_printf(Perl_debug_log, "(MBOL)");
cad2e5aa
JH
4739 if (r->reganch & ROPT_ANCH_SBOL)
4740 PerlIO_printf(Perl_debug_log, "(SBOL)");
774d564b 4741 if (r->reganch & ROPT_ANCH_GPOS)
4742 PerlIO_printf(Perl_debug_log, "(GPOS)");
4743 PerlIO_putc(Perl_debug_log, ' ');
4744 }
c277df42
IZ
4745 if (r->reganch & ROPT_GPOS_SEEN)
4746 PerlIO_printf(Perl_debug_log, "GPOS ");
a0d0e21e 4747 if (r->reganch & ROPT_SKIP)
760ac839 4748 PerlIO_printf(Perl_debug_log, "plus ");
a0d0e21e 4749 if (r->reganch & ROPT_IMPLICIT)
760ac839 4750 PerlIO_printf(Perl_debug_log, "implicit ");
760ac839 4751 PerlIO_printf(Perl_debug_log, "minlen %ld ", (long) r->minlen);
ce862d02
IZ
4752 if (r->reganch & ROPT_EVAL_SEEN)
4753 PerlIO_printf(Perl_debug_log, "with eval ");
760ac839 4754 PerlIO_printf(Perl_debug_log, "\n");
fac92740
MJD
4755 if (r->offsets) {
4756 U32 i;
4757 U32 len = r->offsets[0];
392fbf5d 4758 PerlIO_printf(Perl_debug_log, "Offsets: [%"UVuf"]\n\t", (UV)r->offsets[0]);
fac92740 4759 for (i = 1; i <= len; i++)
392fbf5d
RB
4760 PerlIO_printf(Perl_debug_log, "%"UVuf"[%"UVuf"] ",
4761 (UV)r->offsets[i*2-1],
4762 (UV)r->offsets[i*2]);
fac92740
MJD
4763 PerlIO_printf(Perl_debug_log, "\n");
4764 }
17c3b450 4765#endif /* DEBUGGING */
a687059c
LW
4766}
4767
8fa7f367
JH
4768#ifdef DEBUGGING
4769
653099ff
GS
4770STATIC void
4771S_put_byte(pTHX_ SV *sv, int c)
4772{
7be5a6cf 4773 if (isCNTRL(c) || c == 255 || !isPRINT(c))
653099ff
GS
4774 Perl_sv_catpvf(aTHX_ sv, "\\%o", c);
4775 else if (c == '-' || c == ']' || c == '\\' || c == '^')
4776 Perl_sv_catpvf(aTHX_ sv, "\\%c", c);
4777 else
4778 Perl_sv_catpvf(aTHX_ sv, "%c", c);
4779}
4780
8fa7f367
JH
4781#endif /* DEBUGGING */
4782
a687059c 4783/*
a0d0e21e
LW
4784- regprop - printable representation of opcode
4785*/
46fc3d4c 4786void
864dbfa3 4787Perl_regprop(pTHX_ SV *sv, regnode *o)
a687059c 4788{
35ff7856 4789#ifdef DEBUGGING
9b155405 4790 register int k;
a0d0e21e 4791
54dc92de 4792 sv_setpvn(sv, "", 0);
9b155405 4793 if (OP(o) >= reg_num) /* regnode.type is unsigned */
830247a4
IZ
4794 /* It would be nice to FAIL() here, but this may be called from
4795 regexec.c, and it would be hard to supply pRExC_state. */
4796 Perl_croak(aTHX_ "Corrupted regexp opcode");
9b155405
IZ
4797 sv_catpv(sv, (char*)reg_name[OP(o)]); /* Take off const! */
4798
4799 k = PL_regkind[(U8)OP(o)];
4800
2a782b5b
JH
4801 if (k == EXACT) {
4802 SV *dsv = sv_2mortal(newSVpvn("", 0));
c728cb41
JH
4803 /* Using is_utf8_string() is a crude hack but it may
4804 * be the best for now since we have no flag "this EXACTish
4805 * node was UTF-8" --jhi */
4806 bool do_utf8 = is_utf8_string((U8*)STRING(o), STR_LEN(o));
8a989385 4807 char *s = do_utf8 ?
c728cb41
JH
4808 pv_uni_display(dsv, (U8*)STRING(o), STR_LEN(o), 60,
4809 UNI_DISPLAY_REGEX) :
2a782b5b 4810 STRING(o);
40eddc46 4811 int len = do_utf8 ?
2a782b5b
JH
4812 strlen(s) :
4813 STR_LEN(o);
4814 Perl_sv_catpvf(aTHX_ sv, " <%s%.*s%s>",
4815 PL_colors[0],
4816 len, s,
4817 PL_colors[1]);
4818 }
9b155405 4819 else if (k == CURLY) {
cb434fcc 4820 if (OP(o) == CURLYM || OP(o) == CURLYN || OP(o) == CURLYX)
cea2e8a9
GS
4821 Perl_sv_catpvf(aTHX_ sv, "[%d]", o->flags); /* Parenth number */
4822 Perl_sv_catpvf(aTHX_ sv, " {%d,%d}", ARG1(o), ARG2(o));
a0d0e21e 4823 }
2c2d71f5
JH
4824 else if (k == WHILEM && o->flags) /* Ordinal/of */
4825 Perl_sv_catpvf(aTHX_ sv, "[%d/%d]", o->flags & 0xf, o->flags>>4);
9b155405 4826 else if (k == REF || k == OPEN || k == CLOSE || k == GROUPP )
894356b3 4827 Perl_sv_catpvf(aTHX_ sv, "%d", (int)ARG(o)); /* Parenth number */
9b155405 4828 else if (k == LOGICAL)
04ebc1ab 4829 Perl_sv_catpvf(aTHX_ sv, "[%d]", o->flags); /* 2: embedded, otherwise 1 */
653099ff
GS
4830 else if (k == ANYOF) {
4831 int i, rangestart = -1;
ffc61ed2 4832 U8 flags = ANYOF_FLAGS(o);
a6d05634 4833 const char * const anyofs[] = { /* Should be synchronized with
19860706 4834 * ANYOF_ #xdefines in regcomp.h */
653099ff
GS
4835 "\\w",
4836 "\\W",
4837 "\\s",
4838 "\\S",
4839 "\\d",
4840 "\\D",
4841 "[:alnum:]",
4842 "[:^alnum:]",
4843 "[:alpha:]",
4844 "[:^alpha:]",
4845 "[:ascii:]",
4846 "[:^ascii:]",
4847 "[:ctrl:]",
4848 "[:^ctrl:]",
4849 "[:graph:]",
4850 "[:^graph:]",
4851 "[:lower:]",
4852 "[:^lower:]",
4853 "[:print:]",
4854 "[:^print:]",
4855 "[:punct:]",
4856 "[:^punct:]",
4857 "[:upper:]",
aaa51d5e 4858 "[:^upper:]",
653099ff 4859 "[:xdigit:]",
aaa51d5e
JF
4860 "[:^xdigit:]",
4861 "[:space:]",
4862 "[:^space:]",
4863 "[:blank:]",
4864 "[:^blank:]"
653099ff
GS
4865 };
4866
19860706 4867 if (flags & ANYOF_LOCALE)
653099ff 4868 sv_catpv(sv, "{loc}");
19860706 4869 if (flags & ANYOF_FOLD)
653099ff
GS
4870 sv_catpv(sv, "{i}");
4871 Perl_sv_catpvf(aTHX_ sv, "[%s", PL_colors[0]);
19860706 4872 if (flags & ANYOF_INVERT)
653099ff 4873 sv_catpv(sv, "^");
ffc61ed2
JH
4874 for (i = 0; i <= 256; i++) {
4875 if (i < 256 && ANYOF_BITMAP_TEST(o,i)) {
4876 if (rangestart == -1)
4877 rangestart = i;
4878 } else if (rangestart != -1) {
4879 if (i <= rangestart + 3)
4880 for (; rangestart < i; rangestart++)
653099ff 4881 put_byte(sv, rangestart);
ffc61ed2
JH
4882 else {
4883 put_byte(sv, rangestart);
4884 sv_catpv(sv, "-");
4885 put_byte(sv, i - 1);
653099ff 4886 }
ffc61ed2 4887 rangestart = -1;
653099ff 4888 }
847a199f 4889 }
ffc61ed2
JH
4890
4891 if (o->flags & ANYOF_CLASS)
4892 for (i = 0; i < sizeof(anyofs)/sizeof(char*); i++)
4893 if (ANYOF_CLASS_TEST(o,i))
4894 sv_catpv(sv, anyofs[i]);
4895
4896 if (flags & ANYOF_UNICODE)
4897 sv_catpv(sv, "{unicode}");
1aa99e6b 4898 else if (flags & ANYOF_UNICODE_ALL)
2a782b5b 4899 sv_catpv(sv, "{unicode_all}");
ffc61ed2
JH
4900
4901 {
4902 SV *lv;
9e55ce06 4903 SV *sw = regclass_swash(o, FALSE, &lv, 0);
b81d288d 4904
ffc61ed2
JH
4905 if (lv) {
4906 if (sw) {
89ebb4a3 4907 U8 s[UTF8_MAXBYTES_CASE+1];
b81d288d 4908
ffc61ed2 4909 for (i = 0; i <= 256; i++) { /* just the first 256 */
2b9d42f0 4910 U8 *e = uvchr_to_utf8(s, i);
ffc61ed2 4911
3568d838 4912 if (i < 256 && swash_fetch(sw, s, TRUE)) {
ffc61ed2
JH
4913 if (rangestart == -1)
4914 rangestart = i;
4915 } else if (rangestart != -1) {
4916 U8 *p;
b81d288d 4917
ffc61ed2
JH
4918 if (i <= rangestart + 3)
4919 for (; rangestart < i; rangestart++) {
2b9d42f0 4920 for(e = uvchr_to_utf8(s, rangestart), p = s; p < e; p++)
ffc61ed2
JH
4921 put_byte(sv, *p);
4922 }
4923 else {
2b9d42f0 4924 for (e = uvchr_to_utf8(s, rangestart), p = s; p < e; p++)
ffc61ed2
JH
4925 put_byte(sv, *p);
4926 sv_catpv(sv, "-");
2b9d42f0 4927 for (e = uvchr_to_utf8(s, i - 1), p = s; p < e; p++)
ffc61ed2
JH
4928 put_byte(sv, *p);
4929 }
4930 rangestart = -1;
4931 }
19860706 4932 }
ffc61ed2
JH
4933
4934 sv_catpv(sv, "..."); /* et cetera */
19860706 4935 }
fde631ed 4936
ffc61ed2 4937 {
2e0de35c 4938 char *s = savesvpv(lv);
ffc61ed2 4939 char *origs = s;
b81d288d 4940
ffc61ed2 4941 while(*s && *s != '\n') s++;
b81d288d 4942
ffc61ed2
JH
4943 if (*s == '\n') {
4944 char *t = ++s;
4945
4946 while (*s) {
4947 if (*s == '\n')
4948 *s = ' ';
4949 s++;
4950 }
4951 if (s[-1] == ' ')
4952 s[-1] = 0;
4953
4954 sv_catpv(sv, t);
fde631ed 4955 }
b81d288d 4956
ffc61ed2 4957 Safefree(origs);
fde631ed
JH
4958 }
4959 }
653099ff 4960 }
ffc61ed2 4961
653099ff
GS
4962 Perl_sv_catpvf(aTHX_ sv, "%s]", PL_colors[1]);
4963 }
9b155405 4964 else if (k == BRANCHJ && (OP(o) == UNLESSM || OP(o) == IFMATCH))
cea2e8a9 4965 Perl_sv_catpvf(aTHX_ sv, "[-%d]", o->flags);
17c3b450 4966#endif /* DEBUGGING */
35ff7856 4967}
a687059c 4968
cad2e5aa
JH
4969SV *
4970Perl_re_intuit_string(pTHX_ regexp *prog)
4971{ /* Assume that RE_INTUIT is set */
4972 DEBUG_r(
4973 { STRLEN n_a;
33b8afdf
JH
4974 char *s = SvPV(prog->check_substr
4975 ? prog->check_substr : prog->check_utf8, n_a);
cad2e5aa
JH
4976
4977 if (!PL_colorset) reginitcolors();
4978 PerlIO_printf(Perl_debug_log,
33b8afdf
JH
4979 "%sUsing REx %ssubstr:%s `%s%.60s%s%s'\n",
4980 PL_colors[4],
4981 prog->check_substr ? "" : "utf8 ",
4982 PL_colors[5],PL_colors[0],
cad2e5aa
JH
4983 s,
4984 PL_colors[1],
4985 (strlen(s) > 60 ? "..." : ""));
4986 } );
4987
33b8afdf 4988 return prog->check_substr ? prog->check_substr : prog->check_utf8;
cad2e5aa
JH
4989}
4990
2b69d0c2 4991void
864dbfa3 4992Perl_pregfree(pTHX_ struct regexp *r)
a687059c 4993{
9e55ce06
JH
4994#ifdef DEBUGGING
4995 SV *dsv = PERL_DEBUG_PAD_ZERO(0);
4996#endif
7821416a
IZ
4997
4998 if (!r || (--r->refcnt > 0))
4999 return;
9e55ce06 5000 DEBUG_r({
d103360b
HS
5001 int len;
5002 char *s;
5003
5004 s = (r->reganch & ROPT_UTF8) ? pv_uni_display(dsv, (U8*)r->precomp,
5005 r->prelen, 60, UNI_DISPLAY_REGEX)
9f369894 5006 : pv_display(dsv, r->precomp, r->prelen, 0, 60);
d103360b 5007 len = SvCUR(dsv);
9e55ce06
JH
5008 if (!PL_colorset)
5009 reginitcolors();
5010 PerlIO_printf(Perl_debug_log,
5011 "%sFreeing REx:%s `%s%*.*s%s%s'\n",
5012 PL_colors[4],PL_colors[5],PL_colors[0],
5013 len, len, s,
5014 PL_colors[1],
5015 len > 60 ? "..." : "");
5016 });
cad2e5aa 5017
c277df42 5018 if (r->precomp)
a0d0e21e 5019 Safefree(r->precomp);
fac92740
MJD
5020 if (r->offsets) /* 20010421 MJD */
5021 Safefree(r->offsets);
ed252734
NC
5022 RX_MATCH_COPY_FREE(r);
5023#ifdef PERL_COPY_ON_WRITE
5024 if (r->saved_copy)
5025 SvREFCNT_dec(r->saved_copy);
5026#endif
a193d654
GS
5027 if (r->substrs) {
5028 if (r->anchored_substr)
5029 SvREFCNT_dec(r->anchored_substr);
33b8afdf
JH
5030 if (r->anchored_utf8)
5031 SvREFCNT_dec(r->anchored_utf8);
a193d654
GS
5032 if (r->float_substr)
5033 SvREFCNT_dec(r->float_substr);
33b8afdf
JH
5034 if (r->float_utf8)
5035 SvREFCNT_dec(r->float_utf8);
2779dcf1 5036 Safefree(r->substrs);
a193d654 5037 }
c277df42
IZ
5038 if (r->data) {
5039 int n = r->data->count;
f3548bdc
DM
5040 PAD* new_comppad = NULL;
5041 PAD* old_comppad;
4026c95a 5042 PADOFFSET refcnt;
dfad63ad 5043
c277df42 5044 while (--n >= 0) {
261faec3 5045 /* If you add a ->what type here, update the comment in regcomp.h */
c277df42
IZ
5046 switch (r->data->what[n]) {
5047 case 's':
5048 SvREFCNT_dec((SV*)r->data->data[n]);
5049 break;
653099ff
GS
5050 case 'f':
5051 Safefree(r->data->data[n]);
5052 break;
dfad63ad
HS
5053 case 'p':
5054 new_comppad = (AV*)r->data->data[n];
5055 break;
c277df42 5056 case 'o':
dfad63ad 5057 if (new_comppad == NULL)
cea2e8a9 5058 Perl_croak(aTHX_ "panic: pregfree comppad");
f3548bdc
DM
5059 PAD_SAVE_LOCAL(old_comppad,
5060 /* Watch out for global destruction's random ordering. */
5061 (SvTYPE(new_comppad) == SVt_PVAV) ?
5062 new_comppad : Null(PAD *)
5063 );
b34c0dd4 5064 OP_REFCNT_LOCK;
4026c95a
SH
5065 refcnt = OpREFCNT_dec((OP_4tree*)r->data->data[n]);
5066 OP_REFCNT_UNLOCK;
5067 if (!refcnt)
9b978d73 5068 op_free((OP_4tree*)r->data->data[n]);
9b978d73 5069
f3548bdc 5070 PAD_RESTORE_LOCAL(old_comppad);
dfad63ad
HS
5071 SvREFCNT_dec((SV*)new_comppad);
5072 new_comppad = NULL;
c277df42
IZ
5073 break;
5074 case 'n':
9e55ce06 5075 break;
c277df42 5076 default:
830247a4 5077 Perl_croak(aTHX_ "panic: regfree data code '%c'", r->data->what[n]);
c277df42
IZ
5078 }
5079 }
5080 Safefree(r->data->what);
5081 Safefree(r->data);
a0d0e21e
LW
5082 }
5083 Safefree(r->startp);
5084 Safefree(r->endp);
5085 Safefree(r);
a687059c 5086}
c277df42
IZ
5087
5088/*
5089 - regnext - dig the "next" pointer out of a node
5090 *
5091 * [Note, when REGALIGN is defined there are two places in regmatch()
5092 * that bypass this code for speed.]
5093 */
5094regnode *
864dbfa3 5095Perl_regnext(pTHX_ register regnode *p)
c277df42
IZ
5096{
5097 register I32 offset;
5098
3280af22 5099 if (p == &PL_regdummy)
c277df42
IZ
5100 return(NULL);
5101
5102 offset = (reg_off_by_arg[OP(p)] ? ARG(p) : NEXT_OFF(p));
5103 if (offset == 0)
5104 return(NULL);
5105
c277df42 5106 return(p+offset);
c277df42
IZ
5107}
5108
01f988be 5109STATIC void
cea2e8a9 5110S_re_croak2(pTHX_ const char* pat1,const char* pat2,...)
c277df42
IZ
5111{
5112 va_list args;
5113 STRLEN l1 = strlen(pat1);
5114 STRLEN l2 = strlen(pat2);
5115 char buf[512];
06bf62c7 5116 SV *msv;
c277df42
IZ
5117 char *message;
5118
5119 if (l1 > 510)
5120 l1 = 510;
5121 if (l1 + l2 > 510)
5122 l2 = 510 - l1;
5123 Copy(pat1, buf, l1 , char);
5124 Copy(pat2, buf + l1, l2 , char);
3b818b81
GS
5125 buf[l1 + l2] = '\n';
5126 buf[l1 + l2 + 1] = '\0';
8736538c
AS
5127#ifdef I_STDARG
5128 /* ANSI variant takes additional second argument */
c277df42 5129 va_start(args, pat2);
8736538c
AS
5130#else
5131 va_start(args);
5132#endif
5a844595 5133 msv = vmess(buf, &args);
c277df42 5134 va_end(args);
06bf62c7 5135 message = SvPV(msv,l1);
c277df42
IZ
5136 if (l1 > 512)
5137 l1 = 512;
5138 Copy(message, buf, l1 , char);
197cf9b9 5139 buf[l1-1] = '\0'; /* Overwrite \n */
cea2e8a9 5140 Perl_croak(aTHX_ "%s", buf);
c277df42 5141}
a0ed51b3
LW
5142
5143/* XXX Here's a total kludge. But we need to re-enter for swash routines. */
5144
5145void
864dbfa3 5146Perl_save_re_context(pTHX)
b81d288d 5147{
830247a4 5148 SAVEI32(PL_reg_flags); /* from regexec.c */
a0ed51b3 5149 SAVEPPTR(PL_bostr);
a0ed51b3
LW
5150 SAVEPPTR(PL_reginput); /* String-input pointer. */
5151 SAVEPPTR(PL_regbol); /* Beginning of input, for ^ check. */
5152 SAVEPPTR(PL_regeol); /* End of input, for $ check. */
7766f137
GS
5153 SAVEVPTR(PL_regstartp); /* Pointer to startp array. */
5154 SAVEVPTR(PL_regendp); /* Ditto for endp. */
5155 SAVEVPTR(PL_reglastparen); /* Similarly for lastparen. */
a5db57d6 5156 SAVEVPTR(PL_reglastcloseparen); /* Similarly for lastcloseparen. */
a0ed51b3 5157 SAVEPPTR(PL_regtill); /* How far we are required to go. */
b81d288d 5158 SAVEGENERICPV(PL_reg_start_tmp); /* from regexec.c */
a0ed51b3 5159 PL_reg_start_tmp = 0;
a0ed51b3
LW
5160 SAVEI32(PL_reg_start_tmpl); /* from regexec.c */
5161 PL_reg_start_tmpl = 0;
7766f137 5162 SAVEVPTR(PL_regdata);
a0ed51b3
LW
5163 SAVEI32(PL_reg_eval_set); /* from regexec.c */
5164 SAVEI32(PL_regnarrate); /* from regexec.c */
7766f137 5165 SAVEVPTR(PL_regprogram); /* from regexec.c */
a0ed51b3 5166 SAVEINT(PL_regindent); /* from regexec.c */
7766f137
GS
5167 SAVEVPTR(PL_regcc); /* from regexec.c */
5168 SAVEVPTR(PL_curcop);
7766f137
GS
5169 SAVEVPTR(PL_reg_call_cc); /* from regexec.c */
5170 SAVEVPTR(PL_reg_re); /* from regexec.c */
54b6e2fa
IZ
5171 SAVEPPTR(PL_reg_ganch); /* from regexec.c */
5172 SAVESPTR(PL_reg_sv); /* from regexec.c */
9febdf04 5173 SAVEBOOL(PL_reg_match_utf8); /* from regexec.c */
7766f137 5174 SAVEVPTR(PL_reg_magic); /* from regexec.c */
54b6e2fa 5175 SAVEI32(PL_reg_oldpos); /* from regexec.c */
7766f137
GS
5176 SAVEVPTR(PL_reg_oldcurpm); /* from regexec.c */
5177 SAVEVPTR(PL_reg_curpm); /* from regexec.c */
a5db57d6
GS
5178 SAVEPPTR(PL_reg_oldsaved); /* old saved substr during match */
5179 PL_reg_oldsaved = Nullch;
5180 SAVEI32(PL_reg_oldsavedlen); /* old length of saved substr during match */
5181 PL_reg_oldsavedlen = 0;
ed252734
NC
5182#ifdef PERL_COPY_ON_WRITE
5183 SAVESPTR(PL_nrs);
5184 PL_nrs = Nullsv;
5185#endif
a5db57d6
GS
5186 SAVEI32(PL_reg_maxiter); /* max wait until caching pos */
5187 PL_reg_maxiter = 0;
5188 SAVEI32(PL_reg_leftiter); /* wait until caching pos */
5189 PL_reg_leftiter = 0;
5190 SAVEGENERICPV(PL_reg_poscache); /* cache of pos of WHILEM */
5191 PL_reg_poscache = Nullch;
5192 SAVEI32(PL_reg_poscache_size); /* size of pos cache of WHILEM */
5193 PL_reg_poscache_size = 0;
5194 SAVEPPTR(PL_regprecomp); /* uncompiled string. */
5fb7366e 5195 SAVEI32(PL_regnpar); /* () count. */
e49a9654 5196 SAVEI32(PL_regsize); /* from regexec.c */
ada6e8a9
AMS
5197
5198 {
5199 /* Save $1..$n (#18107: UTF-8 s/(\w+)/uc($1)/e); AMS 20021106. */
a8514157 5200 U32 i;
ada6e8a9
AMS
5201 GV *mgv;
5202 REGEXP *rx;
cf28e18a 5203 char digits[TYPE_CHARS(long)];
ada6e8a9
AMS
5204
5205 if (PL_curpm && (rx = PM_GETRE(PL_curpm))) {
5206 for (i = 1; i <= rx->nparens; i++) {
d994d9a1 5207 sprintf(digits, "%lu", (long)i);
ada6e8a9
AMS
5208 if ((mgv = gv_fetchpv(digits, FALSE, SVt_PV)))
5209 save_scalar(mgv);
5210 }
5211 }
5212 }
5213
54b6e2fa 5214#ifdef DEBUGGING
b81d288d 5215 SAVEPPTR(PL_reg_starttry); /* from regexec.c */
54b6e2fa 5216#endif
a0ed51b3 5217}
51371543 5218
51371543 5219static void
acfe0abc 5220clear_re(pTHX_ void *r)
51371543
GS
5221{
5222 ReREFCNT_dec((regexp *)r);
5223}
ffbc6a93 5224
241d1a3b
NC
5225/*
5226 * Local variables:
5227 * c-indentation-style: bsd
5228 * c-basic-offset: 4
5229 * indent-tabs-mode: t
5230 * End:
5231 *
c49a72a9 5232 * vim: shiftwidth=4:
241d1a3b 5233*/