This is a live mirror of the Perl 5 development currently hosted at https://github.com/perl/perl5
Integrate:
[perl5.git] / regcomp.c
CommitLineData
a0d0e21e
LW
1/* regcomp.c
2 */
3
4/*
5 * "A fair jaw-cracker dwarf-language must be." --Samwise Gamgee
6 */
7
40d34c0d
SB
8/* This file contains functions for compiling a regular expression. See
9 * also regexec.c which funnily enough, contains functions for executing
10 * a regular expression.
bb3580c7
NC
11 *
12 * This file is also copied at build time to ext/re/re_comp.c, where
13 * it's built with -DPERL_EXT_RE_BUILD -DPERL_EXT_RE_DEBUG -DPERL_EXT.
14 * This causes the main functions to be compiled under new names and with
15 * debugging support added, which makes "use re 'debug'" work.
40d34c0d
SB
16 */
17
a687059c
LW
18/* NOTE: this is derived from Henry Spencer's regexp code, and should not
19 * confused with the original package (see point 3 below). Thanks, Henry!
20 */
21
22/* Additional note: this code is very heavily munged from Henry's version
23 * in places. In some spots I've traded clarity for efficiency, so don't
24 * blame Henry for some of the lack of readability.
25 */
26
e50aee73
AD
27/* The names of the functions have been changed from regcomp and
28 * regexec to pregcomp and pregexec in order to avoid conflicts
29 * with the POSIX routines of the same names.
30*/
31
b9d5759e
AD
32#ifdef PERL_EXT_RE_BUILD
33/* need to replace pregcomp et al, so enable that */
34# ifndef PERL_IN_XSUB_RE
35# define PERL_IN_XSUB_RE
36# endif
37/* need access to debugger hooks */
cad2e5aa 38# if defined(PERL_EXT_RE_DEBUG) && !defined(DEBUGGING)
b9d5759e
AD
39# define DEBUGGING
40# endif
41#endif
42
43#ifdef PERL_IN_XSUB_RE
d06ea78c 44/* We *really* need to overwrite these symbols: */
56953603
IZ
45# define Perl_pregcomp my_regcomp
46# define Perl_regdump my_regdump
47# define Perl_regprop my_regprop
d06ea78c 48# define Perl_pregfree my_regfree
cad2e5aa
JH
49# define Perl_re_intuit_string my_re_intuit_string
50/* *These* symbols are masked to allow static link. */
d06ea78c 51# define Perl_regnext my_regnext
f0b8d043 52# define Perl_save_re_context my_save_re_context
b81d288d 53# define Perl_reginitcolors my_reginitcolors
c5be433b
GS
54
55# define PERL_NO_GET_CONTEXT
b81d288d 56#endif
56953603 57
a687059c 58/*
e50aee73 59 * pregcomp and pregexec -- regsub and regerror are not used in perl
a687059c
LW
60 *
61 * Copyright (c) 1986 by University of Toronto.
62 * Written by Henry Spencer. Not derived from licensed software.
63 *
64 * Permission is granted to anyone to use this software for any
65 * purpose on any computer system, and to redistribute it freely,
66 * subject to the following restrictions:
67 *
68 * 1. The author is not responsible for the consequences of use of
69 * this software, no matter how awful, even if they arise
70 * from defects in it.
71 *
72 * 2. The origin of this software must not be misrepresented, either
73 * by explicit claim or by omission.
74 *
75 * 3. Altered versions must be plainly marked as such, and must not
76 * be misrepresented as being the original software.
77 *
78 *
79 **** Alterations to Henry's code are...
80 ****
e6906430 81 **** Copyright (C) 1991, 1992, 1993, 1994, 1995, 1996, 1997, 1998, 1999,
583439ab 82 **** 2000, 2001, 2002, 2003, 2004, 2005, by Larry Wall and others
a687059c 83 ****
9ef589d8
LW
84 **** You may distribute under the terms of either the GNU General Public
85 **** License or the Artistic License, as specified in the README file.
86
a687059c
LW
87 *
88 * Beware that some of this code is subtly aware of the way operator
89 * precedence is structured in regular expressions. Serious changes in
90 * regular-expression syntax might require a total rethink.
91 */
92#include "EXTERN.h"
864dbfa3 93#define PERL_IN_REGCOMP_C
a687059c 94#include "perl.h"
d06ea78c 95
acfe0abc 96#ifndef PERL_IN_XSUB_RE
d06ea78c
GS
97# include "INTERN.h"
98#endif
c277df42
IZ
99
100#define REG_COMP_C
a687059c
LW
101#include "regcomp.h"
102
d4cce5f1 103#ifdef op
11343788 104#undef op
d4cce5f1 105#endif /* op */
11343788 106
fe14fcc3 107#ifdef MSDOS
56b65a99 108# if defined(BUGGY_MSC6)
fe14fcc3 109 /* MSC 6.00A breaks on op/regexp.t test 85 unless we turn this off */
56b65a99 110# pragma optimize("a",off)
fe14fcc3 111 /* But MSC 6.00A is happy with 'w', for aliases only across function calls*/
56b65a99
AT
112# pragma optimize("w",on )
113# endif /* BUGGY_MSC6 */
fe14fcc3
LW
114#endif /* MSDOS */
115
a687059c
LW
116#ifndef STATIC
117#define STATIC static
118#endif
119
830247a4 120typedef struct RExC_state_t {
e2509266 121 U32 flags; /* are we folding, multilining? */
830247a4
IZ
122 char *precomp; /* uncompiled string. */
123 regexp *rx;
fac92740 124 char *start; /* Start of input for compile */
830247a4
IZ
125 char *end; /* End of input for compile */
126 char *parse; /* Input-scan pointer. */
127 I32 whilem_seen; /* number of WHILEM in this expr */
fac92740 128 regnode *emit_start; /* Start of emitted-code area */
ffc61ed2 129 regnode *emit; /* Code-emit pointer; &regdummy = don't = compiling */
830247a4
IZ
130 I32 naughty; /* How bad is this pattern? */
131 I32 sawback; /* Did we see \1, ...? */
132 U32 seen;
133 I32 size; /* Code size. */
134 I32 npar; /* () count. */
135 I32 extralen;
136 I32 seen_zerolen;
137 I32 seen_evals;
1aa99e6b 138 I32 utf8;
830247a4
IZ
139#if ADD_TO_REGEXEC
140 char *starttry; /* -Dr: where regtry was called. */
141#define RExC_starttry (pRExC_state->starttry)
142#endif
143} RExC_state_t;
144
e2509266 145#define RExC_flags (pRExC_state->flags)
830247a4
IZ
146#define RExC_precomp (pRExC_state->precomp)
147#define RExC_rx (pRExC_state->rx)
fac92740 148#define RExC_start (pRExC_state->start)
830247a4
IZ
149#define RExC_end (pRExC_state->end)
150#define RExC_parse (pRExC_state->parse)
151#define RExC_whilem_seen (pRExC_state->whilem_seen)
fac92740 152#define RExC_offsets (pRExC_state->rx->offsets) /* I am not like the others */
830247a4 153#define RExC_emit (pRExC_state->emit)
fac92740 154#define RExC_emit_start (pRExC_state->emit_start)
830247a4
IZ
155#define RExC_naughty (pRExC_state->naughty)
156#define RExC_sawback (pRExC_state->sawback)
157#define RExC_seen (pRExC_state->seen)
158#define RExC_size (pRExC_state->size)
159#define RExC_npar (pRExC_state->npar)
160#define RExC_extralen (pRExC_state->extralen)
161#define RExC_seen_zerolen (pRExC_state->seen_zerolen)
162#define RExC_seen_evals (pRExC_state->seen_evals)
1aa99e6b 163#define RExC_utf8 (pRExC_state->utf8)
830247a4 164
a687059c
LW
165#define ISMULT1(c) ((c) == '*' || (c) == '+' || (c) == '?')
166#define ISMULT2(s) ((*s) == '*' || (*s) == '+' || (*s) == '?' || \
167 ((*s) == '{' && regcurly(s)))
a687059c 168
35c8bce7
LW
169#ifdef SPSTART
170#undef SPSTART /* dratted cpp namespace... */
171#endif
a687059c
LW
172/*
173 * Flags to be passed up and down.
174 */
a687059c 175#define WORST 0 /* Worst case. */
821b33a5 176#define HASWIDTH 0x1 /* Known to match non-null strings. */
a0d0e21e
LW
177#define SIMPLE 0x2 /* Simple enough to be STAR/PLUS operand. */
178#define SPSTART 0x4 /* Starts with * or +. */
179#define TRYAGAIN 0x8 /* Weeded out a declaration. */
a687059c 180
2c2d71f5
JH
181/* Length of a variant. */
182
183typedef struct scan_data_t {
184 I32 len_min;
185 I32 len_delta;
186 I32 pos_min;
187 I32 pos_delta;
188 SV *last_found;
189 I32 last_end; /* min value, <0 unless valid. */
190 I32 last_start_min;
191 I32 last_start_max;
192 SV **longest; /* Either &l_fixed, or &l_float. */
193 SV *longest_fixed;
194 I32 offset_fixed;
195 SV *longest_float;
196 I32 offset_float_min;
197 I32 offset_float_max;
198 I32 flags;
199 I32 whilem_c;
cb434fcc 200 I32 *last_closep;
653099ff 201 struct regnode_charclass_class *start_class;
2c2d71f5
JH
202} scan_data_t;
203
a687059c 204/*
e50aee73 205 * Forward declarations for pregcomp()'s friends.
a687059c 206 */
a0d0e21e 207
fe20fd30
JH
208static const scan_data_t zero_scan_data =
209 { 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0};
c277df42
IZ
210
211#define SF_BEFORE_EOL (SF_BEFORE_SEOL|SF_BEFORE_MEOL)
212#define SF_BEFORE_SEOL 0x1
213#define SF_BEFORE_MEOL 0x2
214#define SF_FIX_BEFORE_EOL (SF_FIX_BEFORE_SEOL|SF_FIX_BEFORE_MEOL)
215#define SF_FL_BEFORE_EOL (SF_FL_BEFORE_SEOL|SF_FL_BEFORE_MEOL)
216
09b7f37c
CB
217#ifdef NO_UNARY_PLUS
218# define SF_FIX_SHIFT_EOL (0+2)
219# define SF_FL_SHIFT_EOL (0+4)
220#else
221# define SF_FIX_SHIFT_EOL (+2)
222# define SF_FL_SHIFT_EOL (+4)
223#endif
c277df42
IZ
224
225#define SF_FIX_BEFORE_SEOL (SF_BEFORE_SEOL << SF_FIX_SHIFT_EOL)
226#define SF_FIX_BEFORE_MEOL (SF_BEFORE_MEOL << SF_FIX_SHIFT_EOL)
227
228#define SF_FL_BEFORE_SEOL (SF_BEFORE_SEOL << SF_FL_SHIFT_EOL)
229#define SF_FL_BEFORE_MEOL (SF_BEFORE_MEOL << SF_FL_SHIFT_EOL) /* 0x20 */
230#define SF_IS_INF 0x40
231#define SF_HAS_PAR 0x80
232#define SF_IN_PAR 0x100
233#define SF_HAS_EVAL 0x200
4bfe0158 234#define SCF_DO_SUBSTR 0x400
653099ff
GS
235#define SCF_DO_STCLASS_AND 0x0800
236#define SCF_DO_STCLASS_OR 0x1000
237#define SCF_DO_STCLASS (SCF_DO_STCLASS_AND|SCF_DO_STCLASS_OR)
e1901655 238#define SCF_WHILEM_VISITED_POS 0x2000
c277df42 239
eb160463 240#define UTF (RExC_utf8 != 0)
e2509266
JH
241#define LOC ((RExC_flags & PMf_LOCALE) != 0)
242#define FOLD ((RExC_flags & PMf_FOLD) != 0)
a0ed51b3 243
ffc61ed2 244#define OOB_UNICODE 12345678
93733859 245#define OOB_NAMEDCLASS -1
b8c5462f 246
a0ed51b3
LW
247#define CHR_SVLEN(sv) (UTF ? sv_len_utf8(sv) : SvCUR(sv))
248#define CHR_DIST(a,b) (UTF ? utf8_distance(a,b) : a - b)
249
8615cb43 250
b45f050a
JF
251/* length of regex to show in messages that don't mark a position within */
252#define RegexLengthToShowInErrorMessages 127
253
254/*
255 * If MARKER[12] are adjusted, be sure to adjust the constants at the top
256 * of t/op/regmesg.t, the tests in t/op/re_tests, and those in
257 * op/pragma/warn/regcomp.
258 */
7253e4e3
RK
259#define MARKER1 "<-- HERE" /* marker as it appears in the description */
260#define MARKER2 " <-- HERE " /* marker as it appears within the regex */
b81d288d 261
7253e4e3 262#define REPORT_LOCATION " in regex; marked by " MARKER1 " in m/%.*s" MARKER2 "%s/"
b45f050a
JF
263
264/*
265 * Calls SAVEDESTRUCTOR_X if needed, then calls Perl_croak with the given
266 * arg. Show regex, up to a maximum length. If it's too long, chop and add
267 * "...".
268 */
43af864e 269#define FAIL(msg) STMT_START { \
a00f3e00 270 const char *ellipses = ""; \
43af864e
JH
271 IV len = RExC_end - RExC_precomp; \
272 \
273 if (!SIZE_ONLY) \
274 SAVEDESTRUCTOR_X(clear_re,(void*)RExC_rx); \
275 if (len > RegexLengthToShowInErrorMessages) { \
276 /* chop 10 shorter than the max, to ensure meaning of "..." */ \
277 len = RegexLengthToShowInErrorMessages - 10; \
278 ellipses = "..."; \
279 } \
280 Perl_croak(aTHX_ "%s in regex m/%.*s%s/", \
281 msg, (int)len, RExC_precomp, ellipses); \
282} STMT_END
8615cb43 283
b45f050a
JF
284/*
285 * Calls SAVEDESTRUCTOR_X if needed, then calls Perl_croak with the given
286 * args. Show regex, up to a maximum length. If it's too long, chop and add
287 * "...".
288 */
43af864e 289#define FAIL2(pat,msg) STMT_START { \
a00f3e00 290 const char *ellipses = ""; \
43af864e
JH
291 IV len = RExC_end - RExC_precomp; \
292 \
293 if (!SIZE_ONLY) \
294 SAVEDESTRUCTOR_X(clear_re,(void*)RExC_rx); \
295 if (len > RegexLengthToShowInErrorMessages) { \
296 /* chop 10 shorter than the max, to ensure meaning of "..." */ \
297 len = RegexLengthToShowInErrorMessages - 10; \
298 ellipses = "..."; \
299 } \
300 S_re_croak2(aTHX_ pat, " in regex m/%.*s%s/", \
301 msg, (int)len, RExC_precomp, ellipses); \
302} STMT_END
b45f050a
JF
303
304
305/*
306 * Simple_vFAIL -- like FAIL, but marks the current location in the scan
307 */
43af864e 308#define Simple_vFAIL(m) STMT_START { \
e4057cfc 309 const IV offset = RExC_parse - RExC_precomp; \
43af864e
JH
310 Perl_croak(aTHX_ "%s" REPORT_LOCATION, \
311 m, (int)offset, RExC_precomp, RExC_precomp + offset); \
312} STMT_END
b45f050a
JF
313
314/*
315 * Calls SAVEDESTRUCTOR_X if needed, then Simple_vFAIL()
316 */
43af864e
JH
317#define vFAIL(m) STMT_START { \
318 if (!SIZE_ONLY) \
319 SAVEDESTRUCTOR_X(clear_re,(void*)RExC_rx); \
320 Simple_vFAIL(m); \
321} STMT_END
b45f050a
JF
322
323/*
324 * Like Simple_vFAIL(), but accepts two arguments.
325 */
43af864e 326#define Simple_vFAIL2(m,a1) STMT_START { \
e4057cfc 327 const IV offset = RExC_parse - RExC_precomp; \
43af864e
JH
328 S_re_croak2(aTHX_ m, REPORT_LOCATION, a1, \
329 (int)offset, RExC_precomp, RExC_precomp + offset); \
330} STMT_END
b45f050a
JF
331
332/*
333 * Calls SAVEDESTRUCTOR_X if needed, then Simple_vFAIL2().
334 */
43af864e
JH
335#define vFAIL2(m,a1) STMT_START { \
336 if (!SIZE_ONLY) \
337 SAVEDESTRUCTOR_X(clear_re,(void*)RExC_rx); \
338 Simple_vFAIL2(m, a1); \
339} STMT_END
b45f050a
JF
340
341
342/*
343 * Like Simple_vFAIL(), but accepts three arguments.
344 */
43af864e 345#define Simple_vFAIL3(m, a1, a2) STMT_START { \
e4057cfc 346 const IV offset = RExC_parse - RExC_precomp; \
43af864e
JH
347 S_re_croak2(aTHX_ m, REPORT_LOCATION, a1, a2, \
348 (int)offset, RExC_precomp, RExC_precomp + offset); \
349} STMT_END
b45f050a
JF
350
351/*
352 * Calls SAVEDESTRUCTOR_X if needed, then Simple_vFAIL3().
353 */
43af864e
JH
354#define vFAIL3(m,a1,a2) STMT_START { \
355 if (!SIZE_ONLY) \
356 SAVEDESTRUCTOR_X(clear_re,(void*)RExC_rx); \
357 Simple_vFAIL3(m, a1, a2); \
358} STMT_END
b45f050a
JF
359
360/*
361 * Like Simple_vFAIL(), but accepts four arguments.
362 */
43af864e 363#define Simple_vFAIL4(m, a1, a2, a3) STMT_START { \
e4057cfc 364 const IV offset = RExC_parse - RExC_precomp; \
43af864e
JH
365 S_re_croak2(aTHX_ m, REPORT_LOCATION, a1, a2, a3, \
366 (int)offset, RExC_precomp, RExC_precomp + offset); \
367} STMT_END
b45f050a 368
43af864e 369#define vWARN(loc,m) STMT_START { \
e4057cfc 370 const IV offset = loc - RExC_precomp; \
43af864e
JH
371 Perl_warner(aTHX_ packWARN(WARN_REGEXP), "%s" REPORT_LOCATION, \
372 m, (int)offset, RExC_precomp, RExC_precomp + offset); \
373} STMT_END
374
375#define vWARNdep(loc,m) STMT_START { \
e4057cfc 376 const IV offset = loc - RExC_precomp; \
43af864e
JH
377 Perl_warner(aTHX_ packWARN2(WARN_DEPRECATED, WARN_REGEXP), \
378 "%s" REPORT_LOCATION, \
379 m, (int)offset, RExC_precomp, RExC_precomp + offset); \
380} STMT_END
381
382
383#define vWARN2(loc, m, a1) STMT_START { \
e4057cfc 384 const IV offset = loc - RExC_precomp; \
43af864e
JH
385 Perl_warner(aTHX_ packWARN(WARN_REGEXP), m REPORT_LOCATION, \
386 a1, (int)offset, RExC_precomp, RExC_precomp + offset); \
387} STMT_END
388
389#define vWARN3(loc, m, a1, a2) STMT_START { \
e4057cfc 390 const IV offset = loc - RExC_precomp; \
43af864e
JH
391 Perl_warner(aTHX_ packWARN(WARN_REGEXP), m REPORT_LOCATION, \
392 a1, a2, (int)offset, RExC_precomp, RExC_precomp + offset); \
393} STMT_END
394
395#define vWARN4(loc, m, a1, a2, a3) STMT_START { \
e4057cfc 396 const IV offset = loc - RExC_precomp; \
43af864e
JH
397 Perl_warner(aTHX_ packWARN(WARN_REGEXP), m REPORT_LOCATION, \
398 a1, a2, a3, (int)offset, RExC_precomp, RExC_precomp + offset); \
399} STMT_END
400
401#define vWARN5(loc, m, a1, a2, a3, a4) STMT_START { \
e4057cfc 402 const IV offset = loc - RExC_precomp; \
43af864e
JH
403 Perl_warner(aTHX_ packWARN(WARN_REGEXP), m REPORT_LOCATION, \
404 a1, a2, a3, a4, (int)offset, RExC_precomp, RExC_precomp + offset); \
405} STMT_END
9d1d55b5 406
8615cb43 407
cd439c50 408/* Allow for side effects in s */
43af864e
JH
409#define REGC(c,s) STMT_START { \
410 if (!SIZE_ONLY) *(s) = (c); else (void)(s); \
411} STMT_END
cd439c50 412
fac92740
MJD
413/* Macros for recording node offsets. 20001227 mjd@plover.com
414 * Nodes are numbered 1, 2, 3, 4. Node #n's position is recorded in
415 * element 2*n-1 of the array. Element #2n holds the byte length node #n.
416 * Element 0 holds the number n.
417 */
418
419#define MJD_OFFSET_DEBUG(x)
43af864e
JH
420/* #define MJD_OFFSET_DEBUG(x) Perl_warn_nocontext x */
421
422
423#define Set_Node_Offset_To_R(node,byte) STMT_START { \
424 if (! SIZE_ONLY) { \
425 MJD_OFFSET_DEBUG(("** (%d) offset of node %d is %d.\n", \
426 __LINE__, (node), (byte))); \
427 if((node) < 0) { \
428 Perl_croak(aTHX_ "value of node is %d in Offset macro", node); \
429 } else { \
430 RExC_offsets[2*(node)-1] = (byte); \
431 } \
432 } \
433} STMT_END
434
435#define Set_Node_Offset(node,byte) \
436 Set_Node_Offset_To_R((node)-RExC_emit_start, (byte)-RExC_start)
437#define Set_Cur_Node_Offset Set_Node_Offset(RExC_emit, RExC_parse)
438
439#define Set_Node_Length_To_R(node,len) STMT_START { \
440 if (! SIZE_ONLY) { \
441 MJD_OFFSET_DEBUG(("** (%d) size of node %d is %d.\n", \
442 __LINE__, (node), (len))); \
443 if((node) < 0) { \
444 Perl_croak(aTHX_ "value of node is %d in Length macro", node); \
445 } else { \
446 RExC_offsets[2*(node)] = (len); \
447 } \
448 } \
449} STMT_END
450
451#define Set_Node_Length(node,len) \
452 Set_Node_Length_To_R((node)-RExC_emit_start, len)
453#define Set_Cur_Node_Length(len) Set_Node_Length(RExC_emit, len)
454#define Set_Node_Cur_Length(node) \
455 Set_Node_Length(node, RExC_parse - parse_start)
fac92740
MJD
456
457/* Get offsets and lengths */
458#define Node_Offset(n) (RExC_offsets[2*((n)-RExC_emit_start)-1])
459#define Node_Length(n) (RExC_offsets[2*((n)-RExC_emit_start)])
460
acfe0abc 461static void clear_re(pTHX_ void *r);
4327152a 462
653099ff
GS
463/* Mark that we cannot extend a found fixed substring at this point.
464 Updata the longest found anchored substring and the longest found
465 floating substrings if needed. */
466
4327152a 467STATIC void
830247a4 468S_scan_commit(pTHX_ RExC_state_t *pRExC_state, scan_data_t *data)
c277df42 469{
c05e0e2f
AL
470 const STRLEN l = CHR_SVLEN(data->last_found);
471 const STRLEN old_l = CHR_SVLEN(*data->longest);
b81d288d 472
c277df42 473 if ((l >= old_l) && ((l > old_l) || (data->flags & SF_BEFORE_EOL))) {
0dbcf736 474 SvSetMagicSV(*data->longest, data->last_found);
c277df42
IZ
475 if (*data->longest == data->longest_fixed) {
476 data->offset_fixed = l ? data->last_start_min : data->pos_min;
477 if (data->flags & SF_BEFORE_EOL)
b81d288d 478 data->flags
c277df42
IZ
479 |= ((data->flags & SF_BEFORE_EOL) << SF_FIX_SHIFT_EOL);
480 else
481 data->flags &= ~SF_FIX_BEFORE_EOL;
a0ed51b3
LW
482 }
483 else {
c277df42 484 data->offset_float_min = l ? data->last_start_min : data->pos_min;
b81d288d
AB
485 data->offset_float_max = (l
486 ? data->last_start_max
c277df42 487 : data->pos_min + data->pos_delta);
9051bda5
HS
488 if ((U32)data->offset_float_max > (U32)I32_MAX)
489 data->offset_float_max = I32_MAX;
c277df42 490 if (data->flags & SF_BEFORE_EOL)
b81d288d 491 data->flags
c277df42
IZ
492 |= ((data->flags & SF_BEFORE_EOL) << SF_FL_SHIFT_EOL);
493 else
494 data->flags &= ~SF_FL_BEFORE_EOL;
495 }
496 }
497 SvCUR_set(data->last_found, 0);
ec2e9529 498 {
e4057cfc
AL
499 SV * const sv = data->last_found;
500 MAGIC * const mg =
ec2e9529
JH
501 SvUTF8(sv) && SvMAGICAL(sv) ? mg_find(sv, PERL_MAGIC_utf8) : NULL;
502 if (mg && mg->mg_len > 0)
503 mg->mg_len = 0;
504 }
c277df42
IZ
505 data->last_end = -1;
506 data->flags &= ~SF_BEFORE_EOL;
507}
508
653099ff
GS
509/* Can match anything (initialization) */
510STATIC void
830247a4 511S_cl_anything(pTHX_ RExC_state_t *pRExC_state, struct regnode_charclass_class *cl)
653099ff 512{
653099ff 513 ANYOF_CLASS_ZERO(cl);
f8bef550 514 ANYOF_BITMAP_SETALL(cl);
1aa99e6b 515 cl->flags = ANYOF_EOS|ANYOF_UNICODE_ALL;
653099ff
GS
516 if (LOC)
517 cl->flags |= ANYOF_LOCALE;
518}
519
520/* Can match anything (initialization) */
521STATIC int
e4057cfc 522S_cl_is_anything(pTHX_ const struct regnode_charclass_class *cl)
653099ff
GS
523{
524 int value;
525
aaa51d5e 526 for (value = 0; value <= ANYOF_MAX; value += 2)
653099ff
GS
527 if (ANYOF_CLASS_TEST(cl, value) && ANYOF_CLASS_TEST(cl, value + 1))
528 return 1;
1aa99e6b
IH
529 if (!(cl->flags & ANYOF_UNICODE_ALL))
530 return 0;
f8bef550
NC
531 if (!ANYOF_BITMAP_TESTALLSET(cl))
532 return 0;
653099ff
GS
533 return 1;
534}
535
536/* Can match anything (initialization) */
537STATIC void
830247a4 538S_cl_init(pTHX_ RExC_state_t *pRExC_state, struct regnode_charclass_class *cl)
653099ff 539{
8ecf7187 540 Zero(cl, 1, struct regnode_charclass_class);
653099ff 541 cl->type = ANYOF;
830247a4 542 cl_anything(pRExC_state, cl);
653099ff
GS
543}
544
545STATIC void
830247a4 546S_cl_init_zero(pTHX_ RExC_state_t *pRExC_state, struct regnode_charclass_class *cl)
653099ff 547{
8ecf7187 548 Zero(cl, 1, struct regnode_charclass_class);
653099ff 549 cl->type = ANYOF;
830247a4 550 cl_anything(pRExC_state, cl);
653099ff
GS
551 if (LOC)
552 cl->flags |= ANYOF_LOCALE;
553}
554
555/* 'And' a given class with another one. Can create false positives */
556/* We assume that cl is not inverted */
557STATIC void
558S_cl_and(pTHX_ struct regnode_charclass_class *cl,
e4057cfc 559 const struct regnode_charclass_class *and_with)
653099ff 560{
653099ff
GS
561 if (!(and_with->flags & ANYOF_CLASS)
562 && !(cl->flags & ANYOF_CLASS)
563 && (and_with->flags & ANYOF_LOCALE) == (cl->flags & ANYOF_LOCALE)
564 && !(and_with->flags & ANYOF_FOLD)
565 && !(cl->flags & ANYOF_FOLD)) {
566 int i;
567
568 if (and_with->flags & ANYOF_INVERT)
569 for (i = 0; i < ANYOF_BITMAP_SIZE; i++)
570 cl->bitmap[i] &= ~and_with->bitmap[i];
571 else
572 for (i = 0; i < ANYOF_BITMAP_SIZE; i++)
573 cl->bitmap[i] &= and_with->bitmap[i];
574 } /* XXXX: logic is complicated otherwise, leave it along for a moment. */
575 if (!(and_with->flags & ANYOF_EOS))
576 cl->flags &= ~ANYOF_EOS;
1aa99e6b 577
73c86719
JH
578 if (cl->flags & ANYOF_UNICODE_ALL && and_with->flags & ANYOF_UNICODE &&
579 !(and_with->flags & ANYOF_INVERT)) {
1aa99e6b
IH
580 cl->flags &= ~ANYOF_UNICODE_ALL;
581 cl->flags |= ANYOF_UNICODE;
582 ARG_SET(cl, ARG(and_with));
583 }
73c86719
JH
584 if (!(and_with->flags & ANYOF_UNICODE_ALL) &&
585 !(and_with->flags & ANYOF_INVERT))
1aa99e6b 586 cl->flags &= ~ANYOF_UNICODE_ALL;
73c86719
JH
587 if (!(and_with->flags & (ANYOF_UNICODE|ANYOF_UNICODE_ALL)) &&
588 !(and_with->flags & ANYOF_INVERT))
1aa99e6b 589 cl->flags &= ~ANYOF_UNICODE;
653099ff
GS
590}
591
592/* 'OR' a given class with another one. Can create false positives */
593/* We assume that cl is not inverted */
594STATIC void
e4057cfc 595S_cl_or(pTHX_ RExC_state_t *pRExC_state, struct regnode_charclass_class *cl, const struct regnode_charclass_class *or_with)
653099ff 596{
653099ff
GS
597 if (or_with->flags & ANYOF_INVERT) {
598 /* We do not use
599 * (B1 | CL1) | (!B2 & !CL2) = (B1 | !B2 & !CL2) | (CL1 | (!B2 & !CL2))
600 * <= (B1 | !B2) | (CL1 | !CL2)
601 * which is wasteful if CL2 is small, but we ignore CL2:
602 * (B1 | CL1) | (!B2 & !CL2) <= (B1 | CL1) | !B2 = (B1 | !B2) | CL1
603 * XXXX Can we handle case-fold? Unclear:
604 * (OK1(i) | OK1(i')) | !(OK1(i) | OK1(i')) =
605 * (OK1(i) | OK1(i')) | (!OK1(i) & !OK1(i'))
606 */
607 if ( (or_with->flags & ANYOF_LOCALE) == (cl->flags & ANYOF_LOCALE)
608 && !(or_with->flags & ANYOF_FOLD)
609 && !(cl->flags & ANYOF_FOLD) ) {
610 int i;
611
612 for (i = 0; i < ANYOF_BITMAP_SIZE; i++)
613 cl->bitmap[i] |= ~or_with->bitmap[i];
614 } /* XXXX: logic is complicated otherwise */
615 else {
830247a4 616 cl_anything(pRExC_state, cl);
653099ff
GS
617 }
618 } else {
619 /* (B1 | CL1) | (B2 | CL2) = (B1 | B2) | (CL1 | CL2)) */
620 if ( (or_with->flags & ANYOF_LOCALE) == (cl->flags & ANYOF_LOCALE)
b81d288d 621 && (!(or_with->flags & ANYOF_FOLD)
653099ff
GS
622 || (cl->flags & ANYOF_FOLD)) ) {
623 int i;
624
625 /* OR char bitmap and class bitmap separately */
626 for (i = 0; i < ANYOF_BITMAP_SIZE; i++)
627 cl->bitmap[i] |= or_with->bitmap[i];
628 if (or_with->flags & ANYOF_CLASS) {
629 for (i = 0; i < ANYOF_CLASSBITMAP_SIZE; i++)
630 cl->classflags[i] |= or_with->classflags[i];
631 cl->flags |= ANYOF_CLASS;
632 }
633 }
634 else { /* XXXX: logic is complicated, leave it along for a moment. */
830247a4 635 cl_anything(pRExC_state, cl);
653099ff
GS
636 }
637 }
638 if (or_with->flags & ANYOF_EOS)
639 cl->flags |= ANYOF_EOS;
1aa99e6b
IH
640
641 if (cl->flags & ANYOF_UNICODE && or_with->flags & ANYOF_UNICODE &&
642 ARG(cl) != ARG(or_with)) {
643 cl->flags |= ANYOF_UNICODE_ALL;
644 cl->flags &= ~ANYOF_UNICODE;
645 }
646 if (or_with->flags & ANYOF_UNICODE_ALL) {
647 cl->flags |= ANYOF_UNICODE_ALL;
648 cl->flags &= ~ANYOF_UNICODE;
649 }
653099ff
GS
650}
651
5d1c421c
JH
652/*
653 * There are strange code-generation bugs caused on sparc64 by gcc-2.95.2.
654 * These need to be revisited when a newer toolchain becomes available.
655 */
656#if defined(__sparc64__) && defined(__GNUC__)
657# if __GNUC__ < 2 || (__GNUC__ == 2 && __GNUC_MINOR__ < 96)
658# undef SPARC64_GCC_WORKAROUND
659# define SPARC64_GCC_WORKAROUND 1
660# endif
661#endif
662
653099ff
GS
663/* REx optimizer. Converts nodes into quickier variants "in place".
664 Finds fixed substrings. */
665
5332c881 666/* Stops at toplevel WHILEM as well as at "last". At end *scanp is set
c277df42
IZ
667 to the position after last scanned or to NULL. */
668
76e3520e 669STATIC I32
830247a4 670S_study_chunk(pTHX_ RExC_state_t *pRExC_state, regnode **scanp, I32 *deltap, regnode *last, scan_data_t *data, U32 flags)
c277df42
IZ
671 /* scanp: Start here (read-write). */
672 /* deltap: Write maxlen-minlen here. */
673 /* last: Stop before this one. */
674{
675 I32 min = 0, pars = 0, code;
676 regnode *scan = *scanp, *next;
677 I32 delta = 0;
678 int is_inf = (flags & SCF_DO_SUBSTR) && (data->flags & SF_IS_INF);
aca2d497 679 int is_inf_internal = 0; /* The studied chunk is infinite */
c277df42
IZ
680 I32 is_par = OP(scan) == OPEN ? ARG(scan) : 0;
681 scan_data_t data_fake;
653099ff 682 struct regnode_charclass_class and_with; /* Valid if flags & SCF_DO_STCLASS_OR */
b81d288d 683
c277df42
IZ
684 while (scan && OP(scan) != END && scan < last) {
685 /* Peephole optimizer: */
686
22c35a8c 687 if (PL_regkind[(U8)OP(scan)] == EXACT) {
653099ff 688 /* Merge several consecutive EXACTish nodes into one. */
c277df42
IZ
689 regnode *n = regnext(scan);
690 U32 stringok = 1;
691#ifdef DEBUGGING
692 regnode *stop = scan;
b81d288d 693#endif
c277df42 694
cd439c50 695 next = scan + NODE_SZ_STR(scan);
c277df42
IZ
696 /* Skip NOTHING, merge EXACT*. */
697 while (n &&
b81d288d 698 ( PL_regkind[(U8)OP(n)] == NOTHING ||
c277df42
IZ
699 (stringok && (OP(n) == OP(scan))))
700 && NEXT_OFF(n)
701 && NEXT_OFF(scan) + NEXT_OFF(n) < I16_MAX) {
702 if (OP(n) == TAIL || n > next)
703 stringok = 0;
22c35a8c 704 if (PL_regkind[(U8)OP(n)] == NOTHING) {
c277df42
IZ
705 NEXT_OFF(scan) += NEXT_OFF(n);
706 next = n + NODE_STEP_REGNODE;
707#ifdef DEBUGGING
708 if (stringok)
709 stop = n;
b81d288d 710#endif
c277df42 711 n = regnext(n);
a0ed51b3 712 }
f49d4d0f 713 else if (stringok) {
8c18bf38 714 const int oldl = STR_LEN(scan);
c277df42 715 regnode *nnext = regnext(n);
f49d4d0f 716
b81d288d 717 if (oldl + STR_LEN(n) > U8_MAX)
c277df42
IZ
718 break;
719 NEXT_OFF(scan) += NEXT_OFF(n);
cd439c50
IZ
720 STR_LEN(scan) += STR_LEN(n);
721 next = n + NODE_SZ_STR(n);
c277df42 722 /* Now we can overwrite *n : */
f49d4d0f 723 Move(STRING(n), STRING(scan) + oldl, STR_LEN(n), char);
c277df42 724#ifdef DEBUGGING
f49d4d0f 725 stop = next - 1;
b81d288d 726#endif
c277df42
IZ
727 n = nnext;
728 }
729 }
61a36c01 730
d65e4eab 731 if (UTF && OP(scan) == EXACTF && STR_LEN(scan) >= 6) {
61a36c01
JH
732/*
733 Two problematic code points in Unicode casefolding of EXACT nodes:
734
735 U+0390 - GREEK SMALL LETTER IOTA WITH DIALYTIKA AND TONOS
736 U+03B0 - GREEK SMALL LETTER UPSILON WITH DIALYTIKA AND TONOS
737
738 which casefold to
739
740 Unicode UTF-8
741
742 U+03B9 U+0308 U+0301 0xCE 0xB9 0xCC 0x88 0xCC 0x81
743 U+03C5 U+0308 U+0301 0xCF 0x85 0xCC 0x88 0xCC 0x81
744
745 This means that in case-insensitive matching (or "loose matching",
746 as Unicode calls it), an EXACTF of length six (the UTF-8 encoded byte
747 length of the above casefolded versions) can match a target string
748 of length two (the byte length of UTF-8 encoded U+0390 or U+03B0).
749 This would rather mess up the minimum length computation.
750
751 What we'll do is to look for the tail four bytes, and then peek
752 at the preceding two bytes to see whether we need to decrease
753 the minimum length by four (six minus two).
754
755 Thanks to the design of UTF-8, there cannot be false matches:
756 A sequence of valid UTF-8 bytes cannot be a subsequence of
757 another valid sequence of UTF-8 bytes.
758
759*/
760 char *s0 = STRING(scan), *s, *t;
761 char *s1 = s0 + STR_LEN(scan) - 1, *s2 = s1 - 4;
e4057cfc
AL
762 const char * const t0 = "\xcc\x88\xcc\x81";
763 const char * const t1 = t0 + 3;
e503e849 764
61a36c01
JH
765 for (s = s0 + 2;
766 s < s2 && (t = ninstr(s, s1, t0, t1));
767 s = t + 4) {
768 if (((U8)t[-1] == 0xB9 && (U8)t[-2] == 0xCE) ||
769 ((U8)t[-1] == 0x85 && (U8)t[-2] == 0xCF))
770 min -= 4;
771 }
772 }
773
c277df42
IZ
774#ifdef DEBUGGING
775 /* Allow dumping */
cd439c50 776 n = scan + NODE_SZ_STR(scan);
c277df42 777 while (n <= stop) {
22c35a8c 778 if (PL_regkind[(U8)OP(n)] != NOTHING || OP(n) == NOTHING) {
c277df42
IZ
779 OP(n) = OPTIMIZED;
780 NEXT_OFF(n) = 0;
781 }
782 n++;
783 }
653099ff 784#endif
c277df42 785 }
653099ff
GS
786 /* Follow the next-chain of the current node and optimize
787 away all the NOTHINGs from it. */
c277df42 788 if (OP(scan) != CURLYX) {
8c18bf38 789 const int max = (reg_off_by_arg[OP(scan)]
048cfca1
GS
790 ? I32_MAX
791 /* I32 may be smaller than U16 on CRAYs! */
792 : (I32_MAX < U16_MAX ? I32_MAX : U16_MAX));
c277df42
IZ
793 int off = (reg_off_by_arg[OP(scan)] ? ARG(scan) : NEXT_OFF(scan));
794 int noff;
795 regnode *n = scan;
b81d288d 796
c277df42
IZ
797 /* Skip NOTHING and LONGJMP. */
798 while ((n = regnext(n))
22c35a8c 799 && ((PL_regkind[(U8)OP(n)] == NOTHING && (noff = NEXT_OFF(n)))
c277df42
IZ
800 || ((OP(n) == LONGJMP) && (noff = ARG(n))))
801 && off + noff < max)
802 off += noff;
803 if (reg_off_by_arg[OP(scan)])
804 ARG(scan) = off;
b81d288d 805 else
c277df42
IZ
806 NEXT_OFF(scan) = off;
807 }
653099ff
GS
808 /* The principal pseudo-switch. Cannot be a switch, since we
809 look into several different things. */
b81d288d 810 if (OP(scan) == BRANCH || OP(scan) == BRANCHJ
c277df42
IZ
811 || OP(scan) == IFTHEN || OP(scan) == SUSPEND) {
812 next = regnext(scan);
813 code = OP(scan);
b81d288d
AB
814
815 if (OP(next) == code || code == IFTHEN || code == SUSPEND) {
c277df42 816 I32 max1 = 0, min1 = I32_MAX, num = 0;
653099ff 817 struct regnode_charclass_class accum;
c277df42 818
653099ff 819 if (flags & SCF_DO_SUBSTR) /* XXXX Add !SUSPEND? */
830247a4 820 scan_commit(pRExC_state, data); /* Cannot merge strings after this. */
653099ff 821 if (flags & SCF_DO_STCLASS)
830247a4 822 cl_init_zero(pRExC_state, &accum);
c277df42 823 while (OP(scan) == code) {
830247a4 824 I32 deltanext, minnext, f = 0, fake;
653099ff 825 struct regnode_charclass_class this_class;
c277df42
IZ
826
827 num++;
828 data_fake.flags = 0;
b81d288d 829 if (data) {
2c2d71f5 830 data_fake.whilem_c = data->whilem_c;
cb434fcc
IZ
831 data_fake.last_closep = data->last_closep;
832 }
833 else
834 data_fake.last_closep = &fake;
c277df42
IZ
835 next = regnext(scan);
836 scan = NEXTOPER(scan);
837 if (code != BRANCH)
838 scan = NEXTOPER(scan);
653099ff 839 if (flags & SCF_DO_STCLASS) {
830247a4 840 cl_init(pRExC_state, &this_class);
653099ff
GS
841 data_fake.start_class = &this_class;
842 f = SCF_DO_STCLASS_AND;
b81d288d 843 }
e1901655
IZ
844 if (flags & SCF_WHILEM_VISITED_POS)
845 f |= SCF_WHILEM_VISITED_POS;
653099ff 846 /* we suppose the run is continuous, last=next...*/
830247a4
IZ
847 minnext = study_chunk(pRExC_state, &scan, &deltanext,
848 next, &data_fake, f);
b81d288d 849 if (min1 > minnext)
c277df42
IZ
850 min1 = minnext;
851 if (max1 < minnext + deltanext)
852 max1 = minnext + deltanext;
853 if (deltanext == I32_MAX)
aca2d497 854 is_inf = is_inf_internal = 1;
c277df42
IZ
855 scan = next;
856 if (data_fake.flags & (SF_HAS_PAR|SF_IN_PAR))
857 pars++;
405ff068 858 if (data && (data_fake.flags & SF_HAS_EVAL))
c277df42 859 data->flags |= SF_HAS_EVAL;
2c2d71f5
JH
860 if (data)
861 data->whilem_c = data_fake.whilem_c;
653099ff 862 if (flags & SCF_DO_STCLASS)
830247a4 863 cl_or(pRExC_state, &accum, &this_class);
b81d288d 864 if (code == SUSPEND)
c277df42
IZ
865 break;
866 }
867 if (code == IFTHEN && num < 2) /* Empty ELSE branch */
868 min1 = 0;
869 if (flags & SCF_DO_SUBSTR) {
870 data->pos_min += min1;
871 data->pos_delta += max1 - min1;
872 if (max1 != min1 || is_inf)
873 data->longest = &(data->longest_float);
874 }
875 min += min1;
876 delta += max1 - min1;
653099ff 877 if (flags & SCF_DO_STCLASS_OR) {
830247a4 878 cl_or(pRExC_state, data->start_class, &accum);
653099ff
GS
879 if (min1) {
880 cl_and(data->start_class, &and_with);
881 flags &= ~SCF_DO_STCLASS;
882 }
883 }
884 else if (flags & SCF_DO_STCLASS_AND) {
de0c8cb8
GS
885 if (min1) {
886 cl_and(data->start_class, &accum);
653099ff 887 flags &= ~SCF_DO_STCLASS;
de0c8cb8
GS
888 }
889 else {
b81d288d 890 /* Switch to OR mode: cache the old value of
de0c8cb8
GS
891 * data->start_class */
892 StructCopy(data->start_class, &and_with,
893 struct regnode_charclass_class);
894 flags &= ~SCF_DO_STCLASS_AND;
895 StructCopy(&accum, data->start_class,
896 struct regnode_charclass_class);
897 flags |= SCF_DO_STCLASS_OR;
898 data->start_class->flags |= ANYOF_EOS;
899 }
653099ff 900 }
c9dc1ff4 901
a0ed51b3
LW
902 }
903 else if (code == BRANCHJ) /* single branch is optimized. */
c277df42
IZ
904 scan = NEXTOPER(NEXTOPER(scan));
905 else /* single branch is optimized. */
906 scan = NEXTOPER(scan);
907 continue;
a0ed51b3
LW
908 }
909 else if (OP(scan) == EXACT) {
cd439c50 910 I32 l = STR_LEN(scan);
1aa99e6b 911 UV uc = *((U8*)STRING(scan));
a0ed51b3 912 if (UTF) {
8c18bf38 913 const U8 * const s = (U8*)STRING(scan);
47a91a97
NC
914 l = utf8_length((U8 *)s, (U8 *)s + l);
915 uc = utf8_to_uvchr((U8 *)s, NULL);
a0ed51b3
LW
916 }
917 min += l;
c277df42 918 if (flags & SCF_DO_SUBSTR) { /* Update longest substr. */
c277df42
IZ
919 /* The code below prefers earlier match for fixed
920 offset, later match for variable offset. */
921 if (data->last_end == -1) { /* Update the start info. */
922 data->last_start_min = data->pos_min;
923 data->last_start_max = is_inf
b81d288d 924 ? I32_MAX : data->pos_min + data->pos_delta;
c277df42 925 }
cd439c50 926 sv_catpvn(data->last_found, STRING(scan), STR_LEN(scan));
ec2e9529 927 {
e7b555a6 928 SV * const sv = data->last_found;
e4057cfc 929 MAGIC * const mg = SvUTF8(sv) && SvMAGICAL(sv) ?
ec2e9529
JH
930 mg_find(sv, PERL_MAGIC_utf8) : NULL;
931 if (mg && mg->mg_len >= 0)
3fe130af
JH
932 mg->mg_len += utf8_length((U8*)STRING(scan),
933 (U8*)STRING(scan)+STR_LEN(scan));
ec2e9529 934 }
33b8afdf
JH
935 if (UTF)
936 SvUTF8_on(data->last_found);
c277df42
IZ
937 data->last_end = data->pos_min + l;
938 data->pos_min += l; /* As in the first entry. */
939 data->flags &= ~SF_BEFORE_EOL;
940 }
653099ff
GS
941 if (flags & SCF_DO_STCLASS_AND) {
942 /* Check whether it is compatible with what we know already! */
943 int compat = 1;
944
1aa99e6b 945 if (uc >= 0x100 ||
516a5887 946 (!(data->start_class->flags & (ANYOF_CLASS | ANYOF_LOCALE))
1aa99e6b 947 && !ANYOF_BITMAP_TEST(data->start_class, uc)
653099ff 948 && (!(data->start_class->flags & ANYOF_FOLD)
1aa99e6b 949 || !ANYOF_BITMAP_TEST(data->start_class, PL_fold[uc])))
516a5887 950 )
653099ff
GS
951 compat = 0;
952 ANYOF_CLASS_ZERO(data->start_class);
953 ANYOF_BITMAP_ZERO(data->start_class);
954 if (compat)
1aa99e6b 955 ANYOF_BITMAP_SET(data->start_class, uc);
653099ff 956 data->start_class->flags &= ~ANYOF_EOS;
9b877dbb
IH
957 if (uc < 0x100)
958 data->start_class->flags &= ~ANYOF_UNICODE_ALL;
653099ff
GS
959 }
960 else if (flags & SCF_DO_STCLASS_OR) {
961 /* false positive possible if the class is case-folded */
1aa99e6b 962 if (uc < 0x100)
9b877dbb
IH
963 ANYOF_BITMAP_SET(data->start_class, uc);
964 else
965 data->start_class->flags |= ANYOF_UNICODE_ALL;
653099ff
GS
966 data->start_class->flags &= ~ANYOF_EOS;
967 cl_and(data->start_class, &and_with);
968 }
969 flags &= ~SCF_DO_STCLASS;
a0ed51b3 970 }
653099ff 971 else if (PL_regkind[(U8)OP(scan)] == EXACT) { /* But OP != EXACT! */
cd439c50 972 I32 l = STR_LEN(scan);
1aa99e6b 973 UV uc = *((U8*)STRING(scan));
653099ff
GS
974
975 /* Search for fixed substrings supports EXACT only. */
b81d288d 976 if (flags & SCF_DO_SUBSTR)
830247a4 977 scan_commit(pRExC_state, data);
a0ed51b3 978 if (UTF) {
1aa99e6b
IH
979 U8 *s = (U8 *)STRING(scan);
980 l = utf8_length(s, s + l);
9041c2e3 981 uc = utf8_to_uvchr(s, NULL);
a0ed51b3
LW
982 }
983 min += l;
c277df42 984 if (data && (flags & SCF_DO_SUBSTR))
a0ed51b3 985 data->pos_min += l;
653099ff
GS
986 if (flags & SCF_DO_STCLASS_AND) {
987 /* Check whether it is compatible with what we know already! */
988 int compat = 1;
989
1aa99e6b 990 if (uc >= 0x100 ||
516a5887 991 (!(data->start_class->flags & (ANYOF_CLASS | ANYOF_LOCALE))
1aa99e6b 992 && !ANYOF_BITMAP_TEST(data->start_class, uc)
516a5887 993 && !ANYOF_BITMAP_TEST(data->start_class, PL_fold[uc])))
653099ff
GS
994 compat = 0;
995 ANYOF_CLASS_ZERO(data->start_class);
996 ANYOF_BITMAP_ZERO(data->start_class);
997 if (compat) {
1aa99e6b 998 ANYOF_BITMAP_SET(data->start_class, uc);
653099ff
GS
999 data->start_class->flags &= ~ANYOF_EOS;
1000 data->start_class->flags |= ANYOF_FOLD;
1001 if (OP(scan) == EXACTFL)
1002 data->start_class->flags |= ANYOF_LOCALE;
1003 }
1004 }
1005 else if (flags & SCF_DO_STCLASS_OR) {
1006 if (data->start_class->flags & ANYOF_FOLD) {
1007 /* false positive possible if the class is case-folded.
1008 Assume that the locale settings are the same... */
1aa99e6b
IH
1009 if (uc < 0x100)
1010 ANYOF_BITMAP_SET(data->start_class, uc);
653099ff
GS
1011 data->start_class->flags &= ~ANYOF_EOS;
1012 }
1013 cl_and(data->start_class, &and_with);
1014 }
1015 flags &= ~SCF_DO_STCLASS;
a0ed51b3 1016 }
a00f3e00 1017 else if (strchr((const char*)PL_varies,OP(scan))) {
9c5ffd7c 1018 I32 mincount, maxcount, minnext, deltanext, fl = 0;
aa7a4b56 1019 I32 f = flags, pos_before = 0;
c277df42 1020 regnode *oscan = scan;
653099ff
GS
1021 struct regnode_charclass_class this_class;
1022 struct regnode_charclass_class *oclass = NULL;
727f22e3 1023 I32 next_is_eval = 0;
653099ff 1024
22c35a8c 1025 switch (PL_regkind[(U8)OP(scan)]) {
653099ff 1026 case WHILEM: /* End of (?:...)* . */
c277df42
IZ
1027 scan = NEXTOPER(scan);
1028 goto finish;
1029 case PLUS:
653099ff 1030 if (flags & (SCF_DO_SUBSTR | SCF_DO_STCLASS)) {
c277df42 1031 next = NEXTOPER(scan);
653099ff 1032 if (OP(next) == EXACT || (flags & SCF_DO_STCLASS)) {
b81d288d
AB
1033 mincount = 1;
1034 maxcount = REG_INFTY;
c277df42
IZ
1035 next = regnext(scan);
1036 scan = NEXTOPER(scan);
1037 goto do_curly;
1038 }
1039 }
1040 if (flags & SCF_DO_SUBSTR)
1041 data->pos_min++;
1042 min++;
1043 /* Fall through. */
1044 case STAR:
653099ff
GS
1045 if (flags & SCF_DO_STCLASS) {
1046 mincount = 0;
b81d288d 1047 maxcount = REG_INFTY;
653099ff
GS
1048 next = regnext(scan);
1049 scan = NEXTOPER(scan);
1050 goto do_curly;
1051 }
b81d288d 1052 is_inf = is_inf_internal = 1;
c277df42
IZ
1053 scan = regnext(scan);
1054 if (flags & SCF_DO_SUBSTR) {
830247a4 1055 scan_commit(pRExC_state, data); /* Cannot extend fixed substrings */
c277df42
IZ
1056 data->longest = &(data->longest_float);
1057 }
1058 goto optimize_curly_tail;
1059 case CURLY:
b81d288d 1060 mincount = ARG1(scan);
c277df42
IZ
1061 maxcount = ARG2(scan);
1062 next = regnext(scan);
cb434fcc
IZ
1063 if (OP(scan) == CURLYX) {
1064 I32 lp = (data ? *(data->last_closep) : 0);
1065
1066 scan->flags = ((lp <= U8_MAX) ? lp : U8_MAX);
1067 }
c277df42 1068 scan = NEXTOPER(scan) + EXTRA_STEP_2ARGS;
727f22e3 1069 next_is_eval = (OP(scan) == EVAL);
c277df42
IZ
1070 do_curly:
1071 if (flags & SCF_DO_SUBSTR) {
830247a4 1072 if (mincount == 0) scan_commit(pRExC_state,data); /* Cannot extend fixed substrings */
c277df42
IZ
1073 pos_before = data->pos_min;
1074 }
1075 if (data) {
1076 fl = data->flags;
1077 data->flags &= ~(SF_HAS_PAR|SF_IN_PAR|SF_HAS_EVAL);
1078 if (is_inf)
1079 data->flags |= SF_IS_INF;
1080 }
653099ff 1081 if (flags & SCF_DO_STCLASS) {
830247a4 1082 cl_init(pRExC_state, &this_class);
653099ff
GS
1083 oclass = data->start_class;
1084 data->start_class = &this_class;
1085 f |= SCF_DO_STCLASS_AND;
1086 f &= ~SCF_DO_STCLASS_OR;
1087 }
e1901655
IZ
1088 /* These are the cases when once a subexpression
1089 fails at a particular position, it cannot succeed
1090 even after backtracking at the enclosing scope.
b81d288d 1091
e1901655
IZ
1092 XXXX what if minimal match and we are at the
1093 initial run of {n,m}? */
1094 if ((mincount != maxcount - 1) && (maxcount != REG_INFTY))
1095 f &= ~SCF_WHILEM_VISITED_POS;
653099ff 1096
c277df42 1097 /* This will finish on WHILEM, setting scan, or on NULL: */
b81d288d
AB
1098 minnext = study_chunk(pRExC_state, &scan, &deltanext, last, data,
1099 mincount == 0
653099ff
GS
1100 ? (f & ~SCF_DO_SUBSTR) : f);
1101
1102 if (flags & SCF_DO_STCLASS)
1103 data->start_class = oclass;
1104 if (mincount == 0 || minnext == 0) {
1105 if (flags & SCF_DO_STCLASS_OR) {
830247a4 1106 cl_or(pRExC_state, data->start_class, &this_class);
653099ff
GS
1107 }
1108 else if (flags & SCF_DO_STCLASS_AND) {
b81d288d 1109 /* Switch to OR mode: cache the old value of
653099ff
GS
1110 * data->start_class */
1111 StructCopy(data->start_class, &and_with,
1112 struct regnode_charclass_class);
1113 flags &= ~SCF_DO_STCLASS_AND;
1114 StructCopy(&this_class, data->start_class,
1115 struct regnode_charclass_class);
1116 flags |= SCF_DO_STCLASS_OR;
1117 data->start_class->flags |= ANYOF_EOS;
1118 }
1119 } else { /* Non-zero len */
1120 if (flags & SCF_DO_STCLASS_OR) {
830247a4 1121 cl_or(pRExC_state, data->start_class, &this_class);
653099ff
GS
1122 cl_and(data->start_class, &and_with);
1123 }
1124 else if (flags & SCF_DO_STCLASS_AND)
1125 cl_and(data->start_class, &this_class);
1126 flags &= ~SCF_DO_STCLASS;
1127 }
c277df42
IZ
1128 if (!scan) /* It was not CURLYX, but CURLY. */
1129 scan = next;
f5e9f069
NC
1130 if ( /* ? quantifier ok, except for (?{ ... }) */
1131 (next_is_eval || !(mincount == 0 && maxcount == 1))
84037bb0 1132 && (minnext == 0) && (deltanext == 0)
99799961 1133 && data && !(data->flags & (SF_HAS_PAR|SF_IN_PAR))
f5e9f069
NC
1134 && maxcount <= REG_INFTY/3 /* Complement check for big count */
1135 && ckWARN(WARN_REGEXP))
b45f050a 1136 {
830247a4 1137 vWARN(RExC_parse,
b45f050a
JF
1138 "Quantifier unexpected on zero-length expression");
1139 }
1140
c277df42 1141 min += minnext * mincount;
b81d288d 1142 is_inf_internal |= ((maxcount == REG_INFTY
155aba94
GS
1143 && (minnext + deltanext) > 0)
1144 || deltanext == I32_MAX);
aca2d497 1145 is_inf |= is_inf_internal;
c277df42
IZ
1146 delta += (minnext + deltanext) * maxcount - minnext * mincount;
1147
1148 /* Try powerful optimization CURLYX => CURLYN. */
b81d288d 1149 if ( OP(oscan) == CURLYX && data
c277df42
IZ
1150 && data->flags & SF_IN_PAR
1151 && !(data->flags & SF_HAS_EVAL)
1152 && !deltanext && minnext == 1 ) {
1153 /* Try to optimize to CURLYN. */
1154 regnode *nxt = NEXTOPER(oscan) + EXTRA_STEP_2ARGS;
497b47a8
JH
1155 regnode *nxt1 = nxt;
1156#ifdef DEBUGGING
1157 regnode *nxt2;
1158#endif
c277df42
IZ
1159
1160 /* Skip open. */
1161 nxt = regnext(nxt);
a00f3e00 1162 if (!strchr((const char*)PL_simple,OP(nxt))
22c35a8c 1163 && !(PL_regkind[(U8)OP(nxt)] == EXACT
b81d288d 1164 && STR_LEN(nxt) == 1))
c277df42 1165 goto nogo;
497b47a8 1166#ifdef DEBUGGING
c277df42 1167 nxt2 = nxt;
497b47a8 1168#endif
c277df42 1169 nxt = regnext(nxt);
b81d288d 1170 if (OP(nxt) != CLOSE)
c277df42
IZ
1171 goto nogo;
1172 /* Now we know that nxt2 is the only contents: */
eb160463 1173 oscan->flags = (U8)ARG(nxt);
c277df42
IZ
1174 OP(oscan) = CURLYN;
1175 OP(nxt1) = NOTHING; /* was OPEN. */
1176#ifdef DEBUGGING
1177 OP(nxt1 + 1) = OPTIMIZED; /* was count. */
1178 NEXT_OFF(nxt1+ 1) = 0; /* just for consistancy. */
1179 NEXT_OFF(nxt2) = 0; /* just for consistancy with CURLY. */
1180 OP(nxt) = OPTIMIZED; /* was CLOSE. */
1181 OP(nxt + 1) = OPTIMIZED; /* was count. */
1182 NEXT_OFF(nxt+ 1) = 0; /* just for consistancy. */
b81d288d 1183#endif
c277df42 1184 }
c277df42
IZ
1185 nogo:
1186
1187 /* Try optimization CURLYX => CURLYM. */
b81d288d 1188 if ( OP(oscan) == CURLYX && data
c277df42 1189 && !(data->flags & SF_HAS_PAR)
c277df42 1190 && !(data->flags & SF_HAS_EVAL)
b035a42e
NC
1191 && !deltanext /* atom is fixed width */
1192 && minnext != 0 /* CURLYM can't handle zero width */
1193 ) {
c277df42
IZ
1194 /* XXXX How to optimize if data == 0? */
1195 /* Optimize to a simpler form. */
1196 regnode *nxt = NEXTOPER(oscan) + EXTRA_STEP_2ARGS; /* OPEN */
1197 regnode *nxt2;
1198
1199 OP(oscan) = CURLYM;
1200 while ( (nxt2 = regnext(nxt)) /* skip over embedded stuff*/
b81d288d 1201 && (OP(nxt2) != WHILEM))
c277df42
IZ
1202 nxt = nxt2;
1203 OP(nxt2) = SUCCEED; /* Whas WHILEM */
c277df42
IZ
1204 /* Need to optimize away parenths. */
1205 if (data->flags & SF_IN_PAR) {
1206 /* Set the parenth number. */
1207 regnode *nxt1 = NEXTOPER(oscan) + EXTRA_STEP_2ARGS; /* OPEN*/
1208
b81d288d 1209 if (OP(nxt) != CLOSE)
b45f050a 1210 FAIL("Panic opt close");
eb160463 1211 oscan->flags = (U8)ARG(nxt);
c277df42
IZ
1212 OP(nxt1) = OPTIMIZED; /* was OPEN. */
1213 OP(nxt) = OPTIMIZED; /* was CLOSE. */
1214#ifdef DEBUGGING
1215 OP(nxt1 + 1) = OPTIMIZED; /* was count. */
1216 OP(nxt + 1) = OPTIMIZED; /* was count. */
1217 NEXT_OFF(nxt1 + 1) = 0; /* just for consistancy. */
1218 NEXT_OFF(nxt + 1) = 0; /* just for consistancy. */
b81d288d 1219#endif
c277df42
IZ
1220#if 0
1221 while ( nxt1 && (OP(nxt1) != WHILEM)) {
1222 regnode *nnxt = regnext(nxt1);
b81d288d 1223
c277df42
IZ
1224 if (nnxt == nxt) {
1225 if (reg_off_by_arg[OP(nxt1)])
1226 ARG_SET(nxt1, nxt2 - nxt1);
1227 else if (nxt2 - nxt1 < U16_MAX)
1228 NEXT_OFF(nxt1) = nxt2 - nxt1;
1229 else
1230 OP(nxt) = NOTHING; /* Cannot beautify */
1231 }
1232 nxt1 = nnxt;
1233 }
1234#endif
1235 /* Optimize again: */
b81d288d 1236 study_chunk(pRExC_state, &nxt1, &deltanext, nxt,
e1901655 1237 NULL, 0);
a0ed51b3
LW
1238 }
1239 else
c277df42 1240 oscan->flags = 0;
c277df42 1241 }
e1901655
IZ
1242 else if ((OP(oscan) == CURLYX)
1243 && (flags & SCF_WHILEM_VISITED_POS)
1244 /* See the comment on a similar expression above.
1245 However, this time it not a subexpression
1246 we care about, but the expression itself. */
1247 && (maxcount == REG_INFTY)
1248 && data && ++data->whilem_c < 16) {
1249 /* This stays as CURLYX, we can put the count/of pair. */
2c2d71f5
JH
1250 /* Find WHILEM (as in regexec.c) */
1251 regnode *nxt = oscan + NEXT_OFF(oscan);
1252
1253 if (OP(PREVOPER(nxt)) == NOTHING) /* LONGJMP */
1254 nxt += ARG(nxt);
eb160463
GS
1255 PREVOPER(nxt)->flags = (U8)(data->whilem_c
1256 | (RExC_whilem_seen << 4)); /* On WHILEM */
2c2d71f5 1257 }
b81d288d 1258 if (data && fl & (SF_HAS_PAR|SF_IN_PAR))
c277df42
IZ
1259 pars++;
1260 if (flags & SCF_DO_SUBSTR) {
1261 SV *last_str = Nullsv;
1262 int counted = mincount != 0;
1263
1264 if (data->last_end > 0 && mincount != 0) { /* Ends with a string. */
5d1c421c
JH
1265#if defined(SPARC64_GCC_WORKAROUND)
1266 I32 b = 0;
1267 STRLEN l = 0;
71a0dd65 1268 const char *s = NULL;
5d1c421c
JH
1269 I32 old = 0;
1270
1271 if (pos_before >= data->last_start_min)
1272 b = pos_before;
1273 else
1274 b = data->last_start_min;
1275
1276 l = 0;
71a0dd65 1277 s = SvPV_const(data->last_found, l);
5d1c421c
JH
1278 old = b - data->last_start_min;
1279
1280#else
b81d288d 1281 I32 b = pos_before >= data->last_start_min
c277df42
IZ
1282 ? pos_before : data->last_start_min;
1283 STRLEN l;
71a0dd65 1284 const char *s = SvPV_const(data->last_found, l);
a0ed51b3 1285 I32 old = b - data->last_start_min;
5d1c421c 1286#endif
a0ed51b3
LW
1287
1288 if (UTF)
1289 old = utf8_hop((U8*)s, old) - (U8*)s;
c277df42 1290
a0ed51b3 1291 l -= old;
c277df42 1292 /* Get the added string: */
79cb57f6 1293 last_str = newSVpvn(s + old, l);
0e933229
IH
1294 if (UTF)
1295 SvUTF8_on(last_str);
c277df42
IZ
1296 if (deltanext == 0 && pos_before == b) {
1297 /* What was added is a constant string */
1298 if (mincount > 1) {
1299 SvGROW(last_str, (mincount * l) + 1);
b81d288d 1300 repeatcpy(SvPVX(last_str) + l,
fdac8c4b 1301 SvPVX_const(last_str), l, mincount - 1);
a8dc4fe8 1302 SvCUR_set(last_str, SvCUR(last_str) * mincount);
c277df42 1303 /* Add additional parts. */
b81d288d 1304 SvCUR_set(data->last_found,
c277df42
IZ
1305 SvCUR(data->last_found) - l);
1306 sv_catsv(data->last_found, last_str);
ec2e9529
JH
1307 {
1308 SV * sv = data->last_found;
1309 MAGIC *mg =
1310 SvUTF8(sv) && SvMAGICAL(sv) ?
1311 mg_find(sv, PERL_MAGIC_utf8) : NULL;
1312 if (mg && mg->mg_len >= 0)
1313 mg->mg_len += CHR_SVLEN(last_str);
1314 }
c277df42
IZ
1315 data->last_end += l * (mincount - 1);
1316 }
2a8d9689
HS
1317 } else {
1318 /* start offset must point into the last copy */
1319 data->last_start_min += minnext * (mincount - 1);
5b7ea690
JH
1320 data->last_start_max += is_inf ? I32_MAX
1321 : (maxcount - 1) * (minnext + data->pos_delta);
c277df42
IZ
1322 }
1323 }
1324 /* It is counted once already... */
1325 data->pos_min += minnext * (mincount - counted);
1326 data->pos_delta += - counted * deltanext +
1327 (minnext + deltanext) * maxcount - minnext * mincount;
1328 if (mincount != maxcount) {
653099ff
GS
1329 /* Cannot extend fixed substrings found inside
1330 the group. */
830247a4 1331 scan_commit(pRExC_state,data);
c277df42
IZ
1332 if (mincount && last_str) {
1333 sv_setsv(data->last_found, last_str);
1334 data->last_end = data->pos_min;
b81d288d 1335 data->last_start_min =
a0ed51b3 1336 data->pos_min - CHR_SVLEN(last_str);
b81d288d
AB
1337 data->last_start_max = is_inf
1338 ? I32_MAX
c277df42 1339 : data->pos_min + data->pos_delta
a0ed51b3 1340 - CHR_SVLEN(last_str);
c277df42
IZ
1341 }
1342 data->longest = &(data->longest_float);
1343 }
aca2d497 1344 SvREFCNT_dec(last_str);
c277df42 1345 }
405ff068 1346 if (data && (fl & SF_HAS_EVAL))
c277df42
IZ
1347 data->flags |= SF_HAS_EVAL;
1348 optimize_curly_tail:
c277df42 1349 if (OP(oscan) != CURLYX) {
22c35a8c 1350 while (PL_regkind[(U8)OP(next = regnext(oscan))] == NOTHING
c277df42
IZ
1351 && NEXT_OFF(next))
1352 NEXT_OFF(oscan) += NEXT_OFF(next);
1353 }
c277df42 1354 continue;
653099ff 1355 default: /* REF and CLUMP only? */
c277df42 1356 if (flags & SCF_DO_SUBSTR) {
830247a4 1357 scan_commit(pRExC_state,data); /* Cannot expect anything... */
c277df42
IZ
1358 data->longest = &(data->longest_float);
1359 }
aca2d497 1360 is_inf = is_inf_internal = 1;
653099ff 1361 if (flags & SCF_DO_STCLASS_OR)
830247a4 1362 cl_anything(pRExC_state, data->start_class);
653099ff 1363 flags &= ~SCF_DO_STCLASS;
c277df42
IZ
1364 break;
1365 }
a0ed51b3 1366 }
a00f3e00 1367 else if (strchr((const char*)PL_simple,OP(scan))) {
9c5ffd7c 1368 int value = 0;
653099ff 1369
c277df42 1370 if (flags & SCF_DO_SUBSTR) {
830247a4 1371 scan_commit(pRExC_state,data);
c277df42
IZ
1372 data->pos_min++;
1373 }
1374 min++;
653099ff
GS
1375 if (flags & SCF_DO_STCLASS) {
1376 data->start_class->flags &= ~ANYOF_EOS; /* No match on empty */
1377
1378 /* Some of the logic below assumes that switching
1379 locale on will only add false positives. */
1380 switch (PL_regkind[(U8)OP(scan)]) {
653099ff 1381 case SANY:
653099ff
GS
1382 default:
1383 do_default:
1384 /* Perl_croak(aTHX_ "panic: unexpected simple REx opcode %d", OP(scan)); */
1385 if (flags & SCF_DO_STCLASS_OR) /* Allow everything */
830247a4 1386 cl_anything(pRExC_state, data->start_class);
653099ff
GS
1387 break;
1388 case REG_ANY:
1389 if (OP(scan) == SANY)
1390 goto do_default;
1391 if (flags & SCF_DO_STCLASS_OR) { /* Everything but \n */
1392 value = (ANYOF_BITMAP_TEST(data->start_class,'\n')
1393 || (data->start_class->flags & ANYOF_CLASS));
830247a4 1394 cl_anything(pRExC_state, data->start_class);
653099ff
GS
1395 }
1396 if (flags & SCF_DO_STCLASS_AND || !value)
1397 ANYOF_BITMAP_CLEAR(data->start_class,'\n');
1398 break;
1399 case ANYOF:
1400 if (flags & SCF_DO_STCLASS_AND)
1401 cl_and(data->start_class,
1402 (struct regnode_charclass_class*)scan);
1403 else
830247a4 1404 cl_or(pRExC_state, data->start_class,
653099ff
GS
1405 (struct regnode_charclass_class*)scan);
1406 break;
1407 case ALNUM:
1408 if (flags & SCF_DO_STCLASS_AND) {
1409 if (!(data->start_class->flags & ANYOF_LOCALE)) {
1410 ANYOF_CLASS_CLEAR(data->start_class,ANYOF_NALNUM);
1411 for (value = 0; value < 256; value++)
1412 if (!isALNUM(value))
1413 ANYOF_BITMAP_CLEAR(data->start_class, value);
1414 }
1415 }
1416 else {
1417 if (data->start_class->flags & ANYOF_LOCALE)
1418 ANYOF_CLASS_SET(data->start_class,ANYOF_ALNUM);
1419 else {
1420 for (value = 0; value < 256; value++)
1421 if (isALNUM(value))
b81d288d 1422 ANYOF_BITMAP_SET(data->start_class, value);
653099ff
GS
1423 }
1424 }
1425 break;
1426 case ALNUML:
1427 if (flags & SCF_DO_STCLASS_AND) {
1428 if (data->start_class->flags & ANYOF_LOCALE)
1429 ANYOF_CLASS_CLEAR(data->start_class,ANYOF_NALNUM);
1430 }
1431 else {
1432 ANYOF_CLASS_SET(data->start_class,ANYOF_ALNUM);
1433 data->start_class->flags |= ANYOF_LOCALE;
1434 }
1435 break;
1436 case NALNUM:
1437 if (flags & SCF_DO_STCLASS_AND) {
1438 if (!(data->start_class->flags & ANYOF_LOCALE)) {
1439 ANYOF_CLASS_CLEAR(data->start_class,ANYOF_ALNUM);
1440 for (value = 0; value < 256; value++)
1441 if (isALNUM(value))
1442 ANYOF_BITMAP_CLEAR(data->start_class, value);
1443 }
1444 }
1445 else {
1446 if (data->start_class->flags & ANYOF_LOCALE)
1447 ANYOF_CLASS_SET(data->start_class,ANYOF_NALNUM);
1448 else {
1449 for (value = 0; value < 256; value++)
1450 if (!isALNUM(value))
b81d288d 1451 ANYOF_BITMAP_SET(data->start_class, value);
653099ff
GS
1452 }
1453 }
1454 break;
1455 case NALNUML:
1456 if (flags & SCF_DO_STCLASS_AND) {
1457 if (data->start_class->flags & ANYOF_LOCALE)
1458 ANYOF_CLASS_CLEAR(data->start_class,ANYOF_ALNUM);
1459 }
1460 else {
1461 data->start_class->flags |= ANYOF_LOCALE;
1462 ANYOF_CLASS_SET(data->start_class,ANYOF_NALNUM);
1463 }
1464 break;
1465 case SPACE:
1466 if (flags & SCF_DO_STCLASS_AND) {
1467 if (!(data->start_class->flags & ANYOF_LOCALE)) {
1468 ANYOF_CLASS_CLEAR(data->start_class,ANYOF_NSPACE);
1469 for (value = 0; value < 256; value++)
1470 if (!isSPACE(value))
1471 ANYOF_BITMAP_CLEAR(data->start_class, value);
1472 }
1473 }
1474 else {
1475 if (data->start_class->flags & ANYOF_LOCALE)
1476 ANYOF_CLASS_SET(data->start_class,ANYOF_SPACE);
1477 else {
1478 for (value = 0; value < 256; value++)
1479 if (isSPACE(value))
b81d288d 1480 ANYOF_BITMAP_SET(data->start_class, value);
653099ff
GS
1481 }
1482 }
1483 break;
1484 case SPACEL:
1485 if (flags & SCF_DO_STCLASS_AND) {
1486 if (data->start_class->flags & ANYOF_LOCALE)
1487 ANYOF_CLASS_CLEAR(data->start_class,ANYOF_NSPACE);
1488 }
1489 else {
1490 data->start_class->flags |= ANYOF_LOCALE;
1491 ANYOF_CLASS_SET(data->start_class,ANYOF_SPACE);
1492 }
1493 break;
1494 case NSPACE:
1495 if (flags & SCF_DO_STCLASS_AND) {
1496 if (!(data->start_class->flags & ANYOF_LOCALE)) {
1497 ANYOF_CLASS_CLEAR(data->start_class,ANYOF_SPACE);
1498 for (value = 0; value < 256; value++)
1499 if (isSPACE(value))
1500 ANYOF_BITMAP_CLEAR(data->start_class, value);
1501 }
1502 }
1503 else {
1504 if (data->start_class->flags & ANYOF_LOCALE)
1505 ANYOF_CLASS_SET(data->start_class,ANYOF_NSPACE);
1506 else {
1507 for (value = 0; value < 256; value++)
1508 if (!isSPACE(value))
b81d288d 1509 ANYOF_BITMAP_SET(data->start_class, value);
653099ff
GS
1510 }
1511 }
1512 break;
1513 case NSPACEL:
1514 if (flags & SCF_DO_STCLASS_AND) {
1515 if (data->start_class->flags & ANYOF_LOCALE) {
1516 ANYOF_CLASS_CLEAR(data->start_class,ANYOF_SPACE);
1517 for (value = 0; value < 256; value++)
1518 if (!isSPACE(value))
1519 ANYOF_BITMAP_CLEAR(data->start_class, value);
1520 }
1521 }
1522 else {
1523 data->start_class->flags |= ANYOF_LOCALE;
1524 ANYOF_CLASS_SET(data->start_class,ANYOF_NSPACE);
1525 }
1526 break;
1527 case DIGIT:
1528 if (flags & SCF_DO_STCLASS_AND) {
1529 ANYOF_CLASS_CLEAR(data->start_class,ANYOF_NDIGIT);
1530 for (value = 0; value < 256; value++)
1531 if (!isDIGIT(value))
1532 ANYOF_BITMAP_CLEAR(data->start_class, value);
1533 }
1534 else {
1535 if (data->start_class->flags & ANYOF_LOCALE)
1536 ANYOF_CLASS_SET(data->start_class,ANYOF_DIGIT);
1537 else {
1538 for (value = 0; value < 256; value++)
1539 if (isDIGIT(value))
b81d288d 1540 ANYOF_BITMAP_SET(data->start_class, value);
653099ff
GS
1541 }
1542 }
1543 break;
1544 case NDIGIT:
1545 if (flags & SCF_DO_STCLASS_AND) {
1546 ANYOF_CLASS_CLEAR(data->start_class,ANYOF_DIGIT);
1547 for (value = 0; value < 256; value++)
1548 if (isDIGIT(value))
1549 ANYOF_BITMAP_CLEAR(data->start_class, value);
1550 }
1551 else {
1552 if (data->start_class->flags & ANYOF_LOCALE)
1553 ANYOF_CLASS_SET(data->start_class,ANYOF_NDIGIT);
1554 else {
1555 for (value = 0; value < 256; value++)
1556 if (!isDIGIT(value))
b81d288d 1557 ANYOF_BITMAP_SET(data->start_class, value);
653099ff
GS
1558 }
1559 }
1560 break;
1561 }
1562 if (flags & SCF_DO_STCLASS_OR)
1563 cl_and(data->start_class, &and_with);
1564 flags &= ~SCF_DO_STCLASS;
1565 }
a0ed51b3 1566 }
22c35a8c 1567 else if (PL_regkind[(U8)OP(scan)] == EOL && flags & SCF_DO_SUBSTR) {
c277df42
IZ
1568 data->flags |= (OP(scan) == MEOL
1569 ? SF_BEFORE_MEOL
1570 : SF_BEFORE_SEOL);
a0ed51b3 1571 }
653099ff
GS
1572 else if ( PL_regkind[(U8)OP(scan)] == BRANCHJ
1573 /* Lookbehind, or need to calculate parens/evals/stclass: */
1574 && (scan->flags || data || (flags & SCF_DO_STCLASS))
c277df42 1575 && (OP(scan) == IFMATCH || OP(scan) == UNLESSM)) {
653099ff 1576 /* Lookahead/lookbehind */
cb434fcc 1577 I32 deltanext, minnext, fake = 0;
c277df42 1578 regnode *nscan;
653099ff
GS
1579 struct regnode_charclass_class intrnl;
1580 int f = 0;
c277df42
IZ
1581
1582 data_fake.flags = 0;
b81d288d 1583 if (data) {
2c2d71f5 1584 data_fake.whilem_c = data->whilem_c;
cb434fcc
IZ
1585 data_fake.last_closep = data->last_closep;
1586 }
1587 else
1588 data_fake.last_closep = &fake;
653099ff
GS
1589 if ( flags & SCF_DO_STCLASS && !scan->flags
1590 && OP(scan) == IFMATCH ) { /* Lookahead */
830247a4 1591 cl_init(pRExC_state, &intrnl);
653099ff 1592 data_fake.start_class = &intrnl;
e1901655 1593 f |= SCF_DO_STCLASS_AND;
653099ff 1594 }
e1901655
IZ
1595 if (flags & SCF_WHILEM_VISITED_POS)
1596 f |= SCF_WHILEM_VISITED_POS;
c277df42
IZ
1597 next = regnext(scan);
1598 nscan = NEXTOPER(NEXTOPER(scan));
830247a4 1599 minnext = study_chunk(pRExC_state, &nscan, &deltanext, last, &data_fake, f);
c277df42
IZ
1600 if (scan->flags) {
1601 if (deltanext) {
9baa0206 1602 vFAIL("Variable length lookbehind not implemented");
a0ed51b3
LW
1603 }
1604 else if (minnext > U8_MAX) {
9baa0206 1605 vFAIL2("Lookbehind longer than %"UVuf" not implemented", (UV)U8_MAX);
c277df42 1606 }
eb160463 1607 scan->flags = (U8)minnext;
c277df42
IZ
1608 }
1609 if (data && data_fake.flags & (SF_HAS_PAR|SF_IN_PAR))
1610 pars++;
405ff068 1611 if (data && (data_fake.flags & SF_HAS_EVAL))
c277df42 1612 data->flags |= SF_HAS_EVAL;
2c2d71f5
JH
1613 if (data)
1614 data->whilem_c = data_fake.whilem_c;
e1901655 1615 if (f & SCF_DO_STCLASS_AND) {
e4057cfc 1616 const int was = (data->start_class->flags & ANYOF_EOS);
653099ff
GS
1617
1618 cl_and(data->start_class, &intrnl);
1619 if (was)
1620 data->start_class->flags |= ANYOF_EOS;
1621 }
a0ed51b3
LW
1622 }
1623 else if (OP(scan) == OPEN) {
c277df42 1624 pars++;
a0ed51b3 1625 }
cb434fcc 1626 else if (OP(scan) == CLOSE) {
eb160463 1627 if ((I32)ARG(scan) == is_par) {
cb434fcc 1628 next = regnext(scan);
c277df42 1629
cb434fcc
IZ
1630 if ( next && (OP(next) != WHILEM) && next < last)
1631 is_par = 0; /* Disable optimization */
1632 }
1633 if (data)
1634 *(data->last_closep) = ARG(scan);
a0ed51b3
LW
1635 }
1636 else if (OP(scan) == EVAL) {
c277df42
IZ
1637 if (data)
1638 data->flags |= SF_HAS_EVAL;
1639 }
96776eda 1640 else if (OP(scan) == LOGICAL && scan->flags == 2) { /* Embedded follows */
0f5d15d6 1641 if (flags & SCF_DO_SUBSTR) {
830247a4 1642 scan_commit(pRExC_state,data);
0f5d15d6
IZ
1643 data->longest = &(data->longest_float);
1644 }
1645 is_inf = is_inf_internal = 1;
653099ff 1646 if (flags & SCF_DO_STCLASS_OR) /* Allow everything */
830247a4 1647 cl_anything(pRExC_state, data->start_class);
96776eda 1648 flags &= ~SCF_DO_STCLASS;
0f5d15d6 1649 }
c277df42
IZ
1650 /* Else: zero-length, ignore. */
1651 scan = regnext(scan);
1652 }
1653
1654 finish:
1655 *scanp = scan;
aca2d497 1656 *deltap = is_inf_internal ? I32_MAX : delta;
b81d288d 1657 if (flags & SCF_DO_SUBSTR && is_inf)
c277df42
IZ
1658 data->pos_delta = I32_MAX - data->pos_min;
1659 if (is_par > U8_MAX)
1660 is_par = 0;
1661 if (is_par && pars==1 && data) {
1662 data->flags |= SF_IN_PAR;
1663 data->flags &= ~SF_HAS_PAR;
a0ed51b3
LW
1664 }
1665 else if (pars && data) {
c277df42
IZ
1666 data->flags |= SF_HAS_PAR;
1667 data->flags &= ~SF_IN_PAR;
1668 }
653099ff
GS
1669 if (flags & SCF_DO_STCLASS_OR)
1670 cl_and(data->start_class, &and_with);
c277df42
IZ
1671 return min;
1672}
1673
76e3520e 1674STATIC I32
a00f3e00 1675S_add_data(pTHX_ RExC_state_t *pRExC_state, I32 n, const char *s)
c277df42 1676{
830247a4 1677 if (RExC_rx->data) {
b81d288d
AB
1678 Renewc(RExC_rx->data,
1679 sizeof(*RExC_rx->data) + sizeof(void*) * (RExC_rx->data->count + n - 1),
c277df42 1680 char, struct reg_data);
830247a4
IZ
1681 Renew(RExC_rx->data->what, RExC_rx->data->count + n, U8);
1682 RExC_rx->data->count += n;
a0ed51b3
LW
1683 }
1684 else {
cd7a8267 1685 Newxc(RExC_rx->data, sizeof(*RExC_rx->data) + sizeof(void*) * (n - 1),
c277df42 1686 char, struct reg_data);
cd7a8267 1687 Newx(RExC_rx->data->what, n, U8);
830247a4 1688 RExC_rx->data->count = n;
c277df42 1689 }
830247a4
IZ
1690 Copy(s, RExC_rx->data->what + RExC_rx->data->count - n, n, U8);
1691 return RExC_rx->data->count - n;
c277df42
IZ
1692}
1693
d88dccdf 1694void
864dbfa3 1695Perl_reginitcolors(pTHX)
d88dccdf 1696{
228fe6e6 1697 const char * const s = PerlEnv_getenv("PERL_RE_COLORS");
d88dccdf 1698 if (s) {
228fe6e6
AL
1699 char *t = savepv(s);
1700 int i = 0;
1701 PL_colors[0] = t;
d88dccdf 1702 while (++i < 6) {
228fe6e6
AL
1703 t = strchr(t, '\t');
1704 if (t) {
1705 *t = '\0';
1706 PL_colors[i] = ++t;
d88dccdf
IZ
1707 }
1708 else
228fe6e6 1709 PL_colors[i] = t = (char *)"";
d88dccdf
IZ
1710 }
1711 } else {
228fe6e6 1712 int i = 0;
b81d288d 1713 while (i < 6)
ec6f298e 1714 PL_colors[i++] = (char *)"";
d88dccdf
IZ
1715 }
1716 PL_colorset = 1;
1717}
1718
8615cb43 1719
a687059c 1720/*
e50aee73 1721 - pregcomp - compile a regular expression into internal code
a687059c
LW
1722 *
1723 * We can't allocate space until we know how big the compiled form will be,
1724 * but we can't compile it (and thus know how big it is) until we've got a
1725 * place to put the code. So we cheat: we compile it twice, once with code
1726 * generation turned off and size counting turned on, and once "for real".
1727 * This also means that we don't allocate space until we are sure that the
1728 * thing really will compile successfully, and we never have to move the
1729 * code and thus invalidate pointers into it. (Note that it has to be in
1730 * one piece because free() must be able to free it all.) [NB: not true in perl]
1731 *
1732 * Beware that the optimization-preparation code in here knows about some
1733 * of the structure of the compiled regexp. [I'll say.]
1734 */
1735regexp *
864dbfa3 1736Perl_pregcomp(pTHX_ char *exp, char *xend, PMOP *pm)
a687059c 1737{
a0d0e21e 1738 register regexp *r;
c277df42 1739 regnode *scan;
c277df42 1740 regnode *first;
a0d0e21e 1741 I32 flags;
a0d0e21e
LW
1742 I32 minlen = 0;
1743 I32 sawplus = 0;
1744 I32 sawopen = 0;
2c2d71f5 1745 scan_data_t data;
830247a4
IZ
1746 RExC_state_t RExC_state;
1747 RExC_state_t *pRExC_state = &RExC_state;
a0d0e21e
LW
1748
1749 if (exp == NULL)
c277df42 1750 FAIL("NULL regexp argument");
a0d0e21e 1751
a5961de5 1752 RExC_utf8 = pm->op_pmdynflags & PMdf_CMP_UTF8;
a0ed51b3 1753
5cfc7842 1754 RExC_precomp = exp;
a5961de5
JH
1755 DEBUG_r({
1756 if (!PL_colorset) reginitcolors();
1757 PerlIO_printf(Perl_debug_log, "%sCompiling REx%s `%s%*s%s'\n",
1758 PL_colors[4],PL_colors[5],PL_colors[0],
1759 (int)(xend - exp), RExC_precomp, PL_colors[1]);
1760 });
e2509266 1761 RExC_flags = pm->op_pmflags;
830247a4 1762 RExC_sawback = 0;
bbce6d69 1763
830247a4
IZ
1764 RExC_seen = 0;
1765 RExC_seen_zerolen = *exp == '^' ? -1 : 0;
1766 RExC_seen_evals = 0;
1767 RExC_extralen = 0;
c277df42 1768
bbce6d69 1769 /* First pass: determine size, legality. */
830247a4 1770 RExC_parse = exp;
fac92740 1771 RExC_start = exp;
830247a4
IZ
1772 RExC_end = xend;
1773 RExC_naughty = 0;
1774 RExC_npar = 1;
1775 RExC_size = 0L;
1776 RExC_emit = &PL_regdummy;
1777 RExC_whilem_seen = 0;
85ddcde9
JH
1778#if 0 /* REGC() is (currently) a NOP at the first pass.
1779 * Clever compilers notice this and complain. --jhi */
830247a4 1780 REGC((U8)REG_MAGIC, (char*)RExC_emit);
85ddcde9 1781#endif
830247a4 1782 if (reg(pRExC_state, 0, &flags) == NULL) {
830247a4 1783 RExC_precomp = Nullch;
a0d0e21e
LW
1784 return(NULL);
1785 }
830247a4 1786 DEBUG_r(PerlIO_printf(Perl_debug_log, "size %"IVdf" ", (IV)RExC_size));
c277df42 1787
c277df42
IZ
1788 /* Small enough for pointer-storage convention?
1789 If extralen==0, this means that we will not need long jumps. */
830247a4
IZ
1790 if (RExC_size >= 0x10000L && RExC_extralen)
1791 RExC_size += RExC_extralen;
c277df42 1792 else
830247a4
IZ
1793 RExC_extralen = 0;
1794 if (RExC_whilem_seen > 15)
1795 RExC_whilem_seen = 15;
a0d0e21e 1796
bbce6d69 1797 /* Allocate space and initialize. */
cd7a8267 1798 Newxc(r, sizeof(regexp) + (unsigned)RExC_size * sizeof(regnode),
c277df42 1799 char, regexp);
a0d0e21e 1800 if (r == NULL)
b45f050a
JF
1801 FAIL("Regexp out of space");
1802
0f79a09d
GS
1803#ifdef DEBUGGING
1804 /* avoid reading uninitialized memory in DEBUGGING code in study_chunk() */
830247a4 1805 Zero(r, sizeof(regexp) + (unsigned)RExC_size * sizeof(regnode), char);
0f79a09d 1806#endif
c277df42 1807 r->refcnt = 1;
bbce6d69 1808 r->prelen = xend - exp;
5cfc7842 1809 r->precomp = savepvn(RExC_precomp, r->prelen);
cf93c79d
IZ
1810 r->subbeg = NULL;
1811 r->reganch = pm->op_pmflags & PMf_COMPILETIME;
830247a4 1812 r->nparens = RExC_npar - 1; /* set early to validate backrefs */
4327152a
IZ
1813
1814 r->substrs = 0; /* Useful during FAIL. */
1815 r->startp = 0; /* Useful during FAIL. */
1816 r->endp = 0; /* Useful during FAIL. */
1817
cd7a8267 1818 Newxz(r->offsets, 2*RExC_size+1, U32); /* MJD 20001228 */
fac92740 1819 if (r->offsets) {
e503e849 1820 r->offsets[0] = RExC_size;
fac92740
MJD
1821 }
1822 DEBUG_r(PerlIO_printf(Perl_debug_log,
e503e849
NC
1823 "%s %"UVuf" bytes for offset annotations.\n",
1824 r->offsets ? "Got" : "Couldn't get",
392fbf5d 1825 (UV)((2*RExC_size+1) * sizeof(U32))));
fac92740 1826
830247a4 1827 RExC_rx = r;
bbce6d69 1828
1829 /* Second pass: emit code. */
e2509266 1830 RExC_flags = pm->op_pmflags; /* don't let top level (?i) bleed */
830247a4
IZ
1831 RExC_parse = exp;
1832 RExC_end = xend;
1833 RExC_naughty = 0;
1834 RExC_npar = 1;
fac92740 1835 RExC_emit_start = r->program;
830247a4 1836 RExC_emit = r->program;
2cd61cdb 1837 /* Store the count of eval-groups for security checks: */
eb160463 1838 RExC_emit->next_off = (U16)((RExC_seen_evals > U16_MAX) ? U16_MAX : RExC_seen_evals);
830247a4 1839 REGC((U8)REG_MAGIC, (char*) RExC_emit++);
c277df42 1840 r->data = 0;
830247a4 1841 if (reg(pRExC_state, 0, &flags) == NULL)
a0d0e21e
LW
1842 return(NULL);
1843
1844 /* Dig out information for optimizations. */
cf93c79d 1845 r->reganch = pm->op_pmflags & PMf_COMPILETIME; /* Again? */
e2509266 1846 pm->op_pmflags = RExC_flags;
a0ed51b3 1847 if (UTF)
5ff6fc6d 1848 r->reganch |= ROPT_UTF8; /* Unicode in it? */
c277df42 1849 r->regstclass = NULL;
830247a4 1850 if (RExC_naughty >= 10) /* Probably an expensive pattern. */
a0ed51b3 1851 r->reganch |= ROPT_NAUGHTY;
c277df42 1852 scan = r->program + 1; /* First BRANCH. */
2779dcf1
IZ
1853
1854 /* XXXX To minimize changes to RE engine we always allocate
1855 3-units-long substrs field. */
cd7a8267 1856 Newxz(r->substrs, 1, struct reg_substr_data);
2779dcf1 1857
2c2d71f5 1858 StructCopy(&zero_scan_data, &data, scan_data_t);
653099ff 1859 /* XXXX Should not we check for something else? Usually it is OPEN1... */
c277df42 1860 if (OP(scan) != BRANCH) { /* Only one top-level choice. */
c277df42 1861 I32 fake;
c5254dd6 1862 STRLEN longest_float_length, longest_fixed_length;
653099ff
GS
1863 struct regnode_charclass_class ch_class;
1864 int stclass_flag;
cb434fcc 1865 I32 last_close = 0;
a0d0e21e
LW
1866
1867 first = scan;
c277df42 1868 /* Skip introductions and multiplicators >= 1. */
a0d0e21e 1869 while ((OP(first) == OPEN && (sawopen = 1)) ||
653099ff 1870 /* An OR of *one* alternative - should not happen now. */
a0d0e21e
LW
1871 (OP(first) == BRANCH && OP(regnext(first)) != BRANCH) ||
1872 (OP(first) == PLUS) ||
1873 (OP(first) == MINMOD) ||
653099ff 1874 /* An {n,m} with n>0 */
22c35a8c 1875 (PL_regkind[(U8)OP(first)] == CURLY && ARG1(first) > 0) ) {
a0d0e21e
LW
1876 if (OP(first) == PLUS)
1877 sawplus = 1;
1878 else
1879 first += regarglen[(U8)OP(first)];
1880 first = NEXTOPER(first);
a687059c
LW
1881 }
1882
a0d0e21e
LW
1883 /* Starting-point info. */
1884 again:
653099ff 1885 if (PL_regkind[(U8)OP(first)] == EXACT) {
1aa99e6b
IH
1886 if (OP(first) == EXACT)
1887 ; /* Empty, get anchored substr later. */
1888 else if ((OP(first) == EXACTF || OP(first) == EXACTFL))
b3c9acc1
IZ
1889 r->regstclass = first;
1890 }
a00f3e00 1891 else if (strchr((const char*)PL_simple,OP(first)))
a0d0e21e 1892 r->regstclass = first;
22c35a8c
GS
1893 else if (PL_regkind[(U8)OP(first)] == BOUND ||
1894 PL_regkind[(U8)OP(first)] == NBOUND)
a0d0e21e 1895 r->regstclass = first;
22c35a8c 1896 else if (PL_regkind[(U8)OP(first)] == BOL) {
cad2e5aa
JH
1897 r->reganch |= (OP(first) == MBOL
1898 ? ROPT_ANCH_MBOL
1899 : (OP(first) == SBOL
1900 ? ROPT_ANCH_SBOL
1901 : ROPT_ANCH_BOL));
a0d0e21e 1902 first = NEXTOPER(first);
774d564b 1903 goto again;
1904 }
1905 else if (OP(first) == GPOS) {
1906 r->reganch |= ROPT_ANCH_GPOS;
1907 first = NEXTOPER(first);
1908 goto again;
a0d0e21e 1909 }
e09294f4 1910 else if (!sawopen && (OP(first) == STAR &&
22c35a8c 1911 PL_regkind[(U8)OP(NEXTOPER(first))] == REG_ANY) &&
a0d0e21e
LW
1912 !(r->reganch & ROPT_ANCH) )
1913 {
1914 /* turn .* into ^.* with an implied $*=1 */
228fe6e6
AL
1915 const int type =
1916 (OP(NEXTOPER(first)) == REG_ANY)
1917 ? ROPT_ANCH_MBOL
1918 : ROPT_ANCH_SBOL;
cad2e5aa 1919 r->reganch |= type | ROPT_IMPLICIT;
a0d0e21e 1920 first = NEXTOPER(first);
774d564b 1921 goto again;
a0d0e21e 1922 }
b81d288d 1923 if (sawplus && (!sawopen || !RExC_sawback)
830247a4 1924 && !(RExC_seen & REG_SEEN_EVAL)) /* May examine pos and $& */
cad2e5aa
JH
1925 /* x+ must match at the 1st pos of run of x's */
1926 r->reganch |= ROPT_SKIP;
a0d0e21e 1927
c277df42 1928 /* Scan is after the zeroth branch, first is atomic matcher. */
b81d288d 1929 DEBUG_r(PerlIO_printf(Perl_debug_log, "first at %"IVdf"\n",
b900a521 1930 (IV)(first - scan + 1)));
a0d0e21e
LW
1931 /*
1932 * If there's something expensive in the r.e., find the
1933 * longest literal string that must appear and make it the
1934 * regmust. Resolve ties in favor of later strings, since
1935 * the regstart check works with the beginning of the r.e.
1936 * and avoiding duplication strengthens checking. Not a
1937 * strong reason, but sufficient in the absence of others.
1938 * [Now we resolve ties in favor of the earlier string if
c277df42 1939 * it happens that c_offset_min has been invalidated, since the
a0d0e21e
LW
1940 * earlier string may buy us something the later one won't.]
1941 */
a0d0e21e 1942 minlen = 0;
a687059c 1943
79cb57f6
GS
1944 data.longest_fixed = newSVpvn("",0);
1945 data.longest_float = newSVpvn("",0);
1946 data.last_found = newSVpvn("",0);
c277df42
IZ
1947 data.longest = &(data.longest_fixed);
1948 first = scan;
653099ff 1949 if (!r->regstclass) {
830247a4 1950 cl_init(pRExC_state, &ch_class);
653099ff
GS
1951 data.start_class = &ch_class;
1952 stclass_flag = SCF_DO_STCLASS_AND;
1953 } else /* XXXX Check for BOUND? */
1954 stclass_flag = 0;
cb434fcc 1955 data.last_closep = &last_close;
653099ff 1956
830247a4 1957 minlen = study_chunk(pRExC_state, &first, &fake, scan + RExC_size, /* Up to end */
e1901655 1958 &data, SCF_DO_SUBSTR | SCF_WHILEM_VISITED_POS | stclass_flag);
830247a4 1959 if ( RExC_npar == 1 && data.longest == &(data.longest_fixed)
b81d288d 1960 && data.last_start_min == 0 && data.last_end > 0
830247a4
IZ
1961 && !RExC_seen_zerolen
1962 && (!(RExC_seen & REG_SEEN_GPOS) || (r->reganch & ROPT_ANCH_GPOS)))
c277df42 1963 r->reganch |= ROPT_CHECK_ALL;
830247a4 1964 scan_commit(pRExC_state, &data);
c277df42
IZ
1965 SvREFCNT_dec(data.last_found);
1966
a0ed51b3 1967 longest_float_length = CHR_SVLEN(data.longest_float);
c5254dd6 1968 if (longest_float_length
c277df42
IZ
1969 || (data.flags & SF_FL_BEFORE_EOL
1970 && (!(data.flags & SF_FL_BEFORE_MEOL)
e2509266 1971 || (RExC_flags & PMf_MULTILINE)))) {
cf93c79d
IZ
1972 int t;
1973
a0ed51b3 1974 if (SvCUR(data.longest_fixed) /* ok to leave SvCUR */
aca2d497
IZ
1975 && data.offset_fixed == data.offset_float_min
1976 && SvCUR(data.longest_fixed) == SvCUR(data.longest_float))
1977 goto remove_float; /* As in (a)+. */
1978
33b8afdf
JH
1979 if (SvUTF8(data.longest_float)) {
1980 r->float_utf8 = data.longest_float;
1981 r->float_substr = Nullsv;
1982 } else {
1983 r->float_substr = data.longest_float;
1984 r->float_utf8 = Nullsv;
1985 }
c277df42
IZ
1986 r->float_min_offset = data.offset_float_min;
1987 r->float_max_offset = data.offset_float_max;
cf93c79d
IZ
1988 t = (data.flags & SF_FL_BEFORE_EOL /* Can't have SEOL and MULTI */
1989 && (!(data.flags & SF_FL_BEFORE_MEOL)
e2509266 1990 || (RExC_flags & PMf_MULTILINE)));
33b8afdf 1991 fbm_compile(data.longest_float, t ? FBMcf_TAIL : 0);
a0ed51b3
LW
1992 }
1993 else {
aca2d497 1994 remove_float:
33b8afdf 1995 r->float_substr = r->float_utf8 = Nullsv;
c277df42 1996 SvREFCNT_dec(data.longest_float);
c5254dd6 1997 longest_float_length = 0;
a0d0e21e 1998 }
c277df42 1999
a0ed51b3 2000 longest_fixed_length = CHR_SVLEN(data.longest_fixed);
c5254dd6 2001 if (longest_fixed_length
c277df42
IZ
2002 || (data.flags & SF_FIX_BEFORE_EOL /* Cannot have SEOL and MULTI */
2003 && (!(data.flags & SF_FIX_BEFORE_MEOL)
e2509266 2004 || (RExC_flags & PMf_MULTILINE)))) {
cf93c79d
IZ
2005 int t;
2006
33b8afdf
JH
2007 if (SvUTF8(data.longest_fixed)) {
2008 r->anchored_utf8 = data.longest_fixed;
2009 r->anchored_substr = Nullsv;
2010 } else {
2011 r->anchored_substr = data.longest_fixed;
2012 r->anchored_utf8 = Nullsv;
2013 }
c277df42 2014 r->anchored_offset = data.offset_fixed;
cf93c79d
IZ
2015 t = (data.flags & SF_FIX_BEFORE_EOL /* Can't have SEOL and MULTI */
2016 && (!(data.flags & SF_FIX_BEFORE_MEOL)
e2509266 2017 || (RExC_flags & PMf_MULTILINE)));
33b8afdf 2018 fbm_compile(data.longest_fixed, t ? FBMcf_TAIL : 0);
a0ed51b3
LW
2019 }
2020 else {
33b8afdf 2021 r->anchored_substr = r->anchored_utf8 = Nullsv;
c277df42 2022 SvREFCNT_dec(data.longest_fixed);
c5254dd6 2023 longest_fixed_length = 0;
a0d0e21e 2024 }
b81d288d 2025 if (r->regstclass
ffc61ed2 2026 && (OP(r->regstclass) == REG_ANY || OP(r->regstclass) == SANY))
653099ff 2027 r->regstclass = NULL;
33b8afdf
JH
2028 if ((!(r->anchored_substr || r->anchored_utf8) || r->anchored_offset)
2029 && stclass_flag
653099ff 2030 && !(data.start_class->flags & ANYOF_EOS)
eb160463
GS
2031 && !cl_is_anything(data.start_class))
2032 {
228fe6e6 2033 const I32 n = add_data(pRExC_state, 1, "f");
653099ff 2034
cd7a8267 2035 Newx(RExC_rx->data->data[n], 1,
653099ff
GS
2036 struct regnode_charclass_class);
2037 StructCopy(data.start_class,
830247a4 2038 (struct regnode_charclass_class*)RExC_rx->data->data[n],
653099ff 2039 struct regnode_charclass_class);
830247a4 2040 r->regstclass = (regnode*)RExC_rx->data->data[n];
653099ff 2041 r->reganch &= ~ROPT_SKIP; /* Used in find_byclass(). */
ffc61ed2 2042 PL_regdata = r->data; /* for regprop() */
9c5ffd7c
JH
2043 DEBUG_r({ SV *sv = sv_newmortal();
2044 regprop(sv, (regnode*)data.start_class);
2045 PerlIO_printf(Perl_debug_log,
5332c881 2046 "synthetic stclass \"%s\".\n",
fdac8c4b 2047 SvPVX_const(sv));});
653099ff 2048 }
c277df42
IZ
2049
2050 /* A temporary algorithm prefers floated substr to fixed one to dig more info. */
c5254dd6 2051 if (longest_fixed_length > longest_float_length) {
c277df42 2052 r->check_substr = r->anchored_substr;
33b8afdf 2053 r->check_utf8 = r->anchored_utf8;
c277df42
IZ
2054 r->check_offset_min = r->check_offset_max = r->anchored_offset;
2055 if (r->reganch & ROPT_ANCH_SINGLE)
2056 r->reganch |= ROPT_NOSCAN;
a0ed51b3
LW
2057 }
2058 else {
c277df42 2059 r->check_substr = r->float_substr;
33b8afdf 2060 r->check_utf8 = r->float_utf8;
c277df42
IZ
2061 r->check_offset_min = data.offset_float_min;
2062 r->check_offset_max = data.offset_float_max;
a0d0e21e 2063 }
30382c73
IZ
2064 /* XXXX Currently intuiting is not compatible with ANCH_GPOS.
2065 This should be changed ASAP! */
33b8afdf 2066 if ((r->check_substr || r->check_utf8) && !(r->reganch & ROPT_ANCH_GPOS)) {
cad2e5aa 2067 r->reganch |= RE_USE_INTUIT;
33b8afdf 2068 if (SvTAIL(r->check_substr ? r->check_substr : r->check_utf8))
cad2e5aa
JH
2069 r->reganch |= RE_INTUIT_TAIL;
2070 }
a0ed51b3
LW
2071 }
2072 else {
c277df42
IZ
2073 /* Several toplevels. Best we can is to set minlen. */
2074 I32 fake;
653099ff 2075 struct regnode_charclass_class ch_class;
cb434fcc 2076 I32 last_close = 0;
c277df42
IZ
2077
2078 DEBUG_r(PerlIO_printf(Perl_debug_log, "\n"));
2079 scan = r->program + 1;
830247a4 2080 cl_init(pRExC_state, &ch_class);
653099ff 2081 data.start_class = &ch_class;
cb434fcc 2082 data.last_closep = &last_close;
e1901655 2083 minlen = study_chunk(pRExC_state, &scan, &fake, scan + RExC_size, &data, SCF_DO_STCLASS_AND|SCF_WHILEM_VISITED_POS);
33b8afdf
JH
2084 r->check_substr = r->check_utf8 = r->anchored_substr = r->anchored_utf8
2085 = r->float_substr = r->float_utf8 = Nullsv;
653099ff 2086 if (!(data.start_class->flags & ANYOF_EOS)
eb160463
GS
2087 && !cl_is_anything(data.start_class))
2088 {
228fe6e6 2089 const I32 n = add_data(pRExC_state, 1, "f");
653099ff 2090
cd7a8267 2091 Newx(RExC_rx->data->data[n], 1,
653099ff
GS
2092 struct regnode_charclass_class);
2093 StructCopy(data.start_class,
830247a4 2094 (struct regnode_charclass_class*)RExC_rx->data->data[n],
653099ff 2095 struct regnode_charclass_class);
830247a4 2096 r->regstclass = (regnode*)RExC_rx->data->data[n];
653099ff 2097 r->reganch &= ~ROPT_SKIP; /* Used in find_byclass(). */
9c5ffd7c
JH
2098 DEBUG_r({ SV* sv = sv_newmortal();
2099 regprop(sv, (regnode*)data.start_class);
2100 PerlIO_printf(Perl_debug_log,
5332c881 2101 "synthetic stclass \"%s\".\n",
fdac8c4b 2102 SvPVX_const(sv));});
653099ff 2103 }
a0d0e21e
LW
2104 }
2105
a0d0e21e 2106 r->minlen = minlen;
b81d288d 2107 if (RExC_seen & REG_SEEN_GPOS)
c277df42 2108 r->reganch |= ROPT_GPOS_SEEN;
830247a4 2109 if (RExC_seen & REG_SEEN_LOOKBEHIND)
c277df42 2110 r->reganch |= ROPT_LOOKBEHIND_SEEN;
830247a4 2111 if (RExC_seen & REG_SEEN_EVAL)
ce862d02 2112 r->reganch |= ROPT_EVAL_SEEN;
f33976b4
DB
2113 if (RExC_seen & REG_SEEN_CANY)
2114 r->reganch |= ROPT_CANY_SEEN;
cd7a8267
JC
2115 Newxz(r->startp, RExC_npar, I32);
2116 Newxz(r->endp, RExC_npar, I32);
ffc61ed2 2117 PL_regdata = r->data; /* for regprop() */
a0d0e21e
LW
2118 DEBUG_r(regdump(r));
2119 return(r);
a687059c
LW
2120}
2121
2122/*
2123 - reg - regular expression, i.e. main body or parenthesized thing
2124 *
2125 * Caller must absorb opening parenthesis.
2126 *
2127 * Combining parenthesis handling with the base level of regular expression
2128 * is a trifle forced, but the need to tie the tails of the branches to what
2129 * follows makes it hard to avoid.
2130 */
76e3520e 2131STATIC regnode *
830247a4 2132S_reg(pTHX_ RExC_state_t *pRExC_state, I32 paren, I32 *flagp)
c277df42 2133 /* paren: Parenthesized? 0=top, 1=(, inside: changed to letter. */
a687059c 2134{
c277df42
IZ
2135 register regnode *ret; /* Will be the head of the group. */
2136 register regnode *br;
2137 register regnode *lastbr;
2138 register regnode *ender = 0;
a0d0e21e 2139 register I32 parno = 0;
e2509266 2140 I32 flags, oregflags = RExC_flags, have_branch = 0, open = 0;
9d1d55b5
JP
2141
2142 /* for (?g), (?gc), and (?o) warnings; warning
2143 about (?c) will warn about (?g) -- japhy */
2144
2145 I32 wastedflags = 0x00,
2146 wasted_o = 0x01,
2147 wasted_g = 0x02,
2148 wasted_gc = 0x02 | 0x04,
2149 wasted_c = 0x04;
2150
fac92740 2151 char * parse_start = RExC_parse; /* MJD */
e4057cfc 2152 char * const oregcomp_parse = RExC_parse;
c277df42 2153 char c;
a0d0e21e 2154
821b33a5 2155 *flagp = 0; /* Tentatively. */
a0d0e21e 2156
9d1d55b5 2157
a0d0e21e
LW
2158 /* Make an OPEN node, if parenthesized. */
2159 if (paren) {
fac92740 2160 if (*RExC_parse == '?') { /* (?...) */
2b36a5a0
JH
2161 U32 posflags = 0, negflags = 0;
2162 U32 *flagsp = &posflags;
0f5d15d6 2163 int logical = 0;
e4057cfc 2164 const char * const seqstart = RExC_parse;
ca9dfc88 2165
830247a4
IZ
2166 RExC_parse++;
2167 paren = *RExC_parse++;
c277df42 2168 ret = NULL; /* For look-ahead/behind. */
a0d0e21e 2169 switch (paren) {
fac92740 2170 case '<': /* (?<...) */
830247a4 2171 RExC_seen |= REG_SEEN_LOOKBEHIND;
b81d288d 2172 if (*RExC_parse == '!')
c277df42 2173 paren = ',';
b81d288d 2174 if (*RExC_parse != '=' && *RExC_parse != '!')
c277df42 2175 goto unknown;
830247a4 2176 RExC_parse++;
fac92740
MJD
2177 case '=': /* (?=...) */
2178 case '!': /* (?!...) */
830247a4 2179 RExC_seen_zerolen++;
fac92740
MJD
2180 case ':': /* (?:...) */
2181 case '>': /* (?>...) */
a0d0e21e 2182 break;
fac92740
MJD
2183 case '$': /* (?$...) */
2184 case '@': /* (?@...) */
8615cb43 2185 vFAIL2("Sequence (?%c...) not implemented", (int)paren);
a0d0e21e 2186 break;
fac92740 2187 case '#': /* (?#...) */
830247a4
IZ
2188 while (*RExC_parse && *RExC_parse != ')')
2189 RExC_parse++;
2190 if (*RExC_parse != ')')
c277df42 2191 FAIL("Sequence (?#... not terminated");
830247a4 2192 nextchar(pRExC_state);
a0d0e21e
LW
2193 *flagp = TRYAGAIN;
2194 return NULL;
fac92740 2195 case 'p': /* (?p...) */
9014280d 2196 if (SIZE_ONLY && ckWARN2(WARN_DEPRECATED, WARN_REGEXP))
f9373011 2197 vWARNdep(RExC_parse, "(?p{}) is deprecated - use (??{})");
8c8ad484 2198 /* FALL THROUGH*/
fac92740 2199 case '?': /* (??...) */
0f5d15d6 2200 logical = 1;
438a3801
YST
2201 if (*RExC_parse != '{')
2202 goto unknown;
830247a4 2203 paren = *RExC_parse++;
0f5d15d6 2204 /* FALL THROUGH */
fac92740 2205 case '{': /* (?{...}) */
c277df42 2206 {
c277df42
IZ
2207 I32 count = 1, n = 0;
2208 char c;
830247a4 2209 char *s = RExC_parse;
c277df42
IZ
2210 SV *sv;
2211 OP_4tree *sop, *rop;
2212
830247a4
IZ
2213 RExC_seen_zerolen++;
2214 RExC_seen |= REG_SEEN_EVAL;
2215 while (count && (c = *RExC_parse)) {
2216 if (c == '\\' && RExC_parse[1])
2217 RExC_parse++;
b81d288d 2218 else if (c == '{')
c277df42 2219 count++;
b81d288d 2220 else if (c == '}')
c277df42 2221 count--;
830247a4 2222 RExC_parse++;
c277df42 2223 }
830247a4 2224 if (*RExC_parse != ')')
b45f050a 2225 {
b81d288d 2226 RExC_parse = s;
b45f050a
JF
2227 vFAIL("Sequence (?{...}) not terminated or not {}-balanced");
2228 }
c277df42 2229 if (!SIZE_ONLY) {
d7afa7f5 2230 PAD *pad;
b81d288d
AB
2231
2232 if (RExC_parse - 1 - s)
830247a4 2233 sv = newSVpvn(s, RExC_parse - 1 - s);
c277df42 2234 else
79cb57f6 2235 sv = newSVpvn("", 0);
c277df42 2236
569233ed
SB
2237 ENTER;
2238 Perl_save_re_context(aTHX);
d7afa7f5 2239 rop = sv_compile_2op(sv, &sop, "re", &pad);
9b978d73
DM
2240 sop->op_private |= OPpREFCOUNTED;
2241 /* re_dup will OpREFCNT_inc */
2242 OpREFCNT_set(sop, 1);
569233ed 2243 LEAVE;
c277df42 2244
830247a4
IZ
2245 n = add_data(pRExC_state, 3, "nop");
2246 RExC_rx->data->data[n] = (void*)rop;
2247 RExC_rx->data->data[n+1] = (void*)sop;
d7afa7f5 2248 RExC_rx->data->data[n+2] = (void*)pad;
c277df42 2249 SvREFCNT_dec(sv);
a0ed51b3 2250 }
e24b16f9 2251 else { /* First pass */
830247a4 2252 if (PL_reginterp_cnt < ++RExC_seen_evals
ef7b71f0 2253 && IN_PERL_RUNTIME)
2cd61cdb
IZ
2254 /* No compiled RE interpolated, has runtime
2255 components ===> unsafe. */
2256 FAIL("Eval-group not allowed at runtime, use re 'eval'");
5b61d3f7 2257 if (PL_tainting && PL_tainted)
cc6b7395 2258 FAIL("Eval-group in insecure regular expression");
c277df42
IZ
2259 }
2260
830247a4 2261 nextchar(pRExC_state);
0f5d15d6 2262 if (logical) {
830247a4 2263 ret = reg_node(pRExC_state, LOGICAL);
0f5d15d6
IZ
2264 if (!SIZE_ONLY)
2265 ret->flags = 2;
830247a4 2266 regtail(pRExC_state, ret, reganode(pRExC_state, EVAL, n));
fac92740 2267 /* deal with the length of this later - MJD */
0f5d15d6
IZ
2268 return ret;
2269 }
43af864e
JH
2270 ret = reganode(pRExC_state, EVAL, n);
2271 Set_Node_Length(ret, RExC_parse - parse_start + 1);
2272 Set_Node_Offset(ret, parse_start);
2273 return ret;
c277df42 2274 }
fac92740 2275 case '(': /* (?(?{...})...) and (?(?=...)...) */
c277df42 2276 {
fac92740 2277 if (RExC_parse[0] == '?') { /* (?(?...)) */
b81d288d
AB
2278 if (RExC_parse[1] == '=' || RExC_parse[1] == '!'
2279 || RExC_parse[1] == '<'
830247a4 2280 || RExC_parse[1] == '{') { /* Lookahead or eval. */
c277df42
IZ
2281 I32 flag;
2282
830247a4 2283 ret = reg_node(pRExC_state, LOGICAL);
0f5d15d6
IZ
2284 if (!SIZE_ONLY)
2285 ret->flags = 1;
830247a4 2286 regtail(pRExC_state, ret, reg(pRExC_state, 1, &flag));
c277df42 2287 goto insert_if;
b81d288d 2288 }
a0ed51b3 2289 }
830247a4 2290 else if (RExC_parse[0] >= '1' && RExC_parse[0] <= '9' ) {
fac92740 2291 /* (?(1)...) */
830247a4 2292 parno = atoi(RExC_parse++);
c277df42 2293
830247a4
IZ
2294 while (isDIGIT(*RExC_parse))
2295 RExC_parse++;
fac92740 2296 ret = reganode(pRExC_state, GROUPP, parno);
e503e849 2297
830247a4 2298 if ((c = *nextchar(pRExC_state)) != ')')
b45f050a 2299 vFAIL("Switch condition not recognized");
c277df42 2300 insert_if:
830247a4
IZ
2301 regtail(pRExC_state, ret, reganode(pRExC_state, IFTHEN, 0));
2302 br = regbranch(pRExC_state, &flags, 1);
c277df42 2303 if (br == NULL)
830247a4 2304 br = reganode(pRExC_state, LONGJMP, 0);
c277df42 2305 else
830247a4
IZ
2306 regtail(pRExC_state, br, reganode(pRExC_state, LONGJMP, 0));
2307 c = *nextchar(pRExC_state);
d1b80229
IZ
2308 if (flags&HASWIDTH)
2309 *flagp |= HASWIDTH;
c277df42 2310 if (c == '|') {
830247a4
IZ
2311 lastbr = reganode(pRExC_state, IFTHEN, 0); /* Fake one for optimizer. */
2312 regbranch(pRExC_state, &flags, 1);
2313 regtail(pRExC_state, ret, lastbr);
d1b80229
IZ
2314 if (flags&HASWIDTH)
2315 *flagp |= HASWIDTH;
830247a4 2316 c = *nextchar(pRExC_state);
a0ed51b3
LW
2317 }
2318 else
c277df42
IZ
2319 lastbr = NULL;
2320 if (c != ')')
8615cb43 2321 vFAIL("Switch (?(condition)... contains too many branches");
830247a4
IZ
2322 ender = reg_node(pRExC_state, TAIL);
2323 regtail(pRExC_state, br, ender);
c277df42 2324 if (lastbr) {
830247a4
IZ
2325 regtail(pRExC_state, lastbr, ender);
2326 regtail(pRExC_state, NEXTOPER(NEXTOPER(lastbr)), ender);
a0ed51b3
LW
2327 }
2328 else
830247a4 2329 regtail(pRExC_state, ret, ender);
c277df42 2330 return ret;
a0ed51b3
LW
2331 }
2332 else {
830247a4 2333 vFAIL2("Unknown switch condition (?(%.2s", RExC_parse);
c277df42
IZ
2334 }
2335 }
1b1626e4 2336 case 0:
830247a4 2337 RExC_parse--; /* for vFAIL to print correctly */
8615cb43 2338 vFAIL("Sequence (? incomplete");
1b1626e4 2339 break;
a0d0e21e 2340 default:
830247a4 2341 --RExC_parse;
fac92740 2342 parse_flags: /* (?i) */
830247a4 2343 while (*RExC_parse && strchr("iogcmsx", *RExC_parse)) {
9d1d55b5
JP
2344 /* (?g), (?gc) and (?o) are useless here
2345 and must be globally applied -- japhy */
2346
2347 if (*RExC_parse == 'o' || *RExC_parse == 'g') {
2348 if (SIZE_ONLY && ckWARN(WARN_REGEXP)) {
2349 I32 wflagbit = *RExC_parse == 'o' ? wasted_o : wasted_g;
2350 if (! (wastedflags & wflagbit) ) {
2351 wastedflags |= wflagbit;
2352 vWARN5(
2353 RExC_parse + 1,
2354 "Useless (%s%c) - %suse /%c modifier",
2355 flagsp == &negflags ? "?-" : "?",
2356 *RExC_parse,
2357 flagsp == &negflags ? "don't " : "",
2358 *RExC_parse
2359 );
2360 }
2361 }
2362 }
2363 else if (*RExC_parse == 'c') {
2364 if (SIZE_ONLY && ckWARN(WARN_REGEXP)) {
2365 if (! (wastedflags & wasted_c) ) {
2366 wastedflags |= wasted_gc;
2367 vWARN3(
2368 RExC_parse + 1,
2369 "Useless (%sc) - %suse /gc modifier",
2370 flagsp == &negflags ? "?-" : "?",
2371 flagsp == &negflags ? "don't " : ""
2372 );
2373 }
2374 }
2375 }
2376 else { pmflag(flagsp, *RExC_parse); }
2377
830247a4 2378 ++RExC_parse;
ca9dfc88 2379 }
830247a4 2380 if (*RExC_parse == '-') {
ca9dfc88 2381 flagsp = &negflags;
9d1d55b5 2382 wastedflags = 0; /* reset so (?g-c) warns twice */
830247a4 2383 ++RExC_parse;
ca9dfc88 2384 goto parse_flags;
48c036b1 2385 }
e2509266
JH
2386 RExC_flags |= posflags;
2387 RExC_flags &= ~negflags;
830247a4
IZ
2388 if (*RExC_parse == ':') {
2389 RExC_parse++;
ca9dfc88
IZ
2390 paren = ':';
2391 break;
2392 }
c277df42 2393 unknown:
830247a4
IZ
2394 if (*RExC_parse != ')') {
2395 RExC_parse++;
2396 vFAIL3("Sequence (%.*s...) not recognized", RExC_parse-seqstart, seqstart);
b45f050a 2397 }
830247a4 2398 nextchar(pRExC_state);
a0d0e21e
LW
2399 *flagp = TRYAGAIN;
2400 return NULL;
2401 }
2402 }
fac92740 2403 else { /* (...) */
830247a4
IZ
2404 parno = RExC_npar;
2405 RExC_npar++;
2406 ret = reganode(pRExC_state, OPEN, parno);
fac92740
MJD
2407 Set_Node_Length(ret, 1); /* MJD */
2408 Set_Node_Offset(ret, RExC_parse); /* MJD */
c277df42 2409 open = 1;
a0d0e21e 2410 }
a0ed51b3 2411 }
fac92740 2412 else /* ! paren */
a0d0e21e
LW
2413 ret = NULL;
2414
2415 /* Pick up the branches, linking them together. */
fac92740 2416 parse_start = RExC_parse; /* MJD */
830247a4 2417 br = regbranch(pRExC_state, &flags, 1);
fac92740 2418 /* branch_len = (paren != 0); */
e503e849 2419
a0d0e21e
LW
2420 if (br == NULL)
2421 return(NULL);
830247a4
IZ
2422 if (*RExC_parse == '|') {
2423 if (!SIZE_ONLY && RExC_extralen) {
2424 reginsert(pRExC_state, BRANCHJ, br);
a0ed51b3 2425 }
fac92740 2426 else { /* MJD */
830247a4 2427 reginsert(pRExC_state, BRANCH, br);
fac92740
MJD
2428 Set_Node_Length(br, paren != 0);
2429 Set_Node_Offset_To_R(br-RExC_emit_start, parse_start-RExC_start);
2430 }
c277df42
IZ
2431 have_branch = 1;
2432 if (SIZE_ONLY)
830247a4 2433 RExC_extralen += 1; /* For BRANCHJ-BRANCH. */
a0ed51b3
LW
2434 }
2435 else if (paren == ':') {
c277df42
IZ
2436 *flagp |= flags&SIMPLE;
2437 }
2438 if (open) { /* Starts with OPEN. */
830247a4 2439 regtail(pRExC_state, ret, br); /* OPEN -> first. */
a0ed51b3
LW
2440 }
2441 else if (paren != '?') /* Not Conditional */
a0d0e21e 2442 ret = br;
32a0ca98 2443 *flagp |= flags & (SPSTART | HASWIDTH);
c277df42 2444 lastbr = br;
830247a4
IZ
2445 while (*RExC_parse == '|') {
2446 if (!SIZE_ONLY && RExC_extralen) {
2447 ender = reganode(pRExC_state, LONGJMP,0);
2448 regtail(pRExC_state, NEXTOPER(NEXTOPER(lastbr)), ender); /* Append to the previous. */
c277df42
IZ
2449 }
2450 if (SIZE_ONLY)
830247a4
IZ
2451 RExC_extralen += 2; /* Account for LONGJMP. */
2452 nextchar(pRExC_state);
2453 br = regbranch(pRExC_state, &flags, 0);
e503e849 2454
a687059c 2455 if (br == NULL)
a0d0e21e 2456 return(NULL);
830247a4 2457 regtail(pRExC_state, lastbr, br); /* BRANCH -> BRANCH. */
c277df42 2458 lastbr = br;
821b33a5
IZ
2459 if (flags&HASWIDTH)
2460 *flagp |= HASWIDTH;
a687059c 2461 *flagp |= flags&SPSTART;
a0d0e21e
LW
2462 }
2463
c277df42
IZ
2464 if (have_branch || paren != ':') {
2465 /* Make a closing node, and hook it on the end. */
2466 switch (paren) {
2467 case ':':
830247a4 2468 ender = reg_node(pRExC_state, TAIL);
c277df42
IZ
2469 break;
2470 case 1:
830247a4 2471 ender = reganode(pRExC_state, CLOSE, parno);
fac92740
MJD
2472 Set_Node_Offset(ender,RExC_parse+1); /* MJD */
2473 Set_Node_Length(ender,1); /* MJD */
c277df42
IZ
2474 break;
2475 case '<':
c277df42
IZ
2476 case ',':
2477 case '=':
2478 case '!':
c277df42 2479 *flagp &= ~HASWIDTH;
821b33a5
IZ
2480 /* FALL THROUGH */
2481 case '>':
830247a4 2482 ender = reg_node(pRExC_state, SUCCEED);
c277df42
IZ
2483 break;
2484 case 0:
830247a4 2485 ender = reg_node(pRExC_state, END);
c277df42
IZ
2486 break;
2487 }
830247a4 2488 regtail(pRExC_state, lastbr, ender);
a0d0e21e 2489
c277df42
IZ
2490 if (have_branch) {
2491 /* Hook the tails of the branches to the closing node. */
2492 for (br = ret; br != NULL; br = regnext(br)) {
830247a4 2493 regoptail(pRExC_state, br, ender);
c277df42
IZ
2494 }
2495 }
a0d0e21e 2496 }
c277df42
IZ
2497
2498 {
c05e0e2f
AL
2499 const char *p;
2500 static const char parens[] = "=!<,>";
c277df42
IZ
2501
2502 if (paren && (p = strchr(parens, paren))) {
eb160463 2503 U8 node = ((p - parens) % 2) ? UNLESSM : IFMATCH;
c277df42
IZ
2504 int flag = (p - parens) > 1;
2505
2506 if (paren == '>')
2507 node = SUSPEND, flag = 0;
830247a4 2508 reginsert(pRExC_state, node,ret);
4a04c497
NC
2509 Set_Node_Cur_Length(ret);
2510 Set_Node_Offset(ret, parse_start + 1);
c277df42 2511 ret->flags = flag;
830247a4 2512 regtail(pRExC_state, ret, reg_node(pRExC_state, TAIL));
c277df42 2513 }
a0d0e21e
LW
2514 }
2515
2516 /* Check for proper termination. */
ce3e6498 2517 if (paren) {
e2509266 2518 RExC_flags = oregflags;
830247a4
IZ
2519 if (RExC_parse >= RExC_end || *nextchar(pRExC_state) != ')') {
2520 RExC_parse = oregcomp_parse;
380a0633 2521 vFAIL("Unmatched (");
ce3e6498 2522 }
a0ed51b3 2523 }
830247a4
IZ
2524 else if (!paren && RExC_parse < RExC_end) {
2525 if (*RExC_parse == ')') {
2526 RExC_parse++;
380a0633 2527 vFAIL("Unmatched )");
a0ed51b3
LW
2528 }
2529 else
b45f050a 2530 FAIL("Junk on end of regexp"); /* "Can't happen". */
a0d0e21e
LW
2531 /* NOTREACHED */
2532 }
a687059c 2533
a0d0e21e 2534 return(ret);
a687059c
LW
2535}
2536
2537/*
2538 - regbranch - one alternative of an | operator
2539 *
2540 * Implements the concatenation operator.
2541 */
76e3520e 2542STATIC regnode *
830247a4 2543S_regbranch(pTHX_ RExC_state_t *pRExC_state, I32 *flagp, I32 first)
a687059c 2544{
c277df42
IZ
2545 register regnode *ret;
2546 register regnode *chain = NULL;
2547 register regnode *latest;
2548 I32 flags = 0, c = 0;
a0d0e21e 2549
b81d288d 2550 if (first)
c277df42
IZ
2551 ret = NULL;
2552 else {
b81d288d 2553 if (!SIZE_ONLY && RExC_extralen)
830247a4 2554 ret = reganode(pRExC_state, BRANCHJ,0);
fac92740 2555 else {
830247a4 2556 ret = reg_node(pRExC_state, BRANCH);
fac92740
MJD
2557 Set_Node_Length(ret, 1);
2558 }
c277df42
IZ
2559 }
2560
b81d288d 2561 if (!first && SIZE_ONLY)
830247a4 2562 RExC_extralen += 1; /* BRANCHJ */
b81d288d 2563
c277df42 2564 *flagp = WORST; /* Tentatively. */
a0d0e21e 2565
830247a4
IZ
2566 RExC_parse--;
2567 nextchar(pRExC_state);
2568 while (RExC_parse < RExC_end && *RExC_parse != '|' && *RExC_parse != ')') {
a0d0e21e 2569 flags &= ~TRYAGAIN;
830247a4 2570 latest = regpiece(pRExC_state, &flags);
a0d0e21e
LW
2571 if (latest == NULL) {
2572 if (flags & TRYAGAIN)
2573 continue;
2574 return(NULL);
a0ed51b3
LW
2575 }
2576 else if (ret == NULL)
c277df42 2577 ret = latest;
a0d0e21e 2578 *flagp |= flags&HASWIDTH;
c277df42 2579 if (chain == NULL) /* First piece. */
a0d0e21e
LW
2580 *flagp |= flags&SPSTART;
2581 else {
830247a4
IZ
2582 RExC_naughty++;
2583 regtail(pRExC_state, chain, latest);
a687059c 2584 }
a0d0e21e 2585 chain = latest;
c277df42
IZ
2586 c++;
2587 }
2588 if (chain == NULL) { /* Loop ran zero times. */
830247a4 2589 chain = reg_node(pRExC_state, NOTHING);
c277df42
IZ
2590 if (ret == NULL)
2591 ret = chain;
2592 }
2593 if (c == 1) {
2594 *flagp |= flags&SIMPLE;
a0d0e21e 2595 }
a687059c 2596
a0d0e21e 2597 return(ret);
a687059c
LW
2598}
2599
2600/*
2601 - regpiece - something followed by possible [*+?]
2602 *
2603 * Note that the branching code sequences used for ? and the general cases
2604 * of * and + are somewhat optimized: they use the same NOTHING node as
2605 * both the endmarker for their branch list and the body of the last branch.
2606 * It might seem that this node could be dispensed with entirely, but the
2607 * endmarker role is not redundant.
2608 */
76e3520e 2609STATIC regnode *
830247a4 2610S_regpiece(pTHX_ RExC_state_t *pRExC_state, I32 *flagp)
a687059c 2611{
c277df42 2612 register regnode *ret;
a0d0e21e
LW
2613 register char op;
2614 register char *next;
2615 I32 flags;
228fe6e6 2616 const char * const origparse = RExC_parse;
a0d0e21e
LW
2617 char *maxpos;
2618 I32 min;
c277df42 2619 I32 max = REG_INFTY;
fac92740 2620 char *parse_start;
a0d0e21e 2621
830247a4 2622 ret = regatom(pRExC_state, &flags);
a0d0e21e
LW
2623 if (ret == NULL) {
2624 if (flags & TRYAGAIN)
2625 *flagp |= TRYAGAIN;
2626 return(NULL);
2627 }
2628
830247a4 2629 op = *RExC_parse;
a0d0e21e 2630
830247a4 2631 if (op == '{' && regcurly(RExC_parse)) {
fac92740 2632 parse_start = RExC_parse; /* MJD */
830247a4 2633 next = RExC_parse + 1;
a0d0e21e
LW
2634 maxpos = Nullch;
2635 while (isDIGIT(*next) || *next == ',') {
2636 if (*next == ',') {
2637 if (maxpos)
2638 break;
2639 else
2640 maxpos = next;
a687059c 2641 }
a0d0e21e
LW
2642 next++;
2643 }
2644 if (*next == '}') { /* got one */
2645 if (!maxpos)
2646 maxpos = next;
830247a4
IZ
2647 RExC_parse++;
2648 min = atoi(RExC_parse);
a0d0e21e
LW
2649 if (*maxpos == ',')
2650 maxpos++;
2651 else
830247a4 2652 maxpos = RExC_parse;
a0d0e21e
LW
2653 max = atoi(maxpos);
2654 if (!max && *maxpos != '0')
c277df42
IZ
2655 max = REG_INFTY; /* meaning "infinity" */
2656 else if (max >= REG_INFTY)
8615cb43 2657 vFAIL2("Quantifier in {,} bigger than %d", REG_INFTY - 1);
830247a4
IZ
2658 RExC_parse = next;
2659 nextchar(pRExC_state);
a0d0e21e
LW
2660
2661 do_curly:
2662 if ((flags&SIMPLE)) {
830247a4
IZ
2663 RExC_naughty += 2 + RExC_naughty / 2;
2664 reginsert(pRExC_state, CURLY, ret);
fac92740
MJD
2665 Set_Node_Offset(ret, parse_start+1); /* MJD */
2666 Set_Node_Cur_Length(ret);
a0d0e21e
LW
2667 }
2668 else {
830247a4 2669 regnode *w = reg_node(pRExC_state, WHILEM);
2c2d71f5
JH
2670
2671 w->flags = 0;
830247a4
IZ
2672 regtail(pRExC_state, ret, w);
2673 if (!SIZE_ONLY && RExC_extralen) {
2674 reginsert(pRExC_state, LONGJMP,ret);
2675 reginsert(pRExC_state, NOTHING,ret);
c277df42
IZ
2676 NEXT_OFF(ret) = 3; /* Go over LONGJMP. */
2677 }
830247a4 2678 reginsert(pRExC_state, CURLYX,ret);
fac92740
MJD
2679 /* MJD hk */
2680 Set_Node_Offset(ret, parse_start+1);
e503e849 2681 Set_Node_Length(ret,
fac92740 2682 op == '{' ? (RExC_parse - parse_start) : 1);
e503e849 2683
830247a4 2684 if (!SIZE_ONLY && RExC_extralen)
c277df42 2685 NEXT_OFF(ret) = 3; /* Go over NOTHING to LONGJMP. */
830247a4 2686 regtail(pRExC_state, ret, reg_node(pRExC_state, NOTHING));
c277df42 2687 if (SIZE_ONLY)
830247a4
IZ
2688 RExC_whilem_seen++, RExC_extralen += 3;
2689 RExC_naughty += 4 + RExC_naughty; /* compound interest */
a0d0e21e 2690 }
c277df42 2691 ret->flags = 0;
a0d0e21e
LW
2692
2693 if (min > 0)
821b33a5
IZ
2694 *flagp = WORST;
2695 if (max > 0)
2696 *flagp |= HASWIDTH;
a0d0e21e 2697 if (max && max < min)
8615cb43 2698 vFAIL("Can't do {n,m} with n > m");
c277df42 2699 if (!SIZE_ONLY) {
eb160463
GS
2700 ARG1_SET(ret, (U16)min);
2701 ARG2_SET(ret, (U16)max);
a687059c 2702 }
a687059c 2703
a0d0e21e 2704 goto nest_check;
a687059c 2705 }
a0d0e21e 2706 }
a687059c 2707
a0d0e21e
LW
2708 if (!ISMULT1(op)) {
2709 *flagp = flags;
a687059c 2710 return(ret);
a0d0e21e 2711 }
bb20fd44 2712
c277df42 2713#if 0 /* Now runtime fix should be reliable. */
b45f050a
JF
2714
2715 /* if this is reinstated, don't forget to put this back into perldiag:
2716
2717 =item Regexp *+ operand could be empty at {#} in regex m/%s/
2718
2719 (F) The part of the regexp subject to either the * or + quantifier
2720 could match an empty string. The {#} shows in the regular
2721 expression about where the problem was discovered.
2722
2723 */
2724
bb20fd44 2725 if (!(flags&HASWIDTH) && op != '?')
b45f050a 2726 vFAIL("Regexp *+ operand could be empty");
b81d288d 2727#endif
bb20fd44 2728
fac92740 2729 parse_start = RExC_parse;
830247a4 2730 nextchar(pRExC_state);
a0d0e21e 2731
821b33a5 2732 *flagp = (op != '+') ? (WORST|SPSTART|HASWIDTH) : (WORST|HASWIDTH);
a0d0e21e
LW
2733
2734 if (op == '*' && (flags&SIMPLE)) {
830247a4 2735 reginsert(pRExC_state, STAR, ret);
c277df42 2736 ret->flags = 0;
830247a4 2737 RExC_naughty += 4;
a0d0e21e
LW
2738 }
2739 else if (op == '*') {
2740 min = 0;
2741 goto do_curly;
a0ed51b3
LW
2742 }
2743 else if (op == '+' && (flags&SIMPLE)) {
830247a4 2744 reginsert(pRExC_state, PLUS, ret);
c277df42 2745 ret->flags = 0;
830247a4 2746 RExC_naughty += 3;
a0d0e21e
LW
2747 }
2748 else if (op == '+') {
2749 min = 1;
2750 goto do_curly;
a0ed51b3
LW
2751 }
2752 else if (op == '?') {
a0d0e21e
LW
2753 min = 0; max = 1;
2754 goto do_curly;
2755 }
2756 nest_check:
f5e9f069 2757 if (!SIZE_ONLY && !(flags&HASWIDTH) && max > REG_INFTY/3 && ckWARN(WARN_REGEXP)) {
830247a4 2758 vWARN3(RExC_parse,
b45f050a 2759 "%.*s matches null string many times",
830247a4 2760 RExC_parse - origparse,
b45f050a 2761 origparse);
a0d0e21e
LW
2762 }
2763
830247a4
IZ
2764 if (*RExC_parse == '?') {
2765 nextchar(pRExC_state);
2766 reginsert(pRExC_state, MINMOD, ret);
2767 regtail(pRExC_state, ret, ret + NODE_STEP_REGNODE);
a0d0e21e 2768 }
830247a4
IZ
2769 if (ISMULT2(RExC_parse)) {
2770 RExC_parse++;
b45f050a
JF
2771 vFAIL("Nested quantifiers");
2772 }
a0d0e21e
LW
2773
2774 return(ret);
a687059c
LW
2775}
2776
2777/*
2778 - regatom - the lowest level
2779 *
2780 * Optimization: gobbles an entire sequence of ordinary characters so that
2781 * it can turn them into a single node, which is smaller to store and
2782 * faster to run. Backslashed characters are exceptions, each becoming a
2783 * separate node; the code is simpler that way and it's not worth fixing.
2784 *
b45f050a 2785 * [Yes, it is worth fixing, some scripts can run twice the speed.] */
76e3520e 2786STATIC regnode *
830247a4 2787S_regatom(pTHX_ RExC_state_t *pRExC_state, I32 *flagp)
a687059c 2788{
c277df42 2789 register regnode *ret = 0;
a0d0e21e 2790 I32 flags;
4a04c497 2791 char *parse_start = RExC_parse;
a0d0e21e
LW
2792
2793 *flagp = WORST; /* Tentatively. */
2794
2795tryagain:
830247a4 2796 switch (*RExC_parse) {
a0d0e21e 2797 case '^':
830247a4
IZ
2798 RExC_seen_zerolen++;
2799 nextchar(pRExC_state);
e2509266 2800 if (RExC_flags & PMf_MULTILINE)
830247a4 2801 ret = reg_node(pRExC_state, MBOL);
e2509266 2802 else if (RExC_flags & PMf_SINGLELINE)
830247a4 2803 ret = reg_node(pRExC_state, SBOL);
a0d0e21e 2804 else
830247a4 2805 ret = reg_node(pRExC_state, BOL);
fac92740 2806 Set_Node_Length(ret, 1); /* MJD */
a0d0e21e
LW
2807 break;
2808 case '$':
830247a4 2809 nextchar(pRExC_state);
b81d288d 2810 if (*RExC_parse)
830247a4 2811 RExC_seen_zerolen++;
e2509266 2812 if (RExC_flags & PMf_MULTILINE)
830247a4 2813 ret = reg_node(pRExC_state, MEOL);
e2509266 2814 else if (RExC_flags & PMf_SINGLELINE)
830247a4 2815 ret = reg_node(pRExC_state, SEOL);
a0d0e21e 2816 else
830247a4 2817 ret = reg_node(pRExC_state, EOL);
fac92740 2818 Set_Node_Length(ret, 1); /* MJD */
a0d0e21e
LW
2819 break;
2820 case '.':
830247a4 2821 nextchar(pRExC_state);
e2509266 2822 if (RExC_flags & PMf_SINGLELINE)
ffc61ed2
JH
2823 ret = reg_node(pRExC_state, SANY);
2824 else
2825 ret = reg_node(pRExC_state, REG_ANY);
2826 *flagp |= HASWIDTH|SIMPLE;
830247a4 2827 RExC_naughty++;
fac92740 2828 Set_Node_Length(ret, 1); /* MJD */
a0d0e21e
LW
2829 break;
2830 case '[':
b45f050a 2831 {
830247a4 2832 char *oregcomp_parse = ++RExC_parse;
ffc61ed2 2833 ret = regclass(pRExC_state);
830247a4
IZ
2834 if (*RExC_parse != ']') {
2835 RExC_parse = oregcomp_parse;
b45f050a
JF
2836 vFAIL("Unmatched [");
2837 }
830247a4 2838 nextchar(pRExC_state);
a0d0e21e 2839 *flagp |= HASWIDTH|SIMPLE;
fac92740 2840 Set_Node_Length(ret, RExC_parse - oregcomp_parse + 1); /* MJD */
a0d0e21e 2841 break;
b45f050a 2842 }
a0d0e21e 2843 case '(':
830247a4
IZ
2844 nextchar(pRExC_state);
2845 ret = reg(pRExC_state, 1, &flags);
a0d0e21e 2846 if (ret == NULL) {
bf93d4cc 2847 if (flags & TRYAGAIN) {
830247a4 2848 if (RExC_parse == RExC_end) {
bf93d4cc
GS
2849 /* Make parent create an empty node if needed. */
2850 *flagp |= TRYAGAIN;
2851 return(NULL);
2852 }
a0d0e21e 2853 goto tryagain;
bf93d4cc 2854 }
a0d0e21e
LW
2855 return(NULL);
2856 }
c277df42 2857 *flagp |= flags&(HASWIDTH|SPSTART|SIMPLE);
a0d0e21e
LW
2858 break;
2859 case '|':
2860 case ')':
2861 if (flags & TRYAGAIN) {
2862 *flagp |= TRYAGAIN;
2863 return NULL;
2864 }
b45f050a 2865 vFAIL("Internal urp");
a0d0e21e
LW
2866 /* Supposed to be caught earlier. */
2867 break;
85afd4ae 2868 case '{':
830247a4
IZ
2869 if (!regcurly(RExC_parse)) {
2870 RExC_parse++;
85afd4ae
CS
2871 goto defchar;
2872 }
2873 /* FALL THROUGH */
a0d0e21e
LW
2874 case '?':
2875 case '+':
2876 case '*':
830247a4 2877 RExC_parse++;
b45f050a 2878 vFAIL("Quantifier follows nothing");
a0d0e21e
LW
2879 break;
2880 case '\\':
830247a4 2881 switch (*++RExC_parse) {
a0d0e21e 2882 case 'A':
830247a4
IZ
2883 RExC_seen_zerolen++;
2884 ret = reg_node(pRExC_state, SBOL);
a0d0e21e 2885 *flagp |= SIMPLE;
830247a4 2886 nextchar(pRExC_state);
fac92740 2887 Set_Node_Length(ret, 2); /* MJD */
a0d0e21e
LW
2888 break;
2889 case 'G':
830247a4
IZ
2890 ret = reg_node(pRExC_state, GPOS);
2891 RExC_seen |= REG_SEEN_GPOS;
a0d0e21e 2892 *flagp |= SIMPLE;
830247a4 2893 nextchar(pRExC_state);
fac92740 2894 Set_Node_Length(ret, 2); /* MJD */
a0d0e21e
LW
2895 break;
2896 case 'Z':
830247a4 2897 ret = reg_node(pRExC_state, SEOL);
a0d0e21e 2898 *flagp |= SIMPLE;
a1917ab9 2899 RExC_seen_zerolen++; /* Do not optimize RE away */
830247a4 2900 nextchar(pRExC_state);
a0d0e21e 2901 break;
b85d18e9 2902 case 'z':
830247a4 2903 ret = reg_node(pRExC_state, EOS);
b85d18e9 2904 *flagp |= SIMPLE;
830247a4
IZ
2905 RExC_seen_zerolen++; /* Do not optimize RE away */
2906 nextchar(pRExC_state);
fac92740 2907 Set_Node_Length(ret, 2); /* MJD */
b85d18e9 2908 break;
4a2d328f 2909 case 'C':
f33976b4
DB
2910 ret = reg_node(pRExC_state, CANY);
2911 RExC_seen |= REG_SEEN_CANY;
a0ed51b3 2912 *flagp |= HASWIDTH|SIMPLE;
830247a4 2913 nextchar(pRExC_state);
fac92740 2914 Set_Node_Length(ret, 2); /* MJD */
a0ed51b3
LW
2915 break;
2916 case 'X':
830247a4 2917 ret = reg_node(pRExC_state, CLUMP);
a0ed51b3 2918 *flagp |= HASWIDTH;
830247a4 2919 nextchar(pRExC_state);
fac92740 2920 Set_Node_Length(ret, 2); /* MJD */
a0ed51b3 2921 break;
a0d0e21e 2922 case 'w':
eb160463 2923 ret = reg_node(pRExC_state, (U8)(LOC ? ALNUML : ALNUM));
a0d0e21e 2924 *flagp |= HASWIDTH|SIMPLE;
830247a4 2925 nextchar(pRExC_state);
fac92740 2926 Set_Node_Length(ret, 2); /* MJD */
a0d0e21e
LW
2927 break;
2928 case 'W':
eb160463 2929 ret = reg_node(pRExC_state, (U8)(LOC ? NALNUML : NALNUM));
a0d0e21e 2930 *flagp |= HASWIDTH|SIMPLE;
830247a4 2931 nextchar(pRExC_state);
fac92740 2932 Set_Node_Length(ret, 2); /* MJD */
a0d0e21e
LW
2933 break;
2934 case 'b':
830247a4
IZ
2935 RExC_seen_zerolen++;
2936 RExC_seen |= REG_SEEN_LOOKBEHIND;
eb160463 2937 ret = reg_node(pRExC_state, (U8)(LOC ? BOUNDL : BOUND));
a0d0e21e 2938 *flagp |= SIMPLE;
830247a4 2939 nextchar(pRExC_state);
fac92740 2940 Set_Node_Length(ret, 2); /* MJD */
a0d0e21e
LW
2941 break;
2942 case 'B':
830247a4
IZ
2943 RExC_seen_zerolen++;
2944 RExC_seen |= REG_SEEN_LOOKBEHIND;
eb160463 2945 ret = reg_node(pRExC_state, (U8)(LOC ? NBOUNDL : NBOUND));
a0d0e21e 2946 *flagp |= SIMPLE;
830247a4 2947 nextchar(pRExC_state);
fac92740 2948 Set_Node_Length(ret, 2); /* MJD */
a0d0e21e
LW
2949 break;
2950 case 's':
eb160463 2951 ret = reg_node(pRExC_state, (U8)(LOC ? SPACEL : SPACE));
a0d0e21e 2952 *flagp |= HASWIDTH|SIMPLE;
830247a4 2953 nextchar(pRExC_state);
fac92740 2954 Set_Node_Length(ret, 2); /* MJD */
a0d0e21e
LW
2955 break;
2956 case 'S':
eb160463 2957 ret = reg_node(pRExC_state, (U8)(LOC ? NSPACEL : NSPACE));
a0d0e21e 2958 *flagp |= HASWIDTH|SIMPLE;
830247a4 2959 nextchar(pRExC_state);
fac92740 2960 Set_Node_Length(ret, 2); /* MJD */
a0d0e21e
LW
2961 break;
2962 case 'd':
ffc61ed2 2963 ret = reg_node(pRExC_state, DIGIT);
a0d0e21e 2964 *flagp |= HASWIDTH|SIMPLE;
830247a4 2965 nextchar(pRExC_state);
fac92740 2966 Set_Node_Length(ret, 2); /* MJD */
a0d0e21e
LW
2967 break;
2968 case 'D':
ffc61ed2 2969 ret = reg_node(pRExC_state, NDIGIT);
a0d0e21e 2970 *flagp |= HASWIDTH|SIMPLE;
830247a4 2971 nextchar(pRExC_state);
fac92740 2972 Set_Node_Length(ret, 2); /* MJD */
a0d0e21e 2973 break;
a14b48bc
LW
2974 case 'p':
2975 case 'P':
3568d838 2976 {
830247a4 2977 char* oldregxend = RExC_end;
43af864e 2978 char* parse_start = RExC_parse - 2;
a14b48bc 2979
830247a4 2980 if (RExC_parse[1] == '{') {
3568d838 2981 /* a lovely hack--pretend we saw [\pX] instead */
830247a4
IZ
2982 RExC_end = strchr(RExC_parse, '}');
2983 if (!RExC_end) {
0da60cf5 2984 U8 c = (U8)*RExC_parse;
830247a4
IZ
2985 RExC_parse += 2;
2986 RExC_end = oldregxend;
0da60cf5 2987 vFAIL2("Missing right brace on \\%c{}", c);
b45f050a 2988 }
830247a4 2989 RExC_end++;
a14b48bc 2990 }
af6f566e 2991 else {
830247a4 2992 RExC_end = RExC_parse + 2;
af6f566e
HS
2993 if (RExC_end > oldregxend)
2994 RExC_end = oldregxend;
2995 }
830247a4 2996 RExC_parse--;
a14b48bc 2997
ffc61ed2 2998 ret = regclass(pRExC_state);
a14b48bc 2999
830247a4
IZ
3000 RExC_end = oldregxend;
3001 RExC_parse--;
43af864e
JH
3002
3003 Set_Node_Offset(ret, parse_start + 2);
3004 Set_Node_Cur_Length(ret);
830247a4 3005 nextchar(pRExC_state);
a14b48bc
LW
3006 *flagp |= HASWIDTH|SIMPLE;
3007 }
3008 break;
a0d0e21e
LW
3009 case 'n':
3010 case 'r':
3011 case 't':
3012 case 'f':
3013 case 'e':
3014 case 'a':
3015 case 'x':
3016 case 'c':
3017 case '0':
3018 goto defchar;
3019 case '1': case '2': case '3': case '4':
3020 case '5': case '6': case '7': case '8': case '9':
3021 {
228fe6e6 3022 const I32 num = atoi(RExC_parse);
a0d0e21e 3023
830247a4 3024 if (num > 9 && num >= RExC_npar)
a0d0e21e
LW
3025 goto defchar;
3026 else {
fac92740 3027 char * parse_start = RExC_parse - 1; /* MJD */
830247a4
IZ
3028 while (isDIGIT(*RExC_parse))
3029 RExC_parse++;
b45f050a 3030
eb160463 3031 if (!SIZE_ONLY && num > (I32)RExC_rx->nparens)
9baa0206 3032 vFAIL("Reference to nonexistent group");
830247a4 3033 RExC_sawback = 1;
eb160463
GS
3034 ret = reganode(pRExC_state,
3035 (U8)(FOLD ? (LOC ? REFFL : REFF) : REF),
3036 num);
a0d0e21e 3037 *flagp |= HASWIDTH;
e503e849 3038
fac92740 3039 /* override incorrect value set in reganode MJD */
e503e849 3040 Set_Node_Offset(ret, parse_start+1);
fac92740 3041 Set_Node_Cur_Length(ret); /* MJD */
830247a4
IZ
3042 RExC_parse--;
3043 nextchar(pRExC_state);
a0d0e21e
LW
3044 }
3045 }
3046 break;
3047 case '\0':
830247a4 3048 if (RExC_parse >= RExC_end)
b45f050a 3049 FAIL("Trailing \\");
a0d0e21e
LW
3050 /* FALL THROUGH */
3051 default:
5332c881 3052 /* Do not generate "unrecognized" warnings here, we fall
c9f97d15 3053 back into the quick-grab loop below */
4a04c497 3054 parse_start--;
a0d0e21e
LW
3055 goto defchar;
3056 }
3057 break;
4633a7c4
LW
3058
3059 case '#':
e2509266 3060 if (RExC_flags & PMf_EXTENDED) {
830247a4
IZ
3061 while (RExC_parse < RExC_end && *RExC_parse != '\n') RExC_parse++;
3062 if (RExC_parse < RExC_end)
4633a7c4
LW
3063 goto tryagain;
3064 }
3065 /* FALL THROUGH */
3066
a0d0e21e 3067 default: {
ba210ebe 3068 register STRLEN len;
58ae7d3f 3069 register UV ender;
a0d0e21e 3070 register char *p;
c277df42 3071 char *oldp, *s;
80aecb99 3072 STRLEN foldlen;
a2a469f9 3073 U8 tmpbuf[UTF8_MAXBYTES_CASE+1], *foldbuf;
f06dbbb7
JH
3074
3075 parse_start = RExC_parse - 1;
a0d0e21e 3076
830247a4 3077 RExC_parse++;
a0d0e21e
LW
3078
3079 defchar:
58ae7d3f 3080 ender = 0;
eb160463
GS
3081 ret = reg_node(pRExC_state,
3082 (U8)(FOLD ? (LOC ? EXACTFL : EXACTF) : EXACT));
cd439c50 3083 s = STRING(ret);
830247a4
IZ
3084 for (len = 0, p = RExC_parse - 1;
3085 len < 127 && p < RExC_end;
a0d0e21e
LW
3086 len++)
3087 {
3088 oldp = p;
5b5a24f7 3089
e2509266 3090 if (RExC_flags & PMf_EXTENDED)
830247a4 3091 p = regwhite(p, RExC_end);
a0d0e21e
LW
3092 switch (*p) {
3093 case '^':
3094 case '$':
3095 case '.':
3096 case '[':
3097 case '(':
3098 case ')':
3099 case '|':
3100 goto loopdone;
3101 case '\\':
3102 switch (*++p) {
3103 case 'A':
1ed8eac0
JF
3104 case 'C':
3105 case 'X':
a0d0e21e
LW
3106 case 'G':
3107 case 'Z':
b85d18e9 3108 case 'z':
a0d0e21e
LW
3109 case 'w':
3110 case 'W':
3111 case 'b':
3112 case 'B':
3113 case 's':
3114 case 'S':
3115 case 'd':
3116 case 'D':
a14b48bc
LW
3117 case 'p':
3118 case 'P':
a0d0e21e
LW
3119 --p;
3120 goto loopdone;
3121 case 'n':
3122 ender = '\n';
3123 p++;
a687059c 3124 break;
a0d0e21e
LW
3125 case 'r':
3126 ender = '\r';
3127 p++;
a687059c 3128 break;
a0d0e21e
LW
3129 case 't':
3130 ender = '\t';
3131 p++;
a687059c 3132 break;
a0d0e21e
LW
3133 case 'f':
3134 ender = '\f';
3135 p++;
a687059c 3136 break;
a0d0e21e 3137 case 'e':
c7f1f016 3138 ender = ASCII_TO_NATIVE('\033');
a0d0e21e 3139 p++;
a687059c 3140 break;
a0d0e21e 3141 case 'a':
c7f1f016 3142 ender = ASCII_TO_NATIVE('\007');
a0d0e21e 3143 p++;
a687059c 3144 break;
a0d0e21e 3145 case 'x':
a0ed51b3 3146 if (*++p == '{') {
228fe6e6 3147 char* const e = strchr(p, '}');
b81d288d 3148
b45f050a 3149 if (!e) {
830247a4 3150 RExC_parse = p + 1;
b45f050a
JF
3151 vFAIL("Missing right brace on \\x{}");
3152 }
de5f0749 3153 else {
a4c04bdc
NC
3154 I32 flags = PERL_SCAN_ALLOW_UNDERSCORES
3155 | PERL_SCAN_DISALLOW_PREFIX;
228fe6e6 3156 STRLEN numlen = e - p - 1;
53305cf1 3157 ender = grok_hex(p + 1, &numlen, &flags, NULL);
aaa80028
JH
3158 if (ender > 0xff)
3159 RExC_utf8 = 1;
a0ed51b3
LW
3160 p = e + 1;
3161 }
a0ed51b3
LW
3162 }
3163 else {
a4c04bdc 3164 I32 flags = PERL_SCAN_DISALLOW_PREFIX;
228fe6e6 3165 STRLEN numlen = 2;
53305cf1 3166 ender = grok_hex(p, &numlen, &flags, NULL);
a0ed51b3
LW
3167 p += numlen;
3168 }
a687059c 3169 break;
a0d0e21e
LW
3170 case 'c':
3171 p++;
bbce6d69 3172 ender = UCHARAT(p++);
3173 ender = toCTRL(ender);
a687059c 3174 break;
a0d0e21e
LW
3175 case '0': case '1': case '2': case '3':case '4':
3176 case '5': case '6': case '7': case '8':case '9':
3177 if (*p == '0' ||
830247a4 3178 (isDIGIT(p[1]) && atoi(p) >= RExC_npar) ) {
53305cf1 3179 I32 flags = 0;
228fe6e6 3180 STRLEN numlen = 3;
53305cf1 3181 ender = grok_oct(p, &numlen, &flags, NULL);
a0d0e21e
LW
3182 p += numlen;
3183 }
3184 else {
3185 --p;
3186 goto loopdone;
a687059c
LW
3187 }
3188 break;
a0d0e21e 3189 case '\0':
830247a4 3190 if (p >= RExC_end)
b45f050a 3191 FAIL("Trailing \\");
a687059c 3192 /* FALL THROUGH */
a0d0e21e 3193 default:
f5e9f069 3194 if (!SIZE_ONLY&& isALPHA(*p) && ckWARN(WARN_REGEXP))
4193bef7 3195 vWARN2(p + 1, "Unrecognized escape \\%c passed through", UCHARAT(p));
a0ed51b3 3196 goto normal_default;
a0d0e21e
LW
3197 }
3198 break;
a687059c 3199 default:
a0ed51b3 3200 normal_default:
fd400ab9 3201 if (UTF8_IS_START(*p) && UTF) {
228fe6e6 3202 STRLEN numlen;
5e12f4fb 3203 ender = utf8n_to_uvchr((U8*)p, RExC_end - p,
ba210ebe 3204 &numlen, 0);
a0ed51b3
LW
3205 p += numlen;
3206 }
3207 else
3208 ender = *p++;
a0d0e21e 3209 break;
a687059c 3210 }
e2509266 3211 if (RExC_flags & PMf_EXTENDED)
830247a4 3212 p = regwhite(p, RExC_end);
60a8b682
JH
3213 if (UTF && FOLD) {
3214 /* Prime the casefolded buffer. */
ac7e0132 3215 ender = toFOLD_uni(ender, tmpbuf, &foldlen);
60a8b682 3216 }
a0d0e21e
LW
3217 if (ISMULT2(p)) { /* Back off on ?+*. */
3218 if (len)
3219 p = oldp;
16ea2a2e 3220 else if (UTF) {
0ebc6274
JH
3221 STRLEN unilen;
3222
80aecb99 3223 if (FOLD) {
60a8b682 3224 /* Emit all the Unicode characters. */
228fe6e6 3225 STRLEN numlen;
80aecb99
JH
3226 for (foldbuf = tmpbuf;
3227 foldlen;
3228 foldlen -= numlen) {
3229 ender = utf8_to_uvchr(foldbuf, &numlen);
9dc45d57 3230 if (numlen > 0) {
0ebc6274
JH
3231 reguni(pRExC_state, ender, s, &unilen);
3232 s += unilen;
3233 len += unilen;
3234 /* In EBCDIC the numlen
3235 * and unilen can differ. */
9dc45d57 3236 foldbuf += numlen;
47654450
JH
3237 if (numlen >= foldlen)
3238 break;
9dc45d57
JH
3239 }
3240 else
3241 break; /* "Can't happen." */
80aecb99
JH
3242 }
3243 }
3244 else {
0ebc6274 3245 reguni(pRExC_state, ender, s, &unilen);
9ede7db1 3246 if (unilen > 0) {
0ebc6274
JH
3247 s += unilen;
3248 len += unilen;
9dc45d57 3249 }
80aecb99 3250 }
a0ed51b3 3251 }
a0d0e21e
LW
3252 else {
3253 len++;
eb160463 3254 REGC((char)ender, s++);
a0d0e21e
LW
3255 }
3256 break;
a687059c 3257 }
16ea2a2e 3258 if (UTF) {
0ebc6274
JH
3259 STRLEN unilen;
3260
80aecb99 3261 if (FOLD) {
60a8b682 3262 /* Emit all the Unicode characters. */
228fe6e6 3263 STRLEN numlen;
80aecb99
JH
3264 for (foldbuf = tmpbuf;
3265 foldlen;
3266 foldlen -= numlen) {
3267 ender = utf8_to_uvchr(foldbuf, &numlen);
9dc45d57 3268 if (numlen > 0) {
0ebc6274
JH
3269 reguni(pRExC_state, ender, s, &unilen);
3270 len += unilen;
3271 s += unilen;
3272 /* In EBCDIC the numlen
3273 * and unilen can differ. */
9dc45d57 3274 foldbuf += numlen;
47654450
JH
3275 if (numlen >= foldlen)
3276 break;
9dc45d57
JH
3277 }
3278 else
3279 break;
80aecb99
JH
3280 }
3281 }
3282 else {
0ebc6274 3283 reguni(pRExC_state, ender, s, &unilen);
9ede7db1 3284 if (unilen > 0) {
0ebc6274
JH
3285 s += unilen;
3286 len += unilen;
9dc45d57 3287 }
80aecb99
JH
3288 }
3289 len--;
a0ed51b3
LW
3290 }
3291 else
eb160463 3292 REGC((char)ender, s++);
a0d0e21e
LW
3293 }
3294 loopdone:
830247a4 3295 RExC_parse = p - 1;
fac92740 3296 Set_Node_Cur_Length(ret); /* MJD */
830247a4 3297 nextchar(pRExC_state);
793db0cb
JH
3298 {
3299 /* len is STRLEN which is unsigned, need to copy to signed */
3300 IV iv = len;
3301 if (iv < 0)
3302 vFAIL("Internal disaster");
3303 }
a0d0e21e
LW
3304 if (len > 0)
3305 *flagp |= HASWIDTH;
9c20fa4a 3306 if (len == 1 && UNI_IS_INVARIANT(ender))
a0d0e21e 3307 *flagp |= SIMPLE;
c277df42 3308 if (!SIZE_ONLY)
cd439c50
IZ
3309 STR_LEN(ret) = len;
3310 if (SIZE_ONLY)
830247a4 3311 RExC_size += STR_SZ(len);
cd439c50 3312 else
830247a4 3313 RExC_emit += STR_SZ(len);
a687059c 3314 }
a0d0e21e
LW
3315 break;
3316 }
a687059c 3317
60a8b682
JH
3318 /* If the encoding pragma is in effect recode the text of
3319 * any EXACT-kind nodes. */
22c54be3 3320 if (PL_encoding && PL_regkind[(U8)OP(ret)] == EXACT) {
5b7ea690
JH
3321 STRLEN oldlen = STR_LEN(ret);
3322 SV *sv = sv_2mortal(newSVpvn(STRING(ret), oldlen));
3323
3324 if (RExC_utf8)
3325 SvUTF8_on(sv);
3326 if (sv_utf8_downgrade(sv, TRUE)) {
228fe6e6
AL
3327 const char * const s = sv_recode_to_utf8(sv, PL_encoding);
3328 const STRLEN newlen = SvCUR(sv);
5b7ea690
JH
3329
3330 if (SvUTF8(sv))
3331 RExC_utf8 = 1;
3332 if (!SIZE_ONLY) {
3333 DEBUG_r(PerlIO_printf(Perl_debug_log, "recode %*s to %*s\n",
3334 (int)oldlen, STRING(ret),
3335 (int)newlen, s));
3336 Copy(s, STRING(ret), newlen, char);
3337 STR_LEN(ret) += newlen - oldlen;
3338 RExC_emit += STR_SZ(newlen) - STR_SZ(oldlen);
3339 } else
3340 RExC_size += STR_SZ(newlen) - STR_SZ(oldlen);
3341 }
a72c7584
JH
3342 }
3343
a0d0e21e 3344 return(ret);
a687059c
LW
3345}
3346
873ef191 3347STATIC char *
547d29e4 3348S_regwhite(pTHX_ char *p, const char *e)
5b5a24f7
CS
3349{
3350 while (p < e) {
3351 if (isSPACE(*p))
3352 ++p;
3353 else if (*p == '#') {
3354 do {
3355 p++;
3356 } while (p < e && *p != '\n');
3357 }
3358 else
3359 break;
3360 }
3361 return p;
3362}
3363
b8c5462f
JH
3364/* Parse POSIX character classes: [[:foo:]], [[=foo=]], [[.foo.]].
3365 Character classes ([:foo:]) can also be negated ([:^foo:]).
3366 Returns a named class id (ANYOF_XXX) if successful, -1 otherwise.
3367 Equivalence classes ([=foo=]) and composites ([.foo.]) are parsed,
beeb77fc 3368 but trigger failures because they are currently unimplemented. */
9a86a77b
JH
3369
3370#define POSIXCC_DONE(c) ((c) == ':')
3371#define POSIXCC_NOTYET(c) ((c) == '=' || (c) == '.')
3372#define POSIXCC(c) (POSIXCC_DONE(c) || POSIXCC_NOTYET(c))
3373
b8c5462f 3374STATIC I32
830247a4 3375S_regpposixcc(pTHX_ RExC_state_t *pRExC_state, I32 value)
620e46c5 3376{
936ed897 3377 I32 namedclass = OOB_NAMEDCLASS;
620e46c5 3378
830247a4 3379 if (value == '[' && RExC_parse + 1 < RExC_end &&
620e46c5 3380 /* I smell either [: or [= or [. -- POSIX has been here, right? */
9a86a77b 3381 POSIXCC(UCHARAT(RExC_parse))) {
228fe6e6 3382 const char c = UCHARAT(RExC_parse);
830247a4 3383 char* s = RExC_parse++;
b81d288d 3384
9a86a77b 3385 while (RExC_parse < RExC_end && UCHARAT(RExC_parse) != c)
830247a4
IZ
3386 RExC_parse++;
3387 if (RExC_parse == RExC_end)
620e46c5 3388 /* Grandfather lone [:, [=, [. */
830247a4 3389 RExC_parse = s;
620e46c5 3390 else {
228fe6e6 3391 const char* t = RExC_parse++; /* skip over the c */
e4057cfc 3392 const char *posixcc;
b8c5462f 3393
57620943
NC
3394 assert(*t == c);
3395
9a86a77b 3396 if (UCHARAT(RExC_parse) == ']') {
830247a4 3397 RExC_parse++; /* skip over the ending ] */
b8c5462f
JH
3398 posixcc = s + 1;
3399 if (*s == ':') {
228fe6e6
AL
3400 const I32 complement = *posixcc == '^' ? *posixcc++ : 0;
3401 const I32 skip = t - posixcc;
57620943
NC
3402
3403 /* Initially switch on the length of the name. */
3404 switch (skip) {
3405 case 4:
3406 if (memEQ(posixcc, "word", 4)) {
3407 /* this is not POSIX, this is the Perl \w */;
3408 namedclass
3409 = complement ? ANYOF_NALNUM : ANYOF_ALNUM;
3410 }
cc4319de 3411 break;
57620943
NC
3412 case 5:
3413 /* Names all of length 5. */
3414 /* alnum alpha ascii blank cntrl digit graph lower
3415 print punct space upper */
3416 /* Offset 4 gives the best switch position. */
3417 switch (posixcc[4]) {
3418 case 'a':
3419 if (memEQ(posixcc, "alph", 4)) {
3420 /* a */
3421 namedclass
3422 = complement ? ANYOF_NALPHA : ANYOF_ALPHA;
3423 }
3424 break;
3425 case 'e':
3426 if (memEQ(posixcc, "spac", 4)) {
3427 /* e */
3428 namedclass
3429 = complement ? ANYOF_NPSXSPC : ANYOF_PSXSPC;
3430 }
3431 break;
3432 case 'h':
3433 if (memEQ(posixcc, "grap", 4)) {
3434 /* h */
3435 namedclass
3436 = complement ? ANYOF_NGRAPH : ANYOF_GRAPH;
3437 }
3438 break;
3439 case 'i':
3440 if (memEQ(posixcc, "asci", 4)) {
3441 /* i */
3442 namedclass
3443 = complement ? ANYOF_NASCII : ANYOF_ASCII;
3444 }
3445 break;
3446 case 'k':
3447 if (memEQ(posixcc, "blan", 4)) {
3448 /* k */
3449 namedclass
3450 = complement ? ANYOF_NBLANK : ANYOF_BLANK;
3451 }
3452 break;
3453 case 'l':
3454 if (memEQ(posixcc, "cntr", 4)) {
3455 /* l */
3456 namedclass
3457 = complement ? ANYOF_NCNTRL : ANYOF_CNTRL;
3458 }
3459 break;
3460 case 'm':
3461 if (memEQ(posixcc, "alnu", 4)) {
3462 /* m */
3463 namedclass
3464 = complement ? ANYOF_NALNUMC : ANYOF_ALNUMC;
3465 }
3466 break;
3467 case 'r':
3468 if (memEQ(posixcc, "lowe", 4)) {
3469 /* r */
3470 namedclass
3471 = complement ? ANYOF_NLOWER : ANYOF_LOWER;
3472 }
3473 if (memEQ(posixcc, "uppe", 4)) {
3474 /* r */
3475 namedclass
3476 = complement ? ANYOF_NUPPER : ANYOF_UPPER;
3477 }
3478 break;
3479 case 't':
3480 if (memEQ(posixcc, "digi", 4)) {
3481 /* t */
3482 namedclass
3483 = complement ? ANYOF_NDIGIT : ANYOF_DIGIT;
3484 }
3485 if (memEQ(posixcc, "prin", 4)) {
3486 /* t */
3487 namedclass
3488 = complement ? ANYOF_NPRINT : ANYOF_PRINT;
3489 }
3490 if (memEQ(posixcc, "punc", 4)) {
3491 /* t */
3492 namedclass
3493 = complement ? ANYOF_NPUNCT : ANYOF_PUNCT;
3494 }
3495 break;
b8c5462f
JH
3496 }
3497 break;
57620943
NC
3498 case 6:
3499 if (memEQ(posixcc, "xdigit", 6)) {
3500 namedclass
3501 = complement ? ANYOF_NXDIGIT : ANYOF_XDIGIT;
b8c5462f
JH
3502 }
3503 break;
3504 }
57620943
NC
3505
3506 if (namedclass == OOB_NAMEDCLASS)
b45f050a
JF
3507 {
3508 Simple_vFAIL3("POSIX class [:%.*s:] unknown",
3509 t - s - 1, s + 1);
3510 }
57620943
NC
3511 assert (posixcc[skip] == ':');
3512 assert (posixcc[skip+1] == ']');
b45f050a 3513 } else if (!SIZE_ONLY) {
b8c5462f 3514 /* [[=foo=]] and [[.foo.]] are still future. */
b45f050a 3515
830247a4 3516 /* adjust RExC_parse so the warning shows after
b45f050a 3517 the class closes */
9a86a77b 3518 while (UCHARAT(RExC_parse) && UCHARAT(RExC_parse) != ']')
830247a4 3519 RExC_parse++;
b45f050a
JF
3520 Simple_vFAIL3("POSIX syntax [%c %c] is reserved for future extensions", c, c);
3521 }
b8c5462f
JH
3522 } else {
3523 /* Maternal grandfather:
3524 * "[:" ending in ":" but not in ":]" */
830247a4 3525 RExC_parse = s;
767d463e 3526 }
620e46c5
JH
3527 }
3528 }
3529
b8c5462f
JH
3530 return namedclass;
3531}
3532
3533STATIC void
830247a4 3534S_checkposixcc(pTHX_ RExC_state_t *pRExC_state)
b8c5462f 3535{
b938889d 3536 if (!SIZE_ONLY && POSIXCC(UCHARAT(RExC_parse))) {
228fe6e6
AL
3537 const char *s = RExC_parse;
3538 const char c = *s++;
b8c5462f
JH
3539
3540 while(*s && isALNUM(*s))
3541 s++;
3542 if (*s && c == *s && s[1] == ']') {
cd84f5b2
RGS
3543 if (ckWARN(WARN_REGEXP))
3544 vWARN3(s+2,
3545 "POSIX syntax [%c %c] belongs inside character classes",
3546 c, c);
b45f050a
JF
3547
3548 /* [[=foo=]] and [[.foo.]] are still future. */
9a86a77b 3549 if (POSIXCC_NOTYET(c)) {
830247a4 3550 /* adjust RExC_parse so the error shows after
b45f050a 3551 the class closes */
9a86a77b 3552 while (UCHARAT(RExC_parse) && UCHARAT(RExC_parse++) != ']')
b45f050a
JF
3553 ;
3554 Simple_vFAIL3("POSIX syntax [%c %c] is reserved for future extensions", c, c);
3555 }
b8c5462f
JH
3556 }
3557 }
620e46c5
JH
3558}
3559
76e3520e 3560STATIC regnode *
830247a4 3561S_regclass(pTHX_ RExC_state_t *pRExC_state)
a687059c 3562{
ffc61ed2 3563 register UV value;
9a86a77b 3564 register UV nextvalue;
3568d838 3565 register IV prevvalue = OOB_UNICODE;
ffc61ed2 3566 register IV range = 0;
c277df42 3567 register regnode *ret;
ba210ebe 3568 STRLEN numlen;
ffc61ed2 3569 IV namedclass;
9c5ffd7c 3570 char *rangebegin = 0;
936ed897 3571 bool need_class = 0;
9c5ffd7c 3572 SV *listsv = Nullsv;
ffc61ed2
JH
3573 register char *e;
3574 UV n;
9e55ce06
JH
3575 bool optimize_invert = TRUE;
3576 AV* unicode_alternate = 0;
1b2d223b
JH
3577#ifdef EBCDIC
3578 UV literal_endpoint = 0;
3579#endif
ffc61ed2
JH
3580
3581 ret = reganode(pRExC_state, ANYOF, 0);
3582
3583 if (!SIZE_ONLY)
3584 ANYOF_FLAGS(ret) = 0;
3585
9a86a77b 3586 if (UCHARAT(RExC_parse) == '^') { /* Complement of range. */
ffc61ed2
JH
3587 RExC_naughty++;
3588 RExC_parse++;
3589 if (!SIZE_ONLY)
3590 ANYOF_FLAGS(ret) |= ANYOF_INVERT;
3591 }
a0d0e21e 3592
936ed897 3593 if (SIZE_ONLY)
830247a4 3594 RExC_size += ANYOF_SKIP;
936ed897 3595 else {
830247a4 3596 RExC_emit += ANYOF_SKIP;
936ed897
IZ
3597 if (FOLD)
3598 ANYOF_FLAGS(ret) |= ANYOF_FOLD;
3599 if (LOC)
3600 ANYOF_FLAGS(ret) |= ANYOF_LOCALE;
ffc61ed2
JH
3601 ANYOF_BITMAP_ZERO(ret);
3602 listsv = newSVpvn("# comment\n", 10);
a0d0e21e 3603 }
b8c5462f 3604
9a86a77b
JH
3605 nextvalue = RExC_parse < RExC_end ? UCHARAT(RExC_parse) : 0;
3606
b938889d 3607 if (!SIZE_ONLY && POSIXCC(nextvalue))
830247a4 3608 checkposixcc(pRExC_state);
b8c5462f 3609
f064b6ad
HS
3610 /* allow 1st char to be ] (allowing it to be - is dealt with later) */
3611 if (UCHARAT(RExC_parse) == ']')
3612 goto charclassloop;
ffc61ed2 3613
9a86a77b 3614 while (RExC_parse < RExC_end && UCHARAT(RExC_parse) != ']') {
ffc61ed2
JH
3615
3616 charclassloop:
3617
3618 namedclass = OOB_NAMEDCLASS; /* initialize as illegal */
3619
73b437c8 3620 if (!range)
830247a4 3621 rangebegin = RExC_parse;
ffc61ed2 3622 if (UTF) {
5e12f4fb 3623 value = utf8n_to_uvchr((U8*)RExC_parse,
3568d838
JH
3624 RExC_end - RExC_parse,
3625 &numlen, 0);
ffc61ed2
JH
3626 RExC_parse += numlen;
3627 }
3628 else
3629 value = UCHARAT(RExC_parse++);
9a86a77b
JH
3630 nextvalue = RExC_parse < RExC_end ? UCHARAT(RExC_parse) : 0;
3631 if (value == '[' && POSIXCC(nextvalue))
830247a4 3632 namedclass = regpposixcc(pRExC_state, value);
620e46c5 3633 else if (value == '\\') {
ffc61ed2 3634 if (UTF) {
5e12f4fb 3635 value = utf8n_to_uvchr((U8*)RExC_parse,
ffc61ed2
JH
3636 RExC_end - RExC_parse,
3637 &numlen, 0);
3638 RExC_parse += numlen;
3639 }
3640 else
3641 value = UCHARAT(RExC_parse++);
470c3474 3642 /* Some compilers cannot handle switching on 64-bit integer
ffc61ed2 3643 * values, therefore value cannot be an UV. Yes, this will
e2962f66
JH
3644 * be a problem later if we want switch on Unicode.
3645 * A similar issue a little bit later when switching on
3646 * namedclass. --jhi */
ffc61ed2 3647 switch ((I32)value) {
b8c5462f
JH
3648 case 'w': namedclass = ANYOF_ALNUM; break;
3649 case 'W': namedclass = ANYOF_NALNUM; break;
3650 case 's': namedclass = ANYOF_SPACE; break;
3651 case 'S': namedclass = ANYOF_NSPACE; break;
3652 case 'd': namedclass = ANYOF_DIGIT; break;
3653 case 'D': namedclass = ANYOF_NDIGIT; break;
ffc61ed2
JH
3654 case 'p':
3655 case 'P':
af6f566e 3656 if (RExC_parse >= RExC_end)
2a4859cd 3657 vFAIL2("Empty \\%c{}", (U8)value);
ffc61ed2 3658 if (*RExC_parse == '{') {
228fe6e6 3659 const U8 c = (U8)value;
ffc61ed2
JH
3660 e = strchr(RExC_parse++, '}');
3661 if (!e)
0da60cf5 3662 vFAIL2("Missing right brace on \\%c{}", c);
ab13f0c7
JH
3663 while (isSPACE(UCHARAT(RExC_parse)))
3664 RExC_parse++;
3665 if (e == RExC_parse)
0da60cf5 3666 vFAIL2("Empty \\%c{}", c);
ffc61ed2 3667 n = e - RExC_parse;
ab13f0c7
JH
3668 while (isSPACE(UCHARAT(RExC_parse + n - 1)))
3669 n--;
ffc61ed2
JH
3670 }
3671 else {
3672 e = RExC_parse;
3673 n = 1;
3674 }
3675 if (!SIZE_ONLY) {
ab13f0c7
JH
3676 if (UCHARAT(RExC_parse) == '^') {
3677 RExC_parse++;
3678 n--;
3679 value = value == 'p' ? 'P' : 'p'; /* toggle */
3680 while (isSPACE(UCHARAT(RExC_parse))) {
3681 RExC_parse++;
3682 n--;
3683 }
3684 }
ffc61ed2 3685 if (value == 'p')
ab13f0c7
JH
3686 Perl_sv_catpvf(aTHX_ listsv,
3687 "+utf8::%.*s\n", (int)n, RExC_parse);
ffc61ed2 3688 else
ab13f0c7
JH
3689 Perl_sv_catpvf(aTHX_ listsv,
3690 "!utf8::%.*s\n", (int)n, RExC_parse);
ffc61ed2
JH
3691 }
3692 RExC_parse = e + 1;
3693 ANYOF_FLAGS(ret) |= ANYOF_UNICODE;
fd826e6e
JP
3694 namedclass = ANYOF_MAX; /* no official name, but it's named */
3695 break;
b8c5462f
JH
3696 case 'n': value = '\n'; break;
3697 case 'r': value = '\r'; break;
3698 case 't': value = '\t'; break;
3699 case 'f': value = '\f'; break;
3700 case 'b': value = '\b'; break;
c7f1f016
NIS
3701 case 'e': value = ASCII_TO_NATIVE('\033');break;
3702 case 'a': value = ASCII_TO_NATIVE('\007');break;
b8c5462f 3703 case 'x':
ffc61ed2 3704 if (*RExC_parse == '{') {
a4c04bdc
NC
3705 I32 flags = PERL_SCAN_ALLOW_UNDERSCORES
3706 | PERL_SCAN_DISALLOW_PREFIX;
ffc61ed2 3707 e = strchr(RExC_parse++, '}');
b81d288d 3708 if (!e)
ffc61ed2 3709 vFAIL("Missing right brace on \\x{}");
53305cf1
NC
3710
3711 numlen = e - RExC_parse;
3712 value = grok_hex(RExC_parse, &numlen, &flags, NULL);
ffc61ed2
JH
3713 RExC_parse = e + 1;
3714 }
3715 else {
a4c04bdc 3716 I32 flags = PERL_SCAN_DISALLOW_PREFIX;
53305cf1
NC
3717 numlen = 2;
3718 value = grok_hex(RExC_parse, &numlen, &flags, NULL);
ffc61ed2
JH
3719 RExC_parse += numlen;
3720 }
b8c5462f
JH
3721 break;
3722 case 'c':
830247a4 3723 value = UCHARAT(RExC_parse++);
b8c5462f
JH
3724 value = toCTRL(value);
3725 break;
3726 case '0': case '1': case '2': case '3': case '4':
3727 case '5': case '6': case '7': case '8': case '9':
53305cf1
NC
3728 {
3729 I32 flags = 0;
3730 numlen = 3;
3731 value = grok_oct(--RExC_parse, &numlen, &flags, NULL);
830247a4 3732 RExC_parse += numlen;
b8c5462f 3733 break;
53305cf1 3734 }
1028017a 3735 default:
f5e9f069 3736 if (!SIZE_ONLY && isALPHA(value) && ckWARN(WARN_REGEXP))
ffc61ed2
JH
3737 vWARN2(RExC_parse,
3738 "Unrecognized escape \\%c in character class passed through",
3739 (int)value);
1028017a 3740 break;
b8c5462f 3741 }
ffc61ed2 3742 } /* end of \blah */
1b2d223b
JH
3743#ifdef EBCDIC
3744 else
3745 literal_endpoint++;
3746#endif
ffc61ed2
JH
3747
3748 if (namedclass > OOB_NAMEDCLASS) { /* this is a named class \blah */
3749
3750 if (!SIZE_ONLY && !need_class)
936ed897 3751 ANYOF_CLASS_ZERO(ret);
ffc61ed2 3752
936ed897 3753 need_class = 1;
ffc61ed2
JH
3754
3755 /* a bad range like a-\d, a-[:digit:] ? */
3756 if (range) {
73b437c8 3757 if (!SIZE_ONLY) {
e476b1b5 3758 if (ckWARN(WARN_REGEXP))
830247a4 3759 vWARN4(RExC_parse,
b45f050a 3760 "False [] range \"%*.*s\"",
830247a4
IZ
3761 RExC_parse - rangebegin,
3762 RExC_parse - rangebegin,
b45f050a 3763 rangebegin);
3568d838
JH
3764 if (prevvalue < 256) {
3765 ANYOF_BITMAP_SET(ret, prevvalue);
ffc61ed2
JH
3766 ANYOF_BITMAP_SET(ret, '-');
3767 }
3768 else {
3769 ANYOF_FLAGS(ret) |= ANYOF_UNICODE;
3770 Perl_sv_catpvf(aTHX_ listsv,
3568d838 3771 "%04"UVxf"\n%04"UVxf"\n", (UV)prevvalue, (UV) '-');
ffc61ed2 3772 }
b8c5462f 3773 }
ffc61ed2
JH
3774
3775 range = 0; /* this was not a true range */
73b437c8 3776 }
ffc61ed2 3777
73b437c8 3778 if (!SIZE_ONLY) {
3965bf69
NC
3779 const char *what = NULL;
3780 char yesno = 0;
3781
3568d838
JH
3782 if (namedclass > OOB_NAMEDCLASS)
3783 optimize_invert = FALSE;
e2962f66
JH
3784 /* Possible truncation here but in some 64-bit environments
3785 * the compiler gets heartburn about switch on 64-bit values.
3786 * A similar issue a little earlier when switching on value.
98f323fa 3787 * --jhi */
e2962f66 3788 switch ((I32)namedclass) {
73b437c8
JH
3789 case ANYOF_ALNUM:
3790 if (LOC)
936ed897 3791 ANYOF_CLASS_SET(ret, ANYOF_ALNUM);
73b437c8
JH
3792 else {
3793 for (value = 0; value < 256; value++)
3794 if (isALNUM(value))
936ed897 3795 ANYOF_BITMAP_SET(ret, value);
73b437c8 3796 }
3965bf69
NC
3797 yesno = '+';
3798 what = "Word";
73b437c8
JH
3799 break;
3800 case ANYOF_NALNUM:
3801 if (LOC)
936ed897 3802 ANYOF_CLASS_SET(ret, ANYOF_NALNUM);
73b437c8
JH
3803 else {
3804 for (value = 0; value < 256; value++)
3805 if (!isALNUM(value))
936ed897 3806 ANYOF_BITMAP_SET(ret, value);
73b437c8 3807 }
3965bf69
NC
3808 yesno = '!';
3809 what = "Word";
73b437c8 3810 break;
ffc61ed2 3811 case ANYOF_ALNUMC:
73b437c8 3812 if (LOC)
ffc61ed2 3813 ANYOF_CLASS_SET(ret, ANYOF_ALNUMC);
73b437c8
JH
3814 else {
3815 for (value = 0; value < 256; value++)
ffc61ed2 3816 if (isALNUMC(value))
936ed897 3817 ANYOF_BITMAP_SET(ret, value);
73b437c8 3818 }
3965bf69
NC
3819 yesno = '+';
3820 what = "Alnum";
73b437c8
JH
3821 break;
3822 case ANYOF_NALNUMC:
3823 if (LOC)
936ed897 3824 ANYOF_CLASS_SET(ret, ANYOF_NALNUMC);
73b437c8
JH
3825 else {
3826 for (value = 0; value < 256; value++)
3827 if (!isALNUMC(value))
936ed897 3828 ANYOF_BITMAP_SET(ret, value);
73b437c8 3829 }
3965bf69
NC
3830 yesno = '!';
3831 what = "Alnum";
73b437c8
JH
3832 break;
3833 case ANYOF_ALPHA:
3834 if (LOC)
936ed897 3835 ANYOF_CLASS_SET(ret, ANYOF_ALPHA);
73b437c8
JH
3836 else {
3837 for (value = 0; value < 256; value++)
3838 if (isALPHA(value))
936ed897 3839 ANYOF_BITMAP_SET(ret, value);
73b437c8 3840 }
3965bf69
NC
3841 yesno = '+';
3842 what = "Alpha";
73b437c8
JH
3843 break;
3844 case ANYOF_NALPHA:
3845 if (LOC)
936ed897 3846 ANYOF_CLASS_SET(ret, ANYOF_NALPHA);
73b437c8
JH
3847 else {
3848 for (value = 0; value < 256; value++)
3849 if (!isALPHA(value))
936ed897 3850 ANYOF_BITMAP_SET(ret, value);
73b437c8 3851 }
3965bf69
NC
3852 yesno = '!';
3853 what = "Alpha";
73b437c8
JH
3854 break;
3855 case ANYOF_ASCII:
3856 if (LOC)
936ed897 3857 ANYOF_CLASS_SET(ret, ANYOF_ASCII);
73b437c8 3858 else {
c7f1f016 3859#ifndef EBCDIC
1ba5c669
JH
3860 for (value = 0; value < 128; value++)
3861 ANYOF_BITMAP_SET(ret, value);
3862#else /* EBCDIC */
ffbc6a93 3863 for (value = 0; value < 256; value++) {
3a3c4447
JH
3864 if (isASCII(value))
3865 ANYOF_BITMAP_SET(ret, value);
ffbc6a93 3866 }
1ba5c669 3867#endif /* EBCDIC */
73b437c8 3868 }
3965bf69
NC
3869 yesno = '+';
3870 what = "ASCII";
73b437c8
JH
3871 break;
3872 case ANYOF_NASCII:
3873 if (LOC)
936ed897 3874 ANYOF_CLASS_SET(ret, ANYOF_NASCII);
73b437c8 3875 else {
c7f1f016 3876#ifndef EBCDIC
1ba5c669
JH
3877 for (value = 128; value < 256; value++)
3878 ANYOF_BITMAP_SET(ret, value);
3879#else /* EBCDIC */
ffbc6a93 3880 for (value = 0; value < 256; value++) {
3a3c4447
JH
3881 if (!isASCII(value))
3882 ANYOF_BITMAP_SET(ret, value);
ffbc6a93 3883 }
1ba5c669 3884#endif /* EBCDIC */
73b437c8 3885 }
3965bf69
NC
3886 yesno = '!';
3887 what = "ASCII";
73b437c8 3888 break;
aaa51d5e
JF
3889 case ANYOF_BLANK:
3890 if (LOC)
3891 ANYOF_CLASS_SET(ret, ANYOF_BLANK);
3892 else {
3893 for (value = 0; value < 256; value++)
3894 if (isBLANK(value))
3895 ANYOF_BITMAP_SET(ret, value);
3896 }
3965bf69
NC
3897 yesno = '+';
3898 what = "Blank";
aaa51d5e
JF
3899 break;
3900 case ANYOF_NBLANK:
3901 if (LOC)
3902 ANYOF_CLASS_SET(ret, ANYOF_NBLANK);
3903 else {
3904 for (value = 0; value < 256; value++)
3905 if (!isBLANK(value))
3906 ANYOF_BITMAP_SET(ret, value);
3907 }
3965bf69
NC
3908 yesno = '!';
3909 what = "Blank";
aaa51d5e 3910 break;
73b437c8
JH
3911 case ANYOF_CNTRL:
3912 if (LOC)
936ed897 3913 ANYOF_CLASS_SET(ret, ANYOF_CNTRL);
73b437c8
JH
3914 else {
3915 for (value = 0; value < 256; value++)
3916 if (isCNTRL(value))
936ed897 3917 ANYOF_BITMAP_SET(ret, value);
73b437c8 3918 }
3965bf69
NC
3919 yesno = '+';
3920 what = "Cntrl";
73b437c8
JH
3921 break;
3922 case ANYOF_NCNTRL:
3923 if (LOC)
936ed897 3924 ANYOF_CLASS_SET(ret, ANYOF_NCNTRL);
73b437c8
JH
3925 else {
3926 for (value = 0; value < 256; value++)
3927 if (!isCNTRL(value))
936ed897 3928 ANYOF_BITMAP_SET(ret, value);
73b437c8 3929 }
3965bf69
NC
3930 yesno = '!';
3931 what = "Cntrl";
ffc61ed2
JH
3932 break;
3933 case ANYOF_DIGIT:
3934 if (LOC)
3935 ANYOF_CLASS_SET(ret, ANYOF_DIGIT);
3936 else {
3937 /* consecutive digits assumed */
3938 for (value = '0'; value <= '9'; value++)
3939 ANYOF_BITMAP_SET(ret, value);
3940 }
3965bf69
NC
3941 yesno = '+';
3942 what = "Digit";
ffc61ed2
JH
3943 break;
3944 case ANYOF_NDIGIT:
3945 if (LOC)
3946 ANYOF_CLASS_SET(ret, ANYOF_NDIGIT);
3947 else {
3948 /* consecutive digits assumed */
3949 for (value = 0; value < '0'; value++)
3950 ANYOF_BITMAP_SET(ret, value);
3951 for (value = '9' + 1; value < 256; value++)
3952 ANYOF_BITMAP_SET(ret, value);
3953 }
3965bf69
NC
3954 yesno = '!';
3955 what = "Digit";
73b437c8
JH
3956 break;
3957 case ANYOF_GRAPH:
3958 if (LOC)
936ed897 3959 ANYOF_CLASS_SET(ret, ANYOF_GRAPH);
73b437c8
JH
3960 else {
3961 for (value = 0; value < 256; value++)
3962 if (isGRAPH(value))
936ed897 3963 ANYOF_BITMAP_SET(ret, value);
73b437c8 3964 }
3965bf69
NC
3965 yesno = '+';
3966 what = "Graph";
73b437c8
JH
3967 break;
3968 case ANYOF_NGRAPH:
3969 if (LOC)
936ed897 3970 ANYOF_CLASS_SET(ret, ANYOF_NGRAPH);
73b437c8
JH
3971 else {
3972 for (value = 0; value < 256; value++)
3973 if (!isGRAPH(value))
936ed897 3974 ANYOF_BITMAP_SET(ret, value);
73b437c8 3975 }
3965bf69
NC
3976 yesno = '!';
3977 what = "Graph";
73b437c8
JH
3978 break;
3979 case ANYOF_LOWER:
3980 if (LOC)
936ed897 3981 ANYOF_CLASS_SET(ret, ANYOF_LOWER);
73b437c8
JH
3982 else {
3983 for (value = 0; value < 256; value++)
3984 if (isLOWER(value))
936ed897 3985 ANYOF_BITMAP_SET(ret, value);
73b437c8 3986 }
3965bf69
NC
3987 yesno = '+';
3988 what = "Lower";
73b437c8
JH
3989 break;
3990 case ANYOF_NLOWER:
3991 if (LOC)
936ed897 3992 ANYOF_CLASS_SET(ret, ANYOF_NLOWER);
73b437c8
JH
3993 else {
3994 for (value = 0; value < 256; value++)
3995 if (!isLOWER(value))
936ed897 3996 ANYOF_BITMAP_SET(ret, value);
73b437c8 3997 }
3965bf69
NC
3998 yesno = '!';
3999 what = "Lower";
73b437c8
JH
4000 break;
4001 case ANYOF_PRINT:
4002 if (LOC)
936ed897 4003 ANYOF_CLASS_SET(ret, ANYOF_PRINT);
73b437c8
JH
4004 else {
4005 for (value = 0; value < 256; value++)
4006 if (isPRINT(value))
936ed897 4007 ANYOF_BITMAP_SET(ret, value);
73b437c8 4008 }
3965bf69
NC
4009 yesno = '+';
4010 what = "Print";
73b437c8
JH
4011 break;
4012 case ANYOF_NPRINT:
4013 if (LOC)
936ed897 4014 ANYOF_CLASS_SET(ret, ANYOF_NPRINT);
73b437c8
JH
4015 else {
4016 for (value = 0; value < 256; value++)
4017 if (!isPRINT(value))
936ed897 4018 ANYOF_BITMAP_SET(ret, value);
73b437c8 4019 }
3965bf69
NC
4020 yesno = '!';
4021 what = "Print";
73b437c8 4022 break;
aaa51d5e
JF
4023 case ANYOF_PSXSPC:
4024 if (LOC)
4025 ANYOF_CLASS_SET(ret, ANYOF_PSXSPC);
4026 else {
4027 for (value = 0; value < 256; value++)
4028 if (isPSXSPC(value))
4029 ANYOF_BITMAP_SET(ret, value);
4030 }
3965bf69
NC
4031 yesno = '+';
4032 what = "Space";
aaa51d5e
JF
4033 break;
4034 case ANYOF_NPSXSPC:
4035 if (LOC)
4036 ANYOF_CLASS_SET(ret, ANYOF_NPSXSPC);
4037 else {
4038 for (value = 0; value < 256; value++)
4039 if (!isPSXSPC(value))
4040 ANYOF_BITMAP_SET(ret, value);
4041 }
3965bf69
NC
4042 yesno = '!';
4043 what = "Space";
aaa51d5e 4044 break;
73b437c8
JH
4045 case ANYOF_PUNCT:
4046 if (LOC)
936ed897 4047 ANYOF_CLASS_SET(ret, ANYOF_PUNCT);
73b437c8
JH
4048 else {
4049 for (value = 0; value < 256; value++)
4050 if (isPUNCT(value))
936ed897 4051 ANYOF_BITMAP_SET(ret, value);
73b437c8 4052 }
3965bf69
NC
4053 yesno = '+';
4054 what = "Punct";
73b437c8
JH
4055 break;
4056 case ANYOF_NPUNCT:
4057 if (LOC)
936ed897 4058 ANYOF_CLASS_SET(ret, ANYOF_NPUNCT);
73b437c8
JH
4059 else {
4060 for (value = 0; value < 256; value++)
4061 if (!isPUNCT(value))
936ed897 4062 ANYOF_BITMAP_SET(ret, value);
73b437c8 4063 }
3965bf69
NC
4064 yesno = '!';
4065 what = "Punct";
ffc61ed2
JH
4066 break;
4067 case ANYOF_SPACE:
4068 if (LOC)
4069 ANYOF_CLASS_SET(ret, ANYOF_SPACE);
4070 else {
4071 for (value = 0; value < 256; value++)
4072 if (isSPACE(value))
4073 ANYOF_BITMAP_SET(ret, value);
4074 }
3965bf69
NC
4075 yesno = '+';
4076 what = "SpacePerl";
ffc61ed2
JH
4077 break;
4078 case ANYOF_NSPACE:
4079 if (LOC)
4080 ANYOF_CLASS_SET(ret, ANYOF_NSPACE);
4081 else {
4082 for (value = 0; value < 256; value++)
4083 if (!isSPACE(value))
4084 ANYOF_BITMAP_SET(ret, value);
4085 }
3965bf69
NC
4086 yesno = '!';
4087 what = "SpacePerl";
73b437c8
JH
4088 break;
4089 case ANYOF_UPPER:
4090 if (LOC)
936ed897 4091 ANYOF_CLASS_SET(ret, ANYOF_UPPER);
73b437c8
JH
4092 else {
4093 for (value = 0; value < 256; value++)
4094 if (isUPPER(value))
936ed897 4095 ANYOF_BITMAP_SET(ret, value);
73b437c8 4096 }
3965bf69
NC
4097 yesno = '+';
4098 what = "Upper";
73b437c8
JH
4099 break;
4100 case ANYOF_NUPPER:
4101 if (LOC)
936ed897 4102 ANYOF_CLASS_SET(ret, ANYOF_NUPPER);
73b437c8
JH
4103 else {
4104 for (value = 0; value < 256; value++)
4105 if (!isUPPER(value))
936ed897 4106 ANYOF_BITMAP_SET(ret, value);
73b437c8 4107 }
3965bf69
NC
4108 yesno = '!';
4109 what = "Upper";
73b437c8
JH
4110 break;
4111 case ANYOF_XDIGIT:
4112 if (LOC)
936ed897 4113 ANYOF_CLASS_SET(ret, ANYOF_XDIGIT);
73b437c8
JH
4114 else {
4115 for (value = 0; value < 256; value++)
4116 if (isXDIGIT(value))
936ed897 4117 ANYOF_BITMAP_SET(ret, value);
73b437c8 4118 }
3965bf69
NC
4119 yesno = '+';
4120 what = "XDigit";
73b437c8
JH
4121 break;
4122 case ANYOF_NXDIGIT:
4123 if (LOC)
936ed897 4124 ANYOF_CLASS_SET(ret, ANYOF_NXDIGIT);
73b437c8
JH
4125 else {
4126 for (value = 0; value < 256; value++)
4127 if (!isXDIGIT(value))
936ed897 4128 ANYOF_BITMAP_SET(ret, value);
73b437c8 4129 }
3965bf69
NC
4130 yesno = '!';
4131 what = "XDigit";
73b437c8 4132 break;
fd826e6e
JP
4133 case ANYOF_MAX:
4134 /* this is to handle \p and \P */
4135 break;
73b437c8 4136 default:
b45f050a 4137 vFAIL("Invalid [::] class");
73b437c8 4138 break;
b8c5462f 4139 }
3965bf69
NC
4140 if (what) {
4141 /* Strings such as "+utf8::isWord\n" */
4142 Perl_sv_catpvf(aTHX_ listsv, "%cutf8::Is%s\n", yesno, what);
4143 }
b8c5462f 4144 if (LOC)
936ed897 4145 ANYOF_FLAGS(ret) |= ANYOF_CLASS;
73b437c8 4146 continue;
a0d0e21e 4147 }
ffc61ed2
JH
4148 } /* end of namedclass \blah */
4149
a0d0e21e 4150 if (range) {
eb160463 4151 if (prevvalue > (IV)value) /* b-a */ {
b45f050a 4152 Simple_vFAIL4("Invalid [] range \"%*.*s\"",
830247a4
IZ
4153 RExC_parse - rangebegin,
4154 RExC_parse - rangebegin,
b45f050a 4155 rangebegin);
3568d838 4156 range = 0; /* not a valid range */
73b437c8 4157 }
a0d0e21e
LW
4158 }
4159 else {
3568d838 4160 prevvalue = value; /* save the beginning of the range */
830247a4
IZ
4161 if (*RExC_parse == '-' && RExC_parse+1 < RExC_end &&
4162 RExC_parse[1] != ']') {
4163 RExC_parse++;
ffc61ed2
JH
4164
4165 /* a bad range like \w-, [:word:]- ? */
4166 if (namedclass > OOB_NAMEDCLASS) {
e476b1b5 4167 if (ckWARN(WARN_REGEXP))
830247a4 4168 vWARN4(RExC_parse,
b45f050a 4169 "False [] range \"%*.*s\"",
830247a4
IZ
4170 RExC_parse - rangebegin,
4171 RExC_parse - rangebegin,
b45f050a 4172 rangebegin);
73b437c8 4173 if (!SIZE_ONLY)
936ed897 4174 ANYOF_BITMAP_SET(ret, '-');
73b437c8 4175 } else
ffc61ed2
JH
4176 range = 1; /* yeah, it's a range! */
4177 continue; /* but do it the next time */
a0d0e21e 4178 }
a687059c 4179 }
ffc61ed2 4180
93733859 4181 /* now is the next time */
ae5c130c 4182 if (!SIZE_ONLY) {
3568d838
JH
4183 IV i;
4184
4185 if (prevvalue < 256) {
228fe6e6 4186 const IV ceilvalue = value < 256 ? value : 255;
3568d838
JH
4187
4188#ifdef EBCDIC
1b2d223b
JH
4189 /* In EBCDIC [\x89-\x91] should include
4190 * the \x8e but [i-j] should not. */
4191 if (literal_endpoint == 2 &&
4192 ((isLOWER(prevvalue) && isLOWER(ceilvalue)) ||
4193 (isUPPER(prevvalue) && isUPPER(ceilvalue))))
ffc61ed2 4194 {
3568d838
JH
4195 if (isLOWER(prevvalue)) {
4196 for (i = prevvalue; i <= ceilvalue; i++)
ffc61ed2
JH
4197 if (isLOWER(i))
4198 ANYOF_BITMAP_SET(ret, i);
4199 } else {
3568d838 4200 for (i = prevvalue; i <= ceilvalue; i++)
ffc61ed2
JH
4201 if (isUPPER(i))
4202 ANYOF_BITMAP_SET(ret, i);
4203 }
8ada0baa 4204 }
ffc61ed2 4205 else
8ada0baa 4206#endif
a5961de5
JH
4207 for (i = prevvalue; i <= ceilvalue; i++)
4208 ANYOF_BITMAP_SET(ret, i);
3568d838 4209 }
a5961de5 4210 if (value > 255 || UTF) {
228fe6e6
AL
4211 const UV prevnatvalue = NATIVE_TO_UNI(prevvalue);
4212 const UV natvalue = NATIVE_TO_UNI(value);
b08decb7 4213
ffc61ed2 4214 ANYOF_FLAGS(ret) |= ANYOF_UNICODE;
b08decb7 4215 if (prevnatvalue < natvalue) { /* what about > ? */
ffc61ed2 4216 Perl_sv_catpvf(aTHX_ listsv, "%04"UVxf"\t%04"UVxf"\n",
b08decb7
JH
4217 prevnatvalue, natvalue);
4218 }
4219 else if (prevnatvalue == natvalue) {
4220 Perl_sv_catpvf(aTHX_ listsv, "%04"UVxf"\n", natvalue);
09091399 4221 if (FOLD) {
a2a469f9 4222 U8 foldbuf[UTF8_MAXBYTES_CASE+1];
254ba52a 4223 STRLEN foldlen;
228fe6e6 4224 const UV f = to_uni_fold(natvalue, foldbuf, &foldlen);
254ba52a 4225
c840d2a2
JH
4226 /* If folding and foldable and a single
4227 * character, insert also the folded version
4228 * to the charclass. */
9e55ce06 4229 if (f != value) {
eb160463 4230 if (foldlen == (STRLEN)UNISKIP(f))
9e55ce06
JH
4231 Perl_sv_catpvf(aTHX_ listsv,
4232 "%04"UVxf"\n", f);
4233 else {
4234 /* Any multicharacter foldings
4235 * require the following transform:
4236 * [ABCDEF] -> (?:[ABCabcDEFd]|pq|rst)
4237 * where E folds into "pq" and F folds
4238 * into "rst", all other characters
4239 * fold to single characters. We save
4240 * away these multicharacter foldings,
4241 * to be later saved as part of the
4242 * additional "s" data. */
4243 SV *sv;
4244
4245 if (!unicode_alternate)
4246 unicode_alternate = newAV();
4247 sv = newSVpvn((char*)foldbuf, foldlen);
4248 SvUTF8_on(sv);
4249 av_push(unicode_alternate, sv);
4250 }
4251 }
254ba52a 4252
60a8b682
JH
4253 /* If folding and the value is one of the Greek
4254 * sigmas insert a few more sigmas to make the
4255 * folding rules of the sigmas to work right.
4256 * Note that not all the possible combinations
4257 * are handled here: some of them are handled
9e55ce06
JH
4258 * by the standard folding rules, and some of
4259 * them (literal or EXACTF cases) are handled
4260 * during runtime in regexec.c:S_find_byclass(). */
09091399
JH
4261 if (value == UNICODE_GREEK_SMALL_LETTER_FINAL_SIGMA) {
4262 Perl_sv_catpvf(aTHX_ listsv, "%04"UVxf"\n",
eb050b28 4263 (UV)UNICODE_GREEK_CAPITAL_LETTER_SIGMA);
09091399 4264 Perl_sv_catpvf(aTHX_ listsv, "%04"UVxf"\n",
eb050b28 4265 (UV)UNICODE_GREEK_SMALL_LETTER_SIGMA);
09091399
JH
4266 }
4267 else if (value == UNICODE_GREEK_CAPITAL_LETTER_SIGMA)
4268 Perl_sv_catpvf(aTHX_ listsv, "%04"UVxf"\n",
eb050b28 4269 (UV)UNICODE_GREEK_SMALL_LETTER_SIGMA);
09091399
JH
4270 }
4271 }
ffc61ed2 4272 }
1b2d223b
JH
4273#ifdef EBCDIC
4274 literal_endpoint = 0;
4275#endif
8ada0baa 4276 }
ffc61ed2
JH
4277
4278 range = 0; /* this range (if it was one) is done now */
a0d0e21e 4279 }
ffc61ed2 4280
936ed897 4281 if (need_class) {
4f66b38d 4282 ANYOF_FLAGS(ret) |= ANYOF_LARGE;
936ed897 4283 if (SIZE_ONLY)
830247a4 4284 RExC_size += ANYOF_CLASS_ADD_SKIP;
936ed897 4285 else
830247a4 4286 RExC_emit += ANYOF_CLASS_ADD_SKIP;
936ed897 4287 }
ffc61ed2 4288
ae5c130c 4289 /* optimize case-insensitive simple patterns (e.g. /[a-z]/i) */
b8c5462f 4290 if (!SIZE_ONLY &&
ffc61ed2 4291 /* If the only flag is folding (plus possibly inversion). */
516a5887
JH
4292 ((ANYOF_FLAGS(ret) & (ANYOF_FLAGS_ALL ^ ANYOF_INVERT)) == ANYOF_FOLD)
4293 ) {
a0ed51b3 4294 for (value = 0; value < 256; ++value) {
936ed897 4295 if (ANYOF_BITMAP_TEST(ret, value)) {
eb160463 4296 UV fold = PL_fold[value];
ffc61ed2
JH
4297
4298 if (fold != value)
4299 ANYOF_BITMAP_SET(ret, fold);
ae5c130c
GS
4300 }
4301 }
936ed897 4302 ANYOF_FLAGS(ret) &= ~ANYOF_FOLD;
ae5c130c 4303 }
ffc61ed2 4304
ae5c130c 4305 /* optimize inverted simple patterns (e.g. [^a-z]) */
3568d838 4306 if (!SIZE_ONLY && optimize_invert &&
ffc61ed2
JH
4307 /* If the only flag is inversion. */
4308 (ANYOF_FLAGS(ret) & ANYOF_FLAGS_ALL) == ANYOF_INVERT) {
b8c5462f 4309 for (value = 0; value < ANYOF_BITMAP_SIZE; ++value)
936ed897 4310 ANYOF_BITMAP(ret)[value] ^= ANYOF_FLAGS_ALL;
1aa99e6b 4311 ANYOF_FLAGS(ret) = ANYOF_UNICODE_ALL;
ae5c130c 4312 }
a0d0e21e 4313
b81d288d 4314 if (!SIZE_ONLY) {
fde631ed 4315 AV *av = newAV();
ffc61ed2
JH
4316 SV *rv;
4317
9e55ce06 4318 /* The 0th element stores the character class description
5b7ea690 4319 * in its textual form: used later (regexec.c:Perl_regclass_swash())
9e55ce06
JH
4320 * to initialize the appropriate swash (which gets stored in
4321 * the 1st element), and also useful for dumping the regnode.
4322 * The 2nd element stores the multicharacter foldings,
5b7ea690 4323 * used later (regexec.c:S_reginclass()). */
ffc61ed2
JH
4324 av_store(av, 0, listsv);
4325 av_store(av, 1, NULL);
9e55ce06 4326 av_store(av, 2, (SV*)unicode_alternate);
ffc61ed2 4327 rv = newRV_noinc((SV*)av);
19860706 4328 n = add_data(pRExC_state, 1, "s");
830247a4 4329 RExC_rx->data->data[n] = (void*)rv;
ffc61ed2 4330 ARG_SET(ret, n);
a0ed51b3
LW
4331 }
4332
4333 return ret;
4334}
4335
76e3520e 4336STATIC char*
830247a4 4337S_nextchar(pTHX_ RExC_state_t *pRExC_state)
a0d0e21e 4338{
830247a4 4339 char* retval = RExC_parse++;
a0d0e21e 4340
4633a7c4 4341 for (;;) {
830247a4
IZ
4342 if (*RExC_parse == '(' && RExC_parse[1] == '?' &&
4343 RExC_parse[2] == '#') {
a00160b9
AE
4344 while (*RExC_parse != ')') {
4345 if (RExC_parse == RExC_end)
4346 FAIL("Sequence (?#... not terminated");
830247a4 4347 RExC_parse++;
a00160b9 4348 }
830247a4 4349 RExC_parse++;
4633a7c4
LW
4350 continue;
4351 }
e2509266 4352 if (RExC_flags & PMf_EXTENDED) {
830247a4
IZ
4353 if (isSPACE(*RExC_parse)) {
4354 RExC_parse++;
748a9306
LW
4355 continue;
4356 }
830247a4 4357 else if (*RExC_parse == '#') {
a00160b9
AE
4358 while (RExC_parse < RExC_end)
4359 if (*RExC_parse++ == '\n') break;
748a9306
LW
4360 continue;
4361 }
748a9306 4362 }
4633a7c4 4363 return retval;
a0d0e21e 4364 }
a687059c
LW
4365}
4366
4367/*
c277df42 4368- reg_node - emit a node
a0d0e21e 4369*/
76e3520e 4370STATIC regnode * /* Location. */
830247a4 4371S_reg_node(pTHX_ RExC_state_t *pRExC_state, U8 op)
a687059c 4372{
c277df42 4373 register regnode *ptr;
547d29e4 4374 regnode * const ret = RExC_emit;
a687059c 4375
c277df42 4376 if (SIZE_ONLY) {
830247a4
IZ
4377 SIZE_ALIGN(RExC_size);
4378 RExC_size += 1;
a0d0e21e
LW
4379 return(ret);
4380 }
a687059c 4381
c277df42 4382 NODE_ALIGN_FILL(ret);
a0d0e21e 4383 ptr = ret;
c277df42 4384 FILL_ADVANCE_NODE(ptr, op);
fac92740 4385 if (RExC_offsets) { /* MJD */
43af864e 4386 MJD_OFFSET_DEBUG(("%s:%u: (op %s) %s %u <- %u (len %u) (max %u).\n",
fac92740
MJD
4387 "reg_node", __LINE__,
4388 reg_name[op],
4389 RExC_emit - RExC_emit_start > RExC_offsets[0]
4390 ? "Overwriting end of array!\n" : "OK",
4391 RExC_emit - RExC_emit_start,
4392 RExC_parse - RExC_start,
4393 RExC_offsets[0]));
43af864e 4394 Set_Node_Offset(RExC_emit, RExC_parse + (op == END));
fac92740
MJD
4395 }
4396
830247a4 4397 RExC_emit = ptr;
a687059c 4398
a0d0e21e 4399 return(ret);
a687059c
LW
4400}
4401
4402/*
a0d0e21e
LW
4403- reganode - emit a node with an argument
4404*/
76e3520e 4405STATIC regnode * /* Location. */
830247a4 4406S_reganode(pTHX_ RExC_state_t *pRExC_state, U8 op, U32 arg)
fe14fcc3 4407{
c277df42 4408 register regnode *ptr;
547d29e4 4409 regnode * const ret = RExC_emit;
fe14fcc3 4410
c277df42 4411 if (SIZE_ONLY) {
830247a4
IZ
4412 SIZE_ALIGN(RExC_size);
4413 RExC_size += 2;
a0d0e21e
LW
4414 return(ret);
4415 }
fe14fcc3 4416
c277df42 4417 NODE_ALIGN_FILL(ret);
a0d0e21e 4418 ptr = ret;
c277df42 4419 FILL_ADVANCE_NODE_ARG(ptr, op, arg);
fac92740 4420 if (RExC_offsets) { /* MJD */
43af864e 4421 MJD_OFFSET_DEBUG(("%s(%d): (op %s) %s %u <- %u (max %u).\n",
fac92740 4422 "reganode",
43af864e
JH
4423 __LINE__,
4424 reg_name[op],
fac92740
MJD
4425 RExC_emit - RExC_emit_start > RExC_offsets[0] ?
4426 "Overwriting end of array!\n" : "OK",
4427 RExC_emit - RExC_emit_start,
4428 RExC_parse - RExC_start,
4429 RExC_offsets[0]));
43af864e 4430 Set_Cur_Node_Offset;
fac92740
MJD
4431 }
4432
830247a4 4433 RExC_emit = ptr;
fe14fcc3 4434
a0d0e21e 4435 return(ret);
fe14fcc3
LW
4436}
4437
4438/*
cd439c50 4439- reguni - emit (if appropriate) a Unicode character
a0ed51b3
LW
4440*/
4441STATIC void
e4057cfc 4442S_reguni(pTHX_ const RExC_state_t *pRExC_state, UV uv, char* s, STRLEN* lenp)
a0ed51b3 4443{
5e12f4fb 4444 *lenp = SIZE_ONLY ? UNISKIP(uv) : (uvchr_to_utf8((U8*)s, uv) - (U8*)s);
a0ed51b3
LW
4445}
4446
4447/*
a0d0e21e
LW
4448- reginsert - insert an operator in front of already-emitted operand
4449*
4450* Means relocating the operand.
4451*/
76e3520e 4452STATIC void
830247a4 4453S_reginsert(pTHX_ RExC_state_t *pRExC_state, U8 op, regnode *opnd)
a687059c 4454{
c277df42
IZ
4455 register regnode *src;
4456 register regnode *dst;
4457 register regnode *place;
547d29e4 4458 const int offset = regarglen[(U8)op];
b81d288d 4459
22c35a8c 4460/* (PL_regkind[(U8)op] == CURLY ? EXTRA_STEP_2ARGS : 0); */
c277df42
IZ
4461
4462 if (SIZE_ONLY) {
830247a4 4463 RExC_size += NODE_STEP_REGNODE + offset;
a0d0e21e
LW
4464 return;
4465 }
a687059c 4466
830247a4
IZ
4467 src = RExC_emit;
4468 RExC_emit += NODE_STEP_REGNODE + offset;
4469 dst = RExC_emit;
fac92740 4470 while (src > opnd) {
c277df42 4471 StructCopy(--src, --dst, regnode);
fac92740 4472 if (RExC_offsets) { /* MJD 20010112 */
43af864e 4473 MJD_OFFSET_DEBUG(("%s(%d): (op %s) %s copy %u -> %u (max %u).\n",
fac92740 4474 "reg_insert",
43af864e
JH
4475 __LINE__,
4476 reg_name[op],
fac92740
MJD
4477 dst - RExC_emit_start > RExC_offsets[0]
4478 ? "Overwriting end of array!\n" : "OK",
4479 src - RExC_emit_start,
4480 dst - RExC_emit_start,
4481 RExC_offsets[0]));
43af864e
JH
4482 Set_Node_Offset_To_R(dst-RExC_emit_start, Node_Offset(src));
4483 Set_Node_Length_To_R(dst-RExC_emit_start, Node_Length(src));
fac92740
MJD
4484 }
4485 }
4486
a0d0e21e
LW
4487
4488 place = opnd; /* Op node, where operand used to be. */
fac92740 4489 if (RExC_offsets) { /* MJD */
43af864e 4490 MJD_OFFSET_DEBUG(("%s(%d): (op %s) %s %u <- %u (max %u).\n",
fac92740 4491 "reginsert",
43af864e
JH
4492 __LINE__,
4493 reg_name[op],
fac92740
MJD
4494 place - RExC_emit_start > RExC_offsets[0]
4495 ? "Overwriting end of array!\n" : "OK",
4496 place - RExC_emit_start,
4497 RExC_parse - RExC_start,
4498 RExC_offsets[0]));
43af864e 4499 Set_Node_Offset(place, RExC_parse);
4a04c497 4500 Set_Node_Length(place, 1);
fac92740 4501 }
c277df42
IZ
4502 src = NEXTOPER(place);
4503 FILL_ADVANCE_NODE(place, op);
4504 Zero(src, offset, regnode);
a687059c
LW
4505}
4506
4507/*
c277df42 4508- regtail - set the next-pointer at the end of a node chain of p to val.
a0d0e21e 4509*/
76e3520e 4510STATIC void
830247a4 4511S_regtail(pTHX_ RExC_state_t *pRExC_state, regnode *p, regnode *val)
a687059c 4512{
c277df42 4513 register regnode *scan;
a0d0e21e 4514
c277df42 4515 if (SIZE_ONLY)
a0d0e21e
LW
4516 return;
4517
4518 /* Find last node. */
4519 scan = p;
4520 for (;;) {
547d29e4 4521 regnode * const temp = regnext(scan);
a0d0e21e
LW
4522 if (temp == NULL)
4523 break;
4524 scan = temp;
4525 }
a687059c 4526
c277df42
IZ
4527 if (reg_off_by_arg[OP(scan)]) {
4528 ARG_SET(scan, val - scan);
a0ed51b3
LW
4529 }
4530 else {
c277df42
IZ
4531 NEXT_OFF(scan) = val - scan;
4532 }
a687059c
LW
4533}
4534
4535/*
a0d0e21e
LW
4536- regoptail - regtail on operand of first argument; nop if operandless
4537*/
76e3520e 4538STATIC void
830247a4 4539S_regoptail(pTHX_ RExC_state_t *pRExC_state, regnode *p, regnode *val)
a687059c 4540{
a0d0e21e 4541 /* "Operandless" and "op != BRANCH" are synonymous in practice. */
c277df42
IZ
4542 if (p == NULL || SIZE_ONLY)
4543 return;
22c35a8c 4544 if (PL_regkind[(U8)OP(p)] == BRANCH) {
830247a4 4545 regtail(pRExC_state, NEXTOPER(p), val);
a0ed51b3 4546 }
22c35a8c 4547 else if ( PL_regkind[(U8)OP(p)] == BRANCHJ) {
830247a4 4548 regtail(pRExC_state, NEXTOPER(NEXTOPER(p)), val);
a0ed51b3
LW
4549 }
4550 else
a0d0e21e 4551 return;
a687059c
LW
4552}
4553
4554/*
4555 - regcurly - a little FSA that accepts {\d+,?\d*}
4556 */
79072805 4557STATIC I32
8c18bf38 4558S_regcurly(pTHX_ register const char *s)
a687059c
LW
4559{
4560 if (*s++ != '{')
4561 return FALSE;
f0fcb552 4562 if (!isDIGIT(*s))
a687059c 4563 return FALSE;
f0fcb552 4564 while (isDIGIT(*s))
a687059c
LW
4565 s++;
4566 if (*s == ',')
4567 s++;
f0fcb552 4568 while (isDIGIT(*s))
a687059c
LW
4569 s++;
4570 if (*s != '}')
4571 return FALSE;
4572 return TRUE;
4573}
4574
a687059c 4575/*
fd181c75 4576 - regdump - dump a regexp onto Perl_debug_log in vaguely comprehensible form
a687059c
LW
4577 */
4578void
864dbfa3 4579Perl_regdump(pTHX_ regexp *r)
a687059c 4580{
35ff7856 4581#ifdef DEBUGGING
46fc3d4c 4582 SV *sv = sv_newmortal();
a687059c 4583
c277df42 4584 (void)dumpuntil(r->program, r->program + 1, NULL, sv, 0);
a0d0e21e
LW
4585
4586 /* Header fields of interest. */
c277df42 4587 if (r->anchored_substr)
7b0972df 4588 PerlIO_printf(Perl_debug_log,
5332c881 4589 "anchored \"%s%.*s%s\"%s at %"IVdf" ",
3280af22 4590 PL_colors[0],
7b0972df 4591 (int)(SvCUR(r->anchored_substr) - (SvTAIL(r->anchored_substr)!=0)),
fdac8c4b 4592 SvPVX_const(r->anchored_substr),
3280af22 4593 PL_colors[1],
c277df42 4594 SvTAIL(r->anchored_substr) ? "$" : "",
7b0972df 4595 (IV)r->anchored_offset);
33b8afdf
JH
4596 else if (r->anchored_utf8)
4597 PerlIO_printf(Perl_debug_log,
5332c881 4598 "anchored utf8 \"%s%.*s%s\"%s at %"IVdf" ",
33b8afdf
JH
4599 PL_colors[0],
4600 (int)(SvCUR(r->anchored_utf8) - (SvTAIL(r->anchored_utf8)!=0)),
fdac8c4b 4601 SvPVX_const(r->anchored_utf8),
33b8afdf
JH
4602 PL_colors[1],
4603 SvTAIL(r->anchored_utf8) ? "$" : "",
4604 (IV)r->anchored_offset);
c277df42 4605 if (r->float_substr)
7b0972df 4606 PerlIO_printf(Perl_debug_log,
5332c881 4607 "floating \"%s%.*s%s\"%s at %"IVdf"..%"UVuf" ",
3280af22 4608 PL_colors[0],
b81d288d 4609 (int)(SvCUR(r->float_substr) - (SvTAIL(r->float_substr)!=0)),
fdac8c4b 4610 SvPVX_const(r->float_substr),
3280af22 4611 PL_colors[1],
c277df42 4612 SvTAIL(r->float_substr) ? "$" : "",
7b0972df 4613 (IV)r->float_min_offset, (UV)r->float_max_offset);
33b8afdf
JH
4614 else if (r->float_utf8)
4615 PerlIO_printf(Perl_debug_log,
5332c881 4616 "floating utf8 \"%s%.*s%s\"%s at %"IVdf"..%"UVuf" ",
33b8afdf
JH
4617 PL_colors[0],
4618 (int)(SvCUR(r->float_utf8) - (SvTAIL(r->float_utf8)!=0)),
fdac8c4b 4619 SvPVX_const(r->float_utf8),
33b8afdf
JH
4620 PL_colors[1],
4621 SvTAIL(r->float_utf8) ? "$" : "",
4622 (IV)r->float_min_offset, (UV)r->float_max_offset);
4623 if (r->check_substr || r->check_utf8)
b81d288d
AB
4624 PerlIO_printf(Perl_debug_log,
4625 r->check_substr == r->float_substr
33b8afdf 4626 && r->check_utf8 == r->float_utf8
c277df42
IZ
4627 ? "(checking floating" : "(checking anchored");
4628 if (r->reganch & ROPT_NOSCAN)
4629 PerlIO_printf(Perl_debug_log, " noscan");
4630 if (r->reganch & ROPT_CHECK_ALL)
4631 PerlIO_printf(Perl_debug_log, " isall");
33b8afdf 4632 if (r->check_substr || r->check_utf8)
c277df42
IZ
4633 PerlIO_printf(Perl_debug_log, ") ");
4634
46fc3d4c 4635 if (r->regstclass) {
4636 regprop(sv, r->regstclass);
fdac8c4b 4637 PerlIO_printf(Perl_debug_log, "stclass \"%s\" ", SvPVX_const(sv));
46fc3d4c 4638 }
774d564b 4639 if (r->reganch & ROPT_ANCH) {
4640 PerlIO_printf(Perl_debug_log, "anchored");
4641 if (r->reganch & ROPT_ANCH_BOL)
4642 PerlIO_printf(Perl_debug_log, "(BOL)");
c277df42
IZ
4643 if (r->reganch & ROPT_ANCH_MBOL)
4644 PerlIO_printf(Perl_debug_log, "(MBOL)");
cad2e5aa
JH
4645 if (r->reganch & ROPT_ANCH_SBOL)
4646 PerlIO_printf(Perl_debug_log, "(SBOL)");
774d564b 4647 if (r->reganch & ROPT_ANCH_GPOS)
4648 PerlIO_printf(Perl_debug_log, "(GPOS)");
4649 PerlIO_putc(Perl_debug_log, ' ');
4650 }
c277df42
IZ
4651 if (r->reganch & ROPT_GPOS_SEEN)
4652 PerlIO_printf(Perl_debug_log, "GPOS ");
a0d0e21e 4653 if (r->reganch & ROPT_SKIP)
760ac839 4654 PerlIO_printf(Perl_debug_log, "plus ");
a0d0e21e 4655 if (r->reganch & ROPT_IMPLICIT)
760ac839 4656 PerlIO_printf(Perl_debug_log, "implicit ");
760ac839 4657 PerlIO_printf(Perl_debug_log, "minlen %ld ", (long) r->minlen);
ce862d02
IZ
4658 if (r->reganch & ROPT_EVAL_SEEN)
4659 PerlIO_printf(Perl_debug_log, "with eval ");
760ac839 4660 PerlIO_printf(Perl_debug_log, "\n");
fac92740
MJD
4661 if (r->offsets) {
4662 U32 i;
c05e0e2f 4663 const U32 len = r->offsets[0];
392fbf5d 4664 PerlIO_printf(Perl_debug_log, "Offsets: [%"UVuf"]\n\t", (UV)r->offsets[0]);
fac92740 4665 for (i = 1; i <= len; i++)
392fbf5d
RB
4666 PerlIO_printf(Perl_debug_log, "%"UVuf"[%"UVuf"] ",
4667 (UV)r->offsets[i*2-1],
4668 (UV)r->offsets[i*2]);
fac92740
MJD
4669 PerlIO_printf(Perl_debug_log, "\n");
4670 }
17c3b450 4671#endif /* DEBUGGING */
a687059c
LW
4672}
4673
4674/*
a0d0e21e
LW
4675- regprop - printable representation of opcode
4676*/
46fc3d4c 4677void
864dbfa3 4678Perl_regprop(pTHX_ SV *sv, regnode *o)
a687059c 4679{
35ff7856 4680#ifdef DEBUGGING
9b155405 4681 register int k;
a0d0e21e 4682
54dc92de 4683 sv_setpvn(sv, "", 0);
9b155405 4684 if (OP(o) >= reg_num) /* regnode.type is unsigned */
830247a4
IZ
4685 /* It would be nice to FAIL() here, but this may be called from
4686 regexec.c, and it would be hard to supply pRExC_state. */
4687 Perl_croak(aTHX_ "Corrupted regexp opcode");
a00f3e00 4688 sv_catpv(sv, reg_name[OP(o)]); /* Take off const! */
9b155405
IZ
4689
4690 k = PL_regkind[(U8)OP(o)];
4691
2a782b5b 4692 if (k == EXACT) {
339a2a6a 4693 SV * const dsv = sv_2mortal(newSVpvn("", 0));
c728cb41
JH
4694 /* Using is_utf8_string() is a crude hack but it may
4695 * be the best for now since we have no flag "this EXACTish
4696 * node was UTF-8" --jhi */
228fe6e6 4697 const bool do_utf8 = is_utf8_string((U8*)STRING(o), STR_LEN(o));
339a2a6a 4698 const char * const s = do_utf8 ?
c728cb41
JH
4699 pv_uni_display(dsv, (U8*)STRING(o), STR_LEN(o), 60,
4700 UNI_DISPLAY_REGEX) :
2a782b5b 4701 STRING(o);
c05e0e2f 4702 const int len = do_utf8 ?
2a782b5b
JH
4703 strlen(s) :
4704 STR_LEN(o);
4705 Perl_sv_catpvf(aTHX_ sv, " <%s%.*s%s>",
4706 PL_colors[0],
4707 len, s,
4708 PL_colors[1]);
4709 }
9b155405 4710 else if (k == CURLY) {
cb434fcc 4711 if (OP(o) == CURLYM || OP(o) == CURLYN || OP(o) == CURLYX)
cea2e8a9
GS
4712 Perl_sv_catpvf(aTHX_ sv, "[%d]", o->flags); /* Parenth number */
4713 Perl_sv_catpvf(aTHX_ sv, " {%d,%d}", ARG1(o), ARG2(o));
a0d0e21e 4714 }
2c2d71f5
JH
4715 else if (k == WHILEM && o->flags) /* Ordinal/of */
4716 Perl_sv_catpvf(aTHX_ sv, "[%d/%d]", o->flags & 0xf, o->flags>>4);
9b155405 4717 else if (k == REF || k == OPEN || k == CLOSE || k == GROUPP )
894356b3 4718 Perl_sv_catpvf(aTHX_ sv, "%d", (int)ARG(o)); /* Parenth number */
9b155405 4719 else if (k == LOGICAL)
04ebc1ab 4720 Perl_sv_catpvf(aTHX_ sv, "[%d]", o->flags); /* 2: embedded, otherwise 1 */
653099ff
GS
4721 else if (k == ANYOF) {
4722 int i, rangestart = -1;
339a2a6a 4723 const U8 flags = ANYOF_FLAGS(o);
654c77f7 4724 const char * const anyofs[] = { /* Should be synchronized with
19860706 4725 * ANYOF_ #xdefines in regcomp.h */
653099ff
GS
4726 "\\w",
4727 "\\W",
4728 "\\s",
4729 "\\S",
4730 "\\d",
4731 "\\D",
4732 "[:alnum:]",
4733 "[:^alnum:]",
4734 "[:alpha:]",
4735 "[:^alpha:]",
4736 "[:ascii:]",
4737 "[:^ascii:]",
4738 "[:ctrl:]",
4739 "[:^ctrl:]",
4740 "[:graph:]",
4741 "[:^graph:]",
4742 "[:lower:]",
4743 "[:^lower:]",
4744 "[:print:]",
4745 "[:^print:]",
4746 "[:punct:]",
4747 "[:^punct:]",
4748 "[:upper:]",
aaa51d5e 4749 "[:^upper:]",
653099ff 4750 "[:xdigit:]",
aaa51d5e
JF
4751 "[:^xdigit:]",
4752 "[:space:]",
4753 "[:^space:]",
4754 "[:blank:]",
4755 "[:^blank:]"
653099ff
GS
4756 };
4757
19860706 4758 if (flags & ANYOF_LOCALE)
653099ff 4759 sv_catpv(sv, "{loc}");
19860706 4760 if (flags & ANYOF_FOLD)
653099ff
GS
4761 sv_catpv(sv, "{i}");
4762 Perl_sv_catpvf(aTHX_ sv, "[%s", PL_colors[0]);
19860706 4763 if (flags & ANYOF_INVERT)
653099ff 4764 sv_catpv(sv, "^");
ffc61ed2
JH
4765 for (i = 0; i <= 256; i++) {
4766 if (i < 256 && ANYOF_BITMAP_TEST(o,i)) {
4767 if (rangestart == -1)
4768 rangestart = i;
4769 } else if (rangestart != -1) {
4770 if (i <= rangestart + 3)
4771 for (; rangestart < i; rangestart++)
653099ff 4772 put_byte(sv, rangestart);
ffc61ed2
JH
4773 else {
4774 put_byte(sv, rangestart);
4775 sv_catpv(sv, "-");
4776 put_byte(sv, i - 1);
653099ff 4777 }
ffc61ed2 4778 rangestart = -1;
653099ff 4779 }
847a199f 4780 }
ffc61ed2
JH
4781
4782 if (o->flags & ANYOF_CLASS)
4783 for (i = 0; i < sizeof(anyofs)/sizeof(char*); i++)
4784 if (ANYOF_CLASS_TEST(o,i))
4785 sv_catpv(sv, anyofs[i]);
4786
4787 if (flags & ANYOF_UNICODE)
4788 sv_catpv(sv, "{unicode}");
1aa99e6b 4789 else if (flags & ANYOF_UNICODE_ALL)
2a782b5b 4790 sv_catpv(sv, "{unicode_all}");
ffc61ed2
JH
4791
4792 {
4793 SV *lv;
339a2a6a 4794 SV * const sw = regclass_swash(o, FALSE, &lv, 0);
b81d288d 4795
ffc61ed2
JH
4796 if (lv) {
4797 if (sw) {
a2a469f9 4798 U8 s[UTF8_MAXBYTES_CASE+1];
b81d288d 4799
ffc61ed2 4800 for (i = 0; i <= 256; i++) { /* just the first 256 */
228fe6e6 4801 uvchr_to_utf8(s, i);
ffc61ed2 4802
3568d838 4803 if (i < 256 && swash_fetch(sw, s, TRUE)) {
ffc61ed2
JH
4804 if (rangestart == -1)
4805 rangestart = i;
4806 } else if (rangestart != -1) {
ffc61ed2
JH
4807 if (i <= rangestart + 3)
4808 for (; rangestart < i; rangestart++) {
339a2a6a
AL
4809 const U8 * const e = uvchr_to_utf8(s,rangestart);
4810 U8 *p;
4811 for(p = s; p < e; p++)
ffc61ed2
JH
4812 put_byte(sv, *p);
4813 }
4814 else {
339a2a6a
AL
4815 const U8 *e = uvchr_to_utf8(s,rangestart);
4816 U8 *p;
4817 for (p = s; p < e; p++)
ffc61ed2 4818 put_byte(sv, *p);
339a2a6a
AL
4819 sv_catpvn(sv, "-", 1);
4820 e = uvchr_to_utf8(s, i-1);
4821 for (p = s; p < e; p++)
228fe6e6 4822 put_byte(sv, *p);
ffc61ed2
JH
4823 }
4824 rangestart = -1;
4825 }
19860706 4826 }
ffc61ed2
JH
4827
4828 sv_catpv(sv, "..."); /* et cetera */
19860706 4829 }
fde631ed 4830
ffc61ed2 4831 {
04851bb3 4832 char *s = savesvpv(lv);
ffc61ed2 4833 char *origs = s;
b81d288d 4834
ffc61ed2 4835 while(*s && *s != '\n') s++;
b81d288d 4836
ffc61ed2 4837 if (*s == '\n') {
339a2a6a 4838 const char * const t = ++s;
ffc61ed2
JH
4839
4840 while (*s) {
4841 if (*s == '\n')
4842 *s = ' ';
4843 s++;
4844 }
4845 if (s[-1] == ' ')
4846 s[-1] = 0;
4847
4848 sv_catpv(sv, t);
fde631ed 4849 }
b81d288d 4850
ffc61ed2 4851 Safefree(origs);
fde631ed
JH
4852 }
4853 }
653099ff 4854 }
ffc61ed2 4855
653099ff
GS
4856 Perl_sv_catpvf(aTHX_ sv, "%s]", PL_colors[1]);
4857 }
9b155405 4858 else if (k == BRANCHJ && (OP(o) == UNLESSM || OP(o) == IFMATCH))
cea2e8a9 4859 Perl_sv_catpvf(aTHX_ sv, "[-%d]", o->flags);
17c3b450 4860#endif /* DEBUGGING */
35ff7856 4861}
a687059c 4862
cad2e5aa
JH
4863SV *
4864Perl_re_intuit_string(pTHX_ regexp *prog)
4865{ /* Assume that RE_INTUIT is set */
4866 DEBUG_r(
71a0dd65 4867 {
339a2a6a 4868 const char * const s = SvPV_nolen_const(prog->check_substr
71a0dd65 4869 ? prog->check_substr : prog->check_utf8);
cad2e5aa
JH
4870
4871 if (!PL_colorset) reginitcolors();
4872 PerlIO_printf(Perl_debug_log,
5332c881 4873 "%sUsing REx %ssubstr:%s \"%s%.60s%s%s\"\n",
33b8afdf
JH
4874 PL_colors[4],
4875 prog->check_substr ? "" : "utf8 ",
4876 PL_colors[5],PL_colors[0],
cad2e5aa
JH
4877 s,
4878 PL_colors[1],
4879 (strlen(s) > 60 ? "..." : ""));
4880 } );
4881
33b8afdf 4882 return prog->check_substr ? prog->check_substr : prog->check_utf8;
cad2e5aa
JH
4883}
4884
2b69d0c2 4885void
864dbfa3 4886Perl_pregfree(pTHX_ struct regexp *r)
a687059c 4887{
9e55ce06
JH
4888#ifdef DEBUGGING
4889 SV *dsv = PERL_DEBUG_PAD_ZERO(0);
4890#endif
7821416a
IZ
4891
4892 if (!r || (--r->refcnt > 0))
4893 return;
9e55ce06 4894 DEBUG_r({
c05e0e2f
AL
4895 const char *s = (r->reganch & ROPT_UTF8)
4896 ? pv_uni_display(dsv, (U8*)r->precomp, r->prelen, 60, UNI_DISPLAY_REGEX)
9f369894 4897 : pv_display(dsv, r->precomp, r->prelen, 0, 60);
c05e0e2f 4898 const int len = SvCUR(dsv);
9e55ce06
JH
4899 if (!PL_colorset)
4900 reginitcolors();
4901 PerlIO_printf(Perl_debug_log,
4902 "%sFreeing REx:%s `%s%*.*s%s%s'\n",
4903 PL_colors[4],PL_colors[5],PL_colors[0],
4904 len, len, s,
4905 PL_colors[1],
4906 len > 60 ? "..." : "");
4907 });
cad2e5aa 4908
4c58c75a
NC
4909 /* gcov results gave these as non-null 100% of the time, so there's no
4910 optimisation in checking them before calling Safefree */
4911 Safefree(r->precomp);
4912 Safefree(r->offsets); /* 20010421 MJD */
cf93c79d
IZ
4913 if (RX_MATCH_COPIED(r))
4914 Safefree(r->subbeg);
a193d654
GS
4915 if (r->substrs) {
4916 if (r->anchored_substr)
4917 SvREFCNT_dec(r->anchored_substr);
33b8afdf
JH
4918 if (r->anchored_utf8)
4919 SvREFCNT_dec(r->anchored_utf8);
a193d654
GS
4920 if (r->float_substr)
4921 SvREFCNT_dec(r->float_substr);
33b8afdf
JH
4922 if (r->float_utf8)
4923 SvREFCNT_dec(r->float_utf8);
2779dcf1 4924 Safefree(r->substrs);
a193d654 4925 }
c277df42
IZ
4926 if (r->data) {
4927 int n = r->data->count;
d7afa7f5
JH
4928 PAD* new_comppad = NULL;
4929 PAD* old_comppad;
46330ab1 4930 PADOFFSET refcnt;
dfad63ad 4931
c277df42 4932 while (--n >= 0) {
261faec3 4933 /* If you add a ->what type here, update the comment in regcomp.h */
c277df42
IZ
4934 switch (r->data->what[n]) {
4935 case 's':
4936 SvREFCNT_dec((SV*)r->data->data[n]);
4937 break;
653099ff
GS
4938 case 'f':
4939 Safefree(r->data->data[n]);
4940 break;
dfad63ad
HS
4941 case 'p':
4942 new_comppad = (AV*)r->data->data[n];
4943 break;
c277df42 4944 case 'o':
dfad63ad 4945 if (new_comppad == NULL)
cea2e8a9 4946 Perl_croak(aTHX_ "panic: pregfree comppad");
d7afa7f5
JH
4947 PAD_SAVE_LOCAL(old_comppad,
4948 /* Watch out for global destruction's random ordering. */
4949 (SvTYPE(new_comppad) == SVt_PVAV) ?
4950 new_comppad : Null(PAD *)
4951 );
46330ab1
NC
4952 OP_REFCNT_LOCK;
4953 refcnt = OpREFCNT_dec((OP_4tree*)r->data->data[n]);
4954 OP_REFCNT_UNLOCK;
4955 if (!refcnt)
9b978d73 4956 op_free((OP_4tree*)r->data->data[n]);
9b978d73 4957
d7afa7f5 4958 PAD_RESTORE_LOCAL(old_comppad);
dfad63ad
HS
4959 SvREFCNT_dec((SV*)new_comppad);
4960 new_comppad = NULL;
c277df42
IZ
4961 break;
4962 case 'n':
9e55ce06 4963 break;
c277df42 4964 default:
830247a4 4965 Perl_croak(aTHX_ "panic: regfree data code '%c'", r->data->what[n]);
c277df42
IZ
4966 }
4967 }
4968 Safefree(r->data->what);
4969 Safefree(r->data);
a0d0e21e
LW
4970 }
4971 Safefree(r->startp);
4972 Safefree(r->endp);
4973 Safefree(r);
a687059c 4974}
c277df42
IZ
4975
4976/*
4977 - regnext - dig the "next" pointer out of a node
4978 *
4979 * [Note, when REGALIGN is defined there are two places in regmatch()
4980 * that bypass this code for speed.]
4981 */
4982regnode *
864dbfa3 4983Perl_regnext(pTHX_ register regnode *p)
c277df42
IZ
4984{
4985 register I32 offset;
4986
3280af22 4987 if (p == &PL_regdummy)
c277df42
IZ
4988 return(NULL);
4989
4990 offset = (reg_off_by_arg[OP(p)] ? ARG(p) : NEXT_OFF(p));
4991 if (offset == 0)
4992 return(NULL);
4993
c277df42 4994 return(p+offset);
c277df42
IZ
4995}
4996
01f988be 4997STATIC void
cea2e8a9 4998S_re_croak2(pTHX_ const char* pat1,const char* pat2,...)
c277df42
IZ
4999{
5000 va_list args;
5001 STRLEN l1 = strlen(pat1);
5002 STRLEN l2 = strlen(pat2);
5003 char buf[512];
06bf62c7 5004 SV *msv;
e2b56717 5005 const char *message;
c277df42
IZ
5006
5007 if (l1 > 510)
5008 l1 = 510;
5009 if (l1 + l2 > 510)
5010 l2 = 510 - l1;
5011 Copy(pat1, buf, l1 , char);
5012 Copy(pat2, buf + l1, l2 , char);
3b818b81
GS
5013 buf[l1 + l2] = '\n';
5014 buf[l1 + l2 + 1] = '\0';
8736538c
AS
5015#ifdef I_STDARG
5016 /* ANSI variant takes additional second argument */
c277df42 5017 va_start(args, pat2);
8736538c
AS
5018#else
5019 va_start(args);
5020#endif
5a844595 5021 msv = vmess(buf, &args);
c277df42 5022 va_end(args);
71a0dd65 5023 message = SvPV_const(msv,l1);
c277df42
IZ
5024 if (l1 > 512)
5025 l1 = 512;
5026 Copy(message, buf, l1 , char);
b9bd2e23 5027 buf[l1-1] = '\0'; /* Overwrite \n */
cea2e8a9 5028 Perl_croak(aTHX_ "%s", buf);
c277df42 5029}
a0ed51b3
LW
5030
5031/* XXX Here's a total kludge. But we need to re-enter for swash routines. */
5032
5033void
864dbfa3 5034Perl_save_re_context(pTHX)
b81d288d 5035{
830247a4 5036 SAVEI32(PL_reg_flags); /* from regexec.c */
a0ed51b3 5037 SAVEPPTR(PL_bostr);
a0ed51b3
LW
5038 SAVEPPTR(PL_reginput); /* String-input pointer. */
5039 SAVEPPTR(PL_regbol); /* Beginning of input, for ^ check. */
5040 SAVEPPTR(PL_regeol); /* End of input, for $ check. */
7766f137
GS
5041 SAVEVPTR(PL_regstartp); /* Pointer to startp array. */
5042 SAVEVPTR(PL_regendp); /* Ditto for endp. */
5043 SAVEVPTR(PL_reglastparen); /* Similarly for lastparen. */
e26a9094 5044 SAVEVPTR(PL_reglastcloseparen); /* Similarly for lastcloseparen. */
a0ed51b3 5045 SAVEPPTR(PL_regtill); /* How far we are required to go. */
b81d288d 5046 SAVEGENERICPV(PL_reg_start_tmp); /* from regexec.c */
a0ed51b3 5047 PL_reg_start_tmp = 0;
a0ed51b3
LW
5048 SAVEI32(PL_reg_start_tmpl); /* from regexec.c */
5049 PL_reg_start_tmpl = 0;
7766f137 5050 SAVEVPTR(PL_regdata);
a0ed51b3
LW
5051 SAVEI32(PL_reg_eval_set); /* from regexec.c */
5052 SAVEI32(PL_regnarrate); /* from regexec.c */
7766f137 5053 SAVEVPTR(PL_regprogram); /* from regexec.c */
a0ed51b3 5054 SAVEINT(PL_regindent); /* from regexec.c */
7766f137
GS
5055 SAVEVPTR(PL_regcc); /* from regexec.c */
5056 SAVEVPTR(PL_curcop);
7766f137
GS
5057 SAVEVPTR(PL_reg_call_cc); /* from regexec.c */
5058 SAVEVPTR(PL_reg_re); /* from regexec.c */
54b6e2fa
IZ
5059 SAVEPPTR(PL_reg_ganch); /* from regexec.c */
5060 SAVESPTR(PL_reg_sv); /* from regexec.c */
4f4e7967 5061 SAVEBOOL(PL_reg_match_utf8); /* from regexec.c */
7766f137 5062 SAVEVPTR(PL_reg_magic); /* from regexec.c */
54b6e2fa 5063 SAVEI32(PL_reg_oldpos); /* from regexec.c */
7766f137
GS
5064 SAVEVPTR(PL_reg_oldcurpm); /* from regexec.c */
5065 SAVEVPTR(PL_reg_curpm); /* from regexec.c */
e26a9094
NC
5066 SAVEPPTR(PL_reg_oldsaved); /* old saved substr during match */
5067 PL_reg_oldsaved = Nullch;
5068 SAVEI32(PL_reg_oldsavedlen); /* old length of saved substr during match */
5069 PL_reg_oldsavedlen = 0;
5070 SAVEI32(PL_reg_maxiter); /* max wait until caching pos */
5071 PL_reg_maxiter = 0;
5072 SAVEI32(PL_reg_leftiter); /* wait until caching pos */
5073 PL_reg_leftiter = 0;
5074 SAVEGENERICPV(PL_reg_poscache); /* cache of pos of WHILEM */
5075 PL_reg_poscache = Nullch;
5076 SAVEI32(PL_reg_poscache_size); /* size of pos cache of WHILEM */
5077 PL_reg_poscache_size = 0;
5078 SAVEPPTR(PL_regprecomp); /* uncompiled string. */
5fb7366e 5079 SAVEI32(PL_regnpar); /* () count. */
e49a9654 5080 SAVEI32(PL_regsize); /* from regexec.c */
bda19f49
JH
5081
5082 {
5083 /* Save $1..$n (#18107: UTF-8 s/(\w+)/uc($1)/e); AMS 20021106. */
bda19f49 5084 REGEXP *rx;
bda19f49
JH
5085
5086 if (PL_curpm && (rx = PM_GETRE(PL_curpm))) {
228fe6e6 5087 U32 i;
bda19f49 5088 for (i = 1; i <= rx->nparens; i++) {
228fe6e6
AL
5089 GV *mgv;
5090 char digits[TYPE_CHARS(long)];
bda19f49
JH
5091 sprintf(digits, "%lu", (long)i);
5092 if ((mgv = gv_fetchpv(digits, FALSE, SVt_PV)))
5093 save_scalar(mgv);
5094 }
5095 }
5096 }
5097
54b6e2fa 5098#ifdef DEBUGGING
b81d288d 5099 SAVEPPTR(PL_reg_starttry); /* from regexec.c */
54b6e2fa 5100#endif
a0ed51b3 5101}
51371543 5102
51371543 5103static void
acfe0abc 5104clear_re(pTHX_ void *r)
51371543
GS
5105{
5106 ReREFCNT_dec((regexp *)r);
5107}
ffbc6a93 5108
e4057cfc
AL
5109#ifdef DEBUGGING
5110
5111STATIC void
5112S_put_byte(pTHX_ SV *sv, int c)
5113{
5114 if (isCNTRL(c) || c == 255 || !isPRINT(c))
5115 Perl_sv_catpvf(aTHX_ sv, "\\%o", c);
5116 else if (c == '-' || c == ']' || c == '\\' || c == '^')
5117 Perl_sv_catpvf(aTHX_ sv, "\\%c", c);
5118 else
5119 Perl_sv_catpvf(aTHX_ sv, "%c", c);
5120}
5121
5122
5123STATIC regnode *
5124S_dumpuntil(pTHX_ regnode *start, regnode *node, regnode *last, SV* sv, I32 l)
5125{
5126 register U8 op = EXACT; /* Arbitrary non-END op. */
5127 register regnode *next;
5128
5129 while (op != END && (!last || node < last)) {
5130 /* While that wasn't END last time... */
5131
5132 NODE_ALIGN(node);
5133 op = OP(node);
5134 if (op == CLOSE)
5135 l--;
5136 next = regnext(node);
5137 /* Where, what. */
5138 if (OP(node) == OPTIMIZED)
5139 goto after_print;
5140 regprop(sv, node);
5141 PerlIO_printf(Perl_debug_log, "%4"IVdf":%*s%s", (IV)(node - start),
5142 (int)(2*l + 1), "", SvPVX_const(sv));
5143 if (next == NULL) /* Next ptr. */
5144 PerlIO_printf(Perl_debug_log, "(0)");
5145 else
5146 PerlIO_printf(Perl_debug_log, "(%"IVdf")", (IV)(next - start));
5147 (void)PerlIO_putc(Perl_debug_log, '\n');
5148 after_print:
5149 if (PL_regkind[(U8)op] == BRANCHJ) {
5150 register regnode *nnode = (OP(next) == LONGJMP
5151 ? regnext(next)
5152 : next);
5153 if (last && nnode > last)
5154 nnode = last;
5155 node = dumpuntil(start, NEXTOPER(NEXTOPER(node)), nnode, sv, l + 1);
5156 }
5157 else if (PL_regkind[(U8)op] == BRANCH) {
5158 node = dumpuntil(start, NEXTOPER(node), next, sv, l + 1);
5159 }
5160 else if ( op == CURLY) { /* "next" might be very big: optimizer */
5161 node = dumpuntil(start, NEXTOPER(node) + EXTRA_STEP_2ARGS,
5162 NEXTOPER(node) + EXTRA_STEP_2ARGS + 1, sv, l + 1);
5163 }
5164 else if (PL_regkind[(U8)op] == CURLY && op != CURLYX) {
5165 node = dumpuntil(start, NEXTOPER(node) + EXTRA_STEP_2ARGS,
5166 next, sv, l + 1);
5167 }
5168 else if ( op == PLUS || op == STAR) {
5169 node = dumpuntil(start, NEXTOPER(node), NEXTOPER(node) + 1, sv, l + 1);
5170 }
5171 else if (op == ANYOF) {
5172 /* arglen 1 + class block */
5173 node += 1 + ((ANYOF_FLAGS(node) & ANYOF_LARGE)
5174 ? ANYOF_CLASS_SKIP : ANYOF_SKIP);
5175 node = NEXTOPER(node);
5176 }
5177 else if (PL_regkind[(U8)op] == EXACT) {
5178 /* Literal string, where present. */
5179 node += NODE_SZ_STR(node) - 1;
5180 node = NEXTOPER(node);
5181 }
5182 else {
5183 node = NEXTOPER(node);
5184 node += regarglen[(U8)op];
5185 }
5186 if (op == CURLYX || op == OPEN)
5187 l++;
5188 else if (op == WHILEM)
5189 l--;
5190 }
5191 return node;
5192}
5193
5194#endif /* DEBUGGING */
5195
583439ab
NC
5196/*
5197 * Local variables:
5198 * c-indentation-style: bsd
5199 * c-basic-offset: 4
5200 * indent-tabs-mode: t
5201 * End:
5202 *
d8294a4d
NC
5203 * ex: set ts=8 sts=4 sw=4 noet:
5204 */