This is a live mirror of the Perl 5 development currently hosted at https://github.com/perl/perl5
Re: optree tests and VMS progress (no really)
[perl5.git] / regcomp.c
CommitLineData
a0d0e21e
LW
1/* regcomp.c
2 */
3
4/*
5 * "A fair jaw-cracker dwarf-language must be." --Samwise Gamgee
6 */
7
61296642
DM
8/* This file contains functions for compiling a regular expression. See
9 * also regexec.c which funnily enough, contains functions for executing
166f8a29
DM
10 * a regular expression.
11 */
12
a687059c
LW
13/* NOTE: this is derived from Henry Spencer's regexp code, and should not
14 * confused with the original package (see point 3 below). Thanks, Henry!
15 */
16
17/* Additional note: this code is very heavily munged from Henry's version
18 * in places. In some spots I've traded clarity for efficiency, so don't
19 * blame Henry for some of the lack of readability.
20 */
21
e50aee73
AD
22/* The names of the functions have been changed from regcomp and
23 * regexec to pregcomp and pregexec in order to avoid conflicts
24 * with the POSIX routines of the same names.
25*/
26
b9d5759e
AD
27#ifdef PERL_EXT_RE_BUILD
28/* need to replace pregcomp et al, so enable that */
29# ifndef PERL_IN_XSUB_RE
30# define PERL_IN_XSUB_RE
31# endif
32/* need access to debugger hooks */
cad2e5aa 33# if defined(PERL_EXT_RE_DEBUG) && !defined(DEBUGGING)
b9d5759e
AD
34# define DEBUGGING
35# endif
36#endif
37
38#ifdef PERL_IN_XSUB_RE
d06ea78c 39/* We *really* need to overwrite these symbols: */
56953603
IZ
40# define Perl_pregcomp my_regcomp
41# define Perl_regdump my_regdump
42# define Perl_regprop my_regprop
d06ea78c 43# define Perl_pregfree my_regfree
cad2e5aa
JH
44# define Perl_re_intuit_string my_re_intuit_string
45/* *These* symbols are masked to allow static link. */
d06ea78c 46# define Perl_regnext my_regnext
f0b8d043 47# define Perl_save_re_context my_save_re_context
b81d288d 48# define Perl_reginitcolors my_reginitcolors
c5be433b
GS
49
50# define PERL_NO_GET_CONTEXT
b81d288d 51#endif
56953603 52
f0fcb552 53/*SUPPRESS 112*/
a687059c 54/*
e50aee73 55 * pregcomp and pregexec -- regsub and regerror are not used in perl
a687059c
LW
56 *
57 * Copyright (c) 1986 by University of Toronto.
58 * Written by Henry Spencer. Not derived from licensed software.
59 *
60 * Permission is granted to anyone to use this software for any
61 * purpose on any computer system, and to redistribute it freely,
62 * subject to the following restrictions:
63 *
64 * 1. The author is not responsible for the consequences of use of
65 * this software, no matter how awful, even if they arise
66 * from defects in it.
67 *
68 * 2. The origin of this software must not be misrepresented, either
69 * by explicit claim or by omission.
70 *
71 * 3. Altered versions must be plainly marked as such, and must not
72 * be misrepresented as being the original software.
73 *
74 *
75 **** Alterations to Henry's code are...
76 ****
4bb101f2
JH
77 **** Copyright (C) 1991, 1992, 1993, 1994, 1995, 1996, 1997, 1998, 1999,
78 **** 2000, 2001, 2002, 2003, by Larry Wall and others
a687059c 79 ****
9ef589d8
LW
80 **** You may distribute under the terms of either the GNU General Public
81 **** License or the Artistic License, as specified in the README file.
82
a687059c
LW
83 *
84 * Beware that some of this code is subtly aware of the way operator
85 * precedence is structured in regular expressions. Serious changes in
86 * regular-expression syntax might require a total rethink.
87 */
88#include "EXTERN.h"
864dbfa3 89#define PERL_IN_REGCOMP_C
a687059c 90#include "perl.h"
d06ea78c 91
acfe0abc 92#ifndef PERL_IN_XSUB_RE
d06ea78c
GS
93# include "INTERN.h"
94#endif
c277df42
IZ
95
96#define REG_COMP_C
a687059c
LW
97#include "regcomp.h"
98
d4cce5f1 99#ifdef op
11343788 100#undef op
d4cce5f1 101#endif /* op */
11343788 102
fe14fcc3 103#ifdef MSDOS
7e4e8c89 104# if defined(BUGGY_MSC6)
fe14fcc3 105 /* MSC 6.00A breaks on op/regexp.t test 85 unless we turn this off */
7e4e8c89 106# pragma optimize("a",off)
fe14fcc3 107 /* But MSC 6.00A is happy with 'w', for aliases only across function calls*/
7e4e8c89
NC
108# pragma optimize("w",on )
109# endif /* BUGGY_MSC6 */
fe14fcc3
LW
110#endif /* MSDOS */
111
a687059c
LW
112#ifndef STATIC
113#define STATIC static
114#endif
115
830247a4 116typedef struct RExC_state_t {
e2509266 117 U32 flags; /* are we folding, multilining? */
830247a4
IZ
118 char *precomp; /* uncompiled string. */
119 regexp *rx;
fac92740 120 char *start; /* Start of input for compile */
830247a4
IZ
121 char *end; /* End of input for compile */
122 char *parse; /* Input-scan pointer. */
123 I32 whilem_seen; /* number of WHILEM in this expr */
fac92740 124 regnode *emit_start; /* Start of emitted-code area */
ffc61ed2 125 regnode *emit; /* Code-emit pointer; &regdummy = don't = compiling */
830247a4
IZ
126 I32 naughty; /* How bad is this pattern? */
127 I32 sawback; /* Did we see \1, ...? */
128 U32 seen;
129 I32 size; /* Code size. */
130 I32 npar; /* () count. */
131 I32 extralen;
132 I32 seen_zerolen;
133 I32 seen_evals;
1aa99e6b 134 I32 utf8;
830247a4
IZ
135#if ADD_TO_REGEXEC
136 char *starttry; /* -Dr: where regtry was called. */
137#define RExC_starttry (pRExC_state->starttry)
138#endif
139} RExC_state_t;
140
e2509266 141#define RExC_flags (pRExC_state->flags)
830247a4
IZ
142#define RExC_precomp (pRExC_state->precomp)
143#define RExC_rx (pRExC_state->rx)
fac92740 144#define RExC_start (pRExC_state->start)
830247a4
IZ
145#define RExC_end (pRExC_state->end)
146#define RExC_parse (pRExC_state->parse)
147#define RExC_whilem_seen (pRExC_state->whilem_seen)
fac92740 148#define RExC_offsets (pRExC_state->rx->offsets) /* I am not like the others */
830247a4 149#define RExC_emit (pRExC_state->emit)
fac92740 150#define RExC_emit_start (pRExC_state->emit_start)
830247a4
IZ
151#define RExC_naughty (pRExC_state->naughty)
152#define RExC_sawback (pRExC_state->sawback)
153#define RExC_seen (pRExC_state->seen)
154#define RExC_size (pRExC_state->size)
155#define RExC_npar (pRExC_state->npar)
156#define RExC_extralen (pRExC_state->extralen)
157#define RExC_seen_zerolen (pRExC_state->seen_zerolen)
158#define RExC_seen_evals (pRExC_state->seen_evals)
1aa99e6b 159#define RExC_utf8 (pRExC_state->utf8)
830247a4 160
a687059c
LW
161#define ISMULT1(c) ((c) == '*' || (c) == '+' || (c) == '?')
162#define ISMULT2(s) ((*s) == '*' || (*s) == '+' || (*s) == '?' || \
163 ((*s) == '{' && regcurly(s)))
a687059c 164
35c8bce7
LW
165#ifdef SPSTART
166#undef SPSTART /* dratted cpp namespace... */
167#endif
a687059c
LW
168/*
169 * Flags to be passed up and down.
170 */
a687059c 171#define WORST 0 /* Worst case. */
821b33a5 172#define HASWIDTH 0x1 /* Known to match non-null strings. */
a0d0e21e
LW
173#define SIMPLE 0x2 /* Simple enough to be STAR/PLUS operand. */
174#define SPSTART 0x4 /* Starts with * or +. */
175#define TRYAGAIN 0x8 /* Weeded out a declaration. */
a687059c 176
2c2d71f5
JH
177/* Length of a variant. */
178
179typedef struct scan_data_t {
180 I32 len_min;
181 I32 len_delta;
182 I32 pos_min;
183 I32 pos_delta;
184 SV *last_found;
185 I32 last_end; /* min value, <0 unless valid. */
186 I32 last_start_min;
187 I32 last_start_max;
188 SV **longest; /* Either &l_fixed, or &l_float. */
189 SV *longest_fixed;
190 I32 offset_fixed;
191 SV *longest_float;
192 I32 offset_float_min;
193 I32 offset_float_max;
194 I32 flags;
195 I32 whilem_c;
cb434fcc 196 I32 *last_closep;
653099ff 197 struct regnode_charclass_class *start_class;
2c2d71f5
JH
198} scan_data_t;
199
a687059c 200/*
e50aee73 201 * Forward declarations for pregcomp()'s friends.
a687059c 202 */
a0d0e21e 203
b81d288d 204static scan_data_t zero_scan_data = { 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
cb434fcc 205 0, 0, 0, 0, 0, 0};
c277df42
IZ
206
207#define SF_BEFORE_EOL (SF_BEFORE_SEOL|SF_BEFORE_MEOL)
208#define SF_BEFORE_SEOL 0x1
209#define SF_BEFORE_MEOL 0x2
210#define SF_FIX_BEFORE_EOL (SF_FIX_BEFORE_SEOL|SF_FIX_BEFORE_MEOL)
211#define SF_FL_BEFORE_EOL (SF_FL_BEFORE_SEOL|SF_FL_BEFORE_MEOL)
212
09b7f37c
CB
213#ifdef NO_UNARY_PLUS
214# define SF_FIX_SHIFT_EOL (0+2)
215# define SF_FL_SHIFT_EOL (0+4)
216#else
217# define SF_FIX_SHIFT_EOL (+2)
218# define SF_FL_SHIFT_EOL (+4)
219#endif
c277df42
IZ
220
221#define SF_FIX_BEFORE_SEOL (SF_BEFORE_SEOL << SF_FIX_SHIFT_EOL)
222#define SF_FIX_BEFORE_MEOL (SF_BEFORE_MEOL << SF_FIX_SHIFT_EOL)
223
224#define SF_FL_BEFORE_SEOL (SF_BEFORE_SEOL << SF_FL_SHIFT_EOL)
225#define SF_FL_BEFORE_MEOL (SF_BEFORE_MEOL << SF_FL_SHIFT_EOL) /* 0x20 */
226#define SF_IS_INF 0x40
227#define SF_HAS_PAR 0x80
228#define SF_IN_PAR 0x100
229#define SF_HAS_EVAL 0x200
4bfe0158 230#define SCF_DO_SUBSTR 0x400
653099ff
GS
231#define SCF_DO_STCLASS_AND 0x0800
232#define SCF_DO_STCLASS_OR 0x1000
233#define SCF_DO_STCLASS (SCF_DO_STCLASS_AND|SCF_DO_STCLASS_OR)
e1901655 234#define SCF_WHILEM_VISITED_POS 0x2000
c277df42 235
eb160463 236#define UTF (RExC_utf8 != 0)
e2509266
JH
237#define LOC ((RExC_flags & PMf_LOCALE) != 0)
238#define FOLD ((RExC_flags & PMf_FOLD) != 0)
a0ed51b3 239
ffc61ed2 240#define OOB_UNICODE 12345678
93733859 241#define OOB_NAMEDCLASS -1
b8c5462f 242
a0ed51b3
LW
243#define CHR_SVLEN(sv) (UTF ? sv_len_utf8(sv) : SvCUR(sv))
244#define CHR_DIST(a,b) (UTF ? utf8_distance(a,b) : a - b)
245
8615cb43 246
b45f050a
JF
247/* length of regex to show in messages that don't mark a position within */
248#define RegexLengthToShowInErrorMessages 127
249
250/*
251 * If MARKER[12] are adjusted, be sure to adjust the constants at the top
252 * of t/op/regmesg.t, the tests in t/op/re_tests, and those in
253 * op/pragma/warn/regcomp.
254 */
7253e4e3
RK
255#define MARKER1 "<-- HERE" /* marker as it appears in the description */
256#define MARKER2 " <-- HERE " /* marker as it appears within the regex */
b81d288d 257
7253e4e3 258#define REPORT_LOCATION " in regex; marked by " MARKER1 " in m/%.*s" MARKER2 "%s/"
b45f050a
JF
259
260/*
261 * Calls SAVEDESTRUCTOR_X if needed, then calls Perl_croak with the given
262 * arg. Show regex, up to a maximum length. If it's too long, chop and add
263 * "...".
264 */
ccb2c380
MP
265#define FAIL(msg) STMT_START { \
266 char *ellipses = ""; \
267 IV len = RExC_end - RExC_precomp; \
268 \
269 if (!SIZE_ONLY) \
270 SAVEDESTRUCTOR_X(clear_re,(void*)RExC_rx); \
271 if (len > RegexLengthToShowInErrorMessages) { \
272 /* chop 10 shorter than the max, to ensure meaning of "..." */ \
273 len = RegexLengthToShowInErrorMessages - 10; \
274 ellipses = "..."; \
275 } \
276 Perl_croak(aTHX_ "%s in regex m/%.*s%s/", \
277 msg, (int)len, RExC_precomp, ellipses); \
278} STMT_END
8615cb43 279
b45f050a
JF
280/*
281 * Calls SAVEDESTRUCTOR_X if needed, then calls Perl_croak with the given
282 * args. Show regex, up to a maximum length. If it's too long, chop and add
283 * "...".
284 */
ccb2c380
MP
285#define FAIL2(pat,msg) STMT_START { \
286 char *ellipses = ""; \
287 IV len = RExC_end - RExC_precomp; \
288 \
289 if (!SIZE_ONLY) \
290 SAVEDESTRUCTOR_X(clear_re,(void*)RExC_rx); \
291 if (len > RegexLengthToShowInErrorMessages) { \
292 /* chop 10 shorter than the max, to ensure meaning of "..." */ \
293 len = RegexLengthToShowInErrorMessages - 10; \
294 ellipses = "..."; \
295 } \
296 S_re_croak2(aTHX_ pat, " in regex m/%.*s%s/", \
297 msg, (int)len, RExC_precomp, ellipses); \
298} STMT_END
b45f050a
JF
299
300
301/*
302 * Simple_vFAIL -- like FAIL, but marks the current location in the scan
303 */
ccb2c380
MP
304#define Simple_vFAIL(m) STMT_START { \
305 IV offset = RExC_parse - RExC_precomp; \
306 Perl_croak(aTHX_ "%s" REPORT_LOCATION, \
307 m, (int)offset, RExC_precomp, RExC_precomp + offset); \
308} STMT_END
b45f050a
JF
309
310/*
311 * Calls SAVEDESTRUCTOR_X if needed, then Simple_vFAIL()
312 */
ccb2c380
MP
313#define vFAIL(m) STMT_START { \
314 if (!SIZE_ONLY) \
315 SAVEDESTRUCTOR_X(clear_re,(void*)RExC_rx); \
316 Simple_vFAIL(m); \
317} STMT_END
b45f050a
JF
318
319/*
320 * Like Simple_vFAIL(), but accepts two arguments.
321 */
ccb2c380
MP
322#define Simple_vFAIL2(m,a1) STMT_START { \
323 IV offset = RExC_parse - RExC_precomp; \
324 S_re_croak2(aTHX_ m, REPORT_LOCATION, a1, \
325 (int)offset, RExC_precomp, RExC_precomp + offset); \
326} STMT_END
b45f050a
JF
327
328/*
329 * Calls SAVEDESTRUCTOR_X if needed, then Simple_vFAIL2().
330 */
ccb2c380
MP
331#define vFAIL2(m,a1) STMT_START { \
332 if (!SIZE_ONLY) \
333 SAVEDESTRUCTOR_X(clear_re,(void*)RExC_rx); \
334 Simple_vFAIL2(m, a1); \
335} STMT_END
b45f050a
JF
336
337
338/*
339 * Like Simple_vFAIL(), but accepts three arguments.
340 */
ccb2c380
MP
341#define Simple_vFAIL3(m, a1, a2) STMT_START { \
342 IV offset = RExC_parse - RExC_precomp; \
343 S_re_croak2(aTHX_ m, REPORT_LOCATION, a1, a2, \
344 (int)offset, RExC_precomp, RExC_precomp + offset); \
345} STMT_END
b45f050a
JF
346
347/*
348 * Calls SAVEDESTRUCTOR_X if needed, then Simple_vFAIL3().
349 */
ccb2c380
MP
350#define vFAIL3(m,a1,a2) STMT_START { \
351 if (!SIZE_ONLY) \
352 SAVEDESTRUCTOR_X(clear_re,(void*)RExC_rx); \
353 Simple_vFAIL3(m, a1, a2); \
354} STMT_END
b45f050a
JF
355
356/*
357 * Like Simple_vFAIL(), but accepts four arguments.
358 */
ccb2c380
MP
359#define Simple_vFAIL4(m, a1, a2, a3) STMT_START { \
360 IV offset = RExC_parse - RExC_precomp; \
361 S_re_croak2(aTHX_ m, REPORT_LOCATION, a1, a2, a3, \
362 (int)offset, RExC_precomp, RExC_precomp + offset); \
363} STMT_END
b45f050a
JF
364
365/*
366 * Like Simple_vFAIL(), but accepts five arguments.
367 */
ccb2c380
MP
368#define Simple_vFAIL5(m, a1, a2, a3, a4) STMT_START { \
369 IV offset = RExC_parse - RExC_precomp; \
370 S_re_croak2(aTHX_ m, REPORT_LOCATION, a1, a2, a3, a4, \
371 (int)offset, RExC_precomp, RExC_precomp + offset); \
372} STMT_END
373
374
375#define vWARN(loc,m) STMT_START { \
376 IV offset = loc - RExC_precomp; \
377 Perl_warner(aTHX_ packWARN(WARN_REGEXP), "%s" REPORT_LOCATION, \
378 m, (int)offset, RExC_precomp, RExC_precomp + offset); \
379} STMT_END
380
381#define vWARNdep(loc,m) STMT_START { \
382 IV offset = loc - RExC_precomp; \
383 Perl_warner(aTHX_ packWARN2(WARN_DEPRECATED, WARN_REGEXP), \
384 "%s" REPORT_LOCATION, \
385 m, (int)offset, RExC_precomp, RExC_precomp + offset); \
386} STMT_END
387
388
389#define vWARN2(loc, m, a1) STMT_START { \
390 IV offset = loc - RExC_precomp; \
391 Perl_warner(aTHX_ packWARN(WARN_REGEXP), m REPORT_LOCATION, \
392 a1, (int)offset, RExC_precomp, RExC_precomp + offset); \
393} STMT_END
394
395#define vWARN3(loc, m, a1, a2) STMT_START { \
396 IV offset = loc - RExC_precomp; \
397 Perl_warner(aTHX_ packWARN(WARN_REGEXP), m REPORT_LOCATION, \
398 a1, a2, (int)offset, RExC_precomp, RExC_precomp + offset); \
399} STMT_END
400
401#define vWARN4(loc, m, a1, a2, a3) STMT_START { \
402 IV offset = loc - RExC_precomp; \
403 Perl_warner(aTHX_ packWARN(WARN_REGEXP), m REPORT_LOCATION, \
404 a1, a2, a3, (int)offset, RExC_precomp, RExC_precomp + offset); \
405} STMT_END
406
407#define vWARN5(loc, m, a1, a2, a3, a4) STMT_START { \
408 IV offset = loc - RExC_precomp; \
409 Perl_warner(aTHX_ packWARN(WARN_REGEXP), m REPORT_LOCATION, \
410 a1, a2, a3, a4, (int)offset, RExC_precomp, RExC_precomp + offset); \
411} STMT_END
9d1d55b5 412
8615cb43 413
cd439c50 414/* Allow for side effects in s */
ccb2c380
MP
415#define REGC(c,s) STMT_START { \
416 if (!SIZE_ONLY) *(s) = (c); else (void)(s); \
417} STMT_END
cd439c50 418
fac92740
MJD
419/* Macros for recording node offsets. 20001227 mjd@plover.com
420 * Nodes are numbered 1, 2, 3, 4. Node #n's position is recorded in
421 * element 2*n-1 of the array. Element #2n holds the byte length node #n.
422 * Element 0 holds the number n.
423 */
424
425#define MJD_OFFSET_DEBUG(x)
ccb2c380
MP
426/* #define MJD_OFFSET_DEBUG(x) Perl_warn_nocontext x */
427
428
429#define Set_Node_Offset_To_R(node,byte) STMT_START { \
430 if (! SIZE_ONLY) { \
431 MJD_OFFSET_DEBUG(("** (%d) offset of node %d is %d.\n", \
432 __LINE__, (node), (byte))); \
433 if((node) < 0) { \
434 Perl_croak(aTHX_ "value of node is %d in Offset macro", node); \
435 } else { \
436 RExC_offsets[2*(node)-1] = (byte); \
437 } \
438 } \
439} STMT_END
440
441#define Set_Node_Offset(node,byte) \
442 Set_Node_Offset_To_R((node)-RExC_emit_start, (byte)-RExC_start)
443#define Set_Cur_Node_Offset Set_Node_Offset(RExC_emit, RExC_parse)
444
445#define Set_Node_Length_To_R(node,len) STMT_START { \
446 if (! SIZE_ONLY) { \
447 MJD_OFFSET_DEBUG(("** (%d) size of node %d is %d.\n", \
448 __LINE__, (node), (len))); \
449 if((node) < 0) { \
450 Perl_croak(aTHX_ "value of node is %d in Length macro", node); \
451 } else { \
452 RExC_offsets[2*(node)] = (len); \
453 } \
454 } \
455} STMT_END
456
457#define Set_Node_Length(node,len) \
458 Set_Node_Length_To_R((node)-RExC_emit_start, len)
459#define Set_Cur_Node_Length(len) Set_Node_Length(RExC_emit, len)
460#define Set_Node_Cur_Length(node) \
461 Set_Node_Length(node, RExC_parse - parse_start)
fac92740
MJD
462
463/* Get offsets and lengths */
464#define Node_Offset(n) (RExC_offsets[2*((n)-RExC_emit_start)-1])
465#define Node_Length(n) (RExC_offsets[2*((n)-RExC_emit_start)])
466
acfe0abc 467static void clear_re(pTHX_ void *r);
4327152a 468
653099ff
GS
469/* Mark that we cannot extend a found fixed substring at this point.
470 Updata the longest found anchored substring and the longest found
471 floating substrings if needed. */
472
4327152a 473STATIC void
830247a4 474S_scan_commit(pTHX_ RExC_state_t *pRExC_state, scan_data_t *data)
c277df42 475{
a0ed51b3
LW
476 STRLEN l = CHR_SVLEN(data->last_found);
477 STRLEN old_l = CHR_SVLEN(*data->longest);
b81d288d 478
c277df42 479 if ((l >= old_l) && ((l > old_l) || (data->flags & SF_BEFORE_EOL))) {
6b43b216 480 SvSetMagicSV(*data->longest, data->last_found);
c277df42
IZ
481 if (*data->longest == data->longest_fixed) {
482 data->offset_fixed = l ? data->last_start_min : data->pos_min;
483 if (data->flags & SF_BEFORE_EOL)
b81d288d 484 data->flags
c277df42
IZ
485 |= ((data->flags & SF_BEFORE_EOL) << SF_FIX_SHIFT_EOL);
486 else
487 data->flags &= ~SF_FIX_BEFORE_EOL;
a0ed51b3
LW
488 }
489 else {
c277df42 490 data->offset_float_min = l ? data->last_start_min : data->pos_min;
b81d288d
AB
491 data->offset_float_max = (l
492 ? data->last_start_max
c277df42 493 : data->pos_min + data->pos_delta);
9051bda5
HS
494 if ((U32)data->offset_float_max > (U32)I32_MAX)
495 data->offset_float_max = I32_MAX;
c277df42 496 if (data->flags & SF_BEFORE_EOL)
b81d288d 497 data->flags
c277df42
IZ
498 |= ((data->flags & SF_BEFORE_EOL) << SF_FL_SHIFT_EOL);
499 else
500 data->flags &= ~SF_FL_BEFORE_EOL;
501 }
502 }
503 SvCUR_set(data->last_found, 0);
0eda9292
JH
504 {
505 SV * sv = data->last_found;
506 MAGIC *mg =
507 SvUTF8(sv) && SvMAGICAL(sv) ? mg_find(sv, PERL_MAGIC_utf8) : NULL;
508 if (mg && mg->mg_len > 0)
509 mg->mg_len = 0;
510 }
c277df42
IZ
511 data->last_end = -1;
512 data->flags &= ~SF_BEFORE_EOL;
513}
514
653099ff
GS
515/* Can match anything (initialization) */
516STATIC void
830247a4 517S_cl_anything(pTHX_ RExC_state_t *pRExC_state, struct regnode_charclass_class *cl)
653099ff 518{
653099ff 519 ANYOF_CLASS_ZERO(cl);
f8bef550 520 ANYOF_BITMAP_SETALL(cl);
1aa99e6b 521 cl->flags = ANYOF_EOS|ANYOF_UNICODE_ALL;
653099ff
GS
522 if (LOC)
523 cl->flags |= ANYOF_LOCALE;
524}
525
526/* Can match anything (initialization) */
527STATIC int
528S_cl_is_anything(pTHX_ struct regnode_charclass_class *cl)
529{
530 int value;
531
aaa51d5e 532 for (value = 0; value <= ANYOF_MAX; value += 2)
653099ff
GS
533 if (ANYOF_CLASS_TEST(cl, value) && ANYOF_CLASS_TEST(cl, value + 1))
534 return 1;
1aa99e6b
IH
535 if (!(cl->flags & ANYOF_UNICODE_ALL))
536 return 0;
f8bef550
NC
537 if (!ANYOF_BITMAP_TESTALLSET(cl))
538 return 0;
653099ff
GS
539 return 1;
540}
541
542/* Can match anything (initialization) */
543STATIC void
830247a4 544S_cl_init(pTHX_ RExC_state_t *pRExC_state, struct regnode_charclass_class *cl)
653099ff 545{
8ecf7187 546 Zero(cl, 1, struct regnode_charclass_class);
653099ff 547 cl->type = ANYOF;
830247a4 548 cl_anything(pRExC_state, cl);
653099ff
GS
549}
550
551STATIC void
830247a4 552S_cl_init_zero(pTHX_ RExC_state_t *pRExC_state, struct regnode_charclass_class *cl)
653099ff 553{
8ecf7187 554 Zero(cl, 1, struct regnode_charclass_class);
653099ff 555 cl->type = ANYOF;
830247a4 556 cl_anything(pRExC_state, cl);
653099ff
GS
557 if (LOC)
558 cl->flags |= ANYOF_LOCALE;
559}
560
561/* 'And' a given class with another one. Can create false positives */
562/* We assume that cl is not inverted */
563STATIC void
564S_cl_and(pTHX_ struct regnode_charclass_class *cl,
565 struct regnode_charclass_class *and_with)
566{
653099ff
GS
567 if (!(and_with->flags & ANYOF_CLASS)
568 && !(cl->flags & ANYOF_CLASS)
569 && (and_with->flags & ANYOF_LOCALE) == (cl->flags & ANYOF_LOCALE)
570 && !(and_with->flags & ANYOF_FOLD)
571 && !(cl->flags & ANYOF_FOLD)) {
572 int i;
573
574 if (and_with->flags & ANYOF_INVERT)
575 for (i = 0; i < ANYOF_BITMAP_SIZE; i++)
576 cl->bitmap[i] &= ~and_with->bitmap[i];
577 else
578 for (i = 0; i < ANYOF_BITMAP_SIZE; i++)
579 cl->bitmap[i] &= and_with->bitmap[i];
580 } /* XXXX: logic is complicated otherwise, leave it along for a moment. */
581 if (!(and_with->flags & ANYOF_EOS))
582 cl->flags &= ~ANYOF_EOS;
1aa99e6b 583
14ebb1a2
JH
584 if (cl->flags & ANYOF_UNICODE_ALL && and_with->flags & ANYOF_UNICODE &&
585 !(and_with->flags & ANYOF_INVERT)) {
1aa99e6b
IH
586 cl->flags &= ~ANYOF_UNICODE_ALL;
587 cl->flags |= ANYOF_UNICODE;
588 ARG_SET(cl, ARG(and_with));
589 }
14ebb1a2
JH
590 if (!(and_with->flags & ANYOF_UNICODE_ALL) &&
591 !(and_with->flags & ANYOF_INVERT))
1aa99e6b 592 cl->flags &= ~ANYOF_UNICODE_ALL;
14ebb1a2
JH
593 if (!(and_with->flags & (ANYOF_UNICODE|ANYOF_UNICODE_ALL)) &&
594 !(and_with->flags & ANYOF_INVERT))
1aa99e6b 595 cl->flags &= ~ANYOF_UNICODE;
653099ff
GS
596}
597
598/* 'OR' a given class with another one. Can create false positives */
599/* We assume that cl is not inverted */
600STATIC void
830247a4 601S_cl_or(pTHX_ RExC_state_t *pRExC_state, struct regnode_charclass_class *cl, struct regnode_charclass_class *or_with)
653099ff 602{
653099ff
GS
603 if (or_with->flags & ANYOF_INVERT) {
604 /* We do not use
605 * (B1 | CL1) | (!B2 & !CL2) = (B1 | !B2 & !CL2) | (CL1 | (!B2 & !CL2))
606 * <= (B1 | !B2) | (CL1 | !CL2)
607 * which is wasteful if CL2 is small, but we ignore CL2:
608 * (B1 | CL1) | (!B2 & !CL2) <= (B1 | CL1) | !B2 = (B1 | !B2) | CL1
609 * XXXX Can we handle case-fold? Unclear:
610 * (OK1(i) | OK1(i')) | !(OK1(i) | OK1(i')) =
611 * (OK1(i) | OK1(i')) | (!OK1(i) & !OK1(i'))
612 */
613 if ( (or_with->flags & ANYOF_LOCALE) == (cl->flags & ANYOF_LOCALE)
614 && !(or_with->flags & ANYOF_FOLD)
615 && !(cl->flags & ANYOF_FOLD) ) {
616 int i;
617
618 for (i = 0; i < ANYOF_BITMAP_SIZE; i++)
619 cl->bitmap[i] |= ~or_with->bitmap[i];
620 } /* XXXX: logic is complicated otherwise */
621 else {
830247a4 622 cl_anything(pRExC_state, cl);
653099ff
GS
623 }
624 } else {
625 /* (B1 | CL1) | (B2 | CL2) = (B1 | B2) | (CL1 | CL2)) */
626 if ( (or_with->flags & ANYOF_LOCALE) == (cl->flags & ANYOF_LOCALE)
b81d288d 627 && (!(or_with->flags & ANYOF_FOLD)
653099ff
GS
628 || (cl->flags & ANYOF_FOLD)) ) {
629 int i;
630
631 /* OR char bitmap and class bitmap separately */
632 for (i = 0; i < ANYOF_BITMAP_SIZE; i++)
633 cl->bitmap[i] |= or_with->bitmap[i];
634 if (or_with->flags & ANYOF_CLASS) {
635 for (i = 0; i < ANYOF_CLASSBITMAP_SIZE; i++)
636 cl->classflags[i] |= or_with->classflags[i];
637 cl->flags |= ANYOF_CLASS;
638 }
639 }
640 else { /* XXXX: logic is complicated, leave it along for a moment. */
830247a4 641 cl_anything(pRExC_state, cl);
653099ff
GS
642 }
643 }
644 if (or_with->flags & ANYOF_EOS)
645 cl->flags |= ANYOF_EOS;
1aa99e6b
IH
646
647 if (cl->flags & ANYOF_UNICODE && or_with->flags & ANYOF_UNICODE &&
648 ARG(cl) != ARG(or_with)) {
649 cl->flags |= ANYOF_UNICODE_ALL;
650 cl->flags &= ~ANYOF_UNICODE;
651 }
652 if (or_with->flags & ANYOF_UNICODE_ALL) {
653 cl->flags |= ANYOF_UNICODE_ALL;
654 cl->flags &= ~ANYOF_UNICODE;
655 }
653099ff
GS
656}
657
5d1c421c
JH
658/*
659 * There are strange code-generation bugs caused on sparc64 by gcc-2.95.2.
660 * These need to be revisited when a newer toolchain becomes available.
661 */
662#if defined(__sparc64__) && defined(__GNUC__)
663# if __GNUC__ < 2 || (__GNUC__ == 2 && __GNUC_MINOR__ < 96)
664# undef SPARC64_GCC_WORKAROUND
665# define SPARC64_GCC_WORKAROUND 1
666# endif
667#endif
668
653099ff
GS
669/* REx optimizer. Converts nodes into quickier variants "in place".
670 Finds fixed substrings. */
671
c277df42
IZ
672/* Stops at toplevel WHILEM as well as at `last'. At end *scanp is set
673 to the position after last scanned or to NULL. */
674
76e3520e 675STATIC I32
830247a4 676S_study_chunk(pTHX_ RExC_state_t *pRExC_state, regnode **scanp, I32 *deltap, regnode *last, scan_data_t *data, U32 flags)
c277df42
IZ
677 /* scanp: Start here (read-write). */
678 /* deltap: Write maxlen-minlen here. */
679 /* last: Stop before this one. */
680{
681 I32 min = 0, pars = 0, code;
682 regnode *scan = *scanp, *next;
683 I32 delta = 0;
684 int is_inf = (flags & SCF_DO_SUBSTR) && (data->flags & SF_IS_INF);
aca2d497 685 int is_inf_internal = 0; /* The studied chunk is infinite */
c277df42
IZ
686 I32 is_par = OP(scan) == OPEN ? ARG(scan) : 0;
687 scan_data_t data_fake;
653099ff 688 struct regnode_charclass_class and_with; /* Valid if flags & SCF_DO_STCLASS_OR */
b81d288d 689
c277df42
IZ
690 while (scan && OP(scan) != END && scan < last) {
691 /* Peephole optimizer: */
692
22c35a8c 693 if (PL_regkind[(U8)OP(scan)] == EXACT) {
653099ff 694 /* Merge several consecutive EXACTish nodes into one. */
c277df42
IZ
695 regnode *n = regnext(scan);
696 U32 stringok = 1;
697#ifdef DEBUGGING
698 regnode *stop = scan;
b81d288d 699#endif
c277df42 700
cd439c50 701 next = scan + NODE_SZ_STR(scan);
c277df42
IZ
702 /* Skip NOTHING, merge EXACT*. */
703 while (n &&
b81d288d 704 ( PL_regkind[(U8)OP(n)] == NOTHING ||
c277df42
IZ
705 (stringok && (OP(n) == OP(scan))))
706 && NEXT_OFF(n)
707 && NEXT_OFF(scan) + NEXT_OFF(n) < I16_MAX) {
708 if (OP(n) == TAIL || n > next)
709 stringok = 0;
22c35a8c 710 if (PL_regkind[(U8)OP(n)] == NOTHING) {
c277df42
IZ
711 NEXT_OFF(scan) += NEXT_OFF(n);
712 next = n + NODE_STEP_REGNODE;
713#ifdef DEBUGGING
714 if (stringok)
715 stop = n;
b81d288d 716#endif
c277df42 717 n = regnext(n);
a0ed51b3 718 }
f49d4d0f 719 else if (stringok) {
cd439c50 720 int oldl = STR_LEN(scan);
c277df42 721 regnode *nnext = regnext(n);
f49d4d0f 722
b81d288d 723 if (oldl + STR_LEN(n) > U8_MAX)
c277df42
IZ
724 break;
725 NEXT_OFF(scan) += NEXT_OFF(n);
cd439c50
IZ
726 STR_LEN(scan) += STR_LEN(n);
727 next = n + NODE_SZ_STR(n);
c277df42 728 /* Now we can overwrite *n : */
f49d4d0f 729 Move(STRING(n), STRING(scan) + oldl, STR_LEN(n), char);
c277df42 730#ifdef DEBUGGING
f49d4d0f 731 stop = next - 1;
b81d288d 732#endif
c277df42
IZ
733 n = nnext;
734 }
735 }
61a36c01 736
d65e4eab 737 if (UTF && OP(scan) == EXACTF && STR_LEN(scan) >= 6) {
61a36c01
JH
738/*
739 Two problematic code points in Unicode casefolding of EXACT nodes:
740
741 U+0390 - GREEK SMALL LETTER IOTA WITH DIALYTIKA AND TONOS
742 U+03B0 - GREEK SMALL LETTER UPSILON WITH DIALYTIKA AND TONOS
743
744 which casefold to
745
746 Unicode UTF-8
747
748 U+03B9 U+0308 U+0301 0xCE 0xB9 0xCC 0x88 0xCC 0x81
749 U+03C5 U+0308 U+0301 0xCF 0x85 0xCC 0x88 0xCC 0x81
750
751 This means that in case-insensitive matching (or "loose matching",
752 as Unicode calls it), an EXACTF of length six (the UTF-8 encoded byte
753 length of the above casefolded versions) can match a target string
754 of length two (the byte length of UTF-8 encoded U+0390 or U+03B0).
755 This would rather mess up the minimum length computation.
756
757 What we'll do is to look for the tail four bytes, and then peek
758 at the preceding two bytes to see whether we need to decrease
759 the minimum length by four (six minus two).
760
761 Thanks to the design of UTF-8, there cannot be false matches:
762 A sequence of valid UTF-8 bytes cannot be a subsequence of
763 another valid sequence of UTF-8 bytes.
764
765*/
766 char *s0 = STRING(scan), *s, *t;
767 char *s1 = s0 + STR_LEN(scan) - 1, *s2 = s1 - 4;
768 char *t0 = "\xcc\x88\xcc\x81";
769 char *t1 = t0 + 3;
770
771 for (s = s0 + 2;
772 s < s2 && (t = ninstr(s, s1, t0, t1));
773 s = t + 4) {
774 if (((U8)t[-1] == 0xB9 && (U8)t[-2] == 0xCE) ||
775 ((U8)t[-1] == 0x85 && (U8)t[-2] == 0xCF))
776 min -= 4;
777 }
778 }
779
c277df42
IZ
780#ifdef DEBUGGING
781 /* Allow dumping */
cd439c50 782 n = scan + NODE_SZ_STR(scan);
c277df42 783 while (n <= stop) {
22c35a8c 784 if (PL_regkind[(U8)OP(n)] != NOTHING || OP(n) == NOTHING) {
c277df42
IZ
785 OP(n) = OPTIMIZED;
786 NEXT_OFF(n) = 0;
787 }
788 n++;
789 }
653099ff 790#endif
c277df42 791 }
653099ff
GS
792 /* Follow the next-chain of the current node and optimize
793 away all the NOTHINGs from it. */
c277df42 794 if (OP(scan) != CURLYX) {
048cfca1
GS
795 int max = (reg_off_by_arg[OP(scan)]
796 ? I32_MAX
797 /* I32 may be smaller than U16 on CRAYs! */
798 : (I32_MAX < U16_MAX ? I32_MAX : U16_MAX));
c277df42
IZ
799 int off = (reg_off_by_arg[OP(scan)] ? ARG(scan) : NEXT_OFF(scan));
800 int noff;
801 regnode *n = scan;
b81d288d 802
c277df42
IZ
803 /* Skip NOTHING and LONGJMP. */
804 while ((n = regnext(n))
22c35a8c 805 && ((PL_regkind[(U8)OP(n)] == NOTHING && (noff = NEXT_OFF(n)))
c277df42
IZ
806 || ((OP(n) == LONGJMP) && (noff = ARG(n))))
807 && off + noff < max)
808 off += noff;
809 if (reg_off_by_arg[OP(scan)])
810 ARG(scan) = off;
b81d288d 811 else
c277df42
IZ
812 NEXT_OFF(scan) = off;
813 }
653099ff
GS
814 /* The principal pseudo-switch. Cannot be a switch, since we
815 look into several different things. */
b81d288d 816 if (OP(scan) == BRANCH || OP(scan) == BRANCHJ
c277df42
IZ
817 || OP(scan) == IFTHEN || OP(scan) == SUSPEND) {
818 next = regnext(scan);
819 code = OP(scan);
b81d288d
AB
820
821 if (OP(next) == code || code == IFTHEN || code == SUSPEND) {
c277df42 822 I32 max1 = 0, min1 = I32_MAX, num = 0;
653099ff 823 struct regnode_charclass_class accum;
c277df42 824
653099ff 825 if (flags & SCF_DO_SUBSTR) /* XXXX Add !SUSPEND? */
830247a4 826 scan_commit(pRExC_state, data); /* Cannot merge strings after this. */
653099ff 827 if (flags & SCF_DO_STCLASS)
830247a4 828 cl_init_zero(pRExC_state, &accum);
c277df42 829 while (OP(scan) == code) {
830247a4 830 I32 deltanext, minnext, f = 0, fake;
653099ff 831 struct regnode_charclass_class this_class;
c277df42
IZ
832
833 num++;
834 data_fake.flags = 0;
b81d288d 835 if (data) {
2c2d71f5 836 data_fake.whilem_c = data->whilem_c;
cb434fcc
IZ
837 data_fake.last_closep = data->last_closep;
838 }
839 else
840 data_fake.last_closep = &fake;
c277df42
IZ
841 next = regnext(scan);
842 scan = NEXTOPER(scan);
843 if (code != BRANCH)
844 scan = NEXTOPER(scan);
653099ff 845 if (flags & SCF_DO_STCLASS) {
830247a4 846 cl_init(pRExC_state, &this_class);
653099ff
GS
847 data_fake.start_class = &this_class;
848 f = SCF_DO_STCLASS_AND;
b81d288d 849 }
e1901655
IZ
850 if (flags & SCF_WHILEM_VISITED_POS)
851 f |= SCF_WHILEM_VISITED_POS;
653099ff 852 /* we suppose the run is continuous, last=next...*/
830247a4
IZ
853 minnext = study_chunk(pRExC_state, &scan, &deltanext,
854 next, &data_fake, f);
b81d288d 855 if (min1 > minnext)
c277df42
IZ
856 min1 = minnext;
857 if (max1 < minnext + deltanext)
858 max1 = minnext + deltanext;
859 if (deltanext == I32_MAX)
aca2d497 860 is_inf = is_inf_internal = 1;
c277df42
IZ
861 scan = next;
862 if (data_fake.flags & (SF_HAS_PAR|SF_IN_PAR))
863 pars++;
405ff068 864 if (data && (data_fake.flags & SF_HAS_EVAL))
c277df42 865 data->flags |= SF_HAS_EVAL;
2c2d71f5
JH
866 if (data)
867 data->whilem_c = data_fake.whilem_c;
653099ff 868 if (flags & SCF_DO_STCLASS)
830247a4 869 cl_or(pRExC_state, &accum, &this_class);
b81d288d 870 if (code == SUSPEND)
c277df42
IZ
871 break;
872 }
873 if (code == IFTHEN && num < 2) /* Empty ELSE branch */
874 min1 = 0;
875 if (flags & SCF_DO_SUBSTR) {
876 data->pos_min += min1;
877 data->pos_delta += max1 - min1;
878 if (max1 != min1 || is_inf)
879 data->longest = &(data->longest_float);
880 }
881 min += min1;
882 delta += max1 - min1;
653099ff 883 if (flags & SCF_DO_STCLASS_OR) {
830247a4 884 cl_or(pRExC_state, data->start_class, &accum);
653099ff
GS
885 if (min1) {
886 cl_and(data->start_class, &and_with);
887 flags &= ~SCF_DO_STCLASS;
888 }
889 }
890 else if (flags & SCF_DO_STCLASS_AND) {
de0c8cb8
GS
891 if (min1) {
892 cl_and(data->start_class, &accum);
653099ff 893 flags &= ~SCF_DO_STCLASS;
de0c8cb8
GS
894 }
895 else {
b81d288d 896 /* Switch to OR mode: cache the old value of
de0c8cb8
GS
897 * data->start_class */
898 StructCopy(data->start_class, &and_with,
899 struct regnode_charclass_class);
900 flags &= ~SCF_DO_STCLASS_AND;
901 StructCopy(&accum, data->start_class,
902 struct regnode_charclass_class);
903 flags |= SCF_DO_STCLASS_OR;
904 data->start_class->flags |= ANYOF_EOS;
905 }
653099ff 906 }
a0ed51b3
LW
907 }
908 else if (code == BRANCHJ) /* single branch is optimized. */
c277df42
IZ
909 scan = NEXTOPER(NEXTOPER(scan));
910 else /* single branch is optimized. */
911 scan = NEXTOPER(scan);
912 continue;
a0ed51b3
LW
913 }
914 else if (OP(scan) == EXACT) {
cd439c50 915 I32 l = STR_LEN(scan);
1aa99e6b 916 UV uc = *((U8*)STRING(scan));
a0ed51b3 917 if (UTF) {
1aa99e6b
IH
918 U8 *s = (U8*)STRING(scan);
919 l = utf8_length(s, s + l);
9041c2e3 920 uc = utf8_to_uvchr(s, NULL);
a0ed51b3
LW
921 }
922 min += l;
c277df42 923 if (flags & SCF_DO_SUBSTR) { /* Update longest substr. */
c277df42
IZ
924 /* The code below prefers earlier match for fixed
925 offset, later match for variable offset. */
926 if (data->last_end == -1) { /* Update the start info. */
927 data->last_start_min = data->pos_min;
928 data->last_start_max = is_inf
b81d288d 929 ? I32_MAX : data->pos_min + data->pos_delta;
c277df42 930 }
cd439c50 931 sv_catpvn(data->last_found, STRING(scan), STR_LEN(scan));
0eda9292
JH
932 {
933 SV * sv = data->last_found;
934 MAGIC *mg = SvUTF8(sv) && SvMAGICAL(sv) ?
935 mg_find(sv, PERL_MAGIC_utf8) : NULL;
936 if (mg && mg->mg_len >= 0)
5e43f467
JH
937 mg->mg_len += utf8_length((U8*)STRING(scan),
938 (U8*)STRING(scan)+STR_LEN(scan));
0eda9292 939 }
33b8afdf
JH
940 if (UTF)
941 SvUTF8_on(data->last_found);
c277df42
IZ
942 data->last_end = data->pos_min + l;
943 data->pos_min += l; /* As in the first entry. */
944 data->flags &= ~SF_BEFORE_EOL;
945 }
653099ff
GS
946 if (flags & SCF_DO_STCLASS_AND) {
947 /* Check whether it is compatible with what we know already! */
948 int compat = 1;
949
1aa99e6b 950 if (uc >= 0x100 ||
516a5887 951 (!(data->start_class->flags & (ANYOF_CLASS | ANYOF_LOCALE))
1aa99e6b 952 && !ANYOF_BITMAP_TEST(data->start_class, uc)
653099ff 953 && (!(data->start_class->flags & ANYOF_FOLD)
1aa99e6b 954 || !ANYOF_BITMAP_TEST(data->start_class, PL_fold[uc])))
516a5887 955 )
653099ff
GS
956 compat = 0;
957 ANYOF_CLASS_ZERO(data->start_class);
958 ANYOF_BITMAP_ZERO(data->start_class);
959 if (compat)
1aa99e6b 960 ANYOF_BITMAP_SET(data->start_class, uc);
653099ff 961 data->start_class->flags &= ~ANYOF_EOS;
9b877dbb
IH
962 if (uc < 0x100)
963 data->start_class->flags &= ~ANYOF_UNICODE_ALL;
653099ff
GS
964 }
965 else if (flags & SCF_DO_STCLASS_OR) {
966 /* false positive possible if the class is case-folded */
1aa99e6b 967 if (uc < 0x100)
9b877dbb
IH
968 ANYOF_BITMAP_SET(data->start_class, uc);
969 else
970 data->start_class->flags |= ANYOF_UNICODE_ALL;
653099ff
GS
971 data->start_class->flags &= ~ANYOF_EOS;
972 cl_and(data->start_class, &and_with);
973 }
974 flags &= ~SCF_DO_STCLASS;
a0ed51b3 975 }
653099ff 976 else if (PL_regkind[(U8)OP(scan)] == EXACT) { /* But OP != EXACT! */
cd439c50 977 I32 l = STR_LEN(scan);
1aa99e6b 978 UV uc = *((U8*)STRING(scan));
653099ff
GS
979
980 /* Search for fixed substrings supports EXACT only. */
b81d288d 981 if (flags & SCF_DO_SUBSTR)
830247a4 982 scan_commit(pRExC_state, data);
a0ed51b3 983 if (UTF) {
1aa99e6b
IH
984 U8 *s = (U8 *)STRING(scan);
985 l = utf8_length(s, s + l);
9041c2e3 986 uc = utf8_to_uvchr(s, NULL);
a0ed51b3
LW
987 }
988 min += l;
c277df42 989 if (data && (flags & SCF_DO_SUBSTR))
a0ed51b3 990 data->pos_min += l;
653099ff
GS
991 if (flags & SCF_DO_STCLASS_AND) {
992 /* Check whether it is compatible with what we know already! */
993 int compat = 1;
994
1aa99e6b 995 if (uc >= 0x100 ||
516a5887 996 (!(data->start_class->flags & (ANYOF_CLASS | ANYOF_LOCALE))
1aa99e6b 997 && !ANYOF_BITMAP_TEST(data->start_class, uc)
516a5887 998 && !ANYOF_BITMAP_TEST(data->start_class, PL_fold[uc])))
653099ff
GS
999 compat = 0;
1000 ANYOF_CLASS_ZERO(data->start_class);
1001 ANYOF_BITMAP_ZERO(data->start_class);
1002 if (compat) {
1aa99e6b 1003 ANYOF_BITMAP_SET(data->start_class, uc);
653099ff
GS
1004 data->start_class->flags &= ~ANYOF_EOS;
1005 data->start_class->flags |= ANYOF_FOLD;
1006 if (OP(scan) == EXACTFL)
1007 data->start_class->flags |= ANYOF_LOCALE;
1008 }
1009 }
1010 else if (flags & SCF_DO_STCLASS_OR) {
1011 if (data->start_class->flags & ANYOF_FOLD) {
1012 /* false positive possible if the class is case-folded.
1013 Assume that the locale settings are the same... */
1aa99e6b
IH
1014 if (uc < 0x100)
1015 ANYOF_BITMAP_SET(data->start_class, uc);
653099ff
GS
1016 data->start_class->flags &= ~ANYOF_EOS;
1017 }
1018 cl_and(data->start_class, &and_with);
1019 }
1020 flags &= ~SCF_DO_STCLASS;
a0ed51b3 1021 }
4d61ec05 1022 else if (strchr((char*)PL_varies,OP(scan))) {
9c5ffd7c 1023 I32 mincount, maxcount, minnext, deltanext, fl = 0;
aa7a4b56 1024 I32 f = flags, pos_before = 0;
c277df42 1025 regnode *oscan = scan;
653099ff
GS
1026 struct regnode_charclass_class this_class;
1027 struct regnode_charclass_class *oclass = NULL;
727f22e3 1028 I32 next_is_eval = 0;
653099ff 1029
22c35a8c 1030 switch (PL_regkind[(U8)OP(scan)]) {
653099ff 1031 case WHILEM: /* End of (?:...)* . */
c277df42
IZ
1032 scan = NEXTOPER(scan);
1033 goto finish;
1034 case PLUS:
653099ff 1035 if (flags & (SCF_DO_SUBSTR | SCF_DO_STCLASS)) {
c277df42 1036 next = NEXTOPER(scan);
653099ff 1037 if (OP(next) == EXACT || (flags & SCF_DO_STCLASS)) {
b81d288d
AB
1038 mincount = 1;
1039 maxcount = REG_INFTY;
c277df42
IZ
1040 next = regnext(scan);
1041 scan = NEXTOPER(scan);
1042 goto do_curly;
1043 }
1044 }
1045 if (flags & SCF_DO_SUBSTR)
1046 data->pos_min++;
1047 min++;
1048 /* Fall through. */
1049 case STAR:
653099ff
GS
1050 if (flags & SCF_DO_STCLASS) {
1051 mincount = 0;
b81d288d 1052 maxcount = REG_INFTY;
653099ff
GS
1053 next = regnext(scan);
1054 scan = NEXTOPER(scan);
1055 goto do_curly;
1056 }
b81d288d 1057 is_inf = is_inf_internal = 1;
c277df42
IZ
1058 scan = regnext(scan);
1059 if (flags & SCF_DO_SUBSTR) {
830247a4 1060 scan_commit(pRExC_state, data); /* Cannot extend fixed substrings */
c277df42
IZ
1061 data->longest = &(data->longest_float);
1062 }
1063 goto optimize_curly_tail;
1064 case CURLY:
b81d288d 1065 mincount = ARG1(scan);
c277df42
IZ
1066 maxcount = ARG2(scan);
1067 next = regnext(scan);
cb434fcc
IZ
1068 if (OP(scan) == CURLYX) {
1069 I32 lp = (data ? *(data->last_closep) : 0);
1070
1071 scan->flags = ((lp <= U8_MAX) ? lp : U8_MAX);
1072 }
c277df42 1073 scan = NEXTOPER(scan) + EXTRA_STEP_2ARGS;
727f22e3 1074 next_is_eval = (OP(scan) == EVAL);
c277df42
IZ
1075 do_curly:
1076 if (flags & SCF_DO_SUBSTR) {
830247a4 1077 if (mincount == 0) scan_commit(pRExC_state,data); /* Cannot extend fixed substrings */
c277df42
IZ
1078 pos_before = data->pos_min;
1079 }
1080 if (data) {
1081 fl = data->flags;
1082 data->flags &= ~(SF_HAS_PAR|SF_IN_PAR|SF_HAS_EVAL);
1083 if (is_inf)
1084 data->flags |= SF_IS_INF;
1085 }
653099ff 1086 if (flags & SCF_DO_STCLASS) {
830247a4 1087 cl_init(pRExC_state, &this_class);
653099ff
GS
1088 oclass = data->start_class;
1089 data->start_class = &this_class;
1090 f |= SCF_DO_STCLASS_AND;
1091 f &= ~SCF_DO_STCLASS_OR;
1092 }
e1901655
IZ
1093 /* These are the cases when once a subexpression
1094 fails at a particular position, it cannot succeed
1095 even after backtracking at the enclosing scope.
b81d288d 1096
e1901655
IZ
1097 XXXX what if minimal match and we are at the
1098 initial run of {n,m}? */
1099 if ((mincount != maxcount - 1) && (maxcount != REG_INFTY))
1100 f &= ~SCF_WHILEM_VISITED_POS;
653099ff 1101
c277df42 1102 /* This will finish on WHILEM, setting scan, or on NULL: */
b81d288d
AB
1103 minnext = study_chunk(pRExC_state, &scan, &deltanext, last, data,
1104 mincount == 0
653099ff
GS
1105 ? (f & ~SCF_DO_SUBSTR) : f);
1106
1107 if (flags & SCF_DO_STCLASS)
1108 data->start_class = oclass;
1109 if (mincount == 0 || minnext == 0) {
1110 if (flags & SCF_DO_STCLASS_OR) {
830247a4 1111 cl_or(pRExC_state, data->start_class, &this_class);
653099ff
GS
1112 }
1113 else if (flags & SCF_DO_STCLASS_AND) {
b81d288d 1114 /* Switch to OR mode: cache the old value of
653099ff
GS
1115 * data->start_class */
1116 StructCopy(data->start_class, &and_with,
1117 struct regnode_charclass_class);
1118 flags &= ~SCF_DO_STCLASS_AND;
1119 StructCopy(&this_class, data->start_class,
1120 struct regnode_charclass_class);
1121 flags |= SCF_DO_STCLASS_OR;
1122 data->start_class->flags |= ANYOF_EOS;
1123 }
1124 } else { /* Non-zero len */
1125 if (flags & SCF_DO_STCLASS_OR) {
830247a4 1126 cl_or(pRExC_state, data->start_class, &this_class);
653099ff
GS
1127 cl_and(data->start_class, &and_with);
1128 }
1129 else if (flags & SCF_DO_STCLASS_AND)
1130 cl_and(data->start_class, &this_class);
1131 flags &= ~SCF_DO_STCLASS;
1132 }
c277df42
IZ
1133 if (!scan) /* It was not CURLYX, but CURLY. */
1134 scan = next;
84037bb0 1135 if (ckWARN(WARN_REGEXP)
727f22e3
JP
1136 /* ? quantifier ok, except for (?{ ... }) */
1137 && (next_is_eval || !(mincount == 0 && maxcount == 1))
84037bb0 1138 && (minnext == 0) && (deltanext == 0)
99799961 1139 && data && !(data->flags & (SF_HAS_PAR|SF_IN_PAR))
17feb5d5 1140 && maxcount <= REG_INFTY/3) /* Complement check for big count */
b45f050a 1141 {
830247a4 1142 vWARN(RExC_parse,
b45f050a
JF
1143 "Quantifier unexpected on zero-length expression");
1144 }
1145
c277df42 1146 min += minnext * mincount;
b81d288d 1147 is_inf_internal |= ((maxcount == REG_INFTY
155aba94
GS
1148 && (minnext + deltanext) > 0)
1149 || deltanext == I32_MAX);
aca2d497 1150 is_inf |= is_inf_internal;
c277df42
IZ
1151 delta += (minnext + deltanext) * maxcount - minnext * mincount;
1152
1153 /* Try powerful optimization CURLYX => CURLYN. */
b81d288d 1154 if ( OP(oscan) == CURLYX && data
c277df42
IZ
1155 && data->flags & SF_IN_PAR
1156 && !(data->flags & SF_HAS_EVAL)
1157 && !deltanext && minnext == 1 ) {
1158 /* Try to optimize to CURLYN. */
1159 regnode *nxt = NEXTOPER(oscan) + EXTRA_STEP_2ARGS;
497b47a8
JH
1160 regnode *nxt1 = nxt;
1161#ifdef DEBUGGING
1162 regnode *nxt2;
1163#endif
c277df42
IZ
1164
1165 /* Skip open. */
1166 nxt = regnext(nxt);
4d61ec05 1167 if (!strchr((char*)PL_simple,OP(nxt))
22c35a8c 1168 && !(PL_regkind[(U8)OP(nxt)] == EXACT
b81d288d 1169 && STR_LEN(nxt) == 1))
c277df42 1170 goto nogo;
497b47a8 1171#ifdef DEBUGGING
c277df42 1172 nxt2 = nxt;
497b47a8 1173#endif
c277df42 1174 nxt = regnext(nxt);
b81d288d 1175 if (OP(nxt) != CLOSE)
c277df42
IZ
1176 goto nogo;
1177 /* Now we know that nxt2 is the only contents: */
eb160463 1178 oscan->flags = (U8)ARG(nxt);
c277df42
IZ
1179 OP(oscan) = CURLYN;
1180 OP(nxt1) = NOTHING; /* was OPEN. */
1181#ifdef DEBUGGING
1182 OP(nxt1 + 1) = OPTIMIZED; /* was count. */
1183 NEXT_OFF(nxt1+ 1) = 0; /* just for consistancy. */
1184 NEXT_OFF(nxt2) = 0; /* just for consistancy with CURLY. */
1185 OP(nxt) = OPTIMIZED; /* was CLOSE. */
1186 OP(nxt + 1) = OPTIMIZED; /* was count. */
1187 NEXT_OFF(nxt+ 1) = 0; /* just for consistancy. */
b81d288d 1188#endif
c277df42 1189 }
c277df42
IZ
1190 nogo:
1191
1192 /* Try optimization CURLYX => CURLYM. */
b81d288d 1193 if ( OP(oscan) == CURLYX && data
c277df42 1194 && !(data->flags & SF_HAS_PAR)
c277df42 1195 && !(data->flags & SF_HAS_EVAL)
0e788c72
HS
1196 && !deltanext /* atom is fixed width */
1197 && minnext != 0 /* CURLYM can't handle zero width */
1198 ) {
c277df42
IZ
1199 /* XXXX How to optimize if data == 0? */
1200 /* Optimize to a simpler form. */
1201 regnode *nxt = NEXTOPER(oscan) + EXTRA_STEP_2ARGS; /* OPEN */
1202 regnode *nxt2;
1203
1204 OP(oscan) = CURLYM;
1205 while ( (nxt2 = regnext(nxt)) /* skip over embedded stuff*/
b81d288d 1206 && (OP(nxt2) != WHILEM))
c277df42
IZ
1207 nxt = nxt2;
1208 OP(nxt2) = SUCCEED; /* Whas WHILEM */
c277df42
IZ
1209 /* Need to optimize away parenths. */
1210 if (data->flags & SF_IN_PAR) {
1211 /* Set the parenth number. */
1212 regnode *nxt1 = NEXTOPER(oscan) + EXTRA_STEP_2ARGS; /* OPEN*/
1213
b81d288d 1214 if (OP(nxt) != CLOSE)
b45f050a 1215 FAIL("Panic opt close");
eb160463 1216 oscan->flags = (U8)ARG(nxt);
c277df42
IZ
1217 OP(nxt1) = OPTIMIZED; /* was OPEN. */
1218 OP(nxt) = OPTIMIZED; /* was CLOSE. */
1219#ifdef DEBUGGING
1220 OP(nxt1 + 1) = OPTIMIZED; /* was count. */
1221 OP(nxt + 1) = OPTIMIZED; /* was count. */
1222 NEXT_OFF(nxt1 + 1) = 0; /* just for consistancy. */
1223 NEXT_OFF(nxt + 1) = 0; /* just for consistancy. */
b81d288d 1224#endif
c277df42
IZ
1225#if 0
1226 while ( nxt1 && (OP(nxt1) != WHILEM)) {
1227 regnode *nnxt = regnext(nxt1);
b81d288d 1228
c277df42
IZ
1229 if (nnxt == nxt) {
1230 if (reg_off_by_arg[OP(nxt1)])
1231 ARG_SET(nxt1, nxt2 - nxt1);
1232 else if (nxt2 - nxt1 < U16_MAX)
1233 NEXT_OFF(nxt1) = nxt2 - nxt1;
1234 else
1235 OP(nxt) = NOTHING; /* Cannot beautify */
1236 }
1237 nxt1 = nnxt;
1238 }
1239#endif
1240 /* Optimize again: */
b81d288d 1241 study_chunk(pRExC_state, &nxt1, &deltanext, nxt,
e1901655 1242 NULL, 0);
a0ed51b3
LW
1243 }
1244 else
c277df42 1245 oscan->flags = 0;
c277df42 1246 }
e1901655
IZ
1247 else if ((OP(oscan) == CURLYX)
1248 && (flags & SCF_WHILEM_VISITED_POS)
1249 /* See the comment on a similar expression above.
1250 However, this time it not a subexpression
1251 we care about, but the expression itself. */
1252 && (maxcount == REG_INFTY)
1253 && data && ++data->whilem_c < 16) {
1254 /* This stays as CURLYX, we can put the count/of pair. */
2c2d71f5
JH
1255 /* Find WHILEM (as in regexec.c) */
1256 regnode *nxt = oscan + NEXT_OFF(oscan);
1257
1258 if (OP(PREVOPER(nxt)) == NOTHING) /* LONGJMP */
1259 nxt += ARG(nxt);
eb160463
GS
1260 PREVOPER(nxt)->flags = (U8)(data->whilem_c
1261 | (RExC_whilem_seen << 4)); /* On WHILEM */
2c2d71f5 1262 }
b81d288d 1263 if (data && fl & (SF_HAS_PAR|SF_IN_PAR))
c277df42
IZ
1264 pars++;
1265 if (flags & SCF_DO_SUBSTR) {
1266 SV *last_str = Nullsv;
1267 int counted = mincount != 0;
1268
1269 if (data->last_end > 0 && mincount != 0) { /* Ends with a string. */
5d1c421c
JH
1270#if defined(SPARC64_GCC_WORKAROUND)
1271 I32 b = 0;
1272 STRLEN l = 0;
1273 char *s = NULL;
1274 I32 old = 0;
1275
1276 if (pos_before >= data->last_start_min)
1277 b = pos_before;
1278 else
1279 b = data->last_start_min;
1280
1281 l = 0;
1282 s = SvPV(data->last_found, l);
1283 old = b - data->last_start_min;
1284
1285#else
b81d288d 1286 I32 b = pos_before >= data->last_start_min
c277df42
IZ
1287 ? pos_before : data->last_start_min;
1288 STRLEN l;
1289 char *s = SvPV(data->last_found, l);
a0ed51b3 1290 I32 old = b - data->last_start_min;
5d1c421c 1291#endif
a0ed51b3
LW
1292
1293 if (UTF)
1294 old = utf8_hop((U8*)s, old) - (U8*)s;
c277df42 1295
a0ed51b3 1296 l -= old;
c277df42 1297 /* Get the added string: */
79cb57f6 1298 last_str = newSVpvn(s + old, l);
0e933229
IH
1299 if (UTF)
1300 SvUTF8_on(last_str);
c277df42
IZ
1301 if (deltanext == 0 && pos_before == b) {
1302 /* What was added is a constant string */
1303 if (mincount > 1) {
1304 SvGROW(last_str, (mincount * l) + 1);
b81d288d 1305 repeatcpy(SvPVX(last_str) + l,
c277df42
IZ
1306 SvPVX(last_str), l, mincount - 1);
1307 SvCUR(last_str) *= mincount;
1308 /* Add additional parts. */
b81d288d 1309 SvCUR_set(data->last_found,
c277df42
IZ
1310 SvCUR(data->last_found) - l);
1311 sv_catsv(data->last_found, last_str);
0eda9292
JH
1312 {
1313 SV * sv = data->last_found;
1314 MAGIC *mg =
1315 SvUTF8(sv) && SvMAGICAL(sv) ?
1316 mg_find(sv, PERL_MAGIC_utf8) : NULL;
1317 if (mg && mg->mg_len >= 0)
1318 mg->mg_len += CHR_SVLEN(last_str);
1319 }
c277df42
IZ
1320 data->last_end += l * (mincount - 1);
1321 }
2a8d9689
HS
1322 } else {
1323 /* start offset must point into the last copy */
1324 data->last_start_min += minnext * (mincount - 1);
c152dc43
HS
1325 data->last_start_max += is_inf ? I32_MAX
1326 : (maxcount - 1) * (minnext + data->pos_delta);
c277df42
IZ
1327 }
1328 }
1329 /* It is counted once already... */
1330 data->pos_min += minnext * (mincount - counted);
1331 data->pos_delta += - counted * deltanext +
1332 (minnext + deltanext) * maxcount - minnext * mincount;
1333 if (mincount != maxcount) {
653099ff
GS
1334 /* Cannot extend fixed substrings found inside
1335 the group. */
830247a4 1336 scan_commit(pRExC_state,data);
c277df42
IZ
1337 if (mincount && last_str) {
1338 sv_setsv(data->last_found, last_str);
1339 data->last_end = data->pos_min;
b81d288d 1340 data->last_start_min =
a0ed51b3 1341 data->pos_min - CHR_SVLEN(last_str);
b81d288d
AB
1342 data->last_start_max = is_inf
1343 ? I32_MAX
c277df42 1344 : data->pos_min + data->pos_delta
a0ed51b3 1345 - CHR_SVLEN(last_str);
c277df42
IZ
1346 }
1347 data->longest = &(data->longest_float);
1348 }
aca2d497 1349 SvREFCNT_dec(last_str);
c277df42 1350 }
405ff068 1351 if (data && (fl & SF_HAS_EVAL))
c277df42
IZ
1352 data->flags |= SF_HAS_EVAL;
1353 optimize_curly_tail:
c277df42 1354 if (OP(oscan) != CURLYX) {
22c35a8c 1355 while (PL_regkind[(U8)OP(next = regnext(oscan))] == NOTHING
c277df42
IZ
1356 && NEXT_OFF(next))
1357 NEXT_OFF(oscan) += NEXT_OFF(next);
1358 }
c277df42 1359 continue;
653099ff 1360 default: /* REF and CLUMP only? */
c277df42 1361 if (flags & SCF_DO_SUBSTR) {
830247a4 1362 scan_commit(pRExC_state,data); /* Cannot expect anything... */
c277df42
IZ
1363 data->longest = &(data->longest_float);
1364 }
aca2d497 1365 is_inf = is_inf_internal = 1;
653099ff 1366 if (flags & SCF_DO_STCLASS_OR)
830247a4 1367 cl_anything(pRExC_state, data->start_class);
653099ff 1368 flags &= ~SCF_DO_STCLASS;
c277df42
IZ
1369 break;
1370 }
a0ed51b3 1371 }
ffc61ed2 1372 else if (strchr((char*)PL_simple,OP(scan))) {
9c5ffd7c 1373 int value = 0;
653099ff 1374
c277df42 1375 if (flags & SCF_DO_SUBSTR) {
830247a4 1376 scan_commit(pRExC_state,data);
c277df42
IZ
1377 data->pos_min++;
1378 }
1379 min++;
653099ff
GS
1380 if (flags & SCF_DO_STCLASS) {
1381 data->start_class->flags &= ~ANYOF_EOS; /* No match on empty */
1382
1383 /* Some of the logic below assumes that switching
1384 locale on will only add false positives. */
1385 switch (PL_regkind[(U8)OP(scan)]) {
653099ff 1386 case SANY:
653099ff
GS
1387 default:
1388 do_default:
1389 /* Perl_croak(aTHX_ "panic: unexpected simple REx opcode %d", OP(scan)); */
1390 if (flags & SCF_DO_STCLASS_OR) /* Allow everything */
830247a4 1391 cl_anything(pRExC_state, data->start_class);
653099ff
GS
1392 break;
1393 case REG_ANY:
1394 if (OP(scan) == SANY)
1395 goto do_default;
1396 if (flags & SCF_DO_STCLASS_OR) { /* Everything but \n */
1397 value = (ANYOF_BITMAP_TEST(data->start_class,'\n')
1398 || (data->start_class->flags & ANYOF_CLASS));
830247a4 1399 cl_anything(pRExC_state, data->start_class);
653099ff
GS
1400 }
1401 if (flags & SCF_DO_STCLASS_AND || !value)
1402 ANYOF_BITMAP_CLEAR(data->start_class,'\n');
1403 break;
1404 case ANYOF:
1405 if (flags & SCF_DO_STCLASS_AND)
1406 cl_and(data->start_class,
1407 (struct regnode_charclass_class*)scan);
1408 else
830247a4 1409 cl_or(pRExC_state, data->start_class,
653099ff
GS
1410 (struct regnode_charclass_class*)scan);
1411 break;
1412 case ALNUM:
1413 if (flags & SCF_DO_STCLASS_AND) {
1414 if (!(data->start_class->flags & ANYOF_LOCALE)) {
1415 ANYOF_CLASS_CLEAR(data->start_class,ANYOF_NALNUM);
1416 for (value = 0; value < 256; value++)
1417 if (!isALNUM(value))
1418 ANYOF_BITMAP_CLEAR(data->start_class, value);
1419 }
1420 }
1421 else {
1422 if (data->start_class->flags & ANYOF_LOCALE)
1423 ANYOF_CLASS_SET(data->start_class,ANYOF_ALNUM);
1424 else {
1425 for (value = 0; value < 256; value++)
1426 if (isALNUM(value))
b81d288d 1427 ANYOF_BITMAP_SET(data->start_class, value);
653099ff
GS
1428 }
1429 }
1430 break;
1431 case ALNUML:
1432 if (flags & SCF_DO_STCLASS_AND) {
1433 if (data->start_class->flags & ANYOF_LOCALE)
1434 ANYOF_CLASS_CLEAR(data->start_class,ANYOF_NALNUM);
1435 }
1436 else {
1437 ANYOF_CLASS_SET(data->start_class,ANYOF_ALNUM);
1438 data->start_class->flags |= ANYOF_LOCALE;
1439 }
1440 break;
1441 case NALNUM:
1442 if (flags & SCF_DO_STCLASS_AND) {
1443 if (!(data->start_class->flags & ANYOF_LOCALE)) {
1444 ANYOF_CLASS_CLEAR(data->start_class,ANYOF_ALNUM);
1445 for (value = 0; value < 256; value++)
1446 if (isALNUM(value))
1447 ANYOF_BITMAP_CLEAR(data->start_class, value);
1448 }
1449 }
1450 else {
1451 if (data->start_class->flags & ANYOF_LOCALE)
1452 ANYOF_CLASS_SET(data->start_class,ANYOF_NALNUM);
1453 else {
1454 for (value = 0; value < 256; value++)
1455 if (!isALNUM(value))
b81d288d 1456 ANYOF_BITMAP_SET(data->start_class, value);
653099ff
GS
1457 }
1458 }
1459 break;
1460 case NALNUML:
1461 if (flags & SCF_DO_STCLASS_AND) {
1462 if (data->start_class->flags & ANYOF_LOCALE)
1463 ANYOF_CLASS_CLEAR(data->start_class,ANYOF_ALNUM);
1464 }
1465 else {
1466 data->start_class->flags |= ANYOF_LOCALE;
1467 ANYOF_CLASS_SET(data->start_class,ANYOF_NALNUM);
1468 }
1469 break;
1470 case SPACE:
1471 if (flags & SCF_DO_STCLASS_AND) {
1472 if (!(data->start_class->flags & ANYOF_LOCALE)) {
1473 ANYOF_CLASS_CLEAR(data->start_class,ANYOF_NSPACE);
1474 for (value = 0; value < 256; value++)
1475 if (!isSPACE(value))
1476 ANYOF_BITMAP_CLEAR(data->start_class, value);
1477 }
1478 }
1479 else {
1480 if (data->start_class->flags & ANYOF_LOCALE)
1481 ANYOF_CLASS_SET(data->start_class,ANYOF_SPACE);
1482 else {
1483 for (value = 0; value < 256; value++)
1484 if (isSPACE(value))
b81d288d 1485 ANYOF_BITMAP_SET(data->start_class, value);
653099ff
GS
1486 }
1487 }
1488 break;
1489 case SPACEL:
1490 if (flags & SCF_DO_STCLASS_AND) {
1491 if (data->start_class->flags & ANYOF_LOCALE)
1492 ANYOF_CLASS_CLEAR(data->start_class,ANYOF_NSPACE);
1493 }
1494 else {
1495 data->start_class->flags |= ANYOF_LOCALE;
1496 ANYOF_CLASS_SET(data->start_class,ANYOF_SPACE);
1497 }
1498 break;
1499 case NSPACE:
1500 if (flags & SCF_DO_STCLASS_AND) {
1501 if (!(data->start_class->flags & ANYOF_LOCALE)) {
1502 ANYOF_CLASS_CLEAR(data->start_class,ANYOF_SPACE);
1503 for (value = 0; value < 256; value++)
1504 if (isSPACE(value))
1505 ANYOF_BITMAP_CLEAR(data->start_class, value);
1506 }
1507 }
1508 else {
1509 if (data->start_class->flags & ANYOF_LOCALE)
1510 ANYOF_CLASS_SET(data->start_class,ANYOF_NSPACE);
1511 else {
1512 for (value = 0; value < 256; value++)
1513 if (!isSPACE(value))
b81d288d 1514 ANYOF_BITMAP_SET(data->start_class, value);
653099ff
GS
1515 }
1516 }
1517 break;
1518 case NSPACEL:
1519 if (flags & SCF_DO_STCLASS_AND) {
1520 if (data->start_class->flags & ANYOF_LOCALE) {
1521 ANYOF_CLASS_CLEAR(data->start_class,ANYOF_SPACE);
1522 for (value = 0; value < 256; value++)
1523 if (!isSPACE(value))
1524 ANYOF_BITMAP_CLEAR(data->start_class, value);
1525 }
1526 }
1527 else {
1528 data->start_class->flags |= ANYOF_LOCALE;
1529 ANYOF_CLASS_SET(data->start_class,ANYOF_NSPACE);
1530 }
1531 break;
1532 case DIGIT:
1533 if (flags & SCF_DO_STCLASS_AND) {
1534 ANYOF_CLASS_CLEAR(data->start_class,ANYOF_NDIGIT);
1535 for (value = 0; value < 256; value++)
1536 if (!isDIGIT(value))
1537 ANYOF_BITMAP_CLEAR(data->start_class, value);
1538 }
1539 else {
1540 if (data->start_class->flags & ANYOF_LOCALE)
1541 ANYOF_CLASS_SET(data->start_class,ANYOF_DIGIT);
1542 else {
1543 for (value = 0; value < 256; value++)
1544 if (isDIGIT(value))
b81d288d 1545 ANYOF_BITMAP_SET(data->start_class, value);
653099ff
GS
1546 }
1547 }
1548 break;
1549 case NDIGIT:
1550 if (flags & SCF_DO_STCLASS_AND) {
1551 ANYOF_CLASS_CLEAR(data->start_class,ANYOF_DIGIT);
1552 for (value = 0; value < 256; value++)
1553 if (isDIGIT(value))
1554 ANYOF_BITMAP_CLEAR(data->start_class, value);
1555 }
1556 else {
1557 if (data->start_class->flags & ANYOF_LOCALE)
1558 ANYOF_CLASS_SET(data->start_class,ANYOF_NDIGIT);
1559 else {
1560 for (value = 0; value < 256; value++)
1561 if (!isDIGIT(value))
b81d288d 1562 ANYOF_BITMAP_SET(data->start_class, value);
653099ff
GS
1563 }
1564 }
1565 break;
1566 }
1567 if (flags & SCF_DO_STCLASS_OR)
1568 cl_and(data->start_class, &and_with);
1569 flags &= ~SCF_DO_STCLASS;
1570 }
a0ed51b3 1571 }
22c35a8c 1572 else if (PL_regkind[(U8)OP(scan)] == EOL && flags & SCF_DO_SUBSTR) {
c277df42
IZ
1573 data->flags |= (OP(scan) == MEOL
1574 ? SF_BEFORE_MEOL
1575 : SF_BEFORE_SEOL);
a0ed51b3 1576 }
653099ff
GS
1577 else if ( PL_regkind[(U8)OP(scan)] == BRANCHJ
1578 /* Lookbehind, or need to calculate parens/evals/stclass: */
1579 && (scan->flags || data || (flags & SCF_DO_STCLASS))
c277df42 1580 && (OP(scan) == IFMATCH || OP(scan) == UNLESSM)) {
653099ff 1581 /* Lookahead/lookbehind */
cb434fcc 1582 I32 deltanext, minnext, fake = 0;
c277df42 1583 regnode *nscan;
653099ff
GS
1584 struct regnode_charclass_class intrnl;
1585 int f = 0;
c277df42
IZ
1586
1587 data_fake.flags = 0;
b81d288d 1588 if (data) {
2c2d71f5 1589 data_fake.whilem_c = data->whilem_c;
cb434fcc
IZ
1590 data_fake.last_closep = data->last_closep;
1591 }
1592 else
1593 data_fake.last_closep = &fake;
653099ff
GS
1594 if ( flags & SCF_DO_STCLASS && !scan->flags
1595 && OP(scan) == IFMATCH ) { /* Lookahead */
830247a4 1596 cl_init(pRExC_state, &intrnl);
653099ff 1597 data_fake.start_class = &intrnl;
e1901655 1598 f |= SCF_DO_STCLASS_AND;
653099ff 1599 }
e1901655
IZ
1600 if (flags & SCF_WHILEM_VISITED_POS)
1601 f |= SCF_WHILEM_VISITED_POS;
c277df42
IZ
1602 next = regnext(scan);
1603 nscan = NEXTOPER(NEXTOPER(scan));
830247a4 1604 minnext = study_chunk(pRExC_state, &nscan, &deltanext, last, &data_fake, f);
c277df42
IZ
1605 if (scan->flags) {
1606 if (deltanext) {
9baa0206 1607 vFAIL("Variable length lookbehind not implemented");
a0ed51b3
LW
1608 }
1609 else if (minnext > U8_MAX) {
9baa0206 1610 vFAIL2("Lookbehind longer than %"UVuf" not implemented", (UV)U8_MAX);
c277df42 1611 }
eb160463 1612 scan->flags = (U8)minnext;
c277df42
IZ
1613 }
1614 if (data && data_fake.flags & (SF_HAS_PAR|SF_IN_PAR))
1615 pars++;
405ff068 1616 if (data && (data_fake.flags & SF_HAS_EVAL))
c277df42 1617 data->flags |= SF_HAS_EVAL;
2c2d71f5
JH
1618 if (data)
1619 data->whilem_c = data_fake.whilem_c;
e1901655 1620 if (f & SCF_DO_STCLASS_AND) {
653099ff
GS
1621 int was = (data->start_class->flags & ANYOF_EOS);
1622
1623 cl_and(data->start_class, &intrnl);
1624 if (was)
1625 data->start_class->flags |= ANYOF_EOS;
1626 }
a0ed51b3
LW
1627 }
1628 else if (OP(scan) == OPEN) {
c277df42 1629 pars++;
a0ed51b3 1630 }
cb434fcc 1631 else if (OP(scan) == CLOSE) {
eb160463 1632 if ((I32)ARG(scan) == is_par) {
cb434fcc 1633 next = regnext(scan);
c277df42 1634
cb434fcc
IZ
1635 if ( next && (OP(next) != WHILEM) && next < last)
1636 is_par = 0; /* Disable optimization */
1637 }
1638 if (data)
1639 *(data->last_closep) = ARG(scan);
a0ed51b3
LW
1640 }
1641 else if (OP(scan) == EVAL) {
c277df42
IZ
1642 if (data)
1643 data->flags |= SF_HAS_EVAL;
1644 }
96776eda 1645 else if (OP(scan) == LOGICAL && scan->flags == 2) { /* Embedded follows */
0f5d15d6 1646 if (flags & SCF_DO_SUBSTR) {
830247a4 1647 scan_commit(pRExC_state,data);
0f5d15d6
IZ
1648 data->longest = &(data->longest_float);
1649 }
1650 is_inf = is_inf_internal = 1;
653099ff 1651 if (flags & SCF_DO_STCLASS_OR) /* Allow everything */
830247a4 1652 cl_anything(pRExC_state, data->start_class);
96776eda 1653 flags &= ~SCF_DO_STCLASS;
0f5d15d6 1654 }
c277df42
IZ
1655 /* Else: zero-length, ignore. */
1656 scan = regnext(scan);
1657 }
1658
1659 finish:
1660 *scanp = scan;
aca2d497 1661 *deltap = is_inf_internal ? I32_MAX : delta;
b81d288d 1662 if (flags & SCF_DO_SUBSTR && is_inf)
c277df42
IZ
1663 data->pos_delta = I32_MAX - data->pos_min;
1664 if (is_par > U8_MAX)
1665 is_par = 0;
1666 if (is_par && pars==1 && data) {
1667 data->flags |= SF_IN_PAR;
1668 data->flags &= ~SF_HAS_PAR;
a0ed51b3
LW
1669 }
1670 else if (pars && data) {
c277df42
IZ
1671 data->flags |= SF_HAS_PAR;
1672 data->flags &= ~SF_IN_PAR;
1673 }
653099ff
GS
1674 if (flags & SCF_DO_STCLASS_OR)
1675 cl_and(data->start_class, &and_with);
c277df42
IZ
1676 return min;
1677}
1678
76e3520e 1679STATIC I32
830247a4 1680S_add_data(pTHX_ RExC_state_t *pRExC_state, I32 n, char *s)
c277df42 1681{
830247a4 1682 if (RExC_rx->data) {
b81d288d
AB
1683 Renewc(RExC_rx->data,
1684 sizeof(*RExC_rx->data) + sizeof(void*) * (RExC_rx->data->count + n - 1),
c277df42 1685 char, struct reg_data);
830247a4
IZ
1686 Renew(RExC_rx->data->what, RExC_rx->data->count + n, U8);
1687 RExC_rx->data->count += n;
a0ed51b3
LW
1688 }
1689 else {
830247a4 1690 Newc(1207, RExC_rx->data, sizeof(*RExC_rx->data) + sizeof(void*) * (n - 1),
c277df42 1691 char, struct reg_data);
830247a4
IZ
1692 New(1208, RExC_rx->data->what, n, U8);
1693 RExC_rx->data->count = n;
c277df42 1694 }
830247a4
IZ
1695 Copy(s, RExC_rx->data->what + RExC_rx->data->count - n, n, U8);
1696 return RExC_rx->data->count - n;
c277df42
IZ
1697}
1698
d88dccdf 1699void
864dbfa3 1700Perl_reginitcolors(pTHX)
d88dccdf 1701{
d88dccdf
IZ
1702 int i = 0;
1703 char *s = PerlEnv_getenv("PERL_RE_COLORS");
b81d288d 1704
d88dccdf
IZ
1705 if (s) {
1706 PL_colors[0] = s = savepv(s);
1707 while (++i < 6) {
1708 s = strchr(s, '\t');
1709 if (s) {
1710 *s = '\0';
1711 PL_colors[i] = ++s;
1712 }
1713 else
c712d376 1714 PL_colors[i] = s = "";
d88dccdf
IZ
1715 }
1716 } else {
b81d288d 1717 while (i < 6)
d88dccdf
IZ
1718 PL_colors[i++] = "";
1719 }
1720 PL_colorset = 1;
1721}
1722
8615cb43 1723
a687059c 1724/*
e50aee73 1725 - pregcomp - compile a regular expression into internal code
a687059c
LW
1726 *
1727 * We can't allocate space until we know how big the compiled form will be,
1728 * but we can't compile it (and thus know how big it is) until we've got a
1729 * place to put the code. So we cheat: we compile it twice, once with code
1730 * generation turned off and size counting turned on, and once "for real".
1731 * This also means that we don't allocate space until we are sure that the
1732 * thing really will compile successfully, and we never have to move the
1733 * code and thus invalidate pointers into it. (Note that it has to be in
1734 * one piece because free() must be able to free it all.) [NB: not true in perl]
1735 *
1736 * Beware that the optimization-preparation code in here knows about some
1737 * of the structure of the compiled regexp. [I'll say.]
1738 */
1739regexp *
864dbfa3 1740Perl_pregcomp(pTHX_ char *exp, char *xend, PMOP *pm)
a687059c 1741{
a0d0e21e 1742 register regexp *r;
c277df42 1743 regnode *scan;
c277df42 1744 regnode *first;
a0d0e21e 1745 I32 flags;
a0d0e21e
LW
1746 I32 minlen = 0;
1747 I32 sawplus = 0;
1748 I32 sawopen = 0;
2c2d71f5 1749 scan_data_t data;
830247a4
IZ
1750 RExC_state_t RExC_state;
1751 RExC_state_t *pRExC_state = &RExC_state;
a0d0e21e
LW
1752
1753 if (exp == NULL)
c277df42 1754 FAIL("NULL regexp argument");
a0d0e21e 1755
a5961de5 1756 RExC_utf8 = pm->op_pmdynflags & PMdf_CMP_UTF8;
a0ed51b3 1757
5cfc7842 1758 RExC_precomp = exp;
a5961de5
JH
1759 DEBUG_r({
1760 if (!PL_colorset) reginitcolors();
1761 PerlIO_printf(Perl_debug_log, "%sCompiling REx%s `%s%*s%s'\n",
1762 PL_colors[4],PL_colors[5],PL_colors[0],
1763 (int)(xend - exp), RExC_precomp, PL_colors[1]);
1764 });
e2509266 1765 RExC_flags = pm->op_pmflags;
830247a4 1766 RExC_sawback = 0;
bbce6d69 1767
830247a4
IZ
1768 RExC_seen = 0;
1769 RExC_seen_zerolen = *exp == '^' ? -1 : 0;
1770 RExC_seen_evals = 0;
1771 RExC_extralen = 0;
c277df42 1772
bbce6d69 1773 /* First pass: determine size, legality. */
830247a4 1774 RExC_parse = exp;
fac92740 1775 RExC_start = exp;
830247a4
IZ
1776 RExC_end = xend;
1777 RExC_naughty = 0;
1778 RExC_npar = 1;
1779 RExC_size = 0L;
1780 RExC_emit = &PL_regdummy;
1781 RExC_whilem_seen = 0;
85ddcde9
JH
1782#if 0 /* REGC() is (currently) a NOP at the first pass.
1783 * Clever compilers notice this and complain. --jhi */
830247a4 1784 REGC((U8)REG_MAGIC, (char*)RExC_emit);
85ddcde9 1785#endif
830247a4 1786 if (reg(pRExC_state, 0, &flags) == NULL) {
830247a4 1787 RExC_precomp = Nullch;
a0d0e21e
LW
1788 return(NULL);
1789 }
830247a4 1790 DEBUG_r(PerlIO_printf(Perl_debug_log, "size %"IVdf" ", (IV)RExC_size));
c277df42 1791
c277df42
IZ
1792 /* Small enough for pointer-storage convention?
1793 If extralen==0, this means that we will not need long jumps. */
830247a4
IZ
1794 if (RExC_size >= 0x10000L && RExC_extralen)
1795 RExC_size += RExC_extralen;
c277df42 1796 else
830247a4
IZ
1797 RExC_extralen = 0;
1798 if (RExC_whilem_seen > 15)
1799 RExC_whilem_seen = 15;
a0d0e21e 1800
bbce6d69 1801 /* Allocate space and initialize. */
830247a4 1802 Newc(1001, r, sizeof(regexp) + (unsigned)RExC_size * sizeof(regnode),
c277df42 1803 char, regexp);
a0d0e21e 1804 if (r == NULL)
b45f050a
JF
1805 FAIL("Regexp out of space");
1806
0f79a09d
GS
1807#ifdef DEBUGGING
1808 /* avoid reading uninitialized memory in DEBUGGING code in study_chunk() */
830247a4 1809 Zero(r, sizeof(regexp) + (unsigned)RExC_size * sizeof(regnode), char);
0f79a09d 1810#endif
c277df42 1811 r->refcnt = 1;
bbce6d69 1812 r->prelen = xend - exp;
5cfc7842 1813 r->precomp = savepvn(RExC_precomp, r->prelen);
cf93c79d 1814 r->subbeg = NULL;
ed252734
NC
1815#ifdef PERL_COPY_ON_WRITE
1816 r->saved_copy = Nullsv;
1817#endif
cf93c79d 1818 r->reganch = pm->op_pmflags & PMf_COMPILETIME;
830247a4 1819 r->nparens = RExC_npar - 1; /* set early to validate backrefs */
4327152a
IZ
1820
1821 r->substrs = 0; /* Useful during FAIL. */
1822 r->startp = 0; /* Useful during FAIL. */
1823 r->endp = 0; /* Useful during FAIL. */
1824
fac92740
MJD
1825 Newz(1304, r->offsets, 2*RExC_size+1, U32); /* MJD 20001228 */
1826 if (r->offsets) {
1827 r->offsets[0] = RExC_size;
1828 }
1829 DEBUG_r(PerlIO_printf(Perl_debug_log,
392fbf5d 1830 "%s %"UVuf" bytes for offset annotations.\n",
fac92740 1831 r->offsets ? "Got" : "Couldn't get",
392fbf5d 1832 (UV)((2*RExC_size+1) * sizeof(U32))));
fac92740 1833
830247a4 1834 RExC_rx = r;
bbce6d69 1835
1836 /* Second pass: emit code. */
e2509266 1837 RExC_flags = pm->op_pmflags; /* don't let top level (?i) bleed */
830247a4
IZ
1838 RExC_parse = exp;
1839 RExC_end = xend;
1840 RExC_naughty = 0;
1841 RExC_npar = 1;
fac92740 1842 RExC_emit_start = r->program;
830247a4 1843 RExC_emit = r->program;
2cd61cdb 1844 /* Store the count of eval-groups for security checks: */
eb160463 1845 RExC_emit->next_off = (U16)((RExC_seen_evals > U16_MAX) ? U16_MAX : RExC_seen_evals);
830247a4 1846 REGC((U8)REG_MAGIC, (char*) RExC_emit++);
c277df42 1847 r->data = 0;
830247a4 1848 if (reg(pRExC_state, 0, &flags) == NULL)
a0d0e21e
LW
1849 return(NULL);
1850
1851 /* Dig out information for optimizations. */
cf93c79d 1852 r->reganch = pm->op_pmflags & PMf_COMPILETIME; /* Again? */
e2509266 1853 pm->op_pmflags = RExC_flags;
a0ed51b3 1854 if (UTF)
5ff6fc6d 1855 r->reganch |= ROPT_UTF8; /* Unicode in it? */
c277df42 1856 r->regstclass = NULL;
830247a4 1857 if (RExC_naughty >= 10) /* Probably an expensive pattern. */
a0ed51b3 1858 r->reganch |= ROPT_NAUGHTY;
c277df42 1859 scan = r->program + 1; /* First BRANCH. */
2779dcf1
IZ
1860
1861 /* XXXX To minimize changes to RE engine we always allocate
1862 3-units-long substrs field. */
1863 Newz(1004, r->substrs, 1, struct reg_substr_data);
1864
2c2d71f5 1865 StructCopy(&zero_scan_data, &data, scan_data_t);
653099ff 1866 /* XXXX Should not we check for something else? Usually it is OPEN1... */
c277df42 1867 if (OP(scan) != BRANCH) { /* Only one top-level choice. */
c277df42 1868 I32 fake;
c5254dd6 1869 STRLEN longest_float_length, longest_fixed_length;
653099ff
GS
1870 struct regnode_charclass_class ch_class;
1871 int stclass_flag;
cb434fcc 1872 I32 last_close = 0;
a0d0e21e
LW
1873
1874 first = scan;
c277df42 1875 /* Skip introductions and multiplicators >= 1. */
a0d0e21e 1876 while ((OP(first) == OPEN && (sawopen = 1)) ||
653099ff 1877 /* An OR of *one* alternative - should not happen now. */
a0d0e21e
LW
1878 (OP(first) == BRANCH && OP(regnext(first)) != BRANCH) ||
1879 (OP(first) == PLUS) ||
1880 (OP(first) == MINMOD) ||
653099ff 1881 /* An {n,m} with n>0 */
22c35a8c 1882 (PL_regkind[(U8)OP(first)] == CURLY && ARG1(first) > 0) ) {
a0d0e21e
LW
1883 if (OP(first) == PLUS)
1884 sawplus = 1;
1885 else
1886 first += regarglen[(U8)OP(first)];
1887 first = NEXTOPER(first);
a687059c
LW
1888 }
1889
a0d0e21e
LW
1890 /* Starting-point info. */
1891 again:
653099ff 1892 if (PL_regkind[(U8)OP(first)] == EXACT) {
1aa99e6b
IH
1893 if (OP(first) == EXACT)
1894 ; /* Empty, get anchored substr later. */
1895 else if ((OP(first) == EXACTF || OP(first) == EXACTFL))
b3c9acc1
IZ
1896 r->regstclass = first;
1897 }
653099ff 1898 else if (strchr((char*)PL_simple,OP(first)))
a0d0e21e 1899 r->regstclass = first;
22c35a8c
GS
1900 else if (PL_regkind[(U8)OP(first)] == BOUND ||
1901 PL_regkind[(U8)OP(first)] == NBOUND)
a0d0e21e 1902 r->regstclass = first;
22c35a8c 1903 else if (PL_regkind[(U8)OP(first)] == BOL) {
cad2e5aa
JH
1904 r->reganch |= (OP(first) == MBOL
1905 ? ROPT_ANCH_MBOL
1906 : (OP(first) == SBOL
1907 ? ROPT_ANCH_SBOL
1908 : ROPT_ANCH_BOL));
a0d0e21e 1909 first = NEXTOPER(first);
774d564b 1910 goto again;
1911 }
1912 else if (OP(first) == GPOS) {
1913 r->reganch |= ROPT_ANCH_GPOS;
1914 first = NEXTOPER(first);
1915 goto again;
a0d0e21e 1916 }
e09294f4 1917 else if (!sawopen && (OP(first) == STAR &&
22c35a8c 1918 PL_regkind[(U8)OP(NEXTOPER(first))] == REG_ANY) &&
a0d0e21e
LW
1919 !(r->reganch & ROPT_ANCH) )
1920 {
1921 /* turn .* into ^.* with an implied $*=1 */
cad2e5aa
JH
1922 int type = OP(NEXTOPER(first));
1923
ffc61ed2 1924 if (type == REG_ANY)
cad2e5aa
JH
1925 type = ROPT_ANCH_MBOL;
1926 else
1927 type = ROPT_ANCH_SBOL;
1928
1929 r->reganch |= type | ROPT_IMPLICIT;
a0d0e21e 1930 first = NEXTOPER(first);
774d564b 1931 goto again;
a0d0e21e 1932 }
b81d288d 1933 if (sawplus && (!sawopen || !RExC_sawback)
830247a4 1934 && !(RExC_seen & REG_SEEN_EVAL)) /* May examine pos and $& */
cad2e5aa
JH
1935 /* x+ must match at the 1st pos of run of x's */
1936 r->reganch |= ROPT_SKIP;
a0d0e21e 1937
c277df42 1938 /* Scan is after the zeroth branch, first is atomic matcher. */
b81d288d 1939 DEBUG_r(PerlIO_printf(Perl_debug_log, "first at %"IVdf"\n",
b900a521 1940 (IV)(first - scan + 1)));
a0d0e21e
LW
1941 /*
1942 * If there's something expensive in the r.e., find the
1943 * longest literal string that must appear and make it the
1944 * regmust. Resolve ties in favor of later strings, since
1945 * the regstart check works with the beginning of the r.e.
1946 * and avoiding duplication strengthens checking. Not a
1947 * strong reason, but sufficient in the absence of others.
1948 * [Now we resolve ties in favor of the earlier string if
c277df42 1949 * it happens that c_offset_min has been invalidated, since the
a0d0e21e
LW
1950 * earlier string may buy us something the later one won't.]
1951 */
a0d0e21e 1952 minlen = 0;
a687059c 1953
79cb57f6
GS
1954 data.longest_fixed = newSVpvn("",0);
1955 data.longest_float = newSVpvn("",0);
1956 data.last_found = newSVpvn("",0);
c277df42
IZ
1957 data.longest = &(data.longest_fixed);
1958 first = scan;
653099ff 1959 if (!r->regstclass) {
830247a4 1960 cl_init(pRExC_state, &ch_class);
653099ff
GS
1961 data.start_class = &ch_class;
1962 stclass_flag = SCF_DO_STCLASS_AND;
1963 } else /* XXXX Check for BOUND? */
1964 stclass_flag = 0;
cb434fcc 1965 data.last_closep = &last_close;
653099ff 1966
830247a4 1967 minlen = study_chunk(pRExC_state, &first, &fake, scan + RExC_size, /* Up to end */
e1901655 1968 &data, SCF_DO_SUBSTR | SCF_WHILEM_VISITED_POS | stclass_flag);
830247a4 1969 if ( RExC_npar == 1 && data.longest == &(data.longest_fixed)
b81d288d 1970 && data.last_start_min == 0 && data.last_end > 0
830247a4
IZ
1971 && !RExC_seen_zerolen
1972 && (!(RExC_seen & REG_SEEN_GPOS) || (r->reganch & ROPT_ANCH_GPOS)))
c277df42 1973 r->reganch |= ROPT_CHECK_ALL;
830247a4 1974 scan_commit(pRExC_state, &data);
c277df42
IZ
1975 SvREFCNT_dec(data.last_found);
1976
a0ed51b3 1977 longest_float_length = CHR_SVLEN(data.longest_float);
c5254dd6 1978 if (longest_float_length
c277df42
IZ
1979 || (data.flags & SF_FL_BEFORE_EOL
1980 && (!(data.flags & SF_FL_BEFORE_MEOL)
e2509266 1981 || (RExC_flags & PMf_MULTILINE)))) {
cf93c79d
IZ
1982 int t;
1983
a0ed51b3 1984 if (SvCUR(data.longest_fixed) /* ok to leave SvCUR */
aca2d497
IZ
1985 && data.offset_fixed == data.offset_float_min
1986 && SvCUR(data.longest_fixed) == SvCUR(data.longest_float))
1987 goto remove_float; /* As in (a)+. */
1988
33b8afdf
JH
1989 if (SvUTF8(data.longest_float)) {
1990 r->float_utf8 = data.longest_float;
1991 r->float_substr = Nullsv;
1992 } else {
1993 r->float_substr = data.longest_float;
1994 r->float_utf8 = Nullsv;
1995 }
c277df42
IZ
1996 r->float_min_offset = data.offset_float_min;
1997 r->float_max_offset = data.offset_float_max;
cf93c79d
IZ
1998 t = (data.flags & SF_FL_BEFORE_EOL /* Can't have SEOL and MULTI */
1999 && (!(data.flags & SF_FL_BEFORE_MEOL)
e2509266 2000 || (RExC_flags & PMf_MULTILINE)));
33b8afdf 2001 fbm_compile(data.longest_float, t ? FBMcf_TAIL : 0);
a0ed51b3
LW
2002 }
2003 else {
aca2d497 2004 remove_float:
33b8afdf 2005 r->float_substr = r->float_utf8 = Nullsv;
c277df42 2006 SvREFCNT_dec(data.longest_float);
c5254dd6 2007 longest_float_length = 0;
a0d0e21e 2008 }
c277df42 2009
a0ed51b3 2010 longest_fixed_length = CHR_SVLEN(data.longest_fixed);
c5254dd6 2011 if (longest_fixed_length
c277df42
IZ
2012 || (data.flags & SF_FIX_BEFORE_EOL /* Cannot have SEOL and MULTI */
2013 && (!(data.flags & SF_FIX_BEFORE_MEOL)
e2509266 2014 || (RExC_flags & PMf_MULTILINE)))) {
cf93c79d
IZ
2015 int t;
2016
33b8afdf
JH
2017 if (SvUTF8(data.longest_fixed)) {
2018 r->anchored_utf8 = data.longest_fixed;
2019 r->anchored_substr = Nullsv;
2020 } else {
2021 r->anchored_substr = data.longest_fixed;
2022 r->anchored_utf8 = Nullsv;
2023 }
c277df42 2024 r->anchored_offset = data.offset_fixed;
cf93c79d
IZ
2025 t = (data.flags & SF_FIX_BEFORE_EOL /* Can't have SEOL and MULTI */
2026 && (!(data.flags & SF_FIX_BEFORE_MEOL)
e2509266 2027 || (RExC_flags & PMf_MULTILINE)));
33b8afdf 2028 fbm_compile(data.longest_fixed, t ? FBMcf_TAIL : 0);
a0ed51b3
LW
2029 }
2030 else {
33b8afdf 2031 r->anchored_substr = r->anchored_utf8 = Nullsv;
c277df42 2032 SvREFCNT_dec(data.longest_fixed);
c5254dd6 2033 longest_fixed_length = 0;
a0d0e21e 2034 }
b81d288d 2035 if (r->regstclass
ffc61ed2 2036 && (OP(r->regstclass) == REG_ANY || OP(r->regstclass) == SANY))
653099ff 2037 r->regstclass = NULL;
33b8afdf
JH
2038 if ((!(r->anchored_substr || r->anchored_utf8) || r->anchored_offset)
2039 && stclass_flag
653099ff 2040 && !(data.start_class->flags & ANYOF_EOS)
eb160463
GS
2041 && !cl_is_anything(data.start_class))
2042 {
830247a4 2043 I32 n = add_data(pRExC_state, 1, "f");
653099ff 2044
b81d288d 2045 New(1006, RExC_rx->data->data[n], 1,
653099ff
GS
2046 struct regnode_charclass_class);
2047 StructCopy(data.start_class,
830247a4 2048 (struct regnode_charclass_class*)RExC_rx->data->data[n],
653099ff 2049 struct regnode_charclass_class);
830247a4 2050 r->regstclass = (regnode*)RExC_rx->data->data[n];
653099ff 2051 r->reganch &= ~ROPT_SKIP; /* Used in find_byclass(). */
ffc61ed2 2052 PL_regdata = r->data; /* for regprop() */
9c5ffd7c
JH
2053 DEBUG_r({ SV *sv = sv_newmortal();
2054 regprop(sv, (regnode*)data.start_class);
2055 PerlIO_printf(Perl_debug_log,
2056 "synthetic stclass `%s'.\n",
2057 SvPVX(sv));});
653099ff 2058 }
c277df42
IZ
2059
2060 /* A temporary algorithm prefers floated substr to fixed one to dig more info. */
c5254dd6 2061 if (longest_fixed_length > longest_float_length) {
c277df42 2062 r->check_substr = r->anchored_substr;
33b8afdf 2063 r->check_utf8 = r->anchored_utf8;
c277df42
IZ
2064 r->check_offset_min = r->check_offset_max = r->anchored_offset;
2065 if (r->reganch & ROPT_ANCH_SINGLE)
2066 r->reganch |= ROPT_NOSCAN;
a0ed51b3
LW
2067 }
2068 else {
c277df42 2069 r->check_substr = r->float_substr;
33b8afdf 2070 r->check_utf8 = r->float_utf8;
c277df42
IZ
2071 r->check_offset_min = data.offset_float_min;
2072 r->check_offset_max = data.offset_float_max;
a0d0e21e 2073 }
30382c73
IZ
2074 /* XXXX Currently intuiting is not compatible with ANCH_GPOS.
2075 This should be changed ASAP! */
33b8afdf 2076 if ((r->check_substr || r->check_utf8) && !(r->reganch & ROPT_ANCH_GPOS)) {
cad2e5aa 2077 r->reganch |= RE_USE_INTUIT;
33b8afdf 2078 if (SvTAIL(r->check_substr ? r->check_substr : r->check_utf8))
cad2e5aa
JH
2079 r->reganch |= RE_INTUIT_TAIL;
2080 }
a0ed51b3
LW
2081 }
2082 else {
c277df42
IZ
2083 /* Several toplevels. Best we can is to set minlen. */
2084 I32 fake;
653099ff 2085 struct regnode_charclass_class ch_class;
cb434fcc 2086 I32 last_close = 0;
c277df42
IZ
2087
2088 DEBUG_r(PerlIO_printf(Perl_debug_log, "\n"));
2089 scan = r->program + 1;
830247a4 2090 cl_init(pRExC_state, &ch_class);
653099ff 2091 data.start_class = &ch_class;
cb434fcc 2092 data.last_closep = &last_close;
e1901655 2093 minlen = study_chunk(pRExC_state, &scan, &fake, scan + RExC_size, &data, SCF_DO_STCLASS_AND|SCF_WHILEM_VISITED_POS);
33b8afdf
JH
2094 r->check_substr = r->check_utf8 = r->anchored_substr = r->anchored_utf8
2095 = r->float_substr = r->float_utf8 = Nullsv;
653099ff 2096 if (!(data.start_class->flags & ANYOF_EOS)
eb160463
GS
2097 && !cl_is_anything(data.start_class))
2098 {
830247a4 2099 I32 n = add_data(pRExC_state, 1, "f");
653099ff 2100
b81d288d 2101 New(1006, RExC_rx->data->data[n], 1,
653099ff
GS
2102 struct regnode_charclass_class);
2103 StructCopy(data.start_class,
830247a4 2104 (struct regnode_charclass_class*)RExC_rx->data->data[n],
653099ff 2105 struct regnode_charclass_class);
830247a4 2106 r->regstclass = (regnode*)RExC_rx->data->data[n];
653099ff 2107 r->reganch &= ~ROPT_SKIP; /* Used in find_byclass(). */
9c5ffd7c
JH
2108 DEBUG_r({ SV* sv = sv_newmortal();
2109 regprop(sv, (regnode*)data.start_class);
2110 PerlIO_printf(Perl_debug_log,
2111 "synthetic stclass `%s'.\n",
2112 SvPVX(sv));});
653099ff 2113 }
a0d0e21e
LW
2114 }
2115
a0d0e21e 2116 r->minlen = minlen;
b81d288d 2117 if (RExC_seen & REG_SEEN_GPOS)
c277df42 2118 r->reganch |= ROPT_GPOS_SEEN;
830247a4 2119 if (RExC_seen & REG_SEEN_LOOKBEHIND)
c277df42 2120 r->reganch |= ROPT_LOOKBEHIND_SEEN;
830247a4 2121 if (RExC_seen & REG_SEEN_EVAL)
ce862d02 2122 r->reganch |= ROPT_EVAL_SEEN;
f33976b4
DB
2123 if (RExC_seen & REG_SEEN_CANY)
2124 r->reganch |= ROPT_CANY_SEEN;
830247a4
IZ
2125 Newz(1002, r->startp, RExC_npar, I32);
2126 Newz(1002, r->endp, RExC_npar, I32);
ffc61ed2 2127 PL_regdata = r->data; /* for regprop() */
a0d0e21e
LW
2128 DEBUG_r(regdump(r));
2129 return(r);
a687059c
LW
2130}
2131
2132/*
2133 - reg - regular expression, i.e. main body or parenthesized thing
2134 *
2135 * Caller must absorb opening parenthesis.
2136 *
2137 * Combining parenthesis handling with the base level of regular expression
2138 * is a trifle forced, but the need to tie the tails of the branches to what
2139 * follows makes it hard to avoid.
2140 */
76e3520e 2141STATIC regnode *
830247a4 2142S_reg(pTHX_ RExC_state_t *pRExC_state, I32 paren, I32 *flagp)
c277df42 2143 /* paren: Parenthesized? 0=top, 1=(, inside: changed to letter. */
a687059c 2144{
c277df42
IZ
2145 register regnode *ret; /* Will be the head of the group. */
2146 register regnode *br;
2147 register regnode *lastbr;
2148 register regnode *ender = 0;
a0d0e21e 2149 register I32 parno = 0;
e2509266 2150 I32 flags, oregflags = RExC_flags, have_branch = 0, open = 0;
9d1d55b5
JP
2151
2152 /* for (?g), (?gc), and (?o) warnings; warning
2153 about (?c) will warn about (?g) -- japhy */
2154
2155 I32 wastedflags = 0x00,
2156 wasted_o = 0x01,
2157 wasted_g = 0x02,
2158 wasted_gc = 0x02 | 0x04,
2159 wasted_c = 0x04;
2160
fac92740 2161 char * parse_start = RExC_parse; /* MJD */
830247a4 2162 char *oregcomp_parse = RExC_parse;
c277df42 2163 char c;
a0d0e21e 2164
821b33a5 2165 *flagp = 0; /* Tentatively. */
a0d0e21e 2166
9d1d55b5 2167
a0d0e21e
LW
2168 /* Make an OPEN node, if parenthesized. */
2169 if (paren) {
fac92740 2170 if (*RExC_parse == '?') { /* (?...) */
2b36a5a0
JH
2171 U32 posflags = 0, negflags = 0;
2172 U32 *flagsp = &posflags;
0f5d15d6 2173 int logical = 0;
830247a4 2174 char *seqstart = RExC_parse;
ca9dfc88 2175
830247a4
IZ
2176 RExC_parse++;
2177 paren = *RExC_parse++;
c277df42 2178 ret = NULL; /* For look-ahead/behind. */
a0d0e21e 2179 switch (paren) {
fac92740 2180 case '<': /* (?<...) */
830247a4 2181 RExC_seen |= REG_SEEN_LOOKBEHIND;
b81d288d 2182 if (*RExC_parse == '!')
c277df42 2183 paren = ',';
b81d288d 2184 if (*RExC_parse != '=' && *RExC_parse != '!')
c277df42 2185 goto unknown;
830247a4 2186 RExC_parse++;
fac92740
MJD
2187 case '=': /* (?=...) */
2188 case '!': /* (?!...) */
830247a4 2189 RExC_seen_zerolen++;
fac92740
MJD
2190 case ':': /* (?:...) */
2191 case '>': /* (?>...) */
a0d0e21e 2192 break;
fac92740
MJD
2193 case '$': /* (?$...) */
2194 case '@': /* (?@...) */
8615cb43 2195 vFAIL2("Sequence (?%c...) not implemented", (int)paren);
a0d0e21e 2196 break;
fac92740 2197 case '#': /* (?#...) */
830247a4
IZ
2198 while (*RExC_parse && *RExC_parse != ')')
2199 RExC_parse++;
2200 if (*RExC_parse != ')')
c277df42 2201 FAIL("Sequence (?#... not terminated");
830247a4 2202 nextchar(pRExC_state);
a0d0e21e
LW
2203 *flagp = TRYAGAIN;
2204 return NULL;
fac92740 2205 case 'p': /* (?p...) */
9014280d 2206 if (SIZE_ONLY && ckWARN2(WARN_DEPRECATED, WARN_REGEXP))
f9373011 2207 vWARNdep(RExC_parse, "(?p{}) is deprecated - use (??{})");
8c8ad484 2208 /* FALL THROUGH*/
fac92740 2209 case '?': /* (??...) */
0f5d15d6 2210 logical = 1;
438a3801
YST
2211 if (*RExC_parse != '{')
2212 goto unknown;
830247a4 2213 paren = *RExC_parse++;
0f5d15d6 2214 /* FALL THROUGH */
fac92740 2215 case '{': /* (?{...}) */
c277df42 2216 {
c277df42
IZ
2217 I32 count = 1, n = 0;
2218 char c;
830247a4 2219 char *s = RExC_parse;
c277df42
IZ
2220 SV *sv;
2221 OP_4tree *sop, *rop;
2222
830247a4
IZ
2223 RExC_seen_zerolen++;
2224 RExC_seen |= REG_SEEN_EVAL;
2225 while (count && (c = *RExC_parse)) {
2226 if (c == '\\' && RExC_parse[1])
2227 RExC_parse++;
b81d288d 2228 else if (c == '{')
c277df42 2229 count++;
b81d288d 2230 else if (c == '}')
c277df42 2231 count--;
830247a4 2232 RExC_parse++;
c277df42 2233 }
830247a4 2234 if (*RExC_parse != ')')
b45f050a 2235 {
b81d288d 2236 RExC_parse = s;
b45f050a
JF
2237 vFAIL("Sequence (?{...}) not terminated or not {}-balanced");
2238 }
c277df42 2239 if (!SIZE_ONLY) {
f3548bdc 2240 PAD *pad;
b81d288d
AB
2241
2242 if (RExC_parse - 1 - s)
830247a4 2243 sv = newSVpvn(s, RExC_parse - 1 - s);
c277df42 2244 else
79cb57f6 2245 sv = newSVpvn("", 0);
c277df42 2246
569233ed
SB
2247 ENTER;
2248 Perl_save_re_context(aTHX);
f3548bdc 2249 rop = sv_compile_2op(sv, &sop, "re", &pad);
9b978d73
DM
2250 sop->op_private |= OPpREFCOUNTED;
2251 /* re_dup will OpREFCNT_inc */
2252 OpREFCNT_set(sop, 1);
569233ed 2253 LEAVE;
c277df42 2254
830247a4
IZ
2255 n = add_data(pRExC_state, 3, "nop");
2256 RExC_rx->data->data[n] = (void*)rop;
2257 RExC_rx->data->data[n+1] = (void*)sop;
f3548bdc 2258 RExC_rx->data->data[n+2] = (void*)pad;
c277df42 2259 SvREFCNT_dec(sv);
a0ed51b3 2260 }
e24b16f9 2261 else { /* First pass */
830247a4 2262 if (PL_reginterp_cnt < ++RExC_seen_evals
923e4eb5 2263 && IN_PERL_RUNTIME)
2cd61cdb
IZ
2264 /* No compiled RE interpolated, has runtime
2265 components ===> unsafe. */
2266 FAIL("Eval-group not allowed at runtime, use re 'eval'");
5b61d3f7 2267 if (PL_tainting && PL_tainted)
cc6b7395 2268 FAIL("Eval-group in insecure regular expression");
923e4eb5 2269 if (IN_PERL_COMPILETIME)
b5c19bd7 2270 PL_cv_has_eval = 1;
c277df42 2271 }
b5c19bd7 2272
830247a4 2273 nextchar(pRExC_state);
0f5d15d6 2274 if (logical) {
830247a4 2275 ret = reg_node(pRExC_state, LOGICAL);
0f5d15d6
IZ
2276 if (!SIZE_ONLY)
2277 ret->flags = 2;
830247a4 2278 regtail(pRExC_state, ret, reganode(pRExC_state, EVAL, n));
fac92740 2279 /* deal with the length of this later - MJD */
0f5d15d6
IZ
2280 return ret;
2281 }
ccb2c380
MP
2282 ret = reganode(pRExC_state, EVAL, n);
2283 Set_Node_Length(ret, RExC_parse - parse_start + 1);
2284 Set_Node_Offset(ret, parse_start);
2285 return ret;
c277df42 2286 }
fac92740 2287 case '(': /* (?(?{...})...) and (?(?=...)...) */
c277df42 2288 {
fac92740 2289 if (RExC_parse[0] == '?') { /* (?(?...)) */
b81d288d
AB
2290 if (RExC_parse[1] == '=' || RExC_parse[1] == '!'
2291 || RExC_parse[1] == '<'
830247a4 2292 || RExC_parse[1] == '{') { /* Lookahead or eval. */
c277df42
IZ
2293 I32 flag;
2294
830247a4 2295 ret = reg_node(pRExC_state, LOGICAL);
0f5d15d6
IZ
2296 if (!SIZE_ONLY)
2297 ret->flags = 1;
830247a4 2298 regtail(pRExC_state, ret, reg(pRExC_state, 1, &flag));
c277df42 2299 goto insert_if;
b81d288d 2300 }
a0ed51b3 2301 }
830247a4 2302 else if (RExC_parse[0] >= '1' && RExC_parse[0] <= '9' ) {
fac92740 2303 /* (?(1)...) */
830247a4 2304 parno = atoi(RExC_parse++);
c277df42 2305
830247a4
IZ
2306 while (isDIGIT(*RExC_parse))
2307 RExC_parse++;
fac92740
MJD
2308 ret = reganode(pRExC_state, GROUPP, parno);
2309
830247a4 2310 if ((c = *nextchar(pRExC_state)) != ')')
b45f050a 2311 vFAIL("Switch condition not recognized");
c277df42 2312 insert_if:
830247a4
IZ
2313 regtail(pRExC_state, ret, reganode(pRExC_state, IFTHEN, 0));
2314 br = regbranch(pRExC_state, &flags, 1);
c277df42 2315 if (br == NULL)
830247a4 2316 br = reganode(pRExC_state, LONGJMP, 0);
c277df42 2317 else
830247a4
IZ
2318 regtail(pRExC_state, br, reganode(pRExC_state, LONGJMP, 0));
2319 c = *nextchar(pRExC_state);
d1b80229
IZ
2320 if (flags&HASWIDTH)
2321 *flagp |= HASWIDTH;
c277df42 2322 if (c == '|') {
830247a4
IZ
2323 lastbr = reganode(pRExC_state, IFTHEN, 0); /* Fake one for optimizer. */
2324 regbranch(pRExC_state, &flags, 1);
2325 regtail(pRExC_state, ret, lastbr);
d1b80229
IZ
2326 if (flags&HASWIDTH)
2327 *flagp |= HASWIDTH;
830247a4 2328 c = *nextchar(pRExC_state);
a0ed51b3
LW
2329 }
2330 else
c277df42
IZ
2331 lastbr = NULL;
2332 if (c != ')')
8615cb43 2333 vFAIL("Switch (?(condition)... contains too many branches");
830247a4
IZ
2334 ender = reg_node(pRExC_state, TAIL);
2335 regtail(pRExC_state, br, ender);
c277df42 2336 if (lastbr) {
830247a4
IZ
2337 regtail(pRExC_state, lastbr, ender);
2338 regtail(pRExC_state, NEXTOPER(NEXTOPER(lastbr)), ender);
a0ed51b3
LW
2339 }
2340 else
830247a4 2341 regtail(pRExC_state, ret, ender);
c277df42 2342 return ret;
a0ed51b3
LW
2343 }
2344 else {
830247a4 2345 vFAIL2("Unknown switch condition (?(%.2s", RExC_parse);
c277df42
IZ
2346 }
2347 }
1b1626e4 2348 case 0:
830247a4 2349 RExC_parse--; /* for vFAIL to print correctly */
8615cb43 2350 vFAIL("Sequence (? incomplete");
1b1626e4 2351 break;
a0d0e21e 2352 default:
830247a4 2353 --RExC_parse;
fac92740 2354 parse_flags: /* (?i) */
830247a4 2355 while (*RExC_parse && strchr("iogcmsx", *RExC_parse)) {
9d1d55b5
JP
2356 /* (?g), (?gc) and (?o) are useless here
2357 and must be globally applied -- japhy */
2358
2359 if (*RExC_parse == 'o' || *RExC_parse == 'g') {
2360 if (SIZE_ONLY && ckWARN(WARN_REGEXP)) {
2361 I32 wflagbit = *RExC_parse == 'o' ? wasted_o : wasted_g;
2362 if (! (wastedflags & wflagbit) ) {
2363 wastedflags |= wflagbit;
2364 vWARN5(
2365 RExC_parse + 1,
2366 "Useless (%s%c) - %suse /%c modifier",
2367 flagsp == &negflags ? "?-" : "?",
2368 *RExC_parse,
2369 flagsp == &negflags ? "don't " : "",
2370 *RExC_parse
2371 );
2372 }
2373 }
2374 }
2375 else if (*RExC_parse == 'c') {
2376 if (SIZE_ONLY && ckWARN(WARN_REGEXP)) {
2377 if (! (wastedflags & wasted_c) ) {
2378 wastedflags |= wasted_gc;
2379 vWARN3(
2380 RExC_parse + 1,
2381 "Useless (%sc) - %suse /gc modifier",
2382 flagsp == &negflags ? "?-" : "?",
2383 flagsp == &negflags ? "don't " : ""
2384 );
2385 }
2386 }
2387 }
2388 else { pmflag(flagsp, *RExC_parse); }
2389
830247a4 2390 ++RExC_parse;
ca9dfc88 2391 }
830247a4 2392 if (*RExC_parse == '-') {
ca9dfc88 2393 flagsp = &negflags;
9d1d55b5 2394 wastedflags = 0; /* reset so (?g-c) warns twice */
830247a4 2395 ++RExC_parse;
ca9dfc88 2396 goto parse_flags;
48c036b1 2397 }
e2509266
JH
2398 RExC_flags |= posflags;
2399 RExC_flags &= ~negflags;
830247a4
IZ
2400 if (*RExC_parse == ':') {
2401 RExC_parse++;
ca9dfc88
IZ
2402 paren = ':';
2403 break;
2404 }
c277df42 2405 unknown:
830247a4
IZ
2406 if (*RExC_parse != ')') {
2407 RExC_parse++;
2408 vFAIL3("Sequence (%.*s...) not recognized", RExC_parse-seqstart, seqstart);
b45f050a 2409 }
830247a4 2410 nextchar(pRExC_state);
a0d0e21e
LW
2411 *flagp = TRYAGAIN;
2412 return NULL;
2413 }
2414 }
fac92740 2415 else { /* (...) */
830247a4
IZ
2416 parno = RExC_npar;
2417 RExC_npar++;
2418 ret = reganode(pRExC_state, OPEN, parno);
fac92740
MJD
2419 Set_Node_Length(ret, 1); /* MJD */
2420 Set_Node_Offset(ret, RExC_parse); /* MJD */
c277df42 2421 open = 1;
a0d0e21e 2422 }
a0ed51b3 2423 }
fac92740 2424 else /* ! paren */
a0d0e21e
LW
2425 ret = NULL;
2426
2427 /* Pick up the branches, linking them together. */
fac92740 2428 parse_start = RExC_parse; /* MJD */
830247a4 2429 br = regbranch(pRExC_state, &flags, 1);
fac92740
MJD
2430 /* branch_len = (paren != 0); */
2431
a0d0e21e
LW
2432 if (br == NULL)
2433 return(NULL);
830247a4
IZ
2434 if (*RExC_parse == '|') {
2435 if (!SIZE_ONLY && RExC_extralen) {
2436 reginsert(pRExC_state, BRANCHJ, br);
a0ed51b3 2437 }
fac92740 2438 else { /* MJD */
830247a4 2439 reginsert(pRExC_state, BRANCH, br);
fac92740
MJD
2440 Set_Node_Length(br, paren != 0);
2441 Set_Node_Offset_To_R(br-RExC_emit_start, parse_start-RExC_start);
2442 }
c277df42
IZ
2443 have_branch = 1;
2444 if (SIZE_ONLY)
830247a4 2445 RExC_extralen += 1; /* For BRANCHJ-BRANCH. */
a0ed51b3
LW
2446 }
2447 else if (paren == ':') {
c277df42
IZ
2448 *flagp |= flags&SIMPLE;
2449 }
2450 if (open) { /* Starts with OPEN. */
830247a4 2451 regtail(pRExC_state, ret, br); /* OPEN -> first. */
a0ed51b3
LW
2452 }
2453 else if (paren != '?') /* Not Conditional */
a0d0e21e 2454 ret = br;
32a0ca98 2455 *flagp |= flags & (SPSTART | HASWIDTH);
c277df42 2456 lastbr = br;
830247a4
IZ
2457 while (*RExC_parse == '|') {
2458 if (!SIZE_ONLY && RExC_extralen) {
2459 ender = reganode(pRExC_state, LONGJMP,0);
2460 regtail(pRExC_state, NEXTOPER(NEXTOPER(lastbr)), ender); /* Append to the previous. */
c277df42
IZ
2461 }
2462 if (SIZE_ONLY)
830247a4
IZ
2463 RExC_extralen += 2; /* Account for LONGJMP. */
2464 nextchar(pRExC_state);
2465 br = regbranch(pRExC_state, &flags, 0);
fac92740 2466
a687059c 2467 if (br == NULL)
a0d0e21e 2468 return(NULL);
830247a4 2469 regtail(pRExC_state, lastbr, br); /* BRANCH -> BRANCH. */
c277df42 2470 lastbr = br;
821b33a5
IZ
2471 if (flags&HASWIDTH)
2472 *flagp |= HASWIDTH;
a687059c 2473 *flagp |= flags&SPSTART;
a0d0e21e
LW
2474 }
2475
c277df42
IZ
2476 if (have_branch || paren != ':') {
2477 /* Make a closing node, and hook it on the end. */
2478 switch (paren) {
2479 case ':':
830247a4 2480 ender = reg_node(pRExC_state, TAIL);
c277df42
IZ
2481 break;
2482 case 1:
830247a4 2483 ender = reganode(pRExC_state, CLOSE, parno);
fac92740
MJD
2484 Set_Node_Offset(ender,RExC_parse+1); /* MJD */
2485 Set_Node_Length(ender,1); /* MJD */
c277df42
IZ
2486 break;
2487 case '<':
c277df42
IZ
2488 case ',':
2489 case '=':
2490 case '!':
c277df42 2491 *flagp &= ~HASWIDTH;
821b33a5
IZ
2492 /* FALL THROUGH */
2493 case '>':
830247a4 2494 ender = reg_node(pRExC_state, SUCCEED);
c277df42
IZ
2495 break;
2496 case 0:
830247a4 2497 ender = reg_node(pRExC_state, END);
c277df42
IZ
2498 break;
2499 }
830247a4 2500 regtail(pRExC_state, lastbr, ender);
a0d0e21e 2501
c277df42
IZ
2502 if (have_branch) {
2503 /* Hook the tails of the branches to the closing node. */
2504 for (br = ret; br != NULL; br = regnext(br)) {
830247a4 2505 regoptail(pRExC_state, br, ender);
c277df42
IZ
2506 }
2507 }
a0d0e21e 2508 }
c277df42
IZ
2509
2510 {
2511 char *p;
2512 static char parens[] = "=!<,>";
2513
2514 if (paren && (p = strchr(parens, paren))) {
eb160463 2515 U8 node = ((p - parens) % 2) ? UNLESSM : IFMATCH;
c277df42
IZ
2516 int flag = (p - parens) > 1;
2517
2518 if (paren == '>')
2519 node = SUSPEND, flag = 0;
830247a4 2520 reginsert(pRExC_state, node,ret);
45948336
EP
2521 Set_Node_Cur_Length(ret);
2522 Set_Node_Offset(ret, parse_start + 1);
c277df42 2523 ret->flags = flag;
830247a4 2524 regtail(pRExC_state, ret, reg_node(pRExC_state, TAIL));
c277df42 2525 }
a0d0e21e
LW
2526 }
2527
2528 /* Check for proper termination. */
ce3e6498 2529 if (paren) {
e2509266 2530 RExC_flags = oregflags;
830247a4
IZ
2531 if (RExC_parse >= RExC_end || *nextchar(pRExC_state) != ')') {
2532 RExC_parse = oregcomp_parse;
380a0633 2533 vFAIL("Unmatched (");
ce3e6498 2534 }
a0ed51b3 2535 }
830247a4
IZ
2536 else if (!paren && RExC_parse < RExC_end) {
2537 if (*RExC_parse == ')') {
2538 RExC_parse++;
380a0633 2539 vFAIL("Unmatched )");
a0ed51b3
LW
2540 }
2541 else
b45f050a 2542 FAIL("Junk on end of regexp"); /* "Can't happen". */
a0d0e21e
LW
2543 /* NOTREACHED */
2544 }
a687059c 2545
a0d0e21e 2546 return(ret);
a687059c
LW
2547}
2548
2549/*
2550 - regbranch - one alternative of an | operator
2551 *
2552 * Implements the concatenation operator.
2553 */
76e3520e 2554STATIC regnode *
830247a4 2555S_regbranch(pTHX_ RExC_state_t *pRExC_state, I32 *flagp, I32 first)
a687059c 2556{
c277df42
IZ
2557 register regnode *ret;
2558 register regnode *chain = NULL;
2559 register regnode *latest;
2560 I32 flags = 0, c = 0;
a0d0e21e 2561
b81d288d 2562 if (first)
c277df42
IZ
2563 ret = NULL;
2564 else {
b81d288d 2565 if (!SIZE_ONLY && RExC_extralen)
830247a4 2566 ret = reganode(pRExC_state, BRANCHJ,0);
fac92740 2567 else {
830247a4 2568 ret = reg_node(pRExC_state, BRANCH);
fac92740
MJD
2569 Set_Node_Length(ret, 1);
2570 }
c277df42
IZ
2571 }
2572
b81d288d 2573 if (!first && SIZE_ONLY)
830247a4 2574 RExC_extralen += 1; /* BRANCHJ */
b81d288d 2575
c277df42 2576 *flagp = WORST; /* Tentatively. */
a0d0e21e 2577
830247a4
IZ
2578 RExC_parse--;
2579 nextchar(pRExC_state);
2580 while (RExC_parse < RExC_end && *RExC_parse != '|' && *RExC_parse != ')') {
a0d0e21e 2581 flags &= ~TRYAGAIN;
830247a4 2582 latest = regpiece(pRExC_state, &flags);
a0d0e21e
LW
2583 if (latest == NULL) {
2584 if (flags & TRYAGAIN)
2585 continue;
2586 return(NULL);
a0ed51b3
LW
2587 }
2588 else if (ret == NULL)
c277df42 2589 ret = latest;
a0d0e21e 2590 *flagp |= flags&HASWIDTH;
c277df42 2591 if (chain == NULL) /* First piece. */
a0d0e21e
LW
2592 *flagp |= flags&SPSTART;
2593 else {
830247a4
IZ
2594 RExC_naughty++;
2595 regtail(pRExC_state, chain, latest);
a687059c 2596 }
a0d0e21e 2597 chain = latest;
c277df42
IZ
2598 c++;
2599 }
2600 if (chain == NULL) { /* Loop ran zero times. */
830247a4 2601 chain = reg_node(pRExC_state, NOTHING);
c277df42
IZ
2602 if (ret == NULL)
2603 ret = chain;
2604 }
2605 if (c == 1) {
2606 *flagp |= flags&SIMPLE;
a0d0e21e 2607 }
a687059c 2608
a0d0e21e 2609 return(ret);
a687059c
LW
2610}
2611
2612/*
2613 - regpiece - something followed by possible [*+?]
2614 *
2615 * Note that the branching code sequences used for ? and the general cases
2616 * of * and + are somewhat optimized: they use the same NOTHING node as
2617 * both the endmarker for their branch list and the body of the last branch.
2618 * It might seem that this node could be dispensed with entirely, but the
2619 * endmarker role is not redundant.
2620 */
76e3520e 2621STATIC regnode *
830247a4 2622S_regpiece(pTHX_ RExC_state_t *pRExC_state, I32 *flagp)
a687059c 2623{
c277df42 2624 register regnode *ret;
a0d0e21e
LW
2625 register char op;
2626 register char *next;
2627 I32 flags;
830247a4 2628 char *origparse = RExC_parse;
a0d0e21e
LW
2629 char *maxpos;
2630 I32 min;
c277df42 2631 I32 max = REG_INFTY;
fac92740 2632 char *parse_start;
a0d0e21e 2633
830247a4 2634 ret = regatom(pRExC_state, &flags);
a0d0e21e
LW
2635 if (ret == NULL) {
2636 if (flags & TRYAGAIN)
2637 *flagp |= TRYAGAIN;
2638 return(NULL);
2639 }
2640
830247a4 2641 op = *RExC_parse;
a0d0e21e 2642
830247a4 2643 if (op == '{' && regcurly(RExC_parse)) {
fac92740 2644 parse_start = RExC_parse; /* MJD */
830247a4 2645 next = RExC_parse + 1;
a0d0e21e
LW
2646 maxpos = Nullch;
2647 while (isDIGIT(*next) || *next == ',') {
2648 if (*next == ',') {
2649 if (maxpos)
2650 break;
2651 else
2652 maxpos = next;
a687059c 2653 }
a0d0e21e
LW
2654 next++;
2655 }
2656 if (*next == '}') { /* got one */
2657 if (!maxpos)
2658 maxpos = next;
830247a4
IZ
2659 RExC_parse++;
2660 min = atoi(RExC_parse);
a0d0e21e
LW
2661 if (*maxpos == ',')
2662 maxpos++;
2663 else
830247a4 2664 maxpos = RExC_parse;
a0d0e21e
LW
2665 max = atoi(maxpos);
2666 if (!max && *maxpos != '0')
c277df42
IZ
2667 max = REG_INFTY; /* meaning "infinity" */
2668 else if (max >= REG_INFTY)
8615cb43 2669 vFAIL2("Quantifier in {,} bigger than %d", REG_INFTY - 1);
830247a4
IZ
2670 RExC_parse = next;
2671 nextchar(pRExC_state);
a0d0e21e
LW
2672
2673 do_curly:
2674 if ((flags&SIMPLE)) {
830247a4
IZ
2675 RExC_naughty += 2 + RExC_naughty / 2;
2676 reginsert(pRExC_state, CURLY, ret);
fac92740
MJD
2677 Set_Node_Offset(ret, parse_start+1); /* MJD */
2678 Set_Node_Cur_Length(ret);
a0d0e21e
LW
2679 }
2680 else {
830247a4 2681 regnode *w = reg_node(pRExC_state, WHILEM);
2c2d71f5
JH
2682
2683 w->flags = 0;
830247a4
IZ
2684 regtail(pRExC_state, ret, w);
2685 if (!SIZE_ONLY && RExC_extralen) {
2686 reginsert(pRExC_state, LONGJMP,ret);
2687 reginsert(pRExC_state, NOTHING,ret);
c277df42
IZ
2688 NEXT_OFF(ret) = 3; /* Go over LONGJMP. */
2689 }
830247a4 2690 reginsert(pRExC_state, CURLYX,ret);
fac92740
MJD
2691 /* MJD hk */
2692 Set_Node_Offset(ret, parse_start+1);
2693 Set_Node_Length(ret,
2694 op == '{' ? (RExC_parse - parse_start) : 1);
2695
830247a4 2696 if (!SIZE_ONLY && RExC_extralen)
c277df42 2697 NEXT_OFF(ret) = 3; /* Go over NOTHING to LONGJMP. */
830247a4 2698 regtail(pRExC_state, ret, reg_node(pRExC_state, NOTHING));
c277df42 2699 if (SIZE_ONLY)
830247a4
IZ
2700 RExC_whilem_seen++, RExC_extralen += 3;
2701 RExC_naughty += 4 + RExC_naughty; /* compound interest */
a0d0e21e 2702 }
c277df42 2703 ret->flags = 0;
a0d0e21e
LW
2704
2705 if (min > 0)
821b33a5
IZ
2706 *flagp = WORST;
2707 if (max > 0)
2708 *flagp |= HASWIDTH;
a0d0e21e 2709 if (max && max < min)
8615cb43 2710 vFAIL("Can't do {n,m} with n > m");
c277df42 2711 if (!SIZE_ONLY) {
eb160463
GS
2712 ARG1_SET(ret, (U16)min);
2713 ARG2_SET(ret, (U16)max);
a687059c 2714 }
a687059c 2715
a0d0e21e 2716 goto nest_check;
a687059c 2717 }
a0d0e21e 2718 }
a687059c 2719
a0d0e21e
LW
2720 if (!ISMULT1(op)) {
2721 *flagp = flags;
a687059c 2722 return(ret);
a0d0e21e 2723 }
bb20fd44 2724
c277df42 2725#if 0 /* Now runtime fix should be reliable. */
b45f050a
JF
2726
2727 /* if this is reinstated, don't forget to put this back into perldiag:
2728
2729 =item Regexp *+ operand could be empty at {#} in regex m/%s/
2730
2731 (F) The part of the regexp subject to either the * or + quantifier
2732 could match an empty string. The {#} shows in the regular
2733 expression about where the problem was discovered.
2734
2735 */
2736
bb20fd44 2737 if (!(flags&HASWIDTH) && op != '?')
b45f050a 2738 vFAIL("Regexp *+ operand could be empty");
b81d288d 2739#endif
bb20fd44 2740
fac92740 2741 parse_start = RExC_parse;
830247a4 2742 nextchar(pRExC_state);
a0d0e21e 2743
821b33a5 2744 *flagp = (op != '+') ? (WORST|SPSTART|HASWIDTH) : (WORST|HASWIDTH);
a0d0e21e
LW
2745
2746 if (op == '*' && (flags&SIMPLE)) {
830247a4 2747 reginsert(pRExC_state, STAR, ret);
c277df42 2748 ret->flags = 0;
830247a4 2749 RExC_naughty += 4;
a0d0e21e
LW
2750 }
2751 else if (op == '*') {
2752 min = 0;
2753 goto do_curly;
a0ed51b3
LW
2754 }
2755 else if (op == '+' && (flags&SIMPLE)) {
830247a4 2756 reginsert(pRExC_state, PLUS, ret);
c277df42 2757 ret->flags = 0;
830247a4 2758 RExC_naughty += 3;
a0d0e21e
LW
2759 }
2760 else if (op == '+') {
2761 min = 1;
2762 goto do_curly;
a0ed51b3
LW
2763 }
2764 else if (op == '?') {
a0d0e21e
LW
2765 min = 0; max = 1;
2766 goto do_curly;
2767 }
2768 nest_check:
e476b1b5 2769 if (ckWARN(WARN_REGEXP) && !SIZE_ONLY && !(flags&HASWIDTH) && max > REG_INFTY/3) {
830247a4 2770 vWARN3(RExC_parse,
b45f050a 2771 "%.*s matches null string many times",
830247a4 2772 RExC_parse - origparse,
b45f050a 2773 origparse);
a0d0e21e
LW
2774 }
2775
830247a4
IZ
2776 if (*RExC_parse == '?') {
2777 nextchar(pRExC_state);
2778 reginsert(pRExC_state, MINMOD, ret);
2779 regtail(pRExC_state, ret, ret + NODE_STEP_REGNODE);
a0d0e21e 2780 }
830247a4
IZ
2781 if (ISMULT2(RExC_parse)) {
2782 RExC_parse++;
b45f050a
JF
2783 vFAIL("Nested quantifiers");
2784 }
a0d0e21e
LW
2785
2786 return(ret);
a687059c
LW
2787}
2788
2789/*
2790 - regatom - the lowest level
2791 *
2792 * Optimization: gobbles an entire sequence of ordinary characters so that
2793 * it can turn them into a single node, which is smaller to store and
2794 * faster to run. Backslashed characters are exceptions, each becoming a
2795 * separate node; the code is simpler that way and it's not worth fixing.
2796 *
b45f050a 2797 * [Yes, it is worth fixing, some scripts can run twice the speed.] */
76e3520e 2798STATIC regnode *
830247a4 2799S_regatom(pTHX_ RExC_state_t *pRExC_state, I32 *flagp)
a687059c 2800{
c277df42 2801 register regnode *ret = 0;
a0d0e21e 2802 I32 flags;
45948336 2803 char *parse_start = RExC_parse;
a0d0e21e
LW
2804
2805 *flagp = WORST; /* Tentatively. */
2806
2807tryagain:
830247a4 2808 switch (*RExC_parse) {
a0d0e21e 2809 case '^':
830247a4
IZ
2810 RExC_seen_zerolen++;
2811 nextchar(pRExC_state);
e2509266 2812 if (RExC_flags & PMf_MULTILINE)
830247a4 2813 ret = reg_node(pRExC_state, MBOL);
e2509266 2814 else if (RExC_flags & PMf_SINGLELINE)
830247a4 2815 ret = reg_node(pRExC_state, SBOL);
a0d0e21e 2816 else
830247a4 2817 ret = reg_node(pRExC_state, BOL);
fac92740 2818 Set_Node_Length(ret, 1); /* MJD */
a0d0e21e
LW
2819 break;
2820 case '$':
830247a4 2821 nextchar(pRExC_state);
b81d288d 2822 if (*RExC_parse)
830247a4 2823 RExC_seen_zerolen++;
e2509266 2824 if (RExC_flags & PMf_MULTILINE)
830247a4 2825 ret = reg_node(pRExC_state, MEOL);
e2509266 2826 else if (RExC_flags & PMf_SINGLELINE)
830247a4 2827 ret = reg_node(pRExC_state, SEOL);
a0d0e21e 2828 else
830247a4 2829 ret = reg_node(pRExC_state, EOL);
fac92740 2830 Set_Node_Length(ret, 1); /* MJD */
a0d0e21e
LW
2831 break;
2832 case '.':
830247a4 2833 nextchar(pRExC_state);
e2509266 2834 if (RExC_flags & PMf_SINGLELINE)
ffc61ed2
JH
2835 ret = reg_node(pRExC_state, SANY);
2836 else
2837 ret = reg_node(pRExC_state, REG_ANY);
2838 *flagp |= HASWIDTH|SIMPLE;
830247a4 2839 RExC_naughty++;
fac92740 2840 Set_Node_Length(ret, 1); /* MJD */
a0d0e21e
LW
2841 break;
2842 case '[':
b45f050a 2843 {
830247a4 2844 char *oregcomp_parse = ++RExC_parse;
ffc61ed2 2845 ret = regclass(pRExC_state);
830247a4
IZ
2846 if (*RExC_parse != ']') {
2847 RExC_parse = oregcomp_parse;
b45f050a
JF
2848 vFAIL("Unmatched [");
2849 }
830247a4 2850 nextchar(pRExC_state);
a0d0e21e 2851 *flagp |= HASWIDTH|SIMPLE;
fac92740 2852 Set_Node_Length(ret, RExC_parse - oregcomp_parse + 1); /* MJD */
a0d0e21e 2853 break;
b45f050a 2854 }
a0d0e21e 2855 case '(':
830247a4
IZ
2856 nextchar(pRExC_state);
2857 ret = reg(pRExC_state, 1, &flags);
a0d0e21e 2858 if (ret == NULL) {
bf93d4cc 2859 if (flags & TRYAGAIN) {
830247a4 2860 if (RExC_parse == RExC_end) {
bf93d4cc
GS
2861 /* Make parent create an empty node if needed. */
2862 *flagp |= TRYAGAIN;
2863 return(NULL);
2864 }
a0d0e21e 2865 goto tryagain;
bf93d4cc 2866 }
a0d0e21e
LW
2867 return(NULL);
2868 }
c277df42 2869 *flagp |= flags&(HASWIDTH|SPSTART|SIMPLE);
a0d0e21e
LW
2870 break;
2871 case '|':
2872 case ')':
2873 if (flags & TRYAGAIN) {
2874 *flagp |= TRYAGAIN;
2875 return NULL;
2876 }
b45f050a 2877 vFAIL("Internal urp");
a0d0e21e
LW
2878 /* Supposed to be caught earlier. */
2879 break;
85afd4ae 2880 case '{':
830247a4
IZ
2881 if (!regcurly(RExC_parse)) {
2882 RExC_parse++;
85afd4ae
CS
2883 goto defchar;
2884 }
2885 /* FALL THROUGH */
a0d0e21e
LW
2886 case '?':
2887 case '+':
2888 case '*':
830247a4 2889 RExC_parse++;
b45f050a 2890 vFAIL("Quantifier follows nothing");
a0d0e21e
LW
2891 break;
2892 case '\\':
830247a4 2893 switch (*++RExC_parse) {
a0d0e21e 2894 case 'A':
830247a4
IZ
2895 RExC_seen_zerolen++;
2896 ret = reg_node(pRExC_state, SBOL);
a0d0e21e 2897 *flagp |= SIMPLE;
830247a4 2898 nextchar(pRExC_state);
fac92740 2899 Set_Node_Length(ret, 2); /* MJD */
a0d0e21e
LW
2900 break;
2901 case 'G':
830247a4
IZ
2902 ret = reg_node(pRExC_state, GPOS);
2903 RExC_seen |= REG_SEEN_GPOS;
a0d0e21e 2904 *flagp |= SIMPLE;
830247a4 2905 nextchar(pRExC_state);
fac92740 2906 Set_Node_Length(ret, 2); /* MJD */
a0d0e21e
LW
2907 break;
2908 case 'Z':
830247a4 2909 ret = reg_node(pRExC_state, SEOL);
a0d0e21e 2910 *flagp |= SIMPLE;
a1917ab9 2911 RExC_seen_zerolen++; /* Do not optimize RE away */
830247a4 2912 nextchar(pRExC_state);
a0d0e21e 2913 break;
b85d18e9 2914 case 'z':
830247a4 2915 ret = reg_node(pRExC_state, EOS);
b85d18e9 2916 *flagp |= SIMPLE;
830247a4
IZ
2917 RExC_seen_zerolen++; /* Do not optimize RE away */
2918 nextchar(pRExC_state);
fac92740 2919 Set_Node_Length(ret, 2); /* MJD */
b85d18e9 2920 break;
4a2d328f 2921 case 'C':
f33976b4
DB
2922 ret = reg_node(pRExC_state, CANY);
2923 RExC_seen |= REG_SEEN_CANY;
a0ed51b3 2924 *flagp |= HASWIDTH|SIMPLE;
830247a4 2925 nextchar(pRExC_state);
fac92740 2926 Set_Node_Length(ret, 2); /* MJD */
a0ed51b3
LW
2927 break;
2928 case 'X':
830247a4 2929 ret = reg_node(pRExC_state, CLUMP);
a0ed51b3 2930 *flagp |= HASWIDTH;
830247a4 2931 nextchar(pRExC_state);
fac92740 2932 Set_Node_Length(ret, 2); /* MJD */
a0ed51b3 2933 break;
a0d0e21e 2934 case 'w':
eb160463 2935 ret = reg_node(pRExC_state, (U8)(LOC ? ALNUML : ALNUM));
a0d0e21e 2936 *flagp |= HASWIDTH|SIMPLE;
830247a4 2937 nextchar(pRExC_state);
fac92740 2938 Set_Node_Length(ret, 2); /* MJD */
a0d0e21e
LW
2939 break;
2940 case 'W':
eb160463 2941 ret = reg_node(pRExC_state, (U8)(LOC ? NALNUML : NALNUM));
a0d0e21e 2942 *flagp |= HASWIDTH|SIMPLE;
830247a4 2943 nextchar(pRExC_state);
fac92740 2944 Set_Node_Length(ret, 2); /* MJD */
a0d0e21e
LW
2945 break;
2946 case 'b':
830247a4
IZ
2947 RExC_seen_zerolen++;
2948 RExC_seen |= REG_SEEN_LOOKBEHIND;
eb160463 2949 ret = reg_node(pRExC_state, (U8)(LOC ? BOUNDL : BOUND));
a0d0e21e 2950 *flagp |= SIMPLE;
830247a4 2951 nextchar(pRExC_state);
fac92740 2952 Set_Node_Length(ret, 2); /* MJD */
a0d0e21e
LW
2953 break;
2954 case 'B':
830247a4
IZ
2955 RExC_seen_zerolen++;
2956 RExC_seen |= REG_SEEN_LOOKBEHIND;
eb160463 2957 ret = reg_node(pRExC_state, (U8)(LOC ? NBOUNDL : NBOUND));
a0d0e21e 2958 *flagp |= SIMPLE;
830247a4 2959 nextchar(pRExC_state);
fac92740 2960 Set_Node_Length(ret, 2); /* MJD */
a0d0e21e
LW
2961 break;
2962 case 's':
eb160463 2963 ret = reg_node(pRExC_state, (U8)(LOC ? SPACEL : SPACE));
a0d0e21e 2964 *flagp |= HASWIDTH|SIMPLE;
830247a4 2965 nextchar(pRExC_state);
fac92740 2966 Set_Node_Length(ret, 2); /* MJD */
a0d0e21e
LW
2967 break;
2968 case 'S':
eb160463 2969 ret = reg_node(pRExC_state, (U8)(LOC ? NSPACEL : NSPACE));
a0d0e21e 2970 *flagp |= HASWIDTH|SIMPLE;
830247a4 2971 nextchar(pRExC_state);
fac92740 2972 Set_Node_Length(ret, 2); /* MJD */
a0d0e21e
LW
2973 break;
2974 case 'd':
ffc61ed2 2975 ret = reg_node(pRExC_state, DIGIT);
a0d0e21e 2976 *flagp |= HASWIDTH|SIMPLE;
830247a4 2977 nextchar(pRExC_state);
fac92740 2978 Set_Node_Length(ret, 2); /* MJD */
a0d0e21e
LW
2979 break;
2980 case 'D':
ffc61ed2 2981 ret = reg_node(pRExC_state, NDIGIT);
a0d0e21e 2982 *flagp |= HASWIDTH|SIMPLE;
830247a4 2983 nextchar(pRExC_state);
fac92740 2984 Set_Node_Length(ret, 2); /* MJD */
a0d0e21e 2985 break;
a14b48bc
LW
2986 case 'p':
2987 case 'P':
3568d838 2988 {
830247a4 2989 char* oldregxend = RExC_end;
ccb2c380 2990 char* parse_start = RExC_parse - 2;
a14b48bc 2991
830247a4 2992 if (RExC_parse[1] == '{') {
3568d838 2993 /* a lovely hack--pretend we saw [\pX] instead */
830247a4
IZ
2994 RExC_end = strchr(RExC_parse, '}');
2995 if (!RExC_end) {
0da60cf5 2996 U8 c = (U8)*RExC_parse;
830247a4
IZ
2997 RExC_parse += 2;
2998 RExC_end = oldregxend;
0da60cf5 2999 vFAIL2("Missing right brace on \\%c{}", c);
b45f050a 3000 }
830247a4 3001 RExC_end++;
a14b48bc 3002 }
af6f566e 3003 else {
830247a4 3004 RExC_end = RExC_parse + 2;
af6f566e
HS
3005 if (RExC_end > oldregxend)
3006 RExC_end = oldregxend;
3007 }
830247a4 3008 RExC_parse--;
a14b48bc 3009
ffc61ed2 3010 ret = regclass(pRExC_state);
a14b48bc 3011
830247a4
IZ
3012 RExC_end = oldregxend;
3013 RExC_parse--;
ccb2c380
MP
3014
3015 Set_Node_Offset(ret, parse_start + 2);
3016 Set_Node_Cur_Length(ret);
830247a4 3017 nextchar(pRExC_state);
a14b48bc
LW
3018 *flagp |= HASWIDTH|SIMPLE;
3019 }
3020 break;
a0d0e21e
LW
3021 case 'n':
3022 case 'r':
3023 case 't':
3024 case 'f':
3025 case 'e':
3026 case 'a':
3027 case 'x':
3028 case 'c':
3029 case '0':
3030 goto defchar;
3031 case '1': case '2': case '3': case '4':
3032 case '5': case '6': case '7': case '8': case '9':
3033 {
830247a4 3034 I32 num = atoi(RExC_parse);
a0d0e21e 3035
830247a4 3036 if (num > 9 && num >= RExC_npar)
a0d0e21e
LW
3037 goto defchar;
3038 else {
fac92740 3039 char * parse_start = RExC_parse - 1; /* MJD */
830247a4
IZ
3040 while (isDIGIT(*RExC_parse))
3041 RExC_parse++;
b45f050a 3042
eb160463 3043 if (!SIZE_ONLY && num > (I32)RExC_rx->nparens)
9baa0206 3044 vFAIL("Reference to nonexistent group");
830247a4 3045 RExC_sawback = 1;
eb160463
GS
3046 ret = reganode(pRExC_state,
3047 (U8)(FOLD ? (LOC ? REFFL : REFF) : REF),
3048 num);
a0d0e21e 3049 *flagp |= HASWIDTH;
fac92740
MJD
3050
3051 /* override incorrect value set in reganode MJD */
3052 Set_Node_Offset(ret, parse_start+1);
3053 Set_Node_Cur_Length(ret); /* MJD */
830247a4
IZ
3054 RExC_parse--;
3055 nextchar(pRExC_state);
a0d0e21e
LW
3056 }
3057 }
3058 break;
3059 case '\0':
830247a4 3060 if (RExC_parse >= RExC_end)
b45f050a 3061 FAIL("Trailing \\");
a0d0e21e
LW
3062 /* FALL THROUGH */
3063 default:
c9f97d15
IZ
3064 /* Do not generate `unrecognized' warnings here, we fall
3065 back into the quick-grab loop below */
45948336 3066 parse_start--;
a0d0e21e
LW
3067 goto defchar;
3068 }
3069 break;
4633a7c4
LW
3070
3071 case '#':
e2509266 3072 if (RExC_flags & PMf_EXTENDED) {
830247a4
IZ
3073 while (RExC_parse < RExC_end && *RExC_parse != '\n') RExC_parse++;
3074 if (RExC_parse < RExC_end)
4633a7c4
LW
3075 goto tryagain;
3076 }
3077 /* FALL THROUGH */
3078
a0d0e21e 3079 default: {
ba210ebe 3080 register STRLEN len;
58ae7d3f 3081 register UV ender;
a0d0e21e 3082 register char *p;
c277df42 3083 char *oldp, *s;
ba210ebe 3084 STRLEN numlen;
80aecb99 3085 STRLEN foldlen;
60a8b682 3086 U8 tmpbuf[UTF8_MAXLEN_FOLD+1], *foldbuf;
f06dbbb7
JH
3087
3088 parse_start = RExC_parse - 1;
a0d0e21e 3089
830247a4 3090 RExC_parse++;
a0d0e21e
LW
3091
3092 defchar:
58ae7d3f 3093 ender = 0;
eb160463
GS
3094 ret = reg_node(pRExC_state,
3095 (U8)(FOLD ? (LOC ? EXACTFL : EXACTF) : EXACT));
cd439c50 3096 s = STRING(ret);
830247a4
IZ
3097 for (len = 0, p = RExC_parse - 1;
3098 len < 127 && p < RExC_end;
a0d0e21e
LW
3099 len++)
3100 {
3101 oldp = p;
5b5a24f7 3102
e2509266 3103 if (RExC_flags & PMf_EXTENDED)
830247a4 3104 p = regwhite(p, RExC_end);
a0d0e21e
LW
3105 switch (*p) {
3106 case '^':
3107 case '$':
3108 case '.':
3109 case '[':
3110 case '(':
3111 case ')':
3112 case '|':
3113 goto loopdone;
3114 case '\\':
3115 switch (*++p) {
3116 case 'A':
1ed8eac0
JF
3117 case 'C':
3118 case 'X':
a0d0e21e
LW
3119 case 'G':
3120 case 'Z':
b85d18e9 3121 case 'z':
a0d0e21e
LW
3122 case 'w':
3123 case 'W':
3124 case 'b':
3125 case 'B':
3126 case 's':
3127 case 'S':
3128 case 'd':
3129 case 'D':
a14b48bc
LW
3130 case 'p':
3131 case 'P':
a0d0e21e
LW
3132 --p;
3133 goto loopdone;
3134 case 'n':
3135 ender = '\n';
3136 p++;
a687059c 3137 break;
a0d0e21e
LW
3138 case 'r':
3139 ender = '\r';
3140 p++;
a687059c 3141 break;
a0d0e21e
LW
3142 case 't':
3143 ender = '\t';
3144 p++;
a687059c 3145 break;
a0d0e21e
LW
3146 case 'f':
3147 ender = '\f';
3148 p++;
a687059c 3149 break;
a0d0e21e 3150 case 'e':
c7f1f016 3151 ender = ASCII_TO_NATIVE('\033');
a0d0e21e 3152 p++;
a687059c 3153 break;
a0d0e21e 3154 case 'a':
c7f1f016 3155 ender = ASCII_TO_NATIVE('\007');
a0d0e21e 3156 p++;
a687059c 3157 break;
a0d0e21e 3158 case 'x':
a0ed51b3
LW
3159 if (*++p == '{') {
3160 char* e = strchr(p, '}');
b81d288d 3161
b45f050a 3162 if (!e) {
830247a4 3163 RExC_parse = p + 1;
b45f050a
JF
3164 vFAIL("Missing right brace on \\x{}");
3165 }
de5f0749 3166 else {
a4c04bdc
NC
3167 I32 flags = PERL_SCAN_ALLOW_UNDERSCORES
3168 | PERL_SCAN_DISALLOW_PREFIX;
53305cf1
NC
3169 numlen = e - p - 1;
3170 ender = grok_hex(p + 1, &numlen, &flags, NULL);
aaa80028
JH
3171 if (ender > 0xff)
3172 RExC_utf8 = 1;
a0ed51b3
LW
3173 p = e + 1;
3174 }
a0ed51b3
LW
3175 }
3176 else {
a4c04bdc 3177 I32 flags = PERL_SCAN_DISALLOW_PREFIX;
53305cf1
NC
3178 numlen = 2;
3179 ender = grok_hex(p, &numlen, &flags, NULL);
a0ed51b3
LW
3180 p += numlen;
3181 }
a687059c 3182 break;
a0d0e21e
LW
3183 case 'c':
3184 p++;
bbce6d69 3185 ender = UCHARAT(p++);
3186 ender = toCTRL(ender);
a687059c 3187 break;
a0d0e21e
LW
3188 case '0': case '1': case '2': case '3':case '4':
3189 case '5': case '6': case '7': case '8':case '9':
3190 if (*p == '0' ||
830247a4 3191 (isDIGIT(p[1]) && atoi(p) >= RExC_npar) ) {
53305cf1
NC
3192 I32 flags = 0;
3193 numlen = 3;
3194 ender = grok_oct(p, &numlen, &flags, NULL);
a0d0e21e
LW
3195 p += numlen;
3196 }
3197 else {
3198 --p;
3199 goto loopdone;
a687059c
LW
3200 }
3201 break;
a0d0e21e 3202 case '\0':
830247a4 3203 if (p >= RExC_end)
b45f050a 3204 FAIL("Trailing \\");
a687059c 3205 /* FALL THROUGH */
a0d0e21e 3206 default:
e476b1b5 3207 if (!SIZE_ONLY && ckWARN(WARN_REGEXP) && isALPHA(*p))
4193bef7 3208 vWARN2(p + 1, "Unrecognized escape \\%c passed through", UCHARAT(p));
a0ed51b3 3209 goto normal_default;
a0d0e21e
LW
3210 }
3211 break;
a687059c 3212 default:
a0ed51b3 3213 normal_default:
fd400ab9 3214 if (UTF8_IS_START(*p) && UTF) {
5e12f4fb 3215 ender = utf8n_to_uvchr((U8*)p, RExC_end - p,
ba210ebe 3216 &numlen, 0);
a0ed51b3
LW
3217 p += numlen;
3218 }
3219 else
3220 ender = *p++;
a0d0e21e 3221 break;
a687059c 3222 }
e2509266 3223 if (RExC_flags & PMf_EXTENDED)
830247a4 3224 p = regwhite(p, RExC_end);
60a8b682
JH
3225 if (UTF && FOLD) {
3226 /* Prime the casefolded buffer. */
ac7e0132 3227 ender = toFOLD_uni(ender, tmpbuf, &foldlen);
60a8b682 3228 }
a0d0e21e
LW
3229 if (ISMULT2(p)) { /* Back off on ?+*. */
3230 if (len)
3231 p = oldp;
16ea2a2e 3232 else if (UTF) {
0ebc6274
JH
3233 STRLEN unilen;
3234
80aecb99 3235 if (FOLD) {
60a8b682 3236 /* Emit all the Unicode characters. */
80aecb99
JH
3237 for (foldbuf = tmpbuf;
3238 foldlen;
3239 foldlen -= numlen) {
3240 ender = utf8_to_uvchr(foldbuf, &numlen);
9dc45d57 3241 if (numlen > 0) {
0ebc6274
JH
3242 reguni(pRExC_state, ender, s, &unilen);
3243 s += unilen;
3244 len += unilen;
3245 /* In EBCDIC the numlen
3246 * and unilen can differ. */
9dc45d57 3247 foldbuf += numlen;
47654450
JH
3248 if (numlen >= foldlen)
3249 break;
9dc45d57
JH
3250 }
3251 else
3252 break; /* "Can't happen." */
80aecb99
JH
3253 }
3254 }
3255 else {
0ebc6274 3256 reguni(pRExC_state, ender, s, &unilen);
9ede7db1 3257 if (unilen > 0) {
0ebc6274
JH
3258 s += unilen;
3259 len += unilen;
9dc45d57 3260 }
80aecb99 3261 }
a0ed51b3 3262 }
a0d0e21e
LW
3263 else {
3264 len++;
eb160463 3265 REGC((char)ender, s++);
a0d0e21e
LW
3266 }
3267 break;
a687059c 3268 }
16ea2a2e 3269 if (UTF) {
0ebc6274
JH
3270 STRLEN unilen;
3271
80aecb99 3272 if (FOLD) {
60a8b682 3273 /* Emit all the Unicode characters. */
80aecb99
JH
3274 for (foldbuf = tmpbuf;
3275 foldlen;
3276 foldlen -= numlen) {
3277 ender = utf8_to_uvchr(foldbuf, &numlen);
9dc45d57 3278 if (numlen > 0) {
0ebc6274
JH
3279 reguni(pRExC_state, ender, s, &unilen);
3280 len += unilen;
3281 s += unilen;
3282 /* In EBCDIC the numlen
3283 * and unilen can differ. */
9dc45d57 3284 foldbuf += numlen;
47654450
JH
3285 if (numlen >= foldlen)
3286 break;
9dc45d57
JH
3287 }
3288 else
3289 break;
80aecb99
JH
3290 }
3291 }
3292 else {
0ebc6274 3293 reguni(pRExC_state, ender, s, &unilen);
9ede7db1 3294 if (unilen > 0) {
0ebc6274
JH
3295 s += unilen;
3296 len += unilen;
9dc45d57 3297 }
80aecb99
JH
3298 }
3299 len--;
a0ed51b3
LW
3300 }
3301 else
eb160463 3302 REGC((char)ender, s++);
a0d0e21e
LW
3303 }
3304 loopdone:
830247a4 3305 RExC_parse = p - 1;
fac92740 3306 Set_Node_Cur_Length(ret); /* MJD */
830247a4 3307 nextchar(pRExC_state);
793db0cb
JH
3308 {
3309 /* len is STRLEN which is unsigned, need to copy to signed */
3310 IV iv = len;
3311 if (iv < 0)
3312 vFAIL("Internal disaster");
3313 }
a0d0e21e
LW
3314 if (len > 0)
3315 *flagp |= HASWIDTH;
090f7165 3316 if (len == 1 && UNI_IS_INVARIANT(ender))
a0d0e21e 3317 *flagp |= SIMPLE;
c277df42 3318 if (!SIZE_ONLY)
cd439c50
IZ
3319 STR_LEN(ret) = len;
3320 if (SIZE_ONLY)
830247a4 3321 RExC_size += STR_SZ(len);
cd439c50 3322 else
830247a4 3323 RExC_emit += STR_SZ(len);
a687059c 3324 }
a0d0e21e
LW
3325 break;
3326 }
a687059c 3327
60a8b682
JH
3328 /* If the encoding pragma is in effect recode the text of
3329 * any EXACT-kind nodes. */
22c54be3 3330 if (PL_encoding && PL_regkind[(U8)OP(ret)] == EXACT) {
d0063567
DK
3331 STRLEN oldlen = STR_LEN(ret);
3332 SV *sv = sv_2mortal(newSVpvn(STRING(ret), oldlen));
3333
3334 if (RExC_utf8)
3335 SvUTF8_on(sv);
3336 if (sv_utf8_downgrade(sv, TRUE)) {
3337 char *s = sv_recode_to_utf8(sv, PL_encoding);
3338 STRLEN newlen = SvCUR(sv);
3339
3340 if (SvUTF8(sv))
3341 RExC_utf8 = 1;
3342 if (!SIZE_ONLY) {
3343 DEBUG_r(PerlIO_printf(Perl_debug_log, "recode %*s to %*s\n",
3344 (int)oldlen, STRING(ret),
3345 (int)newlen, s));
3346 Copy(s, STRING(ret), newlen, char);
3347 STR_LEN(ret) += newlen - oldlen;
3348 RExC_emit += STR_SZ(newlen) - STR_SZ(oldlen);
3349 } else
3350 RExC_size += STR_SZ(newlen) - STR_SZ(oldlen);
3351 }
a72c7584
JH
3352 }
3353
a0d0e21e 3354 return(ret);
a687059c
LW
3355}
3356
873ef191 3357STATIC char *
cea2e8a9 3358S_regwhite(pTHX_ char *p, char *e)
5b5a24f7
CS
3359{
3360 while (p < e) {
3361 if (isSPACE(*p))
3362 ++p;
3363 else if (*p == '#') {
3364 do {
3365 p++;
3366 } while (p < e && *p != '\n');
3367 }
3368 else
3369 break;
3370 }
3371 return p;
3372}
3373
b8c5462f
JH
3374/* Parse POSIX character classes: [[:foo:]], [[=foo=]], [[.foo.]].
3375 Character classes ([:foo:]) can also be negated ([:^foo:]).
3376 Returns a named class id (ANYOF_XXX) if successful, -1 otherwise.
3377 Equivalence classes ([=foo=]) and composites ([.foo.]) are parsed,
beeb77fc 3378 but trigger failures because they are currently unimplemented. */
9a86a77b
JH
3379
3380#define POSIXCC_DONE(c) ((c) == ':')
3381#define POSIXCC_NOTYET(c) ((c) == '=' || (c) == '.')
3382#define POSIXCC(c) (POSIXCC_DONE(c) || POSIXCC_NOTYET(c))
3383
b8c5462f 3384STATIC I32
830247a4 3385S_regpposixcc(pTHX_ RExC_state_t *pRExC_state, I32 value)
620e46c5
JH
3386{
3387 char *posixcc = 0;
936ed897 3388 I32 namedclass = OOB_NAMEDCLASS;
620e46c5 3389
830247a4 3390 if (value == '[' && RExC_parse + 1 < RExC_end &&
620e46c5 3391 /* I smell either [: or [= or [. -- POSIX has been here, right? */
9a86a77b
JH
3392 POSIXCC(UCHARAT(RExC_parse))) {
3393 char c = UCHARAT(RExC_parse);
830247a4 3394 char* s = RExC_parse++;
b81d288d 3395
9a86a77b 3396 while (RExC_parse < RExC_end && UCHARAT(RExC_parse) != c)
830247a4
IZ
3397 RExC_parse++;
3398 if (RExC_parse == RExC_end)
620e46c5 3399 /* Grandfather lone [:, [=, [. */
830247a4 3400 RExC_parse = s;
620e46c5 3401 else {
830247a4 3402 char* t = RExC_parse++; /* skip over the c */
b8c5462f 3403
9a86a77b 3404 if (UCHARAT(RExC_parse) == ']') {
830247a4 3405 RExC_parse++; /* skip over the ending ] */
b8c5462f
JH
3406 posixcc = s + 1;
3407 if (*s == ':') {
3408 I32 complement = *posixcc == '^' ? *posixcc++ : 0;
3409 I32 skip = 5; /* the most common skip */
3410
3411 switch (*posixcc) {
3412 case 'a':
3413 if (strnEQ(posixcc, "alnum", 5))
3414 namedclass =
3415 complement ? ANYOF_NALNUMC : ANYOF_ALNUMC;
3416 else if (strnEQ(posixcc, "alpha", 5))
3417 namedclass =
3418 complement ? ANYOF_NALPHA : ANYOF_ALPHA;
3419 else if (strnEQ(posixcc, "ascii", 5))
3420 namedclass =
3421 complement ? ANYOF_NASCII : ANYOF_ASCII;
3422 break;
aaa51d5e
JF
3423 case 'b':
3424 if (strnEQ(posixcc, "blank", 5))
3425 namedclass =
3426 complement ? ANYOF_NBLANK : ANYOF_BLANK;
3427 break;
b8c5462f
JH
3428 case 'c':
3429 if (strnEQ(posixcc, "cntrl", 5))
3430 namedclass =
3431 complement ? ANYOF_NCNTRL : ANYOF_CNTRL;
3432 break;
3433 case 'd':
3434 if (strnEQ(posixcc, "digit", 5))
3435 namedclass =
3436 complement ? ANYOF_NDIGIT : ANYOF_DIGIT;
3437 break;
3438 case 'g':
3439 if (strnEQ(posixcc, "graph", 5))
3440 namedclass =
3441 complement ? ANYOF_NGRAPH : ANYOF_GRAPH;
3442 break;
3443 case 'l':
3444 if (strnEQ(posixcc, "lower", 5))
3445 namedclass =
3446 complement ? ANYOF_NLOWER : ANYOF_LOWER;
3447 break;
3448 case 'p':
3449 if (strnEQ(posixcc, "print", 5))
3450 namedclass =
3451 complement ? ANYOF_NPRINT : ANYOF_PRINT;
3452 else if (strnEQ(posixcc, "punct", 5))
3453 namedclass =
3454 complement ? ANYOF_NPUNCT : ANYOF_PUNCT;
3455 break;
3456 case 's':
3457 if (strnEQ(posixcc, "space", 5))
3458 namedclass =
aaa51d5e 3459 complement ? ANYOF_NPSXSPC : ANYOF_PSXSPC;
cc4319de 3460 break;
b8c5462f
JH
3461 case 'u':
3462 if (strnEQ(posixcc, "upper", 5))
3463 namedclass =
3464 complement ? ANYOF_NUPPER : ANYOF_UPPER;
3465 break;
3466 case 'w': /* this is not POSIX, this is the Perl \w */
3467 if (strnEQ(posixcc, "word", 4)) {
3468 namedclass =
3469 complement ? ANYOF_NALNUM : ANYOF_ALNUM;
3470 skip = 4;
3471 }
3472 break;
3473 case 'x':
3474 if (strnEQ(posixcc, "xdigit", 6)) {
3475 namedclass =
3476 complement ? ANYOF_NXDIGIT : ANYOF_XDIGIT;
3477 skip = 6;
3478 }
3479 break;
3480 }
ac561586
JH
3481 if (namedclass == OOB_NAMEDCLASS ||
3482 posixcc[skip] != ':' ||
3483 posixcc[skip+1] != ']')
b45f050a
JF
3484 {
3485 Simple_vFAIL3("POSIX class [:%.*s:] unknown",
3486 t - s - 1, s + 1);
3487 }
3488 } else if (!SIZE_ONLY) {
b8c5462f 3489 /* [[=foo=]] and [[.foo.]] are still future. */
b45f050a 3490
830247a4 3491 /* adjust RExC_parse so the warning shows after
b45f050a 3492 the class closes */
9a86a77b 3493 while (UCHARAT(RExC_parse) && UCHARAT(RExC_parse) != ']')
830247a4 3494 RExC_parse++;
b45f050a
JF
3495 Simple_vFAIL3("POSIX syntax [%c %c] is reserved for future extensions", c, c);
3496 }
b8c5462f
JH
3497 } else {
3498 /* Maternal grandfather:
3499 * "[:" ending in ":" but not in ":]" */
830247a4 3500 RExC_parse = s;
767d463e 3501 }
620e46c5
JH
3502 }
3503 }
3504
b8c5462f
JH
3505 return namedclass;
3506}
3507
3508STATIC void
830247a4 3509S_checkposixcc(pTHX_ RExC_state_t *pRExC_state)
b8c5462f 3510{
b938889d 3511 if (!SIZE_ONLY && POSIXCC(UCHARAT(RExC_parse))) {
830247a4 3512 char *s = RExC_parse;
93733859 3513 char c = *s++;
b8c5462f
JH
3514
3515 while(*s && isALNUM(*s))
3516 s++;
3517 if (*s && c == *s && s[1] == ']') {
cd84f5b2
RGS
3518 if (ckWARN(WARN_REGEXP))
3519 vWARN3(s+2,
3520 "POSIX syntax [%c %c] belongs inside character classes",
3521 c, c);
b45f050a
JF
3522
3523 /* [[=foo=]] and [[.foo.]] are still future. */
9a86a77b 3524 if (POSIXCC_NOTYET(c)) {
830247a4 3525 /* adjust RExC_parse so the error shows after
b45f050a 3526 the class closes */
9a86a77b 3527 while (UCHARAT(RExC_parse) && UCHARAT(RExC_parse++) != ']')
b45f050a
JF
3528 ;
3529 Simple_vFAIL3("POSIX syntax [%c %c] is reserved for future extensions", c, c);
3530 }
b8c5462f
JH
3531 }
3532 }
620e46c5
JH
3533}
3534
76e3520e 3535STATIC regnode *
830247a4 3536S_regclass(pTHX_ RExC_state_t *pRExC_state)
a687059c 3537{
ffc61ed2 3538 register UV value;
9a86a77b 3539 register UV nextvalue;
3568d838 3540 register IV prevvalue = OOB_UNICODE;
ffc61ed2 3541 register IV range = 0;
c277df42 3542 register regnode *ret;
ba210ebe 3543 STRLEN numlen;
ffc61ed2 3544 IV namedclass;
9c5ffd7c 3545 char *rangebegin = 0;
936ed897 3546 bool need_class = 0;
9c5ffd7c 3547 SV *listsv = Nullsv;
ffc61ed2
JH
3548 register char *e;
3549 UV n;
9e55ce06
JH
3550 bool optimize_invert = TRUE;
3551 AV* unicode_alternate = 0;
1b2d223b
JH
3552#ifdef EBCDIC
3553 UV literal_endpoint = 0;
3554#endif
ffc61ed2
JH
3555
3556 ret = reganode(pRExC_state, ANYOF, 0);
3557
3558 if (!SIZE_ONLY)
3559 ANYOF_FLAGS(ret) = 0;
3560
9a86a77b 3561 if (UCHARAT(RExC_parse) == '^') { /* Complement of range. */
ffc61ed2
JH
3562 RExC_naughty++;
3563 RExC_parse++;
3564 if (!SIZE_ONLY)
3565 ANYOF_FLAGS(ret) |= ANYOF_INVERT;
3566 }
a0d0e21e 3567
936ed897 3568 if (SIZE_ONLY)
830247a4 3569 RExC_size += ANYOF_SKIP;
936ed897 3570 else {
830247a4 3571 RExC_emit += ANYOF_SKIP;
936ed897
IZ
3572 if (FOLD)
3573 ANYOF_FLAGS(ret) |= ANYOF_FOLD;
3574 if (LOC)
3575 ANYOF_FLAGS(ret) |= ANYOF_LOCALE;
ffc61ed2
JH
3576 ANYOF_BITMAP_ZERO(ret);
3577 listsv = newSVpvn("# comment\n", 10);
a0d0e21e 3578 }
b8c5462f 3579
9a86a77b
JH
3580 nextvalue = RExC_parse < RExC_end ? UCHARAT(RExC_parse) : 0;
3581
b938889d 3582 if (!SIZE_ONLY && POSIXCC(nextvalue))
830247a4 3583 checkposixcc(pRExC_state);
b8c5462f 3584
f064b6ad
HS
3585 /* allow 1st char to be ] (allowing it to be - is dealt with later) */
3586 if (UCHARAT(RExC_parse) == ']')
3587 goto charclassloop;
ffc61ed2 3588
9a86a77b 3589 while (RExC_parse < RExC_end && UCHARAT(RExC_parse) != ']') {
ffc61ed2
JH
3590
3591 charclassloop:
3592
3593 namedclass = OOB_NAMEDCLASS; /* initialize as illegal */
3594
73b437c8 3595 if (!range)
830247a4 3596 rangebegin = RExC_parse;
ffc61ed2 3597 if (UTF) {
5e12f4fb 3598 value = utf8n_to_uvchr((U8*)RExC_parse,
3568d838
JH
3599 RExC_end - RExC_parse,
3600 &numlen, 0);
ffc61ed2
JH
3601 RExC_parse += numlen;
3602 }
3603 else
3604 value = UCHARAT(RExC_parse++);
9a86a77b
JH
3605 nextvalue = RExC_parse < RExC_end ? UCHARAT(RExC_parse) : 0;
3606 if (value == '[' && POSIXCC(nextvalue))
830247a4 3607 namedclass = regpposixcc(pRExC_state, value);
620e46c5 3608 else if (value == '\\') {
ffc61ed2 3609 if (UTF) {
5e12f4fb 3610 value = utf8n_to_uvchr((U8*)RExC_parse,
ffc61ed2
JH
3611 RExC_end - RExC_parse,
3612 &numlen, 0);
3613 RExC_parse += numlen;
3614 }
3615 else
3616 value = UCHARAT(RExC_parse++);
470c3474 3617 /* Some compilers cannot handle switching on 64-bit integer
ffc61ed2 3618 * values, therefore value cannot be an UV. Yes, this will
e2962f66
JH
3619 * be a problem later if we want switch on Unicode.
3620 * A similar issue a little bit later when switching on
3621 * namedclass. --jhi */
ffc61ed2 3622 switch ((I32)value) {
b8c5462f
JH
3623 case 'w': namedclass = ANYOF_ALNUM; break;
3624 case 'W': namedclass = ANYOF_NALNUM; break;
3625 case 's': namedclass = ANYOF_SPACE; break;
3626 case 'S': namedclass = ANYOF_NSPACE; break;
3627 case 'd': namedclass = ANYOF_DIGIT; break;
3628 case 'D': namedclass = ANYOF_NDIGIT; break;
ffc61ed2
JH
3629 case 'p':
3630 case 'P':
af6f566e 3631 if (RExC_parse >= RExC_end)
2a4859cd 3632 vFAIL2("Empty \\%c{}", (U8)value);
ffc61ed2 3633 if (*RExC_parse == '{') {
0da60cf5 3634 U8 c = (U8)value;
ffc61ed2
JH
3635 e = strchr(RExC_parse++, '}');
3636 if (!e)
0da60cf5 3637 vFAIL2("Missing right brace on \\%c{}", c);
ab13f0c7
JH
3638 while (isSPACE(UCHARAT(RExC_parse)))
3639 RExC_parse++;
3640 if (e == RExC_parse)
0da60cf5 3641 vFAIL2("Empty \\%c{}", c);
ffc61ed2 3642 n = e - RExC_parse;
ab13f0c7
JH
3643 while (isSPACE(UCHARAT(RExC_parse + n - 1)))
3644 n--;
ffc61ed2
JH
3645 }
3646 else {
3647 e = RExC_parse;
3648 n = 1;
3649 }
3650 if (!SIZE_ONLY) {
ab13f0c7
JH
3651 if (UCHARAT(RExC_parse) == '^') {
3652 RExC_parse++;
3653 n--;
3654 value = value == 'p' ? 'P' : 'p'; /* toggle */
3655 while (isSPACE(UCHARAT(RExC_parse))) {
3656 RExC_parse++;
3657 n--;
3658 }
3659 }
ffc61ed2 3660 if (value == 'p')
ab13f0c7
JH
3661 Perl_sv_catpvf(aTHX_ listsv,
3662 "+utf8::%.*s\n", (int)n, RExC_parse);
ffc61ed2 3663 else
ab13f0c7
JH
3664 Perl_sv_catpvf(aTHX_ listsv,
3665 "!utf8::%.*s\n", (int)n, RExC_parse);
ffc61ed2
JH
3666 }
3667 RExC_parse = e + 1;
3668 ANYOF_FLAGS(ret) |= ANYOF_UNICODE;
f81125e2
JP
3669 namedclass = ANYOF_MAX; /* no official name, but it's named */
3670 break;
b8c5462f
JH
3671 case 'n': value = '\n'; break;
3672 case 'r': value = '\r'; break;
3673 case 't': value = '\t'; break;
3674 case 'f': value = '\f'; break;
3675 case 'b': value = '\b'; break;
c7f1f016
NIS
3676 case 'e': value = ASCII_TO_NATIVE('\033');break;
3677 case 'a': value = ASCII_TO_NATIVE('\007');break;
b8c5462f 3678 case 'x':
ffc61ed2 3679 if (*RExC_parse == '{') {
a4c04bdc
NC
3680 I32 flags = PERL_SCAN_ALLOW_UNDERSCORES
3681 | PERL_SCAN_DISALLOW_PREFIX;
ffc61ed2 3682 e = strchr(RExC_parse++, '}');
b81d288d 3683 if (!e)
ffc61ed2 3684 vFAIL("Missing right brace on \\x{}");
53305cf1
NC
3685
3686 numlen = e - RExC_parse;
3687 value = grok_hex(RExC_parse, &numlen, &flags, NULL);
ffc61ed2
JH
3688 RExC_parse = e + 1;
3689 }
3690 else {
a4c04bdc 3691 I32 flags = PERL_SCAN_DISALLOW_PREFIX;
53305cf1
NC
3692 numlen = 2;
3693 value = grok_hex(RExC_parse, &numlen, &flags, NULL);
ffc61ed2
JH
3694 RExC_parse += numlen;
3695 }
b8c5462f
JH
3696 break;
3697 case 'c':
830247a4 3698 value = UCHARAT(RExC_parse++);
b8c5462f
JH
3699 value = toCTRL(value);
3700 break;
3701 case '0': case '1': case '2': case '3': case '4':
3702 case '5': case '6': case '7': case '8': case '9':
53305cf1
NC
3703 {
3704 I32 flags = 0;
3705 numlen = 3;
3706 value = grok_oct(--RExC_parse, &numlen, &flags, NULL);
830247a4 3707 RExC_parse += numlen;
b8c5462f 3708 break;
53305cf1 3709 }
1028017a 3710 default:
e476b1b5 3711 if (!SIZE_ONLY && ckWARN(WARN_REGEXP) && isALPHA(value))
ffc61ed2
JH
3712 vWARN2(RExC_parse,
3713 "Unrecognized escape \\%c in character class passed through",
3714 (int)value);
1028017a 3715 break;
b8c5462f 3716 }
ffc61ed2 3717 } /* end of \blah */
1b2d223b
JH
3718#ifdef EBCDIC
3719 else
3720 literal_endpoint++;
3721#endif
ffc61ed2
JH
3722
3723 if (namedclass > OOB_NAMEDCLASS) { /* this is a named class \blah */
3724
3725 if (!SIZE_ONLY && !need_class)
936ed897 3726 ANYOF_CLASS_ZERO(ret);
ffc61ed2 3727
936ed897 3728 need_class = 1;
ffc61ed2
JH
3729
3730 /* a bad range like a-\d, a-[:digit:] ? */
3731 if (range) {
73b437c8 3732 if (!SIZE_ONLY) {
e476b1b5 3733 if (ckWARN(WARN_REGEXP))
830247a4 3734 vWARN4(RExC_parse,
b45f050a 3735 "False [] range \"%*.*s\"",
830247a4
IZ
3736 RExC_parse - rangebegin,
3737 RExC_parse - rangebegin,
b45f050a 3738 rangebegin);
3568d838
JH
3739 if (prevvalue < 256) {
3740 ANYOF_BITMAP_SET(ret, prevvalue);
ffc61ed2
JH
3741 ANYOF_BITMAP_SET(ret, '-');
3742 }
3743 else {
3744 ANYOF_FLAGS(ret) |= ANYOF_UNICODE;
3745 Perl_sv_catpvf(aTHX_ listsv,
3568d838 3746 "%04"UVxf"\n%04"UVxf"\n", (UV)prevvalue, (UV) '-');
ffc61ed2 3747 }
b8c5462f 3748 }
ffc61ed2
JH
3749
3750 range = 0; /* this was not a true range */
73b437c8 3751 }
ffc61ed2 3752
73b437c8 3753 if (!SIZE_ONLY) {
3568d838
JH
3754 if (namedclass > OOB_NAMEDCLASS)
3755 optimize_invert = FALSE;
e2962f66
JH
3756 /* Possible truncation here but in some 64-bit environments
3757 * the compiler gets heartburn about switch on 64-bit values.
3758 * A similar issue a little earlier when switching on value.
98f323fa 3759 * --jhi */
e2962f66 3760 switch ((I32)namedclass) {
73b437c8
JH
3761 case ANYOF_ALNUM:
3762 if (LOC)
936ed897 3763 ANYOF_CLASS_SET(ret, ANYOF_ALNUM);
73b437c8
JH
3764 else {
3765 for (value = 0; value < 256; value++)
3766 if (isALNUM(value))
936ed897 3767 ANYOF_BITMAP_SET(ret, value);
73b437c8 3768 }
ffc61ed2 3769 Perl_sv_catpvf(aTHX_ listsv, "+utf8::IsWord\n");
73b437c8
JH
3770 break;
3771 case ANYOF_NALNUM:
3772 if (LOC)
936ed897 3773 ANYOF_CLASS_SET(ret, ANYOF_NALNUM);
73b437c8
JH
3774 else {
3775 for (value = 0; value < 256; value++)
3776 if (!isALNUM(value))
936ed897 3777 ANYOF_BITMAP_SET(ret, value);
73b437c8 3778 }
ffc61ed2 3779 Perl_sv_catpvf(aTHX_ listsv, "!utf8::IsWord\n");
73b437c8 3780 break;
ffc61ed2 3781 case ANYOF_ALNUMC:
73b437c8 3782 if (LOC)
ffc61ed2 3783 ANYOF_CLASS_SET(ret, ANYOF_ALNUMC);
73b437c8
JH
3784 else {
3785 for (value = 0; value < 256; value++)
ffc61ed2 3786 if (isALNUMC(value))
936ed897 3787 ANYOF_BITMAP_SET(ret, value);
73b437c8 3788 }
ffc61ed2 3789 Perl_sv_catpvf(aTHX_ listsv, "+utf8::IsAlnum\n");
73b437c8
JH
3790 break;
3791 case ANYOF_NALNUMC:
3792 if (LOC)
936ed897 3793 ANYOF_CLASS_SET(ret, ANYOF_NALNUMC);
73b437c8
JH
3794 else {
3795 for (value = 0; value < 256; value++)
3796 if (!isALNUMC(value))
936ed897 3797 ANYOF_BITMAP_SET(ret, value);
73b437c8 3798 }
ffc61ed2 3799 Perl_sv_catpvf(aTHX_ listsv, "!utf8::IsAlnum\n");
73b437c8
JH
3800 break;
3801 case ANYOF_ALPHA:
3802 if (LOC)
936ed897 3803 ANYOF_CLASS_SET(ret, ANYOF_ALPHA);
73b437c8
JH
3804 else {
3805 for (value = 0; value < 256; value++)
3806 if (isALPHA(value))
936ed897 3807 ANYOF_BITMAP_SET(ret, value);
73b437c8 3808 }
ffc61ed2 3809 Perl_sv_catpvf(aTHX_ listsv, "+utf8::IsAlpha\n");
73b437c8
JH
3810 break;
3811 case ANYOF_NALPHA:
3812 if (LOC)
936ed897 3813 ANYOF_CLASS_SET(ret, ANYOF_NALPHA);
73b437c8
JH
3814 else {
3815 for (value = 0; value < 256; value++)
3816 if (!isALPHA(value))
936ed897 3817 ANYOF_BITMAP_SET(ret, value);
73b437c8 3818 }
ffc61ed2 3819 Perl_sv_catpvf(aTHX_ listsv, "!utf8::IsAlpha\n");
73b437c8
JH
3820 break;
3821 case ANYOF_ASCII:
3822 if (LOC)
936ed897 3823 ANYOF_CLASS_SET(ret, ANYOF_ASCII);
73b437c8 3824 else {
c7f1f016 3825#ifndef EBCDIC
1ba5c669
JH
3826 for (value = 0; value < 128; value++)
3827 ANYOF_BITMAP_SET(ret, value);
3828#else /* EBCDIC */
ffbc6a93 3829 for (value = 0; value < 256; value++) {
3a3c4447
JH
3830 if (isASCII(value))
3831 ANYOF_BITMAP_SET(ret, value);
ffbc6a93 3832 }
1ba5c669 3833#endif /* EBCDIC */
73b437c8 3834 }
ffc61ed2 3835 Perl_sv_catpvf(aTHX_ listsv, "+utf8::IsASCII\n");
73b437c8
JH
3836 break;
3837 case ANYOF_NASCII:
3838 if (LOC)
936ed897 3839 ANYOF_CLASS_SET(ret, ANYOF_NASCII);
73b437c8 3840 else {
c7f1f016 3841#ifndef EBCDIC
1ba5c669
JH
3842 for (value = 128; value < 256; value++)
3843 ANYOF_BITMAP_SET(ret, value);
3844#else /* EBCDIC */
ffbc6a93 3845 for (value = 0; value < 256; value++) {
3a3c4447
JH
3846 if (!isASCII(value))
3847 ANYOF_BITMAP_SET(ret, value);
ffbc6a93 3848 }
1ba5c669 3849#endif /* EBCDIC */
73b437c8 3850 }
ffc61ed2 3851 Perl_sv_catpvf(aTHX_ listsv, "!utf8::IsASCII\n");
73b437c8 3852 break;
aaa51d5e
JF
3853 case ANYOF_BLANK:
3854 if (LOC)
3855 ANYOF_CLASS_SET(ret, ANYOF_BLANK);
3856 else {
3857 for (value = 0; value < 256; value++)
3858 if (isBLANK(value))
3859 ANYOF_BITMAP_SET(ret, value);
3860 }
ffc61ed2 3861 Perl_sv_catpvf(aTHX_ listsv, "+utf8::IsBlank\n");
aaa51d5e
JF
3862 break;
3863 case ANYOF_NBLANK:
3864 if (LOC)
3865 ANYOF_CLASS_SET(ret, ANYOF_NBLANK);
3866 else {
3867 for (value = 0; value < 256; value++)
3868 if (!isBLANK(value))
3869 ANYOF_BITMAP_SET(ret, value);
3870 }
ffc61ed2 3871 Perl_sv_catpvf(aTHX_ listsv, "!utf8::IsBlank\n");
aaa51d5e 3872 break;
73b437c8
JH
3873 case ANYOF_CNTRL:
3874 if (LOC)
936ed897 3875 ANYOF_CLASS_SET(ret, ANYOF_CNTRL);
73b437c8
JH
3876 else {
3877 for (value = 0; value < 256; value++)
3878 if (isCNTRL(value))
936ed897 3879 ANYOF_BITMAP_SET(ret, value);
73b437c8 3880 }
ffc61ed2 3881 Perl_sv_catpvf(aTHX_ listsv, "+utf8::IsCntrl\n");
73b437c8
JH
3882 break;
3883 case ANYOF_NCNTRL:
3884 if (LOC)
936ed897 3885 ANYOF_CLASS_SET(ret, ANYOF_NCNTRL);
73b437c8
JH
3886 else {
3887 for (value = 0; value < 256; value++)
3888 if (!isCNTRL(value))
936ed897 3889 ANYOF_BITMAP_SET(ret, value);
73b437c8 3890 }
ffc61ed2
JH
3891 Perl_sv_catpvf(aTHX_ listsv, "!utf8::IsCntrl\n");
3892 break;
3893 case ANYOF_DIGIT:
3894 if (LOC)
3895 ANYOF_CLASS_SET(ret, ANYOF_DIGIT);
3896 else {
3897 /* consecutive digits assumed */
3898 for (value = '0'; value <= '9'; value++)
3899 ANYOF_BITMAP_SET(ret, value);
3900 }
3901 Perl_sv_catpvf(aTHX_ listsv, "+utf8::IsDigit\n");
3902 break;
3903 case ANYOF_NDIGIT:
3904 if (LOC)
3905 ANYOF_CLASS_SET(ret, ANYOF_NDIGIT);
3906 else {
3907 /* consecutive digits assumed */
3908 for (value = 0; value < '0'; value++)
3909 ANYOF_BITMAP_SET(ret, value);
3910 for (value = '9' + 1; value < 256; value++)
3911 ANYOF_BITMAP_SET(ret, value);
3912 }
3913 Perl_sv_catpvf(aTHX_ listsv, "!utf8::IsDigit\n");
73b437c8
JH
3914 break;
3915 case ANYOF_GRAPH:
3916 if (LOC)
936ed897 3917 ANYOF_CLASS_SET(ret, ANYOF_GRAPH);
73b437c8
JH
3918 else {
3919 for (value = 0; value < 256; value++)
3920 if (isGRAPH(value))
936ed897 3921 ANYOF_BITMAP_SET(ret, value);
73b437c8 3922 }
ffc61ed2 3923 Perl_sv_catpvf(aTHX_ listsv, "+utf8::IsGraph\n");
73b437c8
JH
3924 break;
3925 case ANYOF_NGRAPH:
3926 if (LOC)
936ed897 3927 ANYOF_CLASS_SET(ret, ANYOF_NGRAPH);
73b437c8
JH
3928 else {
3929 for (value = 0; value < 256; value++)
3930 if (!isGRAPH(value))
936ed897 3931 ANYOF_BITMAP_SET(ret, value);
73b437c8 3932 }
ffc61ed2 3933 Perl_sv_catpvf(aTHX_ listsv, "!utf8::IsGraph\n");
73b437c8
JH
3934 break;
3935 case ANYOF_LOWER:
3936 if (LOC)
936ed897 3937 ANYOF_CLASS_SET(ret, ANYOF_LOWER);
73b437c8
JH
3938 else {
3939 for (value = 0; value < 256; value++)
3940 if (isLOWER(value))
936ed897 3941 ANYOF_BITMAP_SET(ret, value);
73b437c8 3942 }
ffc61ed2 3943 Perl_sv_catpvf(aTHX_ listsv, "+utf8::IsLower\n");
73b437c8
JH
3944 break;
3945 case ANYOF_NLOWER:
3946 if (LOC)
936ed897 3947 ANYOF_CLASS_SET(ret, ANYOF_NLOWER);
73b437c8
JH
3948 else {
3949 for (value = 0; value < 256; value++)
3950 if (!isLOWER(value))
936ed897 3951 ANYOF_BITMAP_SET(ret, value);
73b437c8 3952 }
ffc61ed2 3953 Perl_sv_catpvf(aTHX_ listsv, "!utf8::IsLower\n");
73b437c8
JH
3954 break;
3955 case ANYOF_PRINT:
3956 if (LOC)
936ed897 3957 ANYOF_CLASS_SET(ret, ANYOF_PRINT);
73b437c8
JH
3958 else {
3959 for (value = 0; value < 256; value++)
3960 if (isPRINT(value))
936ed897 3961 ANYOF_BITMAP_SET(ret, value);
73b437c8 3962 }
ffc61ed2 3963 Perl_sv_catpvf(aTHX_ listsv, "+utf8::IsPrint\n");
73b437c8
JH
3964 break;
3965 case ANYOF_NPRINT:
3966 if (LOC)
936ed897 3967 ANYOF_CLASS_SET(ret, ANYOF_NPRINT);
73b437c8
JH
3968 else {
3969 for (value = 0; value < 256; value++)
3970 if (!isPRINT(value))
936ed897 3971 ANYOF_BITMAP_SET(ret, value);
73b437c8 3972 }
ffc61ed2 3973 Perl_sv_catpvf(aTHX_ listsv, "!utf8::IsPrint\n");
73b437c8 3974 break;
aaa51d5e
JF
3975 case ANYOF_PSXSPC:
3976 if (LOC)
3977 ANYOF_CLASS_SET(ret, ANYOF_PSXSPC);
3978 else {
3979 for (value = 0; value < 256; value++)
3980 if (isPSXSPC(value))
3981 ANYOF_BITMAP_SET(ret, value);
3982 }
ffc61ed2 3983 Perl_sv_catpvf(aTHX_ listsv, "+utf8::IsSpace\n");
aaa51d5e
JF
3984 break;
3985 case ANYOF_NPSXSPC:
3986 if (LOC)
3987 ANYOF_CLASS_SET(ret, ANYOF_NPSXSPC);
3988 else {
3989 for (value = 0; value < 256; value++)
3990 if (!isPSXSPC(value))
3991 ANYOF_BITMAP_SET(ret, value);
3992 }
ffc61ed2 3993 Perl_sv_catpvf(aTHX_ listsv, "!utf8::IsSpace\n");
aaa51d5e 3994 break;
73b437c8
JH
3995 case ANYOF_PUNCT:
3996 if (LOC)
936ed897 3997 ANYOF_CLASS_SET(ret, ANYOF_PUNCT);
73b437c8
JH
3998 else {
3999 for (value = 0; value < 256; value++)
4000 if (isPUNCT(value))
936ed897 4001 ANYOF_BITMAP_SET(ret, value);
73b437c8 4002 }
ffc61ed2 4003 Perl_sv_catpvf(aTHX_ listsv, "+utf8::IsPunct\n");
73b437c8
JH
4004 break;
4005 case ANYOF_NPUNCT:
4006 if (LOC)
936ed897 4007 ANYOF_CLASS_SET(ret, ANYOF_NPUNCT);
73b437c8
JH
4008 else {
4009 for (value = 0; value < 256; value++)
4010 if (!isPUNCT(value))
936ed897 4011 ANYOF_BITMAP_SET(ret, value);
73b437c8 4012 }
ffc61ed2
JH
4013 Perl_sv_catpvf(aTHX_ listsv, "!utf8::IsPunct\n");
4014 break;
4015 case ANYOF_SPACE:
4016 if (LOC)
4017 ANYOF_CLASS_SET(ret, ANYOF_SPACE);
4018 else {
4019 for (value = 0; value < 256; value++)
4020 if (isSPACE(value))
4021 ANYOF_BITMAP_SET(ret, value);
4022 }
4023 Perl_sv_catpvf(aTHX_ listsv, "+utf8::IsSpacePerl\n");
4024 break;
4025 case ANYOF_NSPACE:
4026 if (LOC)
4027 ANYOF_CLASS_SET(ret, ANYOF_NSPACE);
4028 else {
4029 for (value = 0; value < 256; value++)
4030 if (!isSPACE(value))
4031 ANYOF_BITMAP_SET(ret, value);
4032 }
4033 Perl_sv_catpvf(aTHX_ listsv, "!utf8::IsSpacePerl\n");
73b437c8
JH
4034 break;
4035 case ANYOF_UPPER:
4036 if (LOC)
936ed897 4037 ANYOF_CLASS_SET(ret, ANYOF_UPPER);
73b437c8
JH
4038 else {
4039 for (value = 0; value < 256; value++)
4040 if (isUPPER(value))
936ed897 4041 ANYOF_BITMAP_SET(ret, value);
73b437c8 4042 }
ffc61ed2 4043 Perl_sv_catpvf(aTHX_ listsv, "+utf8::IsUpper\n");
73b437c8
JH
4044 break;
4045 case ANYOF_NUPPER:
4046 if (LOC)
936ed897 4047 ANYOF_CLASS_SET(ret, ANYOF_NUPPER);
73b437c8
JH
4048 else {
4049 for (value = 0; value < 256; value++)
4050 if (!isUPPER(value))
936ed897 4051 ANYOF_BITMAP_SET(ret, value);
73b437c8 4052 }
ffc61ed2 4053 Perl_sv_catpvf(aTHX_ listsv, "!utf8::IsUpper\n");
73b437c8
JH
4054 break;
4055 case ANYOF_XDIGIT:
4056 if (LOC)
936ed897 4057 ANYOF_CLASS_SET(ret, ANYOF_XDIGIT);
73b437c8
JH
4058 else {
4059 for (value = 0; value < 256; value++)
4060 if (isXDIGIT(value))
936ed897 4061 ANYOF_BITMAP_SET(ret, value);
73b437c8 4062 }
ffc61ed2 4063 Perl_sv_catpvf(aTHX_ listsv, "+utf8::IsXDigit\n");
73b437c8
JH
4064 break;
4065 case ANYOF_NXDIGIT:
4066 if (LOC)
936ed897 4067 ANYOF_CLASS_SET(ret, ANYOF_NXDIGIT);
73b437c8
JH
4068 else {
4069 for (value = 0; value < 256; value++)
4070 if (!isXDIGIT(value))
936ed897 4071 ANYOF_BITMAP_SET(ret, value);
73b437c8 4072 }
ffc61ed2 4073 Perl_sv_catpvf(aTHX_ listsv, "!utf8::IsXDigit\n");
73b437c8 4074 break;
f81125e2
JP
4075 case ANYOF_MAX:
4076 /* this is to handle \p and \P */
4077 break;
73b437c8 4078 default:
b45f050a 4079 vFAIL("Invalid [::] class");
73b437c8 4080 break;
b8c5462f 4081 }
b8c5462f 4082 if (LOC)
936ed897 4083 ANYOF_FLAGS(ret) |= ANYOF_CLASS;
73b437c8 4084 continue;
a0d0e21e 4085 }
ffc61ed2
JH
4086 } /* end of namedclass \blah */
4087
a0d0e21e 4088 if (range) {
eb160463 4089 if (prevvalue > (IV)value) /* b-a */ {
b45f050a 4090 Simple_vFAIL4("Invalid [] range \"%*.*s\"",
830247a4
IZ
4091 RExC_parse - rangebegin,
4092 RExC_parse - rangebegin,
b45f050a 4093 rangebegin);
3568d838 4094 range = 0; /* not a valid range */
73b437c8 4095 }
a0d0e21e
LW
4096 }
4097 else {
3568d838 4098 prevvalue = value; /* save the beginning of the range */
830247a4
IZ
4099 if (*RExC_parse == '-' && RExC_parse+1 < RExC_end &&
4100 RExC_parse[1] != ']') {
4101 RExC_parse++;
ffc61ed2
JH
4102
4103 /* a bad range like \w-, [:word:]- ? */
4104 if (namedclass > OOB_NAMEDCLASS) {
e476b1b5 4105 if (ckWARN(WARN_REGEXP))
830247a4 4106 vWARN4(RExC_parse,
b45f050a 4107 "False [] range \"%*.*s\"",
830247a4
IZ
4108 RExC_parse - rangebegin,
4109 RExC_parse - rangebegin,
b45f050a 4110 rangebegin);
73b437c8 4111 if (!SIZE_ONLY)
936ed897 4112 ANYOF_BITMAP_SET(ret, '-');
73b437c8 4113 } else
ffc61ed2
JH
4114 range = 1; /* yeah, it's a range! */
4115 continue; /* but do it the next time */
a0d0e21e 4116 }
a687059c 4117 }
ffc61ed2 4118
93733859 4119 /* now is the next time */
ae5c130c 4120 if (!SIZE_ONLY) {
3568d838
JH
4121 IV i;
4122
4123 if (prevvalue < 256) {
4124 IV ceilvalue = value < 256 ? value : 255;
4125
4126#ifdef EBCDIC
1b2d223b
JH
4127 /* In EBCDIC [\x89-\x91] should include
4128 * the \x8e but [i-j] should not. */
4129 if (literal_endpoint == 2 &&
4130 ((isLOWER(prevvalue) && isLOWER(ceilvalue)) ||
4131 (isUPPER(prevvalue) && isUPPER(ceilvalue))))
ffc61ed2 4132 {
3568d838
JH
4133 if (isLOWER(prevvalue)) {
4134 for (i = prevvalue; i <= ceilvalue; i++)
ffc61ed2
JH
4135 if (isLOWER(i))
4136 ANYOF_BITMAP_SET(ret, i);
4137 } else {
3568d838 4138 for (i = prevvalue; i <= ceilvalue; i++)
ffc61ed2
JH
4139 if (isUPPER(i))
4140 ANYOF_BITMAP_SET(ret, i);
4141 }
8ada0baa 4142 }
ffc61ed2 4143 else
8ada0baa 4144#endif
a5961de5
JH
4145 for (i = prevvalue; i <= ceilvalue; i++)
4146 ANYOF_BITMAP_SET(ret, i);
3568d838 4147 }
a5961de5 4148 if (value > 255 || UTF) {
b08decb7
JH
4149 UV prevnatvalue = NATIVE_TO_UNI(prevvalue);
4150 UV natvalue = NATIVE_TO_UNI(value);
4151
ffc61ed2 4152 ANYOF_FLAGS(ret) |= ANYOF_UNICODE;
b08decb7 4153 if (prevnatvalue < natvalue) { /* what about > ? */
ffc61ed2 4154 Perl_sv_catpvf(aTHX_ listsv, "%04"UVxf"\t%04"UVxf"\n",
b08decb7
JH
4155 prevnatvalue, natvalue);
4156 }
4157 else if (prevnatvalue == natvalue) {
4158 Perl_sv_catpvf(aTHX_ listsv, "%04"UVxf"\n", natvalue);
09091399 4159 if (FOLD) {
254ba52a
JH
4160 U8 foldbuf[UTF8_MAXLEN_FOLD+1];
4161 STRLEN foldlen;
2f3bf011 4162 UV f = to_uni_fold(natvalue, foldbuf, &foldlen);
254ba52a 4163
c840d2a2
JH
4164 /* If folding and foldable and a single
4165 * character, insert also the folded version
4166 * to the charclass. */
9e55ce06 4167 if (f != value) {
eb160463 4168 if (foldlen == (STRLEN)UNISKIP(f))
9e55ce06
JH
4169 Perl_sv_catpvf(aTHX_ listsv,
4170 "%04"UVxf"\n", f);
4171 else {
4172 /* Any multicharacter foldings
4173 * require the following transform:
4174 * [ABCDEF] -> (?:[ABCabcDEFd]|pq|rst)
4175 * where E folds into "pq" and F folds
4176 * into "rst", all other characters
4177 * fold to single characters. We save
4178 * away these multicharacter foldings,
4179 * to be later saved as part of the
4180 * additional "s" data. */
4181 SV *sv;
4182
4183 if (!unicode_alternate)
4184 unicode_alternate = newAV();
4185 sv = newSVpvn((char*)foldbuf, foldlen);
4186 SvUTF8_on(sv);
4187 av_push(unicode_alternate, sv);
4188 }
4189 }
254ba52a 4190
60a8b682
JH
4191 /* If folding and the value is one of the Greek
4192 * sigmas insert a few more sigmas to make the
4193 * folding rules of the sigmas to work right.
4194 * Note that not all the possible combinations
4195 * are handled here: some of them are handled
9e55ce06
JH
4196 * by the standard folding rules, and some of
4197 * them (literal or EXACTF cases) are handled
4198 * during runtime in regexec.c:S_find_byclass(). */
09091399
JH
4199 if (value == UNICODE_GREEK_SMALL_LETTER_FINAL_SIGMA) {
4200 Perl_sv_catpvf(aTHX_ listsv, "%04"UVxf"\n",
eb050b28 4201 (UV)UNICODE_GREEK_CAPITAL_LETTER_SIGMA);
09091399 4202 Perl_sv_catpvf(aTHX_ listsv, "%04"UVxf"\n",
eb050b28 4203 (UV)UNICODE_GREEK_SMALL_LETTER_SIGMA);
09091399
JH
4204 }
4205 else if (value == UNICODE_GREEK_CAPITAL_LETTER_SIGMA)
4206 Perl_sv_catpvf(aTHX_ listsv, "%04"UVxf"\n",
eb050b28 4207 (UV)UNICODE_GREEK_SMALL_LETTER_SIGMA);
09091399
JH
4208 }
4209 }
ffc61ed2 4210 }
1b2d223b
JH
4211#ifdef EBCDIC
4212 literal_endpoint = 0;
4213#endif
8ada0baa 4214 }
ffc61ed2
JH
4215
4216 range = 0; /* this range (if it was one) is done now */
a0d0e21e 4217 }
ffc61ed2 4218
936ed897 4219 if (need_class) {
4f66b38d 4220 ANYOF_FLAGS(ret) |= ANYOF_LARGE;
936ed897 4221 if (SIZE_ONLY)
830247a4 4222 RExC_size += ANYOF_CLASS_ADD_SKIP;
936ed897 4223 else
830247a4 4224 RExC_emit += ANYOF_CLASS_ADD_SKIP;
936ed897 4225 }
ffc61ed2 4226
ae5c130c 4227 /* optimize case-insensitive simple patterns (e.g. /[a-z]/i) */
b8c5462f 4228 if (!SIZE_ONLY &&
ffc61ed2 4229 /* If the only flag is folding (plus possibly inversion). */
516a5887
JH
4230 ((ANYOF_FLAGS(ret) & (ANYOF_FLAGS_ALL ^ ANYOF_INVERT)) == ANYOF_FOLD)
4231 ) {
a0ed51b3 4232 for (value = 0; value < 256; ++value) {
936ed897 4233 if (ANYOF_BITMAP_TEST(ret, value)) {
eb160463 4234 UV fold = PL_fold[value];
ffc61ed2
JH
4235
4236 if (fold != value)
4237 ANYOF_BITMAP_SET(ret, fold);
ae5c130c
GS
4238 }
4239 }
936ed897 4240 ANYOF_FLAGS(ret) &= ~ANYOF_FOLD;
ae5c130c 4241 }
ffc61ed2 4242
ae5c130c 4243 /* optimize inverted simple patterns (e.g. [^a-z]) */
3568d838 4244 if (!SIZE_ONLY && optimize_invert &&
ffc61ed2
JH
4245 /* If the only flag is inversion. */
4246 (ANYOF_FLAGS(ret) & ANYOF_FLAGS_ALL) == ANYOF_INVERT) {
b8c5462f 4247 for (value = 0; value < ANYOF_BITMAP_SIZE; ++value)
936ed897 4248 ANYOF_BITMAP(ret)[value] ^= ANYOF_FLAGS_ALL;
1aa99e6b 4249 ANYOF_FLAGS(ret) = ANYOF_UNICODE_ALL;
ae5c130c 4250 }
a0d0e21e 4251
b81d288d 4252 if (!SIZE_ONLY) {
fde631ed 4253 AV *av = newAV();
ffc61ed2
JH
4254 SV *rv;
4255
9e55ce06 4256 /* The 0th element stores the character class description
6a0407ee 4257 * in its textual form: used later (regexec.c:Perl_regclass_swash())
9e55ce06
JH
4258 * to initialize the appropriate swash (which gets stored in
4259 * the 1st element), and also useful for dumping the regnode.
4260 * The 2nd element stores the multicharacter foldings,
6a0407ee 4261 * used later (regexec.c:S_reginclass()). */
ffc61ed2
JH
4262 av_store(av, 0, listsv);
4263 av_store(av, 1, NULL);
9e55ce06 4264 av_store(av, 2, (SV*)unicode_alternate);
ffc61ed2 4265 rv = newRV_noinc((SV*)av);
19860706 4266 n = add_data(pRExC_state, 1, "s");
830247a4 4267 RExC_rx->data->data[n] = (void*)rv;
ffc61ed2 4268 ARG_SET(ret, n);
a0ed51b3
LW
4269 }
4270
4271 return ret;
4272}
4273
76e3520e 4274STATIC char*
830247a4 4275S_nextchar(pTHX_ RExC_state_t *pRExC_state)
a0d0e21e 4276{
830247a4 4277 char* retval = RExC_parse++;
a0d0e21e 4278
4633a7c4 4279 for (;;) {
830247a4
IZ
4280 if (*RExC_parse == '(' && RExC_parse[1] == '?' &&
4281 RExC_parse[2] == '#') {
e994fd66
AE
4282 while (*RExC_parse != ')') {
4283 if (RExC_parse == RExC_end)
4284 FAIL("Sequence (?#... not terminated");
830247a4 4285 RExC_parse++;
e994fd66 4286 }
830247a4 4287 RExC_parse++;
4633a7c4
LW
4288 continue;
4289 }
e2509266 4290 if (RExC_flags & PMf_EXTENDED) {
830247a4
IZ
4291 if (isSPACE(*RExC_parse)) {
4292 RExC_parse++;
748a9306
LW
4293 continue;
4294 }
830247a4 4295 else if (*RExC_parse == '#') {
e994fd66
AE
4296 while (RExC_parse < RExC_end)
4297 if (*RExC_parse++ == '\n') break;
748a9306
LW
4298 continue;
4299 }
748a9306 4300 }
4633a7c4 4301 return retval;
a0d0e21e 4302 }
a687059c
LW
4303}
4304
4305/*
c277df42 4306- reg_node - emit a node
a0d0e21e 4307*/
76e3520e 4308STATIC regnode * /* Location. */
830247a4 4309S_reg_node(pTHX_ RExC_state_t *pRExC_state, U8 op)
a687059c 4310{
c277df42
IZ
4311 register regnode *ret;
4312 register regnode *ptr;
a687059c 4313
830247a4 4314 ret = RExC_emit;
c277df42 4315 if (SIZE_ONLY) {
830247a4
IZ
4316 SIZE_ALIGN(RExC_size);
4317 RExC_size += 1;
a0d0e21e
LW
4318 return(ret);
4319 }
a687059c 4320
c277df42 4321 NODE_ALIGN_FILL(ret);
a0d0e21e 4322 ptr = ret;
c277df42 4323 FILL_ADVANCE_NODE(ptr, op);
fac92740 4324 if (RExC_offsets) { /* MJD */
ccb2c380 4325 MJD_OFFSET_DEBUG(("%s:%u: (op %s) %s %u <- %u (len %u) (max %u).\n",
fac92740
MJD
4326 "reg_node", __LINE__,
4327 reg_name[op],
4328 RExC_emit - RExC_emit_start > RExC_offsets[0]
4329 ? "Overwriting end of array!\n" : "OK",
4330 RExC_emit - RExC_emit_start,
4331 RExC_parse - RExC_start,
4332 RExC_offsets[0]));
ccb2c380 4333 Set_Node_Offset(RExC_emit, RExC_parse + (op == END));
fac92740
MJD
4334 }
4335
830247a4 4336 RExC_emit = ptr;
a687059c 4337
a0d0e21e 4338 return(ret);
a687059c
LW
4339}
4340
4341/*
a0d0e21e
LW
4342- reganode - emit a node with an argument
4343*/
76e3520e 4344STATIC regnode * /* Location. */
830247a4 4345S_reganode(pTHX_ RExC_state_t *pRExC_state, U8 op, U32 arg)
fe14fcc3 4346{
c277df42
IZ
4347 register regnode *ret;
4348 register regnode *ptr;
fe14fcc3 4349
830247a4 4350 ret = RExC_emit;
c277df42 4351 if (SIZE_ONLY) {
830247a4
IZ
4352 SIZE_ALIGN(RExC_size);
4353 RExC_size += 2;
a0d0e21e
LW
4354 return(ret);
4355 }
fe14fcc3 4356
c277df42 4357 NODE_ALIGN_FILL(ret);
a0d0e21e 4358 ptr = ret;
c277df42 4359 FILL_ADVANCE_NODE_ARG(ptr, op, arg);
fac92740 4360 if (RExC_offsets) { /* MJD */
ccb2c380 4361 MJD_OFFSET_DEBUG(("%s(%d): (op %s) %s %u <- %u (max %u).\n",
fac92740 4362 "reganode",
ccb2c380
MP
4363 __LINE__,
4364 reg_name[op],
fac92740
MJD
4365 RExC_emit - RExC_emit_start > RExC_offsets[0] ?
4366 "Overwriting end of array!\n" : "OK",
4367 RExC_emit - RExC_emit_start,
4368 RExC_parse - RExC_start,
4369 RExC_offsets[0]));
ccb2c380 4370 Set_Cur_Node_Offset;
fac92740
MJD
4371 }
4372
830247a4 4373 RExC_emit = ptr;
fe14fcc3 4374
a0d0e21e 4375 return(ret);
fe14fcc3
LW
4376}
4377
4378/*
cd439c50 4379- reguni - emit (if appropriate) a Unicode character
a0ed51b3
LW
4380*/
4381STATIC void
830247a4 4382S_reguni(pTHX_ RExC_state_t *pRExC_state, UV uv, char* s, STRLEN* lenp)
a0ed51b3 4383{
5e12f4fb 4384 *lenp = SIZE_ONLY ? UNISKIP(uv) : (uvchr_to_utf8((U8*)s, uv) - (U8*)s);
a0ed51b3
LW
4385}
4386
4387/*
a0d0e21e
LW
4388- reginsert - insert an operator in front of already-emitted operand
4389*
4390* Means relocating the operand.
4391*/
76e3520e 4392STATIC void
830247a4 4393S_reginsert(pTHX_ RExC_state_t *pRExC_state, U8 op, regnode *opnd)
a687059c 4394{
c277df42
IZ
4395 register regnode *src;
4396 register regnode *dst;
4397 register regnode *place;
4398 register int offset = regarglen[(U8)op];
b81d288d 4399
22c35a8c 4400/* (PL_regkind[(U8)op] == CURLY ? EXTRA_STEP_2ARGS : 0); */
c277df42
IZ
4401
4402 if (SIZE_ONLY) {
830247a4 4403 RExC_size += NODE_STEP_REGNODE + offset;
a0d0e21e
LW
4404 return;
4405 }
a687059c 4406
830247a4
IZ
4407 src = RExC_emit;
4408 RExC_emit += NODE_STEP_REGNODE + offset;
4409 dst = RExC_emit;
fac92740 4410 while (src > opnd) {
c277df42 4411 StructCopy(--src, --dst, regnode);
fac92740 4412 if (RExC_offsets) { /* MJD 20010112 */
ccb2c380 4413 MJD_OFFSET_DEBUG(("%s(%d): (op %s) %s copy %u -> %u (max %u).\n",
fac92740 4414 "reg_insert",
ccb2c380
MP
4415 __LINE__,
4416 reg_name[op],
fac92740
MJD
4417 dst - RExC_emit_start > RExC_offsets[0]
4418 ? "Overwriting end of array!\n" : "OK",
4419 src - RExC_emit_start,
4420 dst - RExC_emit_start,
4421 RExC_offsets[0]));
ccb2c380
MP
4422 Set_Node_Offset_To_R(dst-RExC_emit_start, Node_Offset(src));
4423 Set_Node_Length_To_R(dst-RExC_emit_start, Node_Length(src));
fac92740
MJD
4424 }
4425 }
4426
a0d0e21e
LW
4427
4428 place = opnd; /* Op node, where operand used to be. */
fac92740 4429 if (RExC_offsets) { /* MJD */
ccb2c380 4430 MJD_OFFSET_DEBUG(("%s(%d): (op %s) %s %u <- %u (max %u).\n",
fac92740 4431 "reginsert",
ccb2c380
MP
4432 __LINE__,
4433 reg_name[op],
fac92740
MJD
4434 place - RExC_emit_start > RExC_offsets[0]
4435 ? "Overwriting end of array!\n" : "OK",
4436 place - RExC_emit_start,
4437 RExC_parse - RExC_start,
4438 RExC_offsets[0]));
ccb2c380 4439 Set_Node_Offset(place, RExC_parse);
45948336 4440 Set_Node_Length(place, 1);
fac92740 4441 }
c277df42
IZ
4442 src = NEXTOPER(place);
4443 FILL_ADVANCE_NODE(place, op);
4444 Zero(src, offset, regnode);
a687059c
LW
4445}
4446
4447/*
c277df42 4448- regtail - set the next-pointer at the end of a node chain of p to val.
a0d0e21e 4449*/
76e3520e 4450STATIC void
830247a4 4451S_regtail(pTHX_ RExC_state_t *pRExC_state, regnode *p, regnode *val)
a687059c 4452{
c277df42
IZ
4453 register regnode *scan;
4454 register regnode *temp;
a0d0e21e 4455
c277df42 4456 if (SIZE_ONLY)
a0d0e21e
LW
4457 return;
4458
4459 /* Find last node. */
4460 scan = p;
4461 for (;;) {
4462 temp = regnext(scan);
4463 if (temp == NULL)
4464 break;
4465 scan = temp;
4466 }
a687059c 4467
c277df42
IZ
4468 if (reg_off_by_arg[OP(scan)]) {
4469 ARG_SET(scan, val - scan);
a0ed51b3
LW
4470 }
4471 else {
c277df42
IZ
4472 NEXT_OFF(scan) = val - scan;
4473 }
a687059c
LW
4474}
4475
4476/*
a0d0e21e
LW
4477- regoptail - regtail on operand of first argument; nop if operandless
4478*/
76e3520e 4479STATIC void
830247a4 4480S_regoptail(pTHX_ RExC_state_t *pRExC_state, regnode *p, regnode *val)
a687059c 4481{
a0d0e21e 4482 /* "Operandless" and "op != BRANCH" are synonymous in practice. */
c277df42
IZ
4483 if (p == NULL || SIZE_ONLY)
4484 return;
22c35a8c 4485 if (PL_regkind[(U8)OP(p)] == BRANCH) {
830247a4 4486 regtail(pRExC_state, NEXTOPER(p), val);
a0ed51b3 4487 }
22c35a8c 4488 else if ( PL_regkind[(U8)OP(p)] == BRANCHJ) {
830247a4 4489 regtail(pRExC_state, NEXTOPER(NEXTOPER(p)), val);
a0ed51b3
LW
4490 }
4491 else
a0d0e21e 4492 return;
a687059c
LW
4493}
4494
4495/*
4496 - regcurly - a little FSA that accepts {\d+,?\d*}
4497 */
79072805 4498STATIC I32
cea2e8a9 4499S_regcurly(pTHX_ register char *s)
a687059c
LW
4500{
4501 if (*s++ != '{')
4502 return FALSE;
f0fcb552 4503 if (!isDIGIT(*s))
a687059c 4504 return FALSE;
f0fcb552 4505 while (isDIGIT(*s))
a687059c
LW
4506 s++;
4507 if (*s == ',')
4508 s++;
f0fcb552 4509 while (isDIGIT(*s))
a687059c
LW
4510 s++;
4511 if (*s != '}')
4512 return FALSE;
4513 return TRUE;
4514}
4515
a687059c 4516
8fa7f367
JH
4517#ifdef DEBUGGING
4518
76e3520e 4519STATIC regnode *
cea2e8a9 4520S_dumpuntil(pTHX_ regnode *start, regnode *node, regnode *last, SV* sv, I32 l)
c277df42 4521{
f248d071 4522 register U8 op = EXACT; /* Arbitrary non-END op. */
155aba94 4523 register regnode *next;
c277df42
IZ
4524
4525 while (op != END && (!last || node < last)) {
4526 /* While that wasn't END last time... */
4527
4528 NODE_ALIGN(node);
4529 op = OP(node);
4530 if (op == CLOSE)
4531 l--;
4532 next = regnext(node);
4533 /* Where, what. */
4534 if (OP(node) == OPTIMIZED)
4535 goto after_print;
4536 regprop(sv, node);
b900a521 4537 PerlIO_printf(Perl_debug_log, "%4"IVdf":%*s%s", (IV)(node - start),
f1dbda3d 4538 (int)(2*l + 1), "", SvPVX(sv));
c277df42
IZ
4539 if (next == NULL) /* Next ptr. */
4540 PerlIO_printf(Perl_debug_log, "(0)");
b81d288d 4541 else
b900a521 4542 PerlIO_printf(Perl_debug_log, "(%"IVdf")", (IV)(next - start));
c277df42
IZ
4543 (void)PerlIO_putc(Perl_debug_log, '\n');
4544 after_print:
22c35a8c 4545 if (PL_regkind[(U8)op] == BRANCHJ) {
b81d288d
AB
4546 register regnode *nnode = (OP(next) == LONGJMP
4547 ? regnext(next)
c277df42
IZ
4548 : next);
4549 if (last && nnode > last)
4550 nnode = last;
4551 node = dumpuntil(start, NEXTOPER(NEXTOPER(node)), nnode, sv, l + 1);
a0ed51b3 4552 }
22c35a8c 4553 else if (PL_regkind[(U8)op] == BRANCH) {
c277df42 4554 node = dumpuntil(start, NEXTOPER(node), next, sv, l + 1);
a0ed51b3
LW
4555 }
4556 else if ( op == CURLY) { /* `next' might be very big: optimizer */
c277df42
IZ
4557 node = dumpuntil(start, NEXTOPER(node) + EXTRA_STEP_2ARGS,
4558 NEXTOPER(node) + EXTRA_STEP_2ARGS + 1, sv, l + 1);
a0ed51b3 4559 }
22c35a8c 4560 else if (PL_regkind[(U8)op] == CURLY && op != CURLYX) {
c277df42
IZ
4561 node = dumpuntil(start, NEXTOPER(node) + EXTRA_STEP_2ARGS,
4562 next, sv, l + 1);
a0ed51b3
LW
4563 }
4564 else if ( op == PLUS || op == STAR) {
c277df42 4565 node = dumpuntil(start, NEXTOPER(node), NEXTOPER(node) + 1, sv, l + 1);
a0ed51b3
LW
4566 }
4567 else if (op == ANYOF) {
4f66b38d
HS
4568 /* arglen 1 + class block */
4569 node += 1 + ((ANYOF_FLAGS(node) & ANYOF_LARGE)
4570 ? ANYOF_CLASS_SKIP : ANYOF_SKIP);
4571 node = NEXTOPER(node);
a0ed51b3 4572 }
22c35a8c 4573 else if (PL_regkind[(U8)op] == EXACT) {
c277df42 4574 /* Literal string, where present. */
cd439c50 4575 node += NODE_SZ_STR(node) - 1;
c277df42 4576 node = NEXTOPER(node);
a0ed51b3
LW
4577 }
4578 else {
c277df42
IZ
4579 node = NEXTOPER(node);
4580 node += regarglen[(U8)op];
4581 }
4582 if (op == CURLYX || op == OPEN)
4583 l++;
4584 else if (op == WHILEM)
4585 l--;
4586 }
4587 return node;
4588}
4589
8fa7f367
JH
4590#endif /* DEBUGGING */
4591
a687059c 4592/*
fd181c75 4593 - regdump - dump a regexp onto Perl_debug_log in vaguely comprehensible form
a687059c
LW
4594 */
4595void
864dbfa3 4596Perl_regdump(pTHX_ regexp *r)
a687059c 4597{
35ff7856 4598#ifdef DEBUGGING
46fc3d4c 4599 SV *sv = sv_newmortal();
a687059c 4600
c277df42 4601 (void)dumpuntil(r->program, r->program + 1, NULL, sv, 0);
a0d0e21e
LW
4602
4603 /* Header fields of interest. */
c277df42 4604 if (r->anchored_substr)
7b0972df 4605 PerlIO_printf(Perl_debug_log,
b81d288d 4606 "anchored `%s%.*s%s'%s at %"IVdf" ",
3280af22 4607 PL_colors[0],
7b0972df 4608 (int)(SvCUR(r->anchored_substr) - (SvTAIL(r->anchored_substr)!=0)),
b81d288d 4609 SvPVX(r->anchored_substr),
3280af22 4610 PL_colors[1],
c277df42 4611 SvTAIL(r->anchored_substr) ? "$" : "",
7b0972df 4612 (IV)r->anchored_offset);
33b8afdf
JH
4613 else if (r->anchored_utf8)
4614 PerlIO_printf(Perl_debug_log,
4615 "anchored utf8 `%s%.*s%s'%s at %"IVdf" ",
4616 PL_colors[0],
4617 (int)(SvCUR(r->anchored_utf8) - (SvTAIL(r->anchored_utf8)!=0)),
4618 SvPVX(r->anchored_utf8),
4619 PL_colors[1],
4620 SvTAIL(r->anchored_utf8) ? "$" : "",
4621 (IV)r->anchored_offset);
c277df42 4622 if (r->float_substr)
7b0972df 4623 PerlIO_printf(Perl_debug_log,
b81d288d 4624 "floating `%s%.*s%s'%s at %"IVdf"..%"UVuf" ",
3280af22 4625 PL_colors[0],
b81d288d 4626 (int)(SvCUR(r->float_substr) - (SvTAIL(r->float_substr)!=0)),
2c2d71f5 4627 SvPVX(r->float_substr),
3280af22 4628 PL_colors[1],
c277df42 4629 SvTAIL(r->float_substr) ? "$" : "",
7b0972df 4630 (IV)r->float_min_offset, (UV)r->float_max_offset);
33b8afdf
JH
4631 else if (r->float_utf8)
4632 PerlIO_printf(Perl_debug_log,
4633 "floating utf8 `%s%.*s%s'%s at %"IVdf"..%"UVuf" ",
4634 PL_colors[0],
4635 (int)(SvCUR(r->float_utf8) - (SvTAIL(r->float_utf8)!=0)),
4636 SvPVX(r->float_utf8),
4637 PL_colors[1],
4638 SvTAIL(r->float_utf8) ? "$" : "",
4639 (IV)r->float_min_offset, (UV)r->float_max_offset);
4640 if (r->check_substr || r->check_utf8)
b81d288d
AB
4641 PerlIO_printf(Perl_debug_log,
4642 r->check_substr == r->float_substr
33b8afdf 4643 && r->check_utf8 == r->float_utf8
c277df42
IZ
4644 ? "(checking floating" : "(checking anchored");
4645 if (r->reganch & ROPT_NOSCAN)
4646 PerlIO_printf(Perl_debug_log, " noscan");
4647 if (r->reganch & ROPT_CHECK_ALL)
4648 PerlIO_printf(Perl_debug_log, " isall");
33b8afdf 4649 if (r->check_substr || r->check_utf8)
c277df42
IZ
4650 PerlIO_printf(Perl_debug_log, ") ");
4651
46fc3d4c 4652 if (r->regstclass) {
4653 regprop(sv, r->regstclass);
4654 PerlIO_printf(Perl_debug_log, "stclass `%s' ", SvPVX(sv));
4655 }
774d564b 4656 if (r->reganch & ROPT_ANCH) {
4657 PerlIO_printf(Perl_debug_log, "anchored");
4658 if (r->reganch & ROPT_ANCH_BOL)
4659 PerlIO_printf(Perl_debug_log, "(BOL)");
c277df42
IZ
4660 if (r->reganch & ROPT_ANCH_MBOL)
4661 PerlIO_printf(Perl_debug_log, "(MBOL)");
cad2e5aa
JH
4662 if (r->reganch & ROPT_ANCH_SBOL)
4663 PerlIO_printf(Perl_debug_log, "(SBOL)");
774d564b 4664 if (r->reganch & ROPT_ANCH_GPOS)
4665 PerlIO_printf(Perl_debug_log, "(GPOS)");
4666 PerlIO_putc(Perl_debug_log, ' ');
4667 }
c277df42
IZ
4668 if (r->reganch & ROPT_GPOS_SEEN)
4669 PerlIO_printf(Perl_debug_log, "GPOS ");
a0d0e21e 4670 if (r->reganch & ROPT_SKIP)
760ac839 4671 PerlIO_printf(Perl_debug_log, "plus ");
a0d0e21e 4672 if (r->reganch & ROPT_IMPLICIT)
760ac839 4673 PerlIO_printf(Perl_debug_log, "implicit ");
760ac839 4674 PerlIO_printf(Perl_debug_log, "minlen %ld ", (long) r->minlen);
ce862d02
IZ
4675 if (r->reganch & ROPT_EVAL_SEEN)
4676 PerlIO_printf(Perl_debug_log, "with eval ");
760ac839 4677 PerlIO_printf(Perl_debug_log, "\n");
fac92740
MJD
4678 if (r->offsets) {
4679 U32 i;
4680 U32 len = r->offsets[0];
392fbf5d 4681 PerlIO_printf(Perl_debug_log, "Offsets: [%"UVuf"]\n\t", (UV)r->offsets[0]);
fac92740 4682 for (i = 1; i <= len; i++)
392fbf5d
RB
4683 PerlIO_printf(Perl_debug_log, "%"UVuf"[%"UVuf"] ",
4684 (UV)r->offsets[i*2-1],
4685 (UV)r->offsets[i*2]);
fac92740
MJD
4686 PerlIO_printf(Perl_debug_log, "\n");
4687 }
17c3b450 4688#endif /* DEBUGGING */
a687059c
LW
4689}
4690
8fa7f367
JH
4691#ifdef DEBUGGING
4692
653099ff
GS
4693STATIC void
4694S_put_byte(pTHX_ SV *sv, int c)
4695{
7be5a6cf 4696 if (isCNTRL(c) || c == 255 || !isPRINT(c))
653099ff
GS
4697 Perl_sv_catpvf(aTHX_ sv, "\\%o", c);
4698 else if (c == '-' || c == ']' || c == '\\' || c == '^')
4699 Perl_sv_catpvf(aTHX_ sv, "\\%c", c);
4700 else
4701 Perl_sv_catpvf(aTHX_ sv, "%c", c);
4702}
4703
8fa7f367
JH
4704#endif /* DEBUGGING */
4705
a687059c 4706/*
a0d0e21e
LW
4707- regprop - printable representation of opcode
4708*/
46fc3d4c 4709void
864dbfa3 4710Perl_regprop(pTHX_ SV *sv, regnode *o)
a687059c 4711{
35ff7856 4712#ifdef DEBUGGING
9b155405 4713 register int k;
a0d0e21e 4714
54dc92de 4715 sv_setpvn(sv, "", 0);
9b155405 4716 if (OP(o) >= reg_num) /* regnode.type is unsigned */
830247a4
IZ
4717 /* It would be nice to FAIL() here, but this may be called from
4718 regexec.c, and it would be hard to supply pRExC_state. */
4719 Perl_croak(aTHX_ "Corrupted regexp opcode");
9b155405
IZ
4720 sv_catpv(sv, (char*)reg_name[OP(o)]); /* Take off const! */
4721
4722 k = PL_regkind[(U8)OP(o)];
4723
2a782b5b
JH
4724 if (k == EXACT) {
4725 SV *dsv = sv_2mortal(newSVpvn("", 0));
c728cb41
JH
4726 /* Using is_utf8_string() is a crude hack but it may
4727 * be the best for now since we have no flag "this EXACTish
4728 * node was UTF-8" --jhi */
4729 bool do_utf8 = is_utf8_string((U8*)STRING(o), STR_LEN(o));
8a989385 4730 char *s = do_utf8 ?
c728cb41
JH
4731 pv_uni_display(dsv, (U8*)STRING(o), STR_LEN(o), 60,
4732 UNI_DISPLAY_REGEX) :
2a782b5b 4733 STRING(o);
40eddc46 4734 int len = do_utf8 ?
2a782b5b
JH
4735 strlen(s) :
4736 STR_LEN(o);
4737 Perl_sv_catpvf(aTHX_ sv, " <%s%.*s%s>",
4738 PL_colors[0],
4739 len, s,
4740 PL_colors[1]);
4741 }
9b155405 4742 else if (k == CURLY) {
cb434fcc 4743 if (OP(o) == CURLYM || OP(o) == CURLYN || OP(o) == CURLYX)
cea2e8a9
GS
4744 Perl_sv_catpvf(aTHX_ sv, "[%d]", o->flags); /* Parenth number */
4745 Perl_sv_catpvf(aTHX_ sv, " {%d,%d}", ARG1(o), ARG2(o));
a0d0e21e 4746 }
2c2d71f5
JH
4747 else if (k == WHILEM && o->flags) /* Ordinal/of */
4748 Perl_sv_catpvf(aTHX_ sv, "[%d/%d]", o->flags & 0xf, o->flags>>4);
9b155405 4749 else if (k == REF || k == OPEN || k == CLOSE || k == GROUPP )
894356b3 4750 Perl_sv_catpvf(aTHX_ sv, "%d", (int)ARG(o)); /* Parenth number */
9b155405 4751 else if (k == LOGICAL)
04ebc1ab 4752 Perl_sv_catpvf(aTHX_ sv, "[%d]", o->flags); /* 2: embedded, otherwise 1 */
653099ff
GS
4753 else if (k == ANYOF) {
4754 int i, rangestart = -1;
ffc61ed2 4755 U8 flags = ANYOF_FLAGS(o);
a6d05634 4756 const char * const anyofs[] = { /* Should be synchronized with
19860706 4757 * ANYOF_ #xdefines in regcomp.h */
653099ff
GS
4758 "\\w",
4759 "\\W",
4760 "\\s",
4761 "\\S",
4762 "\\d",
4763 "\\D",
4764 "[:alnum:]",
4765 "[:^alnum:]",
4766 "[:alpha:]",
4767 "[:^alpha:]",
4768 "[:ascii:]",
4769 "[:^ascii:]",
4770 "[:ctrl:]",
4771 "[:^ctrl:]",
4772 "[:graph:]",
4773 "[:^graph:]",
4774 "[:lower:]",
4775 "[:^lower:]",
4776 "[:print:]",
4777 "[:^print:]",
4778 "[:punct:]",
4779 "[:^punct:]",
4780 "[:upper:]",
aaa51d5e 4781 "[:^upper:]",
653099ff 4782 "[:xdigit:]",
aaa51d5e
JF
4783 "[:^xdigit:]",
4784 "[:space:]",
4785 "[:^space:]",
4786 "[:blank:]",
4787 "[:^blank:]"
653099ff
GS
4788 };
4789
19860706 4790 if (flags & ANYOF_LOCALE)
653099ff 4791 sv_catpv(sv, "{loc}");
19860706 4792 if (flags & ANYOF_FOLD)
653099ff
GS
4793 sv_catpv(sv, "{i}");
4794 Perl_sv_catpvf(aTHX_ sv, "[%s", PL_colors[0]);
19860706 4795 if (flags & ANYOF_INVERT)
653099ff 4796 sv_catpv(sv, "^");
ffc61ed2
JH
4797 for (i = 0; i <= 256; i++) {
4798 if (i < 256 && ANYOF_BITMAP_TEST(o,i)) {
4799 if (rangestart == -1)
4800 rangestart = i;
4801 } else if (rangestart != -1) {
4802 if (i <= rangestart + 3)
4803 for (; rangestart < i; rangestart++)
653099ff 4804 put_byte(sv, rangestart);
ffc61ed2
JH
4805 else {
4806 put_byte(sv, rangestart);
4807 sv_catpv(sv, "-");
4808 put_byte(sv, i - 1);
653099ff 4809 }
ffc61ed2 4810 rangestart = -1;
653099ff 4811 }
847a199f 4812 }
ffc61ed2
JH
4813
4814 if (o->flags & ANYOF_CLASS)
4815 for (i = 0; i < sizeof(anyofs)/sizeof(char*); i++)
4816 if (ANYOF_CLASS_TEST(o,i))
4817 sv_catpv(sv, anyofs[i]);
4818
4819 if (flags & ANYOF_UNICODE)
4820 sv_catpv(sv, "{unicode}");
1aa99e6b 4821 else if (flags & ANYOF_UNICODE_ALL)
2a782b5b 4822 sv_catpv(sv, "{unicode_all}");
ffc61ed2
JH
4823
4824 {
4825 SV *lv;
9e55ce06 4826 SV *sw = regclass_swash(o, FALSE, &lv, 0);
b81d288d 4827
ffc61ed2
JH
4828 if (lv) {
4829 if (sw) {
ffc61ed2 4830 U8 s[UTF8_MAXLEN+1];
b81d288d 4831
ffc61ed2 4832 for (i = 0; i <= 256; i++) { /* just the first 256 */
2b9d42f0 4833 U8 *e = uvchr_to_utf8(s, i);
ffc61ed2 4834
3568d838 4835 if (i < 256 && swash_fetch(sw, s, TRUE)) {
ffc61ed2
JH
4836 if (rangestart == -1)
4837 rangestart = i;
4838 } else if (rangestart != -1) {
4839 U8 *p;
b81d288d 4840
ffc61ed2
JH
4841 if (i <= rangestart + 3)
4842 for (; rangestart < i; rangestart++) {
2b9d42f0 4843 for(e = uvchr_to_utf8(s, rangestart), p = s; p < e; p++)
ffc61ed2
JH
4844 put_byte(sv, *p);
4845 }
4846 else {
2b9d42f0 4847 for (e = uvchr_to_utf8(s, rangestart), p = s; p < e; p++)
ffc61ed2
JH
4848 put_byte(sv, *p);
4849 sv_catpv(sv, "-");
2b9d42f0 4850 for (e = uvchr_to_utf8(s, i - 1), p = s; p < e; p++)
ffc61ed2
JH
4851 put_byte(sv, *p);
4852 }
4853 rangestart = -1;
4854 }
19860706 4855 }
ffc61ed2
JH
4856
4857 sv_catpv(sv, "..."); /* et cetera */
19860706 4858 }
fde631ed 4859
ffc61ed2
JH
4860 {
4861 char *s = savepv(SvPVX(lv));
4862 char *origs = s;
b81d288d 4863
ffc61ed2 4864 while(*s && *s != '\n') s++;
b81d288d 4865
ffc61ed2
JH
4866 if (*s == '\n') {
4867 char *t = ++s;
4868
4869 while (*s) {
4870 if (*s == '\n')
4871 *s = ' ';
4872 s++;
4873 }
4874 if (s[-1] == ' ')
4875 s[-1] = 0;
4876
4877 sv_catpv(sv, t);
fde631ed 4878 }
b81d288d 4879
ffc61ed2 4880 Safefree(origs);
fde631ed
JH
4881 }
4882 }
653099ff 4883 }
ffc61ed2 4884
653099ff
GS
4885 Perl_sv_catpvf(aTHX_ sv, "%s]", PL_colors[1]);
4886 }
9b155405 4887 else if (k == BRANCHJ && (OP(o) == UNLESSM || OP(o) == IFMATCH))
cea2e8a9 4888 Perl_sv_catpvf(aTHX_ sv, "[-%d]", o->flags);
17c3b450 4889#endif /* DEBUGGING */
35ff7856 4890}
a687059c 4891
cad2e5aa
JH
4892SV *
4893Perl_re_intuit_string(pTHX_ regexp *prog)
4894{ /* Assume that RE_INTUIT is set */
4895 DEBUG_r(
4896 { STRLEN n_a;
33b8afdf
JH
4897 char *s = SvPV(prog->check_substr
4898 ? prog->check_substr : prog->check_utf8, n_a);
cad2e5aa
JH
4899
4900 if (!PL_colorset) reginitcolors();
4901 PerlIO_printf(Perl_debug_log,
33b8afdf
JH
4902 "%sUsing REx %ssubstr:%s `%s%.60s%s%s'\n",
4903 PL_colors[4],
4904 prog->check_substr ? "" : "utf8 ",
4905 PL_colors[5],PL_colors[0],
cad2e5aa
JH
4906 s,
4907 PL_colors[1],
4908 (strlen(s) > 60 ? "..." : ""));
4909 } );
4910
33b8afdf 4911 return prog->check_substr ? prog->check_substr : prog->check_utf8;
cad2e5aa
JH
4912}
4913
2b69d0c2 4914void
864dbfa3 4915Perl_pregfree(pTHX_ struct regexp *r)
a687059c 4916{
9e55ce06
JH
4917#ifdef DEBUGGING
4918 SV *dsv = PERL_DEBUG_PAD_ZERO(0);
4919#endif
7821416a
IZ
4920
4921 if (!r || (--r->refcnt > 0))
4922 return;
9e55ce06 4923 DEBUG_r({
d103360b
HS
4924 int len;
4925 char *s;
4926
4927 s = (r->reganch & ROPT_UTF8) ? pv_uni_display(dsv, (U8*)r->precomp,
4928 r->prelen, 60, UNI_DISPLAY_REGEX)
9f369894 4929 : pv_display(dsv, r->precomp, r->prelen, 0, 60);
d103360b 4930 len = SvCUR(dsv);
9e55ce06
JH
4931 if (!PL_colorset)
4932 reginitcolors();
4933 PerlIO_printf(Perl_debug_log,
4934 "%sFreeing REx:%s `%s%*.*s%s%s'\n",
4935 PL_colors[4],PL_colors[5],PL_colors[0],
4936 len, len, s,
4937 PL_colors[1],
4938 len > 60 ? "..." : "");
4939 });
cad2e5aa 4940
c277df42 4941 if (r->precomp)
a0d0e21e 4942 Safefree(r->precomp);
fac92740
MJD
4943 if (r->offsets) /* 20010421 MJD */
4944 Safefree(r->offsets);
ed252734
NC
4945 RX_MATCH_COPY_FREE(r);
4946#ifdef PERL_COPY_ON_WRITE
4947 if (r->saved_copy)
4948 SvREFCNT_dec(r->saved_copy);
4949#endif
a193d654
GS
4950 if (r->substrs) {
4951 if (r->anchored_substr)
4952 SvREFCNT_dec(r->anchored_substr);
33b8afdf
JH
4953 if (r->anchored_utf8)
4954 SvREFCNT_dec(r->anchored_utf8);
a193d654
GS
4955 if (r->float_substr)
4956 SvREFCNT_dec(r->float_substr);
33b8afdf
JH
4957 if (r->float_utf8)
4958 SvREFCNT_dec(r->float_utf8);
2779dcf1 4959 Safefree(r->substrs);
a193d654 4960 }
c277df42
IZ
4961 if (r->data) {
4962 int n = r->data->count;
f3548bdc
DM
4963 PAD* new_comppad = NULL;
4964 PAD* old_comppad;
dfad63ad 4965
c277df42 4966 while (--n >= 0) {
261faec3 4967 /* If you add a ->what type here, update the comment in regcomp.h */
c277df42
IZ
4968 switch (r->data->what[n]) {
4969 case 's':
4970 SvREFCNT_dec((SV*)r->data->data[n]);
4971 break;
653099ff
GS
4972 case 'f':
4973 Safefree(r->data->data[n]);
4974 break;
dfad63ad
HS
4975 case 'p':
4976 new_comppad = (AV*)r->data->data[n];
4977 break;
c277df42 4978 case 'o':
dfad63ad 4979 if (new_comppad == NULL)
cea2e8a9 4980 Perl_croak(aTHX_ "panic: pregfree comppad");
f3548bdc
DM
4981 PAD_SAVE_LOCAL(old_comppad,
4982 /* Watch out for global destruction's random ordering. */
4983 (SvTYPE(new_comppad) == SVt_PVAV) ?
4984 new_comppad : Null(PAD *)
4985 );
b34c0dd4 4986 OP_REFCNT_LOCK;
9b978d73 4987 if (!OpREFCNT_dec((OP_4tree*)r->data->data[n])) {
b34c0dd4 4988 OP_REFCNT_UNLOCK;
9b978d73
DM
4989 op_free((OP_4tree*)r->data->data[n]);
4990 }
b34c0dd4
DM
4991 else {
4992 OP_REFCNT_UNLOCK;
4993 }
9b978d73 4994
f3548bdc 4995 PAD_RESTORE_LOCAL(old_comppad);
dfad63ad
HS
4996 SvREFCNT_dec((SV*)new_comppad);
4997 new_comppad = NULL;
c277df42
IZ
4998 break;
4999 case 'n':
9e55ce06 5000 break;
c277df42 5001 default:
830247a4 5002 Perl_croak(aTHX_ "panic: regfree data code '%c'", r->data->what[n]);
c277df42
IZ
5003 }
5004 }
5005 Safefree(r->data->what);
5006 Safefree(r->data);
a0d0e21e
LW
5007 }
5008 Safefree(r->startp);
5009 Safefree(r->endp);
5010 Safefree(r);
a687059c 5011}
c277df42
IZ
5012
5013/*
5014 - regnext - dig the "next" pointer out of a node
5015 *
5016 * [Note, when REGALIGN is defined there are two places in regmatch()
5017 * that bypass this code for speed.]
5018 */
5019regnode *
864dbfa3 5020Perl_regnext(pTHX_ register regnode *p)
c277df42
IZ
5021{
5022 register I32 offset;
5023
3280af22 5024 if (p == &PL_regdummy)
c277df42
IZ
5025 return(NULL);
5026
5027 offset = (reg_off_by_arg[OP(p)] ? ARG(p) : NEXT_OFF(p));
5028 if (offset == 0)
5029 return(NULL);
5030
c277df42 5031 return(p+offset);
c277df42
IZ
5032}
5033
01f988be 5034STATIC void
cea2e8a9 5035S_re_croak2(pTHX_ const char* pat1,const char* pat2,...)
c277df42
IZ
5036{
5037 va_list args;
5038 STRLEN l1 = strlen(pat1);
5039 STRLEN l2 = strlen(pat2);
5040 char buf[512];
06bf62c7 5041 SV *msv;
c277df42
IZ
5042 char *message;
5043
5044 if (l1 > 510)
5045 l1 = 510;
5046 if (l1 + l2 > 510)
5047 l2 = 510 - l1;
5048 Copy(pat1, buf, l1 , char);
5049 Copy(pat2, buf + l1, l2 , char);
3b818b81
GS
5050 buf[l1 + l2] = '\n';
5051 buf[l1 + l2 + 1] = '\0';
8736538c
AS
5052#ifdef I_STDARG
5053 /* ANSI variant takes additional second argument */
c277df42 5054 va_start(args, pat2);
8736538c
AS
5055#else
5056 va_start(args);
5057#endif
5a844595 5058 msv = vmess(buf, &args);
c277df42 5059 va_end(args);
06bf62c7 5060 message = SvPV(msv,l1);
c277df42
IZ
5061 if (l1 > 512)
5062 l1 = 512;
5063 Copy(message, buf, l1 , char);
197cf9b9 5064 buf[l1-1] = '\0'; /* Overwrite \n */
cea2e8a9 5065 Perl_croak(aTHX_ "%s", buf);
c277df42 5066}
a0ed51b3
LW
5067
5068/* XXX Here's a total kludge. But we need to re-enter for swash routines. */
5069
5070void
864dbfa3 5071Perl_save_re_context(pTHX)
b81d288d 5072{
830247a4 5073 SAVEI32(PL_reg_flags); /* from regexec.c */
a0ed51b3 5074 SAVEPPTR(PL_bostr);
a0ed51b3
LW
5075 SAVEPPTR(PL_reginput); /* String-input pointer. */
5076 SAVEPPTR(PL_regbol); /* Beginning of input, for ^ check. */
5077 SAVEPPTR(PL_regeol); /* End of input, for $ check. */
7766f137
GS
5078 SAVEVPTR(PL_regstartp); /* Pointer to startp array. */
5079 SAVEVPTR(PL_regendp); /* Ditto for endp. */
5080 SAVEVPTR(PL_reglastparen); /* Similarly for lastparen. */
a5db57d6 5081 SAVEVPTR(PL_reglastcloseparen); /* Similarly for lastcloseparen. */
a0ed51b3 5082 SAVEPPTR(PL_regtill); /* How far we are required to go. */
b81d288d 5083 SAVEGENERICPV(PL_reg_start_tmp); /* from regexec.c */
a0ed51b3 5084 PL_reg_start_tmp = 0;
a0ed51b3
LW
5085 SAVEI32(PL_reg_start_tmpl); /* from regexec.c */
5086 PL_reg_start_tmpl = 0;
7766f137 5087 SAVEVPTR(PL_regdata);
a0ed51b3
LW
5088 SAVEI32(PL_reg_eval_set); /* from regexec.c */
5089 SAVEI32(PL_regnarrate); /* from regexec.c */
7766f137 5090 SAVEVPTR(PL_regprogram); /* from regexec.c */
a0ed51b3 5091 SAVEINT(PL_regindent); /* from regexec.c */
7766f137
GS
5092 SAVEVPTR(PL_regcc); /* from regexec.c */
5093 SAVEVPTR(PL_curcop);
7766f137
GS
5094 SAVEVPTR(PL_reg_call_cc); /* from regexec.c */
5095 SAVEVPTR(PL_reg_re); /* from regexec.c */
54b6e2fa
IZ
5096 SAVEPPTR(PL_reg_ganch); /* from regexec.c */
5097 SAVESPTR(PL_reg_sv); /* from regexec.c */
9febdf04 5098 SAVEBOOL(PL_reg_match_utf8); /* from regexec.c */
7766f137 5099 SAVEVPTR(PL_reg_magic); /* from regexec.c */
54b6e2fa 5100 SAVEI32(PL_reg_oldpos); /* from regexec.c */
7766f137
GS
5101 SAVEVPTR(PL_reg_oldcurpm); /* from regexec.c */
5102 SAVEVPTR(PL_reg_curpm); /* from regexec.c */
a5db57d6
GS
5103 SAVEPPTR(PL_reg_oldsaved); /* old saved substr during match */
5104 PL_reg_oldsaved = Nullch;
5105 SAVEI32(PL_reg_oldsavedlen); /* old length of saved substr during match */
5106 PL_reg_oldsavedlen = 0;
ed252734
NC
5107#ifdef PERL_COPY_ON_WRITE
5108 SAVESPTR(PL_nrs);
5109 PL_nrs = Nullsv;
5110#endif
a5db57d6
GS
5111 SAVEI32(PL_reg_maxiter); /* max wait until caching pos */
5112 PL_reg_maxiter = 0;
5113 SAVEI32(PL_reg_leftiter); /* wait until caching pos */
5114 PL_reg_leftiter = 0;
5115 SAVEGENERICPV(PL_reg_poscache); /* cache of pos of WHILEM */
5116 PL_reg_poscache = Nullch;
5117 SAVEI32(PL_reg_poscache_size); /* size of pos cache of WHILEM */
5118 PL_reg_poscache_size = 0;
5119 SAVEPPTR(PL_regprecomp); /* uncompiled string. */
5fb7366e 5120 SAVEI32(PL_regnpar); /* () count. */
e49a9654 5121 SAVEI32(PL_regsize); /* from regexec.c */
ada6e8a9
AMS
5122
5123 {
5124 /* Save $1..$n (#18107: UTF-8 s/(\w+)/uc($1)/e); AMS 20021106. */
a8514157 5125 U32 i;
ada6e8a9
AMS
5126 GV *mgv;
5127 REGEXP *rx;
5128 char digits[16];
5129
5130 if (PL_curpm && (rx = PM_GETRE(PL_curpm))) {
5131 for (i = 1; i <= rx->nparens; i++) {
d994d9a1 5132 sprintf(digits, "%lu", (long)i);
ada6e8a9
AMS
5133 if ((mgv = gv_fetchpv(digits, FALSE, SVt_PV)))
5134 save_scalar(mgv);
5135 }
5136 }
5137 }
5138
54b6e2fa 5139#ifdef DEBUGGING
b81d288d 5140 SAVEPPTR(PL_reg_starttry); /* from regexec.c */
54b6e2fa 5141#endif
a0ed51b3 5142}
51371543 5143
51371543 5144static void
acfe0abc 5145clear_re(pTHX_ void *r)
51371543
GS
5146{
5147 ReREFCNT_dec((regexp *)r);
5148}
ffbc6a93 5149