This is a live mirror of the Perl 5 development currently hosted at https://github.com/perl/perl5
Unused variables.
[perl5.git] / regcomp.c
CommitLineData
a0d0e21e
LW
1/* regcomp.c
2 */
3
4/*
5 * "A fair jaw-cracker dwarf-language must be." --Samwise Gamgee
6 */
7
a687059c
LW
8/* NOTE: this is derived from Henry Spencer's regexp code, and should not
9 * confused with the original package (see point 3 below). Thanks, Henry!
10 */
11
12/* Additional note: this code is very heavily munged from Henry's version
13 * in places. In some spots I've traded clarity for efficiency, so don't
14 * blame Henry for some of the lack of readability.
15 */
16
e50aee73
AD
17/* The names of the functions have been changed from regcomp and
18 * regexec to pregcomp and pregexec in order to avoid conflicts
19 * with the POSIX routines of the same names.
20*/
21
b9d5759e
AD
22#ifdef PERL_EXT_RE_BUILD
23/* need to replace pregcomp et al, so enable that */
24# ifndef PERL_IN_XSUB_RE
25# define PERL_IN_XSUB_RE
26# endif
27/* need access to debugger hooks */
cad2e5aa 28# if defined(PERL_EXT_RE_DEBUG) && !defined(DEBUGGING)
b9d5759e
AD
29# define DEBUGGING
30# endif
31#endif
32
33#ifdef PERL_IN_XSUB_RE
d06ea78c 34/* We *really* need to overwrite these symbols: */
56953603
IZ
35# define Perl_pregcomp my_regcomp
36# define Perl_regdump my_regdump
37# define Perl_regprop my_regprop
d06ea78c 38# define Perl_pregfree my_regfree
cad2e5aa
JH
39# define Perl_re_intuit_string my_re_intuit_string
40/* *These* symbols are masked to allow static link. */
d06ea78c 41# define Perl_regnext my_regnext
f0b8d043 42# define Perl_save_re_context my_save_re_context
b81d288d 43# define Perl_reginitcolors my_reginitcolors
c5be433b
GS
44
45# define PERL_NO_GET_CONTEXT
b81d288d 46#endif
56953603 47
f0fcb552 48/*SUPPRESS 112*/
a687059c 49/*
e50aee73 50 * pregcomp and pregexec -- regsub and regerror are not used in perl
a687059c
LW
51 *
52 * Copyright (c) 1986 by University of Toronto.
53 * Written by Henry Spencer. Not derived from licensed software.
54 *
55 * Permission is granted to anyone to use this software for any
56 * purpose on any computer system, and to redistribute it freely,
57 * subject to the following restrictions:
58 *
59 * 1. The author is not responsible for the consequences of use of
60 * this software, no matter how awful, even if they arise
61 * from defects in it.
62 *
63 * 2. The origin of this software must not be misrepresented, either
64 * by explicit claim or by omission.
65 *
66 * 3. Altered versions must be plainly marked as such, and must not
67 * be misrepresented as being the original software.
68 *
69 *
70 **** Alterations to Henry's code are...
71 ****
4bb101f2
JH
72 **** Copyright (C) 1991, 1992, 1993, 1994, 1995, 1996, 1997, 1998, 1999,
73 **** 2000, 2001, 2002, 2003, by Larry Wall and others
a687059c 74 ****
9ef589d8
LW
75 **** You may distribute under the terms of either the GNU General Public
76 **** License or the Artistic License, as specified in the README file.
77
a687059c
LW
78 *
79 * Beware that some of this code is subtly aware of the way operator
80 * precedence is structured in regular expressions. Serious changes in
81 * regular-expression syntax might require a total rethink.
82 */
83#include "EXTERN.h"
864dbfa3 84#define PERL_IN_REGCOMP_C
a687059c 85#include "perl.h"
d06ea78c 86
acfe0abc 87#ifndef PERL_IN_XSUB_RE
d06ea78c
GS
88# include "INTERN.h"
89#endif
c277df42
IZ
90
91#define REG_COMP_C
a687059c
LW
92#include "regcomp.h"
93
d4cce5f1 94#ifdef op
11343788 95#undef op
d4cce5f1 96#endif /* op */
11343788 97
fe14fcc3
LW
98#ifdef MSDOS
99# if defined(BUGGY_MSC6)
100 /* MSC 6.00A breaks on op/regexp.t test 85 unless we turn this off */
101 # pragma optimize("a",off)
102 /* But MSC 6.00A is happy with 'w', for aliases only across function calls*/
103 # pragma optimize("w",on )
104# endif /* BUGGY_MSC6 */
105#endif /* MSDOS */
106
a687059c
LW
107#ifndef STATIC
108#define STATIC static
109#endif
110
830247a4 111typedef struct RExC_state_t {
e2509266 112 U32 flags; /* are we folding, multilining? */
830247a4
IZ
113 char *precomp; /* uncompiled string. */
114 regexp *rx;
fac92740 115 char *start; /* Start of input for compile */
830247a4
IZ
116 char *end; /* End of input for compile */
117 char *parse; /* Input-scan pointer. */
118 I32 whilem_seen; /* number of WHILEM in this expr */
fac92740 119 regnode *emit_start; /* Start of emitted-code area */
ffc61ed2 120 regnode *emit; /* Code-emit pointer; &regdummy = don't = compiling */
830247a4
IZ
121 I32 naughty; /* How bad is this pattern? */
122 I32 sawback; /* Did we see \1, ...? */
123 U32 seen;
124 I32 size; /* Code size. */
125 I32 npar; /* () count. */
126 I32 extralen;
127 I32 seen_zerolen;
128 I32 seen_evals;
1aa99e6b 129 I32 utf8;
830247a4
IZ
130#if ADD_TO_REGEXEC
131 char *starttry; /* -Dr: where regtry was called. */
132#define RExC_starttry (pRExC_state->starttry)
133#endif
134} RExC_state_t;
135
e2509266 136#define RExC_flags (pRExC_state->flags)
830247a4
IZ
137#define RExC_precomp (pRExC_state->precomp)
138#define RExC_rx (pRExC_state->rx)
fac92740 139#define RExC_start (pRExC_state->start)
830247a4
IZ
140#define RExC_end (pRExC_state->end)
141#define RExC_parse (pRExC_state->parse)
142#define RExC_whilem_seen (pRExC_state->whilem_seen)
fac92740 143#define RExC_offsets (pRExC_state->rx->offsets) /* I am not like the others */
830247a4 144#define RExC_emit (pRExC_state->emit)
fac92740 145#define RExC_emit_start (pRExC_state->emit_start)
830247a4
IZ
146#define RExC_naughty (pRExC_state->naughty)
147#define RExC_sawback (pRExC_state->sawback)
148#define RExC_seen (pRExC_state->seen)
149#define RExC_size (pRExC_state->size)
150#define RExC_npar (pRExC_state->npar)
151#define RExC_extralen (pRExC_state->extralen)
152#define RExC_seen_zerolen (pRExC_state->seen_zerolen)
153#define RExC_seen_evals (pRExC_state->seen_evals)
1aa99e6b 154#define RExC_utf8 (pRExC_state->utf8)
830247a4 155
a687059c
LW
156#define ISMULT1(c) ((c) == '*' || (c) == '+' || (c) == '?')
157#define ISMULT2(s) ((*s) == '*' || (*s) == '+' || (*s) == '?' || \
158 ((*s) == '{' && regcurly(s)))
a687059c 159
35c8bce7
LW
160#ifdef SPSTART
161#undef SPSTART /* dratted cpp namespace... */
162#endif
a687059c
LW
163/*
164 * Flags to be passed up and down.
165 */
a687059c 166#define WORST 0 /* Worst case. */
821b33a5 167#define HASWIDTH 0x1 /* Known to match non-null strings. */
a0d0e21e
LW
168#define SIMPLE 0x2 /* Simple enough to be STAR/PLUS operand. */
169#define SPSTART 0x4 /* Starts with * or +. */
170#define TRYAGAIN 0x8 /* Weeded out a declaration. */
a687059c 171
2c2d71f5
JH
172/* Length of a variant. */
173
174typedef struct scan_data_t {
175 I32 len_min;
176 I32 len_delta;
177 I32 pos_min;
178 I32 pos_delta;
179 SV *last_found;
180 I32 last_end; /* min value, <0 unless valid. */
181 I32 last_start_min;
182 I32 last_start_max;
183 SV **longest; /* Either &l_fixed, or &l_float. */
184 SV *longest_fixed;
185 I32 offset_fixed;
186 SV *longest_float;
187 I32 offset_float_min;
188 I32 offset_float_max;
189 I32 flags;
190 I32 whilem_c;
cb434fcc 191 I32 *last_closep;
653099ff 192 struct regnode_charclass_class *start_class;
2c2d71f5
JH
193} scan_data_t;
194
a687059c 195/*
e50aee73 196 * Forward declarations for pregcomp()'s friends.
a687059c 197 */
a0d0e21e 198
b81d288d 199static scan_data_t zero_scan_data = { 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
cb434fcc 200 0, 0, 0, 0, 0, 0};
c277df42
IZ
201
202#define SF_BEFORE_EOL (SF_BEFORE_SEOL|SF_BEFORE_MEOL)
203#define SF_BEFORE_SEOL 0x1
204#define SF_BEFORE_MEOL 0x2
205#define SF_FIX_BEFORE_EOL (SF_FIX_BEFORE_SEOL|SF_FIX_BEFORE_MEOL)
206#define SF_FL_BEFORE_EOL (SF_FL_BEFORE_SEOL|SF_FL_BEFORE_MEOL)
207
09b7f37c
CB
208#ifdef NO_UNARY_PLUS
209# define SF_FIX_SHIFT_EOL (0+2)
210# define SF_FL_SHIFT_EOL (0+4)
211#else
212# define SF_FIX_SHIFT_EOL (+2)
213# define SF_FL_SHIFT_EOL (+4)
214#endif
c277df42
IZ
215
216#define SF_FIX_BEFORE_SEOL (SF_BEFORE_SEOL << SF_FIX_SHIFT_EOL)
217#define SF_FIX_BEFORE_MEOL (SF_BEFORE_MEOL << SF_FIX_SHIFT_EOL)
218
219#define SF_FL_BEFORE_SEOL (SF_BEFORE_SEOL << SF_FL_SHIFT_EOL)
220#define SF_FL_BEFORE_MEOL (SF_BEFORE_MEOL << SF_FL_SHIFT_EOL) /* 0x20 */
221#define SF_IS_INF 0x40
222#define SF_HAS_PAR 0x80
223#define SF_IN_PAR 0x100
224#define SF_HAS_EVAL 0x200
4bfe0158 225#define SCF_DO_SUBSTR 0x400
653099ff
GS
226#define SCF_DO_STCLASS_AND 0x0800
227#define SCF_DO_STCLASS_OR 0x1000
228#define SCF_DO_STCLASS (SCF_DO_STCLASS_AND|SCF_DO_STCLASS_OR)
e1901655 229#define SCF_WHILEM_VISITED_POS 0x2000
c277df42 230
eb160463 231#define UTF (RExC_utf8 != 0)
e2509266
JH
232#define LOC ((RExC_flags & PMf_LOCALE) != 0)
233#define FOLD ((RExC_flags & PMf_FOLD) != 0)
a0ed51b3 234
ffc61ed2 235#define OOB_UNICODE 12345678
93733859 236#define OOB_NAMEDCLASS -1
b8c5462f 237
a0ed51b3
LW
238#define CHR_SVLEN(sv) (UTF ? sv_len_utf8(sv) : SvCUR(sv))
239#define CHR_DIST(a,b) (UTF ? utf8_distance(a,b) : a - b)
240
8615cb43 241
b45f050a
JF
242/* length of regex to show in messages that don't mark a position within */
243#define RegexLengthToShowInErrorMessages 127
244
245/*
246 * If MARKER[12] are adjusted, be sure to adjust the constants at the top
247 * of t/op/regmesg.t, the tests in t/op/re_tests, and those in
248 * op/pragma/warn/regcomp.
249 */
7253e4e3
RK
250#define MARKER1 "<-- HERE" /* marker as it appears in the description */
251#define MARKER2 " <-- HERE " /* marker as it appears within the regex */
b81d288d 252
7253e4e3 253#define REPORT_LOCATION " in regex; marked by " MARKER1 " in m/%.*s" MARKER2 "%s/"
b45f050a
JF
254
255/*
256 * Calls SAVEDESTRUCTOR_X if needed, then calls Perl_croak with the given
257 * arg. Show regex, up to a maximum length. If it's too long, chop and add
258 * "...".
259 */
ccb2c380
MP
260#define FAIL(msg) STMT_START { \
261 char *ellipses = ""; \
262 IV len = RExC_end - RExC_precomp; \
263 \
264 if (!SIZE_ONLY) \
265 SAVEDESTRUCTOR_X(clear_re,(void*)RExC_rx); \
266 if (len > RegexLengthToShowInErrorMessages) { \
267 /* chop 10 shorter than the max, to ensure meaning of "..." */ \
268 len = RegexLengthToShowInErrorMessages - 10; \
269 ellipses = "..."; \
270 } \
271 Perl_croak(aTHX_ "%s in regex m/%.*s%s/", \
272 msg, (int)len, RExC_precomp, ellipses); \
273} STMT_END
8615cb43 274
b45f050a
JF
275/*
276 * Calls SAVEDESTRUCTOR_X if needed, then calls Perl_croak with the given
277 * args. Show regex, up to a maximum length. If it's too long, chop and add
278 * "...".
279 */
ccb2c380
MP
280#define FAIL2(pat,msg) STMT_START { \
281 char *ellipses = ""; \
282 IV len = RExC_end - RExC_precomp; \
283 \
284 if (!SIZE_ONLY) \
285 SAVEDESTRUCTOR_X(clear_re,(void*)RExC_rx); \
286 if (len > RegexLengthToShowInErrorMessages) { \
287 /* chop 10 shorter than the max, to ensure meaning of "..." */ \
288 len = RegexLengthToShowInErrorMessages - 10; \
289 ellipses = "..."; \
290 } \
291 S_re_croak2(aTHX_ pat, " in regex m/%.*s%s/", \
292 msg, (int)len, RExC_precomp, ellipses); \
293} STMT_END
b45f050a
JF
294
295
296/*
297 * Simple_vFAIL -- like FAIL, but marks the current location in the scan
298 */
ccb2c380
MP
299#define Simple_vFAIL(m) STMT_START { \
300 IV offset = RExC_parse - RExC_precomp; \
301 Perl_croak(aTHX_ "%s" REPORT_LOCATION, \
302 m, (int)offset, RExC_precomp, RExC_precomp + offset); \
303} STMT_END
b45f050a
JF
304
305/*
306 * Calls SAVEDESTRUCTOR_X if needed, then Simple_vFAIL()
307 */
ccb2c380
MP
308#define vFAIL(m) STMT_START { \
309 if (!SIZE_ONLY) \
310 SAVEDESTRUCTOR_X(clear_re,(void*)RExC_rx); \
311 Simple_vFAIL(m); \
312} STMT_END
b45f050a
JF
313
314/*
315 * Like Simple_vFAIL(), but accepts two arguments.
316 */
ccb2c380
MP
317#define Simple_vFAIL2(m,a1) STMT_START { \
318 IV offset = RExC_parse - RExC_precomp; \
319 S_re_croak2(aTHX_ m, REPORT_LOCATION, a1, \
320 (int)offset, RExC_precomp, RExC_precomp + offset); \
321} STMT_END
b45f050a
JF
322
323/*
324 * Calls SAVEDESTRUCTOR_X if needed, then Simple_vFAIL2().
325 */
ccb2c380
MP
326#define vFAIL2(m,a1) STMT_START { \
327 if (!SIZE_ONLY) \
328 SAVEDESTRUCTOR_X(clear_re,(void*)RExC_rx); \
329 Simple_vFAIL2(m, a1); \
330} STMT_END
b45f050a
JF
331
332
333/*
334 * Like Simple_vFAIL(), but accepts three arguments.
335 */
ccb2c380
MP
336#define Simple_vFAIL3(m, a1, a2) STMT_START { \
337 IV offset = RExC_parse - RExC_precomp; \
338 S_re_croak2(aTHX_ m, REPORT_LOCATION, a1, a2, \
339 (int)offset, RExC_precomp, RExC_precomp + offset); \
340} STMT_END
b45f050a
JF
341
342/*
343 * Calls SAVEDESTRUCTOR_X if needed, then Simple_vFAIL3().
344 */
ccb2c380
MP
345#define vFAIL3(m,a1,a2) STMT_START { \
346 if (!SIZE_ONLY) \
347 SAVEDESTRUCTOR_X(clear_re,(void*)RExC_rx); \
348 Simple_vFAIL3(m, a1, a2); \
349} STMT_END
b45f050a
JF
350
351/*
352 * Like Simple_vFAIL(), but accepts four arguments.
353 */
ccb2c380
MP
354#define Simple_vFAIL4(m, a1, a2, a3) STMT_START { \
355 IV offset = RExC_parse - RExC_precomp; \
356 S_re_croak2(aTHX_ m, REPORT_LOCATION, a1, a2, a3, \
357 (int)offset, RExC_precomp, RExC_precomp + offset); \
358} STMT_END
b45f050a
JF
359
360/*
361 * Like Simple_vFAIL(), but accepts five arguments.
362 */
ccb2c380
MP
363#define Simple_vFAIL5(m, a1, a2, a3, a4) STMT_START { \
364 IV offset = RExC_parse - RExC_precomp; \
365 S_re_croak2(aTHX_ m, REPORT_LOCATION, a1, a2, a3, a4, \
366 (int)offset, RExC_precomp, RExC_precomp + offset); \
367} STMT_END
368
369
370#define vWARN(loc,m) STMT_START { \
371 IV offset = loc - RExC_precomp; \
372 Perl_warner(aTHX_ packWARN(WARN_REGEXP), "%s" REPORT_LOCATION, \
373 m, (int)offset, RExC_precomp, RExC_precomp + offset); \
374} STMT_END
375
376#define vWARNdep(loc,m) STMT_START { \
377 IV offset = loc - RExC_precomp; \
378 Perl_warner(aTHX_ packWARN2(WARN_DEPRECATED, WARN_REGEXP), \
379 "%s" REPORT_LOCATION, \
380 m, (int)offset, RExC_precomp, RExC_precomp + offset); \
381} STMT_END
382
383
384#define vWARN2(loc, m, a1) STMT_START { \
385 IV offset = loc - RExC_precomp; \
386 Perl_warner(aTHX_ packWARN(WARN_REGEXP), m REPORT_LOCATION, \
387 a1, (int)offset, RExC_precomp, RExC_precomp + offset); \
388} STMT_END
389
390#define vWARN3(loc, m, a1, a2) STMT_START { \
391 IV offset = loc - RExC_precomp; \
392 Perl_warner(aTHX_ packWARN(WARN_REGEXP), m REPORT_LOCATION, \
393 a1, a2, (int)offset, RExC_precomp, RExC_precomp + offset); \
394} STMT_END
395
396#define vWARN4(loc, m, a1, a2, a3) STMT_START { \
397 IV offset = loc - RExC_precomp; \
398 Perl_warner(aTHX_ packWARN(WARN_REGEXP), m REPORT_LOCATION, \
399 a1, a2, a3, (int)offset, RExC_precomp, RExC_precomp + offset); \
400} STMT_END
401
402#define vWARN5(loc, m, a1, a2, a3, a4) STMT_START { \
403 IV offset = loc - RExC_precomp; \
404 Perl_warner(aTHX_ packWARN(WARN_REGEXP), m REPORT_LOCATION, \
405 a1, a2, a3, a4, (int)offset, RExC_precomp, RExC_precomp + offset); \
406} STMT_END
9d1d55b5 407
8615cb43 408
cd439c50 409/* Allow for side effects in s */
ccb2c380
MP
410#define REGC(c,s) STMT_START { \
411 if (!SIZE_ONLY) *(s) = (c); else (void)(s); \
412} STMT_END
cd439c50 413
fac92740
MJD
414/* Macros for recording node offsets. 20001227 mjd@plover.com
415 * Nodes are numbered 1, 2, 3, 4. Node #n's position is recorded in
416 * element 2*n-1 of the array. Element #2n holds the byte length node #n.
417 * Element 0 holds the number n.
418 */
419
420#define MJD_OFFSET_DEBUG(x)
ccb2c380
MP
421/* #define MJD_OFFSET_DEBUG(x) Perl_warn_nocontext x */
422
423
424#define Set_Node_Offset_To_R(node,byte) STMT_START { \
425 if (! SIZE_ONLY) { \
426 MJD_OFFSET_DEBUG(("** (%d) offset of node %d is %d.\n", \
427 __LINE__, (node), (byte))); \
428 if((node) < 0) { \
429 Perl_croak(aTHX_ "value of node is %d in Offset macro", node); \
430 } else { \
431 RExC_offsets[2*(node)-1] = (byte); \
432 } \
433 } \
434} STMT_END
435
436#define Set_Node_Offset(node,byte) \
437 Set_Node_Offset_To_R((node)-RExC_emit_start, (byte)-RExC_start)
438#define Set_Cur_Node_Offset Set_Node_Offset(RExC_emit, RExC_parse)
439
440#define Set_Node_Length_To_R(node,len) STMT_START { \
441 if (! SIZE_ONLY) { \
442 MJD_OFFSET_DEBUG(("** (%d) size of node %d is %d.\n", \
443 __LINE__, (node), (len))); \
444 if((node) < 0) { \
445 Perl_croak(aTHX_ "value of node is %d in Length macro", node); \
446 } else { \
447 RExC_offsets[2*(node)] = (len); \
448 } \
449 } \
450} STMT_END
451
452#define Set_Node_Length(node,len) \
453 Set_Node_Length_To_R((node)-RExC_emit_start, len)
454#define Set_Cur_Node_Length(len) Set_Node_Length(RExC_emit, len)
455#define Set_Node_Cur_Length(node) \
456 Set_Node_Length(node, RExC_parse - parse_start)
fac92740
MJD
457
458/* Get offsets and lengths */
459#define Node_Offset(n) (RExC_offsets[2*((n)-RExC_emit_start)-1])
460#define Node_Length(n) (RExC_offsets[2*((n)-RExC_emit_start)])
461
acfe0abc 462static void clear_re(pTHX_ void *r);
4327152a 463
653099ff
GS
464/* Mark that we cannot extend a found fixed substring at this point.
465 Updata the longest found anchored substring and the longest found
466 floating substrings if needed. */
467
4327152a 468STATIC void
830247a4 469S_scan_commit(pTHX_ RExC_state_t *pRExC_state, scan_data_t *data)
c277df42 470{
a0ed51b3
LW
471 STRLEN l = CHR_SVLEN(data->last_found);
472 STRLEN old_l = CHR_SVLEN(*data->longest);
b81d288d 473
c277df42
IZ
474 if ((l >= old_l) && ((l > old_l) || (data->flags & SF_BEFORE_EOL))) {
475 sv_setsv(*data->longest, data->last_found);
476 if (*data->longest == data->longest_fixed) {
477 data->offset_fixed = l ? data->last_start_min : data->pos_min;
478 if (data->flags & SF_BEFORE_EOL)
b81d288d 479 data->flags
c277df42
IZ
480 |= ((data->flags & SF_BEFORE_EOL) << SF_FIX_SHIFT_EOL);
481 else
482 data->flags &= ~SF_FIX_BEFORE_EOL;
a0ed51b3
LW
483 }
484 else {
c277df42 485 data->offset_float_min = l ? data->last_start_min : data->pos_min;
b81d288d
AB
486 data->offset_float_max = (l
487 ? data->last_start_max
c277df42 488 : data->pos_min + data->pos_delta);
9051bda5
HS
489 if ((U32)data->offset_float_max > (U32)I32_MAX)
490 data->offset_float_max = I32_MAX;
c277df42 491 if (data->flags & SF_BEFORE_EOL)
b81d288d 492 data->flags
c277df42
IZ
493 |= ((data->flags & SF_BEFORE_EOL) << SF_FL_SHIFT_EOL);
494 else
495 data->flags &= ~SF_FL_BEFORE_EOL;
496 }
497 }
498 SvCUR_set(data->last_found, 0);
0eda9292
JH
499 {
500 SV * sv = data->last_found;
501 MAGIC *mg =
502 SvUTF8(sv) && SvMAGICAL(sv) ? mg_find(sv, PERL_MAGIC_utf8) : NULL;
503 if (mg && mg->mg_len > 0)
504 mg->mg_len = 0;
505 }
c277df42
IZ
506 data->last_end = -1;
507 data->flags &= ~SF_BEFORE_EOL;
508}
509
653099ff
GS
510/* Can match anything (initialization) */
511STATIC void
830247a4 512S_cl_anything(pTHX_ RExC_state_t *pRExC_state, struct regnode_charclass_class *cl)
653099ff 513{
653099ff 514 ANYOF_CLASS_ZERO(cl);
f8bef550 515 ANYOF_BITMAP_SETALL(cl);
1aa99e6b 516 cl->flags = ANYOF_EOS|ANYOF_UNICODE_ALL;
653099ff
GS
517 if (LOC)
518 cl->flags |= ANYOF_LOCALE;
519}
520
521/* Can match anything (initialization) */
522STATIC int
523S_cl_is_anything(pTHX_ struct regnode_charclass_class *cl)
524{
525 int value;
526
aaa51d5e 527 for (value = 0; value <= ANYOF_MAX; value += 2)
653099ff
GS
528 if (ANYOF_CLASS_TEST(cl, value) && ANYOF_CLASS_TEST(cl, value + 1))
529 return 1;
1aa99e6b
IH
530 if (!(cl->flags & ANYOF_UNICODE_ALL))
531 return 0;
f8bef550
NC
532 if (!ANYOF_BITMAP_TESTALLSET(cl))
533 return 0;
653099ff
GS
534 return 1;
535}
536
537/* Can match anything (initialization) */
538STATIC void
830247a4 539S_cl_init(pTHX_ RExC_state_t *pRExC_state, struct regnode_charclass_class *cl)
653099ff 540{
8ecf7187 541 Zero(cl, 1, struct regnode_charclass_class);
653099ff 542 cl->type = ANYOF;
830247a4 543 cl_anything(pRExC_state, cl);
653099ff
GS
544}
545
546STATIC void
830247a4 547S_cl_init_zero(pTHX_ RExC_state_t *pRExC_state, struct regnode_charclass_class *cl)
653099ff 548{
8ecf7187 549 Zero(cl, 1, struct regnode_charclass_class);
653099ff 550 cl->type = ANYOF;
830247a4 551 cl_anything(pRExC_state, cl);
653099ff
GS
552 if (LOC)
553 cl->flags |= ANYOF_LOCALE;
554}
555
556/* 'And' a given class with another one. Can create false positives */
557/* We assume that cl is not inverted */
558STATIC void
559S_cl_and(pTHX_ struct regnode_charclass_class *cl,
560 struct regnode_charclass_class *and_with)
561{
653099ff
GS
562 if (!(and_with->flags & ANYOF_CLASS)
563 && !(cl->flags & ANYOF_CLASS)
564 && (and_with->flags & ANYOF_LOCALE) == (cl->flags & ANYOF_LOCALE)
565 && !(and_with->flags & ANYOF_FOLD)
566 && !(cl->flags & ANYOF_FOLD)) {
567 int i;
568
569 if (and_with->flags & ANYOF_INVERT)
570 for (i = 0; i < ANYOF_BITMAP_SIZE; i++)
571 cl->bitmap[i] &= ~and_with->bitmap[i];
572 else
573 for (i = 0; i < ANYOF_BITMAP_SIZE; i++)
574 cl->bitmap[i] &= and_with->bitmap[i];
575 } /* XXXX: logic is complicated otherwise, leave it along for a moment. */
576 if (!(and_with->flags & ANYOF_EOS))
577 cl->flags &= ~ANYOF_EOS;
1aa99e6b
IH
578
579 if (cl->flags & ANYOF_UNICODE_ALL && and_with->flags & ANYOF_UNICODE) {
580 cl->flags &= ~ANYOF_UNICODE_ALL;
581 cl->flags |= ANYOF_UNICODE;
582 ARG_SET(cl, ARG(and_with));
583 }
584 if (!(and_with->flags & ANYOF_UNICODE_ALL))
585 cl->flags &= ~ANYOF_UNICODE_ALL;
586 if (!(and_with->flags & (ANYOF_UNICODE|ANYOF_UNICODE_ALL)))
587 cl->flags &= ~ANYOF_UNICODE;
653099ff
GS
588}
589
590/* 'OR' a given class with another one. Can create false positives */
591/* We assume that cl is not inverted */
592STATIC void
830247a4 593S_cl_or(pTHX_ RExC_state_t *pRExC_state, struct regnode_charclass_class *cl, struct regnode_charclass_class *or_with)
653099ff 594{
653099ff
GS
595 if (or_with->flags & ANYOF_INVERT) {
596 /* We do not use
597 * (B1 | CL1) | (!B2 & !CL2) = (B1 | !B2 & !CL2) | (CL1 | (!B2 & !CL2))
598 * <= (B1 | !B2) | (CL1 | !CL2)
599 * which is wasteful if CL2 is small, but we ignore CL2:
600 * (B1 | CL1) | (!B2 & !CL2) <= (B1 | CL1) | !B2 = (B1 | !B2) | CL1
601 * XXXX Can we handle case-fold? Unclear:
602 * (OK1(i) | OK1(i')) | !(OK1(i) | OK1(i')) =
603 * (OK1(i) | OK1(i')) | (!OK1(i) & !OK1(i'))
604 */
605 if ( (or_with->flags & ANYOF_LOCALE) == (cl->flags & ANYOF_LOCALE)
606 && !(or_with->flags & ANYOF_FOLD)
607 && !(cl->flags & ANYOF_FOLD) ) {
608 int i;
609
610 for (i = 0; i < ANYOF_BITMAP_SIZE; i++)
611 cl->bitmap[i] |= ~or_with->bitmap[i];
612 } /* XXXX: logic is complicated otherwise */
613 else {
830247a4 614 cl_anything(pRExC_state, cl);
653099ff
GS
615 }
616 } else {
617 /* (B1 | CL1) | (B2 | CL2) = (B1 | B2) | (CL1 | CL2)) */
618 if ( (or_with->flags & ANYOF_LOCALE) == (cl->flags & ANYOF_LOCALE)
b81d288d 619 && (!(or_with->flags & ANYOF_FOLD)
653099ff
GS
620 || (cl->flags & ANYOF_FOLD)) ) {
621 int i;
622
623 /* OR char bitmap and class bitmap separately */
624 for (i = 0; i < ANYOF_BITMAP_SIZE; i++)
625 cl->bitmap[i] |= or_with->bitmap[i];
626 if (or_with->flags & ANYOF_CLASS) {
627 for (i = 0; i < ANYOF_CLASSBITMAP_SIZE; i++)
628 cl->classflags[i] |= or_with->classflags[i];
629 cl->flags |= ANYOF_CLASS;
630 }
631 }
632 else { /* XXXX: logic is complicated, leave it along for a moment. */
830247a4 633 cl_anything(pRExC_state, cl);
653099ff
GS
634 }
635 }
636 if (or_with->flags & ANYOF_EOS)
637 cl->flags |= ANYOF_EOS;
1aa99e6b
IH
638
639 if (cl->flags & ANYOF_UNICODE && or_with->flags & ANYOF_UNICODE &&
640 ARG(cl) != ARG(or_with)) {
641 cl->flags |= ANYOF_UNICODE_ALL;
642 cl->flags &= ~ANYOF_UNICODE;
643 }
644 if (or_with->flags & ANYOF_UNICODE_ALL) {
645 cl->flags |= ANYOF_UNICODE_ALL;
646 cl->flags &= ~ANYOF_UNICODE;
647 }
653099ff
GS
648}
649
5d1c421c
JH
650/*
651 * There are strange code-generation bugs caused on sparc64 by gcc-2.95.2.
652 * These need to be revisited when a newer toolchain becomes available.
653 */
654#if defined(__sparc64__) && defined(__GNUC__)
655# if __GNUC__ < 2 || (__GNUC__ == 2 && __GNUC_MINOR__ < 96)
656# undef SPARC64_GCC_WORKAROUND
657# define SPARC64_GCC_WORKAROUND 1
658# endif
659#endif
660
653099ff
GS
661/* REx optimizer. Converts nodes into quickier variants "in place".
662 Finds fixed substrings. */
663
c277df42
IZ
664/* Stops at toplevel WHILEM as well as at `last'. At end *scanp is set
665 to the position after last scanned or to NULL. */
666
76e3520e 667STATIC I32
830247a4 668S_study_chunk(pTHX_ RExC_state_t *pRExC_state, regnode **scanp, I32 *deltap, regnode *last, scan_data_t *data, U32 flags)
c277df42
IZ
669 /* scanp: Start here (read-write). */
670 /* deltap: Write maxlen-minlen here. */
671 /* last: Stop before this one. */
672{
673 I32 min = 0, pars = 0, code;
674 regnode *scan = *scanp, *next;
675 I32 delta = 0;
676 int is_inf = (flags & SCF_DO_SUBSTR) && (data->flags & SF_IS_INF);
aca2d497 677 int is_inf_internal = 0; /* The studied chunk is infinite */
c277df42
IZ
678 I32 is_par = OP(scan) == OPEN ? ARG(scan) : 0;
679 scan_data_t data_fake;
653099ff 680 struct regnode_charclass_class and_with; /* Valid if flags & SCF_DO_STCLASS_OR */
b81d288d 681
c277df42
IZ
682 while (scan && OP(scan) != END && scan < last) {
683 /* Peephole optimizer: */
684
22c35a8c 685 if (PL_regkind[(U8)OP(scan)] == EXACT) {
653099ff 686 /* Merge several consecutive EXACTish nodes into one. */
c277df42
IZ
687 regnode *n = regnext(scan);
688 U32 stringok = 1;
689#ifdef DEBUGGING
690 regnode *stop = scan;
b81d288d 691#endif
c277df42 692
cd439c50 693 next = scan + NODE_SZ_STR(scan);
c277df42
IZ
694 /* Skip NOTHING, merge EXACT*. */
695 while (n &&
b81d288d 696 ( PL_regkind[(U8)OP(n)] == NOTHING ||
c277df42
IZ
697 (stringok && (OP(n) == OP(scan))))
698 && NEXT_OFF(n)
699 && NEXT_OFF(scan) + NEXT_OFF(n) < I16_MAX) {
700 if (OP(n) == TAIL || n > next)
701 stringok = 0;
22c35a8c 702 if (PL_regkind[(U8)OP(n)] == NOTHING) {
c277df42
IZ
703 NEXT_OFF(scan) += NEXT_OFF(n);
704 next = n + NODE_STEP_REGNODE;
705#ifdef DEBUGGING
706 if (stringok)
707 stop = n;
b81d288d 708#endif
c277df42 709 n = regnext(n);
a0ed51b3 710 }
f49d4d0f 711 else if (stringok) {
cd439c50 712 int oldl = STR_LEN(scan);
c277df42 713 regnode *nnext = regnext(n);
f49d4d0f 714
b81d288d 715 if (oldl + STR_LEN(n) > U8_MAX)
c277df42
IZ
716 break;
717 NEXT_OFF(scan) += NEXT_OFF(n);
cd439c50
IZ
718 STR_LEN(scan) += STR_LEN(n);
719 next = n + NODE_SZ_STR(n);
c277df42 720 /* Now we can overwrite *n : */
f49d4d0f 721 Move(STRING(n), STRING(scan) + oldl, STR_LEN(n), char);
c277df42 722#ifdef DEBUGGING
f49d4d0f 723 stop = next - 1;
b81d288d 724#endif
c277df42
IZ
725 n = nnext;
726 }
727 }
61a36c01 728
d65e4eab 729 if (UTF && OP(scan) == EXACTF && STR_LEN(scan) >= 6) {
61a36c01
JH
730/*
731 Two problematic code points in Unicode casefolding of EXACT nodes:
732
733 U+0390 - GREEK SMALL LETTER IOTA WITH DIALYTIKA AND TONOS
734 U+03B0 - GREEK SMALL LETTER UPSILON WITH DIALYTIKA AND TONOS
735
736 which casefold to
737
738 Unicode UTF-8
739
740 U+03B9 U+0308 U+0301 0xCE 0xB9 0xCC 0x88 0xCC 0x81
741 U+03C5 U+0308 U+0301 0xCF 0x85 0xCC 0x88 0xCC 0x81
742
743 This means that in case-insensitive matching (or "loose matching",
744 as Unicode calls it), an EXACTF of length six (the UTF-8 encoded byte
745 length of the above casefolded versions) can match a target string
746 of length two (the byte length of UTF-8 encoded U+0390 or U+03B0).
747 This would rather mess up the minimum length computation.
748
749 What we'll do is to look for the tail four bytes, and then peek
750 at the preceding two bytes to see whether we need to decrease
751 the minimum length by four (six minus two).
752
753 Thanks to the design of UTF-8, there cannot be false matches:
754 A sequence of valid UTF-8 bytes cannot be a subsequence of
755 another valid sequence of UTF-8 bytes.
756
757*/
758 char *s0 = STRING(scan), *s, *t;
759 char *s1 = s0 + STR_LEN(scan) - 1, *s2 = s1 - 4;
760 char *t0 = "\xcc\x88\xcc\x81";
761 char *t1 = t0 + 3;
762
763 for (s = s0 + 2;
764 s < s2 && (t = ninstr(s, s1, t0, t1));
765 s = t + 4) {
766 if (((U8)t[-1] == 0xB9 && (U8)t[-2] == 0xCE) ||
767 ((U8)t[-1] == 0x85 && (U8)t[-2] == 0xCF))
768 min -= 4;
769 }
770 }
771
c277df42
IZ
772#ifdef DEBUGGING
773 /* Allow dumping */
cd439c50 774 n = scan + NODE_SZ_STR(scan);
c277df42 775 while (n <= stop) {
22c35a8c 776 if (PL_regkind[(U8)OP(n)] != NOTHING || OP(n) == NOTHING) {
c277df42
IZ
777 OP(n) = OPTIMIZED;
778 NEXT_OFF(n) = 0;
779 }
780 n++;
781 }
653099ff 782#endif
c277df42 783 }
653099ff
GS
784 /* Follow the next-chain of the current node and optimize
785 away all the NOTHINGs from it. */
c277df42 786 if (OP(scan) != CURLYX) {
048cfca1
GS
787 int max = (reg_off_by_arg[OP(scan)]
788 ? I32_MAX
789 /* I32 may be smaller than U16 on CRAYs! */
790 : (I32_MAX < U16_MAX ? I32_MAX : U16_MAX));
c277df42
IZ
791 int off = (reg_off_by_arg[OP(scan)] ? ARG(scan) : NEXT_OFF(scan));
792 int noff;
793 regnode *n = scan;
b81d288d 794
c277df42
IZ
795 /* Skip NOTHING and LONGJMP. */
796 while ((n = regnext(n))
22c35a8c 797 && ((PL_regkind[(U8)OP(n)] == NOTHING && (noff = NEXT_OFF(n)))
c277df42
IZ
798 || ((OP(n) == LONGJMP) && (noff = ARG(n))))
799 && off + noff < max)
800 off += noff;
801 if (reg_off_by_arg[OP(scan)])
802 ARG(scan) = off;
b81d288d 803 else
c277df42
IZ
804 NEXT_OFF(scan) = off;
805 }
653099ff
GS
806 /* The principal pseudo-switch. Cannot be a switch, since we
807 look into several different things. */
b81d288d 808 if (OP(scan) == BRANCH || OP(scan) == BRANCHJ
c277df42
IZ
809 || OP(scan) == IFTHEN || OP(scan) == SUSPEND) {
810 next = regnext(scan);
811 code = OP(scan);
b81d288d
AB
812
813 if (OP(next) == code || code == IFTHEN || code == SUSPEND) {
c277df42 814 I32 max1 = 0, min1 = I32_MAX, num = 0;
653099ff 815 struct regnode_charclass_class accum;
c277df42 816
653099ff 817 if (flags & SCF_DO_SUBSTR) /* XXXX Add !SUSPEND? */
830247a4 818 scan_commit(pRExC_state, data); /* Cannot merge strings after this. */
653099ff 819 if (flags & SCF_DO_STCLASS)
830247a4 820 cl_init_zero(pRExC_state, &accum);
c277df42 821 while (OP(scan) == code) {
830247a4 822 I32 deltanext, minnext, f = 0, fake;
653099ff 823 struct regnode_charclass_class this_class;
c277df42
IZ
824
825 num++;
826 data_fake.flags = 0;
b81d288d 827 if (data) {
2c2d71f5 828 data_fake.whilem_c = data->whilem_c;
cb434fcc
IZ
829 data_fake.last_closep = data->last_closep;
830 }
831 else
832 data_fake.last_closep = &fake;
c277df42
IZ
833 next = regnext(scan);
834 scan = NEXTOPER(scan);
835 if (code != BRANCH)
836 scan = NEXTOPER(scan);
653099ff 837 if (flags & SCF_DO_STCLASS) {
830247a4 838 cl_init(pRExC_state, &this_class);
653099ff
GS
839 data_fake.start_class = &this_class;
840 f = SCF_DO_STCLASS_AND;
b81d288d 841 }
e1901655
IZ
842 if (flags & SCF_WHILEM_VISITED_POS)
843 f |= SCF_WHILEM_VISITED_POS;
653099ff 844 /* we suppose the run is continuous, last=next...*/
830247a4
IZ
845 minnext = study_chunk(pRExC_state, &scan, &deltanext,
846 next, &data_fake, f);
b81d288d 847 if (min1 > minnext)
c277df42
IZ
848 min1 = minnext;
849 if (max1 < minnext + deltanext)
850 max1 = minnext + deltanext;
851 if (deltanext == I32_MAX)
aca2d497 852 is_inf = is_inf_internal = 1;
c277df42
IZ
853 scan = next;
854 if (data_fake.flags & (SF_HAS_PAR|SF_IN_PAR))
855 pars++;
405ff068 856 if (data && (data_fake.flags & SF_HAS_EVAL))
c277df42 857 data->flags |= SF_HAS_EVAL;
2c2d71f5
JH
858 if (data)
859 data->whilem_c = data_fake.whilem_c;
653099ff 860 if (flags & SCF_DO_STCLASS)
830247a4 861 cl_or(pRExC_state, &accum, &this_class);
b81d288d 862 if (code == SUSPEND)
c277df42
IZ
863 break;
864 }
865 if (code == IFTHEN && num < 2) /* Empty ELSE branch */
866 min1 = 0;
867 if (flags & SCF_DO_SUBSTR) {
868 data->pos_min += min1;
869 data->pos_delta += max1 - min1;
870 if (max1 != min1 || is_inf)
871 data->longest = &(data->longest_float);
872 }
873 min += min1;
874 delta += max1 - min1;
653099ff 875 if (flags & SCF_DO_STCLASS_OR) {
830247a4 876 cl_or(pRExC_state, data->start_class, &accum);
653099ff
GS
877 if (min1) {
878 cl_and(data->start_class, &and_with);
879 flags &= ~SCF_DO_STCLASS;
880 }
881 }
882 else if (flags & SCF_DO_STCLASS_AND) {
de0c8cb8
GS
883 if (min1) {
884 cl_and(data->start_class, &accum);
653099ff 885 flags &= ~SCF_DO_STCLASS;
de0c8cb8
GS
886 }
887 else {
b81d288d 888 /* Switch to OR mode: cache the old value of
de0c8cb8
GS
889 * data->start_class */
890 StructCopy(data->start_class, &and_with,
891 struct regnode_charclass_class);
892 flags &= ~SCF_DO_STCLASS_AND;
893 StructCopy(&accum, data->start_class,
894 struct regnode_charclass_class);
895 flags |= SCF_DO_STCLASS_OR;
896 data->start_class->flags |= ANYOF_EOS;
897 }
653099ff 898 }
a0ed51b3
LW
899 }
900 else if (code == BRANCHJ) /* single branch is optimized. */
c277df42
IZ
901 scan = NEXTOPER(NEXTOPER(scan));
902 else /* single branch is optimized. */
903 scan = NEXTOPER(scan);
904 continue;
a0ed51b3
LW
905 }
906 else if (OP(scan) == EXACT) {
cd439c50 907 I32 l = STR_LEN(scan);
1aa99e6b 908 UV uc = *((U8*)STRING(scan));
a0ed51b3 909 if (UTF) {
1aa99e6b
IH
910 U8 *s = (U8*)STRING(scan);
911 l = utf8_length(s, s + l);
9041c2e3 912 uc = utf8_to_uvchr(s, NULL);
a0ed51b3
LW
913 }
914 min += l;
c277df42 915 if (flags & SCF_DO_SUBSTR) { /* Update longest substr. */
c277df42
IZ
916 /* The code below prefers earlier match for fixed
917 offset, later match for variable offset. */
918 if (data->last_end == -1) { /* Update the start info. */
919 data->last_start_min = data->pos_min;
920 data->last_start_max = is_inf
b81d288d 921 ? I32_MAX : data->pos_min + data->pos_delta;
c277df42 922 }
cd439c50 923 sv_catpvn(data->last_found, STRING(scan), STR_LEN(scan));
0eda9292
JH
924 {
925 SV * sv = data->last_found;
926 MAGIC *mg = SvUTF8(sv) && SvMAGICAL(sv) ?
927 mg_find(sv, PERL_MAGIC_utf8) : NULL;
928 if (mg && mg->mg_len >= 0)
5e43f467
JH
929 mg->mg_len += utf8_length((U8*)STRING(scan),
930 (U8*)STRING(scan)+STR_LEN(scan));
0eda9292 931 }
33b8afdf
JH
932 if (UTF)
933 SvUTF8_on(data->last_found);
c277df42
IZ
934 data->last_end = data->pos_min + l;
935 data->pos_min += l; /* As in the first entry. */
936 data->flags &= ~SF_BEFORE_EOL;
937 }
653099ff
GS
938 if (flags & SCF_DO_STCLASS_AND) {
939 /* Check whether it is compatible with what we know already! */
940 int compat = 1;
941
1aa99e6b 942 if (uc >= 0x100 ||
516a5887 943 (!(data->start_class->flags & (ANYOF_CLASS | ANYOF_LOCALE))
1aa99e6b 944 && !ANYOF_BITMAP_TEST(data->start_class, uc)
653099ff 945 && (!(data->start_class->flags & ANYOF_FOLD)
1aa99e6b 946 || !ANYOF_BITMAP_TEST(data->start_class, PL_fold[uc])))
516a5887 947 )
653099ff
GS
948 compat = 0;
949 ANYOF_CLASS_ZERO(data->start_class);
950 ANYOF_BITMAP_ZERO(data->start_class);
951 if (compat)
1aa99e6b 952 ANYOF_BITMAP_SET(data->start_class, uc);
653099ff 953 data->start_class->flags &= ~ANYOF_EOS;
9b877dbb
IH
954 if (uc < 0x100)
955 data->start_class->flags &= ~ANYOF_UNICODE_ALL;
653099ff
GS
956 }
957 else if (flags & SCF_DO_STCLASS_OR) {
958 /* false positive possible if the class is case-folded */
1aa99e6b 959 if (uc < 0x100)
9b877dbb
IH
960 ANYOF_BITMAP_SET(data->start_class, uc);
961 else
962 data->start_class->flags |= ANYOF_UNICODE_ALL;
653099ff
GS
963 data->start_class->flags &= ~ANYOF_EOS;
964 cl_and(data->start_class, &and_with);
965 }
966 flags &= ~SCF_DO_STCLASS;
a0ed51b3 967 }
653099ff 968 else if (PL_regkind[(U8)OP(scan)] == EXACT) { /* But OP != EXACT! */
cd439c50 969 I32 l = STR_LEN(scan);
1aa99e6b 970 UV uc = *((U8*)STRING(scan));
653099ff
GS
971
972 /* Search for fixed substrings supports EXACT only. */
b81d288d 973 if (flags & SCF_DO_SUBSTR)
830247a4 974 scan_commit(pRExC_state, data);
a0ed51b3 975 if (UTF) {
1aa99e6b
IH
976 U8 *s = (U8 *)STRING(scan);
977 l = utf8_length(s, s + l);
9041c2e3 978 uc = utf8_to_uvchr(s, NULL);
a0ed51b3
LW
979 }
980 min += l;
c277df42 981 if (data && (flags & SCF_DO_SUBSTR))
a0ed51b3 982 data->pos_min += l;
653099ff
GS
983 if (flags & SCF_DO_STCLASS_AND) {
984 /* Check whether it is compatible with what we know already! */
985 int compat = 1;
986
1aa99e6b 987 if (uc >= 0x100 ||
516a5887 988 (!(data->start_class->flags & (ANYOF_CLASS | ANYOF_LOCALE))
1aa99e6b 989 && !ANYOF_BITMAP_TEST(data->start_class, uc)
516a5887 990 && !ANYOF_BITMAP_TEST(data->start_class, PL_fold[uc])))
653099ff
GS
991 compat = 0;
992 ANYOF_CLASS_ZERO(data->start_class);
993 ANYOF_BITMAP_ZERO(data->start_class);
994 if (compat) {
1aa99e6b 995 ANYOF_BITMAP_SET(data->start_class, uc);
653099ff
GS
996 data->start_class->flags &= ~ANYOF_EOS;
997 data->start_class->flags |= ANYOF_FOLD;
998 if (OP(scan) == EXACTFL)
999 data->start_class->flags |= ANYOF_LOCALE;
1000 }
1001 }
1002 else if (flags & SCF_DO_STCLASS_OR) {
1003 if (data->start_class->flags & ANYOF_FOLD) {
1004 /* false positive possible if the class is case-folded.
1005 Assume that the locale settings are the same... */
1aa99e6b
IH
1006 if (uc < 0x100)
1007 ANYOF_BITMAP_SET(data->start_class, uc);
653099ff
GS
1008 data->start_class->flags &= ~ANYOF_EOS;
1009 }
1010 cl_and(data->start_class, &and_with);
1011 }
1012 flags &= ~SCF_DO_STCLASS;
a0ed51b3 1013 }
4d61ec05 1014 else if (strchr((char*)PL_varies,OP(scan))) {
9c5ffd7c 1015 I32 mincount, maxcount, minnext, deltanext, fl = 0;
aa7a4b56 1016 I32 f = flags, pos_before = 0;
c277df42 1017 regnode *oscan = scan;
653099ff
GS
1018 struct regnode_charclass_class this_class;
1019 struct regnode_charclass_class *oclass = NULL;
727f22e3 1020 I32 next_is_eval = 0;
653099ff 1021
22c35a8c 1022 switch (PL_regkind[(U8)OP(scan)]) {
653099ff 1023 case WHILEM: /* End of (?:...)* . */
c277df42
IZ
1024 scan = NEXTOPER(scan);
1025 goto finish;
1026 case PLUS:
653099ff 1027 if (flags & (SCF_DO_SUBSTR | SCF_DO_STCLASS)) {
c277df42 1028 next = NEXTOPER(scan);
653099ff 1029 if (OP(next) == EXACT || (flags & SCF_DO_STCLASS)) {
b81d288d
AB
1030 mincount = 1;
1031 maxcount = REG_INFTY;
c277df42
IZ
1032 next = regnext(scan);
1033 scan = NEXTOPER(scan);
1034 goto do_curly;
1035 }
1036 }
1037 if (flags & SCF_DO_SUBSTR)
1038 data->pos_min++;
1039 min++;
1040 /* Fall through. */
1041 case STAR:
653099ff
GS
1042 if (flags & SCF_DO_STCLASS) {
1043 mincount = 0;
b81d288d 1044 maxcount = REG_INFTY;
653099ff
GS
1045 next = regnext(scan);
1046 scan = NEXTOPER(scan);
1047 goto do_curly;
1048 }
b81d288d 1049 is_inf = is_inf_internal = 1;
c277df42
IZ
1050 scan = regnext(scan);
1051 if (flags & SCF_DO_SUBSTR) {
830247a4 1052 scan_commit(pRExC_state, data); /* Cannot extend fixed substrings */
c277df42
IZ
1053 data->longest = &(data->longest_float);
1054 }
1055 goto optimize_curly_tail;
1056 case CURLY:
b81d288d 1057 mincount = ARG1(scan);
c277df42
IZ
1058 maxcount = ARG2(scan);
1059 next = regnext(scan);
cb434fcc
IZ
1060 if (OP(scan) == CURLYX) {
1061 I32 lp = (data ? *(data->last_closep) : 0);
1062
1063 scan->flags = ((lp <= U8_MAX) ? lp : U8_MAX);
1064 }
c277df42 1065 scan = NEXTOPER(scan) + EXTRA_STEP_2ARGS;
727f22e3 1066 next_is_eval = (OP(scan) == EVAL);
c277df42
IZ
1067 do_curly:
1068 if (flags & SCF_DO_SUBSTR) {
830247a4 1069 if (mincount == 0) scan_commit(pRExC_state,data); /* Cannot extend fixed substrings */
c277df42
IZ
1070 pos_before = data->pos_min;
1071 }
1072 if (data) {
1073 fl = data->flags;
1074 data->flags &= ~(SF_HAS_PAR|SF_IN_PAR|SF_HAS_EVAL);
1075 if (is_inf)
1076 data->flags |= SF_IS_INF;
1077 }
653099ff 1078 if (flags & SCF_DO_STCLASS) {
830247a4 1079 cl_init(pRExC_state, &this_class);
653099ff
GS
1080 oclass = data->start_class;
1081 data->start_class = &this_class;
1082 f |= SCF_DO_STCLASS_AND;
1083 f &= ~SCF_DO_STCLASS_OR;
1084 }
e1901655
IZ
1085 /* These are the cases when once a subexpression
1086 fails at a particular position, it cannot succeed
1087 even after backtracking at the enclosing scope.
b81d288d 1088
e1901655
IZ
1089 XXXX what if minimal match and we are at the
1090 initial run of {n,m}? */
1091 if ((mincount != maxcount - 1) && (maxcount != REG_INFTY))
1092 f &= ~SCF_WHILEM_VISITED_POS;
653099ff 1093
c277df42 1094 /* This will finish on WHILEM, setting scan, or on NULL: */
b81d288d
AB
1095 minnext = study_chunk(pRExC_state, &scan, &deltanext, last, data,
1096 mincount == 0
653099ff
GS
1097 ? (f & ~SCF_DO_SUBSTR) : f);
1098
1099 if (flags & SCF_DO_STCLASS)
1100 data->start_class = oclass;
1101 if (mincount == 0 || minnext == 0) {
1102 if (flags & SCF_DO_STCLASS_OR) {
830247a4 1103 cl_or(pRExC_state, data->start_class, &this_class);
653099ff
GS
1104 }
1105 else if (flags & SCF_DO_STCLASS_AND) {
b81d288d 1106 /* Switch to OR mode: cache the old value of
653099ff
GS
1107 * data->start_class */
1108 StructCopy(data->start_class, &and_with,
1109 struct regnode_charclass_class);
1110 flags &= ~SCF_DO_STCLASS_AND;
1111 StructCopy(&this_class, data->start_class,
1112 struct regnode_charclass_class);
1113 flags |= SCF_DO_STCLASS_OR;
1114 data->start_class->flags |= ANYOF_EOS;
1115 }
1116 } else { /* Non-zero len */
1117 if (flags & SCF_DO_STCLASS_OR) {
830247a4 1118 cl_or(pRExC_state, data->start_class, &this_class);
653099ff
GS
1119 cl_and(data->start_class, &and_with);
1120 }
1121 else if (flags & SCF_DO_STCLASS_AND)
1122 cl_and(data->start_class, &this_class);
1123 flags &= ~SCF_DO_STCLASS;
1124 }
c277df42
IZ
1125 if (!scan) /* It was not CURLYX, but CURLY. */
1126 scan = next;
84037bb0 1127 if (ckWARN(WARN_REGEXP)
727f22e3
JP
1128 /* ? quantifier ok, except for (?{ ... }) */
1129 && (next_is_eval || !(mincount == 0 && maxcount == 1))
84037bb0 1130 && (minnext == 0) && (deltanext == 0)
99799961 1131 && data && !(data->flags & (SF_HAS_PAR|SF_IN_PAR))
17feb5d5 1132 && maxcount <= REG_INFTY/3) /* Complement check for big count */
b45f050a 1133 {
830247a4 1134 vWARN(RExC_parse,
b45f050a
JF
1135 "Quantifier unexpected on zero-length expression");
1136 }
1137
c277df42 1138 min += minnext * mincount;
b81d288d 1139 is_inf_internal |= ((maxcount == REG_INFTY
155aba94
GS
1140 && (minnext + deltanext) > 0)
1141 || deltanext == I32_MAX);
aca2d497 1142 is_inf |= is_inf_internal;
c277df42
IZ
1143 delta += (minnext + deltanext) * maxcount - minnext * mincount;
1144
1145 /* Try powerful optimization CURLYX => CURLYN. */
b81d288d 1146 if ( OP(oscan) == CURLYX && data
c277df42
IZ
1147 && data->flags & SF_IN_PAR
1148 && !(data->flags & SF_HAS_EVAL)
1149 && !deltanext && minnext == 1 ) {
1150 /* Try to optimize to CURLYN. */
1151 regnode *nxt = NEXTOPER(oscan) + EXTRA_STEP_2ARGS;
497b47a8
JH
1152 regnode *nxt1 = nxt;
1153#ifdef DEBUGGING
1154 regnode *nxt2;
1155#endif
c277df42
IZ
1156
1157 /* Skip open. */
1158 nxt = regnext(nxt);
4d61ec05 1159 if (!strchr((char*)PL_simple,OP(nxt))
22c35a8c 1160 && !(PL_regkind[(U8)OP(nxt)] == EXACT
b81d288d 1161 && STR_LEN(nxt) == 1))
c277df42 1162 goto nogo;
497b47a8 1163#ifdef DEBUGGING
c277df42 1164 nxt2 = nxt;
497b47a8 1165#endif
c277df42 1166 nxt = regnext(nxt);
b81d288d 1167 if (OP(nxt) != CLOSE)
c277df42
IZ
1168 goto nogo;
1169 /* Now we know that nxt2 is the only contents: */
eb160463 1170 oscan->flags = (U8)ARG(nxt);
c277df42
IZ
1171 OP(oscan) = CURLYN;
1172 OP(nxt1) = NOTHING; /* was OPEN. */
1173#ifdef DEBUGGING
1174 OP(nxt1 + 1) = OPTIMIZED; /* was count. */
1175 NEXT_OFF(nxt1+ 1) = 0; /* just for consistancy. */
1176 NEXT_OFF(nxt2) = 0; /* just for consistancy with CURLY. */
1177 OP(nxt) = OPTIMIZED; /* was CLOSE. */
1178 OP(nxt + 1) = OPTIMIZED; /* was count. */
1179 NEXT_OFF(nxt+ 1) = 0; /* just for consistancy. */
b81d288d 1180#endif
c277df42 1181 }
c277df42
IZ
1182 nogo:
1183
1184 /* Try optimization CURLYX => CURLYM. */
b81d288d 1185 if ( OP(oscan) == CURLYX && data
c277df42 1186 && !(data->flags & SF_HAS_PAR)
c277df42
IZ
1187 && !(data->flags & SF_HAS_EVAL)
1188 && !deltanext ) {
1189 /* XXXX How to optimize if data == 0? */
1190 /* Optimize to a simpler form. */
1191 regnode *nxt = NEXTOPER(oscan) + EXTRA_STEP_2ARGS; /* OPEN */
1192 regnode *nxt2;
1193
1194 OP(oscan) = CURLYM;
1195 while ( (nxt2 = regnext(nxt)) /* skip over embedded stuff*/
b81d288d 1196 && (OP(nxt2) != WHILEM))
c277df42
IZ
1197 nxt = nxt2;
1198 OP(nxt2) = SUCCEED; /* Whas WHILEM */
c277df42
IZ
1199 /* Need to optimize away parenths. */
1200 if (data->flags & SF_IN_PAR) {
1201 /* Set the parenth number. */
1202 regnode *nxt1 = NEXTOPER(oscan) + EXTRA_STEP_2ARGS; /* OPEN*/
1203
b81d288d 1204 if (OP(nxt) != CLOSE)
b45f050a 1205 FAIL("Panic opt close");
eb160463 1206 oscan->flags = (U8)ARG(nxt);
c277df42
IZ
1207 OP(nxt1) = OPTIMIZED; /* was OPEN. */
1208 OP(nxt) = OPTIMIZED; /* was CLOSE. */
1209#ifdef DEBUGGING
1210 OP(nxt1 + 1) = OPTIMIZED; /* was count. */
1211 OP(nxt + 1) = OPTIMIZED; /* was count. */
1212 NEXT_OFF(nxt1 + 1) = 0; /* just for consistancy. */
1213 NEXT_OFF(nxt + 1) = 0; /* just for consistancy. */
b81d288d 1214#endif
c277df42
IZ
1215#if 0
1216 while ( nxt1 && (OP(nxt1) != WHILEM)) {
1217 regnode *nnxt = regnext(nxt1);
b81d288d 1218
c277df42
IZ
1219 if (nnxt == nxt) {
1220 if (reg_off_by_arg[OP(nxt1)])
1221 ARG_SET(nxt1, nxt2 - nxt1);
1222 else if (nxt2 - nxt1 < U16_MAX)
1223 NEXT_OFF(nxt1) = nxt2 - nxt1;
1224 else
1225 OP(nxt) = NOTHING; /* Cannot beautify */
1226 }
1227 nxt1 = nnxt;
1228 }
1229#endif
1230 /* Optimize again: */
b81d288d 1231 study_chunk(pRExC_state, &nxt1, &deltanext, nxt,
e1901655 1232 NULL, 0);
a0ed51b3
LW
1233 }
1234 else
c277df42 1235 oscan->flags = 0;
c277df42 1236 }
e1901655
IZ
1237 else if ((OP(oscan) == CURLYX)
1238 && (flags & SCF_WHILEM_VISITED_POS)
1239 /* See the comment on a similar expression above.
1240 However, this time it not a subexpression
1241 we care about, but the expression itself. */
1242 && (maxcount == REG_INFTY)
1243 && data && ++data->whilem_c < 16) {
1244 /* This stays as CURLYX, we can put the count/of pair. */
2c2d71f5
JH
1245 /* Find WHILEM (as in regexec.c) */
1246 regnode *nxt = oscan + NEXT_OFF(oscan);
1247
1248 if (OP(PREVOPER(nxt)) == NOTHING) /* LONGJMP */
1249 nxt += ARG(nxt);
eb160463
GS
1250 PREVOPER(nxt)->flags = (U8)(data->whilem_c
1251 | (RExC_whilem_seen << 4)); /* On WHILEM */
2c2d71f5 1252 }
b81d288d 1253 if (data && fl & (SF_HAS_PAR|SF_IN_PAR))
c277df42
IZ
1254 pars++;
1255 if (flags & SCF_DO_SUBSTR) {
1256 SV *last_str = Nullsv;
1257 int counted = mincount != 0;
1258
1259 if (data->last_end > 0 && mincount != 0) { /* Ends with a string. */
5d1c421c
JH
1260#if defined(SPARC64_GCC_WORKAROUND)
1261 I32 b = 0;
1262 STRLEN l = 0;
1263 char *s = NULL;
1264 I32 old = 0;
1265
1266 if (pos_before >= data->last_start_min)
1267 b = pos_before;
1268 else
1269 b = data->last_start_min;
1270
1271 l = 0;
1272 s = SvPV(data->last_found, l);
1273 old = b - data->last_start_min;
1274
1275#else
b81d288d 1276 I32 b = pos_before >= data->last_start_min
c277df42
IZ
1277 ? pos_before : data->last_start_min;
1278 STRLEN l;
1279 char *s = SvPV(data->last_found, l);
a0ed51b3 1280 I32 old = b - data->last_start_min;
5d1c421c 1281#endif
a0ed51b3
LW
1282
1283 if (UTF)
1284 old = utf8_hop((U8*)s, old) - (U8*)s;
c277df42 1285
a0ed51b3 1286 l -= old;
c277df42 1287 /* Get the added string: */
79cb57f6 1288 last_str = newSVpvn(s + old, l);
0e933229
IH
1289 if (UTF)
1290 SvUTF8_on(last_str);
c277df42
IZ
1291 if (deltanext == 0 && pos_before == b) {
1292 /* What was added is a constant string */
1293 if (mincount > 1) {
1294 SvGROW(last_str, (mincount * l) + 1);
b81d288d 1295 repeatcpy(SvPVX(last_str) + l,
c277df42
IZ
1296 SvPVX(last_str), l, mincount - 1);
1297 SvCUR(last_str) *= mincount;
1298 /* Add additional parts. */
b81d288d 1299 SvCUR_set(data->last_found,
c277df42
IZ
1300 SvCUR(data->last_found) - l);
1301 sv_catsv(data->last_found, last_str);
0eda9292
JH
1302 {
1303 SV * sv = data->last_found;
1304 MAGIC *mg =
1305 SvUTF8(sv) && SvMAGICAL(sv) ?
1306 mg_find(sv, PERL_MAGIC_utf8) : NULL;
1307 if (mg && mg->mg_len >= 0)
1308 mg->mg_len += CHR_SVLEN(last_str);
1309 }
c277df42
IZ
1310 data->last_end += l * (mincount - 1);
1311 }
2a8d9689
HS
1312 } else {
1313 /* start offset must point into the last copy */
1314 data->last_start_min += minnext * (mincount - 1);
c152dc43
HS
1315 data->last_start_max += is_inf ? I32_MAX
1316 : (maxcount - 1) * (minnext + data->pos_delta);
c277df42
IZ
1317 }
1318 }
1319 /* It is counted once already... */
1320 data->pos_min += minnext * (mincount - counted);
1321 data->pos_delta += - counted * deltanext +
1322 (minnext + deltanext) * maxcount - minnext * mincount;
1323 if (mincount != maxcount) {
653099ff
GS
1324 /* Cannot extend fixed substrings found inside
1325 the group. */
830247a4 1326 scan_commit(pRExC_state,data);
c277df42
IZ
1327 if (mincount && last_str) {
1328 sv_setsv(data->last_found, last_str);
1329 data->last_end = data->pos_min;
b81d288d 1330 data->last_start_min =
a0ed51b3 1331 data->pos_min - CHR_SVLEN(last_str);
b81d288d
AB
1332 data->last_start_max = is_inf
1333 ? I32_MAX
c277df42 1334 : data->pos_min + data->pos_delta
a0ed51b3 1335 - CHR_SVLEN(last_str);
c277df42
IZ
1336 }
1337 data->longest = &(data->longest_float);
1338 }
aca2d497 1339 SvREFCNT_dec(last_str);
c277df42 1340 }
405ff068 1341 if (data && (fl & SF_HAS_EVAL))
c277df42
IZ
1342 data->flags |= SF_HAS_EVAL;
1343 optimize_curly_tail:
c277df42 1344 if (OP(oscan) != CURLYX) {
22c35a8c 1345 while (PL_regkind[(U8)OP(next = regnext(oscan))] == NOTHING
c277df42
IZ
1346 && NEXT_OFF(next))
1347 NEXT_OFF(oscan) += NEXT_OFF(next);
1348 }
c277df42 1349 continue;
653099ff 1350 default: /* REF and CLUMP only? */
c277df42 1351 if (flags & SCF_DO_SUBSTR) {
830247a4 1352 scan_commit(pRExC_state,data); /* Cannot expect anything... */
c277df42
IZ
1353 data->longest = &(data->longest_float);
1354 }
aca2d497 1355 is_inf = is_inf_internal = 1;
653099ff 1356 if (flags & SCF_DO_STCLASS_OR)
830247a4 1357 cl_anything(pRExC_state, data->start_class);
653099ff 1358 flags &= ~SCF_DO_STCLASS;
c277df42
IZ
1359 break;
1360 }
a0ed51b3 1361 }
ffc61ed2 1362 else if (strchr((char*)PL_simple,OP(scan))) {
9c5ffd7c 1363 int value = 0;
653099ff 1364
c277df42 1365 if (flags & SCF_DO_SUBSTR) {
830247a4 1366 scan_commit(pRExC_state,data);
c277df42
IZ
1367 data->pos_min++;
1368 }
1369 min++;
653099ff
GS
1370 if (flags & SCF_DO_STCLASS) {
1371 data->start_class->flags &= ~ANYOF_EOS; /* No match on empty */
1372
1373 /* Some of the logic below assumes that switching
1374 locale on will only add false positives. */
1375 switch (PL_regkind[(U8)OP(scan)]) {
653099ff 1376 case SANY:
653099ff
GS
1377 default:
1378 do_default:
1379 /* Perl_croak(aTHX_ "panic: unexpected simple REx opcode %d", OP(scan)); */
1380 if (flags & SCF_DO_STCLASS_OR) /* Allow everything */
830247a4 1381 cl_anything(pRExC_state, data->start_class);
653099ff
GS
1382 break;
1383 case REG_ANY:
1384 if (OP(scan) == SANY)
1385 goto do_default;
1386 if (flags & SCF_DO_STCLASS_OR) { /* Everything but \n */
1387 value = (ANYOF_BITMAP_TEST(data->start_class,'\n')
1388 || (data->start_class->flags & ANYOF_CLASS));
830247a4 1389 cl_anything(pRExC_state, data->start_class);
653099ff
GS
1390 }
1391 if (flags & SCF_DO_STCLASS_AND || !value)
1392 ANYOF_BITMAP_CLEAR(data->start_class,'\n');
1393 break;
1394 case ANYOF:
1395 if (flags & SCF_DO_STCLASS_AND)
1396 cl_and(data->start_class,
1397 (struct regnode_charclass_class*)scan);
1398 else
830247a4 1399 cl_or(pRExC_state, data->start_class,
653099ff
GS
1400 (struct regnode_charclass_class*)scan);
1401 break;
1402 case ALNUM:
1403 if (flags & SCF_DO_STCLASS_AND) {
1404 if (!(data->start_class->flags & ANYOF_LOCALE)) {
1405 ANYOF_CLASS_CLEAR(data->start_class,ANYOF_NALNUM);
1406 for (value = 0; value < 256; value++)
1407 if (!isALNUM(value))
1408 ANYOF_BITMAP_CLEAR(data->start_class, value);
1409 }
1410 }
1411 else {
1412 if (data->start_class->flags & ANYOF_LOCALE)
1413 ANYOF_CLASS_SET(data->start_class,ANYOF_ALNUM);
1414 else {
1415 for (value = 0; value < 256; value++)
1416 if (isALNUM(value))
b81d288d 1417 ANYOF_BITMAP_SET(data->start_class, value);
653099ff
GS
1418 }
1419 }
1420 break;
1421 case ALNUML:
1422 if (flags & SCF_DO_STCLASS_AND) {
1423 if (data->start_class->flags & ANYOF_LOCALE)
1424 ANYOF_CLASS_CLEAR(data->start_class,ANYOF_NALNUM);
1425 }
1426 else {
1427 ANYOF_CLASS_SET(data->start_class,ANYOF_ALNUM);
1428 data->start_class->flags |= ANYOF_LOCALE;
1429 }
1430 break;
1431 case NALNUM:
1432 if (flags & SCF_DO_STCLASS_AND) {
1433 if (!(data->start_class->flags & ANYOF_LOCALE)) {
1434 ANYOF_CLASS_CLEAR(data->start_class,ANYOF_ALNUM);
1435 for (value = 0; value < 256; value++)
1436 if (isALNUM(value))
1437 ANYOF_BITMAP_CLEAR(data->start_class, value);
1438 }
1439 }
1440 else {
1441 if (data->start_class->flags & ANYOF_LOCALE)
1442 ANYOF_CLASS_SET(data->start_class,ANYOF_NALNUM);
1443 else {
1444 for (value = 0; value < 256; value++)
1445 if (!isALNUM(value))
b81d288d 1446 ANYOF_BITMAP_SET(data->start_class, value);
653099ff
GS
1447 }
1448 }
1449 break;
1450 case NALNUML:
1451 if (flags & SCF_DO_STCLASS_AND) {
1452 if (data->start_class->flags & ANYOF_LOCALE)
1453 ANYOF_CLASS_CLEAR(data->start_class,ANYOF_ALNUM);
1454 }
1455 else {
1456 data->start_class->flags |= ANYOF_LOCALE;
1457 ANYOF_CLASS_SET(data->start_class,ANYOF_NALNUM);
1458 }
1459 break;
1460 case SPACE:
1461 if (flags & SCF_DO_STCLASS_AND) {
1462 if (!(data->start_class->flags & ANYOF_LOCALE)) {
1463 ANYOF_CLASS_CLEAR(data->start_class,ANYOF_NSPACE);
1464 for (value = 0; value < 256; value++)
1465 if (!isSPACE(value))
1466 ANYOF_BITMAP_CLEAR(data->start_class, value);
1467 }
1468 }
1469 else {
1470 if (data->start_class->flags & ANYOF_LOCALE)
1471 ANYOF_CLASS_SET(data->start_class,ANYOF_SPACE);
1472 else {
1473 for (value = 0; value < 256; value++)
1474 if (isSPACE(value))
b81d288d 1475 ANYOF_BITMAP_SET(data->start_class, value);
653099ff
GS
1476 }
1477 }
1478 break;
1479 case SPACEL:
1480 if (flags & SCF_DO_STCLASS_AND) {
1481 if (data->start_class->flags & ANYOF_LOCALE)
1482 ANYOF_CLASS_CLEAR(data->start_class,ANYOF_NSPACE);
1483 }
1484 else {
1485 data->start_class->flags |= ANYOF_LOCALE;
1486 ANYOF_CLASS_SET(data->start_class,ANYOF_SPACE);
1487 }
1488 break;
1489 case NSPACE:
1490 if (flags & SCF_DO_STCLASS_AND) {
1491 if (!(data->start_class->flags & ANYOF_LOCALE)) {
1492 ANYOF_CLASS_CLEAR(data->start_class,ANYOF_SPACE);
1493 for (value = 0; value < 256; value++)
1494 if (isSPACE(value))
1495 ANYOF_BITMAP_CLEAR(data->start_class, value);
1496 }
1497 }
1498 else {
1499 if (data->start_class->flags & ANYOF_LOCALE)
1500 ANYOF_CLASS_SET(data->start_class,ANYOF_NSPACE);
1501 else {
1502 for (value = 0; value < 256; value++)
1503 if (!isSPACE(value))
b81d288d 1504 ANYOF_BITMAP_SET(data->start_class, value);
653099ff
GS
1505 }
1506 }
1507 break;
1508 case NSPACEL:
1509 if (flags & SCF_DO_STCLASS_AND) {
1510 if (data->start_class->flags & ANYOF_LOCALE) {
1511 ANYOF_CLASS_CLEAR(data->start_class,ANYOF_SPACE);
1512 for (value = 0; value < 256; value++)
1513 if (!isSPACE(value))
1514 ANYOF_BITMAP_CLEAR(data->start_class, value);
1515 }
1516 }
1517 else {
1518 data->start_class->flags |= ANYOF_LOCALE;
1519 ANYOF_CLASS_SET(data->start_class,ANYOF_NSPACE);
1520 }
1521 break;
1522 case DIGIT:
1523 if (flags & SCF_DO_STCLASS_AND) {
1524 ANYOF_CLASS_CLEAR(data->start_class,ANYOF_NDIGIT);
1525 for (value = 0; value < 256; value++)
1526 if (!isDIGIT(value))
1527 ANYOF_BITMAP_CLEAR(data->start_class, value);
1528 }
1529 else {
1530 if (data->start_class->flags & ANYOF_LOCALE)
1531 ANYOF_CLASS_SET(data->start_class,ANYOF_DIGIT);
1532 else {
1533 for (value = 0; value < 256; value++)
1534 if (isDIGIT(value))
b81d288d 1535 ANYOF_BITMAP_SET(data->start_class, value);
653099ff
GS
1536 }
1537 }
1538 break;
1539 case NDIGIT:
1540 if (flags & SCF_DO_STCLASS_AND) {
1541 ANYOF_CLASS_CLEAR(data->start_class,ANYOF_DIGIT);
1542 for (value = 0; value < 256; value++)
1543 if (isDIGIT(value))
1544 ANYOF_BITMAP_CLEAR(data->start_class, value);
1545 }
1546 else {
1547 if (data->start_class->flags & ANYOF_LOCALE)
1548 ANYOF_CLASS_SET(data->start_class,ANYOF_NDIGIT);
1549 else {
1550 for (value = 0; value < 256; value++)
1551 if (!isDIGIT(value))
b81d288d 1552 ANYOF_BITMAP_SET(data->start_class, value);
653099ff
GS
1553 }
1554 }
1555 break;
1556 }
1557 if (flags & SCF_DO_STCLASS_OR)
1558 cl_and(data->start_class, &and_with);
1559 flags &= ~SCF_DO_STCLASS;
1560 }
a0ed51b3 1561 }
22c35a8c 1562 else if (PL_regkind[(U8)OP(scan)] == EOL && flags & SCF_DO_SUBSTR) {
c277df42
IZ
1563 data->flags |= (OP(scan) == MEOL
1564 ? SF_BEFORE_MEOL
1565 : SF_BEFORE_SEOL);
a0ed51b3 1566 }
653099ff
GS
1567 else if ( PL_regkind[(U8)OP(scan)] == BRANCHJ
1568 /* Lookbehind, or need to calculate parens/evals/stclass: */
1569 && (scan->flags || data || (flags & SCF_DO_STCLASS))
c277df42 1570 && (OP(scan) == IFMATCH || OP(scan) == UNLESSM)) {
653099ff 1571 /* Lookahead/lookbehind */
cb434fcc 1572 I32 deltanext, minnext, fake = 0;
c277df42 1573 regnode *nscan;
653099ff
GS
1574 struct regnode_charclass_class intrnl;
1575 int f = 0;
c277df42
IZ
1576
1577 data_fake.flags = 0;
b81d288d 1578 if (data) {
2c2d71f5 1579 data_fake.whilem_c = data->whilem_c;
cb434fcc
IZ
1580 data_fake.last_closep = data->last_closep;
1581 }
1582 else
1583 data_fake.last_closep = &fake;
653099ff
GS
1584 if ( flags & SCF_DO_STCLASS && !scan->flags
1585 && OP(scan) == IFMATCH ) { /* Lookahead */
830247a4 1586 cl_init(pRExC_state, &intrnl);
653099ff 1587 data_fake.start_class = &intrnl;
e1901655 1588 f |= SCF_DO_STCLASS_AND;
653099ff 1589 }
e1901655
IZ
1590 if (flags & SCF_WHILEM_VISITED_POS)
1591 f |= SCF_WHILEM_VISITED_POS;
c277df42
IZ
1592 next = regnext(scan);
1593 nscan = NEXTOPER(NEXTOPER(scan));
830247a4 1594 minnext = study_chunk(pRExC_state, &nscan, &deltanext, last, &data_fake, f);
c277df42
IZ
1595 if (scan->flags) {
1596 if (deltanext) {
9baa0206 1597 vFAIL("Variable length lookbehind not implemented");
a0ed51b3
LW
1598 }
1599 else if (minnext > U8_MAX) {
9baa0206 1600 vFAIL2("Lookbehind longer than %"UVuf" not implemented", (UV)U8_MAX);
c277df42 1601 }
eb160463 1602 scan->flags = (U8)minnext;
c277df42
IZ
1603 }
1604 if (data && data_fake.flags & (SF_HAS_PAR|SF_IN_PAR))
1605 pars++;
405ff068 1606 if (data && (data_fake.flags & SF_HAS_EVAL))
c277df42 1607 data->flags |= SF_HAS_EVAL;
2c2d71f5
JH
1608 if (data)
1609 data->whilem_c = data_fake.whilem_c;
e1901655 1610 if (f & SCF_DO_STCLASS_AND) {
653099ff
GS
1611 int was = (data->start_class->flags & ANYOF_EOS);
1612
1613 cl_and(data->start_class, &intrnl);
1614 if (was)
1615 data->start_class->flags |= ANYOF_EOS;
1616 }
a0ed51b3
LW
1617 }
1618 else if (OP(scan) == OPEN) {
c277df42 1619 pars++;
a0ed51b3 1620 }
cb434fcc 1621 else if (OP(scan) == CLOSE) {
eb160463 1622 if ((I32)ARG(scan) == is_par) {
cb434fcc 1623 next = regnext(scan);
c277df42 1624
cb434fcc
IZ
1625 if ( next && (OP(next) != WHILEM) && next < last)
1626 is_par = 0; /* Disable optimization */
1627 }
1628 if (data)
1629 *(data->last_closep) = ARG(scan);
a0ed51b3
LW
1630 }
1631 else if (OP(scan) == EVAL) {
c277df42
IZ
1632 if (data)
1633 data->flags |= SF_HAS_EVAL;
1634 }
96776eda 1635 else if (OP(scan) == LOGICAL && scan->flags == 2) { /* Embedded follows */
0f5d15d6 1636 if (flags & SCF_DO_SUBSTR) {
830247a4 1637 scan_commit(pRExC_state,data);
0f5d15d6
IZ
1638 data->longest = &(data->longest_float);
1639 }
1640 is_inf = is_inf_internal = 1;
653099ff 1641 if (flags & SCF_DO_STCLASS_OR) /* Allow everything */
830247a4 1642 cl_anything(pRExC_state, data->start_class);
96776eda 1643 flags &= ~SCF_DO_STCLASS;
0f5d15d6 1644 }
c277df42
IZ
1645 /* Else: zero-length, ignore. */
1646 scan = regnext(scan);
1647 }
1648
1649 finish:
1650 *scanp = scan;
aca2d497 1651 *deltap = is_inf_internal ? I32_MAX : delta;
b81d288d 1652 if (flags & SCF_DO_SUBSTR && is_inf)
c277df42
IZ
1653 data->pos_delta = I32_MAX - data->pos_min;
1654 if (is_par > U8_MAX)
1655 is_par = 0;
1656 if (is_par && pars==1 && data) {
1657 data->flags |= SF_IN_PAR;
1658 data->flags &= ~SF_HAS_PAR;
a0ed51b3
LW
1659 }
1660 else if (pars && data) {
c277df42
IZ
1661 data->flags |= SF_HAS_PAR;
1662 data->flags &= ~SF_IN_PAR;
1663 }
653099ff
GS
1664 if (flags & SCF_DO_STCLASS_OR)
1665 cl_and(data->start_class, &and_with);
c277df42
IZ
1666 return min;
1667}
1668
76e3520e 1669STATIC I32
830247a4 1670S_add_data(pTHX_ RExC_state_t *pRExC_state, I32 n, char *s)
c277df42 1671{
830247a4 1672 if (RExC_rx->data) {
b81d288d
AB
1673 Renewc(RExC_rx->data,
1674 sizeof(*RExC_rx->data) + sizeof(void*) * (RExC_rx->data->count + n - 1),
c277df42 1675 char, struct reg_data);
830247a4
IZ
1676 Renew(RExC_rx->data->what, RExC_rx->data->count + n, U8);
1677 RExC_rx->data->count += n;
a0ed51b3
LW
1678 }
1679 else {
830247a4 1680 Newc(1207, RExC_rx->data, sizeof(*RExC_rx->data) + sizeof(void*) * (n - 1),
c277df42 1681 char, struct reg_data);
830247a4
IZ
1682 New(1208, RExC_rx->data->what, n, U8);
1683 RExC_rx->data->count = n;
c277df42 1684 }
830247a4
IZ
1685 Copy(s, RExC_rx->data->what + RExC_rx->data->count - n, n, U8);
1686 return RExC_rx->data->count - n;
c277df42
IZ
1687}
1688
d88dccdf 1689void
864dbfa3 1690Perl_reginitcolors(pTHX)
d88dccdf 1691{
d88dccdf
IZ
1692 int i = 0;
1693 char *s = PerlEnv_getenv("PERL_RE_COLORS");
b81d288d 1694
d88dccdf
IZ
1695 if (s) {
1696 PL_colors[0] = s = savepv(s);
1697 while (++i < 6) {
1698 s = strchr(s, '\t');
1699 if (s) {
1700 *s = '\0';
1701 PL_colors[i] = ++s;
1702 }
1703 else
c712d376 1704 PL_colors[i] = s = "";
d88dccdf
IZ
1705 }
1706 } else {
b81d288d 1707 while (i < 6)
d88dccdf
IZ
1708 PL_colors[i++] = "";
1709 }
1710 PL_colorset = 1;
1711}
1712
8615cb43 1713
a687059c 1714/*
e50aee73 1715 - pregcomp - compile a regular expression into internal code
a687059c
LW
1716 *
1717 * We can't allocate space until we know how big the compiled form will be,
1718 * but we can't compile it (and thus know how big it is) until we've got a
1719 * place to put the code. So we cheat: we compile it twice, once with code
1720 * generation turned off and size counting turned on, and once "for real".
1721 * This also means that we don't allocate space until we are sure that the
1722 * thing really will compile successfully, and we never have to move the
1723 * code and thus invalidate pointers into it. (Note that it has to be in
1724 * one piece because free() must be able to free it all.) [NB: not true in perl]
1725 *
1726 * Beware that the optimization-preparation code in here knows about some
1727 * of the structure of the compiled regexp. [I'll say.]
1728 */
1729regexp *
864dbfa3 1730Perl_pregcomp(pTHX_ char *exp, char *xend, PMOP *pm)
a687059c 1731{
a0d0e21e 1732 register regexp *r;
c277df42 1733 regnode *scan;
c277df42 1734 regnode *first;
a0d0e21e 1735 I32 flags;
a0d0e21e
LW
1736 I32 minlen = 0;
1737 I32 sawplus = 0;
1738 I32 sawopen = 0;
2c2d71f5 1739 scan_data_t data;
830247a4
IZ
1740 RExC_state_t RExC_state;
1741 RExC_state_t *pRExC_state = &RExC_state;
a0d0e21e
LW
1742
1743 if (exp == NULL)
c277df42 1744 FAIL("NULL regexp argument");
a0d0e21e 1745
a5961de5 1746 RExC_utf8 = pm->op_pmdynflags & PMdf_CMP_UTF8;
a0ed51b3 1747
5cfc7842 1748 RExC_precomp = exp;
a5961de5
JH
1749 DEBUG_r({
1750 if (!PL_colorset) reginitcolors();
1751 PerlIO_printf(Perl_debug_log, "%sCompiling REx%s `%s%*s%s'\n",
1752 PL_colors[4],PL_colors[5],PL_colors[0],
1753 (int)(xend - exp), RExC_precomp, PL_colors[1]);
1754 });
e2509266 1755 RExC_flags = pm->op_pmflags;
830247a4 1756 RExC_sawback = 0;
bbce6d69 1757
830247a4
IZ
1758 RExC_seen = 0;
1759 RExC_seen_zerolen = *exp == '^' ? -1 : 0;
1760 RExC_seen_evals = 0;
1761 RExC_extralen = 0;
c277df42 1762
bbce6d69 1763 /* First pass: determine size, legality. */
830247a4 1764 RExC_parse = exp;
fac92740 1765 RExC_start = exp;
830247a4
IZ
1766 RExC_end = xend;
1767 RExC_naughty = 0;
1768 RExC_npar = 1;
1769 RExC_size = 0L;
1770 RExC_emit = &PL_regdummy;
1771 RExC_whilem_seen = 0;
85ddcde9
JH
1772#if 0 /* REGC() is (currently) a NOP at the first pass.
1773 * Clever compilers notice this and complain. --jhi */
830247a4 1774 REGC((U8)REG_MAGIC, (char*)RExC_emit);
85ddcde9 1775#endif
830247a4 1776 if (reg(pRExC_state, 0, &flags) == NULL) {
830247a4 1777 RExC_precomp = Nullch;
a0d0e21e
LW
1778 return(NULL);
1779 }
830247a4 1780 DEBUG_r(PerlIO_printf(Perl_debug_log, "size %"IVdf" ", (IV)RExC_size));
c277df42 1781
c277df42
IZ
1782 /* Small enough for pointer-storage convention?
1783 If extralen==0, this means that we will not need long jumps. */
830247a4
IZ
1784 if (RExC_size >= 0x10000L && RExC_extralen)
1785 RExC_size += RExC_extralen;
c277df42 1786 else
830247a4
IZ
1787 RExC_extralen = 0;
1788 if (RExC_whilem_seen > 15)
1789 RExC_whilem_seen = 15;
a0d0e21e 1790
bbce6d69 1791 /* Allocate space and initialize. */
830247a4 1792 Newc(1001, r, sizeof(regexp) + (unsigned)RExC_size * sizeof(regnode),
c277df42 1793 char, regexp);
a0d0e21e 1794 if (r == NULL)
b45f050a
JF
1795 FAIL("Regexp out of space");
1796
0f79a09d
GS
1797#ifdef DEBUGGING
1798 /* avoid reading uninitialized memory in DEBUGGING code in study_chunk() */
830247a4 1799 Zero(r, sizeof(regexp) + (unsigned)RExC_size * sizeof(regnode), char);
0f79a09d 1800#endif
c277df42 1801 r->refcnt = 1;
bbce6d69 1802 r->prelen = xend - exp;
5cfc7842 1803 r->precomp = savepvn(RExC_precomp, r->prelen);
cf93c79d 1804 r->subbeg = NULL;
ed252734
NC
1805#ifdef PERL_COPY_ON_WRITE
1806 r->saved_copy = Nullsv;
1807#endif
cf93c79d 1808 r->reganch = pm->op_pmflags & PMf_COMPILETIME;
830247a4 1809 r->nparens = RExC_npar - 1; /* set early to validate backrefs */
4327152a
IZ
1810
1811 r->substrs = 0; /* Useful during FAIL. */
1812 r->startp = 0; /* Useful during FAIL. */
1813 r->endp = 0; /* Useful during FAIL. */
1814
fac92740
MJD
1815 Newz(1304, r->offsets, 2*RExC_size+1, U32); /* MJD 20001228 */
1816 if (r->offsets) {
1817 r->offsets[0] = RExC_size;
1818 }
1819 DEBUG_r(PerlIO_printf(Perl_debug_log,
392fbf5d 1820 "%s %"UVuf" bytes for offset annotations.\n",
fac92740 1821 r->offsets ? "Got" : "Couldn't get",
392fbf5d 1822 (UV)((2*RExC_size+1) * sizeof(U32))));
fac92740 1823
830247a4 1824 RExC_rx = r;
bbce6d69 1825
1826 /* Second pass: emit code. */
e2509266 1827 RExC_flags = pm->op_pmflags; /* don't let top level (?i) bleed */
830247a4
IZ
1828 RExC_parse = exp;
1829 RExC_end = xend;
1830 RExC_naughty = 0;
1831 RExC_npar = 1;
fac92740 1832 RExC_emit_start = r->program;
830247a4 1833 RExC_emit = r->program;
2cd61cdb 1834 /* Store the count of eval-groups for security checks: */
eb160463 1835 RExC_emit->next_off = (U16)((RExC_seen_evals > U16_MAX) ? U16_MAX : RExC_seen_evals);
830247a4 1836 REGC((U8)REG_MAGIC, (char*) RExC_emit++);
c277df42 1837 r->data = 0;
830247a4 1838 if (reg(pRExC_state, 0, &flags) == NULL)
a0d0e21e
LW
1839 return(NULL);
1840
1841 /* Dig out information for optimizations. */
cf93c79d 1842 r->reganch = pm->op_pmflags & PMf_COMPILETIME; /* Again? */
e2509266 1843 pm->op_pmflags = RExC_flags;
a0ed51b3 1844 if (UTF)
5ff6fc6d 1845 r->reganch |= ROPT_UTF8; /* Unicode in it? */
c277df42 1846 r->regstclass = NULL;
830247a4 1847 if (RExC_naughty >= 10) /* Probably an expensive pattern. */
a0ed51b3 1848 r->reganch |= ROPT_NAUGHTY;
c277df42 1849 scan = r->program + 1; /* First BRANCH. */
2779dcf1
IZ
1850
1851 /* XXXX To minimize changes to RE engine we always allocate
1852 3-units-long substrs field. */
1853 Newz(1004, r->substrs, 1, struct reg_substr_data);
1854
2c2d71f5 1855 StructCopy(&zero_scan_data, &data, scan_data_t);
653099ff 1856 /* XXXX Should not we check for something else? Usually it is OPEN1... */
c277df42 1857 if (OP(scan) != BRANCH) { /* Only one top-level choice. */
c277df42 1858 I32 fake;
c5254dd6 1859 STRLEN longest_float_length, longest_fixed_length;
653099ff
GS
1860 struct regnode_charclass_class ch_class;
1861 int stclass_flag;
cb434fcc 1862 I32 last_close = 0;
a0d0e21e
LW
1863
1864 first = scan;
c277df42 1865 /* Skip introductions and multiplicators >= 1. */
a0d0e21e 1866 while ((OP(first) == OPEN && (sawopen = 1)) ||
653099ff 1867 /* An OR of *one* alternative - should not happen now. */
a0d0e21e
LW
1868 (OP(first) == BRANCH && OP(regnext(first)) != BRANCH) ||
1869 (OP(first) == PLUS) ||
1870 (OP(first) == MINMOD) ||
653099ff 1871 /* An {n,m} with n>0 */
22c35a8c 1872 (PL_regkind[(U8)OP(first)] == CURLY && ARG1(first) > 0) ) {
a0d0e21e
LW
1873 if (OP(first) == PLUS)
1874 sawplus = 1;
1875 else
1876 first += regarglen[(U8)OP(first)];
1877 first = NEXTOPER(first);
a687059c
LW
1878 }
1879
a0d0e21e
LW
1880 /* Starting-point info. */
1881 again:
653099ff 1882 if (PL_regkind[(U8)OP(first)] == EXACT) {
1aa99e6b
IH
1883 if (OP(first) == EXACT)
1884 ; /* Empty, get anchored substr later. */
1885 else if ((OP(first) == EXACTF || OP(first) == EXACTFL))
b3c9acc1
IZ
1886 r->regstclass = first;
1887 }
653099ff 1888 else if (strchr((char*)PL_simple,OP(first)))
a0d0e21e 1889 r->regstclass = first;
22c35a8c
GS
1890 else if (PL_regkind[(U8)OP(first)] == BOUND ||
1891 PL_regkind[(U8)OP(first)] == NBOUND)
a0d0e21e 1892 r->regstclass = first;
22c35a8c 1893 else if (PL_regkind[(U8)OP(first)] == BOL) {
cad2e5aa
JH
1894 r->reganch |= (OP(first) == MBOL
1895 ? ROPT_ANCH_MBOL
1896 : (OP(first) == SBOL
1897 ? ROPT_ANCH_SBOL
1898 : ROPT_ANCH_BOL));
a0d0e21e 1899 first = NEXTOPER(first);
774d564b 1900 goto again;
1901 }
1902 else if (OP(first) == GPOS) {
1903 r->reganch |= ROPT_ANCH_GPOS;
1904 first = NEXTOPER(first);
1905 goto again;
a0d0e21e 1906 }
e09294f4 1907 else if (!sawopen && (OP(first) == STAR &&
22c35a8c 1908 PL_regkind[(U8)OP(NEXTOPER(first))] == REG_ANY) &&
a0d0e21e
LW
1909 !(r->reganch & ROPT_ANCH) )
1910 {
1911 /* turn .* into ^.* with an implied $*=1 */
cad2e5aa
JH
1912 int type = OP(NEXTOPER(first));
1913
ffc61ed2 1914 if (type == REG_ANY)
cad2e5aa
JH
1915 type = ROPT_ANCH_MBOL;
1916 else
1917 type = ROPT_ANCH_SBOL;
1918
1919 r->reganch |= type | ROPT_IMPLICIT;
a0d0e21e 1920 first = NEXTOPER(first);
774d564b 1921 goto again;
a0d0e21e 1922 }
b81d288d 1923 if (sawplus && (!sawopen || !RExC_sawback)
830247a4 1924 && !(RExC_seen & REG_SEEN_EVAL)) /* May examine pos and $& */
cad2e5aa
JH
1925 /* x+ must match at the 1st pos of run of x's */
1926 r->reganch |= ROPT_SKIP;
a0d0e21e 1927
c277df42 1928 /* Scan is after the zeroth branch, first is atomic matcher. */
b81d288d 1929 DEBUG_r(PerlIO_printf(Perl_debug_log, "first at %"IVdf"\n",
b900a521 1930 (IV)(first - scan + 1)));
a0d0e21e
LW
1931 /*
1932 * If there's something expensive in the r.e., find the
1933 * longest literal string that must appear and make it the
1934 * regmust. Resolve ties in favor of later strings, since
1935 * the regstart check works with the beginning of the r.e.
1936 * and avoiding duplication strengthens checking. Not a
1937 * strong reason, but sufficient in the absence of others.
1938 * [Now we resolve ties in favor of the earlier string if
c277df42 1939 * it happens that c_offset_min has been invalidated, since the
a0d0e21e
LW
1940 * earlier string may buy us something the later one won't.]
1941 */
a0d0e21e 1942 minlen = 0;
a687059c 1943
79cb57f6
GS
1944 data.longest_fixed = newSVpvn("",0);
1945 data.longest_float = newSVpvn("",0);
1946 data.last_found = newSVpvn("",0);
c277df42
IZ
1947 data.longest = &(data.longest_fixed);
1948 first = scan;
653099ff 1949 if (!r->regstclass) {
830247a4 1950 cl_init(pRExC_state, &ch_class);
653099ff
GS
1951 data.start_class = &ch_class;
1952 stclass_flag = SCF_DO_STCLASS_AND;
1953 } else /* XXXX Check for BOUND? */
1954 stclass_flag = 0;
cb434fcc 1955 data.last_closep = &last_close;
653099ff 1956
830247a4 1957 minlen = study_chunk(pRExC_state, &first, &fake, scan + RExC_size, /* Up to end */
e1901655 1958 &data, SCF_DO_SUBSTR | SCF_WHILEM_VISITED_POS | stclass_flag);
830247a4 1959 if ( RExC_npar == 1 && data.longest == &(data.longest_fixed)
b81d288d 1960 && data.last_start_min == 0 && data.last_end > 0
830247a4
IZ
1961 && !RExC_seen_zerolen
1962 && (!(RExC_seen & REG_SEEN_GPOS) || (r->reganch & ROPT_ANCH_GPOS)))
c277df42 1963 r->reganch |= ROPT_CHECK_ALL;
830247a4 1964 scan_commit(pRExC_state, &data);
c277df42
IZ
1965 SvREFCNT_dec(data.last_found);
1966
a0ed51b3 1967 longest_float_length = CHR_SVLEN(data.longest_float);
c5254dd6 1968 if (longest_float_length
c277df42
IZ
1969 || (data.flags & SF_FL_BEFORE_EOL
1970 && (!(data.flags & SF_FL_BEFORE_MEOL)
e2509266 1971 || (RExC_flags & PMf_MULTILINE)))) {
cf93c79d
IZ
1972 int t;
1973
a0ed51b3 1974 if (SvCUR(data.longest_fixed) /* ok to leave SvCUR */
aca2d497
IZ
1975 && data.offset_fixed == data.offset_float_min
1976 && SvCUR(data.longest_fixed) == SvCUR(data.longest_float))
1977 goto remove_float; /* As in (a)+. */
1978
33b8afdf
JH
1979 if (SvUTF8(data.longest_float)) {
1980 r->float_utf8 = data.longest_float;
1981 r->float_substr = Nullsv;
1982 } else {
1983 r->float_substr = data.longest_float;
1984 r->float_utf8 = Nullsv;
1985 }
c277df42
IZ
1986 r->float_min_offset = data.offset_float_min;
1987 r->float_max_offset = data.offset_float_max;
cf93c79d
IZ
1988 t = (data.flags & SF_FL_BEFORE_EOL /* Can't have SEOL and MULTI */
1989 && (!(data.flags & SF_FL_BEFORE_MEOL)
e2509266 1990 || (RExC_flags & PMf_MULTILINE)));
33b8afdf 1991 fbm_compile(data.longest_float, t ? FBMcf_TAIL : 0);
a0ed51b3
LW
1992 }
1993 else {
aca2d497 1994 remove_float:
33b8afdf 1995 r->float_substr = r->float_utf8 = Nullsv;
c277df42 1996 SvREFCNT_dec(data.longest_float);
c5254dd6 1997 longest_float_length = 0;
a0d0e21e 1998 }
c277df42 1999
a0ed51b3 2000 longest_fixed_length = CHR_SVLEN(data.longest_fixed);
c5254dd6 2001 if (longest_fixed_length
c277df42
IZ
2002 || (data.flags & SF_FIX_BEFORE_EOL /* Cannot have SEOL and MULTI */
2003 && (!(data.flags & SF_FIX_BEFORE_MEOL)
e2509266 2004 || (RExC_flags & PMf_MULTILINE)))) {
cf93c79d
IZ
2005 int t;
2006
33b8afdf
JH
2007 if (SvUTF8(data.longest_fixed)) {
2008 r->anchored_utf8 = data.longest_fixed;
2009 r->anchored_substr = Nullsv;
2010 } else {
2011 r->anchored_substr = data.longest_fixed;
2012 r->anchored_utf8 = Nullsv;
2013 }
c277df42 2014 r->anchored_offset = data.offset_fixed;
cf93c79d
IZ
2015 t = (data.flags & SF_FIX_BEFORE_EOL /* Can't have SEOL and MULTI */
2016 && (!(data.flags & SF_FIX_BEFORE_MEOL)
e2509266 2017 || (RExC_flags & PMf_MULTILINE)));
33b8afdf 2018 fbm_compile(data.longest_fixed, t ? FBMcf_TAIL : 0);
a0ed51b3
LW
2019 }
2020 else {
33b8afdf 2021 r->anchored_substr = r->anchored_utf8 = Nullsv;
c277df42 2022 SvREFCNT_dec(data.longest_fixed);
c5254dd6 2023 longest_fixed_length = 0;
a0d0e21e 2024 }
b81d288d 2025 if (r->regstclass
ffc61ed2 2026 && (OP(r->regstclass) == REG_ANY || OP(r->regstclass) == SANY))
653099ff 2027 r->regstclass = NULL;
33b8afdf
JH
2028 if ((!(r->anchored_substr || r->anchored_utf8) || r->anchored_offset)
2029 && stclass_flag
653099ff 2030 && !(data.start_class->flags & ANYOF_EOS)
eb160463
GS
2031 && !cl_is_anything(data.start_class))
2032 {
830247a4 2033 I32 n = add_data(pRExC_state, 1, "f");
653099ff 2034
b81d288d 2035 New(1006, RExC_rx->data->data[n], 1,
653099ff
GS
2036 struct regnode_charclass_class);
2037 StructCopy(data.start_class,
830247a4 2038 (struct regnode_charclass_class*)RExC_rx->data->data[n],
653099ff 2039 struct regnode_charclass_class);
830247a4 2040 r->regstclass = (regnode*)RExC_rx->data->data[n];
653099ff 2041 r->reganch &= ~ROPT_SKIP; /* Used in find_byclass(). */
ffc61ed2 2042 PL_regdata = r->data; /* for regprop() */
9c5ffd7c
JH
2043 DEBUG_r({ SV *sv = sv_newmortal();
2044 regprop(sv, (regnode*)data.start_class);
2045 PerlIO_printf(Perl_debug_log,
2046 "synthetic stclass `%s'.\n",
2047 SvPVX(sv));});
653099ff 2048 }
c277df42
IZ
2049
2050 /* A temporary algorithm prefers floated substr to fixed one to dig more info. */
c5254dd6 2051 if (longest_fixed_length > longest_float_length) {
c277df42 2052 r->check_substr = r->anchored_substr;
33b8afdf 2053 r->check_utf8 = r->anchored_utf8;
c277df42
IZ
2054 r->check_offset_min = r->check_offset_max = r->anchored_offset;
2055 if (r->reganch & ROPT_ANCH_SINGLE)
2056 r->reganch |= ROPT_NOSCAN;
a0ed51b3
LW
2057 }
2058 else {
c277df42 2059 r->check_substr = r->float_substr;
33b8afdf 2060 r->check_utf8 = r->float_utf8;
c277df42
IZ
2061 r->check_offset_min = data.offset_float_min;
2062 r->check_offset_max = data.offset_float_max;
a0d0e21e 2063 }
30382c73
IZ
2064 /* XXXX Currently intuiting is not compatible with ANCH_GPOS.
2065 This should be changed ASAP! */
33b8afdf 2066 if ((r->check_substr || r->check_utf8) && !(r->reganch & ROPT_ANCH_GPOS)) {
cad2e5aa 2067 r->reganch |= RE_USE_INTUIT;
33b8afdf 2068 if (SvTAIL(r->check_substr ? r->check_substr : r->check_utf8))
cad2e5aa
JH
2069 r->reganch |= RE_INTUIT_TAIL;
2070 }
a0ed51b3
LW
2071 }
2072 else {
c277df42
IZ
2073 /* Several toplevels. Best we can is to set minlen. */
2074 I32 fake;
653099ff 2075 struct regnode_charclass_class ch_class;
cb434fcc 2076 I32 last_close = 0;
c277df42
IZ
2077
2078 DEBUG_r(PerlIO_printf(Perl_debug_log, "\n"));
2079 scan = r->program + 1;
830247a4 2080 cl_init(pRExC_state, &ch_class);
653099ff 2081 data.start_class = &ch_class;
cb434fcc 2082 data.last_closep = &last_close;
e1901655 2083 minlen = study_chunk(pRExC_state, &scan, &fake, scan + RExC_size, &data, SCF_DO_STCLASS_AND|SCF_WHILEM_VISITED_POS);
33b8afdf
JH
2084 r->check_substr = r->check_utf8 = r->anchored_substr = r->anchored_utf8
2085 = r->float_substr = r->float_utf8 = Nullsv;
653099ff 2086 if (!(data.start_class->flags & ANYOF_EOS)
eb160463
GS
2087 && !cl_is_anything(data.start_class))
2088 {
830247a4 2089 I32 n = add_data(pRExC_state, 1, "f");
653099ff 2090
b81d288d 2091 New(1006, RExC_rx->data->data[n], 1,
653099ff
GS
2092 struct regnode_charclass_class);
2093 StructCopy(data.start_class,
830247a4 2094 (struct regnode_charclass_class*)RExC_rx->data->data[n],
653099ff 2095 struct regnode_charclass_class);
830247a4 2096 r->regstclass = (regnode*)RExC_rx->data->data[n];
653099ff 2097 r->reganch &= ~ROPT_SKIP; /* Used in find_byclass(). */
9c5ffd7c
JH
2098 DEBUG_r({ SV* sv = sv_newmortal();
2099 regprop(sv, (regnode*)data.start_class);
2100 PerlIO_printf(Perl_debug_log,
2101 "synthetic stclass `%s'.\n",
2102 SvPVX(sv));});
653099ff 2103 }
a0d0e21e
LW
2104 }
2105
a0d0e21e 2106 r->minlen = minlen;
b81d288d 2107 if (RExC_seen & REG_SEEN_GPOS)
c277df42 2108 r->reganch |= ROPT_GPOS_SEEN;
830247a4 2109 if (RExC_seen & REG_SEEN_LOOKBEHIND)
c277df42 2110 r->reganch |= ROPT_LOOKBEHIND_SEEN;
830247a4 2111 if (RExC_seen & REG_SEEN_EVAL)
ce862d02 2112 r->reganch |= ROPT_EVAL_SEEN;
f33976b4
DB
2113 if (RExC_seen & REG_SEEN_CANY)
2114 r->reganch |= ROPT_CANY_SEEN;
830247a4
IZ
2115 Newz(1002, r->startp, RExC_npar, I32);
2116 Newz(1002, r->endp, RExC_npar, I32);
ffc61ed2 2117 PL_regdata = r->data; /* for regprop() */
a0d0e21e
LW
2118 DEBUG_r(regdump(r));
2119 return(r);
a687059c
LW
2120}
2121
2122/*
2123 - reg - regular expression, i.e. main body or parenthesized thing
2124 *
2125 * Caller must absorb opening parenthesis.
2126 *
2127 * Combining parenthesis handling with the base level of regular expression
2128 * is a trifle forced, but the need to tie the tails of the branches to what
2129 * follows makes it hard to avoid.
2130 */
76e3520e 2131STATIC regnode *
830247a4 2132S_reg(pTHX_ RExC_state_t *pRExC_state, I32 paren, I32 *flagp)
c277df42 2133 /* paren: Parenthesized? 0=top, 1=(, inside: changed to letter. */
a687059c 2134{
c277df42
IZ
2135 register regnode *ret; /* Will be the head of the group. */
2136 register regnode *br;
2137 register regnode *lastbr;
2138 register regnode *ender = 0;
a0d0e21e 2139 register I32 parno = 0;
e2509266 2140 I32 flags, oregflags = RExC_flags, have_branch = 0, open = 0;
9d1d55b5
JP
2141
2142 /* for (?g), (?gc), and (?o) warnings; warning
2143 about (?c) will warn about (?g) -- japhy */
2144
2145 I32 wastedflags = 0x00,
2146 wasted_o = 0x01,
2147 wasted_g = 0x02,
2148 wasted_gc = 0x02 | 0x04,
2149 wasted_c = 0x04;
2150
fac92740 2151 char * parse_start = RExC_parse; /* MJD */
830247a4 2152 char *oregcomp_parse = RExC_parse;
c277df42 2153 char c;
a0d0e21e 2154
821b33a5 2155 *flagp = 0; /* Tentatively. */
a0d0e21e 2156
9d1d55b5 2157
a0d0e21e
LW
2158 /* Make an OPEN node, if parenthesized. */
2159 if (paren) {
fac92740 2160 if (*RExC_parse == '?') { /* (?...) */
2b36a5a0
JH
2161 U32 posflags = 0, negflags = 0;
2162 U32 *flagsp = &posflags;
0f5d15d6 2163 int logical = 0;
830247a4 2164 char *seqstart = RExC_parse;
ca9dfc88 2165
830247a4
IZ
2166 RExC_parse++;
2167 paren = *RExC_parse++;
c277df42 2168 ret = NULL; /* For look-ahead/behind. */
a0d0e21e 2169 switch (paren) {
fac92740 2170 case '<': /* (?<...) */
830247a4 2171 RExC_seen |= REG_SEEN_LOOKBEHIND;
b81d288d 2172 if (*RExC_parse == '!')
c277df42 2173 paren = ',';
b81d288d 2174 if (*RExC_parse != '=' && *RExC_parse != '!')
c277df42 2175 goto unknown;
830247a4 2176 RExC_parse++;
fac92740
MJD
2177 case '=': /* (?=...) */
2178 case '!': /* (?!...) */
830247a4 2179 RExC_seen_zerolen++;
fac92740
MJD
2180 case ':': /* (?:...) */
2181 case '>': /* (?>...) */
a0d0e21e 2182 break;
fac92740
MJD
2183 case '$': /* (?$...) */
2184 case '@': /* (?@...) */
8615cb43 2185 vFAIL2("Sequence (?%c...) not implemented", (int)paren);
a0d0e21e 2186 break;
fac92740 2187 case '#': /* (?#...) */
830247a4
IZ
2188 while (*RExC_parse && *RExC_parse != ')')
2189 RExC_parse++;
2190 if (*RExC_parse != ')')
c277df42 2191 FAIL("Sequence (?#... not terminated");
830247a4 2192 nextchar(pRExC_state);
a0d0e21e
LW
2193 *flagp = TRYAGAIN;
2194 return NULL;
fac92740 2195 case 'p': /* (?p...) */
9014280d 2196 if (SIZE_ONLY && ckWARN2(WARN_DEPRECATED, WARN_REGEXP))
f9373011 2197 vWARNdep(RExC_parse, "(?p{}) is deprecated - use (??{})");
8c8ad484 2198 /* FALL THROUGH*/
fac92740 2199 case '?': /* (??...) */
0f5d15d6 2200 logical = 1;
438a3801
YST
2201 if (*RExC_parse != '{')
2202 goto unknown;
830247a4 2203 paren = *RExC_parse++;
0f5d15d6 2204 /* FALL THROUGH */
fac92740 2205 case '{': /* (?{...}) */
c277df42 2206 {
c277df42
IZ
2207 I32 count = 1, n = 0;
2208 char c;
830247a4 2209 char *s = RExC_parse;
c277df42
IZ
2210 SV *sv;
2211 OP_4tree *sop, *rop;
2212
830247a4
IZ
2213 RExC_seen_zerolen++;
2214 RExC_seen |= REG_SEEN_EVAL;
2215 while (count && (c = *RExC_parse)) {
2216 if (c == '\\' && RExC_parse[1])
2217 RExC_parse++;
b81d288d 2218 else if (c == '{')
c277df42 2219 count++;
b81d288d 2220 else if (c == '}')
c277df42 2221 count--;
830247a4 2222 RExC_parse++;
c277df42 2223 }
830247a4 2224 if (*RExC_parse != ')')
b45f050a 2225 {
b81d288d 2226 RExC_parse = s;
b45f050a
JF
2227 vFAIL("Sequence (?{...}) not terminated or not {}-balanced");
2228 }
c277df42 2229 if (!SIZE_ONLY) {
f3548bdc 2230 PAD *pad;
b81d288d
AB
2231
2232 if (RExC_parse - 1 - s)
830247a4 2233 sv = newSVpvn(s, RExC_parse - 1 - s);
c277df42 2234 else
79cb57f6 2235 sv = newSVpvn("", 0);
c277df42 2236
569233ed
SB
2237 ENTER;
2238 Perl_save_re_context(aTHX);
f3548bdc 2239 rop = sv_compile_2op(sv, &sop, "re", &pad);
9b978d73
DM
2240 sop->op_private |= OPpREFCOUNTED;
2241 /* re_dup will OpREFCNT_inc */
2242 OpREFCNT_set(sop, 1);
569233ed 2243 LEAVE;
c277df42 2244
830247a4
IZ
2245 n = add_data(pRExC_state, 3, "nop");
2246 RExC_rx->data->data[n] = (void*)rop;
2247 RExC_rx->data->data[n+1] = (void*)sop;
f3548bdc 2248 RExC_rx->data->data[n+2] = (void*)pad;
c277df42 2249 SvREFCNT_dec(sv);
a0ed51b3 2250 }
e24b16f9 2251 else { /* First pass */
830247a4 2252 if (PL_reginterp_cnt < ++RExC_seen_evals
e24b16f9 2253 && PL_curcop != &PL_compiling)
2cd61cdb
IZ
2254 /* No compiled RE interpolated, has runtime
2255 components ===> unsafe. */
2256 FAIL("Eval-group not allowed at runtime, use re 'eval'");
5b61d3f7 2257 if (PL_tainting && PL_tainted)
cc6b7395 2258 FAIL("Eval-group in insecure regular expression");
c277df42
IZ
2259 }
2260
830247a4 2261 nextchar(pRExC_state);
0f5d15d6 2262 if (logical) {
830247a4 2263 ret = reg_node(pRExC_state, LOGICAL);
0f5d15d6
IZ
2264 if (!SIZE_ONLY)
2265 ret->flags = 2;
830247a4 2266 regtail(pRExC_state, ret, reganode(pRExC_state, EVAL, n));
fac92740 2267 /* deal with the length of this later - MJD */
0f5d15d6
IZ
2268 return ret;
2269 }
ccb2c380
MP
2270 ret = reganode(pRExC_state, EVAL, n);
2271 Set_Node_Length(ret, RExC_parse - parse_start + 1);
2272 Set_Node_Offset(ret, parse_start);
2273 return ret;
c277df42 2274 }
fac92740 2275 case '(': /* (?(?{...})...) and (?(?=...)...) */
c277df42 2276 {
fac92740 2277 if (RExC_parse[0] == '?') { /* (?(?...)) */
b81d288d
AB
2278 if (RExC_parse[1] == '=' || RExC_parse[1] == '!'
2279 || RExC_parse[1] == '<'
830247a4 2280 || RExC_parse[1] == '{') { /* Lookahead or eval. */
c277df42
IZ
2281 I32 flag;
2282
830247a4 2283 ret = reg_node(pRExC_state, LOGICAL);
0f5d15d6
IZ
2284 if (!SIZE_ONLY)
2285 ret->flags = 1;
830247a4 2286 regtail(pRExC_state, ret, reg(pRExC_state, 1, &flag));
c277df42 2287 goto insert_if;
b81d288d 2288 }
a0ed51b3 2289 }
830247a4 2290 else if (RExC_parse[0] >= '1' && RExC_parse[0] <= '9' ) {
fac92740 2291 /* (?(1)...) */
830247a4 2292 parno = atoi(RExC_parse++);
c277df42 2293
830247a4
IZ
2294 while (isDIGIT(*RExC_parse))
2295 RExC_parse++;
fac92740
MJD
2296 ret = reganode(pRExC_state, GROUPP, parno);
2297
830247a4 2298 if ((c = *nextchar(pRExC_state)) != ')')
b45f050a 2299 vFAIL("Switch condition not recognized");
c277df42 2300 insert_if:
830247a4
IZ
2301 regtail(pRExC_state, ret, reganode(pRExC_state, IFTHEN, 0));
2302 br = regbranch(pRExC_state, &flags, 1);
c277df42 2303 if (br == NULL)
830247a4 2304 br = reganode(pRExC_state, LONGJMP, 0);
c277df42 2305 else
830247a4
IZ
2306 regtail(pRExC_state, br, reganode(pRExC_state, LONGJMP, 0));
2307 c = *nextchar(pRExC_state);
d1b80229
IZ
2308 if (flags&HASWIDTH)
2309 *flagp |= HASWIDTH;
c277df42 2310 if (c == '|') {
830247a4
IZ
2311 lastbr = reganode(pRExC_state, IFTHEN, 0); /* Fake one for optimizer. */
2312 regbranch(pRExC_state, &flags, 1);
2313 regtail(pRExC_state, ret, lastbr);
d1b80229
IZ
2314 if (flags&HASWIDTH)
2315 *flagp |= HASWIDTH;
830247a4 2316 c = *nextchar(pRExC_state);
a0ed51b3
LW
2317 }
2318 else
c277df42
IZ
2319 lastbr = NULL;
2320 if (c != ')')
8615cb43 2321 vFAIL("Switch (?(condition)... contains too many branches");
830247a4
IZ
2322 ender = reg_node(pRExC_state, TAIL);
2323 regtail(pRExC_state, br, ender);
c277df42 2324 if (lastbr) {
830247a4
IZ
2325 regtail(pRExC_state, lastbr, ender);
2326 regtail(pRExC_state, NEXTOPER(NEXTOPER(lastbr)), ender);
a0ed51b3
LW
2327 }
2328 else
830247a4 2329 regtail(pRExC_state, ret, ender);
c277df42 2330 return ret;
a0ed51b3
LW
2331 }
2332 else {
830247a4 2333 vFAIL2("Unknown switch condition (?(%.2s", RExC_parse);
c277df42
IZ
2334 }
2335 }
1b1626e4 2336 case 0:
830247a4 2337 RExC_parse--; /* for vFAIL to print correctly */
8615cb43 2338 vFAIL("Sequence (? incomplete");
1b1626e4 2339 break;
a0d0e21e 2340 default:
830247a4 2341 --RExC_parse;
fac92740 2342 parse_flags: /* (?i) */
830247a4 2343 while (*RExC_parse && strchr("iogcmsx", *RExC_parse)) {
9d1d55b5
JP
2344 /* (?g), (?gc) and (?o) are useless here
2345 and must be globally applied -- japhy */
2346
2347 if (*RExC_parse == 'o' || *RExC_parse == 'g') {
2348 if (SIZE_ONLY && ckWARN(WARN_REGEXP)) {
2349 I32 wflagbit = *RExC_parse == 'o' ? wasted_o : wasted_g;
2350 if (! (wastedflags & wflagbit) ) {
2351 wastedflags |= wflagbit;
2352 vWARN5(
2353 RExC_parse + 1,
2354 "Useless (%s%c) - %suse /%c modifier",
2355 flagsp == &negflags ? "?-" : "?",
2356 *RExC_parse,
2357 flagsp == &negflags ? "don't " : "",
2358 *RExC_parse
2359 );
2360 }
2361 }
2362 }
2363 else if (*RExC_parse == 'c') {
2364 if (SIZE_ONLY && ckWARN(WARN_REGEXP)) {
2365 if (! (wastedflags & wasted_c) ) {
2366 wastedflags |= wasted_gc;
2367 vWARN3(
2368 RExC_parse + 1,
2369 "Useless (%sc) - %suse /gc modifier",
2370 flagsp == &negflags ? "?-" : "?",
2371 flagsp == &negflags ? "don't " : ""
2372 );
2373 }
2374 }
2375 }
2376 else { pmflag(flagsp, *RExC_parse); }
2377
830247a4 2378 ++RExC_parse;
ca9dfc88 2379 }
830247a4 2380 if (*RExC_parse == '-') {
ca9dfc88 2381 flagsp = &negflags;
9d1d55b5 2382 wastedflags = 0; /* reset so (?g-c) warns twice */
830247a4 2383 ++RExC_parse;
ca9dfc88 2384 goto parse_flags;
48c036b1 2385 }
e2509266
JH
2386 RExC_flags |= posflags;
2387 RExC_flags &= ~negflags;
830247a4
IZ
2388 if (*RExC_parse == ':') {
2389 RExC_parse++;
ca9dfc88
IZ
2390 paren = ':';
2391 break;
2392 }
c277df42 2393 unknown:
830247a4
IZ
2394 if (*RExC_parse != ')') {
2395 RExC_parse++;
2396 vFAIL3("Sequence (%.*s...) not recognized", RExC_parse-seqstart, seqstart);
b45f050a 2397 }
830247a4 2398 nextchar(pRExC_state);
a0d0e21e
LW
2399 *flagp = TRYAGAIN;
2400 return NULL;
2401 }
2402 }
fac92740 2403 else { /* (...) */
830247a4
IZ
2404 parno = RExC_npar;
2405 RExC_npar++;
2406 ret = reganode(pRExC_state, OPEN, parno);
fac92740
MJD
2407 Set_Node_Length(ret, 1); /* MJD */
2408 Set_Node_Offset(ret, RExC_parse); /* MJD */
c277df42 2409 open = 1;
a0d0e21e 2410 }
a0ed51b3 2411 }
fac92740 2412 else /* ! paren */
a0d0e21e
LW
2413 ret = NULL;
2414
2415 /* Pick up the branches, linking them together. */
fac92740 2416 parse_start = RExC_parse; /* MJD */
830247a4 2417 br = regbranch(pRExC_state, &flags, 1);
fac92740
MJD
2418 /* branch_len = (paren != 0); */
2419
a0d0e21e
LW
2420 if (br == NULL)
2421 return(NULL);
830247a4
IZ
2422 if (*RExC_parse == '|') {
2423 if (!SIZE_ONLY && RExC_extralen) {
2424 reginsert(pRExC_state, BRANCHJ, br);
a0ed51b3 2425 }
fac92740 2426 else { /* MJD */
830247a4 2427 reginsert(pRExC_state, BRANCH, br);
fac92740
MJD
2428 Set_Node_Length(br, paren != 0);
2429 Set_Node_Offset_To_R(br-RExC_emit_start, parse_start-RExC_start);
2430 }
c277df42
IZ
2431 have_branch = 1;
2432 if (SIZE_ONLY)
830247a4 2433 RExC_extralen += 1; /* For BRANCHJ-BRANCH. */
a0ed51b3
LW
2434 }
2435 else if (paren == ':') {
c277df42
IZ
2436 *flagp |= flags&SIMPLE;
2437 }
2438 if (open) { /* Starts with OPEN. */
830247a4 2439 regtail(pRExC_state, ret, br); /* OPEN -> first. */
a0ed51b3
LW
2440 }
2441 else if (paren != '?') /* Not Conditional */
a0d0e21e 2442 ret = br;
32a0ca98 2443 *flagp |= flags & (SPSTART | HASWIDTH);
c277df42 2444 lastbr = br;
830247a4
IZ
2445 while (*RExC_parse == '|') {
2446 if (!SIZE_ONLY && RExC_extralen) {
2447 ender = reganode(pRExC_state, LONGJMP,0);
2448 regtail(pRExC_state, NEXTOPER(NEXTOPER(lastbr)), ender); /* Append to the previous. */
c277df42
IZ
2449 }
2450 if (SIZE_ONLY)
830247a4
IZ
2451 RExC_extralen += 2; /* Account for LONGJMP. */
2452 nextchar(pRExC_state);
2453 br = regbranch(pRExC_state, &flags, 0);
fac92740 2454
a687059c 2455 if (br == NULL)
a0d0e21e 2456 return(NULL);
830247a4 2457 regtail(pRExC_state, lastbr, br); /* BRANCH -> BRANCH. */
c277df42 2458 lastbr = br;
821b33a5
IZ
2459 if (flags&HASWIDTH)
2460 *flagp |= HASWIDTH;
a687059c 2461 *flagp |= flags&SPSTART;
a0d0e21e
LW
2462 }
2463
c277df42
IZ
2464 if (have_branch || paren != ':') {
2465 /* Make a closing node, and hook it on the end. */
2466 switch (paren) {
2467 case ':':
830247a4 2468 ender = reg_node(pRExC_state, TAIL);
c277df42
IZ
2469 break;
2470 case 1:
830247a4 2471 ender = reganode(pRExC_state, CLOSE, parno);
fac92740
MJD
2472 Set_Node_Offset(ender,RExC_parse+1); /* MJD */
2473 Set_Node_Length(ender,1); /* MJD */
c277df42
IZ
2474 break;
2475 case '<':
c277df42
IZ
2476 case ',':
2477 case '=':
2478 case '!':
c277df42 2479 *flagp &= ~HASWIDTH;
821b33a5
IZ
2480 /* FALL THROUGH */
2481 case '>':
830247a4 2482 ender = reg_node(pRExC_state, SUCCEED);
c277df42
IZ
2483 break;
2484 case 0:
830247a4 2485 ender = reg_node(pRExC_state, END);
c277df42
IZ
2486 break;
2487 }
830247a4 2488 regtail(pRExC_state, lastbr, ender);
a0d0e21e 2489
c277df42
IZ
2490 if (have_branch) {
2491 /* Hook the tails of the branches to the closing node. */
2492 for (br = ret; br != NULL; br = regnext(br)) {
830247a4 2493 regoptail(pRExC_state, br, ender);
c277df42
IZ
2494 }
2495 }
a0d0e21e 2496 }
c277df42
IZ
2497
2498 {
2499 char *p;
2500 static char parens[] = "=!<,>";
2501
2502 if (paren && (p = strchr(parens, paren))) {
eb160463 2503 U8 node = ((p - parens) % 2) ? UNLESSM : IFMATCH;
c277df42
IZ
2504 int flag = (p - parens) > 1;
2505
2506 if (paren == '>')
2507 node = SUSPEND, flag = 0;
830247a4 2508 reginsert(pRExC_state, node,ret);
ccb2c380
MP
2509 Set_Node_Offset(ret, oregcomp_parse);
2510 Set_Node_Length(ret, RExC_parse - oregcomp_parse + 2);
c277df42 2511 ret->flags = flag;
830247a4 2512 regtail(pRExC_state, ret, reg_node(pRExC_state, TAIL));
c277df42 2513 }
a0d0e21e
LW
2514 }
2515
2516 /* Check for proper termination. */
ce3e6498 2517 if (paren) {
e2509266 2518 RExC_flags = oregflags;
830247a4
IZ
2519 if (RExC_parse >= RExC_end || *nextchar(pRExC_state) != ')') {
2520 RExC_parse = oregcomp_parse;
380a0633 2521 vFAIL("Unmatched (");
ce3e6498 2522 }
a0ed51b3 2523 }
830247a4
IZ
2524 else if (!paren && RExC_parse < RExC_end) {
2525 if (*RExC_parse == ')') {
2526 RExC_parse++;
380a0633 2527 vFAIL("Unmatched )");
a0ed51b3
LW
2528 }
2529 else
b45f050a 2530 FAIL("Junk on end of regexp"); /* "Can't happen". */
a0d0e21e
LW
2531 /* NOTREACHED */
2532 }
a687059c 2533
a0d0e21e 2534 return(ret);
a687059c
LW
2535}
2536
2537/*
2538 - regbranch - one alternative of an | operator
2539 *
2540 * Implements the concatenation operator.
2541 */
76e3520e 2542STATIC regnode *
830247a4 2543S_regbranch(pTHX_ RExC_state_t *pRExC_state, I32 *flagp, I32 first)
a687059c 2544{
c277df42
IZ
2545 register regnode *ret;
2546 register regnode *chain = NULL;
2547 register regnode *latest;
2548 I32 flags = 0, c = 0;
a0d0e21e 2549
b81d288d 2550 if (first)
c277df42
IZ
2551 ret = NULL;
2552 else {
b81d288d 2553 if (!SIZE_ONLY && RExC_extralen)
830247a4 2554 ret = reganode(pRExC_state, BRANCHJ,0);
fac92740 2555 else {
830247a4 2556 ret = reg_node(pRExC_state, BRANCH);
fac92740
MJD
2557 Set_Node_Length(ret, 1);
2558 }
c277df42
IZ
2559 }
2560
b81d288d 2561 if (!first && SIZE_ONLY)
830247a4 2562 RExC_extralen += 1; /* BRANCHJ */
b81d288d 2563
c277df42 2564 *flagp = WORST; /* Tentatively. */
a0d0e21e 2565
830247a4
IZ
2566 RExC_parse--;
2567 nextchar(pRExC_state);
2568 while (RExC_parse < RExC_end && *RExC_parse != '|' && *RExC_parse != ')') {
a0d0e21e 2569 flags &= ~TRYAGAIN;
830247a4 2570 latest = regpiece(pRExC_state, &flags);
a0d0e21e
LW
2571 if (latest == NULL) {
2572 if (flags & TRYAGAIN)
2573 continue;
2574 return(NULL);
a0ed51b3
LW
2575 }
2576 else if (ret == NULL)
c277df42 2577 ret = latest;
a0d0e21e 2578 *flagp |= flags&HASWIDTH;
c277df42 2579 if (chain == NULL) /* First piece. */
a0d0e21e
LW
2580 *flagp |= flags&SPSTART;
2581 else {
830247a4
IZ
2582 RExC_naughty++;
2583 regtail(pRExC_state, chain, latest);
a687059c 2584 }
a0d0e21e 2585 chain = latest;
c277df42
IZ
2586 c++;
2587 }
2588 if (chain == NULL) { /* Loop ran zero times. */
830247a4 2589 chain = reg_node(pRExC_state, NOTHING);
c277df42
IZ
2590 if (ret == NULL)
2591 ret = chain;
2592 }
2593 if (c == 1) {
2594 *flagp |= flags&SIMPLE;
a0d0e21e 2595 }
a687059c 2596
a0d0e21e 2597 return(ret);
a687059c
LW
2598}
2599
2600/*
2601 - regpiece - something followed by possible [*+?]
2602 *
2603 * Note that the branching code sequences used for ? and the general cases
2604 * of * and + are somewhat optimized: they use the same NOTHING node as
2605 * both the endmarker for their branch list and the body of the last branch.
2606 * It might seem that this node could be dispensed with entirely, but the
2607 * endmarker role is not redundant.
2608 */
76e3520e 2609STATIC regnode *
830247a4 2610S_regpiece(pTHX_ RExC_state_t *pRExC_state, I32 *flagp)
a687059c 2611{
c277df42 2612 register regnode *ret;
a0d0e21e
LW
2613 register char op;
2614 register char *next;
2615 I32 flags;
830247a4 2616 char *origparse = RExC_parse;
a0d0e21e
LW
2617 char *maxpos;
2618 I32 min;
c277df42 2619 I32 max = REG_INFTY;
fac92740 2620 char *parse_start;
a0d0e21e 2621
830247a4 2622 ret = regatom(pRExC_state, &flags);
a0d0e21e
LW
2623 if (ret == NULL) {
2624 if (flags & TRYAGAIN)
2625 *flagp |= TRYAGAIN;
2626 return(NULL);
2627 }
2628
830247a4 2629 op = *RExC_parse;
a0d0e21e 2630
830247a4 2631 if (op == '{' && regcurly(RExC_parse)) {
fac92740 2632 parse_start = RExC_parse; /* MJD */
830247a4 2633 next = RExC_parse + 1;
a0d0e21e
LW
2634 maxpos = Nullch;
2635 while (isDIGIT(*next) || *next == ',') {
2636 if (*next == ',') {
2637 if (maxpos)
2638 break;
2639 else
2640 maxpos = next;
a687059c 2641 }
a0d0e21e
LW
2642 next++;
2643 }
2644 if (*next == '}') { /* got one */
2645 if (!maxpos)
2646 maxpos = next;
830247a4
IZ
2647 RExC_parse++;
2648 min = atoi(RExC_parse);
a0d0e21e
LW
2649 if (*maxpos == ',')
2650 maxpos++;
2651 else
830247a4 2652 maxpos = RExC_parse;
a0d0e21e
LW
2653 max = atoi(maxpos);
2654 if (!max && *maxpos != '0')
c277df42
IZ
2655 max = REG_INFTY; /* meaning "infinity" */
2656 else if (max >= REG_INFTY)
8615cb43 2657 vFAIL2("Quantifier in {,} bigger than %d", REG_INFTY - 1);
830247a4
IZ
2658 RExC_parse = next;
2659 nextchar(pRExC_state);
a0d0e21e
LW
2660
2661 do_curly:
2662 if ((flags&SIMPLE)) {
830247a4
IZ
2663 RExC_naughty += 2 + RExC_naughty / 2;
2664 reginsert(pRExC_state, CURLY, ret);
fac92740
MJD
2665 Set_Node_Offset(ret, parse_start+1); /* MJD */
2666 Set_Node_Cur_Length(ret);
a0d0e21e
LW
2667 }
2668 else {
830247a4 2669 regnode *w = reg_node(pRExC_state, WHILEM);
2c2d71f5
JH
2670
2671 w->flags = 0;
830247a4
IZ
2672 regtail(pRExC_state, ret, w);
2673 if (!SIZE_ONLY && RExC_extralen) {
2674 reginsert(pRExC_state, LONGJMP,ret);
2675 reginsert(pRExC_state, NOTHING,ret);
c277df42
IZ
2676 NEXT_OFF(ret) = 3; /* Go over LONGJMP. */
2677 }
830247a4 2678 reginsert(pRExC_state, CURLYX,ret);
fac92740
MJD
2679 /* MJD hk */
2680 Set_Node_Offset(ret, parse_start+1);
2681 Set_Node_Length(ret,
2682 op == '{' ? (RExC_parse - parse_start) : 1);
2683
830247a4 2684 if (!SIZE_ONLY && RExC_extralen)
c277df42 2685 NEXT_OFF(ret) = 3; /* Go over NOTHING to LONGJMP. */
830247a4 2686 regtail(pRExC_state, ret, reg_node(pRExC_state, NOTHING));
c277df42 2687 if (SIZE_ONLY)
830247a4
IZ
2688 RExC_whilem_seen++, RExC_extralen += 3;
2689 RExC_naughty += 4 + RExC_naughty; /* compound interest */
a0d0e21e 2690 }
c277df42 2691 ret->flags = 0;
a0d0e21e
LW
2692
2693 if (min > 0)
821b33a5
IZ
2694 *flagp = WORST;
2695 if (max > 0)
2696 *flagp |= HASWIDTH;
a0d0e21e 2697 if (max && max < min)
8615cb43 2698 vFAIL("Can't do {n,m} with n > m");
c277df42 2699 if (!SIZE_ONLY) {
eb160463
GS
2700 ARG1_SET(ret, (U16)min);
2701 ARG2_SET(ret, (U16)max);
a687059c 2702 }
a687059c 2703
a0d0e21e 2704 goto nest_check;
a687059c 2705 }
a0d0e21e 2706 }
a687059c 2707
a0d0e21e
LW
2708 if (!ISMULT1(op)) {
2709 *flagp = flags;
a687059c 2710 return(ret);
a0d0e21e 2711 }
bb20fd44 2712
c277df42 2713#if 0 /* Now runtime fix should be reliable. */
b45f050a
JF
2714
2715 /* if this is reinstated, don't forget to put this back into perldiag:
2716
2717 =item Regexp *+ operand could be empty at {#} in regex m/%s/
2718
2719 (F) The part of the regexp subject to either the * or + quantifier
2720 could match an empty string. The {#} shows in the regular
2721 expression about where the problem was discovered.
2722
2723 */
2724
bb20fd44 2725 if (!(flags&HASWIDTH) && op != '?')
b45f050a 2726 vFAIL("Regexp *+ operand could be empty");
b81d288d 2727#endif
bb20fd44 2728
fac92740 2729 parse_start = RExC_parse;
830247a4 2730 nextchar(pRExC_state);
a0d0e21e 2731
821b33a5 2732 *flagp = (op != '+') ? (WORST|SPSTART|HASWIDTH) : (WORST|HASWIDTH);
a0d0e21e
LW
2733
2734 if (op == '*' && (flags&SIMPLE)) {
830247a4 2735 reginsert(pRExC_state, STAR, ret);
c277df42 2736 ret->flags = 0;
830247a4 2737 RExC_naughty += 4;
a0d0e21e
LW
2738 }
2739 else if (op == '*') {
2740 min = 0;
2741 goto do_curly;
a0ed51b3
LW
2742 }
2743 else if (op == '+' && (flags&SIMPLE)) {
830247a4 2744 reginsert(pRExC_state, PLUS, ret);
c277df42 2745 ret->flags = 0;
830247a4 2746 RExC_naughty += 3;
a0d0e21e
LW
2747 }
2748 else if (op == '+') {
2749 min = 1;
2750 goto do_curly;
a0ed51b3
LW
2751 }
2752 else if (op == '?') {
a0d0e21e
LW
2753 min = 0; max = 1;
2754 goto do_curly;
2755 }
2756 nest_check:
e476b1b5 2757 if (ckWARN(WARN_REGEXP) && !SIZE_ONLY && !(flags&HASWIDTH) && max > REG_INFTY/3) {
830247a4 2758 vWARN3(RExC_parse,
b45f050a 2759 "%.*s matches null string many times",
830247a4 2760 RExC_parse - origparse,
b45f050a 2761 origparse);
a0d0e21e
LW
2762 }
2763
830247a4
IZ
2764 if (*RExC_parse == '?') {
2765 nextchar(pRExC_state);
2766 reginsert(pRExC_state, MINMOD, ret);
2767 regtail(pRExC_state, ret, ret + NODE_STEP_REGNODE);
a0d0e21e 2768 }
830247a4
IZ
2769 if (ISMULT2(RExC_parse)) {
2770 RExC_parse++;
b45f050a
JF
2771 vFAIL("Nested quantifiers");
2772 }
a0d0e21e
LW
2773
2774 return(ret);
a687059c
LW
2775}
2776
2777/*
2778 - regatom - the lowest level
2779 *
2780 * Optimization: gobbles an entire sequence of ordinary characters so that
2781 * it can turn them into a single node, which is smaller to store and
2782 * faster to run. Backslashed characters are exceptions, each becoming a
2783 * separate node; the code is simpler that way and it's not worth fixing.
2784 *
b45f050a 2785 * [Yes, it is worth fixing, some scripts can run twice the speed.] */
76e3520e 2786STATIC regnode *
830247a4 2787S_regatom(pTHX_ RExC_state_t *pRExC_state, I32 *flagp)
a687059c 2788{
c277df42 2789 register regnode *ret = 0;
a0d0e21e 2790 I32 flags;
f06dbbb7 2791 char *parse_start = 0;
a0d0e21e
LW
2792
2793 *flagp = WORST; /* Tentatively. */
2794
2795tryagain:
830247a4 2796 switch (*RExC_parse) {
a0d0e21e 2797 case '^':
830247a4
IZ
2798 RExC_seen_zerolen++;
2799 nextchar(pRExC_state);
e2509266 2800 if (RExC_flags & PMf_MULTILINE)
830247a4 2801 ret = reg_node(pRExC_state, MBOL);
e2509266 2802 else if (RExC_flags & PMf_SINGLELINE)
830247a4 2803 ret = reg_node(pRExC_state, SBOL);
a0d0e21e 2804 else
830247a4 2805 ret = reg_node(pRExC_state, BOL);
fac92740 2806 Set_Node_Length(ret, 1); /* MJD */
a0d0e21e
LW
2807 break;
2808 case '$':
830247a4 2809 nextchar(pRExC_state);
b81d288d 2810 if (*RExC_parse)
830247a4 2811 RExC_seen_zerolen++;
e2509266 2812 if (RExC_flags & PMf_MULTILINE)
830247a4 2813 ret = reg_node(pRExC_state, MEOL);
e2509266 2814 else if (RExC_flags & PMf_SINGLELINE)
830247a4 2815 ret = reg_node(pRExC_state, SEOL);
a0d0e21e 2816 else
830247a4 2817 ret = reg_node(pRExC_state, EOL);
fac92740 2818 Set_Node_Length(ret, 1); /* MJD */
a0d0e21e
LW
2819 break;
2820 case '.':
830247a4 2821 nextchar(pRExC_state);
e2509266 2822 if (RExC_flags & PMf_SINGLELINE)
ffc61ed2
JH
2823 ret = reg_node(pRExC_state, SANY);
2824 else
2825 ret = reg_node(pRExC_state, REG_ANY);
2826 *flagp |= HASWIDTH|SIMPLE;
830247a4 2827 RExC_naughty++;
fac92740 2828 Set_Node_Length(ret, 1); /* MJD */
a0d0e21e
LW
2829 break;
2830 case '[':
b45f050a 2831 {
830247a4 2832 char *oregcomp_parse = ++RExC_parse;
ffc61ed2 2833 ret = regclass(pRExC_state);
830247a4
IZ
2834 if (*RExC_parse != ']') {
2835 RExC_parse = oregcomp_parse;
b45f050a
JF
2836 vFAIL("Unmatched [");
2837 }
830247a4 2838 nextchar(pRExC_state);
a0d0e21e 2839 *flagp |= HASWIDTH|SIMPLE;
fac92740 2840 Set_Node_Length(ret, RExC_parse - oregcomp_parse + 1); /* MJD */
a0d0e21e 2841 break;
b45f050a 2842 }
a0d0e21e 2843 case '(':
830247a4
IZ
2844 nextchar(pRExC_state);
2845 ret = reg(pRExC_state, 1, &flags);
a0d0e21e 2846 if (ret == NULL) {
bf93d4cc 2847 if (flags & TRYAGAIN) {
830247a4 2848 if (RExC_parse == RExC_end) {
bf93d4cc
GS
2849 /* Make parent create an empty node if needed. */
2850 *flagp |= TRYAGAIN;
2851 return(NULL);
2852 }
a0d0e21e 2853 goto tryagain;
bf93d4cc 2854 }
a0d0e21e
LW
2855 return(NULL);
2856 }
c277df42 2857 *flagp |= flags&(HASWIDTH|SPSTART|SIMPLE);
a0d0e21e
LW
2858 break;
2859 case '|':
2860 case ')':
2861 if (flags & TRYAGAIN) {
2862 *flagp |= TRYAGAIN;
2863 return NULL;
2864 }
b45f050a 2865 vFAIL("Internal urp");
a0d0e21e
LW
2866 /* Supposed to be caught earlier. */
2867 break;
85afd4ae 2868 case '{':
830247a4
IZ
2869 if (!regcurly(RExC_parse)) {
2870 RExC_parse++;
85afd4ae
CS
2871 goto defchar;
2872 }
2873 /* FALL THROUGH */
a0d0e21e
LW
2874 case '?':
2875 case '+':
2876 case '*':
830247a4 2877 RExC_parse++;
b45f050a 2878 vFAIL("Quantifier follows nothing");
a0d0e21e
LW
2879 break;
2880 case '\\':
830247a4 2881 switch (*++RExC_parse) {
a0d0e21e 2882 case 'A':
830247a4
IZ
2883 RExC_seen_zerolen++;
2884 ret = reg_node(pRExC_state, SBOL);
a0d0e21e 2885 *flagp |= SIMPLE;
830247a4 2886 nextchar(pRExC_state);
fac92740 2887 Set_Node_Length(ret, 2); /* MJD */
a0d0e21e
LW
2888 break;
2889 case 'G':
830247a4
IZ
2890 ret = reg_node(pRExC_state, GPOS);
2891 RExC_seen |= REG_SEEN_GPOS;
a0d0e21e 2892 *flagp |= SIMPLE;
830247a4 2893 nextchar(pRExC_state);
fac92740 2894 Set_Node_Length(ret, 2); /* MJD */
a0d0e21e
LW
2895 break;
2896 case 'Z':
830247a4 2897 ret = reg_node(pRExC_state, SEOL);
a0d0e21e 2898 *flagp |= SIMPLE;
a1917ab9 2899 RExC_seen_zerolen++; /* Do not optimize RE away */
830247a4 2900 nextchar(pRExC_state);
a0d0e21e 2901 break;
b85d18e9 2902 case 'z':
830247a4 2903 ret = reg_node(pRExC_state, EOS);
b85d18e9 2904 *flagp |= SIMPLE;
830247a4
IZ
2905 RExC_seen_zerolen++; /* Do not optimize RE away */
2906 nextchar(pRExC_state);
fac92740 2907 Set_Node_Length(ret, 2); /* MJD */
b85d18e9 2908 break;
4a2d328f 2909 case 'C':
f33976b4
DB
2910 ret = reg_node(pRExC_state, CANY);
2911 RExC_seen |= REG_SEEN_CANY;
a0ed51b3 2912 *flagp |= HASWIDTH|SIMPLE;
830247a4 2913 nextchar(pRExC_state);
fac92740 2914 Set_Node_Length(ret, 2); /* MJD */
a0ed51b3
LW
2915 break;
2916 case 'X':
830247a4 2917 ret = reg_node(pRExC_state, CLUMP);
a0ed51b3 2918 *flagp |= HASWIDTH;
830247a4 2919 nextchar(pRExC_state);
fac92740 2920 Set_Node_Length(ret, 2); /* MJD */
a0ed51b3 2921 break;
a0d0e21e 2922 case 'w':
eb160463 2923 ret = reg_node(pRExC_state, (U8)(LOC ? ALNUML : ALNUM));
a0d0e21e 2924 *flagp |= HASWIDTH|SIMPLE;
830247a4 2925 nextchar(pRExC_state);
fac92740 2926 Set_Node_Length(ret, 2); /* MJD */
a0d0e21e
LW
2927 break;
2928 case 'W':
eb160463 2929 ret = reg_node(pRExC_state, (U8)(LOC ? NALNUML : NALNUM));
a0d0e21e 2930 *flagp |= HASWIDTH|SIMPLE;
830247a4 2931 nextchar(pRExC_state);
fac92740 2932 Set_Node_Length(ret, 2); /* MJD */
a0d0e21e
LW
2933 break;
2934 case 'b':
830247a4
IZ
2935 RExC_seen_zerolen++;
2936 RExC_seen |= REG_SEEN_LOOKBEHIND;
eb160463 2937 ret = reg_node(pRExC_state, (U8)(LOC ? BOUNDL : BOUND));
a0d0e21e 2938 *flagp |= SIMPLE;
830247a4 2939 nextchar(pRExC_state);
fac92740 2940 Set_Node_Length(ret, 2); /* MJD */
a0d0e21e
LW
2941 break;
2942 case 'B':
830247a4
IZ
2943 RExC_seen_zerolen++;
2944 RExC_seen |= REG_SEEN_LOOKBEHIND;
eb160463 2945 ret = reg_node(pRExC_state, (U8)(LOC ? NBOUNDL : NBOUND));
a0d0e21e 2946 *flagp |= SIMPLE;
830247a4 2947 nextchar(pRExC_state);
fac92740 2948 Set_Node_Length(ret, 2); /* MJD */
a0d0e21e
LW
2949 break;
2950 case 's':
eb160463 2951 ret = reg_node(pRExC_state, (U8)(LOC ? SPACEL : SPACE));
a0d0e21e 2952 *flagp |= HASWIDTH|SIMPLE;
830247a4 2953 nextchar(pRExC_state);
fac92740 2954 Set_Node_Length(ret, 2); /* MJD */
a0d0e21e
LW
2955 break;
2956 case 'S':
eb160463 2957 ret = reg_node(pRExC_state, (U8)(LOC ? NSPACEL : NSPACE));
a0d0e21e 2958 *flagp |= HASWIDTH|SIMPLE;
830247a4 2959 nextchar(pRExC_state);
fac92740 2960 Set_Node_Length(ret, 2); /* MJD */
a0d0e21e
LW
2961 break;
2962 case 'd':
ffc61ed2 2963 ret = reg_node(pRExC_state, DIGIT);
a0d0e21e 2964 *flagp |= HASWIDTH|SIMPLE;
830247a4 2965 nextchar(pRExC_state);
fac92740 2966 Set_Node_Length(ret, 2); /* MJD */
a0d0e21e
LW
2967 break;
2968 case 'D':
ffc61ed2 2969 ret = reg_node(pRExC_state, NDIGIT);
a0d0e21e 2970 *flagp |= HASWIDTH|SIMPLE;
830247a4 2971 nextchar(pRExC_state);
fac92740 2972 Set_Node_Length(ret, 2); /* MJD */
a0d0e21e 2973 break;
a14b48bc
LW
2974 case 'p':
2975 case 'P':
3568d838 2976 {
830247a4 2977 char* oldregxend = RExC_end;
ccb2c380 2978 char* parse_start = RExC_parse - 2;
a14b48bc 2979
830247a4 2980 if (RExC_parse[1] == '{') {
3568d838 2981 /* a lovely hack--pretend we saw [\pX] instead */
830247a4
IZ
2982 RExC_end = strchr(RExC_parse, '}');
2983 if (!RExC_end) {
0da60cf5 2984 U8 c = (U8)*RExC_parse;
830247a4
IZ
2985 RExC_parse += 2;
2986 RExC_end = oldregxend;
0da60cf5 2987 vFAIL2("Missing right brace on \\%c{}", c);
b45f050a 2988 }
830247a4 2989 RExC_end++;
a14b48bc 2990 }
af6f566e 2991 else {
830247a4 2992 RExC_end = RExC_parse + 2;
af6f566e
HS
2993 if (RExC_end > oldregxend)
2994 RExC_end = oldregxend;
2995 }
830247a4 2996 RExC_parse--;
a14b48bc 2997
ffc61ed2 2998 ret = regclass(pRExC_state);
a14b48bc 2999
830247a4
IZ
3000 RExC_end = oldregxend;
3001 RExC_parse--;
ccb2c380
MP
3002
3003 Set_Node_Offset(ret, parse_start + 2);
3004 Set_Node_Cur_Length(ret);
830247a4 3005 nextchar(pRExC_state);
a14b48bc
LW
3006 *flagp |= HASWIDTH|SIMPLE;
3007 }
3008 break;
a0d0e21e
LW
3009 case 'n':
3010 case 'r':
3011 case 't':
3012 case 'f':
3013 case 'e':
3014 case 'a':
3015 case 'x':
3016 case 'c':
3017 case '0':
3018 goto defchar;
3019 case '1': case '2': case '3': case '4':
3020 case '5': case '6': case '7': case '8': case '9':
3021 {
830247a4 3022 I32 num = atoi(RExC_parse);
a0d0e21e 3023
830247a4 3024 if (num > 9 && num >= RExC_npar)
a0d0e21e
LW
3025 goto defchar;
3026 else {
fac92740 3027 char * parse_start = RExC_parse - 1; /* MJD */
830247a4
IZ
3028 while (isDIGIT(*RExC_parse))
3029 RExC_parse++;
b45f050a 3030
eb160463 3031 if (!SIZE_ONLY && num > (I32)RExC_rx->nparens)
9baa0206 3032 vFAIL("Reference to nonexistent group");
830247a4 3033 RExC_sawback = 1;
eb160463
GS
3034 ret = reganode(pRExC_state,
3035 (U8)(FOLD ? (LOC ? REFFL : REFF) : REF),
3036 num);
a0d0e21e 3037 *flagp |= HASWIDTH;
fac92740
MJD
3038
3039 /* override incorrect value set in reganode MJD */
3040 Set_Node_Offset(ret, parse_start+1);
3041 Set_Node_Cur_Length(ret); /* MJD */
830247a4
IZ
3042 RExC_parse--;
3043 nextchar(pRExC_state);
a0d0e21e
LW
3044 }
3045 }
3046 break;
3047 case '\0':
830247a4 3048 if (RExC_parse >= RExC_end)
b45f050a 3049 FAIL("Trailing \\");
a0d0e21e
LW
3050 /* FALL THROUGH */
3051 default:
c9f97d15
IZ
3052 /* Do not generate `unrecognized' warnings here, we fall
3053 back into the quick-grab loop below */
a0d0e21e
LW
3054 goto defchar;
3055 }
3056 break;
4633a7c4
LW
3057
3058 case '#':
e2509266 3059 if (RExC_flags & PMf_EXTENDED) {
830247a4
IZ
3060 while (RExC_parse < RExC_end && *RExC_parse != '\n') RExC_parse++;
3061 if (RExC_parse < RExC_end)
4633a7c4
LW
3062 goto tryagain;
3063 }
3064 /* FALL THROUGH */
3065
a0d0e21e 3066 default: {
ba210ebe 3067 register STRLEN len;
58ae7d3f 3068 register UV ender;
a0d0e21e 3069 register char *p;
c277df42 3070 char *oldp, *s;
ba210ebe 3071 STRLEN numlen;
80aecb99 3072 STRLEN foldlen;
60a8b682 3073 U8 tmpbuf[UTF8_MAXLEN_FOLD+1], *foldbuf;
f06dbbb7
JH
3074
3075 parse_start = RExC_parse - 1;
a0d0e21e 3076
830247a4 3077 RExC_parse++;
a0d0e21e
LW
3078
3079 defchar:
58ae7d3f 3080 ender = 0;
eb160463
GS
3081 ret = reg_node(pRExC_state,
3082 (U8)(FOLD ? (LOC ? EXACTFL : EXACTF) : EXACT));
cd439c50 3083 s = STRING(ret);
830247a4
IZ
3084 for (len = 0, p = RExC_parse - 1;
3085 len < 127 && p < RExC_end;
a0d0e21e
LW
3086 len++)
3087 {
3088 oldp = p;
5b5a24f7 3089
e2509266 3090 if (RExC_flags & PMf_EXTENDED)
830247a4 3091 p = regwhite(p, RExC_end);
a0d0e21e
LW
3092 switch (*p) {
3093 case '^':
3094 case '$':
3095 case '.':
3096 case '[':
3097 case '(':
3098 case ')':
3099 case '|':
3100 goto loopdone;
3101 case '\\':
3102 switch (*++p) {
3103 case 'A':
1ed8eac0
JF
3104 case 'C':
3105 case 'X':
a0d0e21e
LW
3106 case 'G':
3107 case 'Z':
b85d18e9 3108 case 'z':
a0d0e21e
LW
3109 case 'w':
3110 case 'W':
3111 case 'b':
3112 case 'B':
3113 case 's':
3114 case 'S':
3115 case 'd':
3116 case 'D':
a14b48bc
LW
3117 case 'p':
3118 case 'P':
a0d0e21e
LW
3119 --p;
3120 goto loopdone;
3121 case 'n':
3122 ender = '\n';
3123 p++;
a687059c 3124 break;
a0d0e21e
LW
3125 case 'r':
3126 ender = '\r';
3127 p++;
a687059c 3128 break;
a0d0e21e
LW
3129 case 't':
3130 ender = '\t';
3131 p++;
a687059c 3132 break;
a0d0e21e
LW
3133 case 'f':
3134 ender = '\f';
3135 p++;
a687059c 3136 break;
a0d0e21e 3137 case 'e':
c7f1f016 3138 ender = ASCII_TO_NATIVE('\033');
a0d0e21e 3139 p++;
a687059c 3140 break;
a0d0e21e 3141 case 'a':
c7f1f016 3142 ender = ASCII_TO_NATIVE('\007');
a0d0e21e 3143 p++;
a687059c 3144 break;
a0d0e21e 3145 case 'x':
a0ed51b3
LW
3146 if (*++p == '{') {
3147 char* e = strchr(p, '}');
b81d288d 3148
b45f050a 3149 if (!e) {
830247a4 3150 RExC_parse = p + 1;
b45f050a
JF
3151 vFAIL("Missing right brace on \\x{}");
3152 }
de5f0749 3153 else {
a4c04bdc
NC
3154 I32 flags = PERL_SCAN_ALLOW_UNDERSCORES
3155 | PERL_SCAN_DISALLOW_PREFIX;
53305cf1
NC
3156 numlen = e - p - 1;
3157 ender = grok_hex(p + 1, &numlen, &flags, NULL);
aaa80028
JH
3158 if (ender > 0xff)
3159 RExC_utf8 = 1;
b21ed0a9
GS
3160 /* numlen is generous */
3161 if (numlen + len >= 127) {
a0ed51b3
LW
3162 p--;
3163 goto loopdone;
3164 }
3165 p = e + 1;
3166 }
a0ed51b3
LW
3167 }
3168 else {
a4c04bdc 3169 I32 flags = PERL_SCAN_DISALLOW_PREFIX;
53305cf1
NC
3170 numlen = 2;
3171 ender = grok_hex(p, &numlen, &flags, NULL);
a0ed51b3
LW
3172 p += numlen;
3173 }
a687059c 3174 break;
a0d0e21e
LW
3175 case 'c':
3176 p++;
bbce6d69 3177 ender = UCHARAT(p++);
3178 ender = toCTRL(ender);
a687059c 3179 break;
a0d0e21e
LW
3180 case '0': case '1': case '2': case '3':case '4':
3181 case '5': case '6': case '7': case '8':case '9':
3182 if (*p == '0' ||
830247a4 3183 (isDIGIT(p[1]) && atoi(p) >= RExC_npar) ) {
53305cf1
NC
3184 I32 flags = 0;
3185 numlen = 3;
3186 ender = grok_oct(p, &numlen, &flags, NULL);
a0d0e21e
LW
3187 p += numlen;
3188 }
3189 else {
3190 --p;
3191 goto loopdone;
a687059c
LW
3192 }
3193 break;
a0d0e21e 3194 case '\0':
830247a4 3195 if (p >= RExC_end)
b45f050a 3196 FAIL("Trailing \\");
a687059c 3197 /* FALL THROUGH */
a0d0e21e 3198 default:
e476b1b5 3199 if (!SIZE_ONLY && ckWARN(WARN_REGEXP) && isALPHA(*p))
4193bef7 3200 vWARN2(p + 1, "Unrecognized escape \\%c passed through", UCHARAT(p));
a0ed51b3 3201 goto normal_default;
a0d0e21e
LW
3202 }
3203 break;
a687059c 3204 default:
a0ed51b3 3205 normal_default:
fd400ab9 3206 if (UTF8_IS_START(*p) && UTF) {
5e12f4fb 3207 ender = utf8n_to_uvchr((U8*)p, RExC_end - p,
ba210ebe 3208 &numlen, 0);
a0ed51b3
LW
3209 p += numlen;
3210 }
3211 else
3212 ender = *p++;
a0d0e21e 3213 break;
a687059c 3214 }
e2509266 3215 if (RExC_flags & PMf_EXTENDED)
830247a4 3216 p = regwhite(p, RExC_end);
60a8b682
JH
3217 if (UTF && FOLD) {
3218 /* Prime the casefolded buffer. */
ac7e0132 3219 ender = toFOLD_uni(ender, tmpbuf, &foldlen);
60a8b682 3220 }
a0d0e21e
LW
3221 if (ISMULT2(p)) { /* Back off on ?+*. */
3222 if (len)
3223 p = oldp;
16ea2a2e 3224 else if (UTF) {
0ebc6274
JH
3225 STRLEN unilen;
3226
80aecb99 3227 if (FOLD) {
60a8b682 3228 /* Emit all the Unicode characters. */
80aecb99
JH
3229 for (foldbuf = tmpbuf;
3230 foldlen;
3231 foldlen -= numlen) {
3232 ender = utf8_to_uvchr(foldbuf, &numlen);
9dc45d57 3233 if (numlen > 0) {
0ebc6274
JH
3234 reguni(pRExC_state, ender, s, &unilen);
3235 s += unilen;
3236 len += unilen;
3237 /* In EBCDIC the numlen
3238 * and unilen can differ. */
9dc45d57 3239 foldbuf += numlen;
47654450
JH
3240 if (numlen >= foldlen)
3241 break;
9dc45d57
JH
3242 }
3243 else
3244 break; /* "Can't happen." */
80aecb99
JH
3245 }
3246 }
3247 else {
0ebc6274 3248 reguni(pRExC_state, ender, s, &unilen);
9ede7db1 3249 if (unilen > 0) {
0ebc6274
JH
3250 s += unilen;
3251 len += unilen;
9dc45d57 3252 }
80aecb99 3253 }
a0ed51b3 3254 }
a0d0e21e
LW
3255 else {
3256 len++;
eb160463 3257 REGC((char)ender, s++);
a0d0e21e
LW
3258 }
3259 break;
a687059c 3260 }
16ea2a2e 3261 if (UTF) {
0ebc6274
JH
3262 STRLEN unilen;
3263
80aecb99 3264 if (FOLD) {
60a8b682 3265 /* Emit all the Unicode characters. */
80aecb99
JH
3266 for (foldbuf = tmpbuf;
3267 foldlen;
3268 foldlen -= numlen) {
3269 ender = utf8_to_uvchr(foldbuf, &numlen);
9dc45d57 3270 if (numlen > 0) {
0ebc6274
JH
3271 reguni(pRExC_state, ender, s, &unilen);
3272 len += unilen;
3273 s += unilen;
3274 /* In EBCDIC the numlen
3275 * and unilen can differ. */
9dc45d57 3276 foldbuf += numlen;
47654450
JH
3277 if (numlen >= foldlen)
3278 break;
9dc45d57
JH
3279 }
3280 else
3281 break;
80aecb99
JH
3282 }
3283 }
3284 else {
0ebc6274 3285 reguni(pRExC_state, ender, s, &unilen);
9ede7db1 3286 if (unilen > 0) {
0ebc6274
JH
3287 s += unilen;
3288 len += unilen;
9dc45d57 3289 }
80aecb99
JH
3290 }
3291 len--;
a0ed51b3
LW
3292 }
3293 else
eb160463 3294 REGC((char)ender, s++);
a0d0e21e
LW
3295 }
3296 loopdone:
830247a4 3297 RExC_parse = p - 1;
fac92740 3298 Set_Node_Cur_Length(ret); /* MJD */
830247a4 3299 nextchar(pRExC_state);
793db0cb
JH
3300 {
3301 /* len is STRLEN which is unsigned, need to copy to signed */
3302 IV iv = len;
3303 if (iv < 0)
3304 vFAIL("Internal disaster");
3305 }
a0d0e21e
LW
3306 if (len > 0)
3307 *flagp |= HASWIDTH;
3308 if (len == 1)
3309 *flagp |= SIMPLE;
c277df42 3310 if (!SIZE_ONLY)
cd439c50
IZ
3311 STR_LEN(ret) = len;
3312 if (SIZE_ONLY)
830247a4 3313 RExC_size += STR_SZ(len);
cd439c50 3314 else
830247a4 3315 RExC_emit += STR_SZ(len);
a687059c 3316 }
a0d0e21e
LW
3317 break;
3318 }
a687059c 3319
60a8b682
JH
3320 /* If the encoding pragma is in effect recode the text of
3321 * any EXACT-kind nodes. */
22c54be3 3322 if (PL_encoding && PL_regkind[(U8)OP(ret)] == EXACT) {
d0063567
DK
3323 STRLEN oldlen = STR_LEN(ret);
3324 SV *sv = sv_2mortal(newSVpvn(STRING(ret), oldlen));
3325
3326 if (RExC_utf8)
3327 SvUTF8_on(sv);
3328 if (sv_utf8_downgrade(sv, TRUE)) {
3329 char *s = sv_recode_to_utf8(sv, PL_encoding);
3330 STRLEN newlen = SvCUR(sv);
3331
3332 if (SvUTF8(sv))
3333 RExC_utf8 = 1;
3334 if (!SIZE_ONLY) {
3335 DEBUG_r(PerlIO_printf(Perl_debug_log, "recode %*s to %*s\n",
3336 (int)oldlen, STRING(ret),
3337 (int)newlen, s));
3338 Copy(s, STRING(ret), newlen, char);
3339 STR_LEN(ret) += newlen - oldlen;
3340 RExC_emit += STR_SZ(newlen) - STR_SZ(oldlen);
3341 } else
3342 RExC_size += STR_SZ(newlen) - STR_SZ(oldlen);
3343 }
a72c7584
JH
3344 }
3345
a0d0e21e 3346 return(ret);
a687059c
LW
3347}
3348
873ef191 3349STATIC char *
cea2e8a9 3350S_regwhite(pTHX_ char *p, char *e)
5b5a24f7
CS
3351{
3352 while (p < e) {
3353 if (isSPACE(*p))
3354 ++p;
3355 else if (*p == '#') {
3356 do {
3357 p++;
3358 } while (p < e && *p != '\n');
3359 }
3360 else
3361 break;
3362 }
3363 return p;
3364}
3365
b8c5462f
JH
3366/* Parse POSIX character classes: [[:foo:]], [[=foo=]], [[.foo.]].
3367 Character classes ([:foo:]) can also be negated ([:^foo:]).
3368 Returns a named class id (ANYOF_XXX) if successful, -1 otherwise.
3369 Equivalence classes ([=foo=]) and composites ([.foo.]) are parsed,
beeb77fc 3370 but trigger failures because they are currently unimplemented. */
9a86a77b
JH
3371
3372#define POSIXCC_DONE(c) ((c) == ':')
3373#define POSIXCC_NOTYET(c) ((c) == '=' || (c) == '.')
3374#define POSIXCC(c) (POSIXCC_DONE(c) || POSIXCC_NOTYET(c))
3375
b8c5462f 3376STATIC I32
830247a4 3377S_regpposixcc(pTHX_ RExC_state_t *pRExC_state, I32 value)
620e46c5
JH
3378{
3379 char *posixcc = 0;
936ed897 3380 I32 namedclass = OOB_NAMEDCLASS;
620e46c5 3381
830247a4 3382 if (value == '[' && RExC_parse + 1 < RExC_end &&
620e46c5 3383 /* I smell either [: or [= or [. -- POSIX has been here, right? */
9a86a77b
JH
3384 POSIXCC(UCHARAT(RExC_parse))) {
3385 char c = UCHARAT(RExC_parse);
830247a4 3386 char* s = RExC_parse++;
b81d288d 3387
9a86a77b 3388 while (RExC_parse < RExC_end && UCHARAT(RExC_parse) != c)
830247a4
IZ
3389 RExC_parse++;
3390 if (RExC_parse == RExC_end)
620e46c5 3391 /* Grandfather lone [:, [=, [. */
830247a4 3392 RExC_parse = s;
620e46c5 3393 else {
830247a4 3394 char* t = RExC_parse++; /* skip over the c */
b8c5462f 3395
9a86a77b 3396 if (UCHARAT(RExC_parse) == ']') {
830247a4 3397 RExC_parse++; /* skip over the ending ] */
b8c5462f
JH
3398 posixcc = s + 1;
3399 if (*s == ':') {
3400 I32 complement = *posixcc == '^' ? *posixcc++ : 0;
3401 I32 skip = 5; /* the most common skip */
3402
3403 switch (*posixcc) {
3404 case 'a':
3405 if (strnEQ(posixcc, "alnum", 5))
3406 namedclass =
3407 complement ? ANYOF_NALNUMC : ANYOF_ALNUMC;
3408 else if (strnEQ(posixcc, "alpha", 5))
3409 namedclass =
3410 complement ? ANYOF_NALPHA : ANYOF_ALPHA;
3411 else if (strnEQ(posixcc, "ascii", 5))
3412 namedclass =
3413 complement ? ANYOF_NASCII : ANYOF_ASCII;
3414 break;
aaa51d5e
JF
3415 case 'b':
3416 if (strnEQ(posixcc, "blank", 5))
3417 namedclass =
3418 complement ? ANYOF_NBLANK : ANYOF_BLANK;
3419 break;
b8c5462f
JH
3420 case 'c':
3421 if (strnEQ(posixcc, "cntrl", 5))
3422 namedclass =
3423 complement ? ANYOF_NCNTRL : ANYOF_CNTRL;
3424 break;
3425 case 'd':
3426 if (strnEQ(posixcc, "digit", 5))
3427 namedclass =
3428 complement ? ANYOF_NDIGIT : ANYOF_DIGIT;
3429 break;
3430 case 'g':
3431 if (strnEQ(posixcc, "graph", 5))
3432 namedclass =
3433 complement ? ANYOF_NGRAPH : ANYOF_GRAPH;
3434 break;
3435 case 'l':
3436 if (strnEQ(posixcc, "lower", 5))
3437 namedclass =
3438 complement ? ANYOF_NLOWER : ANYOF_LOWER;
3439 break;
3440 case 'p':
3441 if (strnEQ(posixcc, "print", 5))
3442 namedclass =
3443 complement ? ANYOF_NPRINT : ANYOF_PRINT;
3444 else if (strnEQ(posixcc, "punct", 5))
3445 namedclass =
3446 complement ? ANYOF_NPUNCT : ANYOF_PUNCT;
3447 break;
3448 case 's':
3449 if (strnEQ(posixcc, "space", 5))
3450 namedclass =
aaa51d5e 3451 complement ? ANYOF_NPSXSPC : ANYOF_PSXSPC;
cc4319de 3452 break;
b8c5462f
JH
3453 case 'u':
3454 if (strnEQ(posixcc, "upper", 5))
3455 namedclass =
3456 complement ? ANYOF_NUPPER : ANYOF_UPPER;
3457 break;
3458 case 'w': /* this is not POSIX, this is the Perl \w */
3459 if (strnEQ(posixcc, "word", 4)) {
3460 namedclass =
3461 complement ? ANYOF_NALNUM : ANYOF_ALNUM;
3462 skip = 4;
3463 }
3464 break;
3465 case 'x':
3466 if (strnEQ(posixcc, "xdigit", 6)) {
3467 namedclass =
3468 complement ? ANYOF_NXDIGIT : ANYOF_XDIGIT;
3469 skip = 6;
3470 }
3471 break;
3472 }
ac561586
JH
3473 if (namedclass == OOB_NAMEDCLASS ||
3474 posixcc[skip] != ':' ||
3475 posixcc[skip+1] != ']')
b45f050a
JF
3476 {
3477 Simple_vFAIL3("POSIX class [:%.*s:] unknown",
3478 t - s - 1, s + 1);
3479 }
3480 } else if (!SIZE_ONLY) {
b8c5462f 3481 /* [[=foo=]] and [[.foo.]] are still future. */
b45f050a 3482
830247a4 3483 /* adjust RExC_parse so the warning shows after
b45f050a 3484 the class closes */
9a86a77b 3485 while (UCHARAT(RExC_parse) && UCHARAT(RExC_parse) != ']')
830247a4 3486 RExC_parse++;
b45f050a
JF
3487 Simple_vFAIL3("POSIX syntax [%c %c] is reserved for future extensions", c, c);
3488 }
b8c5462f
JH
3489 } else {
3490 /* Maternal grandfather:
3491 * "[:" ending in ":" but not in ":]" */
830247a4 3492 RExC_parse = s;
767d463e 3493 }
620e46c5
JH
3494 }
3495 }
3496
b8c5462f
JH
3497 return namedclass;
3498}
3499
3500STATIC void
830247a4 3501S_checkposixcc(pTHX_ RExC_state_t *pRExC_state)
b8c5462f 3502{
b938889d 3503 if (!SIZE_ONLY && POSIXCC(UCHARAT(RExC_parse))) {
830247a4 3504 char *s = RExC_parse;
93733859 3505 char c = *s++;
b8c5462f
JH
3506
3507 while(*s && isALNUM(*s))
3508 s++;
3509 if (*s && c == *s && s[1] == ']') {
cd84f5b2
RGS
3510 if (ckWARN(WARN_REGEXP))
3511 vWARN3(s+2,
3512 "POSIX syntax [%c %c] belongs inside character classes",
3513 c, c);
b45f050a
JF
3514
3515 /* [[=foo=]] and [[.foo.]] are still future. */
9a86a77b 3516 if (POSIXCC_NOTYET(c)) {
830247a4 3517 /* adjust RExC_parse so the error shows after
b45f050a 3518 the class closes */
9a86a77b 3519 while (UCHARAT(RExC_parse) && UCHARAT(RExC_parse++) != ']')
b45f050a
JF
3520 ;
3521 Simple_vFAIL3("POSIX syntax [%c %c] is reserved for future extensions", c, c);
3522 }
b8c5462f
JH
3523 }
3524 }
620e46c5
JH
3525}
3526
76e3520e 3527STATIC regnode *
830247a4 3528S_regclass(pTHX_ RExC_state_t *pRExC_state)
a687059c 3529{
ffc61ed2 3530 register UV value;
9a86a77b 3531 register UV nextvalue;
3568d838 3532 register IV prevvalue = OOB_UNICODE;
ffc61ed2 3533 register IV range = 0;
c277df42 3534 register regnode *ret;
ba210ebe 3535 STRLEN numlen;
ffc61ed2 3536 IV namedclass;
9c5ffd7c 3537 char *rangebegin = 0;
936ed897 3538 bool need_class = 0;
9c5ffd7c 3539 SV *listsv = Nullsv;
ffc61ed2
JH
3540 register char *e;
3541 UV n;
9e55ce06
JH
3542 bool optimize_invert = TRUE;
3543 AV* unicode_alternate = 0;
1b2d223b
JH
3544#ifdef EBCDIC
3545 UV literal_endpoint = 0;
3546#endif
ffc61ed2
JH
3547
3548 ret = reganode(pRExC_state, ANYOF, 0);
3549
3550 if (!SIZE_ONLY)
3551 ANYOF_FLAGS(ret) = 0;
3552
9a86a77b 3553 if (UCHARAT(RExC_parse) == '^') { /* Complement of range. */
ffc61ed2
JH
3554 RExC_naughty++;
3555 RExC_parse++;
3556 if (!SIZE_ONLY)
3557 ANYOF_FLAGS(ret) |= ANYOF_INVERT;
3558 }
a0d0e21e 3559
936ed897 3560 if (SIZE_ONLY)
830247a4 3561 RExC_size += ANYOF_SKIP;
936ed897 3562 else {
830247a4 3563 RExC_emit += ANYOF_SKIP;
936ed897
IZ
3564 if (FOLD)
3565 ANYOF_FLAGS(ret) |= ANYOF_FOLD;
3566 if (LOC)
3567 ANYOF_FLAGS(ret) |= ANYOF_LOCALE;
ffc61ed2
JH
3568 ANYOF_BITMAP_ZERO(ret);
3569 listsv = newSVpvn("# comment\n", 10);
a0d0e21e 3570 }
b8c5462f 3571
9a86a77b
JH
3572 nextvalue = RExC_parse < RExC_end ? UCHARAT(RExC_parse) : 0;
3573
b938889d 3574 if (!SIZE_ONLY && POSIXCC(nextvalue))
830247a4 3575 checkposixcc(pRExC_state);
b8c5462f 3576
f064b6ad
HS
3577 /* allow 1st char to be ] (allowing it to be - is dealt with later) */
3578 if (UCHARAT(RExC_parse) == ']')
3579 goto charclassloop;
ffc61ed2 3580
9a86a77b 3581 while (RExC_parse < RExC_end && UCHARAT(RExC_parse) != ']') {
ffc61ed2
JH
3582
3583 charclassloop:
3584
3585 namedclass = OOB_NAMEDCLASS; /* initialize as illegal */
3586
73b437c8 3587 if (!range)
830247a4 3588 rangebegin = RExC_parse;
ffc61ed2 3589 if (UTF) {
5e12f4fb 3590 value = utf8n_to_uvchr((U8*)RExC_parse,
3568d838
JH
3591 RExC_end - RExC_parse,
3592 &numlen, 0);
ffc61ed2
JH
3593 RExC_parse += numlen;
3594 }
3595 else
3596 value = UCHARAT(RExC_parse++);
9a86a77b
JH
3597 nextvalue = RExC_parse < RExC_end ? UCHARAT(RExC_parse) : 0;
3598 if (value == '[' && POSIXCC(nextvalue))
830247a4 3599 namedclass = regpposixcc(pRExC_state, value);
620e46c5 3600 else if (value == '\\') {
ffc61ed2 3601 if (UTF) {
5e12f4fb 3602 value = utf8n_to_uvchr((U8*)RExC_parse,
ffc61ed2
JH
3603 RExC_end - RExC_parse,
3604 &numlen, 0);
3605 RExC_parse += numlen;
3606 }
3607 else
3608 value = UCHARAT(RExC_parse++);
470c3474 3609 /* Some compilers cannot handle switching on 64-bit integer
ffc61ed2 3610 * values, therefore value cannot be an UV. Yes, this will
e2962f66
JH
3611 * be a problem later if we want switch on Unicode.
3612 * A similar issue a little bit later when switching on
3613 * namedclass. --jhi */
ffc61ed2 3614 switch ((I32)value) {
b8c5462f
JH
3615 case 'w': namedclass = ANYOF_ALNUM; break;
3616 case 'W': namedclass = ANYOF_NALNUM; break;
3617 case 's': namedclass = ANYOF_SPACE; break;
3618 case 'S': namedclass = ANYOF_NSPACE; break;
3619 case 'd': namedclass = ANYOF_DIGIT; break;
3620 case 'D': namedclass = ANYOF_NDIGIT; break;
ffc61ed2
JH
3621 case 'p':
3622 case 'P':
af6f566e 3623 if (RExC_parse >= RExC_end)
2a4859cd 3624 vFAIL2("Empty \\%c{}", (U8)value);
ffc61ed2 3625 if (*RExC_parse == '{') {
0da60cf5 3626 U8 c = (U8)value;
ffc61ed2
JH
3627 e = strchr(RExC_parse++, '}');
3628 if (!e)
0da60cf5 3629 vFAIL2("Missing right brace on \\%c{}", c);
ab13f0c7
JH
3630 while (isSPACE(UCHARAT(RExC_parse)))
3631 RExC_parse++;
3632 if (e == RExC_parse)
0da60cf5 3633 vFAIL2("Empty \\%c{}", c);
ffc61ed2 3634 n = e - RExC_parse;
ab13f0c7
JH
3635 while (isSPACE(UCHARAT(RExC_parse + n - 1)))
3636 n--;
ffc61ed2
JH
3637 }
3638 else {
3639 e = RExC_parse;
3640 n = 1;
3641 }
3642 if (!SIZE_ONLY) {
ab13f0c7
JH
3643 if (UCHARAT(RExC_parse) == '^') {
3644 RExC_parse++;
3645 n--;
3646 value = value == 'p' ? 'P' : 'p'; /* toggle */
3647 while (isSPACE(UCHARAT(RExC_parse))) {
3648 RExC_parse++;
3649 n--;
3650 }
3651 }
ffc61ed2 3652 if (value == 'p')
ab13f0c7
JH
3653 Perl_sv_catpvf(aTHX_ listsv,
3654 "+utf8::%.*s\n", (int)n, RExC_parse);
ffc61ed2 3655 else
ab13f0c7
JH
3656 Perl_sv_catpvf(aTHX_ listsv,
3657 "!utf8::%.*s\n", (int)n, RExC_parse);
ffc61ed2
JH
3658 }
3659 RExC_parse = e + 1;
3660 ANYOF_FLAGS(ret) |= ANYOF_UNICODE;
3661 continue;
b8c5462f
JH
3662 case 'n': value = '\n'; break;
3663 case 'r': value = '\r'; break;
3664 case 't': value = '\t'; break;
3665 case 'f': value = '\f'; break;
3666 case 'b': value = '\b'; break;
c7f1f016
NIS
3667 case 'e': value = ASCII_TO_NATIVE('\033');break;
3668 case 'a': value = ASCII_TO_NATIVE('\007');break;
b8c5462f 3669 case 'x':
ffc61ed2 3670 if (*RExC_parse == '{') {
a4c04bdc
NC
3671 I32 flags = PERL_SCAN_ALLOW_UNDERSCORES
3672 | PERL_SCAN_DISALLOW_PREFIX;
ffc61ed2 3673 e = strchr(RExC_parse++, '}');
b81d288d 3674 if (!e)
ffc61ed2 3675 vFAIL("Missing right brace on \\x{}");
53305cf1
NC
3676
3677 numlen = e - RExC_parse;
3678 value = grok_hex(RExC_parse, &numlen, &flags, NULL);
ffc61ed2
JH
3679 RExC_parse = e + 1;
3680 }
3681 else {
a4c04bdc 3682 I32 flags = PERL_SCAN_DISALLOW_PREFIX;
53305cf1
NC
3683 numlen = 2;
3684 value = grok_hex(RExC_parse, &numlen, &flags, NULL);
ffc61ed2
JH
3685 RExC_parse += numlen;
3686 }
b8c5462f
JH
3687 break;
3688 case 'c':
830247a4 3689 value = UCHARAT(RExC_parse++);
b8c5462f
JH
3690 value = toCTRL(value);
3691 break;
3692 case '0': case '1': case '2': case '3': case '4':
3693 case '5': case '6': case '7': case '8': case '9':
53305cf1
NC
3694 {
3695 I32 flags = 0;
3696 numlen = 3;
3697 value = grok_oct(--RExC_parse, &numlen, &flags, NULL);
830247a4 3698 RExC_parse += numlen;
b8c5462f 3699 break;
53305cf1 3700 }
1028017a 3701 default:
e476b1b5 3702 if (!SIZE_ONLY && ckWARN(WARN_REGEXP) && isALPHA(value))
ffc61ed2
JH
3703 vWARN2(RExC_parse,
3704 "Unrecognized escape \\%c in character class passed through",
3705 (int)value);
1028017a 3706 break;
b8c5462f 3707 }
ffc61ed2 3708 } /* end of \blah */
1b2d223b
JH
3709#ifdef EBCDIC
3710 else
3711 literal_endpoint++;
3712#endif
ffc61ed2
JH
3713
3714 if (namedclass > OOB_NAMEDCLASS) { /* this is a named class \blah */
3715
3716 if (!SIZE_ONLY && !need_class)
936ed897 3717 ANYOF_CLASS_ZERO(ret);
ffc61ed2 3718
936ed897 3719 need_class = 1;
ffc61ed2
JH
3720
3721 /* a bad range like a-\d, a-[:digit:] ? */
3722 if (range) {
73b437c8 3723 if (!SIZE_ONLY) {
e476b1b5 3724 if (ckWARN(WARN_REGEXP))
830247a4 3725 vWARN4(RExC_parse,
b45f050a 3726 "False [] range \"%*.*s\"",
830247a4
IZ
3727 RExC_parse - rangebegin,
3728 RExC_parse - rangebegin,
b45f050a 3729 rangebegin);
3568d838
JH
3730 if (prevvalue < 256) {
3731 ANYOF_BITMAP_SET(ret, prevvalue);
ffc61ed2
JH
3732 ANYOF_BITMAP_SET(ret, '-');
3733 }
3734 else {
3735 ANYOF_FLAGS(ret) |= ANYOF_UNICODE;
3736 Perl_sv_catpvf(aTHX_ listsv,
3568d838 3737 "%04"UVxf"\n%04"UVxf"\n", (UV)prevvalue, (UV) '-');
ffc61ed2 3738 }
b8c5462f 3739 }
ffc61ed2
JH
3740
3741 range = 0; /* this was not a true range */
73b437c8 3742 }
ffc61ed2 3743
73b437c8 3744 if (!SIZE_ONLY) {
3568d838
JH
3745 if (namedclass > OOB_NAMEDCLASS)
3746 optimize_invert = FALSE;
e2962f66
JH
3747 /* Possible truncation here but in some 64-bit environments
3748 * the compiler gets heartburn about switch on 64-bit values.
3749 * A similar issue a little earlier when switching on value.
98f323fa 3750 * --jhi */
e2962f66 3751 switch ((I32)namedclass) {
73b437c8
JH
3752 case ANYOF_ALNUM:
3753 if (LOC)
936ed897 3754 ANYOF_CLASS_SET(ret, ANYOF_ALNUM);
73b437c8
JH
3755 else {
3756 for (value = 0; value < 256; value++)
3757 if (isALNUM(value))
936ed897 3758 ANYOF_BITMAP_SET(ret, value);
73b437c8 3759 }
ffc61ed2 3760 Perl_sv_catpvf(aTHX_ listsv, "+utf8::IsWord\n");
73b437c8
JH
3761 break;
3762 case ANYOF_NALNUM:
3763 if (LOC)
936ed897 3764 ANYOF_CLASS_SET(ret, ANYOF_NALNUM);
73b437c8
JH
3765 else {
3766 for (value = 0; value < 256; value++)
3767 if (!isALNUM(value))
936ed897 3768 ANYOF_BITMAP_SET(ret, value);
73b437c8 3769 }
ffc61ed2 3770 Perl_sv_catpvf(aTHX_ listsv, "!utf8::IsWord\n");
73b437c8 3771 break;
ffc61ed2 3772 case ANYOF_ALNUMC:
73b437c8 3773 if (LOC)
ffc61ed2 3774 ANYOF_CLASS_SET(ret, ANYOF_ALNUMC);
73b437c8
JH
3775 else {
3776 for (value = 0; value < 256; value++)
ffc61ed2 3777 if (isALNUMC(value))
936ed897 3778 ANYOF_BITMAP_SET(ret, value);
73b437c8 3779 }
ffc61ed2 3780 Perl_sv_catpvf(aTHX_ listsv, "+utf8::IsAlnum\n");
73b437c8
JH
3781 break;
3782 case ANYOF_NALNUMC:
3783 if (LOC)
936ed897 3784 ANYOF_CLASS_SET(ret, ANYOF_NALNUMC);
73b437c8
JH
3785 else {
3786 for (value = 0; value < 256; value++)
3787 if (!isALNUMC(value))
936ed897 3788 ANYOF_BITMAP_SET(ret, value);
73b437c8 3789 }
ffc61ed2 3790 Perl_sv_catpvf(aTHX_ listsv, "!utf8::IsAlnum\n");
73b437c8
JH
3791 break;
3792 case ANYOF_ALPHA:
3793 if (LOC)
936ed897 3794 ANYOF_CLASS_SET(ret, ANYOF_ALPHA);
73b437c8
JH
3795 else {
3796 for (value = 0; value < 256; value++)
3797 if (isALPHA(value))
936ed897 3798 ANYOF_BITMAP_SET(ret, value);
73b437c8 3799 }
ffc61ed2 3800 Perl_sv_catpvf(aTHX_ listsv, "+utf8::IsAlpha\n");
73b437c8
JH
3801 break;
3802 case ANYOF_NALPHA:
3803 if (LOC)
936ed897 3804 ANYOF_CLASS_SET(ret, ANYOF_NALPHA);
73b437c8
JH
3805 else {
3806 for (value = 0; value < 256; value++)
3807 if (!isALPHA(value))
936ed897 3808 ANYOF_BITMAP_SET(ret, value);
73b437c8 3809 }
ffc61ed2 3810 Perl_sv_catpvf(aTHX_ listsv, "!utf8::IsAlpha\n");
73b437c8
JH
3811 break;
3812 case ANYOF_ASCII:
3813 if (LOC)
936ed897 3814 ANYOF_CLASS_SET(ret, ANYOF_ASCII);
73b437c8 3815 else {
c7f1f016 3816#ifndef EBCDIC
1ba5c669
JH
3817 for (value = 0; value < 128; value++)
3818 ANYOF_BITMAP_SET(ret, value);
3819#else /* EBCDIC */
ffbc6a93 3820 for (value = 0; value < 256; value++) {
3a3c4447
JH
3821 if (isASCII(value))
3822 ANYOF_BITMAP_SET(ret, value);
ffbc6a93 3823 }
1ba5c669 3824#endif /* EBCDIC */
73b437c8 3825 }
ffc61ed2 3826 Perl_sv_catpvf(aTHX_ listsv, "+utf8::IsASCII\n");
73b437c8
JH
3827 break;
3828 case ANYOF_NASCII:
3829 if (LOC)
936ed897 3830 ANYOF_CLASS_SET(ret, ANYOF_NASCII);
73b437c8 3831 else {
c7f1f016 3832#ifndef EBCDIC
1ba5c669
JH
3833 for (value = 128; value < 256; value++)
3834 ANYOF_BITMAP_SET(ret, value);
3835#else /* EBCDIC */
ffbc6a93 3836 for (value = 0; value < 256; value++) {
3a3c4447
JH
3837 if (!isASCII(value))
3838 ANYOF_BITMAP_SET(ret, value);
ffbc6a93 3839 }
1ba5c669 3840#endif /* EBCDIC */
73b437c8 3841 }
ffc61ed2 3842 Perl_sv_catpvf(aTHX_ listsv, "!utf8::IsASCII\n");
73b437c8 3843 break;
aaa51d5e
JF
3844 case ANYOF_BLANK:
3845 if (LOC)
3846 ANYOF_CLASS_SET(ret, ANYOF_BLANK);
3847 else {
3848 for (value = 0; value < 256; value++)
3849 if (isBLANK(value))
3850 ANYOF_BITMAP_SET(ret, value);
3851 }
ffc61ed2 3852 Perl_sv_catpvf(aTHX_ listsv, "+utf8::IsBlank\n");
aaa51d5e
JF
3853 break;
3854 case ANYOF_NBLANK:
3855 if (LOC)
3856 ANYOF_CLASS_SET(ret, ANYOF_NBLANK);
3857 else {
3858 for (value = 0; value < 256; value++)
3859 if (!isBLANK(value))
3860 ANYOF_BITMAP_SET(ret, value);
3861 }
ffc61ed2 3862 Perl_sv_catpvf(aTHX_ listsv, "!utf8::IsBlank\n");
aaa51d5e 3863 break;
73b437c8
JH
3864 case ANYOF_CNTRL:
3865 if (LOC)
936ed897 3866 ANYOF_CLASS_SET(ret, ANYOF_CNTRL);
73b437c8
JH
3867 else {
3868 for (value = 0; value < 256; value++)
3869 if (isCNTRL(value))
936ed897 3870 ANYOF_BITMAP_SET(ret, value);
73b437c8 3871 }
ffc61ed2 3872 Perl_sv_catpvf(aTHX_ listsv, "+utf8::IsCntrl\n");
73b437c8
JH
3873 break;
3874 case ANYOF_NCNTRL:
3875 if (LOC)
936ed897 3876 ANYOF_CLASS_SET(ret, ANYOF_NCNTRL);
73b437c8
JH
3877 else {
3878 for (value = 0; value < 256; value++)
3879 if (!isCNTRL(value))
936ed897 3880 ANYOF_BITMAP_SET(ret, value);
73b437c8 3881 }
ffc61ed2
JH
3882 Perl_sv_catpvf(aTHX_ listsv, "!utf8::IsCntrl\n");
3883 break;
3884 case ANYOF_DIGIT:
3885 if (LOC)
3886 ANYOF_CLASS_SET(ret, ANYOF_DIGIT);
3887 else {
3888 /* consecutive digits assumed */
3889 for (value = '0'; value <= '9'; value++)
3890 ANYOF_BITMAP_SET(ret, value);
3891 }
3892 Perl_sv_catpvf(aTHX_ listsv, "+utf8::IsDigit\n");
3893 break;
3894 case ANYOF_NDIGIT:
3895 if (LOC)
3896 ANYOF_CLASS_SET(ret, ANYOF_NDIGIT);
3897 else {
3898 /* consecutive digits assumed */
3899 for (value = 0; value < '0'; value++)
3900 ANYOF_BITMAP_SET(ret, value);
3901 for (value = '9' + 1; value < 256; value++)
3902 ANYOF_BITMAP_SET(ret, value);
3903 }
3904 Perl_sv_catpvf(aTHX_ listsv, "!utf8::IsDigit\n");
73b437c8
JH
3905 break;
3906 case ANYOF_GRAPH:
3907 if (LOC)
936ed897 3908 ANYOF_CLASS_SET(ret, ANYOF_GRAPH);
73b437c8
JH
3909 else {
3910 for (value = 0; value < 256; value++)
3911 if (isGRAPH(value))
936ed897 3912 ANYOF_BITMAP_SET(ret, value);
73b437c8 3913 }
ffc61ed2 3914 Perl_sv_catpvf(aTHX_ listsv, "+utf8::IsGraph\n");
73b437c8
JH
3915 break;
3916 case ANYOF_NGRAPH:
3917 if (LOC)
936ed897 3918 ANYOF_CLASS_SET(ret, ANYOF_NGRAPH);
73b437c8
JH
3919 else {
3920 for (value = 0; value < 256; value++)
3921 if (!isGRAPH(value))
936ed897 3922 ANYOF_BITMAP_SET(ret, value);
73b437c8 3923 }
ffc61ed2 3924 Perl_sv_catpvf(aTHX_ listsv, "!utf8::IsGraph\n");
73b437c8
JH
3925 break;
3926 case ANYOF_LOWER:
3927 if (LOC)
936ed897 3928 ANYOF_CLASS_SET(ret, ANYOF_LOWER);
73b437c8
JH
3929 else {
3930 for (value = 0; value < 256; value++)
3931 if (isLOWER(value))
936ed897 3932 ANYOF_BITMAP_SET(ret, value);
73b437c8 3933 }
ffc61ed2 3934 Perl_sv_catpvf(aTHX_ listsv, "+utf8::IsLower\n");
73b437c8
JH
3935 break;
3936 case ANYOF_NLOWER:
3937 if (LOC)
936ed897 3938 ANYOF_CLASS_SET(ret, ANYOF_NLOWER);
73b437c8
JH
3939 else {
3940 for (value = 0; value < 256; value++)
3941 if (!isLOWER(value))
936ed897 3942 ANYOF_BITMAP_SET(ret, value);
73b437c8 3943 }
ffc61ed2 3944 Perl_sv_catpvf(aTHX_ listsv, "!utf8::IsLower\n");
73b437c8
JH
3945 break;
3946 case ANYOF_PRINT:
3947 if (LOC)
936ed897 3948 ANYOF_CLASS_SET(ret, ANYOF_PRINT);
73b437c8
JH
3949 else {
3950 for (value = 0; value < 256; value++)
3951 if (isPRINT(value))
936ed897 3952 ANYOF_BITMAP_SET(ret, value);
73b437c8 3953 }
ffc61ed2 3954 Perl_sv_catpvf(aTHX_ listsv, "+utf8::IsPrint\n");
73b437c8
JH
3955 break;
3956 case ANYOF_NPRINT:
3957 if (LOC)
936ed897 3958 ANYOF_CLASS_SET(ret, ANYOF_NPRINT);
73b437c8
JH
3959 else {
3960 for (value = 0; value < 256; value++)
3961 if (!isPRINT(value))
936ed897 3962 ANYOF_BITMAP_SET(ret, value);
73b437c8 3963 }
ffc61ed2 3964 Perl_sv_catpvf(aTHX_ listsv, "!utf8::IsPrint\n");
73b437c8 3965 break;
aaa51d5e
JF
3966 case ANYOF_PSXSPC:
3967 if (LOC)
3968 ANYOF_CLASS_SET(ret, ANYOF_PSXSPC);
3969 else {
3970 for (value = 0; value < 256; value++)
3971 if (isPSXSPC(value))
3972 ANYOF_BITMAP_SET(ret, value);
3973 }
ffc61ed2 3974 Perl_sv_catpvf(aTHX_ listsv, "+utf8::IsSpace\n");
aaa51d5e
JF
3975 break;
3976 case ANYOF_NPSXSPC:
3977 if (LOC)
3978 ANYOF_CLASS_SET(ret, ANYOF_NPSXSPC);
3979 else {
3980 for (value = 0; value < 256; value++)
3981 if (!isPSXSPC(value))
3982 ANYOF_BITMAP_SET(ret, value);
3983 }
ffc61ed2 3984 Perl_sv_catpvf(aTHX_ listsv, "!utf8::IsSpace\n");
aaa51d5e 3985 break;
73b437c8
JH
3986 case ANYOF_PUNCT:
3987 if (LOC)
936ed897 3988 ANYOF_CLASS_SET(ret, ANYOF_PUNCT);
73b437c8
JH
3989 else {
3990 for (value = 0; value < 256; value++)
3991 if (isPUNCT(value))
936ed897 3992 ANYOF_BITMAP_SET(ret, value);
73b437c8 3993 }
ffc61ed2 3994 Perl_sv_catpvf(aTHX_ listsv, "+utf8::IsPunct\n");
73b437c8
JH
3995 break;
3996 case ANYOF_NPUNCT:
3997 if (LOC)
936ed897 3998 ANYOF_CLASS_SET(ret, ANYOF_NPUNCT);
73b437c8
JH
3999 else {
4000 for (value = 0; value < 256; value++)
4001 if (!isPUNCT(value))
936ed897 4002 ANYOF_BITMAP_SET(ret, value);
73b437c8 4003 }
ffc61ed2
JH
4004 Perl_sv_catpvf(aTHX_ listsv, "!utf8::IsPunct\n");
4005 break;
4006 case ANYOF_SPACE:
4007 if (LOC)
4008 ANYOF_CLASS_SET(ret, ANYOF_SPACE);
4009 else {
4010 for (value = 0; value < 256; value++)
4011 if (isSPACE(value))
4012 ANYOF_BITMAP_SET(ret, value);
4013 }
4014 Perl_sv_catpvf(aTHX_ listsv, "+utf8::IsSpacePerl\n");
4015 break;
4016 case ANYOF_NSPACE:
4017 if (LOC)
4018 ANYOF_CLASS_SET(ret, ANYOF_NSPACE);
4019 else {
4020 for (value = 0; value < 256; value++)
4021 if (!isSPACE(value))
4022 ANYOF_BITMAP_SET(ret, value);
4023 }
4024 Perl_sv_catpvf(aTHX_ listsv, "!utf8::IsSpacePerl\n");
73b437c8
JH
4025 break;
4026 case ANYOF_UPPER:
4027 if (LOC)
936ed897 4028 ANYOF_CLASS_SET(ret, ANYOF_UPPER);
73b437c8
JH
4029 else {
4030 for (value = 0; value < 256; value++)
4031 if (isUPPER(value))
936ed897 4032 ANYOF_BITMAP_SET(ret, value);
73b437c8 4033 }
ffc61ed2 4034 Perl_sv_catpvf(aTHX_ listsv, "+utf8::IsUpper\n");
73b437c8
JH
4035 break;
4036 case ANYOF_NUPPER:
4037 if (LOC)
936ed897 4038 ANYOF_CLASS_SET(ret, ANYOF_NUPPER);
73b437c8
JH
4039 else {
4040 for (value = 0; value < 256; value++)
4041 if (!isUPPER(value))
936ed897 4042 ANYOF_BITMAP_SET(ret, value);
73b437c8 4043 }
ffc61ed2 4044 Perl_sv_catpvf(aTHX_ listsv, "!utf8::IsUpper\n");
73b437c8
JH
4045 break;
4046 case ANYOF_XDIGIT:
4047 if (LOC)
936ed897 4048 ANYOF_CLASS_SET(ret, ANYOF_XDIGIT);
73b437c8
JH
4049 else {
4050 for (value = 0; value < 256; value++)
4051 if (isXDIGIT(value))
936ed897 4052 ANYOF_BITMAP_SET(ret, value);
73b437c8 4053 }
ffc61ed2 4054 Perl_sv_catpvf(aTHX_ listsv, "+utf8::IsXDigit\n");
73b437c8
JH
4055 break;
4056 case ANYOF_NXDIGIT:
4057 if (LOC)
936ed897 4058 ANYOF_CLASS_SET(ret, ANYOF_NXDIGIT);
73b437c8
JH
4059 else {
4060 for (value = 0; value < 256; value++)
4061 if (!isXDIGIT(value))
936ed897 4062 ANYOF_BITMAP_SET(ret, value);
73b437c8 4063 }
ffc61ed2 4064 Perl_sv_catpvf(aTHX_ listsv, "!utf8::IsXDigit\n");
73b437c8
JH
4065 break;
4066 default:
b45f050a 4067 vFAIL("Invalid [::] class");
73b437c8 4068 break;
b8c5462f 4069 }
b8c5462f 4070 if (LOC)
936ed897 4071 ANYOF_FLAGS(ret) |= ANYOF_CLASS;
73b437c8 4072 continue;
a0d0e21e 4073 }
ffc61ed2
JH
4074 } /* end of namedclass \blah */
4075
a0d0e21e 4076 if (range) {
eb160463 4077 if (prevvalue > (IV)value) /* b-a */ {
b45f050a 4078 Simple_vFAIL4("Invalid [] range \"%*.*s\"",
830247a4
IZ
4079 RExC_parse - rangebegin,
4080 RExC_parse - rangebegin,
b45f050a 4081 rangebegin);
3568d838 4082 range = 0; /* not a valid range */
73b437c8 4083 }
a0d0e21e
LW
4084 }
4085 else {
3568d838 4086 prevvalue = value; /* save the beginning of the range */
830247a4
IZ
4087 if (*RExC_parse == '-' && RExC_parse+1 < RExC_end &&
4088 RExC_parse[1] != ']') {
4089 RExC_parse++;
ffc61ed2
JH
4090
4091 /* a bad range like \w-, [:word:]- ? */
4092 if (namedclass > OOB_NAMEDCLASS) {
e476b1b5 4093 if (ckWARN(WARN_REGEXP))
830247a4 4094 vWARN4(RExC_parse,
b45f050a 4095 "False [] range \"%*.*s\"",
830247a4
IZ
4096 RExC_parse - rangebegin,
4097 RExC_parse - rangebegin,
b45f050a 4098 rangebegin);
73b437c8 4099 if (!SIZE_ONLY)
936ed897 4100 ANYOF_BITMAP_SET(ret, '-');
73b437c8 4101 } else
ffc61ed2
JH
4102 range = 1; /* yeah, it's a range! */
4103 continue; /* but do it the next time */
a0d0e21e 4104 }
a687059c 4105 }
ffc61ed2 4106
93733859 4107 /* now is the next time */
ae5c130c 4108 if (!SIZE_ONLY) {
3568d838
JH
4109 IV i;
4110
4111 if (prevvalue < 256) {
4112 IV ceilvalue = value < 256 ? value : 255;
4113
4114#ifdef EBCDIC
1b2d223b
JH
4115 /* In EBCDIC [\x89-\x91] should include
4116 * the \x8e but [i-j] should not. */
4117 if (literal_endpoint == 2 &&
4118 ((isLOWER(prevvalue) && isLOWER(ceilvalue)) ||
4119 (isUPPER(prevvalue) && isUPPER(ceilvalue))))
ffc61ed2 4120 {
3568d838
JH
4121 if (isLOWER(prevvalue)) {
4122 for (i = prevvalue; i <= ceilvalue; i++)
ffc61ed2
JH
4123 if (isLOWER(i))
4124 ANYOF_BITMAP_SET(ret, i);
4125 } else {
3568d838 4126 for (i = prevvalue; i <= ceilvalue; i++)
ffc61ed2
JH
4127 if (isUPPER(i))
4128 ANYOF_BITMAP_SET(ret, i);
4129 }
8ada0baa 4130 }
ffc61ed2 4131 else
8ada0baa 4132#endif
a5961de5
JH
4133 for (i = prevvalue; i <= ceilvalue; i++)
4134 ANYOF_BITMAP_SET(ret, i);
3568d838 4135 }
a5961de5 4136 if (value > 255 || UTF) {
b08decb7
JH
4137 UV prevnatvalue = NATIVE_TO_UNI(prevvalue);
4138 UV natvalue = NATIVE_TO_UNI(value);
4139
ffc61ed2 4140 ANYOF_FLAGS(ret) |= ANYOF_UNICODE;
b08decb7 4141 if (prevnatvalue < natvalue) { /* what about > ? */
ffc61ed2 4142 Perl_sv_catpvf(aTHX_ listsv, "%04"UVxf"\t%04"UVxf"\n",
b08decb7
JH
4143 prevnatvalue, natvalue);
4144 }
4145 else if (prevnatvalue == natvalue) {
4146 Perl_sv_catpvf(aTHX_ listsv, "%04"UVxf"\n", natvalue);
09091399 4147 if (FOLD) {
254ba52a
JH
4148 U8 foldbuf[UTF8_MAXLEN_FOLD+1];
4149 STRLEN foldlen;
2f3bf011 4150 UV f = to_uni_fold(natvalue, foldbuf, &foldlen);
254ba52a 4151
c840d2a2
JH
4152 /* If folding and foldable and a single
4153 * character, insert also the folded version
4154 * to the charclass. */
9e55ce06 4155 if (f != value) {
eb160463 4156 if (foldlen == (STRLEN)UNISKIP(f))
9e55ce06
JH
4157 Perl_sv_catpvf(aTHX_ listsv,
4158 "%04"UVxf"\n", f);
4159 else {
4160 /* Any multicharacter foldings
4161 * require the following transform:
4162 * [ABCDEF] -> (?:[ABCabcDEFd]|pq|rst)
4163 * where E folds into "pq" and F folds
4164 * into "rst", all other characters
4165 * fold to single characters. We save
4166 * away these multicharacter foldings,
4167 * to be later saved as part of the
4168 * additional "s" data. */
4169 SV *sv;
4170
4171 if (!unicode_alternate)
4172 unicode_alternate = newAV();
4173 sv = newSVpvn((char*)foldbuf, foldlen);
4174 SvUTF8_on(sv);
4175 av_push(unicode_alternate, sv);
4176 }
4177 }
254ba52a 4178
60a8b682
JH
4179 /* If folding and the value is one of the Greek
4180 * sigmas insert a few more sigmas to make the
4181 * folding rules of the sigmas to work right.
4182 * Note that not all the possible combinations
4183 * are handled here: some of them are handled
9e55ce06
JH
4184 * by the standard folding rules, and some of
4185 * them (literal or EXACTF cases) are handled
4186 * during runtime in regexec.c:S_find_byclass(). */
09091399
JH
4187 if (value == UNICODE_GREEK_SMALL_LETTER_FINAL_SIGMA) {
4188 Perl_sv_catpvf(aTHX_ listsv, "%04"UVxf"\n",
eb050b28 4189 (UV)UNICODE_GREEK_CAPITAL_LETTER_SIGMA);
09091399 4190 Perl_sv_catpvf(aTHX_ listsv, "%04"UVxf"\n",
eb050b28 4191 (UV)UNICODE_GREEK_SMALL_LETTER_SIGMA);
09091399
JH
4192 }
4193 else if (value == UNICODE_GREEK_CAPITAL_LETTER_SIGMA)
4194 Perl_sv_catpvf(aTHX_ listsv, "%04"UVxf"\n",
eb050b28 4195 (UV)UNICODE_GREEK_SMALL_LETTER_SIGMA);
09091399
JH
4196 }
4197 }
ffc61ed2 4198 }
1b2d223b
JH
4199#ifdef EBCDIC
4200 literal_endpoint = 0;
4201#endif
8ada0baa 4202 }
ffc61ed2
JH
4203
4204 range = 0; /* this range (if it was one) is done now */
a0d0e21e 4205 }
ffc61ed2 4206
936ed897 4207 if (need_class) {
4f66b38d 4208 ANYOF_FLAGS(ret) |= ANYOF_LARGE;
936ed897 4209 if (SIZE_ONLY)
830247a4 4210 RExC_size += ANYOF_CLASS_ADD_SKIP;
936ed897 4211 else
830247a4 4212 RExC_emit += ANYOF_CLASS_ADD_SKIP;
936ed897 4213 }
ffc61ed2 4214
ae5c130c 4215 /* optimize case-insensitive simple patterns (e.g. /[a-z]/i) */
b8c5462f 4216 if (!SIZE_ONLY &&
ffc61ed2 4217 /* If the only flag is folding (plus possibly inversion). */
516a5887
JH
4218 ((ANYOF_FLAGS(ret) & (ANYOF_FLAGS_ALL ^ ANYOF_INVERT)) == ANYOF_FOLD)
4219 ) {
a0ed51b3 4220 for (value = 0; value < 256; ++value) {
936ed897 4221 if (ANYOF_BITMAP_TEST(ret, value)) {
eb160463 4222 UV fold = PL_fold[value];
ffc61ed2
JH
4223
4224 if (fold != value)
4225 ANYOF_BITMAP_SET(ret, fold);
ae5c130c
GS
4226 }
4227 }
936ed897 4228 ANYOF_FLAGS(ret) &= ~ANYOF_FOLD;
ae5c130c 4229 }
ffc61ed2 4230
ae5c130c 4231 /* optimize inverted simple patterns (e.g. [^a-z]) */
3568d838 4232 if (!SIZE_ONLY && optimize_invert &&
ffc61ed2
JH
4233 /* If the only flag is inversion. */
4234 (ANYOF_FLAGS(ret) & ANYOF_FLAGS_ALL) == ANYOF_INVERT) {
b8c5462f 4235 for (value = 0; value < ANYOF_BITMAP_SIZE; ++value)
936ed897 4236 ANYOF_BITMAP(ret)[value] ^= ANYOF_FLAGS_ALL;
1aa99e6b 4237 ANYOF_FLAGS(ret) = ANYOF_UNICODE_ALL;
ae5c130c 4238 }
a0d0e21e 4239
b81d288d 4240 if (!SIZE_ONLY) {
fde631ed 4241 AV *av = newAV();
ffc61ed2
JH
4242 SV *rv;
4243
9e55ce06 4244 /* The 0th element stores the character class description
6a0407ee 4245 * in its textual form: used later (regexec.c:Perl_regclass_swash())
9e55ce06
JH
4246 * to initialize the appropriate swash (which gets stored in
4247 * the 1st element), and also useful for dumping the regnode.
4248 * The 2nd element stores the multicharacter foldings,
6a0407ee 4249 * used later (regexec.c:S_reginclass()). */
ffc61ed2
JH
4250 av_store(av, 0, listsv);
4251 av_store(av, 1, NULL);
9e55ce06 4252 av_store(av, 2, (SV*)unicode_alternate);
ffc61ed2 4253 rv = newRV_noinc((SV*)av);
19860706 4254 n = add_data(pRExC_state, 1, "s");
830247a4 4255 RExC_rx->data->data[n] = (void*)rv;
ffc61ed2 4256 ARG_SET(ret, n);
a0ed51b3
LW
4257 }
4258
4259 return ret;
4260}
4261
76e3520e 4262STATIC char*
830247a4 4263S_nextchar(pTHX_ RExC_state_t *pRExC_state)
a0d0e21e 4264{
830247a4 4265 char* retval = RExC_parse++;
a0d0e21e 4266
4633a7c4 4267 for (;;) {
830247a4
IZ
4268 if (*RExC_parse == '(' && RExC_parse[1] == '?' &&
4269 RExC_parse[2] == '#') {
4270 while (*RExC_parse && *RExC_parse != ')')
4271 RExC_parse++;
4272 RExC_parse++;
4633a7c4
LW
4273 continue;
4274 }
e2509266 4275 if (RExC_flags & PMf_EXTENDED) {
830247a4
IZ
4276 if (isSPACE(*RExC_parse)) {
4277 RExC_parse++;
748a9306
LW
4278 continue;
4279 }
830247a4
IZ
4280 else if (*RExC_parse == '#') {
4281 while (*RExC_parse && *RExC_parse != '\n')
4282 RExC_parse++;
4283 RExC_parse++;
748a9306
LW
4284 continue;
4285 }
748a9306 4286 }
4633a7c4 4287 return retval;
a0d0e21e 4288 }
a687059c
LW
4289}
4290
4291/*
c277df42 4292- reg_node - emit a node
a0d0e21e 4293*/
76e3520e 4294STATIC regnode * /* Location. */
830247a4 4295S_reg_node(pTHX_ RExC_state_t *pRExC_state, U8 op)
a687059c 4296{
c277df42
IZ
4297 register regnode *ret;
4298 register regnode *ptr;
a687059c 4299
830247a4 4300 ret = RExC_emit;
c277df42 4301 if (SIZE_ONLY) {
830247a4
IZ
4302 SIZE_ALIGN(RExC_size);
4303 RExC_size += 1;
a0d0e21e
LW
4304 return(ret);
4305 }
a687059c 4306
c277df42 4307 NODE_ALIGN_FILL(ret);
a0d0e21e 4308 ptr = ret;
c277df42 4309 FILL_ADVANCE_NODE(ptr, op);
fac92740 4310 if (RExC_offsets) { /* MJD */
ccb2c380 4311 MJD_OFFSET_DEBUG(("%s:%u: (op %s) %s %u <- %u (len %u) (max %u).\n",
fac92740
MJD
4312 "reg_node", __LINE__,
4313 reg_name[op],
4314 RExC_emit - RExC_emit_start > RExC_offsets[0]
4315 ? "Overwriting end of array!\n" : "OK",
4316 RExC_emit - RExC_emit_start,
4317 RExC_parse - RExC_start,
4318 RExC_offsets[0]));
ccb2c380 4319 Set_Node_Offset(RExC_emit, RExC_parse + (op == END));
fac92740
MJD
4320 }
4321
830247a4 4322 RExC_emit = ptr;
a687059c 4323
a0d0e21e 4324 return(ret);
a687059c
LW
4325}
4326
4327/*
a0d0e21e
LW
4328- reganode - emit a node with an argument
4329*/
76e3520e 4330STATIC regnode * /* Location. */
830247a4 4331S_reganode(pTHX_ RExC_state_t *pRExC_state, U8 op, U32 arg)
fe14fcc3 4332{
c277df42
IZ
4333 register regnode *ret;
4334 register regnode *ptr;
fe14fcc3 4335
830247a4 4336 ret = RExC_emit;
c277df42 4337 if (SIZE_ONLY) {
830247a4
IZ
4338 SIZE_ALIGN(RExC_size);
4339 RExC_size += 2;
a0d0e21e
LW
4340 return(ret);
4341 }
fe14fcc3 4342
c277df42 4343 NODE_ALIGN_FILL(ret);
a0d0e21e 4344 ptr = ret;
c277df42 4345 FILL_ADVANCE_NODE_ARG(ptr, op, arg);
fac92740 4346 if (RExC_offsets) { /* MJD */
ccb2c380 4347 MJD_OFFSET_DEBUG(("%s(%d): (op %s) %s %u <- %u (max %u).\n",
fac92740 4348 "reganode",
ccb2c380
MP
4349 __LINE__,
4350 reg_name[op],
fac92740
MJD
4351 RExC_emit - RExC_emit_start > RExC_offsets[0] ?
4352 "Overwriting end of array!\n" : "OK",
4353 RExC_emit - RExC_emit_start,
4354 RExC_parse - RExC_start,
4355 RExC_offsets[0]));
ccb2c380 4356 Set_Cur_Node_Offset;
fac92740
MJD
4357 }
4358
830247a4 4359 RExC_emit = ptr;
fe14fcc3 4360
a0d0e21e 4361 return(ret);
fe14fcc3
LW
4362}
4363
4364/*
cd439c50 4365- reguni - emit (if appropriate) a Unicode character
a0ed51b3
LW
4366*/
4367STATIC void
830247a4 4368S_reguni(pTHX_ RExC_state_t *pRExC_state, UV uv, char* s, STRLEN* lenp)
a0ed51b3 4369{
5e12f4fb 4370 *lenp = SIZE_ONLY ? UNISKIP(uv) : (uvchr_to_utf8((U8*)s, uv) - (U8*)s);
a0ed51b3
LW
4371}
4372
4373/*
a0d0e21e
LW
4374- reginsert - insert an operator in front of already-emitted operand
4375*
4376* Means relocating the operand.
4377*/
76e3520e 4378STATIC void
830247a4 4379S_reginsert(pTHX_ RExC_state_t *pRExC_state, U8 op, regnode *opnd)
a687059c 4380{
c277df42
IZ
4381 register regnode *src;
4382 register regnode *dst;
4383 register regnode *place;
4384 register int offset = regarglen[(U8)op];
b81d288d 4385
22c35a8c 4386/* (PL_regkind[(U8)op] == CURLY ? EXTRA_STEP_2ARGS : 0); */
c277df42
IZ
4387
4388 if (SIZE_ONLY) {
830247a4 4389 RExC_size += NODE_STEP_REGNODE + offset;
a0d0e21e
LW
4390 return;
4391 }
a687059c 4392
830247a4
IZ
4393 src = RExC_emit;
4394 RExC_emit += NODE_STEP_REGNODE + offset;
4395 dst = RExC_emit;
fac92740 4396 while (src > opnd) {
c277df42 4397 StructCopy(--src, --dst, regnode);
fac92740 4398 if (RExC_offsets) { /* MJD 20010112 */
ccb2c380 4399 MJD_OFFSET_DEBUG(("%s(%d): (op %s) %s copy %u -> %u (max %u).\n",
fac92740 4400 "reg_insert",
ccb2c380
MP
4401 __LINE__,
4402 reg_name[op],
fac92740
MJD
4403 dst - RExC_emit_start > RExC_offsets[0]
4404 ? "Overwriting end of array!\n" : "OK",
4405 src - RExC_emit_start,
4406 dst - RExC_emit_start,
4407 RExC_offsets[0]));
ccb2c380
MP
4408 Set_Node_Offset_To_R(dst-RExC_emit_start, Node_Offset(src));
4409 Set_Node_Length_To_R(dst-RExC_emit_start, Node_Length(src));
fac92740
MJD
4410 }
4411 }
4412
a0d0e21e
LW
4413
4414 place = opnd; /* Op node, where operand used to be. */
fac92740 4415 if (RExC_offsets) { /* MJD */
ccb2c380 4416 MJD_OFFSET_DEBUG(("%s(%d): (op %s) %s %u <- %u (max %u).\n",
fac92740 4417 "reginsert",
ccb2c380
MP
4418 __LINE__,
4419 reg_name[op],
fac92740
MJD
4420 place - RExC_emit_start > RExC_offsets[0]
4421 ? "Overwriting end of array!\n" : "OK",
4422 place - RExC_emit_start,
4423 RExC_parse - RExC_start,
4424 RExC_offsets[0]));
ccb2c380 4425 Set_Node_Offset(place, RExC_parse);
fac92740 4426 }
c277df42
IZ
4427 src = NEXTOPER(place);
4428 FILL_ADVANCE_NODE(place, op);
4429 Zero(src, offset, regnode);
a687059c
LW
4430}
4431
4432/*
c277df42 4433- regtail - set the next-pointer at the end of a node chain of p to val.
a0d0e21e 4434*/
76e3520e 4435STATIC void
830247a4 4436S_regtail(pTHX_ RExC_state_t *pRExC_state, regnode *p, regnode *val)
a687059c 4437{
c277df42
IZ
4438 register regnode *scan;
4439 register regnode *temp;
a0d0e21e 4440
c277df42 4441 if (SIZE_ONLY)
a0d0e21e
LW
4442 return;
4443
4444 /* Find last node. */
4445 scan = p;
4446 for (;;) {
4447 temp = regnext(scan);
4448 if (temp == NULL)
4449 break;
4450 scan = temp;
4451 }
a687059c 4452
c277df42
IZ
4453 if (reg_off_by_arg[OP(scan)]) {
4454 ARG_SET(scan, val - scan);
a0ed51b3
LW
4455 }
4456 else {
c277df42
IZ
4457 NEXT_OFF(scan) = val - scan;
4458 }
a687059c
LW
4459}
4460
4461/*
a0d0e21e
LW
4462- regoptail - regtail on operand of first argument; nop if operandless
4463*/
76e3520e 4464STATIC void
830247a4 4465S_regoptail(pTHX_ RExC_state_t *pRExC_state, regnode *p, regnode *val)
a687059c 4466{
a0d0e21e 4467 /* "Operandless" and "op != BRANCH" are synonymous in practice. */
c277df42
IZ
4468 if (p == NULL || SIZE_ONLY)
4469 return;
22c35a8c 4470 if (PL_regkind[(U8)OP(p)] == BRANCH) {
830247a4 4471 regtail(pRExC_state, NEXTOPER(p), val);
a0ed51b3 4472 }
22c35a8c 4473 else if ( PL_regkind[(U8)OP(p)] == BRANCHJ) {
830247a4 4474 regtail(pRExC_state, NEXTOPER(NEXTOPER(p)), val);
a0ed51b3
LW
4475 }
4476 else
a0d0e21e 4477 return;
a687059c
LW
4478}
4479
4480/*
4481 - regcurly - a little FSA that accepts {\d+,?\d*}
4482 */
79072805 4483STATIC I32
cea2e8a9 4484S_regcurly(pTHX_ register char *s)
a687059c
LW
4485{
4486 if (*s++ != '{')
4487 return FALSE;
f0fcb552 4488 if (!isDIGIT(*s))
a687059c 4489 return FALSE;
f0fcb552 4490 while (isDIGIT(*s))
a687059c
LW
4491 s++;
4492 if (*s == ',')
4493 s++;
f0fcb552 4494 while (isDIGIT(*s))
a687059c
LW
4495 s++;
4496 if (*s != '}')
4497 return FALSE;
4498 return TRUE;
4499}
4500
a687059c 4501
8fa7f367
JH
4502#ifdef DEBUGGING
4503
76e3520e 4504STATIC regnode *
cea2e8a9 4505S_dumpuntil(pTHX_ regnode *start, regnode *node, regnode *last, SV* sv, I32 l)
c277df42 4506{
f248d071 4507 register U8 op = EXACT; /* Arbitrary non-END op. */
155aba94 4508 register regnode *next;
c277df42
IZ
4509
4510 while (op != END && (!last || node < last)) {
4511 /* While that wasn't END last time... */
4512
4513 NODE_ALIGN(node);
4514 op = OP(node);
4515 if (op == CLOSE)
4516 l--;
4517 next = regnext(node);
4518 /* Where, what. */
4519 if (OP(node) == OPTIMIZED)
4520 goto after_print;
4521 regprop(sv, node);
b900a521 4522 PerlIO_printf(Perl_debug_log, "%4"IVdf":%*s%s", (IV)(node - start),
f1dbda3d 4523 (int)(2*l + 1), "", SvPVX(sv));
c277df42
IZ
4524 if (next == NULL) /* Next ptr. */
4525 PerlIO_printf(Perl_debug_log, "(0)");
b81d288d 4526 else
b900a521 4527 PerlIO_printf(Perl_debug_log, "(%"IVdf")", (IV)(next - start));
c277df42
IZ
4528 (void)PerlIO_putc(Perl_debug_log, '\n');
4529 after_print:
22c35a8c 4530 if (PL_regkind[(U8)op] == BRANCHJ) {
b81d288d
AB
4531 register regnode *nnode = (OP(next) == LONGJMP
4532 ? regnext(next)
c277df42
IZ
4533 : next);
4534 if (last && nnode > last)
4535 nnode = last;
4536 node = dumpuntil(start, NEXTOPER(NEXTOPER(node)), nnode, sv, l + 1);
a0ed51b3 4537 }
22c35a8c 4538 else if (PL_regkind[(U8)op] == BRANCH) {
c277df42 4539 node = dumpuntil(start, NEXTOPER(node), next, sv, l + 1);
a0ed51b3
LW
4540 }
4541 else if ( op == CURLY) { /* `next' might be very big: optimizer */
c277df42
IZ
4542 node = dumpuntil(start, NEXTOPER(node) + EXTRA_STEP_2ARGS,
4543 NEXTOPER(node) + EXTRA_STEP_2ARGS + 1, sv, l + 1);
a0ed51b3 4544 }
22c35a8c 4545 else if (PL_regkind[(U8)op] == CURLY && op != CURLYX) {
c277df42
IZ
4546 node = dumpuntil(start, NEXTOPER(node) + EXTRA_STEP_2ARGS,
4547 next, sv, l + 1);
a0ed51b3
LW
4548 }
4549 else if ( op == PLUS || op == STAR) {
c277df42 4550 node = dumpuntil(start, NEXTOPER(node), NEXTOPER(node) + 1, sv, l + 1);
a0ed51b3
LW
4551 }
4552 else if (op == ANYOF) {
4f66b38d
HS
4553 /* arglen 1 + class block */
4554 node += 1 + ((ANYOF_FLAGS(node) & ANYOF_LARGE)
4555 ? ANYOF_CLASS_SKIP : ANYOF_SKIP);
4556 node = NEXTOPER(node);
a0ed51b3 4557 }
22c35a8c 4558 else if (PL_regkind[(U8)op] == EXACT) {
c277df42 4559 /* Literal string, where present. */
cd439c50 4560 node += NODE_SZ_STR(node) - 1;
c277df42 4561 node = NEXTOPER(node);
a0ed51b3
LW
4562 }
4563 else {
c277df42
IZ
4564 node = NEXTOPER(node);
4565 node += regarglen[(U8)op];
4566 }
4567 if (op == CURLYX || op == OPEN)
4568 l++;
4569 else if (op == WHILEM)
4570 l--;
4571 }
4572 return node;
4573}
4574
8fa7f367
JH
4575#endif /* DEBUGGING */
4576
a687059c 4577/*
fd181c75 4578 - regdump - dump a regexp onto Perl_debug_log in vaguely comprehensible form
a687059c
LW
4579 */
4580void
864dbfa3 4581Perl_regdump(pTHX_ regexp *r)
a687059c 4582{
35ff7856 4583#ifdef DEBUGGING
46fc3d4c 4584 SV *sv = sv_newmortal();
a687059c 4585
c277df42 4586 (void)dumpuntil(r->program, r->program + 1, NULL, sv, 0);
a0d0e21e
LW
4587
4588 /* Header fields of interest. */
c277df42 4589 if (r->anchored_substr)
7b0972df 4590 PerlIO_printf(Perl_debug_log,
b81d288d 4591 "anchored `%s%.*s%s'%s at %"IVdf" ",
3280af22 4592 PL_colors[0],
7b0972df 4593 (int)(SvCUR(r->anchored_substr) - (SvTAIL(r->anchored_substr)!=0)),
b81d288d 4594 SvPVX(r->anchored_substr),
3280af22 4595 PL_colors[1],
c277df42 4596 SvTAIL(r->anchored_substr) ? "$" : "",
7b0972df 4597 (IV)r->anchored_offset);
33b8afdf
JH
4598 else if (r->anchored_utf8)
4599 PerlIO_printf(Perl_debug_log,
4600 "anchored utf8 `%s%.*s%s'%s at %"IVdf" ",
4601 PL_colors[0],
4602 (int)(SvCUR(r->anchored_utf8) - (SvTAIL(r->anchored_utf8)!=0)),
4603 SvPVX(r->anchored_utf8),
4604 PL_colors[1],
4605 SvTAIL(r->anchored_utf8) ? "$" : "",
4606 (IV)r->anchored_offset);
c277df42 4607 if (r->float_substr)
7b0972df 4608 PerlIO_printf(Perl_debug_log,
b81d288d 4609 "floating `%s%.*s%s'%s at %"IVdf"..%"UVuf" ",
3280af22 4610 PL_colors[0],
b81d288d 4611 (int)(SvCUR(r->float_substr) - (SvTAIL(r->float_substr)!=0)),
2c2d71f5 4612 SvPVX(r->float_substr),
3280af22 4613 PL_colors[1],
c277df42 4614 SvTAIL(r->float_substr) ? "$" : "",
7b0972df 4615 (IV)r->float_min_offset, (UV)r->float_max_offset);
33b8afdf
JH
4616 else if (r->float_utf8)
4617 PerlIO_printf(Perl_debug_log,
4618 "floating utf8 `%s%.*s%s'%s at %"IVdf"..%"UVuf" ",
4619 PL_colors[0],
4620 (int)(SvCUR(r->float_utf8) - (SvTAIL(r->float_utf8)!=0)),
4621 SvPVX(r->float_utf8),
4622 PL_colors[1],
4623 SvTAIL(r->float_utf8) ? "$" : "",
4624 (IV)r->float_min_offset, (UV)r->float_max_offset);
4625 if (r->check_substr || r->check_utf8)
b81d288d
AB
4626 PerlIO_printf(Perl_debug_log,
4627 r->check_substr == r->float_substr
33b8afdf 4628 && r->check_utf8 == r->float_utf8
c277df42
IZ
4629 ? "(checking floating" : "(checking anchored");
4630 if (r->reganch & ROPT_NOSCAN)
4631 PerlIO_printf(Perl_debug_log, " noscan");
4632 if (r->reganch & ROPT_CHECK_ALL)
4633 PerlIO_printf(Perl_debug_log, " isall");
33b8afdf 4634 if (r->check_substr || r->check_utf8)
c277df42
IZ
4635 PerlIO_printf(Perl_debug_log, ") ");
4636
46fc3d4c 4637 if (r->regstclass) {
4638 regprop(sv, r->regstclass);
4639 PerlIO_printf(Perl_debug_log, "stclass `%s' ", SvPVX(sv));
4640 }
774d564b 4641 if (r->reganch & ROPT_ANCH) {
4642 PerlIO_printf(Perl_debug_log, "anchored");
4643 if (r->reganch & ROPT_ANCH_BOL)
4644 PerlIO_printf(Perl_debug_log, "(BOL)");
c277df42
IZ
4645 if (r->reganch & ROPT_ANCH_MBOL)
4646 PerlIO_printf(Perl_debug_log, "(MBOL)");
cad2e5aa
JH
4647 if (r->reganch & ROPT_ANCH_SBOL)
4648 PerlIO_printf(Perl_debug_log, "(SBOL)");
774d564b 4649 if (r->reganch & ROPT_ANCH_GPOS)
4650 PerlIO_printf(Perl_debug_log, "(GPOS)");
4651 PerlIO_putc(Perl_debug_log, ' ');
4652 }
c277df42
IZ
4653 if (r->reganch & ROPT_GPOS_SEEN)
4654 PerlIO_printf(Perl_debug_log, "GPOS ");
a0d0e21e 4655 if (r->reganch & ROPT_SKIP)
760ac839 4656 PerlIO_printf(Perl_debug_log, "plus ");
a0d0e21e 4657 if (r->reganch & ROPT_IMPLICIT)
760ac839 4658 PerlIO_printf(Perl_debug_log, "implicit ");
760ac839 4659 PerlIO_printf(Perl_debug_log, "minlen %ld ", (long) r->minlen);
ce862d02
IZ
4660 if (r->reganch & ROPT_EVAL_SEEN)
4661 PerlIO_printf(Perl_debug_log, "with eval ");
760ac839 4662 PerlIO_printf(Perl_debug_log, "\n");
fac92740
MJD
4663 if (r->offsets) {
4664 U32 i;
4665 U32 len = r->offsets[0];
392fbf5d 4666 PerlIO_printf(Perl_debug_log, "Offsets: [%"UVuf"]\n\t", (UV)r->offsets[0]);
fac92740 4667 for (i = 1; i <= len; i++)
392fbf5d
RB
4668 PerlIO_printf(Perl_debug_log, "%"UVuf"[%"UVuf"] ",
4669 (UV)r->offsets[i*2-1],
4670 (UV)r->offsets[i*2]);
fac92740
MJD
4671 PerlIO_printf(Perl_debug_log, "\n");
4672 }
17c3b450 4673#endif /* DEBUGGING */
a687059c
LW
4674}
4675
8fa7f367
JH
4676#ifdef DEBUGGING
4677
653099ff
GS
4678STATIC void
4679S_put_byte(pTHX_ SV *sv, int c)
4680{
7be5a6cf 4681 if (isCNTRL(c) || c == 255 || !isPRINT(c))
653099ff
GS
4682 Perl_sv_catpvf(aTHX_ sv, "\\%o", c);
4683 else if (c == '-' || c == ']' || c == '\\' || c == '^')
4684 Perl_sv_catpvf(aTHX_ sv, "\\%c", c);
4685 else
4686 Perl_sv_catpvf(aTHX_ sv, "%c", c);
4687}
4688
8fa7f367
JH
4689#endif /* DEBUGGING */
4690
a687059c 4691/*
a0d0e21e
LW
4692- regprop - printable representation of opcode
4693*/
46fc3d4c 4694void
864dbfa3 4695Perl_regprop(pTHX_ SV *sv, regnode *o)
a687059c 4696{
35ff7856 4697#ifdef DEBUGGING
9b155405 4698 register int k;
a0d0e21e 4699
54dc92de 4700 sv_setpvn(sv, "", 0);
9b155405 4701 if (OP(o) >= reg_num) /* regnode.type is unsigned */
830247a4
IZ
4702 /* It would be nice to FAIL() here, but this may be called from
4703 regexec.c, and it would be hard to supply pRExC_state. */
4704 Perl_croak(aTHX_ "Corrupted regexp opcode");
9b155405
IZ
4705 sv_catpv(sv, (char*)reg_name[OP(o)]); /* Take off const! */
4706
4707 k = PL_regkind[(U8)OP(o)];
4708
2a782b5b
JH
4709 if (k == EXACT) {
4710 SV *dsv = sv_2mortal(newSVpvn("", 0));
c728cb41
JH
4711 /* Using is_utf8_string() is a crude hack but it may
4712 * be the best for now since we have no flag "this EXACTish
4713 * node was UTF-8" --jhi */
4714 bool do_utf8 = is_utf8_string((U8*)STRING(o), STR_LEN(o));
8a989385 4715 char *s = do_utf8 ?
c728cb41
JH
4716 pv_uni_display(dsv, (U8*)STRING(o), STR_LEN(o), 60,
4717 UNI_DISPLAY_REGEX) :
2a782b5b 4718 STRING(o);
40eddc46 4719 int len = do_utf8 ?
2a782b5b
JH
4720 strlen(s) :
4721 STR_LEN(o);
4722 Perl_sv_catpvf(aTHX_ sv, " <%s%.*s%s>",
4723 PL_colors[0],
4724 len, s,
4725 PL_colors[1]);
4726 }
9b155405 4727 else if (k == CURLY) {
cb434fcc 4728 if (OP(o) == CURLYM || OP(o) == CURLYN || OP(o) == CURLYX)
cea2e8a9
GS
4729 Perl_sv_catpvf(aTHX_ sv, "[%d]", o->flags); /* Parenth number */
4730 Perl_sv_catpvf(aTHX_ sv, " {%d,%d}", ARG1(o), ARG2(o));
a0d0e21e 4731 }
2c2d71f5
JH
4732 else if (k == WHILEM && o->flags) /* Ordinal/of */
4733 Perl_sv_catpvf(aTHX_ sv, "[%d/%d]", o->flags & 0xf, o->flags>>4);
9b155405 4734 else if (k == REF || k == OPEN || k == CLOSE || k == GROUPP )
894356b3 4735 Perl_sv_catpvf(aTHX_ sv, "%d", (int)ARG(o)); /* Parenth number */
9b155405 4736 else if (k == LOGICAL)
04ebc1ab 4737 Perl_sv_catpvf(aTHX_ sv, "[%d]", o->flags); /* 2: embedded, otherwise 1 */
653099ff
GS
4738 else if (k == ANYOF) {
4739 int i, rangestart = -1;
ffc61ed2 4740 U8 flags = ANYOF_FLAGS(o);
a6d05634 4741 const char * const anyofs[] = { /* Should be synchronized with
19860706 4742 * ANYOF_ #xdefines in regcomp.h */
653099ff
GS
4743 "\\w",
4744 "\\W",
4745 "\\s",
4746 "\\S",
4747 "\\d",
4748 "\\D",
4749 "[:alnum:]",
4750 "[:^alnum:]",
4751 "[:alpha:]",
4752 "[:^alpha:]",
4753 "[:ascii:]",
4754 "[:^ascii:]",
4755 "[:ctrl:]",
4756 "[:^ctrl:]",
4757 "[:graph:]",
4758 "[:^graph:]",
4759 "[:lower:]",
4760 "[:^lower:]",
4761 "[:print:]",
4762 "[:^print:]",
4763 "[:punct:]",
4764 "[:^punct:]",
4765 "[:upper:]",
aaa51d5e 4766 "[:^upper:]",
653099ff 4767 "[:xdigit:]",
aaa51d5e
JF
4768 "[:^xdigit:]",
4769 "[:space:]",
4770 "[:^space:]",
4771 "[:blank:]",
4772 "[:^blank:]"
653099ff
GS
4773 };
4774
19860706 4775 if (flags & ANYOF_LOCALE)
653099ff 4776 sv_catpv(sv, "{loc}");
19860706 4777 if (flags & ANYOF_FOLD)
653099ff
GS
4778 sv_catpv(sv, "{i}");
4779 Perl_sv_catpvf(aTHX_ sv, "[%s", PL_colors[0]);
19860706 4780 if (flags & ANYOF_INVERT)
653099ff 4781 sv_catpv(sv, "^");
ffc61ed2
JH
4782 for (i = 0; i <= 256; i++) {
4783 if (i < 256 && ANYOF_BITMAP_TEST(o,i)) {
4784 if (rangestart == -1)
4785 rangestart = i;
4786 } else if (rangestart != -1) {
4787 if (i <= rangestart + 3)
4788 for (; rangestart < i; rangestart++)
653099ff 4789 put_byte(sv, rangestart);
ffc61ed2
JH
4790 else {
4791 put_byte(sv, rangestart);
4792 sv_catpv(sv, "-");
4793 put_byte(sv, i - 1);
653099ff 4794 }
ffc61ed2 4795 rangestart = -1;
653099ff 4796 }
847a199f 4797 }
ffc61ed2
JH
4798
4799 if (o->flags & ANYOF_CLASS)
4800 for (i = 0; i < sizeof(anyofs)/sizeof(char*); i++)
4801 if (ANYOF_CLASS_TEST(o,i))
4802 sv_catpv(sv, anyofs[i]);
4803
4804 if (flags & ANYOF_UNICODE)
4805 sv_catpv(sv, "{unicode}");
1aa99e6b 4806 else if (flags & ANYOF_UNICODE_ALL)
2a782b5b 4807 sv_catpv(sv, "{unicode_all}");
ffc61ed2
JH
4808
4809 {
4810 SV *lv;
9e55ce06 4811 SV *sw = regclass_swash(o, FALSE, &lv, 0);
b81d288d 4812
ffc61ed2
JH
4813 if (lv) {
4814 if (sw) {
ffc61ed2 4815 U8 s[UTF8_MAXLEN+1];
b81d288d 4816
ffc61ed2 4817 for (i = 0; i <= 256; i++) { /* just the first 256 */
2b9d42f0 4818 U8 *e = uvchr_to_utf8(s, i);
ffc61ed2 4819
3568d838 4820 if (i < 256 && swash_fetch(sw, s, TRUE)) {
ffc61ed2
JH
4821 if (rangestart == -1)
4822 rangestart = i;
4823 } else if (rangestart != -1) {
4824 U8 *p;
b81d288d 4825
ffc61ed2
JH
4826 if (i <= rangestart + 3)
4827 for (; rangestart < i; rangestart++) {
2b9d42f0 4828 for(e = uvchr_to_utf8(s, rangestart), p = s; p < e; p++)
ffc61ed2
JH
4829 put_byte(sv, *p);
4830 }
4831 else {
2b9d42f0 4832 for (e = uvchr_to_utf8(s, rangestart), p = s; p < e; p++)
ffc61ed2
JH
4833 put_byte(sv, *p);
4834 sv_catpv(sv, "-");
2b9d42f0 4835 for (e = uvchr_to_utf8(s, i - 1), p = s; p < e; p++)
ffc61ed2
JH
4836 put_byte(sv, *p);
4837 }
4838 rangestart = -1;
4839 }
19860706 4840 }
ffc61ed2
JH
4841
4842 sv_catpv(sv, "..."); /* et cetera */
19860706 4843 }
fde631ed 4844
ffc61ed2
JH
4845 {
4846 char *s = savepv(SvPVX(lv));
4847 char *origs = s;
b81d288d 4848
ffc61ed2 4849 while(*s && *s != '\n') s++;
b81d288d 4850
ffc61ed2
JH
4851 if (*s == '\n') {
4852 char *t = ++s;
4853
4854 while (*s) {
4855 if (*s == '\n')
4856 *s = ' ';
4857 s++;
4858 }
4859 if (s[-1] == ' ')
4860 s[-1] = 0;
4861
4862 sv_catpv(sv, t);
fde631ed 4863 }
b81d288d 4864
ffc61ed2 4865 Safefree(origs);
fde631ed
JH
4866 }
4867 }
653099ff 4868 }
ffc61ed2 4869
653099ff
GS
4870 Perl_sv_catpvf(aTHX_ sv, "%s]", PL_colors[1]);
4871 }
9b155405 4872 else if (k == BRANCHJ && (OP(o) == UNLESSM || OP(o) == IFMATCH))
cea2e8a9 4873 Perl_sv_catpvf(aTHX_ sv, "[-%d]", o->flags);
17c3b450 4874#endif /* DEBUGGING */
35ff7856 4875}
a687059c 4876
cad2e5aa
JH
4877SV *
4878Perl_re_intuit_string(pTHX_ regexp *prog)
4879{ /* Assume that RE_INTUIT is set */
4880 DEBUG_r(
4881 { STRLEN n_a;
33b8afdf
JH
4882 char *s = SvPV(prog->check_substr
4883 ? prog->check_substr : prog->check_utf8, n_a);
cad2e5aa
JH
4884
4885 if (!PL_colorset) reginitcolors();
4886 PerlIO_printf(Perl_debug_log,
33b8afdf
JH
4887 "%sUsing REx %ssubstr:%s `%s%.60s%s%s'\n",
4888 PL_colors[4],
4889 prog->check_substr ? "" : "utf8 ",
4890 PL_colors[5],PL_colors[0],
cad2e5aa
JH
4891 s,
4892 PL_colors[1],
4893 (strlen(s) > 60 ? "..." : ""));
4894 } );
4895
33b8afdf 4896 return prog->check_substr ? prog->check_substr : prog->check_utf8;
cad2e5aa
JH
4897}
4898
2b69d0c2 4899void
864dbfa3 4900Perl_pregfree(pTHX_ struct regexp *r)
a687059c 4901{
9e55ce06
JH
4902#ifdef DEBUGGING
4903 SV *dsv = PERL_DEBUG_PAD_ZERO(0);
4904#endif
7821416a
IZ
4905
4906 if (!r || (--r->refcnt > 0))
4907 return;
9e55ce06 4908 DEBUG_r({
d103360b
HS
4909 int len;
4910 char *s;
4911
4912 s = (r->reganch & ROPT_UTF8) ? pv_uni_display(dsv, (U8*)r->precomp,
4913 r->prelen, 60, UNI_DISPLAY_REGEX)
9f369894 4914 : pv_display(dsv, r->precomp, r->prelen, 0, 60);
d103360b 4915 len = SvCUR(dsv);
9e55ce06
JH
4916 if (!PL_colorset)
4917 reginitcolors();
4918 PerlIO_printf(Perl_debug_log,
4919 "%sFreeing REx:%s `%s%*.*s%s%s'\n",
4920 PL_colors[4],PL_colors[5],PL_colors[0],
4921 len, len, s,
4922 PL_colors[1],
4923 len > 60 ? "..." : "");
4924 });
cad2e5aa 4925
c277df42 4926 if (r->precomp)
a0d0e21e 4927 Safefree(r->precomp);
fac92740
MJD
4928 if (r->offsets) /* 20010421 MJD */
4929 Safefree(r->offsets);
ed252734
NC
4930 RX_MATCH_COPY_FREE(r);
4931#ifdef PERL_COPY_ON_WRITE
4932 if (r->saved_copy)
4933 SvREFCNT_dec(r->saved_copy);
4934#endif
a193d654
GS
4935 if (r->substrs) {
4936 if (r->anchored_substr)
4937 SvREFCNT_dec(r->anchored_substr);
33b8afdf
JH
4938 if (r->anchored_utf8)
4939 SvREFCNT_dec(r->anchored_utf8);
a193d654
GS
4940 if (r->float_substr)
4941 SvREFCNT_dec(r->float_substr);
33b8afdf
JH
4942 if (r->float_utf8)
4943 SvREFCNT_dec(r->float_utf8);
2779dcf1 4944 Safefree(r->substrs);
a193d654 4945 }
c277df42
IZ
4946 if (r->data) {
4947 int n = r->data->count;
f3548bdc
DM
4948 PAD* new_comppad = NULL;
4949 PAD* old_comppad;
dfad63ad 4950
c277df42 4951 while (--n >= 0) {
261faec3 4952 /* If you add a ->what type here, update the comment in regcomp.h */
c277df42
IZ
4953 switch (r->data->what[n]) {
4954 case 's':
4955 SvREFCNT_dec((SV*)r->data->data[n]);
4956 break;
653099ff
GS
4957 case 'f':
4958 Safefree(r->data->data[n]);
4959 break;
dfad63ad
HS
4960 case 'p':
4961 new_comppad = (AV*)r->data->data[n];
4962 break;
c277df42 4963 case 'o':
dfad63ad 4964 if (new_comppad == NULL)
cea2e8a9 4965 Perl_croak(aTHX_ "panic: pregfree comppad");
f3548bdc
DM
4966 PAD_SAVE_LOCAL(old_comppad,
4967 /* Watch out for global destruction's random ordering. */
4968 (SvTYPE(new_comppad) == SVt_PVAV) ?
4969 new_comppad : Null(PAD *)
4970 );
9b978d73
DM
4971 if (!OpREFCNT_dec((OP_4tree*)r->data->data[n])) {
4972 op_free((OP_4tree*)r->data->data[n]);
4973 }
4974
f3548bdc 4975 PAD_RESTORE_LOCAL(old_comppad);
dfad63ad
HS
4976 SvREFCNT_dec((SV*)new_comppad);
4977 new_comppad = NULL;
c277df42
IZ
4978 break;
4979 case 'n':
9e55ce06 4980 break;
c277df42 4981 default:
830247a4 4982 Perl_croak(aTHX_ "panic: regfree data code '%c'", r->data->what[n]);
c277df42
IZ
4983 }
4984 }
4985 Safefree(r->data->what);
4986 Safefree(r->data);
a0d0e21e
LW
4987 }
4988 Safefree(r->startp);
4989 Safefree(r->endp);
4990 Safefree(r);
a687059c 4991}
c277df42
IZ
4992
4993/*
4994 - regnext - dig the "next" pointer out of a node
4995 *
4996 * [Note, when REGALIGN is defined there are two places in regmatch()
4997 * that bypass this code for speed.]
4998 */
4999regnode *
864dbfa3 5000Perl_regnext(pTHX_ register regnode *p)
c277df42
IZ
5001{
5002 register I32 offset;
5003
3280af22 5004 if (p == &PL_regdummy)
c277df42
IZ
5005 return(NULL);
5006
5007 offset = (reg_off_by_arg[OP(p)] ? ARG(p) : NEXT_OFF(p));
5008 if (offset == 0)
5009 return(NULL);
5010
c277df42 5011 return(p+offset);
c277df42
IZ
5012}
5013
01f988be 5014STATIC void
cea2e8a9 5015S_re_croak2(pTHX_ const char* pat1,const char* pat2,...)
c277df42
IZ
5016{
5017 va_list args;
5018 STRLEN l1 = strlen(pat1);
5019 STRLEN l2 = strlen(pat2);
5020 char buf[512];
06bf62c7 5021 SV *msv;
c277df42
IZ
5022 char *message;
5023
5024 if (l1 > 510)
5025 l1 = 510;
5026 if (l1 + l2 > 510)
5027 l2 = 510 - l1;
5028 Copy(pat1, buf, l1 , char);
5029 Copy(pat2, buf + l1, l2 , char);
3b818b81
GS
5030 buf[l1 + l2] = '\n';
5031 buf[l1 + l2 + 1] = '\0';
8736538c
AS
5032#ifdef I_STDARG
5033 /* ANSI variant takes additional second argument */
c277df42 5034 va_start(args, pat2);
8736538c
AS
5035#else
5036 va_start(args);
5037#endif
5a844595 5038 msv = vmess(buf, &args);
c277df42 5039 va_end(args);
06bf62c7 5040 message = SvPV(msv,l1);
c277df42
IZ
5041 if (l1 > 512)
5042 l1 = 512;
5043 Copy(message, buf, l1 , char);
5044 buf[l1] = '\0'; /* Overwrite \n */
cea2e8a9 5045 Perl_croak(aTHX_ "%s", buf);
c277df42 5046}
a0ed51b3
LW
5047
5048/* XXX Here's a total kludge. But we need to re-enter for swash routines. */
5049
5050void
864dbfa3 5051Perl_save_re_context(pTHX)
b81d288d 5052{
830247a4 5053 SAVEI32(PL_reg_flags); /* from regexec.c */
a0ed51b3 5054 SAVEPPTR(PL_bostr);
a0ed51b3
LW
5055 SAVEPPTR(PL_reginput); /* String-input pointer. */
5056 SAVEPPTR(PL_regbol); /* Beginning of input, for ^ check. */
5057 SAVEPPTR(PL_regeol); /* End of input, for $ check. */
7766f137
GS
5058 SAVEVPTR(PL_regstartp); /* Pointer to startp array. */
5059 SAVEVPTR(PL_regendp); /* Ditto for endp. */
5060 SAVEVPTR(PL_reglastparen); /* Similarly for lastparen. */
a5db57d6 5061 SAVEVPTR(PL_reglastcloseparen); /* Similarly for lastcloseparen. */
a0ed51b3 5062 SAVEPPTR(PL_regtill); /* How far we are required to go. */
b81d288d 5063 SAVEGENERICPV(PL_reg_start_tmp); /* from regexec.c */
a0ed51b3 5064 PL_reg_start_tmp = 0;
a0ed51b3
LW
5065 SAVEI32(PL_reg_start_tmpl); /* from regexec.c */
5066 PL_reg_start_tmpl = 0;
7766f137 5067 SAVEVPTR(PL_regdata);
a0ed51b3
LW
5068 SAVEI32(PL_reg_eval_set); /* from regexec.c */
5069 SAVEI32(PL_regnarrate); /* from regexec.c */
7766f137 5070 SAVEVPTR(PL_regprogram); /* from regexec.c */
a0ed51b3 5071 SAVEINT(PL_regindent); /* from regexec.c */
7766f137
GS
5072 SAVEVPTR(PL_regcc); /* from regexec.c */
5073 SAVEVPTR(PL_curcop);
7766f137
GS
5074 SAVEVPTR(PL_reg_call_cc); /* from regexec.c */
5075 SAVEVPTR(PL_reg_re); /* from regexec.c */
54b6e2fa
IZ
5076 SAVEPPTR(PL_reg_ganch); /* from regexec.c */
5077 SAVESPTR(PL_reg_sv); /* from regexec.c */
9febdf04 5078 SAVEBOOL(PL_reg_match_utf8); /* from regexec.c */
7766f137 5079 SAVEVPTR(PL_reg_magic); /* from regexec.c */
54b6e2fa 5080 SAVEI32(PL_reg_oldpos); /* from regexec.c */
7766f137
GS
5081 SAVEVPTR(PL_reg_oldcurpm); /* from regexec.c */
5082 SAVEVPTR(PL_reg_curpm); /* from regexec.c */
a5db57d6
GS
5083 SAVEPPTR(PL_reg_oldsaved); /* old saved substr during match */
5084 PL_reg_oldsaved = Nullch;
5085 SAVEI32(PL_reg_oldsavedlen); /* old length of saved substr during match */
5086 PL_reg_oldsavedlen = 0;
ed252734
NC
5087#ifdef PERL_COPY_ON_WRITE
5088 SAVESPTR(PL_nrs);
5089 PL_nrs = Nullsv;
5090#endif
a5db57d6
GS
5091 SAVEI32(PL_reg_maxiter); /* max wait until caching pos */
5092 PL_reg_maxiter = 0;
5093 SAVEI32(PL_reg_leftiter); /* wait until caching pos */
5094 PL_reg_leftiter = 0;
5095 SAVEGENERICPV(PL_reg_poscache); /* cache of pos of WHILEM */
5096 PL_reg_poscache = Nullch;
5097 SAVEI32(PL_reg_poscache_size); /* size of pos cache of WHILEM */
5098 PL_reg_poscache_size = 0;
5099 SAVEPPTR(PL_regprecomp); /* uncompiled string. */
5fb7366e 5100 SAVEI32(PL_regnpar); /* () count. */
e49a9654 5101 SAVEI32(PL_regsize); /* from regexec.c */
ada6e8a9
AMS
5102
5103 {
5104 /* Save $1..$n (#18107: UTF-8 s/(\w+)/uc($1)/e); AMS 20021106. */
a8514157 5105 U32 i;
ada6e8a9
AMS
5106 GV *mgv;
5107 REGEXP *rx;
5108 char digits[16];
5109
5110 if (PL_curpm && (rx = PM_GETRE(PL_curpm))) {
5111 for (i = 1; i <= rx->nparens; i++) {
d994d9a1 5112 sprintf(digits, "%lu", (long)i);
ada6e8a9
AMS
5113 if ((mgv = gv_fetchpv(digits, FALSE, SVt_PV)))
5114 save_scalar(mgv);
5115 }
5116 }
5117 }
5118
54b6e2fa 5119#ifdef DEBUGGING
b81d288d 5120 SAVEPPTR(PL_reg_starttry); /* from regexec.c */
54b6e2fa 5121#endif
a0ed51b3 5122}
51371543 5123
51371543 5124static void
acfe0abc 5125clear_re(pTHX_ void *r)
51371543
GS
5126{
5127 ReREFCNT_dec((regexp *)r);
5128}
ffbc6a93 5129