This is a live mirror of the Perl 5 development currently hosted at https://github.com/perl/perl5
Perl_re_dup()
[perl5.git] / regcomp.c
CommitLineData
a0d0e21e
LW
1/* regcomp.c
2 */
3
4/*
5 * "A fair jaw-cracker dwarf-language must be." --Samwise Gamgee
6 */
7
a687059c
LW
8/* NOTE: this is derived from Henry Spencer's regexp code, and should not
9 * confused with the original package (see point 3 below). Thanks, Henry!
10 */
11
12/* Additional note: this code is very heavily munged from Henry's version
13 * in places. In some spots I've traded clarity for efficiency, so don't
14 * blame Henry for some of the lack of readability.
15 */
16
e50aee73
AD
17/* The names of the functions have been changed from regcomp and
18 * regexec to pregcomp and pregexec in order to avoid conflicts
19 * with the POSIX routines of the same names.
20*/
21
b9d5759e
AD
22#ifdef PERL_EXT_RE_BUILD
23/* need to replace pregcomp et al, so enable that */
24# ifndef PERL_IN_XSUB_RE
25# define PERL_IN_XSUB_RE
26# endif
27/* need access to debugger hooks */
cad2e5aa 28# if defined(PERL_EXT_RE_DEBUG) && !defined(DEBUGGING)
b9d5759e
AD
29# define DEBUGGING
30# endif
31#endif
32
33#ifdef PERL_IN_XSUB_RE
d06ea78c 34/* We *really* need to overwrite these symbols: */
56953603
IZ
35# define Perl_pregcomp my_regcomp
36# define Perl_regdump my_regdump
37# define Perl_regprop my_regprop
d06ea78c 38# define Perl_pregfree my_regfree
cad2e5aa
JH
39# define Perl_re_intuit_string my_re_intuit_string
40/* *These* symbols are masked to allow static link. */
d06ea78c 41# define Perl_regnext my_regnext
f0b8d043 42# define Perl_save_re_context my_save_re_context
b81d288d 43# define Perl_reginitcolors my_reginitcolors
c5be433b
GS
44
45# define PERL_NO_GET_CONTEXT
b81d288d 46#endif
56953603 47
f0fcb552 48/*SUPPRESS 112*/
a687059c 49/*
e50aee73 50 * pregcomp and pregexec -- regsub and regerror are not used in perl
a687059c
LW
51 *
52 * Copyright (c) 1986 by University of Toronto.
53 * Written by Henry Spencer. Not derived from licensed software.
54 *
55 * Permission is granted to anyone to use this software for any
56 * purpose on any computer system, and to redistribute it freely,
57 * subject to the following restrictions:
58 *
59 * 1. The author is not responsible for the consequences of use of
60 * this software, no matter how awful, even if they arise
61 * from defects in it.
62 *
63 * 2. The origin of this software must not be misrepresented, either
64 * by explicit claim or by omission.
65 *
66 * 3. Altered versions must be plainly marked as such, and must not
67 * be misrepresented as being the original software.
68 *
69 *
70 **** Alterations to Henry's code are...
71 ****
bc89e66f 72 **** Copyright (c) 1991-2001, Larry Wall
a687059c 73 ****
9ef589d8
LW
74 **** You may distribute under the terms of either the GNU General Public
75 **** License or the Artistic License, as specified in the README file.
76
a687059c
LW
77 *
78 * Beware that some of this code is subtly aware of the way operator
79 * precedence is structured in regular expressions. Serious changes in
80 * regular-expression syntax might require a total rethink.
81 */
82#include "EXTERN.h"
864dbfa3 83#define PERL_IN_REGCOMP_C
a687059c 84#include "perl.h"
d06ea78c 85
c5be433b
GS
86#ifdef PERL_IN_XSUB_RE
87# if defined(PERL_CAPI) || defined(PERL_OBJECT)
88# include "XSUB.h"
89# endif
90#else
d06ea78c
GS
91# include "INTERN.h"
92#endif
c277df42
IZ
93
94#define REG_COMP_C
a687059c
LW
95#include "regcomp.h"
96
d4cce5f1 97#ifdef op
11343788 98#undef op
d4cce5f1 99#endif /* op */
11343788 100
fe14fcc3
LW
101#ifdef MSDOS
102# if defined(BUGGY_MSC6)
103 /* MSC 6.00A breaks on op/regexp.t test 85 unless we turn this off */
104 # pragma optimize("a",off)
105 /* But MSC 6.00A is happy with 'w', for aliases only across function calls*/
106 # pragma optimize("w",on )
107# endif /* BUGGY_MSC6 */
108#endif /* MSDOS */
109
a687059c
LW
110#ifndef STATIC
111#define STATIC static
112#endif
113
830247a4
IZ
114typedef struct RExC_state_t {
115 U16 flags16; /* are we folding, multilining? */
116 char *precomp; /* uncompiled string. */
117 regexp *rx;
fac92740 118 char *start; /* Start of input for compile */
830247a4
IZ
119 char *end; /* End of input for compile */
120 char *parse; /* Input-scan pointer. */
121 I32 whilem_seen; /* number of WHILEM in this expr */
fac92740 122 regnode *emit_start; /* Start of emitted-code area */
ffc61ed2 123 regnode *emit; /* Code-emit pointer; &regdummy = don't = compiling */
830247a4
IZ
124 I32 naughty; /* How bad is this pattern? */
125 I32 sawback; /* Did we see \1, ...? */
126 U32 seen;
127 I32 size; /* Code size. */
128 I32 npar; /* () count. */
129 I32 extralen;
130 I32 seen_zerolen;
131 I32 seen_evals;
1aa99e6b 132 I32 utf8;
830247a4
IZ
133#if ADD_TO_REGEXEC
134 char *starttry; /* -Dr: where regtry was called. */
135#define RExC_starttry (pRExC_state->starttry)
136#endif
137} RExC_state_t;
138
139#define RExC_flags16 (pRExC_state->flags16)
140#define RExC_precomp (pRExC_state->precomp)
141#define RExC_rx (pRExC_state->rx)
fac92740 142#define RExC_start (pRExC_state->start)
830247a4
IZ
143#define RExC_end (pRExC_state->end)
144#define RExC_parse (pRExC_state->parse)
145#define RExC_whilem_seen (pRExC_state->whilem_seen)
fac92740 146#define RExC_offsets (pRExC_state->rx->offsets) /* I am not like the others */
830247a4 147#define RExC_emit (pRExC_state->emit)
fac92740 148#define RExC_emit_start (pRExC_state->emit_start)
830247a4
IZ
149#define RExC_naughty (pRExC_state->naughty)
150#define RExC_sawback (pRExC_state->sawback)
151#define RExC_seen (pRExC_state->seen)
152#define RExC_size (pRExC_state->size)
153#define RExC_npar (pRExC_state->npar)
154#define RExC_extralen (pRExC_state->extralen)
155#define RExC_seen_zerolen (pRExC_state->seen_zerolen)
156#define RExC_seen_evals (pRExC_state->seen_evals)
1aa99e6b 157#define RExC_utf8 (pRExC_state->utf8)
830247a4 158
a687059c
LW
159#define ISMULT1(c) ((c) == '*' || (c) == '+' || (c) == '?')
160#define ISMULT2(s) ((*s) == '*' || (*s) == '+' || (*s) == '?' || \
161 ((*s) == '{' && regcurly(s)))
a687059c 162
35c8bce7
LW
163#ifdef SPSTART
164#undef SPSTART /* dratted cpp namespace... */
165#endif
a687059c
LW
166/*
167 * Flags to be passed up and down.
168 */
a687059c 169#define WORST 0 /* Worst case. */
821b33a5 170#define HASWIDTH 0x1 /* Known to match non-null strings. */
a0d0e21e
LW
171#define SIMPLE 0x2 /* Simple enough to be STAR/PLUS operand. */
172#define SPSTART 0x4 /* Starts with * or +. */
173#define TRYAGAIN 0x8 /* Weeded out a declaration. */
a687059c 174
2c2d71f5
JH
175/* Length of a variant. */
176
177typedef struct scan_data_t {
178 I32 len_min;
179 I32 len_delta;
180 I32 pos_min;
181 I32 pos_delta;
182 SV *last_found;
183 I32 last_end; /* min value, <0 unless valid. */
184 I32 last_start_min;
185 I32 last_start_max;
186 SV **longest; /* Either &l_fixed, or &l_float. */
187 SV *longest_fixed;
188 I32 offset_fixed;
189 SV *longest_float;
190 I32 offset_float_min;
191 I32 offset_float_max;
192 I32 flags;
193 I32 whilem_c;
cb434fcc 194 I32 *last_closep;
653099ff 195 struct regnode_charclass_class *start_class;
2c2d71f5
JH
196} scan_data_t;
197
a687059c 198/*
e50aee73 199 * Forward declarations for pregcomp()'s friends.
a687059c 200 */
a0d0e21e 201
b81d288d 202static scan_data_t zero_scan_data = { 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
cb434fcc 203 0, 0, 0, 0, 0, 0};
c277df42
IZ
204
205#define SF_BEFORE_EOL (SF_BEFORE_SEOL|SF_BEFORE_MEOL)
206#define SF_BEFORE_SEOL 0x1
207#define SF_BEFORE_MEOL 0x2
208#define SF_FIX_BEFORE_EOL (SF_FIX_BEFORE_SEOL|SF_FIX_BEFORE_MEOL)
209#define SF_FL_BEFORE_EOL (SF_FL_BEFORE_SEOL|SF_FL_BEFORE_MEOL)
210
09b7f37c
CB
211#ifdef NO_UNARY_PLUS
212# define SF_FIX_SHIFT_EOL (0+2)
213# define SF_FL_SHIFT_EOL (0+4)
214#else
215# define SF_FIX_SHIFT_EOL (+2)
216# define SF_FL_SHIFT_EOL (+4)
217#endif
c277df42
IZ
218
219#define SF_FIX_BEFORE_SEOL (SF_BEFORE_SEOL << SF_FIX_SHIFT_EOL)
220#define SF_FIX_BEFORE_MEOL (SF_BEFORE_MEOL << SF_FIX_SHIFT_EOL)
221
222#define SF_FL_BEFORE_SEOL (SF_BEFORE_SEOL << SF_FL_SHIFT_EOL)
223#define SF_FL_BEFORE_MEOL (SF_BEFORE_MEOL << SF_FL_SHIFT_EOL) /* 0x20 */
224#define SF_IS_INF 0x40
225#define SF_HAS_PAR 0x80
226#define SF_IN_PAR 0x100
227#define SF_HAS_EVAL 0x200
4bfe0158 228#define SCF_DO_SUBSTR 0x400
653099ff
GS
229#define SCF_DO_STCLASS_AND 0x0800
230#define SCF_DO_STCLASS_OR 0x1000
231#define SCF_DO_STCLASS (SCF_DO_STCLASS_AND|SCF_DO_STCLASS_OR)
e1901655 232#define SCF_WHILEM_VISITED_POS 0x2000
c277df42 233
1aa99e6b 234#define UTF RExC_utf8
830247a4
IZ
235#define LOC (RExC_flags16 & PMf_LOCALE)
236#define FOLD (RExC_flags16 & PMf_FOLD)
a0ed51b3 237
ffc61ed2 238#define OOB_UNICODE 12345678
93733859 239#define OOB_NAMEDCLASS -1
b8c5462f 240
a0ed51b3
LW
241#define CHR_SVLEN(sv) (UTF ? sv_len_utf8(sv) : SvCUR(sv))
242#define CHR_DIST(a,b) (UTF ? utf8_distance(a,b) : a - b)
243
8615cb43 244
b45f050a
JF
245/* length of regex to show in messages that don't mark a position within */
246#define RegexLengthToShowInErrorMessages 127
247
248/*
249 * If MARKER[12] are adjusted, be sure to adjust the constants at the top
250 * of t/op/regmesg.t, the tests in t/op/re_tests, and those in
251 * op/pragma/warn/regcomp.
252 */
7253e4e3
RK
253#define MARKER1 "<-- HERE" /* marker as it appears in the description */
254#define MARKER2 " <-- HERE " /* marker as it appears within the regex */
b81d288d 255
7253e4e3 256#define REPORT_LOCATION " in regex; marked by " MARKER1 " in m/%.*s" MARKER2 "%s/"
b45f050a
JF
257
258/*
259 * Calls SAVEDESTRUCTOR_X if needed, then calls Perl_croak with the given
260 * arg. Show regex, up to a maximum length. If it's too long, chop and add
261 * "...".
262 */
a4eb266f 263#define FAIL(msg) \
8615cb43 264 STMT_START { \
a4eb266f 265 char *ellipses = ""; \
830247a4 266 unsigned len = strlen(RExC_precomp); \
b45f050a 267 \
8615cb43 268 if (!SIZE_ONLY) \
830247a4 269 SAVEDESTRUCTOR_X(clear_re,(void*)RExC_rx); \
b45f050a
JF
270 \
271 if (len > RegexLengthToShowInErrorMessages) { \
272 /* chop 10 shorter than the max, to ensure meaning of "..." */ \
273 len = RegexLengthToShowInErrorMessages - 10; \
a4eb266f 274 ellipses = "..."; \
b45f050a
JF
275 } \
276 Perl_croak(aTHX_ "%s in regex m/%.*s%s/", \
830247a4 277 msg, (int)len, RExC_precomp, ellipses); \
8615cb43
JF
278 } STMT_END
279
b45f050a
JF
280/*
281 * Calls SAVEDESTRUCTOR_X if needed, then calls Perl_croak with the given
282 * args. Show regex, up to a maximum length. If it's too long, chop and add
283 * "...".
284 */
a4eb266f 285#define FAIL2(pat,msg) \
8615cb43 286 STMT_START { \
a4eb266f 287 char *ellipses = ""; \
830247a4 288 unsigned len = strlen(RExC_precomp); \
b45f050a 289 \
8615cb43 290 if (!SIZE_ONLY) \
830247a4 291 SAVEDESTRUCTOR_X(clear_re,(void*)RExC_rx); \
8615cb43 292 \
b45f050a
JF
293 if (len > RegexLengthToShowInErrorMessages) { \
294 /* chop 10 shorter than the max, to ensure meaning of "..." */ \
295 len = RegexLengthToShowInErrorMessages - 10; \
a4eb266f 296 ellipses = "..."; \
b45f050a
JF
297 } \
298 S_re_croak2(aTHX_ pat, " in regex m/%.*s%s/", \
830247a4 299 msg, (int)len, RExC_precomp, ellipses); \
b45f050a
JF
300 } STMT_END
301
302
303/*
304 * Simple_vFAIL -- like FAIL, but marks the current location in the scan
305 */
306#define Simple_vFAIL(m) \
307 STMT_START { \
830247a4 308 unsigned offset = strlen(RExC_precomp)-(RExC_end-RExC_parse); \
b45f050a
JF
309 \
310 Perl_croak(aTHX_ "%s" REPORT_LOCATION, \
830247a4 311 m, (int)offset, RExC_precomp, RExC_precomp + offset); \
b45f050a
JF
312 } STMT_END
313
314/*
315 * Calls SAVEDESTRUCTOR_X if needed, then Simple_vFAIL()
316 */
317#define vFAIL(m) \
318 STMT_START { \
319 if (!SIZE_ONLY) \
830247a4 320 SAVEDESTRUCTOR_X(clear_re,(void*)RExC_rx); \
b45f050a
JF
321 Simple_vFAIL(m); \
322 } STMT_END
323
324/*
325 * Like Simple_vFAIL(), but accepts two arguments.
326 */
327#define Simple_vFAIL2(m,a1) \
328 STMT_START { \
830247a4 329 unsigned offset = strlen(RExC_precomp)-(RExC_end-RExC_parse); \
b45f050a
JF
330 \
331 S_re_croak2(aTHX_ m, REPORT_LOCATION, a1, \
830247a4 332 (int)offset, RExC_precomp, RExC_precomp + offset); \
b45f050a
JF
333 } STMT_END
334
335/*
336 * Calls SAVEDESTRUCTOR_X if needed, then Simple_vFAIL2().
337 */
338#define vFAIL2(m,a1) \
339 STMT_START { \
340 if (!SIZE_ONLY) \
830247a4 341 SAVEDESTRUCTOR_X(clear_re,(void*)RExC_rx); \
b45f050a
JF
342 Simple_vFAIL2(m, a1); \
343 } STMT_END
344
345
346/*
347 * Like Simple_vFAIL(), but accepts three arguments.
348 */
349#define Simple_vFAIL3(m, a1, a2) \
350 STMT_START { \
830247a4 351 unsigned offset = strlen(RExC_precomp)-(RExC_end-RExC_parse); \
b45f050a
JF
352 \
353 S_re_croak2(aTHX_ m, REPORT_LOCATION, a1, a2, \
830247a4 354 (int)offset, RExC_precomp, RExC_precomp + offset); \
b45f050a
JF
355 } STMT_END
356
357/*
358 * Calls SAVEDESTRUCTOR_X if needed, then Simple_vFAIL3().
359 */
360#define vFAIL3(m,a1,a2) \
361 STMT_START { \
362 if (!SIZE_ONLY) \
830247a4 363 SAVEDESTRUCTOR_X(clear_re,(void*)RExC_rx); \
b45f050a
JF
364 Simple_vFAIL3(m, a1, a2); \
365 } STMT_END
366
367/*
368 * Like Simple_vFAIL(), but accepts four arguments.
369 */
370#define Simple_vFAIL4(m, a1, a2, a3) \
371 STMT_START { \
830247a4 372 unsigned offset = strlen(RExC_precomp)-(RExC_end-RExC_parse); \
b45f050a
JF
373 \
374 S_re_croak2(aTHX_ m, REPORT_LOCATION, a1, a2, a3,\
830247a4 375 (int)offset, RExC_precomp, RExC_precomp + offset); \
b45f050a
JF
376 } STMT_END
377
378/*
379 * Like Simple_vFAIL(), but accepts five arguments.
380 */
381#define Simple_vFAIL5(m, a1, a2, a3, a4) \
382 STMT_START { \
830247a4 383 unsigned offset = strlen(RExC_precomp)-(RExC_end-RExC_parse); \
b45f050a 384 S_re_croak2(aTHX_ m, REPORT_LOCATION, a1, a2, a3, a4,\
830247a4 385 (int)offset, RExC_precomp, RExC_precomp + offset); \
b45f050a
JF
386 } STMT_END
387
388
389#define vWARN(loc,m) \
390 STMT_START { \
830247a4 391 unsigned offset = strlen(RExC_precomp)-(RExC_end-(loc)); \
b45f050a 392 Perl_warner(aTHX_ WARN_REGEXP, "%s" REPORT_LOCATION,\
830247a4 393 m, (int)offset, RExC_precomp, RExC_precomp + offset); \
b45f050a
JF
394 } STMT_END \
395
f9373011
PM
396#define vWARNdep(loc,m) \
397 STMT_START { \
398 unsigned offset = strlen(RExC_precomp)-(RExC_end-(loc)); \
399 int warn_cat = ckWARN(WARN_REGEXP) ? WARN_REGEXP : WARN_DEPRECATED; \
400 Perl_warner(aTHX_ warn_cat, "%s" REPORT_LOCATION,\
401 m, (int)offset, RExC_precomp, RExC_precomp + offset); \
402 } STMT_END \
403
b45f050a
JF
404
405#define vWARN2(loc, m, a1) \
406 STMT_START { \
830247a4 407 unsigned offset = strlen(RExC_precomp)-(RExC_end-(loc)); \
b45f050a
JF
408 Perl_warner(aTHX_ WARN_REGEXP, m REPORT_LOCATION,\
409 a1, \
830247a4 410 (int)offset, RExC_precomp, RExC_precomp + offset); \
b45f050a
JF
411 } STMT_END
412
413#define vWARN3(loc, m, a1, a2) \
414 STMT_START { \
830247a4 415 unsigned offset = strlen(RExC_precomp) - (RExC_end - (loc)); \
b45f050a
JF
416 Perl_warner(aTHX_ WARN_REGEXP, m REPORT_LOCATION, \
417 a1, a2, \
830247a4 418 (int)offset, RExC_precomp, RExC_precomp + offset); \
b45f050a
JF
419 } STMT_END
420
421#define vWARN4(loc, m, a1, a2, a3) \
422 STMT_START { \
830247a4 423 unsigned offset = strlen(RExC_precomp)-(RExC_end-(loc)); \
b45f050a
JF
424 Perl_warner(aTHX_ WARN_REGEXP, m REPORT_LOCATION,\
425 a1, a2, a3, \
830247a4 426 (int)offset, RExC_precomp, RExC_precomp + offset); \
8615cb43
JF
427 } STMT_END
428
9d1d55b5
JP
429/* used for the parse_flags section for (?c) -- japhy */
430#define vWARN5(loc, m, a1, a2, a3, a4) \
431 STMT_START { \
432 unsigned offset = strlen(RExC_precomp)-(RExC_end-(loc)); \
433 Perl_warner(aTHX_ WARN_REGEXP, m REPORT_LOCATION, \
434 a1, a2, a3, a4, \
435 (int)offset, RExC_precomp, RExC_precomp + offset); \
436 } STMT_END
437
8615cb43 438
cd439c50 439/* Allow for side effects in s */
fd3f0ae2 440#define REGC(c,s) STMT_START { if (!SIZE_ONLY) *(s) = (c); else (void)(s);} STMT_END
cd439c50 441
fac92740
MJD
442/* Macros for recording node offsets. 20001227 mjd@plover.com
443 * Nodes are numbered 1, 2, 3, 4. Node #n's position is recorded in
444 * element 2*n-1 of the array. Element #2n holds the byte length node #n.
445 * Element 0 holds the number n.
446 */
447
448#define MJD_OFFSET_DEBUG(x)
449/* #define MJD_OFFSET_DEBUG(x) fprintf x */
450
451
452# define Set_Node_Offset_To_R(node,byte) \
453 STMT_START { \
454 if (! SIZE_ONLY) { \
455 if((node) < 0) { \
456 Perl_croak(aTHX_ "value of node is %d in Offset macro", node); \
457 } else { \
458 RExC_offsets[2*(node)-1] = (byte); \
459 } \
460 } \
461 } STMT_END
462
463# define Set_Node_Offset(node,byte) Set_Node_Offset_To_R((node)-RExC_emit_start, (byte)-RExC_start)
464# define Set_Cur_Node_Offset Set_Node_Offset(RExC_emit, RExC_parse)
465
466# define Set_Node_Length_To_R(node,len) \
467 STMT_START { \
468 if (! SIZE_ONLY) { \
469 MJD_OFFSET_DEBUG((stderr, "** (%d) size of node %d is %d.\n", __LINE__, (node), (len))); \
470 if((node) < 0) { \
471 Perl_croak(aTHX_ "value of node is %d in Length macro", node); \
472 } else { \
473 RExC_offsets[2*(node)] = (len); \
474 } \
475 } \
476 } STMT_END
477
478# define Set_Node_Length(node,len) Set_Node_Length_To_R((node)-RExC_emit_start, len)
479# define Set_Cur_Node_Length(len) Set_Node_Length(RExC_emit, len)
480# define Set_Node_Cur_Length(node) Set_Node_Length(node, RExC_parse - parse_start)
481
482/* Get offsets and lengths */
483#define Node_Offset(n) (RExC_offsets[2*((n)-RExC_emit_start)-1])
484#define Node_Length(n) (RExC_offsets[2*((n)-RExC_emit_start)])
485
51371543 486static void clear_re(pTHXo_ void *r);
4327152a 487
653099ff
GS
488/* Mark that we cannot extend a found fixed substring at this point.
489 Updata the longest found anchored substring and the longest found
490 floating substrings if needed. */
491
4327152a 492STATIC void
830247a4 493S_scan_commit(pTHX_ RExC_state_t *pRExC_state, scan_data_t *data)
c277df42 494{
a0ed51b3
LW
495 STRLEN l = CHR_SVLEN(data->last_found);
496 STRLEN old_l = CHR_SVLEN(*data->longest);
b81d288d 497
c277df42
IZ
498 if ((l >= old_l) && ((l > old_l) || (data->flags & SF_BEFORE_EOL))) {
499 sv_setsv(*data->longest, data->last_found);
500 if (*data->longest == data->longest_fixed) {
501 data->offset_fixed = l ? data->last_start_min : data->pos_min;
502 if (data->flags & SF_BEFORE_EOL)
b81d288d 503 data->flags
c277df42
IZ
504 |= ((data->flags & SF_BEFORE_EOL) << SF_FIX_SHIFT_EOL);
505 else
506 data->flags &= ~SF_FIX_BEFORE_EOL;
a0ed51b3
LW
507 }
508 else {
c277df42 509 data->offset_float_min = l ? data->last_start_min : data->pos_min;
b81d288d
AB
510 data->offset_float_max = (l
511 ? data->last_start_max
c277df42
IZ
512 : data->pos_min + data->pos_delta);
513 if (data->flags & SF_BEFORE_EOL)
b81d288d 514 data->flags
c277df42
IZ
515 |= ((data->flags & SF_BEFORE_EOL) << SF_FL_SHIFT_EOL);
516 else
517 data->flags &= ~SF_FL_BEFORE_EOL;
518 }
519 }
520 SvCUR_set(data->last_found, 0);
521 data->last_end = -1;
522 data->flags &= ~SF_BEFORE_EOL;
523}
524
653099ff
GS
525/* Can match anything (initialization) */
526STATIC void
830247a4 527S_cl_anything(pTHX_ RExC_state_t *pRExC_state, struct regnode_charclass_class *cl)
653099ff 528{
9af0acdb
JH
529 int value;
530
653099ff 531 ANYOF_CLASS_ZERO(cl);
9af0acdb
JH
532 for (value = 0; value < 256; ++value)
533 ANYOF_BITMAP_SET(cl, value);
1aa99e6b 534 cl->flags = ANYOF_EOS|ANYOF_UNICODE_ALL;
653099ff
GS
535 if (LOC)
536 cl->flags |= ANYOF_LOCALE;
537}
538
539/* Can match anything (initialization) */
540STATIC int
541S_cl_is_anything(pTHX_ struct regnode_charclass_class *cl)
542{
543 int value;
544
aaa51d5e 545 for (value = 0; value <= ANYOF_MAX; value += 2)
653099ff
GS
546 if (ANYOF_CLASS_TEST(cl, value) && ANYOF_CLASS_TEST(cl, value + 1))
547 return 1;
1aa99e6b
IH
548 if (!(cl->flags & ANYOF_UNICODE_ALL))
549 return 0;
9af0acdb
JH
550 for (value = 0; value < 256; ++value)
551 if (!ANYOF_BITMAP_TEST(cl, value))
653099ff
GS
552 return 0;
553 return 1;
554}
555
556/* Can match anything (initialization) */
557STATIC void
830247a4 558S_cl_init(pTHX_ RExC_state_t *pRExC_state, struct regnode_charclass_class *cl)
653099ff 559{
8ecf7187 560 Zero(cl, 1, struct regnode_charclass_class);
653099ff 561 cl->type = ANYOF;
830247a4 562 cl_anything(pRExC_state, cl);
653099ff
GS
563}
564
565STATIC void
830247a4 566S_cl_init_zero(pTHX_ RExC_state_t *pRExC_state, struct regnode_charclass_class *cl)
653099ff 567{
8ecf7187 568 Zero(cl, 1, struct regnode_charclass_class);
653099ff 569 cl->type = ANYOF;
830247a4 570 cl_anything(pRExC_state, cl);
653099ff
GS
571 if (LOC)
572 cl->flags |= ANYOF_LOCALE;
573}
574
575/* 'And' a given class with another one. Can create false positives */
576/* We assume that cl is not inverted */
577STATIC void
578S_cl_and(pTHX_ struct regnode_charclass_class *cl,
579 struct regnode_charclass_class *and_with)
580{
653099ff
GS
581 if (!(and_with->flags & ANYOF_CLASS)
582 && !(cl->flags & ANYOF_CLASS)
583 && (and_with->flags & ANYOF_LOCALE) == (cl->flags & ANYOF_LOCALE)
584 && !(and_with->flags & ANYOF_FOLD)
585 && !(cl->flags & ANYOF_FOLD)) {
586 int i;
587
588 if (and_with->flags & ANYOF_INVERT)
589 for (i = 0; i < ANYOF_BITMAP_SIZE; i++)
590 cl->bitmap[i] &= ~and_with->bitmap[i];
591 else
592 for (i = 0; i < ANYOF_BITMAP_SIZE; i++)
593 cl->bitmap[i] &= and_with->bitmap[i];
594 } /* XXXX: logic is complicated otherwise, leave it along for a moment. */
595 if (!(and_with->flags & ANYOF_EOS))
596 cl->flags &= ~ANYOF_EOS;
1aa99e6b
IH
597
598 if (cl->flags & ANYOF_UNICODE_ALL && and_with->flags & ANYOF_UNICODE) {
599 cl->flags &= ~ANYOF_UNICODE_ALL;
600 cl->flags |= ANYOF_UNICODE;
601 ARG_SET(cl, ARG(and_with));
602 }
603 if (!(and_with->flags & ANYOF_UNICODE_ALL))
604 cl->flags &= ~ANYOF_UNICODE_ALL;
605 if (!(and_with->flags & (ANYOF_UNICODE|ANYOF_UNICODE_ALL)))
606 cl->flags &= ~ANYOF_UNICODE;
653099ff
GS
607}
608
609/* 'OR' a given class with another one. Can create false positives */
610/* We assume that cl is not inverted */
611STATIC void
830247a4 612S_cl_or(pTHX_ RExC_state_t *pRExC_state, struct regnode_charclass_class *cl, struct regnode_charclass_class *or_with)
653099ff 613{
653099ff
GS
614 if (or_with->flags & ANYOF_INVERT) {
615 /* We do not use
616 * (B1 | CL1) | (!B2 & !CL2) = (B1 | !B2 & !CL2) | (CL1 | (!B2 & !CL2))
617 * <= (B1 | !B2) | (CL1 | !CL2)
618 * which is wasteful if CL2 is small, but we ignore CL2:
619 * (B1 | CL1) | (!B2 & !CL2) <= (B1 | CL1) | !B2 = (B1 | !B2) | CL1
620 * XXXX Can we handle case-fold? Unclear:
621 * (OK1(i) | OK1(i')) | !(OK1(i) | OK1(i')) =
622 * (OK1(i) | OK1(i')) | (!OK1(i) & !OK1(i'))
623 */
624 if ( (or_with->flags & ANYOF_LOCALE) == (cl->flags & ANYOF_LOCALE)
625 && !(or_with->flags & ANYOF_FOLD)
626 && !(cl->flags & ANYOF_FOLD) ) {
627 int i;
628
629 for (i = 0; i < ANYOF_BITMAP_SIZE; i++)
630 cl->bitmap[i] |= ~or_with->bitmap[i];
631 } /* XXXX: logic is complicated otherwise */
632 else {
830247a4 633 cl_anything(pRExC_state, cl);
653099ff
GS
634 }
635 } else {
636 /* (B1 | CL1) | (B2 | CL2) = (B1 | B2) | (CL1 | CL2)) */
637 if ( (or_with->flags & ANYOF_LOCALE) == (cl->flags & ANYOF_LOCALE)
b81d288d 638 && (!(or_with->flags & ANYOF_FOLD)
653099ff
GS
639 || (cl->flags & ANYOF_FOLD)) ) {
640 int i;
641
642 /* OR char bitmap and class bitmap separately */
643 for (i = 0; i < ANYOF_BITMAP_SIZE; i++)
644 cl->bitmap[i] |= or_with->bitmap[i];
645 if (or_with->flags & ANYOF_CLASS) {
646 for (i = 0; i < ANYOF_CLASSBITMAP_SIZE; i++)
647 cl->classflags[i] |= or_with->classflags[i];
648 cl->flags |= ANYOF_CLASS;
649 }
650 }
651 else { /* XXXX: logic is complicated, leave it along for a moment. */
830247a4 652 cl_anything(pRExC_state, cl);
653099ff
GS
653 }
654 }
655 if (or_with->flags & ANYOF_EOS)
656 cl->flags |= ANYOF_EOS;
1aa99e6b
IH
657
658 if (cl->flags & ANYOF_UNICODE && or_with->flags & ANYOF_UNICODE &&
659 ARG(cl) != ARG(or_with)) {
660 cl->flags |= ANYOF_UNICODE_ALL;
661 cl->flags &= ~ANYOF_UNICODE;
662 }
663 if (or_with->flags & ANYOF_UNICODE_ALL) {
664 cl->flags |= ANYOF_UNICODE_ALL;
665 cl->flags &= ~ANYOF_UNICODE;
666 }
653099ff
GS
667}
668
669/* REx optimizer. Converts nodes into quickier variants "in place".
670 Finds fixed substrings. */
671
c277df42
IZ
672/* Stops at toplevel WHILEM as well as at `last'. At end *scanp is set
673 to the position after last scanned or to NULL. */
674
76e3520e 675STATIC I32
830247a4 676S_study_chunk(pTHX_ RExC_state_t *pRExC_state, regnode **scanp, I32 *deltap, regnode *last, scan_data_t *data, U32 flags)
c277df42
IZ
677 /* scanp: Start here (read-write). */
678 /* deltap: Write maxlen-minlen here. */
679 /* last: Stop before this one. */
680{
681 I32 min = 0, pars = 0, code;
682 regnode *scan = *scanp, *next;
683 I32 delta = 0;
684 int is_inf = (flags & SCF_DO_SUBSTR) && (data->flags & SF_IS_INF);
aca2d497 685 int is_inf_internal = 0; /* The studied chunk is infinite */
c277df42
IZ
686 I32 is_par = OP(scan) == OPEN ? ARG(scan) : 0;
687 scan_data_t data_fake;
653099ff 688 struct regnode_charclass_class and_with; /* Valid if flags & SCF_DO_STCLASS_OR */
b81d288d 689
c277df42
IZ
690 while (scan && OP(scan) != END && scan < last) {
691 /* Peephole optimizer: */
692
22c35a8c 693 if (PL_regkind[(U8)OP(scan)] == EXACT) {
653099ff 694 /* Merge several consecutive EXACTish nodes into one. */
c277df42
IZ
695 regnode *n = regnext(scan);
696 U32 stringok = 1;
697#ifdef DEBUGGING
698 regnode *stop = scan;
b81d288d 699#endif
c277df42 700
cd439c50 701 next = scan + NODE_SZ_STR(scan);
c277df42
IZ
702 /* Skip NOTHING, merge EXACT*. */
703 while (n &&
b81d288d 704 ( PL_regkind[(U8)OP(n)] == NOTHING ||
c277df42
IZ
705 (stringok && (OP(n) == OP(scan))))
706 && NEXT_OFF(n)
707 && NEXT_OFF(scan) + NEXT_OFF(n) < I16_MAX) {
708 if (OP(n) == TAIL || n > next)
709 stringok = 0;
22c35a8c 710 if (PL_regkind[(U8)OP(n)] == NOTHING) {
c277df42
IZ
711 NEXT_OFF(scan) += NEXT_OFF(n);
712 next = n + NODE_STEP_REGNODE;
713#ifdef DEBUGGING
714 if (stringok)
715 stop = n;
b81d288d 716#endif
c277df42 717 n = regnext(n);
a0ed51b3 718 }
f49d4d0f 719 else if (stringok) {
cd439c50 720 int oldl = STR_LEN(scan);
c277df42 721 regnode *nnext = regnext(n);
f49d4d0f 722
b81d288d 723 if (oldl + STR_LEN(n) > U8_MAX)
c277df42
IZ
724 break;
725 NEXT_OFF(scan) += NEXT_OFF(n);
cd439c50
IZ
726 STR_LEN(scan) += STR_LEN(n);
727 next = n + NODE_SZ_STR(n);
c277df42 728 /* Now we can overwrite *n : */
f49d4d0f 729 Move(STRING(n), STRING(scan) + oldl, STR_LEN(n), char);
c277df42 730#ifdef DEBUGGING
f49d4d0f 731 stop = next - 1;
b81d288d 732#endif
c277df42
IZ
733 n = nnext;
734 }
735 }
736#ifdef DEBUGGING
737 /* Allow dumping */
cd439c50 738 n = scan + NODE_SZ_STR(scan);
c277df42 739 while (n <= stop) {
22c35a8c 740 if (PL_regkind[(U8)OP(n)] != NOTHING || OP(n) == NOTHING) {
c277df42
IZ
741 OP(n) = OPTIMIZED;
742 NEXT_OFF(n) = 0;
743 }
744 n++;
745 }
653099ff 746#endif
c277df42 747 }
653099ff
GS
748 /* Follow the next-chain of the current node and optimize
749 away all the NOTHINGs from it. */
c277df42 750 if (OP(scan) != CURLYX) {
048cfca1
GS
751 int max = (reg_off_by_arg[OP(scan)]
752 ? I32_MAX
753 /* I32 may be smaller than U16 on CRAYs! */
754 : (I32_MAX < U16_MAX ? I32_MAX : U16_MAX));
c277df42
IZ
755 int off = (reg_off_by_arg[OP(scan)] ? ARG(scan) : NEXT_OFF(scan));
756 int noff;
757 regnode *n = scan;
b81d288d 758
c277df42
IZ
759 /* Skip NOTHING and LONGJMP. */
760 while ((n = regnext(n))
22c35a8c 761 && ((PL_regkind[(U8)OP(n)] == NOTHING && (noff = NEXT_OFF(n)))
c277df42
IZ
762 || ((OP(n) == LONGJMP) && (noff = ARG(n))))
763 && off + noff < max)
764 off += noff;
765 if (reg_off_by_arg[OP(scan)])
766 ARG(scan) = off;
b81d288d 767 else
c277df42
IZ
768 NEXT_OFF(scan) = off;
769 }
653099ff
GS
770 /* The principal pseudo-switch. Cannot be a switch, since we
771 look into several different things. */
b81d288d 772 if (OP(scan) == BRANCH || OP(scan) == BRANCHJ
c277df42
IZ
773 || OP(scan) == IFTHEN || OP(scan) == SUSPEND) {
774 next = regnext(scan);
775 code = OP(scan);
b81d288d
AB
776
777 if (OP(next) == code || code == IFTHEN || code == SUSPEND) {
c277df42 778 I32 max1 = 0, min1 = I32_MAX, num = 0;
653099ff 779 struct regnode_charclass_class accum;
c277df42 780
653099ff 781 if (flags & SCF_DO_SUBSTR) /* XXXX Add !SUSPEND? */
830247a4 782 scan_commit(pRExC_state, data); /* Cannot merge strings after this. */
653099ff 783 if (flags & SCF_DO_STCLASS)
830247a4 784 cl_init_zero(pRExC_state, &accum);
c277df42 785 while (OP(scan) == code) {
830247a4 786 I32 deltanext, minnext, f = 0, fake;
653099ff 787 struct regnode_charclass_class this_class;
c277df42
IZ
788
789 num++;
790 data_fake.flags = 0;
b81d288d 791 if (data) {
2c2d71f5 792 data_fake.whilem_c = data->whilem_c;
cb434fcc
IZ
793 data_fake.last_closep = data->last_closep;
794 }
795 else
796 data_fake.last_closep = &fake;
c277df42
IZ
797 next = regnext(scan);
798 scan = NEXTOPER(scan);
799 if (code != BRANCH)
800 scan = NEXTOPER(scan);
653099ff 801 if (flags & SCF_DO_STCLASS) {
830247a4 802 cl_init(pRExC_state, &this_class);
653099ff
GS
803 data_fake.start_class = &this_class;
804 f = SCF_DO_STCLASS_AND;
b81d288d 805 }
e1901655
IZ
806 if (flags & SCF_WHILEM_VISITED_POS)
807 f |= SCF_WHILEM_VISITED_POS;
653099ff 808 /* we suppose the run is continuous, last=next...*/
830247a4
IZ
809 minnext = study_chunk(pRExC_state, &scan, &deltanext,
810 next, &data_fake, f);
b81d288d 811 if (min1 > minnext)
c277df42
IZ
812 min1 = minnext;
813 if (max1 < minnext + deltanext)
814 max1 = minnext + deltanext;
815 if (deltanext == I32_MAX)
aca2d497 816 is_inf = is_inf_internal = 1;
c277df42
IZ
817 scan = next;
818 if (data_fake.flags & (SF_HAS_PAR|SF_IN_PAR))
819 pars++;
405ff068 820 if (data && (data_fake.flags & SF_HAS_EVAL))
c277df42 821 data->flags |= SF_HAS_EVAL;
2c2d71f5
JH
822 if (data)
823 data->whilem_c = data_fake.whilem_c;
653099ff 824 if (flags & SCF_DO_STCLASS)
830247a4 825 cl_or(pRExC_state, &accum, &this_class);
b81d288d 826 if (code == SUSPEND)
c277df42
IZ
827 break;
828 }
829 if (code == IFTHEN && num < 2) /* Empty ELSE branch */
830 min1 = 0;
831 if (flags & SCF_DO_SUBSTR) {
832 data->pos_min += min1;
833 data->pos_delta += max1 - min1;
834 if (max1 != min1 || is_inf)
835 data->longest = &(data->longest_float);
836 }
837 min += min1;
838 delta += max1 - min1;
653099ff 839 if (flags & SCF_DO_STCLASS_OR) {
830247a4 840 cl_or(pRExC_state, data->start_class, &accum);
653099ff
GS
841 if (min1) {
842 cl_and(data->start_class, &and_with);
843 flags &= ~SCF_DO_STCLASS;
844 }
845 }
846 else if (flags & SCF_DO_STCLASS_AND) {
de0c8cb8
GS
847 if (min1) {
848 cl_and(data->start_class, &accum);
653099ff 849 flags &= ~SCF_DO_STCLASS;
de0c8cb8
GS
850 }
851 else {
b81d288d 852 /* Switch to OR mode: cache the old value of
de0c8cb8
GS
853 * data->start_class */
854 StructCopy(data->start_class, &and_with,
855 struct regnode_charclass_class);
856 flags &= ~SCF_DO_STCLASS_AND;
857 StructCopy(&accum, data->start_class,
858 struct regnode_charclass_class);
859 flags |= SCF_DO_STCLASS_OR;
860 data->start_class->flags |= ANYOF_EOS;
861 }
653099ff 862 }
a0ed51b3
LW
863 }
864 else if (code == BRANCHJ) /* single branch is optimized. */
c277df42
IZ
865 scan = NEXTOPER(NEXTOPER(scan));
866 else /* single branch is optimized. */
867 scan = NEXTOPER(scan);
868 continue;
a0ed51b3
LW
869 }
870 else if (OP(scan) == EXACT) {
cd439c50 871 I32 l = STR_LEN(scan);
1aa99e6b 872 UV uc = *((U8*)STRING(scan));
a0ed51b3 873 if (UTF) {
1aa99e6b
IH
874 U8 *s = (U8*)STRING(scan);
875 l = utf8_length(s, s + l);
9041c2e3 876 uc = utf8_to_uvchr(s, NULL);
a0ed51b3
LW
877 }
878 min += l;
c277df42 879 if (flags & SCF_DO_SUBSTR) { /* Update longest substr. */
c277df42
IZ
880 /* The code below prefers earlier match for fixed
881 offset, later match for variable offset. */
882 if (data->last_end == -1) { /* Update the start info. */
883 data->last_start_min = data->pos_min;
884 data->last_start_max = is_inf
b81d288d 885 ? I32_MAX : data->pos_min + data->pos_delta;
c277df42 886 }
cd439c50 887 sv_catpvn(data->last_found, STRING(scan), STR_LEN(scan));
c277df42
IZ
888 data->last_end = data->pos_min + l;
889 data->pos_min += l; /* As in the first entry. */
890 data->flags &= ~SF_BEFORE_EOL;
891 }
653099ff
GS
892 if (flags & SCF_DO_STCLASS_AND) {
893 /* Check whether it is compatible with what we know already! */
894 int compat = 1;
895
1aa99e6b 896 if (uc >= 0x100 ||
516a5887 897 (!(data->start_class->flags & (ANYOF_CLASS | ANYOF_LOCALE))
1aa99e6b 898 && !ANYOF_BITMAP_TEST(data->start_class, uc)
653099ff 899 && (!(data->start_class->flags & ANYOF_FOLD)
1aa99e6b 900 || !ANYOF_BITMAP_TEST(data->start_class, PL_fold[uc])))
516a5887 901 )
653099ff
GS
902 compat = 0;
903 ANYOF_CLASS_ZERO(data->start_class);
904 ANYOF_BITMAP_ZERO(data->start_class);
905 if (compat)
1aa99e6b 906 ANYOF_BITMAP_SET(data->start_class, uc);
653099ff 907 data->start_class->flags &= ~ANYOF_EOS;
9b877dbb
IH
908 if (uc < 0x100)
909 data->start_class->flags &= ~ANYOF_UNICODE_ALL;
653099ff
GS
910 }
911 else if (flags & SCF_DO_STCLASS_OR) {
912 /* false positive possible if the class is case-folded */
1aa99e6b 913 if (uc < 0x100)
9b877dbb
IH
914 ANYOF_BITMAP_SET(data->start_class, uc);
915 else
916 data->start_class->flags |= ANYOF_UNICODE_ALL;
653099ff
GS
917 data->start_class->flags &= ~ANYOF_EOS;
918 cl_and(data->start_class, &and_with);
919 }
920 flags &= ~SCF_DO_STCLASS;
a0ed51b3 921 }
653099ff 922 else if (PL_regkind[(U8)OP(scan)] == EXACT) { /* But OP != EXACT! */
cd439c50 923 I32 l = STR_LEN(scan);
1aa99e6b 924 UV uc = *((U8*)STRING(scan));
653099ff
GS
925
926 /* Search for fixed substrings supports EXACT only. */
b81d288d 927 if (flags & SCF_DO_SUBSTR)
830247a4 928 scan_commit(pRExC_state, data);
a0ed51b3 929 if (UTF) {
1aa99e6b
IH
930 U8 *s = (U8 *)STRING(scan);
931 l = utf8_length(s, s + l);
9041c2e3 932 uc = utf8_to_uvchr(s, NULL);
a0ed51b3
LW
933 }
934 min += l;
c277df42 935 if (data && (flags & SCF_DO_SUBSTR))
a0ed51b3 936 data->pos_min += l;
653099ff
GS
937 if (flags & SCF_DO_STCLASS_AND) {
938 /* Check whether it is compatible with what we know already! */
939 int compat = 1;
940
1aa99e6b 941 if (uc >= 0x100 ||
516a5887 942 (!(data->start_class->flags & (ANYOF_CLASS | ANYOF_LOCALE))
1aa99e6b 943 && !ANYOF_BITMAP_TEST(data->start_class, uc)
516a5887 944 && !ANYOF_BITMAP_TEST(data->start_class, PL_fold[uc])))
653099ff
GS
945 compat = 0;
946 ANYOF_CLASS_ZERO(data->start_class);
947 ANYOF_BITMAP_ZERO(data->start_class);
948 if (compat) {
1aa99e6b 949 ANYOF_BITMAP_SET(data->start_class, uc);
653099ff
GS
950 data->start_class->flags &= ~ANYOF_EOS;
951 data->start_class->flags |= ANYOF_FOLD;
952 if (OP(scan) == EXACTFL)
953 data->start_class->flags |= ANYOF_LOCALE;
954 }
955 }
956 else if (flags & SCF_DO_STCLASS_OR) {
957 if (data->start_class->flags & ANYOF_FOLD) {
958 /* false positive possible if the class is case-folded.
959 Assume that the locale settings are the same... */
1aa99e6b
IH
960 if (uc < 0x100)
961 ANYOF_BITMAP_SET(data->start_class, uc);
653099ff
GS
962 data->start_class->flags &= ~ANYOF_EOS;
963 }
964 cl_and(data->start_class, &and_with);
965 }
966 flags &= ~SCF_DO_STCLASS;
a0ed51b3 967 }
4d61ec05 968 else if (strchr((char*)PL_varies,OP(scan))) {
9c5ffd7c 969 I32 mincount, maxcount, minnext, deltanext, fl = 0;
aa7a4b56 970 I32 f = flags, pos_before = 0;
c277df42 971 regnode *oscan = scan;
653099ff
GS
972 struct regnode_charclass_class this_class;
973 struct regnode_charclass_class *oclass = NULL;
974
22c35a8c 975 switch (PL_regkind[(U8)OP(scan)]) {
653099ff 976 case WHILEM: /* End of (?:...)* . */
c277df42
IZ
977 scan = NEXTOPER(scan);
978 goto finish;
979 case PLUS:
653099ff 980 if (flags & (SCF_DO_SUBSTR | SCF_DO_STCLASS)) {
c277df42 981 next = NEXTOPER(scan);
653099ff 982 if (OP(next) == EXACT || (flags & SCF_DO_STCLASS)) {
b81d288d
AB
983 mincount = 1;
984 maxcount = REG_INFTY;
c277df42
IZ
985 next = regnext(scan);
986 scan = NEXTOPER(scan);
987 goto do_curly;
988 }
989 }
990 if (flags & SCF_DO_SUBSTR)
991 data->pos_min++;
992 min++;
993 /* Fall through. */
994 case STAR:
653099ff
GS
995 if (flags & SCF_DO_STCLASS) {
996 mincount = 0;
b81d288d 997 maxcount = REG_INFTY;
653099ff
GS
998 next = regnext(scan);
999 scan = NEXTOPER(scan);
1000 goto do_curly;
1001 }
b81d288d 1002 is_inf = is_inf_internal = 1;
c277df42
IZ
1003 scan = regnext(scan);
1004 if (flags & SCF_DO_SUBSTR) {
830247a4 1005 scan_commit(pRExC_state, data); /* Cannot extend fixed substrings */
c277df42
IZ
1006 data->longest = &(data->longest_float);
1007 }
1008 goto optimize_curly_tail;
1009 case CURLY:
b81d288d 1010 mincount = ARG1(scan);
c277df42
IZ
1011 maxcount = ARG2(scan);
1012 next = regnext(scan);
cb434fcc
IZ
1013 if (OP(scan) == CURLYX) {
1014 I32 lp = (data ? *(data->last_closep) : 0);
1015
1016 scan->flags = ((lp <= U8_MAX) ? lp : U8_MAX);
1017 }
c277df42
IZ
1018 scan = NEXTOPER(scan) + EXTRA_STEP_2ARGS;
1019 do_curly:
1020 if (flags & SCF_DO_SUBSTR) {
830247a4 1021 if (mincount == 0) scan_commit(pRExC_state,data); /* Cannot extend fixed substrings */
c277df42
IZ
1022 pos_before = data->pos_min;
1023 }
1024 if (data) {
1025 fl = data->flags;
1026 data->flags &= ~(SF_HAS_PAR|SF_IN_PAR|SF_HAS_EVAL);
1027 if (is_inf)
1028 data->flags |= SF_IS_INF;
1029 }
653099ff 1030 if (flags & SCF_DO_STCLASS) {
830247a4 1031 cl_init(pRExC_state, &this_class);
653099ff
GS
1032 oclass = data->start_class;
1033 data->start_class = &this_class;
1034 f |= SCF_DO_STCLASS_AND;
1035 f &= ~SCF_DO_STCLASS_OR;
1036 }
e1901655
IZ
1037 /* These are the cases when once a subexpression
1038 fails at a particular position, it cannot succeed
1039 even after backtracking at the enclosing scope.
b81d288d 1040
e1901655
IZ
1041 XXXX what if minimal match and we are at the
1042 initial run of {n,m}? */
1043 if ((mincount != maxcount - 1) && (maxcount != REG_INFTY))
1044 f &= ~SCF_WHILEM_VISITED_POS;
653099ff 1045
c277df42 1046 /* This will finish on WHILEM, setting scan, or on NULL: */
b81d288d
AB
1047 minnext = study_chunk(pRExC_state, &scan, &deltanext, last, data,
1048 mincount == 0
653099ff
GS
1049 ? (f & ~SCF_DO_SUBSTR) : f);
1050
1051 if (flags & SCF_DO_STCLASS)
1052 data->start_class = oclass;
1053 if (mincount == 0 || minnext == 0) {
1054 if (flags & SCF_DO_STCLASS_OR) {
830247a4 1055 cl_or(pRExC_state, data->start_class, &this_class);
653099ff
GS
1056 }
1057 else if (flags & SCF_DO_STCLASS_AND) {
b81d288d 1058 /* Switch to OR mode: cache the old value of
653099ff
GS
1059 * data->start_class */
1060 StructCopy(data->start_class, &and_with,
1061 struct regnode_charclass_class);
1062 flags &= ~SCF_DO_STCLASS_AND;
1063 StructCopy(&this_class, data->start_class,
1064 struct regnode_charclass_class);
1065 flags |= SCF_DO_STCLASS_OR;
1066 data->start_class->flags |= ANYOF_EOS;
1067 }
1068 } else { /* Non-zero len */
1069 if (flags & SCF_DO_STCLASS_OR) {
830247a4 1070 cl_or(pRExC_state, data->start_class, &this_class);
653099ff
GS
1071 cl_and(data->start_class, &and_with);
1072 }
1073 else if (flags & SCF_DO_STCLASS_AND)
1074 cl_and(data->start_class, &this_class);
1075 flags &= ~SCF_DO_STCLASS;
1076 }
c277df42
IZ
1077 if (!scan) /* It was not CURLYX, but CURLY. */
1078 scan = next;
84037bb0
HS
1079 if (ckWARN(WARN_REGEXP)
1080 && (minnext == 0) && (deltanext == 0)
99799961 1081 && data && !(data->flags & (SF_HAS_PAR|SF_IN_PAR))
17feb5d5 1082 && maxcount <= REG_INFTY/3) /* Complement check for big count */
b45f050a 1083 {
830247a4 1084 vWARN(RExC_parse,
b45f050a
JF
1085 "Quantifier unexpected on zero-length expression");
1086 }
1087
c277df42 1088 min += minnext * mincount;
b81d288d 1089 is_inf_internal |= ((maxcount == REG_INFTY
155aba94
GS
1090 && (minnext + deltanext) > 0)
1091 || deltanext == I32_MAX);
aca2d497 1092 is_inf |= is_inf_internal;
c277df42
IZ
1093 delta += (minnext + deltanext) * maxcount - minnext * mincount;
1094
1095 /* Try powerful optimization CURLYX => CURLYN. */
b81d288d 1096 if ( OP(oscan) == CURLYX && data
c277df42
IZ
1097 && data->flags & SF_IN_PAR
1098 && !(data->flags & SF_HAS_EVAL)
1099 && !deltanext && minnext == 1 ) {
1100 /* Try to optimize to CURLYN. */
1101 regnode *nxt = NEXTOPER(oscan) + EXTRA_STEP_2ARGS;
497b47a8
JH
1102 regnode *nxt1 = nxt;
1103#ifdef DEBUGGING
1104 regnode *nxt2;
1105#endif
c277df42
IZ
1106
1107 /* Skip open. */
1108 nxt = regnext(nxt);
4d61ec05 1109 if (!strchr((char*)PL_simple,OP(nxt))
22c35a8c 1110 && !(PL_regkind[(U8)OP(nxt)] == EXACT
b81d288d 1111 && STR_LEN(nxt) == 1))
c277df42 1112 goto nogo;
497b47a8 1113#ifdef DEBUGGING
c277df42 1114 nxt2 = nxt;
497b47a8 1115#endif
c277df42 1116 nxt = regnext(nxt);
b81d288d 1117 if (OP(nxt) != CLOSE)
c277df42
IZ
1118 goto nogo;
1119 /* Now we know that nxt2 is the only contents: */
1120 oscan->flags = ARG(nxt);
1121 OP(oscan) = CURLYN;
1122 OP(nxt1) = NOTHING; /* was OPEN. */
1123#ifdef DEBUGGING
1124 OP(nxt1 + 1) = OPTIMIZED; /* was count. */
1125 NEXT_OFF(nxt1+ 1) = 0; /* just for consistancy. */
1126 NEXT_OFF(nxt2) = 0; /* just for consistancy with CURLY. */
1127 OP(nxt) = OPTIMIZED; /* was CLOSE. */
1128 OP(nxt + 1) = OPTIMIZED; /* was count. */
1129 NEXT_OFF(nxt+ 1) = 0; /* just for consistancy. */
b81d288d 1130#endif
c277df42 1131 }
c277df42
IZ
1132 nogo:
1133
1134 /* Try optimization CURLYX => CURLYM. */
b81d288d 1135 if ( OP(oscan) == CURLYX && data
c277df42 1136 && !(data->flags & SF_HAS_PAR)
c277df42
IZ
1137 && !(data->flags & SF_HAS_EVAL)
1138 && !deltanext ) {
1139 /* XXXX How to optimize if data == 0? */
1140 /* Optimize to a simpler form. */
1141 regnode *nxt = NEXTOPER(oscan) + EXTRA_STEP_2ARGS; /* OPEN */
1142 regnode *nxt2;
1143
1144 OP(oscan) = CURLYM;
1145 while ( (nxt2 = regnext(nxt)) /* skip over embedded stuff*/
b81d288d 1146 && (OP(nxt2) != WHILEM))
c277df42
IZ
1147 nxt = nxt2;
1148 OP(nxt2) = SUCCEED; /* Whas WHILEM */
c277df42
IZ
1149 /* Need to optimize away parenths. */
1150 if (data->flags & SF_IN_PAR) {
1151 /* Set the parenth number. */
1152 regnode *nxt1 = NEXTOPER(oscan) + EXTRA_STEP_2ARGS; /* OPEN*/
1153
b81d288d 1154 if (OP(nxt) != CLOSE)
b45f050a 1155 FAIL("Panic opt close");
c277df42
IZ
1156 oscan->flags = ARG(nxt);
1157 OP(nxt1) = OPTIMIZED; /* was OPEN. */
1158 OP(nxt) = OPTIMIZED; /* was CLOSE. */
1159#ifdef DEBUGGING
1160 OP(nxt1 + 1) = OPTIMIZED; /* was count. */
1161 OP(nxt + 1) = OPTIMIZED; /* was count. */
1162 NEXT_OFF(nxt1 + 1) = 0; /* just for consistancy. */
1163 NEXT_OFF(nxt + 1) = 0; /* just for consistancy. */
b81d288d 1164#endif
c277df42
IZ
1165#if 0
1166 while ( nxt1 && (OP(nxt1) != WHILEM)) {
1167 regnode *nnxt = regnext(nxt1);
b81d288d 1168
c277df42
IZ
1169 if (nnxt == nxt) {
1170 if (reg_off_by_arg[OP(nxt1)])
1171 ARG_SET(nxt1, nxt2 - nxt1);
1172 else if (nxt2 - nxt1 < U16_MAX)
1173 NEXT_OFF(nxt1) = nxt2 - nxt1;
1174 else
1175 OP(nxt) = NOTHING; /* Cannot beautify */
1176 }
1177 nxt1 = nnxt;
1178 }
1179#endif
1180 /* Optimize again: */
b81d288d 1181 study_chunk(pRExC_state, &nxt1, &deltanext, nxt,
e1901655 1182 NULL, 0);
a0ed51b3
LW
1183 }
1184 else
c277df42 1185 oscan->flags = 0;
c277df42 1186 }
e1901655
IZ
1187 else if ((OP(oscan) == CURLYX)
1188 && (flags & SCF_WHILEM_VISITED_POS)
1189 /* See the comment on a similar expression above.
1190 However, this time it not a subexpression
1191 we care about, but the expression itself. */
1192 && (maxcount == REG_INFTY)
1193 && data && ++data->whilem_c < 16) {
1194 /* This stays as CURLYX, we can put the count/of pair. */
2c2d71f5
JH
1195 /* Find WHILEM (as in regexec.c) */
1196 regnode *nxt = oscan + NEXT_OFF(oscan);
1197
1198 if (OP(PREVOPER(nxt)) == NOTHING) /* LONGJMP */
1199 nxt += ARG(nxt);
1200 PREVOPER(nxt)->flags = data->whilem_c
830247a4 1201 | (RExC_whilem_seen << 4); /* On WHILEM */
2c2d71f5 1202 }
b81d288d 1203 if (data && fl & (SF_HAS_PAR|SF_IN_PAR))
c277df42
IZ
1204 pars++;
1205 if (flags & SCF_DO_SUBSTR) {
1206 SV *last_str = Nullsv;
1207 int counted = mincount != 0;
1208
1209 if (data->last_end > 0 && mincount != 0) { /* Ends with a string. */
b81d288d 1210 I32 b = pos_before >= data->last_start_min
c277df42
IZ
1211 ? pos_before : data->last_start_min;
1212 STRLEN l;
1213 char *s = SvPV(data->last_found, l);
a0ed51b3
LW
1214 I32 old = b - data->last_start_min;
1215
1216 if (UTF)
1217 old = utf8_hop((U8*)s, old) - (U8*)s;
c277df42 1218
a0ed51b3 1219 l -= old;
c277df42 1220 /* Get the added string: */
79cb57f6 1221 last_str = newSVpvn(s + old, l);
c277df42
IZ
1222 if (deltanext == 0 && pos_before == b) {
1223 /* What was added is a constant string */
1224 if (mincount > 1) {
1225 SvGROW(last_str, (mincount * l) + 1);
b81d288d 1226 repeatcpy(SvPVX(last_str) + l,
c277df42
IZ
1227 SvPVX(last_str), l, mincount - 1);
1228 SvCUR(last_str) *= mincount;
1229 /* Add additional parts. */
b81d288d 1230 SvCUR_set(data->last_found,
c277df42
IZ
1231 SvCUR(data->last_found) - l);
1232 sv_catsv(data->last_found, last_str);
1233 data->last_end += l * (mincount - 1);
1234 }
2a8d9689
HS
1235 } else {
1236 /* start offset must point into the last copy */
1237 data->last_start_min += minnext * (mincount - 1);
4b2cff9a
HS
1238 data->last_start_max += is_inf ? 0 : (maxcount - 1)
1239 * (minnext + data->pos_delta);
c277df42
IZ
1240 }
1241 }
1242 /* It is counted once already... */
1243 data->pos_min += minnext * (mincount - counted);
1244 data->pos_delta += - counted * deltanext +
1245 (minnext + deltanext) * maxcount - minnext * mincount;
1246 if (mincount != maxcount) {
653099ff
GS
1247 /* Cannot extend fixed substrings found inside
1248 the group. */
830247a4 1249 scan_commit(pRExC_state,data);
c277df42
IZ
1250 if (mincount && last_str) {
1251 sv_setsv(data->last_found, last_str);
1252 data->last_end = data->pos_min;
b81d288d 1253 data->last_start_min =
a0ed51b3 1254 data->pos_min - CHR_SVLEN(last_str);
b81d288d
AB
1255 data->last_start_max = is_inf
1256 ? I32_MAX
c277df42 1257 : data->pos_min + data->pos_delta
a0ed51b3 1258 - CHR_SVLEN(last_str);
c277df42
IZ
1259 }
1260 data->longest = &(data->longest_float);
1261 }
aca2d497 1262 SvREFCNT_dec(last_str);
c277df42 1263 }
405ff068 1264 if (data && (fl & SF_HAS_EVAL))
c277df42
IZ
1265 data->flags |= SF_HAS_EVAL;
1266 optimize_curly_tail:
c277df42 1267 if (OP(oscan) != CURLYX) {
22c35a8c 1268 while (PL_regkind[(U8)OP(next = regnext(oscan))] == NOTHING
c277df42
IZ
1269 && NEXT_OFF(next))
1270 NEXT_OFF(oscan) += NEXT_OFF(next);
1271 }
c277df42 1272 continue;
653099ff 1273 default: /* REF and CLUMP only? */
c277df42 1274 if (flags & SCF_DO_SUBSTR) {
830247a4 1275 scan_commit(pRExC_state,data); /* Cannot expect anything... */
c277df42
IZ
1276 data->longest = &(data->longest_float);
1277 }
aca2d497 1278 is_inf = is_inf_internal = 1;
653099ff 1279 if (flags & SCF_DO_STCLASS_OR)
830247a4 1280 cl_anything(pRExC_state, data->start_class);
653099ff 1281 flags &= ~SCF_DO_STCLASS;
c277df42
IZ
1282 break;
1283 }
a0ed51b3 1284 }
ffc61ed2 1285 else if (strchr((char*)PL_simple,OP(scan))) {
9c5ffd7c 1286 int value = 0;
653099ff 1287
c277df42 1288 if (flags & SCF_DO_SUBSTR) {
830247a4 1289 scan_commit(pRExC_state,data);
c277df42
IZ
1290 data->pos_min++;
1291 }
1292 min++;
653099ff
GS
1293 if (flags & SCF_DO_STCLASS) {
1294 data->start_class->flags &= ~ANYOF_EOS; /* No match on empty */
1295
1296 /* Some of the logic below assumes that switching
1297 locale on will only add false positives. */
1298 switch (PL_regkind[(U8)OP(scan)]) {
653099ff 1299 case SANY:
653099ff
GS
1300 default:
1301 do_default:
1302 /* Perl_croak(aTHX_ "panic: unexpected simple REx opcode %d", OP(scan)); */
1303 if (flags & SCF_DO_STCLASS_OR) /* Allow everything */
830247a4 1304 cl_anything(pRExC_state, data->start_class);
653099ff
GS
1305 break;
1306 case REG_ANY:
1307 if (OP(scan) == SANY)
1308 goto do_default;
1309 if (flags & SCF_DO_STCLASS_OR) { /* Everything but \n */
1310 value = (ANYOF_BITMAP_TEST(data->start_class,'\n')
1311 || (data->start_class->flags & ANYOF_CLASS));
830247a4 1312 cl_anything(pRExC_state, data->start_class);
653099ff
GS
1313 }
1314 if (flags & SCF_DO_STCLASS_AND || !value)
1315 ANYOF_BITMAP_CLEAR(data->start_class,'\n');
1316 break;
1317 case ANYOF:
1318 if (flags & SCF_DO_STCLASS_AND)
1319 cl_and(data->start_class,
1320 (struct regnode_charclass_class*)scan);
1321 else
830247a4 1322 cl_or(pRExC_state, data->start_class,
653099ff
GS
1323 (struct regnode_charclass_class*)scan);
1324 break;
1325 case ALNUM:
1326 if (flags & SCF_DO_STCLASS_AND) {
1327 if (!(data->start_class->flags & ANYOF_LOCALE)) {
1328 ANYOF_CLASS_CLEAR(data->start_class,ANYOF_NALNUM);
1329 for (value = 0; value < 256; value++)
1330 if (!isALNUM(value))
1331 ANYOF_BITMAP_CLEAR(data->start_class, value);
1332 }
1333 }
1334 else {
1335 if (data->start_class->flags & ANYOF_LOCALE)
1336 ANYOF_CLASS_SET(data->start_class,ANYOF_ALNUM);
1337 else {
1338 for (value = 0; value < 256; value++)
1339 if (isALNUM(value))
b81d288d 1340 ANYOF_BITMAP_SET(data->start_class, value);
653099ff
GS
1341 }
1342 }
1343 break;
1344 case ALNUML:
1345 if (flags & SCF_DO_STCLASS_AND) {
1346 if (data->start_class->flags & ANYOF_LOCALE)
1347 ANYOF_CLASS_CLEAR(data->start_class,ANYOF_NALNUM);
1348 }
1349 else {
1350 ANYOF_CLASS_SET(data->start_class,ANYOF_ALNUM);
1351 data->start_class->flags |= ANYOF_LOCALE;
1352 }
1353 break;
1354 case NALNUM:
1355 if (flags & SCF_DO_STCLASS_AND) {
1356 if (!(data->start_class->flags & ANYOF_LOCALE)) {
1357 ANYOF_CLASS_CLEAR(data->start_class,ANYOF_ALNUM);
1358 for (value = 0; value < 256; value++)
1359 if (isALNUM(value))
1360 ANYOF_BITMAP_CLEAR(data->start_class, value);
1361 }
1362 }
1363 else {
1364 if (data->start_class->flags & ANYOF_LOCALE)
1365 ANYOF_CLASS_SET(data->start_class,ANYOF_NALNUM);
1366 else {
1367 for (value = 0; value < 256; value++)
1368 if (!isALNUM(value))
b81d288d 1369 ANYOF_BITMAP_SET(data->start_class, value);
653099ff
GS
1370 }
1371 }
1372 break;
1373 case NALNUML:
1374 if (flags & SCF_DO_STCLASS_AND) {
1375 if (data->start_class->flags & ANYOF_LOCALE)
1376 ANYOF_CLASS_CLEAR(data->start_class,ANYOF_ALNUM);
1377 }
1378 else {
1379 data->start_class->flags |= ANYOF_LOCALE;
1380 ANYOF_CLASS_SET(data->start_class,ANYOF_NALNUM);
1381 }
1382 break;
1383 case SPACE:
1384 if (flags & SCF_DO_STCLASS_AND) {
1385 if (!(data->start_class->flags & ANYOF_LOCALE)) {
1386 ANYOF_CLASS_CLEAR(data->start_class,ANYOF_NSPACE);
1387 for (value = 0; value < 256; value++)
1388 if (!isSPACE(value))
1389 ANYOF_BITMAP_CLEAR(data->start_class, value);
1390 }
1391 }
1392 else {
1393 if (data->start_class->flags & ANYOF_LOCALE)
1394 ANYOF_CLASS_SET(data->start_class,ANYOF_SPACE);
1395 else {
1396 for (value = 0; value < 256; value++)
1397 if (isSPACE(value))
b81d288d 1398 ANYOF_BITMAP_SET(data->start_class, value);
653099ff
GS
1399 }
1400 }
1401 break;
1402 case SPACEL:
1403 if (flags & SCF_DO_STCLASS_AND) {
1404 if (data->start_class->flags & ANYOF_LOCALE)
1405 ANYOF_CLASS_CLEAR(data->start_class,ANYOF_NSPACE);
1406 }
1407 else {
1408 data->start_class->flags |= ANYOF_LOCALE;
1409 ANYOF_CLASS_SET(data->start_class,ANYOF_SPACE);
1410 }
1411 break;
1412 case NSPACE:
1413 if (flags & SCF_DO_STCLASS_AND) {
1414 if (!(data->start_class->flags & ANYOF_LOCALE)) {
1415 ANYOF_CLASS_CLEAR(data->start_class,ANYOF_SPACE);
1416 for (value = 0; value < 256; value++)
1417 if (isSPACE(value))
1418 ANYOF_BITMAP_CLEAR(data->start_class, value);
1419 }
1420 }
1421 else {
1422 if (data->start_class->flags & ANYOF_LOCALE)
1423 ANYOF_CLASS_SET(data->start_class,ANYOF_NSPACE);
1424 else {
1425 for (value = 0; value < 256; value++)
1426 if (!isSPACE(value))
b81d288d 1427 ANYOF_BITMAP_SET(data->start_class, value);
653099ff
GS
1428 }
1429 }
1430 break;
1431 case NSPACEL:
1432 if (flags & SCF_DO_STCLASS_AND) {
1433 if (data->start_class->flags & ANYOF_LOCALE) {
1434 ANYOF_CLASS_CLEAR(data->start_class,ANYOF_SPACE);
1435 for (value = 0; value < 256; value++)
1436 if (!isSPACE(value))
1437 ANYOF_BITMAP_CLEAR(data->start_class, value);
1438 }
1439 }
1440 else {
1441 data->start_class->flags |= ANYOF_LOCALE;
1442 ANYOF_CLASS_SET(data->start_class,ANYOF_NSPACE);
1443 }
1444 break;
1445 case DIGIT:
1446 if (flags & SCF_DO_STCLASS_AND) {
1447 ANYOF_CLASS_CLEAR(data->start_class,ANYOF_NDIGIT);
1448 for (value = 0; value < 256; value++)
1449 if (!isDIGIT(value))
1450 ANYOF_BITMAP_CLEAR(data->start_class, value);
1451 }
1452 else {
1453 if (data->start_class->flags & ANYOF_LOCALE)
1454 ANYOF_CLASS_SET(data->start_class,ANYOF_DIGIT);
1455 else {
1456 for (value = 0; value < 256; value++)
1457 if (isDIGIT(value))
b81d288d 1458 ANYOF_BITMAP_SET(data->start_class, value);
653099ff
GS
1459 }
1460 }
1461 break;
1462 case NDIGIT:
1463 if (flags & SCF_DO_STCLASS_AND) {
1464 ANYOF_CLASS_CLEAR(data->start_class,ANYOF_DIGIT);
1465 for (value = 0; value < 256; value++)
1466 if (isDIGIT(value))
1467 ANYOF_BITMAP_CLEAR(data->start_class, value);
1468 }
1469 else {
1470 if (data->start_class->flags & ANYOF_LOCALE)
1471 ANYOF_CLASS_SET(data->start_class,ANYOF_NDIGIT);
1472 else {
1473 for (value = 0; value < 256; value++)
1474 if (!isDIGIT(value))
b81d288d 1475 ANYOF_BITMAP_SET(data->start_class, value);
653099ff
GS
1476 }
1477 }
1478 break;
1479 }
1480 if (flags & SCF_DO_STCLASS_OR)
1481 cl_and(data->start_class, &and_with);
1482 flags &= ~SCF_DO_STCLASS;
1483 }
a0ed51b3 1484 }
22c35a8c 1485 else if (PL_regkind[(U8)OP(scan)] == EOL && flags & SCF_DO_SUBSTR) {
c277df42
IZ
1486 data->flags |= (OP(scan) == MEOL
1487 ? SF_BEFORE_MEOL
1488 : SF_BEFORE_SEOL);
a0ed51b3 1489 }
653099ff
GS
1490 else if ( PL_regkind[(U8)OP(scan)] == BRANCHJ
1491 /* Lookbehind, or need to calculate parens/evals/stclass: */
1492 && (scan->flags || data || (flags & SCF_DO_STCLASS))
c277df42 1493 && (OP(scan) == IFMATCH || OP(scan) == UNLESSM)) {
653099ff 1494 /* Lookahead/lookbehind */
cb434fcc 1495 I32 deltanext, minnext, fake = 0;
c277df42 1496 regnode *nscan;
653099ff
GS
1497 struct regnode_charclass_class intrnl;
1498 int f = 0;
c277df42
IZ
1499
1500 data_fake.flags = 0;
b81d288d 1501 if (data) {
2c2d71f5 1502 data_fake.whilem_c = data->whilem_c;
cb434fcc
IZ
1503 data_fake.last_closep = data->last_closep;
1504 }
1505 else
1506 data_fake.last_closep = &fake;
653099ff
GS
1507 if ( flags & SCF_DO_STCLASS && !scan->flags
1508 && OP(scan) == IFMATCH ) { /* Lookahead */
830247a4 1509 cl_init(pRExC_state, &intrnl);
653099ff 1510 data_fake.start_class = &intrnl;
e1901655 1511 f |= SCF_DO_STCLASS_AND;
653099ff 1512 }
e1901655
IZ
1513 if (flags & SCF_WHILEM_VISITED_POS)
1514 f |= SCF_WHILEM_VISITED_POS;
c277df42
IZ
1515 next = regnext(scan);
1516 nscan = NEXTOPER(NEXTOPER(scan));
830247a4 1517 minnext = study_chunk(pRExC_state, &nscan, &deltanext, last, &data_fake, f);
c277df42
IZ
1518 if (scan->flags) {
1519 if (deltanext) {
9baa0206 1520 vFAIL("Variable length lookbehind not implemented");
a0ed51b3
LW
1521 }
1522 else if (minnext > U8_MAX) {
9baa0206 1523 vFAIL2("Lookbehind longer than %"UVuf" not implemented", (UV)U8_MAX);
c277df42
IZ
1524 }
1525 scan->flags = minnext;
1526 }
1527 if (data && data_fake.flags & (SF_HAS_PAR|SF_IN_PAR))
1528 pars++;
405ff068 1529 if (data && (data_fake.flags & SF_HAS_EVAL))
c277df42 1530 data->flags |= SF_HAS_EVAL;
2c2d71f5
JH
1531 if (data)
1532 data->whilem_c = data_fake.whilem_c;
e1901655 1533 if (f & SCF_DO_STCLASS_AND) {
653099ff
GS
1534 int was = (data->start_class->flags & ANYOF_EOS);
1535
1536 cl_and(data->start_class, &intrnl);
1537 if (was)
1538 data->start_class->flags |= ANYOF_EOS;
1539 }
a0ed51b3
LW
1540 }
1541 else if (OP(scan) == OPEN) {
c277df42 1542 pars++;
a0ed51b3 1543 }
cb434fcc
IZ
1544 else if (OP(scan) == CLOSE) {
1545 if (ARG(scan) == is_par) {
1546 next = regnext(scan);
c277df42 1547
cb434fcc
IZ
1548 if ( next && (OP(next) != WHILEM) && next < last)
1549 is_par = 0; /* Disable optimization */
1550 }
1551 if (data)
1552 *(data->last_closep) = ARG(scan);
a0ed51b3
LW
1553 }
1554 else if (OP(scan) == EVAL) {
c277df42
IZ
1555 if (data)
1556 data->flags |= SF_HAS_EVAL;
1557 }
96776eda 1558 else if (OP(scan) == LOGICAL && scan->flags == 2) { /* Embedded follows */
0f5d15d6 1559 if (flags & SCF_DO_SUBSTR) {
830247a4 1560 scan_commit(pRExC_state,data);
0f5d15d6
IZ
1561 data->longest = &(data->longest_float);
1562 }
1563 is_inf = is_inf_internal = 1;
653099ff 1564 if (flags & SCF_DO_STCLASS_OR) /* Allow everything */
830247a4 1565 cl_anything(pRExC_state, data->start_class);
96776eda 1566 flags &= ~SCF_DO_STCLASS;
0f5d15d6 1567 }
c277df42
IZ
1568 /* Else: zero-length, ignore. */
1569 scan = regnext(scan);
1570 }
1571
1572 finish:
1573 *scanp = scan;
aca2d497 1574 *deltap = is_inf_internal ? I32_MAX : delta;
b81d288d 1575 if (flags & SCF_DO_SUBSTR && is_inf)
c277df42
IZ
1576 data->pos_delta = I32_MAX - data->pos_min;
1577 if (is_par > U8_MAX)
1578 is_par = 0;
1579 if (is_par && pars==1 && data) {
1580 data->flags |= SF_IN_PAR;
1581 data->flags &= ~SF_HAS_PAR;
a0ed51b3
LW
1582 }
1583 else if (pars && data) {
c277df42
IZ
1584 data->flags |= SF_HAS_PAR;
1585 data->flags &= ~SF_IN_PAR;
1586 }
653099ff
GS
1587 if (flags & SCF_DO_STCLASS_OR)
1588 cl_and(data->start_class, &and_with);
c277df42
IZ
1589 return min;
1590}
1591
76e3520e 1592STATIC I32
830247a4 1593S_add_data(pTHX_ RExC_state_t *pRExC_state, I32 n, char *s)
c277df42 1594{
830247a4 1595 if (RExC_rx->data) {
b81d288d
AB
1596 Renewc(RExC_rx->data,
1597 sizeof(*RExC_rx->data) + sizeof(void*) * (RExC_rx->data->count + n - 1),
c277df42 1598 char, struct reg_data);
830247a4
IZ
1599 Renew(RExC_rx->data->what, RExC_rx->data->count + n, U8);
1600 RExC_rx->data->count += n;
a0ed51b3
LW
1601 }
1602 else {
830247a4 1603 Newc(1207, RExC_rx->data, sizeof(*RExC_rx->data) + sizeof(void*) * (n - 1),
c277df42 1604 char, struct reg_data);
830247a4
IZ
1605 New(1208, RExC_rx->data->what, n, U8);
1606 RExC_rx->data->count = n;
c277df42 1607 }
830247a4
IZ
1608 Copy(s, RExC_rx->data->what + RExC_rx->data->count - n, n, U8);
1609 return RExC_rx->data->count - n;
c277df42
IZ
1610}
1611
d88dccdf 1612void
864dbfa3 1613Perl_reginitcolors(pTHX)
d88dccdf 1614{
d88dccdf
IZ
1615 int i = 0;
1616 char *s = PerlEnv_getenv("PERL_RE_COLORS");
b81d288d 1617
d88dccdf
IZ
1618 if (s) {
1619 PL_colors[0] = s = savepv(s);
1620 while (++i < 6) {
1621 s = strchr(s, '\t');
1622 if (s) {
1623 *s = '\0';
1624 PL_colors[i] = ++s;
1625 }
1626 else
c712d376 1627 PL_colors[i] = s = "";
d88dccdf
IZ
1628 }
1629 } else {
b81d288d 1630 while (i < 6)
d88dccdf
IZ
1631 PL_colors[i++] = "";
1632 }
1633 PL_colorset = 1;
1634}
1635
8615cb43 1636
a687059c 1637/*
e50aee73 1638 - pregcomp - compile a regular expression into internal code
a687059c
LW
1639 *
1640 * We can't allocate space until we know how big the compiled form will be,
1641 * but we can't compile it (and thus know how big it is) until we've got a
1642 * place to put the code. So we cheat: we compile it twice, once with code
1643 * generation turned off and size counting turned on, and once "for real".
1644 * This also means that we don't allocate space until we are sure that the
1645 * thing really will compile successfully, and we never have to move the
1646 * code and thus invalidate pointers into it. (Note that it has to be in
1647 * one piece because free() must be able to free it all.) [NB: not true in perl]
1648 *
1649 * Beware that the optimization-preparation code in here knows about some
1650 * of the structure of the compiled regexp. [I'll say.]
1651 */
1652regexp *
864dbfa3 1653Perl_pregcomp(pTHX_ char *exp, char *xend, PMOP *pm)
a687059c 1654{
a0d0e21e 1655 register regexp *r;
c277df42 1656 regnode *scan;
c277df42 1657 regnode *first;
a0d0e21e 1658 I32 flags;
a0d0e21e
LW
1659 I32 minlen = 0;
1660 I32 sawplus = 0;
1661 I32 sawopen = 0;
2c2d71f5 1662 scan_data_t data;
830247a4
IZ
1663 RExC_state_t RExC_state;
1664 RExC_state_t *pRExC_state = &RExC_state;
a0d0e21e
LW
1665
1666 if (exp == NULL)
c277df42 1667 FAIL("NULL regexp argument");
a0d0e21e 1668
830247a4 1669 /* XXXX This looks very suspicious... */
1aa99e6b
IH
1670 if (pm->op_pmdynflags & PMdf_CMP_UTF8)
1671 RExC_utf8 = 1;
a0ed51b3 1672 else
1aa99e6b 1673 RExC_utf8 = 0;
a0ed51b3 1674
5cfc7842 1675 RExC_precomp = exp;
35ef4773 1676 DEBUG_r(if (!PL_colorset) reginitcolors());
2c2d71f5 1677 DEBUG_r(PerlIO_printf(Perl_debug_log, "%sCompiling REx%s `%s%*s%s'\n",
d88dccdf 1678 PL_colors[4],PL_colors[5],PL_colors[0],
830247a4
IZ
1679 (int)(xend - exp), RExC_precomp, PL_colors[1]));
1680 RExC_flags16 = pm->op_pmflags;
1681 RExC_sawback = 0;
bbce6d69 1682
830247a4
IZ
1683 RExC_seen = 0;
1684 RExC_seen_zerolen = *exp == '^' ? -1 : 0;
1685 RExC_seen_evals = 0;
1686 RExC_extralen = 0;
c277df42 1687
bbce6d69 1688 /* First pass: determine size, legality. */
830247a4 1689 RExC_parse = exp;
fac92740 1690 RExC_start = exp;
830247a4
IZ
1691 RExC_end = xend;
1692 RExC_naughty = 0;
1693 RExC_npar = 1;
1694 RExC_size = 0L;
1695 RExC_emit = &PL_regdummy;
1696 RExC_whilem_seen = 0;
85ddcde9
JH
1697#if 0 /* REGC() is (currently) a NOP at the first pass.
1698 * Clever compilers notice this and complain. --jhi */
830247a4 1699 REGC((U8)REG_MAGIC, (char*)RExC_emit);
85ddcde9 1700#endif
830247a4 1701 if (reg(pRExC_state, 0, &flags) == NULL) {
830247a4 1702 RExC_precomp = Nullch;
a0d0e21e
LW
1703 return(NULL);
1704 }
830247a4 1705 DEBUG_r(PerlIO_printf(Perl_debug_log, "size %"IVdf" ", (IV)RExC_size));
c277df42 1706
c277df42
IZ
1707 /* Small enough for pointer-storage convention?
1708 If extralen==0, this means that we will not need long jumps. */
830247a4
IZ
1709 if (RExC_size >= 0x10000L && RExC_extralen)
1710 RExC_size += RExC_extralen;
c277df42 1711 else
830247a4
IZ
1712 RExC_extralen = 0;
1713 if (RExC_whilem_seen > 15)
1714 RExC_whilem_seen = 15;
a0d0e21e 1715
bbce6d69 1716 /* Allocate space and initialize. */
830247a4 1717 Newc(1001, r, sizeof(regexp) + (unsigned)RExC_size * sizeof(regnode),
c277df42 1718 char, regexp);
a0d0e21e 1719 if (r == NULL)
b45f050a
JF
1720 FAIL("Regexp out of space");
1721
0f79a09d
GS
1722#ifdef DEBUGGING
1723 /* avoid reading uninitialized memory in DEBUGGING code in study_chunk() */
830247a4 1724 Zero(r, sizeof(regexp) + (unsigned)RExC_size * sizeof(regnode), char);
0f79a09d 1725#endif
c277df42 1726 r->refcnt = 1;
bbce6d69 1727 r->prelen = xend - exp;
5cfc7842 1728 r->precomp = savepvn(RExC_precomp, r->prelen);
cf93c79d
IZ
1729 r->subbeg = NULL;
1730 r->reganch = pm->op_pmflags & PMf_COMPILETIME;
830247a4 1731 r->nparens = RExC_npar - 1; /* set early to validate backrefs */
4327152a
IZ
1732
1733 r->substrs = 0; /* Useful during FAIL. */
1734 r->startp = 0; /* Useful during FAIL. */
1735 r->endp = 0; /* Useful during FAIL. */
1736
fac92740
MJD
1737 Newz(1304, r->offsets, 2*RExC_size+1, U32); /* MJD 20001228 */
1738 if (r->offsets) {
1739 r->offsets[0] = RExC_size;
1740 }
1741 DEBUG_r(PerlIO_printf(Perl_debug_log,
392fbf5d 1742 "%s %"UVuf" bytes for offset annotations.\n",
fac92740 1743 r->offsets ? "Got" : "Couldn't get",
392fbf5d 1744 (UV)((2*RExC_size+1) * sizeof(U32))));
fac92740 1745
830247a4 1746 RExC_rx = r;
bbce6d69 1747
1748 /* Second pass: emit code. */
830247a4
IZ
1749 RExC_parse = exp;
1750 RExC_end = xend;
1751 RExC_naughty = 0;
1752 RExC_npar = 1;
fac92740 1753 RExC_emit_start = r->program;
830247a4 1754 RExC_emit = r->program;
2cd61cdb 1755 /* Store the count of eval-groups for security checks: */
830247a4
IZ
1756 RExC_emit->next_off = ((RExC_seen_evals > U16_MAX) ? U16_MAX : RExC_seen_evals);
1757 REGC((U8)REG_MAGIC, (char*) RExC_emit++);
c277df42 1758 r->data = 0;
830247a4 1759 if (reg(pRExC_state, 0, &flags) == NULL)
a0d0e21e
LW
1760 return(NULL);
1761
1762 /* Dig out information for optimizations. */
cf93c79d 1763 r->reganch = pm->op_pmflags & PMf_COMPILETIME; /* Again? */
830247a4 1764 pm->op_pmflags = RExC_flags16;
a0ed51b3
LW
1765 if (UTF)
1766 r->reganch |= ROPT_UTF8;
c277df42 1767 r->regstclass = NULL;
830247a4 1768 if (RExC_naughty >= 10) /* Probably an expensive pattern. */
a0ed51b3 1769 r->reganch |= ROPT_NAUGHTY;
c277df42 1770 scan = r->program + 1; /* First BRANCH. */
2779dcf1
IZ
1771
1772 /* XXXX To minimize changes to RE engine we always allocate
1773 3-units-long substrs field. */
1774 Newz(1004, r->substrs, 1, struct reg_substr_data);
1775
2c2d71f5 1776 StructCopy(&zero_scan_data, &data, scan_data_t);
653099ff 1777 /* XXXX Should not we check for something else? Usually it is OPEN1... */
c277df42 1778 if (OP(scan) != BRANCH) { /* Only one top-level choice. */
c277df42 1779 I32 fake;
c5254dd6 1780 STRLEN longest_float_length, longest_fixed_length;
653099ff
GS
1781 struct regnode_charclass_class ch_class;
1782 int stclass_flag;
cb434fcc 1783 I32 last_close = 0;
a0d0e21e
LW
1784
1785 first = scan;
c277df42 1786 /* Skip introductions and multiplicators >= 1. */
a0d0e21e 1787 while ((OP(first) == OPEN && (sawopen = 1)) ||
653099ff 1788 /* An OR of *one* alternative - should not happen now. */
a0d0e21e
LW
1789 (OP(first) == BRANCH && OP(regnext(first)) != BRANCH) ||
1790 (OP(first) == PLUS) ||
1791 (OP(first) == MINMOD) ||
653099ff 1792 /* An {n,m} with n>0 */
22c35a8c 1793 (PL_regkind[(U8)OP(first)] == CURLY && ARG1(first) > 0) ) {
a0d0e21e
LW
1794 if (OP(first) == PLUS)
1795 sawplus = 1;
1796 else
1797 first += regarglen[(U8)OP(first)];
1798 first = NEXTOPER(first);
a687059c
LW
1799 }
1800
a0d0e21e
LW
1801 /* Starting-point info. */
1802 again:
653099ff 1803 if (PL_regkind[(U8)OP(first)] == EXACT) {
1aa99e6b
IH
1804 if (OP(first) == EXACT)
1805 ; /* Empty, get anchored substr later. */
1806 else if ((OP(first) == EXACTF || OP(first) == EXACTFL))
b3c9acc1
IZ
1807 r->regstclass = first;
1808 }
653099ff 1809 else if (strchr((char*)PL_simple,OP(first)))
a0d0e21e 1810 r->regstclass = first;
22c35a8c
GS
1811 else if (PL_regkind[(U8)OP(first)] == BOUND ||
1812 PL_regkind[(U8)OP(first)] == NBOUND)
a0d0e21e 1813 r->regstclass = first;
22c35a8c 1814 else if (PL_regkind[(U8)OP(first)] == BOL) {
cad2e5aa
JH
1815 r->reganch |= (OP(first) == MBOL
1816 ? ROPT_ANCH_MBOL
1817 : (OP(first) == SBOL
1818 ? ROPT_ANCH_SBOL
1819 : ROPT_ANCH_BOL));
a0d0e21e 1820 first = NEXTOPER(first);
774d564b 1821 goto again;
1822 }
1823 else if (OP(first) == GPOS) {
1824 r->reganch |= ROPT_ANCH_GPOS;
1825 first = NEXTOPER(first);
1826 goto again;
a0d0e21e
LW
1827 }
1828 else if ((OP(first) == STAR &&
22c35a8c 1829 PL_regkind[(U8)OP(NEXTOPER(first))] == REG_ANY) &&
a0d0e21e
LW
1830 !(r->reganch & ROPT_ANCH) )
1831 {
1832 /* turn .* into ^.* with an implied $*=1 */
cad2e5aa
JH
1833 int type = OP(NEXTOPER(first));
1834
ffc61ed2 1835 if (type == REG_ANY)
cad2e5aa
JH
1836 type = ROPT_ANCH_MBOL;
1837 else
1838 type = ROPT_ANCH_SBOL;
1839
1840 r->reganch |= type | ROPT_IMPLICIT;
a0d0e21e 1841 first = NEXTOPER(first);
774d564b 1842 goto again;
a0d0e21e 1843 }
b81d288d 1844 if (sawplus && (!sawopen || !RExC_sawback)
830247a4 1845 && !(RExC_seen & REG_SEEN_EVAL)) /* May examine pos and $& */
cad2e5aa
JH
1846 /* x+ must match at the 1st pos of run of x's */
1847 r->reganch |= ROPT_SKIP;
a0d0e21e 1848
c277df42 1849 /* Scan is after the zeroth branch, first is atomic matcher. */
b81d288d 1850 DEBUG_r(PerlIO_printf(Perl_debug_log, "first at %"IVdf"\n",
b900a521 1851 (IV)(first - scan + 1)));
a0d0e21e
LW
1852 /*
1853 * If there's something expensive in the r.e., find the
1854 * longest literal string that must appear and make it the
1855 * regmust. Resolve ties in favor of later strings, since
1856 * the regstart check works with the beginning of the r.e.
1857 * and avoiding duplication strengthens checking. Not a
1858 * strong reason, but sufficient in the absence of others.
1859 * [Now we resolve ties in favor of the earlier string if
c277df42 1860 * it happens that c_offset_min has been invalidated, since the
a0d0e21e
LW
1861 * earlier string may buy us something the later one won't.]
1862 */
a0d0e21e 1863 minlen = 0;
a687059c 1864
79cb57f6
GS
1865 data.longest_fixed = newSVpvn("",0);
1866 data.longest_float = newSVpvn("",0);
1867 data.last_found = newSVpvn("",0);
c277df42
IZ
1868 data.longest = &(data.longest_fixed);
1869 first = scan;
653099ff 1870 if (!r->regstclass) {
830247a4 1871 cl_init(pRExC_state, &ch_class);
653099ff
GS
1872 data.start_class = &ch_class;
1873 stclass_flag = SCF_DO_STCLASS_AND;
1874 } else /* XXXX Check for BOUND? */
1875 stclass_flag = 0;
cb434fcc 1876 data.last_closep = &last_close;
653099ff 1877
830247a4 1878 minlen = study_chunk(pRExC_state, &first, &fake, scan + RExC_size, /* Up to end */
e1901655 1879 &data, SCF_DO_SUBSTR | SCF_WHILEM_VISITED_POS | stclass_flag);
830247a4 1880 if ( RExC_npar == 1 && data.longest == &(data.longest_fixed)
b81d288d 1881 && data.last_start_min == 0 && data.last_end > 0
830247a4
IZ
1882 && !RExC_seen_zerolen
1883 && (!(RExC_seen & REG_SEEN_GPOS) || (r->reganch & ROPT_ANCH_GPOS)))
c277df42 1884 r->reganch |= ROPT_CHECK_ALL;
830247a4 1885 scan_commit(pRExC_state, &data);
c277df42
IZ
1886 SvREFCNT_dec(data.last_found);
1887
a0ed51b3 1888 longest_float_length = CHR_SVLEN(data.longest_float);
c5254dd6 1889 if (longest_float_length
c277df42
IZ
1890 || (data.flags & SF_FL_BEFORE_EOL
1891 && (!(data.flags & SF_FL_BEFORE_MEOL)
830247a4 1892 || (RExC_flags16 & PMf_MULTILINE)))) {
cf93c79d
IZ
1893 int t;
1894
a0ed51b3 1895 if (SvCUR(data.longest_fixed) /* ok to leave SvCUR */
aca2d497
IZ
1896 && data.offset_fixed == data.offset_float_min
1897 && SvCUR(data.longest_fixed) == SvCUR(data.longest_float))
1898 goto remove_float; /* As in (a)+. */
1899
c277df42
IZ
1900 r->float_substr = data.longest_float;
1901 r->float_min_offset = data.offset_float_min;
1902 r->float_max_offset = data.offset_float_max;
cf93c79d
IZ
1903 t = (data.flags & SF_FL_BEFORE_EOL /* Can't have SEOL and MULTI */
1904 && (!(data.flags & SF_FL_BEFORE_MEOL)
830247a4 1905 || (RExC_flags16 & PMf_MULTILINE)));
cf93c79d 1906 fbm_compile(r->float_substr, t ? FBMcf_TAIL : 0);
a0ed51b3
LW
1907 }
1908 else {
aca2d497 1909 remove_float:
c277df42
IZ
1910 r->float_substr = Nullsv;
1911 SvREFCNT_dec(data.longest_float);
c5254dd6 1912 longest_float_length = 0;
a0d0e21e 1913 }
c277df42 1914
a0ed51b3 1915 longest_fixed_length = CHR_SVLEN(data.longest_fixed);
c5254dd6 1916 if (longest_fixed_length
c277df42
IZ
1917 || (data.flags & SF_FIX_BEFORE_EOL /* Cannot have SEOL and MULTI */
1918 && (!(data.flags & SF_FIX_BEFORE_MEOL)
830247a4 1919 || (RExC_flags16 & PMf_MULTILINE)))) {
cf93c79d
IZ
1920 int t;
1921
c277df42
IZ
1922 r->anchored_substr = data.longest_fixed;
1923 r->anchored_offset = data.offset_fixed;
cf93c79d
IZ
1924 t = (data.flags & SF_FIX_BEFORE_EOL /* Can't have SEOL and MULTI */
1925 && (!(data.flags & SF_FIX_BEFORE_MEOL)
830247a4 1926 || (RExC_flags16 & PMf_MULTILINE)));
cf93c79d 1927 fbm_compile(r->anchored_substr, t ? FBMcf_TAIL : 0);
a0ed51b3
LW
1928 }
1929 else {
c277df42
IZ
1930 r->anchored_substr = Nullsv;
1931 SvREFCNT_dec(data.longest_fixed);
c5254dd6 1932 longest_fixed_length = 0;
a0d0e21e 1933 }
b81d288d 1934 if (r->regstclass
ffc61ed2 1935 && (OP(r->regstclass) == REG_ANY || OP(r->regstclass) == SANY))
653099ff
GS
1936 r->regstclass = NULL;
1937 if ((!r->anchored_substr || r->anchored_offset) && stclass_flag
1938 && !(data.start_class->flags & ANYOF_EOS)
1939 && !cl_is_anything(data.start_class)) {
830247a4 1940 I32 n = add_data(pRExC_state, 1, "f");
653099ff 1941
b81d288d 1942 New(1006, RExC_rx->data->data[n], 1,
653099ff
GS
1943 struct regnode_charclass_class);
1944 StructCopy(data.start_class,
830247a4 1945 (struct regnode_charclass_class*)RExC_rx->data->data[n],
653099ff 1946 struct regnode_charclass_class);
830247a4 1947 r->regstclass = (regnode*)RExC_rx->data->data[n];
653099ff 1948 r->reganch &= ~ROPT_SKIP; /* Used in find_byclass(). */
ffc61ed2 1949 PL_regdata = r->data; /* for regprop() */
9c5ffd7c
JH
1950 DEBUG_r({ SV *sv = sv_newmortal();
1951 regprop(sv, (regnode*)data.start_class);
1952 PerlIO_printf(Perl_debug_log,
1953 "synthetic stclass `%s'.\n",
1954 SvPVX(sv));});
653099ff 1955 }
c277df42
IZ
1956
1957 /* A temporary algorithm prefers floated substr to fixed one to dig more info. */
c5254dd6 1958 if (longest_fixed_length > longest_float_length) {
c277df42
IZ
1959 r->check_substr = r->anchored_substr;
1960 r->check_offset_min = r->check_offset_max = r->anchored_offset;
1961 if (r->reganch & ROPT_ANCH_SINGLE)
1962 r->reganch |= ROPT_NOSCAN;
a0ed51b3
LW
1963 }
1964 else {
c277df42
IZ
1965 r->check_substr = r->float_substr;
1966 r->check_offset_min = data.offset_float_min;
1967 r->check_offset_max = data.offset_float_max;
a0d0e21e 1968 }
30382c73
IZ
1969 /* XXXX Currently intuiting is not compatible with ANCH_GPOS.
1970 This should be changed ASAP! */
1971 if (r->check_substr && !(r->reganch & ROPT_ANCH_GPOS)) {
cad2e5aa
JH
1972 r->reganch |= RE_USE_INTUIT;
1973 if (SvTAIL(r->check_substr))
1974 r->reganch |= RE_INTUIT_TAIL;
1975 }
a0ed51b3
LW
1976 }
1977 else {
c277df42
IZ
1978 /* Several toplevels. Best we can is to set minlen. */
1979 I32 fake;
653099ff 1980 struct regnode_charclass_class ch_class;
cb434fcc 1981 I32 last_close = 0;
c277df42
IZ
1982
1983 DEBUG_r(PerlIO_printf(Perl_debug_log, "\n"));
1984 scan = r->program + 1;
830247a4 1985 cl_init(pRExC_state, &ch_class);
653099ff 1986 data.start_class = &ch_class;
cb434fcc 1987 data.last_closep = &last_close;
e1901655 1988 minlen = study_chunk(pRExC_state, &scan, &fake, scan + RExC_size, &data, SCF_DO_STCLASS_AND|SCF_WHILEM_VISITED_POS);
c277df42 1989 r->check_substr = r->anchored_substr = r->float_substr = Nullsv;
653099ff
GS
1990 if (!(data.start_class->flags & ANYOF_EOS)
1991 && !cl_is_anything(data.start_class)) {
830247a4 1992 I32 n = add_data(pRExC_state, 1, "f");
653099ff 1993
b81d288d 1994 New(1006, RExC_rx->data->data[n], 1,
653099ff
GS
1995 struct regnode_charclass_class);
1996 StructCopy(data.start_class,
830247a4 1997 (struct regnode_charclass_class*)RExC_rx->data->data[n],
653099ff 1998 struct regnode_charclass_class);
830247a4 1999 r->regstclass = (regnode*)RExC_rx->data->data[n];
653099ff 2000 r->reganch &= ~ROPT_SKIP; /* Used in find_byclass(). */
9c5ffd7c
JH
2001 DEBUG_r({ SV* sv = sv_newmortal();
2002 regprop(sv, (regnode*)data.start_class);
2003 PerlIO_printf(Perl_debug_log,
2004 "synthetic stclass `%s'.\n",
2005 SvPVX(sv));});
653099ff 2006 }
a0d0e21e
LW
2007 }
2008
a0d0e21e 2009 r->minlen = minlen;
b81d288d 2010 if (RExC_seen & REG_SEEN_GPOS)
c277df42 2011 r->reganch |= ROPT_GPOS_SEEN;
830247a4 2012 if (RExC_seen & REG_SEEN_LOOKBEHIND)
c277df42 2013 r->reganch |= ROPT_LOOKBEHIND_SEEN;
830247a4 2014 if (RExC_seen & REG_SEEN_EVAL)
ce862d02 2015 r->reganch |= ROPT_EVAL_SEEN;
3baa4c62
JH
2016 if (RExC_seen & REG_SEEN_SANY)
2017 r->reganch |= ROPT_SANY_SEEN;
830247a4
IZ
2018 Newz(1002, r->startp, RExC_npar, I32);
2019 Newz(1002, r->endp, RExC_npar, I32);
ffc61ed2 2020 PL_regdata = r->data; /* for regprop() */
a0d0e21e
LW
2021 DEBUG_r(regdump(r));
2022 return(r);
a687059c
LW
2023}
2024
2025/*
2026 - reg - regular expression, i.e. main body or parenthesized thing
2027 *
2028 * Caller must absorb opening parenthesis.
2029 *
2030 * Combining parenthesis handling with the base level of regular expression
2031 * is a trifle forced, but the need to tie the tails of the branches to what
2032 * follows makes it hard to avoid.
2033 */
76e3520e 2034STATIC regnode *
830247a4 2035S_reg(pTHX_ RExC_state_t *pRExC_state, I32 paren, I32 *flagp)
c277df42 2036 /* paren: Parenthesized? 0=top, 1=(, inside: changed to letter. */
a687059c 2037{
c277df42
IZ
2038 register regnode *ret; /* Will be the head of the group. */
2039 register regnode *br;
2040 register regnode *lastbr;
2041 register regnode *ender = 0;
a0d0e21e 2042 register I32 parno = 0;
830247a4 2043 I32 flags, oregflags = RExC_flags16, have_branch = 0, open = 0;
9d1d55b5
JP
2044
2045 /* for (?g), (?gc), and (?o) warnings; warning
2046 about (?c) will warn about (?g) -- japhy */
2047
2048 I32 wastedflags = 0x00,
2049 wasted_o = 0x01,
2050 wasted_g = 0x02,
2051 wasted_gc = 0x02 | 0x04,
2052 wasted_c = 0x04;
2053
fac92740 2054 char * parse_start = RExC_parse; /* MJD */
830247a4 2055 char *oregcomp_parse = RExC_parse;
c277df42 2056 char c;
a0d0e21e 2057
821b33a5 2058 *flagp = 0; /* Tentatively. */
a0d0e21e 2059
9d1d55b5 2060
a0d0e21e
LW
2061 /* Make an OPEN node, if parenthesized. */
2062 if (paren) {
fac92740 2063 if (*RExC_parse == '?') { /* (?...) */
ca9dfc88
IZ
2064 U16 posflags = 0, negflags = 0;
2065 U16 *flagsp = &posflags;
0f5d15d6 2066 int logical = 0;
830247a4 2067 char *seqstart = RExC_parse;
ca9dfc88 2068
830247a4
IZ
2069 RExC_parse++;
2070 paren = *RExC_parse++;
c277df42 2071 ret = NULL; /* For look-ahead/behind. */
a0d0e21e 2072 switch (paren) {
fac92740 2073 case '<': /* (?<...) */
830247a4 2074 RExC_seen |= REG_SEEN_LOOKBEHIND;
b81d288d 2075 if (*RExC_parse == '!')
c277df42 2076 paren = ',';
b81d288d 2077 if (*RExC_parse != '=' && *RExC_parse != '!')
c277df42 2078 goto unknown;
830247a4 2079 RExC_parse++;
fac92740
MJD
2080 case '=': /* (?=...) */
2081 case '!': /* (?!...) */
830247a4 2082 RExC_seen_zerolen++;
fac92740
MJD
2083 case ':': /* (?:...) */
2084 case '>': /* (?>...) */
a0d0e21e 2085 break;
fac92740
MJD
2086 case '$': /* (?$...) */
2087 case '@': /* (?@...) */
8615cb43 2088 vFAIL2("Sequence (?%c...) not implemented", (int)paren);
a0d0e21e 2089 break;
fac92740 2090 case '#': /* (?#...) */
830247a4
IZ
2091 while (*RExC_parse && *RExC_parse != ')')
2092 RExC_parse++;
2093 if (*RExC_parse != ')')
c277df42 2094 FAIL("Sequence (?#... not terminated");
830247a4 2095 nextchar(pRExC_state);
a0d0e21e
LW
2096 *flagp = TRYAGAIN;
2097 return NULL;
fac92740 2098 case 'p': /* (?p...) */
f9373011
PM
2099 if (SIZE_ONLY && ckWARN2(WARN_DEPRECATED, WARN_REGEXP))
2100 vWARNdep(RExC_parse, "(?p{}) is deprecated - use (??{})");
8c8ad484 2101 /* FALL THROUGH*/
fac92740 2102 case '?': /* (??...) */
0f5d15d6 2103 logical = 1;
830247a4 2104 paren = *RExC_parse++;
0f5d15d6 2105 /* FALL THROUGH */
fac92740 2106 case '{': /* (?{...}) */
c277df42 2107 {
c277df42
IZ
2108 I32 count = 1, n = 0;
2109 char c;
830247a4 2110 char *s = RExC_parse;
c277df42
IZ
2111 SV *sv;
2112 OP_4tree *sop, *rop;
2113
830247a4
IZ
2114 RExC_seen_zerolen++;
2115 RExC_seen |= REG_SEEN_EVAL;
2116 while (count && (c = *RExC_parse)) {
2117 if (c == '\\' && RExC_parse[1])
2118 RExC_parse++;
b81d288d 2119 else if (c == '{')
c277df42 2120 count++;
b81d288d 2121 else if (c == '}')
c277df42 2122 count--;
830247a4 2123 RExC_parse++;
c277df42 2124 }
830247a4 2125 if (*RExC_parse != ')')
b45f050a 2126 {
b81d288d 2127 RExC_parse = s;
b45f050a
JF
2128 vFAIL("Sequence (?{...}) not terminated or not {}-balanced");
2129 }
c277df42
IZ
2130 if (!SIZE_ONLY) {
2131 AV *av;
b81d288d
AB
2132
2133 if (RExC_parse - 1 - s)
830247a4 2134 sv = newSVpvn(s, RExC_parse - 1 - s);
c277df42 2135 else
79cb57f6 2136 sv = newSVpvn("", 0);
c277df42 2137
569233ed
SB
2138 ENTER;
2139 Perl_save_re_context(aTHX);
c277df42 2140 rop = sv_compile_2op(sv, &sop, "re", &av);
569233ed 2141 LEAVE;
c277df42 2142
830247a4
IZ
2143 n = add_data(pRExC_state, 3, "nop");
2144 RExC_rx->data->data[n] = (void*)rop;
2145 RExC_rx->data->data[n+1] = (void*)sop;
2146 RExC_rx->data->data[n+2] = (void*)av;
c277df42 2147 SvREFCNT_dec(sv);
a0ed51b3 2148 }
e24b16f9 2149 else { /* First pass */
830247a4 2150 if (PL_reginterp_cnt < ++RExC_seen_evals
e24b16f9 2151 && PL_curcop != &PL_compiling)
2cd61cdb
IZ
2152 /* No compiled RE interpolated, has runtime
2153 components ===> unsafe. */
2154 FAIL("Eval-group not allowed at runtime, use re 'eval'");
5b61d3f7 2155 if (PL_tainting && PL_tainted)
cc6b7395 2156 FAIL("Eval-group in insecure regular expression");
c277df42
IZ
2157 }
2158
830247a4 2159 nextchar(pRExC_state);
0f5d15d6 2160 if (logical) {
830247a4 2161 ret = reg_node(pRExC_state, LOGICAL);
0f5d15d6
IZ
2162 if (!SIZE_ONLY)
2163 ret->flags = 2;
830247a4 2164 regtail(pRExC_state, ret, reganode(pRExC_state, EVAL, n));
fac92740 2165 /* deal with the length of this later - MJD */
0f5d15d6
IZ
2166 return ret;
2167 }
830247a4 2168 return reganode(pRExC_state, EVAL, n);
c277df42 2169 }
fac92740 2170 case '(': /* (?(?{...})...) and (?(?=...)...) */
c277df42 2171 {
fac92740 2172 if (RExC_parse[0] == '?') { /* (?(?...)) */
b81d288d
AB
2173 if (RExC_parse[1] == '=' || RExC_parse[1] == '!'
2174 || RExC_parse[1] == '<'
830247a4 2175 || RExC_parse[1] == '{') { /* Lookahead or eval. */
c277df42
IZ
2176 I32 flag;
2177
830247a4 2178 ret = reg_node(pRExC_state, LOGICAL);
0f5d15d6
IZ
2179 if (!SIZE_ONLY)
2180 ret->flags = 1;
830247a4 2181 regtail(pRExC_state, ret, reg(pRExC_state, 1, &flag));
c277df42 2182 goto insert_if;
b81d288d 2183 }
a0ed51b3 2184 }
830247a4 2185 else if (RExC_parse[0] >= '1' && RExC_parse[0] <= '9' ) {
fac92740 2186 /* (?(1)...) */
830247a4 2187 parno = atoi(RExC_parse++);
c277df42 2188
830247a4
IZ
2189 while (isDIGIT(*RExC_parse))
2190 RExC_parse++;
fac92740
MJD
2191 ret = reganode(pRExC_state, GROUPP, parno);
2192
830247a4 2193 if ((c = *nextchar(pRExC_state)) != ')')
b45f050a 2194 vFAIL("Switch condition not recognized");
c277df42 2195 insert_if:
830247a4
IZ
2196 regtail(pRExC_state, ret, reganode(pRExC_state, IFTHEN, 0));
2197 br = regbranch(pRExC_state, &flags, 1);
c277df42 2198 if (br == NULL)
830247a4 2199 br = reganode(pRExC_state, LONGJMP, 0);
c277df42 2200 else
830247a4
IZ
2201 regtail(pRExC_state, br, reganode(pRExC_state, LONGJMP, 0));
2202 c = *nextchar(pRExC_state);
d1b80229
IZ
2203 if (flags&HASWIDTH)
2204 *flagp |= HASWIDTH;
c277df42 2205 if (c == '|') {
830247a4
IZ
2206 lastbr = reganode(pRExC_state, IFTHEN, 0); /* Fake one for optimizer. */
2207 regbranch(pRExC_state, &flags, 1);
2208 regtail(pRExC_state, ret, lastbr);
d1b80229
IZ
2209 if (flags&HASWIDTH)
2210 *flagp |= HASWIDTH;
830247a4 2211 c = *nextchar(pRExC_state);
a0ed51b3
LW
2212 }
2213 else
c277df42
IZ
2214 lastbr = NULL;
2215 if (c != ')')
8615cb43 2216 vFAIL("Switch (?(condition)... contains too many branches");
830247a4
IZ
2217 ender = reg_node(pRExC_state, TAIL);
2218 regtail(pRExC_state, br, ender);
c277df42 2219 if (lastbr) {
830247a4
IZ
2220 regtail(pRExC_state, lastbr, ender);
2221 regtail(pRExC_state, NEXTOPER(NEXTOPER(lastbr)), ender);
a0ed51b3
LW
2222 }
2223 else
830247a4 2224 regtail(pRExC_state, ret, ender);
c277df42 2225 return ret;
a0ed51b3
LW
2226 }
2227 else {
830247a4 2228 vFAIL2("Unknown switch condition (?(%.2s", RExC_parse);
c277df42
IZ
2229 }
2230 }
1b1626e4 2231 case 0:
830247a4 2232 RExC_parse--; /* for vFAIL to print correctly */
8615cb43 2233 vFAIL("Sequence (? incomplete");
1b1626e4 2234 break;
a0d0e21e 2235 default:
830247a4 2236 --RExC_parse;
fac92740 2237 parse_flags: /* (?i) */
830247a4 2238 while (*RExC_parse && strchr("iogcmsx", *RExC_parse)) {
9d1d55b5
JP
2239 /* (?g), (?gc) and (?o) are useless here
2240 and must be globally applied -- japhy */
2241
2242 if (*RExC_parse == 'o' || *RExC_parse == 'g') {
2243 if (SIZE_ONLY && ckWARN(WARN_REGEXP)) {
2244 I32 wflagbit = *RExC_parse == 'o' ? wasted_o : wasted_g;
2245 if (! (wastedflags & wflagbit) ) {
2246 wastedflags |= wflagbit;
2247 vWARN5(
2248 RExC_parse + 1,
2249 "Useless (%s%c) - %suse /%c modifier",
2250 flagsp == &negflags ? "?-" : "?",
2251 *RExC_parse,
2252 flagsp == &negflags ? "don't " : "",
2253 *RExC_parse
2254 );
2255 }
2256 }
2257 }
2258 else if (*RExC_parse == 'c') {
2259 if (SIZE_ONLY && ckWARN(WARN_REGEXP)) {
2260 if (! (wastedflags & wasted_c) ) {
2261 wastedflags |= wasted_gc;
2262 vWARN3(
2263 RExC_parse + 1,
2264 "Useless (%sc) - %suse /gc modifier",
2265 flagsp == &negflags ? "?-" : "?",
2266 flagsp == &negflags ? "don't " : ""
2267 );
2268 }
2269 }
2270 }
2271 else { pmflag(flagsp, *RExC_parse); }
2272
830247a4 2273 ++RExC_parse;
ca9dfc88 2274 }
830247a4 2275 if (*RExC_parse == '-') {
ca9dfc88 2276 flagsp = &negflags;
9d1d55b5 2277 wastedflags = 0; /* reset so (?g-c) warns twice */
830247a4 2278 ++RExC_parse;
ca9dfc88 2279 goto parse_flags;
48c036b1 2280 }
830247a4
IZ
2281 RExC_flags16 |= posflags;
2282 RExC_flags16 &= ~negflags;
2283 if (*RExC_parse == ':') {
2284 RExC_parse++;
ca9dfc88
IZ
2285 paren = ':';
2286 break;
2287 }
c277df42 2288 unknown:
830247a4
IZ
2289 if (*RExC_parse != ')') {
2290 RExC_parse++;
2291 vFAIL3("Sequence (%.*s...) not recognized", RExC_parse-seqstart, seqstart);
b45f050a 2292 }
830247a4 2293 nextchar(pRExC_state);
a0d0e21e
LW
2294 *flagp = TRYAGAIN;
2295 return NULL;
2296 }
2297 }
fac92740 2298 else { /* (...) */
830247a4
IZ
2299 parno = RExC_npar;
2300 RExC_npar++;
2301 ret = reganode(pRExC_state, OPEN, parno);
fac92740
MJD
2302 Set_Node_Length(ret, 1); /* MJD */
2303 Set_Node_Offset(ret, RExC_parse); /* MJD */
c277df42 2304 open = 1;
a0d0e21e 2305 }
a0ed51b3 2306 }
fac92740 2307 else /* ! paren */
a0d0e21e
LW
2308 ret = NULL;
2309
2310 /* Pick up the branches, linking them together. */
fac92740 2311 parse_start = RExC_parse; /* MJD */
830247a4 2312 br = regbranch(pRExC_state, &flags, 1);
fac92740
MJD
2313 /* branch_len = (paren != 0); */
2314
a0d0e21e
LW
2315 if (br == NULL)
2316 return(NULL);
830247a4
IZ
2317 if (*RExC_parse == '|') {
2318 if (!SIZE_ONLY && RExC_extralen) {
2319 reginsert(pRExC_state, BRANCHJ, br);
a0ed51b3 2320 }
fac92740 2321 else { /* MJD */
830247a4 2322 reginsert(pRExC_state, BRANCH, br);
fac92740
MJD
2323 Set_Node_Length(br, paren != 0);
2324 Set_Node_Offset_To_R(br-RExC_emit_start, parse_start-RExC_start);
2325 }
c277df42
IZ
2326 have_branch = 1;
2327 if (SIZE_ONLY)
830247a4 2328 RExC_extralen += 1; /* For BRANCHJ-BRANCH. */
a0ed51b3
LW
2329 }
2330 else if (paren == ':') {
c277df42
IZ
2331 *flagp |= flags&SIMPLE;
2332 }
2333 if (open) { /* Starts with OPEN. */
830247a4 2334 regtail(pRExC_state, ret, br); /* OPEN -> first. */
a0ed51b3
LW
2335 }
2336 else if (paren != '?') /* Not Conditional */
a0d0e21e 2337 ret = br;
821b33a5
IZ
2338 if (flags&HASWIDTH)
2339 *flagp |= HASWIDTH;
a0d0e21e 2340 *flagp |= flags&SPSTART;
c277df42 2341 lastbr = br;
830247a4
IZ
2342 while (*RExC_parse == '|') {
2343 if (!SIZE_ONLY && RExC_extralen) {
2344 ender = reganode(pRExC_state, LONGJMP,0);
2345 regtail(pRExC_state, NEXTOPER(NEXTOPER(lastbr)), ender); /* Append to the previous. */
c277df42
IZ
2346 }
2347 if (SIZE_ONLY)
830247a4
IZ
2348 RExC_extralen += 2; /* Account for LONGJMP. */
2349 nextchar(pRExC_state);
2350 br = regbranch(pRExC_state, &flags, 0);
fac92740 2351
a687059c 2352 if (br == NULL)
a0d0e21e 2353 return(NULL);
830247a4 2354 regtail(pRExC_state, lastbr, br); /* BRANCH -> BRANCH. */
c277df42 2355 lastbr = br;
821b33a5
IZ
2356 if (flags&HASWIDTH)
2357 *flagp |= HASWIDTH;
a687059c 2358 *flagp |= flags&SPSTART;
a0d0e21e
LW
2359 }
2360
c277df42
IZ
2361 if (have_branch || paren != ':') {
2362 /* Make a closing node, and hook it on the end. */
2363 switch (paren) {
2364 case ':':
830247a4 2365 ender = reg_node(pRExC_state, TAIL);
c277df42
IZ
2366 break;
2367 case 1:
830247a4 2368 ender = reganode(pRExC_state, CLOSE, parno);
fac92740
MJD
2369 Set_Node_Offset(ender,RExC_parse+1); /* MJD */
2370 Set_Node_Length(ender,1); /* MJD */
c277df42
IZ
2371 break;
2372 case '<':
c277df42
IZ
2373 case ',':
2374 case '=':
2375 case '!':
c277df42 2376 *flagp &= ~HASWIDTH;
821b33a5
IZ
2377 /* FALL THROUGH */
2378 case '>':
830247a4 2379 ender = reg_node(pRExC_state, SUCCEED);
c277df42
IZ
2380 break;
2381 case 0:
830247a4 2382 ender = reg_node(pRExC_state, END);
c277df42
IZ
2383 break;
2384 }
830247a4 2385 regtail(pRExC_state, lastbr, ender);
a0d0e21e 2386
c277df42
IZ
2387 if (have_branch) {
2388 /* Hook the tails of the branches to the closing node. */
2389 for (br = ret; br != NULL; br = regnext(br)) {
830247a4 2390 regoptail(pRExC_state, br, ender);
c277df42
IZ
2391 }
2392 }
a0d0e21e 2393 }
c277df42
IZ
2394
2395 {
2396 char *p;
2397 static char parens[] = "=!<,>";
2398
2399 if (paren && (p = strchr(parens, paren))) {
2400 int node = ((p - parens) % 2) ? UNLESSM : IFMATCH;
2401 int flag = (p - parens) > 1;
2402
2403 if (paren == '>')
2404 node = SUSPEND, flag = 0;
830247a4 2405 reginsert(pRExC_state, node,ret);
c277df42 2406 ret->flags = flag;
830247a4 2407 regtail(pRExC_state, ret, reg_node(pRExC_state, TAIL));
c277df42 2408 }
a0d0e21e
LW
2409 }
2410
2411 /* Check for proper termination. */
ce3e6498 2412 if (paren) {
830247a4
IZ
2413 RExC_flags16 = oregflags;
2414 if (RExC_parse >= RExC_end || *nextchar(pRExC_state) != ')') {
2415 RExC_parse = oregcomp_parse;
380a0633 2416 vFAIL("Unmatched (");
ce3e6498 2417 }
a0ed51b3 2418 }
830247a4
IZ
2419 else if (!paren && RExC_parse < RExC_end) {
2420 if (*RExC_parse == ')') {
2421 RExC_parse++;
380a0633 2422 vFAIL("Unmatched )");
a0ed51b3
LW
2423 }
2424 else
b45f050a 2425 FAIL("Junk on end of regexp"); /* "Can't happen". */
a0d0e21e
LW
2426 /* NOTREACHED */
2427 }
a687059c 2428
a0d0e21e 2429 return(ret);
a687059c
LW
2430}
2431
2432/*
2433 - regbranch - one alternative of an | operator
2434 *
2435 * Implements the concatenation operator.
2436 */
76e3520e 2437STATIC regnode *
830247a4 2438S_regbranch(pTHX_ RExC_state_t *pRExC_state, I32 *flagp, I32 first)
a687059c 2439{
c277df42
IZ
2440 register regnode *ret;
2441 register regnode *chain = NULL;
2442 register regnode *latest;
2443 I32 flags = 0, c = 0;
a0d0e21e 2444
b81d288d 2445 if (first)
c277df42
IZ
2446 ret = NULL;
2447 else {
b81d288d 2448 if (!SIZE_ONLY && RExC_extralen)
830247a4 2449 ret = reganode(pRExC_state, BRANCHJ,0);
fac92740 2450 else {
830247a4 2451 ret = reg_node(pRExC_state, BRANCH);
fac92740
MJD
2452 Set_Node_Length(ret, 1);
2453 }
c277df42
IZ
2454 }
2455
b81d288d 2456 if (!first && SIZE_ONLY)
830247a4 2457 RExC_extralen += 1; /* BRANCHJ */
b81d288d 2458
c277df42 2459 *flagp = WORST; /* Tentatively. */
a0d0e21e 2460
830247a4
IZ
2461 RExC_parse--;
2462 nextchar(pRExC_state);
2463 while (RExC_parse < RExC_end && *RExC_parse != '|' && *RExC_parse != ')') {
a0d0e21e 2464 flags &= ~TRYAGAIN;
830247a4 2465 latest = regpiece(pRExC_state, &flags);
a0d0e21e
LW
2466 if (latest == NULL) {
2467 if (flags & TRYAGAIN)
2468 continue;
2469 return(NULL);
a0ed51b3
LW
2470 }
2471 else if (ret == NULL)
c277df42 2472 ret = latest;
a0d0e21e 2473 *flagp |= flags&HASWIDTH;
c277df42 2474 if (chain == NULL) /* First piece. */
a0d0e21e
LW
2475 *flagp |= flags&SPSTART;
2476 else {
830247a4
IZ
2477 RExC_naughty++;
2478 regtail(pRExC_state, chain, latest);
a687059c 2479 }
a0d0e21e 2480 chain = latest;
c277df42
IZ
2481 c++;
2482 }
2483 if (chain == NULL) { /* Loop ran zero times. */
830247a4 2484 chain = reg_node(pRExC_state, NOTHING);
c277df42
IZ
2485 if (ret == NULL)
2486 ret = chain;
2487 }
2488 if (c == 1) {
2489 *flagp |= flags&SIMPLE;
a0d0e21e 2490 }
a687059c 2491
a0d0e21e 2492 return(ret);
a687059c
LW
2493}
2494
2495/*
2496 - regpiece - something followed by possible [*+?]
2497 *
2498 * Note that the branching code sequences used for ? and the general cases
2499 * of * and + are somewhat optimized: they use the same NOTHING node as
2500 * both the endmarker for their branch list and the body of the last branch.
2501 * It might seem that this node could be dispensed with entirely, but the
2502 * endmarker role is not redundant.
2503 */
76e3520e 2504STATIC regnode *
830247a4 2505S_regpiece(pTHX_ RExC_state_t *pRExC_state, I32 *flagp)
a687059c 2506{
c277df42 2507 register regnode *ret;
a0d0e21e
LW
2508 register char op;
2509 register char *next;
2510 I32 flags;
830247a4 2511 char *origparse = RExC_parse;
a0d0e21e
LW
2512 char *maxpos;
2513 I32 min;
c277df42 2514 I32 max = REG_INFTY;
fac92740 2515 char *parse_start;
a0d0e21e 2516
830247a4 2517 ret = regatom(pRExC_state, &flags);
a0d0e21e
LW
2518 if (ret == NULL) {
2519 if (flags & TRYAGAIN)
2520 *flagp |= TRYAGAIN;
2521 return(NULL);
2522 }
2523
830247a4 2524 op = *RExC_parse;
a0d0e21e 2525
830247a4 2526 if (op == '{' && regcurly(RExC_parse)) {
fac92740 2527 parse_start = RExC_parse; /* MJD */
830247a4 2528 next = RExC_parse + 1;
a0d0e21e
LW
2529 maxpos = Nullch;
2530 while (isDIGIT(*next) || *next == ',') {
2531 if (*next == ',') {
2532 if (maxpos)
2533 break;
2534 else
2535 maxpos = next;
a687059c 2536 }
a0d0e21e
LW
2537 next++;
2538 }
2539 if (*next == '}') { /* got one */
2540 if (!maxpos)
2541 maxpos = next;
830247a4
IZ
2542 RExC_parse++;
2543 min = atoi(RExC_parse);
a0d0e21e
LW
2544 if (*maxpos == ',')
2545 maxpos++;
2546 else
830247a4 2547 maxpos = RExC_parse;
a0d0e21e
LW
2548 max = atoi(maxpos);
2549 if (!max && *maxpos != '0')
c277df42
IZ
2550 max = REG_INFTY; /* meaning "infinity" */
2551 else if (max >= REG_INFTY)
8615cb43 2552 vFAIL2("Quantifier in {,} bigger than %d", REG_INFTY - 1);
830247a4
IZ
2553 RExC_parse = next;
2554 nextchar(pRExC_state);
a0d0e21e
LW
2555
2556 do_curly:
2557 if ((flags&SIMPLE)) {
830247a4
IZ
2558 RExC_naughty += 2 + RExC_naughty / 2;
2559 reginsert(pRExC_state, CURLY, ret);
fac92740
MJD
2560 Set_Node_Offset(ret, parse_start+1); /* MJD */
2561 Set_Node_Cur_Length(ret);
a0d0e21e
LW
2562 }
2563 else {
830247a4 2564 regnode *w = reg_node(pRExC_state, WHILEM);
2c2d71f5
JH
2565
2566 w->flags = 0;
830247a4
IZ
2567 regtail(pRExC_state, ret, w);
2568 if (!SIZE_ONLY && RExC_extralen) {
2569 reginsert(pRExC_state, LONGJMP,ret);
2570 reginsert(pRExC_state, NOTHING,ret);
c277df42
IZ
2571 NEXT_OFF(ret) = 3; /* Go over LONGJMP. */
2572 }
830247a4 2573 reginsert(pRExC_state, CURLYX,ret);
fac92740
MJD
2574 /* MJD hk */
2575 Set_Node_Offset(ret, parse_start+1);
2576 Set_Node_Length(ret,
2577 op == '{' ? (RExC_parse - parse_start) : 1);
2578
830247a4 2579 if (!SIZE_ONLY && RExC_extralen)
c277df42 2580 NEXT_OFF(ret) = 3; /* Go over NOTHING to LONGJMP. */
830247a4 2581 regtail(pRExC_state, ret, reg_node(pRExC_state, NOTHING));
c277df42 2582 if (SIZE_ONLY)
830247a4
IZ
2583 RExC_whilem_seen++, RExC_extralen += 3;
2584 RExC_naughty += 4 + RExC_naughty; /* compound interest */
a0d0e21e 2585 }
c277df42 2586 ret->flags = 0;
a0d0e21e
LW
2587
2588 if (min > 0)
821b33a5
IZ
2589 *flagp = WORST;
2590 if (max > 0)
2591 *flagp |= HASWIDTH;
a0d0e21e 2592 if (max && max < min)
8615cb43 2593 vFAIL("Can't do {n,m} with n > m");
c277df42
IZ
2594 if (!SIZE_ONLY) {
2595 ARG1_SET(ret, min);
2596 ARG2_SET(ret, max);
a687059c 2597 }
a687059c 2598
a0d0e21e 2599 goto nest_check;
a687059c 2600 }
a0d0e21e 2601 }
a687059c 2602
a0d0e21e
LW
2603 if (!ISMULT1(op)) {
2604 *flagp = flags;
a687059c 2605 return(ret);
a0d0e21e 2606 }
bb20fd44 2607
c277df42 2608#if 0 /* Now runtime fix should be reliable. */
b45f050a
JF
2609
2610 /* if this is reinstated, don't forget to put this back into perldiag:
2611
2612 =item Regexp *+ operand could be empty at {#} in regex m/%s/
2613
2614 (F) The part of the regexp subject to either the * or + quantifier
2615 could match an empty string. The {#} shows in the regular
2616 expression about where the problem was discovered.
2617
2618 */
2619
bb20fd44 2620 if (!(flags&HASWIDTH) && op != '?')
b45f050a 2621 vFAIL("Regexp *+ operand could be empty");
b81d288d 2622#endif
bb20fd44 2623
fac92740 2624 parse_start = RExC_parse;
830247a4 2625 nextchar(pRExC_state);
a0d0e21e 2626
821b33a5 2627 *flagp = (op != '+') ? (WORST|SPSTART|HASWIDTH) : (WORST|HASWIDTH);
a0d0e21e
LW
2628
2629 if (op == '*' && (flags&SIMPLE)) {
830247a4 2630 reginsert(pRExC_state, STAR, ret);
c277df42 2631 ret->flags = 0;
830247a4 2632 RExC_naughty += 4;
a0d0e21e
LW
2633 }
2634 else if (op == '*') {
2635 min = 0;
2636 goto do_curly;
a0ed51b3
LW
2637 }
2638 else if (op == '+' && (flags&SIMPLE)) {
830247a4 2639 reginsert(pRExC_state, PLUS, ret);
c277df42 2640 ret->flags = 0;
830247a4 2641 RExC_naughty += 3;
a0d0e21e
LW
2642 }
2643 else if (op == '+') {
2644 min = 1;
2645 goto do_curly;
a0ed51b3
LW
2646 }
2647 else if (op == '?') {
a0d0e21e
LW
2648 min = 0; max = 1;
2649 goto do_curly;
2650 }
2651 nest_check:
e476b1b5 2652 if (ckWARN(WARN_REGEXP) && !SIZE_ONLY && !(flags&HASWIDTH) && max > REG_INFTY/3) {
830247a4 2653 vWARN3(RExC_parse,
b45f050a 2654 "%.*s matches null string many times",
830247a4 2655 RExC_parse - origparse,
b45f050a 2656 origparse);
a0d0e21e
LW
2657 }
2658
830247a4
IZ
2659 if (*RExC_parse == '?') {
2660 nextchar(pRExC_state);
2661 reginsert(pRExC_state, MINMOD, ret);
2662 regtail(pRExC_state, ret, ret + NODE_STEP_REGNODE);
a0d0e21e 2663 }
830247a4
IZ
2664 if (ISMULT2(RExC_parse)) {
2665 RExC_parse++;
b45f050a
JF
2666 vFAIL("Nested quantifiers");
2667 }
a0d0e21e
LW
2668
2669 return(ret);
a687059c
LW
2670}
2671
2672/*
2673 - regatom - the lowest level
2674 *
2675 * Optimization: gobbles an entire sequence of ordinary characters so that
2676 * it can turn them into a single node, which is smaller to store and
2677 * faster to run. Backslashed characters are exceptions, each becoming a
2678 * separate node; the code is simpler that way and it's not worth fixing.
2679 *
b45f050a 2680 * [Yes, it is worth fixing, some scripts can run twice the speed.] */
76e3520e 2681STATIC regnode *
830247a4 2682S_regatom(pTHX_ RExC_state_t *pRExC_state, I32 *flagp)
a687059c 2683{
c277df42 2684 register regnode *ret = 0;
a0d0e21e 2685 I32 flags;
f06dbbb7 2686 char *parse_start = 0;
a0d0e21e
LW
2687
2688 *flagp = WORST; /* Tentatively. */
2689
2690tryagain:
830247a4 2691 switch (*RExC_parse) {
a0d0e21e 2692 case '^':
830247a4
IZ
2693 RExC_seen_zerolen++;
2694 nextchar(pRExC_state);
2695 if (RExC_flags16 & PMf_MULTILINE)
2696 ret = reg_node(pRExC_state, MBOL);
2697 else if (RExC_flags16 & PMf_SINGLELINE)
2698 ret = reg_node(pRExC_state, SBOL);
a0d0e21e 2699 else
830247a4 2700 ret = reg_node(pRExC_state, BOL);
fac92740 2701 Set_Node_Length(ret, 1); /* MJD */
a0d0e21e
LW
2702 break;
2703 case '$':
830247a4 2704 nextchar(pRExC_state);
b81d288d 2705 if (*RExC_parse)
830247a4
IZ
2706 RExC_seen_zerolen++;
2707 if (RExC_flags16 & PMf_MULTILINE)
2708 ret = reg_node(pRExC_state, MEOL);
2709 else if (RExC_flags16 & PMf_SINGLELINE)
2710 ret = reg_node(pRExC_state, SEOL);
a0d0e21e 2711 else
830247a4 2712 ret = reg_node(pRExC_state, EOL);
fac92740 2713 Set_Node_Length(ret, 1); /* MJD */
a0d0e21e
LW
2714 break;
2715 case '.':
830247a4 2716 nextchar(pRExC_state);
ffc61ed2
JH
2717 if (RExC_flags16 & PMf_SINGLELINE)
2718 ret = reg_node(pRExC_state, SANY);
2719 else
2720 ret = reg_node(pRExC_state, REG_ANY);
2721 *flagp |= HASWIDTH|SIMPLE;
830247a4 2722 RExC_naughty++;
fac92740 2723 Set_Node_Length(ret, 1); /* MJD */
a0d0e21e
LW
2724 break;
2725 case '[':
b45f050a 2726 {
830247a4 2727 char *oregcomp_parse = ++RExC_parse;
ffc61ed2 2728 ret = regclass(pRExC_state);
830247a4
IZ
2729 if (*RExC_parse != ']') {
2730 RExC_parse = oregcomp_parse;
b45f050a
JF
2731 vFAIL("Unmatched [");
2732 }
830247a4 2733 nextchar(pRExC_state);
a0d0e21e 2734 *flagp |= HASWIDTH|SIMPLE;
fac92740 2735 Set_Node_Length(ret, RExC_parse - oregcomp_parse + 1); /* MJD */
a0d0e21e 2736 break;
b45f050a 2737 }
a0d0e21e 2738 case '(':
830247a4
IZ
2739 nextchar(pRExC_state);
2740 ret = reg(pRExC_state, 1, &flags);
a0d0e21e 2741 if (ret == NULL) {
bf93d4cc 2742 if (flags & TRYAGAIN) {
830247a4 2743 if (RExC_parse == RExC_end) {
bf93d4cc
GS
2744 /* Make parent create an empty node if needed. */
2745 *flagp |= TRYAGAIN;
2746 return(NULL);
2747 }
a0d0e21e 2748 goto tryagain;
bf93d4cc 2749 }
a0d0e21e
LW
2750 return(NULL);
2751 }
c277df42 2752 *flagp |= flags&(HASWIDTH|SPSTART|SIMPLE);
a0d0e21e
LW
2753 break;
2754 case '|':
2755 case ')':
2756 if (flags & TRYAGAIN) {
2757 *flagp |= TRYAGAIN;
2758 return NULL;
2759 }
b45f050a 2760 vFAIL("Internal urp");
a0d0e21e
LW
2761 /* Supposed to be caught earlier. */
2762 break;
85afd4ae 2763 case '{':
830247a4
IZ
2764 if (!regcurly(RExC_parse)) {
2765 RExC_parse++;
85afd4ae
CS
2766 goto defchar;
2767 }
2768 /* FALL THROUGH */
a0d0e21e
LW
2769 case '?':
2770 case '+':
2771 case '*':
830247a4 2772 RExC_parse++;
b45f050a 2773 vFAIL("Quantifier follows nothing");
a0d0e21e
LW
2774 break;
2775 case '\\':
830247a4 2776 switch (*++RExC_parse) {
a0d0e21e 2777 case 'A':
830247a4
IZ
2778 RExC_seen_zerolen++;
2779 ret = reg_node(pRExC_state, SBOL);
a0d0e21e 2780 *flagp |= SIMPLE;
830247a4 2781 nextchar(pRExC_state);
fac92740 2782 Set_Node_Length(ret, 2); /* MJD */
a0d0e21e
LW
2783 break;
2784 case 'G':
830247a4
IZ
2785 ret = reg_node(pRExC_state, GPOS);
2786 RExC_seen |= REG_SEEN_GPOS;
a0d0e21e 2787 *flagp |= SIMPLE;
830247a4 2788 nextchar(pRExC_state);
fac92740 2789 Set_Node_Length(ret, 2); /* MJD */
a0d0e21e
LW
2790 break;
2791 case 'Z':
830247a4 2792 ret = reg_node(pRExC_state, SEOL);
a0d0e21e 2793 *flagp |= SIMPLE;
830247a4 2794 nextchar(pRExC_state);
a0d0e21e 2795 break;
b85d18e9 2796 case 'z':
830247a4 2797 ret = reg_node(pRExC_state, EOS);
b85d18e9 2798 *flagp |= SIMPLE;
830247a4
IZ
2799 RExC_seen_zerolen++; /* Do not optimize RE away */
2800 nextchar(pRExC_state);
fac92740 2801 Set_Node_Length(ret, 2); /* MJD */
b85d18e9 2802 break;
4a2d328f 2803 case 'C':
830247a4 2804 ret = reg_node(pRExC_state, SANY);
3baa4c62 2805 RExC_seen |= REG_SEEN_SANY;
a0ed51b3 2806 *flagp |= HASWIDTH|SIMPLE;
830247a4 2807 nextchar(pRExC_state);
fac92740 2808 Set_Node_Length(ret, 2); /* MJD */
a0ed51b3
LW
2809 break;
2810 case 'X':
830247a4 2811 ret = reg_node(pRExC_state, CLUMP);
a0ed51b3 2812 *flagp |= HASWIDTH;
830247a4 2813 nextchar(pRExC_state);
fac92740 2814 Set_Node_Length(ret, 2); /* MJD */
a0ed51b3 2815 break;
a0d0e21e 2816 case 'w':
ffc61ed2 2817 ret = reg_node(pRExC_state, LOC ? ALNUML : ALNUM);
a0d0e21e 2818 *flagp |= HASWIDTH|SIMPLE;
830247a4 2819 nextchar(pRExC_state);
fac92740 2820 Set_Node_Length(ret, 2); /* MJD */
a0d0e21e
LW
2821 break;
2822 case 'W':
ffc61ed2 2823 ret = reg_node(pRExC_state, LOC ? NALNUML : NALNUM);
a0d0e21e 2824 *flagp |= HASWIDTH|SIMPLE;
830247a4 2825 nextchar(pRExC_state);
fac92740 2826 Set_Node_Length(ret, 2); /* MJD */
a0d0e21e
LW
2827 break;
2828 case 'b':
830247a4
IZ
2829 RExC_seen_zerolen++;
2830 RExC_seen |= REG_SEEN_LOOKBEHIND;
ffc61ed2 2831 ret = reg_node(pRExC_state, LOC ? BOUNDL : BOUND);
a0d0e21e 2832 *flagp |= SIMPLE;
830247a4 2833 nextchar(pRExC_state);
fac92740 2834 Set_Node_Length(ret, 2); /* MJD */
a0d0e21e
LW
2835 break;
2836 case 'B':
830247a4
IZ
2837 RExC_seen_zerolen++;
2838 RExC_seen |= REG_SEEN_LOOKBEHIND;
ffc61ed2 2839 ret = reg_node(pRExC_state, LOC ? NBOUNDL : NBOUND);
a0d0e21e 2840 *flagp |= SIMPLE;
830247a4 2841 nextchar(pRExC_state);
fac92740 2842 Set_Node_Length(ret, 2); /* MJD */
a0d0e21e
LW
2843 break;
2844 case 's':
ffc61ed2 2845 ret = reg_node(pRExC_state, LOC ? SPACEL : SPACE);
a0d0e21e 2846 *flagp |= HASWIDTH|SIMPLE;
830247a4 2847 nextchar(pRExC_state);
fac92740 2848 Set_Node_Length(ret, 2); /* MJD */
a0d0e21e
LW
2849 break;
2850 case 'S':
ffc61ed2 2851 ret = reg_node(pRExC_state, LOC ? NSPACEL : NSPACE);
a0d0e21e 2852 *flagp |= HASWIDTH|SIMPLE;
830247a4 2853 nextchar(pRExC_state);
fac92740 2854 Set_Node_Length(ret, 2); /* MJD */
a0d0e21e
LW
2855 break;
2856 case 'd':
ffc61ed2 2857 ret = reg_node(pRExC_state, DIGIT);
a0d0e21e 2858 *flagp |= HASWIDTH|SIMPLE;
830247a4 2859 nextchar(pRExC_state);
fac92740 2860 Set_Node_Length(ret, 2); /* MJD */
a0d0e21e
LW
2861 break;
2862 case 'D':
ffc61ed2 2863 ret = reg_node(pRExC_state, NDIGIT);
a0d0e21e 2864 *flagp |= HASWIDTH|SIMPLE;
830247a4 2865 nextchar(pRExC_state);
fac92740 2866 Set_Node_Length(ret, 2); /* MJD */
a0d0e21e 2867 break;
a14b48bc
LW
2868 case 'p':
2869 case 'P':
3568d838 2870 {
830247a4 2871 char* oldregxend = RExC_end;
fac92740 2872 char* parse_start = RExC_parse;
a14b48bc 2873
830247a4 2874 if (RExC_parse[1] == '{') {
3568d838 2875 /* a lovely hack--pretend we saw [\pX] instead */
830247a4
IZ
2876 RExC_end = strchr(RExC_parse, '}');
2877 if (!RExC_end) {
2878 RExC_parse += 2;
2879 RExC_end = oldregxend;
b45f050a
JF
2880 vFAIL("Missing right brace on \\p{}");
2881 }
830247a4 2882 RExC_end++;
a14b48bc
LW
2883 }
2884 else
830247a4
IZ
2885 RExC_end = RExC_parse + 2;
2886 RExC_parse--;
a14b48bc 2887
ffc61ed2 2888 ret = regclass(pRExC_state);
a14b48bc 2889
830247a4
IZ
2890 RExC_end = oldregxend;
2891 RExC_parse--;
fac92740 2892 Set_Node_Cur_Length(ret); /* MJD */
830247a4 2893 nextchar(pRExC_state);
a14b48bc
LW
2894 *flagp |= HASWIDTH|SIMPLE;
2895 }
2896 break;
a0d0e21e
LW
2897 case 'n':
2898 case 'r':
2899 case 't':
2900 case 'f':
2901 case 'e':
2902 case 'a':
2903 case 'x':
2904 case 'c':
2905 case '0':
2906 goto defchar;
2907 case '1': case '2': case '3': case '4':
2908 case '5': case '6': case '7': case '8': case '9':
2909 {
830247a4 2910 I32 num = atoi(RExC_parse);
a0d0e21e 2911
830247a4 2912 if (num > 9 && num >= RExC_npar)
a0d0e21e
LW
2913 goto defchar;
2914 else {
fac92740 2915 char * parse_start = RExC_parse - 1; /* MJD */
830247a4
IZ
2916 while (isDIGIT(*RExC_parse))
2917 RExC_parse++;
b45f050a 2918
830247a4 2919 if (!SIZE_ONLY && num > RExC_rx->nparens)
9baa0206 2920 vFAIL("Reference to nonexistent group");
830247a4
IZ
2921 RExC_sawback = 1;
2922 ret = reganode(pRExC_state, FOLD
a0ed51b3 2923 ? (LOC ? REFFL : REFF)
c8756f30 2924 : REF, num);
a0d0e21e 2925 *flagp |= HASWIDTH;
fac92740
MJD
2926
2927 /* override incorrect value set in reganode MJD */
2928 Set_Node_Offset(ret, parse_start+1);
2929 Set_Node_Cur_Length(ret); /* MJD */
830247a4
IZ
2930 RExC_parse--;
2931 nextchar(pRExC_state);
a0d0e21e
LW
2932 }
2933 }
2934 break;
2935 case '\0':
830247a4 2936 if (RExC_parse >= RExC_end)
b45f050a 2937 FAIL("Trailing \\");
a0d0e21e
LW
2938 /* FALL THROUGH */
2939 default:
c9f97d15
IZ
2940 /* Do not generate `unrecognized' warnings here, we fall
2941 back into the quick-grab loop below */
a0d0e21e
LW
2942 goto defchar;
2943 }
2944 break;
4633a7c4
LW
2945
2946 case '#':
830247a4
IZ
2947 if (RExC_flags16 & PMf_EXTENDED) {
2948 while (RExC_parse < RExC_end && *RExC_parse != '\n') RExC_parse++;
2949 if (RExC_parse < RExC_end)
4633a7c4
LW
2950 goto tryagain;
2951 }
2952 /* FALL THROUGH */
2953
a0d0e21e 2954 default: {
ba210ebe 2955 register STRLEN len;
a0ed51b3 2956 register UV ender;
a0d0e21e 2957 register char *p;
c277df42 2958 char *oldp, *s;
ba210ebe 2959 STRLEN numlen;
f06dbbb7
JH
2960
2961 parse_start = RExC_parse - 1;
a0d0e21e 2962
830247a4 2963 RExC_parse++;
a0d0e21e
LW
2964
2965 defchar:
830247a4 2966 ret = reg_node(pRExC_state, FOLD
a0ed51b3 2967 ? (LOC ? EXACTFL : EXACTF)
bbce6d69 2968 : EXACT);
cd439c50 2969 s = STRING(ret);
830247a4
IZ
2970 for (len = 0, p = RExC_parse - 1;
2971 len < 127 && p < RExC_end;
a0d0e21e
LW
2972 len++)
2973 {
2974 oldp = p;
5b5a24f7 2975
830247a4
IZ
2976 if (RExC_flags16 & PMf_EXTENDED)
2977 p = regwhite(p, RExC_end);
a0d0e21e
LW
2978 switch (*p) {
2979 case '^':
2980 case '$':
2981 case '.':
2982 case '[':
2983 case '(':
2984 case ')':
2985 case '|':
2986 goto loopdone;
2987 case '\\':
2988 switch (*++p) {
2989 case 'A':
2990 case 'G':
2991 case 'Z':
b85d18e9 2992 case 'z':
a0d0e21e
LW
2993 case 'w':
2994 case 'W':
2995 case 'b':
2996 case 'B':
2997 case 's':
2998 case 'S':
2999 case 'd':
3000 case 'D':
a14b48bc
LW
3001 case 'p':
3002 case 'P':
a0d0e21e
LW
3003 --p;
3004 goto loopdone;
3005 case 'n':
3006 ender = '\n';
3007 p++;
a687059c 3008 break;
a0d0e21e
LW
3009 case 'r':
3010 ender = '\r';
3011 p++;
a687059c 3012 break;
a0d0e21e
LW
3013 case 't':
3014 ender = '\t';
3015 p++;
a687059c 3016 break;
a0d0e21e
LW
3017 case 'f':
3018 ender = '\f';
3019 p++;
a687059c 3020 break;
a0d0e21e 3021 case 'e':
c7f1f016 3022 ender = ASCII_TO_NATIVE('\033');
a0d0e21e 3023 p++;
a687059c 3024 break;
a0d0e21e 3025 case 'a':
c7f1f016 3026 ender = ASCII_TO_NATIVE('\007');
a0d0e21e 3027 p++;
a687059c 3028 break;
a0d0e21e 3029 case 'x':
a0ed51b3
LW
3030 if (*++p == '{') {
3031 char* e = strchr(p, '}');
b81d288d 3032
b45f050a 3033 if (!e) {
830247a4 3034 RExC_parse = p + 1;
b45f050a
JF
3035 vFAIL("Missing right brace on \\x{}");
3036 }
de5f0749 3037 else {
b21ed0a9 3038 numlen = 1; /* allow underscores */
de35ba6f 3039 ender = (UV)scan_hex(p + 1, e - p - 1, &numlen);
aaa80028
JH
3040 if (ender > 0xff)
3041 RExC_utf8 = 1;
b21ed0a9
GS
3042 /* numlen is generous */
3043 if (numlen + len >= 127) {
a0ed51b3
LW
3044 p--;
3045 goto loopdone;
3046 }
3047 p = e + 1;
3048 }
a0ed51b3
LW
3049 }
3050 else {
b21ed0a9 3051 numlen = 0; /* disallow underscores */
dff6d3cd 3052 ender = (UV)scan_hex(p, 2, &numlen);
a0ed51b3
LW
3053 p += numlen;
3054 }
a687059c 3055 break;
a0d0e21e
LW
3056 case 'c':
3057 p++;
bbce6d69 3058 ender = UCHARAT(p++);
3059 ender = toCTRL(ender);
a687059c 3060 break;
a0d0e21e
LW
3061 case '0': case '1': case '2': case '3':case '4':
3062 case '5': case '6': case '7': case '8':case '9':
3063 if (*p == '0' ||
830247a4 3064 (isDIGIT(p[1]) && atoi(p) >= RExC_npar) ) {
b21ed0a9 3065 numlen = 0; /* disallow underscores */
dff6d3cd 3066 ender = (UV)scan_oct(p, 3, &numlen);
a0d0e21e
LW
3067 p += numlen;
3068 }
3069 else {
3070 --p;
3071 goto loopdone;
a687059c
LW
3072 }
3073 break;
a0d0e21e 3074 case '\0':
830247a4 3075 if (p >= RExC_end)
b45f050a 3076 FAIL("Trailing \\");
a687059c 3077 /* FALL THROUGH */
a0d0e21e 3078 default:
e476b1b5 3079 if (!SIZE_ONLY && ckWARN(WARN_REGEXP) && isALPHA(*p))
b45f050a 3080 vWARN2(p +1, "Unrecognized escape \\%c passed through", *p);
a0ed51b3 3081 goto normal_default;
a0d0e21e
LW
3082 }
3083 break;
a687059c 3084 default:
a0ed51b3 3085 normal_default:
fd400ab9 3086 if (UTF8_IS_START(*p) && UTF) {
5e12f4fb 3087 ender = utf8n_to_uvchr((U8*)p, RExC_end - p,
ba210ebe 3088 &numlen, 0);
a0ed51b3
LW
3089 p += numlen;
3090 }
3091 else
3092 ender = *p++;
a0d0e21e 3093 break;
a687059c 3094 }
830247a4
IZ
3095 if (RExC_flags16 & PMf_EXTENDED)
3096 p = regwhite(p, RExC_end);
a0ed51b3
LW
3097 if (UTF && FOLD) {
3098 if (LOC)
5e12f4fb 3099 ender = toLOWER_LC_uvchr(ender);
a0ed51b3
LW
3100 else
3101 ender = toLOWER_uni(ender);
3102 }
a0d0e21e
LW
3103 if (ISMULT2(p)) { /* Back off on ?+*. */
3104 if (len)
3105 p = oldp;
2b9d42f0 3106 else if (!UNI_IS_INVARIANT(NATIVE_TO_UNI(ender)) && UTF) {
830247a4 3107 reguni(pRExC_state, ender, s, &numlen);
a0ed51b3
LW
3108 s += numlen;
3109 len += numlen;
3110 }
a0d0e21e
LW
3111 else {
3112 len++;
cd439c50 3113 REGC(ender, s++);
a0d0e21e
LW
3114 }
3115 break;
a687059c 3116 }
2b9d42f0 3117 if (!UNI_IS_INVARIANT(NATIVE_TO_UNI(ender)) && UTF) {
830247a4 3118 reguni(pRExC_state, ender, s, &numlen);
a0ed51b3
LW
3119 s += numlen;
3120 len += numlen - 1;
3121 }
3122 else
cd439c50 3123 REGC(ender, s++);
a0d0e21e
LW
3124 }
3125 loopdone:
830247a4 3126 RExC_parse = p - 1;
fac92740 3127 Set_Node_Cur_Length(ret); /* MJD */
830247a4 3128 nextchar(pRExC_state);
793db0cb
JH
3129 {
3130 /* len is STRLEN which is unsigned, need to copy to signed */
3131 IV iv = len;
3132 if (iv < 0)
3133 vFAIL("Internal disaster");
3134 }
a0d0e21e
LW
3135 if (len > 0)
3136 *flagp |= HASWIDTH;
3137 if (len == 1)
3138 *flagp |= SIMPLE;
c277df42 3139 if (!SIZE_ONLY)
cd439c50
IZ
3140 STR_LEN(ret) = len;
3141 if (SIZE_ONLY)
830247a4 3142 RExC_size += STR_SZ(len);
cd439c50 3143 else
830247a4 3144 RExC_emit += STR_SZ(len);
a687059c 3145 }
a0d0e21e
LW
3146 break;
3147 }
a687059c 3148
a0d0e21e 3149 return(ret);
a687059c
LW
3150}
3151
873ef191 3152STATIC char *
cea2e8a9 3153S_regwhite(pTHX_ char *p, char *e)
5b5a24f7
CS
3154{
3155 while (p < e) {
3156 if (isSPACE(*p))
3157 ++p;
3158 else if (*p == '#') {
3159 do {
3160 p++;
3161 } while (p < e && *p != '\n');
3162 }
3163 else
3164 break;
3165 }
3166 return p;
3167}
3168
b8c5462f
JH
3169/* Parse POSIX character classes: [[:foo:]], [[=foo=]], [[.foo.]].
3170 Character classes ([:foo:]) can also be negated ([:^foo:]).
3171 Returns a named class id (ANYOF_XXX) if successful, -1 otherwise.
3172 Equivalence classes ([=foo=]) and composites ([.foo.]) are parsed,
3173 but trigger warnings because they are currently unimplemented. */
3174STATIC I32
830247a4 3175S_regpposixcc(pTHX_ RExC_state_t *pRExC_state, I32 value)
620e46c5
JH
3176{
3177 char *posixcc = 0;
936ed897 3178 I32 namedclass = OOB_NAMEDCLASS;
620e46c5 3179
830247a4 3180 if (value == '[' && RExC_parse + 1 < RExC_end &&
620e46c5 3181 /* I smell either [: or [= or [. -- POSIX has been here, right? */
830247a4
IZ
3182 (*RExC_parse == ':' ||
3183 *RExC_parse == '=' ||
3184 *RExC_parse == '.')) {
3185 char c = *RExC_parse;
3186 char* s = RExC_parse++;
b81d288d 3187
830247a4
IZ
3188 while (RExC_parse < RExC_end && *RExC_parse != c)
3189 RExC_parse++;
3190 if (RExC_parse == RExC_end)
620e46c5 3191 /* Grandfather lone [:, [=, [. */
830247a4 3192 RExC_parse = s;
620e46c5 3193 else {
830247a4 3194 char* t = RExC_parse++; /* skip over the c */
b8c5462f 3195
830247a4
IZ
3196 if (*RExC_parse == ']') {
3197 RExC_parse++; /* skip over the ending ] */
b8c5462f
JH
3198 posixcc = s + 1;
3199 if (*s == ':') {
3200 I32 complement = *posixcc == '^' ? *posixcc++ : 0;
3201 I32 skip = 5; /* the most common skip */
3202
3203 switch (*posixcc) {
3204 case 'a':
3205 if (strnEQ(posixcc, "alnum", 5))
3206 namedclass =
3207 complement ? ANYOF_NALNUMC : ANYOF_ALNUMC;
3208 else if (strnEQ(posixcc, "alpha", 5))
3209 namedclass =
3210 complement ? ANYOF_NALPHA : ANYOF_ALPHA;
3211 else if (strnEQ(posixcc, "ascii", 5))
3212 namedclass =
3213 complement ? ANYOF_NASCII : ANYOF_ASCII;
3214 break;
aaa51d5e
JF
3215 case 'b':
3216 if (strnEQ(posixcc, "blank", 5))
3217 namedclass =
3218 complement ? ANYOF_NBLANK : ANYOF_BLANK;
3219 break;
b8c5462f
JH
3220 case 'c':
3221 if (strnEQ(posixcc, "cntrl", 5))
3222 namedclass =
3223 complement ? ANYOF_NCNTRL : ANYOF_CNTRL;
3224 break;
3225 case 'd':
3226 if (strnEQ(posixcc, "digit", 5))
3227 namedclass =
3228 complement ? ANYOF_NDIGIT : ANYOF_DIGIT;
3229 break;
3230 case 'g':
3231 if (strnEQ(posixcc, "graph", 5))
3232 namedclass =
3233 complement ? ANYOF_NGRAPH : ANYOF_GRAPH;
3234 break;
3235 case 'l':
3236 if (strnEQ(posixcc, "lower", 5))
3237 namedclass =
3238 complement ? ANYOF_NLOWER : ANYOF_LOWER;
3239 break;
3240 case 'p':
3241 if (strnEQ(posixcc, "print", 5))
3242 namedclass =
3243 complement ? ANYOF_NPRINT : ANYOF_PRINT;
3244 else if (strnEQ(posixcc, "punct", 5))
3245 namedclass =
3246 complement ? ANYOF_NPUNCT : ANYOF_PUNCT;
3247 break;
3248 case 's':
3249 if (strnEQ(posixcc, "space", 5))
3250 namedclass =
aaa51d5e 3251 complement ? ANYOF_NPSXSPC : ANYOF_PSXSPC;
cc4319de 3252 break;
b8c5462f
JH
3253 case 'u':
3254 if (strnEQ(posixcc, "upper", 5))
3255 namedclass =
3256 complement ? ANYOF_NUPPER : ANYOF_UPPER;
3257 break;
3258 case 'w': /* this is not POSIX, this is the Perl \w */
3259 if (strnEQ(posixcc, "word", 4)) {
3260 namedclass =
3261 complement ? ANYOF_NALNUM : ANYOF_ALNUM;
3262 skip = 4;
3263 }
3264 break;
3265 case 'x':
3266 if (strnEQ(posixcc, "xdigit", 6)) {
3267 namedclass =
3268 complement ? ANYOF_NXDIGIT : ANYOF_XDIGIT;
3269 skip = 6;
3270 }
3271 break;
3272 }
ac561586
JH
3273 if (namedclass == OOB_NAMEDCLASS ||
3274 posixcc[skip] != ':' ||
3275 posixcc[skip+1] != ']')
b45f050a
JF
3276 {
3277 Simple_vFAIL3("POSIX class [:%.*s:] unknown",
3278 t - s - 1, s + 1);
3279 }
3280 } else if (!SIZE_ONLY) {
b8c5462f 3281 /* [[=foo=]] and [[.foo.]] are still future. */
b45f050a 3282
830247a4 3283 /* adjust RExC_parse so the warning shows after
b45f050a 3284 the class closes */
830247a4
IZ
3285 while (*RExC_parse && *RExC_parse != ']')
3286 RExC_parse++;
b45f050a
JF
3287 Simple_vFAIL3("POSIX syntax [%c %c] is reserved for future extensions", c, c);
3288 }
b8c5462f
JH
3289 } else {
3290 /* Maternal grandfather:
3291 * "[:" ending in ":" but not in ":]" */
830247a4 3292 RExC_parse = s;
767d463e 3293 }
620e46c5
JH
3294 }
3295 }
3296
b8c5462f
JH
3297 return namedclass;
3298}
3299
3300STATIC void
830247a4 3301S_checkposixcc(pTHX_ RExC_state_t *pRExC_state)
b8c5462f 3302{
e476b1b5 3303 if (!SIZE_ONLY && ckWARN(WARN_REGEXP) &&
830247a4
IZ
3304 (*RExC_parse == ':' ||
3305 *RExC_parse == '=' ||
3306 *RExC_parse == '.')) {
3307 char *s = RExC_parse;
93733859 3308 char c = *s++;
b8c5462f
JH
3309
3310 while(*s && isALNUM(*s))
3311 s++;
3312 if (*s && c == *s && s[1] == ']') {
b45f050a
JF
3313 vWARN3(s+2, "POSIX syntax [%c %c] belongs inside character classes", c, c);
3314
3315 /* [[=foo=]] and [[.foo.]] are still future. */
b8c5462f 3316 if (c == '=' || c == '.')
b45f050a 3317 {
830247a4 3318 /* adjust RExC_parse so the error shows after
b45f050a 3319 the class closes */
830247a4 3320 while (*RExC_parse && *RExC_parse++ != ']')
b45f050a
JF
3321 ;
3322 Simple_vFAIL3("POSIX syntax [%c %c] is reserved for future extensions", c, c);
3323 }
b8c5462f
JH
3324 }
3325 }
620e46c5
JH
3326}
3327
76e3520e 3328STATIC regnode *
830247a4 3329S_regclass(pTHX_ RExC_state_t *pRExC_state)
a687059c 3330{
ffc61ed2 3331 register UV value;
3568d838 3332 register IV prevvalue = OOB_UNICODE;
ffc61ed2 3333 register IV range = 0;
c277df42 3334 register regnode *ret;
ba210ebe 3335 STRLEN numlen;
ffc61ed2 3336 IV namedclass;
9c5ffd7c 3337 char *rangebegin = 0;
936ed897 3338 bool need_class = 0;
9c5ffd7c 3339 SV *listsv = Nullsv;
ffc61ed2
JH
3340 register char *e;
3341 UV n;
3568d838 3342 bool optimize_invert = TRUE;
ffc61ed2
JH
3343
3344 ret = reganode(pRExC_state, ANYOF, 0);
3345
3346 if (!SIZE_ONLY)
3347 ANYOF_FLAGS(ret) = 0;
3348
3349 if (*RExC_parse == '^') { /* Complement of range. */
3350 RExC_naughty++;
3351 RExC_parse++;
3352 if (!SIZE_ONLY)
3353 ANYOF_FLAGS(ret) |= ANYOF_INVERT;
3354 }
a0d0e21e 3355
936ed897 3356 if (SIZE_ONLY)
830247a4 3357 RExC_size += ANYOF_SKIP;
936ed897 3358 else {
830247a4 3359 RExC_emit += ANYOF_SKIP;
936ed897
IZ
3360 if (FOLD)
3361 ANYOF_FLAGS(ret) |= ANYOF_FOLD;
3362 if (LOC)
3363 ANYOF_FLAGS(ret) |= ANYOF_LOCALE;
ffc61ed2
JH
3364 ANYOF_BITMAP_ZERO(ret);
3365 listsv = newSVpvn("# comment\n", 10);
a0d0e21e 3366 }
b8c5462f 3367
e476b1b5 3368 if (!SIZE_ONLY && ckWARN(WARN_REGEXP))
830247a4 3369 checkposixcc(pRExC_state);
b8c5462f 3370
830247a4 3371 if (*RExC_parse == ']' || *RExC_parse == '-')
ffc61ed2
JH
3372 goto charclassloop; /* allow 1st char to be ] or - */
3373
830247a4 3374 while (RExC_parse < RExC_end && *RExC_parse != ']') {
ffc61ed2
JH
3375
3376 charclassloop:
3377
3378 namedclass = OOB_NAMEDCLASS; /* initialize as illegal */
3379
73b437c8 3380 if (!range)
830247a4 3381 rangebegin = RExC_parse;
ffc61ed2 3382 if (UTF) {
5e12f4fb 3383 value = utf8n_to_uvchr((U8*)RExC_parse,
3568d838
JH
3384 RExC_end - RExC_parse,
3385 &numlen, 0);
ffc61ed2
JH
3386 RExC_parse += numlen;
3387 }
3388 else
3389 value = UCHARAT(RExC_parse++);
620e46c5 3390 if (value == '[')
830247a4 3391 namedclass = regpposixcc(pRExC_state, value);
620e46c5 3392 else if (value == '\\') {
ffc61ed2 3393 if (UTF) {
5e12f4fb 3394 value = utf8n_to_uvchr((U8*)RExC_parse,
ffc61ed2
JH
3395 RExC_end - RExC_parse,
3396 &numlen, 0);
3397 RExC_parse += numlen;
3398 }
3399 else
3400 value = UCHARAT(RExC_parse++);
470c3474 3401 /* Some compilers cannot handle switching on 64-bit integer
ffc61ed2 3402 * values, therefore value cannot be an UV. Yes, this will
e2962f66
JH
3403 * be a problem later if we want switch on Unicode.
3404 * A similar issue a little bit later when switching on
3405 * namedclass. --jhi */
ffc61ed2 3406 switch ((I32)value) {
b8c5462f
JH
3407 case 'w': namedclass = ANYOF_ALNUM; break;
3408 case 'W': namedclass = ANYOF_NALNUM; break;
3409 case 's': namedclass = ANYOF_SPACE; break;
3410 case 'S': namedclass = ANYOF_NSPACE; break;
3411 case 'd': namedclass = ANYOF_DIGIT; break;
3412 case 'D': namedclass = ANYOF_NDIGIT; break;
ffc61ed2
JH
3413 case 'p':
3414 case 'P':
3415 if (*RExC_parse == '{') {
3416 e = strchr(RExC_parse++, '}');
3417 if (!e)
3418 vFAIL("Missing right brace on \\p{}");
3419 n = e - RExC_parse;
3420 }
3421 else {
3422 e = RExC_parse;
3423 n = 1;
3424 }
3425 if (!SIZE_ONLY) {
3426 if (value == 'p')
3427 Perl_sv_catpvf(aTHX_ listsv,
3428 "+utf8::%.*s\n", (int)n, RExC_parse);
3429 else
3430 Perl_sv_catpvf(aTHX_ listsv,
3431 "!utf8::%.*s\n", (int)n, RExC_parse);
3432 }
3433 RExC_parse = e + 1;
3434 ANYOF_FLAGS(ret) |= ANYOF_UNICODE;
3435 continue;
b8c5462f
JH
3436 case 'n': value = '\n'; break;
3437 case 'r': value = '\r'; break;
3438 case 't': value = '\t'; break;
3439 case 'f': value = '\f'; break;
3440 case 'b': value = '\b'; break;
c7f1f016
NIS
3441 case 'e': value = ASCII_TO_NATIVE('\033');break;
3442 case 'a': value = ASCII_TO_NATIVE('\007');break;
b8c5462f 3443 case 'x':
ffc61ed2
JH
3444 if (*RExC_parse == '{') {
3445 e = strchr(RExC_parse++, '}');
b81d288d 3446 if (!e)
ffc61ed2
JH
3447 vFAIL("Missing right brace on \\x{}");
3448 numlen = 1; /* allow underscores */
3449 value = (UV)scan_hex(RExC_parse,
3450 e - RExC_parse,
3451 &numlen);
3452 RExC_parse = e + 1;
3453 }
3454 else {
3455 numlen = 0; /* disallow underscores */
3456 value = (UV)scan_hex(RExC_parse, 2, &numlen);
3457 RExC_parse += numlen;
3458 }
b8c5462f
JH
3459 break;
3460 case 'c':
830247a4 3461 value = UCHARAT(RExC_parse++);
b8c5462f
JH
3462 value = toCTRL(value);
3463 break;
3464 case '0': case '1': case '2': case '3': case '4':
3465 case '5': case '6': case '7': case '8': case '9':
b21ed0a9 3466 numlen = 0; /* disallow underscores */
830247a4
IZ
3467 value = (UV)scan_oct(--RExC_parse, 3, &numlen);
3468 RExC_parse += numlen;
b8c5462f 3469 break;
1028017a 3470 default:
e476b1b5 3471 if (!SIZE_ONLY && ckWARN(WARN_REGEXP) && isALPHA(value))
ffc61ed2
JH
3472 vWARN2(RExC_parse,
3473 "Unrecognized escape \\%c in character class passed through",
3474 (int)value);
1028017a 3475 break;
b8c5462f 3476 }
ffc61ed2
JH
3477 } /* end of \blah */
3478
3479 if (namedclass > OOB_NAMEDCLASS) { /* this is a named class \blah */
3480
3481 if (!SIZE_ONLY && !need_class)
936ed897 3482 ANYOF_CLASS_ZERO(ret);
ffc61ed2 3483
936ed897 3484 need_class = 1;
ffc61ed2
JH
3485
3486 /* a bad range like a-\d, a-[:digit:] ? */
3487 if (range) {
73b437c8 3488 if (!SIZE_ONLY) {
e476b1b5 3489 if (ckWARN(WARN_REGEXP))
830247a4 3490 vWARN4(RExC_parse,
b45f050a 3491 "False [] range \"%*.*s\"",
830247a4
IZ
3492 RExC_parse - rangebegin,
3493 RExC_parse - rangebegin,
b45f050a 3494 rangebegin);
3568d838
JH
3495 if (prevvalue < 256) {
3496 ANYOF_BITMAP_SET(ret, prevvalue);
ffc61ed2
JH
3497 ANYOF_BITMAP_SET(ret, '-');
3498 }
3499 else {
3500 ANYOF_FLAGS(ret) |= ANYOF_UNICODE;
3501 Perl_sv_catpvf(aTHX_ listsv,
3568d838 3502 "%04"UVxf"\n%04"UVxf"\n", (UV)prevvalue, (UV) '-');
ffc61ed2 3503 }
b8c5462f 3504 }
ffc61ed2
JH
3505
3506 range = 0; /* this was not a true range */
73b437c8 3507 }
ffc61ed2 3508
73b437c8 3509 if (!SIZE_ONLY) {
3568d838
JH
3510 if (namedclass > OOB_NAMEDCLASS)
3511 optimize_invert = FALSE;
e2962f66
JH
3512 /* Possible truncation here but in some 64-bit environments
3513 * the compiler gets heartburn about switch on 64-bit values.
3514 * A similar issue a little earlier when switching on value.
98f323fa 3515 * --jhi */
e2962f66 3516 switch ((I32)namedclass) {
73b437c8
JH
3517 case ANYOF_ALNUM:
3518 if (LOC)
936ed897 3519 ANYOF_CLASS_SET(ret, ANYOF_ALNUM);
73b437c8
JH
3520 else {
3521 for (value = 0; value < 256; value++)
3522 if (isALNUM(value))
936ed897 3523 ANYOF_BITMAP_SET(ret, value);
73b437c8 3524 }
ffc61ed2 3525 Perl_sv_catpvf(aTHX_ listsv, "+utf8::IsWord\n");
73b437c8
JH
3526 break;
3527 case ANYOF_NALNUM:
3528 if (LOC)
936ed897 3529 ANYOF_CLASS_SET(ret, ANYOF_NALNUM);
73b437c8
JH
3530 else {
3531 for (value = 0; value < 256; value++)
3532 if (!isALNUM(value))
936ed897 3533 ANYOF_BITMAP_SET(ret, value);
73b437c8 3534 }
ffc61ed2 3535 Perl_sv_catpvf(aTHX_ listsv, "!utf8::IsWord\n");
73b437c8 3536 break;
ffc61ed2 3537 case ANYOF_ALNUMC:
73b437c8 3538 if (LOC)
ffc61ed2 3539 ANYOF_CLASS_SET(ret, ANYOF_ALNUMC);
73b437c8
JH
3540 else {
3541 for (value = 0; value < 256; value++)
ffc61ed2 3542 if (isALNUMC(value))
936ed897 3543 ANYOF_BITMAP_SET(ret, value);
73b437c8 3544 }
ffc61ed2 3545 Perl_sv_catpvf(aTHX_ listsv, "+utf8::IsAlnum\n");
73b437c8
JH
3546 break;
3547 case ANYOF_NALNUMC:
3548 if (LOC)
936ed897 3549 ANYOF_CLASS_SET(ret, ANYOF_NALNUMC);
73b437c8
JH
3550 else {
3551 for (value = 0; value < 256; value++)
3552 if (!isALNUMC(value))
936ed897 3553 ANYOF_BITMAP_SET(ret, value);
73b437c8 3554 }
ffc61ed2 3555 Perl_sv_catpvf(aTHX_ listsv, "!utf8::IsAlnum\n");
73b437c8
JH
3556 break;
3557 case ANYOF_ALPHA:
3558 if (LOC)
936ed897 3559 ANYOF_CLASS_SET(ret, ANYOF_ALPHA);
73b437c8
JH
3560 else {
3561 for (value = 0; value < 256; value++)
3562 if (isALPHA(value))
936ed897 3563 ANYOF_BITMAP_SET(ret, value);
73b437c8 3564 }
ffc61ed2 3565 Perl_sv_catpvf(aTHX_ listsv, "+utf8::IsAlpha\n");
73b437c8
JH
3566 break;
3567 case ANYOF_NALPHA:
3568 if (LOC)
936ed897 3569 ANYOF_CLASS_SET(ret, ANYOF_NALPHA);
73b437c8
JH
3570 else {
3571 for (value = 0; value < 256; value++)
3572 if (!isALPHA(value))
936ed897 3573 ANYOF_BITMAP_SET(ret, value);
73b437c8 3574 }
ffc61ed2 3575 Perl_sv_catpvf(aTHX_ listsv, "!utf8::IsAlpha\n");
73b437c8
JH
3576 break;
3577 case ANYOF_ASCII:
3578 if (LOC)
936ed897 3579 ANYOF_CLASS_SET(ret, ANYOF_ASCII);
73b437c8 3580 else {
c7f1f016 3581#ifndef EBCDIC
1ba5c669
JH
3582 for (value = 0; value < 128; value++)
3583 ANYOF_BITMAP_SET(ret, value);
3584#else /* EBCDIC */
ffbc6a93 3585 for (value = 0; value < 256; value++) {
3a3c4447
JH
3586 if (isASCII(value))
3587 ANYOF_BITMAP_SET(ret, value);
ffbc6a93 3588 }
1ba5c669 3589#endif /* EBCDIC */
73b437c8 3590 }
ffc61ed2 3591 Perl_sv_catpvf(aTHX_ listsv, "+utf8::IsASCII\n");
73b437c8
JH
3592 break;
3593 case ANYOF_NASCII:
3594 if (LOC)
936ed897 3595 ANYOF_CLASS_SET(ret, ANYOF_NASCII);
73b437c8 3596 else {
c7f1f016 3597#ifndef EBCDIC
1ba5c669
JH
3598 for (value = 128; value < 256; value++)
3599 ANYOF_BITMAP_SET(ret, value);
3600#else /* EBCDIC */
ffbc6a93 3601 for (value = 0; value < 256; value++) {
3a3c4447
JH
3602 if (!isASCII(value))
3603 ANYOF_BITMAP_SET(ret, value);
ffbc6a93 3604 }
1ba5c669 3605#endif /* EBCDIC */
73b437c8 3606 }
ffc61ed2 3607 Perl_sv_catpvf(aTHX_ listsv, "!utf8::IsASCII\n");
73b437c8 3608 break;
aaa51d5e
JF
3609 case ANYOF_BLANK:
3610 if (LOC)
3611 ANYOF_CLASS_SET(ret, ANYOF_BLANK);
3612 else {
3613 for (value = 0; value < 256; value++)
3614 if (isBLANK(value))
3615 ANYOF_BITMAP_SET(ret, value);
3616 }
ffc61ed2 3617 Perl_sv_catpvf(aTHX_ listsv, "+utf8::IsBlank\n");
aaa51d5e
JF
3618 break;
3619 case ANYOF_NBLANK:
3620 if (LOC)
3621 ANYOF_CLASS_SET(ret, ANYOF_NBLANK);
3622 else {
3623 for (value = 0; value < 256; value++)
3624 if (!isBLANK(value))
3625 ANYOF_BITMAP_SET(ret, value);
3626 }
ffc61ed2 3627 Perl_sv_catpvf(aTHX_ listsv, "!utf8::IsBlank\n");
aaa51d5e 3628 break;
73b437c8
JH
3629 case ANYOF_CNTRL:
3630 if (LOC)
936ed897 3631 ANYOF_CLASS_SET(ret, ANYOF_CNTRL);
73b437c8
JH
3632 else {
3633 for (value = 0; value < 256; value++)
3634 if (isCNTRL(value))
936ed897 3635 ANYOF_BITMAP_SET(ret, value);
73b437c8 3636 }
ffc61ed2 3637 Perl_sv_catpvf(aTHX_ listsv, "+utf8::IsCntrl\n");
73b437c8
JH
3638 break;
3639 case ANYOF_NCNTRL:
3640 if (LOC)
936ed897 3641 ANYOF_CLASS_SET(ret, ANYOF_NCNTRL);
73b437c8
JH
3642 else {
3643 for (value = 0; value < 256; value++)
3644 if (!isCNTRL(value))
936ed897 3645 ANYOF_BITMAP_SET(ret, value);
73b437c8 3646 }
ffc61ed2
JH
3647 Perl_sv_catpvf(aTHX_ listsv, "!utf8::IsCntrl\n");
3648 break;
3649 case ANYOF_DIGIT:
3650 if (LOC)
3651 ANYOF_CLASS_SET(ret, ANYOF_DIGIT);
3652 else {
3653 /* consecutive digits assumed */
3654 for (value = '0'; value <= '9'; value++)
3655 ANYOF_BITMAP_SET(ret, value);
3656 }
3657 Perl_sv_catpvf(aTHX_ listsv, "+utf8::IsDigit\n");
3658 break;
3659 case ANYOF_NDIGIT:
3660 if (LOC)
3661 ANYOF_CLASS_SET(ret, ANYOF_NDIGIT);
3662 else {
3663 /* consecutive digits assumed */
3664 for (value = 0; value < '0'; value++)
3665 ANYOF_BITMAP_SET(ret, value);
3666 for (value = '9' + 1; value < 256; value++)
3667 ANYOF_BITMAP_SET(ret, value);
3668 }
3669 Perl_sv_catpvf(aTHX_ listsv, "!utf8::IsDigit\n");
73b437c8
JH
3670 break;
3671 case ANYOF_GRAPH:
3672 if (LOC)
936ed897 3673 ANYOF_CLASS_SET(ret, ANYOF_GRAPH);
73b437c8
JH
3674 else {
3675 for (value = 0; value < 256; value++)
3676 if (isGRAPH(value))
936ed897 3677 ANYOF_BITMAP_SET(ret, value);
73b437c8 3678 }
ffc61ed2 3679 Perl_sv_catpvf(aTHX_ listsv, "+utf8::IsGraph\n");
73b437c8
JH
3680 break;
3681 case ANYOF_NGRAPH:
3682 if (LOC)
936ed897 3683 ANYOF_CLASS_SET(ret, ANYOF_NGRAPH);
73b437c8
JH
3684 else {
3685 for (value = 0; value < 256; value++)
3686 if (!isGRAPH(value))
936ed897 3687 ANYOF_BITMAP_SET(ret, value);
73b437c8 3688 }
ffc61ed2 3689 Perl_sv_catpvf(aTHX_ listsv, "!utf8::IsGraph\n");
73b437c8
JH
3690 break;
3691 case ANYOF_LOWER:
3692 if (LOC)
936ed897 3693 ANYOF_CLASS_SET(ret, ANYOF_LOWER);
73b437c8
JH
3694 else {
3695 for (value = 0; value < 256; value++)
3696 if (isLOWER(value))
936ed897 3697 ANYOF_BITMAP_SET(ret, value);
73b437c8 3698 }
ffc61ed2 3699 Perl_sv_catpvf(aTHX_ listsv, "+utf8::IsLower\n");
73b437c8
JH
3700 break;
3701 case ANYOF_NLOWER:
3702 if (LOC)
936ed897 3703 ANYOF_CLASS_SET(ret, ANYOF_NLOWER);
73b437c8
JH
3704 else {
3705 for (value = 0; value < 256; value++)
3706 if (!isLOWER(value))
936ed897 3707 ANYOF_BITMAP_SET(ret, value);
73b437c8 3708 }
ffc61ed2 3709 Perl_sv_catpvf(aTHX_ listsv, "!utf8::IsLower\n");
73b437c8
JH
3710 break;
3711 case ANYOF_PRINT:
3712 if (LOC)
936ed897 3713 ANYOF_CLASS_SET(ret, ANYOF_PRINT);
73b437c8
JH
3714 else {
3715 for (value = 0; value < 256; value++)
3716 if (isPRINT(value))
936ed897 3717 ANYOF_BITMAP_SET(ret, value);
73b437c8 3718 }
ffc61ed2 3719 Perl_sv_catpvf(aTHX_ listsv, "+utf8::IsPrint\n");
73b437c8
JH
3720 break;
3721 case ANYOF_NPRINT:
3722 if (LOC)
936ed897 3723 ANYOF_CLASS_SET(ret, ANYOF_NPRINT);
73b437c8
JH
3724 else {
3725 for (value = 0; value < 256; value++)
3726 if (!isPRINT(value))
936ed897 3727 ANYOF_BITMAP_SET(ret, value);
73b437c8 3728 }
ffc61ed2 3729 Perl_sv_catpvf(aTHX_ listsv, "!utf8::IsPrint\n");
73b437c8 3730 break;
aaa51d5e
JF
3731 case ANYOF_PSXSPC:
3732 if (LOC)
3733 ANYOF_CLASS_SET(ret, ANYOF_PSXSPC);
3734 else {
3735 for (value = 0; value < 256; value++)
3736 if (isPSXSPC(value))
3737 ANYOF_BITMAP_SET(ret, value);
3738 }
ffc61ed2 3739 Perl_sv_catpvf(aTHX_ listsv, "+utf8::IsSpace\n");
aaa51d5e
JF
3740 break;
3741 case ANYOF_NPSXSPC:
3742 if (LOC)
3743 ANYOF_CLASS_SET(ret, ANYOF_NPSXSPC);
3744 else {
3745 for (value = 0; value < 256; value++)
3746 if (!isPSXSPC(value))
3747 ANYOF_BITMAP_SET(ret, value);
3748 }
ffc61ed2 3749 Perl_sv_catpvf(aTHX_ listsv, "!utf8::IsSpace\n");
aaa51d5e 3750 break;
73b437c8
JH
3751 case ANYOF_PUNCT:
3752 if (LOC)
936ed897 3753 ANYOF_CLASS_SET(ret, ANYOF_PUNCT);
73b437c8
JH
3754 else {
3755 for (value = 0; value < 256; value++)
3756 if (isPUNCT(value))
936ed897 3757 ANYOF_BITMAP_SET(ret, value);
73b437c8 3758 }
ffc61ed2 3759 Perl_sv_catpvf(aTHX_ listsv, "+utf8::IsPunct\n");
73b437c8
JH
3760 break;
3761 case ANYOF_NPUNCT:
3762 if (LOC)
936ed897 3763 ANYOF_CLASS_SET(ret, ANYOF_NPUNCT);
73b437c8
JH
3764 else {
3765 for (value = 0; value < 256; value++)
3766 if (!isPUNCT(value))
936ed897 3767 ANYOF_BITMAP_SET(ret, value);
73b437c8 3768 }
ffc61ed2
JH
3769 Perl_sv_catpvf(aTHX_ listsv, "!utf8::IsPunct\n");
3770 break;
3771 case ANYOF_SPACE:
3772 if (LOC)
3773 ANYOF_CLASS_SET(ret, ANYOF_SPACE);
3774 else {
3775 for (value = 0; value < 256; value++)
3776 if (isSPACE(value))
3777 ANYOF_BITMAP_SET(ret, value);
3778 }
3779 Perl_sv_catpvf(aTHX_ listsv, "+utf8::IsSpacePerl\n");
3780 break;
3781 case ANYOF_NSPACE:
3782 if (LOC)
3783 ANYOF_CLASS_SET(ret, ANYOF_NSPACE);
3784 else {
3785 for (value = 0; value < 256; value++)
3786 if (!isSPACE(value))
3787 ANYOF_BITMAP_SET(ret, value);
3788 }
3789 Perl_sv_catpvf(aTHX_ listsv, "!utf8::IsSpacePerl\n");
73b437c8
JH
3790 break;
3791 case ANYOF_UPPER:
3792 if (LOC)
936ed897 3793 ANYOF_CLASS_SET(ret, ANYOF_UPPER);
73b437c8
JH
3794 else {
3795 for (value = 0; value < 256; value++)
3796 if (isUPPER(value))
936ed897 3797 ANYOF_BITMAP_SET(ret, value);
73b437c8 3798 }
ffc61ed2 3799 Perl_sv_catpvf(aTHX_ listsv, "+utf8::IsUpper\n");
73b437c8
JH
3800 break;
3801 case ANYOF_NUPPER:
3802 if (LOC)
936ed897 3803 ANYOF_CLASS_SET(ret, ANYOF_NUPPER);
73b437c8
JH
3804 else {
3805 for (value = 0; value < 256; value++)
3806 if (!isUPPER(value))
936ed897 3807 ANYOF_BITMAP_SET(ret, value);
73b437c8 3808 }
ffc61ed2 3809 Perl_sv_catpvf(aTHX_ listsv, "!utf8::IsUpper\n");
73b437c8
JH
3810 break;
3811 case ANYOF_XDIGIT:
3812 if (LOC)
936ed897 3813 ANYOF_CLASS_SET(ret, ANYOF_XDIGIT);
73b437c8
JH
3814 else {
3815 for (value = 0; value < 256; value++)
3816 if (isXDIGIT(value))
936ed897 3817 ANYOF_BITMAP_SET(ret, value);
73b437c8 3818 }
ffc61ed2 3819 Perl_sv_catpvf(aTHX_ listsv, "+utf8::IsXDigit\n");
73b437c8
JH
3820 break;
3821 case ANYOF_NXDIGIT:
3822 if (LOC)
936ed897 3823 ANYOF_CLASS_SET(ret, ANYOF_NXDIGIT);
73b437c8
JH
3824 else {
3825 for (value = 0; value < 256; value++)
3826 if (!isXDIGIT(value))
936ed897 3827 ANYOF_BITMAP_SET(ret, value);
73b437c8 3828 }
ffc61ed2 3829 Perl_sv_catpvf(aTHX_ listsv, "!utf8::IsXDigit\n");
73b437c8
JH
3830 break;
3831 default:
b45f050a 3832 vFAIL("Invalid [::] class");
73b437c8 3833 break;
b8c5462f 3834 }
b8c5462f 3835 if (LOC)
936ed897 3836 ANYOF_FLAGS(ret) |= ANYOF_CLASS;
73b437c8 3837 continue;
a0d0e21e 3838 }
ffc61ed2
JH
3839 } /* end of namedclass \blah */
3840
a0d0e21e 3841 if (range) {
3a3c4447 3842 if (prevvalue > value) /* b-a */ {
b45f050a 3843 Simple_vFAIL4("Invalid [] range \"%*.*s\"",
830247a4
IZ
3844 RExC_parse - rangebegin,
3845 RExC_parse - rangebegin,
b45f050a 3846 rangebegin);
3568d838 3847 range = 0; /* not a valid range */
73b437c8 3848 }
a0d0e21e
LW
3849 }
3850 else {
3568d838 3851 prevvalue = value; /* save the beginning of the range */
830247a4
IZ
3852 if (*RExC_parse == '-' && RExC_parse+1 < RExC_end &&
3853 RExC_parse[1] != ']') {
3854 RExC_parse++;
ffc61ed2
JH
3855
3856 /* a bad range like \w-, [:word:]- ? */
3857 if (namedclass > OOB_NAMEDCLASS) {
e476b1b5 3858 if (ckWARN(WARN_REGEXP))
830247a4 3859 vWARN4(RExC_parse,
b45f050a 3860 "False [] range \"%*.*s\"",
830247a4
IZ
3861 RExC_parse - rangebegin,
3862 RExC_parse - rangebegin,
b45f050a 3863 rangebegin);
73b437c8 3864 if (!SIZE_ONLY)
936ed897 3865 ANYOF_BITMAP_SET(ret, '-');
73b437c8 3866 } else
ffc61ed2
JH
3867 range = 1; /* yeah, it's a range! */
3868 continue; /* but do it the next time */
a0d0e21e 3869 }
a687059c 3870 }
ffc61ed2 3871
93733859 3872 /* now is the next time */
ae5c130c 3873 if (!SIZE_ONLY) {
3568d838
JH
3874 IV i;
3875
3876 if (prevvalue < 256) {
3877 IV ceilvalue = value < 256 ? value : 255;
3878
3879#ifdef EBCDIC
3a3c4447
JH
3880 if ((isLOWER(prevvalue) && isLOWER(ceilvalue)) ||
3881 (isUPPER(prevvalue) && isUPPER(ceilvalue)))
ffc61ed2 3882 {
3568d838
JH
3883 if (isLOWER(prevvalue)) {
3884 for (i = prevvalue; i <= ceilvalue; i++)
ffc61ed2
JH
3885 if (isLOWER(i))
3886 ANYOF_BITMAP_SET(ret, i);
3887 } else {
3568d838 3888 for (i = prevvalue; i <= ceilvalue; i++)
ffc61ed2
JH
3889 if (isUPPER(i))
3890 ANYOF_BITMAP_SET(ret, i);
3891 }
8ada0baa 3892 }
ffc61ed2 3893 else
8ada0baa 3894#endif
3568d838
JH
3895 for (i = prevvalue; i <= ceilvalue; i++)
3896 ANYOF_BITMAP_SET(ret, i);
3897 }
3898 if (value > 255) {
ffc61ed2 3899 ANYOF_FLAGS(ret) |= ANYOF_UNICODE;
3568d838 3900 if (prevvalue < value)
ffc61ed2 3901 Perl_sv_catpvf(aTHX_ listsv, "%04"UVxf"\t%04"UVxf"\n",
3568d838
JH
3902 (UV)prevvalue, (UV)value);
3903 else if (prevvalue == value)
ffc61ed2
JH
3904 Perl_sv_catpvf(aTHX_ listsv, "%04"UVxf"\n",
3905 (UV)value);
3906 }
8ada0baa 3907 }
ffc61ed2
JH
3908
3909 range = 0; /* this range (if it was one) is done now */
a0d0e21e 3910 }
ffc61ed2 3911
936ed897 3912 if (need_class) {
4f66b38d 3913 ANYOF_FLAGS(ret) |= ANYOF_LARGE;
936ed897 3914 if (SIZE_ONLY)
830247a4 3915 RExC_size += ANYOF_CLASS_ADD_SKIP;
936ed897 3916 else
830247a4 3917 RExC_emit += ANYOF_CLASS_ADD_SKIP;
936ed897 3918 }
ffc61ed2 3919
ae5c130c 3920 /* optimize case-insensitive simple patterns (e.g. /[a-z]/i) */
b8c5462f 3921 if (!SIZE_ONLY &&
ffc61ed2 3922 /* If the only flag is folding (plus possibly inversion). */
516a5887
JH
3923 ((ANYOF_FLAGS(ret) & (ANYOF_FLAGS_ALL ^ ANYOF_INVERT)) == ANYOF_FOLD)
3924 ) {
a0ed51b3 3925 for (value = 0; value < 256; ++value) {
936ed897 3926 if (ANYOF_BITMAP_TEST(ret, value)) {
ffc61ed2
JH
3927 IV fold = PL_fold[value];
3928
3929 if (fold != value)
3930 ANYOF_BITMAP_SET(ret, fold);
ae5c130c
GS
3931 }
3932 }
936ed897 3933 ANYOF_FLAGS(ret) &= ~ANYOF_FOLD;
ae5c130c 3934 }
ffc61ed2 3935
ae5c130c 3936 /* optimize inverted simple patterns (e.g. [^a-z]) */
3568d838 3937 if (!SIZE_ONLY && optimize_invert &&
ffc61ed2
JH
3938 /* If the only flag is inversion. */
3939 (ANYOF_FLAGS(ret) & ANYOF_FLAGS_ALL) == ANYOF_INVERT) {
b8c5462f 3940 for (value = 0; value < ANYOF_BITMAP_SIZE; ++value)
936ed897 3941 ANYOF_BITMAP(ret)[value] ^= ANYOF_FLAGS_ALL;
1aa99e6b 3942 ANYOF_FLAGS(ret) = ANYOF_UNICODE_ALL;
ae5c130c 3943 }
a0d0e21e 3944
b81d288d 3945 if (!SIZE_ONLY) {
fde631ed 3946 AV *av = newAV();
ffc61ed2
JH
3947 SV *rv;
3948
3949 av_store(av, 0, listsv);
3950 av_store(av, 1, NULL);
3951 rv = newRV_noinc((SV*)av);
19860706 3952 n = add_data(pRExC_state, 1, "s");
830247a4 3953 RExC_rx->data->data[n] = (void*)rv;
ffc61ed2 3954 ARG_SET(ret, n);
a0ed51b3
LW
3955 }
3956
3957 return ret;
3958}
3959
76e3520e 3960STATIC char*
830247a4 3961S_nextchar(pTHX_ RExC_state_t *pRExC_state)
a0d0e21e 3962{
830247a4 3963 char* retval = RExC_parse++;
a0d0e21e 3964
4633a7c4 3965 for (;;) {
830247a4
IZ
3966 if (*RExC_parse == '(' && RExC_parse[1] == '?' &&
3967 RExC_parse[2] == '#') {
3968 while (*RExC_parse && *RExC_parse != ')')
3969 RExC_parse++;
3970 RExC_parse++;
4633a7c4
LW
3971 continue;
3972 }
830247a4
IZ
3973 if (RExC_flags16 & PMf_EXTENDED) {
3974 if (isSPACE(*RExC_parse)) {
3975 RExC_parse++;
748a9306
LW
3976 continue;
3977 }
830247a4
IZ
3978 else if (*RExC_parse == '#') {
3979 while (*RExC_parse && *RExC_parse != '\n')
3980 RExC_parse++;
3981 RExC_parse++;
748a9306
LW
3982 continue;
3983 }
748a9306 3984 }
4633a7c4 3985 return retval;
a0d0e21e 3986 }
a687059c
LW
3987}
3988
3989/*
c277df42 3990- reg_node - emit a node
a0d0e21e 3991*/
76e3520e 3992STATIC regnode * /* Location. */
830247a4 3993S_reg_node(pTHX_ RExC_state_t *pRExC_state, U8 op)
a687059c 3994{
c277df42
IZ
3995 register regnode *ret;
3996 register regnode *ptr;
a687059c 3997
830247a4 3998 ret = RExC_emit;
c277df42 3999 if (SIZE_ONLY) {
830247a4
IZ
4000 SIZE_ALIGN(RExC_size);
4001 RExC_size += 1;
a0d0e21e
LW
4002 return(ret);
4003 }
a687059c 4004
c277df42 4005 NODE_ALIGN_FILL(ret);
a0d0e21e 4006 ptr = ret;
c277df42 4007 FILL_ADVANCE_NODE(ptr, op);
fac92740
MJD
4008 if (RExC_offsets) { /* MJD */
4009 MJD_OFFSET_DEBUG((stderr, "%s:%u: (op %s) %s %u <- %u (len %u) (max %u).\n",
4010 "reg_node", __LINE__,
4011 reg_name[op],
4012 RExC_emit - RExC_emit_start > RExC_offsets[0]
4013 ? "Overwriting end of array!\n" : "OK",
4014 RExC_emit - RExC_emit_start,
4015 RExC_parse - RExC_start,
4016 RExC_offsets[0]));
4017 Set_Node_Offset(RExC_emit, RExC_parse + (op == END));
4018 }
4019
830247a4 4020 RExC_emit = ptr;
a687059c 4021
a0d0e21e 4022 return(ret);
a687059c
LW
4023}
4024
4025/*
a0d0e21e
LW
4026- reganode - emit a node with an argument
4027*/
76e3520e 4028STATIC regnode * /* Location. */
830247a4 4029S_reganode(pTHX_ RExC_state_t *pRExC_state, U8 op, U32 arg)
fe14fcc3 4030{
c277df42
IZ
4031 register regnode *ret;
4032 register regnode *ptr;
fe14fcc3 4033
830247a4 4034 ret = RExC_emit;
c277df42 4035 if (SIZE_ONLY) {
830247a4
IZ
4036 SIZE_ALIGN(RExC_size);
4037 RExC_size += 2;
a0d0e21e
LW
4038 return(ret);
4039 }
fe14fcc3 4040
c277df42 4041 NODE_ALIGN_FILL(ret);
a0d0e21e 4042 ptr = ret;
c277df42 4043 FILL_ADVANCE_NODE_ARG(ptr, op, arg);
fac92740
MJD
4044 if (RExC_offsets) { /* MJD */
4045 MJD_OFFSET_DEBUG((stderr, "%s: %s %u <- %u (max %u).\n",
4046 "reganode",
4047 RExC_emit - RExC_emit_start > RExC_offsets[0] ?
4048 "Overwriting end of array!\n" : "OK",
4049 RExC_emit - RExC_emit_start,
4050 RExC_parse - RExC_start,
4051 RExC_offsets[0]));
4052 Set_Cur_Node_Offset;
4053 }
4054
830247a4 4055 RExC_emit = ptr;
fe14fcc3 4056
a0d0e21e 4057 return(ret);
fe14fcc3
LW
4058}
4059
4060/*
cd439c50 4061- reguni - emit (if appropriate) a Unicode character
a0ed51b3
LW
4062*/
4063STATIC void
830247a4 4064S_reguni(pTHX_ RExC_state_t *pRExC_state, UV uv, char* s, STRLEN* lenp)
a0ed51b3 4065{
5e12f4fb 4066 *lenp = SIZE_ONLY ? UNISKIP(uv) : (uvchr_to_utf8((U8*)s, uv) - (U8*)s);
a0ed51b3
LW
4067}
4068
4069/*
a0d0e21e
LW
4070- reginsert - insert an operator in front of already-emitted operand
4071*
4072* Means relocating the operand.
4073*/
76e3520e 4074STATIC void
830247a4 4075S_reginsert(pTHX_ RExC_state_t *pRExC_state, U8 op, regnode *opnd)
a687059c 4076{
c277df42
IZ
4077 register regnode *src;
4078 register regnode *dst;
4079 register regnode *place;
4080 register int offset = regarglen[(U8)op];
b81d288d 4081
22c35a8c 4082/* (PL_regkind[(U8)op] == CURLY ? EXTRA_STEP_2ARGS : 0); */
c277df42
IZ
4083
4084 if (SIZE_ONLY) {
830247a4 4085 RExC_size += NODE_STEP_REGNODE + offset;
a0d0e21e
LW
4086 return;
4087 }
a687059c 4088
830247a4
IZ
4089 src = RExC_emit;
4090 RExC_emit += NODE_STEP_REGNODE + offset;
4091 dst = RExC_emit;
fac92740 4092 while (src > opnd) {
c277df42 4093 StructCopy(--src, --dst, regnode);
fac92740
MJD
4094 if (RExC_offsets) { /* MJD 20010112 */
4095 MJD_OFFSET_DEBUG((stderr, "%s: %s copy %u -> %u (max %u).\n",
4096 "reg_insert",
4097 dst - RExC_emit_start > RExC_offsets[0]
4098 ? "Overwriting end of array!\n" : "OK",
4099 src - RExC_emit_start,
4100 dst - RExC_emit_start,
4101 RExC_offsets[0]));
4102 Set_Node_Offset_To_R(dst-RExC_emit_start, Node_Offset(src));
4103 Set_Node_Length_To_R(dst-RExC_emit_start, Node_Length(src));
4104 }
4105 }
4106
a0d0e21e
LW
4107
4108 place = opnd; /* Op node, where operand used to be. */
fac92740
MJD
4109 if (RExC_offsets) { /* MJD */
4110 MJD_OFFSET_DEBUG((stderr, "%s: %s %u <- %u (max %u).\n",
4111 "reginsert",
4112 place - RExC_emit_start > RExC_offsets[0]
4113 ? "Overwriting end of array!\n" : "OK",
4114 place - RExC_emit_start,
4115 RExC_parse - RExC_start,
4116 RExC_offsets[0]));
4117 Set_Node_Offset(place, RExC_parse);
4118 }
c277df42
IZ
4119 src = NEXTOPER(place);
4120 FILL_ADVANCE_NODE(place, op);
4121 Zero(src, offset, regnode);
a687059c
LW
4122}
4123
4124/*
c277df42 4125- regtail - set the next-pointer at the end of a node chain of p to val.
a0d0e21e 4126*/
76e3520e 4127STATIC void
830247a4 4128S_regtail(pTHX_ RExC_state_t *pRExC_state, regnode *p, regnode *val)
a687059c 4129{
c277df42
IZ
4130 register regnode *scan;
4131 register regnode *temp;
a0d0e21e 4132
c277df42 4133 if (SIZE_ONLY)
a0d0e21e
LW
4134 return;
4135
4136 /* Find last node. */
4137 scan = p;
4138 for (;;) {
4139 temp = regnext(scan);
4140 if (temp == NULL)
4141 break;
4142 scan = temp;
4143 }
a687059c 4144
c277df42
IZ
4145 if (reg_off_by_arg[OP(scan)]) {
4146 ARG_SET(scan, val - scan);
a0ed51b3
LW
4147 }
4148 else {
c277df42
IZ
4149 NEXT_OFF(scan) = val - scan;
4150 }
a687059c
LW
4151}
4152
4153/*
a0d0e21e
LW
4154- regoptail - regtail on operand of first argument; nop if operandless
4155*/
76e3520e 4156STATIC void
830247a4 4157S_regoptail(pTHX_ RExC_state_t *pRExC_state, regnode *p, regnode *val)
a687059c 4158{
a0d0e21e 4159 /* "Operandless" and "op != BRANCH" are synonymous in practice. */
c277df42
IZ
4160 if (p == NULL || SIZE_ONLY)
4161 return;
22c35a8c 4162 if (PL_regkind[(U8)OP(p)] == BRANCH) {
830247a4 4163 regtail(pRExC_state, NEXTOPER(p), val);
a0ed51b3 4164 }
22c35a8c 4165 else if ( PL_regkind[(U8)OP(p)] == BRANCHJ) {
830247a4 4166 regtail(pRExC_state, NEXTOPER(NEXTOPER(p)), val);
a0ed51b3
LW
4167 }
4168 else
a0d0e21e 4169 return;
a687059c
LW
4170}
4171
4172/*
4173 - regcurly - a little FSA that accepts {\d+,?\d*}
4174 */
79072805 4175STATIC I32
cea2e8a9 4176S_regcurly(pTHX_ register char *s)
a687059c
LW
4177{
4178 if (*s++ != '{')
4179 return FALSE;
f0fcb552 4180 if (!isDIGIT(*s))
a687059c 4181 return FALSE;
f0fcb552 4182 while (isDIGIT(*s))
a687059c
LW
4183 s++;
4184 if (*s == ',')
4185 s++;
f0fcb552 4186 while (isDIGIT(*s))
a687059c
LW
4187 s++;
4188 if (*s != '}')
4189 return FALSE;
4190 return TRUE;
4191}
4192
a687059c 4193
8fa7f367
JH
4194#ifdef DEBUGGING
4195
76e3520e 4196STATIC regnode *
cea2e8a9 4197S_dumpuntil(pTHX_ regnode *start, regnode *node, regnode *last, SV* sv, I32 l)
c277df42 4198{
f248d071 4199 register U8 op = EXACT; /* Arbitrary non-END op. */
155aba94 4200 register regnode *next;
c277df42
IZ
4201
4202 while (op != END && (!last || node < last)) {
4203 /* While that wasn't END last time... */
4204
4205 NODE_ALIGN(node);
4206 op = OP(node);
4207 if (op == CLOSE)
4208 l--;
4209 next = regnext(node);
4210 /* Where, what. */
4211 if (OP(node) == OPTIMIZED)
4212 goto after_print;
4213 regprop(sv, node);
b900a521 4214 PerlIO_printf(Perl_debug_log, "%4"IVdf":%*s%s", (IV)(node - start),
f1dbda3d 4215 (int)(2*l + 1), "", SvPVX(sv));
c277df42
IZ
4216 if (next == NULL) /* Next ptr. */
4217 PerlIO_printf(Perl_debug_log, "(0)");
b81d288d 4218 else
b900a521 4219 PerlIO_printf(Perl_debug_log, "(%"IVdf")", (IV)(next - start));
c277df42
IZ
4220 (void)PerlIO_putc(Perl_debug_log, '\n');
4221 after_print:
22c35a8c 4222 if (PL_regkind[(U8)op] == BRANCHJ) {
b81d288d
AB
4223 register regnode *nnode = (OP(next) == LONGJMP
4224 ? regnext(next)
c277df42
IZ
4225 : next);
4226 if (last && nnode > last)
4227 nnode = last;
4228 node = dumpuntil(start, NEXTOPER(NEXTOPER(node)), nnode, sv, l + 1);
a0ed51b3 4229 }
22c35a8c 4230 else if (PL_regkind[(U8)op] == BRANCH) {
c277df42 4231 node = dumpuntil(start, NEXTOPER(node), next, sv, l + 1);
a0ed51b3
LW
4232 }
4233 else if ( op == CURLY) { /* `next' might be very big: optimizer */
c277df42
IZ
4234 node = dumpuntil(start, NEXTOPER(node) + EXTRA_STEP_2ARGS,
4235 NEXTOPER(node) + EXTRA_STEP_2ARGS + 1, sv, l + 1);
a0ed51b3 4236 }
22c35a8c 4237 else if (PL_regkind[(U8)op] == CURLY && op != CURLYX) {
c277df42
IZ
4238 node = dumpuntil(start, NEXTOPER(node) + EXTRA_STEP_2ARGS,
4239 next, sv, l + 1);
a0ed51b3
LW
4240 }
4241 else if ( op == PLUS || op == STAR) {
c277df42 4242 node = dumpuntil(start, NEXTOPER(node), NEXTOPER(node) + 1, sv, l + 1);
a0ed51b3
LW
4243 }
4244 else if (op == ANYOF) {
4f66b38d
HS
4245 /* arglen 1 + class block */
4246 node += 1 + ((ANYOF_FLAGS(node) & ANYOF_LARGE)
4247 ? ANYOF_CLASS_SKIP : ANYOF_SKIP);
4248 node = NEXTOPER(node);
a0ed51b3 4249 }
22c35a8c 4250 else if (PL_regkind[(U8)op] == EXACT) {
c277df42 4251 /* Literal string, where present. */
cd439c50 4252 node += NODE_SZ_STR(node) - 1;
c277df42 4253 node = NEXTOPER(node);
a0ed51b3
LW
4254 }
4255 else {
c277df42
IZ
4256 node = NEXTOPER(node);
4257 node += regarglen[(U8)op];
4258 }
4259 if (op == CURLYX || op == OPEN)
4260 l++;
4261 else if (op == WHILEM)
4262 l--;
4263 }
4264 return node;
4265}
4266
8fa7f367
JH
4267#endif /* DEBUGGING */
4268
a687059c 4269/*
fd181c75 4270 - regdump - dump a regexp onto Perl_debug_log in vaguely comprehensible form
a687059c
LW
4271 */
4272void
864dbfa3 4273Perl_regdump(pTHX_ regexp *r)
a687059c 4274{
35ff7856 4275#ifdef DEBUGGING
46fc3d4c 4276 SV *sv = sv_newmortal();
a687059c 4277
c277df42 4278 (void)dumpuntil(r->program, r->program + 1, NULL, sv, 0);
a0d0e21e
LW
4279
4280 /* Header fields of interest. */
c277df42 4281 if (r->anchored_substr)
7b0972df 4282 PerlIO_printf(Perl_debug_log,
b81d288d 4283 "anchored `%s%.*s%s'%s at %"IVdf" ",
3280af22 4284 PL_colors[0],
7b0972df 4285 (int)(SvCUR(r->anchored_substr) - (SvTAIL(r->anchored_substr)!=0)),
b81d288d 4286 SvPVX(r->anchored_substr),
3280af22 4287 PL_colors[1],
c277df42 4288 SvTAIL(r->anchored_substr) ? "$" : "",
7b0972df 4289 (IV)r->anchored_offset);
c277df42 4290 if (r->float_substr)
7b0972df 4291 PerlIO_printf(Perl_debug_log,
b81d288d 4292 "floating `%s%.*s%s'%s at %"IVdf"..%"UVuf" ",
3280af22 4293 PL_colors[0],
b81d288d 4294 (int)(SvCUR(r->float_substr) - (SvTAIL(r->float_substr)!=0)),
2c2d71f5 4295 SvPVX(r->float_substr),
3280af22 4296 PL_colors[1],
c277df42 4297 SvTAIL(r->float_substr) ? "$" : "",
7b0972df 4298 (IV)r->float_min_offset, (UV)r->float_max_offset);
c277df42 4299 if (r->check_substr)
b81d288d
AB
4300 PerlIO_printf(Perl_debug_log,
4301 r->check_substr == r->float_substr
c277df42
IZ
4302 ? "(checking floating" : "(checking anchored");
4303 if (r->reganch & ROPT_NOSCAN)
4304 PerlIO_printf(Perl_debug_log, " noscan");
4305 if (r->reganch & ROPT_CHECK_ALL)
4306 PerlIO_printf(Perl_debug_log, " isall");
4307 if (r->check_substr)
4308 PerlIO_printf(Perl_debug_log, ") ");
4309
46fc3d4c 4310 if (r->regstclass) {
4311 regprop(sv, r->regstclass);
4312 PerlIO_printf(Perl_debug_log, "stclass `%s' ", SvPVX(sv));
4313 }
774d564b 4314 if (r->reganch & ROPT_ANCH) {
4315 PerlIO_printf(Perl_debug_log, "anchored");
4316 if (r->reganch & ROPT_ANCH_BOL)
4317 PerlIO_printf(Perl_debug_log, "(BOL)");
c277df42
IZ
4318 if (r->reganch & ROPT_ANCH_MBOL)
4319 PerlIO_printf(Perl_debug_log, "(MBOL)");
cad2e5aa
JH
4320 if (r->reganch & ROPT_ANCH_SBOL)
4321 PerlIO_printf(Perl_debug_log, "(SBOL)");
774d564b 4322 if (r->reganch & ROPT_ANCH_GPOS)
4323 PerlIO_printf(Perl_debug_log, "(GPOS)");
4324 PerlIO_putc(Perl_debug_log, ' ');
4325 }
c277df42
IZ
4326 if (r->reganch & ROPT_GPOS_SEEN)
4327 PerlIO_printf(Perl_debug_log, "GPOS ");
a0d0e21e 4328 if (r->reganch & ROPT_SKIP)
760ac839 4329 PerlIO_printf(Perl_debug_log, "plus ");
a0d0e21e 4330 if (r->reganch & ROPT_IMPLICIT)
760ac839 4331 PerlIO_printf(Perl_debug_log, "implicit ");
760ac839 4332 PerlIO_printf(Perl_debug_log, "minlen %ld ", (long) r->minlen);
ce862d02
IZ
4333 if (r->reganch & ROPT_EVAL_SEEN)
4334 PerlIO_printf(Perl_debug_log, "with eval ");
760ac839 4335 PerlIO_printf(Perl_debug_log, "\n");
fac92740
MJD
4336 if (r->offsets) {
4337 U32 i;
4338 U32 len = r->offsets[0];
392fbf5d 4339 PerlIO_printf(Perl_debug_log, "Offsets: [%"UVuf"]\n\t", (UV)r->offsets[0]);
fac92740 4340 for (i = 1; i <= len; i++)
392fbf5d
RB
4341 PerlIO_printf(Perl_debug_log, "%"UVuf"[%"UVuf"] ",
4342 (UV)r->offsets[i*2-1],
4343 (UV)r->offsets[i*2]);
fac92740
MJD
4344 PerlIO_printf(Perl_debug_log, "\n");
4345 }
17c3b450 4346#endif /* DEBUGGING */
a687059c
LW
4347}
4348
8fa7f367
JH
4349#ifdef DEBUGGING
4350
653099ff
GS
4351STATIC void
4352S_put_byte(pTHX_ SV *sv, int c)
4353{
ffc61ed2 4354 if (isCNTRL(c) || c == 127 || c == 255 || !isPRINT(c))
653099ff
GS
4355 Perl_sv_catpvf(aTHX_ sv, "\\%o", c);
4356 else if (c == '-' || c == ']' || c == '\\' || c == '^')
4357 Perl_sv_catpvf(aTHX_ sv, "\\%c", c);
4358 else
4359 Perl_sv_catpvf(aTHX_ sv, "%c", c);
4360}
4361
8fa7f367
JH
4362#endif /* DEBUGGING */
4363
a687059c 4364/*
a0d0e21e
LW
4365- regprop - printable representation of opcode
4366*/
46fc3d4c 4367void
864dbfa3 4368Perl_regprop(pTHX_ SV *sv, regnode *o)
a687059c 4369{
35ff7856 4370#ifdef DEBUGGING
9b155405 4371 register int k;
a0d0e21e 4372
54dc92de 4373 sv_setpvn(sv, "", 0);
9b155405 4374 if (OP(o) >= reg_num) /* regnode.type is unsigned */
830247a4
IZ
4375 /* It would be nice to FAIL() here, but this may be called from
4376 regexec.c, and it would be hard to supply pRExC_state. */
4377 Perl_croak(aTHX_ "Corrupted regexp opcode");
9b155405
IZ
4378 sv_catpv(sv, (char*)reg_name[OP(o)]); /* Take off const! */
4379
4380 k = PL_regkind[(U8)OP(o)];
4381
4382 if (k == EXACT)
7821416a 4383 Perl_sv_catpvf(aTHX_ sv, " <%s%.*s%s>", PL_colors[0],
cd439c50 4384 STR_LEN(o), STRING(o), PL_colors[1]);
9b155405 4385 else if (k == CURLY) {
cb434fcc 4386 if (OP(o) == CURLYM || OP(o) == CURLYN || OP(o) == CURLYX)
cea2e8a9
GS
4387 Perl_sv_catpvf(aTHX_ sv, "[%d]", o->flags); /* Parenth number */
4388 Perl_sv_catpvf(aTHX_ sv, " {%d,%d}", ARG1(o), ARG2(o));
a0d0e21e 4389 }
2c2d71f5
JH
4390 else if (k == WHILEM && o->flags) /* Ordinal/of */
4391 Perl_sv_catpvf(aTHX_ sv, "[%d/%d]", o->flags & 0xf, o->flags>>4);
9b155405 4392 else if (k == REF || k == OPEN || k == CLOSE || k == GROUPP )
894356b3 4393 Perl_sv_catpvf(aTHX_ sv, "%d", (int)ARG(o)); /* Parenth number */
9b155405 4394 else if (k == LOGICAL)
04ebc1ab 4395 Perl_sv_catpvf(aTHX_ sv, "[%d]", o->flags); /* 2: embedded, otherwise 1 */
653099ff
GS
4396 else if (k == ANYOF) {
4397 int i, rangestart = -1;
ffc61ed2 4398 U8 flags = ANYOF_FLAGS(o);
19860706
JH
4399 const char * const anyofs[] = { /* Should be syncronized with
4400 * ANYOF_ #xdefines in regcomp.h */
653099ff
GS
4401 "\\w",
4402 "\\W",
4403 "\\s",
4404 "\\S",
4405 "\\d",
4406 "\\D",
4407 "[:alnum:]",
4408 "[:^alnum:]",
4409 "[:alpha:]",
4410 "[:^alpha:]",
4411 "[:ascii:]",
4412 "[:^ascii:]",
4413 "[:ctrl:]",
4414 "[:^ctrl:]",
4415 "[:graph:]",
4416 "[:^graph:]",
4417 "[:lower:]",
4418 "[:^lower:]",
4419 "[:print:]",
4420 "[:^print:]",
4421 "[:punct:]",
4422 "[:^punct:]",
4423 "[:upper:]",
aaa51d5e 4424 "[:^upper:]",
653099ff 4425 "[:xdigit:]",
aaa51d5e
JF
4426 "[:^xdigit:]",
4427 "[:space:]",
4428 "[:^space:]",
4429 "[:blank:]",
4430 "[:^blank:]"
653099ff
GS
4431 };
4432
19860706 4433 if (flags & ANYOF_LOCALE)
653099ff 4434 sv_catpv(sv, "{loc}");
19860706 4435 if (flags & ANYOF_FOLD)
653099ff
GS
4436 sv_catpv(sv, "{i}");
4437 Perl_sv_catpvf(aTHX_ sv, "[%s", PL_colors[0]);
19860706 4438 if (flags & ANYOF_INVERT)
653099ff 4439 sv_catpv(sv, "^");
ffc61ed2
JH
4440 for (i = 0; i <= 256; i++) {
4441 if (i < 256 && ANYOF_BITMAP_TEST(o,i)) {
4442 if (rangestart == -1)
4443 rangestart = i;
4444 } else if (rangestart != -1) {
4445 if (i <= rangestart + 3)
4446 for (; rangestart < i; rangestart++)
653099ff 4447 put_byte(sv, rangestart);
ffc61ed2
JH
4448 else {
4449 put_byte(sv, rangestart);
4450 sv_catpv(sv, "-");
4451 put_byte(sv, i - 1);
653099ff 4452 }
ffc61ed2 4453 rangestart = -1;
653099ff 4454 }
847a199f 4455 }
ffc61ed2
JH
4456
4457 if (o->flags & ANYOF_CLASS)
4458 for (i = 0; i < sizeof(anyofs)/sizeof(char*); i++)
4459 if (ANYOF_CLASS_TEST(o,i))
4460 sv_catpv(sv, anyofs[i]);
4461
4462 if (flags & ANYOF_UNICODE)
4463 sv_catpv(sv, "{unicode}");
1aa99e6b
IH
4464 else if (flags & ANYOF_UNICODE_ALL)
4465 sv_catpv(sv, "{all-unicode}");
ffc61ed2
JH
4466
4467 {
4468 SV *lv;
4469 SV *sw = regclass_swash(o, FALSE, &lv);
b81d288d 4470
ffc61ed2
JH
4471 if (lv) {
4472 if (sw) {
4473 UV i;
4474 U8 s[UTF8_MAXLEN+1];
b81d288d 4475
ffc61ed2 4476 for (i = 0; i <= 256; i++) { /* just the first 256 */
2b9d42f0 4477 U8 *e = uvchr_to_utf8(s, i);
ffc61ed2 4478
3568d838 4479 if (i < 256 && swash_fetch(sw, s, TRUE)) {
ffc61ed2
JH
4480 if (rangestart == -1)
4481 rangestart = i;
4482 } else if (rangestart != -1) {
4483 U8 *p;
b81d288d 4484
ffc61ed2
JH
4485 if (i <= rangestart + 3)
4486 for (; rangestart < i; rangestart++) {
2b9d42f0 4487 for(e = uvchr_to_utf8(s, rangestart), p = s; p < e; p++)
ffc61ed2
JH
4488 put_byte(sv, *p);
4489 }
4490 else {
2b9d42f0 4491 for (e = uvchr_to_utf8(s, rangestart), p = s; p < e; p++)
ffc61ed2
JH
4492 put_byte(sv, *p);
4493 sv_catpv(sv, "-");
2b9d42f0 4494 for (e = uvchr_to_utf8(s, i - 1), p = s; p < e; p++)
ffc61ed2
JH
4495 put_byte(sv, *p);
4496 }
4497 rangestart = -1;
4498 }
19860706 4499 }
ffc61ed2
JH
4500
4501 sv_catpv(sv, "..."); /* et cetera */
19860706 4502 }
fde631ed 4503
ffc61ed2
JH
4504 {
4505 char *s = savepv(SvPVX(lv));
4506 char *origs = s;
b81d288d 4507
ffc61ed2 4508 while(*s && *s != '\n') s++;
b81d288d 4509
ffc61ed2
JH
4510 if (*s == '\n') {
4511 char *t = ++s;
4512
4513 while (*s) {
4514 if (*s == '\n')
4515 *s = ' ';
4516 s++;
4517 }
4518 if (s[-1] == ' ')
4519 s[-1] = 0;
4520
4521 sv_catpv(sv, t);
fde631ed 4522 }
b81d288d 4523
ffc61ed2 4524 Safefree(origs);
fde631ed
JH
4525 }
4526 }
653099ff 4527 }
ffc61ed2 4528
653099ff
GS
4529 Perl_sv_catpvf(aTHX_ sv, "%s]", PL_colors[1]);
4530 }
9b155405 4531 else if (k == BRANCHJ && (OP(o) == UNLESSM || OP(o) == IFMATCH))
cea2e8a9 4532 Perl_sv_catpvf(aTHX_ sv, "[-%d]", o->flags);
17c3b450 4533#endif /* DEBUGGING */
35ff7856 4534}
a687059c 4535
cad2e5aa
JH
4536SV *
4537Perl_re_intuit_string(pTHX_ regexp *prog)
4538{ /* Assume that RE_INTUIT is set */
4539 DEBUG_r(
4540 { STRLEN n_a;
4541 char *s = SvPV(prog->check_substr,n_a);
4542
4543 if (!PL_colorset) reginitcolors();
4544 PerlIO_printf(Perl_debug_log,
4545 "%sUsing REx substr:%s `%s%.60s%s%s'\n",
4546 PL_colors[4],PL_colors[5],PL_colors[0],
4547 s,
4548 PL_colors[1],
4549 (strlen(s) > 60 ? "..." : ""));
4550 } );
4551
4552 return prog->check_substr;
4553}
4554
2b69d0c2 4555void
864dbfa3 4556Perl_pregfree(pTHX_ struct regexp *r)
a687059c 4557{
adac82c7 4558 DEBUG_r(if (!PL_colorset) reginitcolors());
7821416a
IZ
4559
4560 if (!r || (--r->refcnt > 0))
4561 return;
cad2e5aa
JH
4562 DEBUG_r(PerlIO_printf(Perl_debug_log,
4563 "%sFreeing REx:%s `%s%.60s%s%s'\n",
4564 PL_colors[4],PL_colors[5],PL_colors[0],
4565 r->precomp,
4566 PL_colors[1],
4567 (strlen(r->precomp) > 60 ? "..." : "")));
4568
c277df42 4569 if (r->precomp)
a0d0e21e 4570 Safefree(r->precomp);
fac92740
MJD
4571 if (r->offsets) /* 20010421 MJD */
4572 Safefree(r->offsets);
cf93c79d
IZ
4573 if (RX_MATCH_COPIED(r))
4574 Safefree(r->subbeg);
a193d654
GS
4575 if (r->substrs) {
4576 if (r->anchored_substr)
4577 SvREFCNT_dec(r->anchored_substr);
4578 if (r->float_substr)
4579 SvREFCNT_dec(r->float_substr);
2779dcf1 4580 Safefree(r->substrs);
a193d654 4581 }
c277df42
IZ
4582 if (r->data) {
4583 int n = r->data->count;
dfad63ad
HS
4584 AV* new_comppad = NULL;
4585 AV* old_comppad;
4586 SV** old_curpad;
4587
c277df42
IZ
4588 while (--n >= 0) {
4589 switch (r->data->what[n]) {
4590 case 's':
4591 SvREFCNT_dec((SV*)r->data->data[n]);
4592 break;
653099ff
GS
4593 case 'f':
4594 Safefree(r->data->data[n]);
4595 break;
dfad63ad
HS
4596 case 'p':
4597 new_comppad = (AV*)r->data->data[n];
4598 break;
c277df42 4599 case 'o':
dfad63ad 4600 if (new_comppad == NULL)
cea2e8a9 4601 Perl_croak(aTHX_ "panic: pregfree comppad");
dfad63ad
HS
4602 old_comppad = PL_comppad;
4603 old_curpad = PL_curpad;
1e6dc0b6
SB
4604 /* Watch out for global destruction's random ordering. */
4605 if (SvTYPE(new_comppad) == SVt_PVAV) {
4606 PL_comppad = new_comppad;
4607 PL_curpad = AvARRAY(new_comppad);
4608 }
4609 else
4610 PL_curpad = NULL;
c277df42 4611 op_free((OP_4tree*)r->data->data[n]);
dfad63ad
HS
4612 PL_comppad = old_comppad;
4613 PL_curpad = old_curpad;
4614 SvREFCNT_dec((SV*)new_comppad);
4615 new_comppad = NULL;
c277df42
IZ
4616 break;
4617 case 'n':
4618 break;
4619 default:
830247a4 4620 Perl_croak(aTHX_ "panic: regfree data code '%c'", r->data->what[n]);
c277df42
IZ
4621 }
4622 }
4623 Safefree(r->data->what);
4624 Safefree(r->data);
a0d0e21e
LW
4625 }
4626 Safefree(r->startp);
4627 Safefree(r->endp);
4628 Safefree(r);
a687059c 4629}
c277df42
IZ
4630
4631/*
4632 - regnext - dig the "next" pointer out of a node
4633 *
4634 * [Note, when REGALIGN is defined there are two places in regmatch()
4635 * that bypass this code for speed.]
4636 */
4637regnode *
864dbfa3 4638Perl_regnext(pTHX_ register regnode *p)
c277df42
IZ
4639{
4640 register I32 offset;
4641
3280af22 4642 if (p == &PL_regdummy)
c277df42
IZ
4643 return(NULL);
4644
4645 offset = (reg_off_by_arg[OP(p)] ? ARG(p) : NEXT_OFF(p));
4646 if (offset == 0)
4647 return(NULL);
4648
c277df42 4649 return(p+offset);
c277df42
IZ
4650}
4651
01f988be 4652STATIC void
cea2e8a9 4653S_re_croak2(pTHX_ const char* pat1,const char* pat2,...)
c277df42
IZ
4654{
4655 va_list args;
4656 STRLEN l1 = strlen(pat1);
4657 STRLEN l2 = strlen(pat2);
4658 char buf[512];
06bf62c7 4659 SV *msv;
c277df42
IZ
4660 char *message;
4661
4662 if (l1 > 510)
4663 l1 = 510;
4664 if (l1 + l2 > 510)
4665 l2 = 510 - l1;
4666 Copy(pat1, buf, l1 , char);
4667 Copy(pat2, buf + l1, l2 , char);
3b818b81
GS
4668 buf[l1 + l2] = '\n';
4669 buf[l1 + l2 + 1] = '\0';
8736538c
AS
4670#ifdef I_STDARG
4671 /* ANSI variant takes additional second argument */
c277df42 4672 va_start(args, pat2);
8736538c
AS
4673#else
4674 va_start(args);
4675#endif
5a844595 4676 msv = vmess(buf, &args);
c277df42 4677 va_end(args);
06bf62c7 4678 message = SvPV(msv,l1);
c277df42
IZ
4679 if (l1 > 512)
4680 l1 = 512;
4681 Copy(message, buf, l1 , char);
4682 buf[l1] = '\0'; /* Overwrite \n */
cea2e8a9 4683 Perl_croak(aTHX_ "%s", buf);
c277df42 4684}
a0ed51b3
LW
4685
4686/* XXX Here's a total kludge. But we need to re-enter for swash routines. */
4687
4688void
864dbfa3 4689Perl_save_re_context(pTHX)
b81d288d 4690{
830247a4
IZ
4691#if 0
4692 SAVEPPTR(RExC_precomp); /* uncompiled string. */
4693 SAVEI32(RExC_npar); /* () count. */
4694 SAVEI32(RExC_size); /* Code size. */
4695 SAVEI16(RExC_flags16); /* are we folding, multilining? */
4696 SAVEVPTR(RExC_rx); /* from regcomp.c */
4697 SAVEI32(RExC_seen); /* from regcomp.c */
4698 SAVEI32(RExC_sawback); /* Did we see \1, ...? */
4699 SAVEI32(RExC_naughty); /* How bad is this pattern? */
4700 SAVEVPTR(RExC_emit); /* Code-emit pointer; &regdummy = don't */
4701 SAVEPPTR(RExC_end); /* End of input for compile */
4702 SAVEPPTR(RExC_parse); /* Input-scan pointer. */
4703#endif
4704
4705 SAVEI32(PL_reg_flags); /* from regexec.c */
a0ed51b3 4706 SAVEPPTR(PL_bostr);
a0ed51b3
LW
4707 SAVEPPTR(PL_reginput); /* String-input pointer. */
4708 SAVEPPTR(PL_regbol); /* Beginning of input, for ^ check. */
4709 SAVEPPTR(PL_regeol); /* End of input, for $ check. */
7766f137
GS
4710 SAVEVPTR(PL_regstartp); /* Pointer to startp array. */
4711 SAVEVPTR(PL_regendp); /* Ditto for endp. */
4712 SAVEVPTR(PL_reglastparen); /* Similarly for lastparen. */
a0ed51b3 4713 SAVEPPTR(PL_regtill); /* How far we are required to go. */
b81d288d 4714 SAVEGENERICPV(PL_reg_start_tmp); /* from regexec.c */
a0ed51b3 4715 PL_reg_start_tmp = 0;
a0ed51b3
LW
4716 SAVEI32(PL_reg_start_tmpl); /* from regexec.c */
4717 PL_reg_start_tmpl = 0;
7766f137 4718 SAVEVPTR(PL_regdata);
a0ed51b3
LW
4719 SAVEI32(PL_reg_eval_set); /* from regexec.c */
4720 SAVEI32(PL_regnarrate); /* from regexec.c */
7766f137 4721 SAVEVPTR(PL_regprogram); /* from regexec.c */
a0ed51b3 4722 SAVEINT(PL_regindent); /* from regexec.c */
7766f137
GS
4723 SAVEVPTR(PL_regcc); /* from regexec.c */
4724 SAVEVPTR(PL_curcop);
7766f137
GS
4725 SAVEVPTR(PL_reg_call_cc); /* from regexec.c */
4726 SAVEVPTR(PL_reg_re); /* from regexec.c */
54b6e2fa
IZ
4727 SAVEPPTR(PL_reg_ganch); /* from regexec.c */
4728 SAVESPTR(PL_reg_sv); /* from regexec.c */
7766f137 4729 SAVEVPTR(PL_reg_magic); /* from regexec.c */
54b6e2fa 4730 SAVEI32(PL_reg_oldpos); /* from regexec.c */
7766f137
GS
4731 SAVEVPTR(PL_reg_oldcurpm); /* from regexec.c */
4732 SAVEVPTR(PL_reg_curpm); /* from regexec.c */
5fb7366e 4733 SAVEI32(PL_regnpar); /* () count. */
e49a9654 4734 SAVEI32(PL_regsize); /* from regexec.c */
54b6e2fa 4735#ifdef DEBUGGING
b81d288d 4736 SAVEPPTR(PL_reg_starttry); /* from regexec.c */
54b6e2fa 4737#endif
a0ed51b3 4738}
51371543
GS
4739
4740#ifdef PERL_OBJECT
51371543
GS
4741#include "XSUB.h"
4742#undef this
4743#define this pPerl
4744#endif
4745
4746static void
4747clear_re(pTHXo_ void *r)
4748{
4749 ReREFCNT_dec((regexp *)r);
4750}
ffbc6a93 4751