This is a live mirror of the Perl 5 development currently hosted at https://github.com/perl/perl5
[DOC PATCH] Add perl4 warning messages to perldiag.pod
[perl5.git] / regcomp.c
CommitLineData
a0d0e21e
LW
1/* regcomp.c
2 */
3
4/*
5 * "A fair jaw-cracker dwarf-language must be." --Samwise Gamgee
6 */
7
a687059c
LW
8/* NOTE: this is derived from Henry Spencer's regexp code, and should not
9 * confused with the original package (see point 3 below). Thanks, Henry!
10 */
11
12/* Additional note: this code is very heavily munged from Henry's version
13 * in places. In some spots I've traded clarity for efficiency, so don't
14 * blame Henry for some of the lack of readability.
15 */
16
e50aee73
AD
17/* The names of the functions have been changed from regcomp and
18 * regexec to pregcomp and pregexec in order to avoid conflicts
19 * with the POSIX routines of the same names.
20*/
21
b9d5759e
AD
22#ifdef PERL_EXT_RE_BUILD
23/* need to replace pregcomp et al, so enable that */
24# ifndef PERL_IN_XSUB_RE
25# define PERL_IN_XSUB_RE
26# endif
27/* need access to debugger hooks */
cad2e5aa 28# if defined(PERL_EXT_RE_DEBUG) && !defined(DEBUGGING)
b9d5759e
AD
29# define DEBUGGING
30# endif
31#endif
32
33#ifdef PERL_IN_XSUB_RE
d06ea78c 34/* We *really* need to overwrite these symbols: */
56953603
IZ
35# define Perl_pregcomp my_regcomp
36# define Perl_regdump my_regdump
37# define Perl_regprop my_regprop
d06ea78c 38# define Perl_pregfree my_regfree
cad2e5aa
JH
39# define Perl_re_intuit_string my_re_intuit_string
40/* *These* symbols are masked to allow static link. */
d06ea78c 41# define Perl_regnext my_regnext
f0b8d043 42# define Perl_save_re_context my_save_re_context
b81d288d 43# define Perl_reginitcolors my_reginitcolors
c5be433b
GS
44
45# define PERL_NO_GET_CONTEXT
b81d288d 46#endif
56953603 47
f0fcb552 48/*SUPPRESS 112*/
a687059c 49/*
e50aee73 50 * pregcomp and pregexec -- regsub and regerror are not used in perl
a687059c
LW
51 *
52 * Copyright (c) 1986 by University of Toronto.
53 * Written by Henry Spencer. Not derived from licensed software.
54 *
55 * Permission is granted to anyone to use this software for any
56 * purpose on any computer system, and to redistribute it freely,
57 * subject to the following restrictions:
58 *
59 * 1. The author is not responsible for the consequences of use of
60 * this software, no matter how awful, even if they arise
61 * from defects in it.
62 *
63 * 2. The origin of this software must not be misrepresented, either
64 * by explicit claim or by omission.
65 *
66 * 3. Altered versions must be plainly marked as such, and must not
67 * be misrepresented as being the original software.
68 *
69 *
70 **** Alterations to Henry's code are...
71 ****
bc89e66f 72 **** Copyright (c) 1991-2001, Larry Wall
a687059c 73 ****
9ef589d8
LW
74 **** You may distribute under the terms of either the GNU General Public
75 **** License or the Artistic License, as specified in the README file.
76
a687059c
LW
77 *
78 * Beware that some of this code is subtly aware of the way operator
79 * precedence is structured in regular expressions. Serious changes in
80 * regular-expression syntax might require a total rethink.
81 */
82#include "EXTERN.h"
864dbfa3 83#define PERL_IN_REGCOMP_C
a687059c 84#include "perl.h"
d06ea78c 85
c5be433b
GS
86#ifdef PERL_IN_XSUB_RE
87# if defined(PERL_CAPI) || defined(PERL_OBJECT)
88# include "XSUB.h"
89# endif
90#else
d06ea78c
GS
91# include "INTERN.h"
92#endif
c277df42
IZ
93
94#define REG_COMP_C
a687059c
LW
95#include "regcomp.h"
96
d4cce5f1 97#ifdef op
11343788 98#undef op
d4cce5f1 99#endif /* op */
11343788 100
fe14fcc3
LW
101#ifdef MSDOS
102# if defined(BUGGY_MSC6)
103 /* MSC 6.00A breaks on op/regexp.t test 85 unless we turn this off */
104 # pragma optimize("a",off)
105 /* But MSC 6.00A is happy with 'w', for aliases only across function calls*/
106 # pragma optimize("w",on )
107# endif /* BUGGY_MSC6 */
108#endif /* MSDOS */
109
a687059c
LW
110#ifndef STATIC
111#define STATIC static
112#endif
113
830247a4
IZ
114typedef struct RExC_state_t {
115 U16 flags16; /* are we folding, multilining? */
116 char *precomp; /* uncompiled string. */
117 regexp *rx;
fac92740 118 char *start; /* Start of input for compile */
830247a4
IZ
119 char *end; /* End of input for compile */
120 char *parse; /* Input-scan pointer. */
121 I32 whilem_seen; /* number of WHILEM in this expr */
fac92740 122 regnode *emit_start; /* Start of emitted-code area */
ffc61ed2 123 regnode *emit; /* Code-emit pointer; &regdummy = don't = compiling */
830247a4
IZ
124 I32 naughty; /* How bad is this pattern? */
125 I32 sawback; /* Did we see \1, ...? */
126 U32 seen;
127 I32 size; /* Code size. */
128 I32 npar; /* () count. */
129 I32 extralen;
130 I32 seen_zerolen;
131 I32 seen_evals;
1aa99e6b 132 I32 utf8;
830247a4
IZ
133#if ADD_TO_REGEXEC
134 char *starttry; /* -Dr: where regtry was called. */
135#define RExC_starttry (pRExC_state->starttry)
136#endif
137} RExC_state_t;
138
139#define RExC_flags16 (pRExC_state->flags16)
140#define RExC_precomp (pRExC_state->precomp)
141#define RExC_rx (pRExC_state->rx)
fac92740 142#define RExC_start (pRExC_state->start)
830247a4
IZ
143#define RExC_end (pRExC_state->end)
144#define RExC_parse (pRExC_state->parse)
145#define RExC_whilem_seen (pRExC_state->whilem_seen)
fac92740 146#define RExC_offsets (pRExC_state->rx->offsets) /* I am not like the others */
830247a4 147#define RExC_emit (pRExC_state->emit)
fac92740 148#define RExC_emit_start (pRExC_state->emit_start)
830247a4
IZ
149#define RExC_naughty (pRExC_state->naughty)
150#define RExC_sawback (pRExC_state->sawback)
151#define RExC_seen (pRExC_state->seen)
152#define RExC_size (pRExC_state->size)
153#define RExC_npar (pRExC_state->npar)
154#define RExC_extralen (pRExC_state->extralen)
155#define RExC_seen_zerolen (pRExC_state->seen_zerolen)
156#define RExC_seen_evals (pRExC_state->seen_evals)
1aa99e6b 157#define RExC_utf8 (pRExC_state->utf8)
830247a4 158
a687059c
LW
159#define ISMULT1(c) ((c) == '*' || (c) == '+' || (c) == '?')
160#define ISMULT2(s) ((*s) == '*' || (*s) == '+' || (*s) == '?' || \
161 ((*s) == '{' && regcurly(s)))
a687059c 162
35c8bce7
LW
163#ifdef SPSTART
164#undef SPSTART /* dratted cpp namespace... */
165#endif
a687059c
LW
166/*
167 * Flags to be passed up and down.
168 */
a687059c 169#define WORST 0 /* Worst case. */
821b33a5 170#define HASWIDTH 0x1 /* Known to match non-null strings. */
a0d0e21e
LW
171#define SIMPLE 0x2 /* Simple enough to be STAR/PLUS operand. */
172#define SPSTART 0x4 /* Starts with * or +. */
173#define TRYAGAIN 0x8 /* Weeded out a declaration. */
a687059c 174
2c2d71f5
JH
175/* Length of a variant. */
176
177typedef struct scan_data_t {
178 I32 len_min;
179 I32 len_delta;
180 I32 pos_min;
181 I32 pos_delta;
182 SV *last_found;
183 I32 last_end; /* min value, <0 unless valid. */
184 I32 last_start_min;
185 I32 last_start_max;
186 SV **longest; /* Either &l_fixed, or &l_float. */
187 SV *longest_fixed;
188 I32 offset_fixed;
189 SV *longest_float;
190 I32 offset_float_min;
191 I32 offset_float_max;
192 I32 flags;
193 I32 whilem_c;
cb434fcc 194 I32 *last_closep;
653099ff 195 struct regnode_charclass_class *start_class;
2c2d71f5
JH
196} scan_data_t;
197
a687059c 198/*
e50aee73 199 * Forward declarations for pregcomp()'s friends.
a687059c 200 */
a0d0e21e 201
b81d288d 202static scan_data_t zero_scan_data = { 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
cb434fcc 203 0, 0, 0, 0, 0, 0};
c277df42
IZ
204
205#define SF_BEFORE_EOL (SF_BEFORE_SEOL|SF_BEFORE_MEOL)
206#define SF_BEFORE_SEOL 0x1
207#define SF_BEFORE_MEOL 0x2
208#define SF_FIX_BEFORE_EOL (SF_FIX_BEFORE_SEOL|SF_FIX_BEFORE_MEOL)
209#define SF_FL_BEFORE_EOL (SF_FL_BEFORE_SEOL|SF_FL_BEFORE_MEOL)
210
09b7f37c
CB
211#ifdef NO_UNARY_PLUS
212# define SF_FIX_SHIFT_EOL (0+2)
213# define SF_FL_SHIFT_EOL (0+4)
214#else
215# define SF_FIX_SHIFT_EOL (+2)
216# define SF_FL_SHIFT_EOL (+4)
217#endif
c277df42
IZ
218
219#define SF_FIX_BEFORE_SEOL (SF_BEFORE_SEOL << SF_FIX_SHIFT_EOL)
220#define SF_FIX_BEFORE_MEOL (SF_BEFORE_MEOL << SF_FIX_SHIFT_EOL)
221
222#define SF_FL_BEFORE_SEOL (SF_BEFORE_SEOL << SF_FL_SHIFT_EOL)
223#define SF_FL_BEFORE_MEOL (SF_BEFORE_MEOL << SF_FL_SHIFT_EOL) /* 0x20 */
224#define SF_IS_INF 0x40
225#define SF_HAS_PAR 0x80
226#define SF_IN_PAR 0x100
227#define SF_HAS_EVAL 0x200
4bfe0158 228#define SCF_DO_SUBSTR 0x400
653099ff
GS
229#define SCF_DO_STCLASS_AND 0x0800
230#define SCF_DO_STCLASS_OR 0x1000
231#define SCF_DO_STCLASS (SCF_DO_STCLASS_AND|SCF_DO_STCLASS_OR)
e1901655 232#define SCF_WHILEM_VISITED_POS 0x2000
c277df42 233
1aa99e6b 234#define UTF RExC_utf8
830247a4
IZ
235#define LOC (RExC_flags16 & PMf_LOCALE)
236#define FOLD (RExC_flags16 & PMf_FOLD)
a0ed51b3 237
ffc61ed2 238#define OOB_UNICODE 12345678
93733859 239#define OOB_NAMEDCLASS -1
b8c5462f 240
a0ed51b3
LW
241#define CHR_SVLEN(sv) (UTF ? sv_len_utf8(sv) : SvCUR(sv))
242#define CHR_DIST(a,b) (UTF ? utf8_distance(a,b) : a - b)
243
8615cb43 244
b45f050a
JF
245/* length of regex to show in messages that don't mark a position within */
246#define RegexLengthToShowInErrorMessages 127
247
248/*
249 * If MARKER[12] are adjusted, be sure to adjust the constants at the top
250 * of t/op/regmesg.t, the tests in t/op/re_tests, and those in
251 * op/pragma/warn/regcomp.
252 */
7253e4e3
RK
253#define MARKER1 "<-- HERE" /* marker as it appears in the description */
254#define MARKER2 " <-- HERE " /* marker as it appears within the regex */
b81d288d 255
7253e4e3 256#define REPORT_LOCATION " in regex; marked by " MARKER1 " in m/%.*s" MARKER2 "%s/"
b45f050a
JF
257
258/*
259 * Calls SAVEDESTRUCTOR_X if needed, then calls Perl_croak with the given
260 * arg. Show regex, up to a maximum length. If it's too long, chop and add
261 * "...".
262 */
a4eb266f 263#define FAIL(msg) \
8615cb43 264 STMT_START { \
a4eb266f 265 char *ellipses = ""; \
830247a4 266 unsigned len = strlen(RExC_precomp); \
b45f050a 267 \
8615cb43 268 if (!SIZE_ONLY) \
830247a4 269 SAVEDESTRUCTOR_X(clear_re,(void*)RExC_rx); \
b45f050a
JF
270 \
271 if (len > RegexLengthToShowInErrorMessages) { \
272 /* chop 10 shorter than the max, to ensure meaning of "..." */ \
273 len = RegexLengthToShowInErrorMessages - 10; \
a4eb266f 274 ellipses = "..."; \
b45f050a
JF
275 } \
276 Perl_croak(aTHX_ "%s in regex m/%.*s%s/", \
830247a4 277 msg, (int)len, RExC_precomp, ellipses); \
8615cb43
JF
278 } STMT_END
279
b45f050a
JF
280/*
281 * Calls SAVEDESTRUCTOR_X if needed, then calls Perl_croak with the given
282 * args. Show regex, up to a maximum length. If it's too long, chop and add
283 * "...".
284 */
a4eb266f 285#define FAIL2(pat,msg) \
8615cb43 286 STMT_START { \
a4eb266f 287 char *ellipses = ""; \
830247a4 288 unsigned len = strlen(RExC_precomp); \
b45f050a 289 \
8615cb43 290 if (!SIZE_ONLY) \
830247a4 291 SAVEDESTRUCTOR_X(clear_re,(void*)RExC_rx); \
8615cb43 292 \
b45f050a
JF
293 if (len > RegexLengthToShowInErrorMessages) { \
294 /* chop 10 shorter than the max, to ensure meaning of "..." */ \
295 len = RegexLengthToShowInErrorMessages - 10; \
a4eb266f 296 ellipses = "..."; \
b45f050a
JF
297 } \
298 S_re_croak2(aTHX_ pat, " in regex m/%.*s%s/", \
830247a4 299 msg, (int)len, RExC_precomp, ellipses); \
b45f050a
JF
300 } STMT_END
301
302
303/*
304 * Simple_vFAIL -- like FAIL, but marks the current location in the scan
305 */
306#define Simple_vFAIL(m) \
307 STMT_START { \
830247a4 308 unsigned offset = strlen(RExC_precomp)-(RExC_end-RExC_parse); \
b45f050a
JF
309 \
310 Perl_croak(aTHX_ "%s" REPORT_LOCATION, \
830247a4 311 m, (int)offset, RExC_precomp, RExC_precomp + offset); \
b45f050a
JF
312 } STMT_END
313
314/*
315 * Calls SAVEDESTRUCTOR_X if needed, then Simple_vFAIL()
316 */
317#define vFAIL(m) \
318 STMT_START { \
319 if (!SIZE_ONLY) \
830247a4 320 SAVEDESTRUCTOR_X(clear_re,(void*)RExC_rx); \
b45f050a
JF
321 Simple_vFAIL(m); \
322 } STMT_END
323
324/*
325 * Like Simple_vFAIL(), but accepts two arguments.
326 */
327#define Simple_vFAIL2(m,a1) \
328 STMT_START { \
830247a4 329 unsigned offset = strlen(RExC_precomp)-(RExC_end-RExC_parse); \
b45f050a
JF
330 \
331 S_re_croak2(aTHX_ m, REPORT_LOCATION, a1, \
830247a4 332 (int)offset, RExC_precomp, RExC_precomp + offset); \
b45f050a
JF
333 } STMT_END
334
335/*
336 * Calls SAVEDESTRUCTOR_X if needed, then Simple_vFAIL2().
337 */
338#define vFAIL2(m,a1) \
339 STMT_START { \
340 if (!SIZE_ONLY) \
830247a4 341 SAVEDESTRUCTOR_X(clear_re,(void*)RExC_rx); \
b45f050a
JF
342 Simple_vFAIL2(m, a1); \
343 } STMT_END
344
345
346/*
347 * Like Simple_vFAIL(), but accepts three arguments.
348 */
349#define Simple_vFAIL3(m, a1, a2) \
350 STMT_START { \
830247a4 351 unsigned offset = strlen(RExC_precomp)-(RExC_end-RExC_parse); \
b45f050a
JF
352 \
353 S_re_croak2(aTHX_ m, REPORT_LOCATION, a1, a2, \
830247a4 354 (int)offset, RExC_precomp, RExC_precomp + offset); \
b45f050a
JF
355 } STMT_END
356
357/*
358 * Calls SAVEDESTRUCTOR_X if needed, then Simple_vFAIL3().
359 */
360#define vFAIL3(m,a1,a2) \
361 STMT_START { \
362 if (!SIZE_ONLY) \
830247a4 363 SAVEDESTRUCTOR_X(clear_re,(void*)RExC_rx); \
b45f050a
JF
364 Simple_vFAIL3(m, a1, a2); \
365 } STMT_END
366
367/*
368 * Like Simple_vFAIL(), but accepts four arguments.
369 */
370#define Simple_vFAIL4(m, a1, a2, a3) \
371 STMT_START { \
830247a4 372 unsigned offset = strlen(RExC_precomp)-(RExC_end-RExC_parse); \
b45f050a
JF
373 \
374 S_re_croak2(aTHX_ m, REPORT_LOCATION, a1, a2, a3,\
830247a4 375 (int)offset, RExC_precomp, RExC_precomp + offset); \
b45f050a
JF
376 } STMT_END
377
378/*
379 * Like Simple_vFAIL(), but accepts five arguments.
380 */
381#define Simple_vFAIL5(m, a1, a2, a3, a4) \
382 STMT_START { \
830247a4 383 unsigned offset = strlen(RExC_precomp)-(RExC_end-RExC_parse); \
b45f050a 384 S_re_croak2(aTHX_ m, REPORT_LOCATION, a1, a2, a3, a4,\
830247a4 385 (int)offset, RExC_precomp, RExC_precomp + offset); \
b45f050a
JF
386 } STMT_END
387
388
389#define vWARN(loc,m) \
390 STMT_START { \
830247a4 391 unsigned offset = strlen(RExC_precomp)-(RExC_end-(loc)); \
b45f050a 392 Perl_warner(aTHX_ WARN_REGEXP, "%s" REPORT_LOCATION,\
830247a4 393 m, (int)offset, RExC_precomp, RExC_precomp + offset); \
b45f050a
JF
394 } STMT_END \
395
f9373011
PM
396#define vWARNdep(loc,m) \
397 STMT_START { \
398 unsigned offset = strlen(RExC_precomp)-(RExC_end-(loc)); \
399 int warn_cat = ckWARN(WARN_REGEXP) ? WARN_REGEXP : WARN_DEPRECATED; \
400 Perl_warner(aTHX_ warn_cat, "%s" REPORT_LOCATION,\
401 m, (int)offset, RExC_precomp, RExC_precomp + offset); \
402 } STMT_END \
403
b45f050a
JF
404
405#define vWARN2(loc, m, a1) \
406 STMT_START { \
830247a4 407 unsigned offset = strlen(RExC_precomp)-(RExC_end-(loc)); \
b45f050a
JF
408 Perl_warner(aTHX_ WARN_REGEXP, m REPORT_LOCATION,\
409 a1, \
830247a4 410 (int)offset, RExC_precomp, RExC_precomp + offset); \
b45f050a
JF
411 } STMT_END
412
413#define vWARN3(loc, m, a1, a2) \
414 STMT_START { \
830247a4 415 unsigned offset = strlen(RExC_precomp) - (RExC_end - (loc)); \
b45f050a
JF
416 Perl_warner(aTHX_ WARN_REGEXP, m REPORT_LOCATION, \
417 a1, a2, \
830247a4 418 (int)offset, RExC_precomp, RExC_precomp + offset); \
b45f050a
JF
419 } STMT_END
420
421#define vWARN4(loc, m, a1, a2, a3) \
422 STMT_START { \
830247a4 423 unsigned offset = strlen(RExC_precomp)-(RExC_end-(loc)); \
b45f050a
JF
424 Perl_warner(aTHX_ WARN_REGEXP, m REPORT_LOCATION,\
425 a1, a2, a3, \
830247a4 426 (int)offset, RExC_precomp, RExC_precomp + offset); \
8615cb43
JF
427 } STMT_END
428
9d1d55b5
JP
429/* used for the parse_flags section for (?c) -- japhy */
430#define vWARN5(loc, m, a1, a2, a3, a4) \
431 STMT_START { \
432 unsigned offset = strlen(RExC_precomp)-(RExC_end-(loc)); \
433 Perl_warner(aTHX_ WARN_REGEXP, m REPORT_LOCATION, \
434 a1, a2, a3, a4, \
435 (int)offset, RExC_precomp, RExC_precomp + offset); \
436 } STMT_END
437
8615cb43 438
cd439c50 439/* Allow for side effects in s */
fd3f0ae2 440#define REGC(c,s) STMT_START { if (!SIZE_ONLY) *(s) = (c); else (void)(s);} STMT_END
cd439c50 441
fac92740
MJD
442/* Macros for recording node offsets. 20001227 mjd@plover.com
443 * Nodes are numbered 1, 2, 3, 4. Node #n's position is recorded in
444 * element 2*n-1 of the array. Element #2n holds the byte length node #n.
445 * Element 0 holds the number n.
446 */
447
448#define MJD_OFFSET_DEBUG(x)
449/* #define MJD_OFFSET_DEBUG(x) fprintf x */
450
451
452# define Set_Node_Offset_To_R(node,byte) \
453 STMT_START { \
454 if (! SIZE_ONLY) { \
455 if((node) < 0) { \
456 Perl_croak(aTHX_ "value of node is %d in Offset macro", node); \
457 } else { \
458 RExC_offsets[2*(node)-1] = (byte); \
459 } \
460 } \
461 } STMT_END
462
463# define Set_Node_Offset(node,byte) Set_Node_Offset_To_R((node)-RExC_emit_start, (byte)-RExC_start)
464# define Set_Cur_Node_Offset Set_Node_Offset(RExC_emit, RExC_parse)
465
466# define Set_Node_Length_To_R(node,len) \
467 STMT_START { \
468 if (! SIZE_ONLY) { \
469 MJD_OFFSET_DEBUG((stderr, "** (%d) size of node %d is %d.\n", __LINE__, (node), (len))); \
470 if((node) < 0) { \
471 Perl_croak(aTHX_ "value of node is %d in Length macro", node); \
472 } else { \
473 RExC_offsets[2*(node)] = (len); \
474 } \
475 } \
476 } STMT_END
477
478# define Set_Node_Length(node,len) Set_Node_Length_To_R((node)-RExC_emit_start, len)
479# define Set_Cur_Node_Length(len) Set_Node_Length(RExC_emit, len)
480# define Set_Node_Cur_Length(node) Set_Node_Length(node, RExC_parse - parse_start)
481
482/* Get offsets and lengths */
483#define Node_Offset(n) (RExC_offsets[2*((n)-RExC_emit_start)-1])
484#define Node_Length(n) (RExC_offsets[2*((n)-RExC_emit_start)])
485
51371543 486static void clear_re(pTHXo_ void *r);
4327152a 487
653099ff
GS
488/* Mark that we cannot extend a found fixed substring at this point.
489 Updata the longest found anchored substring and the longest found
490 floating substrings if needed. */
491
4327152a 492STATIC void
830247a4 493S_scan_commit(pTHX_ RExC_state_t *pRExC_state, scan_data_t *data)
c277df42 494{
a0ed51b3
LW
495 STRLEN l = CHR_SVLEN(data->last_found);
496 STRLEN old_l = CHR_SVLEN(*data->longest);
b81d288d 497
c277df42
IZ
498 if ((l >= old_l) && ((l > old_l) || (data->flags & SF_BEFORE_EOL))) {
499 sv_setsv(*data->longest, data->last_found);
500 if (*data->longest == data->longest_fixed) {
501 data->offset_fixed = l ? data->last_start_min : data->pos_min;
502 if (data->flags & SF_BEFORE_EOL)
b81d288d 503 data->flags
c277df42
IZ
504 |= ((data->flags & SF_BEFORE_EOL) << SF_FIX_SHIFT_EOL);
505 else
506 data->flags &= ~SF_FIX_BEFORE_EOL;
a0ed51b3
LW
507 }
508 else {
c277df42 509 data->offset_float_min = l ? data->last_start_min : data->pos_min;
b81d288d
AB
510 data->offset_float_max = (l
511 ? data->last_start_max
c277df42
IZ
512 : data->pos_min + data->pos_delta);
513 if (data->flags & SF_BEFORE_EOL)
b81d288d 514 data->flags
c277df42
IZ
515 |= ((data->flags & SF_BEFORE_EOL) << SF_FL_SHIFT_EOL);
516 else
517 data->flags &= ~SF_FL_BEFORE_EOL;
518 }
519 }
520 SvCUR_set(data->last_found, 0);
521 data->last_end = -1;
522 data->flags &= ~SF_BEFORE_EOL;
523}
524
653099ff
GS
525/* Can match anything (initialization) */
526STATIC void
830247a4 527S_cl_anything(pTHX_ RExC_state_t *pRExC_state, struct regnode_charclass_class *cl)
653099ff 528{
9af0acdb
JH
529 int value;
530
653099ff 531 ANYOF_CLASS_ZERO(cl);
9af0acdb
JH
532 for (value = 0; value < 256; ++value)
533 ANYOF_BITMAP_SET(cl, value);
1aa99e6b 534 cl->flags = ANYOF_EOS|ANYOF_UNICODE_ALL;
653099ff
GS
535 if (LOC)
536 cl->flags |= ANYOF_LOCALE;
537}
538
539/* Can match anything (initialization) */
540STATIC int
541S_cl_is_anything(pTHX_ struct regnode_charclass_class *cl)
542{
543 int value;
544
aaa51d5e 545 for (value = 0; value <= ANYOF_MAX; value += 2)
653099ff
GS
546 if (ANYOF_CLASS_TEST(cl, value) && ANYOF_CLASS_TEST(cl, value + 1))
547 return 1;
1aa99e6b
IH
548 if (!(cl->flags & ANYOF_UNICODE_ALL))
549 return 0;
9af0acdb
JH
550 for (value = 0; value < 256; ++value)
551 if (!ANYOF_BITMAP_TEST(cl, value))
653099ff
GS
552 return 0;
553 return 1;
554}
555
556/* Can match anything (initialization) */
557STATIC void
830247a4 558S_cl_init(pTHX_ RExC_state_t *pRExC_state, struct regnode_charclass_class *cl)
653099ff 559{
8ecf7187 560 Zero(cl, 1, struct regnode_charclass_class);
653099ff 561 cl->type = ANYOF;
830247a4 562 cl_anything(pRExC_state, cl);
653099ff
GS
563}
564
565STATIC void
830247a4 566S_cl_init_zero(pTHX_ RExC_state_t *pRExC_state, struct regnode_charclass_class *cl)
653099ff 567{
8ecf7187 568 Zero(cl, 1, struct regnode_charclass_class);
653099ff 569 cl->type = ANYOF;
830247a4 570 cl_anything(pRExC_state, cl);
653099ff
GS
571 if (LOC)
572 cl->flags |= ANYOF_LOCALE;
573}
574
575/* 'And' a given class with another one. Can create false positives */
576/* We assume that cl is not inverted */
577STATIC void
578S_cl_and(pTHX_ struct regnode_charclass_class *cl,
579 struct regnode_charclass_class *and_with)
580{
653099ff
GS
581 if (!(and_with->flags & ANYOF_CLASS)
582 && !(cl->flags & ANYOF_CLASS)
583 && (and_with->flags & ANYOF_LOCALE) == (cl->flags & ANYOF_LOCALE)
584 && !(and_with->flags & ANYOF_FOLD)
585 && !(cl->flags & ANYOF_FOLD)) {
586 int i;
587
588 if (and_with->flags & ANYOF_INVERT)
589 for (i = 0; i < ANYOF_BITMAP_SIZE; i++)
590 cl->bitmap[i] &= ~and_with->bitmap[i];
591 else
592 for (i = 0; i < ANYOF_BITMAP_SIZE; i++)
593 cl->bitmap[i] &= and_with->bitmap[i];
594 } /* XXXX: logic is complicated otherwise, leave it along for a moment. */
595 if (!(and_with->flags & ANYOF_EOS))
596 cl->flags &= ~ANYOF_EOS;
1aa99e6b
IH
597
598 if (cl->flags & ANYOF_UNICODE_ALL && and_with->flags & ANYOF_UNICODE) {
599 cl->flags &= ~ANYOF_UNICODE_ALL;
600 cl->flags |= ANYOF_UNICODE;
601 ARG_SET(cl, ARG(and_with));
602 }
603 if (!(and_with->flags & ANYOF_UNICODE_ALL))
604 cl->flags &= ~ANYOF_UNICODE_ALL;
605 if (!(and_with->flags & (ANYOF_UNICODE|ANYOF_UNICODE_ALL)))
606 cl->flags &= ~ANYOF_UNICODE;
653099ff
GS
607}
608
609/* 'OR' a given class with another one. Can create false positives */
610/* We assume that cl is not inverted */
611STATIC void
830247a4 612S_cl_or(pTHX_ RExC_state_t *pRExC_state, struct regnode_charclass_class *cl, struct regnode_charclass_class *or_with)
653099ff 613{
653099ff
GS
614 if (or_with->flags & ANYOF_INVERT) {
615 /* We do not use
616 * (B1 | CL1) | (!B2 & !CL2) = (B1 | !B2 & !CL2) | (CL1 | (!B2 & !CL2))
617 * <= (B1 | !B2) | (CL1 | !CL2)
618 * which is wasteful if CL2 is small, but we ignore CL2:
619 * (B1 | CL1) | (!B2 & !CL2) <= (B1 | CL1) | !B2 = (B1 | !B2) | CL1
620 * XXXX Can we handle case-fold? Unclear:
621 * (OK1(i) | OK1(i')) | !(OK1(i) | OK1(i')) =
622 * (OK1(i) | OK1(i')) | (!OK1(i) & !OK1(i'))
623 */
624 if ( (or_with->flags & ANYOF_LOCALE) == (cl->flags & ANYOF_LOCALE)
625 && !(or_with->flags & ANYOF_FOLD)
626 && !(cl->flags & ANYOF_FOLD) ) {
627 int i;
628
629 for (i = 0; i < ANYOF_BITMAP_SIZE; i++)
630 cl->bitmap[i] |= ~or_with->bitmap[i];
631 } /* XXXX: logic is complicated otherwise */
632 else {
830247a4 633 cl_anything(pRExC_state, cl);
653099ff
GS
634 }
635 } else {
636 /* (B1 | CL1) | (B2 | CL2) = (B1 | B2) | (CL1 | CL2)) */
637 if ( (or_with->flags & ANYOF_LOCALE) == (cl->flags & ANYOF_LOCALE)
b81d288d 638 && (!(or_with->flags & ANYOF_FOLD)
653099ff
GS
639 || (cl->flags & ANYOF_FOLD)) ) {
640 int i;
641
642 /* OR char bitmap and class bitmap separately */
643 for (i = 0; i < ANYOF_BITMAP_SIZE; i++)
644 cl->bitmap[i] |= or_with->bitmap[i];
645 if (or_with->flags & ANYOF_CLASS) {
646 for (i = 0; i < ANYOF_CLASSBITMAP_SIZE; i++)
647 cl->classflags[i] |= or_with->classflags[i];
648 cl->flags |= ANYOF_CLASS;
649 }
650 }
651 else { /* XXXX: logic is complicated, leave it along for a moment. */
830247a4 652 cl_anything(pRExC_state, cl);
653099ff
GS
653 }
654 }
655 if (or_with->flags & ANYOF_EOS)
656 cl->flags |= ANYOF_EOS;
1aa99e6b
IH
657
658 if (cl->flags & ANYOF_UNICODE && or_with->flags & ANYOF_UNICODE &&
659 ARG(cl) != ARG(or_with)) {
660 cl->flags |= ANYOF_UNICODE_ALL;
661 cl->flags &= ~ANYOF_UNICODE;
662 }
663 if (or_with->flags & ANYOF_UNICODE_ALL) {
664 cl->flags |= ANYOF_UNICODE_ALL;
665 cl->flags &= ~ANYOF_UNICODE;
666 }
653099ff
GS
667}
668
669/* REx optimizer. Converts nodes into quickier variants "in place".
670 Finds fixed substrings. */
671
c277df42
IZ
672/* Stops at toplevel WHILEM as well as at `last'. At end *scanp is set
673 to the position after last scanned or to NULL. */
674
76e3520e 675STATIC I32
830247a4 676S_study_chunk(pTHX_ RExC_state_t *pRExC_state, regnode **scanp, I32 *deltap, regnode *last, scan_data_t *data, U32 flags)
c277df42
IZ
677 /* scanp: Start here (read-write). */
678 /* deltap: Write maxlen-minlen here. */
679 /* last: Stop before this one. */
680{
681 I32 min = 0, pars = 0, code;
682 regnode *scan = *scanp, *next;
683 I32 delta = 0;
684 int is_inf = (flags & SCF_DO_SUBSTR) && (data->flags & SF_IS_INF);
aca2d497 685 int is_inf_internal = 0; /* The studied chunk is infinite */
c277df42
IZ
686 I32 is_par = OP(scan) == OPEN ? ARG(scan) : 0;
687 scan_data_t data_fake;
653099ff 688 struct regnode_charclass_class and_with; /* Valid if flags & SCF_DO_STCLASS_OR */
b81d288d 689
c277df42
IZ
690 while (scan && OP(scan) != END && scan < last) {
691 /* Peephole optimizer: */
692
22c35a8c 693 if (PL_regkind[(U8)OP(scan)] == EXACT) {
653099ff 694 /* Merge several consecutive EXACTish nodes into one. */
c277df42
IZ
695 regnode *n = regnext(scan);
696 U32 stringok = 1;
697#ifdef DEBUGGING
698 regnode *stop = scan;
b81d288d 699#endif
c277df42 700
cd439c50 701 next = scan + NODE_SZ_STR(scan);
c277df42
IZ
702 /* Skip NOTHING, merge EXACT*. */
703 while (n &&
b81d288d 704 ( PL_regkind[(U8)OP(n)] == NOTHING ||
c277df42
IZ
705 (stringok && (OP(n) == OP(scan))))
706 && NEXT_OFF(n)
707 && NEXT_OFF(scan) + NEXT_OFF(n) < I16_MAX) {
708 if (OP(n) == TAIL || n > next)
709 stringok = 0;
22c35a8c 710 if (PL_regkind[(U8)OP(n)] == NOTHING) {
c277df42
IZ
711 NEXT_OFF(scan) += NEXT_OFF(n);
712 next = n + NODE_STEP_REGNODE;
713#ifdef DEBUGGING
714 if (stringok)
715 stop = n;
b81d288d 716#endif
c277df42 717 n = regnext(n);
a0ed51b3 718 }
f49d4d0f 719 else if (stringok) {
cd439c50 720 int oldl = STR_LEN(scan);
c277df42 721 regnode *nnext = regnext(n);
f49d4d0f 722
b81d288d 723 if (oldl + STR_LEN(n) > U8_MAX)
c277df42
IZ
724 break;
725 NEXT_OFF(scan) += NEXT_OFF(n);
cd439c50
IZ
726 STR_LEN(scan) += STR_LEN(n);
727 next = n + NODE_SZ_STR(n);
c277df42 728 /* Now we can overwrite *n : */
f49d4d0f 729 Move(STRING(n), STRING(scan) + oldl, STR_LEN(n), char);
c277df42 730#ifdef DEBUGGING
f49d4d0f 731 stop = next - 1;
b81d288d 732#endif
c277df42
IZ
733 n = nnext;
734 }
735 }
736#ifdef DEBUGGING
737 /* Allow dumping */
cd439c50 738 n = scan + NODE_SZ_STR(scan);
c277df42 739 while (n <= stop) {
22c35a8c 740 if (PL_regkind[(U8)OP(n)] != NOTHING || OP(n) == NOTHING) {
c277df42
IZ
741 OP(n) = OPTIMIZED;
742 NEXT_OFF(n) = 0;
743 }
744 n++;
745 }
653099ff 746#endif
c277df42 747 }
653099ff
GS
748 /* Follow the next-chain of the current node and optimize
749 away all the NOTHINGs from it. */
c277df42 750 if (OP(scan) != CURLYX) {
048cfca1
GS
751 int max = (reg_off_by_arg[OP(scan)]
752 ? I32_MAX
753 /* I32 may be smaller than U16 on CRAYs! */
754 : (I32_MAX < U16_MAX ? I32_MAX : U16_MAX));
c277df42
IZ
755 int off = (reg_off_by_arg[OP(scan)] ? ARG(scan) : NEXT_OFF(scan));
756 int noff;
757 regnode *n = scan;
b81d288d 758
c277df42
IZ
759 /* Skip NOTHING and LONGJMP. */
760 while ((n = regnext(n))
22c35a8c 761 && ((PL_regkind[(U8)OP(n)] == NOTHING && (noff = NEXT_OFF(n)))
c277df42
IZ
762 || ((OP(n) == LONGJMP) && (noff = ARG(n))))
763 && off + noff < max)
764 off += noff;
765 if (reg_off_by_arg[OP(scan)])
766 ARG(scan) = off;
b81d288d 767 else
c277df42
IZ
768 NEXT_OFF(scan) = off;
769 }
653099ff
GS
770 /* The principal pseudo-switch. Cannot be a switch, since we
771 look into several different things. */
b81d288d 772 if (OP(scan) == BRANCH || OP(scan) == BRANCHJ
c277df42
IZ
773 || OP(scan) == IFTHEN || OP(scan) == SUSPEND) {
774 next = regnext(scan);
775 code = OP(scan);
b81d288d
AB
776
777 if (OP(next) == code || code == IFTHEN || code == SUSPEND) {
c277df42 778 I32 max1 = 0, min1 = I32_MAX, num = 0;
653099ff 779 struct regnode_charclass_class accum;
c277df42 780
653099ff 781 if (flags & SCF_DO_SUBSTR) /* XXXX Add !SUSPEND? */
830247a4 782 scan_commit(pRExC_state, data); /* Cannot merge strings after this. */
653099ff 783 if (flags & SCF_DO_STCLASS)
830247a4 784 cl_init_zero(pRExC_state, &accum);
c277df42 785 while (OP(scan) == code) {
830247a4 786 I32 deltanext, minnext, f = 0, fake;
653099ff 787 struct regnode_charclass_class this_class;
c277df42
IZ
788
789 num++;
790 data_fake.flags = 0;
b81d288d 791 if (data) {
2c2d71f5 792 data_fake.whilem_c = data->whilem_c;
cb434fcc
IZ
793 data_fake.last_closep = data->last_closep;
794 }
795 else
796 data_fake.last_closep = &fake;
c277df42
IZ
797 next = regnext(scan);
798 scan = NEXTOPER(scan);
799 if (code != BRANCH)
800 scan = NEXTOPER(scan);
653099ff 801 if (flags & SCF_DO_STCLASS) {
830247a4 802 cl_init(pRExC_state, &this_class);
653099ff
GS
803 data_fake.start_class = &this_class;
804 f = SCF_DO_STCLASS_AND;
b81d288d 805 }
e1901655
IZ
806 if (flags & SCF_WHILEM_VISITED_POS)
807 f |= SCF_WHILEM_VISITED_POS;
653099ff 808 /* we suppose the run is continuous, last=next...*/
830247a4
IZ
809 minnext = study_chunk(pRExC_state, &scan, &deltanext,
810 next, &data_fake, f);
b81d288d 811 if (min1 > minnext)
c277df42
IZ
812 min1 = minnext;
813 if (max1 < minnext + deltanext)
814 max1 = minnext + deltanext;
815 if (deltanext == I32_MAX)
aca2d497 816 is_inf = is_inf_internal = 1;
c277df42
IZ
817 scan = next;
818 if (data_fake.flags & (SF_HAS_PAR|SF_IN_PAR))
819 pars++;
405ff068 820 if (data && (data_fake.flags & SF_HAS_EVAL))
c277df42 821 data->flags |= SF_HAS_EVAL;
2c2d71f5
JH
822 if (data)
823 data->whilem_c = data_fake.whilem_c;
653099ff 824 if (flags & SCF_DO_STCLASS)
830247a4 825 cl_or(pRExC_state, &accum, &this_class);
b81d288d 826 if (code == SUSPEND)
c277df42
IZ
827 break;
828 }
829 if (code == IFTHEN && num < 2) /* Empty ELSE branch */
830 min1 = 0;
831 if (flags & SCF_DO_SUBSTR) {
832 data->pos_min += min1;
833 data->pos_delta += max1 - min1;
834 if (max1 != min1 || is_inf)
835 data->longest = &(data->longest_float);
836 }
837 min += min1;
838 delta += max1 - min1;
653099ff 839 if (flags & SCF_DO_STCLASS_OR) {
830247a4 840 cl_or(pRExC_state, data->start_class, &accum);
653099ff
GS
841 if (min1) {
842 cl_and(data->start_class, &and_with);
843 flags &= ~SCF_DO_STCLASS;
844 }
845 }
846 else if (flags & SCF_DO_STCLASS_AND) {
de0c8cb8
GS
847 if (min1) {
848 cl_and(data->start_class, &accum);
653099ff 849 flags &= ~SCF_DO_STCLASS;
de0c8cb8
GS
850 }
851 else {
b81d288d 852 /* Switch to OR mode: cache the old value of
de0c8cb8
GS
853 * data->start_class */
854 StructCopy(data->start_class, &and_with,
855 struct regnode_charclass_class);
856 flags &= ~SCF_DO_STCLASS_AND;
857 StructCopy(&accum, data->start_class,
858 struct regnode_charclass_class);
859 flags |= SCF_DO_STCLASS_OR;
860 data->start_class->flags |= ANYOF_EOS;
861 }
653099ff 862 }
a0ed51b3
LW
863 }
864 else if (code == BRANCHJ) /* single branch is optimized. */
c277df42
IZ
865 scan = NEXTOPER(NEXTOPER(scan));
866 else /* single branch is optimized. */
867 scan = NEXTOPER(scan);
868 continue;
a0ed51b3
LW
869 }
870 else if (OP(scan) == EXACT) {
cd439c50 871 I32 l = STR_LEN(scan);
1aa99e6b 872 UV uc = *((U8*)STRING(scan));
a0ed51b3 873 if (UTF) {
1aa99e6b
IH
874 U8 *s = (U8*)STRING(scan);
875 l = utf8_length(s, s + l);
9041c2e3 876 uc = utf8_to_uvchr(s, NULL);
a0ed51b3
LW
877 }
878 min += l;
c277df42 879 if (flags & SCF_DO_SUBSTR) { /* Update longest substr. */
c277df42
IZ
880 /* The code below prefers earlier match for fixed
881 offset, later match for variable offset. */
882 if (data->last_end == -1) { /* Update the start info. */
883 data->last_start_min = data->pos_min;
884 data->last_start_max = is_inf
b81d288d 885 ? I32_MAX : data->pos_min + data->pos_delta;
c277df42 886 }
cd439c50 887 sv_catpvn(data->last_found, STRING(scan), STR_LEN(scan));
c277df42
IZ
888 data->last_end = data->pos_min + l;
889 data->pos_min += l; /* As in the first entry. */
890 data->flags &= ~SF_BEFORE_EOL;
891 }
653099ff
GS
892 if (flags & SCF_DO_STCLASS_AND) {
893 /* Check whether it is compatible with what we know already! */
894 int compat = 1;
895
1aa99e6b 896 if (uc >= 0x100 ||
516a5887 897 (!(data->start_class->flags & (ANYOF_CLASS | ANYOF_LOCALE))
1aa99e6b 898 && !ANYOF_BITMAP_TEST(data->start_class, uc)
653099ff 899 && (!(data->start_class->flags & ANYOF_FOLD)
1aa99e6b 900 || !ANYOF_BITMAP_TEST(data->start_class, PL_fold[uc])))
516a5887 901 )
653099ff
GS
902 compat = 0;
903 ANYOF_CLASS_ZERO(data->start_class);
904 ANYOF_BITMAP_ZERO(data->start_class);
905 if (compat)
1aa99e6b 906 ANYOF_BITMAP_SET(data->start_class, uc);
653099ff 907 data->start_class->flags &= ~ANYOF_EOS;
9b877dbb
IH
908 if (uc < 0x100)
909 data->start_class->flags &= ~ANYOF_UNICODE_ALL;
653099ff
GS
910 }
911 else if (flags & SCF_DO_STCLASS_OR) {
912 /* false positive possible if the class is case-folded */
1aa99e6b 913 if (uc < 0x100)
9b877dbb
IH
914 ANYOF_BITMAP_SET(data->start_class, uc);
915 else
916 data->start_class->flags |= ANYOF_UNICODE_ALL;
653099ff
GS
917 data->start_class->flags &= ~ANYOF_EOS;
918 cl_and(data->start_class, &and_with);
919 }
920 flags &= ~SCF_DO_STCLASS;
a0ed51b3 921 }
653099ff 922 else if (PL_regkind[(U8)OP(scan)] == EXACT) { /* But OP != EXACT! */
cd439c50 923 I32 l = STR_LEN(scan);
1aa99e6b 924 UV uc = *((U8*)STRING(scan));
653099ff
GS
925
926 /* Search for fixed substrings supports EXACT only. */
b81d288d 927 if (flags & SCF_DO_SUBSTR)
830247a4 928 scan_commit(pRExC_state, data);
a0ed51b3 929 if (UTF) {
1aa99e6b
IH
930 U8 *s = (U8 *)STRING(scan);
931 l = utf8_length(s, s + l);
9041c2e3 932 uc = utf8_to_uvchr(s, NULL);
a0ed51b3
LW
933 }
934 min += l;
c277df42 935 if (data && (flags & SCF_DO_SUBSTR))
a0ed51b3 936 data->pos_min += l;
653099ff
GS
937 if (flags & SCF_DO_STCLASS_AND) {
938 /* Check whether it is compatible with what we know already! */
939 int compat = 1;
940
1aa99e6b 941 if (uc >= 0x100 ||
516a5887 942 (!(data->start_class->flags & (ANYOF_CLASS | ANYOF_LOCALE))
1aa99e6b 943 && !ANYOF_BITMAP_TEST(data->start_class, uc)
516a5887 944 && !ANYOF_BITMAP_TEST(data->start_class, PL_fold[uc])))
653099ff
GS
945 compat = 0;
946 ANYOF_CLASS_ZERO(data->start_class);
947 ANYOF_BITMAP_ZERO(data->start_class);
948 if (compat) {
1aa99e6b 949 ANYOF_BITMAP_SET(data->start_class, uc);
653099ff
GS
950 data->start_class->flags &= ~ANYOF_EOS;
951 data->start_class->flags |= ANYOF_FOLD;
952 if (OP(scan) == EXACTFL)
953 data->start_class->flags |= ANYOF_LOCALE;
954 }
955 }
956 else if (flags & SCF_DO_STCLASS_OR) {
957 if (data->start_class->flags & ANYOF_FOLD) {
958 /* false positive possible if the class is case-folded.
959 Assume that the locale settings are the same... */
1aa99e6b
IH
960 if (uc < 0x100)
961 ANYOF_BITMAP_SET(data->start_class, uc);
653099ff
GS
962 data->start_class->flags &= ~ANYOF_EOS;
963 }
964 cl_and(data->start_class, &and_with);
965 }
966 flags &= ~SCF_DO_STCLASS;
a0ed51b3 967 }
4d61ec05 968 else if (strchr((char*)PL_varies,OP(scan))) {
9c5ffd7c 969 I32 mincount, maxcount, minnext, deltanext, fl = 0;
aa7a4b56 970 I32 f = flags, pos_before = 0;
c277df42 971 regnode *oscan = scan;
653099ff
GS
972 struct regnode_charclass_class this_class;
973 struct regnode_charclass_class *oclass = NULL;
974
22c35a8c 975 switch (PL_regkind[(U8)OP(scan)]) {
653099ff 976 case WHILEM: /* End of (?:...)* . */
c277df42
IZ
977 scan = NEXTOPER(scan);
978 goto finish;
979 case PLUS:
653099ff 980 if (flags & (SCF_DO_SUBSTR | SCF_DO_STCLASS)) {
c277df42 981 next = NEXTOPER(scan);
653099ff 982 if (OP(next) == EXACT || (flags & SCF_DO_STCLASS)) {
b81d288d
AB
983 mincount = 1;
984 maxcount = REG_INFTY;
c277df42
IZ
985 next = regnext(scan);
986 scan = NEXTOPER(scan);
987 goto do_curly;
988 }
989 }
990 if (flags & SCF_DO_SUBSTR)
991 data->pos_min++;
992 min++;
993 /* Fall through. */
994 case STAR:
653099ff
GS
995 if (flags & SCF_DO_STCLASS) {
996 mincount = 0;
b81d288d 997 maxcount = REG_INFTY;
653099ff
GS
998 next = regnext(scan);
999 scan = NEXTOPER(scan);
1000 goto do_curly;
1001 }
b81d288d 1002 is_inf = is_inf_internal = 1;
c277df42
IZ
1003 scan = regnext(scan);
1004 if (flags & SCF_DO_SUBSTR) {
830247a4 1005 scan_commit(pRExC_state, data); /* Cannot extend fixed substrings */
c277df42
IZ
1006 data->longest = &(data->longest_float);
1007 }
1008 goto optimize_curly_tail;
1009 case CURLY:
b81d288d 1010 mincount = ARG1(scan);
c277df42
IZ
1011 maxcount = ARG2(scan);
1012 next = regnext(scan);
cb434fcc
IZ
1013 if (OP(scan) == CURLYX) {
1014 I32 lp = (data ? *(data->last_closep) : 0);
1015
1016 scan->flags = ((lp <= U8_MAX) ? lp : U8_MAX);
1017 }
c277df42
IZ
1018 scan = NEXTOPER(scan) + EXTRA_STEP_2ARGS;
1019 do_curly:
1020 if (flags & SCF_DO_SUBSTR) {
830247a4 1021 if (mincount == 0) scan_commit(pRExC_state,data); /* Cannot extend fixed substrings */
c277df42
IZ
1022 pos_before = data->pos_min;
1023 }
1024 if (data) {
1025 fl = data->flags;
1026 data->flags &= ~(SF_HAS_PAR|SF_IN_PAR|SF_HAS_EVAL);
1027 if (is_inf)
1028 data->flags |= SF_IS_INF;
1029 }
653099ff 1030 if (flags & SCF_DO_STCLASS) {
830247a4 1031 cl_init(pRExC_state, &this_class);
653099ff
GS
1032 oclass = data->start_class;
1033 data->start_class = &this_class;
1034 f |= SCF_DO_STCLASS_AND;
1035 f &= ~SCF_DO_STCLASS_OR;
1036 }
e1901655
IZ
1037 /* These are the cases when once a subexpression
1038 fails at a particular position, it cannot succeed
1039 even after backtracking at the enclosing scope.
b81d288d 1040
e1901655
IZ
1041 XXXX what if minimal match and we are at the
1042 initial run of {n,m}? */
1043 if ((mincount != maxcount - 1) && (maxcount != REG_INFTY))
1044 f &= ~SCF_WHILEM_VISITED_POS;
653099ff 1045
c277df42 1046 /* This will finish on WHILEM, setting scan, or on NULL: */
b81d288d
AB
1047 minnext = study_chunk(pRExC_state, &scan, &deltanext, last, data,
1048 mincount == 0
653099ff
GS
1049 ? (f & ~SCF_DO_SUBSTR) : f);
1050
1051 if (flags & SCF_DO_STCLASS)
1052 data->start_class = oclass;
1053 if (mincount == 0 || minnext == 0) {
1054 if (flags & SCF_DO_STCLASS_OR) {
830247a4 1055 cl_or(pRExC_state, data->start_class, &this_class);
653099ff
GS
1056 }
1057 else if (flags & SCF_DO_STCLASS_AND) {
b81d288d 1058 /* Switch to OR mode: cache the old value of
653099ff
GS
1059 * data->start_class */
1060 StructCopy(data->start_class, &and_with,
1061 struct regnode_charclass_class);
1062 flags &= ~SCF_DO_STCLASS_AND;
1063 StructCopy(&this_class, data->start_class,
1064 struct regnode_charclass_class);
1065 flags |= SCF_DO_STCLASS_OR;
1066 data->start_class->flags |= ANYOF_EOS;
1067 }
1068 } else { /* Non-zero len */
1069 if (flags & SCF_DO_STCLASS_OR) {
830247a4 1070 cl_or(pRExC_state, data->start_class, &this_class);
653099ff
GS
1071 cl_and(data->start_class, &and_with);
1072 }
1073 else if (flags & SCF_DO_STCLASS_AND)
1074 cl_and(data->start_class, &this_class);
1075 flags &= ~SCF_DO_STCLASS;
1076 }
c277df42
IZ
1077 if (!scan) /* It was not CURLYX, but CURLY. */
1078 scan = next;
84037bb0
HS
1079 if (ckWARN(WARN_REGEXP)
1080 && (minnext == 0) && (deltanext == 0)
99799961 1081 && data && !(data->flags & (SF_HAS_PAR|SF_IN_PAR))
17feb5d5 1082 && maxcount <= REG_INFTY/3) /* Complement check for big count */
b45f050a 1083 {
830247a4 1084 vWARN(RExC_parse,
b45f050a
JF
1085 "Quantifier unexpected on zero-length expression");
1086 }
1087
c277df42 1088 min += minnext * mincount;
b81d288d 1089 is_inf_internal |= ((maxcount == REG_INFTY
155aba94
GS
1090 && (minnext + deltanext) > 0)
1091 || deltanext == I32_MAX);
aca2d497 1092 is_inf |= is_inf_internal;
c277df42
IZ
1093 delta += (minnext + deltanext) * maxcount - minnext * mincount;
1094
1095 /* Try powerful optimization CURLYX => CURLYN. */
b81d288d 1096 if ( OP(oscan) == CURLYX && data
c277df42
IZ
1097 && data->flags & SF_IN_PAR
1098 && !(data->flags & SF_HAS_EVAL)
1099 && !deltanext && minnext == 1 ) {
1100 /* Try to optimize to CURLYN. */
1101 regnode *nxt = NEXTOPER(oscan) + EXTRA_STEP_2ARGS;
497b47a8
JH
1102 regnode *nxt1 = nxt;
1103#ifdef DEBUGGING
1104 regnode *nxt2;
1105#endif
c277df42
IZ
1106
1107 /* Skip open. */
1108 nxt = regnext(nxt);
4d61ec05 1109 if (!strchr((char*)PL_simple,OP(nxt))
22c35a8c 1110 && !(PL_regkind[(U8)OP(nxt)] == EXACT
b81d288d 1111 && STR_LEN(nxt) == 1))
c277df42 1112 goto nogo;
497b47a8 1113#ifdef DEBUGGING
c277df42 1114 nxt2 = nxt;
497b47a8 1115#endif
c277df42 1116 nxt = regnext(nxt);
b81d288d 1117 if (OP(nxt) != CLOSE)
c277df42
IZ
1118 goto nogo;
1119 /* Now we know that nxt2 is the only contents: */
1120 oscan->flags = ARG(nxt);
1121 OP(oscan) = CURLYN;
1122 OP(nxt1) = NOTHING; /* was OPEN. */
1123#ifdef DEBUGGING
1124 OP(nxt1 + 1) = OPTIMIZED; /* was count. */
1125 NEXT_OFF(nxt1+ 1) = 0; /* just for consistancy. */
1126 NEXT_OFF(nxt2) = 0; /* just for consistancy with CURLY. */
1127 OP(nxt) = OPTIMIZED; /* was CLOSE. */
1128 OP(nxt + 1) = OPTIMIZED; /* was count. */
1129 NEXT_OFF(nxt+ 1) = 0; /* just for consistancy. */
b81d288d 1130#endif
c277df42 1131 }
c277df42
IZ
1132 nogo:
1133
1134 /* Try optimization CURLYX => CURLYM. */
b81d288d 1135 if ( OP(oscan) == CURLYX && data
c277df42 1136 && !(data->flags & SF_HAS_PAR)
c277df42
IZ
1137 && !(data->flags & SF_HAS_EVAL)
1138 && !deltanext ) {
1139 /* XXXX How to optimize if data == 0? */
1140 /* Optimize to a simpler form. */
1141 regnode *nxt = NEXTOPER(oscan) + EXTRA_STEP_2ARGS; /* OPEN */
1142 regnode *nxt2;
1143
1144 OP(oscan) = CURLYM;
1145 while ( (nxt2 = regnext(nxt)) /* skip over embedded stuff*/
b81d288d 1146 && (OP(nxt2) != WHILEM))
c277df42
IZ
1147 nxt = nxt2;
1148 OP(nxt2) = SUCCEED; /* Whas WHILEM */
c277df42
IZ
1149 /* Need to optimize away parenths. */
1150 if (data->flags & SF_IN_PAR) {
1151 /* Set the parenth number. */
1152 regnode *nxt1 = NEXTOPER(oscan) + EXTRA_STEP_2ARGS; /* OPEN*/
1153
b81d288d 1154 if (OP(nxt) != CLOSE)
b45f050a 1155 FAIL("Panic opt close");
c277df42
IZ
1156 oscan->flags = ARG(nxt);
1157 OP(nxt1) = OPTIMIZED; /* was OPEN. */
1158 OP(nxt) = OPTIMIZED; /* was CLOSE. */
1159#ifdef DEBUGGING
1160 OP(nxt1 + 1) = OPTIMIZED; /* was count. */
1161 OP(nxt + 1) = OPTIMIZED; /* was count. */
1162 NEXT_OFF(nxt1 + 1) = 0; /* just for consistancy. */
1163 NEXT_OFF(nxt + 1) = 0; /* just for consistancy. */
b81d288d 1164#endif
c277df42
IZ
1165#if 0
1166 while ( nxt1 && (OP(nxt1) != WHILEM)) {
1167 regnode *nnxt = regnext(nxt1);
b81d288d 1168
c277df42
IZ
1169 if (nnxt == nxt) {
1170 if (reg_off_by_arg[OP(nxt1)])
1171 ARG_SET(nxt1, nxt2 - nxt1);
1172 else if (nxt2 - nxt1 < U16_MAX)
1173 NEXT_OFF(nxt1) = nxt2 - nxt1;
1174 else
1175 OP(nxt) = NOTHING; /* Cannot beautify */
1176 }
1177 nxt1 = nnxt;
1178 }
1179#endif
1180 /* Optimize again: */
b81d288d 1181 study_chunk(pRExC_state, &nxt1, &deltanext, nxt,
e1901655 1182 NULL, 0);
a0ed51b3
LW
1183 }
1184 else
c277df42 1185 oscan->flags = 0;
c277df42 1186 }
e1901655
IZ
1187 else if ((OP(oscan) == CURLYX)
1188 && (flags & SCF_WHILEM_VISITED_POS)
1189 /* See the comment on a similar expression above.
1190 However, this time it not a subexpression
1191 we care about, but the expression itself. */
1192 && (maxcount == REG_INFTY)
1193 && data && ++data->whilem_c < 16) {
1194 /* This stays as CURLYX, we can put the count/of pair. */
2c2d71f5
JH
1195 /* Find WHILEM (as in regexec.c) */
1196 regnode *nxt = oscan + NEXT_OFF(oscan);
1197
1198 if (OP(PREVOPER(nxt)) == NOTHING) /* LONGJMP */
1199 nxt += ARG(nxt);
1200 PREVOPER(nxt)->flags = data->whilem_c
830247a4 1201 | (RExC_whilem_seen << 4); /* On WHILEM */
2c2d71f5 1202 }
b81d288d 1203 if (data && fl & (SF_HAS_PAR|SF_IN_PAR))
c277df42
IZ
1204 pars++;
1205 if (flags & SCF_DO_SUBSTR) {
1206 SV *last_str = Nullsv;
1207 int counted = mincount != 0;
1208
1209 if (data->last_end > 0 && mincount != 0) { /* Ends with a string. */
b81d288d 1210 I32 b = pos_before >= data->last_start_min
c277df42
IZ
1211 ? pos_before : data->last_start_min;
1212 STRLEN l;
1213 char *s = SvPV(data->last_found, l);
a0ed51b3
LW
1214 I32 old = b - data->last_start_min;
1215
1216 if (UTF)
1217 old = utf8_hop((U8*)s, old) - (U8*)s;
c277df42 1218
a0ed51b3 1219 l -= old;
c277df42 1220 /* Get the added string: */
79cb57f6 1221 last_str = newSVpvn(s + old, l);
c277df42
IZ
1222 if (deltanext == 0 && pos_before == b) {
1223 /* What was added is a constant string */
1224 if (mincount > 1) {
1225 SvGROW(last_str, (mincount * l) + 1);
b81d288d 1226 repeatcpy(SvPVX(last_str) + l,
c277df42
IZ
1227 SvPVX(last_str), l, mincount - 1);
1228 SvCUR(last_str) *= mincount;
1229 /* Add additional parts. */
b81d288d 1230 SvCUR_set(data->last_found,
c277df42
IZ
1231 SvCUR(data->last_found) - l);
1232 sv_catsv(data->last_found, last_str);
1233 data->last_end += l * (mincount - 1);
1234 }
2a8d9689
HS
1235 } else {
1236 /* start offset must point into the last copy */
1237 data->last_start_min += minnext * (mincount - 1);
4b2cff9a
HS
1238 data->last_start_max += is_inf ? 0 : (maxcount - 1)
1239 * (minnext + data->pos_delta);
c277df42
IZ
1240 }
1241 }
1242 /* It is counted once already... */
1243 data->pos_min += minnext * (mincount - counted);
1244 data->pos_delta += - counted * deltanext +
1245 (minnext + deltanext) * maxcount - minnext * mincount;
1246 if (mincount != maxcount) {
653099ff
GS
1247 /* Cannot extend fixed substrings found inside
1248 the group. */
830247a4 1249 scan_commit(pRExC_state,data);
c277df42
IZ
1250 if (mincount && last_str) {
1251 sv_setsv(data->last_found, last_str);
1252 data->last_end = data->pos_min;
b81d288d 1253 data->last_start_min =
a0ed51b3 1254 data->pos_min - CHR_SVLEN(last_str);
b81d288d
AB
1255 data->last_start_max = is_inf
1256 ? I32_MAX
c277df42 1257 : data->pos_min + data->pos_delta
a0ed51b3 1258 - CHR_SVLEN(last_str);
c277df42
IZ
1259 }
1260 data->longest = &(data->longest_float);
1261 }
aca2d497 1262 SvREFCNT_dec(last_str);
c277df42 1263 }
405ff068 1264 if (data && (fl & SF_HAS_EVAL))
c277df42
IZ
1265 data->flags |= SF_HAS_EVAL;
1266 optimize_curly_tail:
c277df42 1267 if (OP(oscan) != CURLYX) {
22c35a8c 1268 while (PL_regkind[(U8)OP(next = regnext(oscan))] == NOTHING
c277df42
IZ
1269 && NEXT_OFF(next))
1270 NEXT_OFF(oscan) += NEXT_OFF(next);
1271 }
c277df42 1272 continue;
653099ff 1273 default: /* REF and CLUMP only? */
c277df42 1274 if (flags & SCF_DO_SUBSTR) {
830247a4 1275 scan_commit(pRExC_state,data); /* Cannot expect anything... */
c277df42
IZ
1276 data->longest = &(data->longest_float);
1277 }
aca2d497 1278 is_inf = is_inf_internal = 1;
653099ff 1279 if (flags & SCF_DO_STCLASS_OR)
830247a4 1280 cl_anything(pRExC_state, data->start_class);
653099ff 1281 flags &= ~SCF_DO_STCLASS;
c277df42
IZ
1282 break;
1283 }
a0ed51b3 1284 }
ffc61ed2 1285 else if (strchr((char*)PL_simple,OP(scan))) {
9c5ffd7c 1286 int value = 0;
653099ff 1287
c277df42 1288 if (flags & SCF_DO_SUBSTR) {
830247a4 1289 scan_commit(pRExC_state,data);
c277df42
IZ
1290 data->pos_min++;
1291 }
1292 min++;
653099ff
GS
1293 if (flags & SCF_DO_STCLASS) {
1294 data->start_class->flags &= ~ANYOF_EOS; /* No match on empty */
1295
1296 /* Some of the logic below assumes that switching
1297 locale on will only add false positives. */
1298 switch (PL_regkind[(U8)OP(scan)]) {
653099ff 1299 case SANY:
653099ff
GS
1300 default:
1301 do_default:
1302 /* Perl_croak(aTHX_ "panic: unexpected simple REx opcode %d", OP(scan)); */
1303 if (flags & SCF_DO_STCLASS_OR) /* Allow everything */
830247a4 1304 cl_anything(pRExC_state, data->start_class);
653099ff
GS
1305 break;
1306 case REG_ANY:
1307 if (OP(scan) == SANY)
1308 goto do_default;
1309 if (flags & SCF_DO_STCLASS_OR) { /* Everything but \n */
1310 value = (ANYOF_BITMAP_TEST(data->start_class,'\n')
1311 || (data->start_class->flags & ANYOF_CLASS));
830247a4 1312 cl_anything(pRExC_state, data->start_class);
653099ff
GS
1313 }
1314 if (flags & SCF_DO_STCLASS_AND || !value)
1315 ANYOF_BITMAP_CLEAR(data->start_class,'\n');
1316 break;
1317 case ANYOF:
1318 if (flags & SCF_DO_STCLASS_AND)
1319 cl_and(data->start_class,
1320 (struct regnode_charclass_class*)scan);
1321 else
830247a4 1322 cl_or(pRExC_state, data->start_class,
653099ff
GS
1323 (struct regnode_charclass_class*)scan);
1324 break;
1325 case ALNUM:
1326 if (flags & SCF_DO_STCLASS_AND) {
1327 if (!(data->start_class->flags & ANYOF_LOCALE)) {
1328 ANYOF_CLASS_CLEAR(data->start_class,ANYOF_NALNUM);
1329 for (value = 0; value < 256; value++)
1330 if (!isALNUM(value))
1331 ANYOF_BITMAP_CLEAR(data->start_class, value);
1332 }
1333 }
1334 else {
1335 if (data->start_class->flags & ANYOF_LOCALE)
1336 ANYOF_CLASS_SET(data->start_class,ANYOF_ALNUM);
1337 else {
1338 for (value = 0; value < 256; value++)
1339 if (isALNUM(value))
b81d288d 1340 ANYOF_BITMAP_SET(data->start_class, value);
653099ff
GS
1341 }
1342 }
1343 break;
1344 case ALNUML:
1345 if (flags & SCF_DO_STCLASS_AND) {
1346 if (data->start_class->flags & ANYOF_LOCALE)
1347 ANYOF_CLASS_CLEAR(data->start_class,ANYOF_NALNUM);
1348 }
1349 else {
1350 ANYOF_CLASS_SET(data->start_class,ANYOF_ALNUM);
1351 data->start_class->flags |= ANYOF_LOCALE;
1352 }
1353 break;
1354 case NALNUM:
1355 if (flags & SCF_DO_STCLASS_AND) {
1356 if (!(data->start_class->flags & ANYOF_LOCALE)) {
1357 ANYOF_CLASS_CLEAR(data->start_class,ANYOF_ALNUM);
1358 for (value = 0; value < 256; value++)
1359 if (isALNUM(value))
1360 ANYOF_BITMAP_CLEAR(data->start_class, value);
1361 }
1362 }
1363 else {
1364 if (data->start_class->flags & ANYOF_LOCALE)
1365 ANYOF_CLASS_SET(data->start_class,ANYOF_NALNUM);
1366 else {
1367 for (value = 0; value < 256; value++)
1368 if (!isALNUM(value))
b81d288d 1369 ANYOF_BITMAP_SET(data->start_class, value);
653099ff
GS
1370 }
1371 }
1372 break;
1373 case NALNUML:
1374 if (flags & SCF_DO_STCLASS_AND) {
1375 if (data->start_class->flags & ANYOF_LOCALE)
1376 ANYOF_CLASS_CLEAR(data->start_class,ANYOF_ALNUM);
1377 }
1378 else {
1379 data->start_class->flags |= ANYOF_LOCALE;
1380 ANYOF_CLASS_SET(data->start_class,ANYOF_NALNUM);
1381 }
1382 break;
1383 case SPACE:
1384 if (flags & SCF_DO_STCLASS_AND) {
1385 if (!(data->start_class->flags & ANYOF_LOCALE)) {
1386 ANYOF_CLASS_CLEAR(data->start_class,ANYOF_NSPACE);
1387 for (value = 0; value < 256; value++)
1388 if (!isSPACE(value))
1389 ANYOF_BITMAP_CLEAR(data->start_class, value);
1390 }
1391 }
1392 else {
1393 if (data->start_class->flags & ANYOF_LOCALE)
1394 ANYOF_CLASS_SET(data->start_class,ANYOF_SPACE);
1395 else {
1396 for (value = 0; value < 256; value++)
1397 if (isSPACE(value))
b81d288d 1398 ANYOF_BITMAP_SET(data->start_class, value);
653099ff
GS
1399 }
1400 }
1401 break;
1402 case SPACEL:
1403 if (flags & SCF_DO_STCLASS_AND) {
1404 if (data->start_class->flags & ANYOF_LOCALE)
1405 ANYOF_CLASS_CLEAR(data->start_class,ANYOF_NSPACE);
1406 }
1407 else {
1408 data->start_class->flags |= ANYOF_LOCALE;
1409 ANYOF_CLASS_SET(data->start_class,ANYOF_SPACE);
1410 }
1411 break;
1412 case NSPACE:
1413 if (flags & SCF_DO_STCLASS_AND) {
1414 if (!(data->start_class->flags & ANYOF_LOCALE)) {
1415 ANYOF_CLASS_CLEAR(data->start_class,ANYOF_SPACE);
1416 for (value = 0; value < 256; value++)
1417 if (isSPACE(value))
1418 ANYOF_BITMAP_CLEAR(data->start_class, value);
1419 }
1420 }
1421 else {
1422 if (data->start_class->flags & ANYOF_LOCALE)
1423 ANYOF_CLASS_SET(data->start_class,ANYOF_NSPACE);
1424 else {
1425 for (value = 0; value < 256; value++)
1426 if (!isSPACE(value))
b81d288d 1427 ANYOF_BITMAP_SET(data->start_class, value);
653099ff
GS
1428 }
1429 }
1430 break;
1431 case NSPACEL:
1432 if (flags & SCF_DO_STCLASS_AND) {
1433 if (data->start_class->flags & ANYOF_LOCALE) {
1434 ANYOF_CLASS_CLEAR(data->start_class,ANYOF_SPACE);
1435 for (value = 0; value < 256; value++)
1436 if (!isSPACE(value))
1437 ANYOF_BITMAP_CLEAR(data->start_class, value);
1438 }
1439 }
1440 else {
1441 data->start_class->flags |= ANYOF_LOCALE;
1442 ANYOF_CLASS_SET(data->start_class,ANYOF_NSPACE);
1443 }
1444 break;
1445 case DIGIT:
1446 if (flags & SCF_DO_STCLASS_AND) {
1447 ANYOF_CLASS_CLEAR(data->start_class,ANYOF_NDIGIT);
1448 for (value = 0; value < 256; value++)
1449 if (!isDIGIT(value))
1450 ANYOF_BITMAP_CLEAR(data->start_class, value);
1451 }
1452 else {
1453 if (data->start_class->flags & ANYOF_LOCALE)
1454 ANYOF_CLASS_SET(data->start_class,ANYOF_DIGIT);
1455 else {
1456 for (value = 0; value < 256; value++)
1457 if (isDIGIT(value))
b81d288d 1458 ANYOF_BITMAP_SET(data->start_class, value);
653099ff
GS
1459 }
1460 }
1461 break;
1462 case NDIGIT:
1463 if (flags & SCF_DO_STCLASS_AND) {
1464 ANYOF_CLASS_CLEAR(data->start_class,ANYOF_DIGIT);
1465 for (value = 0; value < 256; value++)
1466 if (isDIGIT(value))
1467 ANYOF_BITMAP_CLEAR(data->start_class, value);
1468 }
1469 else {
1470 if (data->start_class->flags & ANYOF_LOCALE)
1471 ANYOF_CLASS_SET(data->start_class,ANYOF_NDIGIT);
1472 else {
1473 for (value = 0; value < 256; value++)
1474 if (!isDIGIT(value))
b81d288d 1475 ANYOF_BITMAP_SET(data->start_class, value);
653099ff
GS
1476 }
1477 }
1478 break;
1479 }
1480 if (flags & SCF_DO_STCLASS_OR)
1481 cl_and(data->start_class, &and_with);
1482 flags &= ~SCF_DO_STCLASS;
1483 }
a0ed51b3 1484 }
22c35a8c 1485 else if (PL_regkind[(U8)OP(scan)] == EOL && flags & SCF_DO_SUBSTR) {
c277df42
IZ
1486 data->flags |= (OP(scan) == MEOL
1487 ? SF_BEFORE_MEOL
1488 : SF_BEFORE_SEOL);
a0ed51b3 1489 }
653099ff
GS
1490 else if ( PL_regkind[(U8)OP(scan)] == BRANCHJ
1491 /* Lookbehind, or need to calculate parens/evals/stclass: */
1492 && (scan->flags || data || (flags & SCF_DO_STCLASS))
c277df42 1493 && (OP(scan) == IFMATCH || OP(scan) == UNLESSM)) {
653099ff 1494 /* Lookahead/lookbehind */
cb434fcc 1495 I32 deltanext, minnext, fake = 0;
c277df42 1496 regnode *nscan;
653099ff
GS
1497 struct regnode_charclass_class intrnl;
1498 int f = 0;
c277df42
IZ
1499
1500 data_fake.flags = 0;
b81d288d 1501 if (data) {
2c2d71f5 1502 data_fake.whilem_c = data->whilem_c;
cb434fcc
IZ
1503 data_fake.last_closep = data->last_closep;
1504 }
1505 else
1506 data_fake.last_closep = &fake;
653099ff
GS
1507 if ( flags & SCF_DO_STCLASS && !scan->flags
1508 && OP(scan) == IFMATCH ) { /* Lookahead */
830247a4 1509 cl_init(pRExC_state, &intrnl);
653099ff 1510 data_fake.start_class = &intrnl;
e1901655 1511 f |= SCF_DO_STCLASS_AND;
653099ff 1512 }
e1901655
IZ
1513 if (flags & SCF_WHILEM_VISITED_POS)
1514 f |= SCF_WHILEM_VISITED_POS;
c277df42
IZ
1515 next = regnext(scan);
1516 nscan = NEXTOPER(NEXTOPER(scan));
830247a4 1517 minnext = study_chunk(pRExC_state, &nscan, &deltanext, last, &data_fake, f);
c277df42
IZ
1518 if (scan->flags) {
1519 if (deltanext) {
9baa0206 1520 vFAIL("Variable length lookbehind not implemented");
a0ed51b3
LW
1521 }
1522 else if (minnext > U8_MAX) {
9baa0206 1523 vFAIL2("Lookbehind longer than %"UVuf" not implemented", (UV)U8_MAX);
c277df42
IZ
1524 }
1525 scan->flags = minnext;
1526 }
1527 if (data && data_fake.flags & (SF_HAS_PAR|SF_IN_PAR))
1528 pars++;
405ff068 1529 if (data && (data_fake.flags & SF_HAS_EVAL))
c277df42 1530 data->flags |= SF_HAS_EVAL;
2c2d71f5
JH
1531 if (data)
1532 data->whilem_c = data_fake.whilem_c;
e1901655 1533 if (f & SCF_DO_STCLASS_AND) {
653099ff
GS
1534 int was = (data->start_class->flags & ANYOF_EOS);
1535
1536 cl_and(data->start_class, &intrnl);
1537 if (was)
1538 data->start_class->flags |= ANYOF_EOS;
1539 }
a0ed51b3
LW
1540 }
1541 else if (OP(scan) == OPEN) {
c277df42 1542 pars++;
a0ed51b3 1543 }
cb434fcc
IZ
1544 else if (OP(scan) == CLOSE) {
1545 if (ARG(scan) == is_par) {
1546 next = regnext(scan);
c277df42 1547
cb434fcc
IZ
1548 if ( next && (OP(next) != WHILEM) && next < last)
1549 is_par = 0; /* Disable optimization */
1550 }
1551 if (data)
1552 *(data->last_closep) = ARG(scan);
a0ed51b3
LW
1553 }
1554 else if (OP(scan) == EVAL) {
c277df42
IZ
1555 if (data)
1556 data->flags |= SF_HAS_EVAL;
1557 }
96776eda 1558 else if (OP(scan) == LOGICAL && scan->flags == 2) { /* Embedded follows */
0f5d15d6 1559 if (flags & SCF_DO_SUBSTR) {
830247a4 1560 scan_commit(pRExC_state,data);
0f5d15d6
IZ
1561 data->longest = &(data->longest_float);
1562 }
1563 is_inf = is_inf_internal = 1;
653099ff 1564 if (flags & SCF_DO_STCLASS_OR) /* Allow everything */
830247a4 1565 cl_anything(pRExC_state, data->start_class);
96776eda 1566 flags &= ~SCF_DO_STCLASS;
0f5d15d6 1567 }
c277df42
IZ
1568 /* Else: zero-length, ignore. */
1569 scan = regnext(scan);
1570 }
1571
1572 finish:
1573 *scanp = scan;
aca2d497 1574 *deltap = is_inf_internal ? I32_MAX : delta;
b81d288d 1575 if (flags & SCF_DO_SUBSTR && is_inf)
c277df42
IZ
1576 data->pos_delta = I32_MAX - data->pos_min;
1577 if (is_par > U8_MAX)
1578 is_par = 0;
1579 if (is_par && pars==1 && data) {
1580 data->flags |= SF_IN_PAR;
1581 data->flags &= ~SF_HAS_PAR;
a0ed51b3
LW
1582 }
1583 else if (pars && data) {
c277df42
IZ
1584 data->flags |= SF_HAS_PAR;
1585 data->flags &= ~SF_IN_PAR;
1586 }
653099ff
GS
1587 if (flags & SCF_DO_STCLASS_OR)
1588 cl_and(data->start_class, &and_with);
c277df42
IZ
1589 return min;
1590}
1591
76e3520e 1592STATIC I32
830247a4 1593S_add_data(pTHX_ RExC_state_t *pRExC_state, I32 n, char *s)
c277df42 1594{
830247a4 1595 if (RExC_rx->data) {
b81d288d
AB
1596 Renewc(RExC_rx->data,
1597 sizeof(*RExC_rx->data) + sizeof(void*) * (RExC_rx->data->count + n - 1),
c277df42 1598 char, struct reg_data);
830247a4
IZ
1599 Renew(RExC_rx->data->what, RExC_rx->data->count + n, U8);
1600 RExC_rx->data->count += n;
a0ed51b3
LW
1601 }
1602 else {
830247a4 1603 Newc(1207, RExC_rx->data, sizeof(*RExC_rx->data) + sizeof(void*) * (n - 1),
c277df42 1604 char, struct reg_data);
830247a4
IZ
1605 New(1208, RExC_rx->data->what, n, U8);
1606 RExC_rx->data->count = n;
c277df42 1607 }
830247a4
IZ
1608 Copy(s, RExC_rx->data->what + RExC_rx->data->count - n, n, U8);
1609 return RExC_rx->data->count - n;
c277df42
IZ
1610}
1611
d88dccdf 1612void
864dbfa3 1613Perl_reginitcolors(pTHX)
d88dccdf 1614{
d88dccdf
IZ
1615 int i = 0;
1616 char *s = PerlEnv_getenv("PERL_RE_COLORS");
b81d288d 1617
d88dccdf
IZ
1618 if (s) {
1619 PL_colors[0] = s = savepv(s);
1620 while (++i < 6) {
1621 s = strchr(s, '\t');
1622 if (s) {
1623 *s = '\0';
1624 PL_colors[i] = ++s;
1625 }
1626 else
c712d376 1627 PL_colors[i] = s = "";
d88dccdf
IZ
1628 }
1629 } else {
b81d288d 1630 while (i < 6)
d88dccdf
IZ
1631 PL_colors[i++] = "";
1632 }
1633 PL_colorset = 1;
1634}
1635
8615cb43 1636
a687059c 1637/*
e50aee73 1638 - pregcomp - compile a regular expression into internal code
a687059c
LW
1639 *
1640 * We can't allocate space until we know how big the compiled form will be,
1641 * but we can't compile it (and thus know how big it is) until we've got a
1642 * place to put the code. So we cheat: we compile it twice, once with code
1643 * generation turned off and size counting turned on, and once "for real".
1644 * This also means that we don't allocate space until we are sure that the
1645 * thing really will compile successfully, and we never have to move the
1646 * code and thus invalidate pointers into it. (Note that it has to be in
1647 * one piece because free() must be able to free it all.) [NB: not true in perl]
1648 *
1649 * Beware that the optimization-preparation code in here knows about some
1650 * of the structure of the compiled regexp. [I'll say.]
1651 */
1652regexp *
864dbfa3 1653Perl_pregcomp(pTHX_ char *exp, char *xend, PMOP *pm)
a687059c 1654{
a0d0e21e 1655 register regexp *r;
c277df42 1656 regnode *scan;
c277df42 1657 regnode *first;
a0d0e21e 1658 I32 flags;
a0d0e21e
LW
1659 I32 minlen = 0;
1660 I32 sawplus = 0;
1661 I32 sawopen = 0;
2c2d71f5 1662 scan_data_t data;
830247a4
IZ
1663 RExC_state_t RExC_state;
1664 RExC_state_t *pRExC_state = &RExC_state;
a0d0e21e
LW
1665
1666 if (exp == NULL)
c277df42 1667 FAIL("NULL regexp argument");
a0d0e21e 1668
84e09d5e
JH
1669 /* XXXX This looks very suspicious... */
1670 if (pm->op_pmdynflags & PMdf_CMP_UTF8)
1671 RExC_utf8 = 1;
1672 else
1673 RExC_utf8 = 0;
a0ed51b3 1674
5cfc7842 1675 RExC_precomp = exp;
35ef4773 1676 DEBUG_r(if (!PL_colorset) reginitcolors());
2c2d71f5 1677 DEBUG_r(PerlIO_printf(Perl_debug_log, "%sCompiling REx%s `%s%*s%s'\n",
d88dccdf 1678 PL_colors[4],PL_colors[5],PL_colors[0],
830247a4
IZ
1679 (int)(xend - exp), RExC_precomp, PL_colors[1]));
1680 RExC_flags16 = pm->op_pmflags;
1681 RExC_sawback = 0;
bbce6d69 1682
830247a4
IZ
1683 RExC_seen = 0;
1684 RExC_seen_zerolen = *exp == '^' ? -1 : 0;
1685 RExC_seen_evals = 0;
1686 RExC_extralen = 0;
c277df42 1687
bbce6d69 1688 /* First pass: determine size, legality. */
830247a4 1689 RExC_parse = exp;
fac92740 1690 RExC_start = exp;
830247a4
IZ
1691 RExC_end = xend;
1692 RExC_naughty = 0;
1693 RExC_npar = 1;
1694 RExC_size = 0L;
1695 RExC_emit = &PL_regdummy;
1696 RExC_whilem_seen = 0;
85ddcde9
JH
1697#if 0 /* REGC() is (currently) a NOP at the first pass.
1698 * Clever compilers notice this and complain. --jhi */
830247a4 1699 REGC((U8)REG_MAGIC, (char*)RExC_emit);
85ddcde9 1700#endif
830247a4 1701 if (reg(pRExC_state, 0, &flags) == NULL) {
830247a4 1702 RExC_precomp = Nullch;
a0d0e21e
LW
1703 return(NULL);
1704 }
830247a4 1705 DEBUG_r(PerlIO_printf(Perl_debug_log, "size %"IVdf" ", (IV)RExC_size));
c277df42 1706
c277df42
IZ
1707 /* Small enough for pointer-storage convention?
1708 If extralen==0, this means that we will not need long jumps. */
830247a4
IZ
1709 if (RExC_size >= 0x10000L && RExC_extralen)
1710 RExC_size += RExC_extralen;
c277df42 1711 else
830247a4
IZ
1712 RExC_extralen = 0;
1713 if (RExC_whilem_seen > 15)
1714 RExC_whilem_seen = 15;
a0d0e21e 1715
bbce6d69 1716 /* Allocate space and initialize. */
830247a4 1717 Newc(1001, r, sizeof(regexp) + (unsigned)RExC_size * sizeof(regnode),
c277df42 1718 char, regexp);
a0d0e21e 1719 if (r == NULL)
b45f050a
JF
1720 FAIL("Regexp out of space");
1721
0f79a09d
GS
1722#ifdef DEBUGGING
1723 /* avoid reading uninitialized memory in DEBUGGING code in study_chunk() */
830247a4 1724 Zero(r, sizeof(regexp) + (unsigned)RExC_size * sizeof(regnode), char);
0f79a09d 1725#endif
c277df42 1726 r->refcnt = 1;
bbce6d69 1727 r->prelen = xend - exp;
5cfc7842 1728 r->precomp = savepvn(RExC_precomp, r->prelen);
cf93c79d
IZ
1729 r->subbeg = NULL;
1730 r->reganch = pm->op_pmflags & PMf_COMPILETIME;
830247a4 1731 r->nparens = RExC_npar - 1; /* set early to validate backrefs */
4327152a
IZ
1732
1733 r->substrs = 0; /* Useful during FAIL. */
1734 r->startp = 0; /* Useful during FAIL. */
1735 r->endp = 0; /* Useful during FAIL. */
1736
fac92740
MJD
1737 Newz(1304, r->offsets, 2*RExC_size+1, U32); /* MJD 20001228 */
1738 if (r->offsets) {
1739 r->offsets[0] = RExC_size;
1740 }
1741 DEBUG_r(PerlIO_printf(Perl_debug_log,
392fbf5d 1742 "%s %"UVuf" bytes for offset annotations.\n",
fac92740 1743 r->offsets ? "Got" : "Couldn't get",
392fbf5d 1744 (UV)((2*RExC_size+1) * sizeof(U32))));
fac92740 1745
830247a4 1746 RExC_rx = r;
bbce6d69 1747
1748 /* Second pass: emit code. */
055bb491 1749 RExC_flags16 = pm->op_pmflags; /* don't let top level (?i) bleed */
830247a4
IZ
1750 RExC_parse = exp;
1751 RExC_end = xend;
1752 RExC_naughty = 0;
1753 RExC_npar = 1;
fac92740 1754 RExC_emit_start = r->program;
830247a4 1755 RExC_emit = r->program;
2cd61cdb 1756 /* Store the count of eval-groups for security checks: */
830247a4
IZ
1757 RExC_emit->next_off = ((RExC_seen_evals > U16_MAX) ? U16_MAX : RExC_seen_evals);
1758 REGC((U8)REG_MAGIC, (char*) RExC_emit++);
c277df42 1759 r->data = 0;
830247a4 1760 if (reg(pRExC_state, 0, &flags) == NULL)
a0d0e21e
LW
1761 return(NULL);
1762
1763 /* Dig out information for optimizations. */
cf93c79d 1764 r->reganch = pm->op_pmflags & PMf_COMPILETIME; /* Again? */
830247a4 1765 pm->op_pmflags = RExC_flags16;
a0ed51b3
LW
1766 if (UTF)
1767 r->reganch |= ROPT_UTF8;
c277df42 1768 r->regstclass = NULL;
830247a4 1769 if (RExC_naughty >= 10) /* Probably an expensive pattern. */
a0ed51b3 1770 r->reganch |= ROPT_NAUGHTY;
c277df42 1771 scan = r->program + 1; /* First BRANCH. */
2779dcf1
IZ
1772
1773 /* XXXX To minimize changes to RE engine we always allocate
1774 3-units-long substrs field. */
1775 Newz(1004, r->substrs, 1, struct reg_substr_data);
1776
2c2d71f5 1777 StructCopy(&zero_scan_data, &data, scan_data_t);
653099ff 1778 /* XXXX Should not we check for something else? Usually it is OPEN1... */
c277df42 1779 if (OP(scan) != BRANCH) { /* Only one top-level choice. */
c277df42 1780 I32 fake;
c5254dd6 1781 STRLEN longest_float_length, longest_fixed_length;
653099ff
GS
1782 struct regnode_charclass_class ch_class;
1783 int stclass_flag;
cb434fcc 1784 I32 last_close = 0;
a0d0e21e
LW
1785
1786 first = scan;
c277df42 1787 /* Skip introductions and multiplicators >= 1. */
a0d0e21e 1788 while ((OP(first) == OPEN && (sawopen = 1)) ||
653099ff 1789 /* An OR of *one* alternative - should not happen now. */
a0d0e21e
LW
1790 (OP(first) == BRANCH && OP(regnext(first)) != BRANCH) ||
1791 (OP(first) == PLUS) ||
1792 (OP(first) == MINMOD) ||
653099ff 1793 /* An {n,m} with n>0 */
22c35a8c 1794 (PL_regkind[(U8)OP(first)] == CURLY && ARG1(first) > 0) ) {
a0d0e21e
LW
1795 if (OP(first) == PLUS)
1796 sawplus = 1;
1797 else
1798 first += regarglen[(U8)OP(first)];
1799 first = NEXTOPER(first);
a687059c
LW
1800 }
1801
a0d0e21e
LW
1802 /* Starting-point info. */
1803 again:
653099ff 1804 if (PL_regkind[(U8)OP(first)] == EXACT) {
1aa99e6b
IH
1805 if (OP(first) == EXACT)
1806 ; /* Empty, get anchored substr later. */
1807 else if ((OP(first) == EXACTF || OP(first) == EXACTFL))
b3c9acc1
IZ
1808 r->regstclass = first;
1809 }
653099ff 1810 else if (strchr((char*)PL_simple,OP(first)))
a0d0e21e 1811 r->regstclass = first;
22c35a8c
GS
1812 else if (PL_regkind[(U8)OP(first)] == BOUND ||
1813 PL_regkind[(U8)OP(first)] == NBOUND)
a0d0e21e 1814 r->regstclass = first;
22c35a8c 1815 else if (PL_regkind[(U8)OP(first)] == BOL) {
cad2e5aa
JH
1816 r->reganch |= (OP(first) == MBOL
1817 ? ROPT_ANCH_MBOL
1818 : (OP(first) == SBOL
1819 ? ROPT_ANCH_SBOL
1820 : ROPT_ANCH_BOL));
a0d0e21e 1821 first = NEXTOPER(first);
774d564b 1822 goto again;
1823 }
1824 else if (OP(first) == GPOS) {
1825 r->reganch |= ROPT_ANCH_GPOS;
1826 first = NEXTOPER(first);
1827 goto again;
a0d0e21e
LW
1828 }
1829 else if ((OP(first) == STAR &&
22c35a8c 1830 PL_regkind[(U8)OP(NEXTOPER(first))] == REG_ANY) &&
a0d0e21e
LW
1831 !(r->reganch & ROPT_ANCH) )
1832 {
1833 /* turn .* into ^.* with an implied $*=1 */
cad2e5aa
JH
1834 int type = OP(NEXTOPER(first));
1835
ffc61ed2 1836 if (type == REG_ANY)
cad2e5aa
JH
1837 type = ROPT_ANCH_MBOL;
1838 else
1839 type = ROPT_ANCH_SBOL;
1840
1841 r->reganch |= type | ROPT_IMPLICIT;
a0d0e21e 1842 first = NEXTOPER(first);
774d564b 1843 goto again;
a0d0e21e 1844 }
b81d288d 1845 if (sawplus && (!sawopen || !RExC_sawback)
830247a4 1846 && !(RExC_seen & REG_SEEN_EVAL)) /* May examine pos and $& */
cad2e5aa
JH
1847 /* x+ must match at the 1st pos of run of x's */
1848 r->reganch |= ROPT_SKIP;
a0d0e21e 1849
c277df42 1850 /* Scan is after the zeroth branch, first is atomic matcher. */
b81d288d 1851 DEBUG_r(PerlIO_printf(Perl_debug_log, "first at %"IVdf"\n",
b900a521 1852 (IV)(first - scan + 1)));
a0d0e21e
LW
1853 /*
1854 * If there's something expensive in the r.e., find the
1855 * longest literal string that must appear and make it the
1856 * regmust. Resolve ties in favor of later strings, since
1857 * the regstart check works with the beginning of the r.e.
1858 * and avoiding duplication strengthens checking. Not a
1859 * strong reason, but sufficient in the absence of others.
1860 * [Now we resolve ties in favor of the earlier string if
c277df42 1861 * it happens that c_offset_min has been invalidated, since the
a0d0e21e
LW
1862 * earlier string may buy us something the later one won't.]
1863 */
a0d0e21e 1864 minlen = 0;
a687059c 1865
79cb57f6
GS
1866 data.longest_fixed = newSVpvn("",0);
1867 data.longest_float = newSVpvn("",0);
1868 data.last_found = newSVpvn("",0);
c277df42
IZ
1869 data.longest = &(data.longest_fixed);
1870 first = scan;
653099ff 1871 if (!r->regstclass) {
830247a4 1872 cl_init(pRExC_state, &ch_class);
653099ff
GS
1873 data.start_class = &ch_class;
1874 stclass_flag = SCF_DO_STCLASS_AND;
1875 } else /* XXXX Check for BOUND? */
1876 stclass_flag = 0;
cb434fcc 1877 data.last_closep = &last_close;
653099ff 1878
830247a4 1879 minlen = study_chunk(pRExC_state, &first, &fake, scan + RExC_size, /* Up to end */
e1901655 1880 &data, SCF_DO_SUBSTR | SCF_WHILEM_VISITED_POS | stclass_flag);
830247a4 1881 if ( RExC_npar == 1 && data.longest == &(data.longest_fixed)
b81d288d 1882 && data.last_start_min == 0 && data.last_end > 0
830247a4
IZ
1883 && !RExC_seen_zerolen
1884 && (!(RExC_seen & REG_SEEN_GPOS) || (r->reganch & ROPT_ANCH_GPOS)))
c277df42 1885 r->reganch |= ROPT_CHECK_ALL;
830247a4 1886 scan_commit(pRExC_state, &data);
c277df42
IZ
1887 SvREFCNT_dec(data.last_found);
1888
a0ed51b3 1889 longest_float_length = CHR_SVLEN(data.longest_float);
c5254dd6 1890 if (longest_float_length
c277df42
IZ
1891 || (data.flags & SF_FL_BEFORE_EOL
1892 && (!(data.flags & SF_FL_BEFORE_MEOL)
830247a4 1893 || (RExC_flags16 & PMf_MULTILINE)))) {
cf93c79d
IZ
1894 int t;
1895
a0ed51b3 1896 if (SvCUR(data.longest_fixed) /* ok to leave SvCUR */
aca2d497
IZ
1897 && data.offset_fixed == data.offset_float_min
1898 && SvCUR(data.longest_fixed) == SvCUR(data.longest_float))
1899 goto remove_float; /* As in (a)+. */
1900
c277df42
IZ
1901 r->float_substr = data.longest_float;
1902 r->float_min_offset = data.offset_float_min;
1903 r->float_max_offset = data.offset_float_max;
cf93c79d
IZ
1904 t = (data.flags & SF_FL_BEFORE_EOL /* Can't have SEOL and MULTI */
1905 && (!(data.flags & SF_FL_BEFORE_MEOL)
830247a4 1906 || (RExC_flags16 & PMf_MULTILINE)));
cf93c79d 1907 fbm_compile(r->float_substr, t ? FBMcf_TAIL : 0);
a0ed51b3
LW
1908 }
1909 else {
aca2d497 1910 remove_float:
c277df42
IZ
1911 r->float_substr = Nullsv;
1912 SvREFCNT_dec(data.longest_float);
c5254dd6 1913 longest_float_length = 0;
a0d0e21e 1914 }
c277df42 1915
a0ed51b3 1916 longest_fixed_length = CHR_SVLEN(data.longest_fixed);
c5254dd6 1917 if (longest_fixed_length
c277df42
IZ
1918 || (data.flags & SF_FIX_BEFORE_EOL /* Cannot have SEOL and MULTI */
1919 && (!(data.flags & SF_FIX_BEFORE_MEOL)
830247a4 1920 || (RExC_flags16 & PMf_MULTILINE)))) {
cf93c79d
IZ
1921 int t;
1922
c277df42
IZ
1923 r->anchored_substr = data.longest_fixed;
1924 r->anchored_offset = data.offset_fixed;
cf93c79d
IZ
1925 t = (data.flags & SF_FIX_BEFORE_EOL /* Can't have SEOL and MULTI */
1926 && (!(data.flags & SF_FIX_BEFORE_MEOL)
830247a4 1927 || (RExC_flags16 & PMf_MULTILINE)));
cf93c79d 1928 fbm_compile(r->anchored_substr, t ? FBMcf_TAIL : 0);
a0ed51b3
LW
1929 }
1930 else {
c277df42
IZ
1931 r->anchored_substr = Nullsv;
1932 SvREFCNT_dec(data.longest_fixed);
c5254dd6 1933 longest_fixed_length = 0;
a0d0e21e 1934 }
b81d288d 1935 if (r->regstclass
ffc61ed2 1936 && (OP(r->regstclass) == REG_ANY || OP(r->regstclass) == SANY))
653099ff
GS
1937 r->regstclass = NULL;
1938 if ((!r->anchored_substr || r->anchored_offset) && stclass_flag
1939 && !(data.start_class->flags & ANYOF_EOS)
1940 && !cl_is_anything(data.start_class)) {
830247a4 1941 I32 n = add_data(pRExC_state, 1, "f");
653099ff 1942
b81d288d 1943 New(1006, RExC_rx->data->data[n], 1,
653099ff
GS
1944 struct regnode_charclass_class);
1945 StructCopy(data.start_class,
830247a4 1946 (struct regnode_charclass_class*)RExC_rx->data->data[n],
653099ff 1947 struct regnode_charclass_class);
830247a4 1948 r->regstclass = (regnode*)RExC_rx->data->data[n];
653099ff 1949 r->reganch &= ~ROPT_SKIP; /* Used in find_byclass(). */
ffc61ed2 1950 PL_regdata = r->data; /* for regprop() */
9c5ffd7c
JH
1951 DEBUG_r({ SV *sv = sv_newmortal();
1952 regprop(sv, (regnode*)data.start_class);
1953 PerlIO_printf(Perl_debug_log,
1954 "synthetic stclass `%s'.\n",
1955 SvPVX(sv));});
653099ff 1956 }
c277df42
IZ
1957
1958 /* A temporary algorithm prefers floated substr to fixed one to dig more info. */
c5254dd6 1959 if (longest_fixed_length > longest_float_length) {
c277df42
IZ
1960 r->check_substr = r->anchored_substr;
1961 r->check_offset_min = r->check_offset_max = r->anchored_offset;
1962 if (r->reganch & ROPT_ANCH_SINGLE)
1963 r->reganch |= ROPT_NOSCAN;
a0ed51b3
LW
1964 }
1965 else {
c277df42
IZ
1966 r->check_substr = r->float_substr;
1967 r->check_offset_min = data.offset_float_min;
1968 r->check_offset_max = data.offset_float_max;
a0d0e21e 1969 }
30382c73
IZ
1970 /* XXXX Currently intuiting is not compatible with ANCH_GPOS.
1971 This should be changed ASAP! */
1972 if (r->check_substr && !(r->reganch & ROPT_ANCH_GPOS)) {
cad2e5aa
JH
1973 r->reganch |= RE_USE_INTUIT;
1974 if (SvTAIL(r->check_substr))
1975 r->reganch |= RE_INTUIT_TAIL;
1976 }
a0ed51b3
LW
1977 }
1978 else {
c277df42
IZ
1979 /* Several toplevels. Best we can is to set minlen. */
1980 I32 fake;
653099ff 1981 struct regnode_charclass_class ch_class;
cb434fcc 1982 I32 last_close = 0;
c277df42
IZ
1983
1984 DEBUG_r(PerlIO_printf(Perl_debug_log, "\n"));
1985 scan = r->program + 1;
830247a4 1986 cl_init(pRExC_state, &ch_class);
653099ff 1987 data.start_class = &ch_class;
cb434fcc 1988 data.last_closep = &last_close;
e1901655 1989 minlen = study_chunk(pRExC_state, &scan, &fake, scan + RExC_size, &data, SCF_DO_STCLASS_AND|SCF_WHILEM_VISITED_POS);
c277df42 1990 r->check_substr = r->anchored_substr = r->float_substr = Nullsv;
653099ff
GS
1991 if (!(data.start_class->flags & ANYOF_EOS)
1992 && !cl_is_anything(data.start_class)) {
830247a4 1993 I32 n = add_data(pRExC_state, 1, "f");
653099ff 1994
b81d288d 1995 New(1006, RExC_rx->data->data[n], 1,
653099ff
GS
1996 struct regnode_charclass_class);
1997 StructCopy(data.start_class,
830247a4 1998 (struct regnode_charclass_class*)RExC_rx->data->data[n],
653099ff 1999 struct regnode_charclass_class);
830247a4 2000 r->regstclass = (regnode*)RExC_rx->data->data[n];
653099ff 2001 r->reganch &= ~ROPT_SKIP; /* Used in find_byclass(). */
9c5ffd7c
JH
2002 DEBUG_r({ SV* sv = sv_newmortal();
2003 regprop(sv, (regnode*)data.start_class);
2004 PerlIO_printf(Perl_debug_log,
2005 "synthetic stclass `%s'.\n",
2006 SvPVX(sv));});
653099ff 2007 }
a0d0e21e
LW
2008 }
2009
a0d0e21e 2010 r->minlen = minlen;
b81d288d 2011 if (RExC_seen & REG_SEEN_GPOS)
c277df42 2012 r->reganch |= ROPT_GPOS_SEEN;
830247a4 2013 if (RExC_seen & REG_SEEN_LOOKBEHIND)
c277df42 2014 r->reganch |= ROPT_LOOKBEHIND_SEEN;
830247a4 2015 if (RExC_seen & REG_SEEN_EVAL)
ce862d02 2016 r->reganch |= ROPT_EVAL_SEEN;
f33976b4
DB
2017 if (RExC_seen & REG_SEEN_CANY)
2018 r->reganch |= ROPT_CANY_SEEN;
830247a4
IZ
2019 Newz(1002, r->startp, RExC_npar, I32);
2020 Newz(1002, r->endp, RExC_npar, I32);
ffc61ed2 2021 PL_regdata = r->data; /* for regprop() */
a0d0e21e
LW
2022 DEBUG_r(regdump(r));
2023 return(r);
a687059c
LW
2024}
2025
2026/*
2027 - reg - regular expression, i.e. main body or parenthesized thing
2028 *
2029 * Caller must absorb opening parenthesis.
2030 *
2031 * Combining parenthesis handling with the base level of regular expression
2032 * is a trifle forced, but the need to tie the tails of the branches to what
2033 * follows makes it hard to avoid.
2034 */
76e3520e 2035STATIC regnode *
830247a4 2036S_reg(pTHX_ RExC_state_t *pRExC_state, I32 paren, I32 *flagp)
c277df42 2037 /* paren: Parenthesized? 0=top, 1=(, inside: changed to letter. */
a687059c 2038{
c277df42
IZ
2039 register regnode *ret; /* Will be the head of the group. */
2040 register regnode *br;
2041 register regnode *lastbr;
2042 register regnode *ender = 0;
a0d0e21e 2043 register I32 parno = 0;
830247a4 2044 I32 flags, oregflags = RExC_flags16, have_branch = 0, open = 0;
9d1d55b5
JP
2045
2046 /* for (?g), (?gc), and (?o) warnings; warning
2047 about (?c) will warn about (?g) -- japhy */
2048
2049 I32 wastedflags = 0x00,
2050 wasted_o = 0x01,
2051 wasted_g = 0x02,
2052 wasted_gc = 0x02 | 0x04,
2053 wasted_c = 0x04;
2054
fac92740 2055 char * parse_start = RExC_parse; /* MJD */
830247a4 2056 char *oregcomp_parse = RExC_parse;
c277df42 2057 char c;
a0d0e21e 2058
821b33a5 2059 *flagp = 0; /* Tentatively. */
a0d0e21e 2060
9d1d55b5 2061
a0d0e21e
LW
2062 /* Make an OPEN node, if parenthesized. */
2063 if (paren) {
fac92740 2064 if (*RExC_parse == '?') { /* (?...) */
ca9dfc88
IZ
2065 U16 posflags = 0, negflags = 0;
2066 U16 *flagsp = &posflags;
0f5d15d6 2067 int logical = 0;
830247a4 2068 char *seqstart = RExC_parse;
ca9dfc88 2069
830247a4
IZ
2070 RExC_parse++;
2071 paren = *RExC_parse++;
c277df42 2072 ret = NULL; /* For look-ahead/behind. */
a0d0e21e 2073 switch (paren) {
fac92740 2074 case '<': /* (?<...) */
830247a4 2075 RExC_seen |= REG_SEEN_LOOKBEHIND;
b81d288d 2076 if (*RExC_parse == '!')
c277df42 2077 paren = ',';
b81d288d 2078 if (*RExC_parse != '=' && *RExC_parse != '!')
c277df42 2079 goto unknown;
830247a4 2080 RExC_parse++;
fac92740
MJD
2081 case '=': /* (?=...) */
2082 case '!': /* (?!...) */
830247a4 2083 RExC_seen_zerolen++;
fac92740
MJD
2084 case ':': /* (?:...) */
2085 case '>': /* (?>...) */
a0d0e21e 2086 break;
fac92740
MJD
2087 case '$': /* (?$...) */
2088 case '@': /* (?@...) */
8615cb43 2089 vFAIL2("Sequence (?%c...) not implemented", (int)paren);
a0d0e21e 2090 break;
fac92740 2091 case '#': /* (?#...) */
830247a4
IZ
2092 while (*RExC_parse && *RExC_parse != ')')
2093 RExC_parse++;
2094 if (*RExC_parse != ')')
c277df42 2095 FAIL("Sequence (?#... not terminated");
830247a4 2096 nextchar(pRExC_state);
a0d0e21e
LW
2097 *flagp = TRYAGAIN;
2098 return NULL;
fac92740 2099 case 'p': /* (?p...) */
f9373011
PM
2100 if (SIZE_ONLY && ckWARN2(WARN_DEPRECATED, WARN_REGEXP))
2101 vWARNdep(RExC_parse, "(?p{}) is deprecated - use (??{})");
8c8ad484 2102 /* FALL THROUGH*/
fac92740 2103 case '?': /* (??...) */
0f5d15d6 2104 logical = 1;
830247a4 2105 paren = *RExC_parse++;
0f5d15d6 2106 /* FALL THROUGH */
fac92740 2107 case '{': /* (?{...}) */
c277df42 2108 {
c277df42
IZ
2109 I32 count = 1, n = 0;
2110 char c;
830247a4 2111 char *s = RExC_parse;
c277df42
IZ
2112 SV *sv;
2113 OP_4tree *sop, *rop;
2114
830247a4
IZ
2115 RExC_seen_zerolen++;
2116 RExC_seen |= REG_SEEN_EVAL;
2117 while (count && (c = *RExC_parse)) {
2118 if (c == '\\' && RExC_parse[1])
2119 RExC_parse++;
b81d288d 2120 else if (c == '{')
c277df42 2121 count++;
b81d288d 2122 else if (c == '}')
c277df42 2123 count--;
830247a4 2124 RExC_parse++;
c277df42 2125 }
830247a4 2126 if (*RExC_parse != ')')
b45f050a 2127 {
b81d288d 2128 RExC_parse = s;
b45f050a
JF
2129 vFAIL("Sequence (?{...}) not terminated or not {}-balanced");
2130 }
c277df42
IZ
2131 if (!SIZE_ONLY) {
2132 AV *av;
b81d288d
AB
2133
2134 if (RExC_parse - 1 - s)
830247a4 2135 sv = newSVpvn(s, RExC_parse - 1 - s);
c277df42 2136 else
79cb57f6 2137 sv = newSVpvn("", 0);
c277df42 2138
569233ed
SB
2139 ENTER;
2140 Perl_save_re_context(aTHX);
c277df42 2141 rop = sv_compile_2op(sv, &sop, "re", &av);
9b978d73
DM
2142 sop->op_private |= OPpREFCOUNTED;
2143 /* re_dup will OpREFCNT_inc */
2144 OpREFCNT_set(sop, 1);
569233ed 2145 LEAVE;
c277df42 2146
830247a4
IZ
2147 n = add_data(pRExC_state, 3, "nop");
2148 RExC_rx->data->data[n] = (void*)rop;
2149 RExC_rx->data->data[n+1] = (void*)sop;
2150 RExC_rx->data->data[n+2] = (void*)av;
c277df42 2151 SvREFCNT_dec(sv);
a0ed51b3 2152 }
e24b16f9 2153 else { /* First pass */
830247a4 2154 if (PL_reginterp_cnt < ++RExC_seen_evals
e24b16f9 2155 && PL_curcop != &PL_compiling)
2cd61cdb
IZ
2156 /* No compiled RE interpolated, has runtime
2157 components ===> unsafe. */
2158 FAIL("Eval-group not allowed at runtime, use re 'eval'");
5b61d3f7 2159 if (PL_tainting && PL_tainted)
cc6b7395 2160 FAIL("Eval-group in insecure regular expression");
c277df42
IZ
2161 }
2162
830247a4 2163 nextchar(pRExC_state);
0f5d15d6 2164 if (logical) {
830247a4 2165 ret = reg_node(pRExC_state, LOGICAL);
0f5d15d6
IZ
2166 if (!SIZE_ONLY)
2167 ret->flags = 2;
830247a4 2168 regtail(pRExC_state, ret, reganode(pRExC_state, EVAL, n));
fac92740 2169 /* deal with the length of this later - MJD */
0f5d15d6
IZ
2170 return ret;
2171 }
830247a4 2172 return reganode(pRExC_state, EVAL, n);
c277df42 2173 }
fac92740 2174 case '(': /* (?(?{...})...) and (?(?=...)...) */
c277df42 2175 {
fac92740 2176 if (RExC_parse[0] == '?') { /* (?(?...)) */
b81d288d
AB
2177 if (RExC_parse[1] == '=' || RExC_parse[1] == '!'
2178 || RExC_parse[1] == '<'
830247a4 2179 || RExC_parse[1] == '{') { /* Lookahead or eval. */
c277df42
IZ
2180 I32 flag;
2181
830247a4 2182 ret = reg_node(pRExC_state, LOGICAL);
0f5d15d6
IZ
2183 if (!SIZE_ONLY)
2184 ret->flags = 1;
830247a4 2185 regtail(pRExC_state, ret, reg(pRExC_state, 1, &flag));
c277df42 2186 goto insert_if;
b81d288d 2187 }
a0ed51b3 2188 }
830247a4 2189 else if (RExC_parse[0] >= '1' && RExC_parse[0] <= '9' ) {
fac92740 2190 /* (?(1)...) */
830247a4 2191 parno = atoi(RExC_parse++);
c277df42 2192
830247a4
IZ
2193 while (isDIGIT(*RExC_parse))
2194 RExC_parse++;
fac92740
MJD
2195 ret = reganode(pRExC_state, GROUPP, parno);
2196
830247a4 2197 if ((c = *nextchar(pRExC_state)) != ')')
b45f050a 2198 vFAIL("Switch condition not recognized");
c277df42 2199 insert_if:
830247a4
IZ
2200 regtail(pRExC_state, ret, reganode(pRExC_state, IFTHEN, 0));
2201 br = regbranch(pRExC_state, &flags, 1);
c277df42 2202 if (br == NULL)
830247a4 2203 br = reganode(pRExC_state, LONGJMP, 0);
c277df42 2204 else
830247a4
IZ
2205 regtail(pRExC_state, br, reganode(pRExC_state, LONGJMP, 0));
2206 c = *nextchar(pRExC_state);
d1b80229
IZ
2207 if (flags&HASWIDTH)
2208 *flagp |= HASWIDTH;
c277df42 2209 if (c == '|') {
830247a4
IZ
2210 lastbr = reganode(pRExC_state, IFTHEN, 0); /* Fake one for optimizer. */
2211 regbranch(pRExC_state, &flags, 1);
2212 regtail(pRExC_state, ret, lastbr);
d1b80229
IZ
2213 if (flags&HASWIDTH)
2214 *flagp |= HASWIDTH;
830247a4 2215 c = *nextchar(pRExC_state);
a0ed51b3
LW
2216 }
2217 else
c277df42
IZ
2218 lastbr = NULL;
2219 if (c != ')')
8615cb43 2220 vFAIL("Switch (?(condition)... contains too many branches");
830247a4
IZ
2221 ender = reg_node(pRExC_state, TAIL);
2222 regtail(pRExC_state, br, ender);
c277df42 2223 if (lastbr) {
830247a4
IZ
2224 regtail(pRExC_state, lastbr, ender);
2225 regtail(pRExC_state, NEXTOPER(NEXTOPER(lastbr)), ender);
a0ed51b3
LW
2226 }
2227 else
830247a4 2228 regtail(pRExC_state, ret, ender);
c277df42 2229 return ret;
a0ed51b3
LW
2230 }
2231 else {
830247a4 2232 vFAIL2("Unknown switch condition (?(%.2s", RExC_parse);
c277df42
IZ
2233 }
2234 }
1b1626e4 2235 case 0:
830247a4 2236 RExC_parse--; /* for vFAIL to print correctly */
8615cb43 2237 vFAIL("Sequence (? incomplete");
1b1626e4 2238 break;
a0d0e21e 2239 default:
830247a4 2240 --RExC_parse;
fac92740 2241 parse_flags: /* (?i) */
830247a4 2242 while (*RExC_parse && strchr("iogcmsx", *RExC_parse)) {
9d1d55b5
JP
2243 /* (?g), (?gc) and (?o) are useless here
2244 and must be globally applied -- japhy */
2245
2246 if (*RExC_parse == 'o' || *RExC_parse == 'g') {
2247 if (SIZE_ONLY && ckWARN(WARN_REGEXP)) {
2248 I32 wflagbit = *RExC_parse == 'o' ? wasted_o : wasted_g;
2249 if (! (wastedflags & wflagbit) ) {
2250 wastedflags |= wflagbit;
2251 vWARN5(
2252 RExC_parse + 1,
2253 "Useless (%s%c) - %suse /%c modifier",
2254 flagsp == &negflags ? "?-" : "?",
2255 *RExC_parse,
2256 flagsp == &negflags ? "don't " : "",
2257 *RExC_parse
2258 );
2259 }
2260 }
2261 }
2262 else if (*RExC_parse == 'c') {
2263 if (SIZE_ONLY && ckWARN(WARN_REGEXP)) {
2264 if (! (wastedflags & wasted_c) ) {
2265 wastedflags |= wasted_gc;
2266 vWARN3(
2267 RExC_parse + 1,
2268 "Useless (%sc) - %suse /gc modifier",
2269 flagsp == &negflags ? "?-" : "?",
2270 flagsp == &negflags ? "don't " : ""
2271 );
2272 }
2273 }
2274 }
2275 else { pmflag(flagsp, *RExC_parse); }
2276
830247a4 2277 ++RExC_parse;
ca9dfc88 2278 }
830247a4 2279 if (*RExC_parse == '-') {
ca9dfc88 2280 flagsp = &negflags;
9d1d55b5 2281 wastedflags = 0; /* reset so (?g-c) warns twice */
830247a4 2282 ++RExC_parse;
ca9dfc88 2283 goto parse_flags;
48c036b1 2284 }
830247a4
IZ
2285 RExC_flags16 |= posflags;
2286 RExC_flags16 &= ~negflags;
2287 if (*RExC_parse == ':') {
2288 RExC_parse++;
ca9dfc88
IZ
2289 paren = ':';
2290 break;
2291 }
c277df42 2292 unknown:
830247a4
IZ
2293 if (*RExC_parse != ')') {
2294 RExC_parse++;
2295 vFAIL3("Sequence (%.*s...) not recognized", RExC_parse-seqstart, seqstart);
b45f050a 2296 }
830247a4 2297 nextchar(pRExC_state);
a0d0e21e
LW
2298 *flagp = TRYAGAIN;
2299 return NULL;
2300 }
2301 }
fac92740 2302 else { /* (...) */
830247a4
IZ
2303 parno = RExC_npar;
2304 RExC_npar++;
2305 ret = reganode(pRExC_state, OPEN, parno);
fac92740
MJD
2306 Set_Node_Length(ret, 1); /* MJD */
2307 Set_Node_Offset(ret, RExC_parse); /* MJD */
c277df42 2308 open = 1;
a0d0e21e 2309 }
a0ed51b3 2310 }
fac92740 2311 else /* ! paren */
a0d0e21e
LW
2312 ret = NULL;
2313
2314 /* Pick up the branches, linking them together. */
fac92740 2315 parse_start = RExC_parse; /* MJD */
830247a4 2316 br = regbranch(pRExC_state, &flags, 1);
fac92740
MJD
2317 /* branch_len = (paren != 0); */
2318
a0d0e21e
LW
2319 if (br == NULL)
2320 return(NULL);
830247a4
IZ
2321 if (*RExC_parse == '|') {
2322 if (!SIZE_ONLY && RExC_extralen) {
2323 reginsert(pRExC_state, BRANCHJ, br);
a0ed51b3 2324 }
fac92740 2325 else { /* MJD */
830247a4 2326 reginsert(pRExC_state, BRANCH, br);
fac92740
MJD
2327 Set_Node_Length(br, paren != 0);
2328 Set_Node_Offset_To_R(br-RExC_emit_start, parse_start-RExC_start);
2329 }
c277df42
IZ
2330 have_branch = 1;
2331 if (SIZE_ONLY)
830247a4 2332 RExC_extralen += 1; /* For BRANCHJ-BRANCH. */
a0ed51b3
LW
2333 }
2334 else if (paren == ':') {
c277df42
IZ
2335 *flagp |= flags&SIMPLE;
2336 }
2337 if (open) { /* Starts with OPEN. */
830247a4 2338 regtail(pRExC_state, ret, br); /* OPEN -> first. */
a0ed51b3
LW
2339 }
2340 else if (paren != '?') /* Not Conditional */
a0d0e21e 2341 ret = br;
821b33a5
IZ
2342 if (flags&HASWIDTH)
2343 *flagp |= HASWIDTH;
a0d0e21e 2344 *flagp |= flags&SPSTART;
c277df42 2345 lastbr = br;
830247a4
IZ
2346 while (*RExC_parse == '|') {
2347 if (!SIZE_ONLY && RExC_extralen) {
2348 ender = reganode(pRExC_state, LONGJMP,0);
2349 regtail(pRExC_state, NEXTOPER(NEXTOPER(lastbr)), ender); /* Append to the previous. */
c277df42
IZ
2350 }
2351 if (SIZE_ONLY)
830247a4
IZ
2352 RExC_extralen += 2; /* Account for LONGJMP. */
2353 nextchar(pRExC_state);
2354 br = regbranch(pRExC_state, &flags, 0);
fac92740 2355
a687059c 2356 if (br == NULL)
a0d0e21e 2357 return(NULL);
830247a4 2358 regtail(pRExC_state, lastbr, br); /* BRANCH -> BRANCH. */
c277df42 2359 lastbr = br;
821b33a5
IZ
2360 if (flags&HASWIDTH)
2361 *flagp |= HASWIDTH;
a687059c 2362 *flagp |= flags&SPSTART;
a0d0e21e
LW
2363 }
2364
c277df42
IZ
2365 if (have_branch || paren != ':') {
2366 /* Make a closing node, and hook it on the end. */
2367 switch (paren) {
2368 case ':':
830247a4 2369 ender = reg_node(pRExC_state, TAIL);
c277df42
IZ
2370 break;
2371 case 1:
830247a4 2372 ender = reganode(pRExC_state, CLOSE, parno);
fac92740
MJD
2373 Set_Node_Offset(ender,RExC_parse+1); /* MJD */
2374 Set_Node_Length(ender,1); /* MJD */
c277df42
IZ
2375 break;
2376 case '<':
c277df42
IZ
2377 case ',':
2378 case '=':
2379 case '!':
c277df42 2380 *flagp &= ~HASWIDTH;
821b33a5
IZ
2381 /* FALL THROUGH */
2382 case '>':
830247a4 2383 ender = reg_node(pRExC_state, SUCCEED);
c277df42
IZ
2384 break;
2385 case 0:
830247a4 2386 ender = reg_node(pRExC_state, END);
c277df42
IZ
2387 break;
2388 }
830247a4 2389 regtail(pRExC_state, lastbr, ender);
a0d0e21e 2390
c277df42
IZ
2391 if (have_branch) {
2392 /* Hook the tails of the branches to the closing node. */
2393 for (br = ret; br != NULL; br = regnext(br)) {
830247a4 2394 regoptail(pRExC_state, br, ender);
c277df42
IZ
2395 }
2396 }
a0d0e21e 2397 }
c277df42
IZ
2398
2399 {
2400 char *p;
2401 static char parens[] = "=!<,>";
2402
2403 if (paren && (p = strchr(parens, paren))) {
2404 int node = ((p - parens) % 2) ? UNLESSM : IFMATCH;
2405 int flag = (p - parens) > 1;
2406
2407 if (paren == '>')
2408 node = SUSPEND, flag = 0;
830247a4 2409 reginsert(pRExC_state, node,ret);
c277df42 2410 ret->flags = flag;
830247a4 2411 regtail(pRExC_state, ret, reg_node(pRExC_state, TAIL));
c277df42 2412 }
a0d0e21e
LW
2413 }
2414
2415 /* Check for proper termination. */
ce3e6498 2416 if (paren) {
830247a4
IZ
2417 RExC_flags16 = oregflags;
2418 if (RExC_parse >= RExC_end || *nextchar(pRExC_state) != ')') {
2419 RExC_parse = oregcomp_parse;
380a0633 2420 vFAIL("Unmatched (");
ce3e6498 2421 }
a0ed51b3 2422 }
830247a4
IZ
2423 else if (!paren && RExC_parse < RExC_end) {
2424 if (*RExC_parse == ')') {
2425 RExC_parse++;
380a0633 2426 vFAIL("Unmatched )");
a0ed51b3
LW
2427 }
2428 else
b45f050a 2429 FAIL("Junk on end of regexp"); /* "Can't happen". */
a0d0e21e
LW
2430 /* NOTREACHED */
2431 }
a687059c 2432
a0d0e21e 2433 return(ret);
a687059c
LW
2434}
2435
2436/*
2437 - regbranch - one alternative of an | operator
2438 *
2439 * Implements the concatenation operator.
2440 */
76e3520e 2441STATIC regnode *
830247a4 2442S_regbranch(pTHX_ RExC_state_t *pRExC_state, I32 *flagp, I32 first)
a687059c 2443{
c277df42
IZ
2444 register regnode *ret;
2445 register regnode *chain = NULL;
2446 register regnode *latest;
2447 I32 flags = 0, c = 0;
a0d0e21e 2448
b81d288d 2449 if (first)
c277df42
IZ
2450 ret = NULL;
2451 else {
b81d288d 2452 if (!SIZE_ONLY && RExC_extralen)
830247a4 2453 ret = reganode(pRExC_state, BRANCHJ,0);
fac92740 2454 else {
830247a4 2455 ret = reg_node(pRExC_state, BRANCH);
fac92740
MJD
2456 Set_Node_Length(ret, 1);
2457 }
c277df42
IZ
2458 }
2459
b81d288d 2460 if (!first && SIZE_ONLY)
830247a4 2461 RExC_extralen += 1; /* BRANCHJ */
b81d288d 2462
c277df42 2463 *flagp = WORST; /* Tentatively. */
a0d0e21e 2464
830247a4
IZ
2465 RExC_parse--;
2466 nextchar(pRExC_state);
2467 while (RExC_parse < RExC_end && *RExC_parse != '|' && *RExC_parse != ')') {
a0d0e21e 2468 flags &= ~TRYAGAIN;
830247a4 2469 latest = regpiece(pRExC_state, &flags);
a0d0e21e
LW
2470 if (latest == NULL) {
2471 if (flags & TRYAGAIN)
2472 continue;
2473 return(NULL);
a0ed51b3
LW
2474 }
2475 else if (ret == NULL)
c277df42 2476 ret = latest;
a0d0e21e 2477 *flagp |= flags&HASWIDTH;
c277df42 2478 if (chain == NULL) /* First piece. */
a0d0e21e
LW
2479 *flagp |= flags&SPSTART;
2480 else {
830247a4
IZ
2481 RExC_naughty++;
2482 regtail(pRExC_state, chain, latest);
a687059c 2483 }
a0d0e21e 2484 chain = latest;
c277df42
IZ
2485 c++;
2486 }
2487 if (chain == NULL) { /* Loop ran zero times. */
830247a4 2488 chain = reg_node(pRExC_state, NOTHING);
c277df42
IZ
2489 if (ret == NULL)
2490 ret = chain;
2491 }
2492 if (c == 1) {
2493 *flagp |= flags&SIMPLE;
a0d0e21e 2494 }
a687059c 2495
a0d0e21e 2496 return(ret);
a687059c
LW
2497}
2498
2499/*
2500 - regpiece - something followed by possible [*+?]
2501 *
2502 * Note that the branching code sequences used for ? and the general cases
2503 * of * and + are somewhat optimized: they use the same NOTHING node as
2504 * both the endmarker for their branch list and the body of the last branch.
2505 * It might seem that this node could be dispensed with entirely, but the
2506 * endmarker role is not redundant.
2507 */
76e3520e 2508STATIC regnode *
830247a4 2509S_regpiece(pTHX_ RExC_state_t *pRExC_state, I32 *flagp)
a687059c 2510{
c277df42 2511 register regnode *ret;
a0d0e21e
LW
2512 register char op;
2513 register char *next;
2514 I32 flags;
830247a4 2515 char *origparse = RExC_parse;
a0d0e21e
LW
2516 char *maxpos;
2517 I32 min;
c277df42 2518 I32 max = REG_INFTY;
fac92740 2519 char *parse_start;
a0d0e21e 2520
830247a4 2521 ret = regatom(pRExC_state, &flags);
a0d0e21e
LW
2522 if (ret == NULL) {
2523 if (flags & TRYAGAIN)
2524 *flagp |= TRYAGAIN;
2525 return(NULL);
2526 }
2527
830247a4 2528 op = *RExC_parse;
a0d0e21e 2529
830247a4 2530 if (op == '{' && regcurly(RExC_parse)) {
fac92740 2531 parse_start = RExC_parse; /* MJD */
830247a4 2532 next = RExC_parse + 1;
a0d0e21e
LW
2533 maxpos = Nullch;
2534 while (isDIGIT(*next) || *next == ',') {
2535 if (*next == ',') {
2536 if (maxpos)
2537 break;
2538 else
2539 maxpos = next;
a687059c 2540 }
a0d0e21e
LW
2541 next++;
2542 }
2543 if (*next == '}') { /* got one */
2544 if (!maxpos)
2545 maxpos = next;
830247a4
IZ
2546 RExC_parse++;
2547 min = atoi(RExC_parse);
a0d0e21e
LW
2548 if (*maxpos == ',')
2549 maxpos++;
2550 else
830247a4 2551 maxpos = RExC_parse;
a0d0e21e
LW
2552 max = atoi(maxpos);
2553 if (!max && *maxpos != '0')
c277df42
IZ
2554 max = REG_INFTY; /* meaning "infinity" */
2555 else if (max >= REG_INFTY)
8615cb43 2556 vFAIL2("Quantifier in {,} bigger than %d", REG_INFTY - 1);
830247a4
IZ
2557 RExC_parse = next;
2558 nextchar(pRExC_state);
a0d0e21e
LW
2559
2560 do_curly:
2561 if ((flags&SIMPLE)) {
830247a4
IZ
2562 RExC_naughty += 2 + RExC_naughty / 2;
2563 reginsert(pRExC_state, CURLY, ret);
fac92740
MJD
2564 Set_Node_Offset(ret, parse_start+1); /* MJD */
2565 Set_Node_Cur_Length(ret);
a0d0e21e
LW
2566 }
2567 else {
830247a4 2568 regnode *w = reg_node(pRExC_state, WHILEM);
2c2d71f5
JH
2569
2570 w->flags = 0;
830247a4
IZ
2571 regtail(pRExC_state, ret, w);
2572 if (!SIZE_ONLY && RExC_extralen) {
2573 reginsert(pRExC_state, LONGJMP,ret);
2574 reginsert(pRExC_state, NOTHING,ret);
c277df42
IZ
2575 NEXT_OFF(ret) = 3; /* Go over LONGJMP. */
2576 }
830247a4 2577 reginsert(pRExC_state, CURLYX,ret);
fac92740
MJD
2578 /* MJD hk */
2579 Set_Node_Offset(ret, parse_start+1);
2580 Set_Node_Length(ret,
2581 op == '{' ? (RExC_parse - parse_start) : 1);
2582
830247a4 2583 if (!SIZE_ONLY && RExC_extralen)
c277df42 2584 NEXT_OFF(ret) = 3; /* Go over NOTHING to LONGJMP. */
830247a4 2585 regtail(pRExC_state, ret, reg_node(pRExC_state, NOTHING));
c277df42 2586 if (SIZE_ONLY)
830247a4
IZ
2587 RExC_whilem_seen++, RExC_extralen += 3;
2588 RExC_naughty += 4 + RExC_naughty; /* compound interest */
a0d0e21e 2589 }
c277df42 2590 ret->flags = 0;
a0d0e21e
LW
2591
2592 if (min > 0)
821b33a5
IZ
2593 *flagp = WORST;
2594 if (max > 0)
2595 *flagp |= HASWIDTH;
a0d0e21e 2596 if (max && max < min)
8615cb43 2597 vFAIL("Can't do {n,m} with n > m");
c277df42
IZ
2598 if (!SIZE_ONLY) {
2599 ARG1_SET(ret, min);
2600 ARG2_SET(ret, max);
a687059c 2601 }
a687059c 2602
a0d0e21e 2603 goto nest_check;
a687059c 2604 }
a0d0e21e 2605 }
a687059c 2606
a0d0e21e
LW
2607 if (!ISMULT1(op)) {
2608 *flagp = flags;
a687059c 2609 return(ret);
a0d0e21e 2610 }
bb20fd44 2611
c277df42 2612#if 0 /* Now runtime fix should be reliable. */
b45f050a
JF
2613
2614 /* if this is reinstated, don't forget to put this back into perldiag:
2615
2616 =item Regexp *+ operand could be empty at {#} in regex m/%s/
2617
2618 (F) The part of the regexp subject to either the * or + quantifier
2619 could match an empty string. The {#} shows in the regular
2620 expression about where the problem was discovered.
2621
2622 */
2623
bb20fd44 2624 if (!(flags&HASWIDTH) && op != '?')
b45f050a 2625 vFAIL("Regexp *+ operand could be empty");
b81d288d 2626#endif
bb20fd44 2627
fac92740 2628 parse_start = RExC_parse;
830247a4 2629 nextchar(pRExC_state);
a0d0e21e 2630
821b33a5 2631 *flagp = (op != '+') ? (WORST|SPSTART|HASWIDTH) : (WORST|HASWIDTH);
a0d0e21e
LW
2632
2633 if (op == '*' && (flags&SIMPLE)) {
830247a4 2634 reginsert(pRExC_state, STAR, ret);
c277df42 2635 ret->flags = 0;
830247a4 2636 RExC_naughty += 4;
a0d0e21e
LW
2637 }
2638 else if (op == '*') {
2639 min = 0;
2640 goto do_curly;
a0ed51b3
LW
2641 }
2642 else if (op == '+' && (flags&SIMPLE)) {
830247a4 2643 reginsert(pRExC_state, PLUS, ret);
c277df42 2644 ret->flags = 0;
830247a4 2645 RExC_naughty += 3;
a0d0e21e
LW
2646 }
2647 else if (op == '+') {
2648 min = 1;
2649 goto do_curly;
a0ed51b3
LW
2650 }
2651 else if (op == '?') {
a0d0e21e
LW
2652 min = 0; max = 1;
2653 goto do_curly;
2654 }
2655 nest_check:
e476b1b5 2656 if (ckWARN(WARN_REGEXP) && !SIZE_ONLY && !(flags&HASWIDTH) && max > REG_INFTY/3) {
830247a4 2657 vWARN3(RExC_parse,
b45f050a 2658 "%.*s matches null string many times",
830247a4 2659 RExC_parse - origparse,
b45f050a 2660 origparse);
a0d0e21e
LW
2661 }
2662
830247a4
IZ
2663 if (*RExC_parse == '?') {
2664 nextchar(pRExC_state);
2665 reginsert(pRExC_state, MINMOD, ret);
2666 regtail(pRExC_state, ret, ret + NODE_STEP_REGNODE);
a0d0e21e 2667 }
830247a4
IZ
2668 if (ISMULT2(RExC_parse)) {
2669 RExC_parse++;
b45f050a
JF
2670 vFAIL("Nested quantifiers");
2671 }
a0d0e21e
LW
2672
2673 return(ret);
a687059c
LW
2674}
2675
2676/*
2677 - regatom - the lowest level
2678 *
2679 * Optimization: gobbles an entire sequence of ordinary characters so that
2680 * it can turn them into a single node, which is smaller to store and
2681 * faster to run. Backslashed characters are exceptions, each becoming a
2682 * separate node; the code is simpler that way and it's not worth fixing.
2683 *
b45f050a 2684 * [Yes, it is worth fixing, some scripts can run twice the speed.] */
76e3520e 2685STATIC regnode *
830247a4 2686S_regatom(pTHX_ RExC_state_t *pRExC_state, I32 *flagp)
a687059c 2687{
c277df42 2688 register regnode *ret = 0;
a0d0e21e 2689 I32 flags;
f06dbbb7 2690 char *parse_start = 0;
a0d0e21e
LW
2691
2692 *flagp = WORST; /* Tentatively. */
2693
2694tryagain:
830247a4 2695 switch (*RExC_parse) {
a0d0e21e 2696 case '^':
830247a4
IZ
2697 RExC_seen_zerolen++;
2698 nextchar(pRExC_state);
2699 if (RExC_flags16 & PMf_MULTILINE)
2700 ret = reg_node(pRExC_state, MBOL);
2701 else if (RExC_flags16 & PMf_SINGLELINE)
2702 ret = reg_node(pRExC_state, SBOL);
a0d0e21e 2703 else
830247a4 2704 ret = reg_node(pRExC_state, BOL);
fac92740 2705 Set_Node_Length(ret, 1); /* MJD */
a0d0e21e
LW
2706 break;
2707 case '$':
830247a4 2708 nextchar(pRExC_state);
b81d288d 2709 if (*RExC_parse)
830247a4
IZ
2710 RExC_seen_zerolen++;
2711 if (RExC_flags16 & PMf_MULTILINE)
2712 ret = reg_node(pRExC_state, MEOL);
2713 else if (RExC_flags16 & PMf_SINGLELINE)
2714 ret = reg_node(pRExC_state, SEOL);
a0d0e21e 2715 else
830247a4 2716 ret = reg_node(pRExC_state, EOL);
fac92740 2717 Set_Node_Length(ret, 1); /* MJD */
a0d0e21e
LW
2718 break;
2719 case '.':
830247a4 2720 nextchar(pRExC_state);
cce850e4 2721 if (RExC_flags16 & PMf_SINGLELINE)
ffc61ed2
JH
2722 ret = reg_node(pRExC_state, SANY);
2723 else
2724 ret = reg_node(pRExC_state, REG_ANY);
2725 *flagp |= HASWIDTH|SIMPLE;
830247a4 2726 RExC_naughty++;
fac92740 2727 Set_Node_Length(ret, 1); /* MJD */
a0d0e21e
LW
2728 break;
2729 case '[':
b45f050a 2730 {
830247a4 2731 char *oregcomp_parse = ++RExC_parse;
ffc61ed2 2732 ret = regclass(pRExC_state);
830247a4
IZ
2733 if (*RExC_parse != ']') {
2734 RExC_parse = oregcomp_parse;
b45f050a
JF
2735 vFAIL("Unmatched [");
2736 }
830247a4 2737 nextchar(pRExC_state);
a0d0e21e 2738 *flagp |= HASWIDTH|SIMPLE;
fac92740 2739 Set_Node_Length(ret, RExC_parse - oregcomp_parse + 1); /* MJD */
a0d0e21e 2740 break;
b45f050a 2741 }
a0d0e21e 2742 case '(':
830247a4
IZ
2743 nextchar(pRExC_state);
2744 ret = reg(pRExC_state, 1, &flags);
a0d0e21e 2745 if (ret == NULL) {
bf93d4cc 2746 if (flags & TRYAGAIN) {
830247a4 2747 if (RExC_parse == RExC_end) {
bf93d4cc
GS
2748 /* Make parent create an empty node if needed. */
2749 *flagp |= TRYAGAIN;
2750 return(NULL);
2751 }
a0d0e21e 2752 goto tryagain;
bf93d4cc 2753 }
a0d0e21e
LW
2754 return(NULL);
2755 }
c277df42 2756 *flagp |= flags&(HASWIDTH|SPSTART|SIMPLE);
a0d0e21e
LW
2757 break;
2758 case '|':
2759 case ')':
2760 if (flags & TRYAGAIN) {
2761 *flagp |= TRYAGAIN;
2762 return NULL;
2763 }
b45f050a 2764 vFAIL("Internal urp");
a0d0e21e
LW
2765 /* Supposed to be caught earlier. */
2766 break;
85afd4ae 2767 case '{':
830247a4
IZ
2768 if (!regcurly(RExC_parse)) {
2769 RExC_parse++;
85afd4ae
CS
2770 goto defchar;
2771 }
2772 /* FALL THROUGH */
a0d0e21e
LW
2773 case '?':
2774 case '+':
2775 case '*':
830247a4 2776 RExC_parse++;
b45f050a 2777 vFAIL("Quantifier follows nothing");
a0d0e21e
LW
2778 break;
2779 case '\\':
830247a4 2780 switch (*++RExC_parse) {
a0d0e21e 2781 case 'A':
830247a4
IZ
2782 RExC_seen_zerolen++;
2783 ret = reg_node(pRExC_state, SBOL);
a0d0e21e 2784 *flagp |= SIMPLE;
830247a4 2785 nextchar(pRExC_state);
fac92740 2786 Set_Node_Length(ret, 2); /* MJD */
a0d0e21e
LW
2787 break;
2788 case 'G':
830247a4
IZ
2789 ret = reg_node(pRExC_state, GPOS);
2790 RExC_seen |= REG_SEEN_GPOS;
a0d0e21e 2791 *flagp |= SIMPLE;
830247a4 2792 nextchar(pRExC_state);
fac92740 2793 Set_Node_Length(ret, 2); /* MJD */
a0d0e21e
LW
2794 break;
2795 case 'Z':
830247a4 2796 ret = reg_node(pRExC_state, SEOL);
a0d0e21e 2797 *flagp |= SIMPLE;
830247a4 2798 nextchar(pRExC_state);
a0d0e21e 2799 break;
b85d18e9 2800 case 'z':
830247a4 2801 ret = reg_node(pRExC_state, EOS);
b85d18e9 2802 *flagp |= SIMPLE;
830247a4
IZ
2803 RExC_seen_zerolen++; /* Do not optimize RE away */
2804 nextchar(pRExC_state);
fac92740 2805 Set_Node_Length(ret, 2); /* MJD */
b85d18e9 2806 break;
4a2d328f 2807 case 'C':
f33976b4
DB
2808 ret = reg_node(pRExC_state, CANY);
2809 RExC_seen |= REG_SEEN_CANY;
a0ed51b3 2810 *flagp |= HASWIDTH|SIMPLE;
830247a4 2811 nextchar(pRExC_state);
fac92740 2812 Set_Node_Length(ret, 2); /* MJD */
a0ed51b3
LW
2813 break;
2814 case 'X':
830247a4 2815 ret = reg_node(pRExC_state, CLUMP);
a0ed51b3 2816 *flagp |= HASWIDTH;
830247a4 2817 nextchar(pRExC_state);
fac92740 2818 Set_Node_Length(ret, 2); /* MJD */
a0ed51b3 2819 break;
a0d0e21e 2820 case 'w':
ffc61ed2 2821 ret = reg_node(pRExC_state, LOC ? ALNUML : ALNUM);
a0d0e21e 2822 *flagp |= HASWIDTH|SIMPLE;
830247a4 2823 nextchar(pRExC_state);
fac92740 2824 Set_Node_Length(ret, 2); /* MJD */
a0d0e21e
LW
2825 break;
2826 case 'W':
ffc61ed2 2827 ret = reg_node(pRExC_state, LOC ? NALNUML : NALNUM);
a0d0e21e 2828 *flagp |= HASWIDTH|SIMPLE;
830247a4 2829 nextchar(pRExC_state);
fac92740 2830 Set_Node_Length(ret, 2); /* MJD */
a0d0e21e
LW
2831 break;
2832 case 'b':
830247a4
IZ
2833 RExC_seen_zerolen++;
2834 RExC_seen |= REG_SEEN_LOOKBEHIND;
ffc61ed2 2835 ret = reg_node(pRExC_state, LOC ? BOUNDL : BOUND);
a0d0e21e 2836 *flagp |= SIMPLE;
830247a4 2837 nextchar(pRExC_state);
fac92740 2838 Set_Node_Length(ret, 2); /* MJD */
a0d0e21e
LW
2839 break;
2840 case 'B':
830247a4
IZ
2841 RExC_seen_zerolen++;
2842 RExC_seen |= REG_SEEN_LOOKBEHIND;
ffc61ed2 2843 ret = reg_node(pRExC_state, LOC ? NBOUNDL : NBOUND);
a0d0e21e 2844 *flagp |= SIMPLE;
830247a4 2845 nextchar(pRExC_state);
fac92740 2846 Set_Node_Length(ret, 2); /* MJD */
a0d0e21e
LW
2847 break;
2848 case 's':
ffc61ed2 2849 ret = reg_node(pRExC_state, LOC ? SPACEL : SPACE);
a0d0e21e 2850 *flagp |= HASWIDTH|SIMPLE;
830247a4 2851 nextchar(pRExC_state);
fac92740 2852 Set_Node_Length(ret, 2); /* MJD */
a0d0e21e
LW
2853 break;
2854 case 'S':
ffc61ed2 2855 ret = reg_node(pRExC_state, LOC ? NSPACEL : NSPACE);
a0d0e21e 2856 *flagp |= HASWIDTH|SIMPLE;
830247a4 2857 nextchar(pRExC_state);
fac92740 2858 Set_Node_Length(ret, 2); /* MJD */
a0d0e21e
LW
2859 break;
2860 case 'd':
ffc61ed2 2861 ret = reg_node(pRExC_state, DIGIT);
a0d0e21e 2862 *flagp |= HASWIDTH|SIMPLE;
830247a4 2863 nextchar(pRExC_state);
fac92740 2864 Set_Node_Length(ret, 2); /* MJD */
a0d0e21e
LW
2865 break;
2866 case 'D':
ffc61ed2 2867 ret = reg_node(pRExC_state, NDIGIT);
a0d0e21e 2868 *flagp |= HASWIDTH|SIMPLE;
830247a4 2869 nextchar(pRExC_state);
fac92740 2870 Set_Node_Length(ret, 2); /* MJD */
a0d0e21e 2871 break;
a14b48bc
LW
2872 case 'p':
2873 case 'P':
3568d838 2874 {
830247a4 2875 char* oldregxend = RExC_end;
fac92740 2876 char* parse_start = RExC_parse;
a14b48bc 2877
830247a4 2878 if (RExC_parse[1] == '{') {
3568d838 2879 /* a lovely hack--pretend we saw [\pX] instead */
830247a4
IZ
2880 RExC_end = strchr(RExC_parse, '}');
2881 if (!RExC_end) {
2882 RExC_parse += 2;
2883 RExC_end = oldregxend;
b45f050a
JF
2884 vFAIL("Missing right brace on \\p{}");
2885 }
830247a4 2886 RExC_end++;
a14b48bc
LW
2887 }
2888 else
830247a4
IZ
2889 RExC_end = RExC_parse + 2;
2890 RExC_parse--;
a14b48bc 2891
ffc61ed2 2892 ret = regclass(pRExC_state);
a14b48bc 2893
830247a4
IZ
2894 RExC_end = oldregxend;
2895 RExC_parse--;
fac92740 2896 Set_Node_Cur_Length(ret); /* MJD */
830247a4 2897 nextchar(pRExC_state);
a14b48bc
LW
2898 *flagp |= HASWIDTH|SIMPLE;
2899 }
2900 break;
a0d0e21e
LW
2901 case 'n':
2902 case 'r':
2903 case 't':
2904 case 'f':
2905 case 'e':
2906 case 'a':
2907 case 'x':
2908 case 'c':
2909 case '0':
2910 goto defchar;
2911 case '1': case '2': case '3': case '4':
2912 case '5': case '6': case '7': case '8': case '9':
2913 {
830247a4 2914 I32 num = atoi(RExC_parse);
a0d0e21e 2915
830247a4 2916 if (num > 9 && num >= RExC_npar)
a0d0e21e
LW
2917 goto defchar;
2918 else {
fac92740 2919 char * parse_start = RExC_parse - 1; /* MJD */
830247a4
IZ
2920 while (isDIGIT(*RExC_parse))
2921 RExC_parse++;
b45f050a 2922
830247a4 2923 if (!SIZE_ONLY && num > RExC_rx->nparens)
9baa0206 2924 vFAIL("Reference to nonexistent group");
830247a4
IZ
2925 RExC_sawback = 1;
2926 ret = reganode(pRExC_state, FOLD
a0ed51b3 2927 ? (LOC ? REFFL : REFF)
c8756f30 2928 : REF, num);
a0d0e21e 2929 *flagp |= HASWIDTH;
fac92740
MJD
2930
2931 /* override incorrect value set in reganode MJD */
2932 Set_Node_Offset(ret, parse_start+1);
2933 Set_Node_Cur_Length(ret); /* MJD */
830247a4
IZ
2934 RExC_parse--;
2935 nextchar(pRExC_state);
a0d0e21e
LW
2936 }
2937 }
2938 break;
2939 case '\0':
830247a4 2940 if (RExC_parse >= RExC_end)
b45f050a 2941 FAIL("Trailing \\");
a0d0e21e
LW
2942 /* FALL THROUGH */
2943 default:
c9f97d15
IZ
2944 /* Do not generate `unrecognized' warnings here, we fall
2945 back into the quick-grab loop below */
a0d0e21e
LW
2946 goto defchar;
2947 }
2948 break;
4633a7c4
LW
2949
2950 case '#':
830247a4
IZ
2951 if (RExC_flags16 & PMf_EXTENDED) {
2952 while (RExC_parse < RExC_end && *RExC_parse != '\n') RExC_parse++;
2953 if (RExC_parse < RExC_end)
4633a7c4
LW
2954 goto tryagain;
2955 }
2956 /* FALL THROUGH */
2957
a0d0e21e 2958 default: {
ba210ebe 2959 register STRLEN len;
a0ed51b3 2960 register UV ender;
a0d0e21e 2961 register char *p;
c277df42 2962 char *oldp, *s;
ba210ebe 2963 STRLEN numlen;
f06dbbb7
JH
2964
2965 parse_start = RExC_parse - 1;
a0d0e21e 2966
830247a4 2967 RExC_parse++;
a0d0e21e
LW
2968
2969 defchar:
830247a4 2970 ret = reg_node(pRExC_state, FOLD
a0ed51b3 2971 ? (LOC ? EXACTFL : EXACTF)
bbce6d69 2972 : EXACT);
cd439c50 2973 s = STRING(ret);
830247a4
IZ
2974 for (len = 0, p = RExC_parse - 1;
2975 len < 127 && p < RExC_end;
a0d0e21e
LW
2976 len++)
2977 {
2978 oldp = p;
5b5a24f7 2979
830247a4
IZ
2980 if (RExC_flags16 & PMf_EXTENDED)
2981 p = regwhite(p, RExC_end);
a0d0e21e
LW
2982 switch (*p) {
2983 case '^':
2984 case '$':
2985 case '.':
2986 case '[':
2987 case '(':
2988 case ')':
2989 case '|':
2990 goto loopdone;
2991 case '\\':
2992 switch (*++p) {
2993 case 'A':
2994 case 'G':
2995 case 'Z':
b85d18e9 2996 case 'z':
a0d0e21e
LW
2997 case 'w':
2998 case 'W':
2999 case 'b':
3000 case 'B':
3001 case 's':
3002 case 'S':
3003 case 'd':
3004 case 'D':
a14b48bc
LW
3005 case 'p':
3006 case 'P':
a0d0e21e
LW
3007 --p;
3008 goto loopdone;
3009 case 'n':
3010 ender = '\n';
3011 p++;
a687059c 3012 break;
a0d0e21e
LW
3013 case 'r':
3014 ender = '\r';
3015 p++;
a687059c 3016 break;
a0d0e21e
LW
3017 case 't':
3018 ender = '\t';
3019 p++;
a687059c 3020 break;
a0d0e21e
LW
3021 case 'f':
3022 ender = '\f';
3023 p++;
a687059c 3024 break;
a0d0e21e 3025 case 'e':
c7f1f016 3026 ender = ASCII_TO_NATIVE('\033');
a0d0e21e 3027 p++;
a687059c 3028 break;
a0d0e21e 3029 case 'a':
c7f1f016 3030 ender = ASCII_TO_NATIVE('\007');
a0d0e21e 3031 p++;
a687059c 3032 break;
a0d0e21e 3033 case 'x':
a0ed51b3
LW
3034 if (*++p == '{') {
3035 char* e = strchr(p, '}');
b81d288d 3036
b45f050a 3037 if (!e) {
830247a4 3038 RExC_parse = p + 1;
b45f050a
JF
3039 vFAIL("Missing right brace on \\x{}");
3040 }
de5f0749 3041 else {
b21ed0a9 3042 numlen = 1; /* allow underscores */
de35ba6f 3043 ender = (UV)scan_hex(p + 1, e - p - 1, &numlen);
aaa80028
JH
3044 if (ender > 0xff)
3045 RExC_utf8 = 1;
b21ed0a9
GS
3046 /* numlen is generous */
3047 if (numlen + len >= 127) {
a0ed51b3
LW
3048 p--;
3049 goto loopdone;
3050 }
3051 p = e + 1;
3052 }
a0ed51b3
LW
3053 }
3054 else {
b21ed0a9 3055 numlen = 0; /* disallow underscores */
dff6d3cd 3056 ender = (UV)scan_hex(p, 2, &numlen);
a0ed51b3
LW
3057 p += numlen;
3058 }
a687059c 3059 break;
a0d0e21e
LW
3060 case 'c':
3061 p++;
bbce6d69 3062 ender = UCHARAT(p++);
3063 ender = toCTRL(ender);
a687059c 3064 break;
a0d0e21e
LW
3065 case '0': case '1': case '2': case '3':case '4':
3066 case '5': case '6': case '7': case '8':case '9':
3067 if (*p == '0' ||
830247a4 3068 (isDIGIT(p[1]) && atoi(p) >= RExC_npar) ) {
b21ed0a9 3069 numlen = 0; /* disallow underscores */
dff6d3cd 3070 ender = (UV)scan_oct(p, 3, &numlen);
a0d0e21e
LW
3071 p += numlen;
3072 }
3073 else {
3074 --p;
3075 goto loopdone;
a687059c
LW
3076 }
3077 break;
a0d0e21e 3078 case '\0':
830247a4 3079 if (p >= RExC_end)
b45f050a 3080 FAIL("Trailing \\");
a687059c 3081 /* FALL THROUGH */
a0d0e21e 3082 default:
e476b1b5 3083 if (!SIZE_ONLY && ckWARN(WARN_REGEXP) && isALPHA(*p))
b45f050a 3084 vWARN2(p +1, "Unrecognized escape \\%c passed through", *p);
a0ed51b3 3085 goto normal_default;
a0d0e21e
LW
3086 }
3087 break;
a687059c 3088 default:
a0ed51b3 3089 normal_default:
fd400ab9 3090 if (UTF8_IS_START(*p) && UTF) {
5e12f4fb 3091 ender = utf8n_to_uvchr((U8*)p, RExC_end - p,
ba210ebe 3092 &numlen, 0);
a0ed51b3
LW
3093 p += numlen;
3094 }
3095 else
3096 ender = *p++;
a0d0e21e 3097 break;
a687059c 3098 }
830247a4
IZ
3099 if (RExC_flags16 & PMf_EXTENDED)
3100 p = regwhite(p, RExC_end);
a0ed51b3
LW
3101 if (UTF && FOLD) {
3102 if (LOC)
5e12f4fb 3103 ender = toLOWER_LC_uvchr(ender);
a0ed51b3
LW
3104 else
3105 ender = toLOWER_uni(ender);
3106 }
a0d0e21e
LW
3107 if (ISMULT2(p)) { /* Back off on ?+*. */
3108 if (len)
3109 p = oldp;
2b9d42f0 3110 else if (!UNI_IS_INVARIANT(NATIVE_TO_UNI(ender)) && UTF) {
830247a4 3111 reguni(pRExC_state, ender, s, &numlen);
a0ed51b3
LW
3112 s += numlen;
3113 len += numlen;
3114 }
a0d0e21e
LW
3115 else {
3116 len++;
cd439c50 3117 REGC(ender, s++);
a0d0e21e
LW
3118 }
3119 break;
a687059c 3120 }
2b9d42f0 3121 if (!UNI_IS_INVARIANT(NATIVE_TO_UNI(ender)) && UTF) {
830247a4 3122 reguni(pRExC_state, ender, s, &numlen);
a0ed51b3
LW
3123 s += numlen;
3124 len += numlen - 1;
3125 }
3126 else
cd439c50 3127 REGC(ender, s++);
a0d0e21e
LW
3128 }
3129 loopdone:
830247a4 3130 RExC_parse = p - 1;
fac92740 3131 Set_Node_Cur_Length(ret); /* MJD */
830247a4 3132 nextchar(pRExC_state);
793db0cb
JH
3133 {
3134 /* len is STRLEN which is unsigned, need to copy to signed */
3135 IV iv = len;
3136 if (iv < 0)
3137 vFAIL("Internal disaster");
3138 }
a0d0e21e
LW
3139 if (len > 0)
3140 *flagp |= HASWIDTH;
3141 if (len == 1)
3142 *flagp |= SIMPLE;
c277df42 3143 if (!SIZE_ONLY)
cd439c50
IZ
3144 STR_LEN(ret) = len;
3145 if (SIZE_ONLY)
830247a4 3146 RExC_size += STR_SZ(len);
cd439c50 3147 else
830247a4 3148 RExC_emit += STR_SZ(len);
a687059c 3149 }
a0d0e21e
LW
3150 break;
3151 }
a687059c 3152
a0d0e21e 3153 return(ret);
a687059c
LW
3154}
3155
873ef191 3156STATIC char *
cea2e8a9 3157S_regwhite(pTHX_ char *p, char *e)
5b5a24f7
CS
3158{
3159 while (p < e) {
3160 if (isSPACE(*p))
3161 ++p;
3162 else if (*p == '#') {
3163 do {
3164 p++;
3165 } while (p < e && *p != '\n');
3166 }
3167 else
3168 break;
3169 }
3170 return p;
3171}
3172
b8c5462f
JH
3173/* Parse POSIX character classes: [[:foo:]], [[=foo=]], [[.foo.]].
3174 Character classes ([:foo:]) can also be negated ([:^foo:]).
3175 Returns a named class id (ANYOF_XXX) if successful, -1 otherwise.
3176 Equivalence classes ([=foo=]) and composites ([.foo.]) are parsed,
3177 but trigger warnings because they are currently unimplemented. */
3178STATIC I32
830247a4 3179S_regpposixcc(pTHX_ RExC_state_t *pRExC_state, I32 value)
620e46c5
JH
3180{
3181 char *posixcc = 0;
936ed897 3182 I32 namedclass = OOB_NAMEDCLASS;
620e46c5 3183
830247a4 3184 if (value == '[' && RExC_parse + 1 < RExC_end &&
620e46c5 3185 /* I smell either [: or [= or [. -- POSIX has been here, right? */
830247a4
IZ
3186 (*RExC_parse == ':' ||
3187 *RExC_parse == '=' ||
3188 *RExC_parse == '.')) {
3189 char c = *RExC_parse;
3190 char* s = RExC_parse++;
b81d288d 3191
830247a4
IZ
3192 while (RExC_parse < RExC_end && *RExC_parse != c)
3193 RExC_parse++;
3194 if (RExC_parse == RExC_end)
620e46c5 3195 /* Grandfather lone [:, [=, [. */
830247a4 3196 RExC_parse = s;
620e46c5 3197 else {
830247a4 3198 char* t = RExC_parse++; /* skip over the c */
b8c5462f 3199
830247a4
IZ
3200 if (*RExC_parse == ']') {
3201 RExC_parse++; /* skip over the ending ] */
b8c5462f
JH
3202 posixcc = s + 1;
3203 if (*s == ':') {
3204 I32 complement = *posixcc == '^' ? *posixcc++ : 0;
3205 I32 skip = 5; /* the most common skip */
3206
3207 switch (*posixcc) {
3208 case 'a':
3209 if (strnEQ(posixcc, "alnum", 5))
3210 namedclass =
3211 complement ? ANYOF_NALNUMC : ANYOF_ALNUMC;
3212 else if (strnEQ(posixcc, "alpha", 5))
3213 namedclass =
3214 complement ? ANYOF_NALPHA : ANYOF_ALPHA;
3215 else if (strnEQ(posixcc, "ascii", 5))
3216 namedclass =
3217 complement ? ANYOF_NASCII : ANYOF_ASCII;
3218 break;
aaa51d5e
JF
3219 case 'b':
3220 if (strnEQ(posixcc, "blank", 5))
3221 namedclass =
3222 complement ? ANYOF_NBLANK : ANYOF_BLANK;
3223 break;
b8c5462f
JH
3224 case 'c':
3225 if (strnEQ(posixcc, "cntrl", 5))
3226 namedclass =
3227 complement ? ANYOF_NCNTRL : ANYOF_CNTRL;
3228 break;
3229 case 'd':
3230 if (strnEQ(posixcc, "digit", 5))
3231 namedclass =
3232 complement ? ANYOF_NDIGIT : ANYOF_DIGIT;
3233 break;
3234 case 'g':
3235 if (strnEQ(posixcc, "graph", 5))
3236 namedclass =
3237 complement ? ANYOF_NGRAPH : ANYOF_GRAPH;
3238 break;
3239 case 'l':
3240 if (strnEQ(posixcc, "lower", 5))
3241 namedclass =
3242 complement ? ANYOF_NLOWER : ANYOF_LOWER;
3243 break;
3244 case 'p':
3245 if (strnEQ(posixcc, "print", 5))
3246 namedclass =
3247 complement ? ANYOF_NPRINT : ANYOF_PRINT;
3248 else if (strnEQ(posixcc, "punct", 5))
3249 namedclass =
3250 complement ? ANYOF_NPUNCT : ANYOF_PUNCT;
3251 break;
3252 case 's':
3253 if (strnEQ(posixcc, "space", 5))
3254 namedclass =
aaa51d5e 3255 complement ? ANYOF_NPSXSPC : ANYOF_PSXSPC;
cc4319de 3256 break;
b8c5462f
JH
3257 case 'u':
3258 if (strnEQ(posixcc, "upper", 5))
3259 namedclass =
3260 complement ? ANYOF_NUPPER : ANYOF_UPPER;
3261 break;
3262 case 'w': /* this is not POSIX, this is the Perl \w */
3263 if (strnEQ(posixcc, "word", 4)) {
3264 namedclass =
3265 complement ? ANYOF_NALNUM : ANYOF_ALNUM;
3266 skip = 4;
3267 }
3268 break;
3269 case 'x':
3270 if (strnEQ(posixcc, "xdigit", 6)) {
3271 namedclass =
3272 complement ? ANYOF_NXDIGIT : ANYOF_XDIGIT;
3273 skip = 6;
3274 }
3275 break;
3276 }
ac561586
JH
3277 if (namedclass == OOB_NAMEDCLASS ||
3278 posixcc[skip] != ':' ||
3279 posixcc[skip+1] != ']')
b45f050a
JF
3280 {
3281 Simple_vFAIL3("POSIX class [:%.*s:] unknown",
3282 t - s - 1, s + 1);
3283 }
3284 } else if (!SIZE_ONLY) {
b8c5462f 3285 /* [[=foo=]] and [[.foo.]] are still future. */
b45f050a 3286
830247a4 3287 /* adjust RExC_parse so the warning shows after
b45f050a 3288 the class closes */
830247a4
IZ
3289 while (*RExC_parse && *RExC_parse != ']')
3290 RExC_parse++;
b45f050a
JF
3291 Simple_vFAIL3("POSIX syntax [%c %c] is reserved for future extensions", c, c);
3292 }
b8c5462f
JH
3293 } else {
3294 /* Maternal grandfather:
3295 * "[:" ending in ":" but not in ":]" */
830247a4 3296 RExC_parse = s;
767d463e 3297 }
620e46c5
JH
3298 }
3299 }
3300
b8c5462f
JH
3301 return namedclass;
3302}
3303
3304STATIC void
830247a4 3305S_checkposixcc(pTHX_ RExC_state_t *pRExC_state)
b8c5462f 3306{
e476b1b5 3307 if (!SIZE_ONLY && ckWARN(WARN_REGEXP) &&
830247a4
IZ
3308 (*RExC_parse == ':' ||
3309 *RExC_parse == '=' ||
3310 *RExC_parse == '.')) {
3311 char *s = RExC_parse;
93733859 3312 char c = *s++;
b8c5462f
JH
3313
3314 while(*s && isALNUM(*s))
3315 s++;
3316 if (*s && c == *s && s[1] == ']') {
b45f050a
JF
3317 vWARN3(s+2, "POSIX syntax [%c %c] belongs inside character classes", c, c);
3318
3319 /* [[=foo=]] and [[.foo.]] are still future. */
b8c5462f 3320 if (c == '=' || c == '.')
b45f050a 3321 {
830247a4 3322 /* adjust RExC_parse so the error shows after
b45f050a 3323 the class closes */
830247a4 3324 while (*RExC_parse && *RExC_parse++ != ']')
b45f050a
JF
3325 ;
3326 Simple_vFAIL3("POSIX syntax [%c %c] is reserved for future extensions", c, c);
3327 }
b8c5462f
JH
3328 }
3329 }
620e46c5
JH
3330}
3331
76e3520e 3332STATIC regnode *
830247a4 3333S_regclass(pTHX_ RExC_state_t *pRExC_state)
a687059c 3334{
ffc61ed2 3335 register UV value;
3568d838 3336 register IV prevvalue = OOB_UNICODE;
ffc61ed2 3337 register IV range = 0;
c277df42 3338 register regnode *ret;
ba210ebe 3339 STRLEN numlen;
ffc61ed2 3340 IV namedclass;
9c5ffd7c 3341 char *rangebegin = 0;
936ed897 3342 bool need_class = 0;
9c5ffd7c 3343 SV *listsv = Nullsv;
ffc61ed2
JH
3344 register char *e;
3345 UV n;
3568d838 3346 bool optimize_invert = TRUE;
ffc61ed2
JH
3347
3348 ret = reganode(pRExC_state, ANYOF, 0);
3349
3350 if (!SIZE_ONLY)
3351 ANYOF_FLAGS(ret) = 0;
3352
3353 if (*RExC_parse == '^') { /* Complement of range. */
3354 RExC_naughty++;
3355 RExC_parse++;
3356 if (!SIZE_ONLY)
3357 ANYOF_FLAGS(ret) |= ANYOF_INVERT;
3358 }
a0d0e21e 3359
936ed897 3360 if (SIZE_ONLY)
830247a4 3361 RExC_size += ANYOF_SKIP;
936ed897 3362 else {
830247a4 3363 RExC_emit += ANYOF_SKIP;
936ed897
IZ
3364 if (FOLD)
3365 ANYOF_FLAGS(ret) |= ANYOF_FOLD;
3366 if (LOC)
3367 ANYOF_FLAGS(ret) |= ANYOF_LOCALE;
ffc61ed2
JH
3368 ANYOF_BITMAP_ZERO(ret);
3369 listsv = newSVpvn("# comment\n", 10);
a0d0e21e 3370 }
b8c5462f 3371
e476b1b5 3372 if (!SIZE_ONLY && ckWARN(WARN_REGEXP))
830247a4 3373 checkposixcc(pRExC_state);
b8c5462f 3374
830247a4 3375 if (*RExC_parse == ']' || *RExC_parse == '-')
ffc61ed2
JH
3376 goto charclassloop; /* allow 1st char to be ] or - */
3377
830247a4 3378 while (RExC_parse < RExC_end && *RExC_parse != ']') {
ffc61ed2
JH
3379
3380 charclassloop:
3381
3382 namedclass = OOB_NAMEDCLASS; /* initialize as illegal */
3383
73b437c8 3384 if (!range)
830247a4 3385 rangebegin = RExC_parse;
ffc61ed2 3386 if (UTF) {
5e12f4fb 3387 value = utf8n_to_uvchr((U8*)RExC_parse,
3568d838
JH
3388 RExC_end - RExC_parse,
3389 &numlen, 0);
ffc61ed2
JH
3390 RExC_parse += numlen;
3391 }
3392 else
3393 value = UCHARAT(RExC_parse++);
620e46c5 3394 if (value == '[')
830247a4 3395 namedclass = regpposixcc(pRExC_state, value);
620e46c5 3396 else if (value == '\\') {
ffc61ed2 3397 if (UTF) {
5e12f4fb 3398 value = utf8n_to_uvchr((U8*)RExC_parse,
ffc61ed2
JH
3399 RExC_end - RExC_parse,
3400 &numlen, 0);
3401 RExC_parse += numlen;
3402 }
3403 else
3404 value = UCHARAT(RExC_parse++);
470c3474 3405 /* Some compilers cannot handle switching on 64-bit integer
ffc61ed2 3406 * values, therefore value cannot be an UV. Yes, this will
e2962f66
JH
3407 * be a problem later if we want switch on Unicode.
3408 * A similar issue a little bit later when switching on
3409 * namedclass. --jhi */
ffc61ed2 3410 switch ((I32)value) {
b8c5462f
JH
3411 case 'w': namedclass = ANYOF_ALNUM; break;
3412 case 'W': namedclass = ANYOF_NALNUM; break;
3413 case 's': namedclass = ANYOF_SPACE; break;
3414 case 'S': namedclass = ANYOF_NSPACE; break;
3415 case 'd': namedclass = ANYOF_DIGIT; break;
3416 case 'D': namedclass = ANYOF_NDIGIT; break;
ffc61ed2
JH
3417 case 'p':
3418 case 'P':
3419 if (*RExC_parse == '{') {
3420 e = strchr(RExC_parse++, '}');
3421 if (!e)
3422 vFAIL("Missing right brace on \\p{}");
3423 n = e - RExC_parse;
3424 }
3425 else {
3426 e = RExC_parse;
3427 n = 1;
3428 }
3429 if (!SIZE_ONLY) {
3430 if (value == 'p')
3431 Perl_sv_catpvf(aTHX_ listsv,
3432 "+utf8::%.*s\n", (int)n, RExC_parse);
3433 else
3434 Perl_sv_catpvf(aTHX_ listsv,
3435 "!utf8::%.*s\n", (int)n, RExC_parse);
3436 }
3437 RExC_parse = e + 1;
3438 ANYOF_FLAGS(ret) |= ANYOF_UNICODE;
3439 continue;
b8c5462f
JH
3440 case 'n': value = '\n'; break;
3441 case 'r': value = '\r'; break;
3442 case 't': value = '\t'; break;
3443 case 'f': value = '\f'; break;
3444 case 'b': value = '\b'; break;
c7f1f016
NIS
3445 case 'e': value = ASCII_TO_NATIVE('\033');break;
3446 case 'a': value = ASCII_TO_NATIVE('\007');break;
b8c5462f 3447 case 'x':
ffc61ed2
JH
3448 if (*RExC_parse == '{') {
3449 e = strchr(RExC_parse++, '}');
b81d288d 3450 if (!e)
ffc61ed2
JH
3451 vFAIL("Missing right brace on \\x{}");
3452 numlen = 1; /* allow underscores */
3453 value = (UV)scan_hex(RExC_parse,
3454 e - RExC_parse,
3455 &numlen);
3456 RExC_parse = e + 1;
3457 }
3458 else {
3459 numlen = 0; /* disallow underscores */
3460 value = (UV)scan_hex(RExC_parse, 2, &numlen);
3461 RExC_parse += numlen;
3462 }
b8c5462f
JH
3463 break;
3464 case 'c':
830247a4 3465 value = UCHARAT(RExC_parse++);
b8c5462f
JH
3466 value = toCTRL(value);
3467 break;
3468 case '0': case '1': case '2': case '3': case '4':
3469 case '5': case '6': case '7': case '8': case '9':
b21ed0a9 3470 numlen = 0; /* disallow underscores */
830247a4
IZ
3471 value = (UV)scan_oct(--RExC_parse, 3, &numlen);
3472 RExC_parse += numlen;
b8c5462f 3473 break;
1028017a 3474 default:
e476b1b5 3475 if (!SIZE_ONLY && ckWARN(WARN_REGEXP) && isALPHA(value))
ffc61ed2
JH
3476 vWARN2(RExC_parse,
3477 "Unrecognized escape \\%c in character class passed through",
3478 (int)value);
1028017a 3479 break;
b8c5462f 3480 }
ffc61ed2
JH
3481 } /* end of \blah */
3482
3483 if (namedclass > OOB_NAMEDCLASS) { /* this is a named class \blah */
3484
3485 if (!SIZE_ONLY && !need_class)
936ed897 3486 ANYOF_CLASS_ZERO(ret);
ffc61ed2 3487
936ed897 3488 need_class = 1;
ffc61ed2
JH
3489
3490 /* a bad range like a-\d, a-[:digit:] ? */
3491 if (range) {
73b437c8 3492 if (!SIZE_ONLY) {
e476b1b5 3493 if (ckWARN(WARN_REGEXP))
830247a4 3494 vWARN4(RExC_parse,
b45f050a 3495 "False [] range \"%*.*s\"",
830247a4
IZ
3496 RExC_parse - rangebegin,
3497 RExC_parse - rangebegin,
b45f050a 3498 rangebegin);
3568d838
JH
3499 if (prevvalue < 256) {
3500 ANYOF_BITMAP_SET(ret, prevvalue);
ffc61ed2
JH
3501 ANYOF_BITMAP_SET(ret, '-');
3502 }
3503 else {
3504 ANYOF_FLAGS(ret) |= ANYOF_UNICODE;
3505 Perl_sv_catpvf(aTHX_ listsv,
3568d838 3506 "%04"UVxf"\n%04"UVxf"\n", (UV)prevvalue, (UV) '-');
ffc61ed2 3507 }
b8c5462f 3508 }
ffc61ed2
JH
3509
3510 range = 0; /* this was not a true range */
73b437c8 3511 }
ffc61ed2 3512
73b437c8 3513 if (!SIZE_ONLY) {
3568d838
JH
3514 if (namedclass > OOB_NAMEDCLASS)
3515 optimize_invert = FALSE;
e2962f66
JH
3516 /* Possible truncation here but in some 64-bit environments
3517 * the compiler gets heartburn about switch on 64-bit values.
3518 * A similar issue a little earlier when switching on value.
98f323fa 3519 * --jhi */
e2962f66 3520 switch ((I32)namedclass) {
73b437c8
JH
3521 case ANYOF_ALNUM:
3522 if (LOC)
936ed897 3523 ANYOF_CLASS_SET(ret, ANYOF_ALNUM);
73b437c8
JH
3524 else {
3525 for (value = 0; value < 256; value++)
3526 if (isALNUM(value))
936ed897 3527 ANYOF_BITMAP_SET(ret, value);
73b437c8 3528 }
ffc61ed2 3529 Perl_sv_catpvf(aTHX_ listsv, "+utf8::IsWord\n");
73b437c8
JH
3530 break;
3531 case ANYOF_NALNUM:
3532 if (LOC)
936ed897 3533 ANYOF_CLASS_SET(ret, ANYOF_NALNUM);
73b437c8
JH
3534 else {
3535 for (value = 0; value < 256; value++)
3536 if (!isALNUM(value))
936ed897 3537 ANYOF_BITMAP_SET(ret, value);
73b437c8 3538 }
ffc61ed2 3539 Perl_sv_catpvf(aTHX_ listsv, "!utf8::IsWord\n");
73b437c8 3540 break;
ffc61ed2 3541 case ANYOF_ALNUMC:
73b437c8 3542 if (LOC)
ffc61ed2 3543 ANYOF_CLASS_SET(ret, ANYOF_ALNUMC);
73b437c8
JH
3544 else {
3545 for (value = 0; value < 256; value++)
ffc61ed2 3546 if (isALNUMC(value))
936ed897 3547 ANYOF_BITMAP_SET(ret, value);
73b437c8 3548 }
ffc61ed2 3549 Perl_sv_catpvf(aTHX_ listsv, "+utf8::IsAlnum\n");
73b437c8
JH
3550 break;
3551 case ANYOF_NALNUMC:
3552 if (LOC)
936ed897 3553 ANYOF_CLASS_SET(ret, ANYOF_NALNUMC);
73b437c8
JH
3554 else {
3555 for (value = 0; value < 256; value++)
3556 if (!isALNUMC(value))
936ed897 3557 ANYOF_BITMAP_SET(ret, value);
73b437c8 3558 }
ffc61ed2 3559 Perl_sv_catpvf(aTHX_ listsv, "!utf8::IsAlnum\n");
73b437c8
JH
3560 break;
3561 case ANYOF_ALPHA:
3562 if (LOC)
936ed897 3563 ANYOF_CLASS_SET(ret, ANYOF_ALPHA);
73b437c8
JH
3564 else {
3565 for (value = 0; value < 256; value++)
3566 if (isALPHA(value))
936ed897 3567 ANYOF_BITMAP_SET(ret, value);
73b437c8 3568 }
ffc61ed2 3569 Perl_sv_catpvf(aTHX_ listsv, "+utf8::IsAlpha\n");
73b437c8
JH
3570 break;
3571 case ANYOF_NALPHA:
3572 if (LOC)
936ed897 3573 ANYOF_CLASS_SET(ret, ANYOF_NALPHA);
73b437c8
JH
3574 else {
3575 for (value = 0; value < 256; value++)
3576 if (!isALPHA(value))
936ed897 3577 ANYOF_BITMAP_SET(ret, value);
73b437c8 3578 }
ffc61ed2 3579 Perl_sv_catpvf(aTHX_ listsv, "!utf8::IsAlpha\n");
73b437c8
JH
3580 break;
3581 case ANYOF_ASCII:
3582 if (LOC)
936ed897 3583 ANYOF_CLASS_SET(ret, ANYOF_ASCII);
73b437c8 3584 else {
c7f1f016 3585#ifndef EBCDIC
1ba5c669
JH
3586 for (value = 0; value < 128; value++)
3587 ANYOF_BITMAP_SET(ret, value);
3588#else /* EBCDIC */
ffbc6a93 3589 for (value = 0; value < 256; value++) {
3a3c4447
JH
3590 if (isASCII(value))
3591 ANYOF_BITMAP_SET(ret, value);
ffbc6a93 3592 }
1ba5c669 3593#endif /* EBCDIC */
73b437c8 3594 }
ffc61ed2 3595 Perl_sv_catpvf(aTHX_ listsv, "+utf8::IsASCII\n");
73b437c8
JH
3596 break;
3597 case ANYOF_NASCII:
3598 if (LOC)
936ed897 3599 ANYOF_CLASS_SET(ret, ANYOF_NASCII);
73b437c8 3600 else {
c7f1f016 3601#ifndef EBCDIC
1ba5c669
JH
3602 for (value = 128; value < 256; value++)
3603 ANYOF_BITMAP_SET(ret, value);
3604#else /* EBCDIC */
ffbc6a93 3605 for (value = 0; value < 256; value++) {
3a3c4447
JH
3606 if (!isASCII(value))
3607 ANYOF_BITMAP_SET(ret, value);
ffbc6a93 3608 }
1ba5c669 3609#endif /* EBCDIC */
73b437c8 3610 }
ffc61ed2 3611 Perl_sv_catpvf(aTHX_ listsv, "!utf8::IsASCII\n");
73b437c8 3612 break;
aaa51d5e
JF
3613 case ANYOF_BLANK:
3614 if (LOC)
3615 ANYOF_CLASS_SET(ret, ANYOF_BLANK);
3616 else {
3617 for (value = 0; value < 256; value++)
3618 if (isBLANK(value))
3619 ANYOF_BITMAP_SET(ret, value);
3620 }
ffc61ed2 3621 Perl_sv_catpvf(aTHX_ listsv, "+utf8::IsBlank\n");
aaa51d5e
JF
3622 break;
3623 case ANYOF_NBLANK:
3624 if (LOC)
3625 ANYOF_CLASS_SET(ret, ANYOF_NBLANK);
3626 else {
3627 for (value = 0; value < 256; value++)
3628 if (!isBLANK(value))
3629 ANYOF_BITMAP_SET(ret, value);
3630 }
ffc61ed2 3631 Perl_sv_catpvf(aTHX_ listsv, "!utf8::IsBlank\n");
aaa51d5e 3632 break;
73b437c8
JH
3633 case ANYOF_CNTRL:
3634 if (LOC)
936ed897 3635 ANYOF_CLASS_SET(ret, ANYOF_CNTRL);
73b437c8
JH
3636 else {
3637 for (value = 0; value < 256; value++)
3638 if (isCNTRL(value))
936ed897 3639 ANYOF_BITMAP_SET(ret, value);
73b437c8 3640 }
ffc61ed2 3641 Perl_sv_catpvf(aTHX_ listsv, "+utf8::IsCntrl\n");
73b437c8
JH
3642 break;
3643 case ANYOF_NCNTRL:
3644 if (LOC)
936ed897 3645 ANYOF_CLASS_SET(ret, ANYOF_NCNTRL);
73b437c8
JH
3646 else {
3647 for (value = 0; value < 256; value++)
3648 if (!isCNTRL(value))
936ed897 3649 ANYOF_BITMAP_SET(ret, value);
73b437c8 3650 }
ffc61ed2
JH
3651 Perl_sv_catpvf(aTHX_ listsv, "!utf8::IsCntrl\n");
3652 break;
3653 case ANYOF_DIGIT:
3654 if (LOC)
3655 ANYOF_CLASS_SET(ret, ANYOF_DIGIT);
3656 else {
3657 /* consecutive digits assumed */
3658 for (value = '0'; value <= '9'; value++)
3659 ANYOF_BITMAP_SET(ret, value);
3660 }
3661 Perl_sv_catpvf(aTHX_ listsv, "+utf8::IsDigit\n");
3662 break;
3663 case ANYOF_NDIGIT:
3664 if (LOC)
3665 ANYOF_CLASS_SET(ret, ANYOF_NDIGIT);
3666 else {
3667 /* consecutive digits assumed */
3668 for (value = 0; value < '0'; value++)
3669 ANYOF_BITMAP_SET(ret, value);
3670 for (value = '9' + 1; value < 256; value++)
3671 ANYOF_BITMAP_SET(ret, value);
3672 }
3673 Perl_sv_catpvf(aTHX_ listsv, "!utf8::IsDigit\n");
73b437c8
JH
3674 break;
3675 case ANYOF_GRAPH:
3676 if (LOC)
936ed897 3677 ANYOF_CLASS_SET(ret, ANYOF_GRAPH);
73b437c8
JH
3678 else {
3679 for (value = 0; value < 256; value++)
3680 if (isGRAPH(value))
936ed897 3681 ANYOF_BITMAP_SET(ret, value);
73b437c8 3682 }
ffc61ed2 3683 Perl_sv_catpvf(aTHX_ listsv, "+utf8::IsGraph\n");
73b437c8
JH
3684 break;
3685 case ANYOF_NGRAPH:
3686 if (LOC)
936ed897 3687 ANYOF_CLASS_SET(ret, ANYOF_NGRAPH);
73b437c8
JH
3688 else {
3689 for (value = 0; value < 256; value++)
3690 if (!isGRAPH(value))
936ed897 3691 ANYOF_BITMAP_SET(ret, value);
73b437c8 3692 }
ffc61ed2 3693 Perl_sv_catpvf(aTHX_ listsv, "!utf8::IsGraph\n");
73b437c8
JH
3694 break;
3695 case ANYOF_LOWER:
3696 if (LOC)
936ed897 3697 ANYOF_CLASS_SET(ret, ANYOF_LOWER);
73b437c8
JH
3698 else {
3699 for (value = 0; value < 256; value++)
3700 if (isLOWER(value))
936ed897 3701 ANYOF_BITMAP_SET(ret, value);
73b437c8 3702 }
ffc61ed2 3703 Perl_sv_catpvf(aTHX_ listsv, "+utf8::IsLower\n");
73b437c8
JH
3704 break;
3705 case ANYOF_NLOWER:
3706 if (LOC)
936ed897 3707 ANYOF_CLASS_SET(ret, ANYOF_NLOWER);
73b437c8
JH
3708 else {
3709 for (value = 0; value < 256; value++)
3710 if (!isLOWER(value))
936ed897 3711 ANYOF_BITMAP_SET(ret, value);
73b437c8 3712 }
ffc61ed2 3713 Perl_sv_catpvf(aTHX_ listsv, "!utf8::IsLower\n");
73b437c8
JH
3714 break;
3715 case ANYOF_PRINT:
3716 if (LOC)
936ed897 3717 ANYOF_CLASS_SET(ret, ANYOF_PRINT);
73b437c8
JH
3718 else {
3719 for (value = 0; value < 256; value++)
3720 if (isPRINT(value))
936ed897 3721 ANYOF_BITMAP_SET(ret, value);
73b437c8 3722 }
ffc61ed2 3723 Perl_sv_catpvf(aTHX_ listsv, "+utf8::IsPrint\n");
73b437c8
JH
3724 break;
3725 case ANYOF_NPRINT:
3726 if (LOC)
936ed897 3727 ANYOF_CLASS_SET(ret, ANYOF_NPRINT);
73b437c8
JH
3728 else {
3729 for (value = 0; value < 256; value++)
3730 if (!isPRINT(value))
936ed897 3731 ANYOF_BITMAP_SET(ret, value);
73b437c8 3732 }
ffc61ed2 3733 Perl_sv_catpvf(aTHX_ listsv, "!utf8::IsPrint\n");
73b437c8 3734 break;
aaa51d5e
JF
3735 case ANYOF_PSXSPC:
3736 if (LOC)
3737 ANYOF_CLASS_SET(ret, ANYOF_PSXSPC);
3738 else {
3739 for (value = 0; value < 256; value++)
3740 if (isPSXSPC(value))
3741 ANYOF_BITMAP_SET(ret, value);
3742 }
ffc61ed2 3743 Perl_sv_catpvf(aTHX_ listsv, "+utf8::IsSpace\n");
aaa51d5e
JF
3744 break;
3745 case ANYOF_NPSXSPC:
3746 if (LOC)
3747 ANYOF_CLASS_SET(ret, ANYOF_NPSXSPC);
3748 else {
3749 for (value = 0; value < 256; value++)
3750 if (!isPSXSPC(value))
3751 ANYOF_BITMAP_SET(ret, value);
3752 }
ffc61ed2 3753 Perl_sv_catpvf(aTHX_ listsv, "!utf8::IsSpace\n");
aaa51d5e 3754 break;
73b437c8
JH
3755 case ANYOF_PUNCT:
3756 if (LOC)
936ed897 3757 ANYOF_CLASS_SET(ret, ANYOF_PUNCT);
73b437c8
JH
3758 else {
3759 for (value = 0; value < 256; value++)
3760 if (isPUNCT(value))
936ed897 3761 ANYOF_BITMAP_SET(ret, value);
73b437c8 3762 }
ffc61ed2 3763 Perl_sv_catpvf(aTHX_ listsv, "+utf8::IsPunct\n");
73b437c8
JH
3764 break;
3765 case ANYOF_NPUNCT:
3766 if (LOC)
936ed897 3767 ANYOF_CLASS_SET(ret, ANYOF_NPUNCT);
73b437c8
JH
3768 else {
3769 for (value = 0; value < 256; value++)
3770 if (!isPUNCT(value))
936ed897 3771 ANYOF_BITMAP_SET(ret, value);
73b437c8 3772 }
ffc61ed2
JH
3773 Perl_sv_catpvf(aTHX_ listsv, "!utf8::IsPunct\n");
3774 break;
3775 case ANYOF_SPACE:
3776 if (LOC)
3777 ANYOF_CLASS_SET(ret, ANYOF_SPACE);
3778 else {
3779 for (value = 0; value < 256; value++)
3780 if (isSPACE(value))
3781 ANYOF_BITMAP_SET(ret, value);
3782 }
3783 Perl_sv_catpvf(aTHX_ listsv, "+utf8::IsSpacePerl\n");
3784 break;
3785 case ANYOF_NSPACE:
3786 if (LOC)
3787 ANYOF_CLASS_SET(ret, ANYOF_NSPACE);
3788 else {
3789 for (value = 0; value < 256; value++)
3790 if (!isSPACE(value))
3791 ANYOF_BITMAP_SET(ret, value);
3792 }
3793 Perl_sv_catpvf(aTHX_ listsv, "!utf8::IsSpacePerl\n");
73b437c8
JH
3794 break;
3795 case ANYOF_UPPER:
3796 if (LOC)
936ed897 3797 ANYOF_CLASS_SET(ret, ANYOF_UPPER);
73b437c8
JH
3798 else {
3799 for (value = 0; value < 256; value++)
3800 if (isUPPER(value))
936ed897 3801 ANYOF_BITMAP_SET(ret, value);
73b437c8 3802 }
ffc61ed2 3803 Perl_sv_catpvf(aTHX_ listsv, "+utf8::IsUpper\n");
73b437c8
JH
3804 break;
3805 case ANYOF_NUPPER:
3806 if (LOC)
936ed897 3807 ANYOF_CLASS_SET(ret, ANYOF_NUPPER);
73b437c8
JH
3808 else {
3809 for (value = 0; value < 256; value++)
3810 if (!isUPPER(value))
936ed897 3811 ANYOF_BITMAP_SET(ret, value);
73b437c8 3812 }
ffc61ed2 3813 Perl_sv_catpvf(aTHX_ listsv, "!utf8::IsUpper\n");
73b437c8
JH
3814 break;
3815 case ANYOF_XDIGIT:
3816 if (LOC)
936ed897 3817 ANYOF_CLASS_SET(ret, ANYOF_XDIGIT);
73b437c8
JH
3818 else {
3819 for (value = 0; value < 256; value++)
3820 if (isXDIGIT(value))
936ed897 3821 ANYOF_BITMAP_SET(ret, value);
73b437c8 3822 }
ffc61ed2 3823 Perl_sv_catpvf(aTHX_ listsv, "+utf8::IsXDigit\n");
73b437c8
JH
3824 break;
3825 case ANYOF_NXDIGIT:
3826 if (LOC)
936ed897 3827 ANYOF_CLASS_SET(ret, ANYOF_NXDIGIT);
73b437c8
JH
3828 else {
3829 for (value = 0; value < 256; value++)
3830 if (!isXDIGIT(value))
936ed897 3831 ANYOF_BITMAP_SET(ret, value);
73b437c8 3832 }
ffc61ed2 3833 Perl_sv_catpvf(aTHX_ listsv, "!utf8::IsXDigit\n");
73b437c8
JH
3834 break;
3835 default:
b45f050a 3836 vFAIL("Invalid [::] class");
73b437c8 3837 break;
b8c5462f 3838 }
b8c5462f 3839 if (LOC)
936ed897 3840 ANYOF_FLAGS(ret) |= ANYOF_CLASS;
73b437c8 3841 continue;
a0d0e21e 3842 }
ffc61ed2
JH
3843 } /* end of namedclass \blah */
3844
a0d0e21e 3845 if (range) {
3a3c4447 3846 if (prevvalue > value) /* b-a */ {
b45f050a 3847 Simple_vFAIL4("Invalid [] range \"%*.*s\"",
830247a4
IZ
3848 RExC_parse - rangebegin,
3849 RExC_parse - rangebegin,
b45f050a 3850 rangebegin);
3568d838 3851 range = 0; /* not a valid range */
73b437c8 3852 }
a0d0e21e
LW
3853 }
3854 else {
3568d838 3855 prevvalue = value; /* save the beginning of the range */
830247a4
IZ
3856 if (*RExC_parse == '-' && RExC_parse+1 < RExC_end &&
3857 RExC_parse[1] != ']') {
3858 RExC_parse++;
ffc61ed2
JH
3859
3860 /* a bad range like \w-, [:word:]- ? */
3861 if (namedclass > OOB_NAMEDCLASS) {
e476b1b5 3862 if (ckWARN(WARN_REGEXP))
830247a4 3863 vWARN4(RExC_parse,
b45f050a 3864 "False [] range \"%*.*s\"",
830247a4
IZ
3865 RExC_parse - rangebegin,
3866 RExC_parse - rangebegin,
b45f050a 3867 rangebegin);
73b437c8 3868 if (!SIZE_ONLY)
936ed897 3869 ANYOF_BITMAP_SET(ret, '-');
73b437c8 3870 } else
ffc61ed2
JH
3871 range = 1; /* yeah, it's a range! */
3872 continue; /* but do it the next time */
a0d0e21e 3873 }
a687059c 3874 }
ffc61ed2 3875
93733859 3876 /* now is the next time */
ae5c130c 3877 if (!SIZE_ONLY) {
3568d838
JH
3878 IV i;
3879
3880 if (prevvalue < 256) {
3881 IV ceilvalue = value < 256 ? value : 255;
3882
3883#ifdef EBCDIC
3a3c4447
JH
3884 if ((isLOWER(prevvalue) && isLOWER(ceilvalue)) ||
3885 (isUPPER(prevvalue) && isUPPER(ceilvalue)))
ffc61ed2 3886 {
3568d838
JH
3887 if (isLOWER(prevvalue)) {
3888 for (i = prevvalue; i <= ceilvalue; i++)
ffc61ed2
JH
3889 if (isLOWER(i))
3890 ANYOF_BITMAP_SET(ret, i);
3891 } else {
3568d838 3892 for (i = prevvalue; i <= ceilvalue; i++)
ffc61ed2
JH
3893 if (isUPPER(i))
3894 ANYOF_BITMAP_SET(ret, i);
3895 }
8ada0baa 3896 }
ffc61ed2 3897 else
8ada0baa 3898#endif
3568d838
JH
3899 for (i = prevvalue; i <= ceilvalue; i++)
3900 ANYOF_BITMAP_SET(ret, i);
3901 }
3902 if (value > 255) {
ffc61ed2 3903 ANYOF_FLAGS(ret) |= ANYOF_UNICODE;
3568d838 3904 if (prevvalue < value)
ffc61ed2 3905 Perl_sv_catpvf(aTHX_ listsv, "%04"UVxf"\t%04"UVxf"\n",
3568d838
JH
3906 (UV)prevvalue, (UV)value);
3907 else if (prevvalue == value)
ffc61ed2
JH
3908 Perl_sv_catpvf(aTHX_ listsv, "%04"UVxf"\n",
3909 (UV)value);
3910 }
8ada0baa 3911 }
ffc61ed2
JH
3912
3913 range = 0; /* this range (if it was one) is done now */
a0d0e21e 3914 }
ffc61ed2 3915
936ed897 3916 if (need_class) {
4f66b38d 3917 ANYOF_FLAGS(ret) |= ANYOF_LARGE;
936ed897 3918 if (SIZE_ONLY)
830247a4 3919 RExC_size += ANYOF_CLASS_ADD_SKIP;
936ed897 3920 else
830247a4 3921 RExC_emit += ANYOF_CLASS_ADD_SKIP;
936ed897 3922 }
ffc61ed2 3923
ae5c130c 3924 /* optimize case-insensitive simple patterns (e.g. /[a-z]/i) */
b8c5462f 3925 if (!SIZE_ONLY &&
ffc61ed2 3926 /* If the only flag is folding (plus possibly inversion). */
516a5887
JH
3927 ((ANYOF_FLAGS(ret) & (ANYOF_FLAGS_ALL ^ ANYOF_INVERT)) == ANYOF_FOLD)
3928 ) {
a0ed51b3 3929 for (value = 0; value < 256; ++value) {
936ed897 3930 if (ANYOF_BITMAP_TEST(ret, value)) {
ffc61ed2
JH
3931 IV fold = PL_fold[value];
3932
3933 if (fold != value)
3934 ANYOF_BITMAP_SET(ret, fold);
ae5c130c
GS
3935 }
3936 }
936ed897 3937 ANYOF_FLAGS(ret) &= ~ANYOF_FOLD;
ae5c130c 3938 }
ffc61ed2 3939
ae5c130c 3940 /* optimize inverted simple patterns (e.g. [^a-z]) */
3568d838 3941 if (!SIZE_ONLY && optimize_invert &&
ffc61ed2
JH
3942 /* If the only flag is inversion. */
3943 (ANYOF_FLAGS(ret) & ANYOF_FLAGS_ALL) == ANYOF_INVERT) {
b8c5462f 3944 for (value = 0; value < ANYOF_BITMAP_SIZE; ++value)
936ed897 3945 ANYOF_BITMAP(ret)[value] ^= ANYOF_FLAGS_ALL;
1aa99e6b 3946 ANYOF_FLAGS(ret) = ANYOF_UNICODE_ALL;
ae5c130c 3947 }
a0d0e21e 3948
b81d288d 3949 if (!SIZE_ONLY) {
fde631ed 3950 AV *av = newAV();
ffc61ed2
JH
3951 SV *rv;
3952
3953 av_store(av, 0, listsv);
3954 av_store(av, 1, NULL);
3955 rv = newRV_noinc((SV*)av);
19860706 3956 n = add_data(pRExC_state, 1, "s");
830247a4 3957 RExC_rx->data->data[n] = (void*)rv;
ffc61ed2 3958 ARG_SET(ret, n);
a0ed51b3
LW
3959 }
3960
3961 return ret;
3962}
3963
76e3520e 3964STATIC char*
830247a4 3965S_nextchar(pTHX_ RExC_state_t *pRExC_state)
a0d0e21e 3966{
830247a4 3967 char* retval = RExC_parse++;
a0d0e21e 3968
4633a7c4 3969 for (;;) {
830247a4
IZ
3970 if (*RExC_parse == '(' && RExC_parse[1] == '?' &&
3971 RExC_parse[2] == '#') {
3972 while (*RExC_parse && *RExC_parse != ')')
3973 RExC_parse++;
3974 RExC_parse++;
4633a7c4
LW
3975 continue;
3976 }
830247a4
IZ
3977 if (RExC_flags16 & PMf_EXTENDED) {
3978 if (isSPACE(*RExC_parse)) {
3979 RExC_parse++;
748a9306
LW
3980 continue;
3981 }
830247a4
IZ
3982 else if (*RExC_parse == '#') {
3983 while (*RExC_parse && *RExC_parse != '\n')
3984 RExC_parse++;
3985 RExC_parse++;
748a9306
LW
3986 continue;
3987 }
748a9306 3988 }
4633a7c4 3989 return retval;
a0d0e21e 3990 }
a687059c
LW
3991}
3992
3993/*
c277df42 3994- reg_node - emit a node
a0d0e21e 3995*/
76e3520e 3996STATIC regnode * /* Location. */
830247a4 3997S_reg_node(pTHX_ RExC_state_t *pRExC_state, U8 op)
a687059c 3998{
c277df42
IZ
3999 register regnode *ret;
4000 register regnode *ptr;
a687059c 4001
830247a4 4002 ret = RExC_emit;
c277df42 4003 if (SIZE_ONLY) {
830247a4
IZ
4004 SIZE_ALIGN(RExC_size);
4005 RExC_size += 1;
a0d0e21e
LW
4006 return(ret);
4007 }
a687059c 4008
c277df42 4009 NODE_ALIGN_FILL(ret);
a0d0e21e 4010 ptr = ret;
c277df42 4011 FILL_ADVANCE_NODE(ptr, op);
fac92740
MJD
4012 if (RExC_offsets) { /* MJD */
4013 MJD_OFFSET_DEBUG((stderr, "%s:%u: (op %s) %s %u <- %u (len %u) (max %u).\n",
4014 "reg_node", __LINE__,
4015 reg_name[op],
4016 RExC_emit - RExC_emit_start > RExC_offsets[0]
4017 ? "Overwriting end of array!\n" : "OK",
4018 RExC_emit - RExC_emit_start,
4019 RExC_parse - RExC_start,
4020 RExC_offsets[0]));
4021 Set_Node_Offset(RExC_emit, RExC_parse + (op == END));
4022 }
4023
830247a4 4024 RExC_emit = ptr;
a687059c 4025
a0d0e21e 4026 return(ret);
a687059c
LW
4027}
4028
4029/*
a0d0e21e
LW
4030- reganode - emit a node with an argument
4031*/
76e3520e 4032STATIC regnode * /* Location. */
830247a4 4033S_reganode(pTHX_ RExC_state_t *pRExC_state, U8 op, U32 arg)
fe14fcc3 4034{
c277df42
IZ
4035 register regnode *ret;
4036 register regnode *ptr;
fe14fcc3 4037
830247a4 4038 ret = RExC_emit;
c277df42 4039 if (SIZE_ONLY) {
830247a4
IZ
4040 SIZE_ALIGN(RExC_size);
4041 RExC_size += 2;
a0d0e21e
LW
4042 return(ret);
4043 }
fe14fcc3 4044
c277df42 4045 NODE_ALIGN_FILL(ret);
a0d0e21e 4046 ptr = ret;
c277df42 4047 FILL_ADVANCE_NODE_ARG(ptr, op, arg);
fac92740
MJD
4048 if (RExC_offsets) { /* MJD */
4049 MJD_OFFSET_DEBUG((stderr, "%s: %s %u <- %u (max %u).\n",
4050 "reganode",
4051 RExC_emit - RExC_emit_start > RExC_offsets[0] ?
4052 "Overwriting end of array!\n" : "OK",
4053 RExC_emit - RExC_emit_start,
4054 RExC_parse - RExC_start,
4055 RExC_offsets[0]));
4056 Set_Cur_Node_Offset;
4057 }
4058
830247a4 4059 RExC_emit = ptr;
fe14fcc3 4060
a0d0e21e 4061 return(ret);
fe14fcc3
LW
4062}
4063
4064/*
cd439c50 4065- reguni - emit (if appropriate) a Unicode character
a0ed51b3
LW
4066*/
4067STATIC void
830247a4 4068S_reguni(pTHX_ RExC_state_t *pRExC_state, UV uv, char* s, STRLEN* lenp)
a0ed51b3 4069{
5e12f4fb 4070 *lenp = SIZE_ONLY ? UNISKIP(uv) : (uvchr_to_utf8((U8*)s, uv) - (U8*)s);
a0ed51b3
LW
4071}
4072
4073/*
a0d0e21e
LW
4074- reginsert - insert an operator in front of already-emitted operand
4075*
4076* Means relocating the operand.
4077*/
76e3520e 4078STATIC void
830247a4 4079S_reginsert(pTHX_ RExC_state_t *pRExC_state, U8 op, regnode *opnd)
a687059c 4080{
c277df42
IZ
4081 register regnode *src;
4082 register regnode *dst;
4083 register regnode *place;
4084 register int offset = regarglen[(U8)op];
b81d288d 4085
22c35a8c 4086/* (PL_regkind[(U8)op] == CURLY ? EXTRA_STEP_2ARGS : 0); */
c277df42
IZ
4087
4088 if (SIZE_ONLY) {
830247a4 4089 RExC_size += NODE_STEP_REGNODE + offset;
a0d0e21e
LW
4090 return;
4091 }
a687059c 4092
830247a4
IZ
4093 src = RExC_emit;
4094 RExC_emit += NODE_STEP_REGNODE + offset;
4095 dst = RExC_emit;
fac92740 4096 while (src > opnd) {
c277df42 4097 StructCopy(--src, --dst, regnode);
fac92740
MJD
4098 if (RExC_offsets) { /* MJD 20010112 */
4099 MJD_OFFSET_DEBUG((stderr, "%s: %s copy %u -> %u (max %u).\n",
4100 "reg_insert",
4101 dst - RExC_emit_start > RExC_offsets[0]
4102 ? "Overwriting end of array!\n" : "OK",
4103 src - RExC_emit_start,
4104 dst - RExC_emit_start,
4105 RExC_offsets[0]));
4106 Set_Node_Offset_To_R(dst-RExC_emit_start, Node_Offset(src));
4107 Set_Node_Length_To_R(dst-RExC_emit_start, Node_Length(src));
4108 }
4109 }
4110
a0d0e21e
LW
4111
4112 place = opnd; /* Op node, where operand used to be. */
fac92740
MJD
4113 if (RExC_offsets) { /* MJD */
4114 MJD_OFFSET_DEBUG((stderr, "%s: %s %u <- %u (max %u).\n",
4115 "reginsert",
4116 place - RExC_emit_start > RExC_offsets[0]
4117 ? "Overwriting end of array!\n" : "OK",
4118 place - RExC_emit_start,
4119 RExC_parse - RExC_start,
4120 RExC_offsets[0]));
4121 Set_Node_Offset(place, RExC_parse);
4122 }
c277df42
IZ
4123 src = NEXTOPER(place);
4124 FILL_ADVANCE_NODE(place, op);
4125 Zero(src, offset, regnode);
a687059c
LW
4126}
4127
4128/*
c277df42 4129- regtail - set the next-pointer at the end of a node chain of p to val.
a0d0e21e 4130*/
76e3520e 4131STATIC void
830247a4 4132S_regtail(pTHX_ RExC_state_t *pRExC_state, regnode *p, regnode *val)
a687059c 4133{
c277df42
IZ
4134 register regnode *scan;
4135 register regnode *temp;
a0d0e21e 4136
c277df42 4137 if (SIZE_ONLY)
a0d0e21e
LW
4138 return;
4139
4140 /* Find last node. */
4141 scan = p;
4142 for (;;) {
4143 temp = regnext(scan);
4144 if (temp == NULL)
4145 break;
4146 scan = temp;
4147 }
a687059c 4148
c277df42
IZ
4149 if (reg_off_by_arg[OP(scan)]) {
4150 ARG_SET(scan, val - scan);
a0ed51b3
LW
4151 }
4152 else {
c277df42
IZ
4153 NEXT_OFF(scan) = val - scan;
4154 }
a687059c
LW
4155}
4156
4157/*
a0d0e21e
LW
4158- regoptail - regtail on operand of first argument; nop if operandless
4159*/
76e3520e 4160STATIC void
830247a4 4161S_regoptail(pTHX_ RExC_state_t *pRExC_state, regnode *p, regnode *val)
a687059c 4162{
a0d0e21e 4163 /* "Operandless" and "op != BRANCH" are synonymous in practice. */
c277df42
IZ
4164 if (p == NULL || SIZE_ONLY)
4165 return;
22c35a8c 4166 if (PL_regkind[(U8)OP(p)] == BRANCH) {
830247a4 4167 regtail(pRExC_state, NEXTOPER(p), val);
a0ed51b3 4168 }
22c35a8c 4169 else if ( PL_regkind[(U8)OP(p)] == BRANCHJ) {
830247a4 4170 regtail(pRExC_state, NEXTOPER(NEXTOPER(p)), val);
a0ed51b3
LW
4171 }
4172 else
a0d0e21e 4173 return;
a687059c
LW
4174}
4175
4176/*
4177 - regcurly - a little FSA that accepts {\d+,?\d*}
4178 */
79072805 4179STATIC I32
cea2e8a9 4180S_regcurly(pTHX_ register char *s)
a687059c
LW
4181{
4182 if (*s++ != '{')
4183 return FALSE;
f0fcb552 4184 if (!isDIGIT(*s))
a687059c 4185 return FALSE;
f0fcb552 4186 while (isDIGIT(*s))
a687059c
LW
4187 s++;
4188 if (*s == ',')
4189 s++;
f0fcb552 4190 while (isDIGIT(*s))
a687059c
LW
4191 s++;
4192 if (*s != '}')
4193 return FALSE;
4194 return TRUE;
4195}
4196
a687059c 4197
8fa7f367
JH
4198#ifdef DEBUGGING
4199
76e3520e 4200STATIC regnode *
cea2e8a9 4201S_dumpuntil(pTHX_ regnode *start, regnode *node, regnode *last, SV* sv, I32 l)
c277df42 4202{
f248d071 4203 register U8 op = EXACT; /* Arbitrary non-END op. */
155aba94 4204 register regnode *next;
c277df42
IZ
4205
4206 while (op != END && (!last || node < last)) {
4207 /* While that wasn't END last time... */
4208
4209 NODE_ALIGN(node);
4210 op = OP(node);
4211 if (op == CLOSE)
4212 l--;
4213 next = regnext(node);
4214 /* Where, what. */
4215 if (OP(node) == OPTIMIZED)
4216 goto after_print;
4217 regprop(sv, node);
b900a521 4218 PerlIO_printf(Perl_debug_log, "%4"IVdf":%*s%s", (IV)(node - start),
f1dbda3d 4219 (int)(2*l + 1), "", SvPVX(sv));
c277df42
IZ
4220 if (next == NULL) /* Next ptr. */
4221 PerlIO_printf(Perl_debug_log, "(0)");
b81d288d 4222 else
b900a521 4223 PerlIO_printf(Perl_debug_log, "(%"IVdf")", (IV)(next - start));
c277df42
IZ
4224 (void)PerlIO_putc(Perl_debug_log, '\n');
4225 after_print:
22c35a8c 4226 if (PL_regkind[(U8)op] == BRANCHJ) {
b81d288d
AB
4227 register regnode *nnode = (OP(next) == LONGJMP
4228 ? regnext(next)
c277df42
IZ
4229 : next);
4230 if (last && nnode > last)
4231 nnode = last;
4232 node = dumpuntil(start, NEXTOPER(NEXTOPER(node)), nnode, sv, l + 1);
a0ed51b3 4233 }
22c35a8c 4234 else if (PL_regkind[(U8)op] == BRANCH) {
c277df42 4235 node = dumpuntil(start, NEXTOPER(node), next, sv, l + 1);
a0ed51b3
LW
4236 }
4237 else if ( op == CURLY) { /* `next' might be very big: optimizer */
c277df42
IZ
4238 node = dumpuntil(start, NEXTOPER(node) + EXTRA_STEP_2ARGS,
4239 NEXTOPER(node) + EXTRA_STEP_2ARGS + 1, sv, l + 1);
a0ed51b3 4240 }
22c35a8c 4241 else if (PL_regkind[(U8)op] == CURLY && op != CURLYX) {
c277df42
IZ
4242 node = dumpuntil(start, NEXTOPER(node) + EXTRA_STEP_2ARGS,
4243 next, sv, l + 1);
a0ed51b3
LW
4244 }
4245 else if ( op == PLUS || op == STAR) {
c277df42 4246 node = dumpuntil(start, NEXTOPER(node), NEXTOPER(node) + 1, sv, l + 1);
a0ed51b3
LW
4247 }
4248 else if (op == ANYOF) {
4f66b38d
HS
4249 /* arglen 1 + class block */
4250 node += 1 + ((ANYOF_FLAGS(node) & ANYOF_LARGE)
4251 ? ANYOF_CLASS_SKIP : ANYOF_SKIP);
4252 node = NEXTOPER(node);
a0ed51b3 4253 }
22c35a8c 4254 else if (PL_regkind[(U8)op] == EXACT) {
c277df42 4255 /* Literal string, where present. */
cd439c50 4256 node += NODE_SZ_STR(node) - 1;
c277df42 4257 node = NEXTOPER(node);
a0ed51b3
LW
4258 }
4259 else {
c277df42
IZ
4260 node = NEXTOPER(node);
4261 node += regarglen[(U8)op];
4262 }
4263 if (op == CURLYX || op == OPEN)
4264 l++;
4265 else if (op == WHILEM)
4266 l--;
4267 }
4268 return node;
4269}
4270
8fa7f367
JH
4271#endif /* DEBUGGING */
4272
a687059c 4273/*
fd181c75 4274 - regdump - dump a regexp onto Perl_debug_log in vaguely comprehensible form
a687059c
LW
4275 */
4276void
864dbfa3 4277Perl_regdump(pTHX_ regexp *r)
a687059c 4278{
35ff7856 4279#ifdef DEBUGGING
46fc3d4c 4280 SV *sv = sv_newmortal();
a687059c 4281
c277df42 4282 (void)dumpuntil(r->program, r->program + 1, NULL, sv, 0);
a0d0e21e
LW
4283
4284 /* Header fields of interest. */
c277df42 4285 if (r->anchored_substr)
7b0972df 4286 PerlIO_printf(Perl_debug_log,
b81d288d 4287 "anchored `%s%.*s%s'%s at %"IVdf" ",
3280af22 4288 PL_colors[0],
7b0972df 4289 (int)(SvCUR(r->anchored_substr) - (SvTAIL(r->anchored_substr)!=0)),
b81d288d 4290 SvPVX(r->anchored_substr),
3280af22 4291 PL_colors[1],
c277df42 4292 SvTAIL(r->anchored_substr) ? "$" : "",
7b0972df 4293 (IV)r->anchored_offset);
c277df42 4294 if (r->float_substr)
7b0972df 4295 PerlIO_printf(Perl_debug_log,
b81d288d 4296 "floating `%s%.*s%s'%s at %"IVdf"..%"UVuf" ",
3280af22 4297 PL_colors[0],
b81d288d 4298 (int)(SvCUR(r->float_substr) - (SvTAIL(r->float_substr)!=0)),
2c2d71f5 4299 SvPVX(r->float_substr),
3280af22 4300 PL_colors[1],
c277df42 4301 SvTAIL(r->float_substr) ? "$" : "",
7b0972df 4302 (IV)r->float_min_offset, (UV)r->float_max_offset);
c277df42 4303 if (r->check_substr)
b81d288d
AB
4304 PerlIO_printf(Perl_debug_log,
4305 r->check_substr == r->float_substr
c277df42
IZ
4306 ? "(checking floating" : "(checking anchored");
4307 if (r->reganch & ROPT_NOSCAN)
4308 PerlIO_printf(Perl_debug_log, " noscan");
4309 if (r->reganch & ROPT_CHECK_ALL)
4310 PerlIO_printf(Perl_debug_log, " isall");
4311 if (r->check_substr)
4312 PerlIO_printf(Perl_debug_log, ") ");
4313
46fc3d4c 4314 if (r->regstclass) {
4315 regprop(sv, r->regstclass);
4316 PerlIO_printf(Perl_debug_log, "stclass `%s' ", SvPVX(sv));
4317 }
774d564b 4318 if (r->reganch & ROPT_ANCH) {
4319 PerlIO_printf(Perl_debug_log, "anchored");
4320 if (r->reganch & ROPT_ANCH_BOL)
4321 PerlIO_printf(Perl_debug_log, "(BOL)");
c277df42
IZ
4322 if (r->reganch & ROPT_ANCH_MBOL)
4323 PerlIO_printf(Perl_debug_log, "(MBOL)");
cad2e5aa
JH
4324 if (r->reganch & ROPT_ANCH_SBOL)
4325 PerlIO_printf(Perl_debug_log, "(SBOL)");
774d564b 4326 if (r->reganch & ROPT_ANCH_GPOS)
4327 PerlIO_printf(Perl_debug_log, "(GPOS)");
4328 PerlIO_putc(Perl_debug_log, ' ');
4329 }
c277df42
IZ
4330 if (r->reganch & ROPT_GPOS_SEEN)
4331 PerlIO_printf(Perl_debug_log, "GPOS ");
a0d0e21e 4332 if (r->reganch & ROPT_SKIP)
760ac839 4333 PerlIO_printf(Perl_debug_log, "plus ");
a0d0e21e 4334 if (r->reganch & ROPT_IMPLICIT)
760ac839 4335 PerlIO_printf(Perl_debug_log, "implicit ");
760ac839 4336 PerlIO_printf(Perl_debug_log, "minlen %ld ", (long) r->minlen);
ce862d02
IZ
4337 if (r->reganch & ROPT_EVAL_SEEN)
4338 PerlIO_printf(Perl_debug_log, "with eval ");
760ac839 4339 PerlIO_printf(Perl_debug_log, "\n");
fac92740
MJD
4340 if (r->offsets) {
4341 U32 i;
4342 U32 len = r->offsets[0];
392fbf5d 4343 PerlIO_printf(Perl_debug_log, "Offsets: [%"UVuf"]\n\t", (UV)r->offsets[0]);
fac92740 4344 for (i = 1; i <= len; i++)
392fbf5d
RB
4345 PerlIO_printf(Perl_debug_log, "%"UVuf"[%"UVuf"] ",
4346 (UV)r->offsets[i*2-1],
4347 (UV)r->offsets[i*2]);
fac92740
MJD
4348 PerlIO_printf(Perl_debug_log, "\n");
4349 }
17c3b450 4350#endif /* DEBUGGING */
a687059c
LW
4351}
4352
8fa7f367
JH
4353#ifdef DEBUGGING
4354
653099ff
GS
4355STATIC void
4356S_put_byte(pTHX_ SV *sv, int c)
4357{
7be5a6cf 4358 if (isCNTRL(c) || c == 255 || !isPRINT(c))
653099ff
GS
4359 Perl_sv_catpvf(aTHX_ sv, "\\%o", c);
4360 else if (c == '-' || c == ']' || c == '\\' || c == '^')
4361 Perl_sv_catpvf(aTHX_ sv, "\\%c", c);
4362 else
4363 Perl_sv_catpvf(aTHX_ sv, "%c", c);
4364}
4365
8fa7f367
JH
4366#endif /* DEBUGGING */
4367
a687059c 4368/*
a0d0e21e
LW
4369- regprop - printable representation of opcode
4370*/
46fc3d4c 4371void
864dbfa3 4372Perl_regprop(pTHX_ SV *sv, regnode *o)
a687059c 4373{
35ff7856 4374#ifdef DEBUGGING
9b155405 4375 register int k;
a0d0e21e 4376
54dc92de 4377 sv_setpvn(sv, "", 0);
9b155405 4378 if (OP(o) >= reg_num) /* regnode.type is unsigned */
830247a4
IZ
4379 /* It would be nice to FAIL() here, but this may be called from
4380 regexec.c, and it would be hard to supply pRExC_state. */
4381 Perl_croak(aTHX_ "Corrupted regexp opcode");
9b155405
IZ
4382 sv_catpv(sv, (char*)reg_name[OP(o)]); /* Take off const! */
4383
4384 k = PL_regkind[(U8)OP(o)];
4385
4386 if (k == EXACT)
7821416a 4387 Perl_sv_catpvf(aTHX_ sv, " <%s%.*s%s>", PL_colors[0],
cd439c50 4388 STR_LEN(o), STRING(o), PL_colors[1]);
9b155405 4389 else if (k == CURLY) {
cb434fcc 4390 if (OP(o) == CURLYM || OP(o) == CURLYN || OP(o) == CURLYX)
cea2e8a9
GS
4391 Perl_sv_catpvf(aTHX_ sv, "[%d]", o->flags); /* Parenth number */
4392 Perl_sv_catpvf(aTHX_ sv, " {%d,%d}", ARG1(o), ARG2(o));
a0d0e21e 4393 }
2c2d71f5
JH
4394 else if (k == WHILEM && o->flags) /* Ordinal/of */
4395 Perl_sv_catpvf(aTHX_ sv, "[%d/%d]", o->flags & 0xf, o->flags>>4);
9b155405 4396 else if (k == REF || k == OPEN || k == CLOSE || k == GROUPP )
894356b3 4397 Perl_sv_catpvf(aTHX_ sv, "%d", (int)ARG(o)); /* Parenth number */
9b155405 4398 else if (k == LOGICAL)
04ebc1ab 4399 Perl_sv_catpvf(aTHX_ sv, "[%d]", o->flags); /* 2: embedded, otherwise 1 */
653099ff
GS
4400 else if (k == ANYOF) {
4401 int i, rangestart = -1;
ffc61ed2 4402 U8 flags = ANYOF_FLAGS(o);
19860706
JH
4403 const char * const anyofs[] = { /* Should be syncronized with
4404 * ANYOF_ #xdefines in regcomp.h */
653099ff
GS
4405 "\\w",
4406 "\\W",
4407 "\\s",
4408 "\\S",
4409 "\\d",
4410 "\\D",
4411 "[:alnum:]",
4412 "[:^alnum:]",
4413 "[:alpha:]",
4414 "[:^alpha:]",
4415 "[:ascii:]",
4416 "[:^ascii:]",
4417 "[:ctrl:]",
4418 "[:^ctrl:]",
4419 "[:graph:]",
4420 "[:^graph:]",
4421 "[:lower:]",
4422 "[:^lower:]",
4423 "[:print:]",
4424 "[:^print:]",
4425 "[:punct:]",
4426 "[:^punct:]",
4427 "[:upper:]",
aaa51d5e 4428 "[:^upper:]",
653099ff 4429 "[:xdigit:]",
aaa51d5e
JF
4430 "[:^xdigit:]",
4431 "[:space:]",
4432 "[:^space:]",
4433 "[:blank:]",
4434 "[:^blank:]"
653099ff
GS
4435 };
4436
19860706 4437 if (flags & ANYOF_LOCALE)
653099ff 4438 sv_catpv(sv, "{loc}");
19860706 4439 if (flags & ANYOF_FOLD)
653099ff
GS
4440 sv_catpv(sv, "{i}");
4441 Perl_sv_catpvf(aTHX_ sv, "[%s", PL_colors[0]);
19860706 4442 if (flags & ANYOF_INVERT)
653099ff 4443 sv_catpv(sv, "^");
ffc61ed2
JH
4444 for (i = 0; i <= 256; i++) {
4445 if (i < 256 && ANYOF_BITMAP_TEST(o,i)) {
4446 if (rangestart == -1)
4447 rangestart = i;
4448 } else if (rangestart != -1) {
4449 if (i <= rangestart + 3)
4450 for (; rangestart < i; rangestart++)
653099ff 4451 put_byte(sv, rangestart);
ffc61ed2
JH
4452 else {
4453 put_byte(sv, rangestart);
4454 sv_catpv(sv, "-");
4455 put_byte(sv, i - 1);
653099ff 4456 }
ffc61ed2 4457 rangestart = -1;
653099ff 4458 }
847a199f 4459 }
ffc61ed2
JH
4460
4461 if (o->flags & ANYOF_CLASS)
4462 for (i = 0; i < sizeof(anyofs)/sizeof(char*); i++)
4463 if (ANYOF_CLASS_TEST(o,i))
4464 sv_catpv(sv, anyofs[i]);
4465
4466 if (flags & ANYOF_UNICODE)
4467 sv_catpv(sv, "{unicode}");
1aa99e6b
IH
4468 else if (flags & ANYOF_UNICODE_ALL)
4469 sv_catpv(sv, "{all-unicode}");
ffc61ed2
JH
4470
4471 {
4472 SV *lv;
4473 SV *sw = regclass_swash(o, FALSE, &lv);
b81d288d 4474
ffc61ed2
JH
4475 if (lv) {
4476 if (sw) {
4477 UV i;
4478 U8 s[UTF8_MAXLEN+1];
b81d288d 4479
ffc61ed2 4480 for (i = 0; i <= 256; i++) { /* just the first 256 */
2b9d42f0 4481 U8 *e = uvchr_to_utf8(s, i);
ffc61ed2 4482
3568d838 4483 if (i < 256 && swash_fetch(sw, s, TRUE)) {
ffc61ed2
JH
4484 if (rangestart == -1)
4485 rangestart = i;
4486 } else if (rangestart != -1) {
4487 U8 *p;
b81d288d 4488
ffc61ed2
JH
4489 if (i <= rangestart + 3)
4490 for (; rangestart < i; rangestart++) {
2b9d42f0 4491 for(e = uvchr_to_utf8(s, rangestart), p = s; p < e; p++)
ffc61ed2
JH
4492 put_byte(sv, *p);
4493 }
4494 else {
2b9d42f0 4495 for (e = uvchr_to_utf8(s, rangestart), p = s; p < e; p++)
ffc61ed2
JH
4496 put_byte(sv, *p);
4497 sv_catpv(sv, "-");
2b9d42f0 4498 for (e = uvchr_to_utf8(s, i - 1), p = s; p < e; p++)
ffc61ed2
JH
4499 put_byte(sv, *p);
4500 }
4501 rangestart = -1;
4502 }
19860706 4503 }
ffc61ed2
JH
4504
4505 sv_catpv(sv, "..."); /* et cetera */
19860706 4506 }
fde631ed 4507
ffc61ed2
JH
4508 {
4509 char *s = savepv(SvPVX(lv));
4510 char *origs = s;
b81d288d 4511
ffc61ed2 4512 while(*s && *s != '\n') s++;
b81d288d 4513
ffc61ed2
JH
4514 if (*s == '\n') {
4515 char *t = ++s;
4516
4517 while (*s) {
4518 if (*s == '\n')
4519 *s = ' ';
4520 s++;
4521 }
4522 if (s[-1] == ' ')
4523 s[-1] = 0;
4524
4525 sv_catpv(sv, t);
fde631ed 4526 }
b81d288d 4527
ffc61ed2 4528 Safefree(origs);
fde631ed
JH
4529 }
4530 }
653099ff 4531 }
ffc61ed2 4532
653099ff
GS
4533 Perl_sv_catpvf(aTHX_ sv, "%s]", PL_colors[1]);
4534 }
9b155405 4535 else if (k == BRANCHJ && (OP(o) == UNLESSM || OP(o) == IFMATCH))
cea2e8a9 4536 Perl_sv_catpvf(aTHX_ sv, "[-%d]", o->flags);
17c3b450 4537#endif /* DEBUGGING */
35ff7856 4538}
a687059c 4539
cad2e5aa
JH
4540SV *
4541Perl_re_intuit_string(pTHX_ regexp *prog)
4542{ /* Assume that RE_INTUIT is set */
4543 DEBUG_r(
4544 { STRLEN n_a;
4545 char *s = SvPV(prog->check_substr,n_a);
4546
4547 if (!PL_colorset) reginitcolors();
4548 PerlIO_printf(Perl_debug_log,
4549 "%sUsing REx substr:%s `%s%.60s%s%s'\n",
4550 PL_colors[4],PL_colors[5],PL_colors[0],
4551 s,
4552 PL_colors[1],
4553 (strlen(s) > 60 ? "..." : ""));
4554 } );
4555
4556 return prog->check_substr;
4557}
4558
2b69d0c2 4559void
864dbfa3 4560Perl_pregfree(pTHX_ struct regexp *r)
a687059c 4561{
adac82c7 4562 DEBUG_r(if (!PL_colorset) reginitcolors());
7821416a
IZ
4563
4564 if (!r || (--r->refcnt > 0))
4565 return;
cad2e5aa
JH
4566 DEBUG_r(PerlIO_printf(Perl_debug_log,
4567 "%sFreeing REx:%s `%s%.60s%s%s'\n",
4568 PL_colors[4],PL_colors[5],PL_colors[0],
4569 r->precomp,
4570 PL_colors[1],
4571 (strlen(r->precomp) > 60 ? "..." : "")));
4572
c277df42 4573 if (r->precomp)
a0d0e21e 4574 Safefree(r->precomp);
fac92740
MJD
4575 if (r->offsets) /* 20010421 MJD */
4576 Safefree(r->offsets);
cf93c79d
IZ
4577 if (RX_MATCH_COPIED(r))
4578 Safefree(r->subbeg);
a193d654
GS
4579 if (r->substrs) {
4580 if (r->anchored_substr)
4581 SvREFCNT_dec(r->anchored_substr);
4582 if (r->float_substr)
4583 SvREFCNT_dec(r->float_substr);
2779dcf1 4584 Safefree(r->substrs);
a193d654 4585 }
c277df42
IZ
4586 if (r->data) {
4587 int n = r->data->count;
dfad63ad
HS
4588 AV* new_comppad = NULL;
4589 AV* old_comppad;
4590 SV** old_curpad;
4591
c277df42 4592 while (--n >= 0) {
261faec3 4593 /* If you add a ->what type here, update the comment in regcomp.h */
c277df42
IZ
4594 switch (r->data->what[n]) {
4595 case 's':
4596 SvREFCNT_dec((SV*)r->data->data[n]);
4597 break;
653099ff
GS
4598 case 'f':
4599 Safefree(r->data->data[n]);
4600 break;
dfad63ad
HS
4601 case 'p':
4602 new_comppad = (AV*)r->data->data[n];
4603 break;
c277df42 4604 case 'o':
dfad63ad 4605 if (new_comppad == NULL)
cea2e8a9 4606 Perl_croak(aTHX_ "panic: pregfree comppad");
dfad63ad
HS
4607 old_comppad = PL_comppad;
4608 old_curpad = PL_curpad;
1e6dc0b6
SB
4609 /* Watch out for global destruction's random ordering. */
4610 if (SvTYPE(new_comppad) == SVt_PVAV) {
4611 PL_comppad = new_comppad;
4612 PL_curpad = AvARRAY(new_comppad);
4613 }
4614 else
4615 PL_curpad = NULL;
9b978d73
DM
4616
4617 if (!OpREFCNT_dec((OP_4tree*)r->data->data[n])) {
4618 op_free((OP_4tree*)r->data->data[n]);
4619 }
4620
dfad63ad
HS
4621 PL_comppad = old_comppad;
4622 PL_curpad = old_curpad;
4623 SvREFCNT_dec((SV*)new_comppad);
4624 new_comppad = NULL;
c277df42
IZ
4625 break;
4626 case 'n':
4627 break;
4628 default:
830247a4 4629 Perl_croak(aTHX_ "panic: regfree data code '%c'", r->data->what[n]);
c277df42
IZ
4630 }
4631 }
4632 Safefree(r->data->what);
4633 Safefree(r->data);
a0d0e21e
LW
4634 }
4635 Safefree(r->startp);
4636 Safefree(r->endp);
4637 Safefree(r);
a687059c 4638}
c277df42
IZ
4639
4640/*
4641 - regnext - dig the "next" pointer out of a node
4642 *
4643 * [Note, when REGALIGN is defined there are two places in regmatch()
4644 * that bypass this code for speed.]
4645 */
4646regnode *
864dbfa3 4647Perl_regnext(pTHX_ register regnode *p)
c277df42
IZ
4648{
4649 register I32 offset;
4650
3280af22 4651 if (p == &PL_regdummy)
c277df42
IZ
4652 return(NULL);
4653
4654 offset = (reg_off_by_arg[OP(p)] ? ARG(p) : NEXT_OFF(p));
4655 if (offset == 0)
4656 return(NULL);
4657
c277df42 4658 return(p+offset);
c277df42
IZ
4659}
4660
01f988be 4661STATIC void
cea2e8a9 4662S_re_croak2(pTHX_ const char* pat1,const char* pat2,...)
c277df42
IZ
4663{
4664 va_list args;
4665 STRLEN l1 = strlen(pat1);
4666 STRLEN l2 = strlen(pat2);
4667 char buf[512];
06bf62c7 4668 SV *msv;
c277df42
IZ
4669 char *message;
4670
4671 if (l1 > 510)
4672 l1 = 510;
4673 if (l1 + l2 > 510)
4674 l2 = 510 - l1;
4675 Copy(pat1, buf, l1 , char);
4676 Copy(pat2, buf + l1, l2 , char);
3b818b81
GS
4677 buf[l1 + l2] = '\n';
4678 buf[l1 + l2 + 1] = '\0';
8736538c
AS
4679#ifdef I_STDARG
4680 /* ANSI variant takes additional second argument */
c277df42 4681 va_start(args, pat2);
8736538c
AS
4682#else
4683 va_start(args);
4684#endif
5a844595 4685 msv = vmess(buf, &args);
c277df42 4686 va_end(args);
06bf62c7 4687 message = SvPV(msv,l1);
c277df42
IZ
4688 if (l1 > 512)
4689 l1 = 512;
4690 Copy(message, buf, l1 , char);
4691 buf[l1] = '\0'; /* Overwrite \n */
cea2e8a9 4692 Perl_croak(aTHX_ "%s", buf);
c277df42 4693}
a0ed51b3
LW
4694
4695/* XXX Here's a total kludge. But we need to re-enter for swash routines. */
4696
4697void
864dbfa3 4698Perl_save_re_context(pTHX)
b81d288d 4699{
830247a4
IZ
4700#if 0
4701 SAVEPPTR(RExC_precomp); /* uncompiled string. */
4702 SAVEI32(RExC_npar); /* () count. */
4703 SAVEI32(RExC_size); /* Code size. */
4704 SAVEI16(RExC_flags16); /* are we folding, multilining? */
4705 SAVEVPTR(RExC_rx); /* from regcomp.c */
4706 SAVEI32(RExC_seen); /* from regcomp.c */
4707 SAVEI32(RExC_sawback); /* Did we see \1, ...? */
4708 SAVEI32(RExC_naughty); /* How bad is this pattern? */
4709 SAVEVPTR(RExC_emit); /* Code-emit pointer; &regdummy = don't */
4710 SAVEPPTR(RExC_end); /* End of input for compile */
4711 SAVEPPTR(RExC_parse); /* Input-scan pointer. */
4712#endif
4713
4714 SAVEI32(PL_reg_flags); /* from regexec.c */
a0ed51b3 4715 SAVEPPTR(PL_bostr);
a0ed51b3
LW
4716 SAVEPPTR(PL_reginput); /* String-input pointer. */
4717 SAVEPPTR(PL_regbol); /* Beginning of input, for ^ check. */
4718 SAVEPPTR(PL_regeol); /* End of input, for $ check. */
7766f137
GS
4719 SAVEVPTR(PL_regstartp); /* Pointer to startp array. */
4720 SAVEVPTR(PL_regendp); /* Ditto for endp. */
4721 SAVEVPTR(PL_reglastparen); /* Similarly for lastparen. */
a0ed51b3 4722 SAVEPPTR(PL_regtill); /* How far we are required to go. */
b81d288d 4723 SAVEGENERICPV(PL_reg_start_tmp); /* from regexec.c */
a0ed51b3 4724 PL_reg_start_tmp = 0;
a0ed51b3
LW
4725 SAVEI32(PL_reg_start_tmpl); /* from regexec.c */
4726 PL_reg_start_tmpl = 0;
7766f137 4727 SAVEVPTR(PL_regdata);
a0ed51b3
LW
4728 SAVEI32(PL_reg_eval_set); /* from regexec.c */
4729 SAVEI32(PL_regnarrate); /* from regexec.c */
7766f137 4730 SAVEVPTR(PL_regprogram); /* from regexec.c */
a0ed51b3 4731 SAVEINT(PL_regindent); /* from regexec.c */
7766f137
GS
4732 SAVEVPTR(PL_regcc); /* from regexec.c */
4733 SAVEVPTR(PL_curcop);
7766f137
GS
4734 SAVEVPTR(PL_reg_call_cc); /* from regexec.c */
4735 SAVEVPTR(PL_reg_re); /* from regexec.c */
54b6e2fa
IZ
4736 SAVEPPTR(PL_reg_ganch); /* from regexec.c */
4737 SAVESPTR(PL_reg_sv); /* from regexec.c */
6e072df0 4738 SAVEI8(PL_reg_sv_utf8); /* from regexec.c */
7766f137 4739 SAVEVPTR(PL_reg_magic); /* from regexec.c */
54b6e2fa 4740 SAVEI32(PL_reg_oldpos); /* from regexec.c */
7766f137
GS
4741 SAVEVPTR(PL_reg_oldcurpm); /* from regexec.c */
4742 SAVEVPTR(PL_reg_curpm); /* from regexec.c */
5fb7366e 4743 SAVEI32(PL_regnpar); /* () count. */
e49a9654 4744 SAVEI32(PL_regsize); /* from regexec.c */
54b6e2fa 4745#ifdef DEBUGGING
b81d288d 4746 SAVEPPTR(PL_reg_starttry); /* from regexec.c */
54b6e2fa 4747#endif
a0ed51b3 4748}
51371543
GS
4749
4750#ifdef PERL_OBJECT
51371543
GS
4751#include "XSUB.h"
4752#undef this
4753#define this pPerl
4754#endif
4755
4756static void
4757clear_re(pTHXo_ void *r)
4758{
4759 ReREFCNT_dec((regexp *)r);
4760}
ffbc6a93 4761