This is a live mirror of the Perl 5 development currently hosted at https://github.com/perl/perl5
integrate maint-5.005 changes (except conflicting change#1794)
[perl5.git] / regcomp.c
CommitLineData
a0d0e21e
LW
1/* regcomp.c
2 */
3
4/*
5 * "A fair jaw-cracker dwarf-language must be." --Samwise Gamgee
6 */
7
a687059c
LW
8/* NOTE: this is derived from Henry Spencer's regexp code, and should not
9 * confused with the original package (see point 3 below). Thanks, Henry!
10 */
11
12/* Additional note: this code is very heavily munged from Henry's version
13 * in places. In some spots I've traded clarity for efficiency, so don't
14 * blame Henry for some of the lack of readability.
15 */
16
e50aee73
AD
17/* The names of the functions have been changed from regcomp and
18 * regexec to pregcomp and pregexec in order to avoid conflicts
19 * with the POSIX routines of the same names.
20*/
21
b9d5759e
AD
22#ifdef PERL_EXT_RE_BUILD
23/* need to replace pregcomp et al, so enable that */
24# ifndef PERL_IN_XSUB_RE
25# define PERL_IN_XSUB_RE
26# endif
27/* need access to debugger hooks */
28# ifndef DEBUGGING
29# define DEBUGGING
30# endif
31#endif
32
33#ifdef PERL_IN_XSUB_RE
d06ea78c 34/* We *really* need to overwrite these symbols: */
56953603
IZ
35# define Perl_pregcomp my_regcomp
36# define Perl_regdump my_regdump
37# define Perl_regprop my_regprop
d06ea78c
GS
38/* *These* symbols are masked to allow static link. */
39# define Perl_pregfree my_regfree
40# define Perl_regnext my_regnext
fe2c2566 41# define save_re_context my_save_re_context
56953603
IZ
42#endif
43
f0fcb552 44/*SUPPRESS 112*/
a687059c 45/*
e50aee73 46 * pregcomp and pregexec -- regsub and regerror are not used in perl
a687059c
LW
47 *
48 * Copyright (c) 1986 by University of Toronto.
49 * Written by Henry Spencer. Not derived from licensed software.
50 *
51 * Permission is granted to anyone to use this software for any
52 * purpose on any computer system, and to redistribute it freely,
53 * subject to the following restrictions:
54 *
55 * 1. The author is not responsible for the consequences of use of
56 * this software, no matter how awful, even if they arise
57 * from defects in it.
58 *
59 * 2. The origin of this software must not be misrepresented, either
60 * by explicit claim or by omission.
61 *
62 * 3. Altered versions must be plainly marked as such, and must not
63 * be misrepresented as being the original software.
64 *
65 *
66 **** Alterations to Henry's code are...
67 ****
a0ed51b3 68 **** Copyright (c) 1991-1998, Larry Wall
a687059c 69 ****
9ef589d8
LW
70 **** You may distribute under the terms of either the GNU General Public
71 **** License or the Artistic License, as specified in the README file.
72
a687059c
LW
73 *
74 * Beware that some of this code is subtly aware of the way operator
75 * precedence is structured in regular expressions. Serious changes in
76 * regular-expression syntax might require a total rethink.
77 */
78#include "EXTERN.h"
79#include "perl.h"
d06ea78c 80
b9d5759e 81#ifndef PERL_IN_XSUB_RE
d06ea78c
GS
82# include "INTERN.h"
83#endif
c277df42
IZ
84
85#define REG_COMP_C
a687059c
LW
86#include "regcomp.h"
87
d4cce5f1 88#ifdef op
11343788 89#undef op
d4cce5f1 90#endif /* op */
11343788 91
fe14fcc3
LW
92#ifdef MSDOS
93# if defined(BUGGY_MSC6)
94 /* MSC 6.00A breaks on op/regexp.t test 85 unless we turn this off */
95 # pragma optimize("a",off)
96 /* But MSC 6.00A is happy with 'w', for aliases only across function calls*/
97 # pragma optimize("w",on )
98# endif /* BUGGY_MSC6 */
99#endif /* MSDOS */
100
a687059c
LW
101#ifndef STATIC
102#define STATIC static
103#endif
104
105#define ISMULT1(c) ((c) == '*' || (c) == '+' || (c) == '?')
106#define ISMULT2(s) ((*s) == '*' || (*s) == '+' || (*s) == '?' || \
107 ((*s) == '{' && regcurly(s)))
2b69d0c2
LW
108#ifdef atarist
109#define PERL_META "^$.[()|?+*\\"
110#else
a687059c 111#define META "^$.[()|?+*\\"
2b69d0c2 112#endif
a687059c 113
35c8bce7
LW
114#ifdef SPSTART
115#undef SPSTART /* dratted cpp namespace... */
116#endif
a687059c
LW
117/*
118 * Flags to be passed up and down.
119 */
a687059c 120#define WORST 0 /* Worst case. */
821b33a5 121#define HASWIDTH 0x1 /* Known to match non-null strings. */
a0d0e21e
LW
122#define SIMPLE 0x2 /* Simple enough to be STAR/PLUS operand. */
123#define SPSTART 0x4 /* Starts with * or +. */
124#define TRYAGAIN 0x8 /* Weeded out a declaration. */
a687059c
LW
125
126/*
e50aee73 127 * Forward declarations for pregcomp()'s friends.
a687059c 128 */
a0d0e21e 129
76e3520e 130#ifndef PERL_OBJECT
c277df42
IZ
131static regnode *reg _((I32, I32 *));
132static regnode *reganode _((U8, U32));
133static regnode *regatom _((I32 *));
134static regnode *regbranch _((I32 *, I32));
135static void regc _((U8, char *));
a0ed51b3 136static void reguni _((UV, char *, I32*));
c277df42 137static regnode *regclass _((void));
a0ed51b3 138static regnode *regclassutf8 _((void));
a0d0e21e 139STATIC I32 regcurly _((char *));
c277df42
IZ
140static regnode *reg_node _((U8));
141static regnode *regpiece _((I32 *));
142static void reginsert _((U8, regnode *));
143static void regoptail _((regnode *, regnode *));
c277df42 144static void regtail _((regnode *, regnode *));
873ef191 145static char* regwhite _((char *, char *));
a0d0e21e 146static char* nextchar _((void));
3bd495df 147static void re_croak2 _((const char* pat1,const char* pat2,...)) __attribute__((noreturn));
76e3520e 148#endif
a687059c 149
c277df42
IZ
150/* Length of a variant. */
151
76e3520e 152#ifndef PERL_OBJECT
c277df42
IZ
153typedef struct {
154 I32 len_min;
155 I32 len_delta;
a0ed51b3
LW
156 I32 pos_min; /* CC */
157 I32 pos_delta; /* CC */
c277df42
IZ
158 SV *last_found;
159 I32 last_end; /* min value, <0 unless valid. */
a0ed51b3
LW
160 I32 last_start_min; /* CC */
161 I32 last_start_max; /* CC */
c277df42
IZ
162 SV **longest; /* Either &l_fixed, or &l_float. */
163 SV *longest_fixed;
a0ed51b3 164 I32 offset_fixed; /* CC */
c277df42 165 SV *longest_float;
a0ed51b3
LW
166 I32 offset_float_min; /* CC */
167 I32 offset_float_max; /* CC */
c277df42
IZ
168 I32 flags;
169} scan_data_t;
76e3520e 170#endif
c277df42
IZ
171
172static scan_data_t zero_scan_data = { 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 };
173
174#define SF_BEFORE_EOL (SF_BEFORE_SEOL|SF_BEFORE_MEOL)
175#define SF_BEFORE_SEOL 0x1
176#define SF_BEFORE_MEOL 0x2
177#define SF_FIX_BEFORE_EOL (SF_FIX_BEFORE_SEOL|SF_FIX_BEFORE_MEOL)
178#define SF_FL_BEFORE_EOL (SF_FL_BEFORE_SEOL|SF_FL_BEFORE_MEOL)
179
09b7f37c
CB
180#ifdef NO_UNARY_PLUS
181# define SF_FIX_SHIFT_EOL (0+2)
182# define SF_FL_SHIFT_EOL (0+4)
183#else
184# define SF_FIX_SHIFT_EOL (+2)
185# define SF_FL_SHIFT_EOL (+4)
186#endif
c277df42
IZ
187
188#define SF_FIX_BEFORE_SEOL (SF_BEFORE_SEOL << SF_FIX_SHIFT_EOL)
189#define SF_FIX_BEFORE_MEOL (SF_BEFORE_MEOL << SF_FIX_SHIFT_EOL)
190
191#define SF_FL_BEFORE_SEOL (SF_BEFORE_SEOL << SF_FL_SHIFT_EOL)
192#define SF_FL_BEFORE_MEOL (SF_BEFORE_MEOL << SF_FL_SHIFT_EOL) /* 0x20 */
193#define SF_IS_INF 0x40
194#define SF_HAS_PAR 0x80
195#define SF_IN_PAR 0x100
196#define SF_HAS_EVAL 0x200
4bfe0158 197#define SCF_DO_SUBSTR 0x400
c277df42 198
a0ed51b3
LW
199#define RF_utf8 8
200#define UTF (PL_reg_flags & RF_utf8)
201#define LOC (PL_regflags & PMf_LOCALE)
202#define FOLD (PL_regflags & PMf_FOLD)
203
204#define CHR_SVLEN(sv) (UTF ? sv_len_utf8(sv) : SvCUR(sv))
205#define CHR_DIST(a,b) (UTF ? utf8_distance(a,b) : a - b)
206
76e3520e 207STATIC void
c277df42
IZ
208scan_commit(scan_data_t *data)
209{
c485e607 210 dTHR;
a0ed51b3
LW
211 STRLEN l = CHR_SVLEN(data->last_found);
212 STRLEN old_l = CHR_SVLEN(*data->longest);
c277df42
IZ
213
214 if ((l >= old_l) && ((l > old_l) || (data->flags & SF_BEFORE_EOL))) {
215 sv_setsv(*data->longest, data->last_found);
216 if (*data->longest == data->longest_fixed) {
217 data->offset_fixed = l ? data->last_start_min : data->pos_min;
218 if (data->flags & SF_BEFORE_EOL)
219 data->flags
220 |= ((data->flags & SF_BEFORE_EOL) << SF_FIX_SHIFT_EOL);
221 else
222 data->flags &= ~SF_FIX_BEFORE_EOL;
a0ed51b3
LW
223 }
224 else {
c277df42
IZ
225 data->offset_float_min = l ? data->last_start_min : data->pos_min;
226 data->offset_float_max = (l
227 ? data->last_start_max
228 : data->pos_min + data->pos_delta);
229 if (data->flags & SF_BEFORE_EOL)
230 data->flags
231 |= ((data->flags & SF_BEFORE_EOL) << SF_FL_SHIFT_EOL);
232 else
233 data->flags &= ~SF_FL_BEFORE_EOL;
234 }
235 }
236 SvCUR_set(data->last_found, 0);
237 data->last_end = -1;
238 data->flags &= ~SF_BEFORE_EOL;
239}
240
c277df42
IZ
241/* Stops at toplevel WHILEM as well as at `last'. At end *scanp is set
242 to the position after last scanned or to NULL. */
243
76e3520e 244STATIC I32
c277df42
IZ
245study_chunk(regnode **scanp, I32 *deltap, regnode *last, scan_data_t *data, U32 flags)
246 /* scanp: Start here (read-write). */
247 /* deltap: Write maxlen-minlen here. */
248 /* last: Stop before this one. */
249{
5c0ca799 250 dTHR;
c277df42
IZ
251 I32 min = 0, pars = 0, code;
252 regnode *scan = *scanp, *next;
253 I32 delta = 0;
254 int is_inf = (flags & SCF_DO_SUBSTR) && (data->flags & SF_IS_INF);
255 I32 is_par = OP(scan) == OPEN ? ARG(scan) : 0;
256 scan_data_t data_fake;
257
258 while (scan && OP(scan) != END && scan < last) {
259 /* Peephole optimizer: */
260
261 if (regkind[(U8)OP(scan)] == EXACT) {
262 regnode *n = regnext(scan);
263 U32 stringok = 1;
264#ifdef DEBUGGING
265 regnode *stop = scan;
266#endif
267
268 next = scan + (*OPERAND(scan) + 2 - 1)/sizeof(regnode) + 2;
269 /* Skip NOTHING, merge EXACT*. */
270 while (n &&
271 ( regkind[(U8)OP(n)] == NOTHING ||
272 (stringok && (OP(n) == OP(scan))))
273 && NEXT_OFF(n)
274 && NEXT_OFF(scan) + NEXT_OFF(n) < I16_MAX) {
275 if (OP(n) == TAIL || n > next)
276 stringok = 0;
277 if (regkind[(U8)OP(n)] == NOTHING) {
278 NEXT_OFF(scan) += NEXT_OFF(n);
279 next = n + NODE_STEP_REGNODE;
280#ifdef DEBUGGING
281 if (stringok)
282 stop = n;
283#endif
284 n = regnext(n);
a0ed51b3
LW
285 }
286 else {
c277df42
IZ
287 int oldl = *OPERAND(scan);
288 regnode *nnext = regnext(n);
289
290 if (oldl + *OPERAND(n) > U8_MAX)
291 break;
292 NEXT_OFF(scan) += NEXT_OFF(n);
293 *OPERAND(scan) += *OPERAND(n);
294 next = n + (*OPERAND(n) + 2 - 1)/sizeof(regnode) + 2;
295 /* Now we can overwrite *n : */
296 Move(OPERAND(n) + 1, OPERAND(scan) + oldl + 1,
297 *OPERAND(n) + 1, char);
298#ifdef DEBUGGING
299 if (stringok)
300 stop = next - 1;
301#endif
302 n = nnext;
303 }
304 }
305#ifdef DEBUGGING
306 /* Allow dumping */
307 n = scan + (*OPERAND(scan) + 2 - 1)/sizeof(regnode) + 2;
308 while (n <= stop) {
ca04da08
GS
309 /* Purify reports a benign UMR here sometimes, because we
310 * don't initialize the OP() slot of a node when that node
311 * is occupied by just the trailing null of the string in
312 * an EXACT node */
c277df42
IZ
313 if (regkind[(U8)OP(n)] != NOTHING || OP(n) == NOTHING) {
314 OP(n) = OPTIMIZED;
315 NEXT_OFF(n) = 0;
316 }
317 n++;
318 }
319#endif
320
321 }
322 if (OP(scan) != CURLYX) {
048cfca1
GS
323 int max = (reg_off_by_arg[OP(scan)]
324 ? I32_MAX
325 /* I32 may be smaller than U16 on CRAYs! */
326 : (I32_MAX < U16_MAX ? I32_MAX : U16_MAX));
c277df42
IZ
327 int off = (reg_off_by_arg[OP(scan)] ? ARG(scan) : NEXT_OFF(scan));
328 int noff;
329 regnode *n = scan;
330
331 /* Skip NOTHING and LONGJMP. */
332 while ((n = regnext(n))
333 && ((regkind[(U8)OP(n)] == NOTHING && (noff = NEXT_OFF(n)))
334 || ((OP(n) == LONGJMP) && (noff = ARG(n))))
335 && off + noff < max)
336 off += noff;
337 if (reg_off_by_arg[OP(scan)])
338 ARG(scan) = off;
339 else
340 NEXT_OFF(scan) = off;
341 }
342 if (OP(scan) == BRANCH || OP(scan) == BRANCHJ
343 || OP(scan) == IFTHEN || OP(scan) == SUSPEND) {
344 next = regnext(scan);
345 code = OP(scan);
346
347 if (OP(next) == code || code == IFTHEN || code == SUSPEND) {
348 I32 max1 = 0, min1 = I32_MAX, num = 0;
349
350 if (flags & SCF_DO_SUBSTR)
351 scan_commit(data);
352 while (OP(scan) == code) {
353 I32 deltanext, minnext;
354
355 num++;
356 data_fake.flags = 0;
357 next = regnext(scan);
358 scan = NEXTOPER(scan);
359 if (code != BRANCH)
360 scan = NEXTOPER(scan);
361 /* We suppose the run is continuous, last=next...*/
362 minnext = study_chunk(&scan, &deltanext, next,
363 &data_fake, 0);
364 if (min1 > minnext)
365 min1 = minnext;
366 if (max1 < minnext + deltanext)
367 max1 = minnext + deltanext;
368 if (deltanext == I32_MAX)
369 is_inf = 1;
370 scan = next;
371 if (data_fake.flags & (SF_HAS_PAR|SF_IN_PAR))
372 pars++;
405ff068 373 if (data && (data_fake.flags & SF_HAS_EVAL))
c277df42
IZ
374 data->flags |= SF_HAS_EVAL;
375 if (code == SUSPEND)
376 break;
377 }
378 if (code == IFTHEN && num < 2) /* Empty ELSE branch */
379 min1 = 0;
380 if (flags & SCF_DO_SUBSTR) {
381 data->pos_min += min1;
382 data->pos_delta += max1 - min1;
383 if (max1 != min1 || is_inf)
384 data->longest = &(data->longest_float);
385 }
386 min += min1;
387 delta += max1 - min1;
a0ed51b3
LW
388 }
389 else if (code == BRANCHJ) /* single branch is optimized. */
c277df42
IZ
390 scan = NEXTOPER(NEXTOPER(scan));
391 else /* single branch is optimized. */
392 scan = NEXTOPER(scan);
393 continue;
a0ed51b3
LW
394 }
395 else if (OP(scan) == EXACT) {
396 I32 l = *OPERAND(scan);
397 if (UTF) {
398 unsigned char *s = (unsigned char *)(OPERAND(scan)+1);
399 unsigned char *e = s + l;
400 I32 newl = 0;
401 while (s < e) {
402 newl++;
403 s += UTF8SKIP(s);
404 }
405 l = newl;
406 }
407 min += l;
c277df42 408 if (flags & SCF_DO_SUBSTR) { /* Update longest substr. */
c277df42
IZ
409 /* The code below prefers earlier match for fixed
410 offset, later match for variable offset. */
411 if (data->last_end == -1) { /* Update the start info. */
412 data->last_start_min = data->pos_min;
413 data->last_start_max = is_inf
414 ? I32_MAX : data->pos_min + data->pos_delta;
415 }
a0ed51b3 416 sv_catpvn(data->last_found, (char *)(OPERAND(scan)+1), *OPERAND(scan));
c277df42
IZ
417 data->last_end = data->pos_min + l;
418 data->pos_min += l; /* As in the first entry. */
419 data->flags &= ~SF_BEFORE_EOL;
420 }
a0ed51b3
LW
421 }
422 else if (regkind[(U8)OP(scan)] == EXACT) {
423 I32 l = *OPERAND(scan);
c277df42
IZ
424 if (flags & SCF_DO_SUBSTR)
425 scan_commit(data);
a0ed51b3
LW
426 if (UTF) {
427 unsigned char *s = (unsigned char *)(OPERAND(scan)+1);
428 unsigned char *e = s + l;
429 I32 newl = 0;
430 while (s < e) {
431 newl++;
432 s += UTF8SKIP(s);
433 }
434 l = newl;
435 }
436 min += l;
c277df42 437 if (data && (flags & SCF_DO_SUBSTR))
a0ed51b3
LW
438 data->pos_min += l;
439 }
440 else if (strchr(varies,OP(scan))) {
c277df42
IZ
441 I32 mincount, maxcount, minnext, deltanext, pos_before, fl;
442 regnode *oscan = scan;
443
444 switch (regkind[(U8)OP(scan)]) {
445 case WHILEM:
446 scan = NEXTOPER(scan);
447 goto finish;
448 case PLUS:
449 if (flags & SCF_DO_SUBSTR) {
450 next = NEXTOPER(scan);
451 if (OP(next) == EXACT) {
452 mincount = 1;
453 maxcount = REG_INFTY;
454 next = regnext(scan);
455 scan = NEXTOPER(scan);
456 goto do_curly;
457 }
458 }
459 if (flags & SCF_DO_SUBSTR)
460 data->pos_min++;
461 min++;
462 /* Fall through. */
463 case STAR:
464 is_inf = 1;
465 scan = regnext(scan);
466 if (flags & SCF_DO_SUBSTR) {
467 scan_commit(data);
468 data->longest = &(data->longest_float);
469 }
470 goto optimize_curly_tail;
471 case CURLY:
472 mincount = ARG1(scan);
473 maxcount = ARG2(scan);
474 next = regnext(scan);
475 scan = NEXTOPER(scan) + EXTRA_STEP_2ARGS;
476 do_curly:
477 if (flags & SCF_DO_SUBSTR) {
478 if (mincount == 0) scan_commit(data);
479 pos_before = data->pos_min;
480 }
481 if (data) {
482 fl = data->flags;
483 data->flags &= ~(SF_HAS_PAR|SF_IN_PAR|SF_HAS_EVAL);
484 if (is_inf)
485 data->flags |= SF_IS_INF;
486 }
487 /* This will finish on WHILEM, setting scan, or on NULL: */
488 minnext = study_chunk(&scan, &deltanext, last, data,
489 mincount == 0
490 ? (flags & ~SCF_DO_SUBSTR) : flags);
491 if (!scan) /* It was not CURLYX, but CURLY. */
492 scan = next;
599cee73 493 if (ckWARN(WARN_UNSAFE) && (minnext + deltanext == 0)
821b33a5
IZ
494 && !(data->flags & (SF_HAS_PAR|SF_IN_PAR))
495 && maxcount <= 10000) /* Complement check for big count */
599cee73 496 warner(WARN_UNSAFE, "Strange *+?{} on zero-length expression");
c277df42
IZ
497 min += minnext * mincount;
498 is_inf |= (maxcount == REG_INFTY && (minnext + deltanext) > 0
499 || deltanext == I32_MAX);
500 delta += (minnext + deltanext) * maxcount - minnext * mincount;
501
502 /* Try powerful optimization CURLYX => CURLYN. */
c277df42
IZ
503 if ( OP(oscan) == CURLYX && data
504 && data->flags & SF_IN_PAR
505 && !(data->flags & SF_HAS_EVAL)
506 && !deltanext && minnext == 1 ) {
507 /* Try to optimize to CURLYN. */
508 regnode *nxt = NEXTOPER(oscan) + EXTRA_STEP_2ARGS;
509 regnode *nxt1 = nxt, *nxt2;
510
511 /* Skip open. */
512 nxt = regnext(nxt);
513 if (!strchr(simple,OP(nxt))
514 && !(regkind[(U8)OP(nxt)] == EXACT
515 && *OPERAND(nxt) == 1))
516 goto nogo;
517 nxt2 = nxt;
518 nxt = regnext(nxt);
519 if (OP(nxt) != CLOSE)
520 goto nogo;
521 /* Now we know that nxt2 is the only contents: */
522 oscan->flags = ARG(nxt);
523 OP(oscan) = CURLYN;
524 OP(nxt1) = NOTHING; /* was OPEN. */
525#ifdef DEBUGGING
526 OP(nxt1 + 1) = OPTIMIZED; /* was count. */
527 NEXT_OFF(nxt1+ 1) = 0; /* just for consistancy. */
528 NEXT_OFF(nxt2) = 0; /* just for consistancy with CURLY. */
529 OP(nxt) = OPTIMIZED; /* was CLOSE. */
530 OP(nxt + 1) = OPTIMIZED; /* was count. */
531 NEXT_OFF(nxt+ 1) = 0; /* just for consistancy. */
532#endif
533 }
c277df42
IZ
534 nogo:
535
536 /* Try optimization CURLYX => CURLYM. */
537 if ( OP(oscan) == CURLYX && data
c277df42 538 && !(data->flags & SF_HAS_PAR)
c277df42
IZ
539 && !(data->flags & SF_HAS_EVAL)
540 && !deltanext ) {
541 /* XXXX How to optimize if data == 0? */
542 /* Optimize to a simpler form. */
543 regnode *nxt = NEXTOPER(oscan) + EXTRA_STEP_2ARGS; /* OPEN */
544 regnode *nxt2;
545
546 OP(oscan) = CURLYM;
547 while ( (nxt2 = regnext(nxt)) /* skip over embedded stuff*/
548 && (OP(nxt2) != WHILEM))
549 nxt = nxt2;
550 OP(nxt2) = SUCCEED; /* Whas WHILEM */
c277df42
IZ
551 /* Need to optimize away parenths. */
552 if (data->flags & SF_IN_PAR) {
553 /* Set the parenth number. */
554 regnode *nxt1 = NEXTOPER(oscan) + EXTRA_STEP_2ARGS; /* OPEN*/
555
556 if (OP(nxt) != CLOSE)
557 FAIL("panic opt close");
558 oscan->flags = ARG(nxt);
559 OP(nxt1) = OPTIMIZED; /* was OPEN. */
560 OP(nxt) = OPTIMIZED; /* was CLOSE. */
561#ifdef DEBUGGING
562 OP(nxt1 + 1) = OPTIMIZED; /* was count. */
563 OP(nxt + 1) = OPTIMIZED; /* was count. */
564 NEXT_OFF(nxt1 + 1) = 0; /* just for consistancy. */
565 NEXT_OFF(nxt + 1) = 0; /* just for consistancy. */
566#endif
567#if 0
568 while ( nxt1 && (OP(nxt1) != WHILEM)) {
569 regnode *nnxt = regnext(nxt1);
570
571 if (nnxt == nxt) {
572 if (reg_off_by_arg[OP(nxt1)])
573 ARG_SET(nxt1, nxt2 - nxt1);
574 else if (nxt2 - nxt1 < U16_MAX)
575 NEXT_OFF(nxt1) = nxt2 - nxt1;
576 else
577 OP(nxt) = NOTHING; /* Cannot beautify */
578 }
579 nxt1 = nnxt;
580 }
581#endif
582 /* Optimize again: */
583 study_chunk(&nxt1, &deltanext, nxt, NULL, 0);
a0ed51b3
LW
584 }
585 else
c277df42 586 oscan->flags = 0;
c277df42
IZ
587 }
588 if (data && fl & (SF_HAS_PAR|SF_IN_PAR))
589 pars++;
590 if (flags & SCF_DO_SUBSTR) {
591 SV *last_str = Nullsv;
592 int counted = mincount != 0;
593
594 if (data->last_end > 0 && mincount != 0) { /* Ends with a string. */
595 I32 b = pos_before >= data->last_start_min
596 ? pos_before : data->last_start_min;
597 STRLEN l;
598 char *s = SvPV(data->last_found, l);
a0ed51b3
LW
599 I32 old = b - data->last_start_min;
600
601 if (UTF)
602 old = utf8_hop((U8*)s, old) - (U8*)s;
c277df42 603
a0ed51b3 604 l -= old;
c277df42 605 /* Get the added string: */
a0ed51b3 606 last_str = newSVpv(s + old, l);
c277df42
IZ
607 if (deltanext == 0 && pos_before == b) {
608 /* What was added is a constant string */
609 if (mincount > 1) {
610 SvGROW(last_str, (mincount * l) + 1);
611 repeatcpy(SvPVX(last_str) + l,
612 SvPVX(last_str), l, mincount - 1);
613 SvCUR(last_str) *= mincount;
614 /* Add additional parts. */
615 SvCUR_set(data->last_found,
616 SvCUR(data->last_found) - l);
617 sv_catsv(data->last_found, last_str);
618 data->last_end += l * (mincount - 1);
619 }
620 }
621 }
622 /* It is counted once already... */
623 data->pos_min += minnext * (mincount - counted);
624 data->pos_delta += - counted * deltanext +
625 (minnext + deltanext) * maxcount - minnext * mincount;
626 if (mincount != maxcount) {
627 scan_commit(data);
628 if (mincount && last_str) {
629 sv_setsv(data->last_found, last_str);
630 data->last_end = data->pos_min;
631 data->last_start_min =
a0ed51b3 632 data->pos_min - CHR_SVLEN(last_str);
c277df42
IZ
633 data->last_start_max = is_inf
634 ? I32_MAX
635 : data->pos_min + data->pos_delta
a0ed51b3 636 - CHR_SVLEN(last_str);
c277df42
IZ
637 }
638 data->longest = &(data->longest_float);
639 }
640 }
405ff068 641 if (data && (fl & SF_HAS_EVAL))
c277df42
IZ
642 data->flags |= SF_HAS_EVAL;
643 optimize_curly_tail:
c277df42
IZ
644 if (OP(oscan) != CURLYX) {
645 while (regkind[(U8)OP(next = regnext(oscan))] == NOTHING
646 && NEXT_OFF(next))
647 NEXT_OFF(oscan) += NEXT_OFF(next);
648 }
c277df42
IZ
649 continue;
650 default: /* REF only? */
651 if (flags & SCF_DO_SUBSTR) {
652 scan_commit(data);
653 data->longest = &(data->longest_float);
654 }
655 is_inf = 1;
656 break;
657 }
a0ed51b3
LW
658 }
659 else if (strchr(simple,OP(scan)) || regkind[(U8)OP(scan)] == ANYUTF8) {
c277df42
IZ
660 if (flags & SCF_DO_SUBSTR) {
661 scan_commit(data);
662 data->pos_min++;
663 }
664 min++;
a0ed51b3
LW
665 }
666 else if (regkind[(U8)OP(scan)] == EOL && flags & SCF_DO_SUBSTR) {
c277df42
IZ
667 data->flags |= (OP(scan) == MEOL
668 ? SF_BEFORE_MEOL
669 : SF_BEFORE_SEOL);
a0ed51b3
LW
670 }
671 else if (regkind[(U8)OP(scan)] == BRANCHJ
c277df42
IZ
672 && (scan->flags || data)
673 && (OP(scan) == IFMATCH || OP(scan) == UNLESSM)) {
674 I32 deltanext, minnext;
675 regnode *nscan;
676
677 data_fake.flags = 0;
678 next = regnext(scan);
679 nscan = NEXTOPER(NEXTOPER(scan));
680 minnext = study_chunk(&nscan, &deltanext, last, &data_fake, 0);
681 if (scan->flags) {
682 if (deltanext) {
683 FAIL("variable length lookbehind not implemented");
a0ed51b3
LW
684 }
685 else if (minnext > U8_MAX) {
c277df42
IZ
686 FAIL2("lookbehind longer than %d not implemented", U8_MAX);
687 }
688 scan->flags = minnext;
689 }
690 if (data && data_fake.flags & (SF_HAS_PAR|SF_IN_PAR))
691 pars++;
405ff068 692 if (data && (data_fake.flags & SF_HAS_EVAL))
c277df42 693 data->flags |= SF_HAS_EVAL;
a0ed51b3
LW
694 }
695 else if (OP(scan) == OPEN) {
c277df42 696 pars++;
a0ed51b3
LW
697 }
698 else if (OP(scan) == CLOSE && ARG(scan) == is_par) {
c277df42
IZ
699 next = regnext(scan);
700
701 if ( next && (OP(next) != WHILEM) && next < last)
c277df42 702 is_par = 0; /* Disable optimization */
a0ed51b3
LW
703 }
704 else if (OP(scan) == EVAL) {
c277df42
IZ
705 if (data)
706 data->flags |= SF_HAS_EVAL;
707 }
708 /* Else: zero-length, ignore. */
709 scan = regnext(scan);
710 }
711
712 finish:
713 *scanp = scan;
714 *deltap = is_inf ? I32_MAX : delta;
715 if (flags & SCF_DO_SUBSTR && is_inf)
716 data->pos_delta = I32_MAX - data->pos_min;
717 if (is_par > U8_MAX)
718 is_par = 0;
719 if (is_par && pars==1 && data) {
720 data->flags |= SF_IN_PAR;
721 data->flags &= ~SF_HAS_PAR;
a0ed51b3
LW
722 }
723 else if (pars && data) {
c277df42
IZ
724 data->flags |= SF_HAS_PAR;
725 data->flags &= ~SF_IN_PAR;
726 }
727 return min;
728}
729
76e3520e 730STATIC I32
c277df42
IZ
731add_data(I32 n, char *s)
732{
5c0ca799 733 dTHR;
3280af22
NIS
734 if (PL_regcomp_rx->data) {
735 Renewc(PL_regcomp_rx->data,
736 sizeof(*PL_regcomp_rx->data) + sizeof(void*) * (PL_regcomp_rx->data->count + n - 1),
c277df42 737 char, struct reg_data);
3280af22
NIS
738 Renew(PL_regcomp_rx->data->what, PL_regcomp_rx->data->count + n, U8);
739 PL_regcomp_rx->data->count += n;
a0ed51b3
LW
740 }
741 else {
3280af22 742 Newc(1207, PL_regcomp_rx->data, sizeof(*PL_regcomp_rx->data) + sizeof(void*) * (n - 1),
c277df42 743 char, struct reg_data);
3280af22
NIS
744 New(1208, PL_regcomp_rx->data->what, n, U8);
745 PL_regcomp_rx->data->count = n;
c277df42 746 }
3280af22
NIS
747 Copy(s, PL_regcomp_rx->data->what + PL_regcomp_rx->data->count - n, n, U8);
748 return PL_regcomp_rx->data->count - n;
c277df42
IZ
749}
750
a687059c 751/*
e50aee73 752 - pregcomp - compile a regular expression into internal code
a687059c
LW
753 *
754 * We can't allocate space until we know how big the compiled form will be,
755 * but we can't compile it (and thus know how big it is) until we've got a
756 * place to put the code. So we cheat: we compile it twice, once with code
757 * generation turned off and size counting turned on, and once "for real".
758 * This also means that we don't allocate space until we are sure that the
759 * thing really will compile successfully, and we never have to move the
760 * code and thus invalidate pointers into it. (Note that it has to be in
761 * one piece because free() must be able to free it all.) [NB: not true in perl]
762 *
763 * Beware that the optimization-preparation code in here knows about some
764 * of the structure of the compiled regexp. [I'll say.]
765 */
766regexp *
8ac85365 767pregcomp(char *exp, char *xend, PMOP *pm)
a687059c 768{
5c0ca799 769 dTHR;
a0d0e21e 770 register regexp *r;
c277df42
IZ
771 regnode *scan;
772 SV **longest;
773 SV *longest_fixed;
774 SV *longest_float;
775 regnode *first;
a0d0e21e 776 I32 flags;
a0d0e21e
LW
777 I32 minlen = 0;
778 I32 sawplus = 0;
779 I32 sawopen = 0;
780
781 if (exp == NULL)
c277df42 782 FAIL("NULL regexp argument");
a0d0e21e 783
a0ed51b3
LW
784 if (PL_curcop == &compiling ? (PL_hints & HINT_UTF8) : IN_UTF8)
785 PL_reg_flags |= RF_utf8;
786 else
787 PL_reg_flags = 0;
788
3280af22 789 PL_regprecomp = savepvn(exp, xend - exp);
c277df42 790 DEBUG_r(PerlIO_printf(Perl_debug_log, "compiling RE `%*s'\n",
3280af22
NIS
791 xend - exp, PL_regprecomp));
792 PL_regflags = pm->op_pmflags;
793 PL_regsawback = 0;
bbce6d69 794
3280af22
NIS
795 PL_regseen = 0;
796 PL_seen_zerolen = *exp == '^' ? -1 : 0;
797 PL_seen_evals = 0;
798 PL_extralen = 0;
c277df42 799
bbce6d69 800 /* First pass: determine size, legality. */
3280af22
NIS
801 PL_regcomp_parse = exp;
802 PL_regxend = xend;
803 PL_regnaughty = 0;
804 PL_regnpar = 1;
805 PL_regsize = 0L;
806 PL_regcode = &PL_regdummy;
807 regc((U8)MAGIC, (char*)PL_regcode);
a0d0e21e 808 if (reg(0, &flags) == NULL) {
3280af22
NIS
809 Safefree(PL_regprecomp);
810 PL_regprecomp = Nullch;
a0d0e21e
LW
811 return(NULL);
812 }
3280af22 813 DEBUG_r(PerlIO_printf(Perl_debug_log, "size %d ", PL_regsize));
c277df42
IZ
814
815 DEBUG_r(
3280af22 816 if (!PL_colorset) {
c277df42 817 int i = 0;
76e3520e 818 char *s = PerlEnv_getenv("TERMCAP_COLORS");
c277df42 819
3280af22 820 PL_colorset = 1;
c277df42 821 if (s) {
3280af22 822 PL_colors[0] = s = savepv(s);
c277df42
IZ
823 while (++i < 4) {
824 s = strchr(s, '\t');
825 if (!s)
826 FAIL("Not enough TABs in TERMCAP_COLORS");
827 *s = '\0';
3280af22 828 PL_colors[i] = ++s;
c277df42 829 }
a0ed51b3
LW
830 }
831 else {
c277df42 832 while (i < 4)
3280af22 833 PL_colors[i++] = "";
c277df42
IZ
834 }
835 /* Reset colors: */
836 PerlIO_printf(Perl_debug_log, "%s%s%s%s",
3280af22 837 PL_colors[0],PL_colors[1],PL_colors[2],PL_colors[3]);
c277df42
IZ
838 }
839 );
a0d0e21e 840
c277df42
IZ
841 /* Small enough for pointer-storage convention?
842 If extralen==0, this means that we will not need long jumps. */
3280af22
NIS
843 if (PL_regsize >= 0x10000L && PL_extralen)
844 PL_regsize += PL_extralen;
c277df42 845 else
3280af22 846 PL_extralen = 0;
a0d0e21e 847
bbce6d69 848 /* Allocate space and initialize. */
3280af22 849 Newc(1001, r, sizeof(regexp) + (unsigned)PL_regsize * sizeof(regnode),
c277df42 850 char, regexp);
a0d0e21e
LW
851 if (r == NULL)
852 FAIL("regexp out of space");
c277df42 853 r->refcnt = 1;
bbce6d69 854 r->prelen = xend - exp;
3280af22 855 r->precomp = PL_regprecomp;
a0d0e21e 856 r->subbeg = r->subbase = NULL;
3280af22
NIS
857 r->nparens = PL_regnpar - 1; /* set early to validate backrefs */
858 PL_regcomp_rx = r;
bbce6d69 859
860 /* Second pass: emit code. */
3280af22
NIS
861 PL_regcomp_parse = exp;
862 PL_regxend = xend;
863 PL_regnaughty = 0;
864 PL_regnpar = 1;
865 PL_regcode = r->program;
2cd61cdb 866 /* Store the count of eval-groups for security checks: */
3280af22
NIS
867 PL_regcode->next_off = ((PL_seen_evals > U16_MAX) ? U16_MAX : PL_seen_evals);
868 regc((U8)MAGIC, (char*) PL_regcode++);
c277df42 869 r->data = 0;
a0d0e21e
LW
870 if (reg(0, &flags) == NULL)
871 return(NULL);
872
873 /* Dig out information for optimizations. */
8782bef2 874 r->reganch = pm->op_pmflags & PMf_COMPILETIME;
3280af22 875 pm->op_pmflags = PL_regflags;
a0ed51b3
LW
876 if (UTF)
877 r->reganch |= ROPT_UTF8;
c277df42 878 r->regstclass = NULL;
a0ed51b3
LW
879 if (PL_regnaughty >= 10) /* Probably an expensive pattern. */
880 r->reganch |= ROPT_NAUGHTY;
c277df42 881 scan = r->program + 1; /* First BRANCH. */
2779dcf1
IZ
882
883 /* XXXX To minimize changes to RE engine we always allocate
884 3-units-long substrs field. */
885 Newz(1004, r->substrs, 1, struct reg_substr_data);
886
c277df42
IZ
887 if (OP(scan) != BRANCH) { /* Only one top-level choice. */
888 scan_data_t data;
889 I32 fake;
c5254dd6 890 STRLEN longest_float_length, longest_fixed_length;
a0d0e21e 891
c277df42 892 StructCopy(&zero_scan_data, &data, scan_data_t);
a0d0e21e 893 first = scan;
c277df42 894 /* Skip introductions and multiplicators >= 1. */
a0d0e21e
LW
895 while ((OP(first) == OPEN && (sawopen = 1)) ||
896 (OP(first) == BRANCH && OP(regnext(first)) != BRANCH) ||
897 (OP(first) == PLUS) ||
898 (OP(first) == MINMOD) ||
899 (regkind[(U8)OP(first)] == CURLY && ARG1(first) > 0) ) {
900 if (OP(first) == PLUS)
901 sawplus = 1;
902 else
903 first += regarglen[(U8)OP(first)];
904 first = NEXTOPER(first);
a687059c
LW
905 }
906
a0d0e21e
LW
907 /* Starting-point info. */
908 again:
c277df42 909 if (OP(first) == EXACT); /* Empty, get anchored substr later. */
a0ed51b3 910 else if (strchr(simple+4,OP(first)))
a0d0e21e 911 r->regstclass = first;
bbce6d69 912 else if (regkind[(U8)OP(first)] == BOUND ||
913 regkind[(U8)OP(first)] == NBOUND)
a0d0e21e
LW
914 r->regstclass = first;
915 else if (regkind[(U8)OP(first)] == BOL) {
c277df42 916 r->reganch |= (OP(first) == MBOL ? ROPT_ANCH_MBOL: ROPT_ANCH_BOL);
a0d0e21e 917 first = NEXTOPER(first);
774d564b 918 goto again;
919 }
920 else if (OP(first) == GPOS) {
921 r->reganch |= ROPT_ANCH_GPOS;
922 first = NEXTOPER(first);
923 goto again;
a0d0e21e
LW
924 }
925 else if ((OP(first) == STAR &&
926 regkind[(U8)OP(NEXTOPER(first))] == ANY) &&
927 !(r->reganch & ROPT_ANCH) )
928 {
929 /* turn .* into ^.* with an implied $*=1 */
774d564b 930 r->reganch |= ROPT_ANCH_BOL | ROPT_IMPLICIT;
a0d0e21e 931 first = NEXTOPER(first);
774d564b 932 goto again;
a0d0e21e 933 }
3280af22 934 if (sawplus && (!sawopen || !PL_regsawback))
a0d0e21e
LW
935 r->reganch |= ROPT_SKIP; /* x+ must match 1st of run */
936
c277df42
IZ
937 /* Scan is after the zeroth branch, first is atomic matcher. */
938 DEBUG_r(PerlIO_printf(Perl_debug_log, "first at %d\n",
939 first - scan + 1));
a0d0e21e
LW
940 /*
941 * If there's something expensive in the r.e., find the
942 * longest literal string that must appear and make it the
943 * regmust. Resolve ties in favor of later strings, since
944 * the regstart check works with the beginning of the r.e.
945 * and avoiding duplication strengthens checking. Not a
946 * strong reason, but sufficient in the absence of others.
947 * [Now we resolve ties in favor of the earlier string if
c277df42 948 * it happens that c_offset_min has been invalidated, since the
a0d0e21e
LW
949 * earlier string may buy us something the later one won't.]
950 */
a0d0e21e 951 minlen = 0;
a687059c 952
c277df42
IZ
953 data.longest_fixed = newSVpv("",0);
954 data.longest_float = newSVpv("",0);
955 data.last_found = newSVpv("",0);
956 data.longest = &(data.longest_fixed);
957 first = scan;
958
3280af22 959 minlen = study_chunk(&first, &fake, scan + PL_regsize, /* Up to end */
c277df42 960 &data, SCF_DO_SUBSTR);
3280af22 961 if ( PL_regnpar == 1 && data.longest == &(data.longest_fixed)
c277df42 962 && data.last_start_min == 0 && data.last_end > 0
3280af22
NIS
963 && !PL_seen_zerolen
964 && (!(PL_regseen & REG_SEEN_GPOS) || (r->reganch & ROPT_ANCH_GPOS)))
c277df42
IZ
965 r->reganch |= ROPT_CHECK_ALL;
966 scan_commit(&data);
967 SvREFCNT_dec(data.last_found);
968
a0ed51b3 969 longest_float_length = CHR_SVLEN(data.longest_float);
c5254dd6 970 if (longest_float_length
c277df42
IZ
971 || (data.flags & SF_FL_BEFORE_EOL
972 && (!(data.flags & SF_FL_BEFORE_MEOL)
3280af22 973 || (PL_regflags & PMf_MULTILINE)))) {
a0ed51b3 974 if (SvCUR(data.longest_fixed) /* ok to leave SvCUR */
c277df42
IZ
975 && data.offset_fixed == data.offset_float_min)
976 goto remove; /* Like in (a)+. */
977
978 r->float_substr = data.longest_float;
979 r->float_min_offset = data.offset_float_min;
980 r->float_max_offset = data.offset_float_max;
2779dcf1 981 fbm_compile(r->float_substr, 0);
c277df42
IZ
982 BmUSEFUL(r->float_substr) = 100;
983 if (data.flags & SF_FL_BEFORE_EOL /* Cannot have SEOL and MULTI */
984 && (!(data.flags & SF_FL_BEFORE_MEOL)
3280af22 985 || (PL_regflags & PMf_MULTILINE)))
c277df42 986 SvTAIL_on(r->float_substr);
a0ed51b3
LW
987 }
988 else {
c277df42
IZ
989 remove:
990 r->float_substr = Nullsv;
991 SvREFCNT_dec(data.longest_float);
c5254dd6 992 longest_float_length = 0;
a0d0e21e 993 }
c277df42 994
a0ed51b3 995 longest_fixed_length = CHR_SVLEN(data.longest_fixed);
c5254dd6 996 if (longest_fixed_length
c277df42
IZ
997 || (data.flags & SF_FIX_BEFORE_EOL /* Cannot have SEOL and MULTI */
998 && (!(data.flags & SF_FIX_BEFORE_MEOL)
3280af22 999 || (PL_regflags & PMf_MULTILINE)))) {
c277df42
IZ
1000 r->anchored_substr = data.longest_fixed;
1001 r->anchored_offset = data.offset_fixed;
2779dcf1 1002 fbm_compile(r->anchored_substr, 0);
c277df42
IZ
1003 BmUSEFUL(r->anchored_substr) = 100;
1004 if (data.flags & SF_FIX_BEFORE_EOL /* Cannot have SEOL and MULTI */
1005 && (!(data.flags & SF_FIX_BEFORE_MEOL)
3280af22 1006 || (PL_regflags & PMf_MULTILINE)))
c277df42 1007 SvTAIL_on(r->anchored_substr);
a0ed51b3
LW
1008 }
1009 else {
c277df42
IZ
1010 r->anchored_substr = Nullsv;
1011 SvREFCNT_dec(data.longest_fixed);
c5254dd6 1012 longest_fixed_length = 0;
a0d0e21e 1013 }
c277df42
IZ
1014
1015 /* A temporary algorithm prefers floated substr to fixed one to dig more info. */
c5254dd6 1016 if (longest_fixed_length > longest_float_length) {
c277df42
IZ
1017 r->check_substr = r->anchored_substr;
1018 r->check_offset_min = r->check_offset_max = r->anchored_offset;
1019 if (r->reganch & ROPT_ANCH_SINGLE)
1020 r->reganch |= ROPT_NOSCAN;
a0ed51b3
LW
1021 }
1022 else {
c277df42
IZ
1023 r->check_substr = r->float_substr;
1024 r->check_offset_min = data.offset_float_min;
1025 r->check_offset_max = data.offset_float_max;
a0d0e21e 1026 }
a0ed51b3
LW
1027 }
1028 else {
c277df42
IZ
1029 /* Several toplevels. Best we can is to set minlen. */
1030 I32 fake;
1031
1032 DEBUG_r(PerlIO_printf(Perl_debug_log, "\n"));
1033 scan = r->program + 1;
3280af22 1034 minlen = study_chunk(&scan, &fake, scan + PL_regsize, NULL, 0);
c277df42 1035 r->check_substr = r->anchored_substr = r->float_substr = Nullsv;
a0d0e21e
LW
1036 }
1037
a0d0e21e 1038 r->minlen = minlen;
3280af22 1039 if (PL_regseen & REG_SEEN_GPOS)
c277df42 1040 r->reganch |= ROPT_GPOS_SEEN;
3280af22 1041 if (PL_regseen & REG_SEEN_LOOKBEHIND)
c277df42 1042 r->reganch |= ROPT_LOOKBEHIND_SEEN;
3280af22 1043 if (PL_regseen & REG_SEEN_EVAL)
ce862d02 1044 r->reganch |= ROPT_EVAL_SEEN;
3280af22
NIS
1045 Newz(1002, r->startp, PL_regnpar, char*);
1046 Newz(1002, r->endp, PL_regnpar, char*);
a0d0e21e
LW
1047 DEBUG_r(regdump(r));
1048 return(r);
a687059c
LW
1049}
1050
1051/*
1052 - reg - regular expression, i.e. main body or parenthesized thing
1053 *
1054 * Caller must absorb opening parenthesis.
1055 *
1056 * Combining parenthesis handling with the base level of regular expression
1057 * is a trifle forced, but the need to tie the tails of the branches to what
1058 * follows makes it hard to avoid.
1059 */
76e3520e 1060STATIC regnode *
8ac85365 1061reg(I32 paren, I32 *flagp)
c277df42 1062 /* paren: Parenthesized? 0=top, 1=(, inside: changed to letter. */
a687059c 1063{
5c0ca799 1064 dTHR;
c277df42
IZ
1065 register regnode *ret; /* Will be the head of the group. */
1066 register regnode *br;
1067 register regnode *lastbr;
1068 register regnode *ender = 0;
a0d0e21e 1069 register I32 parno = 0;
3280af22 1070 I32 flags, oregflags = PL_regflags, have_branch = 0, open = 0;
c277df42 1071 char c;
a0d0e21e 1072
821b33a5 1073 *flagp = 0; /* Tentatively. */
a0d0e21e
LW
1074
1075 /* Make an OPEN node, if parenthesized. */
1076 if (paren) {
3280af22 1077 if (*PL_regcomp_parse == '?') {
ca9dfc88
IZ
1078 U16 posflags = 0, negflags = 0;
1079 U16 *flagsp = &posflags;
1080
3280af22
NIS
1081 PL_regcomp_parse++;
1082 paren = *PL_regcomp_parse++;
c277df42 1083 ret = NULL; /* For look-ahead/behind. */
a0d0e21e 1084 switch (paren) {
c277df42 1085 case '<':
3280af22
NIS
1086 PL_regseen |= REG_SEEN_LOOKBEHIND;
1087 if (*PL_regcomp_parse == '!')
c277df42 1088 paren = ',';
3280af22 1089 if (*PL_regcomp_parse != '=' && *PL_regcomp_parse != '!')
c277df42 1090 goto unknown;
3280af22 1091 PL_regcomp_parse++;
a0d0e21e
LW
1092 case '=':
1093 case '!':
3280af22 1094 PL_seen_zerolen++;
c277df42
IZ
1095 case ':':
1096 case '>':
a0d0e21e
LW
1097 break;
1098 case '$':
1099 case '@':
c277df42 1100 FAIL2("Sequence (?%c...) not implemented", (int)paren);
a0d0e21e
LW
1101 break;
1102 case '#':
3280af22
NIS
1103 while (*PL_regcomp_parse && *PL_regcomp_parse != ')')
1104 PL_regcomp_parse++;
1105 if (*PL_regcomp_parse != ')')
c277df42 1106 FAIL("Sequence (?#... not terminated");
a0d0e21e
LW
1107 nextchar();
1108 *flagp = TRYAGAIN;
1109 return NULL;
c277df42
IZ
1110 case '{':
1111 {
1112 dTHR;
1113 I32 count = 1, n = 0;
1114 char c;
3280af22 1115 char *s = PL_regcomp_parse;
c277df42
IZ
1116 SV *sv;
1117 OP_4tree *sop, *rop;
1118
3280af22
NIS
1119 PL_seen_zerolen++;
1120 PL_regseen |= REG_SEEN_EVAL;
1121 while (count && (c = *PL_regcomp_parse)) {
1122 if (c == '\\' && PL_regcomp_parse[1])
1123 PL_regcomp_parse++;
c277df42
IZ
1124 else if (c == '{')
1125 count++;
1126 else if (c == '}')
1127 count--;
3280af22 1128 PL_regcomp_parse++;
c277df42 1129 }
3280af22 1130 if (*PL_regcomp_parse != ')')
c277df42
IZ
1131 FAIL("Sequence (?{...}) not terminated or not {}-balanced");
1132 if (!SIZE_ONLY) {
1133 AV *av;
1134
3280af22
NIS
1135 if (PL_regcomp_parse - 1 - s)
1136 sv = newSVpv(s, PL_regcomp_parse - 1 - s);
c277df42
IZ
1137 else
1138 sv = newSVpv("", 0);
1139
1140 rop = sv_compile_2op(sv, &sop, "re", &av);
1141
1142 n = add_data(3, "nso");
3280af22
NIS
1143 PL_regcomp_rx->data->data[n] = (void*)rop;
1144 PL_regcomp_rx->data->data[n+1] = (void*)av;
1145 PL_regcomp_rx->data->data[n+2] = (void*)sop;
c277df42 1146 SvREFCNT_dec(sv);
a0ed51b3
LW
1147 }
1148 else { /* First pass */
1149 if (PL_reginterp_cnt < ++PL_seen_evals && PL_curcop != &compiling)
2cd61cdb
IZ
1150 /* No compiled RE interpolated, has runtime
1151 components ===> unsafe. */
1152 FAIL("Eval-group not allowed at runtime, use re 'eval'");
3280af22 1153 if (PL_tainted)
cc6b7395 1154 FAIL("Eval-group in insecure regular expression");
c277df42
IZ
1155 }
1156
1157 nextchar();
c277df42
IZ
1158 return reganode(EVAL, n);
1159 }
1160 case '(':
1161 {
3280af22
NIS
1162 if (PL_regcomp_parse[0] == '?') {
1163 if (PL_regcomp_parse[1] == '=' || PL_regcomp_parse[1] == '!'
1164 || PL_regcomp_parse[1] == '<'
1165 || PL_regcomp_parse[1] == '{') { /* Lookahead or eval. */
c277df42
IZ
1166 I32 flag;
1167
1168 ret = reg_node(LOGICAL);
1169 regtail(ret, reg(1, &flag));
1170 goto insert_if;
1171 }
a0ed51b3
LW
1172 }
1173 else if (PL_regcomp_parse[0] >= '1' && PL_regcomp_parse[0] <= '9' ) {
3280af22 1174 parno = atoi(PL_regcomp_parse++);
c277df42 1175
3280af22
NIS
1176 while (isDIGIT(*PL_regcomp_parse))
1177 PL_regcomp_parse++;
c277df42
IZ
1178 ret = reganode(GROUPP, parno);
1179 if ((c = *nextchar()) != ')')
1180 FAIL2("Switch (?(number%c not recognized", c);
1181 insert_if:
1182 regtail(ret, reganode(IFTHEN, 0));
1183 br = regbranch(&flags, 1);
1184 if (br == NULL)
1185 br = reganode(LONGJMP, 0);
1186 else
1187 regtail(br, reganode(LONGJMP, 0));
1188 c = *nextchar();
1189 if (c == '|') {
1190 lastbr = reganode(IFTHEN, 0); /* Fake one for optimizer. */
1191 regbranch(&flags, 1);
1192 regtail(ret, lastbr);
1193 c = *nextchar();
a0ed51b3
LW
1194 }
1195 else
c277df42
IZ
1196 lastbr = NULL;
1197 if (c != ')')
1198 FAIL("Switch (?(condition)... contains too many branches");
1199 ender = reg_node(TAIL);
1200 regtail(br, ender);
1201 if (lastbr) {
1202 regtail(lastbr, ender);
1203 regtail(NEXTOPER(NEXTOPER(lastbr)), ender);
a0ed51b3
LW
1204 }
1205 else
c277df42
IZ
1206 regtail(ret, ender);
1207 return ret;
a0ed51b3
LW
1208 }
1209 else {
3280af22 1210 FAIL2("Unknown condition for (?(%.2s", PL_regcomp_parse);
c277df42
IZ
1211 }
1212 }
1b1626e4 1213 case 0:
c277df42 1214 FAIL("Sequence (? incomplete");
1b1626e4 1215 break;
a0d0e21e 1216 default:
3280af22 1217 --PL_regcomp_parse;
ca9dfc88 1218 parse_flags:
3280af22
NIS
1219 while (*PL_regcomp_parse && strchr("iogcmsx", *PL_regcomp_parse)) {
1220 if (*PL_regcomp_parse != 'o')
1221 pmflag(flagsp, *PL_regcomp_parse);
1222 ++PL_regcomp_parse;
ca9dfc88 1223 }
3280af22 1224 if (*PL_regcomp_parse == '-') {
ca9dfc88 1225 flagsp = &negflags;
3280af22 1226 ++PL_regcomp_parse;
ca9dfc88 1227 goto parse_flags;
48c036b1 1228 }
3280af22
NIS
1229 PL_regflags |= posflags;
1230 PL_regflags &= ~negflags;
1231 if (*PL_regcomp_parse == ':') {
1232 PL_regcomp_parse++;
ca9dfc88
IZ
1233 paren = ':';
1234 break;
1235 }
c277df42 1236 unknown:
3280af22
NIS
1237 if (*PL_regcomp_parse != ')')
1238 FAIL2("Sequence (?%c...) not recognized", *PL_regcomp_parse);
a0d0e21e
LW
1239 nextchar();
1240 *flagp = TRYAGAIN;
1241 return NULL;
1242 }
1243 }
1244 else {
3280af22
NIS
1245 parno = PL_regnpar;
1246 PL_regnpar++;
a0d0e21e 1247 ret = reganode(OPEN, parno);
c277df42 1248 open = 1;
a0d0e21e 1249 }
a0ed51b3
LW
1250 }
1251 else
a0d0e21e
LW
1252 ret = NULL;
1253
1254 /* Pick up the branches, linking them together. */
c277df42 1255 br = regbranch(&flags, 1);
a0d0e21e
LW
1256 if (br == NULL)
1257 return(NULL);
3280af22
NIS
1258 if (*PL_regcomp_parse == '|') {
1259 if (!SIZE_ONLY && PL_extralen) {
c277df42 1260 reginsert(BRANCHJ, br);
a0ed51b3
LW
1261 }
1262 else
c277df42
IZ
1263 reginsert(BRANCH, br);
1264 have_branch = 1;
1265 if (SIZE_ONLY)
3280af22 1266 PL_extralen += 1; /* For BRANCHJ-BRANCH. */
a0ed51b3
LW
1267 }
1268 else if (paren == ':') {
c277df42
IZ
1269 *flagp |= flags&SIMPLE;
1270 }
1271 if (open) { /* Starts with OPEN. */
1272 regtail(ret, br); /* OPEN -> first. */
a0ed51b3
LW
1273 }
1274 else if (paren != '?') /* Not Conditional */
a0d0e21e 1275 ret = br;
821b33a5
IZ
1276 if (flags&HASWIDTH)
1277 *flagp |= HASWIDTH;
a0d0e21e 1278 *flagp |= flags&SPSTART;
c277df42 1279 lastbr = br;
3280af22
NIS
1280 while (*PL_regcomp_parse == '|') {
1281 if (!SIZE_ONLY && PL_extralen) {
c277df42
IZ
1282 ender = reganode(LONGJMP,0);
1283 regtail(NEXTOPER(NEXTOPER(lastbr)), ender); /* Append to the previous. */
1284 }
1285 if (SIZE_ONLY)
3280af22 1286 PL_extralen += 2; /* Account for LONGJMP. */
a0d0e21e 1287 nextchar();
c277df42 1288 br = regbranch(&flags, 0);
a687059c 1289 if (br == NULL)
a0d0e21e 1290 return(NULL);
c277df42
IZ
1291 regtail(lastbr, br); /* BRANCH -> BRANCH. */
1292 lastbr = br;
821b33a5
IZ
1293 if (flags&HASWIDTH)
1294 *flagp |= HASWIDTH;
a687059c 1295 *flagp |= flags&SPSTART;
a0d0e21e
LW
1296 }
1297
c277df42
IZ
1298 if (have_branch || paren != ':') {
1299 /* Make a closing node, and hook it on the end. */
1300 switch (paren) {
1301 case ':':
1302 ender = reg_node(TAIL);
1303 break;
1304 case 1:
1305 ender = reganode(CLOSE, parno);
1306 break;
1307 case '<':
c277df42
IZ
1308 case ',':
1309 case '=':
1310 case '!':
c277df42 1311 *flagp &= ~HASWIDTH;
821b33a5
IZ
1312 /* FALL THROUGH */
1313 case '>':
1314 ender = reg_node(SUCCEED);
c277df42
IZ
1315 break;
1316 case 0:
1317 ender = reg_node(END);
1318 break;
1319 }
1320 regtail(lastbr, ender);
a0d0e21e 1321
c277df42
IZ
1322 if (have_branch) {
1323 /* Hook the tails of the branches to the closing node. */
1324 for (br = ret; br != NULL; br = regnext(br)) {
1325 regoptail(br, ender);
1326 }
1327 }
a0d0e21e 1328 }
c277df42
IZ
1329
1330 {
1331 char *p;
1332 static char parens[] = "=!<,>";
1333
1334 if (paren && (p = strchr(parens, paren))) {
1335 int node = ((p - parens) % 2) ? UNLESSM : IFMATCH;
1336 int flag = (p - parens) > 1;
1337
1338 if (paren == '>')
1339 node = SUSPEND, flag = 0;
1340 reginsert(node,ret);
c277df42 1341 ret->flags = flag;
c277df42
IZ
1342 regtail(ret, reg_node(TAIL));
1343 }
a0d0e21e
LW
1344 }
1345
1346 /* Check for proper termination. */
3280af22 1347 if (paren && (PL_regcomp_parse >= PL_regxend || *nextchar() != ')')) {
a0d0e21e 1348 FAIL("unmatched () in regexp");
a0ed51b3
LW
1349 }
1350 else if (!paren && PL_regcomp_parse < PL_regxend) {
3280af22 1351 if (*PL_regcomp_parse == ')') {
a0d0e21e 1352 FAIL("unmatched () in regexp");
a0ed51b3
LW
1353 }
1354 else
a0d0e21e
LW
1355 FAIL("junk on end of regexp"); /* "Can't happen". */
1356 /* NOTREACHED */
1357 }
c277df42 1358 if (paren != 0) {
3280af22 1359 PL_regflags = oregflags;
c277df42 1360 }
a687059c 1361
a0d0e21e 1362 return(ret);
a687059c
LW
1363}
1364
1365/*
1366 - regbranch - one alternative of an | operator
1367 *
1368 * Implements the concatenation operator.
1369 */
76e3520e 1370STATIC regnode *
c277df42 1371regbranch(I32 *flagp, I32 first)
a687059c 1372{
5c0ca799 1373 dTHR;
c277df42
IZ
1374 register regnode *ret;
1375 register regnode *chain = NULL;
1376 register regnode *latest;
1377 I32 flags = 0, c = 0;
a0d0e21e 1378
c277df42
IZ
1379 if (first)
1380 ret = NULL;
1381 else {
3280af22 1382 if (!SIZE_ONLY && PL_extralen)
c277df42
IZ
1383 ret = reganode(BRANCHJ,0);
1384 else
1385 ret = reg_node(BRANCH);
1386 }
1387
1388 if (!first && SIZE_ONLY)
3280af22 1389 PL_extralen += 1; /* BRANCHJ */
c277df42
IZ
1390
1391 *flagp = WORST; /* Tentatively. */
a0d0e21e 1392
3280af22 1393 PL_regcomp_parse--;
a0d0e21e 1394 nextchar();
3280af22 1395 while (PL_regcomp_parse < PL_regxend && *PL_regcomp_parse != '|' && *PL_regcomp_parse != ')') {
a0d0e21e
LW
1396 flags &= ~TRYAGAIN;
1397 latest = regpiece(&flags);
1398 if (latest == NULL) {
1399 if (flags & TRYAGAIN)
1400 continue;
1401 return(NULL);
a0ed51b3
LW
1402 }
1403 else if (ret == NULL)
c277df42 1404 ret = latest;
a0d0e21e 1405 *flagp |= flags&HASWIDTH;
c277df42 1406 if (chain == NULL) /* First piece. */
a0d0e21e
LW
1407 *flagp |= flags&SPSTART;
1408 else {
3280af22 1409 PL_regnaughty++;
a0d0e21e 1410 regtail(chain, latest);
a687059c 1411 }
a0d0e21e 1412 chain = latest;
c277df42
IZ
1413 c++;
1414 }
1415 if (chain == NULL) { /* Loop ran zero times. */
1416 chain = reg_node(NOTHING);
1417 if (ret == NULL)
1418 ret = chain;
1419 }
1420 if (c == 1) {
1421 *flagp |= flags&SIMPLE;
a0d0e21e 1422 }
a687059c 1423
a0d0e21e 1424 return(ret);
a687059c
LW
1425}
1426
1427/*
1428 - regpiece - something followed by possible [*+?]
1429 *
1430 * Note that the branching code sequences used for ? and the general cases
1431 * of * and + are somewhat optimized: they use the same NOTHING node as
1432 * both the endmarker for their branch list and the body of the last branch.
1433 * It might seem that this node could be dispensed with entirely, but the
1434 * endmarker role is not redundant.
1435 */
76e3520e 1436STATIC regnode *
8ac85365 1437regpiece(I32 *flagp)
a687059c 1438{
5c0ca799 1439 dTHR;
c277df42 1440 register regnode *ret;
a0d0e21e
LW
1441 register char op;
1442 register char *next;
1443 I32 flags;
3280af22 1444 char *origparse = PL_regcomp_parse;
a0d0e21e
LW
1445 char *maxpos;
1446 I32 min;
c277df42 1447 I32 max = REG_INFTY;
a0d0e21e
LW
1448
1449 ret = regatom(&flags);
1450 if (ret == NULL) {
1451 if (flags & TRYAGAIN)
1452 *flagp |= TRYAGAIN;
1453 return(NULL);
1454 }
1455
3280af22 1456 op = *PL_regcomp_parse;
a0d0e21e 1457
3280af22
NIS
1458 if (op == '{' && regcurly(PL_regcomp_parse)) {
1459 next = PL_regcomp_parse + 1;
a0d0e21e
LW
1460 maxpos = Nullch;
1461 while (isDIGIT(*next) || *next == ',') {
1462 if (*next == ',') {
1463 if (maxpos)
1464 break;
1465 else
1466 maxpos = next;
a687059c 1467 }
a0d0e21e
LW
1468 next++;
1469 }
1470 if (*next == '}') { /* got one */
1471 if (!maxpos)
1472 maxpos = next;
3280af22
NIS
1473 PL_regcomp_parse++;
1474 min = atoi(PL_regcomp_parse);
a0d0e21e
LW
1475 if (*maxpos == ',')
1476 maxpos++;
1477 else
3280af22 1478 maxpos = PL_regcomp_parse;
a0d0e21e
LW
1479 max = atoi(maxpos);
1480 if (!max && *maxpos != '0')
c277df42
IZ
1481 max = REG_INFTY; /* meaning "infinity" */
1482 else if (max >= REG_INFTY)
1483 FAIL2("Quantifier in {,} bigger than %d", REG_INFTY - 1);
3280af22 1484 PL_regcomp_parse = next;
a0d0e21e
LW
1485 nextchar();
1486
1487 do_curly:
1488 if ((flags&SIMPLE)) {
3280af22 1489 PL_regnaughty += 2 + PL_regnaughty / 2;
a0d0e21e
LW
1490 reginsert(CURLY, ret);
1491 }
1492 else {
3280af22 1493 PL_regnaughty += 4 + PL_regnaughty; /* compound interest */
c277df42 1494 regtail(ret, reg_node(WHILEM));
3280af22 1495 if (!SIZE_ONLY && PL_extralen) {
c277df42
IZ
1496 reginsert(LONGJMP,ret);
1497 reginsert(NOTHING,ret);
1498 NEXT_OFF(ret) = 3; /* Go over LONGJMP. */
1499 }
a0d0e21e 1500 reginsert(CURLYX,ret);
3280af22 1501 if (!SIZE_ONLY && PL_extralen)
c277df42
IZ
1502 NEXT_OFF(ret) = 3; /* Go over NOTHING to LONGJMP. */
1503 regtail(ret, reg_node(NOTHING));
1504 if (SIZE_ONLY)
3280af22 1505 PL_extralen += 3;
a0d0e21e 1506 }
c277df42 1507 ret->flags = 0;
a0d0e21e
LW
1508
1509 if (min > 0)
821b33a5
IZ
1510 *flagp = WORST;
1511 if (max > 0)
1512 *flagp |= HASWIDTH;
a0d0e21e 1513 if (max && max < min)
c277df42
IZ
1514 FAIL("Can't do {n,m} with n > m");
1515 if (!SIZE_ONLY) {
1516 ARG1_SET(ret, min);
1517 ARG2_SET(ret, max);
a687059c 1518 }
a687059c 1519
a0d0e21e 1520 goto nest_check;
a687059c 1521 }
a0d0e21e 1522 }
a687059c 1523
a0d0e21e
LW
1524 if (!ISMULT1(op)) {
1525 *flagp = flags;
a687059c 1526 return(ret);
a0d0e21e 1527 }
bb20fd44 1528
c277df42 1529#if 0 /* Now runtime fix should be reliable. */
bb20fd44 1530 if (!(flags&HASWIDTH) && op != '?')
c277df42
IZ
1531 FAIL("regexp *+ operand could be empty");
1532#endif
bb20fd44 1533
a0d0e21e
LW
1534 nextchar();
1535
821b33a5 1536 *flagp = (op != '+') ? (WORST|SPSTART|HASWIDTH) : (WORST|HASWIDTH);
a0d0e21e
LW
1537
1538 if (op == '*' && (flags&SIMPLE)) {
1539 reginsert(STAR, ret);
c277df42 1540 ret->flags = 0;
3280af22 1541 PL_regnaughty += 4;
a0d0e21e
LW
1542 }
1543 else if (op == '*') {
1544 min = 0;
1545 goto do_curly;
a0ed51b3
LW
1546 }
1547 else if (op == '+' && (flags&SIMPLE)) {
a0d0e21e 1548 reginsert(PLUS, ret);
c277df42 1549 ret->flags = 0;
3280af22 1550 PL_regnaughty += 3;
a0d0e21e
LW
1551 }
1552 else if (op == '+') {
1553 min = 1;
1554 goto do_curly;
a0ed51b3
LW
1555 }
1556 else if (op == '?') {
a0d0e21e
LW
1557 min = 0; max = 1;
1558 goto do_curly;
1559 }
1560 nest_check:
599cee73
PM
1561 if (ckWARN(WARN_UNSAFE) && !SIZE_ONLY && !(flags&HASWIDTH) && max > 10000) {
1562 warner(WARN_UNSAFE, "%.*s matches null string many times",
3280af22 1563 PL_regcomp_parse - origparse, origparse);
a0d0e21e
LW
1564 }
1565
3280af22 1566 if (*PL_regcomp_parse == '?') {
a0d0e21e
LW
1567 nextchar();
1568 reginsert(MINMOD, ret);
c277df42 1569 regtail(ret, ret + NODE_STEP_REGNODE);
a0d0e21e 1570 }
3280af22 1571 if (ISMULT2(PL_regcomp_parse))
a0d0e21e
LW
1572 FAIL("nested *?+ in regexp");
1573
1574 return(ret);
a687059c
LW
1575}
1576
1577/*
1578 - regatom - the lowest level
1579 *
1580 * Optimization: gobbles an entire sequence of ordinary characters so that
1581 * it can turn them into a single node, which is smaller to store and
1582 * faster to run. Backslashed characters are exceptions, each becoming a
1583 * separate node; the code is simpler that way and it's not worth fixing.
1584 *
1585 * [Yes, it is worth fixing, some scripts can run twice the speed.]
1586 */
76e3520e 1587STATIC regnode *
8ac85365 1588regatom(I32 *flagp)
a687059c 1589{
5c0ca799 1590 dTHR;
c277df42 1591 register regnode *ret = 0;
a0d0e21e
LW
1592 I32 flags;
1593
1594 *flagp = WORST; /* Tentatively. */
1595
1596tryagain:
3280af22 1597 switch (*PL_regcomp_parse) {
a0d0e21e 1598 case '^':
3280af22 1599 PL_seen_zerolen++;
a0d0e21e 1600 nextchar();
3280af22 1601 if (PL_regflags & PMf_MULTILINE)
c277df42 1602 ret = reg_node(MBOL);
3280af22 1603 else if (PL_regflags & PMf_SINGLELINE)
c277df42 1604 ret = reg_node(SBOL);
a0d0e21e 1605 else
c277df42 1606 ret = reg_node(BOL);
a0d0e21e
LW
1607 break;
1608 case '$':
3280af22
NIS
1609 if (PL_regcomp_parse[1])
1610 PL_seen_zerolen++;
a0d0e21e 1611 nextchar();
3280af22 1612 if (PL_regflags & PMf_MULTILINE)
c277df42 1613 ret = reg_node(MEOL);
3280af22 1614 else if (PL_regflags & PMf_SINGLELINE)
c277df42 1615 ret = reg_node(SEOL);
a0d0e21e 1616 else
c277df42 1617 ret = reg_node(EOL);
a0d0e21e
LW
1618 break;
1619 case '.':
1620 nextchar();
a0ed51b3
LW
1621 if (UTF) {
1622 if (PL_regflags & PMf_SINGLELINE)
1623 ret = reg_node(SANYUTF8);
1624 else
1625 ret = reg_node(ANYUTF8);
1626 *flagp |= HASWIDTH;
1627 }
1628 else {
1629 if (PL_regflags & PMf_SINGLELINE)
1630 ret = reg_node(SANY);
1631 else
1632 ret = reg_node(ANY);
1633 *flagp |= HASWIDTH|SIMPLE;
1634 }
3280af22 1635 PL_regnaughty++;
a0d0e21e
LW
1636 break;
1637 case '[':
3280af22 1638 PL_regcomp_parse++;
a0ed51b3 1639 ret = (UTF ? regclassutf8() : regclass());
a14b48bc
LW
1640 if (*PL_regcomp_parse != ']')
1641 FAIL("unmatched [] in regexp");
1642 nextchar();
a0d0e21e
LW
1643 *flagp |= HASWIDTH|SIMPLE;
1644 break;
1645 case '(':
1646 nextchar();
1647 ret = reg(1, &flags);
1648 if (ret == NULL) {
1649 if (flags & TRYAGAIN)
1650 goto tryagain;
1651 return(NULL);
1652 }
c277df42 1653 *flagp |= flags&(HASWIDTH|SPSTART|SIMPLE);
a0d0e21e
LW
1654 break;
1655 case '|':
1656 case ')':
1657 if (flags & TRYAGAIN) {
1658 *flagp |= TRYAGAIN;
1659 return NULL;
1660 }
3280af22 1661 FAIL2("internal urp in regexp at /%s/", PL_regcomp_parse);
a0d0e21e
LW
1662 /* Supposed to be caught earlier. */
1663 break;
85afd4ae 1664 case '{':
3280af22
NIS
1665 if (!regcurly(PL_regcomp_parse)) {
1666 PL_regcomp_parse++;
85afd4ae
CS
1667 goto defchar;
1668 }
1669 /* FALL THROUGH */
a0d0e21e
LW
1670 case '?':
1671 case '+':
1672 case '*':
3115e423 1673 FAIL("?+*{} follows nothing in regexp");
a0d0e21e
LW
1674 break;
1675 case '\\':
3280af22 1676 switch (*++PL_regcomp_parse) {
a0d0e21e 1677 case 'A':
3280af22 1678 PL_seen_zerolen++;
c277df42 1679 ret = reg_node(SBOL);
a0d0e21e
LW
1680 *flagp |= SIMPLE;
1681 nextchar();
1682 break;
1683 case 'G':
c277df42 1684 ret = reg_node(GPOS);
3280af22 1685 PL_regseen |= REG_SEEN_GPOS;
a0d0e21e
LW
1686 *flagp |= SIMPLE;
1687 nextchar();
1688 break;
1689 case 'Z':
c277df42 1690 ret = reg_node(SEOL);
a0d0e21e
LW
1691 *flagp |= SIMPLE;
1692 nextchar();
1693 break;
b85d18e9
IZ
1694 case 'z':
1695 ret = reg_node(EOS);
1696 *flagp |= SIMPLE;
3280af22 1697 PL_seen_zerolen++; /* Do not optimize RE away */
b85d18e9
IZ
1698 nextchar();
1699 break;
a0ed51b3
LW
1700 case 'C':
1701 ret = reg_node(SANY);
1702 *flagp |= HASWIDTH|SIMPLE;
1703 nextchar();
1704 break;
1705 case 'X':
1706 ret = reg_node(CLUMP);
1707 *flagp |= HASWIDTH;
1708 nextchar();
1709 if (UTF && !PL_utf8_mark)
dfe13c55 1710 is_utf8_mark((U8*)"~"); /* preload table */
a0ed51b3 1711 break;
a0d0e21e 1712 case 'w':
a0ed51b3
LW
1713 ret = reg_node(
1714 UTF
1715 ? (LOC ? ALNUMLUTF8 : ALNUMUTF8)
1716 : (LOC ? ALNUML : ALNUM));
a0d0e21e
LW
1717 *flagp |= HASWIDTH|SIMPLE;
1718 nextchar();
a0ed51b3 1719 if (UTF && !PL_utf8_alnum)
dfe13c55 1720 is_utf8_alnum((U8*)"a"); /* preload table */
a0d0e21e
LW
1721 break;
1722 case 'W':
a0ed51b3
LW
1723 ret = reg_node(
1724 UTF
1725 ? (LOC ? NALNUMLUTF8 : NALNUMUTF8)
1726 : (LOC ? NALNUML : NALNUM));
a0d0e21e
LW
1727 *flagp |= HASWIDTH|SIMPLE;
1728 nextchar();
a0ed51b3 1729 if (UTF && !PL_utf8_alnum)
dfe13c55 1730 is_utf8_alnum((U8*)"a"); /* preload table */
a0d0e21e
LW
1731 break;
1732 case 'b':
3280af22 1733 PL_seen_zerolen++;
a0ed51b3
LW
1734 ret = reg_node(
1735 UTF
1736 ? (LOC ? BOUNDLUTF8 : BOUNDUTF8)
1737 : (LOC ? BOUNDL : BOUND));
a0d0e21e
LW
1738 *flagp |= SIMPLE;
1739 nextchar();
a0ed51b3 1740 if (UTF && !PL_utf8_alnum)
dfe13c55 1741 is_utf8_alnum((U8*)"a"); /* preload table */
a0d0e21e
LW
1742 break;
1743 case 'B':
3280af22 1744 PL_seen_zerolen++;
a0ed51b3
LW
1745 ret = reg_node(
1746 UTF
1747 ? (LOC ? NBOUNDLUTF8 : NBOUNDUTF8)
1748 : (LOC ? NBOUNDL : NBOUND));
a0d0e21e
LW
1749 *flagp |= SIMPLE;
1750 nextchar();
a0ed51b3 1751 if (UTF && !PL_utf8_alnum)
dfe13c55 1752 is_utf8_alnum((U8*)"a"); /* preload table */
a0d0e21e
LW
1753 break;
1754 case 's':
a0ed51b3
LW
1755 ret = reg_node(
1756 UTF
1757 ? (LOC ? SPACELUTF8 : SPACEUTF8)
1758 : (LOC ? SPACEL : SPACE));
a0d0e21e
LW
1759 *flagp |= HASWIDTH|SIMPLE;
1760 nextchar();
a0ed51b3 1761 if (UTF && !PL_utf8_space)
dfe13c55 1762 is_utf8_space((U8*)" "); /* preload table */
a0d0e21e
LW
1763 break;
1764 case 'S':
a0ed51b3
LW
1765 ret = reg_node(
1766 UTF
1767 ? (LOC ? NSPACELUTF8 : NSPACEUTF8)
1768 : (LOC ? NSPACEL : NSPACE));
a0d0e21e
LW
1769 *flagp |= HASWIDTH|SIMPLE;
1770 nextchar();
a0ed51b3 1771 if (UTF && !PL_utf8_space)
dfe13c55 1772 is_utf8_space((U8*)" "); /* preload table */
a0d0e21e
LW
1773 break;
1774 case 'd':
a0ed51b3 1775 ret = reg_node(UTF ? DIGITUTF8 : DIGIT);
a0d0e21e
LW
1776 *flagp |= HASWIDTH|SIMPLE;
1777 nextchar();
a0ed51b3 1778 if (UTF && !PL_utf8_digit)
dfe13c55 1779 is_utf8_digit((U8*)"1"); /* preload table */
a0d0e21e
LW
1780 break;
1781 case 'D':
a0ed51b3 1782 ret = reg_node(UTF ? NDIGITUTF8 : NDIGIT);
a0d0e21e
LW
1783 *flagp |= HASWIDTH|SIMPLE;
1784 nextchar();
a0ed51b3 1785 if (UTF && !PL_utf8_digit)
dfe13c55 1786 is_utf8_digit((U8*)"1"); /* preload table */
a0d0e21e 1787 break;
a14b48bc
LW
1788 case 'p':
1789 case 'P':
1790 { /* a lovely hack--pretend we saw [\pX] instead */
1791 char* oldregxend = PL_regxend;
1792
1793 if (PL_regcomp_parse[1] == '{') {
1794 PL_regxend = strchr(PL_regcomp_parse, '}');
1795 if (!PL_regxend)
1796 FAIL("Missing right brace on \\p{}");
1797 PL_regxend++;
1798 }
1799 else
1800 PL_regxend = PL_regcomp_parse + 2;
1801 PL_regcomp_parse--;
1802
1803 ret = regclassutf8();
1804
1805 PL_regxend = oldregxend;
1806 PL_regcomp_parse--;
1807 nextchar();
1808 *flagp |= HASWIDTH|SIMPLE;
1809 }
1810 break;
a0d0e21e
LW
1811 case 'n':
1812 case 'r':
1813 case 't':
1814 case 'f':
1815 case 'e':
1816 case 'a':
1817 case 'x':
1818 case 'c':
1819 case '0':
1820 goto defchar;
1821 case '1': case '2': case '3': case '4':
1822 case '5': case '6': case '7': case '8': case '9':
1823 {
3280af22 1824 I32 num = atoi(PL_regcomp_parse);
a0d0e21e 1825
3280af22 1826 if (num > 9 && num >= PL_regnpar)
a0d0e21e
LW
1827 goto defchar;
1828 else {
3280af22 1829 if (!SIZE_ONLY && num > PL_regcomp_rx->nparens)
ef64f398 1830 FAIL("reference to nonexistent group");
3280af22 1831 PL_regsawback = 1;
a0ed51b3
LW
1832 ret = reganode(FOLD
1833 ? (LOC ? REFFL : REFF)
c8756f30 1834 : REF, num);
a0d0e21e 1835 *flagp |= HASWIDTH;
3280af22
NIS
1836 while (isDIGIT(*PL_regcomp_parse))
1837 PL_regcomp_parse++;
1838 PL_regcomp_parse--;
a0d0e21e
LW
1839 nextchar();
1840 }
1841 }
1842 break;
1843 case '\0':
3280af22 1844 if (PL_regcomp_parse >= PL_regxend)
a0d0e21e
LW
1845 FAIL("trailing \\ in regexp");
1846 /* FALL THROUGH */
1847 default:
1848 goto defchar;
1849 }
1850 break;
4633a7c4
LW
1851
1852 case '#':
3280af22
NIS
1853 if (PL_regflags & PMf_EXTENDED) {
1854 while (PL_regcomp_parse < PL_regxend && *PL_regcomp_parse != '\n') PL_regcomp_parse++;
1855 if (PL_regcomp_parse < PL_regxend)
4633a7c4
LW
1856 goto tryagain;
1857 }
1858 /* FALL THROUGH */
1859
a0d0e21e
LW
1860 default: {
1861 register I32 len;
a0ed51b3 1862 register UV ender;
a0d0e21e 1863 register char *p;
c277df42 1864 char *oldp, *s;
a0d0e21e
LW
1865 I32 numlen;
1866
3280af22 1867 PL_regcomp_parse++;
a0d0e21e
LW
1868
1869 defchar:
a0ed51b3
LW
1870 ret = reg_node(FOLD
1871 ? (LOC ? EXACTFL : EXACTF)
bbce6d69 1872 : EXACT);
161b471a 1873 s = (char *) OPERAND(ret);
c277df42 1874 regc(0, s++); /* save spot for len */
3280af22
NIS
1875 for (len = 0, p = PL_regcomp_parse - 1;
1876 len < 127 && p < PL_regxend;
a0d0e21e
LW
1877 len++)
1878 {
1879 oldp = p;
5b5a24f7 1880
3280af22
NIS
1881 if (PL_regflags & PMf_EXTENDED)
1882 p = regwhite(p, PL_regxend);
a0d0e21e
LW
1883 switch (*p) {
1884 case '^':
1885 case '$':
1886 case '.':
1887 case '[':
1888 case '(':
1889 case ')':
1890 case '|':
1891 goto loopdone;
1892 case '\\':
1893 switch (*++p) {
1894 case 'A':
1895 case 'G':
1896 case 'Z':
b85d18e9 1897 case 'z':
a0d0e21e
LW
1898 case 'w':
1899 case 'W':
1900 case 'b':
1901 case 'B':
1902 case 's':
1903 case 'S':
1904 case 'd':
1905 case 'D':
a14b48bc
LW
1906 case 'p':
1907 case 'P':
a0d0e21e
LW
1908 --p;
1909 goto loopdone;
1910 case 'n':
1911 ender = '\n';
1912 p++;
a687059c 1913 break;
a0d0e21e
LW
1914 case 'r':
1915 ender = '\r';
1916 p++;
a687059c 1917 break;
a0d0e21e
LW
1918 case 't':
1919 ender = '\t';
1920 p++;
a687059c 1921 break;
a0d0e21e
LW
1922 case 'f':
1923 ender = '\f';
1924 p++;
a687059c 1925 break;
a0d0e21e
LW
1926 case 'e':
1927 ender = '\033';
1928 p++;
a687059c 1929 break;
a0d0e21e
LW
1930 case 'a':
1931 ender = '\007';
1932 p++;
a687059c 1933 break;
a0d0e21e 1934 case 'x':
a0ed51b3
LW
1935 if (*++p == '{') {
1936 char* e = strchr(p, '}');
1937
1938 if (!e)
1939 FAIL("Missing right brace on \\x{}");
1940 else if (UTF) {
1941 ender = scan_hex(p + 1, e - p, &numlen);
1942 if (numlen + len >= 127) { /* numlen is generous */
1943 p--;
1944 goto loopdone;
1945 }
1946 p = e + 1;
1947 }
1948 else
1949 FAIL("Can't use \\x{} without 'use utf8' declaration");
1950 }
1951 else {
1952 ender = scan_hex(p, 2, &numlen);
1953 p += numlen;
1954 }
a687059c 1955 break;
a0d0e21e
LW
1956 case 'c':
1957 p++;
bbce6d69 1958 ender = UCHARAT(p++);
1959 ender = toCTRL(ender);
a687059c 1960 break;
a0d0e21e
LW
1961 case '0': case '1': case '2': case '3':case '4':
1962 case '5': case '6': case '7': case '8':case '9':
1963 if (*p == '0' ||
3280af22 1964 (isDIGIT(p[1]) && atoi(p) >= PL_regnpar) ) {
a0d0e21e
LW
1965 ender = scan_oct(p, 3, &numlen);
1966 p += numlen;
1967 }
1968 else {
1969 --p;
1970 goto loopdone;
a687059c
LW
1971 }
1972 break;
a0d0e21e 1973 case '\0':
3280af22 1974 if (p >= PL_regxend)
a687059c
LW
1975 FAIL("trailing \\ in regexp");
1976 /* FALL THROUGH */
a0d0e21e 1977 default:
a0ed51b3 1978 goto normal_default;
a0d0e21e
LW
1979 }
1980 break;
a687059c 1981 default:
a0ed51b3
LW
1982 normal_default:
1983 if ((*p & 0xc0) == 0xc0 && UTF) {
dfe13c55 1984 ender = utf8_to_uv((U8*)p, &numlen);
a0ed51b3
LW
1985 p += numlen;
1986 }
1987 else
1988 ender = *p++;
a0d0e21e 1989 break;
a687059c 1990 }
3280af22
NIS
1991 if (PL_regflags & PMf_EXTENDED)
1992 p = regwhite(p, PL_regxend);
a0ed51b3
LW
1993 if (UTF && FOLD) {
1994 if (LOC)
1995 ender = toLOWER_LC_uni(ender);
1996 else
1997 ender = toLOWER_uni(ender);
1998 }
a0d0e21e
LW
1999 if (ISMULT2(p)) { /* Back off on ?+*. */
2000 if (len)
2001 p = oldp;
a0ed51b3
LW
2002 else if (ender >= 0x80 && UTF) {
2003 reguni(ender, s, &numlen);
2004 s += numlen;
2005 len += numlen;
2006 }
a0d0e21e
LW
2007 else {
2008 len++;
c277df42 2009 regc(ender, s++);
a0d0e21e
LW
2010 }
2011 break;
a687059c 2012 }
a0ed51b3
LW
2013 if (ender >= 0x80 && UTF) {
2014 reguni(ender, s, &numlen);
2015 s += numlen;
2016 len += numlen - 1;
2017 }
2018 else
2019 regc(ender, s++);
a0d0e21e
LW
2020 }
2021 loopdone:
3280af22 2022 PL_regcomp_parse = p - 1;
a0d0e21e
LW
2023 nextchar();
2024 if (len < 0)
2025 FAIL("internal disaster in regexp");
2026 if (len > 0)
2027 *flagp |= HASWIDTH;
2028 if (len == 1)
2029 *flagp |= SIMPLE;
c277df42 2030 if (!SIZE_ONLY)
a0d0e21e 2031 *OPERAND(ret) = len;
c277df42
IZ
2032 regc('\0', s++);
2033 if (SIZE_ONLY) {
3280af22 2034 PL_regsize += (len + 2 + sizeof(regnode) - 1) / sizeof(regnode);
a0ed51b3
LW
2035 }
2036 else {
3280af22 2037 PL_regcode += (len + 2 + sizeof(regnode) - 1) / sizeof(regnode);
c277df42 2038 }
a687059c 2039 }
a0d0e21e
LW
2040 break;
2041 }
a687059c 2042
a0d0e21e 2043 return(ret);
a687059c
LW
2044}
2045
873ef191 2046STATIC char *
8ac85365 2047regwhite(char *p, char *e)
5b5a24f7
CS
2048{
2049 while (p < e) {
2050 if (isSPACE(*p))
2051 ++p;
2052 else if (*p == '#') {
2053 do {
2054 p++;
2055 } while (p < e && *p != '\n');
2056 }
2057 else
2058 break;
2059 }
2060 return p;
2061}
2062
76e3520e 2063STATIC regnode *
8ac85365 2064regclass(void)
a687059c 2065{
5c0ca799 2066 dTHR;
c277df42 2067 register char *opnd, *s;
a0ed51b3
LW
2068 register I32 value;
2069 register I32 lastvalue = 1234;
a0d0e21e 2070 register I32 range = 0;
c277df42 2071 register regnode *ret;
a0d0e21e
LW
2072 register I32 def;
2073 I32 numlen;
2074
3280af22 2075 s = opnd = (char *) OPERAND(PL_regcode);
c277df42 2076 ret = reg_node(ANYOF);
a0ed51b3 2077 for (value = 0; value < 33; value++)
c277df42 2078 regc(0, s++);
3280af22
NIS
2079 if (*PL_regcomp_parse == '^') { /* Complement of range. */
2080 PL_regnaughty++;
2081 PL_regcomp_parse++;
c277df42 2082 if (!SIZE_ONLY)
bbce6d69 2083 *opnd |= ANYOF_INVERT;
2084 }
c277df42 2085 if (!SIZE_ONLY) {
3280af22 2086 PL_regcode += ANY_SKIP;
a0ed51b3 2087 if (FOLD)
bbce6d69 2088 *opnd |= ANYOF_FOLD;
a0ed51b3 2089 if (LOC)
bbce6d69 2090 *opnd |= ANYOF_LOCALE;
a0ed51b3
LW
2091 }
2092 else {
3280af22 2093 PL_regsize += ANY_SKIP;
a0d0e21e 2094 }
3280af22 2095 if (*PL_regcomp_parse == ']' || *PL_regcomp_parse == '-')
a0d0e21e 2096 goto skipcond; /* allow 1st char to be ] or - */
3280af22 2097 while (PL_regcomp_parse < PL_regxend && *PL_regcomp_parse != ']') {
a0d0e21e 2098 skipcond:
a0ed51b3
LW
2099 value = UCHARAT(PL_regcomp_parse++);
2100 if (value == '[' && PL_regcomp_parse + 1 < PL_regxend &&
4599a1de 2101 /* I smell either [: or [= or [. -- POSIX has been here, right? */
3280af22
NIS
2102 (*PL_regcomp_parse == ':' || *PL_regcomp_parse == '=' || *PL_regcomp_parse == '.')) {
2103 char posixccc = *PL_regcomp_parse;
2104 char* posixccs = PL_regcomp_parse++;
4599a1de 2105
3280af22
NIS
2106 while (PL_regcomp_parse < PL_regxend && *PL_regcomp_parse != posixccc)
2107 PL_regcomp_parse++;
2108 if (PL_regcomp_parse == PL_regxend)
4599a1de 2109 /* Grandfather lone [:, [=, [. */
3280af22 2110 PL_regcomp_parse = posixccs;
4599a1de 2111 else {
3280af22
NIS
2112 PL_regcomp_parse++; /* skip over the posixccc */
2113 if (*PL_regcomp_parse == ']') {
4599a1de
JH
2114 /* Not Implemented Yet.
2115 * (POSIX Extended Character Classes, that is)
2116 * The text between e.g. [: and :] would start
77d41b28 2117 * at posixccs + 1 and stop at regcomp_parse - 2. */
599cee73
PM
2118 if (ckWARN(WARN_UNSAFE) && !SIZE_ONLY)
2119 warner(WARN_UNSAFE,
2120 "Character class syntax [%c %c] is reserved for future extensions", posixccc, posixccc);
3280af22 2121 PL_regcomp_parse++; /* skip over the ending ] */
4599a1de
JH
2122 }
2123 }
2124 }
a0ed51b3
LW
2125 if (value == '\\') {
2126 value = UCHARAT(PL_regcomp_parse++);
2127 switch (value) {
a0d0e21e 2128 case 'w':
ae5c130c 2129 if (!SIZE_ONLY) {
a0ed51b3 2130 if (LOC)
bbce6d69 2131 *opnd |= ANYOF_ALNUML;
ae5c130c 2132 else {
a0ed51b3
LW
2133 for (value = 0; value < 256; value++)
2134 if (isALNUM(value))
2135 ANYOF_SET(opnd, value);
ae5c130c 2136 }
bbce6d69 2137 }
a0ed51b3 2138 lastvalue = 1234;
a0d0e21e
LW
2139 continue;
2140 case 'W':
ae5c130c 2141 if (!SIZE_ONLY) {
a0ed51b3 2142 if (LOC)
bbce6d69 2143 *opnd |= ANYOF_NALNUML;
ae5c130c 2144 else {
a0ed51b3
LW
2145 for (value = 0; value < 256; value++)
2146 if (!isALNUM(value))
2147 ANYOF_SET(opnd, value);
ae5c130c 2148 }
bbce6d69 2149 }
a0ed51b3 2150 lastvalue = 1234;
a0d0e21e
LW
2151 continue;
2152 case 's':
ae5c130c 2153 if (!SIZE_ONLY) {
a0ed51b3 2154 if (LOC)
bbce6d69 2155 *opnd |= ANYOF_SPACEL;
ae5c130c 2156 else {
a0ed51b3
LW
2157 for (value = 0; value < 256; value++)
2158 if (isSPACE(value))
2159 ANYOF_SET(opnd, value);
ae5c130c 2160 }
bbce6d69 2161 }
a0ed51b3 2162 lastvalue = 1234;
a0d0e21e
LW
2163 continue;
2164 case 'S':
ae5c130c 2165 if (!SIZE_ONLY) {
a0ed51b3 2166 if (LOC)
bbce6d69 2167 *opnd |= ANYOF_NSPACEL;
ae5c130c 2168 else {
a0ed51b3
LW
2169 for (value = 0; value < 256; value++)
2170 if (!isSPACE(value))
2171 ANYOF_SET(opnd, value);
ae5c130c 2172 }
bbce6d69 2173 }
a0ed51b3 2174 lastvalue = 1234;
a0d0e21e
LW
2175 continue;
2176 case 'd':
ae5c130c 2177 if (!SIZE_ONLY) {
a0ed51b3
LW
2178 for (value = '0'; value <= '9'; value++)
2179 ANYOF_SET(opnd, value);
ae5c130c 2180 }
a0ed51b3 2181 lastvalue = 1234;
a0d0e21e
LW
2182 continue;
2183 case 'D':
ae5c130c 2184 if (!SIZE_ONLY) {
a0ed51b3
LW
2185 for (value = 0; value < '0'; value++)
2186 ANYOF_SET(opnd, value);
2187 for (value = '9' + 1; value < 256; value++)
2188 ANYOF_SET(opnd, value);
ae5c130c 2189 }
a0ed51b3 2190 lastvalue = 1234;
a0d0e21e
LW
2191 continue;
2192 case 'n':
a0ed51b3 2193 value = '\n';
a0d0e21e
LW
2194 break;
2195 case 'r':
a0ed51b3 2196 value = '\r';
a0d0e21e
LW
2197 break;
2198 case 't':
a0ed51b3 2199 value = '\t';
a0d0e21e
LW
2200 break;
2201 case 'f':
a0ed51b3 2202 value = '\f';
a0d0e21e
LW
2203 break;
2204 case 'b':
a0ed51b3 2205 value = '\b';
a0d0e21e
LW
2206 break;
2207 case 'e':
a0ed51b3 2208 value = '\033';
a0d0e21e
LW
2209 break;
2210 case 'a':
a0ed51b3 2211 value = '\007';
a0d0e21e
LW
2212 break;
2213 case 'x':
a0ed51b3 2214 value = scan_hex(PL_regcomp_parse, 2, &numlen);
3280af22 2215 PL_regcomp_parse += numlen;
a0d0e21e
LW
2216 break;
2217 case 'c':
a0ed51b3
LW
2218 value = UCHARAT(PL_regcomp_parse++);
2219 value = toCTRL(value);
a0d0e21e
LW
2220 break;
2221 case '0': case '1': case '2': case '3': case '4':
2222 case '5': case '6': case '7': case '8': case '9':
a0ed51b3 2223 value = scan_oct(--PL_regcomp_parse, 3, &numlen);
3280af22 2224 PL_regcomp_parse += numlen;
a0d0e21e
LW
2225 break;
2226 }
2227 }
2228 if (range) {
a0ed51b3 2229 if (lastvalue > value)
a0d0e21e
LW
2230 FAIL("invalid [] range in regexp");
2231 range = 0;
2232 }
2233 else {
a0ed51b3 2234 lastvalue = value;
3280af22
NIS
2235 if (*PL_regcomp_parse == '-' && PL_regcomp_parse+1 < PL_regxend &&
2236 PL_regcomp_parse[1] != ']') {
2237 PL_regcomp_parse++;
a0d0e21e
LW
2238 range = 1;
2239 continue; /* do it next time */
2240 }
a687059c 2241 }
ae5c130c 2242 if (!SIZE_ONLY) {
a0ed51b3
LW
2243 for ( ; lastvalue <= value; lastvalue++)
2244 ANYOF_SET(opnd, lastvalue);
ae5c130c 2245 }
a0ed51b3 2246 lastvalue = value;
a0d0e21e 2247 }
ae5c130c
GS
2248 /* optimize case-insensitive simple patterns (e.g. /[a-z]/i) */
2249 if (!SIZE_ONLY && (*opnd & (0xFF ^ ANYOF_INVERT)) == ANYOF_FOLD) {
a0ed51b3
LW
2250 for (value = 0; value < 256; ++value) {
2251 if (ANYOF_TEST(opnd, value)) {
2252 I32 cf = fold[value];
ae5c130c
GS
2253 ANYOF_SET(opnd, cf);
2254 }
2255 }
2256 *opnd &= ~ANYOF_FOLD;
2257 }
2258 /* optimize inverted simple patterns (e.g. [^a-z]) */
2259 if (!SIZE_ONLY && (*opnd & 0xFF) == ANYOF_INVERT) {
a0ed51b3
LW
2260 for (value = 0; value < 32; ++value)
2261 opnd[1 + value] ^= 0xFF;
ae5c130c
GS
2262 *opnd = 0;
2263 }
a0d0e21e
LW
2264 return ret;
2265}
2266
a0ed51b3
LW
2267STATIC regnode *
2268regclassutf8(void)
2269{
2270 register char *opnd, *e;
2271 register U32 value;
2272 register U32 lastvalue = 123456;
2273 register I32 range = 0;
2274 register regnode *ret;
2275 I32 numlen;
2276 I32 n;
2277 SV *listsv;
2278 U8 flags = 0;
c485e607 2279 dTHR;
a0ed51b3
LW
2280
2281 if (*PL_regcomp_parse == '^') { /* Complement of range. */
2282 PL_regnaughty++;
2283 PL_regcomp_parse++;
2284 if (!SIZE_ONLY)
2285 flags |= ANYOF_INVERT;
2286 }
2287 if (!SIZE_ONLY) {
2288 if (FOLD)
2289 flags |= ANYOF_FOLD;
2290 if (LOC)
2291 flags |= ANYOF_LOCALE;
2292 listsv = newSVpv("# comment\n",0);
2293 }
2294
2295 if (*PL_regcomp_parse == ']' || *PL_regcomp_parse == '-')
2296 goto skipcond; /* allow 1st char to be ] or - */
2297
2298 while (PL_regcomp_parse < PL_regxend && *PL_regcomp_parse != ']') {
2299 skipcond:
dfe13c55 2300 value = utf8_to_uv((U8*)PL_regcomp_parse, &numlen);
a0ed51b3
LW
2301 PL_regcomp_parse += numlen;
2302
2303 if (value == '[' && PL_regcomp_parse + 1 < PL_regxend &&
2304 /* I smell either [: or [= or [. -- POSIX has been here, right? */
2305 (*PL_regcomp_parse == ':' || *PL_regcomp_parse == '=' || *PL_regcomp_parse == '.')) {
2306 char posixccc = *PL_regcomp_parse;
2307 char* posixccs = PL_regcomp_parse++;
2308
2309 while (PL_regcomp_parse < PL_regxend && *PL_regcomp_parse != posixccc)
2310 PL_regcomp_parse++;
2311 if (PL_regcomp_parse == PL_regxend)
2312 /* Grandfather lone [:, [=, [. */
2313 PL_regcomp_parse = posixccs;
2314 else {
2315 PL_regcomp_parse++; /* skip over the posixccc */
2316 if (*PL_regcomp_parse == ']') {
2317 /* Not Implemented Yet.
2318 * (POSIX Extended Character Classes, that is)
2319 * The text between e.g. [: and :] would start
2320 * at posixccs + 1 and stop at regcomp_parse - 2. */
599cee73
PM
2321 if (ckWARN(WARN_UNSAFE) && !SIZE_ONLY)
2322 warner(WARN_UNSAFE,
2323 "Character class syntax [%c %c] is reserved for future extensions", posixccc, posixccc);
a0ed51b3
LW
2324 PL_regcomp_parse++; /* skip over the ending ] */
2325 }
2326 }
2327 }
2328
2329 if (value == '\\') {
dfe13c55 2330 value = utf8_to_uv((U8*)PL_regcomp_parse, &numlen);
a0ed51b3
LW
2331 PL_regcomp_parse += numlen;
2332 switch (value) {
2333 case 'w':
2334 if (!SIZE_ONLY) {
2335 if (LOC)
2336 flags |= ANYOF_ALNUML;
2337
2338 sv_catpvf(listsv, "+utf8::IsAlnum\n");
2339 }
2340 lastvalue = 123456;
2341 continue;
2342 case 'W':
2343 if (!SIZE_ONLY) {
2344 if (LOC)
2345 flags |= ANYOF_NALNUML;
2346
2347 sv_catpvf(listsv,
2348 "-utf8::IsAlpha\n-utf8::IsDigit\n0000\t%04x\n%04x\tffff\n",
2349 '_' - 1,
2350 '_' + 1);
2351 }
2352 lastvalue = 123456;
2353 continue;
2354 case 's':
2355 if (!SIZE_ONLY) {
2356 if (LOC)
2357 flags |= ANYOF_SPACEL;
2358 sv_catpvf(listsv, "+utf8::IsSpace\n");
2359 if (!PL_utf8_space)
dfe13c55 2360 is_utf8_space((U8*)" ");
a0ed51b3
LW
2361 }
2362 lastvalue = 123456;
2363 continue;
2364 case 'S':
2365 if (!SIZE_ONLY) {
2366 if (LOC)
2367 flags |= ANYOF_NSPACEL;
2368 sv_catpvf(listsv,
2369 "!utf8::IsSpace\n");
2370 if (!PL_utf8_space)
dfe13c55 2371 is_utf8_space((U8*)" ");
a0ed51b3
LW
2372 }
2373 lastvalue = 123456;
2374 continue;
2375 case 'd':
2376 if (!SIZE_ONLY) {
2377 sv_catpvf(listsv, "+utf8::IsDigit\n");
2378 }
2379 lastvalue = 123456;
2380 continue;
2381 case 'D':
2382 if (!SIZE_ONLY) {
2383 sv_catpvf(listsv,
2384 "!utf8::IsDigit\n");
2385 }
2386 lastvalue = 123456;
2387 continue;
2388 case 'p':
2389 case 'P':
2390 if (*PL_regcomp_parse == '{') {
2391 e = strchr(PL_regcomp_parse++, '}');
2392 if (!e)
2393 FAIL("Missing right brace on \\p{}");
2394 n = e - PL_regcomp_parse;
2395 }
2396 else {
2397 e = PL_regcomp_parse;
2398 n = 1;
2399 }
2400 if (!SIZE_ONLY) {
2401 if (value == 'p')
2402 sv_catpvf(listsv, "+utf8::%.*s\n", n, PL_regcomp_parse);
2403 else
2404 sv_catpvf(listsv,
2405 "!utf8::%.*s\n", n, PL_regcomp_parse);
2406 }
2407 PL_regcomp_parse = e + 1;
2408 lastvalue = 123456;
2409 continue;
2410 case 'n':
2411 value = '\n';
2412 break;
2413 case 'r':
2414 value = '\r';
2415 break;
2416 case 't':
2417 value = '\t';
2418 break;
2419 case 'f':
2420 value = '\f';
2421 break;
2422 case 'b':
2423 value = '\b';
2424 break;
2425 case 'e':
2426 value = '\033';
2427 break;
2428 case 'a':
2429 value = '\007';
2430 break;
2431 case 'x':
2432 if (*PL_regcomp_parse == '{') {
2433 e = strchr(PL_regcomp_parse++, '}');
2434 if (!e)
2435 FAIL("Missing right brace on \\x{}");
2436 value = scan_hex(PL_regcomp_parse + 1, e - PL_regcomp_parse, &numlen);
2437 PL_regcomp_parse = e + 1;
2438 }
2439 else {
2440 value = scan_hex(PL_regcomp_parse, 2, &numlen);
2441 PL_regcomp_parse += numlen;
2442 }
2443 break;
2444 case 'c':
2445 value = UCHARAT(PL_regcomp_parse++);
2446 value = toCTRL(value);
2447 break;
2448 case '0': case '1': case '2': case '3': case '4':
2449 case '5': case '6': case '7': case '8': case '9':
2450 value = scan_oct(--PL_regcomp_parse, 3, &numlen);
2451 PL_regcomp_parse += numlen;
2452 break;
2453 }
2454 }
2455 if (range) {
2456 if (lastvalue > value)
2457 FAIL("invalid [] range in regexp");
2458 if (!SIZE_ONLY)
2459 sv_catpvf(listsv, "%04x\t%04x\n", lastvalue, value);
2460 lastvalue = value;
2461 range = 0;
2462 }
2463 else {
2464 lastvalue = value;
2465 if (*PL_regcomp_parse == '-' && PL_regcomp_parse+1 < PL_regxend &&
2466 PL_regcomp_parse[1] != ']') {
2467 PL_regcomp_parse++;
2468 range = 1;
2469 continue; /* do it next time */
2470 }
2471 if (!SIZE_ONLY)
2472 sv_catpvf(listsv, "%04x\n", value);
2473 }
2474 }
a0ed51b3
LW
2475
2476 ret = reganode(ANYOFUTF8, 0);
2477
2478 if (!SIZE_ONLY) {
2479 SV *rv = swash_init("utf8", "", listsv, 1, 0);
2480 SvREFCNT_dec(listsv);
2481 n = add_data(1,"s");
2482 PL_regcomp_rx->data->data[n] = (void*)rv;
2483 ARG1_SET(ret, flags);
2484 ARG2_SET(ret, n);
2485 }
2486
2487 return ret;
2488}
2489
76e3520e 2490STATIC char*
8ac85365 2491nextchar(void)
a0d0e21e 2492{
5c0ca799 2493 dTHR;
3280af22 2494 char* retval = PL_regcomp_parse++;
a0d0e21e 2495
4633a7c4 2496 for (;;) {
3280af22
NIS
2497 if (*PL_regcomp_parse == '(' && PL_regcomp_parse[1] == '?' &&
2498 PL_regcomp_parse[2] == '#') {
2499 while (*PL_regcomp_parse && *PL_regcomp_parse != ')')
2500 PL_regcomp_parse++;
2501 PL_regcomp_parse++;
4633a7c4
LW
2502 continue;
2503 }
3280af22
NIS
2504 if (PL_regflags & PMf_EXTENDED) {
2505 if (isSPACE(*PL_regcomp_parse)) {
2506 PL_regcomp_parse++;
748a9306
LW
2507 continue;
2508 }
3280af22
NIS
2509 else if (*PL_regcomp_parse == '#') {
2510 while (*PL_regcomp_parse && *PL_regcomp_parse != '\n')
2511 PL_regcomp_parse++;
2512 PL_regcomp_parse++;
748a9306
LW
2513 continue;
2514 }
748a9306 2515 }
4633a7c4 2516 return retval;
a0d0e21e 2517 }
a687059c
LW
2518}
2519
2520/*
c277df42 2521- reg_node - emit a node
a0d0e21e 2522*/
76e3520e 2523STATIC regnode * /* Location. */
c277df42 2524reg_node(U8 op)
a687059c 2525{
5c0ca799 2526 dTHR;
c277df42
IZ
2527 register regnode *ret;
2528 register regnode *ptr;
a687059c 2529
3280af22 2530 ret = PL_regcode;
c277df42 2531 if (SIZE_ONLY) {
6b88bc9c 2532 SIZE_ALIGN(PL_regsize);
3280af22 2533 PL_regsize += 1;
a0d0e21e
LW
2534 return(ret);
2535 }
a687059c 2536
c277df42 2537 NODE_ALIGN_FILL(ret);
a0d0e21e 2538 ptr = ret;
c277df42 2539 FILL_ADVANCE_NODE(ptr, op);
3280af22 2540 PL_regcode = ptr;
a687059c 2541
a0d0e21e 2542 return(ret);
a687059c
LW
2543}
2544
2545/*
a0d0e21e
LW
2546- reganode - emit a node with an argument
2547*/
76e3520e 2548STATIC regnode * /* Location. */
c277df42 2549reganode(U8 op, U32 arg)
fe14fcc3 2550{
5c0ca799 2551 dTHR;
c277df42
IZ
2552 register regnode *ret;
2553 register regnode *ptr;
fe14fcc3 2554
3280af22 2555 ret = PL_regcode;
c277df42 2556 if (SIZE_ONLY) {
6b88bc9c 2557 SIZE_ALIGN(PL_regsize);
3280af22 2558 PL_regsize += 2;
a0d0e21e
LW
2559 return(ret);
2560 }
fe14fcc3 2561
c277df42 2562 NODE_ALIGN_FILL(ret);
a0d0e21e 2563 ptr = ret;
c277df42 2564 FILL_ADVANCE_NODE_ARG(ptr, op, arg);
3280af22 2565 PL_regcode = ptr;
fe14fcc3 2566
a0d0e21e 2567 return(ret);
fe14fcc3
LW
2568}
2569
2570/*
a0ed51b3
LW
2571- regc - emit (if appropriate) a Unicode character
2572*/
2573STATIC void
2574reguni(UV uv, char* s, I32* lenp)
2575{
c485e607 2576 dTHR;
a0ed51b3 2577 if (SIZE_ONLY) {
dfe13c55 2578 U8 tmpbuf[10];
a0ed51b3
LW
2579 *lenp = uv_to_utf8(tmpbuf, uv) - tmpbuf;
2580 }
2581 else
dfe13c55 2582 *lenp = uv_to_utf8((U8*)s, uv) - (U8*)s;
a0ed51b3
LW
2583
2584}
2585
2586/*
a0d0e21e
LW
2587- regc - emit (if appropriate) a byte of code
2588*/
76e3520e 2589STATIC void
c277df42 2590regc(U8 b, char* s)
a687059c 2591{
5c0ca799 2592 dTHR;
c277df42
IZ
2593 if (!SIZE_ONLY)
2594 *s = b;
a687059c
LW
2595}
2596
2597/*
a0d0e21e
LW
2598- reginsert - insert an operator in front of already-emitted operand
2599*
2600* Means relocating the operand.
2601*/
76e3520e 2602STATIC void
c277df42 2603reginsert(U8 op, regnode *opnd)
a687059c 2604{
5c0ca799 2605 dTHR;
c277df42
IZ
2606 register regnode *src;
2607 register regnode *dst;
2608 register regnode *place;
2609 register int offset = regarglen[(U8)op];
2610
2611/* (regkind[(U8)op] == CURLY ? EXTRA_STEP_2ARGS : 0); */
2612
2613 if (SIZE_ONLY) {
3280af22 2614 PL_regsize += NODE_STEP_REGNODE + offset;
a0d0e21e
LW
2615 return;
2616 }
a687059c 2617
3280af22
NIS
2618 src = PL_regcode;
2619 PL_regcode += NODE_STEP_REGNODE + offset;
2620 dst = PL_regcode;
a0d0e21e 2621 while (src > opnd)
c277df42 2622 StructCopy(--src, --dst, regnode);
a0d0e21e
LW
2623
2624 place = opnd; /* Op node, where operand used to be. */
c277df42
IZ
2625 src = NEXTOPER(place);
2626 FILL_ADVANCE_NODE(place, op);
2627 Zero(src, offset, regnode);
a687059c
LW
2628}
2629
2630/*
c277df42 2631- regtail - set the next-pointer at the end of a node chain of p to val.
a0d0e21e 2632*/
76e3520e 2633STATIC void
c277df42 2634regtail(regnode *p, regnode *val)
a687059c 2635{
5c0ca799 2636 dTHR;
c277df42
IZ
2637 register regnode *scan;
2638 register regnode *temp;
a0d0e21e
LW
2639 register I32 offset;
2640
c277df42 2641 if (SIZE_ONLY)
a0d0e21e
LW
2642 return;
2643
2644 /* Find last node. */
2645 scan = p;
2646 for (;;) {
2647 temp = regnext(scan);
2648 if (temp == NULL)
2649 break;
2650 scan = temp;
2651 }
a687059c 2652
c277df42
IZ
2653 if (reg_off_by_arg[OP(scan)]) {
2654 ARG_SET(scan, val - scan);
a0ed51b3
LW
2655 }
2656 else {
c277df42
IZ
2657 NEXT_OFF(scan) = val - scan;
2658 }
a687059c
LW
2659}
2660
2661/*
a0d0e21e
LW
2662- regoptail - regtail on operand of first argument; nop if operandless
2663*/
76e3520e 2664STATIC void
c277df42 2665regoptail(regnode *p, regnode *val)
a687059c 2666{
5c0ca799 2667 dTHR;
a0d0e21e 2668 /* "Operandless" and "op != BRANCH" are synonymous in practice. */
c277df42
IZ
2669 if (p == NULL || SIZE_ONLY)
2670 return;
2671 if (regkind[(U8)OP(p)] == BRANCH) {
2672 regtail(NEXTOPER(p), val);
a0ed51b3
LW
2673 }
2674 else if ( regkind[(U8)OP(p)] == BRANCHJ) {
c277df42 2675 regtail(NEXTOPER(NEXTOPER(p)), val);
a0ed51b3
LW
2676 }
2677 else
a0d0e21e 2678 return;
a687059c
LW
2679}
2680
2681/*
2682 - regcurly - a little FSA that accepts {\d+,?\d*}
2683 */
79072805 2684STATIC I32
8ac85365 2685regcurly(register char *s)
a687059c
LW
2686{
2687 if (*s++ != '{')
2688 return FALSE;
f0fcb552 2689 if (!isDIGIT(*s))
a687059c 2690 return FALSE;
f0fcb552 2691 while (isDIGIT(*s))
a687059c
LW
2692 s++;
2693 if (*s == ',')
2694 s++;
f0fcb552 2695 while (isDIGIT(*s))
a687059c
LW
2696 s++;
2697 if (*s != '}')
2698 return FALSE;
2699 return TRUE;
2700}
2701
a687059c 2702
76e3520e 2703STATIC regnode *
c277df42
IZ
2704dumpuntil(regnode *start, regnode *node, regnode *last, SV* sv, I32 l)
2705{
35ff7856 2706#ifdef DEBUGGING
c277df42
IZ
2707 register char op = EXACT; /* Arbitrary non-END op. */
2708 register regnode *next, *onode;
2709
2710 while (op != END && (!last || node < last)) {
2711 /* While that wasn't END last time... */
2712
2713 NODE_ALIGN(node);
2714 op = OP(node);
2715 if (op == CLOSE)
2716 l--;
2717 next = regnext(node);
2718 /* Where, what. */
2719 if (OP(node) == OPTIMIZED)
2720 goto after_print;
2721 regprop(sv, node);
54dc92de 2722 PerlIO_printf(Perl_debug_log, "%4d:%*s%s", node - start,
c277df42
IZ
2723 2*l + 1, "", SvPVX(sv));
2724 if (next == NULL) /* Next ptr. */
2725 PerlIO_printf(Perl_debug_log, "(0)");
2726 else
2727 PerlIO_printf(Perl_debug_log, "(%d)", next - start);
2728 (void)PerlIO_putc(Perl_debug_log, '\n');
2729 after_print:
2730 if (regkind[(U8)op] == BRANCHJ) {
2731 register regnode *nnode = (OP(next) == LONGJMP
2732 ? regnext(next)
2733 : next);
2734 if (last && nnode > last)
2735 nnode = last;
2736 node = dumpuntil(start, NEXTOPER(NEXTOPER(node)), nnode, sv, l + 1);
a0ed51b3
LW
2737 }
2738 else if (regkind[(U8)op] == BRANCH) {
c277df42 2739 node = dumpuntil(start, NEXTOPER(node), next, sv, l + 1);
a0ed51b3
LW
2740 }
2741 else if ( op == CURLY) { /* `next' might be very big: optimizer */
c277df42
IZ
2742 node = dumpuntil(start, NEXTOPER(node) + EXTRA_STEP_2ARGS,
2743 NEXTOPER(node) + EXTRA_STEP_2ARGS + 1, sv, l + 1);
a0ed51b3
LW
2744 }
2745 else if (regkind[(U8)op] == CURLY && op != CURLYX) {
c277df42
IZ
2746 node = dumpuntil(start, NEXTOPER(node) + EXTRA_STEP_2ARGS,
2747 next, sv, l + 1);
a0ed51b3
LW
2748 }
2749 else if ( op == PLUS || op == STAR) {
c277df42 2750 node = dumpuntil(start, NEXTOPER(node), NEXTOPER(node) + 1, sv, l + 1);
a0ed51b3
LW
2751 }
2752 else if (op == ANYOF) {
c277df42
IZ
2753 node = NEXTOPER(node);
2754 node += ANY_SKIP;
a0ed51b3
LW
2755 }
2756 else if (regkind[(U8)op] == EXACT) {
c277df42
IZ
2757 /* Literal string, where present. */
2758 node += ((*OPERAND(node)) + 2 + sizeof(regnode) - 1) / sizeof(regnode);
2759 node = NEXTOPER(node);
a0ed51b3
LW
2760 }
2761 else {
c277df42
IZ
2762 node = NEXTOPER(node);
2763 node += regarglen[(U8)op];
2764 }
2765 if (op == CURLYX || op == OPEN)
2766 l++;
2767 else if (op == WHILEM)
2768 l--;
2769 }
17c3b450 2770#endif /* DEBUGGING */
c277df42
IZ
2771 return node;
2772}
2773
a687059c 2774/*
fd181c75 2775 - regdump - dump a regexp onto Perl_debug_log in vaguely comprehensible form
a687059c
LW
2776 */
2777void
8ac85365 2778regdump(regexp *r)
a687059c 2779{
35ff7856 2780#ifdef DEBUGGING
5c0ca799 2781 dTHR;
46fc3d4c 2782 SV *sv = sv_newmortal();
a687059c 2783
c277df42 2784 (void)dumpuntil(r->program, r->program + 1, NULL, sv, 0);
a0d0e21e
LW
2785
2786 /* Header fields of interest. */
c277df42
IZ
2787 if (r->anchored_substr)
2788 PerlIO_printf(Perl_debug_log, "anchored `%s%s%s'%s at %d ",
3280af22 2789 PL_colors[0],
c277df42 2790 SvPVX(r->anchored_substr),
3280af22 2791 PL_colors[1],
c277df42
IZ
2792 SvTAIL(r->anchored_substr) ? "$" : "",
2793 r->anchored_offset);
2794 if (r->float_substr)
2795 PerlIO_printf(Perl_debug_log, "floating `%s%s%s'%s at %d..%u ",
3280af22 2796 PL_colors[0],
c277df42 2797 SvPVX(r->float_substr),
3280af22 2798 PL_colors[1],
c277df42
IZ
2799 SvTAIL(r->float_substr) ? "$" : "",
2800 r->float_min_offset, r->float_max_offset);
2801 if (r->check_substr)
2802 PerlIO_printf(Perl_debug_log,
2803 r->check_substr == r->float_substr
2804 ? "(checking floating" : "(checking anchored");
2805 if (r->reganch & ROPT_NOSCAN)
2806 PerlIO_printf(Perl_debug_log, " noscan");
2807 if (r->reganch & ROPT_CHECK_ALL)
2808 PerlIO_printf(Perl_debug_log, " isall");
2809 if (r->check_substr)
2810 PerlIO_printf(Perl_debug_log, ") ");
2811
46fc3d4c 2812 if (r->regstclass) {
2813 regprop(sv, r->regstclass);
2814 PerlIO_printf(Perl_debug_log, "stclass `%s' ", SvPVX(sv));
2815 }
774d564b 2816 if (r->reganch & ROPT_ANCH) {
2817 PerlIO_printf(Perl_debug_log, "anchored");
2818 if (r->reganch & ROPT_ANCH_BOL)
2819 PerlIO_printf(Perl_debug_log, "(BOL)");
c277df42
IZ
2820 if (r->reganch & ROPT_ANCH_MBOL)
2821 PerlIO_printf(Perl_debug_log, "(MBOL)");
774d564b 2822 if (r->reganch & ROPT_ANCH_GPOS)
2823 PerlIO_printf(Perl_debug_log, "(GPOS)");
2824 PerlIO_putc(Perl_debug_log, ' ');
2825 }
c277df42
IZ
2826 if (r->reganch & ROPT_GPOS_SEEN)
2827 PerlIO_printf(Perl_debug_log, "GPOS ");
a0d0e21e 2828 if (r->reganch & ROPT_SKIP)
760ac839 2829 PerlIO_printf(Perl_debug_log, "plus ");
a0d0e21e 2830 if (r->reganch & ROPT_IMPLICIT)
760ac839 2831 PerlIO_printf(Perl_debug_log, "implicit ");
760ac839 2832 PerlIO_printf(Perl_debug_log, "minlen %ld ", (long) r->minlen);
ce862d02
IZ
2833 if (r->reganch & ROPT_EVAL_SEEN)
2834 PerlIO_printf(Perl_debug_log, "with eval ");
760ac839 2835 PerlIO_printf(Perl_debug_log, "\n");
17c3b450 2836#endif /* DEBUGGING */
a687059c
LW
2837}
2838
2839/*
a0d0e21e
LW
2840- regprop - printable representation of opcode
2841*/
46fc3d4c 2842void
c277df42 2843regprop(SV *sv, regnode *o)
a687059c 2844{
35ff7856 2845#ifdef DEBUGGING
5c0ca799 2846 dTHR;
a0d0e21e
LW
2847 register char *p = 0;
2848
54dc92de 2849 sv_setpvn(sv, "", 0);
11343788 2850 switch (OP(o)) {
a0d0e21e
LW
2851 case BOL:
2852 p = "BOL";
2853 break;
2854 case MBOL:
2855 p = "MBOL";
2856 break;
2857 case SBOL:
2858 p = "SBOL";
2859 break;
2860 case EOL:
2861 p = "EOL";
2862 break;
b85d18e9
IZ
2863 case EOS:
2864 p = "EOS";
2865 break;
a0d0e21e
LW
2866 case MEOL:
2867 p = "MEOL";
2868 break;
2869 case SEOL:
2870 p = "SEOL";
2871 break;
2872 case ANY:
2873 p = "ANY";
2874 break;
2875 case SANY:
2876 p = "SANY";
2877 break;
a0ed51b3
LW
2878 case ANYUTF8:
2879 p = "ANYUTF8";
2880 break;
2881 case SANYUTF8:
2882 p = "SANYUTF8";
2883 break;
2884 case ANYOFUTF8:
2885 p = "ANYOFUTF8";
2886 break;
a0d0e21e
LW
2887 case ANYOF:
2888 p = "ANYOF";
2889 break;
2890 case BRANCH:
2891 p = "BRANCH";
2892 break;
bbce6d69 2893 case EXACT:
3280af22 2894 sv_catpvf(sv, "EXACT <%s%s%s>", PL_colors[0], OPERAND(o) + 1, PL_colors[1]);
bbce6d69 2895 break;
2896 case EXACTF:
3280af22 2897 sv_catpvf(sv, "EXACTF <%s%s%s>", PL_colors[0], OPERAND(o) + 1, PL_colors[1]);
bbce6d69 2898 break;
2899 case EXACTFL:
3280af22 2900 sv_catpvf(sv, "EXACTFL <%s%s%s>", PL_colors[0], OPERAND(o) + 1, PL_colors[1]);
a0d0e21e
LW
2901 break;
2902 case NOTHING:
2903 p = "NOTHING";
2904 break;
c277df42
IZ
2905 case TAIL:
2906 p = "TAIL";
2907 break;
a0d0e21e
LW
2908 case BACK:
2909 p = "BACK";
2910 break;
2911 case END:
2912 p = "END";
2913 break;
a0d0e21e
LW
2914 case BOUND:
2915 p = "BOUND";
2916 break;
bbce6d69 2917 case BOUNDL:
2918 p = "BOUNDL";
2919 break;
a0d0e21e
LW
2920 case NBOUND:
2921 p = "NBOUND";
2922 break;
bbce6d69 2923 case NBOUNDL:
2924 p = "NBOUNDL";
a0d0e21e
LW
2925 break;
2926 case CURLY:
5dc0d613 2927 sv_catpvf(sv, "CURLY {%d,%d}", ARG1(o), ARG2(o));
a0d0e21e 2928 break;
c277df42 2929 case CURLYM:
c277df42 2930 sv_catpvf(sv, "CURLYM[%d] {%d,%d}", o->flags, ARG1(o), ARG2(o));
c277df42
IZ
2931 break;
2932 case CURLYN:
c277df42 2933 sv_catpvf(sv, "CURLYN[%d] {%d,%d}", o->flags, ARG1(o), ARG2(o));
c277df42 2934 break;
a0d0e21e 2935 case CURLYX:
5dc0d613 2936 sv_catpvf(sv, "CURLYX {%d,%d}", ARG1(o), ARG2(o));
a0d0e21e
LW
2937 break;
2938 case REF:
c277df42 2939 sv_catpvf(sv, "REF%d", ARG(o));
a0d0e21e 2940 break;
c8756f30 2941 case REFF:
c277df42 2942 sv_catpvf(sv, "REFF%d", ARG(o));
c8756f30
AK
2943 break;
2944 case REFFL:
c277df42 2945 sv_catpvf(sv, "REFFL%d", ARG(o));
c8756f30 2946 break;
a0d0e21e 2947 case OPEN:
c277df42 2948 sv_catpvf(sv, "OPEN%d", ARG(o));
a0d0e21e
LW
2949 break;
2950 case CLOSE:
c277df42 2951 sv_catpvf(sv, "CLOSE%d", ARG(o));
a0d0e21e
LW
2952 p = NULL;
2953 break;
2954 case STAR:
2955 p = "STAR";
2956 break;
2957 case PLUS:
2958 p = "PLUS";
2959 break;
2960 case MINMOD:
2961 p = "MINMOD";
2962 break;
774d564b 2963 case GPOS:
2964 p = "GPOS";
a0d0e21e
LW
2965 break;
2966 case UNLESSM:
c277df42 2967 sv_catpvf(sv, "UNLESSM[-%d]", o->flags);
a0d0e21e
LW
2968 break;
2969 case IFMATCH:
c277df42 2970 sv_catpvf(sv, "IFMATCH[-%d]", o->flags);
a0d0e21e
LW
2971 break;
2972 case SUCCEED:
2973 p = "SUCCEED";
2974 break;
2975 case WHILEM:
2976 p = "WHILEM";
2977 break;
bbce6d69 2978 case DIGIT:
2979 p = "DIGIT";
2980 break;
2981 case NDIGIT:
2982 p = "NDIGIT";
2983 break;
2984 case ALNUM:
2985 p = "ALNUM";
2986 break;
2987 case NALNUM:
2988 p = "NALNUM";
2989 break;
2990 case SPACE:
2991 p = "SPACE";
2992 break;
2993 case NSPACE:
2994 p = "NSPACE";
2995 break;
2996 case ALNUML:
2997 p = "ALNUML";
2998 break;
2999 case NALNUML:
3000 p = "NALNUML";
3001 break;
3002 case SPACEL:
3003 p = "SPACEL";
3004 break;
3005 case NSPACEL:
3006 p = "NSPACEL";
3007 break;
c277df42
IZ
3008 case EVAL:
3009 p = "EVAL";
3010 break;
3011 case LONGJMP:
3012 p = "LONGJMP";
3013 break;
3014 case BRANCHJ:
3015 p = "BRANCHJ";
3016 break;
3017 case IFTHEN:
3018 p = "IFTHEN";
3019 break;
3020 case GROUPP:
3021 sv_catpvf(sv, "GROUPP%d", ARG(o));
3022 break;
3023 case LOGICAL:
3024 p = "LOGICAL";
3025 break;
3026 case SUSPEND:
3027 p = "SUSPEND";
3028 break;
3029 case RENUM:
3030 p = "RENUM";
3031 break;
3032 case OPTIMIZED:
3033 p = "OPTIMIZED";
3034 break;
a0d0e21e
LW
3035 default:
3036 FAIL("corrupted regexp opcode");
3037 }
46fc3d4c 3038 if (p)
3039 sv_catpv(sv, p);
17c3b450 3040#endif /* DEBUGGING */
35ff7856 3041}
a687059c 3042
2b69d0c2 3043void
8ac85365 3044pregfree(struct regexp *r)
a687059c 3045{
5c0ca799 3046 dTHR;
c277df42 3047 if (!r || (--r->refcnt > 0))
a0d0e21e 3048 return;
c277df42 3049 if (r->precomp)
a0d0e21e 3050 Safefree(r->precomp);
c277df42 3051 if (r->subbase)
a0d0e21e 3052 Safefree(r->subbase);
a193d654
GS
3053 if (r->substrs) {
3054 if (r->anchored_substr)
3055 SvREFCNT_dec(r->anchored_substr);
3056 if (r->float_substr)
3057 SvREFCNT_dec(r->float_substr);
2779dcf1 3058 Safefree(r->substrs);
a193d654 3059 }
c277df42
IZ
3060 if (r->data) {
3061 int n = r->data->count;
3062 while (--n >= 0) {
3063 switch (r->data->what[n]) {
3064 case 's':
3065 SvREFCNT_dec((SV*)r->data->data[n]);
3066 break;
3067 case 'o':
3068 op_free((OP_4tree*)r->data->data[n]);
3069 break;
3070 case 'n':
3071 break;
3072 default:
3073 FAIL2("panic: regfree data code '%c'", r->data->what[n]);
3074 }
3075 }
3076 Safefree(r->data->what);
3077 Safefree(r->data);
a0d0e21e
LW
3078 }
3079 Safefree(r->startp);
3080 Safefree(r->endp);
3081 Safefree(r);
a687059c 3082}
c277df42
IZ
3083
3084/*
3085 - regnext - dig the "next" pointer out of a node
3086 *
3087 * [Note, when REGALIGN is defined there are two places in regmatch()
3088 * that bypass this code for speed.]
3089 */
3090regnode *
3091regnext(register regnode *p)
3092{
5c0ca799 3093 dTHR;
c277df42
IZ
3094 register I32 offset;
3095
3280af22 3096 if (p == &PL_regdummy)
c277df42
IZ
3097 return(NULL);
3098
3099 offset = (reg_off_by_arg[OP(p)] ? ARG(p) : NEXT_OFF(p));
3100 if (offset == 0)
3101 return(NULL);
3102
c277df42 3103 return(p+offset);
c277df42
IZ
3104}
3105
01f988be 3106STATIC void
c277df42 3107re_croak2(const char* pat1,const char* pat2,...)
c277df42
IZ
3108{
3109 va_list args;
3110 STRLEN l1 = strlen(pat1);
3111 STRLEN l2 = strlen(pat2);
3112 char buf[512];
3113 char *message;
3114
3115 if (l1 > 510)
3116 l1 = 510;
3117 if (l1 + l2 > 510)
3118 l2 = 510 - l1;
3119 Copy(pat1, buf, l1 , char);
3120 Copy(pat2, buf + l1, l2 , char);
3b818b81
GS
3121 buf[l1 + l2] = '\n';
3122 buf[l1 + l2 + 1] = '\0';
c277df42 3123 va_start(args, pat2);
c277df42
IZ
3124 message = mess(buf, &args);
3125 va_end(args);
3126 l1 = strlen(message);
3127 if (l1 > 512)
3128 l1 = 512;
3129 Copy(message, buf, l1 , char);
3130 buf[l1] = '\0'; /* Overwrite \n */
3131 croak("%s", buf);
3132}
a0ed51b3
LW
3133
3134/* XXX Here's a total kludge. But we need to re-enter for swash routines. */
3135
3136void
3137save_re_context(void)
c485e607
NIS
3138{
3139 dTHR;
a0ed51b3
LW
3140 SAVEPPTR(PL_bostr);
3141 SAVEPPTR(PL_regprecomp); /* uncompiled string. */
3142 SAVEI32(PL_regnpar); /* () count. */
3143 SAVEI32(PL_regsize); /* Code size. */
3144 SAVEI16(PL_regflags); /* are we folding, multilining? */
3145 SAVEPPTR(PL_reginput); /* String-input pointer. */
3146 SAVEPPTR(PL_regbol); /* Beginning of input, for ^ check. */
3147 SAVEPPTR(PL_regeol); /* End of input, for $ check. */
3148 SAVESPTR(PL_regstartp); /* Pointer to startp array. */
3149 SAVESPTR(PL_regendp); /* Ditto for endp. */
3150 SAVESPTR(PL_reglastparen); /* Similarly for lastparen. */
3151 SAVEPPTR(PL_regtill); /* How far we are required to go. */
3152 SAVEI32(PL_regprev); /* char before regbol, \n if none */
3153 SAVESPTR(PL_reg_start_tmp); /* from regexec.c */
3154 PL_reg_start_tmp = 0;
3155 SAVEFREEPV(PL_reg_start_tmp);
3156 SAVEI32(PL_reg_start_tmpl); /* from regexec.c */
3157 PL_reg_start_tmpl = 0;
3158 SAVESPTR(PL_regdata);
3159 SAVEI32(PL_reg_flags); /* from regexec.c */
3160 SAVEI32(PL_reg_eval_set); /* from regexec.c */
3161 SAVEI32(PL_regnarrate); /* from regexec.c */
3162 SAVESPTR(PL_regprogram); /* from regexec.c */
3163 SAVEINT(PL_regindent); /* from regexec.c */
3164 SAVESPTR(PL_regcc); /* from regexec.c */
3165 SAVESPTR(PL_curcop);
3166 SAVESPTR(PL_regcomp_rx); /* from regcomp.c */
3167 SAVEI32(PL_regseen); /* from regcomp.c */
3168 SAVEI32(PL_regsawback); /* Did we see \1, ...? */
3169 SAVEI32(PL_regnaughty); /* How bad is this pattern? */
3170 SAVESPTR(PL_regcode); /* Code-emit pointer; &regdummy = don't */
3171 SAVEPPTR(PL_regxend); /* End of input for compile */
3172 SAVEPPTR(PL_regcomp_parse); /* Input-scan pointer. */
3173}