This is a live mirror of the Perl 5 development currently hosted at https://github.com/perl/perl5
Eliminate superfluous RV2p[AH]Vs in oops[AH]V()
[perl5.git] / regcomp.c
CommitLineData
a0d0e21e
LW
1/* regcomp.c
2 */
3
4/*
5 * "A fair jaw-cracker dwarf-language must be." --Samwise Gamgee
6 */
7
a687059c
LW
8/* NOTE: this is derived from Henry Spencer's regexp code, and should not
9 * confused with the original package (see point 3 below). Thanks, Henry!
10 */
11
12/* Additional note: this code is very heavily munged from Henry's version
13 * in places. In some spots I've traded clarity for efficiency, so don't
14 * blame Henry for some of the lack of readability.
15 */
16
e50aee73
AD
17/* The names of the functions have been changed from regcomp and
18 * regexec to pregcomp and pregexec in order to avoid conflicts
19 * with the POSIX routines of the same names.
20*/
21
b9d5759e
AD
22#ifdef PERL_EXT_RE_BUILD
23/* need to replace pregcomp et al, so enable that */
24# ifndef PERL_IN_XSUB_RE
25# define PERL_IN_XSUB_RE
26# endif
27/* need access to debugger hooks */
28# ifndef DEBUGGING
29# define DEBUGGING
30# endif
31#endif
32
33#ifdef PERL_IN_XSUB_RE
d06ea78c 34/* We *really* need to overwrite these symbols: */
56953603
IZ
35# define Perl_pregcomp my_regcomp
36# define Perl_regdump my_regdump
37# define Perl_regprop my_regprop
d06ea78c
GS
38/* *These* symbols are masked to allow static link. */
39# define Perl_pregfree my_regfree
40# define Perl_regnext my_regnext
56953603
IZ
41#endif
42
f0fcb552 43/*SUPPRESS 112*/
a687059c 44/*
e50aee73 45 * pregcomp and pregexec -- regsub and regerror are not used in perl
a687059c
LW
46 *
47 * Copyright (c) 1986 by University of Toronto.
48 * Written by Henry Spencer. Not derived from licensed software.
49 *
50 * Permission is granted to anyone to use this software for any
51 * purpose on any computer system, and to redistribute it freely,
52 * subject to the following restrictions:
53 *
54 * 1. The author is not responsible for the consequences of use of
55 * this software, no matter how awful, even if they arise
56 * from defects in it.
57 *
58 * 2. The origin of this software must not be misrepresented, either
59 * by explicit claim or by omission.
60 *
61 * 3. Altered versions must be plainly marked as such, and must not
62 * be misrepresented as being the original software.
63 *
64 *
65 **** Alterations to Henry's code are...
66 ****
a0ed51b3 67 **** Copyright (c) 1991-1998, Larry Wall
a687059c 68 ****
9ef589d8
LW
69 **** You may distribute under the terms of either the GNU General Public
70 **** License or the Artistic License, as specified in the README file.
71
a687059c
LW
72 *
73 * Beware that some of this code is subtly aware of the way operator
74 * precedence is structured in regular expressions. Serious changes in
75 * regular-expression syntax might require a total rethink.
76 */
77#include "EXTERN.h"
78#include "perl.h"
d06ea78c 79
b9d5759e 80#ifndef PERL_IN_XSUB_RE
d06ea78c
GS
81# include "INTERN.h"
82#endif
c277df42
IZ
83
84#define REG_COMP_C
a687059c
LW
85#include "regcomp.h"
86
d4cce5f1 87#ifdef op
11343788 88#undef op
d4cce5f1 89#endif /* op */
11343788 90
fe14fcc3
LW
91#ifdef MSDOS
92# if defined(BUGGY_MSC6)
93 /* MSC 6.00A breaks on op/regexp.t test 85 unless we turn this off */
94 # pragma optimize("a",off)
95 /* But MSC 6.00A is happy with 'w', for aliases only across function calls*/
96 # pragma optimize("w",on )
97# endif /* BUGGY_MSC6 */
98#endif /* MSDOS */
99
a687059c
LW
100#ifndef STATIC
101#define STATIC static
102#endif
103
104#define ISMULT1(c) ((c) == '*' || (c) == '+' || (c) == '?')
105#define ISMULT2(s) ((*s) == '*' || (*s) == '+' || (*s) == '?' || \
106 ((*s) == '{' && regcurly(s)))
2b69d0c2
LW
107#ifdef atarist
108#define PERL_META "^$.[()|?+*\\"
109#else
a687059c 110#define META "^$.[()|?+*\\"
2b69d0c2 111#endif
a687059c 112
35c8bce7
LW
113#ifdef SPSTART
114#undef SPSTART /* dratted cpp namespace... */
115#endif
a687059c
LW
116/*
117 * Flags to be passed up and down.
118 */
a687059c 119#define WORST 0 /* Worst case. */
821b33a5 120#define HASWIDTH 0x1 /* Known to match non-null strings. */
a0d0e21e
LW
121#define SIMPLE 0x2 /* Simple enough to be STAR/PLUS operand. */
122#define SPSTART 0x4 /* Starts with * or +. */
123#define TRYAGAIN 0x8 /* Weeded out a declaration. */
a687059c
LW
124
125/*
e50aee73 126 * Forward declarations for pregcomp()'s friends.
a687059c 127 */
a0d0e21e 128
76e3520e 129#ifndef PERL_OBJECT
c277df42
IZ
130static regnode *reg _((I32, I32 *));
131static regnode *reganode _((U8, U32));
132static regnode *regatom _((I32 *));
133static regnode *regbranch _((I32 *, I32));
134static void regc _((U8, char *));
a0ed51b3 135static void reguni _((UV, char *, I32*));
c277df42 136static regnode *regclass _((void));
a0ed51b3 137static regnode *regclassutf8 _((void));
a0d0e21e 138STATIC I32 regcurly _((char *));
c277df42
IZ
139static regnode *reg_node _((U8));
140static regnode *regpiece _((I32 *));
141static void reginsert _((U8, regnode *));
142static void regoptail _((regnode *, regnode *));
c277df42 143static void regtail _((regnode *, regnode *));
873ef191 144static char* regwhite _((char *, char *));
a0d0e21e 145static char* nextchar _((void));
3bd495df 146static void re_croak2 _((const char* pat1,const char* pat2,...)) __attribute__((noreturn));
76e3520e 147#endif
a687059c 148
c277df42
IZ
149/* Length of a variant. */
150
76e3520e 151#ifndef PERL_OBJECT
c277df42
IZ
152typedef struct {
153 I32 len_min;
154 I32 len_delta;
a0ed51b3
LW
155 I32 pos_min; /* CC */
156 I32 pos_delta; /* CC */
c277df42
IZ
157 SV *last_found;
158 I32 last_end; /* min value, <0 unless valid. */
a0ed51b3
LW
159 I32 last_start_min; /* CC */
160 I32 last_start_max; /* CC */
c277df42
IZ
161 SV **longest; /* Either &l_fixed, or &l_float. */
162 SV *longest_fixed;
a0ed51b3 163 I32 offset_fixed; /* CC */
c277df42 164 SV *longest_float;
a0ed51b3
LW
165 I32 offset_float_min; /* CC */
166 I32 offset_float_max; /* CC */
c277df42
IZ
167 I32 flags;
168} scan_data_t;
76e3520e 169#endif
c277df42
IZ
170
171static scan_data_t zero_scan_data = { 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 };
172
173#define SF_BEFORE_EOL (SF_BEFORE_SEOL|SF_BEFORE_MEOL)
174#define SF_BEFORE_SEOL 0x1
175#define SF_BEFORE_MEOL 0x2
176#define SF_FIX_BEFORE_EOL (SF_FIX_BEFORE_SEOL|SF_FIX_BEFORE_MEOL)
177#define SF_FL_BEFORE_EOL (SF_FL_BEFORE_SEOL|SF_FL_BEFORE_MEOL)
178
09b7f37c
CB
179#ifdef NO_UNARY_PLUS
180# define SF_FIX_SHIFT_EOL (0+2)
181# define SF_FL_SHIFT_EOL (0+4)
182#else
183# define SF_FIX_SHIFT_EOL (+2)
184# define SF_FL_SHIFT_EOL (+4)
185#endif
c277df42
IZ
186
187#define SF_FIX_BEFORE_SEOL (SF_BEFORE_SEOL << SF_FIX_SHIFT_EOL)
188#define SF_FIX_BEFORE_MEOL (SF_BEFORE_MEOL << SF_FIX_SHIFT_EOL)
189
190#define SF_FL_BEFORE_SEOL (SF_BEFORE_SEOL << SF_FL_SHIFT_EOL)
191#define SF_FL_BEFORE_MEOL (SF_BEFORE_MEOL << SF_FL_SHIFT_EOL) /* 0x20 */
192#define SF_IS_INF 0x40
193#define SF_HAS_PAR 0x80
194#define SF_IN_PAR 0x100
195#define SF_HAS_EVAL 0x200
4bfe0158 196#define SCF_DO_SUBSTR 0x400
c277df42 197
a0ed51b3
LW
198#define RF_utf8 8
199#define UTF (PL_reg_flags & RF_utf8)
200#define LOC (PL_regflags & PMf_LOCALE)
201#define FOLD (PL_regflags & PMf_FOLD)
202
203#define CHR_SVLEN(sv) (UTF ? sv_len_utf8(sv) : SvCUR(sv))
204#define CHR_DIST(a,b) (UTF ? utf8_distance(a,b) : a - b)
205
76e3520e 206STATIC void
c277df42
IZ
207scan_commit(scan_data_t *data)
208{
c485e607 209 dTHR;
a0ed51b3
LW
210 STRLEN l = CHR_SVLEN(data->last_found);
211 STRLEN old_l = CHR_SVLEN(*data->longest);
c277df42
IZ
212
213 if ((l >= old_l) && ((l > old_l) || (data->flags & SF_BEFORE_EOL))) {
214 sv_setsv(*data->longest, data->last_found);
215 if (*data->longest == data->longest_fixed) {
216 data->offset_fixed = l ? data->last_start_min : data->pos_min;
217 if (data->flags & SF_BEFORE_EOL)
218 data->flags
219 |= ((data->flags & SF_BEFORE_EOL) << SF_FIX_SHIFT_EOL);
220 else
221 data->flags &= ~SF_FIX_BEFORE_EOL;
a0ed51b3
LW
222 }
223 else {
c277df42
IZ
224 data->offset_float_min = l ? data->last_start_min : data->pos_min;
225 data->offset_float_max = (l
226 ? data->last_start_max
227 : data->pos_min + data->pos_delta);
228 if (data->flags & SF_BEFORE_EOL)
229 data->flags
230 |= ((data->flags & SF_BEFORE_EOL) << SF_FL_SHIFT_EOL);
231 else
232 data->flags &= ~SF_FL_BEFORE_EOL;
233 }
234 }
235 SvCUR_set(data->last_found, 0);
236 data->last_end = -1;
237 data->flags &= ~SF_BEFORE_EOL;
238}
239
c277df42
IZ
240/* Stops at toplevel WHILEM as well as at `last'. At end *scanp is set
241 to the position after last scanned or to NULL. */
242
76e3520e 243STATIC I32
c277df42
IZ
244study_chunk(regnode **scanp, I32 *deltap, regnode *last, scan_data_t *data, U32 flags)
245 /* scanp: Start here (read-write). */
246 /* deltap: Write maxlen-minlen here. */
247 /* last: Stop before this one. */
248{
5c0ca799 249 dTHR;
c277df42
IZ
250 I32 min = 0, pars = 0, code;
251 regnode *scan = *scanp, *next;
252 I32 delta = 0;
253 int is_inf = (flags & SCF_DO_SUBSTR) && (data->flags & SF_IS_INF);
254 I32 is_par = OP(scan) == OPEN ? ARG(scan) : 0;
255 scan_data_t data_fake;
256
257 while (scan && OP(scan) != END && scan < last) {
258 /* Peephole optimizer: */
259
260 if (regkind[(U8)OP(scan)] == EXACT) {
261 regnode *n = regnext(scan);
262 U32 stringok = 1;
263#ifdef DEBUGGING
264 regnode *stop = scan;
265#endif
266
267 next = scan + (*OPERAND(scan) + 2 - 1)/sizeof(regnode) + 2;
268 /* Skip NOTHING, merge EXACT*. */
269 while (n &&
270 ( regkind[(U8)OP(n)] == NOTHING ||
271 (stringok && (OP(n) == OP(scan))))
272 && NEXT_OFF(n)
273 && NEXT_OFF(scan) + NEXT_OFF(n) < I16_MAX) {
274 if (OP(n) == TAIL || n > next)
275 stringok = 0;
276 if (regkind[(U8)OP(n)] == NOTHING) {
277 NEXT_OFF(scan) += NEXT_OFF(n);
278 next = n + NODE_STEP_REGNODE;
279#ifdef DEBUGGING
280 if (stringok)
281 stop = n;
282#endif
283 n = regnext(n);
a0ed51b3
LW
284 }
285 else {
c277df42
IZ
286 int oldl = *OPERAND(scan);
287 regnode *nnext = regnext(n);
288
289 if (oldl + *OPERAND(n) > U8_MAX)
290 break;
291 NEXT_OFF(scan) += NEXT_OFF(n);
292 *OPERAND(scan) += *OPERAND(n);
293 next = n + (*OPERAND(n) + 2 - 1)/sizeof(regnode) + 2;
294 /* Now we can overwrite *n : */
295 Move(OPERAND(n) + 1, OPERAND(scan) + oldl + 1,
296 *OPERAND(n) + 1, char);
297#ifdef DEBUGGING
298 if (stringok)
299 stop = next - 1;
300#endif
301 n = nnext;
302 }
303 }
304#ifdef DEBUGGING
305 /* Allow dumping */
306 n = scan + (*OPERAND(scan) + 2 - 1)/sizeof(regnode) + 2;
307 while (n <= stop) {
ca04da08
GS
308 /* Purify reports a benign UMR here sometimes, because we
309 * don't initialize the OP() slot of a node when that node
310 * is occupied by just the trailing null of the string in
311 * an EXACT node */
c277df42
IZ
312 if (regkind[(U8)OP(n)] != NOTHING || OP(n) == NOTHING) {
313 OP(n) = OPTIMIZED;
314 NEXT_OFF(n) = 0;
315 }
316 n++;
317 }
318#endif
319
320 }
321 if (OP(scan) != CURLYX) {
048cfca1
GS
322 int max = (reg_off_by_arg[OP(scan)]
323 ? I32_MAX
324 /* I32 may be smaller than U16 on CRAYs! */
325 : (I32_MAX < U16_MAX ? I32_MAX : U16_MAX));
c277df42
IZ
326 int off = (reg_off_by_arg[OP(scan)] ? ARG(scan) : NEXT_OFF(scan));
327 int noff;
328 regnode *n = scan;
329
330 /* Skip NOTHING and LONGJMP. */
331 while ((n = regnext(n))
332 && ((regkind[(U8)OP(n)] == NOTHING && (noff = NEXT_OFF(n)))
333 || ((OP(n) == LONGJMP) && (noff = ARG(n))))
334 && off + noff < max)
335 off += noff;
336 if (reg_off_by_arg[OP(scan)])
337 ARG(scan) = off;
338 else
339 NEXT_OFF(scan) = off;
340 }
341 if (OP(scan) == BRANCH || OP(scan) == BRANCHJ
342 || OP(scan) == IFTHEN || OP(scan) == SUSPEND) {
343 next = regnext(scan);
344 code = OP(scan);
345
346 if (OP(next) == code || code == IFTHEN || code == SUSPEND) {
347 I32 max1 = 0, min1 = I32_MAX, num = 0;
348
349 if (flags & SCF_DO_SUBSTR)
350 scan_commit(data);
351 while (OP(scan) == code) {
352 I32 deltanext, minnext;
353
354 num++;
355 data_fake.flags = 0;
356 next = regnext(scan);
357 scan = NEXTOPER(scan);
358 if (code != BRANCH)
359 scan = NEXTOPER(scan);
360 /* We suppose the run is continuous, last=next...*/
361 minnext = study_chunk(&scan, &deltanext, next,
362 &data_fake, 0);
363 if (min1 > minnext)
364 min1 = minnext;
365 if (max1 < minnext + deltanext)
366 max1 = minnext + deltanext;
367 if (deltanext == I32_MAX)
368 is_inf = 1;
369 scan = next;
370 if (data_fake.flags & (SF_HAS_PAR|SF_IN_PAR))
371 pars++;
405ff068 372 if (data && (data_fake.flags & SF_HAS_EVAL))
c277df42
IZ
373 data->flags |= SF_HAS_EVAL;
374 if (code == SUSPEND)
375 break;
376 }
377 if (code == IFTHEN && num < 2) /* Empty ELSE branch */
378 min1 = 0;
379 if (flags & SCF_DO_SUBSTR) {
380 data->pos_min += min1;
381 data->pos_delta += max1 - min1;
382 if (max1 != min1 || is_inf)
383 data->longest = &(data->longest_float);
384 }
385 min += min1;
386 delta += max1 - min1;
a0ed51b3
LW
387 }
388 else if (code == BRANCHJ) /* single branch is optimized. */
c277df42
IZ
389 scan = NEXTOPER(NEXTOPER(scan));
390 else /* single branch is optimized. */
391 scan = NEXTOPER(scan);
392 continue;
a0ed51b3
LW
393 }
394 else if (OP(scan) == EXACT) {
395 I32 l = *OPERAND(scan);
396 if (UTF) {
397 unsigned char *s = (unsigned char *)(OPERAND(scan)+1);
398 unsigned char *e = s + l;
399 I32 newl = 0;
400 while (s < e) {
401 newl++;
402 s += UTF8SKIP(s);
403 }
404 l = newl;
405 }
406 min += l;
c277df42 407 if (flags & SCF_DO_SUBSTR) { /* Update longest substr. */
c277df42
IZ
408 /* The code below prefers earlier match for fixed
409 offset, later match for variable offset. */
410 if (data->last_end == -1) { /* Update the start info. */
411 data->last_start_min = data->pos_min;
412 data->last_start_max = is_inf
413 ? I32_MAX : data->pos_min + data->pos_delta;
414 }
a0ed51b3 415 sv_catpvn(data->last_found, (char *)(OPERAND(scan)+1), *OPERAND(scan));
c277df42
IZ
416 data->last_end = data->pos_min + l;
417 data->pos_min += l; /* As in the first entry. */
418 data->flags &= ~SF_BEFORE_EOL;
419 }
a0ed51b3
LW
420 }
421 else if (regkind[(U8)OP(scan)] == EXACT) {
422 I32 l = *OPERAND(scan);
c277df42
IZ
423 if (flags & SCF_DO_SUBSTR)
424 scan_commit(data);
a0ed51b3
LW
425 if (UTF) {
426 unsigned char *s = (unsigned char *)(OPERAND(scan)+1);
427 unsigned char *e = s + l;
428 I32 newl = 0;
429 while (s < e) {
430 newl++;
431 s += UTF8SKIP(s);
432 }
433 l = newl;
434 }
435 min += l;
c277df42 436 if (data && (flags & SCF_DO_SUBSTR))
a0ed51b3
LW
437 data->pos_min += l;
438 }
439 else if (strchr(varies,OP(scan))) {
c277df42
IZ
440 I32 mincount, maxcount, minnext, deltanext, pos_before, fl;
441 regnode *oscan = scan;
442
443 switch (regkind[(U8)OP(scan)]) {
444 case WHILEM:
445 scan = NEXTOPER(scan);
446 goto finish;
447 case PLUS:
448 if (flags & SCF_DO_SUBSTR) {
449 next = NEXTOPER(scan);
450 if (OP(next) == EXACT) {
451 mincount = 1;
452 maxcount = REG_INFTY;
453 next = regnext(scan);
454 scan = NEXTOPER(scan);
455 goto do_curly;
456 }
457 }
458 if (flags & SCF_DO_SUBSTR)
459 data->pos_min++;
460 min++;
461 /* Fall through. */
462 case STAR:
463 is_inf = 1;
464 scan = regnext(scan);
465 if (flags & SCF_DO_SUBSTR) {
466 scan_commit(data);
467 data->longest = &(data->longest_float);
468 }
469 goto optimize_curly_tail;
470 case CURLY:
471 mincount = ARG1(scan);
472 maxcount = ARG2(scan);
473 next = regnext(scan);
474 scan = NEXTOPER(scan) + EXTRA_STEP_2ARGS;
475 do_curly:
476 if (flags & SCF_DO_SUBSTR) {
477 if (mincount == 0) scan_commit(data);
478 pos_before = data->pos_min;
479 }
480 if (data) {
481 fl = data->flags;
482 data->flags &= ~(SF_HAS_PAR|SF_IN_PAR|SF_HAS_EVAL);
483 if (is_inf)
484 data->flags |= SF_IS_INF;
485 }
486 /* This will finish on WHILEM, setting scan, or on NULL: */
487 minnext = study_chunk(&scan, &deltanext, last, data,
488 mincount == 0
489 ? (flags & ~SCF_DO_SUBSTR) : flags);
490 if (!scan) /* It was not CURLYX, but CURLY. */
491 scan = next;
3280af22 492 if (PL_dowarn && (minnext + deltanext == 0)
821b33a5
IZ
493 && !(data->flags & (SF_HAS_PAR|SF_IN_PAR))
494 && maxcount <= 10000) /* Complement check for big count */
c277df42
IZ
495 warn("Strange *+?{} on zero-length expression");
496 min += minnext * mincount;
497 is_inf |= (maxcount == REG_INFTY && (minnext + deltanext) > 0
498 || deltanext == I32_MAX);
499 delta += (minnext + deltanext) * maxcount - minnext * mincount;
500
501 /* Try powerful optimization CURLYX => CURLYN. */
c277df42
IZ
502 if ( OP(oscan) == CURLYX && data
503 && data->flags & SF_IN_PAR
504 && !(data->flags & SF_HAS_EVAL)
505 && !deltanext && minnext == 1 ) {
506 /* Try to optimize to CURLYN. */
507 regnode *nxt = NEXTOPER(oscan) + EXTRA_STEP_2ARGS;
508 regnode *nxt1 = nxt, *nxt2;
509
510 /* Skip open. */
511 nxt = regnext(nxt);
512 if (!strchr(simple,OP(nxt))
513 && !(regkind[(U8)OP(nxt)] == EXACT
514 && *OPERAND(nxt) == 1))
515 goto nogo;
516 nxt2 = nxt;
517 nxt = regnext(nxt);
518 if (OP(nxt) != CLOSE)
519 goto nogo;
520 /* Now we know that nxt2 is the only contents: */
521 oscan->flags = ARG(nxt);
522 OP(oscan) = CURLYN;
523 OP(nxt1) = NOTHING; /* was OPEN. */
524#ifdef DEBUGGING
525 OP(nxt1 + 1) = OPTIMIZED; /* was count. */
526 NEXT_OFF(nxt1+ 1) = 0; /* just for consistancy. */
527 NEXT_OFF(nxt2) = 0; /* just for consistancy with CURLY. */
528 OP(nxt) = OPTIMIZED; /* was CLOSE. */
529 OP(nxt + 1) = OPTIMIZED; /* was count. */
530 NEXT_OFF(nxt+ 1) = 0; /* just for consistancy. */
531#endif
532 }
c277df42
IZ
533 nogo:
534
535 /* Try optimization CURLYX => CURLYM. */
536 if ( OP(oscan) == CURLYX && data
c277df42 537 && !(data->flags & SF_HAS_PAR)
c277df42
IZ
538 && !(data->flags & SF_HAS_EVAL)
539 && !deltanext ) {
540 /* XXXX How to optimize if data == 0? */
541 /* Optimize to a simpler form. */
542 regnode *nxt = NEXTOPER(oscan) + EXTRA_STEP_2ARGS; /* OPEN */
543 regnode *nxt2;
544
545 OP(oscan) = CURLYM;
546 while ( (nxt2 = regnext(nxt)) /* skip over embedded stuff*/
547 && (OP(nxt2) != WHILEM))
548 nxt = nxt2;
549 OP(nxt2) = SUCCEED; /* Whas WHILEM */
c277df42
IZ
550 /* Need to optimize away parenths. */
551 if (data->flags & SF_IN_PAR) {
552 /* Set the parenth number. */
553 regnode *nxt1 = NEXTOPER(oscan) + EXTRA_STEP_2ARGS; /* OPEN*/
554
555 if (OP(nxt) != CLOSE)
556 FAIL("panic opt close");
557 oscan->flags = ARG(nxt);
558 OP(nxt1) = OPTIMIZED; /* was OPEN. */
559 OP(nxt) = OPTIMIZED; /* was CLOSE. */
560#ifdef DEBUGGING
561 OP(nxt1 + 1) = OPTIMIZED; /* was count. */
562 OP(nxt + 1) = OPTIMIZED; /* was count. */
563 NEXT_OFF(nxt1 + 1) = 0; /* just for consistancy. */
564 NEXT_OFF(nxt + 1) = 0; /* just for consistancy. */
565#endif
566#if 0
567 while ( nxt1 && (OP(nxt1) != WHILEM)) {
568 regnode *nnxt = regnext(nxt1);
569
570 if (nnxt == nxt) {
571 if (reg_off_by_arg[OP(nxt1)])
572 ARG_SET(nxt1, nxt2 - nxt1);
573 else if (nxt2 - nxt1 < U16_MAX)
574 NEXT_OFF(nxt1) = nxt2 - nxt1;
575 else
576 OP(nxt) = NOTHING; /* Cannot beautify */
577 }
578 nxt1 = nnxt;
579 }
580#endif
581 /* Optimize again: */
582 study_chunk(&nxt1, &deltanext, nxt, NULL, 0);
a0ed51b3
LW
583 }
584 else
c277df42 585 oscan->flags = 0;
c277df42
IZ
586 }
587 if (data && fl & (SF_HAS_PAR|SF_IN_PAR))
588 pars++;
589 if (flags & SCF_DO_SUBSTR) {
590 SV *last_str = Nullsv;
591 int counted = mincount != 0;
592
593 if (data->last_end > 0 && mincount != 0) { /* Ends with a string. */
594 I32 b = pos_before >= data->last_start_min
595 ? pos_before : data->last_start_min;
596 STRLEN l;
597 char *s = SvPV(data->last_found, l);
a0ed51b3
LW
598 I32 old = b - data->last_start_min;
599
600 if (UTF)
601 old = utf8_hop((U8*)s, old) - (U8*)s;
c277df42 602
a0ed51b3 603 l -= old;
c277df42 604 /* Get the added string: */
a0ed51b3 605 last_str = newSVpv(s + old, l);
c277df42
IZ
606 if (deltanext == 0 && pos_before == b) {
607 /* What was added is a constant string */
608 if (mincount > 1) {
609 SvGROW(last_str, (mincount * l) + 1);
610 repeatcpy(SvPVX(last_str) + l,
611 SvPVX(last_str), l, mincount - 1);
612 SvCUR(last_str) *= mincount;
613 /* Add additional parts. */
614 SvCUR_set(data->last_found,
615 SvCUR(data->last_found) - l);
616 sv_catsv(data->last_found, last_str);
617 data->last_end += l * (mincount - 1);
618 }
619 }
620 }
621 /* It is counted once already... */
622 data->pos_min += minnext * (mincount - counted);
623 data->pos_delta += - counted * deltanext +
624 (minnext + deltanext) * maxcount - minnext * mincount;
625 if (mincount != maxcount) {
626 scan_commit(data);
627 if (mincount && last_str) {
628 sv_setsv(data->last_found, last_str);
629 data->last_end = data->pos_min;
630 data->last_start_min =
a0ed51b3 631 data->pos_min - CHR_SVLEN(last_str);
c277df42
IZ
632 data->last_start_max = is_inf
633 ? I32_MAX
634 : data->pos_min + data->pos_delta
a0ed51b3 635 - CHR_SVLEN(last_str);
c277df42
IZ
636 }
637 data->longest = &(data->longest_float);
638 }
639 }
405ff068 640 if (data && (fl & SF_HAS_EVAL))
c277df42
IZ
641 data->flags |= SF_HAS_EVAL;
642 optimize_curly_tail:
c277df42
IZ
643 if (OP(oscan) != CURLYX) {
644 while (regkind[(U8)OP(next = regnext(oscan))] == NOTHING
645 && NEXT_OFF(next))
646 NEXT_OFF(oscan) += NEXT_OFF(next);
647 }
c277df42
IZ
648 continue;
649 default: /* REF only? */
650 if (flags & SCF_DO_SUBSTR) {
651 scan_commit(data);
652 data->longest = &(data->longest_float);
653 }
654 is_inf = 1;
655 break;
656 }
a0ed51b3
LW
657 }
658 else if (strchr(simple,OP(scan)) || regkind[(U8)OP(scan)] == ANYUTF8) {
c277df42
IZ
659 if (flags & SCF_DO_SUBSTR) {
660 scan_commit(data);
661 data->pos_min++;
662 }
663 min++;
a0ed51b3
LW
664 }
665 else if (regkind[(U8)OP(scan)] == EOL && flags & SCF_DO_SUBSTR) {
c277df42
IZ
666 data->flags |= (OP(scan) == MEOL
667 ? SF_BEFORE_MEOL
668 : SF_BEFORE_SEOL);
a0ed51b3
LW
669 }
670 else if (regkind[(U8)OP(scan)] == BRANCHJ
c277df42
IZ
671 && (scan->flags || data)
672 && (OP(scan) == IFMATCH || OP(scan) == UNLESSM)) {
673 I32 deltanext, minnext;
674 regnode *nscan;
675
676 data_fake.flags = 0;
677 next = regnext(scan);
678 nscan = NEXTOPER(NEXTOPER(scan));
679 minnext = study_chunk(&nscan, &deltanext, last, &data_fake, 0);
680 if (scan->flags) {
681 if (deltanext) {
682 FAIL("variable length lookbehind not implemented");
a0ed51b3
LW
683 }
684 else if (minnext > U8_MAX) {
c277df42
IZ
685 FAIL2("lookbehind longer than %d not implemented", U8_MAX);
686 }
687 scan->flags = minnext;
688 }
689 if (data && data_fake.flags & (SF_HAS_PAR|SF_IN_PAR))
690 pars++;
405ff068 691 if (data && (data_fake.flags & SF_HAS_EVAL))
c277df42 692 data->flags |= SF_HAS_EVAL;
a0ed51b3
LW
693 }
694 else if (OP(scan) == OPEN) {
c277df42 695 pars++;
a0ed51b3
LW
696 }
697 else if (OP(scan) == CLOSE && ARG(scan) == is_par) {
c277df42
IZ
698 next = regnext(scan);
699
700 if ( next && (OP(next) != WHILEM) && next < last)
c277df42 701 is_par = 0; /* Disable optimization */
a0ed51b3
LW
702 }
703 else if (OP(scan) == EVAL) {
c277df42
IZ
704 if (data)
705 data->flags |= SF_HAS_EVAL;
706 }
707 /* Else: zero-length, ignore. */
708 scan = regnext(scan);
709 }
710
711 finish:
712 *scanp = scan;
713 *deltap = is_inf ? I32_MAX : delta;
714 if (flags & SCF_DO_SUBSTR && is_inf)
715 data->pos_delta = I32_MAX - data->pos_min;
716 if (is_par > U8_MAX)
717 is_par = 0;
718 if (is_par && pars==1 && data) {
719 data->flags |= SF_IN_PAR;
720 data->flags &= ~SF_HAS_PAR;
a0ed51b3
LW
721 }
722 else if (pars && data) {
c277df42
IZ
723 data->flags |= SF_HAS_PAR;
724 data->flags &= ~SF_IN_PAR;
725 }
726 return min;
727}
728
76e3520e 729STATIC I32
c277df42
IZ
730add_data(I32 n, char *s)
731{
5c0ca799 732 dTHR;
3280af22
NIS
733 if (PL_regcomp_rx->data) {
734 Renewc(PL_regcomp_rx->data,
735 sizeof(*PL_regcomp_rx->data) + sizeof(void*) * (PL_regcomp_rx->data->count + n - 1),
c277df42 736 char, struct reg_data);
3280af22
NIS
737 Renew(PL_regcomp_rx->data->what, PL_regcomp_rx->data->count + n, U8);
738 PL_regcomp_rx->data->count += n;
a0ed51b3
LW
739 }
740 else {
3280af22 741 Newc(1207, PL_regcomp_rx->data, sizeof(*PL_regcomp_rx->data) + sizeof(void*) * (n - 1),
c277df42 742 char, struct reg_data);
3280af22
NIS
743 New(1208, PL_regcomp_rx->data->what, n, U8);
744 PL_regcomp_rx->data->count = n;
c277df42 745 }
3280af22
NIS
746 Copy(s, PL_regcomp_rx->data->what + PL_regcomp_rx->data->count - n, n, U8);
747 return PL_regcomp_rx->data->count - n;
c277df42
IZ
748}
749
a687059c 750/*
e50aee73 751 - pregcomp - compile a regular expression into internal code
a687059c
LW
752 *
753 * We can't allocate space until we know how big the compiled form will be,
754 * but we can't compile it (and thus know how big it is) until we've got a
755 * place to put the code. So we cheat: we compile it twice, once with code
756 * generation turned off and size counting turned on, and once "for real".
757 * This also means that we don't allocate space until we are sure that the
758 * thing really will compile successfully, and we never have to move the
759 * code and thus invalidate pointers into it. (Note that it has to be in
760 * one piece because free() must be able to free it all.) [NB: not true in perl]
761 *
762 * Beware that the optimization-preparation code in here knows about some
763 * of the structure of the compiled regexp. [I'll say.]
764 */
765regexp *
8ac85365 766pregcomp(char *exp, char *xend, PMOP *pm)
a687059c 767{
5c0ca799 768 dTHR;
a0d0e21e 769 register regexp *r;
c277df42
IZ
770 regnode *scan;
771 SV **longest;
772 SV *longest_fixed;
773 SV *longest_float;
774 regnode *first;
a0d0e21e 775 I32 flags;
a0d0e21e
LW
776 I32 minlen = 0;
777 I32 sawplus = 0;
778 I32 sawopen = 0;
779
780 if (exp == NULL)
c277df42 781 FAIL("NULL regexp argument");
a0d0e21e 782
a0ed51b3
LW
783 if (PL_curcop == &compiling ? (PL_hints & HINT_UTF8) : IN_UTF8)
784 PL_reg_flags |= RF_utf8;
785 else
786 PL_reg_flags = 0;
787
3280af22 788 PL_regprecomp = savepvn(exp, xend - exp);
c277df42 789 DEBUG_r(PerlIO_printf(Perl_debug_log, "compiling RE `%*s'\n",
3280af22
NIS
790 xend - exp, PL_regprecomp));
791 PL_regflags = pm->op_pmflags;
792 PL_regsawback = 0;
bbce6d69 793
3280af22
NIS
794 PL_regseen = 0;
795 PL_seen_zerolen = *exp == '^' ? -1 : 0;
796 PL_seen_evals = 0;
797 PL_extralen = 0;
c277df42 798
bbce6d69 799 /* First pass: determine size, legality. */
3280af22
NIS
800 PL_regcomp_parse = exp;
801 PL_regxend = xend;
802 PL_regnaughty = 0;
803 PL_regnpar = 1;
804 PL_regsize = 0L;
805 PL_regcode = &PL_regdummy;
806 regc((U8)MAGIC, (char*)PL_regcode);
a0d0e21e 807 if (reg(0, &flags) == NULL) {
3280af22
NIS
808 Safefree(PL_regprecomp);
809 PL_regprecomp = Nullch;
a0d0e21e
LW
810 return(NULL);
811 }
3280af22 812 DEBUG_r(PerlIO_printf(Perl_debug_log, "size %d ", PL_regsize));
c277df42
IZ
813
814 DEBUG_r(
3280af22 815 if (!PL_colorset) {
c277df42 816 int i = 0;
76e3520e 817 char *s = PerlEnv_getenv("TERMCAP_COLORS");
c277df42 818
3280af22 819 PL_colorset = 1;
c277df42 820 if (s) {
3280af22 821 PL_colors[0] = s = savepv(s);
c277df42
IZ
822 while (++i < 4) {
823 s = strchr(s, '\t');
824 if (!s)
825 FAIL("Not enough TABs in TERMCAP_COLORS");
826 *s = '\0';
3280af22 827 PL_colors[i] = ++s;
c277df42 828 }
a0ed51b3
LW
829 }
830 else {
c277df42 831 while (i < 4)
3280af22 832 PL_colors[i++] = "";
c277df42
IZ
833 }
834 /* Reset colors: */
835 PerlIO_printf(Perl_debug_log, "%s%s%s%s",
3280af22 836 PL_colors[0],PL_colors[1],PL_colors[2],PL_colors[3]);
c277df42
IZ
837 }
838 );
a0d0e21e 839
c277df42
IZ
840 /* Small enough for pointer-storage convention?
841 If extralen==0, this means that we will not need long jumps. */
3280af22
NIS
842 if (PL_regsize >= 0x10000L && PL_extralen)
843 PL_regsize += PL_extralen;
c277df42 844 else
3280af22 845 PL_extralen = 0;
a0d0e21e 846
bbce6d69 847 /* Allocate space and initialize. */
3280af22 848 Newc(1001, r, sizeof(regexp) + (unsigned)PL_regsize * sizeof(regnode),
c277df42 849 char, regexp);
a0d0e21e
LW
850 if (r == NULL)
851 FAIL("regexp out of space");
c277df42 852 r->refcnt = 1;
bbce6d69 853 r->prelen = xend - exp;
3280af22 854 r->precomp = PL_regprecomp;
a0d0e21e 855 r->subbeg = r->subbase = NULL;
3280af22
NIS
856 r->nparens = PL_regnpar - 1; /* set early to validate backrefs */
857 PL_regcomp_rx = r;
bbce6d69
PP
858
859 /* Second pass: emit code. */
3280af22
NIS
860 PL_regcomp_parse = exp;
861 PL_regxend = xend;
862 PL_regnaughty = 0;
863 PL_regnpar = 1;
864 PL_regcode = r->program;
2cd61cdb 865 /* Store the count of eval-groups for security checks: */
3280af22
NIS
866 PL_regcode->next_off = ((PL_seen_evals > U16_MAX) ? U16_MAX : PL_seen_evals);
867 regc((U8)MAGIC, (char*) PL_regcode++);
c277df42 868 r->data = 0;
a0d0e21e
LW
869 if (reg(0, &flags) == NULL)
870 return(NULL);
871
872 /* Dig out information for optimizations. */
8782bef2 873 r->reganch = pm->op_pmflags & PMf_COMPILETIME;
3280af22 874 pm->op_pmflags = PL_regflags;
a0ed51b3
LW
875 if (UTF)
876 r->reganch |= ROPT_UTF8;
c277df42 877 r->regstclass = NULL;
a0ed51b3
LW
878 if (PL_regnaughty >= 10) /* Probably an expensive pattern. */
879 r->reganch |= ROPT_NAUGHTY;
c277df42 880 scan = r->program + 1; /* First BRANCH. */
2779dcf1
IZ
881
882 /* XXXX To minimize changes to RE engine we always allocate
883 3-units-long substrs field. */
884 Newz(1004, r->substrs, 1, struct reg_substr_data);
885
c277df42
IZ
886 if (OP(scan) != BRANCH) { /* Only one top-level choice. */
887 scan_data_t data;
888 I32 fake;
c5254dd6 889 STRLEN longest_float_length, longest_fixed_length;
a0d0e21e 890
c277df42 891 StructCopy(&zero_scan_data, &data, scan_data_t);
a0d0e21e 892 first = scan;
c277df42 893 /* Skip introductions and multiplicators >= 1. */
a0d0e21e
LW
894 while ((OP(first) == OPEN && (sawopen = 1)) ||
895 (OP(first) == BRANCH && OP(regnext(first)) != BRANCH) ||
896 (OP(first) == PLUS) ||
897 (OP(first) == MINMOD) ||
898 (regkind[(U8)OP(first)] == CURLY && ARG1(first) > 0) ) {
899 if (OP(first) == PLUS)
900 sawplus = 1;
901 else
902 first += regarglen[(U8)OP(first)];
903 first = NEXTOPER(first);
a687059c
LW
904 }
905
a0d0e21e
LW
906 /* Starting-point info. */
907 again:
c277df42 908 if (OP(first) == EXACT); /* Empty, get anchored substr later. */
a0ed51b3 909 else if (strchr(simple+4,OP(first)))
a0d0e21e 910 r->regstclass = first;
bbce6d69
PP
911 else if (regkind[(U8)OP(first)] == BOUND ||
912 regkind[(U8)OP(first)] == NBOUND)
a0d0e21e
LW
913 r->regstclass = first;
914 else if (regkind[(U8)OP(first)] == BOL) {
c277df42 915 r->reganch |= (OP(first) == MBOL ? ROPT_ANCH_MBOL: ROPT_ANCH_BOL);
a0d0e21e 916 first = NEXTOPER(first);
774d564b
PP
917 goto again;
918 }
919 else if (OP(first) == GPOS) {
920 r->reganch |= ROPT_ANCH_GPOS;
921 first = NEXTOPER(first);
922 goto again;
a0d0e21e
LW
923 }
924 else if ((OP(first) == STAR &&
925 regkind[(U8)OP(NEXTOPER(first))] == ANY) &&
926 !(r->reganch & ROPT_ANCH) )
927 {
928 /* turn .* into ^.* with an implied $*=1 */
774d564b 929 r->reganch |= ROPT_ANCH_BOL | ROPT_IMPLICIT;
a0d0e21e 930 first = NEXTOPER(first);
774d564b 931 goto again;
a0d0e21e 932 }
3280af22 933 if (sawplus && (!sawopen || !PL_regsawback))
a0d0e21e
LW
934 r->reganch |= ROPT_SKIP; /* x+ must match 1st of run */
935
c277df42
IZ
936 /* Scan is after the zeroth branch, first is atomic matcher. */
937 DEBUG_r(PerlIO_printf(Perl_debug_log, "first at %d\n",
938 first - scan + 1));
a0d0e21e
LW
939 /*
940 * If there's something expensive in the r.e., find the
941 * longest literal string that must appear and make it the
942 * regmust. Resolve ties in favor of later strings, since
943 * the regstart check works with the beginning of the r.e.
944 * and avoiding duplication strengthens checking. Not a
945 * strong reason, but sufficient in the absence of others.
946 * [Now we resolve ties in favor of the earlier string if
c277df42 947 * it happens that c_offset_min has been invalidated, since the
a0d0e21e
LW
948 * earlier string may buy us something the later one won't.]
949 */
a0d0e21e 950 minlen = 0;
a687059c 951
c277df42
IZ
952 data.longest_fixed = newSVpv("",0);
953 data.longest_float = newSVpv("",0);
954 data.last_found = newSVpv("",0);
955 data.longest = &(data.longest_fixed);
956 first = scan;
957
3280af22 958 minlen = study_chunk(&first, &fake, scan + PL_regsize, /* Up to end */
c277df42 959 &data, SCF_DO_SUBSTR);
3280af22 960 if ( PL_regnpar == 1 && data.longest == &(data.longest_fixed)
c277df42 961 && data.last_start_min == 0 && data.last_end > 0
3280af22
NIS
962 && !PL_seen_zerolen
963 && (!(PL_regseen & REG_SEEN_GPOS) || (r->reganch & ROPT_ANCH_GPOS)))
c277df42
IZ
964 r->reganch |= ROPT_CHECK_ALL;
965 scan_commit(&data);
966 SvREFCNT_dec(data.last_found);
967
a0ed51b3 968 longest_float_length = CHR_SVLEN(data.longest_float);
c5254dd6 969 if (longest_float_length
c277df42
IZ
970 || (data.flags & SF_FL_BEFORE_EOL
971 && (!(data.flags & SF_FL_BEFORE_MEOL)
3280af22 972 || (PL_regflags & PMf_MULTILINE)))) {
a0ed51b3 973 if (SvCUR(data.longest_fixed) /* ok to leave SvCUR */
c277df42
IZ
974 && data.offset_fixed == data.offset_float_min)
975 goto remove; /* Like in (a)+. */
976
977 r->float_substr = data.longest_float;
978 r->float_min_offset = data.offset_float_min;
979 r->float_max_offset = data.offset_float_max;
2779dcf1 980 fbm_compile(r->float_substr, 0);
c277df42
IZ
981 BmUSEFUL(r->float_substr) = 100;
982 if (data.flags & SF_FL_BEFORE_EOL /* Cannot have SEOL and MULTI */
983 && (!(data.flags & SF_FL_BEFORE_MEOL)
3280af22 984 || (PL_regflags & PMf_MULTILINE)))
c277df42 985 SvTAIL_on(r->float_substr);
a0ed51b3
LW
986 }
987 else {
c277df42
IZ
988 remove:
989 r->float_substr = Nullsv;
990 SvREFCNT_dec(data.longest_float);
c5254dd6 991 longest_float_length = 0;
a0d0e21e 992 }
c277df42 993
a0ed51b3 994 longest_fixed_length = CHR_SVLEN(data.longest_fixed);
c5254dd6 995 if (longest_fixed_length
c277df42
IZ
996 || (data.flags & SF_FIX_BEFORE_EOL /* Cannot have SEOL and MULTI */
997 && (!(data.flags & SF_FIX_BEFORE_MEOL)
3280af22 998 || (PL_regflags & PMf_MULTILINE)))) {
c277df42
IZ
999 r->anchored_substr = data.longest_fixed;
1000 r->anchored_offset = data.offset_fixed;
2779dcf1 1001 fbm_compile(r->anchored_substr, 0);
c277df42
IZ
1002 BmUSEFUL(r->anchored_substr) = 100;
1003 if (data.flags & SF_FIX_BEFORE_EOL /* Cannot have SEOL and MULTI */
1004 && (!(data.flags & SF_FIX_BEFORE_MEOL)
3280af22 1005 || (PL_regflags & PMf_MULTILINE)))
c277df42 1006 SvTAIL_on(r->anchored_substr);
a0ed51b3
LW
1007 }
1008 else {
c277df42
IZ
1009 r->anchored_substr = Nullsv;
1010 SvREFCNT_dec(data.longest_fixed);
c5254dd6 1011 longest_fixed_length = 0;
a0d0e21e 1012 }
c277df42
IZ
1013
1014 /* A temporary algorithm prefers floated substr to fixed one to dig more info. */
c5254dd6 1015 if (longest_fixed_length > longest_float_length) {
c277df42
IZ
1016 r->check_substr = r->anchored_substr;
1017 r->check_offset_min = r->check_offset_max = r->anchored_offset;
1018 if (r->reganch & ROPT_ANCH_SINGLE)
1019 r->reganch |= ROPT_NOSCAN;
a0ed51b3
LW
1020 }
1021 else {
c277df42
IZ
1022 r->check_substr = r->float_substr;
1023 r->check_offset_min = data.offset_float_min;
1024 r->check_offset_max = data.offset_float_max;
a0d0e21e 1025 }
a0ed51b3
LW
1026 }
1027 else {
c277df42
IZ
1028 /* Several toplevels. Best we can is to set minlen. */
1029 I32 fake;
1030
1031 DEBUG_r(PerlIO_printf(Perl_debug_log, "\n"));
1032 scan = r->program + 1;
3280af22 1033 minlen = study_chunk(&scan, &fake, scan + PL_regsize, NULL, 0);
c277df42 1034 r->check_substr = r->anchored_substr = r->float_substr = Nullsv;
a0d0e21e
LW
1035 }
1036
a0d0e21e 1037 r->minlen = minlen;
3280af22 1038 if (PL_regseen & REG_SEEN_GPOS)
c277df42 1039 r->reganch |= ROPT_GPOS_SEEN;
3280af22 1040 if (PL_regseen & REG_SEEN_LOOKBEHIND)
c277df42 1041 r->reganch |= ROPT_LOOKBEHIND_SEEN;
3280af22 1042 if (PL_regseen & REG_SEEN_EVAL)
ce862d02 1043 r->reganch |= ROPT_EVAL_SEEN;
3280af22
NIS
1044 Newz(1002, r->startp, PL_regnpar, char*);
1045 Newz(1002, r->endp, PL_regnpar, char*);
a0d0e21e
LW
1046 DEBUG_r(regdump(r));
1047 return(r);
a687059c
LW
1048}
1049
1050/*
1051 - reg - regular expression, i.e. main body or parenthesized thing
1052 *
1053 * Caller must absorb opening parenthesis.
1054 *
1055 * Combining parenthesis handling with the base level of regular expression
1056 * is a trifle forced, but the need to tie the tails of the branches to what
1057 * follows makes it hard to avoid.
1058 */
76e3520e 1059STATIC regnode *
8ac85365 1060reg(I32 paren, I32 *flagp)
c277df42 1061 /* paren: Parenthesized? 0=top, 1=(, inside: changed to letter. */
a687059c 1062{
5c0ca799 1063 dTHR;
c277df42
IZ
1064 register regnode *ret; /* Will be the head of the group. */
1065 register regnode *br;
1066 register regnode *lastbr;
1067 register regnode *ender = 0;
a0d0e21e 1068 register I32 parno = 0;
3280af22 1069 I32 flags, oregflags = PL_regflags, have_branch = 0, open = 0;
c277df42 1070 char c;
a0d0e21e 1071
821b33a5 1072 *flagp = 0; /* Tentatively. */
a0d0e21e
LW
1073
1074 /* Make an OPEN node, if parenthesized. */
1075 if (paren) {
3280af22 1076 if (*PL_regcomp_parse == '?') {
ca9dfc88
IZ
1077 U16 posflags = 0, negflags = 0;
1078 U16 *flagsp = &posflags;
1079
3280af22
NIS
1080 PL_regcomp_parse++;
1081 paren = *PL_regcomp_parse++;
c277df42 1082 ret = NULL; /* For look-ahead/behind. */
a0d0e21e 1083 switch (paren) {
c277df42 1084 case '<':
3280af22
NIS
1085 PL_regseen |= REG_SEEN_LOOKBEHIND;
1086 if (*PL_regcomp_parse == '!')
c277df42 1087 paren = ',';
3280af22 1088 if (*PL_regcomp_parse != '=' && *PL_regcomp_parse != '!')
c277df42 1089 goto unknown;
3280af22 1090 PL_regcomp_parse++;
a0d0e21e
LW
1091 case '=':
1092 case '!':
3280af22 1093 PL_seen_zerolen++;
c277df42
IZ
1094 case ':':
1095 case '>':
a0d0e21e
LW
1096 break;
1097 case '$':
1098 case '@':
c277df42 1099 FAIL2("Sequence (?%c...) not implemented", (int)paren);
a0d0e21e
LW
1100 break;
1101 case '#':
3280af22
NIS
1102 while (*PL_regcomp_parse && *PL_regcomp_parse != ')')
1103 PL_regcomp_parse++;
1104 if (*PL_regcomp_parse != ')')
c277df42 1105 FAIL("Sequence (?#... not terminated");
a0d0e21e
LW
1106 nextchar();
1107 *flagp = TRYAGAIN;
1108 return NULL;
c277df42
IZ
1109 case '{':
1110 {
1111 dTHR;
1112 I32 count = 1, n = 0;
1113 char c;
3280af22 1114 char *s = PL_regcomp_parse;
c277df42
IZ
1115 SV *sv;
1116 OP_4tree *sop, *rop;
1117
3280af22
NIS
1118 PL_seen_zerolen++;
1119 PL_regseen |= REG_SEEN_EVAL;
1120 while (count && (c = *PL_regcomp_parse)) {
1121 if (c == '\\' && PL_regcomp_parse[1])
1122 PL_regcomp_parse++;
c277df42
IZ
1123 else if (c == '{')
1124 count++;
1125 else if (c == '}')
1126 count--;
3280af22 1127 PL_regcomp_parse++;
c277df42 1128 }
3280af22 1129 if (*PL_regcomp_parse != ')')
c277df42
IZ
1130 FAIL("Sequence (?{...}) not terminated or not {}-balanced");
1131 if (!SIZE_ONLY) {
1132 AV *av;
1133
3280af22
NIS
1134 if (PL_regcomp_parse - 1 - s)
1135 sv = newSVpv(s, PL_regcomp_parse - 1 - s);
c277df42
IZ
1136 else
1137 sv = newSVpv("", 0);
1138
1139 rop = sv_compile_2op(sv, &sop, "re", &av);
1140
1141 n = add_data(3, "nso");
3280af22
NIS
1142 PL_regcomp_rx->data->data[n] = (void*)rop;
1143 PL_regcomp_rx->data->data[n+1] = (void*)av;
1144 PL_regcomp_rx->data->data[n+2] = (void*)sop;
c277df42 1145 SvREFCNT_dec(sv);
a0ed51b3
LW
1146 }
1147 else { /* First pass */
1148 if (PL_reginterp_cnt < ++PL_seen_evals && PL_curcop != &compiling)
2cd61cdb
IZ
1149 /* No compiled RE interpolated, has runtime
1150 components ===> unsafe. */
1151 FAIL("Eval-group not allowed at runtime, use re 'eval'");
3280af22 1152 if (PL_tainted)
cc6b7395 1153 FAIL("Eval-group in insecure regular expression");
c277df42
IZ
1154 }
1155
1156 nextchar();
c277df42
IZ
1157 return reganode(EVAL, n);
1158 }
1159 case '(':
1160 {
3280af22
NIS
1161 if (PL_regcomp_parse[0] == '?') {
1162 if (PL_regcomp_parse[1] == '=' || PL_regcomp_parse[1] == '!'
1163 || PL_regcomp_parse[1] == '<'
1164 || PL_regcomp_parse[1] == '{') { /* Lookahead or eval. */
c277df42
IZ
1165 I32 flag;
1166
1167 ret = reg_node(LOGICAL);
1168 regtail(ret, reg(1, &flag));
1169 goto insert_if;
1170 }
a0ed51b3
LW
1171 }
1172 else if (PL_regcomp_parse[0] >= '1' && PL_regcomp_parse[0] <= '9' ) {
3280af22 1173 parno = atoi(PL_regcomp_parse++);
c277df42 1174
3280af22
NIS
1175 while (isDIGIT(*PL_regcomp_parse))
1176 PL_regcomp_parse++;
c277df42
IZ
1177 ret = reganode(GROUPP, parno);
1178 if ((c = *nextchar()) != ')')
1179 FAIL2("Switch (?(number%c not recognized", c);
1180 insert_if:
1181 regtail(ret, reganode(IFTHEN, 0));
1182 br = regbranch(&flags, 1);
1183 if (br == NULL)
1184 br = reganode(LONGJMP, 0);
1185 else
1186 regtail(br, reganode(LONGJMP, 0));
1187 c = *nextchar();
1188 if (c == '|') {
1189 lastbr = reganode(IFTHEN, 0); /* Fake one for optimizer. */
1190 regbranch(&flags, 1);
1191 regtail(ret, lastbr);
1192 c = *nextchar();
a0ed51b3
LW
1193 }
1194 else
c277df42
IZ
1195 lastbr = NULL;
1196 if (c != ')')
1197 FAIL("Switch (?(condition)... contains too many branches");
1198 ender = reg_node(TAIL);
1199 regtail(br, ender);
1200 if (lastbr) {
1201 regtail(lastbr, ender);
1202 regtail(NEXTOPER(NEXTOPER(lastbr)), ender);
a0ed51b3
LW
1203 }
1204 else
c277df42
IZ
1205 regtail(ret, ender);
1206 return ret;
a0ed51b3
LW
1207 }
1208 else {
3280af22 1209 FAIL2("Unknown condition for (?(%.2s", PL_regcomp_parse);
c277df42
IZ
1210 }
1211 }
1b1626e4 1212 case 0:
c277df42 1213 FAIL("Sequence (? incomplete");
1b1626e4 1214 break;
a0d0e21e 1215 default:
3280af22 1216 --PL_regcomp_parse;
ca9dfc88 1217 parse_flags:
3280af22
NIS
1218 while (*PL_regcomp_parse && strchr("iogcmsx", *PL_regcomp_parse)) {
1219 if (*PL_regcomp_parse != 'o')
1220 pmflag(flagsp, *PL_regcomp_parse);
1221 ++PL_regcomp_parse;
ca9dfc88 1222 }
3280af22 1223 if (*PL_regcomp_parse == '-') {
ca9dfc88 1224 flagsp = &negflags;
3280af22 1225 ++PL_regcomp_parse;
ca9dfc88 1226 goto parse_flags;
48c036b1 1227 }
3280af22
NIS
1228 PL_regflags |= posflags;
1229 PL_regflags &= ~negflags;
1230 if (*PL_regcomp_parse == ':') {
1231 PL_regcomp_parse++;
ca9dfc88
IZ
1232 paren = ':';
1233 break;
1234 }
c277df42 1235 unknown:
3280af22
NIS
1236 if (*PL_regcomp_parse != ')')
1237 FAIL2("Sequence (?%c...) not recognized", *PL_regcomp_parse);
a0d0e21e
LW
1238 nextchar();
1239 *flagp = TRYAGAIN;
1240 return NULL;
1241 }
1242 }
1243 else {
3280af22
NIS
1244 parno = PL_regnpar;
1245 PL_regnpar++;
a0d0e21e 1246 ret = reganode(OPEN, parno);
c277df42 1247 open = 1;
a0d0e21e 1248 }
a0ed51b3
LW
1249 }
1250 else
a0d0e21e
LW
1251 ret = NULL;
1252
1253 /* Pick up the branches, linking them together. */
c277df42 1254 br = regbranch(&flags, 1);
a0d0e21e
LW
1255 if (br == NULL)
1256 return(NULL);
3280af22
NIS
1257 if (*PL_regcomp_parse == '|') {
1258 if (!SIZE_ONLY && PL_extralen) {
c277df42 1259 reginsert(BRANCHJ, br);
a0ed51b3
LW
1260 }
1261 else
c277df42
IZ
1262 reginsert(BRANCH, br);
1263 have_branch = 1;
1264 if (SIZE_ONLY)
3280af22 1265 PL_extralen += 1; /* For BRANCHJ-BRANCH. */
a0ed51b3
LW
1266 }
1267 else if (paren == ':') {
c277df42
IZ
1268 *flagp |= flags&SIMPLE;
1269 }
1270 if (open) { /* Starts with OPEN. */
1271 regtail(ret, br); /* OPEN -> first. */
a0ed51b3
LW
1272 }
1273 else if (paren != '?') /* Not Conditional */
a0d0e21e 1274 ret = br;
821b33a5
IZ
1275 if (flags&HASWIDTH)
1276 *flagp |= HASWIDTH;
a0d0e21e 1277 *flagp |= flags&SPSTART;
c277df42 1278 lastbr = br;
3280af22
NIS
1279 while (*PL_regcomp_parse == '|') {
1280 if (!SIZE_ONLY && PL_extralen) {
c277df42
IZ
1281 ender = reganode(LONGJMP,0);
1282 regtail(NEXTOPER(NEXTOPER(lastbr)), ender); /* Append to the previous. */
1283 }
1284 if (SIZE_ONLY)
3280af22 1285 PL_extralen += 2; /* Account for LONGJMP. */
a0d0e21e 1286 nextchar();
c277df42 1287 br = regbranch(&flags, 0);
a687059c 1288 if (br == NULL)
a0d0e21e 1289 return(NULL);
c277df42
IZ
1290 regtail(lastbr, br); /* BRANCH -> BRANCH. */
1291 lastbr = br;
821b33a5
IZ
1292 if (flags&HASWIDTH)
1293 *flagp |= HASWIDTH;
a687059c 1294 *flagp |= flags&SPSTART;
a0d0e21e
LW
1295 }
1296
c277df42
IZ
1297 if (have_branch || paren != ':') {
1298 /* Make a closing node, and hook it on the end. */
1299 switch (paren) {
1300 case ':':
1301 ender = reg_node(TAIL);
1302 break;
1303 case 1:
1304 ender = reganode(CLOSE, parno);
1305 break;
1306 case '<':
c277df42
IZ
1307 case ',':
1308 case '=':
1309 case '!':
c277df42 1310 *flagp &= ~HASWIDTH;
821b33a5
IZ
1311 /* FALL THROUGH */
1312 case '>':
1313 ender = reg_node(SUCCEED);
c277df42
IZ
1314 break;
1315 case 0:
1316 ender = reg_node(END);
1317 break;
1318 }
1319 regtail(lastbr, ender);
a0d0e21e 1320
c277df42
IZ
1321 if (have_branch) {
1322 /* Hook the tails of the branches to the closing node. */
1323 for (br = ret; br != NULL; br = regnext(br)) {
1324 regoptail(br, ender);
1325 }
1326 }
a0d0e21e 1327 }
c277df42
IZ
1328
1329 {
1330 char *p;
1331 static char parens[] = "=!<,>";
1332
1333 if (paren && (p = strchr(parens, paren))) {
1334 int node = ((p - parens) % 2) ? UNLESSM : IFMATCH;
1335 int flag = (p - parens) > 1;
1336
1337 if (paren == '>')
1338 node = SUSPEND, flag = 0;
1339 reginsert(node,ret);
c277df42 1340 ret->flags = flag;
c277df42
IZ
1341 regtail(ret, reg_node(TAIL));
1342 }
a0d0e21e
LW
1343 }
1344
1345 /* Check for proper termination. */
3280af22 1346 if (paren && (PL_regcomp_parse >= PL_regxend || *nextchar() != ')')) {
a0d0e21e 1347 FAIL("unmatched () in regexp");
a0ed51b3
LW
1348 }
1349 else if (!paren && PL_regcomp_parse < PL_regxend) {
3280af22 1350 if (*PL_regcomp_parse == ')') {
a0d0e21e 1351 FAIL("unmatched () in regexp");
a0ed51b3
LW
1352 }
1353 else
a0d0e21e
LW
1354 FAIL("junk on end of regexp"); /* "Can't happen". */
1355 /* NOTREACHED */
1356 }
c277df42 1357 if (paren != 0) {
3280af22 1358 PL_regflags = oregflags;
c277df42 1359 }
a687059c 1360
a0d0e21e 1361 return(ret);
a687059c
LW
1362}
1363
1364/*
1365 - regbranch - one alternative of an | operator
1366 *
1367 * Implements the concatenation operator.
1368 */
76e3520e 1369STATIC regnode *
c277df42 1370regbranch(I32 *flagp, I32 first)
a687059c 1371{
5c0ca799 1372 dTHR;
c277df42
IZ
1373 register regnode *ret;
1374 register regnode *chain = NULL;
1375 register regnode *latest;
1376 I32 flags = 0, c = 0;
a0d0e21e 1377
c277df42
IZ
1378 if (first)
1379 ret = NULL;
1380 else {
3280af22 1381 if (!SIZE_ONLY && PL_extralen)
c277df42
IZ
1382 ret = reganode(BRANCHJ,0);
1383 else
1384 ret = reg_node(BRANCH);
1385 }
1386
1387 if (!first && SIZE_ONLY)
3280af22 1388 PL_extralen += 1; /* BRANCHJ */
c277df42
IZ
1389
1390 *flagp = WORST; /* Tentatively. */
a0d0e21e 1391
3280af22 1392 PL_regcomp_parse--;
a0d0e21e 1393 nextchar();
3280af22 1394 while (PL_regcomp_parse < PL_regxend && *PL_regcomp_parse != '|' && *PL_regcomp_parse != ')') {
a0d0e21e
LW
1395 flags &= ~TRYAGAIN;
1396 latest = regpiece(&flags);
1397 if (latest == NULL) {
1398 if (flags & TRYAGAIN)
1399 continue;
1400 return(NULL);
a0ed51b3
LW
1401 }
1402 else if (ret == NULL)
c277df42 1403 ret = latest;
a0d0e21e 1404 *flagp |= flags&HASWIDTH;
c277df42 1405 if (chain == NULL) /* First piece. */
a0d0e21e
LW
1406 *flagp |= flags&SPSTART;
1407 else {
3280af22 1408 PL_regnaughty++;
a0d0e21e 1409 regtail(chain, latest);
a687059c 1410 }
a0d0e21e 1411 chain = latest;
c277df42
IZ
1412 c++;
1413 }
1414 if (chain == NULL) { /* Loop ran zero times. */
1415 chain = reg_node(NOTHING);
1416 if (ret == NULL)
1417 ret = chain;
1418 }
1419 if (c == 1) {
1420 *flagp |= flags&SIMPLE;
a0d0e21e 1421 }
a687059c 1422
a0d0e21e 1423 return(ret);
a687059c
LW
1424}
1425
1426/*
1427 - regpiece - something followed by possible [*+?]
1428 *
1429 * Note that the branching code sequences used for ? and the general cases
1430 * of * and + are somewhat optimized: they use the same NOTHING node as
1431 * both the endmarker for their branch list and the body of the last branch.
1432 * It might seem that this node could be dispensed with entirely, but the
1433 * endmarker role is not redundant.
1434 */
76e3520e 1435STATIC regnode *
8ac85365 1436regpiece(I32 *flagp)
a687059c 1437{
5c0ca799 1438 dTHR;
c277df42 1439 register regnode *ret;
a0d0e21e
LW
1440 register char op;
1441 register char *next;
1442 I32 flags;
3280af22 1443 char *origparse = PL_regcomp_parse;
a0d0e21e
LW
1444 char *maxpos;
1445 I32 min;
c277df42 1446 I32 max = REG_INFTY;
a0d0e21e
LW
1447
1448 ret = regatom(&flags);
1449 if (ret == NULL) {
1450 if (flags & TRYAGAIN)
1451 *flagp |= TRYAGAIN;
1452 return(NULL);
1453 }
1454
3280af22 1455 op = *PL_regcomp_parse;
a0d0e21e 1456
3280af22
NIS
1457 if (op == '{' && regcurly(PL_regcomp_parse)) {
1458 next = PL_regcomp_parse + 1;
a0d0e21e
LW
1459 maxpos = Nullch;
1460 while (isDIGIT(*next) || *next == ',') {
1461 if (*next == ',') {
1462 if (maxpos)
1463 break;
1464 else
1465 maxpos = next;
a687059c 1466 }
a0d0e21e
LW
1467 next++;
1468 }
1469 if (*next == '}') { /* got one */
1470 if (!maxpos)
1471 maxpos = next;
3280af22
NIS
1472 PL_regcomp_parse++;
1473 min = atoi(PL_regcomp_parse);
a0d0e21e
LW
1474 if (*maxpos == ',')
1475 maxpos++;
1476 else
3280af22 1477 maxpos = PL_regcomp_parse;
a0d0e21e
LW
1478 max = atoi(maxpos);
1479 if (!max && *maxpos != '0')
c277df42
IZ
1480 max = REG_INFTY; /* meaning "infinity" */
1481 else if (max >= REG_INFTY)
1482 FAIL2("Quantifier in {,} bigger than %d", REG_INFTY - 1);
3280af22 1483 PL_regcomp_parse = next;
a0d0e21e
LW
1484 nextchar();
1485
1486 do_curly:
1487 if ((flags&SIMPLE)) {
3280af22 1488 PL_regnaughty += 2 + PL_regnaughty / 2;
a0d0e21e
LW
1489 reginsert(CURLY, ret);
1490 }
1491 else {
3280af22 1492 PL_regnaughty += 4 + PL_regnaughty; /* compound interest */
c277df42 1493 regtail(ret, reg_node(WHILEM));
3280af22 1494 if (!SIZE_ONLY && PL_extralen) {
c277df42
IZ
1495 reginsert(LONGJMP,ret);
1496 reginsert(NOTHING,ret);
1497 NEXT_OFF(ret) = 3; /* Go over LONGJMP. */
1498 }
a0d0e21e 1499 reginsert(CURLYX,ret);
3280af22 1500 if (!SIZE_ONLY && PL_extralen)
c277df42
IZ
1501 NEXT_OFF(ret) = 3; /* Go over NOTHING to LONGJMP. */
1502 regtail(ret, reg_node(NOTHING));
1503 if (SIZE_ONLY)
3280af22 1504 PL_extralen += 3;
a0d0e21e 1505 }
c277df42 1506 ret->flags = 0;
a0d0e21e
LW
1507
1508 if (min > 0)
821b33a5
IZ
1509 *flagp = WORST;
1510 if (max > 0)
1511 *flagp |= HASWIDTH;
a0d0e21e 1512 if (max && max < min)
c277df42
IZ
1513 FAIL("Can't do {n,m} with n > m");
1514 if (!SIZE_ONLY) {
1515 ARG1_SET(ret, min);
1516 ARG2_SET(ret, max);
a687059c 1517 }
a687059c 1518
a0d0e21e 1519 goto nest_check;
a687059c 1520 }
a0d0e21e 1521 }
a687059c 1522
a0d0e21e
LW
1523 if (!ISMULT1(op)) {
1524 *flagp = flags;
a687059c 1525 return(ret);
a0d0e21e 1526 }
bb20fd44 1527
c277df42 1528#if 0 /* Now runtime fix should be reliable. */
bb20fd44 1529 if (!(flags&HASWIDTH) && op != '?')
c277df42
IZ
1530 FAIL("regexp *+ operand could be empty");
1531#endif
bb20fd44 1532
a0d0e21e
LW
1533 nextchar();
1534
821b33a5 1535 *flagp = (op != '+') ? (WORST|SPSTART|HASWIDTH) : (WORST|HASWIDTH);
a0d0e21e
LW
1536
1537 if (op == '*' && (flags&SIMPLE)) {
1538 reginsert(STAR, ret);
c277df42 1539 ret->flags = 0;
3280af22 1540 PL_regnaughty += 4;
a0d0e21e
LW
1541 }
1542 else if (op == '*') {
1543 min = 0;
1544 goto do_curly;
a0ed51b3
LW
1545 }
1546 else if (op == '+' && (flags&SIMPLE)) {
a0d0e21e 1547 reginsert(PLUS, ret);
c277df42 1548 ret->flags = 0;
3280af22 1549 PL_regnaughty += 3;
a0d0e21e
LW
1550 }
1551 else if (op == '+') {
1552 min = 1;
1553 goto do_curly;
a0ed51b3
LW
1554 }
1555 else if (op == '?') {
a0d0e21e
LW
1556 min = 0; max = 1;
1557 goto do_curly;
1558 }
1559 nest_check:
3280af22 1560 if (PL_dowarn && !SIZE_ONLY && !(flags&HASWIDTH) && max > 10000) {
a0d0e21e 1561 warn("%.*s matches null string many times",
3280af22 1562 PL_regcomp_parse - origparse, origparse);
a0d0e21e
LW
1563 }
1564
3280af22 1565 if (*PL_regcomp_parse == '?') {
a0d0e21e
LW
1566 nextchar();
1567 reginsert(MINMOD, ret);
c277df42 1568 regtail(ret, ret + NODE_STEP_REGNODE);
a0d0e21e 1569 }
3280af22 1570 if (ISMULT2(PL_regcomp_parse))
a0d0e21e
LW
1571 FAIL("nested *?+ in regexp");
1572
1573 return(ret);
a687059c
LW
1574}
1575
1576/*
1577 - regatom - the lowest level
1578 *
1579 * Optimization: gobbles an entire sequence of ordinary characters so that
1580 * it can turn them into a single node, which is smaller to store and
1581 * faster to run. Backslashed characters are exceptions, each becoming a
1582 * separate node; the code is simpler that way and it's not worth fixing.
1583 *
1584 * [Yes, it is worth fixing, some scripts can run twice the speed.]
1585 */
76e3520e 1586STATIC regnode *
8ac85365 1587regatom(I32 *flagp)
a687059c 1588{
5c0ca799 1589 dTHR;
c277df42 1590 register regnode *ret = 0;
a0d0e21e
LW
1591 I32 flags;
1592
1593 *flagp = WORST; /* Tentatively. */
1594
1595tryagain:
3280af22 1596 switch (*PL_regcomp_parse) {
a0d0e21e 1597 case '^':
3280af22 1598 PL_seen_zerolen++;
a0d0e21e 1599 nextchar();
3280af22 1600 if (PL_regflags & PMf_MULTILINE)
c277df42 1601 ret = reg_node(MBOL);
3280af22 1602 else if (PL_regflags & PMf_SINGLELINE)
c277df42 1603 ret = reg_node(SBOL);
a0d0e21e 1604 else
c277df42 1605 ret = reg_node(BOL);
a0d0e21e
LW
1606 break;
1607 case '$':
3280af22
NIS
1608 if (PL_regcomp_parse[1])
1609 PL_seen_zerolen++;
a0d0e21e 1610 nextchar();
3280af22 1611 if (PL_regflags & PMf_MULTILINE)
c277df42 1612 ret = reg_node(MEOL);
3280af22 1613 else if (PL_regflags & PMf_SINGLELINE)
c277df42 1614 ret = reg_node(SEOL);
a0d0e21e 1615 else
c277df42 1616 ret = reg_node(EOL);
a0d0e21e
LW
1617 break;
1618 case '.':
1619 nextchar();
a0ed51b3
LW
1620 if (UTF) {
1621 if (PL_regflags & PMf_SINGLELINE)
1622 ret = reg_node(SANYUTF8);
1623 else
1624 ret = reg_node(ANYUTF8);
1625 *flagp |= HASWIDTH;
1626 }
1627 else {
1628 if (PL_regflags & PMf_SINGLELINE)
1629 ret = reg_node(SANY);
1630 else
1631 ret = reg_node(ANY);
1632 *flagp |= HASWIDTH|SIMPLE;
1633 }
3280af22 1634 PL_regnaughty++;
a0d0e21e
LW
1635 break;
1636 case '[':
3280af22 1637 PL_regcomp_parse++;
a0ed51b3 1638 ret = (UTF ? regclassutf8() : regclass());
a0d0e21e
LW
1639 *flagp |= HASWIDTH|SIMPLE;
1640 break;
1641 case '(':
1642 nextchar();
1643 ret = reg(1, &flags);
1644 if (ret == NULL) {
1645 if (flags & TRYAGAIN)
1646 goto tryagain;
1647 return(NULL);
1648 }
c277df42 1649 *flagp |= flags&(HASWIDTH|SPSTART|SIMPLE);
a0d0e21e
LW
1650 break;
1651 case '|':
1652 case ')':
1653 if (flags & TRYAGAIN) {
1654 *flagp |= TRYAGAIN;
1655 return NULL;
1656 }
3280af22 1657 FAIL2("internal urp in regexp at /%s/", PL_regcomp_parse);
a0d0e21e
LW
1658 /* Supposed to be caught earlier. */
1659 break;
85afd4ae 1660 case '{':
3280af22
NIS
1661 if (!regcurly(PL_regcomp_parse)) {
1662 PL_regcomp_parse++;
85afd4ae
CS
1663 goto defchar;
1664 }
1665 /* FALL THROUGH */
a0d0e21e
LW
1666 case '?':
1667 case '+':
1668 case '*':
3115e423 1669 FAIL("?+*{} follows nothing in regexp");
a0d0e21e
LW
1670 break;
1671 case '\\':
3280af22 1672 switch (*++PL_regcomp_parse) {
a0d0e21e 1673 case 'A':
3280af22 1674 PL_seen_zerolen++;
c277df42 1675 ret = reg_node(SBOL);
a0d0e21e
LW
1676 *flagp |= SIMPLE;
1677 nextchar();
1678 break;
1679 case 'G':
c277df42 1680 ret = reg_node(GPOS);
3280af22 1681 PL_regseen |= REG_SEEN_GPOS;
a0d0e21e
LW
1682 *flagp |= SIMPLE;
1683 nextchar();
1684 break;
1685 case 'Z':
c277df42 1686 ret = reg_node(SEOL);
a0d0e21e
LW
1687 *flagp |= SIMPLE;
1688 nextchar();
1689 break;
b85d18e9
IZ
1690 case 'z':
1691 ret = reg_node(EOS);
1692 *flagp |= SIMPLE;
3280af22 1693 PL_seen_zerolen++; /* Do not optimize RE away */
b85d18e9
IZ
1694 nextchar();
1695 break;
a0ed51b3
LW
1696 case 'C':
1697 ret = reg_node(SANY);
1698 *flagp |= HASWIDTH|SIMPLE;
1699 nextchar();
1700 break;
1701 case 'X':
1702 ret = reg_node(CLUMP);
1703 *flagp |= HASWIDTH;
1704 nextchar();
1705 if (UTF && !PL_utf8_mark)
1706 is_utf8_mark("~"); /* preload table */
1707 break;
a0d0e21e 1708 case 'w':
a0ed51b3
LW
1709 ret = reg_node(
1710 UTF
1711 ? (LOC ? ALNUMLUTF8 : ALNUMUTF8)
1712 : (LOC ? ALNUML : ALNUM));
a0d0e21e
LW
1713 *flagp |= HASWIDTH|SIMPLE;
1714 nextchar();
a0ed51b3
LW
1715 if (UTF && !PL_utf8_alnum)
1716 is_utf8_alnum("a"); /* preload table */
a0d0e21e
LW
1717 break;
1718 case 'W':
a0ed51b3
LW
1719 ret = reg_node(
1720 UTF
1721 ? (LOC ? NALNUMLUTF8 : NALNUMUTF8)
1722 : (LOC ? NALNUML : NALNUM));
a0d0e21e
LW
1723 *flagp |= HASWIDTH|SIMPLE;
1724 nextchar();
a0ed51b3
LW
1725 if (UTF && !PL_utf8_alnum)
1726 is_utf8_alnum("a"); /* preload table */
a0d0e21e
LW
1727 break;
1728 case 'b':
3280af22 1729 PL_seen_zerolen++;
a0ed51b3
LW
1730 ret = reg_node(
1731 UTF
1732 ? (LOC ? BOUNDLUTF8 : BOUNDUTF8)
1733 : (LOC ? BOUNDL : BOUND));
a0d0e21e
LW
1734 *flagp |= SIMPLE;
1735 nextchar();
a0ed51b3
LW
1736 if (UTF && !PL_utf8_alnum)
1737 is_utf8_alnum("a"); /* preload table */
a0d0e21e
LW
1738 break;
1739 case 'B':
3280af22 1740 PL_seen_zerolen++;
a0ed51b3
LW
1741 ret = reg_node(
1742 UTF
1743 ? (LOC ? NBOUNDLUTF8 : NBOUNDUTF8)
1744 : (LOC ? NBOUNDL : NBOUND));
a0d0e21e
LW
1745 *flagp |= SIMPLE;
1746 nextchar();
a0ed51b3
LW
1747 if (UTF && !PL_utf8_alnum)
1748 is_utf8_alnum("a"); /* preload table */
a0d0e21e
LW
1749 break;
1750 case 's':
a0ed51b3
LW
1751 ret = reg_node(
1752 UTF
1753 ? (LOC ? SPACELUTF8 : SPACEUTF8)
1754 : (LOC ? SPACEL : SPACE));
a0d0e21e
LW
1755 *flagp |= HASWIDTH|SIMPLE;
1756 nextchar();
a0ed51b3
LW
1757 if (UTF && !PL_utf8_space)
1758 is_utf8_space(" "); /* preload table */
a0d0e21e
LW
1759 break;
1760 case 'S':
a0ed51b3
LW
1761 ret = reg_node(
1762 UTF
1763 ? (LOC ? NSPACELUTF8 : NSPACEUTF8)
1764 : (LOC ? NSPACEL : NSPACE));
a0d0e21e
LW
1765 *flagp |= HASWIDTH|SIMPLE;
1766 nextchar();
a0ed51b3
LW
1767 if (UTF && !PL_utf8_space)
1768 is_utf8_space(" "); /* preload table */
a0d0e21e
LW
1769 break;
1770 case 'd':
a0ed51b3 1771 ret = reg_node(UTF ? DIGITUTF8 : DIGIT);
a0d0e21e
LW
1772 *flagp |= HASWIDTH|SIMPLE;
1773 nextchar();
a0ed51b3
LW
1774 if (UTF && !PL_utf8_digit)
1775 is_utf8_digit("1"); /* preload table */
a0d0e21e
LW
1776 break;
1777 case 'D':
a0ed51b3 1778 ret = reg_node(UTF ? NDIGITUTF8 : NDIGIT);
a0d0e21e
LW
1779 *flagp |= HASWIDTH|SIMPLE;
1780 nextchar();
a0ed51b3
LW
1781 if (UTF && !PL_utf8_digit)
1782 is_utf8_digit("1"); /* preload table */
a0d0e21e
LW
1783 break;
1784 case 'n':
1785 case 'r':
1786 case 't':
1787 case 'f':
1788 case 'e':
1789 case 'a':
1790 case 'x':
1791 case 'c':
1792 case '0':
1793 goto defchar;
1794 case '1': case '2': case '3': case '4':
1795 case '5': case '6': case '7': case '8': case '9':
1796 {
3280af22 1797 I32 num = atoi(PL_regcomp_parse);
a0d0e21e 1798
3280af22 1799 if (num > 9 && num >= PL_regnpar)
a0d0e21e
LW
1800 goto defchar;
1801 else {
3280af22 1802 if (!SIZE_ONLY && num > PL_regcomp_rx->nparens)
ef64f398 1803 FAIL("reference to nonexistent group");
3280af22 1804 PL_regsawback = 1;
a0ed51b3
LW
1805 ret = reganode(FOLD
1806 ? (LOC ? REFFL : REFF)
c8756f30 1807 : REF, num);
a0d0e21e 1808 *flagp |= HASWIDTH;
3280af22
NIS
1809 while (isDIGIT(*PL_regcomp_parse))
1810 PL_regcomp_parse++;
1811 PL_regcomp_parse--;
a0d0e21e
LW
1812 nextchar();
1813 }
1814 }
1815 break;
1816 case '\0':
3280af22 1817 if (PL_regcomp_parse >= PL_regxend)
a0d0e21e
LW
1818 FAIL("trailing \\ in regexp");
1819 /* FALL THROUGH */
1820 default:
1821 goto defchar;
1822 }
1823 break;
4633a7c4
LW
1824
1825 case '#':
3280af22
NIS
1826 if (PL_regflags & PMf_EXTENDED) {
1827 while (PL_regcomp_parse < PL_regxend && *PL_regcomp_parse != '\n') PL_regcomp_parse++;
1828 if (PL_regcomp_parse < PL_regxend)
4633a7c4
LW
1829 goto tryagain;
1830 }
1831 /* FALL THROUGH */
1832
a0d0e21e
LW
1833 default: {
1834 register I32 len;
a0ed51b3 1835 register UV ender;
a0d0e21e 1836 register char *p;
c277df42 1837 char *oldp, *s;
a0d0e21e
LW
1838 I32 numlen;
1839
3280af22 1840 PL_regcomp_parse++;
a0d0e21e
LW
1841
1842 defchar:
a0ed51b3
LW
1843 ret = reg_node(FOLD
1844 ? (LOC ? EXACTFL : EXACTF)
bbce6d69 1845 : EXACT);
161b471a 1846 s = (char *) OPERAND(ret);
c277df42 1847 regc(0, s++); /* save spot for len */
3280af22
NIS
1848 for (len = 0, p = PL_regcomp_parse - 1;
1849 len < 127 && p < PL_regxend;
a0d0e21e
LW
1850 len++)
1851 {
1852 oldp = p;
5b5a24f7 1853
3280af22
NIS
1854 if (PL_regflags & PMf_EXTENDED)
1855 p = regwhite(p, PL_regxend);
a0d0e21e
LW
1856 switch (*p) {
1857 case '^':
1858 case '$':
1859 case '.':
1860 case '[':
1861 case '(':
1862 case ')':
1863 case '|':
1864 goto loopdone;
1865 case '\\':
1866 switch (*++p) {
1867 case 'A':
1868 case 'G':
1869 case 'Z':
b85d18e9 1870 case 'z':
a0d0e21e
LW
1871 case 'w':
1872 case 'W':
1873 case 'b':
1874 case 'B':
1875 case 's':
1876 case 'S':
1877 case 'd':
1878 case 'D':
1879 --p;
1880 goto loopdone;
1881 case 'n':
1882 ender = '\n';
1883 p++;
a687059c 1884 break;
a0d0e21e
LW
1885 case 'r':
1886 ender = '\r';
1887 p++;
a687059c 1888 break;
a0d0e21e
LW
1889 case 't':
1890 ender = '\t';
1891 p++;
a687059c 1892 break;
a0d0e21e
LW
1893 case 'f':
1894 ender = '\f';
1895 p++;
a687059c 1896 break;
a0d0e21e
LW
1897 case 'e':
1898 ender = '\033';
1899 p++;
a687059c 1900 break;
a0d0e21e
LW
1901 case 'a':
1902 ender = '\007';
1903 p++;
a687059c 1904 break;
a0d0e21e 1905 case 'x':
a0ed51b3
LW
1906 if (*++p == '{') {
1907 char* e = strchr(p, '}');
1908
1909 if (!e)
1910 FAIL("Missing right brace on \\x{}");
1911 else if (UTF) {
1912 ender = scan_hex(p + 1, e - p, &numlen);
1913 if (numlen + len >= 127) { /* numlen is generous */
1914 p--;
1915 goto loopdone;
1916 }
1917 p = e + 1;
1918 }
1919 else
1920 FAIL("Can't use \\x{} without 'use utf8' declaration");
1921 }
1922 else {
1923 ender = scan_hex(p, 2, &numlen);
1924 p += numlen;
1925 }
a687059c 1926 break;
a0d0e21e
LW
1927 case 'c':
1928 p++;
bbce6d69
PP
1929 ender = UCHARAT(p++);
1930 ender = toCTRL(ender);
a687059c 1931 break;
a0d0e21e
LW
1932 case '0': case '1': case '2': case '3':case '4':
1933 case '5': case '6': case '7': case '8':case '9':
1934 if (*p == '0' ||
3280af22 1935 (isDIGIT(p[1]) && atoi(p) >= PL_regnpar) ) {
a0d0e21e
LW
1936 ender = scan_oct(p, 3, &numlen);
1937 p += numlen;
1938 }
1939 else {
1940 --p;
1941 goto loopdone;
a687059c
LW
1942 }
1943 break;
a0d0e21e 1944 case '\0':
3280af22 1945 if (p >= PL_regxend)
a687059c
LW
1946 FAIL("trailing \\ in regexp");
1947 /* FALL THROUGH */
a0d0e21e 1948 default:
a0ed51b3 1949 goto normal_default;
a0d0e21e
LW
1950 }
1951 break;
a687059c 1952 default:
a0ed51b3
LW
1953 normal_default:
1954 if ((*p & 0xc0) == 0xc0 && UTF) {
1955 ender = utf8_to_uv(p, &numlen);
1956 p += numlen;
1957 }
1958 else
1959 ender = *p++;
a0d0e21e 1960 break;
a687059c 1961 }
3280af22
NIS
1962 if (PL_regflags & PMf_EXTENDED)
1963 p = regwhite(p, PL_regxend);
a0ed51b3
LW
1964 if (UTF && FOLD) {
1965 if (LOC)
1966 ender = toLOWER_LC_uni(ender);
1967 else
1968 ender = toLOWER_uni(ender);
1969 }
a0d0e21e
LW
1970 if (ISMULT2(p)) { /* Back off on ?+*. */
1971 if (len)
1972 p = oldp;
a0ed51b3
LW
1973 else if (ender >= 0x80 && UTF) {
1974 reguni(ender, s, &numlen);
1975 s += numlen;
1976 len += numlen;
1977 }
a0d0e21e
LW
1978 else {
1979 len++;
c277df42 1980 regc(ender, s++);
a0d0e21e
LW
1981 }
1982 break;
a687059c 1983 }
a0ed51b3
LW
1984 if (ender >= 0x80 && UTF) {
1985 reguni(ender, s, &numlen);
1986 s += numlen;
1987 len += numlen - 1;
1988 }
1989 else
1990 regc(ender, s++);
a0d0e21e
LW
1991 }
1992 loopdone:
3280af22 1993 PL_regcomp_parse = p - 1;
a0d0e21e
LW
1994 nextchar();
1995 if (len < 0)
1996 FAIL("internal disaster in regexp");
1997 if (len > 0)
1998 *flagp |= HASWIDTH;
1999 if (len == 1)
2000 *flagp |= SIMPLE;
c277df42 2001 if (!SIZE_ONLY)
a0d0e21e 2002 *OPERAND(ret) = len;
c277df42
IZ
2003 regc('\0', s++);
2004 if (SIZE_ONLY) {
3280af22 2005 PL_regsize += (len + 2 + sizeof(regnode) - 1) / sizeof(regnode);
a0ed51b3
LW
2006 }
2007 else {
3280af22 2008 PL_regcode += (len + 2 + sizeof(regnode) - 1) / sizeof(regnode);
c277df42 2009 }
a687059c 2010 }
a0d0e21e
LW
2011 break;
2012 }
a687059c 2013
a0d0e21e 2014 return(ret);
a687059c
LW
2015}
2016
873ef191 2017STATIC char *
8ac85365 2018regwhite(char *p, char *e)
5b5a24f7
CS
2019{
2020 while (p < e) {
2021 if (isSPACE(*p))
2022 ++p;
2023 else if (*p == '#') {
2024 do {
2025 p++;
2026 } while (p < e && *p != '\n');
2027 }
2028 else
2029 break;
2030 }
2031 return p;
2032}
2033
76e3520e 2034STATIC regnode *
8ac85365 2035regclass(void)
a687059c 2036{
5c0ca799 2037 dTHR;
c277df42 2038 register char *opnd, *s;
a0ed51b3
LW
2039 register I32 value;
2040 register I32 lastvalue = 1234;
a0d0e21e 2041 register I32 range = 0;
c277df42 2042 register regnode *ret;
a0d0e21e
LW
2043 register I32 def;
2044 I32 numlen;
2045
3280af22 2046 s = opnd = (char *) OPERAND(PL_regcode);
c277df42 2047 ret = reg_node(ANYOF);
a0ed51b3 2048 for (value = 0; value < 33; value++)
c277df42 2049 regc(0, s++);
3280af22
NIS
2050 if (*PL_regcomp_parse == '^') { /* Complement of range. */
2051 PL_regnaughty++;
2052 PL_regcomp_parse++;
c277df42 2053 if (!SIZE_ONLY)
bbce6d69
PP
2054 *opnd |= ANYOF_INVERT;
2055 }
c277df42 2056 if (!SIZE_ONLY) {
3280af22 2057 PL_regcode += ANY_SKIP;
a0ed51b3 2058 if (FOLD)
bbce6d69 2059 *opnd |= ANYOF_FOLD;
a0ed51b3 2060 if (LOC)
bbce6d69 2061 *opnd |= ANYOF_LOCALE;
a0ed51b3
LW
2062 }
2063 else {
3280af22 2064 PL_regsize += ANY_SKIP;
a0d0e21e 2065 }
3280af22 2066 if (*PL_regcomp_parse == ']' || *PL_regcomp_parse == '-')
a0d0e21e 2067 goto skipcond; /* allow 1st char to be ] or - */
3280af22 2068 while (PL_regcomp_parse < PL_regxend && *PL_regcomp_parse != ']') {
a0d0e21e 2069 skipcond:
a0ed51b3
LW
2070 value = UCHARAT(PL_regcomp_parse++);
2071 if (value == '[' && PL_regcomp_parse + 1 < PL_regxend &&
4599a1de 2072 /* I smell either [: or [= or [. -- POSIX has been here, right? */
3280af22
NIS
2073 (*PL_regcomp_parse == ':' || *PL_regcomp_parse == '=' || *PL_regcomp_parse == '.')) {
2074 char posixccc = *PL_regcomp_parse;
2075 char* posixccs = PL_regcomp_parse++;
4599a1de 2076
3280af22
NIS
2077 while (PL_regcomp_parse < PL_regxend && *PL_regcomp_parse != posixccc)
2078 PL_regcomp_parse++;
2079 if (PL_regcomp_parse == PL_regxend)
4599a1de 2080 /* Grandfather lone [:, [=, [. */
3280af22 2081 PL_regcomp_parse = posixccs;
4599a1de 2082 else {
3280af22
NIS
2083 PL_regcomp_parse++; /* skip over the posixccc */
2084 if (*PL_regcomp_parse == ']') {
4599a1de
JH
2085 /* Not Implemented Yet.
2086 * (POSIX Extended Character Classes, that is)
2087 * The text between e.g. [: and :] would start
77d41b28 2088 * at posixccs + 1 and stop at regcomp_parse - 2. */
a0ed51b3 2089 if (dowarn && !SIZE_ONLY)
4599a1de 2090 warn("Character class syntax [%c %c] is reserved for future extensions", posixccc, posixccc);
3280af22 2091 PL_regcomp_parse++; /* skip over the ending ] */
4599a1de
JH
2092 }
2093 }
2094 }
a0ed51b3
LW
2095 if (value == '\\') {
2096 value = UCHARAT(PL_regcomp_parse++);
2097 switch (value) {
a0d0e21e 2098 case 'w':
ae5c130c 2099 if (!SIZE_ONLY) {
a0ed51b3 2100 if (LOC)
bbce6d69 2101 *opnd |= ANYOF_ALNUML;
ae5c130c 2102 else {
a0ed51b3
LW
2103 for (value = 0; value < 256; value++)
2104 if (isALNUM(value))
2105 ANYOF_SET(opnd, value);
ae5c130c 2106 }
bbce6d69 2107 }
a0ed51b3 2108 lastvalue = 1234;
a0d0e21e
LW
2109 continue;
2110 case 'W':
ae5c130c 2111 if (!SIZE_ONLY) {
a0ed51b3 2112 if (LOC)
bbce6d69 2113 *opnd |= ANYOF_NALNUML;
ae5c130c 2114 else {
a0ed51b3
LW
2115 for (value = 0; value < 256; value++)
2116 if (!isALNUM(value))
2117 ANYOF_SET(opnd, value);
ae5c130c 2118 }
bbce6d69 2119 }
a0ed51b3 2120 lastvalue = 1234;
a0d0e21e
LW
2121 continue;
2122 case 's':
ae5c130c 2123 if (!SIZE_ONLY) {
a0ed51b3 2124 if (LOC)
bbce6d69 2125 *opnd |= ANYOF_SPACEL;
ae5c130c 2126 else {
a0ed51b3
LW
2127 for (value = 0; value < 256; value++)
2128 if (isSPACE(value))
2129 ANYOF_SET(opnd, value);
ae5c130c 2130 }
bbce6d69 2131 }
a0ed51b3 2132 lastvalue = 1234;
a0d0e21e
LW
2133 continue;
2134 case 'S':
ae5c130c 2135 if (!SIZE_ONLY) {
a0ed51b3 2136 if (LOC)
bbce6d69 2137 *opnd |= ANYOF_NSPACEL;
ae5c130c 2138 else {
a0ed51b3
LW
2139 for (value = 0; value < 256; value++)
2140 if (!isSPACE(value))
2141 ANYOF_SET(opnd, value);
ae5c130c 2142 }
bbce6d69 2143 }
a0ed51b3 2144 lastvalue = 1234;
a0d0e21e
LW
2145 continue;
2146 case 'd':
ae5c130c 2147 if (!SIZE_ONLY) {
a0ed51b3
LW
2148 for (value = '0'; value <= '9'; value++)
2149 ANYOF_SET(opnd, value);
ae5c130c 2150 }
a0ed51b3 2151 lastvalue = 1234;
a0d0e21e
LW
2152 continue;
2153 case 'D':
ae5c130c 2154 if (!SIZE_ONLY) {
a0ed51b3
LW
2155 for (value = 0; value < '0'; value++)
2156 ANYOF_SET(opnd, value);
2157 for (value = '9' + 1; value < 256; value++)
2158 ANYOF_SET(opnd, value);
ae5c130c 2159 }
a0ed51b3 2160 lastvalue = 1234;
a0d0e21e
LW
2161 continue;
2162 case 'n':
a0ed51b3 2163 value = '\n';
a0d0e21e
LW
2164 break;
2165 case 'r':
a0ed51b3 2166 value = '\r';
a0d0e21e
LW
2167 break;
2168 case 't':
a0ed51b3 2169 value = '\t';
a0d0e21e
LW
2170 break;
2171 case 'f':
a0ed51b3 2172 value = '\f';
a0d0e21e
LW
2173 break;
2174 case 'b':
a0ed51b3 2175 value = '\b';
a0d0e21e
LW
2176 break;
2177 case 'e':
a0ed51b3 2178 value = '\033';
a0d0e21e
LW
2179 break;
2180 case 'a':
a0ed51b3 2181 value = '\007';
a0d0e21e
LW
2182 break;
2183 case 'x':
a0ed51b3 2184 value = scan_hex(PL_regcomp_parse, 2, &numlen);
3280af22 2185 PL_regcomp_parse += numlen;
a0d0e21e
LW
2186 break;
2187 case 'c':
a0ed51b3
LW
2188 value = UCHARAT(PL_regcomp_parse++);
2189 value = toCTRL(value);
a0d0e21e
LW
2190 break;
2191 case '0': case '1': case '2': case '3': case '4':
2192 case '5': case '6': case '7': case '8': case '9':
a0ed51b3 2193 value = scan_oct(--PL_regcomp_parse, 3, &numlen);
3280af22 2194 PL_regcomp_parse += numlen;
a0d0e21e
LW
2195 break;
2196 }
2197 }
2198 if (range) {
a0ed51b3 2199 if (lastvalue > value)
a0d0e21e
LW
2200 FAIL("invalid [] range in regexp");
2201 range = 0;
2202 }
2203 else {
a0ed51b3 2204 lastvalue = value;
3280af22
NIS
2205 if (*PL_regcomp_parse == '-' && PL_regcomp_parse+1 < PL_regxend &&
2206 PL_regcomp_parse[1] != ']') {
2207 PL_regcomp_parse++;
a0d0e21e
LW
2208 range = 1;
2209 continue; /* do it next time */
2210 }
a687059c 2211 }
ae5c130c 2212 if (!SIZE_ONLY) {
a0ed51b3
LW
2213 for ( ; lastvalue <= value; lastvalue++)
2214 ANYOF_SET(opnd, lastvalue);
ae5c130c 2215 }
a0ed51b3 2216 lastvalue = value;
a0d0e21e 2217 }
3280af22 2218 if (*PL_regcomp_parse != ']')
a0d0e21e
LW
2219 FAIL("unmatched [] in regexp");
2220 nextchar();
ae5c130c
GS
2221 /* optimize case-insensitive simple patterns (e.g. /[a-z]/i) */
2222 if (!SIZE_ONLY && (*opnd & (0xFF ^ ANYOF_INVERT)) == ANYOF_FOLD) {
a0ed51b3
LW
2223 for (value = 0; value < 256; ++value) {
2224 if (ANYOF_TEST(opnd, value)) {
2225 I32 cf = fold[value];
ae5c130c
GS
2226 ANYOF_SET(opnd, cf);
2227 }
2228 }
2229 *opnd &= ~ANYOF_FOLD;
2230 }
2231 /* optimize inverted simple patterns (e.g. [^a-z]) */
2232 if (!SIZE_ONLY && (*opnd & 0xFF) == ANYOF_INVERT) {
a0ed51b3
LW
2233 for (value = 0; value < 32; ++value)
2234 opnd[1 + value] ^= 0xFF;
ae5c130c
GS
2235 *opnd = 0;
2236 }
a0d0e21e
LW
2237 return ret;
2238}
2239
a0ed51b3
LW
2240STATIC regnode *
2241regclassutf8(void)
2242{
2243 register char *opnd, *e;
2244 register U32 value;
2245 register U32 lastvalue = 123456;
2246 register I32 range = 0;
2247 register regnode *ret;
2248 I32 numlen;
2249 I32 n;
2250 SV *listsv;
2251 U8 flags = 0;
c485e607 2252 dTHR;
a0ed51b3
LW
2253
2254 if (*PL_regcomp_parse == '^') { /* Complement of range. */
2255 PL_regnaughty++;
2256 PL_regcomp_parse++;
2257 if (!SIZE_ONLY)
2258 flags |= ANYOF_INVERT;
2259 }
2260 if (!SIZE_ONLY) {
2261 if (FOLD)
2262 flags |= ANYOF_FOLD;
2263 if (LOC)
2264 flags |= ANYOF_LOCALE;
2265 listsv = newSVpv("# comment\n",0);
2266 }
2267
2268 if (*PL_regcomp_parse == ']' || *PL_regcomp_parse == '-')
2269 goto skipcond; /* allow 1st char to be ] or - */
2270
2271 while (PL_regcomp_parse < PL_regxend && *PL_regcomp_parse != ']') {
2272 skipcond:
2273 value = utf8_to_uv(PL_regcomp_parse, &numlen);
2274 PL_regcomp_parse += numlen;
2275
2276 if (value == '[' && PL_regcomp_parse + 1 < PL_regxend &&
2277 /* I smell either [: or [= or [. -- POSIX has been here, right? */
2278 (*PL_regcomp_parse == ':' || *PL_regcomp_parse == '=' || *PL_regcomp_parse == '.')) {
2279 char posixccc = *PL_regcomp_parse;
2280 char* posixccs = PL_regcomp_parse++;
2281
2282 while (PL_regcomp_parse < PL_regxend && *PL_regcomp_parse != posixccc)
2283 PL_regcomp_parse++;
2284 if (PL_regcomp_parse == PL_regxend)
2285 /* Grandfather lone [:, [=, [. */
2286 PL_regcomp_parse = posixccs;
2287 else {
2288 PL_regcomp_parse++; /* skip over the posixccc */
2289 if (*PL_regcomp_parse == ']') {
2290 /* Not Implemented Yet.
2291 * (POSIX Extended Character Classes, that is)
2292 * The text between e.g. [: and :] would start
2293 * at posixccs + 1 and stop at regcomp_parse - 2. */
2294 if (dowarn && !SIZE_ONLY)
2295 warn("Character class syntax [%c %c] is reserved for future extensions", posixccc, posixccc);
2296 PL_regcomp_parse++; /* skip over the ending ] */
2297 }
2298 }
2299 }
2300
2301 if (value == '\\') {
2302 value = utf8_to_uv(PL_regcomp_parse, &numlen);
2303 PL_regcomp_parse += numlen;
2304 switch (value) {
2305 case 'w':
2306 if (!SIZE_ONLY) {
2307 if (LOC)
2308 flags |= ANYOF_ALNUML;
2309
2310 sv_catpvf(listsv, "+utf8::IsAlnum\n");
2311 }
2312 lastvalue = 123456;
2313 continue;
2314 case 'W':
2315 if (!SIZE_ONLY) {
2316 if (LOC)
2317 flags |= ANYOF_NALNUML;
2318
2319 sv_catpvf(listsv,
2320 "-utf8::IsAlpha\n-utf8::IsDigit\n0000\t%04x\n%04x\tffff\n",
2321 '_' - 1,
2322 '_' + 1);
2323 }
2324 lastvalue = 123456;
2325 continue;
2326 case 's':
2327 if (!SIZE_ONLY) {
2328 if (LOC)
2329 flags |= ANYOF_SPACEL;
2330 sv_catpvf(listsv, "+utf8::IsSpace\n");
2331 if (!PL_utf8_space)
2332 is_utf8_space(" ");
2333 }
2334 lastvalue = 123456;
2335 continue;
2336 case 'S':
2337 if (!SIZE_ONLY) {
2338 if (LOC)
2339 flags |= ANYOF_NSPACEL;
2340 sv_catpvf(listsv,
2341 "!utf8::IsSpace\n");
2342 if (!PL_utf8_space)
2343 is_utf8_space(" ");
2344 }
2345 lastvalue = 123456;
2346 continue;
2347 case 'd':
2348 if (!SIZE_ONLY) {
2349 sv_catpvf(listsv, "+utf8::IsDigit\n");
2350 }
2351 lastvalue = 123456;
2352 continue;
2353 case 'D':
2354 if (!SIZE_ONLY) {
2355 sv_catpvf(listsv,
2356 "!utf8::IsDigit\n");
2357 }
2358 lastvalue = 123456;
2359 continue;
2360 case 'p':
2361 case 'P':
2362 if (*PL_regcomp_parse == '{') {
2363 e = strchr(PL_regcomp_parse++, '}');
2364 if (!e)
2365 FAIL("Missing right brace on \\p{}");
2366 n = e - PL_regcomp_parse;
2367 }
2368 else {
2369 e = PL_regcomp_parse;
2370 n = 1;
2371 }
2372 if (!SIZE_ONLY) {
2373 if (value == 'p')
2374 sv_catpvf(listsv, "+utf8::%.*s\n", n, PL_regcomp_parse);
2375 else
2376 sv_catpvf(listsv,
2377 "!utf8::%.*s\n", n, PL_regcomp_parse);
2378 }
2379 PL_regcomp_parse = e + 1;
2380 lastvalue = 123456;
2381 continue;
2382 case 'n':
2383 value = '\n';
2384 break;
2385 case 'r':
2386 value = '\r';
2387 break;
2388 case 't':
2389 value = '\t';
2390 break;
2391 case 'f':
2392 value = '\f';
2393 break;
2394 case 'b':
2395 value = '\b';
2396 break;
2397 case 'e':
2398 value = '\033';
2399 break;
2400 case 'a':
2401 value = '\007';
2402 break;
2403 case 'x':
2404 if (*PL_regcomp_parse == '{') {
2405 e = strchr(PL_regcomp_parse++, '}');
2406 if (!e)
2407 FAIL("Missing right brace on \\x{}");
2408 value = scan_hex(PL_regcomp_parse + 1, e - PL_regcomp_parse, &numlen);
2409 PL_regcomp_parse = e + 1;
2410 }
2411 else {
2412 value = scan_hex(PL_regcomp_parse, 2, &numlen);
2413 PL_regcomp_parse += numlen;
2414 }
2415 break;
2416 case 'c':
2417 value = UCHARAT(PL_regcomp_parse++);
2418 value = toCTRL(value);
2419 break;
2420 case '0': case '1': case '2': case '3': case '4':
2421 case '5': case '6': case '7': case '8': case '9':
2422 value = scan_oct(--PL_regcomp_parse, 3, &numlen);
2423 PL_regcomp_parse += numlen;
2424 break;
2425 }
2426 }
2427 if (range) {
2428 if (lastvalue > value)
2429 FAIL("invalid [] range in regexp");
2430 if (!SIZE_ONLY)
2431 sv_catpvf(listsv, "%04x\t%04x\n", lastvalue, value);
2432 lastvalue = value;
2433 range = 0;
2434 }
2435 else {
2436 lastvalue = value;
2437 if (*PL_regcomp_parse == '-' && PL_regcomp_parse+1 < PL_regxend &&
2438 PL_regcomp_parse[1] != ']') {
2439 PL_regcomp_parse++;
2440 range = 1;
2441 continue; /* do it next time */
2442 }
2443 if (!SIZE_ONLY)
2444 sv_catpvf(listsv, "%04x\n", value);
2445 }
2446 }
2447 if (*PL_regcomp_parse != ']')
2448 FAIL("unmatched [] in regexp");
2449 nextchar();
2450
2451 ret = reganode(ANYOFUTF8, 0);
2452
2453 if (!SIZE_ONLY) {
2454 SV *rv = swash_init("utf8", "", listsv, 1, 0);
2455 SvREFCNT_dec(listsv);
2456 n = add_data(1,"s");
2457 PL_regcomp_rx->data->data[n] = (void*)rv;
2458 ARG1_SET(ret, flags);
2459 ARG2_SET(ret, n);
2460 }
2461
2462 return ret;
2463}
2464
76e3520e 2465STATIC char*
8ac85365 2466nextchar(void)
a0d0e21e 2467{
5c0ca799 2468 dTHR;
3280af22 2469 char* retval = PL_regcomp_parse++;
a0d0e21e 2470
4633a7c4 2471 for (;;) {
3280af22
NIS
2472 if (*PL_regcomp_parse == '(' && PL_regcomp_parse[1] == '?' &&
2473 PL_regcomp_parse[2] == '#') {
2474 while (*PL_regcomp_parse && *PL_regcomp_parse != ')')
2475 PL_regcomp_parse++;
2476 PL_regcomp_parse++;
4633a7c4
LW
2477 continue;
2478 }
3280af22
NIS
2479 if (PL_regflags & PMf_EXTENDED) {
2480 if (isSPACE(*PL_regcomp_parse)) {
2481 PL_regcomp_parse++;
748a9306
LW
2482 continue;
2483 }
3280af22
NIS
2484 else if (*PL_regcomp_parse == '#') {
2485 while (*PL_regcomp_parse && *PL_regcomp_parse != '\n')
2486 PL_regcomp_parse++;
2487 PL_regcomp_parse++;
748a9306
LW
2488 continue;
2489 }
748a9306 2490 }
4633a7c4 2491 return retval;
a0d0e21e 2492 }
a687059c
LW
2493}
2494
2495/*
c277df42 2496- reg_node - emit a node
a0d0e21e 2497*/
76e3520e 2498STATIC regnode * /* Location. */
c277df42 2499reg_node(U8 op)
a687059c 2500{
5c0ca799 2501 dTHR;
c277df42
IZ
2502 register regnode *ret;
2503 register regnode *ptr;
a687059c 2504
3280af22 2505 ret = PL_regcode;
c277df42 2506 if (SIZE_ONLY) {
6b88bc9c 2507 SIZE_ALIGN(PL_regsize);
3280af22 2508 PL_regsize += 1;
a0d0e21e
LW
2509 return(ret);
2510 }
a687059c 2511
c277df42 2512 NODE_ALIGN_FILL(ret);
a0d0e21e 2513 ptr = ret;
c277df42 2514 FILL_ADVANCE_NODE(ptr, op);
3280af22 2515 PL_regcode = ptr;
a687059c 2516
a0d0e21e 2517 return(ret);
a687059c
LW
2518}
2519
2520/*
a0d0e21e
LW
2521- reganode - emit a node with an argument
2522*/
76e3520e 2523STATIC regnode * /* Location. */
c277df42 2524reganode(U8 op, U32 arg)
fe14fcc3 2525{
5c0ca799 2526 dTHR;
c277df42
IZ
2527 register regnode *ret;
2528 register regnode *ptr;
fe14fcc3 2529
3280af22 2530 ret = PL_regcode;
c277df42 2531 if (SIZE_ONLY) {
6b88bc9c 2532 SIZE_ALIGN(PL_regsize);
3280af22 2533 PL_regsize += 2;
a0d0e21e
LW
2534 return(ret);
2535 }
fe14fcc3 2536
c277df42 2537 NODE_ALIGN_FILL(ret);
a0d0e21e 2538 ptr = ret;
c277df42 2539 FILL_ADVANCE_NODE_ARG(ptr, op, arg);
3280af22 2540 PL_regcode = ptr;
fe14fcc3 2541
a0d0e21e 2542 return(ret);
fe14fcc3
LW
2543}
2544
2545/*
a0ed51b3
LW
2546- regc - emit (if appropriate) a Unicode character
2547*/
2548STATIC void
2549reguni(UV uv, char* s, I32* lenp)
2550{
c485e607 2551 dTHR;
a0ed51b3
LW
2552 if (SIZE_ONLY) {
2553 char tmpbuf[10];
2554 *lenp = uv_to_utf8(tmpbuf, uv) - tmpbuf;
2555 }
2556 else
2557 *lenp = uv_to_utf8(s, uv) - s;
2558
2559}
2560
2561/*
a0d0e21e
LW
2562- regc - emit (if appropriate) a byte of code
2563*/
76e3520e 2564STATIC void
c277df42 2565regc(U8 b, char* s)
a687059c 2566{
5c0ca799 2567 dTHR;
c277df42
IZ
2568 if (!SIZE_ONLY)
2569 *s = b;
a687059c
LW
2570}
2571
2572/*
a0d0e21e
LW
2573- reginsert - insert an operator in front of already-emitted operand
2574*
2575* Means relocating the operand.
2576*/
76e3520e 2577STATIC void
c277df42 2578reginsert(U8 op, regnode *opnd)
a687059c 2579{
5c0ca799 2580 dTHR;
c277df42
IZ
2581 register regnode *src;
2582 register regnode *dst;
2583 register regnode *place;
2584 register int offset = regarglen[(U8)op];
2585
2586/* (regkind[(U8)op] == CURLY ? EXTRA_STEP_2ARGS : 0); */
2587
2588 if (SIZE_ONLY) {
3280af22 2589 PL_regsize += NODE_STEP_REGNODE + offset;
a0d0e21e
LW
2590 return;
2591 }
a687059c 2592
3280af22
NIS
2593 src = PL_regcode;
2594 PL_regcode += NODE_STEP_REGNODE + offset;
2595 dst = PL_regcode;
a0d0e21e 2596 while (src > opnd)
c277df42 2597 StructCopy(--src, --dst, regnode);
a0d0e21e
LW
2598
2599 place = opnd; /* Op node, where operand used to be. */
c277df42
IZ
2600 src = NEXTOPER(place);
2601 FILL_ADVANCE_NODE(place, op);
2602 Zero(src, offset, regnode);
a687059c
LW
2603}
2604
2605/*
c277df42 2606- regtail - set the next-pointer at the end of a node chain of p to val.
a0d0e21e 2607*/
76e3520e 2608STATIC void
c277df42 2609regtail(regnode *p, regnode *val)
a687059c 2610{
5c0ca799 2611 dTHR;
c277df42
IZ
2612 register regnode *scan;
2613 register regnode *temp;
a0d0e21e
LW
2614 register I32 offset;
2615
c277df42 2616 if (SIZE_ONLY)
a0d0e21e
LW
2617 return;
2618
2619 /* Find last node. */
2620 scan = p;
2621 for (;;) {
2622 temp = regnext(scan);
2623 if (temp == NULL)
2624 break;
2625 scan = temp;
2626 }
a687059c 2627
c277df42
IZ
2628 if (reg_off_by_arg[OP(scan)]) {
2629 ARG_SET(scan, val - scan);
a0ed51b3
LW
2630 }
2631 else {
c277df42
IZ
2632 NEXT_OFF(scan) = val - scan;
2633 }
a687059c
LW
2634}
2635
2636/*
a0d0e21e
LW
2637- regoptail - regtail on operand of first argument; nop if operandless
2638*/
76e3520e 2639STATIC void
c277df42 2640regoptail(regnode *p, regnode *val)
a687059c 2641{
5c0ca799 2642 dTHR;
a0d0e21e 2643 /* "Operandless" and "op != BRANCH" are synonymous in practice. */
c277df42
IZ
2644 if (p == NULL || SIZE_ONLY)
2645 return;
2646 if (regkind[(U8)OP(p)] == BRANCH) {
2647 regtail(NEXTOPER(p), val);
a0ed51b3
LW
2648 }
2649 else if ( regkind[(U8)OP(p)] == BRANCHJ) {
c277df42 2650 regtail(NEXTOPER(NEXTOPER(p)), val);
a0ed51b3
LW
2651 }
2652 else
a0d0e21e 2653 return;
a687059c
LW
2654}
2655
2656/*
2657 - regcurly - a little FSA that accepts {\d+,?\d*}
2658 */
79072805 2659STATIC I32
8ac85365 2660regcurly(register char *s)
a687059c
LW
2661{
2662 if (*s++ != '{')
2663 return FALSE;
f0fcb552 2664 if (!isDIGIT(*s))
a687059c 2665 return FALSE;
f0fcb552 2666 while (isDIGIT(*s))
a687059c
LW
2667 s++;
2668 if (*s == ',')
2669 s++;
f0fcb552 2670 while (isDIGIT(*s))
a687059c
LW
2671 s++;
2672 if (*s != '}')
2673 return FALSE;
2674 return TRUE;
2675}
2676
a687059c 2677
76e3520e 2678STATIC regnode *
c277df42
IZ
2679dumpuntil(regnode *start, regnode *node, regnode *last, SV* sv, I32 l)
2680{
35ff7856 2681#ifdef DEBUGGING
c277df42
IZ
2682 register char op = EXACT; /* Arbitrary non-END op. */
2683 register regnode *next, *onode;
2684
2685 while (op != END && (!last || node < last)) {
2686 /* While that wasn't END last time... */
2687
2688 NODE_ALIGN(node);
2689 op = OP(node);
2690 if (op == CLOSE)
2691 l--;
2692 next = regnext(node);
2693 /* Where, what. */
2694 if (OP(node) == OPTIMIZED)
2695 goto after_print;
2696 regprop(sv, node);
54dc92de 2697 PerlIO_printf(Perl_debug_log, "%4d:%*s%s", node - start,
c277df42
IZ
2698 2*l + 1, "", SvPVX(sv));
2699 if (next == NULL) /* Next ptr. */
2700 PerlIO_printf(Perl_debug_log, "(0)");
2701 else
2702 PerlIO_printf(Perl_debug_log, "(%d)", next - start);
2703 (void)PerlIO_putc(Perl_debug_log, '\n');
2704 after_print:
2705 if (regkind[(U8)op] == BRANCHJ) {
2706 register regnode *nnode = (OP(next) == LONGJMP
2707 ? regnext(next)
2708 : next);
2709 if (last && nnode > last)
2710 nnode = last;
2711 node = dumpuntil(start, NEXTOPER(NEXTOPER(node)), nnode, sv, l + 1);
a0ed51b3
LW
2712 }
2713 else if (regkind[(U8)op] == BRANCH) {
c277df42 2714 node = dumpuntil(start, NEXTOPER(node), next, sv, l + 1);
a0ed51b3
LW
2715 }
2716 else if ( op == CURLY) { /* `next' might be very big: optimizer */
c277df42
IZ
2717 node = dumpuntil(start, NEXTOPER(node) + EXTRA_STEP_2ARGS,
2718 NEXTOPER(node) + EXTRA_STEP_2ARGS + 1, sv, l + 1);
a0ed51b3
LW
2719 }
2720 else if (regkind[(U8)op] == CURLY && op != CURLYX) {
c277df42
IZ
2721 node = dumpuntil(start, NEXTOPER(node) + EXTRA_STEP_2ARGS,
2722 next, sv, l + 1);
a0ed51b3
LW
2723 }
2724 else if ( op == PLUS || op == STAR) {
c277df42 2725 node = dumpuntil(start, NEXTOPER(node), NEXTOPER(node) + 1, sv, l + 1);
a0ed51b3
LW
2726 }
2727 else if (op == ANYOF) {
c277df42
IZ
2728 node = NEXTOPER(node);
2729 node += ANY_SKIP;
a0ed51b3
LW
2730 }
2731 else if (regkind[(U8)op] == EXACT) {
c277df42
IZ
2732 /* Literal string, where present. */
2733 node += ((*OPERAND(node)) + 2 + sizeof(regnode) - 1) / sizeof(regnode);
2734 node = NEXTOPER(node);
a0ed51b3
LW
2735 }
2736 else {
c277df42
IZ
2737 node = NEXTOPER(node);
2738 node += regarglen[(U8)op];
2739 }
2740 if (op == CURLYX || op == OPEN)
2741 l++;
2742 else if (op == WHILEM)
2743 l--;
2744 }
17c3b450 2745#endif /* DEBUGGING */
c277df42
IZ
2746 return node;
2747}
2748
a687059c 2749/*
fd181c75 2750 - regdump - dump a regexp onto Perl_debug_log in vaguely comprehensible form
a687059c
LW
2751 */
2752void
8ac85365 2753regdump(regexp *r)
a687059c 2754{
35ff7856 2755#ifdef DEBUGGING
5c0ca799 2756 dTHR;
46fc3d4c 2757 SV *sv = sv_newmortal();
a687059c 2758
c277df42 2759 (void)dumpuntil(r->program, r->program + 1, NULL, sv, 0);
a0d0e21e
LW
2760
2761 /* Header fields of interest. */
c277df42
IZ
2762 if (r->anchored_substr)
2763 PerlIO_printf(Perl_debug_log, "anchored `%s%s%s'%s at %d ",
3280af22 2764 PL_colors[0],
c277df42 2765 SvPVX(r->anchored_substr),
3280af22 2766 PL_colors[1],
c277df42
IZ
2767 SvTAIL(r->anchored_substr) ? "$" : "",
2768 r->anchored_offset);
2769 if (r->float_substr)
2770 PerlIO_printf(Perl_debug_log, "floating `%s%s%s'%s at %d..%u ",
3280af22 2771 PL_colors[0],
c277df42 2772 SvPVX(r->float_substr),
3280af22 2773 PL_colors[1],
c277df42
IZ
2774 SvTAIL(r->float_substr) ? "$" : "",
2775 r->float_min_offset, r->float_max_offset);
2776 if (r->check_substr)
2777 PerlIO_printf(Perl_debug_log,
2778 r->check_substr == r->float_substr
2779 ? "(checking floating" : "(checking anchored");
2780 if (r->reganch & ROPT_NOSCAN)
2781 PerlIO_printf(Perl_debug_log, " noscan");
2782 if (r->reganch & ROPT_CHECK_ALL)
2783 PerlIO_printf(Perl_debug_log, " isall");
2784 if (r->check_substr)
2785 PerlIO_printf(Perl_debug_log, ") ");
2786
46fc3d4c
PP
2787 if (r->regstclass) {
2788 regprop(sv, r->regstclass);
2789 PerlIO_printf(Perl_debug_log, "stclass `%s' ", SvPVX(sv));
2790 }
774d564b
PP
2791 if (r->reganch & ROPT_ANCH) {
2792 PerlIO_printf(Perl_debug_log, "anchored");
2793 if (r->reganch & ROPT_ANCH_BOL)
2794 PerlIO_printf(Perl_debug_log, "(BOL)");
c277df42
IZ
2795 if (r->reganch & ROPT_ANCH_MBOL)
2796 PerlIO_printf(Perl_debug_log, "(MBOL)");
774d564b
PP
2797 if (r->reganch & ROPT_ANCH_GPOS)
2798 PerlIO_printf(Perl_debug_log, "(GPOS)");
2799 PerlIO_putc(Perl_debug_log, ' ');
2800 }
c277df42
IZ
2801 if (r->reganch & ROPT_GPOS_SEEN)
2802 PerlIO_printf(Perl_debug_log, "GPOS ");
a0d0e21e 2803 if (r->reganch & ROPT_SKIP)
760ac839 2804 PerlIO_printf(Perl_debug_log, "plus ");
a0d0e21e 2805 if (r->reganch & ROPT_IMPLICIT)
760ac839 2806 PerlIO_printf(Perl_debug_log, "implicit ");
760ac839 2807 PerlIO_printf(Perl_debug_log, "minlen %ld ", (long) r->minlen);
ce862d02
IZ
2808 if (r->reganch & ROPT_EVAL_SEEN)
2809 PerlIO_printf(Perl_debug_log, "with eval ");
760ac839 2810 PerlIO_printf(Perl_debug_log, "\n");
17c3b450 2811#endif /* DEBUGGING */
a687059c
LW
2812}
2813
2814/*
a0d0e21e
LW
2815- regprop - printable representation of opcode
2816*/
46fc3d4c 2817void
c277df42 2818regprop(SV *sv, regnode *o)
a687059c 2819{
35ff7856 2820#ifdef DEBUGGING
5c0ca799 2821 dTHR;
a0d0e21e
LW
2822 register char *p = 0;
2823
54dc92de 2824 sv_setpvn(sv, "", 0);
11343788 2825 switch (OP(o)) {
a0d0e21e
LW
2826 case BOL:
2827 p = "BOL";
2828 break;
2829 case MBOL:
2830 p = "MBOL";
2831 break;
2832 case SBOL:
2833 p = "SBOL";
2834 break;
2835 case EOL:
2836 p = "EOL";
2837 break;
b85d18e9
IZ
2838 case EOS:
2839 p = "EOS";
2840 break;
a0d0e21e
LW
2841 case MEOL:
2842 p = "MEOL";
2843 break;
2844 case SEOL:
2845 p = "SEOL";
2846 break;
2847 case ANY:
2848 p = "ANY";
2849 break;
2850 case SANY:
2851 p = "SANY";
2852 break;
a0ed51b3
LW
2853 case ANYUTF8:
2854 p = "ANYUTF8";
2855 break;
2856 case SANYUTF8:
2857 p = "SANYUTF8";
2858 break;
2859 case ANYOFUTF8:
2860 p = "ANYOFUTF8";
2861 break;
a0d0e21e
LW
2862 case ANYOF:
2863 p = "ANYOF";
2864 break;
2865 case BRANCH:
2866 p = "BRANCH";
2867 break;
bbce6d69 2868 case EXACT:
3280af22 2869 sv_catpvf(sv, "EXACT <%s%s%s>", PL_colors[0], OPERAND(o) + 1, PL_colors[1]);
bbce6d69
PP
2870 break;
2871 case EXACTF:
3280af22 2872 sv_catpvf(sv, "EXACTF <%s%s%s>", PL_colors[0], OPERAND(o) + 1, PL_colors[1]);
bbce6d69
PP
2873 break;
2874 case EXACTFL:
3280af22 2875 sv_catpvf(sv, "EXACTFL <%s%s%s>", PL_colors[0], OPERAND(o) + 1, PL_colors[1]);
a0d0e21e
LW
2876 break;
2877 case NOTHING:
2878 p = "NOTHING";
2879 break;
c277df42
IZ
2880 case TAIL:
2881 p = "TAIL";
2882 break;
a0d0e21e
LW
2883 case BACK:
2884 p = "BACK";
2885 break;
2886 case END:
2887 p = "END";
2888 break;
a0d0e21e
LW
2889 case BOUND:
2890 p = "BOUND";
2891 break;
bbce6d69
PP
2892 case BOUNDL:
2893 p = "BOUNDL";
2894 break;
a0d0e21e
LW
2895 case NBOUND:
2896 p = "NBOUND";
2897 break;
bbce6d69
PP
2898 case NBOUNDL:
2899 p = "NBOUNDL";
a0d0e21e
LW
2900 break;
2901 case CURLY:
5dc0d613 2902 sv_catpvf(sv, "CURLY {%d,%d}", ARG1(o), ARG2(o));
a0d0e21e 2903 break;
c277df42 2904 case CURLYM:
c277df42 2905 sv_catpvf(sv, "CURLYM[%d] {%d,%d}", o->flags, ARG1(o), ARG2(o));
c277df42
IZ
2906 break;
2907 case CURLYN:
c277df42 2908 sv_catpvf(sv, "CURLYN[%d] {%d,%d}", o->flags, ARG1(o), ARG2(o));
c277df42 2909 break;
a0d0e21e 2910 case CURLYX:
5dc0d613 2911 sv_catpvf(sv, "CURLYX {%d,%d}", ARG1(o), ARG2(o));
a0d0e21e
LW
2912 break;
2913 case REF:
c277df42 2914 sv_catpvf(sv, "REF%d", ARG(o));
a0d0e21e 2915 break;
c8756f30 2916 case REFF:
c277df42 2917 sv_catpvf(sv, "REFF%d", ARG(o));
c8756f30
AK
2918 break;
2919 case REFFL:
c277df42 2920 sv_catpvf(sv, "REFFL%d", ARG(o));
c8756f30 2921 break;
a0d0e21e 2922 case OPEN:
c277df42 2923 sv_catpvf(sv, "OPEN%d", ARG(o));
a0d0e21e
LW
2924 break;
2925 case CLOSE:
c277df42 2926 sv_catpvf(sv, "CLOSE%d", ARG(o));
a0d0e21e
LW
2927 p = NULL;
2928 break;
2929 case STAR:
2930 p = "STAR";
2931 break;
2932 case PLUS:
2933 p = "PLUS";
2934 break;
2935 case MINMOD:
2936 p = "MINMOD";
2937 break;
774d564b
PP
2938 case GPOS:
2939 p = "GPOS";
a0d0e21e
LW
2940 break;
2941 case UNLESSM:
c277df42 2942 sv_catpvf(sv, "UNLESSM[-%d]", o->flags);
a0d0e21e
LW
2943 break;
2944 case IFMATCH:
c277df42 2945 sv_catpvf(sv, "IFMATCH[-%d]", o->flags);
a0d0e21e
LW
2946 break;
2947 case SUCCEED:
2948 p = "SUCCEED";
2949 break;
2950 case WHILEM:
2951 p = "WHILEM";
2952 break;
bbce6d69
PP
2953 case DIGIT:
2954 p = "DIGIT";
2955 break;
2956 case NDIGIT:
2957 p = "NDIGIT";
2958 break;
2959 case ALNUM:
2960 p = "ALNUM";
2961 break;
2962 case NALNUM:
2963 p = "NALNUM";
2964 break;
2965 case SPACE:
2966 p = "SPACE";
2967 break;
2968 case NSPACE:
2969 p = "NSPACE";
2970 break;
2971 case ALNUML:
2972 p = "ALNUML";
2973 break;
2974 case NALNUML:
2975 p = "NALNUML";
2976 break;
2977 case SPACEL:
2978 p = "SPACEL";
2979 break;
2980 case NSPACEL:
2981 p = "NSPACEL";
2982 break;
c277df42
IZ
2983 case EVAL:
2984 p = "EVAL";
2985 break;
2986 case LONGJMP:
2987 p = "LONGJMP";
2988 break;
2989 case BRANCHJ:
2990 p = "BRANCHJ";
2991 break;
2992 case IFTHEN:
2993 p = "IFTHEN";
2994 break;
2995 case GROUPP:
2996 sv_catpvf(sv, "GROUPP%d", ARG(o));
2997 break;
2998 case LOGICAL:
2999 p = "LOGICAL";
3000 break;
3001 case SUSPEND:
3002 p = "SUSPEND";
3003 break;
3004 case RENUM:
3005 p = "RENUM";
3006 break;
3007 case OPTIMIZED:
3008 p = "OPTIMIZED";
3009 break;
a0d0e21e
LW
3010 default:
3011 FAIL("corrupted regexp opcode");
3012 }
46fc3d4c
PP
3013 if (p)
3014 sv_catpv(sv, p);
17c3b450 3015#endif /* DEBUGGING */
35ff7856 3016}
a687059c 3017
2b69d0c2 3018void
8ac85365 3019pregfree(struct regexp *r)
a687059c 3020{
5c0ca799 3021 dTHR;
c277df42 3022 if (!r || (--r->refcnt > 0))
a0d0e21e 3023 return;
c277df42 3024 if (r->precomp)
a0d0e21e 3025 Safefree(r->precomp);
c277df42 3026 if (r->subbase)
a0d0e21e 3027 Safefree(r->subbase);
a193d654
GS
3028 if (r->substrs) {
3029 if (r->anchored_substr)
3030 SvREFCNT_dec(r->anchored_substr);
3031 if (r->float_substr)
3032 SvREFCNT_dec(r->float_substr);
2779dcf1 3033 Safefree(r->substrs);
a193d654 3034 }
c277df42
IZ
3035 if (r->data) {
3036 int n = r->data->count;
3037 while (--n >= 0) {
3038 switch (r->data->what[n]) {
3039 case 's':
3040 SvREFCNT_dec((SV*)r->data->data[n]);
3041 break;
3042 case 'o':
3043 op_free((OP_4tree*)r->data->data[n]);
3044 break;
3045 case 'n':
3046 break;
3047 default:
3048 FAIL2("panic: regfree data code '%c'", r->data->what[n]);
3049 }
3050 }
3051 Safefree(r->data->what);
3052 Safefree(r->data);
a0d0e21e
LW
3053 }
3054 Safefree(r->startp);
3055 Safefree(r->endp);
3056 Safefree(r);
a687059c 3057}
c277df42
IZ
3058
3059/*
3060 - regnext - dig the "next" pointer out of a node
3061 *
3062 * [Note, when REGALIGN is defined there are two places in regmatch()
3063 * that bypass this code for speed.]
3064 */
3065regnode *
3066regnext(register regnode *p)
3067{
5c0ca799 3068 dTHR;
c277df42
IZ
3069 register I32 offset;
3070
3280af22 3071 if (p == &PL_regdummy)
c277df42
IZ
3072 return(NULL);
3073
3074 offset = (reg_off_by_arg[OP(p)] ? ARG(p) : NEXT_OFF(p));
3075 if (offset == 0)
3076 return(NULL);
3077
c277df42 3078 return(p+offset);
c277df42
IZ
3079}
3080
01f988be 3081STATIC void
c277df42 3082re_croak2(const char* pat1,const char* pat2,...)
c277df42
IZ
3083{
3084 va_list args;
3085 STRLEN l1 = strlen(pat1);
3086 STRLEN l2 = strlen(pat2);
3087 char buf[512];
3088 char *message;
3089
3090 if (l1 > 510)
3091 l1 = 510;
3092 if (l1 + l2 > 510)
3093 l2 = 510 - l1;
3094 Copy(pat1, buf, l1 , char);
3095 Copy(pat2, buf + l1, l2 , char);
3b818b81
GS
3096 buf[l1 + l2] = '\n';
3097 buf[l1 + l2 + 1] = '\0';
c277df42 3098 va_start(args, pat2);
c277df42
IZ
3099 message = mess(buf, &args);
3100 va_end(args);
3101 l1 = strlen(message);
3102 if (l1 > 512)
3103 l1 = 512;
3104 Copy(message, buf, l1 , char);
3105 buf[l1] = '\0'; /* Overwrite \n */
3106 croak("%s", buf);
3107}
a0ed51b3
LW
3108
3109/* XXX Here's a total kludge. But we need to re-enter for swash routines. */
3110
3111void
3112save_re_context(void)
c485e607
NIS
3113{
3114 dTHR;
a0ed51b3
LW
3115 SAVEPPTR(PL_bostr);
3116 SAVEPPTR(PL_regprecomp); /* uncompiled string. */
3117 SAVEI32(PL_regnpar); /* () count. */
3118 SAVEI32(PL_regsize); /* Code size. */
3119 SAVEI16(PL_regflags); /* are we folding, multilining? */
3120 SAVEPPTR(PL_reginput); /* String-input pointer. */
3121 SAVEPPTR(PL_regbol); /* Beginning of input, for ^ check. */
3122 SAVEPPTR(PL_regeol); /* End of input, for $ check. */
3123 SAVESPTR(PL_regstartp); /* Pointer to startp array. */
3124 SAVESPTR(PL_regendp); /* Ditto for endp. */
3125 SAVESPTR(PL_reglastparen); /* Similarly for lastparen. */
3126 SAVEPPTR(PL_regtill); /* How far we are required to go. */
3127 SAVEI32(PL_regprev); /* char before regbol, \n if none */
3128 SAVESPTR(PL_reg_start_tmp); /* from regexec.c */
3129 PL_reg_start_tmp = 0;
3130 SAVEFREEPV(PL_reg_start_tmp);
3131 SAVEI32(PL_reg_start_tmpl); /* from regexec.c */
3132 PL_reg_start_tmpl = 0;
3133 SAVESPTR(PL_regdata);
3134 SAVEI32(PL_reg_flags); /* from regexec.c */
3135 SAVEI32(PL_reg_eval_set); /* from regexec.c */
3136 SAVEI32(PL_regnarrate); /* from regexec.c */
3137 SAVESPTR(PL_regprogram); /* from regexec.c */
3138 SAVEINT(PL_regindent); /* from regexec.c */
3139 SAVESPTR(PL_regcc); /* from regexec.c */
3140 SAVESPTR(PL_curcop);
3141 SAVESPTR(PL_regcomp_rx); /* from regcomp.c */
3142 SAVEI32(PL_regseen); /* from regcomp.c */
3143 SAVEI32(PL_regsawback); /* Did we see \1, ...? */
3144 SAVEI32(PL_regnaughty); /* How bad is this pattern? */
3145 SAVESPTR(PL_regcode); /* Code-emit pointer; &regdummy = don't */
3146 SAVEPPTR(PL_regxend); /* End of input for compile */
3147 SAVEPPTR(PL_regcomp_parse); /* Input-scan pointer. */
3148}