This is a live mirror of the Perl 5 development currently hosted at https://github.com/perl/perl5
Separate avhv_foo() key handling into avhv_keys(). Slightly tweaked
[perl5.git] / regcomp.c
CommitLineData
a0d0e21e
LW
1/* regcomp.c
2 */
3
4/*
5 * "A fair jaw-cracker dwarf-language must be." --Samwise Gamgee
6 */
7
a687059c
LW
8/* NOTE: this is derived from Henry Spencer's regexp code, and should not
9 * confused with the original package (see point 3 below). Thanks, Henry!
10 */
11
12/* Additional note: this code is very heavily munged from Henry's version
13 * in places. In some spots I've traded clarity for efficiency, so don't
14 * blame Henry for some of the lack of readability.
15 */
16
e50aee73
AD
17/* The names of the functions have been changed from regcomp and
18 * regexec to pregcomp and pregexec in order to avoid conflicts
19 * with the POSIX routines of the same names.
20*/
21
f0fcb552 22/*SUPPRESS 112*/
a687059c 23/*
e50aee73 24 * pregcomp and pregexec -- regsub and regerror are not used in perl
a687059c
LW
25 *
26 * Copyright (c) 1986 by University of Toronto.
27 * Written by Henry Spencer. Not derived from licensed software.
28 *
29 * Permission is granted to anyone to use this software for any
30 * purpose on any computer system, and to redistribute it freely,
31 * subject to the following restrictions:
32 *
33 * 1. The author is not responsible for the consequences of use of
34 * this software, no matter how awful, even if they arise
35 * from defects in it.
36 *
37 * 2. The origin of this software must not be misrepresented, either
38 * by explicit claim or by omission.
39 *
40 * 3. Altered versions must be plainly marked as such, and must not
41 * be misrepresented as being the original software.
42 *
43 *
44 **** Alterations to Henry's code are...
45 ****
9607fc9c 46 **** Copyright (c) 1991-1997, Larry Wall
a687059c 47 ****
9ef589d8
LW
48 **** You may distribute under the terms of either the GNU General Public
49 **** License or the Artistic License, as specified in the README file.
50
a687059c
LW
51 *
52 * Beware that some of this code is subtly aware of the way operator
53 * precedence is structured in regular expressions. Serious changes in
54 * regular-expression syntax might require a total rethink.
55 */
56#include "EXTERN.h"
57#include "perl.h"
58#include "INTERN.h"
59#include "regcomp.h"
60
11343788
MB
61#ifdef USE_THREADS
62#undef op
63#endif /* USE_THREADS */
64
fe14fcc3
LW
65#ifdef MSDOS
66# if defined(BUGGY_MSC6)
67 /* MSC 6.00A breaks on op/regexp.t test 85 unless we turn this off */
68 # pragma optimize("a",off)
69 /* But MSC 6.00A is happy with 'w', for aliases only across function calls*/
70 # pragma optimize("w",on )
71# endif /* BUGGY_MSC6 */
72#endif /* MSDOS */
73
a687059c
LW
74#ifndef STATIC
75#define STATIC static
76#endif
77
78#define ISMULT1(c) ((c) == '*' || (c) == '+' || (c) == '?')
79#define ISMULT2(s) ((*s) == '*' || (*s) == '+' || (*s) == '?' || \
80 ((*s) == '{' && regcurly(s)))
2b69d0c2
LW
81#ifdef atarist
82#define PERL_META "^$.[()|?+*\\"
83#else
a687059c 84#define META "^$.[()|?+*\\"
2b69d0c2 85#endif
a687059c 86
35c8bce7
LW
87#ifdef SPSTART
88#undef SPSTART /* dratted cpp namespace... */
89#endif
a687059c
LW
90/*
91 * Flags to be passed up and down.
92 */
a687059c 93#define WORST 0 /* Worst case. */
a0d0e21e
LW
94#define HASWIDTH 0x1 /* Known never to match null string. */
95#define SIMPLE 0x2 /* Simple enough to be STAR/PLUS operand. */
96#define SPSTART 0x4 /* Starts with * or +. */
97#define TRYAGAIN 0x8 /* Weeded out a declaration. */
a687059c
LW
98
99/*
e50aee73 100 * Forward declarations for pregcomp()'s friends.
a687059c 101 */
a0d0e21e
LW
102
103static char *reg _((I32, I32 *));
104static char *reganode _((char, unsigned short));
105static char *regatom _((I32 *));
106static char *regbranch _((I32 *));
107static void regc _((char));
108static char *regclass _((void));
109STATIC I32 regcurly _((char *));
110static char *regnode _((char));
111static char *regpiece _((I32 *));
112static void reginsert _((char, char *));
113static void regoptail _((char *, char *));
bbce6d69 114static void regset _((char *, I32));
a0d0e21e 115static void regtail _((char *, char *));
5b5a24f7 116static char* regwhite _((char *, char *));
a0d0e21e 117static char* nextchar _((void));
a687059c
LW
118
119/*
e50aee73 120 - pregcomp - compile a regular expression into internal code
a687059c
LW
121 *
122 * We can't allocate space until we know how big the compiled form will be,
123 * but we can't compile it (and thus know how big it is) until we've got a
124 * place to put the code. So we cheat: we compile it twice, once with code
125 * generation turned off and size counting turned on, and once "for real".
126 * This also means that we don't allocate space until we are sure that the
127 * thing really will compile successfully, and we never have to move the
128 * code and thus invalidate pointers into it. (Note that it has to be in
129 * one piece because free() must be able to free it all.) [NB: not true in perl]
130 *
131 * Beware that the optimization-preparation code in here knows about some
132 * of the structure of the compiled regexp. [I'll say.]
133 */
134regexp *
8ac85365 135pregcomp(char *exp, char *xend, PMOP *pm)
a687059c 136{
a0d0e21e
LW
137 register regexp *r;
138 register char *scan;
139 register SV *longish;
140 SV *longest;
141 register I32 len;
142 register char *first;
143 I32 flags;
144 I32 backish;
145 I32 backest;
146 I32 curback;
147 I32 minlen = 0;
148 I32 sawplus = 0;
149 I32 sawopen = 0;
47109bfb
IZ
150#define MAX_REPEAT_DEPTH 12
151 struct {
152 char *opcode;
153 I32 count;
154 } repeat_stack[MAX_REPEAT_DEPTH];
155 I32 repeat_depth = 0;
156 I32 repeat_count = 1; /* We start unmultiplied. */
a0d0e21e
LW
157
158 if (exp == NULL)
159 croak("NULL regexp argument");
160
bbce6d69 161 regprecomp = savepvn(exp, xend - exp);
a0d0e21e 162 regflags = pm->op_pmflags;
bbce6d69 163 regsawback = 0;
164
165 /* First pass: determine size, legality. */
a0d0e21e
LW
166 regparse = exp;
167 regxend = xend;
a0d0e21e 168 regnaughty = 0;
a0d0e21e
LW
169 regnpar = 1;
170 regsize = 0L;
171 regcode = &regdummy;
172 regc((char)MAGIC);
173 if (reg(0, &flags) == NULL) {
174 Safefree(regprecomp);
175 regprecomp = Nullch;
176 return(NULL);
177 }
178
179 /* Small enough for pointer-storage convention? */
180 if (regsize >= 32767L) /* Probably could be 65535L. */
181 FAIL("regexp too big");
182
bbce6d69 183 /* Allocate space and initialize. */
a0d0e21e
LW
184 Newc(1001, r, sizeof(regexp) + (unsigned)regsize, char, regexp);
185 if (r == NULL)
186 FAIL("regexp out of space");
bbce6d69 187 r->prelen = xend - exp;
a0d0e21e
LW
188 r->precomp = regprecomp;
189 r->subbeg = r->subbase = NULL;
bbce6d69 190
191 /* Second pass: emit code. */
a0d0e21e 192 regparse = exp;
bbce6d69 193 regxend = xend;
194 regnaughty = 0;
a0d0e21e
LW
195 regnpar = 1;
196 regcode = r->program;
197 regc((char)MAGIC);
198 if (reg(0, &flags) == NULL)
199 return(NULL);
200
201 /* Dig out information for optimizations. */
202 pm->op_pmflags = regflags;
a0d0e21e
LW
203 r->regstart = Nullsv; /* Worst-case defaults. */
204 r->reganch = 0;
205 r->regmust = Nullsv;
206 r->regback = -1;
207 r->regstclass = Nullch;
208 r->naughty = regnaughty >= 10; /* Probably an expensive pattern. */
209 scan = r->program+1; /* First BRANCH. */
210 if (OP(regnext(scan)) == END) {/* Only one top-level choice. */
211 scan = NEXTOPER(scan);
212
213 first = scan;
214 while ((OP(first) == OPEN && (sawopen = 1)) ||
215 (OP(first) == BRANCH && OP(regnext(first)) != BRANCH) ||
216 (OP(first) == PLUS) ||
217 (OP(first) == MINMOD) ||
218 (regkind[(U8)OP(first)] == CURLY && ARG1(first) > 0) ) {
219 if (OP(first) == PLUS)
220 sawplus = 1;
221 else
222 first += regarglen[(U8)OP(first)];
223 first = NEXTOPER(first);
a687059c
LW
224 }
225
a0d0e21e
LW
226 /* Starting-point info. */
227 again:
bbce6d69 228 if (OP(first) == EXACT) {
a0d0e21e 229 r->regstart = newSVpv(OPERAND(first)+1,*OPERAND(first));
bbce6d69 230 if (SvCUR(r->regstart) > !sawstudy)
231 fbm_compile(r->regstart);
232 (void)SvUPGRADE(r->regstart, SVt_PVBM);
a0d0e21e
LW
233 }
234 else if (strchr(simple+2,OP(first)))
235 r->regstclass = first;
bbce6d69 236 else if (regkind[(U8)OP(first)] == BOUND ||
237 regkind[(U8)OP(first)] == NBOUND)
a0d0e21e
LW
238 r->regstclass = first;
239 else if (regkind[(U8)OP(first)] == BOL) {
774d564b 240 r->reganch |= ROPT_ANCH_BOL;
a0d0e21e 241 first = NEXTOPER(first);
774d564b 242 goto again;
243 }
244 else if (OP(first) == GPOS) {
245 r->reganch |= ROPT_ANCH_GPOS;
246 first = NEXTOPER(first);
247 goto again;
a0d0e21e
LW
248 }
249 else if ((OP(first) == STAR &&
250 regkind[(U8)OP(NEXTOPER(first))] == ANY) &&
251 !(r->reganch & ROPT_ANCH) )
252 {
253 /* turn .* into ^.* with an implied $*=1 */
774d564b 254 r->reganch |= ROPT_ANCH_BOL | ROPT_IMPLICIT;
a0d0e21e 255 first = NEXTOPER(first);
774d564b 256 goto again;
a0d0e21e
LW
257 }
258 if (sawplus && (!sawopen || !regsawback))
259 r->reganch |= ROPT_SKIP; /* x+ must match 1st of run */
260
fb73857a 261 DEBUG_r(PerlIO_printf(Perl_debug_log, "first %d next %d offset %ld\n",
262 OP(first), OP(NEXTOPER(first)), (long)(first - scan)));
a0d0e21e
LW
263 /*
264 * If there's something expensive in the r.e., find the
265 * longest literal string that must appear and make it the
266 * regmust. Resolve ties in favor of later strings, since
267 * the regstart check works with the beginning of the r.e.
268 * and avoiding duplication strengthens checking. Not a
269 * strong reason, but sufficient in the absence of others.
270 * [Now we resolve ties in favor of the earlier string if
271 * it happens that curback has been invalidated, since the
272 * earlier string may buy us something the later one won't.]
273 */
274 longish = newSVpv("",0);
275 longest = newSVpv("",0);
276 len = 0;
277 minlen = 0;
278 curback = 0;
279 backish = 0;
280 backest = 0;
281 while (OP(scan) != END) {
282 if (OP(scan) == BRANCH) {
283 if (OP(regnext(scan)) == BRANCH) {
284 curback = -30000;
285 while (OP(scan) == BRANCH)
286 scan = regnext(scan);
00bf170e 287 }
a0d0e21e
LW
288 else /* single branch is ok */
289 scan = NEXTOPER(scan);
748a9306 290 continue;
a0d0e21e
LW
291 }
292 if (OP(scan) == UNLESSM) {
293 curback = -30000;
294 scan = regnext(scan);
748a9306 295 continue;
a0d0e21e 296 }
bbce6d69 297 if (OP(scan) == EXACT) {
a0d0e21e 298 char *t;
a687059c 299
a0d0e21e 300 first = scan;
47109bfb 301 while ((t = regnext(scan)) && OP(t) == CLOSE)
a0d0e21e 302 scan = t;
47109bfb 303 minlen += *OPERAND(first) * repeat_count;
a0d0e21e
LW
304 if (curback - backish == len) {
305 sv_catpvn(longish, OPERAND(first)+1,
306 *OPERAND(first));
307 len += *OPERAND(first);
308 curback += *OPERAND(first);
309 first = regnext(scan);
2b69d0c2 310 }
a0d0e21e
LW
311 else if (*OPERAND(first) >= len + (curback >= 0)) {
312 len = *OPERAND(first);
313 sv_setpvn(longish, OPERAND(first)+1,len);
314 backish = curback;
315 curback += len;
316 first = regnext(scan);
9ef589d8 317 }
a0d0e21e
LW
318 else
319 curback += *OPERAND(first);
320 }
321 else if (strchr(varies,OP(scan))) {
47109bfb
IZ
322 int tcount;
323 char *next;
324
325 if (repeat_depth < MAX_REPEAT_DEPTH
326 && ((OP(scan) == PLUS
327 && (tcount = 1)
328 && (next = NEXTOPER(scan)))
329 || (regkind[(U8)OP(scan)] == CURLY
330 && (tcount = ARG1(scan))
331 && (next = NEXTOPER(scan)+4))))
332 {
333 /* We treat (abc)+ as (abc)(abc)*. */
334
335 /* Mark the place to return back. */
336 repeat_stack[repeat_depth].opcode = regnext(scan);
337 repeat_stack[repeat_depth].count = repeat_count;
338 repeat_depth++;
339 repeat_count *= tcount;
340
341 /* Go deeper: */
342 scan = next;
343 continue;
344 }
345 else {
346 curback = -30000;
347 len = 0;
348 if (SvCUR(longish) > SvCUR(longest)) {
349 sv_setsv(longest,longish);
350 backest = backish;
351 }
352 sv_setpvn(longish,"",0);
353 }
354 }
355 else if (strchr(simple,OP(scan))) {
356 curback++;
357 minlen += repeat_count;
a687059c 358 len = 0;
a0d0e21e 359 if (SvCUR(longish) > SvCUR(longest)) {
79072805 360 sv_setsv(longest,longish);
34de22dd
LW
361 backest = backish;
362 }
a0d0e21e 363 sv_setpvn(longish,"",0);
a0d0e21e 364 }
47109bfb
IZ
365 scan = regnext(scan);
366 if (!scan) { /* Go up PLUS or CURLY. */
367 if (!repeat_depth--)
368 croak("panic: re scan");
369 scan = repeat_stack[repeat_depth].opcode;
370 repeat_count = repeat_stack[repeat_depth].count;
371 /* Need to submit the longest string found: */
372 curback = -30000;
a0d0e21e
LW
373 len = 0;
374 if (SvCUR(longish) > SvCUR(longest)) {
375 sv_setsv(longest,longish);
376 backest = backish;
fe14fcc3 377 }
a0d0e21e
LW
378 sv_setpvn(longish,"",0);
379 }
a687059c
LW
380 }
381
a0d0e21e
LW
382 /* Prefer earlier on tie, unless we can tail match latter */
383
47109bfb 384 if (SvCUR(longish) + (first && regkind[(U8)OP(first)] == EOL)
bbce6d69 385 > SvCUR(longest))
a0d0e21e
LW
386 {
387 sv_setsv(longest,longish);
388 backest = backish;
389 }
390 else
391 sv_setpvn(longish,"",0);
392 if (SvCUR(longest)
bbce6d69 393 && (!r->regstart
394 || !fbm_instr((unsigned char*) SvPVX(r->regstart),
395 (unsigned char *) (SvPVX(r->regstart)
396 + SvCUR(r->regstart)),
397 longest)))
a0d0e21e
LW
398 {
399 r->regmust = longest;
400 if (backest < 0)
401 backest = -1;
402 r->regback = backest;
47109bfb
IZ
403 if (SvCUR(longest) > !(sawstudy ||
404 (first && regkind[(U8)OP(first)] == EOL)))
bbce6d69 405 fbm_compile(r->regmust);
a0d0e21e
LW
406 (void)SvUPGRADE(r->regmust, SVt_PVBM);
407 BmUSEFUL(r->regmust) = 100;
47109bfb 408 if (first && regkind[(U8)OP(first)] == EOL && SvCUR(longish))
a0d0e21e
LW
409 SvTAIL_on(r->regmust);
410 }
411 else {
412 SvREFCNT_dec(longest);
413 longest = Nullsv;
414 }
415 SvREFCNT_dec(longish);
416 }
417
a0d0e21e
LW
418 r->nparens = regnpar - 1;
419 r->minlen = minlen;
420 Newz(1002, r->startp, regnpar, char*);
421 Newz(1002, r->endp, regnpar, char*);
422 DEBUG_r(regdump(r));
423 return(r);
a687059c
LW
424}
425
426/*
427 - reg - regular expression, i.e. main body or parenthesized thing
428 *
429 * Caller must absorb opening parenthesis.
430 *
431 * Combining parenthesis handling with the base level of regular expression
432 * is a trifle forced, but the need to tie the tails of the branches to what
433 * follows makes it hard to avoid.
434 */
435static char *
8ac85365
NIS
436reg(I32 paren, I32 *flagp)
437 /* Parenthesized? */
438
a687059c 439{
a0d0e21e
LW
440 register char *ret;
441 register char *br;
442 register char *ender = 0;
443 register I32 parno = 0;
444 I32 flags;
445
446 *flagp = HASWIDTH; /* Tentatively. */
447
448 /* Make an OPEN node, if parenthesized. */
449 if (paren) {
450 if (*regparse == '?') {
451 regparse++;
748a9306 452 paren = *regparse++;
a0d0e21e
LW
453 ret = NULL;
454 switch (paren) {
455 case ':':
456 case '=':
457 case '!':
458 break;
459 case '$':
460 case '@':
ff0cee69 461 croak("Sequence (?%c...) not implemented", (int)paren);
a0d0e21e
LW
462 break;
463 case '#':
464 while (*regparse && *regparse != ')')
465 regparse++;
466 if (*regparse != ')')
748a9306 467 croak("Sequence (?#... not terminated");
a0d0e21e
LW
468 nextchar();
469 *flagp = TRYAGAIN;
470 return NULL;
1b1626e4
MG
471 case 0:
472 croak("Sequence (? incomplete");
473 break;
a0d0e21e
LW
474 default:
475 --regparse;
c90c0ff4 476 while (*regparse && strchr("iogcmsx", *regparse))
a0d0e21e
LW
477 pmflag(&regflags, *regparse++);
478 if (*regparse != ')')
479 croak("Sequence (?%c...) not recognized", *regparse);
480 nextchar();
481 *flagp = TRYAGAIN;
482 return NULL;
483 }
484 }
485 else {
486 parno = regnpar;
487 regnpar++;
488 ret = reganode(OPEN, parno);
489 }
490 } else
491 ret = NULL;
492
493 /* Pick up the branches, linking them together. */
494 br = regbranch(&flags);
495 if (br == NULL)
496 return(NULL);
497 if (ret != NULL)
498 regtail(ret, br); /* OPEN -> first. */
499 else
500 ret = br;
501 if (!(flags&HASWIDTH))
502 *flagp &= ~HASWIDTH;
503 *flagp |= flags&SPSTART;
504 while (*regparse == '|') {
505 nextchar();
a687059c
LW
506 br = regbranch(&flags);
507 if (br == NULL)
a0d0e21e
LW
508 return(NULL);
509 regtail(ret, br); /* BRANCH -> BRANCH. */
a687059c 510 if (!(flags&HASWIDTH))
a0d0e21e 511 *flagp &= ~HASWIDTH;
a687059c 512 *flagp |= flags&SPSTART;
a0d0e21e
LW
513 }
514
515 /* Make a closing node, and hook it on the end. */
516 switch (paren) {
517 case ':':
518 ender = regnode(NOTHING);
519 break;
520 case 1:
521 ender = reganode(CLOSE, parno);
522 break;
523 case '=':
524 case '!':
525 ender = regnode(SUCCEED);
526 *flagp &= ~HASWIDTH;
527 break;
528 case 0:
529 ender = regnode(END);
530 break;
531 }
532 regtail(ret, ender);
533
534 /* Hook the tails of the branches to the closing node. */
535 for (br = ret; br != NULL; br = regnext(br))
536 regoptail(br, ender);
537
538 if (paren == '=') {
539 reginsert(IFMATCH,ret);
540 regtail(ret, regnode(NOTHING));
541 }
542 else if (paren == '!') {
543 reginsert(UNLESSM,ret);
544 regtail(ret, regnode(NOTHING));
545 }
546
547 /* Check for proper termination. */
4633a7c4 548 if (paren && (regparse >= regxend || *nextchar() != ')')) {
a0d0e21e
LW
549 FAIL("unmatched () in regexp");
550 } else if (!paren && regparse < regxend) {
551 if (*regparse == ')') {
552 FAIL("unmatched () in regexp");
553 } else
554 FAIL("junk on end of regexp"); /* "Can't happen". */
555 /* NOTREACHED */
556 }
a687059c 557
a0d0e21e 558 return(ret);
a687059c
LW
559}
560
561/*
562 - regbranch - one alternative of an | operator
563 *
564 * Implements the concatenation operator.
565 */
566static char *
8ac85365 567regbranch(I32 *flagp)
a687059c 568{
a0d0e21e
LW
569 register char *ret;
570 register char *chain;
571 register char *latest;
572 I32 flags = 0;
573
574 *flagp = WORST; /* Tentatively. */
575
576 ret = regnode(BRANCH);
577 chain = NULL;
578 regparse--;
579 nextchar();
580 while (regparse < regxend && *regparse != '|' && *regparse != ')') {
581 flags &= ~TRYAGAIN;
582 latest = regpiece(&flags);
583 if (latest == NULL) {
584 if (flags & TRYAGAIN)
585 continue;
586 return(NULL);
587 }
588 *flagp |= flags&HASWIDTH;
589 if (chain == NULL) /* First piece. */
590 *flagp |= flags&SPSTART;
591 else {
592 regnaughty++;
593 regtail(chain, latest);
a687059c 594 }
a0d0e21e
LW
595 chain = latest;
596 }
597 if (chain == NULL) /* Loop ran zero times. */
598 (void) regnode(NOTHING);
a687059c 599
a0d0e21e 600 return(ret);
a687059c
LW
601}
602
603/*
604 - regpiece - something followed by possible [*+?]
605 *
606 * Note that the branching code sequences used for ? and the general cases
607 * of * and + are somewhat optimized: they use the same NOTHING node as
608 * both the endmarker for their branch list and the body of the last branch.
609 * It might seem that this node could be dispensed with entirely, but the
610 * endmarker role is not redundant.
611 */
612static char *
8ac85365 613regpiece(I32 *flagp)
a687059c 614{
a0d0e21e
LW
615 register char *ret;
616 register char op;
617 register char *next;
618 I32 flags;
619 char *origparse = regparse;
620 char *maxpos;
621 I32 min;
622 I32 max = 32767;
623
624 ret = regatom(&flags);
625 if (ret == NULL) {
626 if (flags & TRYAGAIN)
627 *flagp |= TRYAGAIN;
628 return(NULL);
629 }
630
631 op = *regparse;
632 if (op == '(' && regparse[1] == '?' && regparse[2] == '#') {
633 while (op && op != ')')
634 op = *++regparse;
635 if (op) {
636 nextchar();
637 op = *regparse;
638 }
639 }
640
641 if (op == '{' && regcurly(regparse)) {
642 next = regparse + 1;
643 maxpos = Nullch;
644 while (isDIGIT(*next) || *next == ',') {
645 if (*next == ',') {
646 if (maxpos)
647 break;
648 else
649 maxpos = next;
a687059c 650 }
a0d0e21e
LW
651 next++;
652 }
653 if (*next == '}') { /* got one */
654 if (!maxpos)
655 maxpos = next;
656 regparse++;
657 min = atoi(regparse);
658 if (*maxpos == ',')
659 maxpos++;
660 else
661 maxpos = regparse;
662 max = atoi(maxpos);
663 if (!max && *maxpos != '0')
664 max = 32767; /* meaning "infinity" */
665 regparse = next;
666 nextchar();
667
668 do_curly:
669 if ((flags&SIMPLE)) {
670 regnaughty += 2 + regnaughty / 2;
671 reginsert(CURLY, ret);
672 }
673 else {
674 regnaughty += 4 + regnaughty; /* compound interest */
675 regtail(ret, regnode(WHILEM));
676 reginsert(CURLYX,ret);
677 regtail(ret, regnode(NOTHING));
678 }
679
680 if (min > 0)
681 *flagp = (WORST|HASWIDTH);
682 if (max && max < min)
683 croak("Can't do {n,m} with n > m");
684 if (regcode != &regdummy) {
00bf170e 685#ifdef REGALIGN
a0d0e21e
LW
686 *(unsigned short *)(ret+3) = min;
687 *(unsigned short *)(ret+5) = max;
00bf170e 688#else
a0d0e21e
LW
689 ret[3] = min >> 8; ret[4] = min & 0377;
690 ret[5] = max >> 8; ret[6] = max & 0377;
00bf170e 691#endif
a687059c 692 }
a687059c 693
a0d0e21e 694 goto nest_check;
a687059c 695 }
a0d0e21e 696 }
a687059c 697
a0d0e21e
LW
698 if (!ISMULT1(op)) {
699 *flagp = flags;
a687059c 700 return(ret);
a0d0e21e 701 }
bb20fd44
SP
702
703 if (!(flags&HASWIDTH) && op != '?')
fb73857a 704 FAIL("regexp *+ operand could be empty"); /* else may core dump */
bb20fd44 705
a0d0e21e
LW
706 nextchar();
707
708 *flagp = (op != '+') ? (WORST|SPSTART) : (WORST|HASWIDTH);
709
710 if (op == '*' && (flags&SIMPLE)) {
711 reginsert(STAR, ret);
712 regnaughty += 4;
713 }
714 else if (op == '*') {
715 min = 0;
716 goto do_curly;
717 } else if (op == '+' && (flags&SIMPLE)) {
718 reginsert(PLUS, ret);
719 regnaughty += 3;
720 }
721 else if (op == '+') {
722 min = 1;
723 goto do_curly;
724 } else if (op == '?') {
725 min = 0; max = 1;
726 goto do_curly;
727 }
728 nest_check:
729 if (dowarn && regcode != &regdummy && !(flags&HASWIDTH) && max > 10000) {
730 warn("%.*s matches null string many times",
731 regparse - origparse, origparse);
732 }
733
734 if (*regparse == '?') {
735 nextchar();
736 reginsert(MINMOD, ret);
737#ifdef REGALIGN
738 regtail(ret, ret + 4);
739#else
740 regtail(ret, ret + 3);
741#endif
742 }
743 if (ISMULT2(regparse))
744 FAIL("nested *?+ in regexp");
745
746 return(ret);
a687059c
LW
747}
748
749/*
750 - regatom - the lowest level
751 *
752 * Optimization: gobbles an entire sequence of ordinary characters so that
753 * it can turn them into a single node, which is smaller to store and
754 * faster to run. Backslashed characters are exceptions, each becoming a
755 * separate node; the code is simpler that way and it's not worth fixing.
756 *
757 * [Yes, it is worth fixing, some scripts can run twice the speed.]
758 */
759static char *
8ac85365 760regatom(I32 *flagp)
a687059c 761{
a0d0e21e
LW
762 register char *ret = 0;
763 I32 flags;
764
765 *flagp = WORST; /* Tentatively. */
766
767tryagain:
768 switch (*regparse) {
769 case '^':
770 nextchar();
771 if (regflags & PMf_MULTILINE)
772 ret = regnode(MBOL);
773 else if (regflags & PMf_SINGLELINE)
774 ret = regnode(SBOL);
775 else
776 ret = regnode(BOL);
777 break;
778 case '$':
779 nextchar();
780 if (regflags & PMf_MULTILINE)
781 ret = regnode(MEOL);
782 else if (regflags & PMf_SINGLELINE)
783 ret = regnode(SEOL);
784 else
785 ret = regnode(EOL);
786 break;
787 case '.':
788 nextchar();
789 if (regflags & PMf_SINGLELINE)
790 ret = regnode(SANY);
791 else
792 ret = regnode(ANY);
793 regnaughty++;
794 *flagp |= HASWIDTH|SIMPLE;
795 break;
796 case '[':
797 regparse++;
798 ret = regclass();
799 *flagp |= HASWIDTH|SIMPLE;
800 break;
801 case '(':
802 nextchar();
803 ret = reg(1, &flags);
804 if (ret == NULL) {
805 if (flags & TRYAGAIN)
806 goto tryagain;
807 return(NULL);
808 }
809 *flagp |= flags&(HASWIDTH|SPSTART);
810 break;
811 case '|':
812 case ')':
813 if (flags & TRYAGAIN) {
814 *flagp |= TRYAGAIN;
815 return NULL;
816 }
817 croak("internal urp in regexp at /%s/", regparse);
818 /* Supposed to be caught earlier. */
819 break;
85afd4ae
CS
820 case '{':
821 if (!regcurly(regparse)) {
822 regparse++;
823 goto defchar;
824 }
825 /* FALL THROUGH */
a0d0e21e
LW
826 case '?':
827 case '+':
828 case '*':
3115e423 829 FAIL("?+*{} follows nothing in regexp");
a0d0e21e
LW
830 break;
831 case '\\':
832 switch (*++regparse) {
833 case 'A':
834 ret = regnode(SBOL);
835 *flagp |= SIMPLE;
836 nextchar();
837 break;
838 case 'G':
774d564b 839 ret = regnode(GPOS);
a0d0e21e
LW
840 *flagp |= SIMPLE;
841 nextchar();
842 break;
843 case 'Z':
844 ret = regnode(SEOL);
845 *flagp |= SIMPLE;
846 nextchar();
847 break;
848 case 'w':
bbce6d69 849 ret = regnode((regflags & PMf_LOCALE) ? ALNUML : ALNUM);
a0d0e21e
LW
850 *flagp |= HASWIDTH|SIMPLE;
851 nextchar();
852 break;
853 case 'W':
bbce6d69 854 ret = regnode((regflags & PMf_LOCALE) ? NALNUML : NALNUM);
a0d0e21e
LW
855 *flagp |= HASWIDTH|SIMPLE;
856 nextchar();
857 break;
858 case 'b':
bbce6d69 859 ret = regnode((regflags & PMf_LOCALE) ? BOUNDL : BOUND);
a0d0e21e
LW
860 *flagp |= SIMPLE;
861 nextchar();
862 break;
863 case 'B':
bbce6d69 864 ret = regnode((regflags & PMf_LOCALE) ? NBOUNDL : NBOUND);
a0d0e21e
LW
865 *flagp |= SIMPLE;
866 nextchar();
867 break;
868 case 's':
bbce6d69 869 ret = regnode((regflags & PMf_LOCALE) ? SPACEL : SPACE);
a0d0e21e
LW
870 *flagp |= HASWIDTH|SIMPLE;
871 nextchar();
872 break;
873 case 'S':
bbce6d69 874 ret = regnode((regflags & PMf_LOCALE) ? NSPACEL : NSPACE);
a0d0e21e
LW
875 *flagp |= HASWIDTH|SIMPLE;
876 nextchar();
877 break;
878 case 'd':
879 ret = regnode(DIGIT);
880 *flagp |= HASWIDTH|SIMPLE;
881 nextchar();
882 break;
883 case 'D':
884 ret = regnode(NDIGIT);
885 *flagp |= HASWIDTH|SIMPLE;
886 nextchar();
887 break;
888 case 'n':
889 case 'r':
890 case 't':
891 case 'f':
892 case 'e':
893 case 'a':
894 case 'x':
895 case 'c':
896 case '0':
897 goto defchar;
898 case '1': case '2': case '3': case '4':
899 case '5': case '6': case '7': case '8': case '9':
900 {
901 I32 num = atoi(regparse);
902
903 if (num > 9 && num >= regnpar)
904 goto defchar;
905 else {
906 regsawback = 1;
c8756f30
AK
907 ret = reganode((regflags & PMf_FOLD)
908 ? ((regflags & PMf_LOCALE) ? REFFL : REFF)
909 : REF, num);
a0d0e21e
LW
910 *flagp |= HASWIDTH;
911 while (isDIGIT(*regparse))
a687059c 912 regparse++;
a0d0e21e
LW
913 regparse--;
914 nextchar();
915 }
916 }
917 break;
918 case '\0':
919 if (regparse >= regxend)
920 FAIL("trailing \\ in regexp");
921 /* FALL THROUGH */
922 default:
923 goto defchar;
924 }
925 break;
4633a7c4
LW
926
927 case '#':
928 if (regflags & PMf_EXTENDED) {
929 while (regparse < regxend && *regparse != '\n') regparse++;
930 if (regparse < regxend)
931 goto tryagain;
932 }
933 /* FALL THROUGH */
934
a0d0e21e
LW
935 default: {
936 register I32 len;
937 register char ender;
938 register char *p;
939 char *oldp;
940 I32 numlen;
941
942 regparse++;
943
944 defchar:
bbce6d69 945 ret = regnode((regflags & PMf_FOLD)
946 ? ((regflags & PMf_LOCALE) ? EXACTFL : EXACTF)
947 : EXACT);
a0d0e21e
LW
948 regc(0); /* save spot for len */
949 for (len = 0, p = regparse - 1;
950 len < 127 && p < regxend;
951 len++)
952 {
953 oldp = p;
5b5a24f7
CS
954
955 if (regflags & PMf_EXTENDED)
956 p = regwhite(p, regxend);
a0d0e21e
LW
957 switch (*p) {
958 case '^':
959 case '$':
960 case '.':
961 case '[':
962 case '(':
963 case ')':
964 case '|':
965 goto loopdone;
966 case '\\':
967 switch (*++p) {
968 case 'A':
969 case 'G':
970 case 'Z':
971 case 'w':
972 case 'W':
973 case 'b':
974 case 'B':
975 case 's':
976 case 'S':
977 case 'd':
978 case 'D':
979 --p;
980 goto loopdone;
981 case 'n':
982 ender = '\n';
983 p++;
a687059c 984 break;
a0d0e21e
LW
985 case 'r':
986 ender = '\r';
987 p++;
a687059c 988 break;
a0d0e21e
LW
989 case 't':
990 ender = '\t';
991 p++;
a687059c 992 break;
a0d0e21e
LW
993 case 'f':
994 ender = '\f';
995 p++;
a687059c 996 break;
a0d0e21e
LW
997 case 'e':
998 ender = '\033';
999 p++;
a687059c 1000 break;
a0d0e21e
LW
1001 case 'a':
1002 ender = '\007';
1003 p++;
a687059c 1004 break;
a0d0e21e
LW
1005 case 'x':
1006 ender = scan_hex(++p, 2, &numlen);
1007 p += numlen;
a687059c 1008 break;
a0d0e21e
LW
1009 case 'c':
1010 p++;
bbce6d69 1011 ender = UCHARAT(p++);
1012 ender = toCTRL(ender);
a687059c 1013 break;
a0d0e21e
LW
1014 case '0': case '1': case '2': case '3':case '4':
1015 case '5': case '6': case '7': case '8':case '9':
1016 if (*p == '0' ||
1017 (isDIGIT(p[1]) && atoi(p) >= regnpar) ) {
1018 ender = scan_oct(p, 3, &numlen);
1019 p += numlen;
1020 }
1021 else {
1022 --p;
1023 goto loopdone;
a687059c
LW
1024 }
1025 break;
a0d0e21e
LW
1026 case '\0':
1027 if (p >= regxend)
a687059c
LW
1028 FAIL("trailing \\ in regexp");
1029 /* FALL THROUGH */
a0d0e21e
LW
1030 default:
1031 ender = *p++;
1032 break;
1033 }
1034 break;
a687059c 1035 default:
a0d0e21e
LW
1036 ender = *p++;
1037 break;
a687059c 1038 }
5b5a24f7
CS
1039 if (regflags & PMf_EXTENDED)
1040 p = regwhite(p, regxend);
a0d0e21e
LW
1041 if (ISMULT2(p)) { /* Back off on ?+*. */
1042 if (len)
1043 p = oldp;
1044 else {
1045 len++;
1046 regc(ender);
1047 }
1048 break;
a687059c 1049 }
a0d0e21e
LW
1050 regc(ender);
1051 }
1052 loopdone:
1053 regparse = p - 1;
1054 nextchar();
1055 if (len < 0)
1056 FAIL("internal disaster in regexp");
1057 if (len > 0)
1058 *flagp |= HASWIDTH;
1059 if (len == 1)
1060 *flagp |= SIMPLE;
1061 if (regcode != &regdummy)
1062 *OPERAND(ret) = len;
1063 regc('\0');
a687059c 1064 }
a0d0e21e
LW
1065 break;
1066 }
a687059c 1067
a0d0e21e 1068 return(ret);
a687059c
LW
1069}
1070
5b5a24f7 1071static char *
8ac85365 1072regwhite(char *p, char *e)
5b5a24f7
CS
1073{
1074 while (p < e) {
1075 if (isSPACE(*p))
1076 ++p;
1077 else if (*p == '#') {
1078 do {
1079 p++;
1080 } while (p < e && *p != '\n');
1081 }
1082 else
1083 break;
1084 }
1085 return p;
1086}
1087
a687059c 1088static void
8ac85365 1089regset(char *opnd, register I32 c)
a687059c 1090{
bbce6d69 1091 if (opnd == &regdummy)
1092 return;
1093 c &= 0xFF;
1094 opnd[1 + (c >> 3)] |= (1 << (c & 7));
a687059c
LW
1095}
1096
1097static char *
8ac85365 1098regclass(void)
a687059c 1099{
bbce6d69 1100 register char *opnd;
8ac85365 1101 register I32 Class;
a0d0e21e
LW
1102 register I32 lastclass = 1234;
1103 register I32 range = 0;
1104 register char *ret;
1105 register I32 def;
1106 I32 numlen;
1107
1108 ret = regnode(ANYOF);
bbce6d69 1109 opnd = regcode;
8ac85365 1110 for (Class = 0; Class < 33; Class++)
bbce6d69 1111 regc(0);
a0d0e21e
LW
1112 if (*regparse == '^') { /* Complement of range. */
1113 regnaughty++;
1114 regparse++;
bbce6d69 1115 if (opnd != &regdummy)
1116 *opnd |= ANYOF_INVERT;
1117 }
1118 if (opnd != &regdummy) {
1119 if (regflags & PMf_FOLD)
1120 *opnd |= ANYOF_FOLD;
1121 if (regflags & PMf_LOCALE)
1122 *opnd |= ANYOF_LOCALE;
a0d0e21e 1123 }
a0d0e21e
LW
1124 if (*regparse == ']' || *regparse == '-')
1125 goto skipcond; /* allow 1st char to be ] or - */
1126 while (regparse < regxend && *regparse != ']') {
1127 skipcond:
8ac85365
NIS
1128 Class = UCHARAT(regparse++);
1129 if (Class == '\\') {
1130 Class = UCHARAT(regparse++);
1131 switch (Class) {
a0d0e21e 1132 case 'w':
bbce6d69 1133 if (regflags & PMf_LOCALE) {
1134 if (opnd != &regdummy)
1135 *opnd |= ANYOF_ALNUML;
1136 }
1137 else {
8ac85365
NIS
1138 for (Class = 0; Class < 256; Class++)
1139 if (isALNUM(Class))
1140 regset(opnd, Class);
bbce6d69 1141 }
a0d0e21e
LW
1142 lastclass = 1234;
1143 continue;
1144 case 'W':
bbce6d69 1145 if (regflags & PMf_LOCALE) {
1146 if (opnd != &regdummy)
1147 *opnd |= ANYOF_NALNUML;
1148 }
1149 else {
8ac85365
NIS
1150 for (Class = 0; Class < 256; Class++)
1151 if (!isALNUM(Class))
1152 regset(opnd, Class);
bbce6d69 1153 }
a0d0e21e
LW
1154 lastclass = 1234;
1155 continue;
1156 case 's':
bbce6d69 1157 if (regflags & PMf_LOCALE) {
1158 if (opnd != &regdummy)
1159 *opnd |= ANYOF_SPACEL;
1160 }
1161 else {
8ac85365
NIS
1162 for (Class = 0; Class < 256; Class++)
1163 if (isSPACE(Class))
1164 regset(opnd, Class);
bbce6d69 1165 }
a0d0e21e
LW
1166 lastclass = 1234;
1167 continue;
1168 case 'S':
bbce6d69 1169 if (regflags & PMf_LOCALE) {
1170 if (opnd != &regdummy)
1171 *opnd |= ANYOF_NSPACEL;
1172 }
1173 else {
8ac85365
NIS
1174 for (Class = 0; Class < 256; Class++)
1175 if (!isSPACE(Class))
1176 regset(opnd, Class);
bbce6d69 1177 }
a0d0e21e
LW
1178 lastclass = 1234;
1179 continue;
1180 case 'd':
8ac85365
NIS
1181 for (Class = '0'; Class <= '9'; Class++)
1182 regset(opnd, Class);
a0d0e21e
LW
1183 lastclass = 1234;
1184 continue;
1185 case 'D':
8ac85365
NIS
1186 for (Class = 0; Class < '0'; Class++)
1187 regset(opnd, Class);
1188 for (Class = '9' + 1; Class < 256; Class++)
1189 regset(opnd, Class);
a0d0e21e
LW
1190 lastclass = 1234;
1191 continue;
1192 case 'n':
8ac85365 1193 Class = '\n';
a0d0e21e
LW
1194 break;
1195 case 'r':
8ac85365 1196 Class = '\r';
a0d0e21e
LW
1197 break;
1198 case 't':
8ac85365 1199 Class = '\t';
a0d0e21e
LW
1200 break;
1201 case 'f':
8ac85365 1202 Class = '\f';
a0d0e21e
LW
1203 break;
1204 case 'b':
8ac85365 1205 Class = '\b';
a0d0e21e
LW
1206 break;
1207 case 'e':
8ac85365 1208 Class = '\033';
a0d0e21e
LW
1209 break;
1210 case 'a':
8ac85365 1211 Class = '\007';
a0d0e21e
LW
1212 break;
1213 case 'x':
8ac85365 1214 Class = scan_hex(regparse, 2, &numlen);
a0d0e21e
LW
1215 regparse += numlen;
1216 break;
1217 case 'c':
8ac85365
NIS
1218 Class = UCHARAT(regparse++);
1219 Class = toCTRL(Class);
a0d0e21e
LW
1220 break;
1221 case '0': case '1': case '2': case '3': case '4':
1222 case '5': case '6': case '7': case '8': case '9':
8ac85365 1223 Class = scan_oct(--regparse, 3, &numlen);
a0d0e21e
LW
1224 regparse += numlen;
1225 break;
1226 }
1227 }
1228 if (range) {
8ac85365 1229 if (lastclass > Class)
a0d0e21e
LW
1230 FAIL("invalid [] range in regexp");
1231 range = 0;
1232 }
1233 else {
8ac85365 1234 lastclass = Class;
a0d0e21e
LW
1235 if (*regparse == '-' && regparse+1 < regxend &&
1236 regparse[1] != ']') {
a687059c 1237 regparse++;
a0d0e21e
LW
1238 range = 1;
1239 continue; /* do it next time */
1240 }
a687059c 1241 }
8ac85365 1242 for ( ; lastclass <= Class; lastclass++)
bbce6d69 1243 regset(opnd, lastclass);
8ac85365 1244 lastclass = Class;
a0d0e21e
LW
1245 }
1246 if (*regparse != ']')
1247 FAIL("unmatched [] in regexp");
1248 nextchar();
1249 return ret;
1250}
1251
1252static char*
8ac85365 1253nextchar(void)
a0d0e21e
LW
1254{
1255 char* retval = regparse++;
1256
4633a7c4
LW
1257 for (;;) {
1258 if (*regparse == '(' && regparse[1] == '?' &&
1259 regparse[2] == '#') {
1260 while (*regparse && *regparse != ')')
748a9306 1261 regparse++;
4633a7c4
LW
1262 regparse++;
1263 continue;
1264 }
1265 if (regflags & PMf_EXTENDED) {
1266 if (isSPACE(*regparse)) {
748a9306
LW
1267 regparse++;
1268 continue;
1269 }
1270 else if (*regparse == '#') {
1271 while (*regparse && *regparse != '\n')
1272 regparse++;
1273 regparse++;
1274 continue;
1275 }
748a9306 1276 }
4633a7c4 1277 return retval;
a0d0e21e 1278 }
a687059c
LW
1279}
1280
1281/*
a0d0e21e
LW
1282- regnode - emit a node
1283*/
1284#ifdef CAN_PROTOTYPE
1285static char * /* Location. */
1286regnode(char op)
1287#else
a687059c
LW
1288static char * /* Location. */
1289regnode(op)
1290char op;
a0d0e21e 1291#endif
a687059c 1292{
a0d0e21e
LW
1293 register char *ret;
1294 register char *ptr;
a687059c 1295
a0d0e21e
LW
1296 ret = regcode;
1297 if (ret == &regdummy) {
a687059c 1298#ifdef REGALIGN
a0d0e21e
LW
1299 if (!(regsize & 1))
1300 regsize++;
a687059c 1301#endif
a0d0e21e
LW
1302 regsize += 3;
1303 return(ret);
1304 }
a687059c
LW
1305
1306#ifdef REGALIGN
1307#ifndef lint
a0d0e21e
LW
1308 if (!((long)ret & 1))
1309 *ret++ = 127;
a687059c
LW
1310#endif
1311#endif
a0d0e21e
LW
1312 ptr = ret;
1313 *ptr++ = op;
1314 *ptr++ = '\0'; /* Null "next" pointer. */
1315 *ptr++ = '\0';
1316 regcode = ptr;
a687059c 1317
a0d0e21e 1318 return(ret);
a687059c
LW
1319}
1320
1321/*
a0d0e21e
LW
1322- reganode - emit a node with an argument
1323*/
1324#ifdef CAN_PROTOTYPE
1325static char * /* Location. */
1326reganode(char op, unsigned short arg)
1327#else
fe14fcc3
LW
1328static char * /* Location. */
1329reganode(op, arg)
1330char op;
1331unsigned short arg;
a0d0e21e 1332#endif
fe14fcc3 1333{
a0d0e21e
LW
1334 register char *ret;
1335 register char *ptr;
fe14fcc3 1336
a0d0e21e
LW
1337 ret = regcode;
1338 if (ret == &regdummy) {
fe14fcc3 1339#ifdef REGALIGN
a0d0e21e
LW
1340 if (!(regsize & 1))
1341 regsize++;
fe14fcc3 1342#endif
a0d0e21e
LW
1343 regsize += 5;
1344 return(ret);
1345 }
fe14fcc3
LW
1346
1347#ifdef REGALIGN
1348#ifndef lint
a0d0e21e
LW
1349 if (!((long)ret & 1))
1350 *ret++ = 127;
fe14fcc3
LW
1351#endif
1352#endif
a0d0e21e
LW
1353 ptr = ret;
1354 *ptr++ = op;
1355 *ptr++ = '\0'; /* Null "next" pointer. */
1356 *ptr++ = '\0';
fe14fcc3 1357#ifdef REGALIGN
a0d0e21e 1358 *(unsigned short *)(ret+3) = arg;
fe14fcc3 1359#else
a0d0e21e 1360 ret[3] = arg >> 8; ret[4] = arg & 0377;
fe14fcc3 1361#endif
a0d0e21e
LW
1362 ptr += 2;
1363 regcode = ptr;
fe14fcc3 1364
a0d0e21e 1365 return(ret);
fe14fcc3
LW
1366}
1367
1368/*
a0d0e21e
LW
1369- regc - emit (if appropriate) a byte of code
1370*/
1371#ifdef CAN_PROTOTYPE
1372static void
1373regc(char b)
1374#else
a687059c
LW
1375static void
1376regc(b)
1377char b;
a0d0e21e 1378#endif
a687059c 1379{
a0d0e21e
LW
1380 if (regcode != &regdummy)
1381 *regcode++ = b;
1382 else
1383 regsize++;
a687059c
LW
1384}
1385
1386/*
a0d0e21e
LW
1387- reginsert - insert an operator in front of already-emitted operand
1388*
1389* Means relocating the operand.
1390*/
1391#ifdef CAN_PROTOTYPE
1392static void
1393reginsert(char op, char *opnd)
1394#else
a687059c
LW
1395static void
1396reginsert(op, opnd)
1397char op;
1398char *opnd;
a0d0e21e 1399#endif
a687059c 1400{
a0d0e21e
LW
1401 register char *src;
1402 register char *dst;
1403 register char *place;
1404 register int offset = (regkind[(U8)op] == CURLY ? 4 : 0);
a687059c 1405
a0d0e21e 1406 if (regcode == &regdummy) {
a687059c 1407#ifdef REGALIGN
a0d0e21e 1408 regsize += 4 + offset;
a687059c 1409#else
a0d0e21e 1410 regsize += 3 + offset;
a687059c 1411#endif
a0d0e21e
LW
1412 return;
1413 }
a687059c 1414
a0d0e21e 1415 src = regcode;
a687059c 1416#ifdef REGALIGN
a0d0e21e 1417 regcode += 4 + offset;
a687059c 1418#else
a0d0e21e 1419 regcode += 3 + offset;
a687059c 1420#endif
a0d0e21e
LW
1421 dst = regcode;
1422 while (src > opnd)
1423 *--dst = *--src;
1424
1425 place = opnd; /* Op node, where operand used to be. */
1426 *place++ = op;
1427 *place++ = '\0';
1428 *place++ = '\0';
1429 while (offset-- > 0)
a687059c 1430 *place++ = '\0';
2b69d0c2 1431#ifdef REGALIGN
a0d0e21e 1432 *place++ = '\177';
2b69d0c2 1433#endif
a687059c
LW
1434}
1435
1436/*
a0d0e21e
LW
1437- regtail - set the next-pointer at the end of a node chain
1438*/
a687059c 1439static void
8ac85365 1440regtail(char *p, char *val)
a687059c 1441{
a0d0e21e
LW
1442 register char *scan;
1443 register char *temp;
1444 register I32 offset;
1445
1446 if (p == &regdummy)
1447 return;
1448
1449 /* Find last node. */
1450 scan = p;
1451 for (;;) {
1452 temp = regnext(scan);
1453 if (temp == NULL)
1454 break;
1455 scan = temp;
1456 }
a687059c
LW
1457
1458#ifdef REGALIGN
a0d0e21e 1459 offset = val - scan;
a687059c 1460#ifndef lint
a0d0e21e 1461 *(short*)(scan+1) = offset;
a687059c 1462#else
a0d0e21e 1463 offset = offset;
a687059c
LW
1464#endif
1465#else
a0d0e21e
LW
1466 if (OP(scan) == BACK)
1467 offset = scan - val;
1468 else
1469 offset = val - scan;
1470 *(scan+1) = (offset>>8)&0377;
1471 *(scan+2) = offset&0377;
a687059c
LW
1472#endif
1473}
1474
1475/*
a0d0e21e
LW
1476- regoptail - regtail on operand of first argument; nop if operandless
1477*/
a687059c 1478static void
8ac85365 1479regoptail(char *p, char *val)
a687059c 1480{
a0d0e21e
LW
1481 /* "Operandless" and "op != BRANCH" are synonymous in practice. */
1482 if (p == NULL || p == &regdummy || regkind[(U8)OP(p)] != BRANCH)
1483 return;
1484 regtail(NEXTOPER(p), val);
a687059c
LW
1485}
1486
1487/*
1488 - regcurly - a little FSA that accepts {\d+,?\d*}
1489 */
79072805 1490STATIC I32
8ac85365 1491regcurly(register char *s)
a687059c
LW
1492{
1493 if (*s++ != '{')
1494 return FALSE;
f0fcb552 1495 if (!isDIGIT(*s))
a687059c 1496 return FALSE;
f0fcb552 1497 while (isDIGIT(*s))
a687059c
LW
1498 s++;
1499 if (*s == ',')
1500 s++;
f0fcb552 1501 while (isDIGIT(*s))
a687059c
LW
1502 s++;
1503 if (*s != '}')
1504 return FALSE;
1505 return TRUE;
1506}
1507
1508#ifdef DEBUGGING
1509
1510/*
fd181c75 1511 - regdump - dump a regexp onto Perl_debug_log in vaguely comprehensible form
a687059c
LW
1512 */
1513void
8ac85365 1514regdump(regexp *r)
a687059c 1515{
a0d0e21e 1516 register char *s;
bbce6d69 1517 register char op = EXACT; /* Arbitrary non-END op. */
a0d0e21e 1518 register char *next;
46fc3d4c 1519 SV *sv = sv_newmortal();
a687059c 1520
a0d0e21e
LW
1521 s = r->program + 1;
1522 while (op != END) { /* While that wasn't END last time... */
a687059c 1523#ifdef REGALIGN
a0d0e21e
LW
1524 if (!((long)s & 1))
1525 s++;
a687059c 1526#endif
a0d0e21e 1527 op = OP(s);
46fc3d4c 1528 /* where, what */
1529 regprop(sv, s);
fb73857a 1530 PerlIO_printf(Perl_debug_log, "%2ld%s", (long)(s - r->program), SvPVX(sv));
a0d0e21e
LW
1531 next = regnext(s);
1532 s += regarglen[(U8)op];
1533 if (next == NULL) /* Next ptr. */
760ac839 1534 PerlIO_printf(Perl_debug_log, "(0)");
a0d0e21e 1535 else
fb73857a 1536 PerlIO_printf(Perl_debug_log, "(%ld)", (long)(s-r->program)+(next-s));
a0d0e21e
LW
1537 s += 3;
1538 if (op == ANYOF) {
bbce6d69 1539 s += 33;
a687059c 1540 }
bbce6d69 1541 if (regkind[(U8)op] == EXACT) {
a0d0e21e
LW
1542 /* Literal string, where present. */
1543 s++;
760ac839
LW
1544 (void)PerlIO_putc(Perl_debug_log, ' ');
1545 (void)PerlIO_putc(Perl_debug_log, '<');
a0d0e21e 1546 while (*s != '\0') {
760ac839 1547 (void)PerlIO_putc(Perl_debug_log,*s);
a0d0e21e
LW
1548 s++;
1549 }
760ac839 1550 (void)PerlIO_putc(Perl_debug_log, '>');
a0d0e21e
LW
1551 s++;
1552 }
760ac839 1553 (void)PerlIO_putc(Perl_debug_log, '\n');
a0d0e21e
LW
1554 }
1555
1556 /* Header fields of interest. */
1557 if (r->regstart)
760ac839 1558 PerlIO_printf(Perl_debug_log, "start `%s' ", SvPVX(r->regstart));
46fc3d4c 1559 if (r->regstclass) {
1560 regprop(sv, r->regstclass);
1561 PerlIO_printf(Perl_debug_log, "stclass `%s' ", SvPVX(sv));
1562 }
774d564b 1563 if (r->reganch & ROPT_ANCH) {
1564 PerlIO_printf(Perl_debug_log, "anchored");
1565 if (r->reganch & ROPT_ANCH_BOL)
1566 PerlIO_printf(Perl_debug_log, "(BOL)");
1567 if (r->reganch & ROPT_ANCH_GPOS)
1568 PerlIO_printf(Perl_debug_log, "(GPOS)");
1569 PerlIO_putc(Perl_debug_log, ' ');
1570 }
a0d0e21e 1571 if (r->reganch & ROPT_SKIP)
760ac839 1572 PerlIO_printf(Perl_debug_log, "plus ");
a0d0e21e 1573 if (r->reganch & ROPT_IMPLICIT)
760ac839 1574 PerlIO_printf(Perl_debug_log, "implicit ");
a0d0e21e 1575 if (r->regmust != NULL)
760ac839 1576 PerlIO_printf(Perl_debug_log, "must have \"%s\" back %ld ", SvPVX(r->regmust),
a0d0e21e 1577 (long) r->regback);
760ac839
LW
1578 PerlIO_printf(Perl_debug_log, "minlen %ld ", (long) r->minlen);
1579 PerlIO_printf(Perl_debug_log, "\n");
a687059c
LW
1580}
1581
1582/*
a0d0e21e
LW
1583- regprop - printable representation of opcode
1584*/
46fc3d4c 1585void
8ac85365 1586regprop(SV *sv, char *o)
a687059c 1587{
a0d0e21e
LW
1588 register char *p = 0;
1589
46fc3d4c 1590 sv_setpv(sv, ":");
11343788 1591 switch (OP(o)) {
a0d0e21e
LW
1592 case BOL:
1593 p = "BOL";
1594 break;
1595 case MBOL:
1596 p = "MBOL";
1597 break;
1598 case SBOL:
1599 p = "SBOL";
1600 break;
1601 case EOL:
1602 p = "EOL";
1603 break;
1604 case MEOL:
1605 p = "MEOL";
1606 break;
1607 case SEOL:
1608 p = "SEOL";
1609 break;
1610 case ANY:
1611 p = "ANY";
1612 break;
1613 case SANY:
1614 p = "SANY";
1615 break;
1616 case ANYOF:
1617 p = "ANYOF";
1618 break;
1619 case BRANCH:
1620 p = "BRANCH";
1621 break;
bbce6d69 1622 case EXACT:
1623 p = "EXACT";
1624 break;
1625 case EXACTF:
1626 p = "EXACTF";
1627 break;
1628 case EXACTFL:
1629 p = "EXACTFL";
a0d0e21e
LW
1630 break;
1631 case NOTHING:
1632 p = "NOTHING";
1633 break;
1634 case BACK:
1635 p = "BACK";
1636 break;
1637 case END:
1638 p = "END";
1639 break;
a0d0e21e
LW
1640 case BOUND:
1641 p = "BOUND";
1642 break;
bbce6d69 1643 case BOUNDL:
1644 p = "BOUNDL";
1645 break;
a0d0e21e
LW
1646 case NBOUND:
1647 p = "NBOUND";
1648 break;
bbce6d69 1649 case NBOUNDL:
1650 p = "NBOUNDL";
a0d0e21e
LW
1651 break;
1652 case CURLY:
5dc0d613 1653 sv_catpvf(sv, "CURLY {%d,%d}", ARG1(o), ARG2(o));
a0d0e21e
LW
1654 break;
1655 case CURLYX:
5dc0d613 1656 sv_catpvf(sv, "CURLYX {%d,%d}", ARG1(o), ARG2(o));
a0d0e21e
LW
1657 break;
1658 case REF:
5dc0d613 1659 sv_catpvf(sv, "REF%d", ARG1(o));
a0d0e21e 1660 break;
c8756f30 1661 case REFF:
8206a063 1662 sv_catpvf(sv, "REFF%d", ARG1(o));
c8756f30
AK
1663 break;
1664 case REFFL:
8206a063 1665 sv_catpvf(sv, "REFFL%d", ARG1(o));
c8756f30 1666 break;
a0d0e21e 1667 case OPEN:
5dc0d613 1668 sv_catpvf(sv, "OPEN%d", ARG1(o));
a0d0e21e
LW
1669 break;
1670 case CLOSE:
5dc0d613 1671 sv_catpvf(sv, "CLOSE%d", ARG1(o));
a0d0e21e
LW
1672 p = NULL;
1673 break;
1674 case STAR:
1675 p = "STAR";
1676 break;
1677 case PLUS:
1678 p = "PLUS";
1679 break;
1680 case MINMOD:
1681 p = "MINMOD";
1682 break;
774d564b 1683 case GPOS:
1684 p = "GPOS";
a0d0e21e
LW
1685 break;
1686 case UNLESSM:
1687 p = "UNLESSM";
1688 break;
1689 case IFMATCH:
1690 p = "IFMATCH";
1691 break;
1692 case SUCCEED:
1693 p = "SUCCEED";
1694 break;
1695 case WHILEM:
1696 p = "WHILEM";
1697 break;
bbce6d69 1698 case DIGIT:
1699 p = "DIGIT";
1700 break;
1701 case NDIGIT:
1702 p = "NDIGIT";
1703 break;
1704 case ALNUM:
1705 p = "ALNUM";
1706 break;
1707 case NALNUM:
1708 p = "NALNUM";
1709 break;
1710 case SPACE:
1711 p = "SPACE";
1712 break;
1713 case NSPACE:
1714 p = "NSPACE";
1715 break;
1716 case ALNUML:
1717 p = "ALNUML";
1718 break;
1719 case NALNUML:
1720 p = "NALNUML";
1721 break;
1722 case SPACEL:
1723 p = "SPACEL";
1724 break;
1725 case NSPACEL:
1726 p = "NSPACEL";
1727 break;
a0d0e21e
LW
1728 default:
1729 FAIL("corrupted regexp opcode");
1730 }
46fc3d4c 1731 if (p)
1732 sv_catpv(sv, p);
a687059c
LW
1733}
1734#endif /* DEBUGGING */
1735
2b69d0c2 1736void
8ac85365 1737pregfree(struct regexp *r)
a687059c 1738{
a0d0e21e
LW
1739 if (!r)
1740 return;
1741 if (r->precomp) {
1742 Safefree(r->precomp);
1743 r->precomp = Nullch;
1744 }
1745 if (r->subbase) {
1746 Safefree(r->subbase);
1747 r->subbase = Nullch;
1748 }
1749 if (r->regmust) {
1750 SvREFCNT_dec(r->regmust);
1751 r->regmust = Nullsv;
1752 }
1753 if (r->regstart) {
1754 SvREFCNT_dec(r->regstart);
1755 r->regstart = Nullsv;
1756 }
1757 Safefree(r->startp);
1758 Safefree(r->endp);
1759 Safefree(r);
a687059c 1760}