This is a live mirror of the Perl 5 development currently hosted at https://github.com/perl/perl5
perl 5.0 alpha 9
[perl5.git] / regcomp.c
CommitLineData
a687059c
LW
1/* NOTE: this is derived from Henry Spencer's regexp code, and should not
2 * confused with the original package (see point 3 below). Thanks, Henry!
3 */
4
5/* Additional note: this code is very heavily munged from Henry's version
6 * in places. In some spots I've traded clarity for efficiency, so don't
7 * blame Henry for some of the lack of readability.
8 */
9
79072805 10/* $RCSfile: regcomp.c,v $$Revision: 4.1 $$Date: 92/08/07 18:26:28 $
a687059c
LW
11 *
12 * $Log: regcomp.c,v $
79072805
LW
13 * Revision 4.1 92/08/07 18:26:28 lwall
14 *
2b69d0c2
LW
15 * Revision 4.0.1.5 92/06/08 15:23:36 lwall
16 * patch20: Perl now distinguishes overlapped copies from non-overlapped
17 * patch20: /^stuff/ wrongly assumed an implicit $* == 1
18 * patch20: /x{0}/ was wrongly interpreted as /x{0,}/
19 * patch20: added \W, \S and \D inside /[...]/
20 *
55204971
LW
21 * Revision 4.0.1.4 91/11/05 22:55:14 lwall
22 * patch11: Erratum
23 *
f0fcb552
LW
24 * Revision 4.0.1.3 91/11/05 18:22:28 lwall
25 * patch11: minimum match length calculation in regexp is now cumulative
26 * patch11: initial .* in pattern had dependency on value of $*
27 * patch11: certain patterns made use of garbage pointers from uncleared memory
28 * patch11: prepared for ctype implementations that don't define isascii()
29 *
9ef589d8
LW
30 * Revision 4.0.1.2 91/06/07 11:48:24 lwall
31 * patch4: new copyright notice
32 * patch4: /(x+) \1/ incorrectly optimized to not match "xxx xx"
33 * patch4: // wouldn't use previous pattern if it started with a null character
34 *
35c8bce7
LW
35 * Revision 4.0.1.1 91/04/12 09:04:45 lwall
36 * patch1: random cleanup in cpp namespace
37 *
fe14fcc3
LW
38 * Revision 4.0 91/03/20 01:39:01 lwall
39 * 4.0 baseline.
a687059c
LW
40 *
41 */
f0fcb552 42/*SUPPRESS 112*/
a687059c
LW
43/*
44 * regcomp and regexec -- regsub and regerror are not used in perl
45 *
46 * Copyright (c) 1986 by University of Toronto.
47 * Written by Henry Spencer. Not derived from licensed software.
48 *
49 * Permission is granted to anyone to use this software for any
50 * purpose on any computer system, and to redistribute it freely,
51 * subject to the following restrictions:
52 *
53 * 1. The author is not responsible for the consequences of use of
54 * this software, no matter how awful, even if they arise
55 * from defects in it.
56 *
57 * 2. The origin of this software must not be misrepresented, either
58 * by explicit claim or by omission.
59 *
60 * 3. Altered versions must be plainly marked as such, and must not
61 * be misrepresented as being the original software.
62 *
63 *
64 **** Alterations to Henry's code are...
65 ****
9ef589d8 66 **** Copyright (c) 1991, Larry Wall
a687059c 67 ****
9ef589d8
LW
68 **** You may distribute under the terms of either the GNU General Public
69 **** License or the Artistic License, as specified in the README file.
70
a687059c
LW
71 *
72 * Beware that some of this code is subtly aware of the way operator
73 * precedence is structured in regular expressions. Serious changes in
74 * regular-expression syntax might require a total rethink.
75 */
76#include "EXTERN.h"
77#include "perl.h"
78#include "INTERN.h"
79#include "regcomp.h"
80
fe14fcc3
LW
81#ifdef MSDOS
82# if defined(BUGGY_MSC6)
83 /* MSC 6.00A breaks on op/regexp.t test 85 unless we turn this off */
84 # pragma optimize("a",off)
85 /* But MSC 6.00A is happy with 'w', for aliases only across function calls*/
86 # pragma optimize("w",on )
87# endif /* BUGGY_MSC6 */
88#endif /* MSDOS */
89
a687059c
LW
90#ifndef STATIC
91#define STATIC static
92#endif
93
94#define ISMULT1(c) ((c) == '*' || (c) == '+' || (c) == '?')
95#define ISMULT2(s) ((*s) == '*' || (*s) == '+' || (*s) == '?' || \
96 ((*s) == '{' && regcurly(s)))
2b69d0c2
LW
97#ifdef atarist
98#define PERL_META "^$.[()|?+*\\"
99#else
a687059c 100#define META "^$.[()|?+*\\"
2b69d0c2 101#endif
a687059c 102
35c8bce7
LW
103#ifdef SPSTART
104#undef SPSTART /* dratted cpp namespace... */
105#endif
a687059c
LW
106/*
107 * Flags to be passed up and down.
108 */
109#define HASWIDTH 01 /* Known never to match null string. */
110#define SIMPLE 02 /* Simple enough to be STAR/PLUS operand. */
111#define SPSTART 04 /* Starts with * or +. */
112#define WORST 0 /* Worst case. */
113
114/*
a687059c
LW
115 * Forward declarations for regcomp()'s friends.
116 */
79072805 117STATIC I32 regcurly();
a687059c
LW
118STATIC char *reg();
119STATIC char *regbranch();
120STATIC char *regpiece();
121STATIC char *regatom();
122STATIC char *regclass();
123STATIC char *regnode();
fe14fcc3 124STATIC char *reganode();
a687059c
LW
125STATIC void regc();
126STATIC void reginsert();
127STATIC void regtail();
128STATIC void regoptail();
129
130/*
131 - regcomp - compile a regular expression into internal code
132 *
133 * We can't allocate space until we know how big the compiled form will be,
134 * but we can't compile it (and thus know how big it is) until we've got a
135 * place to put the code. So we cheat: we compile it twice, once with code
136 * generation turned off and size counting turned on, and once "for real".
137 * This also means that we don't allocate space until we are sure that the
138 * thing really will compile successfully, and we never have to move the
139 * code and thus invalidate pointers into it. (Note that it has to be in
140 * one piece because free() must be able to free it all.) [NB: not true in perl]
141 *
142 * Beware that the optimization-preparation code in here knows about some
143 * of the structure of the compiled regexp. [I'll say.]
144 */
145regexp *
00bf170e 146regcomp(exp,xend,fold)
a687059c
LW
147char *exp;
148char *xend;
79072805 149I32 fold;
a687059c
LW
150{
151 register regexp *r;
152 register char *scan;
79072805
LW
153 register SV *longish;
154 SV *longest;
155 register I32 len;
a687059c 156 register char *first;
79072805
LW
157 I32 flags;
158 I32 backish;
159 I32 backest;
160 I32 curback;
161 I32 minlen;
162 I32 sawplus = 0;
163 I32 sawopen = 0;
a687059c
LW
164
165 if (exp == NULL)
463ee0b2 166 croak("NULL regexp argument");
a687059c
LW
167
168 /* First pass: determine size, legality. */
169 regfold = fold;
170 regparse = exp;
171 regxend = xend;
172 regprecomp = nsavestr(exp,xend-exp);
173 regsawbracket = 0;
9ef589d8 174 regsawback = 0;
a687059c
LW
175 regnpar = 1;
176 regsize = 0L;
177 regcode = &regdummy;
f0fcb552 178 regc((char)MAGIC);
a687059c
LW
179 if (reg(0, &flags) == NULL) {
180 Safefree(regprecomp);
fe14fcc3 181 regprecomp = Nullch;
a687059c
LW
182 return(NULL);
183 }
184
185 /* Small enough for pointer-storage convention? */
186 if (regsize >= 32767L) /* Probably could be 65535L. */
187 FAIL("regexp too big");
188
189 /* Allocate space. */
190 Newc(1001, r, sizeof(regexp) + (unsigned)regsize, char, regexp);
191 if (r == NULL)
192 FAIL("regexp out of space");
193
194 /* Second pass: emit code. */
195 if (regsawbracket)
2b69d0c2 196 Copy(regprecomp,exp,xend-exp,char);
9ef589d8 197 r->prelen = xend-exp;
a687059c 198 r->precomp = regprecomp;
9ef589d8 199 r->subbeg = r->subbase = NULL;
a687059c
LW
200 regparse = exp;
201 regnpar = 1;
202 regcode = r->program;
f0fcb552 203 regc((char)MAGIC);
a687059c
LW
204 if (reg(0, &flags) == NULL)
205 return(NULL);
206
207 /* Dig out information for optimizations. */
79072805 208 r->regstart = Nullsv; /* Worst-case defaults. */
a687059c 209 r->reganch = 0;
79072805 210 r->regmust = Nullsv;
a687059c
LW
211 r->regback = -1;
212 r->regstclass = Nullch;
213 scan = r->program+1; /* First BRANCH. */
214 if (OP(regnext(scan)) == END) {/* Only one top-level choice. */
215 scan = NEXTOPER(scan);
216
217 first = scan;
9ef589d8 218 while ((OP(first) == OPEN && (sawopen = 1)) ||
a687059c 219 (OP(first) == BRANCH && OP(regnext(first)) != BRANCH) ||
00bf170e
LW
220 (OP(first) == PLUS) ||
221 (OP(first) == CURLY && ARG1(first) > 0) ) {
fe14fcc3 222 if (OP(first) == PLUS)
9ef589d8 223 sawplus = 1;
fe14fcc3
LW
224 else
225 first += regarglen[OP(first)];
a687059c 226 first = NEXTOPER(first);
00bf170e 227 }
a687059c
LW
228
229 /* Starting-point info. */
9ef589d8 230 again:
a687059c
LW
231 if (OP(first) == EXACTLY) {
232 r->regstart =
79072805
LW
233 newSVpv(OPERAND(first)+1,*OPERAND(first));
234 if (SvCUR(r->regstart) > !(sawstudy|fold))
235 fbm_compile(r->regstart,fold);
236 else
237 sv_upgrade(r->regstart, SVt_PVBM);
a687059c 238 }
93a17b20 239 else if ((exp = strchr(simple,OP(first))) && exp > simple)
a687059c
LW
240 r->regstclass = first;
241 else if (OP(first) == BOUND || OP(first) == NBOUND)
242 r->regstclass = first;
2b69d0c2
LW
243 else if (OP(first) == BOL) {
244 r->reganch = ROPT_ANCH;
245 first = NEXTOPER(first);
246 goto again;
247 }
248 else if ((OP(first) == STAR && OP(NEXTOPER(first)) == ANY) &&
249 !(r->reganch & ROPT_ANCH) ) {
250 /* turn .* into ^.* with an implied $*=1 */
f0fcb552 251 r->reganch = ROPT_ANCH | ROPT_IMPLICIT;
9ef589d8
LW
252 first = NEXTOPER(first);
253 goto again;
254 }
255 if (sawplus && (!sawopen || !regsawback))
256 r->reganch |= ROPT_SKIP; /* x+ must match 1st of run */
a687059c 257
79072805
LW
258 DEBUG_r(fprintf(stderr,"first %d next %d offset %d\n",
259 OP(first), OP(NEXTOPER(first)), first - scan));
a687059c
LW
260 /*
261 * If there's something expensive in the r.e., find the
262 * longest literal string that must appear and make it the
263 * regmust. Resolve ties in favor of later strings, since
264 * the regstart check works with the beginning of the r.e.
265 * and avoiding duplication strengthens checking. Not a
266 * strong reason, but sufficient in the absence of others.
267 * [Now we resolve ties in favor of the earlier string if
268 * it happens that curback has been invalidated, since the
269 * earlier string may buy us something the later one won't.]
270 */
79072805
LW
271 longish = newSVpv("",0);
272 longest = newSVpv("",0);
a687059c 273 len = 0;
f0fcb552 274 minlen = 0;
a687059c 275 curback = 0;
34de22dd
LW
276 backish = 0;
277 backest = 0;
e5d73d77 278 while (OP(scan) != END) {
a687059c
LW
279 if (OP(scan) == BRANCH) {
280 if (OP(regnext(scan)) == BRANCH) {
281 curback = -30000;
282 while (OP(scan) == BRANCH)
283 scan = regnext(scan);
284 }
285 else /* single branch is ok */
286 scan = NEXTOPER(scan);
287 }
288 if (OP(scan) == EXACTLY) {
fe14fcc3
LW
289 char *t;
290
a687059c 291 first = scan;
fe14fcc3
LW
292 while (OP(t = regnext(scan)) == CLOSE)
293 scan = t;
f0fcb552 294 minlen += *OPERAND(first);
34de22dd 295 if (curback - backish == len) {
79072805 296 sv_catpvn(longish, OPERAND(first)+1,
a687059c
LW
297 *OPERAND(first));
298 len += *OPERAND(first);
299 curback += *OPERAND(first);
300 first = regnext(scan);
301 }
302 else if (*OPERAND(first) >= len + (curback >= 0)) {
303 len = *OPERAND(first);
79072805 304 sv_setpvn(longish, OPERAND(first)+1,len);
34de22dd 305 backish = curback;
a687059c
LW
306 curback += len;
307 first = regnext(scan);
308 }
309 else
310 curback += *OPERAND(first);
311 }
93a17b20 312 else if (strchr(varies,OP(scan))) {
0a12ae7d
LW
313 curback = -30000;
314 len = 0;
79072805
LW
315 if (SvCUR(longish) > SvCUR(longest)) {
316 sv_setsv(longest,longish);
34de22dd
LW
317 backest = backish;
318 }
79072805 319 sv_setpvn(longish,"",0);
f0fcb552 320 if (OP(scan) == PLUS &&
93a17b20 321 strchr(simple,OP(NEXTOPER(scan))))
f0fcb552
LW
322 minlen++;
323 else if (OP(scan) == CURLY &&
93a17b20 324 strchr(simple,OP(NEXTOPER(scan)+4)))
f0fcb552 325 minlen += ARG1(scan);
0a12ae7d 326 }
93a17b20 327 else if (strchr(simple,OP(scan))) {
0a12ae7d 328 curback++;
f0fcb552 329 minlen++;
e5d73d77 330 len = 0;
79072805
LW
331 if (SvCUR(longish) > SvCUR(longest)) {
332 sv_setsv(longest,longish);
34de22dd
LW
333 backest = backish;
334 }
79072805 335 sv_setpvn(longish,"",0);
e5d73d77 336 }
a687059c
LW
337 scan = regnext(scan);
338 }
e5d73d77
LW
339
340 /* Prefer earlier on tie, unless we can tail match latter */
341
79072805
LW
342 if (SvCUR(longish) + (OP(first) == EOL) > SvCUR(longest)) {
343 sv_setsv(longest,longish);
34de22dd
LW
344 backest = backish;
345 }
e5d73d77 346 else
79072805
LW
347 sv_setpvn(longish,"",0);
348 if (SvCUR(longest)
34de22dd
LW
349 &&
350 (!r->regstart
351 ||
463ee0b2
LW
352 !fbm_instr((unsigned char*) SvPVX(r->regstart),
353 (unsigned char *) SvPVX(r->regstart)
79072805 354 + SvCUR(r->regstart),
34de22dd
LW
355 longest)
356 )
357 )
358 {
a687059c 359 r->regmust = longest;
34de22dd
LW
360 if (backest < 0)
361 backest = -1;
362 r->regback = backest;
79072805 363 if (SvCUR(longest)
0a12ae7d 364 > !(sawstudy || fold || OP(first) == EOL) )
79072805
LW
365 fbm_compile(r->regmust,fold);
366 SvUPGRADE(r->regmust, SVt_PVBM);
367 BmUSEFUL(r->regmust) = 100;
368 if (OP(first) == EOL && SvCUR(longish))
369 SvTAIL_on(r->regmust);
a687059c 370 }
fe14fcc3 371 else {
8990e307 372 SvREFCNT_dec(longest);
79072805 373 longest = Nullsv;
fe14fcc3 374 }
8990e307 375 SvREFCNT_dec(longish);
a687059c
LW
376 }
377
378 r->do_folding = fold;
379 r->nparens = regnpar - 1;
f0fcb552
LW
380 r->minlen = minlen;
381 Newz(1002, r->startp, regnpar, char*);
382 Newz(1002, r->endp, regnpar, char*);
79072805 383 DEBUG_r(regdump(r));
a687059c
LW
384 return(r);
385}
386
387/*
388 - reg - regular expression, i.e. main body or parenthesized thing
389 *
390 * Caller must absorb opening parenthesis.
391 *
392 * Combining parenthesis handling with the base level of regular expression
393 * is a trifle forced, but the need to tie the tails of the branches to what
394 * follows makes it hard to avoid.
395 */
396static char *
397reg(paren, flagp)
79072805
LW
398I32 paren; /* Parenthesized? */
399I32 *flagp;
a687059c
LW
400{
401 register char *ret;
402 register char *br;
403 register char *ender;
79072805
LW
404 register I32 parno;
405 I32 flags;
a687059c
LW
406
407 *flagp = HASWIDTH; /* Tentatively. */
408
409 /* Make an OPEN node, if parenthesized. */
410 if (paren) {
a687059c
LW
411 parno = regnpar;
412 regnpar++;
fe14fcc3 413 ret = reganode(OPEN, parno);
a687059c
LW
414 } else
415 ret = NULL;
416
417 /* Pick up the branches, linking them together. */
418 br = regbranch(&flags);
419 if (br == NULL)
420 return(NULL);
421 if (ret != NULL)
422 regtail(ret, br); /* OPEN -> first. */
423 else
424 ret = br;
425 if (!(flags&HASWIDTH))
426 *flagp &= ~HASWIDTH;
427 *flagp |= flags&SPSTART;
428 while (*regparse == '|') {
429 regparse++;
430 br = regbranch(&flags);
431 if (br == NULL)
432 return(NULL);
433 regtail(ret, br); /* BRANCH -> BRANCH. */
434 if (!(flags&HASWIDTH))
435 *flagp &= ~HASWIDTH;
436 *flagp |= flags&SPSTART;
437 }
438
439 /* Make a closing node, and hook it on the end. */
fe14fcc3
LW
440 if (paren)
441 ender = reganode(CLOSE, parno);
442 else
443 ender = regnode(END);
a687059c
LW
444 regtail(ret, ender);
445
446 /* Hook the tails of the branches to the closing node. */
447 for (br = ret; br != NULL; br = regnext(br))
448 regoptail(br, ender);
449
450 /* Check for proper termination. */
451 if (paren && *regparse++ != ')') {
452 FAIL("unmatched () in regexp");
453 } else if (!paren && regparse < regxend) {
454 if (*regparse == ')') {
455 FAIL("unmatched () in regexp");
456 } else
457 FAIL("junk on end of regexp"); /* "Can't happen". */
458 /* NOTREACHED */
459 }
460
461 return(ret);
462}
463
464/*
465 - regbranch - one alternative of an | operator
466 *
467 * Implements the concatenation operator.
468 */
469static char *
470regbranch(flagp)
79072805 471I32 *flagp;
a687059c
LW
472{
473 register char *ret;
474 register char *chain;
475 register char *latest;
79072805 476 I32 flags;
a687059c
LW
477
478 *flagp = WORST; /* Tentatively. */
479
480 ret = regnode(BRANCH);
481 chain = NULL;
482 while (regparse < regxend && *regparse != '|' && *regparse != ')') {
483 latest = regpiece(&flags);
484 if (latest == NULL)
485 return(NULL);
486 *flagp |= flags&HASWIDTH;
487 if (chain == NULL) /* First piece. */
488 *flagp |= flags&SPSTART;
489 else
490 regtail(chain, latest);
491 chain = latest;
492 }
493 if (chain == NULL) /* Loop ran zero times. */
494 (void) regnode(NOTHING);
495
496 return(ret);
497}
498
499/*
500 - regpiece - something followed by possible [*+?]
501 *
502 * Note that the branching code sequences used for ? and the general cases
503 * of * and + are somewhat optimized: they use the same NOTHING node as
504 * both the endmarker for their branch list and the body of the last branch.
505 * It might seem that this node could be dispensed with entirely, but the
506 * endmarker role is not redundant.
507 */
508static char *
509regpiece(flagp)
79072805 510I32 *flagp;
a687059c
LW
511{
512 register char *ret;
513 register char op;
514 register char *next;
79072805 515 I32 flags;
a687059c 516 char *origparse = regparse;
79072805 517 I32 orignpar = regnpar;
a687059c 518 char *max;
79072805 519 I32 iter;
a687059c
LW
520 char ch;
521
522 ret = regatom(&flags);
523 if (ret == NULL)
524 return(NULL);
525
526 op = *regparse;
527
528 /* Here's a total kludge: if after the atom there's a {\d+,?\d*}
529 * then we decrement the first number by one and reset our
530 * parsing back to the beginning of the same atom. If the first number
531 * is down to 0, decrement the second number instead and fake up
532 * a ? after it. Given the way this compiler doesn't keep track
533 * of offsets on the first pass, this is the only way to replicate
534 * a piece of code. Sigh.
535 */
536 if (op == '{' && regcurly(regparse)) {
537 next = regparse + 1;
538 max = Nullch;
f0fcb552 539 while (isDIGIT(*next) || *next == ',') {
a687059c
LW
540 if (*next == ',') {
541 if (max)
542 break;
543 else
544 max = next;
545 }
546 next++;
547 }
548 if (*next == '}') { /* got one */
a687059c
LW
549 if (!max)
550 max = next;
551 regparse++;
552 iter = atoi(regparse);
00bf170e 553 if (flags&SIMPLE) { /* we can do it right after all */
79072805 554 I32 tmp;
00bf170e
LW
555
556 reginsert(CURLY, ret);
fe14fcc3
LW
557 if (iter > 0)
558 *flagp = (WORST|HASWIDTH);
00bf170e
LW
559 if (*max == ',')
560 max++;
6eb13c3b
LW
561 else
562 max = regparse;
00bf170e 563 tmp = atoi(max);
2b69d0c2
LW
564 if (!tmp && *max != '0')
565 tmp = 32767; /* meaning "infinity" */
00bf170e 566 if (tmp && tmp < iter)
463ee0b2 567 croak("Can't do {n,m} with n > m");
00bf170e
LW
568 if (regcode != &regdummy) {
569#ifdef REGALIGN
570 *(unsigned short *)(ret+3) = iter;
571 *(unsigned short *)(ret+5) = tmp;
572#else
573 ret[3] = iter >> 8; ret[4] = iter & 0377;
574 ret[5] = tmp >> 8; ret[6] = tmp & 0377;
575#endif
576 }
577 regparse = next;
578 goto nest_check;
579 }
580 regsawbracket++; /* remember we clobbered exp */
a687059c
LW
581 if (iter > 0) {
582 ch = *max;
583 sprintf(regparse,"%.*d", max-regparse, iter - 1);
584 *max = ch;
00bf170e
LW
585 if (*max == ',' && max[1] != '}') {
586 if (atoi(max+1) <= 0)
463ee0b2 587 croak("Can't do {n,m} with n > m");
a687059c
LW
588 ch = *next;
589 sprintf(max+1,"%.*d", next-(max+1), atoi(max+1) - 1);
590 *next = ch;
591 }
00bf170e 592 if (iter != 1 || *max == ',') {
a687059c
LW
593 regparse = origparse; /* back up input pointer */
594 regnpar = orignpar; /* don't make more parens */
595 }
596 else {
597 regparse = next;
598 goto nest_check;
599 }
600 *flagp = flags;
601 return ret;
602 }
603 if (*max == ',') {
604 max++;
605 iter = atoi(max);
606 if (max == next) { /* any number more? */
607 regparse = next;
608 op = '*'; /* fake up one with a star */
609 }
610 else if (iter > 0) {
611 op = '?'; /* fake up optional atom */
612 ch = *next;
613 sprintf(max,"%.*d", next-max, iter - 1);
614 *next = ch;
615 if (iter == 1)
616 regparse = next;
617 else {
618 regparse = origparse - 1; /* offset ++ below */
619 regnpar = orignpar;
620 }
621 }
622 else
463ee0b2 623 croak("Can't do {n,0}");
a687059c
LW
624 }
625 else
463ee0b2 626 croak("Can't do {0}");
a687059c
LW
627 }
628 }
629
630 if (!ISMULT1(op)) {
631 *flagp = flags;
632 return(ret);
633 }
634
635 if (!(flags&HASWIDTH) && op != '?')
636 FAIL("regexp *+ operand could be empty");
637 *flagp = (op != '+') ? (WORST|SPSTART) : (WORST|HASWIDTH);
638
639 if (op == '*' && (flags&SIMPLE))
640 reginsert(STAR, ret);
641 else if (op == '*') {
642 /* Emit x* as (x&|), where & means "self". */
643 reginsert(BRANCH, ret); /* Either x */
644 regoptail(ret, regnode(BACK)); /* and loop */
645 regoptail(ret, ret); /* back */
646 regtail(ret, regnode(BRANCH)); /* or */
647 regtail(ret, regnode(NOTHING)); /* null. */
648 } else if (op == '+' && (flags&SIMPLE))
649 reginsert(PLUS, ret);
650 else if (op == '+') {
651 /* Emit x+ as x(&|), where & means "self". */
652 next = regnode(BRANCH); /* Either */
653 regtail(ret, next);
654 regtail(regnode(BACK), ret); /* loop back */
655 regtail(next, regnode(BRANCH)); /* or */
656 regtail(ret, regnode(NOTHING)); /* null. */
657 } else if (op == '?') {
658 /* Emit x? as (x|) */
659 reginsert(BRANCH, ret); /* Either x */
660 regtail(ret, regnode(BRANCH)); /* or */
661 next = regnode(NOTHING); /* null. */
662 regtail(ret, next);
663 regoptail(ret, next);
664 }
665 nest_check:
666 regparse++;
667 if (ISMULT2(regparse))
668 FAIL("nested *?+ in regexp");
669
670 return(ret);
671}
672
673/*
674 - regatom - the lowest level
675 *
676 * Optimization: gobbles an entire sequence of ordinary characters so that
677 * it can turn them into a single node, which is smaller to store and
678 * faster to run. Backslashed characters are exceptions, each becoming a
679 * separate node; the code is simpler that way and it's not worth fixing.
680 *
681 * [Yes, it is worth fixing, some scripts can run twice the speed.]
682 */
683static char *
684regatom(flagp)
79072805 685I32 *flagp;
a687059c
LW
686{
687 register char *ret;
79072805 688 I32 flags;
a687059c
LW
689
690 *flagp = WORST; /* Tentatively. */
691
692 switch (*regparse++) {
693 case '^':
694 ret = regnode(BOL);
695 break;
696 case '$':
697 ret = regnode(EOL);
698 break;
699 case '.':
700 ret = regnode(ANY);
701 *flagp |= HASWIDTH|SIMPLE;
702 break;
703 case '[':
704 ret = regclass();
705 *flagp |= HASWIDTH|SIMPLE;
706 break;
707 case '(':
708 ret = reg(1, &flags);
709 if (ret == NULL)
710 return(NULL);
711 *flagp |= flags&(HASWIDTH|SPSTART);
712 break;
713 case '|':
714 case ')':
715 FAIL("internal urp in regexp"); /* Supposed to be caught earlier. */
716 break;
717 case '?':
718 case '+':
719 case '*':
720 FAIL("?+* follows nothing in regexp");
721 break;
722 case '\\':
723 switch (*regparse) {
724 case 'w':
725 ret = regnode(ALNUM);
726 *flagp |= HASWIDTH|SIMPLE;
727 regparse++;
728 break;
729 case 'W':
730 ret = regnode(NALNUM);
731 *flagp |= HASWIDTH|SIMPLE;
732 regparse++;
733 break;
734 case 'b':
735 ret = regnode(BOUND);
736 *flagp |= SIMPLE;
737 regparse++;
738 break;
739 case 'B':
740 ret = regnode(NBOUND);
741 *flagp |= SIMPLE;
742 regparse++;
743 break;
744 case 's':
745 ret = regnode(SPACE);
746 *flagp |= HASWIDTH|SIMPLE;
747 regparse++;
748 break;
749 case 'S':
750 ret = regnode(NSPACE);
751 *flagp |= HASWIDTH|SIMPLE;
752 regparse++;
753 break;
754 case 'd':
755 ret = regnode(DIGIT);
756 *flagp |= HASWIDTH|SIMPLE;
757 regparse++;
758 break;
759 case 'D':
760 ret = regnode(NDIGIT);
761 *flagp |= HASWIDTH|SIMPLE;
762 regparse++;
763 break;
764 case 'n':
765 case 'r':
766 case 't':
767 case 'f':
fe14fcc3
LW
768 case 'e':
769 case 'a':
770 case 'x':
771 case 'c':
772 case '0':
a687059c 773 goto defchar;
fe14fcc3 774 case '1': case '2': case '3': case '4':
a687059c 775 case '5': case '6': case '7': case '8': case '9':
fe14fcc3 776 {
79072805 777 I32 num = atoi(regparse);
fe14fcc3
LW
778
779 if (num > 9 && num >= regnpar)
a687059c 780 goto defchar;
fe14fcc3 781 else {
9ef589d8 782 regsawback = 1;
fe14fcc3 783 ret = reganode(REF, num);
f0fcb552 784 while (isDIGIT(*regparse))
fe14fcc3 785 regparse++;
a687059c 786 *flagp |= SIMPLE;
fe14fcc3 787 }
a687059c
LW
788 }
789 break;
790 case '\0':
791 if (regparse >= regxend)
792 FAIL("trailing \\ in regexp");
793 /* FALL THROUGH */
794 default:
795 goto defchar;
796 }
797 break;
798 default: {
79072805 799 register I32 len;
a687059c
LW
800 register char ender;
801 register char *p;
802 char *oldp;
79072805 803 I32 numlen;
a687059c
LW
804
805 defchar:
806 ret = regnode(EXACTLY);
807 regc(0); /* save spot for len */
808 for (len=0, p=regparse-1;
809 len < 127 && p < regxend;
810 len++)
811 {
812 oldp = p;
813 switch (*p) {
814 case '^':
815 case '$':
816 case '.':
817 case '[':
818 case '(':
819 case ')':
820 case '|':
821 goto loopdone;
822 case '\\':
823 switch (*++p) {
824 case 'w':
825 case 'W':
826 case 'b':
827 case 'B':
828 case 's':
829 case 'S':
830 case 'd':
831 case 'D':
832 --p;
833 goto loopdone;
834 case 'n':
835 ender = '\n';
836 p++;
837 break;
838 case 'r':
839 ender = '\r';
840 p++;
841 break;
842 case 't':
843 ender = '\t';
844 p++;
845 break;
846 case 'f':
847 ender = '\f';
848 p++;
849 break;
fe14fcc3
LW
850 case 'e':
851 ender = '\033';
852 p++;
853 break;
854 case 'a':
855 ender = '\007';
856 p++;
857 break;
858 case 'x':
79072805 859 ender = scan_hex(++p, 2, &numlen);
fe14fcc3
LW
860 p += numlen;
861 break;
862 case 'c':
863 p++;
864 ender = *p++;
f0fcb552 865 if (isLOWER(ender))
fe14fcc3
LW
866 ender = toupper(ender);
867 ender ^= 64;
868 break;
a687059c
LW
869 case '0': case '1': case '2': case '3':case '4':
870 case '5': case '6': case '7': case '8':case '9':
fe14fcc3 871 if (*p == '0' ||
f0fcb552 872 (isDIGIT(p[1]) && atoi(p) >= regnpar) ) {
79072805 873 ender = scan_oct(p, 3, &numlen);
fe14fcc3 874 p += numlen;
a687059c
LW
875 }
876 else {
877 --p;
878 goto loopdone;
879 }
880 break;
881 case '\0':
882 if (p >= regxend)
883 FAIL("trailing \\ in regexp");
884 /* FALL THROUGH */
885 default:
886 ender = *p++;
887 break;
888 }
889 break;
890 default:
891 ender = *p++;
892 break;
893 }
f0fcb552 894 if (regfold && isUPPER(ender))
a687059c
LW
895 ender = tolower(ender);
896 if (ISMULT2(p)) { /* Back off on ?+*. */
897 if (len)
898 p = oldp;
899 else {
900 len++;
901 regc(ender);
902 }
903 break;
904 }
905 regc(ender);
906 }
907 loopdone:
908 regparse = p;
909 if (len <= 0)
910 FAIL("internal disaster in regexp");
911 *flagp |= HASWIDTH;
912 if (len == 1)
913 *flagp |= SIMPLE;
914 if (regcode != &regdummy)
915 *OPERAND(ret) = len;
916 regc('\0');
917 }
918 break;
919 }
920
921 return(ret);
922}
923
924static void
925regset(bits,def,c)
926char *bits;
79072805
LW
927I32 def;
928register I32 c;
a687059c
LW
929{
930 if (regcode == &regdummy)
931 return;
ac58e20f 932 c &= 255;
a687059c
LW
933 if (def)
934 bits[c >> 3] &= ~(1 << (c & 7));
935 else
936 bits[c >> 3] |= (1 << (c & 7));
937}
938
939static char *
940regclass()
941{
942 register char *bits;
79072805
LW
943 register I32 class;
944 register I32 lastclass;
945 register I32 range = 0;
a687059c 946 register char *ret;
79072805
LW
947 register I32 def;
948 I32 numlen;
a687059c 949
00bf170e 950 ret = regnode(ANYOF);
a687059c 951 if (*regparse == '^') { /* Complement of range. */
a687059c
LW
952 regparse++;
953 def = 0;
954 } else {
a687059c
LW
955 def = 255;
956 }
957 bits = regcode;
958 for (class = 0; class < 32; class++)
959 regc(def);
960 if (*regparse == ']' || *regparse == '-')
00bf170e 961 goto skipcond; /* allow 1st char to be ] or - */
a687059c 962 while (regparse < regxend && *regparse != ']') {
00bf170e 963 skipcond:
a687059c
LW
964 class = UCHARAT(regparse++);
965 if (class == '\\') {
966 class = UCHARAT(regparse++);
967 switch (class) {
968 case 'w':
2b69d0c2
LW
969 for (class = 0; class < 256; class++)
970 if (isALNUM(class))
a687059c 971 regset(bits,def,class);
2b69d0c2
LW
972 lastclass = 1234;
973 continue;
974 case 'W':
975 for (class = 0; class < 256; class++)
976 if (!isALNUM(class))
a687059c 977 regset(bits,def,class);
a687059c
LW
978 lastclass = 1234;
979 continue;
980 case 's':
2b69d0c2
LW
981 for (class = 0; class < 256; class++)
982 if (isSPACE(class))
983 regset(bits,def,class);
984 lastclass = 1234;
985 continue;
986 case 'S':
987 for (class = 0; class < 256; class++)
988 if (!isSPACE(class))
989 regset(bits,def,class);
a687059c
LW
990 lastclass = 1234;
991 continue;
992 case 'd':
993 for (class = '0'; class <= '9'; class++)
994 regset(bits,def,class);
995 lastclass = 1234;
996 continue;
2b69d0c2
LW
997 case 'D':
998 for (class = 0; class < '0'; class++)
999 regset(bits,def,class);
1000 for (class = '9' + 1; class < 256; class++)
1001 regset(bits,def,class);
1002 lastclass = 1234;
1003 continue;
a687059c
LW
1004 case 'n':
1005 class = '\n';
1006 break;
1007 case 'r':
1008 class = '\r';
1009 break;
1010 case 't':
1011 class = '\t';
1012 break;
1013 case 'f':
1014 class = '\f';
1015 break;
1016 case 'b':
1017 class = '\b';
1018 break;
fe14fcc3
LW
1019 case 'e':
1020 class = '\033';
1021 break;
1022 case 'a':
1023 class = '\007';
1024 break;
1025 case 'x':
79072805 1026 class = scan_hex(regparse, 2, &numlen);
fe14fcc3
LW
1027 regparse += numlen;
1028 break;
1029 case 'c':
1030 class = *regparse++;
f0fcb552 1031 if (isLOWER(class))
fe14fcc3
LW
1032 class = toupper(class);
1033 class ^= 64;
1034 break;
a687059c
LW
1035 case '0': case '1': case '2': case '3': case '4':
1036 case '5': case '6': case '7': case '8': case '9':
79072805 1037 class = scan_oct(--regparse, 3, &numlen);
fe14fcc3 1038 regparse += numlen;
a687059c
LW
1039 break;
1040 }
1041 }
a687059c
LW
1042 if (range) {
1043 if (lastclass > class)
1044 FAIL("invalid [] range in regexp");
00bf170e 1045 range = 0;
a687059c 1046 }
00bf170e
LW
1047 else {
1048 lastclass = class;
1049 if (*regparse == '-' && regparse+1 < regxend &&
1050 regparse[1] != ']') {
1051 regparse++;
1052 range = 1;
1053 continue; /* do it next time */
1054 }
1055 }
1056 for ( ; lastclass <= class; lastclass++) {
a687059c 1057 regset(bits,def,lastclass);
f0fcb552 1058 if (regfold && isUPPER(lastclass))
a687059c
LW
1059 regset(bits,def,tolower(lastclass));
1060 }
1061 lastclass = class;
1062 }
1063 if (*regparse != ']')
1064 FAIL("unmatched [] in regexp");
a687059c
LW
1065 regparse++;
1066 return ret;
1067}
1068
1069/*
1070 - regnode - emit a node
1071 */
1072static char * /* Location. */
1073regnode(op)
1074char op;
1075{
1076 register char *ret;
1077 register char *ptr;
1078
1079 ret = regcode;
1080 if (ret == &regdummy) {
1081#ifdef REGALIGN
1082 if (!(regsize & 1))
1083 regsize++;
1084#endif
1085 regsize += 3;
1086 return(ret);
1087 }
1088
1089#ifdef REGALIGN
1090#ifndef lint
1091 if (!((long)ret & 1))
1092 *ret++ = 127;
1093#endif
1094#endif
1095 ptr = ret;
1096 *ptr++ = op;
1097 *ptr++ = '\0'; /* Null "next" pointer. */
1098 *ptr++ = '\0';
1099 regcode = ptr;
1100
1101 return(ret);
1102}
1103
1104/*
fe14fcc3
LW
1105 - reganode - emit a node with an argument
1106 */
1107static char * /* Location. */
1108reganode(op, arg)
1109char op;
1110unsigned short arg;
1111{
1112 register char *ret;
1113 register char *ptr;
1114
1115 ret = regcode;
1116 if (ret == &regdummy) {
1117#ifdef REGALIGN
1118 if (!(regsize & 1))
1119 regsize++;
1120#endif
1121 regsize += 5;
1122 return(ret);
1123 }
1124
1125#ifdef REGALIGN
1126#ifndef lint
1127 if (!((long)ret & 1))
1128 *ret++ = 127;
1129#endif
1130#endif
1131 ptr = ret;
1132 *ptr++ = op;
1133 *ptr++ = '\0'; /* Null "next" pointer. */
1134 *ptr++ = '\0';
1135#ifdef REGALIGN
1136 *(unsigned short *)(ret+3) = arg;
1137#else
1138 ret[3] = arg >> 8; ret[4] = arg & 0377;
1139#endif
1140 ptr += 2;
1141 regcode = ptr;
1142
1143 return(ret);
1144}
1145
1146/*
a687059c
LW
1147 - regc - emit (if appropriate) a byte of code
1148 */
1149static void
1150regc(b)
1151char b;
1152{
1153 if (regcode != &regdummy)
1154 *regcode++ = b;
1155 else
1156 regsize++;
1157}
1158
1159/*
1160 - reginsert - insert an operator in front of already-emitted operand
1161 *
1162 * Means relocating the operand.
1163 */
1164static void
1165reginsert(op, opnd)
1166char op;
1167char *opnd;
1168{
1169 register char *src;
1170 register char *dst;
1171 register char *place;
00bf170e 1172 register offset = (op == CURLY ? 4 : 0);
a687059c
LW
1173
1174 if (regcode == &regdummy) {
1175#ifdef REGALIGN
00bf170e 1176 regsize += 4 + offset;
a687059c 1177#else
00bf170e 1178 regsize += 3 + offset;
a687059c
LW
1179#endif
1180 return;
1181 }
1182
1183 src = regcode;
1184#ifdef REGALIGN
00bf170e 1185 regcode += 4 + offset;
a687059c 1186#else
00bf170e 1187 regcode += 3 + offset;
a687059c
LW
1188#endif
1189 dst = regcode;
1190 while (src > opnd)
1191 *--dst = *--src;
1192
1193 place = opnd; /* Op node, where operand used to be. */
1194 *place++ = op;
1195 *place++ = '\0';
1196 *place++ = '\0';
00bf170e
LW
1197 while (offset-- > 0)
1198 *place++ = '\0';
2b69d0c2
LW
1199#ifdef REGALIGN
1200 *place++ = '\177';
1201#endif
a687059c
LW
1202}
1203
1204/*
1205 - regtail - set the next-pointer at the end of a node chain
1206 */
1207static void
1208regtail(p, val)
1209char *p;
1210char *val;
1211{
1212 register char *scan;
1213 register char *temp;
79072805 1214 register I32 offset;
a687059c
LW
1215
1216 if (p == &regdummy)
1217 return;
1218
1219 /* Find last node. */
1220 scan = p;
1221 for (;;) {
1222 temp = regnext(scan);
1223 if (temp == NULL)
1224 break;
1225 scan = temp;
1226 }
1227
1228#ifdef REGALIGN
1229 offset = val - scan;
1230#ifndef lint
1231 *(short*)(scan+1) = offset;
1232#else
1233 offset = offset;
1234#endif
1235#else
1236 if (OP(scan) == BACK)
1237 offset = scan - val;
1238 else
1239 offset = val - scan;
1240 *(scan+1) = (offset>>8)&0377;
1241 *(scan+2) = offset&0377;
1242#endif
1243}
1244
1245/*
1246 - regoptail - regtail on operand of first argument; nop if operandless
1247 */
1248static void
1249regoptail(p, val)
1250char *p;
1251char *val;
1252{
1253 /* "Operandless" and "op != BRANCH" are synonymous in practice. */
1254 if (p == NULL || p == &regdummy || OP(p) != BRANCH)
1255 return;
1256 regtail(NEXTOPER(p), val);
1257}
1258
1259/*
1260 - regcurly - a little FSA that accepts {\d+,?\d*}
1261 */
79072805 1262STATIC I32
a687059c
LW
1263regcurly(s)
1264register char *s;
1265{
1266 if (*s++ != '{')
1267 return FALSE;
f0fcb552 1268 if (!isDIGIT(*s))
a687059c 1269 return FALSE;
f0fcb552 1270 while (isDIGIT(*s))
a687059c
LW
1271 s++;
1272 if (*s == ',')
1273 s++;
f0fcb552 1274 while (isDIGIT(*s))
a687059c
LW
1275 s++;
1276 if (*s != '}')
1277 return FALSE;
1278 return TRUE;
1279}
1280
1281#ifdef DEBUGGING
1282
1283/*
1284 - regdump - dump a regexp onto stderr in vaguely comprehensible form
1285 */
1286void
1287regdump(r)
1288regexp *r;
1289{
1290 register char *s;
1291 register char op = EXACTLY; /* Arbitrary non-END op. */
1292 register char *next;
a687059c
LW
1293
1294
1295 s = r->program + 1;
1296 while (op != END) { /* While that wasn't END last time... */
1297#ifdef REGALIGN
1298 if (!((long)s & 1))
1299 s++;
1300#endif
1301 op = OP(s);
1302 fprintf(stderr,"%2d%s", s-r->program, regprop(s)); /* Where, what. */
1303 next = regnext(s);
fe14fcc3 1304 s += regarglen[op];
a687059c
LW
1305 if (next == NULL) /* Next ptr. */
1306 fprintf(stderr,"(0)");
1307 else
1308 fprintf(stderr,"(%d)", (s-r->program)+(next-s));
1309 s += 3;
00bf170e 1310 if (op == ANYOF) {
a687059c
LW
1311 s += 32;
1312 }
1313 if (op == EXACTLY) {
1314 /* Literal string, where present. */
1315 s++;
1316 while (*s != '\0') {
79072805 1317 (void)putc(*s, stderr);
a687059c
LW
1318 s++;
1319 }
1320 s++;
1321 }
79072805 1322 (void)putc('\n', stderr);
a687059c
LW
1323 }
1324
1325 /* Header fields of interest. */
1326 if (r->regstart)
463ee0b2 1327 fprintf(stderr,"start `%s' ", SvPVX(r->regstart));
a687059c
LW
1328 if (r->regstclass)
1329 fprintf(stderr,"stclass `%s' ", regprop(r->regstclass));
9ef589d8 1330 if (r->reganch & ROPT_ANCH)
a687059c 1331 fprintf(stderr,"anchored ");
9ef589d8 1332 if (r->reganch & ROPT_SKIP)
00bf170e 1333 fprintf(stderr,"plus ");
f0fcb552
LW
1334 if (r->reganch & ROPT_IMPLICIT)
1335 fprintf(stderr,"implicit ");
a687059c 1336 if (r->regmust != NULL)
463ee0b2 1337 fprintf(stderr,"must have \"%s\" back %d ", SvPVX(r->regmust),
a687059c 1338 r->regback);
f0fcb552 1339 fprintf(stderr, "minlen %d ", r->minlen);
a687059c
LW
1340 fprintf(stderr,"\n");
1341}
1342
1343/*
1344 - regprop - printable representation of opcode
1345 */
1346char *
1347regprop(op)
1348char *op;
1349{
1350 register char *p;
1351
1352 (void) strcpy(buf, ":");
1353
1354 switch (OP(op)) {
1355 case BOL:
1356 p = "BOL";
1357 break;
1358 case EOL:
1359 p = "EOL";
1360 break;
1361 case ANY:
1362 p = "ANY";
1363 break;
1364 case ANYOF:
1365 p = "ANYOF";
1366 break;
a687059c
LW
1367 case BRANCH:
1368 p = "BRANCH";
1369 break;
1370 case EXACTLY:
1371 p = "EXACTLY";
1372 break;
1373 case NOTHING:
1374 p = "NOTHING";
1375 break;
1376 case BACK:
1377 p = "BACK";
1378 break;
1379 case END:
1380 p = "END";
1381 break;
1382 case ALNUM:
1383 p = "ALNUM";
1384 break;
1385 case NALNUM:
1386 p = "NALNUM";
1387 break;
1388 case BOUND:
1389 p = "BOUND";
1390 break;
1391 case NBOUND:
1392 p = "NBOUND";
1393 break;
1394 case SPACE:
1395 p = "SPACE";
1396 break;
1397 case NSPACE:
1398 p = "NSPACE";
1399 break;
1400 case DIGIT:
1401 p = "DIGIT";
1402 break;
1403 case NDIGIT:
1404 p = "NDIGIT";
1405 break;
00bf170e
LW
1406 case CURLY:
1407 (void)sprintf(buf+strlen(buf), "CURLY {%d,%d}",
1408 ARG1(op),ARG2(op));
1409 p = NULL;
1410 break;
a687059c 1411 case REF:
fe14fcc3 1412 (void)sprintf(buf+strlen(buf), "REF%d", ARG1(op));
a687059c
LW
1413 p = NULL;
1414 break;
fe14fcc3
LW
1415 case OPEN:
1416 (void)sprintf(buf+strlen(buf), "OPEN%d", ARG1(op));
a687059c
LW
1417 p = NULL;
1418 break;
fe14fcc3
LW
1419 case CLOSE:
1420 (void)sprintf(buf+strlen(buf), "CLOSE%d", ARG1(op));
a687059c
LW
1421 p = NULL;
1422 break;
1423 case STAR:
1424 p = "STAR";
1425 break;
1426 case PLUS:
1427 p = "PLUS";
1428 break;
1429 default:
1430 FAIL("corrupted regexp opcode");
1431 }
1432 if (p != NULL)
1433 (void) strcat(buf, p);
1434 return(buf);
1435}
1436#endif /* DEBUGGING */
1437
2b69d0c2 1438void
a687059c
LW
1439regfree(r)
1440struct regexp *r;
1441{
fe14fcc3 1442 if (r->precomp) {
a687059c 1443 Safefree(r->precomp);
fe14fcc3
LW
1444 r->precomp = Nullch;
1445 }
1446 if (r->subbase) {
a687059c 1447 Safefree(r->subbase);
fe14fcc3
LW
1448 r->subbase = Nullch;
1449 }
1450 if (r->regmust) {
8990e307 1451 SvREFCNT_dec(r->regmust);
79072805 1452 r->regmust = Nullsv;
fe14fcc3
LW
1453 }
1454 if (r->regstart) {
8990e307 1455 SvREFCNT_dec(r->regstart);
79072805 1456 r->regstart = Nullsv;
fe14fcc3
LW
1457 }
1458 Safefree(r->startp);
1459 Safefree(r->endp);
a687059c
LW
1460 Safefree(r);
1461}