This is a live mirror of the Perl 5 development currently hosted at https://github.com/perl/perl5
perl 4.0 patch 16: patch #11, continued
[perl5.git] / regcomp.c
CommitLineData
a687059c
LW
1/* NOTE: this is derived from Henry Spencer's regexp code, and should not
2 * confused with the original package (see point 3 below). Thanks, Henry!
3 */
4
5/* Additional note: this code is very heavily munged from Henry's version
6 * in places. In some spots I've traded clarity for efficiency, so don't
7 * blame Henry for some of the lack of readability.
8 */
9
f0fcb552 10/* $RCSfile: regcomp.c,v $$Revision: 4.0.1.3 $$Date: 91/11/05 18:22:28 $
a687059c
LW
11 *
12 * $Log: regcomp.c,v $
f0fcb552
LW
13 * Revision 4.0.1.3 91/11/05 18:22:28 lwall
14 * patch11: minimum match length calculation in regexp is now cumulative
15 * patch11: initial .* in pattern had dependency on value of $*
16 * patch11: certain patterns made use of garbage pointers from uncleared memory
17 * patch11: prepared for ctype implementations that don't define isascii()
18 *
9ef589d8
LW
19 * Revision 4.0.1.2 91/06/07 11:48:24 lwall
20 * patch4: new copyright notice
21 * patch4: /(x+) \1/ incorrectly optimized to not match "xxx xx"
22 * patch4: // wouldn't use previous pattern if it started with a null character
23 *
35c8bce7
LW
24 * Revision 4.0.1.1 91/04/12 09:04:45 lwall
25 * patch1: random cleanup in cpp namespace
26 *
fe14fcc3
LW
27 * Revision 4.0 91/03/20 01:39:01 lwall
28 * 4.0 baseline.
a687059c
LW
29 *
30 */
f0fcb552 31/*SUPPRESS 112*/
a687059c
LW
32/*
33 * regcomp and regexec -- regsub and regerror are not used in perl
34 *
35 * Copyright (c) 1986 by University of Toronto.
36 * Written by Henry Spencer. Not derived from licensed software.
37 *
38 * Permission is granted to anyone to use this software for any
39 * purpose on any computer system, and to redistribute it freely,
40 * subject to the following restrictions:
41 *
42 * 1. The author is not responsible for the consequences of use of
43 * this software, no matter how awful, even if they arise
44 * from defects in it.
45 *
46 * 2. The origin of this software must not be misrepresented, either
47 * by explicit claim or by omission.
48 *
49 * 3. Altered versions must be plainly marked as such, and must not
50 * be misrepresented as being the original software.
51 *
52 *
53 **** Alterations to Henry's code are...
54 ****
9ef589d8 55 **** Copyright (c) 1991, Larry Wall
a687059c 56 ****
9ef589d8
LW
57 **** You may distribute under the terms of either the GNU General Public
58 **** License or the Artistic License, as specified in the README file.
59
a687059c
LW
60 *
61 * Beware that some of this code is subtly aware of the way operator
62 * precedence is structured in regular expressions. Serious changes in
63 * regular-expression syntax might require a total rethink.
64 */
65#include "EXTERN.h"
66#include "perl.h"
67#include "INTERN.h"
68#include "regcomp.h"
69
fe14fcc3
LW
70#ifdef MSDOS
71# if defined(BUGGY_MSC6)
72 /* MSC 6.00A breaks on op/regexp.t test 85 unless we turn this off */
73 # pragma optimize("a",off)
74 /* But MSC 6.00A is happy with 'w', for aliases only across function calls*/
75 # pragma optimize("w",on )
76# endif /* BUGGY_MSC6 */
77#endif /* MSDOS */
78
a687059c
LW
79#ifndef STATIC
80#define STATIC static
81#endif
82
83#define ISMULT1(c) ((c) == '*' || (c) == '+' || (c) == '?')
84#define ISMULT2(s) ((*s) == '*' || (*s) == '+' || (*s) == '?' || \
85 ((*s) == '{' && regcurly(s)))
86#define META "^$.[()|?+*\\"
87
35c8bce7
LW
88#ifdef SPSTART
89#undef SPSTART /* dratted cpp namespace... */
90#endif
a687059c
LW
91/*
92 * Flags to be passed up and down.
93 */
94#define HASWIDTH 01 /* Known never to match null string. */
95#define SIMPLE 02 /* Simple enough to be STAR/PLUS operand. */
96#define SPSTART 04 /* Starts with * or +. */
97#define WORST 0 /* Worst case. */
98
99/*
100 * Global work variables for regcomp().
101 */
102static char *regprecomp; /* uncompiled string. */
103static char *regparse; /* Input-scan pointer. */
104static char *regxend; /* End of input for compile */
105static int regnpar; /* () count. */
106static char *regcode; /* Code-emit pointer; &regdummy = don't. */
107static long regsize; /* Code size. */
108static int regfold;
109static int regsawbracket; /* Did we do {d,d} trick? */
9ef589d8 110static int regsawback; /* Did we see \1, ...? */
a687059c
LW
111
112/*
113 * Forward declarations for regcomp()'s friends.
114 */
115STATIC int regcurly();
116STATIC char *reg();
117STATIC char *regbranch();
118STATIC char *regpiece();
119STATIC char *regatom();
120STATIC char *regclass();
121STATIC char *regnode();
fe14fcc3 122STATIC char *reganode();
a687059c
LW
123STATIC void regc();
124STATIC void reginsert();
125STATIC void regtail();
126STATIC void regoptail();
127
128/*
129 - regcomp - compile a regular expression into internal code
130 *
131 * We can't allocate space until we know how big the compiled form will be,
132 * but we can't compile it (and thus know how big it is) until we've got a
133 * place to put the code. So we cheat: we compile it twice, once with code
134 * generation turned off and size counting turned on, and once "for real".
135 * This also means that we don't allocate space until we are sure that the
136 * thing really will compile successfully, and we never have to move the
137 * code and thus invalidate pointers into it. (Note that it has to be in
138 * one piece because free() must be able to free it all.) [NB: not true in perl]
139 *
140 * Beware that the optimization-preparation code in here knows about some
141 * of the structure of the compiled regexp. [I'll say.]
142 */
143regexp *
00bf170e 144regcomp(exp,xend,fold)
a687059c
LW
145char *exp;
146char *xend;
147int fold;
a687059c
LW
148{
149 register regexp *r;
150 register char *scan;
0a12ae7d
LW
151 register STR *longish;
152 STR *longest;
a687059c
LW
153 register int len;
154 register char *first;
155 int flags;
34de22dd
LW
156 int backish;
157 int backest;
a687059c 158 int curback;
f0fcb552 159 int minlen;
a687059c
LW
160 extern char *safemalloc();
161 extern char *savestr();
00bf170e 162 int sawplus = 0;
9ef589d8 163 int sawopen = 0;
a687059c
LW
164
165 if (exp == NULL)
166 fatal("NULL regexp argument");
167
168 /* First pass: determine size, legality. */
169 regfold = fold;
170 regparse = exp;
171 regxend = xend;
172 regprecomp = nsavestr(exp,xend-exp);
173 regsawbracket = 0;
9ef589d8 174 regsawback = 0;
a687059c
LW
175 regnpar = 1;
176 regsize = 0L;
177 regcode = &regdummy;
f0fcb552 178 regc((char)MAGIC);
a687059c
LW
179 if (reg(0, &flags) == NULL) {
180 Safefree(regprecomp);
fe14fcc3 181 regprecomp = Nullch;
a687059c
LW
182 return(NULL);
183 }
184
185 /* Small enough for pointer-storage convention? */
186 if (regsize >= 32767L) /* Probably could be 65535L. */
187 FAIL("regexp too big");
188
189 /* Allocate space. */
190 Newc(1001, r, sizeof(regexp) + (unsigned)regsize, char, regexp);
191 if (r == NULL)
192 FAIL("regexp out of space");
193
194 /* Second pass: emit code. */
195 if (regsawbracket)
196 bcopy(regprecomp,exp,xend-exp);
9ef589d8 197 r->prelen = xend-exp;
a687059c 198 r->precomp = regprecomp;
9ef589d8 199 r->subbeg = r->subbase = NULL;
a687059c
LW
200 regparse = exp;
201 regnpar = 1;
202 regcode = r->program;
f0fcb552 203 regc((char)MAGIC);
a687059c
LW
204 if (reg(0, &flags) == NULL)
205 return(NULL);
206
207 /* Dig out information for optimizations. */
208 r->regstart = Nullstr; /* Worst-case defaults. */
209 r->reganch = 0;
210 r->regmust = Nullstr;
211 r->regback = -1;
212 r->regstclass = Nullch;
213 scan = r->program+1; /* First BRANCH. */
214 if (OP(regnext(scan)) == END) {/* Only one top-level choice. */
215 scan = NEXTOPER(scan);
216
217 first = scan;
9ef589d8 218 while ((OP(first) == OPEN && (sawopen = 1)) ||
a687059c 219 (OP(first) == BRANCH && OP(regnext(first)) != BRANCH) ||
00bf170e
LW
220 (OP(first) == PLUS) ||
221 (OP(first) == CURLY && ARG1(first) > 0) ) {
fe14fcc3 222 if (OP(first) == PLUS)
9ef589d8 223 sawplus = 1;
fe14fcc3
LW
224 else
225 first += regarglen[OP(first)];
a687059c 226 first = NEXTOPER(first);
00bf170e 227 }
a687059c
LW
228
229 /* Starting-point info. */
9ef589d8 230 again:
a687059c
LW
231 if (OP(first) == EXACTLY) {
232 r->regstart =
233 str_make(OPERAND(first)+1,*OPERAND(first));
234 if (r->regstart->str_cur > !(sawstudy|fold))
235 fbmcompile(r->regstart,fold);
236 }
237 else if ((exp = index(simple,OP(first))) && exp > simple)
238 r->regstclass = first;
239 else if (OP(first) == BOUND || OP(first) == NBOUND)
240 r->regstclass = first;
00bf170e 241 else if (OP(first) == BOL ||
9ef589d8 242 (OP(first) == STAR && OP(NEXTOPER(first)) == ANY) ) {
f0fcb552
LW
243 /* kinda turn .* into ^.* */
244 r->reganch = ROPT_ANCH | ROPT_IMPLICIT;
9ef589d8
LW
245 first = NEXTOPER(first);
246 goto again;
247 }
248 if (sawplus && (!sawopen || !regsawback))
249 r->reganch |= ROPT_SKIP; /* x+ must match 1st of run */
a687059c
LW
250
251#ifdef DEBUGGING
252 if (debug & 512)
253 fprintf(stderr,"first %d next %d offset %d\n",
254 OP(first), OP(NEXTOPER(first)), first - scan);
255#endif
256 /*
257 * If there's something expensive in the r.e., find the
258 * longest literal string that must appear and make it the
259 * regmust. Resolve ties in favor of later strings, since
260 * the regstart check works with the beginning of the r.e.
261 * and avoiding duplication strengthens checking. Not a
262 * strong reason, but sufficient in the absence of others.
263 * [Now we resolve ties in favor of the earlier string if
264 * it happens that curback has been invalidated, since the
265 * earlier string may buy us something the later one won't.]
266 */
0a12ae7d 267 longish = str_make("",0);
a687059c
LW
268 longest = str_make("",0);
269 len = 0;
f0fcb552 270 minlen = 0;
a687059c 271 curback = 0;
34de22dd
LW
272 backish = 0;
273 backest = 0;
e5d73d77 274 while (OP(scan) != END) {
a687059c
LW
275 if (OP(scan) == BRANCH) {
276 if (OP(regnext(scan)) == BRANCH) {
277 curback = -30000;
278 while (OP(scan) == BRANCH)
279 scan = regnext(scan);
280 }
281 else /* single branch is ok */
282 scan = NEXTOPER(scan);
283 }
284 if (OP(scan) == EXACTLY) {
fe14fcc3
LW
285 char *t;
286
a687059c 287 first = scan;
fe14fcc3
LW
288 while (OP(t = regnext(scan)) == CLOSE)
289 scan = t;
f0fcb552 290 minlen += *OPERAND(first);
34de22dd 291 if (curback - backish == len) {
0a12ae7d 292 str_ncat(longish, OPERAND(first)+1,
a687059c
LW
293 *OPERAND(first));
294 len += *OPERAND(first);
295 curback += *OPERAND(first);
296 first = regnext(scan);
297 }
298 else if (*OPERAND(first) >= len + (curback >= 0)) {
299 len = *OPERAND(first);
0a12ae7d 300 str_nset(longish, OPERAND(first)+1,len);
34de22dd 301 backish = curback;
a687059c
LW
302 curback += len;
303 first = regnext(scan);
304 }
305 else
306 curback += *OPERAND(first);
307 }
0a12ae7d
LW
308 else if (index(varies,OP(scan))) {
309 curback = -30000;
310 len = 0;
34de22dd 311 if (longish->str_cur > longest->str_cur) {
0a12ae7d 312 str_sset(longest,longish);
34de22dd
LW
313 backest = backish;
314 }
0a12ae7d 315 str_nset(longish,"",0);
f0fcb552
LW
316 if (OP(scan) == PLUS &&
317 index(simple,OP(NEXTOPER(scan))))
318 minlen++;
319 else if (OP(scan) == CURLY &&
320 index(simple,OP(NEXTOPER(scan)+4)))
321 minlen += ARG1(scan);
0a12ae7d 322 }
e5d73d77 323 else if (index(simple,OP(scan))) {
0a12ae7d 324 curback++;
f0fcb552 325 minlen++;
e5d73d77 326 len = 0;
34de22dd 327 if (longish->str_cur > longest->str_cur) {
e5d73d77 328 str_sset(longest,longish);
34de22dd
LW
329 backest = backish;
330 }
e5d73d77
LW
331 str_nset(longish,"",0);
332 }
a687059c
LW
333 scan = regnext(scan);
334 }
e5d73d77
LW
335
336 /* Prefer earlier on tie, unless we can tail match latter */
337
34de22dd 338 if (longish->str_cur + (OP(first) == EOL) > longest->str_cur) {
0a12ae7d 339 str_sset(longest,longish);
34de22dd
LW
340 backest = backish;
341 }
e5d73d77
LW
342 else
343 str_nset(longish,"",0);
34de22dd
LW
344 if (longest->str_cur
345 &&
346 (!r->regstart
347 ||
f0fcb552
LW
348 !fbminstr((unsigned char*) r->regstart->str_ptr,
349 (unsigned char *) r->regstart->str_ptr
350 + r->regstart->str_cur,
34de22dd
LW
351 longest)
352 )
353 )
354 {
a687059c 355 r->regmust = longest;
34de22dd
LW
356 if (backest < 0)
357 backest = -1;
358 r->regback = backest;
0a12ae7d
LW
359 if (longest->str_cur
360 > !(sawstudy || fold || OP(first) == EOL) )
a687059c
LW
361 fbmcompile(r->regmust,fold);
362 r->regmust->str_u.str_useful = 100;
e5d73d77 363 if (OP(first) == EOL && longish->str_cur)
a687059c
LW
364 r->regmust->str_pok |= SP_TAIL;
365 }
fe14fcc3 366 else {
a687059c 367 str_free(longest);
fe14fcc3
LW
368 longest = Nullstr;
369 }
e5d73d77 370 str_free(longish);
a687059c
LW
371 }
372
373 r->do_folding = fold;
374 r->nparens = regnpar - 1;
f0fcb552
LW
375 r->minlen = minlen;
376 Newz(1002, r->startp, regnpar, char*);
377 Newz(1002, r->endp, regnpar, char*);
a687059c
LW
378#ifdef DEBUGGING
379 if (debug & 512)
380 regdump(r);
381#endif
382 return(r);
383}
384
385/*
386 - reg - regular expression, i.e. main body or parenthesized thing
387 *
388 * Caller must absorb opening parenthesis.
389 *
390 * Combining parenthesis handling with the base level of regular expression
391 * is a trifle forced, but the need to tie the tails of the branches to what
392 * follows makes it hard to avoid.
393 */
394static char *
395reg(paren, flagp)
396int paren; /* Parenthesized? */
397int *flagp;
398{
399 register char *ret;
400 register char *br;
401 register char *ender;
402 register int parno;
403 int flags;
404
405 *flagp = HASWIDTH; /* Tentatively. */
406
407 /* Make an OPEN node, if parenthesized. */
408 if (paren) {
a687059c
LW
409 parno = regnpar;
410 regnpar++;
fe14fcc3 411 ret = reganode(OPEN, parno);
a687059c
LW
412 } else
413 ret = NULL;
414
415 /* Pick up the branches, linking them together. */
416 br = regbranch(&flags);
417 if (br == NULL)
418 return(NULL);
419 if (ret != NULL)
420 regtail(ret, br); /* OPEN -> first. */
421 else
422 ret = br;
423 if (!(flags&HASWIDTH))
424 *flagp &= ~HASWIDTH;
425 *flagp |= flags&SPSTART;
426 while (*regparse == '|') {
427 regparse++;
428 br = regbranch(&flags);
429 if (br == NULL)
430 return(NULL);
431 regtail(ret, br); /* BRANCH -> BRANCH. */
432 if (!(flags&HASWIDTH))
433 *flagp &= ~HASWIDTH;
434 *flagp |= flags&SPSTART;
435 }
436
437 /* Make a closing node, and hook it on the end. */
fe14fcc3
LW
438 if (paren)
439 ender = reganode(CLOSE, parno);
440 else
441 ender = regnode(END);
a687059c
LW
442 regtail(ret, ender);
443
444 /* Hook the tails of the branches to the closing node. */
445 for (br = ret; br != NULL; br = regnext(br))
446 regoptail(br, ender);
447
448 /* Check for proper termination. */
449 if (paren && *regparse++ != ')') {
450 FAIL("unmatched () in regexp");
451 } else if (!paren && regparse < regxend) {
452 if (*regparse == ')') {
453 FAIL("unmatched () in regexp");
454 } else
455 FAIL("junk on end of regexp"); /* "Can't happen". */
456 /* NOTREACHED */
457 }
458
459 return(ret);
460}
461
462/*
463 - regbranch - one alternative of an | operator
464 *
465 * Implements the concatenation operator.
466 */
467static char *
468regbranch(flagp)
469int *flagp;
470{
471 register char *ret;
472 register char *chain;
473 register char *latest;
474 int flags;
475
476 *flagp = WORST; /* Tentatively. */
477
478 ret = regnode(BRANCH);
479 chain = NULL;
480 while (regparse < regxend && *regparse != '|' && *regparse != ')') {
481 latest = regpiece(&flags);
482 if (latest == NULL)
483 return(NULL);
484 *flagp |= flags&HASWIDTH;
485 if (chain == NULL) /* First piece. */
486 *flagp |= flags&SPSTART;
487 else
488 regtail(chain, latest);
489 chain = latest;
490 }
491 if (chain == NULL) /* Loop ran zero times. */
492 (void) regnode(NOTHING);
493
494 return(ret);
495}
496
497/*
498 - regpiece - something followed by possible [*+?]
499 *
500 * Note that the branching code sequences used for ? and the general cases
501 * of * and + are somewhat optimized: they use the same NOTHING node as
502 * both the endmarker for their branch list and the body of the last branch.
503 * It might seem that this node could be dispensed with entirely, but the
504 * endmarker role is not redundant.
505 */
506static char *
507regpiece(flagp)
508int *flagp;
509{
510 register char *ret;
511 register char op;
512 register char *next;
513 int flags;
514 char *origparse = regparse;
515 int orignpar = regnpar;
516 char *max;
517 int iter;
518 char ch;
519
520 ret = regatom(&flags);
521 if (ret == NULL)
522 return(NULL);
523
524 op = *regparse;
525
526 /* Here's a total kludge: if after the atom there's a {\d+,?\d*}
527 * then we decrement the first number by one and reset our
528 * parsing back to the beginning of the same atom. If the first number
529 * is down to 0, decrement the second number instead and fake up
530 * a ? after it. Given the way this compiler doesn't keep track
531 * of offsets on the first pass, this is the only way to replicate
532 * a piece of code. Sigh.
533 */
534 if (op == '{' && regcurly(regparse)) {
535 next = regparse + 1;
536 max = Nullch;
f0fcb552 537 while (isDIGIT(*next) || *next == ',') {
a687059c
LW
538 if (*next == ',') {
539 if (max)
540 break;
541 else
542 max = next;
543 }
544 next++;
545 }
546 if (*next == '}') { /* got one */
a687059c
LW
547 if (!max)
548 max = next;
549 regparse++;
550 iter = atoi(regparse);
00bf170e
LW
551 if (flags&SIMPLE) { /* we can do it right after all */
552 int tmp;
553
554 reginsert(CURLY, ret);
fe14fcc3
LW
555 if (iter > 0)
556 *flagp = (WORST|HASWIDTH);
00bf170e
LW
557 if (*max == ',')
558 max++;
6eb13c3b
LW
559 else
560 max = regparse;
00bf170e
LW
561 tmp = atoi(max);
562 if (tmp && tmp < iter)
563 fatal("Can't do {n,m} with n > m");
564 if (regcode != &regdummy) {
565#ifdef REGALIGN
566 *(unsigned short *)(ret+3) = iter;
567 *(unsigned short *)(ret+5) = tmp;
568#else
569 ret[3] = iter >> 8; ret[4] = iter & 0377;
570 ret[5] = tmp >> 8; ret[6] = tmp & 0377;
571#endif
572 }
573 regparse = next;
574 goto nest_check;
575 }
576 regsawbracket++; /* remember we clobbered exp */
a687059c
LW
577 if (iter > 0) {
578 ch = *max;
579 sprintf(regparse,"%.*d", max-regparse, iter - 1);
580 *max = ch;
00bf170e
LW
581 if (*max == ',' && max[1] != '}') {
582 if (atoi(max+1) <= 0)
583 fatal("Can't do {n,m} with n > m");
a687059c
LW
584 ch = *next;
585 sprintf(max+1,"%.*d", next-(max+1), atoi(max+1) - 1);
586 *next = ch;
587 }
00bf170e 588 if (iter != 1 || *max == ',') {
a687059c
LW
589 regparse = origparse; /* back up input pointer */
590 regnpar = orignpar; /* don't make more parens */
591 }
592 else {
593 regparse = next;
594 goto nest_check;
595 }
596 *flagp = flags;
597 return ret;
598 }
599 if (*max == ',') {
600 max++;
601 iter = atoi(max);
602 if (max == next) { /* any number more? */
603 regparse = next;
604 op = '*'; /* fake up one with a star */
605 }
606 else if (iter > 0) {
607 op = '?'; /* fake up optional atom */
608 ch = *next;
609 sprintf(max,"%.*d", next-max, iter - 1);
610 *next = ch;
611 if (iter == 1)
612 regparse = next;
613 else {
614 regparse = origparse - 1; /* offset ++ below */
615 regnpar = orignpar;
616 }
617 }
618 else
619 fatal("Can't do {n,0}");
620 }
621 else
622 fatal("Can't do {0}");
623 }
624 }
625
626 if (!ISMULT1(op)) {
627 *flagp = flags;
628 return(ret);
629 }
630
631 if (!(flags&HASWIDTH) && op != '?')
632 FAIL("regexp *+ operand could be empty");
633 *flagp = (op != '+') ? (WORST|SPSTART) : (WORST|HASWIDTH);
634
635 if (op == '*' && (flags&SIMPLE))
636 reginsert(STAR, ret);
637 else if (op == '*') {
638 /* Emit x* as (x&|), where & means "self". */
639 reginsert(BRANCH, ret); /* Either x */
640 regoptail(ret, regnode(BACK)); /* and loop */
641 regoptail(ret, ret); /* back */
642 regtail(ret, regnode(BRANCH)); /* or */
643 regtail(ret, regnode(NOTHING)); /* null. */
644 } else if (op == '+' && (flags&SIMPLE))
645 reginsert(PLUS, ret);
646 else if (op == '+') {
647 /* Emit x+ as x(&|), where & means "self". */
648 next = regnode(BRANCH); /* Either */
649 regtail(ret, next);
650 regtail(regnode(BACK), ret); /* loop back */
651 regtail(next, regnode(BRANCH)); /* or */
652 regtail(ret, regnode(NOTHING)); /* null. */
653 } else if (op == '?') {
654 /* Emit x? as (x|) */
655 reginsert(BRANCH, ret); /* Either x */
656 regtail(ret, regnode(BRANCH)); /* or */
657 next = regnode(NOTHING); /* null. */
658 regtail(ret, next);
659 regoptail(ret, next);
660 }
661 nest_check:
662 regparse++;
663 if (ISMULT2(regparse))
664 FAIL("nested *?+ in regexp");
665
666 return(ret);
667}
668
669/*
670 - regatom - the lowest level
671 *
672 * Optimization: gobbles an entire sequence of ordinary characters so that
673 * it can turn them into a single node, which is smaller to store and
674 * faster to run. Backslashed characters are exceptions, each becoming a
675 * separate node; the code is simpler that way and it's not worth fixing.
676 *
677 * [Yes, it is worth fixing, some scripts can run twice the speed.]
678 */
679static char *
680regatom(flagp)
681int *flagp;
682{
683 register char *ret;
684 int flags;
685
686 *flagp = WORST; /* Tentatively. */
687
688 switch (*regparse++) {
689 case '^':
690 ret = regnode(BOL);
691 break;
692 case '$':
693 ret = regnode(EOL);
694 break;
695 case '.':
696 ret = regnode(ANY);
697 *flagp |= HASWIDTH|SIMPLE;
698 break;
699 case '[':
700 ret = regclass();
701 *flagp |= HASWIDTH|SIMPLE;
702 break;
703 case '(':
704 ret = reg(1, &flags);
705 if (ret == NULL)
706 return(NULL);
707 *flagp |= flags&(HASWIDTH|SPSTART);
708 break;
709 case '|':
710 case ')':
711 FAIL("internal urp in regexp"); /* Supposed to be caught earlier. */
712 break;
713 case '?':
714 case '+':
715 case '*':
716 FAIL("?+* follows nothing in regexp");
717 break;
718 case '\\':
719 switch (*regparse) {
720 case 'w':
721 ret = regnode(ALNUM);
722 *flagp |= HASWIDTH|SIMPLE;
723 regparse++;
724 break;
725 case 'W':
726 ret = regnode(NALNUM);
727 *flagp |= HASWIDTH|SIMPLE;
728 regparse++;
729 break;
730 case 'b':
731 ret = regnode(BOUND);
732 *flagp |= SIMPLE;
733 regparse++;
734 break;
735 case 'B':
736 ret = regnode(NBOUND);
737 *flagp |= SIMPLE;
738 regparse++;
739 break;
740 case 's':
741 ret = regnode(SPACE);
742 *flagp |= HASWIDTH|SIMPLE;
743 regparse++;
744 break;
745 case 'S':
746 ret = regnode(NSPACE);
747 *flagp |= HASWIDTH|SIMPLE;
748 regparse++;
749 break;
750 case 'd':
751 ret = regnode(DIGIT);
752 *flagp |= HASWIDTH|SIMPLE;
753 regparse++;
754 break;
755 case 'D':
756 ret = regnode(NDIGIT);
757 *flagp |= HASWIDTH|SIMPLE;
758 regparse++;
759 break;
760 case 'n':
761 case 'r':
762 case 't':
763 case 'f':
fe14fcc3
LW
764 case 'e':
765 case 'a':
766 case 'x':
767 case 'c':
768 case '0':
a687059c 769 goto defchar;
fe14fcc3 770 case '1': case '2': case '3': case '4':
a687059c 771 case '5': case '6': case '7': case '8': case '9':
fe14fcc3
LW
772 {
773 int num = atoi(regparse);
774
775 if (num > 9 && num >= regnpar)
a687059c 776 goto defchar;
fe14fcc3 777 else {
9ef589d8 778 regsawback = 1;
fe14fcc3 779 ret = reganode(REF, num);
f0fcb552 780 while (isDIGIT(*regparse))
fe14fcc3 781 regparse++;
a687059c 782 *flagp |= SIMPLE;
fe14fcc3 783 }
a687059c
LW
784 }
785 break;
786 case '\0':
787 if (regparse >= regxend)
788 FAIL("trailing \\ in regexp");
789 /* FALL THROUGH */
790 default:
791 goto defchar;
792 }
793 break;
794 default: {
795 register int len;
796 register char ender;
797 register char *p;
798 char *oldp;
fe14fcc3 799 int numlen;
a687059c
LW
800
801 defchar:
802 ret = regnode(EXACTLY);
803 regc(0); /* save spot for len */
804 for (len=0, p=regparse-1;
805 len < 127 && p < regxend;
806 len++)
807 {
808 oldp = p;
809 switch (*p) {
810 case '^':
811 case '$':
812 case '.':
813 case '[':
814 case '(':
815 case ')':
816 case '|':
817 goto loopdone;
818 case '\\':
819 switch (*++p) {
820 case 'w':
821 case 'W':
822 case 'b':
823 case 'B':
824 case 's':
825 case 'S':
826 case 'd':
827 case 'D':
828 --p;
829 goto loopdone;
830 case 'n':
831 ender = '\n';
832 p++;
833 break;
834 case 'r':
835 ender = '\r';
836 p++;
837 break;
838 case 't':
839 ender = '\t';
840 p++;
841 break;
842 case 'f':
843 ender = '\f';
844 p++;
845 break;
fe14fcc3
LW
846 case 'e':
847 ender = '\033';
848 p++;
849 break;
850 case 'a':
851 ender = '\007';
852 p++;
853 break;
854 case 'x':
855 ender = scanhex(++p, 2, &numlen);
856 p += numlen;
857 break;
858 case 'c':
859 p++;
860 ender = *p++;
f0fcb552 861 if (isLOWER(ender))
fe14fcc3
LW
862 ender = toupper(ender);
863 ender ^= 64;
864 break;
a687059c
LW
865 case '0': case '1': case '2': case '3':case '4':
866 case '5': case '6': case '7': case '8':case '9':
fe14fcc3 867 if (*p == '0' ||
f0fcb552 868 (isDIGIT(p[1]) && atoi(p) >= regnpar) ) {
fe14fcc3
LW
869 ender = scanoct(p, 3, &numlen);
870 p += numlen;
a687059c
LW
871 }
872 else {
873 --p;
874 goto loopdone;
875 }
876 break;
877 case '\0':
878 if (p >= regxend)
879 FAIL("trailing \\ in regexp");
880 /* FALL THROUGH */
881 default:
882 ender = *p++;
883 break;
884 }
885 break;
886 default:
887 ender = *p++;
888 break;
889 }
f0fcb552 890 if (regfold && isUPPER(ender))
a687059c
LW
891 ender = tolower(ender);
892 if (ISMULT2(p)) { /* Back off on ?+*. */
893 if (len)
894 p = oldp;
895 else {
896 len++;
897 regc(ender);
898 }
899 break;
900 }
901 regc(ender);
902 }
903 loopdone:
904 regparse = p;
905 if (len <= 0)
906 FAIL("internal disaster in regexp");
907 *flagp |= HASWIDTH;
908 if (len == 1)
909 *flagp |= SIMPLE;
910 if (regcode != &regdummy)
911 *OPERAND(ret) = len;
912 regc('\0');
913 }
914 break;
915 }
916
917 return(ret);
918}
919
920static void
921regset(bits,def,c)
922char *bits;
923int def;
924register int c;
925{
926 if (regcode == &regdummy)
927 return;
ac58e20f 928 c &= 255;
a687059c
LW
929 if (def)
930 bits[c >> 3] &= ~(1 << (c & 7));
931 else
932 bits[c >> 3] |= (1 << (c & 7));
933}
934
935static char *
936regclass()
937{
938 register char *bits;
939 register int class;
940 register int lastclass;
941 register int range = 0;
942 register char *ret;
943 register int def;
fe14fcc3 944 int numlen;
a687059c 945
00bf170e 946 ret = regnode(ANYOF);
a687059c 947 if (*regparse == '^') { /* Complement of range. */
a687059c
LW
948 regparse++;
949 def = 0;
950 } else {
a687059c
LW
951 def = 255;
952 }
953 bits = regcode;
954 for (class = 0; class < 32; class++)
955 regc(def);
956 if (*regparse == ']' || *regparse == '-')
00bf170e 957 goto skipcond; /* allow 1st char to be ] or - */
a687059c 958 while (regparse < regxend && *regparse != ']') {
00bf170e 959 skipcond:
a687059c
LW
960 class = UCHARAT(regparse++);
961 if (class == '\\') {
962 class = UCHARAT(regparse++);
963 switch (class) {
964 case 'w':
965 for (class = 'a'; class <= 'z'; class++)
966 regset(bits,def,class);
967 for (class = 'A'; class <= 'Z'; class++)
968 regset(bits,def,class);
969 for (class = '0'; class <= '9'; class++)
970 regset(bits,def,class);
971 regset(bits,def,'_');
972 lastclass = 1234;
973 continue;
974 case 's':
975 regset(bits,def,' ');
976 regset(bits,def,'\t');
977 regset(bits,def,'\r');
978 regset(bits,def,'\f');
979 regset(bits,def,'\n');
980 lastclass = 1234;
981 continue;
982 case 'd':
983 for (class = '0'; class <= '9'; class++)
984 regset(bits,def,class);
985 lastclass = 1234;
986 continue;
987 case 'n':
988 class = '\n';
989 break;
990 case 'r':
991 class = '\r';
992 break;
993 case 't':
994 class = '\t';
995 break;
996 case 'f':
997 class = '\f';
998 break;
999 case 'b':
1000 class = '\b';
1001 break;
fe14fcc3
LW
1002 case 'e':
1003 class = '\033';
1004 break;
1005 case 'a':
1006 class = '\007';
1007 break;
1008 case 'x':
1009 class = scanhex(regparse, 2, &numlen);
1010 regparse += numlen;
1011 break;
1012 case 'c':
1013 class = *regparse++;
f0fcb552 1014 if (isLOWER(class))
fe14fcc3
LW
1015 class = toupper(class);
1016 class ^= 64;
1017 break;
a687059c
LW
1018 case '0': case '1': case '2': case '3': case '4':
1019 case '5': case '6': case '7': case '8': case '9':
fe14fcc3
LW
1020 class = scanoct(--regparse, 3, &numlen);
1021 regparse += numlen;
a687059c
LW
1022 break;
1023 }
1024 }
a687059c
LW
1025 if (range) {
1026 if (lastclass > class)
1027 FAIL("invalid [] range in regexp");
00bf170e 1028 range = 0;
a687059c 1029 }
00bf170e
LW
1030 else {
1031 lastclass = class;
1032 if (*regparse == '-' && regparse+1 < regxend &&
1033 regparse[1] != ']') {
1034 regparse++;
1035 range = 1;
1036 continue; /* do it next time */
1037 }
1038 }
1039 for ( ; lastclass <= class; lastclass++) {
a687059c 1040 regset(bits,def,lastclass);
f0fcb552 1041 if (regfold && isUPPER(lastclass))
a687059c
LW
1042 regset(bits,def,tolower(lastclass));
1043 }
1044 lastclass = class;
1045 }
1046 if (*regparse != ']')
1047 FAIL("unmatched [] in regexp");
a687059c
LW
1048 regparse++;
1049 return ret;
1050}
1051
1052/*
1053 - regnode - emit a node
1054 */
1055static char * /* Location. */
1056regnode(op)
1057char op;
1058{
1059 register char *ret;
1060 register char *ptr;
1061
1062 ret = regcode;
1063 if (ret == &regdummy) {
1064#ifdef REGALIGN
1065 if (!(regsize & 1))
1066 regsize++;
1067#endif
1068 regsize += 3;
1069 return(ret);
1070 }
1071
1072#ifdef REGALIGN
1073#ifndef lint
1074 if (!((long)ret & 1))
1075 *ret++ = 127;
1076#endif
1077#endif
1078 ptr = ret;
1079 *ptr++ = op;
1080 *ptr++ = '\0'; /* Null "next" pointer. */
1081 *ptr++ = '\0';
1082 regcode = ptr;
1083
1084 return(ret);
1085}
1086
1087/*
fe14fcc3
LW
1088 - reganode - emit a node with an argument
1089 */
1090static char * /* Location. */
1091reganode(op, arg)
1092char op;
1093unsigned short arg;
1094{
1095 register char *ret;
1096 register char *ptr;
1097
1098 ret = regcode;
1099 if (ret == &regdummy) {
1100#ifdef REGALIGN
1101 if (!(regsize & 1))
1102 regsize++;
1103#endif
1104 regsize += 5;
1105 return(ret);
1106 }
1107
1108#ifdef REGALIGN
1109#ifndef lint
1110 if (!((long)ret & 1))
1111 *ret++ = 127;
1112#endif
1113#endif
1114 ptr = ret;
1115 *ptr++ = op;
1116 *ptr++ = '\0'; /* Null "next" pointer. */
1117 *ptr++ = '\0';
1118#ifdef REGALIGN
1119 *(unsigned short *)(ret+3) = arg;
1120#else
1121 ret[3] = arg >> 8; ret[4] = arg & 0377;
1122#endif
1123 ptr += 2;
1124 regcode = ptr;
1125
1126 return(ret);
1127}
1128
1129/*
a687059c
LW
1130 - regc - emit (if appropriate) a byte of code
1131 */
1132static void
1133regc(b)
1134char b;
1135{
1136 if (regcode != &regdummy)
1137 *regcode++ = b;
1138 else
1139 regsize++;
1140}
1141
1142/*
1143 - reginsert - insert an operator in front of already-emitted operand
1144 *
1145 * Means relocating the operand.
1146 */
1147static void
1148reginsert(op, opnd)
1149char op;
1150char *opnd;
1151{
1152 register char *src;
1153 register char *dst;
1154 register char *place;
00bf170e 1155 register offset = (op == CURLY ? 4 : 0);
a687059c
LW
1156
1157 if (regcode == &regdummy) {
1158#ifdef REGALIGN
00bf170e 1159 regsize += 4 + offset;
a687059c 1160#else
00bf170e 1161 regsize += 3 + offset;
a687059c
LW
1162#endif
1163 return;
1164 }
1165
1166 src = regcode;
1167#ifdef REGALIGN
00bf170e 1168 regcode += 4 + offset;
a687059c 1169#else
00bf170e 1170 regcode += 3 + offset;
a687059c
LW
1171#endif
1172 dst = regcode;
1173 while (src > opnd)
1174 *--dst = *--src;
1175
1176 place = opnd; /* Op node, where operand used to be. */
1177 *place++ = op;
1178 *place++ = '\0';
1179 *place++ = '\0';
00bf170e
LW
1180 while (offset-- > 0)
1181 *place++ = '\0';
a687059c
LW
1182}
1183
1184/*
1185 - regtail - set the next-pointer at the end of a node chain
1186 */
1187static void
1188regtail(p, val)
1189char *p;
1190char *val;
1191{
1192 register char *scan;
1193 register char *temp;
1194 register int offset;
1195
1196 if (p == &regdummy)
1197 return;
1198
1199 /* Find last node. */
1200 scan = p;
1201 for (;;) {
1202 temp = regnext(scan);
1203 if (temp == NULL)
1204 break;
1205 scan = temp;
1206 }
1207
1208#ifdef REGALIGN
1209 offset = val - scan;
1210#ifndef lint
1211 *(short*)(scan+1) = offset;
1212#else
1213 offset = offset;
1214#endif
1215#else
1216 if (OP(scan) == BACK)
1217 offset = scan - val;
1218 else
1219 offset = val - scan;
1220 *(scan+1) = (offset>>8)&0377;
1221 *(scan+2) = offset&0377;
1222#endif
1223}
1224
1225/*
1226 - regoptail - regtail on operand of first argument; nop if operandless
1227 */
1228static void
1229regoptail(p, val)
1230char *p;
1231char *val;
1232{
1233 /* "Operandless" and "op != BRANCH" are synonymous in practice. */
1234 if (p == NULL || p == &regdummy || OP(p) != BRANCH)
1235 return;
1236 regtail(NEXTOPER(p), val);
1237}
1238
1239/*
1240 - regcurly - a little FSA that accepts {\d+,?\d*}
1241 */
1242STATIC int
1243regcurly(s)
1244register char *s;
1245{
1246 if (*s++ != '{')
1247 return FALSE;
f0fcb552 1248 if (!isDIGIT(*s))
a687059c 1249 return FALSE;
f0fcb552 1250 while (isDIGIT(*s))
a687059c
LW
1251 s++;
1252 if (*s == ',')
1253 s++;
f0fcb552 1254 while (isDIGIT(*s))
a687059c
LW
1255 s++;
1256 if (*s != '}')
1257 return FALSE;
1258 return TRUE;
1259}
1260
1261#ifdef DEBUGGING
1262
1263/*
1264 - regdump - dump a regexp onto stderr in vaguely comprehensible form
1265 */
1266void
1267regdump(r)
1268regexp *r;
1269{
1270 register char *s;
1271 register char op = EXACTLY; /* Arbitrary non-END op. */
1272 register char *next;
a687059c
LW
1273
1274
1275 s = r->program + 1;
1276 while (op != END) { /* While that wasn't END last time... */
1277#ifdef REGALIGN
1278 if (!((long)s & 1))
1279 s++;
1280#endif
1281 op = OP(s);
1282 fprintf(stderr,"%2d%s", s-r->program, regprop(s)); /* Where, what. */
1283 next = regnext(s);
fe14fcc3 1284 s += regarglen[op];
a687059c
LW
1285 if (next == NULL) /* Next ptr. */
1286 fprintf(stderr,"(0)");
1287 else
1288 fprintf(stderr,"(%d)", (s-r->program)+(next-s));
1289 s += 3;
00bf170e 1290 if (op == ANYOF) {
a687059c
LW
1291 s += 32;
1292 }
1293 if (op == EXACTLY) {
1294 /* Literal string, where present. */
1295 s++;
1296 while (*s != '\0') {
1297 (void)putchar(*s);
1298 s++;
1299 }
1300 s++;
1301 }
1302 (void)putchar('\n');
1303 }
1304
1305 /* Header fields of interest. */
1306 if (r->regstart)
1307 fprintf(stderr,"start `%s' ", r->regstart->str_ptr);
1308 if (r->regstclass)
1309 fprintf(stderr,"stclass `%s' ", regprop(r->regstclass));
9ef589d8 1310 if (r->reganch & ROPT_ANCH)
a687059c 1311 fprintf(stderr,"anchored ");
9ef589d8 1312 if (r->reganch & ROPT_SKIP)
00bf170e 1313 fprintf(stderr,"plus ");
f0fcb552
LW
1314 if (r->reganch & ROPT_IMPLICIT)
1315 fprintf(stderr,"implicit ");
a687059c
LW
1316 if (r->regmust != NULL)
1317 fprintf(stderr,"must have \"%s\" back %d ", r->regmust->str_ptr,
1318 r->regback);
f0fcb552 1319 fprintf(stderr, "minlen %d ", r->minlen);
a687059c
LW
1320 fprintf(stderr,"\n");
1321}
1322
1323/*
1324 - regprop - printable representation of opcode
1325 */
1326char *
1327regprop(op)
1328char *op;
1329{
1330 register char *p;
1331
1332 (void) strcpy(buf, ":");
1333
1334 switch (OP(op)) {
1335 case BOL:
1336 p = "BOL";
1337 break;
1338 case EOL:
1339 p = "EOL";
1340 break;
1341 case ANY:
1342 p = "ANY";
1343 break;
1344 case ANYOF:
1345 p = "ANYOF";
1346 break;
a687059c
LW
1347 case BRANCH:
1348 p = "BRANCH";
1349 break;
1350 case EXACTLY:
1351 p = "EXACTLY";
1352 break;
1353 case NOTHING:
1354 p = "NOTHING";
1355 break;
1356 case BACK:
1357 p = "BACK";
1358 break;
1359 case END:
1360 p = "END";
1361 break;
1362 case ALNUM:
1363 p = "ALNUM";
1364 break;
1365 case NALNUM:
1366 p = "NALNUM";
1367 break;
1368 case BOUND:
1369 p = "BOUND";
1370 break;
1371 case NBOUND:
1372 p = "NBOUND";
1373 break;
1374 case SPACE:
1375 p = "SPACE";
1376 break;
1377 case NSPACE:
1378 p = "NSPACE";
1379 break;
1380 case DIGIT:
1381 p = "DIGIT";
1382 break;
1383 case NDIGIT:
1384 p = "NDIGIT";
1385 break;
00bf170e
LW
1386 case CURLY:
1387 (void)sprintf(buf+strlen(buf), "CURLY {%d,%d}",
1388 ARG1(op),ARG2(op));
1389 p = NULL;
1390 break;
a687059c 1391 case REF:
fe14fcc3 1392 (void)sprintf(buf+strlen(buf), "REF%d", ARG1(op));
a687059c
LW
1393 p = NULL;
1394 break;
fe14fcc3
LW
1395 case OPEN:
1396 (void)sprintf(buf+strlen(buf), "OPEN%d", ARG1(op));
a687059c
LW
1397 p = NULL;
1398 break;
fe14fcc3
LW
1399 case CLOSE:
1400 (void)sprintf(buf+strlen(buf), "CLOSE%d", ARG1(op));
a687059c
LW
1401 p = NULL;
1402 break;
1403 case STAR:
1404 p = "STAR";
1405 break;
1406 case PLUS:
1407 p = "PLUS";
1408 break;
1409 default:
1410 FAIL("corrupted regexp opcode");
1411 }
1412 if (p != NULL)
1413 (void) strcat(buf, p);
1414 return(buf);
1415}
1416#endif /* DEBUGGING */
1417
1418regfree(r)
1419struct regexp *r;
1420{
fe14fcc3 1421 if (r->precomp) {
a687059c 1422 Safefree(r->precomp);
fe14fcc3
LW
1423 r->precomp = Nullch;
1424 }
1425 if (r->subbase) {
a687059c 1426 Safefree(r->subbase);
fe14fcc3
LW
1427 r->subbase = Nullch;
1428 }
1429 if (r->regmust) {
a687059c 1430 str_free(r->regmust);
fe14fcc3
LW
1431 r->regmust = Nullstr;
1432 }
1433 if (r->regstart) {
a687059c 1434 str_free(r->regstart);
fe14fcc3
LW
1435 r->regstart = Nullstr;
1436 }
1437 Safefree(r->startp);
1438 Safefree(r->endp);
a687059c
LW
1439 Safefree(r);
1440}