This is a live mirror of the Perl 5 development currently hosted at https://github.com/perl/perl5
perl 3.0 patch #4 Patch #2 continued
[perl5.git] / regcomp.c
CommitLineData
a687059c
LW
1/* NOTE: this is derived from Henry Spencer's regexp code, and should not
2 * confused with the original package (see point 3 below). Thanks, Henry!
3 */
4
5/* Additional note: this code is very heavily munged from Henry's version
6 * in places. In some spots I've traded clarity for efficiency, so don't
7 * blame Henry for some of the lack of readability.
8 */
9
ae986130 10/* $Header: regcomp.c,v 3.0.1.1 89/11/11 04:51:04 lwall Locked $
a687059c
LW
11 *
12 * $Log: regcomp.c,v $
ae986130
LW
13 * Revision 3.0.1.1 89/11/11 04:51:04 lwall
14 * patch2: /[\000]/ didn't work
15 *
a687059c
LW
16 * Revision 3.0 89/10/18 15:22:29 lwall
17 * 3.0 baseline
18 *
19 */
20
21/*
22 * regcomp and regexec -- regsub and regerror are not used in perl
23 *
24 * Copyright (c) 1986 by University of Toronto.
25 * Written by Henry Spencer. Not derived from licensed software.
26 *
27 * Permission is granted to anyone to use this software for any
28 * purpose on any computer system, and to redistribute it freely,
29 * subject to the following restrictions:
30 *
31 * 1. The author is not responsible for the consequences of use of
32 * this software, no matter how awful, even if they arise
33 * from defects in it.
34 *
35 * 2. The origin of this software must not be misrepresented, either
36 * by explicit claim or by omission.
37 *
38 * 3. Altered versions must be plainly marked as such, and must not
39 * be misrepresented as being the original software.
40 *
41 *
42 **** Alterations to Henry's code are...
43 ****
44 **** Copyright (c) 1989, Larry Wall
45 ****
46 **** You may distribute under the terms of the GNU General Public License
47 **** as specified in the README file that comes with the perl 3.0 kit.
48 *
49 * Beware that some of this code is subtly aware of the way operator
50 * precedence is structured in regular expressions. Serious changes in
51 * regular-expression syntax might require a total rethink.
52 */
53#include "EXTERN.h"
54#include "perl.h"
55#include "INTERN.h"
56#include "regcomp.h"
57
58#ifndef STATIC
59#define STATIC static
60#endif
61
62#define ISMULT1(c) ((c) == '*' || (c) == '+' || (c) == '?')
63#define ISMULT2(s) ((*s) == '*' || (*s) == '+' || (*s) == '?' || \
64 ((*s) == '{' && regcurly(s)))
65#define META "^$.[()|?+*\\"
66
67/*
68 * Flags to be passed up and down.
69 */
70#define HASWIDTH 01 /* Known never to match null string. */
71#define SIMPLE 02 /* Simple enough to be STAR/PLUS operand. */
72#define SPSTART 04 /* Starts with * or +. */
73#define WORST 0 /* Worst case. */
74
75/*
76 * Global work variables for regcomp().
77 */
78static char *regprecomp; /* uncompiled string. */
79static char *regparse; /* Input-scan pointer. */
80static char *regxend; /* End of input for compile */
81static int regnpar; /* () count. */
82static char *regcode; /* Code-emit pointer; &regdummy = don't. */
83static long regsize; /* Code size. */
84static int regfold;
85static int regsawbracket; /* Did we do {d,d} trick? */
86
87/*
88 * Forward declarations for regcomp()'s friends.
89 */
90STATIC int regcurly();
91STATIC char *reg();
92STATIC char *regbranch();
93STATIC char *regpiece();
94STATIC char *regatom();
95STATIC char *regclass();
96STATIC char *regnode();
97STATIC void regc();
98STATIC void reginsert();
99STATIC void regtail();
100STATIC void regoptail();
101
102/*
103 - regcomp - compile a regular expression into internal code
104 *
105 * We can't allocate space until we know how big the compiled form will be,
106 * but we can't compile it (and thus know how big it is) until we've got a
107 * place to put the code. So we cheat: we compile it twice, once with code
108 * generation turned off and size counting turned on, and once "for real".
109 * This also means that we don't allocate space until we are sure that the
110 * thing really will compile successfully, and we never have to move the
111 * code and thus invalidate pointers into it. (Note that it has to be in
112 * one piece because free() must be able to free it all.) [NB: not true in perl]
113 *
114 * Beware that the optimization-preparation code in here knows about some
115 * of the structure of the compiled regexp. [I'll say.]
116 */
117regexp *
118regcomp(exp,xend,fold,rare)
119char *exp;
120char *xend;
121int fold;
122int rare;
123{
124 register regexp *r;
125 register char *scan;
126 register STR *longest;
127 register int len;
128 register char *first;
129 int flags;
130 int back;
131 int curback;
132 extern char *safemalloc();
133 extern char *savestr();
134
135 if (exp == NULL)
136 fatal("NULL regexp argument");
137
138 /* First pass: determine size, legality. */
139 regfold = fold;
140 regparse = exp;
141 regxend = xend;
142 regprecomp = nsavestr(exp,xend-exp);
143 regsawbracket = 0;
144 regnpar = 1;
145 regsize = 0L;
146 regcode = &regdummy;
147 regc(MAGIC);
148 if (reg(0, &flags) == NULL) {
149 Safefree(regprecomp);
150 return(NULL);
151 }
152
153 /* Small enough for pointer-storage convention? */
154 if (regsize >= 32767L) /* Probably could be 65535L. */
155 FAIL("regexp too big");
156
157 /* Allocate space. */
158 Newc(1001, r, sizeof(regexp) + (unsigned)regsize, char, regexp);
159 if (r == NULL)
160 FAIL("regexp out of space");
161
162 /* Second pass: emit code. */
163 if (regsawbracket)
164 bcopy(regprecomp,exp,xend-exp);
165 r->precomp = regprecomp;
166 r->subbase = NULL;
167 regparse = exp;
168 regnpar = 1;
169 regcode = r->program;
170 regc(MAGIC);
171 if (reg(0, &flags) == NULL)
172 return(NULL);
173
174 /* Dig out information for optimizations. */
175 r->regstart = Nullstr; /* Worst-case defaults. */
176 r->reganch = 0;
177 r->regmust = Nullstr;
178 r->regback = -1;
179 r->regstclass = Nullch;
180 scan = r->program+1; /* First BRANCH. */
181 if (OP(regnext(scan)) == END) {/* Only one top-level choice. */
182 scan = NEXTOPER(scan);
183
184 first = scan;
185 while ((OP(first) > OPEN && OP(first) < CLOSE) ||
186 (OP(first) == BRANCH && OP(regnext(first)) != BRANCH) ||
187 (OP(first) == PLUS) )
188 first = NEXTOPER(first);
189
190 /* Starting-point info. */
191 if (OP(first) == EXACTLY) {
192 r->regstart =
193 str_make(OPERAND(first)+1,*OPERAND(first));
194 if (r->regstart->str_cur > !(sawstudy|fold))
195 fbmcompile(r->regstart,fold);
196 }
197 else if ((exp = index(simple,OP(first))) && exp > simple)
198 r->regstclass = first;
199 else if (OP(first) == BOUND || OP(first) == NBOUND)
200 r->regstclass = first;
201 else if (OP(first) == BOL)
202 r->reganch++;
203
204#ifdef DEBUGGING
205 if (debug & 512)
206 fprintf(stderr,"first %d next %d offset %d\n",
207 OP(first), OP(NEXTOPER(first)), first - scan);
208#endif
209 /*
210 * If there's something expensive in the r.e., find the
211 * longest literal string that must appear and make it the
212 * regmust. Resolve ties in favor of later strings, since
213 * the regstart check works with the beginning of the r.e.
214 * and avoiding duplication strengthens checking. Not a
215 * strong reason, but sufficient in the absence of others.
216 * [Now we resolve ties in favor of the earlier string if
217 * it happens that curback has been invalidated, since the
218 * earlier string may buy us something the later one won't.]
219 */
220 longest = str_make("",0);
221 len = 0;
222 curback = 0;
223 back = 0;
224 while (scan != NULL) {
225 if (OP(scan) == BRANCH) {
226 if (OP(regnext(scan)) == BRANCH) {
227 curback = -30000;
228 while (OP(scan) == BRANCH)
229 scan = regnext(scan);
230 }
231 else /* single branch is ok */
232 scan = NEXTOPER(scan);
233 }
234 if (OP(scan) == EXACTLY) {
235 first = scan;
236 while (OP(regnext(scan)) >= CLOSE)
237 scan = regnext(scan);
238 if (curback - back == len) {
239 str_ncat(longest, OPERAND(first)+1,
240 *OPERAND(first));
241 len += *OPERAND(first);
242 curback += *OPERAND(first);
243 first = regnext(scan);
244 }
245 else if (*OPERAND(first) >= len + (curback >= 0)) {
246 len = *OPERAND(first);
247 str_nset(longest, OPERAND(first)+1,len);
248 back = curback;
249 curback += len;
250 first = regnext(scan);
251 }
252 else
253 curback += *OPERAND(first);
254 }
255 else if (index(varies,OP(scan)))
256 curback = -30000;
257 else if (index(simple,OP(scan)))
258 curback++;
259 scan = regnext(scan);
260 }
261 if (len) {
262 r->regmust = longest;
263 if (back < 0)
264 back = -1;
265 r->regback = back;
266 if (len > !(sawstudy||fold||OP(first)==EOL))
267 fbmcompile(r->regmust,fold);
268 r->regmust->str_u.str_useful = 100;
269 if (OP(first) == EOL) /* is match anchored to EOL? */
270 r->regmust->str_pok |= SP_TAIL;
271 }
272 else
273 str_free(longest);
274 }
275
276 r->do_folding = fold;
277 r->nparens = regnpar - 1;
278#ifdef DEBUGGING
279 if (debug & 512)
280 regdump(r);
281#endif
282 return(r);
283}
284
285/*
286 - reg - regular expression, i.e. main body or parenthesized thing
287 *
288 * Caller must absorb opening parenthesis.
289 *
290 * Combining parenthesis handling with the base level of regular expression
291 * is a trifle forced, but the need to tie the tails of the branches to what
292 * follows makes it hard to avoid.
293 */
294static char *
295reg(paren, flagp)
296int paren; /* Parenthesized? */
297int *flagp;
298{
299 register char *ret;
300 register char *br;
301 register char *ender;
302 register int parno;
303 int flags;
304
305 *flagp = HASWIDTH; /* Tentatively. */
306
307 /* Make an OPEN node, if parenthesized. */
308 if (paren) {
309 if (regnpar >= NSUBEXP)
310 FAIL("too many () in regexp");
311 parno = regnpar;
312 regnpar++;
313 ret = regnode(OPEN+parno);
314 } else
315 ret = NULL;
316
317 /* Pick up the branches, linking them together. */
318 br = regbranch(&flags);
319 if (br == NULL)
320 return(NULL);
321 if (ret != NULL)
322 regtail(ret, br); /* OPEN -> first. */
323 else
324 ret = br;
325 if (!(flags&HASWIDTH))
326 *flagp &= ~HASWIDTH;
327 *flagp |= flags&SPSTART;
328 while (*regparse == '|') {
329 regparse++;
330 br = regbranch(&flags);
331 if (br == NULL)
332 return(NULL);
333 regtail(ret, br); /* BRANCH -> BRANCH. */
334 if (!(flags&HASWIDTH))
335 *flagp &= ~HASWIDTH;
336 *flagp |= flags&SPSTART;
337 }
338
339 /* Make a closing node, and hook it on the end. */
340 ender = regnode((paren) ? CLOSE+parno : END);
341 regtail(ret, ender);
342
343 /* Hook the tails of the branches to the closing node. */
344 for (br = ret; br != NULL; br = regnext(br))
345 regoptail(br, ender);
346
347 /* Check for proper termination. */
348 if (paren && *regparse++ != ')') {
349 FAIL("unmatched () in regexp");
350 } else if (!paren && regparse < regxend) {
351 if (*regparse == ')') {
352 FAIL("unmatched () in regexp");
353 } else
354 FAIL("junk on end of regexp"); /* "Can't happen". */
355 /* NOTREACHED */
356 }
357
358 return(ret);
359}
360
361/*
362 - regbranch - one alternative of an | operator
363 *
364 * Implements the concatenation operator.
365 */
366static char *
367regbranch(flagp)
368int *flagp;
369{
370 register char *ret;
371 register char *chain;
372 register char *latest;
373 int flags;
374
375 *flagp = WORST; /* Tentatively. */
376
377 ret = regnode(BRANCH);
378 chain = NULL;
379 while (regparse < regxend && *regparse != '|' && *regparse != ')') {
380 latest = regpiece(&flags);
381 if (latest == NULL)
382 return(NULL);
383 *flagp |= flags&HASWIDTH;
384 if (chain == NULL) /* First piece. */
385 *flagp |= flags&SPSTART;
386 else
387 regtail(chain, latest);
388 chain = latest;
389 }
390 if (chain == NULL) /* Loop ran zero times. */
391 (void) regnode(NOTHING);
392
393 return(ret);
394}
395
396/*
397 - regpiece - something followed by possible [*+?]
398 *
399 * Note that the branching code sequences used for ? and the general cases
400 * of * and + are somewhat optimized: they use the same NOTHING node as
401 * both the endmarker for their branch list and the body of the last branch.
402 * It might seem that this node could be dispensed with entirely, but the
403 * endmarker role is not redundant.
404 */
405static char *
406regpiece(flagp)
407int *flagp;
408{
409 register char *ret;
410 register char op;
411 register char *next;
412 int flags;
413 char *origparse = regparse;
414 int orignpar = regnpar;
415 char *max;
416 int iter;
417 char ch;
418
419 ret = regatom(&flags);
420 if (ret == NULL)
421 return(NULL);
422
423 op = *regparse;
424
425 /* Here's a total kludge: if after the atom there's a {\d+,?\d*}
426 * then we decrement the first number by one and reset our
427 * parsing back to the beginning of the same atom. If the first number
428 * is down to 0, decrement the second number instead and fake up
429 * a ? after it. Given the way this compiler doesn't keep track
430 * of offsets on the first pass, this is the only way to replicate
431 * a piece of code. Sigh.
432 */
433 if (op == '{' && regcurly(regparse)) {
434 next = regparse + 1;
435 max = Nullch;
436 while (isdigit(*next) || *next == ',') {
437 if (*next == ',') {
438 if (max)
439 break;
440 else
441 max = next;
442 }
443 next++;
444 }
445 if (*next == '}') { /* got one */
446 regsawbracket++; /* remember we clobbered exp */
447 if (!max)
448 max = next;
449 regparse++;
450 iter = atoi(regparse);
451 if (iter > 0) {
452 ch = *max;
453 sprintf(regparse,"%.*d", max-regparse, iter - 1);
454 *max = ch;
455 if (*max == ',' && atoi(max+1) > 0) {
456 ch = *next;
457 sprintf(max+1,"%.*d", next-(max+1), atoi(max+1) - 1);
458 *next = ch;
459 }
460 if (iter != 1 || (*max == ',' || atoi(max+1))) {
461 regparse = origparse; /* back up input pointer */
462 regnpar = orignpar; /* don't make more parens */
463 }
464 else {
465 regparse = next;
466 goto nest_check;
467 }
468 *flagp = flags;
469 return ret;
470 }
471 if (*max == ',') {
472 max++;
473 iter = atoi(max);
474 if (max == next) { /* any number more? */
475 regparse = next;
476 op = '*'; /* fake up one with a star */
477 }
478 else if (iter > 0) {
479 op = '?'; /* fake up optional atom */
480 ch = *next;
481 sprintf(max,"%.*d", next-max, iter - 1);
482 *next = ch;
483 if (iter == 1)
484 regparse = next;
485 else {
486 regparse = origparse - 1; /* offset ++ below */
487 regnpar = orignpar;
488 }
489 }
490 else
491 fatal("Can't do {n,0}");
492 }
493 else
494 fatal("Can't do {0}");
495 }
496 }
497
498 if (!ISMULT1(op)) {
499 *flagp = flags;
500 return(ret);
501 }
502
503 if (!(flags&HASWIDTH) && op != '?')
504 FAIL("regexp *+ operand could be empty");
505 *flagp = (op != '+') ? (WORST|SPSTART) : (WORST|HASWIDTH);
506
507 if (op == '*' && (flags&SIMPLE))
508 reginsert(STAR, ret);
509 else if (op == '*') {
510 /* Emit x* as (x&|), where & means "self". */
511 reginsert(BRANCH, ret); /* Either x */
512 regoptail(ret, regnode(BACK)); /* and loop */
513 regoptail(ret, ret); /* back */
514 regtail(ret, regnode(BRANCH)); /* or */
515 regtail(ret, regnode(NOTHING)); /* null. */
516 } else if (op == '+' && (flags&SIMPLE))
517 reginsert(PLUS, ret);
518 else if (op == '+') {
519 /* Emit x+ as x(&|), where & means "self". */
520 next = regnode(BRANCH); /* Either */
521 regtail(ret, next);
522 regtail(regnode(BACK), ret); /* loop back */
523 regtail(next, regnode(BRANCH)); /* or */
524 regtail(ret, regnode(NOTHING)); /* null. */
525 } else if (op == '?') {
526 /* Emit x? as (x|) */
527 reginsert(BRANCH, ret); /* Either x */
528 regtail(ret, regnode(BRANCH)); /* or */
529 next = regnode(NOTHING); /* null. */
530 regtail(ret, next);
531 regoptail(ret, next);
532 }
533 nest_check:
534 regparse++;
535 if (ISMULT2(regparse))
536 FAIL("nested *?+ in regexp");
537
538 return(ret);
539}
540
541/*
542 - regatom - the lowest level
543 *
544 * Optimization: gobbles an entire sequence of ordinary characters so that
545 * it can turn them into a single node, which is smaller to store and
546 * faster to run. Backslashed characters are exceptions, each becoming a
547 * separate node; the code is simpler that way and it's not worth fixing.
548 *
549 * [Yes, it is worth fixing, some scripts can run twice the speed.]
550 */
551static char *
552regatom(flagp)
553int *flagp;
554{
555 register char *ret;
556 int flags;
557
558 *flagp = WORST; /* Tentatively. */
559
560 switch (*regparse++) {
561 case '^':
562 ret = regnode(BOL);
563 break;
564 case '$':
565 ret = regnode(EOL);
566 break;
567 case '.':
568 ret = regnode(ANY);
569 *flagp |= HASWIDTH|SIMPLE;
570 break;
571 case '[':
572 ret = regclass();
573 *flagp |= HASWIDTH|SIMPLE;
574 break;
575 case '(':
576 ret = reg(1, &flags);
577 if (ret == NULL)
578 return(NULL);
579 *flagp |= flags&(HASWIDTH|SPSTART);
580 break;
581 case '|':
582 case ')':
583 FAIL("internal urp in regexp"); /* Supposed to be caught earlier. */
584 break;
585 case '?':
586 case '+':
587 case '*':
588 FAIL("?+* follows nothing in regexp");
589 break;
590 case '\\':
591 switch (*regparse) {
592 case 'w':
593 ret = regnode(ALNUM);
594 *flagp |= HASWIDTH|SIMPLE;
595 regparse++;
596 break;
597 case 'W':
598 ret = regnode(NALNUM);
599 *flagp |= HASWIDTH|SIMPLE;
600 regparse++;
601 break;
602 case 'b':
603 ret = regnode(BOUND);
604 *flagp |= SIMPLE;
605 regparse++;
606 break;
607 case 'B':
608 ret = regnode(NBOUND);
609 *flagp |= SIMPLE;
610 regparse++;
611 break;
612 case 's':
613 ret = regnode(SPACE);
614 *flagp |= HASWIDTH|SIMPLE;
615 regparse++;
616 break;
617 case 'S':
618 ret = regnode(NSPACE);
619 *flagp |= HASWIDTH|SIMPLE;
620 regparse++;
621 break;
622 case 'd':
623 ret = regnode(DIGIT);
624 *flagp |= HASWIDTH|SIMPLE;
625 regparse++;
626 break;
627 case 'D':
628 ret = regnode(NDIGIT);
629 *flagp |= HASWIDTH|SIMPLE;
630 regparse++;
631 break;
632 case 'n':
633 case 'r':
634 case 't':
635 case 'f':
636 goto defchar;
637 case '0': case '1': case '2': case '3': case '4':
638 case '5': case '6': case '7': case '8': case '9':
639 if (isdigit(regparse[1]))
640 goto defchar;
641 else {
642 ret = regnode(REF + *regparse++ - '0');
643 *flagp |= SIMPLE;
644 }
645 break;
646 case '\0':
647 if (regparse >= regxend)
648 FAIL("trailing \\ in regexp");
649 /* FALL THROUGH */
650 default:
651 goto defchar;
652 }
653 break;
654 default: {
655 register int len;
656 register char ender;
657 register char *p;
658 char *oldp;
659 int foo;
660
661 defchar:
662 ret = regnode(EXACTLY);
663 regc(0); /* save spot for len */
664 for (len=0, p=regparse-1;
665 len < 127 && p < regxend;
666 len++)
667 {
668 oldp = p;
669 switch (*p) {
670 case '^':
671 case '$':
672 case '.':
673 case '[':
674 case '(':
675 case ')':
676 case '|':
677 goto loopdone;
678 case '\\':
679 switch (*++p) {
680 case 'w':
681 case 'W':
682 case 'b':
683 case 'B':
684 case 's':
685 case 'S':
686 case 'd':
687 case 'D':
688 --p;
689 goto loopdone;
690 case 'n':
691 ender = '\n';
692 p++;
693 break;
694 case 'r':
695 ender = '\r';
696 p++;
697 break;
698 case 't':
699 ender = '\t';
700 p++;
701 break;
702 case 'f':
703 ender = '\f';
704 p++;
705 break;
706 case '0': case '1': case '2': case '3':case '4':
707 case '5': case '6': case '7': case '8':case '9':
708 if (isdigit(p[1])) {
709 foo = *p++ - '0';
710 foo <<= 3;
711 foo += *p - '0';
712 if (isdigit(p[1]))
713 foo = (foo<<3) + *++p - '0';
714 ender = foo;
715 p++;
716 }
717 else {
718 --p;
719 goto loopdone;
720 }
721 break;
722 case '\0':
723 if (p >= regxend)
724 FAIL("trailing \\ in regexp");
725 /* FALL THROUGH */
726 default:
727 ender = *p++;
728 break;
729 }
730 break;
731 default:
732 ender = *p++;
733 break;
734 }
735 if (regfold && isupper(ender))
736 ender = tolower(ender);
737 if (ISMULT2(p)) { /* Back off on ?+*. */
738 if (len)
739 p = oldp;
740 else {
741 len++;
742 regc(ender);
743 }
744 break;
745 }
746 regc(ender);
747 }
748 loopdone:
749 regparse = p;
750 if (len <= 0)
751 FAIL("internal disaster in regexp");
752 *flagp |= HASWIDTH;
753 if (len == 1)
754 *flagp |= SIMPLE;
755 if (regcode != &regdummy)
756 *OPERAND(ret) = len;
757 regc('\0');
758 }
759 break;
760 }
761
762 return(ret);
763}
764
765static void
766regset(bits,def,c)
767char *bits;
768int def;
769register int c;
770{
771 if (regcode == &regdummy)
772 return;
773 if (def)
774 bits[c >> 3] &= ~(1 << (c & 7));
775 else
776 bits[c >> 3] |= (1 << (c & 7));
777}
778
779static char *
780regclass()
781{
782 register char *bits;
783 register int class;
784 register int lastclass;
785 register int range = 0;
786 register char *ret;
787 register int def;
788
789 if (*regparse == '^') { /* Complement of range. */
790 ret = regnode(ANYBUT);
791 regparse++;
792 def = 0;
793 } else {
794 ret = regnode(ANYOF);
795 def = 255;
796 }
797 bits = regcode;
798 for (class = 0; class < 32; class++)
799 regc(def);
800 if (*regparse == ']' || *regparse == '-')
801 regset(bits,def,lastclass = *regparse++);
802 while (regparse < regxend && *regparse != ']') {
803 class = UCHARAT(regparse++);
804 if (class == '\\') {
805 class = UCHARAT(regparse++);
806 switch (class) {
807 case 'w':
808 for (class = 'a'; class <= 'z'; class++)
809 regset(bits,def,class);
810 for (class = 'A'; class <= 'Z'; class++)
811 regset(bits,def,class);
812 for (class = '0'; class <= '9'; class++)
813 regset(bits,def,class);
814 regset(bits,def,'_');
815 lastclass = 1234;
816 continue;
817 case 's':
818 regset(bits,def,' ');
819 regset(bits,def,'\t');
820 regset(bits,def,'\r');
821 regset(bits,def,'\f');
822 regset(bits,def,'\n');
823 lastclass = 1234;
824 continue;
825 case 'd':
826 for (class = '0'; class <= '9'; class++)
827 regset(bits,def,class);
828 lastclass = 1234;
829 continue;
830 case 'n':
831 class = '\n';
832 break;
833 case 'r':
834 class = '\r';
835 break;
836 case 't':
837 class = '\t';
838 break;
839 case 'f':
840 class = '\f';
841 break;
842 case 'b':
843 class = '\b';
844 break;
845 case '0': case '1': case '2': case '3': case '4':
846 case '5': case '6': case '7': case '8': case '9':
847 class -= '0';
848 if (isdigit(*regparse)) {
849 class <<= 3;
850 class += *regparse++ - '0';
851 }
852 if (isdigit(*regparse)) {
853 class <<= 3;
854 class += *regparse++ - '0';
855 }
856 break;
857 }
858 }
859 if (!range && class == '-' && regparse < regxend &&
860 *regparse != ']') {
861 range = 1;
862 continue;
863 }
864 if (range) {
865 if (lastclass > class)
866 FAIL("invalid [] range in regexp");
867 }
868 else
869 lastclass = class - 1;
870 range = 0;
871 for (lastclass++; lastclass <= class; lastclass++) {
872 regset(bits,def,lastclass);
873 if (regfold && isupper(lastclass))
874 regset(bits,def,tolower(lastclass));
875 }
876 lastclass = class;
877 }
878 if (*regparse != ']')
879 FAIL("unmatched [] in regexp");
a687059c
LW
880 regparse++;
881 return ret;
882}
883
884/*
885 - regnode - emit a node
886 */
887static char * /* Location. */
888regnode(op)
889char op;
890{
891 register char *ret;
892 register char *ptr;
893
894 ret = regcode;
895 if (ret == &regdummy) {
896#ifdef REGALIGN
897 if (!(regsize & 1))
898 regsize++;
899#endif
900 regsize += 3;
901 return(ret);
902 }
903
904#ifdef REGALIGN
905#ifndef lint
906 if (!((long)ret & 1))
907 *ret++ = 127;
908#endif
909#endif
910 ptr = ret;
911 *ptr++ = op;
912 *ptr++ = '\0'; /* Null "next" pointer. */
913 *ptr++ = '\0';
914 regcode = ptr;
915
916 return(ret);
917}
918
919/*
920 - regc - emit (if appropriate) a byte of code
921 */
922static void
923regc(b)
924char b;
925{
926 if (regcode != &regdummy)
927 *regcode++ = b;
928 else
929 regsize++;
930}
931
932/*
933 - reginsert - insert an operator in front of already-emitted operand
934 *
935 * Means relocating the operand.
936 */
937static void
938reginsert(op, opnd)
939char op;
940char *opnd;
941{
942 register char *src;
943 register char *dst;
944 register char *place;
945
946 if (regcode == &regdummy) {
947#ifdef REGALIGN
948 regsize += 4;
949#else
950 regsize += 3;
951#endif
952 return;
953 }
954
955 src = regcode;
956#ifdef REGALIGN
957 regcode += 4;
958#else
959 regcode += 3;
960#endif
961 dst = regcode;
962 while (src > opnd)
963 *--dst = *--src;
964
965 place = opnd; /* Op node, where operand used to be. */
966 *place++ = op;
967 *place++ = '\0';
968 *place++ = '\0';
969}
970
971/*
972 - regtail - set the next-pointer at the end of a node chain
973 */
974static void
975regtail(p, val)
976char *p;
977char *val;
978{
979 register char *scan;
980 register char *temp;
981 register int offset;
982
983 if (p == &regdummy)
984 return;
985
986 /* Find last node. */
987 scan = p;
988 for (;;) {
989 temp = regnext(scan);
990 if (temp == NULL)
991 break;
992 scan = temp;
993 }
994
995#ifdef REGALIGN
996 offset = val - scan;
997#ifndef lint
998 *(short*)(scan+1) = offset;
999#else
1000 offset = offset;
1001#endif
1002#else
1003 if (OP(scan) == BACK)
1004 offset = scan - val;
1005 else
1006 offset = val - scan;
1007 *(scan+1) = (offset>>8)&0377;
1008 *(scan+2) = offset&0377;
1009#endif
1010}
1011
1012/*
1013 - regoptail - regtail on operand of first argument; nop if operandless
1014 */
1015static void
1016regoptail(p, val)
1017char *p;
1018char *val;
1019{
1020 /* "Operandless" and "op != BRANCH" are synonymous in practice. */
1021 if (p == NULL || p == &regdummy || OP(p) != BRANCH)
1022 return;
1023 regtail(NEXTOPER(p), val);
1024}
1025
1026/*
1027 - regcurly - a little FSA that accepts {\d+,?\d*}
1028 */
1029STATIC int
1030regcurly(s)
1031register char *s;
1032{
1033 if (*s++ != '{')
1034 return FALSE;
1035 if (!isdigit(*s))
1036 return FALSE;
1037 while (isdigit(*s))
1038 s++;
1039 if (*s == ',')
1040 s++;
1041 while (isdigit(*s))
1042 s++;
1043 if (*s != '}')
1044 return FALSE;
1045 return TRUE;
1046}
1047
1048#ifdef DEBUGGING
1049
1050/*
1051 - regdump - dump a regexp onto stderr in vaguely comprehensible form
1052 */
1053void
1054regdump(r)
1055regexp *r;
1056{
1057 register char *s;
1058 register char op = EXACTLY; /* Arbitrary non-END op. */
1059 register char *next;
1060 extern char *index();
1061
1062
1063 s = r->program + 1;
1064 while (op != END) { /* While that wasn't END last time... */
1065#ifdef REGALIGN
1066 if (!((long)s & 1))
1067 s++;
1068#endif
1069 op = OP(s);
1070 fprintf(stderr,"%2d%s", s-r->program, regprop(s)); /* Where, what. */
1071 next = regnext(s);
1072 if (next == NULL) /* Next ptr. */
1073 fprintf(stderr,"(0)");
1074 else
1075 fprintf(stderr,"(%d)", (s-r->program)+(next-s));
1076 s += 3;
1077 if (op == ANYOF || op == ANYBUT) {
1078 s += 32;
1079 }
1080 if (op == EXACTLY) {
1081 /* Literal string, where present. */
1082 s++;
1083 while (*s != '\0') {
1084 (void)putchar(*s);
1085 s++;
1086 }
1087 s++;
1088 }
1089 (void)putchar('\n');
1090 }
1091
1092 /* Header fields of interest. */
1093 if (r->regstart)
1094 fprintf(stderr,"start `%s' ", r->regstart->str_ptr);
1095 if (r->regstclass)
1096 fprintf(stderr,"stclass `%s' ", regprop(r->regstclass));
1097 if (r->reganch)
1098 fprintf(stderr,"anchored ");
1099 if (r->regmust != NULL)
1100 fprintf(stderr,"must have \"%s\" back %d ", r->regmust->str_ptr,
1101 r->regback);
1102 fprintf(stderr,"\n");
1103}
1104
1105/*
1106 - regprop - printable representation of opcode
1107 */
1108char *
1109regprop(op)
1110char *op;
1111{
1112 register char *p;
1113
1114 (void) strcpy(buf, ":");
1115
1116 switch (OP(op)) {
1117 case BOL:
1118 p = "BOL";
1119 break;
1120 case EOL:
1121 p = "EOL";
1122 break;
1123 case ANY:
1124 p = "ANY";
1125 break;
1126 case ANYOF:
1127 p = "ANYOF";
1128 break;
1129 case ANYBUT:
1130 p = "ANYBUT";
1131 break;
1132 case BRANCH:
1133 p = "BRANCH";
1134 break;
1135 case EXACTLY:
1136 p = "EXACTLY";
1137 break;
1138 case NOTHING:
1139 p = "NOTHING";
1140 break;
1141 case BACK:
1142 p = "BACK";
1143 break;
1144 case END:
1145 p = "END";
1146 break;
1147 case ALNUM:
1148 p = "ALNUM";
1149 break;
1150 case NALNUM:
1151 p = "NALNUM";
1152 break;
1153 case BOUND:
1154 p = "BOUND";
1155 break;
1156 case NBOUND:
1157 p = "NBOUND";
1158 break;
1159 case SPACE:
1160 p = "SPACE";
1161 break;
1162 case NSPACE:
1163 p = "NSPACE";
1164 break;
1165 case DIGIT:
1166 p = "DIGIT";
1167 break;
1168 case NDIGIT:
1169 p = "NDIGIT";
1170 break;
1171 case REF:
1172 case REF+1:
1173 case REF+2:
1174 case REF+3:
1175 case REF+4:
1176 case REF+5:
1177 case REF+6:
1178 case REF+7:
1179 case REF+8:
1180 case REF+9:
1181 (void)sprintf(buf+strlen(buf), "REF%d", OP(op)-REF);
1182 p = NULL;
1183 break;
1184 case OPEN+1:
1185 case OPEN+2:
1186 case OPEN+3:
1187 case OPEN+4:
1188 case OPEN+5:
1189 case OPEN+6:
1190 case OPEN+7:
1191 case OPEN+8:
1192 case OPEN+9:
1193 (void)sprintf(buf+strlen(buf), "OPEN%d", OP(op)-OPEN);
1194 p = NULL;
1195 break;
1196 case CLOSE+1:
1197 case CLOSE+2:
1198 case CLOSE+3:
1199 case CLOSE+4:
1200 case CLOSE+5:
1201 case CLOSE+6:
1202 case CLOSE+7:
1203 case CLOSE+8:
1204 case CLOSE+9:
1205 (void)sprintf(buf+strlen(buf), "CLOSE%d", OP(op)-CLOSE);
1206 p = NULL;
1207 break;
1208 case STAR:
1209 p = "STAR";
1210 break;
1211 case PLUS:
1212 p = "PLUS";
1213 break;
1214 default:
1215 FAIL("corrupted regexp opcode");
1216 }
1217 if (p != NULL)
1218 (void) strcat(buf, p);
1219 return(buf);
1220}
1221#endif /* DEBUGGING */
1222
1223regfree(r)
1224struct regexp *r;
1225{
1226 if (r->precomp)
1227 Safefree(r->precomp);
1228 if (r->subbase)
1229 Safefree(r->subbase);
1230 if (r->regmust)
1231 str_free(r->regmust);
1232 if (r->regstart)
1233 str_free(r->regstart);
1234 Safefree(r);
1235}