This is a live mirror of the Perl 5 development currently hosted at https://github.com/perl/perl5
perl 3.0 patch #4 Patch #2 continued
[perl5.git] / regcomp.c
1 /* NOTE: this is derived from Henry Spencer's regexp code, and should not
2  * confused with the original package (see point 3 below).  Thanks, Henry!
3  */
4
5 /* Additional note: this code is very heavily munged from Henry's version
6  * in places.  In some spots I've traded clarity for efficiency, so don't
7  * blame Henry for some of the lack of readability.
8  */
9
10 /* $Header: regcomp.c,v 3.0.1.1 89/11/11 04:51:04 lwall Locked $
11  *
12  * $Log:        regcomp.c,v $
13  * Revision 3.0.1.1  89/11/11  04:51:04  lwall
14  * patch2: /[\000]/ didn't work
15  * 
16  * Revision 3.0  89/10/18  15:22:29  lwall
17  * 3.0 baseline
18  * 
19  */
20
21 /*
22  * regcomp and regexec -- regsub and regerror are not used in perl
23  *
24  *      Copyright (c) 1986 by University of Toronto.
25  *      Written by Henry Spencer.  Not derived from licensed software.
26  *
27  *      Permission is granted to anyone to use this software for any
28  *      purpose on any computer system, and to redistribute it freely,
29  *      subject to the following restrictions:
30  *
31  *      1. The author is not responsible for the consequences of use of
32  *              this software, no matter how awful, even if they arise
33  *              from defects in it.
34  *
35  *      2. The origin of this software must not be misrepresented, either
36  *              by explicit claim or by omission.
37  *
38  *      3. Altered versions must be plainly marked as such, and must not
39  *              be misrepresented as being the original software.
40  *
41  *
42  ****    Alterations to Henry's code are...
43  ****
44  ****    Copyright (c) 1989, Larry Wall
45  ****
46  ****    You may distribute under the terms of the GNU General Public License
47  ****    as specified in the README file that comes with the perl 3.0 kit.
48  *
49  * Beware that some of this code is subtly aware of the way operator
50  * precedence is structured in regular expressions.  Serious changes in
51  * regular-expression syntax might require a total rethink.
52  */
53 #include "EXTERN.h"
54 #include "perl.h"
55 #include "INTERN.h"
56 #include "regcomp.h"
57
58 #ifndef STATIC
59 #define STATIC  static
60 #endif
61
62 #define ISMULT1(c)      ((c) == '*' || (c) == '+' || (c) == '?')
63 #define ISMULT2(s)      ((*s) == '*' || (*s) == '+' || (*s) == '?' || \
64         ((*s) == '{' && regcurly(s)))
65 #define META    "^$.[()|?+*\\"
66
67 /*
68  * Flags to be passed up and down.
69  */
70 #define HASWIDTH        01      /* Known never to match null string. */
71 #define SIMPLE          02      /* Simple enough to be STAR/PLUS operand. */
72 #define SPSTART         04      /* Starts with * or +. */
73 #define WORST           0       /* Worst case. */
74
75 /*
76  * Global work variables for regcomp().
77  */
78 static char *regprecomp;                /* uncompiled string. */
79 static char *regparse;          /* Input-scan pointer. */
80 static char *regxend;           /* End of input for compile */
81 static int regnpar;             /* () count. */
82 static char *regcode;           /* Code-emit pointer; &regdummy = don't. */
83 static long regsize;            /* Code size. */
84 static int regfold;
85 static int regsawbracket;       /* Did we do {d,d} trick? */
86
87 /*
88  * Forward declarations for regcomp()'s friends.
89  */
90 STATIC int regcurly();
91 STATIC char *reg();
92 STATIC char *regbranch();
93 STATIC char *regpiece();
94 STATIC char *regatom();
95 STATIC char *regclass();
96 STATIC char *regnode();
97 STATIC void regc();
98 STATIC void reginsert();
99 STATIC void regtail();
100 STATIC void regoptail();
101
102 /*
103  - regcomp - compile a regular expression into internal code
104  *
105  * We can't allocate space until we know how big the compiled form will be,
106  * but we can't compile it (and thus know how big it is) until we've got a
107  * place to put the code.  So we cheat:  we compile it twice, once with code
108  * generation turned off and size counting turned on, and once "for real".
109  * This also means that we don't allocate space until we are sure that the
110  * thing really will compile successfully, and we never have to move the
111  * code and thus invalidate pointers into it.  (Note that it has to be in
112  * one piece because free() must be able to free it all.) [NB: not true in perl]
113  *
114  * Beware that the optimization-preparation code in here knows about some
115  * of the structure of the compiled regexp.  [I'll say.]
116  */
117 regexp *
118 regcomp(exp,xend,fold,rare)
119 char *exp;
120 char *xend;
121 int fold;
122 int rare;
123 {
124         register regexp *r;
125         register char *scan;
126         register STR *longest;
127         register int len;
128         register char *first;
129         int flags;
130         int back;
131         int curback;
132         extern char *safemalloc();
133         extern char *savestr();
134
135         if (exp == NULL)
136                 fatal("NULL regexp argument");
137
138         /* First pass: determine size, legality. */
139         regfold = fold;
140         regparse = exp;
141         regxend = xend;
142         regprecomp = nsavestr(exp,xend-exp);
143         regsawbracket = 0;
144         regnpar = 1;
145         regsize = 0L;
146         regcode = &regdummy;
147         regc(MAGIC);
148         if (reg(0, &flags) == NULL) {
149                 Safefree(regprecomp);
150                 return(NULL);
151         }
152
153         /* Small enough for pointer-storage convention? */
154         if (regsize >= 32767L)          /* Probably could be 65535L. */
155                 FAIL("regexp too big");
156
157         /* Allocate space. */
158         Newc(1001, r, sizeof(regexp) + (unsigned)regsize, char, regexp);
159         if (r == NULL)
160                 FAIL("regexp out of space");
161
162         /* Second pass: emit code. */
163         if (regsawbracket)
164             bcopy(regprecomp,exp,xend-exp);
165         r->precomp = regprecomp;
166         r->subbase = NULL;
167         regparse = exp;
168         regnpar = 1;
169         regcode = r->program;
170         regc(MAGIC);
171         if (reg(0, &flags) == NULL)
172                 return(NULL);
173
174         /* Dig out information for optimizations. */
175         r->regstart = Nullstr;  /* Worst-case defaults. */
176         r->reganch = 0;
177         r->regmust = Nullstr;
178         r->regback = -1;
179         r->regstclass = Nullch;
180         scan = r->program+1;                    /* First BRANCH. */
181         if (OP(regnext(scan)) == END) {/* Only one top-level choice. */
182                 scan = NEXTOPER(scan);
183
184                 first = scan;
185                 while ((OP(first) > OPEN && OP(first) < CLOSE) ||
186                     (OP(first) == BRANCH && OP(regnext(first)) != BRANCH) ||
187                     (OP(first) == PLUS) )
188                         first = NEXTOPER(first);
189
190                 /* Starting-point info. */
191                 if (OP(first) == EXACTLY) {
192                         r->regstart =
193                             str_make(OPERAND(first)+1,*OPERAND(first));
194                         if (r->regstart->str_cur > !(sawstudy|fold))
195                                 fbmcompile(r->regstart,fold);
196                 }
197                 else if ((exp = index(simple,OP(first))) && exp > simple)
198                         r->regstclass = first;
199                 else if (OP(first) == BOUND || OP(first) == NBOUND)
200                         r->regstclass = first;
201                 else if (OP(first) == BOL)
202                         r->reganch++;
203
204 #ifdef DEBUGGING
205                 if (debug & 512)
206                     fprintf(stderr,"first %d next %d offset %d\n",
207                       OP(first), OP(NEXTOPER(first)), first - scan);
208 #endif
209                 /*
210                  * If there's something expensive in the r.e., find the
211                  * longest literal string that must appear and make it the
212                  * regmust.  Resolve ties in favor of later strings, since
213                  * the regstart check works with the beginning of the r.e.
214                  * and avoiding duplication strengthens checking.  Not a
215                  * strong reason, but sufficient in the absence of others.
216                  * [Now we resolve ties in favor of the earlier string if
217                  * it happens that curback has been invalidated, since the
218                  * earlier string may buy us something the later one won't.]
219                  */
220                 longest = str_make("",0);
221                 len = 0;
222                 curback = 0;
223                 back = 0;
224                 while (scan != NULL) {
225                         if (OP(scan) == BRANCH) {
226                             if (OP(regnext(scan)) == BRANCH) {
227                                 curback = -30000;
228                                 while (OP(scan) == BRANCH)
229                                     scan = regnext(scan);
230                             }
231                             else        /* single branch is ok */
232                                 scan = NEXTOPER(scan);
233                         }
234                         if (OP(scan) == EXACTLY) {
235                             first = scan;
236                             while (OP(regnext(scan)) >= CLOSE)
237                                 scan = regnext(scan);
238                             if (curback - back == len) {
239                                 str_ncat(longest, OPERAND(first)+1,
240                                     *OPERAND(first));
241                                 len += *OPERAND(first);
242                                 curback += *OPERAND(first);
243                                 first = regnext(scan);
244                             }
245                             else if (*OPERAND(first) >= len + (curback >= 0)) {
246                                 len = *OPERAND(first);
247                                 str_nset(longest, OPERAND(first)+1,len);
248                                 back = curback;
249                                 curback += len;
250                                 first = regnext(scan);
251                             }
252                             else
253                                 curback += *OPERAND(first);
254                         }
255                         else if (index(varies,OP(scan)))
256                                 curback = -30000;
257                         else if (index(simple,OP(scan)))
258                                 curback++;
259                         scan = regnext(scan);
260                 }
261                 if (len) {
262                         r->regmust = longest;
263                         if (back < 0)
264                                 back = -1;
265                         r->regback = back;
266                         if (len > !(sawstudy||fold||OP(first)==EOL))
267                                 fbmcompile(r->regmust,fold);
268                         r->regmust->str_u.str_useful = 100;
269                         if (OP(first) == EOL) /* is match anchored to EOL? */
270                             r->regmust->str_pok |= SP_TAIL;
271                 }
272                 else
273                         str_free(longest);
274         }
275
276         r->do_folding = fold;
277         r->nparens = regnpar - 1;
278 #ifdef DEBUGGING
279         if (debug & 512)
280                 regdump(r);
281 #endif
282         return(r);
283 }
284
285 /*
286  - reg - regular expression, i.e. main body or parenthesized thing
287  *
288  * Caller must absorb opening parenthesis.
289  *
290  * Combining parenthesis handling with the base level of regular expression
291  * is a trifle forced, but the need to tie the tails of the branches to what
292  * follows makes it hard to avoid.
293  */
294 static char *
295 reg(paren, flagp)
296 int paren;                      /* Parenthesized? */
297 int *flagp;
298 {
299         register char *ret;
300         register char *br;
301         register char *ender;
302         register int parno;
303         int flags;
304
305         *flagp = HASWIDTH;      /* Tentatively. */
306
307         /* Make an OPEN node, if parenthesized. */
308         if (paren) {
309                 if (regnpar >= NSUBEXP)
310                         FAIL("too many () in regexp");
311                 parno = regnpar;
312                 regnpar++;
313                 ret = regnode(OPEN+parno);
314         } else
315                 ret = NULL;
316
317         /* Pick up the branches, linking them together. */
318         br = regbranch(&flags);
319         if (br == NULL)
320                 return(NULL);
321         if (ret != NULL)
322                 regtail(ret, br);       /* OPEN -> first. */
323         else
324                 ret = br;
325         if (!(flags&HASWIDTH))
326                 *flagp &= ~HASWIDTH;
327         *flagp |= flags&SPSTART;
328         while (*regparse == '|') {
329                 regparse++;
330                 br = regbranch(&flags);
331                 if (br == NULL)
332                         return(NULL);
333                 regtail(ret, br);       /* BRANCH -> BRANCH. */
334                 if (!(flags&HASWIDTH))
335                         *flagp &= ~HASWIDTH;
336                 *flagp |= flags&SPSTART;
337         }
338
339         /* Make a closing node, and hook it on the end. */
340         ender = regnode((paren) ? CLOSE+parno : END);   
341         regtail(ret, ender);
342
343         /* Hook the tails of the branches to the closing node. */
344         for (br = ret; br != NULL; br = regnext(br))
345                 regoptail(br, ender);
346
347         /* Check for proper termination. */
348         if (paren && *regparse++ != ')') {
349                 FAIL("unmatched () in regexp");
350         } else if (!paren && regparse < regxend) {
351                 if (*regparse == ')') {
352                         FAIL("unmatched () in regexp");
353                 } else
354                         FAIL("junk on end of regexp");  /* "Can't happen". */
355                 /* NOTREACHED */
356         }
357
358         return(ret);
359 }
360
361 /*
362  - regbranch - one alternative of an | operator
363  *
364  * Implements the concatenation operator.
365  */
366 static char *
367 regbranch(flagp)
368 int *flagp;
369 {
370         register char *ret;
371         register char *chain;
372         register char *latest;
373         int flags;
374
375         *flagp = WORST;         /* Tentatively. */
376
377         ret = regnode(BRANCH);
378         chain = NULL;
379         while (regparse < regxend && *regparse != '|' && *regparse != ')') {
380                 latest = regpiece(&flags);
381                 if (latest == NULL)
382                         return(NULL);
383                 *flagp |= flags&HASWIDTH;
384                 if (chain == NULL)      /* First piece. */
385                         *flagp |= flags&SPSTART;
386                 else
387                         regtail(chain, latest);
388                 chain = latest;
389         }
390         if (chain == NULL)      /* Loop ran zero times. */
391                 (void) regnode(NOTHING);
392
393         return(ret);
394 }
395
396 /*
397  - regpiece - something followed by possible [*+?]
398  *
399  * Note that the branching code sequences used for ? and the general cases
400  * of * and + are somewhat optimized:  they use the same NOTHING node as
401  * both the endmarker for their branch list and the body of the last branch.
402  * It might seem that this node could be dispensed with entirely, but the
403  * endmarker role is not redundant.
404  */
405 static char *
406 regpiece(flagp)
407 int *flagp;
408 {
409         register char *ret;
410         register char op;
411         register char *next;
412         int flags;
413         char *origparse = regparse;
414         int orignpar = regnpar;
415         char *max;
416         int iter;
417         char ch;
418
419         ret = regatom(&flags);
420         if (ret == NULL)
421                 return(NULL);
422
423         op = *regparse;
424
425         /* Here's a total kludge: if after the atom there's a {\d+,?\d*}
426          * then we decrement the first number by one and reset our
427          * parsing back to the beginning of the same atom.  If the first number
428          * is down to 0, decrement the second number instead and fake up
429          * a ? after it.  Given the way this compiler doesn't keep track
430          * of offsets on the first pass, this is the only way to replicate
431          * a piece of code.  Sigh.
432          */
433         if (op == '{' && regcurly(regparse)) {
434             next = regparse + 1;
435             max = Nullch;
436             while (isdigit(*next) || *next == ',') {
437                 if (*next == ',') {
438                     if (max)
439                         break;
440                     else
441                         max = next;
442                 }
443                 next++;
444             }
445             if (*next == '}') {         /* got one */
446                 regsawbracket++;        /* remember we clobbered exp */
447                 if (!max)
448                     max = next;
449                 regparse++;
450                 iter = atoi(regparse);
451                 if (iter > 0) {
452                     ch = *max;
453                     sprintf(regparse,"%.*d", max-regparse, iter - 1);
454                     *max = ch;
455                     if (*max == ',' && atoi(max+1) > 0) {
456                         ch = *next;
457                         sprintf(max+1,"%.*d", next-(max+1), atoi(max+1) - 1);
458                         *next = ch;
459                     }
460                     if (iter != 1 || (*max == ',' || atoi(max+1))) {
461                         regparse = origparse;   /* back up input pointer */
462                         regnpar = orignpar;     /* don't make more parens */
463                     }
464                     else {
465                         regparse = next;
466                         goto nest_check;
467                     }
468                     *flagp = flags;
469                     return ret;
470                 }
471                 if (*max == ',') {
472                     max++;
473                     iter = atoi(max);
474                     if (max == next) {          /* any number more? */
475                         regparse = next;
476                         op = '*';               /* fake up one with a star */
477                     }
478                     else if (iter > 0) {
479                         op = '?';               /* fake up optional atom */
480                         ch = *next;
481                         sprintf(max,"%.*d", next-max, iter - 1);
482                         *next = ch;
483                         if (iter == 1)
484                             regparse = next;
485                         else {
486                             regparse = origparse - 1; /* offset ++ below */
487                             regnpar = orignpar;
488                         }
489                     }
490                     else
491                         fatal("Can't do {n,0}");
492                 }
493                 else
494                     fatal("Can't do {0}");
495             }
496         }
497
498         if (!ISMULT1(op)) {
499                 *flagp = flags;
500                 return(ret);
501         }
502
503         if (!(flags&HASWIDTH) && op != '?')
504                 FAIL("regexp *+ operand could be empty");
505         *flagp = (op != '+') ? (WORST|SPSTART) : (WORST|HASWIDTH);
506
507         if (op == '*' && (flags&SIMPLE))
508                 reginsert(STAR, ret);
509         else if (op == '*') {
510                 /* Emit x* as (x&|), where & means "self". */
511                 reginsert(BRANCH, ret);                 /* Either x */
512                 regoptail(ret, regnode(BACK));          /* and loop */
513                 regoptail(ret, ret);                    /* back */
514                 regtail(ret, regnode(BRANCH));          /* or */
515                 regtail(ret, regnode(NOTHING));         /* null. */
516         } else if (op == '+' && (flags&SIMPLE))
517                 reginsert(PLUS, ret);
518         else if (op == '+') {
519                 /* Emit x+ as x(&|), where & means "self". */
520                 next = regnode(BRANCH);                 /* Either */
521                 regtail(ret, next);
522                 regtail(regnode(BACK), ret);            /* loop back */
523                 regtail(next, regnode(BRANCH));         /* or */
524                 regtail(ret, regnode(NOTHING));         /* null. */
525         } else if (op == '?') {
526                 /* Emit x? as (x|) */
527                 reginsert(BRANCH, ret);                 /* Either x */
528                 regtail(ret, regnode(BRANCH));          /* or */
529                 next = regnode(NOTHING);                /* null. */
530                 regtail(ret, next);
531                 regoptail(ret, next);
532         }
533       nest_check:
534         regparse++;
535         if (ISMULT2(regparse))
536                 FAIL("nested *?+ in regexp");
537
538         return(ret);
539 }
540
541 /*
542  - regatom - the lowest level
543  *
544  * Optimization:  gobbles an entire sequence of ordinary characters so that
545  * it can turn them into a single node, which is smaller to store and
546  * faster to run.  Backslashed characters are exceptions, each becoming a
547  * separate node; the code is simpler that way and it's not worth fixing.
548  *
549  * [Yes, it is worth fixing, some scripts can run twice the speed.]
550  */
551 static char *
552 regatom(flagp)
553 int *flagp;
554 {
555         register char *ret;
556         int flags;
557
558         *flagp = WORST;         /* Tentatively. */
559
560         switch (*regparse++) {
561         case '^':
562                 ret = regnode(BOL);
563                 break;
564         case '$':
565                 ret = regnode(EOL);
566                 break;
567         case '.':
568                 ret = regnode(ANY);
569                 *flagp |= HASWIDTH|SIMPLE;
570                 break;
571         case '[':
572                 ret = regclass();
573                 *flagp |= HASWIDTH|SIMPLE;
574                 break;
575         case '(':
576                 ret = reg(1, &flags);
577                 if (ret == NULL)
578                         return(NULL);
579                 *flagp |= flags&(HASWIDTH|SPSTART);
580                 break;
581         case '|':
582         case ')':
583                 FAIL("internal urp in regexp"); /* Supposed to be caught earlier. */
584                 break;
585         case '?':
586         case '+':
587         case '*':
588                 FAIL("?+* follows nothing in regexp");
589                 break;
590         case '\\':
591                 switch (*regparse) {
592                 case 'w':
593                         ret = regnode(ALNUM);
594                         *flagp |= HASWIDTH|SIMPLE;
595                         regparse++;
596                         break;
597                 case 'W':
598                         ret = regnode(NALNUM);
599                         *flagp |= HASWIDTH|SIMPLE;
600                         regparse++;
601                         break;
602                 case 'b':
603                         ret = regnode(BOUND);
604                         *flagp |= SIMPLE;
605                         regparse++;
606                         break;
607                 case 'B':
608                         ret = regnode(NBOUND);
609                         *flagp |= SIMPLE;
610                         regparse++;
611                         break;
612                 case 's':
613                         ret = regnode(SPACE);
614                         *flagp |= HASWIDTH|SIMPLE;
615                         regparse++;
616                         break;
617                 case 'S':
618                         ret = regnode(NSPACE);
619                         *flagp |= HASWIDTH|SIMPLE;
620                         regparse++;
621                         break;
622                 case 'd':
623                         ret = regnode(DIGIT);
624                         *flagp |= HASWIDTH|SIMPLE;
625                         regparse++;
626                         break;
627                 case 'D':
628                         ret = regnode(NDIGIT);
629                         *flagp |= HASWIDTH|SIMPLE;
630                         regparse++;
631                         break;
632                 case 'n':
633                 case 'r':
634                 case 't':
635                 case 'f':
636                         goto defchar;
637                 case '0': case '1': case '2': case '3': case '4':
638                 case '5': case '6': case '7': case '8': case '9':
639                         if (isdigit(regparse[1]))
640                                 goto defchar;
641                         else {
642                                 ret = regnode(REF + *regparse++ - '0');
643                                 *flagp |= SIMPLE;
644                         }
645                         break;
646                 case '\0':
647                         if (regparse >= regxend)
648                             FAIL("trailing \\ in regexp");
649                         /* FALL THROUGH */
650                 default:
651                         goto defchar;
652                 }
653                 break;
654         default: {
655                         register int len;
656                         register char ender;
657                         register char *p;
658                         char *oldp;
659                         int foo;
660
661                     defchar:
662                         ret = regnode(EXACTLY);
663                         regc(0);                /* save spot for len */
664                         for (len=0, p=regparse-1;
665                           len < 127 && p < regxend;
666                           len++)
667                         {
668                             oldp = p;
669                             switch (*p) {
670                             case '^':
671                             case '$':
672                             case '.':
673                             case '[':
674                             case '(':
675                             case ')':
676                             case '|':
677                                 goto loopdone;
678                             case '\\':
679                                 switch (*++p) {
680                                 case 'w':
681                                 case 'W':
682                                 case 'b':
683                                 case 'B':
684                                 case 's':
685                                 case 'S':
686                                 case 'd':
687                                 case 'D':
688                                     --p;
689                                     goto loopdone;
690                                 case 'n':
691                                         ender = '\n';
692                                         p++;
693                                         break;
694                                 case 'r':
695                                         ender = '\r';
696                                         p++;
697                                         break;
698                                 case 't':
699                                         ender = '\t';
700                                         p++;
701                                         break;
702                                 case 'f':
703                                         ender = '\f';
704                                         p++;
705                                         break;
706                                 case '0': case '1': case '2': case '3':case '4':
707                                 case '5': case '6': case '7': case '8':case '9':
708                                     if (isdigit(p[1])) {
709                                         foo = *p++ - '0';
710                                         foo <<= 3;
711                                         foo += *p - '0';
712                                         if (isdigit(p[1]))
713                                             foo = (foo<<3) + *++p - '0';
714                                         ender = foo;
715                                         p++;
716                                     }
717                                     else {
718                                         --p;
719                                         goto loopdone;
720                                     }
721                                     break;
722                                 case '\0':
723                                     if (p >= regxend)
724                                         FAIL("trailing \\ in regexp");
725                                     /* FALL THROUGH */
726                                 default:
727                                     ender = *p++;
728                                     break;
729                                 }
730                                 break;
731                             default:
732                                 ender = *p++;
733                                 break;
734                             }
735                             if (regfold && isupper(ender))
736                                     ender = tolower(ender);
737                             if (ISMULT2(p)) { /* Back off on ?+*. */
738                                 if (len)
739                                     p = oldp;
740                                 else {
741                                     len++;
742                                     regc(ender);
743                                 }
744                                 break;
745                             }
746                             regc(ender);
747                         }
748                     loopdone:
749                         regparse = p;
750                         if (len <= 0)
751                                 FAIL("internal disaster in regexp");
752                         *flagp |= HASWIDTH;
753                         if (len == 1)
754                                 *flagp |= SIMPLE;
755                         if (regcode != &regdummy)
756                             *OPERAND(ret) = len;
757                         regc('\0');
758                 }
759                 break;
760         }
761
762         return(ret);
763 }
764
765 static void
766 regset(bits,def,c)
767 char *bits;
768 int def;
769 register int c;
770 {
771         if (regcode == &regdummy)
772             return;
773         if (def)
774                 bits[c >> 3] &= ~(1 << (c & 7));
775         else
776                 bits[c >> 3] |=  (1 << (c & 7));
777 }
778
779 static char *
780 regclass()
781 {
782         register char *bits;
783         register int class;
784         register int lastclass;
785         register int range = 0;
786         register char *ret;
787         register int def;
788
789         if (*regparse == '^') { /* Complement of range. */
790                 ret = regnode(ANYBUT);
791                 regparse++;
792                 def = 0;
793         } else {
794                 ret = regnode(ANYOF);
795                 def = 255;
796         }
797         bits = regcode;
798         for (class = 0; class < 32; class++)
799             regc(def);
800         if (*regparse == ']' || *regparse == '-')
801                 regset(bits,def,lastclass = *regparse++);
802         while (regparse < regxend && *regparse != ']') {
803                 class = UCHARAT(regparse++);
804                 if (class == '\\') {
805                         class = UCHARAT(regparse++);
806                         switch (class) {
807                         case 'w':
808                                 for (class = 'a'; class <= 'z'; class++)
809                                         regset(bits,def,class);
810                                 for (class = 'A'; class <= 'Z'; class++)
811                                         regset(bits,def,class);
812                                 for (class = '0'; class <= '9'; class++)
813                                         regset(bits,def,class);
814                                 regset(bits,def,'_');
815                                 lastclass = 1234;
816                                 continue;
817                         case 's':
818                                 regset(bits,def,' ');
819                                 regset(bits,def,'\t');
820                                 regset(bits,def,'\r');
821                                 regset(bits,def,'\f');
822                                 regset(bits,def,'\n');
823                                 lastclass = 1234;
824                                 continue;
825                         case 'd':
826                                 for (class = '0'; class <= '9'; class++)
827                                         regset(bits,def,class);
828                                 lastclass = 1234;
829                                 continue;
830                         case 'n':
831                                 class = '\n';
832                                 break;
833                         case 'r':
834                                 class = '\r';
835                                 break;
836                         case 't':
837                                 class = '\t';
838                                 break;
839                         case 'f':
840                                 class = '\f';
841                                 break;
842                         case 'b':
843                                 class = '\b';
844                                 break;
845                         case '0': case '1': case '2': case '3': case '4':
846                         case '5': case '6': case '7': case '8': case '9':
847                                 class -= '0';
848                                 if (isdigit(*regparse)) {
849                                         class <<= 3;
850                                         class += *regparse++ - '0';
851                                 }
852                                 if (isdigit(*regparse)) {
853                                         class <<= 3;
854                                         class += *regparse++ - '0';
855                                 }
856                                 break;
857                         }
858                 }
859                 if (!range && class == '-' && regparse < regxend &&
860                     *regparse != ']') {
861                         range = 1;
862                         continue;
863                 }
864                 if (range) {
865                         if (lastclass > class)
866                                 FAIL("invalid [] range in regexp");
867                 }
868                 else
869                         lastclass = class - 1;
870                 range = 0;
871                 for (lastclass++; lastclass <= class; lastclass++) {
872                         regset(bits,def,lastclass);
873                         if (regfold && isupper(lastclass))
874                                 regset(bits,def,tolower(lastclass));
875                 }
876                 lastclass = class;
877         }
878         if (*regparse != ']')
879                 FAIL("unmatched [] in regexp");
880         regparse++;
881         return ret;
882 }
883
884 /*
885  - regnode - emit a node
886  */
887 static char *                   /* Location. */
888 regnode(op)
889 char op;
890 {
891         register char *ret;
892         register char *ptr;
893
894         ret = regcode;
895         if (ret == &regdummy) {
896 #ifdef REGALIGN
897                 if (!(regsize & 1))
898                         regsize++;
899 #endif
900                 regsize += 3;
901                 return(ret);
902         }
903
904 #ifdef REGALIGN
905 #ifndef lint
906         if (!((long)ret & 1))
907             *ret++ = 127;
908 #endif
909 #endif
910         ptr = ret;
911         *ptr++ = op;
912         *ptr++ = '\0';          /* Null "next" pointer. */
913         *ptr++ = '\0';
914         regcode = ptr;
915
916         return(ret);
917 }
918
919 /*
920  - regc - emit (if appropriate) a byte of code
921  */
922 static void
923 regc(b)
924 char b;
925 {
926         if (regcode != &regdummy)
927                 *regcode++ = b;
928         else
929                 regsize++;
930 }
931
932 /*
933  - reginsert - insert an operator in front of already-emitted operand
934  *
935  * Means relocating the operand.
936  */
937 static void
938 reginsert(op, opnd)
939 char op;
940 char *opnd;
941 {
942         register char *src;
943         register char *dst;
944         register char *place;
945
946         if (regcode == &regdummy) {
947 #ifdef REGALIGN
948                 regsize += 4;
949 #else
950                 regsize += 3;
951 #endif
952                 return;
953         }
954
955         src = regcode;
956 #ifdef REGALIGN
957         regcode += 4;
958 #else
959         regcode += 3;
960 #endif
961         dst = regcode;
962         while (src > opnd)
963                 *--dst = *--src;
964
965         place = opnd;           /* Op node, where operand used to be. */
966         *place++ = op;
967         *place++ = '\0';
968         *place++ = '\0';
969 }
970
971 /*
972  - regtail - set the next-pointer at the end of a node chain
973  */
974 static void
975 regtail(p, val)
976 char *p;
977 char *val;
978 {
979         register char *scan;
980         register char *temp;
981         register int offset;
982
983         if (p == &regdummy)
984                 return;
985
986         /* Find last node. */
987         scan = p;
988         for (;;) {
989                 temp = regnext(scan);
990                 if (temp == NULL)
991                         break;
992                 scan = temp;
993         }
994
995 #ifdef REGALIGN
996         offset = val - scan;
997 #ifndef lint
998         *(short*)(scan+1) = offset;
999 #else
1000         offset = offset;
1001 #endif
1002 #else
1003         if (OP(scan) == BACK)
1004                 offset = scan - val;
1005         else
1006                 offset = val - scan;
1007         *(scan+1) = (offset>>8)&0377;
1008         *(scan+2) = offset&0377;
1009 #endif
1010 }
1011
1012 /*
1013  - regoptail - regtail on operand of first argument; nop if operandless
1014  */
1015 static void
1016 regoptail(p, val)
1017 char *p;
1018 char *val;
1019 {
1020         /* "Operandless" and "op != BRANCH" are synonymous in practice. */
1021         if (p == NULL || p == &regdummy || OP(p) != BRANCH)
1022                 return;
1023         regtail(NEXTOPER(p), val);
1024 }
1025
1026 /*
1027  - regcurly - a little FSA that accepts {\d+,?\d*}
1028  */
1029 STATIC int
1030 regcurly(s)
1031 register char *s;
1032 {
1033     if (*s++ != '{')
1034         return FALSE;
1035     if (!isdigit(*s))
1036         return FALSE;
1037     while (isdigit(*s))
1038         s++;
1039     if (*s == ',')
1040         s++;
1041     while (isdigit(*s))
1042         s++;
1043     if (*s != '}')
1044         return FALSE;
1045     return TRUE;
1046 }
1047
1048 #ifdef DEBUGGING
1049
1050 /*
1051  - regdump - dump a regexp onto stderr in vaguely comprehensible form
1052  */
1053 void
1054 regdump(r)
1055 regexp *r;
1056 {
1057         register char *s;
1058         register char op = EXACTLY;     /* Arbitrary non-END op. */
1059         register char *next;
1060         extern char *index();
1061
1062
1063         s = r->program + 1;
1064         while (op != END) {     /* While that wasn't END last time... */
1065 #ifdef REGALIGN
1066                 if (!((long)s & 1))
1067                         s++;
1068 #endif
1069                 op = OP(s);
1070                 fprintf(stderr,"%2d%s", s-r->program, regprop(s));      /* Where, what. */
1071                 next = regnext(s);
1072                 if (next == NULL)               /* Next ptr. */
1073                         fprintf(stderr,"(0)");
1074                 else 
1075                         fprintf(stderr,"(%d)", (s-r->program)+(next-s));
1076                 s += 3;
1077                 if (op == ANYOF || op == ANYBUT) {
1078                         s += 32;
1079                 }
1080                 if (op == EXACTLY) {
1081                         /* Literal string, where present. */
1082                         s++;
1083                         while (*s != '\0') {
1084                                 (void)putchar(*s);
1085                                 s++;
1086                         }
1087                         s++;
1088                 }
1089                 (void)putchar('\n');
1090         }
1091
1092         /* Header fields of interest. */
1093         if (r->regstart)
1094                 fprintf(stderr,"start `%s' ", r->regstart->str_ptr);
1095         if (r->regstclass)
1096                 fprintf(stderr,"stclass `%s' ", regprop(r->regstclass));
1097         if (r->reganch)
1098                 fprintf(stderr,"anchored ");
1099         if (r->regmust != NULL)
1100                 fprintf(stderr,"must have \"%s\" back %d ", r->regmust->str_ptr,
1101                   r->regback);
1102         fprintf(stderr,"\n");
1103 }
1104
1105 /*
1106  - regprop - printable representation of opcode
1107  */
1108 char *
1109 regprop(op)
1110 char *op;
1111 {
1112         register char *p;
1113
1114         (void) strcpy(buf, ":");
1115
1116         switch (OP(op)) {
1117         case BOL:
1118                 p = "BOL";
1119                 break;
1120         case EOL:
1121                 p = "EOL";
1122                 break;
1123         case ANY:
1124                 p = "ANY";
1125                 break;
1126         case ANYOF:
1127                 p = "ANYOF";
1128                 break;
1129         case ANYBUT:
1130                 p = "ANYBUT";
1131                 break;
1132         case BRANCH:
1133                 p = "BRANCH";
1134                 break;
1135         case EXACTLY:
1136                 p = "EXACTLY";
1137                 break;
1138         case NOTHING:
1139                 p = "NOTHING";
1140                 break;
1141         case BACK:
1142                 p = "BACK";
1143                 break;
1144         case END:
1145                 p = "END";
1146                 break;
1147         case ALNUM:
1148                 p = "ALNUM";
1149                 break;
1150         case NALNUM:
1151                 p = "NALNUM";
1152                 break;
1153         case BOUND:
1154                 p = "BOUND";
1155                 break;
1156         case NBOUND:
1157                 p = "NBOUND";
1158                 break;
1159         case SPACE:
1160                 p = "SPACE";
1161                 break;
1162         case NSPACE:
1163                 p = "NSPACE";
1164                 break;
1165         case DIGIT:
1166                 p = "DIGIT";
1167                 break;
1168         case NDIGIT:
1169                 p = "NDIGIT";
1170                 break;
1171         case REF:
1172         case REF+1:
1173         case REF+2:
1174         case REF+3:
1175         case REF+4:
1176         case REF+5:
1177         case REF+6:
1178         case REF+7:
1179         case REF+8:
1180         case REF+9:
1181                 (void)sprintf(buf+strlen(buf), "REF%d", OP(op)-REF);
1182                 p = NULL;
1183                 break;
1184         case OPEN+1:
1185         case OPEN+2:
1186         case OPEN+3:
1187         case OPEN+4:
1188         case OPEN+5:
1189         case OPEN+6:
1190         case OPEN+7:
1191         case OPEN+8:
1192         case OPEN+9:
1193                 (void)sprintf(buf+strlen(buf), "OPEN%d", OP(op)-OPEN);
1194                 p = NULL;
1195                 break;
1196         case CLOSE+1:
1197         case CLOSE+2:
1198         case CLOSE+3:
1199         case CLOSE+4:
1200         case CLOSE+5:
1201         case CLOSE+6:
1202         case CLOSE+7:
1203         case CLOSE+8:
1204         case CLOSE+9:
1205                 (void)sprintf(buf+strlen(buf), "CLOSE%d", OP(op)-CLOSE);
1206                 p = NULL;
1207                 break;
1208         case STAR:
1209                 p = "STAR";
1210                 break;
1211         case PLUS:
1212                 p = "PLUS";
1213                 break;
1214         default:
1215                 FAIL("corrupted regexp opcode");
1216         }
1217         if (p != NULL)
1218                 (void) strcat(buf, p);
1219         return(buf);
1220 }
1221 #endif /* DEBUGGING */
1222
1223 regfree(r)
1224 struct regexp *r;
1225 {
1226         if (r->precomp)
1227                 Safefree(r->precomp);
1228         if (r->subbase)
1229                 Safefree(r->subbase);
1230         if (r->regmust)
1231                 str_free(r->regmust);
1232         if (r->regstart)
1233                 str_free(r->regstart);
1234         Safefree(r);
1235 }