This is a live mirror of the Perl 5 development currently hosted at https://github.com/perl/perl5
perl 3.0 patch #25 patch #19, continued
[perl5.git] / regcomp.c
1 /* NOTE: this is derived from Henry Spencer's regexp code, and should not
2  * confused with the original package (see point 3 below).  Thanks, Henry!
3  */
4
5 /* Additional note: this code is very heavily munged from Henry's version
6  * in places.  In some spots I've traded clarity for efficiency, so don't
7  * blame Henry for some of the lack of readability.
8  */
9
10 /* $Header: regcomp.c,v 3.0.1.4 90/08/09 05:05:33 lwall Locked $
11  *
12  * $Log:        regcomp.c,v $
13  * Revision 3.0.1.4  90/08/09  05:05:33  lwall
14  * patch19: sped up /x+y/ patterns greatly by not retrying on every x
15  * patch19: inhibited backoff on patterns anchored to the end like /\s+$/
16  * patch19: sped up {m,n} on simple items
17  * patch19: optimized /.*whatever/ to /^.*whatever/
18  * patch19: fixed character classes to allow backslashing hyphen
19  * 
20  * Revision 3.0.1.3  90/03/12  16:59:22  lwall
21  * patch13: pattern matches can now use \0 to mean \000
22  * 
23  * Revision 3.0.1.2  90/02/28  18:08:35  lwall
24  * patch9: /[\200-\377]/ didn't work on machines with signed chars
25  * 
26  * Revision 3.0.1.1  89/11/11  04:51:04  lwall
27  * patch2: /[\000]/ didn't work
28  * 
29  * Revision 3.0  89/10/18  15:22:29  lwall
30  * 3.0 baseline
31  * 
32  */
33
34 /*
35  * regcomp and regexec -- regsub and regerror are not used in perl
36  *
37  *      Copyright (c) 1986 by University of Toronto.
38  *      Written by Henry Spencer.  Not derived from licensed software.
39  *
40  *      Permission is granted to anyone to use this software for any
41  *      purpose on any computer system, and to redistribute it freely,
42  *      subject to the following restrictions:
43  *
44  *      1. The author is not responsible for the consequences of use of
45  *              this software, no matter how awful, even if they arise
46  *              from defects in it.
47  *
48  *      2. The origin of this software must not be misrepresented, either
49  *              by explicit claim or by omission.
50  *
51  *      3. Altered versions must be plainly marked as such, and must not
52  *              be misrepresented as being the original software.
53  *
54  *
55  ****    Alterations to Henry's code are...
56  ****
57  ****    Copyright (c) 1989, Larry Wall
58  ****
59  ****    You may distribute under the terms of the GNU General Public License
60  ****    as specified in the README file that comes with the perl 3.0 kit.
61  *
62  * Beware that some of this code is subtly aware of the way operator
63  * precedence is structured in regular expressions.  Serious changes in
64  * regular-expression syntax might require a total rethink.
65  */
66 #include "EXTERN.h"
67 #include "perl.h"
68 #include "INTERN.h"
69 #include "regcomp.h"
70
71 #ifndef STATIC
72 #define STATIC  static
73 #endif
74
75 #define ISMULT1(c)      ((c) == '*' || (c) == '+' || (c) == '?')
76 #define ISMULT2(s)      ((*s) == '*' || (*s) == '+' || (*s) == '?' || \
77         ((*s) == '{' && regcurly(s)))
78 #define META    "^$.[()|?+*\\"
79
80 /*
81  * Flags to be passed up and down.
82  */
83 #define HASWIDTH        01      /* Known never to match null string. */
84 #define SIMPLE          02      /* Simple enough to be STAR/PLUS operand. */
85 #define SPSTART         04      /* Starts with * or +. */
86 #define WORST           0       /* Worst case. */
87
88 /*
89  * Global work variables for regcomp().
90  */
91 static char *regprecomp;                /* uncompiled string. */
92 static char *regparse;          /* Input-scan pointer. */
93 static char *regxend;           /* End of input for compile */
94 static int regnpar;             /* () count. */
95 static char *regcode;           /* Code-emit pointer; &regdummy = don't. */
96 static long regsize;            /* Code size. */
97 static int regfold;
98 static int regsawbracket;       /* Did we do {d,d} trick? */
99
100 /*
101  * Forward declarations for regcomp()'s friends.
102  */
103 STATIC int regcurly();
104 STATIC char *reg();
105 STATIC char *regbranch();
106 STATIC char *regpiece();
107 STATIC char *regatom();
108 STATIC char *regclass();
109 STATIC char *regnode();
110 STATIC void regc();
111 STATIC void reginsert();
112 STATIC void regtail();
113 STATIC void regoptail();
114
115 /*
116  - regcomp - compile a regular expression into internal code
117  *
118  * We can't allocate space until we know how big the compiled form will be,
119  * but we can't compile it (and thus know how big it is) until we've got a
120  * place to put the code.  So we cheat:  we compile it twice, once with code
121  * generation turned off and size counting turned on, and once "for real".
122  * This also means that we don't allocate space until we are sure that the
123  * thing really will compile successfully, and we never have to move the
124  * code and thus invalidate pointers into it.  (Note that it has to be in
125  * one piece because free() must be able to free it all.) [NB: not true in perl]
126  *
127  * Beware that the optimization-preparation code in here knows about some
128  * of the structure of the compiled regexp.  [I'll say.]
129  */
130 regexp *
131 regcomp(exp,xend,fold)
132 char *exp;
133 char *xend;
134 int fold;
135 {
136         register regexp *r;
137         register char *scan;
138         register STR *longest;
139         register int len;
140         register char *first;
141         int flags;
142         int back;
143         int curback;
144         extern char *safemalloc();
145         extern char *savestr();
146         int sawplus = 0;
147
148         if (exp == NULL)
149                 fatal("NULL regexp argument");
150
151         /* First pass: determine size, legality. */
152         regfold = fold;
153         regparse = exp;
154         regxend = xend;
155         regprecomp = nsavestr(exp,xend-exp);
156         regsawbracket = 0;
157         regnpar = 1;
158         regsize = 0L;
159         regcode = &regdummy;
160         regc(MAGIC);
161         if (reg(0, &flags) == NULL) {
162                 Safefree(regprecomp);
163                 return(NULL);
164         }
165
166         /* Small enough for pointer-storage convention? */
167         if (regsize >= 32767L)          /* Probably could be 65535L. */
168                 FAIL("regexp too big");
169
170         /* Allocate space. */
171         Newc(1001, r, sizeof(regexp) + (unsigned)regsize, char, regexp);
172         if (r == NULL)
173                 FAIL("regexp out of space");
174
175         /* Second pass: emit code. */
176         if (regsawbracket)
177             bcopy(regprecomp,exp,xend-exp);
178         r->precomp = regprecomp;
179         r->subbase = NULL;
180         regparse = exp;
181         regnpar = 1;
182         regcode = r->program;
183         regc(MAGIC);
184         if (reg(0, &flags) == NULL)
185                 return(NULL);
186
187         /* Dig out information for optimizations. */
188         r->regstart = Nullstr;  /* Worst-case defaults. */
189         r->reganch = 0;
190         r->regmust = Nullstr;
191         r->regback = -1;
192         r->regstclass = Nullch;
193         scan = r->program+1;                    /* First BRANCH. */
194         if (OP(regnext(scan)) == END) {/* Only one top-level choice. */
195                 scan = NEXTOPER(scan);
196
197                 first = scan;
198                 while ((OP(first) > OPEN && OP(first) < CLOSE) ||
199                     (OP(first) == BRANCH && OP(regnext(first)) != BRANCH) ||
200                     (OP(first) == PLUS) ||
201                     (OP(first) == CURLY && ARG1(first) > 0) ) {
202                         if (OP(first) == CURLY)
203                             first += 4;
204                         else if (OP(first) == PLUS)
205                             sawplus = 2;
206                         first = NEXTOPER(first);
207                 }
208
209                 /* Starting-point info. */
210                 if (OP(first) == EXACTLY) {
211                         r->regstart =
212                             str_make(OPERAND(first)+1,*OPERAND(first));
213                         if (r->regstart->str_cur > !(sawstudy|fold))
214                                 fbmcompile(r->regstart,fold);
215                 }
216                 else if ((exp = index(simple,OP(first))) && exp > simple)
217                         r->regstclass = first;
218                 else if (OP(first) == BOUND || OP(first) == NBOUND)
219                         r->regstclass = first;
220                 else if (OP(first) == BOL ||
221                     (OP(first) == STAR && OP(NEXTOPER(first)) == ANY) )
222                         r->reganch = 1;         /* kinda turn .* into ^.* */
223                 r->reganch |= sawplus;
224
225 #ifdef DEBUGGING
226                 if (debug & 512)
227                     fprintf(stderr,"first %d next %d offset %d\n",
228                       OP(first), OP(NEXTOPER(first)), first - scan);
229 #endif
230                 /*
231                  * If there's something expensive in the r.e., find the
232                  * longest literal string that must appear and make it the
233                  * regmust.  Resolve ties in favor of later strings, since
234                  * the regstart check works with the beginning of the r.e.
235                  * and avoiding duplication strengthens checking.  Not a
236                  * strong reason, but sufficient in the absence of others.
237                  * [Now we resolve ties in favor of the earlier string if
238                  * it happens that curback has been invalidated, since the
239                  * earlier string may buy us something the later one won't.]
240                  */
241                 longest = str_make("",0);
242                 len = 0;
243                 curback = 0;
244                 back = 0;
245                 while (scan != NULL) {
246                         if (OP(scan) == BRANCH) {
247                             if (OP(regnext(scan)) == BRANCH) {
248                                 curback = -30000;
249                                 while (OP(scan) == BRANCH)
250                                     scan = regnext(scan);
251                             }
252                             else        /* single branch is ok */
253                                 scan = NEXTOPER(scan);
254                         }
255                         if (OP(scan) == EXACTLY) {
256                             first = scan;
257                             while (OP(regnext(scan)) >= CLOSE)
258                                 scan = regnext(scan);
259                             if (curback - back == len) {
260                                 str_ncat(longest, OPERAND(first)+1,
261                                     *OPERAND(first));
262                                 len += *OPERAND(first);
263                                 curback += *OPERAND(first);
264                                 first = regnext(scan);
265                             }
266                             else if (*OPERAND(first) >= len + (curback >= 0)) {
267                                 len = *OPERAND(first);
268                                 str_nset(longest, OPERAND(first)+1,len);
269                                 back = curback;
270                                 curback += len;
271                                 first = regnext(scan);
272                             }
273                             else
274                                 curback += *OPERAND(first);
275                         }
276                         else if (index(varies,OP(scan)))
277                                 curback = -30000;
278                         else if (index(simple,OP(scan)))
279                                 curback++;
280                         scan = regnext(scan);
281                 }
282                 if (len) {
283                         r->regmust = longest;
284                         if (back < 0)
285                                 back = -1;
286                         r->regback = back;
287                         if (len > !(sawstudy||fold||OP(first)==EOL))
288                                 fbmcompile(r->regmust,fold);
289                         r->regmust->str_u.str_useful = 100;
290                         if (OP(first) == EOL) /* is match anchored to EOL? */
291                             r->regmust->str_pok |= SP_TAIL;
292                 }
293                 else
294                         str_free(longest);
295         }
296
297         r->do_folding = fold;
298         r->nparens = regnpar - 1;
299 #ifdef DEBUGGING
300         if (debug & 512)
301                 regdump(r);
302 #endif
303         return(r);
304 }
305
306 /*
307  - reg - regular expression, i.e. main body or parenthesized thing
308  *
309  * Caller must absorb opening parenthesis.
310  *
311  * Combining parenthesis handling with the base level of regular expression
312  * is a trifle forced, but the need to tie the tails of the branches to what
313  * follows makes it hard to avoid.
314  */
315 static char *
316 reg(paren, flagp)
317 int paren;                      /* Parenthesized? */
318 int *flagp;
319 {
320         register char *ret;
321         register char *br;
322         register char *ender;
323         register int parno;
324         int flags;
325
326         *flagp = HASWIDTH;      /* Tentatively. */
327
328         /* Make an OPEN node, if parenthesized. */
329         if (paren) {
330                 if (regnpar >= NSUBEXP)
331                         FAIL("too many () in regexp");
332                 parno = regnpar;
333                 regnpar++;
334                 ret = regnode(OPEN+parno);
335         } else
336                 ret = NULL;
337
338         /* Pick up the branches, linking them together. */
339         br = regbranch(&flags);
340         if (br == NULL)
341                 return(NULL);
342         if (ret != NULL)
343                 regtail(ret, br);       /* OPEN -> first. */
344         else
345                 ret = br;
346         if (!(flags&HASWIDTH))
347                 *flagp &= ~HASWIDTH;
348         *flagp |= flags&SPSTART;
349         while (*regparse == '|') {
350                 regparse++;
351                 br = regbranch(&flags);
352                 if (br == NULL)
353                         return(NULL);
354                 regtail(ret, br);       /* BRANCH -> BRANCH. */
355                 if (!(flags&HASWIDTH))
356                         *flagp &= ~HASWIDTH;
357                 *flagp |= flags&SPSTART;
358         }
359
360         /* Make a closing node, and hook it on the end. */
361         ender = regnode((paren) ? CLOSE+parno : END);   
362         regtail(ret, ender);
363
364         /* Hook the tails of the branches to the closing node. */
365         for (br = ret; br != NULL; br = regnext(br))
366                 regoptail(br, ender);
367
368         /* Check for proper termination. */
369         if (paren && *regparse++ != ')') {
370                 FAIL("unmatched () in regexp");
371         } else if (!paren && regparse < regxend) {
372                 if (*regparse == ')') {
373                         FAIL("unmatched () in regexp");
374                 } else
375                         FAIL("junk on end of regexp");  /* "Can't happen". */
376                 /* NOTREACHED */
377         }
378
379         return(ret);
380 }
381
382 /*
383  - regbranch - one alternative of an | operator
384  *
385  * Implements the concatenation operator.
386  */
387 static char *
388 regbranch(flagp)
389 int *flagp;
390 {
391         register char *ret;
392         register char *chain;
393         register char *latest;
394         int flags;
395
396         *flagp = WORST;         /* Tentatively. */
397
398         ret = regnode(BRANCH);
399         chain = NULL;
400         while (regparse < regxend && *regparse != '|' && *regparse != ')') {
401                 latest = regpiece(&flags);
402                 if (latest == NULL)
403                         return(NULL);
404                 *flagp |= flags&HASWIDTH;
405                 if (chain == NULL)      /* First piece. */
406                         *flagp |= flags&SPSTART;
407                 else
408                         regtail(chain, latest);
409                 chain = latest;
410         }
411         if (chain == NULL)      /* Loop ran zero times. */
412                 (void) regnode(NOTHING);
413
414         return(ret);
415 }
416
417 /*
418  - regpiece - something followed by possible [*+?]
419  *
420  * Note that the branching code sequences used for ? and the general cases
421  * of * and + are somewhat optimized:  they use the same NOTHING node as
422  * both the endmarker for their branch list and the body of the last branch.
423  * It might seem that this node could be dispensed with entirely, but the
424  * endmarker role is not redundant.
425  */
426 static char *
427 regpiece(flagp)
428 int *flagp;
429 {
430         register char *ret;
431         register char op;
432         register char *next;
433         int flags;
434         char *origparse = regparse;
435         int orignpar = regnpar;
436         char *max;
437         int iter;
438         char ch;
439
440         ret = regatom(&flags);
441         if (ret == NULL)
442                 return(NULL);
443
444         op = *regparse;
445
446         /* Here's a total kludge: if after the atom there's a {\d+,?\d*}
447          * then we decrement the first number by one and reset our
448          * parsing back to the beginning of the same atom.  If the first number
449          * is down to 0, decrement the second number instead and fake up
450          * a ? after it.  Given the way this compiler doesn't keep track
451          * of offsets on the first pass, this is the only way to replicate
452          * a piece of code.  Sigh.
453          */
454         if (op == '{' && regcurly(regparse)) {
455             next = regparse + 1;
456             max = Nullch;
457             while (isdigit(*next) || *next == ',') {
458                 if (*next == ',') {
459                     if (max)
460                         break;
461                     else
462                         max = next;
463                 }
464                 next++;
465             }
466             if (*next == '}') {         /* got one */
467                 if (!max)
468                     max = next;
469                 regparse++;
470                 iter = atoi(regparse);
471                 if (flags&SIMPLE) {     /* we can do it right after all */
472                     int tmp;
473
474                     reginsert(CURLY, ret);
475                     if (*max == ',')
476                         max++;
477                     tmp = atoi(max);
478                     if (tmp && tmp < iter)
479                         fatal("Can't do {n,m} with n > m");
480                     if (regcode != &regdummy) {
481 #ifdef REGALIGN
482                         *(unsigned short *)(ret+3) = iter;
483                         *(unsigned short *)(ret+5) = tmp;
484 #else
485                         ret[3] = iter >> 8; ret[4] = iter & 0377;
486                         ret[5] = tmp  >> 8; ret[6] = tmp  & 0377;
487 #endif
488                     }
489                     regparse = next;
490                     goto nest_check;
491                 }
492                 regsawbracket++;        /* remember we clobbered exp */
493                 if (iter > 0) {
494                     ch = *max;
495                     sprintf(regparse,"%.*d", max-regparse, iter - 1);
496                     *max = ch;
497                     if (*max == ',' && max[1] != '}') {
498                         if (atoi(max+1) <= 0)
499                             fatal("Can't do {n,m} with n > m");
500                         ch = *next;
501                         sprintf(max+1,"%.*d", next-(max+1), atoi(max+1) - 1);
502                         *next = ch;
503                     }
504                     if (iter != 1 || *max == ',') {
505                         regparse = origparse;   /* back up input pointer */
506                         regnpar = orignpar;     /* don't make more parens */
507                     }
508                     else {
509                         regparse = next;
510                         goto nest_check;
511                     }
512                     *flagp = flags;
513                     return ret;
514                 }
515                 if (*max == ',') {
516                     max++;
517                     iter = atoi(max);
518                     if (max == next) {          /* any number more? */
519                         regparse = next;
520                         op = '*';               /* fake up one with a star */
521                     }
522                     else if (iter > 0) {
523                         op = '?';               /* fake up optional atom */
524                         ch = *next;
525                         sprintf(max,"%.*d", next-max, iter - 1);
526                         *next = ch;
527                         if (iter == 1)
528                             regparse = next;
529                         else {
530                             regparse = origparse - 1; /* offset ++ below */
531                             regnpar = orignpar;
532                         }
533                     }
534                     else
535                         fatal("Can't do {n,0}");
536                 }
537                 else
538                     fatal("Can't do {0}");
539             }
540         }
541
542         if (!ISMULT1(op)) {
543                 *flagp = flags;
544                 return(ret);
545         }
546
547         if (!(flags&HASWIDTH) && op != '?')
548                 FAIL("regexp *+ operand could be empty");
549         *flagp = (op != '+') ? (WORST|SPSTART) : (WORST|HASWIDTH);
550
551         if (op == '*' && (flags&SIMPLE))
552                 reginsert(STAR, ret);
553         else if (op == '*') {
554                 /* Emit x* as (x&|), where & means "self". */
555                 reginsert(BRANCH, ret);                 /* Either x */
556                 regoptail(ret, regnode(BACK));          /* and loop */
557                 regoptail(ret, ret);                    /* back */
558                 regtail(ret, regnode(BRANCH));          /* or */
559                 regtail(ret, regnode(NOTHING));         /* null. */
560         } else if (op == '+' && (flags&SIMPLE))
561                 reginsert(PLUS, ret);
562         else if (op == '+') {
563                 /* Emit x+ as x(&|), where & means "self". */
564                 next = regnode(BRANCH);                 /* Either */
565                 regtail(ret, next);
566                 regtail(regnode(BACK), ret);            /* loop back */
567                 regtail(next, regnode(BRANCH));         /* or */
568                 regtail(ret, regnode(NOTHING));         /* null. */
569         } else if (op == '?') {
570                 /* Emit x? as (x|) */
571                 reginsert(BRANCH, ret);                 /* Either x */
572                 regtail(ret, regnode(BRANCH));          /* or */
573                 next = regnode(NOTHING);                /* null. */
574                 regtail(ret, next);
575                 regoptail(ret, next);
576         }
577       nest_check:
578         regparse++;
579         if (ISMULT2(regparse))
580                 FAIL("nested *?+ in regexp");
581
582         return(ret);
583 }
584
585 /*
586  - regatom - the lowest level
587  *
588  * Optimization:  gobbles an entire sequence of ordinary characters so that
589  * it can turn them into a single node, which is smaller to store and
590  * faster to run.  Backslashed characters are exceptions, each becoming a
591  * separate node; the code is simpler that way and it's not worth fixing.
592  *
593  * [Yes, it is worth fixing, some scripts can run twice the speed.]
594  */
595 static char *
596 regatom(flagp)
597 int *flagp;
598 {
599         register char *ret;
600         int flags;
601
602         *flagp = WORST;         /* Tentatively. */
603
604         switch (*regparse++) {
605         case '^':
606                 ret = regnode(BOL);
607                 break;
608         case '$':
609                 ret = regnode(EOL);
610                 break;
611         case '.':
612                 ret = regnode(ANY);
613                 *flagp |= HASWIDTH|SIMPLE;
614                 break;
615         case '[':
616                 ret = regclass();
617                 *flagp |= HASWIDTH|SIMPLE;
618                 break;
619         case '(':
620                 ret = reg(1, &flags);
621                 if (ret == NULL)
622                         return(NULL);
623                 *flagp |= flags&(HASWIDTH|SPSTART);
624                 break;
625         case '|':
626         case ')':
627                 FAIL("internal urp in regexp"); /* Supposed to be caught earlier. */
628                 break;
629         case '?':
630         case '+':
631         case '*':
632                 FAIL("?+* follows nothing in regexp");
633                 break;
634         case '\\':
635                 switch (*regparse) {
636                 case 'w':
637                         ret = regnode(ALNUM);
638                         *flagp |= HASWIDTH|SIMPLE;
639                         regparse++;
640                         break;
641                 case 'W':
642                         ret = regnode(NALNUM);
643                         *flagp |= HASWIDTH|SIMPLE;
644                         regparse++;
645                         break;
646                 case 'b':
647                         ret = regnode(BOUND);
648                         *flagp |= SIMPLE;
649                         regparse++;
650                         break;
651                 case 'B':
652                         ret = regnode(NBOUND);
653                         *flagp |= SIMPLE;
654                         regparse++;
655                         break;
656                 case 's':
657                         ret = regnode(SPACE);
658                         *flagp |= HASWIDTH|SIMPLE;
659                         regparse++;
660                         break;
661                 case 'S':
662                         ret = regnode(NSPACE);
663                         *flagp |= HASWIDTH|SIMPLE;
664                         regparse++;
665                         break;
666                 case 'd':
667                         ret = regnode(DIGIT);
668                         *flagp |= HASWIDTH|SIMPLE;
669                         regparse++;
670                         break;
671                 case 'D':
672                         ret = regnode(NDIGIT);
673                         *flagp |= HASWIDTH|SIMPLE;
674                         regparse++;
675                         break;
676                 case 'n':
677                 case 'r':
678                 case 't':
679                 case 'f':
680                         goto defchar;
681                 case '0': case '1': case '2': case '3': case '4':
682                 case '5': case '6': case '7': case '8': case '9':
683                         if (isdigit(regparse[1]) || *regparse == '0')
684                                 goto defchar;
685                         else {
686                                 ret = regnode(REF + *regparse++ - '0');
687                                 *flagp |= SIMPLE;
688                         }
689                         break;
690                 case '\0':
691                         if (regparse >= regxend)
692                             FAIL("trailing \\ in regexp");
693                         /* FALL THROUGH */
694                 default:
695                         goto defchar;
696                 }
697                 break;
698         default: {
699                         register int len;
700                         register char ender;
701                         register char *p;
702                         char *oldp;
703                         int foo;
704
705                     defchar:
706                         ret = regnode(EXACTLY);
707                         regc(0);                /* save spot for len */
708                         for (len=0, p=regparse-1;
709                           len < 127 && p < regxend;
710                           len++)
711                         {
712                             oldp = p;
713                             switch (*p) {
714                             case '^':
715                             case '$':
716                             case '.':
717                             case '[':
718                             case '(':
719                             case ')':
720                             case '|':
721                                 goto loopdone;
722                             case '\\':
723                                 switch (*++p) {
724                                 case 'w':
725                                 case 'W':
726                                 case 'b':
727                                 case 'B':
728                                 case 's':
729                                 case 'S':
730                                 case 'd':
731                                 case 'D':
732                                     --p;
733                                     goto loopdone;
734                                 case 'n':
735                                         ender = '\n';
736                                         p++;
737                                         break;
738                                 case 'r':
739                                         ender = '\r';
740                                         p++;
741                                         break;
742                                 case 't':
743                                         ender = '\t';
744                                         p++;
745                                         break;
746                                 case 'f':
747                                         ender = '\f';
748                                         p++;
749                                         break;
750                                 case '0': case '1': case '2': case '3':case '4':
751                                 case '5': case '6': case '7': case '8':case '9':
752                                     if (isdigit(p[1]) || *p == '0') {
753                                         foo = *p - '0';
754                                         if (isdigit(p[1]))
755                                             foo = (foo<<3) + *++p - '0';
756                                         if (isdigit(p[1]))
757                                             foo = (foo<<3) + *++p - '0';
758                                         ender = foo;
759                                         p++;
760                                     }
761                                     else {
762                                         --p;
763                                         goto loopdone;
764                                     }
765                                     break;
766                                 case '\0':
767                                     if (p >= regxend)
768                                         FAIL("trailing \\ in regexp");
769                                     /* FALL THROUGH */
770                                 default:
771                                     ender = *p++;
772                                     break;
773                                 }
774                                 break;
775                             default:
776                                 ender = *p++;
777                                 break;
778                             }
779                             if (regfold && isupper(ender))
780                                     ender = tolower(ender);
781                             if (ISMULT2(p)) { /* Back off on ?+*. */
782                                 if (len)
783                                     p = oldp;
784                                 else {
785                                     len++;
786                                     regc(ender);
787                                 }
788                                 break;
789                             }
790                             regc(ender);
791                         }
792                     loopdone:
793                         regparse = p;
794                         if (len <= 0)
795                                 FAIL("internal disaster in regexp");
796                         *flagp |= HASWIDTH;
797                         if (len == 1)
798                                 *flagp |= SIMPLE;
799                         if (regcode != &regdummy)
800                             *OPERAND(ret) = len;
801                         regc('\0');
802                 }
803                 break;
804         }
805
806         return(ret);
807 }
808
809 static void
810 regset(bits,def,c)
811 char *bits;
812 int def;
813 register int c;
814 {
815         if (regcode == &regdummy)
816             return;
817         c &= 255;
818         if (def)
819                 bits[c >> 3] &= ~(1 << (c & 7));
820         else
821                 bits[c >> 3] |=  (1 << (c & 7));
822 }
823
824 static char *
825 regclass()
826 {
827         register char *bits;
828         register int class;
829         register int lastclass;
830         register int range = 0;
831         register char *ret;
832         register int def;
833
834         ret = regnode(ANYOF);
835         if (*regparse == '^') { /* Complement of range. */
836                 regparse++;
837                 def = 0;
838         } else {
839                 def = 255;
840         }
841         bits = regcode;
842         for (class = 0; class < 32; class++)
843             regc(def);
844         if (*regparse == ']' || *regparse == '-')
845                 goto skipcond;          /* allow 1st char to be ] or - */
846         while (regparse < regxend && *regparse != ']') {
847               skipcond:
848                 class = UCHARAT(regparse++);
849                 if (class == '\\') {
850                         class = UCHARAT(regparse++);
851                         switch (class) {
852                         case 'w':
853                                 for (class = 'a'; class <= 'z'; class++)
854                                         regset(bits,def,class);
855                                 for (class = 'A'; class <= 'Z'; class++)
856                                         regset(bits,def,class);
857                                 for (class = '0'; class <= '9'; class++)
858                                         regset(bits,def,class);
859                                 regset(bits,def,'_');
860                                 lastclass = 1234;
861                                 continue;
862                         case 's':
863                                 regset(bits,def,' ');
864                                 regset(bits,def,'\t');
865                                 regset(bits,def,'\r');
866                                 regset(bits,def,'\f');
867                                 regset(bits,def,'\n');
868                                 lastclass = 1234;
869                                 continue;
870                         case 'd':
871                                 for (class = '0'; class <= '9'; class++)
872                                         regset(bits,def,class);
873                                 lastclass = 1234;
874                                 continue;
875                         case 'n':
876                                 class = '\n';
877                                 break;
878                         case 'r':
879                                 class = '\r';
880                                 break;
881                         case 't':
882                                 class = '\t';
883                                 break;
884                         case 'f':
885                                 class = '\f';
886                                 break;
887                         case 'b':
888                                 class = '\b';
889                                 break;
890                         case '0': case '1': case '2': case '3': case '4':
891                         case '5': case '6': case '7': case '8': case '9':
892                                 class -= '0';
893                                 if (isdigit(*regparse)) {
894                                         class <<= 3;
895                                         class += *regparse++ - '0';
896                                 }
897                                 if (isdigit(*regparse)) {
898                                         class <<= 3;
899                                         class += *regparse++ - '0';
900                                 }
901                                 break;
902                         }
903                 }
904                 if (range) {
905                         if (lastclass > class)
906                                 FAIL("invalid [] range in regexp");
907                         range = 0;
908                 }
909                 else {
910                         lastclass = class;
911                         if (*regparse == '-' && regparse+1 < regxend &&
912                             regparse[1] != ']') {
913                                 regparse++;
914                                 range = 1;
915                                 continue;       /* do it next time */
916                         }
917                 }
918                 for ( ; lastclass <= class; lastclass++) {
919                         regset(bits,def,lastclass);
920                         if (regfold && isupper(lastclass))
921                                 regset(bits,def,tolower(lastclass));
922                 }
923                 lastclass = class;
924         }
925         if (*regparse != ']')
926                 FAIL("unmatched [] in regexp");
927         regparse++;
928         return ret;
929 }
930
931 /*
932  - regnode - emit a node
933  */
934 static char *                   /* Location. */
935 regnode(op)
936 char op;
937 {
938         register char *ret;
939         register char *ptr;
940
941         ret = regcode;
942         if (ret == &regdummy) {
943 #ifdef REGALIGN
944                 if (!(regsize & 1))
945                         regsize++;
946 #endif
947                 regsize += 3;
948                 return(ret);
949         }
950
951 #ifdef REGALIGN
952 #ifndef lint
953         if (!((long)ret & 1))
954             *ret++ = 127;
955 #endif
956 #endif
957         ptr = ret;
958         *ptr++ = op;
959         *ptr++ = '\0';          /* Null "next" pointer. */
960         *ptr++ = '\0';
961         regcode = ptr;
962
963         return(ret);
964 }
965
966 /*
967  - regc - emit (if appropriate) a byte of code
968  */
969 static void
970 regc(b)
971 char b;
972 {
973         if (regcode != &regdummy)
974                 *regcode++ = b;
975         else
976                 regsize++;
977 }
978
979 /*
980  - reginsert - insert an operator in front of already-emitted operand
981  *
982  * Means relocating the operand.
983  */
984 static void
985 reginsert(op, opnd)
986 char op;
987 char *opnd;
988 {
989         register char *src;
990         register char *dst;
991         register char *place;
992         register offset = (op == CURLY ? 4 : 0);
993
994         if (regcode == &regdummy) {
995 #ifdef REGALIGN
996                 regsize += 4 + offset;
997 #else
998                 regsize += 3 + offset;
999 #endif
1000                 return;
1001         }
1002
1003         src = regcode;
1004 #ifdef REGALIGN
1005         regcode += 4 + offset;
1006 #else
1007         regcode += 3 + offset;
1008 #endif
1009         dst = regcode;
1010         while (src > opnd)
1011                 *--dst = *--src;
1012
1013         place = opnd;           /* Op node, where operand used to be. */
1014         *place++ = op;
1015         *place++ = '\0';
1016         *place++ = '\0';
1017         while (offset-- > 0)
1018             *place++ = '\0';
1019 }
1020
1021 /*
1022  - regtail - set the next-pointer at the end of a node chain
1023  */
1024 static void
1025 regtail(p, val)
1026 char *p;
1027 char *val;
1028 {
1029         register char *scan;
1030         register char *temp;
1031         register int offset;
1032
1033         if (p == &regdummy)
1034                 return;
1035
1036         /* Find last node. */
1037         scan = p;
1038         for (;;) {
1039                 temp = regnext(scan);
1040                 if (temp == NULL)
1041                         break;
1042                 scan = temp;
1043         }
1044
1045 #ifdef REGALIGN
1046         offset = val - scan;
1047 #ifndef lint
1048         *(short*)(scan+1) = offset;
1049 #else
1050         offset = offset;
1051 #endif
1052 #else
1053         if (OP(scan) == BACK)
1054                 offset = scan - val;
1055         else
1056                 offset = val - scan;
1057         *(scan+1) = (offset>>8)&0377;
1058         *(scan+2) = offset&0377;
1059 #endif
1060 }
1061
1062 /*
1063  - regoptail - regtail on operand of first argument; nop if operandless
1064  */
1065 static void
1066 regoptail(p, val)
1067 char *p;
1068 char *val;
1069 {
1070         /* "Operandless" and "op != BRANCH" are synonymous in practice. */
1071         if (p == NULL || p == &regdummy || OP(p) != BRANCH)
1072                 return;
1073         regtail(NEXTOPER(p), val);
1074 }
1075
1076 /*
1077  - regcurly - a little FSA that accepts {\d+,?\d*}
1078  */
1079 STATIC int
1080 regcurly(s)
1081 register char *s;
1082 {
1083     if (*s++ != '{')
1084         return FALSE;
1085     if (!isdigit(*s))
1086         return FALSE;
1087     while (isdigit(*s))
1088         s++;
1089     if (*s == ',')
1090         s++;
1091     while (isdigit(*s))
1092         s++;
1093     if (*s != '}')
1094         return FALSE;
1095     return TRUE;
1096 }
1097
1098 #ifdef DEBUGGING
1099
1100 /*
1101  - regdump - dump a regexp onto stderr in vaguely comprehensible form
1102  */
1103 void
1104 regdump(r)
1105 regexp *r;
1106 {
1107         register char *s;
1108         register char op = EXACTLY;     /* Arbitrary non-END op. */
1109         register char *next;
1110         extern char *index();
1111
1112
1113         s = r->program + 1;
1114         while (op != END) {     /* While that wasn't END last time... */
1115 #ifdef REGALIGN
1116                 if (!((long)s & 1))
1117                         s++;
1118 #endif
1119                 op = OP(s);
1120                 fprintf(stderr,"%2d%s", s-r->program, regprop(s));      /* Where, what. */
1121                 next = regnext(s);
1122                 if (next == NULL)               /* Next ptr. */
1123                         fprintf(stderr,"(0)");
1124                 else 
1125                         fprintf(stderr,"(%d)", (s-r->program)+(next-s));
1126                 s += 3;
1127                 if (op == ANYOF) {
1128                         s += 32;
1129                 }
1130                 if (op == EXACTLY) {
1131                         /* Literal string, where present. */
1132                         s++;
1133                         while (*s != '\0') {
1134                                 (void)putchar(*s);
1135                                 s++;
1136                         }
1137                         s++;
1138                 }
1139                 (void)putchar('\n');
1140         }
1141
1142         /* Header fields of interest. */
1143         if (r->regstart)
1144                 fprintf(stderr,"start `%s' ", r->regstart->str_ptr);
1145         if (r->regstclass)
1146                 fprintf(stderr,"stclass `%s' ", regprop(r->regstclass));
1147         if (r->reganch & 1)
1148                 fprintf(stderr,"anchored ");
1149         if (r->reganch & 2)
1150                 fprintf(stderr,"plus ");
1151         if (r->regmust != NULL)
1152                 fprintf(stderr,"must have \"%s\" back %d ", r->regmust->str_ptr,
1153                   r->regback);
1154         fprintf(stderr,"\n");
1155 }
1156
1157 /*
1158  - regprop - printable representation of opcode
1159  */
1160 char *
1161 regprop(op)
1162 char *op;
1163 {
1164         register char *p;
1165
1166         (void) strcpy(buf, ":");
1167
1168         switch (OP(op)) {
1169         case BOL:
1170                 p = "BOL";
1171                 break;
1172         case EOL:
1173                 p = "EOL";
1174                 break;
1175         case ANY:
1176                 p = "ANY";
1177                 break;
1178         case ANYOF:
1179                 p = "ANYOF";
1180                 break;
1181         case BRANCH:
1182                 p = "BRANCH";
1183                 break;
1184         case EXACTLY:
1185                 p = "EXACTLY";
1186                 break;
1187         case NOTHING:
1188                 p = "NOTHING";
1189                 break;
1190         case BACK:
1191                 p = "BACK";
1192                 break;
1193         case END:
1194                 p = "END";
1195                 break;
1196         case ALNUM:
1197                 p = "ALNUM";
1198                 break;
1199         case NALNUM:
1200                 p = "NALNUM";
1201                 break;
1202         case BOUND:
1203                 p = "BOUND";
1204                 break;
1205         case NBOUND:
1206                 p = "NBOUND";
1207                 break;
1208         case SPACE:
1209                 p = "SPACE";
1210                 break;
1211         case NSPACE:
1212                 p = "NSPACE";
1213                 break;
1214         case DIGIT:
1215                 p = "DIGIT";
1216                 break;
1217         case NDIGIT:
1218                 p = "NDIGIT";
1219                 break;
1220         case CURLY:
1221                 (void)sprintf(buf+strlen(buf), "CURLY {%d,%d}",
1222                     ARG1(op),ARG2(op));
1223                 p = NULL;
1224                 break;
1225         case REF:
1226         case REF+1:
1227         case REF+2:
1228         case REF+3:
1229         case REF+4:
1230         case REF+5:
1231         case REF+6:
1232         case REF+7:
1233         case REF+8:
1234         case REF+9:
1235                 (void)sprintf(buf+strlen(buf), "REF%d", OP(op)-REF);
1236                 p = NULL;
1237                 break;
1238         case OPEN+1:
1239         case OPEN+2:
1240         case OPEN+3:
1241         case OPEN+4:
1242         case OPEN+5:
1243         case OPEN+6:
1244         case OPEN+7:
1245         case OPEN+8:
1246         case OPEN+9:
1247                 (void)sprintf(buf+strlen(buf), "OPEN%d", OP(op)-OPEN);
1248                 p = NULL;
1249                 break;
1250         case CLOSE+1:
1251         case CLOSE+2:
1252         case CLOSE+3:
1253         case CLOSE+4:
1254         case CLOSE+5:
1255         case CLOSE+6:
1256         case CLOSE+7:
1257         case CLOSE+8:
1258         case CLOSE+9:
1259                 (void)sprintf(buf+strlen(buf), "CLOSE%d", OP(op)-CLOSE);
1260                 p = NULL;
1261                 break;
1262         case STAR:
1263                 p = "STAR";
1264                 break;
1265         case PLUS:
1266                 p = "PLUS";
1267                 break;
1268         default:
1269                 FAIL("corrupted regexp opcode");
1270         }
1271         if (p != NULL)
1272                 (void) strcat(buf, p);
1273         return(buf);
1274 }
1275 #endif /* DEBUGGING */
1276
1277 regfree(r)
1278 struct regexp *r;
1279 {
1280         if (r->precomp)
1281                 Safefree(r->precomp);
1282         if (r->subbase)
1283                 Safefree(r->subbase);
1284         if (r->regmust)
1285                 str_free(r->regmust);
1286         if (r->regstart)
1287                 str_free(r->regstart);
1288         Safefree(r);
1289 }