This is a live mirror of the Perl 5 development currently hosted at https://github.com/perl/perl5
perl 3.0 patch #28 (combined patch)
[perl5.git] / regcomp.c
1 /* NOTE: this is derived from Henry Spencer's regexp code, and should not
2  * confused with the original package (see point 3 below).  Thanks, Henry!
3  */
4
5 /* Additional note: this code is very heavily munged from Henry's version
6  * in places.  In some spots I've traded clarity for efficiency, so don't
7  * blame Henry for some of the lack of readability.
8  */
9
10 /* $Header: regcomp.c,v 3.0.1.5 90/08/13 22:23:29 lwall Locked $
11  *
12  * $Log:        regcomp.c,v $
13  * Revision 3.0.1.5  90/08/13  22:23:29  lwall
14  * patch28: /x{m}/ didn't work right
15  * 
16  * Revision 3.0.1.4  90/08/09  05:05:33  lwall
17  * patch19: sped up /x+y/ patterns greatly by not retrying on every x
18  * patch19: inhibited backoff on patterns anchored to the end like /\s+$/
19  * patch19: sped up {m,n} on simple items
20  * patch19: optimized /.*whatever/ to /^.*whatever/
21  * patch19: fixed character classes to allow backslashing hyphen
22  * 
23  * Revision 3.0.1.3  90/03/12  16:59:22  lwall
24  * patch13: pattern matches can now use \0 to mean \000
25  * 
26  * Revision 3.0.1.2  90/02/28  18:08:35  lwall
27  * patch9: /[\200-\377]/ didn't work on machines with signed chars
28  * 
29  * Revision 3.0.1.1  89/11/11  04:51:04  lwall
30  * patch2: /[\000]/ didn't work
31  * 
32  * Revision 3.0  89/10/18  15:22:29  lwall
33  * 3.0 baseline
34  * 
35  */
36
37 /*
38  * regcomp and regexec -- regsub and regerror are not used in perl
39  *
40  *      Copyright (c) 1986 by University of Toronto.
41  *      Written by Henry Spencer.  Not derived from licensed software.
42  *
43  *      Permission is granted to anyone to use this software for any
44  *      purpose on any computer system, and to redistribute it freely,
45  *      subject to the following restrictions:
46  *
47  *      1. The author is not responsible for the consequences of use of
48  *              this software, no matter how awful, even if they arise
49  *              from defects in it.
50  *
51  *      2. The origin of this software must not be misrepresented, either
52  *              by explicit claim or by omission.
53  *
54  *      3. Altered versions must be plainly marked as such, and must not
55  *              be misrepresented as being the original software.
56  *
57  *
58  ****    Alterations to Henry's code are...
59  ****
60  ****    Copyright (c) 1989, Larry Wall
61  ****
62  ****    You may distribute under the terms of the GNU General Public License
63  ****    as specified in the README file that comes with the perl 3.0 kit.
64  *
65  * Beware that some of this code is subtly aware of the way operator
66  * precedence is structured in regular expressions.  Serious changes in
67  * regular-expression syntax might require a total rethink.
68  */
69 #include "EXTERN.h"
70 #include "perl.h"
71 #include "INTERN.h"
72 #include "regcomp.h"
73
74 #ifndef STATIC
75 #define STATIC  static
76 #endif
77
78 #define ISMULT1(c)      ((c) == '*' || (c) == '+' || (c) == '?')
79 #define ISMULT2(s)      ((*s) == '*' || (*s) == '+' || (*s) == '?' || \
80         ((*s) == '{' && regcurly(s)))
81 #define META    "^$.[()|?+*\\"
82
83 /*
84  * Flags to be passed up and down.
85  */
86 #define HASWIDTH        01      /* Known never to match null string. */
87 #define SIMPLE          02      /* Simple enough to be STAR/PLUS operand. */
88 #define SPSTART         04      /* Starts with * or +. */
89 #define WORST           0       /* Worst case. */
90
91 /*
92  * Global work variables for regcomp().
93  */
94 static char *regprecomp;                /* uncompiled string. */
95 static char *regparse;          /* Input-scan pointer. */
96 static char *regxend;           /* End of input for compile */
97 static int regnpar;             /* () count. */
98 static char *regcode;           /* Code-emit pointer; &regdummy = don't. */
99 static long regsize;            /* Code size. */
100 static int regfold;
101 static int regsawbracket;       /* Did we do {d,d} trick? */
102
103 /*
104  * Forward declarations for regcomp()'s friends.
105  */
106 STATIC int regcurly();
107 STATIC char *reg();
108 STATIC char *regbranch();
109 STATIC char *regpiece();
110 STATIC char *regatom();
111 STATIC char *regclass();
112 STATIC char *regnode();
113 STATIC void regc();
114 STATIC void reginsert();
115 STATIC void regtail();
116 STATIC void regoptail();
117
118 /*
119  - regcomp - compile a regular expression into internal code
120  *
121  * We can't allocate space until we know how big the compiled form will be,
122  * but we can't compile it (and thus know how big it is) until we've got a
123  * place to put the code.  So we cheat:  we compile it twice, once with code
124  * generation turned off and size counting turned on, and once "for real".
125  * This also means that we don't allocate space until we are sure that the
126  * thing really will compile successfully, and we never have to move the
127  * code and thus invalidate pointers into it.  (Note that it has to be in
128  * one piece because free() must be able to free it all.) [NB: not true in perl]
129  *
130  * Beware that the optimization-preparation code in here knows about some
131  * of the structure of the compiled regexp.  [I'll say.]
132  */
133 regexp *
134 regcomp(exp,xend,fold)
135 char *exp;
136 char *xend;
137 int fold;
138 {
139         register regexp *r;
140         register char *scan;
141         register STR *longest;
142         register int len;
143         register char *first;
144         int flags;
145         int back;
146         int curback;
147         extern char *safemalloc();
148         extern char *savestr();
149         int sawplus = 0;
150
151         if (exp == NULL)
152                 fatal("NULL regexp argument");
153
154         /* First pass: determine size, legality. */
155         regfold = fold;
156         regparse = exp;
157         regxend = xend;
158         regprecomp = nsavestr(exp,xend-exp);
159         regsawbracket = 0;
160         regnpar = 1;
161         regsize = 0L;
162         regcode = &regdummy;
163         regc(MAGIC);
164         if (reg(0, &flags) == NULL) {
165                 Safefree(regprecomp);
166                 return(NULL);
167         }
168
169         /* Small enough for pointer-storage convention? */
170         if (regsize >= 32767L)          /* Probably could be 65535L. */
171                 FAIL("regexp too big");
172
173         /* Allocate space. */
174         Newc(1001, r, sizeof(regexp) + (unsigned)regsize, char, regexp);
175         if (r == NULL)
176                 FAIL("regexp out of space");
177
178         /* Second pass: emit code. */
179         if (regsawbracket)
180             bcopy(regprecomp,exp,xend-exp);
181         r->precomp = regprecomp;
182         r->subbase = NULL;
183         regparse = exp;
184         regnpar = 1;
185         regcode = r->program;
186         regc(MAGIC);
187         if (reg(0, &flags) == NULL)
188                 return(NULL);
189
190         /* Dig out information for optimizations. */
191         r->regstart = Nullstr;  /* Worst-case defaults. */
192         r->reganch = 0;
193         r->regmust = Nullstr;
194         r->regback = -1;
195         r->regstclass = Nullch;
196         scan = r->program+1;                    /* First BRANCH. */
197         if (OP(regnext(scan)) == END) {/* Only one top-level choice. */
198                 scan = NEXTOPER(scan);
199
200                 first = scan;
201                 while ((OP(first) > OPEN && OP(first) < CLOSE) ||
202                     (OP(first) == BRANCH && OP(regnext(first)) != BRANCH) ||
203                     (OP(first) == PLUS) ||
204                     (OP(first) == CURLY && ARG1(first) > 0) ) {
205                         if (OP(first) == CURLY)
206                             first += 4;
207                         else if (OP(first) == PLUS)
208                             sawplus = 2;
209                         first = NEXTOPER(first);
210                 }
211
212                 /* Starting-point info. */
213                 if (OP(first) == EXACTLY) {
214                         r->regstart =
215                             str_make(OPERAND(first)+1,*OPERAND(first));
216                         if (r->regstart->str_cur > !(sawstudy|fold))
217                                 fbmcompile(r->regstart,fold);
218                 }
219                 else if ((exp = index(simple,OP(first))) && exp > simple)
220                         r->regstclass = first;
221                 else if (OP(first) == BOUND || OP(first) == NBOUND)
222                         r->regstclass = first;
223                 else if (OP(first) == BOL ||
224                     (OP(first) == STAR && OP(NEXTOPER(first)) == ANY) )
225                         r->reganch = 1;         /* kinda turn .* into ^.* */
226                 r->reganch |= sawplus;
227
228 #ifdef DEBUGGING
229                 if (debug & 512)
230                     fprintf(stderr,"first %d next %d offset %d\n",
231                       OP(first), OP(NEXTOPER(first)), first - scan);
232 #endif
233                 /*
234                  * If there's something expensive in the r.e., find the
235                  * longest literal string that must appear and make it the
236                  * regmust.  Resolve ties in favor of later strings, since
237                  * the regstart check works with the beginning of the r.e.
238                  * and avoiding duplication strengthens checking.  Not a
239                  * strong reason, but sufficient in the absence of others.
240                  * [Now we resolve ties in favor of the earlier string if
241                  * it happens that curback has been invalidated, since the
242                  * earlier string may buy us something the later one won't.]
243                  */
244                 longest = str_make("",0);
245                 len = 0;
246                 curback = 0;
247                 back = 0;
248                 while (scan != NULL) {
249                         if (OP(scan) == BRANCH) {
250                             if (OP(regnext(scan)) == BRANCH) {
251                                 curback = -30000;
252                                 while (OP(scan) == BRANCH)
253                                     scan = regnext(scan);
254                             }
255                             else        /* single branch is ok */
256                                 scan = NEXTOPER(scan);
257                         }
258                         if (OP(scan) == EXACTLY) {
259                             first = scan;
260                             while (OP(regnext(scan)) >= CLOSE)
261                                 scan = regnext(scan);
262                             if (curback - back == len) {
263                                 str_ncat(longest, OPERAND(first)+1,
264                                     *OPERAND(first));
265                                 len += *OPERAND(first);
266                                 curback += *OPERAND(first);
267                                 first = regnext(scan);
268                             }
269                             else if (*OPERAND(first) >= len + (curback >= 0)) {
270                                 len = *OPERAND(first);
271                                 str_nset(longest, OPERAND(first)+1,len);
272                                 back = curback;
273                                 curback += len;
274                                 first = regnext(scan);
275                             }
276                             else
277                                 curback += *OPERAND(first);
278                         }
279                         else if (index(varies,OP(scan)))
280                                 curback = -30000;
281                         else if (index(simple,OP(scan)))
282                                 curback++;
283                         scan = regnext(scan);
284                 }
285                 if (len) {
286                         r->regmust = longest;
287                         if (back < 0)
288                                 back = -1;
289                         r->regback = back;
290                         if (len > !(sawstudy||fold||OP(first)==EOL))
291                                 fbmcompile(r->regmust,fold);
292                         r->regmust->str_u.str_useful = 100;
293                         if (OP(first) == EOL) /* is match anchored to EOL? */
294                             r->regmust->str_pok |= SP_TAIL;
295                 }
296                 else
297                         str_free(longest);
298         }
299
300         r->do_folding = fold;
301         r->nparens = regnpar - 1;
302 #ifdef DEBUGGING
303         if (debug & 512)
304                 regdump(r);
305 #endif
306         return(r);
307 }
308
309 /*
310  - reg - regular expression, i.e. main body or parenthesized thing
311  *
312  * Caller must absorb opening parenthesis.
313  *
314  * Combining parenthesis handling with the base level of regular expression
315  * is a trifle forced, but the need to tie the tails of the branches to what
316  * follows makes it hard to avoid.
317  */
318 static char *
319 reg(paren, flagp)
320 int paren;                      /* Parenthesized? */
321 int *flagp;
322 {
323         register char *ret;
324         register char *br;
325         register char *ender;
326         register int parno;
327         int flags;
328
329         *flagp = HASWIDTH;      /* Tentatively. */
330
331         /* Make an OPEN node, if parenthesized. */
332         if (paren) {
333                 if (regnpar >= NSUBEXP)
334                         FAIL("too many () in regexp");
335                 parno = regnpar;
336                 regnpar++;
337                 ret = regnode(OPEN+parno);
338         } else
339                 ret = NULL;
340
341         /* Pick up the branches, linking them together. */
342         br = regbranch(&flags);
343         if (br == NULL)
344                 return(NULL);
345         if (ret != NULL)
346                 regtail(ret, br);       /* OPEN -> first. */
347         else
348                 ret = br;
349         if (!(flags&HASWIDTH))
350                 *flagp &= ~HASWIDTH;
351         *flagp |= flags&SPSTART;
352         while (*regparse == '|') {
353                 regparse++;
354                 br = regbranch(&flags);
355                 if (br == NULL)
356                         return(NULL);
357                 regtail(ret, br);       /* BRANCH -> BRANCH. */
358                 if (!(flags&HASWIDTH))
359                         *flagp &= ~HASWIDTH;
360                 *flagp |= flags&SPSTART;
361         }
362
363         /* Make a closing node, and hook it on the end. */
364         ender = regnode((paren) ? CLOSE+parno : END);   
365         regtail(ret, ender);
366
367         /* Hook the tails of the branches to the closing node. */
368         for (br = ret; br != NULL; br = regnext(br))
369                 regoptail(br, ender);
370
371         /* Check for proper termination. */
372         if (paren && *regparse++ != ')') {
373                 FAIL("unmatched () in regexp");
374         } else if (!paren && regparse < regxend) {
375                 if (*regparse == ')') {
376                         FAIL("unmatched () in regexp");
377                 } else
378                         FAIL("junk on end of regexp");  /* "Can't happen". */
379                 /* NOTREACHED */
380         }
381
382         return(ret);
383 }
384
385 /*
386  - regbranch - one alternative of an | operator
387  *
388  * Implements the concatenation operator.
389  */
390 static char *
391 regbranch(flagp)
392 int *flagp;
393 {
394         register char *ret;
395         register char *chain;
396         register char *latest;
397         int flags;
398
399         *flagp = WORST;         /* Tentatively. */
400
401         ret = regnode(BRANCH);
402         chain = NULL;
403         while (regparse < regxend && *regparse != '|' && *regparse != ')') {
404                 latest = regpiece(&flags);
405                 if (latest == NULL)
406                         return(NULL);
407                 *flagp |= flags&HASWIDTH;
408                 if (chain == NULL)      /* First piece. */
409                         *flagp |= flags&SPSTART;
410                 else
411                         regtail(chain, latest);
412                 chain = latest;
413         }
414         if (chain == NULL)      /* Loop ran zero times. */
415                 (void) regnode(NOTHING);
416
417         return(ret);
418 }
419
420 /*
421  - regpiece - something followed by possible [*+?]
422  *
423  * Note that the branching code sequences used for ? and the general cases
424  * of * and + are somewhat optimized:  they use the same NOTHING node as
425  * both the endmarker for their branch list and the body of the last branch.
426  * It might seem that this node could be dispensed with entirely, but the
427  * endmarker role is not redundant.
428  */
429 static char *
430 regpiece(flagp)
431 int *flagp;
432 {
433         register char *ret;
434         register char op;
435         register char *next;
436         int flags;
437         char *origparse = regparse;
438         int orignpar = regnpar;
439         char *max;
440         int iter;
441         char ch;
442
443         ret = regatom(&flags);
444         if (ret == NULL)
445                 return(NULL);
446
447         op = *regparse;
448
449         /* Here's a total kludge: if after the atom there's a {\d+,?\d*}
450          * then we decrement the first number by one and reset our
451          * parsing back to the beginning of the same atom.  If the first number
452          * is down to 0, decrement the second number instead and fake up
453          * a ? after it.  Given the way this compiler doesn't keep track
454          * of offsets on the first pass, this is the only way to replicate
455          * a piece of code.  Sigh.
456          */
457         if (op == '{' && regcurly(regparse)) {
458             next = regparse + 1;
459             max = Nullch;
460             while (isdigit(*next) || *next == ',') {
461                 if (*next == ',') {
462                     if (max)
463                         break;
464                     else
465                         max = next;
466                 }
467                 next++;
468             }
469             if (*next == '}') {         /* got one */
470                 if (!max)
471                     max = next;
472                 regparse++;
473                 iter = atoi(regparse);
474                 if (flags&SIMPLE) {     /* we can do it right after all */
475                     int tmp;
476
477                     reginsert(CURLY, ret);
478                     if (*max == ',')
479                         max++;
480                     else
481                         max = regparse;
482                     tmp = atoi(max);
483                     if (tmp && tmp < iter)
484                         fatal("Can't do {n,m} with n > m");
485                     if (regcode != &regdummy) {
486 #ifdef REGALIGN
487                         *(unsigned short *)(ret+3) = iter;
488                         *(unsigned short *)(ret+5) = tmp;
489 #else
490                         ret[3] = iter >> 8; ret[4] = iter & 0377;
491                         ret[5] = tmp  >> 8; ret[6] = tmp  & 0377;
492 #endif
493                     }
494                     regparse = next;
495                     goto nest_check;
496                 }
497                 regsawbracket++;        /* remember we clobbered exp */
498                 if (iter > 0) {
499                     ch = *max;
500                     sprintf(regparse,"%.*d", max-regparse, iter - 1);
501                     *max = ch;
502                     if (*max == ',' && max[1] != '}') {
503                         if (atoi(max+1) <= 0)
504                             fatal("Can't do {n,m} with n > m");
505                         ch = *next;
506                         sprintf(max+1,"%.*d", next-(max+1), atoi(max+1) - 1);
507                         *next = ch;
508                     }
509                     if (iter != 1 || *max == ',') {
510                         regparse = origparse;   /* back up input pointer */
511                         regnpar = orignpar;     /* don't make more parens */
512                     }
513                     else {
514                         regparse = next;
515                         goto nest_check;
516                     }
517                     *flagp = flags;
518                     return ret;
519                 }
520                 if (*max == ',') {
521                     max++;
522                     iter = atoi(max);
523                     if (max == next) {          /* any number more? */
524                         regparse = next;
525                         op = '*';               /* fake up one with a star */
526                     }
527                     else if (iter > 0) {
528                         op = '?';               /* fake up optional atom */
529                         ch = *next;
530                         sprintf(max,"%.*d", next-max, iter - 1);
531                         *next = ch;
532                         if (iter == 1)
533                             regparse = next;
534                         else {
535                             regparse = origparse - 1; /* offset ++ below */
536                             regnpar = orignpar;
537                         }
538                     }
539                     else
540                         fatal("Can't do {n,0}");
541                 }
542                 else
543                     fatal("Can't do {0}");
544             }
545         }
546
547         if (!ISMULT1(op)) {
548                 *flagp = flags;
549                 return(ret);
550         }
551
552         if (!(flags&HASWIDTH) && op != '?')
553                 FAIL("regexp *+ operand could be empty");
554         *flagp = (op != '+') ? (WORST|SPSTART) : (WORST|HASWIDTH);
555
556         if (op == '*' && (flags&SIMPLE))
557                 reginsert(STAR, ret);
558         else if (op == '*') {
559                 /* Emit x* as (x&|), where & means "self". */
560                 reginsert(BRANCH, ret);                 /* Either x */
561                 regoptail(ret, regnode(BACK));          /* and loop */
562                 regoptail(ret, ret);                    /* back */
563                 regtail(ret, regnode(BRANCH));          /* or */
564                 regtail(ret, regnode(NOTHING));         /* null. */
565         } else if (op == '+' && (flags&SIMPLE))
566                 reginsert(PLUS, ret);
567         else if (op == '+') {
568                 /* Emit x+ as x(&|), where & means "self". */
569                 next = regnode(BRANCH);                 /* Either */
570                 regtail(ret, next);
571                 regtail(regnode(BACK), ret);            /* loop back */
572                 regtail(next, regnode(BRANCH));         /* or */
573                 regtail(ret, regnode(NOTHING));         /* null. */
574         } else if (op == '?') {
575                 /* Emit x? as (x|) */
576                 reginsert(BRANCH, ret);                 /* Either x */
577                 regtail(ret, regnode(BRANCH));          /* or */
578                 next = regnode(NOTHING);                /* null. */
579                 regtail(ret, next);
580                 regoptail(ret, next);
581         }
582       nest_check:
583         regparse++;
584         if (ISMULT2(regparse))
585                 FAIL("nested *?+ in regexp");
586
587         return(ret);
588 }
589
590 /*
591  - regatom - the lowest level
592  *
593  * Optimization:  gobbles an entire sequence of ordinary characters so that
594  * it can turn them into a single node, which is smaller to store and
595  * faster to run.  Backslashed characters are exceptions, each becoming a
596  * separate node; the code is simpler that way and it's not worth fixing.
597  *
598  * [Yes, it is worth fixing, some scripts can run twice the speed.]
599  */
600 static char *
601 regatom(flagp)
602 int *flagp;
603 {
604         register char *ret;
605         int flags;
606
607         *flagp = WORST;         /* Tentatively. */
608
609         switch (*regparse++) {
610         case '^':
611                 ret = regnode(BOL);
612                 break;
613         case '$':
614                 ret = regnode(EOL);
615                 break;
616         case '.':
617                 ret = regnode(ANY);
618                 *flagp |= HASWIDTH|SIMPLE;
619                 break;
620         case '[':
621                 ret = regclass();
622                 *flagp |= HASWIDTH|SIMPLE;
623                 break;
624         case '(':
625                 ret = reg(1, &flags);
626                 if (ret == NULL)
627                         return(NULL);
628                 *flagp |= flags&(HASWIDTH|SPSTART);
629                 break;
630         case '|':
631         case ')':
632                 FAIL("internal urp in regexp"); /* Supposed to be caught earlier. */
633                 break;
634         case '?':
635         case '+':
636         case '*':
637                 FAIL("?+* follows nothing in regexp");
638                 break;
639         case '\\':
640                 switch (*regparse) {
641                 case 'w':
642                         ret = regnode(ALNUM);
643                         *flagp |= HASWIDTH|SIMPLE;
644                         regparse++;
645                         break;
646                 case 'W':
647                         ret = regnode(NALNUM);
648                         *flagp |= HASWIDTH|SIMPLE;
649                         regparse++;
650                         break;
651                 case 'b':
652                         ret = regnode(BOUND);
653                         *flagp |= SIMPLE;
654                         regparse++;
655                         break;
656                 case 'B':
657                         ret = regnode(NBOUND);
658                         *flagp |= SIMPLE;
659                         regparse++;
660                         break;
661                 case 's':
662                         ret = regnode(SPACE);
663                         *flagp |= HASWIDTH|SIMPLE;
664                         regparse++;
665                         break;
666                 case 'S':
667                         ret = regnode(NSPACE);
668                         *flagp |= HASWIDTH|SIMPLE;
669                         regparse++;
670                         break;
671                 case 'd':
672                         ret = regnode(DIGIT);
673                         *flagp |= HASWIDTH|SIMPLE;
674                         regparse++;
675                         break;
676                 case 'D':
677                         ret = regnode(NDIGIT);
678                         *flagp |= HASWIDTH|SIMPLE;
679                         regparse++;
680                         break;
681                 case 'n':
682                 case 'r':
683                 case 't':
684                 case 'f':
685                         goto defchar;
686                 case '0': case '1': case '2': case '3': case '4':
687                 case '5': case '6': case '7': case '8': case '9':
688                         if (isdigit(regparse[1]) || *regparse == '0')
689                                 goto defchar;
690                         else {
691                                 ret = regnode(REF + *regparse++ - '0');
692                                 *flagp |= SIMPLE;
693                         }
694                         break;
695                 case '\0':
696                         if (regparse >= regxend)
697                             FAIL("trailing \\ in regexp");
698                         /* FALL THROUGH */
699                 default:
700                         goto defchar;
701                 }
702                 break;
703         default: {
704                         register int len;
705                         register char ender;
706                         register char *p;
707                         char *oldp;
708                         int foo;
709
710                     defchar:
711                         ret = regnode(EXACTLY);
712                         regc(0);                /* save spot for len */
713                         for (len=0, p=regparse-1;
714                           len < 127 && p < regxend;
715                           len++)
716                         {
717                             oldp = p;
718                             switch (*p) {
719                             case '^':
720                             case '$':
721                             case '.':
722                             case '[':
723                             case '(':
724                             case ')':
725                             case '|':
726                                 goto loopdone;
727                             case '\\':
728                                 switch (*++p) {
729                                 case 'w':
730                                 case 'W':
731                                 case 'b':
732                                 case 'B':
733                                 case 's':
734                                 case 'S':
735                                 case 'd':
736                                 case 'D':
737                                     --p;
738                                     goto loopdone;
739                                 case 'n':
740                                         ender = '\n';
741                                         p++;
742                                         break;
743                                 case 'r':
744                                         ender = '\r';
745                                         p++;
746                                         break;
747                                 case 't':
748                                         ender = '\t';
749                                         p++;
750                                         break;
751                                 case 'f':
752                                         ender = '\f';
753                                         p++;
754                                         break;
755                                 case '0': case '1': case '2': case '3':case '4':
756                                 case '5': case '6': case '7': case '8':case '9':
757                                     if (isdigit(p[1]) || *p == '0') {
758                                         foo = *p - '0';
759                                         if (isdigit(p[1]))
760                                             foo = (foo<<3) + *++p - '0';
761                                         if (isdigit(p[1]))
762                                             foo = (foo<<3) + *++p - '0';
763                                         ender = foo;
764                                         p++;
765                                     }
766                                     else {
767                                         --p;
768                                         goto loopdone;
769                                     }
770                                     break;
771                                 case '\0':
772                                     if (p >= regxend)
773                                         FAIL("trailing \\ in regexp");
774                                     /* FALL THROUGH */
775                                 default:
776                                     ender = *p++;
777                                     break;
778                                 }
779                                 break;
780                             default:
781                                 ender = *p++;
782                                 break;
783                             }
784                             if (regfold && isupper(ender))
785                                     ender = tolower(ender);
786                             if (ISMULT2(p)) { /* Back off on ?+*. */
787                                 if (len)
788                                     p = oldp;
789                                 else {
790                                     len++;
791                                     regc(ender);
792                                 }
793                                 break;
794                             }
795                             regc(ender);
796                         }
797                     loopdone:
798                         regparse = p;
799                         if (len <= 0)
800                                 FAIL("internal disaster in regexp");
801                         *flagp |= HASWIDTH;
802                         if (len == 1)
803                                 *flagp |= SIMPLE;
804                         if (regcode != &regdummy)
805                             *OPERAND(ret) = len;
806                         regc('\0');
807                 }
808                 break;
809         }
810
811         return(ret);
812 }
813
814 static void
815 regset(bits,def,c)
816 char *bits;
817 int def;
818 register int c;
819 {
820         if (regcode == &regdummy)
821             return;
822         c &= 255;
823         if (def)
824                 bits[c >> 3] &= ~(1 << (c & 7));
825         else
826                 bits[c >> 3] |=  (1 << (c & 7));
827 }
828
829 static char *
830 regclass()
831 {
832         register char *bits;
833         register int class;
834         register int lastclass;
835         register int range = 0;
836         register char *ret;
837         register int def;
838
839         ret = regnode(ANYOF);
840         if (*regparse == '^') { /* Complement of range. */
841                 regparse++;
842                 def = 0;
843         } else {
844                 def = 255;
845         }
846         bits = regcode;
847         for (class = 0; class < 32; class++)
848             regc(def);
849         if (*regparse == ']' || *regparse == '-')
850                 goto skipcond;          /* allow 1st char to be ] or - */
851         while (regparse < regxend && *regparse != ']') {
852               skipcond:
853                 class = UCHARAT(regparse++);
854                 if (class == '\\') {
855                         class = UCHARAT(regparse++);
856                         switch (class) {
857                         case 'w':
858                                 for (class = 'a'; class <= 'z'; class++)
859                                         regset(bits,def,class);
860                                 for (class = 'A'; class <= 'Z'; class++)
861                                         regset(bits,def,class);
862                                 for (class = '0'; class <= '9'; class++)
863                                         regset(bits,def,class);
864                                 regset(bits,def,'_');
865                                 lastclass = 1234;
866                                 continue;
867                         case 's':
868                                 regset(bits,def,' ');
869                                 regset(bits,def,'\t');
870                                 regset(bits,def,'\r');
871                                 regset(bits,def,'\f');
872                                 regset(bits,def,'\n');
873                                 lastclass = 1234;
874                                 continue;
875                         case 'd':
876                                 for (class = '0'; class <= '9'; class++)
877                                         regset(bits,def,class);
878                                 lastclass = 1234;
879                                 continue;
880                         case 'n':
881                                 class = '\n';
882                                 break;
883                         case 'r':
884                                 class = '\r';
885                                 break;
886                         case 't':
887                                 class = '\t';
888                                 break;
889                         case 'f':
890                                 class = '\f';
891                                 break;
892                         case 'b':
893                                 class = '\b';
894                                 break;
895                         case '0': case '1': case '2': case '3': case '4':
896                         case '5': case '6': case '7': case '8': case '9':
897                                 class -= '0';
898                                 if (isdigit(*regparse)) {
899                                         class <<= 3;
900                                         class += *regparse++ - '0';
901                                 }
902                                 if (isdigit(*regparse)) {
903                                         class <<= 3;
904                                         class += *regparse++ - '0';
905                                 }
906                                 break;
907                         }
908                 }
909                 if (range) {
910                         if (lastclass > class)
911                                 FAIL("invalid [] range in regexp");
912                         range = 0;
913                 }
914                 else {
915                         lastclass = class;
916                         if (*regparse == '-' && regparse+1 < regxend &&
917                             regparse[1] != ']') {
918                                 regparse++;
919                                 range = 1;
920                                 continue;       /* do it next time */
921                         }
922                 }
923                 for ( ; lastclass <= class; lastclass++) {
924                         regset(bits,def,lastclass);
925                         if (regfold && isupper(lastclass))
926                                 regset(bits,def,tolower(lastclass));
927                 }
928                 lastclass = class;
929         }
930         if (*regparse != ']')
931                 FAIL("unmatched [] in regexp");
932         regparse++;
933         return ret;
934 }
935
936 /*
937  - regnode - emit a node
938  */
939 static char *                   /* Location. */
940 regnode(op)
941 char op;
942 {
943         register char *ret;
944         register char *ptr;
945
946         ret = regcode;
947         if (ret == &regdummy) {
948 #ifdef REGALIGN
949                 if (!(regsize & 1))
950                         regsize++;
951 #endif
952                 regsize += 3;
953                 return(ret);
954         }
955
956 #ifdef REGALIGN
957 #ifndef lint
958         if (!((long)ret & 1))
959             *ret++ = 127;
960 #endif
961 #endif
962         ptr = ret;
963         *ptr++ = op;
964         *ptr++ = '\0';          /* Null "next" pointer. */
965         *ptr++ = '\0';
966         regcode = ptr;
967
968         return(ret);
969 }
970
971 /*
972  - regc - emit (if appropriate) a byte of code
973  */
974 static void
975 regc(b)
976 char b;
977 {
978         if (regcode != &regdummy)
979                 *regcode++ = b;
980         else
981                 regsize++;
982 }
983
984 /*
985  - reginsert - insert an operator in front of already-emitted operand
986  *
987  * Means relocating the operand.
988  */
989 static void
990 reginsert(op, opnd)
991 char op;
992 char *opnd;
993 {
994         register char *src;
995         register char *dst;
996         register char *place;
997         register offset = (op == CURLY ? 4 : 0);
998
999         if (regcode == &regdummy) {
1000 #ifdef REGALIGN
1001                 regsize += 4 + offset;
1002 #else
1003                 regsize += 3 + offset;
1004 #endif
1005                 return;
1006         }
1007
1008         src = regcode;
1009 #ifdef REGALIGN
1010         regcode += 4 + offset;
1011 #else
1012         regcode += 3 + offset;
1013 #endif
1014         dst = regcode;
1015         while (src > opnd)
1016                 *--dst = *--src;
1017
1018         place = opnd;           /* Op node, where operand used to be. */
1019         *place++ = op;
1020         *place++ = '\0';
1021         *place++ = '\0';
1022         while (offset-- > 0)
1023             *place++ = '\0';
1024 }
1025
1026 /*
1027  - regtail - set the next-pointer at the end of a node chain
1028  */
1029 static void
1030 regtail(p, val)
1031 char *p;
1032 char *val;
1033 {
1034         register char *scan;
1035         register char *temp;
1036         register int offset;
1037
1038         if (p == &regdummy)
1039                 return;
1040
1041         /* Find last node. */
1042         scan = p;
1043         for (;;) {
1044                 temp = regnext(scan);
1045                 if (temp == NULL)
1046                         break;
1047                 scan = temp;
1048         }
1049
1050 #ifdef REGALIGN
1051         offset = val - scan;
1052 #ifndef lint
1053         *(short*)(scan+1) = offset;
1054 #else
1055         offset = offset;
1056 #endif
1057 #else
1058         if (OP(scan) == BACK)
1059                 offset = scan - val;
1060         else
1061                 offset = val - scan;
1062         *(scan+1) = (offset>>8)&0377;
1063         *(scan+2) = offset&0377;
1064 #endif
1065 }
1066
1067 /*
1068  - regoptail - regtail on operand of first argument; nop if operandless
1069  */
1070 static void
1071 regoptail(p, val)
1072 char *p;
1073 char *val;
1074 {
1075         /* "Operandless" and "op != BRANCH" are synonymous in practice. */
1076         if (p == NULL || p == &regdummy || OP(p) != BRANCH)
1077                 return;
1078         regtail(NEXTOPER(p), val);
1079 }
1080
1081 /*
1082  - regcurly - a little FSA that accepts {\d+,?\d*}
1083  */
1084 STATIC int
1085 regcurly(s)
1086 register char *s;
1087 {
1088     if (*s++ != '{')
1089         return FALSE;
1090     if (!isdigit(*s))
1091         return FALSE;
1092     while (isdigit(*s))
1093         s++;
1094     if (*s == ',')
1095         s++;
1096     while (isdigit(*s))
1097         s++;
1098     if (*s != '}')
1099         return FALSE;
1100     return TRUE;
1101 }
1102
1103 #ifdef DEBUGGING
1104
1105 /*
1106  - regdump - dump a regexp onto stderr in vaguely comprehensible form
1107  */
1108 void
1109 regdump(r)
1110 regexp *r;
1111 {
1112         register char *s;
1113         register char op = EXACTLY;     /* Arbitrary non-END op. */
1114         register char *next;
1115         extern char *index();
1116
1117
1118         s = r->program + 1;
1119         while (op != END) {     /* While that wasn't END last time... */
1120 #ifdef REGALIGN
1121                 if (!((long)s & 1))
1122                         s++;
1123 #endif
1124                 op = OP(s);
1125                 fprintf(stderr,"%2d%s", s-r->program, regprop(s));      /* Where, what. */
1126                 next = regnext(s);
1127                 if (next == NULL)               /* Next ptr. */
1128                         fprintf(stderr,"(0)");
1129                 else 
1130                         fprintf(stderr,"(%d)", (s-r->program)+(next-s));
1131                 s += 3;
1132                 if (op == ANYOF) {
1133                         s += 32;
1134                 }
1135                 if (op == EXACTLY) {
1136                         /* Literal string, where present. */
1137                         s++;
1138                         while (*s != '\0') {
1139                                 (void)putchar(*s);
1140                                 s++;
1141                         }
1142                         s++;
1143                 }
1144                 (void)putchar('\n');
1145         }
1146
1147         /* Header fields of interest. */
1148         if (r->regstart)
1149                 fprintf(stderr,"start `%s' ", r->regstart->str_ptr);
1150         if (r->regstclass)
1151                 fprintf(stderr,"stclass `%s' ", regprop(r->regstclass));
1152         if (r->reganch & 1)
1153                 fprintf(stderr,"anchored ");
1154         if (r->reganch & 2)
1155                 fprintf(stderr,"plus ");
1156         if (r->regmust != NULL)
1157                 fprintf(stderr,"must have \"%s\" back %d ", r->regmust->str_ptr,
1158                   r->regback);
1159         fprintf(stderr,"\n");
1160 }
1161
1162 /*
1163  - regprop - printable representation of opcode
1164  */
1165 char *
1166 regprop(op)
1167 char *op;
1168 {
1169         register char *p;
1170
1171         (void) strcpy(buf, ":");
1172
1173         switch (OP(op)) {
1174         case BOL:
1175                 p = "BOL";
1176                 break;
1177         case EOL:
1178                 p = "EOL";
1179                 break;
1180         case ANY:
1181                 p = "ANY";
1182                 break;
1183         case ANYOF:
1184                 p = "ANYOF";
1185                 break;
1186         case BRANCH:
1187                 p = "BRANCH";
1188                 break;
1189         case EXACTLY:
1190                 p = "EXACTLY";
1191                 break;
1192         case NOTHING:
1193                 p = "NOTHING";
1194                 break;
1195         case BACK:
1196                 p = "BACK";
1197                 break;
1198         case END:
1199                 p = "END";
1200                 break;
1201         case ALNUM:
1202                 p = "ALNUM";
1203                 break;
1204         case NALNUM:
1205                 p = "NALNUM";
1206                 break;
1207         case BOUND:
1208                 p = "BOUND";
1209                 break;
1210         case NBOUND:
1211                 p = "NBOUND";
1212                 break;
1213         case SPACE:
1214                 p = "SPACE";
1215                 break;
1216         case NSPACE:
1217                 p = "NSPACE";
1218                 break;
1219         case DIGIT:
1220                 p = "DIGIT";
1221                 break;
1222         case NDIGIT:
1223                 p = "NDIGIT";
1224                 break;
1225         case CURLY:
1226                 (void)sprintf(buf+strlen(buf), "CURLY {%d,%d}",
1227                     ARG1(op),ARG2(op));
1228                 p = NULL;
1229                 break;
1230         case REF:
1231         case REF+1:
1232         case REF+2:
1233         case REF+3:
1234         case REF+4:
1235         case REF+5:
1236         case REF+6:
1237         case REF+7:
1238         case REF+8:
1239         case REF+9:
1240                 (void)sprintf(buf+strlen(buf), "REF%d", OP(op)-REF);
1241                 p = NULL;
1242                 break;
1243         case OPEN+1:
1244         case OPEN+2:
1245         case OPEN+3:
1246         case OPEN+4:
1247         case OPEN+5:
1248         case OPEN+6:
1249         case OPEN+7:
1250         case OPEN+8:
1251         case OPEN+9:
1252                 (void)sprintf(buf+strlen(buf), "OPEN%d", OP(op)-OPEN);
1253                 p = NULL;
1254                 break;
1255         case CLOSE+1:
1256         case CLOSE+2:
1257         case CLOSE+3:
1258         case CLOSE+4:
1259         case CLOSE+5:
1260         case CLOSE+6:
1261         case CLOSE+7:
1262         case CLOSE+8:
1263         case CLOSE+9:
1264                 (void)sprintf(buf+strlen(buf), "CLOSE%d", OP(op)-CLOSE);
1265                 p = NULL;
1266                 break;
1267         case STAR:
1268                 p = "STAR";
1269                 break;
1270         case PLUS:
1271                 p = "PLUS";
1272                 break;
1273         default:
1274                 FAIL("corrupted regexp opcode");
1275         }
1276         if (p != NULL)
1277                 (void) strcat(buf, p);
1278         return(buf);
1279 }
1280 #endif /* DEBUGGING */
1281
1282 regfree(r)
1283 struct regexp *r;
1284 {
1285         if (r->precomp)
1286                 Safefree(r->precomp);
1287         if (r->subbase)
1288                 Safefree(r->subbase);
1289         if (r->regmust)
1290                 str_free(r->regmust);
1291         if (r->regstart)
1292                 str_free(r->regstart);
1293         Safefree(r);
1294 }