This is a live mirror of the Perl 5 development currently hosted at https://github.com/perl/perl5
perl 3.0 patch #34 patch #29, continued
[perl5.git] / regcomp.c
1 /* NOTE: this is derived from Henry Spencer's regexp code, and should not
2  * confused with the original package (see point 3 below).  Thanks, Henry!
3  */
4
5 /* Additional note: this code is very heavily munged from Henry's version
6  * in places.  In some spots I've traded clarity for efficiency, so don't
7  * blame Henry for some of the lack of readability.
8  */
9
10 /* $Header: regcomp.c,v 3.0.1.6 90/10/16 10:17:33 lwall Locked $
11  *
12  * $Log:        regcomp.c,v $
13  * Revision 3.0.1.6  90/10/16  10:17:33  lwall
14  * patch29: patterns with multiple short literal strings sometimes failed
15  * 
16  * Revision 3.0.1.5  90/08/13  22:23:29  lwall
17  * patch28: /x{m}/ didn't work right
18  * 
19  * Revision 3.0.1.4  90/08/09  05:05:33  lwall
20  * patch19: sped up /x+y/ patterns greatly by not retrying on every x
21  * patch19: inhibited backoff on patterns anchored to the end like /\s+$/
22  * patch19: sped up {m,n} on simple items
23  * patch19: optimized /.*whatever/ to /^.*whatever/
24  * patch19: fixed character classes to allow backslashing hyphen
25  * 
26  * Revision 3.0.1.3  90/03/12  16:59:22  lwall
27  * patch13: pattern matches can now use \0 to mean \000
28  * 
29  * Revision 3.0.1.2  90/02/28  18:08:35  lwall
30  * patch9: /[\200-\377]/ didn't work on machines with signed chars
31  * 
32  * Revision 3.0.1.1  89/11/11  04:51:04  lwall
33  * patch2: /[\000]/ didn't work
34  * 
35  * Revision 3.0  89/10/18  15:22:29  lwall
36  * 3.0 baseline
37  * 
38  */
39
40 /*
41  * regcomp and regexec -- regsub and regerror are not used in perl
42  *
43  *      Copyright (c) 1986 by University of Toronto.
44  *      Written by Henry Spencer.  Not derived from licensed software.
45  *
46  *      Permission is granted to anyone to use this software for any
47  *      purpose on any computer system, and to redistribute it freely,
48  *      subject to the following restrictions:
49  *
50  *      1. The author is not responsible for the consequences of use of
51  *              this software, no matter how awful, even if they arise
52  *              from defects in it.
53  *
54  *      2. The origin of this software must not be misrepresented, either
55  *              by explicit claim or by omission.
56  *
57  *      3. Altered versions must be plainly marked as such, and must not
58  *              be misrepresented as being the original software.
59  *
60  *
61  ****    Alterations to Henry's code are...
62  ****
63  ****    Copyright (c) 1989, Larry Wall
64  ****
65  ****    You may distribute under the terms of the GNU General Public License
66  ****    as specified in the README file that comes with the perl 3.0 kit.
67  *
68  * Beware that some of this code is subtly aware of the way operator
69  * precedence is structured in regular expressions.  Serious changes in
70  * regular-expression syntax might require a total rethink.
71  */
72 #include "EXTERN.h"
73 #include "perl.h"
74 #include "INTERN.h"
75 #include "regcomp.h"
76
77 #ifndef STATIC
78 #define STATIC  static
79 #endif
80
81 #define ISMULT1(c)      ((c) == '*' || (c) == '+' || (c) == '?')
82 #define ISMULT2(s)      ((*s) == '*' || (*s) == '+' || (*s) == '?' || \
83         ((*s) == '{' && regcurly(s)))
84 #define META    "^$.[()|?+*\\"
85
86 /*
87  * Flags to be passed up and down.
88  */
89 #define HASWIDTH        01      /* Known never to match null string. */
90 #define SIMPLE          02      /* Simple enough to be STAR/PLUS operand. */
91 #define SPSTART         04      /* Starts with * or +. */
92 #define WORST           0       /* Worst case. */
93
94 /*
95  * Global work variables for regcomp().
96  */
97 static char *regprecomp;                /* uncompiled string. */
98 static char *regparse;          /* Input-scan pointer. */
99 static char *regxend;           /* End of input for compile */
100 static int regnpar;             /* () count. */
101 static char *regcode;           /* Code-emit pointer; &regdummy = don't. */
102 static long regsize;            /* Code size. */
103 static int regfold;
104 static int regsawbracket;       /* Did we do {d,d} trick? */
105
106 /*
107  * Forward declarations for regcomp()'s friends.
108  */
109 STATIC int regcurly();
110 STATIC char *reg();
111 STATIC char *regbranch();
112 STATIC char *regpiece();
113 STATIC char *regatom();
114 STATIC char *regclass();
115 STATIC char *regnode();
116 STATIC void regc();
117 STATIC void reginsert();
118 STATIC void regtail();
119 STATIC void regoptail();
120
121 /*
122  - regcomp - compile a regular expression into internal code
123  *
124  * We can't allocate space until we know how big the compiled form will be,
125  * but we can't compile it (and thus know how big it is) until we've got a
126  * place to put the code.  So we cheat:  we compile it twice, once with code
127  * generation turned off and size counting turned on, and once "for real".
128  * This also means that we don't allocate space until we are sure that the
129  * thing really will compile successfully, and we never have to move the
130  * code and thus invalidate pointers into it.  (Note that it has to be in
131  * one piece because free() must be able to free it all.) [NB: not true in perl]
132  *
133  * Beware that the optimization-preparation code in here knows about some
134  * of the structure of the compiled regexp.  [I'll say.]
135  */
136 regexp *
137 regcomp(exp,xend,fold)
138 char *exp;
139 char *xend;
140 int fold;
141 {
142         register regexp *r;
143         register char *scan;
144         register STR *longish;
145         STR *longest;
146         register int len;
147         register char *first;
148         int flags;
149         int back;
150         int curback;
151         extern char *safemalloc();
152         extern char *savestr();
153         int sawplus = 0;
154
155         if (exp == NULL)
156                 fatal("NULL regexp argument");
157
158         /* First pass: determine size, legality. */
159         regfold = fold;
160         regparse = exp;
161         regxend = xend;
162         regprecomp = nsavestr(exp,xend-exp);
163         regsawbracket = 0;
164         regnpar = 1;
165         regsize = 0L;
166         regcode = &regdummy;
167         regc(MAGIC);
168         if (reg(0, &flags) == NULL) {
169                 Safefree(regprecomp);
170                 return(NULL);
171         }
172
173         /* Small enough for pointer-storage convention? */
174         if (regsize >= 32767L)          /* Probably could be 65535L. */
175                 FAIL("regexp too big");
176
177         /* Allocate space. */
178         Newc(1001, r, sizeof(regexp) + (unsigned)regsize, char, regexp);
179         if (r == NULL)
180                 FAIL("regexp out of space");
181
182         /* Second pass: emit code. */
183         if (regsawbracket)
184             bcopy(regprecomp,exp,xend-exp);
185         r->precomp = regprecomp;
186         r->subbase = NULL;
187         regparse = exp;
188         regnpar = 1;
189         regcode = r->program;
190         regc(MAGIC);
191         if (reg(0, &flags) == NULL)
192                 return(NULL);
193
194         /* Dig out information for optimizations. */
195         r->regstart = Nullstr;  /* Worst-case defaults. */
196         r->reganch = 0;
197         r->regmust = Nullstr;
198         r->regback = -1;
199         r->regstclass = Nullch;
200         scan = r->program+1;                    /* First BRANCH. */
201         if (OP(regnext(scan)) == END) {/* Only one top-level choice. */
202                 scan = NEXTOPER(scan);
203
204                 first = scan;
205                 while ((OP(first) > OPEN && OP(first) < CLOSE) ||
206                     (OP(first) == BRANCH && OP(regnext(first)) != BRANCH) ||
207                     (OP(first) == PLUS) ||
208                     (OP(first) == CURLY && ARG1(first) > 0) ) {
209                         if (OP(first) == CURLY)
210                             first += 4;
211                         else if (OP(first) == PLUS)
212                             sawplus = 2;
213                         first = NEXTOPER(first);
214                 }
215
216                 /* Starting-point info. */
217                 if (OP(first) == EXACTLY) {
218                         r->regstart =
219                             str_make(OPERAND(first)+1,*OPERAND(first));
220                         if (r->regstart->str_cur > !(sawstudy|fold))
221                                 fbmcompile(r->regstart,fold);
222                 }
223                 else if ((exp = index(simple,OP(first))) && exp > simple)
224                         r->regstclass = first;
225                 else if (OP(first) == BOUND || OP(first) == NBOUND)
226                         r->regstclass = first;
227                 else if (OP(first) == BOL ||
228                     (OP(first) == STAR && OP(NEXTOPER(first)) == ANY) )
229                         r->reganch = 1;         /* kinda turn .* into ^.* */
230                 r->reganch |= sawplus;
231
232 #ifdef DEBUGGING
233                 if (debug & 512)
234                     fprintf(stderr,"first %d next %d offset %d\n",
235                       OP(first), OP(NEXTOPER(first)), first - scan);
236 #endif
237                 /*
238                  * If there's something expensive in the r.e., find the
239                  * longest literal string that must appear and make it the
240                  * regmust.  Resolve ties in favor of later strings, since
241                  * the regstart check works with the beginning of the r.e.
242                  * and avoiding duplication strengthens checking.  Not a
243                  * strong reason, but sufficient in the absence of others.
244                  * [Now we resolve ties in favor of the earlier string if
245                  * it happens that curback has been invalidated, since the
246                  * earlier string may buy us something the later one won't.]
247                  */
248                 longish = str_make("",0);
249                 longest = str_make("",0);
250                 len = 0;
251                 curback = 0;
252                 back = 0;
253                 while (scan != NULL) {
254                         if (OP(scan) == BRANCH) {
255                             if (OP(regnext(scan)) == BRANCH) {
256                                 curback = -30000;
257                                 while (OP(scan) == BRANCH)
258                                     scan = regnext(scan);
259                             }
260                             else        /* single branch is ok */
261                                 scan = NEXTOPER(scan);
262                         }
263                         if (OP(scan) == EXACTLY) {
264                             first = scan;
265                             while (OP(regnext(scan)) >= CLOSE)
266                                 scan = regnext(scan);
267                             if (curback - back == len) {
268                                 str_ncat(longish, OPERAND(first)+1,
269                                     *OPERAND(first));
270                                 len += *OPERAND(first);
271                                 curback += *OPERAND(first);
272                                 first = regnext(scan);
273                             }
274                             else if (*OPERAND(first) >= len + (curback >= 0)) {
275                                 len = *OPERAND(first);
276                                 str_nset(longish, OPERAND(first)+1,len);
277                                 back = curback;
278                                 curback += len;
279                                 first = regnext(scan);
280                             }
281                             else
282                                 curback += *OPERAND(first);
283                         }
284                         else if (index(varies,OP(scan))) {
285                             curback = -30000;
286                             len = 0;
287                             if (longish->str_cur > longest->str_cur)
288                                 str_sset(longest,longish);
289                             str_nset(longish,"",0);
290                         }
291                         else if (index(simple,OP(scan)))
292                             curback++;
293                         scan = regnext(scan);
294                 }
295                 if (longish->str_cur > longest->str_cur)
296                     str_sset(longest,longish);
297                 str_free(longish);
298                 if (longest->str_cur) {
299                         r->regmust = longest;
300                         if (back < 0)
301                                 back = -1;
302                         r->regback = back;
303                         if (longest->str_cur
304                           > !(sawstudy || fold || OP(first) == EOL) )
305                                 fbmcompile(r->regmust,fold);
306                         r->regmust->str_u.str_useful = 100;
307                         if (OP(first) == EOL) /* is match anchored to EOL? */
308                             r->regmust->str_pok |= SP_TAIL;
309                 }
310                 else
311                         str_free(longest);
312         }
313
314         r->do_folding = fold;
315         r->nparens = regnpar - 1;
316 #ifdef DEBUGGING
317         if (debug & 512)
318                 regdump(r);
319 #endif
320         return(r);
321 }
322
323 /*
324  - reg - regular expression, i.e. main body or parenthesized thing
325  *
326  * Caller must absorb opening parenthesis.
327  *
328  * Combining parenthesis handling with the base level of regular expression
329  * is a trifle forced, but the need to tie the tails of the branches to what
330  * follows makes it hard to avoid.
331  */
332 static char *
333 reg(paren, flagp)
334 int paren;                      /* Parenthesized? */
335 int *flagp;
336 {
337         register char *ret;
338         register char *br;
339         register char *ender;
340         register int parno;
341         int flags;
342
343         *flagp = HASWIDTH;      /* Tentatively. */
344
345         /* Make an OPEN node, if parenthesized. */
346         if (paren) {
347                 if (regnpar >= NSUBEXP)
348                         FAIL("too many () in regexp");
349                 parno = regnpar;
350                 regnpar++;
351                 ret = regnode(OPEN+parno);
352         } else
353                 ret = NULL;
354
355         /* Pick up the branches, linking them together. */
356         br = regbranch(&flags);
357         if (br == NULL)
358                 return(NULL);
359         if (ret != NULL)
360                 regtail(ret, br);       /* OPEN -> first. */
361         else
362                 ret = br;
363         if (!(flags&HASWIDTH))
364                 *flagp &= ~HASWIDTH;
365         *flagp |= flags&SPSTART;
366         while (*regparse == '|') {
367                 regparse++;
368                 br = regbranch(&flags);
369                 if (br == NULL)
370                         return(NULL);
371                 regtail(ret, br);       /* BRANCH -> BRANCH. */
372                 if (!(flags&HASWIDTH))
373                         *flagp &= ~HASWIDTH;
374                 *flagp |= flags&SPSTART;
375         }
376
377         /* Make a closing node, and hook it on the end. */
378         ender = regnode((paren) ? CLOSE+parno : END);   
379         regtail(ret, ender);
380
381         /* Hook the tails of the branches to the closing node. */
382         for (br = ret; br != NULL; br = regnext(br))
383                 regoptail(br, ender);
384
385         /* Check for proper termination. */
386         if (paren && *regparse++ != ')') {
387                 FAIL("unmatched () in regexp");
388         } else if (!paren && regparse < regxend) {
389                 if (*regparse == ')') {
390                         FAIL("unmatched () in regexp");
391                 } else
392                         FAIL("junk on end of regexp");  /* "Can't happen". */
393                 /* NOTREACHED */
394         }
395
396         return(ret);
397 }
398
399 /*
400  - regbranch - one alternative of an | operator
401  *
402  * Implements the concatenation operator.
403  */
404 static char *
405 regbranch(flagp)
406 int *flagp;
407 {
408         register char *ret;
409         register char *chain;
410         register char *latest;
411         int flags;
412
413         *flagp = WORST;         /* Tentatively. */
414
415         ret = regnode(BRANCH);
416         chain = NULL;
417         while (regparse < regxend && *regparse != '|' && *regparse != ')') {
418                 latest = regpiece(&flags);
419                 if (latest == NULL)
420                         return(NULL);
421                 *flagp |= flags&HASWIDTH;
422                 if (chain == NULL)      /* First piece. */
423                         *flagp |= flags&SPSTART;
424                 else
425                         regtail(chain, latest);
426                 chain = latest;
427         }
428         if (chain == NULL)      /* Loop ran zero times. */
429                 (void) regnode(NOTHING);
430
431         return(ret);
432 }
433
434 /*
435  - regpiece - something followed by possible [*+?]
436  *
437  * Note that the branching code sequences used for ? and the general cases
438  * of * and + are somewhat optimized:  they use the same NOTHING node as
439  * both the endmarker for their branch list and the body of the last branch.
440  * It might seem that this node could be dispensed with entirely, but the
441  * endmarker role is not redundant.
442  */
443 static char *
444 regpiece(flagp)
445 int *flagp;
446 {
447         register char *ret;
448         register char op;
449         register char *next;
450         int flags;
451         char *origparse = regparse;
452         int orignpar = regnpar;
453         char *max;
454         int iter;
455         char ch;
456
457         ret = regatom(&flags);
458         if (ret == NULL)
459                 return(NULL);
460
461         op = *regparse;
462
463         /* Here's a total kludge: if after the atom there's a {\d+,?\d*}
464          * then we decrement the first number by one and reset our
465          * parsing back to the beginning of the same atom.  If the first number
466          * is down to 0, decrement the second number instead and fake up
467          * a ? after it.  Given the way this compiler doesn't keep track
468          * of offsets on the first pass, this is the only way to replicate
469          * a piece of code.  Sigh.
470          */
471         if (op == '{' && regcurly(regparse)) {
472             next = regparse + 1;
473             max = Nullch;
474             while (isdigit(*next) || *next == ',') {
475                 if (*next == ',') {
476                     if (max)
477                         break;
478                     else
479                         max = next;
480                 }
481                 next++;
482             }
483             if (*next == '}') {         /* got one */
484                 if (!max)
485                     max = next;
486                 regparse++;
487                 iter = atoi(regparse);
488                 if (flags&SIMPLE) {     /* we can do it right after all */
489                     int tmp;
490
491                     reginsert(CURLY, ret);
492                     if (*max == ',')
493                         max++;
494                     else
495                         max = regparse;
496                     tmp = atoi(max);
497                     if (tmp && tmp < iter)
498                         fatal("Can't do {n,m} with n > m");
499                     if (regcode != &regdummy) {
500 #ifdef REGALIGN
501                         *(unsigned short *)(ret+3) = iter;
502                         *(unsigned short *)(ret+5) = tmp;
503 #else
504                         ret[3] = iter >> 8; ret[4] = iter & 0377;
505                         ret[5] = tmp  >> 8; ret[6] = tmp  & 0377;
506 #endif
507                     }
508                     regparse = next;
509                     goto nest_check;
510                 }
511                 regsawbracket++;        /* remember we clobbered exp */
512                 if (iter > 0) {
513                     ch = *max;
514                     sprintf(regparse,"%.*d", max-regparse, iter - 1);
515                     *max = ch;
516                     if (*max == ',' && max[1] != '}') {
517                         if (atoi(max+1) <= 0)
518                             fatal("Can't do {n,m} with n > m");
519                         ch = *next;
520                         sprintf(max+1,"%.*d", next-(max+1), atoi(max+1) - 1);
521                         *next = ch;
522                     }
523                     if (iter != 1 || *max == ',') {
524                         regparse = origparse;   /* back up input pointer */
525                         regnpar = orignpar;     /* don't make more parens */
526                     }
527                     else {
528                         regparse = next;
529                         goto nest_check;
530                     }
531                     *flagp = flags;
532                     return ret;
533                 }
534                 if (*max == ',') {
535                     max++;
536                     iter = atoi(max);
537                     if (max == next) {          /* any number more? */
538                         regparse = next;
539                         op = '*';               /* fake up one with a star */
540                     }
541                     else if (iter > 0) {
542                         op = '?';               /* fake up optional atom */
543                         ch = *next;
544                         sprintf(max,"%.*d", next-max, iter - 1);
545                         *next = ch;
546                         if (iter == 1)
547                             regparse = next;
548                         else {
549                             regparse = origparse - 1; /* offset ++ below */
550                             regnpar = orignpar;
551                         }
552                     }
553                     else
554                         fatal("Can't do {n,0}");
555                 }
556                 else
557                     fatal("Can't do {0}");
558             }
559         }
560
561         if (!ISMULT1(op)) {
562                 *flagp = flags;
563                 return(ret);
564         }
565
566         if (!(flags&HASWIDTH) && op != '?')
567                 FAIL("regexp *+ operand could be empty");
568         *flagp = (op != '+') ? (WORST|SPSTART) : (WORST|HASWIDTH);
569
570         if (op == '*' && (flags&SIMPLE))
571                 reginsert(STAR, ret);
572         else if (op == '*') {
573                 /* Emit x* as (x&|), where & means "self". */
574                 reginsert(BRANCH, ret);                 /* Either x */
575                 regoptail(ret, regnode(BACK));          /* and loop */
576                 regoptail(ret, ret);                    /* back */
577                 regtail(ret, regnode(BRANCH));          /* or */
578                 regtail(ret, regnode(NOTHING));         /* null. */
579         } else if (op == '+' && (flags&SIMPLE))
580                 reginsert(PLUS, ret);
581         else if (op == '+') {
582                 /* Emit x+ as x(&|), where & means "self". */
583                 next = regnode(BRANCH);                 /* Either */
584                 regtail(ret, next);
585                 regtail(regnode(BACK), ret);            /* loop back */
586                 regtail(next, regnode(BRANCH));         /* or */
587                 regtail(ret, regnode(NOTHING));         /* null. */
588         } else if (op == '?') {
589                 /* Emit x? as (x|) */
590                 reginsert(BRANCH, ret);                 /* Either x */
591                 regtail(ret, regnode(BRANCH));          /* or */
592                 next = regnode(NOTHING);                /* null. */
593                 regtail(ret, next);
594                 regoptail(ret, next);
595         }
596       nest_check:
597         regparse++;
598         if (ISMULT2(regparse))
599                 FAIL("nested *?+ in regexp");
600
601         return(ret);
602 }
603
604 /*
605  - regatom - the lowest level
606  *
607  * Optimization:  gobbles an entire sequence of ordinary characters so that
608  * it can turn them into a single node, which is smaller to store and
609  * faster to run.  Backslashed characters are exceptions, each becoming a
610  * separate node; the code is simpler that way and it's not worth fixing.
611  *
612  * [Yes, it is worth fixing, some scripts can run twice the speed.]
613  */
614 static char *
615 regatom(flagp)
616 int *flagp;
617 {
618         register char *ret;
619         int flags;
620
621         *flagp = WORST;         /* Tentatively. */
622
623         switch (*regparse++) {
624         case '^':
625                 ret = regnode(BOL);
626                 break;
627         case '$':
628                 ret = regnode(EOL);
629                 break;
630         case '.':
631                 ret = regnode(ANY);
632                 *flagp |= HASWIDTH|SIMPLE;
633                 break;
634         case '[':
635                 ret = regclass();
636                 *flagp |= HASWIDTH|SIMPLE;
637                 break;
638         case '(':
639                 ret = reg(1, &flags);
640                 if (ret == NULL)
641                         return(NULL);
642                 *flagp |= flags&(HASWIDTH|SPSTART);
643                 break;
644         case '|':
645         case ')':
646                 FAIL("internal urp in regexp"); /* Supposed to be caught earlier. */
647                 break;
648         case '?':
649         case '+':
650         case '*':
651                 FAIL("?+* follows nothing in regexp");
652                 break;
653         case '\\':
654                 switch (*regparse) {
655                 case 'w':
656                         ret = regnode(ALNUM);
657                         *flagp |= HASWIDTH|SIMPLE;
658                         regparse++;
659                         break;
660                 case 'W':
661                         ret = regnode(NALNUM);
662                         *flagp |= HASWIDTH|SIMPLE;
663                         regparse++;
664                         break;
665                 case 'b':
666                         ret = regnode(BOUND);
667                         *flagp |= SIMPLE;
668                         regparse++;
669                         break;
670                 case 'B':
671                         ret = regnode(NBOUND);
672                         *flagp |= SIMPLE;
673                         regparse++;
674                         break;
675                 case 's':
676                         ret = regnode(SPACE);
677                         *flagp |= HASWIDTH|SIMPLE;
678                         regparse++;
679                         break;
680                 case 'S':
681                         ret = regnode(NSPACE);
682                         *flagp |= HASWIDTH|SIMPLE;
683                         regparse++;
684                         break;
685                 case 'd':
686                         ret = regnode(DIGIT);
687                         *flagp |= HASWIDTH|SIMPLE;
688                         regparse++;
689                         break;
690                 case 'D':
691                         ret = regnode(NDIGIT);
692                         *flagp |= HASWIDTH|SIMPLE;
693                         regparse++;
694                         break;
695                 case 'n':
696                 case 'r':
697                 case 't':
698                 case 'f':
699                         goto defchar;
700                 case '0': case '1': case '2': case '3': case '4':
701                 case '5': case '6': case '7': case '8': case '9':
702                         if (isdigit(regparse[1]) || *regparse == '0')
703                                 goto defchar;
704                         else {
705                                 ret = regnode(REF + *regparse++ - '0');
706                                 *flagp |= SIMPLE;
707                         }
708                         break;
709                 case '\0':
710                         if (regparse >= regxend)
711                             FAIL("trailing \\ in regexp");
712                         /* FALL THROUGH */
713                 default:
714                         goto defchar;
715                 }
716                 break;
717         default: {
718                         register int len;
719                         register char ender;
720                         register char *p;
721                         char *oldp;
722                         int foo;
723
724                     defchar:
725                         ret = regnode(EXACTLY);
726                         regc(0);                /* save spot for len */
727                         for (len=0, p=regparse-1;
728                           len < 127 && p < regxend;
729                           len++)
730                         {
731                             oldp = p;
732                             switch (*p) {
733                             case '^':
734                             case '$':
735                             case '.':
736                             case '[':
737                             case '(':
738                             case ')':
739                             case '|':
740                                 goto loopdone;
741                             case '\\':
742                                 switch (*++p) {
743                                 case 'w':
744                                 case 'W':
745                                 case 'b':
746                                 case 'B':
747                                 case 's':
748                                 case 'S':
749                                 case 'd':
750                                 case 'D':
751                                     --p;
752                                     goto loopdone;
753                                 case 'n':
754                                         ender = '\n';
755                                         p++;
756                                         break;
757                                 case 'r':
758                                         ender = '\r';
759                                         p++;
760                                         break;
761                                 case 't':
762                                         ender = '\t';
763                                         p++;
764                                         break;
765                                 case 'f':
766                                         ender = '\f';
767                                         p++;
768                                         break;
769                                 case '0': case '1': case '2': case '3':case '4':
770                                 case '5': case '6': case '7': case '8':case '9':
771                                     if (isdigit(p[1]) || *p == '0') {
772                                         foo = *p - '0';
773                                         if (isdigit(p[1]))
774                                             foo = (foo<<3) + *++p - '0';
775                                         if (isdigit(p[1]))
776                                             foo = (foo<<3) + *++p - '0';
777                                         ender = foo;
778                                         p++;
779                                     }
780                                     else {
781                                         --p;
782                                         goto loopdone;
783                                     }
784                                     break;
785                                 case '\0':
786                                     if (p >= regxend)
787                                         FAIL("trailing \\ in regexp");
788                                     /* FALL THROUGH */
789                                 default:
790                                     ender = *p++;
791                                     break;
792                                 }
793                                 break;
794                             default:
795                                 ender = *p++;
796                                 break;
797                             }
798                             if (regfold && isupper(ender))
799                                     ender = tolower(ender);
800                             if (ISMULT2(p)) { /* Back off on ?+*. */
801                                 if (len)
802                                     p = oldp;
803                                 else {
804                                     len++;
805                                     regc(ender);
806                                 }
807                                 break;
808                             }
809                             regc(ender);
810                         }
811                     loopdone:
812                         regparse = p;
813                         if (len <= 0)
814                                 FAIL("internal disaster in regexp");
815                         *flagp |= HASWIDTH;
816                         if (len == 1)
817                                 *flagp |= SIMPLE;
818                         if (regcode != &regdummy)
819                             *OPERAND(ret) = len;
820                         regc('\0');
821                 }
822                 break;
823         }
824
825         return(ret);
826 }
827
828 static void
829 regset(bits,def,c)
830 char *bits;
831 int def;
832 register int c;
833 {
834         if (regcode == &regdummy)
835             return;
836         c &= 255;
837         if (def)
838                 bits[c >> 3] &= ~(1 << (c & 7));
839         else
840                 bits[c >> 3] |=  (1 << (c & 7));
841 }
842
843 static char *
844 regclass()
845 {
846         register char *bits;
847         register int class;
848         register int lastclass;
849         register int range = 0;
850         register char *ret;
851         register int def;
852
853         ret = regnode(ANYOF);
854         if (*regparse == '^') { /* Complement of range. */
855                 regparse++;
856                 def = 0;
857         } else {
858                 def = 255;
859         }
860         bits = regcode;
861         for (class = 0; class < 32; class++)
862             regc(def);
863         if (*regparse == ']' || *regparse == '-')
864                 goto skipcond;          /* allow 1st char to be ] or - */
865         while (regparse < regxend && *regparse != ']') {
866               skipcond:
867                 class = UCHARAT(regparse++);
868                 if (class == '\\') {
869                         class = UCHARAT(regparse++);
870                         switch (class) {
871                         case 'w':
872                                 for (class = 'a'; class <= 'z'; class++)
873                                         regset(bits,def,class);
874                                 for (class = 'A'; class <= 'Z'; class++)
875                                         regset(bits,def,class);
876                                 for (class = '0'; class <= '9'; class++)
877                                         regset(bits,def,class);
878                                 regset(bits,def,'_');
879                                 lastclass = 1234;
880                                 continue;
881                         case 's':
882                                 regset(bits,def,' ');
883                                 regset(bits,def,'\t');
884                                 regset(bits,def,'\r');
885                                 regset(bits,def,'\f');
886                                 regset(bits,def,'\n');
887                                 lastclass = 1234;
888                                 continue;
889                         case 'd':
890                                 for (class = '0'; class <= '9'; class++)
891                                         regset(bits,def,class);
892                                 lastclass = 1234;
893                                 continue;
894                         case 'n':
895                                 class = '\n';
896                                 break;
897                         case 'r':
898                                 class = '\r';
899                                 break;
900                         case 't':
901                                 class = '\t';
902                                 break;
903                         case 'f':
904                                 class = '\f';
905                                 break;
906                         case 'b':
907                                 class = '\b';
908                                 break;
909                         case '0': case '1': case '2': case '3': case '4':
910                         case '5': case '6': case '7': case '8': case '9':
911                                 class -= '0';
912                                 if (isdigit(*regparse)) {
913                                         class <<= 3;
914                                         class += *regparse++ - '0';
915                                 }
916                                 if (isdigit(*regparse)) {
917                                         class <<= 3;
918                                         class += *regparse++ - '0';
919                                 }
920                                 break;
921                         }
922                 }
923                 if (range) {
924                         if (lastclass > class)
925                                 FAIL("invalid [] range in regexp");
926                         range = 0;
927                 }
928                 else {
929                         lastclass = class;
930                         if (*regparse == '-' && regparse+1 < regxend &&
931                             regparse[1] != ']') {
932                                 regparse++;
933                                 range = 1;
934                                 continue;       /* do it next time */
935                         }
936                 }
937                 for ( ; lastclass <= class; lastclass++) {
938                         regset(bits,def,lastclass);
939                         if (regfold && isupper(lastclass))
940                                 regset(bits,def,tolower(lastclass));
941                 }
942                 lastclass = class;
943         }
944         if (*regparse != ']')
945                 FAIL("unmatched [] in regexp");
946         regparse++;
947         return ret;
948 }
949
950 /*
951  - regnode - emit a node
952  */
953 static char *                   /* Location. */
954 regnode(op)
955 char op;
956 {
957         register char *ret;
958         register char *ptr;
959
960         ret = regcode;
961         if (ret == &regdummy) {
962 #ifdef REGALIGN
963                 if (!(regsize & 1))
964                         regsize++;
965 #endif
966                 regsize += 3;
967                 return(ret);
968         }
969
970 #ifdef REGALIGN
971 #ifndef lint
972         if (!((long)ret & 1))
973             *ret++ = 127;
974 #endif
975 #endif
976         ptr = ret;
977         *ptr++ = op;
978         *ptr++ = '\0';          /* Null "next" pointer. */
979         *ptr++ = '\0';
980         regcode = ptr;
981
982         return(ret);
983 }
984
985 /*
986  - regc - emit (if appropriate) a byte of code
987  */
988 static void
989 regc(b)
990 char b;
991 {
992         if (regcode != &regdummy)
993                 *regcode++ = b;
994         else
995                 regsize++;
996 }
997
998 /*
999  - reginsert - insert an operator in front of already-emitted operand
1000  *
1001  * Means relocating the operand.
1002  */
1003 static void
1004 reginsert(op, opnd)
1005 char op;
1006 char *opnd;
1007 {
1008         register char *src;
1009         register char *dst;
1010         register char *place;
1011         register offset = (op == CURLY ? 4 : 0);
1012
1013         if (regcode == &regdummy) {
1014 #ifdef REGALIGN
1015                 regsize += 4 + offset;
1016 #else
1017                 regsize += 3 + offset;
1018 #endif
1019                 return;
1020         }
1021
1022         src = regcode;
1023 #ifdef REGALIGN
1024         regcode += 4 + offset;
1025 #else
1026         regcode += 3 + offset;
1027 #endif
1028         dst = regcode;
1029         while (src > opnd)
1030                 *--dst = *--src;
1031
1032         place = opnd;           /* Op node, where operand used to be. */
1033         *place++ = op;
1034         *place++ = '\0';
1035         *place++ = '\0';
1036         while (offset-- > 0)
1037             *place++ = '\0';
1038 }
1039
1040 /*
1041  - regtail - set the next-pointer at the end of a node chain
1042  */
1043 static void
1044 regtail(p, val)
1045 char *p;
1046 char *val;
1047 {
1048         register char *scan;
1049         register char *temp;
1050         register int offset;
1051
1052         if (p == &regdummy)
1053                 return;
1054
1055         /* Find last node. */
1056         scan = p;
1057         for (;;) {
1058                 temp = regnext(scan);
1059                 if (temp == NULL)
1060                         break;
1061                 scan = temp;
1062         }
1063
1064 #ifdef REGALIGN
1065         offset = val - scan;
1066 #ifndef lint
1067         *(short*)(scan+1) = offset;
1068 #else
1069         offset = offset;
1070 #endif
1071 #else
1072         if (OP(scan) == BACK)
1073                 offset = scan - val;
1074         else
1075                 offset = val - scan;
1076         *(scan+1) = (offset>>8)&0377;
1077         *(scan+2) = offset&0377;
1078 #endif
1079 }
1080
1081 /*
1082  - regoptail - regtail on operand of first argument; nop if operandless
1083  */
1084 static void
1085 regoptail(p, val)
1086 char *p;
1087 char *val;
1088 {
1089         /* "Operandless" and "op != BRANCH" are synonymous in practice. */
1090         if (p == NULL || p == &regdummy || OP(p) != BRANCH)
1091                 return;
1092         regtail(NEXTOPER(p), val);
1093 }
1094
1095 /*
1096  - regcurly - a little FSA that accepts {\d+,?\d*}
1097  */
1098 STATIC int
1099 regcurly(s)
1100 register char *s;
1101 {
1102     if (*s++ != '{')
1103         return FALSE;
1104     if (!isdigit(*s))
1105         return FALSE;
1106     while (isdigit(*s))
1107         s++;
1108     if (*s == ',')
1109         s++;
1110     while (isdigit(*s))
1111         s++;
1112     if (*s != '}')
1113         return FALSE;
1114     return TRUE;
1115 }
1116
1117 #ifdef DEBUGGING
1118
1119 /*
1120  - regdump - dump a regexp onto stderr in vaguely comprehensible form
1121  */
1122 void
1123 regdump(r)
1124 regexp *r;
1125 {
1126         register char *s;
1127         register char op = EXACTLY;     /* Arbitrary non-END op. */
1128         register char *next;
1129         extern char *index();
1130
1131
1132         s = r->program + 1;
1133         while (op != END) {     /* While that wasn't END last time... */
1134 #ifdef REGALIGN
1135                 if (!((long)s & 1))
1136                         s++;
1137 #endif
1138                 op = OP(s);
1139                 fprintf(stderr,"%2d%s", s-r->program, regprop(s));      /* Where, what. */
1140                 if (op == CURLY)
1141                     s += 4;
1142                 next = regnext(s);
1143                 if (next == NULL)               /* Next ptr. */
1144                         fprintf(stderr,"(0)");
1145                 else 
1146                         fprintf(stderr,"(%d)", (s-r->program)+(next-s));
1147                 s += 3;
1148                 if (op == ANYOF) {
1149                         s += 32;
1150                 }
1151                 if (op == EXACTLY) {
1152                         /* Literal string, where present. */
1153                         s++;
1154                         while (*s != '\0') {
1155                                 (void)putchar(*s);
1156                                 s++;
1157                         }
1158                         s++;
1159                 }
1160                 (void)putchar('\n');
1161         }
1162
1163         /* Header fields of interest. */
1164         if (r->regstart)
1165                 fprintf(stderr,"start `%s' ", r->regstart->str_ptr);
1166         if (r->regstclass)
1167                 fprintf(stderr,"stclass `%s' ", regprop(r->regstclass));
1168         if (r->reganch & 1)
1169                 fprintf(stderr,"anchored ");
1170         if (r->reganch & 2)
1171                 fprintf(stderr,"plus ");
1172         if (r->regmust != NULL)
1173                 fprintf(stderr,"must have \"%s\" back %d ", r->regmust->str_ptr,
1174                   r->regback);
1175         fprintf(stderr,"\n");
1176 }
1177
1178 /*
1179  - regprop - printable representation of opcode
1180  */
1181 char *
1182 regprop(op)
1183 char *op;
1184 {
1185         register char *p;
1186
1187         (void) strcpy(buf, ":");
1188
1189         switch (OP(op)) {
1190         case BOL:
1191                 p = "BOL";
1192                 break;
1193         case EOL:
1194                 p = "EOL";
1195                 break;
1196         case ANY:
1197                 p = "ANY";
1198                 break;
1199         case ANYOF:
1200                 p = "ANYOF";
1201                 break;
1202         case BRANCH:
1203                 p = "BRANCH";
1204                 break;
1205         case EXACTLY:
1206                 p = "EXACTLY";
1207                 break;
1208         case NOTHING:
1209                 p = "NOTHING";
1210                 break;
1211         case BACK:
1212                 p = "BACK";
1213                 break;
1214         case END:
1215                 p = "END";
1216                 break;
1217         case ALNUM:
1218                 p = "ALNUM";
1219                 break;
1220         case NALNUM:
1221                 p = "NALNUM";
1222                 break;
1223         case BOUND:
1224                 p = "BOUND";
1225                 break;
1226         case NBOUND:
1227                 p = "NBOUND";
1228                 break;
1229         case SPACE:
1230                 p = "SPACE";
1231                 break;
1232         case NSPACE:
1233                 p = "NSPACE";
1234                 break;
1235         case DIGIT:
1236                 p = "DIGIT";
1237                 break;
1238         case NDIGIT:
1239                 p = "NDIGIT";
1240                 break;
1241         case CURLY:
1242                 (void)sprintf(buf+strlen(buf), "CURLY {%d,%d}",
1243                     ARG1(op),ARG2(op));
1244                 p = NULL;
1245                 break;
1246         case REF:
1247         case REF+1:
1248         case REF+2:
1249         case REF+3:
1250         case REF+4:
1251         case REF+5:
1252         case REF+6:
1253         case REF+7:
1254         case REF+8:
1255         case REF+9:
1256                 (void)sprintf(buf+strlen(buf), "REF%d", OP(op)-REF);
1257                 p = NULL;
1258                 break;
1259         case OPEN+1:
1260         case OPEN+2:
1261         case OPEN+3:
1262         case OPEN+4:
1263         case OPEN+5:
1264         case OPEN+6:
1265         case OPEN+7:
1266         case OPEN+8:
1267         case OPEN+9:
1268                 (void)sprintf(buf+strlen(buf), "OPEN%d", OP(op)-OPEN);
1269                 p = NULL;
1270                 break;
1271         case CLOSE+1:
1272         case CLOSE+2:
1273         case CLOSE+3:
1274         case CLOSE+4:
1275         case CLOSE+5:
1276         case CLOSE+6:
1277         case CLOSE+7:
1278         case CLOSE+8:
1279         case CLOSE+9:
1280                 (void)sprintf(buf+strlen(buf), "CLOSE%d", OP(op)-CLOSE);
1281                 p = NULL;
1282                 break;
1283         case STAR:
1284                 p = "STAR";
1285                 break;
1286         case PLUS:
1287                 p = "PLUS";
1288                 break;
1289         default:
1290                 FAIL("corrupted regexp opcode");
1291         }
1292         if (p != NULL)
1293                 (void) strcat(buf, p);
1294         return(buf);
1295 }
1296 #endif /* DEBUGGING */
1297
1298 regfree(r)
1299 struct regexp *r;
1300 {
1301         if (r->precomp)
1302                 Safefree(r->precomp);
1303         if (r->subbase)
1304                 Safefree(r->subbase);
1305         if (r->regmust)
1306                 str_free(r->regmust);
1307         if (r->regstart)
1308                 str_free(r->regstart);
1309         Safefree(r);
1310 }