This is a live mirror of the Perl 5 development currently hosted at https://github.com/perl/perl5
perl 4.0 patch 2: Patch 1 continued
[perl5.git] / regcomp.c
1 /* NOTE: this is derived from Henry Spencer's regexp code, and should not
2  * confused with the original package (see point 3 below).  Thanks, Henry!
3  */
4
5 /* Additional note: this code is very heavily munged from Henry's version
6  * in places.  In some spots I've traded clarity for efficiency, so don't
7  * blame Henry for some of the lack of readability.
8  */
9
10 /* $RCSfile: regcomp.c,v $$Revision: 4.0.1.1 $$Date: 91/04/12 09:04:45 $
11  *
12  * $Log:        regcomp.c,v $
13  * Revision 4.0.1.1  91/04/12  09:04:45  lwall
14  * patch1: random cleanup in cpp namespace
15  * 
16  * Revision 4.0  91/03/20  01:39:01  lwall
17  * 4.0 baseline.
18  * 
19  */
20
21 /*
22  * regcomp and regexec -- regsub and regerror are not used in perl
23  *
24  *      Copyright (c) 1986 by University of Toronto.
25  *      Written by Henry Spencer.  Not derived from licensed software.
26  *
27  *      Permission is granted to anyone to use this software for any
28  *      purpose on any computer system, and to redistribute it freely,
29  *      subject to the following restrictions:
30  *
31  *      1. The author is not responsible for the consequences of use of
32  *              this software, no matter how awful, even if they arise
33  *              from defects in it.
34  *
35  *      2. The origin of this software must not be misrepresented, either
36  *              by explicit claim or by omission.
37  *
38  *      3. Altered versions must be plainly marked as such, and must not
39  *              be misrepresented as being the original software.
40  *
41  *
42  ****    Alterations to Henry's code are...
43  ****
44  ****    Copyright (c) 1989, Larry Wall
45  ****
46  ****    You may distribute under the terms of the GNU General Public License
47  ****    as specified in the README file that comes with the perl 3.0 kit.
48  *
49  * Beware that some of this code is subtly aware of the way operator
50  * precedence is structured in regular expressions.  Serious changes in
51  * regular-expression syntax might require a total rethink.
52  */
53 #include "EXTERN.h"
54 #include "perl.h"
55 #include "INTERN.h"
56 #include "regcomp.h"
57
58 #ifdef MSDOS
59 # if defined(BUGGY_MSC6)
60  /* MSC 6.00A breaks on op/regexp.t test 85 unless we turn this off */
61  # pragma optimize("a",off)
62  /* But MSC 6.00A is happy with 'w', for aliases only across function calls*/
63  # pragma optimize("w",on )
64 # endif /* BUGGY_MSC6 */
65 #endif /* MSDOS */
66
67 #ifndef STATIC
68 #define STATIC  static
69 #endif
70
71 #define ISMULT1(c)      ((c) == '*' || (c) == '+' || (c) == '?')
72 #define ISMULT2(s)      ((*s) == '*' || (*s) == '+' || (*s) == '?' || \
73         ((*s) == '{' && regcurly(s)))
74 #define META    "^$.[()|?+*\\"
75
76 #ifdef SPSTART
77 #undef SPSTART          /* dratted cpp namespace... */
78 #endif
79 /*
80  * Flags to be passed up and down.
81  */
82 #define HASWIDTH        01      /* Known never to match null string. */
83 #define SIMPLE          02      /* Simple enough to be STAR/PLUS operand. */
84 #define SPSTART         04      /* Starts with * or +. */
85 #define WORST           0       /* Worst case. */
86
87 /*
88  * Global work variables for regcomp().
89  */
90 static char *regprecomp;                /* uncompiled string. */
91 static char *regparse;          /* Input-scan pointer. */
92 static char *regxend;           /* End of input for compile */
93 static int regnpar;             /* () count. */
94 static char *regcode;           /* Code-emit pointer; &regdummy = don't. */
95 static long regsize;            /* Code size. */
96 static int regfold;
97 static int regsawbracket;       /* Did we do {d,d} trick? */
98
99 /*
100  * Forward declarations for regcomp()'s friends.
101  */
102 STATIC int regcurly();
103 STATIC char *reg();
104 STATIC char *regbranch();
105 STATIC char *regpiece();
106 STATIC char *regatom();
107 STATIC char *regclass();
108 STATIC char *regnode();
109 STATIC char *reganode();
110 STATIC void regc();
111 STATIC void reginsert();
112 STATIC void regtail();
113 STATIC void regoptail();
114
115 /*
116  - regcomp - compile a regular expression into internal code
117  *
118  * We can't allocate space until we know how big the compiled form will be,
119  * but we can't compile it (and thus know how big it is) until we've got a
120  * place to put the code.  So we cheat:  we compile it twice, once with code
121  * generation turned off and size counting turned on, and once "for real".
122  * This also means that we don't allocate space until we are sure that the
123  * thing really will compile successfully, and we never have to move the
124  * code and thus invalidate pointers into it.  (Note that it has to be in
125  * one piece because free() must be able to free it all.) [NB: not true in perl]
126  *
127  * Beware that the optimization-preparation code in here knows about some
128  * of the structure of the compiled regexp.  [I'll say.]
129  */
130 regexp *
131 regcomp(exp,xend,fold)
132 char *exp;
133 char *xend;
134 int fold;
135 {
136         register regexp *r;
137         register char *scan;
138         register STR *longish;
139         STR *longest;
140         register int len;
141         register char *first;
142         int flags;
143         int backish;
144         int backest;
145         int curback;
146         extern char *safemalloc();
147         extern char *savestr();
148         int sawplus = 0;
149
150         if (exp == NULL)
151                 fatal("NULL regexp argument");
152
153         /* First pass: determine size, legality. */
154         regfold = fold;
155         regparse = exp;
156         regxend = xend;
157         regprecomp = nsavestr(exp,xend-exp);
158         regsawbracket = 0;
159         regnpar = 1;
160         regsize = 0L;
161         regcode = &regdummy;
162         regc(MAGIC);
163         if (reg(0, &flags) == NULL) {
164                 Safefree(regprecomp);
165                 regprecomp = Nullch;
166                 return(NULL);
167         }
168
169         /* Small enough for pointer-storage convention? */
170         if (regsize >= 32767L)          /* Probably could be 65535L. */
171                 FAIL("regexp too big");
172
173         /* Allocate space. */
174         Newc(1001, r, sizeof(regexp) + (unsigned)regsize, char, regexp);
175         if (r == NULL)
176                 FAIL("regexp out of space");
177
178         /* Second pass: emit code. */
179         if (regsawbracket)
180             bcopy(regprecomp,exp,xend-exp);
181         r->precomp = regprecomp;
182         r->subbase = NULL;
183         regparse = exp;
184         regnpar = 1;
185         regcode = r->program;
186         regc(MAGIC);
187         if (reg(0, &flags) == NULL)
188                 return(NULL);
189
190         /* Dig out information for optimizations. */
191         r->regstart = Nullstr;  /* Worst-case defaults. */
192         r->reganch = 0;
193         r->regmust = Nullstr;
194         r->regback = -1;
195         r->regstclass = Nullch;
196         scan = r->program+1;                    /* First BRANCH. */
197         if (OP(regnext(scan)) == END) {/* Only one top-level choice. */
198                 scan = NEXTOPER(scan);
199
200                 first = scan;
201                 while (OP(first) == OPEN ||
202                     (OP(first) == BRANCH && OP(regnext(first)) != BRANCH) ||
203                     (OP(first) == PLUS) ||
204                     (OP(first) == CURLY && ARG1(first) > 0) ) {
205                         if (OP(first) == PLUS)
206                             sawplus = 2;
207                         else
208                             first += regarglen[OP(first)];
209                         first = NEXTOPER(first);
210                 }
211
212                 /* Starting-point info. */
213                 if (OP(first) == EXACTLY) {
214                         r->regstart =
215                             str_make(OPERAND(first)+1,*OPERAND(first));
216                         if (r->regstart->str_cur > !(sawstudy|fold))
217                                 fbmcompile(r->regstart,fold);
218                 }
219                 else if ((exp = index(simple,OP(first))) && exp > simple)
220                         r->regstclass = first;
221                 else if (OP(first) == BOUND || OP(first) == NBOUND)
222                         r->regstclass = first;
223                 else if (OP(first) == BOL ||
224                     (OP(first) == STAR && OP(NEXTOPER(first)) == ANY) )
225                         r->reganch = 1;         /* kinda turn .* into ^.* */
226                 r->reganch |= sawplus;
227
228 #ifdef DEBUGGING
229                 if (debug & 512)
230                     fprintf(stderr,"first %d next %d offset %d\n",
231                       OP(first), OP(NEXTOPER(first)), first - scan);
232 #endif
233                 /*
234                  * If there's something expensive in the r.e., find the
235                  * longest literal string that must appear and make it the
236                  * regmust.  Resolve ties in favor of later strings, since
237                  * the regstart check works with the beginning of the r.e.
238                  * and avoiding duplication strengthens checking.  Not a
239                  * strong reason, but sufficient in the absence of others.
240                  * [Now we resolve ties in favor of the earlier string if
241                  * it happens that curback has been invalidated, since the
242                  * earlier string may buy us something the later one won't.]
243                  */
244                 longish = str_make("",0);
245                 longest = str_make("",0);
246                 len = 0;
247                 curback = 0;
248                 backish = 0;
249                 backest = 0;
250                 while (OP(scan) != END) {
251                         if (OP(scan) == BRANCH) {
252                             if (OP(regnext(scan)) == BRANCH) {
253                                 curback = -30000;
254                                 while (OP(scan) == BRANCH)
255                                     scan = regnext(scan);
256                             }
257                             else        /* single branch is ok */
258                                 scan = NEXTOPER(scan);
259                         }
260                         if (OP(scan) == EXACTLY) {
261                             char *t;
262
263                             first = scan;
264                             while (OP(t = regnext(scan)) == CLOSE)
265                                 scan = t;
266                             if (curback - backish == len) {
267                                 str_ncat(longish, OPERAND(first)+1,
268                                     *OPERAND(first));
269                                 len += *OPERAND(first);
270                                 curback += *OPERAND(first);
271                                 first = regnext(scan);
272                             }
273                             else if (*OPERAND(first) >= len + (curback >= 0)) {
274                                 len = *OPERAND(first);
275                                 str_nset(longish, OPERAND(first)+1,len);
276                                 backish = curback;
277                                 curback += len;
278                                 first = regnext(scan);
279                             }
280                             else
281                                 curback += *OPERAND(first);
282                         }
283                         else if (index(varies,OP(scan))) {
284                             curback = -30000;
285                             len = 0;
286                             if (longish->str_cur > longest->str_cur) {
287                                 str_sset(longest,longish);
288                                 backest = backish;
289                             }
290                             str_nset(longish,"",0);
291                         }
292                         else if (index(simple,OP(scan))) {
293                             curback++;
294                             len = 0;
295                             if (longish->str_cur > longest->str_cur) {
296                                 str_sset(longest,longish);
297                                 backest = backish;
298                             }
299                             str_nset(longish,"",0);
300                         }
301                         scan = regnext(scan);
302                 }
303
304                 /* Prefer earlier on tie, unless we can tail match latter */
305
306                 if (longish->str_cur + (OP(first) == EOL) > longest->str_cur) {
307                     str_sset(longest,longish);
308                     backest = backish;
309                 }
310                 else
311                     str_nset(longish,"",0);
312                 if (longest->str_cur
313                     &&
314                     (!r->regstart
315                      ||
316                      !fbminstr(r->regstart->str_ptr,
317                           r->regstart->str_ptr + r->regstart->str_cur,
318                           longest)
319                     )
320                    )
321                 {
322                         r->regmust = longest;
323                         if (backest < 0)
324                                 backest = -1;
325                         r->regback = backest;
326                         if (longest->str_cur
327                           > !(sawstudy || fold || OP(first) == EOL) )
328                                 fbmcompile(r->regmust,fold);
329                         r->regmust->str_u.str_useful = 100;
330                         if (OP(first) == EOL && longish->str_cur)
331                             r->regmust->str_pok |= SP_TAIL;
332                 }
333                 else {
334                         str_free(longest);
335                         longest = Nullstr;
336                 }
337                 str_free(longish);
338         }
339
340         r->do_folding = fold;
341         r->nparens = regnpar - 1;
342         New(1002, r->startp, regnpar, char*);
343         New(1002, r->endp, regnpar, char*);
344 #ifdef DEBUGGING
345         if (debug & 512)
346                 regdump(r);
347 #endif
348         return(r);
349 }
350
351 /*
352  - reg - regular expression, i.e. main body or parenthesized thing
353  *
354  * Caller must absorb opening parenthesis.
355  *
356  * Combining parenthesis handling with the base level of regular expression
357  * is a trifle forced, but the need to tie the tails of the branches to what
358  * follows makes it hard to avoid.
359  */
360 static char *
361 reg(paren, flagp)
362 int paren;                      /* Parenthesized? */
363 int *flagp;
364 {
365         register char *ret;
366         register char *br;
367         register char *ender;
368         register int parno;
369         int flags;
370
371         *flagp = HASWIDTH;      /* Tentatively. */
372
373         /* Make an OPEN node, if parenthesized. */
374         if (paren) {
375                 parno = regnpar;
376                 regnpar++;
377                 ret = reganode(OPEN, parno);
378         } else
379                 ret = NULL;
380
381         /* Pick up the branches, linking them together. */
382         br = regbranch(&flags);
383         if (br == NULL)
384                 return(NULL);
385         if (ret != NULL)
386                 regtail(ret, br);       /* OPEN -> first. */
387         else
388                 ret = br;
389         if (!(flags&HASWIDTH))
390                 *flagp &= ~HASWIDTH;
391         *flagp |= flags&SPSTART;
392         while (*regparse == '|') {
393                 regparse++;
394                 br = regbranch(&flags);
395                 if (br == NULL)
396                         return(NULL);
397                 regtail(ret, br);       /* BRANCH -> BRANCH. */
398                 if (!(flags&HASWIDTH))
399                         *flagp &= ~HASWIDTH;
400                 *flagp |= flags&SPSTART;
401         }
402
403         /* Make a closing node, and hook it on the end. */
404         if (paren)
405             ender = reganode(CLOSE, parno);
406         else
407             ender = regnode(END);
408         regtail(ret, ender);
409
410         /* Hook the tails of the branches to the closing node. */
411         for (br = ret; br != NULL; br = regnext(br))
412                 regoptail(br, ender);
413
414         /* Check for proper termination. */
415         if (paren && *regparse++ != ')') {
416                 FAIL("unmatched () in regexp");
417         } else if (!paren && regparse < regxend) {
418                 if (*regparse == ')') {
419                         FAIL("unmatched () in regexp");
420                 } else
421                         FAIL("junk on end of regexp");  /* "Can't happen". */
422                 /* NOTREACHED */
423         }
424
425         return(ret);
426 }
427
428 /*
429  - regbranch - one alternative of an | operator
430  *
431  * Implements the concatenation operator.
432  */
433 static char *
434 regbranch(flagp)
435 int *flagp;
436 {
437         register char *ret;
438         register char *chain;
439         register char *latest;
440         int flags;
441
442         *flagp = WORST;         /* Tentatively. */
443
444         ret = regnode(BRANCH);
445         chain = NULL;
446         while (regparse < regxend && *regparse != '|' && *regparse != ')') {
447                 latest = regpiece(&flags);
448                 if (latest == NULL)
449                         return(NULL);
450                 *flagp |= flags&HASWIDTH;
451                 if (chain == NULL)      /* First piece. */
452                         *flagp |= flags&SPSTART;
453                 else
454                         regtail(chain, latest);
455                 chain = latest;
456         }
457         if (chain == NULL)      /* Loop ran zero times. */
458                 (void) regnode(NOTHING);
459
460         return(ret);
461 }
462
463 /*
464  - regpiece - something followed by possible [*+?]
465  *
466  * Note that the branching code sequences used for ? and the general cases
467  * of * and + are somewhat optimized:  they use the same NOTHING node as
468  * both the endmarker for their branch list and the body of the last branch.
469  * It might seem that this node could be dispensed with entirely, but the
470  * endmarker role is not redundant.
471  */
472 static char *
473 regpiece(flagp)
474 int *flagp;
475 {
476         register char *ret;
477         register char op;
478         register char *next;
479         int flags;
480         char *origparse = regparse;
481         int orignpar = regnpar;
482         char *max;
483         int iter;
484         char ch;
485
486         ret = regatom(&flags);
487         if (ret == NULL)
488                 return(NULL);
489
490         op = *regparse;
491
492         /* Here's a total kludge: if after the atom there's a {\d+,?\d*}
493          * then we decrement the first number by one and reset our
494          * parsing back to the beginning of the same atom.  If the first number
495          * is down to 0, decrement the second number instead and fake up
496          * a ? after it.  Given the way this compiler doesn't keep track
497          * of offsets on the first pass, this is the only way to replicate
498          * a piece of code.  Sigh.
499          */
500         if (op == '{' && regcurly(regparse)) {
501             next = regparse + 1;
502             max = Nullch;
503             while (isdigit(*next) || *next == ',') {
504                 if (*next == ',') {
505                     if (max)
506                         break;
507                     else
508                         max = next;
509                 }
510                 next++;
511             }
512             if (*next == '}') {         /* got one */
513                 if (!max)
514                     max = next;
515                 regparse++;
516                 iter = atoi(regparse);
517                 if (flags&SIMPLE) {     /* we can do it right after all */
518                     int tmp;
519
520                     reginsert(CURLY, ret);
521                     if (iter > 0)
522                         *flagp = (WORST|HASWIDTH);
523                     if (*max == ',')
524                         max++;
525                     else
526                         max = regparse;
527                     tmp = atoi(max);
528                     if (tmp && tmp < iter)
529                         fatal("Can't do {n,m} with n > m");
530                     if (regcode != &regdummy) {
531 #ifdef REGALIGN
532                         *(unsigned short *)(ret+3) = iter;
533                         *(unsigned short *)(ret+5) = tmp;
534 #else
535                         ret[3] = iter >> 8; ret[4] = iter & 0377;
536                         ret[5] = tmp  >> 8; ret[6] = tmp  & 0377;
537 #endif
538                     }
539                     regparse = next;
540                     goto nest_check;
541                 }
542                 regsawbracket++;        /* remember we clobbered exp */
543                 if (iter > 0) {
544                     ch = *max;
545                     sprintf(regparse,"%.*d", max-regparse, iter - 1);
546                     *max = ch;
547                     if (*max == ',' && max[1] != '}') {
548                         if (atoi(max+1) <= 0)
549                             fatal("Can't do {n,m} with n > m");
550                         ch = *next;
551                         sprintf(max+1,"%.*d", next-(max+1), atoi(max+1) - 1);
552                         *next = ch;
553                     }
554                     if (iter != 1 || *max == ',') {
555                         regparse = origparse;   /* back up input pointer */
556                         regnpar = orignpar;     /* don't make more parens */
557                     }
558                     else {
559                         regparse = next;
560                         goto nest_check;
561                     }
562                     *flagp = flags;
563                     return ret;
564                 }
565                 if (*max == ',') {
566                     max++;
567                     iter = atoi(max);
568                     if (max == next) {          /* any number more? */
569                         regparse = next;
570                         op = '*';               /* fake up one with a star */
571                     }
572                     else if (iter > 0) {
573                         op = '?';               /* fake up optional atom */
574                         ch = *next;
575                         sprintf(max,"%.*d", next-max, iter - 1);
576                         *next = ch;
577                         if (iter == 1)
578                             regparse = next;
579                         else {
580                             regparse = origparse - 1; /* offset ++ below */
581                             regnpar = orignpar;
582                         }
583                     }
584                     else
585                         fatal("Can't do {n,0}");
586                 }
587                 else
588                     fatal("Can't do {0}");
589             }
590         }
591
592         if (!ISMULT1(op)) {
593                 *flagp = flags;
594                 return(ret);
595         }
596
597         if (!(flags&HASWIDTH) && op != '?')
598                 FAIL("regexp *+ operand could be empty");
599         *flagp = (op != '+') ? (WORST|SPSTART) : (WORST|HASWIDTH);
600
601         if (op == '*' && (flags&SIMPLE))
602                 reginsert(STAR, ret);
603         else if (op == '*') {
604                 /* Emit x* as (x&|), where & means "self". */
605                 reginsert(BRANCH, ret);                 /* Either x */
606                 regoptail(ret, regnode(BACK));          /* and loop */
607                 regoptail(ret, ret);                    /* back */
608                 regtail(ret, regnode(BRANCH));          /* or */
609                 regtail(ret, regnode(NOTHING));         /* null. */
610         } else if (op == '+' && (flags&SIMPLE))
611                 reginsert(PLUS, ret);
612         else if (op == '+') {
613                 /* Emit x+ as x(&|), where & means "self". */
614                 next = regnode(BRANCH);                 /* Either */
615                 regtail(ret, next);
616                 regtail(regnode(BACK), ret);            /* loop back */
617                 regtail(next, regnode(BRANCH));         /* or */
618                 regtail(ret, regnode(NOTHING));         /* null. */
619         } else if (op == '?') {
620                 /* Emit x? as (x|) */
621                 reginsert(BRANCH, ret);                 /* Either x */
622                 regtail(ret, regnode(BRANCH));          /* or */
623                 next = regnode(NOTHING);                /* null. */
624                 regtail(ret, next);
625                 regoptail(ret, next);
626         }
627       nest_check:
628         regparse++;
629         if (ISMULT2(regparse))
630                 FAIL("nested *?+ in regexp");
631
632         return(ret);
633 }
634
635 /*
636  - regatom - the lowest level
637  *
638  * Optimization:  gobbles an entire sequence of ordinary characters so that
639  * it can turn them into a single node, which is smaller to store and
640  * faster to run.  Backslashed characters are exceptions, each becoming a
641  * separate node; the code is simpler that way and it's not worth fixing.
642  *
643  * [Yes, it is worth fixing, some scripts can run twice the speed.]
644  */
645 static char *
646 regatom(flagp)
647 int *flagp;
648 {
649         register char *ret;
650         int flags;
651
652         *flagp = WORST;         /* Tentatively. */
653
654         switch (*regparse++) {
655         case '^':
656                 ret = regnode(BOL);
657                 break;
658         case '$':
659                 ret = regnode(EOL);
660                 break;
661         case '.':
662                 ret = regnode(ANY);
663                 *flagp |= HASWIDTH|SIMPLE;
664                 break;
665         case '[':
666                 ret = regclass();
667                 *flagp |= HASWIDTH|SIMPLE;
668                 break;
669         case '(':
670                 ret = reg(1, &flags);
671                 if (ret == NULL)
672                         return(NULL);
673                 *flagp |= flags&(HASWIDTH|SPSTART);
674                 break;
675         case '|':
676         case ')':
677                 FAIL("internal urp in regexp"); /* Supposed to be caught earlier. */
678                 break;
679         case '?':
680         case '+':
681         case '*':
682                 FAIL("?+* follows nothing in regexp");
683                 break;
684         case '\\':
685                 switch (*regparse) {
686                 case 'w':
687                         ret = regnode(ALNUM);
688                         *flagp |= HASWIDTH|SIMPLE;
689                         regparse++;
690                         break;
691                 case 'W':
692                         ret = regnode(NALNUM);
693                         *flagp |= HASWIDTH|SIMPLE;
694                         regparse++;
695                         break;
696                 case 'b':
697                         ret = regnode(BOUND);
698                         *flagp |= SIMPLE;
699                         regparse++;
700                         break;
701                 case 'B':
702                         ret = regnode(NBOUND);
703                         *flagp |= SIMPLE;
704                         regparse++;
705                         break;
706                 case 's':
707                         ret = regnode(SPACE);
708                         *flagp |= HASWIDTH|SIMPLE;
709                         regparse++;
710                         break;
711                 case 'S':
712                         ret = regnode(NSPACE);
713                         *flagp |= HASWIDTH|SIMPLE;
714                         regparse++;
715                         break;
716                 case 'd':
717                         ret = regnode(DIGIT);
718                         *flagp |= HASWIDTH|SIMPLE;
719                         regparse++;
720                         break;
721                 case 'D':
722                         ret = regnode(NDIGIT);
723                         *flagp |= HASWIDTH|SIMPLE;
724                         regparse++;
725                         break;
726                 case 'n':
727                 case 'r':
728                 case 't':
729                 case 'f':
730                 case 'e':
731                 case 'a':
732                 case 'x':
733                 case 'c':
734                 case '0':
735                         goto defchar;
736                 case '1': case '2': case '3': case '4':
737                 case '5': case '6': case '7': case '8': case '9':
738                         {
739                             int num = atoi(regparse);
740
741                             if (num > 9 && num >= regnpar)
742                                 goto defchar;
743                             else {
744                                 ret = reganode(REF, num);
745                                 while (isascii(*regparse) && isdigit(*regparse))
746                                     regparse++;
747                                 *flagp |= SIMPLE;
748                             }
749                         }
750                         break;
751                 case '\0':
752                         if (regparse >= regxend)
753                             FAIL("trailing \\ in regexp");
754                         /* FALL THROUGH */
755                 default:
756                         goto defchar;
757                 }
758                 break;
759         default: {
760                         register int len;
761                         register char ender;
762                         register char *p;
763                         char *oldp;
764                         int numlen;
765
766                     defchar:
767                         ret = regnode(EXACTLY);
768                         regc(0);                /* save spot for len */
769                         for (len=0, p=regparse-1;
770                           len < 127 && p < regxend;
771                           len++)
772                         {
773                             oldp = p;
774                             switch (*p) {
775                             case '^':
776                             case '$':
777                             case '.':
778                             case '[':
779                             case '(':
780                             case ')':
781                             case '|':
782                                 goto loopdone;
783                             case '\\':
784                                 switch (*++p) {
785                                 case 'w':
786                                 case 'W':
787                                 case 'b':
788                                 case 'B':
789                                 case 's':
790                                 case 'S':
791                                 case 'd':
792                                 case 'D':
793                                     --p;
794                                     goto loopdone;
795                                 case 'n':
796                                         ender = '\n';
797                                         p++;
798                                         break;
799                                 case 'r':
800                                         ender = '\r';
801                                         p++;
802                                         break;
803                                 case 't':
804                                         ender = '\t';
805                                         p++;
806                                         break;
807                                 case 'f':
808                                         ender = '\f';
809                                         p++;
810                                         break;
811                                 case 'e':
812                                         ender = '\033';
813                                         p++;
814                                         break;
815                                 case 'a':
816                                         ender = '\007';
817                                         p++;
818                                         break;
819                                 case 'x':
820                                     ender = scanhex(++p, 2, &numlen);
821                                     p += numlen;
822                                     break;
823                                 case 'c':
824                                     p++;
825                                     ender = *p++;
826                                     if (islower(ender))
827                                         ender = toupper(ender);
828                                     ender ^= 64;
829                                     break;
830                                 case '0': case '1': case '2': case '3':case '4':
831                                 case '5': case '6': case '7': case '8':case '9':
832                                     if (*p == '0' ||
833                                       (isdigit(p[1]) && atoi(p) >= regnpar) ) {
834                                         ender = scanoct(p, 3, &numlen);
835                                         p += numlen;
836                                     }
837                                     else {
838                                         --p;
839                                         goto loopdone;
840                                     }
841                                     break;
842                                 case '\0':
843                                     if (p >= regxend)
844                                         FAIL("trailing \\ in regexp");
845                                     /* FALL THROUGH */
846                                 default:
847                                     ender = *p++;
848                                     break;
849                                 }
850                                 break;
851                             default:
852                                 ender = *p++;
853                                 break;
854                             }
855                             if (regfold && isupper(ender))
856                                     ender = tolower(ender);
857                             if (ISMULT2(p)) { /* Back off on ?+*. */
858                                 if (len)
859                                     p = oldp;
860                                 else {
861                                     len++;
862                                     regc(ender);
863                                 }
864                                 break;
865                             }
866                             regc(ender);
867                         }
868                     loopdone:
869                         regparse = p;
870                         if (len <= 0)
871                                 FAIL("internal disaster in regexp");
872                         *flagp |= HASWIDTH;
873                         if (len == 1)
874                                 *flagp |= SIMPLE;
875                         if (regcode != &regdummy)
876                             *OPERAND(ret) = len;
877                         regc('\0');
878                 }
879                 break;
880         }
881
882         return(ret);
883 }
884
885 static void
886 regset(bits,def,c)
887 char *bits;
888 int def;
889 register int c;
890 {
891         if (regcode == &regdummy)
892             return;
893         c &= 255;
894         if (def)
895                 bits[c >> 3] &= ~(1 << (c & 7));
896         else
897                 bits[c >> 3] |=  (1 << (c & 7));
898 }
899
900 static char *
901 regclass()
902 {
903         register char *bits;
904         register int class;
905         register int lastclass;
906         register int range = 0;
907         register char *ret;
908         register int def;
909         int numlen;
910
911         ret = regnode(ANYOF);
912         if (*regparse == '^') { /* Complement of range. */
913                 regparse++;
914                 def = 0;
915         } else {
916                 def = 255;
917         }
918         bits = regcode;
919         for (class = 0; class < 32; class++)
920             regc(def);
921         if (*regparse == ']' || *regparse == '-')
922                 goto skipcond;          /* allow 1st char to be ] or - */
923         while (regparse < regxend && *regparse != ']') {
924               skipcond:
925                 class = UCHARAT(regparse++);
926                 if (class == '\\') {
927                         class = UCHARAT(regparse++);
928                         switch (class) {
929                         case 'w':
930                                 for (class = 'a'; class <= 'z'; class++)
931                                         regset(bits,def,class);
932                                 for (class = 'A'; class <= 'Z'; class++)
933                                         regset(bits,def,class);
934                                 for (class = '0'; class <= '9'; class++)
935                                         regset(bits,def,class);
936                                 regset(bits,def,'_');
937                                 lastclass = 1234;
938                                 continue;
939                         case 's':
940                                 regset(bits,def,' ');
941                                 regset(bits,def,'\t');
942                                 regset(bits,def,'\r');
943                                 regset(bits,def,'\f');
944                                 regset(bits,def,'\n');
945                                 lastclass = 1234;
946                                 continue;
947                         case 'd':
948                                 for (class = '0'; class <= '9'; class++)
949                                         regset(bits,def,class);
950                                 lastclass = 1234;
951                                 continue;
952                         case 'n':
953                                 class = '\n';
954                                 break;
955                         case 'r':
956                                 class = '\r';
957                                 break;
958                         case 't':
959                                 class = '\t';
960                                 break;
961                         case 'f':
962                                 class = '\f';
963                                 break;
964                         case 'b':
965                                 class = '\b';
966                                 break;
967                         case 'e':
968                                 class = '\033';
969                                 break;
970                         case 'a':
971                                 class = '\007';
972                                 break;
973                         case 'x':
974                                 class = scanhex(regparse, 2, &numlen);
975                                 regparse += numlen;
976                                 break;
977                         case 'c':
978                                 class = *regparse++;
979                                 if (islower(class))
980                                     class = toupper(class);
981                                 class ^= 64;
982                                 break;
983                         case '0': case '1': case '2': case '3': case '4':
984                         case '5': case '6': case '7': case '8': case '9':
985                                 class = scanoct(--regparse, 3, &numlen);
986                                 regparse += numlen;
987                                 break;
988                         }
989                 }
990                 if (range) {
991                         if (lastclass > class)
992                                 FAIL("invalid [] range in regexp");
993                         range = 0;
994                 }
995                 else {
996                         lastclass = class;
997                         if (*regparse == '-' && regparse+1 < regxend &&
998                             regparse[1] != ']') {
999                                 regparse++;
1000                                 range = 1;
1001                                 continue;       /* do it next time */
1002                         }
1003                 }
1004                 for ( ; lastclass <= class; lastclass++) {
1005                         regset(bits,def,lastclass);
1006                         if (regfold && isupper(lastclass))
1007                                 regset(bits,def,tolower(lastclass));
1008                 }
1009                 lastclass = class;
1010         }
1011         if (*regparse != ']')
1012                 FAIL("unmatched [] in regexp");
1013         regparse++;
1014         return ret;
1015 }
1016
1017 /*
1018  - regnode - emit a node
1019  */
1020 static char *                   /* Location. */
1021 regnode(op)
1022 char op;
1023 {
1024         register char *ret;
1025         register char *ptr;
1026
1027         ret = regcode;
1028         if (ret == &regdummy) {
1029 #ifdef REGALIGN
1030                 if (!(regsize & 1))
1031                         regsize++;
1032 #endif
1033                 regsize += 3;
1034                 return(ret);
1035         }
1036
1037 #ifdef REGALIGN
1038 #ifndef lint
1039         if (!((long)ret & 1))
1040             *ret++ = 127;
1041 #endif
1042 #endif
1043         ptr = ret;
1044         *ptr++ = op;
1045         *ptr++ = '\0';          /* Null "next" pointer. */
1046         *ptr++ = '\0';
1047         regcode = ptr;
1048
1049         return(ret);
1050 }
1051
1052 /*
1053  - reganode - emit a node with an argument
1054  */
1055 static char *                   /* Location. */
1056 reganode(op, arg)
1057 char op;
1058 unsigned short arg;
1059 {
1060         register char *ret;
1061         register char *ptr;
1062
1063         ret = regcode;
1064         if (ret == &regdummy) {
1065 #ifdef REGALIGN
1066                 if (!(regsize & 1))
1067                         regsize++;
1068 #endif
1069                 regsize += 5;
1070                 return(ret);
1071         }
1072
1073 #ifdef REGALIGN
1074 #ifndef lint
1075         if (!((long)ret & 1))
1076             *ret++ = 127;
1077 #endif
1078 #endif
1079         ptr = ret;
1080         *ptr++ = op;
1081         *ptr++ = '\0';          /* Null "next" pointer. */
1082         *ptr++ = '\0';
1083 #ifdef REGALIGN
1084         *(unsigned short *)(ret+3) = arg;
1085 #else
1086         ret[3] = arg >> 8; ret[4] = arg & 0377;
1087 #endif
1088         ptr += 2;
1089         regcode = ptr;
1090
1091         return(ret);
1092 }
1093
1094 /*
1095  - regc - emit (if appropriate) a byte of code
1096  */
1097 static void
1098 regc(b)
1099 char b;
1100 {
1101         if (regcode != &regdummy)
1102                 *regcode++ = b;
1103         else
1104                 regsize++;
1105 }
1106
1107 /*
1108  - reginsert - insert an operator in front of already-emitted operand
1109  *
1110  * Means relocating the operand.
1111  */
1112 static void
1113 reginsert(op, opnd)
1114 char op;
1115 char *opnd;
1116 {
1117         register char *src;
1118         register char *dst;
1119         register char *place;
1120         register offset = (op == CURLY ? 4 : 0);
1121
1122         if (regcode == &regdummy) {
1123 #ifdef REGALIGN
1124                 regsize += 4 + offset;
1125 #else
1126                 regsize += 3 + offset;
1127 #endif
1128                 return;
1129         }
1130
1131         src = regcode;
1132 #ifdef REGALIGN
1133         regcode += 4 + offset;
1134 #else
1135         regcode += 3 + offset;
1136 #endif
1137         dst = regcode;
1138         while (src > opnd)
1139                 *--dst = *--src;
1140
1141         place = opnd;           /* Op node, where operand used to be. */
1142         *place++ = op;
1143         *place++ = '\0';
1144         *place++ = '\0';
1145         while (offset-- > 0)
1146             *place++ = '\0';
1147 }
1148
1149 /*
1150  - regtail - set the next-pointer at the end of a node chain
1151  */
1152 static void
1153 regtail(p, val)
1154 char *p;
1155 char *val;
1156 {
1157         register char *scan;
1158         register char *temp;
1159         register int offset;
1160
1161         if (p == &regdummy)
1162                 return;
1163
1164         /* Find last node. */
1165         scan = p;
1166         for (;;) {
1167                 temp = regnext(scan);
1168                 if (temp == NULL)
1169                         break;
1170                 scan = temp;
1171         }
1172
1173 #ifdef REGALIGN
1174         offset = val - scan;
1175 #ifndef lint
1176         *(short*)(scan+1) = offset;
1177 #else
1178         offset = offset;
1179 #endif
1180 #else
1181         if (OP(scan) == BACK)
1182                 offset = scan - val;
1183         else
1184                 offset = val - scan;
1185         *(scan+1) = (offset>>8)&0377;
1186         *(scan+2) = offset&0377;
1187 #endif
1188 }
1189
1190 /*
1191  - regoptail - regtail on operand of first argument; nop if operandless
1192  */
1193 static void
1194 regoptail(p, val)
1195 char *p;
1196 char *val;
1197 {
1198         /* "Operandless" and "op != BRANCH" are synonymous in practice. */
1199         if (p == NULL || p == &regdummy || OP(p) != BRANCH)
1200                 return;
1201         regtail(NEXTOPER(p), val);
1202 }
1203
1204 /*
1205  - regcurly - a little FSA that accepts {\d+,?\d*}
1206  */
1207 STATIC int
1208 regcurly(s)
1209 register char *s;
1210 {
1211     if (*s++ != '{')
1212         return FALSE;
1213     if (!isdigit(*s))
1214         return FALSE;
1215     while (isdigit(*s))
1216         s++;
1217     if (*s == ',')
1218         s++;
1219     while (isdigit(*s))
1220         s++;
1221     if (*s != '}')
1222         return FALSE;
1223     return TRUE;
1224 }
1225
1226 #ifdef DEBUGGING
1227
1228 /*
1229  - regdump - dump a regexp onto stderr in vaguely comprehensible form
1230  */
1231 void
1232 regdump(r)
1233 regexp *r;
1234 {
1235         register char *s;
1236         register char op = EXACTLY;     /* Arbitrary non-END op. */
1237         register char *next;
1238
1239
1240         s = r->program + 1;
1241         while (op != END) {     /* While that wasn't END last time... */
1242 #ifdef REGALIGN
1243                 if (!((long)s & 1))
1244                         s++;
1245 #endif
1246                 op = OP(s);
1247                 fprintf(stderr,"%2d%s", s-r->program, regprop(s));      /* Where, what. */
1248                 next = regnext(s);
1249                 s += regarglen[op];
1250                 if (next == NULL)               /* Next ptr. */
1251                         fprintf(stderr,"(0)");
1252                 else 
1253                         fprintf(stderr,"(%d)", (s-r->program)+(next-s));
1254                 s += 3;
1255                 if (op == ANYOF) {
1256                         s += 32;
1257                 }
1258                 if (op == EXACTLY) {
1259                         /* Literal string, where present. */
1260                         s++;
1261                         while (*s != '\0') {
1262                                 (void)putchar(*s);
1263                                 s++;
1264                         }
1265                         s++;
1266                 }
1267                 (void)putchar('\n');
1268         }
1269
1270         /* Header fields of interest. */
1271         if (r->regstart)
1272                 fprintf(stderr,"start `%s' ", r->regstart->str_ptr);
1273         if (r->regstclass)
1274                 fprintf(stderr,"stclass `%s' ", regprop(r->regstclass));
1275         if (r->reganch & 1)
1276                 fprintf(stderr,"anchored ");
1277         if (r->reganch & 2)
1278                 fprintf(stderr,"plus ");
1279         if (r->regmust != NULL)
1280                 fprintf(stderr,"must have \"%s\" back %d ", r->regmust->str_ptr,
1281                   r->regback);
1282         fprintf(stderr,"\n");
1283 }
1284
1285 /*
1286  - regprop - printable representation of opcode
1287  */
1288 char *
1289 regprop(op)
1290 char *op;
1291 {
1292         register char *p;
1293
1294         (void) strcpy(buf, ":");
1295
1296         switch (OP(op)) {
1297         case BOL:
1298                 p = "BOL";
1299                 break;
1300         case EOL:
1301                 p = "EOL";
1302                 break;
1303         case ANY:
1304                 p = "ANY";
1305                 break;
1306         case ANYOF:
1307                 p = "ANYOF";
1308                 break;
1309         case BRANCH:
1310                 p = "BRANCH";
1311                 break;
1312         case EXACTLY:
1313                 p = "EXACTLY";
1314                 break;
1315         case NOTHING:
1316                 p = "NOTHING";
1317                 break;
1318         case BACK:
1319                 p = "BACK";
1320                 break;
1321         case END:
1322                 p = "END";
1323                 break;
1324         case ALNUM:
1325                 p = "ALNUM";
1326                 break;
1327         case NALNUM:
1328                 p = "NALNUM";
1329                 break;
1330         case BOUND:
1331                 p = "BOUND";
1332                 break;
1333         case NBOUND:
1334                 p = "NBOUND";
1335                 break;
1336         case SPACE:
1337                 p = "SPACE";
1338                 break;
1339         case NSPACE:
1340                 p = "NSPACE";
1341                 break;
1342         case DIGIT:
1343                 p = "DIGIT";
1344                 break;
1345         case NDIGIT:
1346                 p = "NDIGIT";
1347                 break;
1348         case CURLY:
1349                 (void)sprintf(buf+strlen(buf), "CURLY {%d,%d}",
1350                     ARG1(op),ARG2(op));
1351                 p = NULL;
1352                 break;
1353         case REF:
1354                 (void)sprintf(buf+strlen(buf), "REF%d", ARG1(op));
1355                 p = NULL;
1356                 break;
1357         case OPEN:
1358                 (void)sprintf(buf+strlen(buf), "OPEN%d", ARG1(op));
1359                 p = NULL;
1360                 break;
1361         case CLOSE:
1362                 (void)sprintf(buf+strlen(buf), "CLOSE%d", ARG1(op));
1363                 p = NULL;
1364                 break;
1365         case STAR:
1366                 p = "STAR";
1367                 break;
1368         case PLUS:
1369                 p = "PLUS";
1370                 break;
1371         default:
1372                 FAIL("corrupted regexp opcode");
1373         }
1374         if (p != NULL)
1375                 (void) strcat(buf, p);
1376         return(buf);
1377 }
1378 #endif /* DEBUGGING */
1379
1380 regfree(r)
1381 struct regexp *r;
1382 {
1383         if (r->precomp) {
1384                 Safefree(r->precomp);
1385                 r->precomp = Nullch;
1386         }
1387         if (r->subbase) {
1388                 Safefree(r->subbase);
1389                 r->subbase = Nullch;
1390         }
1391         if (r->regmust) {
1392                 str_free(r->regmust);
1393                 r->regmust = Nullstr;
1394         }
1395         if (r->regstart) {
1396                 str_free(r->regstart);
1397                 r->regstart = Nullstr;
1398         }
1399         Safefree(r->startp);
1400         Safefree(r->endp);
1401         Safefree(r);
1402 }