perl 4.0 patch 29: patch #20, continued
[perl.git] / regcomp.c
1 /* NOTE: this is derived from Henry Spencer's regexp code, and should not
2  * confused with the original package (see point 3 below).  Thanks, Henry!
3  */
4
5 /* Additional note: this code is very heavily munged from Henry's version
6  * in places.  In some spots I've traded clarity for efficiency, so don't
7  * blame Henry for some of the lack of readability.
8  */
9
10 /* $RCSfile: regcomp.c,v $$Revision: 4.0.1.4 $$Date: 91/11/05 22:55:14 $
11  *
12  * $Log:        regcomp.c,v $
13  * Revision 4.0.1.4  91/11/05  22:55:14  lwall
14  * patch11: Erratum
15  * 
16  * Revision 4.0.1.3  91/11/05  18:22:28  lwall
17  * patch11: minimum match length calculation in regexp is now cumulative
18  * patch11: initial .* in pattern had dependency on value of $*
19  * patch11: certain patterns made use of garbage pointers from uncleared memory
20  * patch11: prepared for ctype implementations that don't define isascii()
21  * 
22  * Revision 4.0.1.2  91/06/07  11:48:24  lwall
23  * patch4: new copyright notice
24  * patch4: /(x+) \1/ incorrectly optimized to not match "xxx xx"
25  * patch4: // wouldn't use previous pattern if it started with a null character
26  * 
27  * Revision 4.0.1.1  91/04/12  09:04:45  lwall
28  * patch1: random cleanup in cpp namespace
29  * 
30  * Revision 4.0  91/03/20  01:39:01  lwall
31  * 4.0 baseline.
32  * 
33  */
34 /*SUPPRESS 112*/
35 /*
36  * regcomp and regexec -- regsub and regerror are not used in perl
37  *
38  *      Copyright (c) 1986 by University of Toronto.
39  *      Written by Henry Spencer.  Not derived from licensed software.
40  *
41  *      Permission is granted to anyone to use this software for any
42  *      purpose on any computer system, and to redistribute it freely,
43  *      subject to the following restrictions:
44  *
45  *      1. The author is not responsible for the consequences of use of
46  *              this software, no matter how awful, even if they arise
47  *              from defects in it.
48  *
49  *      2. The origin of this software must not be misrepresented, either
50  *              by explicit claim or by omission.
51  *
52  *      3. Altered versions must be plainly marked as such, and must not
53  *              be misrepresented as being the original software.
54  *
55  *
56  ****    Alterations to Henry's code are...
57  ****
58  ****    Copyright (c) 1991, Larry Wall
59  ****
60  ****    You may distribute under the terms of either the GNU General Public
61  ****    License or the Artistic License, as specified in the README file.
62
63  *
64  * Beware that some of this code is subtly aware of the way operator
65  * precedence is structured in regular expressions.  Serious changes in
66  * regular-expression syntax might require a total rethink.
67  */
68 #include "EXTERN.h"
69 #include "perl.h"
70 #include "INTERN.h"
71 #include "regcomp.h"
72
73 #ifdef MSDOS
74 # if defined(BUGGY_MSC6)
75  /* MSC 6.00A breaks on op/regexp.t test 85 unless we turn this off */
76  # pragma optimize("a",off)
77  /* But MSC 6.00A is happy with 'w', for aliases only across function calls*/
78  # pragma optimize("w",on )
79 # endif /* BUGGY_MSC6 */
80 #endif /* MSDOS */
81
82 #ifndef STATIC
83 #define STATIC  static
84 #endif
85
86 #define ISMULT1(c)      ((c) == '*' || (c) == '+' || (c) == '?')
87 #define ISMULT2(s)      ((*s) == '*' || (*s) == '+' || (*s) == '?' || \
88         ((*s) == '{' && regcurly(s)))
89 #define META    "^$.[()|?+*\\"
90
91 #ifdef SPSTART
92 #undef SPSTART          /* dratted cpp namespace... */
93 #endif
94 /*
95  * Flags to be passed up and down.
96  */
97 #define HASWIDTH        01      /* Known never to match null string. */
98 #define SIMPLE          02      /* Simple enough to be STAR/PLUS operand. */
99 #define SPSTART         04      /* Starts with * or +. */
100 #define WORST           0       /* Worst case. */
101
102 /*
103  * Global work variables for regcomp().
104  */
105 static char *regprecomp;                /* uncompiled string. */
106 static char *regparse;          /* Input-scan pointer. */
107 static char *regxend;           /* End of input for compile */
108 static int regnpar;             /* () count. */
109 static char *regcode;           /* Code-emit pointer; &regdummy = don't. */
110 static long regsize;            /* Code size. */
111 static int regfold;
112 static int regsawbracket;       /* Did we do {d,d} trick? */
113 static int regsawback;          /* Did we see \1, ...? */
114
115 /*
116  * Forward declarations for regcomp()'s friends.
117  */
118 STATIC int regcurly();
119 STATIC char *reg();
120 STATIC char *regbranch();
121 STATIC char *regpiece();
122 STATIC char *regatom();
123 STATIC char *regclass();
124 STATIC char *regnode();
125 STATIC char *reganode();
126 STATIC void regc();
127 STATIC void reginsert();
128 STATIC void regtail();
129 STATIC void regoptail();
130
131 /*
132  - regcomp - compile a regular expression into internal code
133  *
134  * We can't allocate space until we know how big the compiled form will be,
135  * but we can't compile it (and thus know how big it is) until we've got a
136  * place to put the code.  So we cheat:  we compile it twice, once with code
137  * generation turned off and size counting turned on, and once "for real".
138  * This also means that we don't allocate space until we are sure that the
139  * thing really will compile successfully, and we never have to move the
140  * code and thus invalidate pointers into it.  (Note that it has to be in
141  * one piece because free() must be able to free it all.) [NB: not true in perl]
142  *
143  * Beware that the optimization-preparation code in here knows about some
144  * of the structure of the compiled regexp.  [I'll say.]
145  */
146 regexp *
147 regcomp(exp,xend,fold)
148 char *exp;
149 char *xend;
150 int fold;
151 {
152         register regexp *r;
153         register char *scan;
154         register STR *longish;
155         STR *longest;
156         register int len;
157         register char *first;
158         int flags;
159         int backish;
160         int backest;
161         int curback;
162         int minlen;
163 #ifndef safemalloc
164         extern char *safemalloc();
165 #endif
166         extern char *savestr();
167         int sawplus = 0;
168         int sawopen = 0;
169
170         if (exp == NULL)
171                 fatal("NULL regexp argument");
172
173         /* First pass: determine size, legality. */
174         regfold = fold;
175         regparse = exp;
176         regxend = xend;
177         regprecomp = nsavestr(exp,xend-exp);
178         regsawbracket = 0;
179         regsawback = 0;
180         regnpar = 1;
181         regsize = 0L;
182         regcode = &regdummy;
183         regc((char)MAGIC);
184         if (reg(0, &flags) == NULL) {
185                 Safefree(regprecomp);
186                 regprecomp = Nullch;
187                 return(NULL);
188         }
189
190         /* Small enough for pointer-storage convention? */
191         if (regsize >= 32767L)          /* Probably could be 65535L. */
192                 FAIL("regexp too big");
193
194         /* Allocate space. */
195         Newc(1001, r, sizeof(regexp) + (unsigned)regsize, char, regexp);
196         if (r == NULL)
197                 FAIL("regexp out of space");
198
199         /* Second pass: emit code. */
200         if (regsawbracket)
201             bcopy(regprecomp,exp,xend-exp);
202         r->prelen = xend-exp;
203         r->precomp = regprecomp;
204         r->subbeg = r->subbase = NULL;
205         regparse = exp;
206         regnpar = 1;
207         regcode = r->program;
208         regc((char)MAGIC);
209         if (reg(0, &flags) == NULL)
210                 return(NULL);
211
212         /* Dig out information for optimizations. */
213         r->regstart = Nullstr;  /* Worst-case defaults. */
214         r->reganch = 0;
215         r->regmust = Nullstr;
216         r->regback = -1;
217         r->regstclass = Nullch;
218         scan = r->program+1;                    /* First BRANCH. */
219         if (OP(regnext(scan)) == END) {/* Only one top-level choice. */
220                 scan = NEXTOPER(scan);
221
222                 first = scan;
223                 while ((OP(first) == OPEN && (sawopen = 1)) ||
224                     (OP(first) == BRANCH && OP(regnext(first)) != BRANCH) ||
225                     (OP(first) == PLUS) ||
226                     (OP(first) == CURLY && ARG1(first) > 0) ) {
227                         if (OP(first) == PLUS)
228                             sawplus = 1;
229                         else
230                             first += regarglen[OP(first)];
231                         first = NEXTOPER(first);
232                 }
233
234                 /* Starting-point info. */
235             again:
236                 if (OP(first) == EXACTLY) {
237                         r->regstart =
238                             str_make(OPERAND(first)+1,*OPERAND(first));
239                         if (r->regstart->str_cur > !(sawstudy|fold))
240                                 fbmcompile(r->regstart,fold);
241                 }
242                 else if ((exp = index(simple,OP(first))) && exp > simple)
243                         r->regstclass = first;
244                 else if (OP(first) == BOUND || OP(first) == NBOUND)
245                         r->regstclass = first;
246                 else if (OP(first) == BOL ||
247                     (OP(first) == STAR && OP(NEXTOPER(first)) == ANY) ) {
248                         /* kinda turn .* into ^.* */
249                         r->reganch = ROPT_ANCH | ROPT_IMPLICIT;
250                         first = NEXTOPER(first);
251                         goto again;
252                 }
253                 if (sawplus && (!sawopen || !regsawback))
254                     r->reganch |= ROPT_SKIP;    /* x+ must match 1st of run */
255
256 #ifdef DEBUGGING
257                 if (debug & 512)
258                     fprintf(stderr,"first %d next %d offset %d\n",
259                       OP(first), OP(NEXTOPER(first)), first - scan);
260 #endif
261                 /*
262                  * If there's something expensive in the r.e., find the
263                  * longest literal string that must appear and make it the
264                  * regmust.  Resolve ties in favor of later strings, since
265                  * the regstart check works with the beginning of the r.e.
266                  * and avoiding duplication strengthens checking.  Not a
267                  * strong reason, but sufficient in the absence of others.
268                  * [Now we resolve ties in favor of the earlier string if
269                  * it happens that curback has been invalidated, since the
270                  * earlier string may buy us something the later one won't.]
271                  */
272                 longish = str_make("",0);
273                 longest = str_make("",0);
274                 len = 0;
275                 minlen = 0;
276                 curback = 0;
277                 backish = 0;
278                 backest = 0;
279                 while (OP(scan) != END) {
280                         if (OP(scan) == BRANCH) {
281                             if (OP(regnext(scan)) == BRANCH) {
282                                 curback = -30000;
283                                 while (OP(scan) == BRANCH)
284                                     scan = regnext(scan);
285                             }
286                             else        /* single branch is ok */
287                                 scan = NEXTOPER(scan);
288                         }
289                         if (OP(scan) == EXACTLY) {
290                             char *t;
291
292                             first = scan;
293                             while (OP(t = regnext(scan)) == CLOSE)
294                                 scan = t;
295                             minlen += *OPERAND(first);
296                             if (curback - backish == len) {
297                                 str_ncat(longish, OPERAND(first)+1,
298                                     *OPERAND(first));
299                                 len += *OPERAND(first);
300                                 curback += *OPERAND(first);
301                                 first = regnext(scan);
302                             }
303                             else if (*OPERAND(first) >= len + (curback >= 0)) {
304                                 len = *OPERAND(first);
305                                 str_nset(longish, OPERAND(first)+1,len);
306                                 backish = curback;
307                                 curback += len;
308                                 first = regnext(scan);
309                             }
310                             else
311                                 curback += *OPERAND(first);
312                         }
313                         else if (index(varies,OP(scan))) {
314                             curback = -30000;
315                             len = 0;
316                             if (longish->str_cur > longest->str_cur) {
317                                 str_sset(longest,longish);
318                                 backest = backish;
319                             }
320                             str_nset(longish,"",0);
321                             if (OP(scan) == PLUS &&
322                               index(simple,OP(NEXTOPER(scan))))
323                                 minlen++;
324                             else if (OP(scan) == CURLY &&
325                               index(simple,OP(NEXTOPER(scan)+4)))
326                                 minlen += ARG1(scan);
327                         }
328                         else if (index(simple,OP(scan))) {
329                             curback++;
330                             minlen++;
331                             len = 0;
332                             if (longish->str_cur > longest->str_cur) {
333                                 str_sset(longest,longish);
334                                 backest = backish;
335                             }
336                             str_nset(longish,"",0);
337                         }
338                         scan = regnext(scan);
339                 }
340
341                 /* Prefer earlier on tie, unless we can tail match latter */
342
343                 if (longish->str_cur + (OP(first) == EOL) > longest->str_cur) {
344                     str_sset(longest,longish);
345                     backest = backish;
346                 }
347                 else
348                     str_nset(longish,"",0);
349                 if (longest->str_cur
350                     &&
351                     (!r->regstart
352                      ||
353                      !fbminstr((unsigned char*) r->regstart->str_ptr,
354                           (unsigned char *) r->regstart->str_ptr
355                             + r->regstart->str_cur,
356                           longest)
357                     )
358                    )
359                 {
360                         r->regmust = longest;
361                         if (backest < 0)
362                                 backest = -1;
363                         r->regback = backest;
364                         if (longest->str_cur
365                           > !(sawstudy || fold || OP(first) == EOL) )
366                                 fbmcompile(r->regmust,fold);
367                         r->regmust->str_u.str_useful = 100;
368                         if (OP(first) == EOL && longish->str_cur)
369                             r->regmust->str_pok |= SP_TAIL;
370                 }
371                 else {
372                         str_free(longest);
373                         longest = Nullstr;
374                 }
375                 str_free(longish);
376         }
377
378         r->do_folding = fold;
379         r->nparens = regnpar - 1;
380         r->minlen = minlen;
381         Newz(1002, r->startp, regnpar, char*);
382         Newz(1002, r->endp, regnpar, char*);
383 #ifdef DEBUGGING
384         if (debug & 512)
385                 regdump(r);
386 #endif
387         return(r);
388 }
389
390 /*
391  - reg - regular expression, i.e. main body or parenthesized thing
392  *
393  * Caller must absorb opening parenthesis.
394  *
395  * Combining parenthesis handling with the base level of regular expression
396  * is a trifle forced, but the need to tie the tails of the branches to what
397  * follows makes it hard to avoid.
398  */
399 static char *
400 reg(paren, flagp)
401 int paren;                      /* Parenthesized? */
402 int *flagp;
403 {
404         register char *ret;
405         register char *br;
406         register char *ender;
407         register int parno;
408         int flags;
409
410         *flagp = HASWIDTH;      /* Tentatively. */
411
412         /* Make an OPEN node, if parenthesized. */
413         if (paren) {
414                 parno = regnpar;
415                 regnpar++;
416                 ret = reganode(OPEN, parno);
417         } else
418                 ret = NULL;
419
420         /* Pick up the branches, linking them together. */
421         br = regbranch(&flags);
422         if (br == NULL)
423                 return(NULL);
424         if (ret != NULL)
425                 regtail(ret, br);       /* OPEN -> first. */
426         else
427                 ret = br;
428         if (!(flags&HASWIDTH))
429                 *flagp &= ~HASWIDTH;
430         *flagp |= flags&SPSTART;
431         while (*regparse == '|') {
432                 regparse++;
433                 br = regbranch(&flags);
434                 if (br == NULL)
435                         return(NULL);
436                 regtail(ret, br);       /* BRANCH -> BRANCH. */
437                 if (!(flags&HASWIDTH))
438                         *flagp &= ~HASWIDTH;
439                 *flagp |= flags&SPSTART;
440         }
441
442         /* Make a closing node, and hook it on the end. */
443         if (paren)
444             ender = reganode(CLOSE, parno);
445         else
446             ender = regnode(END);
447         regtail(ret, ender);
448
449         /* Hook the tails of the branches to the closing node. */
450         for (br = ret; br != NULL; br = regnext(br))
451                 regoptail(br, ender);
452
453         /* Check for proper termination. */
454         if (paren && *regparse++ != ')') {
455                 FAIL("unmatched () in regexp");
456         } else if (!paren && regparse < regxend) {
457                 if (*regparse == ')') {
458                         FAIL("unmatched () in regexp");
459                 } else
460                         FAIL("junk on end of regexp");  /* "Can't happen". */
461                 /* NOTREACHED */
462         }
463
464         return(ret);
465 }
466
467 /*
468  - regbranch - one alternative of an | operator
469  *
470  * Implements the concatenation operator.
471  */
472 static char *
473 regbranch(flagp)
474 int *flagp;
475 {
476         register char *ret;
477         register char *chain;
478         register char *latest;
479         int flags;
480
481         *flagp = WORST;         /* Tentatively. */
482
483         ret = regnode(BRANCH);
484         chain = NULL;
485         while (regparse < regxend && *regparse != '|' && *regparse != ')') {
486                 latest = regpiece(&flags);
487                 if (latest == NULL)
488                         return(NULL);
489                 *flagp |= flags&HASWIDTH;
490                 if (chain == NULL)      /* First piece. */
491                         *flagp |= flags&SPSTART;
492                 else
493                         regtail(chain, latest);
494                 chain = latest;
495         }
496         if (chain == NULL)      /* Loop ran zero times. */
497                 (void) regnode(NOTHING);
498
499         return(ret);
500 }
501
502 /*
503  - regpiece - something followed by possible [*+?]
504  *
505  * Note that the branching code sequences used for ? and the general cases
506  * of * and + are somewhat optimized:  they use the same NOTHING node as
507  * both the endmarker for their branch list and the body of the last branch.
508  * It might seem that this node could be dispensed with entirely, but the
509  * endmarker role is not redundant.
510  */
511 static char *
512 regpiece(flagp)
513 int *flagp;
514 {
515         register char *ret;
516         register char op;
517         register char *next;
518         int flags;
519         char *origparse = regparse;
520         int orignpar = regnpar;
521         char *max;
522         int iter;
523         char ch;
524
525         ret = regatom(&flags);
526         if (ret == NULL)
527                 return(NULL);
528
529         op = *regparse;
530
531         /* Here's a total kludge: if after the atom there's a {\d+,?\d*}
532          * then we decrement the first number by one and reset our
533          * parsing back to the beginning of the same atom.  If the first number
534          * is down to 0, decrement the second number instead and fake up
535          * a ? after it.  Given the way this compiler doesn't keep track
536          * of offsets on the first pass, this is the only way to replicate
537          * a piece of code.  Sigh.
538          */
539         if (op == '{' && regcurly(regparse)) {
540             next = regparse + 1;
541             max = Nullch;
542             while (isDIGIT(*next) || *next == ',') {
543                 if (*next == ',') {
544                     if (max)
545                         break;
546                     else
547                         max = next;
548                 }
549                 next++;
550             }
551             if (*next == '}') {         /* got one */
552                 if (!max)
553                     max = next;
554                 regparse++;
555                 iter = atoi(regparse);
556                 if (flags&SIMPLE) {     /* we can do it right after all */
557                     int tmp;
558
559                     reginsert(CURLY, ret);
560                     if (iter > 0)
561                         *flagp = (WORST|HASWIDTH);
562                     if (*max == ',')
563                         max++;
564                     else
565                         max = regparse;
566                     tmp = atoi(max);
567                     if (tmp && tmp < iter)
568                         fatal("Can't do {n,m} with n > m");
569                     if (regcode != &regdummy) {
570 #ifdef REGALIGN
571                         *(unsigned short *)(ret+3) = iter;
572                         *(unsigned short *)(ret+5) = tmp;
573 #else
574                         ret[3] = iter >> 8; ret[4] = iter & 0377;
575                         ret[5] = tmp  >> 8; ret[6] = tmp  & 0377;
576 #endif
577                     }
578                     regparse = next;
579                     goto nest_check;
580                 }
581                 regsawbracket++;        /* remember we clobbered exp */
582                 if (iter > 0) {
583                     ch = *max;
584                     sprintf(regparse,"%.*d", max-regparse, iter - 1);
585                     *max = ch;
586                     if (*max == ',' && max[1] != '}') {
587                         if (atoi(max+1) <= 0)
588                             fatal("Can't do {n,m} with n > m");
589                         ch = *next;
590                         sprintf(max+1,"%.*d", next-(max+1), atoi(max+1) - 1);
591                         *next = ch;
592                     }
593                     if (iter != 1 || *max == ',') {
594                         regparse = origparse;   /* back up input pointer */
595                         regnpar = orignpar;     /* don't make more parens */
596                     }
597                     else {
598                         regparse = next;
599                         goto nest_check;
600                     }
601                     *flagp = flags;
602                     return ret;
603                 }
604                 if (*max == ',') {
605                     max++;
606                     iter = atoi(max);
607                     if (max == next) {          /* any number more? */
608                         regparse = next;
609                         op = '*';               /* fake up one with a star */
610                     }
611                     else if (iter > 0) {
612                         op = '?';               /* fake up optional atom */
613                         ch = *next;
614                         sprintf(max,"%.*d", next-max, iter - 1);
615                         *next = ch;
616                         if (iter == 1)
617                             regparse = next;
618                         else {
619                             regparse = origparse - 1; /* offset ++ below */
620                             regnpar = orignpar;
621                         }
622                     }
623                     else
624                         fatal("Can't do {n,0}");
625                 }
626                 else
627                     fatal("Can't do {0}");
628             }
629         }
630
631         if (!ISMULT1(op)) {
632                 *flagp = flags;
633                 return(ret);
634         }
635
636         if (!(flags&HASWIDTH) && op != '?')
637                 FAIL("regexp *+ operand could be empty");
638         *flagp = (op != '+') ? (WORST|SPSTART) : (WORST|HASWIDTH);
639
640         if (op == '*' && (flags&SIMPLE))
641                 reginsert(STAR, ret);
642         else if (op == '*') {
643                 /* Emit x* as (x&|), where & means "self". */
644                 reginsert(BRANCH, ret);                 /* Either x */
645                 regoptail(ret, regnode(BACK));          /* and loop */
646                 regoptail(ret, ret);                    /* back */
647                 regtail(ret, regnode(BRANCH));          /* or */
648                 regtail(ret, regnode(NOTHING));         /* null. */
649         } else if (op == '+' && (flags&SIMPLE))
650                 reginsert(PLUS, ret);
651         else if (op == '+') {
652                 /* Emit x+ as x(&|), where & means "self". */
653                 next = regnode(BRANCH);                 /* Either */
654                 regtail(ret, next);
655                 regtail(regnode(BACK), ret);            /* loop back */
656                 regtail(next, regnode(BRANCH));         /* or */
657                 regtail(ret, regnode(NOTHING));         /* null. */
658         } else if (op == '?') {
659                 /* Emit x? as (x|) */
660                 reginsert(BRANCH, ret);                 /* Either x */
661                 regtail(ret, regnode(BRANCH));          /* or */
662                 next = regnode(NOTHING);                /* null. */
663                 regtail(ret, next);
664                 regoptail(ret, next);
665         }
666       nest_check:
667         regparse++;
668         if (ISMULT2(regparse))
669                 FAIL("nested *?+ in regexp");
670
671         return(ret);
672 }
673
674 /*
675  - regatom - the lowest level
676  *
677  * Optimization:  gobbles an entire sequence of ordinary characters so that
678  * it can turn them into a single node, which is smaller to store and
679  * faster to run.  Backslashed characters are exceptions, each becoming a
680  * separate node; the code is simpler that way and it's not worth fixing.
681  *
682  * [Yes, it is worth fixing, some scripts can run twice the speed.]
683  */
684 static char *
685 regatom(flagp)
686 int *flagp;
687 {
688         register char *ret;
689         int flags;
690
691         *flagp = WORST;         /* Tentatively. */
692
693         switch (*regparse++) {
694         case '^':
695                 ret = regnode(BOL);
696                 break;
697         case '$':
698                 ret = regnode(EOL);
699                 break;
700         case '.':
701                 ret = regnode(ANY);
702                 *flagp |= HASWIDTH|SIMPLE;
703                 break;
704         case '[':
705                 ret = regclass();
706                 *flagp |= HASWIDTH|SIMPLE;
707                 break;
708         case '(':
709                 ret = reg(1, &flags);
710                 if (ret == NULL)
711                         return(NULL);
712                 *flagp |= flags&(HASWIDTH|SPSTART);
713                 break;
714         case '|':
715         case ')':
716                 FAIL("internal urp in regexp"); /* Supposed to be caught earlier. */
717                 break;
718         case '?':
719         case '+':
720         case '*':
721                 FAIL("?+* follows nothing in regexp");
722                 break;
723         case '\\':
724                 switch (*regparse) {
725                 case 'w':
726                         ret = regnode(ALNUM);
727                         *flagp |= HASWIDTH|SIMPLE;
728                         regparse++;
729                         break;
730                 case 'W':
731                         ret = regnode(NALNUM);
732                         *flagp |= HASWIDTH|SIMPLE;
733                         regparse++;
734                         break;
735                 case 'b':
736                         ret = regnode(BOUND);
737                         *flagp |= SIMPLE;
738                         regparse++;
739                         break;
740                 case 'B':
741                         ret = regnode(NBOUND);
742                         *flagp |= SIMPLE;
743                         regparse++;
744                         break;
745                 case 's':
746                         ret = regnode(SPACE);
747                         *flagp |= HASWIDTH|SIMPLE;
748                         regparse++;
749                         break;
750                 case 'S':
751                         ret = regnode(NSPACE);
752                         *flagp |= HASWIDTH|SIMPLE;
753                         regparse++;
754                         break;
755                 case 'd':
756                         ret = regnode(DIGIT);
757                         *flagp |= HASWIDTH|SIMPLE;
758                         regparse++;
759                         break;
760                 case 'D':
761                         ret = regnode(NDIGIT);
762                         *flagp |= HASWIDTH|SIMPLE;
763                         regparse++;
764                         break;
765                 case 'n':
766                 case 'r':
767                 case 't':
768                 case 'f':
769                 case 'e':
770                 case 'a':
771                 case 'x':
772                 case 'c':
773                 case '0':
774                         goto defchar;
775                 case '1': case '2': case '3': case '4':
776                 case '5': case '6': case '7': case '8': case '9':
777                         {
778                             int num = atoi(regparse);
779
780                             if (num > 9 && num >= regnpar)
781                                 goto defchar;
782                             else {
783                                 regsawback = 1;
784                                 ret = reganode(REF, num);
785                                 while (isDIGIT(*regparse))
786                                     regparse++;
787                                 *flagp |= SIMPLE;
788                             }
789                         }
790                         break;
791                 case '\0':
792                         if (regparse >= regxend)
793                             FAIL("trailing \\ in regexp");
794                         /* FALL THROUGH */
795                 default:
796                         goto defchar;
797                 }
798                 break;
799         default: {
800                         register int len;
801                         register char ender;
802                         register char *p;
803                         char *oldp;
804                         int numlen;
805
806                     defchar:
807                         ret = regnode(EXACTLY);
808                         regc(0);                /* save spot for len */
809                         for (len=0, p=regparse-1;
810                           len < 127 && p < regxend;
811                           len++)
812                         {
813                             oldp = p;
814                             switch (*p) {
815                             case '^':
816                             case '$':
817                             case '.':
818                             case '[':
819                             case '(':
820                             case ')':
821                             case '|':
822                                 goto loopdone;
823                             case '\\':
824                                 switch (*++p) {
825                                 case 'w':
826                                 case 'W':
827                                 case 'b':
828                                 case 'B':
829                                 case 's':
830                                 case 'S':
831                                 case 'd':
832                                 case 'D':
833                                     --p;
834                                     goto loopdone;
835                                 case 'n':
836                                         ender = '\n';
837                                         p++;
838                                         break;
839                                 case 'r':
840                                         ender = '\r';
841                                         p++;
842                                         break;
843                                 case 't':
844                                         ender = '\t';
845                                         p++;
846                                         break;
847                                 case 'f':
848                                         ender = '\f';
849                                         p++;
850                                         break;
851                                 case 'e':
852                                         ender = '\033';
853                                         p++;
854                                         break;
855                                 case 'a':
856                                         ender = '\007';
857                                         p++;
858                                         break;
859                                 case 'x':
860                                     ender = scanhex(++p, 2, &numlen);
861                                     p += numlen;
862                                     break;
863                                 case 'c':
864                                     p++;
865                                     ender = *p++;
866                                     if (isLOWER(ender))
867                                         ender = toupper(ender);
868                                     ender ^= 64;
869                                     break;
870                                 case '0': case '1': case '2': case '3':case '4':
871                                 case '5': case '6': case '7': case '8':case '9':
872                                     if (*p == '0' ||
873                                       (isDIGIT(p[1]) && atoi(p) >= regnpar) ) {
874                                         ender = scanoct(p, 3, &numlen);
875                                         p += numlen;
876                                     }
877                                     else {
878                                         --p;
879                                         goto loopdone;
880                                     }
881                                     break;
882                                 case '\0':
883                                     if (p >= regxend)
884                                         FAIL("trailing \\ in regexp");
885                                     /* FALL THROUGH */
886                                 default:
887                                     ender = *p++;
888                                     break;
889                                 }
890                                 break;
891                             default:
892                                 ender = *p++;
893                                 break;
894                             }
895                             if (regfold && isUPPER(ender))
896                                     ender = tolower(ender);
897                             if (ISMULT2(p)) { /* Back off on ?+*. */
898                                 if (len)
899                                     p = oldp;
900                                 else {
901                                     len++;
902                                     regc(ender);
903                                 }
904                                 break;
905                             }
906                             regc(ender);
907                         }
908                     loopdone:
909                         regparse = p;
910                         if (len <= 0)
911                                 FAIL("internal disaster in regexp");
912                         *flagp |= HASWIDTH;
913                         if (len == 1)
914                                 *flagp |= SIMPLE;
915                         if (regcode != &regdummy)
916                             *OPERAND(ret) = len;
917                         regc('\0');
918                 }
919                 break;
920         }
921
922         return(ret);
923 }
924
925 static void
926 regset(bits,def,c)
927 char *bits;
928 int def;
929 register int c;
930 {
931         if (regcode == &regdummy)
932             return;
933         c &= 255;
934         if (def)
935                 bits[c >> 3] &= ~(1 << (c & 7));
936         else
937                 bits[c >> 3] |=  (1 << (c & 7));
938 }
939
940 static char *
941 regclass()
942 {
943         register char *bits;
944         register int class;
945         register int lastclass;
946         register int range = 0;
947         register char *ret;
948         register int def;
949         int numlen;
950
951         ret = regnode(ANYOF);
952         if (*regparse == '^') { /* Complement of range. */
953                 regparse++;
954                 def = 0;
955         } else {
956                 def = 255;
957         }
958         bits = regcode;
959         for (class = 0; class < 32; class++)
960             regc(def);
961         if (*regparse == ']' || *regparse == '-')
962                 goto skipcond;          /* allow 1st char to be ] or - */
963         while (regparse < regxend && *regparse != ']') {
964               skipcond:
965                 class = UCHARAT(regparse++);
966                 if (class == '\\') {
967                         class = UCHARAT(regparse++);
968                         switch (class) {
969                         case 'w':
970                                 for (class = 'a'; class <= 'z'; class++)
971                                         regset(bits,def,class);
972                                 for (class = 'A'; class <= 'Z'; class++)
973                                         regset(bits,def,class);
974                                 for (class = '0'; class <= '9'; class++)
975                                         regset(bits,def,class);
976                                 regset(bits,def,'_');
977                                 lastclass = 1234;
978                                 continue;
979                         case 's':
980                                 regset(bits,def,' ');
981                                 regset(bits,def,'\t');
982                                 regset(bits,def,'\r');
983                                 regset(bits,def,'\f');
984                                 regset(bits,def,'\n');
985                                 lastclass = 1234;
986                                 continue;
987                         case 'd':
988                                 for (class = '0'; class <= '9'; class++)
989                                         regset(bits,def,class);
990                                 lastclass = 1234;
991                                 continue;
992                         case 'n':
993                                 class = '\n';
994                                 break;
995                         case 'r':
996                                 class = '\r';
997                                 break;
998                         case 't':
999                                 class = '\t';
1000                                 break;
1001                         case 'f':
1002                                 class = '\f';
1003                                 break;
1004                         case 'b':
1005                                 class = '\b';
1006                                 break;
1007                         case 'e':
1008                                 class = '\033';
1009                                 break;
1010                         case 'a':
1011                                 class = '\007';
1012                                 break;
1013                         case 'x':
1014                                 class = scanhex(regparse, 2, &numlen);
1015                                 regparse += numlen;
1016                                 break;
1017                         case 'c':
1018                                 class = *regparse++;
1019                                 if (isLOWER(class))
1020                                     class = toupper(class);
1021                                 class ^= 64;
1022                                 break;
1023                         case '0': case '1': case '2': case '3': case '4':
1024                         case '5': case '6': case '7': case '8': case '9':
1025                                 class = scanoct(--regparse, 3, &numlen);
1026                                 regparse += numlen;
1027                                 break;
1028                         }
1029                 }
1030                 if (range) {
1031                         if (lastclass > class)
1032                                 FAIL("invalid [] range in regexp");
1033                         range = 0;
1034                 }
1035                 else {
1036                         lastclass = class;
1037                         if (*regparse == '-' && regparse+1 < regxend &&
1038                             regparse[1] != ']') {
1039                                 regparse++;
1040                                 range = 1;
1041                                 continue;       /* do it next time */
1042                         }
1043                 }
1044                 for ( ; lastclass <= class; lastclass++) {
1045                         regset(bits,def,lastclass);
1046                         if (regfold && isUPPER(lastclass))
1047                                 regset(bits,def,tolower(lastclass));
1048                 }
1049                 lastclass = class;
1050         }
1051         if (*regparse != ']')
1052                 FAIL("unmatched [] in regexp");
1053         regparse++;
1054         return ret;
1055 }
1056
1057 /*
1058  - regnode - emit a node
1059  */
1060 static char *                   /* Location. */
1061 regnode(op)
1062 char op;
1063 {
1064         register char *ret;
1065         register char *ptr;
1066
1067         ret = regcode;
1068         if (ret == &regdummy) {
1069 #ifdef REGALIGN
1070                 if (!(regsize & 1))
1071                         regsize++;
1072 #endif
1073                 regsize += 3;
1074                 return(ret);
1075         }
1076
1077 #ifdef REGALIGN
1078 #ifndef lint
1079         if (!((long)ret & 1))
1080             *ret++ = 127;
1081 #endif
1082 #endif
1083         ptr = ret;
1084         *ptr++ = op;
1085         *ptr++ = '\0';          /* Null "next" pointer. */
1086         *ptr++ = '\0';
1087         regcode = ptr;
1088
1089         return(ret);
1090 }
1091
1092 /*
1093  - reganode - emit a node with an argument
1094  */
1095 static char *                   /* Location. */
1096 reganode(op, arg)
1097 char op;
1098 unsigned short arg;
1099 {
1100         register char *ret;
1101         register char *ptr;
1102
1103         ret = regcode;
1104         if (ret == &regdummy) {
1105 #ifdef REGALIGN
1106                 if (!(regsize & 1))
1107                         regsize++;
1108 #endif
1109                 regsize += 5;
1110                 return(ret);
1111         }
1112
1113 #ifdef REGALIGN
1114 #ifndef lint
1115         if (!((long)ret & 1))
1116             *ret++ = 127;
1117 #endif
1118 #endif
1119         ptr = ret;
1120         *ptr++ = op;
1121         *ptr++ = '\0';          /* Null "next" pointer. */
1122         *ptr++ = '\0';
1123 #ifdef REGALIGN
1124         *(unsigned short *)(ret+3) = arg;
1125 #else
1126         ret[3] = arg >> 8; ret[4] = arg & 0377;
1127 #endif
1128         ptr += 2;
1129         regcode = ptr;
1130
1131         return(ret);
1132 }
1133
1134 /*
1135  - regc - emit (if appropriate) a byte of code
1136  */
1137 static void
1138 regc(b)
1139 char b;
1140 {
1141         if (regcode != &regdummy)
1142                 *regcode++ = b;
1143         else
1144                 regsize++;
1145 }
1146
1147 /*
1148  - reginsert - insert an operator in front of already-emitted operand
1149  *
1150  * Means relocating the operand.
1151  */
1152 static void
1153 reginsert(op, opnd)
1154 char op;
1155 char *opnd;
1156 {
1157         register char *src;
1158         register char *dst;
1159         register char *place;
1160         register offset = (op == CURLY ? 4 : 0);
1161
1162         if (regcode == &regdummy) {
1163 #ifdef REGALIGN
1164                 regsize += 4 + offset;
1165 #else
1166                 regsize += 3 + offset;
1167 #endif
1168                 return;
1169         }
1170
1171         src = regcode;
1172 #ifdef REGALIGN
1173         regcode += 4 + offset;
1174 #else
1175         regcode += 3 + offset;
1176 #endif
1177         dst = regcode;
1178         while (src > opnd)
1179                 *--dst = *--src;
1180
1181         place = opnd;           /* Op node, where operand used to be. */
1182         *place++ = op;
1183         *place++ = '\0';
1184         *place++ = '\0';
1185         while (offset-- > 0)
1186             *place++ = '\0';
1187 }
1188
1189 /*
1190  - regtail - set the next-pointer at the end of a node chain
1191  */
1192 static void
1193 regtail(p, val)
1194 char *p;
1195 char *val;
1196 {
1197         register char *scan;
1198         register char *temp;
1199         register int offset;
1200
1201         if (p == &regdummy)
1202                 return;
1203
1204         /* Find last node. */
1205         scan = p;
1206         for (;;) {
1207                 temp = regnext(scan);
1208                 if (temp == NULL)
1209                         break;
1210                 scan = temp;
1211         }
1212
1213 #ifdef REGALIGN
1214         offset = val - scan;
1215 #ifndef lint
1216         *(short*)(scan+1) = offset;
1217 #else
1218         offset = offset;
1219 #endif
1220 #else
1221         if (OP(scan) == BACK)
1222                 offset = scan - val;
1223         else
1224                 offset = val - scan;
1225         *(scan+1) = (offset>>8)&0377;
1226         *(scan+2) = offset&0377;
1227 #endif
1228 }
1229
1230 /*
1231  - regoptail - regtail on operand of first argument; nop if operandless
1232  */
1233 static void
1234 regoptail(p, val)
1235 char *p;
1236 char *val;
1237 {
1238         /* "Operandless" and "op != BRANCH" are synonymous in practice. */
1239         if (p == NULL || p == &regdummy || OP(p) != BRANCH)
1240                 return;
1241         regtail(NEXTOPER(p), val);
1242 }
1243
1244 /*
1245  - regcurly - a little FSA that accepts {\d+,?\d*}
1246  */
1247 STATIC int
1248 regcurly(s)
1249 register char *s;
1250 {
1251     if (*s++ != '{')
1252         return FALSE;
1253     if (!isDIGIT(*s))
1254         return FALSE;
1255     while (isDIGIT(*s))
1256         s++;
1257     if (*s == ',')
1258         s++;
1259     while (isDIGIT(*s))
1260         s++;
1261     if (*s != '}')
1262         return FALSE;
1263     return TRUE;
1264 }
1265
1266 #ifdef DEBUGGING
1267
1268 /*
1269  - regdump - dump a regexp onto stderr in vaguely comprehensible form
1270  */
1271 void
1272 regdump(r)
1273 regexp *r;
1274 {
1275         register char *s;
1276         register char op = EXACTLY;     /* Arbitrary non-END op. */
1277         register char *next;
1278
1279
1280         s = r->program + 1;
1281         while (op != END) {     /* While that wasn't END last time... */
1282 #ifdef REGALIGN
1283                 if (!((long)s & 1))
1284                         s++;
1285 #endif
1286                 op = OP(s);
1287                 fprintf(stderr,"%2d%s", s-r->program, regprop(s));      /* Where, what. */
1288                 next = regnext(s);
1289                 s += regarglen[op];
1290                 if (next == NULL)               /* Next ptr. */
1291                         fprintf(stderr,"(0)");
1292                 else 
1293                         fprintf(stderr,"(%d)", (s-r->program)+(next-s));
1294                 s += 3;
1295                 if (op == ANYOF) {
1296                         s += 32;
1297                 }
1298                 if (op == EXACTLY) {
1299                         /* Literal string, where present. */
1300                         s++;
1301                         while (*s != '\0') {
1302                                 (void)putchar(*s);
1303                                 s++;
1304                         }
1305                         s++;
1306                 }
1307                 (void)putchar('\n');
1308         }
1309
1310         /* Header fields of interest. */
1311         if (r->regstart)
1312                 fprintf(stderr,"start `%s' ", r->regstart->str_ptr);
1313         if (r->regstclass)
1314                 fprintf(stderr,"stclass `%s' ", regprop(r->regstclass));
1315         if (r->reganch & ROPT_ANCH)
1316                 fprintf(stderr,"anchored ");
1317         if (r->reganch & ROPT_SKIP)
1318                 fprintf(stderr,"plus ");
1319         if (r->reganch & ROPT_IMPLICIT)
1320                 fprintf(stderr,"implicit ");
1321         if (r->regmust != NULL)
1322                 fprintf(stderr,"must have \"%s\" back %d ", r->regmust->str_ptr,
1323                   r->regback);
1324         fprintf(stderr, "minlen %d ", r->minlen);
1325         fprintf(stderr,"\n");
1326 }
1327
1328 /*
1329  - regprop - printable representation of opcode
1330  */
1331 char *
1332 regprop(op)
1333 char *op;
1334 {
1335         register char *p;
1336
1337         (void) strcpy(buf, ":");
1338
1339         switch (OP(op)) {
1340         case BOL:
1341                 p = "BOL";
1342                 break;
1343         case EOL:
1344                 p = "EOL";
1345                 break;
1346         case ANY:
1347                 p = "ANY";
1348                 break;
1349         case ANYOF:
1350                 p = "ANYOF";
1351                 break;
1352         case BRANCH:
1353                 p = "BRANCH";
1354                 break;
1355         case EXACTLY:
1356                 p = "EXACTLY";
1357                 break;
1358         case NOTHING:
1359                 p = "NOTHING";
1360                 break;
1361         case BACK:
1362                 p = "BACK";
1363                 break;
1364         case END:
1365                 p = "END";
1366                 break;
1367         case ALNUM:
1368                 p = "ALNUM";
1369                 break;
1370         case NALNUM:
1371                 p = "NALNUM";
1372                 break;
1373         case BOUND:
1374                 p = "BOUND";
1375                 break;
1376         case NBOUND:
1377                 p = "NBOUND";
1378                 break;
1379         case SPACE:
1380                 p = "SPACE";
1381                 break;
1382         case NSPACE:
1383                 p = "NSPACE";
1384                 break;
1385         case DIGIT:
1386                 p = "DIGIT";
1387                 break;
1388         case NDIGIT:
1389                 p = "NDIGIT";
1390                 break;
1391         case CURLY:
1392                 (void)sprintf(buf+strlen(buf), "CURLY {%d,%d}",
1393                     ARG1(op),ARG2(op));
1394                 p = NULL;
1395                 break;
1396         case REF:
1397                 (void)sprintf(buf+strlen(buf), "REF%d", ARG1(op));
1398                 p = NULL;
1399                 break;
1400         case OPEN:
1401                 (void)sprintf(buf+strlen(buf), "OPEN%d", ARG1(op));
1402                 p = NULL;
1403                 break;
1404         case CLOSE:
1405                 (void)sprintf(buf+strlen(buf), "CLOSE%d", ARG1(op));
1406                 p = NULL;
1407                 break;
1408         case STAR:
1409                 p = "STAR";
1410                 break;
1411         case PLUS:
1412                 p = "PLUS";
1413                 break;
1414         default:
1415                 FAIL("corrupted regexp opcode");
1416         }
1417         if (p != NULL)
1418                 (void) strcat(buf, p);
1419         return(buf);
1420 }
1421 #endif /* DEBUGGING */
1422
1423 regfree(r)
1424 struct regexp *r;
1425 {
1426         if (r->precomp) {
1427                 Safefree(r->precomp);
1428                 r->precomp = Nullch;
1429         }
1430         if (r->subbase) {
1431                 Safefree(r->subbase);
1432                 r->subbase = Nullch;
1433         }
1434         if (r->regmust) {
1435                 str_free(r->regmust);
1436                 r->regmust = Nullstr;
1437         }
1438         if (r->regstart) {
1439                 str_free(r->regstart);
1440                 r->regstart = Nullstr;
1441         }
1442         Safefree(r->startp);
1443         Safefree(r->endp);
1444         Safefree(r);
1445 }