This is a live mirror of the Perl 5 development currently hosted at https://github.com/perl/perl5
perl 4.0 patch 16: patch #11, continued
[perl5.git] / regcomp.c
1 /* NOTE: this is derived from Henry Spencer's regexp code, and should not
2  * confused with the original package (see point 3 below).  Thanks, Henry!
3  */
4
5 /* Additional note: this code is very heavily munged from Henry's version
6  * in places.  In some spots I've traded clarity for efficiency, so don't
7  * blame Henry for some of the lack of readability.
8  */
9
10 /* $RCSfile: regcomp.c,v $$Revision: 4.0.1.3 $$Date: 91/11/05 18:22:28 $
11  *
12  * $Log:        regcomp.c,v $
13  * Revision 4.0.1.3  91/11/05  18:22:28  lwall
14  * patch11: minimum match length calculation in regexp is now cumulative
15  * patch11: initial .* in pattern had dependency on value of $*
16  * patch11: certain patterns made use of garbage pointers from uncleared memory
17  * patch11: prepared for ctype implementations that don't define isascii()
18  * 
19  * Revision 4.0.1.2  91/06/07  11:48:24  lwall
20  * patch4: new copyright notice
21  * patch4: /(x+) \1/ incorrectly optimized to not match "xxx xx"
22  * patch4: // wouldn't use previous pattern if it started with a null character
23  * 
24  * Revision 4.0.1.1  91/04/12  09:04:45  lwall
25  * patch1: random cleanup in cpp namespace
26  * 
27  * Revision 4.0  91/03/20  01:39:01  lwall
28  * 4.0 baseline.
29  * 
30  */
31 /*SUPPRESS 112*/
32 /*
33  * regcomp and regexec -- regsub and regerror are not used in perl
34  *
35  *      Copyright (c) 1986 by University of Toronto.
36  *      Written by Henry Spencer.  Not derived from licensed software.
37  *
38  *      Permission is granted to anyone to use this software for any
39  *      purpose on any computer system, and to redistribute it freely,
40  *      subject to the following restrictions:
41  *
42  *      1. The author is not responsible for the consequences of use of
43  *              this software, no matter how awful, even if they arise
44  *              from defects in it.
45  *
46  *      2. The origin of this software must not be misrepresented, either
47  *              by explicit claim or by omission.
48  *
49  *      3. Altered versions must be plainly marked as such, and must not
50  *              be misrepresented as being the original software.
51  *
52  *
53  ****    Alterations to Henry's code are...
54  ****
55  ****    Copyright (c) 1991, Larry Wall
56  ****
57  ****    You may distribute under the terms of either the GNU General Public
58  ****    License or the Artistic License, as specified in the README file.
59
60  *
61  * Beware that some of this code is subtly aware of the way operator
62  * precedence is structured in regular expressions.  Serious changes in
63  * regular-expression syntax might require a total rethink.
64  */
65 #include "EXTERN.h"
66 #include "perl.h"
67 #include "INTERN.h"
68 #include "regcomp.h"
69
70 #ifdef MSDOS
71 # if defined(BUGGY_MSC6)
72  /* MSC 6.00A breaks on op/regexp.t test 85 unless we turn this off */
73  # pragma optimize("a",off)
74  /* But MSC 6.00A is happy with 'w', for aliases only across function calls*/
75  # pragma optimize("w",on )
76 # endif /* BUGGY_MSC6 */
77 #endif /* MSDOS */
78
79 #ifndef STATIC
80 #define STATIC  static
81 #endif
82
83 #define ISMULT1(c)      ((c) == '*' || (c) == '+' || (c) == '?')
84 #define ISMULT2(s)      ((*s) == '*' || (*s) == '+' || (*s) == '?' || \
85         ((*s) == '{' && regcurly(s)))
86 #define META    "^$.[()|?+*\\"
87
88 #ifdef SPSTART
89 #undef SPSTART          /* dratted cpp namespace... */
90 #endif
91 /*
92  * Flags to be passed up and down.
93  */
94 #define HASWIDTH        01      /* Known never to match null string. */
95 #define SIMPLE          02      /* Simple enough to be STAR/PLUS operand. */
96 #define SPSTART         04      /* Starts with * or +. */
97 #define WORST           0       /* Worst case. */
98
99 /*
100  * Global work variables for regcomp().
101  */
102 static char *regprecomp;                /* uncompiled string. */
103 static char *regparse;          /* Input-scan pointer. */
104 static char *regxend;           /* End of input for compile */
105 static int regnpar;             /* () count. */
106 static char *regcode;           /* Code-emit pointer; &regdummy = don't. */
107 static long regsize;            /* Code size. */
108 static int regfold;
109 static int regsawbracket;       /* Did we do {d,d} trick? */
110 static int regsawback;          /* Did we see \1, ...? */
111
112 /*
113  * Forward declarations for regcomp()'s friends.
114  */
115 STATIC int regcurly();
116 STATIC char *reg();
117 STATIC char *regbranch();
118 STATIC char *regpiece();
119 STATIC char *regatom();
120 STATIC char *regclass();
121 STATIC char *regnode();
122 STATIC char *reganode();
123 STATIC void regc();
124 STATIC void reginsert();
125 STATIC void regtail();
126 STATIC void regoptail();
127
128 /*
129  - regcomp - compile a regular expression into internal code
130  *
131  * We can't allocate space until we know how big the compiled form will be,
132  * but we can't compile it (and thus know how big it is) until we've got a
133  * place to put the code.  So we cheat:  we compile it twice, once with code
134  * generation turned off and size counting turned on, and once "for real".
135  * This also means that we don't allocate space until we are sure that the
136  * thing really will compile successfully, and we never have to move the
137  * code and thus invalidate pointers into it.  (Note that it has to be in
138  * one piece because free() must be able to free it all.) [NB: not true in perl]
139  *
140  * Beware that the optimization-preparation code in here knows about some
141  * of the structure of the compiled regexp.  [I'll say.]
142  */
143 regexp *
144 regcomp(exp,xend,fold)
145 char *exp;
146 char *xend;
147 int fold;
148 {
149         register regexp *r;
150         register char *scan;
151         register STR *longish;
152         STR *longest;
153         register int len;
154         register char *first;
155         int flags;
156         int backish;
157         int backest;
158         int curback;
159         int minlen;
160         extern char *safemalloc();
161         extern char *savestr();
162         int sawplus = 0;
163         int sawopen = 0;
164
165         if (exp == NULL)
166                 fatal("NULL regexp argument");
167
168         /* First pass: determine size, legality. */
169         regfold = fold;
170         regparse = exp;
171         regxend = xend;
172         regprecomp = nsavestr(exp,xend-exp);
173         regsawbracket = 0;
174         regsawback = 0;
175         regnpar = 1;
176         regsize = 0L;
177         regcode = &regdummy;
178         regc((char)MAGIC);
179         if (reg(0, &flags) == NULL) {
180                 Safefree(regprecomp);
181                 regprecomp = Nullch;
182                 return(NULL);
183         }
184
185         /* Small enough for pointer-storage convention? */
186         if (regsize >= 32767L)          /* Probably could be 65535L. */
187                 FAIL("regexp too big");
188
189         /* Allocate space. */
190         Newc(1001, r, sizeof(regexp) + (unsigned)regsize, char, regexp);
191         if (r == NULL)
192                 FAIL("regexp out of space");
193
194         /* Second pass: emit code. */
195         if (regsawbracket)
196             bcopy(regprecomp,exp,xend-exp);
197         r->prelen = xend-exp;
198         r->precomp = regprecomp;
199         r->subbeg = r->subbase = NULL;
200         regparse = exp;
201         regnpar = 1;
202         regcode = r->program;
203         regc((char)MAGIC);
204         if (reg(0, &flags) == NULL)
205                 return(NULL);
206
207         /* Dig out information for optimizations. */
208         r->regstart = Nullstr;  /* Worst-case defaults. */
209         r->reganch = 0;
210         r->regmust = Nullstr;
211         r->regback = -1;
212         r->regstclass = Nullch;
213         scan = r->program+1;                    /* First BRANCH. */
214         if (OP(regnext(scan)) == END) {/* Only one top-level choice. */
215                 scan = NEXTOPER(scan);
216
217                 first = scan;
218                 while ((OP(first) == OPEN && (sawopen = 1)) ||
219                     (OP(first) == BRANCH && OP(regnext(first)) != BRANCH) ||
220                     (OP(first) == PLUS) ||
221                     (OP(first) == CURLY && ARG1(first) > 0) ) {
222                         if (OP(first) == PLUS)
223                             sawplus = 1;
224                         else
225                             first += regarglen[OP(first)];
226                         first = NEXTOPER(first);
227                 }
228
229                 /* Starting-point info. */
230             again:
231                 if (OP(first) == EXACTLY) {
232                         r->regstart =
233                             str_make(OPERAND(first)+1,*OPERAND(first));
234                         if (r->regstart->str_cur > !(sawstudy|fold))
235                                 fbmcompile(r->regstart,fold);
236                 }
237                 else if ((exp = index(simple,OP(first))) && exp > simple)
238                         r->regstclass = first;
239                 else if (OP(first) == BOUND || OP(first) == NBOUND)
240                         r->regstclass = first;
241                 else if (OP(first) == BOL ||
242                     (OP(first) == STAR && OP(NEXTOPER(first)) == ANY) ) {
243                         /* kinda turn .* into ^.* */
244                         r->reganch = ROPT_ANCH | ROPT_IMPLICIT;
245                         first = NEXTOPER(first);
246                         goto again;
247                 }
248                 if (sawplus && (!sawopen || !regsawback))
249                     r->reganch |= ROPT_SKIP;    /* x+ must match 1st of run */
250
251 #ifdef DEBUGGING
252                 if (debug & 512)
253                     fprintf(stderr,"first %d next %d offset %d\n",
254                       OP(first), OP(NEXTOPER(first)), first - scan);
255 #endif
256                 /*
257                  * If there's something expensive in the r.e., find the
258                  * longest literal string that must appear and make it the
259                  * regmust.  Resolve ties in favor of later strings, since
260                  * the regstart check works with the beginning of the r.e.
261                  * and avoiding duplication strengthens checking.  Not a
262                  * strong reason, but sufficient in the absence of others.
263                  * [Now we resolve ties in favor of the earlier string if
264                  * it happens that curback has been invalidated, since the
265                  * earlier string may buy us something the later one won't.]
266                  */
267                 longish = str_make("",0);
268                 longest = str_make("",0);
269                 len = 0;
270                 minlen = 0;
271                 curback = 0;
272                 backish = 0;
273                 backest = 0;
274                 while (OP(scan) != END) {
275                         if (OP(scan) == BRANCH) {
276                             if (OP(regnext(scan)) == BRANCH) {
277                                 curback = -30000;
278                                 while (OP(scan) == BRANCH)
279                                     scan = regnext(scan);
280                             }
281                             else        /* single branch is ok */
282                                 scan = NEXTOPER(scan);
283                         }
284                         if (OP(scan) == EXACTLY) {
285                             char *t;
286
287                             first = scan;
288                             while (OP(t = regnext(scan)) == CLOSE)
289                                 scan = t;
290                             minlen += *OPERAND(first);
291                             if (curback - backish == len) {
292                                 str_ncat(longish, OPERAND(first)+1,
293                                     *OPERAND(first));
294                                 len += *OPERAND(first);
295                                 curback += *OPERAND(first);
296                                 first = regnext(scan);
297                             }
298                             else if (*OPERAND(first) >= len + (curback >= 0)) {
299                                 len = *OPERAND(first);
300                                 str_nset(longish, OPERAND(first)+1,len);
301                                 backish = curback;
302                                 curback += len;
303                                 first = regnext(scan);
304                             }
305                             else
306                                 curback += *OPERAND(first);
307                         }
308                         else if (index(varies,OP(scan))) {
309                             curback = -30000;
310                             len = 0;
311                             if (longish->str_cur > longest->str_cur) {
312                                 str_sset(longest,longish);
313                                 backest = backish;
314                             }
315                             str_nset(longish,"",0);
316                             if (OP(scan) == PLUS &&
317                               index(simple,OP(NEXTOPER(scan))))
318                                 minlen++;
319                             else if (OP(scan) == CURLY &&
320                               index(simple,OP(NEXTOPER(scan)+4)))
321                                 minlen += ARG1(scan);
322                         }
323                         else if (index(simple,OP(scan))) {
324                             curback++;
325                             minlen++;
326                             len = 0;
327                             if (longish->str_cur > longest->str_cur) {
328                                 str_sset(longest,longish);
329                                 backest = backish;
330                             }
331                             str_nset(longish,"",0);
332                         }
333                         scan = regnext(scan);
334                 }
335
336                 /* Prefer earlier on tie, unless we can tail match latter */
337
338                 if (longish->str_cur + (OP(first) == EOL) > longest->str_cur) {
339                     str_sset(longest,longish);
340                     backest = backish;
341                 }
342                 else
343                     str_nset(longish,"",0);
344                 if (longest->str_cur
345                     &&
346                     (!r->regstart
347                      ||
348                      !fbminstr((unsigned char*) r->regstart->str_ptr,
349                           (unsigned char *) r->regstart->str_ptr
350                             + r->regstart->str_cur,
351                           longest)
352                     )
353                    )
354                 {
355                         r->regmust = longest;
356                         if (backest < 0)
357                                 backest = -1;
358                         r->regback = backest;
359                         if (longest->str_cur
360                           > !(sawstudy || fold || OP(first) == EOL) )
361                                 fbmcompile(r->regmust,fold);
362                         r->regmust->str_u.str_useful = 100;
363                         if (OP(first) == EOL && longish->str_cur)
364                             r->regmust->str_pok |= SP_TAIL;
365                 }
366                 else {
367                         str_free(longest);
368                         longest = Nullstr;
369                 }
370                 str_free(longish);
371         }
372
373         r->do_folding = fold;
374         r->nparens = regnpar - 1;
375         r->minlen = minlen;
376         Newz(1002, r->startp, regnpar, char*);
377         Newz(1002, r->endp, regnpar, char*);
378 #ifdef DEBUGGING
379         if (debug & 512)
380                 regdump(r);
381 #endif
382         return(r);
383 }
384
385 /*
386  - reg - regular expression, i.e. main body or parenthesized thing
387  *
388  * Caller must absorb opening parenthesis.
389  *
390  * Combining parenthesis handling with the base level of regular expression
391  * is a trifle forced, but the need to tie the tails of the branches to what
392  * follows makes it hard to avoid.
393  */
394 static char *
395 reg(paren, flagp)
396 int paren;                      /* Parenthesized? */
397 int *flagp;
398 {
399         register char *ret;
400         register char *br;
401         register char *ender;
402         register int parno;
403         int flags;
404
405         *flagp = HASWIDTH;      /* Tentatively. */
406
407         /* Make an OPEN node, if parenthesized. */
408         if (paren) {
409                 parno = regnpar;
410                 regnpar++;
411                 ret = reganode(OPEN, parno);
412         } else
413                 ret = NULL;
414
415         /* Pick up the branches, linking them together. */
416         br = regbranch(&flags);
417         if (br == NULL)
418                 return(NULL);
419         if (ret != NULL)
420                 regtail(ret, br);       /* OPEN -> first. */
421         else
422                 ret = br;
423         if (!(flags&HASWIDTH))
424                 *flagp &= ~HASWIDTH;
425         *flagp |= flags&SPSTART;
426         while (*regparse == '|') {
427                 regparse++;
428                 br = regbranch(&flags);
429                 if (br == NULL)
430                         return(NULL);
431                 regtail(ret, br);       /* BRANCH -> BRANCH. */
432                 if (!(flags&HASWIDTH))
433                         *flagp &= ~HASWIDTH;
434                 *flagp |= flags&SPSTART;
435         }
436
437         /* Make a closing node, and hook it on the end. */
438         if (paren)
439             ender = reganode(CLOSE, parno);
440         else
441             ender = regnode(END);
442         regtail(ret, ender);
443
444         /* Hook the tails of the branches to the closing node. */
445         for (br = ret; br != NULL; br = regnext(br))
446                 regoptail(br, ender);
447
448         /* Check for proper termination. */
449         if (paren && *regparse++ != ')') {
450                 FAIL("unmatched () in regexp");
451         } else if (!paren && regparse < regxend) {
452                 if (*regparse == ')') {
453                         FAIL("unmatched () in regexp");
454                 } else
455                         FAIL("junk on end of regexp");  /* "Can't happen". */
456                 /* NOTREACHED */
457         }
458
459         return(ret);
460 }
461
462 /*
463  - regbranch - one alternative of an | operator
464  *
465  * Implements the concatenation operator.
466  */
467 static char *
468 regbranch(flagp)
469 int *flagp;
470 {
471         register char *ret;
472         register char *chain;
473         register char *latest;
474         int flags;
475
476         *flagp = WORST;         /* Tentatively. */
477
478         ret = regnode(BRANCH);
479         chain = NULL;
480         while (regparse < regxend && *regparse != '|' && *regparse != ')') {
481                 latest = regpiece(&flags);
482                 if (latest == NULL)
483                         return(NULL);
484                 *flagp |= flags&HASWIDTH;
485                 if (chain == NULL)      /* First piece. */
486                         *flagp |= flags&SPSTART;
487                 else
488                         regtail(chain, latest);
489                 chain = latest;
490         }
491         if (chain == NULL)      /* Loop ran zero times. */
492                 (void) regnode(NOTHING);
493
494         return(ret);
495 }
496
497 /*
498  - regpiece - something followed by possible [*+?]
499  *
500  * Note that the branching code sequences used for ? and the general cases
501  * of * and + are somewhat optimized:  they use the same NOTHING node as
502  * both the endmarker for their branch list and the body of the last branch.
503  * It might seem that this node could be dispensed with entirely, but the
504  * endmarker role is not redundant.
505  */
506 static char *
507 regpiece(flagp)
508 int *flagp;
509 {
510         register char *ret;
511         register char op;
512         register char *next;
513         int flags;
514         char *origparse = regparse;
515         int orignpar = regnpar;
516         char *max;
517         int iter;
518         char ch;
519
520         ret = regatom(&flags);
521         if (ret == NULL)
522                 return(NULL);
523
524         op = *regparse;
525
526         /* Here's a total kludge: if after the atom there's a {\d+,?\d*}
527          * then we decrement the first number by one and reset our
528          * parsing back to the beginning of the same atom.  If the first number
529          * is down to 0, decrement the second number instead and fake up
530          * a ? after it.  Given the way this compiler doesn't keep track
531          * of offsets on the first pass, this is the only way to replicate
532          * a piece of code.  Sigh.
533          */
534         if (op == '{' && regcurly(regparse)) {
535             next = regparse + 1;
536             max = Nullch;
537             while (isDIGIT(*next) || *next == ',') {
538                 if (*next == ',') {
539                     if (max)
540                         break;
541                     else
542                         max = next;
543                 }
544                 next++;
545             }
546             if (*next == '}') {         /* got one */
547                 if (!max)
548                     max = next;
549                 regparse++;
550                 iter = atoi(regparse);
551                 if (flags&SIMPLE) {     /* we can do it right after all */
552                     int tmp;
553
554                     reginsert(CURLY, ret);
555                     if (iter > 0)
556                         *flagp = (WORST|HASWIDTH);
557                     if (*max == ',')
558                         max++;
559                     else
560                         max = regparse;
561                     tmp = atoi(max);
562                     if (tmp && tmp < iter)
563                         fatal("Can't do {n,m} with n > m");
564                     if (regcode != &regdummy) {
565 #ifdef REGALIGN
566                         *(unsigned short *)(ret+3) = iter;
567                         *(unsigned short *)(ret+5) = tmp;
568 #else
569                         ret[3] = iter >> 8; ret[4] = iter & 0377;
570                         ret[5] = tmp  >> 8; ret[6] = tmp  & 0377;
571 #endif
572                     }
573                     regparse = next;
574                     goto nest_check;
575                 }
576                 regsawbracket++;        /* remember we clobbered exp */
577                 if (iter > 0) {
578                     ch = *max;
579                     sprintf(regparse,"%.*d", max-regparse, iter - 1);
580                     *max = ch;
581                     if (*max == ',' && max[1] != '}') {
582                         if (atoi(max+1) <= 0)
583                             fatal("Can't do {n,m} with n > m");
584                         ch = *next;
585                         sprintf(max+1,"%.*d", next-(max+1), atoi(max+1) - 1);
586                         *next = ch;
587                     }
588                     if (iter != 1 || *max == ',') {
589                         regparse = origparse;   /* back up input pointer */
590                         regnpar = orignpar;     /* don't make more parens */
591                     }
592                     else {
593                         regparse = next;
594                         goto nest_check;
595                     }
596                     *flagp = flags;
597                     return ret;
598                 }
599                 if (*max == ',') {
600                     max++;
601                     iter = atoi(max);
602                     if (max == next) {          /* any number more? */
603                         regparse = next;
604                         op = '*';               /* fake up one with a star */
605                     }
606                     else if (iter > 0) {
607                         op = '?';               /* fake up optional atom */
608                         ch = *next;
609                         sprintf(max,"%.*d", next-max, iter - 1);
610                         *next = ch;
611                         if (iter == 1)
612                             regparse = next;
613                         else {
614                             regparse = origparse - 1; /* offset ++ below */
615                             regnpar = orignpar;
616                         }
617                     }
618                     else
619                         fatal("Can't do {n,0}");
620                 }
621                 else
622                     fatal("Can't do {0}");
623             }
624         }
625
626         if (!ISMULT1(op)) {
627                 *flagp = flags;
628                 return(ret);
629         }
630
631         if (!(flags&HASWIDTH) && op != '?')
632                 FAIL("regexp *+ operand could be empty");
633         *flagp = (op != '+') ? (WORST|SPSTART) : (WORST|HASWIDTH);
634
635         if (op == '*' && (flags&SIMPLE))
636                 reginsert(STAR, ret);
637         else if (op == '*') {
638                 /* Emit x* as (x&|), where & means "self". */
639                 reginsert(BRANCH, ret);                 /* Either x */
640                 regoptail(ret, regnode(BACK));          /* and loop */
641                 regoptail(ret, ret);                    /* back */
642                 regtail(ret, regnode(BRANCH));          /* or */
643                 regtail(ret, regnode(NOTHING));         /* null. */
644         } else if (op == '+' && (flags&SIMPLE))
645                 reginsert(PLUS, ret);
646         else if (op == '+') {
647                 /* Emit x+ as x(&|), where & means "self". */
648                 next = regnode(BRANCH);                 /* Either */
649                 regtail(ret, next);
650                 regtail(regnode(BACK), ret);            /* loop back */
651                 regtail(next, regnode(BRANCH));         /* or */
652                 regtail(ret, regnode(NOTHING));         /* null. */
653         } else if (op == '?') {
654                 /* Emit x? as (x|) */
655                 reginsert(BRANCH, ret);                 /* Either x */
656                 regtail(ret, regnode(BRANCH));          /* or */
657                 next = regnode(NOTHING);                /* null. */
658                 regtail(ret, next);
659                 regoptail(ret, next);
660         }
661       nest_check:
662         regparse++;
663         if (ISMULT2(regparse))
664                 FAIL("nested *?+ in regexp");
665
666         return(ret);
667 }
668
669 /*
670  - regatom - the lowest level
671  *
672  * Optimization:  gobbles an entire sequence of ordinary characters so that
673  * it can turn them into a single node, which is smaller to store and
674  * faster to run.  Backslashed characters are exceptions, each becoming a
675  * separate node; the code is simpler that way and it's not worth fixing.
676  *
677  * [Yes, it is worth fixing, some scripts can run twice the speed.]
678  */
679 static char *
680 regatom(flagp)
681 int *flagp;
682 {
683         register char *ret;
684         int flags;
685
686         *flagp = WORST;         /* Tentatively. */
687
688         switch (*regparse++) {
689         case '^':
690                 ret = regnode(BOL);
691                 break;
692         case '$':
693                 ret = regnode(EOL);
694                 break;
695         case '.':
696                 ret = regnode(ANY);
697                 *flagp |= HASWIDTH|SIMPLE;
698                 break;
699         case '[':
700                 ret = regclass();
701                 *flagp |= HASWIDTH|SIMPLE;
702                 break;
703         case '(':
704                 ret = reg(1, &flags);
705                 if (ret == NULL)
706                         return(NULL);
707                 *flagp |= flags&(HASWIDTH|SPSTART);
708                 break;
709         case '|':
710         case ')':
711                 FAIL("internal urp in regexp"); /* Supposed to be caught earlier. */
712                 break;
713         case '?':
714         case '+':
715         case '*':
716                 FAIL("?+* follows nothing in regexp");
717                 break;
718         case '\\':
719                 switch (*regparse) {
720                 case 'w':
721                         ret = regnode(ALNUM);
722                         *flagp |= HASWIDTH|SIMPLE;
723                         regparse++;
724                         break;
725                 case 'W':
726                         ret = regnode(NALNUM);
727                         *flagp |= HASWIDTH|SIMPLE;
728                         regparse++;
729                         break;
730                 case 'b':
731                         ret = regnode(BOUND);
732                         *flagp |= SIMPLE;
733                         regparse++;
734                         break;
735                 case 'B':
736                         ret = regnode(NBOUND);
737                         *flagp |= SIMPLE;
738                         regparse++;
739                         break;
740                 case 's':
741                         ret = regnode(SPACE);
742                         *flagp |= HASWIDTH|SIMPLE;
743                         regparse++;
744                         break;
745                 case 'S':
746                         ret = regnode(NSPACE);
747                         *flagp |= HASWIDTH|SIMPLE;
748                         regparse++;
749                         break;
750                 case 'd':
751                         ret = regnode(DIGIT);
752                         *flagp |= HASWIDTH|SIMPLE;
753                         regparse++;
754                         break;
755                 case 'D':
756                         ret = regnode(NDIGIT);
757                         *flagp |= HASWIDTH|SIMPLE;
758                         regparse++;
759                         break;
760                 case 'n':
761                 case 'r':
762                 case 't':
763                 case 'f':
764                 case 'e':
765                 case 'a':
766                 case 'x':
767                 case 'c':
768                 case '0':
769                         goto defchar;
770                 case '1': case '2': case '3': case '4':
771                 case '5': case '6': case '7': case '8': case '9':
772                         {
773                             int num = atoi(regparse);
774
775                             if (num > 9 && num >= regnpar)
776                                 goto defchar;
777                             else {
778                                 regsawback = 1;
779                                 ret = reganode(REF, num);
780                                 while (isDIGIT(*regparse))
781                                     regparse++;
782                                 *flagp |= SIMPLE;
783                             }
784                         }
785                         break;
786                 case '\0':
787                         if (regparse >= regxend)
788                             FAIL("trailing \\ in regexp");
789                         /* FALL THROUGH */
790                 default:
791                         goto defchar;
792                 }
793                 break;
794         default: {
795                         register int len;
796                         register char ender;
797                         register char *p;
798                         char *oldp;
799                         int numlen;
800
801                     defchar:
802                         ret = regnode(EXACTLY);
803                         regc(0);                /* save spot for len */
804                         for (len=0, p=regparse-1;
805                           len < 127 && p < regxend;
806                           len++)
807                         {
808                             oldp = p;
809                             switch (*p) {
810                             case '^':
811                             case '$':
812                             case '.':
813                             case '[':
814                             case '(':
815                             case ')':
816                             case '|':
817                                 goto loopdone;
818                             case '\\':
819                                 switch (*++p) {
820                                 case 'w':
821                                 case 'W':
822                                 case 'b':
823                                 case 'B':
824                                 case 's':
825                                 case 'S':
826                                 case 'd':
827                                 case 'D':
828                                     --p;
829                                     goto loopdone;
830                                 case 'n':
831                                         ender = '\n';
832                                         p++;
833                                         break;
834                                 case 'r':
835                                         ender = '\r';
836                                         p++;
837                                         break;
838                                 case 't':
839                                         ender = '\t';
840                                         p++;
841                                         break;
842                                 case 'f':
843                                         ender = '\f';
844                                         p++;
845                                         break;
846                                 case 'e':
847                                         ender = '\033';
848                                         p++;
849                                         break;
850                                 case 'a':
851                                         ender = '\007';
852                                         p++;
853                                         break;
854                                 case 'x':
855                                     ender = scanhex(++p, 2, &numlen);
856                                     p += numlen;
857                                     break;
858                                 case 'c':
859                                     p++;
860                                     ender = *p++;
861                                     if (isLOWER(ender))
862                                         ender = toupper(ender);
863                                     ender ^= 64;
864                                     break;
865                                 case '0': case '1': case '2': case '3':case '4':
866                                 case '5': case '6': case '7': case '8':case '9':
867                                     if (*p == '0' ||
868                                       (isDIGIT(p[1]) && atoi(p) >= regnpar) ) {
869                                         ender = scanoct(p, 3, &numlen);
870                                         p += numlen;
871                                     }
872                                     else {
873                                         --p;
874                                         goto loopdone;
875                                     }
876                                     break;
877                                 case '\0':
878                                     if (p >= regxend)
879                                         FAIL("trailing \\ in regexp");
880                                     /* FALL THROUGH */
881                                 default:
882                                     ender = *p++;
883                                     break;
884                                 }
885                                 break;
886                             default:
887                                 ender = *p++;
888                                 break;
889                             }
890                             if (regfold && isUPPER(ender))
891                                     ender = tolower(ender);
892                             if (ISMULT2(p)) { /* Back off on ?+*. */
893                                 if (len)
894                                     p = oldp;
895                                 else {
896                                     len++;
897                                     regc(ender);
898                                 }
899                                 break;
900                             }
901                             regc(ender);
902                         }
903                     loopdone:
904                         regparse = p;
905                         if (len <= 0)
906                                 FAIL("internal disaster in regexp");
907                         *flagp |= HASWIDTH;
908                         if (len == 1)
909                                 *flagp |= SIMPLE;
910                         if (regcode != &regdummy)
911                             *OPERAND(ret) = len;
912                         regc('\0');
913                 }
914                 break;
915         }
916
917         return(ret);
918 }
919
920 static void
921 regset(bits,def,c)
922 char *bits;
923 int def;
924 register int c;
925 {
926         if (regcode == &regdummy)
927             return;
928         c &= 255;
929         if (def)
930                 bits[c >> 3] &= ~(1 << (c & 7));
931         else
932                 bits[c >> 3] |=  (1 << (c & 7));
933 }
934
935 static char *
936 regclass()
937 {
938         register char *bits;
939         register int class;
940         register int lastclass;
941         register int range = 0;
942         register char *ret;
943         register int def;
944         int numlen;
945
946         ret = regnode(ANYOF);
947         if (*regparse == '^') { /* Complement of range. */
948                 regparse++;
949                 def = 0;
950         } else {
951                 def = 255;
952         }
953         bits = regcode;
954         for (class = 0; class < 32; class++)
955             regc(def);
956         if (*regparse == ']' || *regparse == '-')
957                 goto skipcond;          /* allow 1st char to be ] or - */
958         while (regparse < regxend && *regparse != ']') {
959               skipcond:
960                 class = UCHARAT(regparse++);
961                 if (class == '\\') {
962                         class = UCHARAT(regparse++);
963                         switch (class) {
964                         case 'w':
965                                 for (class = 'a'; class <= 'z'; class++)
966                                         regset(bits,def,class);
967                                 for (class = 'A'; class <= 'Z'; class++)
968                                         regset(bits,def,class);
969                                 for (class = '0'; class <= '9'; class++)
970                                         regset(bits,def,class);
971                                 regset(bits,def,'_');
972                                 lastclass = 1234;
973                                 continue;
974                         case 's':
975                                 regset(bits,def,' ');
976                                 regset(bits,def,'\t');
977                                 regset(bits,def,'\r');
978                                 regset(bits,def,'\f');
979                                 regset(bits,def,'\n');
980                                 lastclass = 1234;
981                                 continue;
982                         case 'd':
983                                 for (class = '0'; class <= '9'; class++)
984                                         regset(bits,def,class);
985                                 lastclass = 1234;
986                                 continue;
987                         case 'n':
988                                 class = '\n';
989                                 break;
990                         case 'r':
991                                 class = '\r';
992                                 break;
993                         case 't':
994                                 class = '\t';
995                                 break;
996                         case 'f':
997                                 class = '\f';
998                                 break;
999                         case 'b':
1000                                 class = '\b';
1001                                 break;
1002                         case 'e':
1003                                 class = '\033';
1004                                 break;
1005                         case 'a':
1006                                 class = '\007';
1007                                 break;
1008                         case 'x':
1009                                 class = scanhex(regparse, 2, &numlen);
1010                                 regparse += numlen;
1011                                 break;
1012                         case 'c':
1013                                 class = *regparse++;
1014                                 if (isLOWER(class))
1015                                     class = toupper(class);
1016                                 class ^= 64;
1017                                 break;
1018                         case '0': case '1': case '2': case '3': case '4':
1019                         case '5': case '6': case '7': case '8': case '9':
1020                                 class = scanoct(--regparse, 3, &numlen);
1021                                 regparse += numlen;
1022                                 break;
1023                         }
1024                 }
1025                 if (range) {
1026                         if (lastclass > class)
1027                                 FAIL("invalid [] range in regexp");
1028                         range = 0;
1029                 }
1030                 else {
1031                         lastclass = class;
1032                         if (*regparse == '-' && regparse+1 < regxend &&
1033                             regparse[1] != ']') {
1034                                 regparse++;
1035                                 range = 1;
1036                                 continue;       /* do it next time */
1037                         }
1038                 }
1039                 for ( ; lastclass <= class; lastclass++) {
1040                         regset(bits,def,lastclass);
1041                         if (regfold && isUPPER(lastclass))
1042                                 regset(bits,def,tolower(lastclass));
1043                 }
1044                 lastclass = class;
1045         }
1046         if (*regparse != ']')
1047                 FAIL("unmatched [] in regexp");
1048         regparse++;
1049         return ret;
1050 }
1051
1052 /*
1053  - regnode - emit a node
1054  */
1055 static char *                   /* Location. */
1056 regnode(op)
1057 char op;
1058 {
1059         register char *ret;
1060         register char *ptr;
1061
1062         ret = regcode;
1063         if (ret == &regdummy) {
1064 #ifdef REGALIGN
1065                 if (!(regsize & 1))
1066                         regsize++;
1067 #endif
1068                 regsize += 3;
1069                 return(ret);
1070         }
1071
1072 #ifdef REGALIGN
1073 #ifndef lint
1074         if (!((long)ret & 1))
1075             *ret++ = 127;
1076 #endif
1077 #endif
1078         ptr = ret;
1079         *ptr++ = op;
1080         *ptr++ = '\0';          /* Null "next" pointer. */
1081         *ptr++ = '\0';
1082         regcode = ptr;
1083
1084         return(ret);
1085 }
1086
1087 /*
1088  - reganode - emit a node with an argument
1089  */
1090 static char *                   /* Location. */
1091 reganode(op, arg)
1092 char op;
1093 unsigned short arg;
1094 {
1095         register char *ret;
1096         register char *ptr;
1097
1098         ret = regcode;
1099         if (ret == &regdummy) {
1100 #ifdef REGALIGN
1101                 if (!(regsize & 1))
1102                         regsize++;
1103 #endif
1104                 regsize += 5;
1105                 return(ret);
1106         }
1107
1108 #ifdef REGALIGN
1109 #ifndef lint
1110         if (!((long)ret & 1))
1111             *ret++ = 127;
1112 #endif
1113 #endif
1114         ptr = ret;
1115         *ptr++ = op;
1116         *ptr++ = '\0';          /* Null "next" pointer. */
1117         *ptr++ = '\0';
1118 #ifdef REGALIGN
1119         *(unsigned short *)(ret+3) = arg;
1120 #else
1121         ret[3] = arg >> 8; ret[4] = arg & 0377;
1122 #endif
1123         ptr += 2;
1124         regcode = ptr;
1125
1126         return(ret);
1127 }
1128
1129 /*
1130  - regc - emit (if appropriate) a byte of code
1131  */
1132 static void
1133 regc(b)
1134 char b;
1135 {
1136         if (regcode != &regdummy)
1137                 *regcode++ = b;
1138         else
1139                 regsize++;
1140 }
1141
1142 /*
1143  - reginsert - insert an operator in front of already-emitted operand
1144  *
1145  * Means relocating the operand.
1146  */
1147 static void
1148 reginsert(op, opnd)
1149 char op;
1150 char *opnd;
1151 {
1152         register char *src;
1153         register char *dst;
1154         register char *place;
1155         register offset = (op == CURLY ? 4 : 0);
1156
1157         if (regcode == &regdummy) {
1158 #ifdef REGALIGN
1159                 regsize += 4 + offset;
1160 #else
1161                 regsize += 3 + offset;
1162 #endif
1163                 return;
1164         }
1165
1166         src = regcode;
1167 #ifdef REGALIGN
1168         regcode += 4 + offset;
1169 #else
1170         regcode += 3 + offset;
1171 #endif
1172         dst = regcode;
1173         while (src > opnd)
1174                 *--dst = *--src;
1175
1176         place = opnd;           /* Op node, where operand used to be. */
1177         *place++ = op;
1178         *place++ = '\0';
1179         *place++ = '\0';
1180         while (offset-- > 0)
1181             *place++ = '\0';
1182 }
1183
1184 /*
1185  - regtail - set the next-pointer at the end of a node chain
1186  */
1187 static void
1188 regtail(p, val)
1189 char *p;
1190 char *val;
1191 {
1192         register char *scan;
1193         register char *temp;
1194         register int offset;
1195
1196         if (p == &regdummy)
1197                 return;
1198
1199         /* Find last node. */
1200         scan = p;
1201         for (;;) {
1202                 temp = regnext(scan);
1203                 if (temp == NULL)
1204                         break;
1205                 scan = temp;
1206         }
1207
1208 #ifdef REGALIGN
1209         offset = val - scan;
1210 #ifndef lint
1211         *(short*)(scan+1) = offset;
1212 #else
1213         offset = offset;
1214 #endif
1215 #else
1216         if (OP(scan) == BACK)
1217                 offset = scan - val;
1218         else
1219                 offset = val - scan;
1220         *(scan+1) = (offset>>8)&0377;
1221         *(scan+2) = offset&0377;
1222 #endif
1223 }
1224
1225 /*
1226  - regoptail - regtail on operand of first argument; nop if operandless
1227  */
1228 static void
1229 regoptail(p, val)
1230 char *p;
1231 char *val;
1232 {
1233         /* "Operandless" and "op != BRANCH" are synonymous in practice. */
1234         if (p == NULL || p == &regdummy || OP(p) != BRANCH)
1235                 return;
1236         regtail(NEXTOPER(p), val);
1237 }
1238
1239 /*
1240  - regcurly - a little FSA that accepts {\d+,?\d*}
1241  */
1242 STATIC int
1243 regcurly(s)
1244 register char *s;
1245 {
1246     if (*s++ != '{')
1247         return FALSE;
1248     if (!isDIGIT(*s))
1249         return FALSE;
1250     while (isDIGIT(*s))
1251         s++;
1252     if (*s == ',')
1253         s++;
1254     while (isDIGIT(*s))
1255         s++;
1256     if (*s != '}')
1257         return FALSE;
1258     return TRUE;
1259 }
1260
1261 #ifdef DEBUGGING
1262
1263 /*
1264  - regdump - dump a regexp onto stderr in vaguely comprehensible form
1265  */
1266 void
1267 regdump(r)
1268 regexp *r;
1269 {
1270         register char *s;
1271         register char op = EXACTLY;     /* Arbitrary non-END op. */
1272         register char *next;
1273
1274
1275         s = r->program + 1;
1276         while (op != END) {     /* While that wasn't END last time... */
1277 #ifdef REGALIGN
1278                 if (!((long)s & 1))
1279                         s++;
1280 #endif
1281                 op = OP(s);
1282                 fprintf(stderr,"%2d%s", s-r->program, regprop(s));      /* Where, what. */
1283                 next = regnext(s);
1284                 s += regarglen[op];
1285                 if (next == NULL)               /* Next ptr. */
1286                         fprintf(stderr,"(0)");
1287                 else 
1288                         fprintf(stderr,"(%d)", (s-r->program)+(next-s));
1289                 s += 3;
1290                 if (op == ANYOF) {
1291                         s += 32;
1292                 }
1293                 if (op == EXACTLY) {
1294                         /* Literal string, where present. */
1295                         s++;
1296                         while (*s != '\0') {
1297                                 (void)putchar(*s);
1298                                 s++;
1299                         }
1300                         s++;
1301                 }
1302                 (void)putchar('\n');
1303         }
1304
1305         /* Header fields of interest. */
1306         if (r->regstart)
1307                 fprintf(stderr,"start `%s' ", r->regstart->str_ptr);
1308         if (r->regstclass)
1309                 fprintf(stderr,"stclass `%s' ", regprop(r->regstclass));
1310         if (r->reganch & ROPT_ANCH)
1311                 fprintf(stderr,"anchored ");
1312         if (r->reganch & ROPT_SKIP)
1313                 fprintf(stderr,"plus ");
1314         if (r->reganch & ROPT_IMPLICIT)
1315                 fprintf(stderr,"implicit ");
1316         if (r->regmust != NULL)
1317                 fprintf(stderr,"must have \"%s\" back %d ", r->regmust->str_ptr,
1318                   r->regback);
1319         fprintf(stderr, "minlen %d ", r->minlen);
1320         fprintf(stderr,"\n");
1321 }
1322
1323 /*
1324  - regprop - printable representation of opcode
1325  */
1326 char *
1327 regprop(op)
1328 char *op;
1329 {
1330         register char *p;
1331
1332         (void) strcpy(buf, ":");
1333
1334         switch (OP(op)) {
1335         case BOL:
1336                 p = "BOL";
1337                 break;
1338         case EOL:
1339                 p = "EOL";
1340                 break;
1341         case ANY:
1342                 p = "ANY";
1343                 break;
1344         case ANYOF:
1345                 p = "ANYOF";
1346                 break;
1347         case BRANCH:
1348                 p = "BRANCH";
1349                 break;
1350         case EXACTLY:
1351                 p = "EXACTLY";
1352                 break;
1353         case NOTHING:
1354                 p = "NOTHING";
1355                 break;
1356         case BACK:
1357                 p = "BACK";
1358                 break;
1359         case END:
1360                 p = "END";
1361                 break;
1362         case ALNUM:
1363                 p = "ALNUM";
1364                 break;
1365         case NALNUM:
1366                 p = "NALNUM";
1367                 break;
1368         case BOUND:
1369                 p = "BOUND";
1370                 break;
1371         case NBOUND:
1372                 p = "NBOUND";
1373                 break;
1374         case SPACE:
1375                 p = "SPACE";
1376                 break;
1377         case NSPACE:
1378                 p = "NSPACE";
1379                 break;
1380         case DIGIT:
1381                 p = "DIGIT";
1382                 break;
1383         case NDIGIT:
1384                 p = "NDIGIT";
1385                 break;
1386         case CURLY:
1387                 (void)sprintf(buf+strlen(buf), "CURLY {%d,%d}",
1388                     ARG1(op),ARG2(op));
1389                 p = NULL;
1390                 break;
1391         case REF:
1392                 (void)sprintf(buf+strlen(buf), "REF%d", ARG1(op));
1393                 p = NULL;
1394                 break;
1395         case OPEN:
1396                 (void)sprintf(buf+strlen(buf), "OPEN%d", ARG1(op));
1397                 p = NULL;
1398                 break;
1399         case CLOSE:
1400                 (void)sprintf(buf+strlen(buf), "CLOSE%d", ARG1(op));
1401                 p = NULL;
1402                 break;
1403         case STAR:
1404                 p = "STAR";
1405                 break;
1406         case PLUS:
1407                 p = "PLUS";
1408                 break;
1409         default:
1410                 FAIL("corrupted regexp opcode");
1411         }
1412         if (p != NULL)
1413                 (void) strcat(buf, p);
1414         return(buf);
1415 }
1416 #endif /* DEBUGGING */
1417
1418 regfree(r)
1419 struct regexp *r;
1420 {
1421         if (r->precomp) {
1422                 Safefree(r->precomp);
1423                 r->precomp = Nullch;
1424         }
1425         if (r->subbase) {
1426                 Safefree(r->subbase);
1427                 r->subbase = Nullch;
1428         }
1429         if (r->regmust) {
1430                 str_free(r->regmust);
1431                 r->regmust = Nullstr;
1432         }
1433         if (r->regstart) {
1434                 str_free(r->regstart);
1435                 r->regstart = Nullstr;
1436         }
1437         Safefree(r->startp);
1438         Safefree(r->endp);
1439         Safefree(r);
1440 }