perl 3.0 patch #38 (combined patch)
[perl.git] / regcomp.c
1 /* NOTE: this is derived from Henry Spencer's regexp code, and should not
2  * confused with the original package (see point 3 below).  Thanks, Henry!
3  */
4
5 /* Additional note: this code is very heavily munged from Henry's version
6  * in places.  In some spots I've traded clarity for efficiency, so don't
7  * blame Henry for some of the lack of readability.
8  */
9
10 /* $Header: regcomp.c,v 3.0.1.7 90/10/20 02:18:32 lwall Locked $
11  *
12  * $Log:        regcomp.c,v $
13  * Revision 3.0.1.7  90/10/20  02:18:32  lwall
14  * patch37: /foo.*bar$/ wrongly optimized to do tail matching on "foo"
15  * 
16  * Revision 3.0.1.6  90/10/16  10:17:33  lwall
17  * patch29: patterns with multiple short literal strings sometimes failed
18  * 
19  * Revision 3.0.1.5  90/08/13  22:23:29  lwall
20  * patch28: /x{m}/ didn't work right
21  * 
22  * Revision 3.0.1.4  90/08/09  05:05:33  lwall
23  * patch19: sped up /x+y/ patterns greatly by not retrying on every x
24  * patch19: inhibited backoff on patterns anchored to the end like /\s+$/
25  * patch19: sped up {m,n} on simple items
26  * patch19: optimized /.*whatever/ to /^.*whatever/
27  * patch19: fixed character classes to allow backslashing hyphen
28  * 
29  * Revision 3.0.1.3  90/03/12  16:59:22  lwall
30  * patch13: pattern matches can now use \0 to mean \000
31  * 
32  * Revision 3.0.1.2  90/02/28  18:08:35  lwall
33  * patch9: /[\200-\377]/ didn't work on machines with signed chars
34  * 
35  * Revision 3.0.1.1  89/11/11  04:51:04  lwall
36  * patch2: /[\000]/ didn't work
37  * 
38  * Revision 3.0  89/10/18  15:22:29  lwall
39  * 3.0 baseline
40  * 
41  */
42
43 /*
44  * regcomp and regexec -- regsub and regerror are not used in perl
45  *
46  *      Copyright (c) 1986 by University of Toronto.
47  *      Written by Henry Spencer.  Not derived from licensed software.
48  *
49  *      Permission is granted to anyone to use this software for any
50  *      purpose on any computer system, and to redistribute it freely,
51  *      subject to the following restrictions:
52  *
53  *      1. The author is not responsible for the consequences of use of
54  *              this software, no matter how awful, even if they arise
55  *              from defects in it.
56  *
57  *      2. The origin of this software must not be misrepresented, either
58  *              by explicit claim or by omission.
59  *
60  *      3. Altered versions must be plainly marked as such, and must not
61  *              be misrepresented as being the original software.
62  *
63  *
64  ****    Alterations to Henry's code are...
65  ****
66  ****    Copyright (c) 1989, Larry Wall
67  ****
68  ****    You may distribute under the terms of the GNU General Public License
69  ****    as specified in the README file that comes with the perl 3.0 kit.
70  *
71  * Beware that some of this code is subtly aware of the way operator
72  * precedence is structured in regular expressions.  Serious changes in
73  * regular-expression syntax might require a total rethink.
74  */
75 #include "EXTERN.h"
76 #include "perl.h"
77 #include "INTERN.h"
78 #include "regcomp.h"
79
80 #ifndef STATIC
81 #define STATIC  static
82 #endif
83
84 #define ISMULT1(c)      ((c) == '*' || (c) == '+' || (c) == '?')
85 #define ISMULT2(s)      ((*s) == '*' || (*s) == '+' || (*s) == '?' || \
86         ((*s) == '{' && regcurly(s)))
87 #define META    "^$.[()|?+*\\"
88
89 /*
90  * Flags to be passed up and down.
91  */
92 #define HASWIDTH        01      /* Known never to match null string. */
93 #define SIMPLE          02      /* Simple enough to be STAR/PLUS operand. */
94 #define SPSTART         04      /* Starts with * or +. */
95 #define WORST           0       /* Worst case. */
96
97 /*
98  * Global work variables for regcomp().
99  */
100 static char *regprecomp;                /* uncompiled string. */
101 static char *regparse;          /* Input-scan pointer. */
102 static char *regxend;           /* End of input for compile */
103 static int regnpar;             /* () count. */
104 static char *regcode;           /* Code-emit pointer; &regdummy = don't. */
105 static long regsize;            /* Code size. */
106 static int regfold;
107 static int regsawbracket;       /* Did we do {d,d} trick? */
108
109 /*
110  * Forward declarations for regcomp()'s friends.
111  */
112 STATIC int regcurly();
113 STATIC char *reg();
114 STATIC char *regbranch();
115 STATIC char *regpiece();
116 STATIC char *regatom();
117 STATIC char *regclass();
118 STATIC char *regnode();
119 STATIC void regc();
120 STATIC void reginsert();
121 STATIC void regtail();
122 STATIC void regoptail();
123
124 /*
125  - regcomp - compile a regular expression into internal code
126  *
127  * We can't allocate space until we know how big the compiled form will be,
128  * but we can't compile it (and thus know how big it is) until we've got a
129  * place to put the code.  So we cheat:  we compile it twice, once with code
130  * generation turned off and size counting turned on, and once "for real".
131  * This also means that we don't allocate space until we are sure that the
132  * thing really will compile successfully, and we never have to move the
133  * code and thus invalidate pointers into it.  (Note that it has to be in
134  * one piece because free() must be able to free it all.) [NB: not true in perl]
135  *
136  * Beware that the optimization-preparation code in here knows about some
137  * of the structure of the compiled regexp.  [I'll say.]
138  */
139 regexp *
140 regcomp(exp,xend,fold)
141 char *exp;
142 char *xend;
143 int fold;
144 {
145         register regexp *r;
146         register char *scan;
147         register STR *longish;
148         STR *longest;
149         register int len;
150         register char *first;
151         int flags;
152         int back;
153         int curback;
154         extern char *safemalloc();
155         extern char *savestr();
156         int sawplus = 0;
157
158         if (exp == NULL)
159                 fatal("NULL regexp argument");
160
161         /* First pass: determine size, legality. */
162         regfold = fold;
163         regparse = exp;
164         regxend = xend;
165         regprecomp = nsavestr(exp,xend-exp);
166         regsawbracket = 0;
167         regnpar = 1;
168         regsize = 0L;
169         regcode = &regdummy;
170         regc(MAGIC);
171         if (reg(0, &flags) == NULL) {
172                 Safefree(regprecomp);
173                 return(NULL);
174         }
175
176         /* Small enough for pointer-storage convention? */
177         if (regsize >= 32767L)          /* Probably could be 65535L. */
178                 FAIL("regexp too big");
179
180         /* Allocate space. */
181         Newc(1001, r, sizeof(regexp) + (unsigned)regsize, char, regexp);
182         if (r == NULL)
183                 FAIL("regexp out of space");
184
185         /* Second pass: emit code. */
186         if (regsawbracket)
187             bcopy(regprecomp,exp,xend-exp);
188         r->precomp = regprecomp;
189         r->subbase = NULL;
190         regparse = exp;
191         regnpar = 1;
192         regcode = r->program;
193         regc(MAGIC);
194         if (reg(0, &flags) == NULL)
195                 return(NULL);
196
197         /* Dig out information for optimizations. */
198         r->regstart = Nullstr;  /* Worst-case defaults. */
199         r->reganch = 0;
200         r->regmust = Nullstr;
201         r->regback = -1;
202         r->regstclass = Nullch;
203         scan = r->program+1;                    /* First BRANCH. */
204         if (OP(regnext(scan)) == END) {/* Only one top-level choice. */
205                 scan = NEXTOPER(scan);
206
207                 first = scan;
208                 while ((OP(first) > OPEN && OP(first) < CLOSE) ||
209                     (OP(first) == BRANCH && OP(regnext(first)) != BRANCH) ||
210                     (OP(first) == PLUS) ||
211                     (OP(first) == CURLY && ARG1(first) > 0) ) {
212                         if (OP(first) == CURLY)
213                             first += 4;
214                         else if (OP(first) == PLUS)
215                             sawplus = 2;
216                         first = NEXTOPER(first);
217                 }
218
219                 /* Starting-point info. */
220                 if (OP(first) == EXACTLY) {
221                         r->regstart =
222                             str_make(OPERAND(first)+1,*OPERAND(first));
223                         if (r->regstart->str_cur > !(sawstudy|fold))
224                                 fbmcompile(r->regstart,fold);
225                 }
226                 else if ((exp = index(simple,OP(first))) && exp > simple)
227                         r->regstclass = first;
228                 else if (OP(first) == BOUND || OP(first) == NBOUND)
229                         r->regstclass = first;
230                 else if (OP(first) == BOL ||
231                     (OP(first) == STAR && OP(NEXTOPER(first)) == ANY) )
232                         r->reganch = 1;         /* kinda turn .* into ^.* */
233                 r->reganch |= sawplus;
234
235 #ifdef DEBUGGING
236                 if (debug & 512)
237                     fprintf(stderr,"first %d next %d offset %d\n",
238                       OP(first), OP(NEXTOPER(first)), first - scan);
239 #endif
240                 /*
241                  * If there's something expensive in the r.e., find the
242                  * longest literal string that must appear and make it the
243                  * regmust.  Resolve ties in favor of later strings, since
244                  * the regstart check works with the beginning of the r.e.
245                  * and avoiding duplication strengthens checking.  Not a
246                  * strong reason, but sufficient in the absence of others.
247                  * [Now we resolve ties in favor of the earlier string if
248                  * it happens that curback has been invalidated, since the
249                  * earlier string may buy us something the later one won't.]
250                  */
251                 longish = str_make("",0);
252                 longest = str_make("",0);
253                 len = 0;
254                 curback = 0;
255                 back = 0;
256                 while (OP(scan) != END) {
257                         if (OP(scan) == BRANCH) {
258                             if (OP(regnext(scan)) == BRANCH) {
259                                 curback = -30000;
260                                 while (OP(scan) == BRANCH)
261                                     scan = regnext(scan);
262                             }
263                             else        /* single branch is ok */
264                                 scan = NEXTOPER(scan);
265                         }
266                         if (OP(scan) == EXACTLY) {
267                             first = scan;
268                             while (OP(regnext(scan)) >= CLOSE)
269                                 scan = regnext(scan);
270                             if (curback - back == len) {
271                                 str_ncat(longish, OPERAND(first)+1,
272                                     *OPERAND(first));
273                                 len += *OPERAND(first);
274                                 curback += *OPERAND(first);
275                                 first = regnext(scan);
276                             }
277                             else if (*OPERAND(first) >= len + (curback >= 0)) {
278                                 len = *OPERAND(first);
279                                 str_nset(longish, OPERAND(first)+1,len);
280                                 back = curback;
281                                 curback += len;
282                                 first = regnext(scan);
283                             }
284                             else
285                                 curback += *OPERAND(first);
286                         }
287                         else if (index(varies,OP(scan))) {
288                             curback = -30000;
289                             len = 0;
290                             if (longish->str_cur > longest->str_cur)
291                                 str_sset(longest,longish);
292                             str_nset(longish,"",0);
293                         }
294                         else if (index(simple,OP(scan))) {
295                             curback++;
296                             len = 0;
297                             if (longish->str_cur > longest->str_cur)
298                                 str_sset(longest,longish);
299                             str_nset(longish,"",0);
300                         }
301                         scan = regnext(scan);
302                 }
303
304                 /* Prefer earlier on tie, unless we can tail match latter */
305
306                 if (longish->str_cur + (OP(first) == EOL) > longest->str_cur)
307                     str_sset(longest,longish);
308                 else
309                     str_nset(longish,"",0);
310                 if (longest->str_cur) {
311                         r->regmust = longest;
312                         if (back < 0)
313                                 back = -1;
314                         r->regback = back;
315                         if (longest->str_cur
316                           > !(sawstudy || fold || OP(first) == EOL) )
317                                 fbmcompile(r->regmust,fold);
318                         r->regmust->str_u.str_useful = 100;
319                         if (OP(first) == EOL && longish->str_cur)
320                             r->regmust->str_pok |= SP_TAIL;
321                 }
322                 else
323                         str_free(longest);
324                 str_free(longish);
325         }
326
327         r->do_folding = fold;
328         r->nparens = regnpar - 1;
329 #ifdef DEBUGGING
330         if (debug & 512)
331                 regdump(r);
332 #endif
333         return(r);
334 }
335
336 /*
337  - reg - regular expression, i.e. main body or parenthesized thing
338  *
339  * Caller must absorb opening parenthesis.
340  *
341  * Combining parenthesis handling with the base level of regular expression
342  * is a trifle forced, but the need to tie the tails of the branches to what
343  * follows makes it hard to avoid.
344  */
345 static char *
346 reg(paren, flagp)
347 int paren;                      /* Parenthesized? */
348 int *flagp;
349 {
350         register char *ret;
351         register char *br;
352         register char *ender;
353         register int parno;
354         int flags;
355
356         *flagp = HASWIDTH;      /* Tentatively. */
357
358         /* Make an OPEN node, if parenthesized. */
359         if (paren) {
360                 if (regnpar >= NSUBEXP)
361                         FAIL("too many () in regexp");
362                 parno = regnpar;
363                 regnpar++;
364                 ret = regnode(OPEN+parno);
365         } else
366                 ret = NULL;
367
368         /* Pick up the branches, linking them together. */
369         br = regbranch(&flags);
370         if (br == NULL)
371                 return(NULL);
372         if (ret != NULL)
373                 regtail(ret, br);       /* OPEN -> first. */
374         else
375                 ret = br;
376         if (!(flags&HASWIDTH))
377                 *flagp &= ~HASWIDTH;
378         *flagp |= flags&SPSTART;
379         while (*regparse == '|') {
380                 regparse++;
381                 br = regbranch(&flags);
382                 if (br == NULL)
383                         return(NULL);
384                 regtail(ret, br);       /* BRANCH -> BRANCH. */
385                 if (!(flags&HASWIDTH))
386                         *flagp &= ~HASWIDTH;
387                 *flagp |= flags&SPSTART;
388         }
389
390         /* Make a closing node, and hook it on the end. */
391         ender = regnode((paren) ? CLOSE+parno : END);   
392         regtail(ret, ender);
393
394         /* Hook the tails of the branches to the closing node. */
395         for (br = ret; br != NULL; br = regnext(br))
396                 regoptail(br, ender);
397
398         /* Check for proper termination. */
399         if (paren && *regparse++ != ')') {
400                 FAIL("unmatched () in regexp");
401         } else if (!paren && regparse < regxend) {
402                 if (*regparse == ')') {
403                         FAIL("unmatched () in regexp");
404                 } else
405                         FAIL("junk on end of regexp");  /* "Can't happen". */
406                 /* NOTREACHED */
407         }
408
409         return(ret);
410 }
411
412 /*
413  - regbranch - one alternative of an | operator
414  *
415  * Implements the concatenation operator.
416  */
417 static char *
418 regbranch(flagp)
419 int *flagp;
420 {
421         register char *ret;
422         register char *chain;
423         register char *latest;
424         int flags;
425
426         *flagp = WORST;         /* Tentatively. */
427
428         ret = regnode(BRANCH);
429         chain = NULL;
430         while (regparse < regxend && *regparse != '|' && *regparse != ')') {
431                 latest = regpiece(&flags);
432                 if (latest == NULL)
433                         return(NULL);
434                 *flagp |= flags&HASWIDTH;
435                 if (chain == NULL)      /* First piece. */
436                         *flagp |= flags&SPSTART;
437                 else
438                         regtail(chain, latest);
439                 chain = latest;
440         }
441         if (chain == NULL)      /* Loop ran zero times. */
442                 (void) regnode(NOTHING);
443
444         return(ret);
445 }
446
447 /*
448  - regpiece - something followed by possible [*+?]
449  *
450  * Note that the branching code sequences used for ? and the general cases
451  * of * and + are somewhat optimized:  they use the same NOTHING node as
452  * both the endmarker for their branch list and the body of the last branch.
453  * It might seem that this node could be dispensed with entirely, but the
454  * endmarker role is not redundant.
455  */
456 static char *
457 regpiece(flagp)
458 int *flagp;
459 {
460         register char *ret;
461         register char op;
462         register char *next;
463         int flags;
464         char *origparse = regparse;
465         int orignpar = regnpar;
466         char *max;
467         int iter;
468         char ch;
469
470         ret = regatom(&flags);
471         if (ret == NULL)
472                 return(NULL);
473
474         op = *regparse;
475
476         /* Here's a total kludge: if after the atom there's a {\d+,?\d*}
477          * then we decrement the first number by one and reset our
478          * parsing back to the beginning of the same atom.  If the first number
479          * is down to 0, decrement the second number instead and fake up
480          * a ? after it.  Given the way this compiler doesn't keep track
481          * of offsets on the first pass, this is the only way to replicate
482          * a piece of code.  Sigh.
483          */
484         if (op == '{' && regcurly(regparse)) {
485             next = regparse + 1;
486             max = Nullch;
487             while (isdigit(*next) || *next == ',') {
488                 if (*next == ',') {
489                     if (max)
490                         break;
491                     else
492                         max = next;
493                 }
494                 next++;
495             }
496             if (*next == '}') {         /* got one */
497                 if (!max)
498                     max = next;
499                 regparse++;
500                 iter = atoi(regparse);
501                 if (flags&SIMPLE) {     /* we can do it right after all */
502                     int tmp;
503
504                     reginsert(CURLY, ret);
505                     if (*max == ',')
506                         max++;
507                     else
508                         max = regparse;
509                     tmp = atoi(max);
510                     if (tmp && tmp < iter)
511                         fatal("Can't do {n,m} with n > m");
512                     if (regcode != &regdummy) {
513 #ifdef REGALIGN
514                         *(unsigned short *)(ret+3) = iter;
515                         *(unsigned short *)(ret+5) = tmp;
516 #else
517                         ret[3] = iter >> 8; ret[4] = iter & 0377;
518                         ret[5] = tmp  >> 8; ret[6] = tmp  & 0377;
519 #endif
520                     }
521                     regparse = next;
522                     goto nest_check;
523                 }
524                 regsawbracket++;        /* remember we clobbered exp */
525                 if (iter > 0) {
526                     ch = *max;
527                     sprintf(regparse,"%.*d", max-regparse, iter - 1);
528                     *max = ch;
529                     if (*max == ',' && max[1] != '}') {
530                         if (atoi(max+1) <= 0)
531                             fatal("Can't do {n,m} with n > m");
532                         ch = *next;
533                         sprintf(max+1,"%.*d", next-(max+1), atoi(max+1) - 1);
534                         *next = ch;
535                     }
536                     if (iter != 1 || *max == ',') {
537                         regparse = origparse;   /* back up input pointer */
538                         regnpar = orignpar;     /* don't make more parens */
539                     }
540                     else {
541                         regparse = next;
542                         goto nest_check;
543                     }
544                     *flagp = flags;
545                     return ret;
546                 }
547                 if (*max == ',') {
548                     max++;
549                     iter = atoi(max);
550                     if (max == next) {          /* any number more? */
551                         regparse = next;
552                         op = '*';               /* fake up one with a star */
553                     }
554                     else if (iter > 0) {
555                         op = '?';               /* fake up optional atom */
556                         ch = *next;
557                         sprintf(max,"%.*d", next-max, iter - 1);
558                         *next = ch;
559                         if (iter == 1)
560                             regparse = next;
561                         else {
562                             regparse = origparse - 1; /* offset ++ below */
563                             regnpar = orignpar;
564                         }
565                     }
566                     else
567                         fatal("Can't do {n,0}");
568                 }
569                 else
570                     fatal("Can't do {0}");
571             }
572         }
573
574         if (!ISMULT1(op)) {
575                 *flagp = flags;
576                 return(ret);
577         }
578
579         if (!(flags&HASWIDTH) && op != '?')
580                 FAIL("regexp *+ operand could be empty");
581         *flagp = (op != '+') ? (WORST|SPSTART) : (WORST|HASWIDTH);
582
583         if (op == '*' && (flags&SIMPLE))
584                 reginsert(STAR, ret);
585         else if (op == '*') {
586                 /* Emit x* as (x&|), where & means "self". */
587                 reginsert(BRANCH, ret);                 /* Either x */
588                 regoptail(ret, regnode(BACK));          /* and loop */
589                 regoptail(ret, ret);                    /* back */
590                 regtail(ret, regnode(BRANCH));          /* or */
591                 regtail(ret, regnode(NOTHING));         /* null. */
592         } else if (op == '+' && (flags&SIMPLE))
593                 reginsert(PLUS, ret);
594         else if (op == '+') {
595                 /* Emit x+ as x(&|), where & means "self". */
596                 next = regnode(BRANCH);                 /* Either */
597                 regtail(ret, next);
598                 regtail(regnode(BACK), ret);            /* loop back */
599                 regtail(next, regnode(BRANCH));         /* or */
600                 regtail(ret, regnode(NOTHING));         /* null. */
601         } else if (op == '?') {
602                 /* Emit x? as (x|) */
603                 reginsert(BRANCH, ret);                 /* Either x */
604                 regtail(ret, regnode(BRANCH));          /* or */
605                 next = regnode(NOTHING);                /* null. */
606                 regtail(ret, next);
607                 regoptail(ret, next);
608         }
609       nest_check:
610         regparse++;
611         if (ISMULT2(regparse))
612                 FAIL("nested *?+ in regexp");
613
614         return(ret);
615 }
616
617 /*
618  - regatom - the lowest level
619  *
620  * Optimization:  gobbles an entire sequence of ordinary characters so that
621  * it can turn them into a single node, which is smaller to store and
622  * faster to run.  Backslashed characters are exceptions, each becoming a
623  * separate node; the code is simpler that way and it's not worth fixing.
624  *
625  * [Yes, it is worth fixing, some scripts can run twice the speed.]
626  */
627 static char *
628 regatom(flagp)
629 int *flagp;
630 {
631         register char *ret;
632         int flags;
633
634         *flagp = WORST;         /* Tentatively. */
635
636         switch (*regparse++) {
637         case '^':
638                 ret = regnode(BOL);
639                 break;
640         case '$':
641                 ret = regnode(EOL);
642                 break;
643         case '.':
644                 ret = regnode(ANY);
645                 *flagp |= HASWIDTH|SIMPLE;
646                 break;
647         case '[':
648                 ret = regclass();
649                 *flagp |= HASWIDTH|SIMPLE;
650                 break;
651         case '(':
652                 ret = reg(1, &flags);
653                 if (ret == NULL)
654                         return(NULL);
655                 *flagp |= flags&(HASWIDTH|SPSTART);
656                 break;
657         case '|':
658         case ')':
659                 FAIL("internal urp in regexp"); /* Supposed to be caught earlier. */
660                 break;
661         case '?':
662         case '+':
663         case '*':
664                 FAIL("?+* follows nothing in regexp");
665                 break;
666         case '\\':
667                 switch (*regparse) {
668                 case 'w':
669                         ret = regnode(ALNUM);
670                         *flagp |= HASWIDTH|SIMPLE;
671                         regparse++;
672                         break;
673                 case 'W':
674                         ret = regnode(NALNUM);
675                         *flagp |= HASWIDTH|SIMPLE;
676                         regparse++;
677                         break;
678                 case 'b':
679                         ret = regnode(BOUND);
680                         *flagp |= SIMPLE;
681                         regparse++;
682                         break;
683                 case 'B':
684                         ret = regnode(NBOUND);
685                         *flagp |= SIMPLE;
686                         regparse++;
687                         break;
688                 case 's':
689                         ret = regnode(SPACE);
690                         *flagp |= HASWIDTH|SIMPLE;
691                         regparse++;
692                         break;
693                 case 'S':
694                         ret = regnode(NSPACE);
695                         *flagp |= HASWIDTH|SIMPLE;
696                         regparse++;
697                         break;
698                 case 'd':
699                         ret = regnode(DIGIT);
700                         *flagp |= HASWIDTH|SIMPLE;
701                         regparse++;
702                         break;
703                 case 'D':
704                         ret = regnode(NDIGIT);
705                         *flagp |= HASWIDTH|SIMPLE;
706                         regparse++;
707                         break;
708                 case 'n':
709                 case 'r':
710                 case 't':
711                 case 'f':
712                         goto defchar;
713                 case '0': case '1': case '2': case '3': case '4':
714                 case '5': case '6': case '7': case '8': case '9':
715                         if (isdigit(regparse[1]) || *regparse == '0')
716                                 goto defchar;
717                         else {
718                                 ret = regnode(REF + *regparse++ - '0');
719                                 *flagp |= SIMPLE;
720                         }
721                         break;
722                 case '\0':
723                         if (regparse >= regxend)
724                             FAIL("trailing \\ in regexp");
725                         /* FALL THROUGH */
726                 default:
727                         goto defchar;
728                 }
729                 break;
730         default: {
731                         register int len;
732                         register char ender;
733                         register char *p;
734                         char *oldp;
735                         int foo;
736
737                     defchar:
738                         ret = regnode(EXACTLY);
739                         regc(0);                /* save spot for len */
740                         for (len=0, p=regparse-1;
741                           len < 127 && p < regxend;
742                           len++)
743                         {
744                             oldp = p;
745                             switch (*p) {
746                             case '^':
747                             case '$':
748                             case '.':
749                             case '[':
750                             case '(':
751                             case ')':
752                             case '|':
753                                 goto loopdone;
754                             case '\\':
755                                 switch (*++p) {
756                                 case 'w':
757                                 case 'W':
758                                 case 'b':
759                                 case 'B':
760                                 case 's':
761                                 case 'S':
762                                 case 'd':
763                                 case 'D':
764                                     --p;
765                                     goto loopdone;
766                                 case 'n':
767                                         ender = '\n';
768                                         p++;
769                                         break;
770                                 case 'r':
771                                         ender = '\r';
772                                         p++;
773                                         break;
774                                 case 't':
775                                         ender = '\t';
776                                         p++;
777                                         break;
778                                 case 'f':
779                                         ender = '\f';
780                                         p++;
781                                         break;
782                                 case '0': case '1': case '2': case '3':case '4':
783                                 case '5': case '6': case '7': case '8':case '9':
784                                     if (isdigit(p[1]) || *p == '0') {
785                                         foo = *p - '0';
786                                         if (isdigit(p[1]))
787                                             foo = (foo<<3) + *++p - '0';
788                                         if (isdigit(p[1]))
789                                             foo = (foo<<3) + *++p - '0';
790                                         ender = foo;
791                                         p++;
792                                     }
793                                     else {
794                                         --p;
795                                         goto loopdone;
796                                     }
797                                     break;
798                                 case '\0':
799                                     if (p >= regxend)
800                                         FAIL("trailing \\ in regexp");
801                                     /* FALL THROUGH */
802                                 default:
803                                     ender = *p++;
804                                     break;
805                                 }
806                                 break;
807                             default:
808                                 ender = *p++;
809                                 break;
810                             }
811                             if (regfold && isupper(ender))
812                                     ender = tolower(ender);
813                             if (ISMULT2(p)) { /* Back off on ?+*. */
814                                 if (len)
815                                     p = oldp;
816                                 else {
817                                     len++;
818                                     regc(ender);
819                                 }
820                                 break;
821                             }
822                             regc(ender);
823                         }
824                     loopdone:
825                         regparse = p;
826                         if (len <= 0)
827                                 FAIL("internal disaster in regexp");
828                         *flagp |= HASWIDTH;
829                         if (len == 1)
830                                 *flagp |= SIMPLE;
831                         if (regcode != &regdummy)
832                             *OPERAND(ret) = len;
833                         regc('\0');
834                 }
835                 break;
836         }
837
838         return(ret);
839 }
840
841 static void
842 regset(bits,def,c)
843 char *bits;
844 int def;
845 register int c;
846 {
847         if (regcode == &regdummy)
848             return;
849         c &= 255;
850         if (def)
851                 bits[c >> 3] &= ~(1 << (c & 7));
852         else
853                 bits[c >> 3] |=  (1 << (c & 7));
854 }
855
856 static char *
857 regclass()
858 {
859         register char *bits;
860         register int class;
861         register int lastclass;
862         register int range = 0;
863         register char *ret;
864         register int def;
865
866         ret = regnode(ANYOF);
867         if (*regparse == '^') { /* Complement of range. */
868                 regparse++;
869                 def = 0;
870         } else {
871                 def = 255;
872         }
873         bits = regcode;
874         for (class = 0; class < 32; class++)
875             regc(def);
876         if (*regparse == ']' || *regparse == '-')
877                 goto skipcond;          /* allow 1st char to be ] or - */
878         while (regparse < regxend && *regparse != ']') {
879               skipcond:
880                 class = UCHARAT(regparse++);
881                 if (class == '\\') {
882                         class = UCHARAT(regparse++);
883                         switch (class) {
884                         case 'w':
885                                 for (class = 'a'; class <= 'z'; class++)
886                                         regset(bits,def,class);
887                                 for (class = 'A'; class <= 'Z'; class++)
888                                         regset(bits,def,class);
889                                 for (class = '0'; class <= '9'; class++)
890                                         regset(bits,def,class);
891                                 regset(bits,def,'_');
892                                 lastclass = 1234;
893                                 continue;
894                         case 's':
895                                 regset(bits,def,' ');
896                                 regset(bits,def,'\t');
897                                 regset(bits,def,'\r');
898                                 regset(bits,def,'\f');
899                                 regset(bits,def,'\n');
900                                 lastclass = 1234;
901                                 continue;
902                         case 'd':
903                                 for (class = '0'; class <= '9'; class++)
904                                         regset(bits,def,class);
905                                 lastclass = 1234;
906                                 continue;
907                         case 'n':
908                                 class = '\n';
909                                 break;
910                         case 'r':
911                                 class = '\r';
912                                 break;
913                         case 't':
914                                 class = '\t';
915                                 break;
916                         case 'f':
917                                 class = '\f';
918                                 break;
919                         case 'b':
920                                 class = '\b';
921                                 break;
922                         case '0': case '1': case '2': case '3': case '4':
923                         case '5': case '6': case '7': case '8': case '9':
924                                 class -= '0';
925                                 if (isdigit(*regparse)) {
926                                         class <<= 3;
927                                         class += *regparse++ - '0';
928                                 }
929                                 if (isdigit(*regparse)) {
930                                         class <<= 3;
931                                         class += *regparse++ - '0';
932                                 }
933                                 break;
934                         }
935                 }
936                 if (range) {
937                         if (lastclass > class)
938                                 FAIL("invalid [] range in regexp");
939                         range = 0;
940                 }
941                 else {
942                         lastclass = class;
943                         if (*regparse == '-' && regparse+1 < regxend &&
944                             regparse[1] != ']') {
945                                 regparse++;
946                                 range = 1;
947                                 continue;       /* do it next time */
948                         }
949                 }
950                 for ( ; lastclass <= class; lastclass++) {
951                         regset(bits,def,lastclass);
952                         if (regfold && isupper(lastclass))
953                                 regset(bits,def,tolower(lastclass));
954                 }
955                 lastclass = class;
956         }
957         if (*regparse != ']')
958                 FAIL("unmatched [] in regexp");
959         regparse++;
960         return ret;
961 }
962
963 /*
964  - regnode - emit a node
965  */
966 static char *                   /* Location. */
967 regnode(op)
968 char op;
969 {
970         register char *ret;
971         register char *ptr;
972
973         ret = regcode;
974         if (ret == &regdummy) {
975 #ifdef REGALIGN
976                 if (!(regsize & 1))
977                         regsize++;
978 #endif
979                 regsize += 3;
980                 return(ret);
981         }
982
983 #ifdef REGALIGN
984 #ifndef lint
985         if (!((long)ret & 1))
986             *ret++ = 127;
987 #endif
988 #endif
989         ptr = ret;
990         *ptr++ = op;
991         *ptr++ = '\0';          /* Null "next" pointer. */
992         *ptr++ = '\0';
993         regcode = ptr;
994
995         return(ret);
996 }
997
998 /*
999  - regc - emit (if appropriate) a byte of code
1000  */
1001 static void
1002 regc(b)
1003 char b;
1004 {
1005         if (regcode != &regdummy)
1006                 *regcode++ = b;
1007         else
1008                 regsize++;
1009 }
1010
1011 /*
1012  - reginsert - insert an operator in front of already-emitted operand
1013  *
1014  * Means relocating the operand.
1015  */
1016 static void
1017 reginsert(op, opnd)
1018 char op;
1019 char *opnd;
1020 {
1021         register char *src;
1022         register char *dst;
1023         register char *place;
1024         register offset = (op == CURLY ? 4 : 0);
1025
1026         if (regcode == &regdummy) {
1027 #ifdef REGALIGN
1028                 regsize += 4 + offset;
1029 #else
1030                 regsize += 3 + offset;
1031 #endif
1032                 return;
1033         }
1034
1035         src = regcode;
1036 #ifdef REGALIGN
1037         regcode += 4 + offset;
1038 #else
1039         regcode += 3 + offset;
1040 #endif
1041         dst = regcode;
1042         while (src > opnd)
1043                 *--dst = *--src;
1044
1045         place = opnd;           /* Op node, where operand used to be. */
1046         *place++ = op;
1047         *place++ = '\0';
1048         *place++ = '\0';
1049         while (offset-- > 0)
1050             *place++ = '\0';
1051 }
1052
1053 /*
1054  - regtail - set the next-pointer at the end of a node chain
1055  */
1056 static void
1057 regtail(p, val)
1058 char *p;
1059 char *val;
1060 {
1061         register char *scan;
1062         register char *temp;
1063         register int offset;
1064
1065         if (p == &regdummy)
1066                 return;
1067
1068         /* Find last node. */
1069         scan = p;
1070         for (;;) {
1071                 temp = regnext(scan);
1072                 if (temp == NULL)
1073                         break;
1074                 scan = temp;
1075         }
1076
1077 #ifdef REGALIGN
1078         offset = val - scan;
1079 #ifndef lint
1080         *(short*)(scan+1) = offset;
1081 #else
1082         offset = offset;
1083 #endif
1084 #else
1085         if (OP(scan) == BACK)
1086                 offset = scan - val;
1087         else
1088                 offset = val - scan;
1089         *(scan+1) = (offset>>8)&0377;
1090         *(scan+2) = offset&0377;
1091 #endif
1092 }
1093
1094 /*
1095  - regoptail - regtail on operand of first argument; nop if operandless
1096  */
1097 static void
1098 regoptail(p, val)
1099 char *p;
1100 char *val;
1101 {
1102         /* "Operandless" and "op != BRANCH" are synonymous in practice. */
1103         if (p == NULL || p == &regdummy || OP(p) != BRANCH)
1104                 return;
1105         regtail(NEXTOPER(p), val);
1106 }
1107
1108 /*
1109  - regcurly - a little FSA that accepts {\d+,?\d*}
1110  */
1111 STATIC int
1112 regcurly(s)
1113 register char *s;
1114 {
1115     if (*s++ != '{')
1116         return FALSE;
1117     if (!isdigit(*s))
1118         return FALSE;
1119     while (isdigit(*s))
1120         s++;
1121     if (*s == ',')
1122         s++;
1123     while (isdigit(*s))
1124         s++;
1125     if (*s != '}')
1126         return FALSE;
1127     return TRUE;
1128 }
1129
1130 #ifdef DEBUGGING
1131
1132 /*
1133  - regdump - dump a regexp onto stderr in vaguely comprehensible form
1134  */
1135 void
1136 regdump(r)
1137 regexp *r;
1138 {
1139         register char *s;
1140         register char op = EXACTLY;     /* Arbitrary non-END op. */
1141         register char *next;
1142         extern char *index();
1143
1144
1145         s = r->program + 1;
1146         while (op != END) {     /* While that wasn't END last time... */
1147 #ifdef REGALIGN
1148                 if (!((long)s & 1))
1149                         s++;
1150 #endif
1151                 op = OP(s);
1152                 fprintf(stderr,"%2d%s", s-r->program, regprop(s));      /* Where, what. */
1153                 if (op == CURLY)
1154                     s += 4;
1155                 next = regnext(s);
1156                 if (next == NULL)               /* Next ptr. */
1157                         fprintf(stderr,"(0)");
1158                 else 
1159                         fprintf(stderr,"(%d)", (s-r->program)+(next-s));
1160                 s += 3;
1161                 if (op == ANYOF) {
1162                         s += 32;
1163                 }
1164                 if (op == EXACTLY) {
1165                         /* Literal string, where present. */
1166                         s++;
1167                         while (*s != '\0') {
1168                                 (void)putchar(*s);
1169                                 s++;
1170                         }
1171                         s++;
1172                 }
1173                 (void)putchar('\n');
1174         }
1175
1176         /* Header fields of interest. */
1177         if (r->regstart)
1178                 fprintf(stderr,"start `%s' ", r->regstart->str_ptr);
1179         if (r->regstclass)
1180                 fprintf(stderr,"stclass `%s' ", regprop(r->regstclass));
1181         if (r->reganch & 1)
1182                 fprintf(stderr,"anchored ");
1183         if (r->reganch & 2)
1184                 fprintf(stderr,"plus ");
1185         if (r->regmust != NULL)
1186                 fprintf(stderr,"must have \"%s\" back %d ", r->regmust->str_ptr,
1187                   r->regback);
1188         fprintf(stderr,"\n");
1189 }
1190
1191 /*
1192  - regprop - printable representation of opcode
1193  */
1194 char *
1195 regprop(op)
1196 char *op;
1197 {
1198         register char *p;
1199
1200         (void) strcpy(buf, ":");
1201
1202         switch (OP(op)) {
1203         case BOL:
1204                 p = "BOL";
1205                 break;
1206         case EOL:
1207                 p = "EOL";
1208                 break;
1209         case ANY:
1210                 p = "ANY";
1211                 break;
1212         case ANYOF:
1213                 p = "ANYOF";
1214                 break;
1215         case BRANCH:
1216                 p = "BRANCH";
1217                 break;
1218         case EXACTLY:
1219                 p = "EXACTLY";
1220                 break;
1221         case NOTHING:
1222                 p = "NOTHING";
1223                 break;
1224         case BACK:
1225                 p = "BACK";
1226                 break;
1227         case END:
1228                 p = "END";
1229                 break;
1230         case ALNUM:
1231                 p = "ALNUM";
1232                 break;
1233         case NALNUM:
1234                 p = "NALNUM";
1235                 break;
1236         case BOUND:
1237                 p = "BOUND";
1238                 break;
1239         case NBOUND:
1240                 p = "NBOUND";
1241                 break;
1242         case SPACE:
1243                 p = "SPACE";
1244                 break;
1245         case NSPACE:
1246                 p = "NSPACE";
1247                 break;
1248         case DIGIT:
1249                 p = "DIGIT";
1250                 break;
1251         case NDIGIT:
1252                 p = "NDIGIT";
1253                 break;
1254         case CURLY:
1255                 (void)sprintf(buf+strlen(buf), "CURLY {%d,%d}",
1256                     ARG1(op),ARG2(op));
1257                 p = NULL;
1258                 break;
1259         case REF:
1260         case REF+1:
1261         case REF+2:
1262         case REF+3:
1263         case REF+4:
1264         case REF+5:
1265         case REF+6:
1266         case REF+7:
1267         case REF+8:
1268         case REF+9:
1269                 (void)sprintf(buf+strlen(buf), "REF%d", OP(op)-REF);
1270                 p = NULL;
1271                 break;
1272         case OPEN+1:
1273         case OPEN+2:
1274         case OPEN+3:
1275         case OPEN+4:
1276         case OPEN+5:
1277         case OPEN+6:
1278         case OPEN+7:
1279         case OPEN+8:
1280         case OPEN+9:
1281                 (void)sprintf(buf+strlen(buf), "OPEN%d", OP(op)-OPEN);
1282                 p = NULL;
1283                 break;
1284         case CLOSE+1:
1285         case CLOSE+2:
1286         case CLOSE+3:
1287         case CLOSE+4:
1288         case CLOSE+5:
1289         case CLOSE+6:
1290         case CLOSE+7:
1291         case CLOSE+8:
1292         case CLOSE+9:
1293                 (void)sprintf(buf+strlen(buf), "CLOSE%d", OP(op)-CLOSE);
1294                 p = NULL;
1295                 break;
1296         case STAR:
1297                 p = "STAR";
1298                 break;
1299         case PLUS:
1300                 p = "PLUS";
1301                 break;
1302         default:
1303                 FAIL("corrupted regexp opcode");
1304         }
1305         if (p != NULL)
1306                 (void) strcat(buf, p);
1307         return(buf);
1308 }
1309 #endif /* DEBUGGING */
1310
1311 regfree(r)
1312 struct regexp *r;
1313 {
1314         if (r->precomp)
1315                 Safefree(r->precomp);
1316         if (r->subbase)
1317                 Safefree(r->subbase);
1318         if (r->regmust)
1319                 str_free(r->regmust);
1320         if (r->regstart)
1321                 str_free(r->regstart);
1322         Safefree(r);
1323 }