This is a live mirror of the Perl 5 development currently hosted at https://github.com/perl/perl5
perl 3.0 patch #40 patch #38, continued
[perl5.git] / regcomp.c
1 /* NOTE: this is derived from Henry Spencer's regexp code, and should not
2  * confused with the original package (see point 3 below).  Thanks, Henry!
3  */
4
5 /* Additional note: this code is very heavily munged from Henry's version
6  * in places.  In some spots I've traded clarity for efficiency, so don't
7  * blame Henry for some of the lack of readability.
8  */
9
10 /* $Header: regcomp.c,v 3.0.1.8 90/11/10 01:57:46 lwall Locked $
11  *
12  * $Log:        regcomp.c,v $
13  * Revision 3.0.1.8  90/11/10  01:57:46  lwall
14  * patch38: patterns with multiple constant strings occasionally malfed
15  * patch38: patterns like /foo.*foo/ sped up some
16  * 
17  * Revision 3.0.1.7  90/10/20  02:18:32  lwall
18  * patch37: /foo.*bar$/ wrongly optimized to do tail matching on "foo"
19  * 
20  * Revision 3.0.1.6  90/10/16  10:17:33  lwall
21  * patch29: patterns with multiple short literal strings sometimes failed
22  * 
23  * Revision 3.0.1.5  90/08/13  22:23:29  lwall
24  * patch28: /x{m}/ didn't work right
25  * 
26  * Revision 3.0.1.4  90/08/09  05:05:33  lwall
27  * patch19: sped up /x+y/ patterns greatly by not retrying on every x
28  * patch19: inhibited backoff on patterns anchored to the end like /\s+$/
29  * patch19: sped up {m,n} on simple items
30  * patch19: optimized /.*whatever/ to /^.*whatever/
31  * patch19: fixed character classes to allow backslashing hyphen
32  * 
33  * Revision 3.0.1.3  90/03/12  16:59:22  lwall
34  * patch13: pattern matches can now use \0 to mean \000
35  * 
36  * Revision 3.0.1.2  90/02/28  18:08:35  lwall
37  * patch9: /[\200-\377]/ didn't work on machines with signed chars
38  * 
39  * Revision 3.0.1.1  89/11/11  04:51:04  lwall
40  * patch2: /[\000]/ didn't work
41  * 
42  * Revision 3.0  89/10/18  15:22:29  lwall
43  * 3.0 baseline
44  * 
45  */
46
47 /*
48  * regcomp and regexec -- regsub and regerror are not used in perl
49  *
50  *      Copyright (c) 1986 by University of Toronto.
51  *      Written by Henry Spencer.  Not derived from licensed software.
52  *
53  *      Permission is granted to anyone to use this software for any
54  *      purpose on any computer system, and to redistribute it freely,
55  *      subject to the following restrictions:
56  *
57  *      1. The author is not responsible for the consequences of use of
58  *              this software, no matter how awful, even if they arise
59  *              from defects in it.
60  *
61  *      2. The origin of this software must not be misrepresented, either
62  *              by explicit claim or by omission.
63  *
64  *      3. Altered versions must be plainly marked as such, and must not
65  *              be misrepresented as being the original software.
66  *
67  *
68  ****    Alterations to Henry's code are...
69  ****
70  ****    Copyright (c) 1989, Larry Wall
71  ****
72  ****    You may distribute under the terms of the GNU General Public License
73  ****    as specified in the README file that comes with the perl 3.0 kit.
74  *
75  * Beware that some of this code is subtly aware of the way operator
76  * precedence is structured in regular expressions.  Serious changes in
77  * regular-expression syntax might require a total rethink.
78  */
79 #include "EXTERN.h"
80 #include "perl.h"
81 #include "INTERN.h"
82 #include "regcomp.h"
83
84 #ifndef STATIC
85 #define STATIC  static
86 #endif
87
88 #define ISMULT1(c)      ((c) == '*' || (c) == '+' || (c) == '?')
89 #define ISMULT2(s)      ((*s) == '*' || (*s) == '+' || (*s) == '?' || \
90         ((*s) == '{' && regcurly(s)))
91 #define META    "^$.[()|?+*\\"
92
93 /*
94  * Flags to be passed up and down.
95  */
96 #define HASWIDTH        01      /* Known never to match null string. */
97 #define SIMPLE          02      /* Simple enough to be STAR/PLUS operand. */
98 #define SPSTART         04      /* Starts with * or +. */
99 #define WORST           0       /* Worst case. */
100
101 /*
102  * Global work variables for regcomp().
103  */
104 static char *regprecomp;                /* uncompiled string. */
105 static char *regparse;          /* Input-scan pointer. */
106 static char *regxend;           /* End of input for compile */
107 static int regnpar;             /* () count. */
108 static char *regcode;           /* Code-emit pointer; &regdummy = don't. */
109 static long regsize;            /* Code size. */
110 static int regfold;
111 static int regsawbracket;       /* Did we do {d,d} trick? */
112
113 /*
114  * Forward declarations for regcomp()'s friends.
115  */
116 STATIC int regcurly();
117 STATIC char *reg();
118 STATIC char *regbranch();
119 STATIC char *regpiece();
120 STATIC char *regatom();
121 STATIC char *regclass();
122 STATIC char *regnode();
123 STATIC void regc();
124 STATIC void reginsert();
125 STATIC void regtail();
126 STATIC void regoptail();
127
128 /*
129  - regcomp - compile a regular expression into internal code
130  *
131  * We can't allocate space until we know how big the compiled form will be,
132  * but we can't compile it (and thus know how big it is) until we've got a
133  * place to put the code.  So we cheat:  we compile it twice, once with code
134  * generation turned off and size counting turned on, and once "for real".
135  * This also means that we don't allocate space until we are sure that the
136  * thing really will compile successfully, and we never have to move the
137  * code and thus invalidate pointers into it.  (Note that it has to be in
138  * one piece because free() must be able to free it all.) [NB: not true in perl]
139  *
140  * Beware that the optimization-preparation code in here knows about some
141  * of the structure of the compiled regexp.  [I'll say.]
142  */
143 regexp *
144 regcomp(exp,xend,fold)
145 char *exp;
146 char *xend;
147 int fold;
148 {
149         register regexp *r;
150         register char *scan;
151         register STR *longish;
152         STR *longest;
153         register int len;
154         register char *first;
155         int flags;
156         int backish;
157         int backest;
158         int curback;
159         extern char *safemalloc();
160         extern char *savestr();
161         int sawplus = 0;
162
163         if (exp == NULL)
164                 fatal("NULL regexp argument");
165
166         /* First pass: determine size, legality. */
167         regfold = fold;
168         regparse = exp;
169         regxend = xend;
170         regprecomp = nsavestr(exp,xend-exp);
171         regsawbracket = 0;
172         regnpar = 1;
173         regsize = 0L;
174         regcode = &regdummy;
175         regc(MAGIC);
176         if (reg(0, &flags) == NULL) {
177                 Safefree(regprecomp);
178                 return(NULL);
179         }
180
181         /* Small enough for pointer-storage convention? */
182         if (regsize >= 32767L)          /* Probably could be 65535L. */
183                 FAIL("regexp too big");
184
185         /* Allocate space. */
186         Newc(1001, r, sizeof(regexp) + (unsigned)regsize, char, regexp);
187         if (r == NULL)
188                 FAIL("regexp out of space");
189
190         /* Second pass: emit code. */
191         if (regsawbracket)
192             bcopy(regprecomp,exp,xend-exp);
193         r->precomp = regprecomp;
194         r->subbase = NULL;
195         regparse = exp;
196         regnpar = 1;
197         regcode = r->program;
198         regc(MAGIC);
199         if (reg(0, &flags) == NULL)
200                 return(NULL);
201
202         /* Dig out information for optimizations. */
203         r->regstart = Nullstr;  /* Worst-case defaults. */
204         r->reganch = 0;
205         r->regmust = Nullstr;
206         r->regback = -1;
207         r->regstclass = Nullch;
208         scan = r->program+1;                    /* First BRANCH. */
209         if (OP(regnext(scan)) == END) {/* Only one top-level choice. */
210                 scan = NEXTOPER(scan);
211
212                 first = scan;
213                 while ((OP(first) > OPEN && OP(first) < CLOSE) ||
214                     (OP(first) == BRANCH && OP(regnext(first)) != BRANCH) ||
215                     (OP(first) == PLUS) ||
216                     (OP(first) == CURLY && ARG1(first) > 0) ) {
217                         if (OP(first) == CURLY)
218                             first += 4;
219                         else if (OP(first) == PLUS)
220                             sawplus = 2;
221                         first = NEXTOPER(first);
222                 }
223
224                 /* Starting-point info. */
225                 if (OP(first) == EXACTLY) {
226                         r->regstart =
227                             str_make(OPERAND(first)+1,*OPERAND(first));
228                         if (r->regstart->str_cur > !(sawstudy|fold))
229                                 fbmcompile(r->regstart,fold);
230                 }
231                 else if ((exp = index(simple,OP(first))) && exp > simple)
232                         r->regstclass = first;
233                 else if (OP(first) == BOUND || OP(first) == NBOUND)
234                         r->regstclass = first;
235                 else if (OP(first) == BOL ||
236                     (OP(first) == STAR && OP(NEXTOPER(first)) == ANY) )
237                         r->reganch = 1;         /* kinda turn .* into ^.* */
238                 r->reganch |= sawplus;
239
240 #ifdef DEBUGGING
241                 if (debug & 512)
242                     fprintf(stderr,"first %d next %d offset %d\n",
243                       OP(first), OP(NEXTOPER(first)), first - scan);
244 #endif
245                 /*
246                  * If there's something expensive in the r.e., find the
247                  * longest literal string that must appear and make it the
248                  * regmust.  Resolve ties in favor of later strings, since
249                  * the regstart check works with the beginning of the r.e.
250                  * and avoiding duplication strengthens checking.  Not a
251                  * strong reason, but sufficient in the absence of others.
252                  * [Now we resolve ties in favor of the earlier string if
253                  * it happens that curback has been invalidated, since the
254                  * earlier string may buy us something the later one won't.]
255                  */
256                 longish = str_make("",0);
257                 longest = str_make("",0);
258                 len = 0;
259                 curback = 0;
260                 backish = 0;
261                 backest = 0;
262                 while (OP(scan) != END) {
263                         if (OP(scan) == BRANCH) {
264                             if (OP(regnext(scan)) == BRANCH) {
265                                 curback = -30000;
266                                 while (OP(scan) == BRANCH)
267                                     scan = regnext(scan);
268                             }
269                             else        /* single branch is ok */
270                                 scan = NEXTOPER(scan);
271                         }
272                         if (OP(scan) == EXACTLY) {
273                             first = scan;
274                             while (OP(regnext(scan)) >= CLOSE)
275                                 scan = regnext(scan);
276                             if (curback - backish == len) {
277                                 str_ncat(longish, OPERAND(first)+1,
278                                     *OPERAND(first));
279                                 len += *OPERAND(first);
280                                 curback += *OPERAND(first);
281                                 first = regnext(scan);
282                             }
283                             else if (*OPERAND(first) >= len + (curback >= 0)) {
284                                 len = *OPERAND(first);
285                                 str_nset(longish, OPERAND(first)+1,len);
286                                 backish = curback;
287                                 curback += len;
288                                 first = regnext(scan);
289                             }
290                             else
291                                 curback += *OPERAND(first);
292                         }
293                         else if (index(varies,OP(scan))) {
294                             curback = -30000;
295                             len = 0;
296                             if (longish->str_cur > longest->str_cur) {
297                                 str_sset(longest,longish);
298                                 backest = backish;
299                             }
300                             str_nset(longish,"",0);
301                         }
302                         else if (index(simple,OP(scan))) {
303                             curback++;
304                             len = 0;
305                             if (longish->str_cur > longest->str_cur) {
306                                 str_sset(longest,longish);
307                                 backest = backish;
308                             }
309                             str_nset(longish,"",0);
310                         }
311                         scan = regnext(scan);
312                 }
313
314                 /* Prefer earlier on tie, unless we can tail match latter */
315
316                 if (longish->str_cur + (OP(first) == EOL) > longest->str_cur) {
317                     str_sset(longest,longish);
318                     backest = backish;
319                 }
320                 else
321                     str_nset(longish,"",0);
322                 if (longest->str_cur
323                     &&
324                     (!r->regstart
325                      ||
326                      !fbminstr(r->regstart->str_ptr,
327                           r->regstart->str_ptr + r->regstart->str_cur,
328                           longest)
329                     )
330                    )
331                 {
332                         r->regmust = longest;
333                         if (backest < 0)
334                                 backest = -1;
335                         r->regback = backest;
336                         if (longest->str_cur
337                           > !(sawstudy || fold || OP(first) == EOL) )
338                                 fbmcompile(r->regmust,fold);
339                         r->regmust->str_u.str_useful = 100;
340                         if (OP(first) == EOL && longish->str_cur)
341                             r->regmust->str_pok |= SP_TAIL;
342                 }
343                 else
344                         str_free(longest);
345                 str_free(longish);
346         }
347
348         r->do_folding = fold;
349         r->nparens = regnpar - 1;
350 #ifdef DEBUGGING
351         if (debug & 512)
352                 regdump(r);
353 #endif
354         return(r);
355 }
356
357 /*
358  - reg - regular expression, i.e. main body or parenthesized thing
359  *
360  * Caller must absorb opening parenthesis.
361  *
362  * Combining parenthesis handling with the base level of regular expression
363  * is a trifle forced, but the need to tie the tails of the branches to what
364  * follows makes it hard to avoid.
365  */
366 static char *
367 reg(paren, flagp)
368 int paren;                      /* Parenthesized? */
369 int *flagp;
370 {
371         register char *ret;
372         register char *br;
373         register char *ender;
374         register int parno;
375         int flags;
376
377         *flagp = HASWIDTH;      /* Tentatively. */
378
379         /* Make an OPEN node, if parenthesized. */
380         if (paren) {
381                 if (regnpar >= NSUBEXP)
382                         FAIL("too many () in regexp");
383                 parno = regnpar;
384                 regnpar++;
385                 ret = regnode(OPEN+parno);
386         } else
387                 ret = NULL;
388
389         /* Pick up the branches, linking them together. */
390         br = regbranch(&flags);
391         if (br == NULL)
392                 return(NULL);
393         if (ret != NULL)
394                 regtail(ret, br);       /* OPEN -> first. */
395         else
396                 ret = br;
397         if (!(flags&HASWIDTH))
398                 *flagp &= ~HASWIDTH;
399         *flagp |= flags&SPSTART;
400         while (*regparse == '|') {
401                 regparse++;
402                 br = regbranch(&flags);
403                 if (br == NULL)
404                         return(NULL);
405                 regtail(ret, br);       /* BRANCH -> BRANCH. */
406                 if (!(flags&HASWIDTH))
407                         *flagp &= ~HASWIDTH;
408                 *flagp |= flags&SPSTART;
409         }
410
411         /* Make a closing node, and hook it on the end. */
412         ender = regnode((paren) ? CLOSE+parno : END);   
413         regtail(ret, ender);
414
415         /* Hook the tails of the branches to the closing node. */
416         for (br = ret; br != NULL; br = regnext(br))
417                 regoptail(br, ender);
418
419         /* Check for proper termination. */
420         if (paren && *regparse++ != ')') {
421                 FAIL("unmatched () in regexp");
422         } else if (!paren && regparse < regxend) {
423                 if (*regparse == ')') {
424                         FAIL("unmatched () in regexp");
425                 } else
426                         FAIL("junk on end of regexp");  /* "Can't happen". */
427                 /* NOTREACHED */
428         }
429
430         return(ret);
431 }
432
433 /*
434  - regbranch - one alternative of an | operator
435  *
436  * Implements the concatenation operator.
437  */
438 static char *
439 regbranch(flagp)
440 int *flagp;
441 {
442         register char *ret;
443         register char *chain;
444         register char *latest;
445         int flags;
446
447         *flagp = WORST;         /* Tentatively. */
448
449         ret = regnode(BRANCH);
450         chain = NULL;
451         while (regparse < regxend && *regparse != '|' && *regparse != ')') {
452                 latest = regpiece(&flags);
453                 if (latest == NULL)
454                         return(NULL);
455                 *flagp |= flags&HASWIDTH;
456                 if (chain == NULL)      /* First piece. */
457                         *flagp |= flags&SPSTART;
458                 else
459                         regtail(chain, latest);
460                 chain = latest;
461         }
462         if (chain == NULL)      /* Loop ran zero times. */
463                 (void) regnode(NOTHING);
464
465         return(ret);
466 }
467
468 /*
469  - regpiece - something followed by possible [*+?]
470  *
471  * Note that the branching code sequences used for ? and the general cases
472  * of * and + are somewhat optimized:  they use the same NOTHING node as
473  * both the endmarker for their branch list and the body of the last branch.
474  * It might seem that this node could be dispensed with entirely, but the
475  * endmarker role is not redundant.
476  */
477 static char *
478 regpiece(flagp)
479 int *flagp;
480 {
481         register char *ret;
482         register char op;
483         register char *next;
484         int flags;
485         char *origparse = regparse;
486         int orignpar = regnpar;
487         char *max;
488         int iter;
489         char ch;
490
491         ret = regatom(&flags);
492         if (ret == NULL)
493                 return(NULL);
494
495         op = *regparse;
496
497         /* Here's a total kludge: if after the atom there's a {\d+,?\d*}
498          * then we decrement the first number by one and reset our
499          * parsing back to the beginning of the same atom.  If the first number
500          * is down to 0, decrement the second number instead and fake up
501          * a ? after it.  Given the way this compiler doesn't keep track
502          * of offsets on the first pass, this is the only way to replicate
503          * a piece of code.  Sigh.
504          */
505         if (op == '{' && regcurly(regparse)) {
506             next = regparse + 1;
507             max = Nullch;
508             while (isdigit(*next) || *next == ',') {
509                 if (*next == ',') {
510                     if (max)
511                         break;
512                     else
513                         max = next;
514                 }
515                 next++;
516             }
517             if (*next == '}') {         /* got one */
518                 if (!max)
519                     max = next;
520                 regparse++;
521                 iter = atoi(regparse);
522                 if (flags&SIMPLE) {     /* we can do it right after all */
523                     int tmp;
524
525                     reginsert(CURLY, ret);
526                     if (*max == ',')
527                         max++;
528                     else
529                         max = regparse;
530                     tmp = atoi(max);
531                     if (tmp && tmp < iter)
532                         fatal("Can't do {n,m} with n > m");
533                     if (regcode != &regdummy) {
534 #ifdef REGALIGN
535                         *(unsigned short *)(ret+3) = iter;
536                         *(unsigned short *)(ret+5) = tmp;
537 #else
538                         ret[3] = iter >> 8; ret[4] = iter & 0377;
539                         ret[5] = tmp  >> 8; ret[6] = tmp  & 0377;
540 #endif
541                     }
542                     regparse = next;
543                     goto nest_check;
544                 }
545                 regsawbracket++;        /* remember we clobbered exp */
546                 if (iter > 0) {
547                     ch = *max;
548                     sprintf(regparse,"%.*d", max-regparse, iter - 1);
549                     *max = ch;
550                     if (*max == ',' && max[1] != '}') {
551                         if (atoi(max+1) <= 0)
552                             fatal("Can't do {n,m} with n > m");
553                         ch = *next;
554                         sprintf(max+1,"%.*d", next-(max+1), atoi(max+1) - 1);
555                         *next = ch;
556                     }
557                     if (iter != 1 || *max == ',') {
558                         regparse = origparse;   /* back up input pointer */
559                         regnpar = orignpar;     /* don't make more parens */
560                     }
561                     else {
562                         regparse = next;
563                         goto nest_check;
564                     }
565                     *flagp = flags;
566                     return ret;
567                 }
568                 if (*max == ',') {
569                     max++;
570                     iter = atoi(max);
571                     if (max == next) {          /* any number more? */
572                         regparse = next;
573                         op = '*';               /* fake up one with a star */
574                     }
575                     else if (iter > 0) {
576                         op = '?';               /* fake up optional atom */
577                         ch = *next;
578                         sprintf(max,"%.*d", next-max, iter - 1);
579                         *next = ch;
580                         if (iter == 1)
581                             regparse = next;
582                         else {
583                             regparse = origparse - 1; /* offset ++ below */
584                             regnpar = orignpar;
585                         }
586                     }
587                     else
588                         fatal("Can't do {n,0}");
589                 }
590                 else
591                     fatal("Can't do {0}");
592             }
593         }
594
595         if (!ISMULT1(op)) {
596                 *flagp = flags;
597                 return(ret);
598         }
599
600         if (!(flags&HASWIDTH) && op != '?')
601                 FAIL("regexp *+ operand could be empty");
602         *flagp = (op != '+') ? (WORST|SPSTART) : (WORST|HASWIDTH);
603
604         if (op == '*' && (flags&SIMPLE))
605                 reginsert(STAR, ret);
606         else if (op == '*') {
607                 /* Emit x* as (x&|), where & means "self". */
608                 reginsert(BRANCH, ret);                 /* Either x */
609                 regoptail(ret, regnode(BACK));          /* and loop */
610                 regoptail(ret, ret);                    /* back */
611                 regtail(ret, regnode(BRANCH));          /* or */
612                 regtail(ret, regnode(NOTHING));         /* null. */
613         } else if (op == '+' && (flags&SIMPLE))
614                 reginsert(PLUS, ret);
615         else if (op == '+') {
616                 /* Emit x+ as x(&|), where & means "self". */
617                 next = regnode(BRANCH);                 /* Either */
618                 regtail(ret, next);
619                 regtail(regnode(BACK), ret);            /* loop back */
620                 regtail(next, regnode(BRANCH));         /* or */
621                 regtail(ret, regnode(NOTHING));         /* null. */
622         } else if (op == '?') {
623                 /* Emit x? as (x|) */
624                 reginsert(BRANCH, ret);                 /* Either x */
625                 regtail(ret, regnode(BRANCH));          /* or */
626                 next = regnode(NOTHING);                /* null. */
627                 regtail(ret, next);
628                 regoptail(ret, next);
629         }
630       nest_check:
631         regparse++;
632         if (ISMULT2(regparse))
633                 FAIL("nested *?+ in regexp");
634
635         return(ret);
636 }
637
638 /*
639  - regatom - the lowest level
640  *
641  * Optimization:  gobbles an entire sequence of ordinary characters so that
642  * it can turn them into a single node, which is smaller to store and
643  * faster to run.  Backslashed characters are exceptions, each becoming a
644  * separate node; the code is simpler that way and it's not worth fixing.
645  *
646  * [Yes, it is worth fixing, some scripts can run twice the speed.]
647  */
648 static char *
649 regatom(flagp)
650 int *flagp;
651 {
652         register char *ret;
653         int flags;
654
655         *flagp = WORST;         /* Tentatively. */
656
657         switch (*regparse++) {
658         case '^':
659                 ret = regnode(BOL);
660                 break;
661         case '$':
662                 ret = regnode(EOL);
663                 break;
664         case '.':
665                 ret = regnode(ANY);
666                 *flagp |= HASWIDTH|SIMPLE;
667                 break;
668         case '[':
669                 ret = regclass();
670                 *flagp |= HASWIDTH|SIMPLE;
671                 break;
672         case '(':
673                 ret = reg(1, &flags);
674                 if (ret == NULL)
675                         return(NULL);
676                 *flagp |= flags&(HASWIDTH|SPSTART);
677                 break;
678         case '|':
679         case ')':
680                 FAIL("internal urp in regexp"); /* Supposed to be caught earlier. */
681                 break;
682         case '?':
683         case '+':
684         case '*':
685                 FAIL("?+* follows nothing in regexp");
686                 break;
687         case '\\':
688                 switch (*regparse) {
689                 case 'w':
690                         ret = regnode(ALNUM);
691                         *flagp |= HASWIDTH|SIMPLE;
692                         regparse++;
693                         break;
694                 case 'W':
695                         ret = regnode(NALNUM);
696                         *flagp |= HASWIDTH|SIMPLE;
697                         regparse++;
698                         break;
699                 case 'b':
700                         ret = regnode(BOUND);
701                         *flagp |= SIMPLE;
702                         regparse++;
703                         break;
704                 case 'B':
705                         ret = regnode(NBOUND);
706                         *flagp |= SIMPLE;
707                         regparse++;
708                         break;
709                 case 's':
710                         ret = regnode(SPACE);
711                         *flagp |= HASWIDTH|SIMPLE;
712                         regparse++;
713                         break;
714                 case 'S':
715                         ret = regnode(NSPACE);
716                         *flagp |= HASWIDTH|SIMPLE;
717                         regparse++;
718                         break;
719                 case 'd':
720                         ret = regnode(DIGIT);
721                         *flagp |= HASWIDTH|SIMPLE;
722                         regparse++;
723                         break;
724                 case 'D':
725                         ret = regnode(NDIGIT);
726                         *flagp |= HASWIDTH|SIMPLE;
727                         regparse++;
728                         break;
729                 case 'n':
730                 case 'r':
731                 case 't':
732                 case 'f':
733                         goto defchar;
734                 case '0': case '1': case '2': case '3': case '4':
735                 case '5': case '6': case '7': case '8': case '9':
736                         if (isdigit(regparse[1]) || *regparse == '0')
737                                 goto defchar;
738                         else {
739                                 ret = regnode(REF + *regparse++ - '0');
740                                 *flagp |= SIMPLE;
741                         }
742                         break;
743                 case '\0':
744                         if (regparse >= regxend)
745                             FAIL("trailing \\ in regexp");
746                         /* FALL THROUGH */
747                 default:
748                         goto defchar;
749                 }
750                 break;
751         default: {
752                         register int len;
753                         register char ender;
754                         register char *p;
755                         char *oldp;
756                         int foo;
757
758                     defchar:
759                         ret = regnode(EXACTLY);
760                         regc(0);                /* save spot for len */
761                         for (len=0, p=regparse-1;
762                           len < 127 && p < regxend;
763                           len++)
764                         {
765                             oldp = p;
766                             switch (*p) {
767                             case '^':
768                             case '$':
769                             case '.':
770                             case '[':
771                             case '(':
772                             case ')':
773                             case '|':
774                                 goto loopdone;
775                             case '\\':
776                                 switch (*++p) {
777                                 case 'w':
778                                 case 'W':
779                                 case 'b':
780                                 case 'B':
781                                 case 's':
782                                 case 'S':
783                                 case 'd':
784                                 case 'D':
785                                     --p;
786                                     goto loopdone;
787                                 case 'n':
788                                         ender = '\n';
789                                         p++;
790                                         break;
791                                 case 'r':
792                                         ender = '\r';
793                                         p++;
794                                         break;
795                                 case 't':
796                                         ender = '\t';
797                                         p++;
798                                         break;
799                                 case 'f':
800                                         ender = '\f';
801                                         p++;
802                                         break;
803                                 case '0': case '1': case '2': case '3':case '4':
804                                 case '5': case '6': case '7': case '8':case '9':
805                                     if (isdigit(p[1]) || *p == '0') {
806                                         foo = *p - '0';
807                                         if (isdigit(p[1]))
808                                             foo = (foo<<3) + *++p - '0';
809                                         if (isdigit(p[1]))
810                                             foo = (foo<<3) + *++p - '0';
811                                         ender = foo;
812                                         p++;
813                                     }
814                                     else {
815                                         --p;
816                                         goto loopdone;
817                                     }
818                                     break;
819                                 case '\0':
820                                     if (p >= regxend)
821                                         FAIL("trailing \\ in regexp");
822                                     /* FALL THROUGH */
823                                 default:
824                                     ender = *p++;
825                                     break;
826                                 }
827                                 break;
828                             default:
829                                 ender = *p++;
830                                 break;
831                             }
832                             if (regfold && isupper(ender))
833                                     ender = tolower(ender);
834                             if (ISMULT2(p)) { /* Back off on ?+*. */
835                                 if (len)
836                                     p = oldp;
837                                 else {
838                                     len++;
839                                     regc(ender);
840                                 }
841                                 break;
842                             }
843                             regc(ender);
844                         }
845                     loopdone:
846                         regparse = p;
847                         if (len <= 0)
848                                 FAIL("internal disaster in regexp");
849                         *flagp |= HASWIDTH;
850                         if (len == 1)
851                                 *flagp |= SIMPLE;
852                         if (regcode != &regdummy)
853                             *OPERAND(ret) = len;
854                         regc('\0');
855                 }
856                 break;
857         }
858
859         return(ret);
860 }
861
862 static void
863 regset(bits,def,c)
864 char *bits;
865 int def;
866 register int c;
867 {
868         if (regcode == &regdummy)
869             return;
870         c &= 255;
871         if (def)
872                 bits[c >> 3] &= ~(1 << (c & 7));
873         else
874                 bits[c >> 3] |=  (1 << (c & 7));
875 }
876
877 static char *
878 regclass()
879 {
880         register char *bits;
881         register int class;
882         register int lastclass;
883         register int range = 0;
884         register char *ret;
885         register int def;
886
887         ret = regnode(ANYOF);
888         if (*regparse == '^') { /* Complement of range. */
889                 regparse++;
890                 def = 0;
891         } else {
892                 def = 255;
893         }
894         bits = regcode;
895         for (class = 0; class < 32; class++)
896             regc(def);
897         if (*regparse == ']' || *regparse == '-')
898                 goto skipcond;          /* allow 1st char to be ] or - */
899         while (regparse < regxend && *regparse != ']') {
900               skipcond:
901                 class = UCHARAT(regparse++);
902                 if (class == '\\') {
903                         class = UCHARAT(regparse++);
904                         switch (class) {
905                         case 'w':
906                                 for (class = 'a'; class <= 'z'; class++)
907                                         regset(bits,def,class);
908                                 for (class = 'A'; class <= 'Z'; class++)
909                                         regset(bits,def,class);
910                                 for (class = '0'; class <= '9'; class++)
911                                         regset(bits,def,class);
912                                 regset(bits,def,'_');
913                                 lastclass = 1234;
914                                 continue;
915                         case 's':
916                                 regset(bits,def,' ');
917                                 regset(bits,def,'\t');
918                                 regset(bits,def,'\r');
919                                 regset(bits,def,'\f');
920                                 regset(bits,def,'\n');
921                                 lastclass = 1234;
922                                 continue;
923                         case 'd':
924                                 for (class = '0'; class <= '9'; class++)
925                                         regset(bits,def,class);
926                                 lastclass = 1234;
927                                 continue;
928                         case 'n':
929                                 class = '\n';
930                                 break;
931                         case 'r':
932                                 class = '\r';
933                                 break;
934                         case 't':
935                                 class = '\t';
936                                 break;
937                         case 'f':
938                                 class = '\f';
939                                 break;
940                         case 'b':
941                                 class = '\b';
942                                 break;
943                         case '0': case '1': case '2': case '3': case '4':
944                         case '5': case '6': case '7': case '8': case '9':
945                                 class -= '0';
946                                 if (isdigit(*regparse)) {
947                                         class <<= 3;
948                                         class += *regparse++ - '0';
949                                 }
950                                 if (isdigit(*regparse)) {
951                                         class <<= 3;
952                                         class += *regparse++ - '0';
953                                 }
954                                 break;
955                         }
956                 }
957                 if (range) {
958                         if (lastclass > class)
959                                 FAIL("invalid [] range in regexp");
960                         range = 0;
961                 }
962                 else {
963                         lastclass = class;
964                         if (*regparse == '-' && regparse+1 < regxend &&
965                             regparse[1] != ']') {
966                                 regparse++;
967                                 range = 1;
968                                 continue;       /* do it next time */
969                         }
970                 }
971                 for ( ; lastclass <= class; lastclass++) {
972                         regset(bits,def,lastclass);
973                         if (regfold && isupper(lastclass))
974                                 regset(bits,def,tolower(lastclass));
975                 }
976                 lastclass = class;
977         }
978         if (*regparse != ']')
979                 FAIL("unmatched [] in regexp");
980         regparse++;
981         return ret;
982 }
983
984 /*
985  - regnode - emit a node
986  */
987 static char *                   /* Location. */
988 regnode(op)
989 char op;
990 {
991         register char *ret;
992         register char *ptr;
993
994         ret = regcode;
995         if (ret == &regdummy) {
996 #ifdef REGALIGN
997                 if (!(regsize & 1))
998                         regsize++;
999 #endif
1000                 regsize += 3;
1001                 return(ret);
1002         }
1003
1004 #ifdef REGALIGN
1005 #ifndef lint
1006         if (!((long)ret & 1))
1007             *ret++ = 127;
1008 #endif
1009 #endif
1010         ptr = ret;
1011         *ptr++ = op;
1012         *ptr++ = '\0';          /* Null "next" pointer. */
1013         *ptr++ = '\0';
1014         regcode = ptr;
1015
1016         return(ret);
1017 }
1018
1019 /*
1020  - regc - emit (if appropriate) a byte of code
1021  */
1022 static void
1023 regc(b)
1024 char b;
1025 {
1026         if (regcode != &regdummy)
1027                 *regcode++ = b;
1028         else
1029                 regsize++;
1030 }
1031
1032 /*
1033  - reginsert - insert an operator in front of already-emitted operand
1034  *
1035  * Means relocating the operand.
1036  */
1037 static void
1038 reginsert(op, opnd)
1039 char op;
1040 char *opnd;
1041 {
1042         register char *src;
1043         register char *dst;
1044         register char *place;
1045         register offset = (op == CURLY ? 4 : 0);
1046
1047         if (regcode == &regdummy) {
1048 #ifdef REGALIGN
1049                 regsize += 4 + offset;
1050 #else
1051                 regsize += 3 + offset;
1052 #endif
1053                 return;
1054         }
1055
1056         src = regcode;
1057 #ifdef REGALIGN
1058         regcode += 4 + offset;
1059 #else
1060         regcode += 3 + offset;
1061 #endif
1062         dst = regcode;
1063         while (src > opnd)
1064                 *--dst = *--src;
1065
1066         place = opnd;           /* Op node, where operand used to be. */
1067         *place++ = op;
1068         *place++ = '\0';
1069         *place++ = '\0';
1070         while (offset-- > 0)
1071             *place++ = '\0';
1072 }
1073
1074 /*
1075  - regtail - set the next-pointer at the end of a node chain
1076  */
1077 static void
1078 regtail(p, val)
1079 char *p;
1080 char *val;
1081 {
1082         register char *scan;
1083         register char *temp;
1084         register int offset;
1085
1086         if (p == &regdummy)
1087                 return;
1088
1089         /* Find last node. */
1090         scan = p;
1091         for (;;) {
1092                 temp = regnext(scan);
1093                 if (temp == NULL)
1094                         break;
1095                 scan = temp;
1096         }
1097
1098 #ifdef REGALIGN
1099         offset = val - scan;
1100 #ifndef lint
1101         *(short*)(scan+1) = offset;
1102 #else
1103         offset = offset;
1104 #endif
1105 #else
1106         if (OP(scan) == BACK)
1107                 offset = scan - val;
1108         else
1109                 offset = val - scan;
1110         *(scan+1) = (offset>>8)&0377;
1111         *(scan+2) = offset&0377;
1112 #endif
1113 }
1114
1115 /*
1116  - regoptail - regtail on operand of first argument; nop if operandless
1117  */
1118 static void
1119 regoptail(p, val)
1120 char *p;
1121 char *val;
1122 {
1123         /* "Operandless" and "op != BRANCH" are synonymous in practice. */
1124         if (p == NULL || p == &regdummy || OP(p) != BRANCH)
1125                 return;
1126         regtail(NEXTOPER(p), val);
1127 }
1128
1129 /*
1130  - regcurly - a little FSA that accepts {\d+,?\d*}
1131  */
1132 STATIC int
1133 regcurly(s)
1134 register char *s;
1135 {
1136     if (*s++ != '{')
1137         return FALSE;
1138     if (!isdigit(*s))
1139         return FALSE;
1140     while (isdigit(*s))
1141         s++;
1142     if (*s == ',')
1143         s++;
1144     while (isdigit(*s))
1145         s++;
1146     if (*s != '}')
1147         return FALSE;
1148     return TRUE;
1149 }
1150
1151 #ifdef DEBUGGING
1152
1153 /*
1154  - regdump - dump a regexp onto stderr in vaguely comprehensible form
1155  */
1156 void
1157 regdump(r)
1158 regexp *r;
1159 {
1160         register char *s;
1161         register char op = EXACTLY;     /* Arbitrary non-END op. */
1162         register char *next;
1163         extern char *index();
1164
1165
1166         s = r->program + 1;
1167         while (op != END) {     /* While that wasn't END last time... */
1168 #ifdef REGALIGN
1169                 if (!((long)s & 1))
1170                         s++;
1171 #endif
1172                 op = OP(s);
1173                 fprintf(stderr,"%2d%s", s-r->program, regprop(s));      /* Where, what. */
1174                 if (op == CURLY)
1175                     s += 4;
1176                 next = regnext(s);
1177                 if (next == NULL)               /* Next ptr. */
1178                         fprintf(stderr,"(0)");
1179                 else 
1180                         fprintf(stderr,"(%d)", (s-r->program)+(next-s));
1181                 s += 3;
1182                 if (op == ANYOF) {
1183                         s += 32;
1184                 }
1185                 if (op == EXACTLY) {
1186                         /* Literal string, where present. */
1187                         s++;
1188                         while (*s != '\0') {
1189                                 (void)putchar(*s);
1190                                 s++;
1191                         }
1192                         s++;
1193                 }
1194                 (void)putchar('\n');
1195         }
1196
1197         /* Header fields of interest. */
1198         if (r->regstart)
1199                 fprintf(stderr,"start `%s' ", r->regstart->str_ptr);
1200         if (r->regstclass)
1201                 fprintf(stderr,"stclass `%s' ", regprop(r->regstclass));
1202         if (r->reganch & 1)
1203                 fprintf(stderr,"anchored ");
1204         if (r->reganch & 2)
1205                 fprintf(stderr,"plus ");
1206         if (r->regmust != NULL)
1207                 fprintf(stderr,"must have \"%s\" back %d ", r->regmust->str_ptr,
1208                   r->regback);
1209         fprintf(stderr,"\n");
1210 }
1211
1212 /*
1213  - regprop - printable representation of opcode
1214  */
1215 char *
1216 regprop(op)
1217 char *op;
1218 {
1219         register char *p;
1220
1221         (void) strcpy(buf, ":");
1222
1223         switch (OP(op)) {
1224         case BOL:
1225                 p = "BOL";
1226                 break;
1227         case EOL:
1228                 p = "EOL";
1229                 break;
1230         case ANY:
1231                 p = "ANY";
1232                 break;
1233         case ANYOF:
1234                 p = "ANYOF";
1235                 break;
1236         case BRANCH:
1237                 p = "BRANCH";
1238                 break;
1239         case EXACTLY:
1240                 p = "EXACTLY";
1241                 break;
1242         case NOTHING:
1243                 p = "NOTHING";
1244                 break;
1245         case BACK:
1246                 p = "BACK";
1247                 break;
1248         case END:
1249                 p = "END";
1250                 break;
1251         case ALNUM:
1252                 p = "ALNUM";
1253                 break;
1254         case NALNUM:
1255                 p = "NALNUM";
1256                 break;
1257         case BOUND:
1258                 p = "BOUND";
1259                 break;
1260         case NBOUND:
1261                 p = "NBOUND";
1262                 break;
1263         case SPACE:
1264                 p = "SPACE";
1265                 break;
1266         case NSPACE:
1267                 p = "NSPACE";
1268                 break;
1269         case DIGIT:
1270                 p = "DIGIT";
1271                 break;
1272         case NDIGIT:
1273                 p = "NDIGIT";
1274                 break;
1275         case CURLY:
1276                 (void)sprintf(buf+strlen(buf), "CURLY {%d,%d}",
1277                     ARG1(op),ARG2(op));
1278                 p = NULL;
1279                 break;
1280         case REF:
1281         case REF+1:
1282         case REF+2:
1283         case REF+3:
1284         case REF+4:
1285         case REF+5:
1286         case REF+6:
1287         case REF+7:
1288         case REF+8:
1289         case REF+9:
1290                 (void)sprintf(buf+strlen(buf), "REF%d", OP(op)-REF);
1291                 p = NULL;
1292                 break;
1293         case OPEN+1:
1294         case OPEN+2:
1295         case OPEN+3:
1296         case OPEN+4:
1297         case OPEN+5:
1298         case OPEN+6:
1299         case OPEN+7:
1300         case OPEN+8:
1301         case OPEN+9:
1302                 (void)sprintf(buf+strlen(buf), "OPEN%d", OP(op)-OPEN);
1303                 p = NULL;
1304                 break;
1305         case CLOSE+1:
1306         case CLOSE+2:
1307         case CLOSE+3:
1308         case CLOSE+4:
1309         case CLOSE+5:
1310         case CLOSE+6:
1311         case CLOSE+7:
1312         case CLOSE+8:
1313         case CLOSE+9:
1314                 (void)sprintf(buf+strlen(buf), "CLOSE%d", OP(op)-CLOSE);
1315                 p = NULL;
1316                 break;
1317         case STAR:
1318                 p = "STAR";
1319                 break;
1320         case PLUS:
1321                 p = "PLUS";
1322                 break;
1323         default:
1324                 FAIL("corrupted regexp opcode");
1325         }
1326         if (p != NULL)
1327                 (void) strcat(buf, p);
1328         return(buf);
1329 }
1330 #endif /* DEBUGGING */
1331
1332 regfree(r)
1333 struct regexp *r;
1334 {
1335         if (r->precomp)
1336                 Safefree(r->precomp);
1337         if (r->subbase)
1338                 Safefree(r->subbase);
1339         if (r->regmust)
1340                 str_free(r->regmust);
1341         if (r->regstart)
1342                 str_free(r->regstart);
1343         Safefree(r);
1344 }