This is a live mirror of the Perl 5 development currently hosted at https://github.com/perl/perl5
perl 3.0 patch #11 patch #9, continued
[perl5.git] / regcomp.c
1 /* NOTE: this is derived from Henry Spencer's regexp code, and should not
2  * confused with the original package (see point 3 below).  Thanks, Henry!
3  */
4
5 /* Additional note: this code is very heavily munged from Henry's version
6  * in places.  In some spots I've traded clarity for efficiency, so don't
7  * blame Henry for some of the lack of readability.
8  */
9
10 /* $Header: regcomp.c,v 3.0.1.2 90/02/28 18:08:35 lwall Locked $
11  *
12  * $Log:        regcomp.c,v $
13  * Revision 3.0.1.2  90/02/28  18:08:35  lwall
14  * patch9: /[\200-\377]/ didn't work on machines with signed chars
15  * 
16  * Revision 3.0.1.1  89/11/11  04:51:04  lwall
17  * patch2: /[\000]/ didn't work
18  * 
19  * Revision 3.0  89/10/18  15:22:29  lwall
20  * 3.0 baseline
21  * 
22  */
23
24 /*
25  * regcomp and regexec -- regsub and regerror are not used in perl
26  *
27  *      Copyright (c) 1986 by University of Toronto.
28  *      Written by Henry Spencer.  Not derived from licensed software.
29  *
30  *      Permission is granted to anyone to use this software for any
31  *      purpose on any computer system, and to redistribute it freely,
32  *      subject to the following restrictions:
33  *
34  *      1. The author is not responsible for the consequences of use of
35  *              this software, no matter how awful, even if they arise
36  *              from defects in it.
37  *
38  *      2. The origin of this software must not be misrepresented, either
39  *              by explicit claim or by omission.
40  *
41  *      3. Altered versions must be plainly marked as such, and must not
42  *              be misrepresented as being the original software.
43  *
44  *
45  ****    Alterations to Henry's code are...
46  ****
47  ****    Copyright (c) 1989, Larry Wall
48  ****
49  ****    You may distribute under the terms of the GNU General Public License
50  ****    as specified in the README file that comes with the perl 3.0 kit.
51  *
52  * Beware that some of this code is subtly aware of the way operator
53  * precedence is structured in regular expressions.  Serious changes in
54  * regular-expression syntax might require a total rethink.
55  */
56 #include "EXTERN.h"
57 #include "perl.h"
58 #include "INTERN.h"
59 #include "regcomp.h"
60
61 #ifndef STATIC
62 #define STATIC  static
63 #endif
64
65 #define ISMULT1(c)      ((c) == '*' || (c) == '+' || (c) == '?')
66 #define ISMULT2(s)      ((*s) == '*' || (*s) == '+' || (*s) == '?' || \
67         ((*s) == '{' && regcurly(s)))
68 #define META    "^$.[()|?+*\\"
69
70 /*
71  * Flags to be passed up and down.
72  */
73 #define HASWIDTH        01      /* Known never to match null string. */
74 #define SIMPLE          02      /* Simple enough to be STAR/PLUS operand. */
75 #define SPSTART         04      /* Starts with * or +. */
76 #define WORST           0       /* Worst case. */
77
78 /*
79  * Global work variables for regcomp().
80  */
81 static char *regprecomp;                /* uncompiled string. */
82 static char *regparse;          /* Input-scan pointer. */
83 static char *regxend;           /* End of input for compile */
84 static int regnpar;             /* () count. */
85 static char *regcode;           /* Code-emit pointer; &regdummy = don't. */
86 static long regsize;            /* Code size. */
87 static int regfold;
88 static int regsawbracket;       /* Did we do {d,d} trick? */
89
90 /*
91  * Forward declarations for regcomp()'s friends.
92  */
93 STATIC int regcurly();
94 STATIC char *reg();
95 STATIC char *regbranch();
96 STATIC char *regpiece();
97 STATIC char *regatom();
98 STATIC char *regclass();
99 STATIC char *regnode();
100 STATIC void regc();
101 STATIC void reginsert();
102 STATIC void regtail();
103 STATIC void regoptail();
104
105 /*
106  - regcomp - compile a regular expression into internal code
107  *
108  * We can't allocate space until we know how big the compiled form will be,
109  * but we can't compile it (and thus know how big it is) until we've got a
110  * place to put the code.  So we cheat:  we compile it twice, once with code
111  * generation turned off and size counting turned on, and once "for real".
112  * This also means that we don't allocate space until we are sure that the
113  * thing really will compile successfully, and we never have to move the
114  * code and thus invalidate pointers into it.  (Note that it has to be in
115  * one piece because free() must be able to free it all.) [NB: not true in perl]
116  *
117  * Beware that the optimization-preparation code in here knows about some
118  * of the structure of the compiled regexp.  [I'll say.]
119  */
120 regexp *
121 regcomp(exp,xend,fold,rare)
122 char *exp;
123 char *xend;
124 int fold;
125 int rare;
126 {
127         register regexp *r;
128         register char *scan;
129         register STR *longest;
130         register int len;
131         register char *first;
132         int flags;
133         int back;
134         int curback;
135         extern char *safemalloc();
136         extern char *savestr();
137
138         if (exp == NULL)
139                 fatal("NULL regexp argument");
140
141         /* First pass: determine size, legality. */
142         regfold = fold;
143         regparse = exp;
144         regxend = xend;
145         regprecomp = nsavestr(exp,xend-exp);
146         regsawbracket = 0;
147         regnpar = 1;
148         regsize = 0L;
149         regcode = &regdummy;
150         regc(MAGIC);
151         if (reg(0, &flags) == NULL) {
152                 Safefree(regprecomp);
153                 return(NULL);
154         }
155
156         /* Small enough for pointer-storage convention? */
157         if (regsize >= 32767L)          /* Probably could be 65535L. */
158                 FAIL("regexp too big");
159
160         /* Allocate space. */
161         Newc(1001, r, sizeof(regexp) + (unsigned)regsize, char, regexp);
162         if (r == NULL)
163                 FAIL("regexp out of space");
164
165         /* Second pass: emit code. */
166         if (regsawbracket)
167             bcopy(regprecomp,exp,xend-exp);
168         r->precomp = regprecomp;
169         r->subbase = NULL;
170         regparse = exp;
171         regnpar = 1;
172         regcode = r->program;
173         regc(MAGIC);
174         if (reg(0, &flags) == NULL)
175                 return(NULL);
176
177         /* Dig out information for optimizations. */
178         r->regstart = Nullstr;  /* Worst-case defaults. */
179         r->reganch = 0;
180         r->regmust = Nullstr;
181         r->regback = -1;
182         r->regstclass = Nullch;
183         scan = r->program+1;                    /* First BRANCH. */
184         if (OP(regnext(scan)) == END) {/* Only one top-level choice. */
185                 scan = NEXTOPER(scan);
186
187                 first = scan;
188                 while ((OP(first) > OPEN && OP(first) < CLOSE) ||
189                     (OP(first) == BRANCH && OP(regnext(first)) != BRANCH) ||
190                     (OP(first) == PLUS) )
191                         first = NEXTOPER(first);
192
193                 /* Starting-point info. */
194                 if (OP(first) == EXACTLY) {
195                         r->regstart =
196                             str_make(OPERAND(first)+1,*OPERAND(first));
197                         if (r->regstart->str_cur > !(sawstudy|fold))
198                                 fbmcompile(r->regstart,fold);
199                 }
200                 else if ((exp = index(simple,OP(first))) && exp > simple)
201                         r->regstclass = first;
202                 else if (OP(first) == BOUND || OP(first) == NBOUND)
203                         r->regstclass = first;
204                 else if (OP(first) == BOL)
205                         r->reganch++;
206
207 #ifdef DEBUGGING
208                 if (debug & 512)
209                     fprintf(stderr,"first %d next %d offset %d\n",
210                       OP(first), OP(NEXTOPER(first)), first - scan);
211 #endif
212                 /*
213                  * If there's something expensive in the r.e., find the
214                  * longest literal string that must appear and make it the
215                  * regmust.  Resolve ties in favor of later strings, since
216                  * the regstart check works with the beginning of the r.e.
217                  * and avoiding duplication strengthens checking.  Not a
218                  * strong reason, but sufficient in the absence of others.
219                  * [Now we resolve ties in favor of the earlier string if
220                  * it happens that curback has been invalidated, since the
221                  * earlier string may buy us something the later one won't.]
222                  */
223                 longest = str_make("",0);
224                 len = 0;
225                 curback = 0;
226                 back = 0;
227                 while (scan != NULL) {
228                         if (OP(scan) == BRANCH) {
229                             if (OP(regnext(scan)) == BRANCH) {
230                                 curback = -30000;
231                                 while (OP(scan) == BRANCH)
232                                     scan = regnext(scan);
233                             }
234                             else        /* single branch is ok */
235                                 scan = NEXTOPER(scan);
236                         }
237                         if (OP(scan) == EXACTLY) {
238                             first = scan;
239                             while (OP(regnext(scan)) >= CLOSE)
240                                 scan = regnext(scan);
241                             if (curback - back == len) {
242                                 str_ncat(longest, OPERAND(first)+1,
243                                     *OPERAND(first));
244                                 len += *OPERAND(first);
245                                 curback += *OPERAND(first);
246                                 first = regnext(scan);
247                             }
248                             else if (*OPERAND(first) >= len + (curback >= 0)) {
249                                 len = *OPERAND(first);
250                                 str_nset(longest, OPERAND(first)+1,len);
251                                 back = curback;
252                                 curback += len;
253                                 first = regnext(scan);
254                             }
255                             else
256                                 curback += *OPERAND(first);
257                         }
258                         else if (index(varies,OP(scan)))
259                                 curback = -30000;
260                         else if (index(simple,OP(scan)))
261                                 curback++;
262                         scan = regnext(scan);
263                 }
264                 if (len) {
265                         r->regmust = longest;
266                         if (back < 0)
267                                 back = -1;
268                         r->regback = back;
269                         if (len > !(sawstudy||fold||OP(first)==EOL))
270                                 fbmcompile(r->regmust,fold);
271                         r->regmust->str_u.str_useful = 100;
272                         if (OP(first) == EOL) /* is match anchored to EOL? */
273                             r->regmust->str_pok |= SP_TAIL;
274                 }
275                 else
276                         str_free(longest);
277         }
278
279         r->do_folding = fold;
280         r->nparens = regnpar - 1;
281 #ifdef DEBUGGING
282         if (debug & 512)
283                 regdump(r);
284 #endif
285         return(r);
286 }
287
288 /*
289  - reg - regular expression, i.e. main body or parenthesized thing
290  *
291  * Caller must absorb opening parenthesis.
292  *
293  * Combining parenthesis handling with the base level of regular expression
294  * is a trifle forced, but the need to tie the tails of the branches to what
295  * follows makes it hard to avoid.
296  */
297 static char *
298 reg(paren, flagp)
299 int paren;                      /* Parenthesized? */
300 int *flagp;
301 {
302         register char *ret;
303         register char *br;
304         register char *ender;
305         register int parno;
306         int flags;
307
308         *flagp = HASWIDTH;      /* Tentatively. */
309
310         /* Make an OPEN node, if parenthesized. */
311         if (paren) {
312                 if (regnpar >= NSUBEXP)
313                         FAIL("too many () in regexp");
314                 parno = regnpar;
315                 regnpar++;
316                 ret = regnode(OPEN+parno);
317         } else
318                 ret = NULL;
319
320         /* Pick up the branches, linking them together. */
321         br = regbranch(&flags);
322         if (br == NULL)
323                 return(NULL);
324         if (ret != NULL)
325                 regtail(ret, br);       /* OPEN -> first. */
326         else
327                 ret = br;
328         if (!(flags&HASWIDTH))
329                 *flagp &= ~HASWIDTH;
330         *flagp |= flags&SPSTART;
331         while (*regparse == '|') {
332                 regparse++;
333                 br = regbranch(&flags);
334                 if (br == NULL)
335                         return(NULL);
336                 regtail(ret, br);       /* BRANCH -> BRANCH. */
337                 if (!(flags&HASWIDTH))
338                         *flagp &= ~HASWIDTH;
339                 *flagp |= flags&SPSTART;
340         }
341
342         /* Make a closing node, and hook it on the end. */
343         ender = regnode((paren) ? CLOSE+parno : END);   
344         regtail(ret, ender);
345
346         /* Hook the tails of the branches to the closing node. */
347         for (br = ret; br != NULL; br = regnext(br))
348                 regoptail(br, ender);
349
350         /* Check for proper termination. */
351         if (paren && *regparse++ != ')') {
352                 FAIL("unmatched () in regexp");
353         } else if (!paren && regparse < regxend) {
354                 if (*regparse == ')') {
355                         FAIL("unmatched () in regexp");
356                 } else
357                         FAIL("junk on end of regexp");  /* "Can't happen". */
358                 /* NOTREACHED */
359         }
360
361         return(ret);
362 }
363
364 /*
365  - regbranch - one alternative of an | operator
366  *
367  * Implements the concatenation operator.
368  */
369 static char *
370 regbranch(flagp)
371 int *flagp;
372 {
373         register char *ret;
374         register char *chain;
375         register char *latest;
376         int flags;
377
378         *flagp = WORST;         /* Tentatively. */
379
380         ret = regnode(BRANCH);
381         chain = NULL;
382         while (regparse < regxend && *regparse != '|' && *regparse != ')') {
383                 latest = regpiece(&flags);
384                 if (latest == NULL)
385                         return(NULL);
386                 *flagp |= flags&HASWIDTH;
387                 if (chain == NULL)      /* First piece. */
388                         *flagp |= flags&SPSTART;
389                 else
390                         regtail(chain, latest);
391                 chain = latest;
392         }
393         if (chain == NULL)      /* Loop ran zero times. */
394                 (void) regnode(NOTHING);
395
396         return(ret);
397 }
398
399 /*
400  - regpiece - something followed by possible [*+?]
401  *
402  * Note that the branching code sequences used for ? and the general cases
403  * of * and + are somewhat optimized:  they use the same NOTHING node as
404  * both the endmarker for their branch list and the body of the last branch.
405  * It might seem that this node could be dispensed with entirely, but the
406  * endmarker role is not redundant.
407  */
408 static char *
409 regpiece(flagp)
410 int *flagp;
411 {
412         register char *ret;
413         register char op;
414         register char *next;
415         int flags;
416         char *origparse = regparse;
417         int orignpar = regnpar;
418         char *max;
419         int iter;
420         char ch;
421
422         ret = regatom(&flags);
423         if (ret == NULL)
424                 return(NULL);
425
426         op = *regparse;
427
428         /* Here's a total kludge: if after the atom there's a {\d+,?\d*}
429          * then we decrement the first number by one and reset our
430          * parsing back to the beginning of the same atom.  If the first number
431          * is down to 0, decrement the second number instead and fake up
432          * a ? after it.  Given the way this compiler doesn't keep track
433          * of offsets on the first pass, this is the only way to replicate
434          * a piece of code.  Sigh.
435          */
436         if (op == '{' && regcurly(regparse)) {
437             next = regparse + 1;
438             max = Nullch;
439             while (isdigit(*next) || *next == ',') {
440                 if (*next == ',') {
441                     if (max)
442                         break;
443                     else
444                         max = next;
445                 }
446                 next++;
447             }
448             if (*next == '}') {         /* got one */
449                 regsawbracket++;        /* remember we clobbered exp */
450                 if (!max)
451                     max = next;
452                 regparse++;
453                 iter = atoi(regparse);
454                 if (iter > 0) {
455                     ch = *max;
456                     sprintf(regparse,"%.*d", max-regparse, iter - 1);
457                     *max = ch;
458                     if (*max == ',' && atoi(max+1) > 0) {
459                         ch = *next;
460                         sprintf(max+1,"%.*d", next-(max+1), atoi(max+1) - 1);
461                         *next = ch;
462                     }
463                     if (iter != 1 || (*max == ',' || atoi(max+1))) {
464                         regparse = origparse;   /* back up input pointer */
465                         regnpar = orignpar;     /* don't make more parens */
466                     }
467                     else {
468                         regparse = next;
469                         goto nest_check;
470                     }
471                     *flagp = flags;
472                     return ret;
473                 }
474                 if (*max == ',') {
475                     max++;
476                     iter = atoi(max);
477                     if (max == next) {          /* any number more? */
478                         regparse = next;
479                         op = '*';               /* fake up one with a star */
480                     }
481                     else if (iter > 0) {
482                         op = '?';               /* fake up optional atom */
483                         ch = *next;
484                         sprintf(max,"%.*d", next-max, iter - 1);
485                         *next = ch;
486                         if (iter == 1)
487                             regparse = next;
488                         else {
489                             regparse = origparse - 1; /* offset ++ below */
490                             regnpar = orignpar;
491                         }
492                     }
493                     else
494                         fatal("Can't do {n,0}");
495                 }
496                 else
497                     fatal("Can't do {0}");
498             }
499         }
500
501         if (!ISMULT1(op)) {
502                 *flagp = flags;
503                 return(ret);
504         }
505
506         if (!(flags&HASWIDTH) && op != '?')
507                 FAIL("regexp *+ operand could be empty");
508         *flagp = (op != '+') ? (WORST|SPSTART) : (WORST|HASWIDTH);
509
510         if (op == '*' && (flags&SIMPLE))
511                 reginsert(STAR, ret);
512         else if (op == '*') {
513                 /* Emit x* as (x&|), where & means "self". */
514                 reginsert(BRANCH, ret);                 /* Either x */
515                 regoptail(ret, regnode(BACK));          /* and loop */
516                 regoptail(ret, ret);                    /* back */
517                 regtail(ret, regnode(BRANCH));          /* or */
518                 regtail(ret, regnode(NOTHING));         /* null. */
519         } else if (op == '+' && (flags&SIMPLE))
520                 reginsert(PLUS, ret);
521         else if (op == '+') {
522                 /* Emit x+ as x(&|), where & means "self". */
523                 next = regnode(BRANCH);                 /* Either */
524                 regtail(ret, next);
525                 regtail(regnode(BACK), ret);            /* loop back */
526                 regtail(next, regnode(BRANCH));         /* or */
527                 regtail(ret, regnode(NOTHING));         /* null. */
528         } else if (op == '?') {
529                 /* Emit x? as (x|) */
530                 reginsert(BRANCH, ret);                 /* Either x */
531                 regtail(ret, regnode(BRANCH));          /* or */
532                 next = regnode(NOTHING);                /* null. */
533                 regtail(ret, next);
534                 regoptail(ret, next);
535         }
536       nest_check:
537         regparse++;
538         if (ISMULT2(regparse))
539                 FAIL("nested *?+ in regexp");
540
541         return(ret);
542 }
543
544 /*
545  - regatom - the lowest level
546  *
547  * Optimization:  gobbles an entire sequence of ordinary characters so that
548  * it can turn them into a single node, which is smaller to store and
549  * faster to run.  Backslashed characters are exceptions, each becoming a
550  * separate node; the code is simpler that way and it's not worth fixing.
551  *
552  * [Yes, it is worth fixing, some scripts can run twice the speed.]
553  */
554 static char *
555 regatom(flagp)
556 int *flagp;
557 {
558         register char *ret;
559         int flags;
560
561         *flagp = WORST;         /* Tentatively. */
562
563         switch (*regparse++) {
564         case '^':
565                 ret = regnode(BOL);
566                 break;
567         case '$':
568                 ret = regnode(EOL);
569                 break;
570         case '.':
571                 ret = regnode(ANY);
572                 *flagp |= HASWIDTH|SIMPLE;
573                 break;
574         case '[':
575                 ret = regclass();
576                 *flagp |= HASWIDTH|SIMPLE;
577                 break;
578         case '(':
579                 ret = reg(1, &flags);
580                 if (ret == NULL)
581                         return(NULL);
582                 *flagp |= flags&(HASWIDTH|SPSTART);
583                 break;
584         case '|':
585         case ')':
586                 FAIL("internal urp in regexp"); /* Supposed to be caught earlier. */
587                 break;
588         case '?':
589         case '+':
590         case '*':
591                 FAIL("?+* follows nothing in regexp");
592                 break;
593         case '\\':
594                 switch (*regparse) {
595                 case 'w':
596                         ret = regnode(ALNUM);
597                         *flagp |= HASWIDTH|SIMPLE;
598                         regparse++;
599                         break;
600                 case 'W':
601                         ret = regnode(NALNUM);
602                         *flagp |= HASWIDTH|SIMPLE;
603                         regparse++;
604                         break;
605                 case 'b':
606                         ret = regnode(BOUND);
607                         *flagp |= SIMPLE;
608                         regparse++;
609                         break;
610                 case 'B':
611                         ret = regnode(NBOUND);
612                         *flagp |= SIMPLE;
613                         regparse++;
614                         break;
615                 case 's':
616                         ret = regnode(SPACE);
617                         *flagp |= HASWIDTH|SIMPLE;
618                         regparse++;
619                         break;
620                 case 'S':
621                         ret = regnode(NSPACE);
622                         *flagp |= HASWIDTH|SIMPLE;
623                         regparse++;
624                         break;
625                 case 'd':
626                         ret = regnode(DIGIT);
627                         *flagp |= HASWIDTH|SIMPLE;
628                         regparse++;
629                         break;
630                 case 'D':
631                         ret = regnode(NDIGIT);
632                         *flagp |= HASWIDTH|SIMPLE;
633                         regparse++;
634                         break;
635                 case 'n':
636                 case 'r':
637                 case 't':
638                 case 'f':
639                         goto defchar;
640                 case '0': case '1': case '2': case '3': case '4':
641                 case '5': case '6': case '7': case '8': case '9':
642                         if (isdigit(regparse[1]))
643                                 goto defchar;
644                         else {
645                                 ret = regnode(REF + *regparse++ - '0');
646                                 *flagp |= SIMPLE;
647                         }
648                         break;
649                 case '\0':
650                         if (regparse >= regxend)
651                             FAIL("trailing \\ in regexp");
652                         /* FALL THROUGH */
653                 default:
654                         goto defchar;
655                 }
656                 break;
657         default: {
658                         register int len;
659                         register char ender;
660                         register char *p;
661                         char *oldp;
662                         int foo;
663
664                     defchar:
665                         ret = regnode(EXACTLY);
666                         regc(0);                /* save spot for len */
667                         for (len=0, p=regparse-1;
668                           len < 127 && p < regxend;
669                           len++)
670                         {
671                             oldp = p;
672                             switch (*p) {
673                             case '^':
674                             case '$':
675                             case '.':
676                             case '[':
677                             case '(':
678                             case ')':
679                             case '|':
680                                 goto loopdone;
681                             case '\\':
682                                 switch (*++p) {
683                                 case 'w':
684                                 case 'W':
685                                 case 'b':
686                                 case 'B':
687                                 case 's':
688                                 case 'S':
689                                 case 'd':
690                                 case 'D':
691                                     --p;
692                                     goto loopdone;
693                                 case 'n':
694                                         ender = '\n';
695                                         p++;
696                                         break;
697                                 case 'r':
698                                         ender = '\r';
699                                         p++;
700                                         break;
701                                 case 't':
702                                         ender = '\t';
703                                         p++;
704                                         break;
705                                 case 'f':
706                                         ender = '\f';
707                                         p++;
708                                         break;
709                                 case '0': case '1': case '2': case '3':case '4':
710                                 case '5': case '6': case '7': case '8':case '9':
711                                     if (isdigit(p[1])) {
712                                         foo = *p++ - '0';
713                                         foo <<= 3;
714                                         foo += *p - '0';
715                                         if (isdigit(p[1]))
716                                             foo = (foo<<3) + *++p - '0';
717                                         ender = foo;
718                                         p++;
719                                     }
720                                     else {
721                                         --p;
722                                         goto loopdone;
723                                     }
724                                     break;
725                                 case '\0':
726                                     if (p >= regxend)
727                                         FAIL("trailing \\ in regexp");
728                                     /* FALL THROUGH */
729                                 default:
730                                     ender = *p++;
731                                     break;
732                                 }
733                                 break;
734                             default:
735                                 ender = *p++;
736                                 break;
737                             }
738                             if (regfold && isupper(ender))
739                                     ender = tolower(ender);
740                             if (ISMULT2(p)) { /* Back off on ?+*. */
741                                 if (len)
742                                     p = oldp;
743                                 else {
744                                     len++;
745                                     regc(ender);
746                                 }
747                                 break;
748                             }
749                             regc(ender);
750                         }
751                     loopdone:
752                         regparse = p;
753                         if (len <= 0)
754                                 FAIL("internal disaster in regexp");
755                         *flagp |= HASWIDTH;
756                         if (len == 1)
757                                 *flagp |= SIMPLE;
758                         if (regcode != &regdummy)
759                             *OPERAND(ret) = len;
760                         regc('\0');
761                 }
762                 break;
763         }
764
765         return(ret);
766 }
767
768 static void
769 regset(bits,def,c)
770 char *bits;
771 int def;
772 register int c;
773 {
774         if (regcode == &regdummy)
775             return;
776         c &= 255;
777         if (def)
778                 bits[c >> 3] &= ~(1 << (c & 7));
779         else
780                 bits[c >> 3] |=  (1 << (c & 7));
781 }
782
783 static char *
784 regclass()
785 {
786         register char *bits;
787         register int class;
788         register int lastclass;
789         register int range = 0;
790         register char *ret;
791         register int def;
792
793         if (*regparse == '^') { /* Complement of range. */
794                 ret = regnode(ANYBUT);
795                 regparse++;
796                 def = 0;
797         } else {
798                 ret = regnode(ANYOF);
799                 def = 255;
800         }
801         bits = regcode;
802         for (class = 0; class < 32; class++)
803             regc(def);
804         if (*regparse == ']' || *regparse == '-')
805                 regset(bits,def,lastclass = *regparse++);
806         while (regparse < regxend && *regparse != ']') {
807                 class = UCHARAT(regparse++);
808                 if (class == '\\') {
809                         class = UCHARAT(regparse++);
810                         switch (class) {
811                         case 'w':
812                                 for (class = 'a'; class <= 'z'; class++)
813                                         regset(bits,def,class);
814                                 for (class = 'A'; class <= 'Z'; class++)
815                                         regset(bits,def,class);
816                                 for (class = '0'; class <= '9'; class++)
817                                         regset(bits,def,class);
818                                 regset(bits,def,'_');
819                                 lastclass = 1234;
820                                 continue;
821                         case 's':
822                                 regset(bits,def,' ');
823                                 regset(bits,def,'\t');
824                                 regset(bits,def,'\r');
825                                 regset(bits,def,'\f');
826                                 regset(bits,def,'\n');
827                                 lastclass = 1234;
828                                 continue;
829                         case 'd':
830                                 for (class = '0'; class <= '9'; class++)
831                                         regset(bits,def,class);
832                                 lastclass = 1234;
833                                 continue;
834                         case 'n':
835                                 class = '\n';
836                                 break;
837                         case 'r':
838                                 class = '\r';
839                                 break;
840                         case 't':
841                                 class = '\t';
842                                 break;
843                         case 'f':
844                                 class = '\f';
845                                 break;
846                         case 'b':
847                                 class = '\b';
848                                 break;
849                         case '0': case '1': case '2': case '3': case '4':
850                         case '5': case '6': case '7': case '8': case '9':
851                                 class -= '0';
852                                 if (isdigit(*regparse)) {
853                                         class <<= 3;
854                                         class += *regparse++ - '0';
855                                 }
856                                 if (isdigit(*regparse)) {
857                                         class <<= 3;
858                                         class += *regparse++ - '0';
859                                 }
860                                 break;
861                         }
862                 }
863                 if (!range && class == '-' && regparse < regxend &&
864                     *regparse != ']') {
865                         range = 1;
866                         continue;
867                 }
868                 if (range) {
869                         if (lastclass > class)
870                                 FAIL("invalid [] range in regexp");
871                 }
872                 else
873                         lastclass = class - 1;
874                 range = 0;
875                 for (lastclass++; lastclass <= class; lastclass++) {
876                         regset(bits,def,lastclass);
877                         if (regfold && isupper(lastclass))
878                                 regset(bits,def,tolower(lastclass));
879                 }
880                 lastclass = class;
881         }
882         if (*regparse != ']')
883                 FAIL("unmatched [] in regexp");
884         regparse++;
885         return ret;
886 }
887
888 /*
889  - regnode - emit a node
890  */
891 static char *                   /* Location. */
892 regnode(op)
893 char op;
894 {
895         register char *ret;
896         register char *ptr;
897
898         ret = regcode;
899         if (ret == &regdummy) {
900 #ifdef REGALIGN
901                 if (!(regsize & 1))
902                         regsize++;
903 #endif
904                 regsize += 3;
905                 return(ret);
906         }
907
908 #ifdef REGALIGN
909 #ifndef lint
910         if (!((long)ret & 1))
911             *ret++ = 127;
912 #endif
913 #endif
914         ptr = ret;
915         *ptr++ = op;
916         *ptr++ = '\0';          /* Null "next" pointer. */
917         *ptr++ = '\0';
918         regcode = ptr;
919
920         return(ret);
921 }
922
923 /*
924  - regc - emit (if appropriate) a byte of code
925  */
926 static void
927 regc(b)
928 char b;
929 {
930         if (regcode != &regdummy)
931                 *regcode++ = b;
932         else
933                 regsize++;
934 }
935
936 /*
937  - reginsert - insert an operator in front of already-emitted operand
938  *
939  * Means relocating the operand.
940  */
941 static void
942 reginsert(op, opnd)
943 char op;
944 char *opnd;
945 {
946         register char *src;
947         register char *dst;
948         register char *place;
949
950         if (regcode == &regdummy) {
951 #ifdef REGALIGN
952                 regsize += 4;
953 #else
954                 regsize += 3;
955 #endif
956                 return;
957         }
958
959         src = regcode;
960 #ifdef REGALIGN
961         regcode += 4;
962 #else
963         regcode += 3;
964 #endif
965         dst = regcode;
966         while (src > opnd)
967                 *--dst = *--src;
968
969         place = opnd;           /* Op node, where operand used to be. */
970         *place++ = op;
971         *place++ = '\0';
972         *place++ = '\0';
973 }
974
975 /*
976  - regtail - set the next-pointer at the end of a node chain
977  */
978 static void
979 regtail(p, val)
980 char *p;
981 char *val;
982 {
983         register char *scan;
984         register char *temp;
985         register int offset;
986
987         if (p == &regdummy)
988                 return;
989
990         /* Find last node. */
991         scan = p;
992         for (;;) {
993                 temp = regnext(scan);
994                 if (temp == NULL)
995                         break;
996                 scan = temp;
997         }
998
999 #ifdef REGALIGN
1000         offset = val - scan;
1001 #ifndef lint
1002         *(short*)(scan+1) = offset;
1003 #else
1004         offset = offset;
1005 #endif
1006 #else
1007         if (OP(scan) == BACK)
1008                 offset = scan - val;
1009         else
1010                 offset = val - scan;
1011         *(scan+1) = (offset>>8)&0377;
1012         *(scan+2) = offset&0377;
1013 #endif
1014 }
1015
1016 /*
1017  - regoptail - regtail on operand of first argument; nop if operandless
1018  */
1019 static void
1020 regoptail(p, val)
1021 char *p;
1022 char *val;
1023 {
1024         /* "Operandless" and "op != BRANCH" are synonymous in practice. */
1025         if (p == NULL || p == &regdummy || OP(p) != BRANCH)
1026                 return;
1027         regtail(NEXTOPER(p), val);
1028 }
1029
1030 /*
1031  - regcurly - a little FSA that accepts {\d+,?\d*}
1032  */
1033 STATIC int
1034 regcurly(s)
1035 register char *s;
1036 {
1037     if (*s++ != '{')
1038         return FALSE;
1039     if (!isdigit(*s))
1040         return FALSE;
1041     while (isdigit(*s))
1042         s++;
1043     if (*s == ',')
1044         s++;
1045     while (isdigit(*s))
1046         s++;
1047     if (*s != '}')
1048         return FALSE;
1049     return TRUE;
1050 }
1051
1052 #ifdef DEBUGGING
1053
1054 /*
1055  - regdump - dump a regexp onto stderr in vaguely comprehensible form
1056  */
1057 void
1058 regdump(r)
1059 regexp *r;
1060 {
1061         register char *s;
1062         register char op = EXACTLY;     /* Arbitrary non-END op. */
1063         register char *next;
1064         extern char *index();
1065
1066
1067         s = r->program + 1;
1068         while (op != END) {     /* While that wasn't END last time... */
1069 #ifdef REGALIGN
1070                 if (!((long)s & 1))
1071                         s++;
1072 #endif
1073                 op = OP(s);
1074                 fprintf(stderr,"%2d%s", s-r->program, regprop(s));      /* Where, what. */
1075                 next = regnext(s);
1076                 if (next == NULL)               /* Next ptr. */
1077                         fprintf(stderr,"(0)");
1078                 else 
1079                         fprintf(stderr,"(%d)", (s-r->program)+(next-s));
1080                 s += 3;
1081                 if (op == ANYOF || op == ANYBUT) {
1082                         s += 32;
1083                 }
1084                 if (op == EXACTLY) {
1085                         /* Literal string, where present. */
1086                         s++;
1087                         while (*s != '\0') {
1088                                 (void)putchar(*s);
1089                                 s++;
1090                         }
1091                         s++;
1092                 }
1093                 (void)putchar('\n');
1094         }
1095
1096         /* Header fields of interest. */
1097         if (r->regstart)
1098                 fprintf(stderr,"start `%s' ", r->regstart->str_ptr);
1099         if (r->regstclass)
1100                 fprintf(stderr,"stclass `%s' ", regprop(r->regstclass));
1101         if (r->reganch)
1102                 fprintf(stderr,"anchored ");
1103         if (r->regmust != NULL)
1104                 fprintf(stderr,"must have \"%s\" back %d ", r->regmust->str_ptr,
1105                   r->regback);
1106         fprintf(stderr,"\n");
1107 }
1108
1109 /*
1110  - regprop - printable representation of opcode
1111  */
1112 char *
1113 regprop(op)
1114 char *op;
1115 {
1116         register char *p;
1117
1118         (void) strcpy(buf, ":");
1119
1120         switch (OP(op)) {
1121         case BOL:
1122                 p = "BOL";
1123                 break;
1124         case EOL:
1125                 p = "EOL";
1126                 break;
1127         case ANY:
1128                 p = "ANY";
1129                 break;
1130         case ANYOF:
1131                 p = "ANYOF";
1132                 break;
1133         case ANYBUT:
1134                 p = "ANYBUT";
1135                 break;
1136         case BRANCH:
1137                 p = "BRANCH";
1138                 break;
1139         case EXACTLY:
1140                 p = "EXACTLY";
1141                 break;
1142         case NOTHING:
1143                 p = "NOTHING";
1144                 break;
1145         case BACK:
1146                 p = "BACK";
1147                 break;
1148         case END:
1149                 p = "END";
1150                 break;
1151         case ALNUM:
1152                 p = "ALNUM";
1153                 break;
1154         case NALNUM:
1155                 p = "NALNUM";
1156                 break;
1157         case BOUND:
1158                 p = "BOUND";
1159                 break;
1160         case NBOUND:
1161                 p = "NBOUND";
1162                 break;
1163         case SPACE:
1164                 p = "SPACE";
1165                 break;
1166         case NSPACE:
1167                 p = "NSPACE";
1168                 break;
1169         case DIGIT:
1170                 p = "DIGIT";
1171                 break;
1172         case NDIGIT:
1173                 p = "NDIGIT";
1174                 break;
1175         case REF:
1176         case REF+1:
1177         case REF+2:
1178         case REF+3:
1179         case REF+4:
1180         case REF+5:
1181         case REF+6:
1182         case REF+7:
1183         case REF+8:
1184         case REF+9:
1185                 (void)sprintf(buf+strlen(buf), "REF%d", OP(op)-REF);
1186                 p = NULL;
1187                 break;
1188         case OPEN+1:
1189         case OPEN+2:
1190         case OPEN+3:
1191         case OPEN+4:
1192         case OPEN+5:
1193         case OPEN+6:
1194         case OPEN+7:
1195         case OPEN+8:
1196         case OPEN+9:
1197                 (void)sprintf(buf+strlen(buf), "OPEN%d", OP(op)-OPEN);
1198                 p = NULL;
1199                 break;
1200         case CLOSE+1:
1201         case CLOSE+2:
1202         case CLOSE+3:
1203         case CLOSE+4:
1204         case CLOSE+5:
1205         case CLOSE+6:
1206         case CLOSE+7:
1207         case CLOSE+8:
1208         case CLOSE+9:
1209                 (void)sprintf(buf+strlen(buf), "CLOSE%d", OP(op)-CLOSE);
1210                 p = NULL;
1211                 break;
1212         case STAR:
1213                 p = "STAR";
1214                 break;
1215         case PLUS:
1216                 p = "PLUS";
1217                 break;
1218         default:
1219                 FAIL("corrupted regexp opcode");
1220         }
1221         if (p != NULL)
1222                 (void) strcat(buf, p);
1223         return(buf);
1224 }
1225 #endif /* DEBUGGING */
1226
1227 regfree(r)
1228 struct regexp *r;
1229 {
1230         if (r->precomp)
1231                 Safefree(r->precomp);
1232         if (r->subbase)
1233                 Safefree(r->subbase);
1234         if (r->regmust)
1235                 str_free(r->regmust);
1236         if (r->regstart)
1237                 str_free(r->regstart);
1238         Safefree(r);
1239 }