This is a live mirror of the Perl 5 development currently hosted at https://github.com/perl/perl5
perl 4.0 patch 16: patch #11, continued
[perl5.git] / regcomp.c
index 04d62c3..0fd50c0 100644 (file)
--- a/regcomp.c
+++ b/regcomp.c
@@ -7,36 +7,28 @@
  * blame Henry for some of the lack of readability.
  */
 
-/* $Header: regcomp.c,v 3.0.1.6 90/10/16 10:17:33 lwall Locked $
+/* $RCSfile: regcomp.c,v $$Revision: 4.0.1.3 $$Date: 91/11/05 18:22:28 $
  *
  * $Log:       regcomp.c,v $
- * Revision 3.0.1.6  90/10/16  10:17:33  lwall
- * patch29: patterns with multiple short literal strings sometimes failed
+ * Revision 4.0.1.3  91/11/05  18:22:28  lwall
+ * patch11: minimum match length calculation in regexp is now cumulative
+ * patch11: initial .* in pattern had dependency on value of $*
+ * patch11: certain patterns made use of garbage pointers from uncleared memory
+ * patch11: prepared for ctype implementations that don't define isascii()
  * 
- * Revision 3.0.1.5  90/08/13  22:23:29  lwall
- * patch28: /x{m}/ didn't work right
+ * Revision 4.0.1.2  91/06/07  11:48:24  lwall
+ * patch4: new copyright notice
+ * patch4: /(x+) \1/ incorrectly optimized to not match "xxx xx"
+ * patch4: // wouldn't use previous pattern if it started with a null character
  * 
- * Revision 3.0.1.4  90/08/09  05:05:33  lwall
- * patch19: sped up /x+y/ patterns greatly by not retrying on every x
- * patch19: inhibited backoff on patterns anchored to the end like /\s+$/
- * patch19: sped up {m,n} on simple items
- * patch19: optimized /.*whatever/ to /^.*whatever/
- * patch19: fixed character classes to allow backslashing hyphen
+ * Revision 4.0.1.1  91/04/12  09:04:45  lwall
+ * patch1: random cleanup in cpp namespace
  * 
- * Revision 3.0.1.3  90/03/12  16:59:22  lwall
- * patch13: pattern matches can now use \0 to mean \000
- * 
- * Revision 3.0.1.2  90/02/28  18:08:35  lwall
- * patch9: /[\200-\377]/ didn't work on machines with signed chars
- * 
- * Revision 3.0.1.1  89/11/11  04:51:04  lwall
- * patch2: /[\000]/ didn't work
- * 
- * Revision 3.0  89/10/18  15:22:29  lwall
- * 3.0 baseline
+ * Revision 4.0  91/03/20  01:39:01  lwall
+ * 4.0 baseline.
  * 
  */
-
+/*SUPPRESS 112*/
 /*
  * regcomp and regexec -- regsub and regerror are not used in perl
  *
  *
  ****    Alterations to Henry's code are...
  ****
- ****    Copyright (c) 1989, Larry Wall
+ ****    Copyright (c) 1991, Larry Wall
  ****
- ****    You may distribute under the terms of the GNU General Public License
- ****    as specified in the README file that comes with the perl 3.0 kit.
+ ****    You may distribute under the terms of either the GNU General Public
+ ****    License or the Artistic License, as specified in the README file.
+
  *
  * Beware that some of this code is subtly aware of the way operator
  * precedence is structured in regular expressions.  Serious changes in
 #include "INTERN.h"
 #include "regcomp.h"
 
+#ifdef MSDOS
+# if defined(BUGGY_MSC6)
+ /* MSC 6.00A breaks on op/regexp.t test 85 unless we turn this off */
+ # pragma optimize("a",off)
+ /* But MSC 6.00A is happy with 'w', for aliases only across function calls*/
+ # pragma optimize("w",on )
+# endif /* BUGGY_MSC6 */
+#endif /* MSDOS */
+
 #ifndef STATIC
 #define        STATIC  static
 #endif
@@ -83,6 +85,9 @@
        ((*s) == '{' && regcurly(s)))
 #define        META    "^$.[()|?+*\\"
 
+#ifdef SPSTART
+#undef SPSTART         /* dratted cpp namespace... */
+#endif
 /*
  * Flags to be passed up and down.
  */
@@ -102,6 +107,7 @@ static char *regcode;               /* Code-emit pointer; &regdummy = don't. */
 static long regsize;           /* Code size. */
 static int regfold;
 static int regsawbracket;      /* Did we do {d,d} trick? */
+static int regsawback;         /* Did we see \1, ...? */
 
 /*
  * Forward declarations for regcomp()'s friends.
@@ -113,6 +119,7 @@ STATIC char *regpiece();
 STATIC char *regatom();
 STATIC char *regclass();
 STATIC char *regnode();
+STATIC char *reganode();
 STATIC void regc();
 STATIC void reginsert();
 STATIC void regtail();
@@ -146,11 +153,14 @@ int fold;
        register int len;
        register char *first;
        int flags;
-       int back;
+       int backish;
+       int backest;
        int curback;
+       int minlen;
        extern char *safemalloc();
        extern char *savestr();
        int sawplus = 0;
+       int sawopen = 0;
 
        if (exp == NULL)
                fatal("NULL regexp argument");
@@ -161,12 +171,14 @@ int fold;
        regxend = xend;
        regprecomp = nsavestr(exp,xend-exp);
        regsawbracket = 0;
+       regsawback = 0;
        regnpar = 1;
        regsize = 0L;
        regcode = &regdummy;
-       regc(MAGIC);
+       regc((char)MAGIC);
        if (reg(0, &flags) == NULL) {
                Safefree(regprecomp);
+               regprecomp = Nullch;
                return(NULL);
        }
 
@@ -182,12 +194,13 @@ int fold;
        /* Second pass: emit code. */
        if (regsawbracket)
            bcopy(regprecomp,exp,xend-exp);
+       r->prelen = xend-exp;
        r->precomp = regprecomp;
-       r->subbase = NULL;
+       r->subbeg = r->subbase = NULL;
        regparse = exp;
        regnpar = 1;
        regcode = r->program;
-       regc(MAGIC);
+       regc((char)MAGIC);
        if (reg(0, &flags) == NULL)
                return(NULL);
 
@@ -202,18 +215,19 @@ int fold;
                scan = NEXTOPER(scan);
 
                first = scan;
-               while ((OP(first) > OPEN && OP(first) < CLOSE) ||
+               while ((OP(first) == OPEN && (sawopen = 1)) ||
                    (OP(first) == BRANCH && OP(regnext(first)) != BRANCH) ||
                    (OP(first) == PLUS) ||
                    (OP(first) == CURLY && ARG1(first) > 0) ) {
-                       if (OP(first) == CURLY)
-                           first += 4;
-                       else if (OP(first) == PLUS)
-                           sawplus = 2;
+                       if (OP(first) == PLUS)
+                           sawplus = 1;
+                       else
+                           first += regarglen[OP(first)];
                        first = NEXTOPER(first);
                }
 
                /* Starting-point info. */
+           again:
                if (OP(first) == EXACTLY) {
                        r->regstart =
                            str_make(OPERAND(first)+1,*OPERAND(first));
@@ -225,9 +239,14 @@ int fold;
                else if (OP(first) == BOUND || OP(first) == NBOUND)
                        r->regstclass = first;
                else if (OP(first) == BOL ||
-                   (OP(first) == STAR && OP(NEXTOPER(first)) == ANY) )
-                       r->reganch = 1;         /* kinda turn .* into ^.* */
-               r->reganch |= sawplus;
+                   (OP(first) == STAR && OP(NEXTOPER(first)) == ANY) ) {
+                       /* kinda turn .* into ^.* */
+                       r->reganch = ROPT_ANCH | ROPT_IMPLICIT;
+                       first = NEXTOPER(first);
+                       goto again;
+               }
+               if (sawplus && (!sawopen || !regsawback))
+                   r->reganch |= ROPT_SKIP;    /* x+ must match 1st of run */
 
 #ifdef DEBUGGING
                if (debug & 512)
@@ -248,9 +267,11 @@ int fold;
                longish = str_make("",0);
                longest = str_make("",0);
                len = 0;
+               minlen = 0;
                curback = 0;
-               back = 0;
-               while (scan != NULL) {
+               backish = 0;
+               backest = 0;
+               while (OP(scan) != END) {
                        if (OP(scan) == BRANCH) {
                            if (OP(regnext(scan)) == BRANCH) {
                                curback = -30000;
@@ -261,10 +282,13 @@ int fold;
                                scan = NEXTOPER(scan);
                        }
                        if (OP(scan) == EXACTLY) {
+                           char *t;
+
                            first = scan;
-                           while (OP(regnext(scan)) >= CLOSE)
-                               scan = regnext(scan);
-                           if (curback - back == len) {
+                           while (OP(t = regnext(scan)) == CLOSE)
+                               scan = t;
+                           minlen += *OPERAND(first);
+                           if (curback - backish == len) {
                                str_ncat(longish, OPERAND(first)+1,
                                    *OPERAND(first));
                                len += *OPERAND(first);
@@ -274,7 +298,7 @@ int fold;
                            else if (*OPERAND(first) >= len + (curback >= 0)) {
                                len = *OPERAND(first);
                                str_nset(longish, OPERAND(first)+1,len);
-                               back = curback;
+                               backish = curback;
                                curback += len;
                                first = regnext(scan);
                            }
@@ -284,35 +308,73 @@ int fold;
                        else if (index(varies,OP(scan))) {
                            curback = -30000;
                            len = 0;
-                           if (longish->str_cur > longest->str_cur)
+                           if (longish->str_cur > longest->str_cur) {
                                str_sset(longest,longish);
+                               backest = backish;
+                           }
                            str_nset(longish,"",0);
+                           if (OP(scan) == PLUS &&
+                             index(simple,OP(NEXTOPER(scan))))
+                               minlen++;
+                           else if (OP(scan) == CURLY &&
+                             index(simple,OP(NEXTOPER(scan)+4)))
+                               minlen += ARG1(scan);
                        }
-                       else if (index(simple,OP(scan)))
+                       else if (index(simple,OP(scan))) {
                            curback++;
+                           minlen++;
+                           len = 0;
+                           if (longish->str_cur > longest->str_cur) {
+                               str_sset(longest,longish);
+                               backest = backish;
+                           }
+                           str_nset(longish,"",0);
+                       }
                        scan = regnext(scan);
                }
-               if (longish->str_cur > longest->str_cur)
+
+               /* Prefer earlier on tie, unless we can tail match latter */
+
+               if (longish->str_cur + (OP(first) == EOL) > longest->str_cur) {
                    str_sset(longest,longish);
-               str_free(longish);
-               if (longest->str_cur) {
+                   backest = backish;
+               }
+               else
+                   str_nset(longish,"",0);
+               if (longest->str_cur
+                   &&
+                   (!r->regstart
+                    ||
+                    !fbminstr((unsigned char*) r->regstart->str_ptr,
+                         (unsigned char *) r->regstart->str_ptr
+                           + r->regstart->str_cur,
+                         longest)
+                   )
+                  )
+               {
                        r->regmust = longest;
-                       if (back < 0)
-                               back = -1;
-                       r->regback = back;
+                       if (backest < 0)
+                               backest = -1;
+                       r->regback = backest;
                        if (longest->str_cur
                          > !(sawstudy || fold || OP(first) == EOL) )
                                fbmcompile(r->regmust,fold);
                        r->regmust->str_u.str_useful = 100;
-                       if (OP(first) == EOL) /* is match anchored to EOL? */
+                       if (OP(first) == EOL && longish->str_cur)
                            r->regmust->str_pok |= SP_TAIL;
                }
-               else
+               else {
                        str_free(longest);
+                       longest = Nullstr;
+               }
+               str_free(longish);
        }
 
        r->do_folding = fold;
        r->nparens = regnpar - 1;
+       r->minlen = minlen;
+       Newz(1002, r->startp, regnpar, char*);
+       Newz(1002, r->endp, regnpar, char*);
 #ifdef DEBUGGING
        if (debug & 512)
                regdump(r);
@@ -344,11 +406,9 @@ int *flagp;
 
        /* Make an OPEN node, if parenthesized. */
        if (paren) {
-               if (regnpar >= NSUBEXP)
-                       FAIL("too many () in regexp");
                parno = regnpar;
                regnpar++;
-               ret = regnode(OPEN+parno);
+               ret = reganode(OPEN, parno);
        } else
                ret = NULL;
 
@@ -375,7 +435,10 @@ int *flagp;
        }
 
        /* Make a closing node, and hook it on the end. */
-       ender = regnode((paren) ? CLOSE+parno : END);   
+       if (paren)
+           ender = reganode(CLOSE, parno);
+       else
+           ender = regnode(END);
        regtail(ret, ender);
 
        /* Hook the tails of the branches to the closing node. */
@@ -471,7 +534,7 @@ int *flagp;
        if (op == '{' && regcurly(regparse)) {
            next = regparse + 1;
            max = Nullch;
-           while (isdigit(*next) || *next == ',') {
+           while (isDIGIT(*next) || *next == ',') {
                if (*next == ',') {
                    if (max)
                        break;
@@ -489,6 +552,8 @@ int *flagp;
                    int tmp;
 
                    reginsert(CURLY, ret);
+                   if (iter > 0)
+                       *flagp = (WORST|HASWIDTH);
                    if (*max == ',')
                        max++;
                    else
@@ -696,14 +761,26 @@ int *flagp;
                case 'r':
                case 't':
                case 'f':
+               case 'e':
+               case 'a':
+               case 'x':
+               case 'c':
+               case '0':
                        goto defchar;
-               case '0': case '1': case '2': case '3': case '4':
+               case '1': case '2': case '3': case '4':
                case '5': case '6': case '7': case '8': case '9':
-                       if (isdigit(regparse[1]) || *regparse == '0')
+                       {
+                           int num = atoi(regparse);
+
+                           if (num > 9 && num >= regnpar)
                                goto defchar;
-                       else {
-                               ret = regnode(REF + *regparse++ - '0');
+                           else {
+                               regsawback = 1;
+                               ret = reganode(REF, num);
+                               while (isDIGIT(*regparse))
+                                   regparse++;
                                *flagp |= SIMPLE;
+                           }
                        }
                        break;
                case '\0':
@@ -719,7 +796,7 @@ int *flagp;
                        register char ender;
                        register char *p;
                        char *oldp;
-                       int foo;
+                       int numlen;
 
                    defchar:
                        ret = regnode(EXACTLY);
@@ -766,16 +843,31 @@ int *flagp;
                                        ender = '\f';
                                        p++;
                                        break;
+                               case 'e':
+                                       ender = '\033';
+                                       p++;
+                                       break;
+                               case 'a':
+                                       ender = '\007';
+                                       p++;
+                                       break;
+                               case 'x':
+                                   ender = scanhex(++p, 2, &numlen);
+                                   p += numlen;
+                                   break;
+                               case 'c':
+                                   p++;
+                                   ender = *p++;
+                                   if (isLOWER(ender))
+                                       ender = toupper(ender);
+                                   ender ^= 64;
+                                   break;
                                case '0': case '1': case '2': case '3':case '4':
                                case '5': case '6': case '7': case '8':case '9':
-                                   if (isdigit(p[1]) || *p == '0') {
-                                       foo = *p - '0';
-                                       if (isdigit(p[1]))
-                                           foo = (foo<<3) + *++p - '0';
-                                       if (isdigit(p[1]))
-                                           foo = (foo<<3) + *++p - '0';
-                                       ender = foo;
-                                       p++;
+                                   if (*p == '0' ||
+                                     (isDIGIT(p[1]) && atoi(p) >= regnpar) ) {
+                                       ender = scanoct(p, 3, &numlen);
+                                       p += numlen;
                                    }
                                    else {
                                        --p;
@@ -795,7 +887,7 @@ int *flagp;
                                ender = *p++;
                                break;
                            }
-                           if (regfold && isupper(ender))
+                           if (regfold && isUPPER(ender))
                                    ender = tolower(ender);
                            if (ISMULT2(p)) { /* Back off on ?+*. */
                                if (len)
@@ -849,6 +941,7 @@ regclass()
        register int range = 0;
        register char *ret;
        register int def;
+       int numlen;
 
        ret = regnode(ANYOF);
        if (*regparse == '^') { /* Complement of range. */
@@ -906,17 +999,26 @@ regclass()
                        case 'b':
                                class = '\b';
                                break;
+                       case 'e':
+                               class = '\033';
+                               break;
+                       case 'a':
+                               class = '\007';
+                               break;
+                       case 'x':
+                               class = scanhex(regparse, 2, &numlen);
+                               regparse += numlen;
+                               break;
+                       case 'c':
+                               class = *regparse++;
+                               if (isLOWER(class))
+                                   class = toupper(class);
+                               class ^= 64;
+                               break;
                        case '0': case '1': case '2': case '3': case '4':
                        case '5': case '6': case '7': case '8': case '9':
-                               class -= '0';
-                               if (isdigit(*regparse)) {
-                                       class <<= 3;
-                                       class += *regparse++ - '0';
-                               }
-                               if (isdigit(*regparse)) {
-                                       class <<= 3;
-                                       class += *regparse++ - '0';
-                               }
+                               class = scanoct(--regparse, 3, &numlen);
+                               regparse += numlen;
                                break;
                        }
                }
@@ -936,7 +1038,7 @@ regclass()
                }
                for ( ; lastclass <= class; lastclass++) {
                        regset(bits,def,lastclass);
-                       if (regfold && isupper(lastclass))
+                       if (regfold && isUPPER(lastclass))
                                regset(bits,def,tolower(lastclass));
                }
                lastclass = class;
@@ -983,6 +1085,48 @@ char op;
 }
 
 /*
+ - reganode - emit a node with an argument
+ */
+static char *                  /* Location. */
+reganode(op, arg)
+char op;
+unsigned short arg;
+{
+       register char *ret;
+       register char *ptr;
+
+       ret = regcode;
+       if (ret == &regdummy) {
+#ifdef REGALIGN
+               if (!(regsize & 1))
+                       regsize++;
+#endif
+               regsize += 5;
+               return(ret);
+       }
+
+#ifdef REGALIGN
+#ifndef lint
+       if (!((long)ret & 1))
+           *ret++ = 127;
+#endif
+#endif
+       ptr = ret;
+       *ptr++ = op;
+       *ptr++ = '\0';          /* Null "next" pointer. */
+       *ptr++ = '\0';
+#ifdef REGALIGN
+       *(unsigned short *)(ret+3) = arg;
+#else
+       ret[3] = arg >> 8; ret[4] = arg & 0377;
+#endif
+       ptr += 2;
+       regcode = ptr;
+
+       return(ret);
+}
+
+/*
  - regc - emit (if appropriate) a byte of code
  */
 static void
@@ -1101,13 +1245,13 @@ register char *s;
 {
     if (*s++ != '{')
        return FALSE;
-    if (!isdigit(*s))
+    if (!isDIGIT(*s))
        return FALSE;
-    while (isdigit(*s))
+    while (isDIGIT(*s))
        s++;
     if (*s == ',')
        s++;
-    while (isdigit(*s))
+    while (isDIGIT(*s))
        s++;
     if (*s != '}')
        return FALSE;
@@ -1126,7 +1270,6 @@ regexp *r;
        register char *s;
        register char op = EXACTLY;     /* Arbitrary non-END op. */
        register char *next;
-       extern char *index();
 
 
        s = r->program + 1;
@@ -1137,9 +1280,8 @@ regexp *r;
 #endif
                op = OP(s);
                fprintf(stderr,"%2d%s", s-r->program, regprop(s));      /* Where, what. */
-               if (op == CURLY)
-                   s += 4;
                next = regnext(s);
+               s += regarglen[op];
                if (next == NULL)               /* Next ptr. */
                        fprintf(stderr,"(0)");
                else 
@@ -1165,13 +1307,16 @@ regexp *r;
                fprintf(stderr,"start `%s' ", r->regstart->str_ptr);
        if (r->regstclass)
                fprintf(stderr,"stclass `%s' ", regprop(r->regstclass));
-       if (r->reganch & 1)
+       if (r->reganch & ROPT_ANCH)
                fprintf(stderr,"anchored ");
-       if (r->reganch & 2)
+       if (r->reganch & ROPT_SKIP)
                fprintf(stderr,"plus ");
+       if (r->reganch & ROPT_IMPLICIT)
+               fprintf(stderr,"implicit ");
        if (r->regmust != NULL)
                fprintf(stderr,"must have \"%s\" back %d ", r->regmust->str_ptr,
                  r->regback);
+       fprintf(stderr, "minlen %d ", r->minlen);
        fprintf(stderr,"\n");
 }
 
@@ -1244,40 +1389,15 @@ char *op;
                p = NULL;
                break;
        case REF:
-       case REF+1:
-       case REF+2:
-       case REF+3:
-       case REF+4:
-       case REF+5:
-       case REF+6:
-       case REF+7:
-       case REF+8:
-       case REF+9:
-               (void)sprintf(buf+strlen(buf), "REF%d", OP(op)-REF);
+               (void)sprintf(buf+strlen(buf), "REF%d", ARG1(op));
                p = NULL;
                break;
-       case OPEN+1:
-       case OPEN+2:
-       case OPEN+3:
-       case OPEN+4:
-       case OPEN+5:
-       case OPEN+6:
-       case OPEN+7:
-       case OPEN+8:
-       case OPEN+9:
-               (void)sprintf(buf+strlen(buf), "OPEN%d", OP(op)-OPEN);
+       case OPEN:
+               (void)sprintf(buf+strlen(buf), "OPEN%d", ARG1(op));
                p = NULL;
                break;
-       case CLOSE+1:
-       case CLOSE+2:
-       case CLOSE+3:
-       case CLOSE+4:
-       case CLOSE+5:
-       case CLOSE+6:
-       case CLOSE+7:
-       case CLOSE+8:
-       case CLOSE+9:
-               (void)sprintf(buf+strlen(buf), "CLOSE%d", OP(op)-CLOSE);
+       case CLOSE:
+               (void)sprintf(buf+strlen(buf), "CLOSE%d", ARG1(op));
                p = NULL;
                break;
        case STAR:
@@ -1298,13 +1418,23 @@ char *op;
 regfree(r)
 struct regexp *r;
 {
-       if (r->precomp)
+       if (r->precomp) {
                Safefree(r->precomp);
-       if (r->subbase)
+               r->precomp = Nullch;
+       }
+       if (r->subbase) {
                Safefree(r->subbase);
-       if (r->regmust)
+               r->subbase = Nullch;
+       }
+       if (r->regmust) {
                str_free(r->regmust);
-       if (r->regstart)
+               r->regmust = Nullstr;
+       }
+       if (r->regstart) {
                str_free(r->regstart);
+               r->regstart = Nullstr;
+       }
+       Safefree(r->startp);
+       Safefree(r->endp);
        Safefree(r);
 }