This is a live mirror of the Perl 5 development currently hosted at https://github.com/perl/perl5
perl 4.0 patch 16: patch #11, continued
[perl5.git] / regcomp.c
index f11c602..0fd50c0 100644 (file)
--- a/regcomp.c
+++ b/regcomp.c
@@ -7,9 +7,20 @@
  * blame Henry for some of the lack of readability.
  */
 
-/* $RCSfile: regcomp.c,v $$Revision: 4.0.1.1 $$Date: 91/04/12 09:04:45 $
+/* $RCSfile: regcomp.c,v $$Revision: 4.0.1.3 $$Date: 91/11/05 18:22:28 $
  *
  * $Log:       regcomp.c,v $
+ * Revision 4.0.1.3  91/11/05  18:22:28  lwall
+ * patch11: minimum match length calculation in regexp is now cumulative
+ * patch11: initial .* in pattern had dependency on value of $*
+ * patch11: certain patterns made use of garbage pointers from uncleared memory
+ * patch11: prepared for ctype implementations that don't define isascii()
+ * 
+ * Revision 4.0.1.2  91/06/07  11:48:24  lwall
+ * patch4: new copyright notice
+ * patch4: /(x+) \1/ incorrectly optimized to not match "xxx xx"
+ * patch4: // wouldn't use previous pattern if it started with a null character
+ * 
  * Revision 4.0.1.1  91/04/12  09:04:45  lwall
  * patch1: random cleanup in cpp namespace
  * 
@@ -17,7 +28,7 @@
  * 4.0 baseline.
  * 
  */
-
+/*SUPPRESS 112*/
 /*
  * regcomp and regexec -- regsub and regerror are not used in perl
  *
  *
  ****    Alterations to Henry's code are...
  ****
- ****    Copyright (c) 1989, Larry Wall
+ ****    Copyright (c) 1991, Larry Wall
  ****
- ****    You may distribute under the terms of the GNU General Public License
- ****    as specified in the README file that comes with the perl 3.0 kit.
+ ****    You may distribute under the terms of either the GNU General Public
+ ****    License or the Artistic License, as specified in the README file.
+
  *
  * Beware that some of this code is subtly aware of the way operator
  * precedence is structured in regular expressions.  Serious changes in
@@ -95,6 +107,7 @@ static char *regcode;                /* Code-emit pointer; &regdummy = don't. */
 static long regsize;           /* Code size. */
 static int regfold;
 static int regsawbracket;      /* Did we do {d,d} trick? */
 static long regsize;           /* Code size. */
 static int regfold;
 static int regsawbracket;      /* Did we do {d,d} trick? */
+static int regsawback;         /* Did we see \1, ...? */
 
 /*
  * Forward declarations for regcomp()'s friends.
 
 /*
  * Forward declarations for regcomp()'s friends.
@@ -143,9 +156,11 @@ int fold;
        int backish;
        int backest;
        int curback;
        int backish;
        int backest;
        int curback;
+       int minlen;
        extern char *safemalloc();
        extern char *savestr();
        int sawplus = 0;
        extern char *safemalloc();
        extern char *savestr();
        int sawplus = 0;
+       int sawopen = 0;
 
        if (exp == NULL)
                fatal("NULL regexp argument");
 
        if (exp == NULL)
                fatal("NULL regexp argument");
@@ -156,10 +171,11 @@ int fold;
        regxend = xend;
        regprecomp = nsavestr(exp,xend-exp);
        regsawbracket = 0;
        regxend = xend;
        regprecomp = nsavestr(exp,xend-exp);
        regsawbracket = 0;
+       regsawback = 0;
        regnpar = 1;
        regsize = 0L;
        regcode = &regdummy;
        regnpar = 1;
        regsize = 0L;
        regcode = &regdummy;
-       regc(MAGIC);
+       regc((char)MAGIC);
        if (reg(0, &flags) == NULL) {
                Safefree(regprecomp);
                regprecomp = Nullch;
        if (reg(0, &flags) == NULL) {
                Safefree(regprecomp);
                regprecomp = Nullch;
@@ -178,12 +194,13 @@ int fold;
        /* Second pass: emit code. */
        if (regsawbracket)
            bcopy(regprecomp,exp,xend-exp);
        /* Second pass: emit code. */
        if (regsawbracket)
            bcopy(regprecomp,exp,xend-exp);
+       r->prelen = xend-exp;
        r->precomp = regprecomp;
        r->precomp = regprecomp;
-       r->subbase = NULL;
+       r->subbeg = r->subbase = NULL;
        regparse = exp;
        regnpar = 1;
        regcode = r->program;
        regparse = exp;
        regnpar = 1;
        regcode = r->program;
-       regc(MAGIC);
+       regc((char)MAGIC);
        if (reg(0, &flags) == NULL)
                return(NULL);
 
        if (reg(0, &flags) == NULL)
                return(NULL);
 
@@ -198,18 +215,19 @@ int fold;
                scan = NEXTOPER(scan);
 
                first = scan;
                scan = NEXTOPER(scan);
 
                first = scan;
-               while (OP(first) == OPEN ||
+               while ((OP(first) == OPEN && (sawopen = 1)) ||
                    (OP(first) == BRANCH && OP(regnext(first)) != BRANCH) ||
                    (OP(first) == PLUS) ||
                    (OP(first) == CURLY && ARG1(first) > 0) ) {
                        if (OP(first) == PLUS)
                    (OP(first) == BRANCH && OP(regnext(first)) != BRANCH) ||
                    (OP(first) == PLUS) ||
                    (OP(first) == CURLY && ARG1(first) > 0) ) {
                        if (OP(first) == PLUS)
-                           sawplus = 2;
+                           sawplus = 1;
                        else
                            first += regarglen[OP(first)];
                        first = NEXTOPER(first);
                }
 
                /* Starting-point info. */
                        else
                            first += regarglen[OP(first)];
                        first = NEXTOPER(first);
                }
 
                /* Starting-point info. */
+           again:
                if (OP(first) == EXACTLY) {
                        r->regstart =
                            str_make(OPERAND(first)+1,*OPERAND(first));
                if (OP(first) == EXACTLY) {
                        r->regstart =
                            str_make(OPERAND(first)+1,*OPERAND(first));
@@ -221,9 +239,14 @@ int fold;
                else if (OP(first) == BOUND || OP(first) == NBOUND)
                        r->regstclass = first;
                else if (OP(first) == BOL ||
                else if (OP(first) == BOUND || OP(first) == NBOUND)
                        r->regstclass = first;
                else if (OP(first) == BOL ||
-                   (OP(first) == STAR && OP(NEXTOPER(first)) == ANY) )
-                       r->reganch = 1;         /* kinda turn .* into ^.* */
-               r->reganch |= sawplus;
+                   (OP(first) == STAR && OP(NEXTOPER(first)) == ANY) ) {
+                       /* kinda turn .* into ^.* */
+                       r->reganch = ROPT_ANCH | ROPT_IMPLICIT;
+                       first = NEXTOPER(first);
+                       goto again;
+               }
+               if (sawplus && (!sawopen || !regsawback))
+                   r->reganch |= ROPT_SKIP;    /* x+ must match 1st of run */
 
 #ifdef DEBUGGING
                if (debug & 512)
 
 #ifdef DEBUGGING
                if (debug & 512)
@@ -244,6 +267,7 @@ int fold;
                longish = str_make("",0);
                longest = str_make("",0);
                len = 0;
                longish = str_make("",0);
                longest = str_make("",0);
                len = 0;
+               minlen = 0;
                curback = 0;
                backish = 0;
                backest = 0;
                curback = 0;
                backish = 0;
                backest = 0;
@@ -263,6 +287,7 @@ int fold;
                            first = scan;
                            while (OP(t = regnext(scan)) == CLOSE)
                                scan = t;
                            first = scan;
                            while (OP(t = regnext(scan)) == CLOSE)
                                scan = t;
+                           minlen += *OPERAND(first);
                            if (curback - backish == len) {
                                str_ncat(longish, OPERAND(first)+1,
                                    *OPERAND(first));
                            if (curback - backish == len) {
                                str_ncat(longish, OPERAND(first)+1,
                                    *OPERAND(first));
@@ -288,9 +313,16 @@ int fold;
                                backest = backish;
                            }
                            str_nset(longish,"",0);
                                backest = backish;
                            }
                            str_nset(longish,"",0);
+                           if (OP(scan) == PLUS &&
+                             index(simple,OP(NEXTOPER(scan))))
+                               minlen++;
+                           else if (OP(scan) == CURLY &&
+                             index(simple,OP(NEXTOPER(scan)+4)))
+                               minlen += ARG1(scan);
                        }
                        else if (index(simple,OP(scan))) {
                            curback++;
                        }
                        else if (index(simple,OP(scan))) {
                            curback++;
+                           minlen++;
                            len = 0;
                            if (longish->str_cur > longest->str_cur) {
                                str_sset(longest,longish);
                            len = 0;
                            if (longish->str_cur > longest->str_cur) {
                                str_sset(longest,longish);
@@ -313,8 +345,9 @@ int fold;
                    &&
                    (!r->regstart
                     ||
                    &&
                    (!r->regstart
                     ||
-                    !fbminstr(r->regstart->str_ptr,
-                         r->regstart->str_ptr + r->regstart->str_cur,
+                    !fbminstr((unsigned char*) r->regstart->str_ptr,
+                         (unsigned char *) r->regstart->str_ptr
+                           + r->regstart->str_cur,
                          longest)
                    )
                   )
                          longest)
                    )
                   )
@@ -339,8 +372,9 @@ int fold;
 
        r->do_folding = fold;
        r->nparens = regnpar - 1;
 
        r->do_folding = fold;
        r->nparens = regnpar - 1;
-       New(1002, r->startp, regnpar, char*);
-       New(1002, r->endp, regnpar, char*);
+       r->minlen = minlen;
+       Newz(1002, r->startp, regnpar, char*);
+       Newz(1002, r->endp, regnpar, char*);
 #ifdef DEBUGGING
        if (debug & 512)
                regdump(r);
 #ifdef DEBUGGING
        if (debug & 512)
                regdump(r);
@@ -500,7 +534,7 @@ int *flagp;
        if (op == '{' && regcurly(regparse)) {
            next = regparse + 1;
            max = Nullch;
        if (op == '{' && regcurly(regparse)) {
            next = regparse + 1;
            max = Nullch;
-           while (isdigit(*next) || *next == ',') {
+           while (isDIGIT(*next) || *next == ',') {
                if (*next == ',') {
                    if (max)
                        break;
                if (*next == ',') {
                    if (max)
                        break;
@@ -741,8 +775,9 @@ int *flagp;
                            if (num > 9 && num >= regnpar)
                                goto defchar;
                            else {
                            if (num > 9 && num >= regnpar)
                                goto defchar;
                            else {
+                               regsawback = 1;
                                ret = reganode(REF, num);
                                ret = reganode(REF, num);
-                               while (isascii(*regparse) && isdigit(*regparse))
+                               while (isDIGIT(*regparse))
                                    regparse++;
                                *flagp |= SIMPLE;
                            }
                                    regparse++;
                                *flagp |= SIMPLE;
                            }
@@ -823,14 +858,14 @@ int *flagp;
                                case 'c':
                                    p++;
                                    ender = *p++;
                                case 'c':
                                    p++;
                                    ender = *p++;
-                                   if (islower(ender))
+                                   if (isLOWER(ender))
                                        ender = toupper(ender);
                                    ender ^= 64;
                                    break;
                                case '0': case '1': case '2': case '3':case '4':
                                case '5': case '6': case '7': case '8':case '9':
                                    if (*p == '0' ||
                                        ender = toupper(ender);
                                    ender ^= 64;
                                    break;
                                case '0': case '1': case '2': case '3':case '4':
                                case '5': case '6': case '7': case '8':case '9':
                                    if (*p == '0' ||
-                                     (isdigit(p[1]) && atoi(p) >= regnpar) ) {
+                                     (isDIGIT(p[1]) && atoi(p) >= regnpar) ) {
                                        ender = scanoct(p, 3, &numlen);
                                        p += numlen;
                                    }
                                        ender = scanoct(p, 3, &numlen);
                                        p += numlen;
                                    }
@@ -852,7 +887,7 @@ int *flagp;
                                ender = *p++;
                                break;
                            }
                                ender = *p++;
                                break;
                            }
-                           if (regfold && isupper(ender))
+                           if (regfold && isUPPER(ender))
                                    ender = tolower(ender);
                            if (ISMULT2(p)) { /* Back off on ?+*. */
                                if (len)
                                    ender = tolower(ender);
                            if (ISMULT2(p)) { /* Back off on ?+*. */
                                if (len)
@@ -976,7 +1011,7 @@ regclass()
                                break;
                        case 'c':
                                class = *regparse++;
                                break;
                        case 'c':
                                class = *regparse++;
-                               if (islower(class))
+                               if (isLOWER(class))
                                    class = toupper(class);
                                class ^= 64;
                                break;
                                    class = toupper(class);
                                class ^= 64;
                                break;
@@ -1003,7 +1038,7 @@ regclass()
                }
                for ( ; lastclass <= class; lastclass++) {
                        regset(bits,def,lastclass);
                }
                for ( ; lastclass <= class; lastclass++) {
                        regset(bits,def,lastclass);
-                       if (regfold && isupper(lastclass))
+                       if (regfold && isUPPER(lastclass))
                                regset(bits,def,tolower(lastclass));
                }
                lastclass = class;
                                regset(bits,def,tolower(lastclass));
                }
                lastclass = class;
@@ -1210,13 +1245,13 @@ register char *s;
 {
     if (*s++ != '{')
        return FALSE;
 {
     if (*s++ != '{')
        return FALSE;
-    if (!isdigit(*s))
+    if (!isDIGIT(*s))
        return FALSE;
        return FALSE;
-    while (isdigit(*s))
+    while (isDIGIT(*s))
        s++;
     if (*s == ',')
        s++;
        s++;
     if (*s == ',')
        s++;
-    while (isdigit(*s))
+    while (isDIGIT(*s))
        s++;
     if (*s != '}')
        return FALSE;
        s++;
     if (*s != '}')
        return FALSE;
@@ -1272,13 +1307,16 @@ regexp *r;
                fprintf(stderr,"start `%s' ", r->regstart->str_ptr);
        if (r->regstclass)
                fprintf(stderr,"stclass `%s' ", regprop(r->regstclass));
                fprintf(stderr,"start `%s' ", r->regstart->str_ptr);
        if (r->regstclass)
                fprintf(stderr,"stclass `%s' ", regprop(r->regstclass));
-       if (r->reganch & 1)
+       if (r->reganch & ROPT_ANCH)
                fprintf(stderr,"anchored ");
                fprintf(stderr,"anchored ");
-       if (r->reganch & 2)
+       if (r->reganch & ROPT_SKIP)
                fprintf(stderr,"plus ");
                fprintf(stderr,"plus ");
+       if (r->reganch & ROPT_IMPLICIT)
+               fprintf(stderr,"implicit ");
        if (r->regmust != NULL)
                fprintf(stderr,"must have \"%s\" back %d ", r->regmust->str_ptr,
                  r->regback);
        if (r->regmust != NULL)
                fprintf(stderr,"must have \"%s\" back %d ", r->regmust->str_ptr,
                  r->regback);
+       fprintf(stderr, "minlen %d ", r->minlen);
        fprintf(stderr,"\n");
 }
 
        fprintf(stderr,"\n");
 }