[perl5.git] / regexec.c

/* NOTE: this is derived from Henry Spencer's regexp code, and should not
 * be confused with the original package (see point 3 below).  Thanks, Henry!
 */

/* Additional note: this code is very heavily munged from Henry's version
 * in places.  In some spots I've traded clarity for efficiency, so don't
 * blame Henry for some of the lack of readability.
 */

/* $Header: regexec.c,v 3.0 89/10/18 15:22:53 lwall Locked $
 *
 * $Log:	regexec.c,v $
 * Revision 3.0  89/10/18  15:22:53  lwall
 * 3.0 baseline
 * 
 */

/*
 * regcomp and regexec -- regsub and regerror are not used in perl
 *
 *	Copyright (c) 1986 by University of Toronto.
 *	Written by Henry Spencer.  Not derived from licensed software.
 *
 *	Permission is granted to anyone to use this software for any
 *	purpose on any computer system, and to redistribute it freely,
 *	subject to the following restrictions:
 *
 *	1. The author is not responsible for the consequences of use of
 *		this software, no matter how awful, even if they arise
 *		from defects in it.
 *
 *	2. The origin of this software must not be misrepresented, either
 *		by explicit claim or by omission.
 *
 *	3. Altered versions must be plainly marked as such, and must not
 *		be misrepresented as being the original software.
 *
 ****    Alterations to Henry's code are...
 ****
 ****    Copyright (c) 1989, Larry Wall
 ****
 ****    You may distribute under the terms of the GNU General Public License
 ****    as specified in the README file that comes with the perl 3.0 kit.
 *
 * Beware that some of this code is subtly aware of the way operator
 * precedence is structured in regular expressions.  Serious changes in
 * regular-expression syntax might require a total rethink.
 */
#include "EXTERN.h"
#include "perl.h"
#include "regcomp.h"

#ifndef STATIC
#define	STATIC	static
#endif

#ifdef DEBUGGING
int regnarrate = 0;
#endif

/*
 * regexec and friends
 */

/*
 * Global work variables for regexec().
 */
static char *regprecomp;	/* prog->precomp of the regexp being run; set by regexec(). */
static char *reginput;		/* String-input pointer. */
static char *regbol;		/* Beginning of input, for ^ check. */
static char *regeol;		/* End of input, for $ check. */
static char **regstartp;	/* Pointer to startp array. */
static char **regendp;		/* Ditto for endp. */
static char *reglastparen;	/* Similarly for lastparen. */
static char *regtill;		/* regtry() accepts a match only if it ends at or past this. */

static char *regmystartp[10];	/* For remembering backreferences. */
static char *regmyendp[10];	/* End positions paired with regmystartp[]. */

/*
 * Forwards.
 */
STATIC int regtry();		/* one match attempt at a given start position */
STATIC int regmatch();		/* match the node list starting at a given node */
STATIC int regrepeat();		/* count repeated matches of a one-character node */

/* When nonzero, regexec() retries an anchored pattern after each newline. */
extern int multiline;

/*
 - regexec - match a regexp against a string
 *
 * Searches the text stringarg..strend for a match of the compiled
 * program in prog.  Returns 1 if a match was found, 0 otherwise.
 *
 * On success prog->startp[]/prog->endp[] describe the match (slot 0 is
 * the whole match).  If the captures may be needed later (!safebase and
 * the pattern has parens or sawampersand is set), or if case folding was
 * done, the text starting at strbeg is saved in prog->subbase via
 * nsavestr() and the startp/endp pointers are rebased onto that copy;
 * otherwise they point into the caller's string.
 *
 * When prog->do_folding is set the search runs on a lower-cased private
 * copy of the string, which is freed before returning.
 *
 * If screamer is non-null and the search starts at strbeg, the
 * "must appear" string is located with screaminstr() rather than
 * fbminstr().
 */
int
regexec(prog, stringarg, strend, strbeg, minend, screamer, safebase)
register regexp *prog;
char *stringarg;
register char *strend;	/* pointer to null at end of string */
char *strbeg;	/* real beginning of string */
int minend;	/* end of match must be at least minend after stringarg */
STR *screamer;
int safebase;	/* no need to remember string in subbase */
{
	register char *s;
	register int i;
	register char *c;
	register char *string = stringarg;
	register int tmp;
	int minlen = 0;		/* must match at least this many chars */
	int dontbother = 0;	/* how many characters not to try at end */
	int beginning = (string == strbeg);	/* is ^ valid at stringarg? */

	/* Be paranoid... */
	if (prog == NULL || string == NULL) {
		fatal("NULL regexp parameter");
		return(0);
	}

	regprecomp = prog->precomp;
	/* Check validity of program. */
	if (UCHARAT(prog->program) != MAGIC) {
		FAIL("corrupted regexp program");
	}

	/*
	 * Case-insensitive match: work on a lower-cased copy.  The copy
	 * includes the byte at strend (i+1 bytes are copied).
	 */
	if (prog->do_folding) {
		safebase = FALSE;
		i = strend - string;
		New(1101,c,i+1,char);
		(void)bcopy(string, c, i+1);
		string = c;
		strend = string + i;
		for (s = string; s < strend; s++)
			if (isupper(*s))
				*s = tolower(*s);
	}

	/* If there is a "must appear" string, look for it. */
	s = string;
	if (prog->regmust != Nullstr) {
		if (beginning && screamer) {
			if (screamfirst[prog->regmust->str_rare] >= 0)
				s = screaminstr(screamer,prog->regmust);
			else
				s = Nullch;
		}
#ifndef lint
		else
			s = fbminstr((unsigned char*)s, (unsigned char*)strend,
			    prog->regmust);
#endif
		if (!s) {
			++prog->regmust->str_u.str_useful;	/* hooray */
			goto phooey;	/* not present */
		}
		else if (prog->regback >= 0) {
			/* Back up from where regmust was found, but not
			 * before the start of the text. */
			s -= prog->regback;
			if (s < string)
				s = string;
			minlen = prog->regback + prog->regmust->str_cur;
		}
		else if (--prog->regmust->str_u.str_useful < 0) { /* boo */
			str_free(prog->regmust);
			prog->regmust = Nullstr;	/* disable regmust */
			s = string;
		}
		else {
			s = string;
			minlen = prog->regmust->str_cur;
		}
	}

	/* Mark beginning of line for ^ . */
	if (beginning)
		regbol = string;
	else
		regbol = NULL;

	/* Mark end of line for $ (and such) */
	regeol = strend;

	/* see how far we have to get to not match where we matched before */
	regtill = string+minend;

	/* Simplest case:  anchored match need be tried only once. */
	/*  [unless multiline is set] */
	if (prog->reganch) {
		if (regtry(prog, string))
			goto got_it;
		else if (multiline) {
			if (minlen)
				dontbother = minlen - 1;
			strend -= dontbother;
			/* for multiline we only have to try after newlines */
			if (s > string)
				s--;
			for (; s < strend; s++) {
				if (*s == '\n') {
					if (++s < strend && regtry(prog, s))
						goto got_it;
				}
			}
		}
		goto phooey;
	}

	/* Messy cases:  unanchored match. */
	if (prog->regstart) {
		/* We know what string it must start with. */
		if (prog->regstart->str_pok == 3) {
#ifndef lint
			while ((s = fbminstr((unsigned char*)s,
			  (unsigned char*)strend, prog->regstart)) != NULL)
#else
			while (s = Nullch)
#endif
			{
				if (regtry(prog, s))
					goto got_it;
				s++;
			}
		}
		else {
			c = prog->regstart->str_ptr;
			while ((s = ninstr(s, strend,
			  c, c + prog->regstart->str_cur )) != NULL) {
				if (regtry(prog, s))
					goto got_it;
				s++;
			}
		}
		goto phooey;
	}
	if ((c = prog->regstclass) != NULL) {
		if (minlen)
			dontbother = minlen - 1;
		strend -= dontbother;	/* don't bother with what can't match */
		/* We know what class it must start with. */
		switch (OP(c)) {
		case ANYOF: case ANYBUT:
			c = OPERAND(c);
			while (s < strend) {
				/*
				 * Index the class bitmap with the unsigned
				 * byte value: a plain (possibly signed) char
				 * would yield a negative index for high-bit
				 * characters.  A set bit means "no match".
				 */
				i = UCHARAT(s);
				if (!(c[i >> 3] & (1 << (i&7))))
					if (regtry(prog, s))
						goto got_it;
				s++;
			}
			break;
		case BOUND:
			if (minlen)
				dontbother++,strend--;
			/* tmp: was the previous character a word character? */
			if (s != string) {
				i = s[-1];
				tmp = (isalpha(i) || isdigit(i) || i == '_');
			}
			else
				tmp = 0;	/* assume not alphanumeric */
			while (s < strend) {
				i = *s;
				if (tmp != (isalpha(i) || isdigit(i) || i == '_')) {
					tmp = !tmp;
					if (regtry(prog, s))
						goto got_it;
				}
				s++;
			}
			/* A word running up to the end leaves a boundary there. */
			if (tmp && regtry(prog,s))
				goto got_it;
			break;
		case NBOUND:
			if (minlen)
				dontbother++,strend--;
			if (s != string) {
				i = s[-1];
				tmp = (isalpha(i) || isdigit(i) || i == '_');
			}
			else
				tmp = 0;	/* assume not alphanumeric */
			while (s < strend) {
				i = *s;
				if (tmp != (isalpha(i) || isdigit(i) || i == '_'))
					tmp = !tmp;
				else if (regtry(prog, s))
					goto got_it;
				s++;
			}
			if (!tmp && regtry(prog,s))
				goto got_it;
			break;
		case ALNUM:
			while (s < strend) {
				i = *s;
				if (isalpha(i) || isdigit(i) || i == '_')
					if (regtry(prog, s))
						goto got_it;
				s++;
			}
			break;
		case NALNUM:
			while (s < strend) {
				i = *s;
				if (!isalpha(i) && !isdigit(i) && i != '_')
					if (regtry(prog, s))
						goto got_it;
				s++;
			}
			break;
		case SPACE:
			while (s < strend) {
				if (isspace(*s))
					if (regtry(prog, s))
						goto got_it;
				s++;
			}
			break;
		case NSPACE:
			while (s < strend) {
				if (!isspace(*s))
					if (regtry(prog, s))
						goto got_it;
				s++;
			}
			break;
		case DIGIT:
			while (s < strend) {
				if (isdigit(*s))
					if (regtry(prog, s))
						goto got_it;
				s++;
			}
			break;
		case NDIGIT:
			while (s < strend) {
				if (!isdigit(*s))
					if (regtry(prog, s))
						goto got_it;
				s++;
			}
			break;
		}
	}
	else {
		/*
		 * A match needs at least minlen characters, so the last
		 * minlen-1 start positions cannot succeed.  (minend is a
		 * constraint on where the match ends, enforced through
		 * regtill; it says nothing about which start positions
		 * are worth trying.)
		 */
		if (minlen)
			dontbother = minlen - 1;
		strend -= dontbother;
		/* We don't know much -- general case. */
		do {
			if (regtry(prog, s))
				goto got_it;
		} while (s++ < strend);
	}

	/* Failure. */
	goto phooey;

    got_it:
	if ((!safebase && (prog->nparens || sawampersand)) || prog->do_folding){
		strend += dontbother;	/* uncheat */
		if (safebase)			/* no need for $digit later */
			s = strbeg;
		else if (strbeg != prog->subbase) {
			i = strend - string + (stringarg - strbeg);
			s = nsavestr(strbeg,i);	/* so $digit will work later */
			if (prog->subbase)
				Safefree(prog->subbase);
			prog->subbase = s;
		}
		else
			s = prog->subbase;
		/* Rebase the match pointers from the searched text onto s. */
		s += (stringarg - strbeg);
		for (i = 0; i <= prog->nparens; i++) {
			if (prog->endp[i]) {
				prog->startp[i] = s + (prog->startp[i] - string);
				prog->endp[i] = s + (prog->endp[i] - string);
			}
		}
		if (prog->do_folding)
			Safefree(string);
	}
	return(1);

    phooey:
	if (prog->do_folding)
		Safefree(string);
	return(0);
}

/*
 - regtry - try match at specific point
 *
 * Resets the per-attempt globals, clears the capture slots when the
 * pattern has parentheses, and runs regmatch() from the first node.
 * The attempt counts as a success only if the match also ends at or
 * beyond regtill; startp[0]/endp[0] are filled in only then.
 */
static int			/* 0 failure, 1 success */
regtry(prog, string)
regexp *prog;
char *string;
{
	register int slot;
	register char **startslot;
	register char **endslot;

	reginput = string;
	regstartp = prog->startp;
	regendp = prog->endp;
	reglastparen = &prog->lastparen;
	prog->lastparen = 0;

	startslot = prog->startp;
	endslot = prog->endp;
	if (prog->nparens) {
		/* Forget every capture left over from an earlier attempt. */
		for (slot = 0; slot < NSUBEXP; slot++) {
			startslot[slot] = NULL;
			endslot[slot] = NULL;
		}
	}

	if (!regmatch(prog->program + 1))
		return(0);
	if (reginput < regtill)		/* matched, but stopped too early */
		return(0);

	prog->startp[0] = string;
	prog->endp[0] = reginput;
	return(1);
}

/*
 - regmatch - main matching routine
 *
 * Conceptually the strategy is simple:  check to see whether the current
 * node matches, call self recursively to see whether the rest matches,
 * and then act accordingly.  In practice we make some effort to avoid
 * recursion, in particular by going through "ordinary" nodes (that don't
 * need to know whether the rest of the match failed) by a loop instead of
 * by recursion.
 *
 * prog is the node to start from; matching begins at the global
 * reginput.  On success reginput is left where the match ended (set in
 * "case END").  Returns 1 on success, 0 on failure.
 */
/* [lwall] I've hoisted the register declarations to the outer block in order to
 * maybe save a little bit of pushing and popping on the stack.  It also takes
 * advantage of machines that use a register save mask on subroutine entry.
 */
static int			/* 0 failure, 1 success */
regmatch(prog)
char *prog;
{
	register char *scan;	/* Current node. */
	char *next;		/* Next node. */
	register int nextchar;
	register int n;		/* no or next */
	register int ln;        /* len or last */
	register char *s;	/* operand or save */
	register char *locinput = reginput;

	nextchar = *locinput;
	scan = prog;
#ifdef DEBUGGING
	if (scan != NULL && regnarrate)
		fprintf(stderr, "%s(\n", regprop(scan));
#endif
	while (scan != NULL) {
#ifdef DEBUGGING
		if (regnarrate)
			fprintf(stderr, "%s...\n", regprop(scan));
#endif

#ifdef REGALIGN
		next = scan + NEXT(scan);
		if (next == scan)
		    next = NULL;
#else
		next = regnext(scan);
#endif

		switch (OP(scan)) {
		case BOL:
			/*
			 * NOTE(review): when regbol is NULL and locinput is
			 * the first byte of the text being searched,
			 * locinput[-1] reads the byte before it -- confirm
			 * that byte is always valid for every caller
			 * (including the folded copy made in regexec).
			 */
			if (locinput == regbol ||
			    ((nextchar || locinput < regeol) &&
			      locinput[-1] == '\n') )
			{
				regtill--;
				break;
			}
			return(0);
		case EOL:
			if ((nextchar || locinput < regeol) && nextchar != '\n')
				return(0);
			regtill--;
			break;
		case ANY:
			if ((nextchar == '\0' && locinput >= regeol) ||
			  nextchar == '\n')
				return(0);
			nextchar = *++locinput;
			break;
		case EXACTLY:
			s = OPERAND(scan);
			ln = *s++;	/* first operand byte is the length */
			/* Inline the first character, for speed. */
			if (*s != nextchar)
				return(0);
			if (locinput + ln > regeol)
				return 0;
			if (ln > 1 && bcmp(s, locinput, ln) != 0)
				return(0);
			locinput += ln;
			nextchar = *locinput;
			break;
		case ANYOF:
		case ANYBUT:
			s = OPERAND(scan);
			/* The bitmap index must be non-negative. */
			if (nextchar < 0)
				nextchar = UCHARAT(locinput);
			/* A set bit means the character is rejected. */
			if (s[nextchar >> 3] & (1 << (nextchar&7)))
				return(0);
			nextchar = *++locinput;
			if (!nextchar && locinput > regeol)
				return 0;
			break;
		case ALNUM:
			if (!nextchar)
				return(0);
			if (!isalpha(nextchar) && !isdigit(nextchar) &&
			  nextchar != '_')
				return(0);
			nextchar = *++locinput;
			break;
		case NALNUM:
			if (!nextchar && locinput >= regeol)
				return(0);
			if (isalpha(nextchar) || isdigit(nextchar) ||
			  nextchar == '_')
				return(0);
			nextchar = *++locinput;
			break;
		case NBOUND:
		case BOUND:
			if (locinput == regbol)	/* was last char in word? */
				ln = 0;
			else
				ln = (isalpha(locinput[-1]) ||
				     isdigit(locinput[-1]) ||
				     locinput[-1] == '_' );
			n = (isalpha(nextchar) || isdigit(nextchar) ||
			    nextchar == '_' );	/* is next char in word? */
			/* BOUND fails when both sides agree, NBOUND when they differ. */
			if ((ln == n) == (OP(scan) == BOUND))
				return(0);
			break;
		case SPACE:
			if (!nextchar && locinput >= regeol)
				return(0);
			if (!isspace(nextchar))
				return(0);
			nextchar = *++locinput;
			break;
		case NSPACE:
			if (!nextchar)
				return(0);
			if (isspace(nextchar))
				return(0);
			nextchar = *++locinput;
			break;
		case DIGIT:
			if (!isdigit(nextchar))
				return(0);
			nextchar = *++locinput;
			break;
		case NDIGIT:
			if (!nextchar && locinput >= regeol)
				return(0);
			if (isdigit(nextchar))
				return(0);
			nextchar = *++locinput;
			break;
		case REF:
		case REF+1:
		case REF+2:
		case REF+3:
		case REF+4:
		case REF+5:
		case REF+6:
		case REF+7:
		case REF+8:
		case REF+9:
			/* Backreference: match the text last recorded for group n. */
			n = OP(scan) - REF;
			s = regmystartp[n];
			if (!s)
			    return(0);
			if (!regmyendp[n])
			    return(0);
			if (s == regmyendp[n])	/* empty group matches here */
			    break;
			/* Inline the first character, for speed. */
			if (*s != nextchar)
				return(0);
			ln = regmyendp[n] - s;
			if (locinput + ln > regeol)
				return 0;
			if (ln > 1 && bcmp(s, locinput, ln) != 0)
				return(0);
			locinput += ln;
			nextchar = *locinput;
			break;

		case NOTHING:
			break;
		case BACK:
			break;
		case OPEN+1:
		case OPEN+2:
		case OPEN+3:
		case OPEN+4:
		case OPEN+5:
		case OPEN+6:
		case OPEN+7:
		case OPEN+8:
		case OPEN+9:
			n = OP(scan) - OPEN;
			reginput = locinput;

			regmystartp[n] = locinput;	/* for REF */
			if (regmatch(next)) {
				/*
				 * Don't set startp if some later
				 * invocation of the same parentheses
				 * already has.
				 */
				if (regstartp[n] == NULL)
					regstartp[n] = locinput;
				return(1);
			} else
				return(0);
			/* NOTREACHED */
		case CLOSE+1:
		case CLOSE+2:
		case CLOSE+3:
		case CLOSE+4:
		case CLOSE+5:
		case CLOSE+6:
		case CLOSE+7:
		case CLOSE+8:
		case CLOSE+9: {
				n = OP(scan) - CLOSE;
				reginput = locinput;

				regmyendp[n] = locinput;	/* for REF */
				if (regmatch(next)) {
					/*
					 * Don't set endp if some later
					 * invocation of the same parentheses
					 * already has.
					 */
					if (regendp[n] == NULL) {
						regendp[n] = locinput;
						if (n > *reglastparen)
						    *reglastparen = n;
					}
					return(1);
				} else
					return(0);
			}
			/*NOTREACHED*/
		case BRANCH: {
				if (OP(next) != BRANCH)		/* No choice. */
					next = NEXTOPER(scan);	/* Avoid recursion. */
				else {
					/* Try each alternative in turn from the same input position. */
					do {
						reginput = locinput;
						if (regmatch(NEXTOPER(scan)))
							return(1);
#ifdef REGALIGN
						if (n = NEXT(scan))
						    scan += n;
						else
						    scan = NULL;
#else
						scan = regnext(scan);
#endif
					} while (scan != NULL && OP(scan) == BRANCH);
					return(0);
					/* NOTREACHED */
				}
			}
			break;
		case STAR:
		case PLUS:
			/*
			 * Lookahead to avoid useless match attempts
			 * when we know what character comes next.
			 * (-1000 serves as "no known next character".)
			 */
			if (OP(next) == EXACTLY)
				nextchar = *(OPERAND(next)+1);
			else
				nextchar = -1000;
			ln = (OP(scan) == STAR) ? 0 : 1;	/* minimum repeat count */
			reginput = locinput;
			n = regrepeat(NEXTOPER(scan));
			/* Start with the longest run and give back one char at a time. */
			while (n >= ln) {
				/* If it could work, try it. */
				if (nextchar == -1000 || *reginput == nextchar)
					if (regmatch(next))
						return(1);
				/* Couldn't or didn't -- back up. */
				n--;
				reginput = locinput + n;
			}
			return(0);
		case END:
			reginput = locinput; /* put where regtry can find it */
			return(1);	/* Success! */
		default:
			/*
			 * Print the node address with a specifier that
			 * matches the argument type; passing a pointer
			 * for %x is a format/argument mismatch.
			 */
			printf("%lx %d\n",(unsigned long)scan,scan[1]);
			FAIL("regexp memory corruption");
		}

		scan = next;
	}

	/*
	 * We get here only if there's trouble -- normally "case END" is
	 * the terminating point.
	 */
	FAIL("corrupted regexp pointers");
	/*NOTREACHED*/
#ifdef lint
	return 0;
#endif
}

/*
 - regrepeat - repeatedly match something simple, report how many
 *
 * p is a node that matches exactly one character.  Starting at the
 * global reginput, consumes as many consecutive matching characters as
 * possible without going past regeol, leaves reginput just after them,
 * and returns the count.
 */
/*
 * [This routine now assumes that it will only match on things of length 1.
 * That was true before, but now we assume scan - reginput is the count,
 * rather than incrementing count on every character.]
 */
static int
regrepeat(p)
char *p;
{
	register char *scan;
	register char *opnd;
	register int c;
	register char *loceol = regeol;

	scan = reginput;
	opnd = OPERAND(p);
	switch (OP(p)) {
	case ANY:
		while (scan < loceol && *scan != '\n')
			scan++;
		break;
	case EXACTLY:		/* length of string is 1 */
		opnd++;		/* skip the length byte */
		while (scan < loceol && *opnd == *scan)
			scan++;
		break;
	case ANYOF:
	case ANYBUT:
		/* Check the bound before reading each byte; a set bit rejects. */
		while (scan < loceol) {
			c = UCHARAT(scan);
			if (opnd[c >> 3] & (1 << (c & 7)))
				break;
			scan++;
		}
		break;
	case ALNUM:
		/* Bounded by loceol like every other case. */
		while (scan < loceol &&
		  (isalpha(*scan) || isdigit(*scan) || *scan == '_'))
			scan++;
		break;
	case NALNUM:
		while (scan < loceol && (!isalpha(*scan) && !isdigit(*scan) &&
		  *scan != '_'))
			scan++;
		break;
	case SPACE:
		while (scan < loceol && isspace(*scan))
			scan++;
		break;
	case NSPACE:
		while (scan < loceol && !isspace(*scan))
			scan++;
		break;
	case DIGIT:
		/* Bounded by loceol like every other case. */
		while (scan < loceol && isdigit(*scan))
			scan++;
		break;
	case NDIGIT:
		while (scan < loceol && !isdigit(*scan))
			scan++;
		break;
	default:		/* Oh dear.  Called inappropriately. */
		FAIL("internal regexp foulup");
		/* NOTREACHED */
	}

	c = scan - reginput;
	reginput = scan;

	return(c);
}

/*
 - regnext - dig the "next" pointer out of a node
 *
 * Returns the node that follows p, or NULL when p is the dummy node or
 * its link offset is zero.  Without REGALIGN, a BACK node's offset is
 * applied backwards.
 *
 * [Note, when REGALIGN is defined there are two places in regmatch()
 * that bypass this code for speed.]
 */
char *
regnext(p)
register char *p;
{
	register int link;

	/* The dummy node is never examined further; a zero link ends the chain. */
	if (p == &regdummy || (link = NEXT(p)) == 0)
		return(NULL);

#ifndef REGALIGN
	if (OP(p) == BACK)
		return(p-link);
#endif
	return(p+link);
}
Commit	Line	Data
a687059c LW	1	/* NOTE: this is derived from Henry Spencer's regexp code, and should not
	2	* confused with the original package (see point 3 below). Thanks, Henry!
	3	*/
	4
	5	/* Additional note: this code is very heavily munged from Henry's version
	6	* in places. In some spots I've traded clarity for efficiency, so don't
	7	* blame Henry for some of the lack of readability.
	8	*/
	9
	10	/* $Header: regexec.c,v 3.0 89/10/18 15:22:53 lwall Locked $
	11	*
	12	* $Log: regexec.c,v $
	13	* Revision 3.0 89/10/18 15:22:53 lwall
	14	* 3.0 baseline
	15	*
	16	*/
	17
	18	/*
	19	* regcomp and regexec -- regsub and regerror are not used in perl
	20	*
	21	* Copyright (c) 1986 by University of Toronto.
	22	* Written by Henry Spencer. Not derived from licensed software.
	23	*
	24	* Permission is granted to anyone to use this software for any
	25	* purpose on any computer system, and to redistribute it freely,
	26	* subject to the following restrictions:
	27	*
	28	* 1. The author is not responsible for the consequences of use of
	29	* this software, no matter how awful, even if they arise
	30	* from defects in it.
	31	*
	32	* 2. The origin of this software must not be misrepresented, either
	33	* by explicit claim or by omission.
	34	*
	35	* 3. Altered versions must be plainly marked as such, and must not
	36	* be misrepresented as being the original software.
	37	*
	38	**** Alterations to Henry's code are...
	39	****
	40	**** Copyright (c) 1989, Larry Wall
	41	****
	42	**** You may distribute under the terms of the GNU General Public License
	43	**** as specified in the README file that comes with the perl 3.0 kit.
	44	*
	45	* Beware that some of this code is subtly aware of the way operator
	46	* precedence is structured in regular expressions. Serious changes in
	47	* regular-expression syntax might require a total rethink.
	48	*/
	49	#include "EXTERN.h"
	50	#include "perl.h"
	51	#include "regcomp.h"
	52
	53	#ifndef STATIC
	54	#define STATIC static
	55	#endif
	56
	57	#ifdef DEBUGGING
	58	int regnarrate = 0;
	59	#endif
	60
	61	/*
	62	* regexec and friends
	63	*/
	64
65	/*
66	* Global work variables for regexec().
67	*/
68	static char *regprecomp;
69	static char reginput; / String-input pointer. */
70	static char regbol; / Beginning of input, for ^ check. */
71	static char regeol; / End of input, for $ check. */
72	static char *regstartp; / Pointer to startp array. */
73	static char *regendp; / Ditto for endp. */
74	static char reglastparen; / Similarly for lastparen. */
75	static char *regtill;
76
77	static char regmystartp[10]; / For remembering backreferences. */
78	static char *regmyendp[10];
79
80	/*
81	* Forwards.
82	*/
83	STATIC int regtry();
84	STATIC int regmatch();
85	STATIC int regrepeat();
86
87	extern int multiline;
88
89	/*
90	- regexec - match a regexp against a string
91	*/
92	int
93	regexec(prog, stringarg, strend, strbeg, minend, screamer, safebase)
94	register regexp *prog;
95	char *stringarg;
96	register char strend; / pointer to null at end of string */
97	char strbeg; / real beginning of string */
98	int minend; /* end of match must be at least minend after stringarg */
99	STR *screamer;
100	int safebase; /* no need to remember string in subbase */
101	{
102	register char *s;
103	register int i;
104	register char *c;
105	register char *string = stringarg;
106	register int tmp;
107	int minlen = 0; /* must match at least this many chars */
108	int dontbother = 0; /* how many characters not to try at end */
109	int beginning = (string == strbeg); /* is ^ valid at stringarg? */
110
111	/* Be paranoid... */
112	if (prog == NULL \|\| string == NULL) {
113	fatal("NULL regexp parameter");
114	return(0);
115	}
116
117	regprecomp = prog->precomp;
118	/* Check validity of program. */
119	if (UCHARAT(prog->program) != MAGIC) {
120	FAIL("corrupted regexp program");
121	}
122
123	if (prog->do_folding) {
124	safebase = FALSE;
125	i = strend - string;
126	New(1101,c,i+1,char);
127	(void)bcopy(string, c, i+1);
128	string = c;
129	strend = string + i;
130	for (s = string; s < strend; s++)
131	if (isupper(*s))
132	s = tolower(s);
133	}
134
135	/* If there is a "must appear" string, look for it. */
136	s = string;
137	if (prog->regmust != Nullstr) {
138	if (beginning && screamer) {
139	if (screamfirst[prog->regmust->str_rare] >= 0)
140	s = screaminstr(screamer,prog->regmust);
141	else
142	s = Nullch;
143	}
144	#ifndef lint
145	else
146	s = fbminstr((unsigned char)s, (unsigned char)strend,
147	prog->regmust);
148	#endif
149	if (!s) {
150	++prog->regmust->str_u.str_useful; /* hooray */
151	goto phooey; /* not present */
152	}
153	else if (prog->regback >= 0) {
154	s -= prog->regback;
155	if (s < string)
156	s = string;
157	minlen = prog->regback + prog->regmust->str_cur;
158	}
159	else if (--prog->regmust->str_u.str_useful < 0) { /* boo */
160	str_free(prog->regmust);
161	prog->regmust = Nullstr; /* disable regmust */
162	s = string;
163	}
164	else {
165	s = string;
166	minlen = prog->regmust->str_cur;
167	}
168	}
169
170	/* Mark beginning of line for ^ . */
171	if (beginning)
172	regbol = string;
173	else
174	regbol = NULL;
175
176	/* Mark end of line for $ (and such) */
177	regeol = strend;
178
179	/* see how far we have to get to not match where we matched before */
180	regtill = string+minend;
181
182	/* Simplest case: anchored match need be tried only once. */
183	/* [unless multiline is set] */
184	if (prog->reganch) {
185	if (regtry(prog, string))
186	goto got_it;
187	else if (multiline) {
188	if (minlen)
189	dontbother = minlen - 1;
190	strend -= dontbother;
191	/* for multiline we only have to try after newlines */
192	if (s > string)
193	s--;
194	for (; s < strend; s++) {
195	if (*s == '\n') {
196	if (++s < strend && regtry(prog, s))
197	goto got_it;
198	}
199	}
200	}
201	goto phooey;
202	}
203
204	/* Messy cases: unanchored match. */
205	if (prog->regstart) {
206	/* We know what string it must start with. */
207	if (prog->regstart->str_pok == 3) {
208	#ifndef lint
209	while ((s = fbminstr((unsigned char*)s,
210	(unsigned char*)strend, prog->regstart)) != NULL)
211	#else
212	while (s = Nullch)
213	#endif
214	{
215	if (regtry(prog, s))
216	goto got_it;
217	s++;
218	}
219	}
220	else {
221	c = prog->regstart->str_ptr;
222	while ((s = ninstr(s, strend,
223	c, c + prog->regstart->str_cur )) != NULL) {
224	if (regtry(prog, s))
225	goto got_it;
226	s++;
227	}
228	}
229	goto phooey;
230	}
231	if (c = prog->regstclass) {
232	if (minlen)
233	dontbother = minlen - 1;
234	strend -= dontbother; /* don't bother with what can't match */
235	/* We know what class it must start with. */
236	switch (OP(c)) {
237	case ANYOF: case ANYBUT:
238	c = OPERAND(c);
239	while (s < strend) {
240	i = *s;
241	if (!(c[i >> 3] & (1 << (i&7))))
242	if (regtry(prog, s))
243	goto got_it;
244	s++;
245	}
246	break;
247	case BOUND:
248	if (minlen)
249	dontbother++,strend--;
250	if (s != string) {
251	i = s[-1];
252	tmp = (isalpha(i) \|\| isdigit(i) \|\| i == '_');
253	}
254	else
255	tmp = 0; /* assume not alphanumeric */
256	while (s < strend) {
257	i = *s;
258	if (tmp != (isalpha(i) \|\| isdigit(i) \|\| i == '_')) {
259	tmp = !tmp;
260	if (regtry(prog, s))
261	goto got_it;
262	}
263	s++;
264	}
265	if (tmp && regtry(prog,s))
266	goto got_it;
267	break;
268	case NBOUND:
269	if (minlen)
270	dontbother++,strend--;
271	if (s != string) {
272	i = s[-1];
273	tmp = (isalpha(i) \|\| isdigit(i) \|\| i == '_');
274	}
275	else
276	tmp = 0; /* assume not alphanumeric */
277	while (s < strend) {
278	i = *s;
279	if (tmp != (isalpha(i) \|\| isdigit(i) \|\| i == '_'))
280	tmp = !tmp;
281	else if (regtry(prog, s))
282	goto got_it;
283	s++;
284	}
285	if (!tmp && regtry(prog,s))
286	goto got_it;
287	break;
288	case ALNUM:
289	while (s < strend) {
290	i = *s;
291	if (isalpha(i) \|\| isdigit(i) \|\| i == '_')
292	if (regtry(prog, s))
293	goto got_it;
294	s++;
295	}
296	break;
297	case NALNUM:
298	while (s < strend) {
299	i = *s;
300	if (!isalpha(i) && !isdigit(i) && i != '_')
301	if (regtry(prog, s))
302	goto got_it;
303	s++;
304	}
305	break;
306	case SPACE:
307	while (s < strend) {
308	if (isspace(*s))
309	if (regtry(prog, s))
310	goto got_it;
311	s++;
312	}
313	break;
314	case NSPACE:
315	while (s < strend) {
316	if (!isspace(*s))
317	if (regtry(prog, s))
318	goto got_it;
319	s++;
320	}
321	break;
322	case DIGIT:
323	while (s < strend) {
324	if (isdigit(*s))
325	if (regtry(prog, s))
326	goto got_it;
327	s++;
328	}
329	break;
330	case NDIGIT:
331	while (s < strend) {
332	if (!isdigit(*s))
333	if (regtry(prog, s))
334	goto got_it;
335	s++;
336	}
337	break;
338	}
339	}
340	else {
341	dontbother = minend;
342	strend -= dontbother;
343	/* We don't know much -- general case. */
344	do {
345	if (regtry(prog, s))
346	goto got_it;
347	} while (s++ < strend);
348	}
349
350	/* Failure. */
351	goto phooey;
352
353	got_it:
354	if ((!safebase && (prog->nparens \|\| sawampersand)) \|\| prog->do_folding){
355	strend += dontbother; /* uncheat */
356	if (safebase) /* no need for $digit later */
357	s = strbeg;
358	else if (strbeg != prog->subbase) {
359	i = strend - string + (stringarg - strbeg);
360	s = nsavestr(strbeg,i); /* so $digit will work later */
361	if (prog->subbase)
362	Safefree(prog->subbase);
363	prog->subbase = s;
364	}
365	else
366	s = prog->subbase;
367	s += (stringarg - strbeg);
368	for (i = 0; i <= prog->nparens; i++) {
369	if (prog->endp[i]) {
370	prog->startp[i] = s + (prog->startp[i] - string);
371	prog->endp[i] = s + (prog->endp[i] - string);
372	}
373	}
374	if (prog->do_folding)
375	Safefree(string);
376	}
377	return(1);
378
379	phooey:
380	if (prog->do_folding)
381	Safefree(string);
382	return(0);
383	}
384
385	/*
386	- regtry - try match at specific point
387	*/
388	static int /* 0 failure, 1 success */
389	regtry(prog, string)
390	regexp *prog;
391	char *string;
392	{
393	register int i;
394	register char **sp;
395	register char **ep;
396
397	reginput = string;
398	regstartp = prog->startp;
399	regendp = prog->endp;
400	reglastparen = &prog->lastparen;
401	prog->lastparen = 0;
402
403	sp = prog->startp;
404	ep = prog->endp;
405	if (prog->nparens) {
406	for (i = NSUBEXP; i > 0; i--) {
407	*sp++ = NULL;
408	*ep++ = NULL;
409	}
410	}
411	if (regmatch(prog->program + 1) && reginput >= regtill) {
412	prog->startp[0] = string;
413	prog->endp[0] = reginput;
414	return(1);
415	} else
416	return(0);
417	}
418
419	/*
420	- regmatch - main matching routine
421	*
422	* Conceptually the strategy is simple: check to see whether the current
423	* node matches, call self recursively to see whether the rest matches,
424	* and then act accordingly. In practice we make some effort to avoid
425	* recursion, in particular by going through "ordinary" nodes (that don't
426	* need to know whether the rest of the match failed) by a loop instead of
427	* by recursion.
428	*/
429	/* [lwall] I've hoisted the register declarations to the outer block in order to
430	* maybe save a little bit of pushing and popping on the stack. It also takes
431	* advantage of machines that use a register save mask on subroutine entry.
432	*/
433	static int /* 0 failure, 1 success */
434	regmatch(prog)
435	char *prog;
436	{
437	register char scan; / Current node. */
438	char next; / Next node. */
439	register int nextchar;
440	register int n; /* no or next */
441	register int ln; /* len or last */
442	register char s; / operand or save */
443	register char *locinput = reginput;
444
445	nextchar = *locinput;
446	scan = prog;
447	#ifdef DEBUGGING
448	if (scan != NULL && regnarrate)
449	fprintf(stderr, "%s(\n", regprop(scan));
450	#endif
451	while (scan != NULL) {
452	#ifdef DEBUGGING
453	if (regnarrate)
454	fprintf(stderr, "%s...\n", regprop(scan));
455	#endif
456
457	#ifdef REGALIGN
458	next = scan + NEXT(scan);
459	if (next == scan)
460	next = NULL;
461	#else
462	next = regnext(scan);
463	#endif
464
465	switch (OP(scan)) {
466	case BOL:
467	if (locinput == regbol \|\|
468	((nextchar \|\| locinput < regeol) &&
469	locinput[-1] == '\n') )
470	{
471	regtill--;
472	break;
473	}
474	return(0);
475	case EOL:
476	if ((nextchar \|\| locinput < regeol) && nextchar != '\n')
477	return(0);
478	regtill--;
479	break;
480	case ANY:
481	if ((nextchar == '\0' && locinput >= regeol) \|\|
482	nextchar == '\n')
483	return(0);
484	nextchar = *++locinput;
485	break;
486	case EXACTLY:
487	s = OPERAND(scan);
488	ln = *s++;
489	/* Inline the first character, for speed. */
490	if (*s != nextchar)
491	return(0);
492	if (locinput + ln > regeol)
493	return 0;
494	if (ln > 1 && bcmp(s, locinput, ln) != 0)
495	return(0);
496	locinput += ln;
497	nextchar = *locinput;
498	break;
499	case ANYOF:
500	case ANYBUT:
501	s = OPERAND(scan);
502	if (nextchar < 0)
503	nextchar = UCHARAT(locinput);
504	if (s[nextchar >> 3] & (1 << (nextchar&7)))
505	return(0);
506	nextchar = *++locinput;
507	if (!nextchar && locinput > regeol)
508	return 0;
509	break;
510	case ALNUM:
511	if (!nextchar)
512	return(0);
513	if (!isalpha(nextchar) && !isdigit(nextchar) &&
514	nextchar != '_')
515	return(0);
516	nextchar = *++locinput;
517	break;
518	case NALNUM:
519	if (!nextchar && locinput >= regeol)
520	return(0);
521	if (isalpha(nextchar) \|\| isdigit(nextchar) \|\|
522	nextchar == '_')
523	return(0);
524	nextchar = *++locinput;
525	break;
526	case NBOUND:
527	case BOUND:
528	if (locinput == regbol) /* was last char in word? */
529	ln = 0;
530	else
531	ln = (isalpha(locinput[-1]) \|\|
532	isdigit(locinput[-1]) \|\|
533	locinput[-1] == '_' );
534	n = (isalpha(nextchar) \|\| isdigit(nextchar) \|\|
535	nextchar == '_' ); /* is next char in word? */
536	if ((ln == n) == (OP(scan) == BOUND))
537	return(0);
538	break;
539	case SPACE:
540	if (!nextchar && locinput >= regeol)
541	return(0);
542	if (!isspace(nextchar))
543	return(0);
544	nextchar = *++locinput;
545	break;
546	case NSPACE:
547	if (!nextchar)
548	return(0);
549	if (isspace(nextchar))
550	return(0);
551	nextchar = *++locinput;
552	break;
553	case DIGIT:
554	if (!isdigit(nextchar))
555	return(0);
556	nextchar = *++locinput;
557	break;
558	case NDIGIT:
559	if (!nextchar && locinput >= regeol)
560	return(0);
561	if (isdigit(nextchar))
562	return(0);
563	nextchar = *++locinput;
564	break;
565	case REF:
566	case REF+1:
567	case REF+2:
568	case REF+3:
569	case REF+4:
570	case REF+5:
571	case REF+6:
572	case REF+7:
573	case REF+8:
574	case REF+9:
575	n = OP(scan) - REF;
576	s = regmystartp[n];
577	if (!s)
578	return(0);
579	if (!regmyendp[n])
580	return(0);
581	if (s == regmyendp[n])
582	break;
583	/* Inline the first character, for speed. */
584	if (*s != nextchar)
585	return(0);
586	ln = regmyendp[n] - s;
587	if (locinput + ln > regeol)
588	return 0;
589	if (ln > 1 && bcmp(s, locinput, ln) != 0)
590	return(0);
591	locinput += ln;
592	nextchar = *locinput;
593	break;
594
595	case NOTHING:
596	break;
597	case BACK:
598	break;
599	case OPEN+1:
600	case OPEN+2:
601	case OPEN+3:
602	case OPEN+4:
603	case OPEN+5:
604	case OPEN+6:
605	case OPEN+7:
606	case OPEN+8:
607	case OPEN+9:
608	n = OP(scan) - OPEN;
609	reginput = locinput;
610
611	regmystartp[n] = locinput; /* for REF */
612	if (regmatch(next)) {
613	/*
614	* Don't set startp if some later
615	* invocation of the same parentheses
616	* already has.
617	*/
618	if (regstartp[n] == NULL)
619	regstartp[n] = locinput;
620	return(1);
621	} else
622	return(0);
623	/* NOTREACHED */
624	case CLOSE+1:
625	case CLOSE+2:
626	case CLOSE+3:
627	case CLOSE+4:
628	case CLOSE+5:
629	case CLOSE+6:
630	case CLOSE+7:
631	case CLOSE+8:
632	case CLOSE+9: {
633	n = OP(scan) - CLOSE;
634	reginput = locinput;
635
636	regmyendp[n] = locinput; /* for REF */
637	if (regmatch(next)) {
638	/*
639	* Don't set endp if some later
640	* invocation of the same parentheses
641	* already has.
642	*/
643	if (regendp[n] == NULL) {
644	regendp[n] = locinput;
645	if (n > *reglastparen)
646	*reglastparen = n;
647	}
648	return(1);
649	} else
650	return(0);
651	}
/* NOTREACHED */
653	case BRANCH: {
654	if (OP(next) != BRANCH) /* No choice. */
655	next = NEXTOPER(scan); /* Avoid recursion. */
656	else {
657	do {
658	reginput = locinput;
659	if (regmatch(NEXTOPER(scan)))
660	return(1);
661	#ifdef REGALIGN
662	if (n = NEXT(scan))
663	scan += n;
664	else
665	scan = NULL;
666	#else
667	scan = regnext(scan);
668	#endif
669	} while (scan != NULL && OP(scan) == BRANCH);
670	return(0);
671	/* NOTREACHED */
672	}
673	}
674	break;
675	case STAR:
676	case PLUS:
677	/*
678	* Lookahead to avoid useless match attempts
679	* when we know what character comes next.
680	*/
681	if (OP(next) == EXACTLY)
682	nextchar = *(OPERAND(next)+1);
683	else
684	nextchar = -1000;
685	ln = (OP(scan) == STAR) ? 0 : 1;
686	reginput = locinput;
687	n = regrepeat(NEXTOPER(scan));
688	while (n >= ln) {
689	/* If it could work, try it. */
690	if (nextchar == -1000 \|\| *reginput == nextchar)
691	if (regmatch(next))
692	return(1);
693	/* Couldn't or didn't -- back up. */
694	n--;
695	reginput = locinput + n;
696	}
697	return(0);
698	case END:
699	reginput = locinput; /* put where regtry can find it */
700	return(1); /* Success! */
701	default:
702	printf("%x %d\n",scan,scan[1]);
703	FAIL("regexp memory corruption");
704	}
705
706	scan = next;
707	}
708
709	/*
710	* We get here only if there's trouble -- normally "case END" is
711	* the terminating point.
712	*/
713	FAIL("corrupted regexp pointers");
/* NOTREACHED */
715	#ifdef lint
716	return 0;
717	#endif
718	}
719
720	/*
721	- regrepeat - repeatedly match something simple, report how many
722	*/
723	/*
724	* [This routine now assumes that it will only match on things of length 1.
725	* That was true before, but now we assume scan - reginput is the count,
726	* rather than incrementing count on every character.]
727	*/
728	static int
729	regrepeat(p)
730	char *p;
731	{
732	register char *scan;
733	register char *opnd;
734	register int c;
735	register char *loceol = regeol;
736
737	scan = reginput;
738	opnd = OPERAND(p);
739	switch (OP(p)) {
740	case ANY:
741	while (scan < loceol && *scan != '\n')
742	scan++;
743	break;
744	case EXACTLY: /* length of string is 1 */
745	opnd++;
746	while (scan < loceol && opnd == scan)
747	scan++;
748	break;
749	case ANYOF:
750	case ANYBUT:
751	c = UCHARAT(scan);
752	while (scan < loceol && !(opnd[c >> 3] & (1 << (c & 7)))) {
753	scan++;
754	c = UCHARAT(scan);
755	}
756	break;
757	case ALNUM:
758	while (isalpha(scan) \|\| isdigit(scan) \|\| *scan == '_')
759	scan++;
760	break;
761	case NALNUM:
762	while (scan < loceol && (!isalpha(scan) && !isdigit(scan) &&
763	*scan != '_'))
764	scan++;
765	break;
766	case SPACE:
767	while (scan < loceol && isspace(*scan))
768	scan++;
769	break;
770	case NSPACE:
771	while (scan < loceol && !isspace(*scan))
772	scan++;
773	break;
774	case DIGIT:
775	while (isdigit(*scan))
776	scan++;
777	break;
778	case NDIGIT:
779	while (scan < loceol && !isdigit(*scan))
780	scan++;
781	break;
782	default: /* Oh dear. Called inappropriately. */
783	FAIL("internal regexp foulup");
784	/* NOTREACHED */
785	}
786
787	c = scan - reginput;
788	reginput = scan;
789
790	return(c);
791	}
792
793	/*
794	- regnext - dig the "next" pointer out of a node
795	*
796	* [Note, when REGALIGN is defined there are two places in regmatch()
797	* that bypass this code for speed.]
798	*/
799	char *
800	regnext(p)
801	register char *p;
802	{
803	register int offset;
804
805	if (p == &regdummy)
806	return(NULL);
807
808	offset = NEXT(p);
809	if (offset == 0)
810	return(NULL);
811
812	#ifdef REGALIGN
813	return(p+offset);
814	#else
815	if (OP(p) == BACK)
816	return(p-offset);
817	else
818	return(p+offset);
819	#endif
820	}