From 47109bfb6ec14235ab32462f69f32cdab1822cee Mon Sep 17 00:00:00 2001 From: Ilya Zakharevich Date: Tue, 4 Feb 1997 06:02:10 -0500 Subject: [PATCH] Regexp optimizations Subject: Re: Regexp optimizations p5p-msgid: <199702051450.JAA13439@rio.atlantic.net> private-msgid: <199702041102.GAA24805@monk.mps.ohio-state.edu> --- regcomp.c | 72 ++++++++++++++++++++++++++++++++++++++++++++++++++------------- regexec.c | 4 ++-- 2 files changed, 59 insertions(+), 17 deletions(-) diff --git a/regcomp.c b/regcomp.c index d736c18..9e39afe 100644 --- a/regcomp.c +++ b/regcomp.c @@ -145,6 +145,13 @@ PMOP* pm; I32 minlen = 0; I32 sawplus = 0; I32 sawopen = 0; +#define MAX_REPEAT_DEPTH 12 + struct { + char *opcode; + I32 count; + } repeat_stack[MAX_REPEAT_DEPTH]; + I32 repeat_depth = 0; + I32 repeat_count = 1; /* We start unmultiplied. */ if (exp == NULL) croak("NULL regexp argument"); @@ -289,9 +296,9 @@ PMOP* pm; char *t; first = scan; - while (OP(t = regnext(scan)) == CLOSE) + while ((t = regnext(scan)) && OP(t) == CLOSE) scan = t; - minlen += *OPERAND(first); + minlen += *OPERAND(first) * repeat_count; if (curback - backish == len) { sv_catpvn(longish, OPERAND(first)+1, *OPERAND(first)); @@ -310,22 +317,57 @@ PMOP* pm; curback += *OPERAND(first); } else if (strchr(varies,OP(scan))) { - curback = -30000; + int tcount; + char *next; + + if (repeat_depth < MAX_REPEAT_DEPTH + && ((OP(scan) == PLUS + && (tcount = 1) + && (next = NEXTOPER(scan))) + || (regkind[(U8)OP(scan)] == CURLY + && (tcount = ARG1(scan)) + && (next = NEXTOPER(scan)+4)))) + { + /* We treat (abc)+ as (abc)(abc)*. */ + + /* Mark the place to return back. */ + repeat_stack[repeat_depth].opcode = regnext(scan); + repeat_stack[repeat_depth].count = repeat_count; + repeat_depth++; + repeat_count *= tcount; + + /* Go deeper: */ + scan = next; + continue; + } + else { + curback = -30000; + len = 0; + if (SvCUR(longish) > SvCUR(longest)) { + sv_setsv(longest,longish); + backest = backish; + } + sv_setpvn(longish,"",0); + } + } + else if (strchr(simple,OP(scan))) { + curback++; + minlen += repeat_count; len = 0; if (SvCUR(longish) > SvCUR(longest)) { sv_setsv(longest,longish); backest = backish; } sv_setpvn(longish,"",0); - if (OP(scan) == PLUS && strchr(simple,OP(NEXTOPER(scan)))) - minlen++; - else if (regkind[(U8)OP(scan)] == CURLY && - strchr(simple,OP(NEXTOPER(scan)+4))) - minlen += ARG1(scan); } - else if (strchr(simple,OP(scan))) { - curback++; - minlen++; + scan = regnext(scan); + if (!scan) { /* Go up PLUS or CURLY. */ + if (!repeat_depth--) + croak("panic: re scan"); + scan = repeat_stack[repeat_depth].opcode; + repeat_count = repeat_stack[repeat_depth].count; + /* Need to submit the longest string found: */ + curback = -30000; len = 0; if (SvCUR(longish) > SvCUR(longest)) { sv_setsv(longest,longish); @@ -333,12 +375,11 @@ PMOP* pm; } sv_setpvn(longish,"",0); } - scan = regnext(scan); } /* Prefer earlier on tie, unless we can tail match latter */ - if (SvCUR(longish) + (regkind[(U8)OP(first)] == EOL) + if (SvCUR(longish) + (first && regkind[(U8)OP(first)] == EOL) > SvCUR(longest)) { sv_setsv(longest,longish); @@ -357,11 +398,12 @@ PMOP* pm; if (backest < 0) backest = -1; r->regback = backest; - if (SvCUR(longest) > !(sawstudy || regkind[(U8)OP(first)] == EOL)) + if (SvCUR(longest) > !(sawstudy || + (first && regkind[(U8)OP(first)] == EOL))) fbm_compile(r->regmust); (void)SvUPGRADE(r->regmust, SVt_PVBM); BmUSEFUL(r->regmust) = 100; - if (regkind[(U8)OP(first)] == EOL && SvCUR(longish)) + if (first && regkind[(U8)OP(first)] == EOL && SvCUR(longish)) SvTAIL_on(r->regmust); } else { diff --git a/regexec.c b/regexec.c index c55eb97..12908b2 100644 --- a/regexec.c +++ b/regexec.c @@ -290,7 +290,7 @@ I32 safebase; /* no need to remember string in subbase */ s++; } } - else if (SvPOK(prog->regstart) == 3) { + else if (SvTYPE(prog->regstart) == SVt_PVBM) { /* We know what string it must start with. */ while ((s = fbm_instr((unsigned char*)s, (unsigned char*)strend, prog->regstart)) != NULL) @@ -300,7 +300,7 @@ I32 safebase; /* no need to remember string in subbase */ s++; } } - else { + else { /* Optimized fbm_instr: */ c = SvPVX(prog->regstart); while ((s = ninstr(s, strend, c, c + SvCUR(prog->regstart))) != NULL) { -- 1.8.3.1