From cf93c79d660ae36ccc5f83d949c599473fc522ce Mon Sep 17 00:00:00 2001 From: Ilya Zakharevich Date: Mon, 24 May 1999 22:42:23 -0400 Subject: [PATCH] REx engine improvements Message-Id: <199905250642.CAA06208@monk.mps.ohio-state.edu> p4raw-id: //depot/perl@3475 --- embedvar.h | 6 ++ mg.c | 48 +++++---- objXSUB.h | 4 + pp.c | 21 ++-- pp_ctl.c | 29 ++--- pp_hot.c | 134 +++++++++++++---------- regcomp.c | 37 +++---- regexec.c | 191 +++++++++++++++++++-------------- regexp.h | 42 +++++--- t/op/pat.t | 16 ++- t/op/re_tests | 201 ++++++++++++++++++++++++++++++++++- t/op/regexp.t | 9 +- thrdvar.h | 6 +- util.c | 334 ++++++++++++++++++++++++++++++++++++++++------------------ 14 files changed, 758 insertions(+), 320 deletions(-) diff --git a/embedvar.h b/embedvar.h index e6dad21..73c674c 100644 --- a/embedvar.h +++ b/embedvar.h @@ -62,6 +62,8 @@ #define PL_reg_magic (PL_curinterp->Treg_magic) #define PL_reg_oldcurpm (PL_curinterp->Treg_oldcurpm) #define PL_reg_oldpos (PL_curinterp->Treg_oldpos) +#define PL_reg_oldsaved (PL_curinterp->Treg_oldsaved) +#define PL_reg_oldsavedlen (PL_curinterp->Treg_oldsavedlen) #define PL_reg_re (PL_curinterp->Treg_re) #define PL_reg_start_tmp (PL_curinterp->Treg_start_tmp) #define PL_reg_start_tmpl (PL_curinterp->Treg_start_tmpl) @@ -453,6 +455,8 @@ #define PL_Treg_magic PL_reg_magic #define PL_Treg_oldcurpm PL_reg_oldcurpm #define PL_Treg_oldpos PL_reg_oldpos +#define PL_Treg_oldsaved PL_reg_oldsaved +#define PL_Treg_oldsavedlen PL_reg_oldsavedlen #define PL_Treg_re PL_reg_re #define PL_Treg_start_tmp PL_reg_start_tmp #define PL_Treg_start_tmpl PL_reg_start_tmpl @@ -589,6 +593,8 @@ #define PL_reg_magic (thr->Treg_magic) #define PL_reg_oldcurpm (thr->Treg_oldcurpm) #define PL_reg_oldpos (thr->Treg_oldpos) +#define PL_reg_oldsaved (thr->Treg_oldsaved) +#define PL_reg_oldsavedlen (thr->Treg_oldsavedlen) #define PL_reg_re (thr->Treg_re) #define PL_reg_start_tmp (thr->Treg_start_tmp) #define PL_reg_start_tmpl (thr->Treg_start_tmpl) diff --git 
a/mg.c b/mg.c index 9183104..adfad7d 100644 --- a/mg.c +++ b/mg.c @@ -341,23 +341,23 @@ magic_regdatum_get(SV *sv, MAGIC *mg) { dTHR; register I32 paren; - register char *s; + register I32 s; register I32 i; register REGEXP *rx; - char *t; + I32 t; if (PL_curpm && (rx = PL_curpm->op_pmregexp)) { paren = mg->mg_len; if (paren < 0) return 0; if (paren <= rx->nparens && - (s = rx->startp[paren]) && - (t = rx->endp[paren])) + (s = rx->startp[paren]) != -1 && + (t = rx->endp[paren]) != -1) { if (mg->mg_obj) /* @+ */ - i = t - rx->subbeg; + i = t; else /* @- */ - i = s - rx->subbeg; + i = s; sv_setiv(sv,i); } } @@ -378,13 +378,15 @@ magic_len(SV *sv, MAGIC *mg) case '1': case '2': case '3': case '4': case '5': case '6': case '7': case '8': case '9': case '&': if (PL_curpm && (rx = PL_curpm->op_pmregexp)) { + I32 s1, t1; + paren = atoi(mg->mg_ptr); getparen: if (paren <= rx->nparens && - (s = rx->startp[paren]) && - (t = rx->endp[paren])) + (s1 = rx->startp[paren]) != -1 && + (t1 = rx->endp[paren]) != -1) { - i = t - s; + i = t1 - s1; if (i >= 0) return i; } @@ -399,8 +401,8 @@ magic_len(SV *sv, MAGIC *mg) return 0; case '`': if (PL_curpm && (rx = PL_curpm->op_pmregexp)) { - if ((s = rx->subbeg) && rx->startp[0]) { - i = rx->startp[0] - s; + if (rx->startp[0] != -1) { + i = rx->startp[0]; if (i >= 0) return i; } @@ -408,8 +410,8 @@ magic_len(SV *sv, MAGIC *mg) return 0; case '\'': if (PL_curpm && (rx = PL_curpm->op_pmregexp)) { - if (rx->subend && (s = rx->endp[0])) { - i = rx->subend - s; + if (rx->endp[0] != -1) { + i = rx->sublen - rx->endp[0]; if (i >= 0) return i; } @@ -589,6 +591,8 @@ magic_get(SV *sv, MAGIC *mg) case '1': case '2': case '3': case '4': case '5': case '6': case '7': case '8': case '9': case '&': if (PL_curpm && (rx = PL_curpm->op_pmregexp)) { + I32 s1, t1; + /* * Pre-threads, this was paren = atoi(GvENAME((GV*)mg->mg_obj)); * XXX Does the new way break anything? 
@@ -596,10 +600,11 @@ magic_get(SV *sv, MAGIC *mg) paren = atoi(mg->mg_ptr); getparen: if (paren <= rx->nparens && - (s = rx->startp[paren]) && - (t = rx->endp[paren])) + (s1 = rx->startp[paren]) != -1 && + (t1 = rx->endp[paren]) != -1) { - i = t - s; + i = t1 - s1; + s = rx->subbeg + s1; getrx: if (i >= 0) { bool was_tainted; @@ -607,7 +612,7 @@ magic_get(SV *sv, MAGIC *mg) was_tainted = PL_tainted; PL_tainted = FALSE; } - sv_setpvn(sv,s,i); + sv_setpvn(sv, s, i); if (PL_tainting) PL_tainted = (was_tainted || RX_MATCH_TAINTED(rx)); break; @@ -626,8 +631,8 @@ magic_get(SV *sv, MAGIC *mg) break; case '`': if (PL_curpm && (rx = PL_curpm->op_pmregexp)) { - if ((s = rx->subbeg) && rx->startp[0]) { - i = rx->startp[0] - s; + if ((s = rx->subbeg) && rx->startp[0] != -1) { + i = rx->startp[0]; goto getrx; } } @@ -635,8 +640,9 @@ magic_get(SV *sv, MAGIC *mg) break; case '\'': if (PL_curpm && (rx = PL_curpm->op_pmregexp)) { - if (rx->subend && (s = rx->endp[0])) { - i = rx->subend - s; + if (rx->subbeg && rx->endp[0] != -1) { + s = rx->subbeg + rx->endp[0]; + i = rx->sublen - rx->endp[0]; goto getrx; } } diff --git a/objXSUB.h b/objXSUB.h index c29bc06..658e5ce 100644 --- a/objXSUB.h +++ b/objXSUB.h @@ -518,6 +518,10 @@ #define PL_reg_oldcurpm pPerl->PL_reg_oldcurpm #undef PL_reg_oldpos #define PL_reg_oldpos pPerl->PL_reg_oldpos +#undef PL_reg_oldsaved +#define PL_reg_oldsaved pPerl->PL_reg_oldsaved +#undef PL_reg_oldsavedlen +#define PL_reg_oldsavedlen pPerl->PL_reg_oldsavedlen #undef PL_reg_re #define PL_reg_re pPerl->PL_reg_re #undef PL_reg_start_tmp diff --git a/pp.c b/pp.c index 42fd9b8..1b9ebdd 100644 --- a/pp.c +++ b/pp.c @@ -5006,8 +5006,10 @@ PP(pp_split) else if (rx->check_substr && !rx->nparens && (rx->reganch & ROPT_CHECK_ALL) && !(rx->reganch & ROPT_ANCH)) { + int tail = SvTAIL(rx->check_substr) != 0; + i = SvCUR(rx->check_substr); - if (i == 1 && !SvTAIL(rx->check_substr)) { + if (i == 1 && !tail) { i = *SvPVX(rx->check_substr); while (--limit) { /*SUPPRESS 
530*/ @@ -5026,7 +5028,7 @@ PP(pp_split) #ifndef lint while (s < strend && --limit && (m=fbm_instr((unsigned char*)s, (unsigned char*)strend, - rx->check_substr, 0)) ) + rx->check_substr, PL_multiline ? FBMrf_MULTILINE : 0)) ) #endif { dstr = NEWSV(31, m-s); @@ -5034,7 +5036,7 @@ PP(pp_split) if (make_mortal) sv_2mortal(dstr); XPUSHs(dstr); - s = m + i; + s = m + i - tail; /* Fake \n at the end */ } } } @@ -5044,15 +5046,14 @@ PP(pp_split) CALLREGEXEC(rx, s, strend, orig, 1, sv, NULL, 0)) { TAINT_IF(RX_MATCH_TAINTED(rx)); - if (rx->subbase - && rx->subbase != orig) { + if (RX_MATCH_COPIED(rx) && rx->subbeg != orig) { m = s; s = orig; - orig = rx->subbase; + orig = rx->subbeg; s = orig + (m - s); strend = s + (strend - m); } - m = rx->startp[0]; + m = rx->startp[0] + orig; dstr = NEWSV(32, m-s); sv_setpvn(dstr, s, m-s); if (make_mortal) @@ -5060,8 +5061,8 @@ PP(pp_split) XPUSHs(dstr); if (rx->nparens) { for (i = 1; i <= rx->nparens; i++) { - s = rx->startp[i]; - m = rx->endp[i]; + s = rx->startp[i] + orig; + m = rx->endp[i] + orig; if (m && s) { dstr = NEWSV(33, m-s); sv_setpvn(dstr, s, m-s); @@ -5073,7 +5074,7 @@ PP(pp_split) XPUSHs(dstr); } } - s = rx->endp[0]; + s = rx->endp[0] + orig; } } diff --git a/pp_ctl.c b/pp_ctl.c index 3e4db3b..a4c0247 100644 --- a/pp_ctl.c +++ b/pp_ctl.c @@ -172,8 +172,8 @@ PP(pp_substcont) if (cx->sb_once || !CALLREGEXEC(rx, s, cx->sb_strend, orig, s == m, cx->sb_targ, NULL, ((cx->sb_rflags & REXEC_COPY_STR) - ? REXEC_IGNOREPOS - : (REXEC_COPY_STR|REXEC_IGNOREPOS)))) + ? 
(REXEC_IGNOREPOS|REXEC_NOT_FIRST) + : (REXEC_COPY_STR|REXEC_IGNOREPOS|REXEC_NOT_FIRST)))) { SV *targ = cx->sb_targ; sv_catpvn(dstr, s, cx->sb_strend - s); @@ -201,16 +201,16 @@ PP(pp_substcont) RETURNOP(pm->op_next); } } - if (rx->subbase && rx->subbase != orig) { + if (RX_MATCH_COPIED(rx) && rx->subbeg != orig) { m = s; s = orig; - cx->sb_orig = orig = rx->subbase; + cx->sb_orig = orig = rx->subbeg; s = orig + (m - s); cx->sb_strend = s + (cx->sb_strend - m); } - cx->sb_m = m = rx->startp[0]; + cx->sb_m = m = rx->startp[0] + orig; sv_catpvn(dstr, s, m-s); - cx->sb_s = rx->endp[0]; + cx->sb_s = rx->endp[0] + orig; cx->sb_rxtainted |= RX_MATCH_TAINTED(rx); rxres_save(&cx->sb_rxres, rx); RETURNOP(pm->op_pmreplstart); @@ -231,13 +231,13 @@ rxres_save(void **rsp, REGEXP *rx) *rsp = (void*)p; } - *p++ = (UV)rx->subbase; - rx->subbase = Nullch; + *p++ = (UV)(RX_MATCH_COPIED(rx) ? rx->subbeg : Nullch); + RX_MATCH_COPIED_off(rx); *p++ = rx->nparens; *p++ = (UV)rx->subbeg; - *p++ = (UV)rx->subend; + *p++ = (UV)rx->sublen; for (i = 0; i <= rx->nparens; ++i) { *p++ = (UV)rx->startp[i]; *p++ = (UV)rx->endp[i]; @@ -250,17 +250,18 @@ rxres_restore(void **rsp, REGEXP *rx) UV *p = (UV*)*rsp; U32 i; - Safefree(rx->subbase); - rx->subbase = (char*)(*p); + if (RX_MATCH_COPIED(rx)) + Safefree(rx->subbeg); + RX_MATCH_COPIED_set(rx, *p); *p++ = 0; rx->nparens = *p++; rx->subbeg = (char*)(*p++); - rx->subend = (char*)(*p++); + rx->sublen = (I32)(*p++); for (i = 0; i <= rx->nparens; ++i) { - rx->startp[i] = (char*)(*p++); - rx->endp[i] = (char*)(*p++); + rx->startp[i] = (I32)(*p++); + rx->endp[i] = (I32)(*p++); } } diff --git a/pp_hot.c b/pp_hot.c index 76e5e53..599a2af 100644 --- a/pp_hot.c +++ b/pp_hot.c @@ -846,7 +846,9 @@ PP(pp_match) char *strend; I32 global; I32 r_flags = 0; - char *truebase; + char *truebase; /* Start of string, may be + relocated if REx engine + copies the string. 
*/ register REGEXP *rx = pm->op_pmregexp; bool rxtainted; I32 gimme = GIMME; @@ -888,15 +890,15 @@ PP(pp_match) /* XXXX What part of this is needed with true \G-support? */ if (global = pm->op_pmflags & PMf_GLOBAL) { - rx->startp[0] = 0; + rx->startp[0] = -1; if (SvTYPE(TARG) >= SVt_PVMG && SvMAGIC(TARG)) { MAGIC* mg = mg_find(TARG, 'g'); if (mg && mg->mg_len >= 0) { if (!(rx->reganch & ROPT_GPOS_SEEN)) - rx->endp[0] = rx->startp[0] = s + mg->mg_len; + rx->endp[0] = rx->startp[0] = mg->mg_len; else if (rx->reganch & ROPT_ANCH_GPOS) { r_flags |= REXEC_IGNOREPOS; - rx->endp[0] = rx->startp[0] = s + mg->mg_len; + rx->endp[0] = rx->startp[0] = mg->mg_len; } minmatch = (mg->mg_flags & MGf_MINMATCH); update_minmatch = 0; @@ -917,8 +919,8 @@ PP(pp_match) } play_it_again: - if (global && rx->startp[0]) { - t = s = rx->endp[0]; + if (global && rx->startp[0] != -1) { + t = s = rx->endp[0] + truebase; if ((s + rx->minlen) > strend) goto nope; if (update_minmatch++) @@ -926,29 +928,33 @@ play_it_again: } if (rx->check_substr) { if (!(rx->reganch & ROPT_NOSCAN)) { /* Floating checkstring. */ + SV *c = rx->check_substr; + if (r_flags & REXEC_SCREAM) { I32 p = -1; char *b; - - if (PL_screamfirst[BmRARE(rx->check_substr)] < 0) + + if (PL_screamfirst[BmRARE(c)] < 0 + && !( BmRARE(c) == '\n' && (BmPREVIOUS(c) == SvCUR(c) - 1) + && SvTAIL(c) )) goto nope; b = (char*)HOP((U8*)s, rx->check_offset_min); - if (!(s = screaminstr(TARG, rx->check_substr, b - s, 0, &p, 0))) + if (!(s = screaminstr(TARG, c, b - s, 0, &p, 0))) goto nope; if ((rx->reganch & ROPT_CHECK_ALL) - && !PL_sawampersand && !SvTAIL(rx->check_substr)) + && !PL_sawampersand && !SvTAIL(c)) goto yup; } else if (!(s = fbm_instr((unsigned char*)HOP((U8*)s, rx->check_offset_min), - (unsigned char*)strend, - rx->check_substr, 0))) + (unsigned char*)strend, c, + PL_multiline ? 
FBMrf_MULTILINE : 0))) goto nope; else if ((rx->reganch & ROPT_CHECK_ALL) && !PL_sawampersand) goto yup; if (s && rx->check_offset_max < s - t) { - ++BmUSEFUL(rx->check_substr); + ++BmUSEFUL(c); s = (char*)HOP((U8*)s, -rx->check_offset_max); } else @@ -959,10 +965,30 @@ play_it_again: else if (!PL_multiline) { /* Anchored near beginning of string. */ I32 slen; char *b = (char*)HOP((U8*)s, rx->check_offset_min); - if (*SvPVX(rx->check_substr) != *b - || ((slen = SvCUR(rx->check_substr)) > 1 - && memNE(SvPVX(rx->check_substr), b, slen))) - goto nope; + + if (SvTAIL(rx->check_substr)) { + slen = SvCUR(rx->check_substr); /* >= 1 */ + + if ( strend - b > slen || strend - b < slen - 1 ) + goto nope; + if ( strend - b == slen && strend[-1] != '\n') + goto nope; + /* Now should match b[0..slen-2] */ + slen--; + if (slen && (*SvPVX(rx->check_substr) != *b + || (slen > 1 + && memNE(SvPVX(rx->check_substr), b, slen)))) + goto nope; + if ((rx->reganch & ROPT_CHECK_ALL) && !PL_sawampersand) + goto yup; + } else { /* Assume len > 0 */ + if (*SvPVX(rx->check_substr) != *b + || ((slen = SvCUR(rx->check_substr)) > 1 + && memNE(SvPVX(rx->check_substr), b, slen))) + goto nope; + if ((rx->reganch & ROPT_CHECK_ALL) && !PL_sawampersand) + goto yup; + } } if (!(rx->reganch & ROPT_NAUGHTY) && --BmUSEFUL(rx->check_substr) < 0 && rx->check_substr == rx->float_substr) { @@ -1000,17 +1026,17 @@ play_it_again: for (i = !i; i <= iters; i++) { PUSHs(sv_newmortal()); /*SUPPRESS 560*/ - if ((s = rx->startp[i]) && rx->endp[i] ) { - len = rx->endp[i] - s; + if ((rx->startp[i] != -1) && rx->endp[i] != -1 ) { + len = rx->endp[i] - rx->startp[i]; + s = rx->startp[i] + truebase; sv_setpvn(*SP, s, len); } } if (global) { - truebase = rx->subbeg; - strend = rx->subend; - had_zerolen = (rx->startp[0] && rx->startp[0] == rx->endp[0]); + had_zerolen = (rx->startp[0] != -1 + && rx->startp[0] == rx->endp[0]); PUTBACK; /* EVAL blocks may use stack */ - r_flags |= REXEC_IGNOREPOS; + r_flags |= REXEC_IGNOREPOS | 
REXEC_NOT_FIRST; goto play_it_again; } else if (!iters) @@ -1027,8 +1053,8 @@ play_it_again: sv_magic(TARG, (SV*)0, 'g', Nullch, 0); mg = mg_find(TARG, 'g'); } - if (rx->startp[0]) { - mg->mg_len = rx->endp[0] - rx->subbeg; + if (rx->startp[0] != -1) { + mg->mg_len = rx->endp[0]; if (rx->startp[0] == rx->endp[0]) mg->mg_flags |= MGf_MINMATCH; else @@ -1047,23 +1073,29 @@ yup: /* Confirmed by check_substr */ PL_curpm = pm; if (pm->op_pmflags & PMf_ONCE) pm->op_pmdynflags |= PMdf_USED; - Safefree(rx->subbase); - rx->subbase = Nullch; + if (RX_MATCH_COPIED(rx)) + Safefree(rx->subbeg); + RX_MATCH_COPIED_off(rx); + rx->subbeg = Nullch; if (global) { rx->subbeg = truebase; - rx->subend = strend; - rx->startp[0] = s; - rx->endp[0] = s + SvCUR(rx->check_substr); + rx->startp[0] = s - truebase; + rx->endp[0] = s - truebase + SvCUR(rx->check_substr); + rx->sublen = strend - truebase; goto gotcha; - } + } if (PL_sawampersand) { - char *tmps; + I32 off; - tmps = rx->subbase = savepvn(t, strend-t); - rx->subbeg = tmps; - rx->subend = tmps + (strend-t); - tmps = rx->startp[0] = tmps + (s - t); - rx->endp[0] = tmps + SvCUR(rx->check_substr); + rx->subbeg = savepvn(t, strend - t); + rx->sublen = strend - t; + RX_MATCH_COPIED_on(rx); + off = rx->startp[0] = s - t; + rx->endp[0] = off + SvCUR(rx->check_substr); + } + else { /* startp/endp are used by @- @+. */ + rx->startp[0] = s - truebase; + rx->endp[0] = s - truebase + SvCUR(rx->check_substr); } LEAVE_SCOPE(oldsave); RETPUSHYES; @@ -1714,7 +1746,8 @@ PP(pp_subst) } else if (!(s = fbm_instr((unsigned char*)HOP((U8*)s, rx->check_offset_min), (unsigned char*)strend, - rx->check_substr, 0))) + rx->check_substr, + PL_multiline ? 
FBMrf_MULTILINE : 0))) goto nope; if (s && rx->check_offset_max < s - m) { ++BmUSEFUL(rx->check_substr); @@ -1766,13 +1799,8 @@ PP(pp_subst) SvSCREAM_off(TARG); /* disable possible screamer */ if (once) { rxtainted |= RX_MATCH_TAINTED(rx); - if (rx->subbase) { - m = orig + (rx->startp[0] - rx->subbase); - d = orig + (rx->endp[0] - rx->subbase); - } else { - m = rx->startp[0]; - d = rx->endp[0]; - } + m = orig + rx->startp[0]; + d = orig + rx->endp[0]; s = orig; if (m - s > strend - d) { /* faster to shorten from end */ if (clen) { @@ -1815,7 +1843,7 @@ PP(pp_subst) if (iters++ > maxiters) DIE("Substitution loop"); rxtainted |= RX_MATCH_TAINTED(rx); - m = rx->startp[0]; + m = rx->startp[0] + orig; /*SUPPRESS 560*/ if (i = m - s) { if (s != d) @@ -1826,9 +1854,9 @@ PP(pp_subst) Copy(c, d, clen, char); d += clen; } - s = rx->endp[0]; + s = rx->endp[0] + orig; } while (CALLREGEXEC(rx, s, strend, orig, s == m, - Nullsv, NULL, 0)); /* don't match same null twice */ + Nullsv, NULL, REXEC_NOT_FIRST)); /* don't match same null twice */ if (s != d) { i = strend - s; SvCUR_set(TARG, d - SvPVX(TARG) + i); @@ -1866,21 +1894,21 @@ PP(pp_subst) PUSHSUBST(cx); RETURNOP(cPMOP->op_pmreplroot); } - r_flags |= REXEC_IGNOREPOS; + r_flags |= REXEC_IGNOREPOS | REXEC_NOT_FIRST; do { if (iters++ > maxiters) DIE("Substitution loop"); rxtainted |= RX_MATCH_TAINTED(rx); - if (rx->subbase && rx->subbase != orig) { + if (RX_MATCH_COPIED(rx) && rx->subbeg != orig) { m = s; s = orig; - orig = rx->subbase; + orig = rx->subbeg; s = orig + (m - s); strend = s + (strend - m); } - m = rx->startp[0]; + m = rx->startp[0] + orig; sv_catpvn(dstr, s, m-s); - s = rx->endp[0]; + s = rx->endp[0] + orig; if (clen) sv_catpvn(dstr, c, clen); if (once) diff --git a/regcomp.c b/regcomp.c index 34640b7..a360f6a 100644 --- a/regcomp.c +++ b/regcomp.c @@ -875,7 +875,8 @@ pregcomp(char *exp, char *xend, PMOP *pm) r->refcnt = 1; r->prelen = xend - exp; r->precomp = PL_regprecomp; - r->subbeg = r->subbase = NULL; + 
r->subbeg = NULL; + r->reganch = pm->op_pmflags & PMf_COMPILETIME; r->nparens = PL_regnpar - 1; /* set early to validate backrefs */ r->substrs = 0; /* Useful during FAIL. */ @@ -898,7 +899,7 @@ pregcomp(char *exp, char *xend, PMOP *pm) return(NULL); /* Dig out information for optimizations. */ - r->reganch = pm->op_pmflags & PMf_COMPILETIME; + r->reganch = pm->op_pmflags & PMf_COMPILETIME; /* Again? */ pm->op_pmflags = PL_regflags; if (UTF) r->reganch |= ROPT_UTF8; @@ -998,6 +999,8 @@ pregcomp(char *exp, char *xend, PMOP *pm) || (data.flags & SF_FL_BEFORE_EOL && (!(data.flags & SF_FL_BEFORE_MEOL) || (PL_regflags & PMf_MULTILINE)))) { + int t; + if (SvCUR(data.longest_fixed) /* ok to leave SvCUR */ && data.offset_fixed == data.offset_float_min && SvCUR(data.longest_fixed) == SvCUR(data.longest_float)) @@ -1006,12 +1009,10 @@ pregcomp(char *exp, char *xend, PMOP *pm) r->float_substr = data.longest_float; r->float_min_offset = data.offset_float_min; r->float_max_offset = data.offset_float_max; - fbm_compile(r->float_substr, 0); - BmUSEFUL(r->float_substr) = 100; - if (data.flags & SF_FL_BEFORE_EOL /* Cannot have SEOL and MULTI */ - && (!(data.flags & SF_FL_BEFORE_MEOL) - || (PL_regflags & PMf_MULTILINE))) - SvTAIL_on(r->float_substr); + t = (data.flags & SF_FL_BEFORE_EOL /* Can't have SEOL and MULTI */ + && (!(data.flags & SF_FL_BEFORE_MEOL) + || (PL_regflags & PMf_MULTILINE))); + fbm_compile(r->float_substr, t ? 
FBMcf_TAIL : 0); } else { remove_float: @@ -1025,14 +1026,14 @@ pregcomp(char *exp, char *xend, PMOP *pm) || (data.flags & SF_FIX_BEFORE_EOL /* Cannot have SEOL and MULTI */ && (!(data.flags & SF_FIX_BEFORE_MEOL) || (PL_regflags & PMf_MULTILINE)))) { + int t; + r->anchored_substr = data.longest_fixed; r->anchored_offset = data.offset_fixed; - fbm_compile(r->anchored_substr, 0); - BmUSEFUL(r->anchored_substr) = 100; - if (data.flags & SF_FIX_BEFORE_EOL /* Cannot have SEOL and MULTI */ - && (!(data.flags & SF_FIX_BEFORE_MEOL) - || (PL_regflags & PMf_MULTILINE))) - SvTAIL_on(r->anchored_substr); + t = (data.flags & SF_FIX_BEFORE_EOL /* Can't have SEOL and MULTI */ + && (!(data.flags & SF_FIX_BEFORE_MEOL) + || (PL_regflags & PMf_MULTILINE))); + fbm_compile(r->anchored_substr, t ? FBMcf_TAIL : 0); } else { r->anchored_substr = Nullsv; @@ -1070,8 +1071,8 @@ pregcomp(char *exp, char *xend, PMOP *pm) r->reganch |= ROPT_LOOKBEHIND_SEEN; if (PL_regseen & REG_SEEN_EVAL) r->reganch |= ROPT_EVAL_SEEN; - Newz(1002, r->startp, PL_regnpar, char*); - Newz(1002, r->endp, PL_regnpar, char*); + Newz(1002, r->startp, PL_regnpar, I32); + Newz(1002, r->endp, PL_regnpar, I32); DEBUG_r(regdump(r)); return(r); } @@ -2946,8 +2947,8 @@ pregfree(struct regexp *r) return; if (r->precomp) Safefree(r->precomp); - if (r->subbase) - Safefree(r->subbase); + if (RX_MATCH_COPIED(r)) + Safefree(r->subbeg); if (r->substrs) { if (r->anchored_substr) SvREFCNT_dec(r->anchored_substr); diff --git a/regexec.c b/regexec.c index 8631712..5806767 100644 --- a/regexec.c +++ b/regexec.c @@ -139,8 +139,8 @@ regcppush(I32 parenfloor) SSCHECK(i + 5); for (p = PL_regsize; p > parenfloor; p--) { - SSPUSHPTR(PL_regendp[p]); - SSPUSHPTR(PL_regstartp[p]); + SSPUSHINT(PL_regendp[p]); + SSPUSHINT(PL_regstartp[p]); SSPUSHPTR(PL_reg_start_tmp[p]); SSPUSHINT(p); } @@ -169,7 +169,7 @@ regcppop(void) I32 i = SSPOPINT; U32 paren = 0; char *input; - char *tmps; + I32 tmps; assert(i == SAVEt_REGCONTEXT); i = SSPOPINT; input = 
(char *) SSPOPPTR; @@ -178,16 +178,16 @@ regcppop(void) for (i -= 3; i > 0; i -= 4) { paren = (U32)SSPOPINT; PL_reg_start_tmp[paren] = (char *) SSPOPPTR; - PL_regstartp[paren] = (char *) SSPOPPTR; - tmps = (char*)SSPOPPTR; + PL_regstartp[paren] = SSPOPINT; + tmps = SSPOPINT; if (paren <= *PL_reglastparen) PL_regendp[paren] = tmps; DEBUG_r( PerlIO_printf(Perl_debug_log, " restoring \\%d to %d(%d)..%d%s\n", - paren, PL_regstartp[paren] - PL_regbol, - PL_reg_start_tmp[paren] - PL_regbol, - PL_regendp[paren] - PL_regbol, + paren, PL_regstartp[paren], + PL_reg_start_tmp[paren] - PL_bostr, + PL_regendp[paren], (paren > *PL_reglastparen ? "(no)" : "")); ); } @@ -200,8 +200,8 @@ regcppop(void) ); for (paren = *PL_reglastparen + 1; paren <= PL_regnpar; paren++) { if (paren > PL_regsize) - PL_regstartp[paren] = Nullch; - PL_regendp[paren] = Nullch; + PL_regstartp[paren] = -1; + PL_regendp[paren] = -1; } return input; } @@ -266,7 +266,12 @@ STATIC void restore_pos(void *arg) { dTHR; - if (PL_reg_eval_set) { + if (PL_reg_eval_set) { + if (PL_reg_oldsaved) { + PL_reg_re->subbeg = PL_reg_oldsaved; + PL_reg_re->sublen = PL_reg_oldsavedlen; + RX_MATCH_COPIED_on(PL_reg_re); + } PL_reg_magic->mg_len = PL_reg_oldpos; PL_reg_eval_set = 0; PL_curpm = PL_reg_oldcurpm; @@ -363,9 +368,15 @@ regexec_flags(register regexp *prog, char *stringarg, register char *strend, char *t; start_shift = prog->check_offset_min; /* okay to underestimate on CC */ /* Should be nonnegative! 
*/ - end_shift = minlen - start_shift - CHR_SVLEN(prog->check_substr); + end_shift = minlen - start_shift - + CHR_SVLEN(prog->check_substr) + (SvTAIL(prog->check_substr) != 0); if (flags & REXEC_SCREAM) { - if (PL_screamfirst[BmRARE(prog->check_substr)] >= 0) + SV *c = prog->check_substr; + + if (PL_screamfirst[BmRARE(c)] >= 0 + || ( BmRARE(c) == '\n' + && (BmPREVIOUS(c) == SvCUR(c) - 1) + && SvTAIL(c) )) s = screaminstr(sv, prog->check_substr, start_shift + (stringarg - strbeg), end_shift, &scream_pos, 0); @@ -376,7 +387,7 @@ regexec_flags(register regexp *prog, char *stringarg, register char *strend, else s = fbm_instr((unsigned char*)s + start_shift, (unsigned char*)strend - end_shift, - prog->check_substr, 0); + prog->check_substr, PL_multiline ? FBMrf_MULTILINE : 0); if (!s) { ++BmUSEFUL(prog->check_substr); /* hooray */ goto phooey; /* not present */ @@ -493,7 +504,8 @@ regexec_flags(register regexp *prog, char *stringarg, register char *strend, I32 back_min = prog->anchored_substr ? prog->anchored_offset : prog->float_min_offset; I32 delta = back_max - back_min; - char *last = HOPc(strend, 0-(CHR_SVLEN(must) + back_min)); /* Cannot start after this */ + char *last = HOPc(strend, /* Cannot start after this */ + -(CHR_SVLEN(must) - (SvTAIL(must) != 0) + back_min)); char *last1; /* Last position checked before */ if (s > PL_bostr) @@ -511,7 +523,8 @@ regexec_flags(register regexp *prog, char *stringarg, register char *strend, ? (s = screaminstr(sv, must, HOPc(s, back_min) - strbeg, end_shift, &scream_pos, 0)) : (s = fbm_instr((unsigned char*)HOP(s, back_min), - (unsigned char*)strend, must, 0))) ) { + (unsigned char*)strend, must, + PL_multiline ? 
FBMrf_MULTILINE : 0))) ) { if (HOPc(s, -back_max) > last1) { last1 = HOPc(s, -back_min); s = HOPc(s, -back_max); @@ -943,17 +956,28 @@ regexec_flags(register regexp *prog, char *stringarg, register char *strend, if (flags & REXEC_SCREAM) { last = screaminstr(sv, prog->float_substr, s - strbeg, end_shift, &scream_pos, 1); /* last one */ - if (!last) { + if (!last) last = scream_olds; /* Only one occurence. */ - } } else { STRLEN len; char *little = SvPV(prog->float_substr, len); - if (len) - last = rninstr(s, strend, little, little + len); - else - last = strend; /* matching `$' */ + + if (SvTAIL(prog->float_substr)) { + if (memEQ(strend - len + 1, little, len - 1)) + last = strend - len + 1; + else if (!PL_multiline) + last = memEQ(strend - len, little, len) + ? strend - len : Nullch; + else + goto find_last; + } else { + find_last: + if (len) + last = rninstr(s, strend, little, little + len); + else + last = strend; /* matching `$' */ + } } if (last == NULL) goto phooey; /* Should not happen! */ dontbother = strend - last + prog->float_min_offset; @@ -983,34 +1007,8 @@ regexec_flags(register regexp *prog, char *stringarg, register char *strend, goto phooey; got_it: - prog->subbeg = strbeg; - prog->subend = PL_regeol; /* strend may have been modified */ RX_MATCH_TAINTED_set(prog, PL_reg_flags & RF_tainted); - /* make sure $`, $&, $', and $digit will work later */ - if (strbeg != prog->subbase) { /* second+ //g match. 
*/ - if (!(flags & REXEC_COPY_STR)) { - if (prog->subbase) { - Safefree(prog->subbase); - prog->subbase = Nullch; - } - } - else { - I32 i = PL_regeol - startpos + (stringarg - strbeg); - s = savepvn(strbeg, i); - Safefree(prog->subbase); - prog->subbase = s; - prog->subbeg = prog->subbase; - prog->subend = prog->subbase + i; - s = prog->subbase + (stringarg - strbeg); - for (i = 0; i <= prog->nparens; i++) { - if (prog->endp[i]) { - prog->startp[i] = s + (prog->startp[i] - startpos); - prog->endp[i] = s + (prog->endp[i] - startpos); - } - } - } - } if (PL_reg_eval_set) { /* Preserve the current value of $^R */ if (oreplsv != GvSV(PL_replgv)) @@ -1019,6 +1017,26 @@ got_it: the same. */ restore_pos(0); } + + /* make sure $`, $&, $', and $digit will work later */ + if ( !(flags & REXEC_NOT_FIRST) ) { + if (RX_MATCH_COPIED(prog)) { + Safefree(prog->subbeg); + RX_MATCH_COPIED_off(prog); + } + if (flags & REXEC_COPY_STR) { + I32 i = PL_regeol - startpos + (stringarg - strbeg); + + s = savepvn(strbeg, i); + prog->subbeg = s; + prog->sublen = i; + RX_MATCH_COPIED_on(prog); + } + else { + prog->subbeg = strbeg; + prog->sublen = PL_regeol - strbeg; /* strend may have been modified */ + } + } return 1; @@ -1036,8 +1054,8 @@ regtry(regexp *prog, char *startpos) { dTHR; register I32 i; - register char **sp; - register char **ep; + register I32 *sp; + register I32 *ep; CHECKPOINT lastcp; if ((prog->reganch & ROPT_EVAL_SEEN) && !PL_reg_eval_set) { @@ -1080,10 +1098,20 @@ regtry(regexp *prog, char *startpos) PL_reg_curpm->op_pmregexp = prog; PL_reg_oldcurpm = PL_curpm; PL_curpm = PL_reg_curpm; + if (RX_MATCH_COPIED(prog)) { + /* Here is a serious problem: we cannot rewrite subbeg, + since it may be needed if this match fails. Thus + $` inside (?{}) could fail... 
*/ + PL_reg_oldsaved = prog->subbeg; + PL_reg_oldsavedlen = prog->sublen; + RX_MATCH_COPIED_off(prog); + } + else + PL_reg_oldsaved = Nullch; prog->subbeg = PL_bostr; - prog->subend = PL_regeol; /* strend may have been modified */ + prog->sublen = PL_regeol - PL_bostr; /* strend may have been modified */ } - prog->startp[0] = startpos; + prog->startp[0] = startpos - PL_bostr; PL_reginput = startpos; PL_regstartp = prog->startp; PL_regendp = prog->endp; @@ -1106,13 +1134,13 @@ regtry(regexp *prog, char *startpos) ep = prog->endp; if (prog->nparens) { for (i = prog->nparens; i >= 1; i--) { - *++sp = NULL; - *++ep = NULL; + *++sp = -1; + *++ep = -1; } } REGCP_SET; if (regmatch(prog->program + 1)) { - prog->endp[0] = PL_reginput; + prog->endp[0] = PL_reginput - PL_bostr; return 1; } REGCP_UNWIND; @@ -1590,15 +1618,16 @@ regmatch(regnode *prog) case REF: case REFF: n = ARG(scan); /* which paren pair */ - s = PL_regstartp[n]; - if (*PL_reglastparen < n || !s) + ln = PL_regstartp[n]; + if (*PL_reglastparen < n || ln == -1) sayNO; /* Do not match unless seen CLOSEn. */ - if (s == PL_regendp[n]) + if (ln == PL_regendp[n]) break; + s = PL_bostr + ln; if (UTF && OP(scan) != REF) { /* REF can do byte comparison */ char *l = locinput; - char *e = PL_regendp[n]; + char *e = PL_bostr + PL_regendp[n]; /* * Note that we can't do the "other character" lookup trick as * in the 8-bit case (no pun intended) because in Unicode we @@ -1635,7 +1664,7 @@ regmatch(regnode *prog) (UCHARAT(s) != ((OP(scan) == REFF ? 
PL_fold : PL_fold_locale)[nextchr])))) sayNO; - ln = PL_regendp[n] - s; + ln = PL_regendp[n] - ln; if (locinput + ln > PL_regeol) sayNO; if (ln > 1 && (OP(scan) == REF @@ -1665,8 +1694,7 @@ regmatch(regnode *prog) PL_op = (OP_4tree*)PL_regdata->data[n]; DEBUG_r( PerlIO_printf(Perl_debug_log, " re_eval 0x%x\n", PL_op) ); PL_curpad = AvARRAY((AV*)PL_regdata->data[n + 2]); - PL_reg_magic->mg_len = locinput - PL_bostr; - PL_regendp[0] = locinput; + PL_regendp[0] = PL_reg_magic->mg_len = locinput - PL_bostr; CALLRUNOPS(); /* Scalar context. */ SPAGAIN; @@ -1769,14 +1797,14 @@ regmatch(regnode *prog) break; case CLOSE: n = ARG(scan); /* which paren pair */ - PL_regstartp[n] = PL_reg_start_tmp[n]; - PL_regendp[n] = locinput; + PL_regstartp[n] = PL_reg_start_tmp[n] - PL_bostr; + PL_regendp[n] = locinput - PL_bostr; if (n > *PL_reglastparen) *PL_reglastparen = n; break; case GROUPP: n = ARG(scan); /* which paren pair */ - sw = (*PL_reglastparen >= n && PL_regendp[n] != NULL); + sw = (*PL_reglastparen >= n && PL_regendp[n] != -1); break; case IFTHEN: if (sw) @@ -1999,7 +2027,7 @@ regmatch(regnode *prog) sayYES; REGCP_UNWIND; for (n = *PL_reglastparen; n > lastparen; n--) - PL_regendp[n] = 0; + PL_regendp[n] = -1; *PL_reglastparen = n; scan = next; /*SUPPRESS 560*/ @@ -2073,11 +2101,12 @@ regmatch(regnode *prog) { if (paren) { if (n) { - PL_regstartp[paren] = HOPc(PL_reginput, -l); - PL_regendp[paren] = PL_reginput; + PL_regstartp[paren] = + HOPc(PL_reginput, -l) - PL_bostr; + PL_regendp[paren] = PL_reginput - PL_bostr; } else - PL_regendp[paren] = NULL; + PL_regendp[paren] = -1; } if (regmatch(next)) sayYES; @@ -2134,11 +2163,11 @@ regmatch(regnode *prog) ); if (paren) { if (n) { - PL_regstartp[paren] = HOPc(PL_reginput, -l); - PL_regendp[paren] = PL_reginput; + PL_regstartp[paren] = HOPc(PL_reginput, -l) - PL_bostr; + PL_regendp[paren] = PL_reginput - PL_bostr; } else - PL_regendp[paren] = NULL; + PL_regendp[paren] = -1; } if (regmatch(next)) sayYES; @@ -2233,11 +2262,11 @@ 
regmatch(regnode *prog) /* PL_reginput == locinput now */ if (paren) { if (ln) { - PL_regstartp[paren] = HOPc(locinput, -1); - PL_regendp[paren] = locinput; + PL_regstartp[paren] = HOPc(locinput, -1) - PL_bostr; + PL_regendp[paren] = locinput - PL_bostr; } else - PL_regendp[paren] = NULL; + PL_regendp[paren] = -1; } if (regmatch(next)) sayYES; @@ -2256,11 +2285,11 @@ regmatch(regnode *prog) { if (paren) { if (n) { - PL_regstartp[paren] = HOPc(PL_reginput, -1); - PL_regendp[paren] = PL_reginput; + PL_regstartp[paren] = HOPc(PL_reginput, -1) - PL_bostr; + PL_regendp[paren] = PL_reginput - PL_bostr; } else - PL_regendp[paren] = NULL; + PL_regendp[paren] = -1; } if (regmatch(next)) sayYES; @@ -2293,11 +2322,11 @@ regmatch(regnode *prog) { if (paren && n) { if (n) { - PL_regstartp[paren] = HOPc(PL_reginput, -1); - PL_regendp[paren] = PL_reginput; + PL_regstartp[paren] = HOPc(PL_reginput, -1) - PL_bostr; + PL_regendp[paren] = PL_reginput - PL_bostr; } else - PL_regendp[paren] = NULL; + PL_regendp[paren] = -1; } if (regmatch(next)) sayYES; diff --git a/regexp.h b/regexp.h index b1170f1..9da5bd4 100644 --- a/regexp.h +++ b/regexp.h @@ -34,20 +34,9 @@ struct reg_substr_data { }; typedef struct regexp { - I32 refcnt; - char **startp; - char **endp; + I32 *startp; + I32 *endp; regnode *regstclass; - I32 minlen; /* mininum possible length of $& */ - I32 prelen; /* length of precomp */ - U32 nparens; /* number of parentheses */ - U32 lastparen; /* last paren matched */ - char *precomp; /* pre-compilation regular expression */ - char *subbase; /* saved string so \digit works forever */ - char *subbeg; /* same, but not responsible for allocation */ - char *subend; /* end of subbase */ - U32 reganch; /* Internal use only + - Tainted information used by regexec? */ #if 0 SV *anchored_substr; /* Substring at fixed position wrt start. */ I32 anchored_offset; /* Position of it. 
*/ @@ -60,7 +49,18 @@ typedef struct regexp { #else struct reg_substr_data *substrs; #endif + char *precomp; /* pre-compilation regular expression */ struct reg_data *data; /* Additional data. */ + char *subbeg; /* saved or original string + so \digit works forever. */ + I32 sublen; /* Length of string pointed by subbeg */ + I32 refcnt; + I32 minlen; /* mininum possible length of $& */ + I32 prelen; /* length of precomp */ + U32 nparens; /* number of parentheses */ + U32 lastparen; /* last paren matched */ + U32 reganch; /* Internal use only + + Tainted information used by regexec? */ regnode program[1]; /* Unwarranted chumminess with compiler. */ } regexp; @@ -92,6 +92,7 @@ typedef struct regexp { #define ROPT_UTF8 0x10000 #define ROPT_NAUGHTY 0x20000 /* how exponential is this pattern? */ +#define ROPT_COPY_DONE 0x40000 /* subbeg is a copy of the string */ #define RX_MATCH_TAINTED(prog) ((prog)->reganch & ROPT_TAINTED_SEEN) #define RX_MATCH_TAINTED_on(prog) ((prog)->reganch |= ROPT_TAINTED_SEEN) @@ -100,10 +101,25 @@ typedef struct regexp { ? RX_MATCH_TAINTED_on(prog) \ : RX_MATCH_TAINTED_off(prog)) +#define RX_MATCH_COPIED(prog) ((prog)->reganch & ROPT_COPY_DONE) +#define RX_MATCH_COPIED_on(prog) ((prog)->reganch |= ROPT_COPY_DONE) +#define RX_MATCH_COPIED_off(prog) ((prog)->reganch &= ~ROPT_COPY_DONE) +#define RX_MATCH_COPIED_set(prog,t) ((t) \ + ? RX_MATCH_COPIED_on(prog) \ + : RX_MATCH_COPIED_off(prog)) + #define REXEC_COPY_STR 1 /* Need to copy the string. */ #define REXEC_CHECKED 2 /* check_substr already checked. */ #define REXEC_SCREAM 4 /* use scream table. */ #define REXEC_IGNOREPOS 8 /* \G matches at start. */ +#define REXEC_NOT_FIRST 0x10 /* This is another iteration of //g. 
*/ #define ReREFCNT_inc(re) ((re && re->refcnt++), re) #define ReREFCNT_dec(re) pregfree(re) + +#define FBMcf_TAIL_DOLLAR 1 +#define FBMcf_TAIL_Z 2 +#define FBMcf_TAIL_z 4 +#define FBMcf_TAIL (FBMcf_TAIL_DOLLAR|FBMcf_TAIL_Z|FBMcf_TAIL_z) + +#define FBMrf_MULTILINE 1 diff --git a/t/op/pat.t b/t/op/pat.t index b6a3a3a..a086c12 100755 --- a/t/op/pat.t +++ b/t/op/pat.t @@ -4,7 +4,7 @@ # the format supported by op/regexp.t. If you want to add a test # that does fit that format, add it to op/re_tests, not here. -print "1..186\n"; +print "1..188\n"; BEGIN { chdir 't' if -d 't'; @@ -858,3 +858,17 @@ $test++; print "$1\n"; $test++; +# See if $i work inside (?{}) in the presense of saved substrings and +# changing $_ +@a = qw(foo bar); +@b = (); +s/(\w)(?{push @b, $1})/,$1,/g for @a; + +print "# \@b='@b', expect 'f o o b a r'\nnot " unless("@b" eq "f o o b a r"); +print "ok $test\n"; +$test++; + +print "not " unless("@a" eq ",f,,o,,o, ,b,,a,,r,"); +print "ok $test\n"; +$test++; + diff --git a/t/op/re_tests b/t/op/re_tests index ba824ae..466fc85 100644 --- a/t/op/re_tests +++ b/t/op/re_tests @@ -482,11 +482,204 @@ $(?<=^(a)) a y $1 a ((?>[^()]+)|\([^()]*\))+ ((abc(ade)ufh()()x y $& abc(ade)ufh()()x (?<=x+)y - c - /(?<=x+)y/: variable length lookbehind not implemented a{37,17} - c - /a{37,17}/: Can't do {n,m} with n > m +\Z a\nb\n y $-[0] 3 +\z a\nb\n y $-[0] 4 +$ a\nb\n y $-[0] 3 +\Z b\na\n y $-[0] 3 +\z b\na\n y $-[0] 4 +$ b\na\n y $-[0] 3 +\Z b\na y $-[0] 3 +\z b\na y $-[0] 3 +$ b\na y $-[0] 3 +'\Z'm a\nb\n y $-[0] 3 +'\z'm a\nb\n y $-[0] 4 +'$'m a\nb\n y $-[0] 1 +'\Z'm b\na\n y $-[0] 3 +'\z'm b\na\n y $-[0] 4 +'$'m b\na\n y $-[0] 1 +'\Z'm b\na y $-[0] 3 +'\z'm b\na y $-[0] 3 +'$'m b\na y $-[0] 1 a\Z a\nb\n n - - -b\Z a\nb\n y - - -b\z a\nb\n n - - -b\Z a\nb y - - -b\z a\nb y - - +a\z a\nb\n n - - +a$ a\nb\n n - - +a\Z b\na\n y $-[0] 2 +a\z b\na\n n - - +a$ b\na\n y $-[0] 2 +a\Z b\na y $-[0] 2 +a\z b\na y $-[0] 2 +a$ b\na y $-[0] 2 +'a\Z'm a\nb\n bn - - +'a\z'm a\nb\n n - - 
+'a$'m a\nb\n y $-[0] 0 +'a\Z'm b\na\n y $-[0] 2 +'a\z'm b\na\n n - - +'a$'m b\na\n y $-[0] 2 +'a\Z'm b\na y $-[0] 2 +'a\z'm b\na y $-[0] 2 +'a$'m b\na y $-[0] 2 +aa\Z aa\nb\n n - - +aa\z aa\nb\n n - - +aa$ aa\nb\n n - - +aa\Z b\naa\n y $-[0] 2 +aa\z b\naa\n n - - +aa$ b\naa\n y $-[0] 2 +aa\Z b\naa y $-[0] 2 +aa\z b\naa y $-[0] 2 +aa$ b\naa y $-[0] 2 +'aa\Z'm aa\nb\n bn - - +'aa\z'm aa\nb\n n - - +'aa$'m aa\nb\n y $-[0] 0 +'aa\Z'm b\naa\n y $-[0] 2 +'aa\z'm b\naa\n n - - +'aa$'m b\naa\n y $-[0] 2 +'aa\Z'm b\naa y $-[0] 2 +'aa\z'm b\naa y $-[0] 2 +'aa$'m b\naa y $-[0] 2 +aa\Z ac\nb\n n - - +aa\z ac\nb\n n - - +aa$ ac\nb\n n - - +aa\Z b\nac\n n - - +aa\z b\nac\n n - - +aa$ b\nac\n n - - +aa\Z b\nac n - - +aa\z b\nac n - - +aa$ b\nac n - - +'aa\Z'm ac\nb\n n - - +'aa\z'm ac\nb\n n - - +'aa$'m ac\nb\n n - - +'aa\Z'm b\nac\n n - - +'aa\z'm b\nac\n n - - +'aa$'m b\nac\n n - - +'aa\Z'm b\nac n - - +'aa\z'm b\nac n - - +'aa$'m b\nac n - - +aa\Z ca\nb\n n - - +aa\z ca\nb\n n - - +aa$ ca\nb\n n - - +aa\Z b\nca\n n - - +aa\z b\nca\n n - - +aa$ b\nca\n n - - +aa\Z b\nca n - - +aa\z b\nca n - - +aa$ b\nca n - - +'aa\Z'm ca\nb\n n - - +'aa\z'm ca\nb\n n - - +'aa$'m ca\nb\n n - - +'aa\Z'm b\nca\n n - - +'aa\z'm b\nca\n n - - +'aa$'m b\nca\n n - - +'aa\Z'm b\nca n - - +'aa\z'm b\nca n - - +'aa$'m b\nca n - - +ab\Z ab\nb\n n - - +ab\z ab\nb\n n - - +ab$ ab\nb\n n - - +ab\Z b\nab\n y $-[0] 2 +ab\z b\nab\n n - - +ab$ b\nab\n y $-[0] 2 +ab\Z b\nab y $-[0] 2 +ab\z b\nab y $-[0] 2 +ab$ b\nab y $-[0] 2 +'ab\Z'm ab\nb\n bn - - +'ab\z'm ab\nb\n n - - +'ab$'m ab\nb\n y $-[0] 0 +'ab\Z'm b\nab\n y $-[0] 2 +'ab\z'm b\nab\n n - - +'ab$'m b\nab\n y $-[0] 2 +'ab\Z'm b\nab y $-[0] 2 +'ab\z'm b\nab y $-[0] 2 +'ab$'m b\nab y $-[0] 2 +ab\Z ac\nb\n n - - +ab\z ac\nb\n n - - +ab$ ac\nb\n n - - +ab\Z b\nac\n n - - +ab\z b\nac\n n - - +ab$ b\nac\n n - - +ab\Z b\nac n - - +ab\z b\nac n - - +ab$ b\nac n - - +'ab\Z'm ac\nb\n n - - +'ab\z'm ac\nb\n n - - +'ab$'m ac\nb\n n - - +'ab\Z'm b\nac\n n - - +'ab\z'm 
b\nac\n n - - +'ab$'m b\nac\n n - - +'ab\Z'm b\nac n - - +'ab\z'm b\nac n - - +'ab$'m b\nac n - - +ab\Z ca\nb\n n - - +ab\z ca\nb\n n - - +ab$ ca\nb\n n - - +ab\Z b\nca\n n - - +ab\z b\nca\n n - - +ab$ b\nca\n n - - +ab\Z b\nca n - - +ab\z b\nca n - - +ab$ b\nca n - - +'ab\Z'm ca\nb\n n - - +'ab\z'm ca\nb\n n - - +'ab$'m ca\nb\n n - - +'ab\Z'm b\nca\n n - - +'ab\z'm b\nca\n n - - +'ab$'m b\nca\n n - - +'ab\Z'm b\nca n - - +'ab\z'm b\nca n - - +'ab$'m b\nca n - - +abb\Z abb\nb\n n - - +abb\z abb\nb\n n - - +abb$ abb\nb\n n - - +abb\Z b\nabb\n y $-[0] 2 +abb\z b\nabb\n n - - +abb$ b\nabb\n y $-[0] 2 +abb\Z b\nabb y $-[0] 2 +abb\z b\nabb y $-[0] 2 +abb$ b\nabb y $-[0] 2 +'abb\Z'm abb\nb\n bn - - +'abb\z'm abb\nb\n n - - +'abb$'m abb\nb\n y $-[0] 0 +'abb\Z'm b\nabb\n y $-[0] 2 +'abb\z'm b\nabb\n n - - +'abb$'m b\nabb\n y $-[0] 2 +'abb\Z'm b\nabb y $-[0] 2 +'abb\z'm b\nabb y $-[0] 2 +'abb$'m b\nabb y $-[0] 2 +abb\Z ac\nb\n n - - +abb\z ac\nb\n n - - +abb$ ac\nb\n n - - +abb\Z b\nac\n n - - +abb\z b\nac\n n - - +abb$ b\nac\n n - - +abb\Z b\nac n - - +abb\z b\nac n - - +abb$ b\nac n - - +'abb\Z'm ac\nb\n n - - +'abb\z'm ac\nb\n n - - +'abb$'m ac\nb\n n - - +'abb\Z'm b\nac\n n - - +'abb\z'm b\nac\n n - - +'abb$'m b\nac\n n - - +'abb\Z'm b\nac n - - +'abb\z'm b\nac n - - +'abb$'m b\nac n - - +abb\Z ca\nb\n n - - +abb\z ca\nb\n n - - +abb$ ca\nb\n n - - +abb\Z b\nca\n n - - +abb\z b\nca\n n - - +abb$ b\nca\n n - - +abb\Z b\nca n - - +abb\z b\nca n - - +abb$ b\nca n - - +'abb\Z'm ca\nb\n n - - +'abb\z'm ca\nb\n n - - +'abb$'m ca\nb\n n - - +'abb\Z'm b\nca\n n - - +'abb\z'm b\nca\n n - - +'abb$'m b\nca\n n - - +'abb\Z'm b\nca n - - +'abb\z'm b\nca n - - +'abb$'m b\nca n - - (^|x)(c) ca y $2 c a*abc?xyz+pqr{3}ab{2,}xy{4,5}pq{0,6}AB{0,}zz x n - - a(?{$a=2;$b=3;($b)=$a})b yabz y $b 2 diff --git a/t/op/regexp.t b/t/op/regexp.t index 98d998d..66b2d1c 100755 --- a/t/op/regexp.t +++ b/t/op/regexp.t @@ -16,6 +16,8 @@ $ENV{PERL_DESTRUCT_LEVEL} = 0 unless $ENV{PERL_DESTRUCT_LEVEL} > 3; 
# y expect a match # n expect no match # c expect an error +# B test exposes a known bug in Perl, should be skipped +# b test exposes a known bug in Perl, should be skipped if noamp # # Columns 4 and 5 are used only if column 3 contains C<y> or C<c>. # @@ -62,7 +64,9 @@ while (<TESTS>) { $subject =~ s/\\n/\n/g; $expect =~ s/\\n/\n/g; $expect = $repl = '-' if $skip_amp and $input =~ /\$[&\`\']/; - for $study ("", "study \$subject") { + $skip = ($skip_amp ? ($result =~ s/B//i) : ($result =~ s/B//)); + $result =~ s/B//i unless $skip; + for $study ('', 'study \$subject') { $c = $iters; eval "$study; \$match = (\$subject =~ m$pat) while \$c--; \$got = \"$repl\";"; chomp( $err = $@ ); @@ -70,6 +74,9 @@ while (<TESTS>) { if ($err !~ m!^\Q$expect!) { print "not ok $. (compile) $input => `$err'\n"; next TEST } last; # no need to study a syntax error } + elsif ( $skip ) { + print "ok $. # Skipped: not fixed yet\n"; next TEST; + } elsif ($@) { print "not ok $. $input => error `$err'\n"; next TEST; } diff --git a/thrdvar.h b/thrdvar.h index 7fae131..dcaaccb 100644 --- a/thrdvar.h +++ b/thrdvar.h @@ -142,8 +142,8 @@ PERLVAR(Tcolors[6], char *) /* from regcomp.c */ PERLVAR(Treginput, char *) /* String-input pointer. */ PERLVAR(Tregbol, char *) /* Beginning of input, for ^ check. */ PERLVAR(Tregeol, char *) /* End of input, for $ check. */ -PERLVAR(Tregstartp, char **) /* Pointer to startp array. */ -PERLVAR(Tregendp, char **) /* Ditto for endp. */ +PERLVAR(Tregstartp, I32 *) /* Pointer to startp array. */ +PERLVAR(Tregendp, I32 *) /* Ditto for endp. */ PERLVAR(Treglastparen, U32 *) /* Similarly for lastparen. */ PERLVAR(Tregtill, char *) /* How far we are required to go.
*/ PERLVAR(Tregprev, char) /* char before regbol, \n if none */ @@ -166,6 +166,8 @@ PERLVAR(Treg_magic, MAGIC *) /* pos-magic of what we match */ PERLVAR(Treg_oldpos, I32) /* old pos of what we match */ PERLVARI(Treg_oldcurpm, PMOP*, NULL) /* curpm before match */ PERLVARI(Treg_curpm, PMOP*, NULL) /* curpm during match */ +PERLVAR(Treg_oldsaved, char*) /* old saved substr during match */ +PERLVAR(Treg_oldsavedlen, STRLEN) /* old length of saved substr during match */ PERLVARI(Tregcompp, regcomp_t, FUNC_NAME_TO_PTR(pregcomp)) /* Pointer to RE compiler */ diff --git a/util.c b/util.c index 67c030b..0c2b052 100644 --- a/util.c +++ b/util.c @@ -889,6 +889,14 @@ mem_collxfrm(const char *s, STRLEN len, STRLEN *xlen) #endif /* USE_LOCALE_COLLATE */ +#define FBM_TABLE_OFFSET 2 /* Number of bytes between EOS and table*/ + +/* As a space optimization, we do not compile tables for strings of length + 0 and 1, and for strings of length 2 unless FBMcf_TAIL. These are + special-cased in fbm_instr(). + + If FBMcf_TAIL, the table is created as if the string has a trailing \n. */ + void fbm_compile(SV *sv, U32 flags /* not used yet */) { @@ -899,24 +907,32 @@ fbm_compile(SV *sv, U32 flags /* not used yet */) I32 rarest = 0; U32 frequency = 256; + if (flags & FBMcf_TAIL) + sv_catpvn(sv, "\n", 1); /* Taken into account in fbm_instr() */ s = (U8*)SvPV_force(sv, len); (void)SvUPGRADE(sv, SVt_PVBM); - if (len > 255 || len == 0) /* TAIL might be on on a zero-length string. */ - return; /* can't have offsets that big */ + if (len == 0) /* TAIL might be on on a zero-length string. 
*/ + return; if (len > 2) { - Sv_Grow(sv,len + 258); - table = (unsigned char*)(SvPVX(sv) + len + 1); - s = table - 2; + I32 mlen = len; + unsigned char *sb; + + if (mlen > 255) + mlen = 255; + Sv_Grow(sv,len + 256 + FBM_TABLE_OFFSET); + table = (unsigned char*)(SvPVX(sv) + len + FBM_TABLE_OFFSET); + s = table - 1 - FBM_TABLE_OFFSET; /* Last char */ for (i = 0; i < 256; i++) { - table[i] = len; + table[i] = mlen; } + table[-1] = flags; /* Not used yet */ i = 0; - while (s >= (unsigned char*)(SvPVX(sv))) - { - if (table[*s] == len) - table[*s] = i; - s--,i++; - } + sb = s - mlen; + while (s >= sb) { + if (table[*s] == mlen) + table[*s] = i; + s--, i++; + } } sv_magic(sv, Nullsv, 'B', Nullch, 0); /* deep magic */ SvVALID_on(sv); @@ -930,119 +946,200 @@ fbm_compile(SV *sv, U32 flags /* not used yet */) } BmRARE(sv) = s[rarest]; BmPREVIOUS(sv) = rarest; + BmUSEFUL(sv) = 100; /* Initial value */ + if (flags & FBMcf_TAIL) + SvTAIL_on(sv); DEBUG_r(PerlIO_printf(Perl_debug_log, "rarest char %c at %d\n",BmRARE(sv),BmPREVIOUS(sv))); } +/* If SvTAIL(littlestr), it has a fake '\n' at end. 
*/ +/* If SvTAIL is actually due to \Z or \z, this gives false positives + if multiline */ + char * fbm_instr(unsigned char *big, register unsigned char *bigend, SV *littlestr, U32 flags) { register unsigned char *s; - register I32 tmp; - register I32 littlelen; - register unsigned char *little; - register unsigned char *table; - register unsigned char *olds; - register unsigned char *oldlittle; + STRLEN l; + register unsigned char *little = (unsigned char *)SvPV(littlestr,l); + register STRLEN littlelen = l; + register I32 multiline = flags & FBMrf_MULTILINE; + + if (bigend - big < littlelen) { + check_tail: + if ( SvTAIL(littlestr) + && (bigend - big == littlelen - 1) + && (littlelen == 1 + || *big == *little && memEQ(big, little, littlelen - 1))) + return (char*)big; + return Nullch; + } - if (SvTYPE(littlestr) != SVt_PVBM || !SvVALID(littlestr)) { - STRLEN len; - char *l = SvPV(littlestr,len); - if (!len) { - if (SvTAIL(littlestr)) { /* Can be only 0-len constant - substr => we can ignore SvVALID */ - if (PL_multiline) { - char *t = "\n"; - if ((s = (unsigned char*)ninstr((char*)big, (char*)bigend, - t, t + len))) { - return (char*)s; + if (littlelen <= 2) { /* Special-cased */ + register char c; + + if (littlelen == 1) { + if (SvTAIL(littlestr) && !multiline) { /* Anchor only! */ + /* Know that bigend != big. */ + if (bigend[-1] == '\n') + return (char *)(bigend - 1); + return (char *) bigend; + } + s = big; + while (s < bigend) { + if (*s == *little) + return (char *)s; + s++; + } + if (SvTAIL(littlestr)) + return (char *) bigend; + return Nullch; + } + if (!littlelen) + return (char*)big; /* Cannot be SvTAIL! */ + + /* littlelen is 2 */ + if (SvTAIL(littlestr) && !multiline) { + if (bigend[-1] == '\n' && bigend[-2] == *little) + return (char*)bigend - 2; + if (bigend[-1] == *little) + return (char*)bigend - 1; + return Nullch; + } + { + /* This should be better than FBM if c1 == c2, and almost + as good otherwise: maybe better since we do less indirection. 
+ And we save a lot of memory by caching no table. */ + register unsigned char c1 = little[0]; + register unsigned char c2 = little[1]; + + s = big + 1; + bigend--; + if (c1 != c2) { + while (s <= bigend) { + if (s[0] == c2) { + if (s[-1] == c1) + return (char*)s - 1; + s += 2; + continue; + } + next_chars: + if (s[0] == c1) { + if (s == bigend) + goto check_1char_anchor; + if (s[1] == c2) + return (char*)s; + else { + s++; + goto next_chars; + } } + else + s += 2; + } + goto check_1char_anchor; + } + /* Now c1 == c2 */ + while (s <= bigend) { + if (s[0] == c1) { + if (s[-1] == c1) + return (char*)s - 1; + if (s == bigend) + goto check_1char_anchor; + if (s[1] == c1) + return (char*)s; + s += 3; } - if (bigend > big && bigend[-1] == '\n') - return (char *)(bigend - 1); else - return (char *) bigend; + s += 2; } - return (char*)big; } - return ninstr((char*)big,(char*)bigend, l, l + len); + check_1char_anchor: /* One char and anchor! */ + if (SvTAIL(littlestr) && (*bigend == *little)) + return (char *)bigend; /* bigend is already decremented. */ + return Nullch; } - - littlelen = SvCUR(littlestr); - if (SvTAIL(littlestr) && !PL_multiline) { /* tail anchored? */ - if (littlelen > bigend - big) - return Nullch; - little = (unsigned char*)SvPVX(littlestr); + if (SvTAIL(littlestr) && !multiline) { /* tail anchored? 
*/ s = bigend - littlelen; - if (s > big + if (s >= big && bigend[-1] == '\n' - && s[-1] == *little && memEQ((char*)s - 1,(char*)little,littlelen)) - return (char*)s - 1; /* how sweet it is */ - else if (*s == *little && memEQ((char*)s,(char*)little,littlelen)) + && *s == *little + /* Automatically of length > 2 */ + && memEQ((char*)s + 1, (char*)little + 1, littlelen - 2)) return (char*)s; /* how sweet it is */ + if (s[1] == *little && memEQ((char*)s + 2,(char*)little + 1, + littlelen - 2)) + return (char*)s + 1; /* how sweet it is */ return Nullch; } - if (littlelen <= 2) { - unsigned char c1 = (unsigned char)SvPVX(littlestr)[0]; - unsigned char c2 = (unsigned char)SvPVX(littlestr)[1]; - /* This may do extra comparisons if littlelen == 2, but this - should be hidden in the noise since we do less indirection. */ - - s = big; - bigend -= littlelen; - while (s <= bigend) { - if (s[0] == c1 - && (littlelen == 1 || s[1] == c2) - && (!SvTAIL(littlestr) - || s == bigend - || s[littlelen] == '\n')) /* Automatically multiline */ - { + if (SvTYPE(littlestr) != SVt_PVBM || !SvVALID(littlestr)) { + char *b = ninstr((char*)big,(char*)bigend, + (char*)little, (char*)little + littlelen); + + if (!b && SvTAIL(littlestr)) { /* Automatically multiline! */ + /* Chop \n from littlestr: */ + s = bigend - littlelen + 1; + if (*s == *little && memEQ((char*)s + 1, (char*)little + 1, + littlelen - 2)) return (char*)s; - } - s++; + return Nullch; } - return Nullch; + return b; } - table = (unsigned char*)(SvPVX(littlestr) + littlelen + 1); - if (--littlelen >= bigend - big) - return Nullch; - s = big + littlelen; - oldlittle = little = table - 2; - if (s < bigend) { - top2: - /*SUPPRESS 560*/ - if (tmp = table[*s]) { + + { /* Do actual FBM. 
*/ + register unsigned char *table = little + littlelen + FBM_TABLE_OFFSET; + register unsigned char *oldlittle; + + if (littlelen > bigend - big) + return Nullch; + --littlelen; /* Last char found by table lookup */ + + s = big + littlelen; + little += littlelen; /* last char */ + oldlittle = little; + if (s < bigend) { + register I32 tmp; + + top2: + /*SUPPRESS 560*/ + if (tmp = table[*s]) { #ifdef POINTERRIGOR - if (bigend - s > tmp) { + if (bigend - s > tmp) { + s += tmp; + goto top2; + } s += tmp; - goto top2; - } #else - if ((s += tmp) < bigend) - goto top2; -#endif - return Nullch; - } - else { - tmp = littlelen; /* less expensive than calling strncmp() */ - olds = s; - while (tmp--) { - if (*--s == *--little) - continue; - differ: - s = olds + 1; /* here we pay the price for failure */ - little = oldlittle; - if (s < bigend) /* fake up continue to outer loop */ + if ((s += tmp) < bigend) goto top2; - return Nullch; +#endif + goto check_end; + } + else { /* less expensive than calling strncmp() */ + register unsigned char *olds = s; + + tmp = littlelen; + + while (tmp--) { + if (*--s == *--little) + continue; + differ: + s = olds + 1; /* here we pay the price for failure */ + little = oldlittle; + if (s < bigend) /* fake up continue to outer loop */ + goto top2; + goto check_end; + } + return (char *)s; } - if (SvTAIL(littlestr) /* automatically multiline */ - && olds + 1 != bigend - && olds[1] != '\n') - goto differ; - return (char *)s; } + check_end: + if ( s == bigend && (table[-1] & FBMcf_TAIL) + && memEQ(bigend - littlelen, oldlittle - littlelen, littlelen) ) + return (char*)bigend - littlelen; + return Nullch; } - return Nullch; } /* start_shift, end_shift are positive quantities which give offsets @@ -1051,10 +1148,15 @@ fbm_instr(unsigned char *big, register unsigned char *bigend, SV *littlestr, U32 old_posp is the way of communication between consequent calls if the next call needs to find the . The initial *old_posp should be -1. 
- Note that we do not take into account SvTAIL, so it may give wrong - positives if _ALL flag is set. + + Note that we take into account SvTAIL, so one can get extra + optimizations if _ALL flag is set. */ +/* If SvTAIL is actually due to \Z or \z, this gives false positives + if PL_multiline. In fact if !PL_multiline the autoritative answer + is not supported yet. */ + char * screaminstr(SV *bigstr, SV *littlestr, I32 start_shift, I32 end_shift, I32 *old_posp, I32 last) { @@ -1071,8 +1173,18 @@ screaminstr(SV *bigstr, SV *littlestr, I32 start_shift, I32 end_shift, I32 *old_ if (*old_posp == -1 ? (pos = PL_screamfirst[BmRARE(littlestr)]) < 0 - : (((pos = *old_posp), pos += PL_screamnext[pos]) == 0)) + : (((pos = *old_posp), pos += PL_screamnext[pos]) == 0)) { + cant_find: + if ( BmRARE(littlestr) == '\n' + && BmPREVIOUS(littlestr) == SvCUR(littlestr) - 1) { + little = (unsigned char *)(SvPVX(littlestr)); + littleend = little + SvCUR(littlestr); + first = *little++; + goto check_tail; + } return Nullch; + } + little = (unsigned char *)(SvPVX(littlestr)); littleend = little + SvCUR(littlestr); first = *little++; @@ -1081,10 +1193,14 @@ screaminstr(SV *bigstr, SV *littlestr, I32 start_shift, I32 end_shift, I32 *old_ big = (unsigned char *)(SvPVX(bigstr)); /* The value of pos we can stop at: */ stop_pos = SvCUR(bigstr) - end_shift - (SvCUR(littlestr) - 1 - previous); - if (previous + start_shift > stop_pos) return Nullch; + if (previous + start_shift > stop_pos) { + if (previous + start_shift == stop_pos + 1) /* A fake '\n'? */ + goto check_tail; + return Nullch; + } while (pos < previous + start_shift) { if (!(pos += PL_screamnext[pos])) - return Nullch; + goto cant_find; } #ifdef POINTERRIGOR do { @@ -1122,8 +1238,22 @@ screaminstr(SV *bigstr, SV *littlestr, I32 start_shift, I32 end_shift, I32 *old_ found = 1; } } while ( pos += PL_screamnext[pos] ); - return (last && found) ? 
(char *)(big+(*old_posp)) : Nullch; + if (last && found) + return (char *)(big+(*old_posp)); #endif /* POINTERRIGOR */ + check_tail: + if (!SvTAIL(littlestr) || (end_shift > 0)) + return Nullch; + /* Ignore the trailing "\n". This code is not microoptimized */ + big = (unsigned char *)(SvPVX(bigstr) + SvCUR(bigstr)); + stop_pos = littleend - little; /* Actual littlestr len */ + if (stop_pos == 0) + return (char*)big; + big -= stop_pos; + if (*big == first + && ((stop_pos == 1) || memEQ(big + 1, little, stop_pos - 1))) + return (char*)big; + return Nullch; } I32 -- 1.8.3.1