X-Git-Url: https://perl5.git.perl.org/perl5.git/blobdiff_plain/8fdd422b549aa89c85abf0d3cec589db64be43b6..c567402153be05de9e4e6ef0b23195e50431c6d8:/regexec.c diff --git a/regexec.c b/regexec.c index d2ebc44..cbc8c19 100644 --- a/regexec.c +++ b/regexec.c @@ -221,6 +221,22 @@ typedef struct re_cc_state #define regcpblow(cp) LEAVE_SCOPE(cp) +#define TRYPAREN(paren, n, input) { \ + if (paren) { \ + if (n) { \ + PL_regstartp[paren] = HOPc(input, -1) - PL_bostr; \ + PL_regendp[paren] = input - PL_bostr; \ + } \ + else \ + PL_regendp[paren] = -1; \ + } \ + if (regmatch(next)) \ + sayYES; \ + if (paren && n) \ + PL_regendp[paren] = -1; \ +} + + /* * pregexec and friends */ @@ -346,7 +362,9 @@ Perl_re_intuit_start(pTHX_ regexp *prog, SV *sv, char *strpos, I32 slen; if ( !(prog->reganch & ROPT_ANCH_GPOS) /* Checked by the caller */ - && (sv && (strpos + SvCUR(sv) != strend)) ) { + /* SvCUR is not set on references: SvRV and SvPVX overlap */ + && sv && !SvROK(sv) + && (strpos + SvCUR(sv) != strend)) { DEBUG_r(PerlIO_printf(Perl_debug_log, "Not at start...\n")); goto fail; } @@ -597,9 +615,10 @@ Perl_re_intuit_start(pTHX_ regexp *prog, SV *sv, char *strpos, find_anchor: while (t < strend - prog->minlen) { if (*t == '\n') { - if (t < s - prog->check_offset_min) { + if (t < check_at - prog->check_offset_min) { if (prog->anchored_substr) { - /* We definitely contradict the found anchored + /* Since we moved from the found position, + we definitely contradict the found anchored substr. Due to the above check we do not contradict "check" substr. Thus we can arrive here only if check substr @@ -610,14 +629,19 @@ Perl_re_intuit_start(pTHX_ regexp *prog, SV *sv, char *strpos, PL_colors[0],PL_colors[1], (long)(strpos - i_strpos), (long)(strpos - i_strpos + prog->anchored_offset))); goto do_other_anchored; } + /* We don't contradict the found floating substring. */ + /* XXXX Why not check for STCLASS? */ s = t + 1; DEBUG_r(PerlIO_printf(Perl_debug_log, "Found /%s^%s/m at offset %ld...\n", PL_colors[0],PL_colors[1], (long)(s - i_strpos))); goto set_useful; } - DEBUG_r(PerlIO_printf(Perl_debug_log, "Found /%s^%s/m, restarting at offset %ld...\n", + /* Position contradicts check-string */ + /* XXXX probably better to look for check-string + than for "\n", so one should lower the limit for t? */ + DEBUG_r(PerlIO_printf(Perl_debug_log, "Found /%s^%s/m, restarting lookup for check-string at offset %ld...\n", PL_colors[0],PL_colors[1], (long)(t + 1 - i_strpos))); - strpos = s = t + 1; + other_last = strpos = s = t + 1; goto restart; } t++; @@ -626,19 +650,24 @@ Perl_re_intuit_start(pTHX_ regexp *prog, SV *sv, char *strpos, PL_colors[0],PL_colors[1])); goto fail_finish; } + else { + DEBUG_r(PerlIO_printf(Perl_debug_log, "Starting position does not contradict /%s^%s/m...\n", + PL_colors[0],PL_colors[1])); + } s = t; set_useful: ++BmUSEFUL(prog->check_substr); /* hooray/5 */ } else { PL_bostr = tmp; - /* The found string does not prohibit matching at beg-of-str + /* The found string does not prohibit matching at strpos, - no optimization of calling REx engine can be performed, - unless it was an MBOL and we are not after MBOL. */ + unless it was an MBOL and we are not after MBOL, + or a future STCLASS check will fail this. */ try_at_start: /* Even in this situation we may use MBOL flag if strpos is offset wrt the start of the string. */ - if (ml_anch && sv + if (ml_anch && sv && !SvROK(sv) /* See prev comment on SvROK */ && (strpos + SvCUR(sv) != strend) && strpos[-1] != '\n' /* May be due to an implicit anchor of m{.*foo} */ && !(prog->reganch & ROPT_IMPLICIT)) @@ -647,8 +676,8 @@ Perl_re_intuit_start(pTHX_ regexp *prog, SV *sv, char *strpos, goto find_anchor; } DEBUG_r( if (ml_anch) - PerlIO_printf(Perl_debug_log, "Does not contradict /%s^%s/m...\n", - PL_colors[0],PL_colors[1]); + PerlIO_printf(Perl_debug_log, "Position at offset %ld does not contradict /%s^%s/m...\n", + (long)(strpos - i_strpos), PL_colors[0],PL_colors[1]); ); success_at_start: if (!(prog->reganch & ROPT_NAUGHTY) /* XXXX If strpos moved? */ @@ -657,9 +686,11 @@ Perl_re_intuit_start(pTHX_ regexp *prog, SV *sv, char *strpos, && prog->check_substr == prog->float_substr) { /* If flags & SOMETHING - do not do it many times on the same match */ + DEBUG_r(PerlIO_printf(Perl_debug_log, "... Disabling check substring...\n")); SvREFCNT_dec(prog->check_substr); prog->check_substr = Nullsv; /* disable */ prog->float_substr = Nullsv; /* clear */ + check = Nullsv; /* abort */ s = strpos; /* XXXX This is a remnant of the old implementation. It looks wasteful, since now INTUIT can use many @@ -722,8 +753,10 @@ Perl_re_intuit_start(pTHX_ regexp *prog, SV *sv, char *strpos, "Could not match STCLASS...\n") ); goto fail; } + if (!check) + goto giveup; DEBUG_r( PerlIO_printf(Perl_debug_log, - "Trying %s substr starting at offset %ld...\n", + "Looking for %s substr starting at offset %ld...\n", what, (long)(s + start_shift - i_strpos)) ); goto restart; } @@ -732,8 +765,10 @@ Perl_re_intuit_start(pTHX_ regexp *prog, SV *sv, char *strpos, goto retry_floating_check; /* Recheck anchored substring, but not floating... */ s = check_at; + if (!check) + goto giveup; DEBUG_r( PerlIO_printf(Perl_debug_log, - "Trying anchored substr starting at offset %ld...\n", + "Looking for anchored substr starting at offset %ld...\n", (long)(other_last - i_strpos)) ); goto do_other_anchored; } @@ -741,9 +776,11 @@ Perl_re_intuit_start(pTHX_ regexp *prog, SV *sv, char *strpos, current position only: */ if (ml_anch) { s = t = t + 1; + if (!check) + goto giveup; DEBUG_r( PerlIO_printf(Perl_debug_log, - "Trying /^/m starting at offset %ld...\n", - (long)(t - i_strpos)) ); + "Looking for /%s^%s/m starting at offset %ld...\n", + PL_colors[0],PL_colors[1], (long)(t - i_strpos)) ); goto try_at_offset; } if (!prog->float_substr) /* Could have been deleted */ @@ -762,8 +799,10 @@ Perl_re_intuit_start(pTHX_ regexp *prog, SV *sv, char *strpos, PerlIO_printf(Perl_debug_log, "Does not contradict STCLASS...\n") ); } - DEBUG_r(PerlIO_printf(Perl_debug_log, "%sGuessed:%s match at offset %ld\n", - PL_colors[4], PL_colors[5], (long)(s - i_strpos)) ); + giveup: + DEBUG_r(PerlIO_printf(Perl_debug_log, "%s%s:%s match at offset %ld\n", + PL_colors[4], (check ? "Guessed" : "Giving up"), + PL_colors[5], (long)(s - i_strpos)) ); return s; fail_finish: /* Substring not found */ @@ -781,9 +820,9 @@ S_find_byclass(pTHX_ regexp * prog, regnode *c, char *s, char *strend, char *sta { I32 doevery = (prog->reganch & ROPT_SKIP) == 0; char *m; - int ln; - int c1; - int c2; + STRLEN ln; + unsigned int c1; + unsigned int c2; char *e; register I32 tmp = 1; /* Scratch variable? */ @@ -804,7 +843,7 @@ S_find_byclass(pTHX_ regexp * prog, regnode *c, char *s, char *strend, char *sta break; case ANYOF: while (s < strend) { - if (REGINCLASS(c, *s)) { + if (REGINCLASS(c, *(U8*)s)) { if (tmp && (norun || regtry(prog, s))) goto got_it; else @@ -818,13 +857,13 @@ S_find_byclass(pTHX_ regexp * prog, regnode *c, char *s, char *strend, char *sta case EXACTF: m = STRING(c); ln = STR_LEN(c); - c1 = *m; + c1 = *(U8*)m; c2 = PL_fold[c1]; goto do_exactf; case EXACTFL: m = STRING(c); ln = STR_LEN(c); - c1 = *m; + c1 = *(U8*)m; c2 = PL_fold_locale[c1]; do_exactf: e = strend - ln; @@ -834,7 +873,7 @@ S_find_byclass(pTHX_ regexp * prog, regnode *c, char *s, char *strend, char *sta /* Here it is NOT UTF! */ if (c1 == c2) { while (s <= e) { - if ( *s == c1 + if ( *(U8*)s == c1 && (ln == 1 || !(OP(c) == EXACTF ? ibcmp(s, m, ln) : ibcmp_locale(s, m, ln))) @@ -844,7 +883,7 @@ S_find_byclass(pTHX_ regexp * prog, regnode *c, char *s, char *strend, char *sta } } else { while (s <= e) { - if ( (*s == c1 || *s == c2) + if ( (*(U8*)s == c1 || *(U8*)s == c2) && (ln == 1 || !(OP(c) == EXACTF ? ibcmp(s, m, ln) : ibcmp_locale(s, m, ln))) @@ -1274,10 +1313,9 @@ Perl_regexec_flags(pTHX_ register regexp *prog, char *stringarg, register char * register char *s; register regnode *c; register char *startpos = stringarg; - register I32 tmp; I32 minlen; /* must match at least this many chars */ I32 dontbother = 0; /* how many characters not to try at end */ - I32 start_shift = 0; /* Offset of the start to find + /* I32 start_shift = 0; */ /* Offset of the start to find constant substr. */ /* CC */ I32 end_shift = 0; /* Same for the end. */ /* CC */ I32 scream_pos = -1; /* Internal iterator of scream. */ @@ -1433,9 +1471,14 @@ Perl_regexec_flags(pTHX_ register regexp *prog, char *stringarg, register char * /* we have /x+whatever/ */ /* it must be a one character string (XXXX Except UTF?) */ char ch = SvPVX(prog->anchored_substr)[0]; +#ifdef DEBUGGING + int did_match = 0; +#endif + if (UTF) { while (s < strend) { if (*s == ch) { + DEBUG_r( did_match = 1 ); if (regtry(prog, s)) goto got_it; s += UTF8SKIP(s); while (s < strend && *s == ch) @@ -1447,6 +1490,7 @@ Perl_regexec_flags(pTHX_ register regexp *prog, char *stringarg, register char * else { while (s < strend) { if (*s == ch) { + DEBUG_r( did_match = 1 ); if (regtry(prog, s)) goto got_it; s++; while (s < strend && *s == ch) @@ -1455,6 +1499,9 @@ Perl_regexec_flags(pTHX_ register regexp *prog, char *stringarg, register char * s++; } } + DEBUG_r(did_match || + PerlIO_printf(Perl_debug_log, + "Did not find anchored character...\n")); } /*SUPPRESS 560*/ else if (prog->anchored_substr != Nullsv @@ -1466,11 +1513,13 @@ Perl_regexec_flags(pTHX_ register regexp *prog, char *stringarg, register char * prog->anchored_substr ? prog->anchored_offset : prog->float_max_offset; I32 back_min = prog->anchored_substr ? prog->anchored_offset : prog->float_min_offset; - I32 delta = back_max - back_min; char *last = HOPc(strend, /* Cannot start after this */ -(I32)(CHR_SVLEN(must) - (SvTAIL(must) != 0) + back_min)); char *last1; /* Last position checked before */ +#ifdef DEBUGGING + int did_match = 0; +#endif if (s > PL_bostr) last1 = HOPc(s, -1); @@ -1489,6 +1538,7 @@ Perl_regexec_flags(pTHX_ register regexp *prog, char *stringarg, register char * : (s = fbm_instr((unsigned char*)HOP(s, back_min), (unsigned char*)strend, must, PL_multiline ? FBMrf_MULTILINE : 0))) ) { + DEBUG_r( did_match = 1 ); if (HOPc(s, -back_max) > last1) { last1 = HOPc(s, -back_min); s = HOPc(s, -back_max); @@ -1514,20 +1564,28 @@ Perl_regexec_flags(pTHX_ register regexp *prog, char *stringarg, register char * } } } + DEBUG_r(did_match || + PerlIO_printf(Perl_debug_log, "Did not find %s substr `%s%.*s%s'%s...\n", + ((must == prog->anchored_substr) + ? "anchored" : "floating"), + PL_colors[0], + (int)(SvCUR(must) - (SvTAIL(must)!=0)), + SvPVX(must), + PL_colors[1], (SvTAIL(must) ? "$" : ""))); goto phooey; } - else if (c = prog->regstclass) { + else if ((c = prog->regstclass)) { if (minlen && PL_regkind[(U8)OP(prog->regstclass)] != EXACT) /* don't bother with what can't match */ strend = HOPc(strend, -(minlen - 1)); if (find_byclass(prog, c, s, strend, startpos, 0)) goto got_it; + DEBUG_r(PerlIO_printf(Perl_debug_log, "Contradicts stclass...\n")); } else { dontbother = 0; if (prog->float_substr != Nullsv) { /* Trim the end. */ char *last; - I32 oldpos = scream_pos; if (flags & REXEC_SCREAM) { last = screaminstr(sv, prog->float_substr, s - strbeg, @@ -1555,7 +1613,12 @@ Perl_regexec_flags(pTHX_ register regexp *prog, char *stringarg, register char * last = strend; /* matching `$' */ } } - if (last == NULL) goto phooey; /* Should not happen! */ + if (last == NULL) { + DEBUG_r(PerlIO_printf(Perl_debug_log, + "%sCan't trim the tail, match fails (should not happen)%s\n", + PL_colors[4],PL_colors[5])); + goto phooey; /* Should not happen! */ + } dontbother = strend - last + prog->float_min_offset; } if (minlen && (dontbother < minlen)) @@ -1617,6 +1680,8 @@ got_it: return 1; phooey: + DEBUG_r(PerlIO_printf(Perl_debug_log, "%sMatch failed%s\n", + PL_colors[4],PL_colors[5])); if (PL_reg_eval_set) restore_pos(aTHXo_ 0); return 0; @@ -1841,7 +1906,7 @@ S_regmatch(pTHX_ regnode *prog) } sayNO; case SBOL: - if (locinput == PL_regbol && PL_regprev == '\n') + if (locinput == PL_bostr) break; sayNO; case GPOS: @@ -1894,12 +1959,12 @@ S_regmatch(pTHX_ regnode *prog) nextchr = UCHARAT(locinput); break; } - if (!nextchr && locinput >= PL_regeol || nextchr == '\n') + if ((!nextchr && locinput >= PL_regeol) || nextchr == '\n') sayNO; nextchr = UCHARAT(++locinput); break; case REG_ANY: - if (!nextchr && locinput >= PL_regeol || nextchr == '\n') + if ((!nextchr && locinput >= PL_regeol) || nextchr == '\n') sayNO; nextchr = UCHARAT(++locinput); break; @@ -2663,10 +2728,10 @@ S_regmatch(pTHX_ regnode *prog) PL_regcc = cc; if (n >= cc->max) { /* Maximum greed exceeded? */ - if (ckWARN(WARN_UNSAFE) && n >= REG_INFTY + if (ckWARN(WARN_REGEXP) && n >= REG_INFTY && !(PL_reg_flags & RF_warned)) { PL_reg_flags |= RF_warned; - Perl_warner(aTHX_ WARN_UNSAFE, "%s limit (%d) exceeded", + Perl_warner(aTHX_ WARN_REGEXP, "%s limit (%d) exceeded", "Complex regular subexpression recursion", REG_INFTY - 1); } @@ -2715,10 +2780,10 @@ S_regmatch(pTHX_ regnode *prog) REPORT_CODE_OFF+PL_regindent*2, "") ); } - if (ckWARN(WARN_UNSAFE) && n >= REG_INFTY + if (ckWARN(WARN_REGEXP) && n >= REG_INFTY && !(PL_reg_flags & RF_warned)) { PL_reg_flags |= RF_warned; - Perl_warner(aTHX_ WARN_UNSAFE, "%s limit (%d) exceeded", + Perl_warner(aTHX_ WARN_REGEXP, "%s limit (%d) exceeded", "Complex regular subexpression recursion", REG_INFTY - 1); } @@ -2765,7 +2830,7 @@ S_regmatch(pTHX_ regnode *prog) *PL_reglastparen = n; scan = next; /*SUPPRESS 560*/ - if (n = (c1 == BRANCH ? NEXT_OFF(next) : ARG(next))) + if ((n = (c1 == BRANCH ? NEXT_OFF(next) : ARG(next)))) next += n; else next = NULL; @@ -2995,16 +3060,7 @@ S_regmatch(pTHX_ regnode *prog) sayNO; } /* PL_reginput == locinput now */ - if (paren) { - if (ln) { - PL_regstartp[paren] = HOPc(locinput, -1) - PL_bostr; - PL_regendp[paren] = locinput - PL_bostr; - } - else - PL_regendp[paren] = -1; - } - if (regmatch(next)) - sayYES; + TRYPAREN(paren, ln, locinput); PL_reginput = locinput; /* Could be reset... */ REGCP_UNWIND; /* Couldn't or didn't -- move forward. */ @@ -3018,16 +3074,7 @@ S_regmatch(pTHX_ regnode *prog) UCHARAT(PL_reginput) == c1 || UCHARAT(PL_reginput) == c2) { - if (paren) { - if (n) { - PL_regstartp[paren] = HOPc(PL_reginput, -1) - PL_bostr; - PL_regendp[paren] = PL_reginput - PL_bostr; - } - else - PL_regendp[paren] = -1; - } - if (regmatch(next)) - sayYES; + TRYPAREN(paren, n, PL_reginput); REGCP_UNWIND; } /* Couldn't or didn't -- move forward. */ @@ -3061,16 +3108,7 @@ S_regmatch(pTHX_ regnode *prog) UCHARAT(PL_reginput) == c1 || UCHARAT(PL_reginput) == c2) { - if (paren && n) { - if (n) { - PL_regstartp[paren] = HOPc(PL_reginput, -1) - PL_bostr; - PL_regendp[paren] = PL_reginput - PL_bostr; - } - else - PL_regendp[paren] = -1; - } - if (regmatch(next)) - sayYES; + TRYPAREN(paren, n, PL_reginput); REGCP_UNWIND; } /* Couldn't or didn't -- back up. */ @@ -3085,8 +3123,7 @@ S_regmatch(pTHX_ regnode *prog) UCHARAT(PL_reginput) == c1 || UCHARAT(PL_reginput) == c2) { - if (regmatch(next)) - sayYES; + TRYPAREN(paren, n, PL_reginput); REGCP_UNWIND; } /* Couldn't or didn't -- back up. */ @@ -3609,8 +3646,7 @@ S_reginclassutf8(pTHX_ regnode *f, U8 *p) if (swash_fetch(sv, p)) match = TRUE; else if (flags & ANYOF_FOLD) { - I32 cf; - U8 tmpbuf[10]; + U8 tmpbuf[UTF8_MAXLEN]; if (flags & ANYOF_LOCALE) { PL_reg_flags |= RF_tainted; uv_to_utf8(tmpbuf, toLOWER_LC_utf8(p));