*/
#define CHR_SVLEN(sv) (utf8_target ? sv_len_utf8(sv) : SvCUR(sv))
-#define CHR_DIST(a,b) (PL_reg_match_utf8 ? utf8_distance(a,b) : a - b)
+#define CHR_DIST(a,b) (reginfo->is_utf8_target ? utf8_distance(a,b) : a - b)
#define HOPc(pos,off) \
- (char *)(PL_reg_match_utf8 \
- ? reghop3((U8*)pos, off, (U8*)(off >= 0 ? PL_regeol : PL_bostr)) \
+ (char *)(reginfo->is_utf8_target \
+ ? reghop3((U8*)pos, off, \
+ (U8*)(off >= 0 ? reginfo->strend : reginfo->strbeg)) \
: (U8*)(pos + off))
#define HOPBACKc(pos, off) \
- (char*)(PL_reg_match_utf8\
- ? reghopmaybe3((U8*)pos, -off, (U8*)PL_bostr) \
- : (pos - off >= PL_bostr) \
+ (char*)(reginfo->is_utf8_target \
+ ? reghopmaybe3((U8*)pos, -off, (U8*)(reginfo->strbeg)) \
+ : (pos - off >= reginfo->strbeg) \
? (U8*)pos - off \
: NULL)
-#define HOP3(pos,off,lim) (PL_reg_match_utf8 ? reghop3((U8*)(pos), off, (U8*)(lim)) : (U8*)(pos + off))
+#define HOP3(pos,off,lim) (reginfo->is_utf8_target ? reghop3((U8*)(pos), off, (U8*)(lim)) : (U8*)(pos + off))
#define HOP3c(pos,off,lim) ((char*)HOP3(pos,off,lim))
#define NEXTCHR_IS_EOS (nextchr < 0)
#define SET_nextchr \
- nextchr = ((locinput < PL_regeol) ? UCHARAT(locinput) : NEXTCHR_EOS)
+ nextchr = ((locinput < reginfo->strend) ? UCHARAT(locinput) : NEXTCHR_EOS)
#define SET_locinput(p) \
locinput = (p); \
#define SCount 11172 /* Length of block */
#define TCount 28
-static void restore_pos(pTHX_ void *arg);
+#define SLAB_FIRST(s) (&(s)->states[0])
+#define SLAB_LAST(s) (&(s)->states[PERL_REGMATCH_SLAB_SLOTS-1])
+
+static void S_setup_eval_state(pTHX_ regmatch_info *const reginfo);
+static void S_cleanup_regmatch_info_aux(pTHX_ void *arg);
+static regmatch_state * S_push_slab(pTHX);
#define REGCP_PAREN_ELEMS 3
#define REGCP_OTHER_ELEMS 3
* with giant delta may be not rechecked).
*/
-/* Assumptions: if ANCH_GPOS, then strpos is anchored. XXXX Check GPOS logic */
-
/* If SCREAM, then SvPVX_const(sv) should be compatible with strpos and strend.
Otherwise, only SvCUR(sv) is used to get strbeg. */
-/* XXXX We assume that strpos is strbeg unless sv. */
-
/* XXXX Some places assume that there is a fixed substring.
An update may be needed if optimizer marks as "INTUITable"
RExen without fixed substrings. Similarly, it is assumed that
The nodes of the REx which we used for the search should have been
deleted from the finite automaton. */
+/* args:
+ * rx: the regex to match against
+ * sv: the SV being matched: only used for utf8 flag; the string
+ * itself is accessed via the pointers below. Note that on
+ * something like an overloaded SV, SvPOK(sv) may be false
+ * and the string pointers may point to something unrelated to
+ * the SV itself.
+ * strbeg: real beginning of string
+ * strpos: the point in the string at which to begin matching
+ * strend: pointer to the byte following the last char of the string
+ * flags currently unused; set to 0
+ * data: currently unused; set to NULL
+ */
+
char *
-Perl_re_intuit_start(pTHX_ REGEXP * const rx, SV *sv, char *strpos,
- char *strend, const U32 flags, re_scream_pos_data *data)
+Perl_re_intuit_start(pTHX_
+ REGEXP * const rx,
+ SV *sv,
+ const char * const strbeg,
+ char *strpos,
+ char *strend,
+ const U32 flags,
+ re_scream_pos_data *data)
{
dVAR;
struct regexp *const prog = ReANY(rx);
I32 end_shift = 0;
char *s;
SV *check;
- char *strbeg;
char *t;
const bool utf8_target = (sv && SvUTF8(sv)) ? 1 : 0; /* if no sv we have to assume bytes */
I32 ml_anch;
char *checked_upto = NULL; /* how far into the string we have already checked using find_byclass*/
const I32 multiline = prog->extflags & RXf_PMf_MULTILINE;
RXi_GET_DECL(prog,progi);
- bool is_utf8_pat;
+ regmatch_info reginfo_buf; /* create some info to pass to find_byclass */
+ regmatch_info *const reginfo = ®info_buf;
#ifdef DEBUGGING
const char * const i_strpos = strpos;
#endif
PERL_UNUSED_ARG(flags);
PERL_UNUSED_ARG(data);
- RX_MATCH_UTF8_set(rx,utf8_target);
-
- is_utf8_pat = cBOOL(RX_UTF8(rx));
-
- DEBUG_EXECUTE_r(
- debug_start_match(rx, utf8_target, strpos, strend,
- sv ? "Guessing start of match in sv for"
- : "Guessing start of match in string for");
- );
-
/* CHR_DIST() would be more correct here but it makes things slow. */
if (prog->minlen > strend - strpos) {
DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log,
goto fail;
}
- /* XXX we need to pass strbeg as a separate arg: the following is
- * guesswork and can be wrong... */
- if (sv && SvPOK(sv)) {
- char * p = SvPVX(sv);
- STRLEN cur = SvCUR(sv);
- if (p <= strpos && strpos < p + cur) {
- strbeg = p;
- assert(p <= strend && strend <= p + cur);
- }
- else
- strbeg = strend - cur;
- }
- else
- strbeg = strpos;
+ reginfo->is_utf8_target = cBOOL(utf8_target);
+ reginfo->info_aux = NULL;
+ reginfo->strbeg = strbeg;
+ reginfo->strend = strend;
+ reginfo->is_utf8_pat = cBOOL(RX_UTF8(rx));
+ reginfo->intuit = 1;
+ /* not actually used within intuit, but zero for safety anyway */
+ reginfo->poscache_maxiter = 0;
- PL_regeol = strend;
if (utf8_target) {
if (!prog->check_utf8 && prog->check_substr)
to_utf8_substr(prog);
}
check = prog->check_substr;
}
- if (prog->extflags & RXf_ANCH) { /* Match at beg-of-str or after \n */
- ml_anch = !( (prog->extflags & RXf_ANCH_SINGLE)
+ if ((prog->extflags & RXf_ANCH) /* Match at beg-of-str or after \n */
+ && !(prog->extflags & RXf_ANCH_GPOS)) /* \G isn't a BOS or \n */
+ {
+ ml_anch = !( (prog->extflags & RXf_ANCH_SINGLE)
|| ( (prog->extflags & RXf_ANCH_BOL)
&& !multiline ) ); /* Check after \n? */
if (!ml_anch) {
- if ( !(prog->extflags & RXf_ANCH_GPOS) /* Checked by the caller */
- && !(prog->intflags & PREGf_IMPLICIT) /* not a real BOL */
- /* SvCUR is not set on references: SvRV and SvPVX_const overlap */
- && sv && !SvROK(sv)
+ if ( !(prog->intflags & PREGf_IMPLICIT) /* not a real BOL */
&& (strpos != strbeg)) {
DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, "Not at start...\n"));
goto fail;
try_at_start:
/* Even in this situation we may use MBOL flag if strpos is offset
wrt the start of the string. */
- if (ml_anch && sv && !SvROK(sv) /* See prev comment on SvROK */
- && (strpos != strbeg) && strpos[-1] != '\n'
+ if (ml_anch && (strpos != strbeg) && strpos[-1] != '\n'
/* May be due to an implicit anchor of m{.*foo} */
&& !(prog->intflags & PREGf_IMPLICIT))
{
/* If regstclass takes bytelength more than 1: If charlength==1, OK.
This leaves EXACTF-ish only, which are dealt with in find_byclass(). */
const U8* const str = (U8*)STRING(progi->regstclass);
+ /* XXX this value could be pre-computed */
const int cl_l = (PL_regkind[OP(progi->regstclass)] == EXACT
- ? CHR_DIST(str+STR_LEN(progi->regstclass), str)
+ ? (reginfo->is_utf8_pat
+ ? utf8_distance(str + STR_LEN(progi->regstclass), str)
+ : STR_LEN(progi->regstclass))
: 1);
char * endpos;
if (prog->anchored_substr || prog->anchored_utf8 || ml_anch)
t = s;
s = find_byclass(prog, progi->regstclass, checked_upto, endpos,
- NULL, is_utf8_pat);
+ reginfo);
if (s) {
checked_upto = s;
} else {
while (s <= e) { \
if ( (CoNd) \
&& (ln == 1 || folder(s, pat_string, ln)) \
- && (!reginfo || regtry(reginfo, &s)) ) \
+ && (reginfo->intuit || regtry(reginfo, &s)) )\
goto got_it; \
s++; \
} \
#define REXEC_FBC_UTF8_CLASS_SCAN(CoNd) \
REXEC_FBC_UTF8_SCAN( \
if (CoNd) { \
- if (tmp && (!reginfo || regtry(reginfo, &s))) \
+ if (tmp && (reginfo->intuit || regtry(reginfo, &s))) \
goto got_it; \
else \
tmp = doevery; \
#define REXEC_FBC_CLASS_SCAN(CoNd) \
REXEC_FBC_SCAN( \
if (CoNd) { \
- if (tmp && (!reginfo || regtry(reginfo, &s))) \
+ if (tmp && (reginfo->intuit || regtry(reginfo, &s))) \
goto got_it; \
else \
tmp = doevery; \
)
#define REXEC_FBC_TRYIT \
-if ((!reginfo || regtry(reginfo, &s))) \
+if ((reginfo->intuit || regtry(reginfo, &s))) \
goto got_it
#define REXEC_FBC_CSCAN(CoNdUtF8,CoNd) \
}
#define DUMP_EXEC_POS(li,s,doutf8) \
- dump_exec_pos(li,s,(PL_regeol),(PL_bostr),(PL_reg_starttry),doutf8)
+ dump_exec_pos(li,s,(reginfo->strend),(reginfo->strbeg), \
+ startpos, doutf8)
#define UTF8_NOLOAD(TEST_NON_UTF8, IF_SUCCESS, IF_FAIL) \
- tmp = (s != PL_bostr) ? UCHARAT(s - 1) : '\n'; \
+ tmp = (s != reginfo->strbeg) ? UCHARAT(s - 1) : '\n'; \
tmp = TEST_NON_UTF8(tmp); \
REXEC_FBC_UTF8_SCAN( \
if (tmp == ! TEST_NON_UTF8((U8) *s)) { \
); \
#define UTF8_LOAD(TeSt1_UtF8, TeSt2_UtF8, IF_SUCCESS, IF_FAIL) \
- if (s == PL_bostr) { \
+ if (s == reginfo->strbeg) { \
tmp = '\n'; \
} \
else { \
- U8 * const r = reghop3((U8*)s, -1, (U8*)PL_bostr); \
+ U8 * const r = reghop3((U8*)s, -1, (U8*)reginfo->strbeg); \
tmp = utf8n_to_uvchr(r, UTF8SKIP(r), 0, UTF8_ALLOW_DEFAULT); \
} \
tmp = TeSt1_UtF8; \
UTF8_CODE \
} \
else { /* Not utf8 */ \
- tmp = (s != PL_bostr) ? UCHARAT(s - 1) : '\n'; \
+ tmp = (s != reginfo->strbeg) ? UCHARAT(s - 1) : '\n'; \
tmp = TEST_NON_UTF8(tmp); \
REXEC_FBC_SCAN( \
if (tmp == ! TEST_NON_UTF8((U8) *s)) { \
} \
); \
} \
- if ((!prog->minlen && tmp) && (!reginfo || regtry(reginfo, &s))) \
+ if ((!prog->minlen && tmp) && (reginfo->intuit || regtry(reginfo, &s))) \
goto got_it;
/* We know what class REx starts with. Try to find this position... */
-/* if reginfo is NULL, its a dryrun */
+/* if reginfo->intuit, its a dryrun */
/* annoyingly all the vars in this routine have different names from their counterparts
in regmatch. /grrr */
STATIC char *
S_find_byclass(pTHX_ regexp * prog, const regnode *c, char *s,
- const char *strend, regmatch_info *reginfo, bool is_utf8_pat)
+ const char *strend, regmatch_info *reginfo)
{
dVAR;
const I32 doevery = (prog->intflags & PREGf_SKIP) == 0;
U8 c2;
char *e;
I32 tmp = 1; /* Scratch variable? */
- const bool utf8_target = PL_reg_match_utf8;
+ const bool utf8_target = reginfo->is_utf8_target;
UV utf8_fold_flags = 0;
+ const bool is_utf8_pat = reginfo->is_utf8_pat;
bool to_complement = FALSE; /* Invert the result? Taking the xor of this
with a result inverts that result, as 0^1 =
1 and 1^1 = 0 */
break;
case CANY:
REXEC_FBC_SCAN(
- if (tmp && (!reginfo || regtry(reginfo, &s)))
+ if (tmp && (reginfo->intuit || regtry(reginfo, &s)))
goto got_it;
else
tmp = doevery;
* required minimum number from the far end */
e = HOP3c(strend, -((I32)ln), s);
- if (!reginfo && e < s) {
+ if (reginfo->intuit && e < s) {
e = s; /* Due to minlen logic of intuit() */
}
*/
e = HOP3c(strend, -((I32)lnc), s);
- if (!reginfo && e < s) {
+ if (reginfo->intuit && e < s) {
e = s; /* Due to minlen logic of intuit() */
}
char *my_strend= (char *)strend;
if (foldEQ_utf8_flags(s, &my_strend, 0, utf8_target,
pat_string, NULL, ln, is_utf8_pat, utf8_fold_flags)
- && (!reginfo || regtry(reginfo, &s)) )
+ && (reginfo->intuit || regtry(reginfo, &s)) )
{
goto got_it;
}
_generic_isCC(TWO_BYTE_UTF8_TO_UNI(*s, *(s + 1)),
classnum))))
{
- if (tmp && (!reginfo || regtry(reginfo, &s)))
+ if (tmp && (reginfo->intuit || regtry(reginfo, &s)))
goto got_it;
else {
tmp = doevery;
(UV)accepted_word, (IV)(s - real_start)
);
});
- if (!reginfo || regtry(reginfo, &s)) {
+ if (reginfo->intuit || regtry(reginfo, &s)) {
FREETMPS;
LEAVE;
goto got_it;
return s;
}
+/* set RX_SAVED_COPY, RX_SUBBEG etc.
+ * flags have same meanings as with regexec_flags() */
+
+static void
+S_reg_set_capture_string(pTHX_ REGEXP * const rx,
+ char *strbeg,
+ char *strend,
+ SV *sv,
+ U32 flags,
+ bool utf8_target)
+{
+ struct regexp *const prog = ReANY(rx);
+
+ if (flags & REXEC_COPY_STR) {
+#ifdef PERL_ANY_COW
+ if (SvCANCOW(sv)) {
+ if (DEBUG_C_TEST) {
+ PerlIO_printf(Perl_debug_log,
+ "Copy on write: regexp capture, type %d\n",
+ (int) SvTYPE(sv));
+ }
+ /* skip creating new COW SV if a valid one already exists */
+ if (! ( prog->saved_copy
+ && SvIsCOW(sv)
+ && SvPOKp(sv)
+ && SvIsCOW(prog->saved_copy)
+ && SvPOKp(prog->saved_copy)
+ && SvPVX(sv) == SvPVX(prog->saved_copy)))
+ {
+ RX_MATCH_COPY_FREE(rx);
+ prog->saved_copy = sv_setsv_cow(prog->saved_copy, sv);
+ prog->subbeg = (char *)SvPVX_const(prog->saved_copy);
+ assert (SvPOKp(prog->saved_copy));
+ }
+ prog->sublen = strend - strbeg;
+ prog->suboffset = 0;
+ prog->subcoffset = 0;
+ } else
+#endif
+ {
+ I32 min = 0;
+ I32 max = strend - strbeg;
+ I32 sublen;
+
+ if ( (flags & REXEC_COPY_SKIP_POST)
+ && !(prog->extflags & RXf_PMf_KEEPCOPY) /* //p */
+ && !(PL_sawampersand & SAWAMPERSAND_RIGHT)
+ ) { /* don't copy $' part of string */
+ U32 n = 0;
+ max = -1;
+ /* calculate the right-most part of the string covered
+ * by a capture. Due to look-ahead, this may be to
+ * the right of $&, so we have to scan all captures */
+ while (n <= prog->lastparen) {
+ if (prog->offs[n].end > max)
+ max = prog->offs[n].end;
+ n++;
+ }
+ if (max == -1)
+ max = (PL_sawampersand & SAWAMPERSAND_LEFT)
+ ? prog->offs[0].start
+ : 0;
+ assert(max >= 0 && max <= strend - strbeg);
+ }
+
+ if ( (flags & REXEC_COPY_SKIP_PRE)
+ && !(prog->extflags & RXf_PMf_KEEPCOPY) /* //p */
+ && !(PL_sawampersand & SAWAMPERSAND_LEFT)
+ ) { /* don't copy $` part of string */
+ U32 n = 0;
+ min = max;
+ /* calculate the left-most part of the string covered
+ * by a capture. Due to look-behind, this may be to
+ * the left of $&, so we have to scan all captures */
+ while (min && n <= prog->lastparen) {
+ if ( prog->offs[n].start != -1
+ && prog->offs[n].start < min)
+ {
+ min = prog->offs[n].start;
+ }
+ n++;
+ }
+ if ((PL_sawampersand & SAWAMPERSAND_RIGHT)
+ && min > prog->offs[0].end
+ )
+ min = prog->offs[0].end;
+
+ }
+
+ assert(min >= 0 && min <= max && min <= strend - strbeg);
+ sublen = max - min;
+
+ if (RX_MATCH_COPIED(rx)) {
+ if (sublen > prog->sublen)
+ prog->subbeg =
+ (char*)saferealloc(prog->subbeg, sublen+1);
+ }
+ else
+ prog->subbeg = (char*)safemalloc(sublen+1);
+ Copy(strbeg + min, prog->subbeg, sublen, char);
+ prog->subbeg[sublen] = '\0';
+ prog->suboffset = min;
+ prog->sublen = sublen;
+ RX_MATCH_COPIED_on(rx);
+ }
+ prog->subcoffset = prog->suboffset;
+ if (prog->suboffset && utf8_target) {
+ /* Convert byte offset to chars.
+ * XXX ideally should only compute this if @-/@+
+ * has been seen, a la PL_sawampersand ??? */
+
+ /* If there's a direct correspondence between the
+ * string which we're matching and the original SV,
+ * then we can use the utf8 len cache associated with
+ * the SV. In particular, it means that under //g,
+ * sv_pos_b2u() will use the previously cached
+ * position to speed up working out the new length of
+ * subcoffset, rather than counting from the start of
+ * the string each time. This stops
+ * $x = "\x{100}" x 1E6; 1 while $x =~ /(.)/g;
+ * from going quadratic */
+ if (SvPOKp(sv) && SvPVX(sv) == strbeg)
+ sv_pos_b2u(sv, &(prog->subcoffset));
+ else
+ prog->subcoffset = utf8_length((U8*)strbeg,
+ (U8*)(strbeg+prog->suboffset));
+ }
+ }
+ else {
+ RX_MATCH_COPY_FREE(rx);
+ prog->subbeg = strbeg;
+ prog->suboffset = 0;
+ prog->subcoffset = 0;
+ prog->sublen = strend - strbeg;
+ }
+}
+
+
+
/*
- regexec_flags - match a regexp against a string
/* sv: SV being matched: only used for utf8 flag, pos() etc; string
* itself is accessed via the pointers above */
/* data: May be used for some additional optimizations.
- Currently its only used, with a U32 cast, for transmitting
- the ganch offset when doing a /g match. This will change */
+ Currently unused. */
/* nosave: For optimizations. */
{
struct regexp *const prog = ReANY(rx);
char *s;
regnode *c;
- char *startpos = stringarg;
+ char *startpos;
I32 minlen; /* must match at least this many chars */
I32 dontbother = 0; /* how many characters not to try at end */
I32 end_shift = 0; /* Same for the end. */ /* CC */
- I32 scream_pos = -1; /* Internal iterator of scream. */
- char *scream_olds = NULL;
const bool utf8_target = cBOOL(DO_UTF8(sv));
I32 multiline;
RXi_GET_DECL(prog,progi);
- regmatch_info reginfo; /* create some info to pass to regtry etc */
+ regmatch_info reginfo_buf; /* create some info to pass to regtry etc */
+ regmatch_info *const reginfo = ®info_buf;
regexp_paren_pair *swap = NULL;
+ I32 oldsave;
GET_RE_DEBUG_FLAGS_DECL;
PERL_ARGS_ASSERT_REGEXEC_FLAGS;
PERL_UNUSED_ARG(data);
/* Be paranoid... */
- if (prog == NULL || startpos == NULL) {
+ if (prog == NULL || stringarg == NULL) {
Perl_croak(aTHX_ "NULL regexp parameter");
return 0;
}
- multiline = prog->extflags & RXf_PMf_MULTILINE;
- reginfo.prog = rx; /* Yes, sorry that this is confusing. */
-
- RX_MATCH_UTF8_set(rx, utf8_target);
- DEBUG_EXECUTE_r(
- debug_start_match(rx, utf8_target, startpos, strend,
+ DEBUG_EXECUTE_r(
+ debug_start_match(rx, utf8_target, stringarg, strend,
"Matching");
);
+ startpos = stringarg;
+
+ if (prog->extflags & RXf_GPOS_SEEN) {
+ MAGIC *mg;
+
+ /* in the presence of \G, we may need to start looking earlier in
+ * the string than the suggested start point of stringarg:
+ * if gofs->prog is set, then that's a known, fixed minimum
+ * offset, such as
+ * /..\G/: gofs = 2
+ * /ab|c\G/: gofs = 1
+ * or if the minimum offset isn't known, then we have to go back
+ * to the start of the string, e.g. /w+\G/
+ */
+ if (prog->gofs) {
+ if (startpos - prog->gofs < strbeg)
+ startpos = strbeg;
+ else
+ startpos -= prog->gofs;
+ }
+ else if (prog->extflags & RXf_GPOS_FLOAT)
+ startpos = strbeg;
+
+ /* set reginfo->ganch, the position where \G can match */
+
+ reginfo->ganch =
+ (flags & REXEC_IGNOREPOS)
+ ? stringarg /* use start pos rather than pos() */
+ : (sv && (mg = mg_find_mglob(sv)) && mg->mg_len >= 0)
+ ? strbeg + mg->mg_len /* Defined pos() */
+ : strbeg; /* pos() not defined; use start of string */
+
+ DEBUG_GPOS_r(PerlIO_printf(Perl_debug_log,
+ "GPOS ganch set to strbeg[%"IVdf"]\n", reginfo->ganch - strbeg));
+ }
+
minlen = prog->minlen;
+ if ((startpos + minlen) > strend || startpos < strbeg) {
+ DEBUG_r(PerlIO_printf(Perl_debug_log,
+ "Regex match can't succeed, so not even tried\n"));
+ return 0;
+ }
+
+ s = startpos;
+
+ if ((prog->extflags & RXf_USE_INTUIT)
+ && !(flags & REXEC_CHECKED))
+ {
+ s = re_intuit_start(rx, sv, strbeg, startpos, strend,
+ flags, NULL);
+ if (!s)
+ return 0;
+
+ if (prog->extflags & RXf_CHECK_ALL) {
+ /* we can match based purely on the result of INTUIT.
+ * Set up captures etc just for $& and $-[0]
+ * (an intuit-only match wont have $1,$2,..) */
+ assert(!prog->nparens);
+
+ /* s/// doesn't like it if $& is earlier than where we asked it to
+ * start searching (which can happen on something like /.\G/) */
+ if ( (flags & REXEC_FAIL_ON_UNDERFLOW)
+ && (s < stringarg))
+ {
+ /* this should only be possible under \G */
+ assert(prog->extflags & RXf_GPOS_SEEN);
+ DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log,
+ "matched, but failing for REXEC_FAIL_ON_UNDERFLOW\n"));
+ goto phooey;
+ }
+
+ /* match via INTUIT shouldn't have any captures.
+ * Let @-, @+, $^N know */
+ prog->lastparen = prog->lastcloseparen = 0;
+ RX_MATCH_UTF8_set(rx, utf8_target);
+ if ( !(flags & REXEC_NOT_FIRST) )
+ S_reg_set_capture_string(aTHX_ rx,
+ strbeg, strend,
+ sv, flags, utf8_target);
+
+ prog->offs[0].start = s - strbeg;
+ prog->offs[0].end = utf8_target
+ ? (char*)utf8_hop((U8*)s, prog->minlenret) - strbeg
+ : s - strbeg + prog->minlenret;
+ return 1;
+ }
+ }
+
+
+ /* at the end of this function, we'll do a LEAVE_SCOPE(oldsave),
+ * which will call destuctors to reset PL_regmatch_state, free higher
+ * PL_regmatch_slabs, and clean up regmatch_info_aux and
+ * regmatch_info_aux_eval */
+
+ oldsave = PL_savestack_ix;
+
+ multiline = prog->extflags & RXf_PMf_MULTILINE;
- if (strend - startpos < (minlen+(prog->check_offset_min<0?prog->check_offset_min:0))) {
+ if (strend - s < (minlen+(prog->check_offset_min<0?prog->check_offset_min:0))) {
DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log,
"String too short [regexec_flags]...\n"));
goto phooey;
}
-
/* Check validity of program. */
if (UCHARAT(progi->program) != REG_MAGIC) {
}
RX_MATCH_TAINTED_off(rx);
- PL_reg_state.re_state_eval_setup_done = FALSE;
- PL_reg_maxiter = 0;
- reginfo.is_utf8_pat = cBOOL(RX_UTF8(rx));
- reginfo.warned = FALSE;
- /* Mark beginning of line for ^ and lookbehind. */
- reginfo.bol = startpos; /* XXX not used ??? */
- PL_bostr = strbeg;
- reginfo.sv = sv;
+ reginfo->prog = rx; /* Yes, sorry that this is confusing. */
+ reginfo->intuit = 0;
+ reginfo->is_utf8_target = cBOOL(utf8_target);
+ reginfo->is_utf8_pat = cBOOL(RX_UTF8(rx));
+ reginfo->warned = FALSE;
+ reginfo->strbeg = strbeg;
+ reginfo->sv = sv;
+ reginfo->poscache_maxiter = 0; /* not yet started a countdown */
+ reginfo->strend = strend;
+ /* see how far we have to get to not match where we matched before */
+ reginfo->till = stringarg + minend;
+
+ if (prog->extflags & RXf_EVAL_SEEN && SvPADTMP(sv) && !IS_PADGV(sv)) {
+ /* SAVEFREESV, not sv_mortalcopy, as this SV must last until after
+ S_cleanup_regmatch_info_aux has executed (registered by
+ SAVEDESTRUCTOR_X below). S_cleanup_regmatch_info_aux modifies
+ magic belonging to this SV.
+ Not newSVsv, either, as it does not COW.
+ */
+ reginfo->sv = newSV(0);
+ sv_setsv(reginfo->sv, sv);
+ SAVEFREESV(reginfo->sv);
+ }
- /* Mark end of line for $ (and such) */
- PL_regeol = strend;
+ /* reserve next 2 or 3 slots in PL_regmatch_state:
+ * slot N+0: may currently be in use: skip it
+ * slot N+1: use for regmatch_info_aux struct
+ * slot N+2: use for regmatch_info_aux_eval struct if we have (?{})'s
+ * slot N+3: ready for use by regmatch()
+ */
- /* see how far we have to get to not match where we matched before */
- reginfo.till = startpos+minend;
+ {
+ regmatch_state *old_regmatch_state;
+ regmatch_slab *old_regmatch_slab;
+ int i, max = (prog->extflags & RXf_EVAL_SEEN) ? 2 : 1;
+
+ /* on first ever match, allocate first slab */
+ if (!PL_regmatch_slab) {
+ Newx(PL_regmatch_slab, 1, regmatch_slab);
+ PL_regmatch_slab->prev = NULL;
+ PL_regmatch_slab->next = NULL;
+ PL_regmatch_state = SLAB_FIRST(PL_regmatch_slab);
+ }
- /* If there is a "must appear" string, look for it. */
- s = startpos;
+ old_regmatch_state = PL_regmatch_state;
+ old_regmatch_slab = PL_regmatch_slab;
- if (prog->extflags & RXf_GPOS_SEEN) { /* Need to set reginfo->ganch */
- MAGIC *mg;
- if (flags & REXEC_IGNOREPOS){ /* Means: check only at start */
- reginfo.ganch = startpos + prog->gofs;
- DEBUG_GPOS_r(PerlIO_printf(Perl_debug_log,
- "GPOS IGNOREPOS: reginfo.ganch = startpos + %"UVxf"\n",(UV)prog->gofs));
- } else if (sv && SvTYPE(sv) >= SVt_PVMG
- && SvMAGIC(sv)
- && (mg = mg_find(sv, PERL_MAGIC_regex_global))
- && mg->mg_len >= 0) {
- reginfo.ganch = strbeg + mg->mg_len; /* Defined pos() */
- DEBUG_GPOS_r(PerlIO_printf(Perl_debug_log,
- "GPOS MAGIC: reginfo.ganch = strbeg + %"IVdf"\n",(IV)mg->mg_len));
-
- if (prog->extflags & RXf_ANCH_GPOS) {
- if (s > reginfo.ganch)
- goto phooey;
- s = reginfo.ganch - prog->gofs;
- DEBUG_GPOS_r(PerlIO_printf(Perl_debug_log,
- "GPOS ANCH_GPOS: s = ganch - %"UVxf"\n",(UV)prog->gofs));
- if (s < strbeg)
- goto phooey;
- }
- }
- else if (data) {
- reginfo.ganch = strbeg + PTR2UV(data);
- DEBUG_GPOS_r(PerlIO_printf(Perl_debug_log,
- "GPOS DATA: reginfo.ganch= strbeg + %"UVxf"\n",PTR2UV(data)));
-
- } else { /* pos() not defined */
- reginfo.ganch = strbeg;
- DEBUG_GPOS_r(PerlIO_printf(Perl_debug_log,
- "GPOS: reginfo.ganch = strbeg\n"));
- }
+ for (i=0; i <= max; i++) {
+ if (i == 1)
+ reginfo->info_aux = &(PL_regmatch_state->u.info_aux);
+ else if (i ==2)
+ reginfo->info_aux_eval =
+ reginfo->info_aux->info_aux_eval =
+ &(PL_regmatch_state->u.info_aux_eval);
+
+ if (++PL_regmatch_state > SLAB_LAST(PL_regmatch_slab))
+ PL_regmatch_state = S_push_slab(aTHX);
+ }
+
+ /* note initial PL_regmatch_state position; at end of match we'll
+ * pop back to there and free any higher slabs */
+
+ reginfo->info_aux->old_regmatch_state = old_regmatch_state;
+ reginfo->info_aux->old_regmatch_slab = old_regmatch_slab;
+ reginfo->info_aux->poscache = NULL;
+
+ SAVEDESTRUCTOR_X(S_cleanup_regmatch_info_aux, reginfo->info_aux);
+
+ if ((prog->extflags & RXf_EVAL_SEEN))
+ S_setup_eval_state(aTHX_ reginfo);
+ else
+ reginfo->info_aux_eval = reginfo->info_aux->info_aux_eval = NULL;
}
+
+ /* If there is a "must appear" string, look for it. */
+
if (PL_curpm && (PM_GETRE(PL_curpm) == rx)) {
/* We have to be careful. If the previous successful match
was from this regex we don't want a subsequent partially
PTR2UV(prog->offs)
));
}
- if (!(flags & REXEC_CHECKED) && (prog->check_substr != NULL || prog->check_utf8 != NULL)) {
- re_scream_pos_data d;
-
- d.scream_olds = &scream_olds;
- d.scream_pos = &scream_pos;
- s = re_intuit_start(rx, sv, s, strend, flags, &d);
- if (!s) {
- DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, "Not present...\n"));
- goto phooey; /* not present */
- }
- }
-
-
/* Simplest case: anchored match need be tried only once. */
/* [unless only anchor is BOL and multiline is set] */
if (prog->extflags & (RXf_ANCH & ~RXf_ANCH_GPOS)) {
- if (s == startpos && regtry(®info, &startpos))
+ if (s == startpos && regtry(reginfo, &s))
goto got_it;
else if (multiline || (prog->intflags & PREGf_IMPLICIT)
|| (prog->extflags & RXf_ANCH_MBOL)) /* XXXX SBOL? */
if (s == startpos)
goto after_try_utf8;
while (1) {
- if (regtry(®info, &s)) {
+ if (regtry(reginfo, &s)) {
goto got_it;
}
after_try_utf8:
goto phooey;
}
if (prog->extflags & RXf_USE_INTUIT) {
- s = re_intuit_start(rx, sv, s + UTF8SKIP(s), strend, flags, NULL);
+ s = re_intuit_start(rx, sv, strbeg,
+ s + UTF8SKIP(s), strend, flags, NULL);
if (!s) {
goto phooey;
}
goto after_try_latin;
}
while (1) {
- if (regtry(®info, &s)) {
+ if (regtry(reginfo, &s)) {
goto got_it;
}
after_try_latin:
goto phooey;
}
if (prog->extflags & RXf_USE_INTUIT) {
- s = re_intuit_start(rx, sv, s + 1, strend, flags, NULL);
+ s = re_intuit_start(rx, sv, strbeg,
+ s + 1, strend, flags, NULL);
if (!s) {
goto phooey;
}
/* We can use a more efficient search as newlines are the same in unicode as they are in latin */
while (s <= end) { /* note it could be possible to match at the end of the string */
if (*s++ == '\n') { /* don't need PL_utf8skip here */
- if (regtry(®info, &s))
+ if (regtry(reginfo, &s))
goto got_it;
}
}
goto phooey;
} else if (RXf_GPOS_CHECK == (prog->extflags & RXf_GPOS_CHECK))
{
- /* the warning about reginfo.ganch being used without initialization
+ /* the warning about reginfo->ganch being used without initialization
is bogus -- we set it above, when prog->extflags & RXf_GPOS_SEEN
and we only enter this block when the same bit is set. */
- char *tmp_s = reginfo.ganch - prog->gofs;
+ char *tmp_s = reginfo->ganch - prog->gofs;
- if (tmp_s >= strbeg && regtry(®info, &tmp_s))
+ if (s <= tmp_s && regtry(reginfo, &tmp_s))
goto got_it;
goto phooey;
}
REXEC_FBC_SCAN(
if (*s == ch) {
DEBUG_EXECUTE_r( did_match = 1 );
- if (regtry(®info, &s)) goto got_it;
+ if (regtry(reginfo, &s)) goto got_it;
s += UTF8SKIP(s);
while (s < strend && *s == ch)
s += UTF8SKIP(s);
REXEC_FBC_SCAN(
if (*s == ch) {
DEBUG_EXECUTE_r( did_match = 1 );
- if (regtry(®info, &s)) goto got_it;
+ if (regtry(reginfo, &s)) goto got_it;
s++;
while (s < strend && *s == ch)
s++;
-(I32)(CHR_SVLEN(must)
- (SvTAIL(must) != 0) + back_min), strbeg);
}
- if (s > PL_bostr)
+ if (s > reginfo->strbeg)
last1 = HOPc(s, -1);
else
last1 = s - 1; /* bogus */
/* XXXX check_substr already used to find "s", can optimize if
check_substr==must. */
- scream_pos = -1;
dontbother = end_shift;
strend = HOPc(strend, -dontbother);
while ( (s <= last) &&
s = HOPc(s, -back_max);
}
else {
- char * const t = (last1 >= PL_bostr) ? HOPc(last1, 1) : last1 + 1;
+ char * const t = (last1 >= reginfo->strbeg)
+ ? HOPc(last1, 1) : last1 + 1;
last1 = HOPc(s, -back_min);
s = t;
}
if (utf8_target) {
while (s <= last1) {
- if (regtry(®info, &s))
+ if (regtry(reginfo, &s))
goto got_it;
if (s >= last1) {
s++; /* to break out of outer loop */
}
else {
while (s <= last1) {
- if (regtry(®info, &s))
+ if (regtry(reginfo, &s))
goto got_it;
s++;
}
quoted, (int)(strend - s));
}
});
- if (find_byclass(prog, c, s, strend, ®info, reginfo.is_utf8_pat))
+ if (find_byclass(prog, c, s, strend, reginfo))
goto got_it;
DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, "Contradicts stclass... [regexec_flags]\n"));
}
/* We don't know much -- general case. */
if (utf8_target) {
for (;;) {
- if (regtry(®info, &s))
+ if (regtry(reginfo, &s))
goto got_it;
if (s >= strend)
break;
}
else {
do {
- if (regtry(®info, &s))
+ if (regtry(reginfo, &s))
goto got_it;
} while (s++ < strend);
}
goto phooey;
got_it:
+ /* s/// doesn't like it if $& is earlier than where we asked it to
+ * start searching (which can happen on something like /.\G/) */
+ if ( (flags & REXEC_FAIL_ON_UNDERFLOW)
+ && (prog->offs[0].start < stringarg - strbeg))
+ {
+ /* this should only be possible under \G */
+ assert(prog->extflags & RXf_GPOS_SEEN);
+ DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log,
+ "matched, but failing for REXEC_FAIL_ON_UNDERFLOW\n"));
+ goto phooey;
+ }
+
DEBUG_BUFFERS_r(
if (swap)
PerlIO_printf(Perl_debug_log,
);
Safefree(swap);
- if (PL_reg_state.re_state_eval_setup_done)
- restore_pos(aTHX_ prog);
- if (RXp_PAREN_NAMES(prog))
- (void)hv_iterinit(RXp_PAREN_NAMES(prog));
+ /* clean up; this will trigger destructors that will free all slabs
+ * above the current one, and cleanup the regmatch_info_aux
+ * and regmatch_info_aux_eval sructs */
- /* make sure $`, $&, $', and $digit will work later */
- if ( !(flags & REXEC_NOT_FIRST) ) {
- if (flags & REXEC_COPY_STR) {
-#ifdef PERL_ANY_COW
- if (SvCANCOW(sv)) {
- if (DEBUG_C_TEST) {
- PerlIO_printf(Perl_debug_log,
- "Copy on write: regexp capture, type %d\n",
- (int) SvTYPE(sv));
- }
- RX_MATCH_COPY_FREE(rx);
- prog->saved_copy = sv_setsv_cow(prog->saved_copy, sv);
- prog->subbeg = (char *)SvPVX_const(prog->saved_copy);
- assert (SvPOKp(prog->saved_copy));
- prog->sublen = PL_regeol - strbeg;
- prog->suboffset = 0;
- prog->subcoffset = 0;
- } else
-#endif
- {
- I32 min = 0;
- I32 max = PL_regeol - strbeg;
- I32 sublen;
-
- if ( (flags & REXEC_COPY_SKIP_POST)
- && !(RX_EXTFLAGS(rx) & RXf_PMf_KEEPCOPY) /* //p */
- && !(PL_sawampersand & SAWAMPERSAND_RIGHT)
- ) { /* don't copy $' part of string */
- U32 n = 0;
- max = -1;
- /* calculate the right-most part of the string covered
- * by a capture. Due to look-ahead, this may be to
- * the right of $&, so we have to scan all captures */
- while (n <= prog->lastparen) {
- if (prog->offs[n].end > max)
- max = prog->offs[n].end;
- n++;
- }
- if (max == -1)
- max = (PL_sawampersand & SAWAMPERSAND_LEFT)
- ? prog->offs[0].start
- : 0;
- assert(max >= 0 && max <= PL_regeol - strbeg);
- }
-
- if ( (flags & REXEC_COPY_SKIP_PRE)
- && !(RX_EXTFLAGS(rx) & RXf_PMf_KEEPCOPY) /* //p */
- && !(PL_sawampersand & SAWAMPERSAND_LEFT)
- ) { /* don't copy $` part of string */
- U32 n = 0;
- min = max;
- /* calculate the left-most part of the string covered
- * by a capture. Due to look-behind, this may be to
- * the left of $&, so we have to scan all captures */
- while (min && n <= prog->lastparen) {
- if ( prog->offs[n].start != -1
- && prog->offs[n].start < min)
- {
- min = prog->offs[n].start;
- }
- n++;
- }
- if ((PL_sawampersand & SAWAMPERSAND_RIGHT)
- && min > prog->offs[0].end
- )
- min = prog->offs[0].end;
+ LEAVE_SCOPE(oldsave);
- }
+ if (RXp_PAREN_NAMES(prog))
+ (void)hv_iterinit(RXp_PAREN_NAMES(prog));
- assert(min >= 0 && min <= max && min <= PL_regeol - strbeg);
- sublen = max - min;
+ RX_MATCH_UTF8_set(rx, utf8_target);
- if (RX_MATCH_COPIED(rx)) {
- if (sublen > prog->sublen)
- prog->subbeg =
- (char*)saferealloc(prog->subbeg, sublen+1);
- }
- else
- prog->subbeg = (char*)safemalloc(sublen+1);
- Copy(strbeg + min, prog->subbeg, sublen, char);
- prog->subbeg[sublen] = '\0';
- prog->suboffset = min;
- prog->sublen = sublen;
- RX_MATCH_COPIED_on(rx);
- }
- prog->subcoffset = prog->suboffset;
- if (prog->suboffset && utf8_target) {
- /* Convert byte offset to chars.
- * XXX ideally should only compute this if @-/@+
- * has been seen, a la PL_sawampersand ??? */
-
- /* If there's a direct correspondence between the
- * string which we're matching and the original SV,
- * then we can use the utf8 len cache associated with
- * the SV. In particular, it means that under //g,
- * sv_pos_b2u() will use the previously cached
- * position to speed up working out the new length of
- * subcoffset, rather than counting from the start of
- * the string each time. This stops
- * $x = "\x{100}" x 1E6; 1 while $x =~ /(.)/g;
- * from going quadratic */
- if (SvPOKp(sv) && SvPVX(sv) == strbeg)
- sv_pos_b2u(sv, &(prog->subcoffset));
- else
- prog->subcoffset = utf8_length((U8*)strbeg,
- (U8*)(strbeg+prog->suboffset));
- }
- }
- else {
- RX_MATCH_COPY_FREE(rx);
- prog->subbeg = strbeg;
- prog->suboffset = 0;
- prog->subcoffset = 0;
- prog->sublen = PL_regeol - strbeg; /* strend may have been modified */
- }
- }
+ /* make sure $`, $&, $', and $digit will work later */
+ if ( !(flags & REXEC_NOT_FIRST) )
+ S_reg_set_capture_string(aTHX_ rx,
+ strbeg, reginfo->strend,
+ sv, flags, utf8_target);
return 1;
phooey:
DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, "%sMatch failed%s\n",
PL_colors[4], PL_colors[5]));
- if (PL_reg_state.re_state_eval_setup_done)
- restore_pos(aTHX_ prog);
+
+ /* clean up; this will trigger destructors that will free all slabs
+ * above the current one, and cleanup the regmatch_info_aux
+ * and regmatch_info_aux_eval sructs */
+
+ LEAVE_SCOPE(oldsave);
+
if (swap) {
/* we failed :-( roll it back */
DEBUG_BUFFERS_r(PerlIO_printf(Perl_debug_log,
}
-/* Set which rex is pointed to by PL_reg_state, handling ref counting.
+/* Set which rex is pointed to by PL_reg_curpm, handling ref counting.
* Do inc before dec, in case old and new rex are the same */
#define SET_reg_curpm(Re2) \
- if (PL_reg_state.re_state_eval_setup_done) { \
+ if (reginfo->info_aux_eval) { \
(void)ReREFCNT_inc(Re2); \
ReREFCNT_dec(PM_GETRE(PL_reg_curpm)); \
PM_SETRE((PL_reg_curpm), (Re2)); \
reginfo->cutpoint=NULL;
- if ((prog->extflags & RXf_EVAL_SEEN)
- && !PL_reg_state.re_state_eval_setup_done)
- {
- MAGIC *mg;
-
- PL_reg_state.re_state_eval_setup_done = TRUE;
- if (reginfo->sv) {
- /* Make $_ available to executed code. */
- if (reginfo->sv != DEFSV) {
- SAVE_DEFSV;
- DEFSV_set(reginfo->sv);
- }
-
- if (!(SvTYPE(reginfo->sv) >= SVt_PVMG && SvMAGIC(reginfo->sv)
- && (mg = mg_find(reginfo->sv, PERL_MAGIC_regex_global)))) {
- /* prepare for quick setting of pos */
-#ifdef PERL_OLD_COPY_ON_WRITE
- if (SvIsCOW(reginfo->sv))
- sv_force_normal_flags(reginfo->sv, 0);
-#endif
- mg = sv_magicext(reginfo->sv, NULL, PERL_MAGIC_regex_global,
- &PL_vtbl_mglob, NULL, 0);
- mg->mg_len = -1;
- }
- PL_reg_magic = mg;
- PL_reg_oldpos = mg->mg_len;
- SAVEDESTRUCTOR_X(restore_pos, prog);
- }
- if (!PL_reg_curpm) {
- Newxz(PL_reg_curpm, 1, PMOP);
-#ifdef USE_ITHREADS
- {
- SV* const repointer = &PL_sv_undef;
- /* this regexp is also owned by the new PL_reg_curpm, which
- will try to free it. */
- av_push(PL_regex_padav, repointer);
- PL_reg_curpm->op_pmoffset = av_len(PL_regex_padav);
- PL_regex_pad = AvARRAY(PL_regex_padav);
- }
-#endif
- }
- SET_reg_curpm(rx);
- PL_reg_oldcurpm = PL_curpm;
- PL_curpm = PL_reg_curpm;
- if (RXp_MATCH_COPIED(prog)) {
- /* Here is a serious problem: we cannot rewrite subbeg,
- since it may be needed if this match fails. Thus
- $` inside (?{}) could fail... */
- PL_reg_oldsaved = prog->subbeg;
- PL_reg_oldsavedlen = prog->sublen;
- PL_reg_oldsavedoffset = prog->suboffset;
- PL_reg_oldsavedcoffset = prog->suboffset;
-#ifdef PERL_ANY_COW
- PL_nrs = prog->saved_copy;
-#endif
- RXp_MATCH_COPIED_off(prog);
- }
- else
- PL_reg_oldsaved = NULL;
- prog->subbeg = PL_bostr;
- prog->suboffset = 0;
- prog->subcoffset = 0;
- prog->sublen = PL_regeol - PL_bostr; /* strend may have been modified */
- }
-#ifdef DEBUGGING
- PL_reg_starttry = *startposp;
-#endif
- prog->offs[0].start = *startposp - PL_bostr;
+ prog->offs[0].start = *startposp - reginfo->strbeg;
prog->lastparen = 0;
prog->lastcloseparen = 0;
"unreachable code" warnings, which are bogus, but distracting. */
#define CACHEsayNO \
if (ST.cache_mask) \
- PL_reg_poscache[ST.cache_offset] |= ST.cache_mask; \
+ reginfo->info_aux->poscache[ST.cache_offset] |= ST.cache_mask; \
sayNO
/* this is used to determine how far from the left messages like
#define CHRTEST_NOT_A_CP_1 -999
#define CHRTEST_NOT_A_CP_2 -998
-#define SLAB_FIRST(s) (&(s)->states[0])
-#define SLAB_LAST(s) (&(s)->states[PERL_REGMATCH_SLAB_SLOTS-1])
-
/* grab a new slab and return the first slot in it */
STATIC regmatch_state *
}
-/* free all slabs above current one - called during LEAVE_SCOPE */
-
-STATIC void
-S_clear_backtrack_stack(pTHX_ void *p)
-{
- regmatch_slab *s = PL_regmatch_slab->next;
- PERL_UNUSED_ARG(p);
-
- if (!s)
- return;
- PL_regmatch_slab->next = NULL;
- while (s) {
- regmatch_slab * const osl = s;
- s = s->next;
- Safefree(osl);
- }
-}
static bool
S_setup_EXACTISH_ST_c1_c2(pTHX_ const regnode * const text_node, int *c1p,
- U8* c1_utf8, int *c2p, U8* c2_utf8, bool is_utf8_pat)
+ U8* c1_utf8, int *c2p, U8* c2_utf8, regmatch_info *reginfo)
{
/* This function determines if there are one or two characters that match
* the first character of the passed-in EXACTish node <text_node>, and if
* point (unless inappropriately coerced to unsigned). *<c1p> will equal
* *<c2p> if and only if <c1_utf8> and <c2_utf8> are the same. */
- const bool utf8_target = PL_reg_match_utf8;
+ const bool utf8_target = reginfo->is_utf8_target;
UV c1 = CHRTEST_NOT_A_CP_1;
UV c2 = CHRTEST_NOT_A_CP_2;
bool use_chrtest_void = FALSE;
+ const bool is_utf8_pat = reginfo->is_utf8_pat;
/* Used when we have both utf8 input and utf8 output, to avoid converting
* to/from code points */
dMY_CXT;
#endif
dVAR;
- const bool utf8_target = PL_reg_match_utf8;
+ const bool utf8_target = reginfo->is_utf8_target;
const U32 uniflags = UTF8_ALLOW_DEFAULT;
REGEXP *rex_sv = reginfo->prog;
regexp *rex = ReANY(rex_sv);
RXi_GET_DECL(rex,rexi);
- I32 oldsave;
/* the current state. This is a cached copy of PL_regmatch_state */
regmatch_state *st;
/* cache heavy used fields of st in registers */
DEBUG_OPTIMISE_r( DEBUG_EXECUTE_r({
PerlIO_printf(Perl_debug_log,"regmatch start\n");
}));
- /* on first ever call to regmatch, allocate first slab */
- if (!PL_regmatch_slab) {
- Newx(PL_regmatch_slab, 1, regmatch_slab);
- PL_regmatch_slab->prev = NULL;
- PL_regmatch_slab->next = NULL;
- PL_regmatch_state = SLAB_FIRST(PL_regmatch_slab);
- }
- oldsave = PL_savestack_ix;
- SAVEDESTRUCTOR_X(S_clear_backtrack_stack, NULL);
- SAVEVPTR(PL_regmatch_slab);
- SAVEVPTR(PL_regmatch_state);
-
- /* grab next free state slot */
- st = ++PL_regmatch_state;
- if (st > SLAB_LAST(PL_regmatch_slab))
- st = PL_regmatch_state = S_push_slab(aTHX);
+ st = PL_regmatch_state;
/* Note that nextchr is a byte even in UTF */
SET_nextchr;
switch (state_num) {
case BOL: /* /^../ */
- if (locinput == PL_bostr)
- {
- /* reginfo->till = reginfo->bol; */
+ if (locinput == reginfo->strbeg)
break;
- }
sayNO;
case MBOL: /* /^../m */
- if (locinput == PL_bostr ||
+ if (locinput == reginfo->strbeg ||
(!NEXTCHR_IS_EOS && locinput[-1] == '\n'))
{
break;
sayNO;
case SBOL: /* /^../s */
- if (locinput == PL_bostr)
+ if (locinput == reginfo->strbeg)
break;
sayNO;
case KEEPS: /* \K */
/* update the startpoint */
st->u.keeper.val = rex->offs[0].start;
- rex->offs[0].start = locinput - PL_bostr;
+ rex->offs[0].start = locinput - reginfo->strbeg;
PUSH_STATE_GOTO(KEEPS_next, next, locinput);
assert(0); /*NOTREACHED*/
case KEEPS_next_fail:
seol:
if (!NEXTCHR_IS_EOS && nextchr != '\n')
sayNO;
- if (PL_regeol - locinput > 1)
+ if (reginfo->strend - locinput > 1)
sayNO;
break;
shortest accept state and the wordnum of the longest
accept state */
- while ( state && uc <= (U8*)PL_regeol ) {
+ while ( state && uc <= (U8*)(reginfo->strend) ) {
U32 base = trie->states[ state ].trans.base;
UV uvc = 0;
U16 charid = 0;
});
/* read a char and goto next state */
- if ( base && (foldlen || uc < (U8*)PL_regeol)) {
+ if ( base && (foldlen || uc < (U8*)(reginfo->strend))) {
I32 offset;
REXEC_TRIE_READ_CHAR(trie_type, trie, widecharmap, uc,
uscan, len, uvc, charid, foldlen,
* is an invariant, but there are tests in the test suite
* dealing with (??{...}) which violate this) */
while (s < e) {
- if (l >= PL_regeol || UTF8_IS_ABOVE_LATIN1(* (U8*) l)) {
+ if (l >= reginfo->strend
+ || UTF8_IS_ABOVE_LATIN1(* (U8*) l))
+ {
sayNO;
}
if (UTF8_IS_INVARIANT(*(U8*)l)) {
else {
/* The target is not utf8, the pattern is utf8. */
while (s < e) {
- if (l >= PL_regeol || UTF8_IS_ABOVE_LATIN1(* (U8*) s))
+ if (l >= reginfo->strend
+ || UTF8_IS_ABOVE_LATIN1(* (U8*) s))
{
sayNO;
}
else {
/* The target and the pattern have the same utf8ness. */
/* Inline the first character, for speed. */
- if (PL_regeol - locinput < ln
+ if (reginfo->strend - locinput < ln
|| UCHARAT(s) != nextchr
|| (ln > 1 && memNE(s, locinput, ln)))
{
/* Either target or the pattern are utf8, or has the issue where
* the fold lengths may differ. */
const char * const l = locinput;
- char *e = PL_regeol;
+ char *e = reginfo->strend;
if (! foldEQ_utf8_flags(s, 0, ln, is_utf8_pat,
l, &e, 0, utf8_target, fold_utf8_flags))
{
sayNO;
}
- if (PL_regeol - locinput < ln)
+ if (reginfo->strend - locinput < ln)
sayNO;
if (ln > 1 && ! folder(s, locinput, ln))
sayNO;
&& FLAGS(scan) != REGEX_ASCII_RESTRICTED_CHARSET
&& FLAGS(scan) != REGEX_ASCII_MORE_RESTRICTED_CHARSET)
{
- if (locinput == PL_bostr)
+ if (locinput == reginfo->strbeg)
ln = '\n';
else {
- const U8 * const r = reghop3((U8*)locinput, -1, (U8*)PL_bostr);
+ const U8 * const r =
+ reghop3((U8*)locinput, -1, (U8*)(reginfo->strbeg));
ln = utf8n_to_uvchr(r, UTF8SKIP(r), 0, uniflags);
}
* byte is never mistakable for ASCII, and so the test
* will say it is not a word character, which is the
* correct answer. */
- ln = (locinput != PL_bostr) ?
+ ln = (locinput != reginfo->strbeg) ?
UCHARAT(locinput - 1) : '\n';
switch (FLAGS(scan)) {
case REGEX_UNICODE_CHARSET:
locinput++; /* Match the . or CR */
if (nextchr == '\r' /* And if it was CR, and the next is LF,
match the LF */
- && locinput < PL_regeol
+ && locinput < reginfo->strend
&& UCHARAT(locinput) == '\n')
{
locinput++;
else {
/* Utf8: See if is ( CR LF ); already know that locinput <
- * PL_regeol, so locinput+1 is in bounds */
- if ( nextchr == '\r' && locinput+1 < PL_regeol
+ * reginfo->strend, so locinput+1 is in bounds */
+ if ( nextchr == '\r' && locinput+1 < reginfo->strend
&& UCHARAT(locinput + 1) == '\n')
{
locinput += 2;
LOAD_UTF8_CHARCLASS_GCB();
/* Match (prepend)* */
- while (locinput < PL_regeol
+ while (locinput < reginfo->strend
&& (len = is_GCB_Prepend_utf8(locinput)))
{
previous_prepend = locinput;
* the next thing won't match, back off the last prepend we
* matched, as it is guaranteed to match the begin */
if (previous_prepend
- && (locinput >= PL_regeol
+ && (locinput >= reginfo->strend
|| (! swash_fetch(PL_utf8_X_regular_begin,
(U8*)locinput, utf8_target)
&& ! is_GCB_SPECIAL_BEGIN_START_utf8(locinput)))
locinput = previous_prepend;
}
- /* Note that here we know PL_regeol > locinput, as we
+ /* Note that here we know reginfo->strend > locinput, as we
* tested that upon input to this switch case, and if we
* moved locinput forward, we tested the result just above
* and it either passed, or we backed off so that it will
* RI+ */
if ((len = is_GCB_RI_utf8(locinput))) {
locinput += len;
- while (locinput < PL_regeol
+ while (locinput < reginfo->strend
&& (len = is_GCB_RI_utf8(locinput)))
{
locinput += len;
} else if ((len = is_GCB_T_utf8(locinput))) {
/* Another possibility is T+ */
locinput += len;
- while (locinput < PL_regeol
+ while (locinput < reginfo->strend
&& (len = is_GCB_T_utf8(locinput)))
{
locinput += len;
* L* (L | LVT T* | V * V* T* | LV V* T*) */
/* Match L* */
- while (locinput < PL_regeol
+ while (locinput < reginfo->strend
&& (len = is_GCB_L_utf8(locinput)))
{
locinput += len;
* equation, we have a complete hangul syllable.
* Are done. */
- if (locinput < PL_regeol
+ if (locinput < reginfo->strend
&& is_GCB_LV_LVT_V_utf8(locinput))
{
/* Otherwise keep going. Must be LV, LVT or V.
/* Must be V or LV. Take it, then match
* V* */
locinput += UTF8SKIP(locinput);
- while (locinput < PL_regeol
+ while (locinput < reginfo->strend
&& (len = is_GCB_V_utf8(locinput)))
{
locinput += len;
/* And any of LV, LVT, or V can be followed
* by T* */
- while (locinput < PL_regeol
+ while (locinput < reginfo->strend
&& (len = is_GCB_T_utf8(locinput)))
{
locinput += len;
}
/* Match any extender */
- while (locinput < PL_regeol
+ while (locinput < reginfo->strend
&& swash_fetch(PL_utf8_X_extend,
(U8*)locinput, utf8_target))
{
}
}
exit_utf8:
- if (locinput > PL_regeol) sayNO;
+ if (locinput > reginfo->strend) sayNO;
}
break;
op. */
/* don't initialize these in the declaration, it makes C++
unhappy */
- char *s;
+ const char *s;
char type;
re_fold_t folder;
const U8 *fold_array;
do_nref_ref_common:
ln = rex->offs[n].start;
- PL_reg_leftiter = PL_reg_maxiter; /* Void cache */
+ reginfo->poscache_iter = reginfo->poscache_maxiter; /* Void cache */
if (rex->lastparen < n || ln == -1)
sayNO; /* Do not match unless seen CLOSEn. */
if (ln == rex->offs[n].end)
break;
- s = PL_bostr + ln;
+ s = reginfo->strbeg + ln;
if (type != REF /* REF can do byte comparison */
&& (utf8_target || type == REFFU))
{ /* XXX handle REFFL better */
- char * limit = PL_regeol;
+ char * limit = reginfo->strend;
/* This call case insensitively compares the entire buffer
* at s, with the current input starting at locinput, but
- * not going off the end given by PL_regeol, and returns in
- * <limit> upon success, how much of the current input was
- * matched */
+ * not going off the end given by reginfo->strend, and
+ * returns in <limit> upon success, how much of the
+ * current input was matched */
if (! foldEQ_utf8_flags(s, NULL, rex->offs[n].end - ln, utf8_target,
locinput, &limit, 0, utf8_target, utf8_fold_flags))
{
UCHARAT(s) != fold_array[nextchr]))
sayNO;
ln = rex->offs[n].end - ln;
- if (locinput + ln > PL_regeol)
+ if (locinput + ln > reginfo->strend)
sayNO;
if (ln > 1 && (type == REF
? memNE(s, locinput, ln)
OP * const oop = PL_op;
COP * const ocurcop = PL_curcop;
OP *nop;
- char *saved_regeol = PL_regeol;
- struct re_save_state saved_state;
CV *newcv;
/* save *all* paren positions */
regcppush(rex, 0, maxopenparen);
REGCP_SET(runops_cp);
- /* To not corrupt the existing regex state while executing the
- * eval we would normally put it on the save stack, like with
- * save_re_context. However, re-evals have a weird scoping so we
- * can't just add ENTER/LEAVE here. With that, things like
- *
- * (?{$a=2})(a(?{local$a=$a+1}))*aak*c(?{$b=$a})
- *
- * would break, as they expect the localisation to be unwound
- * only when the re-engine backtracks through the bit that
- * localised it.
- *
- * What we do instead is just saving the state in a local c
- * variable.
- */
- Copy(&PL_reg_state, &saved_state, 1, struct re_save_state);
-
if (!caller_cv)
caller_cv = find_runcv(NULL);
DEBUG_STATE_r( PerlIO_printf(Perl_debug_log,
" re EVAL PL_op=0x%"UVxf"\n", PTR2UV(nop)) );
- rex->offs[0].end = PL_reg_magic->mg_len = locinput - PL_bostr;
+ rex->offs[0].end = locinput - reginfo->strbeg;
+ if (reginfo->info_aux_eval->pos_magic)
+ reginfo->info_aux_eval->pos_magic->mg_len
+ = locinput - reginfo->strbeg;
if (sv_yes_mark) {
SV *sv_mrk = get_sv("REGMARK", 1);
}
- Copy(&saved_state, &PL_reg_state, 1, struct re_save_state);
-
/* *** Note that at this point we don't restore
* PL_comppad, (or pop the CxSUB) on the assumption it may
* be used again soon. This is safe as long as nothing
* in the regexp code uses the pad ! */
PL_op = oop;
PL_curcop = ocurcop;
- PL_regeol = saved_regeol;
S_regcp_restore(aTHX_ rex, runops_cp, &maxopenparen);
+ PL_curpm = PL_reg_curpm;
if (logical != 2)
break;
re->subcoffset = rex->subcoffset;
rei = RXi_GET(re);
DEBUG_EXECUTE_r(
- debug_start_match(re_sv, utf8_target, locinput, PL_regeol,
- "Matching embedded");
+ debug_start_match(re_sv, utf8_target, locinput,
+ reginfo->strend, "Matching embedded");
);
startpoint = rei->program + 1;
ST.close_paren = 0; /* only used for GOSUB */
maxopenparen = 0;
- /* XXXX This is too dramatic a measure... */
- PL_reg_maxiter = 0;
+ /* invalidate the S-L poscache. We're now executing a
+ * different set of WHILEM ops (and their associated
+ * indexes) against the same string, so the bits in the
+ * cache are meaningless. Setting maxiter to zero forces
+ * the cache to be invalidated and zeroed before reuse.
+ * XXX This is too dramatic a measure. Ideally we should
+ * save the old cache and restore when running the outer
+ * pattern again */
+ reginfo->poscache_maxiter = 0;
- ST.saved_utf8_pat = is_utf8_pat;
- is_utf8_pat = cBOOL(RX_UTF8(re_sv));
+ is_utf8_pat = reginfo->is_utf8_pat = cBOOL(RX_UTF8(re_sv));
ST.prev_rex = rex_sv;
ST.prev_curlyx = cur_curlyx;
case EVAL_AB: /* cleanup after a successful (??{A})B */
/* note: this is called twice; first after popping B, then A */
- is_utf8_pat = ST.saved_utf8_pat;
rex_sv = ST.prev_rex;
+ is_utf8_pat = reginfo->is_utf8_pat = cBOOL(RX_UTF8(rex_sv));
SET_reg_curpm(rex_sv);
rex = ReANY(rex_sv);
rexi = RXi_GET(rex);
cur_eval = ST.prev_eval;
cur_curlyx = ST.prev_curlyx;
- /* XXXX This is too dramatic a measure... */
- PL_reg_maxiter = 0;
+ /* Invalidate cache. See "invalidate" comment above. */
+ reginfo->poscache_maxiter = 0;
if ( nochange_depth )
nochange_depth--;
sayYES;
case EVAL_AB_fail: /* unsuccessfully ran A or B in (??{A})B */
/* note: this is called twice; first after popping B, then A */
- is_utf8_pat = ST.saved_utf8_pat;
rex_sv = ST.prev_rex;
+ is_utf8_pat = reginfo->is_utf8_pat = cBOOL(RX_UTF8(rex_sv));
SET_reg_curpm(rex_sv);
rex = ReANY(rex_sv);
rexi = RXi_GET(rex);
regcppop(rex, &maxopenparen);
cur_eval = ST.prev_eval;
cur_curlyx = ST.prev_curlyx;
- /* XXXX This is too dramatic a measure... */
- PL_reg_maxiter = 0;
+ /* Invalidate cache. See "invalidate" comment above. */
+ reginfo->poscache_maxiter = 0;
if ( nochange_depth )
nochange_depth--;
sayNO_SILENT;
case OPEN: /* ( */
n = ARG(scan); /* which paren pair */
- rex->offs[n].start_tmp = locinput - PL_bostr;
+ rex->offs[n].start_tmp = locinput - reginfo->strbeg;
if (n > maxopenparen)
maxopenparen = n;
DEBUG_BUFFERS_r(PerlIO_printf(Perl_debug_log,
/* XXX really need to log other places start/end are set too */
#define CLOSE_CAPTURE \
rex->offs[n].start = rex->offs[n].start_tmp; \
- rex->offs[n].end = locinput - PL_bostr; \
+ rex->offs[n].end = locinput - reginfo->strbeg; \
DEBUG_BUFFERS_r(PerlIO_printf(Perl_debug_log, \
"rex=0x%"UVxf" offs=0x%"UVxf": \\%"UVuf": set %"IVdf"..%"IVdf"\n", \
PTR2UV(rex), \
break;
case IFTHEN: /* (?(cond)A|B) */
- PL_reg_leftiter = PL_reg_maxiter; /* Void cache */
+ reginfo->poscache_iter = reginfo->poscache_maxiter; /* Void cache */
if (sw)
next = NEXTOPER(NEXTOPER(scan));
else {
goto do_whilem_B_max;
}
- /* super-linear cache processing */
+ /* super-linear cache processing.
+ *
+ * The idea here is that for certain types of CURLYX/WHILEM -
+ * principally those whose upper bound is infinity (and
+ * excluding regexes that have things like \1 and other very
+ * non-regular expresssiony things), then if a pattern like
+ * /....A*.../ fails and we backtrack to the WHILEM, then we
+ * make a note that this particular WHILEM op was at string
+ * position 47 (say) when the rest of pattern failed. Then, if
+ * we ever find ourselves back at that WHILEM, and at string
+ * position 47 again, we can just fail immediately rather than
+ * running the rest of the pattern again.
+ *
+ * This is very handy when patterns start to go
+ * 'super-linear', like in (a+)*(a+)*(a+)*, where you end up
+ * with a combinatorial explosion of backtracking.
+ *
+ * The cache is implemented as a bit array, with one bit per
+ * string byte position per WHILEM op (up to 16) - so its
+ * between 0.25 and 2x the string size.
+ *
+ * To avoid allocating a poscache buffer every time, we do an
+ * initially countdown; only after we have executed a WHILEM
+ * op (string-length x #WHILEMs) times do we allocate the
+ * cache.
+ *
+ * The top 4 bits of scan->flags byte say how many different
+ * relevant CURLLYX/WHILEM op pairs there are, while the
+ * bottom 4-bits is the identifying index number of this
+ * WHILEM.
+ */
if (scan->flags) {
- if (!PL_reg_maxiter) {
+ if (!reginfo->poscache_maxiter) {
/* start the countdown: Postpone detection until we
* know the match is not *that* much linear. */
- PL_reg_maxiter = (PL_regeol - PL_bostr + 1) * (scan->flags>>4);
+ reginfo->poscache_maxiter
+ = (reginfo->strend - reginfo->strbeg + 1)
+ * (scan->flags>>4);
/* possible overflow for long strings and many CURLYX's */
- if (PL_reg_maxiter < 0)
- PL_reg_maxiter = I32_MAX;
- PL_reg_leftiter = PL_reg_maxiter;
+ if (reginfo->poscache_maxiter < 0)
+ reginfo->poscache_maxiter = I32_MAX;
+ reginfo->poscache_iter = reginfo->poscache_maxiter;
}
- if (PL_reg_leftiter-- == 0) {
+ if (reginfo->poscache_iter-- == 0) {
/* initialise cache */
- const I32 size = (PL_reg_maxiter + 7)/8;
- if (PL_reg_poscache) {
- if ((I32)PL_reg_poscache_size < size) {
- Renew(PL_reg_poscache, size, char);
- PL_reg_poscache_size = size;
+ const I32 size = (reginfo->poscache_maxiter + 7)/8;
+ regmatch_info_aux *const aux = reginfo->info_aux;
+ if (aux->poscache) {
+ if ((I32)reginfo->poscache_size < size) {
+ Renew(aux->poscache, size, char);
+ reginfo->poscache_size = size;
}
- Zero(PL_reg_poscache, size, char);
+ Zero(aux->poscache, size, char);
}
else {
- PL_reg_poscache_size = size;
- Newxz(PL_reg_poscache, size, char);
+ reginfo->poscache_size = size;
+ Newxz(aux->poscache, size, char);
}
DEBUG_EXECUTE_r( PerlIO_printf(Perl_debug_log,
"%swhilem: Detected a super-linear match, switching on caching%s...\n",
);
}
- if (PL_reg_leftiter < 0) {
+ if (reginfo->poscache_iter < 0) {
/* have we already failed at this position? */
I32 offset, mask;
+
+ reginfo->poscache_iter = -1; /* stop eventual underflow */
offset = (scan->flags & 0xf) - 1
- + (locinput - PL_bostr) * (scan->flags>>4);
+ + (locinput - reginfo->strbeg)
+ * (scan->flags>>4);
mask = 1 << (offset % 8);
offset /= 8;
- if (PL_reg_poscache[offset] & mask) {
+ if (reginfo->info_aux->poscache[offset] & mask) {
DEBUG_EXECUTE_r( PerlIO_printf(Perl_debug_log,
"%*s whilem: (cache) already tried at this position...\n",
REPORT_CODE_OFF+depth*2, "")
ST.count++;
/* after first match, determine A's length: u.curlym.alen */
if (ST.count == 1) {
- if (PL_reg_match_utf8) {
+ if (reginfo->is_utf8_target) {
char *s = st->locinput;
while (s < locinput) {
ST.alen++;
if (PL_regkind[OP(text_node)] == EXACT) {
if (! S_setup_EXACTISH_ST_c1_c2(aTHX_
text_node, &ST.c1, ST.c1_utf8, &ST.c2, ST.c2_utf8,
- is_utf8_pat))
+ reginfo))
{
sayNO;
}
I32 paren = ST.me->flags;
if (ST.count) {
rex->offs[paren].start
- = HOPc(locinput, -ST.alen) - PL_bostr;
- rex->offs[paren].end = locinput - PL_bostr;
+ = HOPc(locinput, -ST.alen) - reginfo->strbeg;
+ rex->offs[paren].end = locinput - reginfo->strbeg;
if ((U32)paren > rex->lastparen)
rex->lastparen = paren;
rex->lastcloseparen = paren;
#define CURLY_SETPAREN(paren, success) \
if (paren) { \
if (success) { \
- rex->offs[paren].start = HOPc(locinput, -1) - PL_bostr; \
- rex->offs[paren].end = locinput - PL_bostr; \
+ rex->offs[paren].start = HOPc(locinput, -1) - reginfo->strbeg; \
+ rex->offs[paren].end = locinput - reginfo->strbeg; \
if (paren > rex->lastparen) \
rex->lastparen = paren; \
rex->lastcloseparen = paren; \
friends need to change. */
if (! S_setup_EXACTISH_ST_c1_c2(aTHX_
text_node, &ST.c1, ST.c1_utf8, &ST.c2, ST.c2_utf8,
- is_utf8_pat))
+ reginfo))
{
sayNO;
}
char *li = locinput;
minmod = 0;
if (ST.min &&
- regrepeat(rex, &li, ST.A, ST.min, depth, is_utf8_pat)
+ regrepeat(rex, &li, ST.A, reginfo, ST.min, depth)
< ST.min)
sayNO;
SET_locinput(li);
/* set ST.maxpos to the furthest point along the
* string that could possibly match */
if (ST.max == REG_INFTY) {
- ST.maxpos = PL_regeol - 1;
+ ST.maxpos = reginfo->strend - 1;
if (utf8_target)
while (UTF8_IS_CONTINUATION(*(U8*)ST.maxpos))
ST.maxpos--;
else if (utf8_target) {
int m = ST.max - ST.min;
for (ST.maxpos = locinput;
- m >0 && ST.maxpos < PL_regeol; m--)
+ m >0 && ST.maxpos < reginfo->strend; m--)
ST.maxpos += UTF8SKIP(ST.maxpos);
}
else {
ST.maxpos = locinput + ST.max - ST.min;
- if (ST.maxpos >= PL_regeol)
- ST.maxpos = PL_regeol - 1;
+ if (ST.maxpos >= reginfo->strend)
+ ST.maxpos = reginfo->strend - 1;
}
goto curly_try_B_min_known;
/* avoid taking address of locinput, so it can remain
* a register var */
char *li = locinput;
- ST.count = regrepeat(rex, &li, ST.A, ST.max, depth,
- is_utf8_pat);
+ ST.count = regrepeat(rex, &li, ST.A, reginfo, ST.max, depth);
if (ST.count < ST.min)
sayNO;
SET_locinput(li);
* locinput matches */
char *li = ST.oldloc;
ST.count += n;
- if (regrepeat(rex, &li, ST.A, n, depth, is_utf8_pat) < n)
+ if (regrepeat(rex, &li, ST.A, reginfo, n, depth) < n)
sayNO;
assert(n == REG_INFTY || locinput == li);
}
/* failed -- move forward one */
{
char *li = locinput;
- if (!regrepeat(rex, &li, ST.A, 1, depth, is_utf8_pat)) {
+ if (!regrepeat(rex, &li, ST.A, reginfo, 1, depth)) {
sayNO;
}
locinput = li;
goto fake_end;
}
{
- bool could_match = locinput < PL_regeol;
+ bool could_match = locinput < reginfo->strend;
/* If it could work, try it. */
if (ST.c1 != CHRTEST_VOID && could_match) {
fake_end:
if (cur_eval) {
/* we've just finished A in /(??{A})B/; now continue with B */
- st->u.eval.saved_utf8_pat = is_utf8_pat;
- is_utf8_pat = cur_eval->u.eval.saved_utf8_pat;
st->u.eval.prev_rex = rex_sv; /* inner */
/* Save *all* the positions. */
st->u.eval.cp = regcppush(rex, 0, maxopenparen);
rex_sv = cur_eval->u.eval.prev_rex;
+ is_utf8_pat = reginfo->is_utf8_pat = cBOOL(RX_UTF8(rex_sv));
SET_reg_curpm(rex_sv);
rex = ReANY(rex_sv);
rexi = RXi_GET(rex);
DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log,
"%sMatch possible, but length=%ld is smaller than requested=%ld, failing!%s\n",
PL_colors[4],
- (long)(locinput - PL_reg_starttry),
- (long)(reginfo->till - PL_reg_starttry),
+ (long)(locinput - startpos),
+ (long)(reginfo->till - startpos),
PL_colors[5]));
sayNO_SILENT; /* Cannot match: too short. */
break;
case COMMIT: /* (*COMMIT) */
- reginfo->cutpoint = PL_regeol;
+ reginfo->cutpoint = reginfo->strend;
/* FALLTHROUGH */
case PRUNE: /* (*PRUNE) */
#undef ST
case LNBREAK: /* \R */
- if ((n=is_LNBREAK_safe(locinput, PL_regeol, utf8_target))) {
+ if ((n=is_LNBREAK_safe(locinput, reginfo->strend, utf8_target))) {
locinput += n;
} else
sayNO;
if (utf8_target) {
locinput += PL_utf8skip[nextchr];
/* locinput is allowed to go 1 char off the end, but not 2+ */
- if (locinput > PL_regeol)
+ if (locinput > reginfo->strend)
sayNO;
}
else
DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, "%sMatch successful!%s\n",
PL_colors[4], PL_colors[5]));
- if (PL_reg_state.re_state_eval_setup_done) {
+ if (reginfo->info_aux_eval) {
/* each successfully executed (?{...}) block does the equivalent of
* local $^R = do {...}
* When popping the save stack, all these locals would be undone;
PERL_UNUSED_VAR(SP);
}
- /* clean up; in particular, free all slabs above current one */
- LEAVE_SCOPE(oldsave);
-
- assert(!result || locinput - PL_bostr >= 0);
- return result ? locinput - PL_bostr : -1;
+ assert(!result || locinput - reginfo->strbeg >= 0);
+ return result ? locinput - reginfo->strbeg : -1;
}
/*
* to point to the byte following the highest successful
* match.
* p - the regnode to be repeatedly matched against.
+ * reginfo - struct holding match state, such as strend
* max - maximum number of things to match.
* depth - (for debugging) backtracking depth.
*/
STATIC I32
S_regrepeat(pTHX_ regexp *prog, char **startposp, const regnode *p,
- I32 max, int depth, bool is_utf8_pat)
+ regmatch_info *const reginfo, I32 max, int depth)
{
dVAR;
char *scan; /* Pointer to current position in target string */
I32 c;
- char *loceol = PL_regeol; /* local version */
+ char *loceol = reginfo->strend; /* local version */
I32 hardcount = 0; /* How many matches so far */
- bool utf8_target = PL_reg_match_utf8;
+ bool utf8_target = reginfo->is_utf8_target;
int to_complement = 0; /* Invert the result? */
UV utf8_flags;
_char_class_number classnum;
scan = *startposp;
if (max == REG_INFTY)
max = I32_MAX;
- else if (! utf8_target && scan + max < loceol)
+ else if (! utf8_target && loceol - scan > max)
loceol = scan + max;
/* Here, for the case of a non-UTF-8 target we have adjusted <loceol> down
scan = loceol;
break;
case CANY: /* Move <scan> forward <max> bytes, unless goes off end */
- if (utf8_target && scan + max < loceol) {
+ if (utf8_target && loceol - scan > max) {
/* <loceol> hadn't been adjusted in the UTF-8 case */
scan += max;
}
break;
case EXACT:
- assert(STR_LEN(p) == is_utf8_pat ? UTF8SKIP(STRING(p)) : 1);
+ assert(STR_LEN(p) == reginfo->is_utf8_pat ? UTF8SKIP(STRING(p)) : 1);
c = (U8)*STRING(p);
* under UTF-8, or both target and pattern aren't UTF-8. Note that we
* can use UTF8_IS_INVARIANT() even if the pattern isn't UTF-8, as it's
* true iff it doesn't matter if the argument is in UTF-8 or not */
- if (UTF8_IS_INVARIANT(c) || (! utf8_target && ! is_utf8_pat)) {
- if (utf8_target && scan + max < loceol) {
+ if (UTF8_IS_INVARIANT(c) || (! utf8_target && ! reginfo->is_utf8_pat)) {
+ if (utf8_target && loceol - scan > max) {
/* We didn't adjust <loceol> because is UTF-8, but ok to do so,
* since here, to match at all, 1 char == 1 byte */
loceol = scan + max;
scan++;
}
}
- else if (is_utf8_pat) {
+ else if (reginfo->is_utf8_pat) {
if (utf8_target) {
STRLEN scan_char_len;
case EXACTFU_SS:
case EXACTFU_TRICKYFOLD:
case EXACTFU:
- utf8_flags = is_utf8_pat ? FOLDEQ_S2_ALREADY_FOLDED : 0;
+ utf8_flags = reginfo->is_utf8_pat ? FOLDEQ_S2_ALREADY_FOLDED : 0;
do_exactf: {
int c1, c2;
U8 c1_utf8[UTF8_MAXBYTES+1], c2_utf8[UTF8_MAXBYTES+1];
- assert(STR_LEN(p) == is_utf8_pat ? UTF8SKIP(STRING(p)) : 1);
+ assert(STR_LEN(p) == reginfo->is_utf8_pat ? UTF8SKIP(STRING(p)) : 1);
if (S_setup_EXACTISH_ST_c1_c2(aTHX_ p, &c1, c1_utf8, &c2, c2_utf8,
- is_utf8_pat))
+ reginfo))
{
if (c1 == CHRTEST_VOID) {
/* Use full Unicode fold matching */
- char *tmpeol = PL_regeol;
- STRLEN pat_len = is_utf8_pat ? UTF8SKIP(STRING(p)) : 1;
+ char *tmpeol = reginfo->strend;
+ STRLEN pat_len = reginfo->is_utf8_pat ? UTF8SKIP(STRING(p)) : 1;
while (hardcount < max
&& foldEQ_utf8_flags(scan, &tmpeol, 0, utf8_target,
STRING(p), NULL, pat_len,
- is_utf8_pat, utf8_flags))
+ reginfo->is_utf8_pat, utf8_flags))
{
scan = tmpeol;
- tmpeol = PL_regeol;
+ tmpeol = reginfo->strend;
hardcount++;
}
}
/* FALLTHROUGH */
case POSIXA:
- if (utf8_target && scan + max < loceol) {
+ if (utf8_target && loceol - scan > max) {
/* We didn't adjust <loceol> at the beginning of this routine
* because is UTF-8, but it is actually ok to do so, since here, to
/* LNBREAK can match one or two latin chars, which is ok, but we
* have to use hardcount in this situation, and throw away the
* adjustment to <loceol> done before the switch statement */
- loceol = PL_regeol;
+ loceol = reginfo->strend;
while (scan < loceol && (c=is_LNBREAK_latin1_safe(scan, loceol))) {
scan+=c;
hardcount++;
return s;
}
+
+/* when executing a regex that may have (?{}), extra stuff needs setting
+ up that will be visible to the called code, even before the current
+ match has finished. In particular:
+
+ * $_ is localised to the SV currently being matched;
+ * pos($_) is created if necessary, ready to be updated on each call-out
+ to code;
+ * a fake PMOP is created that can be set to PL_curpm (normally PL_curpm
+ isn't set until the current pattern is successfully finished), so that
+ $1 etc of the match-so-far can be seen;
+ * save the old values of subbeg etc of the current regex, and set then
+ to the current string (again, this is normally only done at the end
+ of execution)
+*/
+
static void
-restore_pos(pTHX_ void *arg)
+S_setup_eval_state(pTHX_ regmatch_info *const reginfo)
+{
+ MAGIC *mg;
+ regexp *const rex = ReANY(reginfo->prog);
+ regmatch_info_aux_eval *eval_state = reginfo->info_aux_eval;
+
+ eval_state->rex = rex;
+
+ if (reginfo->sv) {
+ /* Make $_ available to executed code. */
+ if (reginfo->sv != DEFSV) {
+ SAVE_DEFSV;
+ DEFSV_set(reginfo->sv);
+ }
+
+ if (!(mg = mg_find_mglob(reginfo->sv))) {
+ /* prepare for quick setting of pos */
+ mg = sv_magicext_mglob(reginfo->sv);
+ mg->mg_len = -1;
+ }
+ eval_state->pos_magic = mg;
+ eval_state->pos = mg->mg_len;
+ }
+ else
+ eval_state->pos_magic = NULL;
+
+ if (!PL_reg_curpm) {
+ /* PL_reg_curpm is a fake PMOP that we can attach the current
+ * regex to and point PL_curpm at, so that $1 et al are visible
+ * within a /(?{})/. It's just allocated once per interpreter the
+ * first time its needed */
+ Newxz(PL_reg_curpm, 1, PMOP);
+#ifdef USE_ITHREADS
+ {
+ SV* const repointer = &PL_sv_undef;
+ /* this regexp is also owned by the new PL_reg_curpm, which
+ will try to free it. */
+ av_push(PL_regex_padav, repointer);
+ PL_reg_curpm->op_pmoffset = av_len(PL_regex_padav);
+ PL_regex_pad = AvARRAY(PL_regex_padav);
+ }
+#endif
+ }
+ SET_reg_curpm(reginfo->prog);
+ eval_state->curpm = PL_curpm;
+ PL_curpm = PL_reg_curpm;
+ if (RXp_MATCH_COPIED(rex)) {
+ /* Here is a serious problem: we cannot rewrite subbeg,
+ since it may be needed if this match fails. Thus
+ $` inside (?{}) could fail... */
+ eval_state->subbeg = rex->subbeg;
+ eval_state->sublen = rex->sublen;
+ eval_state->suboffset = rex->suboffset;
+ eval_state->subcoffset = rex->subcoffset;
+#ifdef PERL_ANY_COW
+ eval_state->saved_copy = rex->saved_copy;
+#endif
+ RXp_MATCH_COPIED_off(rex);
+ }
+ else
+ eval_state->subbeg = NULL;
+ rex->subbeg = (char *)reginfo->strbeg;
+ rex->suboffset = 0;
+ rex->subcoffset = 0;
+ rex->sublen = reginfo->strend - reginfo->strbeg;
+}
+
+
+/* destructor to clear up regmatch_info_aux and regmatch_info_aux_eval */
+
+static void
+S_cleanup_regmatch_info_aux(pTHX_ void *arg)
{
dVAR;
- regexp * const rex = (regexp *)arg;
- if (PL_reg_state.re_state_eval_setup_done) {
- if (PL_reg_oldsaved) {
- rex->subbeg = PL_reg_oldsaved;
- rex->sublen = PL_reg_oldsavedlen;
- rex->suboffset = PL_reg_oldsavedoffset;
- rex->subcoffset = PL_reg_oldsavedcoffset;
+ regmatch_info_aux *aux = (regmatch_info_aux *) arg;
+ regmatch_info_aux_eval *eval_state = aux->info_aux_eval;
+ regmatch_slab *s;
+
+ Safefree(aux->poscache);
+
+ if (eval_state) {
+
+ /* undo the effects of S_setup_eval_state() */
+
+ if (eval_state->subbeg) {
+ regexp * const rex = eval_state->rex;
+ rex->subbeg = eval_state->subbeg;
+ rex->sublen = eval_state->sublen;
+ rex->suboffset = eval_state->suboffset;
+ rex->subcoffset = eval_state->subcoffset;
#ifdef PERL_ANY_COW
- rex->saved_copy = PL_nrs;
+ rex->saved_copy = eval_state->saved_copy;
#endif
- RXp_MATCH_COPIED_on(rex);
- }
- PL_reg_magic->mg_len = PL_reg_oldpos;
- PL_reg_state.re_state_eval_setup_done = FALSE;
- PL_curpm = PL_reg_oldcurpm;
- }
+ RXp_MATCH_COPIED_on(rex);
+ }
+ if (eval_state->pos_magic)
+ eval_state->pos_magic->mg_len = eval_state->pos;
+
+ PL_curpm = eval_state->curpm;
+ }
+
+ PL_regmatch_state = aux->old_regmatch_state;
+ PL_regmatch_slab = aux->old_regmatch_slab;
+
+ /* free all slabs above current one - this must be the last action
+ * of this function, as aux and eval_state are allocated within
+ * slabs and may be freed here */
+
+ s = PL_regmatch_slab->next;
+ if (s) {
+ PL_regmatch_slab->next = NULL;
+ while (s) {
+ regmatch_slab * const osl = s;
+ s = s->next;
+ Safefree(osl);
+ }
+ }
}
+
STATIC void
S_to_utf8_substr(pTHX_ regexp *prog)
{