);
for (p = parenfloor+1; p <= (I32)maxopenparen; p++) {
/* REGCP_PARENS_ELEMS are pushed per pairs of parentheses. */
- SSPUSHINT(rex->offs[p].end);
- SSPUSHINT(rex->offs[p].start);
+ SSPUSHIV(rex->offs[p].end);
+ SSPUSHIV(rex->offs[p].start);
SSPUSHINT(rex->offs[p].start_tmp);
DEBUG_BUFFERS_r(PerlIO_printf(Perl_debug_log,
" \\%"UVuf": %"IVdf"(%"IVdf")..%"IVdf"\n",
for ( ; i > 0; i -= REGCP_PAREN_ELEMS) {
I32 tmps;
rex->offs[paren].start_tmp = SSPOPINT;
- rex->offs[paren].start = SSPOPINT;
- tmps = SSPOPINT;
+ rex->offs[paren].start = SSPOPIV;
+ tmps = SSPOPIV;
if (paren <= rex->lastparen)
rex->offs[paren].end = tmps;
DEBUG_BUFFERS_r( PerlIO_printf(Perl_debug_log,
* with giant delta may be not rechecked).
*/
-/* Assumptions: if ANCH_GPOS, then strpos is anchored. XXXX Check GPOS logic */
-
/* If SCREAM, then SvPVX_const(sv) should be compatible with strpos and strend.
Otherwise, only SvCUR(sv) is used to get strbeg. */
-/* XXXX We assume that strpos is strbeg unless sv. */
-
/* XXXX Some places assume that there is a fixed substring.
An update may be needed if optimizer marks as "INTUITable"
RExen without fixed substrings. Similarly, it is assumed that
{
dVAR;
struct regexp *const prog = ReANY(rx);
- I32 start_shift = 0;
+ SSize_t start_shift = 0;
/* Should be nonnegative! */
I32 end_shift = 0;
char *s;
}
check = prog->check_substr;
}
- if (prog->extflags & RXf_ANCH) { /* Match at beg-of-str or after \n */
- ml_anch = !( (prog->extflags & RXf_ANCH_SINGLE)
+ if ((prog->extflags & RXf_ANCH) /* Match at beg-of-str or after \n */
+ && !(prog->extflags & RXf_ANCH_GPOS)) /* \G isn't a BOS or \n */
+ {
+ ml_anch = !( (prog->extflags & RXf_ANCH_SINGLE)
|| ( (prog->extflags & RXf_ANCH_BOL)
&& !multiline ) ); /* Check after \n? */
if (!ml_anch) {
- if ( !(prog->extflags & RXf_ANCH_GPOS) /* Checked by the caller */
- && !(prog->intflags & PREGf_IMPLICIT) /* not a real BOL */
+ if ( !(prog->intflags & PREGf_IMPLICIT) /* not a real BOL */
&& (strpos != strbeg)) {
DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, "Not at start...\n"));
goto fail;
the "check" substring in the region corrected by start/end_shift. */
{
- I32 srch_start_shift = start_shift;
+ SSize_t srch_start_shift = start_shift;
I32 srch_end_shift = end_shift;
U8* start_point;
U8* end_point;
"Copy on write: regexp capture, type %d\n",
(int) SvTYPE(sv));
}
- RX_MATCH_COPY_FREE(rx);
- prog->saved_copy = sv_setsv_cow(prog->saved_copy, sv);
+ /* Create a new COW SV to share the match string and store
+ * in saved_copy, unless the current COW SV in saved_copy
+ * is valid and suitable for our purpose */
+ if (( prog->saved_copy
+ && SvIsCOW(prog->saved_copy)
+ && SvPOKp(prog->saved_copy)
+ && SvIsCOW(sv)
+ && SvPOKp(sv)
+ && SvPVX(sv) == SvPVX(prog->saved_copy)))
+ {
+ /* just reuse saved_copy SV */
+ if (RXp_MATCH_COPIED(prog)) {
+ Safefree(prog->subbeg);
+ RXp_MATCH_COPIED_off(prog);
+ }
+ }
+ else {
+ /* create new COW SV to share string */
+ RX_MATCH_COPY_FREE(rx);
+ prog->saved_copy = sv_setsv_cow(prog->saved_copy, sv);
+ }
prog->subbeg = (char *)SvPVX_const(prog->saved_copy);
assert (SvPOKp(prog->saved_copy));
prog->sublen = strend - strbeg;
} else
#endif
{
- I32 min = 0;
- I32 max = strend - strbeg;
+ SSize_t min = 0;
+ SSize_t max = strend - strbeg;
I32 sublen;
if ( (flags & REXEC_COPY_SKIP_POST)
- && !(RX_EXTFLAGS(rx) & RXf_PMf_KEEPCOPY) /* //p */
+ && !(prog->extflags & RXf_PMf_KEEPCOPY) /* //p */
&& !(PL_sawampersand & SAWAMPERSAND_RIGHT)
) { /* don't copy $' part of string */
U32 n = 0;
}
if ( (flags & REXEC_COPY_SKIP_PRE)
- && !(RX_EXTFLAGS(rx) & RXf_PMf_KEEPCOPY) /* //p */
+ && !(prog->extflags & RXf_PMf_KEEPCOPY) /* //p */
&& !(PL_sawampersand & SAWAMPERSAND_LEFT)
) { /* don't copy $` part of string */
U32 n = 0;
/* sv: SV being matched: only used for utf8 flag, pos() etc; string
* itself is accessed via the pointers above */
/* data: May be used for some additional optimizations.
- Currently its only used, with a U32 cast, for transmitting
- the ganch offset when doing a /g match. This will change */
-/* nosave: For optimizations. */
+ Currently unused. */
+/* flags: For optimizations. See REXEC_* in regexp.h */
{
dVAR;
struct regexp *const prog = ReANY(rx);
char *s;
regnode *c;
- char *startpos = stringarg;
+ char *startpos;
I32 minlen; /* must match at least this many chars */
I32 dontbother = 0; /* how many characters not to try at end */
- I32 end_shift = 0; /* Same for the end. */ /* CC */
- I32 scream_pos = -1; /* Internal iterator of scream. */
- char *scream_olds = NULL;
const bool utf8_target = cBOOL(DO_UTF8(sv));
I32 multiline;
RXi_GET_DECL(prog,progi);
PERL_UNUSED_ARG(data);
/* Be paranoid... */
- if (prog == NULL || startpos == NULL) {
+ if (prog == NULL || stringarg == NULL) {
Perl_croak(aTHX_ "NULL regexp parameter");
return 0;
}
DEBUG_EXECUTE_r(
- debug_start_match(rx, utf8_target, startpos, strend,
+ debug_start_match(rx, utf8_target, stringarg, strend,
"Matching");
);
- if ((RX_EXTFLAGS(rx) & RXf_USE_INTUIT)
+ startpos = stringarg;
+
+ if (prog->extflags & RXf_GPOS_SEEN) {
+ MAGIC *mg;
+
+ /* set reginfo->ganch, the position where \G can match */
+
+ reginfo->ganch =
+ (flags & REXEC_IGNOREPOS)
+ ? stringarg /* use start pos rather than pos() */
+ : (sv && (mg = mg_find_mglob(sv)) && mg->mg_len >= 0)
+ /* Defined pos(): */
+ ? strbeg + MgBYTEPOS(mg, sv, strbeg, strend-strbeg)
+ : strbeg; /* pos() not defined; use start of string */
+
+ DEBUG_GPOS_r(PerlIO_printf(Perl_debug_log,
+ "GPOS ganch set to strbeg[%"IVdf"]\n", reginfo->ganch - strbeg));
+
+ /* in the presence of \G, we may need to start looking earlier in
+ * the string than the suggested start point of stringarg:
+ * if gofs->prog is set, then that's a known, fixed minimum
+ * offset, such as
+ * /..\G/: gofs = 2
+ * /ab|c\G/: gofs = 1
+ * or if the minimum offset isn't known, then we have to go back
+ * to the start of the string, e.g. /w+\G/
+ */
+
+ if (prog->extflags & RXf_ANCH_GPOS) {
+ startpos = reginfo->ganch - prog->gofs;
+ if (startpos <
+ ((flags & REXEC_FAIL_ON_UNDERFLOW) ? stringarg : strbeg))
+ {
+ DEBUG_r(PerlIO_printf(Perl_debug_log,
+ "fail: ganch-gofs before earliest possible start\n"));
+ return 0;
+ }
+ }
+ else if (prog->gofs) {
+ if (startpos - prog->gofs < strbeg)
+ startpos = strbeg;
+ else
+ startpos -= prog->gofs;
+ }
+ else if (prog->extflags & RXf_GPOS_FLOAT)
+ startpos = strbeg;
+ }
+
+ minlen = prog->minlen;
+ if ((startpos + minlen) > strend || startpos < strbeg) {
+ DEBUG_r(PerlIO_printf(Perl_debug_log,
+ "Regex match can't succeed, so not even tried\n"));
+ return 0;
+ }
+
+ /* at the end of this function, we'll do a LEAVE_SCOPE(oldsave),
+ * which will call destuctors to reset PL_regmatch_state, free higher
+ * PL_regmatch_slabs, and clean up regmatch_info_aux and
+ * regmatch_info_aux_eval */
+
+ oldsave = PL_savestack_ix;
+
+ s = startpos;
+
+ if ((prog->extflags & RXf_USE_INTUIT)
&& !(flags & REXEC_CHECKED))
{
- stringarg = re_intuit_start(rx, sv, strbeg, stringarg, strend,
+ s = re_intuit_start(rx, sv, strbeg, startpos, strend,
flags, NULL);
- if (!stringarg)
+ if (!s)
return 0;
- if (RX_EXTFLAGS(rx) & RXf_CHECK_ALL) {
+ if (prog->extflags & RXf_CHECK_ALL) {
/* we can match based purely on the result of INTUIT.
* Set up captures etc just for $& and $-[0]
* (an intuit-only match wont have $1,$2,..) */
assert(!prog->nparens);
+
+ /* s/// doesn't like it if $& is earlier than where we asked it to
+ * start searching (which can happen on something like /.\G/) */
+ if ( (flags & REXEC_FAIL_ON_UNDERFLOW)
+ && (s < stringarg))
+ {
+ /* this should only be possible under \G */
+ assert(prog->extflags & RXf_GPOS_SEEN);
+ DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log,
+ "matched, but failing for REXEC_FAIL_ON_UNDERFLOW\n"));
+ goto phooey;
+ }
+
/* match via INTUIT shouldn't have any captures.
* Let @-, @+, $^N know */
prog->lastparen = prog->lastcloseparen = 0;
RX_MATCH_UTF8_set(rx, utf8_target);
+ prog->offs[0].start = s - strbeg;
+ prog->offs[0].end = utf8_target
+ ? (char*)utf8_hop((U8*)s, prog->minlenret) - strbeg
+ : s - strbeg + prog->minlenret;
if ( !(flags & REXEC_NOT_FIRST) )
S_reg_set_capture_string(aTHX_ rx,
strbeg, strend,
sv, flags, utf8_target);
- prog->offs[0].start = stringarg - strbeg;
- prog->offs[0].end = utf8_target
- ? (char*)utf8_hop((U8*)stringarg, prog->minlenret) - strbeg
- : stringarg - strbeg + prog->minlenret;
return 1;
}
}
-
- /* at the end of this function, we'll do a LEAVE_SCOPE(oldsave),
- * which will call destuctors to reset PL_regmatch_state, free higher
- * PL_regmatch_slabs, and clean up regmatch_info_aux and
- * regmatch_info_aux_eval */
-
- oldsave = PL_savestack_ix;
-
multiline = prog->extflags & RXf_PMf_MULTILINE;
- minlen = prog->minlen;
- if (strend - startpos < (minlen+(prog->check_offset_min<0?prog->check_offset_min:0))) {
+ if (strend - s < (minlen+(prog->check_offset_min<0?prog->check_offset_min:0))) {
DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log,
"String too short [regexec_flags]...\n"));
goto phooey;
reginfo->poscache_maxiter = 0; /* not yet started a countdown */
reginfo->strend = strend;
/* see how far we have to get to not match where we matched before */
- reginfo->till = startpos+minend;
+ reginfo->till = stringarg + minend;
if (prog->extflags & RXf_EVAL_SEEN && SvPADTMP(sv) && !IS_PADGV(sv)) {
/* SAVEFREESV, not sv_mortalcopy, as this SV must last until after
}
/* If there is a "must appear" string, look for it. */
- s = startpos;
- if (prog->extflags & RXf_GPOS_SEEN) { /* Need to set reginfo->ganch */
- MAGIC *mg;
- if (flags & REXEC_IGNOREPOS){ /* Means: check only at start */
- reginfo->ganch = startpos + prog->gofs;
- DEBUG_GPOS_r(PerlIO_printf(Perl_debug_log,
- "GPOS IGNOREPOS: reginfo->ganch = startpos + %"UVxf"\n",(UV)prog->gofs));
- } else if (sv && (mg = mg_find_mglob(sv))
- && mg->mg_len >= 0) {
- reginfo->ganch = strbeg + mg->mg_len; /* Defined pos() */
- DEBUG_GPOS_r(PerlIO_printf(Perl_debug_log,
- "GPOS MAGIC: reginfo->ganch = strbeg + %"IVdf"\n",(IV)mg->mg_len));
-
- if (prog->extflags & RXf_ANCH_GPOS) {
- if (s > reginfo->ganch)
- goto phooey;
- s = reginfo->ganch - prog->gofs;
- DEBUG_GPOS_r(PerlIO_printf(Perl_debug_log,
- "GPOS ANCH_GPOS: s = ganch - %"UVxf"\n",(UV)prog->gofs));
- if (s < strbeg)
- goto phooey;
- }
- }
- else if (data) {
- reginfo->ganch = strbeg + PTR2UV(data);
- DEBUG_GPOS_r(PerlIO_printf(Perl_debug_log,
- "GPOS DATA: reginfo->ganch= strbeg + %"UVxf"\n",PTR2UV(data)));
-
- } else { /* pos() not defined */
- reginfo->ganch = strbeg;
- DEBUG_GPOS_r(PerlIO_printf(Perl_debug_log,
- "GPOS: reginfo->ganch = strbeg\n"));
- }
- }
if (PL_curpm && (PM_GETRE(PL_curpm) == rx)) {
/* We have to be careful. If the previous successful match
was from this regex we don't want a subsequent partially
PTR2UV(prog->offs)
));
}
- if (!(flags & REXEC_CHECKED) && (prog->check_substr != NULL || prog->check_utf8 != NULL)) {
- re_scream_pos_data d;
-
- d.scream_olds = &scream_olds;
- d.scream_pos = &scream_pos;
- s = re_intuit_start(rx, sv, strbeg, s, strend, flags, &d);
- if (!s) {
- DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, "Not present...\n"));
- goto phooey; /* not present */
- }
- }
-
-
/* Simplest case: anchored match need be tried only once. */
/* [unless only anchor is BOL and multiline is set] */
if (prog->extflags & (RXf_ANCH & ~RXf_ANCH_GPOS)) {
- if (s == startpos && regtry(reginfo, &startpos))
+ if (s == startpos && regtry(reginfo, &s))
goto got_it;
else if (multiline || (prog->intflags & PREGf_IMPLICIT)
|| (prog->extflags & RXf_ANCH_MBOL)) /* XXXX SBOL? */
goto phooey;
} else if (RXf_GPOS_CHECK == (prog->extflags & RXf_GPOS_CHECK))
{
- /* the warning about reginfo->ganch being used without initialization
- is bogus -- we set it above, when prog->extflags & RXf_GPOS_SEEN
- and we only enter this block when the same bit is set. */
- char *tmp_s = reginfo->ganch - prog->gofs;
-
- if (tmp_s >= strbeg && regtry(reginfo, &tmp_s))
+ /* For anchored \G, the only position it can match from is
+ * (ganch-gofs); we already set startpos to this above; if intuit
+ * moved us on from there, we can't possibly succeed */
+ assert(startpos == reginfo->ganch - prog->gofs);
+ if (s == startpos && regtry(reginfo, &s))
goto got_it;
goto phooey;
}
/* XXXX check_substr already used to find "s", can optimize if
check_substr==must. */
- scream_pos = -1;
- dontbother = end_shift;
+ dontbother = 0;
strend = HOPc(strend, -dontbother);
while ( (s <= last) &&
(s = fbm_instr((unsigned char*)HOP3(s, back_min, (back_min<0 ? strbeg : strend)),
goto phooey;
got_it:
+ /* s/// doesn't like it if $& is earlier than where we asked it to
+ * start searching (which can happen on something like /.\G/) */
+ if ( (flags & REXEC_FAIL_ON_UNDERFLOW)
+ && (prog->offs[0].start < stringarg - strbeg))
+ {
+ /* this should only be possible under \G */
+ assert(prog->extflags & RXf_GPOS_SEEN);
+ DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log,
+ "matched, but failing for REXEC_FAIL_ON_UNDERFLOW\n"));
+ goto phooey;
+ }
+
DEBUG_BUFFERS_r(
if (swap)
PerlIO_printf(Perl_debug_log,
CHECKPOINT lastcp;
REGEXP *const rx = reginfo->prog;
regexp *const prog = ReANY(rx);
- I32 result;
+ SSize_t result;
RXi_GET_DECL(prog,progi);
GET_RE_DEBUG_FLAGS_DECL;
}
/* returns -1 on failure, $+[0] on success */
-STATIC I32
+STATIC SSize_t
S_regmatch(pTHX_ regmatch_info *reginfo, char *startpos, regnode *prog)
{
#if PERL_VERSION < 9 && !defined(PERL_CORE)
rex->offs[0].end = locinput - reginfo->strbeg;
if (reginfo->info_aux_eval->pos_magic)
- reginfo->info_aux_eval->pos_magic->mg_len
- = locinput - reginfo->strbeg;
+ MgBYTEPOS_set(reginfo->info_aux_eval->pos_magic,
+ reginfo->sv, reginfo->strbeg,
+ locinput - reginfo->strbeg);
if (sv_yes_mark) {
SV *sv_mrk = get_sv("REGMARK", 1);
S_core_regclass_swash(pTHX_ const regexp *prog, const regnode* node, bool doinit, SV** listsvp)
{
/* Returns the swash for the input 'node' in the regex 'prog'.
- * If <doinit> is true, will attempt to create the swash if not already
+ * If <doinit> is 'true', will attempt to create the swash if not already
* done.
- * If <listsvp> is non-null, will return the swash initialization string in
- * it.
+ * If <listsvp> is non-null, will return the printable contents of the
+ * swash. This can be used to get debugging information even before the
+ * swash exists, by calling this function with 'doinit' set to false, in
+ * which case the components that will be used to eventually create the
+ * swash are returned (in a printable form).
* Tied intimately to how regcomp.c sets up the data structure */
dVAR;
SV *sw = NULL;
- SV *si = NULL;
+ SV *si = NULL; /* Input swash initialization string */
SV* invlist = NULL;
RXi_GET_DECL(prog,progi);
/* Element [1] is reserved for the set-up swash. If already there,
* return it; if not, create it and store it there */
- if (SvROK(ary[1])) {
+ if (ary[1] && SvROK(ary[1])) {
sw = ary[1];
}
else if (si && doinit) {
}
}
+ /* If requested, return a printable version of what this swash matches */
if (listsvp) {
SV* matches_string = newSVpvn("", 0);
- /* Use the swash, if any, which has to have incorporated into it all
- * possibilities */
+ /* The swash should be used, if possible, to get the data, as it
+ * contains the resolved data. But this function can be called at
+ * compile-time, before everything gets resolved, in which case we
+ * return the currently best available information, which is the string
+ * that will eventually be used to do that resolving, 'si' */
if ((! sw || (invlist = _get_swash_invlist(sw)) == NULL)
&& (si && si != &PL_sv_undef))
{
-
- /* If no swash, use the input initialization string, if available */
sv_catsv(matches_string, si);
}
}
eval_state->pos_magic = mg;
eval_state->pos = mg->mg_len;
+ eval_state->pos_flags = mg->mg_flags;
}
else
eval_state->pos_magic = NULL;
RXp_MATCH_COPIED_on(rex);
}
if (eval_state->pos_magic)
+ {
eval_state->pos_magic->mg_len = eval_state->pos;
+ eval_state->pos_magic->mg_flags =
+ (eval_state->pos_magic->mg_flags & ~MGf_BYTES)
+ | (eval_state->pos_flags & MGf_BYTES);
+ }
PL_curpm = eval_state->curpm;
}