(UV)(r->pre_prefix));
Perl_dump_indent(aTHX_ level, file, " SUBLEN = %"IVdf"\n",
(IV)(r->sublen));
+ Perl_dump_indent(aTHX_ level, file, " SUBOFFSET = %"IVdf"\n",
+ (IV)(r->suboffset));
+ Perl_dump_indent(aTHX_ level, file, " SUBCOFFSET = %"IVdf"\n",
+ (IV)(r->subcoffset));
if (r->subbeg)
Perl_dump_indent(aTHX_ level, file, " SUBBEG = 0x%"UVxf" %s\n",
PTR2UV(r->subbeg),
GOFS = 0
PRE_PREFIX = 4
SUBLEN = 0
+ SUBOFFSET = 0
+ SUBCOFFSET = 0
SUBBEG = 0x0
ENGINE = $ADDR
MOTHER_RE = $ADDR
return (U32)-1;
}
+/* @-, @+ */
+
int
Perl_magic_regdatum_get(pTHX_ SV *sv, MAGIC *mg)
{
if (i > 0 && RX_MATCH_UTF8(rx)) {
const char * const b = RX_SUBBEG(rx);
if (b)
- i = utf8_length((U8*)b, (U8*)(b+i));
+ i = RX_SUBCOFFSET(rx) +
+ utf8_length((U8*)b,
+ (U8*)(b-RX_SUBOFFSET(rx)+i));
}
sv_setiv(sv, i);
return 0;
}
+/* @-, @+ */
+
int
Perl_magic_regdatum_set(pTHX_ SV *sv, MAGIC *mg)
{
char *subbeg; /* saved or original string so \digit works forever. */
SV_SAVED_COPY /* If non-NULL, SV which is COW from original */
I32 sublen; /* Length of string pointed by subbeg */
+ I32 suboffset; /* byte offset of subbeg from logical start of str */
+ I32 subcoffset; /* suboffset equiv, but in chars (for @-/@+) */
/* Information about the match that isn't often used */
I32 prelen; /* length of precomp */
Fast-Boyer-Moore searches on the string to find out if its worth using
the regex engine at all, and if so where in the string to search.
-=head2 C<subbeg> C<sublen> C<saved_copy>
-
-Used during execution phase for managing search and replace patterns.
+=head2 C<subbeg> C<sublen> C<saved_copy> C<suboffset> C<subcoffset>
+
+Used during the execution phase for managing search and replace patterns,
+and for providing the text for C<$&>, C<$1> etc. C<subbeg> points to a
+buffer (either the original string, or a copy in the case of
+C<RX_MATCH_COPIED(rx)>), and C<sublen> is the length of the buffer. The
+C<RX_OFFS> start and end indices index into this buffer.
+
+In the presence of the C<REXEC_COPY_STR> flag, but with the addition of
+the C<REXEC_COPY_SKIP_PRE> or C<REXEC_COPY_SKIP_POST> flags, an engine
+can choose not to copy the full buffer (although it must still do so in
+the presence of C<RXf_PMf_KEEPCOPY> or the relevant bits being set in
+C<PL_sawampersand>). In this case, it may set C<suboffset> to indicate the
+number of bytes from the logical start of the buffer to the physical start
+(i.e. C<subbeg>). It should also set C<subcoffset>, the number of
+characters in the offset. The latter is needed to support C<@-> and C<@+>
+which work in characters, not bytes.
=head2 C<wrapped> C<wraplen>
if (rex_return == 0)
break;
TAINT_IF(RX_MATCH_TAINTED(rx));
+ /* we never pass the REXEC_COPY_STR flag, so it should
+ * never get copied */
+ assert(!RX_MATCH_COPIED(rx));
if (RX_MATCH_COPIED(rx) && RX_SUBBEG(rx) != orig) {
m = s;
s = orig;
if (RX_MATCH_COPIED(rx) && RX_SUBBEG(rx) != orig) {
m = s;
s = orig;
+ assert(!RX_SUBOFFSET(rx));
cx->sb_orig = orig = RX_SUBBEG(rx);
s = orig + (m - s);
cx->sb_strend = s + (cx->sb_strend - m);
if (!p || p[1] < RX_NPARENS(rx)) {
#ifdef PERL_OLD_COPY_ON_WRITE
- i = 7 + RX_NPARENS(rx) * 2;
+ i = 7 + (RX_NPARENS(rx)+1) * 2;
#else
- i = 6 + RX_NPARENS(rx) * 2;
+ i = 6 + (RX_NPARENS(rx)+1) * 2;
#endif
if (!p)
Newx(p, i, UV);
*rsp = (void*)p;
}
- *p++ = PTR2UV(RX_MATCH_COPIED(rx) ? RX_SUBBEG(rx) : NULL);
+ *p++ = RX_MATCH_COPIED(rx) ? 1 : 0;
RX_MATCH_COPIED_off(rx);
#ifdef PERL_OLD_COPY_ON_WRITE
#endif
*p++ = RX_NPARENS(rx);
-
*p++ = PTR2UV(RX_SUBBEG(rx));
*p++ = (UV)RX_SUBLEN(rx);
+ *p++ = (UV)RX_SUBOFFSET(rx);
+ *p++ = (UV)RX_SUBCOFFSET(rx);
for (i = 0; i <= RX_NPARENS(rx); ++i) {
*p++ = (UV)RX_OFFS(rx)[i].start;
*p++ = (UV)RX_OFFS(rx)[i].end;
#endif
RX_NPARENS(rx) = *p++;
-
RX_SUBBEG(rx) = INT2PTR(char*,*p++);
RX_SUBLEN(rx) = (I32)(*p++);
+ RX_SUBOFFSET(rx) = (I32)*p++;
+ RX_SUBCOFFSET(rx) = (I32)*p++;
for (i = 0; i <= RX_NPARENS(rx); ++i) {
RX_OFFS(rx)[i].start = (I32)(*p++);
RX_OFFS(rx)[i].end = (I32)(*p++);
appears to be quite tricky.
Test for the unsafe vars are TODO for now. */
if ( (!global && RX_NPARENS(rx))
- || SvTEMP(TARG) || SvAMAGIC(TARG) || PL_sawampersand
- || (RX_EXTFLAGS(rx) & (RXf_EVAL_SEEN|RXf_PMf_KEEPCOPY)))
- r_flags |= REXEC_COPY_STR;
+ || PL_sawampersand
+ || SvTEMP(TARG)
+ || SvAMAGIC(TARG)
+ || (RX_EXTFLAGS(rx) & (RXf_EVAL_SEEN|RXf_PMf_KEEPCOPY))
+ ) {
+ r_flags |= (REXEC_COPY_STR|REXEC_COPY_SKIP_PRE);
+ /* in @a =~ /(.)/g, we iterate multiple times, but copy the buffer
+ * only on the first iteration. Therefore we need to copy $' as well
+ * as $&, to make the rest of the string available for captures in
+ * subsequent iterations */
+ if (! (global && gimme == G_ARRAY))
+ r_flags |= REXEC_COPY_SKIP_POST;
+ };
play_it_again:
if (global && RX_OFFS(rx)[0].start != -1) {
if (global) {
/* FIXME - should rx->subbeg be const char *? */
RX_SUBBEG(rx) = (char *) truebase;
+ RX_SUBOFFSET(rx) = 0;
+ RX_SUBCOFFSET(rx) = 0;
RX_OFFS(rx)[0].start = s - truebase;
if (RX_MATCH_UTF8(rx)) {
char * const t = (char*)utf8_hop((U8*)s, RX_MINLENRET(rx));
#endif
}
RX_SUBLEN(rx) = strend - t;
+ RX_SUBOFFSET(rx) = 0;
+ RX_SUBCOFFSET(rx) = 0;
RX_MATCH_COPIED_on(rx);
off = RX_OFFS(rx)[0].start = s - t;
RX_OFFS(rx)[0].end = off + RX_MINLENRET(rx);
pm = PL_curpm;
rx = PM_GETRE(pm);
}
- r_flags = (RX_NPARENS(rx) || SvTEMP(TARG) || PL_sawampersand
- || (RX_EXTFLAGS(rx) & (RXf_EVAL_SEEN|RXf_PMf_KEEPCOPY)) )
- ? REXEC_COPY_STR : 0;
+
+ r_flags = ( RX_NPARENS(rx)
+ || PL_sawampersand
+ || SvTEMP(TARG)
+ || (RX_EXTFLAGS(rx) & (RXf_EVAL_SEEN|RXf_PMf_KEEPCOPY))
+ )
+ ? REXEC_COPY_STR
+ : 0;
orig = m = s;
if (RX_EXTFLAGS(rx) & RXf_USE_INTUIT) {
if (RX_MATCH_COPIED(rx) && RX_SUBBEG(rx) != orig) {
m = s;
s = orig;
+ assert(RX_SUBOFFSET(rx) == 0);
orig = RX_SUBBEG(rx);
s = orig + (m - s);
strend = s + (strend - m);
&& rx->offs[0].end != -1)
{
/* $', ${^POSTMATCH} */
- s = rx->subbeg + rx->offs[0].end;
- i = rx->sublen - rx->offs[0].end;
+ s = rx->subbeg - rx->suboffset + rx->offs[0].end;
+ i = rx->sublen + rx->suboffset - rx->offs[0].end;
}
else
if ( 0 <= n && n <= (I32)rx->nparens &&
{
/* $&, ${^MATCH}, $1 ... */
i = t1 - s1;
- s = rx->subbeg + s1;
+ s = rx->subbeg + s1 - rx->suboffset;
} else {
goto ret_undef;
}
}
getlen:
if (i > 0 && RXp_MATCH_UTF8(rx)) {
- const char * const s = rx->subbeg + s1;
+ const char * const s = rx->subbeg - rx->suboffset + s1;
const U8 *ep;
STRLEN el;
PL_reg_oldsaved = NULL;
PL_reg_oldsavedlen = 0;
+ PL_reg_oldsavedoffset = 0;
+ PL_reg_oldsavedcoffset = 0;
PL_reg_maxiter = 0;
PL_reg_leftiter = 0;
PL_reg_poscache = NULL;
/* make sure $`, $&, $', and $digit will work later */
if ( !(flags & REXEC_NOT_FIRST) ) {
- RX_MATCH_COPY_FREE(rx);
if (flags & REXEC_COPY_STR) {
- const I32 i = PL_regeol - strbeg;
#ifdef PERL_OLD_COPY_ON_WRITE
if ((SvIsCOW(sv)
|| (SvFLAGS(sv) & CAN_COW_MASK) == CAN_COW_FLAGS)) {
prog->saved_copy = sv_setsv_cow(prog->saved_copy, sv);
prog->subbeg = (char *)SvPVX_const(prog->saved_copy);
assert (SvPOKp(prog->saved_copy));
+ prog->sublen = PL_regeol - strbeg;
+ prog->suboffset = 0;
+ prog->subcoffset = 0;
} else
#endif
{
- RX_MATCH_COPIED_on(rx);
- s = savepvn(strbeg, i);
- prog->subbeg = s;
- }
- prog->sublen = i;
+ I32 min = 0;
+ I32 max = PL_regeol - strbeg;
+ I32 sublen;
+
+ if ( (flags & REXEC_COPY_SKIP_POST)
+ && !(RX_EXTFLAGS(rx) & RXf_PMf_KEEPCOPY) /* //p */
+ && !(PL_sawampersand & SAWAMPERSAND_RIGHT)
+ ) { /* don't copy $' part of string */
+ U32 n = (PL_sawampersand & SAWAMPERSAND_MIDDLE) ? 0 : 1;
+ max = -1;
+ /* calculate the right-most part of the string covered
+ * by a capture. Due to look-ahead, this may be to
+ * the right of $&, so we have to scan all captures */
+ while (n <= prog->lastparen) {
+ if (prog->offs[n].end > max)
+ max = prog->offs[n].end;
+ n++;
+ }
+ if (max == -1)
+ max = (PL_sawampersand & SAWAMPERSAND_LEFT)
+ ? prog->offs[0].start
+ : 0;
+ assert(max >= 0 && max <= PL_regeol - strbeg);
+ }
+
+ if ( (flags & REXEC_COPY_SKIP_PRE)
+ && !(RX_EXTFLAGS(rx) & RXf_PMf_KEEPCOPY) /* //p */
+ && !(PL_sawampersand & SAWAMPERSAND_LEFT)
+ ) { /* don't copy $` part of string */
+ U32 n = (PL_sawampersand & SAWAMPERSAND_MIDDLE) ? 0 : 1;
+ min = max;
+ /* calculate the left-most part of the string covered
+ * by a capture. Due to look-behind, this may be to
+ * the left of $&, so we have to scan all captures */
+ while (min && n <= prog->lastparen) {
+ if ( prog->offs[n].start != -1
+ && prog->offs[n].start < min)
+ {
+ min = prog->offs[n].start;
+ }
+ n++;
+ }
+ if ((PL_sawampersand & SAWAMPERSAND_RIGHT)
+ && min > prog->offs[0].end
+ )
+ min = prog->offs[0].end;
+
+ }
+
+ assert(min >= 0 && min <= max && min <= PL_regeol - strbeg);
+ sublen = max - min;
+
+ if (RX_MATCH_COPIED(rx)) {
+ if (sublen > prog->sublen)
+ prog->subbeg =
+ (char*)saferealloc(prog->subbeg, sublen+1);
+ }
+ else
+ prog->subbeg = (char*)safemalloc(sublen+1);
+ Copy(strbeg + min, prog->subbeg, sublen, char);
+ prog->subbeg[sublen] = '\0';
+ prog->suboffset = min;
+ prog->sublen = sublen;
+ }
+ RX_MATCH_COPIED_on(rx);
+ prog->subcoffset = prog->suboffset;
+ if (prog->suboffset && utf8_target) {
+ /* Convert byte offset to chars.
+ * XXX ideally should only compute this if @-/@+
+ * has been seen, a la PL_sawampersand ??? */
+
+ /* If there's a direct correspondence between the
+ * string which we're matching and the original SV,
+ * then we can use the utf8 len cache associated with
+ * the SV. In particular, it means that under //g,
+ * sv_pos_b2u() will use the previously cached
+ * position to speed up working out the new length of
+ * subcoffset, rather than counting from the start of
+ * the string each time. This stops
+ * $x = "\x{100}" x 1E6; 1 while $x =~ /(.)/g;
+ * from going quadratic */
+ if (SvPOKp(sv) && SvPVX(sv) == strbeg)
+ sv_pos_b2u(sv, &(prog->subcoffset));
+ else
+ prog->subcoffset = utf8_length((U8*)strbeg,
+ (U8*)(strbeg+prog->suboffset));
+ }
}
else {
+ RX_MATCH_COPY_FREE(rx);
prog->subbeg = strbeg;
+ prog->suboffset = 0;
+ prog->subcoffset = 0;
prog->sublen = PL_regeol - strbeg; /* strend may have been modified */
}
}
$` inside (?{}) could fail... */
PL_reg_oldsaved = prog->subbeg;
PL_reg_oldsavedlen = prog->sublen;
+ PL_reg_oldsavedoffset = prog->suboffset;
+ PL_reg_oldsavedcoffset = prog->suboffset;
#ifdef PERL_OLD_COPY_ON_WRITE
PL_nrs = prog->saved_copy;
#endif
else
PL_reg_oldsaved = NULL;
prog->subbeg = PL_bostr;
+ prog->suboffset = 0;
+ prog->subcoffset = 0;
prog->sublen = PL_regeol - PL_bostr; /* strend may have been modified */
}
#ifdef DEBUGGING
RXp_MATCH_COPIED_off(re);
re->subbeg = rex->subbeg;
re->sublen = rex->sublen;
+ re->suboffset = rex->suboffset;
+ re->subcoffset = rex->subcoffset;
rei = RXi_GET(re);
DEBUG_EXECUTE_r(
debug_start_match(re_sv, utf8_target, locinput, PL_regeol,
if (PL_reg_oldsaved) {
rex->subbeg = PL_reg_oldsaved;
rex->sublen = PL_reg_oldsavedlen;
+ rex->suboffset = PL_reg_oldsavedoffset;
+ rex->subcoffset = PL_reg_oldsavedcoffset;
#ifdef PERL_OLD_COPY_ON_WRITE
rex->saved_copy = PL_nrs;
#endif
char *subbeg; \
SV_SAVED_COPY /* If non-NULL, SV which is COW from original */\
I32 sublen; /* Length of string pointed by subbeg */ \
+ I32 suboffset; /* byte offset of subbeg from logical start of str */ \
+ I32 subcoffset; /* suboffset equiv, but in chars (for @-/@+) */ \
/* Information about the match that isn't often used */ \
/* offset from wrapped to the start of precomp */ \
PERL_BITFIELD32 pre_prefix:4; \
assert(SvTYPE(_rx_subbeg) == SVt_REGEXP); \
&SvANY(_rx_subbeg)->subbeg; \
}))
+# define RX_SUBOFFSET(prog) \
+ (*({ \
+ const REGEXP *const _rx_suboffset = (prog); \
+ assert(SvTYPE(_rx_suboffset) == SVt_REGEXP); \
+ &SvANY(_rx_suboffset)->suboffset; \
+ }))
+# define RX_SUBCOFFSET(prog) \
+ (*({ \
+ const REGEXP *const _rx_subcoffset = (prog); \
+ assert(SvTYPE(_rx_subcoffset) == SVt_REGEXP); \
+ &SvANY(_rx_subcoffset)->subcoffset; \
+ }))
# define RX_OFFS(prog) \
(*({ \
const REGEXP *const _rx_offs = (prog); \
# define RX_EXTFLAGS(prog) RXp_EXTFLAGS((struct regexp *)SvANY(prog))
# define RX_ENGINE(prog) (((struct regexp *)SvANY(prog))->engine)
# define RX_SUBBEG(prog) (((struct regexp *)SvANY(prog))->subbeg)
+# define RX_SUBOFFSET(prog) (((struct regexp *)SvANY(prog))->suboffset)
+# define RX_SUBCOFFSET(prog) (((struct regexp *)SvANY(prog))->subcoffset)
# define RX_OFFS(prog) (((struct regexp *)SvANY(prog))->offs)
# define RX_NPARENS(prog) (((struct regexp *)SvANY(prog))->nparens)
#endif
#define REXEC_SCREAM 0x04 /* use scream table. */
#define REXEC_IGNOREPOS 0x08 /* \G matches at start. */
#define REXEC_NOT_FIRST 0x10 /* This is another iteration of //g. */
+ /* under REXEC_COPY_STR, it's ok for the
+ * engine (modulo PL_sawamperand etc)
+ * to skip copying ... */
+#define REXEC_COPY_SKIP_PRE 0x20 /* ...the $` part of the string, or */
+#define REXEC_COPY_SKIP_POST 0x40 /* ...the $' part of the string */
#if defined(__GNUC__) && !defined(PERL_GCC_BRACE_GROUPS_FORBIDDEN)
# define ReREFCNT_inc(re) \
#define PL_reg_curpm PL_reg_state.re_state_reg_curpm
#define PL_reg_oldsaved PL_reg_state.re_state_reg_oldsaved
#define PL_reg_oldsavedlen PL_reg_state.re_state_reg_oldsavedlen
+#define PL_reg_oldsavedoffset PL_reg_state.re_state_reg_oldsavedoffset
+#define PL_reg_oldsavedcoffset PL_reg_state.re_state_reg_oldsavedcoffset
#define PL_reg_maxiter PL_reg_state.re_state_reg_maxiter
#define PL_reg_leftiter PL_reg_state.re_state_reg_leftiter
#define PL_reg_poscache PL_reg_state.re_state_reg_poscache
PMOP *re_state_reg_curpm; /* from regexec.c */
char *re_state_reg_oldsaved; /* old saved substr during match */
STRLEN re_state_reg_oldsavedlen; /* old length of saved substr during match */
+ STRLEN re_state_reg_oldsavedoffset; /* old offset of saved substr during match */
+ STRLEN re_state_reg_oldsavedcoffset;/* old coffset of saved substr during match */
STRLEN re_state_reg_poscache_size; /* size of pos cache of WHILEM */
I32 re_state_reg_oldpos; /* from regexec.c */
I32 re_state_reg_maxiter; /* max wait until caching pos */
pod/perlperf.pod Verbatim line length including indents exceeds 79 by 154
pod/perlpodspec.pod Verbatim line length including indents exceeds 79 by 9
pod/perlpodstyle.pod Verbatim line length including indents exceeds 79 by 1
-pod/perlreapi.pod Verbatim line length including indents exceeds 79 by 17
+pod/perlreapi.pod Verbatim line length including indents exceeds 79 by 18
pod/perlrebackslash.pod Verbatim line length including indents exceeds 79 by 1
pod/perlref.pod Verbatim line length including indents exceeds 79 by 1
pod/perlreguts.pod Verbatim line length including indents exceeds 79 by 17
\W \x{200D} n - -
/^(?d:\xdf|_)*_/i \x{17f}\x{17f}_ y $& \x{17f}\x{17f}_
+#
+# check that @-, @+ count chars, not bytes; especially if beginning of
+# string is not copied
+
+(\x{100}) \x{2000}\x{2000}\x{2000}\x{100} y $-[0]:$-[1]:$+[0]:$+[1] 3:3:4:4
# vim: softtabstop=0 noexpandtab