#ifdef USE_ITHREADS
void* (*dupe) (pTHX_ REGEXP * const rx, CLONE_PARAMS *param);
#endif
+ REGEXP* (*op_comp) (...);
+
When a regexp is compiled, its C<engine> field is then set to point at
the appropriate structure, so that when it needs to be used Perl can find
=over 4
-=item RXf_UTF8
-
-Set if the pattern is L<SvUTF8()|perlapi/SvUTF8>, set by Perl_pmruntime.
-
-A regex engine may want to set or disable this flag during
-compilation. The perl engine for instance may upgrade non-UTF-8
-strings to UTF-8 if the pattern includes constructs such as C<\x{...}>
-that can only match Unicode values.
-
=item RXf_SPLIT
If C<split> is invoked as C<split ' '> or with no arguments (which
I32 minend, SV* screamer,
void* data, U32 flags);
-Execute a regexp.
+Execute a regexp. The arguments are
+
+=over 4
+
+=item rx
+
+The regular expression to execute.
+
+=item screamer
+
+This strangely-named arg is the SV to be matched against. Note that the
+actual char array to be matched against is supplied by the arguments
+described below; the SV is just used to determine UTF8ness, C<pos()> etc.
+
+=item strbeg
+
+Pointer to the physical start of the string.
+
+=item strend
+
+Pointer to the character following the physical end of the string (i.e.
+the C<\0>).
+
+=item stringarg
+
+Pointer to the position in the string where matching should start; it might
+not be equal to C<strbeg> (for example in a later iteration of C</.../g>).
+
+=item minend
+
+Minimum length of string (measured in bytes from C<stringarg>) that must
+match; if the engine reaches the end of the match but hasn't reached this
+position in the string, it should fail.
+
+=item data
+
+Optimisation data; subject to change.
+
+=item flags
+
+Optimisation flags; subject to change.
+
+=back
=head2 intuit
equivalents, ${^PREMATCH}, ${^POSTMATCH} and ${^MATCH}, as well as the
numbered capture groups (C<$1>, C<$2>, ...).
-The C<paren> parameter will be C<-2> for C<$`>, C<-1> for C<$'>, C<0>
-for C<$&>, C<1> for C<$1> and so forth.
+The C<paren> parameter will be C<1> for C<$1>, C<2> for C<$2> and so
+forth, and have these symbolic values for the special variables:
+
+ ${^PREMATCH} RX_BUFF_IDX_CARET_PREMATCH
+ ${^POSTMATCH} RX_BUFF_IDX_CARET_POSTMATCH
+ ${^MATCH} RX_BUFF_IDX_CARET_FULLMATCH
+ $` RX_BUFF_IDX_PREMATCH
+ $' RX_BUFF_IDX_POSTMATCH
+ $& RX_BUFF_IDX_FULLMATCH
+
+Note that in perl 5.17.3 and earlier, the last three constants were also
+used for the caret variants of the variables.
+
The names have been chosen by analogy with L<Tie::Scalar> method
names with an additional B<LENGTH> callback for efficiency. However
Example:
if ("ook" =~ /(o*)/) {
- # `paren' will be `1' and `value' will be `ee'
+ # 'paren' will be '1' and 'value' will be 'ee'
$1 =~ tr/o/e/;
}
On unthreaded builds this field doesn't exist.
+=head2 op_comp
+
+This is private to the perl core and subject to change. It should be left
+null.
+
=head1 The REGEXP structure
The REGEXP struct is defined in F<regexp.h>. All regex engines must be able to
created this object. */
/* Data about the last/current match. These are modified during matching*/
- U32 lastparen; /* last open paren matched */
- U32 lastcloseparen; /* last close paren matched */
+ U32 lastparen; /* highest close paren matched ($+) */
+ U32 lastcloseparen; /* last close paren matched ($^N) */
regexp_paren_pair *swap; /* Swap copy of *offs */
regexp_paren_pair *offs; /* Array of offsets for (@-) and (@+) */
char *subbeg; /* saved or original string so \digit works forever. */
SV_SAVED_COPY /* If non-NULL, SV which is COW from original */
I32 sublen; /* Length of string pointed by subbeg */
+ I32 suboffset; /* byte offset of subbeg from logical start of str */
+ I32 subcoffset; /* suboffset equiv, but in chars (for @-/@+) */
/* Information about the match that isn't often used */
I32 prelen; /* length of precomp */
is currently only used internally by perl's engine, but might be
used in the future for all engines for optimisations.
-=head2 C<nparens>, C<lasparen>, and C<lastcloseparen>
+=head2 C<nparens>, C<lastparen>, and C<lastcloseparen>
These fields are used to keep track of how many paren groups could be matched
in the pattern, which was the last open paren to be entered, and which was
If C<< ->offs[num].start >> or C<< ->offs[num].end >> is C<-1> then that
capture group did not match. C<< ->offs[0].start/end >> represents C<$&> (or
-C<${^MATCH> under C<//p>) and C<< ->offs[paren].end >> matches C<$$paren> where
+C<${^MATCH}> under C<//p>) and C<< ->offs[paren].end >> matches C<$$paren> where
C<$paren >= 1>.
=head2 C<precomp> C<prelen>
Fast-Boyer-Moore searches on the string to find out if it's worth using
the regex engine at all, and if so where in the string to search.
-=head2 C<subbeg> C<sublen> C<saved_copy>
-
-Used during execution phase for managing search and replace patterns.
+=head2 C<subbeg> C<sublen> C<saved_copy> C<suboffset> C<subcoffset>
+
+Used during the execution phase for managing search and replace patterns,
+and for providing the text for C<$&>, C<$1> etc. C<subbeg> points to a
+buffer (either the original string, or a copy in the case of
+C<RX_MATCH_COPIED(rx)>), and C<sublen> is the length of the buffer. The
+C<RX_OFFS> start and end indices index into this buffer.
+
+In the presence of the C<REXEC_COPY_STR> flag, but with the addition of
+the C<REXEC_COPY_SKIP_PRE> or C<REXEC_COPY_SKIP_POST> flags, an engine
+can choose not to copy the full buffer (although it must still do so in
+the presence of C<RXf_PMf_KEEPCOPY> or the relevant bits being set in
+C<PL_sawampersand>). In this case, it may set C<suboffset> to indicate the
+number of bytes from the logical start of the buffer to the physical start
+(i.e. C<subbeg>). It should also set C<subcoffset>, the number of
+characters in the offset. The latter is needed to support C<@-> and C<@+>
+which work in characters, not bytes.
=head2 C<wrapped> C<wraplen>