This is a live mirror of the Perl 5 development currently hosted at https://github.com/perl/perl5
perlretut: use a numbered list to format a numbered list
[perl5.git] / regexp.h
CommitLineData
a0d0e21e 1/* regexp.h
d6376244 2 *
4bb101f2 3 * Copyright (C) 1993, 1994, 1996, 1997, 1999, 2000, 2001, 2003,
1129b882 4 * 2005, 2006, 2007, 2008 by Larry Wall and others
d6376244
JH
5 *
6 * You may distribute under the terms of either the GNU General Public
7 * License or the Artistic License, as specified in the README file.
8 *
a0d0e21e
LW
9 */
10
378cc40b
LW
11/*
12 * Definitions etc. for regexp(3) routines.
13 *
14 * Caveat: this is V8 regexp(3) [actually, a reimplementation thereof],
15 * not the System V one.
16 */
be8e71aa 17#ifndef PLUGGABLE_RE_EXTENSION
785a26d5
YO
18/* we don't want to include this stuff if we are inside of
19 an external regex engine based on the core one - like re 'debug'*/
378cc40b 20
6c5474e8 21# include "utf8.h"
79a2a0e8 22
f55b7b26
KW
23typedef SSize_t regnode_offset;
24
12d173c9
YO
25struct regnode_meta {
26 U8 type;
27 U8 arg_len;
28 U8 arg_len_varies;
29 U8 off_by_arg;
30};
31
44eb4cdc
YO
32/* this ensures that on alignment sensitive platforms
33 * this struct is aligned on 32 bit boundaries */
34union regnode_head {
35 struct {
36 union {
37 U8 flags;
38 U8 str_len_u8;
39 U8 first_byte;
40 } u_8;
41 U8 type;
42 U16 next_off;
43 } data;
44 U32 data_u32;
45};
46
c277df42 47struct regnode {
44eb4cdc 48 union regnode_head head;
c277df42
IZ
49};
50
51typedef struct regnode regnode;
52
cad2e5aa 53struct reg_substr_data;
2779dcf1 54
0ee3c8d0
JH
55struct reg_data;
56
f9f4320a 57struct regexp_engine;
28d8d7f4 58struct regexp;
bbe252da 59
785a26d5 60struct reg_substr_datum {
c4828ca0 61 SSize_t min_offset; /* min pos (in chars) that substr must appear */
6459affc 62 SSize_t max_offset; /* max pos (in chars) that substr must appear */
85900e28
YO
63 SV *substr; /* non-utf8 variant */
64 SV *utf8_substr; /* utf8 variant */
c4828ca0 65 SSize_t end_shift; /* how many fixed chars must end the string */
785a26d5
YO
66};
67struct reg_substr_data {
6480a6c4 68 U8 check_ix; /* index into data[] of check substr */
85900e28 69 struct reg_substr_datum data[3]; /* Actual array */
785a26d5 70};
f9f4320a 71
6c5474e8
KW
72# ifdef PERL_ANY_COW
73# define SV_SAVED_COPY SV *saved_copy; /* If non-NULL, SV which is COW from original */
74# else
75# define SV_SAVED_COPY
76# endif
28d8d7f4 77
1ed8aa9f
YO
78/* offsets within a string of a particular /(.)/ capture
79 * if you change this by adding new non-temporary fields
80 * then be sure to update Perl_rxres_save() in pp_ctl.c */
f0ab9afb 81typedef struct regexp_paren_pair {
99a90e59
FC
82 SSize_t start;
83 SSize_t end;
05b13cf6 84
b3fd53f3
DM
85 /* 'start_tmp' records a new opening position before the matching end
86 * has been found, so that the old start and end values are still
87 * valid, e.g.
85900e28 88 * "abc" =~ /(.(?{print "[$1]"}))+/
b3fd53f3
DM
89 *outputs [][a][b]
90 * This field is not part of the API. */
ea3daa5d 91 SSize_t start_tmp;
f0ab9afb 92} regexp_paren_pair;
28d8d7f4 93
85900e28 94# if defined(PERL_IN_REGCOMP_ANY) || defined(PERL_IN_UTF8_C)
6c5474e8
KW
95# define _invlist_union(a, b, output) _invlist_union_maybe_complement_2nd(a, b, FALSE, output)
96# define _invlist_intersection(a, b, output) _invlist_intersection_maybe_complement_2nd(a, b, FALSE, output)
3f80b571
KW
97
98/* Subtracting b from a leaves in a everything that was there that isn't in b,
99 * that is the intersection of a with b's complement */
6c5474e8
KW
100# define _invlist_subtract(a, b, output) _invlist_intersection_maybe_complement_2nd(a, b, TRUE, output)
101# endif
52ae8f7e 102
3d2bd50a
DM
103/* record the position of a (?{...}) within a pattern */
104
105struct reg_code_block {
106 STRLEN start;
107 STRLEN end;
108 OP *block;
b30fcab9 109 REGEXP *src_regex;
3d2bd50a
DM
110};
111
1acab4c5
DM
112/* array of reg_code_block's plus header info */
113
114struct reg_code_blocks {
f8def6c7 115 int refcnt; /* we may be pointed to from a regex and from the savestack */
1acab4c5
DM
116 int count; /* how many code blocks */
117 struct reg_code_block *cb; /* array of reg_code_block's */
118};
119
3d2bd50a 120
882227b7 121/*
1f6e74eb 122= for apidoc AyT||regexp
882227b7
AB
123 The regexp/REGEXP struct, see L<perlreapi> for further documentation
124 on the individual fields. The struct is ordered so that the most
125 commonly used fields are placed at the start.
126
127 Any patch that adds items to this struct will need to include
128 changes to F<sv.c> (C<Perl_re_dup()>) and F<regcomp.c>
129 (C<pregfree()>). This involves freeing or cloning items in the
130 regexp's data array based on the data item's type.
131*/
132
bbe252da 133typedef struct regexp {
e1168c11
DM
134 _XPV_HEAD;
135 const struct regexp_engine* engine; /* what engine created this regexp? */
136 REGEXP *mother_re; /* what re is this a lightweight copy of? */
137 HV *paren_names; /* Optional hash of paren names */
138
139 /*----------------------------------------------------------------------
140 * Information about the match that the perl core uses to manage things
141 */
142
fe5492d9
YO
143 /* see comment in regcomp_internal.h about branch reset to understand
144 the distinction between physical and logical capture buffers */
145 U32 nparens; /* physical number of capture buffers */
146 U32 logical_nparens; /* logical number of capture buffers */
147 I32 *logical_to_parno; /* map logical parno to first physical */
148 I32 *parno_to_logical; /* map every physical parno to logical */
149 I32 *parno_to_logical_next; /* map every physical parno to the next
150 physical with the same logical id */
151
e1168c11 152 U32 extflags; /* Flags used both externally and internally */
1ee51d7e 153 SSize_t maxlen; /* maximum possible number of chars in string to match */
e1168c11 154 SSize_t minlen; /* minimum possible number of chars in string to match */
0c6362ad 155 SSize_t minlenret; /* minimum possible number of chars in $& */
e1168c11
DM
156 STRLEN gofs; /* chars left of pos that we search from */
157 /* substring data about strings that must appear in
158 * the final match, used for optimisations */
fe5492d9 159
e1168c11 160 struct reg_substr_data *substrs;
e1168c11
DM
161
162 /* private engine specific data */
163
e1168c11
DM
164 void *pprivate; /* Data private to the regex engine which
165 * created this object. */
2bb68ff1 166 U32 intflags; /* Engine Specific Internal flags */
e1168c11
DM
167
168 /*----------------------------------------------------------------------
169 * Data about the last/current match. These are modified during matching
170 */
171
0b9dad94 172 U32 lastparen; /* highest close paren matched ($+) */
e1168c11
DM
173 regexp_paren_pair *offs; /* Array of offsets for (@-) and (@+) */
174 char **recurse_locinput; /* used to detect infinite recursion, XXX: move to internal */
2bb68ff1 175 U32 lastcloseparen; /* last close paren matched ($^N) */
e1168c11 176
fe5492d9 177
e1168c11
DM
178 /*---------------------------------------------------------------------- */
179
180 /* offset from wrapped to the start of precomp */
181 PERL_BITFIELD32 pre_prefix:4;
182
183 /* original flags used to compile the pattern, may differ from
184 * extflags in various ways */
185 PERL_BITFIELD32 compflags:9;
186
187 /*---------------------------------------------------------------------- */
188
da35f4ca 189 char *subbeg; /* saved or original string so \digit works forever. */
2bb68ff1
RL
190 SV_SAVED_COPY /* If non-NULL, SV which is COW from original */
191 SSize_t sublen; /* Length of string pointed by subbeg */
192 SSize_t suboffset; /* byte offset of subbeg from logical start of str */
193 SSize_t subcoffset; /* suboffset equiv, but in chars (for @-/@+) */
2bb68ff1
RL
194
195 /*---------------------------------------------------------------------- */
196
197
e1168c11 198 CV *qr_anoncv; /* the anon sub wrapped round qr/(?{..})/ */
f8fc2ecf
YO
199} regexp;
200
e1168c11 201
05b13cf6 202#define RXp_PAREN_NAMES(rx) ((rx)->paren_names)
3f11a285 203
05b13cf6 204#define RXp_OFFS_START(rx,n) \
bf0d793b 205 RXp_OFFSp(rx)[(n)].start
3f11a285 206
05b13cf6 207#define RXp_OFFS_END(rx,n) \
bf0d793b 208 RXp_OFFSp(rx)[(n)].end
3f11a285
YO
209
210#define RXp_OFFS_VALID(rx,n) \
bf0d793b 211 (RXp_OFFSp(rx)[(n)].end != -1 && RXp_OFFSp(rx)[(n)].start != -1 )
3f11a285
YO
212
213#define RX_OFFS_START(rx_sv,n) RXp_OFFS_START(ReANY(rx_sv),n)
214#define RX_OFFS_END(rx_sv,n) RXp_OFFS_END(ReANY(rx_sv),n)
215#define RX_OFFS_VALID(rx_sv,n) RXp_OFFS_VALID(ReANY(rx_sv),n)
5daac39c 216
785a26d5 217/* used for high speed searches */
f9f4320a
YO
218typedef struct re_scream_pos_data_s
219{
85900e28
YO
220 char **scream_olds; /* match pos */
221 SSize_t *scream_pos; /* Internal iterator of scream. */
f9f4320a
YO
222} re_scream_pos_data;
223
785a26d5
YO
224/* regexp_engine structure. This is the dispatch table for regexes.
225 * Any regex engine implementation must be able to build one of these.
226 */
f9f4320a 227typedef struct regexp_engine {
1593ad57 228 REGEXP* (*comp) (pTHX_ SV * const pattern, U32 flags);
49d7dfbc 229 I32 (*exec) (pTHX_ REGEXP * const rx, char* stringarg, char* strend,
ea3daa5d 230 char* strbeg, SSize_t minend, SV* sv,
49d7dfbc 231 void* data, U32 flags);
52a21eb3
DM
232 char* (*intuit) (pTHX_
233 REGEXP * const rx,
234 SV *sv,
235 const char * const strbeg,
236 char *strpos,
237 char *strend,
238 const U32 flags,
9f61653a 239 re_scream_pos_data *data);
49d7dfbc 240 SV* (*checkstr) (pTHX_ REGEXP * const rx);
fc6bde6f 241 void (*rxfree) (pTHX_ REGEXP * const rx);
2fdbfb4d 242 void (*numbered_buff_FETCH) (pTHX_ REGEXP * const rx, const I32 paren,
192b9cd1 243 SV * const sv);
2fdbfb4d
AB
244 void (*numbered_buff_STORE) (pTHX_ REGEXP * const rx, const I32 paren,
245 SV const * const value);
246 I32 (*numbered_buff_LENGTH) (pTHX_ REGEXP * const rx, const SV * const sv,
247 const I32 paren);
192b9cd1
AB
248 SV* (*named_buff) (pTHX_ REGEXP * const rx, SV * const key,
249 SV * const value, const U32 flags);
250 SV* (*named_buff_iter) (pTHX_ REGEXP * const rx, const SV * const lastkey,
251 const U32 flags);
49d7dfbc 252 SV* (*qr_package)(pTHX_ REGEXP * const rx);
6c5474e8 253# ifdef USE_ITHREADS
49d7dfbc 254 void* (*dupe) (pTHX_ REGEXP * const rx, CLONE_PARAMS *param);
6c5474e8 255# endif
3c13cae6 256 REGEXP* (*op_comp) (pTHX_ SV ** const patternp, int pat_count,
1f4fbd3b
MS
257 OP *expr, const struct regexp_engine* eng,
258 REGEXP *old_re,
259 bool *is_bare_re, U32 orig_rx_flags, U32 pm_flags);
f9f4320a
YO
260} regexp_engine;
261
192b9cd1
AB
262/*
263 These are passed to the numbered capture variable callbacks as the
264 paren name. >= 1 is reserved for actual numbered captures, i.e. $1,
265 $2 etc.
266*/
6c5474e8
KW
267# define RX_BUFF_IDX_CARET_PREMATCH -5 /* ${^PREMATCH} */
268# define RX_BUFF_IDX_CARET_POSTMATCH -4 /* ${^POSTMATCH} */
269# define RX_BUFF_IDX_CARET_FULLMATCH -3 /* ${^MATCH} */
270# define RX_BUFF_IDX_PREMATCH -2 /* $` */
271# define RX_BUFF_IDX_POSTMATCH -1 /* $' */
272# define RX_BUFF_IDX_FULLMATCH 0 /* $& */
192b9cd1
AB
273
274/*
275 Flags that are passed to the named_buff and named_buff_iter
276 callbacks above. Those routines are called from universal.c via the
277 Tie::Hash::NamedCapture interface for %+ and %- and the re::
278 functions in the same file.
279*/
280
281/* The Tie::Hash::NamedCapture operation this is part of, if any */
6c5474e8
KW
282# define RXapif_FETCH 0x0001
283# define RXapif_STORE 0x0002
284# define RXapif_DELETE 0x0004
285# define RXapif_CLEAR 0x0008
286# define RXapif_EXISTS 0x0010
287# define RXapif_SCALAR 0x0020
288# define RXapif_FIRSTKEY 0x0040
289# define RXapif_NEXTKEY 0x0080
192b9cd1
AB
290
291/* Whether %+ or %- is being operated on */
6c5474e8
KW
292# define RXapif_ONE 0x0100 /* %+ */
293# define RXapif_ALL 0x0200 /* %- */
192b9cd1
AB
294
295/* Whether this is being called from a re:: function */
6c5474e8
KW
296# define RXapif_REGNAME 0x0400
297# define RXapif_REGNAMES 0x0800
298# define RXapif_REGNAMES_COUNT 0x1000
192b9cd1 299
f7e71195 300/*
f7e71195
AB
301=for apidoc Am|REGEXP *|SvRX|SV *sv
302
72d33970 303Convenience macro to get the REGEXP from a SV. This is approximately
f7e71195
AB
304equivalent to the following snippet:
305
306 if (SvMAGICAL(sv))
307 mg_get(sv);
1af910ea
DL
308 if (SvROK(sv))
309 sv = MUTABLE_SV(SvRV(sv));
310 if (SvTYPE(sv) == SVt_REGEXP)
311 return (REGEXP*) sv;
f7e71195 312
796b6530 313C<NULL> will be returned if a REGEXP* is not found.
f7e71195
AB
314
315=for apidoc Am|bool|SvRXOK|SV* sv
316
1af910ea
DL
317Returns a boolean indicating whether the SV (or the one it references)
318is a REGEXP.
f7e71195
AB
319
320If you want to do something with the REGEXP* later use SvRX instead
321and check for NULL.
322
323=cut
324*/
325
6c5474e8
KW
326# define SvRX(sv) (Perl_get_re_arg(aTHX_ sv))
327# define SvRXOK(sv) cBOOL(Perl_get_re_arg(aTHX_ sv))
f7e71195
AB
328
329
7948fc08 330/* Flags stored in regexp->extflags
bbe252da 331 * These are used by code external to the regexp engine
e357fc67 332 *
5b126c84
KW
333 * Note that the flags whose names start with RXf_PMf_ are defined in
334 * op_reg_common.h, being copied from the parallel flags of op_pmflags
cb5027f2 335 *
eb2624c9
FC
336 * NOTE: if you modify any RXf flags you should run regen.pl or
337 * regen/regcomp.pl so that regnodes.h is updated with the changes.
cb5027f2 338 *
bbe252da
YO
339 */
340
6c5474e8 341# include "op_reg_common.h"
5b126c84 342
85900e28 343# define RXf_PMf_STD_PMMOD (RXf_PMf_MULTILINE|RXf_PMf_SINGLELINE|RXf_PMf_FOLD|RXf_PMf_EXTENDED|RXf_PMf_EXTENDED_MORE|RXf_PMf_NOCAPTURE)
cde0cee5 344
6c5474e8 345# define CASE_STD_PMMOD_FLAGS_PARSE_SET(pmfl, x_count) \
cc4d09e1
KW
346 case IGNORE_PAT_MOD: *(pmfl) |= RXf_PMf_FOLD; break; \
347 case MULTILINE_PAT_MOD: *(pmfl) |= RXf_PMf_MULTILINE; break; \
348 case SINGLE_PAT_MOD: *(pmfl) |= RXf_PMf_SINGLELINE; break; \
77c8f263
KW
349 case XTENDED_PAT_MOD: if (x_count == 0) { \
350 *(pmfl) |= RXf_PMf_EXTENDED; \
351 *(pmfl) &= ~RXf_PMf_EXTENDED_MORE; \
352 } \
353 else { \
354 *(pmfl) |= RXf_PMf_EXTENDED \
355 |RXf_PMf_EXTENDED_MORE; \
356 } \
357 (x_count)++; break; \
41d7c59e 358 case NOCAPTURE_PAT_MOD: *(pmfl) |= RXf_PMf_NOCAPTURE; break;
cc4d09e1 359
94b03d7d 360/* Note, includes charset ones, assumes 0 is the default for them */
6c5474e8 361# define STD_PMMOD_FLAGS_CLEAR(pmfl) \
77c8f263 362 *(pmfl) &= ~(RXf_PMf_FOLD|RXf_PMf_MULTILINE|RXf_PMf_SINGLELINE|RXf_PMf_EXTENDED|RXf_PMf_EXTENDED_MORE|RXf_PMf_CHARSET|RXf_PMf_NOCAPTURE)
fb85c044 363
bcdf7404 364/* chars and strings used as regex pattern modifiers
486ec47a 365 * Singular is a 'c'har, plural is a "string"
87e95b7f
YO
366 *
367 * NOTE, KEEPCOPY was originally 'k', but was changed to 'p' for preserve
368 * for compatibility reasons with Regexp::Common which highjacked (?k:...)
369 * for its own uses. So 'k' is out as well.
bcdf7404 370 */
6c5474e8
KW
371# define DEFAULT_PAT_MOD '^' /* Short for all the default modifiers */
372# define EXEC_PAT_MOD 'e'
373# define KEEPCOPY_PAT_MOD 'p'
374# define NOCAPTURE_PAT_MOD 'n'
375# define ONCE_PAT_MOD 'o'
376# define GLOBAL_PAT_MOD 'g'
377# define CONTINUE_PAT_MOD 'c'
378# define MULTILINE_PAT_MOD 'm'
379# define SINGLE_PAT_MOD 's'
380# define IGNORE_PAT_MOD 'i'
381# define XTENDED_PAT_MOD 'x'
382# define NONDESTRUCT_PAT_MOD 'r'
383# define LOCALE_PAT_MOD 'l'
384# define UNICODE_PAT_MOD 'u'
385# define DEPENDS_PAT_MOD 'd'
386# define ASCII_RESTRICT_PAT_MOD 'a'
387
388# define ONCE_PAT_MODS "o"
389# define KEEPCOPY_PAT_MODS "p"
390# define NOCAPTURE_PAT_MODS "n"
391# define EXEC_PAT_MODS "e"
392# define LOOP_PAT_MODS "gc"
393# define NONDESTRUCT_PAT_MODS "r"
394# define LOCALE_PAT_MODS "l"
395# define UNICODE_PAT_MODS "u"
396# define DEPENDS_PAT_MODS "d"
397# define ASCII_RESTRICT_PAT_MODS "a"
398# define ASCII_MORE_RESTRICT_PAT_MODS "aa"
bcdf7404 399
78fecfc8 400/* This string is expected by regcomp.c to be ordered so that the first
c18d5d15
KW
401 * character is the flag in bit RXf_PMf_STD_PMMOD_SHIFT of extflags; the next
402 * character is bit +1, etc. */
6c5474e8 403# define STD_PAT_MODS "msixxn"
bcdf7404 404
6c5474e8 405# define CHARSET_PAT_MODS ASCII_RESTRICT_PAT_MODS DEPENDS_PAT_MODS LOCALE_PAT_MODS UNICODE_PAT_MODS
94b03d7d 406
78fecfc8 407/* This string is expected by XS_re_regexp_pattern() in universal.c to be ordered
c18d5d15
KW
408 * so that the first character is the flag in bit RXf_PMf_STD_PMMOD_SHIFT of
409 * extflags; the next character is in bit +1, etc. */
6c5474e8 410# define INT_PAT_MODS STD_PAT_MODS KEEPCOPY_PAT_MODS
bcdf7404 411
6c5474e8 412# define EXT_PAT_MODS ONCE_PAT_MODS KEEPCOPY_PAT_MODS NOCAPTURE_PAT_MODS
85900e28 413# define QR_PAT_MODS STD_PAT_MODS EXT_PAT_MODS CHARSET_PAT_MODS
6c5474e8
KW
414# define M_PAT_MODS QR_PAT_MODS LOOP_PAT_MODS
415# define S_PAT_MODS M_PAT_MODS EXEC_PAT_MODS NONDESTRUCT_PAT_MODS
bcdf7404 416
cb5027f2 417/*
eb2624c9
FC
418 * NOTE: if you modify any RXf flags you should run regen.pl or
419 * regen/regcomp.pl so that regnodes.h is updated with the changes.
cb5027f2
YO
420 *
421 */
bcdf7404 422
dbc200c5 423/*
5012eebe
DM
424 Set in Perl_pmruntime for a split. Will be used by regex engines to
425 check whether they should set RXf_SKIPWHITE
dbc200c5 426*/
6c5474e8 427# define RXf_SPLIT RXf_PMf_SPLIT
32f62247 428
0bd51918
KW
429/* Currently the regex flags occupy a single 32-bit word. Not all bits are
430 * currently used. The lower bits are shared with their corresponding PMf flag
431 * bits, up to but not including _RXf_PMf_SHIFT_NEXT. The unused bits
432 * immediately follow; finally the used RXf-only (unshared) bits, so that the
433 * highest bit in the word is used. This gathers all the unused bits as a pool
434 * in the middle, like so: 11111111111111110000001111111111
435 * where the '1's represent used bits, and the '0's unused. This design allows
436 * us to allocate off one end of the pool if we need to add a shared bit, and
437 * off the other end if we need a non-shared bit, without disturbing the other
438 * bits. This maximizes the likelihood of being able to change things without
439 * breaking binary compatibility.
440 *
441 * To add shared bits, do so in op_reg_common.h. This should change
442 * _RXf_PMf_SHIFT_NEXT so that things won't compile. Then come to regexp.h and
443 * op.h and adjust the constant adders in the definitions of RXf_BASE_SHIFT and
444 * PMf_BASE_SHIFT down by the number of shared bits you added. That's it.
445 * Things should be binary compatible. But if either of these gets to having
446 * to subtract rather than add, leave at 0 and instead adjust all the entries
447 * that are in terms of it. But if the first one of those is already
448 * RXf_BASE_SHIFT+0, there are no bits left, and a redesign is in order.
449 *
450 * To remove an unshared bit, just delete its entry. If you're at a point where
451 * breaking binary compatibility is ok, you might want to adjust things to move
452 * the newly opened space so that it gets absorbed into the common pool.
453 *
454 * To add unshared bits, first use up any gaps in the middle. Otherwise,
455 * allocate off the low end until you get to RXf_BASE_SHIFT+0. If that isn't
456 * enough, move RXf_BASE_SHIFT down (if possible) and add the new bit at the
457 * other end instead; this preserves binary compatibility.
458 *
459 * For the regexp bits, PL_reg_extflags_name[] in regnodes.h has a comment
460 * giving which bits are used/unused */
461
6c5474e8 462# define RXf_BASE_SHIFT (_RXf_PMf_SHIFT_NEXT + 2)
52d81aa8 463
bbe252da 464/* What we have seen */
6c5474e8 465# define RXf_NO_INPLACE_SUBST (1U<<(RXf_BASE_SHIFT+2))
85900e28 466# define RXf_EVAL_SEEN (1U<<(RXf_BASE_SHIFT+3))
bbe252da
YO
467
468/* Special */
6c5474e8 469# define RXf_UNBOUNDED_QUANTIFIER_SEEN (1U<<(RXf_BASE_SHIFT+4))
85900e28 470# define RXf_CHECK_ALL (1U<<(RXf_BASE_SHIFT+5))
bbe252da
YO
471
472/* UTF8 related */
85900e28 473# define RXf_MATCH_UTF8 (1U<<(RXf_BASE_SHIFT+6)) /* $1 etc are utf8 */
bbe252da
YO
474
475/* Intuit related */
85900e28
YO
476# define RXf_USE_INTUIT_NOML (1U<<(RXf_BASE_SHIFT+7))
477# define RXf_USE_INTUIT_ML (1U<<(RXf_BASE_SHIFT+8))
478# define RXf_INTUIT_TAIL (1U<<(RXf_BASE_SHIFT+9))
6c5474e8 479# define RXf_USE_INTUIT (RXf_USE_INTUIT_NOML|RXf_USE_INTUIT_ML)
bbe252da 480
a3b51d37 481/* Do we have some sort of anchor? */
6c5474e8 482# define RXf_IS_ANCHORED (1U<<(RXf_BASE_SHIFT+10))
a3b51d37 483
bbe252da 484/* Copy and tainted info */
85900e28 485# define RXf_COPY_DONE (1U<<(RXf_BASE_SHIFT+11))
ef07e810 486
1738e041 487/* post-execution: $1 et al are tainted */
85900e28 488# define RXf_TAINTED_SEEN (1U<<(RXf_BASE_SHIFT+12))
ef07e810 489/* this pattern was tainted during compilation */
85900e28 490# define RXf_TAINTED (1U<<(RXf_BASE_SHIFT+13))
52d81aa8
NC
491
492/* Flags indicating special patterns */
6c5474e8
KW
493# define RXf_START_ONLY (1U<<(RXf_BASE_SHIFT+14)) /* Pattern is /^/ */
494# define RXf_SKIPWHITE (1U<<(RXf_BASE_SHIFT+15)) /* Pattern is for a */
9cba692b 495 /* split " " */
85900e28
YO
496# define RXf_WHITE (1U<<(RXf_BASE_SHIFT+16)) /* Pattern is /\s+/ */
497# define RXf_NULL (1U<<(RXf_BASE_SHIFT+17)) /* Pattern is // */
0bd51918
KW
498
499/* See comments at the beginning of these defines about adding bits. The
500 * highest bit position should be used, so that if RXf_BASE_SHIFT gets
501 * increased, the #error below will be triggered so that you will be reminded
502 * to adjust things at the other end to keep the bit positions unchanged */
6c5474e8
KW
503# if RXf_BASE_SHIFT+17 > 31
504# error Too many RXf_PMf bits used. See comments at beginning of these for what to do
505# endif
c737faaf 506
cb5027f2 507/*
eb2624c9
FC
508 * NOTE: if you modify any RXf flags you should run regen.pl or
509 * regen/regcomp.pl so that regnodes.h is updated with the changes.
cb5027f2
YO
510 *
511 */
bbe252da 512
6c5474e8
KW
513# ifdef NO_TAINT_SUPPORT
514# define RX_ISTAINTED(rx_sv) 0
515# define RXp_ISTAINTED(prog) 0
516# define RX_TAINT_on(rx_sv) NOOP
517# define RXp_MATCH_TAINTED(prog) 0
518# define RX_MATCH_TAINTED(rx_sv) 0
519# define RXp_MATCH_TAINTED_on(prog) NOOP
520# define RX_MATCH_TAINTED_on(rx_sv) NOOP
521# define RXp_MATCH_TAINTED_off(prog) NOOP
522# define RX_MATCH_TAINTED_off(rx_sv) NOOP
523# else
524# define RX_ISTAINTED(rx_sv) (RX_EXTFLAGS(rx_sv) & RXf_TAINTED)
525# define RXp_ISTAINTED(prog) (RXp_EXTFLAGS(prog) & RXf_TAINTED)
526# define RX_TAINT_on(rx_sv) (RX_EXTFLAGS(rx_sv) |= RXf_TAINTED)
527# define RXp_MATCH_TAINTED(prog) (RXp_EXTFLAGS(prog) & RXf_TAINTED_SEEN)
528# define RX_MATCH_TAINTED(rx_sv) (RX_EXTFLAGS(rx_sv) & RXf_TAINTED_SEEN)
529# define RXp_MATCH_TAINTED_on(prog) (RXp_EXTFLAGS(prog) |= RXf_TAINTED_SEEN)
530# define RX_MATCH_TAINTED_on(rx_sv) (RX_EXTFLAGS(rx_sv) |= RXf_TAINTED_SEEN)
531# define RXp_MATCH_TAINTED_off(prog) (RXp_EXTFLAGS(prog) &= ~RXf_TAINTED_SEEN)
532# define RX_MATCH_TAINTED_off(rx_sv) (RX_EXTFLAGS(rx_sv) &= ~RXf_TAINTED_SEEN)
533# endif
534
535# define RXp_HAS_CUTGROUP(prog) ((prog)->intflags & PREGf_CUTGROUP_SEEN)
536
537# define RX_MATCH_TAINTED_set(rx_sv, t) ((t) \
a885a8e0
DM
538 ? RX_MATCH_TAINTED_on(rx_sv) \
539 : RX_MATCH_TAINTED_off(rx_sv))
540
6c5474e8
KW
541# define RXp_MATCH_COPIED(prog) (RXp_EXTFLAGS(prog) & RXf_COPY_DONE)
542# define RX_MATCH_COPIED(rx_sv) (RX_EXTFLAGS(rx_sv) & RXf_COPY_DONE)
543# define RXp_MATCH_COPIED_on(prog) (RXp_EXTFLAGS(prog) |= RXf_COPY_DONE)
544# define RX_MATCH_COPIED_on(rx_sv) (RX_EXTFLAGS(rx_sv) |= RXf_COPY_DONE)
545# define RXp_MATCH_COPIED_off(prog) (RXp_EXTFLAGS(prog) &= ~RXf_COPY_DONE)
546# define RX_MATCH_COPIED_off(rx_sv) (RX_EXTFLAGS(rx_sv) &= ~RXf_COPY_DONE)
547# define RX_MATCH_COPIED_set(rx_sv,t) ((t) \
a885a8e0
DM
548 ? RX_MATCH_COPIED_on(rx_sv) \
549 : RX_MATCH_COPIED_off(rx_sv))
550
6c5474e8
KW
551# define RXp_EXTFLAGS(rx) ((rx)->extflags)
552# define RXp_COMPFLAGS(rx) ((rx)->compflags)
866c78d1 553
07bc277f 554/* For source compatibility. We used to store these explicitly. */
6c5474e8 555# define RX_PRECOMP(rx_sv) (RX_WRAPPED(rx_sv) \
a885a8e0 556 + ReANY(rx_sv)->pre_prefix)
6c5474e8 557# define RX_PRECOMP_const(rx_sv) (RX_WRAPPED_const(rx_sv) \
a885a8e0 558 + ReANY(rx_sv)->pre_prefix)
5509d87a
NC
559/* FIXME? Are we hardcoding too much here and constraining plugin extension
560 writers? Specifically, the value 1 assumes that the wrapped version always
561 has exactly one character at the end, a ')'. Will that always be true? */
6c5474e8 562# define RX_PRELEN(rx_sv) (RX_WRAPLEN(rx_sv) \
a885a8e0
DM
563 - ReANY(rx_sv)->pre_prefix - 1)
564
6c5474e8
KW
565# define RX_WRAPPED(rx_sv) SvPVX(rx_sv)
566# define RX_WRAPPED_const(rx_sv) SvPVX_const(rx_sv)
567# define RX_WRAPLEN(rx_sv) SvCUR(rx_sv)
568# define RX_CHECK_SUBSTR(rx_sv) (ReANY(rx_sv)->check_substr)
569# define RX_REFCNT(rx_sv) SvREFCNT(rx_sv)
570# define RX_EXTFLAGS(rx_sv) RXp_EXTFLAGS(ReANY(rx_sv))
571# define RX_COMPFLAGS(rx_sv) RXp_COMPFLAGS(ReANY(rx_sv))
572# define RXp_ENGINE(prog) ((prog)->engine)
573# define RX_ENGINE(rx_sv) (RXp_ENGINE(ReANY(rx_sv)))
d145af01 574# define RXp_SUBBEG(prog) ((prog)->subbeg)
6c5474e8 575# define RX_SUBBEG(rx_sv) (RXp_SUBBEG(ReANY(rx_sv)))
d145af01 576# define RXp_SUBOFFSET(prog) ((prog)->suboffset)
6c5474e8 577# define RX_SUBOFFSET(rx_sv) (RXp_SUBOFFSET(ReANY(rx_sv)))
248ff857
YO
578# define RXp_SUBCOFFSET(prog) ((prog)->subcoffset)
579# define RX_SUBCOFFSET(rx_sv) (RXp_SUBCOFFSET(ReANY(rx_sv)))
d145af01 580# define RXp_OFFSp(prog) ((prog)->offs)
3f11a285 581# define RX_OFFSp(rx_sv) (RXp_OFFSp(ReANY(rx_sv)))
d145af01 582# define RXp_LOGICAL_NPARENS(prog) ((prog)->logical_nparens)
fe5492d9 583# define RX_LOGICAL_NPARENS(rx_sv) (RXp_LOGICAL_NPARENS(ReANY(rx_sv)))
d145af01 584# define RXp_LOGICAL_TO_PARNO(prog) ((prog)->logical_to_parno)
fe5492d9 585# define RX_LOGICAL_TO_PARNO(rx_sv) (RXp_LOGICAL_TO_PARNO(ReANY(rx_sv)))
d145af01 586# define RXp_PARNO_TO_LOGICAL(prog) ((prog)->parno_to_logical)
fe5492d9 587# define RX_PARNO_TO_LOGICAL(rx_sv) (RXp_PARNO_TO_LOGICAL(ReANY(rx_sv)))
d145af01 588# define RXp_PARNO_TO_LOGICAL_NEXT(prog) ((prog)->parno_to_logical_next)
fe5492d9 589# define RX_PARNO_TO_LOGICAL_NEXT(rx_sv) (RXp_PARNO_TO_LOGICAL_NEXT(ReANY(rx_sv)))
d145af01 590# define RXp_NPARENS(prog) ((prog)->nparens)
6c5474e8 591# define RX_NPARENS(rx_sv) (RXp_NPARENS(ReANY(rx_sv)))
248ff857
YO
592# define RXp_SUBLEN(prog) ((prog)->sublen)
593# define RX_SUBLEN(rx_sv) (RXp_SUBLEN(ReANY(rx_sv)))
d145af01 594# define RXp_MINLEN(prog) ((prog)->minlen)
6c5474e8 595# define RX_MINLEN(rx_sv) (RXp_MINLEN(ReANY(rx_sv)))
d145af01 596# define RXp_MINLENRET(prog) ((prog)->minlenret)
6c5474e8 597# define RX_MINLENRET(rx_sv) (RXp_MINLENRET(ReANY(rx_sv)))
d145af01 598# define RXp_GOFS(prog) ((prog)->gofs)
6c5474e8 599# define RX_GOFS(rx_sv) (RXp_GOFS(ReANY(rx_sv)))
d145af01
YO
600# define RXp_LASTPAREN(prog) ((prog)->lastparen)
601# define RX_LASTPAREN(rx_sv) (RXp_LASTPAREN(ReANY(rx_sv)))
602# define RXp_LASTCLOSEPAREN(prog) ((prog)->lastcloseparen)
603# define RX_LASTCLOSEPAREN(rx_sv) (RXp_LASTCLOSEPAREN(ReANY(rx_sv)))
604# define RXp_SAVED_COPY(prog) ((prog)->saved_copy)
6c5474e8 605# define RX_SAVED_COPY(rx_sv) (RXp_SAVED_COPY(ReANY(rx_sv)))
248ff857
YO
606# define RXp_SUBSTRS(prog) ((prog)->substrs)
607# define RX_SUBSTRS(rx_sv) (RXp_SUBSTRS(ReANY(rx_sv)))
608# define RXp_PPRIVATE(prog) ((prog)->pprivate)
609# define RX_PPRIVATE(rx_sv) (RXp_PPRIVATE(ReANY(rx_sv)))
610# define RXp_QR_ANONCV(prog) ((prog)->qr_anoncv)
611# define RX_QR_ANONCV(rx_sv) (RXp_QR_ANONCV(ReANY(rx_sv)))
612# define RXp_MOTHER_RE(prog) ((prog)->mother_re)
613# define RX_MOTHER_RE(rx_sv) (RXp_MOTHER_RE(ReANY(rx_sv)))
614# define RXp_PRE_PREFIX(prog) ((prog)->pre_prefix)
615# define RX_PRE_PREFIX(rx_sv) (RXp_PRE_PREFIX(ReANY(rx_sv)))
616
03c83e26 617/* last match was zero-length */
6c5474e8 618# define RXp_ZERO_LEN(prog) \
3f11a285
YO
619 (RXp_OFFS_START(prog,0) + (SSize_t)RXp_GOFS(prog) \
620 == RXp_OFFS_END(prog,0))
6c5474e8 621# define RX_ZERO_LEN(rx_sv) (RXp_ZERO_LEN(ReANY(rx_sv)))
220fc49f 622
be8e71aa
YO
623#endif /* PLUGGABLE_RE_EXTENSION */
624
486ec47a 625/* Stuff that needs to be included in the pluggable extension goes below here */
be8e71aa 626
db2c6cb3 627#ifdef PERL_ANY_COW
ffd0ba0c
YO
628# define RXp_MATCH_COPY_FREE(prog) \
629 STMT_START { \
630 if (RXp_SAVED_COPY(prog)) { \
631 SV_CHECK_THINKFIRST_COW_DROP(RXp_SAVED_COPY(prog)); \
632 } \
633 if (RXp_MATCH_COPIED(prog)) { \
634 Safefree(RXp_SUBBEG(prog)); \
635 RXp_MATCH_COPIED_off(prog); \
636 } \
637 } STMT_END
ed252734 638#else
ffd0ba0c
YO
639# define RXp_MATCH_COPY_FREE(prog) \
640 STMT_START { \
641 if (RXp_MATCH_COPIED(prog)) { \
642 Safefree(RXp_SUBBEG(prog)); \
643 RXp_MATCH_COPIED_off(prog); \
644 } \
645 } STMT_END
ed252734 646#endif
a8cb1947 647#define RX_MATCH_COPY_FREE(rx_sv) RXp_MATCH_COPY_FREE(ReANY(rx_sv))
ed252734 648
a885a8e0
DM
649#define RXp_MATCH_UTF8(prog) (RXp_EXTFLAGS(prog) & RXf_MATCH_UTF8)
650#define RX_MATCH_UTF8(rx_sv) (RX_EXTFLAGS(rx_sv) & RXf_MATCH_UTF8)
196a02af 651#define RXp_MATCH_UTF8_on(prog) (RXp_EXTFLAGS(prog) |= RXf_MATCH_UTF8)
97f6857b 652#define RX_MATCH_UTF8_on(rx_sv) (RXp_MATCH_UTF8_on(ReANY(rx_sv)))
196a02af
DM
653#define RXp_MATCH_UTF8_off(prog) (RXp_EXTFLAGS(prog) &= ~RXf_MATCH_UTF8)
/* Clear the RXf_MATCH_UTF8 flag for rx_sv: resolve it with ReANY() and defer
 * to RXp_MATCH_UTF8_off().  The expansion is fully parenthesized, mirroring
 * RX_MATCH_UTF8_on(), so the macro can be used as an expression. */
#define RX_MATCH_UTF8_off(rx_sv)        (RXp_MATCH_UTF8_off(ReANY(rx_sv)))
655#define RXp_MATCH_UTF8_set(prog, t) ((t) \
656 ? RXp_MATCH_UTF8_on(prog) \
657 : RXp_MATCH_UTF8_off(prog))
658#define RX_MATCH_UTF8_set(rx_sv, t) (RXp_MATCH_UTF8_set(ReANY(rx_sv), t))
efd26800
NC
659
660/* Whether the pattern stored at RX_WRAPPED is in UTF-8 */
a885a8e0 661#define RX_UTF8(rx_sv) SvUTF8(rx_sv)
7948fc08 662
a340edde
DM
663
664/* bits in flags arg of Perl_regexec_flags() */
665
666#define REXEC_COPY_STR 0x01 /* Need to copy the string for captures. */
667#define REXEC_CHECKED 0x02 /* re_intuit_start() already called. */
668#define REXEC_SCREAM 0x04 /* currently unused. */
669#define REXEC_IGNOREPOS 0x08 /* use stringarg, not pos(), for \G match */
670#define REXEC_NOT_FIRST 0x10 /* This is another iteration of //g:
671 no need to copy string again */
672
673 /* under REXEC_COPY_STR, it's ok for the
674 engine (modulo PL_sawamperand etc)
675 to skip copying: ... */
676#define REXEC_COPY_SKIP_PRE 0x20 /* ...the $` part of the string, or */
677#define REXEC_COPY_SKIP_POST 0x40 /* ...the $' part of the string */
d5e7783a
DM
678#define REXEC_FAIL_ON_UNDERFLOW 0x80 /* fail the match if $& would start before
679 the start pos (so s/.\G// would fail
680 on second iteration) */
c277df42 681
041c1a23 682#if defined(PERL_USE_GCC_BRACE_GROUPS)
85900e28
YO
683# define ReREFCNT_inc(re) \
684 ({ \
685 /* This is here to generate a casting warning if incorrect. */ \
686 REGEXP *const _rerefcnt_inc = (re); \
687 assert(SvTYPE(_rerefcnt_inc) == SVt_REGEXP); \
688 SvREFCNT_inc(_rerefcnt_inc); \
689 _rerefcnt_inc; \
288b8c02 690 })
85900e28
YO
691# define ReREFCNT_dec(re) \
692 ({ \
693 /* This is here to generate a casting warning if incorrect. */ \
694 REGEXP *const _rerefcnt_dec = (re); \
695 SvREFCNT_dec(_rerefcnt_dec); \
288b8c02
NC
696 })
697#else
85900e28
YO
698# define ReREFCNT_dec(re) SvREFCNT_dec(re)
699# define ReREFCNT_inc(re) ((REGEXP *) SvREFCNT_inc(re))
288b8c02 700#endif
85900e28 701#define ReANY(re) Perl_ReANY((const REGEXP *)(re))
288b8c02
NC
702
703/* FIXME for plugins. */
cf93c79d 704
85900e28
YO
705#define FBMcf_TAIL_DOLLAR 1
706#define FBMcf_TAIL_DOLLARM 2
707#define FBMcf_TAIL_Z 4
708#define FBMcf_TAIL_z 8
709#define FBMcf_TAIL (FBMcf_TAIL_DOLLAR|FBMcf_TAIL_DOLLARM|FBMcf_TAIL_Z|FBMcf_TAIL_z)
cf93c79d 710
85900e28 711#define FBMrf_MULTILINE 1
cad2e5aa 712
331b2dcc
DM
713struct regmatch_state;
714struct regmatch_slab;
8adc0f72 715
bf2039a9
DM
716/* like regmatch_info_aux, but contains extra fields only needed if the
717 * pattern contains (?{}). If used, is snuck into the second slot in the
718 * regmatch_state stack at the start of execution */
8adc0f72
DM
719
720typedef struct {
721 regexp *rex;
722 PMOP *curpm; /* saved PL_curpm */
723#ifdef PERL_ANY_COW
724 SV *saved_copy; /* saved saved_copy field from rex */
725#endif
726 char *subbeg; /* saved subbeg field from rex */
727 STRLEN sublen; /* saved sublen field from rex */
728 STRLEN suboffset; /* saved suboffset field from rex */
729 STRLEN subcoffset; /* saved subcoffset field from rex */
1d48e83d 730 SV *sv; /* $_ during (?{}) */
8adc0f72 731 MAGIC *pos_magic; /* pos() magic attached to $_ */
ea3daa5d 732 SSize_t pos; /* the original value of pos() in pos_magic */
25fdce4a 733 U8 pos_flags; /* flags to be restored; currently only MGf_BYTES*/
bf2039a9
DM
734} regmatch_info_aux_eval;
735
736
737/* fields that logically live in regmatch_info, but which need cleaning
738 * up on croak(), and so are instead snuck into the first slot in
739 * the regmatch_state stack at the start of execution */
740
741typedef struct {
742 regmatch_info_aux_eval *info_aux_eval;
331b2dcc
DM
743 struct regmatch_state *old_regmatch_state; /* saved PL_regmatch_state */
744 struct regmatch_slab *old_regmatch_slab; /* saved PL_regmatch_slab */
85900e28 745 char *poscache; /* S-L cache of fail positions of WHILEMs */
bf2039a9
DM
746} regmatch_info_aux;
747
8adc0f72 748
1f6e74eb
KW
749/*
750=for apidoc Ay||regmatch_info
751Some basic information about the current match that is created by
752Perl_regexec_flags and then passed to regtry(), regmatch() etc.
753It is allocated as a local var on the stack, so nothing should be
754stored in it that needs preserving or clearing up on croak().
755For that, see the info_aux and info_aux_eval members of the
756regmatch_state union.
757
758=cut
759*/
3b0527fe
DM
760
761typedef struct {
28d03b21 762 REGEXP *prog; /* the regex being executed */
9d9163fb 763 const char * strbeg; /* real start of string */
28d03b21
DM
764 char *strend; /* one byte beyond last char of match string */
765 char *till; /* matches shorter than this fail (see minlen arg) */
766 SV *sv; /* the SV string currently being matched */
767 char *ganch; /* position of \G anchor */
768 char *cutpoint; /* (*COMMIT) position (if any) */
bf2039a9
DM
769 regmatch_info_aux *info_aux; /* extra fields that need cleanup */
770 regmatch_info_aux_eval *info_aux_eval; /* extra saved state for (?{}) */
1cb48e53
DM
771 I32 poscache_maxiter; /* how many whilems todo before S-L cache kicks in */
772 I32 poscache_iter; /* current countdown from _maxiter to zero */
2ac8ff4b 773 STRLEN poscache_size; /* size of regmatch_info_aux.poscache */
02d5137b 774 bool intuit; /* re_intuit_start() is the top-level caller */
ba44c216
DM
775 bool is_utf8_pat; /* regex is utf8 */
776 bool is_utf8_target; /* string being matched is utf8 */
39819bd9 777 bool warned; /* we have issued a recursion warning; no need for more */
3b0527fe 778} regmatch_info;
1f4fbd3b 779
5d9a96ca
DM
780
781/* structures for holding and saving the state maintained by regmatch() */
782
d56b3014 783#ifndef MAX_RECURSE_EVAL_NOCHANGE_DEPTH
6c5474e8 784# define MAX_RECURSE_EVAL_NOCHANGE_DEPTH 10
d56b3014 785#endif
6bda09f9 786
52c50ff1
KW
787/* The +1 is because everything matches itself, which isn't included in
788 * MAX_FOLD_FROMS; the +2 is based on the current Unicode standards needs, and
789 * is unlikely to change. An assertion should fail in regexec.c if it is too
790 * low. It is needed for certain edge cases involving multi-character folds
791 * when the first component also participates in a fold individually. */
792#define MAX_MATCHES (MAX_FOLD_FROMS + 1 + 2)
bb382562
KW
793
794struct next_matchable_info {
795 U8 first_byte_mask;
796 U8 first_byte_anded;
797 U32 mask32;
798 U32 anded32;
799 PERL_INT_FAST8_T count; /* Negative means not initialized */
800 PERL_UINT_FAST8_T min_length;
801 PERL_UINT_FAST8_T max_length;
802 PERL_UINT_FAST8_T initial_definitive;
803 PERL_UINT_FAST8_T initial_exact;
804 PERL_UINT_FAST8_T lengths[MAX_MATCHES];
805
806 /* The size is from trial and error, and could change with new Unicode
807 * standards, in which case there is an assertion that should start
808 * failing. This size could be calculated in one of the regen scripts
809 * dealing with Unicode, but khw thinks the likelihood of it changing is
810 * low enough that it isn't worth the effort. */
811 U8 matches[18];
812};
813
5d9a96ca
DM
814typedef I32 CHECKPOINT;
815
a0374537 816typedef struct regmatch_state {
85900e28
YO
817 int resume_state; /* where to jump to on return */
818 char *locinput; /* where to backtrack in string on failure */
fd1dd2eb 819 char *loceol;
8e9f3eef 820 U8 *sr0; /* position of start of script run, or NULL */
e822a8b4
DM
821
822 union {
77cb431f 823
bf2039a9
DM
824 /* the 'info_aux' and 'info_aux_eval' union members are cuckoos in
825 * the nest. They aren't saved backtrack state; rather they
826 * represent one or two extra chunks of data that need allocating
827 * at the start of a match. These fields would logically live in
828 * the regmatch_info struct, except that is allocated on the
829 * C stack, and these fields are all things that require cleanup
830 * after a croak(), when the stack is lost.
831 * As a convenience, we just use the first 1 or 2 regmatch_state
832 * slots to store this info, as we will be allocating a slab of
833 * these anyway. Otherwise we'd have to malloc and then free them,
834 * or allocate them on the save stack (where they will get
835 * realloced if the save stack grows).
836 * info_aux contains the extra fields that are always needed;
837 * info_aux_eval contains extra fields that are only needed if
838 * the pattern contains code blocks
839 * We split them into two separate structs to avoid increasing
840 * the size of the union.
841 */
842
843 regmatch_info_aux info_aux;
844
845 regmatch_info_aux_eval info_aux_eval;
846
1f4fbd3b
MS
847 /* this is a fake union member that matches the first element
848 * of each member that needs to store positive backtrack
849 * information */
850 struct {
851 struct regmatch_state *prev_yes_state;
852 } yes;
77cb431f 853
ee28315c
YO
854
855 /* NOTE: Regarding 'cp' and 'lastcp' in the following structs...
856 *
857 * In the majority of cases we use 'cp' for the "normal"
858 * checkpoint for paren saves, and 'lastcp' for the additional
859 * paren saves that are done only under RE_PESSIMISTIC_PARENS.
860 *
861 * There may be a few cases where both are used always.
862 * Regardless, they tend to be used something like this:
863 *
864 * ST.cp = regcppush(rex, 0, maxopenparen);
865 * REGCP_SET(ST.lastcp);
866 *
867 * thus ST.cp holds the checkpoint from before we push parens,
868 * and ST.lastcp holds the checkpoint from afterwards.
869 */
870
fae667d5
YO
871 /* branchlike members */
872 /* this is a fake union member that matches the first elements
873 * of each member that needs to behave like a branch */
874 struct {
1f4fbd3b
MS
875 /* this first element must match u.yes */
876 struct regmatch_state *prev_yes_state;
ee28315c
YO
877 U32 lastparen;
878 U32 lastcloseparen;
879 CHECKPOINT cp; /* see note above "struct branchlike" */
880 CHECKPOINT lastcp; /* see note above "struct branchlike" */
881 U16 before_paren;
882 U16 after_paren;
1f4fbd3b 883
fae667d5 884 } branchlike;
1f4fbd3b
MS
885
886 struct {
887 /* the first elements must match u.branchlike */
888 struct regmatch_state *prev_yes_state;
ee28315c
YO
889 U32 lastparen;
890 U32 lastcloseparen;
891 CHECKPOINT cp; /* see note above "struct branchlike" */
892 CHECKPOINT lastcp; /* see note above "struct branchlike" */
893 U16 before_paren;
894 U16 after_paren;
895
896 regnode *next_branch; /* next branch node */
1f4fbd3b
MS
897 } branch;
898
899 struct {
900 /* the first elements must match u.branchlike */
901 struct regmatch_state *prev_yes_state;
ee28315c
YO
902 U32 lastparen;
903 U32 lastcloseparen;
904 CHECKPOINT cp; /* see note above "struct branchlike" */
905 CHECKPOINT lastcp; /* see note above "struct branchlike" */
906 U16 before_paren;
907 U16 after_paren;
908
909 U32 accepted; /* how many accepting states left */
910 bool longfold; /* saw a fold with a 1->n char mapping */
911 U16 *jump; /* positive offsets from me */
acababb4
YO
912 U16 *j_before_paren;
913 U16 *j_after_paren;
ee28315c
YO
914 regnode *me; /* Which node am I - needed for jump tries*/
915 U8 *firstpos; /* pos in string of first trie match */
916 U32 firstchars; /* len in chars of firstpos from start */
917 U16 nextword; /* next word to try */
918 U16 topword; /* longest accepted word */
1f4fbd3b 919 } trie;
e822a8b4 920
fae667d5
YO
921 /* special types - these members are used to store state for special
922 regops like eval, if/then, lookaround and the markpoint state */
1f4fbd3b
MS
923 struct {
924 /* this first element must match u.yes */
925 struct regmatch_state *prev_yes_state;
926 struct regmatch_state *prev_curlyx;
d1c49ad5 927 struct regmatch_state *prev_eval;
85900e28 928 REGEXP *prev_rex;
ee28315c
YO
929 CHECKPOINT cp; /* see note above "struct branchlike" */
930 CHECKPOINT lastcp; /* see note above "struct branchlike" */
931 U32 close_paren; /* which close bracket is our end (+1) */
932 regnode *B; /* the node following us */
ba6840fb 933 char *prev_recurse_locinput;
1f4fbd3b 934 } eval;
e822a8b4 935
1f4fbd3b
MS
936 struct {
937 /* this first element must match u.yes */
938 struct regmatch_state *prev_yes_state;
ee28315c
YO
939 I32 wanted;
940 I32 logical; /* saved copy of 'logical' var */
941 U8 count; /* number of beginning positions */
942 char *start;
943 char *end;
944 regnode *me; /* the IFMATCH/SUSPEND/UNLESSM node */
945 char *prev_match_end;
946 } ifmatch; /* and SUSPEND/UNLESSM */
1f4fbd3b
MS
947
948 struct {
949 /* this first element must match u.yes */
950 struct regmatch_state *prev_yes_state;
951 struct regmatch_state *prev_mark;
ee28315c
YO
952 SV *mark_name;
953 char *mark_loc;
1f4fbd3b
MS
954 } mark;
955
956 struct {
957 int val;
958 } keeper;
fae667d5
YO
959
960 /* quantifiers - these members are used for storing state for
a3815e44 961 the regops used to implement quantifiers */
1f4fbd3b
MS
962 struct {
963 /* this first element must match u.yes */
964 struct regmatch_state *prev_yes_state;
965 struct regmatch_state *prev_curlyx; /* previous cur_curlyx */
ee28315c
YO
966 regnode *me; /* the CURLYX node */
967 regnode *B; /* the B node in /A*B/ */
968 CHECKPOINT cp; /* see note above "struct branchlike" */
969 CHECKPOINT lastcp; /* see note above "struct branchlike" */
85900e28 970 bool minmod;
ee28315c 971 int parenfloor; /* how far back to strip paren data */
1f4fbd3b
MS
972
973 /* these two are modified by WHILEM */
ee28315c
YO
974 int count; /* how many instances of A we've matched */
975 char *lastloc; /* where previous A matched (0-len detect) */
1f4fbd3b
MS
976 } curlyx;
977
978 struct {
979 /* this first element must match u.yes */
980 struct regmatch_state *prev_yes_state;
981 struct regmatch_state *save_curlyx;
ee28315c
YO
982 CHECKPOINT cp; /* see note above "struct branchlike" */
983 CHECKPOINT lastcp; /* see note above "struct branchlike" */
984 char *save_lastloc; /* previous curlyx.lastloc */
85900e28
YO
985 I32 cache_offset;
986 I32 cache_mask;
1f4fbd3b
MS
987 } whilem;
988
989 struct {
990 /* this first element must match u.yes */
991 struct regmatch_state *prev_yes_state;
ee28315c
YO
992 U32 lastparen;
993 U32 lastcloseparen;
994 CHECKPOINT cp; /* see note above "struct branchlike" */
995 CHECKPOINT lastcp; /* see note above "struct branchlike" */
996 I32 alen; /* length of first-matched A string */
997 I32 count;
998 bool minmod;
999 regnode *A, *B; /* the nodes corresponding to /A*B/ */
1000 regnode *me; /* the curlym node */
bb382562 1001 struct next_matchable_info Binfo;
1f4fbd3b
MS
1002 } curlym;
1003
1004 struct {
ee28315c
YO
1005 U32 paren;
1006 U32 lastparen;
1007 U32 lastcloseparen;
1008 CHECKPOINT cp; /* see note above "struct branchlike" */
1009 CHECKPOINT lastcp; /* see note above "struct branchlike" */
1010 char *maxpos; /* highest possible point in string to match */
1011 char *oldloc; /* the previous locinput */
1012 int count;
1013 int min, max; /* {m,n} */
1014 regnode *A, *B; /* the nodes corresponding to /A*B/ */
bb382562 1015 struct next_matchable_info Binfo;
1f4fbd3b 1016 } curly; /* and CURLYN/PLUS/STAR */
dad79028 1017
59db1942 1018 struct {
ee28315c
YO
1019 CHECKPOINT cp;
1020 CHECKPOINT lastcp;
59db1942 1021 } backref; /* REF and friends */
d8319b27 1022 } u;
5d9a96ca
DM
1023} regmatch_state;
1024
ce12e254 1025
d5a00e4a 1026
5d9a96ca
DM
1027/* how many regmatch_state structs to allocate as a single slab.
1028 * We do it in 4K blocks for efficiency. The "3" is 2 for the next/prev
1029 * pointers, plus 1 for any mythical malloc overhead. */
1f4fbd3b 1030
5d9a96ca
DM
1031#define PERL_REGMATCH_SLAB_SLOTS \
1032 ((4096 - 3 * sizeof (void*)) / sizeof(regmatch_state))
1033
1034typedef struct regmatch_slab {
1035 regmatch_state states[PERL_REGMATCH_SLAB_SLOTS];
1036 struct regmatch_slab *prev, *next;
1037} regmatch_slab;
1ade1aa1 1038
46ab3289 1039
fe5492d9 1040#define REG_FETCH_ABSOLUTE 1
cde0cee5 1041
1ade1aa1 1042/*
14d04a33 1043 * ex: set ts=8 sts=4 sw=4 et:
1ade1aa1 1044 */