This is a live mirror of the Perl 5 development currently hosted at https://github.com/perl/perl5
regcomp.c etc - rework branch reset so it works properly
[perl5.git] / regcomp_internal.h
CommitLineData
85900e28
YO
1#ifndef REGCOMP_INTERNAL_H
2#define REGCOMP_INTERNAL_H
3#ifndef STATIC
4#define STATIC static
5#endif
6
7/* this is a chain of data about sub patterns we are processing that
8 need to be handled separately/specially in study_chunk. It's so
9 we can simulate recursion without losing state. */
10struct scan_frame;
11typedef struct scan_frame {
12 regnode *last_regnode; /* last node to process in this frame */
13 regnode *next_regnode; /* next node to process when last is reached */
14 U32 prev_recursed_depth;
15 I32 stopparen; /* what stopparen do we use */
16 bool in_gosub; /* this or an outer frame is for GOSUB */
17
18 struct scan_frame *this_prev_frame; /* this previous frame */
19 struct scan_frame *prev_frame; /* previous frame */
20 struct scan_frame *next_frame; /* next frame */
21} scan_frame;
22
23/* Certain characters are output as a sequence with the first being a
24 * backslash. */
25#define isBACKSLASHED_PUNCT(c) memCHRs("-[]\\^", c)
26
27
28struct RExC_state_t {
29 U32 flags; /* RXf_* are we folding, multilining? */
30 U32 pm_flags; /* PMf_* stuff from the calling PMOP */
31 char *precomp; /* uncompiled string. */
32 char *precomp_end; /* pointer to end of uncompiled string. */
33 REGEXP *rx_sv; /* The SV that is the regexp. */
34 regexp *rx; /* perl core regexp structure */
35 regexp_internal *rxi; /* internal data for regexp object
36 pprivate field */
37 char *start; /* Start of input for compile */
38 char *end; /* End of input for compile */
39 char *parse; /* Input-scan pointer. */
40 char *copy_start; /* start of copy of input within
41 constructed parse string */
42 char *save_copy_start; /* Provides one level of saving
43 and restoring 'copy_start' */
44 char *copy_start_in_input; /* Position in input string
45 corresponding to copy_start */
46 SSize_t whilem_seen; /* number of WHILEM in this expr */
47 regnode *emit_start; /* Start of emitted-code area */
48 regnode_offset emit; /* Code-emit pointer */
49 I32 naughty; /* How bad is this pattern? */
50 I32 sawback; /* Did we see \1, ...? */
51 SSize_t size; /* Number of regnode equivalents in
52 pattern */
53 Size_t sets_depth; /* Counts recursion depth of already-
54 compiled regex set patterns */
55 U32 seen;
56
57 I32 parens_buf_size; /* #slots malloced open/close_parens */
58 regnode_offset *open_parens; /* offsets to open parens */
59 regnode_offset *close_parens; /* offsets to close parens */
60 HV *paren_names; /* Paren names */
61
62 /* position beyond 'precomp' of the warning message furthest away from
63 * 'precomp'. During the parse, no warnings are raised for any problems
64 * earlier in the parse than this position. This works if warnings are
65 * raised the first time a given spot is parsed, and if only one
66 * independent warning is raised for any given spot */
67 Size_t latest_warn_offset;
68
fe5492d9
YO
69 /* Branch reset /(?|...|...)/ gives us two concepts of capture buffer id.
70 * "Logical Parno" is the user visible view with branch reset taken into
71 * account. "Parno" (or physical parno) is the actual capture buffers in
72 * the pattern *NOT* taking into account branch reset. We also maintain
73 * a map of "next" pointers which allow us to skip to the next physical
74 * capture buffer with the same logical id, with 0 representing "none".
75 *
76 * As we compile we keep track of the two different counts using the
77 * 'logical_npar' and 'npar' members, and we keep track of the upper bound
78 * of both in 'total_par' and 'logical_total_par', we also populate
79 * the 'logical_to_parno' map, which gives us the first physical parno
80 * for a given logical parno, and the `parno_to_logical` array which gives
81 * us the logical id for each physical parno. When compilation is
82 * completed we construct the 'parno_to_logical_next' array from the
83 * 'parno_to_logical' array. (We do not bother constructing it during
84 * compilation as we do not need it, and we can construct it in O(N) time
85 * once we are done, but would need more complicated logic during the
86 * compile, because we want the next pointers to go from smallest to
87 * largest, eg, left to right.)
88 *
89 * Logical: $1 $2 $3 $4 $2 $3 $2 $5
90 * Physical: 1 2 3 4 5 6 7 8
91 * Next: 0 5 6 0 7 0 0 0
92 * Pattern /(a) (?| (b) (c) (d) | (e) (f) | (g) ) (h)/
93 *
94 * As much as possible the internals use and store the physical id of
95 * capture buffers. We decode the physical to the logical only when
96 * we need to, for instance when someone uses $2.
97 *
98 * Note that when branch reset is not used logical and physical are the
99 * same and the next data would be all zero. So when branch reset is not
100 * used we do not need to populate this data into the final regexp.
101 *
102 */
103 I32 *logical_to_parno; /* logical_parno to parno */
104 I32 *parno_to_logical; /* parno to logical_parno */
105 I32 *parno_to_logical_next; /* parno to next (greater value)
106 parno with the same
107 logical_parno as parno.*/
108
85900e28
YO
109 I32 npar; /* Capture buffer count so far in the
110 parse, (OPEN) plus one. ("par" 0 is
111 the whole pattern)*/
fe5492d9 112 I32 logical_npar; /* Logical version of npar */
85900e28
YO
113 I32 total_par; /* During initial parse, is either 0,
114 or -1; the latter indicating a
115 reparse is needed. After that pass,
116 it is what 'npar' became after the
117 pass. Hence, it being > 0 indicates
118 we are in a reparse situation */
fe5492d9 119 I32 logical_total_par; /* Logical version to total par */
85900e28
YO
120 I32 nestroot; /* root parens we are in - used by
121 accept */
122 I32 seen_zerolen;
123 regnode *end_op; /* END node in program */
124 I32 utf8; /* whether the pattern is utf8 or not */
125 I32 orig_utf8; /* whether the pattern was originally in utf8 */
126 /* XXX use this for future optimisation of case
127 * where pattern must be upgraded to utf8. */
128 I32 uni_semantics; /* If a d charset modifier should use unicode
129 rules, even if the pattern is not in
130 utf8 */
131
132 I32 recurse_count; /* Number of recurse regops we have generated */
133 regnode **recurse; /* Recurse regops */
134 U8 *study_chunk_recursed; /* bitmap of which subs we have moved
135 through */
136 U32 study_chunk_recursed_bytes; /* bytes in bitmap */
137 I32 in_lookaround;
138 I32 contains_locale;
139 I32 override_recoding;
140 I32 recode_x_to_native;
141 I32 in_multi_char_class;
142 int code_index; /* next code_blocks[] slot */
143 struct reg_code_blocks *code_blocks;/* positions of literal (?{})
144 within pattern */
145 SSize_t maxlen; /* mininum possible number of chars in string to match */
146 scan_frame *frame_head;
147 scan_frame *frame_last;
148 U32 frame_count;
149 AV *warn_text;
150 HV *unlexed_names;
151 SV *runtime_code_qr; /* qr with the runtime code blocks */
152#ifdef DEBUGGING
153 const char *lastparse;
154 I32 lastnum;
155 U32 study_chunk_recursed_count;
156 AV *paren_name_list; /* idx -> name */
157 SV *mysv1;
158 SV *mysv2;
159
160#define RExC_lastparse (pRExC_state->lastparse)
161#define RExC_lastnum (pRExC_state->lastnum)
162#define RExC_paren_name_list (pRExC_state->paren_name_list)
163#define RExC_study_chunk_recursed_count (pRExC_state->study_chunk_recursed_count)
164#define RExC_mysv (pRExC_state->mysv1)
165#define RExC_mysv1 (pRExC_state->mysv1)
166#define RExC_mysv2 (pRExC_state->mysv2)
167
168#endif
169 bool seen_d_op;
170 bool strict;
171 bool study_started;
172 bool in_script_run;
173 bool use_BRANCHJ;
174 bool sWARN_EXPERIMENTAL__VLB;
175 bool sWARN_EXPERIMENTAL__REGEX_SETS;
176};
177
178#define RExC_flags (pRExC_state->flags)
179#define RExC_pm_flags (pRExC_state->pm_flags)
180#define RExC_precomp (pRExC_state->precomp)
181#define RExC_copy_start_in_input (pRExC_state->copy_start_in_input)
182#define RExC_copy_start_in_constructed (pRExC_state->copy_start)
183#define RExC_save_copy_start_in_constructed (pRExC_state->save_copy_start)
184#define RExC_precomp_end (pRExC_state->precomp_end)
185#define RExC_rx_sv (pRExC_state->rx_sv)
186#define RExC_rx (pRExC_state->rx)
187#define RExC_rxi (pRExC_state->rxi)
188#define RExC_start (pRExC_state->start)
189#define RExC_end (pRExC_state->end)
190#define RExC_parse (pRExC_state->parse)
191#define RExC_latest_warn_offset (pRExC_state->latest_warn_offset )
192#define RExC_whilem_seen (pRExC_state->whilem_seen)
193#define RExC_seen_d_op (pRExC_state->seen_d_op) /* Seen something that differs
194 under /d from /u ? */
195
196#define RExC_emit (pRExC_state->emit)
197#define RExC_emit_start (pRExC_state->emit_start)
198#define RExC_sawback (pRExC_state->sawback)
199#define RExC_seen (pRExC_state->seen)
200#define RExC_size (pRExC_state->size)
201#define RExC_maxlen (pRExC_state->maxlen)
fe5492d9
YO
202#define RExC_logical_npar (pRExC_state->logical_npar)
203#define RExC_logical_total_parens (pRExC_state->logical_total_par)
204#define RExC_logical_to_parno (pRExC_state->logical_to_parno)
205#define RExC_parno_to_logical (pRExC_state->parno_to_logical)
206#define RExC_parno_to_logical_next (pRExC_state->parno_to_logical_next)
85900e28
YO
207#define RExC_npar (pRExC_state->npar)
208#define RExC_total_parens (pRExC_state->total_par)
209#define RExC_parens_buf_size (pRExC_state->parens_buf_size)
210#define RExC_nestroot (pRExC_state->nestroot)
211#define RExC_seen_zerolen (pRExC_state->seen_zerolen)
212#define RExC_utf8 (pRExC_state->utf8)
213#define RExC_uni_semantics (pRExC_state->uni_semantics)
214#define RExC_orig_utf8 (pRExC_state->orig_utf8)
215#define RExC_open_parens (pRExC_state->open_parens)
216#define RExC_close_parens (pRExC_state->close_parens)
217#define RExC_end_op (pRExC_state->end_op)
218#define RExC_paren_names (pRExC_state->paren_names)
219#define RExC_recurse (pRExC_state->recurse)
220#define RExC_recurse_count (pRExC_state->recurse_count)
221#define RExC_sets_depth (pRExC_state->sets_depth)
222#define RExC_study_chunk_recursed (pRExC_state->study_chunk_recursed)
223#define RExC_study_chunk_recursed_bytes \
224 (pRExC_state->study_chunk_recursed_bytes)
225#define RExC_in_lookaround (pRExC_state->in_lookaround)
226#define RExC_contains_locale (pRExC_state->contains_locale)
227#define RExC_recode_x_to_native (pRExC_state->recode_x_to_native)
228
229#ifdef EBCDIC
230# define SET_recode_x_to_native(x) \
231 STMT_START { RExC_recode_x_to_native = (x); } STMT_END
232#else
233# define SET_recode_x_to_native(x) NOOP
234#endif
235
236#define RExC_in_multi_char_class (pRExC_state->in_multi_char_class)
237#define RExC_frame_head (pRExC_state->frame_head)
238#define RExC_frame_last (pRExC_state->frame_last)
239#define RExC_frame_count (pRExC_state->frame_count)
240#define RExC_strict (pRExC_state->strict)
241#define RExC_study_started (pRExC_state->study_started)
242#define RExC_warn_text (pRExC_state->warn_text)
243#define RExC_in_script_run (pRExC_state->in_script_run)
244#define RExC_use_BRANCHJ (pRExC_state->use_BRANCHJ)
245#define RExC_warned_WARN_EXPERIMENTAL__VLB (pRExC_state->sWARN_EXPERIMENTAL__VLB)
246#define RExC_warned_WARN_EXPERIMENTAL__REGEX_SETS (pRExC_state->sWARN_EXPERIMENTAL__REGEX_SETS)
247#define RExC_unlexed_names (pRExC_state->unlexed_names)
248
249
250/***********************************************************************/
251/* UTILITY MACROS FOR ADVANCING OR SETTING THE PARSE "CURSOR" RExC_parse
252 *
253 * All of these macros depend on the above RExC_ accessor macros, which
254 * in turn depend on a variable pRExC_state being in scope where they
255 * are used. This is the standard regexp parser context variable which is
256 * passed into every non-trivial parse function in this file.
257 *
258 * Note that the UTF macro is itself a wrapper around RExC_utf8, so all
259 * of the macros which do not take an argument will operate on the
260 * pRExC_state structure *only*.
261 *
262 * Please do NOT modify RExC_parse without using these macros. In the
263 * future these macros will be extended for enhanced debugging and trace
264 * output during the parse process.
265 */
266
267/* RExC_parse_incf(flag)
268 *
269 * Increment RExC_parse to point at the next codepoint, while doing
270 * the right thing depending on whether we are parsing UTF-8 strings
271 * or not. The 'flag' argument determines if content is UTF-8 or not,
272 * intended for cases where this is NOT governed by the UTF macro.
273 *
274 * Use RExC_parse_inc() if UTF-8ness is controlled by the UTF macro.
275 *
276 * WARNING: Does NOT take into account RExC_end; it is the caller's
277 * responsibility to make sure there are enough octets left in
278 * RExC_parse to ensure that when processing UTF-8 we would not read
279 * past the end of the string.
280 */
281#define RExC_parse_incf(flag) STMT_START { \
282 RExC_parse += (flag) ? UTF8SKIP(RExC_parse) : 1; \
283} STMT_END
284
285/* RExC_parse_inc_safef(flag)
286 *
287 * Safely increment RExC_parse to point at the next codepoint,
288 * doing the right thing depending on whether we are parsing
289 * UTF-8 strings or not and NOT reading past the end of the buffer.
290 * The 'flag' argument determines if content is UTF-8 or not,
291 * intended for cases where this is NOT governed by the UTF macro.
292 *
293 * Use RExC_parse_inc_safe() if UTF-8ness is controlled by the UTF macro.
294 *
295 * NOTE: Will NOT read past RExC_end when content is UTF-8.
296 */
297#define RExC_parse_inc_safef(flag) STMT_START { \
298 RExC_parse += (flag) ? UTF8_SAFE_SKIP(RExC_parse,RExC_end) : 1; \
299} STMT_END
300
301/* RExC_parse_inc()
302 *
303 * Increment RExC_parse to point at the next codepoint,
304 * doing the right thing depending on whether we are parsing
305 * UTF-8 strings or not.
306 *
307 * WARNING: Does NOT take into account RExC_end; it is the caller's
308 * responsibility to make sure there are enough octets left in
309 * RExC_parse to ensure that when processing UTF-8 we would not read
310 * past the end of the string.
311 *
312 * NOTE: whether we are parsing UTF-8 or not is determined by the
313 * UTF macro which is defined as cBOOL(RExC_utf8), thus this
314 * macro operates on the pRExC_state structure only.
315 */
316#define RExC_parse_inc() RExC_parse_incf(UTF)
317
318/* RExC_parse_inc_safe()
319 *
320 * Safely increment RExC_parse to point at the next codepoint,
321 * doing the right thing depending on whether we are parsing
322 * UTF-8 strings or not and NOT reading past the end of the buffer.
323 *
324 * NOTE: whether we are parsing UTF-8 or not is determined by the
325 * UTF macro which is defined as cBOOL(RExC_utf8), thus this
326 * macro operates on the pRExC_state structure only.
327 */
328#define RExC_parse_inc_safe() RExC_parse_inc_safef(UTF)
329
330/* RExC_parse_inc_utf8()
331 *
332 * Increment RExC_parse to point at the next utf8 codepoint,
333 * assumes content is UTF-8.
334 *
335 * WARNING: Does NOT take into account RExC_end; it is the caller's
336 * responsibility to make sure there are enough octets left in RExC_parse
337 * to ensure that when processing UTF-8 we would not read past the end
338 * of the string.
339 */
340#define RExC_parse_inc_utf8() STMT_START { \
341 RExC_parse += UTF8SKIP(RExC_parse); \
342} STMT_END
343
344/* RExC_parse_inc_if_char()
345 *
346 * Increment RExC_parse to point at the next codepoint, if and only
347 * if the current parse point is NOT a NULL, while doing the right thing
348 * depending on whether we are parsing UTF-8 strings or not.
349 *
350 * WARNING: Does NOT take into account RExC_end; it is the caller's
351 * responsibility to make sure there are enough octets left in RExC_parse
352 * to ensure that when processing UTF-8 we would not read past the end
353 * of the string.
354 *
355 * NOTE: whether we are parsing UTF-8 or not is determined by the
356 * UTF macro which is defined as cBOOL(RExC_utf8), thus this
357 * macro operates on the pRExC_state structure only.
358 */
359#define RExC_parse_inc_if_char() STMT_START { \
360 RExC_parse += SKIP_IF_CHAR(RExC_parse,RExC_end); \
361} STMT_END
362
363/* RExC_parse_inc_by(n_octets)
364 *
365 * Increment the parse cursor by the number of octets specified by
366 * the 'n_octets' argument.
367 *
368 * NOTE: Does NOT check ANY constraints. It is the caller's responsibility
369 * that this will not move past the end of the string, or leave the
370 * pointer in the middle of a UTF-8 sequence.
371 *
372 * Typically used to advance past previously analyzed content.
373 */
374#define RExC_parse_inc_by(n_octets) STMT_START { \
375 RExC_parse += (n_octets); \
376} STMT_END
377
378/* RExC_parse_set(to_ptr)
379 *
380 * Sets the RExC_parse pointer to the pointer specified by the 'to_ptr'
381 * argument. No validation whatsoever is performed on the pointer.
382 */
383#define RExC_parse_set(to_ptr) STMT_START { \
384 RExC_parse = (to_ptr); \
385} STMT_END
386
387/**********************************************************************/
388
389/* Heuristic check on the complexity of the pattern: if TOO_NAUGHTY, we set
390 * a flag to disable back-off on the fixed/floating substrings - if it's
391 * a high complexity pattern we assume the benefit of avoiding a full match
392 * is worth the cost of checking for the substrings even if they rarely help.
393 */
/* Accessor for the running "naughtiness" (complexity) score kept in the
 * parser state; requires 'pRExC_state' to be in scope. */
#define RExC_naughty    (pRExC_state->naughty)

/* Score at which the MARK_NAUGHTY macros stop adding to it. */
#define TOO_NAUGHTY     (10)

/* MARK_NAUGHTY(add)
 *
 * Add 'add' to RExC_naughty, unless the score has already reached
 * TOO_NAUGHTY, in which case nothing happens.
 *
 * The body is wrapped in STMT_START/STMT_END so that the expansion is a
 * single statement: a bare 'if' here would capture an 'else' that follows
 * the macro invocation in the caller. */
#define MARK_NAUGHTY(add)                                                   \
    STMT_START {                                                            \
        if (RExC_naughty < TOO_NAUGHTY)                                     \
            RExC_naughty += (add);                                          \
    } STMT_END

/* MARK_NAUGHTY_EXP(exp, add)
 *
 * Like MARK_NAUGHTY(), but the increment also scales with the current
 * score: RExC_naughty grows by RExC_naughty / (exp) + (add).  'exp' must be
 * non-zero.  Nothing happens once the score has reached TOO_NAUGHTY. */
#define MARK_NAUGHTY_EXP(exp, add)                                          \
    STMT_START {                                                            \
        if (RExC_naughty < TOO_NAUGHTY)                                     \
            RExC_naughty += RExC_naughty / (exp) + (add);                   \
    } STMT_END
402
/* True if the character 'c' is one of the single-character quantifiers
 * '*', '+' or '?'.  'c' is evaluated more than once. */
#define isNON_BRACE_QUANTIFIER(c) ((c) == '*' || (c) == '+' || (c) == '?')

/* True if the character at 's' begins a quantifier: either one of the
 * single-character quantifiers, or a '{' for which regcurly(s, e, NULL)
 * returns true ('e' is passed through to regcurly() unchanged).
 *
 * Both parameters are parenthesized so that pointer expressions such as
 * 'p + 1' are dereferenced as a whole.  's' is evaluated more than once, so
 * it must not have side effects. */
#define isQUANTIFIER(s,e)  (   isNON_BRACE_QUANTIFIER(*(s))                  \
                            || (*(s) == '{' && regcurly((s), (e), NULL)))
406
407/*
408 * Flags to be passed up.
409 */
410#define HASWIDTH 0x01 /* Known to not match null strings, could match
411 non-null ones. */
412#define SIMPLE 0x02 /* Exactly one character wide */
413 /* (or LNBREAK as a special case) */
414#define POSTPONED 0x08 /* (?1),(?&name), (??{...}) or similar */
415#define TRYAGAIN 0x10 /* Weeded out a declaration. */
416#define RESTART_PARSE 0x20 /* Need to redo the parse */
417#define NEED_UTF8 0x40 /* In conjunction with RESTART_PARSE, need to
418 calculate sizes as UTF-8 */
419
420#define REG_NODE_NUM(x) ((x) ? (int)((x)-RExC_emit_start) : -1)
421
422/* whether trie related optimizations are enabled */
423#if PERL_ENABLE_EXTENDED_TRIE_OPTIMISATION
424#define TRIE_STUDY_OPT
425#define FULL_TRIE_STUDY
426#define TRIE_STCLASS
427#endif
428
429/* About the term "restudy" and the var "restudied" and the defines
430 * "SCF_TRIE_RESTUDY" and "SCF_TRIE_DOING_RESTUDY": All of these relate to
431 * doing multiple study_chunk() calls over the same set of opcodes for the
432 * purpose of enhanced TRIE optimizations.
433 *
434 * Specifically, when TRIE_STUDY_OPT is defined, and it is defined in normal
435 * builds, (see above), during compilation SCF_TRIE_RESTUDY may be enabled
436 * which then causes the Perl_re_op_compile() to then call the optimizer
437 * S_study_chunk() a second time to perform additional optimizations,
438 * including the aho_corasick startclass optimization.
439 * This additional pass will only happen once, which is managed by the
440 * 'restudied' variable in Perl_re_op_compile().
441 *
442 * When this second pass is under way the flags passed into study_chunk() will
443 * include SCF_TRIE_DOING_RESTUDY and this flag is and must be cascaded down
444 * to any recursive calls to S_study_chunk().
445 *
446 * IMPORTANT: Any logic in study_chunk() that emits warnings should check that
447 * the SCF_TRIE_DOING_RESTUDY flag is NOT set in 'flags', or the warning may
448 * be produced twice.
449 *
450 * See commit 07be1b83a6b2d24b492356181ddf70e1c7917ae3 and
451 * 688e03912e3bff2d2419c457d8b0e1bab3eb7112 for more details.
452 */
453
454
455#define PBYTE(u8str,paren) ((U8*)(u8str))[(paren) >> 3]
456#define PBITVAL(paren) (1 << ((paren) & 7))
457#define PAREN_OFFSET(depth) \
458 (RExC_study_chunk_recursed + (depth) * RExC_study_chunk_recursed_bytes)
459#define PAREN_TEST(depth, paren) \
460 (PBYTE(PAREN_OFFSET(depth), paren) & PBITVAL(paren))
461#define PAREN_SET(depth, paren) \
462 (PBYTE(PAREN_OFFSET(depth), paren) |= PBITVAL(paren))
463#define PAREN_UNSET(depth, paren) \
464 (PBYTE(PAREN_OFFSET(depth), paren) &= ~PBITVAL(paren))
465
466#define REQUIRE_UTF8(flagp) STMT_START { \
467 if (!UTF) { \
468 *flagp = RESTART_PARSE|NEED_UTF8; \
469 return 0; \
470 } \
471 } STMT_END
472
473/* /u is to be chosen if we are supposed to use Unicode rules, or if the
474 * pattern is in UTF-8. This latter condition is in case the outermost rules
475 * are locale. See GH #17278 */
476#define toUSE_UNI_CHARSET_NOT_DEPENDS (RExC_uni_semantics || UTF)
477
478/* Change from /d into /u rules, and restart the parse. RExC_uni_semantics is
479 * a flag that indicates we need to override /d with /u as a result of
480 * something in the pattern. It should only be used in regards to calling
481 * set_regex_charset() or get_regex_charset() */
482#define REQUIRE_UNI_RULES(flagp, restart_retval) \
483 STMT_START { \
484 if (DEPENDS_SEMANTICS) { \
485 set_regex_charset(&RExC_flags, REGEX_UNICODE_CHARSET); \
486 RExC_uni_semantics = 1; \
487 if (RExC_seen_d_op && LIKELY(! IN_PARENS_PASS)) { \
488 /* No need to restart the parse if we haven't seen \
489 * anything that differs between /u and /d, and no need \
490 * to restart immediately if we're going to reparse \
491 * anyway to count parens */ \
492 *flagp |= RESTART_PARSE; \
493 return restart_retval; \
494 } \
495 } \
496 } STMT_END
497
498#define REQUIRE_BRANCHJ(flagp, restart_retval) \
499 STMT_START { \
500 RExC_use_BRANCHJ = 1; \
501 *flagp |= RESTART_PARSE; \
502 return restart_retval; \
503 } STMT_END
504
505/* Until we have completed the parse, we leave RExC_total_parens at 0 or
506 * less. After that, it must always be positive, because the whole re is
507 * considered to be surrounded by virtual parens. Setting it to negative
508 * indicates there is some construct that needs to know the actual number of
509 * parens to be properly handled. And that means an extra pass will be
510 * required after we've counted them all */
511#define ALL_PARENS_COUNTED (RExC_total_parens > 0)
512#define REQUIRE_PARENS_PASS \
513 STMT_START { /* No-op if have completed a pass */ \
514 if (! ALL_PARENS_COUNTED) RExC_total_parens = -1; \
515 } STMT_END
516#define IN_PARENS_PASS (RExC_total_parens < 0)
517
518
519/* This is used to return failure (zero) early from the calling function if
520 * various flags in 'flags' are set. Two flags always cause a return:
521 * 'RESTART_PARSE' and 'NEED_UTF8'. 'extra' can be used to specify any
522 * additional flags that should cause a return; 0 if none. If the return will
523 * be done, '*flagp' is first set to be all of the flags that caused the
524 * return. */
525#define RETURN_FAIL_ON_RESTART_OR_FLAGS(flags,flagp,extra) \
526 STMT_START { \
527 if ((flags) & (RESTART_PARSE|NEED_UTF8|(extra))) { \
528 *(flagp) = (flags) & (RESTART_PARSE|NEED_UTF8|(extra)); \
529 return 0; \
530 } \
531 } STMT_END
532
/* Non-zero if 'flags' has the RESTART_PARSE bit set. */
#define MUST_RESTART(flags) ((flags) & (RESTART_PARSE))

/* Return 0 from the calling function if 'flags' asks for a restart; this is
 * RETURN_FAIL_ON_RESTART_OR_FLAGS() with no extra flags. */
#define RETURN_FAIL_ON_RESTART(flags,flagp)                                 \
                    RETURN_FAIL_ON_RESTART_OR_FLAGS( flags, flagp, 0)

/* Return 0 from the calling function if '*flagp' has RESTART_PARSE set.
 * '*flagp' itself is left unmodified.
 *
 * Wrapped in STMT_START/STMT_END so the expansion is a single statement; a
 * bare 'if' would capture an 'else' following the invocation. */
#define RETURN_FAIL_ON_RESTART_FLAGP(flagp)                                 \
    STMT_START {                                                            \
        if (MUST_RESTART(*(flagp))) return 0;                               \
    } STMT_END
539
540/* This converts the named class defined in regcomp.h to its equivalent class
541 * number defined in handy.h. */
542#define namedclass_to_classnum(class) ((int) ((class) / 2))
543#define classnum_to_namedclass(classnum) ((classnum) * 2)
544
545#define _invlist_union_complement_2nd(a, b, output) \
546 _invlist_union_maybe_complement_2nd(a, b, TRUE, output)
547#define _invlist_intersection_complement_2nd(a, b, output) \
548 _invlist_intersection_maybe_complement_2nd(a, b, TRUE, output)
549
550/* We add a marker if we are deferring expansion of a property that is both
551 * 1) potentially user-defined; and
552 * 2) could also be an official Unicode property.
553 *
554 * Without this marker, any deferred expansion can only be for a user-defined
555 * one. This marker shouldn't conflict with any that could be in a legal name,
556 * and is appended to its name to indicate this. There is a string and
557 * character form */
558#define DEFERRED_COULD_BE_OFFICIAL_MARKERs "~"
559#define DEFERRED_COULD_BE_OFFICIAL_MARKERc '~'
560
561/* What is infinity for optimization purposes */
562#define OPTIMIZE_INFTY SSize_t_MAX
563
564/* About scan_data_t.
565
566 During optimisation we recurse through the regexp program performing
567 various inplace (keyhole style) optimisations. In addition study_chunk
568 and scan_commit populate this data structure with information about
569 what strings MUST appear in the pattern. We look for the longest
570 string that must appear at a fixed location, and we look for the
571 longest string that may appear at a floating location. So for instance
572 in the pattern:
573
574 /FOO[xX]A.*B[xX]BAR/
575
576 Both 'FOO' and 'A' are fixed strings. Both 'B' and 'BAR' are floating
577 strings (because they follow a .* construct). study_chunk will identify
578 both FOO and BAR as being the longest fixed and floating strings respectively.
579
580 The strings can be composites, for instance
581
582 /(f)(o)(o)/
583
584 will result in a composite fixed substring 'foo'.
585
586 For each string some basic information is maintained:
587
588 - min_offset
589 This is the position the string must appear at, or not before.
590 It also implicitly (when combined with minlenp) tells us how many
591 characters must match before the string we are searching for.
592 Likewise when combined with minlenp and the length of the string it
593 tells us how many characters must appear after the string we have
594 found.
595
596 - max_offset
597 Only used for floating strings. This is the rightmost point that
598 the string can appear at. If set to OPTIMIZE_INFTY it indicates that the
599 string can occur infinitely far to the right.
600 For fixed strings, it is equal to min_offset.
601
602 - minlenp
603 A pointer to the minimum number of characters of the pattern that the
604 string was found inside. This is important as in the case of positive
605 lookahead or positive lookbehind we can have multiple patterns
606 involved. Consider
607
608 /(?=FOO).*F/
609
610 The minimum length of the pattern overall is 3, the minimum length
611 of the lookahead part is 3, but the minimum length of the part that
612 will actually match is 1. So 'FOO's minimum length is 3, but the
613 minimum length for the F is 1. This is important as the minimum length
614 is used to determine offsets in front of and behind the string being
615 looked for. Since strings can be composites this is the length of the
616 pattern at the time it was committed with a scan_commit. Note that
617 the length is calculated by study_chunk, so that the minimum lengths
618 are not known until the full pattern has been compiled, thus the
619 pointer to the value.
620
621 - lookbehind
622
623 In the case of lookbehind the string being searched for can be
624 offset past the start point of the final matching string.
625 If this value was just blithely removed from the min_offset it would
626 invalidate some of the calculations for how many chars must match
627 before or after (as they are derived from min_offset and minlen and
628 the length of the string being searched for).
629 When the final pattern is compiled and the data is moved from the
630 scan_data_t structure into the regexp structure the information
631 about lookbehind is factored in, with the information that would
632 have been lost precalculated in the end_shift field for the
633 associated string.
634
635 The fields pos_min and pos_delta are used to store the minimum offset
636 and the delta to the maximum offset at the current point in the pattern.
637
638*/
639
640struct scan_data_substrs {
641 SV *str; /* longest substring found in pattern */
642 SSize_t min_offset; /* earliest point in string it can appear */
643 SSize_t max_offset; /* latest point in string it can appear */
644 SSize_t *minlenp; /* pointer to the minlen relevant to the string */
645 SSize_t lookbehind; /* is the pos of the string modified by LB */
646 I32 flags; /* per substring SF_* and SCF_* flags */
647};
648
571fb71d
YO
649/* this is typedef'ed in perl.h */
650struct scan_data_t {
85900e28
YO
651 /*I32 len_min; unused */
652 /*I32 len_delta; unused */
653 SSize_t pos_min;
654 SSize_t pos_delta;
655 SV *last_found;
656 SSize_t last_end; /* min value, <0 unless valid. */
657 SSize_t last_start_min;
658 SSize_t last_start_max;
659 U8 cur_is_floating; /* whether the last_* values should be set as
660 * the next fixed (0) or floating (1)
661 * substring */
662
663 /* [0] is longest fixed substring so far, [1] is longest float so far */
664 struct scan_data_substrs substrs[2];
665
666 I32 flags; /* common SF_* and SCF_* flags */
667 I32 whilem_c;
668 SSize_t *last_closep;
669 regnode **last_close_opp; /* pointer to pointer to last CLOSE regop
670 seen. DO NOT DEREFERENCE the regnode
671 pointer - the op may have been optimized
672 away */
673 regnode_ssc *start_class;
571fb71d 674};
85900e28
YO
675
676/*
677 * Forward declarations for pregcomp()'s friends.
678 */
679
680static const scan_data_t zero_scan_data = {
681 0, 0, NULL, 0, 0, 0, 0,
682 {
683 { NULL, 0, 0, 0, 0, 0 },
684 { NULL, 0, 0, 0, 0, 0 },
685 },
686 0, 0, NULL, NULL, NULL
687};
688
689/* study flags */
690
691#define SF_BEFORE_SEOL 0x0001
692#define SF_BEFORE_MEOL 0x0002
693#define SF_BEFORE_EOL (SF_BEFORE_SEOL|SF_BEFORE_MEOL)
694
695#define SF_IS_INF 0x0040
696#define SF_HAS_PAR 0x0080
697#define SF_IN_PAR 0x0100
698#define SF_HAS_EVAL 0x0200
699
700
701/* SCF_DO_SUBSTR is the flag that tells the regexp analyzer to track the
702 * longest substring in the pattern. When it is not set the optimiser keeps
703 * track of position, but does not keep track of the actual strings seen,
704 *
705 * So for instance /foo/ will be parsed with SCF_DO_SUBSTR being true, but
706 * /foo/i will not.
707 *
708 * Similarly, /foo.*(blah|erm|huh).*fnorble/ will have "foo" and "fnorble"
709 * parsed with SCF_DO_SUBSTR on, but while processing the (...) it will be
710 * turned off because of the alternation (BRANCH). */
711#define SCF_DO_SUBSTR 0x0400
712
713#define SCF_DO_STCLASS_AND 0x0800
714#define SCF_DO_STCLASS_OR 0x1000
715#define SCF_DO_STCLASS (SCF_DO_STCLASS_AND|SCF_DO_STCLASS_OR)
716#define SCF_WHILEM_VISITED_POS 0x2000
717
718#define SCF_TRIE_RESTUDY 0x4000 /* Need to do restudy in study_chunk()?
719 Search for "restudy" in this file
720 to find a detailed explanation.*/
721#define SCF_SEEN_ACCEPT 0x8000
722#define SCF_TRIE_DOING_RESTUDY 0x10000 /* Are we in restudy right now?
723 Search for "restudy" in this file
724 to find a detailed explanation. */
725#define SCF_IN_DEFINE 0x20000
726
727
728
729#define UTF cBOOL(RExC_utf8)
730
731/* The enums for all these are ordered so things work out correctly */
732#define LOC (get_regex_charset(RExC_flags) == REGEX_LOCALE_CHARSET)
733#define DEPENDS_SEMANTICS (get_regex_charset(RExC_flags) \
734 == REGEX_DEPENDS_CHARSET)
735#define UNI_SEMANTICS (get_regex_charset(RExC_flags) == REGEX_UNICODE_CHARSET)
736#define AT_LEAST_UNI_SEMANTICS (get_regex_charset(RExC_flags) \
737 >= REGEX_UNICODE_CHARSET)
738#define ASCII_RESTRICTED (get_regex_charset(RExC_flags) \
739 == REGEX_ASCII_RESTRICTED_CHARSET)
740#define AT_LEAST_ASCII_RESTRICTED (get_regex_charset(RExC_flags) \
741 >= REGEX_ASCII_RESTRICTED_CHARSET)
742#define ASCII_FOLD_RESTRICTED (get_regex_charset(RExC_flags) \
743 == REGEX_ASCII_MORE_RESTRICTED_CHARSET)
744
745#define FOLD cBOOL(RExC_flags & RXf_PMf_FOLD)
746
/* For programs that want to be strictly Unicode compatible by dying if any
 * attempt is made to match a non-Unicode code point against a Unicode
 * property.  */
#define ALWAYS_WARN_SUPER  ckDEAD(packWARN(WARN_NON_UNICODE))

/* Parenthesized so the negative constant is safe in any expression. */
#define OOB_NAMEDCLASS          (-1)

/* There is no code point that is out-of-bounds, so this is problematic.  But
 * its only current use is to initialize a variable that is always set before
 * looked at. */
#define OOB_UNICODE             0xDEADBEEF

/* Length of 'sv': via sv_len_utf8() under UTF, otherwise SvCUR(). */
#define CHR_SVLEN(sv) (UTF ? sv_len_utf8(sv) : SvCUR(sv))


/* length of regex to show in messages that don't mark a position within */
#define RegexLengthToShowInErrorMessages 127
764
/*
 * If MARKER[12] are adjusted, be sure to adjust the constants at the top
 * of t/op/regmesg.t, the tests in t/op/re_tests, and those in
 * op/pragma/warn/regcomp.
 */
#define MARKER1 "<-- HERE"    /* marker as it appears in the description */
#define MARKER2 " <-- HERE "  /* marker as it appears within the regex */

/* Format-string suffix appended to regex diagnostics.  It consumes two UTF8f
 * arguments: the pattern text before the marker and the text after it. */
#define REPORT_LOCATION " in regex; marked by " MARKER1                     \
                        " in m/%" UTF8f MARKER2 "%" UTF8f "/"
775
776/* The code in this file in places uses one level of recursion with parsing
777 * rebased to an alternate string constructed by us in memory. This can take
778 * the form of something that is completely different from the input, or
779 * something that uses the input as part of the alternate. In the first case,
780 * there should be no possibility of an error, as we are in complete control of
781 * the alternate string. But in the second case we don't completely control
782 * the input portion, so there may be errors in that. Here's an example:
783 * /[abc\x{DF}def]/ui
784 * is handled specially because \x{df} folds to a sequence of more than one
785 * character: 'ss'. What is done is to create and parse an alternate string,
786 * which looks like this:
787 * /(?:\x{DF}|[abc\x{DF}def])/ui
788 * where it uses the input unchanged in the middle of something it constructs,
789 * which is a branch for the DF outside the character class, and clustering
790 * parens around the whole thing. (It knows enough to skip the DF inside the
791 * class while in this substitute parse.) 'abc' and 'def' may have errors that
792 * need to be reported. The general situation looks like this:
793 *
 *                            |<------- identical ------->|
 *              sI            tI             xI           eI
 * Input:       --------------------------------------------
 * Constructed:        ------------------------------------------------
 *                     sC     tC             xC           eC         EC
 *                            |<------- identical ------->|
800 *
801 * sI..eI is the portion of the input pattern we are concerned with here.
802 * sC..EC is the constructed substitute parse string.
803 * sC..tC is constructed by us
804 * tC..eC is an exact duplicate of the portion of the input pattern tI..eI.
805 * In the diagram, these are vertically aligned.
806 * eC..EC is also constructed by us.
807 * xC is the position in the substitute parse string where we found a
808 * problem.
809 * xI is the position in the original pattern corresponding to xC.
810 *
811 * We want to display a message showing the real input string. Thus we need to
812 * translate from xC to xI. We know that xC >= tC, since the portion of the
813 * string sC..tC has been constructed by us, and so shouldn't have errors. We
814 * get:
815 * xI = tI + (xC - tC)
816 *
817 * When the substitute parse is constructed, the code needs to set:
818 * RExC_start (sC)
819 * RExC_end (eC)
820 * RExC_copy_start_in_input (tI)
821 * RExC_copy_start_in_constructed (tC)
822 * and restore them when done.
823 *
824 * During normal processing of the input pattern, both
825 * 'RExC_copy_start_in_input' and 'RExC_copy_start_in_constructed' are set to
826 * sI, so that xC equals xI.
827 */
828
/* Short aliases for the positions used in the substitute-parse description:
 * 's' = start, 'e' = end, 't' = start of the copied (identical) portion;
 * 'I' = in the input pattern, 'C' = in the constructed parse string. */
#define sI              RExC_precomp
#define eI              RExC_precomp_end
#define sC              RExC_start
#define eC              RExC_end
#define tI              RExC_copy_start_in_input
#define tC              RExC_copy_start_in_constructed

/* Translate a position 'xC' in the constructed string to the corresponding
 * position in the input pattern: xI = tI + (xC - tC).  The argument is
 * parenthesized so that any expression may be passed. */
#define xI(xC)          (tI + ((xC) - tC))

/* Offset of xI(xC) from the start of the input pattern. */
#define xI_offset(xC)   (xI(xC) - sI)
837
/* Expands to the two UTF8fARG() argument groups consumed by REPORT_LOCATION
 * for a problem found at position 'xC' of the string being parsed:
 *   1) the input pattern from sI up to the translated position xI(xC),
 *      clamped to the whole pattern if xI(xC) lies beyond eI;
 *   2) the remainder of the input pattern from xI(xC) to eI (empty when
 *      xI(xC) lies beyond eI).
 * A negative offset (xI(xC) before sI) calls Perl_croak() with a panic
 * message. */
#define REPORT_LOCATION_ARGS(xC)                                            \
    UTF8fARG(UTF,                                                           \
             (xI(xC) > eI) /* Don't run off end */                          \
              ? eI - sI   /* Length before the <--HERE */                   \
              : ((xI_offset(xC) >= 0)                                       \
                 ? xI_offset(xC)                                            \
                 : (Perl_croak(aTHX_ "panic: %s: %d: negative offset: %"    \
                                    IVdf " trying to output message for "   \
                                    " pattern %.*s",                        \
                                    __FILE__, __LINE__, (IV) xI_offset(xC), \
                                    ((int) (eC - sC)), sC), 0)),            \
             sI),         /* The input pattern printed up to the <--HERE */ \
    UTF8fARG(UTF,                                                           \
             (xI(xC) > eI) ? 0 : eI - xI(xC), /* Length after <--HERE */    \
             (xI(xC) > eI) ? eI : xI(xC))     /* pattern after <--HERE */
853
/* Used to point after bad bytes for an error message, but avoid skipping
 * past a nul byte.  Yields 0 when *s is NUL; otherwise UTF8_SAFE_SKIP(s, e)
 * under UTF, or 1 when not UTF. */
#define SKIP_IF_CHAR(s, e)                                                  \
    (!*(s) ? 0 : (UTF ? UTF8_SAFE_SKIP(s, e) : 1))
857
/* Set up to clean up after our imminent demise: hand RExC_rx_sv to
 * SAVEFREESV() and the open/close paren arrays to SAVEFREEPV(), each only
 * if it is non-NULL. */
#define PREPARE_TO_DIE                                                      \
    STMT_START {                                                            \
        if (RExC_rx_sv) {                                                   \
            SAVEFREESV(RExC_rx_sv);                                         \
        }                                                                   \
        if (RExC_open_parens) {                                             \
            SAVEFREEPV(RExC_open_parens);                                   \
        }                                                                   \
        if (RExC_close_parens) {                                            \
            SAVEFREEPV(RExC_close_parens);                                  \
        }                                                                   \
    } STMT_END
868
/*
 * Runs PREPARE_TO_DIE, then executes 'code' (normally a Perl_croak() call)
 * with two locals in scope for it to use:
 *   len      - length of the regex to show; if the pattern is longer than
 *              RegexLengthToShowInErrorMessages it is chopped to 10 less
 *              than that maximum
 *   ellipses - "..." if the pattern was chopped, else ""
 */
#define _FAIL(code) STMT_START {                                            \
    const char *ellipses = "";                                              \
    IV len = RExC_precomp_end - RExC_precomp;                               \
                                                                            \
    PREPARE_TO_DIE;                                                         \
    if (len > RegexLengthToShowInErrorMessages) {                           \
        /* chop 10 shorter than the max, to ensure meaning of "..." */      \
        len = RegexLengthToShowInErrorMessages - 10;                        \
        ellipses = "...";                                                   \
    }                                                                       \
    code;                                                                   \
} STMT_END

/* Croak with 'msg' (passed through "%s") followed by the, possibly chopped,
 * regex. */
#define FAIL(msg) _FAIL(                                                    \
    Perl_croak(aTHX_ "%s in regex m/%" UTF8f "%s/",                         \
            msg, UTF8fARG(UTF, len, RExC_precomp), ellipses))

/* Like FAIL, but 'msg' must be a string literal format taking one
 * argument. */
#define FAIL2(msg,arg) _FAIL(                                               \
    Perl_croak(aTHX_ msg " in regex m/%" UTF8f "%s/",                       \
            arg, UTF8fARG(UTF, len, RExC_precomp), ellipses))

/* Like FAIL, but 'msg' must be a string literal format taking two
 * arguments. */
#define FAIL3(msg,arg1,arg2) _FAIL(                                         \
    Perl_croak(aTHX_ msg " in regex m/%" UTF8f "%s/",                       \
            arg1, arg2, UTF8fARG(UTF, len, RExC_precomp), ellipses))
898
/*
 * Simple_vFAIL -- like FAIL, but marks the current location in the scan
 * (RExC_parse) and does no cleanup preparation of its own.
 */
#define Simple_vFAIL(m) STMT_START {                                        \
    Perl_croak(aTHX_ "%s" REPORT_LOCATION,                                  \
            m, REPORT_LOCATION_ARGS(RExC_parse));                           \
} STMT_END

/*
 * Runs PREPARE_TO_DIE, then Simple_vFAIL()
 */
#define vFAIL(m) STMT_START {                                               \
    PREPARE_TO_DIE;                                                         \
    Simple_vFAIL(m);                                                        \
} STMT_END
914
/*
 * Like Simple_vFAIL(), but 'm' is a string literal format that accepts one
 * extra argument; reported through S_re_croak().
 */
#define Simple_vFAIL2(m,a1) STMT_START {                                    \
    S_re_croak(aTHX_ UTF, m REPORT_LOCATION, a1,                            \
                      REPORT_LOCATION_ARGS(RExC_parse));                    \
} STMT_END

/*
 * Runs PREPARE_TO_DIE, then Simple_vFAIL2().
 */
#define vFAIL2(m,a1) STMT_START {                                           \
    PREPARE_TO_DIE;                                                         \
    Simple_vFAIL2(m, a1);                                                   \
} STMT_END


/*
 * Like Simple_vFAIL2(), but accepts two extra arguments.
 */
#define Simple_vFAIL3(m, a1, a2) STMT_START {                               \
    S_re_croak(aTHX_ UTF, m REPORT_LOCATION, a1, a2,                        \
                      REPORT_LOCATION_ARGS(RExC_parse));                    \
} STMT_END

/*
 * Runs PREPARE_TO_DIE, then Simple_vFAIL3().
 */
#define vFAIL3(m,a1,a2) STMT_START {                                        \
    PREPARE_TO_DIE;                                                         \
    Simple_vFAIL3(m, a1, a2);                                               \
} STMT_END

/*
 * Like Simple_vFAIL2(), but accepts three extra arguments.
 */
#define Simple_vFAIL4(m, a1, a2, a3) STMT_START {                           \
    S_re_croak(aTHX_ UTF, m REPORT_LOCATION, a1, a2, a3,                    \
                      REPORT_LOCATION_ARGS(RExC_parse));                    \
} STMT_END

/*
 * Runs PREPARE_TO_DIE, then Simple_vFAIL4().
 */
#define vFAIL4(m,a1,a2,a3) STMT_START {                                     \
    PREPARE_TO_DIE;                                                         \
    Simple_vFAIL4(m, a1, a2, a3);                                           \
} STMT_END

/* A specialized version of vFAIL2 that works with UTF8f */
#define vFAIL2utf8f(m, a1) STMT_START {                                     \
    PREPARE_TO_DIE;                                                         \
    S_re_croak(aTHX_ UTF, m REPORT_LOCATION, a1,                            \
            REPORT_LOCATION_ARGS(RExC_parse));                              \
} STMT_END

/* A specialized version of vFAIL3 that works with UTF8f */
#define vFAIL3utf8f(m, a1, a2) STMT_START {                                 \
    PREPARE_TO_DIE;                                                         \
    S_re_croak(aTHX_ UTF, m REPORT_LOCATION, a1, a2,                        \
            REPORT_LOCATION_ARGS(RExC_parse));                              \
} STMT_END
973
/* Save 'RExC_copy_start_in_constructed' (one level only) and then set it to
 * NULL.  Setting this to NULL is a signal to not output warnings */
#define TURN_OFF_WARNINGS_IN_SUBSTITUTE_PARSE                               \
    STMT_START {                                                            \
      RExC_save_copy_start_in_constructed = RExC_copy_start_in_constructed; \
      RExC_copy_start_in_constructed = NULL;                                \
    } STMT_END

/* Undo TURN_OFF_WARNINGS_IN_SUBSTITUTE_PARSE.  The assignment is wrapped in
 * parentheses so the expansion is a single expression in any context. */
#define RESTORE_WARNINGS                                                    \
    (RExC_copy_start_in_constructed = RExC_save_copy_start_in_constructed)
982
/* Since a warning can be generated multiple times as the input is reparsed, we
 * output it the first time we come to that point in the parse, but suppress it
 * otherwise.  'RExC_copy_start_in_constructed' being NULL is a flag to not
 * generate any warnings.  Otherwise this is true only when 'loc', translated
 * to an offset in the input pattern, is beyond 'RExC_latest_warn_offset'. */
#define TO_OUTPUT_WARNINGS(loc)                                             \
  (   RExC_copy_start_in_constructed                                        \
   && ((xI(loc)) - RExC_precomp) > (Ptrdiff_t) RExC_latest_warn_offset)

/* After we've emitted a warning, we save the position in the input so we don't
 * output it again.  The position is clamped to the range sI..eI before being
 * stored as an offset from RExC_precomp. */
#define UPDATE_WARNINGS_LOC(loc)                                            \
    STMT_START {                                                            \
        if (TO_OUTPUT_WARNINGS(loc)) {                                      \
            RExC_latest_warn_offset = MAX(sI, MIN(eI, xI(loc)))             \
                                                       - RExC_precomp;      \
        }                                                                   \
    } STMT_END
1000
/* Common wrapper for the warning macros.  'code' is the statement that
 * actually emits the warning, and 'warns' is the output of the packWARNx
 * macro used in 'code'.
 *
 * Croaks with a panic message if called while
 * 'RExC_copy_start_in_constructed' is NULL.  Otherwise 'code' runs only when
 * TO_OUTPUT_WARNINGS(loc) is true, preceded by PREPARE_TO_DIE if
 * ckDEAD(warns) is true, and followed by UPDATE_WARNINGS_LOC(loc). */
#define _WARN_HELPER(loc, warns, code)                                      \
    STMT_START {                                                            \
        if (! RExC_copy_start_in_constructed) {                             \
            Perl_croak( aTHX_ "panic! %s: %d: Tried to warn when none"      \
                              " expected at '%s'",                          \
                              __FILE__, __LINE__, loc);                     \
        }                                                                   \
        if (TO_OUTPUT_WARNINGS(loc)) {                                      \
            if (ckDEAD(warns)) {                                            \
                PREPARE_TO_DIE;                                             \
            }                                                               \
            code;                                                           \
            UPDATE_WARNINGS_LOC(loc);                                       \
        }                                                                   \
    } STMT_END
1016
/* m is not necessarily a "literal string", in this macro: it is passed
 * through "%s" rather than pasted into the format. */
#define warn_non_literal_string(loc, packed_warn, m)                        \
    _WARN_HELPER(loc, packed_warn,                                          \
                      Perl_warner(aTHX_ packed_warn,                        \
                                  "%s" REPORT_LOCATION,                     \
                                  m, REPORT_LOCATION_ARGS(loc)))

/* warn_non_literal_string() in the WARN_REGEXP category. */
#define reg_warn_non_literal_string(loc, m)                                 \
                warn_non_literal_string(loc, packWARN(WARN_REGEXP), m)

/* Warn using a format 'm' that is not a string literal and takes one
 * argument 'a1'.  Because 'm' cannot be pasted onto REPORT_LOCATION at
 * compile time, the combined format is built in a Newx() buffer, which is
 * handed to SAVEFREEPV(). */
#define ckWARN2_non_literal_string(loc, packwarn, m, a1)                    \
    STMT_START {                                                            \
                char * format;                                              \
                Size_t format_size = strlen(m) + strlen(REPORT_LOCATION)+ 1;\
                Newx(format, format_size, char);                            \
                my_strlcpy(format, m, format_size);                         \
                my_strlcat(format, REPORT_LOCATION, format_size);           \
                SAVEFREEPV(format);                                         \
                _WARN_HELPER(loc, packwarn,                                 \
                      Perl_ck_warner(aTHX_ packwarn,                        \
                                        format,                             \
                                        a1, REPORT_LOCATION_ARGS(loc)));    \
    } STMT_END
1039
/* Warnings whose message 'm' is a string literal taking no arguments.  Each
 * wraps the named Perl warner call in _WARN_HELPER() for position 'loc' and
 * appends REPORT_LOCATION to the message. */

/* WARN_REGEXP category, via Perl_ck_warner() */
#define ckWARNreg(loc,m)                                                    \
    _WARN_HELPER(loc, packWARN(WARN_REGEXP),                                \
                      Perl_ck_warner(aTHX_ packWARN(WARN_REGEXP),           \
                                          m REPORT_LOCATION,                \
                                          REPORT_LOCATION_ARGS(loc)))

/* WARN_REGEXP category, via Perl_warner().  (No trailing line continuation
 * after the final line: the definition must end here.) */
#define vWARN(loc, m)                                                       \
    _WARN_HELPER(loc, packWARN(WARN_REGEXP),                                \
                      Perl_warner(aTHX_ packWARN(WARN_REGEXP),              \
                                       m REPORT_LOCATION,                   \
                                       REPORT_LOCATION_ARGS(loc)))

/* WARN_DEPRECATED category, via Perl_warner() */
#define vWARN_dep(loc, m)                                                   \
    _WARN_HELPER(loc, packWARN(WARN_DEPRECATED),                            \
                      Perl_warner(aTHX_ packWARN(WARN_DEPRECATED),          \
                                       m REPORT_LOCATION,                   \
                                       REPORT_LOCATION_ARGS(loc)))

/* WARN_DEPRECATED category, via Perl_ck_warner_d() */
#define ckWARNdep(loc,m)                                                    \
    _WARN_HELPER(loc, packWARN(WARN_DEPRECATED),                            \
                      Perl_ck_warner_d(aTHX_ packWARN(WARN_DEPRECATED),     \
                                            m REPORT_LOCATION,              \
                                            REPORT_LOCATION_ARGS(loc)))

/* WARN_DEPRECATED and WARN_REGEXP categories, via Perl_ck_warner_d() */
#define ckWARNregdep(loc,m)                                                 \
    _WARN_HELPER(loc, packWARN2(WARN_DEPRECATED, WARN_REGEXP),              \
                      Perl_ck_warner_d(aTHX_ packWARN2(WARN_DEPRECATED,     \
                                                      WARN_REGEXP),         \
                                             m REPORT_LOCATION,             \
                                             REPORT_LOCATION_ARGS(loc)))
1070
/* WARN_REGEXP warnings whose string literal format 'm' takes extra
 * arguments.  The digit in each name is one more than the number of extra
 * arguments; the prefix and suffix name the warner called: "ck...reg" uses
 * Perl_ck_warner(), "ck...reg_d" uses Perl_ck_warner_d(), and "vWARN" uses
 * Perl_warner(). */

#define ckWARN2reg_d(loc,m, a1)                                             \
    _WARN_HELPER(loc, packWARN(WARN_REGEXP),                                \
                      Perl_ck_warner_d(aTHX_ packWARN(WARN_REGEXP),         \
                                            m REPORT_LOCATION,              \
                                            a1, REPORT_LOCATION_ARGS(loc)))

#define ckWARN2reg(loc, m, a1)                                              \
    _WARN_HELPER(loc, packWARN(WARN_REGEXP),                                \
                      Perl_ck_warner(aTHX_ packWARN(WARN_REGEXP),           \
                                          m REPORT_LOCATION,                \
                                          a1, REPORT_LOCATION_ARGS(loc)))

#define vWARN3(loc, m, a1, a2)                                              \
    _WARN_HELPER(loc, packWARN(WARN_REGEXP),                                \
                      Perl_warner(aTHX_ packWARN(WARN_REGEXP),              \
                                       m REPORT_LOCATION,                   \
                                       a1, a2, REPORT_LOCATION_ARGS(loc)))

#define ckWARN3reg(loc, m, a1, a2)                                          \
    _WARN_HELPER(loc, packWARN(WARN_REGEXP),                                \
                      Perl_ck_warner(aTHX_ packWARN(WARN_REGEXP),           \
                                          m REPORT_LOCATION,                \
                                          a1, a2,                           \
                                          REPORT_LOCATION_ARGS(loc)))

#define vWARN4(loc, m, a1, a2, a3)                                          \
    _WARN_HELPER(loc, packWARN(WARN_REGEXP),                                \
                      Perl_warner(aTHX_ packWARN(WARN_REGEXP),              \
                                       m REPORT_LOCATION,                   \
                                       a1, a2, a3,                          \
                                       REPORT_LOCATION_ARGS(loc)))

#define ckWARN4reg(loc, m, a1, a2, a3)                                      \
    _WARN_HELPER(loc, packWARN(WARN_REGEXP),                                \
                      Perl_ck_warner(aTHX_ packWARN(WARN_REGEXP),           \
                                          m REPORT_LOCATION,                \
                                          a1, a2, a3,                       \
                                          REPORT_LOCATION_ARGS(loc)))

#define vWARN5(loc, m, a1, a2, a3, a4)                                      \
    _WARN_HELPER(loc, packWARN(WARN_REGEXP),                                \
                      Perl_warner(aTHX_ packWARN(WARN_REGEXP),              \
                                       m REPORT_LOCATION,                   \
                                       a1, a2, a3, a4,                      \
                                       REPORT_LOCATION_ARGS(loc)))
1116
/* Emit an experimental-feature warning of category 'class' at most once per
 * compilation.  The per-category flag 'RExC_warned_<class>' is set before
 * the warning is attempted, so later calls for the same category do nothing.
 * 'm' is a string literal. */
#define ckWARNexperimental(loc, class, m)                                   \
    STMT_START {                                                            \
        if (! RExC_warned_ ## class) { /* warn once per compilation */      \
            RExC_warned_ ## class = 1;                                      \
            _WARN_HELPER(loc, packWARN(class),                              \
                      Perl_ck_warner_d(aTHX_ packWARN(class),               \
                                            m REPORT_LOCATION,              \
                                            REPORT_LOCATION_ARGS(loc)));    \
        }                                                                   \
    } STMT_END

/* As ckWARNexperimental(), but the format 'm' takes one argument 'arg'. */
#define ckWARNexperimental_with_arg(loc, class, m, arg)                     \
    STMT_START {                                                            \
        if (! RExC_warned_ ## class) { /* warn once per compilation */      \
            RExC_warned_ ## class = 1;                                      \
            _WARN_HELPER(loc, packWARN(class),                              \
                      Perl_ck_warner_d(aTHX_ packWARN(class),               \
                                       m REPORT_LOCATION,                   \
                                       arg, REPORT_LOCATION_ARGS(loc)));    \
        }                                                                   \
    } STMT_END
1138
/* Convert between a pointer to a node and its offset from the beginning of the
 * program (RExC_emit_start).  REGNODE_OFFSET() asserts that 'node' is not
 * before the start of the program. */
#define REGNODE_p(offset)    (RExC_emit_start + (offset))
#define REGNODE_OFFSET(node) (__ASSERT_((node) >= RExC_emit_start)          \
                              (SSize_t) ((node) - RExC_emit_start))
1144
/* Read / write the 'proglen' member of the structure 'ri' points to.  The
 * arguments and the whole expansion are parenthesized so that any pointer
 * expression can be passed and the result can be used inside a larger
 * expression. */
#define ProgLen(ri)      ((ri)->proglen)
#define SetProgLen(ri,x) ((ri)->proglen = (x))
1147
/* EXPERIMENTAL_INPLACESCAN is defined only when
 * PERL_ENABLE_EXPERIMENTAL_REGEX_OPTIMISATIONS is defined to a non-zero
 * value.  The explicit defined() test gives the same result as a bare #if
 * but does not depend on an undefined macro evaluating to 0. */
#if defined(PERL_ENABLE_EXPERIMENTAL_REGEX_OPTIMISATIONS)                   \
    && PERL_ENABLE_EXPERIMENTAL_REGEX_OPTIMISATIONS
#define EXPERIMENTAL_INPLACESCAN
#endif /*PERL_ENABLE_EXPERIMENTAL_REGEX_OPTIMISATIONS*/
1151
/* Under DEBUG_OPTIMISE_MORE_r, print "RExC_seen: " followed by the name of
 * each REG_*_SEEN bit that is set in RExC_seen, then a newline.
 *
 * NOTE: the expansion ends with its own ';'.  That is kept so that call
 * sites written without a trailing semicolon continue to compile; it does
 * mean this macro cannot be the unbraced body of an 'if' that has an
 * 'else'. */
#define DEBUG_RExC_seen()                                                   \
        DEBUG_OPTIMISE_MORE_r({                                             \
            Perl_re_printf( aTHX_ "RExC_seen: ");                           \
                                                                            \
            if (RExC_seen & REG_ZERO_LEN_SEEN)                              \
                Perl_re_printf( aTHX_ "REG_ZERO_LEN_SEEN ");                \
                                                                            \
            if (RExC_seen & REG_LOOKBEHIND_SEEN)                            \
                Perl_re_printf( aTHX_ "REG_LOOKBEHIND_SEEN ");              \
                                                                            \
            if (RExC_seen & REG_GPOS_SEEN)                                  \
                Perl_re_printf( aTHX_ "REG_GPOS_SEEN ");                    \
                                                                            \
            if (RExC_seen & REG_RECURSE_SEEN)                               \
                Perl_re_printf( aTHX_ "REG_RECURSE_SEEN ");                 \
                                                                            \
            if (RExC_seen & REG_TOP_LEVEL_BRANCHES_SEEN)                    \
                Perl_re_printf( aTHX_ "REG_TOP_LEVEL_BRANCHES_SEEN ");      \
                                                                            \
            if (RExC_seen & REG_VERBARG_SEEN)                               \
                Perl_re_printf( aTHX_ "REG_VERBARG_SEEN ");                 \
                                                                            \
            if (RExC_seen & REG_CUTGROUP_SEEN)                              \
                Perl_re_printf( aTHX_ "REG_CUTGROUP_SEEN ");                \
                                                                            \
            if (RExC_seen & REG_RUN_ON_COMMENT_SEEN)                        \
                Perl_re_printf( aTHX_ "REG_RUN_ON_COMMENT_SEEN ");          \
                                                                            \
            if (RExC_seen & REG_UNFOLDED_MULTI_SEEN)                        \
                Perl_re_printf( aTHX_ "REG_UNFOLDED_MULTI_SEEN ");          \
                                                                            \
            if (RExC_seen & REG_UNBOUNDED_QUANTIFIER_SEEN)                  \
                Perl_re_printf( aTHX_ "REG_UNBOUNDED_QUANTIFIER_SEEN ");    \
                                                                            \
            Perl_re_printf( aTHX_ "\n");                                    \
        });
1188
/* If bit(s) 'flag' are set in 'flags', print the spelling of 'flag' as
 * written at the call site, followed by a space.  Wrapped in
 * STMT_START/STMT_END so the macro is a single statement and cannot capture
 * an 'else' that follows it. */
#define DEBUG_SHOW_STUDY_FLAG(flags,flag)                                   \
    STMT_START {                                                            \
        if ((flags) & (flag)) {                                             \
            Perl_re_printf( aTHX_  "%s ", #flag);                           \
        }                                                                   \
    } STMT_END
1191
1192
/* Debugging hooks: under DEBUGGING they forward to debug_studydata() and
 * debug_peep() (the latter also passes the caller's 'pRExC_state');
 * otherwise they expand to NOOP. */
#ifdef DEBUGGING
#  define DEBUG_STUDYDATA(where, data, depth, is_inf, min, stopmin, delta) \
                    debug_studydata(where, data, depth, is_inf, min, stopmin, delta)

#  define DEBUG_PEEP(str, scan, depth, flags)   \
                    debug_peep(str, pRExC_state, scan, depth, flags)
#else
#  define DEBUG_STUDYDATA(where, data, depth, is_inf, min, stopmin, delta) NOOP
#  define DEBUG_PEEP(str, scan, depth, flags)         NOOP
#endif
1203
/* Shorthands that call regtail() with the caller's local 'depth' plus one.
 * Under DEBUGGING, REGTAIL_STUDY calls regtail_study() instead. */
#define REGTAIL(x,y,z) regtail((x),(y),(z),depth+1)
#ifdef DEBUGGING
#define REGTAIL_STUDY(x,y,z) regtail_study((x),(y),(z),depth+1)
#else
#define REGTAIL_STUDY(x,y,z) regtail((x),(y),(z),depth+1)
#endif
1210
/* Distinct single-bit MADE_* values. */
#define MADE_TRIE       1
#define MADE_JUMP_TRIE  2
#define MADE_EXACT_TRIE 4

/* Consecutive index values, starting at 0. */
#define INVLIST_INDEX                   0
#define ONLY_LOCALE_MATCHES_INDEX       1
#define DEFERRED_USER_DEFINED_INDEX     2
1218
/* These two functions currently do the exact same thing */
#define ssc_init_zero           ssc_init

/* Add the single code point 'cp' to 'ssc' (a one-element range). */
#define ssc_add_cp(ssc, cp)   ssc_add_range((ssc), (cp), (cp))
/* Add the full range 0 .. UV_MAX to 'ssc'. */
#define ssc_match_all_cp(ssc) ssc_add_range((ssc), 0, UV_MAX)
1224
/* Under DEBUGGING the opcode 'op' is passed on to regnode_guts_debug();
 * otherwise it is dropped and regnode_guts() is called with the state and
 * extra size only. */
#ifdef DEBUGGING
#define REGNODE_GUTS(state,op,extra_size) \
    regnode_guts_debug(state,op,extra_size)
#else
#define REGNODE_GUTS(state,op,extra_size) \
    regnode_guts(state,extra_size)
#endif
1232
/* If the caller's 'optstart' is set, report (under DEBUG_OPTIMISE_r) how many
 * nodes lie between it and the caller's 'node', then reset it to NULL.
 * The test is inside STMT_START/STMT_END so that the macro is one statement
 * and cannot capture an 'else' that follows it. */
#define CLEAR_OPTSTART                                                      \
    STMT_START {                                                            \
        if (optstart) {                                                     \
            DEBUG_OPTIMISE_r(Perl_re_printf( aTHX_                          \
                  " (%" IVdf " nodes)\n", (IV)(node - optstart)));          \
            optstart=NULL;                                                  \
        }                                                                   \
    } STMT_END

/* Run CLEAR_OPTSTART, then set the caller's 'node' to the result of
 * dumpuntil() over b..e, one level deeper in both 'indent' and 'depth'.
 * Both steps are grouped into one statement so that an unbraced
 * 'if (...) DUMPUNTIL(b, e)' guards the dumpuntil() call as well.
 *
 * NOTE: the trailing ';' is kept so that call sites written without their
 * own semicolon continue to compile. */
#define DUMPUNTIL(b,e)                                                      \
    STMT_START {                                                            \
        CLEAR_OPTSTART;                                                     \
        node = dumpuntil(r,start,(b),(e),last,sv,indent+1,depth+1);         \
    } STMT_END;
1243
fe5492d9 1244
85900e28 1245#endif /* REGCOMP_INTERNAL_H */