This is a live mirror of the Perl 5 development currently hosted at https://github.com/perl/perl5
IO::getline(): use CALLRUNOPS
[perl5.git] / regcomp_internal.h
CommitLineData
85900e28
YO
1#ifndef REGCOMP_INTERNAL_H
2#define REGCOMP_INTERNAL_H
3#ifndef STATIC
4#define STATIC static
5#endif
c5b1c090
YO
6#ifndef RE_OPTIMIZE_CURLYX_TO_CURLYM
7#define RE_OPTIMIZE_CURLYX_TO_CURLYM 1
8#endif
9#ifndef RE_OPTIMIZE_CURLYX_TO_CURLYN
10#define RE_OPTIMIZE_CURLYX_TO_CURLYN 1
11#endif
85900e28
YO
12
13/* this is a chain of data about sub patterns we are processing that
14 need to be handled separately/specially in study_chunk. It's so
15 we can simulate recursion without losing state. */
16struct scan_frame;
17typedef struct scan_frame {
18 regnode *last_regnode; /* last node to process in this frame */
19 regnode *next_regnode; /* next node to process when last is reached */
20 U32 prev_recursed_depth;
21 I32 stopparen; /* what stopparen do we use */
22 bool in_gosub; /* this or an outer frame is for GOSUB */
23
24 struct scan_frame *this_prev_frame; /* this previous frame */
25 struct scan_frame *prev_frame; /* previous frame */
26 struct scan_frame *next_frame; /* next frame */
27} scan_frame;
28
29/* Certain characters are output as a sequence with the first being a
30 * backslash. */
31#define isBACKSLASHED_PUNCT(c) memCHRs("-[]\\^", c)
32
33
34struct RExC_state_t {
35 U32 flags; /* RXf_* are we folding, multilining? */
36 U32 pm_flags; /* PMf_* stuff from the calling PMOP */
37 char *precomp; /* uncompiled string. */
38 char *precomp_end; /* pointer to end of uncompiled string. */
39 REGEXP *rx_sv; /* The SV that is the regexp. */
40 regexp *rx; /* perl core regexp structure */
41 regexp_internal *rxi; /* internal data for regexp object
42 pprivate field */
43 char *start; /* Start of input for compile */
44 char *end; /* End of input for compile */
45 char *parse; /* Input-scan pointer. */
46 char *copy_start; /* start of copy of input within
47 constructed parse string */
48 char *save_copy_start; /* Provides one level of saving
49 and restoring 'copy_start' */
50 char *copy_start_in_input; /* Position in input string
51 corresponding to copy_start */
52 SSize_t whilem_seen; /* number of WHILEM in this expr */
53 regnode *emit_start; /* Start of emitted-code area */
54 regnode_offset emit; /* Code-emit pointer */
55 I32 naughty; /* How bad is this pattern? */
56 I32 sawback; /* Did we see \1, ...? */
57 SSize_t size; /* Number of regnode equivalents in
58 pattern */
59 Size_t sets_depth; /* Counts recursion depth of already-
60 compiled regex set patterns */
61 U32 seen;
62
63 I32 parens_buf_size; /* #slots malloced open/close_parens */
64 regnode_offset *open_parens; /* offsets to open parens */
65 regnode_offset *close_parens; /* offsets to close parens */
66 HV *paren_names; /* Paren names */
67
68 /* position beyond 'precomp' of the warning message furthest away from
69 * 'precomp'. During the parse, no warnings are raised for any problems
70 * earlier in the parse than this position. This works if warnings are
71 * raised the first time a given spot is parsed, and if only one
72 * independent warning is raised for any given spot */
73 Size_t latest_warn_offset;
74
fe5492d9
YO
75 /* Branch reset /(?|...|...)/ gives us two concepts of capture buffer id.
76 * "Logical Parno" is the user visible view with branch reset taken into
77 * account. "Parno" (or physical parno) is the actual capture buffers in
78 * the pattern *NOT* taking into account branch reset. We also maintain
79 * a map of "next" pointers which allow us to skip to the next physical
80 * capture buffer with the same logical id, with 0 representing "none".
81 *
82 * As we compile we keep track of the two different counts using the
83 * 'logical_npar' and 'npar' members, and we keep track of the upper bound
84 * of both in 'total_par' and 'logical_total_par', we also populate
85 * the 'logical_to_parno' map, which gives us the first physical parno
86 * for a given logical parno, and the `parno_to_logical` array which gives
87 * us the logical id for each physical parno. When compilation is
88 * completed we construct the 'parno_to_logical_next' array from the
89 * 'parno_to_logical' array. (We do not bother constructing it during
90 * compilation as we do not need it, and we can construct it in O(N) time
91 * once we are done, but would need more complicated logic during the
92 * compile, because we want the next pointers to go from smallest to
93 * largest, eg, left to right.)
94 *
95 * Logical: $1 $2 $3 $4 $2 $3 $2 $5
96 * Physical: 1 2 3 4 5 6 7 8
97 * Next: 0 5 6 0 7 0 0 0
98 * Pattern /(a) (?| (b) (c) (d) | (e) (f) | (g) ) (h)/
99 *
100 * As much as possible the internals use and store the physical id
101 * of capture buffers. We decode the physical to the logical only when
102 * we need to, for instance when someone uses $2.
103 *
104 * Note that when branch reset is not used logical and physical are the
105 * same and the next data would be all zero. So when branch reset is not
106 * used we do not need to populate this data into the final regexp.
107 *
108 */
109 I32 *logical_to_parno; /* logical_parno to parno */
110 I32 *parno_to_logical; /* parno to logical_parno */
111 I32 *parno_to_logical_next; /* parno to next (greater value)
112 parno with the same
113 logical_parno as parno.*/
114
85900e28
YO
115 I32 npar; /* Capture buffer count so far in the
116 parse, (OPEN) plus one. ("par" 0 is
117 the whole pattern)*/
fe5492d9 118 I32 logical_npar; /* Logical version of npar */
85900e28
YO
119 I32 total_par; /* During initial parse, is either 0,
120 or -1; the latter indicating a
121 reparse is needed. After that pass,
122 it is what 'npar' became after the
123 pass. Hence, it being > 0 indicates
124 we are in a reparse situation */
fe5492d9 125 I32 logical_total_par; /* Logical version of total_par */
85900e28
YO
126 I32 nestroot; /* root parens we are in - used by
127 accept */
128 I32 seen_zerolen;
129 regnode *end_op; /* END node in program */
130 I32 utf8; /* whether the pattern is utf8 or not */
131 I32 orig_utf8; /* whether the pattern was originally in utf8 */
132 /* XXX use this for future optimisation of case
133 * where pattern must be upgraded to utf8. */
134 I32 uni_semantics; /* If a d charset modifier should use unicode
135 rules, even if the pattern is not in
136 utf8 */
137
138 I32 recurse_count; /* Number of recurse regops we have generated */
139 regnode **recurse; /* Recurse regops */
140 U8 *study_chunk_recursed; /* bitmap of which subs we have moved
141 through */
142 U32 study_chunk_recursed_bytes; /* bytes in bitmap */
143 I32 in_lookaround;
144 I32 contains_locale;
145 I32 override_recoding;
146 I32 recode_x_to_native;
147 I32 in_multi_char_class;
148 int code_index; /* next code_blocks[] slot */
149 struct reg_code_blocks *code_blocks;/* positions of literal (?{})
150 within pattern */
151 SSize_t maxlen; /* minimum possible number of chars in string to match (NOTE(review): field is named maxlen; confirm this should not read "maximum") */
152 scan_frame *frame_head;
153 scan_frame *frame_last;
154 U32 frame_count;
155 AV *warn_text;
156 HV *unlexed_names;
157 SV *runtime_code_qr; /* qr with the runtime code blocks */
158#ifdef DEBUGGING
159 const char *lastparse;
160 I32 lastnum;
161 U32 study_chunk_recursed_count;
162 AV *paren_name_list; /* idx -> name */
163 SV *mysv1;
164 SV *mysv2;
85900e28
YO
165#endif
166 bool seen_d_op;
167 bool strict;
168 bool study_started;
169 bool in_script_run;
170 bool use_BRANCHJ;
171 bool sWARN_EXPERIMENTAL__VLB;
172 bool sWARN_EXPERIMENTAL__REGEX_SETS;
173};
174
e7252fd4
YO
175#ifdef DEBUGGING
176#define RExC_lastparse (pRExC_state->lastparse)
177#define RExC_lastnum (pRExC_state->lastnum)
178#define RExC_paren_name_list (pRExC_state->paren_name_list)
179#define RExC_study_chunk_recursed_count (pRExC_state->study_chunk_recursed_count)
180#define RExC_mysv (pRExC_state->mysv1)
181#define RExC_mysv1 (pRExC_state->mysv1)
182#define RExC_mysv2 (pRExC_state->mysv2)
183#endif
184
85900e28
YO
185#define RExC_flags (pRExC_state->flags)
186#define RExC_pm_flags (pRExC_state->pm_flags)
187#define RExC_precomp (pRExC_state->precomp)
188#define RExC_copy_start_in_input (pRExC_state->copy_start_in_input)
189#define RExC_copy_start_in_constructed (pRExC_state->copy_start)
190#define RExC_save_copy_start_in_constructed (pRExC_state->save_copy_start)
191#define RExC_precomp_end (pRExC_state->precomp_end)
192#define RExC_rx_sv (pRExC_state->rx_sv)
193#define RExC_rx (pRExC_state->rx)
194#define RExC_rxi (pRExC_state->rxi)
195#define RExC_start (pRExC_state->start)
196#define RExC_end (pRExC_state->end)
197#define RExC_parse (pRExC_state->parse)
198#define RExC_latest_warn_offset (pRExC_state->latest_warn_offset )
199#define RExC_whilem_seen (pRExC_state->whilem_seen)
200#define RExC_seen_d_op (pRExC_state->seen_d_op) /* Seen something that differs
201 under /d from /u ? */
202
203#define RExC_emit (pRExC_state->emit)
204#define RExC_emit_start (pRExC_state->emit_start)
205#define RExC_sawback (pRExC_state->sawback)
206#define RExC_seen (pRExC_state->seen)
207#define RExC_size (pRExC_state->size)
208#define RExC_maxlen (pRExC_state->maxlen)
fe5492d9
YO
209#define RExC_logical_npar (pRExC_state->logical_npar)
210#define RExC_logical_total_parens (pRExC_state->logical_total_par)
211#define RExC_logical_to_parno (pRExC_state->logical_to_parno)
212#define RExC_parno_to_logical (pRExC_state->parno_to_logical)
213#define RExC_parno_to_logical_next (pRExC_state->parno_to_logical_next)
85900e28
YO
214#define RExC_npar (pRExC_state->npar)
215#define RExC_total_parens (pRExC_state->total_par)
216#define RExC_parens_buf_size (pRExC_state->parens_buf_size)
217#define RExC_nestroot (pRExC_state->nestroot)
218#define RExC_seen_zerolen (pRExC_state->seen_zerolen)
219#define RExC_utf8 (pRExC_state->utf8)
220#define RExC_uni_semantics (pRExC_state->uni_semantics)
221#define RExC_orig_utf8 (pRExC_state->orig_utf8)
222#define RExC_open_parens (pRExC_state->open_parens)
223#define RExC_close_parens (pRExC_state->close_parens)
224#define RExC_end_op (pRExC_state->end_op)
225#define RExC_paren_names (pRExC_state->paren_names)
226#define RExC_recurse (pRExC_state->recurse)
227#define RExC_recurse_count (pRExC_state->recurse_count)
228#define RExC_sets_depth (pRExC_state->sets_depth)
229#define RExC_study_chunk_recursed (pRExC_state->study_chunk_recursed)
230#define RExC_study_chunk_recursed_bytes \
231 (pRExC_state->study_chunk_recursed_bytes)
232#define RExC_in_lookaround (pRExC_state->in_lookaround)
233#define RExC_contains_locale (pRExC_state->contains_locale)
234#define RExC_recode_x_to_native (pRExC_state->recode_x_to_native)
235
236#ifdef EBCDIC
237# define SET_recode_x_to_native(x) \
238 STMT_START { RExC_recode_x_to_native = (x); } STMT_END
239#else
240# define SET_recode_x_to_native(x) NOOP
241#endif
242
243#define RExC_in_multi_char_class (pRExC_state->in_multi_char_class)
244#define RExC_frame_head (pRExC_state->frame_head)
245#define RExC_frame_last (pRExC_state->frame_last)
246#define RExC_frame_count (pRExC_state->frame_count)
247#define RExC_strict (pRExC_state->strict)
248#define RExC_study_started (pRExC_state->study_started)
249#define RExC_warn_text (pRExC_state->warn_text)
250#define RExC_in_script_run (pRExC_state->in_script_run)
251#define RExC_use_BRANCHJ (pRExC_state->use_BRANCHJ)
252#define RExC_warned_WARN_EXPERIMENTAL__VLB (pRExC_state->sWARN_EXPERIMENTAL__VLB)
253#define RExC_warned_WARN_EXPERIMENTAL__REGEX_SETS (pRExC_state->sWARN_EXPERIMENTAL__REGEX_SETS)
254#define RExC_unlexed_names (pRExC_state->unlexed_names)
255
256
257/***********************************************************************/
258/* UTILITY MACROS FOR ADVANCING OR SETTING THE PARSE "CURSOR" RExC_parse
259 *
260 * All of these macros depend on the above RExC_ accessor macros, which
261 * in turn depend on a variable pRExC_state being in scope where they
262 * are used. This is the standard regexp parser context variable which is
263 * passed into every non-trivial parse function in this file.
264 *
265 * Note that the UTF macro is itself a wrapper around RExC_utf8, so all
266 * of the macros which do not take an argument will operate on the
267 * pRExC_state structure *only*.
268 *
269 * Please do NOT modify RExC_parse without using these macros. In the
270 * future these macros will be extended for enhanced debugging and trace
271 * output during the parse process.
272 */
273
274/* RExC_parse_incf(flag)
275 *
276 * Increment RExC_parse to point at the next codepoint, while doing
277 * the right thing depending on whether we are parsing UTF-8 strings
278 * or not. The 'flag' argument determines if content is UTF-8 or not,
279 * intended for cases where this is NOT governed by the UTF macro.
280 *
281 * Use RExC_parse_inc() if UTF-8ness is controlled by the UTF macro.
282 *
283 * WARNING: Does NOT take into account RExC_end; it is the callers
284 * responsibility to make sure there are enough octets left in
285 * RExC_parse to ensure that when processing UTF-8 we would not read
286 * past the end of the string.
287 */
288#define RExC_parse_incf(flag) STMT_START { \
289 RExC_parse += (flag) ? UTF8SKIP(RExC_parse) : 1; \
290} STMT_END
291
292/* RExC_parse_inc_safef(flag)
293 *
294 * Safely increment RExC_parse to point at the next codepoint,
295 * doing the right thing depending on whether we are parsing
296 * UTF-8 strings or not and NOT reading past the end of the buffer.
297 * The 'flag' argument determines if content is UTF-8 or not,
298 * intended for cases where this is NOT governed by the UTF macro.
299 *
300 * Use RExC_parse_inc_safe() if UTF-8ness is controlled by the UTF macro.
301 *
302 * NOTE: Will NOT read past RExC_end when content is UTF-8.
303 */
304#define RExC_parse_inc_safef(flag) STMT_START { \
305 RExC_parse += (flag) ? UTF8_SAFE_SKIP(RExC_parse,RExC_end) : 1; \
306} STMT_END
307
308/* RExC_parse_inc()
309 *
310 * Increment RExC_parse to point at the next codepoint,
311 * doing the right thing depending on whether we are parsing
312 * UTF-8 strings or not.
313 *
314 * WARNING: Does NOT take into account RExC_end, it is the callers
315 * responsibility to make sure there are enough octets left in
316 * RExC_parse to ensure that when processing UTF-8 we would not read
317 * past the end of the string.
318 *
319 * NOTE: whether we are parsing UTF-8 or not is determined by the
320 * UTF macro which is defined as cBOOL(RExC_utf8), thus this
321 * macro operates on the pRExC_state structure only.
322 */
323#define RExC_parse_inc() RExC_parse_incf(UTF)
324
325/* RExC_parse_inc_safe()
326 *
327 * Safely increment RExC_parse to point at the next codepoint,
328 * doing the right thing depending on whether we are parsing
329 * UTF-8 strings or not and NOT reading past the end of the buffer.
330 *
331 * NOTE: whether we are parsing UTF-8 or not is determined by the
332 * UTF macro which is defined as cBOOL(RExC_utf8), thus this
333 * macro operates on the pRExC_state structure only.
334 */
335#define RExC_parse_inc_safe() RExC_parse_inc_safef(UTF)
336
337/* RExC_parse_inc_utf8()
338 *
339 * Increment RExC_parse to point at the next utf8 codepoint,
340 * assumes content is UTF-8.
341 *
342 * WARNING: Does NOT take into account RExC_end; it is the callers
343 * responsibility to make sure there are enough octets left in RExC_parse
344 * to ensure that when processing UTF-8 we would not read past the end
345 * of the string.
346 */
347#define RExC_parse_inc_utf8() STMT_START { \
348 RExC_parse += UTF8SKIP(RExC_parse); \
349} STMT_END
350
351/* RExC_parse_inc_if_char()
352 *
353 * Increment RExC_parse to point at the next codepoint, if and only
354 * if the current parse point is NOT a NUL, while doing the right thing
355 * depending on whether we are parsing UTF-8 strings or not.
356 *
357 * WARNING: Does NOT take into account RExC_end, it is the callers
358 * responsibility to make sure there are enough octets left in RExC_parse
359 * to ensure that when processing UTF-8 we would not read past the end
360 * of the string.
361 *
362 * NOTE: whether we are parsing UTF-8 or not is determined by the
363 * UTF macro which is defined as cBOOL(RExC_utf8), thus this
364 * macro operates on the pRExC_state structure only.
365 */
366#define RExC_parse_inc_if_char() STMT_START { \
367 RExC_parse += SKIP_IF_CHAR(RExC_parse,RExC_end); \
368} STMT_END
369
370/* RExC_parse_inc_by(n_octets)
371 *
372 * Increment the parse cursor by the number of octets specified by
373 * the 'n_octets' argument.
374 *
375 * NOTE: Does NOT check ANY constraints. It is the callers responsibility
376 * that this will not move past the end of the string, or leave the
377 * pointer in the middle of a UTF-8 sequence.
378 *
379 * Typically used to advance past previously analyzed content.
380 */
381#define RExC_parse_inc_by(n_octets) STMT_START { \
382 RExC_parse += (n_octets); \
383} STMT_END
384
385/* RExC_parse_set(to_ptr)
386 *
387 * Sets the RExC_parse pointer to the pointer specified by the 'to_ptr'
388 * argument. No validation whatsoever is performed on the 'to_ptr' pointer.
389 */
390#define RExC_parse_set(to_ptr) STMT_START { \
391 RExC_parse = (to_ptr); \
392} STMT_END
393
394/**********************************************************************/
395
396/* Heuristic check on the complexity of the pattern: if TOO_NAUGHTY, we set
397 * a flag to disable back-off on the fixed/floating substrings - if it's
398 * a high complexity pattern we assume the benefit of avoiding a full match
399 * is worth the cost of checking for the substrings even if they rarely help.
400 */
/* Accessor for the pattern's running "naughtiness" (complexity) score held
 * in the parse state.  Requires 'pRExC_state' to be in scope. */
#define RExC_naughty    (pRExC_state->naughty)

/* Once RExC_naughty reaches this value the MARK_NAUGHTY* macros stop
 * increasing it. */
#define TOO_NAUGHTY (10)

/* MARK_NAUGHTY(add)
 *
 * Add 'add' to RExC_naughty, unless the score has already reached
 * TOO_NAUGHTY.
 *
 * Wrapped in STMT_START/STMT_END so the expansion is a single statement:
 * a bare 'if' here would silently capture the 'else' of an enclosing
 * unbraced 'if' at the call site. */
#define MARK_NAUGHTY(add)                                                   \
    STMT_START {                                                            \
        if (RExC_naughty < TOO_NAUGHTY)                                     \
            RExC_naughty += (add);                                          \
    } STMT_END

/* MARK_NAUGHTY_EXP(exp, add)
 *
 * Like MARK_NAUGHTY(), but the increment also grows with the current
 * score: RExC_naughty / (exp) + (add).  'exp' must be non-zero. */
#define MARK_NAUGHTY_EXP(exp, add)                                          \
    STMT_START {                                                            \
        if (RExC_naughty < TOO_NAUGHTY)                                     \
            RExC_naughty += RExC_naughty / (exp) + (add);                   \
    } STMT_END
409
/* True if character 'c' is one of the single-character quantifiers
 * '*', '+' or '?'. */
#define isNON_BRACE_QUANTIFIER(c) ((c) == '*' || (c) == '+' || (c) == '?')

/* True if the text starting at 's' begins a quantifier: either one of the
 * single-character quantifiers, or a '{' for which regcurly(s, e, NULL)
 * returns true.  'e' is passed through to regcurly() unchanged.
 *
 * 's' is parenthesized before being dereferenced so that an expression
 * argument such as 'p + 1' examines *(p + 1) rather than (*p) + 1.
 * Note that 's' is evaluated more than once. */
#define isQUANTIFIER(s,e)  (   isNON_BRACE_QUANTIFIER(*(s))                 \
                            || (*(s) == '{' && regcurly((s), (e), NULL)))
413
414/*
415 * Flags to be passed up.
416 */
417#define HASWIDTH 0x01 /* Known to not match null strings, could match
418 non-null ones. */
419#define SIMPLE 0x02 /* Exactly one character wide */
420 /* (or LNBREAK as a special case) */
421#define POSTPONED 0x08 /* (?1),(?&name), (??{...}) or similar */
422#define TRYAGAIN 0x10 /* Weeded out a declaration. */
423#define RESTART_PARSE 0x20 /* Need to redo the parse */
424#define NEED_UTF8 0x40 /* In conjunction with RESTART_PARSE, need to
425 calculate sizes as UTF-8 */
426
427#define REG_NODE_NUM(x) ((x) ? (int)((x)-RExC_emit_start) : -1)
428
429/* whether trie related optimizations are enabled */
430#if PERL_ENABLE_EXTENDED_TRIE_OPTIMISATION
431#define TRIE_STUDY_OPT
432#define FULL_TRIE_STUDY
433#define TRIE_STCLASS
434#endif
435
436/* About the term "restudy" and the var "restudied" and the defines
437 * "SCF_TRIE_RESTUDY" and "SCF_TRIE_DOING_RESTUDY": All of these relate to
438 * doing multiple study_chunk() calls over the same set of opcodes for the
439 * purpose of enhanced TRIE optimizations.
440 *
441 * Specifically, when TRIE_STUDY_OPT is defined, and it is defined in normal
442 * builds, (see above), during compilation SCF_TRIE_RESTUDY may be enabled
443 * which then causes the Perl_re_op_compile() to then call the optimizer
444 * S_study_chunk() a second time to perform additional optimizations,
445 * including the aho_corasick startclass optimization.
446 * This additional pass will only happen once, which is managed by the
447 * 'restudied' variable in Perl_re_op_compile().
448 *
449 * When this second pass is under way the flags passed into study_chunk() will
450 * include SCF_TRIE_DOING_RESTUDY and this flag is and must be cascaded down
451 * to any recursive calls to S_study_chunk().
452 *
453 * IMPORTANT: Any logic in study_chunk() that emits warnings should check that
454 * the SCF_TRIE_DOING_RESTUDY flag is NOT set in 'flags', or the warning may
455 * be produced twice.
456 *
457 * See commit 07be1b83a6b2d24b492356181ddf70e1c7917ae3 and
458 * 688e03912e3bff2d2419c457d8b0e1bab3eb7112 for more details.
459 */
460
461
462#define PBYTE(u8str,paren) ((U8*)(u8str))[(paren) >> 3]
463#define PBITVAL(paren) (1 << ((paren) & 7))
464#define PAREN_OFFSET(depth) \
465 (RExC_study_chunk_recursed + (depth) * RExC_study_chunk_recursed_bytes)
466#define PAREN_TEST(depth, paren) \
467 (PBYTE(PAREN_OFFSET(depth), paren) & PBITVAL(paren))
468#define PAREN_SET(depth, paren) \
469 (PBYTE(PAREN_OFFSET(depth), paren) |= PBITVAL(paren))
470#define PAREN_UNSET(depth, paren) \
471 (PBYTE(PAREN_OFFSET(depth), paren) &= ~PBITVAL(paren))
472
473#define REQUIRE_UTF8(flagp) STMT_START { \
474 if (!UTF) { \
475 *flagp = RESTART_PARSE|NEED_UTF8; \
476 return 0; \
477 } \
478 } STMT_END
479
480/* /u is to be chosen if we are supposed to use Unicode rules, or if the
481 * pattern is in UTF-8. This latter condition is in case the outermost rules
482 * are locale. See GH #17278 */
483#define toUSE_UNI_CHARSET_NOT_DEPENDS (RExC_uni_semantics || UTF)
484
485/* Change from /d into /u rules, and restart the parse. RExC_uni_semantics is
486 * a flag that indicates we need to override /d with /u as a result of
487 * something in the pattern. It should only be used in regards to calling
488 * set_regex_charset() or get_regex_charset() */
489#define REQUIRE_UNI_RULES(flagp, restart_retval) \
490 STMT_START { \
491 if (DEPENDS_SEMANTICS) { \
492 set_regex_charset(&RExC_flags, REGEX_UNICODE_CHARSET); \
493 RExC_uni_semantics = 1; \
494 if (RExC_seen_d_op && LIKELY(! IN_PARENS_PASS)) { \
495 /* No need to restart the parse if we haven't seen \
496 * anything that differs between /u and /d, and no need \
497 * to restart immediately if we're going to reparse \
498 * anyway to count parens */ \
499 *flagp |= RESTART_PARSE; \
500 return restart_retval; \
501 } \
502 } \
503 } STMT_END
504
505#define REQUIRE_BRANCHJ(flagp, restart_retval) \
506 STMT_START { \
507 RExC_use_BRANCHJ = 1; \
508 *flagp |= RESTART_PARSE; \
509 return restart_retval; \
510 } STMT_END
511
512/* Until we have completed the parse, we leave RExC_total_parens at 0 or
513 * less. After that, it must always be positive, because the whole re is
514 * considered to be surrounded by virtual parens. Setting it to negative
515 * indicates there is some construct that needs to know the actual number of
516 * parens to be properly handled. And that means an extra pass will be
517 * required after we've counted them all */
518#define ALL_PARENS_COUNTED (RExC_total_parens > 0)
519#define REQUIRE_PARENS_PASS \
520 STMT_START { /* No-op if have completed a pass */ \
521 if (! ALL_PARENS_COUNTED) RExC_total_parens = -1; \
522 } STMT_END
523#define IN_PARENS_PASS (RExC_total_parens < 0)
524
525
526/* This is used to return failure (zero) early from the calling function if
527 * various flags in 'flags' are set. Two flags always cause a return:
528 * 'RESTART_PARSE' and 'NEED_UTF8'. 'extra' can be used to specify any
529 * additional flags that should cause a return; 0 if none. If the return will
530 * be done, '*flagp' is first set to be all of the flags that caused the
531 * return. */
532#define RETURN_FAIL_ON_RESTART_OR_FLAGS(flags,flagp,extra) \
533 STMT_START { \
534 if ((flags) & (RESTART_PARSE|NEED_UTF8|(extra))) { \
535 *(flagp) = (flags) & (RESTART_PARSE|NEED_UTF8|(extra)); \
536 return 0; \
537 } \
538 } STMT_END
539
/* Non-zero if 'flags' has the RESTART_PARSE bit set. */
#define MUST_RESTART(flags) ((flags) & (RESTART_PARSE))

/* Shorthand for RETURN_FAIL_ON_RESTART_OR_FLAGS() with no extra flags. */
#define RETURN_FAIL_ON_RESTART(flags,flagp)                                 \
    RETURN_FAIL_ON_RESTART_OR_FLAGS( flags, flagp, 0)

/* Return 0 from the calling function if '*flagp' has RESTART_PARSE set;
 * '*flagp' itself is left untouched.
 *
 * Wrapped in STMT_START/STMT_END so the expansion is a single statement:
 * a bare 'if' here would silently capture the 'else' of an enclosing
 * unbraced 'if' at the call site. */
#define RETURN_FAIL_ON_RESTART_FLAGP(flagp)                                 \
    STMT_START {                                                            \
        if (MUST_RESTART(*(flagp))) return 0;                               \
    } STMT_END
546
547/* This converts the named class defined in regcomp.h to its equivalent class
548 * number defined in handy.h. */
549#define namedclass_to_classnum(class) ((int) ((class) / 2))
550#define classnum_to_namedclass(classnum) ((classnum) * 2)
551
552#define _invlist_union_complement_2nd(a, b, output) \
553 _invlist_union_maybe_complement_2nd(a, b, TRUE, output)
554#define _invlist_intersection_complement_2nd(a, b, output) \
555 _invlist_intersection_maybe_complement_2nd(a, b, TRUE, output)
556
557/* We add a marker if we are deferring expansion of a property that is both
558 * 1) potentially user-defined; and
559 * 2) could also be an official Unicode property.
560 *
561 * Without this marker, any deferred expansion can only be for a user-defined
562 * one. This marker shouldn't conflict with any that could be in a legal name,
563 * and is appended to its name to indicate this. There is a string and
564 * character form */
565#define DEFERRED_COULD_BE_OFFICIAL_MARKERs "~"
566#define DEFERRED_COULD_BE_OFFICIAL_MARKERc '~'
567
568/* What is infinity for optimization purposes */
569#define OPTIMIZE_INFTY SSize_t_MAX
570
571/* About scan_data_t.
572
573 During optimisation we recurse through the regexp program performing
574 various inplace (keyhole style) optimisations. In addition study_chunk
575 and scan_commit populate this data structure with information about
576 what strings MUST appear in the pattern. We look for the longest
577 string that must appear at a fixed location, and we look for the
578 longest string that may appear at a floating location. So for instance
579 in the pattern:
580
581 /FOO[xX]A.*B[xX]BAR/
582
583 Both 'FOO' and 'A' are fixed strings. Both 'B' and 'BAR' are floating
584 strings (because they follow a .* construct). study_chunk will identify
585 both FOO and BAR as being the longest fixed and floating strings respectively.
586
587 The strings can be composites, for instance
588
589 /(f)(o)(o)/
590
591 will result in a composite fixed substring 'foo'.
592
593 For each string some basic information is maintained:
594
595 - min_offset
596 This is the position the string must appear at, or not before.
597 It also implicitly (when combined with minlenp) tells us how many
598 characters must match before the string we are searching for.
599 Likewise when combined with minlenp and the length of the string it
600 tells us how many characters must appear after the string we have
601 found.
602
603 - max_offset
604 Only used for floating strings. This is the rightmost point that
605 the string can appear at. If set to OPTIMIZE_INFTY it indicates that the
606 string can occur infinitely far to the right.
607 For fixed strings, it is equal to min_offset.
608
609 - minlenp
610 A pointer to the minimum number of characters of the pattern that the
611 string was found inside. This is important as in the case of positive
612 lookahead or positive lookbehind we can have multiple patterns
613 involved. Consider
614
615 /(?=FOO).*F/
616
617 The minimum length of the pattern overall is 3, the minimum length
618 of the lookahead part is 3, but the minimum length of the part that
619 will actually match is 1. So 'FOO's minimum length is 3, but the
620 minimum length for the F is 1. This is important as the minimum length
621 is used to determine offsets in front of and behind the string being
622 looked for. Since strings can be composites this is the length of the
623 pattern at the time it was committed with a scan_commit. Note that
624 the length is calculated by study_chunk, so that the minimum lengths
625 are not known until the full pattern has been compiled, thus the
626 pointer to the value.
627
628 - lookbehind
629
630 In the case of lookbehind the string being searched for can be
631 offset past the start point of the final matching string.
632 If this value was just blithely removed from the min_offset it would
633 invalidate some of the calculations for how many chars must match
634 before or after (as they are derived from min_offset and minlen and
635 the length of the string being searched for).
636 When the final pattern is compiled and the data is moved from the
637 scan_data_t structure into the regexp structure the information
638 about lookbehind is factored in, with the information that would
639 have been lost precalculated in the end_shift field for the
640 associated string.
641
642 The fields pos_min and pos_delta are used to store the minimum offset
643 and the delta to the maximum offset at the current point in the pattern.
644
645*/
646
647struct scan_data_substrs {
648 SV *str; /* longest substring found in pattern */
649 SSize_t min_offset; /* earliest point in string it can appear */
650 SSize_t max_offset; /* latest point in string it can appear */
651 SSize_t *minlenp; /* pointer to the minlen relevant to the string */
652 SSize_t lookbehind; /* is the pos of the string modified by LB */
653 I32 flags; /* per substring SF_* and SCF_* flags */
654};
655
571fb71d
YO
656/* this is typedef'ed in perl.h */
657struct scan_data_t {
85900e28
YO
658 /*I32 len_min; unused */
659 /*I32 len_delta; unused */
660 SSize_t pos_min;
661 SSize_t pos_delta;
662 SV *last_found;
663 SSize_t last_end; /* min value, <0 unless valid. */
664 SSize_t last_start_min;
665 SSize_t last_start_max;
666 U8 cur_is_floating; /* whether the last_* values should be set as
667 * the next fixed (0) or floating (1)
668 * substring */
669
670 /* [0] is longest fixed substring so far, [1] is longest float so far */
671 struct scan_data_substrs substrs[2];
672
673 I32 flags; /* common SF_* and SCF_* flags */
674 I32 whilem_c;
675 SSize_t *last_closep;
676 regnode **last_close_opp; /* pointer to pointer to last CLOSE regop
677 seen. DO NOT DEREFERENCE the regnode
678 pointer - the op may have been optimized
679 away */
680 regnode_ssc *start_class;
571fb71d 681};
85900e28
YO
682
683/*
684 * Forward declarations for pregcomp()'s friends.
685 */
686
687static const scan_data_t zero_scan_data = {
688 0, 0, NULL, 0, 0, 0, 0,
689 {
690 { NULL, 0, 0, 0, 0, 0 },
691 { NULL, 0, 0, 0, 0, 0 },
692 },
693 0, 0, NULL, NULL, NULL
694};
695
696/* study flags */
697
698#define SF_BEFORE_SEOL 0x0001
699#define SF_BEFORE_MEOL 0x0002
700#define SF_BEFORE_EOL (SF_BEFORE_SEOL|SF_BEFORE_MEOL)
701
702#define SF_IS_INF 0x0040
703#define SF_HAS_PAR 0x0080
704#define SF_IN_PAR 0x0100
705#define SF_HAS_EVAL 0x0200
706
707
708/* SCF_DO_SUBSTR is the flag that tells the regexp analyzer to track the
709 * longest substring in the pattern. When it is not set the optimiser keeps
710 * track of position, but does not keep track of the actual strings seen,
711 *
712 * So for instance /foo/ will be parsed with SCF_DO_SUBSTR being true, but
713 * /foo/i will not.
714 *
715 * Similarly, /foo.*(blah|erm|huh).*fnorble/ will have "foo" and "fnorble"
716 * parsed with SCF_DO_SUBSTR on, but while processing the (...) it will be
717 * turned off because of the alternation (BRANCH). */
718#define SCF_DO_SUBSTR 0x0400
719
720#define SCF_DO_STCLASS_AND 0x0800
721#define SCF_DO_STCLASS_OR 0x1000
722#define SCF_DO_STCLASS (SCF_DO_STCLASS_AND|SCF_DO_STCLASS_OR)
723#define SCF_WHILEM_VISITED_POS 0x2000
724
725#define SCF_TRIE_RESTUDY 0x4000 /* Need to do restudy in study_chunk()?
726 Search for "restudy" in this file
727 to find a detailed explanation.*/
728#define SCF_SEEN_ACCEPT 0x8000
729#define SCF_TRIE_DOING_RESTUDY 0x10000 /* Are we in restudy right now?
730 Search for "restudy" in this file
731 to find a detailed explanation. */
732#define SCF_IN_DEFINE 0x20000
733
734
735
/* Boolean view of RExC_utf8. */
#define UTF cBOOL(RExC_utf8)

/* Predicates on the character set selected in RExC_flags, as reported by
 * get_regex_charset().  The enums for all these are ordered so things work
 * out correctly for the AT_LEAST_* (ordered) comparisons. */
#define LOC \
    (REGEX_LOCALE_CHARSET == get_regex_charset(RExC_flags))
#define DEPENDS_SEMANTICS \
    (REGEX_DEPENDS_CHARSET == get_regex_charset(RExC_flags))
#define UNI_SEMANTICS \
    (REGEX_UNICODE_CHARSET == get_regex_charset(RExC_flags))
#define AT_LEAST_UNI_SEMANTICS \
    (REGEX_UNICODE_CHARSET <= get_regex_charset(RExC_flags))
#define ASCII_RESTRICTED \
    (REGEX_ASCII_RESTRICTED_CHARSET == get_regex_charset(RExC_flags))
#define AT_LEAST_ASCII_RESTRICTED \
    (REGEX_ASCII_RESTRICTED_CHARSET <= get_regex_charset(RExC_flags))
#define ASCII_FOLD_RESTRICTED \
    (REGEX_ASCII_MORE_RESTRICTED_CHARSET == get_regex_charset(RExC_flags))

/* True when the RXf_PMf_FOLD bit is set in RExC_flags. */
#define FOLD cBOOL(RExC_flags & RXf_PMf_FOLD)
753
/* For programs that want to be strictly Unicode compatible by dying if any
 * attempt is made to match a non-Unicode code point against a Unicode
 * property. */
#define ALWAYS_WARN_SUPER  ckDEAD(packWARN(WARN_NON_UNICODE))

/* Parenthesized so that the negative constant remains a single operand no
 * matter what expression the macro is expanded into. */
#define OOB_NAMEDCLASS  (-1)

/* There is no code point that is out-of-bounds, so this is problematic.  But
 * its only current use is to initialize a variable that is always set before
 * looked at. */
#define OOB_UNICODE  0xDEADBEEF

/* Yields sv_len_utf8(sv) when UTF is true, otherwise SvCUR(sv). */
#define CHR_SVLEN(sv)  (UTF ? sv_len_utf8(sv) : SvCUR(sv))


/* length of regex to show in messages that don't mark a position within */
#define RegexLengthToShowInErrorMessages  127
771
/*
 * If MARKER[12] are adjusted, be sure to adjust the constants at the top
 * of t/op/regmesg.t, the tests in t/op/re_tests, and those in
 * op/pragma/warn/regcomp.
 */
/* marker as it appears in the description */
#define MARKER1 "<-- HERE"
/* marker as it appears within the regex */
#define MARKER2 " <-- HERE "

/* Format tail appended to messages: it has two UTF8f conversions, one on
 * each side of MARKER2. */
#define REPORT_LOCATION \
    " in regex; marked by " MARKER1 " in m/%" UTF8f MARKER2 "%" UTF8f "/"
782
783/* The code in this file in places uses one level of recursion with parsing
784 * rebased to an alternate string constructed by us in memory. This can take
785 * the form of something that is completely different from the input, or
786 * something that uses the input as part of the alternate. In the first case,
787 * there should be no possibility of an error, as we are in complete control of
788 * the alternate string. But in the second case we don't completely control
789 * the input portion, so there may be errors in that. Here's an example:
790 * /[abc\x{DF}def]/ui
791 * is handled specially because \x{df} folds to a sequence of more than one
792 * character: 'ss'. What is done is to create and parse an alternate string,
793 * which looks like this:
794 * /(?:\x{DF}|[abc\x{DF}def])/ui
795 * where it uses the input unchanged in the middle of something it constructs,
796 * which is a branch for the DF outside the character class, and clustering
797 * parens around the whole thing. (It knows enough to skip the DF inside the
798 * class while in this substitute parse.) 'abc' and 'def' may have errors that
799 * need to be reported. The general situation looks like this:
800 *
 *                               |<------- identical ------>|
 *              sI               tI            xI        eI
 * Input:       ---------------------------------------------------------------
 * Constructed:         ---------------------------------------------------
 *                      sC       tC            xC        eC          EC
 *                               |<------- identical ------>|
807 *
808 * sI..eI is the portion of the input pattern we are concerned with here.
809 * sC..EC is the constructed substitute parse string.
810 * sC..tC is constructed by us
811 * tC..eC is an exact duplicate of the portion of the input pattern tI..eI.
812 * In the diagram, these are vertically aligned.
813 * eC..EC is also constructed by us.
814 * xC is the position in the substitute parse string where we found a
815 * problem.
816 * xI is the position in the original pattern corresponding to xC.
817 *
818 * We want to display a message showing the real input string. Thus we need to
819 * translate from xC to xI. We know that xC >= tC, since the portion of the
820 * string sC..tC has been constructed by us, and so shouldn't have errors. We
821 * get:
822 * xI = tI + (xC - tC)
823 *
824 * When the substitute parse is constructed, the code needs to set:
825 * RExC_start (sC)
826 * RExC_end (eC)
827 * RExC_copy_start_in_input (tI)
828 * RExC_copy_start_in_constructed (tC)
829 * and restore them when done.
830 *
831 * During normal processing of the input pattern, both
832 * 'RExC_copy_start_in_input' and 'RExC_copy_start_in_constructed' are set to
833 * sI, so that xC equals xI.
834 */
835
/* Short names for the positions used when translating a location in the
 * string being parsed back to the original input pattern. */
#define sI              RExC_precomp
#define eI              RExC_precomp_end
#define sC              RExC_start
#define eC              RExC_end
#define tI              RExC_copy_start_in_input
#define tC              RExC_copy_start_in_constructed

/* Translate position 'xC' to the corresponding position in the input
 * pattern: xI = tI + (xC - tC).  The argument is parenthesized so that any
 * expression (e.g. a conditional expression) may be passed safely. */
#define xI(xC)          (tI + ((xC) - tC))

/* Offset of that translated position from the start of the input pattern */
#define xI_offset(xC)   (xI(xC) - sI)
844
/* Expands to two UTF8fARG() groups, matching the two UTF8f conversions in
 * REPORT_LOCATION: first the input pattern up to the position given by
 * 'xC', then the rest of the input pattern after it.  'xC' is mapped to the
 * input pattern with xI().  A mapped position beyond eI is treated as eI
 * (don't run off the end); a mapped position before sI croaks with a panic
 * message.  Note that 'xC' is evaluated several times. */
#define REPORT_LOCATION_ARGS(xC)                                            \
    UTF8fARG(UTF,                                                           \
             (xI(xC) <= eI)                                                 \
             ? ((xI_offset(xC) < 0)                                         \
                ? (Perl_croak(aTHX_ "panic: %s: %d: negative offset: %"     \
                                    IVdf " trying to output message for "   \
                                    " pattern %.*s",                        \
                                    __FILE__, __LINE__, (IV) xI_offset(xC), \
                                    ((int) (eC - sC)), sC), 0)              \
                : xI_offset(xC)) /* Length before the <--HERE */            \
             : eI - sI,          /* Whole pattern: position is past end */  \
             sI),         /* The input pattern printed up to the <--HERE */ \
    UTF8fARG(UTF,                                                           \
             (xI(xC) <= eI) ? eI - xI(xC) : 0, /* Length after <--HERE */   \
             (xI(xC) <= eI) ? xI(xC) : eI)     /* pattern after <--HERE */
860
/* Used to point after bad bytes for an error message, but avoid skipping
 * past a nul byte: yields 0 when *s is NUL; otherwise UTF8_SAFE_SKIP(s, e)
 * under UTF, or 1 when not UTF. */
#define SKIP_IF_CHAR(s, e) \
    (*(s) ? (UTF ? UTF8_SAFE_SKIP(s, e) : 1) : 0)
864
/* Set up to clean up after our imminent demise: each of the RExC_ fields
 * below that is set is handed to SAVEFREESV() (the regexp SV) or
 * SAVEFREEPV() (the paren bookkeeping arrays). */
#define PREPARE_TO_DIE                                                      \
    STMT_START {                                                            \
        if (RExC_rx_sv) {                                                   \
            SAVEFREESV(RExC_rx_sv);                                         \
        }                                                                   \
        if (RExC_open_parens) {                                             \
            SAVEFREEPV(RExC_open_parens);                                   \
        }                                                                   \
        if (RExC_close_parens) {                                            \
            SAVEFREEPV(RExC_close_parens);                                  \
        }                                                                   \
        if (RExC_logical_to_parno) {                                        \
            SAVEFREEPV(RExC_logical_to_parno);                              \
        }                                                                   \
        if (RExC_parno_to_logical) {                                        \
            SAVEFREEPV(RExC_parno_to_logical);                              \
        }                                                                   \
    } STMT_END
879
/*
 * Common scaffolding for the FAIL* macros.  Declares 'len' (length of the
 * pattern to show) and 'ellipses' (suffix to print after it), runs
 * PREPARE_TO_DIE, and then executes 'code', which may refer to both
 * variables.  If the pattern is longer than
 * RegexLengthToShowInErrorMessages it is chopped and 'ellipses' becomes
 * "...".
 */
#define _FAIL(code) STMT_START {                                            \
    IV len = RExC_precomp_end - RExC_precomp;                               \
    const char *ellipses = "";                                              \
                                                                            \
    PREPARE_TO_DIE;                                                         \
    if (RegexLengthToShowInErrorMessages < len) {                           \
        /* chop 10 shorter than the max, to ensure meaning of "..." */      \
        ellipses = "...";                                                   \
        len = RegexLengthToShowInErrorMessages - 10;                        \
    }                                                                       \
    code;                                                                   \
} STMT_END
897
/* Croak with 'msg' followed by the (possibly chopped) pattern.  Here 'msg'
 * is passed as a "%s" argument, so it need not be a string literal. */
#define FAIL(msg)                                                           \
    _FAIL(Perl_croak(aTHX_ "%s in regex m/%" UTF8f "%s/",                   \
                     msg, UTF8fARG(UTF, len, RExC_precomp), ellipses))

/* As FAIL, but 'msg' is pasted into the format (so it must be a string
 * literal) and consumes one extra argument. */
#define FAIL2(msg,arg)                                                      \
    _FAIL(Perl_croak(aTHX_ msg " in regex m/%" UTF8f "%s/",                 \
                     arg, UTF8fARG(UTF, len, RExC_precomp), ellipses))

/* As FAIL2, with two extra arguments. */
#define FAIL3(msg,arg1,arg2)                                                \
    _FAIL(Perl_croak(aTHX_ msg " in regex m/%" UTF8f "%s/",                 \
                     arg1, arg2,                                            \
                     UTF8fARG(UTF, len, RExC_precomp), ellipses))
909
/*
 * Simple_vFAIL -- croak with message 'm' (passed as a "%s" argument) and
 * mark the current location in the scan (RExC_parse) within the pattern.
 */
#define Simple_vFAIL(m)                                                     \
    STMT_START {                                                            \
        Perl_croak(aTHX_ "%s" REPORT_LOCATION,                              \
                   m, REPORT_LOCATION_ARGS(RExC_parse));                    \
    } STMT_END

/*
 * vFAIL -- runs PREPARE_TO_DIE, then Simple_vFAIL().
 */
#define vFAIL(m)                                                            \
    STMT_START {                                                            \
        PREPARE_TO_DIE;                                                     \
        Simple_vFAIL(m);                                                    \
    } STMT_END
925
/*
 * Like Simple_vFAIL(), but 'm' is pasted into the format (so it must be a
 * string literal) and takes one argument, 'a1'.  Reports via S_re_croak().
 */
#define Simple_vFAIL2(m,a1)                                                 \
    STMT_START {                                                            \
        S_re_croak(aTHX_ UTF, m REPORT_LOCATION,                            \
                   a1, REPORT_LOCATION_ARGS(RExC_parse));                   \
    } STMT_END

/*
 * vFAIL2 -- runs PREPARE_TO_DIE, then Simple_vFAIL2().
 */
#define vFAIL2(m,a1)                                                        \
    STMT_START {                                                            \
        PREPARE_TO_DIE;                                                     \
        Simple_vFAIL2(m, a1);                                               \
    } STMT_END
941
942
/*
 * Like Simple_vFAIL2(), but the format takes two arguments, 'a1' and 'a2'.
 */
#define Simple_vFAIL3(m, a1, a2)                                            \
    STMT_START {                                                            \
        S_re_croak(aTHX_ UTF, m REPORT_LOCATION,                            \
                   a1, a2, REPORT_LOCATION_ARGS(RExC_parse));               \
    } STMT_END

/*
 * vFAIL3 -- runs PREPARE_TO_DIE, then Simple_vFAIL3().
 */
#define vFAIL3(m,a1,a2)                                                     \
    STMT_START {                                                            \
        PREPARE_TO_DIE;                                                     \
        Simple_vFAIL3(m, a1, a2);                                           \
    } STMT_END
958
/*
 * Like Simple_vFAIL2(), but the format takes three arguments, 'a1', 'a2'
 * and 'a3'.
 */
#define Simple_vFAIL4(m, a1, a2, a3)                                        \
    STMT_START {                                                            \
        S_re_croak(aTHX_ UTF, m REPORT_LOCATION,                            \
                   a1, a2, a3, REPORT_LOCATION_ARGS(RExC_parse));           \
    } STMT_END

/* vFAIL4 -- runs PREPARE_TO_DIE, then Simple_vFAIL4(). */
#define vFAIL4(m,a1,a2,a3)                                                  \
    STMT_START {                                                            \
        PREPARE_TO_DIE;                                                     \
        Simple_vFAIL4(m, a1, a2, a3);                                       \
    } STMT_END
971
/* A specialized version of vFAIL2 that works with UTF8f.  It runs
 * PREPARE_TO_DIE and then reports through S_re_croak() by way of
 * Simple_vFAIL2(). */
#define vFAIL2utf8f(m, a1)                                                  \
    STMT_START {                                                            \
        PREPARE_TO_DIE;                                                     \
        Simple_vFAIL2(m, a1);                                               \
    } STMT_END

/* The two-argument counterpart, built on Simple_vFAIL3(). */
#define vFAIL3utf8f(m, a1, a2)                                              \
    STMT_START {                                                            \
        PREPARE_TO_DIE;                                                     \
        Simple_vFAIL3(m, a1, a2);                                           \
    } STMT_END
984
/* Setting RExC_copy_start_in_constructed to NULL is a signal to not output
 * warnings.  The previous value is stashed in
 * RExC_save_copy_start_in_constructed so RESTORE_WARNINGS can put it
 * back. */
#define TURN_OFF_WARNINGS_IN_SUBSTITUTE_PARSE                               \
    STMT_START {                                                            \
        RExC_save_copy_start_in_constructed                                 \
                                    = RExC_copy_start_in_constructed;       \
        RExC_copy_start_in_constructed = NULL;                              \
    } STMT_END

/* Undo TURN_OFF_WARNINGS_IN_SUBSTITUTE_PARSE (a plain assignment). */
#define RESTORE_WARNINGS                                                    \
    RExC_copy_start_in_constructed = RExC_save_copy_start_in_constructed
993
/* Since a warning can be generated multiple times as the input is reparsed,
 * we output it the first time we come to that point in the parse, but
 * suppress it otherwise.  'RExC_copy_start_in_constructed' being NULL is a
 * flag to not generate any warnings.  True only when warnings are enabled
 * and 'loc' maps to an input offset beyond RExC_latest_warn_offset. */
#define TO_OUTPUT_WARNINGS(loc)                                             \
    (   RExC_copy_start_in_constructed                                      \
     && (Ptrdiff_t) RExC_latest_warn_offset < xI_offset(loc))

/* After we've emitted a warning, we save the position in the input so we
 * don't output it again.  The saved position is clamped to sI..eI. */
#define UPDATE_WARNINGS_LOC(loc)                                            \
    STMT_START {                                                            \
        if (TO_OUTPUT_WARNINGS(loc)) {                                      \
            RExC_latest_warn_offset = MAX(sI, MIN(eI, xI(loc))) - sI;       \
        }                                                                   \
    } STMT_END
1011
/* Shared body of the warning macros.  'warns' is the output of the
 * packWARNx macro used in 'code'.
 *
 * Croaks with a panic if warnings are currently turned off
 * (RExC_copy_start_in_constructed is NULL).  Otherwise, when
 * TO_OUTPUT_WARNINGS(loc) holds: runs PREPARE_TO_DIE first if
 * ckDEAD(warns), then executes 'code', and finally records the location
 * with UPDATE_WARNINGS_LOC(). */
#define _WARN_HELPER(loc, warns, code)                                      \
    STMT_START {                                                            \
        if (! RExC_copy_start_in_constructed) {                             \
            Perl_croak( aTHX_ "panic! %s: %d: Tried to warn when none"      \
                              " expected at '%s'",                          \
                              __FILE__, __LINE__, loc);                     \
        }                                                                   \
        if (TO_OUTPUT_WARNINGS(loc)) {                                      \
            if (ckDEAD(warns)) {                                            \
                PREPARE_TO_DIE;                                             \
            }                                                               \
            code;                                                           \
            UPDATE_WARNINGS_LOC(loc);                                       \
        }                                                                   \
    } STMT_END
1027
/* Warn at 'loc' under 'packed_warn' with message 'm'.  m is not
 * necessarily a "literal string" in this macro: it is passed as a "%s"
 * argument rather than pasted into the format. */
#define warn_non_literal_string(loc, packed_warn, m)                        \
    _WARN_HELPER(loc, packed_warn,                                          \
                 Perl_warner(aTHX_ packed_warn, "%s" REPORT_LOCATION,       \
                             m, REPORT_LOCATION_ARGS(loc)))

/* The same, with the category fixed to packWARN(WARN_REGEXP). */
#define reg_warn_non_literal_string(loc, m)                                 \
    warn_non_literal_string(loc, packWARN(WARN_REGEXP), m)
1036
/* Like ckWARN2reg(), but 'm' need not be a string literal, and the warning
 * category 'packwarn' is supplied by the caller.  Because 'm' cannot be
 * pasted onto REPORT_LOCATION at compile time, the format is assembled at
 * run time in a buffer that is registered with SAVEFREEPV().  Note that
 * 'm' is evaluated twice. */
#define ckWARN2_non_literal_string(loc, packwarn, m, a1)                    \
    STMT_START {                                                            \
        Size_t warn_fmt_size_ = strlen(m) + strlen(REPORT_LOCATION)+ 1;     \
        char * warn_fmt_;                                                   \
        Newx(warn_fmt_, warn_fmt_size_, char);                              \
        my_strlcpy(warn_fmt_, m, warn_fmt_size_);                           \
        my_strlcat(warn_fmt_, REPORT_LOCATION, warn_fmt_size_);             \
        SAVEFREEPV(warn_fmt_);                                              \
        _WARN_HELPER(loc, packwarn,                                         \
                     Perl_ck_warner(aTHX_ packwarn,                         \
                                    warn_fmt_,                              \
                                    a1, REPORT_LOCATION_ARGS(loc)));        \
    } STMT_END
1050
/* Emit literal message 'm' at 'loc' through Perl_ck_warner() in the
 * WARN_REGEXP category. */
#define ckWARNreg(loc,m)                                                    \
    _WARN_HELPER(loc, packWARN(WARN_REGEXP),                                \
                 Perl_ck_warner(aTHX_ packWARN(WARN_REGEXP),                \
                                m REPORT_LOCATION,                          \
                                REPORT_LOCATION_ARGS(loc)))
1056
/* Emit literal message 'm' at 'loc' through Perl_warner() in the
 * WARN_REGEXP category.  (The definition used to end with a stray line
 * continuation, which made it silently absorb whatever line followed.) */
#define vWARN(loc, m)                                                       \
    _WARN_HELPER(loc, packWARN(WARN_REGEXP),                                \
                 Perl_warner(aTHX_ packWARN(WARN_REGEXP),                   \
                             m REPORT_LOCATION,                             \
                             REPORT_LOCATION_ARGS(loc)))
1062
b27367cd
YO
/* Emit literal message 'm' at 'loc' through Perl_warner() in warning
 * category 'category'. */
#define vWARN_dep(loc,category,m)                                           \
    _WARN_HELPER(loc, packWARN(category),                                   \
                 Perl_warner(aTHX_ packWARN(category),                      \
                             m REPORT_LOCATION,                             \
                             REPORT_LOCATION_ARGS(loc)))

/* As vWARN_dep, but goes through Perl_ck_warner_d(). */
#define ckWARNdep(loc,category,m)                                           \
    _WARN_HELPER(loc, packWARN(category),                                   \
                 Perl_ck_warner_d(aTHX_ packWARN(category),                 \
                                  m REPORT_LOCATION,                        \
                                  REPORT_LOCATION_ARGS(loc)))

/* As ckWARNdep, but packs both 'category' and WARN_REGEXP. */
#define ckWARNregdep(loc,category,m)                                        \
    _WARN_HELPER(loc, packWARN2(category, WARN_REGEXP),                     \
                 Perl_ck_warner_d(aTHX_ packWARN2(category, WARN_REGEXP),   \
                                  m REPORT_LOCATION,                        \
                                  REPORT_LOCATION_ARGS(loc)))
1081
/* One-argument WARN_REGEXP warning at 'loc' through Perl_ck_warner_d();
 * 'm' is a literal format consuming 'a1'. */
#define ckWARN2reg_d(loc,m, a1)                                             \
    _WARN_HELPER(loc, packWARN(WARN_REGEXP),                                \
                 Perl_ck_warner_d(aTHX_ packWARN(WARN_REGEXP),              \
                                  m REPORT_LOCATION,                        \
                                  a1, REPORT_LOCATION_ARGS(loc)))

/* The same, through Perl_ck_warner(). */
#define ckWARN2reg(loc, m, a1)                                              \
    _WARN_HELPER(loc, packWARN(WARN_REGEXP),                                \
                 Perl_ck_warner(aTHX_ packWARN(WARN_REGEXP),                \
                                m REPORT_LOCATION,                          \
                                a1, REPORT_LOCATION_ARGS(loc)))
1093
/* WARN_REGEXP warnings at 'loc' whose literal format 'm' consumes several
 * arguments.  The vWARNn forms go through Perl_warner(); the ckWARNnreg
 * forms go through Perl_ck_warner(). */

/* two format arguments */
#define vWARN3(loc, m, a1, a2)                                              \
    _WARN_HELPER(loc, packWARN(WARN_REGEXP),                                \
                 Perl_warner(aTHX_ packWARN(WARN_REGEXP),                   \
                             m REPORT_LOCATION,                             \
                             a1, a2, REPORT_LOCATION_ARGS(loc)))

#define ckWARN3reg(loc, m, a1, a2)                                          \
    _WARN_HELPER(loc, packWARN(WARN_REGEXP),                                \
                 Perl_ck_warner(aTHX_ packWARN(WARN_REGEXP),                \
                                m REPORT_LOCATION,                          \
                                a1, a2, REPORT_LOCATION_ARGS(loc)))

/* three format arguments */
#define vWARN4(loc, m, a1, a2, a3)                                          \
    _WARN_HELPER(loc, packWARN(WARN_REGEXP),                                \
                 Perl_warner(aTHX_ packWARN(WARN_REGEXP),                   \
                             m REPORT_LOCATION,                             \
                             a1, a2, a3, REPORT_LOCATION_ARGS(loc)))

#define ckWARN4reg(loc, m, a1, a2, a3)                                      \
    _WARN_HELPER(loc, packWARN(WARN_REGEXP),                                \
                 Perl_ck_warner(aTHX_ packWARN(WARN_REGEXP),                \
                                m REPORT_LOCATION,                          \
                                a1, a2, a3, REPORT_LOCATION_ARGS(loc)))

/* four format arguments */
#define vWARN5(loc, m, a1, a2, a3, a4)                                      \
    _WARN_HELPER(loc, packWARN(WARN_REGEXP),                                \
                 Perl_warner(aTHX_ packWARN(WARN_REGEXP),                   \
                             m REPORT_LOCATION,                             \
                             a1, a2, a3, a4, REPORT_LOCATION_ARGS(loc)))
1127
/* Warn at 'loc' in category 'warn_class' through Perl_ck_warner_d(), but
 * only the first time per compilation: the RExC_warned_<warn_class> flag is
 * set before warning and suppresses later calls. */
#define ckWARNexperimental(loc, warn_class, m)                              \
    STMT_START {                                                            \
        if (! RExC_warned_ ## warn_class) { /* warn once per compilation */ \
            RExC_warned_ ## warn_class = 1;                                 \
            _WARN_HELPER(loc, packWARN(warn_class),                         \
                         Perl_ck_warner_d(aTHX_ packWARN(warn_class),       \
                                          m REPORT_LOCATION,                \
                                          REPORT_LOCATION_ARGS(loc)));      \
        }                                                                   \
    } STMT_END

/* As ckWARNexperimental, for a format 'm' that consumes one argument. */
#define ckWARNexperimental_with_arg(loc, warn_class, m, arg)                \
    STMT_START {                                                            \
        if (! RExC_warned_ ## warn_class) { /* warn once per compilation */ \
            RExC_warned_ ## warn_class = 1;                                 \
            _WARN_HELPER(loc, packWARN(warn_class),                         \
                         Perl_ck_warner_d(aTHX_ packWARN(warn_class),       \
                                          m REPORT_LOCATION,                \
                                          arg, REPORT_LOCATION_ARGS(loc))); \
        }                                                                   \
    } STMT_END
1149
/* Convert between a pointer to a node and its offset from the beginning of
 * the program (RExC_emit_start). */
#define REGNODE_p(offset)       (RExC_emit_start + (offset))
/* Asserts that 'node' does not lie before RExC_emit_start. */
#define REGNODE_OFFSET(node)                                                \
    (__ASSERT_((node) >= RExC_emit_start)                                   \
     (SSize_t) ((node) - RExC_emit_start))
1155
/* Read / write the 'proglen' member of the structure 'ri' points to.  The
 * arguments are parenthesized so that expressions such as '&arr[i]' work,
 * and SetProgLen is a single parenthesized assignment expression. */
#define ProgLen(ri)             ((ri)->proglen)
#define SetProgLen(ri,x)        ((ri)->proglen = (x))
1158
/* EXPERIMENTAL_INPLACESCAN is defined only when
 * PERL_ENABLE_EXPERIMENTAL_REGEX_OPTIMISATIONS is defined to a non-zero
 * value. */
#if defined(PERL_ENABLE_EXPERIMENTAL_REGEX_OPTIMISATIONS) \
    && PERL_ENABLE_EXPERIMENTAL_REGEX_OPTIMISATIONS
#  define EXPERIMENTAL_INPLACESCAN
#endif /*PERL_ENABLE_EXPERIMENTAL_REGEX_OPTIMISATIONS*/
1162
/* Under DEBUG_OPTIMISE_MORE_r, print "RExC_seen: " followed by the name of
 * every REG_*_SEEN bit that is set in RExC_seen, then a newline.
 * NOTE: the expansion already ends in ';'. */
#define DEBUG_RExC_seen()                                                   \
        DEBUG_OPTIMISE_MORE_r({                                             \
            Perl_re_printf( aTHX_ "RExC_seen: ");                           \
                                                                            \
            if (RExC_seen & REG_ZERO_LEN_SEEN) {                            \
                Perl_re_printf( aTHX_ "REG_ZERO_LEN_SEEN ");                \
            }                                                               \
            if (RExC_seen & REG_LOOKBEHIND_SEEN) {                          \
                Perl_re_printf( aTHX_ "REG_LOOKBEHIND_SEEN ");              \
            }                                                               \
            if (RExC_seen & REG_GPOS_SEEN) {                                \
                Perl_re_printf( aTHX_ "REG_GPOS_SEEN ");                    \
            }                                                               \
            if (RExC_seen & REG_RECURSE_SEEN) {                             \
                Perl_re_printf( aTHX_ "REG_RECURSE_SEEN ");                 \
            }                                                               \
            if (RExC_seen & REG_TOP_LEVEL_BRANCHES_SEEN) {                  \
                Perl_re_printf( aTHX_ "REG_TOP_LEVEL_BRANCHES_SEEN ");      \
            }                                                               \
            if (RExC_seen & REG_VERBARG_SEEN) {                             \
                Perl_re_printf( aTHX_ "REG_VERBARG_SEEN ");                 \
            }                                                               \
            if (RExC_seen & REG_CUTGROUP_SEEN) {                            \
                Perl_re_printf( aTHX_ "REG_CUTGROUP_SEEN ");                \
            }                                                               \
            if (RExC_seen & REG_RUN_ON_COMMENT_SEEN) {                      \
                Perl_re_printf( aTHX_ "REG_RUN_ON_COMMENT_SEEN ");          \
            }                                                               \
            if (RExC_seen & REG_UNFOLDED_MULTI_SEEN) {                      \
                Perl_re_printf( aTHX_ "REG_UNFOLDED_MULTI_SEEN ");          \
            }                                                               \
            if (RExC_seen & REG_UNBOUNDED_QUANTIFIER_SEEN) {                \
                Perl_re_printf( aTHX_ "REG_UNBOUNDED_QUANTIFIER_SEEN ");    \
            }                                                               \
            if (RExC_seen & REG_PESSIMIZE_SEEN) {                           \
                Perl_re_printf( aTHX_ "REG_PESSIMIZE_SEEN ");               \
            }                                                               \
                                                                            \
            Perl_re_printf( aTHX_ "\n");                                    \
        });
1202
/* Print the name of 'flag' (followed by a space) when any of its bits is
 * set in 'flags'.  Wrapped in STMT_START/STMT_END so that it behaves as a
 * single statement (no dangling-else surprise inside an if/else), and
 * 'flag' is parenthesized so that an expression such as A|B is tested as a
 * whole.  Use it with a trailing ';'. */
#define DEBUG_SHOW_STUDY_FLAG(flags,flag)                                   \
    STMT_START {                                                            \
        if ((flags) & (flag)) {                                             \
            Perl_re_printf( aTHX_ "%s ", #flag);                            \
        }                                                                   \
    } STMT_END
1205
1206
/* Debugging hooks: without DEBUGGING they expand to NOOP; with it they
 * forward to debug_studydata() / debug_peep().  DEBUG_PEEP additionally
 * passes 'pRExC_state', which must be in scope at the point of use. */
#ifndef DEBUGGING
#  define DEBUG_STUDYDATA(where, data, depth, is_inf, min, stopmin, delta) \
                    NOOP
#  define DEBUG_PEEP(str, scan, depth, flags) NOOP
#else
#  define DEBUG_STUDYDATA(where, data, depth, is_inf, min, stopmin, delta) \
                    debug_studydata(where, data, depth, is_inf, min, stopmin, delta)

#  define DEBUG_PEEP(str, scan, depth, flags) \
                    debug_peep(str, pRExC_state, scan, depth, flags)
#endif
1217
/* Call regtail() with 'depth+1' appended; a variable named 'depth' must be
 * in scope at the point of use. */
#define REGTAIL(x,y,z)          regtail((x),(y),(z),depth+1)

/* Under DEBUGGING this calls regtail_study() instead; otherwise it is the
 * same as REGTAIL. */
#ifdef DEBUGGING
#  define REGTAIL_STUDY(x,y,z)  regtail_study((x),(y),(z),depth+1)
#else
#  define REGTAIL_STUDY(x,y,z)  REGTAIL(x,y,z)
#endif
1224
/* MADE_* values: three distinct single bits. */
#define MADE_TRIE               (1 << 0)        /* 1 */
#define MADE_JUMP_TRIE          (1 << 1)        /* 2 */
#define MADE_EXACT_TRIE         (1 << 2)        /* 4 */

/* Consecutive small index values. */
#define INVLIST_INDEX                   0
#define ONLY_LOCALE_MATCHES_INDEX       (INVLIST_INDEX + 1)
#define DEFERRED_USER_DEFINED_INDEX     (INVLIST_INDEX + 2)
1232
/* ssc_init_zero and ssc_init currently do the exact same thing, so the
 * former is simply an alias of the latter. */
#define ssc_init_zero           ssc_init

/* Add the single code point 'cp', expressed as the range cp..cp. */
#define ssc_add_cp(ssc, cp)     ssc_add_range((ssc), (cp), (cp))
/* Add the whole range 0..UV_MAX. */
#define ssc_match_all_cp(ssc)   ssc_add_range((ssc), 0, UV_MAX)
1238
/* Forward to regnode_guts(); the DEBUGGING build calls regnode_guts_debug()
 * instead, which additionally receives 'op'. */
#ifndef DEBUGGING
#  define REGNODE_GUTS(state,op,extra_size) \
                regnode_guts(state,extra_size)
#else
#  define REGNODE_GUTS(state,op,extra_size) \
                regnode_guts_debug(state,op,extra_size)
#endif
1246
/* If 'optstart' is set, report (under DEBUG_OPTIMISE_r) the number of nodes
 * between it and 'node', then reset it to NULL.  Both 'optstart' and 'node'
 * must be in scope at the point of use.  The test is placed inside
 * STMT_START/STMT_END so that the macro is a single statement and cannot
 * capture a following 'else'. */
#define CLEAR_OPTSTART                                                      \
    STMT_START {                                                            \
        if (optstart) {                                                     \
            DEBUG_OPTIMISE_r(Perl_re_printf( aTHX_                          \
                " (%" IVdf " nodes)\n", (IV)(node - optstart)));            \
            optstart=NULL;                                                  \
        }                                                                   \
    } STMT_END
1253
/* Run CLEAR_OPTSTART, then assign to 'node' the result of dumpuntil() over
 * b..e, passing indent+1 and depth+1.  Relies on 'r', 'start', 'last',
 * 'sv', 'indent', 'depth' and 'node' being in scope.
 * NOTE: this expands to two statements and already ends in ';', so it must
 * not be used as the unbraced body of an if/else or a loop. */
#define DUMPUNTIL(b,e)                                                      \
    CLEAR_OPTSTART;                                                         \
    node = dumpuntil(r, start, (b), (e), last, sv, indent+1, depth+1);
1257
0678333e
YO
/* Set NEXT_OFF of the node at offset 'ret' to the combined size of types
 * 't1' and 't2', measured in regnode units. */
#define REGNODE_STEP_OVER(ret,t1,t2)                                        \
    NEXT_OFF(REGNODE_p(ret)) =                                              \
        ((sizeof(t1) + sizeof(t2)) / sizeof(regnode))
fe5492d9 1260
85900e28 1261#endif /* REGCOMP_INTERNAL_H */