Commit | Line | Data |
---|---|---|
85900e28 YO |
1 | #ifndef REGCOMP_INTERNAL_H |
2 | #define REGCOMP_INTERNAL_H | |
3 | #ifndef STATIC | |
4 | #define STATIC static | |
5 | #endif | |
c5b1c090 YO |
6 | #ifndef RE_OPTIMIZE_CURLYX_TO_CURLYM |
7 | #define RE_OPTIMIZE_CURLYX_TO_CURLYM 1 | |
8 | #endif | |
9 | #ifndef RE_OPTIMIZE_CURLYX_TO_CURLYN | |
10 | #define RE_OPTIMIZE_CURLYX_TO_CURLYN 1 | |
11 | #endif | |
85900e28 YO |
12 | |
13 | /* this is a chain of data about sub patterns we are processing that | |
14 | need to be handled separately/specially in study_chunk. It's so | |
15 | we can simulate recursion without losing state. */ | |
16 | struct scan_frame; | |
17 | typedef struct scan_frame { | |
18 | regnode *last_regnode; /* last node to process in this frame */ | |
19 | regnode *next_regnode; /* next node to process when last is reached */ | |
20 | U32 prev_recursed_depth; | |
21 | I32 stopparen; /* what stopparen do we use */ | |
22 | bool in_gosub; /* this or an outer frame is for GOSUB */ | |
23 | ||
24 | struct scan_frame *this_prev_frame; /* this previous frame */ | |
25 | struct scan_frame *prev_frame; /* previous frame */ | |
26 | struct scan_frame *next_frame; /* next frame */ | |
27 | } scan_frame; | |
28 | ||
29 | /* Certain characters are output as a sequence with the first being a | |
30 | * backslash. */ | |
31 | #define isBACKSLASHED_PUNCT(c) memCHRs("-[]\\^", c) | |
32 | ||
33 | ||
34 | struct RExC_state_t { | |
35 | U32 flags; /* RXf_* are we folding, multilining? */ | |
36 | U32 pm_flags; /* PMf_* stuff from the calling PMOP */ | |
37 | char *precomp; /* uncompiled string. */ | |
38 | char *precomp_end; /* pointer to end of uncompiled string. */ | |
39 | REGEXP *rx_sv; /* The SV that is the regexp. */ | |
40 | regexp *rx; /* perl core regexp structure */ | |
41 | regexp_internal *rxi; /* internal data for regexp object | |
42 | pprivate field */ | |
43 | char *start; /* Start of input for compile */ | |
44 | char *end; /* End of input for compile */ | |
45 | char *parse; /* Input-scan pointer. */ | |
46 | char *copy_start; /* start of copy of input within | |
47 | constructed parse string */ | |
48 | char *save_copy_start; /* Provides one level of saving | |
49 | and restoring 'copy_start' */ | |
50 | char *copy_start_in_input; /* Position in input string | |
51 | corresponding to copy_start */ | |
52 | SSize_t whilem_seen; /* number of WHILEM in this expr */ | |
53 | regnode *emit_start; /* Start of emitted-code area */ | |
54 | regnode_offset emit; /* Code-emit pointer */ | |
55 | I32 naughty; /* How bad is this pattern? */ | |
56 | I32 sawback; /* Did we see \1, ...? */ | |
57 | SSize_t size; /* Number of regnode equivalents in | |
58 | pattern */ | |
59 | Size_t sets_depth; /* Counts recursion depth of already- | |
60 | compiled regex set patterns */ | |
61 | U32 seen; | |
62 | ||
63 | I32 parens_buf_size; /* #slots malloced open/close_parens */ | |
64 | regnode_offset *open_parens; /* offsets to open parens */ | |
65 | regnode_offset *close_parens; /* offsets to close parens */ | |
66 | HV *paren_names; /* Paren names */ | |
67 | ||
68 | /* position beyond 'precomp' of the warning message furthest away from | |
69 | * 'precomp'. During the parse, no warnings are raised for any problems | |
70 | * earlier in the parse than this position. This works if warnings are | |
71 | * raised the first time a given spot is parsed, and if only one | |
72 | * independent warning is raised for any given spot */ | |
73 | Size_t latest_warn_offset; | |
74 | ||
fe5492d9 YO |
75 | /* Branch reset /(?|...|...)/ gives us two concepts of capture buffer id. |
76 | * "Logical Parno" is the user visible view with branch reset taken into | |
77 | * account. "Parno" (or physical parno) is the actual capture buffers in | |
78 | * the pattern *NOT* taking into account branch reset. We also maintain | |
79 | * a map of "next" pointers which allow us to skip to the next physical | |
80 | * capture buffer with the same logical id, with 0 representing "none". | |
81 | * | |
82 | * As we compile we keep track of the two different counts using the | |
83 | * 'logical_npar' and 'npar' members, and we keep track of the upper bound | |
84 | * of both in 'total_par' and 'logical_total_par', we also populate | |
85 | * the 'logical_to_parno' map, which gives us the first physical parno | |
86 | * for a given logical parno, and the `parno_to_logical` array which gives | |
87 | * us the logical id for each physical parno. When compilation is | |
88 | * completed we construct the 'parno_to_logical_next' array from the | |
89 | * 'parno_to_logical' array. (We do not bother constructing it during | |
90 | * compilation as we do not need it, and we can construct it in O(N) time | |
91 | * once we are done, but would need more complicated logic during the | |
92 | * compile, because we want the next pointers to go from smallest to | |
93 | * largest, eg, left to right.) | |
94 | * | |
95 | * Logical: $1 $2 $3 $4 $2 $3 $2 $5 | |
96 | * Physical: 1 2 3 4 5 6 7 8 | |
97 | * Next: 0 5 6 0 7 0 0 0 | |
98 | * Pattern /(a) (?| (b) (c) (d) | (e) (f) | (g) ) (h)/ | |
99 | * | |
100 | * As much as possible the internals use and store the physical id of | |
101 | * capture buffers. We decode the physical to the logical only when | |
102 | * we need to, for instance when someone uses $2. | |
103 | * | |
104 | * Note that when branch reset is not used logical and physical are the | |
105 | * same and the next data would be all zero. So when branch reset is not | |
106 | * used we do not need to populate this data into the final regexp. | |
107 | * | |
108 | */ | |
109 | I32 *logical_to_parno; /* logical_parno to parno */ | |
110 | I32 *parno_to_logical; /* parno to logical_parno */ | |
111 | I32 *parno_to_logical_next; /* parno to next (greater value) | |
112 | parno with the same | |
113 | logical_parno as parno.*/ | |
114 | ||
85900e28 YO |
115 | I32 npar; /* Capture buffer count so far in the |
116 | parse, (OPEN) plus one. ("par" 0 is | |
117 | the whole pattern)*/ | |
fe5492d9 | 118 | I32 logical_npar; /* Logical version of npar */ |
85900e28 YO |
119 | I32 total_par; /* During initial parse, is either 0, |
120 | or -1; the latter indicating a | |
121 | reparse is needed. After that pass, | |
122 | it is what 'npar' became after the | |
123 | pass. Hence, it being > 0 indicates | |
124 | we are in a reparse situation */ | |
fe5492d9 | 125 | I32 logical_total_par; /* Logical version of total_par */ |
85900e28 YO |
126 | I32 nestroot; /* root parens we are in - used by |
127 | accept */ | |
128 | I32 seen_zerolen; | |
129 | regnode *end_op; /* END node in program */ | |
130 | I32 utf8; /* whether the pattern is utf8 or not */ | |
131 | I32 orig_utf8; /* whether the pattern was originally in utf8 */ | |
132 | /* XXX use this for future optimisation of case | |
133 | * where pattern must be upgraded to utf8. */ | |
134 | I32 uni_semantics; /* If a d charset modifier should use unicode | |
135 | rules, even if the pattern is not in | |
136 | utf8 */ | |
137 | ||
138 | I32 recurse_count; /* Number of recurse regops we have generated */ | |
139 | regnode **recurse; /* Recurse regops */ | |
140 | U8 *study_chunk_recursed; /* bitmap of which subs we have moved | |
141 | through */ | |
142 | U32 study_chunk_recursed_bytes; /* bytes in bitmap */ | |
143 | I32 in_lookaround; | |
144 | I32 contains_locale; | |
145 | I32 override_recoding; | |
146 | I32 recode_x_to_native; | |
147 | I32 in_multi_char_class; | |
148 | int code_index; /* next code_blocks[] slot */ | |
149 | struct reg_code_blocks *code_blocks;/* positions of literal (?{}) | |
150 | within pattern */ | |
151 | SSize_t maxlen; /* minimum possible number of chars in string to match */ | |
152 | scan_frame *frame_head; | |
153 | scan_frame *frame_last; | |
154 | U32 frame_count; | |
155 | AV *warn_text; | |
156 | HV *unlexed_names; | |
157 | SV *runtime_code_qr; /* qr with the runtime code blocks */ | |
158 | #ifdef DEBUGGING | |
159 | const char *lastparse; | |
160 | I32 lastnum; | |
161 | U32 study_chunk_recursed_count; | |
162 | AV *paren_name_list; /* idx -> name */ | |
163 | SV *mysv1; | |
164 | SV *mysv2; | |
85900e28 YO |
165 | #endif |
166 | bool seen_d_op; | |
167 | bool strict; | |
168 | bool study_started; | |
169 | bool in_script_run; | |
170 | bool use_BRANCHJ; | |
171 | bool sWARN_EXPERIMENTAL__VLB; | |
172 | bool sWARN_EXPERIMENTAL__REGEX_SETS; | |
173 | }; | |
174 | ||
e7252fd4 YO |
175 | #ifdef DEBUGGING |
176 | #define RExC_lastparse (pRExC_state->lastparse) | |
177 | #define RExC_lastnum (pRExC_state->lastnum) | |
178 | #define RExC_paren_name_list (pRExC_state->paren_name_list) | |
179 | #define RExC_study_chunk_recursed_count (pRExC_state->study_chunk_recursed_count) | |
180 | #define RExC_mysv (pRExC_state->mysv1) | |
181 | #define RExC_mysv1 (pRExC_state->mysv1) | |
182 | #define RExC_mysv2 (pRExC_state->mysv2) | |
183 | #endif | |
184 | ||
85900e28 YO |
185 | #define RExC_flags (pRExC_state->flags) |
186 | #define RExC_pm_flags (pRExC_state->pm_flags) | |
187 | #define RExC_precomp (pRExC_state->precomp) | |
188 | #define RExC_copy_start_in_input (pRExC_state->copy_start_in_input) | |
189 | #define RExC_copy_start_in_constructed (pRExC_state->copy_start) | |
190 | #define RExC_save_copy_start_in_constructed (pRExC_state->save_copy_start) | |
191 | #define RExC_precomp_end (pRExC_state->precomp_end) | |
192 | #define RExC_rx_sv (pRExC_state->rx_sv) | |
193 | #define RExC_rx (pRExC_state->rx) | |
194 | #define RExC_rxi (pRExC_state->rxi) | |
195 | #define RExC_start (pRExC_state->start) | |
196 | #define RExC_end (pRExC_state->end) | |
197 | #define RExC_parse (pRExC_state->parse) | |
198 | #define RExC_latest_warn_offset (pRExC_state->latest_warn_offset ) | |
199 | #define RExC_whilem_seen (pRExC_state->whilem_seen) | |
200 | #define RExC_seen_d_op (pRExC_state->seen_d_op) /* Seen something that differs | |
201 | under /d from /u ? */ | |
202 | ||
203 | #define RExC_emit (pRExC_state->emit) | |
204 | #define RExC_emit_start (pRExC_state->emit_start) | |
205 | #define RExC_sawback (pRExC_state->sawback) | |
206 | #define RExC_seen (pRExC_state->seen) | |
207 | #define RExC_size (pRExC_state->size) | |
208 | #define RExC_maxlen (pRExC_state->maxlen) | |
fe5492d9 YO |
209 | #define RExC_logical_npar (pRExC_state->logical_npar) |
210 | #define RExC_logical_total_parens (pRExC_state->logical_total_par) | |
211 | #define RExC_logical_to_parno (pRExC_state->logical_to_parno) | |
212 | #define RExC_parno_to_logical (pRExC_state->parno_to_logical) | |
213 | #define RExC_parno_to_logical_next (pRExC_state->parno_to_logical_next) | |
85900e28 YO |
214 | #define RExC_npar (pRExC_state->npar) |
215 | #define RExC_total_parens (pRExC_state->total_par) | |
216 | #define RExC_parens_buf_size (pRExC_state->parens_buf_size) | |
217 | #define RExC_nestroot (pRExC_state->nestroot) | |
218 | #define RExC_seen_zerolen (pRExC_state->seen_zerolen) | |
219 | #define RExC_utf8 (pRExC_state->utf8) | |
220 | #define RExC_uni_semantics (pRExC_state->uni_semantics) | |
221 | #define RExC_orig_utf8 (pRExC_state->orig_utf8) | |
222 | #define RExC_open_parens (pRExC_state->open_parens) | |
223 | #define RExC_close_parens (pRExC_state->close_parens) | |
224 | #define RExC_end_op (pRExC_state->end_op) | |
225 | #define RExC_paren_names (pRExC_state->paren_names) | |
226 | #define RExC_recurse (pRExC_state->recurse) | |
227 | #define RExC_recurse_count (pRExC_state->recurse_count) | |
228 | #define RExC_sets_depth (pRExC_state->sets_depth) | |
229 | #define RExC_study_chunk_recursed (pRExC_state->study_chunk_recursed) | |
230 | #define RExC_study_chunk_recursed_bytes \ | |
231 | (pRExC_state->study_chunk_recursed_bytes) | |
232 | #define RExC_in_lookaround (pRExC_state->in_lookaround) | |
233 | #define RExC_contains_locale (pRExC_state->contains_locale) | |
234 | #define RExC_recode_x_to_native (pRExC_state->recode_x_to_native) | |
235 | ||
236 | #ifdef EBCDIC | |
237 | # define SET_recode_x_to_native(x) \ | |
238 | STMT_START { RExC_recode_x_to_native = (x); } STMT_END | |
239 | #else | |
240 | # define SET_recode_x_to_native(x) NOOP | |
241 | #endif | |
242 | ||
243 | #define RExC_in_multi_char_class (pRExC_state->in_multi_char_class) | |
244 | #define RExC_frame_head (pRExC_state->frame_head) | |
245 | #define RExC_frame_last (pRExC_state->frame_last) | |
246 | #define RExC_frame_count (pRExC_state->frame_count) | |
247 | #define RExC_strict (pRExC_state->strict) | |
248 | #define RExC_study_started (pRExC_state->study_started) | |
249 | #define RExC_warn_text (pRExC_state->warn_text) | |
250 | #define RExC_in_script_run (pRExC_state->in_script_run) | |
251 | #define RExC_use_BRANCHJ (pRExC_state->use_BRANCHJ) | |
252 | #define RExC_warned_WARN_EXPERIMENTAL__VLB (pRExC_state->sWARN_EXPERIMENTAL__VLB) | |
253 | #define RExC_warned_WARN_EXPERIMENTAL__REGEX_SETS (pRExC_state->sWARN_EXPERIMENTAL__REGEX_SETS) | |
254 | #define RExC_unlexed_names (pRExC_state->unlexed_names) | |
255 | ||
256 | ||
257 | /***********************************************************************/ | |
258 | /* UTILITY MACROS FOR ADVANCING OR SETTING THE PARSE "CURSOR" RExC_parse | |
259 | * | |
260 | * All of these macros depend on the above RExC_ accessor macros, which | |
261 | * in turn depend on a variable pRExC_state being in scope where they | |
262 | * are used. This is the standard regexp parser context variable which is | |
263 | * passed into every non-trivial parse function in this file. | |
264 | * | |
265 | * Note that the UTF macro is itself a wrapper around RExC_utf8, so all | |
266 | * of the macros which do not take an argument will operate on the | |
267 | * pRExC_state structure *only*. | |
268 | * | |
269 | * Please do NOT modify RExC_parse without using these macros. In the | |
270 | * future these macros will be extended for enhanced debugging and trace | |
271 | * output during the parse process. | |
272 | */ | |
273 | ||
274 | /* RExC_parse_incf(flag) | |
275 | * | |
276 | * Increment RExC_parse to point at the next codepoint, while doing | |
277 | * the right thing depending on whether we are parsing UTF-8 strings | |
278 | * or not. The 'flag' argument determines if content is UTF-8 or not, | |
279 | * intended for cases where this is NOT governed by the UTF macro. | |
280 | * | |
281 | * Use RExC_parse_inc() if UTF-8ness is controlled by the UTF macro. | |
282 | * | |
283 | * WARNING: Does NOT take into account RExC_end; it is the caller's | |
284 | * responsibility to make sure there are enough octets left in | |
285 | * RExC_parse to ensure that when processing UTF-8 we would not read | |
286 | * past the end of the string. | |
287 | */ | |
288 | #define RExC_parse_incf(flag) STMT_START { \ | |
289 | RExC_parse += (flag) ? UTF8SKIP(RExC_parse) : 1; \ | |
290 | } STMT_END | |
291 | ||
292 | /* RExC_parse_inc_safef(flag) | |
293 | * | |
294 | * Safely increment RExC_parse to point at the next codepoint, | |
295 | * doing the right thing depending on whether we are parsing | |
296 | * UTF-8 strings or not and NOT reading past the end of the buffer. | |
297 | * The 'flag' argument determines if content is UTF-8 or not, | |
298 | * intended for cases where this is NOT governed by the UTF macro. | |
299 | * | |
300 | * Use RExC_parse_inc_safe() if UTF-8ness is controlled by the UTF macro. | |
301 | * | |
302 | * NOTE: Will NOT read past RExC_end when content is UTF-8. | |
303 | */ | |
304 | #define RExC_parse_inc_safef(flag) STMT_START { \ | |
305 | RExC_parse += (flag) ? UTF8_SAFE_SKIP(RExC_parse,RExC_end) : 1; \ | |
306 | } STMT_END | |
307 | ||
308 | /* RExC_parse_inc() | |
309 | * | |
310 | * Increment RExC_parse to point at the next codepoint, | |
311 | * doing the right thing depending on whether we are parsing | |
312 | * UTF-8 strings or not. | |
313 | * | |
314 | * WARNING: Does NOT take into account RExC_end; it is the caller's | |
315 | * responsibility to make sure there are enough octets left in | |
316 | * RExC_parse to ensure that when processing UTF-8 we would not read | |
317 | * past the end of the string. | |
318 | * | |
319 | * NOTE: whether we are parsing UTF-8 or not is determined by the | |
320 | * UTF macro which is defined as cBOOL(RExC_utf8), thus this | |
321 | * macro operates on the pRExC_state structure only. | |
322 | */ | |
323 | #define RExC_parse_inc() RExC_parse_incf(UTF) | |
324 | ||
325 | /* RExC_parse_inc_safe() | |
326 | * | |
327 | * Safely increment RExC_parse to point at the next codepoint, | |
328 | * doing the right thing depending on whether we are parsing | |
329 | * UTF-8 strings or not and NOT reading past the end of the buffer. | |
330 | * | |
331 | * NOTE: whether we are parsing UTF-8 or not is determined by the | |
332 | * UTF macro which is defined as cBOOL(RExC_utf8), thus this | |
333 | * macro operates on the pRExC_state structure only. | |
334 | */ | |
335 | #define RExC_parse_inc_safe() RExC_parse_inc_safef(UTF) | |
336 | ||
337 | /* RExC_parse_inc_utf8() | |
338 | * | |
339 | * Increment RExC_parse to point at the next utf8 codepoint, | |
340 | * assumes content is UTF-8. | |
341 | * | |
342 | * WARNING: Does NOT take into account RExC_end; it is the caller's | |
343 | * responsibility to make sure there are enough octets left in RExC_parse | |
344 | * to ensure that when processing UTF-8 we would not read past the end | |
345 | * of the string. | |
346 | */ | |
347 | #define RExC_parse_inc_utf8() STMT_START { \ | |
348 | RExC_parse += UTF8SKIP(RExC_parse); \ | |
349 | } STMT_END | |
350 | ||
351 | /* RExC_parse_inc_if_char() | |
352 | * | |
353 | * Increment RExC_parse to point at the next codepoint, if and only | |
354 | * if the current parse point is NOT a NULL, while doing the right thing | |
355 | * depending on whether we are parsing UTF-8 strings or not. | |
356 | * | |
357 | * WARNING: Does NOT take into account RExC_end; it is the caller's | |
358 | * responsibility to make sure there are enough octets left in RExC_parse | |
359 | * to ensure that when processing UTF-8 we would not read past the end | |
360 | * of the string. | |
361 | * | |
362 | * NOTE: whether we are parsing UTF-8 or not is determined by the | |
363 | * UTF macro which is defined as cBOOL(RExC_utf8), thus this | |
364 | * macro operates on the pRExC_state structure only. | |
365 | */ | |
366 | #define RExC_parse_inc_if_char() STMT_START { \ | |
367 | RExC_parse += SKIP_IF_CHAR(RExC_parse,RExC_end); \ | |
368 | } STMT_END | |
369 | ||
370 | /* RExC_parse_inc_by(n_octets) | |
371 | * | |
372 | * Increment the parse cursor by the number of octets specified by | |
373 | * the 'n_octets' argument. | |
374 | * | |
375 | * NOTE: Does NOT check ANY constraints. It is the caller's responsibility | |
376 | * that this will not move past the end of the string, or leave the | |
377 | * pointer in the middle of a UTF-8 sequence. | |
378 | * | |
379 | * Typically used to advance past previously analyzed content. | |
380 | */ | |
381 | #define RExC_parse_inc_by(n_octets) STMT_START { \ | |
382 | RExC_parse += (n_octets); \ | |
383 | } STMT_END | |
384 | ||
385 | /* RExC_parse_set(to_ptr) | |
386 | * | |
387 | * Sets the RExC_parse pointer to the pointer specified by the 'to_ptr' | |
388 | * argument. No validation whatsoever is performed on the 'to_ptr' pointer. | |
389 | */ | |
390 | #define RExC_parse_set(to_ptr) STMT_START { \ | |
391 | RExC_parse = (to_ptr); \ | |
392 | } STMT_END | |
393 | ||
394 | /**********************************************************************/ | |
395 | ||
396 | /* Heuristic check on the complexity of the pattern: if TOO_NAUGHTY, we set | |
397 | * a flag to disable back-off on the fixed/floating substrings - if it's | |
398 | * a high complexity pattern we assume the benefit of avoiding a full match | |
399 | * is worth the cost of checking for the substrings even if they rarely help. | |
400 | */ | |
401 | #define RExC_naughty (pRExC_state->naughty) | |
402 | #define TOO_NAUGHTY (10) | |
403 | #define MARK_NAUGHTY(add) \ | |
404 | if (RExC_naughty < TOO_NAUGHTY) \ | |
405 | RExC_naughty += (add) | |
406 | #define MARK_NAUGHTY_EXP(exp, add) \ | |
407 | if (RExC_naughty < TOO_NAUGHTY) \ | |
408 | RExC_naughty += RExC_naughty / (exp) + (add) | |
409 | ||
410 | #define isNON_BRACE_QUANTIFIER(c) ((c) == '*' || (c) == '+' || (c) == '?') | |
411 | #define isQUANTIFIER(s,e) ( isNON_BRACE_QUANTIFIER(*(s)) /* 's' parenthesized */ \ | |
412 | || (*(s) == '{' && regcurly((s), (e), NULL))) | |
413 | ||
414 | /* | |
415 | * Flags to be passed up. | |
416 | */ | |
417 | #define HASWIDTH 0x01 /* Known to not match null strings, could match | |
418 | non-null ones. */ | |
419 | #define SIMPLE 0x02 /* Exactly one character wide */ | |
420 | /* (or LNBREAK as a special case) */ | |
421 | #define POSTPONED 0x08 /* (?1),(?&name), (??{...}) or similar */ | |
422 | #define TRYAGAIN 0x10 /* Weeded out a declaration. */ | |
423 | #define RESTART_PARSE 0x20 /* Need to redo the parse */ | |
424 | #define NEED_UTF8 0x40 /* In conjunction with RESTART_PARSE, need to | |
425 | calculate sizes as UTF-8 */ | |
426 | ||
427 | #define REG_NODE_NUM(x) ((x) ? (int)((x)-RExC_emit_start) : -1) | |
428 | ||
429 | /* whether trie related optimizations are enabled */ | |
430 | #if PERL_ENABLE_EXTENDED_TRIE_OPTIMISATION | |
431 | #define TRIE_STUDY_OPT | |
432 | #define FULL_TRIE_STUDY | |
433 | #define TRIE_STCLASS | |
434 | #endif | |
435 | ||
436 | /* About the term "restudy" and the var "restudied" and the defines | |
437 | * "SCF_TRIE_RESTUDY" and "SCF_TRIE_DOING_RESTUDY": All of these relate to | |
438 | * doing multiple study_chunk() calls over the same set of opcodes for the | |
439 | * purpose of enhanced TRIE optimizations. | |
440 | * | |
441 | * Specifically, when TRIE_STUDY_OPT is defined, and it is defined in normal | |
442 | * builds, (see above), during compilation SCF_TRIE_RESTUDY may be enabled | |
443 | * which then causes the Perl_re_op_compile() to then call the optimizer | |
444 | * S_study_chunk() a second time to perform additional optimizations, | |
445 | * including the aho_corasick startclass optimization. | |
446 | * This additional pass will only happen once, which is managed by the | |
447 | * 'restudied' variable in Perl_re_op_compile(). | |
448 | * | |
449 | * When this second pass is under way the flags passed into study_chunk() will | |
450 | * include SCF_TRIE_DOING_RESTUDY and this flag is and must be cascaded down | |
451 | * to any recursive calls to S_study_chunk(). | |
452 | * | |
453 | * IMPORTANT: Any logic in study_chunk() that emits warnings should check that | |
454 | * the SCF_TRIE_DOING_RESTUDY flag is NOT set in 'flags', or the warning may | |
455 | * be produced twice. | |
456 | * | |
457 | * See commit 07be1b83a6b2d24b492356181ddf70e1c7917ae3 and | |
458 | * 688e03912e3bff2d2419c457d8b0e1bab3eb7112 for more details. | |
459 | */ | |
460 | ||
461 | ||
462 | #define PBYTE(u8str,paren) ((U8*)(u8str))[(paren) >> 3] | |
463 | #define PBITVAL(paren) (1 << ((paren) & 7)) | |
464 | #define PAREN_OFFSET(depth) \ | |
465 | (RExC_study_chunk_recursed + (depth) * RExC_study_chunk_recursed_bytes) | |
466 | #define PAREN_TEST(depth, paren) \ | |
467 | (PBYTE(PAREN_OFFSET(depth), paren) & PBITVAL(paren)) | |
468 | #define PAREN_SET(depth, paren) \ | |
469 | (PBYTE(PAREN_OFFSET(depth), paren) |= PBITVAL(paren)) | |
470 | #define PAREN_UNSET(depth, paren) \ | |
471 | (PBYTE(PAREN_OFFSET(depth), paren) &= ~PBITVAL(paren)) | |
472 | ||
473 | #define REQUIRE_UTF8(flagp) STMT_START { /* returns 0 from the caller */ \ | |
474 | if (!UTF) { \ | |
475 | *(flagp) = RESTART_PARSE|NEED_UTF8; /* 'flagp' may be an expression */ \ | |
476 | return 0; \ | |
477 | } \ | |
478 | } STMT_END | |
479 | ||
480 | /* /u is to be chosen if we are supposed to use Unicode rules, or if the | |
481 | * pattern is in UTF-8. This latter condition is in case the outermost rules | |
482 | * are locale. See GH #17278 */ | |
483 | #define toUSE_UNI_CHARSET_NOT_DEPENDS (RExC_uni_semantics || UTF) | |
484 | ||
485 | /* Change from /d into /u rules, and restart the parse. RExC_uni_semantics is | |
486 | * a flag that indicates we need to override /d with /u as a result of | |
487 | * something in the pattern. It should only be used in regards to calling | |
488 | * set_regex_charset() or get_regex_charset() */ | |
489 | #define REQUIRE_UNI_RULES(flagp, restart_retval) \ | |
490 | STMT_START { \ | |
491 | if (DEPENDS_SEMANTICS) { \ | |
492 | set_regex_charset(&RExC_flags, REGEX_UNICODE_CHARSET); \ | |
493 | RExC_uni_semantics = 1; \ | |
494 | if (RExC_seen_d_op && LIKELY(! IN_PARENS_PASS)) { \ | |
495 | /* No need to restart the parse if we haven't seen \ | |
496 | * anything that differs between /u and /d, and no need \ | |
497 | * to restart immediately if we're going to reparse \ | |
498 | * anyway to count parens */ \ | |
499 | *(flagp) |= RESTART_PARSE; /* 'flagp' may be an expression */ \ | |
500 | return restart_retval; \ | |
501 | } \ | |
502 | } \ | |
503 | } STMT_END | |
504 | ||
505 | #define REQUIRE_BRANCHJ(flagp, restart_retval) \ | |
506 | STMT_START { /* always returns restart_retval from the caller */ \ | |
507 | RExC_use_BRANCHJ = 1; \ | |
508 | *(flagp) |= RESTART_PARSE; /* 'flagp' may be an expression */ \ | |
509 | return restart_retval; \ | |
510 | } STMT_END | |
511 | ||
512 | /* Until we have completed the parse, we leave RExC_total_parens at 0 or | |
513 | * less. After that, it must always be positive, because the whole re is | |
514 | * considered to be surrounded by virtual parens. Setting it to negative | |
515 | * indicates there is some construct that needs to know the actual number of | |
516 | * parens to be properly handled. And that means an extra pass will be | |
517 | * required after we've counted them all */ | |
518 | #define ALL_PARENS_COUNTED (RExC_total_parens > 0) | |
519 | #define REQUIRE_PARENS_PASS \ | |
520 | STMT_START { /* No-op if have completed a pass */ \ | |
521 | if (! ALL_PARENS_COUNTED) RExC_total_parens = -1; \ | |
522 | } STMT_END | |
523 | #define IN_PARENS_PASS (RExC_total_parens < 0) | |
524 | ||
525 | ||
526 | /* This is used to return failure (zero) early from the calling function if | |
527 | * various flags in 'flags' are set. Two flags always cause a return: | |
528 | * 'RESTART_PARSE' and 'NEED_UTF8'. 'extra' can be used to specify any | |
529 | * additional flags that should cause a return; 0 if none. If the return will | |
530 | * be done, '*flagp' is first set to be all of the flags that caused the | |
531 | * return. */ | |
532 | #define RETURN_FAIL_ON_RESTART_OR_FLAGS(flags,flagp,extra) \ | |
533 | STMT_START { \ | |
534 | if ((flags) & (RESTART_PARSE|NEED_UTF8|(extra))) { \ | |
535 | *(flagp) = (flags) & (RESTART_PARSE|NEED_UTF8|(extra)); \ | |
536 | return 0; \ | |
537 | } \ | |
538 | } STMT_END | |
539 | ||
540 | #define MUST_RESTART(flags) ((flags) & (RESTART_PARSE)) | |
541 | ||
542 | #define RETURN_FAIL_ON_RESTART(flags,flagp) \ | |
543 | RETURN_FAIL_ON_RESTART_OR_FLAGS( flags, flagp, 0) | |
544 | #define RETURN_FAIL_ON_RESTART_FLAGP(flagp) \ | |
545 | if (MUST_RESTART(*(flagp))) return 0 | |
546 | ||
547 | /* This converts the named class defined in regcomp.h to its equivalent class | |
548 | * number defined in handy.h. */ | |
549 | #define namedclass_to_classnum(class) ((int) ((class) / 2)) | |
550 | #define classnum_to_namedclass(classnum) ((classnum) * 2) | |
551 | ||
552 | #define _invlist_union_complement_2nd(a, b, output) \ | |
553 | _invlist_union_maybe_complement_2nd(a, b, TRUE, output) | |
554 | #define _invlist_intersection_complement_2nd(a, b, output) \ | |
555 | _invlist_intersection_maybe_complement_2nd(a, b, TRUE, output) | |
556 | ||
557 | /* We add a marker if we are deferring expansion of a property that is both | |
558 | * 1) potentially user-defined; and | |
559 | * 2) could also be an official Unicode property. | |
560 | * | |
561 | * Without this marker, any deferred expansion can only be for a user-defined | |
562 | * one. This marker shouldn't conflict with any that could be in a legal name, | |
563 | * and is appended to its name to indicate this. There is a string and | |
564 | * character form */ | |
565 | #define DEFERRED_COULD_BE_OFFICIAL_MARKERs "~" | |
566 | #define DEFERRED_COULD_BE_OFFICIAL_MARKERc '~' | |
567 | ||
568 | /* What is infinity for optimization purposes */ | |
569 | #define OPTIMIZE_INFTY SSize_t_MAX | |
570 | ||
571 | /* About scan_data_t. | |
572 | ||
573 | During optimisation we recurse through the regexp program performing | |
574 | various inplace (keyhole style) optimisations. In addition study_chunk | |
575 | and scan_commit populate this data structure with information about | |
576 | what strings MUST appear in the pattern. We look for the longest | |
577 | string that must appear at a fixed location, and we look for the | |
578 | longest string that may appear at a floating location. So for instance | |
579 | in the pattern: | |
580 | ||
581 | /FOO[xX]A.*B[xX]BAR/ | |
582 | ||
583 | Both 'FOO' and 'A' are fixed strings. Both 'B' and 'BAR' are floating | |
584 | strings (because they follow a .* construct). study_chunk will identify | |
585 | both FOO and BAR as being the longest fixed and floating strings respectively. | |
586 | ||
587 | The strings can be composites, for instance | |
588 | ||
589 | /(f)(o)(o)/ | |
590 | ||
591 | will result in a composite fixed substring 'foo'. | |
592 | ||
593 | For each string some basic information is maintained: | |
594 | ||
595 | - min_offset | |
596 | This is the position the string must appear at, or not before. | |
597 | It also implicitly (when combined with minlenp) tells us how many | |
598 | characters must match before the string we are searching for. | |
599 | Likewise when combined with minlenp and the length of the string it | |
600 | tells us how many characters must appear after the string we have | |
601 | found. | |
602 | ||
603 | - max_offset | |
604 | Only used for floating strings. This is the rightmost point that | |
605 | the string can appear at. If set to OPTIMIZE_INFTY it indicates that the | |
606 | string can occur infinitely far to the right. | |
607 | For fixed strings, it is equal to min_offset. | |
608 | ||
609 | - minlenp | |
610 | A pointer to the minimum number of characters of the pattern that the | |
611 | string was found inside. This is important as in the case of positive | |
612 | lookahead or positive lookbehind we can have multiple patterns | |
613 | involved. Consider | |
614 | ||
615 | /(?=FOO).*F/ | |
616 | ||
617 | The minimum length of the pattern overall is 3, the minimum length | |
618 | of the lookahead part is 3, but the minimum length of the part that | |
619 | will actually match is 1. So 'FOO's minimum length is 3, but the | |
620 | minimum length for the F is 1. This is important as the minimum length | |
621 | is used to determine offsets in front of and behind the string being | |
622 | looked for. Since strings can be composites this is the length of the | |
623 | pattern at the time it was committed with a scan_commit. Note that | |
624 | the length is calculated by study_chunk, so that the minimum lengths | |
625 | are not known until the full pattern has been compiled, thus the | |
626 | pointer to the value. | |
627 | ||
628 | - lookbehind | |
629 | ||
630 | In the case of lookbehind the string being searched for can be | |
631 | offset past the start point of the final matching string. | |
632 | If this value was just blithely removed from the min_offset it would | |
633 | invalidate some of the calculations for how many chars must match | |
634 | before or after (as they are derived from min_offset and minlen and | |
635 | the length of the string being searched for). | |
636 | When the final pattern is compiled and the data is moved from the | |
637 | scan_data_t structure into the regexp structure the information | |
638 | about lookbehind is factored in, with the information that would | |
639 | have been lost precalculated in the end_shift field for the | |
640 | associated string. | |
641 | ||
642 | The fields pos_min and pos_delta are used to store the minimum offset | |
643 | and the delta to the maximum offset at the current point in the pattern. | |
644 | ||
645 | */ | |
646 | ||
647 | struct scan_data_substrs { | |
648 | SV *str; /* longest substring found in pattern */ | |
649 | SSize_t min_offset; /* earliest point in string it can appear */ | |
650 | SSize_t max_offset; /* latest point in string it can appear */ | |
651 | SSize_t *minlenp; /* pointer to the minlen relevant to the string */ | |
652 | SSize_t lookbehind; /* is the pos of the string modified by LB */ | |
653 | I32 flags; /* per substring SF_* and SCF_* flags */ | |
654 | }; | |
655 | ||
571fb71d YO |
656 | /* this is typedef'ed in perl.h */ |
657 | struct scan_data_t { | |
85900e28 YO |
658 | /*I32 len_min; unused */ |
659 | /*I32 len_delta; unused */ | |
660 | SSize_t pos_min; /* minimum offset at the current point in the pattern */ | |
661 | SSize_t pos_delta; /* delta from pos_min to the maximum offset at that point */ | |
662 | SV *last_found; | |
663 | SSize_t last_end; /* min value, <0 unless valid. */ | |
664 | SSize_t last_start_min; | |
665 | SSize_t last_start_max; | |
666 | U8 cur_is_floating; /* whether the last_* values should be set as | |
667 | * the next fixed (0) or floating (1) | |
668 | * substring */ | |
669 | ||
670 | /* [0] is longest fixed substring so far, [1] is longest float so far */ | |
671 | struct scan_data_substrs substrs[2]; | |
672 | ||
673 | I32 flags; /* common SF_* and SCF_* flags */ | |
674 | I32 whilem_c; | |
675 | SSize_t *last_closep; | |
676 | regnode **last_close_opp; /* pointer to pointer to last CLOSE regop | |
677 | seen. DO NOT DEREFERENCE the regnode | |
678 | pointer - the op may have been optimized | |
679 | away */ | |
680 | regnode_ssc *start_class; | |
571fb71d | 681 | }; |
85900e28 YO |
682 | |
683 | /* | |
684 | * Forward declarations for pregcomp()'s friends. | |
685 | */ | |
686 | ||
687 | static const scan_data_t zero_scan_data = { | |
688 | 0, 0, NULL, 0, 0, 0, 0, /* pos_min .. cur_is_floating */ | |
689 | { | |
690 | { NULL, 0, 0, 0, 0, 0 }, /* substrs[0]: str .. flags */ | |
691 | { NULL, 0, 0, 0, 0, 0 }, /* substrs[1]: str .. flags */ | |
692 | }, | |
693 | 0, 0, NULL, NULL, NULL /* flags, whilem_c, last_closep, last_close_opp, start_class */ | |
694 | }; | |
695 | ||
696 | /* study flags */ | |
697 | ||
698 | #define SF_BEFORE_SEOL 0x0001 | |
699 | #define SF_BEFORE_MEOL 0x0002 | |
700 | #define SF_BEFORE_EOL (SF_BEFORE_SEOL|SF_BEFORE_MEOL) | |
701 | ||
702 | #define SF_IS_INF 0x0040 | |
703 | #define SF_HAS_PAR 0x0080 | |
704 | #define SF_IN_PAR 0x0100 | |
705 | #define SF_HAS_EVAL 0x0200 | |
706 | ||
707 | ||
708 | /* SCF_DO_SUBSTR is the flag that tells the regexp analyzer to track the | |
709 | * longest substring in the pattern. When it is not set the optimiser keeps | |
710 | * track of position, but does not keep track of the actual strings seen. | |
711 | * | |
712 | * So for instance /foo/ will be parsed with SCF_DO_SUBSTR being true, but | |
713 | * /foo/i will not. | |
714 | * | |
715 | * Similarly, /foo.*(blah|erm|huh).*fnorble/ will have "foo" and "fnorble" | |
716 | * parsed with SCF_DO_SUBSTR on, but while processing the (...) it will be | |
717 | * turned off because of the alternation (BRANCH). */ | |
718 | #define SCF_DO_SUBSTR 0x0400 | |
719 | ||
720 | #define SCF_DO_STCLASS_AND 0x0800 | |
721 | #define SCF_DO_STCLASS_OR 0x1000 | |
722 | #define SCF_DO_STCLASS (SCF_DO_STCLASS_AND|SCF_DO_STCLASS_OR) | |
723 | #define SCF_WHILEM_VISITED_POS 0x2000 | |
724 | ||
725 | #define SCF_TRIE_RESTUDY 0x4000 /* Need to do restudy in study_chunk()? | |
726 | Search for "restudy" in this file | |
727 | to find a detailed explanation.*/ | |
728 | #define SCF_SEEN_ACCEPT 0x8000 | |
729 | #define SCF_TRIE_DOING_RESTUDY 0x10000 /* Are we in restudy right now? | |
730 | Search for "restudy" in this file | |
731 | to find a detailed explanation. */ | |
732 | #define SCF_IN_DEFINE 0x20000 | |
733 | ||
734 | ||
735 | ||
736 | #define UTF cBOOL(RExC_utf8) | |
737 | ||
738 | /* The enums for all these are ordered so things work out correctly */ | |
739 | #define LOC (get_regex_charset(RExC_flags) == REGEX_LOCALE_CHARSET) | |
740 | #define DEPENDS_SEMANTICS (get_regex_charset(RExC_flags) \ | |
741 | == REGEX_DEPENDS_CHARSET) | |
742 | #define UNI_SEMANTICS (get_regex_charset(RExC_flags) == REGEX_UNICODE_CHARSET) | |
743 | #define AT_LEAST_UNI_SEMANTICS (get_regex_charset(RExC_flags) \ | |
744 | >= REGEX_UNICODE_CHARSET) | |
745 | #define ASCII_RESTRICTED (get_regex_charset(RExC_flags) \ | |
746 | == REGEX_ASCII_RESTRICTED_CHARSET) | |
747 | #define AT_LEAST_ASCII_RESTRICTED (get_regex_charset(RExC_flags) \ | |
748 | >= REGEX_ASCII_RESTRICTED_CHARSET) | |
749 | #define ASCII_FOLD_RESTRICTED (get_regex_charset(RExC_flags) \ | |
750 | == REGEX_ASCII_MORE_RESTRICTED_CHARSET) | |
751 | ||
752 | #define FOLD cBOOL(RExC_flags & RXf_PMf_FOLD) | |
753 | ||
754 | /* For programs that want to be strictly Unicode compatible by dying if any | |
755 | * attempt is made to match a non-Unicode code point against a Unicode | |
756 | * property. */ | |
757 | #define ALWAYS_WARN_SUPER ckDEAD(packWARN(WARN_NON_UNICODE)) | |
758 | ||
759 | #define OOB_NAMEDCLASS -1 | |
760 | ||
761 | /* There is no code point that is out-of-bounds, so this is problematic. But | |
762 | * its only current use is to initialize a variable that is always set before | |
763 | * looked at. */ | |
764 | #define OOB_UNICODE 0xDEADBEEF | |
765 | ||
766 | #define CHR_SVLEN(sv) (UTF ? sv_len_utf8(sv) : SvCUR(sv)) | |
767 | ||
768 | ||
769 | /* length of regex to show in messages that don't mark a position within */ | |
770 | #define RegexLengthToShowInErrorMessages 127 | |
771 | ||
772 | /* | |
773 | * If MARKER[12] are adjusted, be sure to adjust the constants at the top | |
774 | * of t/op/regmesg.t, the tests in t/op/re_tests, and those in | |
775 | * op/pragma/warn/regcomp. | |
776 | */ | |
777 | #define MARKER1 "<-- HERE" /* marker as it appears in the description */ | |
778 | #define MARKER2 " <-- HERE " /* marker as it appears within the regex */ | |
779 | ||
780 | #define REPORT_LOCATION " in regex; marked by " MARKER1 \ | |
781 | " in m/%" UTF8f MARKER2 "%" UTF8f "/" | |
782 | ||
783 | /* The code in this file in places uses one level of recursion with parsing | |
784 | * rebased to an alternate string constructed by us in memory. This can take | |
785 | * the form of something that is completely different from the input, or | |
786 | * something that uses the input as part of the alternate. In the first case, | |
787 | * there should be no possibility of an error, as we are in complete control of | |
788 | * the alternate string. But in the second case we don't completely control | |
789 | * the input portion, so there may be errors in that. Here's an example: | |
790 | * /[abc\x{DF}def]/ui | |
791 | * is handled specially because \x{df} folds to a sequence of more than one | |
792 | * character: 'ss'. What is done is to create and parse an alternate string, | |
793 | * which looks like this: | |
794 | * /(?:\x{DF}|[abc\x{DF}def])/ui | |
795 | * where it uses the input unchanged in the middle of something it constructs, | |
796 | * which is a branch for the DF outside the character class, and clustering | |
797 | * parens around the whole thing. (It knows enough to skip the DF inside the | |
798 | * class while in this substitute parse.) 'abc' and 'def' may have errors that | |
799 | * need to be reported. The general situation looks like this: | |
800 | * | |
801 | * |<------- identical ------>| | |
802 | * sI tI xI eI | |
803 | * Input: --------------------------------------------------------------- | |
804 | * Constructed: --------------------------------------------------- | |
805 | * sC tC xC eC EC | |
806 | * |<------- identical ------>| | |
807 | * | |
808 | * sI..eI is the portion of the input pattern we are concerned with here. | |
809 | * sC..EC is the constructed substitute parse string. | |
810 | * sC..tC is constructed by us | |
811 | * tC..eC is an exact duplicate of the portion of the input pattern tI..eI. | |
812 | * In the diagram, these are vertically aligned. | |
813 | * eC..EC is also constructed by us. | |
814 | * xC is the position in the substitute parse string where we found a | |
815 | * problem. | |
816 | * xI is the position in the original pattern corresponding to xC. | |
817 | * | |
818 | * We want to display a message showing the real input string. Thus we need to | |
819 | * translate from xC to xI. We know that xC >= tC, since the portion of the | |
820 | * string sC..tC has been constructed by us, and so shouldn't have errors. We | |
821 | * get: | |
822 | * xI = tI + (xC - tC) | |
823 | * | |
824 | * When the substitute parse is constructed, the code needs to set: | |
825 | * RExC_start (sC) | |
826 | * RExC_end (eC) | |
827 | * RExC_copy_start_in_input (tI) | |
828 | * RExC_copy_start_in_constructed (tC) | |
829 | * and restore them when done. | |
830 | * | |
831 | * During normal processing of the input pattern, both | |
832 | * 'RExC_copy_start_in_input' and 'RExC_copy_start_in_constructed' are set to | |
833 | * sI, so that xC equals xI. | |
834 | */ | |
835 | ||
836 | #define sI RExC_precomp /* start of the input pattern */ | |
837 | #define eI RExC_precomp_end /* end of the input pattern */ | |
838 | #define sC RExC_start /* start of the (possibly constructed) parse string */ | |
839 | #define eC RExC_end /* end of the portion copied from the input */ | |
840 | #define tI RExC_copy_start_in_input /* start of the copied portion, in the input */ | |
841 | #define tC RExC_copy_start_in_constructed /* same point, in the constructed string */ | |
842 | #define xI(xC) (tI + ((xC) - tC)) /* map a parse position to the input; arg parenthesized so any expression is safe */ | |
843 | #define xI_offset(xC) (xI(xC) - sI) /* offset of that input position from sI */ | |
844 | ||
845 | #define REPORT_LOCATION_ARGS(xC) \ | |
846 | UTF8fARG(UTF, \ | |
847 | (xI(xC) > eI) /* Don't run off end */ \ | |
848 | ? eI - sI /* Length before the <--HERE */ \ | |
849 | : ((xI_offset(xC) >= 0) \ | |
850 | ? xI_offset(xC) \ | |
851 | : (Perl_croak(aTHX_ "panic: %s: %d: negative offset: %" \ | |
852 | IVdf " trying to output message for " \ | |
853 | " pattern %.*s", \ | |
854 | __FILE__, __LINE__, (IV) xI_offset(xC), \ | |
855 | ((int) (eC - sC)), sC), 0)), \ | |
856 | sI), /* The input pattern printed up to the <--HERE */ \ | |
857 | UTF8fARG(UTF, \ | |
858 | (xI(xC) > eI) ? 0 : eI - xI(xC), /* Length after <--HERE */ \ | |
859 | (xI(xC) > eI) ? eI : xI(xC)) /* pattern after <--HERE */ | |
860 | ||
861 | /* Used to point after bad bytes for an error message, but avoid skipping | |
862 | * past a nul byte. */ | |
863 | #define SKIP_IF_CHAR(s, e) (!*(s) ? 0 : UTF ? UTF8_SAFE_SKIP(s, e) : 1) | |
864 | ||
865 | /* Set up to clean up after our imminent demise */ | |
866 | #define PREPARE_TO_DIE \ | |
867 | STMT_START { \ | |
868 | if (RExC_rx_sv) \ | |
869 | SAVEFREESV(RExC_rx_sv); \ | |
870 | if (RExC_open_parens) \ | |
871 | SAVEFREEPV(RExC_open_parens); \ | |
872 | if (RExC_close_parens) \ | |
873 | SAVEFREEPV(RExC_close_parens); \ | |
52bccf63 KW |
874 | if (RExC_logical_to_parno) \ |
875 | SAVEFREEPV(RExC_logical_to_parno); \ | |
876 | if (RExC_parno_to_logical) \ | |
877 | SAVEFREEPV(RExC_parno_to_logical); \ | |
85900e28 YO |
878 | } STMT_END |
879 | ||
880 | /* | |
881 | * Runs PREPARE_TO_DIE, then executes 'code' (a Perl_croak call in the FAIL* | |
882 | * macros below). 'code' can use 'len', the pattern length to show, and | |
883 | * 'ellipses', which is "..." if the pattern was too long and got chopped. | |
884 | */ | |
885 | #define _FAIL(code) STMT_START { \ | |
886 | const char *ellipses = ""; \ | |
887 | IV len = RExC_precomp_end - RExC_precomp; \ | |
888 | \ | |
889 | PREPARE_TO_DIE; \ | |
890 | if (len > RegexLengthToShowInErrorMessages) { \ | |
891 | /* chop 10 shorter than the max, to ensure meaning of "..." */ \ | |
892 | len = RegexLengthToShowInErrorMessages - 10; \ | |
893 | ellipses = "..."; \ | |
894 | } \ | |
895 | code; \ | |
896 | } STMT_END | |
897 | ||
898 | #define FAIL(msg) _FAIL( \ | |
899 | Perl_croak(aTHX_ "%s in regex m/%" UTF8f "%s/", \ | |
900 | msg, UTF8fARG(UTF, len, RExC_precomp), ellipses)) | |
901 | ||
902 | #define FAIL2(msg,arg) _FAIL( \ | |
903 | Perl_croak(aTHX_ msg " in regex m/%" UTF8f "%s/", \ | |
904 | arg, UTF8fARG(UTF, len, RExC_precomp), ellipses)) | |
905 | ||
906 | #define FAIL3(msg,arg1,arg2) _FAIL( \ | |
907 | Perl_croak(aTHX_ msg " in regex m/%" UTF8f "%s/", \ | |
908 | arg1, arg2, UTF8fARG(UTF, len, RExC_precomp), ellipses)) | |
909 | ||
910 | /* | |
911 | * Simple_vFAIL -- like FAIL, but marks the current location in the scan | |
912 | */ | |
913 | #define Simple_vFAIL(m) STMT_START { \ | |
914 | Perl_croak(aTHX_ "%s" REPORT_LOCATION, \ | |
915 | m, REPORT_LOCATION_ARGS(RExC_parse)); \ | |
916 | } STMT_END | |
917 | ||
918 | /* | |
919 | * Runs PREPARE_TO_DIE, then Simple_vFAIL() | |
920 | */ | |
921 | #define vFAIL(m) STMT_START { \ | |
922 | PREPARE_TO_DIE; \ | |
923 | Simple_vFAIL(m); \ | |
924 | } STMT_END | |
925 | ||
926 | /* | |
927 | * Like Simple_vFAIL(), but accepts two arguments. | |
928 | */ | |
929 | #define Simple_vFAIL2(m,a1) STMT_START { \ | |
930 | S_re_croak(aTHX_ UTF, m REPORT_LOCATION, a1, \ | |
931 | REPORT_LOCATION_ARGS(RExC_parse)); \ | |
932 | } STMT_END | |
933 | ||
934 | /* | |
935 | * Runs PREPARE_TO_DIE, then Simple_vFAIL2(). | |
936 | */ | |
937 | #define vFAIL2(m,a1) STMT_START { \ | |
938 | PREPARE_TO_DIE; \ | |
939 | Simple_vFAIL2(m, a1); \ | |
940 | } STMT_END | |
941 | ||
942 | ||
943 | /* | |
944 | * Like Simple_vFAIL(), but accepts three arguments. | |
945 | */ | |
946 | #define Simple_vFAIL3(m, a1, a2) STMT_START { \ | |
947 | S_re_croak(aTHX_ UTF, m REPORT_LOCATION, a1, a2, \ | |
948 | REPORT_LOCATION_ARGS(RExC_parse)); \ | |
949 | } STMT_END | |
950 | ||
951 | /* | |
952 | * Runs PREPARE_TO_DIE, then Simple_vFAIL3(). | |
953 | */ | |
954 | #define vFAIL3(m,a1,a2) STMT_START { \ | |
955 | PREPARE_TO_DIE; \ | |
956 | Simple_vFAIL3(m, a1, a2); \ | |
957 | } STMT_END | |
958 | ||
959 | /* | |
960 | * Like Simple_vFAIL(), but accepts four arguments. | |
961 | */ | |
962 | #define Simple_vFAIL4(m, a1, a2, a3) STMT_START { \ | |
963 | S_re_croak(aTHX_ UTF, m REPORT_LOCATION, a1, a2, a3, \ | |
964 | REPORT_LOCATION_ARGS(RExC_parse)); \ | |
965 | } STMT_END | |
966 | ||
967 | #define vFAIL4(m,a1,a2,a3) STMT_START { \ | |
968 | PREPARE_TO_DIE; \ | |
969 | Simple_vFAIL4(m, a1, a2, a3); \ | |
970 | } STMT_END | |
971 | ||
972 | /* A specialized version of vFAIL2 that works with UTF8f */ | |
973 | #define vFAIL2utf8f(m, a1) STMT_START { \ | |
974 | PREPARE_TO_DIE; \ | |
975 | S_re_croak(aTHX_ UTF, m REPORT_LOCATION, a1, \ | |
976 | REPORT_LOCATION_ARGS(RExC_parse)); \ | |
977 | } STMT_END | |
978 | ||
979 | #define vFAIL3utf8f(m, a1, a2) STMT_START { \ | |
980 | PREPARE_TO_DIE; \ | |
981 | S_re_croak(aTHX_ UTF, m REPORT_LOCATION, a1, a2, \ | |
982 | REPORT_LOCATION_ARGS(RExC_parse)); \ | |
983 | } STMT_END | |
984 | ||
985 | /* Setting this to NULL is a signal to not output warnings */ | |
986 | #define TURN_OFF_WARNINGS_IN_SUBSTITUTE_PARSE \ | |
987 | STMT_START { \ | |
988 | RExC_save_copy_start_in_constructed = RExC_copy_start_in_constructed;\ | |
989 | RExC_copy_start_in_constructed = NULL; \ | |
990 | } STMT_END | |
991 | #define RESTORE_WARNINGS \ | |
992 | RExC_copy_start_in_constructed = RExC_save_copy_start_in_constructed | |
993 | ||
994 | /* Since a warning can be generated multiple times as the input is reparsed, we | |
995 | * output it the first time we come to that point in the parse, but suppress it | |
996 | * otherwise. 'RExC_copy_start_in_constructed' being NULL is a flag to not | |
997 | * generate any warnings */ | |
998 | #define TO_OUTPUT_WARNINGS(loc) \ | |
999 | ( RExC_copy_start_in_constructed \ | |
1000 | && ((xI(loc)) - RExC_precomp) > (Ptrdiff_t) RExC_latest_warn_offset) | |
1001 | ||
1002 | /* After we've emitted a warning, we save the position in the input so we don't | |
1003 | * output it again */ | |
1004 | #define UPDATE_WARNINGS_LOC(loc) \ | |
1005 | STMT_START { \ | |
1006 | if (TO_OUTPUT_WARNINGS(loc)) { \ | |
1007 | RExC_latest_warn_offset = MAX(sI, MIN(eI, xI(loc))) \ | |
1008 | - RExC_precomp; \ | |
1009 | } \ | |
1010 | } STMT_END | |
1011 | ||
1012 | /* 'warns' is the output of the packWARNx macro used in 'code' */ | |
1013 | #define _WARN_HELPER(loc, warns, code) \ | |
1014 | STMT_START { \ | |
1015 | if (! RExC_copy_start_in_constructed) { \ | |
1016 | Perl_croak( aTHX_ "panic! %s: %d: Tried to warn when none" \ | |
1017 | " expected at '%s'", \ | |
1018 | __FILE__, __LINE__, loc); \ | |
1019 | } \ | |
1020 | if (TO_OUTPUT_WARNINGS(loc)) { \ | |
1021 | if (ckDEAD(warns)) \ | |
1022 | PREPARE_TO_DIE; \ | |
1023 | code; \ | |
1024 | UPDATE_WARNINGS_LOC(loc); \ | |
1025 | } \ | |
1026 | } STMT_END | |
1027 | ||
1028 | /* m is not necessarily a "literal string", in this macro */ | |
1029 | #define warn_non_literal_string(loc, packed_warn, m) \ | |
1030 | _WARN_HELPER(loc, packed_warn, \ | |
1031 | Perl_warner(aTHX_ packed_warn, \ | |
1032 | "%s" REPORT_LOCATION, \ | |
1033 | m, REPORT_LOCATION_ARGS(loc))) | |
1034 | #define reg_warn_non_literal_string(loc, m) \ | |
1035 | warn_non_literal_string(loc, packWARN(WARN_REGEXP), m) | |
1036 | ||
1037 | #define ckWARN2_non_literal_string(loc, packwarn, m, a1) \ | |
1038 | STMT_START { \ | |
1039 | char * format; \ | |
1040 | Size_t format_size = strlen(m) + strlen(REPORT_LOCATION)+ 1;\ | |
1041 | Newx(format, format_size, char); \ | |
1042 | my_strlcpy(format, m, format_size); \ | |
1043 | my_strlcat(format, REPORT_LOCATION, format_size); \ | |
1044 | SAVEFREEPV(format); \ | |
1045 | _WARN_HELPER(loc, packwarn, \ | |
1046 | Perl_ck_warner(aTHX_ packwarn, \ | |
1047 | format, \ | |
1048 | a1, REPORT_LOCATION_ARGS(loc))); \ | |
1049 | } STMT_END | |
1050 | ||
1051 | #define ckWARNreg(loc,m) \ | |
1052 | _WARN_HELPER(loc, packWARN(WARN_REGEXP), \ | |
1053 | Perl_ck_warner(aTHX_ packWARN(WARN_REGEXP), \ | |
1054 | m REPORT_LOCATION, \ | |
1055 | REPORT_LOCATION_ARGS(loc))) | |
1056 | ||
1057 | #define vWARN(loc, m) /* warn in category WARN_REGEXP, marking 'loc' in the pattern */ \ | |
1058 | _WARN_HELPER(loc, packWARN(WARN_REGEXP), \ | |
1059 | Perl_warner(aTHX_ packWARN(WARN_REGEXP), \ | |
1060 | m REPORT_LOCATION, \ | |
1061 | REPORT_LOCATION_ARGS(loc))) | |
1062 | ||
b27367cd YO |
1063 | #define vWARN_dep(loc,category,m) \ |
1064 | _WARN_HELPER(loc, packWARN(category), \ | |
1065 | Perl_warner(aTHX_ packWARN(category), \ | |
1066 | m REPORT_LOCATION, \ | |
85900e28 YO |
1067 | REPORT_LOCATION_ARGS(loc))) |
1068 | ||
b27367cd YO |
1069 | #define ckWARNdep(loc,category,m) \ |
1070 | _WARN_HELPER(loc, packWARN(category), \ | |
1071 | Perl_ck_warner_d(aTHX_ packWARN(category), \ | |
1072 | m REPORT_LOCATION, \ | |
85900e28 YO |
1073 | REPORT_LOCATION_ARGS(loc))) |
1074 | ||
b27367cd YO |
1075 | #define ckWARNregdep(loc,category,m) \ |
1076 | _WARN_HELPER(loc, packWARN2(category, WARN_REGEXP), \ | |
1077 | Perl_ck_warner_d(aTHX_ packWARN2(category, \ | |
85900e28 YO |
1078 | WARN_REGEXP), \ |
1079 | m REPORT_LOCATION, \ | |
1080 | REPORT_LOCATION_ARGS(loc))) | |
1081 | ||
1082 | #define ckWARN2reg_d(loc,m, a1) \ | |
1083 | _WARN_HELPER(loc, packWARN(WARN_REGEXP), \ | |
1084 | Perl_ck_warner_d(aTHX_ packWARN(WARN_REGEXP), \ | |
1085 | m REPORT_LOCATION, \ | |
1086 | a1, REPORT_LOCATION_ARGS(loc))) | |
1087 | ||
1088 | #define ckWARN2reg(loc, m, a1) \ | |
1089 | _WARN_HELPER(loc, packWARN(WARN_REGEXP), \ | |
1090 | Perl_ck_warner(aTHX_ packWARN(WARN_REGEXP), \ | |
1091 | m REPORT_LOCATION, \ | |
1092 | a1, REPORT_LOCATION_ARGS(loc))) | |
1093 | ||
1094 | #define vWARN3(loc, m, a1, a2) \ | |
1095 | _WARN_HELPER(loc, packWARN(WARN_REGEXP), \ | |
1096 | Perl_warner(aTHX_ packWARN(WARN_REGEXP), \ | |
1097 | m REPORT_LOCATION, \ | |
1098 | a1, a2, REPORT_LOCATION_ARGS(loc))) | |
1099 | ||
1100 | #define ckWARN3reg(loc, m, a1, a2) \ | |
1101 | _WARN_HELPER(loc, packWARN(WARN_REGEXP), \ | |
1102 | Perl_ck_warner(aTHX_ packWARN(WARN_REGEXP), \ | |
1103 | m REPORT_LOCATION, \ | |
1104 | a1, a2, \ | |
1105 | REPORT_LOCATION_ARGS(loc))) | |
1106 | ||
1107 | #define vWARN4(loc, m, a1, a2, a3) \ | |
1108 | _WARN_HELPER(loc, packWARN(WARN_REGEXP), \ | |
1109 | Perl_warner(aTHX_ packWARN(WARN_REGEXP), \ | |
1110 | m REPORT_LOCATION, \ | |
1111 | a1, a2, a3, \ | |
1112 | REPORT_LOCATION_ARGS(loc))) | |
1113 | ||
1114 | #define ckWARN4reg(loc, m, a1, a2, a3) \ | |
1115 | _WARN_HELPER(loc, packWARN(WARN_REGEXP), \ | |
1116 | Perl_ck_warner(aTHX_ packWARN(WARN_REGEXP), \ | |
1117 | m REPORT_LOCATION, \ | |
1118 | a1, a2, a3, \ | |
1119 | REPORT_LOCATION_ARGS(loc))) | |
1120 | ||
1121 | #define vWARN5(loc, m, a1, a2, a3, a4) \ | |
1122 | _WARN_HELPER(loc, packWARN(WARN_REGEXP), \ | |
1123 | Perl_warner(aTHX_ packWARN(WARN_REGEXP), \ | |
1124 | m REPORT_LOCATION, \ | |
1125 | a1, a2, a3, a4, \ | |
1126 | REPORT_LOCATION_ARGS(loc))) | |
1127 | ||
1128 | #define ckWARNexperimental(loc, class, m) \ | |
1129 | STMT_START { \ | |
1130 | if (! RExC_warned_ ## class) { /* warn once per compilation */ \ | |
1131 | RExC_warned_ ## class = 1; \ | |
1132 | _WARN_HELPER(loc, packWARN(class), \ | |
1133 | Perl_ck_warner_d(aTHX_ packWARN(class), \ | |
1134 | m REPORT_LOCATION, \ | |
1135 | REPORT_LOCATION_ARGS(loc)));\ | |
1136 | } \ | |
1137 | } STMT_END | |
1138 | ||
1139 | #define ckWARNexperimental_with_arg(loc, class, m, arg) \ | |
1140 | STMT_START { \ | |
1141 | if (! RExC_warned_ ## class) { /* warn once per compilation */ \ | |
1142 | RExC_warned_ ## class = 1; \ | |
1143 | _WARN_HELPER(loc, packWARN(class), \ | |
1144 | Perl_ck_warner_d(aTHX_ packWARN(class), \ | |
1145 | m REPORT_LOCATION, \ | |
1146 | arg, REPORT_LOCATION_ARGS(loc)));\ | |
1147 | } \ | |
1148 | } STMT_END | |
1149 | ||
1150 | /* Convert between a pointer to a node and its offset from the beginning of the | |
1151 | * program */ | |
1152 | #define REGNODE_p(offset) (RExC_emit_start + (offset)) | |
1153 | #define REGNODE_OFFSET(node) (__ASSERT_((node) >= RExC_emit_start) \ | |
1154 | (SSize_t) ((node) - RExC_emit_start)) | |
1155 | ||
1156 | #define ProgLen(ri) ((ri)->proglen) /* the proglen member of 'ri'; still usable as an lvalue */ | |
1157 | #define SetProgLen(ri,x) ((ri)->proglen = (x)) /* store 'x' into the proglen member of 'ri' */ | |
1158 | ||
1159 | #if PERL_ENABLE_EXPERIMENTAL_REGEX_OPTIMISATIONS | |
1160 | #define EXPERIMENTAL_INPLACESCAN | |
1161 | #endif /*PERL_ENABLE_EXPERIMENTAL_REGEX_OPTIMISATIONS*/ | |
1162 | ||
1163 | #define DEBUG_RExC_seen() \ | |
1164 | DEBUG_OPTIMISE_MORE_r({ \ | |
1165 | Perl_re_printf( aTHX_ "RExC_seen: "); \ | |
1166 | \ | |
1167 | if (RExC_seen & REG_ZERO_LEN_SEEN) \ | |
1168 | Perl_re_printf( aTHX_ "REG_ZERO_LEN_SEEN "); \ | |
1169 | \ | |
1170 | if (RExC_seen & REG_LOOKBEHIND_SEEN) \ | |
1171 | Perl_re_printf( aTHX_ "REG_LOOKBEHIND_SEEN "); \ | |
1172 | \ | |
1173 | if (RExC_seen & REG_GPOS_SEEN) \ | |
1174 | Perl_re_printf( aTHX_ "REG_GPOS_SEEN "); \ | |
1175 | \ | |
1176 | if (RExC_seen & REG_RECURSE_SEEN) \ | |
1177 | Perl_re_printf( aTHX_ "REG_RECURSE_SEEN "); \ | |
1178 | \ | |
1179 | if (RExC_seen & REG_TOP_LEVEL_BRANCHES_SEEN) \ | |
1180 | Perl_re_printf( aTHX_ "REG_TOP_LEVEL_BRANCHES_SEEN "); \ | |
1181 | \ | |
1182 | if (RExC_seen & REG_VERBARG_SEEN) \ | |
1183 | Perl_re_printf( aTHX_ "REG_VERBARG_SEEN "); \ | |
1184 | \ | |
1185 | if (RExC_seen & REG_CUTGROUP_SEEN) \ | |
1186 | Perl_re_printf( aTHX_ "REG_CUTGROUP_SEEN "); \ | |
1187 | \ | |
1188 | if (RExC_seen & REG_RUN_ON_COMMENT_SEEN) \ | |
1189 | Perl_re_printf( aTHX_ "REG_RUN_ON_COMMENT_SEEN "); \ | |
1190 | \ | |
1191 | if (RExC_seen & REG_UNFOLDED_MULTI_SEEN) \ | |
1192 | Perl_re_printf( aTHX_ "REG_UNFOLDED_MULTI_SEEN "); \ | |
1193 | \ | |
1194 | if (RExC_seen & REG_UNBOUNDED_QUANTIFIER_SEEN) \ | |
1195 | Perl_re_printf( aTHX_ "REG_UNBOUNDED_QUANTIFIER_SEEN "); \ | |
1196 | \ | |
c224bbd5 YO |
1197 | if (RExC_seen & REG_PESSIMIZE_SEEN) \ |
1198 | Perl_re_printf( aTHX_ "REG_PESSIMIZE_SEEN "); \ | |
1199 | \ | |
85900e28 YO |
1200 | Perl_re_printf( aTHX_ "\n"); \ |
1201 | }); | |
1202 | ||
1203 | #define DEBUG_SHOW_STUDY_FLAG(flags,flag) \ | |
1204 | if ((flags) & flag) Perl_re_printf( aTHX_ "%s ", #flag) | |
1205 | ||
1206 | ||
1207 | #ifdef DEBUGGING | |
1208 | # define DEBUG_STUDYDATA(where, data, depth, is_inf, min, stopmin, delta) \ | |
1209 | debug_studydata(where, data, depth, is_inf, min, stopmin, delta) | |
1210 | ||
1211 | # define DEBUG_PEEP(str, scan, depth, flags) \ | |
1212 | debug_peep(str, pRExC_state, scan, depth, flags) | |
1213 | #else | |
1214 | # define DEBUG_STUDYDATA(where, data, depth, is_inf, min, stopmin, delta) NOOP | |
1215 | # define DEBUG_PEEP(str, scan, depth, flags) NOOP | |
1216 | #endif | |
1217 | ||
1218 | #define REGTAIL(x,y,z) regtail((x),(y),(z),depth+1) | |
1219 | #ifdef DEBUGGING | |
1220 | #define REGTAIL_STUDY(x,y,z) regtail_study((x),(y),(z),depth+1) | |
1221 | #else | |
1222 | #define REGTAIL_STUDY(x,y,z) regtail((x),(y),(z),depth+1) | |
1223 | #endif | |
1224 | ||
1225 | #define MADE_TRIE 1 | |
1226 | #define MADE_JUMP_TRIE 2 | |
1227 | #define MADE_EXACT_TRIE 4 | |
1228 | ||
1229 | #define INVLIST_INDEX 0 | |
1230 | #define ONLY_LOCALE_MATCHES_INDEX 1 | |
1231 | #define DEFERRED_USER_DEFINED_INDEX 2 | |
1232 | ||
1233 | /* These two functions currently do the exact same thing */ | |
1234 | #define ssc_init_zero ssc_init | |
1235 | ||
1236 | #define ssc_add_cp(ssc, cp) ssc_add_range((ssc), (cp), (cp)) | |
1237 | #define ssc_match_all_cp(ssc) ssc_add_range(ssc, 0, UV_MAX) | |
1238 | ||
1239 | #ifdef DEBUGGING | |
1240 | #define REGNODE_GUTS(state,op,extra_size) \ | |
1241 | regnode_guts_debug(state,op,extra_size) | |
1242 | #else | |
1243 | #define REGNODE_GUTS(state,op,extra_size) \ | |
1244 | regnode_guts(state,extra_size) | |
1245 | #endif | |
1246 | ||
1247 | #define CLEAR_OPTSTART /* if 'optstart' is set: report the node count since it, then reset it */ \ | |
1248 | STMT_START { if (optstart) { /* test lives inside the wrapper so a following 'else' cannot bind to it */ \ | |
1249 | DEBUG_OPTIMISE_r(Perl_re_printf( aTHX_ \ | |
1250 | " (%" IVdf " nodes)\n", (IV)(node - optstart))); \ | |
1251 | optstart=NULL; \ | |
1252 | } } STMT_END | |
1253 | ||
1254 | #define DUMPUNTIL(b,e) \ | |
1255 | CLEAR_OPTSTART; \ | |
1256 | node = dumpuntil(r,start,(b),(e),last,sv,indent+1,depth+1); | |
1257 | ||
0678333e YO |
1258 | #define REGNODE_STEP_OVER(ret,t1,t2) \ |
1259 | NEXT_OFF(REGNODE_p(ret)) = ((sizeof(t1)+sizeof(t2))/sizeof(regnode)) | |
fe5492d9 | 1260 | |
85900e28 | 1261 | #endif /* REGCOMP_INTERNAL_H */ |