5 * 'A fair jaw-cracker dwarf-language must be.' --Samwise Gamgee
7 * [p.285 of _The Lord of the Rings_, II/iii: "The Ring Goes South"]
10 /* This file contains functions for compiling a regular expression. See
11 * also regexec.c which funnily enough, contains functions for executing
12 * a regular expression.
14 * This file is also copied at build time to ext/re/re_comp.c, where
15 * it's built with -DPERL_EXT_RE_BUILD -DPERL_EXT_RE_DEBUG -DPERL_EXT.
16 * This causes the main functions to be compiled under new names and with
17 * debugging support added, which makes "use re 'debug'" work.
20 /* NOTE: this is derived from Henry Spencer's regexp code, and should not
21 * confused with the original package (see point 3 below). Thanks, Henry!
24 /* Additional note: this code is very heavily munged from Henry's version
25 * in places. In some spots I've traded clarity for efficiency, so don't
26 * blame Henry for some of the lack of readability.
29 /* The names of the functions have been changed from regcomp and
30 * regexec to pregcomp and pregexec in order to avoid conflicts
31 * with the POSIX routines of the same names.
34 #ifdef PERL_EXT_RE_BUILD
39 * pregcomp and pregexec -- regsub and regerror are not used in perl
41 * Copyright (c) 1986 by University of Toronto.
42 * Written by Henry Spencer. Not derived from licensed software.
44 * Permission is granted to anyone to use this software for any
45 * purpose on any computer system, and to redistribute it freely,
46 * subject to the following restrictions:
48 * 1. The author is not responsible for the consequences of use of
49 * this software, no matter how awful, even if they arise
52 * 2. The origin of this software must not be misrepresented, either
53 * by explicit claim or by omission.
55 * 3. Altered versions must be plainly marked as such, and must not
56 * be misrepresented as being the original software.
59 **** Alterations to Henry's code are...
61 **** Copyright (C) 1991, 1992, 1993, 1994, 1995, 1996, 1997, 1998, 1999,
62 **** 2000, 2001, 2002, 2003, 2004, 2005, 2006, 2007, 2008
63 **** by Larry Wall and others
65 **** You may distribute under the terms of either the GNU General Public
66 **** License or the Artistic License, as specified in the README file.
69 * Beware that some of this code is subtly aware of the way operator
70 * precedence is structured in regular expressions. Serious changes in
71 * regular-expression syntax might require a total rethink.
74 #define PERL_IN_REGCOMP_C
77 #ifndef PERL_IN_XSUB_RE
82 #ifdef PERL_IN_XSUB_RE
84 EXTERN_C const struct regexp_engine my_reg_engine;
89 #include "dquote_inline.h"
90 #include "invlist_inline.h"
91 #include "unicode_constants.h"
93 #define HAS_NONLATIN1_FOLD_CLOSURE(i) \
94 _HAS_NONLATIN1_FOLD_CLOSURE_ONLY_FOR_USE_BY_REGCOMP_DOT_C_AND_REGEXEC_DOT_C(i)
95 #define HAS_NONLATIN1_SIMPLE_FOLD_CLOSURE(i) \
96 _HAS_NONLATIN1_SIMPLE_FOLD_CLOSURE_ONLY_FOR_USE_BY_REGCOMP_DOT_C_AND_REGEXEC_DOT_C(i)
97 #define IS_NON_FINAL_FOLD(c) _IS_NON_FINAL_FOLD_ONLY_FOR_USE_BY_REGCOMP_DOT_C(c)
98 #define IS_IN_SOME_FOLD_L1(c) _IS_IN_SOME_FOLD_ONLY_FOR_USE_BY_REGCOMP_DOT_C(c)
101 #define STATIC static
105 #define MIN(a,b) ((a) < (b) ? (a) : (b))
109 #define MAX(a,b) ((a) > (b) ? (a) : (b))
112 /* this is a chain of data about sub patterns we are processing that
113 need to be handled separately/specially in study_chunk. Its so
114 we can simulate recursion without losing state. */
116 typedef struct scan_frame {
117 regnode *last_regnode; /* last node to process in this frame */
118 regnode *next_regnode; /* next node to process when last is reached */
119 U32 prev_recursed_depth;
120 I32 stopparen; /* what stopparen do we use */
121 U32 is_top_frame; /* what flags do we use? */
123 struct scan_frame *this_prev_frame; /* this previous frame */
124 struct scan_frame *prev_frame; /* previous frame */
125 struct scan_frame *next_frame; /* next frame */
128 /* Certain characters are output as a sequence with the first being a
130 #define isBACKSLASHED_PUNCT(c) \
131 ((c) == '-' || (c) == ']' || (c) == '\\' || (c) == '^')
134 struct RExC_state_t {
135 U32 flags; /* RXf_* are we folding, multilining? */
136 U32 pm_flags; /* PMf_* stuff from the calling PMOP */
137 char *precomp; /* uncompiled string. */
138 char *precomp_end; /* pointer to end of uncompiled string. */
139 REGEXP *rx_sv; /* The SV that is the regexp. */
140 regexp *rx; /* perl core regexp structure */
141 regexp_internal *rxi; /* internal data for regexp object
143 char *start; /* Start of input for compile */
144 char *end; /* End of input for compile */
145 char *parse; /* Input-scan pointer. */
146 char *adjusted_start; /* 'start', adjusted. See code use */
147 STRLEN precomp_adj; /* an offset beyond precomp. See code use */
148 SSize_t whilem_seen; /* number of WHILEM in this expr */
149 regnode *emit_start; /* Start of emitted-code area */
150 regnode *emit_bound; /* First regnode outside of the
152 regnode *emit; /* Code-emit pointer; if = &emit_dummy,
153 implies compiling, so don't emit */
154 regnode_ssc emit_dummy; /* placeholder for emit to point to;
155 large enough for the largest
156 non-EXACTish node, so can use it as
158 I32 naughty; /* How bad is this pattern? */
159 I32 sawback; /* Did we see \1, ...? */
161 SSize_t size; /* Code size. */
162 I32 npar; /* Capture buffer count, (OPEN) plus
163 one. ("par" 0 is the whole
165 I32 nestroot; /* root parens we are in - used by
169 regnode **open_parens; /* pointers to open parens */
170 regnode **close_parens; /* pointers to close parens */
171 regnode *end_op; /* END node in program */
172 I32 utf8; /* whether the pattern is utf8 or not */
173 I32 orig_utf8; /* whether the pattern was originally in utf8 */
174 /* XXX use this for future optimisation of case
175 * where pattern must be upgraded to utf8. */
176 I32 uni_semantics; /* If a d charset modifier should use unicode
177 rules, even if the pattern is not in
179 HV *paren_names; /* Paren names */
181 regnode **recurse; /* Recurse regops */
182 I32 recurse_count; /* Number of recurse regops we have generated */
183 U8 *study_chunk_recursed; /* bitmap of which subs we have moved
185 U32 study_chunk_recursed_bytes; /* bytes in bitmap */
189 I32 override_recoding;
191 I32 recode_x_to_native;
193 I32 in_multi_char_class;
194 struct reg_code_block *code_blocks; /* positions of literal (?{})
196 int num_code_blocks; /* size of code_blocks[] */
197 int code_index; /* next code_blocks[] slot */
198 SSize_t maxlen; /* mininum possible number of chars in string to match */
199 scan_frame *frame_head;
200 scan_frame *frame_last;
202 #ifdef ADD_TO_REGEXEC
203 char *starttry; /* -Dr: where regtry was called. */
204 #define RExC_starttry (pRExC_state->starttry)
206 SV *runtime_code_qr; /* qr with the runtime code blocks */
208 const char *lastparse;
210 AV *paren_name_list; /* idx -> name */
211 U32 study_chunk_recursed_count;
214 #define RExC_lastparse (pRExC_state->lastparse)
215 #define RExC_lastnum (pRExC_state->lastnum)
216 #define RExC_paren_name_list (pRExC_state->paren_name_list)
217 #define RExC_study_chunk_recursed_count (pRExC_state->study_chunk_recursed_count)
218 #define RExC_mysv (pRExC_state->mysv1)
219 #define RExC_mysv1 (pRExC_state->mysv1)
220 #define RExC_mysv2 (pRExC_state->mysv2)
223 bool seen_unfolded_sharp_s;
228 #define RExC_flags (pRExC_state->flags)
229 #define RExC_pm_flags (pRExC_state->pm_flags)
230 #define RExC_precomp (pRExC_state->precomp)
231 #define RExC_precomp_adj (pRExC_state->precomp_adj)
232 #define RExC_adjusted_start (pRExC_state->adjusted_start)
233 #define RExC_precomp_end (pRExC_state->precomp_end)
234 #define RExC_rx_sv (pRExC_state->rx_sv)
235 #define RExC_rx (pRExC_state->rx)
236 #define RExC_rxi (pRExC_state->rxi)
237 #define RExC_start (pRExC_state->start)
238 #define RExC_end (pRExC_state->end)
239 #define RExC_parse (pRExC_state->parse)
240 #define RExC_whilem_seen (pRExC_state->whilem_seen)
242 /* Set during the sizing pass when there is a LATIN SMALL LETTER SHARP S in any
243 * EXACTF node, hence was parsed under /di rules. If later in the parse,
244 * something forces the pattern into using /ui rules, the sharp s should be
245 * folded into the sequence 'ss', which takes up more space than previously
246 * calculated. This means that the sizing pass needs to be restarted. (The
247 * node also becomes an EXACTFU_SS.) For all other characters, an EXACTF node
248 * that gets converted to /ui (and EXACTFU) occupies the same amount of space,
249 * so there is no need to resize [perl #125990]. */
250 #define RExC_seen_unfolded_sharp_s (pRExC_state->seen_unfolded_sharp_s)
252 #ifdef RE_TRACK_PATTERN_OFFSETS
253 #define RExC_offsets (pRExC_state->rxi->u.offsets) /* I am not like the
256 #define RExC_emit (pRExC_state->emit)
257 #define RExC_emit_dummy (pRExC_state->emit_dummy)
258 #define RExC_emit_start (pRExC_state->emit_start)
259 #define RExC_emit_bound (pRExC_state->emit_bound)
260 #define RExC_sawback (pRExC_state->sawback)
261 #define RExC_seen (pRExC_state->seen)
262 #define RExC_size (pRExC_state->size)
263 #define RExC_maxlen (pRExC_state->maxlen)
264 #define RExC_npar (pRExC_state->npar)
265 #define RExC_nestroot (pRExC_state->nestroot)
266 #define RExC_extralen (pRExC_state->extralen)
267 #define RExC_seen_zerolen (pRExC_state->seen_zerolen)
268 #define RExC_utf8 (pRExC_state->utf8)
269 #define RExC_uni_semantics (pRExC_state->uni_semantics)
270 #define RExC_orig_utf8 (pRExC_state->orig_utf8)
271 #define RExC_open_parens (pRExC_state->open_parens)
272 #define RExC_close_parens (pRExC_state->close_parens)
273 #define RExC_end_op (pRExC_state->end_op)
274 #define RExC_paren_names (pRExC_state->paren_names)
275 #define RExC_recurse (pRExC_state->recurse)
276 #define RExC_recurse_count (pRExC_state->recurse_count)
277 #define RExC_study_chunk_recursed (pRExC_state->study_chunk_recursed)
278 #define RExC_study_chunk_recursed_bytes \
279 (pRExC_state->study_chunk_recursed_bytes)
280 #define RExC_in_lookbehind (pRExC_state->in_lookbehind)
281 #define RExC_contains_locale (pRExC_state->contains_locale)
282 #define RExC_contains_i (pRExC_state->contains_i)
283 #define RExC_override_recoding (pRExC_state->override_recoding)
285 # define RExC_recode_x_to_native (pRExC_state->recode_x_to_native)
287 #define RExC_in_multi_char_class (pRExC_state->in_multi_char_class)
288 #define RExC_frame_head (pRExC_state->frame_head)
289 #define RExC_frame_last (pRExC_state->frame_last)
290 #define RExC_frame_count (pRExC_state->frame_count)
291 #define RExC_strict (pRExC_state->strict)
292 #define RExC_study_started (pRExC_state->study_started)
294 /* Heuristic check on the complexity of the pattern: if TOO_NAUGHTY, we set
295 * a flag to disable back-off on the fixed/floating substrings - if it's
296 * a high complexity pattern we assume the benefit of avoiding a full match
297 * is worth the cost of checking for the substrings even if they rarely help.
299 #define RExC_naughty (pRExC_state->naughty)
300 #define TOO_NAUGHTY (10)
301 #define MARK_NAUGHTY(add) \
302 if (RExC_naughty < TOO_NAUGHTY) \
303 RExC_naughty += (add)
304 #define MARK_NAUGHTY_EXP(exp, add) \
305 if (RExC_naughty < TOO_NAUGHTY) \
306 RExC_naughty += RExC_naughty / (exp) + (add)
308 #define ISMULT1(c) ((c) == '*' || (c) == '+' || (c) == '?')
309 #define ISMULT2(s) ((*s) == '*' || (*s) == '+' || (*s) == '?' || \
310 ((*s) == '{' && regcurly(s)))
313 * Flags to be passed up and down.
315 #define WORST 0 /* Worst case. */
316 #define HASWIDTH 0x01 /* Known to match non-null strings. */
318 /* Simple enough to be STAR/PLUS operand; in an EXACTish node must be a single
319 * character. (There needs to be a case: in the switch statement in regexec.c
320 * for any node marked SIMPLE.) Note that this is not the same thing as
323 #define SPSTART 0x04 /* Starts with * or + */
324 #define POSTPONED 0x08 /* (?1),(?&name), (??{...}) or similar */
325 #define TRYAGAIN 0x10 /* Weeded out a declaration. */
326 #define RESTART_PASS1 0x20 /* Need to restart sizing pass */
327 #define NEED_UTF8 0x40 /* In conjunction with RESTART_PASS1, need to
328 calcuate sizes as UTF-8 */
330 #define REG_NODE_NUM(x) ((x) ? (int)((x)-RExC_emit_start) : -1)
332 /* whether trie related optimizations are enabled */
333 #if PERL_ENABLE_EXTENDED_TRIE_OPTIMISATION
334 #define TRIE_STUDY_OPT
335 #define FULL_TRIE_STUDY
341 #define PBYTE(u8str,paren) ((U8*)(u8str))[(paren) >> 3]
342 #define PBITVAL(paren) (1 << ((paren) & 7))
343 #define PAREN_TEST(u8str,paren) ( PBYTE(u8str,paren) & PBITVAL(paren))
344 #define PAREN_SET(u8str,paren) PBYTE(u8str,paren) |= PBITVAL(paren)
345 #define PAREN_UNSET(u8str,paren) PBYTE(u8str,paren) &= (~PBITVAL(paren))
347 #define REQUIRE_UTF8(flagp) STMT_START { \
350 *flagp = RESTART_PASS1|NEED_UTF8; \
355 /* Change from /d into /u rules, and restart the parse if we've already seen
356 * something whose size would increase as a result, by setting *flagp and
357 * returning 'restart_retval'. RExC_uni_semantics is a flag that indicates
358 * we've change to /u during the parse. */
359 #define REQUIRE_UNI_RULES(flagp, restart_retval) \
361 if (DEPENDS_SEMANTICS) { \
363 set_regex_charset(&RExC_flags, REGEX_UNICODE_CHARSET); \
364 RExC_uni_semantics = 1; \
365 if (RExC_seen_unfolded_sharp_s) { \
366 *flagp |= RESTART_PASS1; \
367 return restart_retval; \
372 /* This converts the named class defined in regcomp.h to its equivalent class
373 * number defined in handy.h. */
374 #define namedclass_to_classnum(class) ((int) ((class) / 2))
375 #define classnum_to_namedclass(classnum) ((classnum) * 2)
377 #define _invlist_union_complement_2nd(a, b, output) \
378 _invlist_union_maybe_complement_2nd(a, b, TRUE, output)
379 #define _invlist_intersection_complement_2nd(a, b, output) \
380 _invlist_intersection_maybe_complement_2nd(a, b, TRUE, output)
382 /* About scan_data_t.
384 During optimisation we recurse through the regexp program performing
385 various inplace (keyhole style) optimisations. In addition study_chunk
386 and scan_commit populate this data structure with information about
387 what strings MUST appear in the pattern. We look for the longest
388 string that must appear at a fixed location, and we look for the
389 longest string that may appear at a floating location. So for instance
394 Both 'FOO' and 'A' are fixed strings. Both 'B' and 'BAR' are floating
395 strings (because they follow a .* construct). study_chunk will identify
396 both FOO and BAR as being the longest fixed and floating strings respectively.
398 The strings can be composites, for instance
402 will result in a composite fixed substring 'foo'.
404 For each string some basic information is maintained:
406 - offset or min_offset
407 This is the position the string must appear at, or not before.
408 It also implicitly (when combined with minlenp) tells us how many
409 characters must match before the string we are searching for.
410 Likewise when combined with minlenp and the length of the string it
411 tells us how many characters must appear after the string we have
415 Only used for floating strings. This is the rightmost point that
416 the string can appear at. If set to SSize_t_MAX it indicates that the
417 string can occur infinitely far to the right.
420 A pointer to the minimum number of characters of the pattern that the
421 string was found inside. This is important as in the case of positive
422 lookahead or positive lookbehind we can have multiple patterns
427 The minimum length of the pattern overall is 3, the minimum length
428 of the lookahead part is 3, but the minimum length of the part that
429 will actually match is 1. So 'FOO's minimum length is 3, but the
430 minimum length for the F is 1. This is important as the minimum length
431 is used to determine offsets in front of and behind the string being
432 looked for. Since strings can be composites this is the length of the
433 pattern at the time it was committed with a scan_commit. Note that
434 the length is calculated by study_chunk, so that the minimum lengths
435 are not known until the full pattern has been compiled, thus the
436 pointer to the value.
440 In the case of lookbehind the string being searched for can be
441 offset past the start point of the final matching string.
442 If this value was just blithely removed from the min_offset it would
443 invalidate some of the calculations for how many chars must match
444 before or after (as they are derived from min_offset and minlen and
445 the length of the string being searched for).
446 When the final pattern is compiled and the data is moved from the
447 scan_data_t structure into the regexp structure the information
448 about lookbehind is factored in, with the information that would
449 have been lost precalculated in the end_shift field for the
452 The fields pos_min and pos_delta are used to store the minimum offset
453 and the delta to the maximum offset at the current point in the pattern.
457 typedef struct scan_data_t {
458 /*I32 len_min; unused */
459 /*I32 len_delta; unused */
463 SSize_t last_end; /* min value, <0 unless valid. */
464 SSize_t last_start_min;
465 SSize_t last_start_max;
466 SV **longest; /* Either &l_fixed, or &l_float. */
467 SV *longest_fixed; /* longest fixed string found in pattern */
468 SSize_t offset_fixed; /* offset where it starts */
469 SSize_t *minlen_fixed; /* pointer to the minlen relevant to the string */
470 I32 lookbehind_fixed; /* is the position of the string modfied by LB */
471 SV *longest_float; /* longest floating string found in pattern */
472 SSize_t offset_float_min; /* earliest point in string it can appear */
473 SSize_t offset_float_max; /* latest point in string it can appear */
474 SSize_t *minlen_float; /* pointer to the minlen relevant to the string */
475 SSize_t lookbehind_float; /* is the pos of the string modified by LB */
478 SSize_t *last_closep;
479 regnode_ssc *start_class;
483 * Forward declarations for pregcomp()'s friends.
486 static const scan_data_t zero_scan_data =
487 { 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 ,0};
489 #define SF_BEFORE_EOL (SF_BEFORE_SEOL|SF_BEFORE_MEOL)
490 #define SF_BEFORE_SEOL 0x0001
491 #define SF_BEFORE_MEOL 0x0002
492 #define SF_FIX_BEFORE_EOL (SF_FIX_BEFORE_SEOL|SF_FIX_BEFORE_MEOL)
493 #define SF_FL_BEFORE_EOL (SF_FL_BEFORE_SEOL|SF_FL_BEFORE_MEOL)
495 #define SF_FIX_SHIFT_EOL (+2)
496 #define SF_FL_SHIFT_EOL (+4)
498 #define SF_FIX_BEFORE_SEOL (SF_BEFORE_SEOL << SF_FIX_SHIFT_EOL)
499 #define SF_FIX_BEFORE_MEOL (SF_BEFORE_MEOL << SF_FIX_SHIFT_EOL)
501 #define SF_FL_BEFORE_SEOL (SF_BEFORE_SEOL << SF_FL_SHIFT_EOL)
502 #define SF_FL_BEFORE_MEOL (SF_BEFORE_MEOL << SF_FL_SHIFT_EOL) /* 0x20 */
503 #define SF_IS_INF 0x0040
504 #define SF_HAS_PAR 0x0080
505 #define SF_IN_PAR 0x0100
506 #define SF_HAS_EVAL 0x0200
509 /* SCF_DO_SUBSTR is the flag that tells the regexp analyzer to track the
510 * longest substring in the pattern. When it is not set the optimiser keeps
511 * track of position, but does not keep track of the actual strings seen,
513 * So for instance /foo/ will be parsed with SCF_DO_SUBSTR being true, but
516 * Similarly, /foo.*(blah|erm|huh).*fnorble/ will have "foo" and "fnorble"
517 * parsed with SCF_DO_SUBSTR on, but while processing the (...) it will be
518 * turned off because of the alternation (BRANCH). */
519 #define SCF_DO_SUBSTR 0x0400
521 #define SCF_DO_STCLASS_AND 0x0800
522 #define SCF_DO_STCLASS_OR 0x1000
523 #define SCF_DO_STCLASS (SCF_DO_STCLASS_AND|SCF_DO_STCLASS_OR)
524 #define SCF_WHILEM_VISITED_POS 0x2000
526 #define SCF_TRIE_RESTUDY 0x4000 /* Do restudy? */
527 #define SCF_SEEN_ACCEPT 0x8000
528 #define SCF_TRIE_DOING_RESTUDY 0x10000
529 #define SCF_IN_DEFINE 0x20000
534 #define UTF cBOOL(RExC_utf8)
536 /* The enums for all these are ordered so things work out correctly */
537 #define LOC (get_regex_charset(RExC_flags) == REGEX_LOCALE_CHARSET)
538 #define DEPENDS_SEMANTICS (get_regex_charset(RExC_flags) \
539 == REGEX_DEPENDS_CHARSET)
540 #define UNI_SEMANTICS (get_regex_charset(RExC_flags) == REGEX_UNICODE_CHARSET)
541 #define AT_LEAST_UNI_SEMANTICS (get_regex_charset(RExC_flags) \
542 >= REGEX_UNICODE_CHARSET)
543 #define ASCII_RESTRICTED (get_regex_charset(RExC_flags) \
544 == REGEX_ASCII_RESTRICTED_CHARSET)
545 #define AT_LEAST_ASCII_RESTRICTED (get_regex_charset(RExC_flags) \
546 >= REGEX_ASCII_RESTRICTED_CHARSET)
547 #define ASCII_FOLD_RESTRICTED (get_regex_charset(RExC_flags) \
548 == REGEX_ASCII_MORE_RESTRICTED_CHARSET)
550 #define FOLD cBOOL(RExC_flags & RXf_PMf_FOLD)
552 /* For programs that want to be strictly Unicode compatible by dying if any
553 * attempt is made to match a non-Unicode code point against a Unicode
555 #define ALWAYS_WARN_SUPER ckDEAD(packWARN(WARN_NON_UNICODE))
557 #define OOB_NAMEDCLASS -1
559 /* There is no code point that is out-of-bounds, so this is problematic. But
560 * its only current use is to initialize a variable that is always set before
562 #define OOB_UNICODE 0xDEADBEEF
564 #define CHR_SVLEN(sv) (UTF ? sv_len_utf8(sv) : SvCUR(sv))
565 #define CHR_DIST(a,b) (UTF ? utf8_distance(a,b) : a - b)
568 /* length of regex to show in messages that don't mark a position within */
569 #define RegexLengthToShowInErrorMessages 127
572 * If MARKER[12] are adjusted, be sure to adjust the constants at the top
573 * of t/op/regmesg.t, the tests in t/op/re_tests, and those in
574 * op/pragma/warn/regcomp.
576 #define MARKER1 "<-- HERE" /* marker as it appears in the description */
577 #define MARKER2 " <-- HERE " /* marker as it appears within the regex */
579 #define REPORT_LOCATION " in regex; marked by " MARKER1 \
580 " in m/%"UTF8f MARKER2 "%"UTF8f"/"
582 /* The code in this file in places uses one level of recursion with parsing
583 * rebased to an alternate string constructed by us in memory. This can take
584 * the form of something that is completely different from the input, or
585 * something that uses the input as part of the alternate. In the first case,
586 * there should be no possibility of an error, as we are in complete control of
587 * the alternate string. But in the second case we don't control the input
588 * portion, so there may be errors in that. Here's an example:
590 * is handled specially because \x{df} folds to a sequence of more than one
591 * character, 'ss'. What is done is to create and parse an alternate string,
592 * which looks like this:
593 * /(?:\x{DF}|[abc\x{DF}def])/ui
594 * where it uses the input unchanged in the middle of something it constructs,
595 * which is a branch for the DF outside the character class, and clustering
596 * parens around the whole thing. (It knows enough to skip the DF inside the
597 * class while in this substitute parse.) 'abc' and 'def' may have errors that
598 * need to be reported. The general situation looks like this:
601 * Input: ----------------------------------------------------
602 * Constructed: ---------------------------------------------------
605 * The input string sI..eI is the input pattern. The string sC..EC is the
606 * constructed substitute parse string. The portions sC..tC and eC..EC are
607 * constructed by us. The portion tC..eC is an exact duplicate of the input
608 * pattern tI..eI. In the diagram, these are vertically aligned. Suppose that
609 * while parsing, we find an error at xC. We want to display a message showing
610 * the real input string. Thus we need to find the point xI in it which
611 * corresponds to xC. xC >= tC, since the portion of the string sC..tC has
612 * been constructed by us, and so shouldn't have errors. We get:
614 * xI = sI + (tI - sI) + (xC - tC)
616 * and, the offset into sI is:
618 * (xI - sI) = (tI - sI) + (xC - tC)
620 * When the substitute is constructed, we save (tI -sI) as RExC_precomp_adj,
621 * and we save tC as RExC_adjusted_start.
623 * During normal processing of the input pattern, everything points to that,
624 * with RExC_precomp_adj set to 0, and RExC_adjusted_start set to sI.
627 #define tI_sI RExC_precomp_adj
628 #define tC RExC_adjusted_start
629 #define sC RExC_precomp
630 #define xI_offset(xC) ((IV) (tI_sI + (xC - tC)))
631 #define xI(xC) (sC + xI_offset(xC))
632 #define eC RExC_precomp_end
634 #define REPORT_LOCATION_ARGS(xC) \
636 (xI(xC) > eC) /* Don't run off end */ \
637 ? eC - sC /* Length before the <--HERE */ \
639 sC), /* The input pattern printed up to the <--HERE */ \
641 (xI(xC) > eC) ? 0 : eC - xI(xC), /* Length after <--HERE */ \
642 (xI(xC) > eC) ? eC : xI(xC)) /* pattern after <--HERE */
644 /* Used to point after bad bytes for an error message, but avoid skipping
645 * past a nul byte. */
646 #define SKIP_IF_CHAR(s) (!*(s) ? 0 : UTF ? UTF8SKIP(s) : 1)
649 * Calls SAVEDESTRUCTOR_X if needed, then calls Perl_croak with the given
650 * arg. Show regex, up to a maximum length. If it's too long, chop and add
653 #define _FAIL(code) STMT_START { \
654 const char *ellipses = ""; \
655 IV len = RExC_precomp_end - RExC_precomp; \
658 SAVEFREESV(RExC_rx_sv); \
659 if (len > RegexLengthToShowInErrorMessages) { \
660 /* chop 10 shorter than the max, to ensure meaning of "..." */ \
661 len = RegexLengthToShowInErrorMessages - 10; \
667 #define FAIL(msg) _FAIL( \
668 Perl_croak(aTHX_ "%s in regex m/%"UTF8f"%s/", \
669 msg, UTF8fARG(UTF, len, RExC_precomp), ellipses))
671 #define FAIL2(msg,arg) _FAIL( \
672 Perl_croak(aTHX_ msg " in regex m/%"UTF8f"%s/", \
673 arg, UTF8fARG(UTF, len, RExC_precomp), ellipses))
676 * Simple_vFAIL -- like FAIL, but marks the current location in the scan
678 #define Simple_vFAIL(m) STMT_START { \
679 Perl_croak(aTHX_ "%s" REPORT_LOCATION, \
680 m, REPORT_LOCATION_ARGS(RExC_parse)); \
684 * Calls SAVEDESTRUCTOR_X if needed, then Simple_vFAIL()
686 #define vFAIL(m) STMT_START { \
688 SAVEFREESV(RExC_rx_sv); \
693 * Like Simple_vFAIL(), but accepts two arguments.
695 #define Simple_vFAIL2(m,a1) STMT_START { \
696 S_re_croak2(aTHX_ UTF, m, REPORT_LOCATION, a1, \
697 REPORT_LOCATION_ARGS(RExC_parse)); \
701 * Calls SAVEDESTRUCTOR_X if needed, then Simple_vFAIL2().
703 #define vFAIL2(m,a1) STMT_START { \
705 SAVEFREESV(RExC_rx_sv); \
706 Simple_vFAIL2(m, a1); \
711 * Like Simple_vFAIL(), but accepts three arguments.
713 #define Simple_vFAIL3(m, a1, a2) STMT_START { \
714 S_re_croak2(aTHX_ UTF, m, REPORT_LOCATION, a1, a2, \
715 REPORT_LOCATION_ARGS(RExC_parse)); \
719 * Calls SAVEDESTRUCTOR_X if needed, then Simple_vFAIL3().
721 #define vFAIL3(m,a1,a2) STMT_START { \
723 SAVEFREESV(RExC_rx_sv); \
724 Simple_vFAIL3(m, a1, a2); \
728 * Like Simple_vFAIL(), but accepts four arguments.
730 #define Simple_vFAIL4(m, a1, a2, a3) STMT_START { \
731 S_re_croak2(aTHX_ UTF, m, REPORT_LOCATION, a1, a2, a3, \
732 REPORT_LOCATION_ARGS(RExC_parse)); \
735 #define vFAIL4(m,a1,a2,a3) STMT_START { \
737 SAVEFREESV(RExC_rx_sv); \
738 Simple_vFAIL4(m, a1, a2, a3); \
741 /* A specialized version of vFAIL2 that works with UTF8f */
742 #define vFAIL2utf8f(m, a1) STMT_START { \
744 SAVEFREESV(RExC_rx_sv); \
745 S_re_croak2(aTHX_ UTF, m, REPORT_LOCATION, a1, \
746 REPORT_LOCATION_ARGS(RExC_parse)); \
749 #define vFAIL3utf8f(m, a1, a2) STMT_START { \
751 SAVEFREESV(RExC_rx_sv); \
752 S_re_croak2(aTHX_ UTF, m, REPORT_LOCATION, a1, a2, \
753 REPORT_LOCATION_ARGS(RExC_parse)); \
756 /* These have asserts in them because of [perl #122671] Many warnings in
757 * regcomp.c can occur twice. If they get output in pass1 and later in that
758 * pass, the pattern has to be converted to UTF-8 and the pass restarted, they
759 * would get output again. So they should be output in pass2, and these
760 * asserts make sure new warnings follow that paradigm. */
762 /* m is not necessarily a "literal string", in this macro */
763 #define reg_warn_non_literal_string(loc, m) STMT_START { \
764 __ASSERT_(PASS2) Perl_warner(aTHX_ packWARN(WARN_REGEXP), \
765 "%s" REPORT_LOCATION, \
766 m, REPORT_LOCATION_ARGS(loc)); \
769 #define ckWARNreg(loc,m) STMT_START { \
770 __ASSERT_(PASS2) Perl_ck_warner(aTHX_ packWARN(WARN_REGEXP), \
772 REPORT_LOCATION_ARGS(loc)); \
775 #define vWARN(loc, m) STMT_START { \
776 __ASSERT_(PASS2) Perl_warner(aTHX_ packWARN(WARN_REGEXP), \
778 REPORT_LOCATION_ARGS(loc)); \
781 #define vWARN_dep(loc, m) STMT_START { \
782 __ASSERT_(PASS2) Perl_warner(aTHX_ packWARN(WARN_DEPRECATED), \
784 REPORT_LOCATION_ARGS(loc)); \
787 #define ckWARNdep(loc,m) STMT_START { \
788 __ASSERT_(PASS2) Perl_ck_warner_d(aTHX_ packWARN(WARN_DEPRECATED), \
790 REPORT_LOCATION_ARGS(loc)); \
793 #define ckWARNregdep(loc,m) STMT_START { \
794 __ASSERT_(PASS2) Perl_ck_warner_d(aTHX_ packWARN2(WARN_DEPRECATED, \
797 REPORT_LOCATION_ARGS(loc)); \
800 #define ckWARN2reg_d(loc,m, a1) STMT_START { \
801 __ASSERT_(PASS2) Perl_ck_warner_d(aTHX_ packWARN(WARN_REGEXP), \
803 a1, REPORT_LOCATION_ARGS(loc)); \
806 #define ckWARN2reg(loc, m, a1) STMT_START { \
807 __ASSERT_(PASS2) Perl_ck_warner(aTHX_ packWARN(WARN_REGEXP), \
809 a1, REPORT_LOCATION_ARGS(loc)); \
812 #define vWARN3(loc, m, a1, a2) STMT_START { \
813 __ASSERT_(PASS2) Perl_warner(aTHX_ packWARN(WARN_REGEXP), \
815 a1, a2, REPORT_LOCATION_ARGS(loc)); \
818 #define ckWARN3reg(loc, m, a1, a2) STMT_START { \
819 __ASSERT_(PASS2) Perl_ck_warner(aTHX_ packWARN(WARN_REGEXP), \
822 REPORT_LOCATION_ARGS(loc)); \
825 #define vWARN4(loc, m, a1, a2, a3) STMT_START { \
826 __ASSERT_(PASS2) Perl_warner(aTHX_ packWARN(WARN_REGEXP), \
829 REPORT_LOCATION_ARGS(loc)); \
832 #define ckWARN4reg(loc, m, a1, a2, a3) STMT_START { \
833 __ASSERT_(PASS2) Perl_ck_warner(aTHX_ packWARN(WARN_REGEXP), \
836 REPORT_LOCATION_ARGS(loc)); \
839 #define vWARN5(loc, m, a1, a2, a3, a4) STMT_START { \
840 __ASSERT_(PASS2) Perl_warner(aTHX_ packWARN(WARN_REGEXP), \
843 REPORT_LOCATION_ARGS(loc)); \
846 /* Macros for recording node offsets. 20001227 mjd@plover.com
847 * Nodes are numbered 1, 2, 3, 4. Node #n's position is recorded in
848 * element 2*n-1 of the array. Element #2n holds the byte length node #n.
849 * Element 0 holds the number n.
850 * Position is 1 indexed.
852 #ifndef RE_TRACK_PATTERN_OFFSETS
853 #define Set_Node_Offset_To_R(node,byte)
854 #define Set_Node_Offset(node,byte)
855 #define Set_Cur_Node_Offset
856 #define Set_Node_Length_To_R(node,len)
857 #define Set_Node_Length(node,len)
858 #define Set_Node_Cur_Length(node,start)
859 #define Node_Offset(n)
860 #define Node_Length(n)
861 #define Set_Node_Offset_Length(node,offset,len)
862 #define ProgLen(ri) ri->u.proglen
863 #define SetProgLen(ri,x) ri->u.proglen = x
865 #define ProgLen(ri) ri->u.offsets[0]
866 #define SetProgLen(ri,x) ri->u.offsets[0] = x
867 #define Set_Node_Offset_To_R(node,byte) STMT_START { \
869 MJD_OFFSET_DEBUG(("** (%d) offset of node %d is %d.\n", \
870 __LINE__, (int)(node), (int)(byte))); \
872 Perl_croak(aTHX_ "value of node is %d in Offset macro", \
875 RExC_offsets[2*(node)-1] = (byte); \
880 #define Set_Node_Offset(node,byte) \
881 Set_Node_Offset_To_R((node)-RExC_emit_start, (byte)-RExC_start)
882 #define Set_Cur_Node_Offset Set_Node_Offset(RExC_emit, RExC_parse)
884 #define Set_Node_Length_To_R(node,len) STMT_START { \
886 MJD_OFFSET_DEBUG(("** (%d) size of node %d is %d.\n", \
887 __LINE__, (int)(node), (int)(len))); \
889 Perl_croak(aTHX_ "value of node is %d in Length macro", \
892 RExC_offsets[2*(node)] = (len); \
897 #define Set_Node_Length(node,len) \
898 Set_Node_Length_To_R((node)-RExC_emit_start, len)
899 #define Set_Node_Cur_Length(node, start) \
900 Set_Node_Length(node, RExC_parse - start)
902 /* Get offsets and lengths */
903 #define Node_Offset(n) (RExC_offsets[2*((n)-RExC_emit_start)-1])
904 #define Node_Length(n) (RExC_offsets[2*((n)-RExC_emit_start)])
906 #define Set_Node_Offset_Length(node,offset,len) STMT_START { \
907 Set_Node_Offset_To_R((node)-RExC_emit_start, (offset)); \
908 Set_Node_Length_To_R((node)-RExC_emit_start, (len)); \
912 #if PERL_ENABLE_EXPERIMENTAL_REGEX_OPTIMISATIONS
913 #define EXPERIMENTAL_INPLACESCAN
914 #endif /*PERL_ENABLE_EXPERIMENTAL_REGEX_OPTIMISATIONS*/
918 Perl_re_printf(pTHX_ const char *fmt, ...)
922 PerlIO *f= Perl_debug_log;
923 PERL_ARGS_ASSERT_RE_PRINTF;
925 result = PerlIO_vprintf(f, fmt, ap);
931 Perl_re_indentf(pTHX_ const char *fmt, U32 depth, ...)
935 PerlIO *f= Perl_debug_log;
936 PERL_ARGS_ASSERT_RE_INDENTF;
938 PerlIO_printf(f, "%*s", ( (int)depth % 20 ) * 2, "");
939 result = PerlIO_vprintf(f, fmt, ap);
943 #endif /* DEBUGGING */
945 #define DEBUG_RExC_seen() \
946 DEBUG_OPTIMISE_MORE_r({ \
947 Perl_re_printf( aTHX_ "RExC_seen: "); \
949 if (RExC_seen & REG_ZERO_LEN_SEEN) \
950 Perl_re_printf( aTHX_ "REG_ZERO_LEN_SEEN "); \
952 if (RExC_seen & REG_LOOKBEHIND_SEEN) \
953 Perl_re_printf( aTHX_ "REG_LOOKBEHIND_SEEN "); \
955 if (RExC_seen & REG_GPOS_SEEN) \
956 Perl_re_printf( aTHX_ "REG_GPOS_SEEN "); \
958 if (RExC_seen & REG_RECURSE_SEEN) \
959 Perl_re_printf( aTHX_ "REG_RECURSE_SEEN "); \
961 if (RExC_seen & REG_TOP_LEVEL_BRANCHES_SEEN) \
962 Perl_re_printf( aTHX_ "REG_TOP_LEVEL_BRANCHES_SEEN "); \
964 if (RExC_seen & REG_VERBARG_SEEN) \
965 Perl_re_printf( aTHX_ "REG_VERBARG_SEEN "); \
967 if (RExC_seen & REG_CUTGROUP_SEEN) \
968 Perl_re_printf( aTHX_ "REG_CUTGROUP_SEEN "); \
970 if (RExC_seen & REG_RUN_ON_COMMENT_SEEN) \
971 Perl_re_printf( aTHX_ "REG_RUN_ON_COMMENT_SEEN "); \
973 if (RExC_seen & REG_UNFOLDED_MULTI_SEEN) \
974 Perl_re_printf( aTHX_ "REG_UNFOLDED_MULTI_SEEN "); \
976 if (RExC_seen & REG_UNBOUNDED_QUANTIFIER_SEEN) \
977 Perl_re_printf( aTHX_ "REG_UNBOUNDED_QUANTIFIER_SEEN "); \
979 Perl_re_printf( aTHX_ "\n"); \
982 #define DEBUG_SHOW_STUDY_FLAG(flags,flag) \
983 if ((flags) & flag) Perl_re_printf( aTHX_ "%s ", #flag)
985 #define DEBUG_SHOW_STUDY_FLAGS(flags,open_str,close_str) \
987 Perl_re_printf( aTHX_ "%s", open_str); \
988 DEBUG_SHOW_STUDY_FLAG(flags,SF_FL_BEFORE_SEOL); \
989 DEBUG_SHOW_STUDY_FLAG(flags,SF_FL_BEFORE_MEOL); \
990 DEBUG_SHOW_STUDY_FLAG(flags,SF_IS_INF); \
991 DEBUG_SHOW_STUDY_FLAG(flags,SF_HAS_PAR); \
992 DEBUG_SHOW_STUDY_FLAG(flags,SF_IN_PAR); \
993 DEBUG_SHOW_STUDY_FLAG(flags,SF_HAS_EVAL); \
994 DEBUG_SHOW_STUDY_FLAG(flags,SCF_DO_SUBSTR); \
995 DEBUG_SHOW_STUDY_FLAG(flags,SCF_DO_STCLASS_AND); \
996 DEBUG_SHOW_STUDY_FLAG(flags,SCF_DO_STCLASS_OR); \
997 DEBUG_SHOW_STUDY_FLAG(flags,SCF_DO_STCLASS); \
998 DEBUG_SHOW_STUDY_FLAG(flags,SCF_WHILEM_VISITED_POS); \
999 DEBUG_SHOW_STUDY_FLAG(flags,SCF_TRIE_RESTUDY); \
1000 DEBUG_SHOW_STUDY_FLAG(flags,SCF_SEEN_ACCEPT); \
1001 DEBUG_SHOW_STUDY_FLAG(flags,SCF_TRIE_DOING_RESTUDY); \
1002 DEBUG_SHOW_STUDY_FLAG(flags,SCF_IN_DEFINE); \
1003 Perl_re_printf( aTHX_ "%s", close_str); \
1007 #define DEBUG_STUDYDATA(str,data,depth) \
1008 DEBUG_OPTIMISE_MORE_r(if(data){ \
1009 Perl_re_indentf( aTHX_ "" str "Pos:%"IVdf"/%"IVdf \
1010 " Flags: 0x%"UVXf, \
1012 (IV)((data)->pos_min), \
1013 (IV)((data)->pos_delta), \
1014 (UV)((data)->flags) \
1016 DEBUG_SHOW_STUDY_FLAGS((data)->flags," [ ","]"); \
1017 Perl_re_printf( aTHX_ \
1018 " Whilem_c: %"IVdf" Lcp: %"IVdf" %s", \
1019 (IV)((data)->whilem_c), \
1020 (IV)((data)->last_closep ? *((data)->last_closep) : -1), \
1021 is_inf ? "INF " : "" \
1023 if ((data)->last_found) \
1024 Perl_re_printf( aTHX_ \
1025 "Last:'%s' %"IVdf":%"IVdf"/%"IVdf" %sFixed:'%s' @ %"IVdf \
1026 " %sFloat: '%s' @ %"IVdf"/%"IVdf"", \
1027 SvPVX_const((data)->last_found), \
1028 (IV)((data)->last_end), \
1029 (IV)((data)->last_start_min), \
1030 (IV)((data)->last_start_max), \
1031 ((data)->longest && \
1032 (data)->longest==&((data)->longest_fixed)) ? "*" : "", \
1033 SvPVX_const((data)->longest_fixed), \
1034 (IV)((data)->offset_fixed), \
1035 ((data)->longest && \
1036 (data)->longest==&((data)->longest_float)) ? "*" : "", \
1037 SvPVX_const((data)->longest_float), \
1038 (IV)((data)->offset_float_min), \
1039 (IV)((data)->offset_float_max) \
1041 Perl_re_printf( aTHX_ "\n"); \
1045 /* =========================================================
1046 * BEGIN edit_distance stuff.
1048 * This calculates how many single character changes of any type are needed to
1049 * transform a string into another one. It is taken from version 3.1 of
1051 * https://metacpan.org/pod/Text::Levenshtein::Damerau::XS
1054 /* Our unsorted dictionary linked list. */
1055 /* Note we use UVs, not chars. */
1060 struct dictionary* next;
1062 typedef struct dictionary item;
1065 PERL_STATIC_INLINE item*
1066 push(UV key,item* curr)
1069 Newxz(head, 1, item);
1077 PERL_STATIC_INLINE item*
1078 find(item* head, UV key)
1080 item* iterator = head;
1082 if (iterator->key == key){
1085 iterator = iterator->next;
1091 PERL_STATIC_INLINE item*
1092 uniquePush(item* head,UV key)
1094 item* iterator = head;
1097 if (iterator->key == key) {
1100 iterator = iterator->next;
1103 return push(key,head);
1106 PERL_STATIC_INLINE void
1107 dict_free(item* head)
1109 item* iterator = head;
1112 item* temp = iterator;
1113 iterator = iterator->next;
1120 /* End of Dictionary Stuff */
1122 /* All calculations/work are done here */
1124 S_edit_distance(const UV* src,
1126 const STRLEN x, /* length of src[] */
1127 const STRLEN y, /* length of tgt[] */
1128 const SSize_t maxDistance
1132 UV swapCount,swapScore,targetCharCount,i,j;
1134 UV score_ceil = x + y;
1136 PERL_ARGS_ASSERT_EDIT_DISTANCE;
1138 /* intialize matrix start values */
1139 Newxz(scores, ( (x + 2) * (y + 2)), UV);
1140 scores[0] = score_ceil;
1141 scores[1 * (y + 2) + 0] = score_ceil;
1142 scores[0 * (y + 2) + 1] = score_ceil;
1143 scores[1 * (y + 2) + 1] = 0;
1144 head = uniquePush(uniquePush(head,src[0]),tgt[0]);
1149 for (i=1;i<=x;i++) {
1151 head = uniquePush(head,src[i]);
1152 scores[(i+1) * (y + 2) + 1] = i;
1153 scores[(i+1) * (y + 2) + 0] = score_ceil;
1156 for (j=1;j<=y;j++) {
1159 head = uniquePush(head,tgt[j]);
1160 scores[1 * (y + 2) + (j + 1)] = j;
1161 scores[0 * (y + 2) + (j + 1)] = score_ceil;
1164 targetCharCount = find(head,tgt[j-1])->value;
1165 swapScore = scores[targetCharCount * (y + 2) + swapCount] + i - targetCharCount - 1 + j - swapCount;
1167 if (src[i-1] != tgt[j-1]){
1168 scores[(i+1) * (y + 2) + (j + 1)] = MIN(swapScore,(MIN(scores[i * (y + 2) + j], MIN(scores[(i+1) * (y + 2) + j], scores[i * (y + 2) + (j + 1)])) + 1));
1172 scores[(i+1) * (y + 2) + (j + 1)] = MIN(scores[i * (y + 2) + j], swapScore);
1176 find(head,src[i-1])->value = i;
1180 IV score = scores[(x+1) * (y + 2) + (y + 1)];
1183 return (maxDistance != 0 && maxDistance < score)?(-1):score;
1187 /* END of edit_distance() stuff
1188 * ========================================================= */
1190 /* is c a control character for which we have a mnemonic? */
1191 #define isMNEMONIC_CNTRL(c) _IS_MNEMONIC_CNTRL_ONLY_FOR_USE_BY_REGCOMP_DOT_C(c)
1194 S_cntrl_to_mnemonic(const U8 c)
1196 /* Returns the mnemonic string that represents character 'c', if one
1197 * exists; NULL otherwise. The only ones that exist for the purposes of
1198 * this routine are a few control characters */
1201 case '\a': return "\\a";
1202 case '\b': return "\\b";
1203 case ESC_NATIVE: return "\\e";
1204 case '\f': return "\\f";
1205 case '\n': return "\\n";
1206 case '\r': return "\\r";
1207 case '\t': return "\\t";
1213 /* Mark that we cannot extend a found fixed substring at this point.
1214 Update the longest found anchored substring and the longest found
1215 floating substrings if needed. */
1218 S_scan_commit(pTHX_ const RExC_state_t *pRExC_state, scan_data_t *data,
1219 SSize_t *minlenp, int is_inf)
1221 const STRLEN l = CHR_SVLEN(data->last_found);
1222 const STRLEN old_l = CHR_SVLEN(*data->longest);
1223 GET_RE_DEBUG_FLAGS_DECL;
1225 PERL_ARGS_ASSERT_SCAN_COMMIT;
1227 if ((l >= old_l) && ((l > old_l) || (data->flags & SF_BEFORE_EOL))) {
1228 SvSetMagicSV(*data->longest, data->last_found);
1229 if (*data->longest == data->longest_fixed) {
1230 data->offset_fixed = l ? data->last_start_min : data->pos_min;
1231 if (data->flags & SF_BEFORE_EOL)
1233 |= ((data->flags & SF_BEFORE_EOL) << SF_FIX_SHIFT_EOL);
1235 data->flags &= ~SF_FIX_BEFORE_EOL;
1236 data->minlen_fixed=minlenp;
1237 data->lookbehind_fixed=0;
1239 else { /* *data->longest == data->longest_float */
1240 data->offset_float_min = l ? data->last_start_min : data->pos_min;
1241 data->offset_float_max = (l
1242 ? data->last_start_max
1243 : (data->pos_delta > SSize_t_MAX - data->pos_min
1245 : data->pos_min + data->pos_delta));
1247 || (STRLEN)data->offset_float_max > (STRLEN)SSize_t_MAX)
1248 data->offset_float_max = SSize_t_MAX;
1249 if (data->flags & SF_BEFORE_EOL)
1251 |= ((data->flags & SF_BEFORE_EOL) << SF_FL_SHIFT_EOL);
1253 data->flags &= ~SF_FL_BEFORE_EOL;
1254 data->minlen_float=minlenp;
1255 data->lookbehind_float=0;
1258 SvCUR_set(data->last_found, 0);
1260 SV * const sv = data->last_found;
1261 if (SvUTF8(sv) && SvMAGICAL(sv)) {
1262 MAGIC * const mg = mg_find(sv, PERL_MAGIC_utf8);
1267 data->last_end = -1;
1268 data->flags &= ~SF_BEFORE_EOL;
1269 DEBUG_STUDYDATA("commit: ",data,0);
1272 /* An SSC is just a regnode_charclass_posix with an extra field: the inversion
1273 * list that describes which code points it matches */
1276 S_ssc_anything(pTHX_ regnode_ssc *ssc)
1278 /* Set the SSC 'ssc' to match an empty string or any code point */
1280 PERL_ARGS_ASSERT_SSC_ANYTHING;
1282 assert(is_ANYOF_SYNTHETIC(ssc));
1284 ssc->invlist = sv_2mortal(_new_invlist(2)); /* mortalize so won't leak */
1285 _append_range_to_invlist(ssc->invlist, 0, UV_MAX);
1286 ANYOF_FLAGS(ssc) |= SSC_MATCHES_EMPTY_STRING; /* Plus matches empty */
1290 S_ssc_is_anything(const regnode_ssc *ssc)
1292 /* Returns TRUE if the SSC 'ssc' can match the empty string and any code
1293 * point; FALSE otherwise. Thus, this is used to see if using 'ssc' buys
1294 * us anything: if the function returns TRUE, 'ssc' hasn't been restricted
1295 * in any way, so there's no point in using it */
1300 PERL_ARGS_ASSERT_SSC_IS_ANYTHING;
1302 assert(is_ANYOF_SYNTHETIC(ssc));
1304 if (! (ANYOF_FLAGS(ssc) & SSC_MATCHES_EMPTY_STRING)) {
1308 /* See if the list consists solely of the range 0 - Infinity */
1309 invlist_iterinit(ssc->invlist);
1310 ret = invlist_iternext(ssc->invlist, &start, &end)
1314 invlist_iterfinish(ssc->invlist);
1320 /* If e.g., both \w and \W are set, matches everything */
1321 if (ANYOF_POSIXL_SSC_TEST_ANY_SET(ssc)) {
1323 for (i = 0; i < ANYOF_POSIXL_MAX; i += 2) {
1324 if (ANYOF_POSIXL_TEST(ssc, i) && ANYOF_POSIXL_TEST(ssc, i+1)) {
1334 S_ssc_init(pTHX_ const RExC_state_t *pRExC_state, regnode_ssc *ssc)
1336 /* Initializes the SSC 'ssc'. This includes setting it to match an empty
1337 * string, any code point, or any posix class under locale */
1339 PERL_ARGS_ASSERT_SSC_INIT;
1341 Zero(ssc, 1, regnode_ssc);
1342 set_ANYOF_SYNTHETIC(ssc);
1343 ARG_SET(ssc, ANYOF_ONLY_HAS_BITMAP);
1346 /* If any portion of the regex is to operate under locale rules that aren't
1347 * fully known at compile time, initialization includes it. The reason
1348 * this isn't done for all regexes is that the optimizer was written under
1349 * the assumption that locale was all-or-nothing. Given the complexity and
1350 * lack of documentation in the optimizer, and that there are inadequate
1351 * test cases for locale, many parts of it may not work properly, it is
1352 * safest to avoid locale unless necessary. */
1353 if (RExC_contains_locale) {
1354 ANYOF_POSIXL_SETALL(ssc);
1357 ANYOF_POSIXL_ZERO(ssc);
1362 S_ssc_is_cp_posixl_init(const RExC_state_t *pRExC_state,
1363 const regnode_ssc *ssc)
1365 /* Returns TRUE if the SSC 'ssc' is in its initial state with regard only
1366 * to the list of code points matched, and locale posix classes; hence does
1367 * not check its flags) */
1372 PERL_ARGS_ASSERT_SSC_IS_CP_POSIXL_INIT;
1374 assert(is_ANYOF_SYNTHETIC(ssc));
1376 invlist_iterinit(ssc->invlist);
1377 ret = invlist_iternext(ssc->invlist, &start, &end)
1381 invlist_iterfinish(ssc->invlist);
1387 if (RExC_contains_locale && ! ANYOF_POSIXL_SSC_TEST_ALL_SET(ssc)) {
1395 S_get_ANYOF_cp_list_for_ssc(pTHX_ const RExC_state_t *pRExC_state,
1396 const regnode_charclass* const node)
1398 /* Returns a mortal inversion list defining which code points are matched
1399 * by 'node', which is of type ANYOF. Handles complementing the result if
1400 * appropriate. If some code points aren't knowable at this time, the
1401 * returned list must, and will, contain every code point that is a
1405 SV* only_utf8_locale_invlist = NULL;
1407 const U32 n = ARG(node);
1408 bool new_node_has_latin1 = FALSE;
1410 PERL_ARGS_ASSERT_GET_ANYOF_CP_LIST_FOR_SSC;
1412 /* Look at the data structure created by S_set_ANYOF_arg() */
1413 if (n != ANYOF_ONLY_HAS_BITMAP) {
1414 SV * const rv = MUTABLE_SV(RExC_rxi->data->data[n]);
1415 AV * const av = MUTABLE_AV(SvRV(rv));
1416 SV **const ary = AvARRAY(av);
1417 assert(RExC_rxi->data->what[n] == 's');
1419 if (ary[1] && ary[1] != &PL_sv_undef) { /* Has compile-time swash */
1420 invlist = sv_2mortal(invlist_clone(_get_swash_invlist(ary[1])));
1422 else if (ary[0] && ary[0] != &PL_sv_undef) {
1424 /* Here, no compile-time swash, and there are things that won't be
1425 * known until runtime -- we have to assume it could be anything */
1426 invlist = sv_2mortal(_new_invlist(1));
1427 return _add_range_to_invlist(invlist, 0, UV_MAX);
1429 else if (ary[3] && ary[3] != &PL_sv_undef) {
1431 /* Here no compile-time swash, and no run-time only data. Use the
1432 * node's inversion list */
1433 invlist = sv_2mortal(invlist_clone(ary[3]));
1436 /* Get the code points valid only under UTF-8 locales */
1437 if ((ANYOF_FLAGS(node) & ANYOFL_FOLD)
1438 && ary[2] && ary[2] != &PL_sv_undef)
1440 only_utf8_locale_invlist = ary[2];
1445 invlist = sv_2mortal(_new_invlist(0));
1448 /* An ANYOF node contains a bitmap for the first NUM_ANYOF_CODE_POINTS
1449 * code points, and an inversion list for the others, but if there are code
1450 * points that should match only conditionally on the target string being
1451 * UTF-8, those are placed in the inversion list, and not the bitmap.
1452 * Since there are circumstances under which they could match, they are
1453 * included in the SSC. But if the ANYOF node is to be inverted, we have
1454 * to exclude them here, so that when we invert below, the end result
1455 * actually does include them. (Think about "\xe0" =~ /[^\xc0]/di;). We
1456 * have to do this here before we add the unconditionally matched code
1458 if (ANYOF_FLAGS(node) & ANYOF_INVERT) {
1459 _invlist_intersection_complement_2nd(invlist,
1464 /* Add in the points from the bit map */
1465 for (i = 0; i < NUM_ANYOF_CODE_POINTS; i++) {
1466 if (ANYOF_BITMAP_TEST(node, i)) {
1467 unsigned int start = i++;
1469 for (; i < NUM_ANYOF_CODE_POINTS && ANYOF_BITMAP_TEST(node, i); ++i) {
1472 invlist = _add_range_to_invlist(invlist, start, i-1);
1473 new_node_has_latin1 = TRUE;
1477 /* If this can match all upper Latin1 code points, have to add them
1478 * as well. But don't add them if inverting, as when that gets done below,
1479 * it would exclude all these characters, including the ones it shouldn't
1480 * that were added just above */
1481 if (! (ANYOF_FLAGS(node) & ANYOF_INVERT) && OP(node) == ANYOFD
1482 && (ANYOF_FLAGS(node) & ANYOF_SHARED_d_MATCHES_ALL_NON_UTF8_NON_ASCII_non_d_WARN_SUPER))
1484 _invlist_union(invlist, PL_UpperLatin1, &invlist);
1487 /* Similarly for these */
1488 if (ANYOF_FLAGS(node) & ANYOF_MATCHES_ALL_ABOVE_BITMAP) {
1489 _invlist_union_complement_2nd(invlist, PL_InBitmap, &invlist);
1492 if (ANYOF_FLAGS(node) & ANYOF_INVERT) {
1493 _invlist_invert(invlist);
1495 else if (new_node_has_latin1 && ANYOF_FLAGS(node) & ANYOFL_FOLD) {
1497 /* Under /li, any 0-255 could fold to any other 0-255, depending on the
1498 * locale. We can skip this if there are no 0-255 at all. */
1499 _invlist_union(invlist, PL_Latin1, &invlist);
1502 /* Similarly add the UTF-8 locale possible matches. These have to be
1503 * deferred until after the non-UTF-8 locale ones are taken care of just
1504 * above, or it leads to wrong results under ANYOF_INVERT */
1505 if (only_utf8_locale_invlist) {
1506 _invlist_union_maybe_complement_2nd(invlist,
1507 only_utf8_locale_invlist,
1508 ANYOF_FLAGS(node) & ANYOF_INVERT,
1515 /* These two functions currently do the exact same thing */
1516 #define ssc_init_zero ssc_init
1518 #define ssc_add_cp(ssc, cp) ssc_add_range((ssc), (cp), (cp))
1519 #define ssc_match_all_cp(ssc) ssc_add_range(ssc, 0, UV_MAX)
1521 /* 'AND' a given class with another one. Can create false positives. 'ssc'
1522 * should not be inverted. 'and_with->flags & ANYOF_MATCHES_POSIXL' should be
1523 * 0 if 'and_with' is a regnode_charclass instead of a regnode_ssc. */
1526 S_ssc_and(pTHX_ const RExC_state_t *pRExC_state, regnode_ssc *ssc,
1527 const regnode_charclass *and_with)
1529 /* Accumulate into SSC 'ssc' its 'AND' with 'and_with', which is either
1530 * another SSC or a regular ANYOF class. Can create false positives. */
1535 PERL_ARGS_ASSERT_SSC_AND;
1537 assert(is_ANYOF_SYNTHETIC(ssc));
1539 /* 'and_with' is used as-is if it too is an SSC; otherwise have to extract
1540 * the code point inversion list and just the relevant flags */
1541 if (is_ANYOF_SYNTHETIC(and_with)) {
1542 anded_cp_list = ((regnode_ssc *)and_with)->invlist;
1543 anded_flags = ANYOF_FLAGS(and_with);
1545 /* XXX This is a kludge around what appears to be deficiencies in the
1546 * optimizer. If we make S_ssc_anything() add in the WARN_SUPER flag,
1547 * there are paths through the optimizer where it doesn't get weeded
1548 * out when it should. And if we don't make some extra provision for
1549 * it like the code just below, it doesn't get added when it should.
1550 * This solution is to add it only when AND'ing, which is here, and
1551 * only when what is being AND'ed is the pristine, original node
1552 * matching anything. Thus it is like adding it to ssc_anything() but
1553 * only when the result is to be AND'ed. Probably the same solution
1554 * could be adopted for the same problem we have with /l matching,
1555 * which is solved differently in S_ssc_init(), and that would lead to
1556 * fewer false positives than that solution has. But if this solution
1557 * creates bugs, the consequences are only that a warning isn't raised
1558 * that should be; while the consequences for having /l bugs is
1559 * incorrect matches */
1560 if (ssc_is_anything((regnode_ssc *)and_with)) {
1561 anded_flags |= ANYOF_SHARED_d_MATCHES_ALL_NON_UTF8_NON_ASCII_non_d_WARN_SUPER;
1565 anded_cp_list = get_ANYOF_cp_list_for_ssc(pRExC_state, and_with);
1566 if (OP(and_with) == ANYOFD) {
1567 anded_flags = ANYOF_FLAGS(and_with) & ANYOF_COMMON_FLAGS;
1570 anded_flags = ANYOF_FLAGS(and_with)
1571 &( ANYOF_COMMON_FLAGS
1572 |ANYOF_SHARED_d_MATCHES_ALL_NON_UTF8_NON_ASCII_non_d_WARN_SUPER
1573 |ANYOF_SHARED_d_UPPER_LATIN1_UTF8_STRING_MATCHES_non_d_RUNTIME_USER_PROP);
1574 if (ANYOFL_UTF8_LOCALE_REQD(ANYOF_FLAGS(and_with))) {
1576 ANYOFL_SHARED_UTF8_LOCALE_fold_HAS_MATCHES_nonfold_REQD;
1581 ANYOF_FLAGS(ssc) &= anded_flags;
1583 /* Below, C1 is the list of code points in 'ssc'; P1, its posix classes.
1584 * C2 is the list of code points in 'and-with'; P2, its posix classes.
1585 * 'and_with' may be inverted. When not inverted, we have the situation of
1587 * (C1 | P1) & (C2 | P2)
1588 * = (C1 & (C2 | P2)) | (P1 & (C2 | P2))
1589 * = ((C1 & C2) | (C1 & P2)) | ((P1 & C2) | (P1 & P2))
1590 * <= ((C1 & C2) | P2)) | ( P1 | (P1 & P2))
1591 * <= ((C1 & C2) | P1 | P2)
1592 * Alternatively, the last few steps could be:
1593 * = ((C1 & C2) | (C1 & P2)) | ((P1 & C2) | (P1 & P2))
1594 * <= ((C1 & C2) | C1 ) | ( C2 | (P1 & P2))
1595 * <= (C1 | C2 | (P1 & P2))
1596 * We favor the second approach if either P1 or P2 is non-empty. This is
1597 * because these components are a barrier to doing optimizations, as what
1598 * they match cannot be known until the moment of matching as they are
1599 * dependent on the current locale, 'AND"ing them likely will reduce or
1601 * But we can do better if we know that C1,P1 are in their initial state (a
1602 * frequent occurrence), each matching everything:
1603 * (<everything>) & (C2 | P2) = C2 | P2
1604 * Similarly, if C2,P2 are in their initial state (again a frequent
1605 * occurrence), the result is a no-op
1606 * (C1 | P1) & (<everything>) = C1 | P1
1609 * (C1 | P1) & ~(C2 | P2) = (C1 | P1) & (~C2 & ~P2)
1610 * = (C1 & (~C2 & ~P2)) | (P1 & (~C2 & ~P2))
1611 * <= (C1 & ~C2) | (P1 & ~P2)
1614 if ((ANYOF_FLAGS(and_with) & ANYOF_INVERT)
1615 && ! is_ANYOF_SYNTHETIC(and_with))
1619 ssc_intersection(ssc,
1621 FALSE /* Has already been inverted */
1624 /* If either P1 or P2 is empty, the intersection will be also; can skip
1626 if (! (ANYOF_FLAGS(and_with) & ANYOF_MATCHES_POSIXL)) {
1627 ANYOF_POSIXL_ZERO(ssc);
1629 else if (ANYOF_POSIXL_SSC_TEST_ANY_SET(ssc)) {
1631 /* Note that the Posix class component P from 'and_with' actually
1633 * P = Pa | Pb | ... | Pn
1634 * where each component is one posix class, such as in [\w\s].
1636 * ~P = ~(Pa | Pb | ... | Pn)
1637 * = ~Pa & ~Pb & ... & ~Pn
1638 * <= ~Pa | ~Pb | ... | ~Pn
1639 * The last is something we can easily calculate, but unfortunately
1640 * is likely to have many false positives. We could do better
1641 * in some (but certainly not all) instances if two classes in
1642 * P have known relationships. For example
1643 * :lower: <= :alpha: <= :alnum: <= \w <= :graph: <= :print:
1645 * :lower: & :print: = :lower:
1646 * And similarly for classes that must be disjoint. For example,
1647 * since \s and \w can have no elements in common based on rules in
1648 * the POSIX standard,
1649 * \w & ^\S = nothing
1650 * Unfortunately, some vendor locales do not meet the Posix
1651 * standard, in particular almost everything by Microsoft.
1652 * The loop below just changes e.g., \w into \W and vice versa */
1654 regnode_charclass_posixl temp;
1655 int add = 1; /* To calculate the index of the complement */
1657 ANYOF_POSIXL_ZERO(&temp);
1658 for (i = 0; i < ANYOF_MAX; i++) {
1660 || ! ANYOF_POSIXL_TEST((regnode_charclass_posixl*) and_with, i)
1661 || ! ANYOF_POSIXL_TEST((regnode_charclass_posixl*) and_with, i + 1));
1663 if (ANYOF_POSIXL_TEST((regnode_charclass_posixl*) and_with, i)) {
1664 ANYOF_POSIXL_SET(&temp, i + add);
1666 add = 0 - add; /* 1 goes to -1; -1 goes to 1 */
1668 ANYOF_POSIXL_AND(&temp, ssc);
1670 } /* else ssc already has no posixes */
1671 } /* else: Not inverted. This routine is a no-op if 'and_with' is an SSC
1672 in its initial state */
1673 else if (! is_ANYOF_SYNTHETIC(and_with)
1674 || ! ssc_is_cp_posixl_init(pRExC_state, (regnode_ssc *)and_with))
1676 /* But if 'ssc' is in its initial state, the result is just 'and_with';
1677 * copy it over 'ssc' */
1678 if (ssc_is_cp_posixl_init(pRExC_state, ssc)) {
1679 if (is_ANYOF_SYNTHETIC(and_with)) {
1680 StructCopy(and_with, ssc, regnode_ssc);
1683 ssc->invlist = anded_cp_list;
1684 ANYOF_POSIXL_ZERO(ssc);
1685 if (ANYOF_FLAGS(and_with) & ANYOF_MATCHES_POSIXL) {
1686 ANYOF_POSIXL_OR((regnode_charclass_posixl*) and_with, ssc);
1690 else if (ANYOF_POSIXL_SSC_TEST_ANY_SET(ssc)
1691 || (ANYOF_FLAGS(and_with) & ANYOF_MATCHES_POSIXL))
1693 /* One or the other of P1, P2 is non-empty. */
1694 if (ANYOF_FLAGS(and_with) & ANYOF_MATCHES_POSIXL) {
1695 ANYOF_POSIXL_AND((regnode_charclass_posixl*) and_with, ssc);
1697 ssc_union(ssc, anded_cp_list, FALSE);
1699 else { /* P1 = P2 = empty */
1700 ssc_intersection(ssc, anded_cp_list, FALSE);
1706 S_ssc_or(pTHX_ const RExC_state_t *pRExC_state, regnode_ssc *ssc,
1707 const regnode_charclass *or_with)
1709 /* Accumulate into SSC 'ssc' its 'OR' with 'or_with', which is either
1710 * another SSC or a regular ANYOF class. Can create false positives if
1711 * 'or_with' is to be inverted. */
1716 PERL_ARGS_ASSERT_SSC_OR;
1718 assert(is_ANYOF_SYNTHETIC(ssc));
1720 /* 'or_with' is used as-is if it too is an SSC; otherwise have to extract
1721 * the code point inversion list and just the relevant flags */
1722 if (is_ANYOF_SYNTHETIC(or_with)) {
1723 ored_cp_list = ((regnode_ssc*) or_with)->invlist;
1724 ored_flags = ANYOF_FLAGS(or_with);
1727 ored_cp_list = get_ANYOF_cp_list_for_ssc(pRExC_state, or_with);
1728 ored_flags = ANYOF_FLAGS(or_with) & ANYOF_COMMON_FLAGS;
1729 if (OP(or_with) != ANYOFD) {
1731 |= ANYOF_FLAGS(or_with)
1732 & ( ANYOF_SHARED_d_MATCHES_ALL_NON_UTF8_NON_ASCII_non_d_WARN_SUPER
1733 |ANYOF_SHARED_d_UPPER_LATIN1_UTF8_STRING_MATCHES_non_d_RUNTIME_USER_PROP);
1734 if (ANYOFL_UTF8_LOCALE_REQD(ANYOF_FLAGS(or_with))) {
1736 ANYOFL_SHARED_UTF8_LOCALE_fold_HAS_MATCHES_nonfold_REQD;
1741 ANYOF_FLAGS(ssc) |= ored_flags;
1743 /* Below, C1 is the list of code points in 'ssc'; P1, its posix classes.
1744 * C2 is the list of code points in 'or-with'; P2, its posix classes.
1745 * 'or_with' may be inverted. When not inverted, we have the simple
1746 * situation of computing:
1747 * (C1 | P1) | (C2 | P2) = (C1 | C2) | (P1 | P2)
1748 * If P1|P2 yields a situation with both a class and its complement are
1749 * set, like having both \w and \W, this matches all code points, and we
1750 * can delete these from the P component of the ssc going forward. XXX We
1751 * might be able to delete all the P components, but I (khw) am not certain
1752 * about this, and it is better to be safe.
1755 * (C1 | P1) | ~(C2 | P2) = (C1 | P1) | (~C2 & ~P2)
1756 * <= (C1 | P1) | ~C2
1757 * <= (C1 | ~C2) | P1
1758 * (which results in actually simpler code than the non-inverted case)
1761 if ((ANYOF_FLAGS(or_with) & ANYOF_INVERT)
1762 && ! is_ANYOF_SYNTHETIC(or_with))
1764 /* We ignore P2, leaving P1 going forward */
1765 } /* else Not inverted */
1766 else if (ANYOF_FLAGS(or_with) & ANYOF_MATCHES_POSIXL) {
1767 ANYOF_POSIXL_OR((regnode_charclass_posixl*)or_with, ssc);
1768 if (ANYOF_POSIXL_SSC_TEST_ANY_SET(ssc)) {
1770 for (i = 0; i < ANYOF_MAX; i += 2) {
1771 if (ANYOF_POSIXL_TEST(ssc, i) && ANYOF_POSIXL_TEST(ssc, i + 1))
1773 ssc_match_all_cp(ssc);
1774 ANYOF_POSIXL_CLEAR(ssc, i);
1775 ANYOF_POSIXL_CLEAR(ssc, i+1);
1783 FALSE /* Already has been inverted */
1787 PERL_STATIC_INLINE void
1788 S_ssc_union(pTHX_ regnode_ssc *ssc, SV* const invlist, const bool invert2nd)
1790 PERL_ARGS_ASSERT_SSC_UNION;
1792 assert(is_ANYOF_SYNTHETIC(ssc));
1794 _invlist_union_maybe_complement_2nd(ssc->invlist,
1800 PERL_STATIC_INLINE void
1801 S_ssc_intersection(pTHX_ regnode_ssc *ssc,
1803 const bool invert2nd)
1805 PERL_ARGS_ASSERT_SSC_INTERSECTION;
1807 assert(is_ANYOF_SYNTHETIC(ssc));
1809 _invlist_intersection_maybe_complement_2nd(ssc->invlist,
1815 PERL_STATIC_INLINE void
1816 S_ssc_add_range(pTHX_ regnode_ssc *ssc, const UV start, const UV end)
1818 PERL_ARGS_ASSERT_SSC_ADD_RANGE;
1820 assert(is_ANYOF_SYNTHETIC(ssc));
1822 ssc->invlist = _add_range_to_invlist(ssc->invlist, start, end);
1825 PERL_STATIC_INLINE void
1826 S_ssc_cp_and(pTHX_ regnode_ssc *ssc, const UV cp)
1828 /* AND just the single code point 'cp' into the SSC 'ssc' */
1830 SV* cp_list = _new_invlist(2);
1832 PERL_ARGS_ASSERT_SSC_CP_AND;
1834 assert(is_ANYOF_SYNTHETIC(ssc));
1836 cp_list = add_cp_to_invlist(cp_list, cp);
1837 ssc_intersection(ssc, cp_list,
1838 FALSE /* Not inverted */
1840 SvREFCNT_dec_NN(cp_list);
1843 PERL_STATIC_INLINE void
1844 S_ssc_clear_locale(regnode_ssc *ssc)
1846 /* Set the SSC 'ssc' to not match any locale things */
1847 PERL_ARGS_ASSERT_SSC_CLEAR_LOCALE;
1849 assert(is_ANYOF_SYNTHETIC(ssc));
1851 ANYOF_POSIXL_ZERO(ssc);
1852 ANYOF_FLAGS(ssc) &= ~ANYOF_LOCALE_FLAGS;
1855 #define NON_OTHER_COUNT NON_OTHER_COUNT_FOR_USE_ONLY_BY_REGCOMP_DOT_C
1858 S_is_ssc_worth_it(const RExC_state_t * pRExC_state, const regnode_ssc * ssc)
1860 /* The synthetic start class is used to hopefully quickly winnow down
1861 * places where a pattern could start a match in the target string. If it
1862 * doesn't really narrow things down that much, there isn't much point to
1863 * having the overhead of using it. This function uses some very crude
1864 * heuristics to decide if to use the ssc or not.
1866 * It returns TRUE if 'ssc' rules out more than half what it considers to
1867 * be the "likely" possible matches, but of course it doesn't know what the
1868 * actual things being matched are going to be; these are only guesses
1870 * For /l matches, it assumes that the only likely matches are going to be
1871 * in the 0-255 range, uniformly distributed, so half of that is 127
1872 * For /a and /d matches, it assumes that the likely matches will be just
1873 * the ASCII range, so half of that is 63
1874 * For /u and there isn't anything matching above the Latin1 range, it
1875 * assumes that that is the only range likely to be matched, and uses
1876 * half that as the cut-off: 127. If anything matches above Latin1,
1877 * it assumes that all of Unicode could match (uniformly), except for
1878 * non-Unicode code points and things in the General Category "Other"
1879 * (unassigned, private use, surrogates, controls and formats). This
1880 * is a much large number. */
1882 U32 count = 0; /* Running total of number of code points matched by
1884 UV start, end; /* Start and end points of current range in inversion
1886 const U32 max_code_points = (LOC)
1888 : (( ! UNI_SEMANTICS
1889 || invlist_highest(ssc->invlist) < 256)
1892 const U32 max_match = max_code_points / 2;
1894 PERL_ARGS_ASSERT_IS_SSC_WORTH_IT;
1896 invlist_iterinit(ssc->invlist);
1897 while (invlist_iternext(ssc->invlist, &start, &end)) {
1898 if (start >= max_code_points) {
1901 end = MIN(end, max_code_points - 1);
1902 count += end - start + 1;
1903 if (count >= max_match) {
1904 invlist_iterfinish(ssc->invlist);
1914 S_ssc_finalize(pTHX_ RExC_state_t *pRExC_state, regnode_ssc *ssc)
1916 /* The inversion list in the SSC is marked mortal; now we need a more
1917 * permanent copy, which is stored the same way that is done in a regular
1918 * ANYOF node, with the first NUM_ANYOF_CODE_POINTS code points in a bit
1921 SV* invlist = invlist_clone(ssc->invlist);
1923 PERL_ARGS_ASSERT_SSC_FINALIZE;
1925 assert(is_ANYOF_SYNTHETIC(ssc));
1927 /* The code in this file assumes that all but these flags aren't relevant
1928 * to the SSC, except SSC_MATCHES_EMPTY_STRING, which should be cleared
1929 * by the time we reach here */
1930 assert(! (ANYOF_FLAGS(ssc)
1931 & ~( ANYOF_COMMON_FLAGS
1932 |ANYOF_SHARED_d_MATCHES_ALL_NON_UTF8_NON_ASCII_non_d_WARN_SUPER
1933 |ANYOF_SHARED_d_UPPER_LATIN1_UTF8_STRING_MATCHES_non_d_RUNTIME_USER_PROP)));
1935 populate_ANYOF_from_invlist( (regnode *) ssc, &invlist);
1937 set_ANYOF_arg(pRExC_state, (regnode *) ssc, invlist,
1938 NULL, NULL, NULL, FALSE);
1940 /* Make sure is clone-safe */
1941 ssc->invlist = NULL;
1943 if (ANYOF_POSIXL_SSC_TEST_ANY_SET(ssc)) {
1944 ANYOF_FLAGS(ssc) |= ANYOF_MATCHES_POSIXL;
1947 if (RExC_contains_locale) {
1951 assert(! (ANYOF_FLAGS(ssc) & ANYOF_LOCALE_FLAGS) || RExC_contains_locale);
1954 #define TRIE_LIST_ITEM(state,idx) (trie->states[state].trans.list)[ idx ]
1955 #define TRIE_LIST_CUR(state) ( TRIE_LIST_ITEM( state, 0 ).forid )
1956 #define TRIE_LIST_LEN(state) ( TRIE_LIST_ITEM( state, 0 ).newstate )
1957 #define TRIE_LIST_USED(idx) ( trie->states[state].trans.list \
1958 ? (TRIE_LIST_CUR( idx ) - 1) \
1964 dump_trie(trie,widecharmap,revcharmap)
1965 dump_trie_interim_list(trie,widecharmap,revcharmap,next_alloc)
1966 dump_trie_interim_table(trie,widecharmap,revcharmap,next_alloc)
1968 These routines dump out a trie in a somewhat readable format.
1969 The _interim_ variants are used for debugging the interim
1970 tables that are used to generate the final compressed
1971 representation which is what dump_trie expects.
1973 Part of the reason for their existence is to provide a form
1974 of documentation as to how the different representations function.
1979 Dumps the final compressed table form of the trie to Perl_debug_log.
1980 Used for debugging make_trie().
1984 S_dump_trie(pTHX_ const struct _reg_trie_data *trie, HV *widecharmap,
1985 AV *revcharmap, U32 depth)
1988 SV *sv=sv_newmortal();
1989 int colwidth= widecharmap ? 6 : 4;
1991 GET_RE_DEBUG_FLAGS_DECL;
1993 PERL_ARGS_ASSERT_DUMP_TRIE;
1995 Perl_re_indentf( aTHX_ "Char : %-6s%-6s%-4s ",
1996 depth+1, "Match","Base","Ofs" );
1998 for( state = 0 ; state < trie->uniquecharcount ; state++ ) {
1999 SV ** const tmp = av_fetch( revcharmap, state, 0);
2001 Perl_re_printf( aTHX_ "%*s",
2003 pv_pretty(sv, SvPV_nolen_const(*tmp), SvCUR(*tmp), colwidth,
2004 PL_colors[0], PL_colors[1],
2005 (SvUTF8(*tmp) ? PERL_PV_ESCAPE_UNI : 0) |
2006 PERL_PV_ESCAPE_FIRSTCHAR
2011 Perl_re_printf( aTHX_ "\n");
2012 Perl_re_indentf( aTHX_ "State|-----------------------", depth+1);
2014 for( state = 0 ; state < trie->uniquecharcount ; state++ )
2015 Perl_re_printf( aTHX_ "%.*s", colwidth, "--------");
2016 Perl_re_printf( aTHX_ "\n");
2018 for( state = 1 ; state < trie->statecount ; state++ ) {
2019 const U32 base = trie->states[ state ].trans.base;
2021 Perl_re_indentf( aTHX_ "#%4"UVXf"|", depth+1, (UV)state);
2023 if ( trie->states[ state ].wordnum ) {
2024 Perl_re_printf( aTHX_ " W%4X", trie->states[ state ].wordnum );
2026 Perl_re_printf( aTHX_ "%6s", "" );
2029 Perl_re_printf( aTHX_ " @%4"UVXf" ", (UV)base );
2034 while( ( base + ofs < trie->uniquecharcount ) ||
2035 ( base + ofs - trie->uniquecharcount < trie->lasttrans
2036 && trie->trans[ base + ofs - trie->uniquecharcount ].check
2040 Perl_re_printf( aTHX_ "+%2"UVXf"[ ", (UV)ofs);
2042 for ( ofs = 0 ; ofs < trie->uniquecharcount ; ofs++ ) {
2043 if ( ( base + ofs >= trie->uniquecharcount )
2044 && ( base + ofs - trie->uniquecharcount
2046 && trie->trans[ base + ofs
2047 - trie->uniquecharcount ].check == state )
2049 Perl_re_printf( aTHX_ "%*"UVXf, colwidth,
2050 (UV)trie->trans[ base + ofs - trie->uniquecharcount ].next
2053 Perl_re_printf( aTHX_ "%*s",colwidth," ." );
2057 Perl_re_printf( aTHX_ "]");
2060 Perl_re_printf( aTHX_ "\n" );
2062 Perl_re_indentf( aTHX_ "word_info N:(prev,len)=",
2064 for (word=1; word <= trie->wordcount; word++) {
2065 Perl_re_printf( aTHX_ " %d:(%d,%d)",
2066 (int)word, (int)(trie->wordinfo[word].prev),
2067 (int)(trie->wordinfo[word].len));
2069 Perl_re_printf( aTHX_ "\n" );
2072 Dumps a fully constructed but uncompressed trie in list form.
2073 List tries normally only are used for construction when the number of
2074 possible chars (trie->uniquecharcount) is very high.
2075 Used for debugging make_trie().
2078 S_dump_trie_interim_list(pTHX_ const struct _reg_trie_data *trie,
2079 HV *widecharmap, AV *revcharmap, U32 next_alloc,
2083 SV *sv=sv_newmortal();
2084 int colwidth= widecharmap ? 6 : 4;
2085 GET_RE_DEBUG_FLAGS_DECL;
2087 PERL_ARGS_ASSERT_DUMP_TRIE_INTERIM_LIST;
2089 /* print out the table precompression. */
2090 Perl_re_indentf( aTHX_ "State :Word | Transition Data\n",
2092 Perl_re_indentf( aTHX_ "%s",
2093 depth+1, "------:-----+-----------------\n" );
2095 for( state=1 ; state < next_alloc ; state ++ ) {
2098 Perl_re_indentf( aTHX_ " %4"UVXf" :",
2099 depth+1, (UV)state );
2100 if ( ! trie->states[ state ].wordnum ) {
2101 Perl_re_printf( aTHX_ "%5s| ","");
2103 Perl_re_printf( aTHX_ "W%4x| ",
2104 trie->states[ state ].wordnum
2107 for( charid = 1 ; charid <= TRIE_LIST_USED( state ) ; charid++ ) {
2108 SV ** const tmp = av_fetch( revcharmap,
2109 TRIE_LIST_ITEM(state,charid).forid, 0);
2111 Perl_re_printf( aTHX_ "%*s:%3X=%4"UVXf" | ",
2113 pv_pretty(sv, SvPV_nolen_const(*tmp), SvCUR(*tmp),
2115 PL_colors[0], PL_colors[1],
2116 (SvUTF8(*tmp) ? PERL_PV_ESCAPE_UNI : 0)
2117 | PERL_PV_ESCAPE_FIRSTCHAR
2119 TRIE_LIST_ITEM(state,charid).forid,
2120 (UV)TRIE_LIST_ITEM(state,charid).newstate
2123 Perl_re_printf( aTHX_ "\n%*s| ",
2124 (int)((depth * 2) + 14), "");
2127 Perl_re_printf( aTHX_ "\n");
2132 Dumps a fully constructed but uncompressed trie in table form.
2133 This is the normal DFA style state transition table, with a few
2134 twists to facilitate compression later.
2135 Used for debugging make_trie().
2138 S_dump_trie_interim_table(pTHX_ const struct _reg_trie_data *trie,
2139 HV *widecharmap, AV *revcharmap, U32 next_alloc,
2144 SV *sv=sv_newmortal();
2145 int colwidth= widecharmap ? 6 : 4;
2146 GET_RE_DEBUG_FLAGS_DECL;
2148 PERL_ARGS_ASSERT_DUMP_TRIE_INTERIM_TABLE;
2151 print out the table precompression so that we can do a visual check
2152 that they are identical.
2155 Perl_re_indentf( aTHX_ "Char : ", depth+1 );
2157 for( charid = 0 ; charid < trie->uniquecharcount ; charid++ ) {
2158 SV ** const tmp = av_fetch( revcharmap, charid, 0);
2160 Perl_re_printf( aTHX_ "%*s",
2162 pv_pretty(sv, SvPV_nolen_const(*tmp), SvCUR(*tmp), colwidth,
2163 PL_colors[0], PL_colors[1],
2164 (SvUTF8(*tmp) ? PERL_PV_ESCAPE_UNI : 0) |
2165 PERL_PV_ESCAPE_FIRSTCHAR
2171 Perl_re_printf( aTHX_ "\n");
2172 Perl_re_indentf( aTHX_ "State+-", depth+1 );
2174 for( charid=0 ; charid < trie->uniquecharcount ; charid++ ) {
2175 Perl_re_printf( aTHX_ "%.*s", colwidth,"--------");
2178 Perl_re_printf( aTHX_ "\n" );
2180 for( state=1 ; state < next_alloc ; state += trie->uniquecharcount ) {
2182 Perl_re_indentf( aTHX_ "%4"UVXf" : ",
2184 (UV)TRIE_NODENUM( state ) );
2186 for( charid = 0 ; charid < trie->uniquecharcount ; charid++ ) {
2187 UV v=(UV)SAFE_TRIE_NODENUM( trie->trans[ state + charid ].next );
2189 Perl_re_printf( aTHX_ "%*"UVXf, colwidth, v );
2191 Perl_re_printf( aTHX_ "%*s", colwidth, "." );
2193 if ( ! trie->states[ TRIE_NODENUM( state ) ].wordnum ) {
2194 Perl_re_printf( aTHX_ " (%4"UVXf")\n",
2195 (UV)trie->trans[ state ].check );
2197 Perl_re_printf( aTHX_ " (%4"UVXf") W%4X\n",
2198 (UV)trie->trans[ state ].check,
2199 trie->states[ TRIE_NODENUM( state ) ].wordnum );
2207 /* make_trie(startbranch,first,last,tail,word_count,flags,depth)
2208 startbranch: the first branch in the whole branch sequence
2209 first : start branch of sequence of branch-exact nodes.
2210 May be the same as startbranch
2211 last : Thing following the last branch.
2212 May be the same as tail.
2213 tail : item following the branch sequence
2214 count : words in the sequence
2215 flags : currently the OP() type we will be building one of /EXACT(|F|FA|FU|FU_SS|L|FLU8)/
2216 depth : indent depth
2218 Inplace optimizes a sequence of 2 or more Branch-Exact nodes into a TRIE node.
2220 A trie is an N'ary tree where the branches are determined by digital
2221 decomposition of the key. IE, at the root node you look up the 1st character and
2222 follow that branch repeat until you find the end of the branches. Nodes can be
2223 marked as "accepting" meaning they represent a complete word. Eg:
2227 would convert into the following structure. Numbers represent states, letters
2228 following numbers represent valid transitions on the letter from that state, if
2229 the number is in square brackets it represents an accepting state, otherwise it
2230 will be in parenthesis.
2232 +-h->+-e->[3]-+-r->(8)-+-s->[9]
2236 (1) +-i->(6)-+-s->[7]
2238 +-s->(3)-+-h->(4)-+-e->[5]
2240 Accept Word Mapping: 3=>1 (he),5=>2 (she), 7=>3 (his), 9=>4 (hers)
2242 This shows that when matching against the string 'hers' we will begin at state 1
2243 read 'h' and move to state 2, read 'e' and move to state 3 which is accepting,
2244 then read 'r' and go to state 8 followed by 's' which takes us to state 9 which
2245 is also accepting. Thus we know that we can match both 'he' and 'hers' with a
2246 single traverse. We store a mapping from accepting to state to which word was
2247 matched, and then when we have multiple possibilities we try to complete the
2248 rest of the regex in the order in which they occurred in the alternation.
2250 The only prior NFA like behaviour that would be changed by the TRIE support is
2251 the silent ignoring of duplicate alternations which are of the form:
2253 / (DUPE|DUPE) X? (?{ ... }) Y /x
2255 Thus EVAL blocks following a trie may be called a different number of times with
2256 and without the optimisation. With the optimisations dupes will be silently
2257 ignored. This inconsistent behaviour of EVAL type nodes is well established as
2258 the following demonstrates:
2260 'words'=~/(word|word|word)(?{ print $1 })[xyz]/
2262 which prints out 'word' three times, but
2264 'words'=~/(word|word|word)(?{ print $1 })S/
2266 which doesnt print it out at all. This is due to other optimisations kicking in.
2268 Example of what happens on a structural level:
2270 The regexp /(ac|ad|ab)+/ will produce the following debug output:
2272 1: CURLYM[1] {1,32767}(18)
2283 This would be optimizable with startbranch=5, first=5, last=16, tail=16
2284 and should turn into:
2286 1: CURLYM[1] {1,32767}(18)
2288 [Words:3 Chars Stored:6 Unique Chars:4 States:5 NCP:1]
2296 Cases where tail != last would be like /(?foo|bar)baz/:
2306 which would be optimizable with startbranch=1, first=1, last=7, tail=8
2307 and would end up looking like:
2310 [Words:2 Chars Stored:6 Unique Chars:5 States:7 NCP:1]
2317 d = uvchr_to_utf8_flags(d, uv, 0);
2319 is the recommended Unicode-aware way of saying
2324 #define TRIE_STORE_REVCHAR(val) \
2327 SV *zlopp = newSV(UTF8_MAXBYTES); \
2328 unsigned char *flrbbbbb = (unsigned char *) SvPVX(zlopp); \
2329 unsigned const char *const kapow = uvchr_to_utf8(flrbbbbb, val); \
2330 SvCUR_set(zlopp, kapow - flrbbbbb); \
2333 av_push(revcharmap, zlopp); \
2335 char ooooff = (char)val; \
2336 av_push(revcharmap, newSVpvn(&ooooff, 1)); \
2340 /* This gets the next character from the input, folding it if not already
2342 #define TRIE_READ_CHAR STMT_START { \
2345 /* if it is UTF then it is either already folded, or does not need \
2347 uvc = valid_utf8_to_uvchr( (const U8*) uc, &len); \
2349 else if (folder == PL_fold_latin1) { \
2350 /* This folder implies Unicode rules, which in the range expressible \
2351 * by not UTF is the lower case, with the two exceptions, one of \
2352 * which should have been taken care of before calling this */ \
2353 assert(*uc != LATIN_SMALL_LETTER_SHARP_S); \
2354 uvc = toLOWER_L1(*uc); \
2355 if (UNLIKELY(uvc == MICRO_SIGN)) uvc = GREEK_SMALL_LETTER_MU; \
2358 /* raw data, will be folded later if needed */ \
2366 #define TRIE_LIST_PUSH(state,fid,ns) STMT_START { \
2367 if ( TRIE_LIST_CUR( state ) >=TRIE_LIST_LEN( state ) ) { \
2368 U32 ging = TRIE_LIST_LEN( state ) *= 2; \
2369 Renew( trie->states[ state ].trans.list, ging, reg_trie_trans_le ); \
2371 TRIE_LIST_ITEM( state, TRIE_LIST_CUR( state ) ).forid = fid; \
2372 TRIE_LIST_ITEM( state, TRIE_LIST_CUR( state ) ).newstate = ns; \
2373 TRIE_LIST_CUR( state )++; \
2376 #define TRIE_LIST_NEW(state) STMT_START { \
2377 Newxz( trie->states[ state ].trans.list, \
2378 4, reg_trie_trans_le ); \
2379 TRIE_LIST_CUR( state ) = 1; \
2380 TRIE_LIST_LEN( state ) = 4; \
2383 #define TRIE_HANDLE_WORD(state) STMT_START { \
2384 U16 dupe= trie->states[ state ].wordnum; \
2385 regnode * const noper_next = regnext( noper ); \
2388 /* store the word for dumping */ \
2390 if (OP(noper) != NOTHING) \
2391 tmp = newSVpvn_utf8(STRING(noper), STR_LEN(noper), UTF); \
2393 tmp = newSVpvn_utf8( "", 0, UTF ); \
2394 av_push( trie_words, tmp ); \
2398 trie->wordinfo[curword].prev = 0; \
2399 trie->wordinfo[curword].len = wordlen; \
2400 trie->wordinfo[curword].accept = state; \
2402 if ( noper_next < tail ) { \
2404 trie->jump = (U16 *) PerlMemShared_calloc( word_count + 1, \
2406 trie->jump[curword] = (U16)(noper_next - convert); \
2408 jumper = noper_next; \
2410 nextbranch= regnext(cur); \
2414 /* It's a dupe. Pre-insert into the wordinfo[].prev */\
2415 /* chain, so that when the bits of chain are later */\
2416 /* linked together, the dups appear in the chain */\
2417 trie->wordinfo[curword].prev = trie->wordinfo[dupe].prev; \
2418 trie->wordinfo[dupe].prev = curword; \
2420 /* we haven't inserted this word yet. */ \
2421 trie->states[ state ].wordnum = curword; \
2426 #define TRIE_TRANS_STATE(state,base,ucharcount,charid,special) \
2427 ( ( base + charid >= ucharcount \
2428 && base + charid < ubound \
2429 && state == trie->trans[ base - ucharcount + charid ].check \
2430 && trie->trans[ base - ucharcount + charid ].next ) \
2431 ? trie->trans[ base - ucharcount + charid ].next \
2432 : ( state==1 ? special : 0 ) \
2436 #define MADE_JUMP_TRIE 2
2437 #define MADE_EXACT_TRIE 4
2440 S_make_trie(pTHX_ RExC_state_t *pRExC_state, regnode *startbranch,
2441 regnode *first, regnode *last, regnode *tail,
2442 U32 word_count, U32 flags, U32 depth)
2444 /* first pass, loop through and scan words */
2445 reg_trie_data *trie;
2446 HV *widecharmap = NULL;
2447 AV *revcharmap = newAV();
2453 regnode *jumper = NULL;
2454 regnode *nextbranch = NULL;
2455 regnode *convert = NULL;
2456 U32 *prev_states; /* temp array mapping each state to previous one */
2457 /* we just use folder as a flag in utf8 */
2458 const U8 * folder = NULL;
2461 const U32 data_slot = add_data( pRExC_state, STR_WITH_LEN("tuuu"));
2462 AV *trie_words = NULL;
2463 /* along with revcharmap, this only used during construction but both are
2464 * useful during debugging so we store them in the struct when debugging.
2467 const U32 data_slot = add_data( pRExC_state, STR_WITH_LEN("tu"));
2468 STRLEN trie_charcount=0;
2470 SV *re_trie_maxbuff;
2471 GET_RE_DEBUG_FLAGS_DECL;
2473 PERL_ARGS_ASSERT_MAKE_TRIE;
2475 PERL_UNUSED_ARG(depth);
2479 case EXACT: case EXACTL: break;
2483 case EXACTFLU8: folder = PL_fold_latin1; break;
2484 case EXACTF: folder = PL_fold; break;
2485 default: Perl_croak( aTHX_ "panic! In trie construction, unknown node type %u %s", (unsigned) flags, PL_reg_name[flags] );
2488 trie = (reg_trie_data *) PerlMemShared_calloc( 1, sizeof(reg_trie_data) );
2490 trie->startstate = 1;
2491 trie->wordcount = word_count;
2492 RExC_rxi->data->data[ data_slot ] = (void*)trie;
2493 trie->charmap = (U16 *) PerlMemShared_calloc( 256, sizeof(U16) );
2494 if (flags == EXACT || flags == EXACTL)
2495 trie->bitmap = (char *) PerlMemShared_calloc( ANYOF_BITMAP_SIZE, 1 );
2496 trie->wordinfo = (reg_trie_wordinfo *) PerlMemShared_calloc(
2497 trie->wordcount+1, sizeof(reg_trie_wordinfo));
2500 trie_words = newAV();
2503 re_trie_maxbuff = get_sv(RE_TRIE_MAXBUF_NAME, 1);
2504 assert(re_trie_maxbuff);
2505 if (!SvIOK(re_trie_maxbuff)) {
2506 sv_setiv(re_trie_maxbuff, RE_TRIE_MAXBUF_INIT);
2508 DEBUG_TRIE_COMPILE_r({
2509 Perl_re_indentf( aTHX_
2510 "make_trie start==%d, first==%d, last==%d, tail==%d depth=%d\n",
2512 REG_NODE_NUM(startbranch),REG_NODE_NUM(first),
2513 REG_NODE_NUM(last), REG_NODE_NUM(tail), (int)depth);
2516 /* Find the node we are going to overwrite */
2517 if ( first == startbranch && OP( last ) != BRANCH ) {
2518 /* whole branch chain */
2521 /* branch sub-chain */
2522 convert = NEXTOPER( first );
2525 /* -- First loop and Setup --
2527 We first traverse the branches and scan each word to determine if it
2528 contains widechars, and how many unique chars there are, this is
2529 important as we have to build a table with at least as many columns as we
2532 We use an array of integers to represent the character codes 0..255
2533 (trie->charmap) and we use a an HV* to store Unicode characters. We use
2534 the native representation of the character value as the key and IV's for
2537 *TODO* If we keep track of how many times each character is used we can
2538 remap the columns so that the table compression later on is more
2539 efficient in terms of memory by ensuring the most common value is in the
2540 middle and the least common are on the outside. IMO this would be better
2541 than a most to least common mapping as theres a decent chance the most
2542 common letter will share a node with the least common, meaning the node
2543 will not be compressible. With a middle is most common approach the worst
2544 case is when we have the least common nodes twice.
2548 for ( cur = first ; cur < last ; cur = regnext( cur ) ) {
2549 regnode *noper = NEXTOPER( cur );
2553 U32 wordlen = 0; /* required init */
2554 STRLEN minchars = 0;
2555 STRLEN maxchars = 0;
2556 bool set_bit = trie->bitmap ? 1 : 0; /*store the first char in the
2559 if (OP(noper) == NOTHING) {
2560 regnode *noper_next= regnext(noper);
2561 if (noper_next < tail)
2565 if ( noper < tail && ( OP(noper) == flags || ( flags == EXACTFU && OP(noper) == EXACTFU_SS ) ) ) {
2566 uc= (U8*)STRING(noper);
2567 e= uc + STR_LEN(noper);
2574 if ( set_bit ) { /* bitmap only alloced when !(UTF&&Folding) */
2575 TRIE_BITMAP_SET(trie,*uc); /* store the raw first byte
2576 regardless of encoding */
2577 if (OP( noper ) == EXACTFU_SS) {
2578 /* false positives are ok, so just set this */
2579 TRIE_BITMAP_SET(trie, LATIN_SMALL_LETTER_SHARP_S);
2582 for ( ; uc < e ; uc += len ) { /* Look at each char in the current
2584 TRIE_CHARCOUNT(trie)++;
2587 /* TRIE_READ_CHAR returns the current character, or its fold if /i
2588 * is in effect. Under /i, this character can match itself, or
2589 * anything that folds to it. If not under /i, it can match just
2590 * itself. Most folds are 1-1, for example k, K, and KELVIN SIGN
2591 * all fold to k, and all are single characters. But some folds
2592 * expand to more than one character, so for example LATIN SMALL
2593 * LIGATURE FFI folds to the three character sequence 'ffi'. If
2594 * the string beginning at 'uc' is 'ffi', it could be matched by
2595 * three characters, or just by the one ligature character. (It
2596 * could also be matched by two characters: LATIN SMALL LIGATURE FF
2597 * followed by 'i', or by 'f' followed by LATIN SMALL LIGATURE FI).
2598 * (Of course 'I' and/or 'F' instead of 'i' and 'f' can also
2599 * match.) The trie needs to know the minimum and maximum number
2600 * of characters that could match so that it can use size alone to
2601 * quickly reject many match attempts. The max is simple: it is
2602 * the number of folded characters in this branch (since a fold is
2603 * never shorter than what folds to it. */
2607 /* And the min is equal to the max if not under /i (indicated by
2608 * 'folder' being NULL), or there are no multi-character folds. If
2609 * there is a multi-character fold, the min is incremented just
2610 * once, for the character that folds to the sequence. Each
2611 * character in the sequence needs to be added to the list below of
2612 * characters in the trie, but we count only the first towards the
2613 * min number of characters needed. This is done through the
2614 * variable 'foldlen', which is returned by the macros that look
2615 * for these sequences as the number of bytes the sequence
2616 * occupies. Each time through the loop, we decrement 'foldlen' by
2617 * how many bytes the current char occupies. Only when it reaches
2618 * 0 do we increment 'minchars' or look for another multi-character
2620 if (folder == NULL) {
2623 else if (foldlen > 0) {
2624 foldlen -= (UTF) ? UTF8SKIP(uc) : 1;
2629 /* See if *uc is the beginning of a multi-character fold. If
2630 * so, we decrement the length remaining to look at, to account
2631 * for the current character this iteration. (We can use 'uc'
2632 * instead of the fold returned by TRIE_READ_CHAR because for
2633 * non-UTF, the latin1_safe macro is smart enough to account
2634 * for all the unfolded characters, and because for UTF, the
2635 * string will already have been folded earlier in the
2636 * compilation process */
2638 if ((foldlen = is_MULTI_CHAR_FOLD_utf8_safe(uc, e))) {
2639 foldlen -= UTF8SKIP(uc);
2642 else if ((foldlen = is_MULTI_CHAR_FOLD_latin1_safe(uc, e))) {
2647 /* The current character (and any potential folds) should be added
2648 * to the possible matching characters for this position in this
2652 U8 folded= folder[ (U8) uvc ];
2653 if ( !trie->charmap[ folded ] ) {
2654 trie->charmap[ folded ]=( ++trie->uniquecharcount );
2655 TRIE_STORE_REVCHAR( folded );
2658 if ( !trie->charmap[ uvc ] ) {
2659 trie->charmap[ uvc ]=( ++trie->uniquecharcount );
2660 TRIE_STORE_REVCHAR( uvc );
2663 /* store the codepoint in the bitmap, and its folded
2665 TRIE_BITMAP_SET(trie, uvc);
2667 /* store the folded codepoint */
2668 if ( folder ) TRIE_BITMAP_SET(trie, folder[(U8) uvc ]);
2671 /* store first byte of utf8 representation of
2672 variant codepoints */
2673 if (! UVCHR_IS_INVARIANT(uvc)) {
2674 TRIE_BITMAP_SET(trie, UTF8_TWO_BYTE_HI(uvc));
2677 set_bit = 0; /* We've done our bit :-) */
2681 /* XXX We could come up with the list of code points that fold
2682 * to this using PL_utf8_foldclosures, except not for
2683 * multi-char folds, as there may be multiple combinations
2684 * there that could work, which needs to wait until runtime to
2685 * resolve (The comment about LIGATURE FFI above is such an
2690 widecharmap = newHV();
2692 svpp = hv_fetch( widecharmap, (char*)&uvc, sizeof( UV ), 1 );
2695 Perl_croak( aTHX_ "error creating/fetching widecharmap entry for 0x%"UVXf, uvc );
2697 if ( !SvTRUE( *svpp ) ) {
2698 sv_setiv( *svpp, ++trie->uniquecharcount );
2699 TRIE_STORE_REVCHAR(uvc);
2702 } /* end loop through characters in this branch of the trie */
2704 /* We take the min and max for this branch and combine to find the min
2705 * and max for all branches processed so far */
2706 if( cur == first ) {
2707 trie->minlen = minchars;
2708 trie->maxlen = maxchars;
2709 } else if (minchars < trie->minlen) {
2710 trie->minlen = minchars;
2711 } else if (maxchars > trie->maxlen) {
2712 trie->maxlen = maxchars;
2714 } /* end first pass */
2715 DEBUG_TRIE_COMPILE_r(
2716 Perl_re_indentf( aTHX_
2717 "TRIE(%s): W:%d C:%d Uq:%d Min:%d Max:%d\n",
2719 ( widecharmap ? "UTF8" : "NATIVE" ), (int)word_count,
2720 (int)TRIE_CHARCOUNT(trie), trie->uniquecharcount,
2721 (int)trie->minlen, (int)trie->maxlen )
2725 We now know what we are dealing with in terms of unique chars and
2726 string sizes so we can calculate how much memory a naive
2727 representation using a flat table will take. If it's over a reasonable
2728 limit (as specified by ${^RE_TRIE_MAXBUF}) we use a more memory
2729 conservative but potentially much slower representation using an array
2732 At the end we convert both representations into the same compressed
2733 form that will be used in regexec.c for matching with. The latter
2734 is a form that cannot be used to construct with but has memory
2735 properties similar to the list form and access properties similar
2736 to the table form making it both suitable for fast searches and
2737 small enough that its feasable to store for the duration of a program.
2739 See the comment in the code where the compressed table is produced
2740 inplace from the flat tabe representation for an explanation of how
2741 the compression works.
2746 Newx(prev_states, TRIE_CHARCOUNT(trie) + 2, U32);
2749 if ( (IV)( ( TRIE_CHARCOUNT(trie) + 1 ) * trie->uniquecharcount + 1)
2750 > SvIV(re_trie_maxbuff) )
2753 Second Pass -- Array Of Lists Representation
2755 Each state will be represented by a list of charid:state records
2756 (reg_trie_trans_le) the first such element holds the CUR and LEN
2757 points of the allocated array. (See defines above).
2759 We build the initial structure using the lists, and then convert
2760 it into the compressed table form which allows faster lookups
2761 (but cant be modified once converted).
2764 STRLEN transcount = 1;
2766 DEBUG_TRIE_COMPILE_MORE_r( Perl_re_indentf( aTHX_ "Compiling trie using list compiler\n",
2769 trie->states = (reg_trie_state *)
2770 PerlMemShared_calloc( TRIE_CHARCOUNT(trie) + 2,
2771 sizeof(reg_trie_state) );
2775 for ( cur = first ; cur < last ; cur = regnext( cur ) ) {
2777 regnode *noper = NEXTOPER( cur );
2778 U32 state = 1; /* required init */
2779 U16 charid = 0; /* sanity init */
2780 U32 wordlen = 0; /* required init */
2782 if (OP(noper) == NOTHING) {
2783 regnode *noper_next= regnext(noper);
2784 if (noper_next < tail)
2788 if ( noper < tail && ( OP(noper) == flags || ( flags == EXACTFU && OP(noper) == EXACTFU_SS ) ) ) {
2789 const U8 *uc= (U8*)STRING(noper);
2790 const U8 *e= uc + STR_LEN(noper);
2792 for ( ; uc < e ; uc += len ) {
2797 charid = trie->charmap[ uvc ];
2799 SV** const svpp = hv_fetch( widecharmap,
2806 charid=(U16)SvIV( *svpp );
2809 /* charid is now 0 if we dont know the char read, or
2810 * nonzero if we do */
2817 if ( !trie->states[ state ].trans.list ) {
2818 TRIE_LIST_NEW( state );
2821 check <= TRIE_LIST_USED( state );
2824 if ( TRIE_LIST_ITEM( state, check ).forid
2827 newstate = TRIE_LIST_ITEM( state, check ).newstate;
2832 newstate = next_alloc++;
2833 prev_states[newstate] = state;
2834 TRIE_LIST_PUSH( state, charid, newstate );
2839 Perl_croak( aTHX_ "panic! In trie construction, no char mapping for %"IVdf, uvc );
2843 TRIE_HANDLE_WORD(state);
2845 } /* end second pass */
2847 /* next alloc is the NEXT state to be allocated */
2848 trie->statecount = next_alloc;
2849 trie->states = (reg_trie_state *)
2850 PerlMemShared_realloc( trie->states,
2852 * sizeof(reg_trie_state) );
2854 /* and now dump it out before we compress it */
2855 DEBUG_TRIE_COMPILE_MORE_r(dump_trie_interim_list(trie, widecharmap,
2856 revcharmap, next_alloc,
2860 trie->trans = (reg_trie_trans *)
2861 PerlMemShared_calloc( transcount, sizeof(reg_trie_trans) );
2868 for( state=1 ; state < next_alloc ; state ++ ) {
2872 DEBUG_TRIE_COMPILE_MORE_r(
2873 Perl_re_printf( aTHX_ "tp: %d zp: %d ",tp,zp)
2877 if (trie->states[state].trans.list) {
2878 U16 minid=TRIE_LIST_ITEM( state, 1).forid;
2882 for( idx = 2 ; idx <= TRIE_LIST_USED( state ) ; idx++ ) {
2883 const U16 forid = TRIE_LIST_ITEM( state, idx).forid;
2884 if ( forid < minid ) {
2886 } else if ( forid > maxid ) {
2890 if ( transcount < tp + maxid - minid + 1) {
2892 trie->trans = (reg_trie_trans *)
2893 PerlMemShared_realloc( trie->trans,
2895 * sizeof(reg_trie_trans) );
2896 Zero( trie->trans + (transcount / 2),
2900 base = trie->uniquecharcount + tp - minid;
2901 if ( maxid == minid ) {
2903 for ( ; zp < tp ; zp++ ) {
2904 if ( ! trie->trans[ zp ].next ) {
2905 base = trie->uniquecharcount + zp - minid;
2906 trie->trans[ zp ].next = TRIE_LIST_ITEM( state,
2908 trie->trans[ zp ].check = state;
2914 trie->trans[ tp ].next = TRIE_LIST_ITEM( state,
2916 trie->trans[ tp ].check = state;
2921 for ( idx=1; idx <= TRIE_LIST_USED( state ) ; idx++ ) {
2922 const U32 tid = base
2923 - trie->uniquecharcount
2924 + TRIE_LIST_ITEM( state, idx ).forid;
2925 trie->trans[ tid ].next = TRIE_LIST_ITEM( state,
2927 trie->trans[ tid ].check = state;
2929 tp += ( maxid - minid + 1 );
2931 Safefree(trie->states[ state ].trans.list);
2934 DEBUG_TRIE_COMPILE_MORE_r(
2935 Perl_re_printf( aTHX_ " base: %d\n",base);
2938 trie->states[ state ].trans.base=base;
2940 trie->lasttrans = tp + 1;
2944 Second Pass -- Flat Table Representation.
2946 we dont use the 0 slot of either trans[] or states[] so we add 1 to
2947 each. We know that we will need Charcount+1 trans at most to store
2948 the data (one row per char at worst case) So we preallocate both
2949 structures assuming worst case.
2951 We then construct the trie using only the .next slots of the entry
2954 We use the .check field of the first entry of the node temporarily
2955 to make compression both faster and easier by keeping track of how
2956 many non zero fields are in the node.
2958 Since trans are numbered from 1 any 0 pointer in the table is a FAIL
2961 There are two terms at use here: state as a TRIE_NODEIDX() which is
2962 a number representing the first entry of the node, and state as a
2963 TRIE_NODENUM() which is the trans number. state 1 is TRIE_NODEIDX(1)
2964 and TRIE_NODENUM(1), state 2 is TRIE_NODEIDX(2) and TRIE_NODENUM(3)
2965 if there are 2 entrys per node. eg:
2973 The table is internally in the right hand, idx form. However as we
2974 also have to deal with the states array which is indexed by nodenum
2975 we have to use TRIE_NODENUM() to convert.
2978 DEBUG_TRIE_COMPILE_MORE_r( Perl_re_indentf( aTHX_ "Compiling trie using table compiler\n",
2981 trie->trans = (reg_trie_trans *)
2982 PerlMemShared_calloc( ( TRIE_CHARCOUNT(trie) + 1 )
2983 * trie->uniquecharcount + 1,
2984 sizeof(reg_trie_trans) );
2985 trie->states = (reg_trie_state *)
2986 PerlMemShared_calloc( TRIE_CHARCOUNT(trie) + 2,
2987 sizeof(reg_trie_state) );
2988 next_alloc = trie->uniquecharcount + 1;
2991 for ( cur = first ; cur < last ; cur = regnext( cur ) ) {
2993 regnode *noper = NEXTOPER( cur );
2995 U32 state = 1; /* required init */
2997 U16 charid = 0; /* sanity init */
2998 U32 accept_state = 0; /* sanity init */
3000 U32 wordlen = 0; /* required init */
3002 if (OP(noper) == NOTHING) {
3003 regnode *noper_next= regnext(noper);
3004 if (noper_next < tail)
3008 if ( noper < tail && ( OP(noper) == flags || ( flags == EXACTFU && OP(noper) == EXACTFU_SS ) ) ) {
3009 const U8 *uc= (U8*)STRING(noper);
3010 const U8 *e= uc + STR_LEN(noper);
3012 for ( ; uc < e ; uc += len ) {
3017 charid = trie->charmap[ uvc ];
3019 SV* const * const svpp = hv_fetch( widecharmap,
3023 charid = svpp ? (U16)SvIV(*svpp) : 0;
3027 if ( !trie->trans[ state + charid ].next ) {
3028 trie->trans[ state + charid ].next = next_alloc;
3029 trie->trans[ state ].check++;
3030 prev_states[TRIE_NODENUM(next_alloc)]
3031 = TRIE_NODENUM(state);
3032 next_alloc += trie->uniquecharcount;
3034 state = trie->trans[ state + charid ].next;
3036 Perl_croak( aTHX_ "panic! In trie construction, no char mapping for %"IVdf, uvc );
3038 /* charid is now 0 if we dont know the char read, or
3039 * nonzero if we do */
3042 accept_state = TRIE_NODENUM( state );
3043 TRIE_HANDLE_WORD(accept_state);
3045 } /* end second pass */
3047 /* and now dump it out before we compress it */
3048 DEBUG_TRIE_COMPILE_MORE_r(dump_trie_interim_table(trie, widecharmap,
3050 next_alloc, depth+1));
3054 * Inplace compress the table.*
3056 For sparse data sets the table constructed by the trie algorithm will
3057 be mostly 0/FAIL transitions or to put it another way mostly empty.
3058 (Note that leaf nodes will not contain any transitions.)
3060 This algorithm compresses the tables by eliminating most such
3061 transitions, at the cost of a modest bit of extra work during lookup:
3063 - Each states[] entry contains a .base field which indicates the
3064 index in the state[] array wheres its transition data is stored.
3066 - If .base is 0 there are no valid transitions from that node.
3068 - If .base is nonzero then charid is added to it to find an entry in
3071 -If trans[states[state].base+charid].check!=state then the
3072 transition is taken to be a 0/Fail transition. Thus if there are fail
3073 transitions at the front of the node then the .base offset will point
3074 somewhere inside the previous nodes data (or maybe even into a node
3075 even earlier), but the .check field determines if the transition is
3079 The following process inplace converts the table to the compressed
3080 table: We first do not compress the root node 1,and mark all its
3081 .check pointers as 1 and set its .base pointer as 1 as well. This
3082 allows us to do a DFA construction from the compressed table later,
3083 and ensures that any .base pointers we calculate later are greater
3086 - We set 'pos' to indicate the first entry of the second node.
3088 - We then iterate over the columns of the node, finding the first and
3089 last used entry at l and m. We then copy l..m into pos..(pos+m-l),
3090 and set the .check pointers accordingly, and advance pos
3091 appropriately and repreat for the next node. Note that when we copy
3092 the next pointers we have to convert them from the original
3093 NODEIDX form to NODENUM form as the former is not valid post
3096 - If a node has no transitions used we mark its base as 0 and do not
3097 advance the pos pointer.
3099 - If a node only has one transition we use a second pointer into the
3100 structure to fill in allocated fail transitions from other states.
3101 This pointer is independent of the main pointer and scans forward
3102 looking for null transitions that are allocated to a state. When it
3103 finds one it writes the single transition into the "hole". If the
3104 pointer doesnt find one the single transition is appended as normal.
3106 - Once compressed we can Renew/realloc the structures to release the
3109 See "Table-Compression Methods" in sec 3.9 of the Red Dragon,
3110 specifically Fig 3.47 and the associated pseudocode.
3114 const U32 laststate = TRIE_NODENUM( next_alloc );
3117 trie->statecount = laststate;
3119 for ( state = 1 ; state < laststate ; state++ ) {
3121 const U32 stateidx = TRIE_NODEIDX( state );
3122 const U32 o_used = trie->trans[ stateidx ].check;
3123 U32 used = trie->trans[ stateidx ].check;
3124 trie->trans[ stateidx ].check = 0;
3127 used && charid < trie->uniquecharcount;
3130 if ( flag || trie->trans[ stateidx + charid ].next ) {
3131 if ( trie->trans[ stateidx + charid ].next ) {
3133 for ( ; zp < pos ; zp++ ) {
3134 if ( ! trie->trans[ zp ].next ) {
3138 trie->states[ state ].trans.base
3140 + trie->uniquecharcount
3142 trie->trans[ zp ].next
3143 = SAFE_TRIE_NODENUM( trie->trans[ stateidx
3145 trie->trans[ zp ].check = state;
3146 if ( ++zp > pos ) pos = zp;
3153 trie->states[ state ].trans.base
3154 = pos + trie->uniquecharcount - charid ;
3156 trie->trans[ pos ].next
3157 = SAFE_TRIE_NODENUM(
3158 trie->trans[ stateidx + charid ].next );
3159 trie->trans[ pos ].check = state;
3164 trie->lasttrans = pos + 1;
3165 trie->states = (reg_trie_state *)
3166 PerlMemShared_realloc( trie->states, laststate
3167 * sizeof(reg_trie_state) );
3168 DEBUG_TRIE_COMPILE_MORE_r(
3169 Perl_re_indentf( aTHX_ "Alloc: %d Orig: %"IVdf" elements, Final:%"IVdf". Savings of %%%5.2f\n",
3171 (int)( ( TRIE_CHARCOUNT(trie) + 1 ) * trie->uniquecharcount
3175 ( ( next_alloc - pos ) * 100 ) / (double)next_alloc );
3178 } /* end table compress */
3180 DEBUG_TRIE_COMPILE_MORE_r(
3181 Perl_re_indentf( aTHX_ "Statecount:%"UVxf" Lasttrans:%"UVxf"\n",
3183 (UV)trie->statecount,
3184 (UV)trie->lasttrans)
3186 /* resize the trans array to remove unused space */
3187 trie->trans = (reg_trie_trans *)
3188 PerlMemShared_realloc( trie->trans, trie->lasttrans
3189 * sizeof(reg_trie_trans) );
3191 { /* Modify the program and insert the new TRIE node */
3192 U8 nodetype =(U8)(flags & 0xFF);
3196 regnode *optimize = NULL;
3197 #ifdef RE_TRACK_PATTERN_OFFSETS
3200 U32 mjd_nodelen = 0;
3201 #endif /* RE_TRACK_PATTERN_OFFSETS */
3202 #endif /* DEBUGGING */
3204 This means we convert either the first branch or the first Exact,
3205 depending on whether the thing following (in 'last') is a branch
3206 or not and whther first is the startbranch (ie is it a sub part of
3207 the alternation or is it the whole thing.)
3208 Assuming its a sub part we convert the EXACT otherwise we convert
3209 the whole branch sequence, including the first.
3211 /* Find the node we are going to overwrite */
3212 if ( first != startbranch || OP( last ) == BRANCH ) {
3213 /* branch sub-chain */
3214 NEXT_OFF( first ) = (U16)(last - first);
3215 #ifdef RE_TRACK_PATTERN_OFFSETS
3217 mjd_offset= Node_Offset((convert));
3218 mjd_nodelen= Node_Length((convert));
3221 /* whole branch chain */
3223 #ifdef RE_TRACK_PATTERN_OFFSETS
3226 const regnode *nop = NEXTOPER( convert );
3227 mjd_offset= Node_Offset((nop));
3228 mjd_nodelen= Node_Length((nop));
3232 Perl_re_indentf( aTHX_ "MJD offset:%"UVuf" MJD length:%"UVuf"\n",
3234 (UV)mjd_offset, (UV)mjd_nodelen)
3237 /* But first we check to see if there is a common prefix we can
3238 split out as an EXACT and put in front of the TRIE node. */
3239 trie->startstate= 1;
3240 if ( trie->bitmap && !widecharmap && !trie->jump ) {
3242 for ( state = 1 ; state < trie->statecount-1 ; state++ ) {
3246 const U32 base = trie->states[ state ].trans.base;
3248 if ( trie->states[state].wordnum )
3251 for ( ofs = 0 ; ofs < trie->uniquecharcount ; ofs++ ) {
3252 if ( ( base + ofs >= trie->uniquecharcount ) &&
3253 ( base + ofs - trie->uniquecharcount < trie->lasttrans ) &&
3254 trie->trans[ base + ofs - trie->uniquecharcount ].check == state )
3256 if ( ++count > 1 ) {
3257 SV **tmp = av_fetch( revcharmap, ofs, 0);
3258 const U8 *ch = (U8*)SvPV_nolen_const( *tmp );
3259 if ( state == 1 ) break;
3261 Zero(trie->bitmap, ANYOF_BITMAP_SIZE, char);
3263 Perl_re_indentf( aTHX_ "New Start State=%"UVuf" Class: [",
3267 SV ** const tmp = av_fetch( revcharmap, idx, 0);
3268 const U8 * const ch = (U8*)SvPV_nolen_const( *tmp );
3270 TRIE_BITMAP_SET(trie,*ch);
3272 TRIE_BITMAP_SET(trie, folder[ *ch ]);
3274 Perl_re_printf( aTHX_ "%s", (char*)ch)
3278 TRIE_BITMAP_SET(trie,*ch);
3280 TRIE_BITMAP_SET(trie,folder[ *ch ]);
3281 DEBUG_OPTIMISE_r(Perl_re_printf( aTHX_ "%s", ch));
3287 SV **tmp = av_fetch( revcharmap, idx, 0);
3289 char *ch = SvPV( *tmp, len );
3291 SV *sv=sv_newmortal();
3292 Perl_re_indentf( aTHX_ "Prefix State: %"UVuf" Idx:%"UVuf" Char='%s'\n",
3295 pv_pretty(sv, SvPV_nolen_const(*tmp), SvCUR(*tmp), 6,
3296 PL_colors[0], PL_colors[1],
3297 (SvUTF8(*tmp) ? PERL_PV_ESCAPE_UNI : 0) |
3298 PERL_PV_ESCAPE_FIRSTCHAR
3303 OP( convert ) = nodetype;
3304 str=STRING(convert);
3307 STR_LEN(convert) += len;
3313 DEBUG_OPTIMISE_r(Perl_re_printf( aTHX_ "]\n"));
3318 trie->prefixlen = (state-1);
3320 regnode *n = convert+NODE_SZ_STR(convert);
3321 NEXT_OFF(convert) = NODE_SZ_STR(convert);
3322 trie->startstate = state;
3323 trie->minlen -= (state - 1);
3324 trie->maxlen -= (state - 1);
3326 /* At least the UNICOS C compiler choked on this
3327 * being argument to DEBUG_r(), so let's just have
3330 #ifdef PERL_EXT_RE_BUILD
3336 regnode *fix = convert;
3337 U32 word = trie->wordcount;
3339 Set_Node_Offset_Length(convert, mjd_offset, state - 1);
3340 while( ++fix < n ) {
3341 Set_Node_Offset_Length(fix, 0, 0);
3344 SV ** const tmp = av_fetch( trie_words, word, 0 );
3346 if ( STR_LEN(convert) <= SvCUR(*tmp) )
3347 sv_chop(*tmp, SvPV_nolen(*tmp) + STR_LEN(convert));
3349 sv_chop(*tmp, SvPV_nolen(*tmp) + SvCUR(*tmp));
3357 NEXT_OFF(convert) = (U16)(tail - convert);
3358 DEBUG_r(optimize= n);
3364 if ( trie->maxlen ) {
3365 NEXT_OFF( convert ) = (U16)(tail - convert);
3366 ARG_SET( convert, data_slot );
3367 /* Store the offset to the first unabsorbed branch in
3368 jump[0], which is otherwise unused by the jump logic.
3369 We use this when dumping a trie and during optimisation. */
3371 trie->jump[0] = (U16)(nextbranch - convert);
3373 /* If the start state is not accepting (meaning there is no empty string/NOTHING)
3374 * and there is a bitmap
3375 * and the first "jump target" node we found leaves enough room
3376 * then convert the TRIE node into a TRIEC node, with the bitmap
3377 * embedded inline in the opcode - this is hypothetically faster.
3379 if ( !trie->states[trie->startstate].wordnum
3381 && ( (char *)jumper - (char *)convert) >= (int)sizeof(struct regnode_charclass) )
3383 OP( convert ) = TRIEC;
3384 Copy(trie->bitmap, ((struct regnode_charclass *)convert)->bitmap, ANYOF_BITMAP_SIZE, char);
3385 PerlMemShared_free(trie->bitmap);
3388 OP( convert ) = TRIE;
3390 /* store the type in the flags */
3391 convert->flags = nodetype;
3395 + regarglen[ OP( convert ) ];
3397 /* XXX We really should free up the resource in trie now,
3398 as we won't use them - (which resources?) dmq */
3400 /* needed for dumping*/
3401 DEBUG_r(if (optimize) {
3402 regnode *opt = convert;
3404 while ( ++opt < optimize) {
3405 Set_Node_Offset_Length(opt,0,0);
3408 Try to clean up some of the debris left after the
3411 while( optimize < jumper ) {
3412 mjd_nodelen += Node_Length((optimize));
3413 OP( optimize ) = OPTIMIZED;
3414 Set_Node_Offset_Length(optimize,0,0);
3417 Set_Node_Offset_Length(convert,mjd_offset,mjd_nodelen);
3419 } /* end node insert */
3421 /* Finish populating the prev field of the wordinfo array. Walk back
3422 * from each accept state until we find another accept state, and if
3423 * so, point the first word's .prev field at the second word. If the
3424 * second already has a .prev field set, stop now. This will be the
3425 * case either if we've already processed that word's accept state,
3426 * or that state had multiple words, and the overspill words were
3427 * already linked up earlier.
3434 for (word=1; word <= trie->wordcount; word++) {
3436 if (trie->wordinfo[word].prev)
3438 state = trie->wordinfo[word].accept;
3440 state = prev_states[state];
3443 prev = trie->states[state].wordnum;
3447 trie->wordinfo[word].prev = prev;
3449 Safefree(prev_states);
3453 /* and now dump out the compressed format */
3454 DEBUG_TRIE_COMPILE_r(dump_trie(trie, widecharmap, revcharmap, depth+1));
3456 RExC_rxi->data->data[ data_slot + 1 ] = (void*)widecharmap;
3458 RExC_rxi->data->data[ data_slot + TRIE_WORDS_OFFSET ] = (void*)trie_words;
3459 RExC_rxi->data->data[ data_slot + 3 ] = (void*)revcharmap;
3461 SvREFCNT_dec_NN(revcharmap);
3465 : trie->startstate>1
3471 S_construct_ahocorasick_from_trie(pTHX_ RExC_state_t *pRExC_state, regnode *source, U32 depth)
3473 /* The Trie is constructed and compressed now so we can build a fail array if
3476 This is basically the Aho-Corasick algorithm. Its from exercise 3.31 and
3478 "Red Dragon" -- Compilers, principles, techniques, and tools. Aho, Sethi,
3482 We find the fail state for each state in the trie, this state is the longest
3483 proper suffix of the current state's 'word' that is also a proper prefix of
3484 another word in our trie. State 1 represents the word '' and is thus the
3485 default fail state. This allows the DFA not to have to restart after its
3486 tried and failed a word at a given point, it simply continues as though it
3487 had been matching the other word in the first place.
3489 'abcdgu'=~/abcdefg|cdgu/
3490 When we get to 'd' we are still matching the first word, we would encounter
3491 'g' which would fail, which would bring us to the state representing 'd' in
3492 the second word where we would try 'g' and succeed, proceeding to match
3495 /* add a fail transition */
3496 const U32 trie_offset = ARG(source);
3497 reg_trie_data *trie=(reg_trie_data *)RExC_rxi->data->data[trie_offset];
3499 const U32 ucharcount = trie->uniquecharcount;
3500 const U32 numstates = trie->statecount;
3501 const U32 ubound = trie->lasttrans + ucharcount;
3505 U32 base = trie->states[ 1 ].trans.base;
3508 const U32 data_slot = add_data( pRExC_state, STR_WITH_LEN("T"));
3510 GET_RE_DEBUG_FLAGS_DECL;
3512 PERL_ARGS_ASSERT_CONSTRUCT_AHOCORASICK_FROM_TRIE;
3513 PERL_UNUSED_CONTEXT;
3515 PERL_UNUSED_ARG(depth);
3518 if ( OP(source) == TRIE ) {
3519 struct regnode_1 *op = (struct regnode_1 *)
3520 PerlMemShared_calloc(1, sizeof(struct regnode_1));
3521 StructCopy(source,op,struct regnode_1);
3522 stclass = (regnode *)op;
3524 struct regnode_charclass *op = (struct regnode_charclass *)
3525 PerlMemShared_calloc(1, sizeof(struct regnode_charclass));
3526 StructCopy(source,op,struct regnode_charclass);
3527 stclass = (regnode *)op;
3529 OP(stclass)+=2; /* convert the TRIE type to its AHO-CORASICK equivalent */
3531 ARG_SET( stclass, data_slot );
3532 aho = (reg_ac_data *) PerlMemShared_calloc( 1, sizeof(reg_ac_data) );
3533 RExC_rxi->data->data[ data_slot ] = (void*)aho;
3534 aho->trie=trie_offset;
3535 aho->states=(reg_trie_state *)PerlMemShared_malloc( numstates * sizeof(reg_trie_state) );
3536 Copy( trie->states, aho->states, numstates, reg_trie_state );
3537 Newxz( q, numstates, U32);
3538 aho->fail = (U32 *) PerlMemShared_calloc( numstates, sizeof(U32) );
3541 /* initialize fail[0..1] to be 1 so that we always have
3542 a valid final fail state */
3543 fail[ 0 ] = fail[ 1 ] = 1;
3545 for ( charid = 0; charid < ucharcount ; charid++ ) {
3546 const U32 newstate = TRIE_TRANS_STATE( 1, base, ucharcount, charid, 0 );
3548 q[ q_write ] = newstate;
3549 /* set to point at the root */
3550 fail[ q[ q_write++ ] ]=1;
3553 while ( q_read < q_write) {
3554 const U32 cur = q[ q_read++ % numstates ];
3555 base = trie->states[ cur ].trans.base;
3557 for ( charid = 0 ; charid < ucharcount ; charid++ ) {
3558 const U32 ch_state = TRIE_TRANS_STATE( cur, base, ucharcount, charid, 1 );
3560 U32 fail_state = cur;
3563 fail_state = fail[ fail_state ];
3564 fail_base = aho->states[ fail_state ].trans.base;
3565 } while ( !TRIE_TRANS_STATE( fail_state, fail_base, ucharcount, charid, 1 ) );
3567 fail_state = TRIE_TRANS_STATE( fail_state, fail_base, ucharcount, charid, 1 );
3568 fail[ ch_state ] = fail_state;
3569 if ( !aho->states[ ch_state ].wordnum && aho->states[ fail_state ].wordnum )
3571 aho->states[ ch_state ].wordnum = aho->states[ fail_state ].wordnum;
3573 q[ q_write++ % numstates] = ch_state;
3577 /* restore fail[0..1] to 0 so that we "fall out" of the AC loop
3578 when we fail in state 1, this allows us to use the
3579 charclass scan to find a valid start char. This is based on the principle
3580 that theres a good chance the string being searched contains lots of stuff
3581 that cant be a start char.
3583 fail[ 0 ] = fail[ 1 ] = 0;
3584 DEBUG_TRIE_COMPILE_r({
3585 Perl_re_indentf( aTHX_ "Stclass Failtable (%"UVuf" states): 0",
3586 depth, (UV)numstates
3588 for( q_read=1; q_read<numstates; q_read++ ) {
3589 Perl_re_printf( aTHX_ ", %"UVuf, (UV)fail[q_read]);
3591 Perl_re_printf( aTHX_ "\n");
3594 /*RExC_seen |= REG_TRIEDFA_SEEN;*/
3599 #define DEBUG_PEEP(str,scan,depth) \
3600 DEBUG_OPTIMISE_r({if (scan){ \
3601 regnode *Next = regnext(scan); \
3602 regprop(RExC_rx, RExC_mysv, scan, NULL, pRExC_state);\
3603 Perl_re_indentf( aTHX_ "" str ">%3d: %s (%d)", \
3604 depth, REG_NODE_NUM(scan), SvPV_nolen_const(RExC_mysv),\
3605 Next ? (REG_NODE_NUM(Next)) : 0 );\
3606 DEBUG_SHOW_STUDY_FLAGS(flags," [ ","]");\
3607 Perl_re_printf( aTHX_ "\n"); \
3610 /* The below joins as many adjacent EXACTish nodes as possible into a single
3611 * one. The regop may be changed if the node(s) contain certain sequences that
3612 * require special handling. The joining is only done if:
3613 * 1) there is room in the current conglomerated node to entirely contain the
3615 * 2) they are the exact same node type
3617 * The adjacent nodes actually may be separated by NOTHING-kind nodes, and
3618 * these get optimized out
3620 * XXX khw thinks this should be enhanced to fill EXACT (at least) nodes as full
3621 * as possible, even if that means splitting an existing node so that its first
3622 * part is moved to the preceeding node. This would maximise the efficiency of
3623 * memEQ during matching. Elsewhere in this file, khw proposes splitting
3624 * EXACTFish nodes into portions that don't change under folding vs those that
3625 * do. Those portions that don't change may be the only things in the pattern that
3626 * could be used to find fixed and floating strings.
3628 * If a node is to match under /i (folded), the number of characters it matches
3629 * can be different than its character length if it contains a multi-character
3630 * fold. *min_subtract is set to the total delta number of characters of the
3633 * And *unfolded_multi_char is set to indicate whether or not the node contains
3634 * an unfolded multi-char fold. This happens when whether the fold is valid or
3635 * not won't be known until runtime; namely for EXACTF nodes that contain LATIN
3636 * SMALL LETTER SHARP S, as only if the target string being matched against
3637 * turns out to be UTF-8 is that fold valid; and also for EXACTFL nodes whose
3638 * folding rules depend on the locale in force at runtime. (Multi-char folds
3639 * whose components are all above the Latin1 range are not run-time locale
3640 * dependent, and have already been folded by the time this function is
3643 * This is as good a place as any to discuss the design of handling these
3644 * multi-character fold sequences. It's been wrong in Perl for a very long
3645 * time. There are three code points in Unicode whose multi-character folds
3646 * were long ago discovered to mess things up. The previous designs for
3647 * dealing with these involved assigning a special node for them. This
3648 * approach doesn't always work, as evidenced by this example:
3649 * "\xDFs" =~ /s\xDF/ui # Used to fail before these patches
3650 * Both sides fold to "sss", but if the pattern is parsed to create a node that
3651 * would match just the \xDF, it won't be able to handle the case where a
3652 * successful match would have to cross the node's boundary. The new approach
3653 * that hopefully generally solves the problem generates an EXACTFU_SS node
3654 * that is "sss" in this case.
3656 * It turns out that there are problems with all multi-character folds, and not
3657 * just these three. Now the code is general, for all such cases. The
3658 * approach taken is:
3659 * 1) This routine examines each EXACTFish node that could contain multi-
3660 * character folded sequences. Since a single character can fold into
3661 * such a sequence, the minimum match length for this node is less than
3662 * the number of characters in the node. This routine returns in
3663 * *min_subtract how many characters to subtract from the the actual
3664 * length of the string to get a real minimum match length; it is 0 if
3665 * there are no multi-char foldeds. This delta is used by the caller to
3666 * adjust the min length of the match, and the delta between min and max,
3667 * so that the optimizer doesn't reject these possibilities based on size
3669 * 2) For the sequence involving the Sharp s (\xDF), the node type EXACTFU_SS
3670 * is used for an EXACTFU node that contains at least one "ss" sequence in
3671 * it. For non-UTF-8 patterns and strings, this is the only case where
3672 * there is a possible fold length change. That means that a regular
3673 * EXACTFU node without UTF-8 involvement doesn't have to concern itself
3674 * with length changes, and so can be processed faster. regexec.c takes
3675 * advantage of this. Generally, an EXACTFish node that is in UTF-8 is
3676 * pre-folded by regcomp.c (except EXACTFL, some of whose folds aren't
3677 * known until runtime). This saves effort in regex matching. However,
3678 * the pre-folding isn't done for non-UTF8 patterns because the fold of
3679 * the MICRO SIGN requires UTF-8, and we don't want to slow things down by
3680 * forcing the pattern into UTF8 unless necessary. Also what EXACTF (and,
3681 * again, EXACTFL) nodes fold to isn't known until runtime. The fold
3682 * possibilities for the non-UTF8 patterns are quite simple, except for
3683 * the sharp s. All the ones that don't involve a UTF-8 target string are
3684 * members of a fold-pair, and arrays are set up for all of them so that
3685 * the other member of the pair can be found quickly. Code elsewhere in
3686 * this file makes sure that in EXACTFU nodes, the sharp s gets folded to
3687 * 'ss', even if the pattern isn't UTF-8. This avoids the issues
3688 * described in the next item.
3689 * 3) A problem remains for unfolded multi-char folds. (These occur when the
3690 * validity of the fold won't be known until runtime, and so must remain
3691 * unfolded for now. This happens for the sharp s in EXACTF and EXACTFA
3692 * nodes when the pattern isn't in UTF-8. (Note, BTW, that there cannot
3693 * be an EXACTF node with a UTF-8 pattern.) They also occur for various
3694 * folds in EXACTFL nodes, regardless of the UTF-ness of the pattern.)
3695 * The reason this is a problem is that the optimizer part of regexec.c
3696 * (probably unwittingly, in Perl_regexec_flags()) makes an assumption
3697 * that a character in the pattern corresponds to at most a single
3698 * character in the target string. (And I do mean character, and not byte
3699 * here, unlike other parts of the documentation that have never been
3700 * updated to account for multibyte Unicode.) sharp s in EXACTF and
3701 * EXACTFL nodes can match the two character string 'ss'; in EXACTFA nodes
3702 * it can match "\x{17F}\x{17F}". These, along with other ones in EXACTFL
3703 * nodes, violate the assumption, and they are the only instances where it
3704 * is violated. I'm reluctant to try to change the assumption, as the
3705 * code involved is impenetrable to me (khw), so instead the code here
3706 * punts. This routine examines EXACTFL nodes, and (when the pattern
3707 * isn't UTF-8) EXACTF and EXACTFA for such unfolded folds, and returns a
3708 * boolean indicating whether or not the node contains such a fold. When
3709 * it is true, the caller sets a flag that later causes the optimizer in
3710 * this file to not set values for the floating and fixed string lengths,
3711 * and thus avoids the optimizer code in regexec.c that makes the invalid
3712 * assumption. Thus, there is no optimization based on string lengths for
3713 * EXACTFL nodes that contain these few folds, nor for non-UTF8-pattern
3714 * EXACTF and EXACTFA nodes that contain the sharp s. (The reason the
3715 * assumption is wrong only in these cases is that all other non-UTF-8
3716 * folds are 1-1; and, for UTF-8 patterns, we pre-fold all other folds to
3717 * their expanded versions. (Again, we can't prefold sharp s to 'ss' in
3718 * EXACTF nodes because we don't know at compile time if it actually
3719 * matches 'ss' or not. For EXACTF nodes it will match iff the target
3720 * string is in UTF-8. This is in contrast to EXACTFU nodes, where it
3721 * always matches; and EXACTFA where it never does. In an EXACTFA node in
3722 * a UTF-8 pattern, sharp s is folded to "\x{17F}\x{17F}, avoiding the
3723 * problem; but in a non-UTF8 pattern, folding it to that above-Latin1
3724 * string would require the pattern to be forced into UTF-8, the overhead
3725 * of which we want to avoid. Similarly the unfolded multi-char folds in
3726 * EXACTFL nodes will match iff the locale at the time of match is a UTF-8
3729 * Similarly, the code that generates tries doesn't currently handle
3730 * not-already-folded multi-char folds, and it looks like a pain to change
3731 * that. Therefore, trie generation of EXACTFA nodes with the sharp s
3732 * doesn't work. Instead, such an EXACTFA is turned into a new regnode,
3733 * EXACTFA_NO_TRIE, which the trie code knows not to handle. Most people
3734 * using /iaa matching will be doing so almost entirely with ASCII
3735 * strings, so this should rarely be encountered in practice */
3737 #define JOIN_EXACT(scan,min_subtract,unfolded_multi_char, flags) \
3738 if (PL_regkind[OP(scan)] == EXACT) \
3739 join_exact(pRExC_state,(scan),(min_subtract),unfolded_multi_char, (flags),NULL,depth+1)
3742 S_join_exact(pTHX_ RExC_state_t *pRExC_state, regnode *scan,
3743 UV *min_subtract, bool *unfolded_multi_char,
3744 U32 flags,regnode *val, U32 depth)
3746 /* Merge several consecutive EXACTish nodes into one. */
3747 regnode *n = regnext(scan);
3749 regnode *next = scan + NODE_SZ_STR(scan);
3753 regnode *stop = scan;
3754 GET_RE_DEBUG_FLAGS_DECL;
3756 PERL_UNUSED_ARG(depth);
3759 PERL_ARGS_ASSERT_JOIN_EXACT;
3760 #ifndef EXPERIMENTAL_INPLACESCAN
3761 PERL_UNUSED_ARG(flags);
3762 PERL_UNUSED_ARG(val);
3764 DEBUG_PEEP("join",scan,depth);
3766 /* Look through the subsequent nodes in the chain. Skip NOTHING, merge
3767 * EXACT ones that are mergeable to the current one. */
3769 && (PL_regkind[OP(n)] == NOTHING
3770 || (stringok && OP(n) == OP(scan)))
3772 && NEXT_OFF(scan) + NEXT_OFF(n) < I16_MAX)
3775 if (OP(n) == TAIL || n > next)
3777 if (PL_regkind[OP(n)] == NOTHING) {
3778 DEBUG_PEEP("skip:",n,depth);
3779 NEXT_OFF(scan) += NEXT_OFF(n);
3780 next = n + NODE_STEP_REGNODE;
3787 else if (stringok) {
3788 const unsigned int oldl = STR_LEN(scan);
3789 regnode * const nnext = regnext(n);
3791 /* XXX I (khw) kind of doubt that this works on platforms (should
3792 * Perl ever run on one) where U8_MAX is above 255 because of lots
3793 * of other assumptions */
3794 /* Don't join if the sum can't fit into a single node */
3795 if (oldl + STR_LEN(n) > U8_MAX)
3798 DEBUG_PEEP("merg",n,depth);
3801 NEXT_OFF(scan) += NEXT_OFF(n);
3802 STR_LEN(scan) += STR_LEN(n);
3803 next = n + NODE_SZ_STR(n);
3804 /* Now we can overwrite *n : */
3805 Move(STRING(n), STRING(scan) + oldl, STR_LEN(n), char);
3813 #ifdef EXPERIMENTAL_INPLACESCAN
3814 if (flags && !NEXT_OFF(n)) {
3815 DEBUG_PEEP("atch", val, depth);
3816 if (reg_off_by_arg[OP(n)]) {
3817 ARG_SET(n, val - n);
3820 NEXT_OFF(n) = val - n;
3828 *unfolded_multi_char = FALSE;
3830 /* Here, all the adjacent mergeable EXACTish nodes have been merged. We
3831 * can now analyze for sequences of problematic code points. (Prior to
3832 * this final joining, sequences could have been split over boundaries, and
3833 * hence missed). The sequences only happen in folding, hence for any
3834 * non-EXACT EXACTish node */
3835 if (OP(scan) != EXACT && OP(scan) != EXACTL) {
3836 U8* s0 = (U8*) STRING(scan);
3838 U8* s_end = s0 + STR_LEN(scan);
3840 int total_count_delta = 0; /* Total delta number of characters that
3841 multi-char folds expand to */
3843 /* One pass is made over the node's string looking for all the
3844 * possibilities. To avoid some tests in the loop, there are two main
3845 * cases, for UTF-8 patterns (which can't have EXACTF nodes) and
3850 if (OP(scan) == EXACTFL) {
3853 /* An EXACTFL node would already have been changed to another
3854 * node type unless there is at least one character in it that
3855 * is problematic; likely a character whose fold definition
3856 * won't be known until runtime, and so has yet to be folded.
3857 * For all but the UTF-8 locale, folds are 1-1 in length, but
3858 * to handle the UTF-8 case, we need to create a temporary
3859 * folded copy using UTF-8 locale rules in order to analyze it.
3860 * This is because our macros that look to see if a sequence is
3861 * a multi-char fold assume everything is folded (otherwise the
3862 * tests in those macros would be too complicated and slow).
3863 * Note that here, the non-problematic folds will have already
3864 * been done, so we can just copy such characters. We actually
3865 * don't completely fold the EXACTFL string. We skip the
3866 * unfolded multi-char folds, as that would just create work
3867 * below to figure out the size they already are */
3869 Newx(folded, UTF8_MAX_FOLD_CHAR_EXPAND * STR_LEN(scan) + 1, U8);
3872 STRLEN s_len = UTF8SKIP(s);
3873 if (! is_PROBLEMATIC_LOCALE_FOLD_utf8(s)) {
3874 Copy(s, d, s_len, U8);
3877 else if (is_FOLDS_TO_MULTI_utf8(s)) {
3878 *unfolded_multi_char = TRUE;
3879 Copy(s, d, s_len, U8);
3882 else if (isASCII(*s)) {
3883 *(d++) = toFOLD(*s);
3887 _to_utf8_fold_flags(s, d, &len, FOLD_FLAGS_FULL);
3893 /* Point the remainder of the routine to look at our temporary
3897 } /* End of creating folded copy of EXACTFL string */
3899 /* Examine the string for a multi-character fold sequence. UTF-8
3900 * patterns have all characters pre-folded by the time this code is
3902 while (s < s_end - 1) /* Can stop 1 before the end, as minimum
3903 length sequence we are looking for is 2 */
3905 int count = 0; /* How many characters in a multi-char fold */
3906 int len = is_MULTI_CHAR_FOLD_utf8_safe(s, s_end);
3907 if (! len) { /* Not a multi-char fold: get next char */
3912 /* Nodes with 'ss' require special handling, except for
3913 * EXACTFA-ish for which there is no multi-char fold to this */
3914 if (len == 2 && *s == 's' && *(s+1) == 's'
3915 && OP(scan) != EXACTFA
3916 && OP(scan) != EXACTFA_NO_TRIE)
3919 if (OP(scan) != EXACTFL) {
3920 OP(scan) = EXACTFU_SS;
3924 else { /* Here is a generic multi-char fold. */
3925 U8* multi_end = s + len;
3927 /* Count how many characters are in it. In the case of
3928 * /aa, no folds which contain ASCII code points are
3929 * allowed, so check for those, and skip if found. */
3930 if (OP(scan) != EXACTFA && OP(scan) != EXACTFA_NO_TRIE) {
3931 count = utf8_length(s, multi_end);
3935 while (s < multi_end) {
3938 goto next_iteration;
3948 /* The delta is how long the sequence is minus 1 (1 is how long
3949 * the character that folds to the sequence is) */
3950 total_count_delta += count - 1;
3954 /* We created a temporary folded copy of the string in EXACTFL
3955 * nodes. Therefore we need to be sure it doesn't go below zero,
3956 * as the real string could be shorter */
3957 if (OP(scan) == EXACTFL) {
3958 int total_chars = utf8_length((U8*) STRING(scan),
3959 (U8*) STRING(scan) + STR_LEN(scan));
3960 if (total_count_delta > total_chars) {
3961 total_count_delta = total_chars;
3965 *min_subtract += total_count_delta;
3968 else if (OP(scan) == EXACTFA) {
3970 /* Non-UTF-8 pattern, EXACTFA node. There can't be a multi-char
3971 * fold to the ASCII range (and there are no existing ones in the
3972 * upper latin1 range). But, as outlined in the comments preceding
3973 * this function, we need to flag any occurrences of the sharp s.
3974 * This character forbids trie formation (because of added
3976 #if UNICODE_MAJOR_VERSION > 3 /* no multifolds in early Unicode */ \
3977 || (UNICODE_MAJOR_VERSION == 3 && ( UNICODE_DOT_VERSION > 0) \
3978 || UNICODE_DOT_DOT_VERSION > 0)
3980 if (*s == LATIN_SMALL_LETTER_SHARP_S) {
3981 OP(scan) = EXACTFA_NO_TRIE;
3982 *unfolded_multi_char = TRUE;
3990 /* Non-UTF-8 pattern, not EXACTFA node. Look for the multi-char
3991 * folds that are all Latin1. As explained in the comments
3992 * preceding this function, we look also for the sharp s in EXACTF
3993 * and EXACTFL nodes; it can be in the final position. Otherwise
3994 * we can stop looking 1 byte earlier because have to find at least
3995 * two characters for a multi-fold */
3996 const U8* upper = (OP(scan) == EXACTF || OP(scan) == EXACTFL)
4001 int len = is_MULTI_CHAR_FOLD_latin1_safe(s, s_end);
4002 if (! len) { /* Not a multi-char fold. */
4003 if (*s == LATIN_SMALL_LETTER_SHARP_S
4004 && (OP(scan) == EXACTF || OP(scan) == EXACTFL))
4006 *unfolded_multi_char = TRUE;
4013 && isALPHA_FOLD_EQ(*s, 's')
4014 && isALPHA_FOLD_EQ(*(s+1), 's'))
4017 /* EXACTF nodes need to know that the minimum length
4018 * changed so that a sharp s in the string can match this
4019 * ss in the pattern, but they remain EXACTF nodes, as they
4020 * won't match this unless the target string is is UTF-8,
4021 * which we don't know until runtime. EXACTFL nodes can't
4022 * transform into EXACTFU nodes */
4023 if (OP(scan) != EXACTF && OP(scan) != EXACTFL) {
4024 OP(scan) = EXACTFU_SS;
4028 *min_subtract += len - 1;
4036 /* Allow dumping but overwriting the collection of skipped
4037 * ops and/or strings with fake optimized ops */
4038 n = scan + NODE_SZ_STR(scan);
4046 DEBUG_OPTIMISE_r(if (merged){DEBUG_PEEP("finl",scan,depth)});
4050 /* REx optimizer. Converts nodes into quicker variants "in place".
4051 Finds fixed substrings. */
4053 /* Stops at toplevel WHILEM as well as at "last". At end *scanp is set
4054 to the position after last scanned or to NULL. */
4056 #define INIT_AND_WITHP \
4057 assert(!and_withp); \
4058 Newx(and_withp,1, regnode_ssc); \
4059 SAVEFREEPV(and_withp)
4063 S_unwind_scan_frames(pTHX_ const void *p)
4065 scan_frame *f= (scan_frame *)p;
4067 scan_frame *n= f->next_frame;
4075 S_study_chunk(pTHX_ RExC_state_t *pRExC_state, regnode **scanp,
4076 SSize_t *minlenp, SSize_t *deltap,
4081 regnode_ssc *and_withp,
4082 U32 flags, U32 depth)
4083 /* scanp: Start here (read-write). */
4084 /* deltap: Write maxlen-minlen here. */
4085 /* last: Stop before this one. */
4086 /* data: string data about the pattern */
4087 /* stopparen: treat close N as END */
4088 /* recursed: which subroutines have we recursed into */
4089 /* and_withp: Valid if flags & SCF_DO_STCLASS_OR */
4091 /* There must be at least this number of characters to match */
4094 regnode *scan = *scanp, *next;
4096 int is_inf = (flags & SCF_DO_SUBSTR) && (data->flags & SF_IS_INF);
4097 int is_inf_internal = 0; /* The studied chunk is infinite */
4098 I32 is_par = OP(scan) == OPEN ? ARG(scan) : 0;
4099 scan_data_t data_fake;
4100 SV *re_trie_maxbuff = NULL;
4101 regnode *first_non_open = scan;
4102 SSize_t stopmin = SSize_t_MAX;
4103 scan_frame *frame = NULL;
4104 GET_RE_DEBUG_FLAGS_DECL;
4106 PERL_ARGS_ASSERT_STUDY_CHUNK;
4107 RExC_study_started= 1;
4111 while (first_non_open && OP(first_non_open) == OPEN)
4112 first_non_open=regnext(first_non_open);
4118 RExC_study_chunk_recursed_count++;
4120 DEBUG_OPTIMISE_MORE_r(
4122 Perl_re_indentf( aTHX_ "study_chunk stopparen=%ld recursed_count=%lu depth=%lu recursed_depth=%lu scan=%p last=%p",
4123 depth, (long)stopparen,
4124 (unsigned long)RExC_study_chunk_recursed_count,
4125 (unsigned long)depth, (unsigned long)recursed_depth,
4128 if (recursed_depth) {
4131 for ( j = 0 ; j < recursed_depth ; j++ ) {
4132 for ( i = 0 ; i < (U32)RExC_npar ; i++ ) {
4134 PAREN_TEST(RExC_study_chunk_recursed +
4135 ( j * RExC_study_chunk_recursed_bytes), i )
4138 !PAREN_TEST(RExC_study_chunk_recursed +
4139 (( j - 1 ) * RExC_study_chunk_recursed_bytes), i)
4142 Perl_re_printf( aTHX_ " %d",(int)i);
4146 if ( j + 1 < recursed_depth ) {
4147 Perl_re_printf( aTHX_ ",");
4151 Perl_re_printf( aTHX_ "\n");
4154 while ( scan && OP(scan) != END && scan < last ){
4155 UV min_subtract = 0; /* How mmany chars to subtract from the minimum
4156 node length to get a real minimum (because
4157 the folded version may be shorter) */
4158 bool unfolded_multi_char = FALSE;
4159 /* Peephole optimizer: */
4160 DEBUG_STUDYDATA("Peep:", data, depth);
4161 DEBUG_PEEP("Peep", scan, depth);
4164 /* The reason we do this here is that we need to deal with things like
4165 * /(?:f)(?:o)(?:o)/ which cant be dealt with by the normal EXACT
4166 * parsing code, as each (?:..) is handled by a different invocation of
4169 JOIN_EXACT(scan,&min_subtract, &unfolded_multi_char, 0);
4171 /* Follow the next-chain of the current node and optimize
4172 away all the NOTHINGs from it. */
4173 if (OP(scan) != CURLYX) {
4174 const int max = (reg_off_by_arg[OP(scan)]
4176 /* I32 may be smaller than U16 on CRAYs! */
4177 : (I32_MAX < U16_MAX ? I32_MAX : U16_MAX));
4178 int off = (reg_off_by_arg[OP(scan)] ? ARG(scan) : NEXT_OFF(scan));
4182 /* Skip NOTHING and LONGJMP. */
4183 while ((n = regnext(n))
4184 && ((PL_regkind[OP(n)] == NOTHING && (noff = NEXT_OFF(n)))
4185 || ((OP(n) == LONGJMP) && (noff = ARG(n))))
4186 && off + noff < max)
4188 if (reg_off_by_arg[OP(scan)])
4191 NEXT_OFF(scan) = off;
4194 /* The principal pseudo-switch. Cannot be a switch, since we
4195 look into several different things. */
4196 if ( OP(scan) == DEFINEP ) {
4198 SSize_t deltanext = 0;
4199 SSize_t fake_last_close = 0;
4200 I32 f = SCF_IN_DEFINE;
4202 StructCopy(&zero_scan_data, &data_fake, scan_data_t);
4203 scan = regnext(scan);
4204 assert( OP(scan) == IFTHEN );
4205 DEBUG_PEEP("expect IFTHEN", scan, depth);
4207 data_fake.last_closep= &fake_last_close;
4209 next = regnext(scan);
4210 scan = NEXTOPER(NEXTOPER(scan));
4211 DEBUG_PEEP("scan", scan, depth);
4212 DEBUG_PEEP("next", next, depth);
4214 /* we suppose the run is continuous, last=next...
4215 * NOTE we dont use the return here! */
4216 (void)study_chunk(pRExC_state, &scan, &minlen,
4217 &deltanext, next, &data_fake, stopparen,
4218 recursed_depth, NULL, f, depth+1);
4223 OP(scan) == BRANCH ||
4224 OP(scan) == BRANCHJ ||
4227 next = regnext(scan);
4230 /* The op(next)==code check below is to see if we
4231 * have "BRANCH-BRANCH", "BRANCHJ-BRANCHJ", "IFTHEN-IFTHEN"
4232 * IFTHEN is special as it might not appear in pairs.
4233 * Not sure whether BRANCH-BRANCHJ is possible, regardless
4234 * we dont handle it cleanly. */
4235 if (OP(next) == code || code == IFTHEN) {
4236 /* NOTE - There is similar code to this block below for
4237 * handling TRIE nodes on a re-study. If you change stuff here
4238 * check there too. */
4239 SSize_t max1 = 0, min1 = SSize_t_MAX, num = 0;
4241 regnode * const startbranch=scan;
4243 if (flags & SCF_DO_SUBSTR) {
4244 /* Cannot merge strings after this. */
4245 scan_commit(pRExC_state, data, minlenp, is_inf);
4248 if (flags & SCF_DO_STCLASS)
4249 ssc_init_zero(pRExC_state, &accum);
4251 while (OP(scan) == code) {
4252 SSize_t deltanext, minnext, fake;
4254 regnode_ssc this_class;
4256 DEBUG_PEEP("Branch", scan, depth);
4259 StructCopy(&zero_scan_data, &data_fake, scan_data_t);
4261 data_fake.whilem_c = data->whilem_c;
4262 data_fake.last_closep = data->last_closep;
4265 data_fake.last_closep = &fake;
4267 data_fake.pos_delta = delta;
4268 next = regnext(scan);
4270 scan = NEXTOPER(scan); /* everything */
4271 if (code != BRANCH) /* everything but BRANCH */
4272 scan = NEXTOPER(scan);
4274 if (flags & SCF_DO_STCLASS) {
4275 ssc_init(pRExC_state, &this_class);
4276 data_fake.start_class = &this_class;
4277 f = SCF_DO_STCLASS_AND;
4279 if (flags & SCF_WHILEM_VISITED_POS)
4280 f |= SCF_WHILEM_VISITED_POS;
4282 /* we suppose the run is continuous, last=next...*/
4283 minnext = study_chunk(pRExC_state, &scan, minlenp,
4284 &deltanext, next, &data_fake, stopparen,
4285 recursed_depth, NULL, f,depth+1);
4289 if (deltanext == SSize_t_MAX) {
4290 is_inf = is_inf_internal = 1;
4292 } else if (max1 < minnext + deltanext)
4293 max1 = minnext + deltanext;
4295 if (data_fake.flags & (SF_HAS_PAR|SF_IN_PAR))
4297 if (data_fake.flags & SCF_SEEN_ACCEPT) {
4298 if ( stopmin > minnext)
4299 stopmin = min + min1;
4300 flags &= ~SCF_DO_SUBSTR;
4302 data->flags |= SCF_SEEN_ACCEPT;
4305 if (data_fake.flags & SF_HAS_EVAL)
4306 data->flags |= SF_HAS_EVAL;
4307 data->whilem_c = data_fake.whilem_c;
4309 if (flags & SCF_DO_STCLASS)
4310 ssc_or(pRExC_state, &accum, (regnode_charclass*)&this_class);
4312 if (code == IFTHEN && num < 2) /* Empty ELSE branch */
4314 if (flags & SCF_DO_SUBSTR) {
4315 data->pos_min += min1;
4316 if (data->pos_delta >= SSize_t_MAX - (max1 - min1))
4317 data->pos_delta = SSize_t_MAX;
4319 data->pos_delta += max1 - min1;
4320 if (max1 != min1 || is_inf)
4321 data->longest = &(data->longest_float);
4324 if (delta == SSize_t_MAX
4325 || SSize_t_MAX - delta - (max1 - min1) < 0)
4326 delta = SSize_t_MAX;
4328 delta += max1 - min1;
4329 if (flags & SCF_DO_STCLASS_OR) {
4330 ssc_or(pRExC_state, data->start_class, (regnode_charclass*) &accum);
4332 ssc_and(pRExC_state, data->start_class, (regnode_charclass *) and_withp);
4333 flags &= ~SCF_DO_STCLASS;
4336 else if (flags & SCF_DO_STCLASS_AND) {
4338 ssc_and(pRExC_state, data->start_class, (regnode_charclass *) &accum);
4339 flags &= ~SCF_DO_STCLASS;
4342 /* Switch to OR mode: cache the old value of
4343 * data->start_class */
4345 StructCopy(data->start_class, and_withp, regnode_ssc);
4346 flags &= ~SCF_DO_STCLASS_AND;
4347 StructCopy(&accum, data->start_class, regnode_ssc);
4348 flags |= SCF_DO_STCLASS_OR;
4352 if (PERL_ENABLE_TRIE_OPTIMISATION &&
4353 OP( startbranch ) == BRANCH )
4357 Assuming this was/is a branch we are dealing with: 'scan'
4358 now points at the item that follows the branch sequence,
4359 whatever it is. We now start at the beginning of the
4360 sequence and look for subsequences of
4366 which would be constructed from a pattern like
4369 If we can find such a subsequence we need to turn the first
4370 element into a trie and then add the subsequent branch exact
4371 strings to the trie.
4375 1. patterns where the whole set of branches can be
4378 2. patterns where only a subset can be converted.
4380 In case 1 we can replace the whole set with a single regop
4381 for the trie. In case 2 we need to keep the start and end
4384 'BRANCH EXACT; BRANCH EXACT; BRANCH X'
4385 becomes BRANCH TRIE; BRANCH X;
4387 There is an additional case, that being where there is a
4388 common prefix, which gets split out into an EXACT like node
4389 preceding the TRIE node.
4391 If x(1..n)==tail then we can do a simple trie, if not we make
4392 a "jump" trie, such that when we match the appropriate word
4393 we "jump" to the appropriate tail node. Essentially we turn
4394 a nested if into a case structure of sorts.
4399 if (!re_trie_maxbuff) {
4400 re_trie_maxbuff = get_sv(RE_TRIE_MAXBUF_NAME, 1);
4401 if (!SvIOK(re_trie_maxbuff))
4402 sv_setiv(re_trie_maxbuff, RE_TRIE_MAXBUF_INIT);
4404 if ( SvIV(re_trie_maxbuff)>=0 ) {
4406 regnode *first = (regnode *)NULL;
4407 regnode *last = (regnode *)NULL;
4408 regnode *tail = scan;
4412 /* var tail is used because there may be a TAIL
4413 regop in the way. Ie, the exacts will point to the
4414 thing following the TAIL, but the last branch will
4415 point at the TAIL. So we advance tail. If we
4416 have nested (?:) we may have to move through several
4420 while ( OP( tail ) == TAIL ) {
4421 /* this is the TAIL generated by (?:) */
4422 tail = regnext( tail );
4426 DEBUG_TRIE_COMPILE_r({
4427 regprop(RExC_rx, RExC_mysv, tail, NULL, pRExC_state);
4428 Perl_re_indentf( aTHX_ "%s %"UVuf":%s\n",
4430 "Looking for TRIE'able sequences. Tail node is ",
4431 (UV)(tail - RExC_emit_start),
4432 SvPV_nolen_const( RExC_mysv )
4438 Step through the branches
4439 cur represents each branch,
4440 noper is the first thing to be matched as part
4442 noper_next is the regnext() of that node.
4444 We normally handle a case like this
4445 /FOO[xyz]|BAR[pqr]/ via a "jump trie" but we also
4446 support building with NOJUMPTRIE, which restricts
4447 the trie logic to structures like /FOO|BAR/.
4449 If noper is a trieable nodetype then the branch is
4450 a possible optimization target. If we are building
4451 under NOJUMPTRIE then we require that noper_next is
4452 the same as scan (our current position in the regex
4455 Once we have two or more consecutive such branches
4456 we can create a trie of the EXACT's contents and
4457 stitch it in place into the program.
4459 If the sequence represents all of the branches in
4460 the alternation we replace the entire thing with a
4463 Otherwise when it is a subsequence we need to
4464 stitch it in place and replace only the relevant
4465 branches. This means the first branch has to remain
4466 as it is used by the alternation logic, and its
4467 next pointer, and needs to be repointed at the item
4468 on the branch chain following the last branch we
4469 have optimized away.
4471 This could be either a BRANCH, in which case the
4472 subsequence is internal, or it could be the item
4473 following the branch sequence in which case the
4474 subsequence is at the end (which does not
4475 necessarily mean the first node is the start of the
4478 TRIE_TYPE(X) is a define which maps the optype to a
4482 ----------------+-----------
4486 EXACTFU_SS | EXACTFU
4489 EXACTFLU8 | EXACTFLU8
4493 #define TRIE_TYPE(X) ( ( NOTHING == (X) ) \
4495 : ( EXACT == (X) ) \
4497 : ( EXACTFU == (X) || EXACTFU_SS == (X) ) \
4499 : ( EXACTFA == (X) ) \
4501 : ( EXACTL == (X) ) \
4503 : ( EXACTFLU8 == (X) ) \
4507 /* dont use tail as the end marker for this traverse */
4508 for ( cur = startbranch ; cur != scan ; cur = regnext( cur ) ) {
4509 regnode * const noper = NEXTOPER( cur );
4510 U8 noper_type = OP( noper );
4511 U8 noper_trietype = TRIE_TYPE( noper_type );
4512 #if defined(DEBUGGING) || defined(NOJUMPTRIE)
4513 regnode * const noper_next = regnext( noper );
4514 U8 noper_next_type = (noper_next && noper_next < tail) ? OP(noper_next) : 0;
4515 U8 noper_next_trietype = (noper_next && noper_next < tail) ? TRIE_TYPE( noper_next_type ) :0;
4518 DEBUG_TRIE_COMPILE_r({
4519 regprop(RExC_rx, RExC_mysv, cur, NULL, pRExC_state);
4520 Perl_re_indentf( aTHX_ "- %d:%s (%d)",
4522 REG_NODE_NUM(cur), SvPV_nolen_const( RExC_mysv ), REG_NODE_NUM(cur) );
4524 regprop(RExC_rx, RExC_mysv, noper, NULL, pRExC_state);
4525 Perl_re_printf( aTHX_ " -> %d:%s",
4526 REG_NODE_NUM(noper), SvPV_nolen_const(RExC_mysv));
4529 regprop(RExC_rx, RExC_mysv, noper_next, NULL, pRExC_state);
4530 Perl_re_printf( aTHX_ "\t=> %d:%s\t",
4531 REG_NODE_NUM(noper_next), SvPV_nolen_const(RExC_mysv));
4533 Perl_re_printf( aTHX_ "(First==%d,Last==%d,Cur==%d,tt==%s,ntt==%s,nntt==%s)\n",
4534 REG_NODE_NUM(first), REG_NODE_NUM(last), REG_NODE_NUM(cur),
4535 PL_reg_name[trietype], PL_reg_name[noper_trietype], PL_reg_name[noper_next_trietype]
4539 /* Is noper a trieable nodetype that can be merged
4540 * with the current trie (if there is one)? */
4544 ( noper_trietype == NOTHING )
4545 || ( trietype == NOTHING )
4546 || ( trietype == noper_trietype )
4549 && noper_next >= tail
4553 /* Handle mergable triable node Either we are
4554 * the first node in a new trieable sequence,
4555 * in which case we do some bookkeeping,
4556 * otherwise we update the end pointer. */
4559 if ( noper_trietype == NOTHING ) {
4560 #if !defined(DEBUGGING) && !defined(NOJUMPTRIE)
4561 regnode * const noper_next = regnext( noper );
4562 U8 noper_next_type = (noper_next && noper_next < tail) ? OP(noper_next) : 0;
4563 U8 noper_next_trietype = noper_next_type ? TRIE_TYPE( noper_next_type ) :0;
4566 if ( noper_next_trietype ) {
4567 trietype = noper_next_trietype;
4568 } else if (noper_next_type) {
4569 /* a NOTHING regop is 1 regop wide.
4570 * We need at least two for a trie
4571 * so we can't merge this in */
4575 trietype = noper_trietype;
4578 if ( trietype == NOTHING )
4579 trietype = noper_trietype;
4584 } /* end handle mergable triable node */
4586 /* handle unmergable node -
4587 * noper may either be a triable node which can
4588 * not be tried together with the current trie,
4589 * or a non triable node */
4591 /* If last is set and trietype is not
4592 * NOTHING then we have found at least two
4593 * triable branch sequences in a row of a
4594 * similar trietype so we can turn them
4595 * into a trie. If/when we allow NOTHING to
4596 * start a trie sequence this condition
4597 * will be required, and it isn't expensive
4598 * so we leave it in for now. */
4599 if ( trietype && trietype != NOTHING )
4600 make_trie( pRExC_state,
4601 startbranch, first, cur, tail,
4602 count, trietype, depth+1 );
4603 last = NULL; /* note: we clear/update
4604 first, trietype etc below,
4605 so we dont do it here */
4609 && noper_next >= tail
4612 /* noper is triable, so we can start a new
4616 trietype = noper_trietype;
4618 /* if we already saw a first but the
4619 * current node is not triable then we have
4620 * to reset the first information. */
4625 } /* end handle unmergable node */
4626 } /* loop over branches */
4627 DEBUG_TRIE_COMPILE_r({
4628 regprop(RExC_rx, RExC_mysv, cur, NULL, pRExC_state);
4629 Perl_re_indentf( aTHX_ "- %s (%d) <SCAN FINISHED> ",
4630 depth+1, SvPV_nolen_const( RExC_mysv ),REG_NODE_NUM(cur));
4631 Perl_re_printf( aTHX_ "(First==%d, Last==%d, Cur==%d, tt==%s)\n",
4632 REG_NODE_NUM(first), REG_NODE_NUM(last), REG_NODE_NUM(cur),
4633 PL_reg_name[trietype]
4637 if ( last && trietype ) {
4638 if ( trietype != NOTHING ) {
4639 /* the last branch of the sequence was part of
4640 * a trie, so we have to construct it here
4641 * outside of the loop */
4642 made= make_trie( pRExC_state, startbranch,
4643 first, scan, tail, count,
4644 trietype, depth+1 );
4645 #ifdef TRIE_STUDY_OPT
4646 if ( ((made == MADE_EXACT_TRIE &&
4647 startbranch == first)
4648 || ( first_non_open == first )) &&
4650 flags |= SCF_TRIE_RESTUDY;
4651 if ( startbranch == first
4654 RExC_seen &=~REG_TOP_LEVEL_BRANCHES_SEEN;
4659 /* at this point we know whatever we have is a
4660 * NOTHING sequence/branch AND if 'startbranch'
4661 * is 'first' then we can turn the whole thing
4664 if ( startbranch == first ) {
4666 /* the entire thing is a NOTHING sequence,
4667 * something like this: (?:|) So we can
4668 * turn it into a plain NOTHING op. */
4669 DEBUG_TRIE_COMPILE_r({
4670 regprop(RExC_rx, RExC_mysv, cur, NULL, pRExC_state);
4671 Perl_re_indentf( aTHX_ "- %s (%d) <NOTHING BRANCH SEQUENCE>\n",
4673 SvPV_nolen_const( RExC_mysv ),REG_NODE_NUM(cur));
4676 OP(startbranch)= NOTHING;
4677 NEXT_OFF(startbranch)= tail - startbranch;
4678 for ( opt= startbranch + 1; opt < tail ; opt++ )
4682 } /* end if ( last) */
4683 } /* TRIE_MAXBUF is non zero */
4688 else if ( code == BRANCHJ ) { /* single branch is optimized. */
4689 scan = NEXTOPER(NEXTOPER(scan));
4690 } else /* single branch is optimized. */
4691 scan = NEXTOPER(scan);
4693 } else if (OP(scan) == SUSPEND || OP(scan) == GOSUB) {
4695 regnode *start = NULL;
4696 regnode *end = NULL;
4697 U32 my_recursed_depth= recursed_depth;
4699 if (OP(scan) != SUSPEND) { /* GOSUB */
4700 /* Do setup, note this code has side effects beyond
4701 * the rest of this block. Specifically setting
4702 * RExC_recurse[] must happen at least once during
4705 RExC_recurse[ARG2L(scan)] = scan;
4706 start = RExC_open_parens[paren];
4707 end = RExC_close_parens[paren];
4709 /* NOTE we MUST always execute the above code, even
4710 * if we do nothing with a GOSUB */
4712 ( flags & SCF_IN_DEFINE )
4715 (is_inf_internal || is_inf || (data && data->flags & SF_IS_INF))
4717 ( (flags & (SCF_DO_STCLASS | SCF_DO_SUBSTR)) == 0 )
4720 /* no need to do anything here if we are in a define. */
4721 /* or we are after some kind of infinite construct
4722 * so we can skip recursing into this item.
4723 * Since it is infinite we will not change the maxlen
4724 * or delta, and if we miss something that might raise
4725 * the minlen it will merely pessimise a little.
4727 * Iow /(?(DEFINE)(?<foo>foo|food))a+(?&foo)/
4728 * might result in a minlen of 1 and not of 4,
4729 * but this doesn't make us mismatch, just try a bit
4730 * harder than we should.
4732 scan= regnext(scan);
4739 !PAREN_TEST(RExC_study_chunk_recursed + ((recursed_depth-1) * RExC_study_chunk_recursed_bytes), paren)
4741 /* it is quite possible that there are more efficient ways
4742 * to do this. We maintain a bitmap per level of recursion
4743 * of which patterns we have entered so we can detect if a
4744 * pattern creates a possible infinite loop. When we
4745 * recurse down a level we copy the previous levels bitmap
4746 * down. When we are at recursion level 0 we zero the top
4747 * level bitmap. It would be nice to implement a different
4748 * more efficient way of doing this. In particular the top
4749 * level bitmap may be unnecessary.
4751 if (!recursed_depth) {
4752 Zero(RExC_study_chunk_recursed, RExC_study_chunk_recursed_bytes, U8);
4754 Copy(RExC_study_chunk_recursed + ((recursed_depth-1) * RExC_study_chunk_recursed_bytes),
4755 RExC_study_chunk_recursed + (recursed_depth * RExC_study_chunk_recursed_bytes),
4756 RExC_study_chunk_recursed_bytes, U8);
4758 /* we havent recursed into this paren yet, so recurse into it */
4759 DEBUG_STUDYDATA("gosub-set:", data,depth);
4760 PAREN_SET(RExC_study_chunk_recursed + (recursed_depth * RExC_study_chunk_recursed_bytes), paren);
4761 my_recursed_depth= recursed_depth + 1;
4763 DEBUG_STUDYDATA("gosub-inf:", data,depth);
4764 /* some form of infinite recursion, assume infinite length
4766 if (flags & SCF_DO_SUBSTR) {
4767 scan_commit(pRExC_state, data, minlenp, is_inf);
4768 data->longest = &(data->longest_float);
4770 is_inf = is_inf_internal = 1;
4771 if (flags & SCF_DO_STCLASS_OR) /* Allow everything */
4772 ssc_anything(data->start_class);
4773 flags &= ~SCF_DO_STCLASS;
4775 start= NULL; /* reset start so we dont recurse later on. */
4780 end = regnext(scan);
4783 scan_frame *newframe;
4785 if (!RExC_frame_last) {
4786 Newxz(newframe, 1, scan_frame);
4787 SAVEDESTRUCTOR_X(S_unwind_scan_frames, newframe);
4788 RExC_frame_head= newframe;
4790 } else if (!RExC_frame_last->next_frame) {
4791 Newxz(newframe,1,scan_frame);
4792 RExC_frame_last->next_frame= newframe;
4793 newframe->prev_frame= RExC_frame_last;
4796 newframe= RExC_frame_last->next_frame;
4798 RExC_frame_last= newframe;
4800 newframe->next_regnode = regnext(scan);
4801 newframe->last_regnode = last;
4802 newframe->stopparen = stopparen;
4803 newframe->prev_recursed_depth = recursed_depth;
4804 newframe->this_prev_frame= frame;
4806 DEBUG_STUDYDATA("frame-new:",data,depth);
4807 DEBUG_PEEP("fnew", scan, depth);
4814 recursed_depth= my_recursed_depth;
4819 else if (OP(scan) == EXACT || OP(scan) == EXACTL) {
4820 SSize_t l = STR_LEN(scan);
4823 const U8 * const s = (U8*)STRING(scan);
4824 uc = utf8_to_uvchr_buf(s, s + l, NULL);
4825 l = utf8_length(s, s + l);
4827 uc = *((U8*)STRING(scan));
4830 if (flags & SCF_DO_SUBSTR) { /* Update longest substr. */
4831 /* The code below prefers earlier match for fixed
4832 offset, later match for variable offset. */
4833 if (data->last_end == -1) { /* Update the start info. */
4834 data->last_start_min = data->pos_min;
4835 data->last_start_max = is_inf
4836 ? SSize_t_MAX : data->pos_min + data->pos_delta;
4838 sv_catpvn(data->last_found, STRING(scan), STR_LEN(scan));
4840 SvUTF8_on(data->last_found);
4842 SV * const sv = data->last_found;
4843 MAGIC * const mg = SvUTF8(sv) && SvMAGICAL(sv) ?
4844 mg_find(sv, PERL_MAGIC_utf8) : NULL;
4845 if (mg && mg->mg_len >= 0)
4846 mg->mg_len += utf8_length((U8*)STRING(scan),
4847 (U8*)STRING(scan)+STR_LEN(scan));
4849 data->last_end = data->pos_min + l;
4850 data->pos_min += l; /* As in the first entry. */
4851 data->flags &= ~SF_BEFORE_EOL;
4854 /* ANDing the code point leaves at most it, and not in locale, and
4855 * can't match null string */
4856 if (flags & SCF_DO_STCLASS_AND) {
4857 ssc_cp_and(data->start_class, uc);
4858 ANYOF_FLAGS(data->start_class) &= ~SSC_MATCHES_EMPTY_STRING;
4859 ssc_clear_locale(data->start_class);
4861 else if (flags & SCF_DO_STCLASS_OR) {
4862 ssc_add_cp(data->start_class, uc);
4863 ssc_and(pRExC_state, data->start_class, (regnode_charclass *) and_withp);
4865 /* See commit msg 749e076fceedeb708a624933726e7989f2302f6a */
4866 ANYOF_FLAGS(data->start_class) &= ~SSC_MATCHES_EMPTY_STRING;
4868 flags &= ~SCF_DO_STCLASS;
4870 else if (PL_regkind[OP(scan)] == EXACT) {
4871 /* But OP != EXACT!, so is EXACTFish */
4872 SSize_t l = STR_LEN(scan);
4873 const U8 * s = (U8*)STRING(scan);
4875 /* Search for fixed substrings supports EXACT only. */
4876 if (flags & SCF_DO_SUBSTR) {
4878 scan_commit(pRExC_state, data, minlenp, is_inf);
4881 l = utf8_length(s, s + l);
4883 if (unfolded_multi_char) {
4884 RExC_seen |= REG_UNFOLDED_MULTI_SEEN;
4886 min += l - min_subtract;
4888 delta += min_subtract;
4889 if (flags & SCF_DO_SUBSTR) {
4890 data->pos_min += l - min_subtract;
4891 if (data->pos_min < 0) {
4894 data->pos_delta += min_subtract;
4896 data->longest = &(data->longest_float);
4900 if (flags & SCF_DO_STCLASS) {
4901 SV* EXACTF_invlist = _make_exactf_invlist(pRExC_state, scan);
4903 assert(EXACTF_invlist);
4904 if (flags & SCF_DO_STCLASS_AND) {
4905 if (OP(scan) != EXACTFL)
4906 ssc_clear_locale(data->start_class);
4907 ANYOF_FLAGS(data->start_class) &= ~SSC_MATCHES_EMPTY_STRING;
4908 ANYOF_POSIXL_ZERO(data->start_class);
4909 ssc_intersection(data->start_class, EXACTF_invlist, FALSE);
4911 else { /* SCF_DO_STCLASS_OR */
4912 ssc_union(data->start_class, EXACTF_invlist, FALSE);
4913 ssc_and(pRExC_state, data->start_class, (regnode_charclass *) and_withp);
4915 /* See commit msg 749e076fceedeb708a624933726e7989f2302f6a */
4916 ANYOF_FLAGS(data->start_class) &= ~SSC_MATCHES_EMPTY_STRING;
4918 flags &= ~SCF_DO_STCLASS;
4919 SvREFCNT_dec(EXACTF_invlist);
4922 else if (REGNODE_VARIES(OP(scan))) {
4923 SSize_t mincount, maxcount, minnext, deltanext, pos_before = 0;
4924 I32 fl = 0, f = flags;
4925 regnode * const oscan = scan;
4926 regnode_ssc this_class;
4927 regnode_ssc *oclass = NULL;
4928 I32 next_is_eval = 0;
4930 switch (PL_regkind[OP(scan)]) {
4931 case WHILEM: /* End of (?:...)* . */
4932 scan = NEXTOPER(scan);
4935 if (flags & (SCF_DO_SUBSTR | SCF_DO_STCLASS)) {
4936 next = NEXTOPER(scan);
4937 if (OP(next) == EXACT
4938 || OP(next) == EXACTL
4939 || (flags & SCF_DO_STCLASS))
4942 maxcount = REG_INFTY;
4943 next = regnext(scan);
4944 scan = NEXTOPER(scan);
4948 if (flags & SCF_DO_SUBSTR)
4953 if (flags & SCF_DO_STCLASS) {
4955 maxcount = REG_INFTY;
4956 next = regnext(scan);
4957 scan = NEXTOPER(scan);
4960 if (flags & SCF_DO_SUBSTR) {
4961 scan_commit(pRExC_state, data, minlenp, is_inf);
4962 /* Cannot extend fixed substrings */
4963 data->longest = &(data->longest_float);
4965 is_inf = is_inf_internal = 1;
4966 scan = regnext(scan);
4967 goto optimize_curly_tail;
4969 if (stopparen>0 && (OP(scan)==CURLYN || OP(scan)==CURLYM)
4970 && (scan->flags == stopparen))
4975 mincount = ARG1(scan);
4976 maxcount = ARG2(scan);
4978 next = regnext(scan);
4979 if (OP(scan) == CURLYX) {
4980 I32 lp = (data ? *(data->last_closep) : 0);
4981 scan->flags = ((lp <= (I32)U8_MAX) ? (U8)lp : U8_MAX);
4983 scan = NEXTOPER(scan) + EXTRA_STEP_2ARGS;
4984 next_is_eval = (OP(scan) == EVAL);
4986 if (flags & SCF_DO_SUBSTR) {
4988 scan_commit(pRExC_state, data, minlenp, is_inf);
4989 /* Cannot extend fixed substrings */
4990 pos_before = data->pos_min;
4994 data->flags &= ~(SF_HAS_PAR|SF_IN_PAR|SF_HAS_EVAL);
4996 data->flags |= SF_IS_INF;
4998 if (flags & SCF_DO_STCLASS) {
4999 ssc_init(pRExC_state, &this_class);
5000 oclass = data->start_class;
5001 data->start_class = &this_class;
5002 f |= SCF_DO_STCLASS_AND;
5003 f &= ~SCF_DO_STCLASS_OR;
5005 /* Exclude from super-linear cache processing any {n,m}
5006 regops for which the combination of input pos and regex
5007 pos is not enough information to determine if a match
5010 For example, in the regex /foo(bar\s*){4,8}baz/ with the
5011 regex pos at the \s*, the prospects for a match depend not
5012 only on the input position but also on how many (bar\s*)
5013 repeats into the {4,8} we are. */
5014 if ((mincount > 1) || (maxcount > 1 && maxcount != REG_INFTY))
5015 f &= ~SCF_WHILEM_VISITED_POS;
5017 /* This will finish on WHILEM, setting scan, or on NULL: */
5018 minnext = study_chunk(pRExC_state, &scan, minlenp, &deltanext,
5019 last, data, stopparen, recursed_depth, NULL,
5021 ? (f & ~SCF_DO_SUBSTR)
5025 if (flags & SCF_DO_STCLASS)
5026 data->start_class = oclass;
5027 if (mincount == 0 || minnext == 0) {
5028 if (flags & SCF_DO_STCLASS_OR) {
5029 ssc_or(pRExC_state, data->start_class, (regnode_charclass *) &this_class);
5031 else if (flags & SCF_DO_STCLASS_AND) {
5032 /* Switch to OR mode: cache the old value of
5033 * data->start_class */
5035 StructCopy(data->start_class, and_withp, regnode_ssc);
5036 flags &= ~SCF_DO_STCLASS_AND;
5037 StructCopy(&this_class, data->start_class, regnode_ssc);
5038 flags |= SCF_DO_STCLASS_OR;
5039 ANYOF_FLAGS(data->start_class)
5040 |= SSC_MATCHES_EMPTY_STRING;
5042 } else { /* Non-zero len */
5043 if (flags & SCF_DO_STCLASS_OR) {
5044 ssc_or(pRExC_state, data->start_class, (regnode_charclass *) &this_class);
5045 ssc_and(pRExC_state, data->start_class, (regnode_charclass *) and_withp);
5047 else if (flags & SCF_DO_STCLASS_AND)
5048 ssc_and(pRExC_state, data->start_class, (regnode_charclass *) &this_class);
5049 flags &= ~SCF_DO_STCLASS;
5051 if (!scan) /* It was not CURLYX, but CURLY. */
5053 if (!(flags & SCF_TRIE_DOING_RESTUDY)
5054 /* ? quantifier ok, except for (?{ ... }) */
5055 && (next_is_eval || !(mincount == 0 && maxcount == 1))
5056 && (minnext == 0) && (deltanext == 0)
5057 && data && !(data->flags & (SF_HAS_PAR|SF_IN_PAR))
5058 && maxcount <= REG_INFTY/3) /* Complement check for big
5061 /* Fatal warnings may leak the regexp without this: */
5062 SAVEFREESV(RExC_rx_sv);
5063 Perl_ck_warner(aTHX_ packWARN(WARN_REGEXP),
5064 "Quantifier unexpected on zero-length expression "
5065 "in regex m/%"UTF8f"/",
5066 UTF8fARG(UTF, RExC_precomp_end - RExC_precomp,
5068 (void)ReREFCNT_inc(RExC_rx_sv);
5071 min += minnext * mincount;
5072 is_inf_internal |= deltanext == SSize_t_MAX
5073 || (maxcount == REG_INFTY && minnext + deltanext > 0);
5074 is_inf |= is_inf_internal;
5076 delta = SSize_t_MAX;
5078 delta += (minnext + deltanext) * maxcount
5079 - minnext * mincount;
5081 /* Try powerful optimization CURLYX => CURLYN. */
5082 if ( OP(oscan) == CURLYX && data
5083 && data->flags & SF_IN_PAR
5084 && !(data->flags & SF_HAS_EVAL)
5085 && !deltanext && minnext == 1 ) {
5086 /* Try to optimize to CURLYN. */
5087 regnode *nxt = NEXTOPER(oscan) + EXTRA_STEP_2ARGS;
5088 regnode * const nxt1 = nxt;
5095 if (!REGNODE_SIMPLE(OP(nxt))
5096 && !(PL_regkind[OP(nxt)] == EXACT
5097 && STR_LEN(nxt) == 1))
5103 if (OP(nxt) != CLOSE)
5105 if (RExC_open_parens) {
5106 RExC_open_parens[ARG(nxt1)]=oscan; /*open->CURLYM*/
5107 RExC_close_parens[ARG(nxt1)]=nxt+2; /*close->while*/
5109 /* Now we know that nxt2 is the only contents: */
5110 oscan->flags = (U8)ARG(nxt);
5112 OP(nxt1) = NOTHING; /* was OPEN. */
5115 OP(nxt1 + 1) = OPTIMIZED; /* was count. */
5116 NEXT_OFF(nxt1+ 1) = 0; /* just for consistency. */
5117 NEXT_OFF(nxt2) = 0; /* just for consistency with CURLY. */
5118 OP(nxt) = OPTIMIZED; /* was CLOSE. */
5119 OP(nxt + 1) = OPTIMIZED; /* was count. */
5120 NEXT_OFF(nxt+ 1) = 0; /* just for consistency. */
5125 /* Try optimization CURLYX => CURLYM. */
5126 if ( OP(oscan) == CURLYX && data
5127 && !(data->flags & SF_HAS_PAR)
5128 && !(data->flags & SF_HAS_EVAL)
5129 && !deltanext /* atom is fixed width */
5130 && minnext != 0 /* CURLYM can't handle zero width */
5132 /* Nor characters whose fold at run-time may be
5133 * multi-character */
5134 && ! (RExC_seen & REG_UNFOLDED_MULTI_SEEN)
5136 /* XXXX How to optimize if data == 0? */
5137 /* Optimize to a simpler form. */
5138 regnode *nxt = NEXTOPER(oscan) + EXTRA_STEP_2ARGS; /* OPEN */
5142 while ( (nxt2 = regnext(nxt)) /* skip over embedded stuff*/
5143 && (OP(nxt2) != WHILEM))
5145 OP(nxt2) = SUCCEED; /* Whas WHILEM */
5146 /* Need to optimize away parenths. */
5147 if ((data->flags & SF_IN_PAR) && OP(nxt) == CLOSE) {
5148 /* Set the parenth number. */
5149 regnode *nxt1 = NEXTOPER(oscan) + EXTRA_STEP_2ARGS; /* OPEN*/
5151 oscan->flags = (U8)ARG(nxt);
5152 if (RExC_open_parens) {
5153 RExC_open_parens[ARG(nxt1)]=oscan; /*open->CURLYM*/
5154 RExC_close_parens[ARG(nxt1)]=nxt2+1; /*close->NOTHING*/
5156 OP(nxt1) = OPTIMIZED; /* was OPEN. */
5157 OP(nxt) = OPTIMIZED; /* was CLOSE. */
5160 OP(nxt1 + 1) = OPTIMIZED; /* was count. */
5161 OP(nxt + 1) = OPTIMIZED; /* was count. */
5162 NEXT_OFF(nxt1 + 1) = 0; /* just for consistency. */
5163 NEXT_OFF(nxt + 1) = 0; /* just for consistency. */
5166 while ( nxt1 && (OP(nxt1) != WHILEM)) {
5167 regnode *nnxt = regnext(nxt1);
5169 if (reg_off_by_arg[OP(nxt1)])
5170 ARG_SET(nxt1, nxt2 - nxt1);
5171 else if (nxt2 - nxt1 < U16_MAX)
5172 NEXT_OFF(nxt1) = nxt2 - nxt1;
5174 OP(nxt) = NOTHING; /* Cannot beautify */
5179 /* Optimize again: */
5180 study_chunk(pRExC_state, &nxt1, minlenp, &deltanext, nxt,
5181 NULL, stopparen, recursed_depth, NULL, 0,depth+1);
5186 else if ((OP(oscan) == CURLYX)
5187 && (flags & SCF_WHILEM_VISITED_POS)
5188 /* See the comment on a similar expression above.
5189 However, this time it's not a subexpression
5190 we care about, but the expression itself. */
5191 && (maxcount == REG_INFTY)
5192 && data && ++data->whilem_c < 16) {
5193 /* This stays as CURLYX, we can put the count/of pair. */
5194 /* Find WHILEM (as in regexec.c) */
5195 regnode *nxt = oscan + NEXT_OFF(oscan);
5197 if (OP(PREVOPER(nxt)) == NOTHING) /* LONGJMP */
5199 PREVOPER(nxt)->flags = (U8)(data->whilem_c
5200 | (RExC_whilem_seen << 4)); /* On WHILEM */
5202 if (data && fl & (SF_HAS_PAR|SF_IN_PAR))
5204 if (flags & SCF_DO_SUBSTR) {
5205 SV *last_str = NULL;
5206 STRLEN last_chrs = 0;
5207 int counted = mincount != 0;
5209 if (data->last_end > 0 && mincount != 0) { /* Ends with a
5211 SSize_t b = pos_before >= data->last_start_min
5212 ? pos_before : data->last_start_min;
5214 const char * const s = SvPV_const(data->last_found, l);
5215 SSize_t old = b - data->last_start_min;
5218 old = utf8_hop((U8*)s, old) - (U8*)s;
5220 /* Get the added string: */
5221 last_str = newSVpvn_utf8(s + old, l, UTF);
5222 last_chrs = UTF ? utf8_length((U8*)(s + old),
5223 (U8*)(s + old + l)) : l;
5224 if (deltanext == 0 && pos_before == b) {
5225 /* What was added is a constant string */
5228 SvGROW(last_str, (mincount * l) + 1);
5229 repeatcpy(SvPVX(last_str) + l,
5230 SvPVX_const(last_str), l,
5232 SvCUR_set(last_str, SvCUR(last_str) * mincount);
5233 /* Add additional parts. */
5234 SvCUR_set(data->last_found,
5235 SvCUR(data->last_found) - l);
5236 sv_catsv(data->last_found, last_str);
5238 SV * sv = data->last_found;
5240 SvUTF8(sv) && SvMAGICAL(sv) ?
5241 mg_find(sv, PERL_MAGIC_utf8) : NULL;
5242 if (mg && mg->mg_len >= 0)
5243 mg->mg_len += last_chrs * (mincount-1);
5245 last_chrs *= mincount;
5246 data->last_end += l * (mincount - 1);
5249 /* start offset must point into the last copy */
5250 data->last_start_min += minnext * (mincount - 1);
5251 data->last_start_max =
5254 : data->last_start_max +
5255 (maxcount - 1) * (minnext + data->pos_delta);
5258 /* It is counted once already... */
5259 data->pos_min += minnext * (mincount - counted);
5261 Perl_re_printf( aTHX_ "counted=%"UVuf" deltanext=%"UVuf
5262 " SSize_t_MAX=%"UVuf" minnext=%"UVuf
5263 " maxcount=%"UVuf" mincount=%"UVuf"\n",
5264 (UV)counted, (UV)deltanext, (UV)SSize_t_MAX, (UV)minnext, (UV)maxcount,
5266 if (deltanext != SSize_t_MAX)
5267 Perl_re_printf( aTHX_ "LHS=%"UVuf" RHS=%"UVuf"\n",
5268 (UV)(-counted * deltanext + (minnext + deltanext) * maxcount
5269 - minnext * mincount), (UV)(SSize_t_MAX - data->pos_delta));
5271 if (deltanext == SSize_t_MAX
5272 || -counted * deltanext + (minnext + deltanext) * maxcount - minnext * mincount >= SSize_t_MAX - data->pos_delta)
5273 data->pos_delta = SSize_t_MAX;
5275 data->pos_delta += - counted * deltanext +
5276 (minnext + deltanext) * maxcount - minnext * mincount;
5277 if (mincount != maxcount) {
5278 /* Cannot extend fixed substrings found inside
5280 scan_commit(pRExC_state, data, minlenp, is_inf);
5281 if (mincount && last_str) {
5282 SV * const sv = data->last_found;
5283 MAGIC * const mg = SvUTF8(sv) && SvMAGICAL(sv) ?
5284 mg_find(sv, PERL_MAGIC_utf8) : NULL;
5288 sv_setsv(sv, last_str);
5289 data->last_end = data->pos_min;
5290 data->last_start_min = data->pos_min - last_chrs;
5291 data->last_start_max = is_inf
5293 : data->pos_min + data->pos_delta - last_chrs;
5295 data->longest = &(data->longest_float);
5297 SvREFCNT_dec(last_str);
5299 if (data && (fl & SF_HAS_EVAL))
5300 data->flags |= SF_HAS_EVAL;
5301 optimize_curly_tail:
5302 if (OP(oscan) != CURLYX) {
5303 while (PL_regkind[OP(next = regnext(oscan))] == NOTHING
5305 NEXT_OFF(oscan) += NEXT_OFF(next);
5311 Perl_croak(aTHX_ "panic: unexpected varying REx opcode %d",
5316 if (flags & SCF_DO_SUBSTR) {
5317 /* Cannot expect anything... */
5318 scan_commit(pRExC_state, data, minlenp, is_inf);
5319 data->longest = &(data->longest_float);
5321 is_inf = is_inf_internal = 1;
5322 if (flags & SCF_DO_STCLASS_OR) {
5323 if (OP(scan) == CLUMP) {
5324 /* Actually is any start char, but very few code points
5325 * aren't start characters */
5326 ssc_match_all_cp(data->start_class);
5329 ssc_anything(data->start_class);
5332 flags &= ~SCF_DO_STCLASS;
5336 else if (OP(scan) == LNBREAK) {
5337 if (flags & SCF_DO_STCLASS) {
5338 if (flags & SCF_DO_STCLASS_AND) {
5339 ssc_intersection(data->start_class,
5340 PL_XPosix_ptrs[_CC_VERTSPACE], FALSE);
5341 ssc_clear_locale(data->start_class);
5342 ANYOF_FLAGS(data->start_class)
5343 &= ~SSC_MATCHES_EMPTY_STRING;
5345 else if (flags & SCF_DO_STCLASS_OR) {
5346 ssc_union(data->start_class,
5347 PL_XPosix_ptrs[_CC_VERTSPACE],
5349 ssc_and(pRExC_state, data->start_class, (regnode_charclass *) and_withp);
5351 /* See commit msg for
5352 * 749e076fceedeb708a624933726e7989f2302f6a */
5353 ANYOF_FLAGS(data->start_class)
5354 &= ~SSC_MATCHES_EMPTY_STRING;
5356 flags &= ~SCF_DO_STCLASS;
5359 if (delta != SSize_t_MAX)
5360 delta++; /* Because of the 2 char string cr-lf */
5361 if (flags & SCF_DO_SUBSTR) {
5362 /* Cannot expect anything... */
5363 scan_commit(pRExC_state, data, minlenp, is_inf);
5365 data->pos_delta += 1;
5366 data->longest = &(data->longest_float);
5369 else if (REGNODE_SIMPLE(OP(scan))) {
5371 if (flags & SCF_DO_SUBSTR) {
5372 scan_commit(pRExC_state, data, minlenp, is_inf);
5376 if (flags & SCF_DO_STCLASS) {
5378 SV* my_invlist = NULL;
5381 /* See commit msg 749e076fceedeb708a624933726e7989f2302f6a */
5382 ANYOF_FLAGS(data->start_class) &= ~SSC_MATCHES_EMPTY_STRING;
5384 /* Some of the logic below assumes that switching
5385 locale on will only add false positives. */
5390 Perl_croak(aTHX_ "panic: unexpected simple REx opcode %d",
5394 if (flags & SCF_DO_STCLASS_OR) /* Allow everything */
5395 ssc_match_all_cp(data->start_class);
5400 SV* REG_ANY_invlist = _new_invlist(2);
5401 REG_ANY_invlist = add_cp_to_invlist(REG_ANY_invlist,
5403 if (flags & SCF_DO_STCLASS_OR) {
5404 ssc_union(data->start_class,
5406 TRUE /* TRUE => invert, hence all but \n
5410 else if (flags & SCF_DO_STCLASS_AND) {
5411 ssc_intersection(data->start_class,
5413 TRUE /* TRUE => invert */
5415 ssc_clear_locale(data->start_class);
5417 SvREFCNT_dec_NN(REG_ANY_invlist);
5424 if (flags & SCF_DO_STCLASS_AND)
5425 ssc_and(pRExC_state, data->start_class,
5426 (regnode_charclass *) scan);
5428 ssc_or(pRExC_state, data->start_class,
5429 (regnode_charclass *) scan);
5437 namedclass = classnum_to_namedclass(FLAGS(scan)) + invert;
5438 if (flags & SCF_DO_STCLASS_AND) {
5439 bool was_there = cBOOL(
5440 ANYOF_POSIXL_TEST(data->start_class,
5442 ANYOF_POSIXL_ZERO(data->start_class);
5443 if (was_there) { /* Do an AND */
5444 ANYOF_POSIXL_SET(data->start_class, namedclass);
5446 /* No individual code points can now match */
5447 data->start_class->invlist
5448 = sv_2mortal(_new_invlist(0));
5451 int complement = namedclass + ((invert) ? -1 : 1);
5453 assert(flags & SCF_DO_STCLASS_OR);
5455 /* If the complement of this class was already there,
5456 * the result is that they match all code points,
5457 * (\d + \D == everything). Remove the classes from
5458 * future consideration. Locale is not relevant in
5460 if (ANYOF_POSIXL_TEST(data->start_class, complement)) {
5461 ssc_match_all_cp(data->start_class);
5462 ANYOF_POSIXL_CLEAR(data->start_class, namedclass);
5463 ANYOF_POSIXL_CLEAR(data->start_class, complement);
5465 else { /* The usual case; just add this class to the
5467 ANYOF_POSIXL_SET(data->start_class, namedclass);
5472 case NPOSIXA: /* For these, we always know the exact set of
5477 if (FLAGS(scan) == _CC_ASCII) {
5478 my_invlist = invlist_clone(PL_XPosix_ptrs[_CC_ASCII]);
5481 _invlist_intersection(PL_XPosix_ptrs[FLAGS(scan)],
5482 PL_XPosix_ptrs[_CC_ASCII],
5493 my_invlist = invlist_clone(PL_XPosix_ptrs[FLAGS(scan)]);
5495 /* NPOSIXD matches all upper Latin1 code points unless the
5496 * target string being matched is UTF-8, which is
5497 * unknowable until match time. Since we are going to
5498 * invert, we want to get rid of all of them so that the
5499 * inversion will match all */
5500 if (OP(scan) == NPOSIXD) {
5501 _invlist_subtract(my_invlist, PL_UpperLatin1,
5507 if (flags & SCF_DO_STCLASS_AND) {
5508 ssc_intersection(data->start_class, my_invlist, invert);
5509 ssc_clear_locale(data->start_class);
5512 assert(flags & SCF_DO_STCLASS_OR);
5513 ssc_union(data->start_class, my_invlist, invert);
5515 SvREFCNT_dec(my_invlist);
5517 if (flags & SCF_DO_STCLASS_OR)
5518 ssc_and(pRExC_state, data->start_class, (regnode_charclass *) and_withp);
5519 flags &= ~SCF_DO_STCLASS;
5522 else if (PL_regkind[OP(scan)] == EOL && flags & SCF_DO_SUBSTR) {
5523 data->flags |= (OP(scan) == MEOL
5526 scan_commit(pRExC_state, data, minlenp, is_inf);
5529 else if ( PL_regkind[OP(scan)] == BRANCHJ
5530 /* Lookbehind, or need to calculate parens/evals/stclass: */
5531 && (scan->flags || data || (flags & SCF_DO_STCLASS))
5532 && (OP(scan) == IFMATCH || OP(scan) == UNLESSM))
5534 if ( !PERL_ENABLE_POSITIVE_ASSERTION_STUDY
5535 || OP(scan) == UNLESSM )
5537 /* Negative Lookahead/lookbehind
5538 In this case we can't do fixed string optimisation.
5541 SSize_t deltanext, minnext, fake = 0;
5546 StructCopy(&zero_scan_data, &data_fake, scan_data_t);
5548 data_fake.whilem_c = data->whilem_c;
5549 data_fake.last_closep = data->last_closep;
5552 data_fake.last_closep = &fake;
5553 data_fake.pos_delta = delta;
5554 if ( flags & SCF_DO_STCLASS && !scan->flags
5555 && OP(scan) == IFMATCH ) { /* Lookahead */
5556 ssc_init(pRExC_state, &intrnl);
5557 data_fake.start_class = &intrnl;
5558 f |= SCF_DO_STCLASS_AND;
5560 if (flags & SCF_WHILEM_VISITED_POS)
5561 f |= SCF_WHILEM_VISITED_POS;
5562 next = regnext(scan);
5563 nscan = NEXTOPER(NEXTOPER(scan));
5564 minnext = study_chunk(pRExC_state, &nscan, minlenp, &deltanext,
5565 last, &data_fake, stopparen,
5566 recursed_depth, NULL, f, depth+1);
5569 FAIL("Variable length lookbehind not implemented");
5571 else if (minnext > (I32)U8_MAX) {
5572 FAIL2("Lookbehind longer than %"UVuf" not implemented",
5575 scan->flags = (U8)minnext;
5578 if (data_fake.flags & (SF_HAS_PAR|SF_IN_PAR))
5580 if (data_fake.flags & SF_HAS_EVAL)
5581 data->flags |= SF_HAS_EVAL;
5582 data->whilem_c = data_fake.whilem_c;
5584 if (f & SCF_DO_STCLASS_AND) {
5585 if (flags & SCF_DO_STCLASS_OR) {
5586 /* OR before, AND after: ideally we would recurse with
5587 * data_fake to get the AND applied by study of the
5588 * remainder of the pattern, and then derecurse;
5589 * *** HACK *** for now just treat as "no information".
5590 * See [perl #56690].
5592 ssc_init(pRExC_state, data->start_class);
5594 /* AND before and after: combine and continue. These
5595 * assertions are zero-length, so can match an EMPTY
5597 ssc_and(pRExC_state, data->start_class, (regnode_charclass *) &intrnl);
5598 ANYOF_FLAGS(data->start_class)
5599 |= SSC_MATCHES_EMPTY_STRING;
5603 #if PERL_ENABLE_POSITIVE_ASSERTION_STUDY
5605 /* Positive Lookahead/lookbehind
5606 In this case we can do fixed string optimisation,
5607 but we must be careful about it. Note in the case of
5608 lookbehind the positions will be offset by the minimum
5609 length of the pattern, something we won't know about
5610 until after the recurse.
5612 SSize_t deltanext, fake = 0;
5616 /* We use SAVEFREEPV so that when the full compile
5617 is finished perl will clean up the allocated
5618 minlens when it's all done. This way we don't
5619 have to worry about freeing them when we know
5620 they wont be used, which would be a pain.
5623 Newx( minnextp, 1, SSize_t );
5624 SAVEFREEPV(minnextp);
5627 StructCopy(data, &data_fake, scan_data_t);
5628 if ((flags & SCF_DO_SUBSTR) && data->last_found) {
5631 scan_commit(pRExC_state, &data_fake, minlenp, is_inf);
5632 data_fake.last_found=newSVsv(data->last_found);
5636 data_fake.last_closep = &fake;
5637 data_fake.flags = 0;
5638 data_fake.pos_delta = delta;
5640 data_fake.flags |= SF_IS_INF;
5641 if ( flags & SCF_DO_STCLASS && !scan->flags
5642 && OP(scan) == IFMATCH ) { /* Lookahead */
5643 ssc_init(pRExC_state, &intrnl);
5644 data_fake.start_class = &intrnl;
5645 f |= SCF_DO_STCLASS_AND;
5647 if (flags & SCF_WHILEM_VISITED_POS)
5648 f |= SCF_WHILEM_VISITED_POS;
5649 next = regnext(scan);
5650 nscan = NEXTOPER(NEXTOPER(scan));
5652 *minnextp = study_chunk(pRExC_state, &nscan, minnextp,
5653 &deltanext, last, &data_fake,
5654 stopparen, recursed_depth, NULL,
5658 FAIL("Variable length lookbehind not implemented");
5660 else if (*minnextp > (I32)U8_MAX) {
5661 FAIL2("Lookbehind longer than %"UVuf" not implemented",
5664 scan->flags = (U8)*minnextp;
5669 if (f & SCF_DO_STCLASS_AND) {
5670 ssc_and(pRExC_state, data->start_class, (regnode_charclass *) &intrnl);
5671 ANYOF_FLAGS(data->start_class) |= SSC_MATCHES_EMPTY_STRING;
5674 if (data_fake.flags & (SF_HAS_PAR|SF_IN_PAR))
5676 if (data_fake.flags & SF_HAS_EVAL)
5677 data->flags |= SF_HAS_EVAL;
5678 data->whilem_c = data_fake.whilem_c;
5679 if ((flags & SCF_DO_SUBSTR) && data_fake.last_found) {
5680 if (RExC_rx->minlen<*minnextp)
5681 RExC_rx->minlen=*minnextp;
5682 scan_commit(pRExC_state, &data_fake, minnextp, is_inf);
5683 SvREFCNT_dec_NN(data_fake.last_found);
5685 if ( data_fake.minlen_fixed != minlenp )
5687 data->offset_fixed= data_fake.offset_fixed;
5688 data->minlen_fixed= data_fake.minlen_fixed;
5689 data->lookbehind_fixed+= scan->flags;
5691 if ( data_fake.minlen_float != minlenp )
5693 data->minlen_float= data_fake.minlen_float;
5694 data->offset_float_min=data_fake.offset_float_min;
5695 data->offset_float_max=data_fake.offset_float_max;
5696 data->lookbehind_float+= scan->flags;
5703 else if (OP(scan) == OPEN) {
5704 if (stopparen != (I32)ARG(scan))
5707 else if (OP(scan) == CLOSE) {
5708 if (stopparen == (I32)ARG(scan)) {
5711 if ((I32)ARG(scan) == is_par) {
5712 next = regnext(scan);
5714 if ( next && (OP(next) != WHILEM) && next < last)
5715 is_par = 0; /* Disable optimization */
5718 *(data->last_closep) = ARG(scan);
5720 else if (OP(scan) == EVAL) {
5722 data->flags |= SF_HAS_EVAL;
5724 else if ( PL_regkind[OP(scan)] == ENDLIKE ) {
5725 if (flags & SCF_DO_SUBSTR) {
5726 scan_commit(pRExC_state, data, minlenp, is_inf);
5727 flags &= ~SCF_DO_SUBSTR;
5729 if (data && OP(scan)==ACCEPT) {
5730 data->flags |= SCF_SEEN_ACCEPT;
5735 else if (OP(scan) == LOGICAL && scan->flags == 2) /* Embedded follows */
5737 if (flags & SCF_DO_SUBSTR) {
5738 scan_commit(pRExC_state, data, minlenp, is_inf);
5739 data->longest = &(data->longest_float);
5741 is_inf = is_inf_internal = 1;
5742 if (flags & SCF_DO_STCLASS_OR) /* Allow everything */
5743 ssc_anything(data->start_class);
5744 flags &= ~SCF_DO_STCLASS;
5746 else if (OP(scan) == GPOS) {
5747 if (!(RExC_rx->intflags & PREGf_GPOS_FLOAT) &&
5748 !(delta || is_inf || (data && data->pos_delta)))
5750 if (!(RExC_rx->intflags & PREGf_ANCH) && (flags & SCF_DO_SUBSTR))
5751 RExC_rx->intflags |= PREGf_ANCH_GPOS;
5752 if (RExC_rx->gofs < (STRLEN)min)
5753 RExC_rx->gofs = min;
5755 RExC_rx->intflags |= PREGf_GPOS_FLOAT;
5759 #ifdef TRIE_STUDY_OPT
5760 #ifdef FULL_TRIE_STUDY
5761 else if (PL_regkind[OP(scan)] == TRIE) {
5762 /* NOTE - There is similar code to this block above for handling
5763 BRANCH nodes on the initial study. If you change stuff here
5765 regnode *trie_node= scan;
5766 regnode *tail= regnext(scan);
5767 reg_trie_data *trie = (reg_trie_data*)RExC_rxi->data->data[ ARG(scan) ];
5768 SSize_t max1 = 0, min1 = SSize_t_MAX;
5771 if (flags & SCF_DO_SUBSTR) { /* XXXX Add !SUSPEND? */
5772 /* Cannot merge strings after this. */
5773 scan_commit(pRExC_state, data, minlenp, is_inf);
5775 if (flags & SCF_DO_STCLASS)
5776 ssc_init_zero(pRExC_state, &accum);
5782 const regnode *nextbranch= NULL;
5785 for ( word=1 ; word <= trie->wordcount ; word++)
5787 SSize_t deltanext=0, minnext=0, f = 0, fake;
5788 regnode_ssc this_class;
5790 StructCopy(&zero_scan_data, &data_fake, scan_data_t);
5792 data_fake.whilem_c = data->whilem_c;
5793 data_fake.last_closep = data->last_closep;
5796 data_fake.last_closep = &fake;
5797 data_fake.pos_delta = delta;
5798 if (flags & SCF_DO_STCLASS) {
5799 ssc_init(pRExC_state, &this_class);
5800 data_fake.start_class = &this_class;
5801 f = SCF_DO_STCLASS_AND;
5803 if (flags & SCF_WHILEM_VISITED_POS)
5804 f |= SCF_WHILEM_VISITED_POS;
5806 if (trie->jump[word]) {
5808 nextbranch = trie_node + trie->jump[0];
5809 scan= trie_node + trie->jump[word];
5810 /* We go from the jump point to the branch that follows
5811 it. Note this means we need the vestigal unused
5812 branches even though they arent otherwise used. */
5813 minnext = study_chunk(pRExC_state, &scan, minlenp,
5814 &deltanext, (regnode *)nextbranch, &data_fake,
5815 stopparen, recursed_depth, NULL, f,depth+1);
5817 if (nextbranch && PL_regkind[OP(nextbranch)]==BRANCH)
5818 nextbranch= regnext((regnode*)nextbranch);
5820 if (min1 > (SSize_t)(minnext + trie->minlen))
5821 min1 = minnext + trie->minlen;
5822 if (deltanext == SSize_t_MAX) {
5823 is_inf = is_inf_internal = 1;
5825 } else if (max1 < (SSize_t)(minnext + deltanext + trie->maxlen))
5826 max1 = minnext + deltanext + trie->maxlen;
5828 if (data_fake.flags & (SF_HAS_PAR|SF_IN_PAR))
5830 if (data_fake.flags & SCF_SEEN_ACCEPT) {
5831 if ( stopmin > min + min1)
5832 stopmin = min + min1;
5833 flags &= ~SCF_DO_SUBSTR;
5835 data->flags |= SCF_SEEN_ACCEPT;
5838 if (data_fake.flags & SF_HAS_EVAL)
5839 data->flags |= SF_HAS_EVAL;
5840 data->whilem_c = data_fake.whilem_c;
5842 if (flags & SCF_DO_STCLASS)
5843 ssc_or(pRExC_state, &accum, (regnode_charclass *) &this_class);
5846 if (flags & SCF_DO_SUBSTR) {
5847 data->pos_min += min1;
5848 data->pos_delta += max1 - min1;
5849 if (max1 != min1 || is_inf)
5850 data->longest = &(data->longest_float);
5853 if (delta != SSize_t_MAX)
5854 delta += max1 - min1;
5855 if (flags & SCF_DO_STCLASS_OR) {
5856 ssc_or(pRExC_state, data->start_class, (regnode_charclass *) &accum);
5858 ssc_and(pRExC_state, data->start_class, (regnode_charclass *) and_withp);
5859 flags &= ~SCF_DO_STCLASS;
5862 else if (flags & SCF_DO_STCLASS_AND) {
5864 ssc_and(pRExC_state, data->start_class, (regnode_charclass *) &accum);
5865 flags &= ~SCF_DO_STCLASS;
5868 /* Switch to OR mode: cache the old value of
5869 * data->start_class */
5871 StructCopy(data->start_class, and_withp, regnode_ssc);
5872 flags &= ~SCF_DO_STCLASS_AND;
5873 StructCopy(&accum, data->start_class, regnode_ssc);
5874 flags |= SCF_DO_STCLASS_OR;
5881 else if (PL_regkind[OP(scan)] == TRIE) {
5882 reg_trie_data *trie = (reg_trie_data*)RExC_rxi->data->data[ ARG(scan) ];
5885 min += trie->minlen;
5886 delta += (trie->maxlen - trie->minlen);
5887 flags &= ~SCF_DO_STCLASS; /* xxx */
5888 if (flags & SCF_DO_SUBSTR) {
5889 /* Cannot expect anything... */
5890 scan_commit(pRExC_state, data, minlenp, is_inf);
5891 data->pos_min += trie->minlen;
5892 data->pos_delta += (trie->maxlen - trie->minlen);
5893 if (trie->maxlen != trie->minlen)
5894 data->longest = &(data->longest_float);
5896 if (trie->jump) /* no more substrings -- for now /grr*/
5897 flags &= ~SCF_DO_SUBSTR;
5899 #endif /* old or new */
5900 #endif /* TRIE_STUDY_OPT */
5902 /* Else: zero-length, ignore. */
5903 scan = regnext(scan);
5908 /* we need to unwind recursion. */
5911 DEBUG_STUDYDATA("frame-end:",data,depth);
5912 DEBUG_PEEP("fend", scan, depth);
5914 /* restore previous context */
5915 last = frame->last_regnode;
5916 scan = frame->next_regnode;
5917 stopparen = frame->stopparen;
5918 recursed_depth = frame->prev_recursed_depth;
5920 RExC_frame_last = frame->prev_frame;
5921 frame = frame->this_prev_frame;
5922 goto fake_study_recurse;
5926 DEBUG_STUDYDATA("pre-fin:",data,depth);
5929 *deltap = is_inf_internal ? SSize_t_MAX : delta;
5931 if (flags & SCF_DO_SUBSTR && is_inf)
5932 data->pos_delta = SSize_t_MAX - data->pos_min;
5933 if (is_par > (I32)U8_MAX)
5935 if (is_par && pars==1 && data) {
5936 data->flags |= SF_IN_PAR;
5937 data->flags &= ~SF_HAS_PAR;
5939 else if (pars && data) {
5940 data->flags |= SF_HAS_PAR;
5941 data->flags &= ~SF_IN_PAR;
5943 if (flags & SCF_DO_STCLASS_OR)
5944 ssc_and(pRExC_state, data->start_class, (regnode_charclass *) and_withp);
5945 if (flags & SCF_TRIE_RESTUDY)
5946 data->flags |= SCF_TRIE_RESTUDY;
5948 DEBUG_STUDYDATA("post-fin:",data,depth);
5951 SSize_t final_minlen= min < stopmin ? min : stopmin;
5953 if (!(RExC_seen & REG_UNBOUNDED_QUANTIFIER_SEEN)) {
5954 if (final_minlen > SSize_t_MAX - delta)
5955 RExC_maxlen = SSize_t_MAX;
5956 else if (RExC_maxlen < final_minlen + delta)
5957 RExC_maxlen = final_minlen + delta;
5959 return final_minlen;
5961 NOT_REACHED; /* NOTREACHED */
5965 S_add_data(RExC_state_t* const pRExC_state, const char* const s, const U32 n)
5967 U32 count = RExC_rxi->data ? RExC_rxi->data->count : 0;
5969 PERL_ARGS_ASSERT_ADD_DATA;
5971 Renewc(RExC_rxi->data,
5972 sizeof(*RExC_rxi->data) + sizeof(void*) * (count + n - 1),
5973 char, struct reg_data);
5975 Renew(RExC_rxi->data->what, count + n, U8);
5977 Newx(RExC_rxi->data->what, n, U8);
5978 RExC_rxi->data->count = count + n;
5979 Copy(s, RExC_rxi->data->what + count, n, U8);
5983 /*XXX: todo make this not included in a non debugging perl, but appears to be
5984 * used anyway there, in 'use re' */
5985 #ifndef PERL_IN_XSUB_RE
5987 Perl_reginitcolors(pTHX)
5989 const char * const s = PerlEnv_getenv("PERL_RE_COLORS");
5991 char *t = savepv(s);
5995 t = strchr(t, '\t');
6001 PL_colors[i] = t = (char *)"";
6006 PL_colors[i++] = (char *)"";
6013 #ifdef TRIE_STUDY_OPT
6014 #define CHECK_RESTUDY_GOTO_butfirst(dOsomething) \
6017 (data.flags & SCF_TRIE_RESTUDY) \
6025 #define CHECK_RESTUDY_GOTO_butfirst
6029 * pregcomp - compile a regular expression into internal code
6031 * Decides which engine's compiler to call based on the hint currently in
6035 #ifndef PERL_IN_XSUB_RE
6037 /* return the currently in-scope regex engine (or the default if none) */
6039 regexp_engine const *
6040 Perl_current_re_engine(pTHX)
6042 if (IN_PERL_COMPILETIME) {
6043 HV * const table = GvHV(PL_hintgv);
6046 if (!table || !(PL_hints & HINT_LOCALIZE_HH))
6047 return &PL_core_reg_engine;
6048 ptr = hv_fetchs(table, "regcomp", FALSE);
6049 if ( !(ptr && SvIOK(*ptr) && SvIV(*ptr)))
6050 return &PL_core_reg_engine;
6051 return INT2PTR(regexp_engine*,SvIV(*ptr));
6055 if (!PL_curcop->cop_hints_hash)
6056 return &PL_core_reg_engine;
6057 ptr = cop_hints_fetch_pvs(PL_curcop, "regcomp", 0);
6058 if ( !(ptr && SvIOK(ptr) && SvIV(ptr)))
6059 return &PL_core_reg_engine;
6060 return INT2PTR(regexp_engine*,SvIV(ptr));
6066 Perl_pregcomp(pTHX_ SV * const pattern, const U32 flags)
6068 regexp_engine const *eng = current_re_engine();
6069 GET_RE_DEBUG_FLAGS_DECL;
6071 PERL_ARGS_ASSERT_PREGCOMP;
6073 /* Dispatch a request to compile a regexp to correct regexp engine. */
6075 Perl_re_printf( aTHX_ "Using engine %"UVxf"\n",
6078 return CALLREGCOMP_ENG(eng, pattern, flags);
6082 /* public(ish) entry point for the perl core's own regex compiling code.
6083 * It's actually a wrapper for Perl_re_op_compile that only takes an SV
6084 * pattern rather than a list of OPs, and uses the internal engine rather
6085 * than the current one */
6088 Perl_re_compile(pTHX_ SV * const pattern, U32 rx_flags)
6090 SV *pat = pattern; /* defeat constness! */
6091 PERL_ARGS_ASSERT_RE_COMPILE;
6092 return Perl_re_op_compile(aTHX_ &pat, 1, NULL,
6093 #ifdef PERL_IN_XSUB_RE
6096 &PL_core_reg_engine,
6098 NULL, NULL, rx_flags, 0);
6102 /* upgrade pattern pat_p of length plen_p to UTF8, and if there are code
6103 * blocks, recalculate the indices. Update pat_p and plen_p in-place to
6104 * point to the realloced string and length.
6106 * This is essentially a copy of Perl_bytes_to_utf8() with the code index
6110 S_pat_upgrade_to_utf8(pTHX_ RExC_state_t * const pRExC_state,
6111 char **pat_p, STRLEN *plen_p, int num_code_blocks)
6113 U8 *const src = (U8*)*pat_p;
6118 GET_RE_DEBUG_FLAGS_DECL;
6120 DEBUG_PARSE_r(Perl_re_printf( aTHX_
6121 "UTF8 mismatch! Converting to utf8 for resizing and compile\n"));
6123 Newx(dst, *plen_p * 2 + 1, U8);
6126 while (s < *plen_p) {
6127 append_utf8_from_native_byte(src[s], &d);
6128 if (n < num_code_blocks) {
6129 if (!do_end && pRExC_state->code_blocks[n].start == s) {
6130 pRExC_state->code_blocks[n].start = d - dst - 1;
6131 assert(*(d - 1) == '(');
6134 else if (do_end && pRExC_state->code_blocks[n].end == s) {
6135 pRExC_state->code_blocks[n].end = d - dst - 1;
6136 assert(*(d - 1) == ')');
6145 *pat_p = (char*) dst;
6147 RExC_orig_utf8 = RExC_utf8 = 1;
6152 /* S_concat_pat(): concatenate a list of args to the pattern string pat,
6153 * while recording any code block indices, and handling overloading,
6154 * nested qr// objects etc. If pat is null, it will allocate a new
6155 * string, or just return the first arg, if there's only one.
6157 * Returns the malloced/updated pat.
6158 * patternp and pat_count is the array of SVs to be concatted;
6159 * oplist is the optional list of ops that generated the SVs;
6160 * recompile_p is a pointer to a boolean that will be set if
6161 * the regex will need to be recompiled.
6162 * delim, if non-null is an SV that will be inserted between each element
6166 S_concat_pat(pTHX_ RExC_state_t * const pRExC_state,
6167 SV *pat, SV ** const patternp, int pat_count,
6168 OP *oplist, bool *recompile_p, SV *delim)
6172 bool use_delim = FALSE;
6173 bool alloced = FALSE;
6175 /* if we know we have at least two args, create an empty string,
6176 * then concatenate args to that. For no args, return an empty string */
6177 if (!pat && pat_count != 1) {
6183 for (svp = patternp; svp < patternp + pat_count; svp++) {
6186 STRLEN orig_patlen = 0;
6188 SV *msv = use_delim ? delim : *svp;
6189 if (!msv) msv = &PL_sv_undef;
6191 /* if we've got a delimiter, we go round the loop twice for each
6192 * svp slot (except the last), using the delimiter the second
6201 if (SvTYPE(msv) == SVt_PVAV) {
6202 /* we've encountered an interpolated array within
6203 * the pattern, e.g. /...@a..../. Expand the list of elements,
6204 * then recursively append elements.
6205 * The code in this block is based on S_pushav() */
6207 AV *const av = (AV*)msv;
6208 const SSize_t maxarg = AvFILL(av) + 1;
6212 assert(oplist->op_type == OP_PADAV
6213 || oplist->op_type == OP_RV2AV);
6214 oplist = OpSIBLING(oplist);
6217 if (SvRMAGICAL(av)) {
6220 Newx(array, maxarg, SV*);
6222 for (i=0; i < maxarg; i++) {
6223 SV ** const svp = av_fetch(av, i, FALSE);
6224 array[i] = svp ? *svp : &PL_sv_undef;
6228 array = AvARRAY(av);
6230 pat = S_concat_pat(aTHX_ pRExC_state, pat,
6231 array, maxarg, NULL, recompile_p,
6233 GvSV((gv_fetchpvs("\"", GV_ADDMULTI, SVt_PV))));
6239 /* we make the assumption here that each op in the list of
6240 * op_siblings maps to one SV pushed onto the stack,
6241 * except for code blocks, with have both an OP_NULL and
6243 * This allows us to match up the list of SVs against the
6244 * list of OPs to find the next code block.
6246 * Note that PUSHMARK PADSV PADSV ..
6248 * PADRANGE PADSV PADSV ..
6249 * so the alignment still works. */
6252 if (oplist->op_type == OP_NULL
6253 && (oplist->op_flags & OPf_SPECIAL))
6255 assert(n < pRExC_state->num_code_blocks);
6256 pRExC_state->code_blocks[n].start = pat ? SvCUR(pat) : 0;
6257 pRExC_state->code_blocks[n].block = oplist;
6258 pRExC_state->code_blocks[n].src_regex = NULL;
6261 oplist = OpSIBLING(oplist); /* skip CONST */
6264 oplist = OpSIBLING(oplist);;
6267 /* apply magic and QR overloading to arg */
6270 if (SvROK(msv) && SvAMAGIC(msv)) {
6271 SV *sv = AMG_CALLunary(msv, regexp_amg);
6275 if (SvTYPE(sv) != SVt_REGEXP)
6276 Perl_croak(aTHX_ "Overloaded qr did not return a REGEXP");
6281 /* try concatenation overload ... */
6282 if (pat && (SvAMAGIC(pat) || SvAMAGIC(msv)) &&
6283 (sv = amagic_call(pat, msv, concat_amg, AMGf_assign)))
6286 /* overloading involved: all bets are off over literal
6287 * code. Pretend we haven't seen it */
6288 pRExC_state->num_code_blocks -= n;
6292 /* ... or failing that, try "" overload */
6293 while (SvAMAGIC(msv)
6294 && (sv = AMG_CALLunary(msv, string_amg))
6298 && SvRV(msv) == SvRV(sv))
6303 if (SvROK(msv) && SvTYPE(SvRV(msv)) == SVt_REGEXP)
6307 /* this is a partially unrolled
6308 * sv_catsv_nomg(pat, msv);
6309 * that allows us to adjust code block indices if
6312 char *dst = SvPV_force_nomg(pat, dlen);
6314 if (SvUTF8(msv) && !SvUTF8(pat)) {
6315 S_pat_upgrade_to_utf8(aTHX_ pRExC_state, &dst, &dlen, n);
6316 sv_setpvn(pat, dst, dlen);
6319 sv_catsv_nomg(pat, msv);
6326 pRExC_state->code_blocks[n-1].end = SvCUR(pat)-1;
6329 /* extract any code blocks within any embedded qr//'s */
6330 if (rx && SvTYPE(rx) == SVt_REGEXP
6331 && RX_ENGINE((REGEXP*)rx)->op_comp)
6334 RXi_GET_DECL(ReANY((REGEXP *)rx), ri);
6335 if (ri->num_code_blocks) {
6337 /* the presence of an embedded qr// with code means
6338 * we should always recompile: the text of the
6339 * qr// may not have changed, but it may be a
6340 * different closure than last time */
6342 Renew(pRExC_state->code_blocks,
6343 pRExC_state->num_code_blocks + ri->num_code_blocks,
6344 struct reg_code_block);
6345 pRExC_state->num_code_blocks += ri->num_code_blocks;
6347 for (i=0; i < ri->num_code_blocks; i++) {
6348 struct reg_code_block *src, *dst;
6349 STRLEN offset = orig_patlen
6350 + ReANY((REGEXP *)rx)->pre_prefix;
6351 assert(n < pRExC_state->num_code_blocks);
6352 src = &ri->code_blocks[i];
6353 dst = &pRExC_state->code_blocks[n];
6354 dst->start = src->start + offset;
6355 dst->end = src->end + offset;
6356 dst->block = src->block;
6357 dst->src_regex = (REGEXP*) SvREFCNT_inc( (SV*)
6366 /* avoid calling magic multiple times on a single element e.g. =~ $qr */
6375 /* see if there are any run-time code blocks in the pattern.
6376 * False positives are allowed */
6379 S_has_runtime_code(pTHX_ RExC_state_t * const pRExC_state,
6380 char *pat, STRLEN plen)
6385 PERL_UNUSED_CONTEXT;
6387 for (s = 0; s < plen; s++) {
6388 if (n < pRExC_state->num_code_blocks
6389 && s == pRExC_state->code_blocks[n].start)
6391 s = pRExC_state->code_blocks[n].end;
6395 /* TODO ideally should handle [..], (#..), /#.../x to reduce false
6397 if (pat[s] == '(' && s+2 <= plen && pat[s+1] == '?' &&
6399 || (s + 2 <= plen && pat[s+2] == '?' && pat[s+3] == '{'))
6406 /* Handle run-time code blocks. We will already have compiled any direct
6407 * or indirect literal code blocks. Now, take the pattern 'pat' and make a
6408 * copy of it, but with any literal code blocks blanked out and
6409 * appropriate chars escaped; then feed it into
6411 * eval "qr'modified_pattern'"
6415 * a\bc(?{"this was literal"})def'ghi\\jkl(?{"this is runtime"})mno
6419 * qr'a\\bc_______________________def\'ghi\\\\jkl(?{"this is runtime"})mno'
6421 * After eval_sv()-ing that, grab any new code blocks from the returned qr
6422 * and merge them with any code blocks of the original regexp.
6424 * If the pat is non-UTF8, while the evalled qr is UTF8, don't merge;
6425 * instead, just save the qr and return FALSE; this tells our caller that
6426 * the original pattern needs upgrading to utf8.
6430 S_compile_runtime_code(pTHX_ RExC_state_t * const pRExC_state,
6431 char *pat, STRLEN plen)
6435 GET_RE_DEBUG_FLAGS_DECL;
6437 if (pRExC_state->runtime_code_qr) {
6438 /* this is the second time we've been called; this should
6439 * only happen if the main pattern got upgraded to utf8
6440 * during compilation; re-use the qr we compiled first time
6441 * round (which should be utf8 too)
6443 qr = pRExC_state->runtime_code_qr;
6444 pRExC_state->runtime_code_qr = NULL;
6445 assert(RExC_utf8 && SvUTF8(qr));
6451 int newlen = plen + 6; /* allow for "qr''x\0" extra chars */
6455 /* determine how many extra chars we need for ' and \ escaping */
6456 for (s = 0; s < plen; s++) {
6457 if (pat[s] == '\'' || pat[s] == '\\')
6461 Newx(newpat, newlen, char);
6463 *p++ = 'q'; *p++ = 'r'; *p++ = '\'';
6465 for (s = 0; s < plen; s++) {
6466 if (n < pRExC_state->num_code_blocks
6467 && s == pRExC_state->code_blocks[n].start)
6469 /* blank out literal code block */
6470 assert(pat[s] == '(');
6471 while (s <= pRExC_state->code_blocks[n].end) {
6479 if (pat[s] == '\'' || pat[s] == '\\')
6484 if (pRExC_state->pm_flags & RXf_PMf_EXTENDED)
6488 Perl_re_printf( aTHX_
6489 "%sre-parsing pattern for runtime code:%s %s\n",
6490 PL_colors[4],PL_colors[5],newpat);
6493 sv = newSVpvn_flags(newpat, p-newpat-1, RExC_utf8 ? SVf_UTF8 : 0);
6499 PUSHSTACKi(PERLSI_REQUIRE);
6500 /* G_RE_REPARSING causes the toker to collapse \\ into \ when
6501 * parsing qr''; normally only q'' does this. It also alters
6503 eval_sv(sv, G_SCALAR|G_RE_REPARSING);
6504 SvREFCNT_dec_NN(sv);
6509 SV * const errsv = ERRSV;
6510 if (SvTRUE_NN(errsv))
6512 Safefree(pRExC_state->code_blocks);
6513 /* use croak_sv ? */
6514 Perl_croak_nocontext("%"SVf, SVfARG(errsv));
6517 assert(SvROK(qr_ref));
6519 assert(SvTYPE(qr) == SVt_REGEXP && RX_ENGINE((REGEXP*)qr)->op_comp);
6520 /* the leaving below frees the tmp qr_ref.
6521 * Give qr a life of its own */
6529 if (!RExC_utf8 && SvUTF8(qr)) {
6530 /* first time through; the pattern got upgraded; save the
6531 * qr for the next time through */
6532 assert(!pRExC_state->runtime_code_qr);
6533 pRExC_state->runtime_code_qr = qr;
6538 /* extract any code blocks within the returned qr// */
6541 /* merge the main (r1) and run-time (r2) code blocks into one */
6543 RXi_GET_DECL(ReANY((REGEXP *)qr), r2);
6544 struct reg_code_block *new_block, *dst;
6545 RExC_state_t * const r1 = pRExC_state; /* convenient alias */
6548 if (!r2->num_code_blocks) /* we guessed wrong */
6550 SvREFCNT_dec_NN(qr);
6555 r1->num_code_blocks + r2->num_code_blocks,
6556 struct reg_code_block);
6559 while ( i1 < r1->num_code_blocks
6560 || i2 < r2->num_code_blocks)
6562 struct reg_code_block *src;
6565 if (i1 == r1->num_code_blocks) {
6566 src = &r2->code_blocks[i2++];
6569 else if (i2 == r2->num_code_blocks)
6570 src = &r1->code_blocks[i1++];
6571 else if ( r1->code_blocks[i1].start
6572 < r2->code_blocks[i2].start)
6574 src = &r1->code_blocks[i1++];
6575 assert(src->end < r2->code_blocks[i2].start);
6578 assert( r1->code_blocks[i1].start
6579 > r2->code_blocks[i2].start);
6580 src = &r2->code_blocks[i2++];
6582 assert(src->end < r1->code_blocks[i1].start);
6585 assert(pat[src->start] == '(');
6586 assert(pat[src->end] == ')');
6587 dst->start = src->start;
6588 dst->end = src->end;
6589 dst->block = src->block;
6590 dst->src_regex = is_qr ? (REGEXP*) SvREFCNT_inc( (SV*) qr)
6594 r1->num_code_blocks += r2->num_code_blocks;
6595 Safefree(r1->code_blocks);
6596 r1->code_blocks = new_block;
6599 SvREFCNT_dec_NN(qr);
6605 S_setup_longest(pTHX_ RExC_state_t *pRExC_state, SV* sv_longest,
6606 SV** rx_utf8, SV** rx_substr, SSize_t* rx_end_shift,
6607 SSize_t lookbehind, SSize_t offset, SSize_t *minlen,
6608 STRLEN longest_length, bool eol, bool meol)
6610 /* This is the common code for setting up the floating and fixed length
6611 * string data extracted from Perl_re_op_compile() below. Returns a boolean
6612 * as to whether succeeded or not */
6617 if (! (longest_length
6618 || (eol /* Can't have SEOL and MULTI */
6619 && (! meol || (RExC_flags & RXf_PMf_MULTILINE)))
6621 /* See comments for join_exact for why REG_UNFOLDED_MULTI_SEEN */
6622 || (RExC_seen & REG_UNFOLDED_MULTI_SEEN))
6627 /* copy the information about the longest from the reg_scan_data
6628 over to the program. */
6629 if (SvUTF8(sv_longest)) {
6630 *rx_utf8 = sv_longest;
6633 *rx_substr = sv_longest;
6636 /* end_shift is how many chars that must be matched that
6637 follow this item. We calculate it ahead of time as once the
6638 lookbehind offset is added in we lose the ability to correctly
6640 ml = minlen ? *(minlen) : (SSize_t)longest_length;
6641 *rx_end_shift = ml - offset
6642 - longest_length + (SvTAIL(sv_longest) != 0)
6645 t = (eol/* Can't have SEOL and MULTI */
6646 && (! meol || (RExC_flags & RXf_PMf_MULTILINE)));
6647 fbm_compile(sv_longest, t ? FBMcf_TAIL : 0);
6653 * Perl_re_op_compile - the perl internal RE engine's function to compile a
6654 * regular expression into internal code.
6655 * The pattern may be passed either as:
6656 * a list of SVs (patternp plus pat_count)
6657 * a list of OPs (expr)
6658 * If both are passed, the SV list is used, but the OP list indicates
6659 * which SVs are actually pre-compiled code blocks
6661 * The SVs in the list have magic and qr overloading applied to them (and
6662 * the list may be modified in-place with replacement SVs in the latter
6665 * If the pattern hasn't changed from old_re, then old_re will be
6668 * eng is the current engine. If that engine has an op_comp method, then
6669 * handle directly (i.e. we assume that op_comp was us); otherwise, just
6670 * do the initial concatenation of arguments and pass on to the external
6673 * If is_bare_re is not null, set it to a boolean indicating whether the
6674 * arg list reduced (after overloading) to a single bare regex which has
6675 * been returned (i.e. /$qr/).
6677 * orig_rx_flags contains RXf_* flags. See perlreapi.pod for more details.
6679 * pm_flags contains the PMf_* flags, typically based on those from the
6680 * pm_flags field of the related PMOP. Currently we're only interested in
6681 * PMf_HAS_CV, PMf_IS_QR, PMf_USE_RE_EVAL.
6683 * We can't allocate space until we know how big the compiled form will be,
6684 * but we can't compile it (and thus know how big it is) until we've got a
6685 * place to put the code. So we cheat: we compile it twice, once with code
6686 * generation turned off and size counting turned on, and once "for real".
6687 * This also means that we don't allocate space until we are sure that the
6688 * thing really will compile successfully, and we never have to move the
6689 * code and thus invalidate pointers into it. (Note that it has to be in
6690 * one piece because free() must be able to free it all.) [NB: not true in perl]
6692 * Beware that the optimization-preparation code in here knows about some
6693 * of the structure of the compiled regexp. [I'll say.]
6697 Perl_re_op_compile(pTHX_ SV ** const patternp, int pat_count,
6698 OP *expr, const regexp_engine* eng, REGEXP *old_re,
6699 bool *is_bare_re, U32 orig_rx_flags, U32 pm_flags)
6703 regexp_internal *ri;
6711 SV *code_blocksv = NULL;
6712 SV** new_patternp = patternp;
6714 /* these are all flags - maybe they should be turned
6715 * into a single int with different bit masks */
6716 I32 sawlookahead = 0;
6721 regex_charset initial_charset = get_regex_charset(orig_rx_flags);
6723 bool runtime_code = 0;
6725 RExC_state_t RExC_state;
6726 RExC_state_t * const pRExC_state = &RExC_state;
6727 #ifdef TRIE_STUDY_OPT
6729 RExC_state_t copyRExC_state;
6731 GET_RE_DEBUG_FLAGS_DECL;
6733 PERL_ARGS_ASSERT_RE_OP_COMPILE;
6735 DEBUG_r(if (!PL_colorset) reginitcolors());
6737 /* Initialize these here instead of as-needed, as is quick and avoids
6738 * having to test them each time otherwise */
6739 if (! PL_AboveLatin1) {
6741 char * dump_len_string;
6744 PL_AboveLatin1 = _new_invlist_C_array(AboveLatin1_invlist);
6745 PL_Latin1 = _new_invlist_C_array(Latin1_invlist);
6746 PL_UpperLatin1 = _new_invlist_C_array(UpperLatin1_invlist);
6747 PL_utf8_foldable = _new_invlist_C_array(_Perl_Any_Folds_invlist);
6748 PL_HasMultiCharFold =
6749 _new_invlist_C_array(_Perl_Folds_To_Multi_Char_invlist);
6751 /* This is calculated here, because the Perl program that generates the
6752 * static global ones doesn't currently have access to
6753 * NUM_ANYOF_CODE_POINTS */
6754 PL_InBitmap = _new_invlist(2);
6755 PL_InBitmap = _add_range_to_invlist(PL_InBitmap, 0,
6756 NUM_ANYOF_CODE_POINTS - 1);
6758 dump_len_string = PerlEnv_getenv("PERL_DUMP_RE_MAX_LEN");
6759 if ( ! dump_len_string
6760 || ! grok_atoUV(dump_len_string, (UV *)&PL_dump_re_max_len, NULL))
6762 PL_dump_re_max_len = 0;
6767 pRExC_state->code_blocks = NULL;
6768 pRExC_state->num_code_blocks = 0;
6771 *is_bare_re = FALSE;
6773 if (expr && (expr->op_type == OP_LIST ||
6774 (expr->op_type == OP_NULL && expr->op_targ == OP_LIST))) {
6775 /* allocate code_blocks if needed */
6779 for (o = cLISTOPx(expr)->op_first; o; o = OpSIBLING(o))
6780 if (o->op_type == OP_NULL && (o->op_flags & OPf_SPECIAL))
6781 ncode++; /* count of DO blocks */
6783 pRExC_state->num_code_blocks = ncode;
6784 Newx(pRExC_state->code_blocks, ncode, struct reg_code_block);
6789 /* compile-time pattern with just OP_CONSTs and DO blocks */
6794 /* find how many CONSTs there are */
6797 if (expr->op_type == OP_CONST)
6800 for (o = cLISTOPx(expr)->op_first; o; o = OpSIBLING(o)) {
6801 if (o->op_type == OP_CONST)
6805 /* fake up an SV array */
6807 assert(!new_patternp);
6808 Newx(new_patternp, n, SV*);
6809 SAVEFREEPV(new_patternp);
6813 if (expr->op_type == OP_CONST)
6814 new_patternp[n] = cSVOPx_sv(expr);
6816 for (o = cLISTOPx(expr)->op_first; o; o = OpSIBLING(o)) {
6817 if (o->op_type == OP_CONST)
6818 new_patternp[n++] = cSVOPo_sv;
6823 DEBUG_PARSE_r(Perl_re_printf( aTHX_
6824 "Assembling pattern from %d elements%s\n", pat_count,
6825 orig_rx_flags & RXf_SPLIT ? " for split" : ""));
6827 /* set expr to the first arg op */
6829 if (pRExC_state->num_code_blocks
6830 && expr->op_type != OP_CONST)
6832 expr = cLISTOPx(expr)->op_first;
6833 assert( expr->op_type == OP_PUSHMARK
6834 || (expr->op_type == OP_NULL && expr->op_targ == OP_PUSHMARK)
6835 || expr->op_type == OP_PADRANGE);
6836 expr = OpSIBLING(expr);
6839 pat = S_concat_pat(aTHX_ pRExC_state, NULL, new_patternp, pat_count,
6840 expr, &recompile, NULL);
6842 /* handle bare (possibly after overloading) regex: foo =~ $re */
6847 if (SvTYPE(re) == SVt_REGEXP) {
6851 Safefree(pRExC_state->code_blocks);
6852 DEBUG_PARSE_r(Perl_re_printf( aTHX_
6853 "Precompiled pattern%s\n",
6854 orig_rx_flags & RXf_SPLIT ? " for split" : ""));
6860 exp = SvPV_nomg(pat, plen);
6862 if (!eng->op_comp) {
6863 if ((SvUTF8(pat) && IN_BYTES)
6864 || SvGMAGICAL(pat) || SvAMAGIC(pat))
6866 /* make a temporary copy; either to convert to bytes,
6867 * or to avoid repeating get-magic / overloaded stringify */
6868 pat = newSVpvn_flags(exp, plen, SVs_TEMP |
6869 (IN_BYTES ? 0 : SvUTF8(pat)));
6871 Safefree(pRExC_state->code_blocks);
6872 return CALLREGCOMP_ENG(eng, pat, orig_rx_flags);
6875 /* ignore the utf8ness if the pattern is 0 length */
6876 RExC_utf8 = RExC_orig_utf8 = (plen == 0 || IN_BYTES) ? 0 : SvUTF8(pat);
6878 RExC_uni_semantics = 0;
6879 RExC_seen_unfolded_sharp_s = 0;
6880 RExC_contains_locale = 0;
6881 RExC_contains_i = 0;
6882 RExC_strict = cBOOL(pm_flags & RXf_PMf_STRICT);
6883 RExC_study_started = 0;
6884 pRExC_state->runtime_code_qr = NULL;
6885 RExC_frame_head= NULL;
6886 RExC_frame_last= NULL;
6887 RExC_frame_count= 0;
6890 RExC_mysv1= sv_newmortal();
6891 RExC_mysv2= sv_newmortal();
6894 SV *dsv= sv_newmortal();
6895 RE_PV_QUOTED_DECL(s, RExC_utf8, dsv, exp, plen, 60);
6896 Perl_re_printf( aTHX_ "%sCompiling REx%s %s\n",
6897 PL_colors[4],PL_colors[5],s);
6901 /* we jump here if we have to recompile, e.g., from upgrading the pattern
6904 if ((pm_flags & PMf_USE_RE_EVAL)
6905 /* this second condition covers the non-regex literal case,
6906 * i.e. $foo =~ '(?{})'. */
6907 || (IN_PERL_COMPILETIME && (PL_hints & HINT_RE_EVAL))
6909 runtime_code = S_has_runtime_code(aTHX_ pRExC_state, exp, plen);
6911 /* return old regex if pattern hasn't changed */
6912 /* XXX: note in the below we have to check the flags as well as the
6915 * Things get a touch tricky as we have to compare the utf8 flag
6916 * independently from the compile flags. */
6920 && !!RX_UTF8(old_re) == !!RExC_utf8
6921 && ( RX_COMPFLAGS(old_re) == ( orig_rx_flags & RXf_PMf_FLAGCOPYMASK ) )
6922 && RX_PRECOMP(old_re)
6923 && RX_PRELEN(old_re) == plen
6924 && memEQ(RX_PRECOMP(old_re), exp, plen)
6925 && !runtime_code /* with runtime code, always recompile */ )
6927 Safefree(pRExC_state->code_blocks);
6931 rx_flags = orig_rx_flags;
6933 if (rx_flags & PMf_FOLD) {
6934 RExC_contains_i = 1;
6936 if ( initial_charset == REGEX_DEPENDS_CHARSET
6937 && (RExC_utf8 ||RExC_uni_semantics))
6940 /* Set to use unicode semantics if the pattern is in utf8 and has the
6941 * 'depends' charset specified, as it means unicode when utf8 */
6942 set_regex_charset(&rx_flags, REGEX_UNICODE_CHARSET);
6946 RExC_precomp_adj = 0;
6947 RExC_flags = rx_flags;
6948 RExC_pm_flags = pm_flags;
6951 assert(TAINTING_get || !TAINT_get);
6953 Perl_croak(aTHX_ "Eval-group in insecure regular expression");
6955 if (!S_compile_runtime_code(aTHX_ pRExC_state, exp, plen)) {
6956 /* whoops, we have a non-utf8 pattern, whilst run-time code
6957 * got compiled as utf8. Try again with a utf8 pattern */
6958 S_pat_upgrade_to_utf8(aTHX_ pRExC_state, &exp, &plen,
6959 pRExC_state->num_code_blocks);
6960 goto redo_first_pass;
6963 assert(!pRExC_state->runtime_code_qr);
6969 RExC_in_lookbehind = 0;
6970 RExC_seen_zerolen = *exp == '^' ? -1 : 0;
6972 RExC_override_recoding = 0;
6974 RExC_recode_x_to_native = 0;
6976 RExC_in_multi_char_class = 0;
6978 /* First pass: determine size, legality. */
6980 RExC_start = RExC_adjusted_start = exp;
6981 RExC_end = exp + plen;
6982 RExC_precomp_end = RExC_end;
6987 RExC_emit = (regnode *) &RExC_emit_dummy;
6988 RExC_whilem_seen = 0;
6989 RExC_open_parens = NULL;
6990 RExC_close_parens = NULL;
6992 RExC_paren_names = NULL;
6994 RExC_paren_name_list = NULL;
6996 RExC_recurse = NULL;
6997 RExC_study_chunk_recursed = NULL;
6998 RExC_study_chunk_recursed_bytes= 0;
6999 RExC_recurse_count = 0;
7000 pRExC_state->code_index = 0;
7002 /* This NUL is guaranteed because the pattern comes from an SV*, and the sv
7003 * code makes sure the final byte is an uncounted NUL. But should this
7004 * ever not be the case, lots of things could read beyond the end of the
7005 * buffer: loops like
7006 * while(isFOO(*RExC_parse)) RExC_parse++;
7007 * strchr(RExC_parse, "foo");
7008 * etc. So it is worth noting. */
7009 assert(*RExC_end == '\0');
7012 Perl_re_printf( aTHX_ "Starting first pass (sizing)\n");
7014 RExC_lastparse=NULL;
7016 /* reg may croak on us, not giving us a chance to free
7017 pRExC_state->code_blocks. We cannot SAVEFREEPV it now, as we may
7018 need it to survive as long as the regexp (qr/(?{})/).
7019 We must check that code_blocksv is not already set, because we may
7020 have jumped back to restart the sizing pass. */
7021 if (pRExC_state->code_blocks && !code_blocksv) {
7022 code_blocksv = newSV_type(SVt_PV);
7023 SAVEFREESV(code_blocksv);
7024 SvPV_set(code_blocksv, (char *)pRExC_state->code_blocks);
7025 SvLEN_set(code_blocksv, 1); /*sufficient to make sv_clear free it*/
7027 if (reg(pRExC_state, 0, &flags,1) == NULL) {
7028 /* It's possible to write a regexp in ascii that represents Unicode
7029 codepoints outside of the byte range, such as via \x{100}. If we
7030 detect such a sequence we have to convert the entire pattern to utf8
7031 and then recompile, as our sizing calculation will have been based
7032 on 1 byte == 1 character, but we will need to use utf8 to encode
7033 at least some part of the pattern, and therefore must convert the whole
7036 if (flags & RESTART_PASS1) {
7037 if (flags & NEED_UTF8) {
7038 S_pat_upgrade_to_utf8(aTHX_ pRExC_state, &exp, &plen,
7039 pRExC_state->num_code_blocks);
7042 DEBUG_PARSE_r(Perl_re_printf( aTHX_
7043 "Need to redo pass 1\n"));
7046 goto redo_first_pass;
7048 Perl_croak(aTHX_ "panic: reg returned NULL to re_op_compile for sizing pass, flags=%#"UVxf"", (UV) flags);
7051 SvLEN_set(code_blocksv,0); /* no you can't have it, sv_clear */
7054 Perl_re_printf( aTHX_
7055 "Required size %"IVdf" nodes\n"
7056 "Starting second pass (creation)\n",
7059 RExC_lastparse=NULL;
7062 /* The first pass could have found things that force Unicode semantics */
7063 if ((RExC_utf8 || RExC_uni_semantics)
7064 && get_regex_charset(rx_flags) == REGEX_DEPENDS_CHARSET)
7066 set_regex_charset(&rx_flags, REGEX_UNICODE_CHARSET);
7069 /* Small enough for pointer-storage convention?
7070 If extralen==0, this means that we will not need long jumps. */
7071 if (RExC_size >= 0x10000L && RExC_extralen)
7072 RExC_size += RExC_extralen;
7075 if (RExC_whilem_seen > 15)
7076 RExC_whilem_seen = 15;
7078 /* Allocate space and zero-initialize. Note, the two step process
7079 of zeroing when in debug mode, thus anything assigned has to
7080 happen after that */
7081 rx = (REGEXP*) newSV_type(SVt_REGEXP);
7083 Newxc(ri, sizeof(regexp_internal) + (unsigned)RExC_size * sizeof(regnode),
7084 char, regexp_internal);
7085 if ( r == NULL || ri == NULL )
7086 FAIL("Regexp out of space");
7088 /* avoid reading uninitialized memory in DEBUGGING code in study_chunk() */
7089 Zero(ri, sizeof(regexp_internal) + (unsigned)RExC_size * sizeof(regnode),
7092 /* bulk initialize base fields with 0. */
7093 Zero(ri, sizeof(regexp_internal), char);
7096 /* non-zero initialization begins here */
7099 r->extflags = rx_flags;
7100 RXp_COMPFLAGS(r) = orig_rx_flags & RXf_PMf_FLAGCOPYMASK;
7102 if (pm_flags & PMf_IS_QR) {
7103 ri->code_blocks = pRExC_state->code_blocks;
7104 ri->num_code_blocks = pRExC_state->num_code_blocks;
7109 for (n = 0; n < pRExC_state->num_code_blocks; n++)
7110 if (pRExC_state->code_blocks[n].src_regex)
7111 SAVEFREESV(pRExC_state->code_blocks[n].src_regex);
7112 if(pRExC_state->code_blocks)
7113 SAVEFREEPV(pRExC_state->code_blocks); /* often null */
7117 bool has_p = ((r->extflags & RXf_PMf_KEEPCOPY) == RXf_PMf_KEEPCOPY);
7118 bool has_charset = (get_regex_charset(r->extflags)
7119 != REGEX_DEPENDS_CHARSET);
7121 /* The caret is output if there are any defaults: if not all the STD
7122 * flags are set, or if no character set specifier is needed */
7124 (((r->extflags & RXf_PMf_STD_PMMOD) != RXf_PMf_STD_PMMOD)
7126 bool has_runon = ((RExC_seen & REG_RUN_ON_COMMENT_SEEN)
7127 == REG_RUN_ON_COMMENT_SEEN);
7128 U8 reganch = (U8)((r->extflags & RXf_PMf_STD_PMMOD)
7129 >> RXf_PMf_STD_PMMOD_SHIFT);
7130 const char *fptr = STD_PAT_MODS; /*"msixn"*/
7133 /* We output all the necessary flags; we never output a minus, as all
7134 * those are defaults, so are
7135 * covered by the caret */
7136 const STRLEN wraplen = plen + has_p + has_runon
7137 + has_default /* If needs a caret */
7138 + PL_bitcount[reganch] /* 1 char for each set standard flag */
7140 /* If needs a character set specifier */
7141 + ((has_charset) ? MAX_CHARSET_NAME_LENGTH : 0)
7142 + (sizeof("(?:)") - 1);
7144 /* make sure PL_bitcount bounds not exceeded */
7145 assert(sizeof(STD_PAT_MODS) <= 8);
7147 Newx(p, wraplen + 1, char); /* +1 for the ending NUL */
7148 r->xpv_len_u.xpvlenu_pv = p;
7150 SvFLAGS(rx) |= SVf_UTF8;
7153 /* If a default, cover it using the caret */
7155 *p++= DEFAULT_PAT_MOD;
7159 const char* const name = get_regex_charset_name(r->extflags, &len);
7160 Copy(name, p, len, char);
7164 *p++ = KEEPCOPY_PAT_MOD; /*'p'*/
7167 while((ch = *fptr++)) {
7175 Copy(RExC_precomp, p, plen, char);
7176 assert ((RX_WRAPPED(rx) - p) < 16);
7177 r->pre_prefix = p - RX_WRAPPED(rx);
7183 SvCUR_set(rx, p - RX_WRAPPED(rx));
7187 r->nparens = RExC_npar - 1; /* set early to validate backrefs */
7189 /* Useful during FAIL. */
7190 #ifdef RE_TRACK_PATTERN_OFFSETS
7191 Newxz(ri->u.offsets, 2*RExC_size+1, U32); /* MJD 20001228 */
7192 DEBUG_OFFSETS_r(Perl_re_printf( aTHX_
7193 "%s %"UVuf" bytes for offset annotations.\n",
7194 ri->u.offsets ? "Got" : "Couldn't get",
7195 (UV)((2*RExC_size+1) * sizeof(U32))));
7197 SetProgLen(ri,RExC_size);
7202 /* Second pass: emit code. */
7203 RExC_flags = rx_flags; /* don't let top level (?i) bleed */
7204 RExC_pm_flags = pm_flags;
7206 RExC_end = exp + plen;
7208 RExC_emit_start = ri->program;
7209 RExC_emit = ri->program;
7210 RExC_emit_bound = ri->program + RExC_size + 1;
7211 pRExC_state->code_index = 0;
7213 *((char*) RExC_emit++) = (char) REG_MAGIC;
7214 /* setup various meta data about recursion, this all requires
7215 * RExC_npar to be correctly set, and a bit later on we clear it */
7216 if (RExC_seen & REG_RECURSE_SEEN) {
7217 DEBUG_OPTIMISE_MORE_r(Perl_re_printf( aTHX_
7218 "%*s%*s Setting up open/close parens\n",
7219 22, "| |", (int)(0 * 2 + 1), ""));
7221 /* setup RExC_open_parens, which holds the address of each
7222 * OPEN tag, and to make things simpler for the 0 index
7223 * the start of the program - this is used later for offsets */
7224 Newxz(RExC_open_parens, RExC_npar,regnode *);
7225 SAVEFREEPV(RExC_open_parens);
7226 RExC_open_parens[0] = RExC_emit;
7228 /* setup RExC_close_parens, which holds the address of each
7229 * CLOSE tag, and to make things simpler for the 0 index
7230 * the end of the program - this is used later for offsets */
7231 Newxz(RExC_close_parens, RExC_npar,regnode *);
7232 SAVEFREEPV(RExC_close_parens);
7233 /* we dont know where end op starts yet, so we dont
7234 * need to set RExC_close_parens[0] like we do RExC_open_parens[0] above */
7236 /* Note, RExC_npar is 1 + the number of parens in a pattern.
7237 * So its 1 if there are no parens. */
7238 RExC_study_chunk_recursed_bytes= (RExC_npar >> 3) +
7239 ((RExC_npar & 0x07) != 0);
7240 Newx(RExC_study_chunk_recursed,
7241 RExC_study_chunk_recursed_bytes * RExC_npar, U8);
7242 SAVEFREEPV(RExC_study_chunk_recursed);
7245 if (reg(pRExC_state, 0, &flags,1) == NULL) {
7247 Perl_croak(aTHX_ "panic: reg returned NULL to re_op_compile for generation pass, flags=%#"UVxf"", (UV) flags);
7250 Perl_re_printf( aTHX_ "Starting post parse optimization\n");
7253 /* XXXX To minimize changes to RE engine we always allocate
7254 3-units-long substrs field. */
7255 Newx(r->substrs, 1, struct reg_substr_data);
7256 if (RExC_recurse_count) {
7257 Newxz(RExC_recurse,RExC_recurse_count,regnode *);
7258 SAVEFREEPV(RExC_recurse);
7262 r->minlen = minlen = sawlookahead = sawplus = sawopen = sawminmod = 0;
7264 RExC_study_chunk_recursed_count= 0;
7266 Zero(r->substrs, 1, struct reg_substr_data);
7267 if (RExC_study_chunk_recursed) {
7268 Zero(RExC_study_chunk_recursed,
7269 RExC_study_chunk_recursed_bytes * RExC_npar, U8);
7273 #ifdef TRIE_STUDY_OPT
7275 StructCopy(&zero_scan_data, &data, scan_data_t);
7276 copyRExC_state = RExC_state;
7279 DEBUG_OPTIMISE_r(Perl_re_printf( aTHX_ "Restudying\n"));
7281 RExC_state = copyRExC_state;
7282 if (seen & REG_TOP_LEVEL_BRANCHES_SEEN)
7283 RExC_seen |= REG_TOP_LEVEL_BRANCHES_SEEN;
7285 RExC_seen &= ~REG_TOP_LEVEL_BRANCHES_SEEN;
7286 StructCopy(&zero_scan_data, &data, scan_data_t);
7289 StructCopy(&zero_scan_data, &data, scan_data_t);
7292 /* Dig out information for optimizations. */
7293 r->extflags = RExC_flags; /* was pm_op */
7294 /*dmq: removed as part of de-PMOP: pm->op_pmflags = RExC_flags; */
7297 SvUTF8_on(rx); /* Unicode in it? */
7298 ri->regstclass = NULL;
7299 if (RExC_naughty >= TOO_NAUGHTY) /* Probably an expensive pattern. */
7300 r->intflags |= PREGf_NAUGHTY;
7301 scan = ri->program + 1; /* First BRANCH. */
7303 /* testing for BRANCH here tells us whether there is "must appear"
7304 data in the pattern. If there is then we can use it for optimisations */
7305 if (!(RExC_seen & REG_TOP_LEVEL_BRANCHES_SEEN)) { /* Only one top-level choice.
7308 STRLEN longest_float_length, longest_fixed_length;
7309 regnode_ssc ch_class; /* pointed to by data */
7311 SSize_t last_close = 0; /* pointed to by data */
7312 regnode *first= scan;
7313 regnode *first_next= regnext(first);
7315 * Skip introductions and multiplicators >= 1
7316 * so that we can extract the 'meat' of the pattern that must
7317 * match in the large if() sequence following.
7318 * NOTE that EXACT is NOT covered here, as it is normally
7319 * picked up by the optimiser separately.
7321 * This is unfortunate as the optimiser isnt handling lookahead
7322 * properly currently.
7325 while ((OP(first) == OPEN && (sawopen = 1)) ||
7326 /* An OR of *one* alternative - should not happen now. */
7327 (OP(first) == BRANCH && OP(first_next) != BRANCH) ||
7328 /* for now we can't handle lookbehind IFMATCH*/
7329 (OP(first) == IFMATCH && !first->flags && (sawlookahead = 1)) ||
7330 (OP(first) == PLUS) ||
7331 (OP(first) == MINMOD) ||
7332 /* An {n,m} with n>0 */
7333 (PL_regkind[OP(first)] == CURLY && ARG1(first) > 0) ||
7334 (OP(first) == NOTHING && PL_regkind[OP(first_next)] != END ))
7337 * the only op that could be a regnode is PLUS, all the rest
7338 * will be regnode_1 or regnode_2.
7340 * (yves doesn't think this is true)
7342 if (OP(first) == PLUS)
7345 if (OP(first) == MINMOD)
7347 first += regarglen[OP(first)];
7349 first = NEXTOPER(first);
7350 first_next= regnext(first);
7353 /* Starting-point info. */
7355 DEBUG_PEEP("first:",first,0);
7356 /* Ignore EXACT as we deal with it later. */
7357 if (PL_regkind[OP(first)] == EXACT) {
7358 if (OP(first) == EXACT || OP(first) == EXACTL)
7359 NOOP; /* Empty, get anchored substr later. */
7361 ri->regstclass = first;
7364 else if (PL_regkind[OP(first)] == TRIE &&
7365 ((reg_trie_data *)ri->data->data[ ARG(first) ])->minlen>0)
7367 /* this can happen only on restudy */
7368 ri->regstclass = construct_ahocorasick_from_trie(pRExC_state, (regnode *)first, 0);
7371 else if (REGNODE_SIMPLE(OP(first)))
7372 ri->regstclass = first;
7373 else if (PL_regkind[OP(first)] == BOUND ||
7374 PL_regkind[OP(first)] == NBOUND)
7375 ri->regstclass = first;
7376 else if (PL_regkind[OP(first)] == BOL) {
7377 r->intflags |= (OP(first) == MBOL
7380 first = NEXTOPER(first);
7383 else if (OP(first) == GPOS) {
7384 r->intflags |= PREGf_ANCH_GPOS;
7385 first = NEXTOPER(first);
7388 else if ((!sawopen || !RExC_sawback) &&
7390 (OP(first) == STAR &&
7391 PL_regkind[OP(NEXTOPER(first))] == REG_ANY) &&
7392 !(r->intflags & PREGf_ANCH) && !pRExC_state->num_code_blocks)
7394 /* turn .* into ^.* with an implied $*=1 */
7396 (OP(NEXTOPER(first)) == REG_ANY)
7399 r->intflags |= (type | PREGf_IMPLICIT);
7400 first = NEXTOPER(first);
7403 if (sawplus && !sawminmod && !sawlookahead
7404 && (!sawopen || !RExC_sawback)
7405 && !pRExC_state->num_code_blocks) /* May examine pos and $& */
7406 /* x+ must match at the 1st pos of run of x's */
7407 r->intflags |= PREGf_SKIP;
7409 /* Scan is after the zeroth branch, first is atomic matcher. */
7410 #ifdef TRIE_STUDY_OPT
7413 Perl_re_printf( aTHX_ "first at %"IVdf"\n",
7414 (IV)(first - scan + 1))
7418 Perl_re_printf( aTHX_ "first at %"IVdf"\n",
7419 (IV)(first - scan + 1))
7425 * If there's something expensive in the r.e., find the
7426 * longest literal string that must appear and make it the
7427 * regmust. Resolve ties in favor of later strings, since
7428 * the regstart check works with the beginning of the r.e.
7429 * and avoiding duplication strengthens checking. Not a
7430 * strong reason, but sufficient in the absence of others.
7431 * [Now we resolve ties in favor of the earlier string if
7432 * it happens that c_offset_min has been invalidated, since the
7433 * earlier string may buy us something the later one won't.]
7436 data.longest_fixed = newSVpvs("");
7437 data.longest_float = newSVpvs("");
7438 data.last_found = newSVpvs("");
7439 data.longest = &(data.longest_fixed);
7440 ENTER_with_name("study_chunk");
7441 SAVEFREESV(data.longest_fixed);
7442 SAVEFREESV(data.longest_float);
7443 SAVEFREESV(data.last_found);
7445 if (!ri->regstclass) {
7446 ssc_init(pRExC_state, &ch_class);
7447 data.start_class = &ch_class;
7448 stclass_flag = SCF_DO_STCLASS_AND;
7449 } else /* XXXX Check for BOUND? */
7451 data.last_closep = &last_close;
7454 minlen = study_chunk(pRExC_state, &first, &minlen, &fake,
7455 scan + RExC_size, /* Up to end */
7457 SCF_DO_SUBSTR | SCF_WHILEM_VISITED_POS | stclass_flag
7458 | (restudied ? SCF_TRIE_DOING_RESTUDY : 0),
7462 CHECK_RESTUDY_GOTO_butfirst(LEAVE_with_name("study_chunk"));
7465 if ( RExC_npar == 1 && data.longest == &(data.longest_fixed)
7466 && data.last_start_min == 0 && data.last_end > 0
7467 && !RExC_seen_zerolen
7468 && !(RExC_seen & REG_VERBARG_SEEN)
7469 && !(RExC_seen & REG_GPOS_SEEN)
7471 r->extflags |= RXf_CHECK_ALL;
7473 scan_commit(pRExC_state, &data,&minlen,0);
7475 longest_float_length = CHR_SVLEN(data.longest_float);
7477 if (! ((SvCUR(data.longest_fixed) /* ok to leave SvCUR */
7478 && data.offset_fixed == data.offset_float_min
7479 && SvCUR(data.longest_fixed) == SvCUR(data.longest_float)))
7480 && S_setup_longest (aTHX_ pRExC_state,
7484 &(r->float_end_shift),
7485 data.lookbehind_float,
7486 data.offset_float_min,
7488 longest_float_length,
7489 cBOOL(data.flags & SF_FL_BEFORE_EOL),
7490 cBOOL(data.flags & SF_FL_BEFORE_MEOL)))
7492 r->float_min_offset = data.offset_float_min - data.lookbehind_float;
7493 r->float_max_offset = data.offset_float_max;
7494 if (data.offset_float_max < SSize_t_MAX) /* Don't offset infinity */
7495 r->float_max_offset -= data.lookbehind_float;
7496 SvREFCNT_inc_simple_void_NN(data.longest_float);
7499 r->float_substr = r->float_utf8 = NULL;
7500 longest_float_length = 0;
7503 longest_fixed_length = CHR_SVLEN(data.longest_fixed);
7505 if (S_setup_longest (aTHX_ pRExC_state,
7507 &(r->anchored_utf8),
7508 &(r->anchored_substr),
7509 &(r->anchored_end_shift),
7510 data.lookbehind_fixed,
7513 longest_fixed_length,
7514 cBOOL(data.flags & SF_FIX_BEFORE_EOL),
7515 cBOOL(data.flags & SF_FIX_BEFORE_MEOL)))
7517 r->anchored_offset = data.offset_fixed - data.lookbehind_fixed;
7518 SvREFCNT_inc_simple_void_NN(data.longest_fixed);
7521 r->anchored_substr = r->anchored_utf8 = NULL;
7522 longest_fixed_length = 0;
7524 LEAVE_with_name("study_chunk");
7527 && (OP(ri->regstclass) == REG_ANY || OP(ri->regstclass) == SANY))
7528 ri->regstclass = NULL;
7530 if ((!(r->anchored_substr || r->anchored_utf8) || r->anchored_offset)
7532 && ! (ANYOF_FLAGS(data.start_class) & SSC_MATCHES_EMPTY_STRING)
7533 && is_ssc_worth_it(pRExC_state, data.start_class))
7535 const U32 n = add_data(pRExC_state, STR_WITH_LEN("f"));
7537 ssc_finalize(pRExC_state, data.start_class);
7539 Newx(RExC_rxi->data->data[n], 1, regnode_ssc);
7540 StructCopy(data.start_class,
7541 (regnode_ssc*)RExC_rxi->data->data[n],
7543 ri->regstclass = (regnode*)RExC_rxi->data->data[n];
7544 r->intflags &= ~PREGf_SKIP; /* Used in find_byclass(). */
7545 DEBUG_COMPILE_r({ SV *sv = sv_newmortal();
7546 regprop(r, sv, (regnode*)data.start_class, NULL, pRExC_state);
7547 Perl_re_printf( aTHX_
7548 "synthetic stclass \"%s\".\n",
7549 SvPVX_const(sv));});
7550 data.start_class = NULL;
7553 /* A temporary algorithm prefers floated substr to fixed one to dig
7555 if (longest_fixed_length > longest_float_length) {
7556 r->substrs->check_ix = 0;
7557 r->check_end_shift = r->anchored_end_shift;
7558 r->check_substr = r->anchored_substr;
7559 r->check_utf8 = r->anchored_utf8;
7560 r->check_offset_min = r->check_offset_max = r->anchored_offset;
7561 if (r->intflags & (PREGf_ANCH_SBOL|PREGf_ANCH_GPOS))
7562 r->intflags |= PREGf_NOSCAN;
7565 r->substrs->check_ix = 1;
7566 r->check_end_shift = r->float_end_shift;
7567 r->check_substr = r->float_substr;
7568 r->check_utf8 = r->float_utf8;
7569 r->check_offset_min = r->float_min_offset;
7570 r->check_offset_max = r->float_max_offset;
7572 if ((r->check_substr || r->check_utf8) ) {
7573 r->extflags |= RXf_USE_INTUIT;
7574 if (SvTAIL(r->check_substr ? r->check_substr : r->check_utf8))
7575 r->extflags |= RXf_INTUIT_TAIL;
7577 r->substrs->data[0].max_offset = r->substrs->data[0].min_offset;
7579 /* XXX Unneeded? dmq (shouldn't as this is handled elsewhere)
7580 if ( (STRLEN)minlen < longest_float_length )
7581 minlen= longest_float_length;
7582 if ( (STRLEN)minlen < longest_fixed_length )
7583 minlen= longest_fixed_length;
7587 /* Several toplevels. Best we can is to set minlen. */
7589 regnode_ssc ch_class;
7590 SSize_t last_close = 0;
7592 DEBUG_PARSE_r(Perl_re_printf( aTHX_ "\nMulti Top Level\n"));
7594 scan = ri->program + 1;
7595 ssc_init(pRExC_state, &ch_class);
7596 data.start_class = &ch_class;
7597 data.last_closep = &last_close;
7600 minlen = study_chunk(pRExC_state,
7601 &scan, &minlen, &fake, scan + RExC_size, &data, -1, 0, NULL,
7602 SCF_DO_STCLASS_AND|SCF_WHILEM_VISITED_POS|(restudied
7603 ? SCF_TRIE_DOING_RESTUDY
7607 CHECK_RESTUDY_GOTO_butfirst(NOOP);
7609 r->check_substr = r->check_utf8 = r->anchored_substr = r->anchored_utf8
7610 = r->float_substr = r->float_utf8 = NULL;
7612 if (! (ANYOF_FLAGS(data.start_class) & SSC_MATCHES_EMPTY_STRING)
7613 && is_ssc_worth_it(pRExC_state, data.start_class))
7615 const U32 n = add_data(pRExC_state, STR_WITH_LEN("f"));
7617 ssc_finalize(pRExC_state, data.start_class);
7619 Newx(RExC_rxi->data->data[n], 1, regnode_ssc);
7620 StructCopy(data.start_class,
7621 (regnode_ssc*)RExC_rxi->data->data[n],
7623 ri->regstclass = (regnode*)RExC_rxi->data->data[n];
7624 r->intflags &= ~PREGf_SKIP; /* Used in find_byclass(). */
7625 DEBUG_COMPILE_r({ SV* sv = sv_newmortal();
7626 regprop(r, sv, (regnode*)data.start_class, NULL, pRExC_state);
7627 Perl_re_printf( aTHX_
7628 "synthetic stclass \"%s\".\n",
7629 SvPVX_const(sv));});
7630 data.start_class = NULL;
7634 if (RExC_seen & REG_UNBOUNDED_QUANTIFIER_SEEN) {
7635 r->extflags |= RXf_UNBOUNDED_QUANTIFIER_SEEN;
7636 r->maxlen = REG_INFTY;
7639 r->maxlen = RExC_maxlen;
7642 /* Guard against an embedded (?=) or (?<=) with a longer minlen than
7643 the "real" pattern. */
7645 Perl_re_printf( aTHX_ "minlen: %"IVdf" r->minlen:%"IVdf" maxlen:%"IVdf"\n",
7646 (IV)minlen, (IV)r->minlen, (IV)RExC_maxlen);
7648 r->minlenret = minlen;
7649 if (r->minlen < minlen)
7652 if (RExC_seen & REG_RECURSE_SEEN ) {
7653 r->intflags |= PREGf_RECURSE_SEEN;
7654 Newxz(r->recurse_locinput, r->nparens + 1, char *);
7656 if (RExC_seen & REG_GPOS_SEEN)
7657 r->intflags |= PREGf_GPOS_SEEN;
7658 if (RExC_seen & REG_LOOKBEHIND_SEEN)
7659 r->extflags |= RXf_NO_INPLACE_SUBST; /* inplace might break the
7661 if (pRExC_state->num_code_blocks)
7662 r->extflags |= RXf_EVAL_SEEN;
7663 if (RExC_seen & REG_VERBARG_SEEN)
7665 r->intflags |= PREGf_VERBARG_SEEN;
7666 r->extflags |= RXf_NO_INPLACE_SUBST; /* don't understand this! Yves */
7668 if (RExC_seen & REG_CUTGROUP_SEEN)
7669 r->intflags |= PREGf_CUTGROUP_SEEN;
7670 if (pm_flags & PMf_USE_RE_EVAL)
7671 r->intflags |= PREGf_USE_RE_EVAL;
7672 if (RExC_paren_names)
7673 RXp_PAREN_NAMES(r) = MUTABLE_HV(SvREFCNT_inc(RExC_paren_names));
7675 RXp_PAREN_NAMES(r) = NULL;
7677 /* If we have seen an anchor in our pattern then we set the extflag RXf_IS_ANCHORED
7678 * so it can be used in pp.c */
7679 if (r->intflags & PREGf_ANCH)
7680 r->extflags |= RXf_IS_ANCHORED;
7684 /* this is used to identify "special" patterns that might result
7685 * in Perl NOT calling the regex engine and instead doing the match "itself",
7686 * particularly special cases in split//. By having the regex compiler
7687 * do this pattern matching at a regop level (instead of by inspecting the pattern)
7688 * we avoid weird issues with equivalent patterns resulting in different behavior,
7689 * AND we allow non Perl engines to get the same optimizations by the setting the
7690 * flags appropriately - Yves */
7691 regnode *first = ri->program + 1;
7693 regnode *next = regnext(first);
7696 if (PL_regkind[fop] == NOTHING && nop == END)
7697 r->extflags |= RXf_NULL;
7698 else if ((fop == MBOL || (fop == SBOL && !first->flags)) && nop == END)
7699 /* when fop is SBOL first->flags will be true only when it was
7700 * produced by parsing /\A/, and not when parsing /^/. This is
7701 * very important for the split code as there we want to
7702 * treat /^/ as /^/m, but we do not want to treat /\A/ as /^/m.
7703 * See rt #122761 for more details. -- Yves */
7704 r->extflags |= RXf_START_ONLY;
7705 else if (fop == PLUS
7706 && PL_regkind[nop] == POSIXD && FLAGS(next) == _CC_SPACE
7708 r->extflags |= RXf_WHITE;
7709 else if ( r->extflags & RXf_SPLIT
7710 && (fop == EXACT || fop == EXACTL)
7711 && STR_LEN(first) == 1
7712 && *(STRING(first)) == ' '
7714 r->extflags |= (RXf_SKIPWHITE|RXf_WHITE);
7718 if (RExC_contains_locale) {
7719 RXp_EXTFLAGS(r) |= RXf_TAINTED;
7723 if (RExC_paren_names) {
7724 ri->name_list_idx = add_data( pRExC_state, STR_WITH_LEN("a"));
7725 ri->data->data[ri->name_list_idx]
7726 = (void*)SvREFCNT_inc(RExC_paren_name_list);
7729 ri->name_list_idx = 0;
7731 while ( RExC_recurse_count > 0 ) {
7732 const regnode *scan = RExC_recurse[ --RExC_recurse_count ];
7733 ARG2L_SET( scan, RExC_open_parens[ARG(scan)] - scan );
7736 Newxz(r->offs, RExC_npar, regexp_paren_pair);
7737 /* assume we don't need to swap parens around before we match */
7739 Perl_re_printf( aTHX_ "study_chunk_recursed_count: %lu\n",
7740 (unsigned long)RExC_study_chunk_recursed_count);
7744 Perl_re_printf( aTHX_ "Final program:\n");
7747 #ifdef RE_TRACK_PATTERN_OFFSETS
7748 DEBUG_OFFSETS_r(if (ri->u.offsets) {
7749 const STRLEN len = ri->u.offsets[0];
7751 GET_RE_DEBUG_FLAGS_DECL;
7752 Perl_re_printf( aTHX_
7753 "Offsets: [%"UVuf"]\n\t", (UV)ri->u.offsets[0]);
7754 for (i = 1; i <= len; i++) {
7755 if (ri->u.offsets[i*2-1] || ri->u.offsets[i*2])
7756 Perl_re_printf( aTHX_ "%"UVuf":%"UVuf"[%"UVuf"] ",
7757 (UV)i, (UV)ri->u.offsets[i*2-1], (UV)ri->u.offsets[i*2]);
7759 Perl_re_printf( aTHX_ "\n");
7764 /* under ithreads the ?pat? PMf_USED flag on the pmop is simulated
7765 * by setting the regexp SV to readonly-only instead. If the
7766 * pattern's been recompiled, the USEDness should remain. */
7767 if (old_re && SvREADONLY(old_re))
7775 Perl_reg_named_buff(pTHX_ REGEXP * const rx, SV * const key, SV * const value,
7778 PERL_ARGS_ASSERT_REG_NAMED_BUFF;
7780 PERL_UNUSED_ARG(value);
7782 if (flags & RXapif_FETCH) {
7783 return reg_named_buff_fetch(rx, key, flags);
7784 } else if (flags & (RXapif_STORE | RXapif_DELETE | RXapif_CLEAR)) {
7785 Perl_croak_no_modify();
7787 } else if (flags & RXapif_EXISTS) {
7788 return reg_named_buff_exists(rx, key, flags)
7791 } else if (flags & RXapif_REGNAMES) {
7792 return reg_named_buff_all(rx, flags);
7793 } else if (flags & (RXapif_SCALAR | RXapif_REGNAMES_COUNT)) {
7794 return reg_named_buff_scalar(rx, flags);
7796 Perl_croak(aTHX_ "panic: Unknown flags %d in named_buff", (int)flags);
7802 Perl_reg_named_buff_iter(pTHX_ REGEXP * const rx, const SV * const lastkey,
7805 PERL_ARGS_ASSERT_REG_NAMED_BUFF_ITER;
7806 PERL_UNUSED_ARG(lastkey);
7808 if (flags & RXapif_FIRSTKEY)
7809 return reg_named_buff_firstkey(rx, flags);
7810 else if (flags & RXapif_NEXTKEY)
7811 return reg_named_buff_nextkey(rx, flags);
7813 Perl_croak(aTHX_ "panic: Unknown flags %d in named_buff_iter",
7820 Perl_reg_named_buff_fetch(pTHX_ REGEXP * const r, SV * const namesv,
7823 AV *retarray = NULL;
7825 struct regexp *const rx = ReANY(r);
7827 PERL_ARGS_ASSERT_REG_NAMED_BUFF_FETCH;
7829 if (flags & RXapif_ALL)
7832 if (rx && RXp_PAREN_NAMES(rx)) {
7833 HE *he_str = hv_fetch_ent( RXp_PAREN_NAMES(rx), namesv, 0, 0 );
7836 SV* sv_dat=HeVAL(he_str);
7837 I32 *nums=(I32*)SvPVX(sv_dat);
7838 for ( i=0; i<SvIVX(sv_dat); i++ ) {
7839 if ((I32)(rx->nparens) >= nums[i]
7840 && rx->offs[nums[i]].start != -1
7841 && rx->offs[nums[i]].end != -1)
7844 CALLREG_NUMBUF_FETCH(r,nums[i],ret);
7849 ret = newSVsv(&PL_sv_undef);
7852 av_push(retarray, ret);
7855 return newRV_noinc(MUTABLE_SV(retarray));
7862 Perl_reg_named_buff_exists(pTHX_ REGEXP * const r, SV * const key,
7865 struct regexp *const rx = ReANY(r);
7867 PERL_ARGS_ASSERT_REG_NAMED_BUFF_EXISTS;
7869 if (rx && RXp_PAREN_NAMES(rx)) {
7870 if (flags & RXapif_ALL) {
7871 return hv_exists_ent(RXp_PAREN_NAMES(rx), key, 0);
7873 SV *sv = CALLREG_NAMED_BUFF_FETCH(r, key, flags);
7875 SvREFCNT_dec_NN(sv);
7887 Perl_reg_named_buff_firstkey(pTHX_ REGEXP * const r, const U32 flags)
7889 struct regexp *const rx = ReANY(r);
7891 PERL_ARGS_ASSERT_REG_NAMED_BUFF_FIRSTKEY;
7893 if ( rx && RXp_PAREN_NAMES(rx) ) {
7894 (void)hv_iterinit(RXp_PAREN_NAMES(rx));
7896 return CALLREG_NAMED_BUFF_NEXTKEY(r, NULL, flags & ~RXapif_FIRSTKEY);
7903 Perl_reg_named_buff_nextkey(pTHX_ REGEXP * const r, const U32 flags)
7905 struct regexp *const rx = ReANY(r);
7906 GET_RE_DEBUG_FLAGS_DECL;
7908 PERL_ARGS_ASSERT_REG_NAMED_BUFF_NEXTKEY;
7910 if (rx && RXp_PAREN_NAMES(rx)) {
7911 HV *hv = RXp_PAREN_NAMES(rx);
7913 while ( (temphe = hv_iternext_flags(hv,0)) ) {
7916 SV* sv_dat = HeVAL(temphe);
7917 I32 *nums = (I32*)SvPVX(sv_dat);
7918 for ( i = 0; i < SvIVX(sv_dat); i++ ) {
7919 if ((I32)(rx->lastparen) >= nums[i] &&
7920 rx->offs[nums[i]].start != -1 &&
7921 rx->offs[nums[i]].end != -1)
7927 if (parno || flags & RXapif_ALL) {
7928 return newSVhek(HeKEY_hek(temphe));
7936 Perl_reg_named_buff_scalar(pTHX_ REGEXP * const r, const U32 flags)
7941 struct regexp *const rx = ReANY(r);
7943 PERL_ARGS_ASSERT_REG_NAMED_BUFF_SCALAR;
7945 if (rx && RXp_PAREN_NAMES(rx)) {
7946 if (flags & (RXapif_ALL | RXapif_REGNAMES_COUNT)) {
7947 return newSViv(HvTOTALKEYS(RXp_PAREN_NAMES(rx)));
7948 } else if (flags & RXapif_ONE) {
7949 ret = CALLREG_NAMED_BUFF_ALL(r, (flags | RXapif_REGNAMES));
7950 av = MUTABLE_AV(SvRV(ret));
7951 length = av_tindex(av);
7952 SvREFCNT_dec_NN(ret);
7953 return newSViv(length + 1);
7955 Perl_croak(aTHX_ "panic: Unknown flags %d in named_buff_scalar",
7960 return &PL_sv_undef;
7964 Perl_reg_named_buff_all(pTHX_ REGEXP * const r, const U32 flags)
7966 struct regexp *const rx = ReANY(r);
7969 PERL_ARGS_ASSERT_REG_NAMED_BUFF_ALL;
7971 if (rx && RXp_PAREN_NAMES(rx)) {
7972 HV *hv= RXp_PAREN_NAMES(rx);
7974 (void)hv_iterinit(hv);
7975 while ( (temphe = hv_iternext_flags(hv,0)) ) {
7978 SV* sv_dat = HeVAL(temphe);
7979 I32 *nums = (I32*)SvPVX(sv_dat);
7980 for ( i = 0; i < SvIVX(sv_dat); i++ ) {
7981 if ((I32)(rx->lastparen) >= nums[i] &&
7982 rx->offs[nums[i]].start != -1 &&
7983 rx->offs[nums[i]].end != -1)
7989 if (parno || flags & RXapif_ALL) {
7990 av_push(av, newSVhek(HeKEY_hek(temphe)));
7995 return newRV_noinc(MUTABLE_SV(av));
7999 Perl_reg_numbered_buff_fetch(pTHX_ REGEXP * const r, const I32 paren,
8002 struct regexp *const rx = ReANY(r);
8008 PERL_ARGS_ASSERT_REG_NUMBERED_BUFF_FETCH;
8010 if ( n == RX_BUFF_IDX_CARET_PREMATCH
8011 || n == RX_BUFF_IDX_CARET_FULLMATCH
8012 || n == RX_BUFF_IDX_CARET_POSTMATCH
8015 bool keepcopy = cBOOL(rx->extflags & RXf_PMf_KEEPCOPY);
8017 /* on something like
8020 * the KEEPCOPY is set on the PMOP rather than the regex */
8021 if (PL_curpm && r == PM_GETRE(PL_curpm))
8022 keepcopy = cBOOL(PL_curpm->op_pmflags & PMf_KEEPCOPY);
8031 if (n == RX_BUFF_IDX_CARET_FULLMATCH)
8032 /* no need to distinguish between them any more */
8033 n = RX_BUFF_IDX_FULLMATCH;
8035 if ((n == RX_BUFF_IDX_PREMATCH || n == RX_BUFF_IDX_CARET_PREMATCH)
8036 && rx->offs[0].start != -1)
8038 /* $`, ${^PREMATCH} */
8039 i = rx->offs[0].start;
8043 if ((n == RX_BUFF_IDX_POSTMATCH || n == RX_BUFF_IDX_CARET_POSTMATCH)
8044 && rx->offs[0].end != -1)
8046 /* $', ${^POSTMATCH} */
8047 s = rx->subbeg - rx->suboffset + rx->offs[0].end;
8048 i = rx->sublen + rx->suboffset - rx->offs[0].end;
8051 if ( 0 <= n && n <= (I32)rx->nparens &&
8052 (s1 = rx->offs[n].start) != -1 &&
8053 (t1 = rx->offs[n].end) != -1)
8055 /* $&, ${^MATCH}, $1 ... */
8057 s = rx->subbeg + s1 - rx->suboffset;
8062 assert(s >= rx->subbeg);
8063 assert((STRLEN)rx->sublen >= (STRLEN)((s - rx->subbeg) + i) );
8065 #ifdef NO_TAINT_SUPPORT
8066 sv_setpvn(sv, s, i);
8068 const int oldtainted = TAINT_get;
8070 sv_setpvn(sv, s, i);
8071 TAINT_set(oldtainted);
8073 if (RXp_MATCH_UTF8(rx))
8078 if (RXp_MATCH_TAINTED(rx)) {
8079 if (SvTYPE(sv) >= SVt_PVMG) {
8080 MAGIC* const mg = SvMAGIC(sv);
8083 SvMAGIC_set(sv, mg->mg_moremagic);
8085 if ((mgt = SvMAGIC(sv))) {
8086 mg->mg_moremagic = mgt;
8087 SvMAGIC_set(sv, mg);
8098 sv_setsv(sv,&PL_sv_undef);
8104 Perl_reg_numbered_buff_store(pTHX_ REGEXP * const rx, const I32 paren,
8105 SV const * const value)
8107 PERL_ARGS_ASSERT_REG_NUMBERED_BUFF_STORE;
8109 PERL_UNUSED_ARG(rx);
8110 PERL_UNUSED_ARG(paren);
8111 PERL_UNUSED_ARG(value);
8114 Perl_croak_no_modify();
8118 Perl_reg_numbered_buff_length(pTHX_ REGEXP * const r, const SV * const sv,
8121 struct regexp *const rx = ReANY(r);
8125 PERL_ARGS_ASSERT_REG_NUMBERED_BUFF_LENGTH;
8127 if ( paren == RX_BUFF_IDX_CARET_PREMATCH
8128 || paren == RX_BUFF_IDX_CARET_FULLMATCH
8129 || paren == RX_BUFF_IDX_CARET_POSTMATCH
8132 bool keepcopy = cBOOL(rx->extflags & RXf_PMf_KEEPCOPY);
8134 /* on something like
8137 * the KEEPCOPY is set on the PMOP rather than the regex */
8138 if (PL_curpm && r == PM_GETRE(PL_curpm))
8139 keepcopy = cBOOL(PL_curpm->op_pmflags & PMf_KEEPCOPY);
8145 /* Some of this code was originally in C<Perl_magic_len> in F<mg.c> */
8147 case RX_BUFF_IDX_CARET_PREMATCH: /* ${^PREMATCH} */
8148 case RX_BUFF_IDX_PREMATCH: /* $` */
8149 if (rx->offs[0].start != -1) {
8150 i = rx->offs[0].start;
8159 case RX_BUFF_IDX_CARET_POSTMATCH: /* ${^POSTMATCH} */
8160 case RX_BUFF_IDX_POSTMATCH: /* $' */
8161 if (rx->offs[0].end != -1) {
8162 i = rx->sublen - rx->offs[0].end;
8164 s1 = rx->offs[0].end;
8171 default: /* $& / ${^MATCH}, $1, $2, ... */
8172 if (paren <= (I32)rx->nparens &&
8173 (s1 = rx->offs[paren].start) != -1 &&
8174 (t1 = rx->offs[paren].end) != -1)
8180 if (ckWARN(WARN_UNINITIALIZED))
8181 report_uninit((const SV *)sv);
8186 if (i > 0 && RXp_MATCH_UTF8(rx)) {
8187 const char * const s = rx->subbeg - rx->suboffset + s1;
8192 if (is_utf8_string_loclen((U8*)s, i, &ep, &el))
8199 Perl_reg_qr_package(pTHX_ REGEXP * const rx)
8201 PERL_ARGS_ASSERT_REG_QR_PACKAGE;
8202 PERL_UNUSED_ARG(rx);
8206 return newSVpvs("Regexp");
8209 /* Scans the name of a named buffer from the pattern.
8210 * If flags is REG_RSN_RETURN_NULL returns null.
8211 * If flags is REG_RSN_RETURN_NAME returns an SV* containing the name
8212 * If flags is REG_RSN_RETURN_DATA returns the data SV* corresponding
8213 * to the parsed name as looked up in the RExC_paren_names hash.
8214 * If there is an error throws a vFAIL().. type exception.
8217 #define REG_RSN_RETURN_NULL 0
8218 #define REG_RSN_RETURN_NAME 1
8219 #define REG_RSN_RETURN_DATA 2
8222 S_reg_scan_name(pTHX_ RExC_state_t *pRExC_state, U32 flags)
8224 char *name_start = RExC_parse;
8226 PERL_ARGS_ASSERT_REG_SCAN_NAME;
8228 assert (RExC_parse <= RExC_end);
8229 if (RExC_parse == RExC_end) NOOP;
8230 else if (isIDFIRST_lazy_if(RExC_parse, UTF)) {
8231 /* Note that the code here assumes well-formed UTF-8. Skip IDFIRST by
8232 * using do...while */
8235 RExC_parse += UTF8SKIP(RExC_parse);
8236 } while (isWORDCHAR_utf8((U8*)RExC_parse));
8240 } while (isWORDCHAR(*RExC_parse));
8242 RExC_parse++; /* so the <- from the vFAIL is after the offending
8244 vFAIL("Group name must start with a non-digit word character");
8248 = newSVpvn_flags(name_start, (int)(RExC_parse - name_start),
8249 SVs_TEMP | (UTF ? SVf_UTF8 : 0));
8250 if ( flags == REG_RSN_RETURN_NAME)
8252 else if (flags==REG_RSN_RETURN_DATA) {
8255 if ( ! sv_name ) /* should not happen*/
8256 Perl_croak(aTHX_ "panic: no svname in reg_scan_name");
8257 if (RExC_paren_names)
8258 he_str = hv_fetch_ent( RExC_paren_names, sv_name, 0, 0 );
8260 sv_dat = HeVAL(he_str);
8262 vFAIL("Reference to nonexistent named group");
8266 Perl_croak(aTHX_ "panic: bad flag %lx in reg_scan_name",
8267 (unsigned long) flags);
8269 NOT_REACHED; /* NOTREACHED */
8274 #define DEBUG_PARSE_MSG(funcname) DEBUG_PARSE_r({ \
8276 if (RExC_lastparse!=RExC_parse) { \
8277 Perl_re_printf( aTHX_ "%s", \
8278 Perl_pv_pretty(aTHX_ RExC_mysv1, RExC_parse, \
8279 RExC_end - RExC_parse, 16, \
8281 PERL_PV_ESCAPE_UNI_DETECT | \
8282 PERL_PV_PRETTY_ELLIPSES | \
8283 PERL_PV_PRETTY_LTGT | \
8284 PERL_PV_ESCAPE_RE | \
8285 PERL_PV_PRETTY_EXACTSIZE \
8289 Perl_re_printf( aTHX_ "%16s",""); \
8292 num = RExC_size + 1; \
8294 num=REG_NODE_NUM(RExC_emit); \
8295 if (RExC_lastnum!=num) \
8296 Perl_re_printf( aTHX_ "|%4d",num); \
8298 Perl_re_printf( aTHX_ "|%4s",""); \
8299 Perl_re_printf( aTHX_ "|%*s%-4s", \
8300 (int)((depth*2)), "", \
8304 RExC_lastparse=RExC_parse; \
8309 #define DEBUG_PARSE(funcname) DEBUG_PARSE_r({ \
8310 DEBUG_PARSE_MSG((funcname)); \
8311 Perl_re_printf( aTHX_ "%4s","\n"); \
8313 #define DEBUG_PARSE_FMT(funcname,fmt,args) DEBUG_PARSE_r({\
8314 DEBUG_PARSE_MSG((funcname)); \
8315 Perl_re_printf( aTHX_ fmt "\n",args); \
8318 /* This section of code defines the inversion list object and its methods. The
8319 * interfaces are highly subject to change, so as much as possible is static to
8320 * this file. An inversion list is here implemented as a malloc'd C UV array
8321 * as an SVt_INVLIST scalar.
8323 * An inversion list for Unicode is an array of code points, sorted by ordinal
8324 * number. The zeroth element is the first code point in the list. The 1th
8325 * element is the first element beyond that not in the list. In other words,
8326 * the first range is
8327 * invlist[0]..(invlist[1]-1)
8328 * The other ranges follow. Thus every element whose index is divisible by two
8329 * marks the beginning of a range that is in the list, and every element not
8330 * divisible by two marks the beginning of a range not in the list. A single
8331 * element inversion list that contains the single code point N generally
8332 * consists of two elements
8335 * (The exception is when N is the highest representable value on the
8336 * machine, in which case the list containing just it would be a single
8337 * element, itself. By extension, if the last range in the list extends to
8338 * infinity, then the first element of that range will be in the inversion list
8339 * at a position that is divisible by two, and is the final element in the
8341 * Taking the complement (inverting) an inversion list is quite simple, if the
8342 * first element is 0, remove it; otherwise add a 0 element at the beginning.
8343 * This implementation reserves an element at the beginning of each inversion
8344 * list to always contain 0; there is an additional flag in the header which
8345 * indicates if the list begins at the 0, or is offset to begin at the next
8348 * More about inversion lists can be found in "Unicode Demystified"
8349 * Chapter 13 by Richard Gillam, published by Addison-Wesley.
8350 * More will be coming when functionality is added later.
8352 * The inversion list data structure is currently implemented as an SV pointing
8353 * to an array of UVs that the SV thinks are bytes. This allows us to have an
8354 * array of UV whose memory management is automatically handled by the existing
8355 * facilities for SV's.
8357 * Some of the methods should always be private to the implementation, and some
8358 * should eventually be made public */
8360 /* The header definitions are in F<invlist_inline.h> */
8362 PERL_STATIC_INLINE UV*
8363 S__invlist_array_init(SV* const invlist, const bool will_have_0)
8365 /* Returns a pointer to the first element in the inversion list's array.
8366 * This is called upon initialization of an inversion list. Where the
8367 * array begins depends on whether the list has the code point U+0000 in it
8368 * or not. The other parameter tells it whether the code that follows this
8369 * call is about to put a 0 in the inversion list or not. The first
8370 * element is either the element reserved for 0, if TRUE, or the element
8371 * after it, if FALSE */
8373 bool* offset = get_invlist_offset_addr(invlist);
8374 UV* zero_addr = (UV *) SvPVX(invlist);
8376 PERL_ARGS_ASSERT__INVLIST_ARRAY_INIT;
8379 assert(! _invlist_len(invlist));
8383 /* 1^1 = 0; 1^0 = 1 */
8384 *offset = 1 ^ will_have_0;
8385 return zero_addr + *offset;
8388 PERL_STATIC_INLINE void
8389 S_invlist_set_len(pTHX_ SV* const invlist, const UV len, const bool offset)
8391 /* Sets the current number of elements stored in the inversion list.
8392 * Updates SvCUR correspondingly */
8393 PERL_UNUSED_CONTEXT;
8394 PERL_ARGS_ASSERT_INVLIST_SET_LEN;
8396 assert(SvTYPE(invlist) == SVt_INVLIST);
8401 : TO_INTERNAL_SIZE(len + offset));
8402 assert(SvLEN(invlist) == 0 || SvCUR(invlist) <= SvLEN(invlist));
8405 #ifndef PERL_IN_XSUB_RE
8408 S_invlist_replace_list_destroys_src(pTHX_ SV * dest, SV * src)
8410 /* Replaces the inversion list in 'src' with the one in 'dest'. It steals
8411 * the list from 'src', so 'src' is made to have a NULL list. This is
8412 * similar to what SvSetMagicSV() would do, if it were implemented on
8413 * inversion lists, though this routine avoids a copy */
8415 const UV src_len = _invlist_len(src);
8416 const bool src_offset = *get_invlist_offset_addr(src);
8417 const STRLEN src_byte_len = SvLEN(src);
8418 char * array = SvPVX(src);
8420 const int oldtainted = TAINT_get;
8422 PERL_ARGS_ASSERT_INVLIST_REPLACE_LIST_DESTROYS_SRC;
8424 assert(SvTYPE(src) == SVt_INVLIST);
8425 assert(SvTYPE(dest) == SVt_INVLIST);
8426 assert(! invlist_is_iterating(src));
8427 assert(SvCUR(src) == 0 || SvCUR(src) < SvLEN(src));
8429 /* Make sure it ends in the right place with a NUL, as our inversion list
8430 * manipulations aren't careful to keep this true, but sv_usepvn_flags()
8432 array[src_byte_len - 1] = '\0';
8434 TAINT_NOT; /* Otherwise it breaks */
8435 sv_usepvn_flags(dest,
8439 /* This flag is documented to cause a copy to be avoided */
8440 SV_HAS_TRAILING_NUL);
8441 TAINT_set(oldtainted);
8446 /* Finish up copying over the other fields in an inversion list */
8447 *get_invlist_offset_addr(dest) = src_offset;
8448 invlist_set_len(dest, src_len, src_offset);
8449 *get_invlist_previous_index_addr(dest) = 0;
8450 invlist_iterfinish(dest);
8453 PERL_STATIC_INLINE IV*
8454 S_get_invlist_previous_index_addr(SV* invlist)
8456 /* Return the address of the IV that is reserved to hold the cached index
8458 PERL_ARGS_ASSERT_GET_INVLIST_PREVIOUS_INDEX_ADDR;
8460 assert(SvTYPE(invlist) == SVt_INVLIST);
8462 return &(((XINVLIST*) SvANY(invlist))->prev_index);
8465 PERL_STATIC_INLINE IV
8466 S_invlist_previous_index(SV* const invlist)
8468 /* Returns cached index of previous search */
8470 PERL_ARGS_ASSERT_INVLIST_PREVIOUS_INDEX;
8472 return *get_invlist_previous_index_addr(invlist);
8475 PERL_STATIC_INLINE void
8476 S_invlist_set_previous_index(SV* const invlist, const IV index)
8478 /* Caches <index> for later retrieval */
8480 PERL_ARGS_ASSERT_INVLIST_SET_PREVIOUS_INDEX;
8482 assert(index == 0 || index < (int) _invlist_len(invlist));
8484 *get_invlist_previous_index_addr(invlist) = index;
8487 PERL_STATIC_INLINE void
8488 S_invlist_trim(SV* invlist)
8490 /* Free the not currently-being-used space in an inversion list */
8492 /* But don't free up the space needed for the 0 UV that is always at the
8493 * beginning of the list, nor the trailing NUL */
8494 const UV min_size = TO_INTERNAL_SIZE(1) + 1;
8496 PERL_ARGS_ASSERT_INVLIST_TRIM;
8498 assert(SvTYPE(invlist) == SVt_INVLIST);
8500 SvPV_renew(invlist, MAX(min_size, SvCUR(invlist) + 1));
8503 PERL_STATIC_INLINE void
8504 S_invlist_clear(pTHX_ SV* invlist) /* Empty the inversion list */
8506 PERL_ARGS_ASSERT_INVLIST_CLEAR;
8508 assert(SvTYPE(invlist) == SVt_INVLIST);
8510 invlist_set_len(invlist, 0, 0);
8511 invlist_trim(invlist);
8514 #endif /* ifndef PERL_IN_XSUB_RE */
8516 PERL_STATIC_INLINE bool
8517 S_invlist_is_iterating(SV* const invlist)
8519 PERL_ARGS_ASSERT_INVLIST_IS_ITERATING;
8521 return *(get_invlist_iter_addr(invlist)) < (STRLEN) UV_MAX;
8524 PERL_STATIC_INLINE UV
8525 S_invlist_max(SV* const invlist)
8527 /* Returns the maximum number of elements storable in the inversion list's
8528 * array, without having to realloc() */
8530 PERL_ARGS_ASSERT_INVLIST_MAX;
8532 assert(SvTYPE(invlist) == SVt_INVLIST);
8534 /* Assumes worst case, in which the 0 element is not counted in the
8535 * inversion list, so subtracts 1 for that */
8536 return SvLEN(invlist) == 0 /* This happens under _new_invlist_C_array */
8537 ? FROM_INTERNAL_SIZE(SvCUR(invlist)) - 1
8538 : FROM_INTERNAL_SIZE(SvLEN(invlist)) - 1;
8541 #ifndef PERL_IN_XSUB_RE
8543 Perl__new_invlist(pTHX_ IV initial_size)
8546 /* Return a pointer to a newly constructed inversion list, with enough
8547 * space to store 'initial_size' elements. If that number is negative, a
8548 * system default is used instead */
8552 if (initial_size < 0) {
8556 /* Allocate the initial space */
8557 new_list = newSV_type(SVt_INVLIST);
8559 /* First 1 is in case the zero element isn't in the list; second 1 is for
8561 SvGROW(new_list, TO_INTERNAL_SIZE(initial_size + 1) + 1);
8562 invlist_set_len(new_list, 0, 0);
8564 /* Force iterinit() to be used to get iteration to work */
8565 *get_invlist_iter_addr(new_list) = (STRLEN) UV_MAX;
8567 *get_invlist_previous_index_addr(new_list) = 0;
8573 Perl__new_invlist_C_array(pTHX_ const UV* const list)
8575 /* Return a pointer to a newly constructed inversion list, initialized to
8576 * point to <list>, which has to be in the exact correct inversion list
8577 * form, including internal fields. Thus this is a dangerous routine that
8578 * should not be used in the wrong hands. The passed in 'list' contains
8579 * several header fields at the beginning that are not part of the
8580 * inversion list body proper */
8582 const STRLEN length = (STRLEN) list[0];
8583 const UV version_id = list[1];
8584 const bool offset = cBOOL(list[2]);
8585 #define HEADER_LENGTH 3
8586 /* If any of the above changes in any way, you must change HEADER_LENGTH
8587 * (if appropriate) and regenerate INVLIST_VERSION_ID by running
8588 * perl -E 'say int(rand 2**31-1)'
8590 #define INVLIST_VERSION_ID 148565664 /* This is a combination of a version and
8591 data structure type, so that one being
8592 passed in can be validated to be an
8593 inversion list of the correct vintage.
8596 SV* invlist = newSV_type(SVt_INVLIST);
8598 PERL_ARGS_ASSERT__NEW_INVLIST_C_ARRAY;
8600 if (version_id != INVLIST_VERSION_ID) {
8601 Perl_croak(aTHX_ "panic: Incorrect version for previously generated inversion list");
8604 /* The generated array passed in includes header elements that aren't part
8605 * of the list proper, so start it just after them */
8606 SvPV_set(invlist, (char *) (list + HEADER_LENGTH));
8608 SvLEN_set(invlist, 0); /* Means we own the contents, and the system
8609 shouldn't touch it */
8611 *(get_invlist_offset_addr(invlist)) = offset;
8613 /* The 'length' passed to us is the physical number of elements in the
8614 * inversion list. But if there is an offset the logical number is one
8616 invlist_set_len(invlist, length - offset, offset);
8618 invlist_set_previous_index(invlist, 0);
8620 /* Initialize the iteration pointer. */
8621 invlist_iterfinish(invlist);
8623 SvREADONLY_on(invlist);
8627 #endif /* ifndef PERL_IN_XSUB_RE */
8630 S_invlist_extend(pTHX_ SV* const invlist, const UV new_max)
8632 /* Grow the maximum size of an inversion list */
8634 PERL_ARGS_ASSERT_INVLIST_EXTEND;
8636 assert(SvTYPE(invlist) == SVt_INVLIST);
8638 /* Add one to account for the zero element at the beginning which may not
8639 * be counted by the calling parameters */
8640 SvGROW((SV *)invlist, TO_INTERNAL_SIZE(new_max + 1));
8644 S__append_range_to_invlist(pTHX_ SV* const invlist,
8645 const UV start, const UV end)
8647 /* Subject to change or removal. Append the range from 'start' to 'end' at
8648 * the end of the inversion list. The range must be above any existing
8652 UV max = invlist_max(invlist);
8653 UV len = _invlist_len(invlist);
8656 PERL_ARGS_ASSERT__APPEND_RANGE_TO_INVLIST;
8658 if (len == 0) { /* Empty lists must be initialized */
8659 offset = start != 0;
8660 array = _invlist_array_init(invlist, ! offset);
8663 /* Here, the existing list is non-empty. The current max entry in the
8664 * list is generally the first value not in the set, except when the
8665 * set extends to the end of permissible values, in which case it is
8666 * the first entry in that final set, and so this call is an attempt to
8667 * append out-of-order */
8669 UV final_element = len - 1;
8670 array = invlist_array(invlist);
8671 if (array[final_element] > start
8672 || ELEMENT_RANGE_MATCHES_INVLIST(final_element))
8674 Perl_croak(aTHX_ "panic: attempting to append to an inversion list, but wasn't at the end of the list, final=%"UVuf", start=%"UVuf", match=%c",
8675 array[final_element], start,
8676 ELEMENT_RANGE_MATCHES_INVLIST(final_element) ? 't' : 'f');
8679 /* Here, it is a legal append. If the new range begins with the first
8680 * value not in the set, it is extending the set, so the new first
8681 * value not in the set is one greater than the newly extended range.
8683 offset = *get_invlist_offset_addr(invlist);
8684 if (array[final_element] == start) {
8685 if (end != UV_MAX) {
8686 array[final_element] = end + 1;
8689 /* But if the end is the maximum representable on the machine,
8690 * just let the range that this would extend to have no end */
8691 invlist_set_len(invlist, len - 1, offset);
8697 /* Here the new range doesn't extend any existing set. Add it */
8699 len += 2; /* Includes an element each for the start and end of range */
8701 /* If wll overflow the existing space, extend, which may cause the array to
8704 invlist_extend(invlist, len);
8706 /* Have to set len here to avoid assert failure in invlist_array() */
8707 invlist_set_len(invlist, len, offset);
8709 array = invlist_array(invlist);
8712 invlist_set_len(invlist, len, offset);
8715 /* The next item on the list starts the range, the one after that is
8716 * one past the new range. */
8717 array[len - 2] = start;
8718 if (end != UV_MAX) {
8719 array[len - 1] = end + 1;
8722 /* But if the end is the maximum representable on the machine, just let
8723 * the range have no end */
8724 invlist_set_len(invlist, len - 1, offset);
8728 #ifndef PERL_IN_XSUB_RE
8731 Perl__invlist_search(SV* const invlist, const UV cp)
8733 /* Searches the inversion list for the entry that contains the input code
8734 * point <cp>. If <cp> is not in the list, -1 is returned. Otherwise, the
8735 * return value is the index into the list's array of the range that
8736 * contains <cp>, that is, 'i' such that
8737 * array[i] <= cp < array[i+1]
8742 IV high = _invlist_len(invlist);
8743 const IV highest_element = high - 1;
8746 PERL_ARGS_ASSERT__INVLIST_SEARCH;
8748 /* If list is empty, return failure. */
8753 /* (We can't get the array unless we know the list is non-empty) */
8754 array = invlist_array(invlist);
8756 mid = invlist_previous_index(invlist);
8758 if (mid > highest_element) {
8759 mid = highest_element;
8762 /* <mid> contains the cache of the result of the previous call to this
8763 * function (0 the first time). See if this call is for the same result,
8764 * or if it is for mid-1. This is under the theory that calls to this
8765 * function will often be for related code points that are near each other.
8766 * And benchmarks show that caching gives better results. We also test
8767 * here if the code point is within the bounds of the list. These tests
8768 * replace others that would have had to be made anyway to make sure that
8769 * the array bounds were not exceeded, and these give us extra information
8770 * at the same time */
8771 if (cp >= array[mid]) {
8772 if (cp >= array[highest_element]) {
8773 return highest_element;
8776 /* Here, array[mid] <= cp < array[highest_element]. This means that
8777 * the final element is not the answer, so can exclude it; it also
8778 * means that <mid> is not the final element, so can refer to 'mid + 1'
8780 if (cp < array[mid + 1]) {
8786 else { /* cp < aray[mid] */
8787 if (cp < array[0]) { /* Fail if outside the array */
8791 if (cp >= array[mid - 1]) {
8796 /* Binary search. What we are looking for is <i> such that
8797 * array[i] <= cp < array[i+1]
8798 * The loop below converges on the i+1. Note that there may not be an
8799 * (i+1)th element in the array, and things work nonetheless */
8800 while (low < high) {
8801 mid = (low + high) / 2;
8802 assert(mid <= highest_element);
8803 if (array[mid] <= cp) { /* cp >= array[mid] */
8806 /* We could do this extra test to exit the loop early.
8807 if (cp < array[low]) {
8812 else { /* cp < array[mid] */
8819 invlist_set_previous_index(invlist, high);
8824 Perl__invlist_populate_swatch(SV* const invlist,
8825 const UV start, const UV end, U8* swatch)
8827 /* populates a swatch of a swash the same way swatch_get() does in utf8.c,
8828 * but is used when the swash has an inversion list. This makes this much
8829 * faster, as it uses a binary search instead of a linear one. This is
8830 * intimately tied to that function, and perhaps should be in utf8.c,
8831 * except it is intimately tied to inversion lists as well. It assumes
8832 * that <swatch> is all 0's on input */
8835 const IV len = _invlist_len(invlist);
8839 PERL_ARGS_ASSERT__INVLIST_POPULATE_SWATCH;
8841 if (len == 0) { /* Empty inversion list */
8845 array = invlist_array(invlist);
8847 /* Find which element it is */
8848 i = _invlist_search(invlist, start);
8850 /* We populate from <start> to <end> */
8851 while (current < end) {
8854 /* The inversion list gives the results for every possible code point
8855 * after the first one in the list. Only those ranges whose index is
8856 * even are ones that the inversion list matches. For the odd ones,
8857 * and if the initial code point is not in the list, we have to skip
8858 * forward to the next element */
8859 if (i == -1 || ! ELEMENT_RANGE_MATCHES_INVLIST(i)) {
8861 if (i >= len) { /* Finished if beyond the end of the array */
8865 if (current >= end) { /* Finished if beyond the end of what we
8867 if (LIKELY(end < UV_MAX)) {
8871 /* We get here when the upper bound is the maximum
8872 * representable on the machine, and we are looking for just
8873 * that code point. Have to special case it */
8875 goto join_end_of_list;
8878 assert(current >= start);
8880 /* The current range ends one below the next one, except don't go past
8883 upper = (i < len && array[i] < end) ? array[i] : end;
8885 /* Here we are in a range that matches. Populate a bit in the 3-bit U8
8886 * for each code point in it */
8887 for (; current < upper; current++) {
8888 const STRLEN offset = (STRLEN)(current - start);
8889 swatch[offset >> 3] |= 1 << (offset & 7);
8894 /* Quit if at the end of the list */
8897 /* But first, have to deal with the highest possible code point on
8898 * the platform. The previous code assumes that <end> is one
8899 * beyond where we want to populate, but that is impossible at the
8900 * platform's infinity, so have to handle it specially */
8901 if (UNLIKELY(end == UV_MAX && ELEMENT_RANGE_MATCHES_INVLIST(len-1)))
8903 const STRLEN offset = (STRLEN)(end - start);
8904 swatch[offset >> 3] |= 1 << (offset & 7);
8909 /* Advance to the next range, which will be for code points not in the
8918 Perl__invlist_union_maybe_complement_2nd(pTHX_ SV* const a, SV* const b,
8919 const bool complement_b, SV** output)
8921 /* Take the union of two inversion lists and point <output> to it. *output
8922 * SHOULD BE DEFINED upon input, and if it points to one of the two lists,
8923 * the reference count to that list will be decremented if not already a
8924 * temporary (mortal); otherwise just its contents will be modified to be
8925 * the union. The first list, <a>, may be NULL, in which case a copy of
8926 * the second list is returned. If <complement_b> is TRUE, the union is
8927 * taken of the complement (inversion) of <b> instead of b itself.
8929 * The basis for this comes from "Unicode Demystified" Chapter 13 by
8930 * Richard Gillam, published by Addison-Wesley, and explained at some
8931 * length there. The preface says to incorporate its examples into your
8932 * code at your own risk.
8934 * The algorithm is like a merge sort. */
8936 const UV* array_a; /* a's array */
8938 UV len_a; /* length of a's array */
8941 SV* u; /* the resulting union */
8945 UV i_a = 0; /* current index into a's array */
8949 bool has_something_from_a = FALSE;
8950 bool has_something_from_b = FALSE;
8953 /* running count, as explained in the algorithm source book; items are
8954 * stopped accumulating and are output when the count changes to/from 0.
8955 * The count is incremented when we start a range that's in the set, and
8956 * decremented when we start a range that's not in the set. So its range
8957 * is 0 to 2. Only when the count is zero is something not in the set.
8961 PERL_ARGS_ASSERT__INVLIST_UNION_MAYBE_COMPLEMENT_2ND;
8964 len_b = _invlist_len(b);
8967 /* Here, 'b' is empty. If the output is the complement of 'b', the
8968 * union is all possible code points, and we need not even look at 'a'.
8969 * It's easiest to create a new inversion list that matches everything.
8972 SV* everything = _new_invlist(1);
8973 _append_range_to_invlist(everything, 0, UV_MAX);
8975 /* If the output didn't exist, just point it at the new list */
8976 if (*output == NULL) {
8977 *output = everything;
8981 /* Otherwise, replace its contents with the new list */
8982 invlist_replace_list_destroys_src(*output, everything);
8983 SvREFCNT_dec_NN(everything);
8987 /* Here, we don't want the complement of 'b', and since it is empty,
8988 * the union will come entirely from 'a'. If 'a' is NULL or empty, the
8989 * output will be empty */
8992 *output = _new_invlist(0);
8996 if (_invlist_len(a) == 0) {
8997 invlist_clear(*output);
9001 /* Here, 'a' is not empty, and entirely determines the union. If the
9002 * output is not to overwrite 'b', we can just return 'a'. */
9005 /* If the output is to overwrite 'a', we have a no-op, as it's
9011 /* But otherwise we have to copy 'a' to the output */
9012 *output = invlist_clone(a);
9016 /* Here, 'b' is to be overwritten by the output, which will be 'a' */
9017 u = invlist_clone(a);
9018 invlist_replace_list_destroys_src(*output, u);
9024 if (a == NULL || ((len_a = _invlist_len(a)) == 0)) {
9026 /* Here, 'a' is empty (and b is not). That means the union will come
9027 * entirely from 'b'. If the output is not to overwrite 'a', we can
9028 * just return what's in 'b'. */
9031 /* If the output is to overwrite 'b', it's already in 'b', but
9032 * otherwise we have to copy 'b' to the output */
9034 *output = invlist_clone(b);
9037 /* And if the output is to be the inversion of 'b', do that */
9039 _invlist_invert(*output);
9045 /* Here, 'a', which is empty or even NULL, is to be overwritten by the
9046 * output, which will either be 'b' or the complement of 'b' */
9049 *output = invlist_clone(b);
9052 u = invlist_clone(b);
9053 invlist_replace_list_destroys_src(*output, u);
9058 _invlist_invert(*output);
9064 /* Here both lists exist and are non-empty */
9065 array_a = invlist_array(a);
9066 array_b = invlist_array(b);
9068 /* If are to take the union of 'a' with the complement of b, set it
9069 * up so are looking at b's complement. */
9072 /* To complement, we invert: if the first element is 0, remove it. To
9073 * do this, we just pretend the array starts one later */
9074 if (array_b[0] == 0) {
9080 /* But if the first element is not zero, we pretend the list starts
9081 * at the 0 that is always stored immediately before the array. */
9087 /* Size the union for the worst case: that the sets are completely
9089 u = _new_invlist(len_a + len_b);
9091 /* Will contain U+0000 if either component does */
9092 array_u = _invlist_array_init(u, (len_a > 0 && array_a[0] == 0)
9093 || (len_b > 0 && array_b[0] == 0));
9095 /* Go through each list item by item, stopping when exhausted one of
9097 while (i_a < len_a && i_b < len_b) {
9098 UV cp; /* The element to potentially add to the union's array */
9099 bool cp_in_set; /* is it in the the input list's set or not */
9101 /* We need to take one or the other of the two inputs for the union.
9102 * Since we are merging two sorted lists, we take the smaller of the
9103 * next items. In case of a tie, we take the one that is in its set
9104 * first. If we took one not in the set first, it would decrement the
9105 * count, possibly to 0 which would cause it to be output as ending the
9106 * range, and the next time through we would take the same number, and
9107 * output it again as beginning the next range. By doing it the
9108 * opposite way, there is no possibility that the count will be
9109 * momentarily decremented to 0, and thus the two adjoining ranges will
9110 * be seamlessly merged. (In a tie and both are in the set or both not
9111 * in the set, it doesn't matter which we take first.) */
9112 if (array_a[i_a] < array_b[i_b]
9113 || (array_a[i_a] == array_b[i_b]
9114 && ELEMENT_RANGE_MATCHES_INVLIST(i_a)))
9116 cp_in_set = ELEMENT_RANGE_MATCHES_INVLIST(i_a);
9118 has_something_from_a = TRUE;
9121 cp_in_set = ELEMENT_RANGE_MATCHES_INVLIST(i_b);
9122 cp = array_b[i_b++];
9123 has_something_from_b = TRUE;
9126 /* Here, have chosen which of the two inputs to look at. Only output
9127 * if the running count changes to/from 0, which marks the
9128 * beginning/end of a range that's in the set */
9131 array_u[i_u++] = cp;
9138 array_u[i_u++] = cp;
9143 /* Here, we are finished going through at least one of the lists, which
9144 * means there is something remaining in at most one. We check if the list
9145 * that hasn't been exhausted is positioned such that we are in the middle
9146 * of a range in its set or not. (i_a and i_b point to the element beyond
9147 * the one we care about.) If in the set, we decrement 'count'; if 0, there
9148 * is potentially more to output.
9149 * There are four cases:
9150 * 1) Both weren't in their sets, count is 0, and remains 0. What's left
9151 * in the union is entirely from the non-exhausted set.
9152 * 2) Both were in their sets, count is 2. Nothing further should
9153 * be output, as everything that remains will be in the exhausted
9154 * list's set, hence in the union; decrementing to 1 but not 0 insures
9156 * 3) the exhausted was in its set, non-exhausted isn't, count is 1.
9157 * Nothing further should be output because the union includes
9158 * everything from the exhausted set. Not decrementing ensures that.
9159 * 4) the exhausted wasn't in its set, non-exhausted is, count is 1;
9160 * decrementing to 0 insures that we look at the remainder of the
9161 * non-exhausted set */
9162 if ( (i_a != len_a && PREV_RANGE_MATCHES_INVLIST(i_a))
9163 || (i_b != len_b && PREV_RANGE_MATCHES_INVLIST(i_b)))
9168 /* The final length is what we've output so far, plus what else is about to
9169 * be output. (If 'count' is non-zero, then the input list we exhausted
9170 * has everything remaining up to the machine's limit in its set, and hence
9171 * in the union, so there will be no further output. */
9174 /* Here, there is nothing left to put in the union. If the union came
9175 * only from the input that it is to overwrite, this whole operation is
9177 if ( UNLIKELY(! has_something_from_b && *output == a)
9178 || UNLIKELY(! has_something_from_a && *output == b))
9187 /* When 'count' is 0, the list that was exhausted (if one was shorter
9188 * than the other) ended with everything above it not in its set. That
9189 * means that the remaining part of the union is precisely the same as
9190 * the non-exhausted list, so can just copy it unchanged. If only one
9191 * of the inputs contributes to the union, and the output is to
9192 * overwite that particular input, then this whole operation was a
9195 IV copy_count = len_a - i_a;
9196 if (copy_count > 0) {
9197 if (UNLIKELY(! has_something_from_b && *output == a)) {
9201 Copy(array_a + i_a, array_u + i_u, copy_count, UV);
9202 len_u = i_u + copy_count;
9204 else if ((copy_count = len_b - i_b) > 0) {
9205 if (UNLIKELY(! has_something_from_a && *output == b)) {
9209 Copy(array_b + i_b, array_u + i_u, copy_count, UV);
9210 len_u = i_u + copy_count;
9211 } else if ( UNLIKELY(! has_something_from_b && *output == a)
9212 || UNLIKELY(! has_something_from_a && *output == b))
9214 /* Here, both arrays are exhausted, so no need to do any additional
9215 * copying. Also here, the union came only from the input that it is
9216 * to overwrite, so this whole operation is a no-op */
9222 /* Set the result to the final length, which can change the pointer to
9223 * array_u, so re-find it. (Note that it is unlikely that this will
9224 * change, as we are shrinking the space, not enlarging it) */
9225 if (len_u != _invlist_len(u)) {
9226 invlist_set_len(u, len_u, *get_invlist_offset_addr(u));
9228 array_u = invlist_array(u);
9231 /* If the output is not to overwrite either of the inputs, just return the
9232 * calculated union */
9233 if (a != *output && b != *output) {
9237 /* Here, the output is to be the same as one of the input scalars,
9238 * hence replacing it. The simple thing to do is to free the input
9239 * scalar, making it instead be the output one. But experience has
9240 * shown [perl #127392] that if the input is a mortal, we can get a
9241 * huge build-up of these during regex compilation before they get
9242 * freed. So for that case, replace just the input's interior with
9243 * the output's, and then free the output */
9245 assert(! invlist_is_iterating(*output));
9247 if (! SvTEMP(*output)) {
9248 SvREFCNT_dec_NN(*output);
9252 invlist_replace_list_destroys_src(*output, u);
9261 Perl__invlist_intersection_maybe_complement_2nd(pTHX_ SV* const a, SV* const b,
9262 const bool complement_b, SV** i)
9264 /* Take the intersection of two inversion lists and point <i> to it. *i
9265 * SHOULD BE DEFINED upon input, and if it points to one of the two lists,
9266 * the reference count to that list will be decremented if not already a
9267 * temporary (mortal); otherwise just its contents will be modified to be
9268 * the intersection. The first list, <a>, may be NULL, in which case an
9269 * empty list is returned. If <complement_b> is TRUE, the result will be
9270 * the intersection of <a> and the complement (or inversion) of <b> instead
9273 * The basis for this comes from "Unicode Demystified" Chapter 13 by
9274 * Richard Gillam, published by Addison-Wesley, and explained at some
9275 * length there. The preface says to incorporate its examples into your
9276 * code at your own risk. In fact, it had bugs
9278 * The algorithm is like a merge sort, and is essentially the same as the
9282 const UV* array_a; /* a's array */
9284 UV len_a; /* length of a's array */
9287 SV* r; /* the resulting intersection */
9291 UV i_a = 0; /* current index into a's array */
9295 /* running count, as explained in the algorithm source book; items are
9296 * stopped accumulating and are output when the count changes to/from 2.
9297 * The count is incremented when we start a range that's in the set, and
9298 * decremented when we start a range that's not in the set. So its range
9299 * is 0 to 2. Only when the count is 2 is something in the intersection.
9303 bool has_something_from_a = FALSE;
9304 bool has_something_from_b = FALSE;
9306 PERL_ARGS_ASSERT__INVLIST_INTERSECTION_MAYBE_COMPLEMENT_2ND;
9309 /* Special case if either one is empty */
9310 len_a = (a == NULL) ? 0 : _invlist_len(a);
9311 if ((len_a == 0) || ((len_b = _invlist_len(b)) == 0)) {
9312 if (len_a != 0 && complement_b) {
9314 /* Here, 'a' is not empty, therefore from the enclosing 'if', 'b'
9315 * must be empty. Here, also we are using 'b's complement, which
9316 * hence must be every possible code point. Thus the intersection
9319 if (*i == a) { /* No-op */
9323 /* If not overwriting either input, just make a copy of 'a' */
9325 *i = invlist_clone(a);
9329 /* Here we are overwriting 'b' with 'a's contents */
9330 r = invlist_clone(a);
9331 invlist_replace_list_destroys_src(*i, r);
9336 /* Here, 'a' or 'b' is empty and not using the complement of 'b'. The
9337 * intersection must be empty */
9339 *i = _new_invlist(0);
9347 /* Here both lists exist and are non-empty */
9348 array_a = invlist_array(a);
9349 array_b = invlist_array(b);
9351 /* If are to take the intersection of 'a' with the complement of b, set it
9352 * up so are looking at b's complement. */
9355 /* To complement, we invert: if the first element is 0, remove it. To
9356 * do this, we just pretend the array starts one later */
9357 if (array_b[0] == 0) {
9363 /* But if the first element is not zero, we pretend the list starts
9364 * at the 0 that is always stored immediately before the array. */
9370 /* Size the intersection for the worst case: that the intersection ends up
9371 * fragmenting everything to be completely disjoint */
9372 r= _new_invlist(len_a + len_b);
9374 /* Will contain U+0000 iff both components do */
9375 array_r = _invlist_array_init(r, len_a > 0 && array_a[0] == 0
9376 && len_b > 0 && array_b[0] == 0);
9378 /* Go through each list item by item, stopping when exhausted one of
9380 while (i_a < len_a && i_b < len_b) {
9381 UV cp; /* The element to potentially add to the intersection's
9383 bool cp_in_set; /* Is it in the input list's set or not */
9385 /* We need to take one or the other of the two inputs for the
9386 * intersection. Since we are merging two sorted lists, we take the
9387 * smaller of the next items. In case of a tie, we take the one that
9388 * is not in its set first (a difference from the union algorithm). If
9389 * we took one in the set first, it would increment the count, possibly
9390 * to 2 which would cause it to be output as starting a range in the
9391 * intersection, and the next time through we would take that same
9392 * number, and output it again as ending the set. By doing it the
9393 * opposite of this, there is no possibility that the count will be
9394 * momentarily incremented to 2. (In a tie and both are in the set or
9395 * both not in the set, it doesn't matter which we take first.) */
9396 if (array_a[i_a] < array_b[i_b]
9397 || (array_a[i_a] == array_b[i_b]
9398 && ! ELEMENT_RANGE_MATCHES_INVLIST(i_a)))
9400 cp_in_set = ELEMENT_RANGE_MATCHES_INVLIST(i_a);
9402 has_something_from_a = TRUE;
9405 cp_in_set = ELEMENT_RANGE_MATCHES_INVLIST(i_b);
9407 has_something_from_b = TRUE;
9410 /* Here, have chosen which of the two inputs to look at. Only output
9411 * if the running count changes to/from 2, which marks the
9412 * beginning/end of a range that's in the intersection */
9416 array_r[i_r++] = cp;
9421 array_r[i_r++] = cp;
9427 /* Here, we are finished going through at least one of the lists, which
9428 * means there is something remaining in at most one. We check if the list
9429 * that has been exhausted is positioned such that we are in the middle
9430 * of a range in its set or not. (i_a and i_b point to elements 1 beyond
9431 * the ones we care about.) There are four cases:
9432 * 1) Both weren't in their sets, count is 0, and remains 0. There's
9433 * nothing left in the intersection.
9434 * 2) Both were in their sets, count is 2 and perhaps is incremented to
9435 * above 2. What should be output is exactly that which is in the
9436 * non-exhausted set, as everything it has is also in the intersection
9437 * set, and everything it doesn't have can't be in the intersection
9438 * 3) The exhausted was in its set, non-exhausted isn't, count is 1, and
9439 * gets incremented to 2. Like the previous case, the intersection is
9440 * everything that remains in the non-exhausted set.
9441 * 4) the exhausted wasn't in its set, non-exhausted is, count is 1, and
9442 * remains 1. And the intersection has nothing more. */
9443 if ( (i_a == len_a && PREV_RANGE_MATCHES_INVLIST(i_a))
9444 || (i_b == len_b && PREV_RANGE_MATCHES_INVLIST(i_b)))
9451 /* Here, there is nothing left to put in the intersection. If the
9452 * intersection came only from the input that it is to overwrite, this
9453 * whole operation is a no-op */
9454 if ( UNLIKELY(! has_something_from_b && *i == a)
9455 || UNLIKELY(! has_something_from_a && *i == b))
9464 /* When 'count' is 2 or more, the list that was exhausted, what remains
9465 * in the intersection is precisely the same as the non-exhausted list,
9466 * so can just copy it unchanged. If only one of the inputs
9467 * contributes to the intersection, and the output is to overwite that
9468 * particular input, then this whole operation was a no-op. */
9470 IV copy_count = len_a - i_a;
9471 if (copy_count > 0) {
9472 if (UNLIKELY(! has_something_from_b && *i == a)) {
9476 Copy(array_a + i_a, array_r + i_r, copy_count, UV);
9477 len_r = i_r + copy_count;
9479 else if ((copy_count = len_b - i_b) > 0) {
9480 if (UNLIKELY(! has_something_from_a && *i == b)) {
9484 Copy(array_b + i_b, array_r + i_r, copy_count, UV);
9485 len_r = i_r + copy_count;
9486 } else if ( UNLIKELY(! has_something_from_b && *i == a)
9487 || UNLIKELY(! has_something_from_a && *i == b))
9489 /* Here, both arrays are exhausted, so no need to do any additional
9490 * copying. Also here, the intersection came only from the input
9491 * that it is to overwrite, so this whole operation is a no-op */
9497 /* Set the result to the final length, which can change the pointer to
9498 * array_r, so re-find it. (Note that it is unlikely that this will
9499 * change, as we are shrinking the space, not enlarging it) */
9500 if (len_r != _invlist_len(r)) {
9501 invlist_set_len(r, len_r, *get_invlist_offset_addr(r));
9503 array_r = invlist_array(r);
9506 /* If the output is not to overwrite either of the inputs, just return the
9507 * calculated intersection */
9508 if (a != *i && b != *i) {
9512 /* Here, the output is to be the same as one of the input scalars,
9513 * hence replacing it. The simple thing to do is to free the input
9514 * scalar, making it instead be the output one. But experience has
9515 * shown [perl #127392] that if the input is a mortal, we can get a
9516 * huge build-up of these during regex compilation before they get
9517 * freed. So for that case, replace just the input's interior with
9518 * the output's, and then free the output. A short-cut in this case
9519 * is if the output is empty, we can just set the input to be empty */
9521 assert(! invlist_is_iterating(*i));
9524 SvREFCNT_dec_NN(*i);
9529 invlist_replace_list_destroys_src(*i, r);
9542 Perl__add_range_to_invlist(pTHX_ SV* invlist, const UV start, const UV end)
9544 /* Add the range from 'start' to 'end' inclusive to the inversion list's
9545 * set. A pointer to the inversion list is returned. This may actually be
9546 * a new list, in which case the passed in one has been destroyed. The
9547 * passed-in inversion list can be NULL, in which case a new one is created
9548 * with just the one range in it */
9553 if (invlist == NULL) {
9554 invlist = _new_invlist(2);
9558 len = _invlist_len(invlist);
9561 /* If comes after the final entry actually in the list, can just append it
9564 || (! ELEMENT_RANGE_MATCHES_INVLIST(len - 1)
9565 && start >= invlist_array(invlist)[len - 1]))
9567 _append_range_to_invlist(invlist, start, end);
9571 /* Here, can't just append things, create and return a new inversion list
9572 * which is the union of this range and the existing inversion list. (If
9573 * the new range is well-behaved wrt to the old one, we could just insert
9574 * it, doing a Move() down on the tail of the old one (potentially growing
9575 * it first). But to determine that means we would have the extra
9576 * (possibly throw-away) work of first finding where the new one goes and
9577 * whether it disrupts (splits) an existing range, so it doesn't appear to
9578 * me (khw) that it's worth it) */
9579 range_invlist = _new_invlist(2);
9580 _append_range_to_invlist(range_invlist, start, end);
9582 _invlist_union(invlist, range_invlist, &invlist);
9584 /* The temporary can be freed */
9585 SvREFCNT_dec_NN(range_invlist);
9591 Perl__setup_canned_invlist(pTHX_ const STRLEN size, const UV element0,
9592 UV** other_elements_ptr)
9594 /* Create and return an inversion list whose contents are to be populated
9595 * by the caller. The caller gives the number of elements (in 'size') and
9596 * the very first element ('element0'). This function will set
9597 * '*other_elements_ptr' to an array of UVs, where the remaining elements
9600 * Obviously there is some trust involved that the caller will properly
9601 * fill in the other elements of the array.
9603 * (The first element needs to be passed in, as the underlying code does
9604 * things differently depending on whether it is zero or non-zero) */
9606 SV* invlist = _new_invlist(size);
9609 PERL_ARGS_ASSERT__SETUP_CANNED_INVLIST;
9611 _append_range_to_invlist(invlist, element0, element0);
9612 offset = *get_invlist_offset_addr(invlist);
9614 invlist_set_len(invlist, size, offset);
9615 *other_elements_ptr = invlist_array(invlist) + 1;
9621 PERL_STATIC_INLINE SV*
9622 S_add_cp_to_invlist(pTHX_ SV* invlist, const UV cp) {
9623 return _add_range_to_invlist(invlist, cp, cp);
9626 #ifndef PERL_IN_XSUB_RE
9628 Perl__invlist_invert(pTHX_ SV* const invlist)
9630 /* Complement the input inversion list. This adds a 0 if the list didn't
9631 * have a zero; removes it otherwise. As described above, the data
9632 * structure is set up so that this is very efficient */
9634 PERL_ARGS_ASSERT__INVLIST_INVERT;
9636 assert(! invlist_is_iterating(invlist));
9638 /* The inverse of matching nothing is matching everything */
9639 if (_invlist_len(invlist) == 0) {
9640 _append_range_to_invlist(invlist, 0, UV_MAX);
9644 *get_invlist_offset_addr(invlist) = ! *get_invlist_offset_addr(invlist);
9649 PERL_STATIC_INLINE SV*
9650 S_invlist_clone(pTHX_ SV* const invlist)
9653 /* Return a new inversion list that is a copy of the input one, which is
9654 * unchanged. The new list will not be mortal even if the old one was. */
9656 /* Need to allocate extra space to accommodate Perl's addition of a
9657 * trailing NUL to SvPV's, since it thinks they are always strings */
9658 SV* new_invlist = _new_invlist(_invlist_len(invlist) + 1);
9659 STRLEN physical_length = SvCUR(invlist);
9660 bool offset = *(get_invlist_offset_addr(invlist));
9662 PERL_ARGS_ASSERT_INVLIST_CLONE;
9664 *(get_invlist_offset_addr(new_invlist)) = offset;
9665 invlist_set_len(new_invlist, _invlist_len(invlist), offset);
9666 Copy(SvPVX(invlist), SvPVX(new_invlist), physical_length, char);
9671 PERL_STATIC_INLINE STRLEN*
9672 S_get_invlist_iter_addr(SV* invlist)
9674 /* Return the address of the UV that contains the current iteration
9677 PERL_ARGS_ASSERT_GET_INVLIST_ITER_ADDR;
9679 assert(SvTYPE(invlist) == SVt_INVLIST);
9681 return &(((XINVLIST*) SvANY(invlist))->iterator);
9684 PERL_STATIC_INLINE void
9685 S_invlist_iterinit(SV* invlist) /* Initialize iterator for invlist */
9687 PERL_ARGS_ASSERT_INVLIST_ITERINIT;
9689 *get_invlist_iter_addr(invlist) = 0;
9692 PERL_STATIC_INLINE void
9693 S_invlist_iterfinish(SV* invlist)
9695 /* Terminate iterator for invlist. This is to catch development errors.
9696 * Any iteration that is interrupted before completed should call this
9697 * function. Functions that add code points anywhere else but to the end
9698 * of an inversion list assert that they are not in the middle of an
9699 * iteration. If they were, the addition would make the iteration
9700 * problematical: if the iteration hadn't reached the place where things
9701 * were being added, it would be ok */
9703 PERL_ARGS_ASSERT_INVLIST_ITERFINISH;
9705 *get_invlist_iter_addr(invlist) = (STRLEN) UV_MAX;
9709 S_invlist_iternext(SV* invlist, UV* start, UV* end)
9711 /* An C<invlist_iterinit> call on <invlist> must be used to set this up.
9712 * This call sets in <*start> and <*end>, the next range in <invlist>.
9713 * Returns <TRUE> if successful and the next call will return the next
9714 * range; <FALSE> if was already at the end of the list. If the latter,
9715 * <*start> and <*end> are unchanged, and the next call to this function
9716 * will start over at the beginning of the list */
9718 STRLEN* pos = get_invlist_iter_addr(invlist);
9719 UV len = _invlist_len(invlist);
9722 PERL_ARGS_ASSERT_INVLIST_ITERNEXT;
9725 *pos = (STRLEN) UV_MAX; /* Force iterinit() to be required next time */
9729 array = invlist_array(invlist);
9731 *start = array[(*pos)++];
9737 *end = array[(*pos)++] - 1;
9743 PERL_STATIC_INLINE UV
9744 S_invlist_highest(SV* const invlist)
9746 /* Returns the highest code point that matches an inversion list. This API
9747 * has an ambiguity, as it returns 0 under either the highest is actually
9748 * 0, or if the list is empty. If this distinction matters to you, check
9749 * for emptiness before calling this function */
9751 UV len = _invlist_len(invlist);
9754 PERL_ARGS_ASSERT_INVLIST_HIGHEST;
9760 array = invlist_array(invlist);
9762 /* The last element in the array in the inversion list always starts a
9763 * range that goes to infinity. That range may be for code points that are
9764 * matched in the inversion list, or it may be for ones that aren't
9765 * matched. In the latter case, the highest code point in the set is one
9766 * less than the beginning of this range; otherwise it is the final element
9767 * of this range: infinity */
9768 return (ELEMENT_RANGE_MATCHES_INVLIST(len - 1))
9770 : array[len - 1] - 1;
9774 S_invlist_contents(pTHX_ SV* const invlist, const bool traditional_style)
9776 /* Get the contents of an inversion list into a string SV so that they can
9777 * be printed out. If 'traditional_style' is TRUE, it uses the format
9778 * traditionally done for debug tracing; otherwise it uses a format
9779 * suitable for just copying to the output, with blanks between ranges and
9780 * a dash between range components */
9784 const char intra_range_delimiter = (traditional_style ? '\t' : '-');
9785 const char inter_range_delimiter = (traditional_style ? '\n' : ' ');
9787 if (traditional_style) {
9788 output = newSVpvs("\n");
9791 output = newSVpvs("");
9794 PERL_ARGS_ASSERT_INVLIST_CONTENTS;
9796 assert(! invlist_is_iterating(invlist));
9798 invlist_iterinit(invlist);
9799 while (invlist_iternext(invlist, &start, &end)) {
9800 if (end == UV_MAX) {
9801 Perl_sv_catpvf(aTHX_ output, "%04"UVXf"%cINFINITY%c",
9802 start, intra_range_delimiter,
9803 inter_range_delimiter);
9805 else if (end != start) {
9806 Perl_sv_catpvf(aTHX_ output, "%04"UVXf"%c%04"UVXf"%c",
9808 intra_range_delimiter,
9809 end, inter_range_delimiter);
9812 Perl_sv_catpvf(aTHX_ output, "%04"UVXf"%c",
9813 start, inter_range_delimiter);
9817 if (SvCUR(output) && ! traditional_style) {/* Get rid of trailing blank */
9818 SvCUR_set(output, SvCUR(output) - 1);
9824 #ifndef PERL_IN_XSUB_RE
9826 Perl__invlist_dump(pTHX_ PerlIO *file, I32 level,
9827 const char * const indent, SV* const invlist)
9829 /* Designed to be called only by do_sv_dump(). Dumps out the ranges of the
9830 * inversion list 'invlist' to 'file' at 'level' Each line is prefixed by
9831 * the string 'indent'. The output looks like this:
9832 [0] 0x000A .. 0x000D
9834 [4] 0x2028 .. 0x2029
9835 [6] 0x3104 .. INFINITY
9836 * This means that the first range of code points matched by the list are
9837 * 0xA through 0xD; the second range contains only the single code point
9838 * 0x85, etc. An inversion list is an array of UVs. Two array elements
9839 * are used to define each range (except if the final range extends to
9840 * infinity, only a single element is needed). The array index of the
9841 * first element for the corresponding range is given in brackets. */
9846 PERL_ARGS_ASSERT__INVLIST_DUMP;
9848 if (invlist_is_iterating(invlist)) {
9849 Perl_dump_indent(aTHX_ level, file,
9850 "%sCan't dump inversion list because is in middle of iterating\n",
9855 invlist_iterinit(invlist);
9856 while (invlist_iternext(invlist, &start, &end)) {
9857 if (end == UV_MAX) {
9858 Perl_dump_indent(aTHX_ level, file,
9859 "%s[%"UVuf"] 0x%04"UVXf" .. INFINITY\n",
9860 indent, (UV)count, start);
9862 else if (end != start) {
9863 Perl_dump_indent(aTHX_ level, file,
9864 "%s[%"UVuf"] 0x%04"UVXf" .. 0x%04"UVXf"\n",
9865 indent, (UV)count, start, end);
9868 Perl_dump_indent(aTHX_ level, file, "%s[%"UVuf"] 0x%04"UVXf"\n",
9869 indent, (UV)count, start);
9876 Perl__load_PL_utf8_foldclosures (pTHX)
9878 assert(! PL_utf8_foldclosures);
9880 /* If the folds haven't been read in, call a fold function
9882 if (! PL_utf8_tofold) {
9883 U8 dummy[UTF8_MAXBYTES_CASE+1];
9885 /* This string is just a short named one above \xff */
9886 to_utf8_fold((U8*) HYPHEN_UTF8, dummy, NULL);
9887 assert(PL_utf8_tofold); /* Verify that worked */
9889 PL_utf8_foldclosures = _swash_inversion_hash(PL_utf8_tofold);
9893 #if defined(PERL_ARGS_ASSERT__INVLISTEQ) && !defined(PERL_IN_XSUB_RE)
9895 Perl__invlistEQ(pTHX_ SV* const a, SV* const b, const bool complement_b)
9897 /* Return a boolean as to if the two passed in inversion lists are
9898 * identical. The final argument, if TRUE, says to take the complement of
9899 * the second inversion list before doing the comparison */
9901 const UV* array_a = invlist_array(a);
9902 const UV* array_b = invlist_array(b);
9903 UV len_a = _invlist_len(a);
9904 UV len_b = _invlist_len(b);
9906 UV i = 0; /* current index into the arrays */
9907 bool retval = TRUE; /* Assume are identical until proven otherwise */
9909 PERL_ARGS_ASSERT__INVLISTEQ;
9911 /* If are to compare 'a' with the complement of b, set it
9912 * up so are looking at b's complement. */
9915 /* The complement of nothing is everything, so <a> would have to have
9916 * just one element, starting at zero (ending at infinity) */
9918 return (len_a == 1 && array_a[0] == 0);
9920 else if (array_b[0] == 0) {
9922 /* Otherwise, to complement, we invert. Here, the first element is
9923 * 0, just remove it. To do this, we just pretend the array starts
9931 /* But if the first element is not zero, we pretend the list starts
9932 * at the 0 that is always stored immediately before the array. */
9938 /* Make sure that the lengths are the same, as well as the final element
9939 * before looping through the remainder. (Thus we test the length, final,
9940 * and first elements right off the bat) */
9941 if (len_a != len_b || array_a[len_a-1] != array_b[len_a-1]) {
9944 else for (i = 0; i < len_a - 1; i++) {
9945 if (array_a[i] != array_b[i]) {
9956 * As best we can, determine the characters that can match the start of
9957 * the given EXACTF-ish node.
9959 * Returns the invlist as a new SV*; it is the caller's responsibility to
9960 * call SvREFCNT_dec() when done with it.
9963 S__make_exactf_invlist(pTHX_ RExC_state_t *pRExC_state, regnode *node)
9965 const U8 * s = (U8*)STRING(node);
9966 SSize_t bytelen = STR_LEN(node);
9968 /* Start out big enough for 2 separate code points */
9969 SV* invlist = _new_invlist(4);
9971 PERL_ARGS_ASSERT__MAKE_EXACTF_INVLIST;
9976 /* We punt and assume can match anything if the node begins
9977 * with a multi-character fold. Things are complicated. For
9978 * example, /ffi/i could match any of:
9979 * "\N{LATIN SMALL LIGATURE FFI}"
9980 * "\N{LATIN SMALL LIGATURE FF}I"
9981 * "F\N{LATIN SMALL LIGATURE FI}"
9982 * plus several other things; and making sure we have all the
9983 * possibilities is hard. */
9984 if (is_MULTI_CHAR_FOLD_latin1_safe(s, s + bytelen)) {
9985 invlist = _add_range_to_invlist(invlist, 0, UV_MAX);
9988 /* Any Latin1 range character can potentially match any
9989 * other depending on the locale */
9990 if (OP(node) == EXACTFL) {
9991 _invlist_union(invlist, PL_Latin1, &invlist);
9994 /* But otherwise, it matches at least itself. We can
9995 * quickly tell if it has a distinct fold, and if so,
9996 * it matches that as well */
9997 invlist = add_cp_to_invlist(invlist, uc);
9998 if (IS_IN_SOME_FOLD_L1(uc))
9999 invlist = add_cp_to_invlist(invlist, PL_fold_latin1[uc]);
10002 /* Some characters match above-Latin1 ones under /i. This
10003 * is true of EXACTFL ones when the locale is UTF-8 */
10004 if (HAS_NONLATIN1_SIMPLE_FOLD_CLOSURE(uc)
10005 && (! isASCII(uc) || (OP(node) != EXACTFA
10006 && OP(node) != EXACTFA_NO_TRIE)))
10008 add_above_Latin1_folds(pRExC_state, (U8) uc, &invlist);
10012 else { /* Pattern is UTF-8 */
10013 U8 folded[UTF8_MAX_FOLD_CHAR_EXPAND * UTF8_MAXBYTES_CASE + 1] = { '\0' };
10014 STRLEN foldlen = UTF8SKIP(s);
10015 const U8* e = s + bytelen;
10018 uc = utf8_to_uvchr_buf(s, s + bytelen, NULL);
10020 /* The only code points that aren't folded in a UTF EXACTFish
10021 * node are are the problematic ones in EXACTFL nodes */
10022 if (OP(node) == EXACTFL && is_PROBLEMATIC_LOCALE_FOLDEDS_START_cp(uc)) {
10023 /* We need to check for the possibility that this EXACTFL
10024 * node begins with a multi-char fold. Therefore we fold
10025 * the first few characters of it so that we can make that
10030 for (i = 0; i < UTF8_MAX_FOLD_CHAR_EXPAND && s < e; i++) {
10032 *(d++) = (U8) toFOLD(*s);
10037 to_utf8_fold(s, d, &len);
10043 /* And set up so the code below that looks in this folded
10044 * buffer instead of the node's string */
10046 foldlen = UTF8SKIP(folded);
10050 /* When we reach here 's' points to the fold of the first
10051 * character(s) of the node; and 'e' points to far enough along
10052 * the folded string to be just past any possible multi-char
10053 * fold. 'foldlen' is the length in bytes of the first
10056 * Unlike the non-UTF-8 case, the macro for determining if a
10057 * string is a multi-char fold requires all the characters to
10058 * already be folded. This is because of all the complications
10059 * if not. Note that they are folded anyway, except in EXACTFL
10060 * nodes. Like the non-UTF case above, we punt if the node
10061 * begins with a multi-char fold */
10063 if (is_MULTI_CHAR_FOLD_utf8_safe(s, e)) {
10064 invlist = _add_range_to_invlist(invlist, 0, UV_MAX);
10066 else { /* Single char fold */
10068 /* It matches all the things that fold to it, which are
10069 * found in PL_utf8_foldclosures (including itself) */
10070 invlist = add_cp_to_invlist(invlist, uc);
10071 if (! PL_utf8_foldclosures)
10072 _load_PL_utf8_foldclosures();
10073 if ((listp = hv_fetch(PL_utf8_foldclosures,
10074 (char *) s, foldlen, FALSE)))
10076 AV* list = (AV*) *listp;
10078 for (k = 0; k <= av_tindex_nomg(list); k++) {
10079 SV** c_p = av_fetch(list, k, FALSE);
10085 /* /aa doesn't allow folds between ASCII and non- */
10086 if ((OP(node) == EXACTFA || OP(node) == EXACTFA_NO_TRIE)
10087 && isASCII(c) != isASCII(uc))
10092 invlist = add_cp_to_invlist(invlist, c);
10101 #undef HEADER_LENGTH
10102 #undef TO_INTERNAL_SIZE
10103 #undef FROM_INTERNAL_SIZE
10104 #undef INVLIST_VERSION_ID
10106 /* End of inversion list object */
10109 S_parse_lparen_question_flags(pTHX_ RExC_state_t *pRExC_state)
10111 /* This parses the flags that are in either the '(?foo)' or '(?foo:bar)'
10112 * constructs, and updates RExC_flags with them. On input, RExC_parse
10113 * should point to the first flag; it is updated on output to point to the
10114 * final ')' or ':'. There needs to be at least one flag, or this will
10117 /* for (?g), (?gc), and (?o) warnings; warning
10118 about (?c) will warn about (?g) -- japhy */
10120 #define WASTED_O 0x01
10121 #define WASTED_G 0x02
10122 #define WASTED_C 0x04
10123 #define WASTED_GC (WASTED_G|WASTED_C)
10124 I32 wastedflags = 0x00;
10125 U32 posflags = 0, negflags = 0;
10126 U32 *flagsp = &posflags;
10127 char has_charset_modifier = '\0';
10129 bool has_use_defaults = FALSE;
10130 const char* const seqstart = RExC_parse - 1; /* Point to the '?' */
10131 int x_mod_count = 0;
10133 PERL_ARGS_ASSERT_PARSE_LPAREN_QUESTION_FLAGS;
10135 /* '^' as an initial flag sets certain defaults */
10136 if (UCHARAT(RExC_parse) == '^') {
10138 has_use_defaults = TRUE;
10139 STD_PMMOD_FLAGS_CLEAR(&RExC_flags);
10140 set_regex_charset(&RExC_flags, (RExC_utf8 || RExC_uni_semantics)
10141 ? REGEX_UNICODE_CHARSET
10142 : REGEX_DEPENDS_CHARSET);
10145 cs = get_regex_charset(RExC_flags);
10146 if (cs == REGEX_DEPENDS_CHARSET
10147 && (RExC_utf8 || RExC_uni_semantics))
10149 cs = REGEX_UNICODE_CHARSET;
10152 while (RExC_parse < RExC_end) {
10153 /* && strchr("iogcmsx", *RExC_parse) */
10154 /* (?g), (?gc) and (?o) are useless here
10155 and must be globally applied -- japhy */
10156 switch (*RExC_parse) {
10158 /* Code for the imsxn flags */
10159 CASE_STD_PMMOD_FLAGS_PARSE_SET(flagsp, x_mod_count);
10161 case LOCALE_PAT_MOD:
10162 if (has_charset_modifier) {
10163 goto excess_modifier;
10165 else if (flagsp == &negflags) {
10168 cs = REGEX_LOCALE_CHARSET;
10169 has_charset_modifier = LOCALE_PAT_MOD;
10171 case UNICODE_PAT_MOD:
10172 if (has_charset_modifier) {
10173 goto excess_modifier;
10175 else if (flagsp == &negflags) {
10178 cs = REGEX_UNICODE_CHARSET;
10179 has_charset_modifier = UNICODE_PAT_MOD;
10181 case ASCII_RESTRICT_PAT_MOD:
10182 if (flagsp == &negflags) {
10185 if (has_charset_modifier) {
10186 if (cs != REGEX_ASCII_RESTRICTED_CHARSET) {
10187 goto excess_modifier;
10189 /* Doubled modifier implies more restricted */
10190 cs = REGEX_ASCII_MORE_RESTRICTED_CHARSET;
10193 cs = REGEX_ASCII_RESTRICTED_CHARSET;
10195 has_charset_modifier = ASCII_RESTRICT_PAT_MOD;
10197 case DEPENDS_PAT_MOD:
10198 if (has_use_defaults) {
10199 goto fail_modifiers;
10201 else if (flagsp == &negflags) {
10204 else if (has_charset_modifier) {
10205 goto excess_modifier;
10208 /* The dual charset means unicode semantics if the
10209 * pattern (or target, not known until runtime) are
10210 * utf8, or something in the pattern indicates unicode
10212 cs = (RExC_utf8 || RExC_uni_semantics)
10213 ? REGEX_UNICODE_CHARSET
10214 : REGEX_DEPENDS_CHARSET;
10215 has_charset_modifier = DEPENDS_PAT_MOD;
10219 if (has_charset_modifier == ASCII_RESTRICT_PAT_MOD) {
10220 vFAIL2("Regexp modifier \"%c\" may appear a maximum of twice", ASCII_RESTRICT_PAT_MOD);
10222 else if (has_charset_modifier == *(RExC_parse - 1)) {
10223 vFAIL2("Regexp modifier \"%c\" may not appear twice",
10224 *(RExC_parse - 1));
10227 vFAIL3("Regexp modifiers \"%c\" and \"%c\" are mutually exclusive", has_charset_modifier, *(RExC_parse - 1));
10229 NOT_REACHED; /*NOTREACHED*/
10232 vFAIL2("Regexp modifier \"%c\" may not appear after the \"-\"",
10233 *(RExC_parse - 1));
10234 NOT_REACHED; /*NOTREACHED*/
10235 case ONCE_PAT_MOD: /* 'o' */
10236 case GLOBAL_PAT_MOD: /* 'g' */
10237 if (PASS2 && ckWARN(WARN_REGEXP)) {
10238 const I32 wflagbit = *RExC_parse == 'o'
10241 if (! (wastedflags & wflagbit) ) {
10242 wastedflags |= wflagbit;
10243 /* diag_listed_as: Useless (?-%s) - don't use /%s modifier in regex; marked by <-- HERE in m/%s/ */
10246 "Useless (%s%c) - %suse /%c modifier",
10247 flagsp == &negflags ? "?-" : "?",
10249 flagsp == &negflags ? "don't " : "",
10256 case CONTINUE_PAT_MOD: /* 'c' */
10257 if (PASS2 && ckWARN(WARN_REGEXP)) {
10258 if (! (wastedflags & WASTED_C) ) {
10259 wastedflags |= WASTED_GC;
10260 /* diag_listed_as: Useless (?-%s) - don't use /%s modifier in regex; marked by <-- HERE in m/%s/ */
10263 "Useless (%sc) - %suse /gc modifier",
10264 flagsp == &negflags ? "?-" : "?",
10265 flagsp == &negflags ? "don't " : ""
10270 case KEEPCOPY_PAT_MOD: /* 'p' */
10271 if (flagsp == &negflags) {
10273 ckWARNreg(RExC_parse + 1,"Useless use of (?-p)");
10275 *flagsp |= RXf_PMf_KEEPCOPY;
10279 /* A flag is a default iff it is following a minus, so
10280 * if there is a minus, it means will be trying to
10281 * re-specify a default which is an error */
10282 if (has_use_defaults || flagsp == &negflags) {
10283 goto fail_modifiers;
10285 flagsp = &negflags;
10286 wastedflags = 0; /* reset so (?g-c) warns twice */
10290 RExC_flags |= posflags;
10291 RExC_flags &= ~negflags;
10292 set_regex_charset(&RExC_flags, cs);
10293 if (RExC_flags & RXf_PMf_FOLD) {
10294 RExC_contains_i = 1;
10297 if (UNLIKELY((x_mod_count) > 1)) {
10298 vFAIL("Only one /x regex modifier is allowed");
10304 RExC_parse += SKIP_IF_CHAR(RExC_parse);
10305 /* diag_listed_as: Sequence (?%s...) not recognized in regex; marked by <-- HERE in m/%s/ */
10306 vFAIL2utf8f("Sequence (%"UTF8f"...) not recognized",
10307 UTF8fARG(UTF, RExC_parse-seqstart, seqstart));
10308 NOT_REACHED; /*NOTREACHED*/
10311 RExC_parse += UTF ? UTF8SKIP(RExC_parse) : 1;
10314 vFAIL("Sequence (?... not terminated");
10318 - reg - regular expression, i.e. main body or parenthesized thing
10320 * Caller must absorb opening parenthesis.
10322 * Combining parenthesis handling with the base level of regular expression
10323 * is a trifle forced, but the need to tie the tails of the branches to what
10324 * follows makes it hard to avoid.
10326 #define REGTAIL(x,y,z) regtail((x),(y),(z),depth+1)
10328 #define REGTAIL_STUDY(x,y,z) regtail_study((x),(y),(z),depth+1)
10330 #define REGTAIL_STUDY(x,y,z) regtail((x),(y),(z),depth+1)
10333 PERL_STATIC_INLINE regnode *
10334 S_handle_named_backref(pTHX_ RExC_state_t *pRExC_state,
10336 char * parse_start,
10341 char* name_start = RExC_parse;
10343 SV *sv_dat = reg_scan_name(pRExC_state, SIZE_ONLY
10344 ? REG_RSN_RETURN_NULL
10345 : REG_RSN_RETURN_DATA);
10346 GET_RE_DEBUG_FLAGS_DECL;
10348 PERL_ARGS_ASSERT_HANDLE_NAMED_BACKREF;
10350 if (RExC_parse == name_start || *RExC_parse != ch) {
10351 /* diag_listed_as: Sequence \%s... not terminated in regex; marked by <-- HERE in m/%s/ */
10352 vFAIL2("Sequence %.3s... not terminated",parse_start);
10356 num = add_data( pRExC_state, STR_WITH_LEN("S"));
10357 RExC_rxi->data->data[num]=(void*)sv_dat;
10358 SvREFCNT_inc_simple_void(sv_dat);
10361 ret = reganode(pRExC_state,
10364 : (ASCII_FOLD_RESTRICTED)
10366 : (AT_LEAST_UNI_SEMANTICS)
10372 *flagp |= HASWIDTH;
10374 Set_Node_Offset(ret, parse_start+1);
10375 Set_Node_Cur_Length(ret, parse_start);
10377 nextchar(pRExC_state);
10381 /* Returns NULL, setting *flagp to TRYAGAIN at the end of (?) that only sets
10382 flags. Returns NULL, setting *flagp to RESTART_PASS1 if the sizing scan
10383 needs to be restarted, or'd with NEED_UTF8 if the pattern needs to be
10384 upgraded to UTF-8. Otherwise would only return NULL if regbranch() returns
10385 NULL, which cannot happen. */
10387 S_reg(pTHX_ RExC_state_t *pRExC_state, I32 paren, I32 *flagp,U32 depth)
10388 /* paren: Parenthesized? 0=top; 1,2=inside '(': changed to letter.
10389 * 2 is like 1, but indicates that nextchar() has been called to advance
10390 * RExC_parse beyond the '('. Things like '(?' are indivisible tokens, and
10391 * this flag alerts us to the need to check for that */
10393 regnode *ret; /* Will be the head of the group. */
10396 regnode *ender = NULL;
10399 U32 oregflags = RExC_flags;
10400 bool have_branch = 0;
10402 I32 freeze_paren = 0;
10403 I32 after_freeze = 0;
10404 I32 num; /* numeric backreferences */
10406 char * parse_start = RExC_parse; /* MJD */
10407 char * const oregcomp_parse = RExC_parse;
10409 GET_RE_DEBUG_FLAGS_DECL;
10411 PERL_ARGS_ASSERT_REG;
10412 DEBUG_PARSE("reg ");
10414 *flagp = 0; /* Tentatively. */
10416 /* Having this true makes it feasible to have a lot fewer tests for the
10417 * parse pointer being in scope. For example, we can write
10418 * while(isFOO(*RExC_parse)) RExC_parse++;
10420 * while(RExC_parse < RExC_end && isFOO(*RExC_parse)) RExC_parse++;
10422 assert(*RExC_end == '\0');
10424 /* Make an OPEN node, if parenthesized. */
10427 /* Under /x, space and comments can be gobbled up between the '(' and
10428 * here (if paren ==2). The forms '(*VERB' and '(?...' disallow such
10429 * intervening space, as the sequence is a token, and a token should be
10431 bool has_intervening_patws = paren == 2 && *(RExC_parse - 1) != '(';
10433 if (RExC_parse >= RExC_end) {
10434 vFAIL("Unmatched (");
10437 if ( *RExC_parse == '*') { /* (*VERB:ARG) */
10438 char *start_verb = RExC_parse + 1;
10440 char *start_arg = NULL;
10441 unsigned char op = 0;
10442 int arg_required = 0;
10443 int internal_argval = -1; /* if >-1 we are not allowed an argument*/
10445 if (has_intervening_patws) {
10446 RExC_parse++; /* past the '*' */
10447 vFAIL("In '(*VERB...)', the '(' and '*' must be adjacent");
10449 while (RExC_parse < RExC_end && *RExC_parse != ')' ) {
10450 if ( *RExC_parse == ':' ) {
10451 start_arg = RExC_parse + 1;
10454 RExC_parse += UTF ? UTF8SKIP(RExC_parse) : 1;
10456 verb_len = RExC_parse - start_verb;
10458 if (RExC_parse >= RExC_end) {
10459 goto unterminated_verb_pattern;
10461 RExC_parse += UTF ? UTF8SKIP(RExC_parse) : 1;
10462 while ( RExC_parse < RExC_end && *RExC_parse != ')' )
10463 RExC_parse += UTF ? UTF8SKIP(RExC_parse) : 1;
10464 if ( RExC_parse >= RExC_end || *RExC_parse != ')' )
10465 unterminated_verb_pattern:
10466 vFAIL("Unterminated verb pattern argument");
10467 if ( RExC_parse == start_arg )
10470 if ( RExC_parse >= RExC_end || *RExC_parse != ')' )
10471 vFAIL("Unterminated verb pattern");
10474 /* Here, we know that RExC_parse < RExC_end */
10476 switch ( *start_verb ) {
10477 case 'A': /* (*ACCEPT) */
10478 if ( memEQs(start_verb,verb_len,"ACCEPT") ) {
10480 internal_argval = RExC_nestroot;
10483 case 'C': /* (*COMMIT) */
10484 if ( memEQs(start_verb,verb_len,"COMMIT") )
10487 case 'F': /* (*FAIL) */
10488 if ( verb_len==1 || memEQs(start_verb,verb_len,"FAIL") ) {
10492 case ':': /* (*:NAME) */
10493 case 'M': /* (*MARK:NAME) */
10494 if ( verb_len==0 || memEQs(start_verb,verb_len,"MARK") ) {
10499 case 'P': /* (*PRUNE) */
10500 if ( memEQs(start_verb,verb_len,"PRUNE") )
10503 case 'S': /* (*SKIP) */
10504 if ( memEQs(start_verb,verb_len,"SKIP") )
10507 case 'T': /* (*THEN) */
10508 /* [19:06] <TimToady> :: is then */
10509 if ( memEQs(start_verb,verb_len,"THEN") ) {
10511 RExC_seen |= REG_CUTGROUP_SEEN;
10516 RExC_parse += UTF ? UTF8SKIP(RExC_parse) : 1;
10518 "Unknown verb pattern '%"UTF8f"'",
10519 UTF8fARG(UTF, verb_len, start_verb));
10521 if ( arg_required && !start_arg ) {
10522 vFAIL3("Verb pattern '%.*s' has a mandatory argument",
10523 verb_len, start_verb);
10525 if (internal_argval == -1) {
10526 ret = reganode(pRExC_state, op, 0);
10528 ret = reg2Lanode(pRExC_state, op, 0, internal_argval);
10530 RExC_seen |= REG_VERBARG_SEEN;
10531 if ( ! SIZE_ONLY ) {
10533 SV *sv = newSVpvn( start_arg,
10534 RExC_parse - start_arg);
10535 ARG(ret) = add_data( pRExC_state,
10536 STR_WITH_LEN("S"));
10537 RExC_rxi->data->data[ARG(ret)]=(void*)sv;
10542 if ( internal_argval != -1 )
10543 ARG2L_SET(ret, internal_argval);
10545 nextchar(pRExC_state);
10548 else if (*RExC_parse == '?') { /* (?...) */
10549 bool is_logical = 0;
10550 const char * const seqstart = RExC_parse;
10551 const char * endptr;
10552 if (has_intervening_patws) {
10554 vFAIL("In '(?...)', the '(' and '?' must be adjacent");
10557 RExC_parse++; /* past the '?' */
10558 paren = *RExC_parse; /* might be a trailing NUL, if not
10560 RExC_parse += UTF ? UTF8SKIP(RExC_parse) : 1;
10561 if (RExC_parse > RExC_end) {
10564 ret = NULL; /* For look-ahead/behind. */
10567 case 'P': /* (?P...) variants for those used to PCRE/Python */
10568 paren = *RExC_parse;
10569 if ( paren == '<') { /* (?P<...>) named capture */
10571 if (RExC_parse >= RExC_end) {
10572 vFAIL("Sequence (?P<... not terminated");
10574 goto named_capture;
10576 else if (paren == '>') { /* (?P>name) named recursion */
10578 if (RExC_parse >= RExC_end) {
10579 vFAIL("Sequence (?P>... not terminated");
10581 goto named_recursion;
10583 else if (paren == '=') { /* (?P=...) named backref */
10585 return handle_named_backref(pRExC_state, flagp,
10588 RExC_parse += SKIP_IF_CHAR(RExC_parse);
10589 /* diag_listed_as: Sequence (?%s...) not recognized in regex; marked by <-- HERE in m/%s/ */
10590 vFAIL3("Sequence (%.*s...) not recognized",
10591 RExC_parse-seqstart, seqstart);
10592 NOT_REACHED; /*NOTREACHED*/
10593 case '<': /* (?<...) */
10594 if (*RExC_parse == '!')
10596 else if (*RExC_parse != '=')
10603 case '\'': /* (?'...') */
10604 name_start = RExC_parse;
10605 svname = reg_scan_name(pRExC_state,
10606 SIZE_ONLY /* reverse test from the others */
10607 ? REG_RSN_RETURN_NAME
10608 : REG_RSN_RETURN_NULL);
10609 if ( RExC_parse == name_start
10610 || RExC_parse >= RExC_end
10611 || *RExC_parse != paren)
10613 vFAIL2("Sequence (?%c... not terminated",
10614 paren=='>' ? '<' : paren);
10619 if (!svname) /* shouldn't happen */
10621 "panic: reg_scan_name returned NULL");
10622 if (!RExC_paren_names) {
10623 RExC_paren_names= newHV();
10624 sv_2mortal(MUTABLE_SV(RExC_paren_names));
10626 RExC_paren_name_list= newAV();
10627 sv_2mortal(MUTABLE_SV(RExC_paren_name_list));
10630 he_str = hv_fetch_ent( RExC_paren_names, svname, 1, 0 );
10632 sv_dat = HeVAL(he_str);
10634 /* croak baby croak */
10636 "panic: paren_name hash element allocation failed");
10637 } else if ( SvPOK(sv_dat) ) {
10638 /* (?|...) can mean we have dupes so scan to check
10639 its already been stored. Maybe a flag indicating
10640 we are inside such a construct would be useful,
10641 but the arrays are likely to be quite small, so
10642 for now we punt -- dmq */
10643 IV count = SvIV(sv_dat);
10644 I32 *pv = (I32*)SvPVX(sv_dat);
10646 for ( i = 0 ; i < count ; i++ ) {
10647 if ( pv[i] == RExC_npar ) {
10653 pv = (I32*)SvGROW(sv_dat,
10654 SvCUR(sv_dat) + sizeof(I32)+1);
10655 SvCUR_set(sv_dat, SvCUR(sv_dat) + sizeof(I32));
10656 pv[count] = RExC_npar;
10657 SvIV_set(sv_dat, SvIVX(sv_dat) + 1);
10660 (void)SvUPGRADE(sv_dat,SVt_PVNV);
10661 sv_setpvn(sv_dat, (char *)&(RExC_npar),
10664 SvIV_set(sv_dat, 1);
10667 /* Yes this does cause a memory leak in debugging Perls
10669 if (!av_store(RExC_paren_name_list,
10670 RExC_npar, SvREFCNT_inc(svname)))
10671 SvREFCNT_dec_NN(svname);
10674 /*sv_dump(sv_dat);*/
10676 nextchar(pRExC_state);
10678 goto capturing_parens;
10680 RExC_seen |= REG_LOOKBEHIND_SEEN;
10681 RExC_in_lookbehind++;
10683 assert(RExC_parse < RExC_end);
10685 case '=': /* (?=...) */
10686 RExC_seen_zerolen++;
10688 case '!': /* (?!...) */
10689 RExC_seen_zerolen++;
10690 /* check if we're really just a "FAIL" assertion */
10691 skip_to_be_ignored_text(pRExC_state, &RExC_parse,
10692 FALSE /* Don't force to /x */ );
10693 if (*RExC_parse == ')') {
10694 ret=reganode(pRExC_state, OPFAIL, 0);
10695 nextchar(pRExC_state);
10699 case '|': /* (?|...) */
10700 /* branch reset, behave like a (?:...) except that
10701 buffers in alternations share the same numbers */
10703 after_freeze = freeze_paren = RExC_npar;
10705 case ':': /* (?:...) */
10706 case '>': /* (?>...) */
10708 case '$': /* (?$...) */
10709 case '@': /* (?@...) */
10710 vFAIL2("Sequence (?%c...) not implemented", (int)paren);
10712 case '0' : /* (?0) */
10713 case 'R' : /* (?R) */
10714 if (RExC_parse == RExC_end || *RExC_parse != ')')
10715 FAIL("Sequence (?R) not terminated");
10717 RExC_seen |= REG_RECURSE_SEEN;
10718 *flagp |= POSTPONED;
10719 goto gen_recurse_regop;
10721 /* named and numeric backreferences */
10722 case '&': /* (?&NAME) */
10723 parse_start = RExC_parse - 1;
10726 SV *sv_dat = reg_scan_name(pRExC_state,
10727 SIZE_ONLY ? REG_RSN_RETURN_NULL : REG_RSN_RETURN_DATA);
10728 num = sv_dat ? *((I32 *)SvPVX(sv_dat)) : 0;
10730 if (RExC_parse >= RExC_end || *RExC_parse != ')')
10731 vFAIL("Sequence (?&... not terminated");
10732 goto gen_recurse_regop;
10735 if (!(RExC_parse[0] >= '1' && RExC_parse[0] <= '9')) {
10737 vFAIL("Illegal pattern");
10739 goto parse_recursion;
10741 case '-': /* (?-1) */
10742 if (!(RExC_parse[0] >= '1' && RExC_parse[0] <= '9')) {
10743 RExC_parse--; /* rewind to let it be handled later */
10747 case '1': case '2': case '3': case '4': /* (?1) */
10748 case '5': case '6': case '7': case '8': case '9':
10749 RExC_parse = (char *) seqstart + 1; /* Point to the digit */
10752 bool is_neg = FALSE;
10754 parse_start = RExC_parse - 1; /* MJD */
10755 if (*RExC_parse == '-') {
10759 if (grok_atoUV(RExC_parse, &unum, &endptr)
10763 RExC_parse = (char*)endptr;
10767 /* Some limit for num? */
10771 if (*RExC_parse!=')')
10772 vFAIL("Expecting close bracket");
10775 if ( paren == '-' ) {
10777 Diagram of capture buffer numbering.
10778 Top line is the normal capture buffer numbers
10779 Bottom line is the negative indexing as from
10783 /(a(x)y)(a(b(c(?-2)d)e)f)(g(h))/
10787 num = RExC_npar + num;
10790 vFAIL("Reference to nonexistent group");
10792 } else if ( paren == '+' ) {
10793 num = RExC_npar + num - 1;
10795 /* We keep track how many GOSUB items we have produced.
10796 To start off the ARG2L() of the GOSUB holds its "id",
10797 which is used later in conjunction with RExC_recurse
10798 to calculate the offset we need to jump for the GOSUB,
10799 which it will store in the final representation.
10800 We have to defer the actual calculation until much later
10801 as the regop may move.
10804 ret = reg2Lanode(pRExC_state, GOSUB, num, RExC_recurse_count);
10806 if (num > (I32)RExC_rx->nparens) {
10808 vFAIL("Reference to nonexistent group");
10810 RExC_recurse_count++;
10811 DEBUG_OPTIMISE_MORE_r(Perl_re_printf( aTHX_
10812 "%*s%*s Recurse #%"UVuf" to %"IVdf"\n",
10813 22, "| |", (int)(depth * 2 + 1), "",
10814 (UV)ARG(ret), (IV)ARG2L(ret)));
10816 RExC_seen |= REG_RECURSE_SEEN;
10818 Set_Node_Length(ret, 1 + regarglen[OP(ret)]); /* MJD */
10819 Set_Node_Offset(ret, parse_start); /* MJD */
10821 *flagp |= POSTPONED;
10822 assert(*RExC_parse == ')');
10823 nextchar(pRExC_state);
10828 case '?': /* (??...) */
10830 if (*RExC_parse != '{') {
10831 RExC_parse += SKIP_IF_CHAR(RExC_parse);
10832 /* diag_listed_as: Sequence (?%s...) not recognized in regex; marked by <-- HERE in m/%s/ */
10834 "Sequence (%"UTF8f"...) not recognized",
10835 UTF8fARG(UTF, RExC_parse-seqstart, seqstart));
10836 NOT_REACHED; /*NOTREACHED*/
10838 *flagp |= POSTPONED;
10842 case '{': /* (?{...}) */
10845 struct reg_code_block *cb;
10847 RExC_seen_zerolen++;
10849 if ( !pRExC_state->num_code_blocks
10850 || pRExC_state->code_index >= pRExC_state->num_code_blocks
10851 || pRExC_state->code_blocks[pRExC_state->code_index].start
10852 != (STRLEN)((RExC_parse -3 - (is_logical ? 1 : 0))
10855 if (RExC_pm_flags & PMf_USE_RE_EVAL)
10856 FAIL("panic: Sequence (?{...}): no code block found\n");
10857 FAIL("Eval-group not allowed at runtime, use re 'eval'");
10859 /* this is a pre-compiled code block (?{...}) */
10860 cb = &pRExC_state->code_blocks[pRExC_state->code_index];
10861 RExC_parse = RExC_start + cb->end;
10864 if (cb->src_regex) {
10865 n = add_data(pRExC_state, STR_WITH_LEN("rl"));
10866 RExC_rxi->data->data[n] =
10867 (void*)SvREFCNT_inc((SV*)cb->src_regex);
10868 RExC_rxi->data->data[n+1] = (void*)o;
10871 n = add_data(pRExC_state,
10872 (RExC_pm_flags & PMf_HAS_CV) ? "L" : "l", 1);
10873 RExC_rxi->data->data[n] = (void*)o;
10876 pRExC_state->code_index++;
10877 nextchar(pRExC_state);
10881 ret = reg_node(pRExC_state, LOGICAL);
10883 eval = reg2Lanode(pRExC_state, EVAL,
10886 /* for later propagation into (??{})
10888 RExC_flags & RXf_PMf_COMPILETIME
10893 REGTAIL(pRExC_state, ret, eval);
10894 /* deal with the length of this later - MJD */
10897 ret = reg2Lanode(pRExC_state, EVAL, n, 0);
10898 Set_Node_Length(ret, RExC_parse - parse_start + 1);
10899 Set_Node_Offset(ret, parse_start);
10902 case '(': /* (?(?{...})...) and (?(?=...)...) */
10905 const int DEFINE_len = sizeof("DEFINE") - 1;
10906 if (RExC_parse[0] == '?') { /* (?(?...)) */
10907 if ( RExC_parse < RExC_end - 1
10908 && ( RExC_parse[1] == '='
10909 || RExC_parse[1] == '!'
10910 || RExC_parse[1] == '<'
10911 || RExC_parse[1] == '{')
10912 ) { /* Lookahead or eval. */
10916 ret = reg_node(pRExC_state, LOGICAL);
10920 tail = reg(pRExC_state, 1, &flag, depth+1);
10921 if (flag & (RESTART_PASS1|NEED_UTF8)) {
10922 *flagp = flag & (RESTART_PASS1|NEED_UTF8);
10925 REGTAIL(pRExC_state, ret, tail);
10928 /* Fall through to ‘Unknown switch condition’ at the
10929 end of the if/else chain. */
10931 else if ( RExC_parse[0] == '<' /* (?(<NAME>)...) */
10932 || RExC_parse[0] == '\'' ) /* (?('NAME')...) */
10934 char ch = RExC_parse[0] == '<' ? '>' : '\'';
10935 char *name_start= RExC_parse++;
10937 SV *sv_dat=reg_scan_name(pRExC_state,
10938 SIZE_ONLY ? REG_RSN_RETURN_NULL : REG_RSN_RETURN_DATA);
10939 if ( RExC_parse == name_start
10940 || RExC_parse >= RExC_end
10941 || *RExC_parse != ch)
10943 vFAIL2("Sequence (?(%c... not terminated",
10944 (ch == '>' ? '<' : ch));
10948 num = add_data( pRExC_state, STR_WITH_LEN("S"));
10949 RExC_rxi->data->data[num]=(void*)sv_dat;
10950 SvREFCNT_inc_simple_void(sv_dat);
10952 ret = reganode(pRExC_state,NGROUPP,num);
10953 goto insert_if_check_paren;
10955 else if (RExC_end - RExC_parse >= DEFINE_len
10956 && strnEQ(RExC_parse, "DEFINE", DEFINE_len))
10958 ret = reganode(pRExC_state,DEFINEP,0);
10959 RExC_parse += DEFINE_len;
10961 goto insert_if_check_paren;
10963 else if (RExC_parse[0] == 'R') {
10965 /* parno == 0 => /(?(R)YES|NO)/ "in any form of recursion OR eval"
10966 * parno == 1 => /(?(R0)YES|NO)/ "in GOSUB (?0) / (?R)"
10967 * parno == 2 => /(?(R1)YES|NO)/ "in GOSUB (?1) (parno-1)"
10970 if (RExC_parse[0] == '0') {
10974 else if (RExC_parse[0] >= '1' && RExC_parse[0] <= '9' ) {
10976 if (grok_atoUV(RExC_parse, &uv, &endptr)
10979 parno = (I32)uv + 1;
10980 RExC_parse = (char*)endptr;
10982 /* else "Switch condition not recognized" below */
10983 } else if (RExC_parse[0] == '&') {
10986 sv_dat = reg_scan_name(pRExC_state,
10988 ? REG_RSN_RETURN_NULL
10989 : REG_RSN_RETURN_DATA);
10991 /* we should only have a false sv_dat when
10992 * SIZE_ONLY is true, and we always have false
10993 * sv_dat when SIZE_ONLY is true.
10994 * reg_scan_name() will VFAIL() if the name is
10995 * unknown when SIZE_ONLY is false, and otherwise
10996 * will return something, and when SIZE_ONLY is
10997 * true, reg_scan_name() just parses the string,
10998 * and doesnt return anything. (in theory) */
10999 assert(SIZE_ONLY ? !sv_dat : !!sv_dat);
11002 parno = 1 + *((I32 *)SvPVX(sv_dat));
11004 ret = reganode(pRExC_state,INSUBP,parno);
11005 goto insert_if_check_paren;
11007 else if (RExC_parse[0] >= '1' && RExC_parse[0] <= '9' ) {
11011 if (grok_atoUV(RExC_parse, &uv, &endptr)
11015 RExC_parse = (char*)endptr;
11018 vFAIL("panic: grok_atoUV returned FALSE");
11020 ret = reganode(pRExC_state, GROUPP, parno);
11022 insert_if_check_paren:
11023 if (UCHARAT(RExC_parse) != ')') {
11024 RExC_parse += UTF ? UTF8SKIP(RExC_parse) : 1;
11025 vFAIL("Switch condition not recognized");
11027 nextchar(pRExC_state);
11029 REGTAIL(pRExC_state, ret, reganode(pRExC_state, IFTHEN, 0));
11030 br = regbranch(pRExC_state, &flags, 1,depth+1);
11032 if (flags & (RESTART_PASS1|NEED_UTF8)) {
11033 *flagp = flags & (RESTART_PASS1|NEED_UTF8);
11036 FAIL2("panic: regbranch returned NULL, flags=%#"UVxf"",
11039 REGTAIL(pRExC_state, br, reganode(pRExC_state,
11041 c = UCHARAT(RExC_parse);
11042 nextchar(pRExC_state);
11043 if (flags&HASWIDTH)
11044 *flagp |= HASWIDTH;
11047 vFAIL("(?(DEFINE)....) does not allow branches");
11049 /* Fake one for optimizer. */
11050 lastbr = reganode(pRExC_state, IFTHEN, 0);
11052 if (!regbranch(pRExC_state, &flags, 1,depth+1)) {
11053 if (flags & (RESTART_PASS1|NEED_UTF8)) {
11054 *flagp = flags & (RESTART_PASS1|NEED_UTF8);
11057 FAIL2("panic: regbranch returned NULL, flags=%#"UVxf"",
11060 REGTAIL(pRExC_state, ret, lastbr);
11061 if (flags&HASWIDTH)
11062 *flagp |= HASWIDTH;
11063 c = UCHARAT(RExC_parse);
11064 nextchar(pRExC_state);
11069 if (RExC_parse >= RExC_end)
11070 vFAIL("Switch (?(condition)... not terminated");
11072 vFAIL("Switch (?(condition)... contains too many branches");
11074 ender = reg_node(pRExC_state, TAIL);
11075 REGTAIL(pRExC_state, br, ender);
11077 REGTAIL(pRExC_state, lastbr, ender);
11078 REGTAIL(pRExC_state, NEXTOPER(NEXTOPER(lastbr)), ender);
11081 REGTAIL(pRExC_state, ret, ender);
11082 RExC_size++; /* XXX WHY do we need this?!!
11083 For large programs it seems to be required
11084 but I can't figure out why. -- dmq*/
11087 RExC_parse += UTF ? UTF8SKIP(RExC_parse) : 1;
11088 vFAIL("Unknown switch condition (?(...))");
11090 case '[': /* (?[ ... ]) */
11091 return handle_regex_sets(pRExC_state, NULL, flagp, depth,
11093 case 0: /* A NUL */
11094 RExC_parse--; /* for vFAIL to print correctly */
11095 vFAIL("Sequence (? incomplete");
11097 default: /* e.g., (?i) */
11098 RExC_parse = (char *) seqstart + 1;
11100 parse_lparen_question_flags(pRExC_state);
11101 if (UCHARAT(RExC_parse) != ':') {
11102 if (RExC_parse < RExC_end)
11103 nextchar(pRExC_state);
11108 nextchar(pRExC_state);
11113 else if (!(RExC_flags & RXf_PMf_NOCAPTURE)) { /* (...) */
11118 ret = reganode(pRExC_state, OPEN, parno);
11120 if (!RExC_nestroot)
11121 RExC_nestroot = parno;
11122 if (RExC_open_parens && !RExC_open_parens[parno])
11124 DEBUG_OPTIMISE_MORE_r(Perl_re_printf( aTHX_
11125 "%*s%*s Setting open paren #%"IVdf" to %d\n",
11126 22, "| |", (int)(depth * 2 + 1), "",
11127 (IV)parno, REG_NODE_NUM(ret)));
11128 RExC_open_parens[parno]= ret;
11131 Set_Node_Length(ret, 1); /* MJD */
11132 Set_Node_Offset(ret, RExC_parse); /* MJD */
11135 /* with RXf_PMf_NOCAPTURE treat (...) as (?:...) */
11144 /* Pick up the branches, linking them together. */
11145 parse_start = RExC_parse; /* MJD */
11146 br = regbranch(pRExC_state, &flags, 1,depth+1);
11148 /* branch_len = (paren != 0); */
11151 if (flags & (RESTART_PASS1|NEED_UTF8)) {
11152 *flagp = flags & (RESTART_PASS1|NEED_UTF8);
11155 FAIL2("panic: regbranch returned NULL, flags=%#"UVxf"", (UV) flags);
11157 if (*RExC_parse == '|') {
11158 if (!SIZE_ONLY && RExC_extralen) {
11159 reginsert(pRExC_state, BRANCHJ, br, depth+1);
11162 reginsert(pRExC_state, BRANCH, br, depth+1);
11163 Set_Node_Length(br, paren != 0);
11164 Set_Node_Offset_To_R(br-RExC_emit_start, parse_start-RExC_start);
11168 RExC_extralen += 1; /* For BRANCHJ-BRANCH. */
11170 else if (paren == ':') {
11171 *flagp |= flags&SIMPLE;
11173 if (is_open) { /* Starts with OPEN. */
11174 REGTAIL(pRExC_state, ret, br); /* OPEN -> first. */
11176 else if (paren != '?') /* Not Conditional */
11178 *flagp |= flags & (SPSTART | HASWIDTH | POSTPONED);
11180 while (*RExC_parse == '|') {
11181 if (!SIZE_ONLY && RExC_extralen) {
11182 ender = reganode(pRExC_state, LONGJMP,0);
11184 /* Append to the previous. */
11185 REGTAIL(pRExC_state, NEXTOPER(NEXTOPER(lastbr)), ender);
11188 RExC_extralen += 2; /* Account for LONGJMP. */
11189 nextchar(pRExC_state);
11190 if (freeze_paren) {
11191 if (RExC_npar > after_freeze)
11192 after_freeze = RExC_npar;
11193 RExC_npar = freeze_paren;
11195 br = regbranch(pRExC_state, &flags, 0, depth+1);
11198 if (flags & (RESTART_PASS1|NEED_UTF8)) {
11199 *flagp = flags & (RESTART_PASS1|NEED_UTF8);
11202 FAIL2("panic: regbranch returned NULL, flags=%#"UVxf"", (UV) flags);
11204 REGTAIL(pRExC_state, lastbr, br); /* BRANCH -> BRANCH. */
11206 *flagp |= flags & (SPSTART | HASWIDTH | POSTPONED);
11209 if (have_branch || paren != ':') {
11210 /* Make a closing node, and hook it on the end. */
11213 ender = reg_node(pRExC_state, TAIL);
11216 ender = reganode(pRExC_state, CLOSE, parno);
11217 if ( RExC_close_parens ) {
11218 DEBUG_OPTIMISE_MORE_r(Perl_re_printf( aTHX_
11219 "%*s%*s Setting close paren #%"IVdf" to %d\n",
11220 22, "| |", (int)(depth * 2 + 1), "", (IV)parno, REG_NODE_NUM(ender)));
11221 RExC_close_parens[parno]= ender;
11222 if (RExC_nestroot == parno)
11225 Set_Node_Offset(ender,RExC_parse+1); /* MJD */
11226 Set_Node_Length(ender,1); /* MJD */
11232 *flagp &= ~HASWIDTH;
11235 ender = reg_node(pRExC_state, SUCCEED);
11238 ender = reg_node(pRExC_state, END);
11240 assert(!RExC_end_op); /* there can only be one! */
11241 RExC_end_op = ender;
11242 if (RExC_close_parens) {
11243 DEBUG_OPTIMISE_MORE_r(Perl_re_printf( aTHX_
11244 "%*s%*s Setting close paren #0 (END) to %d\n",
11245 22, "| |", (int)(depth * 2 + 1), "", REG_NODE_NUM(ender)));
11247 RExC_close_parens[0]= ender;
11252 DEBUG_PARSE_r(if (!SIZE_ONLY) {
11253 DEBUG_PARSE_MSG("lsbr");
11254 regprop(RExC_rx, RExC_mysv1, lastbr, NULL, pRExC_state);
11255 regprop(RExC_rx, RExC_mysv2, ender, NULL, pRExC_state);
11256 Perl_re_printf( aTHX_ "~ tying lastbr %s (%"IVdf") to ender %s (%"IVdf") offset %"IVdf"\n",
11257 SvPV_nolen_const(RExC_mysv1),
11258 (IV)REG_NODE_NUM(lastbr),
11259 SvPV_nolen_const(RExC_mysv2),
11260 (IV)REG_NODE_NUM(ender),
11261 (IV)(ender - lastbr)
11264 REGTAIL(pRExC_state, lastbr, ender);
11266 if (have_branch && !SIZE_ONLY) {
11267 char is_nothing= 1;
11269 RExC_seen |= REG_TOP_LEVEL_BRANCHES_SEEN;
11271 /* Hook the tails of the branches to the closing node. */
11272 for (br = ret; br; br = regnext(br)) {
11273 const U8 op = PL_regkind[OP(br)];
11274 if (op == BRANCH) {
11275 REGTAIL_STUDY(pRExC_state, NEXTOPER(br), ender);
11276 if ( OP(NEXTOPER(br)) != NOTHING
11277 || regnext(NEXTOPER(br)) != ender)
11280 else if (op == BRANCHJ) {
11281 REGTAIL_STUDY(pRExC_state, NEXTOPER(NEXTOPER(br)), ender);
11282 /* for now we always disable this optimisation * /
11283 if ( OP(NEXTOPER(NEXTOPER(br))) != NOTHING
11284 || regnext(NEXTOPER(NEXTOPER(br))) != ender)
11290 br= PL_regkind[OP(ret)] != BRANCH ? regnext(ret) : ret;
11291 DEBUG_PARSE_r(if (!SIZE_ONLY) {
11292 DEBUG_PARSE_MSG("NADA");
11293 regprop(RExC_rx, RExC_mysv1, ret, NULL, pRExC_state);
11294 regprop(RExC_rx, RExC_mysv2, ender, NULL, pRExC_state);
11295 Perl_re_printf( aTHX_ "~ converting ret %s (%"IVdf") to ender %s (%"IVdf") offset %"IVdf"\n",
11296 SvPV_nolen_const(RExC_mysv1),
11297 (IV)REG_NODE_NUM(ret),
11298 SvPV_nolen_const(RExC_mysv2),
11299 (IV)REG_NODE_NUM(ender),
11304 if (OP(ender) == TAIL) {
11309 for ( opt= br + 1; opt < ender ; opt++ )
11310 OP(opt)= OPTIMIZED;
11311 NEXT_OFF(br)= ender - br;
11319 static const char parens[] = "=!<,>";
11321 if (paren && (p = strchr(parens, paren))) {
11322 U8 node = ((p - parens) % 2) ? UNLESSM : IFMATCH;
11323 int flag = (p - parens) > 1;
11326 node = SUSPEND, flag = 0;
11327 reginsert(pRExC_state, node,ret, depth+1);
11328 Set_Node_Cur_Length(ret, parse_start);
11329 Set_Node_Offset(ret, parse_start + 1);
11331 REGTAIL_STUDY(pRExC_state, ret, reg_node(pRExC_state, TAIL));
11335 /* Check for proper termination. */
11337 /* restore original flags, but keep (?p) and, if we've changed from /d
11338 * rules to /u, keep the /u */
11339 RExC_flags = oregflags | (RExC_flags & RXf_PMf_KEEPCOPY);
11340 if (DEPENDS_SEMANTICS && RExC_uni_semantics) {
11341 set_regex_charset(&RExC_flags, REGEX_UNICODE_CHARSET);
11343 if (RExC_parse >= RExC_end || UCHARAT(RExC_parse) != ')') {
11344 RExC_parse = oregcomp_parse;
11345 vFAIL("Unmatched (");
11347 nextchar(pRExC_state);
11349 else if (!paren && RExC_parse < RExC_end) {
11350 if (*RExC_parse == ')') {
11352 vFAIL("Unmatched )");
11355 FAIL("Junk on end of regexp"); /* "Can't happen". */
11356 NOT_REACHED; /* NOTREACHED */
11359 if (RExC_in_lookbehind) {
11360 RExC_in_lookbehind--;
11362 if (after_freeze > RExC_npar)
11363 RExC_npar = after_freeze;
11368 - regbranch - one alternative of an | operator
11370 * Implements the concatenation operator.
11372 * Returns NULL, setting *flagp to RESTART_PASS1 if the sizing scan needs to be
11373 * restarted, or'd with NEED_UTF8 if the pattern needs to be upgraded to UTF-8
11376 S_regbranch(pTHX_ RExC_state_t *pRExC_state, I32 *flagp, I32 first, U32 depth)
11379 regnode *chain = NULL;
11381 I32 flags = 0, c = 0;
11382 GET_RE_DEBUG_FLAGS_DECL;
11384 PERL_ARGS_ASSERT_REGBRANCH;
11386 DEBUG_PARSE("brnc");
11391 if (!SIZE_ONLY && RExC_extralen)
11392 ret = reganode(pRExC_state, BRANCHJ,0);
11394 ret = reg_node(pRExC_state, BRANCH);
11395 Set_Node_Length(ret, 1);
11399 if (!first && SIZE_ONLY)
11400 RExC_extralen += 1; /* BRANCHJ */
11402 *flagp = WORST; /* Tentatively. */
11404 skip_to_be_ignored_text(pRExC_state, &RExC_parse,
11405 FALSE /* Don't force to /x */ );
11406 while (RExC_parse < RExC_end && *RExC_parse != '|' && *RExC_parse != ')') {
11407 flags &= ~TRYAGAIN;
11408 latest = regpiece(pRExC_state, &flags,depth+1);
11409 if (latest == NULL) {
11410 if (flags & TRYAGAIN)
11412 if (flags & (RESTART_PASS1|NEED_UTF8)) {
11413 *flagp = flags & (RESTART_PASS1|NEED_UTF8);
11416 FAIL2("panic: regpiece returned NULL, flags=%#"UVxf"", (UV) flags);
11418 else if (ret == NULL)
11420 *flagp |= flags&(HASWIDTH|POSTPONED);
11421 if (chain == NULL) /* First piece. */
11422 *flagp |= flags&SPSTART;
11424 /* FIXME adding one for every branch after the first is probably
11425 * excessive now we have TRIE support. (hv) */
11427 REGTAIL(pRExC_state, chain, latest);
11432 if (chain == NULL) { /* Loop ran zero times. */
11433 chain = reg_node(pRExC_state, NOTHING);
11438 *flagp |= flags&SIMPLE;
11445 - regpiece - something followed by possible [*+?]
11447 * Note that the branching code sequences used for ? and the general cases
11448 * of * and + are somewhat optimized: they use the same NOTHING node as
11449 * both the endmarker for their branch list and the body of the last branch.
11450 * It might seem that this node could be dispensed with entirely, but the
11451 * endmarker role is not redundant.
11453 * Returns NULL, setting *flagp to TRYAGAIN if regatom() returns NULL with
11455 * Returns NULL, setting *flagp to RESTART_PASS1 if the sizing scan needs to be
11456 * restarted, or'd with NEED_UTF8 if the pattern needs to be upgraded to UTF-8
11459 S_regpiece(pTHX_ RExC_state_t *pRExC_state, I32 *flagp, U32 depth)
11465 const char * const origparse = RExC_parse;
11467 I32 max = REG_INFTY;
11468 #ifdef RE_TRACK_PATTERN_OFFSETS
11471 const char *maxpos = NULL;
11474 /* Save the original in case we change the emitted regop to a FAIL. */
11475 regnode * const orig_emit = RExC_emit;
11477 GET_RE_DEBUG_FLAGS_DECL;
11479 PERL_ARGS_ASSERT_REGPIECE;
11481 DEBUG_PARSE("piec");
11483 ret = regatom(pRExC_state, &flags,depth+1);
11485 if (flags & (TRYAGAIN|RESTART_PASS1|NEED_UTF8))
11486 *flagp |= flags & (TRYAGAIN|RESTART_PASS1|NEED_UTF8);
11488 FAIL2("panic: regatom returned NULL, flags=%#"UVxf"", (UV) flags);
11494 if (op == '{' && regcurly(RExC_parse)) {
11496 #ifdef RE_TRACK_PATTERN_OFFSETS
11497 parse_start = RExC_parse; /* MJD */
11499 next = RExC_parse + 1;
11500 while (isDIGIT(*next) || *next == ',') {
11501 if (*next == ',') {
11509 if (*next == '}') { /* got one */
11510 const char* endptr;
11514 if (isDIGIT(*RExC_parse)) {
11515 if (!grok_atoUV(RExC_parse, &uv, &endptr))
11516 vFAIL("Invalid quantifier in {,}");
11517 if (uv >= REG_INFTY)
11518 vFAIL2("Quantifier in {,} bigger than %d", REG_INFTY - 1);
11523 if (*maxpos == ',')
11526 maxpos = RExC_parse;
11527 if (isDIGIT(*maxpos)) {
11528 if (!grok_atoUV(maxpos, &uv, &endptr))
11529 vFAIL("Invalid quantifier in {,}");
11530 if (uv >= REG_INFTY)
11531 vFAIL2("Quantifier in {,} bigger than %d", REG_INFTY - 1);
11534 max = REG_INFTY; /* meaning "infinity" */
11537 nextchar(pRExC_state);
11538 if (max < min) { /* If can't match, warn and optimize to fail
11542 /* We can't back off the size because we have to reserve
11543 * enough space for all the things we are about to throw
11544 * away, but we can shrink it by the amount we are about
11545 * to re-use here */
11546 RExC_size += PREVOPER(RExC_size) - regarglen[(U8)OPFAIL];
11549 ckWARNreg(RExC_parse, "Quantifier {n,m} with n > m can't match");
11550 RExC_emit = orig_emit;
11552 ret = reganode(pRExC_state, OPFAIL, 0);
11555 else if (min == max && *RExC_parse == '?')
11558 ckWARN2reg(RExC_parse + 1,
11559 "Useless use of greediness modifier '%c'",
11565 if ((flags&SIMPLE)) {
11566 if (min == 0 && max == REG_INFTY) {
11567 reginsert(pRExC_state, STAR, ret, depth+1);
11570 RExC_seen |= REG_UNBOUNDED_QUANTIFIER_SEEN;
11573 if (min == 1 && max == REG_INFTY) {
11574 reginsert(pRExC_state, PLUS, ret, depth+1);
11577 RExC_seen |= REG_UNBOUNDED_QUANTIFIER_SEEN;
11580 MARK_NAUGHTY_EXP(2, 2);
11581 reginsert(pRExC_state, CURLY, ret, depth+1);
11582 Set_Node_Offset(ret, parse_start+1); /* MJD */
11583 Set_Node_Cur_Length(ret, parse_start);
11586 regnode * const w = reg_node(pRExC_state, WHILEM);
11589 REGTAIL(pRExC_state, ret, w);
11590 if (!SIZE_ONLY && RExC_extralen) {
11591 reginsert(pRExC_state, LONGJMP,ret, depth+1);
11592 reginsert(pRExC_state, NOTHING,ret, depth+1);
11593 NEXT_OFF(ret) = 3; /* Go over LONGJMP. */
11595 reginsert(pRExC_state, CURLYX,ret, depth+1);
11597 Set_Node_Offset(ret, parse_start+1);
11598 Set_Node_Length(ret,
11599 op == '{' ? (RExC_parse - parse_start) : 1);
11601 if (!SIZE_ONLY && RExC_extralen)
11602 NEXT_OFF(ret) = 3; /* Go over NOTHING to LONGJMP. */
11603 REGTAIL(pRExC_state, ret, reg_node(pRExC_state, NOTHING));
11605 RExC_whilem_seen++, RExC_extralen += 3;
11606 MARK_NAUGHTY_EXP(1, 4); /* compound interest */
11613 *flagp |= HASWIDTH;
11615 ARG1_SET(ret, (U16)min);
11616 ARG2_SET(ret, (U16)max);
11618 if (max == REG_INFTY)
11619 RExC_seen |= REG_UNBOUNDED_QUANTIFIER_SEEN;
11625 if (!ISMULT1(op)) {
11630 #if 0 /* Now runtime fix should be reliable. */
11632 /* if this is reinstated, don't forget to put this back into perldiag:
11634 =item Regexp *+ operand could be empty at {#} in regex m/%s/
11636 (F) The part of the regexp subject to either the * or + quantifier
11637 could match an empty string. The {#} shows in the regular
11638 expression about where the problem was discovered.
11642 if (!(flags&HASWIDTH) && op != '?')
11643 vFAIL("Regexp *+ operand could be empty");
11646 #ifdef RE_TRACK_PATTERN_OFFSETS
11647 parse_start = RExC_parse;
11649 nextchar(pRExC_state);
11651 *flagp = (op != '+') ? (WORST|SPSTART|HASWIDTH) : (WORST|HASWIDTH);
11657 else if (op == '+') {
11661 else if (op == '?') {
11666 if (!SIZE_ONLY && !(flags&(HASWIDTH|POSTPONED)) && max > REG_INFTY/3) {
11667 SAVEFREESV(RExC_rx_sv); /* in case of fatal warnings */
11668 ckWARN2reg(RExC_parse,
11669 "%"UTF8f" matches null string many times",
11670 UTF8fARG(UTF, (RExC_parse >= origparse
11671 ? RExC_parse - origparse
11674 (void)ReREFCNT_inc(RExC_rx_sv);
11677 if (*RExC_parse == '?') {
11678 nextchar(pRExC_state);
11679 reginsert(pRExC_state, MINMOD, ret, depth+1);
11680 REGTAIL(pRExC_state, ret, ret + NODE_STEP_REGNODE);
11682 else if (*RExC_parse == '+') {
11684 nextchar(pRExC_state);
11685 ender = reg_node(pRExC_state, SUCCEED);
11686 REGTAIL(pRExC_state, ret, ender);
11687 reginsert(pRExC_state, SUSPEND, ret, depth+1);
11689 ender = reg_node(pRExC_state, TAIL);
11690 REGTAIL(pRExC_state, ret, ender);
11693 if (ISMULT2(RExC_parse)) {
11695 vFAIL("Nested quantifiers");
11702 S_grok_bslash_N(pTHX_ RExC_state_t *pRExC_state,
11711 /* This routine teases apart the various meanings of \N and returns
11712 * accordingly. The input parameters constrain which meaning(s) is/are valid
11713 * in the current context.
11715 * Exactly one of <node_p> and <code_point_p> must be non-NULL.
11717 * If <code_point_p> is not NULL, the context is expecting the result to be a
11718 * single code point. If this \N instance turns out to a single code point,
11719 * the function returns TRUE and sets *code_point_p to that code point.
11721 * If <node_p> is not NULL, the context is expecting the result to be one of
11722 * the things representable by a regnode. If this \N instance turns out to be
11723 * one such, the function generates the regnode, returns TRUE and sets *node_p
11724 * to point to that regnode.
11726 * If this instance of \N isn't legal in any context, this function will
11727 * generate a fatal error and not return.
11729 * On input, RExC_parse should point to the first char following the \N at the
11730 * time of the call. On successful return, RExC_parse will have been updated
11731 * to point to just after the sequence identified by this routine. Also
11732 * *flagp has been updated as needed.
11734 * When there is some problem with the current context and this \N instance,
11735 * the function returns FALSE, without advancing RExC_parse, nor setting
11736 * *node_p, nor *code_point_p, nor *flagp.
11738 * If <cp_count> is not NULL, the caller wants to know the length (in code
11739 * points) that this \N sequence matches. This is set even if the function
11740 * returns FALSE, as detailed below.
11742 * There are 5 possibilities here, as detailed in the next 5 paragraphs.
11744 * Probably the most common case is for the \N to specify a single code point.
11745 * *cp_count will be set to 1, and *code_point_p will be set to that code
11748 * Another possibility is for the input to be an empty \N{}, which for
11749 * backwards compatibility we accept. *cp_count will be set to 0. *node_p
11750 * will be set to a generated NOTHING node.
11752 * Still another possibility is for the \N to mean [^\n]. *cp_count will be
11753 * set to 0. *node_p will be set to a generated REG_ANY node.
11755 * The fourth possibility is that \N resolves to a sequence of more than one
11756 * code points. *cp_count will be set to the number of code points in the
11757 * sequence. *node_p * will be set to a generated node returned by this
11758 * function calling S_reg().
11760 * The final possibility is that it is premature to be calling this function;
11761 * that pass1 needs to be restarted. This can happen when this changes from
11762 * /d to /u rules, or when the pattern needs to be upgraded to UTF-8. The
11763 * latter occurs only when the fourth possibility would otherwise be in
11764 * effect, and is because one of those code points requires the pattern to be
11765 * recompiled as UTF-8. The function returns FALSE, and sets the
11766 * RESTART_PASS1 and NEED_UTF8 flags in *flagp, as appropriate. When this
11767 * happens, the caller needs to desist from continuing parsing, and return
11768 * this information to its caller. This is not set for when there is only one
11769 * code point, as this can be called as part of an ANYOF node, and they can
11770 * store above-Latin1 code points without the pattern having to be in UTF-8.
11772 * For non-single-quoted regexes, the tokenizer has resolved character and
11773 * sequence names inside \N{...} into their Unicode values, normalizing the
11774 * result into what we should see here: '\N{U+c1.c2...}', where c1... are the
11775 * hex-represented code points in the sequence. This is done there because
11776 * the names can vary based on what charnames pragma is in scope at the time,
11777 * so we need a way to take a snapshot of what they resolve to at the time of
11778 * the original parse. [perl #56444].
11780 * That parsing is skipped for single-quoted regexes, so we may here get
11781 * '\N{NAME}'. This is a fatal error. These names have to be resolved by the
11782 * parser. But if the single-quoted regex is something like '\N{U+41}', that
11783 * is legal and handled here. The code point is Unicode, and has to be
11784 * translated into the native character set for non-ASCII platforms.
11787 char * endbrace; /* points to '}' following the name */
11788 char *endchar; /* Points to '.' or '}' ending cur char in the input
11790 char* p = RExC_parse; /* Temporary */
11792 GET_RE_DEBUG_FLAGS_DECL;
11794 PERL_ARGS_ASSERT_GROK_BSLASH_N;
11796 GET_RE_DEBUG_FLAGS;
11798 assert(cBOOL(node_p) ^ cBOOL(code_point_p)); /* Exactly one should be set */
11799 assert(! (node_p && cp_count)); /* At most 1 should be set */
11801 if (cp_count) { /* Initialize return for the most common case */
11805 /* The [^\n] meaning of \N ignores spaces and comments under the /x
11806 * modifier. The other meanings do not, so use a temporary until we find
11807 * out which we are being called with */
11808 skip_to_be_ignored_text(pRExC_state, &p,
11809 FALSE /* Don't force to /x */ );
11811 /* Disambiguate between \N meaning a named character versus \N meaning
11812 * [^\n]. The latter is assumed when the {...} following the \N is a legal
11813 * quantifier, or there is no '{' at all */
11814 if (*p != '{' || regcurly(p)) {
11824 *node_p = reg_node(pRExC_state, REG_ANY);
11825 *flagp |= HASWIDTH|SIMPLE;
11827 Set_Node_Length(*node_p, 1); /* MJD */
11831 /* Here, we have decided it should be a named character or sequence */
11833 /* The test above made sure that the next real character is a '{', but
11834 * under the /x modifier, it could be separated by space (or a comment and
11835 * \n) and this is not allowed (for consistency with \x{...} and the
11836 * tokenizer handling of \N{NAME}). */
11837 if (*RExC_parse != '{') {
11838 vFAIL("Missing braces on \\N{}");
11841 RExC_parse++; /* Skip past the '{' */
11843 if (! (endbrace = strchr(RExC_parse, '}')) /* no trailing brace */
11844 || ! (endbrace == RExC_parse /* nothing between the {} */
11845 || (endbrace - RExC_parse >= 2 /* U+ (bad hex is checked... */
11846 && strnEQ(RExC_parse, "U+", 2)))) /* ... below for a better
11849 if (endbrace) RExC_parse = endbrace; /* position msg's '<--HERE' */
11850 vFAIL("\\N{NAME} must be resolved by the lexer");
11853 REQUIRE_UNI_RULES(flagp, FALSE); /* Unicode named chars imply Unicode
11856 if (endbrace == RExC_parse) { /* empty: \N{} */
11858 RExC_parse++; /* Position after the "}" */
11859 vFAIL("Zero length \\N{}");
11864 nextchar(pRExC_state);
11869 *node_p = reg_node(pRExC_state,NOTHING);
11873 RExC_parse += 2; /* Skip past the 'U+' */
11875 /* Because toke.c has generated a special construct for us guaranteed not
11876 * to have NULs, we can use a str function */
11877 endchar = RExC_parse + strcspn(RExC_parse, ".}");
11879 /* Code points are separated by dots. If none, there is only one code
11880 * point, and is terminated by the brace */
11882 if (endchar >= endbrace) {
11883 STRLEN length_of_hex;
11884 I32 grok_hex_flags;
11886 /* Here, exactly one code point. If that isn't what is wanted, fail */
11887 if (! code_point_p) {
11892 /* Convert code point from hex */
11893 length_of_hex = (STRLEN)(endchar - RExC_parse);
11894 grok_hex_flags = PERL_SCAN_ALLOW_UNDERSCORES
11895 | PERL_SCAN_DISALLOW_PREFIX
11897 /* No errors in the first pass (See [perl
11898 * #122671].) We let the code below find the
11899 * errors when there are multiple chars. */
11901 ? PERL_SCAN_SILENT_ILLDIGIT
11904 /* This routine is the one place where both single- and double-quotish
11905 * \N{U+xxxx} are evaluated. The value is a Unicode code point which
11906 * must be converted to native. */
11907 *code_point_p = UNI_TO_NATIVE(grok_hex(RExC_parse,
11912 /* The tokenizer should have guaranteed validity, but it's possible to
11913 * bypass it by using single quoting, so check. Don't do the check
11914 * here when there are multiple chars; we do it below anyway. */
11915 if (length_of_hex == 0
11916 || length_of_hex != (STRLEN)(endchar - RExC_parse) )
11918 RExC_parse += length_of_hex; /* Includes all the valid */
11919 RExC_parse += (RExC_orig_utf8) /* point to after 1st invalid */
11920 ? UTF8SKIP(RExC_parse)
11922 /* Guard against malformed utf8 */
11923 if (RExC_parse >= endchar) {
11924 RExC_parse = endchar;
11926 vFAIL("Invalid hexadecimal number in \\N{U+...}");
11929 RExC_parse = endbrace + 1;
11932 else { /* Is a multiple character sequence */
11933 SV * substitute_parse;
11935 char *orig_end = RExC_end;
11936 char *save_start = RExC_start;
11939 /* Count the code points, if desired, in the sequence */
11942 while (RExC_parse < endbrace) {
11943 /* Point to the beginning of the next character in the sequence. */
11944 RExC_parse = endchar + 1;
11945 endchar = RExC_parse + strcspn(RExC_parse, ".}");
11950 /* Fail if caller doesn't want to handle a multi-code-point sequence.
11951 * But don't backup up the pointer if the caller want to know how many
11952 * code points there are (they can then handle things) */
11960 /* What is done here is to convert this to a sub-pattern of the form
11961 * \x{char1}\x{char2}... and then call reg recursively to parse it
11962 * (enclosing in "(?: ... )" ). That way, it retains its atomicness,
11963 * while not having to worry about special handling that some code
11964 * points may have. */
11966 substitute_parse = newSVpvs("?:");
11968 while (RExC_parse < endbrace) {
11970 /* Convert to notation the rest of the code understands */
11971 sv_catpv(substitute_parse, "\\x{");
11972 sv_catpvn(substitute_parse, RExC_parse, endchar - RExC_parse);
11973 sv_catpv(substitute_parse, "}");
11975 /* Point to the beginning of the next character in the sequence. */
11976 RExC_parse = endchar + 1;
11977 endchar = RExC_parse + strcspn(RExC_parse, ".}");
11980 sv_catpv(substitute_parse, ")");
11982 RExC_parse = RExC_start = RExC_adjusted_start = SvPV(substitute_parse,
11985 /* Don't allow empty number */
11986 if (len < (STRLEN) 8) {
11987 RExC_parse = endbrace;
11988 vFAIL("Invalid hexadecimal number in \\N{U+...}");
11990 RExC_end = RExC_parse + len;
11992 /* The values are Unicode, and therefore not subject to recoding, but
11993 * have to be converted to native on a non-Unicode (meaning non-ASCII)
11995 RExC_override_recoding = 1;
11997 RExC_recode_x_to_native = 1;
12001 if (!(*node_p = reg(pRExC_state, 1, &flags, depth+1))) {
12002 if (flags & (RESTART_PASS1|NEED_UTF8)) {
12003 *flagp = flags & (RESTART_PASS1|NEED_UTF8);
12006 FAIL2("panic: reg returned NULL to grok_bslash_N, flags=%#"UVxf"",
12009 *flagp |= flags&(HASWIDTH|SPSTART|SIMPLE|POSTPONED);
12012 /* Restore the saved values */
12013 RExC_start = RExC_adjusted_start = save_start;
12014 RExC_parse = endbrace;
12015 RExC_end = orig_end;
12016 RExC_override_recoding = 0;
12018 RExC_recode_x_to_native = 0;
12021 SvREFCNT_dec_NN(substitute_parse);
12022 nextchar(pRExC_state);
12032 * It returns the code point in utf8 for the value in *encp.
12033 * value: a code value in the source encoding
12034 * encp: a pointer to an Encode object
12036 * If the result from Encode is not a single character,
12037 * it returns U+FFFD (Replacement character) and sets *encp to NULL.
12040 S_reg_recode(pTHX_ const U8 value, SV **encp)
12043 SV * const sv = newSVpvn_flags((const char *) &value, numlen, SVs_TEMP);
12044 const char * const s = *encp ? sv_recode_to_utf8(sv, *encp) : SvPVX(sv);
12045 const STRLEN newlen = SvCUR(sv);
12046 UV uv = UNICODE_REPLACEMENT;
12048 PERL_ARGS_ASSERT_REG_RECODE;
12052 ? utf8n_to_uvchr((U8*)s, newlen, &numlen, UTF8_ALLOW_DEFAULT)
12055 if (!newlen || numlen != newlen) {
12056 uv = UNICODE_REPLACEMENT;
12062 PERL_STATIC_INLINE U8
12063 S_compute_EXACTish(RExC_state_t *pRExC_state)
12067 PERL_ARGS_ASSERT_COMPUTE_EXACTISH;
12075 op = get_regex_charset(RExC_flags);
12076 if (op >= REGEX_ASCII_RESTRICTED_CHARSET) {
12077 op--; /* /a is same as /u, and map /aa's offset to what /a's would have
12078 been, so there is no hole */
12081 return op + EXACTF;
12084 PERL_STATIC_INLINE void
12085 S_alloc_maybe_populate_EXACT(pTHX_ RExC_state_t *pRExC_state,
12086 regnode *node, I32* flagp, STRLEN len, UV code_point,
12089 /* This knows the details about sizing an EXACTish node, setting flags for
12090 * it (by setting <*flagp>, and potentially populating it with a single
12093 * If <len> (the length in bytes) is non-zero, this function assumes that
12094 * the node has already been populated, and just does the sizing. In this
12095 * case <code_point> should be the final code point that has already been
12096 * placed into the node. This value will be ignored except that under some
12097 * circumstances <*flagp> is set based on it.
12099 * If <len> is zero, the function assumes that the node is to contain only
12100 * the single character given by <code_point> and calculates what <len>
12101 * should be. In pass 1, it sizes the node appropriately. In pass 2, it
12102 * additionally will populate the node's STRING with <code_point> or its
12105 * In both cases <*flagp> is appropriately set
12107 * It knows that under FOLD, the Latin Sharp S and UTF characters above
12108 * 255, must be folded (the former only when the rules indicate it can
12111 * When it does the populating, it looks at the flag 'downgradable'. If
12112 * true with a node that folds, it checks if the single code point
12113 * participates in a fold, and if not downgrades the node to an EXACT.
12114 * This helps the optimizer */
12116 bool len_passed_in = cBOOL(len != 0);
12117 U8 character[UTF8_MAXBYTES_CASE+1];
12119 PERL_ARGS_ASSERT_ALLOC_MAYBE_POPULATE_EXACT;
12121 /* Don't bother to check for downgrading in PASS1, as it doesn't make any
12122 * sizing difference, and is extra work that is thrown away */
12123 if (downgradable && ! PASS2) {
12124 downgradable = FALSE;
12127 if (! len_passed_in) {
12129 if (UVCHR_IS_INVARIANT(code_point)) {
12130 if (LOC || ! FOLD) { /* /l defers folding until runtime */
12131 *character = (U8) code_point;
12133 else { /* Here is /i and not /l. (toFOLD() is defined on just
12134 ASCII, which isn't the same thing as INVARIANT on
12135 EBCDIC, but it works there, as the extra invariants
12136 fold to themselves) */
12137 *character = toFOLD((U8) code_point);
12139 /* We can downgrade to an EXACT node if this character
12140 * isn't a folding one. Note that this assumes that
12141 * nothing above Latin1 folds to some other invariant than
12142 * one of these alphabetics; otherwise we would also have
12144 * && (! HAS_NONLATIN1_FOLD_CLOSURE(code_point)
12145 * || ASCII_FOLD_RESTRICTED))
12147 if (downgradable && PL_fold[code_point] == code_point) {
12153 else if (FOLD && (! LOC
12154 || ! is_PROBLEMATIC_LOCALE_FOLD_cp(code_point)))
12155 { /* Folding, and ok to do so now */
12156 UV folded = _to_uni_fold_flags(
12160 FOLD_FLAGS_FULL | ((ASCII_FOLD_RESTRICTED)
12161 ? FOLD_FLAGS_NOMIX_ASCII
12164 && folded == code_point /* This quickly rules out many
12165 cases, avoiding the
12166 _invlist_contains_cp() overhead
12168 && ! _invlist_contains_cp(PL_utf8_foldable, code_point))
12175 else if (code_point <= MAX_UTF8_TWO_BYTE) {
12177 /* Not folding this cp, and can output it directly */
12178 *character = UTF8_TWO_BYTE_HI(code_point);
12179 *(character + 1) = UTF8_TWO_BYTE_LO(code_point);
12183 uvchr_to_utf8( character, code_point);
12184 len = UTF8SKIP(character);
12186 } /* Else pattern isn't UTF8. */
12188 *character = (U8) code_point;
12190 } /* Else is folded non-UTF8 */
12191 #if UNICODE_MAJOR_VERSION > 3 /* no multifolds in early Unicode */ \
12192 || (UNICODE_MAJOR_VERSION == 3 && ( UNICODE_DOT_VERSION > 0) \
12193 || UNICODE_DOT_DOT_VERSION > 0)
12194 else if (LIKELY(code_point != LATIN_SMALL_LETTER_SHARP_S)) {
12198 /* We don't fold any non-UTF8 except possibly the Sharp s (see
12199 * comments at join_exact()); */
12200 *character = (U8) code_point;
12203 /* Can turn into an EXACT node if we know the fold at compile time,
12204 * and it folds to itself and doesn't particpate in other folds */
12207 && PL_fold_latin1[code_point] == code_point
12208 && (! HAS_NONLATIN1_FOLD_CLOSURE(code_point)
12209 || (isASCII(code_point) && ASCII_FOLD_RESTRICTED)))
12213 } /* else is Sharp s. May need to fold it */
12214 else if (AT_LEAST_UNI_SEMANTICS && ! ASCII_FOLD_RESTRICTED) {
12216 *(character + 1) = 's';
12220 *character = LATIN_SMALL_LETTER_SHARP_S;
12226 RExC_size += STR_SZ(len);
12229 RExC_emit += STR_SZ(len);
12230 STR_LEN(node) = len;
12231 if (! len_passed_in) {
12232 Copy((char *) character, STRING(node), len, char);
12236 *flagp |= HASWIDTH;
12238 /* A single character node is SIMPLE, except for the special-cased SHARP S
12240 if ((len == 1 || (UTF && len == UVCHR_SKIP(code_point)))
12241 #if UNICODE_MAJOR_VERSION > 3 /* no multifolds in early Unicode */ \
12242 || (UNICODE_MAJOR_VERSION == 3 && ( UNICODE_DOT_VERSION > 0) \
12243 || UNICODE_DOT_DOT_VERSION > 0)
12244 && ( code_point != LATIN_SMALL_LETTER_SHARP_S
12245 || ! FOLD || ! DEPENDS_SEMANTICS)
12251 /* The OP may not be well defined in PASS1 */
12252 if (PASS2 && OP(node) == EXACTFL) {
12253 RExC_contains_locale = 1;
12258 /* Parse backref decimal value, unless it's too big to sensibly be a backref,
12259 * in which case return I32_MAX (rather than possibly 32-bit wrapping) */
12262 S_backref_value(char *p)
12264 const char* endptr;
12266 if (grok_atoUV(p, &val, &endptr) && val <= I32_MAX)
12273 - regatom - the lowest level
12275 Try to identify anything special at the start of the pattern. If there
12276 is, then handle it as required. This may involve generating a single regop,
12277 such as for an assertion; or it may involve recursing, such as to
12278 handle a () structure.
12280 If the string doesn't start with something special then we gobble up
12281 as much literal text as we can.
12283 Once we have been able to handle whatever type of thing started the
12284 sequence, we return.
12286 Note: we have to be careful with escapes, as they can be both literal
12287 and special, and in the case of \10 and friends, context determines which.
12289 A summary of the code structure is:
12291 switch (first_byte) {
12292 cases for each special:
12293 handle this special;
12296 switch (2nd byte) {
12297 cases for each unambiguous special:
12298 handle this special;
12300 cases for each ambigous special/literal:
12302 if (special) handle here
12304 default: // unambiguously literal:
12307 default: // is a literal char
12310 create EXACTish node for literal;
12311 while (more input and node isn't full) {
12312 switch (input_byte) {
12313 cases for each special;
12314 make sure parse pointer is set so that the next call to
12315 regatom will see this special first
12316 goto loopdone; // EXACTish node terminated by prev. char
12318 append char to EXACTISH node;
12320 get next input byte;
12324 return the generated node;
12326 Specifically there are two separate switches for handling
12327 escape sequences, with the one for handling literal escapes requiring
12328 a dummy entry for all of the special escapes that are actually handled
12331 Returns NULL, setting *flagp to TRYAGAIN if reg() returns NULL with
12333 Returns NULL, setting *flagp to RESTART_PASS1 if the sizing scan needs to be
12334 restarted, or'd with NEED_UTF8 if the pattern needs to be upgraded to UTF-8
12335 Otherwise does not return NULL.
12339 S_regatom(pTHX_ RExC_state_t *pRExC_state, I32 *flagp, U32 depth)
12341 regnode *ret = NULL;
12348 GET_RE_DEBUG_FLAGS_DECL;
12350 *flagp = WORST; /* Tentatively. */
12352 DEBUG_PARSE("atom");
12354 PERL_ARGS_ASSERT_REGATOM;
12357 parse_start = RExC_parse;
12358 assert(RExC_parse < RExC_end);
12359 switch ((U8)*RExC_parse) {
12361 RExC_seen_zerolen++;
12362 nextchar(pRExC_state);
12363 if (RExC_flags & RXf_PMf_MULTILINE)
12364 ret = reg_node(pRExC_state, MBOL);
12366 ret = reg_node(pRExC_state, SBOL);
12367 Set_Node_Length(ret, 1); /* MJD */
12370 nextchar(pRExC_state);
12372 RExC_seen_zerolen++;
12373 if (RExC_flags & RXf_PMf_MULTILINE)
12374 ret = reg_node(pRExC_state, MEOL);
12376 ret = reg_node(pRExC_state, SEOL);
12377 Set_Node_Length(ret, 1); /* MJD */
12380 nextchar(pRExC_state);
12381 if (RExC_flags & RXf_PMf_SINGLELINE)
12382 ret = reg_node(pRExC_state, SANY);
12384 ret = reg_node(pRExC_state, REG_ANY);
12385 *flagp |= HASWIDTH|SIMPLE;
12387 Set_Node_Length(ret, 1); /* MJD */
12391 char * const oregcomp_parse = ++RExC_parse;
12392 ret = regclass(pRExC_state, flagp,depth+1,
12393 FALSE, /* means parse the whole char class */
12394 TRUE, /* allow multi-char folds */
12395 FALSE, /* don't silence non-portable warnings. */
12396 (bool) RExC_strict,
12397 TRUE, /* Allow an optimized regnode result */
12401 if (*flagp & (RESTART_PASS1|NEED_UTF8))
12403 FAIL2("panic: regclass returned NULL to regatom, flags=%#"UVxf"",
12406 if (*RExC_parse != ']') {
12407 RExC_parse = oregcomp_parse;
12408 vFAIL("Unmatched [");
12410 nextchar(pRExC_state);
12411 Set_Node_Length(ret, RExC_parse - oregcomp_parse + 1); /* MJD */
12415 nextchar(pRExC_state);
12416 ret = reg(pRExC_state, 2, &flags,depth+1);
12418 if (flags & TRYAGAIN) {
12419 if (RExC_parse >= RExC_end) {
12420 /* Make parent create an empty node if needed. */
12421 *flagp |= TRYAGAIN;
12426 if (flags & (RESTART_PASS1|NEED_UTF8)) {
12427 *flagp = flags & (RESTART_PASS1|NEED_UTF8);
12430 FAIL2("panic: reg returned NULL to regatom, flags=%#"UVxf"",
12433 *flagp |= flags&(HASWIDTH|SPSTART|SIMPLE|POSTPONED);
12437 if (flags & TRYAGAIN) {
12438 *flagp |= TRYAGAIN;
12441 vFAIL("Internal urp");
12442 /* Supposed to be caught earlier. */
12448 vFAIL("Quantifier follows nothing");
12453 This switch handles escape sequences that resolve to some kind
12454 of special regop and not to literal text. Escape sequnces that
12455 resolve to literal text are handled below in the switch marked
12458 Every entry in this switch *must* have a corresponding entry
12459 in the literal escape switch. However, the opposite is not
12460 required, as the default for this switch is to jump to the
12461 literal text handling code.
12464 switch ((U8)*RExC_parse) {
12465 /* Special Escapes */
12467 RExC_seen_zerolen++;
12468 ret = reg_node(pRExC_state, SBOL);
12469 /* SBOL is shared with /^/ so we set the flags so we can tell
12470 * /\A/ from /^/ in split. We check ret because first pass we
12471 * have no regop struct to set the flags on. */
12475 goto finish_meta_pat;
12477 ret = reg_node(pRExC_state, GPOS);
12478 RExC_seen |= REG_GPOS_SEEN;
12480 goto finish_meta_pat;
12482 RExC_seen_zerolen++;
12483 ret = reg_node(pRExC_state, KEEPS);
12485 /* XXX:dmq : disabling in-place substitution seems to
12486 * be necessary here to avoid cases of memory corruption, as
12487 * with: C<$_="x" x 80; s/x\K/y/> -- rgs
12489 RExC_seen |= REG_LOOKBEHIND_SEEN;
12490 goto finish_meta_pat;
12492 ret = reg_node(pRExC_state, SEOL);
12494 RExC_seen_zerolen++; /* Do not optimize RE away */
12495 goto finish_meta_pat;
12497 ret = reg_node(pRExC_state, EOS);
12499 RExC_seen_zerolen++; /* Do not optimize RE away */
12500 goto finish_meta_pat;
12502 vFAIL("\\C no longer supported");
12504 ret = reg_node(pRExC_state, CLUMP);
12505 *flagp |= HASWIDTH;
12506 goto finish_meta_pat;
12512 arg = ANYOF_WORDCHAR;
12520 regex_charset charset = get_regex_charset(RExC_flags);
12522 RExC_seen_zerolen++;
12523 RExC_seen |= REG_LOOKBEHIND_SEEN;
12524 op = BOUND + charset;
12526 if (op == BOUNDL) {
12527 RExC_contains_locale = 1;
12530 ret = reg_node(pRExC_state, op);
12532 if (RExC_parse >= RExC_end || *(RExC_parse + 1) != '{') {
12533 FLAGS(ret) = TRADITIONAL_BOUND;
12534 if (PASS2 && op > BOUNDA) { /* /aa is same as /a */
12540 char name = *RExC_parse;
12543 endbrace = strchr(RExC_parse, '}');
12546 vFAIL2("Missing right brace on \\%c{}", name);
12548 /* XXX Need to decide whether to take spaces or not. Should be
12549 * consistent with \p{}, but that currently is SPACE, which
12550 * means vertical too, which seems wrong
12551 * while (isBLANK(*RExC_parse)) {
12554 if (endbrace == RExC_parse) {
12555 RExC_parse++; /* After the '}' */
12556 vFAIL2("Empty \\%c{}", name);
12558 length = endbrace - RExC_parse;
12559 /*while (isBLANK(*(RExC_parse + length - 1))) {
12562 switch (*RExC_parse) {
12565 && (length != 3 || strnNE(RExC_parse + 1, "cb", 2)))
12567 goto bad_bound_type;
12569 FLAGS(ret) = GCB_BOUND;
12572 if (length != 2 || *(RExC_parse + 1) != 'b') {
12573 goto bad_bound_type;
12575 FLAGS(ret) = LB_BOUND;
12578 if (length != 2 || *(RExC_parse + 1) != 'b') {
12579 goto bad_bound_type;
12581 FLAGS(ret) = SB_BOUND;
12584 if (length != 2 || *(RExC_parse + 1) != 'b') {
12585 goto bad_bound_type;
12587 FLAGS(ret) = WB_BOUND;
12591 RExC_parse = endbrace;
12593 "'%"UTF8f"' is an unknown bound type",
12594 UTF8fARG(UTF, length, endbrace - length));
12595 NOT_REACHED; /*NOTREACHED*/
12597 RExC_parse = endbrace;
12598 REQUIRE_UNI_RULES(flagp, NULL);
12600 if (PASS2 && op >= BOUNDA) { /* /aa is same as /a */
12604 /* Don't have to worry about UTF-8, in this message because
12605 * to get here the contents of the \b must be ASCII */
12606 ckWARN4reg(RExC_parse + 1, /* Include the '}' in msg */
12607 "Using /u for '%.*s' instead of /%s",
12609 endbrace - length + 1,
12610 (charset == REGEX_ASCII_RESTRICTED_CHARSET)
12611 ? ASCII_RESTRICT_PAT_MODS
12612 : ASCII_MORE_RESTRICT_PAT_MODS);
12616 if (PASS2 && invert) {
12617 OP(ret) += NBOUND - BOUND;
12619 goto finish_meta_pat;
12627 if (! DEPENDS_SEMANTICS) {
12631 /* \d doesn't have any matches in the upper Latin1 range, hence /d
12632 * is equivalent to /u. Changing to /u saves some branches at
12635 goto join_posix_op_known;
12638 ret = reg_node(pRExC_state, LNBREAK);
12639 *flagp |= HASWIDTH|SIMPLE;
12640 goto finish_meta_pat;
12648 goto join_posix_op_known;
12654 arg = ANYOF_VERTWS;
12656 goto join_posix_op_known;
12666 op = POSIXD + get_regex_charset(RExC_flags);
12667 if (op > POSIXA) { /* /aa is same as /a */
12670 else if (op == POSIXL) {
12671 RExC_contains_locale = 1;
12674 join_posix_op_known:
12677 op += NPOSIXD - POSIXD;
12680 ret = reg_node(pRExC_state, op);
12682 FLAGS(ret) = namedclass_to_classnum(arg);
12685 *flagp |= HASWIDTH|SIMPLE;
12689 nextchar(pRExC_state);
12690 Set_Node_Length(ret, 2); /* MJD */
12696 ret = regclass(pRExC_state, flagp,depth+1,
12697 TRUE, /* means just parse this element */
12698 FALSE, /* don't allow multi-char folds */
12699 FALSE, /* don't silence non-portable warnings. It
12700 would be a bug if these returned
12702 (bool) RExC_strict,
12703 TRUE, /* Allow an optimized regnode result */
12706 if (*flagp & RESTART_PASS1)
12708 /* regclass() can only return RESTART_PASS1 and NEED_UTF8 if
12709 * multi-char folds are allowed. */
12711 FAIL2("panic: regclass returned NULL to regatom, flags=%#"UVxf"",
12716 Set_Node_Offset(ret, parse_start);
12717 Set_Node_Cur_Length(ret, parse_start - 2);
12718 nextchar(pRExC_state);
12721 /* Handle \N, \N{} and \N{NAMED SEQUENCE} (the latter meaning the
12722 * \N{...} evaluates to a sequence of more than one code points).
12723 * The function call below returns a regnode, which is our result.
12724 * The parameters cause it to fail if the \N{} evaluates to a
12725 * single code point; we handle those like any other literal. The
12726 * reason that the multicharacter case is handled here and not as
12727 * part of the EXACtish code is because of quantifiers. In
12728 * /\N{BLAH}+/, the '+' applies to the whole thing, and doing it
12729 * this way makes that Just Happen. dmq.
12730 * join_exact() will join this up with adjacent EXACTish nodes
12731 * later on, if appropriate. */
12733 if (grok_bslash_N(pRExC_state,
12734 &ret, /* Want a regnode returned */
12735 NULL, /* Fail if evaluates to a single code
12737 NULL, /* Don't need a count of how many code
12746 if (*flagp & RESTART_PASS1)
12749 /* Here, evaluates to a single code point. Go get that */
12750 RExC_parse = parse_start;
12753 case 'k': /* Handle \k<NAME> and \k'NAME' */
12757 if ( RExC_parse >= RExC_end - 1
12758 || (( ch = RExC_parse[1]) != '<'
12763 /* diag_listed_as: Sequence \%s... not terminated in regex; marked by <-- HERE in m/%s/ */
12764 vFAIL2("Sequence %.2s... not terminated",parse_start);
12767 ret = handle_named_backref(pRExC_state,
12779 case '1': case '2': case '3': case '4':
12780 case '5': case '6': case '7': case '8': case '9':
12785 if (*RExC_parse == 'g') {
12789 if (*RExC_parse == '{') {
12793 if (*RExC_parse == '-') {
12797 if (hasbrace && !isDIGIT(*RExC_parse)) {
12798 if (isrel) RExC_parse--;
12800 goto parse_named_seq;
12803 if (RExC_parse >= RExC_end) {
12804 goto unterminated_g;
12806 num = S_backref_value(RExC_parse);
12808 vFAIL("Reference to invalid group 0");
12809 else if (num == I32_MAX) {
12810 if (isDIGIT(*RExC_parse))
12811 vFAIL("Reference to nonexistent group");
12814 vFAIL("Unterminated \\g... pattern");
12818 num = RExC_npar - num;
12820 vFAIL("Reference to nonexistent or unclosed group");
12824 num = S_backref_value(RExC_parse);
12825 /* bare \NNN might be backref or octal - if it is larger
12826 * than or equal RExC_npar then it is assumed to be an
12827 * octal escape. Note RExC_npar is +1 from the actual
12828 * number of parens. */
12829 /* Note we do NOT check if num == I32_MAX here, as that is
12830 * handled by the RExC_npar check */
12833 /* any numeric escape < 10 is always a backref */
12835 /* any numeric escape < RExC_npar is a backref */
12836 && num >= RExC_npar
12837 /* cannot be an octal escape if it starts with 8 */
12838 && *RExC_parse != '8'
12839 /* cannot be an octal escape it it starts with 9 */
12840 && *RExC_parse != '9'
12843 /* Probably not a backref, instead likely to be an
12844 * octal character escape, e.g. \35 or \777.
12845 * The above logic should make it obvious why using
12846 * octal escapes in patterns is problematic. - Yves */
12847 RExC_parse = parse_start;
12852 /* At this point RExC_parse points at a numeric escape like
12853 * \12 or \88 or something similar, which we should NOT treat
12854 * as an octal escape. It may or may not be a valid backref
12855 * escape. For instance \88888888 is unlikely to be a valid
12857 while (isDIGIT(*RExC_parse))
12860 if (*RExC_parse != '}')
12861 vFAIL("Unterminated \\g{...} pattern");
12865 if (num > (I32)RExC_rx->nparens)
12866 vFAIL("Reference to nonexistent group");
12869 ret = reganode(pRExC_state,
12872 : (ASCII_FOLD_RESTRICTED)
12874 : (AT_LEAST_UNI_SEMANTICS)
12880 *flagp |= HASWIDTH;
12882 /* override incorrect value set in reganode MJD */
12883 Set_Node_Offset(ret, parse_start);
12884 Set_Node_Cur_Length(ret, parse_start-1);
12885 skip_to_be_ignored_text(pRExC_state, &RExC_parse,
12886 FALSE /* Don't force to /x */ );
12890 if (RExC_parse >= RExC_end)
12891 FAIL("Trailing \\");
12894 /* Do not generate "unrecognized" warnings here, we fall
12895 back into the quick-grab loop below */
12896 RExC_parse = parse_start;
12898 } /* end of switch on a \foo sequence */
12903 /* '#' comments should have been spaced over before this function was
12905 assert((RExC_flags & RXf_PMf_EXTENDED) == 0);
12907 if (RExC_flags & RXf_PMf_EXTENDED) {
12908 RExC_parse = reg_skipcomment( pRExC_state, RExC_parse );
12909 if (RExC_parse < RExC_end)
12919 /* Here, we have determined that the next thing is probably a
12920 * literal character. RExC_parse points to the first byte of its
12921 * definition. (It still may be an escape sequence that evaluates
12922 * to a single character) */
12928 #define MAX_NODE_STRING_SIZE 127
12929 char foldbuf[MAX_NODE_STRING_SIZE+UTF8_MAXBYTES_CASE];
12931 U8 upper_parse = MAX_NODE_STRING_SIZE;
12932 U8 node_type = compute_EXACTish(pRExC_state);
12933 bool next_is_quantifier;
12934 char * oldp = NULL;
12936 /* We can convert EXACTF nodes to EXACTFU if they contain only
12937 * characters that match identically regardless of the target
12938 * string's UTF8ness. The reason to do this is that EXACTF is not
12939 * trie-able, EXACTFU is.
12941 * Similarly, we can convert EXACTFL nodes to EXACTFLU8 if they
12942 * contain only above-Latin1 characters (hence must be in UTF8),
12943 * which don't participate in folds with Latin1-range characters,
12944 * as the latter's folds aren't known until runtime. (We don't
12945 * need to figure this out until pass 2) */
12946 bool maybe_exactfu = PASS2
12947 && (node_type == EXACTF || node_type == EXACTFL);
12949 /* If a folding node contains only code points that don't
12950 * participate in folds, it can be changed into an EXACT node,
12951 * which allows the optimizer more things to look for */
12954 ret = reg_node(pRExC_state, node_type);
12956 /* In pass1, folded, we use a temporary buffer instead of the
12957 * actual node, as the node doesn't exist yet */
12958 s = (SIZE_ONLY && FOLD) ? foldbuf : STRING(ret);
12964 /* We look for the EXACTFish to EXACT node optimizaton only if
12965 * folding. (And we don't need to figure this out until pass 2).
12966 * XXX It might actually make sense to split the node into portions
12967 * that are exact and ones that aren't, so that we could later use
12968 * the exact ones to find the longest fixed and floating strings.
12969 * One would want to join them back into a larger node. One could
12970 * use a pseudo regnode like 'EXACT_ORIG_FOLD' */
12971 maybe_exact = FOLD && PASS2;
12973 /* XXX The node can hold up to 255 bytes, yet this only goes to
12974 * 127. I (khw) do not know why. Keeping it somewhat less than
12975 * 255 allows us to not have to worry about overflow due to
12976 * converting to utf8 and fold expansion, but that value is
12977 * 255-UTF8_MAXBYTES_CASE. join_exact() may join adjacent nodes
12978 * split up by this limit into a single one using the real max of
12979 * 255. Even at 127, this breaks under rare circumstances. If
12980 * folding, we do not want to split a node at a character that is a
12981 * non-final in a multi-char fold, as an input string could just
12982 * happen to want to match across the node boundary. The join
12983 * would solve that problem if the join actually happens. But a
12984 * series of more than two nodes in a row each of 127 would cause
12985 * the first join to succeed to get to 254, but then there wouldn't
12986 * be room for the next one, which could at be one of those split
12987 * multi-char folds. I don't know of any fool-proof solution. One
12988 * could back off to end with only a code point that isn't such a
12989 * non-final, but it is possible for there not to be any in the
12992 assert( ! UTF /* Is at the beginning of a character */
12993 || UTF8_IS_INVARIANT(UCHARAT(RExC_parse))
12994 || UTF8_IS_START(UCHARAT(RExC_parse)));
12996 for (p = RExC_parse;
12997 len < upper_parse && p < RExC_end;
13002 /* White space has already been ignored */
13003 assert( (RExC_flags & RXf_PMf_EXTENDED) == 0
13004 || ! is_PATWS_safe((p), RExC_end, UTF));
13016 /* Literal Escapes Switch
13018 This switch is meant to handle escape sequences that
13019 resolve to a literal character.
13021 Every escape sequence that represents something
13022 else, like an assertion or a char class, is handled
13023 in the switch marked 'Special Escapes' above in this
13024 routine, but also has an entry here as anything that
13025 isn't explicitly mentioned here will be treated as
13026 an unescaped equivalent literal.
13029 switch ((U8)*++p) {
13030 /* These are all the special escapes. */
13031 case 'A': /* Start assertion */
13032 case 'b': case 'B': /* Word-boundary assertion*/
13033 case 'C': /* Single char !DANGEROUS! */
13034 case 'd': case 'D': /* digit class */
13035 case 'g': case 'G': /* generic-backref, pos assertion */
13036 case 'h': case 'H': /* HORIZWS */
13037 case 'k': case 'K': /* named backref, keep marker */
13038 case 'p': case 'P': /* Unicode property */
13039 case 'R': /* LNBREAK */
13040 case 's': case 'S': /* space class */
13041 case 'v': case 'V': /* VERTWS */
13042 case 'w': case 'W': /* word class */
13043 case 'X': /* eXtended Unicode "combining
13044 character sequence" */
13045 case 'z': case 'Z': /* End of line/string assertion */
13049 /* Anything after here is an escape that resolves to a
13050 literal. (Except digits, which may or may not)
13056 case 'N': /* Handle a single-code point named character. */
13057 RExC_parse = p + 1;
13058 if (! grok_bslash_N(pRExC_state,
13059 NULL, /* Fail if evaluates to
13060 anything other than a
13061 single code point */
13062 &ender, /* The returned single code
13064 NULL, /* Don't need a count of
13065 how many code points */
13070 if (*flagp & NEED_UTF8)
13071 FAIL("panic: grok_bslash_N set NEED_UTF8");
13072 if (*flagp & RESTART_PASS1)
13075 /* Here, it wasn't a single code point. Go close
13076 * up this EXACTish node. The switch() prior to
13077 * this switch handles the other cases */
13078 RExC_parse = p = oldp;
13082 if (ender > 0xff) {
13083 REQUIRE_UTF8(flagp);
13099 ender = ESC_NATIVE;
13109 const char* error_msg;
13111 bool valid = grok_bslash_o(&p,
13114 PASS2, /* out warnings */
13115 (bool) RExC_strict,
13116 TRUE, /* Output warnings
13121 RExC_parse = p; /* going to die anyway; point
13122 to exact spot of failure */
13126 if (IN_ENCODING && ender < 0x100) {
13127 goto recode_encoding;
13129 if (ender > 0xff) {
13130 REQUIRE_UTF8(flagp);
13136 UV result = UV_MAX; /* initialize to erroneous
13138 const char* error_msg;
13140 bool valid = grok_bslash_x(&p,
13143 PASS2, /* out warnings */
13144 (bool) RExC_strict,
13145 TRUE, /* Silence warnings
13150 RExC_parse = p; /* going to die anyway; point
13151 to exact spot of failure */
13156 if (ender < 0x100) {
13158 if (RExC_recode_x_to_native) {
13159 ender = LATIN1_TO_NATIVE(ender);
13164 goto recode_encoding;
13168 REQUIRE_UTF8(flagp);
13174 ender = grok_bslash_c(*p++, PASS2);
13176 case '8': case '9': /* must be a backreference */
13178 /* we have an escape like \8 which cannot be an octal escape
13179 * so we exit the loop, and let the outer loop handle this
13180 * escape which may or may not be a legitimate backref. */
13182 case '1': case '2': case '3':case '4':
13183 case '5': case '6': case '7':
13184 /* When we parse backslash escapes there is ambiguity
13185 * between backreferences and octal escapes. Any escape
13186 * from \1 - \9 is a backreference, any multi-digit
13187 * escape which does not start with 0 and which when
13188 * evaluated as decimal could refer to an already
13189 * parsed capture buffer is a back reference. Anything
13192 * Note this implies that \118 could be interpreted as
13193 * 118 OR as "\11" . "8" depending on whether there
13194 * were 118 capture buffers defined already in the
13197 /* NOTE, RExC_npar is 1 more than the actual number of
13198 * parens we have seen so far, hence the < RExC_npar below. */
13200 if ( !isDIGIT(p[1]) || S_backref_value(p) < RExC_npar)
13201 { /* Not to be treated as an octal constant, go
13209 I32 flags = PERL_SCAN_SILENT_ILLDIGIT;
13211 ender = grok_oct(p, &numlen, &flags, NULL);
13212 if (ender > 0xff) {
13213 REQUIRE_UTF8(flagp);
13216 if (PASS2 /* like \08, \178 */
13218 && isDIGIT(*p) && ckWARN(WARN_REGEXP))
13220 reg_warn_non_literal_string(
13222 form_short_octal_warning(p, numlen));
13225 if (IN_ENCODING && ender < 0x100)
13226 goto recode_encoding;
13229 if (! RExC_override_recoding) {
13230 SV* enc = _get_encoding();
13231 ender = reg_recode((U8)ender, &enc);
13233 ckWARNreg(p, "Invalid escape in the specified encoding");
13234 REQUIRE_UTF8(flagp);
13239 FAIL("Trailing \\");
13242 if (!SIZE_ONLY&& isALPHANUMERIC(*p)) {
13243 /* Include any left brace following the alpha to emphasize
13244 * that it could be part of an escape at some point
13246 int len = (isALPHA(*p) && *(p + 1) == '{') ? 2 : 1;
13247 ckWARN3reg(p + len, "Unrecognized escape \\%.*s passed through", len, p);
13249 goto normal_default;
13250 } /* End of switch on '\' */
13253 /* Currently we don't care if the lbrace is at the start
13254 * of a construct. This catches it in the middle of a
13255 * literal string, or when it's the first thing after
13256 * something like "\b" */
13257 if (len || (p > RExC_start && isALPHA_A(*(p -1)))) {
13258 RExC_parse = p + 1;
13259 vFAIL("Unescaped left brace in regex is illegal");
13262 default: /* A literal character */
13264 if (! UTF8_IS_INVARIANT(*p) && UTF) {
13266 ender = utf8n_to_uvchr((U8*)p, RExC_end - p,
13267 &numlen, UTF8_ALLOW_DEFAULT);
13273 } /* End of switch on the literal */
13275 /* Here, have looked at the literal character and <ender>
13276 * contains its ordinal, <p> points to the character after it.
13277 * We need to check if the next non-ignored thing is a
13278 * quantifier. Move <p> to after anything that should be
13279 * ignored, which, as a side effect, positions <p> for the next
13280 * loop iteration */
13281 skip_to_be_ignored_text(pRExC_state, &p,
13282 FALSE /* Don't force to /x */ );
13284 /* If the next thing is a quantifier, it applies to this
13285 * character only, which means that this character has to be in
13286 * its own node and can't just be appended to the string in an
13287 * existing node, so if there are already other characters in
13288 * the node, close the node with just them, and set up to do
13289 * this character again next time through, when it will be the
13290 * only thing in its new node */
13291 if ((next_is_quantifier = ( LIKELY(p < RExC_end)
13292 && UNLIKELY(ISMULT2(p))))
13299 /* Ready to add 'ender' to the node */
13301 if (! FOLD) { /* The simple case, just append the literal */
13303 /* In the sizing pass, we need only the size of the
13304 * character we are appending, hence we can delay getting
13305 * its representation until PASS2. */
13308 const STRLEN unilen = UVCHR_SKIP(ender);
13311 /* We have to subtract 1 just below (and again in
13312 * the corresponding PASS2 code) because the loop
13313 * increments <len> each time, as all but this path
13314 * (and one other) through it add a single byte to
13315 * the EXACTish node. But these paths would change
13316 * len to be the correct final value, so cancel out
13317 * the increment that follows */
13323 } else { /* PASS2 */
13326 U8 * new_s = uvchr_to_utf8((U8*)s, ender);
13327 len += (char *) new_s - s - 1;
13328 s = (char *) new_s;
13331 *(s++) = (char) ender;
13335 else if (LOC && is_PROBLEMATIC_LOCALE_FOLD_cp(ender)) {
13337 /* Here are folding under /l, and the code point is
13338 * problematic. First, we know we can't simplify things */
13339 maybe_exact = FALSE;
13340 maybe_exactfu = FALSE;
13342 /* A problematic code point in this context means that its
13343 * fold isn't known until runtime, so we can't fold it now.
13344 * (The non-problematic code points are the above-Latin1
13345 * ones that fold to also all above-Latin1. Their folds
13346 * don't vary no matter what the locale is.) But here we
13347 * have characters whose fold depends on the locale.
13348 * Unlike the non-folding case above, we have to keep track
13349 * of these in the sizing pass, so that we can make sure we
13350 * don't split too-long nodes in the middle of a potential
13351 * multi-char fold. And unlike the regular fold case
13352 * handled in the else clauses below, we don't actually
13353 * fold and don't have special cases to consider. What we
13354 * do for both passes is the PASS2 code for non-folding */
13355 goto not_fold_common;
13357 else /* A regular FOLD code point */
13359 #if UNICODE_MAJOR_VERSION > 3 /* no multifolds in early Unicode */ \
13360 || (UNICODE_MAJOR_VERSION == 3 && ( UNICODE_DOT_VERSION > 0) \
13361 || UNICODE_DOT_DOT_VERSION > 0)
13362 /* See comments for join_exact() as to why we fold
13363 * this non-UTF at compile time */
13364 || ( node_type == EXACTFU
13365 && ender == LATIN_SMALL_LETTER_SHARP_S)
13368 /* Here, are folding and are not UTF-8 encoded; therefore
13369 * the character must be in the range 0-255, and is not /l
13370 * (Not /l because we already handled these under /l in
13371 * is_PROBLEMATIC_LOCALE_FOLD_cp) */
13372 if (IS_IN_SOME_FOLD_L1(ender)) {
13373 maybe_exact = FALSE;
13375 /* See if the character's fold differs between /d and
13376 * /u. This includes the multi-char fold SHARP S to
13378 if (UNLIKELY(ender == LATIN_SMALL_LETTER_SHARP_S)) {
13379 RExC_seen_unfolded_sharp_s = 1;
13380 maybe_exactfu = FALSE;
13382 else if (maybe_exactfu
13383 && (PL_fold[ender] != PL_fold_latin1[ender]
13384 #if UNICODE_MAJOR_VERSION > 3 /* no multifolds in early Unicode */ \
13385 || (UNICODE_MAJOR_VERSION == 3 && ( UNICODE_DOT_VERSION > 0) \
13386 || UNICODE_DOT_DOT_VERSION > 0)
13388 && isALPHA_FOLD_EQ(ender, 's')
13389 && isALPHA_FOLD_EQ(*(s-1), 's'))
13392 maybe_exactfu = FALSE;
13396 /* Even when folding, we store just the input character, as
13397 * we have an array that finds its fold quickly */
13398 *(s++) = (char) ender;
13400 else { /* FOLD, and UTF (or sharp s) */
13401 /* Unlike the non-fold case, we do actually have to
13402 * calculate the results here in pass 1. This is for two
13403 * reasons, the folded length may be longer than the
13404 * unfolded, and we have to calculate how many EXACTish
13405 * nodes it will take; and we may run out of room in a node
13406 * in the middle of a potential multi-char fold, and have
13407 * to back off accordingly. */
13410 if (isASCII_uni(ender)) {
13411 folded = toFOLD(ender);
13412 *(s)++ = (U8) folded;
13417 folded = _to_uni_fold_flags(
13421 FOLD_FLAGS_FULL | ((ASCII_FOLD_RESTRICTED)
13422 ? FOLD_FLAGS_NOMIX_ASCII
13426 /* The loop increments <len> each time, as all but this
13427 * path (and one other) through it add a single byte to
13428 * the EXACTish node. But this one has changed len to
13429 * be the correct final value, so subtract one to
13430 * cancel out the increment that follows */
13431 len += foldlen - 1;
13433 /* If this node only contains non-folding code points so
13434 * far, see if this new one is also non-folding */
13436 if (folded != ender) {
13437 maybe_exact = FALSE;
13440 /* Here the fold is the original; we have to check
13441 * further to see if anything folds to it */
13442 if (_invlist_contains_cp(PL_utf8_foldable,
13445 maybe_exact = FALSE;
13452 if (next_is_quantifier) {
13454 /* Here, the next input is a quantifier, and to get here,
13455 * the current character is the only one in the node.
13456 * Also, here <len> doesn't include the final byte for this
13462 } /* End of loop through literal characters */
13464 /* Here we have either exhausted the input or ran out of room in
13465 * the node. (If we encountered a character that can't be in the
13466 * node, transfer is made directly to <loopdone>, and so we
13467 * wouldn't have fallen off the end of the loop.) In the latter
13468 * case, we artificially have to split the node into two, because
13469 * we just don't have enough space to hold everything. This
13470 * creates a problem if the final character participates in a
13471 * multi-character fold in the non-final position, as a match that
13472 * should have occurred won't, due to the way nodes are matched,
13473 * and our artificial boundary. So back off until we find a non-
13474 * problematic character -- one that isn't at the beginning or
13475 * middle of such a fold. (Either it doesn't participate in any
13476 * folds, or appears only in the final position of all the folds it
13477 * does participate in.) A better solution with far fewer false
13478 * positives, and that would fill the nodes more completely, would
13479 * be to actually have available all the multi-character folds to
13480 * test against, and to back-off only far enough to be sure that
13481 * this node isn't ending with a partial one. <upper_parse> is set
13482 * further below (if we need to reparse the node) to include just
13483 * up through that final non-problematic character that this code
13484 * identifies, so when it is set to less than the full node, we can
13485 * skip the rest of this */
13486 if (FOLD && p < RExC_end && upper_parse == MAX_NODE_STRING_SIZE) {
13488 const STRLEN full_len = len;
13490 assert(len >= MAX_NODE_STRING_SIZE);
13492 /* Here, <s> points to the final byte of the final character.
13493 * Look backwards through the string until find a non-
13494 * problematic character */
13498 /* This has no multi-char folds to non-UTF characters */
13499 if (ASCII_FOLD_RESTRICTED) {
13503 while (--s >= s0 && IS_NON_FINAL_FOLD(*s)) { }
13507 if (! PL_NonL1NonFinalFold) {
13508 PL_NonL1NonFinalFold = _new_invlist_C_array(
13509 NonL1_Perl_Non_Final_Folds_invlist);
13512 /* Point to the first byte of the final character */
13513 s = (char *) utf8_hop((U8 *) s, -1);
13515 while (s >= s0) { /* Search backwards until find
13516 non-problematic char */
13517 if (UTF8_IS_INVARIANT(*s)) {
13519 /* There are no ascii characters that participate
13520 * in multi-char folds under /aa. In EBCDIC, the
13521 * non-ascii invariants are all control characters,
13522 * so don't ever participate in any folds. */
13523 if (ASCII_FOLD_RESTRICTED
13524 || ! IS_NON_FINAL_FOLD(*s))
13529 else if (UTF8_IS_DOWNGRADEABLE_START(*s)) {
13530 if (! IS_NON_FINAL_FOLD(EIGHT_BIT_UTF8_TO_NATIVE(
13536 else if (! _invlist_contains_cp(
13537 PL_NonL1NonFinalFold,
13538 valid_utf8_to_uvchr((U8 *) s, NULL)))
13543 /* Here, the current character is problematic in that
13544 * it does occur in the non-final position of some
13545 * fold, so try the character before it, but have to
13546 * special case the very first byte in the string, so
13547 * we don't read outside the string */
13548 s = (s == s0) ? s -1 : (char *) utf8_hop((U8 *) s, -1);
13549 } /* End of loop backwards through the string */
13551 /* If there were only problematic characters in the string,
13552 * <s> will point to before s0, in which case the length
13553 * should be 0, otherwise include the length of the
13554 * non-problematic character just found */
13555 len = (s < s0) ? 0 : s - s0 + UTF8SKIP(s);
13558 /* Here, have found the final character, if any, that is
13559 * non-problematic as far as ending the node without splitting
13560 * it across a potential multi-char fold. <len> contains the
13561 * number of bytes in the node up-to and including that
13562 * character, or is 0 if there is no such character, meaning
13563 * the whole node contains only problematic characters. In
13564 * this case, give up and just take the node as-is. We can't
13569 /* If the node ends in an 's' we make sure it stays EXACTF,
13570 * as if it turns into an EXACTFU, it could later get
13571 * joined with another 's' that would then wrongly match
13573 if (maybe_exactfu && isALPHA_FOLD_EQ(ender, 's'))
13575 maybe_exactfu = FALSE;
13579 /* Here, the node does contain some characters that aren't
13580 * problematic. If one such is the final character in the
13581 * node, we are done */
13582 if (len == full_len) {
13585 else if (len + ((UTF) ? UTF8SKIP(s) : 1) == full_len) {
13587 /* If the final character is problematic, but the
13588 * penultimate is not, back-off that last character to
13589 * later start a new node with it */
13594 /* Here, the final non-problematic character is earlier
13595 * in the input than the penultimate character. What we do
13596 * is reparse from the beginning, going up only as far as
13597 * this final ok one, thus guaranteeing that the node ends
13598 * in an acceptable character. The reason we reparse is
13599 * that we know how far in the character is, but we don't
13600 * know how to correlate its position with the input parse.
13601 * An alternate implementation would be to build that
13602 * correlation as we go along during the original parse,
13603 * but that would entail extra work for every node, whereas
13604 * this code gets executed only when the string is too
13605 * large for the node, and the final two characters are
13606 * problematic, an infrequent occurrence. Yet another
13607 * possible strategy would be to save the tail of the
13608 * string, and the next time regatom is called, initialize
13609 * with that. The problem with this is that unless you
13610 * back off one more character, you won't be guaranteed
13611 * regatom will get called again, unless regbranch,
13612 * regpiece ... are also changed. If you do back off that
13613 * extra character, so that there is input guaranteed to
13614 * force calling regatom, you can't handle the case where
13615 * just the first character in the node is acceptable. I
13616 * (khw) decided to try this method which doesn't have that
13617 * pitfall; if performance issues are found, we can do a
13618 * combination of the current approach plus that one */
13624 } /* End of verifying node ends with an appropriate char */
13626 loopdone: /* Jumped to when encounters something that shouldn't be
13629 /* I (khw) don't know if you can get here with zero length, but the
13630 * old code handled this situation by creating a zero-length EXACT
13631 * node. Might as well be NOTHING instead */
13637 /* If 'maybe_exact' is still set here, means there are no
13638 * code points in the node that participate in folds;
13639 * similarly for 'maybe_exactfu' and code points that match
13640 * differently depending on UTF8ness of the target string
13641 * (for /u), or depending on locale for /l */
13647 else if (maybe_exactfu) {
13653 alloc_maybe_populate_EXACT(pRExC_state, ret, flagp, len, ender,
13654 FALSE /* Don't look to see if could
13655 be turned into an EXACT
13656 node, as we have already
13661 RExC_parse = p - 1;
13662 Set_Node_Cur_Length(ret, parse_start);
13664 skip_to_be_ignored_text(pRExC_state, &RExC_parse,
13665 FALSE /* Don't force to /x */ );
13667 /* len is STRLEN which is unsigned, need to copy to signed */
13670 vFAIL("Internal disaster");
13673 } /* End of label 'defchar:' */
13675 } /* End of giant switch on input character */
13682 S_populate_ANYOF_from_invlist(pTHX_ regnode *node, SV** invlist_ptr)
13684 /* Uses the inversion list '*invlist_ptr' to populate the ANYOF 'node'. It
13685 * sets up the bitmap and any flags, removing those code points from the
13686 * inversion list, setting it to NULL should it become completely empty */
13688 PERL_ARGS_ASSERT_POPULATE_ANYOF_FROM_INVLIST;
13689 assert(PL_regkind[OP(node)] == ANYOF);
13691 ANYOF_BITMAP_ZERO(node);
13692 if (*invlist_ptr) {
13694 /* This gets set if we actually need to modify things */
13695 bool change_invlist = FALSE;
13699 /* Start looking through *invlist_ptr */
13700 invlist_iterinit(*invlist_ptr);
13701 while (invlist_iternext(*invlist_ptr, &start, &end)) {
13705 if (end == UV_MAX && start <= NUM_ANYOF_CODE_POINTS) {
13706 ANYOF_FLAGS(node) |= ANYOF_MATCHES_ALL_ABOVE_BITMAP;
13709 /* Quit if are above what we should change */
13710 if (start >= NUM_ANYOF_CODE_POINTS) {
13714 change_invlist = TRUE;
13716 /* Set all the bits in the range, up to the max that we are doing */
13717 high = (end < NUM_ANYOF_CODE_POINTS - 1)
13719 : NUM_ANYOF_CODE_POINTS - 1;
13720 for (i = start; i <= (int) high; i++) {
13721 if (! ANYOF_BITMAP_TEST(node, i)) {
13722 ANYOF_BITMAP_SET(node, i);
13726 invlist_iterfinish(*invlist_ptr);
13728 /* Done with loop; remove any code points that are in the bitmap from
13729 * *invlist_ptr; similarly for code points above the bitmap if we have
13730 * a flag to match all of them anyways */
13731 if (change_invlist) {
13732 _invlist_subtract(*invlist_ptr, PL_InBitmap, invlist_ptr);
13734 if (ANYOF_FLAGS(node) & ANYOF_MATCHES_ALL_ABOVE_BITMAP) {
13735 _invlist_intersection(*invlist_ptr, PL_InBitmap, invlist_ptr);
13738 /* If have completely emptied it, remove it completely */
13739 if (_invlist_len(*invlist_ptr) == 0) {
13740 SvREFCNT_dec_NN(*invlist_ptr);
13741 *invlist_ptr = NULL;
13746 /* Parse POSIX character classes: [[:foo:]], [[=foo=]], [[.foo.]].
13747 Character classes ([:foo:]) can also be negated ([:^foo:]).
13748 Returns a named class id (ANYOF_XXX) if successful, -1 otherwise.
13749 Equivalence classes ([=foo=]) and composites ([.foo.]) are parsed,
13750 but trigger failures because they are currently unimplemented. */
13752 #define POSIXCC_DONE(c) ((c) == ':')
13753 #define POSIXCC_NOTYET(c) ((c) == '=' || (c) == '.')
13754 #define POSIXCC(c) (POSIXCC_DONE(c) || POSIXCC_NOTYET(c))
13755 #define MAYBE_POSIXCC(c) (POSIXCC(c) || (c) == '^' || (c) == ';')
13757 #define WARNING_PREFIX "Assuming NOT a POSIX class since "
13758 #define NO_BLANKS_POSIX_WARNING "no blanks are allowed in one"
13759 #define SEMI_COLON_POSIX_WARNING "a semi-colon was found instead of a colon"
13761 #define NOT_MEANT_TO_BE_A_POSIX_CLASS (OOB_NAMEDCLASS - 1)
13763 /* 'posix_warnings' and 'warn_text' are names of variables in the following
13765 #define ADD_POSIX_WARNING(p, text) STMT_START { \
13766 if (posix_warnings) { \
13767 if (! warn_text) warn_text = newAV(); \
13768 av_push(warn_text, Perl_newSVpvf(aTHX_ \
13772 REPORT_LOCATION_ARGS(p))); \
13777 S_handle_possible_posix(pTHX_ RExC_state_t *pRExC_state,
13779 const char * const s, /* Where the putative posix class begins.
13780 Normally, this is one past the '['. This
13781 parameter exists so it can be somewhere
13782 besides RExC_parse. */
13783 char ** updated_parse_ptr, /* Where to set the updated parse pointer, or
13785 AV ** posix_warnings, /* Where to place any generated warnings, or
13787 const bool check_only /* Don't die if error */
13790 /* This parses what the caller thinks may be one of the three POSIX
13792 * 1) a character class, like [:blank:]
13793 * 2) a collating symbol, like [. .]
13794 * 3) an equivalence class, like [= =]
13795 * In the latter two cases, it croaks if it finds a syntactically legal
13796 * one, as these are not handled by Perl.
13798 * The main purpose is to look for a POSIX character class. It returns:
13799 * a) the class number
13800 * if it is a completely syntactically and semantically legal class.
13801 * 'updated_parse_ptr', if not NULL, is set to point to just after the
13802 * closing ']' of the class
13803 * b) OOB_NAMEDCLASS
13804 * if it appears that one of the three POSIX constructs was meant, but
13805 * its specification was somehow defective. 'updated_parse_ptr', if
13806 * not NULL, is set to point to the character just after the end
13807 * character of the class. See below for handling of warnings.
13808 * c) NOT_MEANT_TO_BE_A_POSIX_CLASS
13809 * if it doesn't appear that a POSIX construct was intended.
13810 * 'updated_parse_ptr' is not changed. No warnings nor errors are
13813 * In b) there may be errors or warnings generated. If 'check_only' is
13814 * TRUE, then any errors are discarded. Warnings are returned to the
13815 * caller via an AV* created into '*posix_warnings' if it is not NULL. If
13816 * instead it is NULL, warnings are suppressed. This is done in all
13817 * passes. The reason for this is that the rest of the parsing is heavily
13818 * dependent on whether this routine found a valid posix class or not. If
13819 * it did, the closing ']' is absorbed as part of the class. If no class,
13820 * or an invalid one is found, any ']' will be considered the terminator of
13821 * the outer bracketed character class, leading to very different results.
13822 * In particular, a '(?[ ])' construct will likely have a syntax error if
13823 * the class is parsed other than intended, and this will happen in pass1,
13824 * before the warnings would normally be output. This mechanism allows the
13825 * caller to output those warnings in pass1 just before dieing, giving a
13826 * much better clue as to what is wrong.
13828 * The reason for this function, and its complexity is that a bracketed
13829 * character class can contain just about anything. But it's easy to
13830 * mistype the very specific posix class syntax but yielding a valid
13831 * regular bracketed class, so it silently gets compiled into something
13832 * quite unintended.
13834 * The solution adopted here maintains backward compatibility except that
13835 * it adds a warning if it looks like a posix class was intended but
13836 * improperly specified. The warning is not raised unless what is input
13837 * very closely resembles one of the 14 legal posix classes. To do this,
13838 * it uses fuzzy parsing. It calculates how many single-character edits it
13839 * would take to transform what was input into a legal posix class. Only
13840 * if that number is quite small does it think that the intention was a
13841 * posix class. Obviously these are heuristics, and there will be cases
13842 * where it errs on one side or another, and they can be tweaked as
13843 * experience informs.
13845 * The syntax for a legal posix class is:
13847 * qr/(?xa: \[ : \^? [:lower:]{4,6} : \] )/
13849 * What this routine considers syntactically to be an intended posix class
13850 * is this (the comments indicate some restrictions that the pattern
13853 * qr/(?x: \[? # The left bracket, possibly
13855 * \h* # possibly followed by blanks
13856 * (?: \^ \h* )? # possibly a misplaced caret
13857 * [:;]? # The opening class character,
13858 * # possibly omitted. A typo
13859 * # semi-colon can also be used.
13861 * \^? # possibly a correctly placed
13862 * # caret, but not if there was also
13863 * # a misplaced one
13865 * .{3,15} # The class name. If there are
13866 * # deviations from the legal syntax,
13867 * # its edit distance must be close
13868 * # to a real class name in order
13869 * # for it to be considered to be
13870 * # an intended posix class.
13872 * [:punct:]? # The closing class character,
13873 * # possibly omitted. If not a colon
13874 * # nor semi colon, the class name
13875 * # must be even closer to a valid
13878 * \]? # The right bracket, possibly
13882 * In the above, \h must be ASCII-only.
13884 * These are heuristics, and can be tweaked as field experience dictates.
13885 * There will be cases when someone didn't intend to specify a posix class
13886 * that this warns as being so. The goal is to minimize these, while
13887 * maximizing the catching of things intended to be a posix class that
13888 * aren't parsed as such.
13892 const char * const e = RExC_end;
13893 unsigned complement = 0; /* If to complement the class */
13894 bool found_problem = FALSE; /* Assume OK until proven otherwise */
13895 bool has_opening_bracket = FALSE;
13896 bool has_opening_colon = FALSE;
13897 int class_number = OOB_NAMEDCLASS; /* Out-of-bounds until find
13899 AV* warn_text = NULL; /* any warning messages */
13900 const char * possible_end = NULL; /* used for a 2nd parse pass */
13901 const char* name_start; /* ptr to class name first char */
13903 /* If the number of single-character typos the input name is away from a
13904 * legal name is no more than this number, it is considered to have meant
13905 * the legal name */
13906 int max_distance = 2;
13908 /* to store the name. The size determines the maximum length before we
13909 * decide that no posix class was intended. Should be at least
13910 * sizeof("alphanumeric") */
13913 PERL_ARGS_ASSERT_HANDLE_POSSIBLE_POSIX;
13916 return NOT_MEANT_TO_BE_A_POSIX_CLASS;
13919 if (*(p - 1) != '[') {
13920 ADD_POSIX_WARNING(p, "it doesn't start with a '['");
13921 found_problem = TRUE;
13924 has_opening_bracket = TRUE;
13927 /* They could be confused and think you can put spaces between the
13930 found_problem = TRUE;
13934 } while (p < e && isBLANK(*p));
13936 ADD_POSIX_WARNING(p, NO_BLANKS_POSIX_WARNING);
13939 /* For [. .] and [= =]. These are quite different internally from [: :],
13940 * so they are handled separately. */
13941 if (POSIXCC_NOTYET(*p) && p < e - 3) /* 1 for the close, and 1 for the ']'
13942 and 1 for at least one char in it
13945 const char open_char = *p;
13946 const char * temp_ptr = p + 1;
13948 /* These two constructs are not handled by perl, and if we find a
13949 * syntactically valid one, we croak. khw, who wrote this code, finds
13950 * this explanation of them very unclear:
13951 * http://pubs.opengroup.org/onlinepubs/009696899/basedefs/xbd_chap09.html
13952 * And searching the rest of the internet wasn't very helpful either.
13953 * It looks like just about any byte can be in these constructs,
13954 * depending on the locale. But unless the pattern is being compiled
13955 * under /l, which is very rare, Perl runs under the C or POSIX locale.
13956 * In that case, it looks like [= =] isn't allowed at all, and that
13957 * [. .] could be any single code point, but for longer strings the
13958 * constituent characters would have to be the ASCII alphabetics plus
13959 * the minus-hyphen. Any sensible locale definition would limit itself
13960 * to these. And any portable one definitely should. Trying to parse
13961 * the general case is a nightmare (see [perl #127604]). So, this code
13962 * looks only for interiors of these constructs that match:
13964 * Using \w relaxes the apparent rules a little, without adding much
13965 * danger of mistaking something else for one of these constructs.
13967 * [. .] in some implementations described on the internet is usable to
13968 * escape a character that otherwise is special in bracketed character
13969 * classes. For example [.].] means a literal right bracket instead of
13970 * the ending of the class
13972 * [= =] can legitimately contain a [. .] construct, but we don't
13973 * handle this case, as that [. .] construct will later get parsed
13974 * itself and croak then. And [= =] is checked for even when not under
13975 * /l, as Perl has long done so.
13977 * The code below relies on there being a trailing NUL, so it doesn't
13978 * have to keep checking if the parse ptr < e.
13980 if (temp_ptr[1] == open_char) {
13983 else while ( temp_ptr < e
13984 && (isWORDCHAR(*temp_ptr) || *temp_ptr == '-'))
13989 if (*temp_ptr == open_char) {
13991 if (*temp_ptr == ']') {
13993 if (! found_problem && ! check_only) {
13994 RExC_parse = (char *) temp_ptr;
13995 vFAIL3("POSIX syntax [%c %c] is reserved for future "
13996 "extensions", open_char, open_char);
13999 /* Here, the syntax wasn't completely valid, or else the call
14000 * is to check-only */
14001 if (updated_parse_ptr) {
14002 *updated_parse_ptr = (char *) temp_ptr;
14005 return OOB_NAMEDCLASS;
14009 /* If we find something that started out to look like one of these
14010 * constructs, but isn't, we continue below so that it can be checked
14011 * for being a class name with a typo of '.' or '=' instead of a colon.
14015 /* Here, we think there is a possibility that a [: :] class was meant, and
14016 * we have the first real character. It could be they think the '^' comes
14019 found_problem = TRUE;
14020 ADD_POSIX_WARNING(p + 1, "the '^' must come after the colon");
14025 found_problem = TRUE;
14029 } while (p < e && isBLANK(*p));
14031 ADD_POSIX_WARNING(p, NO_BLANKS_POSIX_WARNING);
14035 /* But the first character should be a colon, which they could have easily
14036 * mistyped on a qwerty keyboard as a semi-colon (and which may be hard to
14037 * distinguish from a colon, so treat that as a colon). */
14040 has_opening_colon = TRUE;
14042 else if (*p == ';') {
14043 found_problem = TRUE;
14045 ADD_POSIX_WARNING(p, SEMI_COLON_POSIX_WARNING);
14046 has_opening_colon = TRUE;
14049 found_problem = TRUE;
14050 ADD_POSIX_WARNING(p, "there must be a starting ':'");
14052 /* Consider an initial punctuation (not one of the recognized ones) to
14053 * be a left terminator */
14054 if (*p != '^' && *p != ']' && isPUNCT(*p)) {
14059 /* They may think that you can put spaces between the components */
14061 found_problem = TRUE;
14065 } while (p < e && isBLANK(*p));
14067 ADD_POSIX_WARNING(p, NO_BLANKS_POSIX_WARNING);
14072 /* We consider something like [^:^alnum:]] to not have been intended to
14073 * be a posix class, but XXX maybe we should */
14075 return NOT_MEANT_TO_BE_A_POSIX_CLASS;
14082 /* Again, they may think that you can put spaces between the components */
14084 found_problem = TRUE;
14088 } while (p < e && isBLANK(*p));
14090 ADD_POSIX_WARNING(p, NO_BLANKS_POSIX_WARNING);
14095 /* XXX This ']' may be a typo, and something else was meant. But
14096 * treating it as such creates enough complications, that that
14097 * possibility isn't currently considered here. So we assume that the
14098 * ']' is what is intended, and if we've already found an initial '[',
14099 * this leaves this construct looking like [:] or [:^], which almost
14100 * certainly weren't intended to be posix classes */
14101 if (has_opening_bracket) {
14102 return NOT_MEANT_TO_BE_A_POSIX_CLASS;
14105 /* But this function can be called when we parse the colon for
14106 * something like qr/[alpha:]]/, so we back up to look for the
14111 found_problem = TRUE;
14112 ADD_POSIX_WARNING(p, SEMI_COLON_POSIX_WARNING);
14114 else if (*p != ':') {
14116 /* XXX We are currently very restrictive here, so this code doesn't
14117 * consider the possibility that, say, /[alpha.]]/ was intended to
14118 * be a posix class. */
14119 return NOT_MEANT_TO_BE_A_POSIX_CLASS;
14122 /* Here we have something like 'foo:]'. There was no initial colon,
14123 * and we back up over 'foo. XXX Unlike the going forward case, we
14124 * don't handle typos of non-word chars in the middle */
14125 has_opening_colon = FALSE;
14128 while (p > RExC_start && isWORDCHAR(*p)) {
14133 /* Here, we have positioned ourselves to where we think the first
14134 * character in the potential class is */
14137 /* Now the interior really starts. There are certain key characters that
14138 * can end the interior, or these could just be typos. To catch both
14139 * cases, we may have to do two passes. In the first pass, we keep on
14140 * going unless we come to a sequence that matches
14141 * qr/ [[:punct:]] [[:blank:]]* \] /xa
14142 * This means it takes a sequence to end the pass, so two typos in a row if
14143 * that wasn't what was intended. If the class is perfectly formed, just
14144 * this one pass is needed. We also stop if there are too many characters
14145 * being accumulated, but this number is deliberately set higher than any
14146 * real class. It is set high enough so that someone who thinks that
14147 * 'alphanumeric' is a correct name would get warned that it wasn't.
14148 * While doing the pass, we keep track of where the key characters were in
14149 * it. If we don't find an end to the class, and one of the key characters
14150 * was found, we redo the pass, but stop when we get to that character.
14151 * Thus the key character was considered a typo in the first pass, but a
14152 * terminator in the second. If two key characters are found, we stop at
14153 * the second one in the first pass. Again this can miss two typos, but
14154 * catches a single one
14156 * In the first pass, 'possible_end' starts as NULL, and then gets set to
14157 * point to the first key character. For the second pass, it starts as -1.
14163 bool has_blank = FALSE;
14164 bool has_upper = FALSE;
14165 bool has_terminating_colon = FALSE;
14166 bool has_terminating_bracket = FALSE;
14167 bool has_semi_colon = FALSE;
14168 unsigned int name_len = 0;
14169 int punct_count = 0;
14173 /* Squeeze out blanks when looking up the class name below */
14174 if (isBLANK(*p) ) {
14176 found_problem = TRUE;
14181 /* The name will end with a punctuation */
14183 const char * peek = p + 1;
14185 /* Treat any non-']' punctuation followed by a ']' (possibly
14186 * with intervening blanks) as trying to terminate the class.
14187 * ']]' is very likely to mean a class was intended (but
14188 * missing the colon), but the warning message that gets
14189 * generated shows the error position better if we exit the
14190 * loop at the bottom (eventually), so skip it here. */
14192 if (peek < e && isBLANK(*peek)) {
14194 found_problem = TRUE;
14197 } while (peek < e && isBLANK(*peek));
14200 if (peek < e && *peek == ']') {
14201 has_terminating_bracket = TRUE;
14203 has_terminating_colon = TRUE;
14205 else if (*p == ';') {
14206 has_semi_colon = TRUE;
14207 has_terminating_colon = TRUE;
14210 found_problem = TRUE;
14217 /* Here we have punctuation we thought didn't end the class.
14218 * Keep track of the position of the key characters that are
14219 * more likely to have been class-enders */
14220 if (*p == ']' || *p == '[' || *p == ':' || *p == ';') {
14222 /* Allow just one such possible class-ender not actually
14223 * ending the class. */
14224 if (possible_end) {
14230 /* If we have too many punctuation characters, no use in
14232 if (++punct_count > max_distance) {
14236 /* Treat the punctuation as a typo. */
14237 input_text[name_len++] = *p;
14240 else if (isUPPER(*p)) { /* Use lowercase for lookup */
14241 input_text[name_len++] = toLOWER(*p);
14243 found_problem = TRUE;
14245 } else if (! UTF || UTF8_IS_INVARIANT(*p)) {
14246 input_text[name_len++] = *p;
14250 input_text[name_len++] = utf8_to_uvchr_buf((U8 *) p, e, NULL);
14254 /* The declaration of 'input_text' is how long we allow a potential
14255 * class name to be, before saying they didn't mean a class name at
14257 if (name_len >= C_ARRAY_LENGTH(input_text)) {
14262 /* We get to here when the possible class name hasn't been properly
14263 * terminated before:
14264 * 1) we ran off the end of the pattern; or
14265 * 2) found two characters, each of which might have been intended to
14266 * be the name's terminator
14267 * 3) found so many punctuation characters in the purported name,
14268 * that the edit distance to a valid one is exceeded
14269 * 4) we decided it was more characters than anyone could have
14270 * intended to be one. */
14272 found_problem = TRUE;
14274 /* In the final two cases, we know that looking up what we've
14275 * accumulated won't lead to a match, even a fuzzy one. */
14276 if ( name_len >= C_ARRAY_LENGTH(input_text)
14277 || punct_count > max_distance)
14279 /* If there was an intermediate key character that could have been
14280 * an intended end, redo the parse, but stop there */
14281 if (possible_end && possible_end != (char *) -1) {
14282 possible_end = (char *) -1; /* Special signal value to say
14283 we've done a first pass */
14288 /* Otherwise, it can't have meant to have been a class */
14289 return NOT_MEANT_TO_BE_A_POSIX_CLASS;
14292 /* If we ran off the end, and the final character was a punctuation
14293 * one, back up one, to look at that final one just below. Later, we
14294 * will restore the parse pointer if appropriate */
14295 if (name_len && p == e && isPUNCT(*(p-1))) {
14300 if (p < e && isPUNCT(*p)) {
14302 has_terminating_bracket = TRUE;
14304 /* If this is a 2nd ']', and the first one is just below this
14305 * one, consider that to be the real terminator. This gives a
14306 * uniform and better positioning for the warning message */
14308 && possible_end != (char *) -1
14309 && *possible_end == ']'
14310 && name_len && input_text[name_len - 1] == ']')
14315 /* And this is actually equivalent to having done the 2nd
14316 * pass now, so set it to not try again */
14317 possible_end = (char *) -1;
14322 has_terminating_colon = TRUE;
14324 else if (*p == ';') {
14325 has_semi_colon = TRUE;
14326 has_terminating_colon = TRUE;
14334 /* Here, we have a class name to look up. We can short circuit the
14335 * stuff below for short names that can't possibly be meant to be a
14336 * class name. (We can do this on the first pass, as any second pass
14337 * will yield an even shorter name) */
14338 if (name_len < 3) {
14339 return NOT_MEANT_TO_BE_A_POSIX_CLASS;
14342 /* Find which class it is. Initially switch on the length of the name.
14344 switch (name_len) {
14346 if (memEQ(name_start, "word", 4)) {
14347 /* this is not POSIX, this is the Perl \w */
14348 class_number = ANYOF_WORDCHAR;
14352 /* Names all of length 5: alnum alpha ascii blank cntrl digit
14353 * graph lower print punct space upper
14354 * Offset 4 gives the best switch position. */
14355 switch (name_start[4]) {
14357 if (memEQ(name_start, "alph", 4)) /* alpha */
14358 class_number = ANYOF_ALPHA;
14361 if (memEQ(name_start, "spac", 4)) /* space */
14362 class_number = ANYOF_SPACE;
14365 if (memEQ(name_start, "grap", 4)) /* graph */
14366 class_number = ANYOF_GRAPH;
14369 if (memEQ(name_start, "asci", 4)) /* ascii */
14370 class_number = ANYOF_ASCII;
14373 if (memEQ(name_start, "blan", 4)) /* blank */
14374 class_number = ANYOF_BLANK;
14377 if (memEQ(name_start, "cntr", 4)) /* cntrl */
14378 class_number = ANYOF_CNTRL;
14381 if (memEQ(name_start, "alnu", 4)) /* alnum */
14382 class_number = ANYOF_ALPHANUMERIC;
14385 if (memEQ(name_start, "lowe", 4)) /* lower */
14386 class_number = (FOLD) ? ANYOF_CASED : ANYOF_LOWER;
14387 else if (memEQ(name_start, "uppe", 4)) /* upper */
14388 class_number = (FOLD) ? ANYOF_CASED : ANYOF_UPPER;
14391 if (memEQ(name_start, "digi", 4)) /* digit */
14392 class_number = ANYOF_DIGIT;
14393 else if (memEQ(name_start, "prin", 4)) /* print */
14394 class_number = ANYOF_PRINT;
14395 else if (memEQ(name_start, "punc", 4)) /* punct */
14396 class_number = ANYOF_PUNCT;
14401 if (memEQ(name_start, "xdigit", 6))
14402 class_number = ANYOF_XDIGIT;
14406 /* If the name exactly matches a posix class name the class number will
14407 * here be set to it, and the input almost certainly was meant to be a
14408 * posix class, so we can skip further checking. If instead the syntax
14409 * is exactly correct, but the name isn't one of the legal ones, we
14410 * will return that as an error below. But if neither of these apply,
14411 * it could be that no posix class was intended at all, or that one
14412 * was, but there was a typo. We tease these apart by doing fuzzy
14413 * matching on the name */
14414 if (class_number == OOB_NAMEDCLASS && found_problem) {
14415 const UV posix_names[][6] = {
14416 { 'a', 'l', 'n', 'u', 'm' },
14417 { 'a', 'l', 'p', 'h', 'a' },
14418 { 'a', 's', 'c', 'i', 'i' },
14419 { 'b', 'l', 'a', 'n', 'k' },
14420 { 'c', 'n', 't', 'r', 'l' },
14421 { 'd', 'i', 'g', 'i', 't' },
14422 { 'g', 'r', 'a', 'p', 'h' },
14423 { 'l', 'o', 'w', 'e', 'r' },
14424 { 'p', 'r', 'i', 'n', 't' },
14425 { 'p', 'u', 'n', 'c', 't' },
14426 { 's', 'p', 'a', 'c', 'e' },
14427 { 'u', 'p', 'p', 'e', 'r' },
14428 { 'w', 'o', 'r', 'd' },
14429 { 'x', 'd', 'i', 'g', 'i', 't' }
14431 /* The names of the above all have added NULs to make them the same
14432 * size, so we need to also have the real lengths */
14433 const UV posix_name_lengths[] = {
14434 sizeof("alnum") - 1,
14435 sizeof("alpha") - 1,
14436 sizeof("ascii") - 1,
14437 sizeof("blank") - 1,
14438 sizeof("cntrl") - 1,
14439 sizeof("digit") - 1,
14440 sizeof("graph") - 1,
14441 sizeof("lower") - 1,
14442 sizeof("print") - 1,
14443 sizeof("punct") - 1,
14444 sizeof("space") - 1,
14445 sizeof("upper") - 1,
14446 sizeof("word") - 1,
14447 sizeof("xdigit")- 1
14450 int temp_max = max_distance; /* Use a temporary, so if we
14451 reparse, we haven't changed the
14454 /* Use a smaller max edit distance if we are missing one of the
14456 if ( has_opening_bracket + has_opening_colon < 2
14457 || has_terminating_bracket + has_terminating_colon < 2)
14462 /* See if the input name is close to a legal one */
14463 for (i = 0; i < C_ARRAY_LENGTH(posix_names); i++) {
14465 /* Short circuit call if the lengths are too far apart to be
14467 if (abs( (int) (name_len - posix_name_lengths[i]))
14473 if (edit_distance(input_text,
14476 posix_name_lengths[i],
14480 { /* If it is close, it probably was intended to be a class */
14481 goto probably_meant_to_be;
14485 /* Here the input name is not close enough to a valid class name
14486 * for us to consider it to be intended to be a posix class. If
14487 * we haven't already done so, and the parse found a character that
14488 * could have been terminators for the name, but which we absorbed
14489 * as typos during the first pass, repeat the parse, signalling it
14490 * to stop at that character */
14491 if (possible_end && possible_end != (char *) -1) {
14492 possible_end = (char *) -1;
14497 /* Here neither pass found a close-enough class name */
14498 return NOT_MEANT_TO_BE_A_POSIX_CLASS;
14501 probably_meant_to_be:
14503 /* Here we think that a posix specification was intended. Update any
14505 if (updated_parse_ptr) {
14506 *updated_parse_ptr = (char *) p;
14509 /* If a posix class name was intended but incorrectly specified, we
14510 * output or return the warnings */
14511 if (found_problem) {
14513 /* We set flags for these issues in the parse loop above instead of
14514 * adding them to the list of warnings, because we can parse it
14515 * twice, and we only want one warning instance */
14517 ADD_POSIX_WARNING(p, "the name must be all lowercase letters");
14520 ADD_POSIX_WARNING(p, NO_BLANKS_POSIX_WARNING);
14522 if (has_semi_colon) {
14523 ADD_POSIX_WARNING(p, SEMI_COLON_POSIX_WARNING);
14525 else if (! has_terminating_colon) {
14526 ADD_POSIX_WARNING(p, "there is no terminating ':'");
14528 if (! has_terminating_bracket) {
14529 ADD_POSIX_WARNING(p, "there is no terminating ']'");
14533 if (posix_warnings) {
14534 /* mortalize to avoid a leak with FATAL warnings */
14535 *posix_warnings = (AV *) sv_2mortal((SV *) warn_text);
14538 SvREFCNT_dec_NN(warn_text);
14542 else if (class_number != OOB_NAMEDCLASS) {
14543 /* If it is a known class, return the class. The class number
14544 * #defines are structured so each complement is +1 to the normal
14546 return class_number + complement;
14548 else if (! check_only) {
14550 /* Here, it is an unrecognized class. This is an error (unless the
14551 * call is to check only, which we've already handled above) */
14552 const char * const complement_string = (complement)
14555 RExC_parse = (char *) p;
14556 vFAIL3utf8f("POSIX class [:%s%"UTF8f":] unknown",
14558 UTF8fARG(UTF, RExC_parse - name_start - 2, name_start));
14562 return OOB_NAMEDCLASS;
14564 #undef ADD_POSIX_WARNING
14566 STATIC unsigned int
14567 S_regex_set_precedence(const U8 my_operator) {
14569 /* Returns the precedence in the (?[...]) construct of the input operator,
14570 * specified by its character representation. The precedence follows
14571 * general Perl rules, but it extends this so that ')' and ']' have (low)
14572 * precedence even though they aren't really operators */
14574 switch (my_operator) {
14590 NOT_REACHED; /* NOTREACHED */
14591 return 0; /* Silence compiler warning */
14595 S_handle_regex_sets(pTHX_ RExC_state_t *pRExC_state, SV** return_invlist,
14596 I32 *flagp, U32 depth,
14597 char * const oregcomp_parse)
14599 /* Handle the (?[...]) construct to do set operations */
14601 U8 curchar; /* Current character being parsed */
14602 UV start, end; /* End points of code point ranges */
14603 SV* final = NULL; /* The end result inversion list */
14604 SV* result_string; /* 'final' stringified */
14605 AV* stack; /* stack of operators and operands not yet
14607 AV* fence_stack = NULL; /* A stack containing the positions in
14608 'stack' of where the undealt-with left
14609 parens would be if they were actually
14611 /* The 'VOL' (expanding to 'volatile') is a workaround for an optimiser bug
14612 * in Solaris Studio 12.3. See RT #127455 */
14613 VOL IV fence = 0; /* Position of where most recent undealt-
14614 with left paren in stack is; -1 if none.
14616 STRLEN len; /* Temporary */
14617 regnode* node; /* Temporary, and final regnode returned by
14619 const bool save_fold = FOLD; /* Temporary */
14620 char *save_end, *save_parse; /* Temporaries */
14621 const bool in_locale = LOC; /* we turn off /l during processing */
14622 AV* posix_warnings = NULL;
14624 GET_RE_DEBUG_FLAGS_DECL;
14626 PERL_ARGS_ASSERT_HANDLE_REGEX_SETS;
14629 set_regex_charset(&RExC_flags, REGEX_UNICODE_CHARSET);
14632 REQUIRE_UNI_RULES(flagp, NULL); /* The use of this operator implies /u.
14633 This is required so that the compile
14634 time values are valid in all runtime
14637 /* This will return only an ANYOF regnode, or (unlikely) something smaller
14638 * (such as EXACT). Thus we can skip most everything if just sizing. We
14639 * call regclass to handle '[]' so as to not have to reinvent its parsing
14640 * rules here (throwing away the size it computes each time). And, we exit
14641 * upon an unescaped ']' that isn't one ending a regclass. To do both
14642 * these things, we need to realize that something preceded by a backslash
14643 * is escaped, so we have to keep track of backslashes */
14645 UV depth = 0; /* how many nested (?[...]) constructs */
14647 while (RExC_parse < RExC_end) {
14648 SV* current = NULL;
14650 skip_to_be_ignored_text(pRExC_state, &RExC_parse,
14651 TRUE /* Force /x */ );
14653 switch (*RExC_parse) {
14655 if (RExC_parse[1] == '[') depth++, RExC_parse++;
14660 /* Skip past this, so the next character gets skipped, after
14663 if (*RExC_parse == 'c') {
14664 /* Skip the \cX notation for control characters */
14665 RExC_parse += UTF ? UTF8SKIP(RExC_parse) : 1;
14671 /* See if this is a [:posix:] class. */
14672 bool is_posix_class = (OOB_NAMEDCLASS
14673 < handle_possible_posix(pRExC_state,
14677 TRUE /* checking only */));
14678 /* If it is a posix class, leave the parse pointer at the
14679 * '[' to fool regclass() into thinking it is part of a
14680 * '[[:posix:]]'. */
14681 if (! is_posix_class) {
14685 /* regclass() can only return RESTART_PASS1 and NEED_UTF8
14686 * if multi-char folds are allowed. */
14687 if (!regclass(pRExC_state, flagp,depth+1,
14688 is_posix_class, /* parse the whole char
14689 class only if not a
14691 FALSE, /* don't allow multi-char folds */
14692 TRUE, /* silence non-portable warnings. */
14694 FALSE, /* Require return to be an ANYOF */
14698 FAIL2("panic: regclass returned NULL to handle_sets, "
14699 "flags=%#"UVxf"", (UV) *flagp);
14701 /* function call leaves parse pointing to the ']', except
14702 * if we faked it */
14703 if (is_posix_class) {
14707 SvREFCNT_dec(current); /* In case it returned something */
14712 if (depth--) break;
14714 if (*RExC_parse == ')') {
14715 node = reganode(pRExC_state, ANYOF, 0);
14716 RExC_size += ANYOF_SKIP;
14717 nextchar(pRExC_state);
14718 Set_Node_Length(node,
14719 RExC_parse - oregcomp_parse + 1); /* MJD */
14721 set_regex_charset(&RExC_flags, REGEX_LOCALE_CHARSET);
14729 RExC_parse += UTF ? UTF8SKIP(RExC_parse) : 1;
14733 /* We output the messages even if warnings are off, because we'll fail
14734 * the very next thing, and these give a likely diagnosis for that */
14735 if (posix_warnings && av_tindex_nomg(posix_warnings) >= 0) {
14736 output_or_return_posix_warnings(pRExC_state, posix_warnings, NULL);
14739 FAIL("Syntax error in (?[...])");
14742 /* Pass 2 only after this. */
14743 Perl_ck_warner_d(aTHX_
14744 packWARN(WARN_EXPERIMENTAL__REGEX_SETS),
14745 "The regex_sets feature is experimental" REPORT_LOCATION,
14746 REPORT_LOCATION_ARGS(RExC_parse));
14748 /* Everything in this construct is a metacharacter. Operands begin with
14749 * either a '\' (for an escape sequence), or a '[' for a bracketed
14750 * character class. Any other character should be an operator, or
14751 * parenthesis for grouping. Both types of operands are handled by calling
14752 * regclass() to parse them. It is called with a parameter to indicate to
14753 * return the computed inversion list. The parsing here is implemented via
14754 * a stack. Each entry on the stack is a single character representing one
14755 * of the operators; or else a pointer to an operand inversion list. */
14757 #define IS_OPERATOR(a) SvIOK(a)
14758 #define IS_OPERAND(a) (! IS_OPERATOR(a))
14760 /* The stack is kept in Łukasiewicz order. (That's pronounced similar
14761 * to luke-a-shave-itch (or -itz), but people who didn't want to bother
14762 * with pronouncing it called it Reverse Polish instead, but now that YOU
14763 * know how to pronounce it you can use the correct term, thus giving due
14764 * credit to the person who invented it, and impressing your geek friends.
14765 * Wikipedia says that the pronounciation of "Ł" has been changing so that
14766 * it is now more like an English initial W (as in wonk) than an L.)
14768 * This means that, for example, 'a | b & c' is stored on the stack as
14776 * where the numbers in brackets give the stack [array] element number.
14777 * In this implementation, parentheses are not stored on the stack.
14778 * Instead a '(' creates a "fence" so that the part of the stack below the
14779 * fence is invisible except to the corresponding ')' (this allows us to
14780 * replace testing for parens, by using instead subtraction of the fence
14781 * position). As new operands are processed they are pushed onto the stack
14782 * (except as noted in the next paragraph). New operators of higher
14783 * precedence than the current final one are inserted on the stack before
14784 * the lhs operand (so that when the rhs is pushed next, everything will be
14785 * in the correct positions shown above. When an operator of equal or
14786 * lower precedence is encountered in parsing, all the stacked operations
14787 * of equal or higher precedence are evaluated, leaving the result as the
14788 * top entry on the stack. This makes higher precedence operations
14789 * evaluate before lower precedence ones, and causes operations of equal
14790 * precedence to left associate.
14792 * The only unary operator '!' is immediately pushed onto the stack when
14793 * encountered. When an operand is encountered, if the top of the stack is
14794 * a '!", the complement is immediately performed, and the '!' popped. The
14795 * resulting value is treated as a new operand, and the logic in the
14796 * previous paragraph is executed. Thus in the expression
14798 * the stack looks like
14804 * as 'b' gets parsed, the latter gets evaluated to '!b', and the stack
14811 * A ')' is treated as an operator with lower precedence than all the
14812 * aforementioned ones, which causes all operations on the stack above the
14813 * corresponding '(' to be evaluated down to a single resultant operand.
14814 * Then the fence for the '(' is removed, and the operand goes through the
14815 * algorithm above, without the fence.
14817 * A separate stack is kept of the fence positions, so that the position of
14818 * the latest so-far unbalanced '(' is at the top of it.
14820 * The ']' ending the construct is treated as the lowest operator of all,
14821 * so that everything gets evaluated down to a single operand, which is the
14824 sv_2mortal((SV *)(stack = newAV()));
14825 sv_2mortal((SV *)(fence_stack = newAV()));
14827 while (RExC_parse < RExC_end) {
14828 I32 top_index; /* Index of top-most element in 'stack' */
14829 SV** top_ptr; /* Pointer to top 'stack' element */
14830 SV* current = NULL; /* To contain the current inversion list
14832 SV* only_to_avoid_leaks;
14834 skip_to_be_ignored_text(pRExC_state, &RExC_parse,
14835 TRUE /* Force /x */ );
14836 if (RExC_parse >= RExC_end) {
14837 Perl_croak(aTHX_ "panic: Read past end of '(?[ ])'");
14840 curchar = UCHARAT(RExC_parse);
14844 top_index = av_tindex_nomg(stack);
14847 SV** stacked_ptr; /* Ptr to something already on 'stack' */
14848 char stacked_operator; /* The topmost operator on the 'stack'. */
14849 SV* lhs; /* Operand to the left of the operator */
14850 SV* rhs; /* Operand to the right of the operator */
14851 SV* fence_ptr; /* Pointer to top element of the fence
14856 if ( RExC_parse < RExC_end - 1
14857 && (UCHARAT(RExC_parse + 1) == '?'))
14859 /* If is a '(?', could be an embedded '(?flags:(?[...])'.
14860 * This happens when we have some thing like
14862 * my $thai_or_lao = qr/(?[ \p{Thai} + \p{Lao} ])/;
14864 * qr/(?[ \p{Digit} & $thai_or_lao ])/;
14866 * Here we would be handling the interpolated
14867 * '$thai_or_lao'. We handle this by a recursive call to
14868 * ourselves which returns the inversion list the
14869 * interpolated expression evaluates to. We use the flags
14870 * from the interpolated pattern. */
14871 U32 save_flags = RExC_flags;
14872 const char * save_parse;
14874 RExC_parse += 2; /* Skip past the '(?' */
14875 save_parse = RExC_parse;
14877 /* Parse any flags for the '(?' */
14878 parse_lparen_question_flags(pRExC_state);
14880 if (RExC_parse == save_parse /* Makes sure there was at
14881 least one flag (or else
14882 this embedding wasn't
14884 || RExC_parse >= RExC_end - 4
14885 || UCHARAT(RExC_parse) != ':'
14886 || UCHARAT(++RExC_parse) != '('
14887 || UCHARAT(++RExC_parse) != '?'
14888 || UCHARAT(++RExC_parse) != '[')
14891 /* In combination with the above, this moves the
14892 * pointer to the point just after the first erroneous
14893 * character (or if there are no flags, to where they
14894 * should have been) */
14895 if (RExC_parse >= RExC_end - 4) {
14896 RExC_parse = RExC_end;
14898 else if (RExC_parse != save_parse) {
14899 RExC_parse += (UTF) ? UTF8SKIP(RExC_parse) : 1;
14901 vFAIL("Expecting '(?flags:(?[...'");
14904 /* Recurse, with the meat of the embedded expression */
14906 (void) handle_regex_sets(pRExC_state, ¤t, flagp,
14907 depth+1, oregcomp_parse);
14909 /* Here, 'current' contains the embedded expression's
14910 * inversion list, and RExC_parse points to the trailing
14911 * ']'; the next character should be the ')' */
14913 assert(UCHARAT(RExC_parse) == ')');
14915 /* Then the ')' matching the original '(' handled by this
14916 * case: statement */
14918 assert(UCHARAT(RExC_parse) == ')');
14921 RExC_flags = save_flags;
14922 goto handle_operand;
14925 /* A regular '('. Look behind for illegal syntax */
14926 if (top_index - fence >= 0) {
14927 /* If the top entry on the stack is an operator, it had
14928 * better be a '!', otherwise the entry below the top
14929 * operand should be an operator */
14930 if ( ! (top_ptr = av_fetch(stack, top_index, FALSE))
14931 || (IS_OPERATOR(*top_ptr) && SvUV(*top_ptr) != '!')
14932 || ( IS_OPERAND(*top_ptr)
14933 && ( top_index - fence < 1
14934 || ! (stacked_ptr = av_fetch(stack,
14937 || ! IS_OPERATOR(*stacked_ptr))))
14940 vFAIL("Unexpected '(' with no preceding operator");
14944 /* Stack the position of this undealt-with left paren */
14945 fence = top_index + 1;
14946 av_push(fence_stack, newSViv(fence));
14950 /* regclass() can only return RESTART_PASS1 and NEED_UTF8 if
14951 * multi-char folds are allowed. */
14952 if (!regclass(pRExC_state, flagp,depth+1,
14953 TRUE, /* means parse just the next thing */
14954 FALSE, /* don't allow multi-char folds */
14955 FALSE, /* don't silence non-portable warnings. */
14957 FALSE, /* Require return to be an ANYOF */
14961 FAIL2("panic: regclass returned NULL to handle_sets, "
14962 "flags=%#"UVxf"", (UV) *flagp);
14965 /* regclass() will return with parsing just the \ sequence,
14966 * leaving the parse pointer at the next thing to parse */
14968 goto handle_operand;
14970 case '[': /* Is a bracketed character class */
14972 /* See if this is a [:posix:] class. */
14973 bool is_posix_class = (OOB_NAMEDCLASS
14974 < handle_possible_posix(pRExC_state,
14978 TRUE /* checking only */));
14979 /* If it is a posix class, leave the parse pointer at the '['
14980 * to fool regclass() into thinking it is part of a
14981 * '[[:posix:]]'. */
14982 if (! is_posix_class) {
14986 /* regclass() can only return RESTART_PASS1 and NEED_UTF8 if
14987 * multi-char folds are allowed. */
14988 if (!regclass(pRExC_state, flagp,depth+1,
14989 is_posix_class, /* parse the whole char
14990 class only if not a
14992 FALSE, /* don't allow multi-char folds */
14993 TRUE, /* silence non-portable warnings. */
14995 FALSE, /* Require return to be an ANYOF */
15000 FAIL2("panic: regclass returned NULL to handle_sets, "
15001 "flags=%#"UVxf"", (UV) *flagp);
15004 /* function call leaves parse pointing to the ']', except if we
15006 if (is_posix_class) {
15010 goto handle_operand;
15014 if (top_index >= 1) {
15015 goto join_operators;
15018 /* Only a single operand on the stack: are done */
15022 if (av_tindex_nomg(fence_stack) < 0) {
15024 vFAIL("Unexpected ')'");
15027 /* If at least two thing on the stack, treat this as an
15029 if (top_index - fence >= 1) {
15030 goto join_operators;
15033 /* Here only a single thing on the fenced stack, and there is a
15034 * fence. Get rid of it */
15035 fence_ptr = av_pop(fence_stack);
15037 fence = SvIV(fence_ptr) - 1;
15038 SvREFCNT_dec_NN(fence_ptr);
15045 /* Having gotten rid of the fence, we pop the operand at the
15046 * stack top and process it as a newly encountered operand */
15047 current = av_pop(stack);
15048 if (IS_OPERAND(current)) {
15049 goto handle_operand;
15061 /* These binary operators should have a left operand already
15063 if ( top_index - fence < 0
15064 || top_index - fence == 1
15065 || ( ! (top_ptr = av_fetch(stack, top_index, FALSE)))
15066 || ! IS_OPERAND(*top_ptr))
15068 goto unexpected_binary;
15071 /* If only the one operand is on the part of the stack visible
15072 * to us, we just place this operator in the proper position */
15073 if (top_index - fence < 2) {
15075 /* Place the operator before the operand */
15077 SV* lhs = av_pop(stack);
15078 av_push(stack, newSVuv(curchar));
15079 av_push(stack, lhs);
15083 /* But if there is something else on the stack, we need to
15084 * process it before this new operator if and only if the
15085 * stacked operation has equal or higher precedence than the
15090 /* The operator on the stack is supposed to be below both its
15092 if ( ! (stacked_ptr = av_fetch(stack, top_index - 2, FALSE))
15093 || IS_OPERAND(*stacked_ptr))
15095 /* But if not, it's legal and indicates we are completely
15096 * done if and only if we're currently processing a ']',
15097 * which should be the final thing in the expression */
15098 if (curchar == ']') {
15104 vFAIL2("Unexpected binary operator '%c' with no "
15105 "preceding operand", curchar);
15107 stacked_operator = (char) SvUV(*stacked_ptr);
15109 if (regex_set_precedence(curchar)
15110 > regex_set_precedence(stacked_operator))
15112 /* Here, the new operator has higher precedence than the
15113 * stacked one. This means we need to add the new one to
15114 * the stack to await its rhs operand (and maybe more
15115 * stuff). We put it before the lhs operand, leaving
15116 * untouched the stacked operator and everything below it
15118 lhs = av_pop(stack);
15119 assert(IS_OPERAND(lhs));
15121 av_push(stack, newSVuv(curchar));
15122 av_push(stack, lhs);
15126 /* Here, the new operator has equal or lower precedence than
15127 * what's already there. This means the operation already
15128 * there should be performed now, before the new one. */
15130 rhs = av_pop(stack);
15131 if (! IS_OPERAND(rhs)) {
15133 /* This can happen when a ! is not followed by an operand,
15134 * like in /(?[\t &!])/ */
15138 lhs = av_pop(stack);
15140 if (! IS_OPERAND(lhs)) {
15142 /* This can happen when there is an empty (), like in
15143 * /(?[[0]+()+])/ */
15147 switch (stacked_operator) {
15149 _invlist_intersection(lhs, rhs, &rhs);
15154 _invlist_union(lhs, rhs, &rhs);
15158 _invlist_subtract(lhs, rhs, &rhs);
15161 case '^': /* The union minus the intersection */
15167 _invlist_union(lhs, rhs, &u);
15168 _invlist_intersection(lhs, rhs, &i);
15169 /* _invlist_subtract will overwrite rhs
15170 without freeing what it already contains */
15172 _invlist_subtract(u, i, &rhs);
15173 SvREFCNT_dec_NN(i);
15174 SvREFCNT_dec_NN(u);
15175 SvREFCNT_dec_NN(element);
15181 /* Here, the higher precedence operation has been done, and the
15182 * result is in 'rhs'. We overwrite the stacked operator with
15183 * the result. Then we redo this code to either push the new
15184 * operator onto the stack or perform any higher precedence
15185 * stacked operation */
15186 only_to_avoid_leaks = av_pop(stack);
15187 SvREFCNT_dec(only_to_avoid_leaks);
15188 av_push(stack, rhs);
15191 case '!': /* Highest priority, right associative */
15193 /* If what's already at the top of the stack is another '!",
15194 * they just cancel each other out */
15195 if ( (top_ptr = av_fetch(stack, top_index, FALSE))
15196 && (IS_OPERATOR(*top_ptr) && SvUV(*top_ptr) == '!'))
15198 only_to_avoid_leaks = av_pop(stack);
15199 SvREFCNT_dec(only_to_avoid_leaks);
15201 else { /* Otherwise, since it's right associative, just push
15203 av_push(stack, newSVuv(curchar));
15208 RExC_parse += (UTF) ? UTF8SKIP(RExC_parse) : 1;
15209 vFAIL("Unexpected character");
15213 /* Here 'current' is the operand. If something is already on the
15214 * stack, we have to check if it is a !. But first, the code above
15215 * may have altered the stack in the time since we earlier set
15218 top_index = av_tindex_nomg(stack);
15219 if (top_index - fence >= 0) {
15220 /* If the top entry on the stack is an operator, it had better
15221 * be a '!', otherwise the entry below the top operand should
15222 * be an operator */
15223 top_ptr = av_fetch(stack, top_index, FALSE);
15225 if (IS_OPERATOR(*top_ptr)) {
15227 /* The only permissible operator at the top of the stack is
15228 * '!', which is applied immediately to this operand. */
15229 curchar = (char) SvUV(*top_ptr);
15230 if (curchar != '!') {
15231 SvREFCNT_dec(current);
15232 vFAIL2("Unexpected binary operator '%c' with no "
15233 "preceding operand", curchar);
15236 _invlist_invert(current);
15238 only_to_avoid_leaks = av_pop(stack);
15239 SvREFCNT_dec(only_to_avoid_leaks);
15241 /* And we redo with the inverted operand. This allows
15242 * handling multiple ! in a row */
15243 goto handle_operand;
15245 /* Single operand is ok only for the non-binary ')'
15247 else if ((top_index - fence == 0 && curchar != ')')
15248 || (top_index - fence > 0
15249 && (! (stacked_ptr = av_fetch(stack,
15252 || IS_OPERAND(*stacked_ptr))))
15254 SvREFCNT_dec(current);
15255 vFAIL("Operand with no preceding operator");
15259 /* Here there was nothing on the stack or the top element was
15260 * another operand. Just add this new one */
15261 av_push(stack, current);
15263 } /* End of switch on next parse token */
15265 RExC_parse += (UTF) ? UTF8SKIP(RExC_parse) : 1;
15266 } /* End of loop parsing through the construct */
15269 if (av_tindex_nomg(fence_stack) >= 0) {
15270 vFAIL("Unmatched (");
15273 if (av_tindex_nomg(stack) < 0 /* Was empty */
15274 || ((final = av_pop(stack)) == NULL)
15275 || ! IS_OPERAND(final)
15276 || SvTYPE(final) != SVt_INVLIST
15277 || av_tindex_nomg(stack) >= 0) /* More left on stack */
15280 SvREFCNT_dec(final);
15281 vFAIL("Incomplete expression within '(?[ ])'");
15284 /* Here, 'final' is the resultant inversion list from evaluating the
15285 * expression. Return it if so requested */
15286 if (return_invlist) {
15287 *return_invlist = final;
15291 /* Otherwise generate a resultant node, based on 'final'. regclass() is
15292 * expecting a string of ranges and individual code points */
15293 invlist_iterinit(final);
15294 result_string = newSVpvs("");
15295 while (invlist_iternext(final, &start, &end)) {
15296 if (start == end) {
15297 Perl_sv_catpvf(aTHX_ result_string, "\\x{%"UVXf"}", start);
15300 Perl_sv_catpvf(aTHX_ result_string, "\\x{%"UVXf"}-\\x{%"UVXf"}",
15305 /* About to generate an ANYOF (or similar) node from the inversion list we
15306 * have calculated */
15307 save_parse = RExC_parse;
15308 RExC_parse = SvPV(result_string, len);
15309 save_end = RExC_end;
15310 RExC_end = RExC_parse + len;
15312 /* We turn off folding around the call, as the class we have constructed
15313 * already has all folding taken into consideration, and we don't want
15314 * regclass() to add to that */
15315 RExC_flags &= ~RXf_PMf_FOLD;
15316 /* regclass() can only return RESTART_PASS1 and NEED_UTF8 if multi-char
15317 * folds are allowed. */
15318 node = regclass(pRExC_state, flagp,depth+1,
15319 FALSE, /* means parse the whole char class */
15320 FALSE, /* don't allow multi-char folds */
15321 TRUE, /* silence non-portable warnings. The above may very
15322 well have generated non-portable code points, but
15323 they're valid on this machine */
15324 FALSE, /* similarly, no need for strict */
15325 FALSE, /* Require return to be an ANYOF */
15330 FAIL2("panic: regclass returned NULL to handle_sets, flags=%#"UVxf,
15333 /* Fix up the node type if we are in locale. (We have pretended we are
15334 * under /u for the purposes of regclass(), as this construct will only
15335 * work under UTF-8 locales. But now we change the opcode to be ANYOFL (so
15336 * as to cause any warnings about bad locales to be output in regexec.c),
15337 * and add the flag that indicates to check if not in a UTF-8 locale. The
15338 * reason we above forbid optimization into something other than an ANYOF
15339 * node is simply to minimize the number of code changes in regexec.c.
15340 * Otherwise we would have to create new EXACTish node types and deal with
15341 * them. This decision could be revisited should this construct become
15344 * (One might think we could look at the resulting ANYOF node and suppress
15345 * the flag if everything is above 255, as those would be UTF-8 only,
15346 * but this isn't true, as the components that led to that result could
15347 * have been locale-affected, and just happen to cancel each other out
15348 * under UTF-8 locales.) */
15350 set_regex_charset(&RExC_flags, REGEX_LOCALE_CHARSET);
15352 assert(OP(node) == ANYOF);
15356 |= ANYOFL_SHARED_UTF8_LOCALE_fold_HAS_MATCHES_nonfold_REQD;
15360 RExC_flags |= RXf_PMf_FOLD;
15363 RExC_parse = save_parse + 1;
15364 RExC_end = save_end;
15365 SvREFCNT_dec_NN(final);
15366 SvREFCNT_dec_NN(result_string);
15368 nextchar(pRExC_state);
15369 Set_Node_Length(node, RExC_parse - oregcomp_parse + 1); /* MJD */
15376 S_add_above_Latin1_folds(pTHX_ RExC_state_t *pRExC_state, const U8 cp, SV** invlist)
15378 /* This hard-codes the Latin1/above-Latin1 folding rules, so that an
15379 * innocent-looking character class, like /[ks]/i won't have to go out to
15380 * disk to find the possible matches.
15382 * This should be called only for a Latin1-range code points, cp, which is
15383 * known to be involved in a simple fold with other code points above
15384 * Latin1. It would give false results if /aa has been specified.
15385 * Multi-char folds are outside the scope of this, and must be handled
15388 * XXX It would be better to generate these via regen, in case a new
15389 * version of the Unicode standard adds new mappings, though that is not
15390 * really likely, and may be caught by the default: case of the switch
15393 PERL_ARGS_ASSERT_ADD_ABOVE_LATIN1_FOLDS;
15395 assert(HAS_NONLATIN1_SIMPLE_FOLD_CLOSURE(cp));
15401 add_cp_to_invlist(*invlist, KELVIN_SIGN);
15405 *invlist = add_cp_to_invlist(*invlist, LATIN_SMALL_LETTER_LONG_S);
15408 *invlist = add_cp_to_invlist(*invlist, GREEK_CAPITAL_LETTER_MU);
15409 *invlist = add_cp_to_invlist(*invlist, GREEK_SMALL_LETTER_MU);
15411 case LATIN_CAPITAL_LETTER_A_WITH_RING_ABOVE:
15412 case LATIN_SMALL_LETTER_A_WITH_RING_ABOVE:
15413 *invlist = add_cp_to_invlist(*invlist, ANGSTROM_SIGN);
15415 case LATIN_SMALL_LETTER_Y_WITH_DIAERESIS:
15416 *invlist = add_cp_to_invlist(*invlist,
15417 LATIN_CAPITAL_LETTER_Y_WITH_DIAERESIS);
15420 #ifdef LATIN_CAPITAL_LETTER_SHARP_S /* not defined in early Unicode releases */
15422 case LATIN_SMALL_LETTER_SHARP_S:
15423 *invlist = add_cp_to_invlist(*invlist, LATIN_CAPITAL_LETTER_SHARP_S);
15428 #if UNICODE_MAJOR_VERSION < 3 \
15429 || (UNICODE_MAJOR_VERSION == 3 && UNICODE_DOT_VERSION == 0)
15431 /* In 3.0 and earlier, U+0130 folded simply to 'i'; and in 3.0.1 so did
15436 add_cp_to_invlist(*invlist, LATIN_CAPITAL_LETTER_I_WITH_DOT_ABOVE);
15437 # if UNICODE_DOT_DOT_VERSION == 1
15438 *invlist = add_cp_to_invlist(*invlist, LATIN_SMALL_LETTER_DOTLESS_I);
15444 /* Use deprecated warning to increase the chances of this being
15447 ckWARN2reg_d(RExC_parse, "Perl folding rules are not up-to-date for 0x%02X; please use the perlbug utility to report;", cp);
15454 S_output_or_return_posix_warnings(pTHX_ RExC_state_t *pRExC_state, AV* posix_warnings, AV** return_posix_warnings)
15456 /* If the final parameter is NULL, output the elements of the array given
15457 * by '*posix_warnings' as REGEXP warnings. Otherwise, the elements are
15458 * pushed onto it, (creating if necessary) */
15461 const bool first_is_fatal = ! return_posix_warnings
15462 && ckDEAD(packWARN(WARN_REGEXP));
15464 PERL_ARGS_ASSERT_OUTPUT_OR_RETURN_POSIX_WARNINGS;
15466 while ((msg = av_shift(posix_warnings)) != &PL_sv_undef) {
15467 if (return_posix_warnings) {
15468 if (! *return_posix_warnings) { /* mortalize to not leak if
15469 warnings are fatal */
15470 *return_posix_warnings = (AV *) sv_2mortal((SV *) newAV());
15472 av_push(*return_posix_warnings, msg);
15475 if (first_is_fatal) { /* Avoid leaking this */
15476 av_undef(posix_warnings); /* This isn't necessary if the
15477 array is mortal, but is a
15479 (void) sv_2mortal(msg);
15481 SAVEFREESV(RExC_rx_sv);
15484 Perl_warner(aTHX_ packWARN(WARN_REGEXP), "%s", SvPVX(msg));
15485 SvREFCNT_dec_NN(msg);
15491 S_add_multi_match(pTHX_ AV* multi_char_matches, SV* multi_string, const STRLEN cp_count)
15493 /* This adds the string scalar <multi_string> to the array
15494 * <multi_char_matches>. <multi_string> is known to have exactly
15495 * <cp_count> code points in it. This is used when constructing a
15496 * bracketed character class and we find something that needs to match more
15497 * than a single character.
15499 * <multi_char_matches> is actually an array of arrays. Each top-level
15500 * element is an array that contains all the strings known so far that are
15501 * the same length. And that length (in number of code points) is the same
15502 * as the index of the top-level array. Hence, the [2] element is an
15503 * array, each element thereof is a string containing TWO code points;
15504 * while element [3] is for strings of THREE characters, and so on. Since
15505 * this is for multi-char strings there can never be a [0] nor [1] element.
15507 * When we rewrite the character class below, we will do so such that the
15508 * longest strings are written first, so that it prefers the longest
15509 * matching strings first. This is done even if it turns out that any
15510 * quantifier is non-greedy, out of this programmer's (khw) laziness. Tom
15511 * Christiansen has agreed that this is ok. This makes the test for the
15512 * ligature 'ffi' come before the test for 'ff', for example */
15515 AV** this_array_ptr;
15517 PERL_ARGS_ASSERT_ADD_MULTI_MATCH;
15519 if (! multi_char_matches) {
15520 multi_char_matches = newAV();
15523 if (av_exists(multi_char_matches, cp_count)) {
15524 this_array_ptr = (AV**) av_fetch(multi_char_matches, cp_count, FALSE);
15525 this_array = *this_array_ptr;
15528 this_array = newAV();
15529 av_store(multi_char_matches, cp_count,
15532 av_push(this_array, multi_string);
15534 return multi_char_matches;
15537 /* The names of properties whose definitions are not known at compile time are
15538 * stored in this SV, after a constant heading. So if the length has been
15539 * changed since initialization, then there is a run-time definition. */
15540 #define HAS_NONLOCALE_RUNTIME_PROPERTY_DEFINITION \
15541 (SvCUR(listsv) != initial_listsv_len)
15543 /* There is a restricted set of white space characters that are legal when
15544 * ignoring white space in a bracketed character class. This generates the
15545 * code to skip them.
15547 * There is a line below that uses the same white space criteria but is outside
15548 * this macro. Both here and there must use the same definition */
15549 #define SKIP_BRACKETED_WHITE_SPACE(do_skip, p) \
15552 while (isBLANK_A(UCHARAT(p))) \
15560 S_regclass(pTHX_ RExC_state_t *pRExC_state, I32 *flagp, U32 depth,
15561 const bool stop_at_1, /* Just parse the next thing, don't
15562 look for a full character class */
15563 bool allow_multi_folds,
15564 const bool silence_non_portable, /* Don't output warnings
15568 bool optimizable, /* ? Allow a non-ANYOF return
15570 SV** ret_invlist, /* Return an inversion list, not a node */
15571 AV** return_posix_warnings
15574 /* parse a bracketed class specification. Most of these will produce an
15575 * ANYOF node; but something like [a] will produce an EXACT node; [aA], an
15576 * EXACTFish node; [[:ascii:]], a POSIXA node; etc. It is more complex
15577 * under /i with multi-character folds: it will be rewritten following the
15578 * paradigm of this example, where the <multi-fold>s are characters which
15579 * fold to multiple character sequences:
15580 * /[abc\x{multi-fold1}def\x{multi-fold2}ghi]/i
15581 * gets effectively rewritten as:
15582 * /(?:\x{multi-fold1}|\x{multi-fold2}|[abcdefghi]/i
15583 * reg() gets called (recursively) on the rewritten version, and this
15584 * function will return what it constructs. (Actually the <multi-fold>s
15585 * aren't physically removed from the [abcdefghi], it's just that they are
15586 * ignored in the recursion by means of a flag:
15587 * <RExC_in_multi_char_class>.)
15589 * ANYOF nodes contain a bit map for the first NUM_ANYOF_CODE_POINTS
15590 * characters, with the corresponding bit set if that character is in the
15591 * list. For characters above this, a range list or swash is used. There
15592 * are extra bits for \w, etc. in locale ANYOFs, as what these match is not
15593 * determinable at compile time
15595 * Returns NULL, setting *flagp to RESTART_PASS1 if the sizing scan needs
15596 * to be restarted, or'd with NEED_UTF8 if the pattern needs to be upgraded
15597 * to UTF-8. This can only happen if ret_invlist is non-NULL.
15600 UV prevvalue = OOB_UNICODE, save_prevvalue = OOB_UNICODE;
15602 UV value = OOB_UNICODE, save_value = OOB_UNICODE;
15605 int namedclass = OOB_NAMEDCLASS;
15606 char *rangebegin = NULL;
15607 bool need_class = 0;
15609 STRLEN initial_listsv_len = 0; /* Kind of a kludge to see if it is more
15610 than just initialized. */
15611 SV* properties = NULL; /* Code points that match \p{} \P{} */
15612 SV* posixes = NULL; /* Code points that match classes like [:word:],
15613 extended beyond the Latin1 range. These have to
15614 be kept separate from other code points for much
15615 of this function because their handling is
15616 different under /i, and for most classes under
15618 SV* nposixes = NULL; /* Similarly for [:^word:]. These are kept
15619 separate for a while from the non-complemented
15620 versions because of complications with /d
15622 SV* simple_posixes = NULL; /* But under some conditions, the classes can be
15623 treated more simply than the general case,
15624 leading to less compilation and execution
15626 UV element_count = 0; /* Number of distinct elements in the class.
15627 Optimizations may be possible if this is tiny */
15628 AV * multi_char_matches = NULL; /* Code points that fold to more than one
15629 character; used under /i */
15631 char * stop_ptr = RExC_end; /* where to stop parsing */
15632 const bool skip_white = cBOOL(ret_invlist); /* ignore unescaped white
15635 /* Unicode properties are stored in a swash; this holds the current one
15636 * being parsed. If this swash is the only above-latin1 component of the
15637 * character class, an optimization is to pass it directly on to the
15638 * execution engine. Otherwise, it is set to NULL to indicate that there
15639 * are other things in the class that have to be dealt with at execution
15641 SV* swash = NULL; /* Code points that match \p{} \P{} */
15643 /* Set if a component of this character class is user-defined; just passed
15644 * on to the engine */
15645 bool has_user_defined_property = FALSE;
15647 /* inversion list of code points this node matches only when the target
15648 * string is in UTF-8. These are all non-ASCII, < 256. (Because is under
15650 SV* has_upper_latin1_only_utf8_matches = NULL;
15652 /* Inversion list of code points this node matches regardless of things
15653 * like locale, folding, utf8ness of the target string */
15654 SV* cp_list = NULL;
15656 /* Like cp_list, but code points on this list need to be checked for things
15657 * that fold to/from them under /i */
15658 SV* cp_foldable_list = NULL;
15660 /* Like cp_list, but code points on this list are valid only when the
15661 * runtime locale is UTF-8 */
15662 SV* only_utf8_locale_list = NULL;
15664 /* In a range, if one of the endpoints is non-character-set portable,
15665 * meaning that it hard-codes a code point that may mean a different
15666 * charactger in ASCII vs. EBCDIC, as opposed to, say, a literal 'A' or a
15667 * mnemonic '\t' which each mean the same character no matter which
15668 * character set the platform is on. */
15669 unsigned int non_portable_endpoint = 0;
15671 /* Is the range unicode? which means on a platform that isn't 1-1 native
15672 * to Unicode (i.e. non-ASCII), each code point in it should be considered
15673 * to be a Unicode value. */
15674 bool unicode_range = FALSE;
15675 bool invert = FALSE; /* Is this class to be complemented */
15677 bool warn_super = ALWAYS_WARN_SUPER;
15679 regnode * const orig_emit = RExC_emit; /* Save the original RExC_emit in
15680 case we need to change the emitted regop to an EXACT. */
15681 const char * orig_parse = RExC_parse;
15682 const SSize_t orig_size = RExC_size;
15683 bool posixl_matches_all = FALSE; /* Does /l class have both e.g. \W,\w ? */
15685 /* This variable is used to mark where the end in the input is of something
15686 * that looks like a POSIX construct but isn't. During the parse, when
15687 * something looks like it could be such a construct is encountered, it is
15688 * checked for being one, but not if we've already checked this area of the
15689 * input. Only after this position is reached do we check again */
15690 char *not_posix_region_end = RExC_parse - 1;
15692 AV* posix_warnings = NULL;
15693 const bool do_posix_warnings = return_posix_warnings
15694 || (PASS2 && ckWARN(WARN_REGEXP));
15696 GET_RE_DEBUG_FLAGS_DECL;
15698 PERL_ARGS_ASSERT_REGCLASS;
15700 PERL_UNUSED_ARG(depth);
15703 DEBUG_PARSE("clas");
15705 #if UNICODE_MAJOR_VERSION < 3 /* no multifolds in early Unicode */ \
15706 || (UNICODE_MAJOR_VERSION == 3 && UNICODE_DOT_VERSION == 0 \
15707 && UNICODE_DOT_DOT_VERSION == 0)
15708 allow_multi_folds = FALSE;
15711 /* Assume we are going to generate an ANYOF node. */
15712 ret = reganode(pRExC_state,
15719 RExC_size += ANYOF_SKIP;
15720 listsv = &PL_sv_undef; /* For code scanners: listsv always non-NULL. */
15723 ANYOF_FLAGS(ret) = 0;
15725 RExC_emit += ANYOF_SKIP;
15726 listsv = newSVpvs_flags("# comment\n", SVs_TEMP);
15727 initial_listsv_len = SvCUR(listsv);
15728 SvTEMP_off(listsv); /* Grr, TEMPs and mortals are conflated. */
15731 SKIP_BRACKETED_WHITE_SPACE(skip_white, RExC_parse);
15733 assert(RExC_parse <= RExC_end);
15735 if (UCHARAT(RExC_parse) == '^') { /* Complement the class */
15738 allow_multi_folds = FALSE;
15740 SKIP_BRACKETED_WHITE_SPACE(skip_white, RExC_parse);
15743 /* Check that they didn't say [:posix:] instead of [[:posix:]] */
15744 if (! ret_invlist && MAYBE_POSIXCC(UCHARAT(RExC_parse))) {
15745 int maybe_class = handle_possible_posix(pRExC_state,
15747 ¬_posix_region_end,
15749 TRUE /* checking only */);
15750 if (PASS2 && maybe_class >= OOB_NAMEDCLASS && do_posix_warnings) {
15751 SAVEFREESV(RExC_rx_sv);
15752 ckWARN4reg(not_posix_region_end,
15753 "POSIX syntax [%c %c] belongs inside character classes%s",
15754 *RExC_parse, *RExC_parse,
15755 (maybe_class == OOB_NAMEDCLASS)
15756 ? ((POSIXCC_NOTYET(*RExC_parse))
15757 ? " (but this one isn't implemented)"
15758 : " (but this one isn't fully valid)")
15761 (void)ReREFCNT_inc(RExC_rx_sv);
15765 /* If the caller wants us to just parse a single element, accomplish this
15766 * by faking the loop ending condition */
15767 if (stop_at_1 && RExC_end > RExC_parse) {
15768 stop_ptr = RExC_parse + 1;
15771 /* allow 1st char to be ']' (allowing it to be '-' is dealt with later) */
15772 if (UCHARAT(RExC_parse) == ']')
15773 goto charclassloop;
15777 if ( posix_warnings
15778 && av_tindex_nomg(posix_warnings) >= 0
15779 && RExC_parse > not_posix_region_end)
15781 /* Warnings about posix class issues are considered tentative until
15782 * we are far enough along in the parse that we can no longer
15783 * change our mind, at which point we either output them or add
15784 * them, if it has so specified, to what gets returned to the
15785 * caller. This is done each time through the loop so that a later
15786 * class won't zap them before they have been dealt with. */
15787 output_or_return_posix_warnings(pRExC_state, posix_warnings,
15788 return_posix_warnings);
15791 if (RExC_parse >= stop_ptr) {
15795 SKIP_BRACKETED_WHITE_SPACE(skip_white, RExC_parse);
15797 if (UCHARAT(RExC_parse) == ']') {
15803 namedclass = OOB_NAMEDCLASS; /* initialize as illegal */
15804 save_value = value;
15805 save_prevvalue = prevvalue;
15808 rangebegin = RExC_parse;
15810 non_portable_endpoint = 0;
15812 if (UTF && ! UTF8_IS_INVARIANT(* RExC_parse)) {
15813 value = utf8n_to_uvchr((U8*)RExC_parse,
15814 RExC_end - RExC_parse,
15815 &numlen, UTF8_ALLOW_DEFAULT);
15816 RExC_parse += numlen;
15819 value = UCHARAT(RExC_parse++);
15821 if (value == '[') {
15822 char * posix_class_end;
15823 namedclass = handle_possible_posix(pRExC_state,
15826 do_posix_warnings ? &posix_warnings : NULL,
15827 FALSE /* die if error */);
15828 if (namedclass > OOB_NAMEDCLASS) {
15830 /* If there was an earlier attempt to parse this particular
15831 * posix class, and it failed, it was a false alarm, as this
15832 * successful one proves */
15833 if ( posix_warnings
15834 && av_tindex_nomg(posix_warnings) >= 0
15835 && not_posix_region_end >= RExC_parse
15836 && not_posix_region_end <= posix_class_end)
15838 av_undef(posix_warnings);
15841 RExC_parse = posix_class_end;
15843 else if (namedclass == OOB_NAMEDCLASS) {
15844 not_posix_region_end = posix_class_end;
15847 namedclass = OOB_NAMEDCLASS;
15850 else if ( RExC_parse - 1 > not_posix_region_end
15851 && MAYBE_POSIXCC(value))
15853 (void) handle_possible_posix(
15855 RExC_parse - 1, /* -1 because parse has already been
15857 ¬_posix_region_end,
15858 do_posix_warnings ? &posix_warnings : NULL,
15859 TRUE /* checking only */);
15861 else if (value == '\\') {
15862 /* Is a backslash; get the code point of the char after it */
15864 if (RExC_parse >= RExC_end) {
15865 vFAIL("Unmatched [");
15868 if (UTF && ! UTF8_IS_INVARIANT(UCHARAT(RExC_parse))) {
15869 value = utf8n_to_uvchr((U8*)RExC_parse,
15870 RExC_end - RExC_parse,
15871 &numlen, UTF8_ALLOW_DEFAULT);
15872 RExC_parse += numlen;
15875 value = UCHARAT(RExC_parse++);
15877 /* Some compilers cannot handle switching on 64-bit integer
15878 * values, therefore value cannot be an UV. Yes, this will
15879 * be a problem later if we want switch on Unicode.
15880 * A similar issue a little bit later when switching on
15881 * namedclass. --jhi */
15883 /* If the \ is escaping white space when white space is being
15884 * skipped, it means that that white space is wanted literally, and
15885 * is already in 'value'. Otherwise, need to translate the escape
15886 * into what it signifies. */
15887 if (! skip_white || ! isBLANK_A(value)) switch ((I32)value) {
15889 case 'w': namedclass = ANYOF_WORDCHAR; break;
15890 case 'W': namedclass = ANYOF_NWORDCHAR; break;
15891 case 's': namedclass = ANYOF_SPACE; break;
15892 case 'S': namedclass = ANYOF_NSPACE; break;
15893 case 'd': namedclass = ANYOF_DIGIT; break;
15894 case 'D': namedclass = ANYOF_NDIGIT; break;
15895 case 'v': namedclass = ANYOF_VERTWS; break;
15896 case 'V': namedclass = ANYOF_NVERTWS; break;
15897 case 'h': namedclass = ANYOF_HORIZWS; break;
15898 case 'H': namedclass = ANYOF_NHORIZWS; break;
15899 case 'N': /* Handle \N{NAME} in class */
15901 const char * const backslash_N_beg = RExC_parse - 2;
15904 if (! grok_bslash_N(pRExC_state,
15905 NULL, /* No regnode */
15906 &value, /* Yes single value */
15907 &cp_count, /* Multiple code pt count */
15913 if (*flagp & NEED_UTF8)
15914 FAIL("panic: grok_bslash_N set NEED_UTF8");
15915 if (*flagp & RESTART_PASS1)
15918 if (cp_count < 0) {
15919 vFAIL("\\N in a character class must be a named character: \\N{...}");
15921 else if (cp_count == 0) {
15923 ckWARNreg(RExC_parse,
15924 "Ignoring zero length \\N{} in character class");
15927 else { /* cp_count > 1 */
15928 if (! RExC_in_multi_char_class) {
15929 if (invert || range || *RExC_parse == '-') {
15932 vFAIL("\\N{} in inverted character class or as a range end-point is restricted to one character");
15935 ckWARNreg(RExC_parse, "Using just the first character returned by \\N{} in character class");
15937 break; /* <value> contains the first code
15938 point. Drop out of the switch to
15942 SV * multi_char_N = newSVpvn(backslash_N_beg,
15943 RExC_parse - backslash_N_beg);
15945 = add_multi_match(multi_char_matches,
15950 } /* End of cp_count != 1 */
15952 /* This element should not be processed further in this
15955 value = save_value;
15956 prevvalue = save_prevvalue;
15957 continue; /* Back to top of loop to get next char */
15960 /* Here, is a single code point, and <value> contains it */
15961 unicode_range = TRUE; /* \N{} are Unicode */
15969 /* We will handle any undefined properties ourselves */
15970 U8 swash_init_flags = _CORE_SWASH_INIT_RETURN_IF_UNDEF
15971 /* And we actually would prefer to get
15972 * the straight inversion list of the
15973 * swash, since we will be accessing it
15974 * anyway, to save a little time */
15975 |_CORE_SWASH_INIT_ACCEPT_INVLIST;
15977 if (RExC_parse >= RExC_end)
15978 vFAIL2("Empty \\%c", (U8)value);
15979 if (*RExC_parse == '{') {
15980 const U8 c = (U8)value;
15981 e = strchr(RExC_parse, '}');
15984 vFAIL2("Missing right brace on \\%c{}", c);
15988 while (isSPACE(*RExC_parse)) {
15992 if (UCHARAT(RExC_parse) == '^') {
15994 /* toggle. (The rhs xor gets the single bit that
15995 * differs between P and p; the other xor inverts just
15997 value ^= 'P' ^ 'p';
16000 while (isSPACE(*RExC_parse)) {
16005 if (e == RExC_parse)
16006 vFAIL2("Empty \\%c{}", c);
16008 n = e - RExC_parse;
16009 while (isSPACE(*(RExC_parse + n - 1)))
16011 } /* The \p isn't immediately followed by a '{' */
16012 else if (! isALPHA(*RExC_parse)) {
16013 RExC_parse += (UTF) ? UTF8SKIP(RExC_parse) : 1;
16014 vFAIL2("Character following \\%c must be '{' or a "
16015 "single-character Unicode property name",
16025 char* base_name; /* name after any packages are stripped */
16026 char* lookup_name = NULL;
16027 const char * const colon_colon = "::";
16029 /* Try to get the definition of the property into
16030 * <invlist>. If /i is in effect, the effective property
16031 * will have its name be <__NAME_i>. The design is
16032 * discussed in commit
16033 * 2f833f5208e26b208886e51e09e2c072b5eabb46 */
16034 name = savepv(Perl_form(aTHX_ "%.*s", (int)n, RExC_parse));
16037 lookup_name = savepv(Perl_form(aTHX_ "__%s_i", name));
16039 /* The function call just below that uses this can fail
16040 * to return, leaking memory if we don't do this */
16041 SAVEFREEPV(lookup_name);
16044 /* Look up the property name, and get its swash and
16045 * inversion list, if the property is found */
16046 SvREFCNT_dec(swash); /* Free any left-overs */
16047 swash = _core_swash_init("utf8",
16054 NULL, /* No inversion list */
16057 if (! swash || ! (invlist = _get_swash_invlist(swash))) {
16058 HV* curpkg = (IN_PERL_COMPILETIME)
16060 : CopSTASH(PL_curcop);
16064 if (swash) { /* Got a swash but no inversion list.
16065 Something is likely wrong that will
16066 be sorted-out later */
16067 SvREFCNT_dec_NN(swash);
16071 /* Here didn't find it. It could be a an error (like a
16072 * typo) in specifying a Unicode property, or it could
16073 * be a user-defined property that will be available at
16074 * run-time. The names of these must begin with 'In'
16075 * or 'Is' (after any packages are stripped off). So
16076 * if not one of those, or if we accept only
16077 * compile-time properties, is an error; otherwise add
16078 * it to the list for run-time look up. */
16079 if ((base_name = rninstr(name, name + n,
16080 colon_colon, colon_colon + 2)))
16081 { /* Has ::. We know this must be a user-defined
16084 final_n -= base_name - name;
16093 || base_name[0] != 'I'
16094 || (base_name[1] != 's' && base_name[1] != 'n')
16097 const char * const msg
16099 ? "Illegal user-defined property name"
16100 : "Can't find Unicode property definition";
16101 RExC_parse = e + 1;
16103 /* diag_listed_as: Can't find Unicode property definition "%s" */
16104 vFAIL3utf8f("%s \"%"UTF8f"\"",
16105 msg, UTF8fARG(UTF, n, name));
16108 /* If the property name doesn't already have a package
16109 * name, add the current one to it so that it can be
16110 * referred to outside it. [perl #121777] */
16111 if (! has_pkg && curpkg) {
16112 char* pkgname = HvNAME(curpkg);
16113 if (strNE(pkgname, "main")) {
16114 char* full_name = Perl_form(aTHX_
16118 n = strlen(full_name);
16119 name = savepvn(full_name, n);
16123 Perl_sv_catpvf(aTHX_ listsv, "%cutf8::%s%"UTF8f"%s\n",
16124 (value == 'p' ? '+' : '!'),
16125 (FOLD) ? "__" : "",
16126 UTF8fARG(UTF, n, name),
16127 (FOLD) ? "_i" : "");
16128 has_user_defined_property = TRUE;
16129 optimizable = FALSE; /* Will have to leave this an
16132 /* We don't know yet what this matches, so have to flag
16134 ANYOF_FLAGS(ret) |= ANYOF_SHARED_d_UPPER_LATIN1_UTF8_STRING_MATCHES_non_d_RUNTIME_USER_PROP;
16138 /* Here, did get the swash and its inversion list. If
16139 * the swash is from a user-defined property, then this
16140 * whole character class should be regarded as such */
16141 if (swash_init_flags
16142 & _CORE_SWASH_INIT_USER_DEFINED_PROPERTY)
16144 has_user_defined_property = TRUE;
16147 /* We warn on matching an above-Unicode code point
16148 * if the match would return true, except don't
16149 * warn for \p{All}, which has exactly one element
16151 (_invlist_contains_cp(invlist, 0x110000)
16152 && (! (_invlist_len(invlist) == 1
16153 && *invlist_array(invlist) == 0)))
16159 /* Invert if asking for the complement */
16160 if (value == 'P') {
16161 _invlist_union_complement_2nd(properties,
16165 /* The swash can't be used as-is, because we've
16166 * inverted things; delay removing it to here after
16167 * have copied its invlist above */
16168 SvREFCNT_dec_NN(swash);
16172 _invlist_union(properties, invlist, &properties);
16176 RExC_parse = e + 1;
16177 namedclass = ANYOF_UNIPROP; /* no official name, but it's
16180 /* \p means they want Unicode semantics */
16181 REQUIRE_UNI_RULES(flagp, NULL);
16184 case 'n': value = '\n'; break;
16185 case 'r': value = '\r'; break;
16186 case 't': value = '\t'; break;
16187 case 'f': value = '\f'; break;
16188 case 'b': value = '\b'; break;
16189 case 'e': value = ESC_NATIVE; break;
16190 case 'a': value = '\a'; break;
16192 RExC_parse--; /* function expects to be pointed at the 'o' */
16194 const char* error_msg;
16195 bool valid = grok_bslash_o(&RExC_parse,
16198 PASS2, /* warnings only in
16201 silence_non_portable,
16207 non_portable_endpoint++;
16208 if (IN_ENCODING && value < 0x100) {
16209 goto recode_encoding;
16213 RExC_parse--; /* function expects to be pointed at the 'x' */
16215 const char* error_msg;
16216 bool valid = grok_bslash_x(&RExC_parse,
16219 PASS2, /* Output warnings */
16221 silence_non_portable,
16227 non_portable_endpoint++;
16228 if (IN_ENCODING && value < 0x100)
16229 goto recode_encoding;
16232 value = grok_bslash_c(*RExC_parse++, PASS2);
16233 non_portable_endpoint++;
16235 case '0': case '1': case '2': case '3': case '4':
16236 case '5': case '6': case '7':
16238 /* Take 1-3 octal digits */
16239 I32 flags = PERL_SCAN_SILENT_ILLDIGIT;
16240 numlen = (strict) ? 4 : 3;
16241 value = grok_oct(--RExC_parse, &numlen, &flags, NULL);
16242 RExC_parse += numlen;
16245 RExC_parse += (UTF) ? UTF8SKIP(RExC_parse) : 1;
16246 vFAIL("Need exactly 3 octal digits");
16248 else if (! SIZE_ONLY /* like \08, \178 */
16250 && RExC_parse < RExC_end
16251 && isDIGIT(*RExC_parse)
16252 && ckWARN(WARN_REGEXP))
16254 SAVEFREESV(RExC_rx_sv);
16255 reg_warn_non_literal_string(
16257 form_short_octal_warning(RExC_parse, numlen));
16258 (void)ReREFCNT_inc(RExC_rx_sv);
16261 non_portable_endpoint++;
16262 if (IN_ENCODING && value < 0x100)
16263 goto recode_encoding;
16267 if (! RExC_override_recoding) {
16268 SV* enc = _get_encoding();
16269 value = reg_recode((U8)value, &enc);
16272 vFAIL("Invalid escape in the specified encoding");
16275 ckWARNreg(RExC_parse,
16276 "Invalid escape in the specified encoding");
16282 /* Allow \_ to not give an error */
16283 if (!SIZE_ONLY && isWORDCHAR(value) && value != '_') {
16285 vFAIL2("Unrecognized escape \\%c in character class",
16289 SAVEFREESV(RExC_rx_sv);
16290 ckWARN2reg(RExC_parse,
16291 "Unrecognized escape \\%c in character class passed through",
16293 (void)ReREFCNT_inc(RExC_rx_sv);
16297 } /* End of switch on char following backslash */
16298 } /* end of handling backslash escape sequences */
16300 /* Here, we have the current token in 'value' */
16302 if (namedclass > OOB_NAMEDCLASS) { /* this is a named class \blah */
16305 /* a bad range like a-\d, a-[:digit:]. The '-' is taken as a
16306 * literal, as is the character that began the false range, i.e.
16307 * the 'a' in the examples */
16310 const int w = (RExC_parse >= rangebegin)
16311 ? RExC_parse - rangebegin
16315 "False [] range \"%"UTF8f"\"",
16316 UTF8fARG(UTF, w, rangebegin));
16319 SAVEFREESV(RExC_rx_sv); /* in case of fatal warnings */
16320 ckWARN2reg(RExC_parse,
16321 "False [] range \"%"UTF8f"\"",
16322 UTF8fARG(UTF, w, rangebegin));
16323 (void)ReREFCNT_inc(RExC_rx_sv);
16324 cp_list = add_cp_to_invlist(cp_list, '-');
16325 cp_foldable_list = add_cp_to_invlist(cp_foldable_list,
16330 range = 0; /* this was not a true range */
16331 element_count += 2; /* So counts for three values */
16334 classnum = namedclass_to_classnum(namedclass);
16336 if (LOC && namedclass < ANYOF_POSIXL_MAX
16337 #ifndef HAS_ISASCII
16338 && classnum != _CC_ASCII
16341 /* What the Posix classes (like \w, [:space:]) match in locale
16342 * isn't knowable under locale until actual match time. Room
16343 * must be reserved (one time per outer bracketed class) to
16344 * store such classes. The space will contain a bit for each
16345 * named class that is to be matched against. This isn't
16346 * needed for \p{} and pseudo-classes, as they are not affected
16347 * by locale, and hence are dealt with separately */
16348 if (! need_class) {
16351 RExC_size += ANYOF_POSIXL_SKIP - ANYOF_SKIP;
16354 RExC_emit += ANYOF_POSIXL_SKIP - ANYOF_SKIP;
16356 ANYOF_FLAGS(ret) |= ANYOF_MATCHES_POSIXL;
16357 ANYOF_POSIXL_ZERO(ret);
16359 /* We can't change this into some other type of node
16360 * (unless this is the only element, in which case there
16361 * are nodes that mean exactly this) as has runtime
16363 optimizable = FALSE;
16366 /* Coverity thinks it is possible for this to be negative; both
16367 * jhi and khw think it's not, but be safer */
16368 assert(! (ANYOF_FLAGS(ret) & ANYOF_MATCHES_POSIXL)
16369 || (namedclass + ((namedclass % 2) ? -1 : 1)) >= 0);
16371 /* See if it already matches the complement of this POSIX
16373 if ((ANYOF_FLAGS(ret) & ANYOF_MATCHES_POSIXL)
16374 && ANYOF_POSIXL_TEST(ret, namedclass + ((namedclass % 2)
16378 posixl_matches_all = TRUE;
16379 break; /* No need to continue. Since it matches both
16380 e.g., \w and \W, it matches everything, and the
16381 bracketed class can be optimized into qr/./s */
16384 /* Add this class to those that should be checked at runtime */
16385 ANYOF_POSIXL_SET(ret, namedclass);
16387 /* The above-Latin1 characters are not subject to locale rules.
16388 * Just add them, in the second pass, to the
16389 * unconditionally-matched list */
16391 SV* scratch_list = NULL;
16393 /* Get the list of the above-Latin1 code points this
16395 _invlist_intersection_maybe_complement_2nd(PL_AboveLatin1,
16396 PL_XPosix_ptrs[classnum],
16398 /* Odd numbers are complements, like
16399 * NDIGIT, NASCII, ... */
16400 namedclass % 2 != 0,
16402 /* Checking if 'cp_list' is NULL first saves an extra
16403 * clone. Its reference count will be decremented at the
16404 * next union, etc, or if this is the only instance, at the
16405 * end of the routine */
16407 cp_list = scratch_list;
16410 _invlist_union(cp_list, scratch_list, &cp_list);
16411 SvREFCNT_dec_NN(scratch_list);
16413 continue; /* Go get next character */
16416 else if (! SIZE_ONLY) {
16418 /* Here, not in pass1 (in that pass we skip calculating the
16419 * contents of this class), and is /l, or is a POSIX class for
16420 * which /l doesn't matter (or is a Unicode property, which is
16421 * skipped here). */
16422 if (namedclass >= ANYOF_POSIXL_MAX) { /* If a special class */
16423 if (namedclass != ANYOF_UNIPROP) { /* UNIPROP = \p and \P */
16425 /* Here, should be \h, \H, \v, or \V. None of /d, /i
16426 * nor /l make a difference in what these match,
16427 * therefore we just add what they match to cp_list. */
16428 if (classnum != _CC_VERTSPACE) {
16429 assert( namedclass == ANYOF_HORIZWS
16430 || namedclass == ANYOF_NHORIZWS);
16432 /* It turns out that \h is just a synonym for
16434 classnum = _CC_BLANK;
16437 _invlist_union_maybe_complement_2nd(
16439 PL_XPosix_ptrs[classnum],
16440 namedclass % 2 != 0, /* Complement if odd
16441 (NHORIZWS, NVERTWS)
16446 else if (UNI_SEMANTICS
16447 || classnum == _CC_ASCII
16448 || (DEPENDS_SEMANTICS && (classnum == _CC_DIGIT
16449 || classnum == _CC_XDIGIT)))
16451 /* We usually have to worry about /d and /a affecting what
16452 * POSIX classes match, with special code needed for /d
16453 * because we won't know until runtime what all matches.
16454 * But there is no extra work needed under /u, and
16455 * [:ascii:] is unaffected by /a and /d; and :digit: and
16456 * :xdigit: don't have runtime differences under /d. So we
16457 * can special case these, and avoid some extra work below,
16458 * and at runtime. */
16459 _invlist_union_maybe_complement_2nd(
16461 PL_XPosix_ptrs[classnum],
16462 namedclass % 2 != 0,
16465 else { /* Garden variety class. If is NUPPER, NALPHA, ...
16466 complement and use nposixes */
16467 SV** posixes_ptr = namedclass % 2 == 0
16470 _invlist_union_maybe_complement_2nd(
16472 PL_XPosix_ptrs[classnum],
16473 namedclass % 2 != 0,
16477 } /* end of namedclass \blah */
16479 SKIP_BRACKETED_WHITE_SPACE(skip_white, RExC_parse);
16481 /* If 'range' is set, 'value' is the ending of a range--check its
16482 * validity. (If value isn't a single code point in the case of a
16483 * range, we should have figured that out above in the code that
16484 * catches false ranges). Later, we will handle each individual code
16485 * point in the range. If 'range' isn't set, this could be the
16486 * beginning of a range, so check for that by looking ahead to see if
16487 * the next real character to be processed is the range indicator--the
16492 /* For unicode ranges, we have to test that the Unicode as opposed
16493 * to the native values are not decreasing. (Above 255, there is
16494 * no difference between native and Unicode) */
16495 if (unicode_range && prevvalue < 255 && value < 255) {
16496 if (NATIVE_TO_LATIN1(prevvalue) > NATIVE_TO_LATIN1(value)) {
16497 goto backwards_range;
16502 if (prevvalue > value) /* b-a */ {
16507 w = RExC_parse - rangebegin;
16509 "Invalid [] range \"%"UTF8f"\"",
16510 UTF8fARG(UTF, w, rangebegin));
16511 NOT_REACHED; /* NOTREACHED */
16515 prevvalue = value; /* save the beginning of the potential range */
16516 if (! stop_at_1 /* Can't be a range if parsing just one thing */
16517 && *RExC_parse == '-')
16519 char* next_char_ptr = RExC_parse + 1;
16521 /* Get the next real char after the '-' */
16522 SKIP_BRACKETED_WHITE_SPACE(skip_white, next_char_ptr);
16524 /* If the '-' is at the end of the class (just before the ']',
16525 * it is a literal minus; otherwise it is a range */
16526 if (next_char_ptr < RExC_end && *next_char_ptr != ']') {
16527 RExC_parse = next_char_ptr;
16529 /* a bad range like \w-, [:word:]- ? */
16530 if (namedclass > OOB_NAMEDCLASS) {
16531 if (strict || (PASS2 && ckWARN(WARN_REGEXP))) {
16532 const int w = RExC_parse >= rangebegin
16533 ? RExC_parse - rangebegin
16536 vFAIL4("False [] range \"%*.*s\"",
16541 "False [] range \"%*.*s\"",
16546 cp_list = add_cp_to_invlist(cp_list, '-');
16550 range = 1; /* yeah, it's a range! */
16551 continue; /* but do it the next time */
16556 if (namedclass > OOB_NAMEDCLASS) {
16560 /* Here, we have a single value this time through the loop, and
16561 * <prevvalue> is the beginning of the range, if any; or <value> if
16564 /* non-Latin1 code point implies unicode semantics. Must be set in
16565 * pass1 so is there for the whole of pass 2 */
16567 REQUIRE_UNI_RULES(flagp, NULL);
16570 /* Ready to process either the single value, or the completed range.
16571 * For single-valued non-inverted ranges, we consider the possibility
16572 * of multi-char folds. (We made a conscious decision to not do this
16573 * for the other cases because it can often lead to non-intuitive
16574 * results. For example, you have the peculiar case that:
16575 * "s s" =~ /^[^\xDF]+$/i => Y
16576 * "ss" =~ /^[^\xDF]+$/i => N
16578 * See [perl #89750] */
16579 if (FOLD && allow_multi_folds && value == prevvalue) {
16580 if (value == LATIN_SMALL_LETTER_SHARP_S
16581 || (value > 255 && _invlist_contains_cp(PL_HasMultiCharFold,
16584 /* Here <value> is indeed a multi-char fold. Get what it is */
16586 U8 foldbuf[UTF8_MAXBYTES_CASE];
16589 UV folded = _to_uni_fold_flags(
16593 FOLD_FLAGS_FULL | (ASCII_FOLD_RESTRICTED
16594 ? FOLD_FLAGS_NOMIX_ASCII
16598 /* Here, <folded> should be the first character of the
16599 * multi-char fold of <value>, with <foldbuf> containing the
16600 * whole thing. But, if this fold is not allowed (because of
16601 * the flags), <fold> will be the same as <value>, and should
16602 * be processed like any other character, so skip the special
16604 if (folded != value) {
16606 /* Skip if we are recursed, currently parsing the class
16607 * again. Otherwise add this character to the list of
16608 * multi-char folds. */
16609 if (! RExC_in_multi_char_class) {
16610 STRLEN cp_count = utf8_length(foldbuf,
16611 foldbuf + foldlen);
16612 SV* multi_fold = sv_2mortal(newSVpvs(""));
16614 Perl_sv_catpvf(aTHX_ multi_fold, "\\x{%"UVXf"}", value);
16617 = add_multi_match(multi_char_matches,
16623 /* This element should not be processed further in this
16626 value = save_value;
16627 prevvalue = save_prevvalue;
16633 if (strict && PASS2 && ckWARN(WARN_REGEXP)) {
16636 /* If the range starts above 255, everything is portable and
16637 * likely to be so for any forseeable character set, so don't
16639 if (unicode_range && non_portable_endpoint && prevvalue < 256) {
16640 vWARN(RExC_parse, "Both or neither range ends should be Unicode");
16642 else if (prevvalue != value) {
16644 /* Under strict, ranges that stop and/or end in an ASCII
16645 * printable should have each end point be a portable value
16646 * for it (preferably like 'A', but we don't warn if it is
16647 * a (portable) Unicode name or code point), and the range
16648 * must be be all digits or all letters of the same case.
16649 * Otherwise, the range is non-portable and unclear as to
16650 * what it contains */
16651 if ((isPRINT_A(prevvalue) || isPRINT_A(value))
16652 && (non_portable_endpoint
16653 || ! ((isDIGIT_A(prevvalue) && isDIGIT_A(value))
16654 || (isLOWER_A(prevvalue) && isLOWER_A(value))
16655 || (isUPPER_A(prevvalue) && isUPPER_A(value)))))
16657 vWARN(RExC_parse, "Ranges of ASCII printables should be some subset of \"0-9\", \"A-Z\", or \"a-z\"");
16659 else if (prevvalue >= 0x660) { /* ARABIC_INDIC_DIGIT_ZERO */
16661 /* But the nature of Unicode and languages mean we
16662 * can't do the same checks for above-ASCII ranges,
16663 * except in the case of digit ones. These should
16664 * contain only digits from the same group of 10. The
16665 * ASCII case is handled just above. 0x660 is the
16666 * first digit character beyond ASCII. Hence here, the
16667 * range could be a range of digits. Find out. */
16668 IV index_start = _invlist_search(PL_XPosix_ptrs[_CC_DIGIT],
16670 IV index_final = _invlist_search(PL_XPosix_ptrs[_CC_DIGIT],
16673 /* If the range start and final points are in the same
16674 * inversion list element, it means that either both
16675 * are not digits, or both are digits in a consecutive
16676 * sequence of digits. (So far, Unicode has kept all
16677 * such sequences as distinct groups of 10, but assert
16678 * to make sure). If the end points are not in the
16679 * same element, neither should be a digit. */
16680 if (index_start == index_final) {
16681 assert(! ELEMENT_RANGE_MATCHES_INVLIST(index_start)
16682 || (invlist_array(PL_XPosix_ptrs[_CC_DIGIT])[index_start+1]
16683 - invlist_array(PL_XPosix_ptrs[_CC_DIGIT])[index_start]
16685 /* But actually Unicode did have one group of 11
16686 * 'digits' in 5.2, so in case we are operating
16687 * on that version, let that pass */
16688 || (invlist_array(PL_XPosix_ptrs[_CC_DIGIT])[index_start+1]
16689 - invlist_array(PL_XPosix_ptrs[_CC_DIGIT])[index_start]
16691 && invlist_array(PL_XPosix_ptrs[_CC_DIGIT])[index_start]
16695 else if ((index_start >= 0
16696 && ELEMENT_RANGE_MATCHES_INVLIST(index_start))
16697 || (index_final >= 0
16698 && ELEMENT_RANGE_MATCHES_INVLIST(index_final)))
16700 vWARN(RExC_parse, "Ranges of digits should be from the same group of 10");
16705 if ((! range || prevvalue == value) && non_portable_endpoint) {
16706 if (isPRINT_A(value)) {
16709 if (isBACKSLASHED_PUNCT(value)) {
16710 literal[d++] = '\\';
16712 literal[d++] = (char) value;
16713 literal[d++] = '\0';
16716 "\"%.*s\" is more clearly written simply as \"%s\"",
16717 (int) (RExC_parse - rangebegin),
16722 else if isMNEMONIC_CNTRL(value) {
16724 "\"%.*s\" is more clearly written simply as \"%s\"",
16725 (int) (RExC_parse - rangebegin),
16727 cntrl_to_mnemonic((U8) value)
16733 /* Deal with this element of the class */
16737 cp_foldable_list = _add_range_to_invlist(cp_foldable_list,
16740 /* On non-ASCII platforms, for ranges that span all of 0..255, and
16741 * ones that don't require special handling, we can just add the
16742 * range like we do for ASCII platforms */
16743 if ((UNLIKELY(prevvalue == 0) && value >= 255)
16744 || ! (prevvalue < 256
16746 || (! non_portable_endpoint
16747 && ((isLOWER_A(prevvalue) && isLOWER_A(value))
16748 || (isUPPER_A(prevvalue)
16749 && isUPPER_A(value)))))))
16751 cp_foldable_list = _add_range_to_invlist(cp_foldable_list,
16755 /* Here, requires special handling. This can be because it is
16756 * a range whose code points are considered to be Unicode, and
16757 * so must be individually translated into native, or because
16758 * its a subrange of 'A-Z' or 'a-z' which each aren't
16759 * contiguous in EBCDIC, but we have defined them to include
16760 * only the "expected" upper or lower case ASCII alphabetics.
16761 * Subranges above 255 are the same in native and Unicode, so
16762 * can be added as a range */
16763 U8 start = NATIVE_TO_LATIN1(prevvalue);
16765 U8 end = (value < 256) ? NATIVE_TO_LATIN1(value) : 255;
16766 for (j = start; j <= end; j++) {
16767 cp_foldable_list = add_cp_to_invlist(cp_foldable_list, LATIN1_TO_NATIVE(j));
16770 cp_foldable_list = _add_range_to_invlist(cp_foldable_list,
16777 range = 0; /* this range (if it was one) is done now */
16778 } /* End of loop through all the text within the brackets */
16781 if ( posix_warnings && av_tindex_nomg(posix_warnings) >= 0) {
16782 output_or_return_posix_warnings(pRExC_state, posix_warnings,
16783 return_posix_warnings);
16786 /* If anything in the class expands to more than one character, we have to
16787 * deal with them by building up a substitute parse string, and recursively
16788 * calling reg() on it, instead of proceeding */
16789 if (multi_char_matches) {
16790 SV * substitute_parse = newSVpvn_flags("?:", 2, SVs_TEMP);
16793 char *save_end = RExC_end;
16794 char *save_parse = RExC_parse;
16795 char *save_start = RExC_start;
16796 STRLEN prefix_end = 0; /* We copy the character class after a
16797 prefix supplied here. This is the size
16798 + 1 of that prefix */
16799 bool first_time = TRUE; /* First multi-char occurrence doesn't get
16804 assert(RExC_precomp_adj == 0); /* Only one level of recursion allowed */
16806 #if 0 /* Have decided not to deal with multi-char folds in inverted classes,
16807 because too confusing */
16809 sv_catpv(substitute_parse, "(?:");
16813 /* Look at the longest folds first */
16814 for (cp_count = av_tindex_nomg(multi_char_matches);
16819 if (av_exists(multi_char_matches, cp_count)) {
16820 AV** this_array_ptr;
16823 this_array_ptr = (AV**) av_fetch(multi_char_matches,
16825 while ((this_sequence = av_pop(*this_array_ptr)) !=
16828 if (! first_time) {
16829 sv_catpv(substitute_parse, "|");
16831 first_time = FALSE;
16833 sv_catpv(substitute_parse, SvPVX(this_sequence));
16838 /* If the character class contains anything else besides these
16839 * multi-character folds, have to include it in recursive parsing */
16840 if (element_count) {
16841 sv_catpv(substitute_parse, "|[");
16842 prefix_end = SvCUR(substitute_parse);
16843 sv_catpvn(substitute_parse, orig_parse, RExC_parse - orig_parse);
16845 /* Put in a closing ']' only if not going off the end, as otherwise
16846 * we are adding something that really isn't there */
16847 if (RExC_parse < RExC_end) {
16848 sv_catpv(substitute_parse, "]");
16852 sv_catpv(substitute_parse, ")");
16855 /* This is a way to get the parse to skip forward a whole named
16856 * sequence instead of matching the 2nd character when it fails the
16858 sv_catpv(substitute_parse, "(*THEN)(*SKIP)(*FAIL)|.)");
16862 /* Set up the data structure so that any errors will be properly
16863 * reported. See the comments at the definition of
16864 * REPORT_LOCATION_ARGS for details */
16865 RExC_precomp_adj = orig_parse - RExC_precomp;
16866 RExC_start = RExC_parse = SvPV(substitute_parse, len);
16867 RExC_adjusted_start = RExC_start + prefix_end;
16868 RExC_end = RExC_parse + len;
16869 RExC_in_multi_char_class = 1;
16870 RExC_override_recoding = 1;
16871 RExC_emit = (regnode *)orig_emit;
16873 ret = reg(pRExC_state, 1, ®_flags, depth+1);
16875 *flagp |= reg_flags&(HASWIDTH|SIMPLE|SPSTART|POSTPONED|RESTART_PASS1|NEED_UTF8);
16877 /* And restore so can parse the rest of the pattern */
16878 RExC_parse = save_parse;
16879 RExC_start = RExC_adjusted_start = save_start;
16880 RExC_precomp_adj = 0;
16881 RExC_end = save_end;
16882 RExC_in_multi_char_class = 0;
16883 RExC_override_recoding = 0;
16884 SvREFCNT_dec_NN(multi_char_matches);
16888 /* Here, we've gone through the entire class and dealt with multi-char
16889 * folds. We are now in a position that we can do some checks to see if we
16890 * can optimize this ANYOF node into a simpler one, even in Pass 1.
16891 * Currently we only do two checks:
16892 * 1) is in the unlikely event that the user has specified both, eg. \w and
16893 * \W under /l, then the class matches everything. (This optimization
16894 * is done only to make the optimizer code run later work.)
16895 * 2) if the character class contains only a single element (including a
16896 * single range), we see if there is an equivalent node for it.
16897 * Other checks are possible */
16899 && ! ret_invlist /* Can't optimize if returning the constructed
16901 && (UNLIKELY(posixl_matches_all) || element_count == 1))
16906 if (UNLIKELY(posixl_matches_all)) {
16909 else if (namedclass > OOB_NAMEDCLASS) { /* this is a single named
16910 class, like \w or [:digit:]
16913 /* All named classes are mapped into POSIXish nodes, with its FLAG
16914 * argument giving which class it is */
16915 switch ((I32)namedclass) {
16916 case ANYOF_UNIPROP:
16919 /* These don't depend on the charset modifiers. They always
16920 * match under /u rules */
16921 case ANYOF_NHORIZWS:
16922 case ANYOF_HORIZWS:
16923 namedclass = ANYOF_BLANK + namedclass - ANYOF_HORIZWS;
16926 case ANYOF_NVERTWS:
16931 /* The actual POSIXish node for all the rest depends on the
16932 * charset modifier. The ones in the first set depend only on
16933 * ASCII or, if available on this platform, also locale */
16937 op = (LOC) ? POSIXL : POSIXA;
16943 /* The following don't have any matches in the upper Latin1
16944 * range, hence /d is equivalent to /u for them. Making it /u
16945 * saves some branches at runtime */
16949 case ANYOF_NXDIGIT:
16950 if (! DEPENDS_SEMANTICS) {
16951 goto treat_as_default;
16957 /* The following change to CASED under /i */
16963 namedclass = ANYOF_CASED + (namedclass % 2);
16967 /* The rest have more possibilities depending on the charset.
16968 * We take advantage of the enum ordering of the charset
16969 * modifiers to get the exact node type, */
16972 op = POSIXD + get_regex_charset(RExC_flags);
16973 if (op > POSIXA) { /* /aa is same as /a */
16978 /* The odd numbered ones are the complements of the
16979 * next-lower even number one */
16980 if (namedclass % 2 == 1) {
16984 arg = namedclass_to_classnum(namedclass);
16988 else if (value == prevvalue) {
16990 /* Here, the class consists of just a single code point */
16993 if (! LOC && value == '\n') {
16994 op = REG_ANY; /* Optimize [^\n] */
16995 *flagp |= HASWIDTH|SIMPLE;
16999 else if (value < 256 || UTF) {
17001 /* Optimize a single value into an EXACTish node, but not if it
17002 * would require converting the pattern to UTF-8. */
17003 op = compute_EXACTish(pRExC_state);
17005 } /* Otherwise is a range */
17006 else if (! LOC) { /* locale could vary these */
17007 if (prevvalue == '0') {
17008 if (value == '9') {
17013 else if (! FOLD || ASCII_FOLD_RESTRICTED) {
17014 /* We can optimize A-Z or a-z, but not if they could match
17015 * something like the KELVIN SIGN under /i. */
17016 if (prevvalue == 'A') {
17019 && ! non_portable_endpoint
17022 arg = (FOLD) ? _CC_ALPHA : _CC_UPPER;
17026 else if (prevvalue == 'a') {
17029 && ! non_portable_endpoint
17032 arg = (FOLD) ? _CC_ALPHA : _CC_LOWER;
17039 /* Here, we have changed <op> away from its initial value iff we found
17040 * an optimization */
17043 /* Throw away this ANYOF regnode, and emit the calculated one,
17044 * which should correspond to the beginning, not current, state of
17046 const char * cur_parse = RExC_parse;
17047 RExC_parse = (char *)orig_parse;
17051 /* To get locale nodes to not use the full ANYOF size would
17052 * require moving the code above that writes the portions
17053 * of it that aren't in other nodes to after this point.
17054 * e.g. ANYOF_POSIXL_SET */
17055 RExC_size = orig_size;
17059 RExC_emit = (regnode *)orig_emit;
17060 if (PL_regkind[op] == POSIXD) {
17061 if (op == POSIXL) {
17062 RExC_contains_locale = 1;
17065 op += NPOSIXD - POSIXD;
17070 ret = reg_node(pRExC_state, op);
17072 if (PL_regkind[op] == POSIXD || PL_regkind[op] == NPOSIXD) {
17076 *flagp |= HASWIDTH|SIMPLE;
17078 else if (PL_regkind[op] == EXACT) {
17079 alloc_maybe_populate_EXACT(pRExC_state, ret, flagp, 0, value,
17080 TRUE /* downgradable to EXACT */
17084 RExC_parse = (char *) cur_parse;
17086 SvREFCNT_dec(posixes);
17087 SvREFCNT_dec(nposixes);
17088 SvREFCNT_dec(simple_posixes);
17089 SvREFCNT_dec(cp_list);
17090 SvREFCNT_dec(cp_foldable_list);
17097 /****** !SIZE_ONLY (Pass 2) AFTER HERE *********/
17099 /* If folding, we calculate all characters that could fold to or from the
17100 * ones already on the list */
17101 if (cp_foldable_list) {
17103 UV start, end; /* End points of code point ranges */
17105 SV* fold_intersection = NULL;
17108 /* Our calculated list will be for Unicode rules. For locale
17109 * matching, we have to keep a separate list that is consulted at
17110 * runtime only when the locale indicates Unicode rules. For
17111 * non-locale, we just use the general list */
17113 use_list = &only_utf8_locale_list;
17116 use_list = &cp_list;
17119 /* Only the characters in this class that participate in folds need
17120 * be checked. Get the intersection of this class and all the
17121 * possible characters that are foldable. This can quickly narrow
17122 * down a large class */
17123 _invlist_intersection(PL_utf8_foldable, cp_foldable_list,
17124 &fold_intersection);
17126 /* The folds for all the Latin1 characters are hard-coded into this
17127 * program, but we have to go out to disk to get the others. */
17128 if (invlist_highest(cp_foldable_list) >= 256) {
17130 /* This is a hash that for a particular fold gives all
17131 * characters that are involved in it */
17132 if (! PL_utf8_foldclosures) {
17133 _load_PL_utf8_foldclosures();
17137 /* Now look at the foldable characters in this class individually */
17138 invlist_iterinit(fold_intersection);
17139 while (invlist_iternext(fold_intersection, &start, &end)) {
17142 /* Look at every character in the range */
17143 for (j = start; j <= end; j++) {
17144 U8 foldbuf[UTF8_MAXBYTES_CASE+1];
17150 if (IS_IN_SOME_FOLD_L1(j)) {
17152 /* ASCII is always matched; non-ASCII is matched
17153 * only under Unicode rules (which could happen
17154 * under /l if the locale is a UTF-8 one */
17155 if (isASCII(j) || ! DEPENDS_SEMANTICS) {
17156 *use_list = add_cp_to_invlist(*use_list,
17157 PL_fold_latin1[j]);
17160 has_upper_latin1_only_utf8_matches
17161 = add_cp_to_invlist(
17162 has_upper_latin1_only_utf8_matches,
17163 PL_fold_latin1[j]);
17167 if (HAS_NONLATIN1_SIMPLE_FOLD_CLOSURE(j)
17168 && (! isASCII(j) || ! ASCII_FOLD_RESTRICTED))
17170 add_above_Latin1_folds(pRExC_state,
17177 /* Here is an above Latin1 character. We don't have the
17178 * rules hard-coded for it. First, get its fold. This is
17179 * the simple fold, as the multi-character folds have been
17180 * handled earlier and separated out */
17181 _to_uni_fold_flags(j, foldbuf, &foldlen,
17182 (ASCII_FOLD_RESTRICTED)
17183 ? FOLD_FLAGS_NOMIX_ASCII
17186 /* Single character fold of above Latin1. Add everything in
17187 * its fold closure to the list that this node should match.
17188 * The fold closures data structure is a hash with the keys
17189 * being the UTF-8 of every character that is folded to, like
17190 * 'k', and the values each an array of all code points that
17191 * fold to its key. e.g. [ 'k', 'K', KELVIN_SIGN ].
17192 * Multi-character folds are not included */
17193 if ((listp = hv_fetch(PL_utf8_foldclosures,
17194 (char *) foldbuf, foldlen, FALSE)))
17196 AV* list = (AV*) *listp;
17198 for (k = 0; k <= av_tindex_nomg(list); k++) {
17199 SV** c_p = av_fetch(list, k, FALSE);
17205 /* /aa doesn't allow folds between ASCII and non- */
17206 if ((ASCII_FOLD_RESTRICTED
17207 && (isASCII(c) != isASCII(j))))
17212 /* Folds under /l which cross the 255/256 boundary
17213 * are added to a separate list. (These are valid
17214 * only when the locale is UTF-8.) */
17215 if (c < 256 && LOC) {
17216 *use_list = add_cp_to_invlist(*use_list, c);
17220 if (isASCII(c) || c > 255 || AT_LEAST_UNI_SEMANTICS)
17222 cp_list = add_cp_to_invlist(cp_list, c);
17225 /* Similarly folds involving non-ascii Latin1
17226 * characters under /d are added to their list */
17227 has_upper_latin1_only_utf8_matches
17228 = add_cp_to_invlist(
17229 has_upper_latin1_only_utf8_matches,
17236 SvREFCNT_dec_NN(fold_intersection);
17239 /* Now that we have finished adding all the folds, there is no reason
17240 * to keep the foldable list separate */
17241 _invlist_union(cp_list, cp_foldable_list, &cp_list);
17242 SvREFCNT_dec_NN(cp_foldable_list);
17245 /* And combine the result (if any) with any inversion list from posix
17246 * classes. The lists are kept separate up to now because we don't want to
17247 * fold the classes (folding of those is automatically handled by the swash
17248 * fetching code) */
17249 if (simple_posixes) {
17250 _invlist_union(cp_list, simple_posixes, &cp_list);
17251 SvREFCNT_dec_NN(simple_posixes);
17253 if (posixes || nposixes) {
17254 if (posixes && AT_LEAST_ASCII_RESTRICTED) {
17255 /* Under /a and /aa, nothing above ASCII matches these */
17256 _invlist_intersection(posixes,
17257 PL_XPosix_ptrs[_CC_ASCII],
17261 if (DEPENDS_SEMANTICS) {
17262 /* Under /d, everything in the upper half of the Latin1 range
17263 * matches these complements */
17264 ANYOF_FLAGS(ret) |= ANYOF_SHARED_d_MATCHES_ALL_NON_UTF8_NON_ASCII_non_d_WARN_SUPER;
17266 else if (AT_LEAST_ASCII_RESTRICTED) {
17267 /* Under /a and /aa, everything above ASCII matches these
17269 _invlist_union_complement_2nd(nposixes,
17270 PL_XPosix_ptrs[_CC_ASCII],
17274 _invlist_union(posixes, nposixes, &posixes);
17275 SvREFCNT_dec_NN(nposixes);
17278 posixes = nposixes;
17281 if (! DEPENDS_SEMANTICS) {
17283 _invlist_union(cp_list, posixes, &cp_list);
17284 SvREFCNT_dec_NN(posixes);
17291 /* Under /d, we put into a separate list the Latin1 things that
17292 * match only when the target string is utf8 */
17293 SV* nonascii_but_latin1_properties = NULL;
17294 _invlist_intersection(posixes, PL_UpperLatin1,
17295 &nonascii_but_latin1_properties);
17296 _invlist_subtract(posixes, nonascii_but_latin1_properties,
17299 _invlist_union(cp_list, posixes, &cp_list);
17300 SvREFCNT_dec_NN(posixes);
17306 if (has_upper_latin1_only_utf8_matches) {
17307 _invlist_union(has_upper_latin1_only_utf8_matches,
17308 nonascii_but_latin1_properties,
17309 &has_upper_latin1_only_utf8_matches);
17310 SvREFCNT_dec_NN(nonascii_but_latin1_properties);
17313 has_upper_latin1_only_utf8_matches
17314 = nonascii_but_latin1_properties;
17319 /* And combine the result (if any) with any inversion list from properties.
17320 * The lists are kept separate up to now so that we can distinguish the two
17321 * in regards to matching above-Unicode. A run-time warning is generated
17322 * if a Unicode property is matched against a non-Unicode code point. But,
17323 * we allow user-defined properties to match anything, without any warning,
17324 * and we also suppress the warning if there is a portion of the character
17325 * class that isn't a Unicode property, and which matches above Unicode, \W
17326 * or [\x{110000}] for example.
17327 * (Note that in this case, unlike the Posix one above, there is no
17328 * <has_upper_latin1_only_utf8_matches>, because having a Unicode property
17329 * forces Unicode semantics */
17333 /* If it matters to the final outcome, see if a non-property
17334 * component of the class matches above Unicode. If so, the
17335 * warning gets suppressed. This is true even if just a single
17336 * such code point is specified, as, though not strictly correct if
17337 * another such code point is matched against, the fact that they
17338 * are using above-Unicode code points indicates they should know
17339 * the issues involved */
17341 warn_super = ! (invert
17342 ^ (invlist_highest(cp_list) > PERL_UNICODE_MAX));
17345 _invlist_union(properties, cp_list, &cp_list);
17346 SvREFCNT_dec_NN(properties);
17349 cp_list = properties;
17354 |= ANYOF_SHARED_d_MATCHES_ALL_NON_UTF8_NON_ASCII_non_d_WARN_SUPER;
17356 /* Because an ANYOF node is the only one that warns, this node
17357 * can't be optimized into something else */
17358 optimizable = FALSE;
17362 /* Here, we have calculated what code points should be in the character
17365 * Now we can see about various optimizations. Fold calculation (which we
17366 * did above) needs to take place before inversion. Otherwise /[^k]/i
17367 * would invert to include K, which under /i would match k, which it
17368 * shouldn't. Therefore we can't invert folded locale now, as it won't be
17369 * folded until runtime */
17371 /* If we didn't do folding, it's because some information isn't available
17372 * until runtime; set the run-time fold flag for these. (We don't have to
17373 * worry about properties folding, as that is taken care of by the swash
17374 * fetching). We know to set the flag if we have a non-NULL list for UTF-8
17375 * locales, or the class matches at least one 0-255 range code point */
17378 /* Some things on the list might be unconditionally included because of
17379 * other components. Remove them, and clean up the list if it goes to
17381 if (only_utf8_locale_list && cp_list) {
17382 _invlist_subtract(only_utf8_locale_list, cp_list,
17383 &only_utf8_locale_list);
17385 if (_invlist_len(only_utf8_locale_list) == 0) {
17386 SvREFCNT_dec_NN(only_utf8_locale_list);
17387 only_utf8_locale_list = NULL;
17390 if (only_utf8_locale_list) {
17393 |ANYOFL_SHARED_UTF8_LOCALE_fold_HAS_MATCHES_nonfold_REQD;
17395 else if (cp_list) { /* Look to see if a 0-255 code point is in list */
17397 invlist_iterinit(cp_list);
17398 if (invlist_iternext(cp_list, &start, &end) && start < 256) {
17399 ANYOF_FLAGS(ret) |= ANYOFL_FOLD;
17401 invlist_iterfinish(cp_list);
17405 #define MATCHES_ALL_NON_UTF8_NON_ASCII(ret) \
17406 ( DEPENDS_SEMANTICS \
17407 && (ANYOF_FLAGS(ret) \
17408 & ANYOF_SHARED_d_MATCHES_ALL_NON_UTF8_NON_ASCII_non_d_WARN_SUPER))
17410 /* See if we can simplify things under /d */
17411 if ( has_upper_latin1_only_utf8_matches
17412 || MATCHES_ALL_NON_UTF8_NON_ASCII(ret))
17414 /* But not if we are inverting, as that screws it up */
17416 if (has_upper_latin1_only_utf8_matches) {
17417 if (MATCHES_ALL_NON_UTF8_NON_ASCII(ret)) {
17419 /* Here, we have both the flag and inversion list. Any
17420 * character in 'has_upper_latin1_only_utf8_matches'
17421 * matches when UTF-8 is in effect, but it also matches
17422 * when UTF-8 is not in effect because of
17423 * MATCHES_ALL_NON_UTF8_NON_ASCII. Therefore it matches
17424 * unconditionally, so can be added to the regular list,
17425 * and 'has_upper_latin1_only_utf8_matches' cleared */
17426 _invlist_union(cp_list,
17427 has_upper_latin1_only_utf8_matches,
17429 SvREFCNT_dec_NN(has_upper_latin1_only_utf8_matches);
17430 has_upper_latin1_only_utf8_matches = NULL;
17432 else if (cp_list) {
17434 /* Here, 'cp_list' gives chars that always match, and
17435 * 'has_upper_latin1_only_utf8_matches' gives chars that
17436 * were specified to match only if the target string is in
17437 * UTF-8. It may be that these overlap, so we can subtract
17438 * the unconditionally matching from the conditional ones,
17439 * to make the conditional list as small as possible,
17440 * perhaps even clearing it, in which case more
17441 * optimizations are possible later */
17442 _invlist_subtract(has_upper_latin1_only_utf8_matches,
17444 &has_upper_latin1_only_utf8_matches);
17445 if (_invlist_len(has_upper_latin1_only_utf8_matches) == 0) {
17446 SvREFCNT_dec_NN(has_upper_latin1_only_utf8_matches);
17447 has_upper_latin1_only_utf8_matches = NULL;
17452 /* Similarly, if the unconditional matches include every upper
17453 * latin1 character, we can clear that flag to permit later
17455 if (cp_list && MATCHES_ALL_NON_UTF8_NON_ASCII(ret)) {
17456 SV* only_non_utf8_list = invlist_clone(PL_UpperLatin1);
17457 _invlist_subtract(only_non_utf8_list, cp_list,
17458 &only_non_utf8_list);
17459 if (_invlist_len(only_non_utf8_list) == 0) {
17460 ANYOF_FLAGS(ret) &= ~ANYOF_SHARED_d_MATCHES_ALL_NON_UTF8_NON_ASCII_non_d_WARN_SUPER;
17462 SvREFCNT_dec_NN(only_non_utf8_list);
17463 only_non_utf8_list = NULL;;
17467 /* If we haven't gotten rid of all conditional matching, we change the
17468 * regnode type to indicate that */
17469 if ( has_upper_latin1_only_utf8_matches
17470 || MATCHES_ALL_NON_UTF8_NON_ASCII(ret))
17473 optimizable = FALSE;
17476 #undef MATCHES_ALL_NON_UTF8_NON_ASCII
17478 /* Optimize inverted simple patterns (e.g. [^a-z]) when everything is known
17479 * at compile time. Besides not inverting folded locale now, we can't
17480 * invert if there are things such as \w, which aren't known until runtime
17484 && OP(ret) != ANYOFD
17485 && ! (ANYOF_FLAGS(ret) & (ANYOF_LOCALE_FLAGS))
17486 && ! HAS_NONLOCALE_RUNTIME_PROPERTY_DEFINITION)
17488 _invlist_invert(cp_list);
17490 /* Any swash can't be used as-is, because we've inverted things */
17492 SvREFCNT_dec_NN(swash);
17496 /* Clear the invert flag since have just done it here */
17503 *ret_invlist = cp_list;
17504 SvREFCNT_dec(swash);
17506 /* Discard the generated node */
17508 RExC_size = orig_size;
17511 RExC_emit = orig_emit;
17516 /* Some character classes are equivalent to other nodes. Such nodes take
17517 * up less room and generally fewer operations to execute than ANYOF nodes.
17518 * Above, we checked for and optimized into some such equivalents for
17519 * certain common classes that are easy to test. Getting to this point in
17520 * the code means that the class didn't get optimized there. Since this
17521 * code is only executed in Pass 2, it is too late to save space--it has
17522 * been allocated in Pass 1, and currently isn't given back. But turning
17523 * things into an EXACTish node can allow the optimizer to join it to any
17524 * adjacent such nodes. And if the class is equivalent to things like /./,
17525 * expensive run-time swashes can be avoided. Now that we have more
17526 * complete information, we can find things necessarily missed by the
17527 * earlier code. Another possible "optimization" that isn't done is that
17528 * something like [Ee] could be changed into an EXACTFU. khw tried this
17529 * and found that the ANYOF is faster, including for code points not in the
17530 * bitmap. This still might make sense to do, provided it got joined with
17531 * an adjacent node(s) to create a longer EXACTFU one. This could be
17532 * accomplished by creating a pseudo ANYOF_EXACTFU node type that the join
17533 * routine would know is joinable. If that didn't happen, the node type
17534 * could then be made a straight ANYOF */
17536 if (optimizable && cp_list && ! invert) {
17538 U8 op = END; /* The optimzation node-type */
17539 int posix_class = -1; /* Illegal value */
17540 const char * cur_parse= RExC_parse;
17542 invlist_iterinit(cp_list);
17543 if (! invlist_iternext(cp_list, &start, &end)) {
17545 /* Here, the list is empty. This happens, for example, when a
17546 * Unicode property that doesn't match anything is the only element
17547 * in the character class (perluniprops.pod notes such properties).
17550 *flagp |= HASWIDTH|SIMPLE;
17552 else if (start == end) { /* The range is a single code point */
17553 if (! invlist_iternext(cp_list, &start, &end)
17555 /* Don't do this optimization if it would require changing
17556 * the pattern to UTF-8 */
17557 && (start < 256 || UTF))
17559 /* Here, the list contains a single code point. Can optimize
17560 * into an EXACTish node */
17571 /* A locale node under folding with one code point can be
17572 * an EXACTFL, as its fold won't be calculated until
17578 /* Here, we are generally folding, but there is only one
17579 * code point to match. If we have to, we use an EXACT
17580 * node, but it would be better for joining with adjacent
17581 * nodes in the optimization pass if we used the same
17582 * EXACTFish node that any such are likely to be. We can
17583 * do this iff the code point doesn't participate in any
17584 * folds. For example, an EXACTF of a colon is the same as
17585 * an EXACT one, since nothing folds to or from a colon. */
17587 if (IS_IN_SOME_FOLD_L1(value)) {
17592 if (_invlist_contains_cp(PL_utf8_foldable, value)) {
17597 /* If we haven't found the node type, above, it means we
17598 * can use the prevailing one */
17600 op = compute_EXACTish(pRExC_state);
17604 } /* End of first range contains just a single code point */
17605 else if (start == 0) {
17606 if (end == UV_MAX) {
17608 *flagp |= HASWIDTH|SIMPLE;
17611 else if (end == '\n' - 1
17612 && invlist_iternext(cp_list, &start, &end)
17613 && start == '\n' + 1 && end == UV_MAX)
17616 *flagp |= HASWIDTH|SIMPLE;
17620 invlist_iterfinish(cp_list);
17623 const UV cp_list_len = _invlist_len(cp_list);
17624 const UV* cp_list_array = invlist_array(cp_list);
17626 /* Here, didn't find an optimization. See if this matches any of
17627 * the POSIX classes. These run slightly faster for above-Unicode
17628 * code points, so don't bother with POSIXA ones nor the 2 that
17629 * have no above-Unicode matches. We can avoid these checks unless
17630 * the ANYOF matches at least as high as the lowest POSIX one
17631 * (which was manually found to be \v. The actual code point may
17632 * increase in later Unicode releases, if a higher code point is
17633 * assigned to be \v, but this code will never break. It would
17634 * just mean we could execute the checks for posix optimizations
17635 * unnecessarily) */
17637 if (cp_list_array[cp_list_len-1] > 0x2029) {
17638 for (posix_class = 0;
17639 posix_class <= _HIGHEST_REGCOMP_DOT_H_SYNC;
17643 if (posix_class == _CC_ASCII || posix_class == _CC_CNTRL) {
17646 for (try_inverted = 0; try_inverted < 2; try_inverted++) {
17648 /* Check if matches normal or inverted */
17649 if (_invlistEQ(cp_list,
17650 PL_XPosix_ptrs[posix_class],
17653 op = (try_inverted)
17656 *flagp |= HASWIDTH|SIMPLE;
17666 RExC_parse = (char *)orig_parse;
17667 RExC_emit = (regnode *)orig_emit;
17669 if (regarglen[op]) {
17670 ret = reganode(pRExC_state, op, 0);
17672 ret = reg_node(pRExC_state, op);
17675 RExC_parse = (char *)cur_parse;
17677 if (PL_regkind[op] == EXACT) {
17678 alloc_maybe_populate_EXACT(pRExC_state, ret, flagp, 0, value,
17679 TRUE /* downgradable to EXACT */
17682 else if (PL_regkind[op] == POSIXD || PL_regkind[op] == NPOSIXD) {
17683 FLAGS(ret) = posix_class;
17686 SvREFCNT_dec_NN(cp_list);
17691 /* Here, <cp_list> contains all the code points we can determine at
17692 * compile time that match under all conditions. Go through it, and
17693 * for things that belong in the bitmap, put them there, and delete from
17694 * <cp_list>. While we are at it, see if everything above 255 is in the
17695 * list, and if so, set a flag to speed up execution */
17697 populate_ANYOF_from_invlist(ret, &cp_list);
17700 ANYOF_FLAGS(ret) |= ANYOF_INVERT;
17703 /* Here, the bitmap has been populated with all the Latin1 code points that
17704 * always match. Can now add to the overall list those that match only
17705 * when the target string is UTF-8 (<has_upper_latin1_only_utf8_matches>).
17707 if (has_upper_latin1_only_utf8_matches) {
17709 _invlist_union(cp_list,
17710 has_upper_latin1_only_utf8_matches,
17712 SvREFCNT_dec_NN(has_upper_latin1_only_utf8_matches);
17715 cp_list = has_upper_latin1_only_utf8_matches;
17717 ANYOF_FLAGS(ret) |= ANYOF_SHARED_d_UPPER_LATIN1_UTF8_STRING_MATCHES_non_d_RUNTIME_USER_PROP;
17720 /* If there is a swash and more than one element, we can't use the swash in
17721 * the optimization below. */
17722 if (swash && element_count > 1) {
17723 SvREFCNT_dec_NN(swash);
17727 /* Note that the optimization of using 'swash' if it is the only thing in
17728 * the class doesn't have us change swash at all, so it can include things
17729 * that are also in the bitmap; otherwise we have purposely deleted that
17730 * duplicate information */
17731 set_ANYOF_arg(pRExC_state, ret, cp_list,
17732 (HAS_NONLOCALE_RUNTIME_PROPERTY_DEFINITION)
17734 only_utf8_locale_list,
17735 swash, has_user_defined_property);
17737 *flagp |= HASWIDTH|SIMPLE;
17739 if (ANYOF_FLAGS(ret) & ANYOF_LOCALE_FLAGS) {
17740 RExC_contains_locale = 1;
17746 #undef HAS_NONLOCALE_RUNTIME_PROPERTY_DEFINITION
17749 S_set_ANYOF_arg(pTHX_ RExC_state_t* const pRExC_state,
17750 regnode* const node,
17752 SV* const runtime_defns,
17753 SV* const only_utf8_locale_list,
17755 const bool has_user_defined_property)
17757 /* Sets the arg field of an ANYOF-type node 'node', using information about
17758 * the node passed-in. If there is nothing outside the node's bitmap, the
17759 * arg is set to ANYOF_ONLY_HAS_BITMAP. Otherwise, it sets the argument to
17760 * the count returned by add_data(), having allocated and stored an array,
17761 * av, that that count references, as follows:
17762 * av[0] stores the character class description in its textual form.
17763 * This is used later (regexec.c:Perl_regclass_swash()) to
17764 * initialize the appropriate swash, and is also useful for dumping
17765 * the regnode. This is set to &PL_sv_undef if the textual
17766 * description is not needed at run-time (as happens if the other
17767 * elements completely define the class)
17768 * av[1] if &PL_sv_undef, is a placeholder to later contain the swash
17769 * computed from av[0]. But if no further computation need be done,
17770 * the swash is stored here now (and av[0] is &PL_sv_undef).
17771 * av[2] stores the inversion list of code points that match only if the
17772 * current locale is UTF-8
17773 * av[3] stores the cp_list inversion list for use in addition or instead
17774 * of av[0]; used only if cp_list exists and av[1] is &PL_sv_undef.
17775 * (Otherwise everything needed is already in av[0] and av[1])
17776 * av[4] is set if any component of the class is from a user-defined
17777 * property; used only if av[3] exists */
17781 PERL_ARGS_ASSERT_SET_ANYOF_ARG;
17783 if (! cp_list && ! runtime_defns && ! only_utf8_locale_list) {
17784 assert(! (ANYOF_FLAGS(node)
17785 & ANYOF_SHARED_d_UPPER_LATIN1_UTF8_STRING_MATCHES_non_d_RUNTIME_USER_PROP));
17786 ARG_SET(node, ANYOF_ONLY_HAS_BITMAP);
17789 AV * const av = newAV();
17792 av_store(av, 0, (runtime_defns)
17793 ? SvREFCNT_inc(runtime_defns) : &PL_sv_undef);
17796 av_store(av, 1, swash);
17797 SvREFCNT_dec_NN(cp_list);
17800 av_store(av, 1, &PL_sv_undef);
17802 av_store(av, 3, cp_list);
17803 av_store(av, 4, newSVuv(has_user_defined_property));
17807 if (only_utf8_locale_list) {
17808 av_store(av, 2, only_utf8_locale_list);
17811 av_store(av, 2, &PL_sv_undef);
17814 rv = newRV_noinc(MUTABLE_SV(av));
17815 n = add_data(pRExC_state, STR_WITH_LEN("s"));
17816 RExC_rxi->data->data[n] = (void*)rv;
17821 #if !defined(PERL_IN_XSUB_RE) || defined(PLUGGABLE_RE_EXTENSION)
17823 Perl__get_regclass_nonbitmap_data(pTHX_ const regexp *prog,
17824 const regnode* node,
17827 SV** only_utf8_locale_ptr,
17828 SV** output_invlist)
17831 /* For internal core use only.
17832 * Returns the swash for the input 'node' in the regex 'prog'.
17833 * If <doinit> is 'true', will attempt to create the swash if not already
17835 * If <listsvp> is non-null, will return the printable contents of the
17836 * swash. This can be used to get debugging information even before the
17837 * swash exists, by calling this function with 'doinit' set to false, in
17838 * which case the components that will be used to eventually create the
17839 * swash are returned (in a printable form).
17840 * If <only_utf8_locale_ptr> is not NULL, it is where this routine is to
17841 * store an inversion list of code points that should match only if the
17842 * execution-time locale is a UTF-8 one.
17843 * If <output_invlist> is not NULL, it is where this routine is to store an
17844 * inversion list of the code points that would be instead returned in
17845 * <listsvp> if this were NULL. Thus, what gets output in <listsvp>
17846 * when this parameter is used, is just the non-code point data that
17847 * will go into creating the swash. This currently should be just
17848 * user-defined properties whose definitions were not known at compile
17849 * time. Using this parameter allows for easier manipulation of the
17850 * swash's data by the caller. It is illegal to call this function with
17851 * this parameter set, but not <listsvp>
17853 * Tied intimately to how S_set_ANYOF_arg sets up the data structure. Note
17854 * that, in spite of this function's name, the swash it returns may include
17855 * the bitmap data as well */
17858 SV *si = NULL; /* Input swash initialization string */
17859 SV* invlist = NULL;
17861 RXi_GET_DECL(prog,progi);
17862 const struct reg_data * const data = prog ? progi->data : NULL;
17864 PERL_ARGS_ASSERT__GET_REGCLASS_NONBITMAP_DATA;
17865 assert(! output_invlist || listsvp);
17867 if (data && data->count) {
17868 const U32 n = ARG(node);
17870 if (data->what[n] == 's') {
17871 SV * const rv = MUTABLE_SV(data->data[n]);
17872 AV * const av = MUTABLE_AV(SvRV(rv));
17873 SV **const ary = AvARRAY(av);
17874 U8 swash_init_flags = _CORE_SWASH_INIT_ACCEPT_INVLIST;
17876 si = *ary; /* ary[0] = the string to initialize the swash with */
17878 if (av_tindex_nomg(av) >= 2) {
17879 if (only_utf8_locale_ptr
17881 && ary[2] != &PL_sv_undef)
17883 *only_utf8_locale_ptr = ary[2];
17886 assert(only_utf8_locale_ptr);
17887 *only_utf8_locale_ptr = NULL;
17890 /* Elements 3 and 4 are either both present or both absent. [3]
17891 * is any inversion list generated at compile time; [4]
17892 * indicates if that inversion list has any user-defined
17893 * properties in it. */
17894 if (av_tindex_nomg(av) >= 3) {
17896 if (SvUV(ary[4])) {
17897 swash_init_flags |= _CORE_SWASH_INIT_USER_DEFINED_PROPERTY;
17905 /* Element [1] is reserved for the set-up swash. If already there,
17906 * return it; if not, create it and store it there */
17907 if (ary[1] && SvROK(ary[1])) {
17910 else if (doinit && ((si && si != &PL_sv_undef)
17911 || (invlist && invlist != &PL_sv_undef))) {
17913 sw = _core_swash_init("utf8", /* the utf8 package */
17917 0, /* not from tr/// */
17919 &swash_init_flags);
17920 (void)av_store(av, 1, sw);
17925 /* If requested, return a printable version of what this swash matches */
17927 SV* matches_string = NULL;
17929 /* The swash should be used, if possible, to get the data, as it
17930 * contains the resolved data. But this function can be called at
17931 * compile-time, before everything gets resolved, in which case we
17932 * return the currently best available information, which is the string
17933 * that will eventually be used to do that resolving, 'si' */
17934 if ((! sw || (invlist = _get_swash_invlist(sw)) == NULL)
17935 && (si && si != &PL_sv_undef))
17937 /* Here, we only have 'si' (and possibly some passed-in data in
17938 * 'invlist', which is handled below) If the caller only wants
17939 * 'si', use that. */
17940 if (! output_invlist) {
17941 matches_string = newSVsv(si);
17944 /* But if the caller wants an inversion list of the node, we
17945 * need to parse 'si' and place as much as possible in the
17946 * desired output inversion list, making 'matches_string' only
17947 * contain the currently unresolvable things */
17948 const char *si_string = SvPVX(si);
17949 STRLEN remaining = SvCUR(si);
17953 /* Ignore everything before the first new-line */
17954 while (*si_string != '\n' && remaining > 0) {
17958 assert(remaining > 0);
17963 while (remaining > 0) {
17965 /* The data consists of just strings defining user-defined
17966 * property names, but in prior incarnations, and perhaps
17967 * somehow from pluggable regex engines, it could still
17968 * hold hex code point definitions. Each component of a
17969 * range would be separated by a tab, and each range by a
17970 * new-line. If these are found, instead add them to the
17971 * inversion list */
17972 I32 grok_flags = PERL_SCAN_SILENT_ILLDIGIT
17973 |PERL_SCAN_SILENT_NON_PORTABLE;
17974 STRLEN len = remaining;
17975 UV cp = grok_hex(si_string, &len, &grok_flags, NULL);
17977 /* If the hex decode routine found something, it should go
17978 * up to the next \n */
17979 if ( *(si_string + len) == '\n') {
17980 if (count) { /* 2nd code point on line */
17981 *output_invlist = _add_range_to_invlist(*output_invlist, prev_cp, cp);
17984 *output_invlist = add_cp_to_invlist(*output_invlist, cp);
17987 goto prepare_for_next_iteration;
17990 /* If the hex decode was instead for the lower range limit,
17991 * save it, and go parse the upper range limit */
17992 if (*(si_string + len) == '\t') {
17993 assert(count == 0);
17997 prepare_for_next_iteration:
17998 si_string += len + 1;
17999 remaining -= len + 1;
18003 /* Here, didn't find a legal hex number. Just add it from
18004 * here to the next \n */
18007 while (*(si_string + len) != '\n' && remaining > 0) {
18011 if (*(si_string + len) == '\n') {
18015 if (matches_string) {
18016 sv_catpvn(matches_string, si_string, len - 1);
18019 matches_string = newSVpvn(si_string, len - 1);
18022 sv_catpvs(matches_string, " ");
18023 } /* end of loop through the text */
18025 assert(matches_string);
18026 if (SvCUR(matches_string)) { /* Get rid of trailing blank */
18027 SvCUR_set(matches_string, SvCUR(matches_string) - 1);
18029 } /* end of has an 'si' but no swash */
18032 /* If we have a swash in place, its equivalent inversion list was above
18033 * placed into 'invlist'. If not, this variable may contain a stored
18034 * inversion list which is information beyond what is in 'si' */
18037 /* Again, if the caller doesn't want the output inversion list, put
18038 * everything in 'matches-string' */
18039 if (! output_invlist) {
18040 if ( ! matches_string) {
18041 matches_string = newSVpvs("\n");
18043 sv_catsv(matches_string, invlist_contents(invlist,
18044 TRUE /* traditional style */
18047 else if (! *output_invlist) {
18048 *output_invlist = invlist_clone(invlist);
18051 _invlist_union(*output_invlist, invlist, output_invlist);
18055 *listsvp = matches_string;
18060 #endif /* !defined(PERL_IN_XSUB_RE) || defined(PLUGGABLE_RE_EXTENSION) */
18062 /* reg_skipcomment()
18064 Absorbs an /x style # comment from the input stream,
18065 returning a pointer to the first character beyond the comment, or if the
18066 comment terminates the pattern without anything following it, this returns
18067 one past the final character of the pattern (in other words, RExC_end) and
18068 sets the REG_RUN_ON_COMMENT_SEEN flag.
18070 Note it's the callers responsibility to ensure that we are
18071 actually in /x mode
18075 PERL_STATIC_INLINE char*
18076 S_reg_skipcomment(RExC_state_t *pRExC_state, char* p)
18078 PERL_ARGS_ASSERT_REG_SKIPCOMMENT;
18082 while (p < RExC_end) {
18083 if (*(++p) == '\n') {
18088 /* we ran off the end of the pattern without ending the comment, so we have
18089 * to add an \n when wrapping */
18090 RExC_seen |= REG_RUN_ON_COMMENT_SEEN;
18095 S_skip_to_be_ignored_text(pTHX_ RExC_state_t *pRExC_state,
18097 const bool force_to_xmod
18100 /* If the text at the current parse position '*p' is a '(?#...)' comment,
18101 * or if we are under /x or 'force_to_xmod' is TRUE, and the text at '*p'
18102 * is /x whitespace, advance '*p' so that on exit it points to the first
18103 * byte past all such white space and comments */
18105 const bool use_xmod = force_to_xmod || (RExC_flags & RXf_PMf_EXTENDED);
18107 PERL_ARGS_ASSERT_SKIP_TO_BE_IGNORED_TEXT;
18109 assert( ! UTF || UTF8_IS_INVARIANT(**p) || UTF8_IS_START(**p));
18112 if (RExC_end - (*p) >= 3
18114 && *(*p + 1) == '?'
18115 && *(*p + 2) == '#')
18117 while (*(*p) != ')') {
18118 if ((*p) == RExC_end)
18119 FAIL("Sequence (?#... not terminated");
18127 const char * save_p = *p;
18128 while ((*p) < RExC_end) {
18130 if ((len = is_PATWS_safe((*p), RExC_end, UTF))) {
18133 else if (*(*p) == '#') {
18134 (*p) = reg_skipcomment(pRExC_state, (*p));
18140 if (*p != save_p) {
18153 Advances the parse position by one byte, unless that byte is the beginning
18154 of a '(?#...)' style comment, or is /x whitespace and /x is in effect. In
18155 those two cases, the parse position is advanced beyond all such comments and
18158 This is the UTF, (?#...), and /x friendly way of saying RExC_parse++.
18162 S_nextchar(pTHX_ RExC_state_t *pRExC_state)
18164 PERL_ARGS_ASSERT_NEXTCHAR;
18166 if (RExC_parse < RExC_end) {
18168 || UTF8_IS_INVARIANT(*RExC_parse)
18169 || UTF8_IS_START(*RExC_parse));
18171 RExC_parse += (UTF) ? UTF8SKIP(RExC_parse) : 1;
18173 skip_to_be_ignored_text(pRExC_state, &RExC_parse,
18174 FALSE /* Don't assume /x */ );
18179 S_regnode_guts(pTHX_ RExC_state_t *pRExC_state, const U8 op, const STRLEN extra_size, const char* const name)
18181 /* Allocate a regnode for 'op' and returns it, with 'extra_size' extra
18182 * space. In pass1, it aligns and increments RExC_size; in pass2,
18185 regnode * const ret = RExC_emit;
18186 GET_RE_DEBUG_FLAGS_DECL;
18188 PERL_ARGS_ASSERT_REGNODE_GUTS;
18190 assert(extra_size >= regarglen[op]);
18193 SIZE_ALIGN(RExC_size);
18194 RExC_size += 1 + extra_size;
18197 if (RExC_emit >= RExC_emit_bound)
18198 Perl_croak(aTHX_ "panic: reg_node overrun trying to emit %d, %p>=%p",
18199 op, (void*)RExC_emit, (void*)RExC_emit_bound);
18201 NODE_ALIGN_FILL(ret);
18202 #ifndef RE_TRACK_PATTERN_OFFSETS
18203 PERL_UNUSED_ARG(name);
18205 if (RExC_offsets) { /* MJD */
18207 ("%s:%d: (op %s) %s %"UVuf" (len %"UVuf") (max %"UVuf").\n",
18210 (UV)(RExC_emit - RExC_emit_start) > RExC_offsets[0]
18211 ? "Overwriting end of array!\n" : "OK",
18212 (UV)(RExC_emit - RExC_emit_start),
18213 (UV)(RExC_parse - RExC_start),
18214 (UV)RExC_offsets[0]));
18215 Set_Node_Offset(RExC_emit, RExC_parse + (op == END));
18222 - reg_node - emit a node
18224 STATIC regnode * /* Location. */
18225 S_reg_node(pTHX_ RExC_state_t *pRExC_state, U8 op)
18227 regnode * const ret = regnode_guts(pRExC_state, op, regarglen[op], "reg_node");
18229 PERL_ARGS_ASSERT_REG_NODE;
18231 assert(regarglen[op] == 0);
18234 regnode *ptr = ret;
18235 FILL_ADVANCE_NODE(ptr, op);
18242 - reganode - emit a node with an argument
18244 STATIC regnode * /* Location. */
18245 S_reganode(pTHX_ RExC_state_t *pRExC_state, U8 op, U32 arg)
18247 regnode * const ret = regnode_guts(pRExC_state, op, regarglen[op], "reganode");
18249 PERL_ARGS_ASSERT_REGANODE;
18251 assert(regarglen[op] == 1);
18254 regnode *ptr = ret;
18255 FILL_ADVANCE_NODE_ARG(ptr, op, arg);
18262 S_reg2Lanode(pTHX_ RExC_state_t *pRExC_state, const U8 op, const U32 arg1, const I32 arg2)
18264 /* emit a node with U32 and I32 arguments */
18266 regnode * const ret = regnode_guts(pRExC_state, op, regarglen[op], "reg2Lanode");
18268 PERL_ARGS_ASSERT_REG2LANODE;
18270 assert(regarglen[op] == 2);
18273 regnode *ptr = ret;
18274 FILL_ADVANCE_NODE_2L_ARG(ptr, op, arg1, arg2);
18281 - reginsert - insert an operator in front of already-emitted operand
18283 * Means relocating the operand.
18286 S_reginsert(pTHX_ RExC_state_t *pRExC_state, U8 op, regnode *opnd, U32 depth)
18291 const int offset = regarglen[(U8)op];
18292 const int size = NODE_STEP_REGNODE + offset;
18293 GET_RE_DEBUG_FLAGS_DECL;
18295 PERL_ARGS_ASSERT_REGINSERT;
18296 PERL_UNUSED_CONTEXT;
18297 PERL_UNUSED_ARG(depth);
18298 /* (PL_regkind[(U8)op] == CURLY ? EXTRA_STEP_2ARGS : 0); */
18299 DEBUG_PARSE_FMT("inst"," - %s",PL_reg_name[op]);
18304 assert(!RExC_study_started); /* I believe we should never use reginsert once we have started
18305 studying. If this is wrong then we need to adjust RExC_recurse
18306 below like we do with RExC_open_parens/RExC_close_parens. */
18310 if (RExC_open_parens) {
18312 /*DEBUG_PARSE_FMT("inst"," - %"IVdf, (IV)RExC_npar);*/
18313 /* remember that RExC_npar is rex->nparens + 1,
18314 * iow it is 1 more than the number of parens seen in
18315 * the pattern so far. */
18316 for ( paren=0 ; paren < RExC_npar ; paren++ ) {
18317 /* note, RExC_open_parens[0] is the start of the
18318 * regex, it can't move. RExC_close_parens[0] is the end
18319 * of the regex, it *can* move. */
18320 if ( paren && RExC_open_parens[paren] >= opnd ) {
18321 /*DEBUG_PARSE_FMT("open"," - %d",size);*/
18322 RExC_open_parens[paren] += size;
18324 /*DEBUG_PARSE_FMT("open"," - %s","ok");*/
18326 if ( RExC_close_parens[paren] >= opnd ) {
18327 /*DEBUG_PARSE_FMT("close"," - %d",size);*/
18328 RExC_close_parens[paren] += size;
18330 /*DEBUG_PARSE_FMT("close"," - %s","ok");*/
18335 RExC_end_op += size;
18337 while (src > opnd) {
18338 StructCopy(--src, --dst, regnode);
18339 #ifdef RE_TRACK_PATTERN_OFFSETS
18340 if (RExC_offsets) { /* MJD 20010112 */
18342 ("%s(%d): (op %s) %s copy %"UVuf" -> %"UVuf" (max %"UVuf").\n",
18346 (UV)(dst - RExC_emit_start) > RExC_offsets[0]
18347 ? "Overwriting end of array!\n" : "OK",
18348 (UV)(src - RExC_emit_start),
18349 (UV)(dst - RExC_emit_start),
18350 (UV)RExC_offsets[0]));
18351 Set_Node_Offset_To_R(dst-RExC_emit_start, Node_Offset(src));
18352 Set_Node_Length_To_R(dst-RExC_emit_start, Node_Length(src));
18358 place = opnd; /* Op node, where operand used to be. */
18359 #ifdef RE_TRACK_PATTERN_OFFSETS
18360 if (RExC_offsets) { /* MJD */
18362 ("%s(%d): (op %s) %s %"UVuf" <- %"UVuf" (max %"UVuf").\n",
18366 (UV)(place - RExC_emit_start) > RExC_offsets[0]
18367 ? "Overwriting end of array!\n" : "OK",
18368 (UV)(place - RExC_emit_start),
18369 (UV)(RExC_parse - RExC_start),
18370 (UV)RExC_offsets[0]));
18371 Set_Node_Offset(place, RExC_parse);
18372 Set_Node_Length(place, 1);
18375 src = NEXTOPER(place);
18376 FILL_ADVANCE_NODE(place, op);
18377 Zero(src, offset, regnode);
18381 - regtail - set the next-pointer at the end of a node chain of p to val.
18382 - SEE ALSO: regtail_study
18385 S_regtail(pTHX_ RExC_state_t * pRExC_state,
18386 const regnode * const p,
18387 const regnode * const val,
18391 GET_RE_DEBUG_FLAGS_DECL;
18393 PERL_ARGS_ASSERT_REGTAIL;
18395 PERL_UNUSED_ARG(depth);
18401 /* Find last node. */
18402 scan = (regnode *) p;
18404 regnode * const temp = regnext(scan);
18406 DEBUG_PARSE_MSG((scan==p ? "tail" : ""));
18407 regprop(RExC_rx, RExC_mysv, scan, NULL, pRExC_state);
18408 Perl_re_printf( aTHX_ "~ %s (%d) %s %s\n",
18409 SvPV_nolen_const(RExC_mysv), REG_NODE_NUM(scan),
18410 (temp == NULL ? "->" : ""),
18411 (temp == NULL ? PL_reg_name[OP(val)] : "")
18419 if (reg_off_by_arg[OP(scan)]) {
18420 ARG_SET(scan, val - scan);
18423 NEXT_OFF(scan) = val - scan;
18429 - regtail_study - set the next-pointer at the end of a node chain of p to val.
18430 - Look for optimizable sequences at the same time.
18431 - currently only looks for EXACT chains.
18433 This is experimental code. The idea is to use this routine to perform
18434 in place optimizations on branches and groups as they are constructed,
18435 with the long term intention of removing optimization from study_chunk so
18436 that it is purely analytical.
18438 Currently only used when in DEBUG mode. The macro REGTAIL_STUDY() is used
18439 to control which is which.
18442 /* TODO: All four parms should be const */
18445 S_regtail_study(pTHX_ RExC_state_t *pRExC_state, regnode *p,
18446 const regnode *val,U32 depth)
18450 #ifdef EXPERIMENTAL_INPLACESCAN
18453 GET_RE_DEBUG_FLAGS_DECL;
18455 PERL_ARGS_ASSERT_REGTAIL_STUDY;
18461 /* Find last node. */
18465 regnode * const temp = regnext(scan);
18466 #ifdef EXPERIMENTAL_INPLACESCAN
18467 if (PL_regkind[OP(scan)] == EXACT) {
18468 bool unfolded_multi_char; /* Unexamined in this routine */
18469 if (join_exact(pRExC_state, scan, &min,
18470 &unfolded_multi_char, 1, val, depth+1))
18475 switch (OP(scan)) {
18479 case EXACTFA_NO_TRIE:
18485 if( exact == PSEUDO )
18487 else if ( exact != OP(scan) )
18496 DEBUG_PARSE_MSG((scan==p ? "tsdy" : ""));
18497 regprop(RExC_rx, RExC_mysv, scan, NULL, pRExC_state);
18498 Perl_re_printf( aTHX_ "~ %s (%d) -> %s\n",
18499 SvPV_nolen_const(RExC_mysv),
18500 REG_NODE_NUM(scan),
18501 PL_reg_name[exact]);
18508 DEBUG_PARSE_MSG("");
18509 regprop(RExC_rx, RExC_mysv, val, NULL, pRExC_state);
18510 Perl_re_printf( aTHX_
18511 "~ attach to %s (%"IVdf") offset to %"IVdf"\n",
18512 SvPV_nolen_const(RExC_mysv),
18513 (IV)REG_NODE_NUM(val),
18517 if (reg_off_by_arg[OP(scan)]) {
18518 ARG_SET(scan, val - scan);
18521 NEXT_OFF(scan) = val - scan;
18529 - regdump - dump a regexp onto Perl_debug_log in vaguely comprehensible form
18534 S_regdump_intflags(pTHX_ const char *lead, const U32 flags)
18539 ASSUME(REG_INTFLAGS_NAME_SIZE <= sizeof(flags)*8);
18541 for (bit=0; bit<REG_INTFLAGS_NAME_SIZE; bit++) {
18542 if (flags & (1<<bit)) {
18543 if (!set++ && lead)
18544 Perl_re_printf( aTHX_ "%s",lead);
18545 Perl_re_printf( aTHX_ "%s ",PL_reg_intflags_name[bit]);
18550 Perl_re_printf( aTHX_ "\n");
18552 Perl_re_printf( aTHX_ "%s[none-set]\n",lead);
18557 S_regdump_extflags(pTHX_ const char *lead, const U32 flags)
18563 ASSUME(REG_EXTFLAGS_NAME_SIZE <= sizeof(flags)*8);
18565 for (bit=0; bit<REG_EXTFLAGS_NAME_SIZE; bit++) {
18566 if (flags & (1<<bit)) {
18567 if ((1<<bit) & RXf_PMf_CHARSET) { /* Output separately, below */
18570 if (!set++ && lead)
18571 Perl_re_printf( aTHX_ "%s",lead);
18572 Perl_re_printf( aTHX_ "%s ",PL_reg_extflags_name[bit]);
18575 if ((cs = get_regex_charset(flags)) != REGEX_DEPENDS_CHARSET) {
18576 if (!set++ && lead) {
18577 Perl_re_printf( aTHX_ "%s",lead);
18580 case REGEX_UNICODE_CHARSET:
18581 Perl_re_printf( aTHX_ "UNICODE");
18583 case REGEX_LOCALE_CHARSET:
18584 Perl_re_printf( aTHX_ "LOCALE");
18586 case REGEX_ASCII_RESTRICTED_CHARSET:
18587 Perl_re_printf( aTHX_ "ASCII-RESTRICTED");
18589 case REGEX_ASCII_MORE_RESTRICTED_CHARSET:
18590 Perl_re_printf( aTHX_ "ASCII-MORE_RESTRICTED");
18593 Perl_re_printf( aTHX_ "UNKNOWN CHARACTER SET");
18599 Perl_re_printf( aTHX_ "\n");
18601 Perl_re_printf( aTHX_ "%s[none-set]\n",lead);
18607 Perl_regdump(pTHX_ const regexp *r)
18610 SV * const sv = sv_newmortal();
18611 SV *dsv= sv_newmortal();
18612 RXi_GET_DECL(r,ri);
18613 GET_RE_DEBUG_FLAGS_DECL;
18615 PERL_ARGS_ASSERT_REGDUMP;
18617 (void)dumpuntil(r, ri->program, ri->program + 1, NULL, NULL, sv, 0, 0);
18619 /* Header fields of interest. */
18620 if (r->anchored_substr) {
18621 RE_PV_QUOTED_DECL(s, 0, dsv, SvPVX_const(r->anchored_substr),
18622 RE_SV_DUMPLEN(r->anchored_substr), 30);
18623 Perl_re_printf( aTHX_
18624 "anchored %s%s at %"IVdf" ",
18625 s, RE_SV_TAIL(r->anchored_substr),
18626 (IV)r->anchored_offset);
18627 } else if (r->anchored_utf8) {
18628 RE_PV_QUOTED_DECL(s, 1, dsv, SvPVX_const(r->anchored_utf8),
18629 RE_SV_DUMPLEN(r->anchored_utf8), 30);
18630 Perl_re_printf( aTHX_
18631 "anchored utf8 %s%s at %"IVdf" ",
18632 s, RE_SV_TAIL(r->anchored_utf8),
18633 (IV)r->anchored_offset);
18635 if (r->float_substr) {
18636 RE_PV_QUOTED_DECL(s, 0, dsv, SvPVX_const(r->float_substr),
18637 RE_SV_DUMPLEN(r->float_substr), 30);
18638 Perl_re_printf( aTHX_
18639 "floating %s%s at %"IVdf"..%"UVuf" ",
18640 s, RE_SV_TAIL(r->float_substr),
18641 (IV)r->float_min_offset, (UV)r->float_max_offset);
18642 } else if (r->float_utf8) {
18643 RE_PV_QUOTED_DECL(s, 1, dsv, SvPVX_const(r->float_utf8),
18644 RE_SV_DUMPLEN(r->float_utf8), 30);
18645 Perl_re_printf( aTHX_
18646 "floating utf8 %s%s at %"IVdf"..%"UVuf" ",
18647 s, RE_SV_TAIL(r->float_utf8),
18648 (IV)r->float_min_offset, (UV)r->float_max_offset);
18650 if (r->check_substr || r->check_utf8)
18651 Perl_re_printf( aTHX_
18653 (r->check_substr == r->float_substr
18654 && r->check_utf8 == r->float_utf8
18655 ? "(checking floating" : "(checking anchored"));
18656 if (r->intflags & PREGf_NOSCAN)
18657 Perl_re_printf( aTHX_ " noscan");
18658 if (r->extflags & RXf_CHECK_ALL)
18659 Perl_re_printf( aTHX_ " isall");
18660 if (r->check_substr || r->check_utf8)
18661 Perl_re_printf( aTHX_ ") ");
18663 if (ri->regstclass) {
18664 regprop(r, sv, ri->regstclass, NULL, NULL);
18665 Perl_re_printf( aTHX_ "stclass %s ", SvPVX_const(sv));
18667 if (r->intflags & PREGf_ANCH) {
18668 Perl_re_printf( aTHX_ "anchored");
18669 if (r->intflags & PREGf_ANCH_MBOL)
18670 Perl_re_printf( aTHX_ "(MBOL)");
18671 if (r->intflags & PREGf_ANCH_SBOL)
18672 Perl_re_printf( aTHX_ "(SBOL)");
18673 if (r->intflags & PREGf_ANCH_GPOS)
18674 Perl_re_printf( aTHX_ "(GPOS)");
18675 Perl_re_printf( aTHX_ " ");
18677 if (r->intflags & PREGf_GPOS_SEEN)
18678 Perl_re_printf( aTHX_ "GPOS:%"UVuf" ", (UV)r->gofs);
18679 if (r->intflags & PREGf_SKIP)
18680 Perl_re_printf( aTHX_ "plus ");
18681 if (r->intflags & PREGf_IMPLICIT)
18682 Perl_re_printf( aTHX_ "implicit ");
18683 Perl_re_printf( aTHX_ "minlen %"IVdf" ", (IV)r->minlen);
18684 if (r->extflags & RXf_EVAL_SEEN)
18685 Perl_re_printf( aTHX_ "with eval ");
18686 Perl_re_printf( aTHX_ "\n");
18688 regdump_extflags("r->extflags: ",r->extflags);
18689 regdump_intflags("r->intflags: ",r->intflags);
18692 PERL_ARGS_ASSERT_REGDUMP;
18693 PERL_UNUSED_CONTEXT;
18694 PERL_UNUSED_ARG(r);
18695 #endif /* DEBUGGING */
18698 /* Should be synchronized with ANYOF_ #defines in regcomp.h */
18701 # if _CC_WORDCHAR != 0 || _CC_DIGIT != 1 || _CC_ALPHA != 2 \
18702 || _CC_LOWER != 3 || _CC_UPPER != 4 || _CC_PUNCT != 5 \
18703 || _CC_PRINT != 6 || _CC_ALPHANUMERIC != 7 || _CC_GRAPH != 8 \
18704 || _CC_CASED != 9 || _CC_SPACE != 10 || _CC_BLANK != 11 \
18705 || _CC_XDIGIT != 12 || _CC_CNTRL != 13 || _CC_ASCII != 14 \
18706 || _CC_VERTSPACE != 15
18707 # error Need to adjust order of anyofs[]
18709 static const char * const anyofs[] = {
18746 - regprop - printable representation of opcode, with run time support
18750 Perl_regprop(pTHX_ const regexp *prog, SV *sv, const regnode *o, const regmatch_info *reginfo, const RExC_state_t *pRExC_state)
18754 RXi_GET_DECL(prog,progi);
18755 GET_RE_DEBUG_FLAGS_DECL;
18757 PERL_ARGS_ASSERT_REGPROP;
18759 sv_setpvn(sv, "", 0);
18761 if (OP(o) > REGNODE_MAX) /* regnode.type is unsigned */
18762 /* It would be nice to FAIL() here, but this may be called from
18763 regexec.c, and it would be hard to supply pRExC_state. */
18764 Perl_croak(aTHX_ "Corrupted regexp opcode %d > %d",
18765 (int)OP(o), (int)REGNODE_MAX);
18766 sv_catpv(sv, PL_reg_name[OP(o)]); /* Take off const! */
18768 k = PL_regkind[OP(o)];
18771 sv_catpvs(sv, " ");
18772 /* Using is_utf8_string() (via PERL_PV_UNI_DETECT)
18773 * is a crude hack but it may be the best for now since
18774 * we have no flag "this EXACTish node was UTF-8"
18776 pv_pretty(sv, STRING(o), STR_LEN(o), 60, PL_colors[0], PL_colors[1],
18777 PERL_PV_ESCAPE_UNI_DETECT |
18778 PERL_PV_ESCAPE_NONASCII |
18779 PERL_PV_PRETTY_ELLIPSES |
18780 PERL_PV_PRETTY_LTGT |
18781 PERL_PV_PRETTY_NOCLEAR
18783 } else if (k == TRIE) {
18784 /* print the details of the trie in dumpuntil instead, as
18785 * progi->data isn't available here */
18786 const char op = OP(o);
18787 const U32 n = ARG(o);
18788 const reg_ac_data * const ac = IS_TRIE_AC(op) ?
18789 (reg_ac_data *)progi->data->data[n] :
18791 const reg_trie_data * const trie
18792 = (reg_trie_data*)progi->data->data[!IS_TRIE_AC(op) ? n : ac->trie];
18794 Perl_sv_catpvf(aTHX_ sv, "-%s",PL_reg_name[o->flags]);
18795 DEBUG_TRIE_COMPILE_r(
18796 Perl_sv_catpvf(aTHX_ sv,
18797 "<S:%"UVuf"/%"IVdf" W:%"UVuf" L:%"UVuf"/%"UVuf" C:%"UVuf"/%"UVuf">",
18798 (UV)trie->startstate,
18799 (IV)trie->statecount-1, /* -1 because of the unused 0 element */
18800 (UV)trie->wordcount,
18803 (UV)TRIE_CHARCOUNT(trie),
18804 (UV)trie->uniquecharcount
18807 if ( IS_ANYOF_TRIE(op) || trie->bitmap ) {
18808 sv_catpvs(sv, "[");
18809 (void) put_charclass_bitmap_innards(sv,
18810 ((IS_ANYOF_TRIE(op))
18812 : TRIE_BITMAP(trie)),
18817 sv_catpvs(sv, "]");
18820 } else if (k == CURLY) {
18821 U32 lo = ARG1(o), hi = ARG2(o);
18822 if (OP(o) == CURLYM || OP(o) == CURLYN || OP(o) == CURLYX)
18823 Perl_sv_catpvf(aTHX_ sv, "[%d]", o->flags); /* Parenth number */
18824 Perl_sv_catpvf(aTHX_ sv, "{%u,", (unsigned) lo);
18825 if (hi == REG_INFTY)
18826 sv_catpvs(sv, "INFTY");
18828 Perl_sv_catpvf(aTHX_ sv, "%u", (unsigned) hi);
18829 sv_catpvs(sv, "}");
18831 else if (k == WHILEM && o->flags) /* Ordinal/of */
18832 Perl_sv_catpvf(aTHX_ sv, "[%d/%d]", o->flags & 0xf, o->flags>>4);
18833 else if (k == REF || k == OPEN || k == CLOSE
18834 || k == GROUPP || OP(o)==ACCEPT)
18836 AV *name_list= NULL;
18837 U32 parno= OP(o) == ACCEPT ? (U32)ARG2L(o) : ARG(o);
18838 Perl_sv_catpvf(aTHX_ sv, "%"UVuf, (UV)parno); /* Parenth number */
18839 if ( RXp_PAREN_NAMES(prog) ) {
18840 name_list= MUTABLE_AV(progi->data->data[progi->name_list_idx]);
18841 } else if ( pRExC_state ) {
18842 name_list= RExC_paren_name_list;
18845 if ( k != REF || (OP(o) < NREF)) {
18846 SV **name= av_fetch(name_list, parno, 0 );
18848 Perl_sv_catpvf(aTHX_ sv, " '%"SVf"'", SVfARG(*name));
18851 SV *sv_dat= MUTABLE_SV(progi->data->data[ parno ]);
18852 I32 *nums=(I32*)SvPVX(sv_dat);
18853 SV **name= av_fetch(name_list, nums[0], 0 );
18856 for ( n=0; n<SvIVX(sv_dat); n++ ) {
18857 Perl_sv_catpvf(aTHX_ sv, "%s%"IVdf,
18858 (n ? "," : ""), (IV)nums[n]);
18860 Perl_sv_catpvf(aTHX_ sv, " '%"SVf"'", SVfARG(*name));
18864 if ( k == REF && reginfo) {
18865 U32 n = ARG(o); /* which paren pair */
18866 I32 ln = prog->offs[n].start;
18867 if (prog->lastparen < n || ln == -1)
18868 Perl_sv_catpvf(aTHX_ sv, ": FAIL");
18869 else if (ln == prog->offs[n].end)
18870 Perl_sv_catpvf(aTHX_ sv, ": ACCEPT - EMPTY STRING");
18872 const char *s = reginfo->strbeg + ln;
18873 Perl_sv_catpvf(aTHX_ sv, ": ");
18874 Perl_pv_pretty( aTHX_ sv, s, prog->offs[n].end - prog->offs[n].start, 32, 0, 0,
18875 PERL_PV_ESCAPE_UNI_DETECT|PERL_PV_PRETTY_NOCLEAR|PERL_PV_PRETTY_ELLIPSES|PERL_PV_PRETTY_QUOTE );
18878 } else if (k == GOSUB) {
18879 AV *name_list= NULL;
18880 if ( RXp_PAREN_NAMES(prog) ) {
18881 name_list= MUTABLE_AV(progi->data->data[progi->name_list_idx]);
18882 } else if ( pRExC_state ) {
18883 name_list= RExC_paren_name_list;
18886 /* Paren and offset */
18887 Perl_sv_catpvf(aTHX_ sv, "%d[%+d:%d]", (int)ARG(o),(int)ARG2L(o),
18888 (int)((o + (int)ARG2L(o)) - progi->program) );
18890 SV **name= av_fetch(name_list, ARG(o), 0 );
18892 Perl_sv_catpvf(aTHX_ sv, " '%"SVf"'", SVfARG(*name));
18895 else if (k == LOGICAL)
18896 /* 2: embedded, otherwise 1 */
18897 Perl_sv_catpvf(aTHX_ sv, "[%d]", o->flags);
18898 else if (k == ANYOF) {
18899 const U8 flags = ANYOF_FLAGS(o);
18900 bool do_sep = FALSE; /* Do we need to separate various components of
18902 /* Set if there is still an unresolved user-defined property */
18903 SV *unresolved = NULL;
18905 /* Things that are ignored except when the runtime locale is UTF-8 */
18906 SV *only_utf8_locale_invlist = NULL;
18908 /* Code points that don't fit in the bitmap */
18909 SV *nonbitmap_invlist = NULL;
18911 /* And things that aren't in the bitmap, but are small enough to be */
18912 SV* bitmap_range_not_in_bitmap = NULL;
18914 if (OP(o) == ANYOFL) {
18915 if (ANYOFL_UTF8_LOCALE_REQD(flags)) {
18916 sv_catpvs(sv, "{utf8-locale-reqd}");
18918 if (flags & ANYOFL_FOLD) {
18919 sv_catpvs(sv, "{i}");
18923 /* If there is stuff outside the bitmap, get it */
18924 if (ARG(o) != ANYOF_ONLY_HAS_BITMAP) {
18925 (void) _get_regclass_nonbitmap_data(prog, o, FALSE,
18927 &only_utf8_locale_invlist,
18928 &nonbitmap_invlist);
18929 /* The non-bitmap data may contain stuff that could fit in the
18930 * bitmap. This could come from a user-defined property being
18931 * finally resolved when this call was done; or much more likely
18932 * because there are matches that require UTF-8 to be valid, and so
18933 * aren't in the bitmap. This is teased apart later */
18934 _invlist_intersection(nonbitmap_invlist,
18936 &bitmap_range_not_in_bitmap);
18937 /* Leave just the things that don't fit into the bitmap */
18938 _invlist_subtract(nonbitmap_invlist,
18940 &nonbitmap_invlist);
18943 /* Obey this flag to add all above-the-bitmap code points */
18944 if (flags & ANYOF_MATCHES_ALL_ABOVE_BITMAP) {
18945 nonbitmap_invlist = _add_range_to_invlist(nonbitmap_invlist,
18946 NUM_ANYOF_CODE_POINTS,
18950 /* Ready to start outputting. First, the initial left bracket */
18951 Perl_sv_catpvf(aTHX_ sv, "[%s", PL_colors[0]);
18953 /* Then all the things that could fit in the bitmap */
18954 do_sep = put_charclass_bitmap_innards(sv,
18956 bitmap_range_not_in_bitmap,
18957 only_utf8_locale_invlist,
18959 SvREFCNT_dec(bitmap_range_not_in_bitmap);
18961 /* If there are user-defined properties which haven't been defined yet,
18962 * output them, in a separate [] from the bitmap range stuff */
18965 Perl_sv_catpvf(aTHX_ sv,"%s][%s",PL_colors[1],PL_colors[0]);
18967 if (flags & ANYOF_INVERT) {
18968 sv_catpvs(sv, "^");
18970 sv_catsv(sv, unresolved);
18972 SvREFCNT_dec_NN(unresolved);
18975 /* And, finally, add the above-the-bitmap stuff */
18976 if (nonbitmap_invlist && _invlist_len(nonbitmap_invlist)) {
18979 /* See if truncation size is overridden */
18980 const STRLEN dump_len = (PL_dump_re_max_len)
18981 ? PL_dump_re_max_len
18984 /* This is output in a separate [] */
18986 Perl_sv_catpvf(aTHX_ sv,"%s][%s",PL_colors[1],PL_colors[0]);
18989 /* And, for easy of understanding, it is always output not-shown as
18991 if (flags & ANYOF_INVERT) {
18992 _invlist_invert(nonbitmap_invlist);
18993 _invlist_subtract(nonbitmap_invlist, PL_InBitmap, &nonbitmap_invlist);
18996 contents = invlist_contents(nonbitmap_invlist,
18997 FALSE /* output suitable for catsv */
19000 /* If the output is shorter than the permissible maximum, just do it. */
19001 if (SvCUR(contents) <= dump_len) {
19002 sv_catsv(sv, contents);
19005 const char * contents_string = SvPVX(contents);
19006 STRLEN i = dump_len;
19008 /* Otherwise, start at the permissible max and work back to the
19009 * first break possibility */
19010 while (i > 0 && contents_string[i] != ' ') {
19013 if (i == 0) { /* Fail-safe. Use the max if we couldn't
19014 find a legal break */
19018 sv_catpvn(sv, contents_string, i);
19019 sv_catpvs(sv, "...");
19022 SvREFCNT_dec_NN(contents);
19023 SvREFCNT_dec_NN(nonbitmap_invlist);
19026 /* And finally the matching, closing ']' */
19027 Perl_sv_catpvf(aTHX_ sv, "%s]", PL_colors[1]);
19029 else if (k == POSIXD || k == NPOSIXD) {
19030 U8 index = FLAGS(o) * 2;
19031 if (index < C_ARRAY_LENGTH(anyofs)) {
19032 if (*anyofs[index] != '[') {
19035 sv_catpv(sv, anyofs[index]);
19036 if (*anyofs[index] != '[') {
19041 Perl_sv_catpvf(aTHX_ sv, "[illegal type=%d])", index);
19044 else if (k == BOUND || k == NBOUND) {
19045 /* Must be synced with order of 'bound_type' in regcomp.h */
19046 const char * const bounds[] = {
19047 "", /* Traditional */
19053 assert(FLAGS(o) < C_ARRAY_LENGTH(bounds));
19054 sv_catpv(sv, bounds[FLAGS(o)]);
19056 else if (k == BRANCHJ && (OP(o) == UNLESSM || OP(o) == IFMATCH))
19057 Perl_sv_catpvf(aTHX_ sv, "[%d]", -(o->flags));
19058 else if (OP(o) == SBOL)
19059 Perl_sv_catpvf(aTHX_ sv, " /%s/", o->flags ? "\\A" : "^");
19061 /* add on the verb argument if there is one */
19062 if ( ( k == VERB || OP(o) == ACCEPT || OP(o) == OPFAIL ) && o->flags) {
19063 Perl_sv_catpvf(aTHX_ sv, ":%"SVf,
19064 SVfARG((MUTABLE_SV(progi->data->data[ ARG( o ) ]))));
19067 PERL_UNUSED_CONTEXT;
19068 PERL_UNUSED_ARG(sv);
19069 PERL_UNUSED_ARG(o);
19070 PERL_UNUSED_ARG(prog);
19071 PERL_UNUSED_ARG(reginfo);
19072 PERL_UNUSED_ARG(pRExC_state);
19073 #endif /* DEBUGGING */
19079 Perl_re_intuit_string(pTHX_ REGEXP * const r)
19080 { /* Assume that RE_INTUIT is set */
19081 struct regexp *const prog = ReANY(r);
19082 GET_RE_DEBUG_FLAGS_DECL;
19084 PERL_ARGS_ASSERT_RE_INTUIT_STRING;
19085 PERL_UNUSED_CONTEXT;
19089 const char * const s = SvPV_nolen_const(RX_UTF8(r)
19090 ? prog->check_utf8 : prog->check_substr);
19092 if (!PL_colorset) reginitcolors();
19093 Perl_re_printf( aTHX_
19094 "%sUsing REx %ssubstr:%s \"%s%.60s%s%s\"\n",
19096 RX_UTF8(r) ? "utf8 " : "",
19097 PL_colors[5],PL_colors[0],
19100 (strlen(s) > 60 ? "..." : ""));
19103 /* use UTF8 check substring if regexp pattern itself is in UTF8 */
19104 return RX_UTF8(r) ? prog->check_utf8 : prog->check_substr;
19110 handles refcounting and freeing the perl core regexp structure. When
19111 it is necessary to actually free the structure the first thing it
19112 does is call the 'free' method of the regexp_engine associated to
19113 the regexp, allowing the handling of the void *pprivate; member
19114 first. (This routine is not overridable by extensions, which is why
19115 the extensions free is called first.)
19117 See regdupe and regdupe_internal if you change anything here.
19119 #ifndef PERL_IN_XSUB_RE
19121 Perl_pregfree(pTHX_ REGEXP *r)
19127 Perl_pregfree2(pTHX_ REGEXP *rx)
19129 struct regexp *const r = ReANY(rx);
19130 GET_RE_DEBUG_FLAGS_DECL;
19132 PERL_ARGS_ASSERT_PREGFREE2;
19134 if (r->mother_re) {
19135 ReREFCNT_dec(r->mother_re);
19137 CALLREGFREE_PVT(rx); /* free the private data */
19138 SvREFCNT_dec(RXp_PAREN_NAMES(r));
19139 Safefree(r->xpv_len_u.xpvlenu_pv);
19142 SvREFCNT_dec(r->anchored_substr);
19143 SvREFCNT_dec(r->anchored_utf8);
19144 SvREFCNT_dec(r->float_substr);
19145 SvREFCNT_dec(r->float_utf8);
19146 Safefree(r->substrs);
19148 RX_MATCH_COPY_FREE(rx);
19149 #ifdef PERL_ANY_COW
19150 SvREFCNT_dec(r->saved_copy);
19153 SvREFCNT_dec(r->qr_anoncv);
19154 if (r->recurse_locinput)
19155 Safefree(r->recurse_locinput);
19156 rx->sv_u.svu_rx = 0;
19161 This is a hacky workaround to the structural issue of match results
19162 being stored in the regexp structure which is in turn stored in
19163 PL_curpm/PL_reg_curpm. The problem is that due to qr// the pattern
19164 could be PL_curpm in multiple contexts, and could require multiple
19165 result sets being associated with the pattern simultaneously, such
19166 as when doing a recursive match with (??{$qr})
19168 The solution is to make a lightweight copy of the regexp structure
19169 when a qr// is returned from the code executed by (??{$qr}) this
19170 lightweight copy doesn't actually own any of its data except for
19171 the starp/end and the actual regexp structure itself.
19177 Perl_reg_temp_copy (pTHX_ REGEXP *ret_x, REGEXP *rx)
19179 struct regexp *ret;
19180 struct regexp *const r = ReANY(rx);
19181 const bool islv = ret_x && SvTYPE(ret_x) == SVt_PVLV;
19183 PERL_ARGS_ASSERT_REG_TEMP_COPY;
19186 ret_x = (REGEXP*) newSV_type(SVt_REGEXP);
19188 SvOK_off((SV *)ret_x);
19190 /* For PVLVs, SvANY points to the xpvlv body while sv_u points
19191 to the regexp. (For SVt_REGEXPs, sv_upgrade has already
19192 made both spots point to the same regexp body.) */
19193 REGEXP *temp = (REGEXP *)newSV_type(SVt_REGEXP);
19194 assert(!SvPVX(ret_x));
19195 ret_x->sv_u.svu_rx = temp->sv_any;
19196 temp->sv_any = NULL;
19197 SvFLAGS(temp) = (SvFLAGS(temp) & ~SVTYPEMASK) | SVt_NULL;
19198 SvREFCNT_dec_NN(temp);
19199 /* SvCUR still resides in the xpvlv struct, so the regexp copy-
19200 ing below will not set it. */
19201 SvCUR_set(ret_x, SvCUR(rx));
19204 /* This ensures that SvTHINKFIRST(sv) is true, and hence that
19205 sv_force_normal(sv) is called. */
19207 ret = ReANY(ret_x);
19209 SvFLAGS(ret_x) |= SvUTF8(rx);
19210 /* We share the same string buffer as the original regexp, on which we
19211 hold a reference count, incremented when mother_re is set below.
19212 The string pointer is copied here, being part of the regexp struct.
19214 memcpy(&(ret->xpv_cur), &(r->xpv_cur),
19215 sizeof(regexp) - STRUCT_OFFSET(regexp, xpv_cur));
19217 const I32 npar = r->nparens+1;
19218 Newx(ret->offs, npar, regexp_paren_pair);
19219 Copy(r->offs, ret->offs, npar, regexp_paren_pair);
19222 Newx(ret->substrs, 1, struct reg_substr_data);
19223 StructCopy(r->substrs, ret->substrs, struct reg_substr_data);
19225 SvREFCNT_inc_void(ret->anchored_substr);
19226 SvREFCNT_inc_void(ret->anchored_utf8);
19227 SvREFCNT_inc_void(ret->float_substr);
19228 SvREFCNT_inc_void(ret->float_utf8);
19230 /* check_substr and check_utf8, if non-NULL, point to either their
19231 anchored or float namesakes, and don't hold a second reference. */
19233 RX_MATCH_COPIED_off(ret_x);
19234 #ifdef PERL_ANY_COW
19235 ret->saved_copy = NULL;
19237 ret->mother_re = ReREFCNT_inc(r->mother_re ? r->mother_re : rx);
19238 SvREFCNT_inc_void(ret->qr_anoncv);
19239 if (r->recurse_locinput)
19240 Newxz(ret->recurse_locinput,r->nparens + 1,char *);
19246 /* regfree_internal()
19248 Free the private data in a regexp. This is overloadable by
19249 extensions. Perl takes care of the regexp structure in pregfree(),
19250 this covers the *pprivate pointer which technically perl doesn't
19251 know about, however of course we have to handle the
19252 regexp_internal structure when no extension is in use.
19254 Note this is called before freeing anything in the regexp
19259 Perl_regfree_internal(pTHX_ REGEXP * const rx)
19261 struct regexp *const r = ReANY(rx);
19262 RXi_GET_DECL(r,ri);
19263 GET_RE_DEBUG_FLAGS_DECL;
19265 PERL_ARGS_ASSERT_REGFREE_INTERNAL;
19271 SV *dsv= sv_newmortal();
19272 RE_PV_QUOTED_DECL(s, RX_UTF8(rx),
19273 dsv, RX_PRECOMP(rx), RX_PRELEN(rx), 60);
19274 Perl_re_printf( aTHX_ "%sFreeing REx:%s %s\n",
19275 PL_colors[4],PL_colors[5],s);
19278 #ifdef RE_TRACK_PATTERN_OFFSETS
19280 Safefree(ri->u.offsets); /* 20010421 MJD */
19282 if (ri->code_blocks) {
19284 for (n = 0; n < ri->num_code_blocks; n++)
19285 SvREFCNT_dec(ri->code_blocks[n].src_regex);
19286 Safefree(ri->code_blocks);
19290 int n = ri->data->count;
19293 /* If you add a ->what type here, update the comment in regcomp.h */
19294 switch (ri->data->what[n]) {
19300 SvREFCNT_dec(MUTABLE_SV(ri->data->data[n]));
19303 Safefree(ri->data->data[n]);
19309 { /* Aho Corasick add-on structure for a trie node.
19310 Used in stclass optimization only */
19312 reg_ac_data *aho=(reg_ac_data*)ri->data->data[n];
19313 #ifdef USE_ITHREADS
19317 refcount = --aho->refcount;
19320 PerlMemShared_free(aho->states);
19321 PerlMemShared_free(aho->fail);
19322 /* do this last!!!! */
19323 PerlMemShared_free(ri->data->data[n]);
19324 /* we should only ever get called once, so
19325 * assert as much, and also guard the free
19326 * which /might/ happen twice. At the least
19327 * it will make code anlyzers happy and it
19328 * doesn't cost much. - Yves */
19329 assert(ri->regstclass);
19330 if (ri->regstclass) {
19331 PerlMemShared_free(ri->regstclass);
19332 ri->regstclass = 0;
19339 /* trie structure. */
19341 reg_trie_data *trie=(reg_trie_data*)ri->data->data[n];
19342 #ifdef USE_ITHREADS
19346 refcount = --trie->refcount;
19349 PerlMemShared_free(trie->charmap);
19350 PerlMemShared_free(trie->states);
19351 PerlMemShared_free(trie->trans);
19353 PerlMemShared_free(trie->bitmap);
19355 PerlMemShared_free(trie->jump);
19356 PerlMemShared_free(trie->wordinfo);
19357 /* do this last!!!! */
19358 PerlMemShared_free(ri->data->data[n]);
19363 Perl_croak(aTHX_ "panic: regfree data code '%c'",
19364 ri->data->what[n]);
19367 Safefree(ri->data->what);
19368 Safefree(ri->data);
19374 #define av_dup_inc(s,t) MUTABLE_AV(sv_dup_inc((const SV *)s,t))
19375 #define hv_dup_inc(s,t) MUTABLE_HV(sv_dup_inc((const SV *)s,t))
19376 #define SAVEPVN(p,n) ((p) ? savepvn(p,n) : NULL)
19379 re_dup_guts - duplicate a regexp.
19381 This routine is expected to clone a given regexp structure. It is only
19382 compiled under USE_ITHREADS.
19384 After all of the core data stored in struct regexp is duplicated
19385 the regexp_engine.dupe method is used to copy any private data
19386 stored in the *pprivate pointer. This allows extensions to handle
19387 any duplication it needs to do.
19389 See pregfree() and regfree_internal() if you change anything here.
19391 #if defined(USE_ITHREADS)
19392 #ifndef PERL_IN_XSUB_RE
19394 Perl_re_dup_guts(pTHX_ const REGEXP *sstr, REGEXP *dstr, CLONE_PARAMS *param)
19398 const struct regexp *r = ReANY(sstr);
19399 struct regexp *ret = ReANY(dstr);
19401 PERL_ARGS_ASSERT_RE_DUP_GUTS;
19403 npar = r->nparens+1;
19404 Newx(ret->offs, npar, regexp_paren_pair);
19405 Copy(r->offs, ret->offs, npar, regexp_paren_pair);
19407 if (ret->substrs) {
19408 /* Do it this way to avoid reading from *r after the StructCopy().
19409 That way, if any of the sv_dup_inc()s dislodge *r from the L1
19410 cache, it doesn't matter. */
19411 const bool anchored = r->check_substr
19412 ? r->check_substr == r->anchored_substr
19413 : r->check_utf8 == r->anchored_utf8;
19414 Newx(ret->substrs, 1, struct reg_substr_data);
19415 StructCopy(r->substrs, ret->substrs, struct reg_substr_data);
19417 ret->anchored_substr = sv_dup_inc(ret->anchored_substr, param);
19418 ret->anchored_utf8 = sv_dup_inc(ret->anchored_utf8, param);
19419 ret->float_substr = sv_dup_inc(ret->float_substr, param);
19420 ret->float_utf8 = sv_dup_inc(ret->float_utf8, param);
19422 /* check_substr and check_utf8, if non-NULL, point to either their
19423 anchored or float namesakes, and don't hold a second reference. */
19425 if (ret->check_substr) {
19427 assert(r->check_utf8 == r->anchored_utf8);
19428 ret->check_substr = ret->anchored_substr;
19429 ret->check_utf8 = ret->anchored_utf8;
19431 assert(r->check_substr == r->float_substr);
19432 assert(r->check_utf8 == r->float_utf8);
19433 ret->check_substr = ret->float_substr;
19434 ret->check_utf8 = ret->float_utf8;
19436 } else if (ret->check_utf8) {
19438 ret->check_utf8 = ret->anchored_utf8;
19440 ret->check_utf8 = ret->float_utf8;
19445 RXp_PAREN_NAMES(ret) = hv_dup_inc(RXp_PAREN_NAMES(ret), param);
19446 ret->qr_anoncv = MUTABLE_CV(sv_dup_inc((const SV *)ret->qr_anoncv, param));
19447 if (r->recurse_locinput)
19448 Newxz(ret->recurse_locinput,r->nparens + 1,char *);
19451 RXi_SET(ret,CALLREGDUPE_PVT(dstr,param));
19453 if (RX_MATCH_COPIED(dstr))
19454 ret->subbeg = SAVEPVN(ret->subbeg, ret->sublen);
19456 ret->subbeg = NULL;
19457 #ifdef PERL_ANY_COW
19458 ret->saved_copy = NULL;
19461 /* Whether mother_re be set or no, we need to copy the string. We
19462 cannot refrain from copying it when the storage points directly to
19463 our mother regexp, because that's
19464 1: a buffer in a different thread
19465 2: something we no longer hold a reference on
19466 so we need to copy it locally. */
19467 RX_WRAPPED(dstr) = SAVEPVN(RX_WRAPPED(sstr), SvCUR(sstr)+1);
19468 ret->mother_re = NULL;
19470 #endif /* PERL_IN_XSUB_RE */
19475 This is the internal complement to regdupe() which is used to copy
19476 the structure pointed to by the *pprivate pointer in the regexp.
19477 This is the core version of the extension overridable cloning hook.
19478 The regexp structure being duplicated will be copied by perl prior
19479 to this and will be provided as the regexp *r argument, however
19480 with the /old/ structures pprivate pointer value. Thus this routine
19481 may override any copying normally done by perl.
19483 It returns a pointer to the new regexp_internal structure.
19487 Perl_regdupe_internal(pTHX_ REGEXP * const rx, CLONE_PARAMS *param)
19490 struct regexp *const r = ReANY(rx);
19491 regexp_internal *reti;
19493 RXi_GET_DECL(r,ri);
19495 PERL_ARGS_ASSERT_REGDUPE_INTERNAL;
19499 Newxc(reti, sizeof(regexp_internal) + len*sizeof(regnode),
19500 char, regexp_internal);
19501 Copy(ri->program, reti->program, len+1, regnode);
19504 reti->num_code_blocks = ri->num_code_blocks;
19505 if (ri->code_blocks) {
19507 Newxc(reti->code_blocks, ri->num_code_blocks, struct reg_code_block,
19508 struct reg_code_block);
19509 Copy(ri->code_blocks, reti->code_blocks, ri->num_code_blocks,
19510 struct reg_code_block);
19511 for (n = 0; n < ri->num_code_blocks; n++)
19512 reti->code_blocks[n].src_regex = (REGEXP*)
19513 sv_dup_inc((SV*)(ri->code_blocks[n].src_regex), param);
19516 reti->code_blocks = NULL;
19518 reti->regstclass = NULL;
19521 struct reg_data *d;
19522 const int count = ri->data->count;
19525 Newxc(d, sizeof(struct reg_data) + count*sizeof(void *),
19526 char, struct reg_data);
19527 Newx(d->what, count, U8);
19530 for (i = 0; i < count; i++) {
19531 d->what[i] = ri->data->what[i];
19532 switch (d->what[i]) {
19533 /* see also regcomp.h and regfree_internal() */
19534 case 'a': /* actually an AV, but the dup function is identical. */
19538 case 'u': /* actually an HV, but the dup function is identical. */
19539 d->data[i] = sv_dup_inc((const SV *)ri->data->data[i], param);
19542 /* This is cheating. */
19543 Newx(d->data[i], 1, regnode_ssc);
19544 StructCopy(ri->data->data[i], d->data[i], regnode_ssc);
19545 reti->regstclass = (regnode*)d->data[i];
19548 /* Trie stclasses are readonly and can thus be shared
19549 * without duplication. We free the stclass in pregfree
19550 * when the corresponding reg_ac_data struct is freed.
19552 reti->regstclass= ri->regstclass;
19556 ((reg_trie_data*)ri->data->data[i])->refcount++;
19561 d->data[i] = ri->data->data[i];
19564 Perl_croak(aTHX_ "panic: re_dup_guts unknown data code '%c'",
19565 ri->data->what[i]);
19574 reti->name_list_idx = ri->name_list_idx;
19576 #ifdef RE_TRACK_PATTERN_OFFSETS
19577 if (ri->u.offsets) {
19578 Newx(reti->u.offsets, 2*len+1, U32);
19579 Copy(ri->u.offsets, reti->u.offsets, 2*len+1, U32);
19582 SetProgLen(reti,len);
19585 return (void*)reti;
19588 #endif /* USE_ITHREADS */
19590 #ifndef PERL_IN_XSUB_RE
19593 - regnext - dig the "next" pointer out of a node
19596 Perl_regnext(pTHX_ regnode *p)
19603 if (OP(p) > REGNODE_MAX) { /* regnode.type is unsigned */
19604 Perl_croak(aTHX_ "Corrupted regexp opcode %d > %d",
19605 (int)OP(p), (int)REGNODE_MAX);
19608 offset = (reg_off_by_arg[OP(p)] ? ARG(p) : NEXT_OFF(p));
19617 S_re_croak2(pTHX_ bool utf8, const char* pat1,const char* pat2,...)
19620 STRLEN l1 = strlen(pat1);
19621 STRLEN l2 = strlen(pat2);
19624 const char *message;
19626 PERL_ARGS_ASSERT_RE_CROAK2;
19632 Copy(pat1, buf, l1 , char);
19633 Copy(pat2, buf + l1, l2 , char);
19634 buf[l1 + l2] = '\n';
19635 buf[l1 + l2 + 1] = '\0';
19636 va_start(args, pat2);
19637 msv = vmess(buf, &args);
19639 message = SvPV_const(msv,l1);
19642 Copy(message, buf, l1 , char);
19643 /* l1-1 to avoid \n */
19644 Perl_croak(aTHX_ "%"UTF8f, UTF8fARG(utf8, l1-1, buf));
19647 /* XXX Here's a total kludge. But we need to re-enter for swash routines. */
19649 #ifndef PERL_IN_XSUB_RE
19651 Perl_save_re_context(pTHX)
19656 /* Save $1..$n (#18107: UTF-8 s/(\w+)/uc($1)/e); AMS 20021106. */
19659 const REGEXP * const rx = PM_GETRE(PL_curpm);
19661 nparens = RX_NPARENS(rx);
19664 /* RT #124109. This is a complete hack; in the SWASHNEW case we know
19665 * that PL_curpm will be null, but that utf8.pm and the modules it
19666 * loads will only use $1..$3.
19667 * The t/porting/re_context.t test file checks this assumption.
19672 for (i = 1; i <= nparens; i++) {
19673 char digits[TYPE_CHARS(long)];
19674 const STRLEN len = my_snprintf(digits, sizeof(digits),
19676 GV *const *const gvp
19677 = (GV**)hv_fetch(PL_defstash, digits, len, 0);
19680 GV * const gv = *gvp;
19681 if (SvTYPE(gv) == SVt_PVGV && GvSV(gv))
19691 S_put_code_point(pTHX_ SV *sv, UV c)
19693 PERL_ARGS_ASSERT_PUT_CODE_POINT;
19696 Perl_sv_catpvf(aTHX_ sv, "\\x{%04"UVXf"}", c);
19698 else if (isPRINT(c)) {
19699 const char string = (char) c;
19701 /* We use {phrase} as metanotation in the class, so also escape literal
19703 if (isBACKSLASHED_PUNCT(c) || c == '{' || c == '}')
19704 sv_catpvs(sv, "\\");
19705 sv_catpvn(sv, &string, 1);
19707 else if (isMNEMONIC_CNTRL(c)) {
19708 Perl_sv_catpvf(aTHX_ sv, "%s", cntrl_to_mnemonic((U8) c));
19711 Perl_sv_catpvf(aTHX_ sv, "\\x%02X", (U8) c);
19715 #define MAX_PRINT_A MAX_PRINT_A_FOR_USE_ONLY_BY_REGCOMP_DOT_C
19718 S_put_range(pTHX_ SV *sv, UV start, const UV end, const bool allow_literals)
19720 /* Appends to 'sv' a displayable version of the range of code points from
19721 * 'start' to 'end'. Mnemonics (like '\r') are used for the few controls
19722 * that have them, when they occur at the beginning or end of the range.
19723 * It uses hex to output the remaining code points, unless 'allow_literals'
19724 * is true, in which case the printable ASCII ones are output as-is (though
19725 * some of these will be escaped by put_code_point()).
19727 * NOTE: This is designed only for printing ranges of code points that fit
19728 * inside an ANYOF bitmap. Higher code points are simply suppressed
19731 const unsigned int min_range_count = 3;
19733 assert(start <= end);
19735 PERL_ARGS_ASSERT_PUT_RANGE;
19737 while (start <= end) {
19739 const char * format;
19741 if (end - start < min_range_count) {
19743 /* Output chars individually when they occur in short ranges */
19744 for (; start <= end; start++) {
19745 put_code_point(sv, start);
19750 /* If permitted by the input options, and there is a possibility that
19751 * this range contains a printable literal, look to see if there is
19753 if (allow_literals && start <= MAX_PRINT_A) {
19755 /* If the character at the beginning of the range isn't an ASCII
19756 * printable, effectively split the range into two parts:
19757 * 1) the portion before the first such printable,
19759 * and output them separately. */
19760 if (! isPRINT_A(start)) {
19761 UV temp_end = start + 1;
19763 /* There is no point looking beyond the final possible
19764 * printable, in MAX_PRINT_A */
19765 UV max = MIN(end, MAX_PRINT_A);
19767 while (temp_end <= max && ! isPRINT_A(temp_end)) {
19771 /* Here, temp_end points to one beyond the first printable if
19772 * found, or to one beyond 'max' if not. If none found, make
19773 * sure that we use the entire range */
19774 if (temp_end > MAX_PRINT_A) {
19775 temp_end = end + 1;
19778 /* Output the first part of the split range: the part that
19779 * doesn't have printables, with the parameter set to not look
19780 * for literals (otherwise we would infinitely recurse) */
19781 put_range(sv, start, temp_end - 1, FALSE);
19783 /* The 2nd part of the range (if any) starts here. */
19786 /* We do a continue, instead of dropping down, because even if
19787 * the 2nd part is non-empty, it could be so short that we want
19788 * to output it as individual characters, as tested for at the
19789 * top of this loop. */
19793 /* Here, 'start' is a printable ASCII. If it is an alphanumeric,
19794 * output a sub-range of just the digits or letters, then process
19795 * the remaining portion as usual. */
19796 if (isALPHANUMERIC_A(start)) {
19797 UV mask = (isDIGIT_A(start))
19802 UV temp_end = start + 1;
19804 /* Find the end of the sub-range that includes just the
19805 * characters in the same class as the first character in it */
19806 while (temp_end <= end && _generic_isCC_A(temp_end, mask)) {
19811 /* For short ranges, don't duplicate the code above to output
19812 * them; just call recursively */
19813 if (temp_end - start < min_range_count) {
19814 put_range(sv, start, temp_end, FALSE);
19816 else { /* Output as a range */
19817 put_code_point(sv, start);
19818 sv_catpvs(sv, "-");
19819 put_code_point(sv, temp_end);
19821 start = temp_end + 1;
19825 /* We output any other printables as individual characters */
19826 if (isPUNCT_A(start) || isSPACE_A(start)) {
19827 while (start <= end && (isPUNCT_A(start)
19828 || isSPACE_A(start)))
19830 put_code_point(sv, start);
19835 } /* End of looking for literals */
19837 /* Here is not to output as a literal. Some control characters have
19838 * mnemonic names. Split off any of those at the beginning and end of
19839 * the range to print mnemonically. It isn't possible for many of
19840 * these to be in a row, so this won't overwhelm with output */
19841 while (isMNEMONIC_CNTRL(start) && start <= end) {
19842 put_code_point(sv, start);
19845 if (start < end && isMNEMONIC_CNTRL(end)) {
19847 /* Here, the final character in the range has a mnemonic name.
19848 * Work backwards from the end to find the final non-mnemonic */
19849 UV temp_end = end - 1;
19850 while (isMNEMONIC_CNTRL(temp_end)) {
19854 /* And separately output the interior range that doesn't start or
19855 * end with mnemonics */
19856 put_range(sv, start, temp_end, FALSE);
19858 /* Then output the mnemonic trailing controls */
19859 start = temp_end + 1;
19860 while (start <= end) {
19861 put_code_point(sv, start);
19867 /* As a final resort, output the range or subrange as hex. */
19869 this_end = (end < NUM_ANYOF_CODE_POINTS)
19871 : NUM_ANYOF_CODE_POINTS - 1;
19872 #if NUM_ANYOF_CODE_POINTS > 256
19873 format = (this_end < 256)
19874 ? "\\x%02"UVXf"-\\x%02"UVXf""
19875 : "\\x{%04"UVXf"}-\\x{%04"UVXf"}";
19877 format = "\\x%02"UVXf"-\\x%02"UVXf"";
19879 GCC_DIAG_IGNORE(-Wformat-nonliteral);
19880 Perl_sv_catpvf(aTHX_ sv, format, start, this_end);
19887 S_put_charclass_bitmap_innards_invlist(pTHX_ SV *sv, SV* invlist)
19889 /* Concatenate onto the PV in 'sv' a displayable form of the inversion list
19893 bool allow_literals = TRUE;
19895 PERL_ARGS_ASSERT_PUT_CHARCLASS_BITMAP_INNARDS_INVLIST;
19897 /* Generally, it is more readable if printable characters are output as
19898 * literals, but if a range (nearly) spans all of them, it's best to output
19899 * it as a single range. This code will use a single range if all but 2
19900 * ASCII printables are in it */
19901 invlist_iterinit(invlist);
19902 while (invlist_iternext(invlist, &start, &end)) {
19904 /* If the range starts beyond the final printable, it doesn't have any
19906 if (start > MAX_PRINT_A) {
19910 /* In both ASCII and EBCDIC, a SPACE is the lowest printable. To span
19911 * all but two, the range must start and end no later than 2 from
19913 if (start < ' ' + 2 && end > MAX_PRINT_A - 2) {
19914 if (end > MAX_PRINT_A) {
19920 if (end - start >= MAX_PRINT_A - ' ' - 2) {
19921 allow_literals = FALSE;
19926 invlist_iterfinish(invlist);
19928 /* Here we have figured things out. Output each range */
19929 invlist_iterinit(invlist);
19930 while (invlist_iternext(invlist, &start, &end)) {
19931 if (start >= NUM_ANYOF_CODE_POINTS) {
19934 put_range(sv, start, end, allow_literals);
19936 invlist_iterfinish(invlist);
19942 S_put_charclass_bitmap_innards_common(pTHX_
19943 SV* invlist, /* The bitmap */
19944 SV* posixes, /* Under /l, things like [:word:], \S */
19945 SV* only_utf8, /* Under /d, matches iff the target is UTF-8 */
19946 SV* not_utf8, /* /d, matches iff the target isn't UTF-8 */
19947 SV* only_utf8_locale, /* Under /l, matches if the locale is UTF-8 */
19948 const bool invert /* Is the result to be inverted? */
19951 /* Create and return an SV containing a displayable version of the bitmap
19952 * and associated information determined by the input parameters. */
19956 PERL_ARGS_ASSERT_PUT_CHARCLASS_BITMAP_INNARDS_COMMON;
19959 output = newSVpvs("^");
19962 output = newSVpvs("");
19965 /* First, the code points in the bitmap that are unconditionally there */
19966 put_charclass_bitmap_innards_invlist(output, invlist);
19968 /* Traditionally, these have been placed after the main code points */
19970 sv_catsv(output, posixes);
19973 if (only_utf8 && _invlist_len(only_utf8)) {
19974 Perl_sv_catpvf(aTHX_ output, "%s{utf8}%s", PL_colors[1], PL_colors[0]);
19975 put_charclass_bitmap_innards_invlist(output, only_utf8);
19978 if (not_utf8 && _invlist_len(not_utf8)) {
19979 Perl_sv_catpvf(aTHX_ output, "%s{not utf8}%s", PL_colors[1], PL_colors[0]);
19980 put_charclass_bitmap_innards_invlist(output, not_utf8);
19983 if (only_utf8_locale && _invlist_len(only_utf8_locale)) {
19984 Perl_sv_catpvf(aTHX_ output, "%s{utf8 locale}%s", PL_colors[1], PL_colors[0]);
19985 put_charclass_bitmap_innards_invlist(output, only_utf8_locale);
19987 /* This is the only list in this routine that can legally contain code
19988 * points outside the bitmap range. The call just above to
19989 * 'put_charclass_bitmap_innards_invlist' will simply suppress them, so
19990 * output them here. There's about a half-dozen possible, and none in
19991 * contiguous ranges longer than 2 */
19992 if (invlist_highest(only_utf8_locale) >= NUM_ANYOF_CODE_POINTS) {
19994 SV* above_bitmap = NULL;
19996 _invlist_subtract(only_utf8_locale, PL_InBitmap, &above_bitmap);
19998 invlist_iterinit(above_bitmap);
19999 while (invlist_iternext(above_bitmap, &start, &end)) {
20002 for (i = start; i <= end; i++) {
20003 put_code_point(output, i);
20006 invlist_iterfinish(above_bitmap);
20007 SvREFCNT_dec_NN(above_bitmap);
20011 /* If the only thing we output is the '^', clear it */
20012 if (invert && SvCUR(output) == 1) {
20013 SvCUR_set(output, 0);
20020 S_put_charclass_bitmap_innards(pTHX_ SV *sv,
20022 SV *nonbitmap_invlist,
20023 SV *only_utf8_locale_invlist,
20024 const regnode * const node)
20026 /* Appends to 'sv' a displayable version of the innards of the bracketed
20027 * character class defined by the other arguments:
20028 * 'bitmap' points to the bitmap.
20029 * 'nonbitmap_invlist' is an inversion list of the code points that are in
20030 * the bitmap range, but for some reason aren't in the bitmap; NULL if
20031 * none. The reasons for this could be that they require some
20032 * condition such as the target string being or not being in UTF-8
20033 * (under /d), or because they came from a user-defined property that
20034 * was not resolved at the time of the regex compilation (under /u)
20035 * 'only_utf8_locale_invlist' is an inversion list of the code points that
20036 * are valid only if the runtime locale is a UTF-8 one; NULL if none
20037 * 'node' is the regex pattern node. It is needed only when the above two
20038 * parameters are not null, and is passed so that this routine can
20039 * tease apart the various reasons for them.
20041 * It returns TRUE if there was actually something output. (It may be that
20042 * the bitmap, etc is empty.)
20044 * When called for outputting the bitmap of a non-ANYOF node, just pass the
20045 * bitmap, with the succeeding parameters set to NULL.
20049 /* In general, it tries to display the 'cleanest' representation of the
20050 * innards, choosing whether to display them inverted or not, regardless of
20051 * whether the class itself is to be inverted. However, there are some
20052 * cases where it can't try inverting, as what actually matches isn't known
20053 * until runtime, and hence the inversion isn't either. */
20054 bool inverting_allowed = TRUE;
20057 STRLEN orig_sv_cur = SvCUR(sv);
20059 SV* invlist; /* Inversion list we accumulate of code points that
20060 are unconditionally matched */
20061 SV* only_utf8 = NULL; /* Under /d, list of matches iff the target is
20063 SV* not_utf8 = NULL; /* /d, list of matches iff the target isn't UTF-8
20065 SV* posixes = NULL; /* Under /l, string of things like [:word:], \D */
20066 SV* only_utf8_locale = NULL; /* Under /l, list of matches if the locale
20069 SV* as_is_display; /* The output string when we take the inputs
20071 SV* inverted_display; /* The output string when we invert the inputs */
20073 U8 flags = (node) ? ANYOF_FLAGS(node) : 0;
20075 bool invert = cBOOL(flags & ANYOF_INVERT); /* Is the input to be inverted
20077 /* We are biased in favor of displaying things without them being inverted,
20078 * as that is generally easier to understand */
20079 const int bias = 5;
20081 PERL_ARGS_ASSERT_PUT_CHARCLASS_BITMAP_INNARDS;
20083 /* Start off with whatever code points are passed in. (We clone, so we
20084 * don't change the caller's list) */
20085 if (nonbitmap_invlist) {
20086 assert(invlist_highest(nonbitmap_invlist) < NUM_ANYOF_CODE_POINTS);
20087 invlist = invlist_clone(nonbitmap_invlist);
20089 else { /* Worst case size is every other code point is matched */
20090 invlist = _new_invlist(NUM_ANYOF_CODE_POINTS / 2);
20094 if (OP(node) == ANYOFD) {
20096 /* This flag indicates that the code points below 0x100 in the
20097 * nonbitmap list are precisely the ones that match only when the
20098 * target is UTF-8 (they should all be non-ASCII). */
20099 if (flags & ANYOF_SHARED_d_UPPER_LATIN1_UTF8_STRING_MATCHES_non_d_RUNTIME_USER_PROP)
20101 _invlist_intersection(invlist, PL_UpperLatin1, &only_utf8);
20102 _invlist_subtract(invlist, only_utf8, &invlist);
20105 /* And this flag for matching all non-ASCII 0xFF and below */
20106 if (flags & ANYOF_SHARED_d_MATCHES_ALL_NON_UTF8_NON_ASCII_non_d_WARN_SUPER)
20108 not_utf8 = invlist_clone(PL_UpperLatin1);
20111 else if (OP(node) == ANYOFL) {
20113 /* If either of these flags are set, what matches isn't
20114 * determinable except during execution, so don't know enough here
20116 if (flags & (ANYOFL_FOLD|ANYOF_MATCHES_POSIXL)) {
20117 inverting_allowed = FALSE;
20120 /* What the posix classes match also varies at runtime, so these
20121 * will be output symbolically. */
20122 if (ANYOF_POSIXL_TEST_ANY_SET(node)) {
20125 posixes = newSVpvs("");
20126 for (i = 0; i < ANYOF_POSIXL_MAX; i++) {
20127 if (ANYOF_POSIXL_TEST(node,i)) {
20128 sv_catpv(posixes, anyofs[i]);
20135 /* Accumulate the bit map into the unconditional match list */
20136 for (i = 0; i < NUM_ANYOF_CODE_POINTS; i++) {
20137 if (BITMAP_TEST(bitmap, i)) {
20139 for (; i < NUM_ANYOF_CODE_POINTS && BITMAP_TEST(bitmap, i); i++) {
20142 invlist = _add_range_to_invlist(invlist, start, i-1);
20146 /* Make sure that the conditional match lists don't have anything in them
20147 * that match unconditionally; otherwise the output is quite confusing.
20148 * This could happen if the code that populates these misses some
20151 _invlist_subtract(only_utf8, invlist, &only_utf8);
20154 _invlist_subtract(not_utf8, invlist, ¬_utf8);
20157 if (only_utf8_locale_invlist) {
20159 /* Since this list is passed in, we have to make a copy before
20161 only_utf8_locale = invlist_clone(only_utf8_locale_invlist);
20163 _invlist_subtract(only_utf8_locale, invlist, &only_utf8_locale);
20165 /* And, it can get really weird for us to try outputting an inverted
20166 * form of this list when it has things above the bitmap, so don't even
20168 if (invlist_highest(only_utf8_locale) >= NUM_ANYOF_CODE_POINTS) {
20169 inverting_allowed = FALSE;
20173 /* Calculate what the output would be if we take the input as-is */
20174 as_is_display = put_charclass_bitmap_innards_common(invlist,
20181 /* If have to take the output as-is, just do that */
20182 if (! inverting_allowed) {
20183 sv_catsv(sv, as_is_display);
20185 else { /* But otherwise, create the output again on the inverted input, and
20186 use whichever version is shorter */
20188 int inverted_bias, as_is_bias;
20190 /* We will apply our bias to whichever of the the results doesn't have
20200 inverted_bias = bias;
20203 /* Now invert each of the lists that contribute to the output,
20204 * excluding from the result things outside the possible range */
20206 /* For the unconditional inversion list, we have to add in all the
20207 * conditional code points, so that when inverted, they will be gone
20209 _invlist_union(only_utf8, invlist, &invlist);
20210 _invlist_union(not_utf8, invlist, &invlist);
20211 _invlist_union(only_utf8_locale, invlist, &invlist);
20212 _invlist_invert(invlist);
20213 _invlist_intersection(invlist, PL_InBitmap, &invlist);
20216 _invlist_invert(only_utf8);
20217 _invlist_intersection(only_utf8, PL_UpperLatin1, &only_utf8);
20221 _invlist_invert(not_utf8);
20222 _invlist_intersection(not_utf8, PL_UpperLatin1, ¬_utf8);
20225 if (only_utf8_locale) {
20226 _invlist_invert(only_utf8_locale);
20227 _invlist_intersection(only_utf8_locale,
20229 &only_utf8_locale);
20232 inverted_display = put_charclass_bitmap_innards_common(
20237 only_utf8_locale, invert);
20239 /* Use the shortest representation, taking into account our bias
20240 * against showing it inverted */
20241 if (SvCUR(inverted_display) + inverted_bias
20242 < SvCUR(as_is_display) + as_is_bias)
20244 sv_catsv(sv, inverted_display);
20247 sv_catsv(sv, as_is_display);
20250 SvREFCNT_dec_NN(as_is_display);
20251 SvREFCNT_dec_NN(inverted_display);
20254 SvREFCNT_dec_NN(invlist);
20255 SvREFCNT_dec(only_utf8);
20256 SvREFCNT_dec(not_utf8);
20257 SvREFCNT_dec(posixes);
20258 SvREFCNT_dec(only_utf8_locale);
20260 return SvCUR(sv) > orig_sv_cur;
20263 #define CLEAR_OPTSTART \
20264 if (optstart) STMT_START { \
20265 DEBUG_OPTIMISE_r(Perl_re_printf( aTHX_ \
20266 " (%"IVdf" nodes)\n", (IV)(node - optstart))); \
20270 #define DUMPUNTIL(b,e) \
20272 node=dumpuntil(r,start,(b),(e),last,sv,indent+1,depth+1);
20274 STATIC const regnode *
20275 S_dumpuntil(pTHX_ const regexp *r, const regnode *start, const regnode *node,
20276 const regnode *last, const regnode *plast,
20277 SV* sv, I32 indent, U32 depth)
20279 U8 op = PSEUDO; /* Arbitrary non-END op. */
20280 const regnode *next;
20281 const regnode *optstart= NULL;
20283 RXi_GET_DECL(r,ri);
20284 GET_RE_DEBUG_FLAGS_DECL;
20286 PERL_ARGS_ASSERT_DUMPUNTIL;
20288 #ifdef DEBUG_DUMPUNTIL
20289 Perl_re_printf( aTHX_ "--- %d : %d - %d - %d\n",indent,node-start,
20290 last ? last-start : 0,plast ? plast-start : 0);
20293 if (plast && plast < last)
20296 while (PL_regkind[op] != END && (!last || node < last)) {
20298 /* While that wasn't END last time... */
20301 if (op == CLOSE || op == WHILEM)
20303 next = regnext((regnode *)node);
20306 if (OP(node) == OPTIMIZED) {
20307 if (!optstart && RE_DEBUG_FLAG(RE_DEBUG_COMPILE_OPTIMISE))
20314 regprop(r, sv, node, NULL, NULL);
20315 Perl_re_printf( aTHX_ "%4"IVdf":%*s%s", (IV)(node - start),
20316 (int)(2*indent + 1), "", SvPVX_const(sv));
20318 if (OP(node) != OPTIMIZED) {
20319 if (next == NULL) /* Next ptr. */
20320 Perl_re_printf( aTHX_ " (0)");
20321 else if (PL_regkind[(U8)op] == BRANCH
20322 && PL_regkind[OP(next)] != BRANCH )
20323 Perl_re_printf( aTHX_ " (FAIL)");
20325 Perl_re_printf( aTHX_ " (%"IVdf")", (IV)(next - start));
20326 Perl_re_printf( aTHX_ "\n");
20330 if (PL_regkind[(U8)op] == BRANCHJ) {
20333 const regnode *nnode = (OP(next) == LONGJMP
20334 ? regnext((regnode *)next)
20336 if (last && nnode > last)
20338 DUMPUNTIL(NEXTOPER(NEXTOPER(node)), nnode);
20341 else if (PL_regkind[(U8)op] == BRANCH) {
20343 DUMPUNTIL(NEXTOPER(node), next);
20345 else if ( PL_regkind[(U8)op] == TRIE ) {
20346 const regnode *this_trie = node;
20347 const char op = OP(node);
20348 const U32 n = ARG(node);
20349 const reg_ac_data * const ac = op>=AHOCORASICK ?
20350 (reg_ac_data *)ri->data->data[n] :
20352 const reg_trie_data * const trie =
20353 (reg_trie_data*)ri->data->data[op<AHOCORASICK ? n : ac->trie];
20355 AV *const trie_words
20356 = MUTABLE_AV(ri->data->data[n + TRIE_WORDS_OFFSET]);
20358 const regnode *nextbranch= NULL;
20361 for (word_idx= 0; word_idx < (I32)trie->wordcount; word_idx++) {
20362 SV ** const elem_ptr = av_fetch(trie_words,word_idx,0);
20364 Perl_re_indentf( aTHX_ "%s ",
20367 ? pv_pretty(sv, SvPV_nolen_const(*elem_ptr),
20368 SvCUR(*elem_ptr), 60,
20369 PL_colors[0], PL_colors[1],
20371 ? PERL_PV_ESCAPE_UNI
20373 | PERL_PV_PRETTY_ELLIPSES
20374 | PERL_PV_PRETTY_LTGT
20379 U16 dist= trie->jump[word_idx+1];
20380 Perl_re_printf( aTHX_ "(%"UVuf")\n",
20381 (UV)((dist ? this_trie + dist : next) - start));
20384 nextbranch= this_trie + trie->jump[0];
20385 DUMPUNTIL(this_trie + dist, nextbranch);
20387 if (nextbranch && PL_regkind[OP(nextbranch)]==BRANCH)
20388 nextbranch= regnext((regnode *)nextbranch);
20390 Perl_re_printf( aTHX_ "\n");
20393 if (last && next > last)
20398 else if ( op == CURLY ) { /* "next" might be very big: optimizer */
20399 DUMPUNTIL(NEXTOPER(node) + EXTRA_STEP_2ARGS,
20400 NEXTOPER(node) + EXTRA_STEP_2ARGS + 1);
20402 else if (PL_regkind[(U8)op] == CURLY && op != CURLYX) {
20404 DUMPUNTIL(NEXTOPER(node) + EXTRA_STEP_2ARGS, next);
20406 else if ( op == PLUS || op == STAR) {
20407 DUMPUNTIL(NEXTOPER(node), NEXTOPER(node) + 1);
20409 else if (PL_regkind[(U8)op] == ANYOF) {
20410 /* arglen 1 + class block */
20411 node += 1 + ((ANYOF_FLAGS(node) & ANYOF_MATCHES_POSIXL)
20412 ? ANYOF_POSIXL_SKIP
20414 node = NEXTOPER(node);
20416 else if (PL_regkind[(U8)op] == EXACT) {
20417 /* Literal string, where present. */
20418 node += NODE_SZ_STR(node) - 1;
20419 node = NEXTOPER(node);
20422 node = NEXTOPER(node);
20423 node += regarglen[(U8)op];
20425 if (op == CURLYX || op == OPEN)
20429 #ifdef DEBUG_DUMPUNTIL
20430 Perl_re_printf( aTHX_ "--- %d\n", (int)indent);
20435 #endif /* DEBUGGING */
20438 * ex: set ts=8 sts=4 sw=4 et: