5 * 'A fair jaw-cracker dwarf-language must be.' --Samwise Gamgee
7 * [p.285 of _The Lord of the Rings_, II/iii: "The Ring Goes South"]
10 /* This file contains functions for compiling a regular expression. See
11 * also regexec.c which funnily enough, contains functions for executing
12 * a regular expression.
14 * This file is also copied at build time to ext/re/re_comp.c, where
15 * it's built with -DPERL_EXT_RE_BUILD -DPERL_EXT_RE_DEBUG -DPERL_EXT.
16 * This causes the main functions to be compiled under new names and with
17 * debugging support added, which makes "use re 'debug'" work.
20 /* NOTE: this is derived from Henry Spencer's regexp code, and should not
21 * confused with the original package (see point 3 below). Thanks, Henry!
24 /* Additional note: this code is very heavily munged from Henry's version
25 * in places. In some spots I've traded clarity for efficiency, so don't
26 * blame Henry for some of the lack of readability.
29 /* The names of the functions have been changed from regcomp and
30 * regexec to pregcomp and pregexec in order to avoid conflicts
31 * with the POSIX routines of the same names.
34 #ifdef PERL_EXT_RE_BUILD
39 * pregcomp and pregexec -- regsub and regerror are not used in perl
41 * Copyright (c) 1986 by University of Toronto.
42 * Written by Henry Spencer. Not derived from licensed software.
44 * Permission is granted to anyone to use this software for any
45 * purpose on any computer system, and to redistribute it freely,
46 * subject to the following restrictions:
48 * 1. The author is not responsible for the consequences of use of
49 * this software, no matter how awful, even if they arise
52 * 2. The origin of this software must not be misrepresented, either
53 * by explicit claim or by omission.
55 * 3. Altered versions must be plainly marked as such, and must not
56 * be misrepresented as being the original software.
59 **** Alterations to Henry's code are...
61 **** Copyright (C) 1991, 1992, 1993, 1994, 1995, 1996, 1997, 1998, 1999,
62 **** 2000, 2001, 2002, 2003, 2004, 2005, 2006, 2007, 2008
63 **** by Larry Wall and others
65 **** You may distribute under the terms of either the GNU General Public
66 **** License or the Artistic License, as specified in the README file.
69 * Beware that some of this code is subtly aware of the way operator
70 * precedence is structured in regular expressions. Serious changes in
71 * regular-expression syntax might require a total rethink.
74 #define PERL_IN_REGCOMP_C
77 #ifndef PERL_IN_XSUB_RE
82 #ifdef PERL_IN_XSUB_RE
84 EXTERN_C const struct regexp_engine my_reg_engine;
89 #include "dquote_inline.h"
90 #include "invlist_inline.h"
91 #include "unicode_constants.h"
93 #define HAS_NONLATIN1_FOLD_CLOSURE(i) \
94 _HAS_NONLATIN1_FOLD_CLOSURE_ONLY_FOR_USE_BY_REGCOMP_DOT_C_AND_REGEXEC_DOT_C(i)
95 #define HAS_NONLATIN1_SIMPLE_FOLD_CLOSURE(i) \
96 _HAS_NONLATIN1_SIMPLE_FOLD_CLOSURE_ONLY_FOR_USE_BY_REGCOMP_DOT_C_AND_REGEXEC_DOT_C(i)
97 #define IS_NON_FINAL_FOLD(c) _IS_NON_FINAL_FOLD_ONLY_FOR_USE_BY_REGCOMP_DOT_C(c)
98 #define IS_IN_SOME_FOLD_L1(c) _IS_IN_SOME_FOLD_ONLY_FOR_USE_BY_REGCOMP_DOT_C(c)
101 #define STATIC static
104 /* this is a chain of data about sub patterns we are processing that
105 need to be handled separately/specially in study_chunk. Its so
106 we can simulate recursion without losing state. */
108 typedef struct scan_frame {
109 regnode *last_regnode; /* last node to process in this frame */
110 regnode *next_regnode; /* next node to process when last is reached */
111 U32 prev_recursed_depth;
112 I32 stopparen; /* what stopparen do we use */
114 struct scan_frame *this_prev_frame; /* this previous frame */
115 struct scan_frame *prev_frame; /* previous frame */
116 struct scan_frame *next_frame; /* next frame */
119 /* Certain characters are output as a sequence with the first being a
121 #define isBACKSLASHED_PUNCT(c) strchr("-[]\\^", c)
124 struct RExC_state_t {
125 U32 flags; /* RXf_* are we folding, multilining? */
126 U32 pm_flags; /* PMf_* stuff from the calling PMOP */
127 char *precomp; /* uncompiled string. */
128 char *precomp_end; /* pointer to end of uncompiled string. */
129 REGEXP *rx_sv; /* The SV that is the regexp. */
130 regexp *rx; /* perl core regexp structure */
131 regexp_internal *rxi; /* internal data for regexp object
133 char *start; /* Start of input for compile */
134 char *end; /* End of input for compile */
135 char *parse; /* Input-scan pointer. */
136 char *adjusted_start; /* 'start', adjusted. See code use */
137 STRLEN precomp_adj; /* an offset beyond precomp. See code use */
138 SSize_t whilem_seen; /* number of WHILEM in this expr */
139 regnode *emit_start; /* Start of emitted-code area */
140 regnode *emit_bound; /* First regnode outside of the
142 regnode *emit; /* Code-emit pointer; if = &emit_dummy,
143 implies compiling, so don't emit */
144 regnode_ssc emit_dummy; /* placeholder for emit to point to;
145 large enough for the largest
146 non-EXACTish node, so can use it as
148 I32 naughty; /* How bad is this pattern? */
149 I32 sawback; /* Did we see \1, ...? */
151 SSize_t size; /* Code size. */
152 I32 npar; /* Capture buffer count, (OPEN) plus
153 one. ("par" 0 is the whole
155 I32 nestroot; /* root parens we are in - used by
159 regnode **open_parens; /* pointers to open parens */
160 regnode **close_parens; /* pointers to close parens */
161 regnode *end_op; /* END node in program */
162 I32 utf8; /* whether the pattern is utf8 or not */
163 I32 orig_utf8; /* whether the pattern was originally in utf8 */
164 /* XXX use this for future optimisation of case
165 * where pattern must be upgraded to utf8. */
166 I32 uni_semantics; /* If a d charset modifier should use unicode
167 rules, even if the pattern is not in
169 HV *paren_names; /* Paren names */
171 regnode **recurse; /* Recurse regops */
172 I32 recurse_count; /* Number of recurse regops we have generated */
173 U8 *study_chunk_recursed; /* bitmap of which subs we have moved
175 U32 study_chunk_recursed_bytes; /* bytes in bitmap */
178 I32 override_recoding;
180 I32 recode_x_to_native;
182 I32 in_multi_char_class;
183 struct reg_code_blocks *code_blocks;/* positions of literal (?{})
185 int code_index; /* next code_blocks[] slot */
186 SSize_t maxlen; /* mininum possible number of chars in string to match */
187 scan_frame *frame_head;
188 scan_frame *frame_last;
191 #ifdef ADD_TO_REGEXEC
192 char *starttry; /* -Dr: where regtry was called. */
193 #define RExC_starttry (pRExC_state->starttry)
195 SV *runtime_code_qr; /* qr with the runtime code blocks */
197 const char *lastparse;
199 AV *paren_name_list; /* idx -> name */
200 U32 study_chunk_recursed_count;
203 #define RExC_lastparse (pRExC_state->lastparse)
204 #define RExC_lastnum (pRExC_state->lastnum)
205 #define RExC_paren_name_list (pRExC_state->paren_name_list)
206 #define RExC_study_chunk_recursed_count (pRExC_state->study_chunk_recursed_count)
207 #define RExC_mysv (pRExC_state->mysv1)
208 #define RExC_mysv1 (pRExC_state->mysv1)
209 #define RExC_mysv2 (pRExC_state->mysv2)
212 bool seen_unfolded_sharp_s;
218 #define RExC_flags (pRExC_state->flags)
219 #define RExC_pm_flags (pRExC_state->pm_flags)
220 #define RExC_precomp (pRExC_state->precomp)
221 #define RExC_precomp_adj (pRExC_state->precomp_adj)
222 #define RExC_adjusted_start (pRExC_state->adjusted_start)
223 #define RExC_precomp_end (pRExC_state->precomp_end)
224 #define RExC_rx_sv (pRExC_state->rx_sv)
225 #define RExC_rx (pRExC_state->rx)
226 #define RExC_rxi (pRExC_state->rxi)
227 #define RExC_start (pRExC_state->start)
228 #define RExC_end (pRExC_state->end)
229 #define RExC_parse (pRExC_state->parse)
230 #define RExC_whilem_seen (pRExC_state->whilem_seen)
232 /* Set during the sizing pass when there is a LATIN SMALL LETTER SHARP S in any
233 * EXACTF node, hence was parsed under /di rules. If later in the parse,
234 * something forces the pattern into using /ui rules, the sharp s should be
235 * folded into the sequence 'ss', which takes up more space than previously
236 * calculated. This means that the sizing pass needs to be restarted. (The
237 * node also becomes an EXACTFU_SS.) For all other characters, an EXACTF node
238 * that gets converted to /ui (and EXACTFU) occupies the same amount of space,
239 * so there is no need to resize [perl #125990]. */
240 #define RExC_seen_unfolded_sharp_s (pRExC_state->seen_unfolded_sharp_s)
242 #ifdef RE_TRACK_PATTERN_OFFSETS
243 #define RExC_offsets (pRExC_state->rxi->u.offsets) /* I am not like the
246 #define RExC_emit (pRExC_state->emit)
247 #define RExC_emit_dummy (pRExC_state->emit_dummy)
248 #define RExC_emit_start (pRExC_state->emit_start)
249 #define RExC_emit_bound (pRExC_state->emit_bound)
250 #define RExC_sawback (pRExC_state->sawback)
251 #define RExC_seen (pRExC_state->seen)
252 #define RExC_size (pRExC_state->size)
253 #define RExC_maxlen (pRExC_state->maxlen)
254 #define RExC_npar (pRExC_state->npar)
255 #define RExC_nestroot (pRExC_state->nestroot)
256 #define RExC_extralen (pRExC_state->extralen)
257 #define RExC_seen_zerolen (pRExC_state->seen_zerolen)
258 #define RExC_utf8 (pRExC_state->utf8)
259 #define RExC_uni_semantics (pRExC_state->uni_semantics)
260 #define RExC_orig_utf8 (pRExC_state->orig_utf8)
261 #define RExC_open_parens (pRExC_state->open_parens)
262 #define RExC_close_parens (pRExC_state->close_parens)
263 #define RExC_end_op (pRExC_state->end_op)
264 #define RExC_paren_names (pRExC_state->paren_names)
265 #define RExC_recurse (pRExC_state->recurse)
266 #define RExC_recurse_count (pRExC_state->recurse_count)
267 #define RExC_study_chunk_recursed (pRExC_state->study_chunk_recursed)
268 #define RExC_study_chunk_recursed_bytes \
269 (pRExC_state->study_chunk_recursed_bytes)
270 #define RExC_in_lookbehind (pRExC_state->in_lookbehind)
271 #define RExC_contains_locale (pRExC_state->contains_locale)
273 # define RExC_recode_x_to_native (pRExC_state->recode_x_to_native)
275 #define RExC_in_multi_char_class (pRExC_state->in_multi_char_class)
276 #define RExC_frame_head (pRExC_state->frame_head)
277 #define RExC_frame_last (pRExC_state->frame_last)
278 #define RExC_frame_count (pRExC_state->frame_count)
279 #define RExC_strict (pRExC_state->strict)
280 #define RExC_study_started (pRExC_state->study_started)
281 #define RExC_warn_text (pRExC_state->warn_text)
282 #define RExC_in_script_run (pRExC_state->in_script_run)
284 /* Heuristic check on the complexity of the pattern: if TOO_NAUGHTY, we set
285 * a flag to disable back-off on the fixed/floating substrings - if it's
286 * a high complexity pattern we assume the benefit of avoiding a full match
287 * is worth the cost of checking for the substrings even if they rarely help.
289 #define RExC_naughty (pRExC_state->naughty)
290 #define TOO_NAUGHTY (10)
291 #define MARK_NAUGHTY(add) \
292 if (RExC_naughty < TOO_NAUGHTY) \
293 RExC_naughty += (add)
294 #define MARK_NAUGHTY_EXP(exp, add) \
295 if (RExC_naughty < TOO_NAUGHTY) \
296 RExC_naughty += RExC_naughty / (exp) + (add)
298 #define ISMULT1(c) ((c) == '*' || (c) == '+' || (c) == '?')
299 #define ISMULT2(s) ((*s) == '*' || (*s) == '+' || (*s) == '?' || \
300 ((*s) == '{' && regcurly(s)))
303 * Flags to be passed up and down.
305 #define WORST 0 /* Worst case. */
306 #define HASWIDTH 0x01 /* Known to match non-null strings. */
308 /* Simple enough to be STAR/PLUS operand; in an EXACTish node must be a single
309 * character. (There needs to be a case: in the switch statement in regexec.c
310 * for any node marked SIMPLE.) Note that this is not the same thing as
313 #define SPSTART 0x04 /* Starts with * or + */
314 #define POSTPONED 0x08 /* (?1),(?&name), (??{...}) or similar */
315 #define TRYAGAIN 0x10 /* Weeded out a declaration. */
316 #define RESTART_PASS1 0x20 /* Need to restart sizing pass */
317 #define NEED_UTF8 0x40 /* In conjunction with RESTART_PASS1, need to
318 calcuate sizes as UTF-8 */
320 #define REG_NODE_NUM(x) ((x) ? (int)((x)-RExC_emit_start) : -1)
322 /* whether trie related optimizations are enabled */
323 #if PERL_ENABLE_EXTENDED_TRIE_OPTIMISATION
324 #define TRIE_STUDY_OPT
325 #define FULL_TRIE_STUDY
331 #define PBYTE(u8str,paren) ((U8*)(u8str))[(paren) >> 3]
332 #define PBITVAL(paren) (1 << ((paren) & 7))
333 #define PAREN_TEST(u8str,paren) ( PBYTE(u8str,paren) & PBITVAL(paren))
334 #define PAREN_SET(u8str,paren) PBYTE(u8str,paren) |= PBITVAL(paren)
335 #define PAREN_UNSET(u8str,paren) PBYTE(u8str,paren) &= (~PBITVAL(paren))
337 #define REQUIRE_UTF8(flagp) STMT_START { \
340 *flagp = RESTART_PASS1|NEED_UTF8; \
345 /* Change from /d into /u rules, and restart the parse if we've already seen
346 * something whose size would increase as a result, by setting *flagp and
347 * returning 'restart_retval'. RExC_uni_semantics is a flag that indicates
348 * we've change to /u during the parse. */
349 #define REQUIRE_UNI_RULES(flagp, restart_retval) \
351 if (DEPENDS_SEMANTICS) { \
353 set_regex_charset(&RExC_flags, REGEX_UNICODE_CHARSET); \
354 RExC_uni_semantics = 1; \
355 if (RExC_seen_unfolded_sharp_s) { \
356 *flagp |= RESTART_PASS1; \
357 return restart_retval; \
362 /* This converts the named class defined in regcomp.h to its equivalent class
363 * number defined in handy.h. */
364 #define namedclass_to_classnum(class) ((int) ((class) / 2))
365 #define classnum_to_namedclass(classnum) ((classnum) * 2)
367 #define _invlist_union_complement_2nd(a, b, output) \
368 _invlist_union_maybe_complement_2nd(a, b, TRUE, output)
369 #define _invlist_intersection_complement_2nd(a, b, output) \
370 _invlist_intersection_maybe_complement_2nd(a, b, TRUE, output)
372 /* About scan_data_t.
374 During optimisation we recurse through the regexp program performing
375 various inplace (keyhole style) optimisations. In addition study_chunk
376 and scan_commit populate this data structure with information about
377 what strings MUST appear in the pattern. We look for the longest
378 string that must appear at a fixed location, and we look for the
379 longest string that may appear at a floating location. So for instance
384 Both 'FOO' and 'A' are fixed strings. Both 'B' and 'BAR' are floating
385 strings (because they follow a .* construct). study_chunk will identify
386 both FOO and BAR as being the longest fixed and floating strings respectively.
388 The strings can be composites, for instance
392 will result in a composite fixed substring 'foo'.
394 For each string some basic information is maintained:
397 This is the position the string must appear at, or not before.
398 It also implicitly (when combined with minlenp) tells us how many
399 characters must match before the string we are searching for.
400 Likewise when combined with minlenp and the length of the string it
401 tells us how many characters must appear after the string we have
405 Only used for floating strings. This is the rightmost point that
406 the string can appear at. If set to SSize_t_MAX it indicates that the
407 string can occur infinitely far to the right.
408 For fixed strings, it is equal to min_offset.
411 A pointer to the minimum number of characters of the pattern that the
412 string was found inside. This is important as in the case of positive
413 lookahead or positive lookbehind we can have multiple patterns
418 The minimum length of the pattern overall is 3, the minimum length
419 of the lookahead part is 3, but the minimum length of the part that
420 will actually match is 1. So 'FOO's minimum length is 3, but the
421 minimum length for the F is 1. This is important as the minimum length
422 is used to determine offsets in front of and behind the string being
423 looked for. Since strings can be composites this is the length of the
424 pattern at the time it was committed with a scan_commit. Note that
425 the length is calculated by study_chunk, so that the minimum lengths
426 are not known until the full pattern has been compiled, thus the
427 pointer to the value.
431 In the case of lookbehind the string being searched for can be
432 offset past the start point of the final matching string.
433 If this value was just blithely removed from the min_offset it would
434 invalidate some of the calculations for how many chars must match
435 before or after (as they are derived from min_offset and minlen and
436 the length of the string being searched for).
437 When the final pattern is compiled and the data is moved from the
438 scan_data_t structure into the regexp structure the information
439 about lookbehind is factored in, with the information that would
440 have been lost precalculated in the end_shift field for the
443 The fields pos_min and pos_delta are used to store the minimum offset
444 and the delta to the maximum offset at the current point in the pattern.
448 struct scan_data_substrs {
449 SV *str; /* longest substring found in pattern */
450 SSize_t min_offset; /* earliest point in string it can appear */
451 SSize_t max_offset; /* latest point in string it can appear */
452 SSize_t *minlenp; /* pointer to the minlen relevant to the string */
453 SSize_t lookbehind; /* is the pos of the string modified by LB */
454 I32 flags; /* per substring SF_* and SCF_* flags */
457 typedef struct scan_data_t {
458 /*I32 len_min; unused */
459 /*I32 len_delta; unused */
463 SSize_t last_end; /* min value, <0 unless valid. */
464 SSize_t last_start_min;
465 SSize_t last_start_max;
466 U8 cur_is_floating; /* whether the last_* values should be set as
467 * the next fixed (0) or floating (1)
470 /* [0] is longest fixed substring so far, [1] is longest float so far */
471 struct scan_data_substrs substrs[2];
473 I32 flags; /* common SF_* and SCF_* flags */
475 SSize_t *last_closep;
476 regnode_ssc *start_class;
480 * Forward declarations for pregcomp()'s friends.
483 static const scan_data_t zero_scan_data = {
484 0, 0, NULL, 0, 0, 0, 0,
486 { NULL, 0, 0, 0, 0, 0 },
487 { NULL, 0, 0, 0, 0, 0 },
494 #define SF_BEFORE_SEOL 0x0001
495 #define SF_BEFORE_MEOL 0x0002
496 #define SF_BEFORE_EOL (SF_BEFORE_SEOL|SF_BEFORE_MEOL)
498 #define SF_IS_INF 0x0040
499 #define SF_HAS_PAR 0x0080
500 #define SF_IN_PAR 0x0100
501 #define SF_HAS_EVAL 0x0200
504 /* SCF_DO_SUBSTR is the flag that tells the regexp analyzer to track the
505 * longest substring in the pattern. When it is not set the optimiser keeps
506 * track of position, but does not keep track of the actual strings seen,
508 * So for instance /foo/ will be parsed with SCF_DO_SUBSTR being true, but
511 * Similarly, /foo.*(blah|erm|huh).*fnorble/ will have "foo" and "fnorble"
512 * parsed with SCF_DO_SUBSTR on, but while processing the (...) it will be
513 * turned off because of the alternation (BRANCH). */
514 #define SCF_DO_SUBSTR 0x0400
516 #define SCF_DO_STCLASS_AND 0x0800
517 #define SCF_DO_STCLASS_OR 0x1000
518 #define SCF_DO_STCLASS (SCF_DO_STCLASS_AND|SCF_DO_STCLASS_OR)
519 #define SCF_WHILEM_VISITED_POS 0x2000
521 #define SCF_TRIE_RESTUDY 0x4000 /* Do restudy? */
522 #define SCF_SEEN_ACCEPT 0x8000
523 #define SCF_TRIE_DOING_RESTUDY 0x10000
524 #define SCF_IN_DEFINE 0x20000
529 #define UTF cBOOL(RExC_utf8)
531 /* The enums for all these are ordered so things work out correctly */
532 #define LOC (get_regex_charset(RExC_flags) == REGEX_LOCALE_CHARSET)
533 #define DEPENDS_SEMANTICS (get_regex_charset(RExC_flags) \
534 == REGEX_DEPENDS_CHARSET)
535 #define UNI_SEMANTICS (get_regex_charset(RExC_flags) == REGEX_UNICODE_CHARSET)
536 #define AT_LEAST_UNI_SEMANTICS (get_regex_charset(RExC_flags) \
537 >= REGEX_UNICODE_CHARSET)
538 #define ASCII_RESTRICTED (get_regex_charset(RExC_flags) \
539 == REGEX_ASCII_RESTRICTED_CHARSET)
540 #define AT_LEAST_ASCII_RESTRICTED (get_regex_charset(RExC_flags) \
541 >= REGEX_ASCII_RESTRICTED_CHARSET)
542 #define ASCII_FOLD_RESTRICTED (get_regex_charset(RExC_flags) \
543 == REGEX_ASCII_MORE_RESTRICTED_CHARSET)
545 #define FOLD cBOOL(RExC_flags & RXf_PMf_FOLD)
547 /* For programs that want to be strictly Unicode compatible by dying if any
548 * attempt is made to match a non-Unicode code point against a Unicode
550 #define ALWAYS_WARN_SUPER ckDEAD(packWARN(WARN_NON_UNICODE))
552 #define OOB_NAMEDCLASS -1
554 /* There is no code point that is out-of-bounds, so this is problematic. But
555 * its only current use is to initialize a variable that is always set before
557 #define OOB_UNICODE 0xDEADBEEF
559 #define CHR_SVLEN(sv) (UTF ? sv_len_utf8(sv) : SvCUR(sv))
562 /* length of regex to show in messages that don't mark a position within */
563 #define RegexLengthToShowInErrorMessages 127
566 * If MARKER[12] are adjusted, be sure to adjust the constants at the top
567 * of t/op/regmesg.t, the tests in t/op/re_tests, and those in
568 * op/pragma/warn/regcomp.
570 #define MARKER1 "<-- HERE" /* marker as it appears in the description */
571 #define MARKER2 " <-- HERE " /* marker as it appears within the regex */
573 #define REPORT_LOCATION " in regex; marked by " MARKER1 \
574 " in m/%" UTF8f MARKER2 "%" UTF8f "/"
576 /* The code in this file in places uses one level of recursion with parsing
577 * rebased to an alternate string constructed by us in memory. This can take
578 * the form of something that is completely different from the input, or
579 * something that uses the input as part of the alternate. In the first case,
580 * there should be no possibility of an error, as we are in complete control of
581 * the alternate string. But in the second case we don't control the input
582 * portion, so there may be errors in that. Here's an example:
584 * is handled specially because \x{df} folds to a sequence of more than one
585 * character, 'ss'. What is done is to create and parse an alternate string,
586 * which looks like this:
587 * /(?:\x{DF}|[abc\x{DF}def])/ui
588 * where it uses the input unchanged in the middle of something it constructs,
589 * which is a branch for the DF outside the character class, and clustering
590 * parens around the whole thing. (It knows enough to skip the DF inside the
591 * class while in this substitute parse.) 'abc' and 'def' may have errors that
592 * need to be reported. The general situation looks like this:
595 * Input: ----------------------------------------------------
596 * Constructed: ---------------------------------------------------
599 * The input string sI..eI is the input pattern. The string sC..EC is the
600 * constructed substitute parse string. The portions sC..tC and eC..EC are
601 * constructed by us. The portion tC..eC is an exact duplicate of the input
602 * pattern tI..eI. In the diagram, these are vertically aligned. Suppose that
603 * while parsing, we find an error at xC. We want to display a message showing
604 * the real input string. Thus we need to find the point xI in it which
605 * corresponds to xC. xC >= tC, since the portion of the string sC..tC has
606 * been constructed by us, and so shouldn't have errors. We get:
608 * xI = sI + (tI - sI) + (xC - tC)
610 * and, the offset into sI is:
612 * (xI - sI) = (tI - sI) + (xC - tC)
614 * When the substitute is constructed, we save (tI -sI) as RExC_precomp_adj,
615 * and we save tC as RExC_adjusted_start.
617 * During normal processing of the input pattern, everything points to that,
618 * with RExC_precomp_adj set to 0, and RExC_adjusted_start set to sI.
621 #define tI_sI RExC_precomp_adj
622 #define tC RExC_adjusted_start
623 #define sC RExC_precomp
624 #define xI_offset(xC) ((IV) (tI_sI + (xC - tC)))
625 #define xI(xC) (sC + xI_offset(xC))
626 #define eC RExC_precomp_end
628 #define REPORT_LOCATION_ARGS(xC) \
630 (xI(xC) > eC) /* Don't run off end */ \
631 ? eC - sC /* Length before the <--HERE */ \
632 : ( __ASSERT_(xI_offset(xC) >= 0) xI_offset(xC) ), \
633 sC), /* The input pattern printed up to the <--HERE */ \
635 (xI(xC) > eC) ? 0 : eC - xI(xC), /* Length after <--HERE */ \
636 (xI(xC) > eC) ? eC : xI(xC)) /* pattern after <--HERE */
638 /* Used to point after bad bytes for an error message, but avoid skipping
639 * past a nul byte. */
640 #define SKIP_IF_CHAR(s) (!*(s) ? 0 : UTF ? UTF8SKIP(s) : 1)
643 * Calls SAVEDESTRUCTOR_X if needed, then calls Perl_croak with the given
644 * arg. Show regex, up to a maximum length. If it's too long, chop and add
647 #define _FAIL(code) STMT_START { \
648 const char *ellipses = ""; \
649 IV len = RExC_precomp_end - RExC_precomp; \
652 SAVEFREESV(RExC_rx_sv); \
653 if (len > RegexLengthToShowInErrorMessages) { \
654 /* chop 10 shorter than the max, to ensure meaning of "..." */ \
655 len = RegexLengthToShowInErrorMessages - 10; \
661 #define FAIL(msg) _FAIL( \
662 Perl_croak(aTHX_ "%s in regex m/%" UTF8f "%s/", \
663 msg, UTF8fARG(UTF, len, RExC_precomp), ellipses))
665 #define FAIL2(msg,arg) _FAIL( \
666 Perl_croak(aTHX_ msg " in regex m/%" UTF8f "%s/", \
667 arg, UTF8fARG(UTF, len, RExC_precomp), ellipses))
670 * Simple_vFAIL -- like FAIL, but marks the current location in the scan
672 #define Simple_vFAIL(m) STMT_START { \
673 Perl_croak(aTHX_ "%s" REPORT_LOCATION, \
674 m, REPORT_LOCATION_ARGS(RExC_parse)); \
678 * Calls SAVEDESTRUCTOR_X if needed, then Simple_vFAIL()
680 #define vFAIL(m) STMT_START { \
682 SAVEFREESV(RExC_rx_sv); \
687 * Like Simple_vFAIL(), but accepts two arguments.
689 #define Simple_vFAIL2(m,a1) STMT_START { \
690 S_re_croak2(aTHX_ UTF, m, REPORT_LOCATION, a1, \
691 REPORT_LOCATION_ARGS(RExC_parse)); \
695 * Calls SAVEDESTRUCTOR_X if needed, then Simple_vFAIL2().
697 #define vFAIL2(m,a1) STMT_START { \
699 SAVEFREESV(RExC_rx_sv); \
700 Simple_vFAIL2(m, a1); \
705 * Like Simple_vFAIL(), but accepts three arguments.
707 #define Simple_vFAIL3(m, a1, a2) STMT_START { \
708 S_re_croak2(aTHX_ UTF, m, REPORT_LOCATION, a1, a2, \
709 REPORT_LOCATION_ARGS(RExC_parse)); \
713 * Calls SAVEDESTRUCTOR_X if needed, then Simple_vFAIL3().
715 #define vFAIL3(m,a1,a2) STMT_START { \
717 SAVEFREESV(RExC_rx_sv); \
718 Simple_vFAIL3(m, a1, a2); \
722 * Like Simple_vFAIL(), but accepts four arguments.
724 #define Simple_vFAIL4(m, a1, a2, a3) STMT_START { \
725 S_re_croak2(aTHX_ UTF, m, REPORT_LOCATION, a1, a2, a3, \
726 REPORT_LOCATION_ARGS(RExC_parse)); \
729 #define vFAIL4(m,a1,a2,a3) STMT_START { \
731 SAVEFREESV(RExC_rx_sv); \
732 Simple_vFAIL4(m, a1, a2, a3); \
735 /* A specialized version of vFAIL2 that works with UTF8f */
736 #define vFAIL2utf8f(m, a1) STMT_START { \
738 SAVEFREESV(RExC_rx_sv); \
739 S_re_croak2(aTHX_ UTF, m, REPORT_LOCATION, a1, \
740 REPORT_LOCATION_ARGS(RExC_parse)); \
743 #define vFAIL3utf8f(m, a1, a2) STMT_START { \
745 SAVEFREESV(RExC_rx_sv); \
746 S_re_croak2(aTHX_ UTF, m, REPORT_LOCATION, a1, a2, \
747 REPORT_LOCATION_ARGS(RExC_parse)); \
750 /* These have asserts in them because of [perl #122671] Many warnings in
751 * regcomp.c can occur twice. If they get output in pass1 and later in that
752 * pass, the pattern has to be converted to UTF-8 and the pass restarted, they
753 * would get output again. So they should be output in pass2, and these
754 * asserts make sure new warnings follow that paradigm. */
756 /* m is not necessarily a "literal string", in this macro */
757 #define reg_warn_non_literal_string(loc, m) STMT_START { \
758 __ASSERT_(PASS2) Perl_warner(aTHX_ packWARN(WARN_REGEXP), \
759 "%s" REPORT_LOCATION, \
760 m, REPORT_LOCATION_ARGS(loc)); \
763 #define ckWARNreg(loc,m) STMT_START { \
764 __ASSERT_(PASS2) Perl_ck_warner(aTHX_ packWARN(WARN_REGEXP), \
766 REPORT_LOCATION_ARGS(loc)); \
769 #define vWARN(loc, m) STMT_START { \
770 __ASSERT_(PASS2) Perl_warner(aTHX_ packWARN(WARN_REGEXP), \
772 REPORT_LOCATION_ARGS(loc)); \
775 #define vWARN_dep(loc, m) STMT_START { \
776 __ASSERT_(PASS2) Perl_warner(aTHX_ packWARN(WARN_DEPRECATED), \
778 REPORT_LOCATION_ARGS(loc)); \
781 #define ckWARNdep(loc,m) STMT_START { \
782 __ASSERT_(PASS2) Perl_ck_warner_d(aTHX_ packWARN(WARN_DEPRECATED), \
784 REPORT_LOCATION_ARGS(loc)); \
787 #define ckWARNregdep(loc,m) STMT_START { \
788 __ASSERT_(PASS2) Perl_ck_warner_d(aTHX_ packWARN2(WARN_DEPRECATED, \
791 REPORT_LOCATION_ARGS(loc)); \
794 #define ckWARN2reg_d(loc,m, a1) STMT_START { \
795 __ASSERT_(PASS2) Perl_ck_warner_d(aTHX_ packWARN(WARN_REGEXP), \
797 a1, REPORT_LOCATION_ARGS(loc)); \
800 #define ckWARN2reg(loc, m, a1) STMT_START { \
801 __ASSERT_(PASS2) Perl_ck_warner(aTHX_ packWARN(WARN_REGEXP), \
803 a1, REPORT_LOCATION_ARGS(loc)); \
806 #define vWARN3(loc, m, a1, a2) STMT_START { \
807 __ASSERT_(PASS2) Perl_warner(aTHX_ packWARN(WARN_REGEXP), \
809 a1, a2, REPORT_LOCATION_ARGS(loc)); \
812 #define ckWARN3reg(loc, m, a1, a2) STMT_START { \
813 __ASSERT_(PASS2) Perl_ck_warner(aTHX_ packWARN(WARN_REGEXP), \
816 REPORT_LOCATION_ARGS(loc)); \
819 #define vWARN4(loc, m, a1, a2, a3) STMT_START { \
820 __ASSERT_(PASS2) Perl_warner(aTHX_ packWARN(WARN_REGEXP), \
823 REPORT_LOCATION_ARGS(loc)); \
826 #define ckWARN4reg(loc, m, a1, a2, a3) STMT_START { \
827 __ASSERT_(PASS2) Perl_ck_warner(aTHX_ packWARN(WARN_REGEXP), \
830 REPORT_LOCATION_ARGS(loc)); \
833 #define vWARN5(loc, m, a1, a2, a3, a4) STMT_START { \
834 __ASSERT_(PASS2) Perl_warner(aTHX_ packWARN(WARN_REGEXP), \
837 REPORT_LOCATION_ARGS(loc)); \
840 /* Macros for recording node offsets. 20001227 mjd@plover.com
841 * Nodes are numbered 1, 2, 3, 4. Node #n's position is recorded in
842 * element 2*n-1 of the array. Element #2n holds the byte length node #n.
843 * Element 0 holds the number n.
844 * Position is 1 indexed.
846 #ifndef RE_TRACK_PATTERN_OFFSETS
847 #define Set_Node_Offset_To_R(node,byte)
848 #define Set_Node_Offset(node,byte)
849 #define Set_Cur_Node_Offset
850 #define Set_Node_Length_To_R(node,len)
851 #define Set_Node_Length(node,len)
852 #define Set_Node_Cur_Length(node,start)
853 #define Node_Offset(n)
854 #define Node_Length(n)
855 #define Set_Node_Offset_Length(node,offset,len)
856 #define ProgLen(ri) ri->u.proglen
857 #define SetProgLen(ri,x) ri->u.proglen = x
859 #define ProgLen(ri) ri->u.offsets[0]
860 #define SetProgLen(ri,x) ri->u.offsets[0] = x
861 #define Set_Node_Offset_To_R(node,byte) STMT_START { \
863 MJD_OFFSET_DEBUG(("** (%d) offset of node %d is %d.\n", \
864 __LINE__, (int)(node), (int)(byte))); \
866 Perl_croak(aTHX_ "value of node is %d in Offset macro", \
869 RExC_offsets[2*(node)-1] = (byte); \
874 #define Set_Node_Offset(node,byte) \
875 Set_Node_Offset_To_R((node)-RExC_emit_start, (byte)-RExC_start)
876 #define Set_Cur_Node_Offset Set_Node_Offset(RExC_emit, RExC_parse)
878 #define Set_Node_Length_To_R(node,len) STMT_START { \
880 MJD_OFFSET_DEBUG(("** (%d) size of node %d is %d.\n", \
881 __LINE__, (int)(node), (int)(len))); \
883 Perl_croak(aTHX_ "value of node is %d in Length macro", \
886 RExC_offsets[2*(node)] = (len); \
891 #define Set_Node_Length(node,len) \
892 Set_Node_Length_To_R((node)-RExC_emit_start, len)
893 #define Set_Node_Cur_Length(node, start) \
894 Set_Node_Length(node, RExC_parse - start)
896 /* Get offsets and lengths */
897 #define Node_Offset(n) (RExC_offsets[2*((n)-RExC_emit_start)-1])
898 #define Node_Length(n) (RExC_offsets[2*((n)-RExC_emit_start)])
900 #define Set_Node_Offset_Length(node,offset,len) STMT_START { \
901 Set_Node_Offset_To_R((node)-RExC_emit_start, (offset)); \
902 Set_Node_Length_To_R((node)-RExC_emit_start, (len)); \
906 #if PERL_ENABLE_EXPERIMENTAL_REGEX_OPTIMISATIONS
907 #define EXPERIMENTAL_INPLACESCAN
908 #endif /*PERL_ENABLE_EXPERIMENTAL_REGEX_OPTIMISATIONS*/
912 Perl_re_printf(pTHX_ const char *fmt, ...)
916 PerlIO *f= Perl_debug_log;
917 PERL_ARGS_ASSERT_RE_PRINTF;
919 result = PerlIO_vprintf(f, fmt, ap);
925 Perl_re_indentf(pTHX_ const char *fmt, U32 depth, ...)
929 PerlIO *f= Perl_debug_log;
930 PERL_ARGS_ASSERT_RE_INDENTF;
932 PerlIO_printf(f, "%*s", ( (int)depth % 20 ) * 2, "");
933 result = PerlIO_vprintf(f, fmt, ap);
937 #endif /* DEBUGGING */
939 #define DEBUG_RExC_seen() \
940 DEBUG_OPTIMISE_MORE_r({ \
941 Perl_re_printf( aTHX_ "RExC_seen: "); \
943 if (RExC_seen & REG_ZERO_LEN_SEEN) \
944 Perl_re_printf( aTHX_ "REG_ZERO_LEN_SEEN "); \
946 if (RExC_seen & REG_LOOKBEHIND_SEEN) \
947 Perl_re_printf( aTHX_ "REG_LOOKBEHIND_SEEN "); \
949 if (RExC_seen & REG_GPOS_SEEN) \
950 Perl_re_printf( aTHX_ "REG_GPOS_SEEN "); \
952 if (RExC_seen & REG_RECURSE_SEEN) \
953 Perl_re_printf( aTHX_ "REG_RECURSE_SEEN "); \
955 if (RExC_seen & REG_TOP_LEVEL_BRANCHES_SEEN) \
956 Perl_re_printf( aTHX_ "REG_TOP_LEVEL_BRANCHES_SEEN "); \
958 if (RExC_seen & REG_VERBARG_SEEN) \
959 Perl_re_printf( aTHX_ "REG_VERBARG_SEEN "); \
961 if (RExC_seen & REG_CUTGROUP_SEEN) \
962 Perl_re_printf( aTHX_ "REG_CUTGROUP_SEEN "); \
964 if (RExC_seen & REG_RUN_ON_COMMENT_SEEN) \
965 Perl_re_printf( aTHX_ "REG_RUN_ON_COMMENT_SEEN "); \
967 if (RExC_seen & REG_UNFOLDED_MULTI_SEEN) \
968 Perl_re_printf( aTHX_ "REG_UNFOLDED_MULTI_SEEN "); \
970 if (RExC_seen & REG_UNBOUNDED_QUANTIFIER_SEEN) \
971 Perl_re_printf( aTHX_ "REG_UNBOUNDED_QUANTIFIER_SEEN "); \
973 Perl_re_printf( aTHX_ "\n"); \
976 #define DEBUG_SHOW_STUDY_FLAG(flags,flag) \
977 if ((flags) & flag) Perl_re_printf( aTHX_ "%s ", #flag)
982 S_debug_show_study_flags(pTHX_ U32 flags, const char *open_str,
983 const char *close_str)
988 Perl_re_printf( aTHX_ "%s", open_str);
989 DEBUG_SHOW_STUDY_FLAG(flags, SF_BEFORE_SEOL);
990 DEBUG_SHOW_STUDY_FLAG(flags, SF_BEFORE_MEOL);
991 DEBUG_SHOW_STUDY_FLAG(flags, SF_IS_INF);
992 DEBUG_SHOW_STUDY_FLAG(flags, SF_HAS_PAR);
993 DEBUG_SHOW_STUDY_FLAG(flags, SF_IN_PAR);
994 DEBUG_SHOW_STUDY_FLAG(flags, SF_HAS_EVAL);
995 DEBUG_SHOW_STUDY_FLAG(flags, SCF_DO_SUBSTR);
996 DEBUG_SHOW_STUDY_FLAG(flags, SCF_DO_STCLASS_AND);
997 DEBUG_SHOW_STUDY_FLAG(flags, SCF_DO_STCLASS_OR);
998 DEBUG_SHOW_STUDY_FLAG(flags, SCF_DO_STCLASS);
999 DEBUG_SHOW_STUDY_FLAG(flags, SCF_WHILEM_VISITED_POS);
1000 DEBUG_SHOW_STUDY_FLAG(flags, SCF_TRIE_RESTUDY);
1001 DEBUG_SHOW_STUDY_FLAG(flags, SCF_SEEN_ACCEPT);
1002 DEBUG_SHOW_STUDY_FLAG(flags, SCF_TRIE_DOING_RESTUDY);
1003 DEBUG_SHOW_STUDY_FLAG(flags, SCF_IN_DEFINE);
1004 Perl_re_printf( aTHX_ "%s", close_str);
1009 S_debug_studydata(pTHX_ const char *where, scan_data_t *data,
1010 U32 depth, int is_inf)
1012 GET_RE_DEBUG_FLAGS_DECL;
1014 DEBUG_OPTIMISE_MORE_r({
1017 Perl_re_indentf(aTHX_ "%s: Pos:%" IVdf "/%" IVdf " Flags: 0x%" UVXf,
1021 (IV)data->pos_delta,
1025 S_debug_show_study_flags(aTHX_ data->flags," [","]");
1027 Perl_re_printf( aTHX_
1028 " Whilem_c: %" IVdf " Lcp: %" IVdf " %s",
1030 (IV)(data->last_closep ? *((data)->last_closep) : -1),
1031 is_inf ? "INF " : ""
1034 if (data->last_found) {
1036 Perl_re_printf(aTHX_
1037 "Last:'%s' %" IVdf ":%" IVdf "/%" IVdf,
1038 SvPVX_const(data->last_found),
1040 (IV)data->last_start_min,
1041 (IV)data->last_start_max
1044 for (i = 0; i < 2; i++) {
1045 Perl_re_printf(aTHX_
1046 " %s%s: '%s' @ %" IVdf "/%" IVdf,
1047 data->cur_is_floating == i ? "*" : "",
1048 i ? "Float" : "Fixed",
1049 SvPVX_const(data->substrs[i].str),
1050 (IV)data->substrs[i].min_offset,
1051 (IV)data->substrs[i].max_offset
1053 S_debug_show_study_flags(aTHX_ data->substrs[i].flags," [","]");
1057 Perl_re_printf( aTHX_ "\n");
1063 S_debug_peep(pTHX_ const char *str, const RExC_state_t *pRExC_state,
1064 regnode *scan, U32 depth, U32 flags)
1066 GET_RE_DEBUG_FLAGS_DECL;
1073 Next = regnext(scan);
1074 regprop(RExC_rx, RExC_mysv, scan, NULL, pRExC_state);
1075 Perl_re_indentf( aTHX_ "%s>%3d: %s (%d)",
1078 REG_NODE_NUM(scan), SvPV_nolen_const(RExC_mysv),
1079 Next ? (REG_NODE_NUM(Next)) : 0 );
1080 S_debug_show_study_flags(aTHX_ flags," [ ","]");
1081 Perl_re_printf( aTHX_ "\n");
1086 # define DEBUG_STUDYDATA(where, data, depth, is_inf) \
1087 S_debug_studydata(aTHX_ where, data, depth, is_inf)
1089 # define DEBUG_PEEP(str, scan, depth, flags) \
1090 S_debug_peep(aTHX_ str, pRExC_state, scan, depth, flags)
1093 # define DEBUG_STUDYDATA(where, data, depth, is_inf) NOOP
1094 # define DEBUG_PEEP(str, scan, depth, flags) NOOP
1098 /* =========================================================
1099 * BEGIN edit_distance stuff.
1101 * This calculates how many single character changes of any type are needed to
1102 * transform a string into another one. It is taken from version 3.1 of
1104 * https://metacpan.org/pod/Text::Levenshtein::Damerau::XS
1107 /* Our unsorted dictionary linked list. */
1108 /* Note we use UVs, not chars. */
1113 struct dictionary* next;
1115 typedef struct dictionary item;
1118 PERL_STATIC_INLINE item*
1119 push(UV key,item* curr)
1122 Newx(head, 1, item);
1130 PERL_STATIC_INLINE item*
1131 find(item* head, UV key)
1133 item* iterator = head;
1135 if (iterator->key == key){
1138 iterator = iterator->next;
1144 PERL_STATIC_INLINE item*
1145 uniquePush(item* head,UV key)
1147 item* iterator = head;
1150 if (iterator->key == key) {
1153 iterator = iterator->next;
1156 return push(key,head);
1159 PERL_STATIC_INLINE void
1160 dict_free(item* head)
1162 item* iterator = head;
1165 item* temp = iterator;
1166 iterator = iterator->next;
1173 /* End of Dictionary Stuff */
1175 /* All calculations/work are done here */
1177 S_edit_distance(const UV* src,
1179 const STRLEN x, /* length of src[] */
1180 const STRLEN y, /* length of tgt[] */
1181 const SSize_t maxDistance
1185 UV swapCount,swapScore,targetCharCount,i,j;
1187 UV score_ceil = x + y;
1189 PERL_ARGS_ASSERT_EDIT_DISTANCE;
1191 /* intialize matrix start values */
1192 Newx(scores, ( (x + 2) * (y + 2)), UV);
1193 scores[0] = score_ceil;
1194 scores[1 * (y + 2) + 0] = score_ceil;
1195 scores[0 * (y + 2) + 1] = score_ceil;
1196 scores[1 * (y + 2) + 1] = 0;
1197 head = uniquePush(uniquePush(head,src[0]),tgt[0]);
1202 for (i=1;i<=x;i++) {
1204 head = uniquePush(head,src[i]);
1205 scores[(i+1) * (y + 2) + 1] = i;
1206 scores[(i+1) * (y + 2) + 0] = score_ceil;
1209 for (j=1;j<=y;j++) {
1212 head = uniquePush(head,tgt[j]);
1213 scores[1 * (y + 2) + (j + 1)] = j;
1214 scores[0 * (y + 2) + (j + 1)] = score_ceil;
1217 targetCharCount = find(head,tgt[j-1])->value;
1218 swapScore = scores[targetCharCount * (y + 2) + swapCount] + i - targetCharCount - 1 + j - swapCount;
1220 if (src[i-1] != tgt[j-1]){
1221 scores[(i+1) * (y + 2) + (j + 1)] = MIN(swapScore,(MIN(scores[i * (y + 2) + j], MIN(scores[(i+1) * (y + 2) + j], scores[i * (y + 2) + (j + 1)])) + 1));
1225 scores[(i+1) * (y + 2) + (j + 1)] = MIN(scores[i * (y + 2) + j], swapScore);
1229 find(head,src[i-1])->value = i;
1233 IV score = scores[(x+1) * (y + 2) + (y + 1)];
1236 return (maxDistance != 0 && maxDistance < score)?(-1):score;
1240 /* END of edit_distance() stuff
1241 * ========================================================= */
1243 /* is c a control character for which we have a mnemonic? */
1244 #define isMNEMONIC_CNTRL(c) _IS_MNEMONIC_CNTRL_ONLY_FOR_USE_BY_REGCOMP_DOT_C(c)
1247 S_cntrl_to_mnemonic(const U8 c)
1249 /* Returns the mnemonic string that represents character 'c', if one
1250 * exists; NULL otherwise. The only ones that exist for the purposes of
1251 * this routine are a few control characters */
1254 case '\a': return "\\a";
1255 case '\b': return "\\b";
1256 case ESC_NATIVE: return "\\e";
1257 case '\f': return "\\f";
1258 case '\n': return "\\n";
1259 case '\r': return "\\r";
1260 case '\t': return "\\t";
1266 /* Mark that we cannot extend a found fixed substring at this point.
1267 Update the longest found anchored substring or the longest found
1268 floating substrings if needed. */
1271 S_scan_commit(pTHX_ const RExC_state_t *pRExC_state, scan_data_t *data,
1272 SSize_t *minlenp, int is_inf)
1274 const STRLEN l = CHR_SVLEN(data->last_found);
1275 SV * const longest_sv = data->substrs[data->cur_is_floating].str;
1276 const STRLEN old_l = CHR_SVLEN(longest_sv);
1277 GET_RE_DEBUG_FLAGS_DECL;
1279 PERL_ARGS_ASSERT_SCAN_COMMIT;
1281 if ((l >= old_l) && ((l > old_l) || (data->flags & SF_BEFORE_EOL))) {
1282 const U8 i = data->cur_is_floating;
1283 SvSetMagicSV(longest_sv, data->last_found);
1284 data->substrs[i].min_offset = l ? data->last_start_min : data->pos_min;
1287 data->substrs[0].max_offset = data->substrs[0].min_offset;
1289 data->substrs[1].max_offset = (l
1290 ? data->last_start_max
1291 : (data->pos_delta > SSize_t_MAX - data->pos_min
1293 : data->pos_min + data->pos_delta));
1295 || (STRLEN)data->substrs[1].max_offset > (STRLEN)SSize_t_MAX)
1296 data->substrs[1].max_offset = SSize_t_MAX;
1299 if (data->flags & SF_BEFORE_EOL)
1300 data->substrs[i].flags |= (data->flags & SF_BEFORE_EOL);
1302 data->substrs[i].flags &= ~SF_BEFORE_EOL;
1303 data->substrs[i].minlenp = minlenp;
1304 data->substrs[i].lookbehind = 0;
1307 SvCUR_set(data->last_found, 0);
1309 SV * const sv = data->last_found;
1310 if (SvUTF8(sv) && SvMAGICAL(sv)) {
1311 MAGIC * const mg = mg_find(sv, PERL_MAGIC_utf8);
1316 data->last_end = -1;
1317 data->flags &= ~SF_BEFORE_EOL;
1318 DEBUG_STUDYDATA("commit", data, 0, is_inf);
1321 /* An SSC is just a regnode_charclass_posix with an extra field: the inversion
1322 * list that describes which code points it matches */
1325 S_ssc_anything(pTHX_ regnode_ssc *ssc)
1327 /* Set the SSC 'ssc' to match an empty string or any code point */
1329 PERL_ARGS_ASSERT_SSC_ANYTHING;
1331 assert(is_ANYOF_SYNTHETIC(ssc));
1333 /* mortalize so won't leak */
1334 ssc->invlist = sv_2mortal(_add_range_to_invlist(NULL, 0, UV_MAX));
1335 ANYOF_FLAGS(ssc) |= SSC_MATCHES_EMPTY_STRING; /* Plus matches empty */
1339 S_ssc_is_anything(const regnode_ssc *ssc)
1341 /* Returns TRUE if the SSC 'ssc' can match the empty string and any code
1342 * point; FALSE otherwise. Thus, this is used to see if using 'ssc' buys
1343 * us anything: if the function returns TRUE, 'ssc' hasn't been restricted
1344 * in any way, so there's no point in using it */
1349 PERL_ARGS_ASSERT_SSC_IS_ANYTHING;
1351 assert(is_ANYOF_SYNTHETIC(ssc));
1353 if (! (ANYOF_FLAGS(ssc) & SSC_MATCHES_EMPTY_STRING)) {
1357 /* See if the list consists solely of the range 0 - Infinity */
1358 invlist_iterinit(ssc->invlist);
1359 ret = invlist_iternext(ssc->invlist, &start, &end)
1363 invlist_iterfinish(ssc->invlist);
1369 /* If e.g., both \w and \W are set, matches everything */
1370 if (ANYOF_POSIXL_SSC_TEST_ANY_SET(ssc)) {
1372 for (i = 0; i < ANYOF_POSIXL_MAX; i += 2) {
1373 if (ANYOF_POSIXL_TEST(ssc, i) && ANYOF_POSIXL_TEST(ssc, i+1)) {
1383 S_ssc_init(pTHX_ const RExC_state_t *pRExC_state, regnode_ssc *ssc)
1385 /* Initializes the SSC 'ssc'. This includes setting it to match an empty
1386 * string, any code point, or any posix class under locale */
1388 PERL_ARGS_ASSERT_SSC_INIT;
1390 Zero(ssc, 1, regnode_ssc);
1391 set_ANYOF_SYNTHETIC(ssc);
1392 ARG_SET(ssc, ANYOF_ONLY_HAS_BITMAP);
1395 /* If any portion of the regex is to operate under locale rules that aren't
1396 * fully known at compile time, initialization includes it. The reason
1397 * this isn't done for all regexes is that the optimizer was written under
1398 * the assumption that locale was all-or-nothing. Given the complexity and
1399 * lack of documentation in the optimizer, and that there are inadequate
1400 * test cases for locale, many parts of it may not work properly, it is
1401 * safest to avoid locale unless necessary. */
1402 if (RExC_contains_locale) {
1403 ANYOF_POSIXL_SETALL(ssc);
1406 ANYOF_POSIXL_ZERO(ssc);
1411 S_ssc_is_cp_posixl_init(const RExC_state_t *pRExC_state,
1412 const regnode_ssc *ssc)
1414 /* Returns TRUE if the SSC 'ssc' is in its initial state with regard only
1415 * to the list of code points matched, and locale posix classes; hence does
1416 * not check its flags) */
1421 PERL_ARGS_ASSERT_SSC_IS_CP_POSIXL_INIT;
1423 assert(is_ANYOF_SYNTHETIC(ssc));
1425 invlist_iterinit(ssc->invlist);
1426 ret = invlist_iternext(ssc->invlist, &start, &end)
1430 invlist_iterfinish(ssc->invlist);
1436 if (RExC_contains_locale && ! ANYOF_POSIXL_SSC_TEST_ALL_SET(ssc)) {
1444 S_get_ANYOF_cp_list_for_ssc(pTHX_ const RExC_state_t *pRExC_state,
1445 const regnode_charclass* const node)
1447 /* Returns a mortal inversion list defining which code points are matched
1448 * by 'node', which is of type ANYOF. Handles complementing the result if
1449 * appropriate. If some code points aren't knowable at this time, the
1450 * returned list must, and will, contain every code point that is a
1454 SV* only_utf8_locale_invlist = NULL;
1456 const U32 n = ARG(node);
1457 bool new_node_has_latin1 = FALSE;
1459 PERL_ARGS_ASSERT_GET_ANYOF_CP_LIST_FOR_SSC;
1461 /* Look at the data structure created by S_set_ANYOF_arg() */
1462 if (n != ANYOF_ONLY_HAS_BITMAP) {
1463 SV * const rv = MUTABLE_SV(RExC_rxi->data->data[n]);
1464 AV * const av = MUTABLE_AV(SvRV(rv));
1465 SV **const ary = AvARRAY(av);
1466 assert(RExC_rxi->data->what[n] == 's');
1468 if (ary[1] && ary[1] != &PL_sv_undef) { /* Has compile-time swash */
1469 invlist = sv_2mortal(invlist_clone(_get_swash_invlist(ary[1])));
1471 else if (ary[0] && ary[0] != &PL_sv_undef) {
1473 /* Here, no compile-time swash, and there are things that won't be
1474 * known until runtime -- we have to assume it could be anything */
1475 invlist = sv_2mortal(_new_invlist(1));
1476 return _add_range_to_invlist(invlist, 0, UV_MAX);
1478 else if (ary[3] && ary[3] != &PL_sv_undef) {
1480 /* Here no compile-time swash, and no run-time only data. Use the
1481 * node's inversion list */
1482 invlist = sv_2mortal(invlist_clone(ary[3]));
1485 /* Get the code points valid only under UTF-8 locales */
1486 if ((ANYOF_FLAGS(node) & ANYOFL_FOLD)
1487 && ary[2] && ary[2] != &PL_sv_undef)
1489 only_utf8_locale_invlist = ary[2];
1494 invlist = sv_2mortal(_new_invlist(0));
1497 /* An ANYOF node contains a bitmap for the first NUM_ANYOF_CODE_POINTS
1498 * code points, and an inversion list for the others, but if there are code
1499 * points that should match only conditionally on the target string being
1500 * UTF-8, those are placed in the inversion list, and not the bitmap.
1501 * Since there are circumstances under which they could match, they are
1502 * included in the SSC. But if the ANYOF node is to be inverted, we have
1503 * to exclude them here, so that when we invert below, the end result
1504 * actually does include them. (Think about "\xe0" =~ /[^\xc0]/di;). We
1505 * have to do this here before we add the unconditionally matched code
1507 if (ANYOF_FLAGS(node) & ANYOF_INVERT) {
1508 _invlist_intersection_complement_2nd(invlist,
1513 /* Add in the points from the bit map */
1514 for (i = 0; i < NUM_ANYOF_CODE_POINTS; i++) {
1515 if (ANYOF_BITMAP_TEST(node, i)) {
1516 unsigned int start = i++;
1518 for (; i < NUM_ANYOF_CODE_POINTS && ANYOF_BITMAP_TEST(node, i); ++i) {
1521 invlist = _add_range_to_invlist(invlist, start, i-1);
1522 new_node_has_latin1 = TRUE;
1526 /* If this can match all upper Latin1 code points, have to add them
1527 * as well. But don't add them if inverting, as when that gets done below,
1528 * it would exclude all these characters, including the ones it shouldn't
1529 * that were added just above */
1530 if (! (ANYOF_FLAGS(node) & ANYOF_INVERT) && OP(node) == ANYOFD
1531 && (ANYOF_FLAGS(node) & ANYOF_SHARED_d_MATCHES_ALL_NON_UTF8_NON_ASCII_non_d_WARN_SUPER))
1533 _invlist_union(invlist, PL_UpperLatin1, &invlist);
1536 /* Similarly for these */
1537 if (ANYOF_FLAGS(node) & ANYOF_MATCHES_ALL_ABOVE_BITMAP) {
1538 _invlist_union_complement_2nd(invlist, PL_InBitmap, &invlist);
1541 if (ANYOF_FLAGS(node) & ANYOF_INVERT) {
1542 _invlist_invert(invlist);
1544 else if (new_node_has_latin1 && ANYOF_FLAGS(node) & ANYOFL_FOLD) {
1546 /* Under /li, any 0-255 could fold to any other 0-255, depending on the
1547 * locale. We can skip this if there are no 0-255 at all. */
1548 _invlist_union(invlist, PL_Latin1, &invlist);
1551 /* Similarly add the UTF-8 locale possible matches. These have to be
1552 * deferred until after the non-UTF-8 locale ones are taken care of just
1553 * above, or it leads to wrong results under ANYOF_INVERT */
1554 if (only_utf8_locale_invlist) {
1555 _invlist_union_maybe_complement_2nd(invlist,
1556 only_utf8_locale_invlist,
1557 ANYOF_FLAGS(node) & ANYOF_INVERT,
1564 /* These two functions currently do the exact same thing */
1565 #define ssc_init_zero ssc_init
1567 #define ssc_add_cp(ssc, cp) ssc_add_range((ssc), (cp), (cp))
1568 #define ssc_match_all_cp(ssc) ssc_add_range(ssc, 0, UV_MAX)
1570 /* 'AND' a given class with another one. Can create false positives. 'ssc'
1571 * should not be inverted. 'and_with->flags & ANYOF_MATCHES_POSIXL' should be
1572 * 0 if 'and_with' is a regnode_charclass instead of a regnode_ssc. */
1575 S_ssc_and(pTHX_ const RExC_state_t *pRExC_state, regnode_ssc *ssc,
1576 const regnode_charclass *and_with)
1578 /* Accumulate into SSC 'ssc' its 'AND' with 'and_with', which is either
1579 * another SSC or a regular ANYOF class. Can create false positives. */
1584 PERL_ARGS_ASSERT_SSC_AND;
1586 assert(is_ANYOF_SYNTHETIC(ssc));
1588 /* 'and_with' is used as-is if it too is an SSC; otherwise have to extract
1589 * the code point inversion list and just the relevant flags */
1590 if (is_ANYOF_SYNTHETIC(and_with)) {
1591 anded_cp_list = ((regnode_ssc *)and_with)->invlist;
1592 anded_flags = ANYOF_FLAGS(and_with);
1594 /* XXX This is a kludge around what appears to be deficiencies in the
1595 * optimizer. If we make S_ssc_anything() add in the WARN_SUPER flag,
1596 * there are paths through the optimizer where it doesn't get weeded
1597 * out when it should. And if we don't make some extra provision for
1598 * it like the code just below, it doesn't get added when it should.
1599 * This solution is to add it only when AND'ing, which is here, and
1600 * only when what is being AND'ed is the pristine, original node
1601 * matching anything. Thus it is like adding it to ssc_anything() but
1602 * only when the result is to be AND'ed. Probably the same solution
1603 * could be adopted for the same problem we have with /l matching,
1604 * which is solved differently in S_ssc_init(), and that would lead to
1605 * fewer false positives than that solution has. But if this solution
1606 * creates bugs, the consequences are only that a warning isn't raised
1607 * that should be; while the consequences for having /l bugs is
1608 * incorrect matches */
1609 if (ssc_is_anything((regnode_ssc *)and_with)) {
1610 anded_flags |= ANYOF_SHARED_d_MATCHES_ALL_NON_UTF8_NON_ASCII_non_d_WARN_SUPER;
1614 anded_cp_list = get_ANYOF_cp_list_for_ssc(pRExC_state, and_with);
1615 if (OP(and_with) == ANYOFD) {
1616 anded_flags = ANYOF_FLAGS(and_with) & ANYOF_COMMON_FLAGS;
1619 anded_flags = ANYOF_FLAGS(and_with)
1620 &( ANYOF_COMMON_FLAGS
1621 |ANYOF_SHARED_d_MATCHES_ALL_NON_UTF8_NON_ASCII_non_d_WARN_SUPER
1622 |ANYOF_SHARED_d_UPPER_LATIN1_UTF8_STRING_MATCHES_non_d_RUNTIME_USER_PROP);
1623 if (ANYOFL_UTF8_LOCALE_REQD(ANYOF_FLAGS(and_with))) {
1625 ANYOFL_SHARED_UTF8_LOCALE_fold_HAS_MATCHES_nonfold_REQD;
1630 ANYOF_FLAGS(ssc) &= anded_flags;
1632 /* Below, C1 is the list of code points in 'ssc'; P1, its posix classes.
1633 * C2 is the list of code points in 'and-with'; P2, its posix classes.
1634 * 'and_with' may be inverted. When not inverted, we have the situation of
1636 * (C1 | P1) & (C2 | P2)
1637 * = (C1 & (C2 | P2)) | (P1 & (C2 | P2))
1638 * = ((C1 & C2) | (C1 & P2)) | ((P1 & C2) | (P1 & P2))
1639 * <= ((C1 & C2) | P2)) | ( P1 | (P1 & P2))
1640 * <= ((C1 & C2) | P1 | P2)
1641 * Alternatively, the last few steps could be:
1642 * = ((C1 & C2) | (C1 & P2)) | ((P1 & C2) | (P1 & P2))
1643 * <= ((C1 & C2) | C1 ) | ( C2 | (P1 & P2))
1644 * <= (C1 | C2 | (P1 & P2))
1645 * We favor the second approach if either P1 or P2 is non-empty. This is
1646 * because these components are a barrier to doing optimizations, as what
1647 * they match cannot be known until the moment of matching as they are
1648 * dependent on the current locale, 'AND"ing them likely will reduce or
1650 * But we can do better if we know that C1,P1 are in their initial state (a
1651 * frequent occurrence), each matching everything:
1652 * (<everything>) & (C2 | P2) = C2 | P2
1653 * Similarly, if C2,P2 are in their initial state (again a frequent
1654 * occurrence), the result is a no-op
1655 * (C1 | P1) & (<everything>) = C1 | P1
1658 * (C1 | P1) & ~(C2 | P2) = (C1 | P1) & (~C2 & ~P2)
1659 * = (C1 & (~C2 & ~P2)) | (P1 & (~C2 & ~P2))
1660 * <= (C1 & ~C2) | (P1 & ~P2)
1663 if ((ANYOF_FLAGS(and_with) & ANYOF_INVERT)
1664 && ! is_ANYOF_SYNTHETIC(and_with))
1668 ssc_intersection(ssc,
1670 FALSE /* Has already been inverted */
1673 /* If either P1 or P2 is empty, the intersection will be also; can skip
1675 if (! (ANYOF_FLAGS(and_with) & ANYOF_MATCHES_POSIXL)) {
1676 ANYOF_POSIXL_ZERO(ssc);
1678 else if (ANYOF_POSIXL_SSC_TEST_ANY_SET(ssc)) {
1680 /* Note that the Posix class component P from 'and_with' actually
1682 * P = Pa | Pb | ... | Pn
1683 * where each component is one posix class, such as in [\w\s].
1685 * ~P = ~(Pa | Pb | ... | Pn)
1686 * = ~Pa & ~Pb & ... & ~Pn
1687 * <= ~Pa | ~Pb | ... | ~Pn
1688 * The last is something we can easily calculate, but unfortunately
1689 * is likely to have many false positives. We could do better
1690 * in some (but certainly not all) instances if two classes in
1691 * P have known relationships. For example
1692 * :lower: <= :alpha: <= :alnum: <= \w <= :graph: <= :print:
1694 * :lower: & :print: = :lower:
1695 * And similarly for classes that must be disjoint. For example,
1696 * since \s and \w can have no elements in common based on rules in
1697 * the POSIX standard,
1698 * \w & ^\S = nothing
1699 * Unfortunately, some vendor locales do not meet the Posix
1700 * standard, in particular almost everything by Microsoft.
1701 * The loop below just changes e.g., \w into \W and vice versa */
1703 regnode_charclass_posixl temp;
1704 int add = 1; /* To calculate the index of the complement */
1706 Zero(&temp, 1, regnode_charclass_posixl);
1707 ANYOF_POSIXL_ZERO(&temp);
1708 for (i = 0; i < ANYOF_MAX; i++) {
1710 || ! ANYOF_POSIXL_TEST((regnode_charclass_posixl*) and_with, i)
1711 || ! ANYOF_POSIXL_TEST((regnode_charclass_posixl*) and_with, i + 1));
1713 if (ANYOF_POSIXL_TEST((regnode_charclass_posixl*) and_with, i)) {
1714 ANYOF_POSIXL_SET(&temp, i + add);
1716 add = 0 - add; /* 1 goes to -1; -1 goes to 1 */
1718 ANYOF_POSIXL_AND(&temp, ssc);
1720 } /* else ssc already has no posixes */
1721 } /* else: Not inverted. This routine is a no-op if 'and_with' is an SSC
1722 in its initial state */
1723 else if (! is_ANYOF_SYNTHETIC(and_with)
1724 || ! ssc_is_cp_posixl_init(pRExC_state, (regnode_ssc *)and_with))
1726 /* But if 'ssc' is in its initial state, the result is just 'and_with';
1727 * copy it over 'ssc' */
1728 if (ssc_is_cp_posixl_init(pRExC_state, ssc)) {
1729 if (is_ANYOF_SYNTHETIC(and_with)) {
1730 StructCopy(and_with, ssc, regnode_ssc);
1733 ssc->invlist = anded_cp_list;
1734 ANYOF_POSIXL_ZERO(ssc);
1735 if (ANYOF_FLAGS(and_with) & ANYOF_MATCHES_POSIXL) {
1736 ANYOF_POSIXL_OR((regnode_charclass_posixl*) and_with, ssc);
1740 else if (ANYOF_POSIXL_SSC_TEST_ANY_SET(ssc)
1741 || (ANYOF_FLAGS(and_with) & ANYOF_MATCHES_POSIXL))
1743 /* One or the other of P1, P2 is non-empty. */
1744 if (ANYOF_FLAGS(and_with) & ANYOF_MATCHES_POSIXL) {
1745 ANYOF_POSIXL_AND((regnode_charclass_posixl*) and_with, ssc);
1747 ssc_union(ssc, anded_cp_list, FALSE);
1749 else { /* P1 = P2 = empty */
1750 ssc_intersection(ssc, anded_cp_list, FALSE);
1756 S_ssc_or(pTHX_ const RExC_state_t *pRExC_state, regnode_ssc *ssc,
1757 const regnode_charclass *or_with)
1759 /* Accumulate into SSC 'ssc' its 'OR' with 'or_with', which is either
1760 * another SSC or a regular ANYOF class. Can create false positives if
1761 * 'or_with' is to be inverted. */
1766 PERL_ARGS_ASSERT_SSC_OR;
1768 assert(is_ANYOF_SYNTHETIC(ssc));
1770 /* 'or_with' is used as-is if it too is an SSC; otherwise have to extract
1771 * the code point inversion list and just the relevant flags */
1772 if (is_ANYOF_SYNTHETIC(or_with)) {
1773 ored_cp_list = ((regnode_ssc*) or_with)->invlist;
1774 ored_flags = ANYOF_FLAGS(or_with);
1777 ored_cp_list = get_ANYOF_cp_list_for_ssc(pRExC_state, or_with);
1778 ored_flags = ANYOF_FLAGS(or_with) & ANYOF_COMMON_FLAGS;
1779 if (OP(or_with) != ANYOFD) {
1781 |= ANYOF_FLAGS(or_with)
1782 & ( ANYOF_SHARED_d_MATCHES_ALL_NON_UTF8_NON_ASCII_non_d_WARN_SUPER
1783 |ANYOF_SHARED_d_UPPER_LATIN1_UTF8_STRING_MATCHES_non_d_RUNTIME_USER_PROP);
1784 if (ANYOFL_UTF8_LOCALE_REQD(ANYOF_FLAGS(or_with))) {
1786 ANYOFL_SHARED_UTF8_LOCALE_fold_HAS_MATCHES_nonfold_REQD;
1791 ANYOF_FLAGS(ssc) |= ored_flags;
1793 /* Below, C1 is the list of code points in 'ssc'; P1, its posix classes.
1794 * C2 is the list of code points in 'or-with'; P2, its posix classes.
1795 * 'or_with' may be inverted. When not inverted, we have the simple
1796 * situation of computing:
1797 * (C1 | P1) | (C2 | P2) = (C1 | C2) | (P1 | P2)
1798 * If P1|P2 yields a situation with both a class and its complement are
1799 * set, like having both \w and \W, this matches all code points, and we
1800 * can delete these from the P component of the ssc going forward. XXX We
1801 * might be able to delete all the P components, but I (khw) am not certain
1802 * about this, and it is better to be safe.
1805 * (C1 | P1) | ~(C2 | P2) = (C1 | P1) | (~C2 & ~P2)
1806 * <= (C1 | P1) | ~C2
1807 * <= (C1 | ~C2) | P1
1808 * (which results in actually simpler code than the non-inverted case)
1811 if ((ANYOF_FLAGS(or_with) & ANYOF_INVERT)
1812 && ! is_ANYOF_SYNTHETIC(or_with))
1814 /* We ignore P2, leaving P1 going forward */
1815 } /* else Not inverted */
1816 else if (ANYOF_FLAGS(or_with) & ANYOF_MATCHES_POSIXL) {
1817 ANYOF_POSIXL_OR((regnode_charclass_posixl*)or_with, ssc);
1818 if (ANYOF_POSIXL_SSC_TEST_ANY_SET(ssc)) {
1820 for (i = 0; i < ANYOF_MAX; i += 2) {
1821 if (ANYOF_POSIXL_TEST(ssc, i) && ANYOF_POSIXL_TEST(ssc, i + 1))
1823 ssc_match_all_cp(ssc);
1824 ANYOF_POSIXL_CLEAR(ssc, i);
1825 ANYOF_POSIXL_CLEAR(ssc, i+1);
1833 FALSE /* Already has been inverted */
1837 PERL_STATIC_INLINE void
1838 S_ssc_union(pTHX_ regnode_ssc *ssc, SV* const invlist, const bool invert2nd)
1840 PERL_ARGS_ASSERT_SSC_UNION;
1842 assert(is_ANYOF_SYNTHETIC(ssc));
1844 _invlist_union_maybe_complement_2nd(ssc->invlist,
1850 PERL_STATIC_INLINE void
1851 S_ssc_intersection(pTHX_ regnode_ssc *ssc,
1853 const bool invert2nd)
1855 PERL_ARGS_ASSERT_SSC_INTERSECTION;
1857 assert(is_ANYOF_SYNTHETIC(ssc));
1859 _invlist_intersection_maybe_complement_2nd(ssc->invlist,
1865 PERL_STATIC_INLINE void
1866 S_ssc_add_range(pTHX_ regnode_ssc *ssc, const UV start, const UV end)
1868 PERL_ARGS_ASSERT_SSC_ADD_RANGE;
1870 assert(is_ANYOF_SYNTHETIC(ssc));
1872 ssc->invlist = _add_range_to_invlist(ssc->invlist, start, end);
1875 PERL_STATIC_INLINE void
1876 S_ssc_cp_and(pTHX_ regnode_ssc *ssc, const UV cp)
1878 /* AND just the single code point 'cp' into the SSC 'ssc' */
1880 SV* cp_list = _new_invlist(2);
1882 PERL_ARGS_ASSERT_SSC_CP_AND;
1884 assert(is_ANYOF_SYNTHETIC(ssc));
1886 cp_list = add_cp_to_invlist(cp_list, cp);
1887 ssc_intersection(ssc, cp_list,
1888 FALSE /* Not inverted */
1890 SvREFCNT_dec_NN(cp_list);
1893 PERL_STATIC_INLINE void
1894 S_ssc_clear_locale(regnode_ssc *ssc)
1896 /* Set the SSC 'ssc' to not match any locale things */
1897 PERL_ARGS_ASSERT_SSC_CLEAR_LOCALE;
1899 assert(is_ANYOF_SYNTHETIC(ssc));
1901 ANYOF_POSIXL_ZERO(ssc);
1902 ANYOF_FLAGS(ssc) &= ~ANYOF_LOCALE_FLAGS;
1905 #define NON_OTHER_COUNT NON_OTHER_COUNT_FOR_USE_ONLY_BY_REGCOMP_DOT_C
1908 S_is_ssc_worth_it(const RExC_state_t * pRExC_state, const regnode_ssc * ssc)
1910 /* The synthetic start class is used to hopefully quickly winnow down
1911 * places where a pattern could start a match in the target string. If it
1912 * doesn't really narrow things down that much, there isn't much point to
1913 * having the overhead of using it. This function uses some very crude
1914 * heuristics to decide if to use the ssc or not.
1916 * It returns TRUE if 'ssc' rules out more than half what it considers to
1917 * be the "likely" possible matches, but of course it doesn't know what the
1918 * actual things being matched are going to be; these are only guesses
1920 * For /l matches, it assumes that the only likely matches are going to be
1921 * in the 0-255 range, uniformly distributed, so half of that is 127
1922 * For /a and /d matches, it assumes that the likely matches will be just
1923 * the ASCII range, so half of that is 63
1924 * For /u and there isn't anything matching above the Latin1 range, it
1925 * assumes that that is the only range likely to be matched, and uses
1926 * half that as the cut-off: 127. If anything matches above Latin1,
1927 * it assumes that all of Unicode could match (uniformly), except for
1928 * non-Unicode code points and things in the General Category "Other"
1929 * (unassigned, private use, surrogates, controls and formats). This
1930 * is a much large number. */
1932 U32 count = 0; /* Running total of number of code points matched by
1934 UV start, end; /* Start and end points of current range in inversion
1936 const U32 max_code_points = (LOC)
1938 : (( ! UNI_SEMANTICS
1939 || invlist_highest(ssc->invlist) < 256)
1942 const U32 max_match = max_code_points / 2;
1944 PERL_ARGS_ASSERT_IS_SSC_WORTH_IT;
1946 invlist_iterinit(ssc->invlist);
1947 while (invlist_iternext(ssc->invlist, &start, &end)) {
1948 if (start >= max_code_points) {
1951 end = MIN(end, max_code_points - 1);
1952 count += end - start + 1;
1953 if (count >= max_match) {
1954 invlist_iterfinish(ssc->invlist);
1964 S_ssc_finalize(pTHX_ RExC_state_t *pRExC_state, regnode_ssc *ssc)
1966 /* The inversion list in the SSC is marked mortal; now we need a more
1967 * permanent copy, which is stored the same way that is done in a regular
1968 * ANYOF node, with the first NUM_ANYOF_CODE_POINTS code points in a bit
1971 SV* invlist = invlist_clone(ssc->invlist);
1973 PERL_ARGS_ASSERT_SSC_FINALIZE;
1975 assert(is_ANYOF_SYNTHETIC(ssc));
1977 /* The code in this file assumes that all but these flags aren't relevant
1978 * to the SSC, except SSC_MATCHES_EMPTY_STRING, which should be cleared
1979 * by the time we reach here */
1980 assert(! (ANYOF_FLAGS(ssc)
1981 & ~( ANYOF_COMMON_FLAGS
1982 |ANYOF_SHARED_d_MATCHES_ALL_NON_UTF8_NON_ASCII_non_d_WARN_SUPER
1983 |ANYOF_SHARED_d_UPPER_LATIN1_UTF8_STRING_MATCHES_non_d_RUNTIME_USER_PROP)));
1985 populate_ANYOF_from_invlist( (regnode *) ssc, &invlist);
1987 set_ANYOF_arg(pRExC_state, (regnode *) ssc, invlist,
1988 NULL, NULL, NULL, FALSE);
1990 /* Make sure is clone-safe */
1991 ssc->invlist = NULL;
1993 if (ANYOF_POSIXL_SSC_TEST_ANY_SET(ssc)) {
1994 ANYOF_FLAGS(ssc) |= ANYOF_MATCHES_POSIXL;
1997 if (RExC_contains_locale) {
2001 assert(! (ANYOF_FLAGS(ssc) & ANYOF_LOCALE_FLAGS) || RExC_contains_locale);
2004 #define TRIE_LIST_ITEM(state,idx) (trie->states[state].trans.list)[ idx ]
2005 #define TRIE_LIST_CUR(state) ( TRIE_LIST_ITEM( state, 0 ).forid )
2006 #define TRIE_LIST_LEN(state) ( TRIE_LIST_ITEM( state, 0 ).newstate )
2007 #define TRIE_LIST_USED(idx) ( trie->states[state].trans.list \
2008 ? (TRIE_LIST_CUR( idx ) - 1) \
2014 dump_trie(trie,widecharmap,revcharmap)
2015 dump_trie_interim_list(trie,widecharmap,revcharmap,next_alloc)
2016 dump_trie_interim_table(trie,widecharmap,revcharmap,next_alloc)
2018 These routines dump out a trie in a somewhat readable format.
2019 The _interim_ variants are used for debugging the interim
2020 tables that are used to generate the final compressed
2021 representation which is what dump_trie expects.
2023 Part of the reason for their existence is to provide a form
2024 of documentation as to how the different representations function.
2029 Dumps the final compressed table form of the trie to Perl_debug_log.
2030 Used for debugging make_trie().
2034 S_dump_trie(pTHX_ const struct _reg_trie_data *trie, HV *widecharmap,
2035 AV *revcharmap, U32 depth)
2038 SV *sv=sv_newmortal();
2039 int colwidth= widecharmap ? 6 : 4;
2041 GET_RE_DEBUG_FLAGS_DECL;
2043 PERL_ARGS_ASSERT_DUMP_TRIE;
2045 Perl_re_indentf( aTHX_ "Char : %-6s%-6s%-4s ",
2046 depth+1, "Match","Base","Ofs" );
2048 for( state = 0 ; state < trie->uniquecharcount ; state++ ) {
2049 SV ** const tmp = av_fetch( revcharmap, state, 0);
2051 Perl_re_printf( aTHX_ "%*s",
2053 pv_pretty(sv, SvPV_nolen_const(*tmp), SvCUR(*tmp), colwidth,
2054 PL_colors[0], PL_colors[1],
2055 (SvUTF8(*tmp) ? PERL_PV_ESCAPE_UNI : 0) |
2056 PERL_PV_ESCAPE_FIRSTCHAR
2061 Perl_re_printf( aTHX_ "\n");
2062 Perl_re_indentf( aTHX_ "State|-----------------------", depth+1);
2064 for( state = 0 ; state < trie->uniquecharcount ; state++ )
2065 Perl_re_printf( aTHX_ "%.*s", colwidth, "--------");
2066 Perl_re_printf( aTHX_ "\n");
2068 for( state = 1 ; state < trie->statecount ; state++ ) {
2069 const U32 base = trie->states[ state ].trans.base;
2071 Perl_re_indentf( aTHX_ "#%4" UVXf "|", depth+1, (UV)state);
2073 if ( trie->states[ state ].wordnum ) {
2074 Perl_re_printf( aTHX_ " W%4X", trie->states[ state ].wordnum );
2076 Perl_re_printf( aTHX_ "%6s", "" );
2079 Perl_re_printf( aTHX_ " @%4" UVXf " ", (UV)base );
2084 while( ( base + ofs < trie->uniquecharcount ) ||
2085 ( base + ofs - trie->uniquecharcount < trie->lasttrans
2086 && trie->trans[ base + ofs - trie->uniquecharcount ].check
2090 Perl_re_printf( aTHX_ "+%2" UVXf "[ ", (UV)ofs);
2092 for ( ofs = 0 ; ofs < trie->uniquecharcount ; ofs++ ) {
2093 if ( ( base + ofs >= trie->uniquecharcount )
2094 && ( base + ofs - trie->uniquecharcount
2096 && trie->trans[ base + ofs
2097 - trie->uniquecharcount ].check == state )
2099 Perl_re_printf( aTHX_ "%*" UVXf, colwidth,
2100 (UV)trie->trans[ base + ofs - trie->uniquecharcount ].next
2103 Perl_re_printf( aTHX_ "%*s",colwidth," ." );
2107 Perl_re_printf( aTHX_ "]");
2110 Perl_re_printf( aTHX_ "\n" );
2112 Perl_re_indentf( aTHX_ "word_info N:(prev,len)=",
2114 for (word=1; word <= trie->wordcount; word++) {
2115 Perl_re_printf( aTHX_ " %d:(%d,%d)",
2116 (int)word, (int)(trie->wordinfo[word].prev),
2117 (int)(trie->wordinfo[word].len));
2119 Perl_re_printf( aTHX_ "\n" );
2122 Dumps a fully constructed but uncompressed trie in list form.
2123 List tries normally only are used for construction when the number of
2124 possible chars (trie->uniquecharcount) is very high.
2125 Used for debugging make_trie().
2128 S_dump_trie_interim_list(pTHX_ const struct _reg_trie_data *trie,
2129 HV *widecharmap, AV *revcharmap, U32 next_alloc,
2133 SV *sv=sv_newmortal();
2134 int colwidth= widecharmap ? 6 : 4;
2135 GET_RE_DEBUG_FLAGS_DECL;
2137 PERL_ARGS_ASSERT_DUMP_TRIE_INTERIM_LIST;
2139 /* print out the table precompression. */
2140 Perl_re_indentf( aTHX_ "State :Word | Transition Data\n",
2142 Perl_re_indentf( aTHX_ "%s",
2143 depth+1, "------:-----+-----------------\n" );
2145 for( state=1 ; state < next_alloc ; state ++ ) {
2148 Perl_re_indentf( aTHX_ " %4" UVXf " :",
2149 depth+1, (UV)state );
2150 if ( ! trie->states[ state ].wordnum ) {
2151 Perl_re_printf( aTHX_ "%5s| ","");
2153 Perl_re_printf( aTHX_ "W%4x| ",
2154 trie->states[ state ].wordnum
2157 for( charid = 1 ; charid <= TRIE_LIST_USED( state ) ; charid++ ) {
2158 SV ** const tmp = av_fetch( revcharmap,
2159 TRIE_LIST_ITEM(state,charid).forid, 0);
2161 Perl_re_printf( aTHX_ "%*s:%3X=%4" UVXf " | ",
2163 pv_pretty(sv, SvPV_nolen_const(*tmp), SvCUR(*tmp),
2165 PL_colors[0], PL_colors[1],
2166 (SvUTF8(*tmp) ? PERL_PV_ESCAPE_UNI : 0)
2167 | PERL_PV_ESCAPE_FIRSTCHAR
2169 TRIE_LIST_ITEM(state,charid).forid,
2170 (UV)TRIE_LIST_ITEM(state,charid).newstate
2173 Perl_re_printf( aTHX_ "\n%*s| ",
2174 (int)((depth * 2) + 14), "");
2177 Perl_re_printf( aTHX_ "\n");
2182 Dumps a fully constructed but uncompressed trie in table form.
2183 This is the normal DFA style state transition table, with a few
2184 twists to facilitate compression later.
2185 Used for debugging make_trie().
2188 S_dump_trie_interim_table(pTHX_ const struct _reg_trie_data *trie,
2189 HV *widecharmap, AV *revcharmap, U32 next_alloc,
2194 SV *sv=sv_newmortal();
2195 int colwidth= widecharmap ? 6 : 4;
2196 GET_RE_DEBUG_FLAGS_DECL;
2198 PERL_ARGS_ASSERT_DUMP_TRIE_INTERIM_TABLE;
2201 print out the table precompression so that we can do a visual check
2202 that they are identical.
2205 Perl_re_indentf( aTHX_ "Char : ", depth+1 );
2207 for( charid = 0 ; charid < trie->uniquecharcount ; charid++ ) {
2208 SV ** const tmp = av_fetch( revcharmap, charid, 0);
2210 Perl_re_printf( aTHX_ "%*s",
2212 pv_pretty(sv, SvPV_nolen_const(*tmp), SvCUR(*tmp), colwidth,
2213 PL_colors[0], PL_colors[1],
2214 (SvUTF8(*tmp) ? PERL_PV_ESCAPE_UNI : 0) |
2215 PERL_PV_ESCAPE_FIRSTCHAR
2221 Perl_re_printf( aTHX_ "\n");
2222 Perl_re_indentf( aTHX_ "State+-", depth+1 );
2224 for( charid=0 ; charid < trie->uniquecharcount ; charid++ ) {
2225 Perl_re_printf( aTHX_ "%.*s", colwidth,"--------");
2228 Perl_re_printf( aTHX_ "\n" );
2230 for( state=1 ; state < next_alloc ; state += trie->uniquecharcount ) {
2232 Perl_re_indentf( aTHX_ "%4" UVXf " : ",
2234 (UV)TRIE_NODENUM( state ) );
2236 for( charid = 0 ; charid < trie->uniquecharcount ; charid++ ) {
2237 UV v=(UV)SAFE_TRIE_NODENUM( trie->trans[ state + charid ].next );
2239 Perl_re_printf( aTHX_ "%*" UVXf, colwidth, v );
2241 Perl_re_printf( aTHX_ "%*s", colwidth, "." );
2243 if ( ! trie->states[ TRIE_NODENUM( state ) ].wordnum ) {
2244 Perl_re_printf( aTHX_ " (%4" UVXf ")\n",
2245 (UV)trie->trans[ state ].check );
2247 Perl_re_printf( aTHX_ " (%4" UVXf ") W%4X\n",
2248 (UV)trie->trans[ state ].check,
2249 trie->states[ TRIE_NODENUM( state ) ].wordnum );
2257 /* make_trie(startbranch,first,last,tail,word_count,flags,depth)
2258 startbranch: the first branch in the whole branch sequence
2259 first : start branch of sequence of branch-exact nodes.
2260 May be the same as startbranch
2261 last : Thing following the last branch.
2262 May be the same as tail.
2263 tail : item following the branch sequence
2264 count : words in the sequence
2265 flags : currently the OP() type we will be building one of /EXACT(|F|FA|FU|FU_SS|L|FLU8)/
2266 depth : indent depth
2268 Inplace optimizes a sequence of 2 or more Branch-Exact nodes into a TRIE node.
2270 A trie is an N'ary tree where the branches are determined by digital
2271 decomposition of the key. IE, at the root node you look up the 1st character and
2272 follow that branch repeat until you find the end of the branches. Nodes can be
2273 marked as "accepting" meaning they represent a complete word. Eg:
2277 would convert into the following structure. Numbers represent states, letters
2278 following numbers represent valid transitions on the letter from that state, if
2279 the number is in square brackets it represents an accepting state, otherwise it
2280 will be in parenthesis.
2282 +-h->+-e->[3]-+-r->(8)-+-s->[9]
2286 (1) +-i->(6)-+-s->[7]
2288 +-s->(3)-+-h->(4)-+-e->[5]
2290 Accept Word Mapping: 3=>1 (he),5=>2 (she), 7=>3 (his), 9=>4 (hers)
2292 This shows that when matching against the string 'hers' we will begin at state 1
2293 read 'h' and move to state 2, read 'e' and move to state 3 which is accepting,
2294 then read 'r' and go to state 8 followed by 's' which takes us to state 9 which
2295 is also accepting. Thus we know that we can match both 'he' and 'hers' with a
2296 single traverse. We store a mapping from accepting to state to which word was
2297 matched, and then when we have multiple possibilities we try to complete the
2298 rest of the regex in the order in which they occurred in the alternation.
2300 The only prior NFA like behaviour that would be changed by the TRIE support is
2301 the silent ignoring of duplicate alternations which are of the form:
2303 / (DUPE|DUPE) X? (?{ ... }) Y /x
2305 Thus EVAL blocks following a trie may be called a different number of times with
2306 and without the optimisation. With the optimisations dupes will be silently
2307 ignored. This inconsistent behaviour of EVAL type nodes is well established as
2308 the following demonstrates:
2310 'words'=~/(word|word|word)(?{ print $1 })[xyz]/
2312 which prints out 'word' three times, but
2314 'words'=~/(word|word|word)(?{ print $1 })S/
2316 which doesnt print it out at all. This is due to other optimisations kicking in.
2318 Example of what happens on a structural level:
2320 The regexp /(ac|ad|ab)+/ will produce the following debug output:
2322 1: CURLYM[1] {1,32767}(18)
2333 This would be optimizable with startbranch=5, first=5, last=16, tail=16
2334 and should turn into:
2336 1: CURLYM[1] {1,32767}(18)
2338 [Words:3 Chars Stored:6 Unique Chars:4 States:5 NCP:1]
2346 Cases where tail != last would be like /(?foo|bar)baz/:
2356 which would be optimizable with startbranch=1, first=1, last=7, tail=8
2357 and would end up looking like:
2360 [Words:2 Chars Stored:6 Unique Chars:5 States:7 NCP:1]
2367 d = uvchr_to_utf8_flags(d, uv, 0);
2369 is the recommended Unicode-aware way of saying
2374 #define TRIE_STORE_REVCHAR(val) \
2377 SV *zlopp = newSV(UTF8_MAXBYTES); \
2378 unsigned char *flrbbbbb = (unsigned char *) SvPVX(zlopp); \
2379 unsigned const char *const kapow = uvchr_to_utf8(flrbbbbb, val); \
2380 SvCUR_set(zlopp, kapow - flrbbbbb); \
2383 av_push(revcharmap, zlopp); \
2385 char ooooff = (char)val; \
2386 av_push(revcharmap, newSVpvn(&ooooff, 1)); \
2390 /* This gets the next character from the input, folding it if not already
2392 #define TRIE_READ_CHAR STMT_START { \
2395 /* if it is UTF then it is either already folded, or does not need \
2397 uvc = valid_utf8_to_uvchr( (const U8*) uc, &len); \
2399 else if (folder == PL_fold_latin1) { \
2400 /* This folder implies Unicode rules, which in the range expressible \
2401 * by not UTF is the lower case, with the two exceptions, one of \
2402 * which should have been taken care of before calling this */ \
2403 assert(*uc != LATIN_SMALL_LETTER_SHARP_S); \
2404 uvc = toLOWER_L1(*uc); \
2405 if (UNLIKELY(uvc == MICRO_SIGN)) uvc = GREEK_SMALL_LETTER_MU; \
2408 /* raw data, will be folded later if needed */ \
2416 #define TRIE_LIST_PUSH(state,fid,ns) STMT_START { \
2417 if ( TRIE_LIST_CUR( state ) >=TRIE_LIST_LEN( state ) ) { \
2418 U32 ging = TRIE_LIST_LEN( state ) * 2; \
2419 Renew( trie->states[ state ].trans.list, ging, reg_trie_trans_le ); \
2420 TRIE_LIST_LEN( state ) = ging; \
2422 TRIE_LIST_ITEM( state, TRIE_LIST_CUR( state ) ).forid = fid; \
2423 TRIE_LIST_ITEM( state, TRIE_LIST_CUR( state ) ).newstate = ns; \
2424 TRIE_LIST_CUR( state )++; \
2427 #define TRIE_LIST_NEW(state) STMT_START { \
2428 Newx( trie->states[ state ].trans.list, \
2429 4, reg_trie_trans_le ); \
2430 TRIE_LIST_CUR( state ) = 1; \
2431 TRIE_LIST_LEN( state ) = 4; \
2434 #define TRIE_HANDLE_WORD(state) STMT_START { \
2435 U16 dupe= trie->states[ state ].wordnum; \
2436 regnode * const noper_next = regnext( noper ); \
2439 /* store the word for dumping */ \
2441 if (OP(noper) != NOTHING) \
2442 tmp = newSVpvn_utf8(STRING(noper), STR_LEN(noper), UTF); \
2444 tmp = newSVpvn_utf8( "", 0, UTF ); \
2445 av_push( trie_words, tmp ); \
2449 trie->wordinfo[curword].prev = 0; \
2450 trie->wordinfo[curword].len = wordlen; \
2451 trie->wordinfo[curword].accept = state; \
2453 if ( noper_next < tail ) { \
2455 trie->jump = (U16 *) PerlMemShared_calloc( word_count + 1, \
2457 trie->jump[curword] = (U16)(noper_next - convert); \
2459 jumper = noper_next; \
2461 nextbranch= regnext(cur); \
2465 /* It's a dupe. Pre-insert into the wordinfo[].prev */\
2466 /* chain, so that when the bits of chain are later */\
2467 /* linked together, the dups appear in the chain */\
2468 trie->wordinfo[curword].prev = trie->wordinfo[dupe].prev; \
2469 trie->wordinfo[dupe].prev = curword; \
2471 /* we haven't inserted this word yet. */ \
2472 trie->states[ state ].wordnum = curword; \
2477 #define TRIE_TRANS_STATE(state,base,ucharcount,charid,special) \
2478 ( ( base + charid >= ucharcount \
2479 && base + charid < ubound \
2480 && state == trie->trans[ base - ucharcount + charid ].check \
2481 && trie->trans[ base - ucharcount + charid ].next ) \
2482 ? trie->trans[ base - ucharcount + charid ].next \
2483 : ( state==1 ? special : 0 ) \
2486 #define TRIE_BITMAP_SET_FOLDED(trie, uvc, folder) \
2488 TRIE_BITMAP_SET(trie, uvc); \
2489 /* store the folded codepoint */ \
2491 TRIE_BITMAP_SET(trie, folder[(U8) uvc ]); \
2494 /* store first byte of utf8 representation of */ \
2495 /* variant codepoints */ \
2496 if (! UVCHR_IS_INVARIANT(uvc)) { \
2497 TRIE_BITMAP_SET(trie, UTF8_TWO_BYTE_HI(uvc)); \
2502 #define MADE_JUMP_TRIE 2
2503 #define MADE_EXACT_TRIE 4
2506 S_make_trie(pTHX_ RExC_state_t *pRExC_state, regnode *startbranch,
2507 regnode *first, regnode *last, regnode *tail,
2508 U32 word_count, U32 flags, U32 depth)
2510 /* first pass, loop through and scan words */
2511 reg_trie_data *trie;
2512 HV *widecharmap = NULL;
2513 AV *revcharmap = newAV();
2519 regnode *jumper = NULL;
2520 regnode *nextbranch = NULL;
2521 regnode *convert = NULL;
2522 U32 *prev_states; /* temp array mapping each state to previous one */
2523 /* we just use folder as a flag in utf8 */
2524 const U8 * folder = NULL;
2526 /* in the below add_data call we are storing either 'tu' or 'tuaa'
2527 * which stands for one trie structure, one hash, optionally followed
2530 const U32 data_slot = add_data( pRExC_state, STR_WITH_LEN("tuaa"));
2531 AV *trie_words = NULL;
2532 /* along with revcharmap, this only used during construction but both are
2533 * useful during debugging so we store them in the struct when debugging.
2536 const U32 data_slot = add_data( pRExC_state, STR_WITH_LEN("tu"));
2537 STRLEN trie_charcount=0;
2539 SV *re_trie_maxbuff;
2540 GET_RE_DEBUG_FLAGS_DECL;
2542 PERL_ARGS_ASSERT_MAKE_TRIE;
2544 PERL_UNUSED_ARG(depth);
2548 case EXACT: case EXACTL: break;
2552 case EXACTFLU8: folder = PL_fold_latin1; break;
2553 case EXACTF: folder = PL_fold; break;
2554 default: Perl_croak( aTHX_ "panic! In trie construction, unknown node type %u %s", (unsigned) flags, PL_reg_name[flags] );
2557 trie = (reg_trie_data *) PerlMemShared_calloc( 1, sizeof(reg_trie_data) );
2559 trie->startstate = 1;
2560 trie->wordcount = word_count;
2561 RExC_rxi->data->data[ data_slot ] = (void*)trie;
2562 trie->charmap = (U16 *) PerlMemShared_calloc( 256, sizeof(U16) );
2563 if (flags == EXACT || flags == EXACTL)
2564 trie->bitmap = (char *) PerlMemShared_calloc( ANYOF_BITMAP_SIZE, 1 );
2565 trie->wordinfo = (reg_trie_wordinfo *) PerlMemShared_calloc(
2566 trie->wordcount+1, sizeof(reg_trie_wordinfo));
2569 trie_words = newAV();
2572 re_trie_maxbuff = get_sv(RE_TRIE_MAXBUF_NAME, 1);
2573 assert(re_trie_maxbuff);
2574 if (!SvIOK(re_trie_maxbuff)) {
2575 sv_setiv(re_trie_maxbuff, RE_TRIE_MAXBUF_INIT);
2577 DEBUG_TRIE_COMPILE_r({
2578 Perl_re_indentf( aTHX_
2579 "make_trie start==%d, first==%d, last==%d, tail==%d depth=%d\n",
2581 REG_NODE_NUM(startbranch),REG_NODE_NUM(first),
2582 REG_NODE_NUM(last), REG_NODE_NUM(tail), (int)depth);
2585 /* Find the node we are going to overwrite */
2586 if ( first == startbranch && OP( last ) != BRANCH ) {
2587 /* whole branch chain */
2590 /* branch sub-chain */
2591 convert = NEXTOPER( first );
2594 /* -- First loop and Setup --
2596 We first traverse the branches and scan each word to determine if it
2597 contains widechars, and how many unique chars there are, this is
2598 important as we have to build a table with at least as many columns as we
2601 We use an array of integers to represent the character codes 0..255
2602 (trie->charmap) and we use a an HV* to store Unicode characters. We use
2603 the native representation of the character value as the key and IV's for
2606 *TODO* If we keep track of how many times each character is used we can
2607 remap the columns so that the table compression later on is more
2608 efficient in terms of memory by ensuring the most common value is in the
2609 middle and the least common are on the outside. IMO this would be better
2610 than a most to least common mapping as theres a decent chance the most
2611 common letter will share a node with the least common, meaning the node
2612 will not be compressible. With a middle is most common approach the worst
2613 case is when we have the least common nodes twice.
2617 for ( cur = first ; cur < last ; cur = regnext( cur ) ) {
2618 regnode *noper = NEXTOPER( cur );
2622 U32 wordlen = 0; /* required init */
2623 STRLEN minchars = 0;
2624 STRLEN maxchars = 0;
2625 bool set_bit = trie->bitmap ? 1 : 0; /*store the first char in the
2628 if (OP(noper) == NOTHING) {
2629 /* skip past a NOTHING at the start of an alternation
2630 * eg, /(?:)a|(?:b)/ should be the same as /a|b/
2632 regnode *noper_next= regnext(noper);
2633 if (noper_next < tail)
2637 if ( noper < tail &&
2639 OP(noper) == flags ||
2642 OP(noper) == EXACTFU_SS
2646 uc= (U8*)STRING(noper);
2647 e= uc + STR_LEN(noper);
2654 if ( set_bit ) { /* bitmap only alloced when !(UTF&&Folding) */
2655 TRIE_BITMAP_SET(trie,*uc); /* store the raw first byte
2656 regardless of encoding */
2657 if (OP( noper ) == EXACTFU_SS) {
2658 /* false positives are ok, so just set this */
2659 TRIE_BITMAP_SET(trie, LATIN_SMALL_LETTER_SHARP_S);
2663 for ( ; uc < e ; uc += len ) { /* Look at each char in the current
2665 TRIE_CHARCOUNT(trie)++;
2668 /* TRIE_READ_CHAR returns the current character, or its fold if /i
2669 * is in effect. Under /i, this character can match itself, or
2670 * anything that folds to it. If not under /i, it can match just
2671 * itself. Most folds are 1-1, for example k, K, and KELVIN SIGN
2672 * all fold to k, and all are single characters. But some folds
2673 * expand to more than one character, so for example LATIN SMALL
2674 * LIGATURE FFI folds to the three character sequence 'ffi'. If
2675 * the string beginning at 'uc' is 'ffi', it could be matched by
2676 * three characters, or just by the one ligature character. (It
2677 * could also be matched by two characters: LATIN SMALL LIGATURE FF
2678 * followed by 'i', or by 'f' followed by LATIN SMALL LIGATURE FI).
2679 * (Of course 'I' and/or 'F' instead of 'i' and 'f' can also
2680 * match.) The trie needs to know the minimum and maximum number
2681 * of characters that could match so that it can use size alone to
2682 * quickly reject many match attempts. The max is simple: it is
2683 * the number of folded characters in this branch (since a fold is
2684 * never shorter than what folds to it. */
2688 /* And the min is equal to the max if not under /i (indicated by
2689 * 'folder' being NULL), or there are no multi-character folds. If
2690 * there is a multi-character fold, the min is incremented just
2691 * once, for the character that folds to the sequence. Each
2692 * character in the sequence needs to be added to the list below of
2693 * characters in the trie, but we count only the first towards the
2694 * min number of characters needed. This is done through the
2695 * variable 'foldlen', which is returned by the macros that look
2696 * for these sequences as the number of bytes the sequence
2697 * occupies. Each time through the loop, we decrement 'foldlen' by
2698 * how many bytes the current char occupies. Only when it reaches
2699 * 0 do we increment 'minchars' or look for another multi-character
2701 if (folder == NULL) {
2704 else if (foldlen > 0) {
2705 foldlen -= (UTF) ? UTF8SKIP(uc) : 1;
2710 /* See if *uc is the beginning of a multi-character fold. If
2711 * so, we decrement the length remaining to look at, to account
2712 * for the current character this iteration. (We can use 'uc'
2713 * instead of the fold returned by TRIE_READ_CHAR because for
2714 * non-UTF, the latin1_safe macro is smart enough to account
2715 * for all the unfolded characters, and because for UTF, the
2716 * string will already have been folded earlier in the
2717 * compilation process */
2719 if ((foldlen = is_MULTI_CHAR_FOLD_utf8_safe(uc, e))) {
2720 foldlen -= UTF8SKIP(uc);
2723 else if ((foldlen = is_MULTI_CHAR_FOLD_latin1_safe(uc, e))) {
2728 /* The current character (and any potential folds) should be added
2729 * to the possible matching characters for this position in this
2733 U8 folded= folder[ (U8) uvc ];
2734 if ( !trie->charmap[ folded ] ) {
2735 trie->charmap[ folded ]=( ++trie->uniquecharcount );
2736 TRIE_STORE_REVCHAR( folded );
2739 if ( !trie->charmap[ uvc ] ) {
2740 trie->charmap[ uvc ]=( ++trie->uniquecharcount );
2741 TRIE_STORE_REVCHAR( uvc );
2744 /* store the codepoint in the bitmap, and its folded
2746 TRIE_BITMAP_SET_FOLDED(trie, uvc, folder);
2747 set_bit = 0; /* We've done our bit :-) */
2751 /* XXX We could come up with the list of code points that fold
2752 * to this using PL_utf8_foldclosures, except not for
2753 * multi-char folds, as there may be multiple combinations
2754 * there that could work, which needs to wait until runtime to
2755 * resolve (The comment about LIGATURE FFI above is such an
2760 widecharmap = newHV();
2762 svpp = hv_fetch( widecharmap, (char*)&uvc, sizeof( UV ), 1 );
2765 Perl_croak( aTHX_ "error creating/fetching widecharmap entry for 0x%" UVXf, uvc );
2767 if ( !SvTRUE( *svpp ) ) {
2768 sv_setiv( *svpp, ++trie->uniquecharcount );
2769 TRIE_STORE_REVCHAR(uvc);
2772 } /* end loop through characters in this branch of the trie */
2774 /* We take the min and max for this branch and combine to find the min
2775 * and max for all branches processed so far */
2776 if( cur == first ) {
2777 trie->minlen = minchars;
2778 trie->maxlen = maxchars;
2779 } else if (minchars < trie->minlen) {
2780 trie->minlen = minchars;
2781 } else if (maxchars > trie->maxlen) {
2782 trie->maxlen = maxchars;
2784 } /* end first pass */
2785 DEBUG_TRIE_COMPILE_r(
2786 Perl_re_indentf( aTHX_
2787 "TRIE(%s): W:%d C:%d Uq:%d Min:%d Max:%d\n",
2789 ( widecharmap ? "UTF8" : "NATIVE" ), (int)word_count,
2790 (int)TRIE_CHARCOUNT(trie), trie->uniquecharcount,
2791 (int)trie->minlen, (int)trie->maxlen )
2795 We now know what we are dealing with in terms of unique chars and
2796 string sizes so we can calculate how much memory a naive
2797 representation using a flat table will take. If it's over a reasonable
2798 limit (as specified by ${^RE_TRIE_MAXBUF}) we use a more memory
2799 conservative but potentially much slower representation using an array
2802 At the end we convert both representations into the same compressed
2803 form that will be used in regexec.c for matching with. The latter
2804 is a form that cannot be used to construct with but has memory
2805 properties similar to the list form and access properties similar
2806 to the table form making it both suitable for fast searches and
2807 small enough that its feasable to store for the duration of a program.
2809 See the comment in the code where the compressed table is produced
2810 inplace from the flat tabe representation for an explanation of how
2811 the compression works.
2816 Newx(prev_states, TRIE_CHARCOUNT(trie) + 2, U32);
2819 if ( (IV)( ( TRIE_CHARCOUNT(trie) + 1 ) * trie->uniquecharcount + 1)
2820 > SvIV(re_trie_maxbuff) )
2823 Second Pass -- Array Of Lists Representation
2825 Each state will be represented by a list of charid:state records
2826 (reg_trie_trans_le) the first such element holds the CUR and LEN
2827 points of the allocated array. (See defines above).
2829 We build the initial structure using the lists, and then convert
2830 it into the compressed table form which allows faster lookups
2831 (but cant be modified once converted).
2834 STRLEN transcount = 1;
2836 DEBUG_TRIE_COMPILE_MORE_r( Perl_re_indentf( aTHX_ "Compiling trie using list compiler\n",
2839 trie->states = (reg_trie_state *)
2840 PerlMemShared_calloc( TRIE_CHARCOUNT(trie) + 2,
2841 sizeof(reg_trie_state) );
2845 for ( cur = first ; cur < last ; cur = regnext( cur ) ) {
2847 regnode *noper = NEXTOPER( cur );
2848 U32 state = 1; /* required init */
2849 U16 charid = 0; /* sanity init */
2850 U32 wordlen = 0; /* required init */
2852 if (OP(noper) == NOTHING) {
2853 regnode *noper_next= regnext(noper);
2854 if (noper_next < tail)
2858 if ( noper < tail && ( OP(noper) == flags || ( flags == EXACTFU && OP(noper) == EXACTFU_SS ) ) ) {
2859 const U8 *uc= (U8*)STRING(noper);
2860 const U8 *e= uc + STR_LEN(noper);
2862 for ( ; uc < e ; uc += len ) {
2867 charid = trie->charmap[ uvc ];
2869 SV** const svpp = hv_fetch( widecharmap,
2876 charid=(U16)SvIV( *svpp );
2879 /* charid is now 0 if we dont know the char read, or
2880 * nonzero if we do */
2887 if ( !trie->states[ state ].trans.list ) {
2888 TRIE_LIST_NEW( state );
2891 check <= TRIE_LIST_USED( state );
2894 if ( TRIE_LIST_ITEM( state, check ).forid
2897 newstate = TRIE_LIST_ITEM( state, check ).newstate;
2902 newstate = next_alloc++;
2903 prev_states[newstate] = state;
2904 TRIE_LIST_PUSH( state, charid, newstate );
2909 Perl_croak( aTHX_ "panic! In trie construction, no char mapping for %" IVdf, uvc );
2913 TRIE_HANDLE_WORD(state);
2915 } /* end second pass */
2917 /* next alloc is the NEXT state to be allocated */
2918 trie->statecount = next_alloc;
2919 trie->states = (reg_trie_state *)
2920 PerlMemShared_realloc( trie->states,
2922 * sizeof(reg_trie_state) );
2924 /* and now dump it out before we compress it */
2925 DEBUG_TRIE_COMPILE_MORE_r(dump_trie_interim_list(trie, widecharmap,
2926 revcharmap, next_alloc,
2930 trie->trans = (reg_trie_trans *)
2931 PerlMemShared_calloc( transcount, sizeof(reg_trie_trans) );
2938 for( state=1 ; state < next_alloc ; state ++ ) {
2942 DEBUG_TRIE_COMPILE_MORE_r(
2943 Perl_re_printf( aTHX_ "tp: %d zp: %d ",tp,zp)
2947 if (trie->states[state].trans.list) {
2948 U16 minid=TRIE_LIST_ITEM( state, 1).forid;
2952 for( idx = 2 ; idx <= TRIE_LIST_USED( state ) ; idx++ ) {
2953 const U16 forid = TRIE_LIST_ITEM( state, idx).forid;
2954 if ( forid < minid ) {
2956 } else if ( forid > maxid ) {
2960 if ( transcount < tp + maxid - minid + 1) {
2962 trie->trans = (reg_trie_trans *)
2963 PerlMemShared_realloc( trie->trans,
2965 * sizeof(reg_trie_trans) );
2966 Zero( trie->trans + (transcount / 2),
2970 base = trie->uniquecharcount + tp - minid;
2971 if ( maxid == minid ) {
2973 for ( ; zp < tp ; zp++ ) {
2974 if ( ! trie->trans[ zp ].next ) {
2975 base = trie->uniquecharcount + zp - minid;
2976 trie->trans[ zp ].next = TRIE_LIST_ITEM( state,
2978 trie->trans[ zp ].check = state;
2984 trie->trans[ tp ].next = TRIE_LIST_ITEM( state,
2986 trie->trans[ tp ].check = state;
2991 for ( idx=1; idx <= TRIE_LIST_USED( state ) ; idx++ ) {
2992 const U32 tid = base
2993 - trie->uniquecharcount
2994 + TRIE_LIST_ITEM( state, idx ).forid;
2995 trie->trans[ tid ].next = TRIE_LIST_ITEM( state,
2997 trie->trans[ tid ].check = state;
2999 tp += ( maxid - minid + 1 );
3001 Safefree(trie->states[ state ].trans.list);
3004 DEBUG_TRIE_COMPILE_MORE_r(
3005 Perl_re_printf( aTHX_ " base: %d\n",base);
3008 trie->states[ state ].trans.base=base;
3010 trie->lasttrans = tp + 1;
3014 Second Pass -- Flat Table Representation.
3016 we dont use the 0 slot of either trans[] or states[] so we add 1 to
3017 each. We know that we will need Charcount+1 trans at most to store
3018 the data (one row per char at worst case) So we preallocate both
3019 structures assuming worst case.
3021 We then construct the trie using only the .next slots of the entry
3024 We use the .check field of the first entry of the node temporarily
3025 to make compression both faster and easier by keeping track of how
3026 many non zero fields are in the node.
3028 Since trans are numbered from 1 any 0 pointer in the table is a FAIL
3031 There are two terms at use here: state as a TRIE_NODEIDX() which is
3032 a number representing the first entry of the node, and state as a
3033 TRIE_NODENUM() which is the trans number. state 1 is TRIE_NODEIDX(1)
3034 and TRIE_NODENUM(1), state 2 is TRIE_NODEIDX(2) and TRIE_NODENUM(3)
3035 if there are 2 entrys per node. eg:
3043 The table is internally in the right hand, idx form. However as we
3044 also have to deal with the states array which is indexed by nodenum
3045 we have to use TRIE_NODENUM() to convert.
3048 DEBUG_TRIE_COMPILE_MORE_r( Perl_re_indentf( aTHX_ "Compiling trie using table compiler\n",
3051 trie->trans = (reg_trie_trans *)
3052 PerlMemShared_calloc( ( TRIE_CHARCOUNT(trie) + 1 )
3053 * trie->uniquecharcount + 1,
3054 sizeof(reg_trie_trans) );
3055 trie->states = (reg_trie_state *)
3056 PerlMemShared_calloc( TRIE_CHARCOUNT(trie) + 2,
3057 sizeof(reg_trie_state) );
3058 next_alloc = trie->uniquecharcount + 1;
3061 for ( cur = first ; cur < last ; cur = regnext( cur ) ) {
3063 regnode *noper = NEXTOPER( cur );
3065 U32 state = 1; /* required init */
3067 U16 charid = 0; /* sanity init */
3068 U32 accept_state = 0; /* sanity init */
3070 U32 wordlen = 0; /* required init */
3072 if (OP(noper) == NOTHING) {
3073 regnode *noper_next= regnext(noper);
3074 if (noper_next < tail)
3078 if ( noper < tail && ( OP(noper) == flags || ( flags == EXACTFU && OP(noper) == EXACTFU_SS ) ) ) {
3079 const U8 *uc= (U8*)STRING(noper);
3080 const U8 *e= uc + STR_LEN(noper);
3082 for ( ; uc < e ; uc += len ) {
3087 charid = trie->charmap[ uvc ];
3089 SV* const * const svpp = hv_fetch( widecharmap,
3093 charid = svpp ? (U16)SvIV(*svpp) : 0;
3097 if ( !trie->trans[ state + charid ].next ) {
3098 trie->trans[ state + charid ].next = next_alloc;
3099 trie->trans[ state ].check++;
3100 prev_states[TRIE_NODENUM(next_alloc)]
3101 = TRIE_NODENUM(state);
3102 next_alloc += trie->uniquecharcount;
3104 state = trie->trans[ state + charid ].next;
3106 Perl_croak( aTHX_ "panic! In trie construction, no char mapping for %" IVdf, uvc );
3108 /* charid is now 0 if we dont know the char read, or
3109 * nonzero if we do */
3112 accept_state = TRIE_NODENUM( state );
3113 TRIE_HANDLE_WORD(accept_state);
3115 } /* end second pass */
3117 /* and now dump it out before we compress it */
3118 DEBUG_TRIE_COMPILE_MORE_r(dump_trie_interim_table(trie, widecharmap,
3120 next_alloc, depth+1));
3124 * Inplace compress the table.*
3126 For sparse data sets the table constructed by the trie algorithm will
3127 be mostly 0/FAIL transitions or to put it another way mostly empty.
3128 (Note that leaf nodes will not contain any transitions.)
3130 This algorithm compresses the tables by eliminating most such
3131 transitions, at the cost of a modest bit of extra work during lookup:
3133 - Each states[] entry contains a .base field which indicates the
3134 index in the state[] array wheres its transition data is stored.
3136 - If .base is 0 there are no valid transitions from that node.
3138 - If .base is nonzero then charid is added to it to find an entry in
3141 -If trans[states[state].base+charid].check!=state then the
3142 transition is taken to be a 0/Fail transition. Thus if there are fail
3143 transitions at the front of the node then the .base offset will point
3144 somewhere inside the previous nodes data (or maybe even into a node
3145 even earlier), but the .check field determines if the transition is
3149 The following process inplace converts the table to the compressed
3150 table: We first do not compress the root node 1,and mark all its
3151 .check pointers as 1 and set its .base pointer as 1 as well. This
3152 allows us to do a DFA construction from the compressed table later,
3153 and ensures that any .base pointers we calculate later are greater
3156 - We set 'pos' to indicate the first entry of the second node.
3158 - We then iterate over the columns of the node, finding the first and
3159 last used entry at l and m. We then copy l..m into pos..(pos+m-l),
3160 and set the .check pointers accordingly, and advance pos
3161 appropriately and repreat for the next node. Note that when we copy
3162 the next pointers we have to convert them from the original
3163 NODEIDX form to NODENUM form as the former is not valid post
3166 - If a node has no transitions used we mark its base as 0 and do not
3167 advance the pos pointer.
3169 - If a node only has one transition we use a second pointer into the
3170 structure to fill in allocated fail transitions from other states.
3171 This pointer is independent of the main pointer and scans forward
3172 looking for null transitions that are allocated to a state. When it
3173 finds one it writes the single transition into the "hole". If the
3174 pointer doesnt find one the single transition is appended as normal.
3176 - Once compressed we can Renew/realloc the structures to release the
3179 See "Table-Compression Methods" in sec 3.9 of the Red Dragon,
3180 specifically Fig 3.47 and the associated pseudocode.
3184 const U32 laststate = TRIE_NODENUM( next_alloc );
3187 trie->statecount = laststate;
3189 for ( state = 1 ; state < laststate ; state++ ) {
3191 const U32 stateidx = TRIE_NODEIDX( state );
3192 const U32 o_used = trie->trans[ stateidx ].check;
3193 U32 used = trie->trans[ stateidx ].check;
3194 trie->trans[ stateidx ].check = 0;
3197 used && charid < trie->uniquecharcount;
3200 if ( flag || trie->trans[ stateidx + charid ].next ) {
3201 if ( trie->trans[ stateidx + charid ].next ) {
3203 for ( ; zp < pos ; zp++ ) {
3204 if ( ! trie->trans[ zp ].next ) {
3208 trie->states[ state ].trans.base
3210 + trie->uniquecharcount
3212 trie->trans[ zp ].next
3213 = SAFE_TRIE_NODENUM( trie->trans[ stateidx
3215 trie->trans[ zp ].check = state;
3216 if ( ++zp > pos ) pos = zp;
3223 trie->states[ state ].trans.base
3224 = pos + trie->uniquecharcount - charid ;
3226 trie->trans[ pos ].next
3227 = SAFE_TRIE_NODENUM(
3228 trie->trans[ stateidx + charid ].next );
3229 trie->trans[ pos ].check = state;
3234 trie->lasttrans = pos + 1;
3235 trie->states = (reg_trie_state *)
3236 PerlMemShared_realloc( trie->states, laststate
3237 * sizeof(reg_trie_state) );
3238 DEBUG_TRIE_COMPILE_MORE_r(
3239 Perl_re_indentf( aTHX_ "Alloc: %d Orig: %" IVdf " elements, Final:%" IVdf ". Savings of %%%5.2f\n",
3241 (int)( ( TRIE_CHARCOUNT(trie) + 1 ) * trie->uniquecharcount
3245 ( ( next_alloc - pos ) * 100 ) / (double)next_alloc );
3248 } /* end table compress */
3250 DEBUG_TRIE_COMPILE_MORE_r(
3251 Perl_re_indentf( aTHX_ "Statecount:%" UVxf " Lasttrans:%" UVxf "\n",
3253 (UV)trie->statecount,
3254 (UV)trie->lasttrans)
3256 /* resize the trans array to remove unused space */
3257 trie->trans = (reg_trie_trans *)
3258 PerlMemShared_realloc( trie->trans, trie->lasttrans
3259 * sizeof(reg_trie_trans) );
3261 { /* Modify the program and insert the new TRIE node */
3262 U8 nodetype =(U8)(flags & 0xFF);
3266 regnode *optimize = NULL;
3267 #ifdef RE_TRACK_PATTERN_OFFSETS
3270 U32 mjd_nodelen = 0;
3271 #endif /* RE_TRACK_PATTERN_OFFSETS */
3272 #endif /* DEBUGGING */
3274 This means we convert either the first branch or the first Exact,
3275 depending on whether the thing following (in 'last') is a branch
3276 or not and whther first is the startbranch (ie is it a sub part of
3277 the alternation or is it the whole thing.)
3278 Assuming its a sub part we convert the EXACT otherwise we convert
3279 the whole branch sequence, including the first.
3281 /* Find the node we are going to overwrite */
3282 if ( first != startbranch || OP( last ) == BRANCH ) {
3283 /* branch sub-chain */
3284 NEXT_OFF( first ) = (U16)(last - first);
3285 #ifdef RE_TRACK_PATTERN_OFFSETS
3287 mjd_offset= Node_Offset((convert));
3288 mjd_nodelen= Node_Length((convert));
3291 /* whole branch chain */
3293 #ifdef RE_TRACK_PATTERN_OFFSETS
3296 const regnode *nop = NEXTOPER( convert );
3297 mjd_offset= Node_Offset((nop));
3298 mjd_nodelen= Node_Length((nop));
3302 Perl_re_indentf( aTHX_ "MJD offset:%" UVuf " MJD length:%" UVuf "\n",
3304 (UV)mjd_offset, (UV)mjd_nodelen)
3307 /* But first we check to see if there is a common prefix we can
3308 split out as an EXACT and put in front of the TRIE node. */
3309 trie->startstate= 1;
3310 if ( trie->bitmap && !widecharmap && !trie->jump ) {
3311 /* we want to find the first state that has more than
3312 * one transition, if that state is not the first state
3313 * then we have a common prefix which we can remove.
3316 for ( state = 1 ; state < trie->statecount-1 ; state++ ) {
3318 I32 first_ofs = -1; /* keeps track of the ofs of the first
3319 transition, -1 means none */
3321 const U32 base = trie->states[ state ].trans.base;
3323 /* does this state terminate an alternation? */
3324 if ( trie->states[state].wordnum )
3327 for ( ofs = 0 ; ofs < trie->uniquecharcount ; ofs++ ) {
3328 if ( ( base + ofs >= trie->uniquecharcount ) &&
3329 ( base + ofs - trie->uniquecharcount < trie->lasttrans ) &&
3330 trie->trans[ base + ofs - trie->uniquecharcount ].check == state )
3332 if ( ++count > 1 ) {
3333 /* we have more than one transition */
3336 /* if this is the first state there is no common prefix
3337 * to extract, so we can exit */
3338 if ( state == 1 ) break;
3339 tmp = av_fetch( revcharmap, ofs, 0);
3340 ch = (U8*)SvPV_nolen_const( *tmp );
3342 /* if we are on count 2 then we need to initialize the
3343 * bitmap, and store the previous char if there was one
3346 /* clear the bitmap */
3347 Zero(trie->bitmap, ANYOF_BITMAP_SIZE, char);
3349 Perl_re_indentf( aTHX_ "New Start State=%" UVuf " Class: [",
3352 if (first_ofs >= 0) {
3353 SV ** const tmp = av_fetch( revcharmap, first_ofs, 0);
3354 const U8 * const ch = (U8*)SvPV_nolen_const( *tmp );
3356 TRIE_BITMAP_SET_FOLDED(trie,*ch,folder);
3358 Perl_re_printf( aTHX_ "%s", (char*)ch)
3362 /* store the current firstchar in the bitmap */
3363 TRIE_BITMAP_SET_FOLDED(trie,*ch,folder);
3364 DEBUG_OPTIMISE_r(Perl_re_printf( aTHX_ "%s", ch));
3370 /* This state has only one transition, its transition is part
3371 * of a common prefix - we need to concatenate the char it
3372 * represents to what we have so far. */
3373 SV **tmp = av_fetch( revcharmap, first_ofs, 0);
3375 char *ch = SvPV( *tmp, len );
3377 SV *sv=sv_newmortal();
3378 Perl_re_indentf( aTHX_ "Prefix State: %" UVuf " Ofs:%" UVuf " Char='%s'\n",
3380 (UV)state, (UV)first_ofs,
3381 pv_pretty(sv, SvPV_nolen_const(*tmp), SvCUR(*tmp), 6,
3382 PL_colors[0], PL_colors[1],
3383 (SvUTF8(*tmp) ? PERL_PV_ESCAPE_UNI : 0) |
3384 PERL_PV_ESCAPE_FIRSTCHAR
3389 OP( convert ) = nodetype;
3390 str=STRING(convert);
3393 STR_LEN(convert) += len;
3399 DEBUG_OPTIMISE_r(Perl_re_printf( aTHX_ "]\n"));
3404 trie->prefixlen = (state-1);
3406 regnode *n = convert+NODE_SZ_STR(convert);
3407 NEXT_OFF(convert) = NODE_SZ_STR(convert);
3408 trie->startstate = state;
3409 trie->minlen -= (state - 1);
3410 trie->maxlen -= (state - 1);
3412 /* At least the UNICOS C compiler choked on this
3413 * being argument to DEBUG_r(), so let's just have
3416 #ifdef PERL_EXT_RE_BUILD
3422 regnode *fix = convert;
3423 U32 word = trie->wordcount;
3425 Set_Node_Offset_Length(convert, mjd_offset, state - 1);
3426 while( ++fix < n ) {
3427 Set_Node_Offset_Length(fix, 0, 0);
3430 SV ** const tmp = av_fetch( trie_words, word, 0 );
3432 if ( STR_LEN(convert) <= SvCUR(*tmp) )
3433 sv_chop(*tmp, SvPV_nolen(*tmp) + STR_LEN(convert));
3435 sv_chop(*tmp, SvPV_nolen(*tmp) + SvCUR(*tmp));
3443 NEXT_OFF(convert) = (U16)(tail - convert);
3444 DEBUG_r(optimize= n);
3450 if ( trie->maxlen ) {
3451 NEXT_OFF( convert ) = (U16)(tail - convert);
3452 ARG_SET( convert, data_slot );
3453 /* Store the offset to the first unabsorbed branch in
3454 jump[0], which is otherwise unused by the jump logic.
3455 We use this when dumping a trie and during optimisation. */
3457 trie->jump[0] = (U16)(nextbranch - convert);
3459 /* If the start state is not accepting (meaning there is no empty string/NOTHING)
3460 * and there is a bitmap
3461 * and the first "jump target" node we found leaves enough room
3462 * then convert the TRIE node into a TRIEC node, with the bitmap
3463 * embedded inline in the opcode - this is hypothetically faster.
3465 if ( !trie->states[trie->startstate].wordnum
3467 && ( (char *)jumper - (char *)convert) >= (int)sizeof(struct regnode_charclass) )
3469 OP( convert ) = TRIEC;
3470 Copy(trie->bitmap, ((struct regnode_charclass *)convert)->bitmap, ANYOF_BITMAP_SIZE, char);
3471 PerlMemShared_free(trie->bitmap);
3474 OP( convert ) = TRIE;
3476 /* store the type in the flags */
3477 convert->flags = nodetype;
3481 + regarglen[ OP( convert ) ];
3483 /* XXX We really should free up the resource in trie now,
3484 as we won't use them - (which resources?) dmq */
3486 /* needed for dumping*/
3487 DEBUG_r(if (optimize) {
3488 regnode *opt = convert;
3490 while ( ++opt < optimize) {
3491 Set_Node_Offset_Length(opt,0,0);
3494 Try to clean up some of the debris left after the
3497 while( optimize < jumper ) {
3498 mjd_nodelen += Node_Length((optimize));
3499 OP( optimize ) = OPTIMIZED;
3500 Set_Node_Offset_Length(optimize,0,0);
3503 Set_Node_Offset_Length(convert,mjd_offset,mjd_nodelen);
3505 } /* end node insert */
3507 /* Finish populating the prev field of the wordinfo array. Walk back
3508 * from each accept state until we find another accept state, and if
3509 * so, point the first word's .prev field at the second word. If the
3510 * second already has a .prev field set, stop now. This will be the
3511 * case either if we've already processed that word's accept state,
3512 * or that state had multiple words, and the overspill words were
3513 * already linked up earlier.
3520 for (word=1; word <= trie->wordcount; word++) {
3522 if (trie->wordinfo[word].prev)
3524 state = trie->wordinfo[word].accept;
3526 state = prev_states[state];
3529 prev = trie->states[state].wordnum;
3533 trie->wordinfo[word].prev = prev;
3535 Safefree(prev_states);
3539 /* and now dump out the compressed format */
3540 DEBUG_TRIE_COMPILE_r(dump_trie(trie, widecharmap, revcharmap, depth+1));
3542 RExC_rxi->data->data[ data_slot + 1 ] = (void*)widecharmap;
3544 RExC_rxi->data->data[ data_slot + TRIE_WORDS_OFFSET ] = (void*)trie_words;
3545 RExC_rxi->data->data[ data_slot + 3 ] = (void*)revcharmap;
3547 SvREFCNT_dec_NN(revcharmap);
3551 : trie->startstate>1
3557 S_construct_ahocorasick_from_trie(pTHX_ RExC_state_t *pRExC_state, regnode *source, U32 depth)
3559 /* The Trie is constructed and compressed now so we can build a fail array if
3562 This is basically the Aho-Corasick algorithm. Its from exercise 3.31 and
3564 "Red Dragon" -- Compilers, principles, techniques, and tools. Aho, Sethi,
3568 We find the fail state for each state in the trie, this state is the longest
3569 proper suffix of the current state's 'word' that is also a proper prefix of
3570 another word in our trie. State 1 represents the word '' and is thus the
3571 default fail state. This allows the DFA not to have to restart after its
3572 tried and failed a word at a given point, it simply continues as though it
3573 had been matching the other word in the first place.
3575 'abcdgu'=~/abcdefg|cdgu/
3576 When we get to 'd' we are still matching the first word, we would encounter
3577 'g' which would fail, which would bring us to the state representing 'd' in
3578 the second word where we would try 'g' and succeed, proceeding to match
3581 /* add a fail transition */
3582 const U32 trie_offset = ARG(source);
3583 reg_trie_data *trie=(reg_trie_data *)RExC_rxi->data->data[trie_offset];
3585 const U32 ucharcount = trie->uniquecharcount;
3586 const U32 numstates = trie->statecount;
3587 const U32 ubound = trie->lasttrans + ucharcount;
3591 U32 base = trie->states[ 1 ].trans.base;
3594 const U32 data_slot = add_data( pRExC_state, STR_WITH_LEN("T"));
3596 GET_RE_DEBUG_FLAGS_DECL;
3598 PERL_ARGS_ASSERT_CONSTRUCT_AHOCORASICK_FROM_TRIE;
3599 PERL_UNUSED_CONTEXT;
3601 PERL_UNUSED_ARG(depth);
3604 if ( OP(source) == TRIE ) {
3605 struct regnode_1 *op = (struct regnode_1 *)
3606 PerlMemShared_calloc(1, sizeof(struct regnode_1));
3607 StructCopy(source,op,struct regnode_1);
3608 stclass = (regnode *)op;
3610 struct regnode_charclass *op = (struct regnode_charclass *)
3611 PerlMemShared_calloc(1, sizeof(struct regnode_charclass));
3612 StructCopy(source,op,struct regnode_charclass);
3613 stclass = (regnode *)op;
3615 OP(stclass)+=2; /* convert the TRIE type to its AHO-CORASICK equivalent */
3617 ARG_SET( stclass, data_slot );
3618 aho = (reg_ac_data *) PerlMemShared_calloc( 1, sizeof(reg_ac_data) );
3619 RExC_rxi->data->data[ data_slot ] = (void*)aho;
3620 aho->trie=trie_offset;
3621 aho->states=(reg_trie_state *)PerlMemShared_malloc( numstates * sizeof(reg_trie_state) );
3622 Copy( trie->states, aho->states, numstates, reg_trie_state );
3623 Newx( q, numstates, U32);
3624 aho->fail = (U32 *) PerlMemShared_calloc( numstates, sizeof(U32) );
3627 /* initialize fail[0..1] to be 1 so that we always have
3628 a valid final fail state */
3629 fail[ 0 ] = fail[ 1 ] = 1;
3631 for ( charid = 0; charid < ucharcount ; charid++ ) {
3632 const U32 newstate = TRIE_TRANS_STATE( 1, base, ucharcount, charid, 0 );
3634 q[ q_write ] = newstate;
3635 /* set to point at the root */
3636 fail[ q[ q_write++ ] ]=1;
3639 while ( q_read < q_write) {
3640 const U32 cur = q[ q_read++ % numstates ];
3641 base = trie->states[ cur ].trans.base;
3643 for ( charid = 0 ; charid < ucharcount ; charid++ ) {
3644 const U32 ch_state = TRIE_TRANS_STATE( cur, base, ucharcount, charid, 1 );
3646 U32 fail_state = cur;
3649 fail_state = fail[ fail_state ];
3650 fail_base = aho->states[ fail_state ].trans.base;
3651 } while ( !TRIE_TRANS_STATE( fail_state, fail_base, ucharcount, charid, 1 ) );
3653 fail_state = TRIE_TRANS_STATE( fail_state, fail_base, ucharcount, charid, 1 );
3654 fail[ ch_state ] = fail_state;
3655 if ( !aho->states[ ch_state ].wordnum && aho->states[ fail_state ].wordnum )
3657 aho->states[ ch_state ].wordnum = aho->states[ fail_state ].wordnum;
3659 q[ q_write++ % numstates] = ch_state;
3663 /* restore fail[0..1] to 0 so that we "fall out" of the AC loop
3664 when we fail in state 1, this allows us to use the
3665 charclass scan to find a valid start char. This is based on the principle
3666 that theres a good chance the string being searched contains lots of stuff
3667 that cant be a start char.
3669 fail[ 0 ] = fail[ 1 ] = 0;
3670 DEBUG_TRIE_COMPILE_r({
3671 Perl_re_indentf( aTHX_ "Stclass Failtable (%" UVuf " states): 0",
3672 depth, (UV)numstates
3674 for( q_read=1; q_read<numstates; q_read++ ) {
3675 Perl_re_printf( aTHX_ ", %" UVuf, (UV)fail[q_read]);
3677 Perl_re_printf( aTHX_ "\n");
3680 /*RExC_seen |= REG_TRIEDFA_SEEN;*/
3685 /* The below joins as many adjacent EXACTish nodes as possible into a single
3686 * one. The regop may be changed if the node(s) contain certain sequences that
3687 * require special handling. The joining is only done if:
3688 * 1) there is room in the current conglomerated node to entirely contain the
3690 * 2) they are the exact same node type
3692 * The adjacent nodes actually may be separated by NOTHING-kind nodes, and
3693 * these get optimized out
3695 * XXX khw thinks this should be enhanced to fill EXACT (at least) nodes as full
3696 * as possible, even if that means splitting an existing node so that its first
3697 * part is moved to the preceeding node. This would maximise the efficiency of
3698 * memEQ during matching. Elsewhere in this file, khw proposes splitting
3699 * EXACTFish nodes into portions that don't change under folding vs those that
3700 * do. Those portions that don't change may be the only things in the pattern that
3701 * could be used to find fixed and floating strings.
3703 * If a node is to match under /i (folded), the number of characters it matches
3704 * can be different than its character length if it contains a multi-character
3705 * fold. *min_subtract is set to the total delta number of characters of the
3708 * And *unfolded_multi_char is set to indicate whether or not the node contains
3709 * an unfolded multi-char fold. This happens when whether the fold is valid or
3710 * not won't be known until runtime; namely for EXACTF nodes that contain LATIN
3711 * SMALL LETTER SHARP S, as only if the target string being matched against
3712 * turns out to be UTF-8 is that fold valid; and also for EXACTFL nodes whose
3713 * folding rules depend on the locale in force at runtime. (Multi-char folds
3714 * whose components are all above the Latin1 range are not run-time locale
3715 * dependent, and have already been folded by the time this function is
3718 * This is as good a place as any to discuss the design of handling these
3719 * multi-character fold sequences. It's been wrong in Perl for a very long
3720 * time. There are three code points in Unicode whose multi-character folds
3721 * were long ago discovered to mess things up. The previous designs for
3722 * dealing with these involved assigning a special node for them. This
3723 * approach doesn't always work, as evidenced by this example:
3724 * "\xDFs" =~ /s\xDF/ui # Used to fail before these patches
3725 * Both sides fold to "sss", but if the pattern is parsed to create a node that
3726 * would match just the \xDF, it won't be able to handle the case where a
3727 * successful match would have to cross the node's boundary. The new approach
3728 * that hopefully generally solves the problem generates an EXACTFU_SS node
3729 * that is "sss" in this case.
3731 * It turns out that there are problems with all multi-character folds, and not
3732 * just these three. Now the code is general, for all such cases. The
3733 * approach taken is:
3734 * 1) This routine examines each EXACTFish node that could contain multi-
3735 * character folded sequences. Since a single character can fold into
3736 * such a sequence, the minimum match length for this node is less than
3737 * the number of characters in the node. This routine returns in
3738 * *min_subtract how many characters to subtract from the the actual
3739 * length of the string to get a real minimum match length; it is 0 if
3740 * there are no multi-char foldeds. This delta is used by the caller to
3741 * adjust the min length of the match, and the delta between min and max,
3742 * so that the optimizer doesn't reject these possibilities based on size
3744 * 2) For the sequence involving the Sharp s (\xDF), the node type EXACTFU_SS
3745 * is used for an EXACTFU node that contains at least one "ss" sequence in
3746 * it. For non-UTF-8 patterns and strings, this is the only case where
3747 * there is a possible fold length change. That means that a regular
3748 * EXACTFU node without UTF-8 involvement doesn't have to concern itself
3749 * with length changes, and so can be processed faster. regexec.c takes
3750 * advantage of this. Generally, an EXACTFish node that is in UTF-8 is
3751 * pre-folded by regcomp.c (except EXACTFL, some of whose folds aren't
3752 * known until runtime). This saves effort in regex matching. However,
3753 * the pre-folding isn't done for non-UTF8 patterns because the fold of
3754 * the MICRO SIGN requires UTF-8, and we don't want to slow things down by
3755 * forcing the pattern into UTF8 unless necessary. Also what EXACTF (and,
3756 * again, EXACTFL) nodes fold to isn't known until runtime. The fold
3757 * possibilities for the non-UTF8 patterns are quite simple, except for
3758 * the sharp s. All the ones that don't involve a UTF-8 target string are
3759 * members of a fold-pair, and arrays are set up for all of them so that
3760 * the other member of the pair can be found quickly. Code elsewhere in
3761 * this file makes sure that in EXACTFU nodes, the sharp s gets folded to
3762 * 'ss', even if the pattern isn't UTF-8. This avoids the issues
3763 * described in the next item.
3764 * 3) A problem remains for unfolded multi-char folds. (These occur when the
3765 * validity of the fold won't be known until runtime, and so must remain
3766 * unfolded for now. This happens for the sharp s in EXACTF and EXACTFA
3767 * nodes when the pattern isn't in UTF-8. (Note, BTW, that there cannot
3768 * be an EXACTF node with a UTF-8 pattern.) They also occur for various
3769 * folds in EXACTFL nodes, regardless of the UTF-ness of the pattern.)
3770 * The reason this is a problem is that the optimizer part of regexec.c
3771 * (probably unwittingly, in Perl_regexec_flags()) makes an assumption
3772 * that a character in the pattern corresponds to at most a single
3773 * character in the target string. (And I do mean character, and not byte
3774 * here, unlike other parts of the documentation that have never been
3775 * updated to account for multibyte Unicode.) sharp s in EXACTF and
3776 * EXACTFL nodes can match the two character string 'ss'; in EXACTFA nodes
3777 * it can match "\x{17F}\x{17F}". These, along with other ones in EXACTFL
3778 * nodes, violate the assumption, and they are the only instances where it
3779 * is violated. I'm reluctant to try to change the assumption, as the
3780 * code involved is impenetrable to me (khw), so instead the code here
3781 * punts. This routine examines EXACTFL nodes, and (when the pattern
3782 * isn't UTF-8) EXACTF and EXACTFA for such unfolded folds, and returns a
3783 * boolean indicating whether or not the node contains such a fold. When
3784 * it is true, the caller sets a flag that later causes the optimizer in
3785 * this file to not set values for the floating and fixed string lengths,
3786 * and thus avoids the optimizer code in regexec.c that makes the invalid
3787 * assumption. Thus, there is no optimization based on string lengths for
3788 * EXACTFL nodes that contain these few folds, nor for non-UTF8-pattern
3789 * EXACTF and EXACTFA nodes that contain the sharp s. (The reason the
3790 * assumption is wrong only in these cases is that all other non-UTF-8
3791 * folds are 1-1; and, for UTF-8 patterns, we pre-fold all other folds to
3792 * their expanded versions. (Again, we can't prefold sharp s to 'ss' in
3793 * EXACTF nodes because we don't know at compile time if it actually
3794 * matches 'ss' or not. For EXACTF nodes it will match iff the target
3795 * string is in UTF-8. This is in contrast to EXACTFU nodes, where it
3796 * always matches; and EXACTFA where it never does. In an EXACTFA node in
3797 * a UTF-8 pattern, sharp s is folded to "\x{17F}\x{17F}, avoiding the
3798 * problem; but in a non-UTF8 pattern, folding it to that above-Latin1
3799 * string would require the pattern to be forced into UTF-8, the overhead
3800 * of which we want to avoid. Similarly the unfolded multi-char folds in
3801 * EXACTFL nodes will match iff the locale at the time of match is a UTF-8
3804 * Similarly, the code that generates tries doesn't currently handle
3805 * not-already-folded multi-char folds, and it looks like a pain to change
3806 * that. Therefore, trie generation of EXACTFA nodes with the sharp s
3807 * doesn't work. Instead, such an EXACTFA is turned into a new regnode,
3808 * EXACTFA_NO_TRIE, which the trie code knows not to handle. Most people
3809 * using /iaa matching will be doing so almost entirely with ASCII
3810 * strings, so this should rarely be encountered in practice */
3812 #define JOIN_EXACT(scan,min_subtract,unfolded_multi_char, flags) \
3813 if (PL_regkind[OP(scan)] == EXACT) \
3814 join_exact(pRExC_state,(scan),(min_subtract),unfolded_multi_char, (flags),NULL,depth+1)
3817 S_join_exact(pTHX_ RExC_state_t *pRExC_state, regnode *scan,
3818 UV *min_subtract, bool *unfolded_multi_char,
3819 U32 flags,regnode *val, U32 depth)
3821 /* Merge several consecutive EXACTish nodes into one. */
3822 regnode *n = regnext(scan);
3824 regnode *next = scan + NODE_SZ_STR(scan);
3828 regnode *stop = scan;
3829 GET_RE_DEBUG_FLAGS_DECL;
3831 PERL_UNUSED_ARG(depth);
3834 PERL_ARGS_ASSERT_JOIN_EXACT;
3835 #ifndef EXPERIMENTAL_INPLACESCAN
3836 PERL_UNUSED_ARG(flags);
3837 PERL_UNUSED_ARG(val);
3839 DEBUG_PEEP("join", scan, depth, 0);
3841 /* Look through the subsequent nodes in the chain. Skip NOTHING, merge
3842 * EXACT ones that are mergeable to the current one. */
3844 && (PL_regkind[OP(n)] == NOTHING
3845 || (stringok && OP(n) == OP(scan)))
3847 && NEXT_OFF(scan) + NEXT_OFF(n) < I16_MAX)
3850 if (OP(n) == TAIL || n > next)
3852 if (PL_regkind[OP(n)] == NOTHING) {
3853 DEBUG_PEEP("skip:", n, depth, 0);
3854 NEXT_OFF(scan) += NEXT_OFF(n);
3855 next = n + NODE_STEP_REGNODE;
3862 else if (stringok) {
3863 const unsigned int oldl = STR_LEN(scan);
3864 regnode * const nnext = regnext(n);
3866 /* XXX I (khw) kind of doubt that this works on platforms (should
3867 * Perl ever run on one) where U8_MAX is above 255 because of lots
3868 * of other assumptions */
3869 /* Don't join if the sum can't fit into a single node */
3870 if (oldl + STR_LEN(n) > U8_MAX)
3873 DEBUG_PEEP("merg", n, depth, 0);
3876 NEXT_OFF(scan) += NEXT_OFF(n);
3877 STR_LEN(scan) += STR_LEN(n);
3878 next = n + NODE_SZ_STR(n);
3879 /* Now we can overwrite *n : */
3880 Move(STRING(n), STRING(scan) + oldl, STR_LEN(n), char);
3888 #ifdef EXPERIMENTAL_INPLACESCAN
3889 if (flags && !NEXT_OFF(n)) {
3890 DEBUG_PEEP("atch", val, depth, 0);
3891 if (reg_off_by_arg[OP(n)]) {
3892 ARG_SET(n, val - n);
3895 NEXT_OFF(n) = val - n;
3903 *unfolded_multi_char = FALSE;
3905 /* Here, all the adjacent mergeable EXACTish nodes have been merged. We
3906 * can now analyze for sequences of problematic code points. (Prior to
3907 * this final joining, sequences could have been split over boundaries, and
3908 * hence missed). The sequences only happen in folding, hence for any
3909 * non-EXACT EXACTish node */
3910 if (OP(scan) != EXACT && OP(scan) != EXACTL) {
3911 U8* s0 = (U8*) STRING(scan);
3913 U8* s_end = s0 + STR_LEN(scan);
3915 int total_count_delta = 0; /* Total delta number of characters that
3916 multi-char folds expand to */
3918 /* One pass is made over the node's string looking for all the
3919 * possibilities. To avoid some tests in the loop, there are two main
3920 * cases, for UTF-8 patterns (which can't have EXACTF nodes) and
3925 if (OP(scan) == EXACTFL) {
3928 /* An EXACTFL node would already have been changed to another
3929 * node type unless there is at least one character in it that
3930 * is problematic; likely a character whose fold definition
3931 * won't be known until runtime, and so has yet to be folded.
3932 * For all but the UTF-8 locale, folds are 1-1 in length, but
3933 * to handle the UTF-8 case, we need to create a temporary
3934 * folded copy using UTF-8 locale rules in order to analyze it.
3935 * This is because our macros that look to see if a sequence is
3936 * a multi-char fold assume everything is folded (otherwise the
3937 * tests in those macros would be too complicated and slow).
3938 * Note that here, the non-problematic folds will have already
3939 * been done, so we can just copy such characters. We actually
3940 * don't completely fold the EXACTFL string. We skip the
3941 * unfolded multi-char folds, as that would just create work
3942 * below to figure out the size they already are */
3944 Newx(folded, UTF8_MAX_FOLD_CHAR_EXPAND * STR_LEN(scan) + 1, U8);
3947 STRLEN s_len = UTF8SKIP(s);
3948 if (! is_PROBLEMATIC_LOCALE_FOLD_utf8(s)) {
3949 Copy(s, d, s_len, U8);
3952 else if (is_FOLDS_TO_MULTI_utf8(s)) {
3953 *unfolded_multi_char = TRUE;
3954 Copy(s, d, s_len, U8);
3957 else if (isASCII(*s)) {
3958 *(d++) = toFOLD(*s);
3962 _toFOLD_utf8_flags(s, s_end, d, &len, FOLD_FLAGS_FULL);
3968 /* Point the remainder of the routine to look at our temporary
3972 } /* End of creating folded copy of EXACTFL string */
3974 /* Examine the string for a multi-character fold sequence. UTF-8
3975 * patterns have all characters pre-folded by the time this code is
3977 while (s < s_end - 1) /* Can stop 1 before the end, as minimum
3978 length sequence we are looking for is 2 */
3980 int count = 0; /* How many characters in a multi-char fold */
3981 int len = is_MULTI_CHAR_FOLD_utf8_safe(s, s_end);
3982 if (! len) { /* Not a multi-char fold: get next char */
3987 /* Nodes with 'ss' require special handling, except for
3988 * EXACTFA-ish for which there is no multi-char fold to this */
3989 if (len == 2 && *s == 's' && *(s+1) == 's'
3990 && OP(scan) != EXACTFA
3991 && OP(scan) != EXACTFA_NO_TRIE)
3994 if (OP(scan) != EXACTFL) {
3995 OP(scan) = EXACTFU_SS;
3999 else { /* Here is a generic multi-char fold. */
4000 U8* multi_end = s + len;
4002 /* Count how many characters are in it. In the case of
4003 * /aa, no folds which contain ASCII code points are
4004 * allowed, so check for those, and skip if found. */
4005 if (OP(scan) != EXACTFA && OP(scan) != EXACTFA_NO_TRIE) {
4006 count = utf8_length(s, multi_end);
4010 while (s < multi_end) {
4013 goto next_iteration;
4023 /* The delta is how long the sequence is minus 1 (1 is how long
4024 * the character that folds to the sequence is) */
4025 total_count_delta += count - 1;
4029 /* We created a temporary folded copy of the string in EXACTFL
4030 * nodes. Therefore we need to be sure it doesn't go below zero,
4031 * as the real string could be shorter */
4032 if (OP(scan) == EXACTFL) {
4033 int total_chars = utf8_length((U8*) STRING(scan),
4034 (U8*) STRING(scan) + STR_LEN(scan));
4035 if (total_count_delta > total_chars) {
4036 total_count_delta = total_chars;
4040 *min_subtract += total_count_delta;
4043 else if (OP(scan) == EXACTFA) {
4045 /* Non-UTF-8 pattern, EXACTFA node. There can't be a multi-char
4046 * fold to the ASCII range (and there are no existing ones in the
4047 * upper latin1 range). But, as outlined in the comments preceding
4048 * this function, we need to flag any occurrences of the sharp s.
4049 * This character forbids trie formation (because of added
4051 #if UNICODE_MAJOR_VERSION > 3 /* no multifolds in early Unicode */ \
4052 || (UNICODE_MAJOR_VERSION == 3 && ( UNICODE_DOT_VERSION > 0) \
4053 || UNICODE_DOT_DOT_VERSION > 0)
4055 if (*s == LATIN_SMALL_LETTER_SHARP_S) {
4056 OP(scan) = EXACTFA_NO_TRIE;
4057 *unfolded_multi_char = TRUE;
4065 /* Non-UTF-8 pattern, not EXACTFA node. Look for the multi-char
4066 * folds that are all Latin1. As explained in the comments
4067 * preceding this function, we look also for the sharp s in EXACTF
4068 * and EXACTFL nodes; it can be in the final position. Otherwise
4069 * we can stop looking 1 byte earlier because have to find at least
4070 * two characters for a multi-fold */
4071 const U8* upper = (OP(scan) == EXACTF || OP(scan) == EXACTFL)
4076 int len = is_MULTI_CHAR_FOLD_latin1_safe(s, s_end);
4077 if (! len) { /* Not a multi-char fold. */
4078 if (*s == LATIN_SMALL_LETTER_SHARP_S
4079 && (OP(scan) == EXACTF || OP(scan) == EXACTFL))
4081 *unfolded_multi_char = TRUE;
4088 && isALPHA_FOLD_EQ(*s, 's')
4089 && isALPHA_FOLD_EQ(*(s+1), 's'))
4092 /* EXACTF nodes need to know that the minimum length
4093 * changed so that a sharp s in the string can match this
4094 * ss in the pattern, but they remain EXACTF nodes, as they
4095 * won't match this unless the target string is is UTF-8,
4096 * which we don't know until runtime. EXACTFL nodes can't
4097 * transform into EXACTFU nodes */
4098 if (OP(scan) != EXACTF && OP(scan) != EXACTFL) {
4099 OP(scan) = EXACTFU_SS;
4103 *min_subtract += len - 1;
4111 /* Allow dumping but overwriting the collection of skipped
4112 * ops and/or strings with fake optimized ops */
4113 n = scan + NODE_SZ_STR(scan);
4121 DEBUG_OPTIMISE_r(if (merged){DEBUG_PEEP("finl", scan, depth, 0);});
4125 /* REx optimizer. Converts nodes into quicker variants "in place".
4126 Finds fixed substrings. */
4128 /* Stops at toplevel WHILEM as well as at "last". At end *scanp is set
4129 to the position after last scanned or to NULL. */
4131 #define INIT_AND_WITHP \
4132 assert(!and_withp); \
4133 Newx(and_withp,1, regnode_ssc); \
4134 SAVEFREEPV(and_withp)
4138 S_unwind_scan_frames(pTHX_ const void *p)
4140 scan_frame *f= (scan_frame *)p;
4142 scan_frame *n= f->next_frame;
4148 /* the return from this sub is the minimum length that could possibly match */
4150 S_study_chunk(pTHX_ RExC_state_t *pRExC_state, regnode **scanp,
4151 SSize_t *minlenp, SSize_t *deltap,
4156 regnode_ssc *and_withp,
4157 U32 flags, U32 depth)
4158 /* scanp: Start here (read-write). */
4159 /* deltap: Write maxlen-minlen here. */
4160 /* last: Stop before this one. */
4161 /* data: string data about the pattern */
4162 /* stopparen: treat close N as END */
4163 /* recursed: which subroutines have we recursed into */
4164 /* and_withp: Valid if flags & SCF_DO_STCLASS_OR */
4166 /* There must be at least this number of characters to match */
4169 regnode *scan = *scanp, *next;
4171 int is_inf = (flags & SCF_DO_SUBSTR) && (data->flags & SF_IS_INF);
4172 int is_inf_internal = 0; /* The studied chunk is infinite */
4173 I32 is_par = OP(scan) == OPEN ? ARG(scan) : 0;
4174 scan_data_t data_fake;
4175 SV *re_trie_maxbuff = NULL;
4176 regnode *first_non_open = scan;
4177 SSize_t stopmin = SSize_t_MAX;
4178 scan_frame *frame = NULL;
4179 GET_RE_DEBUG_FLAGS_DECL;
4181 PERL_ARGS_ASSERT_STUDY_CHUNK;
4182 RExC_study_started= 1;
4184 Zero(&data_fake, 1, scan_data_t);
4187 while (first_non_open && OP(first_non_open) == OPEN)
4188 first_non_open=regnext(first_non_open);
4194 RExC_study_chunk_recursed_count++;
4196 DEBUG_OPTIMISE_MORE_r(
4198 Perl_re_indentf( aTHX_ "study_chunk stopparen=%ld recursed_count=%lu depth=%lu recursed_depth=%lu scan=%p last=%p",
4199 depth, (long)stopparen,
4200 (unsigned long)RExC_study_chunk_recursed_count,
4201 (unsigned long)depth, (unsigned long)recursed_depth,
4204 if (recursed_depth) {
4207 for ( j = 0 ; j < recursed_depth ; j++ ) {
4208 for ( i = 0 ; i < (U32)RExC_npar ; i++ ) {
4210 PAREN_TEST(RExC_study_chunk_recursed +
4211 ( j * RExC_study_chunk_recursed_bytes), i )
4214 !PAREN_TEST(RExC_study_chunk_recursed +
4215 (( j - 1 ) * RExC_study_chunk_recursed_bytes), i)
4218 Perl_re_printf( aTHX_ " %d",(int)i);
4222 if ( j + 1 < recursed_depth ) {
4223 Perl_re_printf( aTHX_ ",");
4227 Perl_re_printf( aTHX_ "\n");
4230 while ( scan && OP(scan) != END && scan < last ){
4231 UV min_subtract = 0; /* How mmany chars to subtract from the minimum
4232 node length to get a real minimum (because
4233 the folded version may be shorter) */
4234 bool unfolded_multi_char = FALSE;
4235 /* Peephole optimizer: */
4236 DEBUG_STUDYDATA("Peep", data, depth, is_inf);
4237 DEBUG_PEEP("Peep", scan, depth, flags);
4240 /* The reason we do this here is that we need to deal with things like
4241 * /(?:f)(?:o)(?:o)/ which cant be dealt with by the normal EXACT
4242 * parsing code, as each (?:..) is handled by a different invocation of
4245 JOIN_EXACT(scan,&min_subtract, &unfolded_multi_char, 0);
4247 /* Follow the next-chain of the current node and optimize
4248 away all the NOTHINGs from it. */
4249 if (OP(scan) != CURLYX) {
4250 const int max = (reg_off_by_arg[OP(scan)]
4252 /* I32 may be smaller than U16 on CRAYs! */
4253 : (I32_MAX < U16_MAX ? I32_MAX : U16_MAX));
4254 int off = (reg_off_by_arg[OP(scan)] ? ARG(scan) : NEXT_OFF(scan));
4258 /* Skip NOTHING and LONGJMP. */
4259 while ((n = regnext(n))
4260 && ((PL_regkind[OP(n)] == NOTHING && (noff = NEXT_OFF(n)))
4261 || ((OP(n) == LONGJMP) && (noff = ARG(n))))
4262 && off + noff < max)
4264 if (reg_off_by_arg[OP(scan)])
4267 NEXT_OFF(scan) = off;
4270 /* The principal pseudo-switch. Cannot be a switch, since we
4271 look into several different things. */
4272 if ( OP(scan) == DEFINEP ) {
4274 SSize_t deltanext = 0;
4275 SSize_t fake_last_close = 0;
4276 I32 f = SCF_IN_DEFINE;
4278 StructCopy(&zero_scan_data, &data_fake, scan_data_t);
4279 scan = regnext(scan);
4280 assert( OP(scan) == IFTHEN );
4281 DEBUG_PEEP("expect IFTHEN", scan, depth, flags);
4283 data_fake.last_closep= &fake_last_close;
4285 next = regnext(scan);
4286 scan = NEXTOPER(NEXTOPER(scan));
4287 DEBUG_PEEP("scan", scan, depth, flags);
4288 DEBUG_PEEP("next", next, depth, flags);
4290 /* we suppose the run is continuous, last=next...
4291 * NOTE we dont use the return here! */
4292 /* DEFINEP study_chunk() recursion */
4293 (void)study_chunk(pRExC_state, &scan, &minlen,
4294 &deltanext, next, &data_fake, stopparen,
4295 recursed_depth, NULL, f, depth+1);
4300 OP(scan) == BRANCH ||
4301 OP(scan) == BRANCHJ ||
4304 next = regnext(scan);
4307 /* The op(next)==code check below is to see if we
4308 * have "BRANCH-BRANCH", "BRANCHJ-BRANCHJ", "IFTHEN-IFTHEN"
4309 * IFTHEN is special as it might not appear in pairs.
4310 * Not sure whether BRANCH-BRANCHJ is possible, regardless
4311 * we dont handle it cleanly. */
4312 if (OP(next) == code || code == IFTHEN) {
4313 /* NOTE - There is similar code to this block below for
4314 * handling TRIE nodes on a re-study. If you change stuff here
4315 * check there too. */
4316 SSize_t max1 = 0, min1 = SSize_t_MAX, num = 0;
4318 regnode * const startbranch=scan;
4320 if (flags & SCF_DO_SUBSTR) {
4321 /* Cannot merge strings after this. */
4322 scan_commit(pRExC_state, data, minlenp, is_inf);
4325 if (flags & SCF_DO_STCLASS)
4326 ssc_init_zero(pRExC_state, &accum);
4328 while (OP(scan) == code) {
4329 SSize_t deltanext, minnext, fake;
4331 regnode_ssc this_class;
4333 DEBUG_PEEP("Branch", scan, depth, flags);
4336 StructCopy(&zero_scan_data, &data_fake, scan_data_t);
4338 data_fake.whilem_c = data->whilem_c;
4339 data_fake.last_closep = data->last_closep;
4342 data_fake.last_closep = &fake;
4344 data_fake.pos_delta = delta;
4345 next = regnext(scan);
4347 scan = NEXTOPER(scan); /* everything */
4348 if (code != BRANCH) /* everything but BRANCH */
4349 scan = NEXTOPER(scan);
4351 if (flags & SCF_DO_STCLASS) {
4352 ssc_init(pRExC_state, &this_class);
4353 data_fake.start_class = &this_class;
4354 f = SCF_DO_STCLASS_AND;
4356 if (flags & SCF_WHILEM_VISITED_POS)
4357 f |= SCF_WHILEM_VISITED_POS;
4359 /* we suppose the run is continuous, last=next...*/
4360 /* recurse study_chunk() for each BRANCH in an alternation */
4361 minnext = study_chunk(pRExC_state, &scan, minlenp,
4362 &deltanext, next, &data_fake, stopparen,
4363 recursed_depth, NULL, f,depth+1);
4367 if (deltanext == SSize_t_MAX) {
4368 is_inf = is_inf_internal = 1;
4370 } else if (max1 < minnext + deltanext)
4371 max1 = minnext + deltanext;
4373 if (data_fake.flags & (SF_HAS_PAR|SF_IN_PAR))
4375 if (data_fake.flags & SCF_SEEN_ACCEPT) {
4376 if ( stopmin > minnext)
4377 stopmin = min + min1;
4378 flags &= ~SCF_DO_SUBSTR;
4380 data->flags |= SCF_SEEN_ACCEPT;
4383 if (data_fake.flags & SF_HAS_EVAL)
4384 data->flags |= SF_HAS_EVAL;
4385 data->whilem_c = data_fake.whilem_c;
4387 if (flags & SCF_DO_STCLASS)
4388 ssc_or(pRExC_state, &accum, (regnode_charclass*)&this_class);
4390 if (code == IFTHEN && num < 2) /* Empty ELSE branch */
4392 if (flags & SCF_DO_SUBSTR) {
4393 data->pos_min += min1;
4394 if (data->pos_delta >= SSize_t_MAX - (max1 - min1))
4395 data->pos_delta = SSize_t_MAX;
4397 data->pos_delta += max1 - min1;
4398 if (max1 != min1 || is_inf)
4399 data->cur_is_floating = 1;
4402 if (delta == SSize_t_MAX
4403 || SSize_t_MAX - delta - (max1 - min1) < 0)
4404 delta = SSize_t_MAX;
4406 delta += max1 - min1;
4407 if (flags & SCF_DO_STCLASS_OR) {
4408 ssc_or(pRExC_state, data->start_class, (regnode_charclass*) &accum);
4410 ssc_and(pRExC_state, data->start_class, (regnode_charclass *) and_withp);
4411 flags &= ~SCF_DO_STCLASS;
4414 else if (flags & SCF_DO_STCLASS_AND) {
4416 ssc_and(pRExC_state, data->start_class, (regnode_charclass *) &accum);
4417 flags &= ~SCF_DO_STCLASS;
4420 /* Switch to OR mode: cache the old value of
4421 * data->start_class */
4423 StructCopy(data->start_class, and_withp, regnode_ssc);
4424 flags &= ~SCF_DO_STCLASS_AND;
4425 StructCopy(&accum, data->start_class, regnode_ssc);
4426 flags |= SCF_DO_STCLASS_OR;
4430 if (PERL_ENABLE_TRIE_OPTIMISATION &&
4431 OP( startbranch ) == BRANCH )
4435 Assuming this was/is a branch we are dealing with: 'scan'
4436 now points at the item that follows the branch sequence,
4437 whatever it is. We now start at the beginning of the
4438 sequence and look for subsequences of
4444 which would be constructed from a pattern like
4447 If we can find such a subsequence we need to turn the first
4448 element into a trie and then add the subsequent branch exact
4449 strings to the trie.
4453 1. patterns where the whole set of branches can be
4456 2. patterns where only a subset can be converted.
4458 In case 1 we can replace the whole set with a single regop
4459 for the trie. In case 2 we need to keep the start and end
4462 'BRANCH EXACT; BRANCH EXACT; BRANCH X'
4463 becomes BRANCH TRIE; BRANCH X;
4465 There is an additional case, that being where there is a
4466 common prefix, which gets split out into an EXACT like node
4467 preceding the TRIE node.
4469 If x(1..n)==tail then we can do a simple trie, if not we make
4470 a "jump" trie, such that when we match the appropriate word
4471 we "jump" to the appropriate tail node. Essentially we turn
4472 a nested if into a case structure of sorts.
4477 if (!re_trie_maxbuff) {
4478 re_trie_maxbuff = get_sv(RE_TRIE_MAXBUF_NAME, 1);
4479 if (!SvIOK(re_trie_maxbuff))
4480 sv_setiv(re_trie_maxbuff, RE_TRIE_MAXBUF_INIT);
4482 if ( SvIV(re_trie_maxbuff)>=0 ) {
4484 regnode *first = (regnode *)NULL;
4485 regnode *last = (regnode *)NULL;
4486 regnode *tail = scan;
4490 /* var tail is used because there may be a TAIL
4491 regop in the way. Ie, the exacts will point to the
4492 thing following the TAIL, but the last branch will
4493 point at the TAIL. So we advance tail. If we
4494 have nested (?:) we may have to move through several
4498 while ( OP( tail ) == TAIL ) {
4499 /* this is the TAIL generated by (?:) */
4500 tail = regnext( tail );
4504 DEBUG_TRIE_COMPILE_r({
4505 regprop(RExC_rx, RExC_mysv, tail, NULL, pRExC_state);
4506 Perl_re_indentf( aTHX_ "%s %" UVuf ":%s\n",
4508 "Looking for TRIE'able sequences. Tail node is ",
4509 (UV)(tail - RExC_emit_start),
4510 SvPV_nolen_const( RExC_mysv )
4516 Step through the branches
4517 cur represents each branch,
4518 noper is the first thing to be matched as part
4520 noper_next is the regnext() of that node.
4522 We normally handle a case like this
4523 /FOO[xyz]|BAR[pqr]/ via a "jump trie" but we also
4524 support building with NOJUMPTRIE, which restricts
4525 the trie logic to structures like /FOO|BAR/.
4527 If noper is a trieable nodetype then the branch is
4528 a possible optimization target. If we are building
4529 under NOJUMPTRIE then we require that noper_next is
4530 the same as scan (our current position in the regex
4533 Once we have two or more consecutive such branches
4534 we can create a trie of the EXACT's contents and
4535 stitch it in place into the program.
4537 If the sequence represents all of the branches in
4538 the alternation we replace the entire thing with a
4541 Otherwise when it is a subsequence we need to
4542 stitch it in place and replace only the relevant
4543 branches. This means the first branch has to remain
4544 as it is used by the alternation logic, and its
4545 next pointer, and needs to be repointed at the item
4546 on the branch chain following the last branch we
4547 have optimized away.
4549 This could be either a BRANCH, in which case the
4550 subsequence is internal, or it could be the item
4551 following the branch sequence in which case the
4552 subsequence is at the end (which does not
4553 necessarily mean the first node is the start of the
4556 TRIE_TYPE(X) is a define which maps the optype to a
4560 ----------------+-----------
4564 EXACTFU_SS | EXACTFU
4567 EXACTFLU8 | EXACTFLU8
4571 #define TRIE_TYPE(X) ( ( NOTHING == (X) ) \
4573 : ( EXACT == (X) ) \
4575 : ( EXACTFU == (X) || EXACTFU_SS == (X) ) \
4577 : ( EXACTFA == (X) ) \
4579 : ( EXACTL == (X) ) \
4581 : ( EXACTFLU8 == (X) ) \
4585 /* dont use tail as the end marker for this traverse */
4586 for ( cur = startbranch ; cur != scan ; cur = regnext( cur ) ) {
4587 regnode * const noper = NEXTOPER( cur );
4588 U8 noper_type = OP( noper );
4589 U8 noper_trietype = TRIE_TYPE( noper_type );
4590 #if defined(DEBUGGING) || defined(NOJUMPTRIE)
4591 regnode * const noper_next = regnext( noper );
4592 U8 noper_next_type = (noper_next && noper_next < tail) ? OP(noper_next) : 0;
4593 U8 noper_next_trietype = (noper_next && noper_next < tail) ? TRIE_TYPE( noper_next_type ) :0;
4596 DEBUG_TRIE_COMPILE_r({
4597 regprop(RExC_rx, RExC_mysv, cur, NULL, pRExC_state);
4598 Perl_re_indentf( aTHX_ "- %d:%s (%d)",
4600 REG_NODE_NUM(cur), SvPV_nolen_const( RExC_mysv ), REG_NODE_NUM(cur) );
4602 regprop(RExC_rx, RExC_mysv, noper, NULL, pRExC_state);
4603 Perl_re_printf( aTHX_ " -> %d:%s",
4604 REG_NODE_NUM(noper), SvPV_nolen_const(RExC_mysv));
4607 regprop(RExC_rx, RExC_mysv, noper_next, NULL, pRExC_state);
4608 Perl_re_printf( aTHX_ "\t=> %d:%s\t",
4609 REG_NODE_NUM(noper_next), SvPV_nolen_const(RExC_mysv));
4611 Perl_re_printf( aTHX_ "(First==%d,Last==%d,Cur==%d,tt==%s,ntt==%s,nntt==%s)\n",
4612 REG_NODE_NUM(first), REG_NODE_NUM(last), REG_NODE_NUM(cur),
4613 PL_reg_name[trietype], PL_reg_name[noper_trietype], PL_reg_name[noper_next_trietype]
4617 /* Is noper a trieable nodetype that can be merged
4618 * with the current trie (if there is one)? */
4622 ( noper_trietype == NOTHING )
4623 || ( trietype == NOTHING )
4624 || ( trietype == noper_trietype )
4627 && noper_next >= tail
4631 /* Handle mergable triable node Either we are
4632 * the first node in a new trieable sequence,
4633 * in which case we do some bookkeeping,
4634 * otherwise we update the end pointer. */
4637 if ( noper_trietype == NOTHING ) {
4638 #if !defined(DEBUGGING) && !defined(NOJUMPTRIE)
4639 regnode * const noper_next = regnext( noper );
4640 U8 noper_next_type = (noper_next && noper_next < tail) ? OP(noper_next) : 0;
4641 U8 noper_next_trietype = noper_next_type ? TRIE_TYPE( noper_next_type ) :0;
4644 if ( noper_next_trietype ) {
4645 trietype = noper_next_trietype;
4646 } else if (noper_next_type) {
4647 /* a NOTHING regop is 1 regop wide.
4648 * We need at least two for a trie
4649 * so we can't merge this in */
4653 trietype = noper_trietype;
4656 if ( trietype == NOTHING )
4657 trietype = noper_trietype;
4662 } /* end handle mergable triable node */
4664 /* handle unmergable node -
4665 * noper may either be a triable node which can
4666 * not be tried together with the current trie,
4667 * or a non triable node */
4669 /* If last is set and trietype is not
4670 * NOTHING then we have found at least two
4671 * triable branch sequences in a row of a
4672 * similar trietype so we can turn them
4673 * into a trie. If/when we allow NOTHING to
4674 * start a trie sequence this condition
4675 * will be required, and it isn't expensive
4676 * so we leave it in for now. */
4677 if ( trietype && trietype != NOTHING )
4678 make_trie( pRExC_state,
4679 startbranch, first, cur, tail,
4680 count, trietype, depth+1 );
4681 last = NULL; /* note: we clear/update
4682 first, trietype etc below,
4683 so we dont do it here */
4687 && noper_next >= tail
4690 /* noper is triable, so we can start a new
4694 trietype = noper_trietype;
4696 /* if we already saw a first but the
4697 * current node is not triable then we have
4698 * to reset the first information. */
4703 } /* end handle unmergable node */
4704 } /* loop over branches */
4705 DEBUG_TRIE_COMPILE_r({
4706 regprop(RExC_rx, RExC_mysv, cur, NULL, pRExC_state);
4707 Perl_re_indentf( aTHX_ "- %s (%d) <SCAN FINISHED> ",
4708 depth+1, SvPV_nolen_const( RExC_mysv ),REG_NODE_NUM(cur));
4709 Perl_re_printf( aTHX_ "(First==%d, Last==%d, Cur==%d, tt==%s)\n",
4710 REG_NODE_NUM(first), REG_NODE_NUM(last), REG_NODE_NUM(cur),
4711 PL_reg_name[trietype]
4715 if ( last && trietype ) {
4716 if ( trietype != NOTHING ) {
4717 /* the last branch of the sequence was part of
4718 * a trie, so we have to construct it here
4719 * outside of the loop */
4720 made= make_trie( pRExC_state, startbranch,
4721 first, scan, tail, count,
4722 trietype, depth+1 );
4723 #ifdef TRIE_STUDY_OPT
4724 if ( ((made == MADE_EXACT_TRIE &&
4725 startbranch == first)
4726 || ( first_non_open == first )) &&
4728 flags |= SCF_TRIE_RESTUDY;
4729 if ( startbranch == first
4732 RExC_seen &=~REG_TOP_LEVEL_BRANCHES_SEEN;
4737 /* at this point we know whatever we have is a
4738 * NOTHING sequence/branch AND if 'startbranch'
4739 * is 'first' then we can turn the whole thing
4742 if ( startbranch == first ) {
4744 /* the entire thing is a NOTHING sequence,
4745 * something like this: (?:|) So we can
4746 * turn it into a plain NOTHING op. */
4747 DEBUG_TRIE_COMPILE_r({
4748 regprop(RExC_rx, RExC_mysv, cur, NULL, pRExC_state);
4749 Perl_re_indentf( aTHX_ "- %s (%d) <NOTHING BRANCH SEQUENCE>\n",
4751 SvPV_nolen_const( RExC_mysv ),REG_NODE_NUM(cur));
4754 OP(startbranch)= NOTHING;
4755 NEXT_OFF(startbranch)= tail - startbranch;
4756 for ( opt= startbranch + 1; opt < tail ; opt++ )
4760 } /* end if ( last) */
4761 } /* TRIE_MAXBUF is non zero */
4766 else if ( code == BRANCHJ ) { /* single branch is optimized. */
4767 scan = NEXTOPER(NEXTOPER(scan));
4768 } else /* single branch is optimized. */
4769 scan = NEXTOPER(scan);
4771 } else if (OP(scan) == SUSPEND || OP(scan) == GOSUB) {
4773 regnode *start = NULL;
4774 regnode *end = NULL;
4775 U32 my_recursed_depth= recursed_depth;
4777 if (OP(scan) != SUSPEND) { /* GOSUB */
4778 /* Do setup, note this code has side effects beyond
4779 * the rest of this block. Specifically setting
4780 * RExC_recurse[] must happen at least once during
4783 RExC_recurse[ARG2L(scan)] = scan;
4784 start = RExC_open_parens[paren];
4785 end = RExC_close_parens[paren];
4787 /* NOTE we MUST always execute the above code, even
4788 * if we do nothing with a GOSUB */
4790 ( flags & SCF_IN_DEFINE )
4793 (is_inf_internal || is_inf || (data && data->flags & SF_IS_INF))
4795 ( (flags & (SCF_DO_STCLASS | SCF_DO_SUBSTR)) == 0 )
4798 /* no need to do anything here if we are in a define. */
4799 /* or we are after some kind of infinite construct
4800 * so we can skip recursing into this item.
4801 * Since it is infinite we will not change the maxlen
4802 * or delta, and if we miss something that might raise
4803 * the minlen it will merely pessimise a little.
4805 * Iow /(?(DEFINE)(?<foo>foo|food))a+(?&foo)/
4806 * might result in a minlen of 1 and not of 4,
4807 * but this doesn't make us mismatch, just try a bit
4808 * harder than we should.
4810 scan= regnext(scan);
4817 !PAREN_TEST(RExC_study_chunk_recursed + ((recursed_depth-1) * RExC_study_chunk_recursed_bytes), paren)
4819 /* it is quite possible that there are more efficient ways
4820 * to do this. We maintain a bitmap per level of recursion
4821 * of which patterns we have entered so we can detect if a
4822 * pattern creates a possible infinite loop. When we
4823 * recurse down a level we copy the previous levels bitmap
4824 * down. When we are at recursion level 0 we zero the top
4825 * level bitmap. It would be nice to implement a different
4826 * more efficient way of doing this. In particular the top
4827 * level bitmap may be unnecessary.
4829 if (!recursed_depth) {
4830 Zero(RExC_study_chunk_recursed, RExC_study_chunk_recursed_bytes, U8);
4832 Copy(RExC_study_chunk_recursed + ((recursed_depth-1) * RExC_study_chunk_recursed_bytes),
4833 RExC_study_chunk_recursed + (recursed_depth * RExC_study_chunk_recursed_bytes),
4834 RExC_study_chunk_recursed_bytes, U8);
4836 /* we havent recursed into this paren yet, so recurse into it */
4837 DEBUG_STUDYDATA("gosub-set", data, depth, is_inf);
4838 PAREN_SET(RExC_study_chunk_recursed + (recursed_depth * RExC_study_chunk_recursed_bytes), paren);
4839 my_recursed_depth= recursed_depth + 1;
4841 DEBUG_STUDYDATA("gosub-inf", data, depth, is_inf);
4842 /* some form of infinite recursion, assume infinite length
4844 if (flags & SCF_DO_SUBSTR) {
4845 scan_commit(pRExC_state, data, minlenp, is_inf);
4846 data->cur_is_floating = 1;
4848 is_inf = is_inf_internal = 1;
4849 if (flags & SCF_DO_STCLASS_OR) /* Allow everything */
4850 ssc_anything(data->start_class);
4851 flags &= ~SCF_DO_STCLASS;
4853 start= NULL; /* reset start so we dont recurse later on. */
4858 end = regnext(scan);
4861 scan_frame *newframe;
4863 if (!RExC_frame_last) {
4864 Newxz(newframe, 1, scan_frame);
4865 SAVEDESTRUCTOR_X(S_unwind_scan_frames, newframe);
4866 RExC_frame_head= newframe;
4868 } else if (!RExC_frame_last->next_frame) {
4869 Newxz(newframe,1,scan_frame);
4870 RExC_frame_last->next_frame= newframe;
4871 newframe->prev_frame= RExC_frame_last;
4874 newframe= RExC_frame_last->next_frame;
4876 RExC_frame_last= newframe;
4878 newframe->next_regnode = regnext(scan);
4879 newframe->last_regnode = last;
4880 newframe->stopparen = stopparen;
4881 newframe->prev_recursed_depth = recursed_depth;
4882 newframe->this_prev_frame= frame;
4884 DEBUG_STUDYDATA("frame-new", data, depth, is_inf);
4885 DEBUG_PEEP("fnew", scan, depth, flags);
4892 recursed_depth= my_recursed_depth;
4897 else if (OP(scan) == EXACT || OP(scan) == EXACTL) {
4898 SSize_t l = STR_LEN(scan);
4902 const U8 * const s = (U8*)STRING(scan);
4903 uc = utf8_to_uvchr_buf(s, s + l, NULL);
4904 l = utf8_length(s, s + l);
4906 uc = *((U8*)STRING(scan));
4909 if (flags & SCF_DO_SUBSTR) { /* Update longest substr. */
4910 /* The code below prefers earlier match for fixed
4911 offset, later match for variable offset. */
4912 if (data->last_end == -1) { /* Update the start info. */
4913 data->last_start_min = data->pos_min;
4914 data->last_start_max = is_inf
4915 ? SSize_t_MAX : data->pos_min + data->pos_delta;
4917 sv_catpvn(data->last_found, STRING(scan), STR_LEN(scan));
4919 SvUTF8_on(data->last_found);
4921 SV * const sv = data->last_found;
4922 MAGIC * const mg = SvUTF8(sv) && SvMAGICAL(sv) ?
4923 mg_find(sv, PERL_MAGIC_utf8) : NULL;
4924 if (mg && mg->mg_len >= 0)
4925 mg->mg_len += utf8_length((U8*)STRING(scan),
4926 (U8*)STRING(scan)+STR_LEN(scan));
4928 data->last_end = data->pos_min + l;
4929 data->pos_min += l; /* As in the first entry. */
4930 data->flags &= ~SF_BEFORE_EOL;
4933 /* ANDing the code point leaves at most it, and not in locale, and
4934 * can't match null string */
4935 if (flags & SCF_DO_STCLASS_AND) {
4936 ssc_cp_and(data->start_class, uc);
4937 ANYOF_FLAGS(data->start_class) &= ~SSC_MATCHES_EMPTY_STRING;
4938 ssc_clear_locale(data->start_class);
4940 else if (flags & SCF_DO_STCLASS_OR) {
4941 ssc_add_cp(data->start_class, uc);
4942 ssc_and(pRExC_state, data->start_class, (regnode_charclass *) and_withp);
4944 /* See commit msg 749e076fceedeb708a624933726e7989f2302f6a */
4945 ANYOF_FLAGS(data->start_class) &= ~SSC_MATCHES_EMPTY_STRING;
4947 flags &= ~SCF_DO_STCLASS;
4949 else if (PL_regkind[OP(scan)] == EXACT) {
4950 /* But OP != EXACT!, so is EXACTFish */
4951 SSize_t l = STR_LEN(scan);
4952 const U8 * s = (U8*)STRING(scan);
4954 /* Search for fixed substrings supports EXACT only. */
4955 if (flags & SCF_DO_SUBSTR) {
4957 scan_commit(pRExC_state, data, minlenp, is_inf);
4960 l = utf8_length(s, s + l);
4962 if (unfolded_multi_char) {
4963 RExC_seen |= REG_UNFOLDED_MULTI_SEEN;
4965 min += l - min_subtract;
4967 delta += min_subtract;
4968 if (flags & SCF_DO_SUBSTR) {
4969 data->pos_min += l - min_subtract;
4970 if (data->pos_min < 0) {
4973 data->pos_delta += min_subtract;
4975 data->cur_is_floating = 1; /* float */
4979 if (flags & SCF_DO_STCLASS) {
4980 SV* EXACTF_invlist = _make_exactf_invlist(pRExC_state, scan);
4982 assert(EXACTF_invlist);
4983 if (flags & SCF_DO_STCLASS_AND) {
4984 if (OP(scan) != EXACTFL)
4985 ssc_clear_locale(data->start_class);
4986 ANYOF_FLAGS(data->start_class) &= ~SSC_MATCHES_EMPTY_STRING;
4987 ANYOF_POSIXL_ZERO(data->start_class);
4988 ssc_intersection(data->start_class, EXACTF_invlist, FALSE);
4990 else { /* SCF_DO_STCLASS_OR */
4991 ssc_union(data->start_class, EXACTF_invlist, FALSE);
4992 ssc_and(pRExC_state, data->start_class, (regnode_charclass *) and_withp);
4994 /* See commit msg 749e076fceedeb708a624933726e7989f2302f6a */
4995 ANYOF_FLAGS(data->start_class) &= ~SSC_MATCHES_EMPTY_STRING;
4997 flags &= ~SCF_DO_STCLASS;
4998 SvREFCNT_dec(EXACTF_invlist);
5001 else if (REGNODE_VARIES(OP(scan))) {
5002 SSize_t mincount, maxcount, minnext, deltanext, pos_before = 0;
5003 I32 fl = 0, f = flags;
5004 regnode * const oscan = scan;
5005 regnode_ssc this_class;
5006 regnode_ssc *oclass = NULL;
5007 I32 next_is_eval = 0;
5009 switch (PL_regkind[OP(scan)]) {
5010 case WHILEM: /* End of (?:...)* . */
5011 scan = NEXTOPER(scan);
5014 if (flags & (SCF_DO_SUBSTR | SCF_DO_STCLASS)) {
5015 next = NEXTOPER(scan);
5016 if (OP(next) == EXACT
5017 || OP(next) == EXACTL
5018 || (flags & SCF_DO_STCLASS))
5021 maxcount = REG_INFTY;
5022 next = regnext(scan);
5023 scan = NEXTOPER(scan);
5027 if (flags & SCF_DO_SUBSTR)
5032 if (flags & SCF_DO_STCLASS) {
5034 maxcount = REG_INFTY;
5035 next = regnext(scan);
5036 scan = NEXTOPER(scan);
5039 if (flags & SCF_DO_SUBSTR) {
5040 scan_commit(pRExC_state, data, minlenp, is_inf);
5041 /* Cannot extend fixed substrings */
5042 data->cur_is_floating = 1; /* float */
5044 is_inf = is_inf_internal = 1;
5045 scan = regnext(scan);
5046 goto optimize_curly_tail;
5048 if (stopparen>0 && (OP(scan)==CURLYN || OP(scan)==CURLYM)
5049 && (scan->flags == stopparen))
5054 mincount = ARG1(scan);
5055 maxcount = ARG2(scan);
5057 next = regnext(scan);
5058 if (OP(scan) == CURLYX) {
5059 I32 lp = (data ? *(data->last_closep) : 0);
5060 scan->flags = ((lp <= (I32)U8_MAX) ? (U8)lp : U8_MAX);
5062 scan = NEXTOPER(scan) + EXTRA_STEP_2ARGS;
5063 next_is_eval = (OP(scan) == EVAL);
5065 if (flags & SCF_DO_SUBSTR) {
5067 scan_commit(pRExC_state, data, minlenp, is_inf);
5068 /* Cannot extend fixed substrings */
5069 pos_before = data->pos_min;
5073 data->flags &= ~(SF_HAS_PAR|SF_IN_PAR|SF_HAS_EVAL);
5075 data->flags |= SF_IS_INF;
5077 if (flags & SCF_DO_STCLASS) {
5078 ssc_init(pRExC_state, &this_class);
5079 oclass = data->start_class;
5080 data->start_class = &this_class;
5081 f |= SCF_DO_STCLASS_AND;
5082 f &= ~SCF_DO_STCLASS_OR;
5084 /* Exclude from super-linear cache processing any {n,m}
5085 regops for which the combination of input pos and regex
5086 pos is not enough information to determine if a match
5089 For example, in the regex /foo(bar\s*){4,8}baz/ with the
5090 regex pos at the \s*, the prospects for a match depend not
5091 only on the input position but also on how many (bar\s*)
5092 repeats into the {4,8} we are. */
5093 if ((mincount > 1) || (maxcount > 1 && maxcount != REG_INFTY))
5094 f &= ~SCF_WHILEM_VISITED_POS;
5096 /* This will finish on WHILEM, setting scan, or on NULL: */
5097 /* recurse study_chunk() on loop bodies */
5098 minnext = study_chunk(pRExC_state, &scan, minlenp, &deltanext,
5099 last, data, stopparen, recursed_depth, NULL,
5101 ? (f & ~SCF_DO_SUBSTR)
5105 if (flags & SCF_DO_STCLASS)
5106 data->start_class = oclass;
5107 if (mincount == 0 || minnext == 0) {
5108 if (flags & SCF_DO_STCLASS_OR) {
5109 ssc_or(pRExC_state, data->start_class, (regnode_charclass *) &this_class);
5111 else if (flags & SCF_DO_STCLASS_AND) {
5112 /* Switch to OR mode: cache the old value of
5113 * data->start_class */
5115 StructCopy(data->start_class, and_withp, regnode_ssc);
5116 flags &= ~SCF_DO_STCLASS_AND;
5117 StructCopy(&this_class, data->start_class, regnode_ssc);
5118 flags |= SCF_DO_STCLASS_OR;
5119 ANYOF_FLAGS(data->start_class)
5120 |= SSC_MATCHES_EMPTY_STRING;
5122 } else { /* Non-zero len */
5123 if (flags & SCF_DO_STCLASS_OR) {
5124 ssc_or(pRExC_state, data->start_class, (regnode_charclass *) &this_class);
5125 ssc_and(pRExC_state, data->start_class, (regnode_charclass *) and_withp);
5127 else if (flags & SCF_DO_STCLASS_AND)
5128 ssc_and(pRExC_state, data->start_class, (regnode_charclass *) &this_class);
5129 flags &= ~SCF_DO_STCLASS;
5131 if (!scan) /* It was not CURLYX, but CURLY. */
5133 if (((flags & (SCF_TRIE_DOING_RESTUDY|SCF_DO_SUBSTR))==SCF_DO_SUBSTR)
5134 /* ? quantifier ok, except for (?{ ... }) */
5135 && (next_is_eval || !(mincount == 0 && maxcount == 1))
5136 && (minnext == 0) && (deltanext == 0)
5137 && data && !(data->flags & (SF_HAS_PAR|SF_IN_PAR))
5138 && maxcount <= REG_INFTY/3) /* Complement check for big
5141 /* Fatal warnings may leak the regexp without this: */
5142 SAVEFREESV(RExC_rx_sv);
5143 Perl_ck_warner(aTHX_ packWARN(WARN_REGEXP),
5144 "Quantifier unexpected on zero-length expression "
5145 "in regex m/%" UTF8f "/",
5146 UTF8fARG(UTF, RExC_precomp_end - RExC_precomp,
5148 (void)ReREFCNT_inc(RExC_rx_sv);
5151 min += minnext * mincount;
5152 is_inf_internal |= deltanext == SSize_t_MAX
5153 || (maxcount == REG_INFTY && minnext + deltanext > 0);
5154 is_inf |= is_inf_internal;
5156 delta = SSize_t_MAX;
5158 delta += (minnext + deltanext) * maxcount
5159 - minnext * mincount;
5161 /* Try powerful optimization CURLYX => CURLYN. */
5162 if ( OP(oscan) == CURLYX && data
5163 && data->flags & SF_IN_PAR
5164 && !(data->flags & SF_HAS_EVAL)
5165 && !deltanext && minnext == 1 ) {
5166 /* Try to optimize to CURLYN. */
5167 regnode *nxt = NEXTOPER(oscan) + EXTRA_STEP_2ARGS;
5168 regnode * const nxt1 = nxt;
5175 if (!REGNODE_SIMPLE(OP(nxt))
5176 && !(PL_regkind[OP(nxt)] == EXACT
5177 && STR_LEN(nxt) == 1))
5183 if (OP(nxt) != CLOSE)
5185 if (RExC_open_parens) {
5186 RExC_open_parens[ARG(nxt1)]=oscan; /*open->CURLYM*/
5187 RExC_close_parens[ARG(nxt1)]=nxt+2; /*close->while*/
5189 /* Now we know that nxt2 is the only contents: */
5190 oscan->flags = (U8)ARG(nxt);
5192 OP(nxt1) = NOTHING; /* was OPEN. */
5195 OP(nxt1 + 1) = OPTIMIZED; /* was count. */
5196 NEXT_OFF(nxt1+ 1) = 0; /* just for consistency. */
5197 NEXT_OFF(nxt2) = 0; /* just for consistency with CURLY. */
5198 OP(nxt) = OPTIMIZED; /* was CLOSE. */
5199 OP(nxt + 1) = OPTIMIZED; /* was count. */
5200 NEXT_OFF(nxt+ 1) = 0; /* just for consistency. */
5205 /* Try optimization CURLYX => CURLYM. */
5206 if ( OP(oscan) == CURLYX && data
5207 && !(data->flags & SF_HAS_PAR)
5208 && !(data->flags & SF_HAS_EVAL)
5209 && !deltanext /* atom is fixed width */
5210 && minnext != 0 /* CURLYM can't handle zero width */
5212 /* Nor characters whose fold at run-time may be
5213 * multi-character */
5214 && ! (RExC_seen & REG_UNFOLDED_MULTI_SEEN)
5216 /* XXXX How to optimize if data == 0? */
5217 /* Optimize to a simpler form. */
5218 regnode *nxt = NEXTOPER(oscan) + EXTRA_STEP_2ARGS; /* OPEN */
5222 while ( (nxt2 = regnext(nxt)) /* skip over embedded stuff*/
5223 && (OP(nxt2) != WHILEM))
5225 OP(nxt2) = SUCCEED; /* Whas WHILEM */
5226 /* Need to optimize away parenths. */
5227 if ((data->flags & SF_IN_PAR) && OP(nxt) == CLOSE) {
5228 /* Set the parenth number. */
5229 regnode *nxt1 = NEXTOPER(oscan) + EXTRA_STEP_2ARGS; /* OPEN*/
5231 oscan->flags = (U8)ARG(nxt);
5232 if (RExC_open_parens) {
5233 RExC_open_parens[ARG(nxt1)]=oscan; /*open->CURLYM*/
5234 RExC_close_parens[ARG(nxt1)]=nxt2+1; /*close->NOTHING*/
5236 OP(nxt1) = OPTIMIZED; /* was OPEN. */
5237 OP(nxt) = OPTIMIZED; /* was CLOSE. */
5240 OP(nxt1 + 1) = OPTIMIZED; /* was count. */
5241 OP(nxt + 1) = OPTIMIZED; /* was count. */
5242 NEXT_OFF(nxt1 + 1) = 0; /* just for consistency. */
5243 NEXT_OFF(nxt + 1) = 0; /* just for consistency. */
5246 while ( nxt1 && (OP(nxt1) != WHILEM)) {
5247 regnode *nnxt = regnext(nxt1);
5249 if (reg_off_by_arg[OP(nxt1)])
5250 ARG_SET(nxt1, nxt2 - nxt1);
5251 else if (nxt2 - nxt1 < U16_MAX)
5252 NEXT_OFF(nxt1) = nxt2 - nxt1;
5254 OP(nxt) = NOTHING; /* Cannot beautify */
5259 /* Optimize again: */
5260 /* recurse study_chunk() on optimised CURLYX => CURLYM */
5261 study_chunk(pRExC_state, &nxt1, minlenp, &deltanext, nxt,
5262 NULL, stopparen, recursed_depth, NULL, 0,depth+1);
5267 else if ((OP(oscan) == CURLYX)
5268 && (flags & SCF_WHILEM_VISITED_POS)
5269 /* See the comment on a similar expression above.
5270 However, this time it's not a subexpression
5271 we care about, but the expression itself. */
5272 && (maxcount == REG_INFTY)
5274 /* This stays as CURLYX, we can put the count/of pair. */
5275 /* Find WHILEM (as in regexec.c) */
5276 regnode *nxt = oscan + NEXT_OFF(oscan);
5278 if (OP(PREVOPER(nxt)) == NOTHING) /* LONGJMP */
5280 nxt = PREVOPER(nxt);
5281 if (nxt->flags & 0xf) {
5282 /* we've already set whilem count on this node */
5283 } else if (++data->whilem_c < 16) {
5284 assert(data->whilem_c <= RExC_whilem_seen);
5285 nxt->flags = (U8)(data->whilem_c
5286 | (RExC_whilem_seen << 4)); /* On WHILEM */
5289 if (data && fl & (SF_HAS_PAR|SF_IN_PAR))
5291 if (flags & SCF_DO_SUBSTR) {
5292 SV *last_str = NULL;
5293 STRLEN last_chrs = 0;
5294 int counted = mincount != 0;
5296 if (data->last_end > 0 && mincount != 0) { /* Ends with a
5298 SSize_t b = pos_before >= data->last_start_min
5299 ? pos_before : data->last_start_min;
5301 const char * const s = SvPV_const(data->last_found, l);
5302 SSize_t old = b - data->last_start_min;
5305 old = utf8_hop((U8*)s, old) - (U8*)s;
5307 /* Get the added string: */
5308 last_str = newSVpvn_utf8(s + old, l, UTF);
5309 last_chrs = UTF ? utf8_length((U8*)(s + old),
5310 (U8*)(s + old + l)) : l;
5311 if (deltanext == 0 && pos_before == b) {
5312 /* What was added is a constant string */
5315 SvGROW(last_str, (mincount * l) + 1);
5316 repeatcpy(SvPVX(last_str) + l,
5317 SvPVX_const(last_str), l,
5319 SvCUR_set(last_str, SvCUR(last_str) * mincount);
5320 /* Add additional parts. */
5321 SvCUR_set(data->last_found,
5322 SvCUR(data->last_found) - l);
5323 sv_catsv(data->last_found, last_str);
5325 SV * sv = data->last_found;
5327 SvUTF8(sv) && SvMAGICAL(sv) ?
5328 mg_find(sv, PERL_MAGIC_utf8) : NULL;
5329 if (mg && mg->mg_len >= 0)
5330 mg->mg_len += last_chrs * (mincount-1);
5332 last_chrs *= mincount;
5333 data->last_end += l * (mincount - 1);
5336 /* start offset must point into the last copy */
5337 data->last_start_min += minnext * (mincount - 1);
5338 data->last_start_max =
5341 : data->last_start_max +
5342 (maxcount - 1) * (minnext + data->pos_delta);
5345 /* It is counted once already... */
5346 data->pos_min += minnext * (mincount - counted);
5348 Perl_re_printf( aTHX_ "counted=%" UVuf " deltanext=%" UVuf
5349 " SSize_t_MAX=%" UVuf " minnext=%" UVuf
5350 " maxcount=%" UVuf " mincount=%" UVuf "\n",
5351 (UV)counted, (UV)deltanext, (UV)SSize_t_MAX, (UV)minnext, (UV)maxcount,
5353 if (deltanext != SSize_t_MAX)
5354 Perl_re_printf( aTHX_ "LHS=%" UVuf " RHS=%" UVuf "\n",
5355 (UV)(-counted * deltanext + (minnext + deltanext) * maxcount
5356 - minnext * mincount), (UV)(SSize_t_MAX - data->pos_delta));
5358 if (deltanext == SSize_t_MAX
5359 || -counted * deltanext + (minnext + deltanext) * maxcount - minnext * mincount >= SSize_t_MAX - data->pos_delta)
5360 data->pos_delta = SSize_t_MAX;
5362 data->pos_delta += - counted * deltanext +
5363 (minnext + deltanext) * maxcount - minnext * mincount;
5364 if (mincount != maxcount) {
5365 /* Cannot extend fixed substrings found inside
5367 scan_commit(pRExC_state, data, minlenp, is_inf);
5368 if (mincount && last_str) {
5369 SV * const sv = data->last_found;
5370 MAGIC * const mg = SvUTF8(sv) && SvMAGICAL(sv) ?
5371 mg_find(sv, PERL_MAGIC_utf8) : NULL;
5375 sv_setsv(sv, last_str);
5376 data->last_end = data->pos_min;
5377 data->last_start_min = data->pos_min - last_chrs;
5378 data->last_start_max = is_inf
5380 : data->pos_min + data->pos_delta - last_chrs;
5382 data->cur_is_floating = 1; /* float */
5384 SvREFCNT_dec(last_str);
5386 if (data && (fl & SF_HAS_EVAL))
5387 data->flags |= SF_HAS_EVAL;
5388 optimize_curly_tail:
5389 if (OP(oscan) != CURLYX) {
5390 while (PL_regkind[OP(next = regnext(oscan))] == NOTHING
5392 NEXT_OFF(oscan) += NEXT_OFF(next);
5398 Perl_croak(aTHX_ "panic: unexpected varying REx opcode %d",
5403 if (flags & SCF_DO_SUBSTR) {
5404 /* Cannot expect anything... */
5405 scan_commit(pRExC_state, data, minlenp, is_inf);
5406 data->cur_is_floating = 1; /* float */
5408 is_inf = is_inf_internal = 1;
5409 if (flags & SCF_DO_STCLASS_OR) {
5410 if (OP(scan) == CLUMP) {
5411 /* Actually is any start char, but very few code points
5412 * aren't start characters */
5413 ssc_match_all_cp(data->start_class);
5416 ssc_anything(data->start_class);
5419 flags &= ~SCF_DO_STCLASS;
5423 else if (OP(scan) == LNBREAK) {
5424 if (flags & SCF_DO_STCLASS) {
5425 if (flags & SCF_DO_STCLASS_AND) {
5426 ssc_intersection(data->start_class,
5427 PL_XPosix_ptrs[_CC_VERTSPACE], FALSE);
5428 ssc_clear_locale(data->start_class);
5429 ANYOF_FLAGS(data->start_class)
5430 &= ~SSC_MATCHES_EMPTY_STRING;
5432 else if (flags & SCF_DO_STCLASS_OR) {
5433 ssc_union(data->start_class,
5434 PL_XPosix_ptrs[_CC_VERTSPACE],
5436 ssc_and(pRExC_state, data->start_class, (regnode_charclass *) and_withp);
5438 /* See commit msg for
5439 * 749e076fceedeb708a624933726e7989f2302f6a */
5440 ANYOF_FLAGS(data->start_class)
5441 &= ~SSC_MATCHES_EMPTY_STRING;
5443 flags &= ~SCF_DO_STCLASS;
5446 if (delta != SSize_t_MAX)
5447 delta++; /* Because of the 2 char string cr-lf */
5448 if (flags & SCF_DO_SUBSTR) {
5449 /* Cannot expect anything... */
5450 scan_commit(pRExC_state, data, minlenp, is_inf);
5452 data->pos_delta += 1;
5453 data->cur_is_floating = 1; /* float */
5456 else if (REGNODE_SIMPLE(OP(scan))) {
5458 if (flags & SCF_DO_SUBSTR) {
5459 scan_commit(pRExC_state, data, minlenp, is_inf);
5463 if (flags & SCF_DO_STCLASS) {
5465 SV* my_invlist = NULL;
5468 /* See commit msg 749e076fceedeb708a624933726e7989f2302f6a */
5469 ANYOF_FLAGS(data->start_class) &= ~SSC_MATCHES_EMPTY_STRING;
5471 /* Some of the logic below assumes that switching
5472 locale on will only add false positives. */
5477 Perl_croak(aTHX_ "panic: unexpected simple REx opcode %d",
5481 if (flags & SCF_DO_STCLASS_OR) /* Allow everything */
5482 ssc_match_all_cp(data->start_class);
5487 SV* REG_ANY_invlist = _new_invlist(2);
5488 REG_ANY_invlist = add_cp_to_invlist(REG_ANY_invlist,
5490 if (flags & SCF_DO_STCLASS_OR) {
5491 ssc_union(data->start_class,
5493 TRUE /* TRUE => invert, hence all but \n
5497 else if (flags & SCF_DO_STCLASS_AND) {
5498 ssc_intersection(data->start_class,
5500 TRUE /* TRUE => invert */
5502 ssc_clear_locale(data->start_class);
5504 SvREFCNT_dec_NN(REG_ANY_invlist);
5511 if (flags & SCF_DO_STCLASS_AND)
5512 ssc_and(pRExC_state, data->start_class,
5513 (regnode_charclass *) scan);
5515 ssc_or(pRExC_state, data->start_class,
5516 (regnode_charclass *) scan);
5524 namedclass = classnum_to_namedclass(FLAGS(scan)) + invert;
5525 if (flags & SCF_DO_STCLASS_AND) {
5526 bool was_there = cBOOL(
5527 ANYOF_POSIXL_TEST(data->start_class,
5529 ANYOF_POSIXL_ZERO(data->start_class);
5530 if (was_there) { /* Do an AND */
5531 ANYOF_POSIXL_SET(data->start_class, namedclass);
5533 /* No individual code points can now match */
5534 data->start_class->invlist
5535 = sv_2mortal(_new_invlist(0));
5538 int complement = namedclass + ((invert) ? -1 : 1);
5540 assert(flags & SCF_DO_STCLASS_OR);
5542 /* If the complement of this class was already there,
5543 * the result is that they match all code points,
5544 * (\d + \D == everything). Remove the classes from
5545 * future consideration. Locale is not relevant in
5547 if (ANYOF_POSIXL_TEST(data->start_class, complement)) {
5548 ssc_match_all_cp(data->start_class);
5549 ANYOF_POSIXL_CLEAR(data->start_class, namedclass);
5550 ANYOF_POSIXL_CLEAR(data->start_class, complement);
5552 else { /* The usual case; just add this class to the
5554 ANYOF_POSIXL_SET(data->start_class, namedclass);
5563 my_invlist = invlist_clone(PL_XPosix_ptrs[_CC_ASCII]);
5565 /* This can be handled as a Posix class */
5566 goto join_posix_and_ascii;
5568 case NPOSIXA: /* For these, we always know the exact set of
5573 assert(FLAGS(scan) != _CC_ASCII);
5574 _invlist_intersection(PL_XPosix_ptrs[FLAGS(scan)],
5575 PL_XPosix_ptrs[_CC_ASCII],
5577 goto join_posix_and_ascii;
5585 my_invlist = invlist_clone(PL_XPosix_ptrs[FLAGS(scan)]);
5587 /* NPOSIXD matches all upper Latin1 code points unless the
5588 * target string being matched is UTF-8, which is
5589 * unknowable until match time. Since we are going to
5590 * invert, we want to get rid of all of them so that the
5591 * inversion will match all */
5592 if (OP(scan) == NPOSIXD) {
5593 _invlist_subtract(my_invlist, PL_UpperLatin1,
5597 join_posix_and_ascii:
5599 if (flags & SCF_DO_STCLASS_AND) {
5600 ssc_intersection(data->start_class, my_invlist, invert);
5601 ssc_clear_locale(data->start_class);
5604 assert(flags & SCF_DO_STCLASS_OR);
5605 ssc_union(data->start_class, my_invlist, invert);
5607 SvREFCNT_dec(my_invlist);
5609 if (flags & SCF_DO_STCLASS_OR)
5610 ssc_and(pRExC_state, data->start_class, (regnode_charclass *) and_withp);
5611 flags &= ~SCF_DO_STCLASS;
5614 else if (PL_regkind[OP(scan)] == EOL && flags & SCF_DO_SUBSTR) {
5615 data->flags |= (OP(scan) == MEOL
5618 scan_commit(pRExC_state, data, minlenp, is_inf);
5621 else if ( PL_regkind[OP(scan)] == BRANCHJ
5622 /* Lookbehind, or need to calculate parens/evals/stclass: */
5623 && (scan->flags || data || (flags & SCF_DO_STCLASS))
5624 && (OP(scan) == IFMATCH || OP(scan) == UNLESSM))
5626 if ( !PERL_ENABLE_POSITIVE_ASSERTION_STUDY
5627 || OP(scan) == UNLESSM )
5629 /* Negative Lookahead/lookbehind
5630 In this case we can't do fixed string optimisation.
5633 SSize_t deltanext, minnext, fake = 0;
5638 StructCopy(&zero_scan_data, &data_fake, scan_data_t);
5640 data_fake.whilem_c = data->whilem_c;
5641 data_fake.last_closep = data->last_closep;
5644 data_fake.last_closep = &fake;
5645 data_fake.pos_delta = delta;
5646 if ( flags & SCF_DO_STCLASS && !scan->flags
5647 && OP(scan) == IFMATCH ) { /* Lookahead */
5648 ssc_init(pRExC_state, &intrnl);
5649 data_fake.start_class = &intrnl;
5650 f |= SCF_DO_STCLASS_AND;
5652 if (flags & SCF_WHILEM_VISITED_POS)
5653 f |= SCF_WHILEM_VISITED_POS;
5654 next = regnext(scan);
5655 nscan = NEXTOPER(NEXTOPER(scan));
5657 /* recurse study_chunk() for lookahead body */
5658 minnext = study_chunk(pRExC_state, &nscan, minlenp, &deltanext,
5659 last, &data_fake, stopparen,
5660 recursed_depth, NULL, f, depth+1);
5663 FAIL("Variable length lookbehind not implemented");
5665 else if (minnext > (I32)U8_MAX) {
5666 FAIL2("Lookbehind longer than %" UVuf " not implemented",
5669 scan->flags = (U8)minnext;
5672 if (data_fake.flags & (SF_HAS_PAR|SF_IN_PAR))
5674 if (data_fake.flags & SF_HAS_EVAL)
5675 data->flags |= SF_HAS_EVAL;
5676 data->whilem_c = data_fake.whilem_c;
5678 if (f & SCF_DO_STCLASS_AND) {
5679 if (flags & SCF_DO_STCLASS_OR) {
5680 /* OR before, AND after: ideally we would recurse with
5681 * data_fake to get the AND applied by study of the
5682 * remainder of the pattern, and then derecurse;
5683 * *** HACK *** for now just treat as "no information".
5684 * See [perl #56690].
5686 ssc_init(pRExC_state, data->start_class);
5688 /* AND before and after: combine and continue. These
5689 * assertions are zero-length, so can match an EMPTY
5691 ssc_and(pRExC_state, data->start_class, (regnode_charclass *) &intrnl);
5692 ANYOF_FLAGS(data->start_class)
5693 |= SSC_MATCHES_EMPTY_STRING;
5697 #if PERL_ENABLE_POSITIVE_ASSERTION_STUDY
5699 /* Positive Lookahead/lookbehind
5700 In this case we can do fixed string optimisation,
5701 but we must be careful about it. Note in the case of
5702 lookbehind the positions will be offset by the minimum
5703 length of the pattern, something we won't know about
5704 until after the recurse.
5706 SSize_t deltanext, fake = 0;
5710 /* We use SAVEFREEPV so that when the full compile
5711 is finished perl will clean up the allocated
5712 minlens when it's all done. This way we don't
5713 have to worry about freeing them when we know
5714 they wont be used, which would be a pain.
5717 Newx( minnextp, 1, SSize_t );
5718 SAVEFREEPV(minnextp);
5721 StructCopy(data, &data_fake, scan_data_t);
5722 if ((flags & SCF_DO_SUBSTR) && data->last_found) {
5725 scan_commit(pRExC_state, &data_fake, minlenp, is_inf);
5726 data_fake.last_found=newSVsv(data->last_found);
5730 data_fake.last_closep = &fake;
5731 data_fake.flags = 0;
5732 data_fake.substrs[0].flags = 0;
5733 data_fake.substrs[1].flags = 0;
5734 data_fake.pos_delta = delta;
5736 data_fake.flags |= SF_IS_INF;
5737 if ( flags & SCF_DO_STCLASS && !scan->flags
5738 && OP(scan) == IFMATCH ) { /* Lookahead */
5739 ssc_init(pRExC_state, &intrnl);
5740 data_fake.start_class = &intrnl;
5741 f |= SCF_DO_STCLASS_AND;
5743 if (flags & SCF_WHILEM_VISITED_POS)
5744 f |= SCF_WHILEM_VISITED_POS;
5745 next = regnext(scan);
5746 nscan = NEXTOPER(NEXTOPER(scan));
5748 /* positive lookahead study_chunk() recursion */
5749 *minnextp = study_chunk(pRExC_state, &nscan, minnextp,
5750 &deltanext, last, &data_fake,
5751 stopparen, recursed_depth, NULL,
5755 FAIL("Variable length lookbehind not implemented");
5757 else if (*minnextp > (I32)U8_MAX) {
5758 FAIL2("Lookbehind longer than %" UVuf " not implemented",
5761 scan->flags = (U8)*minnextp;
5766 if (f & SCF_DO_STCLASS_AND) {
5767 ssc_and(pRExC_state, data->start_class, (regnode_charclass *) &intrnl);
5768 ANYOF_FLAGS(data->start_class) |= SSC_MATCHES_EMPTY_STRING;
5771 if (data_fake.flags & (SF_HAS_PAR|SF_IN_PAR))
5773 if (data_fake.flags & SF_HAS_EVAL)
5774 data->flags |= SF_HAS_EVAL;
5775 data->whilem_c = data_fake.whilem_c;
5776 if ((flags & SCF_DO_SUBSTR) && data_fake.last_found) {
5778 if (RExC_rx->minlen<*minnextp)
5779 RExC_rx->minlen=*minnextp;
5780 scan_commit(pRExC_state, &data_fake, minnextp, is_inf);
5781 SvREFCNT_dec_NN(data_fake.last_found);
5783 for (i = 0; i < 2; i++) {
5784 if (data_fake.substrs[i].minlenp != minlenp) {
5785 data->substrs[i].min_offset =
5786 data_fake.substrs[i].min_offset;
5787 data->substrs[i].max_offset =
5788 data_fake.substrs[i].max_offset;
5789 data->substrs[i].minlenp =
5790 data_fake.substrs[i].minlenp;
5791 data->substrs[i].lookbehind += scan->flags;
5800 else if (OP(scan) == OPEN) {
5801 if (stopparen != (I32)ARG(scan))
5804 else if (OP(scan) == CLOSE) {
5805 if (stopparen == (I32)ARG(scan)) {
5808 if ((I32)ARG(scan) == is_par) {
5809 next = regnext(scan);
5811 if ( next && (OP(next) != WHILEM) && next < last)
5812 is_par = 0; /* Disable optimization */
5815 *(data->last_closep) = ARG(scan);
5817 else if (OP(scan) == EVAL) {
5819 data->flags |= SF_HAS_EVAL;
5821 else if ( PL_regkind[OP(scan)] == ENDLIKE ) {
5822 if (flags & SCF_DO_SUBSTR) {
5823 scan_commit(pRExC_state, data, minlenp, is_inf);
5824 flags &= ~SCF_DO_SUBSTR;
5826 if (data && OP(scan)==ACCEPT) {
5827 data->flags |= SCF_SEEN_ACCEPT;
5832 else if (OP(scan) == LOGICAL && scan->flags == 2) /* Embedded follows */
5834 if (flags & SCF_DO_SUBSTR) {
5835 scan_commit(pRExC_state, data, minlenp, is_inf);
5836 data->cur_is_floating = 1; /* float */
5838 is_inf = is_inf_internal = 1;
5839 if (flags & SCF_DO_STCLASS_OR) /* Allow everything */
5840 ssc_anything(data->start_class);
5841 flags &= ~SCF_DO_STCLASS;
5843 else if (OP(scan) == GPOS) {
5844 if (!(RExC_rx->intflags & PREGf_GPOS_FLOAT) &&
5845 !(delta || is_inf || (data && data->pos_delta)))
5847 if (!(RExC_rx->intflags & PREGf_ANCH) && (flags & SCF_DO_SUBSTR))
5848 RExC_rx->intflags |= PREGf_ANCH_GPOS;
5849 if (RExC_rx->gofs < (STRLEN)min)
5850 RExC_rx->gofs = min;
5852 RExC_rx->intflags |= PREGf_GPOS_FLOAT;
5856 #ifdef TRIE_STUDY_OPT
5857 #ifdef FULL_TRIE_STUDY
5858 else if (PL_regkind[OP(scan)] == TRIE) {
5859 /* NOTE - There is similar code to this block above for handling
5860 BRANCH nodes on the initial study. If you change stuff here
5862 regnode *trie_node= scan;
5863 regnode *tail= regnext(scan);
5864 reg_trie_data *trie = (reg_trie_data*)RExC_rxi->data->data[ ARG(scan) ];
5865 SSize_t max1 = 0, min1 = SSize_t_MAX;
5868 if (flags & SCF_DO_SUBSTR) { /* XXXX Add !SUSPEND? */
5869 /* Cannot merge strings after this. */
5870 scan_commit(pRExC_state, data, minlenp, is_inf);
5872 if (flags & SCF_DO_STCLASS)
5873 ssc_init_zero(pRExC_state, &accum);
5879 const regnode *nextbranch= NULL;
5882 for ( word=1 ; word <= trie->wordcount ; word++)
5884 SSize_t deltanext=0, minnext=0, f = 0, fake;
5885 regnode_ssc this_class;
5887 StructCopy(&zero_scan_data, &data_fake, scan_data_t);
5889 data_fake.whilem_c = data->whilem_c;
5890 data_fake.last_closep = data->last_closep;
5893 data_fake.last_closep = &fake;
5894 data_fake.pos_delta = delta;
5895 if (flags & SCF_DO_STCLASS) {
5896 ssc_init(pRExC_state, &this_class);
5897 data_fake.start_class = &this_class;
5898 f = SCF_DO_STCLASS_AND;
5900 if (flags & SCF_WHILEM_VISITED_POS)
5901 f |= SCF_WHILEM_VISITED_POS;
5903 if (trie->jump[word]) {
5905 nextbranch = trie_node + trie->jump[0];
5906 scan= trie_node + trie->jump[word];
5907 /* We go from the jump point to the branch that follows
5908 it. Note this means we need the vestigal unused
5909 branches even though they arent otherwise used. */
5910 /* optimise study_chunk() for TRIE */
5911 minnext = study_chunk(pRExC_state, &scan, minlenp,
5912 &deltanext, (regnode *)nextbranch, &data_fake,
5913 stopparen, recursed_depth, NULL, f,depth+1);
5915 if (nextbranch && PL_regkind[OP(nextbranch)]==BRANCH)
5916 nextbranch= regnext((regnode*)nextbranch);
5918 if (min1 > (SSize_t)(minnext + trie->minlen))
5919 min1 = minnext + trie->minlen;
5920 if (deltanext == SSize_t_MAX) {
5921 is_inf = is_inf_internal = 1;
5923 } else if (max1 < (SSize_t)(minnext + deltanext + trie->maxlen))
5924 max1 = minnext + deltanext + trie->maxlen;
5926 if (data_fake.flags & (SF_HAS_PAR|SF_IN_PAR))
5928 if (data_fake.flags & SCF_SEEN_ACCEPT) {
5929 if ( stopmin > min + min1)
5930 stopmin = min + min1;
5931 flags &= ~SCF_DO_SUBSTR;
5933 data->flags |= SCF_SEEN_ACCEPT;
5936 if (data_fake.flags & SF_HAS_EVAL)
5937 data->flags |= SF_HAS_EVAL;
5938 data->whilem_c = data_fake.whilem_c;
5940 if (flags & SCF_DO_STCLASS)
5941 ssc_or(pRExC_state, &accum, (regnode_charclass *) &this_class);
5944 if (flags & SCF_DO_SUBSTR) {
5945 data->pos_min += min1;
5946 data->pos_delta += max1 - min1;
5947 if (max1 != min1 || is_inf)
5948 data->cur_is_floating = 1; /* float */
5951 if (delta != SSize_t_MAX) {
5952 if (SSize_t_MAX - (max1 - min1) >= delta)
5953 delta += max1 - min1;
5955 delta = SSize_t_MAX;
5957 if (flags & SCF_DO_STCLASS_OR) {
5958 ssc_or(pRExC_state, data->start_class, (regnode_charclass *) &accum);
5960 ssc_and(pRExC_state, data->start_class, (regnode_charclass *) and_withp);
5961 flags &= ~SCF_DO_STCLASS;
5964 else if (flags & SCF_DO_STCLASS_AND) {
5966 ssc_and(pRExC_state, data->start_class, (regnode_charclass *) &accum);
5967 flags &= ~SCF_DO_STCLASS;
5970 /* Switch to OR mode: cache the old value of
5971 * data->start_class */
5973 StructCopy(data->start_class, and_withp, regnode_ssc);
5974 flags &= ~SCF_DO_STCLASS_AND;
5975 StructCopy(&accum, data->start_class, regnode_ssc);
5976 flags |= SCF_DO_STCLASS_OR;
5983 else if (PL_regkind[OP(scan)] == TRIE) {
5984 reg_trie_data *trie = (reg_trie_data*)RExC_rxi->data->data[ ARG(scan) ];
5987 min += trie->minlen;
5988 delta += (trie->maxlen - trie->minlen);
5989 flags &= ~SCF_DO_STCLASS; /* xxx */
5990 if (flags & SCF_DO_SUBSTR) {
5991 /* Cannot expect anything... */
5992 scan_commit(pRExC_state, data, minlenp, is_inf);
5993 data->pos_min += trie->minlen;
5994 data->pos_delta += (trie->maxlen - trie->minlen);
5995 if (trie->maxlen != trie->minlen)
5996 data->cur_is_floating = 1; /* float */
5998 if (trie->jump) /* no more substrings -- for now /grr*/
5999 flags &= ~SCF_DO_SUBSTR;
6001 #endif /* old or new */
6002 #endif /* TRIE_STUDY_OPT */
6004 /* Else: zero-length, ignore. */
6005 scan = regnext(scan);
6010 /* we need to unwind recursion. */
6013 DEBUG_STUDYDATA("frame-end", data, depth, is_inf);
6014 DEBUG_PEEP("fend", scan, depth, flags);
6016 /* restore previous context */
6017 last = frame->last_regnode;
6018 scan = frame->next_regnode;
6019 stopparen = frame->stopparen;
6020 recursed_depth = frame->prev_recursed_depth;
6022 RExC_frame_last = frame->prev_frame;
6023 frame = frame->this_prev_frame;
6024 goto fake_study_recurse;
6028 DEBUG_STUDYDATA("pre-fin", data, depth, is_inf);
6031 *deltap = is_inf_internal ? SSize_t_MAX : delta;
6033 if (flags & SCF_DO_SUBSTR && is_inf)
6034 data->pos_delta = SSize_t_MAX - data->pos_min;
6035 if (is_par > (I32)U8_MAX)
6037 if (is_par && pars==1 && data) {
6038 data->flags |= SF_IN_PAR;
6039 data->flags &= ~SF_HAS_PAR;
6041 else if (pars && data) {
6042 data->flags |= SF_HAS_PAR;
6043 data->flags &= ~SF_IN_PAR;
6045 if (flags & SCF_DO_STCLASS_OR)
6046 ssc_and(pRExC_state, data->start_class, (regnode_charclass *) and_withp);
6047 if (flags & SCF_TRIE_RESTUDY)
6048 data->flags |= SCF_TRIE_RESTUDY;
6050 DEBUG_STUDYDATA("post-fin", data, depth, is_inf);
6053 SSize_t final_minlen= min < stopmin ? min : stopmin;
6055 if (!(RExC_seen & REG_UNBOUNDED_QUANTIFIER_SEEN)) {
6056 if (final_minlen > SSize_t_MAX - delta)
6057 RExC_maxlen = SSize_t_MAX;
6058 else if (RExC_maxlen < final_minlen + delta)
6059 RExC_maxlen = final_minlen + delta;
6061 return final_minlen;
6063 NOT_REACHED; /* NOTREACHED */
6067 S_add_data(RExC_state_t* const pRExC_state, const char* const s, const U32 n)
6069 U32 count = RExC_rxi->data ? RExC_rxi->data->count : 0;
6071 PERL_ARGS_ASSERT_ADD_DATA;
6073 Renewc(RExC_rxi->data,
6074 sizeof(*RExC_rxi->data) + sizeof(void*) * (count + n - 1),
6075 char, struct reg_data);
6077 Renew(RExC_rxi->data->what, count + n, U8);
6079 Newx(RExC_rxi->data->what, n, U8);
6080 RExC_rxi->data->count = count + n;
6081 Copy(s, RExC_rxi->data->what + count, n, U8);
6085 /*XXX: todo make this not included in a non debugging perl, but appears to be
6086 * used anyway there, in 'use re' */
6087 #ifndef PERL_IN_XSUB_RE
6089 Perl_reginitcolors(pTHX)
6091 const char * const s = PerlEnv_getenv("PERL_RE_COLORS");
6093 char *t = savepv(s);
6097 t = strchr(t, '\t');
6103 PL_colors[i] = t = (char *)"";
6108 PL_colors[i++] = (char *)"";
6115 #ifdef TRIE_STUDY_OPT
6116 #define CHECK_RESTUDY_GOTO_butfirst(dOsomething) \
6119 (data.flags & SCF_TRIE_RESTUDY) \
6127 #define CHECK_RESTUDY_GOTO_butfirst
6131 * pregcomp - compile a regular expression into internal code
6133 * Decides which engine's compiler to call based on the hint currently in
6137 #ifndef PERL_IN_XSUB_RE
6139 /* return the currently in-scope regex engine (or the default if none) */
6141 regexp_engine const *
6142 Perl_current_re_engine(pTHX)
6144 if (IN_PERL_COMPILETIME) {
6145 HV * const table = GvHV(PL_hintgv);
6148 if (!table || !(PL_hints & HINT_LOCALIZE_HH))
6149 return &PL_core_reg_engine;
6150 ptr = hv_fetchs(table, "regcomp", FALSE);
6151 if ( !(ptr && SvIOK(*ptr) && SvIV(*ptr)))
6152 return &PL_core_reg_engine;
6153 return INT2PTR(regexp_engine*,SvIV(*ptr));
6157 if (!PL_curcop->cop_hints_hash)
6158 return &PL_core_reg_engine;
6159 ptr = cop_hints_fetch_pvs(PL_curcop, "regcomp", 0);
6160 if ( !(ptr && SvIOK(ptr) && SvIV(ptr)))
6161 return &PL_core_reg_engine;
6162 return INT2PTR(regexp_engine*,SvIV(ptr));
6168 Perl_pregcomp(pTHX_ SV * const pattern, const U32 flags)
6170 regexp_engine const *eng = current_re_engine();
6171 GET_RE_DEBUG_FLAGS_DECL;
6173 PERL_ARGS_ASSERT_PREGCOMP;
6175 /* Dispatch a request to compile a regexp to correct regexp engine. */
6177 Perl_re_printf( aTHX_ "Using engine %" UVxf "\n",
6180 return CALLREGCOMP_ENG(eng, pattern, flags);
6184 /* public(ish) entry point for the perl core's own regex compiling code.
6185 * It's actually a wrapper for Perl_re_op_compile that only takes an SV
6186 * pattern rather than a list of OPs, and uses the internal engine rather
6187 * than the current one */
6190 Perl_re_compile(pTHX_ SV * const pattern, U32 rx_flags)
6192 SV *pat = pattern; /* defeat constness! */
6193 PERL_ARGS_ASSERT_RE_COMPILE;
6194 return Perl_re_op_compile(aTHX_ &pat, 1, NULL,
6195 #ifdef PERL_IN_XSUB_RE
6198 &PL_core_reg_engine,
6200 NULL, NULL, rx_flags, 0);
6205 S_free_codeblocks(pTHX_ struct reg_code_blocks *cbs)
6209 if (--cbs->refcnt > 0)
6211 for (n = 0; n < cbs->count; n++) {
6212 REGEXP *rx = cbs->cb[n].src_regex;
6213 cbs->cb[n].src_regex = NULL;
6221 static struct reg_code_blocks *
6222 S_alloc_code_blocks(pTHX_ int ncode)
6224 struct reg_code_blocks *cbs;
6225 Newx(cbs, 1, struct reg_code_blocks);
6228 SAVEDESTRUCTOR_X(S_free_codeblocks, cbs);
6230 Newx(cbs->cb, ncode, struct reg_code_block);
6237 /* upgrade pattern pat_p of length plen_p to UTF8, and if there are code
6238 * blocks, recalculate the indices. Update pat_p and plen_p in-place to
6239 * point to the realloced string and length.
6241 * This is essentially a copy of Perl_bytes_to_utf8() with the code index
6245 S_pat_upgrade_to_utf8(pTHX_ RExC_state_t * const pRExC_state,
6246 char **pat_p, STRLEN *plen_p, int num_code_blocks)
6248 U8 *const src = (U8*)*pat_p;
6253 GET_RE_DEBUG_FLAGS_DECL;
6255 DEBUG_PARSE_r(Perl_re_printf( aTHX_
6256 "UTF8 mismatch! Converting to utf8 for resizing and compile\n"));
6258 Newx(dst, *plen_p * 2 + 1, U8);
6261 while (s < *plen_p) {
6262 append_utf8_from_native_byte(src[s], &d);
6264 if (n < num_code_blocks) {
6265 assert(pRExC_state->code_blocks);
6266 if (!do_end && pRExC_state->code_blocks->cb[n].start == s) {
6267 pRExC_state->code_blocks->cb[n].start = d - dst - 1;
6268 assert(*(d - 1) == '(');
6271 else if (do_end && pRExC_state->code_blocks->cb[n].end == s) {
6272 pRExC_state->code_blocks->cb[n].end = d - dst - 1;
6273 assert(*(d - 1) == ')');
6282 *pat_p = (char*) dst;
6284 RExC_orig_utf8 = RExC_utf8 = 1;
6289 /* S_concat_pat(): concatenate a list of args to the pattern string pat,
6290 * while recording any code block indices, and handling overloading,
6291 * nested qr// objects etc. If pat is null, it will allocate a new
6292 * string, or just return the first arg, if there's only one.
6294 * Returns the malloced/updated pat.
6295 * patternp and pat_count is the array of SVs to be concatted;
6296 * oplist is the optional list of ops that generated the SVs;
6297 * recompile_p is a pointer to a boolean that will be set if
6298 * the regex will need to be recompiled.
6299 * delim, if non-null is an SV that will be inserted between each element
6303 S_concat_pat(pTHX_ RExC_state_t * const pRExC_state,
6304 SV *pat, SV ** const patternp, int pat_count,
6305 OP *oplist, bool *recompile_p, SV *delim)
6309 bool use_delim = FALSE;
6310 bool alloced = FALSE;
6312 /* if we know we have at least two args, create an empty string,
6313 * then concatenate args to that. For no args, return an empty string */
6314 if (!pat && pat_count != 1) {
6320 for (svp = patternp; svp < patternp + pat_count; svp++) {
6323 STRLEN orig_patlen = 0;
6325 SV *msv = use_delim ? delim : *svp;
6326 if (!msv) msv = &PL_sv_undef;
6328 /* if we've got a delimiter, we go round the loop twice for each
6329 * svp slot (except the last), using the delimiter the second
6338 if (SvTYPE(msv) == SVt_PVAV) {
6339 /* we've encountered an interpolated array within
6340 * the pattern, e.g. /...@a..../. Expand the list of elements,
6341 * then recursively append elements.
6342 * The code in this block is based on S_pushav() */
6344 AV *const av = (AV*)msv;
6345 const SSize_t maxarg = AvFILL(av) + 1;
6349 assert(oplist->op_type == OP_PADAV
6350 || oplist->op_type == OP_RV2AV);
6351 oplist = OpSIBLING(oplist);
6354 if (SvRMAGICAL(av)) {
6357 Newx(array, maxarg, SV*);
6359 for (i=0; i < maxarg; i++) {
6360 SV ** const svp = av_fetch(av, i, FALSE);
6361 array[i] = svp ? *svp : &PL_sv_undef;
6365 array = AvARRAY(av);
6367 pat = S_concat_pat(aTHX_ pRExC_state, pat,
6368 array, maxarg, NULL, recompile_p,
6370 GvSV((gv_fetchpvs("\"", GV_ADDMULTI, SVt_PV))));
6376 /* we make the assumption here that each op in the list of
6377 * op_siblings maps to one SV pushed onto the stack,
6378 * except for code blocks, with have both an OP_NULL and
6380 * This allows us to match up the list of SVs against the
6381 * list of OPs to find the next code block.
6383 * Note that PUSHMARK PADSV PADSV ..
6385 * PADRANGE PADSV PADSV ..
6386 * so the alignment still works. */
6389 if (oplist->op_type == OP_NULL
6390 && (oplist->op_flags & OPf_SPECIAL))
6392 assert(n < pRExC_state->code_blocks->count);
6393 pRExC_state->code_blocks->cb[n].start = pat ? SvCUR(pat) : 0;
6394 pRExC_state->code_blocks->cb[n].block = oplist;
6395 pRExC_state->code_blocks->cb[n].src_regex = NULL;
6398 oplist = OpSIBLING(oplist); /* skip CONST */
6401 oplist = OpSIBLING(oplist);;
6404 /* apply magic and QR overloading to arg */
6407 if (SvROK(msv) && SvAMAGIC(msv)) {
6408 SV *sv = AMG_CALLunary(msv, regexp_amg);
6412 if (SvTYPE(sv) != SVt_REGEXP)
6413 Perl_croak(aTHX_ "Overloaded qr did not return a REGEXP");
6418 /* try concatenation overload ... */
6419 if (pat && (SvAMAGIC(pat) || SvAMAGIC(msv)) &&
6420 (sv = amagic_call(pat, msv, concat_amg, AMGf_assign)))
6423 /* overloading involved: all bets are off over literal
6424 * code. Pretend we haven't seen it */
6426 pRExC_state->code_blocks->count -= n;
6430 /* ... or failing that, try "" overload */
6431 while (SvAMAGIC(msv)
6432 && (sv = AMG_CALLunary(msv, string_amg))
6436 && SvRV(msv) == SvRV(sv))
6441 if (SvROK(msv) && SvTYPE(SvRV(msv)) == SVt_REGEXP)
6445 /* this is a partially unrolled
6446 * sv_catsv_nomg(pat, msv);
6447 * that allows us to adjust code block indices if
6450 char *dst = SvPV_force_nomg(pat, dlen);
6452 if (SvUTF8(msv) && !SvUTF8(pat)) {
6453 S_pat_upgrade_to_utf8(aTHX_ pRExC_state, &dst, &dlen, n);
6454 sv_setpvn(pat, dst, dlen);
6457 sv_catsv_nomg(pat, msv);
6461 /* We have only one SV to process, but we need to verify
6462 * it is properly null terminated or we will fail asserts
6463 * later. In theory we probably shouldn't get such SV's,
6464 * but if we do we should handle it gracefully. */
6465 if ( SvTYPE(msv) != SVt_PV || (SvLEN(msv) > SvCUR(msv) && *(SvEND(msv)) == 0) ) {
6466 /* not a string, or a string with a trailing null */
6469 /* a string with no trailing null, we need to copy it
6470 * so it we have a trailing null */
6476 pRExC_state->code_blocks->cb[n-1].end = SvCUR(pat)-1;
6479 /* extract any code blocks within any embedded qr//'s */
6480 if (rx && SvTYPE(rx) == SVt_REGEXP
6481 && RX_ENGINE((REGEXP*)rx)->op_comp)
6484 RXi_GET_DECL(ReANY((REGEXP *)rx), ri);
6485 if (ri->code_blocks && ri->code_blocks->count) {
6487 /* the presence of an embedded qr// with code means
6488 * we should always recompile: the text of the
6489 * qr// may not have changed, but it may be a
6490 * different closure than last time */
6492 if (pRExC_state->code_blocks) {
6493 int new_count = pRExC_state->code_blocks->count
6494 + ri->code_blocks->count;
6495 Renew(pRExC_state->code_blocks->cb,
6496 new_count, struct reg_code_block);
6497 pRExC_state->code_blocks->count = new_count;
6500 pRExC_state->code_blocks = S_alloc_code_blocks(aTHX_
6501 ri->code_blocks->count);
6503 for (i=0; i < ri->code_blocks->count; i++) {
6504 struct reg_code_block *src, *dst;
6505 STRLEN offset = orig_patlen
6506 + ReANY((REGEXP *)rx)->pre_prefix;
6507 assert(n < pRExC_state->code_blocks->count);
6508 src = &ri->code_blocks->cb[i];
6509 dst = &pRExC_state->code_blocks->cb[n];
6510 dst->start = src->start + offset;
6511 dst->end = src->end + offset;
6512 dst->block = src->block;
6513 dst->src_regex = (REGEXP*) SvREFCNT_inc( (SV*)
6522 /* avoid calling magic multiple times on a single element e.g. =~ $qr */
6531 /* see if there are any run-time code blocks in the pattern.
6532 * False positives are allowed */
6535 S_has_runtime_code(pTHX_ RExC_state_t * const pRExC_state,
6536 char *pat, STRLEN plen)
6541 PERL_UNUSED_CONTEXT;
6543 for (s = 0; s < plen; s++) {
6544 if ( pRExC_state->code_blocks
6545 && n < pRExC_state->code_blocks->count
6546 && s == pRExC_state->code_blocks->cb[n].start)
6548 s = pRExC_state->code_blocks->cb[n].end;
6552 /* TODO ideally should handle [..], (#..), /#.../x to reduce false
6554 if (pat[s] == '(' && s+2 <= plen && pat[s+1] == '?' &&
6556 || (s + 2 <= plen && pat[s+2] == '?' && pat[s+3] == '{'))
6563 /* Handle run-time code blocks. We will already have compiled any direct
6564 * or indirect literal code blocks. Now, take the pattern 'pat' and make a
6565 * copy of it, but with any literal code blocks blanked out and
6566 * appropriate chars escaped; then feed it into
6568 * eval "qr'modified_pattern'"
6572 * a\bc(?{"this was literal"})def'ghi\\jkl(?{"this is runtime"})mno
6576 * qr'a\\bc_______________________def\'ghi\\\\jkl(?{"this is runtime"})mno'
6578 * After eval_sv()-ing that, grab any new code blocks from the returned qr
6579 * and merge them with any code blocks of the original regexp.
6581 * If the pat is non-UTF8, while the evalled qr is UTF8, don't merge;
6582 * instead, just save the qr and return FALSE; this tells our caller that
6583 * the original pattern needs upgrading to utf8.
6587 S_compile_runtime_code(pTHX_ RExC_state_t * const pRExC_state,
6588 char *pat, STRLEN plen)
6592 GET_RE_DEBUG_FLAGS_DECL;
6594 if (pRExC_state->runtime_code_qr) {
6595 /* this is the second time we've been called; this should
6596 * only happen if the main pattern got upgraded to utf8
6597 * during compilation; re-use the qr we compiled first time
6598 * round (which should be utf8 too)
6600 qr = pRExC_state->runtime_code_qr;
6601 pRExC_state->runtime_code_qr = NULL;
6602 assert(RExC_utf8 && SvUTF8(qr));
6608 int newlen = plen + 7; /* allow for "qr''xx\0" extra chars */
6612 /* determine how many extra chars we need for ' and \ escaping */
6613 for (s = 0; s < plen; s++) {
6614 if (pat[s] == '\'' || pat[s] == '\\')
6618 Newx(newpat, newlen, char);
6620 *p++ = 'q'; *p++ = 'r'; *p++ = '\'';
6622 for (s = 0; s < plen; s++) {
6623 if ( pRExC_state->code_blocks
6624 && n < pRExC_state->code_blocks->count
6625 && s == pRExC_state->code_blocks->cb[n].start)
6627 /* blank out literal code block */
6628 assert(pat[s] == '(');
6629 while (s <= pRExC_state->code_blocks->cb[n].end) {
6637 if (pat[s] == '\'' || pat[s] == '\\')
6642 if (pRExC_state->pm_flags & RXf_PMf_EXTENDED) {
6644 if (pRExC_state->pm_flags & RXf_PMf_EXTENDED_MORE) {
6650 Perl_re_printf( aTHX_
6651 "%sre-parsing pattern for runtime code:%s %s\n",
6652 PL_colors[4],PL_colors[5],newpat);
6655 sv = newSVpvn_flags(newpat, p-newpat-1, RExC_utf8 ? SVf_UTF8 : 0);
6661 PUSHSTACKi(PERLSI_REQUIRE);
6662 /* G_RE_REPARSING causes the toker to collapse \\ into \ when
6663 * parsing qr''; normally only q'' does this. It also alters
6665 eval_sv(sv, G_SCALAR|G_RE_REPARSING);
6666 SvREFCNT_dec_NN(sv);
6671 SV * const errsv = ERRSV;
6672 if (SvTRUE_NN(errsv))
6673 /* use croak_sv ? */
6674 Perl_croak_nocontext("%" SVf, SVfARG(errsv));
6676 assert(SvROK(qr_ref));
6678 assert(SvTYPE(qr) == SVt_REGEXP && RX_ENGINE((REGEXP*)qr)->op_comp);
6679 /* the leaving below frees the tmp qr_ref.
6680 * Give qr a life of its own */
6688 if (!RExC_utf8 && SvUTF8(qr)) {
6689 /* first time through; the pattern got upgraded; save the
6690 * qr for the next time through */
6691 assert(!pRExC_state->runtime_code_qr);
6692 pRExC_state->runtime_code_qr = qr;
6697 /* extract any code blocks within the returned qr// */
6700 /* merge the main (r1) and run-time (r2) code blocks into one */
6702 RXi_GET_DECL(ReANY((REGEXP *)qr), r2);
6703 struct reg_code_block *new_block, *dst;
6704 RExC_state_t * const r1 = pRExC_state; /* convenient alias */
6708 if (!r2->code_blocks || !r2->code_blocks->count) /* we guessed wrong */
6710 SvREFCNT_dec_NN(qr);
6714 if (!r1->code_blocks)
6715 r1->code_blocks = S_alloc_code_blocks(aTHX_ 0);
6717 r1c = r1->code_blocks->count;
6718 r2c = r2->code_blocks->count;
6720 Newx(new_block, r1c + r2c, struct reg_code_block);
6724 while (i1 < r1c || i2 < r2c) {
6725 struct reg_code_block *src;
6729 src = &r2->code_blocks->cb[i2++];
6733 src = &r1->code_blocks->cb[i1++];
6734 else if ( r1->code_blocks->cb[i1].start
6735 < r2->code_blocks->cb[i2].start)
6737 src = &r1->code_blocks->cb[i1++];
6738 assert(src->end < r2->code_blocks->cb[i2].start);
6741 assert( r1->code_blocks->cb[i1].start
6742 > r2->code_blocks->cb[i2].start);
6743 src = &r2->code_blocks->cb[i2++];
6745 assert(src->end < r1->code_blocks->cb[i1].start);
6748 assert(pat[src->start] == '(');
6749 assert(pat[src->end] == ')');
6750 dst->start = src->start;
6751 dst->end = src->end;
6752 dst->block = src->block;
6753 dst->src_regex = is_qr ? (REGEXP*) SvREFCNT_inc( (SV*) qr)
6757 r1->code_blocks->count += r2c;
6758 Safefree(r1->code_blocks->cb);
6759 r1->code_blocks->cb = new_block;
6762 SvREFCNT_dec_NN(qr);
6768 S_setup_longest(pTHX_ RExC_state_t *pRExC_state,
6769 struct reg_substr_datum *rsd,
6770 struct scan_data_substrs *sub,
6771 STRLEN longest_length)
6773 /* This is the common code for setting up the floating and fixed length
6774 * string data extracted from Perl_re_op_compile() below. Returns a boolean
6775 * as to whether succeeded or not */
6779 bool eol = cBOOL(sub->flags & SF_BEFORE_EOL);
6780 bool meol = cBOOL(sub->flags & SF_BEFORE_MEOL);
6782 if (! (longest_length
6783 || (eol /* Can't have SEOL and MULTI */
6784 && (! meol || (RExC_flags & RXf_PMf_MULTILINE)))
6786 /* See comments for join_exact for why REG_UNFOLDED_MULTI_SEEN */
6787 || (RExC_seen & REG_UNFOLDED_MULTI_SEEN))
6792 /* copy the information about the longest from the reg_scan_data
6793 over to the program. */
6794 if (SvUTF8(sub->str)) {
6796 rsd->utf8_substr = sub->str;
6798 rsd->substr = sub->str;
6799 rsd->utf8_substr = NULL;
6801 /* end_shift is how many chars that must be matched that
6802 follow this item. We calculate it ahead of time as once the
6803 lookbehind offset is added in we lose the ability to correctly
6805 ml = sub->minlenp ? *(sub->minlenp) : (SSize_t)longest_length;
6806 rsd->end_shift = ml - sub->min_offset
6808 /* XXX SvTAIL is always false here - did you mean FBMcf_TAIL
6810 + (SvTAIL(sub->str) != 0)
6814 t = (eol/* Can't have SEOL and MULTI */
6815 && (! meol || (RExC_flags & RXf_PMf_MULTILINE)));
6816 fbm_compile(sub->str, t ? FBMcf_TAIL : 0);
6822 * Perl_re_op_compile - the perl internal RE engine's function to compile a
6823 * regular expression into internal code.
6824 * The pattern may be passed either as:
6825 * a list of SVs (patternp plus pat_count)
6826 * a list of OPs (expr)
6827 * If both are passed, the SV list is used, but the OP list indicates
6828 * which SVs are actually pre-compiled code blocks
6830 * The SVs in the list have magic and qr overloading applied to them (and
6831 * the list may be modified in-place with replacement SVs in the latter
6834 * If the pattern hasn't changed from old_re, then old_re will be
6837 * eng is the current engine. If that engine has an op_comp method, then
6838 * handle directly (i.e. we assume that op_comp was us); otherwise, just
6839 * do the initial concatenation of arguments and pass on to the external
6842 * If is_bare_re is not null, set it to a boolean indicating whether the
6843 * arg list reduced (after overloading) to a single bare regex which has
6844 * been returned (i.e. /$qr/).
6846 * orig_rx_flags contains RXf_* flags. See perlreapi.pod for more details.
6848 * pm_flags contains the PMf_* flags, typically based on those from the
6849 * pm_flags field of the related PMOP. Currently we're only interested in
6850 * PMf_HAS_CV, PMf_IS_QR, PMf_USE_RE_EVAL.
6852 * We can't allocate space until we know how big the compiled form will be,
6853 * but we can't compile it (and thus know how big it is) until we've got a
6854 * place to put the code. So we cheat: we compile it twice, once with code
6855 * generation turned off and size counting turned on, and once "for real".
6856 * This also means that we don't allocate space until we are sure that the
6857 * thing really will compile successfully, and we never have to move the
6858 * code and thus invalidate pointers into it. (Note that it has to be in
6859 * one piece because free() must be able to free it all.) [NB: not true in perl]
6861 * Beware that the optimization-preparation code in here knows about some
6862 * of the structure of the compiled regexp. [I'll say.]
6866 Perl_re_op_compile(pTHX_ SV ** const patternp, int pat_count,
6867 OP *expr, const regexp_engine* eng, REGEXP *old_re,
6868 bool *is_bare_re, U32 orig_rx_flags, U32 pm_flags)
6872 regexp_internal *ri;
6880 SV** new_patternp = patternp;
6882 /* these are all flags - maybe they should be turned
6883 * into a single int with different bit masks */
6884 I32 sawlookahead = 0;
6889 regex_charset initial_charset = get_regex_charset(orig_rx_flags);
6891 bool runtime_code = 0;
6893 RExC_state_t RExC_state;
6894 RExC_state_t * const pRExC_state = &RExC_state;
6895 #ifdef TRIE_STUDY_OPT
6897 RExC_state_t copyRExC_state;
6899 GET_RE_DEBUG_FLAGS_DECL;
6901 PERL_ARGS_ASSERT_RE_OP_COMPILE;
6903 DEBUG_r(if (!PL_colorset) reginitcolors());
6905 /* Initialize these here instead of as-needed, as is quick and avoids
6906 * having to test them each time otherwise */
6907 if (! PL_AboveLatin1) {
6909 char * dump_len_string;
6912 PL_AboveLatin1 = _new_invlist_C_array(AboveLatin1_invlist);
6913 PL_Latin1 = _new_invlist_C_array(Latin1_invlist);
6914 PL_UpperLatin1 = _new_invlist_C_array(UpperLatin1_invlist);
6915 PL_utf8_foldable = _new_invlist_C_array(_Perl_Any_Folds_invlist);
6916 PL_HasMultiCharFold =
6917 _new_invlist_C_array(_Perl_Folds_To_Multi_Char_invlist);
6919 /* This is calculated here, because the Perl program that generates the
6920 * static global ones doesn't currently have access to
6921 * NUM_ANYOF_CODE_POINTS */
6922 PL_InBitmap = _new_invlist(2);
6923 PL_InBitmap = _add_range_to_invlist(PL_InBitmap, 0,
6924 NUM_ANYOF_CODE_POINTS - 1);
6926 dump_len_string = PerlEnv_getenv("PERL_DUMP_RE_MAX_LEN");
6927 if ( ! dump_len_string
6928 || ! grok_atoUV(dump_len_string, (UV *)&PL_dump_re_max_len, NULL))
6930 PL_dump_re_max_len = 60; /* A reasonable default */
6935 pRExC_state->warn_text = NULL;
6936 pRExC_state->code_blocks = NULL;
6939 *is_bare_re = FALSE;
6941 if (expr && (expr->op_type == OP_LIST ||
6942 (expr->op_type == OP_NULL && expr->op_targ == OP_LIST))) {
6943 /* allocate code_blocks if needed */
6947 for (o = cLISTOPx(expr)->op_first; o; o = OpSIBLING(o))
6948 if (o->op_type == OP_NULL && (o->op_flags & OPf_SPECIAL))
6949 ncode++; /* count of DO blocks */
6952 pRExC_state->code_blocks = S_alloc_code_blocks(aTHX_ ncode);
6956 /* compile-time pattern with just OP_CONSTs and DO blocks */
6961 /* find how many CONSTs there are */
6964 if (expr->op_type == OP_CONST)
6967 for (o = cLISTOPx(expr)->op_first; o; o = OpSIBLING(o)) {
6968 if (o->op_type == OP_CONST)
6972 /* fake up an SV array */
6974 assert(!new_patternp);
6975 Newx(new_patternp, n, SV*);
6976 SAVEFREEPV(new_patternp);
6980 if (expr->op_type == OP_CONST)
6981 new_patternp[n] = cSVOPx_sv(expr);
6983 for (o = cLISTOPx(expr)->op_first; o; o = OpSIBLING(o)) {
6984 if (o->op_type == OP_CONST)
6985 new_patternp[n++] = cSVOPo_sv;
6990 DEBUG_PARSE_r(Perl_re_printf( aTHX_
6991 "Assembling pattern from %d elements%s\n", pat_count,
6992 orig_rx_flags & RXf_SPLIT ? " for split" : ""));
6994 /* set expr to the first arg op */
6996 if (pRExC_state->code_blocks && pRExC_state->code_blocks->count
6997 && expr->op_type != OP_CONST)
6999 expr = cLISTOPx(expr)->op_first;
7000 assert( expr->op_type == OP_PUSHMARK
7001 || (expr->op_type == OP_NULL && expr->op_targ == OP_PUSHMARK)
7002 || expr->op_type == OP_PADRANGE);
7003 expr = OpSIBLING(expr);
7006 pat = S_concat_pat(aTHX_ pRExC_state, NULL, new_patternp, pat_count,
7007 expr, &recompile, NULL);
7009 /* handle bare (possibly after overloading) regex: foo =~ $re */
7014 if (SvTYPE(re) == SVt_REGEXP) {
7018 DEBUG_PARSE_r(Perl_re_printf( aTHX_
7019 "Precompiled pattern%s\n",
7020 orig_rx_flags & RXf_SPLIT ? " for split" : ""));
7026 exp = SvPV_nomg(pat, plen);
7028 if (!eng->op_comp) {
7029 if ((SvUTF8(pat) && IN_BYTES)
7030 || SvGMAGICAL(pat) || SvAMAGIC(pat))
7032 /* make a temporary copy; either to convert to bytes,
7033 * or to avoid repeating get-magic / overloaded stringify */
7034 pat = newSVpvn_flags(exp, plen, SVs_TEMP |
7035 (IN_BYTES ? 0 : SvUTF8(pat)));
7037 return CALLREGCOMP_ENG(eng, pat, orig_rx_flags);
7040 /* ignore the utf8ness if the pattern is 0 length */
7041 RExC_utf8 = RExC_orig_utf8 = (plen == 0 || IN_BYTES) ? 0 : SvUTF8(pat);
7043 RExC_uni_semantics = 0;
7044 RExC_seen_unfolded_sharp_s = 0;
7045 RExC_contains_locale = 0;
7046 RExC_strict = cBOOL(pm_flags & RXf_PMf_STRICT);
7047 RExC_in_script_run = 0;
7048 RExC_study_started = 0;
7049 pRExC_state->runtime_code_qr = NULL;
7050 RExC_frame_head= NULL;
7051 RExC_frame_last= NULL;
7052 RExC_frame_count= 0;
7055 RExC_mysv1= sv_newmortal();
7056 RExC_mysv2= sv_newmortal();
7059 SV *dsv= sv_newmortal();
7060 RE_PV_QUOTED_DECL(s, RExC_utf8, dsv, exp, plen, PL_dump_re_max_len);
7061 Perl_re_printf( aTHX_ "%sCompiling REx%s %s\n",
7062 PL_colors[4],PL_colors[5],s);
7066 /* we jump here if we have to recompile, e.g., from upgrading the pattern
7069 if ((pm_flags & PMf_USE_RE_EVAL)
7070 /* this second condition covers the non-regex literal case,
7071 * i.e. $foo =~ '(?{})'. */
7072 || (IN_PERL_COMPILETIME && (PL_hints & HINT_RE_EVAL))
7074 runtime_code = S_has_runtime_code(aTHX_ pRExC_state, exp, plen);
7076 /* return old regex if pattern hasn't changed */
7077 /* XXX: note in the below we have to check the flags as well as the
7080 * Things get a touch tricky as we have to compare the utf8 flag
7081 * independently from the compile flags. */
7085 && !!RX_UTF8(old_re) == !!RExC_utf8
7086 && ( RX_COMPFLAGS(old_re) == ( orig_rx_flags & RXf_PMf_FLAGCOPYMASK ) )
7087 && RX_PRECOMP(old_re)
7088 && RX_PRELEN(old_re) == plen
7089 && memEQ(RX_PRECOMP(old_re), exp, plen)
7090 && !runtime_code /* with runtime code, always recompile */ )
7095 rx_flags = orig_rx_flags;
7097 if ( initial_charset == REGEX_DEPENDS_CHARSET
7098 && (RExC_utf8 ||RExC_uni_semantics))
7101 /* Set to use unicode semantics if the pattern is in utf8 and has the
7102 * 'depends' charset specified, as it means unicode when utf8 */
7103 set_regex_charset(&rx_flags, REGEX_UNICODE_CHARSET);
7107 RExC_precomp_adj = 0;
7108 RExC_flags = rx_flags;
7109 RExC_pm_flags = pm_flags;
7112 assert(TAINTING_get || !TAINT_get);
7114 Perl_croak(aTHX_ "Eval-group in insecure regular expression");
7116 if (!S_compile_runtime_code(aTHX_ pRExC_state, exp, plen)) {
7117 /* whoops, we have a non-utf8 pattern, whilst run-time code
7118 * got compiled as utf8. Try again with a utf8 pattern */
7119 S_pat_upgrade_to_utf8(aTHX_ pRExC_state, &exp, &plen,
7120 pRExC_state->code_blocks ? pRExC_state->code_blocks->count : 0);
7121 goto redo_first_pass;
7124 assert(!pRExC_state->runtime_code_qr);
7130 RExC_in_lookbehind = 0;
7131 RExC_seen_zerolen = *exp == '^' ? -1 : 0;
7134 RExC_recode_x_to_native = 0;
7136 RExC_in_multi_char_class = 0;
7138 /* First pass: determine size, legality. */
7140 RExC_start = RExC_adjusted_start = exp;
7141 RExC_end = exp + plen;
7142 RExC_precomp_end = RExC_end;
7147 RExC_emit = (regnode *) &RExC_emit_dummy;
7148 RExC_whilem_seen = 0;
7149 RExC_open_parens = NULL;
7150 RExC_close_parens = NULL;
7152 RExC_paren_names = NULL;
7154 RExC_paren_name_list = NULL;
7156 RExC_recurse = NULL;
7157 RExC_study_chunk_recursed = NULL;
7158 RExC_study_chunk_recursed_bytes= 0;
7159 RExC_recurse_count = 0;
7160 pRExC_state->code_index = 0;
7162 /* This NUL is guaranteed because the pattern comes from an SV*, and the sv
7163 * code makes sure the final byte is an uncounted NUL. But should this
7164 * ever not be the case, lots of things could read beyond the end of the
7165 * buffer: loops like
7166 * while(isFOO(*RExC_parse)) RExC_parse++;
7167 * strchr(RExC_parse, "foo");
7168 * etc. So it is worth noting. */
7169 assert(*RExC_end == '\0');
7172 Perl_re_printf( aTHX_ "Starting first pass (sizing)\n");
7174 RExC_lastparse=NULL;
7177 if (reg(pRExC_state, 0, &flags,1) == NULL) {
7178 /* It's possible to write a regexp in ascii that represents Unicode
7179 codepoints outside of the byte range, such as via \x{100}. If we
7180 detect such a sequence we have to convert the entire pattern to utf8
7181 and then recompile, as our sizing calculation will have been based
7182 on 1 byte == 1 character, but we will need to use utf8 to encode
7183 at least some part of the pattern, and therefore must convert the whole
7186 if (flags & RESTART_PASS1) {
7187 if (flags & NEED_UTF8) {
7188 S_pat_upgrade_to_utf8(aTHX_ pRExC_state, &exp, &plen,
7189 pRExC_state->code_blocks ? pRExC_state->code_blocks->count : 0);
7192 DEBUG_PARSE_r(Perl_re_printf( aTHX_
7193 "Need to redo pass 1\n"));
7196 goto redo_first_pass;
7198 Perl_croak(aTHX_ "panic: reg returned NULL to re_op_compile for sizing pass, flags=%#" UVxf, (UV) flags);
7202 Perl_re_printf( aTHX_
7203 "Required size %" IVdf " nodes\n"
7204 "Starting second pass (creation)\n",
7207 RExC_lastparse=NULL;
7210 /* The first pass could have found things that force Unicode semantics */
7211 if ((RExC_utf8 || RExC_uni_semantics)
7212 && get_regex_charset(rx_flags) == REGEX_DEPENDS_CHARSET)
7214 set_regex_charset(&rx_flags, REGEX_UNICODE_CHARSET);
7217 /* Small enough for pointer-storage convention?
7218 If extralen==0, this means that we will not need long jumps. */
7219 if (RExC_size >= 0x10000L && RExC_extralen)
7220 RExC_size += RExC_extralen;
7223 if (RExC_whilem_seen > 15)
7224 RExC_whilem_seen = 15;
7226 /* Allocate space and zero-initialize. Note, the two step process
7227 of zeroing when in debug mode, thus anything assigned has to
7228 happen after that */
7229 rx = (REGEXP*) newSV_type(SVt_REGEXP);
7231 Newxc(ri, sizeof(regexp_internal) + (unsigned)RExC_size * sizeof(regnode),
7232 char, regexp_internal);
7233 if ( r == NULL || ri == NULL )
7234 FAIL("Regexp out of space");
7236 /* avoid reading uninitialized memory in DEBUGGING code in study_chunk() */
7237 Zero(ri, sizeof(regexp_internal) + (unsigned)RExC_size * sizeof(regnode),
7240 /* bulk initialize base fields with 0. */
7241 Zero(ri, sizeof(regexp_internal), char);
7244 /* non-zero initialization begins here */
7247 r->extflags = rx_flags;
7248 RXp_COMPFLAGS(r) = orig_rx_flags & RXf_PMf_FLAGCOPYMASK;
7250 if (pm_flags & PMf_IS_QR) {
7251 ri->code_blocks = pRExC_state->code_blocks;
7252 if (ri->code_blocks)
7253 ri->code_blocks->refcnt++;
7257 bool has_p = ((r->extflags & RXf_PMf_KEEPCOPY) == RXf_PMf_KEEPCOPY);
7258 bool has_charset = (get_regex_charset(r->extflags)
7259 != REGEX_DEPENDS_CHARSET);
7261 /* The caret is output if there are any defaults: if not all the STD
7262 * flags are set, or if no character set specifier is needed */
7264 (((r->extflags & RXf_PMf_STD_PMMOD) != RXf_PMf_STD_PMMOD)
7266 bool has_runon = ((RExC_seen & REG_RUN_ON_COMMENT_SEEN)
7267 == REG_RUN_ON_COMMENT_SEEN);
7268 U8 reganch = (U8)((r->extflags & RXf_PMf_STD_PMMOD)
7269 >> RXf_PMf_STD_PMMOD_SHIFT);
7270 const char *fptr = STD_PAT_MODS; /*"msixxn"*/
7273 /* We output all the necessary flags; we never output a minus, as all
7274 * those are defaults, so are
7275 * covered by the caret */
7276 const STRLEN wraplen = plen + has_p + has_runon
7277 + has_default /* If needs a caret */
7278 + PL_bitcount[reganch] /* 1 char for each set standard flag */
7280 /* If needs a character set specifier */
7281 + ((has_charset) ? MAX_CHARSET_NAME_LENGTH : 0)
7282 + (sizeof("(?:)") - 1);
7284 /* make sure PL_bitcount bounds not exceeded */
7285 assert(sizeof(STD_PAT_MODS) <= 8);
7287 p = sv_grow(MUTABLE_SV(rx), wraplen + 1); /* +1 for the ending NUL */
7290 SvFLAGS(rx) |= SVf_UTF8;
7293 /* If a default, cover it using the caret */
7295 *p++= DEFAULT_PAT_MOD;
7299 const char* const name = get_regex_charset_name(r->extflags, &len);
7300 Copy(name, p, len, char);
7304 *p++ = KEEPCOPY_PAT_MOD; /*'p'*/
7307 while((ch = *fptr++)) {
7315 Copy(RExC_precomp, p, plen, char);
7316 assert ((RX_WRAPPED(rx) - p) < 16);
7317 r->pre_prefix = p - RX_WRAPPED(rx);
7323 SvCUR_set(rx, p - RX_WRAPPED(rx));
7327 r->nparens = RExC_npar - 1; /* set early to validate backrefs */
7329 /* Useful during FAIL. */
7330 #ifdef RE_TRACK_PATTERN_OFFSETS
7331 Newxz(ri->u.offsets, 2*RExC_size+1, U32); /* MJD 20001228 */
7332 DEBUG_OFFSETS_r(Perl_re_printf( aTHX_
7333 "%s %" UVuf " bytes for offset annotations.\n",
7334 ri->u.offsets ? "Got" : "Couldn't get",
7335 (UV)((2*RExC_size+1) * sizeof(U32))));
7337 SetProgLen(ri,RExC_size);
7342 /* Second pass: emit code. */
7343 RExC_flags = rx_flags; /* don't let top level (?i) bleed */
7344 RExC_pm_flags = pm_flags;
7346 RExC_end = exp + plen;
7348 RExC_emit_start = ri->program;
7349 RExC_emit = ri->program;
7350 RExC_emit_bound = ri->program + RExC_size + 1;
7351 pRExC_state->code_index = 0;
7353 *((char*) RExC_emit++) = (char) REG_MAGIC;
7354 /* setup various meta data about recursion, this all requires
7355 * RExC_npar to be correctly set, and a bit later on we clear it */
7356 if (RExC_seen & REG_RECURSE_SEEN) {
7357 DEBUG_OPTIMISE_MORE_r(Perl_re_printf( aTHX_
7358 "%*s%*s Setting up open/close parens\n",
7359 22, "| |", (int)(0 * 2 + 1), ""));
7361 /* setup RExC_open_parens, which holds the address of each
7362 * OPEN tag, and to make things simpler for the 0 index
7363 * the start of the program - this is used later for offsets */
7364 Newxz(RExC_open_parens, RExC_npar,regnode *);
7365 SAVEFREEPV(RExC_open_parens);
7366 RExC_open_parens[0] = RExC_emit;
7368 /* setup RExC_close_parens, which holds the address of each
7369 * CLOSE tag, and to make things simpler for the 0 index
7370 * the end of the program - this is used later for offsets */
7371 Newxz(RExC_close_parens, RExC_npar,regnode *);
7372 SAVEFREEPV(RExC_close_parens);
7373 /* we dont know where end op starts yet, so we dont
7374 * need to set RExC_close_parens[0] like we do RExC_open_parens[0] above */
7376 /* Note, RExC_npar is 1 + the number of parens in a pattern.
7377 * So its 1 if there are no parens. */
7378 RExC_study_chunk_recursed_bytes= (RExC_npar >> 3) +
7379 ((RExC_npar & 0x07) != 0);
7380 Newx(RExC_study_chunk_recursed,
7381 RExC_study_chunk_recursed_bytes * RExC_npar, U8);
7382 SAVEFREEPV(RExC_study_chunk_recursed);
7385 if (reg(pRExC_state, 0, &flags,1) == NULL) {
7387 Perl_croak(aTHX_ "panic: reg returned NULL to re_op_compile for generation pass, flags=%#" UVxf, (UV) flags);
7390 Perl_re_printf( aTHX_ "Starting post parse optimization\n");
7393 /* XXXX To minimize changes to RE engine we always allocate
7394 3-units-long substrs field. */
7395 Newx(r->substrs, 1, struct reg_substr_data);
7396 if (RExC_recurse_count) {
7397 Newx(RExC_recurse,RExC_recurse_count,regnode *);
7398 SAVEFREEPV(RExC_recurse);
7402 r->minlen = minlen = sawlookahead = sawplus = sawopen = sawminmod = 0;
7404 RExC_study_chunk_recursed_count= 0;
7406 Zero(r->substrs, 1, struct reg_substr_data);
7407 if (RExC_study_chunk_recursed) {
7408 Zero(RExC_study_chunk_recursed,
7409 RExC_study_chunk_recursed_bytes * RExC_npar, U8);
7413 #ifdef TRIE_STUDY_OPT
7415 StructCopy(&zero_scan_data, &data, scan_data_t);
7416 copyRExC_state = RExC_state;
7419 DEBUG_OPTIMISE_r(Perl_re_printf( aTHX_ "Restudying\n"));
7421 RExC_state = copyRExC_state;
7422 if (seen & REG_TOP_LEVEL_BRANCHES_SEEN)
7423 RExC_seen |= REG_TOP_LEVEL_BRANCHES_SEEN;
7425 RExC_seen &= ~REG_TOP_LEVEL_BRANCHES_SEEN;
7426 StructCopy(&zero_scan_data, &data, scan_data_t);
7429 StructCopy(&zero_scan_data, &data, scan_data_t);
7432 /* Dig out information for optimizations. */
7433 r->extflags = RExC_flags; /* was pm_op */
7434 /*dmq: removed as part of de-PMOP: pm->op_pmflags = RExC_flags; */
7437 SvUTF8_on(rx); /* Unicode in it? */
7438 ri->regstclass = NULL;
7439 if (RExC_naughty >= TOO_NAUGHTY) /* Probably an expensive pattern. */
7440 r->intflags |= PREGf_NAUGHTY;
7441 scan = ri->program + 1; /* First BRANCH. */
7443 /* testing for BRANCH here tells us whether there is "must appear"
7444 data in the pattern. If there is then we can use it for optimisations */
7445 if (!(RExC_seen & REG_TOP_LEVEL_BRANCHES_SEEN)) { /* Only one top-level choice.
7448 STRLEN longest_length[2];
7449 regnode_ssc ch_class; /* pointed to by data */
7451 SSize_t last_close = 0; /* pointed to by data */
7452 regnode *first= scan;
7453 regnode *first_next= regnext(first);
7457 * Skip introductions and multiplicators >= 1
7458 * so that we can extract the 'meat' of the pattern that must
7459 * match in the large if() sequence following.
7460 * NOTE that EXACT is NOT covered here, as it is normally
7461 * picked up by the optimiser separately.
7463 * This is unfortunate as the optimiser isnt handling lookahead
7464 * properly currently.
7467 while ((OP(first) == OPEN && (sawopen = 1)) ||
7468 /* An OR of *one* alternative - should not happen now. */
7469 (OP(first) == BRANCH && OP(first_next) != BRANCH) ||
7470 /* for now we can't handle lookbehind IFMATCH*/
7471 (OP(first) == IFMATCH && !first->flags && (sawlookahead = 1)) ||
7472 (OP(first) == PLUS) ||
7473 (OP(first) == MINMOD) ||
7474 /* An {n,m} with n>0 */
7475 (PL_regkind[OP(first)] == CURLY && ARG1(first) > 0) ||
7476 (OP(first) == NOTHING && PL_regkind[OP(first_next)] != END ))
7479 * the only op that could be a regnode is PLUS, all the rest
7480 * will be regnode_1 or regnode_2.
7482 * (yves doesn't think this is true)
7484 if (OP(first) == PLUS)
7487 if (OP(first) == MINMOD)
7489 first += regarglen[OP(first)];
7491 first = NEXTOPER(first);
7492 first_next= regnext(first);
7495 /* Starting-point info. */
7497 DEBUG_PEEP("first:", first, 0, 0);
7498 /* Ignore EXACT as we deal with it later. */
7499 if (PL_regkind[OP(first)] == EXACT) {
7500 if (OP(first) == EXACT || OP(first) == EXACTL)
7501 NOOP; /* Empty, get anchored substr later. */
7503 ri->regstclass = first;
7506 else if (PL_regkind[OP(first)] == TRIE &&
7507 ((reg_trie_data *)ri->data->data[ ARG(first) ])->minlen>0)
7509 /* this can happen only on restudy */
7510 ri->regstclass = construct_ahocorasick_from_trie(pRExC_state, (regnode *)first, 0);
7513 else if (REGNODE_SIMPLE(OP(first)))
7514 ri->regstclass = first;
7515 else if (PL_regkind[OP(first)] == BOUND ||
7516 PL_regkind[OP(first)] == NBOUND)
7517 ri->regstclass = first;
7518 else if (PL_regkind[OP(first)] == BOL) {
7519 r->intflags |= (OP(first) == MBOL
7522 first = NEXTOPER(first);
7525 else if (OP(first) == GPOS) {
7526 r->intflags |= PREGf_ANCH_GPOS;
7527 first = NEXTOPER(first);
7530 else if ((!sawopen || !RExC_sawback) &&
7532 (OP(first) == STAR &&
7533 PL_regkind[OP(NEXTOPER(first))] == REG_ANY) &&
7534 !(r->intflags & PREGf_ANCH) && !pRExC_state->code_blocks)
7536 /* turn .* into ^.* with an implied $*=1 */
7538 (OP(NEXTOPER(first)) == REG_ANY)
7541 r->intflags |= (type | PREGf_IMPLICIT);
7542 first = NEXTOPER(first);
7545 if (sawplus && !sawminmod && !sawlookahead
7546 && (!sawopen || !RExC_sawback)
7547 && !pRExC_state->code_blocks) /* May examine pos and $& */
7548 /* x+ must match at the 1st pos of run of x's */
7549 r->intflags |= PREGf_SKIP;
7551 /* Scan is after the zeroth branch, first is atomic matcher. */
7552 #ifdef TRIE_STUDY_OPT
7555 Perl_re_printf( aTHX_ "first at %" IVdf "\n",
7556 (IV)(first - scan + 1))
7560 Perl_re_printf( aTHX_ "first at %" IVdf "\n",
7561 (IV)(first - scan + 1))
7567 * If there's something expensive in the r.e., find the
7568 * longest literal string that must appear and make it the
7569 * regmust. Resolve ties in favor of later strings, since
7570 * the regstart check works with the beginning of the r.e.
7571 * and avoiding duplication strengthens checking. Not a
7572 * strong reason, but sufficient in the absence of others.
7573 * [Now we resolve ties in favor of the earlier string if
7574 * it happens that c_offset_min has been invalidated, since the
7575 * earlier string may buy us something the later one won't.]
7578 data.substrs[0].str = newSVpvs("");
7579 data.substrs[1].str = newSVpvs("");
7580 data.last_found = newSVpvs("");
7581 data.cur_is_floating = 0; /* initially any found substring is fixed */
7582 ENTER_with_name("study_chunk");
7583 SAVEFREESV(data.substrs[0].str);
7584 SAVEFREESV(data.substrs[1].str);
7585 SAVEFREESV(data.last_found);
7587 if (!ri->regstclass) {
7588 ssc_init(pRExC_state, &ch_class);
7589 data.start_class = &ch_class;
7590 stclass_flag = SCF_DO_STCLASS_AND;
7591 } else /* XXXX Check for BOUND? */
7593 data.last_closep = &last_close;
7597 * MAIN ENTRY FOR study_chunk() FOR m/PATTERN/
7598 * (NO top level branches)
7600 minlen = study_chunk(pRExC_state, &first, &minlen, &fake,
7601 scan + RExC_size, /* Up to end */
7603 SCF_DO_SUBSTR | SCF_WHILEM_VISITED_POS | stclass_flag
7604 | (restudied ? SCF_TRIE_DOING_RESTUDY : 0),
7608 CHECK_RESTUDY_GOTO_butfirst(LEAVE_with_name("study_chunk"));
7611 if ( RExC_npar == 1 && !data.cur_is_floating
7612 && data.last_start_min == 0 && data.last_end > 0
7613 && !RExC_seen_zerolen
7614 && !(RExC_seen & REG_VERBARG_SEEN)
7615 && !(RExC_seen & REG_GPOS_SEEN)
7617 r->extflags |= RXf_CHECK_ALL;
7619 scan_commit(pRExC_state, &data,&minlen,0);
7622 /* XXX this is done in reverse order because that's the way the
7623 * code was before it was parameterised. Don't know whether it
7624 * actually needs doing in reverse order. DAPM */
7625 for (i = 1; i >= 0; i--) {
7626 longest_length[i] = CHR_SVLEN(data.substrs[i].str);
7629 && SvCUR(data.substrs[0].str) /* ok to leave SvCUR */
7630 && data.substrs[0].min_offset
7631 == data.substrs[1].min_offset
7632 && SvCUR(data.substrs[0].str)
7633 == SvCUR(data.substrs[1].str)
7635 && S_setup_longest (aTHX_ pRExC_state,
7636 &(r->substrs->data[i]),
7640 r->substrs->data[i].min_offset =
7641 data.substrs[i].min_offset - data.substrs[i].lookbehind;
7643 r->substrs->data[i].max_offset = data.substrs[i].max_offset;
7644 /* Don't offset infinity */
7645 if (data.substrs[i].max_offset < SSize_t_MAX)
7646 r->substrs->data[i].max_offset -= data.substrs[i].lookbehind;
7647 SvREFCNT_inc_simple_void_NN(data.substrs[i].str);
7650 r->substrs->data[i].substr = NULL;
7651 r->substrs->data[i].utf8_substr = NULL;
7652 longest_length[i] = 0;
7656 LEAVE_with_name("study_chunk");
7659 && (OP(ri->regstclass) == REG_ANY || OP(ri->regstclass) == SANY))
7660 ri->regstclass = NULL;
7662 if ((!(r->substrs->data[0].substr || r->substrs->data[0].utf8_substr)
7663 || r->substrs->data[0].min_offset)
7665 && ! (ANYOF_FLAGS(data.start_class) & SSC_MATCHES_EMPTY_STRING)
7666 && is_ssc_worth_it(pRExC_state, data.start_class))
7668 const U32 n = add_data(pRExC_state, STR_WITH_LEN("f"));
7670 ssc_finalize(pRExC_state, data.start_class);
7672 Newx(RExC_rxi->data->data[n], 1, regnode_ssc);
7673 StructCopy(data.start_class,
7674 (regnode_ssc*)RExC_rxi->data->data[n],
7676 ri->regstclass = (regnode*)RExC_rxi->data->data[n];
7677 r->intflags &= ~PREGf_SKIP; /* Used in find_byclass(). */
7678 DEBUG_COMPILE_r({ SV *sv = sv_newmortal();
7679 regprop(r, sv, (regnode*)data.start_class, NULL, pRExC_state);
7680 Perl_re_printf( aTHX_
7681 "synthetic stclass \"%s\".\n",
7682 SvPVX_const(sv));});
7683 data.start_class = NULL;
7686 /* A temporary algorithm prefers floated substr to fixed one of
7687 * same length to dig more info. */
7688 i = (longest_length[0] <= longest_length[1]);
7689 r->substrs->check_ix = i;
7690 r->check_end_shift = r->substrs->data[i].end_shift;
7691 r->check_substr = r->substrs->data[i].substr;
7692 r->check_utf8 = r->substrs->data[i].utf8_substr;
7693 r->check_offset_min = r->substrs->data[i].min_offset;
7694 r->check_offset_max = r->substrs->data[i].max_offset;
7695 if (!i && (r->intflags & (PREGf_ANCH_SBOL|PREGf_ANCH_GPOS)))
7696 r->intflags |= PREGf_NOSCAN;
7698 if ((r->check_substr || r->check_utf8) ) {
7699 r->extflags |= RXf_USE_INTUIT;
7700 if (SvTAIL(r->check_substr ? r->check_substr : r->check_utf8))
7701 r->extflags |= RXf_INTUIT_TAIL;
7704 /* XXX Unneeded? dmq (shouldn't as this is handled elsewhere)
7705 if ( (STRLEN)minlen < longest_length[1] )
7706 minlen= longest_length[1];
7707 if ( (STRLEN)minlen < longest_length[0] )
7708 minlen= longest_length[0];
7712 /* Several toplevels. Best we can is to set minlen. */
7714 regnode_ssc ch_class;
7715 SSize_t last_close = 0;
7717 DEBUG_PARSE_r(Perl_re_printf( aTHX_ "\nMulti Top Level\n"));
7719 scan = ri->program + 1;
7720 ssc_init(pRExC_state, &ch_class);
7721 data.start_class = &ch_class;
7722 data.last_closep = &last_close;
7726 * MAIN ENTRY FOR study_chunk() FOR m/P1|P2|.../
7727 * (patterns WITH top level branches)
7729 minlen = study_chunk(pRExC_state,
7730 &scan, &minlen, &fake, scan + RExC_size, &data, -1, 0, NULL,
7731 SCF_DO_STCLASS_AND|SCF_WHILEM_VISITED_POS|(restudied
7732 ? SCF_TRIE_DOING_RESTUDY
7736 CHECK_RESTUDY_GOTO_butfirst(NOOP);
7738 r->check_substr = NULL;
7739 r->check_utf8 = NULL;
7740 r->substrs->data[0].substr = NULL;
7741 r->substrs->data[0].utf8_substr = NULL;
7742 r->substrs->data[1].substr = NULL;
7743 r->substrs->data[1].utf8_substr = NULL;
7745 if (! (ANYOF_FLAGS(data.start_class) & SSC_MATCHES_EMPTY_STRING)
7746 && is_ssc_worth_it(pRExC_state, data.start_class))
7748 const U32 n = add_data(pRExC_state, STR_WITH_LEN("f"));
7750 ssc_finalize(pRExC_state, data.start_class);
7752 Newx(RExC_rxi->data->data[n], 1, regnode_ssc);
7753 StructCopy(data.start_class,
7754 (regnode_ssc*)RExC_rxi->data->data[n],
7756 ri->regstclass = (regnode*)RExC_rxi->data->data[n];
7757 r->intflags &= ~PREGf_SKIP; /* Used in find_byclass(). */
7758 DEBUG_COMPILE_r({ SV* sv = sv_newmortal();
7759 regprop(r, sv, (regnode*)data.start_class, NULL, pRExC_state);
7760 Perl_re_printf( aTHX_
7761 "synthetic stclass \"%s\".\n",
7762 SvPVX_const(sv));});
7763 data.start_class = NULL;
7767 if (RExC_seen & REG_UNBOUNDED_QUANTIFIER_SEEN) {
7768 r->extflags |= RXf_UNBOUNDED_QUANTIFIER_SEEN;
7769 r->maxlen = REG_INFTY;
7772 r->maxlen = RExC_maxlen;
7775 /* Guard against an embedded (?=) or (?<=) with a longer minlen than
7776 the "real" pattern. */
7778 Perl_re_printf( aTHX_ "minlen: %" IVdf " r->minlen:%" IVdf " maxlen:%" IVdf "\n",
7779 (IV)minlen, (IV)r->minlen, (IV)RExC_maxlen);
7781 r->minlenret = minlen;
7782 if (r->minlen < minlen)
7785 if (RExC_seen & REG_RECURSE_SEEN ) {
7786 r->intflags |= PREGf_RECURSE_SEEN;
7787 Newx(r->recurse_locinput, r->nparens + 1, char *);
7789 if (RExC_seen & REG_GPOS_SEEN)
7790 r->intflags |= PREGf_GPOS_SEEN;
7791 if (RExC_seen & REG_LOOKBEHIND_SEEN)
7792 r->extflags |= RXf_NO_INPLACE_SUBST; /* inplace might break the
7794 if (pRExC_state->code_blocks)
7795 r->extflags |= RXf_EVAL_SEEN;
7796 if (RExC_seen & REG_VERBARG_SEEN)
7798 r->intflags |= PREGf_VERBARG_SEEN;
7799 r->extflags |= RXf_NO_INPLACE_SUBST; /* don't understand this! Yves */
7801 if (RExC_seen & REG_CUTGROUP_SEEN)
7802 r->intflags |= PREGf_CUTGROUP_SEEN;
7803 if (pm_flags & PMf_USE_RE_EVAL)
7804 r->intflags |= PREGf_USE_RE_EVAL;
7805 if (RExC_paren_names)
7806 RXp_PAREN_NAMES(r) = MUTABLE_HV(SvREFCNT_inc(RExC_paren_names));
7808 RXp_PAREN_NAMES(r) = NULL;
7810 /* If we have seen an anchor in our pattern then we set the extflag RXf_IS_ANCHORED
7811 * so it can be used in pp.c */
7812 if (r->intflags & PREGf_ANCH)
7813 r->extflags |= RXf_IS_ANCHORED;
7817 /* this is used to identify "special" patterns that might result
7818 * in Perl NOT calling the regex engine and instead doing the match "itself",
7819 * particularly special cases in split//. By having the regex compiler
7820 * do this pattern matching at a regop level (instead of by inspecting the pattern)
7821 * we avoid weird issues with equivalent patterns resulting in different behavior,
7822 * AND we allow non Perl engines to get the same optimizations by the setting the
7823 * flags appropriately - Yves */
7824 regnode *first = ri->program + 1;
7826 regnode *next = regnext(first);
7829 if (PL_regkind[fop] == NOTHING && nop == END)
7830 r->extflags |= RXf_NULL;
7831 else if ((fop == MBOL || (fop == SBOL && !first->flags)) && nop == END)
7832 /* when fop is SBOL first->flags will be true only when it was
7833 * produced by parsing /\A/, and not when parsing /^/. This is
7834 * very important for the split code as there we want to
7835 * treat /^/ as /^/m, but we do not want to treat /\A/ as /^/m.
7836 * See rt #122761 for more details. -- Yves */
7837 r->extflags |= RXf_START_ONLY;
7838 else if (fop == PLUS
7839 && PL_regkind[nop] == POSIXD && FLAGS(next) == _CC_SPACE
7841 r->extflags |= RXf_WHITE;
7842 else if ( r->extflags & RXf_SPLIT
7843 && (fop == EXACT || fop == EXACTL)
7844 && STR_LEN(first) == 1
7845 && *(STRING(first)) == ' '
7847 r->extflags |= (RXf_SKIPWHITE|RXf_WHITE);
7851 if (RExC_contains_locale) {
7852 RXp_EXTFLAGS(r) |= RXf_TAINTED;
7856 if (RExC_paren_names) {
7857 ri->name_list_idx = add_data( pRExC_state, STR_WITH_LEN("a"));
7858 ri->data->data[ri->name_list_idx]
7859 = (void*)SvREFCNT_inc(RExC_paren_name_list);
7862 ri->name_list_idx = 0;
7864 while ( RExC_recurse_count > 0 ) {
7865 const regnode *scan = RExC_recurse[ --RExC_recurse_count ];
7867 * This data structure is set up in study_chunk() and is used
7868 * to calculate the distance between a GOSUB regopcode and
7869 * the OPEN/CURLYM (CURLYM's are special and can act like OPEN's)
7872 * If for some reason someone writes code that optimises
7873 * away a GOSUB opcode then the assert should be changed to
7874 * an if(scan) to guard the ARG2L_SET() - Yves
7877 assert(scan && OP(scan) == GOSUB);
7878 ARG2L_SET( scan, RExC_open_parens[ARG(scan)] - scan );
7881 Newxz(r->offs, RExC_npar, regexp_paren_pair);
7882 /* assume we don't need to swap parens around before we match */
7884 Perl_re_printf( aTHX_ "study_chunk_recursed_count: %lu\n",
7885 (unsigned long)RExC_study_chunk_recursed_count);
7889 Perl_re_printf( aTHX_ "Final program:\n");
7892 #ifdef RE_TRACK_PATTERN_OFFSETS
7893 DEBUG_OFFSETS_r(if (ri->u.offsets) {
7894 const STRLEN len = ri->u.offsets[0];
7896 GET_RE_DEBUG_FLAGS_DECL;
7897 Perl_re_printf( aTHX_
7898 "Offsets: [%" UVuf "]\n\t", (UV)ri->u.offsets[0]);
7899 for (i = 1; i <= len; i++) {
7900 if (ri->u.offsets[i*2-1] || ri->u.offsets[i*2])
7901 Perl_re_printf( aTHX_ "%" UVuf ":%" UVuf "[%" UVuf "] ",
7902 (UV)i, (UV)ri->u.offsets[i*2-1], (UV)ri->u.offsets[i*2]);
7904 Perl_re_printf( aTHX_ "\n");
7909 /* under ithreads the ?pat? PMf_USED flag on the pmop is simulated
7910 * by setting the regexp SV to readonly-only instead. If the
7911 * pattern's been recompiled, the USEDness should remain. */
7912 if (old_re && SvREADONLY(old_re))
7920 Perl_reg_named_buff(pTHX_ REGEXP * const rx, SV * const key, SV * const value,
7923 PERL_ARGS_ASSERT_REG_NAMED_BUFF;
7925 PERL_UNUSED_ARG(value);
7927 if (flags & RXapif_FETCH) {
7928 return reg_named_buff_fetch(rx, key, flags);
7929 } else if (flags & (RXapif_STORE | RXapif_DELETE | RXapif_CLEAR)) {
7930 Perl_croak_no_modify();
7932 } else if (flags & RXapif_EXISTS) {
7933 return reg_named_buff_exists(rx, key, flags)
7936 } else if (flags & RXapif_REGNAMES) {
7937 return reg_named_buff_all(rx, flags);
7938 } else if (flags & (RXapif_SCALAR | RXapif_REGNAMES_COUNT)) {
7939 return reg_named_buff_scalar(rx, flags);
7941 Perl_croak(aTHX_ "panic: Unknown flags %d in named_buff", (int)flags);
7947 Perl_reg_named_buff_iter(pTHX_ REGEXP * const rx, const SV * const lastkey,
7950 PERL_ARGS_ASSERT_REG_NAMED_BUFF_ITER;
7951 PERL_UNUSED_ARG(lastkey);
7953 if (flags & RXapif_FIRSTKEY)
7954 return reg_named_buff_firstkey(rx, flags);
7955 else if (flags & RXapif_NEXTKEY)
7956 return reg_named_buff_nextkey(rx, flags);
7958 Perl_croak(aTHX_ "panic: Unknown flags %d in named_buff_iter",
7965 Perl_reg_named_buff_fetch(pTHX_ REGEXP * const r, SV * const namesv,
7969 struct regexp *const rx = ReANY(r);
7971 PERL_ARGS_ASSERT_REG_NAMED_BUFF_FETCH;
7973 if (rx && RXp_PAREN_NAMES(rx)) {
7974 HE *he_str = hv_fetch_ent( RXp_PAREN_NAMES(rx), namesv, 0, 0 );
7977 SV* sv_dat=HeVAL(he_str);
7978 I32 *nums=(I32*)SvPVX(sv_dat);
7979 AV * const retarray = (flags & RXapif_ALL) ? newAV() : NULL;
7980 for ( i=0; i<SvIVX(sv_dat); i++ ) {
7981 if ((I32)(rx->nparens) >= nums[i]
7982 && rx->offs[nums[i]].start != -1
7983 && rx->offs[nums[i]].end != -1)
7986 CALLREG_NUMBUF_FETCH(r,nums[i],ret);
7991 ret = newSVsv(&PL_sv_undef);
7994 av_push(retarray, ret);
7997 return newRV_noinc(MUTABLE_SV(retarray));
8004 Perl_reg_named_buff_exists(pTHX_ REGEXP * const r, SV * const key,
8007 struct regexp *const rx = ReANY(r);
8009 PERL_ARGS_ASSERT_REG_NAMED_BUFF_EXISTS;
8011 if (rx && RXp_PAREN_NAMES(rx)) {
8012 if (flags & RXapif_ALL) {
8013 return hv_exists_ent(RXp_PAREN_NAMES(rx), key, 0);
8015 SV *sv = CALLREG_NAMED_BUFF_FETCH(r, key, flags);
8017 SvREFCNT_dec_NN(sv);
8029 Perl_reg_named_buff_firstkey(pTHX_ REGEXP * const r, const U32 flags)
8031 struct regexp *const rx = ReANY(r);
8033 PERL_ARGS_ASSERT_REG_NAMED_BUFF_FIRSTKEY;
8035 if ( rx && RXp_PAREN_NAMES(rx) ) {
8036 (void)hv_iterinit(RXp_PAREN_NAMES(rx));
8038 return CALLREG_NAMED_BUFF_NEXTKEY(r, NULL, flags & ~RXapif_FIRSTKEY);
8045 Perl_reg_named_buff_nextkey(pTHX_ REGEXP * const r, const U32 flags)
8047 struct regexp *const rx = ReANY(r);
8048 GET_RE_DEBUG_FLAGS_DECL;
8050 PERL_ARGS_ASSERT_REG_NAMED_BUFF_NEXTKEY;
8052 if (rx && RXp_PAREN_NAMES(rx)) {
8053 HV *hv = RXp_PAREN_NAMES(rx);
8055 while ( (temphe = hv_iternext_flags(hv,0)) ) {
8058 SV* sv_dat = HeVAL(temphe);
8059 I32 *nums = (I32*)SvPVX(sv_dat);
8060 for ( i = 0; i < SvIVX(sv_dat); i++ ) {
8061 if ((I32)(rx->lastparen) >= nums[i] &&
8062 rx->offs[nums[i]].start != -1 &&
8063 rx->offs[nums[i]].end != -1)
8069 if (parno || flags & RXapif_ALL) {
8070 return newSVhek(HeKEY_hek(temphe));
8078 Perl_reg_named_buff_scalar(pTHX_ REGEXP * const r, const U32 flags)
8083 struct regexp *const rx = ReANY(r);
8085 PERL_ARGS_ASSERT_REG_NAMED_BUFF_SCALAR;
8087 if (rx && RXp_PAREN_NAMES(rx)) {
8088 if (flags & (RXapif_ALL | RXapif_REGNAMES_COUNT)) {
8089 return newSViv(HvTOTALKEYS(RXp_PAREN_NAMES(rx)));
8090 } else if (flags & RXapif_ONE) {
8091 ret = CALLREG_NAMED_BUFF_ALL(r, (flags | RXapif_REGNAMES));
8092 av = MUTABLE_AV(SvRV(ret));
8093 length = av_tindex(av);
8094 SvREFCNT_dec_NN(ret);
8095 return newSViv(length + 1);
8097 Perl_croak(aTHX_ "panic: Unknown flags %d in named_buff_scalar",
8102 return &PL_sv_undef;
8106 Perl_reg_named_buff_all(pTHX_ REGEXP * const r, const U32 flags)
8108 struct regexp *const rx = ReANY(r);
8111 PERL_ARGS_ASSERT_REG_NAMED_BUFF_ALL;
8113 if (rx && RXp_PAREN_NAMES(rx)) {
8114 HV *hv= RXp_PAREN_NAMES(rx);
8116 (void)hv_iterinit(hv);
8117 while ( (temphe = hv_iternext_flags(hv,0)) ) {
8120 SV* sv_dat = HeVAL(temphe);
8121 I32 *nums = (I32*)SvPVX(sv_dat);
8122 for ( i = 0; i < SvIVX(sv_dat); i++ ) {
8123 if ((I32)(rx->lastparen) >= nums[i] &&
8124 rx->offs[nums[i]].start != -1 &&
8125 rx->offs[nums[i]].end != -1)
8131 if (parno || flags & RXapif_ALL) {
8132 av_push(av, newSVhek(HeKEY_hek(temphe)));
8137 return newRV_noinc(MUTABLE_SV(av));
8141 Perl_reg_numbered_buff_fetch(pTHX_ REGEXP * const r, const I32 paren,
8144 struct regexp *const rx = ReANY(r);
8150 PERL_ARGS_ASSERT_REG_NUMBERED_BUFF_FETCH;
8152 if ( n == RX_BUFF_IDX_CARET_PREMATCH
8153 || n == RX_BUFF_IDX_CARET_FULLMATCH
8154 || n == RX_BUFF_IDX_CARET_POSTMATCH
8157 bool keepcopy = cBOOL(rx->extflags & RXf_PMf_KEEPCOPY);
8159 /* on something like
8162 * the KEEPCOPY is set on the PMOP rather than the regex */
8163 if (PL_curpm && r == PM_GETRE(PL_curpm))
8164 keepcopy = cBOOL(PL_curpm->op_pmflags & PMf_KEEPCOPY);
8173 if (n == RX_BUFF_IDX_CARET_FULLMATCH)
8174 /* no need to distinguish between them any more */
8175 n = RX_BUFF_IDX_FULLMATCH;
8177 if ((n == RX_BUFF_IDX_PREMATCH || n == RX_BUFF_IDX_CARET_PREMATCH)
8178 && rx->offs[0].start != -1)
8180 /* $`, ${^PREMATCH} */
8181 i = rx->offs[0].start;
8185 if ((n == RX_BUFF_IDX_POSTMATCH || n == RX_BUFF_IDX_CARET_POSTMATCH)
8186 && rx->offs[0].end != -1)
8188 /* $', ${^POSTMATCH} */
8189 s = rx->subbeg - rx->suboffset + rx->offs[0].end;
8190 i = rx->sublen + rx->suboffset - rx->offs[0].end;
8193 if ( 0 <= n && n <= (I32)rx->nparens &&
8194 (s1 = rx->offs[n].start) != -1 &&
8195 (t1 = rx->offs[n].end) != -1)
8197 /* $&, ${^MATCH}, $1 ... */
8199 s = rx->subbeg + s1 - rx->suboffset;
8204 assert(s >= rx->subbeg);
8205 assert((STRLEN)rx->sublen >= (STRLEN)((s - rx->subbeg) + i) );
8207 #ifdef NO_TAINT_SUPPORT
8208 sv_setpvn(sv, s, i);
8210 const int oldtainted = TAINT_get;
8212 sv_setpvn(sv, s, i);
8213 TAINT_set(oldtainted);
8215 if (RXp_MATCH_UTF8(rx))
8220 if (RXp_MATCH_TAINTED(rx)) {
8221 if (SvTYPE(sv) >= SVt_PVMG) {
8222 MAGIC* const mg = SvMAGIC(sv);
8225 SvMAGIC_set(sv, mg->mg_moremagic);
8227 if ((mgt = SvMAGIC(sv))) {
8228 mg->mg_moremagic = mgt;
8229 SvMAGIC_set(sv, mg);
8246 Perl_reg_numbered_buff_store(pTHX_ REGEXP * const rx, const I32 paren,
8247 SV const * const value)
8249 PERL_ARGS_ASSERT_REG_NUMBERED_BUFF_STORE;
8251 PERL_UNUSED_ARG(rx);
8252 PERL_UNUSED_ARG(paren);
8253 PERL_UNUSED_ARG(value);
8256 Perl_croak_no_modify();
8260 Perl_reg_numbered_buff_length(pTHX_ REGEXP * const r, const SV * const sv,
8263 struct regexp *const rx = ReANY(r);
8267 PERL_ARGS_ASSERT_REG_NUMBERED_BUFF_LENGTH;
8269 if ( paren == RX_BUFF_IDX_CARET_PREMATCH
8270 || paren == RX_BUFF_IDX_CARET_FULLMATCH
8271 || paren == RX_BUFF_IDX_CARET_POSTMATCH
8274 bool keepcopy = cBOOL(rx->extflags & RXf_PMf_KEEPCOPY);
8276 /* on something like
8279 * the KEEPCOPY is set on the PMOP rather than the regex */
8280 if (PL_curpm && r == PM_GETRE(PL_curpm))
8281 keepcopy = cBOOL(PL_curpm->op_pmflags & PMf_KEEPCOPY);
8287 /* Some of this code was originally in C<Perl_magic_len> in F<mg.c> */
8289 case RX_BUFF_IDX_CARET_PREMATCH: /* ${^PREMATCH} */
8290 case RX_BUFF_IDX_PREMATCH: /* $` */
8291 if (rx->offs[0].start != -1) {
8292 i = rx->offs[0].start;
8301 case RX_BUFF_IDX_CARET_POSTMATCH: /* ${^POSTMATCH} */
8302 case RX_BUFF_IDX_POSTMATCH: /* $' */
8303 if (rx->offs[0].end != -1) {
8304 i = rx->sublen - rx->offs[0].end;
8306 s1 = rx->offs[0].end;
8313 default: /* $& / ${^MATCH}, $1, $2, ... */
8314 if (paren <= (I32)rx->nparens &&
8315 (s1 = rx->offs[paren].start) != -1 &&
8316 (t1 = rx->offs[paren].end) != -1)
8322 if (ckWARN(WARN_UNINITIALIZED))
8323 report_uninit((const SV *)sv);
8328 if (i > 0 && RXp_MATCH_UTF8(rx)) {
8329 const char * const s = rx->subbeg - rx->suboffset + s1;
8334 if (is_utf8_string_loclen((U8*)s, i, &ep, &el))
8341 Perl_reg_qr_package(pTHX_ REGEXP * const rx)
8343 PERL_ARGS_ASSERT_REG_QR_PACKAGE;
8344 PERL_UNUSED_ARG(rx);
8348 return newSVpvs("Regexp");
8351 /* Scans the name of a named buffer from the pattern.
8352 * If flags is REG_RSN_RETURN_NULL returns null.
8353 * If flags is REG_RSN_RETURN_NAME returns an SV* containing the name
8354 * If flags is REG_RSN_RETURN_DATA returns the data SV* corresponding
8355 * to the parsed name as looked up in the RExC_paren_names hash.
8356 * If there is an error throws a vFAIL().. type exception.
8359 #define REG_RSN_RETURN_NULL 0
8360 #define REG_RSN_RETURN_NAME 1
8361 #define REG_RSN_RETURN_DATA 2
8364 S_reg_scan_name(pTHX_ RExC_state_t *pRExC_state, U32 flags)
8366 char *name_start = RExC_parse;
8368 PERL_ARGS_ASSERT_REG_SCAN_NAME;
8370 assert (RExC_parse <= RExC_end);
8371 if (RExC_parse == RExC_end) NOOP;
8372 else if (isIDFIRST_lazy_if_safe(RExC_parse, RExC_end, UTF)) {
8373 /* Note that the code here assumes well-formed UTF-8. Skip IDFIRST by
8374 * using do...while */
8377 RExC_parse += UTF8SKIP(RExC_parse);
8378 } while ( RExC_parse < RExC_end
8379 && isWORDCHAR_utf8_safe((U8*)RExC_parse, (U8*) RExC_end));
8383 } while (RExC_parse < RExC_end && isWORDCHAR(*RExC_parse));
8385 RExC_parse++; /* so the <- from the vFAIL is after the offending
8387 vFAIL("Group name must start with a non-digit word character");
8391 = newSVpvn_flags(name_start, (int)(RExC_parse - name_start),
8392 SVs_TEMP | (UTF ? SVf_UTF8 : 0));
8393 if ( flags == REG_RSN_RETURN_NAME)
8395 else if (flags==REG_RSN_RETURN_DATA) {
8398 if ( ! sv_name ) /* should not happen*/
8399 Perl_croak(aTHX_ "panic: no svname in reg_scan_name");
8400 if (RExC_paren_names)
8401 he_str = hv_fetch_ent( RExC_paren_names, sv_name, 0, 0 );
8403 sv_dat = HeVAL(he_str);
8405 vFAIL("Reference to nonexistent named group");
8409 Perl_croak(aTHX_ "panic: bad flag %lx in reg_scan_name",
8410 (unsigned long) flags);
8412 NOT_REACHED; /* NOTREACHED */
8417 #define DEBUG_PARSE_MSG(funcname) DEBUG_PARSE_r({ \
8419 if (RExC_lastparse!=RExC_parse) { \
8420 Perl_re_printf( aTHX_ "%s", \
8421 Perl_pv_pretty(aTHX_ RExC_mysv1, RExC_parse, \
8422 RExC_end - RExC_parse, 16, \
8424 PERL_PV_ESCAPE_UNI_DETECT | \
8425 PERL_PV_PRETTY_ELLIPSES | \
8426 PERL_PV_PRETTY_LTGT | \
8427 PERL_PV_ESCAPE_RE | \
8428 PERL_PV_PRETTY_EXACTSIZE \
8432 Perl_re_printf( aTHX_ "%16s",""); \
8435 num = RExC_size + 1; \
8437 num=REG_NODE_NUM(RExC_emit); \
8438 if (RExC_lastnum!=num) \
8439 Perl_re_printf( aTHX_ "|%4d",num); \
8441 Perl_re_printf( aTHX_ "|%4s",""); \
8442 Perl_re_printf( aTHX_ "|%*s%-4s", \
8443 (int)((depth*2)), "", \
8447 RExC_lastparse=RExC_parse; \
8452 #define DEBUG_PARSE(funcname) DEBUG_PARSE_r({ \
8453 DEBUG_PARSE_MSG((funcname)); \
8454 Perl_re_printf( aTHX_ "%4s","\n"); \
8456 #define DEBUG_PARSE_FMT(funcname,fmt,args) DEBUG_PARSE_r({\
8457 DEBUG_PARSE_MSG((funcname)); \
8458 Perl_re_printf( aTHX_ fmt "\n",args); \
8461 /* This section of code defines the inversion list object and its methods. The
8462 * interfaces are highly subject to change, so as much as possible is static to
8463 * this file. An inversion list is here implemented as a malloc'd C UV array
8464 * as an SVt_INVLIST scalar.
8466 * An inversion list for Unicode is an array of code points, sorted by ordinal
8467 * number. Each element gives the code point that begins a range that extends
8468 * up-to but not including the code point given by the next element. The final
8469 * element gives the first code point of a range that extends to the platform's
8470 * infinity. The even-numbered elements (invlist[0], invlist[2], invlist[4],
8471 * ...) give ranges whose code points are all in the inversion list. We say
8472 * that those ranges are in the set. The odd-numbered elements give ranges
8473 * whose code points are not in the inversion list, and hence not in the set.
8474 * Thus, element [0] is the first code point in the list. Element [1]
8475 * is the first code point beyond that not in the list; and element [2] is the
8476 * first code point beyond that that is in the list. In other words, the first
8477 * range is invlist[0]..(invlist[1]-1), and all code points in that range are
8478 * in the inversion list. The second range is invlist[1]..(invlist[2]-1), and
8479 * all code points in that range are not in the inversion list. The third
8480 * range invlist[2]..(invlist[3]-1) gives code points that are in the inversion
8481 * list, and so forth. Thus every element whose index is divisible by two
8482 * gives the beginning of a range that is in the list, and every element whose
8483 * index is not divisible by two gives the beginning of a range not in the
8484 * list. If the final element's index is divisible by two, the inversion list
8485 * extends to the platform's infinity; otherwise the highest code point in the
8486 * inversion list is the contents of that element minus 1.
8488 * A range that contains just a single code point N will look like
8490 * invlist[i+1] == N+1
8492 * If N is UV_MAX (the highest representable code point on the machine), N+1 is
8493 * impossible to represent, so element [i+1] is omitted. The single element
8495 * invlist[0] == UV_MAX
8496 * contains just UV_MAX, but is interpreted as matching to infinity.
8498 * Taking the complement (inverting) an inversion list is quite simple, if the
8499 * first element is 0, remove it; otherwise add a 0 element at the beginning.
8500 * This implementation reserves an element at the beginning of each inversion
8501 * list to always contain 0; there is an additional flag in the header which
8502 * indicates if the list begins at the 0, or is offset to begin at the next
8503 * element. This means that the inversion list can be inverted without any
8504 * copying; just flip the flag.
8506 * More about inversion lists can be found in "Unicode Demystified"
8507 * Chapter 13 by Richard Gillam, published by Addison-Wesley.
8509 * The inversion list data structure is currently implemented as an SV pointing
8510 * to an array of UVs that the SV thinks are bytes. This allows us to have an
8511 * array of UV whose memory management is automatically handled by the existing
8512 * facilities for SV's.
8514 * Some of the methods should always be private to the implementation, and some
8515 * should eventually be made public */
8517 /* The header definitions are in F<invlist_inline.h> */
8519 #ifndef PERL_IN_XSUB_RE
8521 PERL_STATIC_INLINE UV*
8522 S__invlist_array_init(SV* const invlist, const bool will_have_0)
8524 /* Returns a pointer to the first element in the inversion list's array.
8525 * This is called upon initialization of an inversion list. Where the
8526 * array begins depends on whether the list has the code point U+0000 in it
8527 * or not. The other parameter tells it whether the code that follows this
8528 * call is about to put a 0 in the inversion list or not. The first
8529 * element is either the element reserved for 0, if TRUE, or the element
8530 * after it, if FALSE */
8532 bool* offset = get_invlist_offset_addr(invlist);
8533 UV* zero_addr = (UV *) SvPVX(invlist);
8535 PERL_ARGS_ASSERT__INVLIST_ARRAY_INIT;
8538 assert(! _invlist_len(invlist));
8542 /* 1^1 = 0; 1^0 = 1 */
8543 *offset = 1 ^ will_have_0;
8544 return zero_addr + *offset;
8549 PERL_STATIC_INLINE void
8550 S_invlist_set_len(pTHX_ SV* const invlist, const UV len, const bool offset)
8552 /* Sets the current number of elements stored in the inversion list.
8553 * Updates SvCUR correspondingly */
8554 PERL_UNUSED_CONTEXT;
8555 PERL_ARGS_ASSERT_INVLIST_SET_LEN;
8557 assert(SvTYPE(invlist) == SVt_INVLIST);
8562 : TO_INTERNAL_SIZE(len + offset));
8563 assert(SvLEN(invlist) == 0 || SvCUR(invlist) <= SvLEN(invlist));
8566 #ifndef PERL_IN_XSUB_RE
8569 S_invlist_replace_list_destroys_src(pTHX_ SV * dest, SV * src)
8571 /* Replaces the inversion list in 'dest' with the one from 'src'. It
8572 * steals the list from 'src', so 'src' is made to have a NULL list. This
8573 * is similar to what SvSetMagicSV() would do, if it were implemented on
8574 * inversion lists, though this routine avoids a copy */
8576 const UV src_len = _invlist_len(src);
8577 const bool src_offset = *get_invlist_offset_addr(src);
8578 const STRLEN src_byte_len = SvLEN(src);
8579 char * array = SvPVX(src);
8581 const int oldtainted = TAINT_get;
8583 PERL_ARGS_ASSERT_INVLIST_REPLACE_LIST_DESTROYS_SRC;
8585 assert(SvTYPE(src) == SVt_INVLIST);
8586 assert(SvTYPE(dest) == SVt_INVLIST);
8587 assert(! invlist_is_iterating(src));
8588 assert(SvCUR(src) == 0 || SvCUR(src) < SvLEN(src));
8590 /* Make sure it ends in the right place with a NUL, as our inversion list
8591 * manipulations aren't careful to keep this true, but sv_usepvn_flags()
8593 array[src_byte_len - 1] = '\0';
8595 TAINT_NOT; /* Otherwise it breaks */
8596 sv_usepvn_flags(dest,
8600 /* This flag is documented to cause a copy to be avoided */
8601 SV_HAS_TRAILING_NUL);
8602 TAINT_set(oldtainted);
8607 /* Finish up copying over the other fields in an inversion list */
8608 *get_invlist_offset_addr(dest) = src_offset;
8609 invlist_set_len(dest, src_len, src_offset);
8610 *get_invlist_previous_index_addr(dest) = 0;
8611 invlist_iterfinish(dest);
8614 PERL_STATIC_INLINE IV*
8615 S_get_invlist_previous_index_addr(SV* invlist)
8617 /* Return the address of the IV that is reserved to hold the cached index
8619 PERL_ARGS_ASSERT_GET_INVLIST_PREVIOUS_INDEX_ADDR;
8621 assert(SvTYPE(invlist) == SVt_INVLIST);
8623 return &(((XINVLIST*) SvANY(invlist))->prev_index);
8626 PERL_STATIC_INLINE IV
8627 S_invlist_previous_index(SV* const invlist)
8629 /* Returns cached index of previous search */
8631 PERL_ARGS_ASSERT_INVLIST_PREVIOUS_INDEX;
8633 return *get_invlist_previous_index_addr(invlist);
8636 PERL_STATIC_INLINE void
8637 S_invlist_set_previous_index(SV* const invlist, const IV index)
8639 /* Caches <index> for later retrieval */
8641 PERL_ARGS_ASSERT_INVLIST_SET_PREVIOUS_INDEX;
8643 assert(index == 0 || index < (int) _invlist_len(invlist));
8645 *get_invlist_previous_index_addr(invlist) = index;
8648 PERL_STATIC_INLINE void
8649 S_invlist_trim(SV* invlist)
8651 /* Free the not currently-being-used space in an inversion list */
8653 /* But don't free up the space needed for the 0 UV that is always at the
8654 * beginning of the list, nor the trailing NUL */
8655 const UV min_size = TO_INTERNAL_SIZE(1) + 1;
8657 PERL_ARGS_ASSERT_INVLIST_TRIM;
8659 assert(SvTYPE(invlist) == SVt_INVLIST);
8661 SvPV_renew(invlist, MAX(min_size, SvCUR(invlist) + 1));
8664 PERL_STATIC_INLINE void
8665 S_invlist_clear(pTHX_ SV* invlist) /* Empty the inversion list */
8667 PERL_ARGS_ASSERT_INVLIST_CLEAR;
8669 assert(SvTYPE(invlist) == SVt_INVLIST);
8671 invlist_set_len(invlist, 0, 0);
8672 invlist_trim(invlist);
8675 #endif /* ifndef PERL_IN_XSUB_RE */
8677 PERL_STATIC_INLINE bool
8678 S_invlist_is_iterating(SV* const invlist)
8680 PERL_ARGS_ASSERT_INVLIST_IS_ITERATING;
8682 return *(get_invlist_iter_addr(invlist)) < (STRLEN) UV_MAX;
8685 #ifndef PERL_IN_XSUB_RE
8687 PERL_STATIC_INLINE UV
8688 S_invlist_max(SV* const invlist)
8690 /* Returns the maximum number of elements storable in the inversion list's
8691 * array, without having to realloc() */
8693 PERL_ARGS_ASSERT_INVLIST_MAX;
8695 assert(SvTYPE(invlist) == SVt_INVLIST);
8697 /* Assumes worst case, in which the 0 element is not counted in the
8698 * inversion list, so subtracts 1 for that */
8699 return SvLEN(invlist) == 0 /* This happens under _new_invlist_C_array */
8700 ? FROM_INTERNAL_SIZE(SvCUR(invlist)) - 1
8701 : FROM_INTERNAL_SIZE(SvLEN(invlist)) - 1;
8704 Perl__new_invlist(pTHX_ IV initial_size)
8707 /* Return a pointer to a newly constructed inversion list, with enough
8708 * space to store 'initial_size' elements. If that number is negative, a
8709 * system default is used instead */
8713 if (initial_size < 0) {
8717 /* Allocate the initial space */
8718 new_list = newSV_type(SVt_INVLIST);
8720 /* First 1 is in case the zero element isn't in the list; second 1 is for
8722 SvGROW(new_list, TO_INTERNAL_SIZE(initial_size + 1) + 1);
8723 invlist_set_len(new_list, 0, 0);
8725 /* Force iterinit() to be used to get iteration to work */
8726 *get_invlist_iter_addr(new_list) = (STRLEN) UV_MAX;
8728 *get_invlist_previous_index_addr(new_list) = 0;
8734 Perl__new_invlist_C_array(pTHX_ const UV* const list)
8736 /* Return a pointer to a newly constructed inversion list, initialized to
8737 * point to <list>, which has to be in the exact correct inversion list
8738 * form, including internal fields. Thus this is a dangerous routine that
8739 * should not be used in the wrong hands. The passed in 'list' contains
8740 * several header fields at the beginning that are not part of the
8741 * inversion list body proper */
8743 const STRLEN length = (STRLEN) list[0];
8744 const UV version_id = list[1];
8745 const bool offset = cBOOL(list[2]);
8746 #define HEADER_LENGTH 3
8747 /* If any of the above changes in any way, you must change HEADER_LENGTH
8748 * (if appropriate) and regenerate INVLIST_VERSION_ID by running
8749 * perl -E 'say int(rand 2**31-1)'
8751 #define INVLIST_VERSION_ID 148565664 /* This is a combination of a version and
8752 data structure type, so that one being
8753 passed in can be validated to be an
8754 inversion list of the correct vintage.
8757 SV* invlist = newSV_type(SVt_INVLIST);
8759 PERL_ARGS_ASSERT__NEW_INVLIST_C_ARRAY;
8761 if (version_id != INVLIST_VERSION_ID) {
8762 Perl_croak(aTHX_ "panic: Incorrect version for previously generated inversion list");
8765 /* The generated array passed in includes header elements that aren't part
8766 * of the list proper, so start it just after them */
8767 SvPV_set(invlist, (char *) (list + HEADER_LENGTH));
8769 SvLEN_set(invlist, 0); /* Means we own the contents, and the system
8770 shouldn't touch it */
8772 *(get_invlist_offset_addr(invlist)) = offset;
8774 /* The 'length' passed to us is the physical number of elements in the
8775 * inversion list. But if there is an offset the logical number is one
8777 invlist_set_len(invlist, length - offset, offset);
8779 invlist_set_previous_index(invlist, 0);
8781 /* Initialize the iteration pointer. */
8782 invlist_iterfinish(invlist);
8784 SvREADONLY_on(invlist);
8790 S_invlist_extend(pTHX_ SV* const invlist, const UV new_max)
8792 /* Grow the maximum size of an inversion list */
8794 PERL_ARGS_ASSERT_INVLIST_EXTEND;
8796 assert(SvTYPE(invlist) == SVt_INVLIST);
8798 /* Add one to account for the zero element at the beginning which may not
8799 * be counted by the calling parameters */
8800 SvGROW((SV *)invlist, TO_INTERNAL_SIZE(new_max + 1));
8804 S__append_range_to_invlist(pTHX_ SV* const invlist,
8805 const UV start, const UV end)
8807 /* Subject to change or removal. Append the range from 'start' to 'end' at
8808 * the end of the inversion list. The range must be above any existing
8812 UV max = invlist_max(invlist);
8813 UV len = _invlist_len(invlist);
8816 PERL_ARGS_ASSERT__APPEND_RANGE_TO_INVLIST;
8818 if (len == 0) { /* Empty lists must be initialized */
8819 offset = start != 0;
8820 array = _invlist_array_init(invlist, ! offset);
8823 /* Here, the existing list is non-empty. The current max entry in the
8824 * list is generally the first value not in the set, except when the
8825 * set extends to the end of permissible values, in which case it is
8826 * the first entry in that final set, and so this call is an attempt to
8827 * append out-of-order */
8829 UV final_element = len - 1;
8830 array = invlist_array(invlist);
8831 if ( array[final_element] > start
8832 || ELEMENT_RANGE_MATCHES_INVLIST(final_element))
8834 Perl_croak(aTHX_ "panic: attempting to append to an inversion list, but wasn't at the end of the list, final=%" UVuf ", start=%" UVuf ", match=%c",
8835 array[final_element], start,
8836 ELEMENT_RANGE_MATCHES_INVLIST(final_element) ? 't' : 'f');
8839 /* Here, it is a legal append. If the new range begins 1 above the end
8840 * of the range below it, it is extending the range below it, so the
8841 * new first value not in the set is one greater than the newly
8842 * extended range. */
8843 offset = *get_invlist_offset_addr(invlist);
8844 if (array[final_element] == start) {
8845 if (end != UV_MAX) {
8846 array[final_element] = end + 1;
8849 /* But if the end is the maximum representable on the machine,
8850 * assume that infinity was actually what was meant. Just let
8851 * the range that this would extend to have no end */
8852 invlist_set_len(invlist, len - 1, offset);
8858 /* Here the new range doesn't extend any existing set. Add it */
8860 len += 2; /* Includes an element each for the start and end of range */
8862 /* If wll overflow the existing space, extend, which may cause the array to
8865 invlist_extend(invlist, len);
8867 /* Have to set len here to avoid assert failure in invlist_array() */
8868 invlist_set_len(invlist, len, offset);
8870 array = invlist_array(invlist);
8873 invlist_set_len(invlist, len, offset);
8876 /* The next item on the list starts the range, the one after that is
8877 * one past the new range. */
8878 array[len - 2] = start;
8879 if (end != UV_MAX) {
8880 array[len - 1] = end + 1;
8883 /* But if the end is the maximum representable on the machine, just let
8884 * the range have no end */
8885 invlist_set_len(invlist, len - 1, offset);
8890 Perl__invlist_search(SV* const invlist, const UV cp)
8892 /* Searches the inversion list for the entry that contains the input code
8893 * point <cp>. If <cp> is not in the list, -1 is returned. Otherwise, the
8894 * return value is the index into the list's array of the range that
8895 * contains <cp>, that is, 'i' such that
8896 * array[i] <= cp < array[i+1]
8901 IV high = _invlist_len(invlist);
8902 const IV highest_element = high - 1;
8905 PERL_ARGS_ASSERT__INVLIST_SEARCH;
8907 /* If list is empty, return failure. */
8912 /* (We can't get the array unless we know the list is non-empty) */
8913 array = invlist_array(invlist);
8915 mid = invlist_previous_index(invlist);
8917 if (mid > highest_element) {
8918 mid = highest_element;
8921 /* <mid> contains the cache of the result of the previous call to this
8922 * function (0 the first time). See if this call is for the same result,
8923 * or if it is for mid-1. This is under the theory that calls to this
8924 * function will often be for related code points that are near each other.
8925 * And benchmarks show that caching gives better results. We also test
8926 * here if the code point is within the bounds of the list. These tests
8927 * replace others that would have had to be made anyway to make sure that
8928 * the array bounds were not exceeded, and these give us extra information
8929 * at the same time */
8930 if (cp >= array[mid]) {
8931 if (cp >= array[highest_element]) {
8932 return highest_element;
8935 /* Here, array[mid] <= cp < array[highest_element]. This means that
8936 * the final element is not the answer, so can exclude it; it also
8937 * means that <mid> is not the final element, so can refer to 'mid + 1'
8939 if (cp < array[mid + 1]) {
8945 else { /* cp < aray[mid] */
8946 if (cp < array[0]) { /* Fail if outside the array */
8950 if (cp >= array[mid - 1]) {
8955 /* Binary search. What we are looking for is <i> such that
8956 * array[i] <= cp < array[i+1]
8957 * The loop below converges on the i+1. Note that there may not be an
8958 * (i+1)th element in the array, and things work nonetheless */
8959 while (low < high) {
8960 mid = (low + high) / 2;
8961 assert(mid <= highest_element);
8962 if (array[mid] <= cp) { /* cp >= array[mid] */
8965 /* We could do this extra test to exit the loop early.
8966 if (cp < array[low]) {
8971 else { /* cp < array[mid] */
8978 invlist_set_previous_index(invlist, high);
8983 Perl__invlist_populate_swatch(SV* const invlist,
8984 const UV start, const UV end, U8* swatch)
8986 /* populates a swatch of a swash the same way swatch_get() does in utf8.c,
8987 * but is used when the swash has an inversion list. This makes this much
8988 * faster, as it uses a binary search instead of a linear one. This is
8989 * intimately tied to that function, and perhaps should be in utf8.c,
8990 * except it is intimately tied to inversion lists as well. It assumes
8991 * that <swatch> is all 0's on input */
8994 const IV len = _invlist_len(invlist);
8998 PERL_ARGS_ASSERT__INVLIST_POPULATE_SWATCH;
9000 if (len == 0) { /* Empty inversion list */
9004 array = invlist_array(invlist);
9006 /* Find which element it is */
9007 i = _invlist_search(invlist, start);
9009 /* We populate from <start> to <end> */
9010 while (current < end) {
9013 /* The inversion list gives the results for every possible code point
9014 * after the first one in the list. Only those ranges whose index is
9015 * even are ones that the inversion list matches. For the odd ones,
9016 * and if the initial code point is not in the list, we have to skip
9017 * forward to the next element */
9018 if (i == -1 || ! ELEMENT_RANGE_MATCHES_INVLIST(i)) {
9020 if (i >= len) { /* Finished if beyond the end of the array */
9024 if (current >= end) { /* Finished if beyond the end of what we
9026 if (LIKELY(end < UV_MAX)) {
9030 /* We get here when the upper bound is the maximum
9031 * representable on the machine, and we are looking for just
9032 * that code point. Have to special case it */
9034 goto join_end_of_list;
9037 assert(current >= start);
9039 /* The current range ends one below the next one, except don't go past
9042 upper = (i < len && array[i] < end) ? array[i] : end;
9044 /* Here we are in a range that matches. Populate a bit in the 3-bit U8
9045 * for each code point in it */
9046 for (; current < upper; current++) {
9047 const STRLEN offset = (STRLEN)(current - start);
9048 swatch[offset >> 3] |= 1 << (offset & 7);
9053 /* Quit if at the end of the list */
9056 /* But first, have to deal with the highest possible code point on
9057 * the platform. The previous code assumes that <end> is one
9058 * beyond where we want to populate, but that is impossible at the
9059 * platform's infinity, so have to handle it specially */
9060 if (UNLIKELY(end == UV_MAX && ELEMENT_RANGE_MATCHES_INVLIST(len-1)))
9062 const STRLEN offset = (STRLEN)(end - start);
9063 swatch[offset >> 3] |= 1 << (offset & 7);
9068 /* Advance to the next range, which will be for code points not in the
9077 Perl__invlist_union_maybe_complement_2nd(pTHX_ SV* const a, SV* const b,
9078 const bool complement_b, SV** output)
9080 /* Take the union of two inversion lists and point '*output' to it. On
9081 * input, '*output' MUST POINT TO NULL OR TO AN SV* INVERSION LIST (possibly
9082 * even 'a' or 'b'). If to an inversion list, the contents of the original
9083 * list will be replaced by the union. The first list, 'a', may be
9084 * NULL, in which case a copy of the second list is placed in '*output'.
9085 * If 'complement_b' is TRUE, the union is taken of the complement
9086 * (inversion) of 'b' instead of b itself.
9088 * The basis for this comes from "Unicode Demystified" Chapter 13 by
9089 * Richard Gillam, published by Addison-Wesley, and explained at some
9090 * length there. The preface says to incorporate its examples into your
9091 * code at your own risk.
9093 * The algorithm is like a merge sort. */
9095 const UV* array_a; /* a's array */
9097 UV len_a; /* length of a's array */
9100 SV* u; /* the resulting union */
9104 UV i_a = 0; /* current index into a's array */
9108 /* running count, as explained in the algorithm source book; items are
9109 * stopped accumulating and are output when the count changes to/from 0.
9110 * The count is incremented when we start a range that's in an input's set,
9111 * and decremented when we start a range that's not in a set. So this
9112 * variable can be 0, 1, or 2. When it is 0 neither input is in their set,
9113 * and hence nothing goes into the union; 1, just one of the inputs is in
9114 * its set (and its current range gets added to the union); and 2 when both
9115 * inputs are in their sets. */
9118 PERL_ARGS_ASSERT__INVLIST_UNION_MAYBE_COMPLEMENT_2ND;
9120 assert(*output == NULL || SvTYPE(*output) == SVt_INVLIST);
9122 len_b = _invlist_len(b);
9125 /* Here, 'b' is empty, hence it's complement is all possible code
9126 * points. So if the union includes the complement of 'b', it includes
9127 * everything, and we need not even look at 'a'. It's easiest to
9128 * create a new inversion list that matches everything. */
9130 SV* everything = _add_range_to_invlist(NULL, 0, UV_MAX);
9132 if (*output == NULL) { /* If the output didn't exist, just point it
9134 *output = everything;
9136 else { /* Otherwise, replace its contents with the new list */
9137 invlist_replace_list_destroys_src(*output, everything);
9138 SvREFCNT_dec_NN(everything);
9144 /* Here, we don't want the complement of 'b', and since 'b' is empty,
9145 * the union will come entirely from 'a'. If 'a' is NULL or empty, the
9146 * output will be empty */
9148 if (a == NULL || _invlist_len(a) == 0) {
9149 if (*output == NULL) {
9150 *output = _new_invlist(0);
9153 invlist_clear(*output);
9158 /* Here, 'a' is not empty, but 'b' is, so 'a' entirely determines the
9159 * union. We can just return a copy of 'a' if '*output' doesn't point
9160 * to an existing list */
9161 if (*output == NULL) {
9162 *output = invlist_clone(a);
9166 /* If the output is to overwrite 'a', we have a no-op, as it's
9172 /* Here, '*output' is to be overwritten by 'a' */
9173 u = invlist_clone(a);
9174 invlist_replace_list_destroys_src(*output, u);
9180 /* Here 'b' is not empty. See about 'a' */
9182 if (a == NULL || ((len_a = _invlist_len(a)) == 0)) {
9184 /* Here, 'a' is empty (and b is not). That means the union will come
9185 * entirely from 'b'. If '*output' is NULL, we can directly return a
9186 * clone of 'b'. Otherwise, we replace the contents of '*output' with
9189 SV ** dest = (*output == NULL) ? output : &u;
9190 *dest = invlist_clone(b);
9192 _invlist_invert(*dest);
9196 invlist_replace_list_destroys_src(*output, u);
9203 /* Here both lists exist and are non-empty */
9204 array_a = invlist_array(a);
9205 array_b = invlist_array(b);
9207 /* If are to take the union of 'a' with the complement of b, set it
9208 * up so are looking at b's complement. */
9211 /* To complement, we invert: if the first element is 0, remove it. To
9212 * do this, we just pretend the array starts one later */
9213 if (array_b[0] == 0) {
9219 /* But if the first element is not zero, we pretend the list starts
9220 * at the 0 that is always stored immediately before the array. */
9226 /* Size the union for the worst case: that the sets are completely
9228 u = _new_invlist(len_a + len_b);
9230 /* Will contain U+0000 if either component does */
9231 array_u = _invlist_array_init(u, ( len_a > 0 && array_a[0] == 0)
9232 || (len_b > 0 && array_b[0] == 0));
9234 /* Go through each input list item by item, stopping when have exhausted
9236 while (i_a < len_a && i_b < len_b) {
9237 UV cp; /* The element to potentially add to the union's array */
9238 bool cp_in_set; /* is it in the the input list's set or not */
9240 /* We need to take one or the other of the two inputs for the union.
9241 * Since we are merging two sorted lists, we take the smaller of the
9242 * next items. In case of a tie, we take first the one that is in its
9243 * set. If we first took the one not in its set, it would decrement
9244 * the count, possibly to 0 which would cause it to be output as ending
9245 * the range, and the next time through we would take the same number,
9246 * and output it again as beginning the next range. By doing it the
9247 * opposite way, there is no possibility that the count will be
9248 * momentarily decremented to 0, and thus the two adjoining ranges will
9249 * be seamlessly merged. (In a tie and both are in the set or both not
9250 * in the set, it doesn't matter which we take first.) */
9251 if ( array_a[i_a] < array_b[i_b]
9252 || ( array_a[i_a] == array_b[i_b]
9253 && ELEMENT_RANGE_MATCHES_INVLIST(i_a)))
9255 cp_in_set = ELEMENT_RANGE_MATCHES_INVLIST(i_a);
9256 cp = array_a[i_a++];
9259 cp_in_set = ELEMENT_RANGE_MATCHES_INVLIST(i_b);
9260 cp = array_b[i_b++];
9263 /* Here, have chosen which of the two inputs to look at. Only output
9264 * if the running count changes to/from 0, which marks the
9265 * beginning/end of a range that's in the set */
9268 array_u[i_u++] = cp;
9275 array_u[i_u++] = cp;
9281 /* The loop above increments the index into exactly one of the input lists
9282 * each iteration, and ends when either index gets to its list end. That
9283 * means the other index is lower than its end, and so something is
9284 * remaining in that one. We decrement 'count', as explained below, if
9285 * that list is in its set. (i_a and i_b each currently index the element
9286 * beyond the one we care about.) */
9287 if ( (i_a != len_a && PREV_RANGE_MATCHES_INVLIST(i_a))
9288 || (i_b != len_b && PREV_RANGE_MATCHES_INVLIST(i_b)))
9293 /* Above we decremented 'count' if the list that had unexamined elements in
9294 * it was in its set. This has made it so that 'count' being non-zero
9295 * means there isn't anything left to output; and 'count' equal to 0 means
9296 * that what is left to output is precisely that which is left in the
9297 * non-exhausted input list.
9299 * To see why, note first that the exhausted input obviously has nothing
9300 * left to add to the union. If it was in its set at its end, that means
9301 * the set extends from here to the platform's infinity, and hence so does
9302 * the union and the non-exhausted set is irrelevant. The exhausted set
9303 * also contributed 1 to 'count'. If 'count' was 2, it got decremented to
9304 * 1, but if it was 1, the non-exhausted set wasn't in its set, and so
9305 * 'count' remains at 1. This is consistent with the decremented 'count'
9306 * != 0 meaning there's nothing left to add to the union.
9308 * But if the exhausted input wasn't in its set, it contributed 0 to
9309 * 'count', and the rest of the union will be whatever the other input is.
9310 * If 'count' was 0, neither list was in its set, and 'count' remains 0;
9311 * otherwise it gets decremented to 0. This is consistent with 'count'
9312 * == 0 meaning the remainder of the union is whatever is left in the
9313 * non-exhausted list. */
9318 IV copy_count = len_a - i_a;
9319 if (copy_count > 0) { /* The non-exhausted input is 'a' */
9320 Copy(array_a + i_a, array_u + i_u, copy_count, UV);
9322 else { /* The non-exhausted input is b */
9323 copy_count = len_b - i_b;
9324 Copy(array_b + i_b, array_u + i_u, copy_count, UV);
9326 len_u = i_u + copy_count;
9329 /* Set the result to the final length, which can change the pointer to
9330 * array_u, so re-find it. (Note that it is unlikely that this will
9331 * change, as we are shrinking the space, not enlarging it) */
9332 if (len_u != _invlist_len(u)) {
9333 invlist_set_len(u, len_u, *get_invlist_offset_addr(u));
9335 array_u = invlist_array(u);
9338 if (*output == NULL) { /* Simply return the new inversion list */
9342 /* Otherwise, overwrite the inversion list that was in '*output'. We
9343 * could instead free '*output', and then set it to 'u', but experience
9344 * has shown [perl #127392] that if the input is a mortal, we can get a
9345 * huge build-up of these during regex compilation before they get
9347 invlist_replace_list_destroys_src(*output, u);
9355 Perl__invlist_intersection_maybe_complement_2nd(pTHX_ SV* const a, SV* const b,
9356 const bool complement_b, SV** i)
9358 /* Take the intersection of two inversion lists and point '*i' to it. On
9359 * input, '*i' MUST POINT TO NULL OR TO AN SV* INVERSION LIST (possibly
9360 * even 'a' or 'b'). If to an inversion list, the contents of the original
9361 * list will be replaced by the intersection. The first list, 'a', may be
9362 * NULL, in which case '*i' will be an empty list. If 'complement_b' is
9363 * TRUE, the result will be the intersection of 'a' and the complement (or
9364 * inversion) of 'b' instead of 'b' directly.
9366 * The basis for this comes from "Unicode Demystified" Chapter 13 by
9367 * Richard Gillam, published by Addison-Wesley, and explained at some
9368 * length there. The preface says to incorporate its examples into your
9369 * code at your own risk. In fact, it had bugs
9371 * The algorithm is like a merge sort, and is essentially the same as the
9375 const UV* array_a; /* a's array */
9377 UV len_a; /* length of a's array */
9380 SV* r; /* the resulting intersection */
9384 UV i_a = 0; /* current index into a's array */
9388 /* running count of how many of the two inputs are postitioned at ranges
9389 * that are in their sets. As explained in the algorithm source book,
9390 * items are stopped accumulating and are output when the count changes
9391 * to/from 2. The count is incremented when we start a range that's in an
9392 * input's set, and decremented when we start a range that's not in a set.
9393 * Only when it is 2 are we in the intersection. */
9396 PERL_ARGS_ASSERT__INVLIST_INTERSECTION_MAYBE_COMPLEMENT_2ND;
9398 assert(*i == NULL || SvTYPE(*i) == SVt_INVLIST);
9400 /* Special case if either one is empty */
9401 len_a = (a == NULL) ? 0 : _invlist_len(a);
9402 if ((len_a == 0) || ((len_b = _invlist_len(b)) == 0)) {
9403 if (len_a != 0 && complement_b) {
9405 /* Here, 'a' is not empty, therefore from the enclosing 'if', 'b'
9406 * must be empty. Here, also we are using 'b's complement, which
9407 * hence must be every possible code point. Thus the intersection
9410 if (*i == a) { /* No-op */
9415 *i = invlist_clone(a);
9419 r = invlist_clone(a);
9420 invlist_replace_list_destroys_src(*i, r);
9425 /* Here, 'a' or 'b' is empty and not using the complement of 'b'. The
9426 * intersection must be empty */
9428 *i = _new_invlist(0);
9436 /* Here both lists exist and are non-empty */
9437 array_a = invlist_array(a);
9438 array_b = invlist_array(b);
9440 /* If are to take the intersection of 'a' with the complement of b, set it
9441 * up so are looking at b's complement. */
9444 /* To complement, we invert: if the first element is 0, remove it. To
9445 * do this, we just pretend the array starts one later */
9446 if (array_b[0] == 0) {
9452 /* But if the first element is not zero, we pretend the list starts
9453 * at the 0 that is always stored immediately before the array. */
9459 /* Size the intersection for the worst case: that the intersection ends up
9460 * fragmenting everything to be completely disjoint */
9461 r= _new_invlist(len_a + len_b);
9463 /* Will contain U+0000 iff both components do */
9464 array_r = _invlist_array_init(r, len_a > 0 && array_a[0] == 0
9465 && len_b > 0 && array_b[0] == 0);
9467 /* Go through each list item by item, stopping when have exhausted one of
9469 while (i_a < len_a && i_b < len_b) {
9470 UV cp; /* The element to potentially add to the intersection's
9472 bool cp_in_set; /* Is it in the input list's set or not */
9474 /* We need to take one or the other of the two inputs for the
9475 * intersection. Since we are merging two sorted lists, we take the
9476 * smaller of the next items. In case of a tie, we take first the one
9477 * that is not in its set (a difference from the union algorithm). If
9478 * we first took the one in its set, it would increment the count,
9479 * possibly to 2 which would cause it to be output as starting a range
9480 * in the intersection, and the next time through we would take that
9481 * same number, and output it again as ending the set. By doing the
9482 * opposite of this, there is no possibility that the count will be
9483 * momentarily incremented to 2. (In a tie and both are in the set or
9484 * both not in the set, it doesn't matter which we take first.) */
9485 if ( array_a[i_a] < array_b[i_b]
9486 || ( array_a[i_a] == array_b[i_b]
9487 && ! ELEMENT_RANGE_MATCHES_INVLIST(i_a)))
9489 cp_in_set = ELEMENT_RANGE_MATCHES_INVLIST(i_a);
9490 cp = array_a[i_a++];
9493 cp_in_set = ELEMENT_RANGE_MATCHES_INVLIST(i_b);
9497 /* Here, have chosen which of the two inputs to look at. Only output
9498 * if the running count changes to/from 2, which marks the
9499 * beginning/end of a range that's in the intersection */
9503 array_r[i_r++] = cp;
9508 array_r[i_r++] = cp;
9515 /* The loop above increments the index into exactly one of the input lists
9516 * each iteration, and ends when either index gets to its list end. That
9517 * means the other index is lower than its end, and so something is
9518 * remaining in that one. We increment 'count', as explained below, if the
9519 * exhausted list was in its set. (i_a and i_b each currently index the
9520 * element beyond the one we care about.) */
9521 if ( (i_a == len_a && PREV_RANGE_MATCHES_INVLIST(i_a))
9522 || (i_b == len_b && PREV_RANGE_MATCHES_INVLIST(i_b)))
9527 /* Above we incremented 'count' if the exhausted list was in its set. This
9528 * has made it so that 'count' being below 2 means there is nothing left to
9529 * output; otheriwse what's left to add to the intersection is precisely
9530 * that which is left in the non-exhausted input list.
9532 * To see why, note first that the exhausted input obviously has nothing
9533 * left to affect the intersection. If it was in its set at its end, that
9534 * means the set extends from here to the platform's infinity, and hence
9535 * anything in the non-exhausted's list will be in the intersection, and
9536 * anything not in it won't be. Hence, the rest of the intersection is
9537 * precisely what's in the non-exhausted list The exhausted set also
9538 * contributed 1 to 'count', meaning 'count' was at least 1. Incrementing
9539 * it means 'count' is now at least 2. This is consistent with the
9540 * incremented 'count' being >= 2 means to add the non-exhausted list to
9543 * But if the exhausted input wasn't in its set, it contributed 0 to
9544 * 'count', and the intersection can't include anything further; the
9545 * non-exhausted set is irrelevant. 'count' was at most 1, and doesn't get
9546 * incremented. This is consistent with 'count' being < 2 meaning nothing
9547 * further to add to the intersection. */
9548 if (count < 2) { /* Nothing left to put in the intersection. */
9551 else { /* copy the non-exhausted list, unchanged. */
9552 IV copy_count = len_a - i_a;
9553 if (copy_count > 0) { /* a is the one with stuff left */
9554 Copy(array_a + i_a, array_r + i_r, copy_count, UV);
9556 else { /* b is the one with stuff left */
9557 copy_count = len_b - i_b;
9558 Copy(array_b + i_b, array_r + i_r, copy_count, UV);
9560 len_r = i_r + copy_count;
9563 /* Set the result to the final length, which can change the pointer to
9564 * array_r, so re-find it. (Note that it is unlikely that this will
9565 * change, as we are shrinking the space, not enlarging it) */
9566 if (len_r != _invlist_len(r)) {
9567 invlist_set_len(r, len_r, *get_invlist_offset_addr(r));
9569 array_r = invlist_array(r);
9572 if (*i == NULL) { /* Simply return the calculated intersection */
9575 else { /* Otherwise, replace the existing inversion list in '*i'. We could
9576 instead free '*i', and then set it to 'r', but experience has
9577 shown [perl #127392] that if the input is a mortal, we can get a
9578 huge build-up of these during regex compilation before they get
9581 invlist_replace_list_destroys_src(*i, r);
9593 Perl__add_range_to_invlist(pTHX_ SV* invlist, UV start, UV end)
9595 /* Add the range from 'start' to 'end' inclusive to the inversion list's
9596 * set. A pointer to the inversion list is returned. This may actually be
9597 * a new list, in which case the passed in one has been destroyed. The
9598 * passed-in inversion list can be NULL, in which case a new one is created
9599 * with just the one range in it. The new list is not necessarily
9600 * NUL-terminated. Space is not freed if the inversion list shrinks as a
9601 * result of this function. The gain would not be large, and in many
9602 * cases, this is called multiple times on a single inversion list, so
9603 * anything freed may almost immediately be needed again.
9605 * This used to mostly call the 'union' routine, but that is much more
9606 * heavyweight than really needed for a single range addition */
9608 UV* array; /* The array implementing the inversion list */
9609 UV len; /* How many elements in 'array' */
9610 SSize_t i_s; /* index into the invlist array where 'start'
9612 SSize_t i_e = 0; /* And the index where 'end' should go */
9613 UV cur_highest; /* The highest code point in the inversion list
9614 upon entry to this function */
9616 /* This range becomes the whole inversion list if none already existed */
9617 if (invlist == NULL) {
9618 invlist = _new_invlist(2);
9619 _append_range_to_invlist(invlist, start, end);
9623 /* Likewise, if the inversion list is currently empty */
9624 len = _invlist_len(invlist);
9626 _append_range_to_invlist(invlist, start, end);
9630 /* Starting here, we have to know the internals of the list */
9631 array = invlist_array(invlist);
9633 /* If the new range ends higher than the current highest ... */
9634 cur_highest = invlist_highest(invlist);
9635 if (end > cur_highest) {
9637 /* If the whole range is higher, we can just append it */
9638 if (start > cur_highest) {
9639 _append_range_to_invlist(invlist, start, end);
9643 /* Otherwise, add the portion that is higher ... */
9644 _append_range_to_invlist(invlist, cur_highest + 1, end);
9646 /* ... and continue on below to handle the rest. As a result of the
9647 * above append, we know that the index of the end of the range is the
9648 * final even numbered one of the array. Recall that the final element
9649 * always starts a range that extends to infinity. If that range is in
9650 * the set (meaning the set goes from here to infinity), it will be an
9651 * even index, but if it isn't in the set, it's odd, and the final
9652 * range in the set is one less, which is even. */
9653 if (end == UV_MAX) {
9661 /* We have dealt with appending, now see about prepending. If the new
9662 * range starts lower than the current lowest ... */
9663 if (start < array[0]) {
9665 /* Adding something which has 0 in it is somewhat tricky, and uncommon.
9666 * Let the union code handle it, rather than having to know the
9667 * trickiness in two code places. */
9668 if (UNLIKELY(start == 0)) {
9671 range_invlist = _new_invlist(2);
9672 _append_range_to_invlist(range_invlist, start, end);
9674 _invlist_union(invlist, range_invlist, &invlist);
9676 SvREFCNT_dec_NN(range_invlist);
9681 /* If the whole new range comes before the first entry, and doesn't
9682 * extend it, we have to insert it as an additional range */
9683 if (end < array[0] - 1) {
9685 goto splice_in_new_range;
9688 /* Here the new range adjoins the existing first range, extending it
9692 /* And continue on below to handle the rest. We know that the index of
9693 * the beginning of the range is the first one of the array */
9696 else { /* Not prepending any part of the new range to the existing list.
9697 * Find where in the list it should go. This finds i_s, such that:
9698 * invlist[i_s] <= start < array[i_s+1]
9700 i_s = _invlist_search(invlist, start);
9703 /* At this point, any extending before the beginning of the inversion list
9704 * and/or after the end has been done. This has made it so that, in the
9705 * code below, each endpoint of the new range is either in a range that is
9706 * in the set, or is in a gap between two ranges that are. This means we
9707 * don't have to worry about exceeding the array bounds.
9709 * Find where in the list the new range ends (but we can skip this if we
9710 * have already determined what it is, or if it will be the same as i_s,
9711 * which we already have computed) */
9713 i_e = (start == end)
9715 : _invlist_search(invlist, end);
9718 /* Here generally invlist[i_e] <= end < array[i_e+1]. But if invlist[i_e]
9719 * is a range that goes to infinity there is no element at invlist[i_e+1],
9720 * so only the first relation holds. */
9722 if ( ! ELEMENT_RANGE_MATCHES_INVLIST(i_s)) {
9724 /* Here, the ranges on either side of the beginning of the new range
9725 * are in the set, and this range starts in the gap between them.
9727 * The new range extends the range above it downwards if the new range
9728 * ends at or above that range's start */
9729 const bool extends_the_range_above = ( end == UV_MAX
9730 || end + 1 >= array[i_s+1]);
9732 /* The new range extends the range below it upwards if it begins just
9733 * after where that range ends */
9734 if (start == array[i_s]) {
9736 /* If the new range fills the entire gap between the other ranges,
9737 * they will get merged together. Other ranges may also get
9738 * merged, depending on how many of them the new range spans. In
9739 * the general case, we do the merge later, just once, after we
9740 * figure out how many to merge. But in the case where the new
9741 * range exactly spans just this one gap (possibly extending into
9742 * the one above), we do the merge here, and an early exit. This
9743 * is done here to avoid having to special case later. */
9744 if (i_e - i_s <= 1) {
9746 /* If i_e - i_s == 1, it means that the new range terminates
9747 * within the range above, and hence 'extends_the_range_above'
9748 * must be true. (If the range above it extends to infinity,
9749 * 'i_s+2' will be above the array's limit, but 'len-i_s-2'
9750 * will be 0, so no harm done.) */
9751 if (extends_the_range_above) {
9752 Move(array + i_s + 2, array + i_s, len - i_s - 2, UV);
9753 invlist_set_len(invlist,
9755 *(get_invlist_offset_addr(invlist)));
9759 /* Here, i_e must == i_s. We keep them in sync, as they apply
9760 * to the same range, and below we are about to decrement i_s
9765 /* Here, the new range is adjacent to the one below. (It may also
9766 * span beyond the range above, but that will get resolved later.)
9767 * Extend the range below to include this one. */
9768 array[i_s] = (end == UV_MAX) ? UV_MAX : end + 1;
9772 else if (extends_the_range_above) {
9774 /* Here the new range only extends the range above it, but not the
9775 * one below. It merges with the one above. Again, we keep i_e
9776 * and i_s in sync if they point to the same range */
9785 /* Here, we've dealt with the new range start extending any adjoining
9788 * If the new range extends to infinity, it is now the final one,
9789 * regardless of what was there before */
9790 if (UNLIKELY(end == UV_MAX)) {
9791 invlist_set_len(invlist, i_s + 1, *(get_invlist_offset_addr(invlist)));
9795 /* If i_e started as == i_s, it has also been dealt with,
9796 * and been updated to the new i_s, which will fail the following if */
9797 if (! ELEMENT_RANGE_MATCHES_INVLIST(i_e)) {
9799 /* Here, the ranges on either side of the end of the new range are in
9800 * the set, and this range ends in the gap between them.
9802 * If this range is adjacent to (hence extends) the range above it, it
9803 * becomes part of that range; likewise if it extends the range below,
9804 * it becomes part of that range */
9805 if (end + 1 == array[i_e+1]) {
9809 else if (start <= array[i_e]) {
9810 array[i_e] = end + 1;
9817 /* If the range fits entirely in an existing range (as possibly already
9818 * extended above), it doesn't add anything new */
9819 if (ELEMENT_RANGE_MATCHES_INVLIST(i_s)) {
9823 /* Here, no part of the range is in the list. Must add it. It will
9824 * occupy 2 more slots */
9825 splice_in_new_range:
9827 invlist_extend(invlist, len + 2);
9828 array = invlist_array(invlist);
9829 /* Move the rest of the array down two slots. Don't include any
9831 Move(array + i_e + 1, array + i_e + 3, len - i_e - 1, UV);
9833 /* Do the actual splice */
9834 array[i_e+1] = start;
9835 array[i_e+2] = end + 1;
9836 invlist_set_len(invlist, len + 2, *(get_invlist_offset_addr(invlist)));
9840 /* Here the new range crossed the boundaries of a pre-existing range. The
9841 * code above has adjusted things so that both ends are in ranges that are
9842 * in the set. This means everything in between must also be in the set.
9843 * Just squash things together */
9844 Move(array + i_e + 1, array + i_s + 1, len - i_e - 1, UV);
9845 invlist_set_len(invlist,
9847 *(get_invlist_offset_addr(invlist)));
9853 Perl__setup_canned_invlist(pTHX_ const STRLEN size, const UV element0,
9854 UV** other_elements_ptr)
9856 /* Create and return an inversion list whose contents are to be populated
9857 * by the caller. The caller gives the number of elements (in 'size') and
9858 * the very first element ('element0'). This function will set
9859 * '*other_elements_ptr' to an array of UVs, where the remaining elements
9862 * Obviously there is some trust involved that the caller will properly
9863 * fill in the other elements of the array.
9865 * (The first element needs to be passed in, as the underlying code does
9866 * things differently depending on whether it is zero or non-zero) */
9868 SV* invlist = _new_invlist(size);
9871 PERL_ARGS_ASSERT__SETUP_CANNED_INVLIST;
9873 invlist = add_cp_to_invlist(invlist, element0);
9874 offset = *get_invlist_offset_addr(invlist);
9876 invlist_set_len(invlist, size, offset);
9877 *other_elements_ptr = invlist_array(invlist) + 1;
9883 PERL_STATIC_INLINE SV*
9884 S_add_cp_to_invlist(pTHX_ SV* invlist, const UV cp) {
9885 return _add_range_to_invlist(invlist, cp, cp);
9888 #ifndef PERL_IN_XSUB_RE
9890 Perl__invlist_invert(pTHX_ SV* const invlist)
9892 /* Complement the input inversion list. This adds a 0 if the list didn't
9893 * have a zero; removes it otherwise. As described above, the data
9894 * structure is set up so that this is very efficient */
9896 PERL_ARGS_ASSERT__INVLIST_INVERT;
9898 assert(! invlist_is_iterating(invlist));
9900 /* The inverse of matching nothing is matching everything */
9901 if (_invlist_len(invlist) == 0) {
9902 _append_range_to_invlist(invlist, 0, UV_MAX);
9906 *get_invlist_offset_addr(invlist) = ! *get_invlist_offset_addr(invlist);
9911 PERL_STATIC_INLINE SV*
9912 S_invlist_clone(pTHX_ SV* const invlist)
9915 /* Return a new inversion list that is a copy of the input one, which is
9916 * unchanged. The new list will not be mortal even if the old one was. */
9918 /* Need to allocate extra space to accommodate Perl's addition of a
9919 * trailing NUL to SvPV's, since it thinks they are always strings */
9920 SV* new_invlist = _new_invlist(_invlist_len(invlist) + 1);
9921 STRLEN physical_length = SvCUR(invlist);
9922 bool offset = *(get_invlist_offset_addr(invlist));
9924 PERL_ARGS_ASSERT_INVLIST_CLONE;
9926 *(get_invlist_offset_addr(new_invlist)) = offset;
9927 invlist_set_len(new_invlist, _invlist_len(invlist), offset);
9928 Copy(SvPVX(invlist), SvPVX(new_invlist), physical_length, char);
9933 PERL_STATIC_INLINE STRLEN*
9934 S_get_invlist_iter_addr(SV* invlist)
9936 /* Return the address of the UV that contains the current iteration
9939 PERL_ARGS_ASSERT_GET_INVLIST_ITER_ADDR;
9941 assert(SvTYPE(invlist) == SVt_INVLIST);
9943 return &(((XINVLIST*) SvANY(invlist))->iterator);
9946 PERL_STATIC_INLINE void
9947 S_invlist_iterinit(SV* invlist) /* Initialize iterator for invlist */
9949 PERL_ARGS_ASSERT_INVLIST_ITERINIT;
9951 *get_invlist_iter_addr(invlist) = 0;
9954 PERL_STATIC_INLINE void
9955 S_invlist_iterfinish(SV* invlist)
9957 /* Terminate iterator for invlist. This is to catch development errors.
9958 * Any iteration that is interrupted before completed should call this
9959 * function. Functions that add code points anywhere else but to the end
9960 * of an inversion list assert that they are not in the middle of an
9961 * iteration. If they were, the addition would make the iteration
9962 * problematical: if the iteration hadn't reached the place where things
9963 * were being added, it would be ok */
9965 PERL_ARGS_ASSERT_INVLIST_ITERFINISH;
9967 *get_invlist_iter_addr(invlist) = (STRLEN) UV_MAX;
9971 S_invlist_iternext(SV* invlist, UV* start, UV* end)
9973 /* An C<invlist_iterinit> call on <invlist> must be used to set this up.
9974 * This call sets in <*start> and <*end>, the next range in <invlist>.
9975 * Returns <TRUE> if successful and the next call will return the next
9976 * range; <FALSE> if was already at the end of the list. If the latter,
9977 * <*start> and <*end> are unchanged, and the next call to this function
9978 * will start over at the beginning of the list */
9980 STRLEN* pos = get_invlist_iter_addr(invlist);
9981 UV len = _invlist_len(invlist);
9984 PERL_ARGS_ASSERT_INVLIST_ITERNEXT;
9987 *pos = (STRLEN) UV_MAX; /* Force iterinit() to be required next time */
9991 array = invlist_array(invlist);
9993 *start = array[(*pos)++];
9999 *end = array[(*pos)++] - 1;
10005 PERL_STATIC_INLINE UV
10006 S_invlist_highest(SV* const invlist)
10008 /* Returns the highest code point that matches an inversion list. This API
10009 * has an ambiguity, as it returns 0 under either the highest is actually
10010 * 0, or if the list is empty. If this distinction matters to you, check
10011 * for emptiness before calling this function */
10013 UV len = _invlist_len(invlist);
10016 PERL_ARGS_ASSERT_INVLIST_HIGHEST;
10022 array = invlist_array(invlist);
10024 /* The last element in the array in the inversion list always starts a
10025 * range that goes to infinity. That range may be for code points that are
10026 * matched in the inversion list, or it may be for ones that aren't
10027 * matched. In the latter case, the highest code point in the set is one
10028 * less than the beginning of this range; otherwise it is the final element
10029 * of this range: infinity */
10030 return (ELEMENT_RANGE_MATCHES_INVLIST(len - 1))
10032 : array[len - 1] - 1;
10036 S_invlist_contents(pTHX_ SV* const invlist, const bool traditional_style)
10038 /* Get the contents of an inversion list into a string SV so that they can
10039 * be printed out. If 'traditional_style' is TRUE, it uses the format
10040 * traditionally done for debug tracing; otherwise it uses a format
10041 * suitable for just copying to the output, with blanks between ranges and
10042 * a dash between range components */
10046 const char intra_range_delimiter = (traditional_style ? '\t' : '-');
10047 const char inter_range_delimiter = (traditional_style ? '\n' : ' ');
10049 if (traditional_style) {
10050 output = newSVpvs("\n");
10053 output = newSVpvs("");
10056 PERL_ARGS_ASSERT_INVLIST_CONTENTS;
10058 assert(! invlist_is_iterating(invlist));
10060 invlist_iterinit(invlist);
10061 while (invlist_iternext(invlist, &start, &end)) {
10062 if (end == UV_MAX) {
10063 Perl_sv_catpvf(aTHX_ output, "%04" UVXf "%cINFINITY%c",
10064 start, intra_range_delimiter,
10065 inter_range_delimiter);
10067 else if (end != start) {
10068 Perl_sv_catpvf(aTHX_ output, "%04" UVXf "%c%04" UVXf "%c",
10070 intra_range_delimiter,
10071 end, inter_range_delimiter);
10074 Perl_sv_catpvf(aTHX_ output, "%04" UVXf "%c",
10075 start, inter_range_delimiter);
10079 if (SvCUR(output) && ! traditional_style) {/* Get rid of trailing blank */
10080 SvCUR_set(output, SvCUR(output) - 1);
10086 #ifndef PERL_IN_XSUB_RE
10088 Perl__invlist_dump(pTHX_ PerlIO *file, I32 level,
10089 const char * const indent, SV* const invlist)
10091 /* Designed to be called only by do_sv_dump(). Dumps out the ranges of the
10092 * inversion list 'invlist' to 'file' at 'level' Each line is prefixed by
10093 * the string 'indent'. The output looks like this:
10094 [0] 0x000A .. 0x000D
10096 [4] 0x2028 .. 0x2029
10097 [6] 0x3104 .. INFINITY
10098 * This means that the first range of code points matched by the list are
10099 * 0xA through 0xD; the second range contains only the single code point
10100 * 0x85, etc. An inversion list is an array of UVs. Two array elements
10101 * are used to define each range (except if the final range extends to
10102 * infinity, only a single element is needed). The array index of the
10103 * first element for the corresponding range is given in brackets. */
10108 PERL_ARGS_ASSERT__INVLIST_DUMP;
10110 if (invlist_is_iterating(invlist)) {
10111 Perl_dump_indent(aTHX_ level, file,
10112 "%sCan't dump inversion list because is in middle of iterating\n",
10117 invlist_iterinit(invlist);
10118 while (invlist_iternext(invlist, &start, &end)) {
10119 if (end == UV_MAX) {
10120 Perl_dump_indent(aTHX_ level, file,
10121 "%s[%" UVuf "] 0x%04" UVXf " .. INFINITY\n",
10122 indent, (UV)count, start);
10124 else if (end != start) {
10125 Perl_dump_indent(aTHX_ level, file,
10126 "%s[%" UVuf "] 0x%04" UVXf " .. 0x%04" UVXf "\n",
10127 indent, (UV)count, start, end);
10130 Perl_dump_indent(aTHX_ level, file, "%s[%" UVuf "] 0x%04" UVXf "\n",
10131 indent, (UV)count, start);
10138 Perl__load_PL_utf8_foldclosures (pTHX)
10140 assert(! PL_utf8_foldclosures);
10142 /* If the folds haven't been read in, call a fold function
10144 if (! PL_utf8_tofold) {
10145 U8 dummy[UTF8_MAXBYTES_CASE+1];
10146 const U8 hyphen[] = HYPHEN_UTF8;
10148 /* This string is just a short named one above \xff */
10149 toFOLD_utf8_safe(hyphen, hyphen + sizeof(hyphen) - 1, dummy, NULL);
10150 assert(PL_utf8_tofold); /* Verify that worked */
10152 PL_utf8_foldclosures = _swash_inversion_hash(PL_utf8_tofold);
10156 #if defined(PERL_ARGS_ASSERT__INVLISTEQ) && !defined(PERL_IN_XSUB_RE)
10158 Perl__invlistEQ(pTHX_ SV* const a, SV* const b, const bool complement_b)
10160 /* Return a boolean as to if the two passed in inversion lists are
10161 * identical. The final argument, if TRUE, says to take the complement of
10162 * the second inversion list before doing the comparison */
10164 const UV* array_a = invlist_array(a);
10165 const UV* array_b = invlist_array(b);
10166 UV len_a = _invlist_len(a);
10167 UV len_b = _invlist_len(b);
10169 PERL_ARGS_ASSERT__INVLISTEQ;
10171 /* If are to compare 'a' with the complement of b, set it
10172 * up so are looking at b's complement. */
10173 if (complement_b) {
10175 /* The complement of nothing is everything, so <a> would have to have
10176 * just one element, starting at zero (ending at infinity) */
10178 return (len_a == 1 && array_a[0] == 0);
10180 else if (array_b[0] == 0) {
10182 /* Otherwise, to complement, we invert. Here, the first element is
10183 * 0, just remove it. To do this, we just pretend the array starts
10191 /* But if the first element is not zero, we pretend the list starts
10192 * at the 0 that is always stored immediately before the array. */
10198 return len_a == len_b
10199 && memEQ(array_a, array_b, len_a * sizeof(array_a[0]));
10205 * As best we can, determine the characters that can match the start of
10206 * the given EXACTF-ish node.
10208 * Returns the invlist as a new SV*; it is the caller's responsibility to
10209 * call SvREFCNT_dec() when done with it.
10212 S__make_exactf_invlist(pTHX_ RExC_state_t *pRExC_state, regnode *node)
10214 const U8 * s = (U8*)STRING(node);
10215 SSize_t bytelen = STR_LEN(node);
10217 /* Start out big enough for 2 separate code points */
10218 SV* invlist = _new_invlist(4);
10220 PERL_ARGS_ASSERT__MAKE_EXACTF_INVLIST;
10225 /* We punt and assume can match anything if the node begins
10226 * with a multi-character fold. Things are complicated. For
10227 * example, /ffi/i could match any of:
10228 * "\N{LATIN SMALL LIGATURE FFI}"
10229 * "\N{LATIN SMALL LIGATURE FF}I"
10230 * "F\N{LATIN SMALL LIGATURE FI}"
10231 * plus several other things; and making sure we have all the
10232 * possibilities is hard. */
10233 if (is_MULTI_CHAR_FOLD_latin1_safe(s, s + bytelen)) {
10234 invlist = _add_range_to_invlist(invlist, 0, UV_MAX);
10237 /* Any Latin1 range character can potentially match any
10238 * other depending on the locale */
10239 if (OP(node) == EXACTFL) {
10240 _invlist_union(invlist, PL_Latin1, &invlist);
10243 /* But otherwise, it matches at least itself. We can
10244 * quickly tell if it has a distinct fold, and if so,
10245 * it matches that as well */
10246 invlist = add_cp_to_invlist(invlist, uc);
10247 if (IS_IN_SOME_FOLD_L1(uc))
10248 invlist = add_cp_to_invlist(invlist, PL_fold_latin1[uc]);
10251 /* Some characters match above-Latin1 ones under /i. This
10252 * is true of EXACTFL ones when the locale is UTF-8 */
10253 if (HAS_NONLATIN1_SIMPLE_FOLD_CLOSURE(uc)
10254 && (! isASCII(uc) || (OP(node) != EXACTFA
10255 && OP(node) != EXACTFA_NO_TRIE)))
10257 add_above_Latin1_folds(pRExC_state, (U8) uc, &invlist);
10261 else { /* Pattern is UTF-8 */
10262 U8 folded[UTF8_MAX_FOLD_CHAR_EXPAND * UTF8_MAXBYTES_CASE + 1] = { '\0' };
10263 STRLEN foldlen = UTF8SKIP(s);
10264 const U8* e = s + bytelen;
10267 uc = utf8_to_uvchr_buf(s, s + bytelen, NULL);
10269 /* The only code points that aren't folded in a UTF EXACTFish
10270 * node are are the problematic ones in EXACTFL nodes */
10271 if (OP(node) == EXACTFL && is_PROBLEMATIC_LOCALE_FOLDEDS_START_cp(uc)) {
10272 /* We need to check for the possibility that this EXACTFL
10273 * node begins with a multi-char fold. Therefore we fold
10274 * the first few characters of it so that we can make that
10279 for (i = 0; i < UTF8_MAX_FOLD_CHAR_EXPAND && s < e; i++) {
10281 *(d++) = (U8) toFOLD(*s);
10286 toFOLD_utf8_safe(s, e, d, &len);
10292 /* And set up so the code below that looks in this folded
10293 * buffer instead of the node's string */
10295 foldlen = UTF8SKIP(folded);
10299 /* When we reach here 's' points to the fold of the first
10300 * character(s) of the node; and 'e' points to far enough along
10301 * the folded string to be just past any possible multi-char
10302 * fold. 'foldlen' is the length in bytes of the first
10305 * Unlike the non-UTF-8 case, the macro for determining if a
10306 * string is a multi-char fold requires all the characters to
10307 * already be folded. This is because of all the complications
10308 * if not. Note that they are folded anyway, except in EXACTFL
10309 * nodes. Like the non-UTF case above, we punt if the node
10310 * begins with a multi-char fold */
10312 if (is_MULTI_CHAR_FOLD_utf8_safe(s, e)) {
10313 invlist = _add_range_to_invlist(invlist, 0, UV_MAX);
10315 else { /* Single char fold */
10317 /* It matches all the things that fold to it, which are
10318 * found in PL_utf8_foldclosures (including itself) */
10319 invlist = add_cp_to_invlist(invlist, uc);
10320 if (! PL_utf8_foldclosures)
10321 _load_PL_utf8_foldclosures();
10322 if ((listp = hv_fetch(PL_utf8_foldclosures,
10323 (char *) s, foldlen, FALSE)))
10325 AV* list = (AV*) *listp;
10327 for (k = 0; k <= av_tindex_skip_len_mg(list); k++) {
10328 SV** c_p = av_fetch(list, k, FALSE);
10334 /* /aa doesn't allow folds between ASCII and non- */
10335 if ((OP(node) == EXACTFA || OP(node) == EXACTFA_NO_TRIE)
10336 && isASCII(c) != isASCII(uc))
10341 invlist = add_cp_to_invlist(invlist, c);
10350 #undef HEADER_LENGTH
10351 #undef TO_INTERNAL_SIZE
10352 #undef FROM_INTERNAL_SIZE
10353 #undef INVLIST_VERSION_ID
10355 /* End of inversion list object */
10358 S_parse_lparen_question_flags(pTHX_ RExC_state_t *pRExC_state)
10360 /* This parses the flags that are in either the '(?foo)' or '(?foo:bar)'
10361 * constructs, and updates RExC_flags with them. On input, RExC_parse
10362 * should point to the first flag; it is updated on output to point to the
10363 * final ')' or ':'. There needs to be at least one flag, or this will
10366 /* for (?g), (?gc), and (?o) warnings; warning
10367 about (?c) will warn about (?g) -- japhy */
10369 #define WASTED_O 0x01
10370 #define WASTED_G 0x02
10371 #define WASTED_C 0x04
10372 #define WASTED_GC (WASTED_G|WASTED_C)
10373 I32 wastedflags = 0x00;
10374 U32 posflags = 0, negflags = 0;
10375 U32 *flagsp = &posflags;
10376 char has_charset_modifier = '\0';
10378 bool has_use_defaults = FALSE;
10379 const char* const seqstart = RExC_parse - 1; /* Point to the '?' */
10380 int x_mod_count = 0;
10382 PERL_ARGS_ASSERT_PARSE_LPAREN_QUESTION_FLAGS;
10384 /* '^' as an initial flag sets certain defaults */
10385 if (UCHARAT(RExC_parse) == '^') {
10387 has_use_defaults = TRUE;
10388 STD_PMMOD_FLAGS_CLEAR(&RExC_flags);
10389 set_regex_charset(&RExC_flags, (RExC_utf8 || RExC_uni_semantics)
10390 ? REGEX_UNICODE_CHARSET
10391 : REGEX_DEPENDS_CHARSET);
10394 cs = get_regex_charset(RExC_flags);
10395 if (cs == REGEX_DEPENDS_CHARSET
10396 && (RExC_utf8 || RExC_uni_semantics))
10398 cs = REGEX_UNICODE_CHARSET;
10401 while (RExC_parse < RExC_end) {
10402 /* && strchr("iogcmsx", *RExC_parse) */
10403 /* (?g), (?gc) and (?o) are useless here
10404 and must be globally applied -- japhy */
10405 switch (*RExC_parse) {
10407 /* Code for the imsxn flags */
10408 CASE_STD_PMMOD_FLAGS_PARSE_SET(flagsp, x_mod_count);
10410 case LOCALE_PAT_MOD:
10411 if (has_charset_modifier) {
10412 goto excess_modifier;
10414 else if (flagsp == &negflags) {
10417 cs = REGEX_LOCALE_CHARSET;
10418 has_charset_modifier = LOCALE_PAT_MOD;
10420 case UNICODE_PAT_MOD:
10421 if (has_charset_modifier) {
10422 goto excess_modifier;
10424 else if (flagsp == &negflags) {
10427 cs = REGEX_UNICODE_CHARSET;
10428 has_charset_modifier = UNICODE_PAT_MOD;
10430 case ASCII_RESTRICT_PAT_MOD:
10431 if (flagsp == &negflags) {
10434 if (has_charset_modifier) {
10435 if (cs != REGEX_ASCII_RESTRICTED_CHARSET) {
10436 goto excess_modifier;
10438 /* Doubled modifier implies more restricted */
10439 cs = REGEX_ASCII_MORE_RESTRICTED_CHARSET;
10442 cs = REGEX_ASCII_RESTRICTED_CHARSET;
10444 has_charset_modifier = ASCII_RESTRICT_PAT_MOD;
10446 case DEPENDS_PAT_MOD:
10447 if (has_use_defaults) {
10448 goto fail_modifiers;
10450 else if (flagsp == &negflags) {
10453 else if (has_charset_modifier) {
10454 goto excess_modifier;
10457 /* The dual charset means unicode semantics if the
10458 * pattern (or target, not known until runtime) are
10459 * utf8, or something in the pattern indicates unicode
10461 cs = (RExC_utf8 || RExC_uni_semantics)
10462 ? REGEX_UNICODE_CHARSET
10463 : REGEX_DEPENDS_CHARSET;
10464 has_charset_modifier = DEPENDS_PAT_MOD;
10468 if (has_charset_modifier == ASCII_RESTRICT_PAT_MOD) {
10469 vFAIL2("Regexp modifier \"%c\" may appear a maximum of twice", ASCII_RESTRICT_PAT_MOD);
10471 else if (has_charset_modifier == *(RExC_parse - 1)) {
10472 vFAIL2("Regexp modifier \"%c\" may not appear twice",
10473 *(RExC_parse - 1));
10476 vFAIL3("Regexp modifiers \"%c\" and \"%c\" are mutually exclusive", has_charset_modifier, *(RExC_parse - 1));
10478 NOT_REACHED; /*NOTREACHED*/
10481 vFAIL2("Regexp modifier \"%c\" may not appear after the \"-\"",
10482 *(RExC_parse - 1));
10483 NOT_REACHED; /*NOTREACHED*/
10484 case ONCE_PAT_MOD: /* 'o' */
10485 case GLOBAL_PAT_MOD: /* 'g' */
10486 if (PASS2 && ckWARN(WARN_REGEXP)) {
10487 const I32 wflagbit = *RExC_parse == 'o'
10490 if (! (wastedflags & wflagbit) ) {
10491 wastedflags |= wflagbit;
10492 /* diag_listed_as: Useless (?-%s) - don't use /%s modifier in regex; marked by <-- HERE in m/%s/ */
10495 "Useless (%s%c) - %suse /%c modifier",
10496 flagsp == &negflags ? "?-" : "?",
10498 flagsp == &negflags ? "don't " : "",
10505 case CONTINUE_PAT_MOD: /* 'c' */
10506 if (PASS2 && ckWARN(WARN_REGEXP)) {
10507 if (! (wastedflags & WASTED_C) ) {
10508 wastedflags |= WASTED_GC;
10509 /* diag_listed_as: Useless (?-%s) - don't use /%s modifier in regex; marked by <-- HERE in m/%s/ */
10512 "Useless (%sc) - %suse /gc modifier",
10513 flagsp == &negflags ? "?-" : "?",
10514 flagsp == &negflags ? "don't " : ""
10519 case KEEPCOPY_PAT_MOD: /* 'p' */
10520 if (flagsp == &negflags) {
10522 ckWARNreg(RExC_parse + 1,"Useless use of (?-p)");
10524 *flagsp |= RXf_PMf_KEEPCOPY;
10528 /* A flag is a default iff it is following a minus, so
10529 * if there is a minus, it means will be trying to
10530 * re-specify a default which is an error */
10531 if (has_use_defaults || flagsp == &negflags) {
10532 goto fail_modifiers;
10534 flagsp = &negflags;
10535 wastedflags = 0; /* reset so (?g-c) warns twice */
10541 if ((posflags & (RXf_PMf_EXTENDED|RXf_PMf_EXTENDED_MORE)) == RXf_PMf_EXTENDED) {
10542 negflags |= RXf_PMf_EXTENDED_MORE;
10544 RExC_flags |= posflags;
10546 if (negflags & RXf_PMf_EXTENDED) {
10547 negflags |= RXf_PMf_EXTENDED_MORE;
10549 RExC_flags &= ~negflags;
10550 set_regex_charset(&RExC_flags, cs);
10555 RExC_parse += SKIP_IF_CHAR(RExC_parse);
10556 /* diag_listed_as: Sequence (?%s...) not recognized in regex; marked by <-- HERE in m/%s/ */
10557 vFAIL2utf8f("Sequence (%" UTF8f "...) not recognized",
10558 UTF8fARG(UTF, RExC_parse-seqstart, seqstart));
10559 NOT_REACHED; /*NOTREACHED*/
10562 RExC_parse += UTF ? UTF8SKIP(RExC_parse) : 1;
10565 vFAIL("Sequence (?... not terminated");
10569 - reg - regular expression, i.e. main body or parenthesized thing
10571 * Caller must absorb opening parenthesis.
10573 * Combining parenthesis handling with the base level of regular expression
10574 * is a trifle forced, but the need to tie the tails of the branches to what
10575 * follows makes it hard to avoid.
10577 #define REGTAIL(x,y,z) regtail((x),(y),(z),depth+1)
10579 #define REGTAIL_STUDY(x,y,z) regtail_study((x),(y),(z),depth+1)
10581 #define REGTAIL_STUDY(x,y,z) regtail((x),(y),(z),depth+1)
10584 PERL_STATIC_INLINE regnode *
10585 S_handle_named_backref(pTHX_ RExC_state_t *pRExC_state,
10587 char * parse_start,
10592 char* name_start = RExC_parse;
10594 SV *sv_dat = reg_scan_name(pRExC_state, SIZE_ONLY
10595 ? REG_RSN_RETURN_NULL
10596 : REG_RSN_RETURN_DATA);
10597 GET_RE_DEBUG_FLAGS_DECL;
10599 PERL_ARGS_ASSERT_HANDLE_NAMED_BACKREF;
10601 if (RExC_parse == name_start || *RExC_parse != ch) {
10602 /* diag_listed_as: Sequence \%s... not terminated in regex; marked by <-- HERE in m/%s/ */
10603 vFAIL2("Sequence %.3s... not terminated",parse_start);
10607 num = add_data( pRExC_state, STR_WITH_LEN("S"));
10608 RExC_rxi->data->data[num]=(void*)sv_dat;
10609 SvREFCNT_inc_simple_void(sv_dat);
10612 ret = reganode(pRExC_state,
10615 : (ASCII_FOLD_RESTRICTED)
10617 : (AT_LEAST_UNI_SEMANTICS)
10623 *flagp |= HASWIDTH;
10625 Set_Node_Offset(ret, parse_start+1);
10626 Set_Node_Cur_Length(ret, parse_start);
10628 nextchar(pRExC_state);
10632 /* Returns NULL, setting *flagp to TRYAGAIN at the end of (?) that only sets
10633 flags. Returns NULL, setting *flagp to RESTART_PASS1 if the sizing scan
10634 needs to be restarted, or'd with NEED_UTF8 if the pattern needs to be
10635 upgraded to UTF-8. Otherwise would only return NULL if regbranch() returns
10636 NULL, which cannot happen. */
10638 S_reg(pTHX_ RExC_state_t *pRExC_state, I32 paren, I32 *flagp,U32 depth)
10639 /* paren: Parenthesized? 0=top; 1,2=inside '(': changed to letter.
10640 * 2 is like 1, but indicates that nextchar() has been called to advance
10641 * RExC_parse beyond the '('. Things like '(?' are indivisible tokens, and
10642 * this flag alerts us to the need to check for that */
10644 regnode *ret; /* Will be the head of the group. */
10647 regnode *ender = NULL;
10650 U32 oregflags = RExC_flags;
10651 bool have_branch = 0;
10653 I32 freeze_paren = 0;
10654 I32 after_freeze = 0;
10655 I32 num; /* numeric backreferences */
10657 char * parse_start = RExC_parse; /* MJD */
10658 char * const oregcomp_parse = RExC_parse;
10660 GET_RE_DEBUG_FLAGS_DECL;
10662 PERL_ARGS_ASSERT_REG;
10663 DEBUG_PARSE("reg ");
10665 *flagp = 0; /* Tentatively. */
10667 /* Having this true makes it feasible to have a lot fewer tests for the
10668 * parse pointer being in scope. For example, we can write
10669 * while(isFOO(*RExC_parse)) RExC_parse++;
10671 * while(RExC_parse < RExC_end && isFOO(*RExC_parse)) RExC_parse++;
10673 assert(*RExC_end == '\0');
10675 /* Make an OPEN node, if parenthesized. */
10678 /* Under /x, space and comments can be gobbled up between the '(' and
10679 * here (if paren ==2). The forms '(*VERB' and '(?...' disallow such
10680 * intervening space, as the sequence is a token, and a token should be
10682 bool has_intervening_patws = (paren == 2 || paren == 's')
10683 && *(RExC_parse - 1) != '(';
10685 if (RExC_parse >= RExC_end) {
10686 vFAIL("Unmatched (");
10689 if (paren == 's') {
10691 /* A nested script run is a no-op besides clustering */
10692 if (RExC_in_script_run) {
10694 nextchar(pRExC_state);
10698 RExC_in_script_run = 1;
10700 ret = reg_node(pRExC_state, SROPEN);
10703 else if ( *RExC_parse == '*') { /* (*VERB:ARG) */
10704 char *start_verb = RExC_parse + 1;
10706 char *start_arg = NULL;
10707 unsigned char op = 0;
10708 int arg_required = 0;
10709 int internal_argval = -1; /* if >-1 we are not allowed an argument*/
10711 if (has_intervening_patws) {
10712 RExC_parse++; /* past the '*' */
10713 vFAIL("In '(*VERB...)', the '(' and '*' must be adjacent");
10715 while (RExC_parse < RExC_end && *RExC_parse != ')' ) {
10716 if ( *RExC_parse == ':' ) {
10717 start_arg = RExC_parse + 1;
10720 RExC_parse += UTF ? UTF8SKIP(RExC_parse) : 1;
10722 verb_len = RExC_parse - start_verb;
10724 if (RExC_parse >= RExC_end) {
10725 goto unterminated_verb_pattern;
10728 RExC_parse += UTF ? UTF8SKIP(RExC_parse) : 1;
10729 while ( RExC_parse < RExC_end && *RExC_parse != ')' )
10730 RExC_parse += UTF ? UTF8SKIP(RExC_parse) : 1;
10731 if ( RExC_parse >= RExC_end || *RExC_parse != ')' )
10732 unterminated_verb_pattern:
10733 vFAIL("Unterminated verb pattern argument");
10734 if ( RExC_parse == start_arg )
10737 if ( RExC_parse >= RExC_end || *RExC_parse != ')' )
10738 vFAIL("Unterminated verb pattern");
10741 /* Here, we know that RExC_parse < RExC_end */
10743 switch ( *start_verb ) {
10744 case 'A': /* (*ACCEPT) */
10745 if ( memEQs(start_verb,verb_len,"ACCEPT") ) {
10747 internal_argval = RExC_nestroot;
10750 case 'C': /* (*COMMIT) */
10751 if ( memEQs(start_verb,verb_len,"COMMIT") )
10754 case 'F': /* (*FAIL) */
10755 if ( verb_len==1 || memEQs(start_verb,verb_len,"FAIL") ) {
10759 case ':': /* (*:NAME) */
10760 case 'M': /* (*MARK:NAME) */
10761 if ( verb_len==0 || memEQs(start_verb,verb_len,"MARK") ) {
10766 case 'P': /* (*PRUNE) */
10767 if ( memEQs(start_verb,verb_len,"PRUNE") )
10770 case 'S': /* (*SKIP) */
10771 if ( memEQs(start_verb,verb_len,"SKIP") )
10774 case 'T': /* (*THEN) */
10775 /* [19:06] <TimToady> :: is then */
10776 if ( memEQs(start_verb,verb_len,"THEN") ) {
10778 RExC_seen |= REG_CUTGROUP_SEEN;
10783 RExC_parse += UTF ? UTF8SKIP(RExC_parse) : 1;
10785 "Unknown verb pattern '%" UTF8f "'",
10786 UTF8fARG(UTF, verb_len, start_verb));
10788 if ( arg_required && !start_arg ) {
10789 vFAIL3("Verb pattern '%.*s' has a mandatory argument",
10790 verb_len, start_verb);
10792 if (internal_argval == -1) {
10793 ret = reganode(pRExC_state, op, 0);
10795 ret = reg2Lanode(pRExC_state, op, 0, internal_argval);
10797 RExC_seen |= REG_VERBARG_SEEN;
10798 if ( ! SIZE_ONLY ) {
10800 SV *sv = newSVpvn( start_arg,
10801 RExC_parse - start_arg);
10802 ARG(ret) = add_data( pRExC_state,
10803 STR_WITH_LEN("S"));
10804 RExC_rxi->data->data[ARG(ret)]=(void*)sv;
10809 if ( internal_argval != -1 )
10810 ARG2L_SET(ret, internal_argval);
10812 nextchar(pRExC_state);
10815 else if (*RExC_parse == '+') { /* (+...) */
10818 if (has_intervening_patws) {
10819 /* XXX Note that a potential gotcha is that outside of /x '( +
10820 * ...)' means to match a space at least once ... This is a
10821 * problem elsewhere too */
10822 vFAIL("In '(+...)', the '(' and '+' must be adjacent");
10825 if (! memBEGINPs(RExC_parse, (STRLEN) (RExC_end - RExC_parse),
10828 RExC_parse += strcspn(RExC_parse, ":)");
10829 vFAIL("Unknown (+ pattern");
10833 /* This indicates Unicode rules. */
10834 REQUIRE_UNI_RULES(flagp, NULL);
10836 RExC_parse += sizeof("script_run:") - 1;
10839 Perl_ck_warner_d(aTHX_
10840 packWARN(WARN_EXPERIMENTAL__SCRIPT_RUN),
10841 "The script_run feature is experimental"
10842 REPORT_LOCATION, REPORT_LOCATION_ARGS(RExC_parse));
10845 ret = reg(pRExC_state, 's', &flags, depth+1);
10846 if (flags & (RESTART_PASS1|NEED_UTF8)) {
10847 *flagp = flags & (RESTART_PASS1|NEED_UTF8);
10854 else if (*RExC_parse == '?') { /* (?...) */
10855 bool is_logical = 0;
10856 const char * const seqstart = RExC_parse;
10857 const char * endptr;
10858 if (has_intervening_patws) {
10860 vFAIL("In '(?...)', the '(' and '?' must be adjacent");
10863 RExC_parse++; /* past the '?' */
10864 paren = *RExC_parse; /* might be a trailing NUL, if not
10866 RExC_parse += UTF ? UTF8SKIP(RExC_parse) : 1;
10867 if (RExC_parse > RExC_end) {
10870 ret = NULL; /* For look-ahead/behind. */
10873 case 'P': /* (?P...) variants for those used to PCRE/Python */
10874 paren = *RExC_parse;
10875 if ( paren == '<') { /* (?P<...>) named capture */
10877 if (RExC_parse >= RExC_end) {
10878 vFAIL("Sequence (?P<... not terminated");
10880 goto named_capture;
10882 else if (paren == '>') { /* (?P>name) named recursion */
10884 if (RExC_parse >= RExC_end) {
10885 vFAIL("Sequence (?P>... not terminated");
10887 goto named_recursion;
10889 else if (paren == '=') { /* (?P=...) named backref */
10891 return handle_named_backref(pRExC_state, flagp,
10894 RExC_parse += SKIP_IF_CHAR(RExC_parse);
10895 /* diag_listed_as: Sequence (?%s...) not recognized in regex; marked by <-- HERE in m/%s/ */
10896 vFAIL3("Sequence (%.*s...) not recognized",
10897 RExC_parse-seqstart, seqstart);
10898 NOT_REACHED; /*NOTREACHED*/
10899 case '<': /* (?<...) */
10900 if (*RExC_parse == '!')
10902 else if (*RExC_parse != '=')
10909 case '\'': /* (?'...') */
10910 name_start = RExC_parse;
10911 svname = reg_scan_name(pRExC_state,
10912 SIZE_ONLY /* reverse test from the others */
10913 ? REG_RSN_RETURN_NAME
10914 : REG_RSN_RETURN_NULL);
10915 if ( RExC_parse == name_start
10916 || RExC_parse >= RExC_end
10917 || *RExC_parse != paren)
10919 vFAIL2("Sequence (?%c... not terminated",
10920 paren=='>' ? '<' : paren);
10925 if (!svname) /* shouldn't happen */
10927 "panic: reg_scan_name returned NULL");
10928 if (!RExC_paren_names) {
10929 RExC_paren_names= newHV();
10930 sv_2mortal(MUTABLE_SV(RExC_paren_names));
10932 RExC_paren_name_list= newAV();
10933 sv_2mortal(MUTABLE_SV(RExC_paren_name_list));
10936 he_str = hv_fetch_ent( RExC_paren_names, svname, 1, 0 );
10938 sv_dat = HeVAL(he_str);
10940 /* croak baby croak */
10942 "panic: paren_name hash element allocation failed");
10943 } else if ( SvPOK(sv_dat) ) {
10944 /* (?|...) can mean we have dupes so scan to check
10945 its already been stored. Maybe a flag indicating
10946 we are inside such a construct would be useful,
10947 but the arrays are likely to be quite small, so
10948 for now we punt -- dmq */
10949 IV count = SvIV(sv_dat);
10950 I32 *pv = (I32*)SvPVX(sv_dat);
10952 for ( i = 0 ; i < count ; i++ ) {
10953 if ( pv[i] == RExC_npar ) {
10959 pv = (I32*)SvGROW(sv_dat,
10960 SvCUR(sv_dat) + sizeof(I32)+1);
10961 SvCUR_set(sv_dat, SvCUR(sv_dat) + sizeof(I32));
10962 pv[count] = RExC_npar;
10963 SvIV_set(sv_dat, SvIVX(sv_dat) + 1);
10966 (void)SvUPGRADE(sv_dat,SVt_PVNV);
10967 sv_setpvn(sv_dat, (char *)&(RExC_npar),
10970 SvIV_set(sv_dat, 1);
10973 /* Yes this does cause a memory leak in debugging Perls
10975 if (!av_store(RExC_paren_name_list,
10976 RExC_npar, SvREFCNT_inc(svname)))
10977 SvREFCNT_dec_NN(svname);
10980 /*sv_dump(sv_dat);*/
10982 nextchar(pRExC_state);
10984 goto capturing_parens;
10986 RExC_seen |= REG_LOOKBEHIND_SEEN;
10987 RExC_in_lookbehind++;
10989 if (RExC_parse >= RExC_end) {
10990 vFAIL("Sequence (?... not terminated");
10994 case '=': /* (?=...) */
10995 RExC_seen_zerolen++;
10997 case '!': /* (?!...) */
10998 RExC_seen_zerolen++;
10999 /* check if we're really just a "FAIL" assertion */
11000 skip_to_be_ignored_text(pRExC_state, &RExC_parse,
11001 FALSE /* Don't force to /x */ );
11002 if (*RExC_parse == ')') {
11003 ret=reganode(pRExC_state, OPFAIL, 0);
11004 nextchar(pRExC_state);
11008 case '|': /* (?|...) */
11009 /* branch reset, behave like a (?:...) except that
11010 buffers in alternations share the same numbers */
11012 after_freeze = freeze_paren = RExC_npar;
11014 case ':': /* (?:...) */
11015 case '>': /* (?>...) */
11017 case '$': /* (?$...) */
11018 case '@': /* (?@...) */
11019 vFAIL2("Sequence (?%c...) not implemented", (int)paren);
11021 case '0' : /* (?0) */
11022 case 'R' : /* (?R) */
11023 if (RExC_parse == RExC_end || *RExC_parse != ')')
11024 FAIL("Sequence (?R) not terminated");
11026 RExC_seen |= REG_RECURSE_SEEN;
11027 *flagp |= POSTPONED;
11028 goto gen_recurse_regop;
11030 /* named and numeric backreferences */
11031 case '&': /* (?&NAME) */
11032 parse_start = RExC_parse - 1;
11035 SV *sv_dat = reg_scan_name(pRExC_state,
11036 SIZE_ONLY ? REG_RSN_RETURN_NULL : REG_RSN_RETURN_DATA);
11037 num = sv_dat ? *((I32 *)SvPVX(sv_dat)) : 0;
11039 if (RExC_parse >= RExC_end || *RExC_parse != ')')
11040 vFAIL("Sequence (?&... not terminated");
11041 goto gen_recurse_regop;
11044 if (!(RExC_parse[0] >= '1' && RExC_parse[0] <= '9')) {
11046 vFAIL("Illegal pattern");
11048 goto parse_recursion;
11050 case '-': /* (?-1) */
11051 if (!(RExC_parse[0] >= '1' && RExC_parse[0] <= '9')) {
11052 RExC_parse--; /* rewind to let it be handled later */
11056 case '1': case '2': case '3': case '4': /* (?1) */
11057 case '5': case '6': case '7': case '8': case '9':
11058 RExC_parse = (char *) seqstart + 1; /* Point to the digit */
11061 bool is_neg = FALSE;
11063 parse_start = RExC_parse - 1; /* MJD */
11064 if (*RExC_parse == '-') {
11068 if (grok_atoUV(RExC_parse, &unum, &endptr)
11072 RExC_parse = (char*)endptr;
11076 /* Some limit for num? */
11080 if (*RExC_parse!=')')
11081 vFAIL("Expecting close bracket");
11084 if ( paren == '-' ) {
11086 Diagram of capture buffer numbering.
11087 Top line is the normal capture buffer numbers
11088 Bottom line is the negative indexing as from
11092 /(a(x)y)(a(b(c(?-2)d)e)f)(g(h))/
11096 num = RExC_npar + num;
11099 vFAIL("Reference to nonexistent group");
11101 } else if ( paren == '+' ) {
11102 num = RExC_npar + num - 1;
11104 /* We keep track how many GOSUB items we have produced.
11105 To start off the ARG2L() of the GOSUB holds its "id",
11106 which is used later in conjunction with RExC_recurse
11107 to calculate the offset we need to jump for the GOSUB,
11108 which it will store in the final representation.
11109 We have to defer the actual calculation until much later
11110 as the regop may move.
11113 ret = reg2Lanode(pRExC_state, GOSUB, num, RExC_recurse_count);
11115 if (num > (I32)RExC_rx->nparens) {
11117 vFAIL("Reference to nonexistent group");
11119 RExC_recurse_count++;
11120 DEBUG_OPTIMISE_MORE_r(Perl_re_printf( aTHX_
11121 "%*s%*s Recurse #%" UVuf " to %" IVdf "\n",
11122 22, "| |", (int)(depth * 2 + 1), "",
11123 (UV)ARG(ret), (IV)ARG2L(ret)));
11125 RExC_seen |= REG_RECURSE_SEEN;
11127 Set_Node_Length(ret, 1 + regarglen[OP(ret)]); /* MJD */
11128 Set_Node_Offset(ret, parse_start); /* MJD */
11130 *flagp |= POSTPONED;
11131 assert(*RExC_parse == ')');
11132 nextchar(pRExC_state);
11137 case '?': /* (??...) */
11139 if (*RExC_parse != '{') {
11140 RExC_parse += SKIP_IF_CHAR(RExC_parse);
11141 /* diag_listed_as: Sequence (?%s...) not recognized in regex; marked by <-- HERE in m/%s/ */
11143 "Sequence (%" UTF8f "...) not recognized",
11144 UTF8fARG(UTF, RExC_parse-seqstart, seqstart));
11145 NOT_REACHED; /*NOTREACHED*/
11147 *flagp |= POSTPONED;
11151 case '{': /* (?{...}) */
11154 struct reg_code_block *cb;
11156 RExC_seen_zerolen++;
11158 if ( !pRExC_state->code_blocks
11159 || pRExC_state->code_index
11160 >= pRExC_state->code_blocks->count
11161 || pRExC_state->code_blocks->cb[pRExC_state->code_index].start
11162 != (STRLEN)((RExC_parse -3 - (is_logical ? 1 : 0))
11165 if (RExC_pm_flags & PMf_USE_RE_EVAL)
11166 FAIL("panic: Sequence (?{...}): no code block found\n");
11167 FAIL("Eval-group not allowed at runtime, use re 'eval'");
11169 /* this is a pre-compiled code block (?{...}) */
11170 cb = &pRExC_state->code_blocks->cb[pRExC_state->code_index];
11171 RExC_parse = RExC_start + cb->end;
11174 if (cb->src_regex) {
11175 n = add_data(pRExC_state, STR_WITH_LEN("rl"));
11176 RExC_rxi->data->data[n] =
11177 (void*)SvREFCNT_inc((SV*)cb->src_regex);
11178 RExC_rxi->data->data[n+1] = (void*)o;
11181 n = add_data(pRExC_state,
11182 (RExC_pm_flags & PMf_HAS_CV) ? "L" : "l", 1);
11183 RExC_rxi->data->data[n] = (void*)o;
11186 pRExC_state->code_index++;
11187 nextchar(pRExC_state);
11191 ret = reg_node(pRExC_state, LOGICAL);
11193 eval = reg2Lanode(pRExC_state, EVAL,
11196 /* for later propagation into (??{})
11198 RExC_flags & RXf_PMf_COMPILETIME
11203 REGTAIL(pRExC_state, ret, eval);
11204 /* deal with the length of this later - MJD */
11207 ret = reg2Lanode(pRExC_state, EVAL, n, 0);
11208 Set_Node_Length(ret, RExC_parse - parse_start + 1);
11209 Set_Node_Offset(ret, parse_start);
11212 case '(': /* (?(?{...})...) and (?(?=...)...) */
11215 const int DEFINE_len = sizeof("DEFINE") - 1;
11216 if (RExC_parse[0] == '?') { /* (?(?...)) */
11217 if ( RExC_parse < RExC_end - 1
11218 && ( RExC_parse[1] == '='
11219 || RExC_parse[1] == '!'
11220 || RExC_parse[1] == '<'
11221 || RExC_parse[1] == '{')
11222 ) { /* Lookahead or eval. */
11226 ret = reg_node(pRExC_state, LOGICAL);
11230 tail = reg(pRExC_state, 1, &flag, depth+1);
11231 if (flag & (RESTART_PASS1|NEED_UTF8)) {
11232 *flagp = flag & (RESTART_PASS1|NEED_UTF8);
11235 REGTAIL(pRExC_state, ret, tail);
11238 /* Fall through to ‘Unknown switch condition’ at the
11239 end of the if/else chain. */
11241 else if ( RExC_parse[0] == '<' /* (?(<NAME>)...) */
11242 || RExC_parse[0] == '\'' ) /* (?('NAME')...) */
11244 char ch = RExC_parse[0] == '<' ? '>' : '\'';
11245 char *name_start= RExC_parse++;
11247 SV *sv_dat=reg_scan_name(pRExC_state,
11248 SIZE_ONLY ? REG_RSN_RETURN_NULL : REG_RSN_RETURN_DATA);
11249 if ( RExC_parse == name_start
11250 || RExC_parse >= RExC_end
11251 || *RExC_parse != ch)
11253 vFAIL2("Sequence (?(%c... not terminated",
11254 (ch == '>' ? '<' : ch));
11258 num = add_data( pRExC_state, STR_WITH_LEN("S"));
11259 RExC_rxi->data->data[num]=(void*)sv_dat;
11260 SvREFCNT_inc_simple_void(sv_dat);
11262 ret = reganode(pRExC_state,NGROUPP,num);
11263 goto insert_if_check_paren;
11265 else if (memBEGINs(RExC_parse,
11266 (STRLEN) (RExC_end - RExC_parse),
11269 ret = reganode(pRExC_state,DEFINEP,0);
11270 RExC_parse += DEFINE_len;
11272 goto insert_if_check_paren;
11274 else if (RExC_parse[0] == 'R') {
11276 /* parno == 0 => /(?(R)YES|NO)/ "in any form of recursion OR eval"
11277 * parno == 1 => /(?(R0)YES|NO)/ "in GOSUB (?0) / (?R)"
11278 * parno == 2 => /(?(R1)YES|NO)/ "in GOSUB (?1) (parno-1)"
11281 if (RExC_parse[0] == '0') {
11285 else if (RExC_parse[0] >= '1' && RExC_parse[0] <= '9' ) {
11287 if (grok_atoUV(RExC_parse, &uv, &endptr)
11290 parno = (I32)uv + 1;
11291 RExC_parse = (char*)endptr;
11293 /* else "Switch condition not recognized" below */
11294 } else if (RExC_parse[0] == '&') {
11297 sv_dat = reg_scan_name(pRExC_state,
11299 ? REG_RSN_RETURN_NULL
11300 : REG_RSN_RETURN_DATA);
11302 /* we should only have a false sv_dat when
11303 * SIZE_ONLY is true, and we always have false
11304 * sv_dat when SIZE_ONLY is true.
11305 * reg_scan_name() will VFAIL() if the name is
11306 * unknown when SIZE_ONLY is false, and otherwise
11307 * will return something, and when SIZE_ONLY is
11308 * true, reg_scan_name() just parses the string,
11309 * and doesnt return anything. (in theory) */
11310 assert(SIZE_ONLY ? !sv_dat : !!sv_dat);
11313 parno = 1 + *((I32 *)SvPVX(sv_dat));
11315 ret = reganode(pRExC_state,INSUBP,parno);
11316 goto insert_if_check_paren;
11318 else if (RExC_parse[0] >= '1' && RExC_parse[0] <= '9' ) {
11322 if (grok_atoUV(RExC_parse, &uv, &endptr)
11326 RExC_parse = (char*)endptr;
11329 vFAIL("panic: grok_atoUV returned FALSE");
11331 ret = reganode(pRExC_state, GROUPP, parno);
11333 insert_if_check_paren:
11334 if (UCHARAT(RExC_parse) != ')') {
11335 RExC_parse += UTF ? UTF8SKIP(RExC_parse) : 1;
11336 vFAIL("Switch condition not recognized");
11338 nextchar(pRExC_state);
11340 REGTAIL(pRExC_state, ret, reganode(pRExC_state, IFTHEN, 0));
11341 br = regbranch(pRExC_state, &flags, 1,depth+1);
11343 if (flags & (RESTART_PASS1|NEED_UTF8)) {
11344 *flagp = flags & (RESTART_PASS1|NEED_UTF8);
11347 FAIL2("panic: regbranch returned NULL, flags=%#" UVxf,
11350 REGTAIL(pRExC_state, br, reganode(pRExC_state,
11352 c = UCHARAT(RExC_parse);
11353 nextchar(pRExC_state);
11354 if (flags&HASWIDTH)
11355 *flagp |= HASWIDTH;
11358 vFAIL("(?(DEFINE)....) does not allow branches");
11360 /* Fake one for optimizer. */
11361 lastbr = reganode(pRExC_state, IFTHEN, 0);
11363 if (!regbranch(pRExC_state, &flags, 1,depth+1)) {
11364 if (flags & (RESTART_PASS1|NEED_UTF8)) {
11365 *flagp = flags & (RESTART_PASS1|NEED_UTF8);
11368 FAIL2("panic: regbranch returned NULL, flags=%#" UVxf,
11371 REGTAIL(pRExC_state, ret, lastbr);
11372 if (flags&HASWIDTH)
11373 *flagp |= HASWIDTH;
11374 c = UCHARAT(RExC_parse);
11375 nextchar(pRExC_state);
11380 if (RExC_parse >= RExC_end)
11381 vFAIL("Switch (?(condition)... not terminated");
11383 vFAIL("Switch (?(condition)... contains too many branches");
11385 ender = reg_node(pRExC_state, TAIL);
11386 REGTAIL(pRExC_state, br, ender);
11388 REGTAIL(pRExC_state, lastbr, ender);
11389 REGTAIL(pRExC_state, NEXTOPER(NEXTOPER(lastbr)), ender);
11392 REGTAIL(pRExC_state, ret, ender);
11393 RExC_size++; /* XXX WHY do we need this?!!
11394 For large programs it seems to be required
11395 but I can't figure out why. -- dmq*/
11398 RExC_parse += UTF ? UTF8SKIP(RExC_parse) : 1;
11399 vFAIL("Unknown switch condition (?(...))");
11401 case '[': /* (?[ ... ]) */
11402 return handle_regex_sets(pRExC_state, NULL, flagp, depth+1,
11404 case 0: /* A NUL */
11405 RExC_parse--; /* for vFAIL to print correctly */
11406 vFAIL("Sequence (? incomplete");
11408 default: /* e.g., (?i) */
11409 RExC_parse = (char *) seqstart + 1;
11411 parse_lparen_question_flags(pRExC_state);
11412 if (UCHARAT(RExC_parse) != ':') {
11413 if (RExC_parse < RExC_end)
11414 nextchar(pRExC_state);
11419 nextchar(pRExC_state);
11424 else if (!(RExC_flags & RXf_PMf_NOCAPTURE)) { /* (...) */
11429 ret = reganode(pRExC_state, OPEN, parno);
11431 if (!RExC_nestroot)
11432 RExC_nestroot = parno;
11433 if (RExC_open_parens && !RExC_open_parens[parno])
11435 DEBUG_OPTIMISE_MORE_r(Perl_re_printf( aTHX_
11436 "%*s%*s Setting open paren #%" IVdf " to %d\n",
11437 22, "| |", (int)(depth * 2 + 1), "",
11438 (IV)parno, REG_NODE_NUM(ret)));
11439 RExC_open_parens[parno]= ret;
11442 Set_Node_Length(ret, 1); /* MJD */
11443 Set_Node_Offset(ret, RExC_parse); /* MJD */
11446 /* with RXf_PMf_NOCAPTURE treat (...) as (?:...) */
11455 /* Pick up the branches, linking them together. */
11456 parse_start = RExC_parse; /* MJD */
11457 br = regbranch(pRExC_state, &flags, 1,depth+1);
11459 /* branch_len = (paren != 0); */
11462 if (flags & (RESTART_PASS1|NEED_UTF8)) {
11463 *flagp = flags & (RESTART_PASS1|NEED_UTF8);
11466 FAIL2("panic: regbranch returned NULL, flags=%#" UVxf, (UV) flags);
11468 if (*RExC_parse == '|') {
11469 if (!SIZE_ONLY && RExC_extralen) {
11470 reginsert(pRExC_state, BRANCHJ, br, depth+1);
11473 reginsert(pRExC_state, BRANCH, br, depth+1);
11474 Set_Node_Length(br, paren != 0);
11475 Set_Node_Offset_To_R(br-RExC_emit_start, parse_start-RExC_start);
11479 RExC_extralen += 1; /* For BRANCHJ-BRANCH. */
11481 else if (paren == ':') {
11482 *flagp |= flags&SIMPLE;
11484 if (is_open) { /* Starts with OPEN. */
11485 REGTAIL(pRExC_state, ret, br); /* OPEN -> first. */
11487 else if (paren != '?') /* Not Conditional */
11489 *flagp |= flags & (SPSTART | HASWIDTH | POSTPONED);
11491 while (*RExC_parse == '|') {
11492 if (!SIZE_ONLY && RExC_extralen) {
11493 ender = reganode(pRExC_state, LONGJMP,0);
11495 /* Append to the previous. */
11496 REGTAIL(pRExC_state, NEXTOPER(NEXTOPER(lastbr)), ender);
11499 RExC_extralen += 2; /* Account for LONGJMP. */
11500 nextchar(pRExC_state);
11501 if (freeze_paren) {
11502 if (RExC_npar > after_freeze)
11503 after_freeze = RExC_npar;
11504 RExC_npar = freeze_paren;
11506 br = regbranch(pRExC_state, &flags, 0, depth+1);
11509 if (flags & (RESTART_PASS1|NEED_UTF8)) {
11510 *flagp = flags & (RESTART_PASS1|NEED_UTF8);
11513 FAIL2("panic: regbranch returned NULL, flags=%#" UVxf, (UV) flags);
11515 REGTAIL(pRExC_state, lastbr, br); /* BRANCH -> BRANCH. */
11517 *flagp |= flags & (SPSTART | HASWIDTH | POSTPONED);
11520 if (have_branch || paren != ':') {
11521 /* Make a closing node, and hook it on the end. */
11524 ender = reg_node(pRExC_state, TAIL);
11527 ender = reganode(pRExC_state, CLOSE, parno);
11528 if ( RExC_close_parens ) {
11529 DEBUG_OPTIMISE_MORE_r(Perl_re_printf( aTHX_
11530 "%*s%*s Setting close paren #%" IVdf " to %d\n",
11531 22, "| |", (int)(depth * 2 + 1), "", (IV)parno, REG_NODE_NUM(ender)));
11532 RExC_close_parens[parno]= ender;
11533 if (RExC_nestroot == parno)
11536 Set_Node_Offset(ender,RExC_parse+1); /* MJD */
11537 Set_Node_Length(ender,1); /* MJD */
11540 ender = reg_node(pRExC_state, SRCLOSE);
11541 RExC_in_script_run = 0;
11547 *flagp &= ~HASWIDTH;
11550 ender = reg_node(pRExC_state, SUCCEED);
11553 ender = reg_node(pRExC_state, END);
11555 assert(!RExC_end_op); /* there can only be one! */
11556 RExC_end_op = ender;
11557 if (RExC_close_parens) {
11558 DEBUG_OPTIMISE_MORE_r(Perl_re_printf( aTHX_
11559 "%*s%*s Setting close paren #0 (END) to %d\n",
11560 22, "| |", (int)(depth * 2 + 1), "", REG_NODE_NUM(ender)));
11562 RExC_close_parens[0]= ender;
11567 DEBUG_PARSE_r(if (!SIZE_ONLY) {
11568 DEBUG_PARSE_MSG("lsbr");
11569 regprop(RExC_rx, RExC_mysv1, lastbr, NULL, pRExC_state);
11570 regprop(RExC_rx, RExC_mysv2, ender, NULL, pRExC_state);
11571 Perl_re_printf( aTHX_ "~ tying lastbr %s (%" IVdf ") to ender %s (%" IVdf ") offset %" IVdf "\n",
11572 SvPV_nolen_const(RExC_mysv1),
11573 (IV)REG_NODE_NUM(lastbr),
11574 SvPV_nolen_const(RExC_mysv2),
11575 (IV)REG_NODE_NUM(ender),
11576 (IV)(ender - lastbr)
11579 REGTAIL(pRExC_state, lastbr, ender);
11581 if (have_branch && !SIZE_ONLY) {
11582 char is_nothing= 1;
11584 RExC_seen |= REG_TOP_LEVEL_BRANCHES_SEEN;
11586 /* Hook the tails of the branches to the closing node. */
11587 for (br = ret; br; br = regnext(br)) {
11588 const U8 op = PL_regkind[OP(br)];
11589 if (op == BRANCH) {
11590 REGTAIL_STUDY(pRExC_state, NEXTOPER(br), ender);
11591 if ( OP(NEXTOPER(br)) != NOTHING
11592 || regnext(NEXTOPER(br)) != ender)
11595 else if (op == BRANCHJ) {
11596 REGTAIL_STUDY(pRExC_state, NEXTOPER(NEXTOPER(br)), ender);
11597 /* for now we always disable this optimisation * /
11598 if ( OP(NEXTOPER(NEXTOPER(br))) != NOTHING
11599 || regnext(NEXTOPER(NEXTOPER(br))) != ender)
11605 br= PL_regkind[OP(ret)] != BRANCH ? regnext(ret) : ret;
11606 DEBUG_PARSE_r(if (!SIZE_ONLY) {
11607 DEBUG_PARSE_MSG("NADA");
11608 regprop(RExC_rx, RExC_mysv1, ret, NULL, pRExC_state);
11609 regprop(RExC_rx, RExC_mysv2, ender, NULL, pRExC_state);
11610 Perl_re_printf( aTHX_ "~ converting ret %s (%" IVdf ") to ender %s (%" IVdf ") offset %" IVdf "\n",
11611 SvPV_nolen_const(RExC_mysv1),
11612 (IV)REG_NODE_NUM(ret),
11613 SvPV_nolen_const(RExC_mysv2),
11614 (IV)REG_NODE_NUM(ender),
11619 if (OP(ender) == TAIL) {
11624 for ( opt= br + 1; opt < ender ; opt++ )
11625 OP(opt)= OPTIMIZED;
11626 NEXT_OFF(br)= ender - br;
11634 static const char parens[] = "=!<,>";
11636 if (paren && (p = strchr(parens, paren))) {
11637 U8 node = ((p - parens) % 2) ? UNLESSM : IFMATCH;
11638 int flag = (p - parens) > 1;
11641 node = SUSPEND, flag = 0;
11642 reginsert(pRExC_state, node,ret, depth+1);
11643 Set_Node_Cur_Length(ret, parse_start);
11644 Set_Node_Offset(ret, parse_start + 1);
11646 REGTAIL_STUDY(pRExC_state, ret, reg_node(pRExC_state, TAIL));
11650 /* Check for proper termination. */
11652 /* restore original flags, but keep (?p) and, if we've changed from /d
11653 * rules to /u, keep the /u */
11654 RExC_flags = oregflags | (RExC_flags & RXf_PMf_KEEPCOPY);
11655 if (DEPENDS_SEMANTICS && RExC_uni_semantics) {
11656 set_regex_charset(&RExC_flags, REGEX_UNICODE_CHARSET);
11658 if (RExC_parse >= RExC_end || UCHARAT(RExC_parse) != ')') {
11659 RExC_parse = oregcomp_parse;
11660 vFAIL("Unmatched (");
11662 nextchar(pRExC_state);
11664 else if (!paren && RExC_parse < RExC_end) {
11665 if (*RExC_parse == ')') {
11667 vFAIL("Unmatched )");
11670 FAIL("Junk on end of regexp"); /* "Can't happen". */
11671 NOT_REACHED; /* NOTREACHED */
11674 if (RExC_in_lookbehind) {
11675 RExC_in_lookbehind--;
11677 if (after_freeze > RExC_npar)
11678 RExC_npar = after_freeze;
11683 - regbranch - one alternative of an | operator
11685 * Implements the concatenation operator.
11687 * Returns NULL, setting *flagp to RESTART_PASS1 if the sizing scan needs to be
11688 * restarted, or'd with NEED_UTF8 if the pattern needs to be upgraded to UTF-8
11691 S_regbranch(pTHX_ RExC_state_t *pRExC_state, I32 *flagp, I32 first, U32 depth)
11694 regnode *chain = NULL;
11696 I32 flags = 0, c = 0;
11697 GET_RE_DEBUG_FLAGS_DECL;
11699 PERL_ARGS_ASSERT_REGBRANCH;
11701 DEBUG_PARSE("brnc");
11706 if (!SIZE_ONLY && RExC_extralen)
11707 ret = reganode(pRExC_state, BRANCHJ,0);
11709 ret = reg_node(pRExC_state, BRANCH);
11710 Set_Node_Length(ret, 1);
11714 if (!first && SIZE_ONLY)
11715 RExC_extralen += 1; /* BRANCHJ */
11717 *flagp = WORST; /* Tentatively. */
11719 skip_to_be_ignored_text(pRExC_state, &RExC_parse,
11720 FALSE /* Don't force to /x */ );
11721 while (RExC_parse < RExC_end && *RExC_parse != '|' && *RExC_parse != ')') {
11722 flags &= ~TRYAGAIN;
11723 latest = regpiece(pRExC_state, &flags,depth+1);
11724 if (latest == NULL) {
11725 if (flags & TRYAGAIN)
11727 if (flags & (RESTART_PASS1|NEED_UTF8)) {
11728 *flagp = flags & (RESTART_PASS1|NEED_UTF8);
11731 FAIL2("panic: regpiece returned NULL, flags=%#" UVxf, (UV) flags);
11733 else if (ret == NULL)
11735 *flagp |= flags&(HASWIDTH|POSTPONED);
11736 if (chain == NULL) /* First piece. */
11737 *flagp |= flags&SPSTART;
11739 /* FIXME adding one for every branch after the first is probably
11740 * excessive now we have TRIE support. (hv) */
11742 REGTAIL(pRExC_state, chain, latest);
11747 if (chain == NULL) { /* Loop ran zero times. */
11748 chain = reg_node(pRExC_state, NOTHING);
11753 *flagp |= flags&SIMPLE;
11760 - regpiece - something followed by possible quantifier * + ? {n,m}
11762 * Note that the branching code sequences used for ? and the general cases
11763 * of * and + are somewhat optimized: they use the same NOTHING node as
11764 * both the endmarker for their branch list and the body of the last branch.
11765 * It might seem that this node could be dispensed with entirely, but the
11766 * endmarker role is not redundant.
11768 * Returns NULL, setting *flagp to TRYAGAIN if regatom() returns NULL with
11770 * Returns NULL, setting *flagp to RESTART_PASS1 if the sizing scan needs to be
11771 * restarted, or'd with NEED_UTF8 if the pattern needs to be upgraded to UTF-8
11774 S_regpiece(pTHX_ RExC_state_t *pRExC_state, I32 *flagp, U32 depth)
11780 const char * const origparse = RExC_parse;
11782 I32 max = REG_INFTY;
11783 #ifdef RE_TRACK_PATTERN_OFFSETS
11786 const char *maxpos = NULL;
11789 /* Save the original in case we change the emitted regop to a FAIL. */
11790 regnode * const orig_emit = RExC_emit;
11792 GET_RE_DEBUG_FLAGS_DECL;
11794 PERL_ARGS_ASSERT_REGPIECE;
11796 DEBUG_PARSE("piec");
11798 ret = regatom(pRExC_state, &flags,depth+1);
11800 if (flags & (TRYAGAIN|RESTART_PASS1|NEED_UTF8))
11801 *flagp |= flags & (TRYAGAIN|RESTART_PASS1|NEED_UTF8);
11803 FAIL2("panic: regatom returned NULL, flags=%#" UVxf, (UV) flags);
11809 if (op == '{' && regcurly(RExC_parse)) {
11811 #ifdef RE_TRACK_PATTERN_OFFSETS
11812 parse_start = RExC_parse; /* MJD */
11814 next = RExC_parse + 1;
11815 while (isDIGIT(*next) || *next == ',') {
11816 if (*next == ',') {
11824 if (*next == '}') { /* got one */
11825 const char* endptr;
11829 if (isDIGIT(*RExC_parse)) {
11830 if (!grok_atoUV(RExC_parse, &uv, &endptr))
11831 vFAIL("Invalid quantifier in {,}");
11832 if (uv >= REG_INFTY)
11833 vFAIL2("Quantifier in {,} bigger than %d", REG_INFTY - 1);
11838 if (*maxpos == ',')
11841 maxpos = RExC_parse;
11842 if (isDIGIT(*maxpos)) {
11843 if (!grok_atoUV(maxpos, &uv, &endptr))
11844 vFAIL("Invalid quantifier in {,}");
11845 if (uv >= REG_INFTY)
11846 vFAIL2("Quantifier in {,} bigger than %d", REG_INFTY - 1);
11849 max = REG_INFTY; /* meaning "infinity" */
11852 nextchar(pRExC_state);
11853 if (max < min) { /* If can't match, warn and optimize to fail
11855 reginsert(pRExC_state, OPFAIL, orig_emit, depth+1);
11857 ckWARNreg(RExC_parse, "Quantifier {n,m} with n > m can't match");
11858 NEXT_OFF(orig_emit)= regarglen[OPFAIL] + NODE_STEP_REGNODE;
11862 else if (min == max && *RExC_parse == '?')
11865 ckWARN2reg(RExC_parse + 1,
11866 "Useless use of greediness modifier '%c'",
11872 if ((flags&SIMPLE)) {
11873 if (min == 0 && max == REG_INFTY) {
11874 reginsert(pRExC_state, STAR, ret, depth+1);
11876 RExC_seen |= REG_UNBOUNDED_QUANTIFIER_SEEN;
11879 if (min == 1 && max == REG_INFTY) {
11880 reginsert(pRExC_state, PLUS, ret, depth+1);
11882 RExC_seen |= REG_UNBOUNDED_QUANTIFIER_SEEN;
11885 MARK_NAUGHTY_EXP(2, 2);
11886 reginsert(pRExC_state, CURLY, ret, depth+1);
11887 Set_Node_Offset(ret, parse_start+1); /* MJD */
11888 Set_Node_Cur_Length(ret, parse_start);
11891 regnode * const w = reg_node(pRExC_state, WHILEM);
11894 REGTAIL(pRExC_state, ret, w);
11895 if (!SIZE_ONLY && RExC_extralen) {
11896 reginsert(pRExC_state, LONGJMP,ret, depth+1);
11897 reginsert(pRExC_state, NOTHING,ret, depth+1);
11898 NEXT_OFF(ret) = 3; /* Go over LONGJMP. */
11900 reginsert(pRExC_state, CURLYX,ret, depth+1);
11902 Set_Node_Offset(ret, parse_start+1);
11903 Set_Node_Length(ret,
11904 op == '{' ? (RExC_parse - parse_start) : 1);
11906 if (!SIZE_ONLY && RExC_extralen)
11907 NEXT_OFF(ret) = 3; /* Go over NOTHING to LONGJMP. */
11908 REGTAIL(pRExC_state, ret, reg_node(pRExC_state, NOTHING));
11910 RExC_whilem_seen++, RExC_extralen += 3;
11911 MARK_NAUGHTY_EXP(1, 4); /* compound interest */
11918 *flagp |= HASWIDTH;
11920 ARG1_SET(ret, (U16)min);
11921 ARG2_SET(ret, (U16)max);
11923 if (max == REG_INFTY)
11924 RExC_seen |= REG_UNBOUNDED_QUANTIFIER_SEEN;
11930 if (!ISMULT1(op)) {
11935 #if 0 /* Now runtime fix should be reliable. */
11937 /* if this is reinstated, don't forget to put this back into perldiag:
11939 =item Regexp *+ operand could be empty at {#} in regex m/%s/
11941 (F) The part of the regexp subject to either the * or + quantifier
11942 could match an empty string. The {#} shows in the regular
11943 expression about where the problem was discovered.
11947 if (!(flags&HASWIDTH) && op != '?')
11948 vFAIL("Regexp *+ operand could be empty");
11951 #ifdef RE_TRACK_PATTERN_OFFSETS
11952 parse_start = RExC_parse;
11954 nextchar(pRExC_state);
11956 *flagp = (op != '+') ? (WORST|SPSTART|HASWIDTH) : (WORST|HASWIDTH);
11962 else if (op == '+') {
11966 else if (op == '?') {
11971 if (!SIZE_ONLY && !(flags&(HASWIDTH|POSTPONED)) && max > REG_INFTY/3) {
11972 SAVEFREESV(RExC_rx_sv); /* in case of fatal warnings */
11973 ckWARN2reg(RExC_parse,
11974 "%" UTF8f " matches null string many times",
11975 UTF8fARG(UTF, (RExC_parse >= origparse
11976 ? RExC_parse - origparse
11979 (void)ReREFCNT_inc(RExC_rx_sv);
11982 if (*RExC_parse == '?') {
11983 nextchar(pRExC_state);
11984 reginsert(pRExC_state, MINMOD, ret, depth+1);
11985 REGTAIL(pRExC_state, ret, ret + NODE_STEP_REGNODE);
11987 else if (*RExC_parse == '+') {
11989 nextchar(pRExC_state);
11990 ender = reg_node(pRExC_state, SUCCEED);
11991 REGTAIL(pRExC_state, ret, ender);
11992 reginsert(pRExC_state, SUSPEND, ret, depth+1);
11993 ender = reg_node(pRExC_state, TAIL);
11994 REGTAIL(pRExC_state, ret, ender);
11997 if (ISMULT2(RExC_parse)) {
11999 vFAIL("Nested quantifiers");
12006 S_grok_bslash_N(pTHX_ RExC_state_t *pRExC_state,
12015 /* This routine teases apart the various meanings of \N and returns
12016 * accordingly. The input parameters constrain which meaning(s) is/are valid
12017 * in the current context.
12019 * Exactly one of <node_p> and <code_point_p> must be non-NULL.
12021 * If <code_point_p> is not NULL, the context is expecting the result to be a
12022 * single code point. If this \N instance turns out to a single code point,
12023 * the function returns TRUE and sets *code_point_p to that code point.
12025 * If <node_p> is not NULL, the context is expecting the result to be one of
12026 * the things representable by a regnode. If this \N instance turns out to be
12027 * one such, the function generates the regnode, returns TRUE and sets *node_p
12028 * to point to that regnode.
12030 * If this instance of \N isn't legal in any context, this function will
12031 * generate a fatal error and not return.
12033 * On input, RExC_parse should point to the first char following the \N at the
12034 * time of the call. On successful return, RExC_parse will have been updated
12035 * to point to just after the sequence identified by this routine. Also
12036 * *flagp has been updated as needed.
12038 * When there is some problem with the current context and this \N instance,
12039 * the function returns FALSE, without advancing RExC_parse, nor setting
12040 * *node_p, nor *code_point_p, nor *flagp.
12042 * If <cp_count> is not NULL, the caller wants to know the length (in code
12043 * points) that this \N sequence matches. This is set even if the function
12044 * returns FALSE, as detailed below.
12046 * There are 5 possibilities here, as detailed in the next 5 paragraphs.
12048 * Probably the most common case is for the \N to specify a single code point.
12049 * *cp_count will be set to 1, and *code_point_p will be set to that code
12052 * Another possibility is for the input to be an empty \N{}, which for
12053 * backwards compatibility we accept. *cp_count will be set to 0. *node_p
12054 * will be set to a generated NOTHING node.
12056 * Still another possibility is for the \N to mean [^\n]. *cp_count will be
12057 * set to 0. *node_p will be set to a generated REG_ANY node.
12059 * The fourth possibility is that \N resolves to a sequence of more than one
12060 * code points. *cp_count will be set to the number of code points in the
12061 * sequence. *node_p * will be set to a generated node returned by this
12062 * function calling S_reg().
12064 * The final possibility is that it is premature to be calling this function;
12065 * that pass1 needs to be restarted. This can happen when this changes from
12066 * /d to /u rules, or when the pattern needs to be upgraded to UTF-8. The
12067 * latter occurs only when the fourth possibility would otherwise be in
12068 * effect, and is because one of those code points requires the pattern to be
12069 * recompiled as UTF-8. The function returns FALSE, and sets the
12070 * RESTART_PASS1 and NEED_UTF8 flags in *flagp, as appropriate. When this
12071 * happens, the caller needs to desist from continuing parsing, and return
12072 * this information to its caller. This is not set for when there is only one
12073 * code point, as this can be called as part of an ANYOF node, and they can
12074 * store above-Latin1 code points without the pattern having to be in UTF-8.
12076 * For non-single-quoted regexes, the tokenizer has resolved character and
12077 * sequence names inside \N{...} into their Unicode values, normalizing the
12078 * result into what we should see here: '\N{U+c1.c2...}', where c1... are the
12079 * hex-represented code points in the sequence. This is done there because
12080 * the names can vary based on what charnames pragma is in scope at the time,
12081 * so we need a way to take a snapshot of what they resolve to at the time of
12082 * the original parse. [perl #56444].
12084 * That parsing is skipped for single-quoted regexes, so we may here get
12085 * '\N{NAME}'. This is a fatal error. These names have to be resolved by the
12086 * parser. But if the single-quoted regex is something like '\N{U+41}', that
12087 * is legal and handled here. The code point is Unicode, and has to be
12088 * translated into the native character set for non-ASCII platforms.
12091 char * endbrace; /* points to '}' following the name */
12092 char *endchar; /* Points to '.' or '}' ending cur char in the input
12094 char* p = RExC_parse; /* Temporary */
12096 GET_RE_DEBUG_FLAGS_DECL;
12098 PERL_ARGS_ASSERT_GROK_BSLASH_N;
12100 GET_RE_DEBUG_FLAGS;
12102 assert(cBOOL(node_p) ^ cBOOL(code_point_p)); /* Exactly one should be set */
12103 assert(! (node_p && cp_count)); /* At most 1 should be set */
12105 if (cp_count) { /* Initialize return for the most common case */
12109 /* The [^\n] meaning of \N ignores spaces and comments under the /x
12110 * modifier. The other meanings do not, so use a temporary until we find
12111 * out which we are being called with */
12112 skip_to_be_ignored_text(pRExC_state, &p,
12113 FALSE /* Don't force to /x */ );
12115 /* Disambiguate between \N meaning a named character versus \N meaning
12116 * [^\n]. The latter is assumed when the {...} following the \N is a legal
12117 * quantifier, or there is no '{' at all */
12118 if (*p != '{' || regcurly(p)) {
12128 *node_p = reg_node(pRExC_state, REG_ANY);
12129 *flagp |= HASWIDTH|SIMPLE;
12131 Set_Node_Length(*node_p, 1); /* MJD */
12135 /* Here, we have decided it should be a named character or sequence */
12137 /* The test above made sure that the next real character is a '{', but
12138 * under the /x modifier, it could be separated by space (or a comment and
12139 * \n) and this is not allowed (for consistency with \x{...} and the
12140 * tokenizer handling of \N{NAME}). */
12141 if (*RExC_parse != '{') {
12142 vFAIL("Missing braces on \\N{}");
12145 RExC_parse++; /* Skip past the '{' */
12147 endbrace = (char *) memchr(RExC_parse, '}', RExC_end - RExC_parse);
12148 if (! endbrace) { /* no trailing brace */
12149 vFAIL2("Missing right brace on \\%c{}", 'N');
12151 else if (!( endbrace == RExC_parse /* nothing between the {} */
12152 || memBEGINs(RExC_parse, /* U+ (bad hex is checked below
12153 for a better error msg) */
12154 (STRLEN) (RExC_end - RExC_parse),
12157 RExC_parse = endbrace; /* position msg's '<--HERE' */
12158 vFAIL("\\N{NAME} must be resolved by the lexer");
12161 REQUIRE_UNI_RULES(flagp, FALSE); /* Unicode named chars imply Unicode
12164 if (endbrace == RExC_parse) { /* empty: \N{} */
12166 RExC_parse++; /* Position after the "}" */
12167 vFAIL("Zero length \\N{}");
12172 nextchar(pRExC_state);
12177 *node_p = reg_node(pRExC_state,NOTHING);
12181 RExC_parse += 2; /* Skip past the 'U+' */
12183 /* Because toke.c has generated a special construct for us guaranteed not
12184 * to have NULs, we can use a str function */
12185 endchar = RExC_parse + strcspn(RExC_parse, ".}");
12187 /* Code points are separated by dots. If none, there is only one code
12188 * point, and is terminated by the brace */
12190 if (endchar >= endbrace) {
12191 STRLEN length_of_hex;
12192 I32 grok_hex_flags;
12194 /* Here, exactly one code point. If that isn't what is wanted, fail */
12195 if (! code_point_p) {
12200 /* Convert code point from hex */
12201 length_of_hex = (STRLEN)(endchar - RExC_parse);
12202 grok_hex_flags = PERL_SCAN_ALLOW_UNDERSCORES
12203 | PERL_SCAN_DISALLOW_PREFIX
12205 /* No errors in the first pass (See [perl
12206 * #122671].) We let the code below find the
12207 * errors when there are multiple chars. */
12209 ? PERL_SCAN_SILENT_ILLDIGIT
12212 /* This routine is the one place where both single- and double-quotish
12213 * \N{U+xxxx} are evaluated. The value is a Unicode code point which
12214 * must be converted to native. */
12215 *code_point_p = UNI_TO_NATIVE(grok_hex(RExC_parse,
12220 /* The tokenizer should have guaranteed validity, but it's possible to
12221 * bypass it by using single quoting, so check. Don't do the check
12222 * here when there are multiple chars; we do it below anyway. */
12223 if (length_of_hex == 0
12224 || length_of_hex != (STRLEN)(endchar - RExC_parse) )
12226 RExC_parse += length_of_hex; /* Includes all the valid */
12227 RExC_parse += (RExC_orig_utf8) /* point to after 1st invalid */
12228 ? UTF8SKIP(RExC_parse)
12230 /* Guard against malformed utf8 */
12231 if (RExC_parse >= endchar) {
12232 RExC_parse = endchar;
12234 vFAIL("Invalid hexadecimal number in \\N{U+...}");
12237 RExC_parse = endbrace + 1;
12240 else { /* Is a multiple character sequence */
12241 SV * substitute_parse;
12243 char *orig_end = RExC_end;
12244 char *save_start = RExC_start;
12247 /* Count the code points, if desired, in the sequence */
12250 while (RExC_parse < endbrace) {
12251 /* Point to the beginning of the next character in the sequence. */
12252 RExC_parse = endchar + 1;
12253 endchar = RExC_parse + strcspn(RExC_parse, ".}");
12258 /* Fail if caller doesn't want to handle a multi-code-point sequence.
12259 * But don't backup up the pointer if the caller wants to know how many
12260 * code points there are (they can then handle things) */
12268 /* What is done here is to convert this to a sub-pattern of the form
12269 * \x{char1}\x{char2}... and then call reg recursively to parse it
12270 * (enclosing in "(?: ... )" ). That way, it retains its atomicness,
12271 * while not having to worry about special handling that some code
12272 * points may have. */
12274 substitute_parse = newSVpvs("?:");
12276 while (RExC_parse < endbrace) {
12278 /* Convert to notation the rest of the code understands */
12279 sv_catpv(substitute_parse, "\\x{");
12280 sv_catpvn(substitute_parse, RExC_parse, endchar - RExC_parse);
12281 sv_catpv(substitute_parse, "}");
12283 /* Point to the beginning of the next character in the sequence. */
12284 RExC_parse = endchar + 1;
12285 endchar = RExC_parse + strcspn(RExC_parse, ".}");
12288 sv_catpv(substitute_parse, ")");
12290 len = SvCUR(substitute_parse);
12292 /* Don't allow empty number */
12293 if (len < (STRLEN) 8) {
12294 RExC_parse = endbrace;
12295 vFAIL("Invalid hexadecimal number in \\N{U+...}");
12298 RExC_parse = RExC_start = RExC_adjusted_start
12299 = SvPV_nolen(substitute_parse);
12300 RExC_end = RExC_parse + len;
12302 /* The values are Unicode, and therefore not subject to recoding, but
12303 * have to be converted to native on a non-Unicode (meaning non-ASCII)
12306 RExC_recode_x_to_native = 1;
12309 *node_p = reg(pRExC_state, 1, &flags, depth+1);
12311 /* Restore the saved values */
12312 RExC_start = RExC_adjusted_start = save_start;
12313 RExC_parse = endbrace;
12314 RExC_end = orig_end;
12316 RExC_recode_x_to_native = 0;
12318 SvREFCNT_dec_NN(substitute_parse);
12321 if (flags & (RESTART_PASS1|NEED_UTF8)) {
12322 *flagp = flags & (RESTART_PASS1|NEED_UTF8);
12325 FAIL2("panic: reg returned NULL to grok_bslash_N, flags=%#" UVxf,
12328 *flagp |= flags&(HASWIDTH|SPSTART|SIMPLE|POSTPONED);
12330 nextchar(pRExC_state);
12337 PERL_STATIC_INLINE U8
12338 S_compute_EXACTish(RExC_state_t *pRExC_state)
12342 PERL_ARGS_ASSERT_COMPUTE_EXACTISH;
12350 op = get_regex_charset(RExC_flags);
12351 if (op >= REGEX_ASCII_RESTRICTED_CHARSET) {
12352 op--; /* /a is same as /u, and map /aa's offset to what /a's would have
12353 been, so there is no hole */
12356 return op + EXACTF;
12359 PERL_STATIC_INLINE void
12360 S_alloc_maybe_populate_EXACT(pTHX_ RExC_state_t *pRExC_state,
12361 regnode *node, I32* flagp, STRLEN len, UV code_point,
12364 /* This knows the details about sizing an EXACTish node, setting flags for
12365 * it (by setting <*flagp>, and potentially populating it with a single
12368 * If <len> (the length in bytes) is non-zero, this function assumes that
12369 * the node has already been populated, and just does the sizing. In this
12370 * case <code_point> should be the final code point that has already been
12371 * placed into the node. This value will be ignored except that under some
12372 * circumstances <*flagp> is set based on it.
12374 * If <len> is zero, the function assumes that the node is to contain only
12375 * the single character given by <code_point> and calculates what <len>
12376 * should be. In pass 1, it sizes the node appropriately. In pass 2, it
12377 * additionally will populate the node's STRING with <code_point> or its
12380 * In both cases <*flagp> is appropriately set
12382 * It knows that under FOLD, the Latin Sharp S and UTF characters above
12383 * 255, must be folded (the former only when the rules indicate it can
12386 * When it does the populating, it looks at the flag 'downgradable'. If
12387 * true with a node that folds, it checks if the single code point
12388 * participates in a fold, and if not downgrades the node to an EXACT.
12389 * This helps the optimizer */
12391 bool len_passed_in = cBOOL(len != 0);
12392 U8 character[UTF8_MAXBYTES_CASE+1];
12394 PERL_ARGS_ASSERT_ALLOC_MAYBE_POPULATE_EXACT;
12396 /* Don't bother to check for downgrading in PASS1, as it doesn't make any
12397 * sizing difference, and is extra work that is thrown away */
12398 if (downgradable && ! PASS2) {
12399 downgradable = FALSE;
12402 if (! len_passed_in) {
12404 if (UVCHR_IS_INVARIANT(code_point)) {
12405 if (LOC || ! FOLD) { /* /l defers folding until runtime */
12406 *character = (U8) code_point;
12408 else { /* Here is /i and not /l. (toFOLD() is defined on just
12409 ASCII, which isn't the same thing as INVARIANT on
12410 EBCDIC, but it works there, as the extra invariants
12411 fold to themselves) */
12412 *character = toFOLD((U8) code_point);
12414 /* We can downgrade to an EXACT node if this character
12415 * isn't a folding one. Note that this assumes that
12416 * nothing above Latin1 folds to some other invariant than
12417 * one of these alphabetics; otherwise we would also have
12419 * && (! HAS_NONLATIN1_FOLD_CLOSURE(code_point)
12420 * || ASCII_FOLD_RESTRICTED))
12422 if (downgradable && PL_fold[code_point] == code_point) {
12428 else if (FOLD && (! LOC
12429 || ! is_PROBLEMATIC_LOCALE_FOLD_cp(code_point)))
12430 { /* Folding, and ok to do so now */
12431 UV folded = _to_uni_fold_flags(
12435 FOLD_FLAGS_FULL | ((ASCII_FOLD_RESTRICTED)
12436 ? FOLD_FLAGS_NOMIX_ASCII
12439 && folded == code_point /* This quickly rules out many
12440 cases, avoiding the
12441 _invlist_contains_cp() overhead
12443 && ! _invlist_contains_cp(PL_utf8_foldable, code_point))
12450 else if (code_point <= MAX_UTF8_TWO_BYTE) {
12452 /* Not folding this cp, and can output it directly */
12453 *character = UTF8_TWO_BYTE_HI(code_point);
12454 *(character + 1) = UTF8_TWO_BYTE_LO(code_point);
12458 uvchr_to_utf8( character, code_point);
12459 len = UTF8SKIP(character);
12461 } /* Else pattern isn't UTF8. */
12463 *character = (U8) code_point;
12465 } /* Else is folded non-UTF8 */
12466 #if UNICODE_MAJOR_VERSION > 3 /* no multifolds in early Unicode */ \
12467 || (UNICODE_MAJOR_VERSION == 3 && ( UNICODE_DOT_VERSION > 0) \
12468 || UNICODE_DOT_DOT_VERSION > 0)
12469 else if (LIKELY(code_point != LATIN_SMALL_LETTER_SHARP_S)) {
12473 /* We don't fold any non-UTF8 except possibly the Sharp s (see
12474 * comments at join_exact()); */
12475 *character = (U8) code_point;
12478 /* Can turn into an EXACT node if we know the fold at compile time,
12479 * and it folds to itself and doesn't particpate in other folds */
12482 && PL_fold_latin1[code_point] == code_point
12483 && (! HAS_NONLATIN1_FOLD_CLOSURE(code_point)
12484 || (isASCII(code_point) && ASCII_FOLD_RESTRICTED)))
12488 } /* else is Sharp s. May need to fold it */
12489 else if (AT_LEAST_UNI_SEMANTICS && ! ASCII_FOLD_RESTRICTED) {
12491 *(character + 1) = 's';
12495 *character = LATIN_SMALL_LETTER_SHARP_S;
12501 RExC_size += STR_SZ(len);
12504 RExC_emit += STR_SZ(len);
12505 STR_LEN(node) = len;
12506 if (! len_passed_in) {
12507 Copy((char *) character, STRING(node), len, char);
12511 *flagp |= HASWIDTH;
12513 /* A single character node is SIMPLE, except for the special-cased SHARP S
12515 if ((len == 1 || (UTF && len == UVCHR_SKIP(code_point)))
12516 #if UNICODE_MAJOR_VERSION > 3 /* no multifolds in early Unicode */ \
12517 || (UNICODE_MAJOR_VERSION == 3 && ( UNICODE_DOT_VERSION > 0) \
12518 || UNICODE_DOT_DOT_VERSION > 0)
12519 && ( code_point != LATIN_SMALL_LETTER_SHARP_S
12520 || ! FOLD || ! DEPENDS_SEMANTICS)
12526 /* The OP may not be well defined in PASS1 */
12527 if (PASS2 && OP(node) == EXACTFL) {
12528 RExC_contains_locale = 1;
12533 S_new_regcurly(const char *s, const char *e)
12535 /* This is a temporary function designed to match the most lenient form of
12536 * a {m,n} quantifier we ever envision, with either number omitted, and
12537 * spaces anywhere between/before/after them.
12539 * If this function fails, then the string it matches is very unlikely to
12540 * ever be considered a valid quantifier, so we can allow the '{' that
12541 * begins it to be considered as a literal */
12543 bool has_min = FALSE;
12544 bool has_max = FALSE;
12546 PERL_ARGS_ASSERT_NEW_REGCURLY;
12548 if (s >= e || *s++ != '{')
12551 while (s < e && isSPACE(*s)) {
12554 while (s < e && isDIGIT(*s)) {
12558 while (s < e && isSPACE(*s)) {
12564 while (s < e && isSPACE(*s)) {
12567 while (s < e && isDIGIT(*s)) {
12571 while (s < e && isSPACE(*s)) {
12576 return s < e && *s == '}' && (has_min || has_max);
12579 /* Parse backref decimal value, unless it's too big to sensibly be a backref,
12580 * in which case return I32_MAX (rather than possibly 32-bit wrapping) */
12583 S_backref_value(char *p)
12585 const char* endptr;
12587 if (grok_atoUV(p, &val, &endptr) && val <= I32_MAX)
12594 - regatom - the lowest level
12596 Try to identify anything special at the start of the current parse position.
12597 If there is, then handle it as required. This may involve generating a
12598 single regop, such as for an assertion; or it may involve recursing, such as
12599 to handle a () structure.
12601 If the string doesn't start with something special then we gobble up
12602 as much literal text as we can. If we encounter a quantifier, we have to
12603 back off the final literal character, as that quantifier applies to just it
12604 and not to the whole string of literals.
12606 Once we have been able to handle whatever type of thing started the
12607 sequence, we return.
12609 Note: we have to be careful with escapes, as they can be both literal
12610 and special, and in the case of \10 and friends, context determines which.
12612 A summary of the code structure is:
12614 switch (first_byte) {
12615 cases for each special:
12616 handle this special;
12619 switch (2nd byte) {
12620 cases for each unambiguous special:
12621 handle this special;
12623 cases for each ambigous special/literal:
12625 if (special) handle here
12627 default: // unambiguously literal:
12630 default: // is a literal char
12633 create EXACTish node for literal;
12634 while (more input and node isn't full) {
12635 switch (input_byte) {
12636 cases for each special;
12637 make sure parse pointer is set so that the next call to
12638 regatom will see this special first
12639 goto loopdone; // EXACTish node terminated by prev. char
12641 append char to EXACTISH node;
12643 get next input byte;
12647 return the generated node;
12649 Specifically there are two separate switches for handling
12650 escape sequences, with the one for handling literal escapes requiring
12651 a dummy entry for all of the special escapes that are actually handled
12654 Returns NULL, setting *flagp to TRYAGAIN if reg() returns NULL with
12656 Returns NULL, setting *flagp to RESTART_PASS1 if the sizing scan needs to be
12657 restarted, or'd with NEED_UTF8 if the pattern needs to be upgraded to UTF-8
12658 Otherwise does not return NULL.
12662 S_regatom(pTHX_ RExC_state_t *pRExC_state, I32 *flagp, U32 depth)
12664 regnode *ret = NULL;
12671 GET_RE_DEBUG_FLAGS_DECL;
12673 *flagp = WORST; /* Tentatively. */
12675 DEBUG_PARSE("atom");
12677 PERL_ARGS_ASSERT_REGATOM;
12680 parse_start = RExC_parse;
12681 assert(RExC_parse < RExC_end);
12682 switch ((U8)*RExC_parse) {
12684 RExC_seen_zerolen++;
12685 nextchar(pRExC_state);
12686 if (RExC_flags & RXf_PMf_MULTILINE)
12687 ret = reg_node(pRExC_state, MBOL);
12689 ret = reg_node(pRExC_state, SBOL);
12690 Set_Node_Length(ret, 1); /* MJD */
12693 nextchar(pRExC_state);
12695 RExC_seen_zerolen++;
12696 if (RExC_flags & RXf_PMf_MULTILINE)
12697 ret = reg_node(pRExC_state, MEOL);
12699 ret = reg_node(pRExC_state, SEOL);
12700 Set_Node_Length(ret, 1); /* MJD */
12703 nextchar(pRExC_state);
12704 if (RExC_flags & RXf_PMf_SINGLELINE)
12705 ret = reg_node(pRExC_state, SANY);
12707 ret = reg_node(pRExC_state, REG_ANY);
12708 *flagp |= HASWIDTH|SIMPLE;
12710 Set_Node_Length(ret, 1); /* MJD */
12714 char * const oregcomp_parse = ++RExC_parse;
12715 ret = regclass(pRExC_state, flagp,depth+1,
12716 FALSE, /* means parse the whole char class */
12717 TRUE, /* allow multi-char folds */
12718 FALSE, /* don't silence non-portable warnings. */
12719 (bool) RExC_strict,
12720 TRUE, /* Allow an optimized regnode result */
12724 if (*flagp & (RESTART_PASS1|NEED_UTF8))
12726 FAIL2("panic: regclass returned NULL to regatom, flags=%#" UVxf,
12729 if (*RExC_parse != ']') {
12730 RExC_parse = oregcomp_parse;
12731 vFAIL("Unmatched [");
12733 nextchar(pRExC_state);
12734 Set_Node_Length(ret, RExC_parse - oregcomp_parse + 1); /* MJD */
12738 nextchar(pRExC_state);
12739 ret = reg(pRExC_state, 2, &flags,depth+1);
12741 if (flags & TRYAGAIN) {
12742 if (RExC_parse >= RExC_end) {
12743 /* Make parent create an empty node if needed. */
12744 *flagp |= TRYAGAIN;
12749 if (flags & (RESTART_PASS1|NEED_UTF8)) {
12750 *flagp = flags & (RESTART_PASS1|NEED_UTF8);
12753 FAIL2("panic: reg returned NULL to regatom, flags=%#" UVxf,
12756 *flagp |= flags&(HASWIDTH|SPSTART|SIMPLE|POSTPONED);
12760 if (flags & TRYAGAIN) {
12761 *flagp |= TRYAGAIN;
12764 vFAIL("Internal urp");
12765 /* Supposed to be caught earlier. */
12771 vFAIL("Quantifier follows nothing");
12776 This switch handles escape sequences that resolve to some kind
12777 of special regop and not to literal text. Escape sequnces that
12778 resolve to literal text are handled below in the switch marked
12781 Every entry in this switch *must* have a corresponding entry
12782 in the literal escape switch. However, the opposite is not
12783 required, as the default for this switch is to jump to the
12784 literal text handling code.
12787 switch ((U8)*RExC_parse) {
12788 /* Special Escapes */
12790 RExC_seen_zerolen++;
12791 ret = reg_node(pRExC_state, SBOL);
12792 /* SBOL is shared with /^/ so we set the flags so we can tell
12793 * /\A/ from /^/ in split. We check ret because first pass we
12794 * have no regop struct to set the flags on. */
12798 goto finish_meta_pat;
12800 ret = reg_node(pRExC_state, GPOS);
12801 RExC_seen |= REG_GPOS_SEEN;
12803 goto finish_meta_pat;
12805 RExC_seen_zerolen++;
12806 ret = reg_node(pRExC_state, KEEPS);
12808 /* XXX:dmq : disabling in-place substitution seems to
12809 * be necessary here to avoid cases of memory corruption, as
12810 * with: C<$_="x" x 80; s/x\K/y/> -- rgs
12812 RExC_seen |= REG_LOOKBEHIND_SEEN;
12813 goto finish_meta_pat;
12815 ret = reg_node(pRExC_state, SEOL);
12817 RExC_seen_zerolen++; /* Do not optimize RE away */
12818 goto finish_meta_pat;
12820 ret = reg_node(pRExC_state, EOS);
12822 RExC_seen_zerolen++; /* Do not optimize RE away */
12823 goto finish_meta_pat;
12825 vFAIL("\\C no longer supported");
12827 ret = reg_node(pRExC_state, CLUMP);
12828 *flagp |= HASWIDTH;
12829 goto finish_meta_pat;
12835 arg = ANYOF_WORDCHAR;
12843 regex_charset charset = get_regex_charset(RExC_flags);
12845 RExC_seen_zerolen++;
12846 RExC_seen |= REG_LOOKBEHIND_SEEN;
12847 op = BOUND + charset;
12849 if (op == BOUNDL) {
12850 RExC_contains_locale = 1;
12853 ret = reg_node(pRExC_state, op);
12855 if (RExC_parse >= RExC_end || *(RExC_parse + 1) != '{') {
12856 FLAGS(ret) = TRADITIONAL_BOUND;
12857 if (PASS2 && op > BOUNDA) { /* /aa is same as /a */
12863 char name = *RExC_parse;
12864 char * endbrace = NULL;
12866 if (RExC_parse < RExC_end) {
12867 endbrace = (char *) memchr(RExC_parse, '}', RExC_end - RExC_parse);
12871 vFAIL2("Missing right brace on \\%c{}", name);
12873 /* XXX Need to decide whether to take spaces or not. Should be
12874 * consistent with \p{}, but that currently is SPACE, which
12875 * means vertical too, which seems wrong
12876 * while (isBLANK(*RExC_parse)) {
12879 if (endbrace == RExC_parse) {
12880 RExC_parse++; /* After the '}' */
12881 vFAIL2("Empty \\%c{}", name);
12883 length = endbrace - RExC_parse;
12884 /*while (isBLANK(*(RExC_parse + length - 1))) {
12887 switch (*RExC_parse) {
12890 && (memNEs(RExC_parse + 1, length - 1, "cb")))
12892 goto bad_bound_type;
12894 FLAGS(ret) = GCB_BOUND;
12897 if (length != 2 || *(RExC_parse + 1) != 'b') {
12898 goto bad_bound_type;
12900 FLAGS(ret) = LB_BOUND;
12903 if (length != 2 || *(RExC_parse + 1) != 'b') {
12904 goto bad_bound_type;
12906 FLAGS(ret) = SB_BOUND;
12909 if (length != 2 || *(RExC_parse + 1) != 'b') {
12910 goto bad_bound_type;
12912 FLAGS(ret) = WB_BOUND;
12916 RExC_parse = endbrace;
12918 "'%" UTF8f "' is an unknown bound type",
12919 UTF8fARG(UTF, length, endbrace - length));
12920 NOT_REACHED; /*NOTREACHED*/
12922 RExC_parse = endbrace;
12923 REQUIRE_UNI_RULES(flagp, NULL);
12925 if (PASS2 && op >= BOUNDA) { /* /aa is same as /a */
12929 /* Don't have to worry about UTF-8, in this message because
12930 * to get here the contents of the \b must be ASCII */
12931 ckWARN4reg(RExC_parse + 1, /* Include the '}' in msg */
12932 "Using /u for '%.*s' instead of /%s",
12934 endbrace - length + 1,
12935 (charset == REGEX_ASCII_RESTRICTED_CHARSET)
12936 ? ASCII_RESTRICT_PAT_MODS
12937 : ASCII_MORE_RESTRICT_PAT_MODS);
12941 if (PASS2 && invert) {
12942 OP(ret) += NBOUND - BOUND;
12944 goto finish_meta_pat;
12952 if (! DEPENDS_SEMANTICS) {
12956 /* \d doesn't have any matches in the upper Latin1 range, hence /d
12957 * is equivalent to /u. Changing to /u saves some branches at
12960 goto join_posix_op_known;
12963 ret = reg_node(pRExC_state, LNBREAK);
12964 *flagp |= HASWIDTH|SIMPLE;
12965 goto finish_meta_pat;
12973 goto join_posix_op_known;
12979 arg = ANYOF_VERTWS;
12981 goto join_posix_op_known;
12991 op = POSIXD + get_regex_charset(RExC_flags);
12992 if (op > POSIXA) { /* /aa is same as /a */
12995 else if (op == POSIXL) {
12996 RExC_contains_locale = 1;
12999 join_posix_op_known:
13002 op += NPOSIXD - POSIXD;
13005 ret = reg_node(pRExC_state, op);
13007 FLAGS(ret) = namedclass_to_classnum(arg);
13010 *flagp |= HASWIDTH|SIMPLE;
13014 if ( UCHARAT(RExC_parse + 1) == '{'
13015 && UNLIKELY(! new_regcurly(RExC_parse + 1, RExC_end)))
13018 vFAIL("Unescaped left brace in regex is illegal here");
13020 nextchar(pRExC_state);
13021 Set_Node_Length(ret, 2); /* MJD */
13027 ret = regclass(pRExC_state, flagp,depth+1,
13028 TRUE, /* means just parse this element */
13029 FALSE, /* don't allow multi-char folds */
13030 FALSE, /* don't silence non-portable warnings. It
13031 would be a bug if these returned
13033 (bool) RExC_strict,
13034 TRUE, /* Allow an optimized regnode result */
13037 if (*flagp & RESTART_PASS1)
13039 /* regclass() can only return RESTART_PASS1 and NEED_UTF8 if
13040 * multi-char folds are allowed. */
13042 FAIL2("panic: regclass returned NULL to regatom, flags=%#" UVxf,
13047 Set_Node_Offset(ret, parse_start);
13048 Set_Node_Cur_Length(ret, parse_start - 2);
13049 nextchar(pRExC_state);
13052 /* Handle \N, \N{} and \N{NAMED SEQUENCE} (the latter meaning the
13053 * \N{...} evaluates to a sequence of more than one code points).
13054 * The function call below returns a regnode, which is our result.
13055 * The parameters cause it to fail if the \N{} evaluates to a
13056 * single code point; we handle those like any other literal. The
13057 * reason that the multicharacter case is handled here and not as
13058 * part of the EXACtish code is because of quantifiers. In
13059 * /\N{BLAH}+/, the '+' applies to the whole thing, and doing it
13060 * this way makes that Just Happen. dmq.
13061 * join_exact() will join this up with adjacent EXACTish nodes
13062 * later on, if appropriate. */
13064 if (grok_bslash_N(pRExC_state,
13065 &ret, /* Want a regnode returned */
13066 NULL, /* Fail if evaluates to a single code
13068 NULL, /* Don't need a count of how many code
13077 if (*flagp & RESTART_PASS1)
13080 /* Here, evaluates to a single code point. Go get that */
13081 RExC_parse = parse_start;
13084 case 'k': /* Handle \k<NAME> and \k'NAME' */
13088 if ( RExC_parse >= RExC_end - 1
13089 || (( ch = RExC_parse[1]) != '<'
13094 /* diag_listed_as: Sequence \%s... not terminated in regex; marked by <-- HERE in m/%s/ */
13095 vFAIL2("Sequence %.2s... not terminated",parse_start);
13098 ret = handle_named_backref(pRExC_state,
13110 case '1': case '2': case '3': case '4':
13111 case '5': case '6': case '7': case '8': case '9':
13116 if (*RExC_parse == 'g') {
13120 if (*RExC_parse == '{') {
13124 if (*RExC_parse == '-') {
13128 if (hasbrace && !isDIGIT(*RExC_parse)) {
13129 if (isrel) RExC_parse--;
13131 goto parse_named_seq;
13134 if (RExC_parse >= RExC_end) {
13135 goto unterminated_g;
13137 num = S_backref_value(RExC_parse);
13139 vFAIL("Reference to invalid group 0");
13140 else if (num == I32_MAX) {
13141 if (isDIGIT(*RExC_parse))
13142 vFAIL("Reference to nonexistent group");
13145 vFAIL("Unterminated \\g... pattern");
13149 num = RExC_npar - num;
13151 vFAIL("Reference to nonexistent or unclosed group");
13155 num = S_backref_value(RExC_parse);
13156 /* bare \NNN might be backref or octal - if it is larger
13157 * than or equal RExC_npar then it is assumed to be an
13158 * octal escape. Note RExC_npar is +1 from the actual
13159 * number of parens. */
13160 /* Note we do NOT check if num == I32_MAX here, as that is
13161 * handled by the RExC_npar check */
13164 /* any numeric escape < 10 is always a backref */
13166 /* any numeric escape < RExC_npar is a backref */
13167 && num >= RExC_npar
13168 /* cannot be an octal escape if it starts with 8 */
13169 && *RExC_parse != '8'
13170 /* cannot be an octal escape it it starts with 9 */
13171 && *RExC_parse != '9'
13174 /* Probably not a backref, instead likely to be an
13175 * octal character escape, e.g. \35 or \777.
13176 * The above logic should make it obvious why using
13177 * octal escapes in patterns is problematic. - Yves */
13178 RExC_parse = parse_start;
13183 /* At this point RExC_parse points at a numeric escape like
13184 * \12 or \88 or something similar, which we should NOT treat
13185 * as an octal escape. It may or may not be a valid backref
13186 * escape. For instance \88888888 is unlikely to be a valid
13188 while (isDIGIT(*RExC_parse))
13191 if (*RExC_parse != '}')
13192 vFAIL("Unterminated \\g{...} pattern");
13196 if (num > (I32)RExC_rx->nparens)
13197 vFAIL("Reference to nonexistent group");
13200 ret = reganode(pRExC_state,
13203 : (ASCII_FOLD_RESTRICTED)
13205 : (AT_LEAST_UNI_SEMANTICS)
13211 *flagp |= HASWIDTH;
13213 /* override incorrect value set in reganode MJD */
13214 Set_Node_Offset(ret, parse_start);
13215 Set_Node_Cur_Length(ret, parse_start-1);
13216 skip_to_be_ignored_text(pRExC_state, &RExC_parse,
13217 FALSE /* Don't force to /x */ );
13221 if (RExC_parse >= RExC_end)
13222 FAIL("Trailing \\");
13225 /* Do not generate "unrecognized" warnings here, we fall
13226 back into the quick-grab loop below */
13227 RExC_parse = parse_start;
13229 } /* end of switch on a \foo sequence */
13234 /* '#' comments should have been spaced over before this function was
13236 assert((RExC_flags & RXf_PMf_EXTENDED) == 0);
13238 if (RExC_flags & RXf_PMf_EXTENDED) {
13239 RExC_parse = reg_skipcomment( pRExC_state, RExC_parse );
13240 if (RExC_parse < RExC_end)
13250 /* Here, we have determined that the next thing is probably a
13251 * literal character. RExC_parse points to the first byte of its
13252 * definition. (It still may be an escape sequence that evaluates
13253 * to a single character) */
13259 #define MAX_NODE_STRING_SIZE 127
13260 char foldbuf[MAX_NODE_STRING_SIZE+UTF8_MAXBYTES_CASE];
13262 U8 upper_parse = MAX_NODE_STRING_SIZE;
13263 U8 node_type = compute_EXACTish(pRExC_state);
13264 bool next_is_quantifier;
13265 char * oldp = NULL;
13267 /* We can convert EXACTF nodes to EXACTFU if they contain only
13268 * characters that match identically regardless of the target
13269 * string's UTF8ness. The reason to do this is that EXACTF is not
13270 * trie-able, EXACTFU is.
13272 * Similarly, we can convert EXACTFL nodes to EXACTFLU8 if they
13273 * contain only above-Latin1 characters (hence must be in UTF8),
13274 * which don't participate in folds with Latin1-range characters,
13275 * as the latter's folds aren't known until runtime. (We don't
13276 * need to figure this out until pass 2) */
13277 bool maybe_exactfu = PASS2
13278 && (node_type == EXACTF || node_type == EXACTFL);
13280 /* If a folding node contains only code points that don't
13281 * participate in folds, it can be changed into an EXACT node,
13282 * which allows the optimizer more things to look for */
13285 ret = reg_node(pRExC_state, node_type);
13287 /* In pass1, folded, we use a temporary buffer instead of the
13288 * actual node, as the node doesn't exist yet */
13289 s = (SIZE_ONLY && FOLD) ? foldbuf : STRING(ret);
13295 /* We look for the EXACTFish to EXACT node optimizaton only if
13296 * folding. (And we don't need to figure this out until pass 2).
13297 * XXX It might actually make sense to split the node into portions
13298 * that are exact and ones that aren't, so that we could later use
13299 * the exact ones to find the longest fixed and floating strings.
13300 * One would want to join them back into a larger node. One could
13301 * use a pseudo regnode like 'EXACT_ORIG_FOLD' */
13302 maybe_exact = FOLD && PASS2;
13304 /* XXX The node can hold up to 255 bytes, yet this only goes to
13305 * 127. I (khw) do not know why. Keeping it somewhat less than
13306 * 255 allows us to not have to worry about overflow due to
13307 * converting to utf8 and fold expansion, but that value is
13308 * 255-UTF8_MAXBYTES_CASE. join_exact() may join adjacent nodes
13309 * split up by this limit into a single one using the real max of
13310 * 255. Even at 127, this breaks under rare circumstances. If
13311 * folding, we do not want to split a node at a character that is a
13312 * non-final in a multi-char fold, as an input string could just
13313 * happen to want to match across the node boundary. The join
13314 * would solve that problem if the join actually happens. But a
13315 * series of more than two nodes in a row each of 127 would cause
13316 * the first join to succeed to get to 254, but then there wouldn't
13317 * be room for the next one, which could at be one of those split
13318 * multi-char folds. I don't know of any fool-proof solution. One
13319 * could back off to end with only a code point that isn't such a
13320 * non-final, but it is possible for there not to be any in the
13323 assert( ! UTF /* Is at the beginning of a character */
13324 || UTF8_IS_INVARIANT(UCHARAT(RExC_parse))
13325 || UTF8_IS_START(UCHARAT(RExC_parse)));
13327 /* Here, we have a literal character. Find the maximal string of
13328 * them in the input that we can fit into a single EXACTish node.
13329 * We quit at the first non-literal or when the node gets full */
13330 for (p = RExC_parse;
13331 len < upper_parse && p < RExC_end;
13336 /* White space has already been ignored */
13337 assert( (RExC_flags & RXf_PMf_EXTENDED) == 0
13338 || ! is_PATWS_safe((p), RExC_end, UTF));
13350 /* Literal Escapes Switch
13352 This switch is meant to handle escape sequences that
13353 resolve to a literal character.
13355 Every escape sequence that represents something
13356 else, like an assertion or a char class, is handled
13357 in the switch marked 'Special Escapes' above in this
13358 routine, but also has an entry here as anything that
13359 isn't explicitly mentioned here will be treated as
13360 an unescaped equivalent literal.
13363 switch ((U8)*++p) {
13364 /* These are all the special escapes. */
13365 case 'A': /* Start assertion */
13366 case 'b': case 'B': /* Word-boundary assertion*/
13367 case 'C': /* Single char !DANGEROUS! */
13368 case 'd': case 'D': /* digit class */
13369 case 'g': case 'G': /* generic-backref, pos assertion */
13370 case 'h': case 'H': /* HORIZWS */
13371 case 'k': case 'K': /* named backref, keep marker */
13372 case 'p': case 'P': /* Unicode property */
13373 case 'R': /* LNBREAK */
13374 case 's': case 'S': /* space class */
13375 case 'v': case 'V': /* VERTWS */
13376 case 'w': case 'W': /* word class */
13377 case 'X': /* eXtended Unicode "combining
13378 character sequence" */
13379 case 'z': case 'Z': /* End of line/string assertion */
13383 /* Anything after here is an escape that resolves to a
13384 literal. (Except digits, which may or may not)
13390 case 'N': /* Handle a single-code point named character. */
13391 RExC_parse = p + 1;
13392 if (! grok_bslash_N(pRExC_state,
13393 NULL, /* Fail if evaluates to
13394 anything other than a
13395 single code point */
13396 &ender, /* The returned single code
13398 NULL, /* Don't need a count of
13399 how many code points */
13404 if (*flagp & NEED_UTF8)
13405 FAIL("panic: grok_bslash_N set NEED_UTF8");
13406 if (*flagp & RESTART_PASS1)
13409 /* Here, it wasn't a single code point. Go close
13410 * up this EXACTish node. The switch() prior to
13411 * this switch handles the other cases */
13412 RExC_parse = p = oldp;
13416 RExC_parse = parse_start;
13417 if (ender > 0xff) {
13418 REQUIRE_UTF8(flagp);
13434 ender = ESC_NATIVE;
13444 const char* error_msg;
13446 bool valid = grok_bslash_o(&p,
13450 PASS2, /* out warnings */
13451 (bool) RExC_strict,
13452 TRUE, /* Output warnings
13457 RExC_parse = p; /* going to die anyway; point
13458 to exact spot of failure */
13462 if (ender > 0xff) {
13463 REQUIRE_UTF8(flagp);
13469 UV result = UV_MAX; /* initialize to erroneous
13471 const char* error_msg;
13473 bool valid = grok_bslash_x(&p,
13477 PASS2, /* out warnings */
13478 (bool) RExC_strict,
13479 TRUE, /* Silence warnings
13484 RExC_parse = p; /* going to die anyway; point
13485 to exact spot of failure */
13490 if (ender < 0x100) {
13492 if (RExC_recode_x_to_native) {
13493 ender = LATIN1_TO_NATIVE(ender);
13498 REQUIRE_UTF8(flagp);
13504 ender = grok_bslash_c(*p++, PASS2);
13506 case '8': case '9': /* must be a backreference */
13508 /* we have an escape like \8 which cannot be an octal escape
13509 * so we exit the loop, and let the outer loop handle this
13510 * escape which may or may not be a legitimate backref. */
13512 case '1': case '2': case '3':case '4':
13513 case '5': case '6': case '7':
13514 /* When we parse backslash escapes there is ambiguity
13515 * between backreferences and octal escapes. Any escape
13516 * from \1 - \9 is a backreference, any multi-digit
13517 * escape which does not start with 0 and which when
13518 * evaluated as decimal could refer to an already
13519 * parsed capture buffer is a back reference. Anything
13522 * Note this implies that \118 could be interpreted as
13523 * 118 OR as "\11" . "8" depending on whether there
13524 * were 118 capture buffers defined already in the
13527 /* NOTE, RExC_npar is 1 more than the actual number of
13528 * parens we have seen so far, hence the < RExC_npar below. */
13530 if ( !isDIGIT(p[1]) || S_backref_value(p) < RExC_npar)
13531 { /* Not to be treated as an octal constant, go
13539 I32 flags = PERL_SCAN_SILENT_ILLDIGIT;
13541 ender = grok_oct(p, &numlen, &flags, NULL);
13542 if (ender > 0xff) {
13543 REQUIRE_UTF8(flagp);
13546 if (PASS2 /* like \08, \178 */
13548 && isDIGIT(*p) && ckWARN(WARN_REGEXP))
13550 reg_warn_non_literal_string(
13552 form_short_octal_warning(p, numlen));
13558 FAIL("Trailing \\");
13561 if (!SIZE_ONLY&& isALPHANUMERIC(*p)) {
13562 /* Include any left brace following the alpha to emphasize
13563 * that it could be part of an escape at some point
13565 int len = (isALPHA(*p) && *(p + 1) == '{') ? 2 : 1;
13566 ckWARN3reg(p + len, "Unrecognized escape \\%.*s passed through", len, p);
13568 goto normal_default;
13569 } /* End of switch on '\' */
13572 /* Currently we allow an lbrace at the start of a construct
13573 * without raising a warning. This is because we think we
13574 * will never want such a brace to be meant to be other
13575 * than taken literally. */
13576 if (len || (p > RExC_start && isALPHA_A(*(p - 1)))) {
13578 /* But, we raise a fatal warning otherwise, as the
13579 * deprecation cycle has come and gone. Except that it
13580 * turns out that some heavily-relied on upstream
13581 * software, notably GNU Autoconf, have failed to fix
13582 * their uses. For these, don't make it fatal unless
13583 * we anticipate using the '{' for something else.
13584 * This happens after any alpha, and for a looser {m,n}
13585 * quantifier specification */
13587 || ( p > parse_start + 1
13588 && isALPHA_A(*(p - 1))
13589 && *(p - 2) == '\\')
13590 || new_regcurly(p, RExC_end))
13592 RExC_parse = p + 1;
13593 vFAIL("Unescaped left brace in regex is "
13597 ckWARNregdep(p + 1,
13598 "Unescaped left brace in regex is "
13599 "deprecated here (and will be fatal "
13600 "in Perl 5.30), passed through");
13603 goto normal_default;
13606 if (PASS2 && p > RExC_parse && RExC_strict) {
13607 ckWARN2reg(p + 1, "Unescaped literal '%c'", *p);
13610 default: /* A literal character */
13612 if (! UTF8_IS_INVARIANT(*p) && UTF) {
13614 ender = utf8n_to_uvchr((U8*)p, RExC_end - p,
13615 &numlen, UTF8_ALLOW_DEFAULT);
13621 } /* End of switch on the literal */
13623 /* Here, have looked at the literal character and <ender>
13624 * contains its ordinal, <p> points to the character after it.
13625 * We need to check if the next non-ignored thing is a
13626 * quantifier. Move <p> to after anything that should be
13627 * ignored, which, as a side effect, positions <p> for the next
13628 * loop iteration */
13629 skip_to_be_ignored_text(pRExC_state, &p,
13630 FALSE /* Don't force to /x */ );
13632 /* If the next thing is a quantifier, it applies to this
13633 * character only, which means that this character has to be in
13634 * its own node and can't just be appended to the string in an
13635 * existing node, so if there are already other characters in
13636 * the node, close the node with just them, and set up to do
13637 * this character again next time through, when it will be the
13638 * only thing in its new node */
13640 next_is_quantifier = LIKELY(p < RExC_end)
13641 && UNLIKELY(ISMULT2(p));
13643 if (next_is_quantifier && LIKELY(len)) {
13648 /* Ready to add 'ender' to the node */
13650 if (! FOLD) { /* The simple case, just append the literal */
13652 /* In the sizing pass, we need only the size of the
13653 * character we are appending, hence we can delay getting
13654 * its representation until PASS2. */
13656 if (UTF && ! UVCHR_IS_INVARIANT(ender)) {
13657 const STRLEN unilen = UVCHR_SKIP(ender);
13660 /* We have to subtract 1 just below (and again in
13661 * the corresponding PASS2 code) because the loop
13662 * increments <len> each time, as all but this path
13663 * (and one other) through it add a single byte to
13664 * the EXACTish node. But these paths would change
13665 * len to be the correct final value, so cancel out
13666 * the increment that follows */
13672 } else { /* PASS2 */
13674 if (UTF && ! UVCHR_IS_INVARIANT(ender)) {
13675 U8 * new_s = uvchr_to_utf8((U8*)s, ender);
13676 len += (char *) new_s - s - 1;
13677 s = (char *) new_s;
13680 *(s++) = (char) ender;
13684 else if (LOC && is_PROBLEMATIC_LOCALE_FOLD_cp(ender)) {
13686 /* Here are folding under /l, and the code point is
13687 * problematic. First, we know we can't simplify things */
13688 maybe_exact = FALSE;
13689 maybe_exactfu = FALSE;
13691 /* A problematic code point in this context means that its
13692 * fold isn't known until runtime, so we can't fold it now.
13693 * (The non-problematic code points are the above-Latin1
13694 * ones that fold to also all above-Latin1. Their folds
13695 * don't vary no matter what the locale is.) But here we
13696 * have characters whose fold depends on the locale.
13697 * Unlike the non-folding case above, we have to keep track
13698 * of these in the sizing pass, so that we can make sure we
13699 * don't split too-long nodes in the middle of a potential
13700 * multi-char fold. And unlike the regular fold case
13701 * handled in the else clauses below, we don't actually
13702 * fold and don't have special cases to consider. What we
13703 * do for both passes is the PASS2 code for non-folding */
13704 goto not_fold_common;
13706 else /* A regular FOLD code point */
13708 #if UNICODE_MAJOR_VERSION > 3 /* no multifolds in early Unicode */ \
13709 || (UNICODE_MAJOR_VERSION == 3 && ( UNICODE_DOT_VERSION > 0) \
13710 || UNICODE_DOT_DOT_VERSION > 0)
13711 /* See comments for join_exact() as to why we fold
13712 * this non-UTF at compile time */
13713 || ( node_type == EXACTFU
13714 && ender == LATIN_SMALL_LETTER_SHARP_S)
13717 /* Here, are folding and are not UTF-8 encoded; therefore
13718 * the character must be in the range 0-255, and is not /l
13719 * (Not /l because we already handled these under /l in
13720 * is_PROBLEMATIC_LOCALE_FOLD_cp) */
13721 if (IS_IN_SOME_FOLD_L1(ender)) {
13722 maybe_exact = FALSE;
13724 /* See if the character's fold differs between /d and
13725 * /u. This includes the multi-char fold SHARP S to
13727 if (UNLIKELY(ender == LATIN_SMALL_LETTER_SHARP_S)) {
13728 RExC_seen_unfolded_sharp_s = 1;
13729 maybe_exactfu = FALSE;
13731 else if (maybe_exactfu
13732 && (PL_fold[ender] != PL_fold_latin1[ender]
13733 #if UNICODE_MAJOR_VERSION > 3 /* no multifolds in early Unicode */ \
13734 || (UNICODE_MAJOR_VERSION == 3 && ( UNICODE_DOT_VERSION > 0) \
13735 || UNICODE_DOT_DOT_VERSION > 0)
13737 && isALPHA_FOLD_EQ(ender, 's')
13738 && isALPHA_FOLD_EQ(*(s-1), 's'))
13741 maybe_exactfu = FALSE;
13745 /* Even when folding, we store just the input character, as
13746 * we have an array that finds its fold quickly */
13747 *(s++) = (char) ender;
13749 else { /* FOLD, and UTF (or sharp s) */
13750 /* Unlike the non-fold case, we do actually have to
13751 * calculate the results here in pass 1. This is for two
13752 * reasons, the folded length may be longer than the
13753 * unfolded, and we have to calculate how many EXACTish
13754 * nodes it will take; and we may run out of room in a node
13755 * in the middle of a potential multi-char fold, and have
13756 * to back off accordingly. */
13759 if (isASCII_uni(ender)) {
13760 folded = toFOLD(ender);
13761 *(s)++ = (U8) folded;
13766 folded = _to_uni_fold_flags(
13770 FOLD_FLAGS_FULL | ((ASCII_FOLD_RESTRICTED)
13771 ? FOLD_FLAGS_NOMIX_ASCII
13775 /* The loop increments <len> each time, as all but this
13776 * path (and one other) through it add a single byte to
13777 * the EXACTish node. But this one has changed len to
13778 * be the correct final value, so subtract one to
13779 * cancel out the increment that follows */
13780 len += foldlen - 1;
13782 /* If this node only contains non-folding code points so
13783 * far, see if this new one is also non-folding */
13785 if (folded != ender) {
13786 maybe_exact = FALSE;
13789 /* Here the fold is the original; we have to check
13790 * further to see if anything folds to it */
13791 if (_invlist_contains_cp(PL_utf8_foldable,
13794 maybe_exact = FALSE;
13801 if (next_is_quantifier) {
13803 /* Here, the next input is a quantifier, and to get here,
13804 * the current character is the only one in the node.
13805 * Also, here <len> doesn't include the final byte for this
13811 } /* End of loop through literal characters */
13813 /* Here we have either exhausted the input or ran out of room in
13814 * the node. (If we encountered a character that can't be in the
13815 * node, transfer is made directly to <loopdone>, and so we
13816 * wouldn't have fallen off the end of the loop.) In the latter
13817 * case, we artificially have to split the node into two, because
13818 * we just don't have enough space to hold everything. This
13819 * creates a problem if the final character participates in a
13820 * multi-character fold in the non-final position, as a match that
13821 * should have occurred won't, due to the way nodes are matched,
13822 * and our artificial boundary. So back off until we find a non-
13823 * problematic character -- one that isn't at the beginning or
13824 * middle of such a fold. (Either it doesn't participate in any
13825 * folds, or appears only in the final position of all the folds it
13826 * does participate in.) A better solution with far fewer false
13827 * positives, and that would fill the nodes more completely, would
13828 * be to actually have available all the multi-character folds to
13829 * test against, and to back-off only far enough to be sure that
13830 * this node isn't ending with a partial one. <upper_parse> is set
13831 * further below (if we need to reparse the node) to include just
13832 * up through that final non-problematic character that this code
13833 * identifies, so when it is set to less than the full node, we can
13834 * skip the rest of this */
13835 if (FOLD && p < RExC_end && upper_parse == MAX_NODE_STRING_SIZE) {
13837 const STRLEN full_len = len;
13839 assert(len >= MAX_NODE_STRING_SIZE);
13841 /* Here, <s> points to the final byte of the final character.
13842 * Look backwards through the string until find a non-
13843 * problematic character */
13847 /* This has no multi-char folds to non-UTF characters */
13848 if (ASCII_FOLD_RESTRICTED) {
13852 while (--s >= s0 && IS_NON_FINAL_FOLD(*s)) { }
13856 if (! PL_NonL1NonFinalFold) {
13857 PL_NonL1NonFinalFold = _new_invlist_C_array(
13858 NonL1_Perl_Non_Final_Folds_invlist);
13861 /* Point to the first byte of the final character */
13862 s = (char *) utf8_hop((U8 *) s, -1);
13864 while (s >= s0) { /* Search backwards until find
13865 non-problematic char */
13866 if (UTF8_IS_INVARIANT(*s)) {
13868 /* There are no ascii characters that participate
13869 * in multi-char folds under /aa. In EBCDIC, the
13870 * non-ascii invariants are all control characters,
13871 * so don't ever participate in any folds. */
13872 if (ASCII_FOLD_RESTRICTED
13873 || ! IS_NON_FINAL_FOLD(*s))
13878 else if (UTF8_IS_DOWNGRADEABLE_START(*s)) {
13879 if (! IS_NON_FINAL_FOLD(EIGHT_BIT_UTF8_TO_NATIVE(
13885 else if (! _invlist_contains_cp(
13886 PL_NonL1NonFinalFold,
13887 valid_utf8_to_uvchr((U8 *) s, NULL)))
13892 /* Here, the current character is problematic in that
13893 * it does occur in the non-final position of some
13894 * fold, so try the character before it, but have to
13895 * special case the very first byte in the string, so
13896 * we don't read outside the string */
13897 s = (s == s0) ? s -1 : (char *) utf8_hop((U8 *) s, -1);
13898 } /* End of loop backwards through the string */
13900 /* If there were only problematic characters in the string,
13901 * <s> will point to before s0, in which case the length
13902 * should be 0, otherwise include the length of the
13903 * non-problematic character just found */
13904 len = (s < s0) ? 0 : s - s0 + UTF8SKIP(s);
13907 /* Here, have found the final character, if any, that is
13908 * non-problematic as far as ending the node without splitting
13909 * it across a potential multi-char fold. <len> contains the
13910 * number of bytes in the node up-to and including that
13911 * character, or is 0 if there is no such character, meaning
13912 * the whole node contains only problematic characters. In
13913 * this case, give up and just take the node as-is. We can't
13918 /* If the node ends in an 's' we make sure it stays EXACTF,
13919 * as if it turns into an EXACTFU, it could later get
13920 * joined with another 's' that would then wrongly match
13922 if (maybe_exactfu && isALPHA_FOLD_EQ(ender, 's'))
13924 maybe_exactfu = FALSE;
13928 /* Here, the node does contain some characters that aren't
13929 * problematic. If one such is the final character in the
13930 * node, we are done */
13931 if (len == full_len) {
13934 else if (len + ((UTF) ? UTF8SKIP(s) : 1) == full_len) {
13936 /* If the final character is problematic, but the
13937 * penultimate is not, back-off that last character to
13938 * later start a new node with it */
13943 /* Here, the final non-problematic character is earlier
13944 * in the input than the penultimate character. What we do
13945 * is reparse from the beginning, going up only as far as
13946 * this final ok one, thus guaranteeing that the node ends
13947 * in an acceptable character. The reason we reparse is
13948 * that we know how far in the character is, but we don't
13949 * know how to correlate its position with the input parse.
13950 * An alternate implementation would be to build that
13951 * correlation as we go along during the original parse,
13952 * but that would entail extra work for every node, whereas
13953 * this code gets executed only when the string is too
13954 * large for the node, and the final two characters are
13955 * problematic, an infrequent occurrence. Yet another
13956 * possible strategy would be to save the tail of the
13957 * string, and the next time regatom is called, initialize
13958 * with that. The problem with this is that unless you
13959 * back off one more character, you won't be guaranteed
13960 * regatom will get called again, unless regbranch,
13961 * regpiece ... are also changed. If you do back off that
13962 * extra character, so that there is input guaranteed to
13963 * force calling regatom, you can't handle the case where
13964 * just the first character in the node is acceptable. I
13965 * (khw) decided to try this method which doesn't have that
13966 * pitfall; if performance issues are found, we can do a
13967 * combination of the current approach plus that one */
13973 } /* End of verifying node ends with an appropriate char */
13975 loopdone: /* Jumped to when encounters something that shouldn't be
13978 /* I (khw) don't know if you can get here with zero length, but the
13979 * old code handled this situation by creating a zero-length EXACT
13980 * node. Might as well be NOTHING instead */
13986 /* If 'maybe_exact' is still set here, means there are no
13987 * code points in the node that participate in folds;
13988 * similarly for 'maybe_exactfu' and code points that match
13989 * differently depending on UTF8ness of the target string
13990 * (for /u), or depending on locale for /l */
13996 else if (maybe_exactfu) {
14002 alloc_maybe_populate_EXACT(pRExC_state, ret, flagp, len, ender,
14003 FALSE /* Don't look to see if could
14004 be turned into an EXACT
14005 node, as we have already
14010 RExC_parse = p - 1;
14011 Set_Node_Cur_Length(ret, parse_start);
14014 /* len is STRLEN which is unsigned, need to copy to signed */
14017 vFAIL("Internal disaster");
14020 } /* End of label 'defchar:' */
14022 } /* End of giant switch on input character */
14024 /* Position parse to next real character */
14025 skip_to_be_ignored_text(pRExC_state, &RExC_parse,
14026 FALSE /* Don't force to /x */ );
14027 if (PASS2 && *RExC_parse == '{' && OP(ret) != SBOL && ! regcurly(RExC_parse)) {
14028 ckWARNregdep(RExC_parse + 1, "Unescaped left brace in regex is deprecated here (and will be fatal in Perl 5.30), passed through");
14036 S_populate_ANYOF_from_invlist(pTHX_ regnode *node, SV** invlist_ptr)
14038 /* Uses the inversion list '*invlist_ptr' to populate the ANYOF 'node'. It
14039 * sets up the bitmap and any flags, removing those code points from the
14040 * inversion list, setting it to NULL should it become completely empty */
14042 PERL_ARGS_ASSERT_POPULATE_ANYOF_FROM_INVLIST;
14043 assert(PL_regkind[OP(node)] == ANYOF);
14045 ANYOF_BITMAP_ZERO(node);
14046 if (*invlist_ptr) {
14048 /* This gets set if we actually need to modify things */
14049 bool change_invlist = FALSE;
14053 /* Start looking through *invlist_ptr */
14054 invlist_iterinit(*invlist_ptr);
14055 while (invlist_iternext(*invlist_ptr, &start, &end)) {
14059 if (end == UV_MAX && start <= NUM_ANYOF_CODE_POINTS) {
14060 ANYOF_FLAGS(node) |= ANYOF_MATCHES_ALL_ABOVE_BITMAP;
14063 /* Quit if are above what we should change */
14064 if (start >= NUM_ANYOF_CODE_POINTS) {
14068 change_invlist = TRUE;
14070 /* Set all the bits in the range, up to the max that we are doing */
14071 high = (end < NUM_ANYOF_CODE_POINTS - 1)
14073 : NUM_ANYOF_CODE_POINTS - 1;
14074 for (i = start; i <= (int) high; i++) {
14075 if (! ANYOF_BITMAP_TEST(node, i)) {
14076 ANYOF_BITMAP_SET(node, i);
14080 invlist_iterfinish(*invlist_ptr);
14082 /* Done with loop; remove any code points that are in the bitmap from
14083 * *invlist_ptr; similarly for code points above the bitmap if we have
14084 * a flag to match all of them anyways */
14085 if (change_invlist) {
14086 _invlist_subtract(*invlist_ptr, PL_InBitmap, invlist_ptr);
14088 if (ANYOF_FLAGS(node) & ANYOF_MATCHES_ALL_ABOVE_BITMAP) {
14089 _invlist_intersection(*invlist_ptr, PL_InBitmap, invlist_ptr);
14092 /* If have completely emptied it, remove it completely */
14093 if (_invlist_len(*invlist_ptr) == 0) {
14094 SvREFCNT_dec_NN(*invlist_ptr);
14095 *invlist_ptr = NULL;
14100 /* Parse POSIX character classes: [[:foo:]], [[=foo=]], [[.foo.]].
14101 Character classes ([:foo:]) can also be negated ([:^foo:]).
14102 Returns a named class id (ANYOF_XXX) if successful, -1 otherwise.
14103 Equivalence classes ([=foo=]) and composites ([.foo.]) are parsed,
14104 but trigger failures because they are currently unimplemented. */
14106 #define POSIXCC_DONE(c) ((c) == ':')
14107 #define POSIXCC_NOTYET(c) ((c) == '=' || (c) == '.')
14108 #define POSIXCC(c) (POSIXCC_DONE(c) || POSIXCC_NOTYET(c))
14109 #define MAYBE_POSIXCC(c) (POSIXCC(c) || (c) == '^' || (c) == ';')
14111 #define WARNING_PREFIX "Assuming NOT a POSIX class since "
14112 #define NO_BLANKS_POSIX_WARNING "no blanks are allowed in one"
14113 #define SEMI_COLON_POSIX_WARNING "a semi-colon was found instead of a colon"
14115 #define NOT_MEANT_TO_BE_A_POSIX_CLASS (OOB_NAMEDCLASS - 1)
14117 /* 'posix_warnings' and 'warn_text' are names of variables in the following
14119 #define ADD_POSIX_WARNING(p, text) STMT_START { \
14120 if (posix_warnings) { \
14121 if (! RExC_warn_text ) RExC_warn_text = (AV *) sv_2mortal((SV *) newAV()); \
14122 av_push(RExC_warn_text, Perl_newSVpvf(aTHX_ \
14126 REPORT_LOCATION_ARGS(p))); \
14129 #define CLEAR_POSIX_WARNINGS() \
14131 if (posix_warnings && RExC_warn_text) \
14132 av_clear(RExC_warn_text); \
14135 #define CLEAR_POSIX_WARNINGS_AND_RETURN(ret) \
14137 CLEAR_POSIX_WARNINGS(); \
14142 S_handle_possible_posix(pTHX_ RExC_state_t *pRExC_state,
14144 const char * const s, /* Where the putative posix class begins.
14145 Normally, this is one past the '['. This
14146 parameter exists so it can be somewhere
14147 besides RExC_parse. */
14148 char ** updated_parse_ptr, /* Where to set the updated parse pointer, or
14150 AV ** posix_warnings, /* Where to place any generated warnings, or
14152 const bool check_only /* Don't die if error */
14155 /* This parses what the caller thinks may be one of the three POSIX
14157 * 1) a character class, like [:blank:]
14158 * 2) a collating symbol, like [. .]
14159 * 3) an equivalence class, like [= =]
14160 * In the latter two cases, it croaks if it finds a syntactically legal
14161 * one, as these are not handled by Perl.
14163 * The main purpose is to look for a POSIX character class. It returns:
14164 * a) the class number
14165 * if it is a completely syntactically and semantically legal class.
14166 * 'updated_parse_ptr', if not NULL, is set to point to just after the
14167 * closing ']' of the class
14168 * b) OOB_NAMEDCLASS
14169 * if it appears that one of the three POSIX constructs was meant, but
14170 * its specification was somehow defective. 'updated_parse_ptr', if
14171 * not NULL, is set to point to the character just after the end
14172 * character of the class. See below for handling of warnings.
14173 * c) NOT_MEANT_TO_BE_A_POSIX_CLASS
14174 * if it doesn't appear that a POSIX construct was intended.
14175 * 'updated_parse_ptr' is not changed. No warnings nor errors are
14178 * In b) there may be errors or warnings generated. If 'check_only' is
14179 * TRUE, then any errors are discarded. Warnings are returned to the
14180 * caller via an AV* created into '*posix_warnings' if it is not NULL. If
14181 * instead it is NULL, warnings are suppressed. This is done in all
14182 * passes. The reason for this is that the rest of the parsing is heavily
14183 * dependent on whether this routine found a valid posix class or not. If
14184 * it did, the closing ']' is absorbed as part of the class. If no class,
14185 * or an invalid one is found, any ']' will be considered the terminator of
14186 * the outer bracketed character class, leading to very different results.
14187 * In particular, a '(?[ ])' construct will likely have a syntax error if
14188 * the class is parsed other than intended, and this will happen in pass1,
14189 * before the warnings would normally be output. This mechanism allows the
14190 * caller to output those warnings in pass1 just before dieing, giving a
14191 * much better clue as to what is wrong.
14193 * The reason for this function, and its complexity is that a bracketed
14194 * character class can contain just about anything. But it's easy to
14195 * mistype the very specific posix class syntax but yielding a valid
14196 * regular bracketed class, so it silently gets compiled into something
14197 * quite unintended.
14199 * The solution adopted here maintains backward compatibility except that
14200 * it adds a warning if it looks like a posix class was intended but
14201 * improperly specified. The warning is not raised unless what is input
14202 * very closely resembles one of the 14 legal posix classes. To do this,
14203 * it uses fuzzy parsing. It calculates how many single-character edits it
14204 * would take to transform what was input into a legal posix class. Only
14205 * if that number is quite small does it think that the intention was a
14206 * posix class. Obviously these are heuristics, and there will be cases
14207 * where it errs on one side or another, and they can be tweaked as
14208 * experience informs.
14210 * The syntax for a legal posix class is:
14212 * qr/(?xa: \[ : \^? [[:lower:]]{4,6} : \] )/
14214 * What this routine considers syntactically to be an intended posix class
14215 * is this (the comments indicate some restrictions that the pattern
14218 * qr/(?x: \[? # The left bracket, possibly
14220 * \h* # possibly followed by blanks
14221 * (?: \^ \h* )? # possibly a misplaced caret
14222 * [:;]? # The opening class character,
14223 * # possibly omitted. A typo
14224 * # semi-colon can also be used.
14226 * \^? # possibly a correctly placed
14227 * # caret, but not if there was also
14228 * # a misplaced one
14230 * .{3,15} # The class name. If there are
14231 * # deviations from the legal syntax,
14232 * # its edit distance must be close
14233 * # to a real class name in order
14234 * # for it to be considered to be
14235 * # an intended posix class.
14237 * [[:punct:]]? # The closing class character,
14238 * # possibly omitted. If not a colon
14239 * # nor semi colon, the class name
14240 * # must be even closer to a valid
14243 * \]? # The right bracket, possibly
14247 * In the above, \h must be ASCII-only.
14249 * These are heuristics, and can be tweaked as field experience dictates.
14250 * There will be cases when someone didn't intend to specify a posix class
14251 * that this warns as being so. The goal is to minimize these, while
14252 * maximizing the catching of things intended to be a posix class that
14253 * aren't parsed as such.
14257 const char * const e = RExC_end;
14258 unsigned complement = 0; /* If to complement the class */
14259 bool found_problem = FALSE; /* Assume OK until proven otherwise */
14260 bool has_opening_bracket = FALSE;
14261 bool has_opening_colon = FALSE;
14262 int class_number = OOB_NAMEDCLASS; /* Out-of-bounds until find
14264 const char * possible_end = NULL; /* used for a 2nd parse pass */
14265 const char* name_start; /* ptr to class name first char */
14267 /* If the number of single-character typos the input name is away from a
14268 * legal name is no more than this number, it is considered to have meant
14269 * the legal name */
14270 int max_distance = 2;
14272 /* to store the name. The size determines the maximum length before we
14273 * decide that no posix class was intended. Should be at least
14274 * sizeof("alphanumeric") */
14276 STATIC_ASSERT_DECL(C_ARRAY_LENGTH(input_text) >= sizeof "alphanumeric");
14278 PERL_ARGS_ASSERT_HANDLE_POSSIBLE_POSIX;
14280 CLEAR_POSIX_WARNINGS();
14283 return NOT_MEANT_TO_BE_A_POSIX_CLASS;
14286 if (*(p - 1) != '[') {
14287 ADD_POSIX_WARNING(p, "it doesn't start with a '['");
14288 found_problem = TRUE;
14291 has_opening_bracket = TRUE;
14294 /* They could be confused and think you can put spaces between the
14297 found_problem = TRUE;
14301 } while (p < e && isBLANK(*p));
14303 ADD_POSIX_WARNING(p, NO_BLANKS_POSIX_WARNING);
14306 /* For [. .] and [= =]. These are quite different internally from [: :],
14307 * so they are handled separately. */
14308 if (POSIXCC_NOTYET(*p) && p < e - 3) /* 1 for the close, and 1 for the ']'
14309 and 1 for at least one char in it
14312 const char open_char = *p;
14313 const char * temp_ptr = p + 1;
14315 /* These two constructs are not handled by perl, and if we find a
14316 * syntactically valid one, we croak. khw, who wrote this code, finds
14317 * this explanation of them very unclear:
14318 * http://pubs.opengroup.org/onlinepubs/009696899/basedefs/xbd_chap09.html
14319 * And searching the rest of the internet wasn't very helpful either.
14320 * It looks like just about any byte can be in these constructs,
14321 * depending on the locale. But unless the pattern is being compiled
14322 * under /l, which is very rare, Perl runs under the C or POSIX locale.
14323 * In that case, it looks like [= =] isn't allowed at all, and that
14324 * [. .] could be any single code point, but for longer strings the
14325 * constituent characters would have to be the ASCII alphabetics plus
14326 * the minus-hyphen. Any sensible locale definition would limit itself
14327 * to these. And any portable one definitely should. Trying to parse
14328 * the general case is a nightmare (see [perl #127604]). So, this code
14329 * looks only for interiors of these constructs that match:
14331 * Using \w relaxes the apparent rules a little, without adding much
14332 * danger of mistaking something else for one of these constructs.
14334 * [. .] in some implementations described on the internet is usable to
14335 * escape a character that otherwise is special in bracketed character
14336 * classes. For example [.].] means a literal right bracket instead of
14337 * the ending of the class
14339 * [= =] can legitimately contain a [. .] construct, but we don't
14340 * handle this case, as that [. .] construct will later get parsed
14341 * itself and croak then. And [= =] is checked for even when not under
14342 * /l, as Perl has long done so.
14344 * The code below relies on there being a trailing NUL, so it doesn't
14345 * have to keep checking if the parse ptr < e.
14347 if (temp_ptr[1] == open_char) {
14350 else while ( temp_ptr < e
14351 && (isWORDCHAR(*temp_ptr) || *temp_ptr == '-'))
14356 if (*temp_ptr == open_char) {
14358 if (*temp_ptr == ']') {
14360 if (! found_problem && ! check_only) {
14361 RExC_parse = (char *) temp_ptr;
14362 vFAIL3("POSIX syntax [%c %c] is reserved for future "
14363 "extensions", open_char, open_char);
14366 /* Here, the syntax wasn't completely valid, or else the call
14367 * is to check-only */
14368 if (updated_parse_ptr) {
14369 *updated_parse_ptr = (char *) temp_ptr;
14372 CLEAR_POSIX_WARNINGS_AND_RETURN(OOB_NAMEDCLASS);
14376 /* If we find something that started out to look like one of these
14377 * constructs, but isn't, we continue below so that it can be checked
14378 * for being a class name with a typo of '.' or '=' instead of a colon.
14382 /* Here, we think there is a possibility that a [: :] class was meant, and
14383 * we have the first real character. It could be they think the '^' comes
14386 found_problem = TRUE;
14387 ADD_POSIX_WARNING(p + 1, "the '^' must come after the colon");
14392 found_problem = TRUE;
14396 } while (p < e && isBLANK(*p));
14398 ADD_POSIX_WARNING(p, NO_BLANKS_POSIX_WARNING);
14402 /* But the first character should be a colon, which they could have easily
14403 * mistyped on a qwerty keyboard as a semi-colon (and which may be hard to
14404 * distinguish from a colon, so treat that as a colon). */
14407 has_opening_colon = TRUE;
14409 else if (*p == ';') {
14410 found_problem = TRUE;
14412 ADD_POSIX_WARNING(p, SEMI_COLON_POSIX_WARNING);
14413 has_opening_colon = TRUE;
14416 found_problem = TRUE;
14417 ADD_POSIX_WARNING(p, "there must be a starting ':'");
14419 /* Consider an initial punctuation (not one of the recognized ones) to
14420 * be a left terminator */
14421 if (*p != '^' && *p != ']' && isPUNCT(*p)) {
14426 /* They may think that you can put spaces between the components */
14428 found_problem = TRUE;
14432 } while (p < e && isBLANK(*p));
14434 ADD_POSIX_WARNING(p, NO_BLANKS_POSIX_WARNING);
14439 /* We consider something like [^:^alnum:]] to not have been intended to
14440 * be a posix class, but XXX maybe we should */
14442 CLEAR_POSIX_WARNINGS_AND_RETURN(NOT_MEANT_TO_BE_A_POSIX_CLASS);
14449 /* Again, they may think that you can put spaces between the components */
14451 found_problem = TRUE;
14455 } while (p < e && isBLANK(*p));
14457 ADD_POSIX_WARNING(p, NO_BLANKS_POSIX_WARNING);
14462 /* XXX This ']' may be a typo, and something else was meant. But
14463 * treating it as such creates enough complications, that that
14464 * possibility isn't currently considered here. So we assume that the
14465 * ']' is what is intended, and if we've already found an initial '[',
14466 * this leaves this construct looking like [:] or [:^], which almost
14467 * certainly weren't intended to be posix classes */
14468 if (has_opening_bracket) {
14469 CLEAR_POSIX_WARNINGS_AND_RETURN(NOT_MEANT_TO_BE_A_POSIX_CLASS);
14472 /* But this function can be called when we parse the colon for
14473 * something like qr/[alpha:]]/, so we back up to look for the
14478 found_problem = TRUE;
14479 ADD_POSIX_WARNING(p, SEMI_COLON_POSIX_WARNING);
14481 else if (*p != ':') {
14483 /* XXX We are currently very restrictive here, so this code doesn't
14484 * consider the possibility that, say, /[alpha.]]/ was intended to
14485 * be a posix class. */
14486 CLEAR_POSIX_WARNINGS_AND_RETURN(NOT_MEANT_TO_BE_A_POSIX_CLASS);
14489 /* Here we have something like 'foo:]'. There was no initial colon,
14490 * and we back up over 'foo. XXX Unlike the going forward case, we
14491 * don't handle typos of non-word chars in the middle */
14492 has_opening_colon = FALSE;
14495 while (p > RExC_start && isWORDCHAR(*p)) {
14500 /* Here, we have positioned ourselves to where we think the first
14501 * character in the potential class is */
14504 /* Now the interior really starts. There are certain key characters that
14505 * can end the interior, or these could just be typos. To catch both
14506 * cases, we may have to do two passes. In the first pass, we keep on
14507 * going unless we come to a sequence that matches
14508 * qr/ [[:punct:]] [[:blank:]]* \] /xa
14509 * This means it takes a sequence to end the pass, so two typos in a row if
14510 * that wasn't what was intended. If the class is perfectly formed, just
14511 * this one pass is needed. We also stop if there are too many characters
14512 * being accumulated, but this number is deliberately set higher than any
14513 * real class. It is set high enough so that someone who thinks that
14514 * 'alphanumeric' is a correct name would get warned that it wasn't.
14515 * While doing the pass, we keep track of where the key characters were in
14516 * it. If we don't find an end to the class, and one of the key characters
14517 * was found, we redo the pass, but stop when we get to that character.
14518 * Thus the key character was considered a typo in the first pass, but a
14519 * terminator in the second. If two key characters are found, we stop at
14520 * the second one in the first pass. Again this can miss two typos, but
14521 * catches a single one
14523 * In the first pass, 'possible_end' starts as NULL, and then gets set to
14524 * point to the first key character. For the second pass, it starts as -1.
14530 bool has_blank = FALSE;
14531 bool has_upper = FALSE;
14532 bool has_terminating_colon = FALSE;
14533 bool has_terminating_bracket = FALSE;
14534 bool has_semi_colon = FALSE;
14535 unsigned int name_len = 0;
14536 int punct_count = 0;
14540 /* Squeeze out blanks when looking up the class name below */
14541 if (isBLANK(*p) ) {
14543 found_problem = TRUE;
14548 /* The name will end with a punctuation */
14550 const char * peek = p + 1;
14552 /* Treat any non-']' punctuation followed by a ']' (possibly
14553 * with intervening blanks) as trying to terminate the class.
14554 * ']]' is very likely to mean a class was intended (but
14555 * missing the colon), but the warning message that gets
14556 * generated shows the error position better if we exit the
14557 * loop at the bottom (eventually), so skip it here. */
14559 if (peek < e && isBLANK(*peek)) {
14561 found_problem = TRUE;
14564 } while (peek < e && isBLANK(*peek));
14567 if (peek < e && *peek == ']') {
14568 has_terminating_bracket = TRUE;
14570 has_terminating_colon = TRUE;
14572 else if (*p == ';') {
14573 has_semi_colon = TRUE;
14574 has_terminating_colon = TRUE;
14577 found_problem = TRUE;
14584 /* Here we have punctuation we thought didn't end the class.
14585 * Keep track of the position of the key characters that are
14586 * more likely to have been class-enders */
14587 if (*p == ']' || *p == '[' || *p == ':' || *p == ';') {
14589 /* Allow just one such possible class-ender not actually
14590 * ending the class. */
14591 if (possible_end) {
14597 /* If we have too many punctuation characters, no use in
14599 if (++punct_count > max_distance) {
14603 /* Treat the punctuation as a typo. */
14604 input_text[name_len++] = *p;
14607 else if (isUPPER(*p)) { /* Use lowercase for lookup */
14608 input_text[name_len++] = toLOWER(*p);
14610 found_problem = TRUE;
14612 } else if (! UTF || UTF8_IS_INVARIANT(*p)) {
14613 input_text[name_len++] = *p;
14617 input_text[name_len++] = utf8_to_uvchr_buf((U8 *) p, e, NULL);
14621 /* The declaration of 'input_text' is how long we allow a potential
14622 * class name to be, before saying they didn't mean a class name at
14624 if (name_len >= C_ARRAY_LENGTH(input_text)) {
14629 /* We get to here when the possible class name hasn't been properly
14630 * terminated before:
14631 * 1) we ran off the end of the pattern; or
14632 * 2) found two characters, each of which might have been intended to
14633 * be the name's terminator
14634 * 3) found so many punctuation characters in the purported name,
14635 * that the edit distance to a valid one is exceeded
14636 * 4) we decided it was more characters than anyone could have
14637 * intended to be one. */
14639 found_problem = TRUE;
14641 /* In the final two cases, we know that looking up what we've
14642 * accumulated won't lead to a match, even a fuzzy one. */
14643 if ( name_len >= C_ARRAY_LENGTH(input_text)
14644 || punct_count > max_distance)
14646 /* If there was an intermediate key character that could have been
14647 * an intended end, redo the parse, but stop there */
14648 if (possible_end && possible_end != (char *) -1) {
14649 possible_end = (char *) -1; /* Special signal value to say
14650 we've done a first pass */
14655 /* Otherwise, it can't have meant to have been a class */
14656 CLEAR_POSIX_WARNINGS_AND_RETURN(NOT_MEANT_TO_BE_A_POSIX_CLASS);
14659 /* If we ran off the end, and the final character was a punctuation
14660 * one, back up one, to look at that final one just below. Later, we
14661 * will restore the parse pointer if appropriate */
14662 if (name_len && p == e && isPUNCT(*(p-1))) {
14667 if (p < e && isPUNCT(*p)) {
14669 has_terminating_bracket = TRUE;
14671 /* If this is a 2nd ']', and the first one is just below this
14672 * one, consider that to be the real terminator. This gives a
14673 * uniform and better positioning for the warning message */
14675 && possible_end != (char *) -1
14676 && *possible_end == ']'
14677 && name_len && input_text[name_len - 1] == ']')
14682 /* And this is actually equivalent to having done the 2nd
14683 * pass now, so set it to not try again */
14684 possible_end = (char *) -1;
14689 has_terminating_colon = TRUE;
14691 else if (*p == ';') {
14692 has_semi_colon = TRUE;
14693 has_terminating_colon = TRUE;
14701 /* Here, we have a class name to look up. We can short circuit the
14702 * stuff below for short names that can't possibly be meant to be a
14703 * class name. (We can do this on the first pass, as any second pass
14704 * will yield an even shorter name) */
14705 if (name_len < 3) {
14706 CLEAR_POSIX_WARNINGS_AND_RETURN(NOT_MEANT_TO_BE_A_POSIX_CLASS);
14709 /* Find which class it is. Initially switch on the length of the name.
14711 switch (name_len) {
14713 if (memEQs(name_start, 4, "word")) {
14714 /* this is not POSIX, this is the Perl \w */
14715 class_number = ANYOF_WORDCHAR;
14719 /* Names all of length 5: alnum alpha ascii blank cntrl digit
14720 * graph lower print punct space upper
14721 * Offset 4 gives the best switch position. */
14722 switch (name_start[4]) {
14724 if (memBEGINs(name_start, 5, "alph")) /* alpha */
14725 class_number = ANYOF_ALPHA;
14728 if (memBEGINs(name_start, 5, "spac")) /* space */
14729 class_number = ANYOF_SPACE;
14732 if (memBEGINs(name_start, 5, "grap")) /* graph */
14733 class_number = ANYOF_GRAPH;
14736 if (memBEGINs(name_start, 5, "asci")) /* ascii */
14737 class_number = ANYOF_ASCII;
14740 if (memBEGINs(name_start, 5, "blan")) /* blank */
14741 class_number = ANYOF_BLANK;
14744 if (memBEGINs(name_start, 5, "cntr")) /* cntrl */
14745 class_number = ANYOF_CNTRL;
14748 if (memBEGINs(name_start, 5, "alnu")) /* alnum */
14749 class_number = ANYOF_ALPHANUMERIC;
14752 if (memBEGINs(name_start, 5, "lowe")) /* lower */
14753 class_number = (FOLD) ? ANYOF_CASED : ANYOF_LOWER;
14754 else if (memBEGINs(name_start, 5, "uppe")) /* upper */
14755 class_number = (FOLD) ? ANYOF_CASED : ANYOF_UPPER;
14758 if (memBEGINs(name_start, 5, "digi")) /* digit */
14759 class_number = ANYOF_DIGIT;
14760 else if (memBEGINs(name_start, 5, "prin")) /* print */
14761 class_number = ANYOF_PRINT;
14762 else if (memBEGINs(name_start, 5, "punc")) /* punct */
14763 class_number = ANYOF_PUNCT;
14768 if (memEQs(name_start, 6, "xdigit"))
14769 class_number = ANYOF_XDIGIT;
14773 /* If the name exactly matches a posix class name the class number will
14774 * here be set to it, and the input almost certainly was meant to be a
14775 * posix class, so we can skip further checking. If instead the syntax
14776 * is exactly correct, but the name isn't one of the legal ones, we
14777 * will return that as an error below. But if neither of these apply,
14778 * it could be that no posix class was intended at all, or that one
14779 * was, but there was a typo. We tease these apart by doing fuzzy
14780 * matching on the name */
14781 if (class_number == OOB_NAMEDCLASS && found_problem) {
14782 const UV posix_names[][6] = {
14783 { 'a', 'l', 'n', 'u', 'm' },
14784 { 'a', 'l', 'p', 'h', 'a' },
14785 { 'a', 's', 'c', 'i', 'i' },
14786 { 'b', 'l', 'a', 'n', 'k' },
14787 { 'c', 'n', 't', 'r', 'l' },
14788 { 'd', 'i', 'g', 'i', 't' },
14789 { 'g', 'r', 'a', 'p', 'h' },
14790 { 'l', 'o', 'w', 'e', 'r' },
14791 { 'p', 'r', 'i', 'n', 't' },
14792 { 'p', 'u', 'n', 'c', 't' },
14793 { 's', 'p', 'a', 'c', 'e' },
14794 { 'u', 'p', 'p', 'e', 'r' },
14795 { 'w', 'o', 'r', 'd' },
14796 { 'x', 'd', 'i', 'g', 'i', 't' }
14798 /* The names of the above all have added NULs to make them the same
14799 * size, so we need to also have the real lengths */
14800 const UV posix_name_lengths[] = {
14801 sizeof("alnum") - 1,
14802 sizeof("alpha") - 1,
14803 sizeof("ascii") - 1,
14804 sizeof("blank") - 1,
14805 sizeof("cntrl") - 1,
14806 sizeof("digit") - 1,
14807 sizeof("graph") - 1,
14808 sizeof("lower") - 1,
14809 sizeof("print") - 1,
14810 sizeof("punct") - 1,
14811 sizeof("space") - 1,
14812 sizeof("upper") - 1,
14813 sizeof("word") - 1,
14814 sizeof("xdigit")- 1
14817 int temp_max = max_distance; /* Use a temporary, so if we
14818 reparse, we haven't changed the
14821 /* Use a smaller max edit distance if we are missing one of the
14823 if ( has_opening_bracket + has_opening_colon < 2
14824 || has_terminating_bracket + has_terminating_colon < 2)
14829 /* See if the input name is close to a legal one */
14830 for (i = 0; i < C_ARRAY_LENGTH(posix_names); i++) {
14832 /* Short circuit call if the lengths are too far apart to be
14834 if (abs( (int) (name_len - posix_name_lengths[i]))
14840 if (edit_distance(input_text,
14843 posix_name_lengths[i],
14847 { /* If it is close, it probably was intended to be a class */
14848 goto probably_meant_to_be;
14852 /* Here the input name is not close enough to a valid class name
14853 * for us to consider it to be intended to be a posix class. If
14854 * we haven't already done so, and the parse found a character that
14855 * could have been terminators for the name, but which we absorbed
14856 * as typos during the first pass, repeat the parse, signalling it
14857 * to stop at that character */
14858 if (possible_end && possible_end != (char *) -1) {
14859 possible_end = (char *) -1;
14864 /* Here neither pass found a close-enough class name */
14865 CLEAR_POSIX_WARNINGS_AND_RETURN(NOT_MEANT_TO_BE_A_POSIX_CLASS);
14868 probably_meant_to_be:
14870 /* Here we think that a posix specification was intended. Update any
14872 if (updated_parse_ptr) {
14873 *updated_parse_ptr = (char *) p;
14876 /* If a posix class name was intended but incorrectly specified, we
14877 * output or return the warnings */
14878 if (found_problem) {
14880 /* We set flags for these issues in the parse loop above instead of
14881 * adding them to the list of warnings, because we can parse it
14882 * twice, and we only want one warning instance */
14884 ADD_POSIX_WARNING(p, "the name must be all lowercase letters");
14887 ADD_POSIX_WARNING(p, NO_BLANKS_POSIX_WARNING);
14889 if (has_semi_colon) {
14890 ADD_POSIX_WARNING(p, SEMI_COLON_POSIX_WARNING);
14892 else if (! has_terminating_colon) {
14893 ADD_POSIX_WARNING(p, "there is no terminating ':'");
14895 if (! has_terminating_bracket) {
14896 ADD_POSIX_WARNING(p, "there is no terminating ']'");
14899 if (posix_warnings && RExC_warn_text && av_top_index(RExC_warn_text) > -1) {
14900 *posix_warnings = RExC_warn_text;
14903 else if (class_number != OOB_NAMEDCLASS) {
14904 /* If it is a known class, return the class. The class number
14905 * #defines are structured so each complement is +1 to the normal
14907 CLEAR_POSIX_WARNINGS_AND_RETURN(class_number + complement);
14909 else if (! check_only) {
14911 /* Here, it is an unrecognized class. This is an error (unless the
14912 * call is to check only, which we've already handled above) */
14913 const char * const complement_string = (complement)
14916 RExC_parse = (char *) p;
14917 vFAIL3utf8f("POSIX class [:%s%" UTF8f ":] unknown",
14919 UTF8fARG(UTF, RExC_parse - name_start - 2, name_start));
14923 return OOB_NAMEDCLASS;
14925 #undef ADD_POSIX_WARNING
14927 STATIC unsigned int
14928 S_regex_set_precedence(const U8 my_operator) {
14930 /* Returns the precedence in the (?[...]) construct of the input operator,
14931 * specified by its character representation. The precedence follows
14932 * general Perl rules, but it extends this so that ')' and ']' have (low)
14933 * precedence even though they aren't really operators */
14935 switch (my_operator) {
14951 NOT_REACHED; /* NOTREACHED */
14952 return 0; /* Silence compiler warning */
14956 S_handle_regex_sets(pTHX_ RExC_state_t *pRExC_state, SV** return_invlist,
14957 I32 *flagp, U32 depth,
14958 char * const oregcomp_parse)
14960 /* Handle the (?[...]) construct to do set operations */
14962 U8 curchar; /* Current character being parsed */
14963 UV start, end; /* End points of code point ranges */
14964 SV* final = NULL; /* The end result inversion list */
14965 SV* result_string; /* 'final' stringified */
14966 AV* stack; /* stack of operators and operands not yet
14968 AV* fence_stack = NULL; /* A stack containing the positions in
14969 'stack' of where the undealt-with left
14970 parens would be if they were actually
14972 /* The 'volatile' is a workaround for an optimiser bug
14973 * in Solaris Studio 12.3. See RT #127455 */
14974 volatile IV fence = 0; /* Position of where most recent undealt-
14975 with left paren in stack is; -1 if none.
14977 STRLEN len; /* Temporary */
14978 regnode* node; /* Temporary, and final regnode returned by
14980 const bool save_fold = FOLD; /* Temporary */
14981 char *save_end, *save_parse; /* Temporaries */
14982 const bool in_locale = LOC; /* we turn off /l during processing */
14983 AV* posix_warnings = NULL;
14985 GET_RE_DEBUG_FLAGS_DECL;
14987 PERL_ARGS_ASSERT_HANDLE_REGEX_SETS;
14989 DEBUG_PARSE("xcls");
14992 set_regex_charset(&RExC_flags, REGEX_UNICODE_CHARSET);
14995 REQUIRE_UNI_RULES(flagp, NULL); /* The use of this operator implies /u.
14996 This is required so that the compile
14997 time values are valid in all runtime
15000 /* This will return only an ANYOF regnode, or (unlikely) something smaller
15001 * (such as EXACT). Thus we can skip most everything if just sizing. We
15002 * call regclass to handle '[]' so as to not have to reinvent its parsing
15003 * rules here (throwing away the size it computes each time). And, we exit
15004 * upon an unescaped ']' that isn't one ending a regclass. To do both
15005 * these things, we need to realize that something preceded by a backslash
15006 * is escaped, so we have to keep track of backslashes */
15008 UV nest_depth = 0; /* how many nested (?[...]) constructs */
15010 while (RExC_parse < RExC_end) {
15011 SV* current = NULL;
15013 skip_to_be_ignored_text(pRExC_state, &RExC_parse,
15014 TRUE /* Force /x */ );
15016 switch (*RExC_parse) {
15018 if (RExC_parse[1] == '?' && RExC_parse[2] == '[')
15019 nest_depth++, RExC_parse+=2;
15024 /* Skip past this, so the next character gets skipped, after
15027 if (*RExC_parse == 'c') {
15028 /* Skip the \cX notation for control characters */
15029 RExC_parse += UTF ? UTF8SKIP(RExC_parse) : 1;
15035 /* See if this is a [:posix:] class. */
15036 bool is_posix_class = (OOB_NAMEDCLASS
15037 < handle_possible_posix(pRExC_state,
15041 TRUE /* checking only */));
15042 /* If it is a posix class, leave the parse pointer at the
15043 * '[' to fool regclass() into thinking it is part of a
15044 * '[[:posix:]]'. */
15045 if (! is_posix_class) {
15049 /* regclass() can only return RESTART_PASS1 and NEED_UTF8
15050 * if multi-char folds are allowed. */
15051 if (!regclass(pRExC_state, flagp,depth+1,
15052 is_posix_class, /* parse the whole char
15053 class only if not a
15055 FALSE, /* don't allow multi-char folds */
15056 TRUE, /* silence non-portable warnings. */
15058 FALSE, /* Require return to be an ANYOF */
15062 FAIL2("panic: regclass returned NULL to handle_sets, "
15063 "flags=%#" UVxf, (UV) *flagp);
15065 /* function call leaves parse pointing to the ']', except
15066 * if we faked it */
15067 if (is_posix_class) {
15071 SvREFCNT_dec(current); /* In case it returned something */
15076 if (RExC_parse[1] == ')') {
15078 if (nest_depth--) break;
15079 node = reganode(pRExC_state, ANYOF, 0);
15080 RExC_size += ANYOF_SKIP;
15081 nextchar(pRExC_state);
15082 Set_Node_Length(node,
15083 RExC_parse - oregcomp_parse + 1); /* MJD */
15085 set_regex_charset(&RExC_flags, REGEX_LOCALE_CHARSET);
15090 /* We output the messages even if warnings are off, because we'll fail
15091 * the very next thing, and these give a likely diagnosis for that */
15092 if (posix_warnings && av_tindex_skip_len_mg(posix_warnings) >= 0) {
15093 output_or_return_posix_warnings(pRExC_state, posix_warnings, NULL);
15096 vFAIL("Unexpected ']' with no following ')' in (?[...");
15099 RExC_parse += UTF ? UTF8SKIP(RExC_parse) : 1;
15102 /* We output the messages even if warnings are off, because we'll fail
15103 * the very next thing, and these give a likely diagnosis for that */
15104 if (posix_warnings && av_tindex_skip_len_mg(posix_warnings) >= 0) {
15105 output_or_return_posix_warnings(pRExC_state, posix_warnings, NULL);
15108 vFAIL("Syntax error in (?[...])");
15111 /* Pass 2 only after this. */
15112 Perl_ck_warner_d(aTHX_
15113 packWARN(WARN_EXPERIMENTAL__REGEX_SETS),
15114 "The regex_sets feature is experimental" REPORT_LOCATION,
15115 REPORT_LOCATION_ARGS(RExC_parse));
15117 /* Everything in this construct is a metacharacter. Operands begin with
15118 * either a '\' (for an escape sequence), or a '[' for a bracketed
15119 * character class. Any other character should be an operator, or
15120 * parenthesis for grouping. Both types of operands are handled by calling
15121 * regclass() to parse them. It is called with a parameter to indicate to
15122 * return the computed inversion list. The parsing here is implemented via
15123 * a stack. Each entry on the stack is a single character representing one
15124 * of the operators; or else a pointer to an operand inversion list. */
15126 #define IS_OPERATOR(a) SvIOK(a)
15127 #define IS_OPERAND(a) (! IS_OPERATOR(a))
15129 /* The stack is kept in Łukasiewicz order. (That's pronounced similar
15130 * to luke-a-shave-itch (or -itz), but people who didn't want to bother
15131 * with pronouncing it called it Reverse Polish instead, but now that YOU
15132 * know how to pronounce it you can use the correct term, thus giving due
15133 * credit to the person who invented it, and impressing your geek friends.
15134 * Wikipedia says that the pronounciation of "Ł" has been changing so that
15135 * it is now more like an English initial W (as in wonk) than an L.)
15137 * This means that, for example, 'a | b & c' is stored on the stack as
15145 * where the numbers in brackets give the stack [array] element number.
15146 * In this implementation, parentheses are not stored on the stack.
15147 * Instead a '(' creates a "fence" so that the part of the stack below the
15148 * fence is invisible except to the corresponding ')' (this allows us to
15149 * replace testing for parens, by using instead subtraction of the fence
15150 * position). As new operands are processed they are pushed onto the stack
15151 * (except as noted in the next paragraph). New operators of higher
15152 * precedence than the current final one are inserted on the stack before
15153 * the lhs operand (so that when the rhs is pushed next, everything will be
15154 * in the correct positions shown above. When an operator of equal or
15155 * lower precedence is encountered in parsing, all the stacked operations
15156 * of equal or higher precedence are evaluated, leaving the result as the
15157 * top entry on the stack. This makes higher precedence operations
15158 * evaluate before lower precedence ones, and causes operations of equal
15159 * precedence to left associate.
15161 * The only unary operator '!' is immediately pushed onto the stack when
15162 * encountered. When an operand is encountered, if the top of the stack is
15163 * a '!", the complement is immediately performed, and the '!' popped. The
15164 * resulting value is treated as a new operand, and the logic in the
15165 * previous paragraph is executed. Thus in the expression
15167 * the stack looks like
15173 * as 'b' gets parsed, the latter gets evaluated to '!b', and the stack
15180 * A ')' is treated as an operator with lower precedence than all the
15181 * aforementioned ones, which causes all operations on the stack above the
15182 * corresponding '(' to be evaluated down to a single resultant operand.
15183 * Then the fence for the '(' is removed, and the operand goes through the
15184 * algorithm above, without the fence.
15186 * A separate stack is kept of the fence positions, so that the position of
15187 * the latest so-far unbalanced '(' is at the top of it.
15189 * The ']' ending the construct is treated as the lowest operator of all,
15190 * so that everything gets evaluated down to a single operand, which is the
15193 sv_2mortal((SV *)(stack = newAV()));
15194 sv_2mortal((SV *)(fence_stack = newAV()));
15196 while (RExC_parse < RExC_end) {
15197 I32 top_index; /* Index of top-most element in 'stack' */
15198 SV** top_ptr; /* Pointer to top 'stack' element */
15199 SV* current = NULL; /* To contain the current inversion list
15201 SV* only_to_avoid_leaks;
15203 skip_to_be_ignored_text(pRExC_state, &RExC_parse,
15204 TRUE /* Force /x */ );
15205 if (RExC_parse >= RExC_end) {
15206 Perl_croak(aTHX_ "panic: Read past end of '(?[ ])'");
15209 curchar = UCHARAT(RExC_parse);
15213 #ifdef ENABLE_REGEX_SETS_DEBUGGING
15214 /* Enable with -Accflags=-DENABLE_REGEX_SETS_DEBUGGING */
15215 DEBUG_U(dump_regex_sets_structures(pRExC_state,
15216 stack, fence, fence_stack));
15219 top_index = av_tindex_skip_len_mg(stack);
15222 SV** stacked_ptr; /* Ptr to something already on 'stack' */
15223 char stacked_operator; /* The topmost operator on the 'stack'. */
15224 SV* lhs; /* Operand to the left of the operator */
15225 SV* rhs; /* Operand to the right of the operator */
15226 SV* fence_ptr; /* Pointer to top element of the fence
15231 if ( RExC_parse < RExC_end - 1
15232 && (UCHARAT(RExC_parse + 1) == '?'))
15234 /* If is a '(?', could be an embedded '(?flags:(?[...])'.
15235 * This happens when we have some thing like
15237 * my $thai_or_lao = qr/(?[ \p{Thai} + \p{Lao} ])/;
15239 * qr/(?[ \p{Digit} & $thai_or_lao ])/;
15241 * Here we would be handling the interpolated
15242 * '$thai_or_lao'. We handle this by a recursive call to
15243 * ourselves which returns the inversion list the
15244 * interpolated expression evaluates to. We use the flags
15245 * from the interpolated pattern. */
15246 U32 save_flags = RExC_flags;
15247 const char * save_parse;
15249 RExC_parse += 2; /* Skip past the '(?' */
15250 save_parse = RExC_parse;
15252 /* Parse any flags for the '(?' */
15253 parse_lparen_question_flags(pRExC_state);
15255 if (RExC_parse == save_parse /* Makes sure there was at
15256 least one flag (or else
15257 this embedding wasn't
15259 || RExC_parse >= RExC_end - 4
15260 || UCHARAT(RExC_parse) != ':'
15261 || UCHARAT(++RExC_parse) != '('
15262 || UCHARAT(++RExC_parse) != '?'
15263 || UCHARAT(++RExC_parse) != '[')
15266 /* In combination with the above, this moves the
15267 * pointer to the point just after the first erroneous
15268 * character (or if there are no flags, to where they
15269 * should have been) */
15270 if (RExC_parse >= RExC_end - 4) {
15271 RExC_parse = RExC_end;
15273 else if (RExC_parse != save_parse) {
15274 RExC_parse += (UTF) ? UTF8SKIP(RExC_parse) : 1;
15276 vFAIL("Expecting '(?flags:(?[...'");
15279 /* Recurse, with the meat of the embedded expression */
15281 (void) handle_regex_sets(pRExC_state, ¤t, flagp,
15282 depth+1, oregcomp_parse);
15284 /* Here, 'current' contains the embedded expression's
15285 * inversion list, and RExC_parse points to the trailing
15286 * ']'; the next character should be the ')' */
15288 if (UCHARAT(RExC_parse) != ')')
15289 vFAIL("Expecting close paren for nested extended charclass");
15291 /* Then the ')' matching the original '(' handled by this
15292 * case: statement */
15294 if (UCHARAT(RExC_parse) != ')')
15295 vFAIL("Expecting close paren for wrapper for nested extended charclass");
15298 RExC_flags = save_flags;
15299 goto handle_operand;
15302 /* A regular '('. Look behind for illegal syntax */
15303 if (top_index - fence >= 0) {
15304 /* If the top entry on the stack is an operator, it had
15305 * better be a '!', otherwise the entry below the top
15306 * operand should be an operator */
15307 if ( ! (top_ptr = av_fetch(stack, top_index, FALSE))
15308 || (IS_OPERATOR(*top_ptr) && SvUV(*top_ptr) != '!')
15309 || ( IS_OPERAND(*top_ptr)
15310 && ( top_index - fence < 1
15311 || ! (stacked_ptr = av_fetch(stack,
15314 || ! IS_OPERATOR(*stacked_ptr))))
15317 vFAIL("Unexpected '(' with no preceding operator");
15321 /* Stack the position of this undealt-with left paren */
15322 av_push(fence_stack, newSViv(fence));
15323 fence = top_index + 1;
15327 /* regclass() can only return RESTART_PASS1 and NEED_UTF8 if
15328 * multi-char folds are allowed. */
15329 if (!regclass(pRExC_state, flagp,depth+1,
15330 TRUE, /* means parse just the next thing */
15331 FALSE, /* don't allow multi-char folds */
15332 FALSE, /* don't silence non-portable warnings. */
15334 FALSE, /* Require return to be an ANYOF */
15338 FAIL2("panic: regclass returned NULL to handle_sets, "
15339 "flags=%#" UVxf, (UV) *flagp);
15342 /* regclass() will return with parsing just the \ sequence,
15343 * leaving the parse pointer at the next thing to parse */
15345 goto handle_operand;
15347 case '[': /* Is a bracketed character class */
15349 /* See if this is a [:posix:] class. */
15350 bool is_posix_class = (OOB_NAMEDCLASS
15351 < handle_possible_posix(pRExC_state,
15355 TRUE /* checking only */));
15356 /* If it is a posix class, leave the parse pointer at the '['
15357 * to fool regclass() into thinking it is part of a
15358 * '[[:posix:]]'. */
15359 if (! is_posix_class) {
15363 /* regclass() can only return RESTART_PASS1 and NEED_UTF8 if
15364 * multi-char folds are allowed. */
15365 if (!regclass(pRExC_state, flagp,depth+1,
15366 is_posix_class, /* parse the whole char
15367 class only if not a
15369 FALSE, /* don't allow multi-char folds */
15370 TRUE, /* silence non-portable warnings. */
15372 FALSE, /* Require return to be an ANYOF */
15377 FAIL2("panic: regclass returned NULL to handle_sets, "
15378 "flags=%#" UVxf, (UV) *flagp);
15381 /* function call leaves parse pointing to the ']', except if we
15383 if (is_posix_class) {
15387 goto handle_operand;
15391 if (top_index >= 1) {
15392 goto join_operators;
15395 /* Only a single operand on the stack: are done */
15399 if (av_tindex_skip_len_mg(fence_stack) < 0) {
15401 vFAIL("Unexpected ')'");
15404 /* If nothing after the fence, is missing an operand */
15405 if (top_index - fence < 0) {
15409 /* If at least two things on the stack, treat this as an
15411 if (top_index - fence >= 1) {
15412 goto join_operators;
15415 /* Here only a single thing on the fenced stack, and there is a
15416 * fence. Get rid of it */
15417 fence_ptr = av_pop(fence_stack);
15419 fence = SvIV(fence_ptr) - 1;
15420 SvREFCNT_dec_NN(fence_ptr);
15427 /* Having gotten rid of the fence, we pop the operand at the
15428 * stack top and process it as a newly encountered operand */
15429 current = av_pop(stack);
15430 if (IS_OPERAND(current)) {
15431 goto handle_operand;
15443 /* These binary operators should have a left operand already
15445 if ( top_index - fence < 0
15446 || top_index - fence == 1
15447 || ( ! (top_ptr = av_fetch(stack, top_index, FALSE)))
15448 || ! IS_OPERAND(*top_ptr))
15450 goto unexpected_binary;
15453 /* If only the one operand is on the part of the stack visible
15454 * to us, we just place this operator in the proper position */
15455 if (top_index - fence < 2) {
15457 /* Place the operator before the operand */
15459 SV* lhs = av_pop(stack);
15460 av_push(stack, newSVuv(curchar));
15461 av_push(stack, lhs);
15465 /* But if there is something else on the stack, we need to
15466 * process it before this new operator if and only if the
15467 * stacked operation has equal or higher precedence than the
15472 /* The operator on the stack is supposed to be below both its
15474 if ( ! (stacked_ptr = av_fetch(stack, top_index - 2, FALSE))
15475 || IS_OPERAND(*stacked_ptr))
15477 /* But if not, it's legal and indicates we are completely
15478 * done if and only if we're currently processing a ']',
15479 * which should be the final thing in the expression */
15480 if (curchar == ']') {
15486 vFAIL2("Unexpected binary operator '%c' with no "
15487 "preceding operand", curchar);
15489 stacked_operator = (char) SvUV(*stacked_ptr);
15491 if (regex_set_precedence(curchar)
15492 > regex_set_precedence(stacked_operator))
15494 /* Here, the new operator has higher precedence than the
15495 * stacked one. This means we need to add the new one to
15496 * the stack to await its rhs operand (and maybe more
15497 * stuff). We put it before the lhs operand, leaving
15498 * untouched the stacked operator and everything below it
15500 lhs = av_pop(stack);
15501 assert(IS_OPERAND(lhs));
15503 av_push(stack, newSVuv(curchar));
15504 av_push(stack, lhs);
15508 /* Here, the new operator has equal or lower precedence than
15509 * what's already there. This means the operation already
15510 * there should be performed now, before the new one. */
15512 rhs = av_pop(stack);
15513 if (! IS_OPERAND(rhs)) {
15515 /* This can happen when a ! is not followed by an operand,
15516 * like in /(?[\t &!])/ */
15520 lhs = av_pop(stack);
15522 if (! IS_OPERAND(lhs)) {
15524 /* This can happen when there is an empty (), like in
15525 * /(?[[0]+()+])/ */
15529 switch (stacked_operator) {
15531 _invlist_intersection(lhs, rhs, &rhs);
15536 _invlist_union(lhs, rhs, &rhs);
15540 _invlist_subtract(lhs, rhs, &rhs);
15543 case '^': /* The union minus the intersection */
15548 _invlist_union(lhs, rhs, &u);
15549 _invlist_intersection(lhs, rhs, &i);
15550 _invlist_subtract(u, i, &rhs);
15551 SvREFCNT_dec_NN(i);
15552 SvREFCNT_dec_NN(u);
15558 /* Here, the higher precedence operation has been done, and the
15559 * result is in 'rhs'. We overwrite the stacked operator with
15560 * the result. Then we redo this code to either push the new
15561 * operator onto the stack or perform any higher precedence
15562 * stacked operation */
15563 only_to_avoid_leaks = av_pop(stack);
15564 SvREFCNT_dec(only_to_avoid_leaks);
15565 av_push(stack, rhs);
15568 case '!': /* Highest priority, right associative */
15570 /* If what's already at the top of the stack is another '!",
15571 * they just cancel each other out */
15572 if ( (top_ptr = av_fetch(stack, top_index, FALSE))
15573 && (IS_OPERATOR(*top_ptr) && SvUV(*top_ptr) == '!'))
15575 only_to_avoid_leaks = av_pop(stack);
15576 SvREFCNT_dec(only_to_avoid_leaks);
15578 else { /* Otherwise, since it's right associative, just push
15580 av_push(stack, newSVuv(curchar));
15585 RExC_parse += (UTF) ? UTF8SKIP(RExC_parse) : 1;
15586 vFAIL("Unexpected character");
15590 /* Here 'current' is the operand. If something is already on the
15591 * stack, we have to check if it is a !. But first, the code above
15592 * may have altered the stack in the time since we earlier set
15595 top_index = av_tindex_skip_len_mg(stack);
15596 if (top_index - fence >= 0) {
15597 /* If the top entry on the stack is an operator, it had better
15598 * be a '!', otherwise the entry below the top operand should
15599 * be an operator */
15600 top_ptr = av_fetch(stack, top_index, FALSE);
15602 if (IS_OPERATOR(*top_ptr)) {
15604 /* The only permissible operator at the top of the stack is
15605 * '!', which is applied immediately to this operand. */
15606 curchar = (char) SvUV(*top_ptr);
15607 if (curchar != '!') {
15608 SvREFCNT_dec(current);
15609 vFAIL2("Unexpected binary operator '%c' with no "
15610 "preceding operand", curchar);
15613 _invlist_invert(current);
15615 only_to_avoid_leaks = av_pop(stack);
15616 SvREFCNT_dec(only_to_avoid_leaks);
15618 /* And we redo with the inverted operand. This allows
15619 * handling multiple ! in a row */
15620 goto handle_operand;
15622 /* Single operand is ok only for the non-binary ')'
15624 else if ((top_index - fence == 0 && curchar != ')')
15625 || (top_index - fence > 0
15626 && (! (stacked_ptr = av_fetch(stack,
15629 || IS_OPERAND(*stacked_ptr))))
15631 SvREFCNT_dec(current);
15632 vFAIL("Operand with no preceding operator");
15636 /* Here there was nothing on the stack or the top element was
15637 * another operand. Just add this new one */
15638 av_push(stack, current);
15640 } /* End of switch on next parse token */
15642 RExC_parse += (UTF) ? UTF8SKIP(RExC_parse) : 1;
15643 } /* End of loop parsing through the construct */
15646 if (av_tindex_skip_len_mg(fence_stack) >= 0) {
15647 vFAIL("Unmatched (");
15650 if (av_tindex_skip_len_mg(stack) < 0 /* Was empty */
15651 || ((final = av_pop(stack)) == NULL)
15652 || ! IS_OPERAND(final)
15653 || SvTYPE(final) != SVt_INVLIST
15654 || av_tindex_skip_len_mg(stack) >= 0) /* More left on stack */
15657 SvREFCNT_dec(final);
15658 vFAIL("Incomplete expression within '(?[ ])'");
15661 /* Here, 'final' is the resultant inversion list from evaluating the
15662 * expression. Return it if so requested */
15663 if (return_invlist) {
15664 *return_invlist = final;
15668 /* Otherwise generate a resultant node, based on 'final'. regclass() is
15669 * expecting a string of ranges and individual code points */
15670 invlist_iterinit(final);
15671 result_string = newSVpvs("");
15672 while (invlist_iternext(final, &start, &end)) {
15673 if (start == end) {
15674 Perl_sv_catpvf(aTHX_ result_string, "\\x{%" UVXf "}", start);
15677 Perl_sv_catpvf(aTHX_ result_string, "\\x{%" UVXf "}-\\x{%" UVXf "}",
15682 /* About to generate an ANYOF (or similar) node from the inversion list we
15683 * have calculated */
15684 save_parse = RExC_parse;
15685 RExC_parse = SvPV(result_string, len);
15686 save_end = RExC_end;
15687 RExC_end = RExC_parse + len;
15689 /* We turn off folding around the call, as the class we have constructed
15690 * already has all folding taken into consideration, and we don't want
15691 * regclass() to add to that */
15692 RExC_flags &= ~RXf_PMf_FOLD;
15693 /* regclass() can only return RESTART_PASS1 and NEED_UTF8 if multi-char
15694 * folds are allowed. */
15695 node = regclass(pRExC_state, flagp,depth+1,
15696 FALSE, /* means parse the whole char class */
15697 FALSE, /* don't allow multi-char folds */
15698 TRUE, /* silence non-portable warnings. The above may very
15699 well have generated non-portable code points, but
15700 they're valid on this machine */
15701 FALSE, /* similarly, no need for strict */
15702 FALSE, /* Require return to be an ANYOF */
15707 FAIL2("panic: regclass returned NULL to handle_sets, flags=%#" UVxf,
15710 /* Fix up the node type if we are in locale. (We have pretended we are
15711 * under /u for the purposes of regclass(), as this construct will only
15712 * work under UTF-8 locales. But now we change the opcode to be ANYOFL (so
15713 * as to cause any warnings about bad locales to be output in regexec.c),
15714 * and add the flag that indicates to check if not in a UTF-8 locale. The
15715 * reason we above forbid optimization into something other than an ANYOF
15716 * node is simply to minimize the number of code changes in regexec.c.
15717 * Otherwise we would have to create new EXACTish node types and deal with
15718 * them. This decision could be revisited should this construct become
15721 * (One might think we could look at the resulting ANYOF node and suppress
15722 * the flag if everything is above 255, as those would be UTF-8 only,
15723 * but this isn't true, as the components that led to that result could
15724 * have been locale-affected, and just happen to cancel each other out
15725 * under UTF-8 locales.) */
15727 set_regex_charset(&RExC_flags, REGEX_LOCALE_CHARSET);
15729 assert(OP(node) == ANYOF);
15733 |= ANYOFL_SHARED_UTF8_LOCALE_fold_HAS_MATCHES_nonfold_REQD;
15737 RExC_flags |= RXf_PMf_FOLD;
15740 RExC_parse = save_parse + 1;
15741 RExC_end = save_end;
15742 SvREFCNT_dec_NN(final);
15743 SvREFCNT_dec_NN(result_string);
15745 nextchar(pRExC_state);
15746 Set_Node_Length(node, RExC_parse - oregcomp_parse + 1); /* MJD */
15750 #ifdef ENABLE_REGEX_SETS_DEBUGGING
15753 S_dump_regex_sets_structures(pTHX_ RExC_state_t *pRExC_state,
15754 AV * stack, const IV fence, AV * fence_stack)
15755 { /* Dumps the stacks in handle_regex_sets() */
15757 const SSize_t stack_top = av_tindex_skip_len_mg(stack);
15758 const SSize_t fence_stack_top = av_tindex_skip_len_mg(fence_stack);
15761 PERL_ARGS_ASSERT_DUMP_REGEX_SETS_STRUCTURES;
15763 PerlIO_printf(Perl_debug_log, "\nParse position is:%s\n", RExC_parse);
15765 if (stack_top < 0) {
15766 PerlIO_printf(Perl_debug_log, "Nothing on stack\n");
15769 PerlIO_printf(Perl_debug_log, "Stack: (fence=%d)\n", (int) fence);
15770 for (i = stack_top; i >= 0; i--) {
15771 SV ** element_ptr = av_fetch(stack, i, FALSE);
15772 if (! element_ptr) {
15775 if (IS_OPERATOR(*element_ptr)) {
15776 PerlIO_printf(Perl_debug_log, "[%d]: %c\n",
15777 (int) i, (int) SvIV(*element_ptr));
15780 PerlIO_printf(Perl_debug_log, "[%d] ", (int) i);
15781 sv_dump(*element_ptr);
15786 if (fence_stack_top < 0) {
15787 PerlIO_printf(Perl_debug_log, "Nothing on fence_stack\n");
15790 PerlIO_printf(Perl_debug_log, "Fence_stack: \n");
15791 for (i = fence_stack_top; i >= 0; i--) {
15792 SV ** element_ptr = av_fetch(fence_stack, i, FALSE);
15793 if (! element_ptr) {
15796 PerlIO_printf(Perl_debug_log, "[%d]: %d\n",
15797 (int) i, (int) SvIV(*element_ptr));
15808 S_add_above_Latin1_folds(pTHX_ RExC_state_t *pRExC_state, const U8 cp, SV** invlist)
15810 /* This hard-codes the Latin1/above-Latin1 folding rules, so that an
15811 * innocent-looking character class, like /[ks]/i won't have to go out to
15812 * disk to find the possible matches.
15814 * This should be called only for a Latin1-range code points, cp, which is
15815 * known to be involved in a simple fold with other code points above
15816 * Latin1. It would give false results if /aa has been specified.
15817 * Multi-char folds are outside the scope of this, and must be handled
15820 * XXX It would be better to generate these via regen, in case a new
15821 * version of the Unicode standard adds new mappings, though that is not
15822 * really likely, and may be caught by the default: case of the switch
15825 PERL_ARGS_ASSERT_ADD_ABOVE_LATIN1_FOLDS;
15827 assert(HAS_NONLATIN1_SIMPLE_FOLD_CLOSURE(cp));
15833 add_cp_to_invlist(*invlist, KELVIN_SIGN);
15837 *invlist = add_cp_to_invlist(*invlist, LATIN_SMALL_LETTER_LONG_S);
15840 *invlist = add_cp_to_invlist(*invlist, GREEK_CAPITAL_LETTER_MU);
15841 *invlist = add_cp_to_invlist(*invlist, GREEK_SMALL_LETTER_MU);
15843 case LATIN_CAPITAL_LETTER_A_WITH_RING_ABOVE:
15844 case LATIN_SMALL_LETTER_A_WITH_RING_ABOVE:
15845 *invlist = add_cp_to_invlist(*invlist, ANGSTROM_SIGN);
15847 case LATIN_SMALL_LETTER_Y_WITH_DIAERESIS:
15848 *invlist = add_cp_to_invlist(*invlist,
15849 LATIN_CAPITAL_LETTER_Y_WITH_DIAERESIS);
15852 #ifdef LATIN_CAPITAL_LETTER_SHARP_S /* not defined in early Unicode releases */
15854 case LATIN_SMALL_LETTER_SHARP_S:
15855 *invlist = add_cp_to_invlist(*invlist, LATIN_CAPITAL_LETTER_SHARP_S);
15860 #if UNICODE_MAJOR_VERSION < 3 \
15861 || (UNICODE_MAJOR_VERSION == 3 && UNICODE_DOT_VERSION == 0)
15863 /* In 3.0 and earlier, U+0130 folded simply to 'i'; and in 3.0.1 so did
15868 add_cp_to_invlist(*invlist, LATIN_CAPITAL_LETTER_I_WITH_DOT_ABOVE);
15869 # if UNICODE_DOT_DOT_VERSION == 1
15870 *invlist = add_cp_to_invlist(*invlist, LATIN_SMALL_LETTER_DOTLESS_I);
15876 /* Use deprecated warning to increase the chances of this being
15879 ckWARN2reg_d(RExC_parse, "Perl folding rules are not up-to-date for 0x%02X; please use the perlbug utility to report;", cp);
15886 S_output_or_return_posix_warnings(pTHX_ RExC_state_t *pRExC_state, AV* posix_warnings, AV** return_posix_warnings)
15888 /* If the final parameter is NULL, output the elements of the array given
15889 * by '*posix_warnings' as REGEXP warnings. Otherwise, the elements are
15890 * pushed onto it, (creating if necessary) */
15893 const bool first_is_fatal = ! return_posix_warnings
15894 && ckDEAD(packWARN(WARN_REGEXP));
15896 PERL_ARGS_ASSERT_OUTPUT_OR_RETURN_POSIX_WARNINGS;
15898 while ((msg = av_shift(posix_warnings)) != &PL_sv_undef) {
15899 if (return_posix_warnings) {
15900 if (! *return_posix_warnings) { /* mortalize to not leak if
15901 warnings are fatal */
15902 *return_posix_warnings = (AV *) sv_2mortal((SV *) newAV());
15904 av_push(*return_posix_warnings, msg);
15907 if (first_is_fatal) { /* Avoid leaking this */
15908 av_undef(posix_warnings); /* This isn't necessary if the
15909 array is mortal, but is a
15911 (void) sv_2mortal(msg);
15913 SAVEFREESV(RExC_rx_sv);
15916 Perl_warner(aTHX_ packWARN(WARN_REGEXP), "%s", SvPVX(msg));
15917 SvREFCNT_dec_NN(msg);
15923 S_add_multi_match(pTHX_ AV* multi_char_matches, SV* multi_string, const STRLEN cp_count)
15925 /* This adds the string scalar <multi_string> to the array
15926 * <multi_char_matches>. <multi_string> is known to have exactly
15927 * <cp_count> code points in it. This is used when constructing a
15928 * bracketed character class and we find something that needs to match more
15929 * than a single character.
15931 * <multi_char_matches> is actually an array of arrays. Each top-level
15932 * element is an array that contains all the strings known so far that are
15933 * the same length. And that length (in number of code points) is the same
15934 * as the index of the top-level array. Hence, the [2] element is an
15935 * array, each element thereof is a string containing TWO code points;
15936 * while element [3] is for strings of THREE characters, and so on. Since
15937 * this is for multi-char strings there can never be a [0] nor [1] element.
15939 * When we rewrite the character class below, we will do so such that the
15940 * longest strings are written first, so that it prefers the longest
15941 * matching strings first. This is done even if it turns out that any
15942 * quantifier is non-greedy, out of this programmer's (khw) laziness. Tom
15943 * Christiansen has agreed that this is ok. This makes the test for the
15944 * ligature 'ffi' come before the test for 'ff', for example */
15947 AV** this_array_ptr;
15949 PERL_ARGS_ASSERT_ADD_MULTI_MATCH;
15951 if (! multi_char_matches) {
15952 multi_char_matches = newAV();
15955 if (av_exists(multi_char_matches, cp_count)) {
15956 this_array_ptr = (AV**) av_fetch(multi_char_matches, cp_count, FALSE);
15957 this_array = *this_array_ptr;
15960 this_array = newAV();
15961 av_store(multi_char_matches, cp_count,
15964 av_push(this_array, multi_string);
15966 return multi_char_matches;
15969 /* The names of properties whose definitions are not known at compile time are
15970 * stored in this SV, after a constant heading. So if the length has been
15971 * changed since initialization, then there is a run-time definition. */
15972 #define HAS_NONLOCALE_RUNTIME_PROPERTY_DEFINITION \
15973 (SvCUR(listsv) != initial_listsv_len)
15975 /* There is a restricted set of white space characters that are legal when
15976 * ignoring white space in a bracketed character class. This generates the
15977 * code to skip them.
15979 * There is a line below that uses the same white space criteria but is outside
15980 * this macro. Both here and there must use the same definition */
15981 #define SKIP_BRACKETED_WHITE_SPACE(do_skip, p) \
15984 while (isBLANK_A(UCHARAT(p))) \
15992 S_regclass(pTHX_ RExC_state_t *pRExC_state, I32 *flagp, U32 depth,
15993 const bool stop_at_1, /* Just parse the next thing, don't
15994 look for a full character class */
15995 bool allow_multi_folds,
15996 const bool silence_non_portable, /* Don't output warnings
16000 bool optimizable, /* ? Allow a non-ANYOF return
16002 SV** ret_invlist, /* Return an inversion list, not a node */
16003 AV** return_posix_warnings
16006 /* parse a bracketed class specification. Most of these will produce an
16007 * ANYOF node; but something like [a] will produce an EXACT node; [aA], an
16008 * EXACTFish node; [[:ascii:]], a POSIXA node; etc. It is more complex
16009 * under /i with multi-character folds: it will be rewritten following the
16010 * paradigm of this example, where the <multi-fold>s are characters which
16011 * fold to multiple character sequences:
16012 * /[abc\x{multi-fold1}def\x{multi-fold2}ghi]/i
16013 * gets effectively rewritten as:
16014 * /(?:\x{multi-fold1}|\x{multi-fold2}|[abcdefghi]/i
16015 * reg() gets called (recursively) on the rewritten version, and this
16016 * function will return what it constructs. (Actually the <multi-fold>s
16017 * aren't physically removed from the [abcdefghi], it's just that they are
16018 * ignored in the recursion by means of a flag:
16019 * <RExC_in_multi_char_class>.)
16021 * ANYOF nodes contain a bit map for the first NUM_ANYOF_CODE_POINTS
16022 * characters, with the corresponding bit set if that character is in the
16023 * list. For characters above this, a range list or swash is used. There
16024 * are extra bits for \w, etc. in locale ANYOFs, as what these match is not
16025 * determinable at compile time
16027 * Returns NULL, setting *flagp to RESTART_PASS1 if the sizing scan needs
16028 * to be restarted, or'd with NEED_UTF8 if the pattern needs to be upgraded
16029 * to UTF-8. This can only happen if ret_invlist is non-NULL.
16032 UV prevvalue = OOB_UNICODE, save_prevvalue = OOB_UNICODE;
16034 UV value = OOB_UNICODE, save_value = OOB_UNICODE;
16037 int namedclass = OOB_NAMEDCLASS;
16038 char *rangebegin = NULL;
16039 bool need_class = 0;
16041 STRLEN initial_listsv_len = 0; /* Kind of a kludge to see if it is more
16042 than just initialized. */
16043 SV* properties = NULL; /* Code points that match \p{} \P{} */
16044 SV* posixes = NULL; /* Code points that match classes like [:word:],
16045 extended beyond the Latin1 range. These have to
16046 be kept separate from other code points for much
16047 of this function because their handling is
16048 different under /i, and for most classes under
16050 SV* nposixes = NULL; /* Similarly for [:^word:]. These are kept
16051 separate for a while from the non-complemented
16052 versions because of complications with /d
16054 SV* simple_posixes = NULL; /* But under some conditions, the classes can be
16055 treated more simply than the general case,
16056 leading to less compilation and execution
16058 UV element_count = 0; /* Number of distinct elements in the class.
16059 Optimizations may be possible if this is tiny */
16060 AV * multi_char_matches = NULL; /* Code points that fold to more than one
16061 character; used under /i */
16063 char * stop_ptr = RExC_end; /* where to stop parsing */
16065 /* ignore unescaped whitespace? */
16066 const bool skip_white = cBOOL( ret_invlist
16067 || (RExC_flags & RXf_PMf_EXTENDED_MORE));
16069 /* Unicode properties are stored in a swash; this holds the current one
16070 * being parsed. If this swash is the only above-latin1 component of the
16071 * character class, an optimization is to pass it directly on to the
16072 * execution engine. Otherwise, it is set to NULL to indicate that there
16073 * are other things in the class that have to be dealt with at execution
16075 SV* swash = NULL; /* Code points that match \p{} \P{} */
16077 /* Set if a component of this character class is user-defined; just passed
16078 * on to the engine */
16079 bool has_user_defined_property = FALSE;
16081 /* inversion list of code points this node matches only when the target
16082 * string is in UTF-8. These are all non-ASCII, < 256. (Because is under
16084 SV* has_upper_latin1_only_utf8_matches = NULL;
16086 /* Inversion list of code points this node matches regardless of things
16087 * like locale, folding, utf8ness of the target string */
16088 SV* cp_list = NULL;
16090 /* Like cp_list, but code points on this list need to be checked for things
16091 * that fold to/from them under /i */
16092 SV* cp_foldable_list = NULL;
16094 /* Like cp_list, but code points on this list are valid only when the
16095 * runtime locale is UTF-8 */
16096 SV* only_utf8_locale_list = NULL;
16098 /* In a range, if one of the endpoints is non-character-set portable,
16099 * meaning that it hard-codes a code point that may mean a different
16100 * charactger in ASCII vs. EBCDIC, as opposed to, say, a literal 'A' or a
16101 * mnemonic '\t' which each mean the same character no matter which
16102 * character set the platform is on. */
16103 unsigned int non_portable_endpoint = 0;
16105 /* Is the range unicode? which means on a platform that isn't 1-1 native
16106 * to Unicode (i.e. non-ASCII), each code point in it should be considered
16107 * to be a Unicode value. */
16108 bool unicode_range = FALSE;
16109 bool invert = FALSE; /* Is this class to be complemented */
16111 bool warn_super = ALWAYS_WARN_SUPER;
16113 regnode * const orig_emit = RExC_emit; /* Save the original RExC_emit in
16114 case we need to change the emitted regop to an EXACT. */
16115 const char * orig_parse = RExC_parse;
16116 const SSize_t orig_size = RExC_size;
16117 bool posixl_matches_all = FALSE; /* Does /l class have both e.g. \W,\w ? */
16119 /* This variable is used to mark where the end in the input is of something
16120 * that looks like a POSIX construct but isn't. During the parse, when
16121 * something looks like it could be such a construct is encountered, it is
16122 * checked for being one, but not if we've already checked this area of the
16123 * input. Only after this position is reached do we check again */
16124 char *not_posix_region_end = RExC_parse - 1;
16126 AV* posix_warnings = NULL;
16127 const bool do_posix_warnings = return_posix_warnings
16128 || (PASS2 && ckWARN(WARN_REGEXP));
16130 GET_RE_DEBUG_FLAGS_DECL;
16132 PERL_ARGS_ASSERT_REGCLASS;
16134 PERL_UNUSED_ARG(depth);
16137 DEBUG_PARSE("clas");
16139 #if UNICODE_MAJOR_VERSION < 3 /* no multifolds in early Unicode */ \
16140 || (UNICODE_MAJOR_VERSION == 3 && UNICODE_DOT_VERSION == 0 \
16141 && UNICODE_DOT_DOT_VERSION == 0)
16142 allow_multi_folds = FALSE;
16145 /* Assume we are going to generate an ANYOF node. */
16146 ret = reganode(pRExC_state,
16153 RExC_size += ANYOF_SKIP;
16154 listsv = &PL_sv_undef; /* For code scanners: listsv always non-NULL. */
16157 ANYOF_FLAGS(ret) = 0;
16159 RExC_emit += ANYOF_SKIP;
16160 listsv = newSVpvs_flags("# comment\n", SVs_TEMP);
16161 initial_listsv_len = SvCUR(listsv);
16162 SvTEMP_off(listsv); /* Grr, TEMPs and mortals are conflated. */
16165 SKIP_BRACKETED_WHITE_SPACE(skip_white, RExC_parse);
16167 assert(RExC_parse <= RExC_end);
16169 if (UCHARAT(RExC_parse) == '^') { /* Complement the class */
16172 allow_multi_folds = FALSE;
16174 SKIP_BRACKETED_WHITE_SPACE(skip_white, RExC_parse);
16177 /* Check that they didn't say [:posix:] instead of [[:posix:]] */
16178 if (! ret_invlist && MAYBE_POSIXCC(UCHARAT(RExC_parse))) {
16179 int maybe_class = handle_possible_posix(pRExC_state,
16181 ¬_posix_region_end,
16183 TRUE /* checking only */);
16184 if (PASS2 && maybe_class >= OOB_NAMEDCLASS && do_posix_warnings) {
16185 SAVEFREESV(RExC_rx_sv);
16186 ckWARN4reg(not_posix_region_end,
16187 "POSIX syntax [%c %c] belongs inside character classes%s",
16188 *RExC_parse, *RExC_parse,
16189 (maybe_class == OOB_NAMEDCLASS)
16190 ? ((POSIXCC_NOTYET(*RExC_parse))
16191 ? " (but this one isn't implemented)"
16192 : " (but this one isn't fully valid)")
16195 (void)ReREFCNT_inc(RExC_rx_sv);
16199 /* If the caller wants us to just parse a single element, accomplish this
16200 * by faking the loop ending condition */
16201 if (stop_at_1 && RExC_end > RExC_parse) {
16202 stop_ptr = RExC_parse + 1;
16205 /* allow 1st char to be ']' (allowing it to be '-' is dealt with later) */
16206 if (UCHARAT(RExC_parse) == ']')
16207 goto charclassloop;
16211 if ( posix_warnings
16212 && av_tindex_skip_len_mg(posix_warnings) >= 0
16213 && RExC_parse > not_posix_region_end)
16215 /* Warnings about posix class issues are considered tentative until
16216 * we are far enough along in the parse that we can no longer
16217 * change our mind, at which point we either output them or add
16218 * them, if it has so specified, to what gets returned to the
16219 * caller. This is done each time through the loop so that a later
16220 * class won't zap them before they have been dealt with. */
16221 output_or_return_posix_warnings(pRExC_state, posix_warnings,
16222 return_posix_warnings);
16225 if (RExC_parse >= stop_ptr) {
16229 SKIP_BRACKETED_WHITE_SPACE(skip_white, RExC_parse);
16231 if (UCHARAT(RExC_parse) == ']') {
16237 namedclass = OOB_NAMEDCLASS; /* initialize as illegal */
16238 save_value = value;
16239 save_prevvalue = prevvalue;
16242 rangebegin = RExC_parse;
16244 non_portable_endpoint = 0;
16246 if (UTF && ! UTF8_IS_INVARIANT(* RExC_parse)) {
16247 value = utf8n_to_uvchr((U8*)RExC_parse,
16248 RExC_end - RExC_parse,
16249 &numlen, UTF8_ALLOW_DEFAULT);
16250 RExC_parse += numlen;
16253 value = UCHARAT(RExC_parse++);
16255 if (value == '[') {
16256 char * posix_class_end;
16257 namedclass = handle_possible_posix(pRExC_state,
16260 do_posix_warnings ? &posix_warnings : NULL,
16261 FALSE /* die if error */);
16262 if (namedclass > OOB_NAMEDCLASS) {
16264 /* If there was an earlier attempt to parse this particular
16265 * posix class, and it failed, it was a false alarm, as this
16266 * successful one proves */
16267 if ( posix_warnings
16268 && av_tindex_skip_len_mg(posix_warnings) >= 0
16269 && not_posix_region_end >= RExC_parse
16270 && not_posix_region_end <= posix_class_end)
16272 av_undef(posix_warnings);
16275 RExC_parse = posix_class_end;
16277 else if (namedclass == OOB_NAMEDCLASS) {
16278 not_posix_region_end = posix_class_end;
16281 namedclass = OOB_NAMEDCLASS;
16284 else if ( RExC_parse - 1 > not_posix_region_end
16285 && MAYBE_POSIXCC(value))
16287 (void) handle_possible_posix(
16289 RExC_parse - 1, /* -1 because parse has already been
16291 ¬_posix_region_end,
16292 do_posix_warnings ? &posix_warnings : NULL,
16293 TRUE /* checking only */);
16295 else if ( strict && ! skip_white
16296 && ( _generic_isCC(value, _CC_VERTSPACE)
16297 || is_VERTWS_cp_high(value)))
16299 vFAIL("Literal vertical space in [] is illegal except under /x");
16301 else if (value == '\\') {
16302 /* Is a backslash; get the code point of the char after it */
16304 if (RExC_parse >= RExC_end) {
16305 vFAIL("Unmatched [");
16308 if (UTF && ! UTF8_IS_INVARIANT(UCHARAT(RExC_parse))) {
16309 value = utf8n_to_uvchr((U8*)RExC_parse,
16310 RExC_end - RExC_parse,
16311 &numlen, UTF8_ALLOW_DEFAULT);
16312 RExC_parse += numlen;
16315 value = UCHARAT(RExC_parse++);
16317 /* Some compilers cannot handle switching on 64-bit integer
16318 * values, therefore value cannot be an UV. Yes, this will
16319 * be a problem later if we want switch on Unicode.
16320 * A similar issue a little bit later when switching on
16321 * namedclass. --jhi */
16323 /* If the \ is escaping white space when white space is being
16324 * skipped, it means that that white space is wanted literally, and
16325 * is already in 'value'. Otherwise, need to translate the escape
16326 * into what it signifies. */
16327 if (! skip_white || ! isBLANK_A(value)) switch ((I32)value) {
16329 case 'w': namedclass = ANYOF_WORDCHAR; break;
16330 case 'W': namedclass = ANYOF_NWORDCHAR; break;
16331 case 's': namedclass = ANYOF_SPACE; break;
16332 case 'S': namedclass = ANYOF_NSPACE; break;
16333 case 'd': namedclass = ANYOF_DIGIT; break;
16334 case 'D': namedclass = ANYOF_NDIGIT; break;
16335 case 'v': namedclass = ANYOF_VERTWS; break;
16336 case 'V': namedclass = ANYOF_NVERTWS; break;
16337 case 'h': namedclass = ANYOF_HORIZWS; break;
16338 case 'H': namedclass = ANYOF_NHORIZWS; break;
16339 case 'N': /* Handle \N{NAME} in class */
16341 const char * const backslash_N_beg = RExC_parse - 2;
16344 if (! grok_bslash_N(pRExC_state,
16345 NULL, /* No regnode */
16346 &value, /* Yes single value */
16347 &cp_count, /* Multiple code pt count */
16353 if (*flagp & NEED_UTF8)
16354 FAIL("panic: grok_bslash_N set NEED_UTF8");
16355 if (*flagp & RESTART_PASS1)
16358 if (cp_count < 0) {
16359 vFAIL("\\N in a character class must be a named character: \\N{...}");
16361 else if (cp_count == 0) {
16363 ckWARNreg(RExC_parse,
16364 "Ignoring zero length \\N{} in character class");
16367 else { /* cp_count > 1 */
16368 if (! RExC_in_multi_char_class) {
16369 if (invert || range || *RExC_parse == '-') {
16372 vFAIL("\\N{} in inverted character class or as a range end-point is restricted to one character");
16375 ckWARNreg(RExC_parse, "Using just the first character returned by \\N{} in character class");
16377 break; /* <value> contains the first code
16378 point. Drop out of the switch to
16382 SV * multi_char_N = newSVpvn(backslash_N_beg,
16383 RExC_parse - backslash_N_beg);
16385 = add_multi_match(multi_char_matches,
16390 } /* End of cp_count != 1 */
16392 /* This element should not be processed further in this
16395 value = save_value;
16396 prevvalue = save_prevvalue;
16397 continue; /* Back to top of loop to get next char */
16400 /* Here, is a single code point, and <value> contains it */
16401 unicode_range = TRUE; /* \N{} are Unicode */
16409 /* We will handle any undefined properties ourselves */
16410 U8 swash_init_flags = _CORE_SWASH_INIT_RETURN_IF_UNDEF
16411 /* And we actually would prefer to get
16412 * the straight inversion list of the
16413 * swash, since we will be accessing it
16414 * anyway, to save a little time */
16415 |_CORE_SWASH_INIT_ACCEPT_INVLIST;
16417 if (RExC_parse >= RExC_end)
16418 vFAIL2("Empty \\%c", (U8)value);
16419 if (*RExC_parse == '{') {
16420 const U8 c = (U8)value;
16421 e = (char *) memchr(RExC_parse, '}', RExC_end - RExC_parse);
16424 vFAIL2("Missing right brace on \\%c{}", c);
16428 while (isSPACE(*RExC_parse)) {
16432 if (UCHARAT(RExC_parse) == '^') {
16434 /* toggle. (The rhs xor gets the single bit that
16435 * differs between P and p; the other xor inverts just
16437 value ^= 'P' ^ 'p';
16440 while (isSPACE(*RExC_parse)) {
16445 if (e == RExC_parse)
16446 vFAIL2("Empty \\%c{}", c);
16448 n = e - RExC_parse;
16449 while (isSPACE(*(RExC_parse + n - 1)))
16451 } /* The \p isn't immediately followed by a '{' */
16452 else if (! isALPHA(*RExC_parse)) {
16453 RExC_parse += (UTF) ? UTF8SKIP(RExC_parse) : 1;
16454 vFAIL2("Character following \\%c must be '{' or a "
16455 "single-character Unicode property name",
16465 char* base_name; /* name after any packages are stripped */
16466 char* lookup_name = NULL;
16467 const char * const colon_colon = "::";
16469 /* Try to get the definition of the property into
16470 * <invlist>. If /i is in effect, the effective property
16471 * will have its name be <__NAME_i>. The design is
16472 * discussed in commit
16473 * 2f833f5208e26b208886e51e09e2c072b5eabb46 */
16474 name = savepv(Perl_form(aTHX_ "%.*s", (int)n, RExC_parse));
16477 lookup_name = savepv(Perl_form(aTHX_ "__%s_i", name));
16479 /* The function call just below that uses this can fail
16480 * to return, leaking memory if we don't do this */
16481 SAVEFREEPV(lookup_name);
16484 /* Look up the property name, and get its swash and
16485 * inversion list, if the property is found */
16486 SvREFCNT_dec(swash); /* Free any left-overs */
16487 swash = _core_swash_init("utf8",
16494 NULL, /* No inversion list */
16497 if (! swash || ! (invlist = _get_swash_invlist(swash))) {
16498 HV* curpkg = (IN_PERL_COMPILETIME)
16500 : CopSTASH(PL_curcop);
16504 if (swash) { /* Got a swash but no inversion list.
16505 Something is likely wrong that will
16506 be sorted-out later */
16507 SvREFCNT_dec_NN(swash);
16511 /* Here didn't find it. It could be a an error (like a
16512 * typo) in specifying a Unicode property, or it could
16513 * be a user-defined property that will be available at
16514 * run-time. The names of these must begin with 'In'
16515 * or 'Is' (after any packages are stripped off). So
16516 * if not one of those, or if we accept only
16517 * compile-time properties, is an error; otherwise add
16518 * it to the list for run-time look up. */
16519 if ((base_name = rninstr(name, name + n,
16520 colon_colon, colon_colon + 2)))
16521 { /* Has ::. We know this must be a user-defined
16524 final_n -= base_name - name;
16533 || base_name[0] != 'I'
16534 || (base_name[1] != 's' && base_name[1] != 'n')
16537 const char * const msg
16539 ? "Illegal user-defined property name"
16540 : "Can't find Unicode property definition";
16541 RExC_parse = e + 1;
16543 /* diag_listed_as: Can't find Unicode property definition "%s" */
16544 vFAIL3utf8f("%s \"%" UTF8f "\"",
16545 msg, UTF8fARG(UTF, n, name));
16548 /* If the property name doesn't already have a package
16549 * name, add the current one to it so that it can be
16550 * referred to outside it. [perl #121777] */
16551 if (! has_pkg && curpkg) {
16552 char* pkgname = HvNAME(curpkg);
16553 if (memNEs(pkgname, HvNAMELEN(curpkg), "main")) {
16554 char* full_name = Perl_form(aTHX_
16558 n = strlen(full_name);
16559 name = savepvn(full_name, n);
16563 Perl_sv_catpvf(aTHX_ listsv, "%cutf8::%s%" UTF8f "%s\n",
16564 (value == 'p' ? '+' : '!'),
16565 (FOLD) ? "__" : "",
16566 UTF8fARG(UTF, n, name),
16567 (FOLD) ? "_i" : "");
16568 has_user_defined_property = TRUE;
16569 optimizable = FALSE; /* Will have to leave this an
16572 /* We don't know yet what this matches, so have to flag
16574 ANYOF_FLAGS(ret) |= ANYOF_SHARED_d_UPPER_LATIN1_UTF8_STRING_MATCHES_non_d_RUNTIME_USER_PROP;
16578 /* Here, did get the swash and its inversion list. If
16579 * the swash is from a user-defined property, then this
16580 * whole character class should be regarded as such */
16581 if (swash_init_flags
16582 & _CORE_SWASH_INIT_USER_DEFINED_PROPERTY)
16584 has_user_defined_property = TRUE;
16587 /* We warn on matching an above-Unicode code point
16588 * if the match would return true, except don't
16589 * warn for \p{All}, which has exactly one element
16591 (_invlist_contains_cp(invlist, 0x110000)
16592 && (! (_invlist_len(invlist) == 1
16593 && *invlist_array(invlist) == 0)))
16599 /* Invert if asking for the complement */
16600 if (value == 'P') {
16601 _invlist_union_complement_2nd(properties,
16605 /* The swash can't be used as-is, because we've
16606 * inverted things; delay removing it to here after
16607 * have copied its invlist above */
16608 SvREFCNT_dec_NN(swash);
16612 _invlist_union(properties, invlist, &properties);
16616 RExC_parse = e + 1;
16617 namedclass = ANYOF_UNIPROP; /* no official name, but it's
16620 /* \p means they want Unicode semantics */
16621 REQUIRE_UNI_RULES(flagp, NULL);
16624 case 'n': value = '\n'; break;
16625 case 'r': value = '\r'; break;
16626 case 't': value = '\t'; break;
16627 case 'f': value = '\f'; break;
16628 case 'b': value = '\b'; break;
16629 case 'e': value = ESC_NATIVE; break;
16630 case 'a': value = '\a'; break;
16632 RExC_parse--; /* function expects to be pointed at the 'o' */
16634 const char* error_msg;
16635 bool valid = grok_bslash_o(&RExC_parse,
16639 PASS2, /* warnings only in
16642 silence_non_portable,
16648 non_portable_endpoint++;
16651 RExC_parse--; /* function expects to be pointed at the 'x' */
16653 const char* error_msg;
16654 bool valid = grok_bslash_x(&RExC_parse,
16658 PASS2, /* Output warnings */
16660 silence_non_portable,
16666 non_portable_endpoint++;
16669 value = grok_bslash_c(*RExC_parse++, PASS2);
16670 non_portable_endpoint++;
16672 case '0': case '1': case '2': case '3': case '4':
16673 case '5': case '6': case '7':
16675 /* Take 1-3 octal digits */
16676 I32 flags = PERL_SCAN_SILENT_ILLDIGIT;
16677 numlen = (strict) ? 4 : 3;
16678 value = grok_oct(--RExC_parse, &numlen, &flags, NULL);
16679 RExC_parse += numlen;
16682 RExC_parse += (UTF) ? UTF8SKIP(RExC_parse) : 1;
16683 vFAIL("Need exactly 3 octal digits");
16685 else if (! SIZE_ONLY /* like \08, \178 */
16687 && RExC_parse < RExC_end
16688 && isDIGIT(*RExC_parse)
16689 && ckWARN(WARN_REGEXP))
16691 SAVEFREESV(RExC_rx_sv);
16692 reg_warn_non_literal_string(
16694 form_short_octal_warning(RExC_parse, numlen));
16695 (void)ReREFCNT_inc(RExC_rx_sv);
16698 non_portable_endpoint++;
16702 /* Allow \_ to not give an error */
16703 if (!SIZE_ONLY && isWORDCHAR(value) && value != '_') {
16705 vFAIL2("Unrecognized escape \\%c in character class",
16709 SAVEFREESV(RExC_rx_sv);
16710 ckWARN2reg(RExC_parse,
16711 "Unrecognized escape \\%c in character class passed through",
16713 (void)ReREFCNT_inc(RExC_rx_sv);
16717 } /* End of switch on char following backslash */
16718 } /* end of handling backslash escape sequences */
16720 /* Here, we have the current token in 'value' */
16722 if (namedclass > OOB_NAMEDCLASS) { /* this is a named class \blah */
16725 /* a bad range like a-\d, a-[:digit:]. The '-' is taken as a
16726 * literal, as is the character that began the false range, i.e.
16727 * the 'a' in the examples */
16730 const int w = (RExC_parse >= rangebegin)
16731 ? RExC_parse - rangebegin
16735 "False [] range \"%" UTF8f "\"",
16736 UTF8fARG(UTF, w, rangebegin));
16739 SAVEFREESV(RExC_rx_sv); /* in case of fatal warnings */
16740 ckWARN2reg(RExC_parse,
16741 "False [] range \"%" UTF8f "\"",
16742 UTF8fARG(UTF, w, rangebegin));
16743 (void)ReREFCNT_inc(RExC_rx_sv);
16744 cp_list = add_cp_to_invlist(cp_list, '-');
16745 cp_foldable_list = add_cp_to_invlist(cp_foldable_list,
16750 range = 0; /* this was not a true range */
16751 element_count += 2; /* So counts for three values */
16754 classnum = namedclass_to_classnum(namedclass);
16756 if (LOC && namedclass < ANYOF_POSIXL_MAX
16757 #ifndef HAS_ISASCII
16758 && classnum != _CC_ASCII
16761 /* What the Posix classes (like \w, [:space:]) match in locale
16762 * isn't knowable under locale until actual match time. Room
16763 * must be reserved (one time per outer bracketed class) to
16764 * store such classes. The space will contain a bit for each
16765 * named class that is to be matched against. This isn't
16766 * needed for \p{} and pseudo-classes, as they are not affected
16767 * by locale, and hence are dealt with separately */
16768 if (! need_class) {
16771 RExC_size += ANYOF_POSIXL_SKIP - ANYOF_SKIP;
16774 RExC_emit += ANYOF_POSIXL_SKIP - ANYOF_SKIP;
16776 ANYOF_FLAGS(ret) |= ANYOF_MATCHES_POSIXL;
16777 ANYOF_POSIXL_ZERO(ret);
16779 /* We can't change this into some other type of node
16780 * (unless this is the only element, in which case there
16781 * are nodes that mean exactly this) as has runtime
16783 optimizable = FALSE;
16786 /* Coverity thinks it is possible for this to be negative; both
16787 * jhi and khw think it's not, but be safer */
16788 assert(! (ANYOF_FLAGS(ret) & ANYOF_MATCHES_POSIXL)
16789 || (namedclass + ((namedclass % 2) ? -1 : 1)) >= 0);
16791 /* See if it already matches the complement of this POSIX
16793 if ((ANYOF_FLAGS(ret) & ANYOF_MATCHES_POSIXL)
16794 && ANYOF_POSIXL_TEST(ret, namedclass + ((namedclass % 2)
16798 posixl_matches_all = TRUE;
16799 break; /* No need to continue. Since it matches both
16800 e.g., \w and \W, it matches everything, and the
16801 bracketed class can be optimized into qr/./s */
16804 /* Add this class to those that should be checked at runtime */
16805 ANYOF_POSIXL_SET(ret, namedclass);
16807 /* The above-Latin1 characters are not subject to locale rules.
16808 * Just add them, in the second pass, to the
16809 * unconditionally-matched list */
16811 SV* scratch_list = NULL;
16813 /* Get the list of the above-Latin1 code points this
16815 _invlist_intersection_maybe_complement_2nd(PL_AboveLatin1,
16816 PL_XPosix_ptrs[classnum],
16818 /* Odd numbers are complements, like
16819 * NDIGIT, NASCII, ... */
16820 namedclass % 2 != 0,
16822 /* Checking if 'cp_list' is NULL first saves an extra
16823 * clone. Its reference count will be decremented at the
16824 * next union, etc, or if this is the only instance, at the
16825 * end of the routine */
16827 cp_list = scratch_list;
16830 _invlist_union(cp_list, scratch_list, &cp_list);
16831 SvREFCNT_dec_NN(scratch_list);
16833 continue; /* Go get next character */
16836 else if (! SIZE_ONLY) {
16838 /* Here, not in pass1 (in that pass we skip calculating the
16839 * contents of this class), and is not /l, or is a POSIX class
16840 * for which /l doesn't matter (or is a Unicode property, which
16841 * is skipped here). */
16842 if (namedclass >= ANYOF_POSIXL_MAX) { /* If a special class */
16843 if (namedclass != ANYOF_UNIPROP) { /* UNIPROP = \p and \P */
16845 /* Here, should be \h, \H, \v, or \V. None of /d, /i
16846 * nor /l make a difference in what these match,
16847 * therefore we just add what they match to cp_list. */
16848 if (classnum != _CC_VERTSPACE) {
16849 assert( namedclass == ANYOF_HORIZWS
16850 || namedclass == ANYOF_NHORIZWS);
16852 /* It turns out that \h is just a synonym for
16854 classnum = _CC_BLANK;
16857 _invlist_union_maybe_complement_2nd(
16859 PL_XPosix_ptrs[classnum],
16860 namedclass % 2 != 0, /* Complement if odd
16861 (NHORIZWS, NVERTWS)
16866 else if ( UNI_SEMANTICS
16867 || classnum == _CC_ASCII
16868 || (DEPENDS_SEMANTICS && ( classnum == _CC_DIGIT
16869 || classnum == _CC_XDIGIT)))
16871 /* We usually have to worry about /d and /a affecting what
16872 * POSIX classes match, with special code needed for /d
16873 * because we won't know until runtime what all matches.
16874 * But there is no extra work needed under /u, and
16875 * [:ascii:] is unaffected by /a and /d; and :digit: and
16876 * :xdigit: don't have runtime differences under /d. So we
16877 * can special case these, and avoid some extra work below,
16878 * and at runtime. */
16879 _invlist_union_maybe_complement_2nd(
16881 PL_XPosix_ptrs[classnum],
16882 namedclass % 2 != 0,
16885 else { /* Garden variety class. If is NUPPER, NALPHA, ...
16886 complement and use nposixes */
16887 SV** posixes_ptr = namedclass % 2 == 0
16890 _invlist_union_maybe_complement_2nd(
16892 PL_XPosix_ptrs[classnum],
16893 namedclass % 2 != 0,
16897 } /* end of namedclass \blah */
16899 SKIP_BRACKETED_WHITE_SPACE(skip_white, RExC_parse);
16901 /* If 'range' is set, 'value' is the ending of a range--check its
16902 * validity. (If value isn't a single code point in the case of a
16903 * range, we should have figured that out above in the code that
16904 * catches false ranges). Later, we will handle each individual code
16905 * point in the range. If 'range' isn't set, this could be the
16906 * beginning of a range, so check for that by looking ahead to see if
16907 * the next real character to be processed is the range indicator--the
16912 /* For unicode ranges, we have to test that the Unicode as opposed
16913 * to the native values are not decreasing. (Above 255, there is
16914 * no difference between native and Unicode) */
16915 if (unicode_range && prevvalue < 255 && value < 255) {
16916 if (NATIVE_TO_LATIN1(prevvalue) > NATIVE_TO_LATIN1(value)) {
16917 goto backwards_range;
16922 if (prevvalue > value) /* b-a */ {
16927 w = RExC_parse - rangebegin;
16929 "Invalid [] range \"%" UTF8f "\"",
16930 UTF8fARG(UTF, w, rangebegin));
16931 NOT_REACHED; /* NOTREACHED */
16935 prevvalue = value; /* save the beginning of the potential range */
16936 if (! stop_at_1 /* Can't be a range if parsing just one thing */
16937 && *RExC_parse == '-')
16939 char* next_char_ptr = RExC_parse + 1;
16941 /* Get the next real char after the '-' */
16942 SKIP_BRACKETED_WHITE_SPACE(skip_white, next_char_ptr);
16944 /* If the '-' is at the end of the class (just before the ']',
16945 * it is a literal minus; otherwise it is a range */
16946 if (next_char_ptr < RExC_end && *next_char_ptr != ']') {
16947 RExC_parse = next_char_ptr;
16949 /* a bad range like \w-, [:word:]- ? */
16950 if (namedclass > OOB_NAMEDCLASS) {
16951 if (strict || (PASS2 && ckWARN(WARN_REGEXP))) {
16952 const int w = RExC_parse >= rangebegin
16953 ? RExC_parse - rangebegin
16956 vFAIL4("False [] range \"%*.*s\"",
16961 "False [] range \"%*.*s\"",
16966 cp_list = add_cp_to_invlist(cp_list, '-');
16970 range = 1; /* yeah, it's a range! */
16971 continue; /* but do it the next time */
16976 if (namedclass > OOB_NAMEDCLASS) {
16980 /* Here, we have a single value this time through the loop, and
16981 * <prevvalue> is the beginning of the range, if any; or <value> if
16984 /* non-Latin1 code point implies unicode semantics. Must be set in
16985 * pass1 so is there for the whole of pass 2 */
16987 REQUIRE_UNI_RULES(flagp, NULL);
16990 /* Ready to process either the single value, or the completed range.
16991 * For single-valued non-inverted ranges, we consider the possibility
16992 * of multi-char folds. (We made a conscious decision to not do this
16993 * for the other cases because it can often lead to non-intuitive
16994 * results. For example, you have the peculiar case that:
16995 * "s s" =~ /^[^\xDF]+$/i => Y
16996 * "ss" =~ /^[^\xDF]+$/i => N
16998 * See [perl #89750] */
16999 if (FOLD && allow_multi_folds && value == prevvalue) {
17000 if (value == LATIN_SMALL_LETTER_SHARP_S
17001 || (value > 255 && _invlist_contains_cp(PL_HasMultiCharFold,
17004 /* Here <value> is indeed a multi-char fold. Get what it is */
17006 U8 foldbuf[UTF8_MAXBYTES_CASE];
17009 UV folded = _to_uni_fold_flags(
17013 FOLD_FLAGS_FULL | (ASCII_FOLD_RESTRICTED
17014 ? FOLD_FLAGS_NOMIX_ASCII
17018 /* Here, <folded> should be the first character of the
17019 * multi-char fold of <value>, with <foldbuf> containing the
17020 * whole thing. But, if this fold is not allowed (because of
17021 * the flags), <fold> will be the same as <value>, and should
17022 * be processed like any other character, so skip the special
17024 if (folded != value) {
17026 /* Skip if we are recursed, currently parsing the class
17027 * again. Otherwise add this character to the list of
17028 * multi-char folds. */
17029 if (! RExC_in_multi_char_class) {
17030 STRLEN cp_count = utf8_length(foldbuf,
17031 foldbuf + foldlen);
17032 SV* multi_fold = sv_2mortal(newSVpvs(""));
17034 Perl_sv_catpvf(aTHX_ multi_fold, "\\x{%" UVXf "}", value);
17037 = add_multi_match(multi_char_matches,
17043 /* This element should not be processed further in this
17046 value = save_value;
17047 prevvalue = save_prevvalue;
17053 if (strict && PASS2 && ckWARN(WARN_REGEXP)) {
17056 /* If the range starts above 255, everything is portable and
17057 * likely to be so for any forseeable character set, so don't
17059 if (unicode_range && non_portable_endpoint && prevvalue < 256) {
17060 vWARN(RExC_parse, "Both or neither range ends should be Unicode");
17062 else if (prevvalue != value) {
17064 /* Under strict, ranges that stop and/or end in an ASCII
17065 * printable should have each end point be a portable value
17066 * for it (preferably like 'A', but we don't warn if it is
17067 * a (portable) Unicode name or code point), and the range
17068 * must be be all digits or all letters of the same case.
17069 * Otherwise, the range is non-portable and unclear as to
17070 * what it contains */
17071 if ( (isPRINT_A(prevvalue) || isPRINT_A(value))
17072 && ( non_portable_endpoint
17073 || ! ( (isDIGIT_A(prevvalue) && isDIGIT_A(value))
17074 || (isLOWER_A(prevvalue) && isLOWER_A(value))
17075 || (isUPPER_A(prevvalue) && isUPPER_A(value))
17077 vWARN(RExC_parse, "Ranges of ASCII printables should"
17078 " be some subset of \"0-9\","
17079 " \"A-Z\", or \"a-z\"");
17081 else if (prevvalue >= FIRST_NON_ASCII_DECIMAL_DIGIT) {
17082 SSize_t index_start;
17083 SSize_t index_final;
17085 /* But the nature of Unicode and languages mean we
17086 * can't do the same checks for above-ASCII ranges,
17087 * except in the case of digit ones. These should
17088 * contain only digits from the same group of 10. The
17089 * ASCII case is handled just above. Hence here, the
17090 * range could be a range of digits. First some
17091 * unlikely special cases. Grandfather in that a range
17092 * ending in 19DA (NEW TAI LUE THAM DIGIT ONE) is bad
17093 * if its starting value is one of the 10 digits prior
17094 * to it. This is because it is an alternate way of
17095 * writing 19D1, and some people may expect it to be in
17096 * that group. But it is bad, because it won't give
17097 * the expected results. In Unicode 5.2 it was
17098 * considered to be in that group (of 11, hence), but
17099 * this was fixed in the next version */
17101 if (UNLIKELY(value == 0x19DA && prevvalue >= 0x19D0)) {
17102 goto warn_bad_digit_range;
17104 else if (UNLIKELY( prevvalue >= 0x1D7CE
17105 && value <= 0x1D7FF))
17107 /* This is the only other case currently in Unicode
17108 * where the algorithm below fails. The code
17109 * points just above are the end points of a single
17110 * range containing only decimal digits. It is 5
17111 * different series of 0-9. All other ranges of
17112 * digits currently in Unicode are just a single
17113 * series. (And mktables will notify us if a later
17114 * Unicode version breaks this.)
17116 * If the range being checked is at most 9 long,
17117 * and the digit values represented are in
17118 * numerical order, they are from the same series.
17120 if ( value - prevvalue > 9
17121 || ((( value - 0x1D7CE) % 10)
17122 <= (prevvalue - 0x1D7CE) % 10))
17124 goto warn_bad_digit_range;
17129 /* For all other ranges of digits in Unicode, the
17130 * algorithm is just to check if both end points
17131 * are in the same series, which is the same range.
17133 index_start = _invlist_search(
17134 PL_XPosix_ptrs[_CC_DIGIT],
17137 /* Warn if the range starts and ends with a digit,
17138 * and they are not in the same group of 10. */
17139 if ( index_start >= 0
17140 && ELEMENT_RANGE_MATCHES_INVLIST(index_start)
17142 _invlist_search(PL_XPosix_ptrs[_CC_DIGIT],
17143 value)) != index_start
17144 && index_final >= 0
17145 && ELEMENT_RANGE_MATCHES_INVLIST(index_final))
17147 warn_bad_digit_range:
17148 vWARN(RExC_parse, "Ranges of digits should be"
17149 " from the same group of"
17156 if ((! range || prevvalue == value) && non_portable_endpoint) {
17157 if (isPRINT_A(value)) {
17160 if (isBACKSLASHED_PUNCT(value)) {
17161 literal[d++] = '\\';
17163 literal[d++] = (char) value;
17164 literal[d++] = '\0';
17167 "\"%.*s\" is more clearly written simply as \"%s\"",
17168 (int) (RExC_parse - rangebegin),
17173 else if isMNEMONIC_CNTRL(value) {
17175 "\"%.*s\" is more clearly written simply as \"%s\"",
17176 (int) (RExC_parse - rangebegin),
17178 cntrl_to_mnemonic((U8) value)
17184 /* Deal with this element of the class */
17188 cp_foldable_list = _add_range_to_invlist(cp_foldable_list,
17191 /* On non-ASCII platforms, for ranges that span all of 0..255, and
17192 * ones that don't require special handling, we can just add the
17193 * range like we do for ASCII platforms */
17194 if ((UNLIKELY(prevvalue == 0) && value >= 255)
17195 || ! (prevvalue < 256
17197 || (! non_portable_endpoint
17198 && ((isLOWER_A(prevvalue) && isLOWER_A(value))
17199 || (isUPPER_A(prevvalue)
17200 && isUPPER_A(value)))))))
17202 cp_foldable_list = _add_range_to_invlist(cp_foldable_list,
17206 /* Here, requires special handling. This can be because it is
17207 * a range whose code points are considered to be Unicode, and
17208 * so must be individually translated into native, or because
17209 * its a subrange of 'A-Z' or 'a-z' which each aren't
17210 * contiguous in EBCDIC, but we have defined them to include
17211 * only the "expected" upper or lower case ASCII alphabetics.
17212 * Subranges above 255 are the same in native and Unicode, so
17213 * can be added as a range */
17214 U8 start = NATIVE_TO_LATIN1(prevvalue);
17216 U8 end = (value < 256) ? NATIVE_TO_LATIN1(value) : 255;
17217 for (j = start; j <= end; j++) {
17218 cp_foldable_list = add_cp_to_invlist(cp_foldable_list, LATIN1_TO_NATIVE(j));
17221 cp_foldable_list = _add_range_to_invlist(cp_foldable_list,
17228 range = 0; /* this range (if it was one) is done now */
17229 } /* End of loop through all the text within the brackets */
17232 if ( posix_warnings && av_tindex_skip_len_mg(posix_warnings) >= 0) {
17233 output_or_return_posix_warnings(pRExC_state, posix_warnings,
17234 return_posix_warnings);
17237 /* If anything in the class expands to more than one character, we have to
17238 * deal with them by building up a substitute parse string, and recursively
17239 * calling reg() on it, instead of proceeding */
17240 if (multi_char_matches) {
17241 SV * substitute_parse = newSVpvn_flags("?:", 2, SVs_TEMP);
17244 char *save_end = RExC_end;
17245 char *save_parse = RExC_parse;
17246 char *save_start = RExC_start;
17247 STRLEN prefix_end = 0; /* We copy the character class after a
17248 prefix supplied here. This is the size
17249 + 1 of that prefix */
17250 bool first_time = TRUE; /* First multi-char occurrence doesn't get
17255 assert(RExC_precomp_adj == 0); /* Only one level of recursion allowed */
17257 #if 0 /* Have decided not to deal with multi-char folds in inverted classes,
17258 because too confusing */
17260 sv_catpv(substitute_parse, "(?:");
17264 /* Look at the longest folds first */
17265 for (cp_count = av_tindex_skip_len_mg(multi_char_matches);
17270 if (av_exists(multi_char_matches, cp_count)) {
17271 AV** this_array_ptr;
17274 this_array_ptr = (AV**) av_fetch(multi_char_matches,
17276 while ((this_sequence = av_pop(*this_array_ptr)) !=
17279 if (! first_time) {
17280 sv_catpv(substitute_parse, "|");
17282 first_time = FALSE;
17284 sv_catpv(substitute_parse, SvPVX(this_sequence));
17289 /* If the character class contains anything else besides these
17290 * multi-character folds, have to include it in recursive parsing */
17291 if (element_count) {
17292 sv_catpv(substitute_parse, "|[");
17293 prefix_end = SvCUR(substitute_parse);
17294 sv_catpvn(substitute_parse, orig_parse, RExC_parse - orig_parse);
17296 /* Put in a closing ']' only if not going off the end, as otherwise
17297 * we are adding something that really isn't there */
17298 if (RExC_parse < RExC_end) {
17299 sv_catpv(substitute_parse, "]");
17303 sv_catpv(substitute_parse, ")");
17306 /* This is a way to get the parse to skip forward a whole named
17307 * sequence instead of matching the 2nd character when it fails the
17309 sv_catpv(substitute_parse, "(*THEN)(*SKIP)(*FAIL)|.)");
17313 /* Set up the data structure so that any errors will be properly
17314 * reported. See the comments at the definition of
17315 * REPORT_LOCATION_ARGS for details */
17316 RExC_precomp_adj = orig_parse - RExC_precomp;
17317 RExC_start = RExC_parse = SvPV(substitute_parse, len);
17318 RExC_adjusted_start = RExC_start + prefix_end;
17319 RExC_end = RExC_parse + len;
17320 RExC_in_multi_char_class = 1;
17321 RExC_emit = (regnode *)orig_emit;
17323 ret = reg(pRExC_state, 1, ®_flags, depth+1);
17325 *flagp |= reg_flags&(HASWIDTH|SIMPLE|SPSTART|POSTPONED|RESTART_PASS1|NEED_UTF8);
17327 /* And restore so can parse the rest of the pattern */
17328 RExC_parse = save_parse;
17329 RExC_start = RExC_adjusted_start = save_start;
17330 RExC_precomp_adj = 0;
17331 RExC_end = save_end;
17332 RExC_in_multi_char_class = 0;
17333 SvREFCNT_dec_NN(multi_char_matches);
17337 /* Here, we've gone through the entire class and dealt with multi-char
17338 * folds. We are now in a position that we can do some checks to see if we
17339 * can optimize this ANYOF node into a simpler one, even in Pass 1.
17340 * Currently we only do two checks:
17341 * 1) is in the unlikely event that the user has specified both, eg. \w and
17342 * \W under /l, then the class matches everything. (This optimization
17343 * is done only to make the optimizer code run later work.)
17344 * 2) if the character class contains only a single element (including a
17345 * single range), we see if there is an equivalent node for it.
17346 * Other checks are possible */
17348 && ! ret_invlist /* Can't optimize if returning the constructed
17350 && (UNLIKELY(posixl_matches_all) || element_count == 1))
17355 if (UNLIKELY(posixl_matches_all)) {
17358 else if (namedclass > OOB_NAMEDCLASS) { /* this is a single named
17359 class, like \w or [:digit:]
17362 /* All named classes are mapped into POSIXish nodes, with its FLAG
17363 * argument giving which class it is */
17364 switch ((I32)namedclass) {
17365 case ANYOF_UNIPROP:
17368 /* These don't depend on the charset modifiers. They always
17369 * match under /u rules */
17370 case ANYOF_NHORIZWS:
17371 case ANYOF_HORIZWS:
17372 namedclass = ANYOF_BLANK + namedclass - ANYOF_HORIZWS;
17375 case ANYOF_NVERTWS:
17380 /* The actual POSIXish node for all the rest depends on the
17381 * charset modifier. The ones in the first set depend only on
17382 * ASCII or, if available on this platform, also locale */
17393 /* (named_class - ANY_OF_ASCII) is 0 or 1. xor'ing with
17394 * invert converts that to 1 or 0 */
17395 op = ASCII + ((namedclass - ANYOF_ASCII) ^ invert);
17398 /* The following don't have any matches in the upper Latin1
17399 * range, hence /d is equivalent to /u for them. Making it /u
17400 * saves some branches at runtime */
17404 case ANYOF_NXDIGIT:
17405 if (! DEPENDS_SEMANTICS) {
17406 goto treat_as_default;
17412 /* The following change to CASED under /i */
17418 namedclass = ANYOF_CASED + (namedclass % 2);
17422 /* The rest have more possibilities depending on the charset.
17423 * We take advantage of the enum ordering of the charset
17424 * modifiers to get the exact node type, */
17427 op = POSIXD + get_regex_charset(RExC_flags);
17428 if (op > POSIXA) { /* /aa is same as /a */
17433 /* The odd numbered ones are the complements of the
17434 * next-lower even number one */
17435 if (namedclass % 2 == 1) {
17439 arg = namedclass_to_classnum(namedclass);
17443 else if (value == prevvalue) {
17445 /* Here, the class consists of just a single code point */
17448 if (! LOC && value == '\n') {
17449 op = REG_ANY; /* Optimize [^\n] */
17450 *flagp |= HASWIDTH|SIMPLE;
17454 else if (value < 256 || UTF) {
17456 /* Optimize a single value into an EXACTish node, but not if it
17457 * would require converting the pattern to UTF-8. */
17458 op = compute_EXACTish(pRExC_state);
17460 } /* Otherwise is a range */
17461 else if (! LOC) { /* locale could vary these */
17462 if (prevvalue == '0') {
17463 if (value == '9') {
17468 else if (! FOLD || ASCII_FOLD_RESTRICTED) {
17469 /* We can optimize A-Z or a-z, but not if they could match
17470 * something like the KELVIN SIGN under /i. */
17471 if (prevvalue == 'A') {
17474 && ! non_portable_endpoint
17477 arg = (FOLD) ? _CC_ALPHA : _CC_UPPER;
17481 else if (prevvalue == 'a') {
17484 && ! non_portable_endpoint
17487 arg = (FOLD) ? _CC_ALPHA : _CC_LOWER;
17494 /* Here, we have changed <op> away from its initial value iff we found
17495 * an optimization */
17498 /* Throw away this ANYOF regnode, and emit the calculated one,
17499 * which should correspond to the beginning, not current, state of
17501 const char * cur_parse = RExC_parse;
17502 RExC_parse = (char *)orig_parse;
17506 /* To get locale nodes to not use the full ANYOF size would
17507 * require moving the code above that writes the portions
17508 * of it that aren't in other nodes to after this point.
17509 * e.g. ANYOF_POSIXL_SET */
17510 RExC_size = orig_size;
17514 RExC_emit = (regnode *)orig_emit;
17515 if (PL_regkind[op] == POSIXD) {
17516 if (op == POSIXL) {
17517 RExC_contains_locale = 1;
17520 op += NPOSIXD - POSIXD;
17525 ret = reg_node(pRExC_state, op);
17527 if (PL_regkind[op] == POSIXD || PL_regkind[op] == NPOSIXD) {
17531 *flagp |= HASWIDTH|SIMPLE;
17533 else if (PL_regkind[op] == EXACT) {
17534 alloc_maybe_populate_EXACT(pRExC_state, ret, flagp, 0, value,
17535 TRUE /* downgradable to EXACT */
17539 *flagp |= HASWIDTH|SIMPLE;
17542 RExC_parse = (char *) cur_parse;
17544 SvREFCNT_dec(posixes);
17545 SvREFCNT_dec(nposixes);
17546 SvREFCNT_dec(simple_posixes);
17547 SvREFCNT_dec(cp_list);
17548 SvREFCNT_dec(cp_foldable_list);
17555 /****** !SIZE_ONLY (Pass 2) AFTER HERE *********/
17557 /* If folding, we calculate all characters that could fold to or from the
17558 * ones already on the list */
17559 if (cp_foldable_list) {
17561 UV start, end; /* End points of code point ranges */
17563 SV* fold_intersection = NULL;
17566 /* Our calculated list will be for Unicode rules. For locale
17567 * matching, we have to keep a separate list that is consulted at
17568 * runtime only when the locale indicates Unicode rules. For
17569 * non-locale, we just use the general list */
17571 use_list = &only_utf8_locale_list;
17574 use_list = &cp_list;
17577 /* Only the characters in this class that participate in folds need
17578 * be checked. Get the intersection of this class and all the
17579 * possible characters that are foldable. This can quickly narrow
17580 * down a large class */
17581 _invlist_intersection(PL_utf8_foldable, cp_foldable_list,
17582 &fold_intersection);
17584 /* The folds for all the Latin1 characters are hard-coded into this
17585 * program, but we have to go out to disk to get the others. */
17586 if (invlist_highest(cp_foldable_list) >= 256) {
17588 /* This is a hash that for a particular fold gives all
17589 * characters that are involved in it */
17590 if (! PL_utf8_foldclosures) {
17591 _load_PL_utf8_foldclosures();
17595 /* Now look at the foldable characters in this class individually */
17596 invlist_iterinit(fold_intersection);
17597 while (invlist_iternext(fold_intersection, &start, &end)) {
17600 /* Look at every character in the range */
17601 for (j = start; j <= end; j++) {
17602 U8 foldbuf[UTF8_MAXBYTES_CASE+1];
17608 if (IS_IN_SOME_FOLD_L1(j)) {
17610 /* ASCII is always matched; non-ASCII is matched
17611 * only under Unicode rules (which could happen
17612 * under /l if the locale is a UTF-8 one */
17613 if (isASCII(j) || ! DEPENDS_SEMANTICS) {
17614 *use_list = add_cp_to_invlist(*use_list,
17615 PL_fold_latin1[j]);
17618 has_upper_latin1_only_utf8_matches
17619 = add_cp_to_invlist(
17620 has_upper_latin1_only_utf8_matches,
17621 PL_fold_latin1[j]);
17625 if (HAS_NONLATIN1_SIMPLE_FOLD_CLOSURE(j)
17626 && (! isASCII(j) || ! ASCII_FOLD_RESTRICTED))
17628 add_above_Latin1_folds(pRExC_state,
17635 /* Here is an above Latin1 character. We don't have the
17636 * rules hard-coded for it. First, get its fold. This is
17637 * the simple fold, as the multi-character folds have been
17638 * handled earlier and separated out */
17639 _to_uni_fold_flags(j, foldbuf, &foldlen,
17640 (ASCII_FOLD_RESTRICTED)
17641 ? FOLD_FLAGS_NOMIX_ASCII
17644 /* Single character fold of above Latin1. Add everything in
17645 * its fold closure to the list that this node should match.
17646 * The fold closures data structure is a hash with the keys
17647 * being the UTF-8 of every character that is folded to, like
17648 * 'k', and the values each an array of all code points that
17649 * fold to its key. e.g. [ 'k', 'K', KELVIN_SIGN ].
17650 * Multi-character folds are not included */
17651 if ((listp = hv_fetch(PL_utf8_foldclosures,
17652 (char *) foldbuf, foldlen, FALSE)))
17654 AV* list = (AV*) *listp;
17656 for (k = 0; k <= av_tindex_skip_len_mg(list); k++) {
17657 SV** c_p = av_fetch(list, k, FALSE);
17663 /* /aa doesn't allow folds between ASCII and non- */
17664 if ((ASCII_FOLD_RESTRICTED
17665 && (isASCII(c) != isASCII(j))))
17670 /* Folds under /l which cross the 255/256 boundary
17671 * are added to a separate list. (These are valid
17672 * only when the locale is UTF-8.) */
17673 if (c < 256 && LOC) {
17674 *use_list = add_cp_to_invlist(*use_list, c);
17678 if (isASCII(c) || c > 255 || AT_LEAST_UNI_SEMANTICS)
17680 cp_list = add_cp_to_invlist(cp_list, c);
17683 /* Similarly folds involving non-ascii Latin1
17684 * characters under /d are added to their list */
17685 has_upper_latin1_only_utf8_matches
17686 = add_cp_to_invlist(
17687 has_upper_latin1_only_utf8_matches,
17694 SvREFCNT_dec_NN(fold_intersection);
17697 /* Now that we have finished adding all the folds, there is no reason
17698 * to keep the foldable list separate */
17699 _invlist_union(cp_list, cp_foldable_list, &cp_list);
17700 SvREFCNT_dec_NN(cp_foldable_list);
17703 /* And combine the result (if any) with any inversion lists from posix
17704 * classes. The lists are kept separate up to now because we don't want to
17705 * fold the classes (folding of those is automatically handled by the swash
17706 * fetching code) */
17707 if (simple_posixes) { /* These are the classes known to be unaffected by
17710 _invlist_union(cp_list, simple_posixes, &cp_list);
17711 SvREFCNT_dec_NN(simple_posixes);
17714 cp_list = simple_posixes;
17717 if (posixes || nposixes) {
17719 /* We have to adjust /a and /aa */
17720 if (AT_LEAST_ASCII_RESTRICTED) {
17722 /* Under /a and /aa, nothing above ASCII matches these */
17724 _invlist_intersection(posixes,
17725 PL_XPosix_ptrs[_CC_ASCII],
17729 /* Under /a and /aa, everything above ASCII matches these
17732 _invlist_union_complement_2nd(nposixes,
17733 PL_XPosix_ptrs[_CC_ASCII],
17738 if (! DEPENDS_SEMANTICS) {
17740 /* For everything but /d, we can just add the current 'posixes' and
17741 * 'nposixes' to the main list */
17744 _invlist_union(cp_list, posixes, &cp_list);
17745 SvREFCNT_dec_NN(posixes);
17753 _invlist_union(cp_list, nposixes, &cp_list);
17754 SvREFCNT_dec_NN(nposixes);
17757 cp_list = nposixes;
17762 /* Under /d, things like \w match upper Latin1 characters only if
17763 * the target string is in UTF-8. But things like \W match all the
17764 * upper Latin1 characters if the target string is not in UTF-8.
17766 * Handle the case where there something like \W separately */
17768 SV* only_non_utf8_list = invlist_clone(PL_UpperLatin1);
17770 /* A complemented posix class matches all upper Latin1
17771 * characters if not in UTF-8. And it matches just certain
17772 * ones when in UTF-8. That means those certain ones are
17773 * matched regardless, so can just be added to the
17774 * unconditional list */
17776 _invlist_union(cp_list, nposixes, &cp_list);
17777 SvREFCNT_dec_NN(nposixes);
17781 cp_list = nposixes;
17784 /* Likewise for 'posixes' */
17785 _invlist_union(posixes, cp_list, &cp_list);
17787 /* Likewise for anything else in the range that matched only
17789 if (has_upper_latin1_only_utf8_matches) {
17790 _invlist_union(cp_list,
17791 has_upper_latin1_only_utf8_matches,
17793 SvREFCNT_dec_NN(has_upper_latin1_only_utf8_matches);
17794 has_upper_latin1_only_utf8_matches = NULL;
17797 /* If we don't match all the upper Latin1 characters regardless
17798 * of UTF-8ness, we have to set a flag to match the rest when
17800 _invlist_subtract(only_non_utf8_list, cp_list,
17801 &only_non_utf8_list);
17802 if (_invlist_len(only_non_utf8_list) != 0) {
17803 ANYOF_FLAGS(ret) |= ANYOF_SHARED_d_MATCHES_ALL_NON_UTF8_NON_ASCII_non_d_WARN_SUPER;
17805 SvREFCNT_dec_NN(only_non_utf8_list);
17808 /* Here there were no complemented posix classes. That means
17809 * the upper Latin1 characters in 'posixes' match only when the
17810 * target string is in UTF-8. So we have to add them to the
17811 * list of those types of code points, while adding the
17812 * remainder to the unconditional list.
17814 * First calculate what they are */
17815 SV* nonascii_but_latin1_properties = NULL;
17816 _invlist_intersection(posixes, PL_UpperLatin1,
17817 &nonascii_but_latin1_properties);
17819 /* And add them to the final list of such characters. */
17820 _invlist_union(has_upper_latin1_only_utf8_matches,
17821 nonascii_but_latin1_properties,
17822 &has_upper_latin1_only_utf8_matches);
17824 /* Remove them from what now becomes the unconditional list */
17825 _invlist_subtract(posixes, nonascii_but_latin1_properties,
17828 /* And add those unconditional ones to the final list */
17830 _invlist_union(cp_list, posixes, &cp_list);
17831 SvREFCNT_dec_NN(posixes);
17838 SvREFCNT_dec(nonascii_but_latin1_properties);
17840 /* Get rid of any characters that we now know are matched
17841 * unconditionally from the conditional list, which may make
17842 * that list empty */
17843 _invlist_subtract(has_upper_latin1_only_utf8_matches,
17845 &has_upper_latin1_only_utf8_matches);
17846 if (_invlist_len(has_upper_latin1_only_utf8_matches) == 0) {
17847 SvREFCNT_dec_NN(has_upper_latin1_only_utf8_matches);
17848 has_upper_latin1_only_utf8_matches = NULL;
17854 /* And combine the result (if any) with any inversion list from properties.
17855 * The lists are kept separate up to now so that we can distinguish the two
17856 * in regards to matching above-Unicode. A run-time warning is generated
17857 * if a Unicode property is matched against a non-Unicode code point. But,
17858 * we allow user-defined properties to match anything, without any warning,
17859 * and we also suppress the warning if there is a portion of the character
17860 * class that isn't a Unicode property, and which matches above Unicode, \W
17861 * or [\x{110000}] for example.
17862 * (Note that in this case, unlike the Posix one above, there is no
17863 * <has_upper_latin1_only_utf8_matches>, because having a Unicode property
17864 * forces Unicode semantics */
17868 /* If it matters to the final outcome, see if a non-property
17869 * component of the class matches above Unicode. If so, the
17870 * warning gets suppressed. This is true even if just a single
17871 * such code point is specified, as, though not strictly correct if
17872 * another such code point is matched against, the fact that they
17873 * are using above-Unicode code points indicates they should know
17874 * the issues involved */
17876 warn_super = ! (invert
17877 ^ (invlist_highest(cp_list) > PERL_UNICODE_MAX));
17880 _invlist_union(properties, cp_list, &cp_list);
17881 SvREFCNT_dec_NN(properties);
17884 cp_list = properties;
17889 |= ANYOF_SHARED_d_MATCHES_ALL_NON_UTF8_NON_ASCII_non_d_WARN_SUPER;
17891 /* Because an ANYOF node is the only one that warns, this node
17892 * can't be optimized into something else */
17893 optimizable = FALSE;
17897 /* Here, we have calculated what code points should be in the character
17900 * Now we can see about various optimizations. Fold calculation (which we
17901 * did above) needs to take place before inversion. Otherwise /[^k]/i
17902 * would invert to include K, which under /i would match k, which it
17903 * shouldn't. Therefore we can't invert folded locale now, as it won't be
17904 * folded until runtime */
17906 /* If we didn't do folding, it's because some information isn't available
17907 * until runtime; set the run-time fold flag for these. (We don't have to
17908 * worry about properties folding, as that is taken care of by the swash
17909 * fetching). We know to set the flag if we have a non-NULL list for UTF-8
17910 * locales, or the class matches at least one 0-255 range code point */
17913 /* Some things on the list might be unconditionally included because of
17914 * other components. Remove them, and clean up the list if it goes to
17916 if (only_utf8_locale_list && cp_list) {
17917 _invlist_subtract(only_utf8_locale_list, cp_list,
17918 &only_utf8_locale_list);
17920 if (_invlist_len(only_utf8_locale_list) == 0) {
17921 SvREFCNT_dec_NN(only_utf8_locale_list);
17922 only_utf8_locale_list = NULL;
17925 if (only_utf8_locale_list) {
17928 |ANYOFL_SHARED_UTF8_LOCALE_fold_HAS_MATCHES_nonfold_REQD;
17930 else if (cp_list) { /* Look to see if a 0-255 code point is in list */
17932 invlist_iterinit(cp_list);
17933 if (invlist_iternext(cp_list, &start, &end) && start < 256) {
17934 ANYOF_FLAGS(ret) |= ANYOFL_FOLD;
17936 invlist_iterfinish(cp_list);
17939 else if ( DEPENDS_SEMANTICS
17940 && ( has_upper_latin1_only_utf8_matches
17941 || (ANYOF_FLAGS(ret) & ANYOF_SHARED_d_MATCHES_ALL_NON_UTF8_NON_ASCII_non_d_WARN_SUPER)))
17944 optimizable = FALSE;
17948 /* Optimize inverted simple patterns (e.g. [^a-z]) when everything is known
17949 * at compile time. Besides not inverting folded locale now, we can't
17950 * invert if there are things such as \w, which aren't known until runtime
17954 && OP(ret) != ANYOFD
17955 && ! (ANYOF_FLAGS(ret) & (ANYOF_LOCALE_FLAGS))
17956 && ! HAS_NONLOCALE_RUNTIME_PROPERTY_DEFINITION)
17958 _invlist_invert(cp_list);
17960 /* Any swash can't be used as-is, because we've inverted things */
17962 SvREFCNT_dec_NN(swash);
17966 /* Clear the invert flag since have just done it here */
17973 *ret_invlist = cp_list;
17974 SvREFCNT_dec(swash);
17976 /* Discard the generated node */
17978 RExC_size = orig_size;
17981 RExC_emit = orig_emit;
17986 /* Some character classes are equivalent to other nodes. Such nodes take
17987 * up less room and generally fewer operations to execute than ANYOF nodes.
17988 * Above, we checked for and optimized into some such equivalents for
17989 * certain common classes that are easy to test. Getting to this point in
17990 * the code means that the class didn't get optimized there. Since this
17991 * code is only executed in Pass 2, it is too late to save space--it has
17992 * been allocated in Pass 1, and currently isn't given back. But turning
17993 * things into an EXACTish node can allow the optimizer to join it to any
17994 * adjacent such nodes. And if the class is equivalent to things like /./,
17995 * expensive run-time swashes can be avoided. Now that we have more
17996 * complete information, we can find things necessarily missed by the
17997 * earlier code. Another possible "optimization" that isn't done is that
17998 * something like [Ee] could be changed into an EXACTFU. khw tried this
17999 * and found that the ANYOF is faster, including for code points not in the
18000 * bitmap. This still might make sense to do, provided it got joined with
18001 * an adjacent node(s) to create a longer EXACTFU one. This could be
18002 * accomplished by creating a pseudo ANYOF_EXACTFU node type that the join
18003 * routine would know is joinable. If that didn't happen, the node type
18004 * could then be made a straight ANYOF */
18006 if (optimizable && cp_list && ! invert) {
18008 U8 op = END; /* The optimzation node-type */
18009 int posix_class = -1; /* Illegal value */
18010 const char * cur_parse= RExC_parse;
18012 invlist_iterinit(cp_list);
18013 if (! invlist_iternext(cp_list, &start, &end)) {
18015 /* Here, the list is empty. This happens, for example, when a
18016 * Unicode property that doesn't match anything is the only element
18017 * in the character class (perluniprops.pod notes such properties).
18020 *flagp |= HASWIDTH|SIMPLE;
18022 else if (start == end) { /* The range is a single code point */
18023 if (! invlist_iternext(cp_list, &start, &end)
18025 /* Don't do this optimization if it would require changing
18026 * the pattern to UTF-8 */
18027 && (start < 256 || UTF))
18029 /* Here, the list contains a single code point. Can optimize
18030 * into an EXACTish node */
18041 /* A locale node under folding with one code point can be
18042 * an EXACTFL, as its fold won't be calculated until
18048 /* Here, we are generally folding, but there is only one
18049 * code point to match. If we have to, we use an EXACT
18050 * node, but it would be better for joining with adjacent
18051 * nodes in the optimization pass if we used the same
18052 * EXACTFish node that any such are likely to be. We can
18053 * do this iff the code point doesn't participate in any
18054 * folds. For example, an EXACTF of a colon is the same as
18055 * an EXACT one, since nothing folds to or from a colon. */
18057 if (IS_IN_SOME_FOLD_L1(value)) {
18062 if (_invlist_contains_cp(PL_utf8_foldable, value)) {
18067 /* If we haven't found the node type, above, it means we
18068 * can use the prevailing one */
18070 op = compute_EXACTish(pRExC_state);
18074 } /* End of first range contains just a single code point */
18075 else if (start == 0) {
18076 if (end == UV_MAX) {
18078 *flagp |= HASWIDTH|SIMPLE;
18081 else if (end == '\n' - 1
18082 && invlist_iternext(cp_list, &start, &end)
18083 && start == '\n' + 1 && end == UV_MAX)
18086 *flagp |= HASWIDTH|SIMPLE;
18090 invlist_iterfinish(cp_list);
18093 const UV cp_list_len = _invlist_len(cp_list);
18094 const UV* cp_list_array = invlist_array(cp_list);
18096 /* Here, didn't find an optimization. See if this matches any of
18097 * the POSIX classes. First try ASCII */
18099 if (_invlistEQ(cp_list, PL_XPosix_ptrs[_CC_ASCII], 0)) {
18101 *flagp |= HASWIDTH|SIMPLE;
18103 else if (_invlistEQ(cp_list, PL_XPosix_ptrs[_CC_ASCII], 1)) {
18105 *flagp |= HASWIDTH|SIMPLE;
18107 else if (cp_list_array[cp_list_len-1] >= 0x2029) {
18109 /* Then try the other POSIX classes. The POSIXA ones are about
18110 * the same speed as ANYOF ops, but the ones that have
18111 * above-Latin1 code point matches are somewhat faster than
18112 * ANYOF. So optimize those, but don't bother with the POSIXA
18113 * ones nor [:cntrl:] which has no above-Latin1 matches. If
18114 * this ANYOF node has a lower highest possible matching code
18115 * point than any of the XPosix ones, we know that it can't
18116 * possibly be the same as any of them, so we can avoid
18117 * executing this code. The 0x2029 above for the lowest max
18118 * was determined by manual inspection of the classes, and
18119 * comes from \v. Suppose Unicode in a later version adds a
18120 * higher code point to \v. All that means is that this code
18121 * can be executed unnecessarily. It will still give the
18122 * correct answer. */
18124 for (posix_class = 0;
18125 posix_class <= _HIGHEST_REGCOMP_DOT_H_SYNC;
18130 if (posix_class == _CC_CNTRL) {
18134 for (try_inverted = 0; try_inverted < 2; try_inverted++) {
18136 /* Check if matches normal or inverted */
18137 if (_invlistEQ(cp_list,
18138 PL_XPosix_ptrs[posix_class],
18141 op = (try_inverted)
18144 *flagp |= HASWIDTH|SIMPLE;
18154 RExC_parse = (char *)orig_parse;
18155 RExC_emit = (regnode *)orig_emit;
18157 if (regarglen[op]) {
18158 ret = reganode(pRExC_state, op, 0);
18160 ret = reg_node(pRExC_state, op);
18163 RExC_parse = (char *)cur_parse;
18165 if (PL_regkind[op] == EXACT) {
18166 alloc_maybe_populate_EXACT(pRExC_state, ret, flagp, 0, value,
18167 TRUE /* downgradable to EXACT */
18170 else if (PL_regkind[op] == POSIXD || PL_regkind[op] == NPOSIXD) {
18171 FLAGS(ret) = posix_class;
18174 SvREFCNT_dec_NN(cp_list);
18179 /* Here, <cp_list> contains all the code points we can determine at
18180 * compile time that match under all conditions. Go through it, and
18181 * for things that belong in the bitmap, put them there, and delete from
18182 * <cp_list>. While we are at it, see if everything above 255 is in the
18183 * list, and if so, set a flag to speed up execution */
18185 populate_ANYOF_from_invlist(ret, &cp_list);
18188 ANYOF_FLAGS(ret) |= ANYOF_INVERT;
18191 /* Here, the bitmap has been populated with all the Latin1 code points that
18192 * always match. Can now add to the overall list those that match only
18193 * when the target string is UTF-8 (<has_upper_latin1_only_utf8_matches>).
18195 if (has_upper_latin1_only_utf8_matches) {
18197 _invlist_union(cp_list,
18198 has_upper_latin1_only_utf8_matches,
18200 SvREFCNT_dec_NN(has_upper_latin1_only_utf8_matches);
18203 cp_list = has_upper_latin1_only_utf8_matches;
18205 ANYOF_FLAGS(ret) |= ANYOF_SHARED_d_UPPER_LATIN1_UTF8_STRING_MATCHES_non_d_RUNTIME_USER_PROP;
18208 /* If there is a swash and more than one element, we can't use the swash in
18209 * the optimization below. */
18210 if (swash && element_count > 1) {
18211 SvREFCNT_dec_NN(swash);
18215 /* Note that the optimization of using 'swash' if it is the only thing in
18216 * the class doesn't have us change swash at all, so it can include things
18217 * that are also in the bitmap; otherwise we have purposely deleted that
18218 * duplicate information */
18219 set_ANYOF_arg(pRExC_state, ret, cp_list,
18220 (HAS_NONLOCALE_RUNTIME_PROPERTY_DEFINITION)
18222 only_utf8_locale_list,
18223 swash, has_user_defined_property);
18225 *flagp |= HASWIDTH|SIMPLE;
18227 if (ANYOF_FLAGS(ret) & ANYOF_LOCALE_FLAGS) {
18228 RExC_contains_locale = 1;
18234 #undef HAS_NONLOCALE_RUNTIME_PROPERTY_DEFINITION
18237 S_set_ANYOF_arg(pTHX_ RExC_state_t* const pRExC_state,
18238 regnode* const node,
18240 SV* const runtime_defns,
18241 SV* const only_utf8_locale_list,
18243 const bool has_user_defined_property)
18245 /* Sets the arg field of an ANYOF-type node 'node', using information about
18246 * the node passed-in. If there is nothing outside the node's bitmap, the
18247 * arg is set to ANYOF_ONLY_HAS_BITMAP. Otherwise, it sets the argument to
18248 * the count returned by add_data(), having allocated and stored an array,
18249 * av, that that count references, as follows:
18250 * av[0] stores the character class description in its textual form.
18251 * This is used later (regexec.c:Perl_regclass_swash()) to
18252 * initialize the appropriate swash, and is also useful for dumping
18253 * the regnode. This is set to &PL_sv_undef if the textual
18254 * description is not needed at run-time (as happens if the other
18255 * elements completely define the class)
18256 * av[1] if &PL_sv_undef, is a placeholder to later contain the swash
18257 * computed from av[0]. But if no further computation need be done,
18258 * the swash is stored here now (and av[0] is &PL_sv_undef).
18259 * av[2] stores the inversion list of code points that match only if the
18260 * current locale is UTF-8
18261 * av[3] stores the cp_list inversion list for use in addition or instead
18262 * of av[0]; used only if cp_list exists and av[1] is &PL_sv_undef.
18263 * (Otherwise everything needed is already in av[0] and av[1])
18264 * av[4] is set if any component of the class is from a user-defined
18265 * property; used only if av[3] exists */
18269 PERL_ARGS_ASSERT_SET_ANYOF_ARG;
18271 if (! cp_list && ! runtime_defns && ! only_utf8_locale_list) {
18272 assert(! (ANYOF_FLAGS(node)
18273 & ANYOF_SHARED_d_UPPER_LATIN1_UTF8_STRING_MATCHES_non_d_RUNTIME_USER_PROP));
18274 ARG_SET(node, ANYOF_ONLY_HAS_BITMAP);
18277 AV * const av = newAV();
18280 av_store(av, 0, (runtime_defns)
18281 ? SvREFCNT_inc(runtime_defns) : &PL_sv_undef);
18284 av_store(av, 1, swash);
18285 SvREFCNT_dec_NN(cp_list);
18288 av_store(av, 1, &PL_sv_undef);
18290 av_store(av, 3, cp_list);
18291 av_store(av, 4, newSVuv(has_user_defined_property));
18295 if (only_utf8_locale_list) {
18296 av_store(av, 2, only_utf8_locale_list);
18299 av_store(av, 2, &PL_sv_undef);
18302 rv = newRV_noinc(MUTABLE_SV(av));
18303 n = add_data(pRExC_state, STR_WITH_LEN("s"));
18304 RExC_rxi->data->data[n] = (void*)rv;
18309 #if !defined(PERL_IN_XSUB_RE) || defined(PLUGGABLE_RE_EXTENSION)
18311 Perl__get_regclass_nonbitmap_data(pTHX_ const regexp *prog,
18312 const regnode* node,
18315 SV** only_utf8_locale_ptr,
18316 SV** output_invlist)
18319 /* For internal core use only.
18320 * Returns the swash for the input 'node' in the regex 'prog'.
18321 * If <doinit> is 'true', will attempt to create the swash if not already
18323 * If <listsvp> is non-null, will return the printable contents of the
18324 * swash. This can be used to get debugging information even before the
18325 * swash exists, by calling this function with 'doinit' set to false, in
18326 * which case the components that will be used to eventually create the
18327 * swash are returned (in a printable form).
18328 * If <only_utf8_locale_ptr> is not NULL, it is where this routine is to
18329 * store an inversion list of code points that should match only if the
18330 * execution-time locale is a UTF-8 one.
18331 * If <output_invlist> is not NULL, it is where this routine is to store an
18332 * inversion list of the code points that would be instead returned in
18333 * <listsvp> if this were NULL. Thus, what gets output in <listsvp>
18334 * when this parameter is used, is just the non-code point data that
18335 * will go into creating the swash. This currently should be just
18336 * user-defined properties whose definitions were not known at compile
18337 * time. Using this parameter allows for easier manipulation of the
18338 * swash's data by the caller. It is illegal to call this function with
18339 * this parameter set, but not <listsvp>
18341 * Tied intimately to how S_set_ANYOF_arg sets up the data structure. Note
18342 * that, in spite of this function's name, the swash it returns may include
18343 * the bitmap data as well */
18346 SV *si = NULL; /* Input swash initialization string */
18347 SV* invlist = NULL;
18349 RXi_GET_DECL(prog,progi);
18350 const struct reg_data * const data = prog ? progi->data : NULL;
18352 PERL_ARGS_ASSERT__GET_REGCLASS_NONBITMAP_DATA;
18353 assert(! output_invlist || listsvp);
18355 if (data && data->count) {
18356 const U32 n = ARG(node);
18358 if (data->what[n] == 's') {
18359 SV * const rv = MUTABLE_SV(data->data[n]);
18360 AV * const av = MUTABLE_AV(SvRV(rv));
18361 SV **const ary = AvARRAY(av);
18362 U8 swash_init_flags = _CORE_SWASH_INIT_ACCEPT_INVLIST;
18364 si = *ary; /* ary[0] = the string to initialize the swash with */
18366 if (av_tindex_skip_len_mg(av) >= 2) {
18367 if (only_utf8_locale_ptr
18369 && ary[2] != &PL_sv_undef)
18371 *only_utf8_locale_ptr = ary[2];
18374 assert(only_utf8_locale_ptr);
18375 *only_utf8_locale_ptr = NULL;
18378 /* Elements 3 and 4 are either both present or both absent. [3]
18379 * is any inversion list generated at compile time; [4]
18380 * indicates if that inversion list has any user-defined
18381 * properties in it. */
18382 if (av_tindex_skip_len_mg(av) >= 3) {
18384 if (SvUV(ary[4])) {
18385 swash_init_flags |= _CORE_SWASH_INIT_USER_DEFINED_PROPERTY;
18393 /* Element [1] is reserved for the set-up swash. If already there,
18394 * return it; if not, create it and store it there */
18395 if (ary[1] && SvROK(ary[1])) {
18398 else if (doinit && ((si && si != &PL_sv_undef)
18399 || (invlist && invlist != &PL_sv_undef))) {
18401 sw = _core_swash_init("utf8", /* the utf8 package */
18405 0, /* not from tr/// */
18407 &swash_init_flags);
18408 (void)av_store(av, 1, sw);
18413 /* If requested, return a printable version of what this swash matches */
18415 SV* matches_string = NULL;
18417 /* The swash should be used, if possible, to get the data, as it
18418 * contains the resolved data. But this function can be called at
18419 * compile-time, before everything gets resolved, in which case we
18420 * return the currently best available information, which is the string
18421 * that will eventually be used to do that resolving, 'si' */
18422 if ((! sw || (invlist = _get_swash_invlist(sw)) == NULL)
18423 && (si && si != &PL_sv_undef))
18425 /* Here, we only have 'si' (and possibly some passed-in data in
18426 * 'invlist', which is handled below) If the caller only wants
18427 * 'si', use that. */
18428 if (! output_invlist) {
18429 matches_string = newSVsv(si);
18432 /* But if the caller wants an inversion list of the node, we
18433 * need to parse 'si' and place as much as possible in the
18434 * desired output inversion list, making 'matches_string' only
18435 * contain the currently unresolvable things */
18436 const char *si_string = SvPVX(si);
18437 STRLEN remaining = SvCUR(si);
18441 /* Ignore everything before the first new-line */
18442 while (*si_string != '\n' && remaining > 0) {
18446 assert(remaining > 0);
18451 while (remaining > 0) {
18453 /* The data consists of just strings defining user-defined
18454 * property names, but in prior incarnations, and perhaps
18455 * somehow from pluggable regex engines, it could still
18456 * hold hex code point definitions. Each component of a
18457 * range would be separated by a tab, and each range by a
18458 * new-line. If these are found, instead add them to the
18459 * inversion list */
18460 I32 grok_flags = PERL_SCAN_SILENT_ILLDIGIT
18461 |PERL_SCAN_SILENT_NON_PORTABLE;
18462 STRLEN len = remaining;
18463 UV cp = grok_hex(si_string, &len, &grok_flags, NULL);
18465 /* If the hex decode routine found something, it should go
18466 * up to the next \n */
18467 if ( *(si_string + len) == '\n') {
18468 if (count) { /* 2nd code point on line */
18469 *output_invlist = _add_range_to_invlist(*output_invlist, prev_cp, cp);
18472 *output_invlist = add_cp_to_invlist(*output_invlist, cp);
18475 goto prepare_for_next_iteration;
18478 /* If the hex decode was instead for the lower range limit,
18479 * save it, and go parse the upper range limit */
18480 if (*(si_string + len) == '\t') {
18481 assert(count == 0);
18485 prepare_for_next_iteration:
18486 si_string += len + 1;
18487 remaining -= len + 1;
18491 /* Here, didn't find a legal hex number. Just add it from
18492 * here to the next \n */
18495 while (*(si_string + len) != '\n' && remaining > 0) {
18499 if (*(si_string + len) == '\n') {
18503 if (matches_string) {
18504 sv_catpvn(matches_string, si_string, len - 1);
18507 matches_string = newSVpvn(si_string, len - 1);
18510 sv_catpvs(matches_string, " ");
18511 } /* end of loop through the text */
18513 assert(matches_string);
18514 if (SvCUR(matches_string)) { /* Get rid of trailing blank */
18515 SvCUR_set(matches_string, SvCUR(matches_string) - 1);
18517 } /* end of has an 'si' but no swash */
18520 /* If we have a swash in place, its equivalent inversion list was above
18521 * placed into 'invlist'. If not, this variable may contain a stored
18522 * inversion list which is information beyond what is in 'si' */
18525 /* Again, if the caller doesn't want the output inversion list, put
18526 * everything in 'matches-string' */
18527 if (! output_invlist) {
18528 if ( ! matches_string) {
18529 matches_string = newSVpvs("\n");
18531 sv_catsv(matches_string, invlist_contents(invlist,
18532 TRUE /* traditional style */
18535 else if (! *output_invlist) {
18536 *output_invlist = invlist_clone(invlist);
18539 _invlist_union(*output_invlist, invlist, output_invlist);
18543 *listsvp = matches_string;
18548 #endif /* !defined(PERL_IN_XSUB_RE) || defined(PLUGGABLE_RE_EXTENSION) */
18550 /* reg_skipcomment()
18552 Absorbs an /x style # comment from the input stream,
18553 returning a pointer to the first character beyond the comment, or if the
18554 comment terminates the pattern without anything following it, this returns
18555 one past the final character of the pattern (in other words, RExC_end) and
18556 sets the REG_RUN_ON_COMMENT_SEEN flag.
18558 Note it's the callers responsibility to ensure that we are
18559 actually in /x mode
18563 PERL_STATIC_INLINE char*
18564 S_reg_skipcomment(RExC_state_t *pRExC_state, char* p)
18566 PERL_ARGS_ASSERT_REG_SKIPCOMMENT;
18570 while (p < RExC_end) {
18571 if (*(++p) == '\n') {
18576 /* we ran off the end of the pattern without ending the comment, so we have
18577 * to add an \n when wrapping */
18578 RExC_seen |= REG_RUN_ON_COMMENT_SEEN;
18583 S_skip_to_be_ignored_text(pTHX_ RExC_state_t *pRExC_state,
18585 const bool force_to_xmod
18588 /* If the text at the current parse position '*p' is a '(?#...)' comment,
18589 * or if we are under /x or 'force_to_xmod' is TRUE, and the text at '*p'
18590 * is /x whitespace, advance '*p' so that on exit it points to the first
18591 * byte past all such white space and comments */
18593 const bool use_xmod = force_to_xmod || (RExC_flags & RXf_PMf_EXTENDED);
18595 PERL_ARGS_ASSERT_SKIP_TO_BE_IGNORED_TEXT;
18597 assert( ! UTF || UTF8_IS_INVARIANT(**p) || UTF8_IS_START(**p));
18600 if (RExC_end - (*p) >= 3
18602 && *(*p + 1) == '?'
18603 && *(*p + 2) == '#')
18605 while (*(*p) != ')') {
18606 if ((*p) == RExC_end)
18607 FAIL("Sequence (?#... not terminated");
18615 const char * save_p = *p;
18616 while ((*p) < RExC_end) {
18618 if ((len = is_PATWS_safe((*p), RExC_end, UTF))) {
18621 else if (*(*p) == '#') {
18622 (*p) = reg_skipcomment(pRExC_state, (*p));
18628 if (*p != save_p) {
18641 Advances the parse position by one byte, unless that byte is the beginning
18642 of a '(?#...)' style comment, or is /x whitespace and /x is in effect. In
18643 those two cases, the parse position is advanced beyond all such comments and
18646 This is the UTF, (?#...), and /x friendly way of saying RExC_parse++.
18650 S_nextchar(pTHX_ RExC_state_t *pRExC_state)
18652 PERL_ARGS_ASSERT_NEXTCHAR;
18654 if (RExC_parse < RExC_end) {
18656 || UTF8_IS_INVARIANT(*RExC_parse)
18657 || UTF8_IS_START(*RExC_parse));
18659 RExC_parse += (UTF) ? UTF8SKIP(RExC_parse) : 1;
18661 skip_to_be_ignored_text(pRExC_state, &RExC_parse,
18662 FALSE /* Don't force /x */ );
18667 S_regnode_guts(pTHX_ RExC_state_t *pRExC_state, const U8 op, const STRLEN extra_size, const char* const name)
18669 /* Allocate a regnode for 'op' and returns it, with 'extra_size' extra
18670 * space. In pass1, it aligns and increments RExC_size; in pass2,
18673 regnode * const ret = RExC_emit;
18674 GET_RE_DEBUG_FLAGS_DECL;
18676 PERL_ARGS_ASSERT_REGNODE_GUTS;
18678 assert(extra_size >= regarglen[op]);
18681 SIZE_ALIGN(RExC_size);
18682 RExC_size += 1 + extra_size;
18685 if (RExC_emit >= RExC_emit_bound)
18686 Perl_croak(aTHX_ "panic: reg_node overrun trying to emit %d, %p>=%p",
18687 op, (void*)RExC_emit, (void*)RExC_emit_bound);
18689 NODE_ALIGN_FILL(ret);
18690 #ifndef RE_TRACK_PATTERN_OFFSETS
18691 PERL_UNUSED_ARG(name);
18693 if (RExC_offsets) { /* MJD */
18695 ("%s:%d: (op %s) %s %" UVuf " (len %" UVuf ") (max %" UVuf ").\n",
18698 (UV)(RExC_emit - RExC_emit_start) > RExC_offsets[0]
18699 ? "Overwriting end of array!\n" : "OK",
18700 (UV)(RExC_emit - RExC_emit_start),
18701 (UV)(RExC_parse - RExC_start),
18702 (UV)RExC_offsets[0]));
18703 Set_Node_Offset(RExC_emit, RExC_parse + (op == END));
18710 - reg_node - emit a node
18712 STATIC regnode * /* Location. */
18713 S_reg_node(pTHX_ RExC_state_t *pRExC_state, U8 op)
18715 regnode * const ret = regnode_guts(pRExC_state, op, regarglen[op], "reg_node");
18717 PERL_ARGS_ASSERT_REG_NODE;
18719 assert(regarglen[op] == 0);
18722 regnode *ptr = ret;
18723 FILL_ADVANCE_NODE(ptr, op);
18730 - reganode - emit a node with an argument
18732 STATIC regnode * /* Location. */
18733 S_reganode(pTHX_ RExC_state_t *pRExC_state, U8 op, U32 arg)
18735 regnode * const ret = regnode_guts(pRExC_state, op, regarglen[op], "reganode");
18737 PERL_ARGS_ASSERT_REGANODE;
18739 assert(regarglen[op] == 1);
18742 regnode *ptr = ret;
18743 FILL_ADVANCE_NODE_ARG(ptr, op, arg);
18750 S_reg2Lanode(pTHX_ RExC_state_t *pRExC_state, const U8 op, const U32 arg1, const I32 arg2)
18752 /* emit a node with U32 and I32 arguments */
18754 regnode * const ret = regnode_guts(pRExC_state, op, regarglen[op], "reg2Lanode");
18756 PERL_ARGS_ASSERT_REG2LANODE;
18758 assert(regarglen[op] == 2);
18761 regnode *ptr = ret;
18762 FILL_ADVANCE_NODE_2L_ARG(ptr, op, arg1, arg2);
18769 - reginsert - insert an operator in front of already-emitted operand
18771 * Means relocating the operand.
18773 * IMPORTANT NOTE - it is the *callers* responsibility to correctly
18774 * set up NEXT_OFF() of the inserted node if needed. Something like this:
18776 * reginsert(pRExC, OPFAIL, orig_emit, depth+1);
18778 * NEXT_OFF(orig_emit) = regarglen[OPFAIL] + NODE_STEP_REGNODE;
18780 * ALSO NOTE - operand->flags will be set to 0 as well.
18783 S_reginsert(pTHX_ RExC_state_t *pRExC_state, U8 op, regnode *operand, U32 depth)
18788 const int offset = regarglen[(U8)op];
18789 const int size = NODE_STEP_REGNODE + offset;
18790 GET_RE_DEBUG_FLAGS_DECL;
18792 PERL_ARGS_ASSERT_REGINSERT;
18793 PERL_UNUSED_CONTEXT;
18794 PERL_UNUSED_ARG(depth);
18795 /* (PL_regkind[(U8)op] == CURLY ? EXTRA_STEP_2ARGS : 0); */
18796 DEBUG_PARSE_FMT("inst"," - %s",PL_reg_name[op]);
18801 assert(!RExC_study_started); /* I believe we should never use reginsert once we have started
18802 studying. If this is wrong then we need to adjust RExC_recurse
18803 below like we do with RExC_open_parens/RExC_close_parens. */
18807 if (RExC_open_parens) {
18809 /*DEBUG_PARSE_FMT("inst"," - %" IVdf, (IV)RExC_npar);*/
18810 /* remember that RExC_npar is rex->nparens + 1,
18811 * iow it is 1 more than the number of parens seen in
18812 * the pattern so far. */
18813 for ( paren=0 ; paren < RExC_npar ; paren++ ) {
18814 /* note, RExC_open_parens[0] is the start of the
18815 * regex, it can't move. RExC_close_parens[0] is the end
18816 * of the regex, it *can* move. */
18817 if ( paren && RExC_open_parens[paren] >= operand ) {
18818 /*DEBUG_PARSE_FMT("open"," - %d",size);*/
18819 RExC_open_parens[paren] += size;
18821 /*DEBUG_PARSE_FMT("open"," - %s","ok");*/
18823 if ( RExC_close_parens[paren] >= operand ) {
18824 /*DEBUG_PARSE_FMT("close"," - %d",size);*/
18825 RExC_close_parens[paren] += size;
18827 /*DEBUG_PARSE_FMT("close"," - %s","ok");*/
18832 RExC_end_op += size;
18834 while (src > operand) {
18835 StructCopy(--src, --dst, regnode);
18836 #ifdef RE_TRACK_PATTERN_OFFSETS
18837 if (RExC_offsets) { /* MJD 20010112 */
18839 ("%s(%d): (op %s) %s copy %" UVuf " -> %" UVuf " (max %" UVuf ").\n",
18843 (UV)(dst - RExC_emit_start) > RExC_offsets[0]
18844 ? "Overwriting end of array!\n" : "OK",
18845 (UV)(src - RExC_emit_start),
18846 (UV)(dst - RExC_emit_start),
18847 (UV)RExC_offsets[0]));
18848 Set_Node_Offset_To_R(dst-RExC_emit_start, Node_Offset(src));
18849 Set_Node_Length_To_R(dst-RExC_emit_start, Node_Length(src));
18854 place = operand; /* Op node, where operand used to be. */
18855 #ifdef RE_TRACK_PATTERN_OFFSETS
18856 if (RExC_offsets) { /* MJD */
18858 ("%s(%d): (op %s) %s %" UVuf " <- %" UVuf " (max %" UVuf ").\n",
18862 (UV)(place - RExC_emit_start) > RExC_offsets[0]
18863 ? "Overwriting end of array!\n" : "OK",
18864 (UV)(place - RExC_emit_start),
18865 (UV)(RExC_parse - RExC_start),
18866 (UV)RExC_offsets[0]));
18867 Set_Node_Offset(place, RExC_parse);
18868 Set_Node_Length(place, 1);
18871 src = NEXTOPER(place);
18873 FILL_ADVANCE_NODE(place, op);
18874 Zero(src, offset, regnode);
18878 - regtail - set the next-pointer at the end of a node chain of p to val.
18879 - SEE ALSO: regtail_study
18882 S_regtail(pTHX_ RExC_state_t * pRExC_state,
18883 const regnode * const p,
18884 const regnode * const val,
18888 GET_RE_DEBUG_FLAGS_DECL;
18890 PERL_ARGS_ASSERT_REGTAIL;
18892 PERL_UNUSED_ARG(depth);
18898 /* Find last node. */
18899 scan = (regnode *) p;
18901 regnode * const temp = regnext(scan);
18903 DEBUG_PARSE_MSG((scan==p ? "tail" : ""));
18904 regprop(RExC_rx, RExC_mysv, scan, NULL, pRExC_state);
18905 Perl_re_printf( aTHX_ "~ %s (%d) %s %s\n",
18906 SvPV_nolen_const(RExC_mysv), REG_NODE_NUM(scan),
18907 (temp == NULL ? "->" : ""),
18908 (temp == NULL ? PL_reg_name[OP(val)] : "")
18916 if (reg_off_by_arg[OP(scan)]) {
18917 ARG_SET(scan, val - scan);
18920 NEXT_OFF(scan) = val - scan;
18926 - regtail_study - set the next-pointer at the end of a node chain of p to val.
18927 - Look for optimizable sequences at the same time.
18928 - currently only looks for EXACT chains.
18930 This is experimental code. The idea is to use this routine to perform
18931 in place optimizations on branches and groups as they are constructed,
18932 with the long term intention of removing optimization from study_chunk so
18933 that it is purely analytical.
18935 Currently only used when in DEBUG mode. The macro REGTAIL_STUDY() is used
18936 to control which is which.
18939 /* TODO: All four parms should be const */
18942 S_regtail_study(pTHX_ RExC_state_t *pRExC_state, regnode *p,
18943 const regnode *val,U32 depth)
18947 #ifdef EXPERIMENTAL_INPLACESCAN
18950 GET_RE_DEBUG_FLAGS_DECL;
18952 PERL_ARGS_ASSERT_REGTAIL_STUDY;
18958 /* Find last node. */
18962 regnode * const temp = regnext(scan);
18963 #ifdef EXPERIMENTAL_INPLACESCAN
18964 if (PL_regkind[OP(scan)] == EXACT) {
18965 bool unfolded_multi_char; /* Unexamined in this routine */
18966 if (join_exact(pRExC_state, scan, &min,
18967 &unfolded_multi_char, 1, val, depth+1))
18972 switch (OP(scan)) {
18976 case EXACTFA_NO_TRIE:
18982 if( exact == PSEUDO )
18984 else if ( exact != OP(scan) )
18993 DEBUG_PARSE_MSG((scan==p ? "tsdy" : ""));
18994 regprop(RExC_rx, RExC_mysv, scan, NULL, pRExC_state);
18995 Perl_re_printf( aTHX_ "~ %s (%d) -> %s\n",
18996 SvPV_nolen_const(RExC_mysv),
18997 REG_NODE_NUM(scan),
18998 PL_reg_name[exact]);
19005 DEBUG_PARSE_MSG("");
19006 regprop(RExC_rx, RExC_mysv, val, NULL, pRExC_state);
19007 Perl_re_printf( aTHX_
19008 "~ attach to %s (%" IVdf ") offset to %" IVdf "\n",
19009 SvPV_nolen_const(RExC_mysv),
19010 (IV)REG_NODE_NUM(val),
19014 if (reg_off_by_arg[OP(scan)]) {
19015 ARG_SET(scan, val - scan);
19018 NEXT_OFF(scan) = val - scan;
19026 - regdump - dump a regexp onto Perl_debug_log in vaguely comprehensible form
19031 S_regdump_intflags(pTHX_ const char *lead, const U32 flags)
19036 ASSUME(REG_INTFLAGS_NAME_SIZE <= sizeof(flags)*8);
19038 for (bit=0; bit<REG_INTFLAGS_NAME_SIZE; bit++) {
19039 if (flags & (1<<bit)) {
19040 if (!set++ && lead)
19041 Perl_re_printf( aTHX_ "%s",lead);
19042 Perl_re_printf( aTHX_ "%s ",PL_reg_intflags_name[bit]);
19047 Perl_re_printf( aTHX_ "\n");
19049 Perl_re_printf( aTHX_ "%s[none-set]\n",lead);
19054 S_regdump_extflags(pTHX_ const char *lead, const U32 flags)
19060 ASSUME(REG_EXTFLAGS_NAME_SIZE <= sizeof(flags)*8);
19062 for (bit=0; bit<REG_EXTFLAGS_NAME_SIZE; bit++) {
19063 if (flags & (1<<bit)) {
19064 if ((1<<bit) & RXf_PMf_CHARSET) { /* Output separately, below */
19067 if (!set++ && lead)
19068 Perl_re_printf( aTHX_ "%s",lead);
19069 Perl_re_printf( aTHX_ "%s ",PL_reg_extflags_name[bit]);
19072 if ((cs = get_regex_charset(flags)) != REGEX_DEPENDS_CHARSET) {
19073 if (!set++ && lead) {
19074 Perl_re_printf( aTHX_ "%s",lead);
19077 case REGEX_UNICODE_CHARSET:
19078 Perl_re_printf( aTHX_ "UNICODE");
19080 case REGEX_LOCALE_CHARSET:
19081 Perl_re_printf( aTHX_ "LOCALE");
19083 case REGEX_ASCII_RESTRICTED_CHARSET:
19084 Perl_re_printf( aTHX_ "ASCII-RESTRICTED");
19086 case REGEX_ASCII_MORE_RESTRICTED_CHARSET:
19087 Perl_re_printf( aTHX_ "ASCII-MORE_RESTRICTED");
19090 Perl_re_printf( aTHX_ "UNKNOWN CHARACTER SET");
19096 Perl_re_printf( aTHX_ "\n");
19098 Perl_re_printf( aTHX_ "%s[none-set]\n",lead);
19104 Perl_regdump(pTHX_ const regexp *r)
19108 SV * const sv = sv_newmortal();
19109 SV *dsv= sv_newmortal();
19110 RXi_GET_DECL(r,ri);
19111 GET_RE_DEBUG_FLAGS_DECL;
19113 PERL_ARGS_ASSERT_REGDUMP;
19115 (void)dumpuntil(r, ri->program, ri->program + 1, NULL, NULL, sv, 0, 0);
19117 /* Header fields of interest. */
19118 for (i = 0; i < 2; i++) {
19119 if (r->substrs->data[i].substr) {
19120 RE_PV_QUOTED_DECL(s, 0, dsv,
19121 SvPVX_const(r->substrs->data[i].substr),
19122 RE_SV_DUMPLEN(r->substrs->data[i].substr),
19123 PL_dump_re_max_len);
19124 Perl_re_printf( aTHX_
19125 "%s %s%s at %" IVdf "..%" UVuf " ",
19126 i ? "floating" : "anchored",
19128 RE_SV_TAIL(r->substrs->data[i].substr),
19129 (IV)r->substrs->data[i].min_offset,
19130 (UV)r->substrs->data[i].max_offset);
19132 else if (r->substrs->data[i].utf8_substr) {
19133 RE_PV_QUOTED_DECL(s, 1, dsv,
19134 SvPVX_const(r->substrs->data[i].utf8_substr),
19135 RE_SV_DUMPLEN(r->substrs->data[i].utf8_substr),
19137 Perl_re_printf( aTHX_
19138 "%s utf8 %s%s at %" IVdf "..%" UVuf " ",
19139 i ? "floating" : "anchored",
19141 RE_SV_TAIL(r->substrs->data[i].utf8_substr),
19142 (IV)r->substrs->data[i].min_offset,
19143 (UV)r->substrs->data[i].max_offset);
19147 if (r->check_substr || r->check_utf8)
19148 Perl_re_printf( aTHX_
19150 ( r->check_substr == r->substrs->data[1].substr
19151 && r->check_utf8 == r->substrs->data[1].utf8_substr
19152 ? "(checking floating" : "(checking anchored"));
19153 if (r->intflags & PREGf_NOSCAN)
19154 Perl_re_printf( aTHX_ " noscan");
19155 if (r->extflags & RXf_CHECK_ALL)
19156 Perl_re_printf( aTHX_ " isall");
19157 if (r->check_substr || r->check_utf8)
19158 Perl_re_printf( aTHX_ ") ");
19160 if (ri->regstclass) {
19161 regprop(r, sv, ri->regstclass, NULL, NULL);
19162 Perl_re_printf( aTHX_ "stclass %s ", SvPVX_const(sv));
19164 if (r->intflags & PREGf_ANCH) {
19165 Perl_re_printf( aTHX_ "anchored");
19166 if (r->intflags & PREGf_ANCH_MBOL)
19167 Perl_re_printf( aTHX_ "(MBOL)");
19168 if (r->intflags & PREGf_ANCH_SBOL)
19169 Perl_re_printf( aTHX_ "(SBOL)");
19170 if (r->intflags & PREGf_ANCH_GPOS)
19171 Perl_re_printf( aTHX_ "(GPOS)");
19172 Perl_re_printf( aTHX_ " ");
19174 if (r->intflags & PREGf_GPOS_SEEN)
19175 Perl_re_printf( aTHX_ "GPOS:%" UVuf " ", (UV)r->gofs);
19176 if (r->intflags & PREGf_SKIP)
19177 Perl_re_printf( aTHX_ "plus ");
19178 if (r->intflags & PREGf_IMPLICIT)
19179 Perl_re_printf( aTHX_ "implicit ");
19180 Perl_re_printf( aTHX_ "minlen %" IVdf " ", (IV)r->minlen);
19181 if (r->extflags & RXf_EVAL_SEEN)
19182 Perl_re_printf( aTHX_ "with eval ");
19183 Perl_re_printf( aTHX_ "\n");
19185 regdump_extflags("r->extflags: ",r->extflags);
19186 regdump_intflags("r->intflags: ",r->intflags);
19189 PERL_ARGS_ASSERT_REGDUMP;
19190 PERL_UNUSED_CONTEXT;
19191 PERL_UNUSED_ARG(r);
19192 #endif /* DEBUGGING */
19195 /* Should be synchronized with ANYOF_ #defines in regcomp.h */
19198 # if _CC_WORDCHAR != 0 || _CC_DIGIT != 1 || _CC_ALPHA != 2 \
19199 || _CC_LOWER != 3 || _CC_UPPER != 4 || _CC_PUNCT != 5 \
19200 || _CC_PRINT != 6 || _CC_ALPHANUMERIC != 7 || _CC_GRAPH != 8 \
19201 || _CC_CASED != 9 || _CC_SPACE != 10 || _CC_BLANK != 11 \
19202 || _CC_XDIGIT != 12 || _CC_CNTRL != 13 || _CC_ASCII != 14 \
19203 || _CC_VERTSPACE != 15
19204 # error Need to adjust order of anyofs[]
19206 static const char * const anyofs[] = {
19243 - regprop - printable representation of opcode, with run time support
19247 Perl_regprop(pTHX_ const regexp *prog, SV *sv, const regnode *o, const regmatch_info *reginfo, const RExC_state_t *pRExC_state)
19251 RXi_GET_DECL(prog,progi);
19252 GET_RE_DEBUG_FLAGS_DECL;
19254 PERL_ARGS_ASSERT_REGPROP;
19258 if (OP(o) > REGNODE_MAX) /* regnode.type is unsigned */
19259 /* It would be nice to FAIL() here, but this may be called from
19260 regexec.c, and it would be hard to supply pRExC_state. */
19261 Perl_croak(aTHX_ "Corrupted regexp opcode %d > %d",
19262 (int)OP(o), (int)REGNODE_MAX);
19263 sv_catpv(sv, PL_reg_name[OP(o)]); /* Take off const! */
19265 k = PL_regkind[OP(o)];
19268 sv_catpvs(sv, " ");
19269 /* Using is_utf8_string() (via PERL_PV_UNI_DETECT)
19270 * is a crude hack but it may be the best for now since
19271 * we have no flag "this EXACTish node was UTF-8"
19273 pv_pretty(sv, STRING(o), STR_LEN(o), PL_dump_re_max_len,
19274 PL_colors[0], PL_colors[1],
19275 PERL_PV_ESCAPE_UNI_DETECT |
19276 PERL_PV_ESCAPE_NONASCII |
19277 PERL_PV_PRETTY_ELLIPSES |
19278 PERL_PV_PRETTY_LTGT |
19279 PERL_PV_PRETTY_NOCLEAR
19281 } else if (k == TRIE) {
19282 /* print the details of the trie in dumpuntil instead, as
19283 * progi->data isn't available here */
19284 const char op = OP(o);
19285 const U32 n = ARG(o);
19286 const reg_ac_data * const ac = IS_TRIE_AC(op) ?
19287 (reg_ac_data *)progi->data->data[n] :
19289 const reg_trie_data * const trie
19290 = (reg_trie_data*)progi->data->data[!IS_TRIE_AC(op) ? n : ac->trie];
19292 Perl_sv_catpvf(aTHX_ sv, "-%s",PL_reg_name[o->flags]);
19293 DEBUG_TRIE_COMPILE_r({
19295 sv_catpvs(sv, "(JUMP)");
19296 Perl_sv_catpvf(aTHX_ sv,
19297 "<S:%" UVuf "/%" IVdf " W:%" UVuf " L:%" UVuf "/%" UVuf " C:%" UVuf "/%" UVuf ">",
19298 (UV)trie->startstate,
19299 (IV)trie->statecount-1, /* -1 because of the unused 0 element */
19300 (UV)trie->wordcount,
19303 (UV)TRIE_CHARCOUNT(trie),
19304 (UV)trie->uniquecharcount
19307 if ( IS_ANYOF_TRIE(op) || trie->bitmap ) {
19308 sv_catpvs(sv, "[");
19309 (void) put_charclass_bitmap_innards(sv,
19310 ((IS_ANYOF_TRIE(op))
19312 : TRIE_BITMAP(trie)),
19318 sv_catpvs(sv, "]");
19320 } else if (k == CURLY) {
19321 U32 lo = ARG1(o), hi = ARG2(o);
19322 if (OP(o) == CURLYM || OP(o) == CURLYN || OP(o) == CURLYX)
19323 Perl_sv_catpvf(aTHX_ sv, "[%d]", o->flags); /* Parenth number */
19324 Perl_sv_catpvf(aTHX_ sv, "{%u,", (unsigned) lo);
19325 if (hi == REG_INFTY)
19326 sv_catpvs(sv, "INFTY");
19328 Perl_sv_catpvf(aTHX_ sv, "%u", (unsigned) hi);
19329 sv_catpvs(sv, "}");
19331 else if (k == WHILEM && o->flags) /* Ordinal/of */
19332 Perl_sv_catpvf(aTHX_ sv, "[%d/%d]", o->flags & 0xf, o->flags>>4);
19333 else if (k == REF || k == OPEN || k == CLOSE
19334 || k == GROUPP || OP(o)==ACCEPT)
19336 AV *name_list= NULL;
19337 U32 parno= OP(o) == ACCEPT ? (U32)ARG2L(o) : ARG(o);
19338 Perl_sv_catpvf(aTHX_ sv, "%" UVuf, (UV)parno); /* Parenth number */
19339 if ( RXp_PAREN_NAMES(prog) ) {
19340 name_list= MUTABLE_AV(progi->data->data[progi->name_list_idx]);
19341 } else if ( pRExC_state ) {
19342 name_list= RExC_paren_name_list;
19345 if ( k != REF || (OP(o) < NREF)) {
19346 SV **name= av_fetch(name_list, parno, 0 );
19348 Perl_sv_catpvf(aTHX_ sv, " '%" SVf "'", SVfARG(*name));
19351 SV *sv_dat= MUTABLE_SV(progi->data->data[ parno ]);
19352 I32 *nums=(I32*)SvPVX(sv_dat);
19353 SV **name= av_fetch(name_list, nums[0], 0 );
19356 for ( n=0; n<SvIVX(sv_dat); n++ ) {
19357 Perl_sv_catpvf(aTHX_ sv, "%s%" IVdf,
19358 (n ? "," : ""), (IV)nums[n]);
19360 Perl_sv_catpvf(aTHX_ sv, " '%" SVf "'", SVfARG(*name));
19364 if ( k == REF && reginfo) {
19365 U32 n = ARG(o); /* which paren pair */
19366 I32 ln = prog->offs[n].start;
19367 if (prog->lastparen < n || ln == -1)
19368 Perl_sv_catpvf(aTHX_ sv, ": FAIL");
19369 else if (ln == prog->offs[n].end)
19370 Perl_sv_catpvf(aTHX_ sv, ": ACCEPT - EMPTY STRING");
19372 const char *s = reginfo->strbeg + ln;
19373 Perl_sv_catpvf(aTHX_ sv, ": ");
19374 Perl_pv_pretty( aTHX_ sv, s, prog->offs[n].end - prog->offs[n].start, 32, 0, 0,
19375 PERL_PV_ESCAPE_UNI_DETECT|PERL_PV_PRETTY_NOCLEAR|PERL_PV_PRETTY_ELLIPSES|PERL_PV_PRETTY_QUOTE );
19378 } else if (k == GOSUB) {
19379 AV *name_list= NULL;
19380 if ( RXp_PAREN_NAMES(prog) ) {
19381 name_list= MUTABLE_AV(progi->data->data[progi->name_list_idx]);
19382 } else if ( pRExC_state ) {
19383 name_list= RExC_paren_name_list;
19386 /* Paren and offset */
19387 Perl_sv_catpvf(aTHX_ sv, "%d[%+d:%d]", (int)ARG(o),(int)ARG2L(o),
19388 (int)((o + (int)ARG2L(o)) - progi->program) );
19390 SV **name= av_fetch(name_list, ARG(o), 0 );
19392 Perl_sv_catpvf(aTHX_ sv, " '%" SVf "'", SVfARG(*name));
19395 else if (k == LOGICAL)
19396 /* 2: embedded, otherwise 1 */
19397 Perl_sv_catpvf(aTHX_ sv, "[%d]", o->flags);
19398 else if (k == ANYOF) {
19399 const U8 flags = ANYOF_FLAGS(o);
19400 bool do_sep = FALSE; /* Do we need to separate various components of
19402 /* Set if there is still an unresolved user-defined property */
19403 SV *unresolved = NULL;
19405 /* Things that are ignored except when the runtime locale is UTF-8 */
19406 SV *only_utf8_locale_invlist = NULL;
19408 /* Code points that don't fit in the bitmap */
19409 SV *nonbitmap_invlist = NULL;
19411 /* And things that aren't in the bitmap, but are small enough to be */
19412 SV* bitmap_range_not_in_bitmap = NULL;
19414 const bool inverted = flags & ANYOF_INVERT;
19416 if (OP(o) == ANYOFL) {
19417 if (ANYOFL_UTF8_LOCALE_REQD(flags)) {
19418 sv_catpvs(sv, "{utf8-locale-reqd}");
19420 if (flags & ANYOFL_FOLD) {
19421 sv_catpvs(sv, "{i}");
19425 /* If there is stuff outside the bitmap, get it */
19426 if (ARG(o) != ANYOF_ONLY_HAS_BITMAP) {
19427 (void) _get_regclass_nonbitmap_data(prog, o, FALSE,
19429 &only_utf8_locale_invlist,
19430 &nonbitmap_invlist);
19431 /* The non-bitmap data may contain stuff that could fit in the
19432 * bitmap. This could come from a user-defined property being
19433 * finally resolved when this call was done; or much more likely
19434 * because there are matches that require UTF-8 to be valid, and so
19435 * aren't in the bitmap. This is teased apart later */
19436 _invlist_intersection(nonbitmap_invlist,
19438 &bitmap_range_not_in_bitmap);
19439 /* Leave just the things that don't fit into the bitmap */
19440 _invlist_subtract(nonbitmap_invlist,
19442 &nonbitmap_invlist);
19445 /* Obey this flag to add all above-the-bitmap code points */
19446 if (flags & ANYOF_MATCHES_ALL_ABOVE_BITMAP) {
19447 nonbitmap_invlist = _add_range_to_invlist(nonbitmap_invlist,
19448 NUM_ANYOF_CODE_POINTS,
19452 /* Ready to start outputting. First, the initial left bracket */
19453 Perl_sv_catpvf(aTHX_ sv, "[%s", PL_colors[0]);
19455 /* Then all the things that could fit in the bitmap */
19456 do_sep = put_charclass_bitmap_innards(sv,
19458 bitmap_range_not_in_bitmap,
19459 only_utf8_locale_invlist,
19462 /* Can't try inverting for a
19463 * better display if there are
19464 * things that haven't been
19466 unresolved != NULL);
19467 SvREFCNT_dec(bitmap_range_not_in_bitmap);
19469 /* If there are user-defined properties which haven't been defined yet,
19470 * output them. If the result is not to be inverted, it is clearest to
19471 * output them in a separate [] from the bitmap range stuff. If the
19472 * result is to be complemented, we have to show everything in one [],
19473 * as the inversion applies to the whole thing. Use {braces} to
19474 * separate them from anything in the bitmap and anything above the
19478 if (! do_sep) { /* If didn't output anything in the bitmap */
19479 sv_catpvs(sv, "^");
19481 sv_catpvs(sv, "{");
19484 Perl_sv_catpvf(aTHX_ sv,"%s][%s",PL_colors[1],PL_colors[0]);
19486 sv_catsv(sv, unresolved);
19488 sv_catpvs(sv, "}");
19490 do_sep = ! inverted;
19493 /* And, finally, add the above-the-bitmap stuff */
19494 if (nonbitmap_invlist && _invlist_len(nonbitmap_invlist)) {
19497 /* See if truncation size is overridden */
19498 const STRLEN dump_len = (PL_dump_re_max_len > 256)
19499 ? PL_dump_re_max_len
19502 /* This is output in a separate [] */
19504 Perl_sv_catpvf(aTHX_ sv,"%s][%s",PL_colors[1],PL_colors[0]);
19507 /* And, for easy of understanding, it is shown in the
19508 * uncomplemented form if possible. The one exception being if
19509 * there are unresolved items, where the inversion has to be
19510 * delayed until runtime */
19511 if (inverted && ! unresolved) {
19512 _invlist_invert(nonbitmap_invlist);
19513 _invlist_subtract(nonbitmap_invlist, PL_InBitmap, &nonbitmap_invlist);
19516 contents = invlist_contents(nonbitmap_invlist,
19517 FALSE /* output suitable for catsv */
19520 /* If the output is shorter than the permissible maximum, just do it. */
19521 if (SvCUR(contents) <= dump_len) {
19522 sv_catsv(sv, contents);
19525 const char * contents_string = SvPVX(contents);
19526 STRLEN i = dump_len;
19528 /* Otherwise, start at the permissible max and work back to the
19529 * first break possibility */
19530 while (i > 0 && contents_string[i] != ' ') {
19533 if (i == 0) { /* Fail-safe. Use the max if we couldn't
19534 find a legal break */
19538 sv_catpvn(sv, contents_string, i);
19539 sv_catpvs(sv, "...");
19542 SvREFCNT_dec_NN(contents);
19543 SvREFCNT_dec_NN(nonbitmap_invlist);
19546 /* And finally the matching, closing ']' */
19547 Perl_sv_catpvf(aTHX_ sv, "%s]", PL_colors[1]);
19549 SvREFCNT_dec(unresolved);
19551 else if (k == POSIXD || k == NPOSIXD) {
19552 U8 index = FLAGS(o) * 2;
19553 if (index < C_ARRAY_LENGTH(anyofs)) {
19554 if (*anyofs[index] != '[') {
19557 sv_catpv(sv, anyofs[index]);
19558 if (*anyofs[index] != '[') {
19563 Perl_sv_catpvf(aTHX_ sv, "[illegal type=%d])", index);
19566 else if (k == BOUND || k == NBOUND) {
19567 /* Must be synced with order of 'bound_type' in regcomp.h */
19568 const char * const bounds[] = {
19569 "", /* Traditional */
19575 assert(FLAGS(o) < C_ARRAY_LENGTH(bounds));
19576 sv_catpv(sv, bounds[FLAGS(o)]);
19578 else if (k == BRANCHJ && (OP(o) == UNLESSM || OP(o) == IFMATCH))
19579 Perl_sv_catpvf(aTHX_ sv, "[%d]", -(o->flags));
19580 else if (OP(o) == SBOL)
19581 Perl_sv_catpvf(aTHX_ sv, " /%s/", o->flags ? "\\A" : "^");
19583 /* add on the verb argument if there is one */
19584 if ( ( k == VERB || OP(o) == ACCEPT || OP(o) == OPFAIL ) && o->flags) {
19586 Perl_sv_catpvf(aTHX_ sv, ":%" SVf,
19587 SVfARG((MUTABLE_SV(progi->data->data[ ARG( o ) ]))));
19589 sv_catpvs(sv, ":NULL");
19592 PERL_UNUSED_CONTEXT;
19593 PERL_UNUSED_ARG(sv);
19594 PERL_UNUSED_ARG(o);
19595 PERL_UNUSED_ARG(prog);
19596 PERL_UNUSED_ARG(reginfo);
19597 PERL_UNUSED_ARG(pRExC_state);
19598 #endif /* DEBUGGING */
19604 Perl_re_intuit_string(pTHX_ REGEXP * const r)
19605 { /* Assume that RE_INTUIT is set */
19606 struct regexp *const prog = ReANY(r);
19607 GET_RE_DEBUG_FLAGS_DECL;
19609 PERL_ARGS_ASSERT_RE_INTUIT_STRING;
19610 PERL_UNUSED_CONTEXT;
19614 const char * const s = SvPV_nolen_const(RX_UTF8(r)
19615 ? prog->check_utf8 : prog->check_substr);
19617 if (!PL_colorset) reginitcolors();
19618 Perl_re_printf( aTHX_
19619 "%sUsing REx %ssubstr:%s \"%s%.60s%s%s\"\n",
19621 RX_UTF8(r) ? "utf8 " : "",
19622 PL_colors[5],PL_colors[0],
19625 (strlen(s) > PL_dump_re_max_len ? "..." : ""));
19628 /* use UTF8 check substring if regexp pattern itself is in UTF8 */
19629 return RX_UTF8(r) ? prog->check_utf8 : prog->check_substr;
19635 handles refcounting and freeing the perl core regexp structure. When
19636 it is necessary to actually free the structure the first thing it
19637 does is call the 'free' method of the regexp_engine associated to
19638 the regexp, allowing the handling of the void *pprivate; member
19639 first. (This routine is not overridable by extensions, which is why
19640 the extensions free is called first.)
19642 See regdupe and regdupe_internal if you change anything here.
19644 #ifndef PERL_IN_XSUB_RE
19646 Perl_pregfree(pTHX_ REGEXP *r)
19652 Perl_pregfree2(pTHX_ REGEXP *rx)
19654 struct regexp *const r = ReANY(rx);
19655 GET_RE_DEBUG_FLAGS_DECL;
19657 PERL_ARGS_ASSERT_PREGFREE2;
19659 if (r->mother_re) {
19660 ReREFCNT_dec(r->mother_re);
19662 CALLREGFREE_PVT(rx); /* free the private data */
19663 SvREFCNT_dec(RXp_PAREN_NAMES(r));
19667 for (i = 0; i < 2; i++) {
19668 SvREFCNT_dec(r->substrs->data[i].substr);
19669 SvREFCNT_dec(r->substrs->data[i].utf8_substr);
19671 Safefree(r->substrs);
19673 RX_MATCH_COPY_FREE(rx);
19674 #ifdef PERL_ANY_COW
19675 SvREFCNT_dec(r->saved_copy);
19678 SvREFCNT_dec(r->qr_anoncv);
19679 if (r->recurse_locinput)
19680 Safefree(r->recurse_locinput);
19686 Copy ssv to dsv, both of which should of type SVt_REGEXP or SVt_PVLV,
19687 except that dsv will be created if NULL.
19689 This function is used in two main ways. First to implement
19690 $r = qr/....; $s = $$r;
19692 Secondly, it is used as a hacky workaround to the structural issue of
19694 being stored in the regexp structure which is in turn stored in
19695 PL_curpm/PL_reg_curpm. The problem is that due to qr// the pattern
19696 could be PL_curpm in multiple contexts, and could require multiple
19697 result sets being associated with the pattern simultaneously, such
19698 as when doing a recursive match with (??{$qr})
19700 The solution is to make a lightweight copy of the regexp structure
19701 when a qr// is returned from the code executed by (??{$qr}) this
19702 lightweight copy doesn't actually own any of its data except for
19703 the starp/end and the actual regexp structure itself.
19709 Perl_reg_temp_copy(pTHX_ REGEXP *dsv, REGEXP *ssv)
19711 struct regexp *drx;
19712 struct regexp *const srx = ReANY(ssv);
19713 const bool islv = dsv && SvTYPE(dsv) == SVt_PVLV;
19715 PERL_ARGS_ASSERT_REG_TEMP_COPY;
19718 dsv = (REGEXP*) newSV_type(SVt_REGEXP);
19720 SvOK_off((SV *)dsv);
19722 /* For PVLVs, the head (sv_any) points to an XPVLV, while
19723 * the LV's xpvlenu_rx will point to a regexp body, which
19724 * we allocate here */
19725 REGEXP *temp = (REGEXP *)newSV_type(SVt_REGEXP);
19726 assert(!SvPVX(dsv));
19727 ((XPV*)SvANY(dsv))->xpv_len_u.xpvlenu_rx = temp->sv_any;
19728 temp->sv_any = NULL;
19729 SvFLAGS(temp) = (SvFLAGS(temp) & ~SVTYPEMASK) | SVt_NULL;
19730 SvREFCNT_dec_NN(temp);
19731 /* SvCUR still resides in the xpvlv struct, so the regexp copy-
19732 ing below will not set it. */
19733 SvCUR_set(dsv, SvCUR(ssv));
19736 /* This ensures that SvTHINKFIRST(sv) is true, and hence that
19737 sv_force_normal(sv) is called. */
19741 SvFLAGS(dsv) |= SvFLAGS(ssv) & (SVf_POK|SVp_POK|SVf_UTF8);
19742 SvPV_set(dsv, RX_WRAPPED(ssv));
19743 /* We share the same string buffer as the original regexp, on which we
19744 hold a reference count, incremented when mother_re is set below.
19745 The string pointer is copied here, being part of the regexp struct.
19747 memcpy(&(drx->xpv_cur), &(srx->xpv_cur),
19748 sizeof(regexp) - STRUCT_OFFSET(regexp, xpv_cur));
19752 const I32 npar = srx->nparens+1;
19753 Newx(drx->offs, npar, regexp_paren_pair);
19754 Copy(srx->offs, drx->offs, npar, regexp_paren_pair);
19756 if (srx->substrs) {
19758 Newx(drx->substrs, 1, struct reg_substr_data);
19759 StructCopy(srx->substrs, drx->substrs, struct reg_substr_data);
19761 for (i = 0; i < 2; i++) {
19762 SvREFCNT_inc_void(drx->substrs->data[i].substr);
19763 SvREFCNT_inc_void(drx->substrs->data[i].utf8_substr);
19766 /* check_substr and check_utf8, if non-NULL, point to either their
19767 anchored or float namesakes, and don't hold a second reference. */
19769 RX_MATCH_COPIED_off(dsv);
19770 #ifdef PERL_ANY_COW
19771 drx->saved_copy = NULL;
19773 drx->mother_re = ReREFCNT_inc(srx->mother_re ? srx->mother_re : ssv);
19774 SvREFCNT_inc_void(drx->qr_anoncv);
19775 if (srx->recurse_locinput)
19776 Newx(drx->recurse_locinput,srx->nparens + 1,char *);
19783 /* regfree_internal()
19785 Free the private data in a regexp. This is overloadable by
19786 extensions. Perl takes care of the regexp structure in pregfree(),
19787 this covers the *pprivate pointer which technically perl doesn't
19788 know about, however of course we have to handle the
19789 regexp_internal structure when no extension is in use.
19791 Note this is called before freeing anything in the regexp
19796 Perl_regfree_internal(pTHX_ REGEXP * const rx)
19798 struct regexp *const r = ReANY(rx);
19799 RXi_GET_DECL(r,ri);
19800 GET_RE_DEBUG_FLAGS_DECL;
19802 PERL_ARGS_ASSERT_REGFREE_INTERNAL;
19808 SV *dsv= sv_newmortal();
19809 RE_PV_QUOTED_DECL(s, RX_UTF8(rx),
19810 dsv, RX_PRECOMP(rx), RX_PRELEN(rx), PL_dump_re_max_len);
19811 Perl_re_printf( aTHX_ "%sFreeing REx:%s %s\n",
19812 PL_colors[4],PL_colors[5],s);
19815 #ifdef RE_TRACK_PATTERN_OFFSETS
19817 Safefree(ri->u.offsets); /* 20010421 MJD */
19819 if (ri->code_blocks)
19820 S_free_codeblocks(aTHX_ ri->code_blocks);
19823 int n = ri->data->count;
19826 /* If you add a ->what type here, update the comment in regcomp.h */
19827 switch (ri->data->what[n]) {
19833 SvREFCNT_dec(MUTABLE_SV(ri->data->data[n]));
19836 Safefree(ri->data->data[n]);
19842 { /* Aho Corasick add-on structure for a trie node.
19843 Used in stclass optimization only */
19845 reg_ac_data *aho=(reg_ac_data*)ri->data->data[n];
19846 #ifdef USE_ITHREADS
19850 refcount = --aho->refcount;
19853 PerlMemShared_free(aho->states);
19854 PerlMemShared_free(aho->fail);
19855 /* do this last!!!! */
19856 PerlMemShared_free(ri->data->data[n]);
19857 /* we should only ever get called once, so
19858 * assert as much, and also guard the free
19859 * which /might/ happen twice. At the least
19860 * it will make code anlyzers happy and it
19861 * doesn't cost much. - Yves */
19862 assert(ri->regstclass);
19863 if (ri->regstclass) {
19864 PerlMemShared_free(ri->regstclass);
19865 ri->regstclass = 0;
19872 /* trie structure. */
19874 reg_trie_data *trie=(reg_trie_data*)ri->data->data[n];
19875 #ifdef USE_ITHREADS
19879 refcount = --trie->refcount;
19882 PerlMemShared_free(trie->charmap);
19883 PerlMemShared_free(trie->states);
19884 PerlMemShared_free(trie->trans);
19886 PerlMemShared_free(trie->bitmap);
19888 PerlMemShared_free(trie->jump);
19889 PerlMemShared_free(trie->wordinfo);
19890 /* do this last!!!! */
19891 PerlMemShared_free(ri->data->data[n]);
19896 Perl_croak(aTHX_ "panic: regfree data code '%c'",
19897 ri->data->what[n]);
19900 Safefree(ri->data->what);
19901 Safefree(ri->data);
19907 #define av_dup_inc(s,t) MUTABLE_AV(sv_dup_inc((const SV *)s,t))
19908 #define hv_dup_inc(s,t) MUTABLE_HV(sv_dup_inc((const SV *)s,t))
19909 #define SAVEPVN(p,n) ((p) ? savepvn(p,n) : NULL)
19912 re_dup_guts - duplicate a regexp.
19914 This routine is expected to clone a given regexp structure. It is only
19915 compiled under USE_ITHREADS.
19917 After all of the core data stored in struct regexp is duplicated
19918 the regexp_engine.dupe method is used to copy any private data
19919 stored in the *pprivate pointer. This allows extensions to handle
19920 any duplication it needs to do.
19922 See pregfree() and regfree_internal() if you change anything here.
19924 #if defined(USE_ITHREADS)
19925 #ifndef PERL_IN_XSUB_RE
19927 Perl_re_dup_guts(pTHX_ const REGEXP *sstr, REGEXP *dstr, CLONE_PARAMS *param)
19931 const struct regexp *r = ReANY(sstr);
19932 struct regexp *ret = ReANY(dstr);
19934 PERL_ARGS_ASSERT_RE_DUP_GUTS;
19936 npar = r->nparens+1;
19937 Newx(ret->offs, npar, regexp_paren_pair);
19938 Copy(r->offs, ret->offs, npar, regexp_paren_pair);
19940 if (ret->substrs) {
19941 /* Do it this way to avoid reading from *r after the StructCopy().
19942 That way, if any of the sv_dup_inc()s dislodge *r from the L1
19943 cache, it doesn't matter. */
19945 const bool anchored = r->check_substr
19946 ? r->check_substr == r->substrs->data[0].substr
19947 : r->check_utf8 == r->substrs->data[0].utf8_substr;
19948 Newx(ret->substrs, 1, struct reg_substr_data);
19949 StructCopy(r->substrs, ret->substrs, struct reg_substr_data);
19951 for (i = 0; i < 2; i++) {
19952 ret->substrs->data[i].substr =
19953 sv_dup_inc(ret->substrs->data[i].substr, param);
19954 ret->substrs->data[i].utf8_substr =
19955 sv_dup_inc(ret->substrs->data[i].utf8_substr, param);
19958 /* check_substr and check_utf8, if non-NULL, point to either their
19959 anchored or float namesakes, and don't hold a second reference. */
19961 if (ret->check_substr) {
19963 assert(r->check_utf8 == r->substrs->data[0].utf8_substr);
19965 ret->check_substr = ret->substrs->data[0].substr;
19966 ret->check_utf8 = ret->substrs->data[0].utf8_substr;
19968 assert(r->check_substr == r->substrs->data[1].substr);
19969 assert(r->check_utf8 == r->substrs->data[1].utf8_substr);
19971 ret->check_substr = ret->substrs->data[1].substr;
19972 ret->check_utf8 = ret->substrs->data[1].utf8_substr;
19974 } else if (ret->check_utf8) {
19976 ret->check_utf8 = ret->substrs->data[0].utf8_substr;
19978 ret->check_utf8 = ret->substrs->data[1].utf8_substr;
19983 RXp_PAREN_NAMES(ret) = hv_dup_inc(RXp_PAREN_NAMES(ret), param);
19984 ret->qr_anoncv = MUTABLE_CV(sv_dup_inc((const SV *)ret->qr_anoncv, param));
19985 if (r->recurse_locinput)
19986 Newx(ret->recurse_locinput,r->nparens + 1,char *);
19989 RXi_SET(ret,CALLREGDUPE_PVT(dstr,param));
19991 if (RX_MATCH_COPIED(dstr))
19992 ret->subbeg = SAVEPVN(ret->subbeg, ret->sublen);
19994 ret->subbeg = NULL;
19995 #ifdef PERL_ANY_COW
19996 ret->saved_copy = NULL;
19999 /* Whether mother_re be set or no, we need to copy the string. We
20000 cannot refrain from copying it when the storage points directly to
20001 our mother regexp, because that's
20002 1: a buffer in a different thread
20003 2: something we no longer hold a reference on
20004 so we need to copy it locally. */
20005 RX_WRAPPED(dstr) = SAVEPVN(RX_WRAPPED_const(sstr), SvCUR(sstr)+1);
20006 ret->mother_re = NULL;
20008 #endif /* PERL_IN_XSUB_RE */
20013 This is the internal complement to regdupe() which is used to copy
20014 the structure pointed to by the *pprivate pointer in the regexp.
20015 This is the core version of the extension overridable cloning hook.
20016 The regexp structure being duplicated will be copied by perl prior
20017 to this and will be provided as the regexp *r argument, however
20018 with the /old/ structures pprivate pointer value. Thus this routine
20019 may override any copying normally done by perl.
20021 It returns a pointer to the new regexp_internal structure.
20025 Perl_regdupe_internal(pTHX_ REGEXP * const rx, CLONE_PARAMS *param)
20028 struct regexp *const r = ReANY(rx);
20029 regexp_internal *reti;
20031 RXi_GET_DECL(r,ri);
20033 PERL_ARGS_ASSERT_REGDUPE_INTERNAL;
20037 Newxc(reti, sizeof(regexp_internal) + len*sizeof(regnode),
20038 char, regexp_internal);
20039 Copy(ri->program, reti->program, len+1, regnode);
20042 if (ri->code_blocks) {
20044 Newx(reti->code_blocks, 1, struct reg_code_blocks);
20045 Newx(reti->code_blocks->cb, ri->code_blocks->count,
20046 struct reg_code_block);
20047 Copy(ri->code_blocks->cb, reti->code_blocks->cb,
20048 ri->code_blocks->count, struct reg_code_block);
20049 for (n = 0; n < ri->code_blocks->count; n++)
20050 reti->code_blocks->cb[n].src_regex = (REGEXP*)
20051 sv_dup_inc((SV*)(ri->code_blocks->cb[n].src_regex), param);
20052 reti->code_blocks->count = ri->code_blocks->count;
20053 reti->code_blocks->refcnt = 1;
20056 reti->code_blocks = NULL;
20058 reti->regstclass = NULL;
20061 struct reg_data *d;
20062 const int count = ri->data->count;
20065 Newxc(d, sizeof(struct reg_data) + count*sizeof(void *),
20066 char, struct reg_data);
20067 Newx(d->what, count, U8);
20070 for (i = 0; i < count; i++) {
20071 d->what[i] = ri->data->what[i];
20072 switch (d->what[i]) {
20073 /* see also regcomp.h and regfree_internal() */
20074 case 'a': /* actually an AV, but the dup function is identical.
20075 values seem to be "plain sv's" generally. */
20076 case 'r': /* a compiled regex (but still just another SV) */
20077 case 's': /* an RV (currently only used for an RV to an AV by the ANYOF code)
20078 this use case should go away, the code could have used
20079 'a' instead - see S_set_ANYOF_arg() for array contents. */
20080 case 'S': /* actually an SV, but the dup function is identical. */
20081 case 'u': /* actually an HV, but the dup function is identical.
20082 values are "plain sv's" */
20083 d->data[i] = sv_dup_inc((const SV *)ri->data->data[i], param);
20086 /* Synthetic Start Class - "Fake" charclass we generate to optimize
20087 * patterns which could start with several different things. Pre-TRIE
20088 * this was more important than it is now, however this still helps
20089 * in some places, for instance /x?a+/ might produce a SSC equivalent
20090 * to [xa]. This is used by Perl_re_intuit_start() and S_find_byclass()
20093 /* This is cheating. */
20094 Newx(d->data[i], 1, regnode_ssc);
20095 StructCopy(ri->data->data[i], d->data[i], regnode_ssc);
20096 reti->regstclass = (regnode*)d->data[i];
20099 /* AHO-CORASICK fail table */
20100 /* Trie stclasses are readonly and can thus be shared
20101 * without duplication. We free the stclass in pregfree
20102 * when the corresponding reg_ac_data struct is freed.
20104 reti->regstclass= ri->regstclass;
20107 /* TRIE transition table */
20109 ((reg_trie_data*)ri->data->data[i])->refcount++;
20112 case 'l': /* (?{...}) or (??{ ... }) code (cb->block) */
20113 case 'L': /* same when RExC_pm_flags & PMf_HAS_CV and code
20114 is not from another regexp */
20115 d->data[i] = ri->data->data[i];
20118 Perl_croak(aTHX_ "panic: re_dup_guts unknown data code '%c'",
20119 ri->data->what[i]);
20128 reti->name_list_idx = ri->name_list_idx;
20130 #ifdef RE_TRACK_PATTERN_OFFSETS
20131 if (ri->u.offsets) {
20132 Newx(reti->u.offsets, 2*len+1, U32);
20133 Copy(ri->u.offsets, reti->u.offsets, 2*len+1, U32);
20136 SetProgLen(reti,len);
20139 return (void*)reti;
20142 #endif /* USE_ITHREADS */
20144 #ifndef PERL_IN_XSUB_RE
20147 - regnext - dig the "next" pointer out of a node
20150 Perl_regnext(pTHX_ regnode *p)
20157 if (OP(p) > REGNODE_MAX) { /* regnode.type is unsigned */
20158 Perl_croak(aTHX_ "Corrupted regexp opcode %d > %d",
20159 (int)OP(p), (int)REGNODE_MAX);
20162 offset = (reg_off_by_arg[OP(p)] ? ARG(p) : NEXT_OFF(p));
20171 S_re_croak2(pTHX_ bool utf8, const char* pat1,const char* pat2,...)
20174 STRLEN l1 = strlen(pat1);
20175 STRLEN l2 = strlen(pat2);
20178 const char *message;
20180 PERL_ARGS_ASSERT_RE_CROAK2;
20186 Copy(pat1, buf, l1 , char);
20187 Copy(pat2, buf + l1, l2 , char);
20188 buf[l1 + l2] = '\n';
20189 buf[l1 + l2 + 1] = '\0';
20190 va_start(args, pat2);
20191 msv = vmess(buf, &args);
20193 message = SvPV_const(msv,l1);
20196 Copy(message, buf, l1 , char);
20197 /* l1-1 to avoid \n */
20198 Perl_croak(aTHX_ "%" UTF8f, UTF8fARG(utf8, l1-1, buf));
20201 /* XXX Here's a total kludge. But we need to re-enter for swash routines. */
20203 #ifndef PERL_IN_XSUB_RE
20205 Perl_save_re_context(pTHX)
20210 /* Save $1..$n (#18107: UTF-8 s/(\w+)/uc($1)/e); AMS 20021106. */
20213 const REGEXP * const rx = PM_GETRE(PL_curpm);
20215 nparens = RX_NPARENS(rx);
20218 /* RT #124109. This is a complete hack; in the SWASHNEW case we know
20219 * that PL_curpm will be null, but that utf8.pm and the modules it
20220 * loads will only use $1..$3.
20221 * The t/porting/re_context.t test file checks this assumption.
20226 for (i = 1; i <= nparens; i++) {
20227 char digits[TYPE_CHARS(long)];
20228 const STRLEN len = my_snprintf(digits, sizeof(digits),
20230 GV *const *const gvp
20231 = (GV**)hv_fetch(PL_defstash, digits, len, 0);
20234 GV * const gv = *gvp;
20235 if (SvTYPE(gv) == SVt_PVGV && GvSV(gv))
20245 S_put_code_point(pTHX_ SV *sv, UV c)
20247 PERL_ARGS_ASSERT_PUT_CODE_POINT;
20250 Perl_sv_catpvf(aTHX_ sv, "\\x{%04" UVXf "}", c);
20252 else if (isPRINT(c)) {
20253 const char string = (char) c;
20255 /* We use {phrase} as metanotation in the class, so also escape literal
20257 if (isBACKSLASHED_PUNCT(c) || c == '{' || c == '}')
20258 sv_catpvs(sv, "\\");
20259 sv_catpvn(sv, &string, 1);
20261 else if (isMNEMONIC_CNTRL(c)) {
20262 Perl_sv_catpvf(aTHX_ sv, "%s", cntrl_to_mnemonic((U8) c));
20265 Perl_sv_catpvf(aTHX_ sv, "\\x%02X", (U8) c);
20269 #define MAX_PRINT_A MAX_PRINT_A_FOR_USE_ONLY_BY_REGCOMP_DOT_C
20272 S_put_range(pTHX_ SV *sv, UV start, const UV end, const bool allow_literals)
20274 /* Appends to 'sv' a displayable version of the range of code points from
20275 * 'start' to 'end'. Mnemonics (like '\r') are used for the few controls
20276 * that have them, when they occur at the beginning or end of the range.
20277 * It uses hex to output the remaining code points, unless 'allow_literals'
20278 * is true, in which case the printable ASCII ones are output as-is (though
20279 * some of these will be escaped by put_code_point()).
20281 * NOTE: This is designed only for printing ranges of code points that fit
20282 * inside an ANYOF bitmap. Higher code points are simply suppressed
20285 const unsigned int min_range_count = 3;
20287 assert(start <= end);
20289 PERL_ARGS_ASSERT_PUT_RANGE;
20291 while (start <= end) {
20293 const char * format;
20295 if (end - start < min_range_count) {
20297 /* Output chars individually when they occur in short ranges */
20298 for (; start <= end; start++) {
20299 put_code_point(sv, start);
20304 /* If permitted by the input options, and there is a possibility that
20305 * this range contains a printable literal, look to see if there is
20307 if (allow_literals && start <= MAX_PRINT_A) {
20309 /* If the character at the beginning of the range isn't an ASCII
20310 * printable, effectively split the range into two parts:
20311 * 1) the portion before the first such printable,
20313 * and output them separately. */
20314 if (! isPRINT_A(start)) {
20315 UV temp_end = start + 1;
20317 /* There is no point looking beyond the final possible
20318 * printable, in MAX_PRINT_A */
20319 UV max = MIN(end, MAX_PRINT_A);
20321 while (temp_end <= max && ! isPRINT_A(temp_end)) {
20325 /* Here, temp_end points to one beyond the first printable if
20326 * found, or to one beyond 'max' if not. If none found, make
20327 * sure that we use the entire range */
20328 if (temp_end > MAX_PRINT_A) {
20329 temp_end = end + 1;
20332 /* Output the first part of the split range: the part that
20333 * doesn't have printables, with the parameter set to not look
20334 * for literals (otherwise we would infinitely recurse) */
20335 put_range(sv, start, temp_end - 1, FALSE);
20337 /* The 2nd part of the range (if any) starts here. */
20340 /* We do a continue, instead of dropping down, because even if
20341 * the 2nd part is non-empty, it could be so short that we want
20342 * to output it as individual characters, as tested for at the
20343 * top of this loop. */
20347 /* Here, 'start' is a printable ASCII. If it is an alphanumeric,
20348 * output a sub-range of just the digits or letters, then process
20349 * the remaining portion as usual. */
20350 if (isALPHANUMERIC_A(start)) {
20351 UV mask = (isDIGIT_A(start))
20356 UV temp_end = start + 1;
20358 /* Find the end of the sub-range that includes just the
20359 * characters in the same class as the first character in it */
20360 while (temp_end <= end && _generic_isCC_A(temp_end, mask)) {
20365 /* For short ranges, don't duplicate the code above to output
20366 * them; just call recursively */
20367 if (temp_end - start < min_range_count) {
20368 put_range(sv, start, temp_end, FALSE);
20370 else { /* Output as a range */
20371 put_code_point(sv, start);
20372 sv_catpvs(sv, "-");
20373 put_code_point(sv, temp_end);
20375 start = temp_end + 1;
20379 /* We output any other printables as individual characters */
20380 if (isPUNCT_A(start) || isSPACE_A(start)) {
20381 while (start <= end && (isPUNCT_A(start)
20382 || isSPACE_A(start)))
20384 put_code_point(sv, start);
20389 } /* End of looking for literals */
20391 /* Here is not to output as a literal. Some control characters have
20392 * mnemonic names. Split off any of those at the beginning and end of
20393 * the range to print mnemonically. It isn't possible for many of
20394 * these to be in a row, so this won't overwhelm with output */
20396 && (isMNEMONIC_CNTRL(start) || isMNEMONIC_CNTRL(end)))
20398 while (isMNEMONIC_CNTRL(start) && start <= end) {
20399 put_code_point(sv, start);
20403 /* If this didn't take care of the whole range ... */
20404 if (start <= end) {
20406 /* Look backwards from the end to find the final non-mnemonic
20409 while (isMNEMONIC_CNTRL(temp_end)) {
20413 /* And separately output the interior range that doesn't start
20414 * or end with mnemonics */
20415 put_range(sv, start, temp_end, FALSE);
20417 /* Then output the mnemonic trailing controls */
20418 start = temp_end + 1;
20419 while (start <= end) {
20420 put_code_point(sv, start);
20427 /* As a final resort, output the range or subrange as hex. */
20429 this_end = (end < NUM_ANYOF_CODE_POINTS)
20431 : NUM_ANYOF_CODE_POINTS - 1;
20432 #if NUM_ANYOF_CODE_POINTS > 256
20433 format = (this_end < 256)
20434 ? "\\x%02" UVXf "-\\x%02" UVXf
20435 : "\\x{%04" UVXf "}-\\x{%04" UVXf "}";
20437 format = "\\x%02" UVXf "-\\x%02" UVXf;
20439 GCC_DIAG_IGNORE_STMT(-Wformat-nonliteral);
20440 Perl_sv_catpvf(aTHX_ sv, format, start, this_end);
20441 GCC_DIAG_RESTORE_STMT;
20447 S_put_charclass_bitmap_innards_invlist(pTHX_ SV *sv, SV* invlist)
20449 /* Concatenate onto the PV in 'sv' a displayable form of the inversion list
20453 bool allow_literals = TRUE;
20455 PERL_ARGS_ASSERT_PUT_CHARCLASS_BITMAP_INNARDS_INVLIST;
20457 /* Generally, it is more readable if printable characters are output as
20458 * literals, but if a range (nearly) spans all of them, it's best to output
20459 * it as a single range. This code will use a single range if all but 2
20460 * ASCII printables are in it */
20461 invlist_iterinit(invlist);
20462 while (invlist_iternext(invlist, &start, &end)) {
20464 /* If the range starts beyond the final printable, it doesn't have any
20466 if (start > MAX_PRINT_A) {
20470 /* In both ASCII and EBCDIC, a SPACE is the lowest printable. To span
20471 * all but two, the range must start and end no later than 2 from
20473 if (start < ' ' + 2 && end > MAX_PRINT_A - 2) {
20474 if (end > MAX_PRINT_A) {
20480 if (end - start >= MAX_PRINT_A - ' ' - 2) {
20481 allow_literals = FALSE;
20486 invlist_iterfinish(invlist);
20488 /* Here we have figured things out. Output each range */
20489 invlist_iterinit(invlist);
20490 while (invlist_iternext(invlist, &start, &end)) {
20491 if (start >= NUM_ANYOF_CODE_POINTS) {
20494 put_range(sv, start, end, allow_literals);
20496 invlist_iterfinish(invlist);
20502 S_put_charclass_bitmap_innards_common(pTHX_
20503 SV* invlist, /* The bitmap */
20504 SV* posixes, /* Under /l, things like [:word:], \S */
20505 SV* only_utf8, /* Under /d, matches iff the target is UTF-8 */
20506 SV* not_utf8, /* /d, matches iff the target isn't UTF-8 */
20507 SV* only_utf8_locale, /* Under /l, matches if the locale is UTF-8 */
20508 const bool invert /* Is the result to be inverted? */
20511 /* Create and return an SV containing a displayable version of the bitmap
20512 * and associated information determined by the input parameters. If the
20513 * output would have been only the inversion indicator '^', NULL is instead
20518 PERL_ARGS_ASSERT_PUT_CHARCLASS_BITMAP_INNARDS_COMMON;
20521 output = newSVpvs("^");
20524 output = newSVpvs("");
20527 /* First, the code points in the bitmap that are unconditionally there */
20528 put_charclass_bitmap_innards_invlist(output, invlist);
20530 /* Traditionally, these have been placed after the main code points */
20532 sv_catsv(output, posixes);
20535 if (only_utf8 && _invlist_len(only_utf8)) {
20536 Perl_sv_catpvf(aTHX_ output, "%s{utf8}%s", PL_colors[1], PL_colors[0]);
20537 put_charclass_bitmap_innards_invlist(output, only_utf8);
20540 if (not_utf8 && _invlist_len(not_utf8)) {
20541 Perl_sv_catpvf(aTHX_ output, "%s{not utf8}%s", PL_colors[1], PL_colors[0]);
20542 put_charclass_bitmap_innards_invlist(output, not_utf8);
20545 if (only_utf8_locale && _invlist_len(only_utf8_locale)) {
20546 Perl_sv_catpvf(aTHX_ output, "%s{utf8 locale}%s", PL_colors[1], PL_colors[0]);
20547 put_charclass_bitmap_innards_invlist(output, only_utf8_locale);
20549 /* This is the only list in this routine that can legally contain code
20550 * points outside the bitmap range. The call just above to
20551 * 'put_charclass_bitmap_innards_invlist' will simply suppress them, so
20552 * output them here. There's about a half-dozen possible, and none in
20553 * contiguous ranges longer than 2 */
20554 if (invlist_highest(only_utf8_locale) >= NUM_ANYOF_CODE_POINTS) {
20556 SV* above_bitmap = NULL;
20558 _invlist_subtract(only_utf8_locale, PL_InBitmap, &above_bitmap);
20560 invlist_iterinit(above_bitmap);
20561 while (invlist_iternext(above_bitmap, &start, &end)) {
20564 for (i = start; i <= end; i++) {
20565 put_code_point(output, i);
20568 invlist_iterfinish(above_bitmap);
20569 SvREFCNT_dec_NN(above_bitmap);
20573 if (invert && SvCUR(output) == 1) {
20581 S_put_charclass_bitmap_innards(pTHX_ SV *sv,
20583 SV *nonbitmap_invlist,
20584 SV *only_utf8_locale_invlist,
20585 const regnode * const node,
20586 const bool force_as_is_display)
20588 /* Appends to 'sv' a displayable version of the innards of the bracketed
20589 * character class defined by the other arguments:
20590 * 'bitmap' points to the bitmap.
20591 * 'nonbitmap_invlist' is an inversion list of the code points that are in
20592 * the bitmap range, but for some reason aren't in the bitmap; NULL if
20593 * none. The reasons for this could be that they require some
20594 * condition such as the target string being or not being in UTF-8
20595 * (under /d), or because they came from a user-defined property that
20596 * was not resolved at the time of the regex compilation (under /u)
20597 * 'only_utf8_locale_invlist' is an inversion list of the code points that
20598 * are valid only if the runtime locale is a UTF-8 one; NULL if none
20599 * 'node' is the regex pattern node. It is needed only when the above two
20600 * parameters are not null, and is passed so that this routine can
20601 * tease apart the various reasons for them.
20602 * 'force_as_is_display' is TRUE if this routine should definitely NOT try
20603 * to invert things to see if that leads to a cleaner display. If
20604 * FALSE, this routine is free to use its judgment about doing this.
20606 * It returns TRUE if there was actually something output. (It may be that
20607 * the bitmap, etc is empty.)
20609 * When called for outputting the bitmap of a non-ANYOF node, just pass the
20610 * bitmap, with the succeeding parameters set to NULL, and the final one to
20614 /* In general, it tries to display the 'cleanest' representation of the
20615 * innards, choosing whether to display them inverted or not, regardless of
20616 * whether the class itself is to be inverted. However, there are some
20617 * cases where it can't try inverting, as what actually matches isn't known
20618 * until runtime, and hence the inversion isn't either. */
20619 bool inverting_allowed = ! force_as_is_display;
20622 STRLEN orig_sv_cur = SvCUR(sv);
20624 SV* invlist; /* Inversion list we accumulate of code points that
20625 are unconditionally matched */
20626 SV* only_utf8 = NULL; /* Under /d, list of matches iff the target is
20628 SV* not_utf8 = NULL; /* /d, list of matches iff the target isn't UTF-8
20630 SV* posixes = NULL; /* Under /l, string of things like [:word:], \D */
20631 SV* only_utf8_locale = NULL; /* Under /l, list of matches if the locale
20634 SV* as_is_display; /* The output string when we take the inputs
20636 SV* inverted_display; /* The output string when we invert the inputs */
20638 U8 flags = (node) ? ANYOF_FLAGS(node) : 0;
20640 bool invert = cBOOL(flags & ANYOF_INVERT); /* Is the input to be inverted
20642 /* We are biased in favor of displaying things without them being inverted,
20643 * as that is generally easier to understand */
20644 const int bias = 5;
20646 PERL_ARGS_ASSERT_PUT_CHARCLASS_BITMAP_INNARDS;
20648 /* Start off with whatever code points are passed in. (We clone, so we
20649 * don't change the caller's list) */
20650 if (nonbitmap_invlist) {
20651 assert(invlist_highest(nonbitmap_invlist) < NUM_ANYOF_CODE_POINTS);
20652 invlist = invlist_clone(nonbitmap_invlist);
20654 else { /* Worst case size is every other code point is matched */
20655 invlist = _new_invlist(NUM_ANYOF_CODE_POINTS / 2);
20659 if (OP(node) == ANYOFD) {
20661 /* This flag indicates that the code points below 0x100 in the
20662 * nonbitmap list are precisely the ones that match only when the
20663 * target is UTF-8 (they should all be non-ASCII). */
20664 if (flags & ANYOF_SHARED_d_UPPER_LATIN1_UTF8_STRING_MATCHES_non_d_RUNTIME_USER_PROP)
20666 _invlist_intersection(invlist, PL_UpperLatin1, &only_utf8);
20667 _invlist_subtract(invlist, only_utf8, &invlist);
20670 /* And this flag for matching all non-ASCII 0xFF and below */
20671 if (flags & ANYOF_SHARED_d_MATCHES_ALL_NON_UTF8_NON_ASCII_non_d_WARN_SUPER)
20673 not_utf8 = invlist_clone(PL_UpperLatin1);
20676 else if (OP(node) == ANYOFL) {
20678 /* If either of these flags are set, what matches isn't
20679 * determinable except during execution, so don't know enough here
20681 if (flags & (ANYOFL_FOLD|ANYOF_MATCHES_POSIXL)) {
20682 inverting_allowed = FALSE;
20685 /* What the posix classes match also varies at runtime, so these
20686 * will be output symbolically. */
20687 if (ANYOF_POSIXL_TEST_ANY_SET(node)) {
20690 posixes = newSVpvs("");
20691 for (i = 0; i < ANYOF_POSIXL_MAX; i++) {
20692 if (ANYOF_POSIXL_TEST(node,i)) {
20693 sv_catpv(posixes, anyofs[i]);
20700 /* Accumulate the bit map into the unconditional match list */
20701 for (i = 0; i < NUM_ANYOF_CODE_POINTS; i++) {
20702 if (BITMAP_TEST(bitmap, i)) {
20704 for (; i < NUM_ANYOF_CODE_POINTS && BITMAP_TEST(bitmap, i); i++) {
20707 invlist = _add_range_to_invlist(invlist, start, i-1);
20711 /* Make sure that the conditional match lists don't have anything in them
20712 * that match unconditionally; otherwise the output is quite confusing.
20713 * This could happen if the code that populates these misses some
20716 _invlist_subtract(only_utf8, invlist, &only_utf8);
20719 _invlist_subtract(not_utf8, invlist, ¬_utf8);
20722 if (only_utf8_locale_invlist) {
20724 /* Since this list is passed in, we have to make a copy before
20726 only_utf8_locale = invlist_clone(only_utf8_locale_invlist);
20728 _invlist_subtract(only_utf8_locale, invlist, &only_utf8_locale);
20730 /* And, it can get really weird for us to try outputting an inverted
20731 * form of this list when it has things above the bitmap, so don't even
20733 if (invlist_highest(only_utf8_locale) >= NUM_ANYOF_CODE_POINTS) {
20734 inverting_allowed = FALSE;
20738 /* Calculate what the output would be if we take the input as-is */
20739 as_is_display = put_charclass_bitmap_innards_common(invlist,
20746 /* If have to take the output as-is, just do that */
20747 if (! inverting_allowed) {
20748 if (as_is_display) {
20749 sv_catsv(sv, as_is_display);
20750 SvREFCNT_dec_NN(as_is_display);
20753 else { /* But otherwise, create the output again on the inverted input, and
20754 use whichever version is shorter */
20756 int inverted_bias, as_is_bias;
20758 /* We will apply our bias to whichever of the the results doesn't have
20768 inverted_bias = bias;
20771 /* Now invert each of the lists that contribute to the output,
20772 * excluding from the result things outside the possible range */
20774 /* For the unconditional inversion list, we have to add in all the
20775 * conditional code points, so that when inverted, they will be gone
20777 _invlist_union(only_utf8, invlist, &invlist);
20778 _invlist_union(not_utf8, invlist, &invlist);
20779 _invlist_union(only_utf8_locale, invlist, &invlist);
20780 _invlist_invert(invlist);
20781 _invlist_intersection(invlist, PL_InBitmap, &invlist);
20784 _invlist_invert(only_utf8);
20785 _invlist_intersection(only_utf8, PL_UpperLatin1, &only_utf8);
20787 else if (not_utf8) {
20789 /* If a code point matches iff the target string is not in UTF-8,
20790 * then complementing the result has it not match iff not in UTF-8,
20791 * which is the same thing as matching iff it is UTF-8. */
20792 only_utf8 = not_utf8;
20796 if (only_utf8_locale) {
20797 _invlist_invert(only_utf8_locale);
20798 _invlist_intersection(only_utf8_locale,
20800 &only_utf8_locale);
20803 inverted_display = put_charclass_bitmap_innards_common(
20808 only_utf8_locale, invert);
20810 /* Use the shortest representation, taking into account our bias
20811 * against showing it inverted */
20812 if ( inverted_display
20813 && ( ! as_is_display
20814 || ( SvCUR(inverted_display) + inverted_bias
20815 < SvCUR(as_is_display) + as_is_bias)))
20817 sv_catsv(sv, inverted_display);
20819 else if (as_is_display) {
20820 sv_catsv(sv, as_is_display);
20823 SvREFCNT_dec(as_is_display);
20824 SvREFCNT_dec(inverted_display);
20827 SvREFCNT_dec_NN(invlist);
20828 SvREFCNT_dec(only_utf8);
20829 SvREFCNT_dec(not_utf8);
20830 SvREFCNT_dec(posixes);
20831 SvREFCNT_dec(only_utf8_locale);
20833 return SvCUR(sv) > orig_sv_cur;
20836 #define CLEAR_OPTSTART \
20837 if (optstart) STMT_START { \
20838 DEBUG_OPTIMISE_r(Perl_re_printf( aTHX_ \
20839 " (%" IVdf " nodes)\n", (IV)(node - optstart))); \
20843 #define DUMPUNTIL(b,e) \
20845 node=dumpuntil(r,start,(b),(e),last,sv,indent+1,depth+1);
20847 STATIC const regnode *
20848 S_dumpuntil(pTHX_ const regexp *r, const regnode *start, const regnode *node,
20849 const regnode *last, const regnode *plast,
20850 SV* sv, I32 indent, U32 depth)
20852 U8 op = PSEUDO; /* Arbitrary non-END op. */
20853 const regnode *next;
20854 const regnode *optstart= NULL;
20856 RXi_GET_DECL(r,ri);
20857 GET_RE_DEBUG_FLAGS_DECL;
20859 PERL_ARGS_ASSERT_DUMPUNTIL;
20861 #ifdef DEBUG_DUMPUNTIL
20862 Perl_re_printf( aTHX_ "--- %d : %d - %d - %d\n",indent,node-start,
20863 last ? last-start : 0,plast ? plast-start : 0);
20866 if (plast && plast < last)
20869 while (PL_regkind[op] != END && (!last || node < last)) {
20871 /* While that wasn't END last time... */
20874 if (op == CLOSE || op == SRCLOSE || op == WHILEM)
20876 next = regnext((regnode *)node);
20879 if (OP(node) == OPTIMIZED) {
20880 if (!optstart && RE_DEBUG_FLAG(RE_DEBUG_COMPILE_OPTIMISE))
20887 regprop(r, sv, node, NULL, NULL);
20888 Perl_re_printf( aTHX_ "%4" IVdf ":%*s%s", (IV)(node - start),
20889 (int)(2*indent + 1), "", SvPVX_const(sv));
20891 if (OP(node) != OPTIMIZED) {
20892 if (next == NULL) /* Next ptr. */
20893 Perl_re_printf( aTHX_ " (0)");
20894 else if (PL_regkind[(U8)op] == BRANCH
20895 && PL_regkind[OP(next)] != BRANCH )
20896 Perl_re_printf( aTHX_ " (FAIL)");
20898 Perl_re_printf( aTHX_ " (%" IVdf ")", (IV)(next - start));
20899 Perl_re_printf( aTHX_ "\n");
20903 if (PL_regkind[(U8)op] == BRANCHJ) {
20906 const regnode *nnode = (OP(next) == LONGJMP
20907 ? regnext((regnode *)next)
20909 if (last && nnode > last)
20911 DUMPUNTIL(NEXTOPER(NEXTOPER(node)), nnode);
20914 else if (PL_regkind[(U8)op] == BRANCH) {
20916 DUMPUNTIL(NEXTOPER(node), next);
20918 else if ( PL_regkind[(U8)op] == TRIE ) {
20919 const regnode *this_trie = node;
20920 const char op = OP(node);
20921 const U32 n = ARG(node);
20922 const reg_ac_data * const ac = op>=AHOCORASICK ?
20923 (reg_ac_data *)ri->data->data[n] :
20925 const reg_trie_data * const trie =
20926 (reg_trie_data*)ri->data->data[op<AHOCORASICK ? n : ac->trie];
20928 AV *const trie_words
20929 = MUTABLE_AV(ri->data->data[n + TRIE_WORDS_OFFSET]);
20931 const regnode *nextbranch= NULL;
20934 for (word_idx= 0; word_idx < (I32)trie->wordcount; word_idx++) {
20935 SV ** const elem_ptr = av_fetch(trie_words,word_idx,0);
20937 Perl_re_indentf( aTHX_ "%s ",
20940 ? pv_pretty(sv, SvPV_nolen_const(*elem_ptr),
20941 SvCUR(*elem_ptr), PL_dump_re_max_len,
20942 PL_colors[0], PL_colors[1],
20944 ? PERL_PV_ESCAPE_UNI
20946 | PERL_PV_PRETTY_ELLIPSES
20947 | PERL_PV_PRETTY_LTGT
20952 U16 dist= trie->jump[word_idx+1];
20953 Perl_re_printf( aTHX_ "(%" UVuf ")\n",
20954 (UV)((dist ? this_trie + dist : next) - start));
20957 nextbranch= this_trie + trie->jump[0];
20958 DUMPUNTIL(this_trie + dist, nextbranch);
20960 if (nextbranch && PL_regkind[OP(nextbranch)]==BRANCH)
20961 nextbranch= regnext((regnode *)nextbranch);
20963 Perl_re_printf( aTHX_ "\n");
20966 if (last && next > last)
20971 else if ( op == CURLY ) { /* "next" might be very big: optimizer */
20972 DUMPUNTIL(NEXTOPER(node) + EXTRA_STEP_2ARGS,
20973 NEXTOPER(node) + EXTRA_STEP_2ARGS + 1);
20975 else if (PL_regkind[(U8)op] == CURLY && op != CURLYX) {
20977 DUMPUNTIL(NEXTOPER(node) + EXTRA_STEP_2ARGS, next);
20979 else if ( op == PLUS || op == STAR) {
20980 DUMPUNTIL(NEXTOPER(node), NEXTOPER(node) + 1);
20982 else if (PL_regkind[(U8)op] == ANYOF) {
20983 /* arglen 1 + class block */
20984 node += 1 + ((ANYOF_FLAGS(node) & ANYOF_MATCHES_POSIXL)
20985 ? ANYOF_POSIXL_SKIP
20987 node = NEXTOPER(node);
20989 else if (PL_regkind[(U8)op] == EXACT) {
20990 /* Literal string, where present. */
20991 node += NODE_SZ_STR(node) - 1;
20992 node = NEXTOPER(node);
20995 node = NEXTOPER(node);
20996 node += regarglen[(U8)op];
20998 if (op == CURLYX || op == OPEN || op == SROPEN)
21002 #ifdef DEBUG_DUMPUNTIL
21003 Perl_re_printf( aTHX_ "--- %d\n", (int)indent);
21008 #endif /* DEBUGGING */
21011 * ex: set ts=8 sts=4 sw=4 et: