5 * 'A fair jaw-cracker dwarf-language must be.' --Samwise Gamgee
7 * [p.285 of _The Lord of the Rings_, II/iii: "The Ring Goes South"]
10 /* This file contains functions for compiling a regular expression. See
11 * also regexec.c which funnily enough, contains functions for executing
12 * a regular expression.
14 * This file is also copied at build time to ext/re/re_comp.c, where
15 * it's built with -DPERL_EXT_RE_BUILD -DPERL_EXT_RE_DEBUG -DPERL_EXT.
16 * This causes the main functions to be compiled under new names and with
17 * debugging support added, which makes "use re 'debug'" work.
20 /* NOTE: this is derived from Henry Spencer's regexp code, and should not
21 * confused with the original package (see point 3 below). Thanks, Henry!
24 /* Additional note: this code is very heavily munged from Henry's version
25 * in places. In some spots I've traded clarity for efficiency, so don't
26 * blame Henry for some of the lack of readability.
29 /* The names of the functions have been changed from regcomp and
30 * regexec to pregcomp and pregexec in order to avoid conflicts
31 * with the POSIX routines of the same names.
34 #ifdef PERL_EXT_RE_BUILD
39 * pregcomp and pregexec -- regsub and regerror are not used in perl
41 * Copyright (c) 1986 by University of Toronto.
42 * Written by Henry Spencer. Not derived from licensed software.
44 * Permission is granted to anyone to use this software for any
45 * purpose on any computer system, and to redistribute it freely,
46 * subject to the following restrictions:
48 * 1. The author is not responsible for the consequences of use of
49 * this software, no matter how awful, even if they arise
52 * 2. The origin of this software must not be misrepresented, either
53 * by explicit claim or by omission.
55 * 3. Altered versions must be plainly marked as such, and must not
56 * be misrepresented as being the original software.
59 **** Alterations to Henry's code are...
61 **** Copyright (C) 1991, 1992, 1993, 1994, 1995, 1996, 1997, 1998, 1999,
62 **** 2000, 2001, 2002, 2003, 2004, 2005, 2006, 2007, 2008
63 **** by Larry Wall and others
65 **** You may distribute under the terms of either the GNU General Public
66 **** License or the Artistic License, as specified in the README file.
69 * Beware that some of this code is subtly aware of the way operator
70 * precedence is structured in regular expressions. Serious changes in
71 * regular-expression syntax might require a total rethink.
74 #define PERL_IN_REGCOMP_C
77 #ifndef PERL_IN_XSUB_RE
82 #ifdef PERL_IN_XSUB_RE
84 EXTERN_C const struct regexp_engine my_reg_engine;
89 #include "dquote_inline.h"
90 #include "invlist_inline.h"
91 #include "unicode_constants.h"
93 #define HAS_NONLATIN1_FOLD_CLOSURE(i) \
94 _HAS_NONLATIN1_FOLD_CLOSURE_ONLY_FOR_USE_BY_REGCOMP_DOT_C_AND_REGEXEC_DOT_C(i)
95 #define HAS_NONLATIN1_SIMPLE_FOLD_CLOSURE(i) \
96 _HAS_NONLATIN1_SIMPLE_FOLD_CLOSURE_ONLY_FOR_USE_BY_REGCOMP_DOT_C_AND_REGEXEC_DOT_C(i)
97 #define IS_NON_FINAL_FOLD(c) _IS_NON_FINAL_FOLD_ONLY_FOR_USE_BY_REGCOMP_DOT_C(c)
98 #define IS_IN_SOME_FOLD_L1(c) _IS_IN_SOME_FOLD_ONLY_FOR_USE_BY_REGCOMP_DOT_C(c)
101 #define STATIC static
105 #define MIN(a,b) ((a) < (b) ? (a) : (b))
108 /* this is a chain of data about sub patterns we are processing that
109 need to be handled separately/specially in study_chunk. Its so
110 we can simulate recursion without losing state. */
112 typedef struct scan_frame {
113 regnode *last_regnode; /* last node to process in this frame */
114 regnode *next_regnode; /* next node to process when last is reached */
115 U32 prev_recursed_depth;
116 I32 stopparen; /* what stopparen do we use */
117 U32 is_top_frame; /* what flags do we use? */
119 struct scan_frame *this_prev_frame; /* this previous frame */
120 struct scan_frame *prev_frame; /* previous frame */
121 struct scan_frame *next_frame; /* next frame */
124 /* Certain characters are output as a sequence with the first being a
126 #define isBACKSLASHED_PUNCT(c) \
127 ((c) == '-' || (c) == ']' || (c) == '\\' || (c) == '^')
130 struct RExC_state_t {
131 U32 flags; /* RXf_* are we folding, multilining? */
132 U32 pm_flags; /* PMf_* stuff from the calling PMOP */
133 char *precomp; /* uncompiled string. */
134 REGEXP *rx_sv; /* The SV that is the regexp. */
135 regexp *rx; /* perl core regexp structure */
136 regexp_internal *rxi; /* internal data for regexp object
138 char *start; /* Start of input for compile */
139 char *end; /* End of input for compile */
140 char *parse; /* Input-scan pointer. */
141 SSize_t whilem_seen; /* number of WHILEM in this expr */
142 regnode *emit_start; /* Start of emitted-code area */
143 regnode *emit_bound; /* First regnode outside of the
145 regnode *emit; /* Code-emit pointer; if = &emit_dummy,
146 implies compiling, so don't emit */
147 regnode_ssc emit_dummy; /* placeholder for emit to point to;
148 large enough for the largest
149 non-EXACTish node, so can use it as
151 I32 naughty; /* How bad is this pattern? */
152 I32 sawback; /* Did we see \1, ...? */
154 SSize_t size; /* Code size. */
155 I32 npar; /* Capture buffer count, (OPEN) plus
156 one. ("par" 0 is the whole
158 I32 nestroot; /* root parens we are in - used by
162 regnode **open_parens; /* pointers to open parens */
163 regnode **close_parens; /* pointers to close parens */
164 regnode *opend; /* END node in program */
165 I32 utf8; /* whether the pattern is utf8 or not */
166 I32 orig_utf8; /* whether the pattern was originally in utf8 */
167 /* XXX use this for future optimisation of case
168 * where pattern must be upgraded to utf8. */
169 I32 uni_semantics; /* If a d charset modifier should use unicode
170 rules, even if the pattern is not in
172 HV *paren_names; /* Paren names */
174 regnode **recurse; /* Recurse regops */
175 I32 recurse_count; /* Number of recurse regops */
176 U8 *study_chunk_recursed; /* bitmap of which subs we have moved
178 U32 study_chunk_recursed_bytes; /* bytes in bitmap */
182 I32 override_recoding;
184 I32 recode_x_to_native;
186 I32 in_multi_char_class;
187 struct reg_code_block *code_blocks; /* positions of literal (?{})
189 int num_code_blocks; /* size of code_blocks[] */
190 int code_index; /* next code_blocks[] slot */
191 SSize_t maxlen; /* mininum possible number of chars in string to match */
192 scan_frame *frame_head;
193 scan_frame *frame_last;
196 #ifdef ADD_TO_REGEXEC
197 char *starttry; /* -Dr: where regtry was called. */
198 #define RExC_starttry (pRExC_state->starttry)
200 SV *runtime_code_qr; /* qr with the runtime code blocks */
202 const char *lastparse;
204 AV *paren_name_list; /* idx -> name */
205 U32 study_chunk_recursed_count;
208 #define RExC_lastparse (pRExC_state->lastparse)
209 #define RExC_lastnum (pRExC_state->lastnum)
210 #define RExC_paren_name_list (pRExC_state->paren_name_list)
211 #define RExC_study_chunk_recursed_count (pRExC_state->study_chunk_recursed_count)
212 #define RExC_mysv (pRExC_state->mysv1)
213 #define RExC_mysv1 (pRExC_state->mysv1)
214 #define RExC_mysv2 (pRExC_state->mysv2)
217 bool seen_unfolded_sharp_s;
220 #define RExC_flags (pRExC_state->flags)
221 #define RExC_pm_flags (pRExC_state->pm_flags)
222 #define RExC_precomp (pRExC_state->precomp)
223 #define RExC_rx_sv (pRExC_state->rx_sv)
224 #define RExC_rx (pRExC_state->rx)
225 #define RExC_rxi (pRExC_state->rxi)
226 #define RExC_start (pRExC_state->start)
227 #define RExC_end (pRExC_state->end)
228 #define RExC_parse (pRExC_state->parse)
229 #define RExC_whilem_seen (pRExC_state->whilem_seen)
231 /* Set during the sizing pass when there is a LATIN SMALL LETTER SHARP S in any
232 * EXACTF node, hence was parsed under /di rules. If later in the parse,
233 * something forces the pattern into using /ui rules, the sharp s should be
234 * folded into the sequence 'ss', which takes up more space than previously
235 * calculated. This means that the sizing pass needs to be restarted. (The
236 * node also becomes an EXACTFU_SS.) For all other characters, an EXACTF node
237 * that gets converted to /ui (and EXACTFU) occupies the same amount of space,
238 * so there is no need to resize [perl #125990]. */
239 #define RExC_seen_unfolded_sharp_s (pRExC_state->seen_unfolded_sharp_s)
241 #ifdef RE_TRACK_PATTERN_OFFSETS
242 #define RExC_offsets (pRExC_state->rxi->u.offsets) /* I am not like the
245 #define RExC_emit (pRExC_state->emit)
246 #define RExC_emit_dummy (pRExC_state->emit_dummy)
247 #define RExC_emit_start (pRExC_state->emit_start)
248 #define RExC_emit_bound (pRExC_state->emit_bound)
249 #define RExC_sawback (pRExC_state->sawback)
250 #define RExC_seen (pRExC_state->seen)
251 #define RExC_size (pRExC_state->size)
252 #define RExC_maxlen (pRExC_state->maxlen)
253 #define RExC_npar (pRExC_state->npar)
254 #define RExC_nestroot (pRExC_state->nestroot)
255 #define RExC_extralen (pRExC_state->extralen)
256 #define RExC_seen_zerolen (pRExC_state->seen_zerolen)
257 #define RExC_utf8 (pRExC_state->utf8)
258 #define RExC_uni_semantics (pRExC_state->uni_semantics)
259 #define RExC_orig_utf8 (pRExC_state->orig_utf8)
260 #define RExC_open_parens (pRExC_state->open_parens)
261 #define RExC_close_parens (pRExC_state->close_parens)
262 #define RExC_opend (pRExC_state->opend)
263 #define RExC_paren_names (pRExC_state->paren_names)
264 #define RExC_recurse (pRExC_state->recurse)
265 #define RExC_recurse_count (pRExC_state->recurse_count)
266 #define RExC_study_chunk_recursed (pRExC_state->study_chunk_recursed)
267 #define RExC_study_chunk_recursed_bytes \
268 (pRExC_state->study_chunk_recursed_bytes)
269 #define RExC_in_lookbehind (pRExC_state->in_lookbehind)
270 #define RExC_contains_locale (pRExC_state->contains_locale)
271 #define RExC_contains_i (pRExC_state->contains_i)
272 #define RExC_override_recoding (pRExC_state->override_recoding)
274 # define RExC_recode_x_to_native (pRExC_state->recode_x_to_native)
276 #define RExC_in_multi_char_class (pRExC_state->in_multi_char_class)
277 #define RExC_frame_head (pRExC_state->frame_head)
278 #define RExC_frame_last (pRExC_state->frame_last)
279 #define RExC_frame_count (pRExC_state->frame_count)
280 #define RExC_strict (pRExC_state->strict)
282 /* Heuristic check on the complexity of the pattern: if TOO_NAUGHTY, we set
283 * a flag to disable back-off on the fixed/floating substrings - if it's
284 * a high complexity pattern we assume the benefit of avoiding a full match
285 * is worth the cost of checking for the substrings even if they rarely help.
287 #define RExC_naughty (pRExC_state->naughty)
288 #define TOO_NAUGHTY (10)
289 #define MARK_NAUGHTY(add) \
290 if (RExC_naughty < TOO_NAUGHTY) \
291 RExC_naughty += (add)
292 #define MARK_NAUGHTY_EXP(exp, add) \
293 if (RExC_naughty < TOO_NAUGHTY) \
294 RExC_naughty += RExC_naughty / (exp) + (add)
296 #define ISMULT1(c) ((c) == '*' || (c) == '+' || (c) == '?')
297 #define ISMULT2(s) ((*s) == '*' || (*s) == '+' || (*s) == '?' || \
298 ((*s) == '{' && regcurly(s)))
301 * Flags to be passed up and down.
303 #define WORST 0 /* Worst case. */
304 #define HASWIDTH 0x01 /* Known to match non-null strings. */
306 /* Simple enough to be STAR/PLUS operand; in an EXACTish node must be a single
307 * character. (There needs to be a case: in the switch statement in regexec.c
308 * for any node marked SIMPLE.) Note that this is not the same thing as
311 #define SPSTART 0x04 /* Starts with * or + */
312 #define POSTPONED 0x08 /* (?1),(?&name), (??{...}) or similar */
313 #define TRYAGAIN 0x10 /* Weeded out a declaration. */
314 #define RESTART_PASS1 0x20 /* Need to restart sizing pass */
315 #define NEED_UTF8 0x40 /* In conjunction with RESTART_PASS1, need to
316 calcuate sizes as UTF-8 */
318 #define REG_NODE_NUM(x) ((x) ? (int)((x)-RExC_emit_start) : -1)
320 /* whether trie related optimizations are enabled */
321 #if PERL_ENABLE_EXTENDED_TRIE_OPTIMISATION
322 #define TRIE_STUDY_OPT
323 #define FULL_TRIE_STUDY
329 #define PBYTE(u8str,paren) ((U8*)(u8str))[(paren) >> 3]
330 #define PBITVAL(paren) (1 << ((paren) & 7))
331 #define PAREN_TEST(u8str,paren) ( PBYTE(u8str,paren) & PBITVAL(paren))
332 #define PAREN_SET(u8str,paren) PBYTE(u8str,paren) |= PBITVAL(paren)
333 #define PAREN_UNSET(u8str,paren) PBYTE(u8str,paren) &= (~PBITVAL(paren))
335 #define REQUIRE_UTF8(flagp) STMT_START { \
338 *flagp = RESTART_PASS1|NEED_UTF8; \
343 /* Change from /d into /u rules, and restart the parse if we've already seen
344 * something whose size would increase as a result, by setting *flagp and
345 * returning 'restart_retval'. RExC_uni_semantics is a flag that indicates
346 * we've change to /u during the parse. */
347 #define REQUIRE_UNI_RULES(flagp, restart_retval) \
349 if (DEPENDS_SEMANTICS) { \
351 set_regex_charset(&RExC_flags, REGEX_UNICODE_CHARSET); \
352 RExC_uni_semantics = 1; \
353 if (RExC_seen_unfolded_sharp_s) { \
354 *flagp |= RESTART_PASS1; \
355 return restart_retval; \
360 /* This converts the named class defined in regcomp.h to its equivalent class
361 * number defined in handy.h. */
362 #define namedclass_to_classnum(class) ((int) ((class) / 2))
363 #define classnum_to_namedclass(classnum) ((classnum) * 2)
365 #define _invlist_union_complement_2nd(a, b, output) \
366 _invlist_union_maybe_complement_2nd(a, b, TRUE, output)
367 #define _invlist_intersection_complement_2nd(a, b, output) \
368 _invlist_intersection_maybe_complement_2nd(a, b, TRUE, output)
370 /* About scan_data_t.
372 During optimisation we recurse through the regexp program performing
373 various inplace (keyhole style) optimisations. In addition study_chunk
374 and scan_commit populate this data structure with information about
375 what strings MUST appear in the pattern. We look for the longest
376 string that must appear at a fixed location, and we look for the
377 longest string that may appear at a floating location. So for instance
382 Both 'FOO' and 'A' are fixed strings. Both 'B' and 'BAR' are floating
383 strings (because they follow a .* construct). study_chunk will identify
384 both FOO and BAR as being the longest fixed and floating strings respectively.
386 The strings can be composites, for instance
390 will result in a composite fixed substring 'foo'.
392 For each string some basic information is maintained:
394 - offset or min_offset
395 This is the position the string must appear at, or not before.
396 It also implicitly (when combined with minlenp) tells us how many
397 characters must match before the string we are searching for.
398 Likewise when combined with minlenp and the length of the string it
399 tells us how many characters must appear after the string we have
403 Only used for floating strings. This is the rightmost point that
404 the string can appear at. If set to SSize_t_MAX it indicates that the
405 string can occur infinitely far to the right.
408 A pointer to the minimum number of characters of the pattern that the
409 string was found inside. This is important as in the case of positive
410 lookahead or positive lookbehind we can have multiple patterns
415 The minimum length of the pattern overall is 3, the minimum length
416 of the lookahead part is 3, but the minimum length of the part that
417 will actually match is 1. So 'FOO's minimum length is 3, but the
418 minimum length for the F is 1. This is important as the minimum length
419 is used to determine offsets in front of and behind the string being
420 looked for. Since strings can be composites this is the length of the
421 pattern at the time it was committed with a scan_commit. Note that
422 the length is calculated by study_chunk, so that the minimum lengths
423 are not known until the full pattern has been compiled, thus the
424 pointer to the value.
428 In the case of lookbehind the string being searched for can be
429 offset past the start point of the final matching string.
430 If this value was just blithely removed from the min_offset it would
431 invalidate some of the calculations for how many chars must match
432 before or after (as they are derived from min_offset and minlen and
433 the length of the string being searched for).
434 When the final pattern is compiled and the data is moved from the
435 scan_data_t structure into the regexp structure the information
436 about lookbehind is factored in, with the information that would
437 have been lost precalculated in the end_shift field for the
440 The fields pos_min and pos_delta are used to store the minimum offset
441 and the delta to the maximum offset at the current point in the pattern.
445 typedef struct scan_data_t {
446 /*I32 len_min; unused */
447 /*I32 len_delta; unused */
451 SSize_t last_end; /* min value, <0 unless valid. */
452 SSize_t last_start_min;
453 SSize_t last_start_max;
454 SV **longest; /* Either &l_fixed, or &l_float. */
455 SV *longest_fixed; /* longest fixed string found in pattern */
456 SSize_t offset_fixed; /* offset where it starts */
457 SSize_t *minlen_fixed; /* pointer to the minlen relevant to the string */
458 I32 lookbehind_fixed; /* is the position of the string modfied by LB */
459 SV *longest_float; /* longest floating string found in pattern */
460 SSize_t offset_float_min; /* earliest point in string it can appear */
461 SSize_t offset_float_max; /* latest point in string it can appear */
462 SSize_t *minlen_float; /* pointer to the minlen relevant to the string */
463 SSize_t lookbehind_float; /* is the pos of the string modified by LB */
466 SSize_t *last_closep;
467 regnode_ssc *start_class;
471 * Forward declarations for pregcomp()'s friends.
474 static const scan_data_t zero_scan_data =
475 { 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 ,0};
477 #define SF_BEFORE_EOL (SF_BEFORE_SEOL|SF_BEFORE_MEOL)
478 #define SF_BEFORE_SEOL 0x0001
479 #define SF_BEFORE_MEOL 0x0002
480 #define SF_FIX_BEFORE_EOL (SF_FIX_BEFORE_SEOL|SF_FIX_BEFORE_MEOL)
481 #define SF_FL_BEFORE_EOL (SF_FL_BEFORE_SEOL|SF_FL_BEFORE_MEOL)
483 #define SF_FIX_SHIFT_EOL (+2)
484 #define SF_FL_SHIFT_EOL (+4)
486 #define SF_FIX_BEFORE_SEOL (SF_BEFORE_SEOL << SF_FIX_SHIFT_EOL)
487 #define SF_FIX_BEFORE_MEOL (SF_BEFORE_MEOL << SF_FIX_SHIFT_EOL)
489 #define SF_FL_BEFORE_SEOL (SF_BEFORE_SEOL << SF_FL_SHIFT_EOL)
490 #define SF_FL_BEFORE_MEOL (SF_BEFORE_MEOL << SF_FL_SHIFT_EOL) /* 0x20 */
491 #define SF_IS_INF 0x0040
492 #define SF_HAS_PAR 0x0080
493 #define SF_IN_PAR 0x0100
494 #define SF_HAS_EVAL 0x0200
495 #define SCF_DO_SUBSTR 0x0400
496 #define SCF_DO_STCLASS_AND 0x0800
497 #define SCF_DO_STCLASS_OR 0x1000
498 #define SCF_DO_STCLASS (SCF_DO_STCLASS_AND|SCF_DO_STCLASS_OR)
499 #define SCF_WHILEM_VISITED_POS 0x2000
501 #define SCF_TRIE_RESTUDY 0x4000 /* Do restudy? */
502 #define SCF_SEEN_ACCEPT 0x8000
503 #define SCF_TRIE_DOING_RESTUDY 0x10000
504 #define SCF_IN_DEFINE 0x20000
509 #define UTF cBOOL(RExC_utf8)
511 /* The enums for all these are ordered so things work out correctly */
512 #define LOC (get_regex_charset(RExC_flags) == REGEX_LOCALE_CHARSET)
513 #define DEPENDS_SEMANTICS (get_regex_charset(RExC_flags) \
514 == REGEX_DEPENDS_CHARSET)
515 #define UNI_SEMANTICS (get_regex_charset(RExC_flags) == REGEX_UNICODE_CHARSET)
516 #define AT_LEAST_UNI_SEMANTICS (get_regex_charset(RExC_flags) \
517 >= REGEX_UNICODE_CHARSET)
518 #define ASCII_RESTRICTED (get_regex_charset(RExC_flags) \
519 == REGEX_ASCII_RESTRICTED_CHARSET)
520 #define AT_LEAST_ASCII_RESTRICTED (get_regex_charset(RExC_flags) \
521 >= REGEX_ASCII_RESTRICTED_CHARSET)
522 #define ASCII_FOLD_RESTRICTED (get_regex_charset(RExC_flags) \
523 == REGEX_ASCII_MORE_RESTRICTED_CHARSET)
525 #define FOLD cBOOL(RExC_flags & RXf_PMf_FOLD)
527 /* For programs that want to be strictly Unicode compatible by dying if any
528 * attempt is made to match a non-Unicode code point against a Unicode
530 #define ALWAYS_WARN_SUPER ckDEAD(packWARN(WARN_NON_UNICODE))
532 #define OOB_NAMEDCLASS -1
534 /* There is no code point that is out-of-bounds, so this is problematic. But
535 * its only current use is to initialize a variable that is always set before
537 #define OOB_UNICODE 0xDEADBEEF
539 #define CHR_SVLEN(sv) (UTF ? sv_len_utf8(sv) : SvCUR(sv))
540 #define CHR_DIST(a,b) (UTF ? utf8_distance(a,b) : a - b)
543 /* length of regex to show in messages that don't mark a position within */
544 #define RegexLengthToShowInErrorMessages 127
547 * If MARKER[12] are adjusted, be sure to adjust the constants at the top
548 * of t/op/regmesg.t, the tests in t/op/re_tests, and those in
549 * op/pragma/warn/regcomp.
551 #define MARKER1 "<-- HERE" /* marker as it appears in the description */
552 #define MARKER2 " <-- HERE " /* marker as it appears within the regex */
554 #define REPORT_LOCATION " in regex; marked by " MARKER1 \
555 " in m/%"UTF8f MARKER2 "%"UTF8f"/"
557 #define REPORT_LOCATION_ARGS(offset) \
558 UTF8fARG(UTF, offset, RExC_precomp), \
559 UTF8fARG(UTF, RExC_end - RExC_precomp - offset, RExC_precomp + offset)
561 /* Used to point after bad bytes for an error message, but avoid skipping
562 * past a nul byte. */
563 #define SKIP_IF_CHAR(s) (!*(s) ? 0 : UTF ? UTF8SKIP(s) : 1)
566 * Calls SAVEDESTRUCTOR_X if needed, then calls Perl_croak with the given
567 * arg. Show regex, up to a maximum length. If it's too long, chop and add
570 #define _FAIL(code) STMT_START { \
571 const char *ellipses = ""; \
572 IV len = RExC_end - RExC_precomp; \
575 SAVEFREESV(RExC_rx_sv); \
576 if (len > RegexLengthToShowInErrorMessages) { \
577 /* chop 10 shorter than the max, to ensure meaning of "..." */ \
578 len = RegexLengthToShowInErrorMessages - 10; \
584 #define FAIL(msg) _FAIL( \
585 Perl_croak(aTHX_ "%s in regex m/%"UTF8f"%s/", \
586 msg, UTF8fARG(UTF, len, RExC_precomp), ellipses))
588 #define FAIL2(msg,arg) _FAIL( \
589 Perl_croak(aTHX_ msg " in regex m/%"UTF8f"%s/", \
590 arg, UTF8fARG(UTF, len, RExC_precomp), ellipses))
593 * Simple_vFAIL -- like FAIL, but marks the current location in the scan
595 #define Simple_vFAIL(m) STMT_START { \
597 (RExC_parse > RExC_end ? RExC_end : RExC_parse) - RExC_precomp; \
598 Perl_croak(aTHX_ "%s" REPORT_LOCATION, \
599 m, REPORT_LOCATION_ARGS(offset)); \
603 * Calls SAVEDESTRUCTOR_X if needed, then Simple_vFAIL()
605 #define vFAIL(m) STMT_START { \
607 SAVEFREESV(RExC_rx_sv); \
612 * Like Simple_vFAIL(), but accepts two arguments.
614 #define Simple_vFAIL2(m,a1) STMT_START { \
615 const IV offset = RExC_parse - RExC_precomp; \
616 S_re_croak2(aTHX_ UTF, m, REPORT_LOCATION, a1, \
617 REPORT_LOCATION_ARGS(offset)); \
621 * Calls SAVEDESTRUCTOR_X if needed, then Simple_vFAIL2().
623 #define vFAIL2(m,a1) STMT_START { \
625 SAVEFREESV(RExC_rx_sv); \
626 Simple_vFAIL2(m, a1); \
631 * Like Simple_vFAIL(), but accepts three arguments.
633 #define Simple_vFAIL3(m, a1, a2) STMT_START { \
634 const IV offset = RExC_parse - RExC_precomp; \
635 S_re_croak2(aTHX_ UTF, m, REPORT_LOCATION, a1, a2, \
636 REPORT_LOCATION_ARGS(offset)); \
640 * Calls SAVEDESTRUCTOR_X if needed, then Simple_vFAIL3().
642 #define vFAIL3(m,a1,a2) STMT_START { \
644 SAVEFREESV(RExC_rx_sv); \
645 Simple_vFAIL3(m, a1, a2); \
649 * Like Simple_vFAIL(), but accepts four arguments.
651 #define Simple_vFAIL4(m, a1, a2, a3) STMT_START { \
652 const IV offset = RExC_parse - RExC_precomp; \
653 S_re_croak2(aTHX_ UTF, m, REPORT_LOCATION, a1, a2, a3, \
654 REPORT_LOCATION_ARGS(offset)); \
657 #define vFAIL4(m,a1,a2,a3) STMT_START { \
659 SAVEFREESV(RExC_rx_sv); \
660 Simple_vFAIL4(m, a1, a2, a3); \
663 /* A specialized version of vFAIL2 that works with UTF8f */
664 #define vFAIL2utf8f(m, a1) STMT_START { \
665 const IV offset = RExC_parse - RExC_precomp; \
667 SAVEFREESV(RExC_rx_sv); \
668 S_re_croak2(aTHX_ UTF, m, REPORT_LOCATION, a1, \
669 REPORT_LOCATION_ARGS(offset)); \
672 #define vFAIL3utf8f(m, a1, a2) STMT_START { \
673 const IV offset = RExC_parse - RExC_precomp; \
675 SAVEFREESV(RExC_rx_sv); \
676 S_re_croak2(aTHX_ UTF, m, REPORT_LOCATION, a1, a2, \
677 REPORT_LOCATION_ARGS(offset)); \
680 /* These have asserts in them because of [perl #122671] Many warnings in
681 * regcomp.c can occur twice. If they get output in pass1 and later in that
682 * pass, the pattern has to be converted to UTF-8 and the pass restarted, they
683 * would get output again. So they should be output in pass2, and these
684 * asserts make sure new warnings follow that paradigm. */
686 /* m is not necessarily a "literal string", in this macro */
687 #define reg_warn_non_literal_string(loc, m) STMT_START { \
688 const IV offset = loc - RExC_precomp; \
689 __ASSERT_(PASS2) Perl_warner(aTHX_ packWARN(WARN_REGEXP), "%s" REPORT_LOCATION, \
690 m, REPORT_LOCATION_ARGS(offset)); \
693 #define ckWARNreg(loc,m) STMT_START { \
694 const IV offset = loc - RExC_precomp; \
695 __ASSERT_(PASS2) Perl_ck_warner(aTHX_ packWARN(WARN_REGEXP), m REPORT_LOCATION, \
696 REPORT_LOCATION_ARGS(offset)); \
699 #define vWARN(loc, m) STMT_START { \
700 const IV offset = loc - RExC_precomp; \
701 __ASSERT_(PASS2) Perl_warner(aTHX_ packWARN(WARN_REGEXP), m REPORT_LOCATION, \
702 REPORT_LOCATION_ARGS(offset)); \
705 #define vWARN_dep(loc, m) STMT_START { \
706 const IV offset = loc - RExC_precomp; \
707 __ASSERT_(PASS2) Perl_warner(aTHX_ packWARN(WARN_DEPRECATED), m REPORT_LOCATION, \
708 REPORT_LOCATION_ARGS(offset)); \
711 #define ckWARNdep(loc,m) STMT_START { \
712 const IV offset = loc - RExC_precomp; \
713 __ASSERT_(PASS2) Perl_ck_warner_d(aTHX_ packWARN(WARN_DEPRECATED), \
715 REPORT_LOCATION_ARGS(offset)); \
718 #define ckWARNregdep(loc,m) STMT_START { \
719 const IV offset = loc - RExC_precomp; \
720 __ASSERT_(PASS2) Perl_ck_warner_d(aTHX_ packWARN2(WARN_DEPRECATED, WARN_REGEXP), \
722 REPORT_LOCATION_ARGS(offset)); \
725 #define ckWARN2reg_d(loc,m, a1) STMT_START { \
726 const IV offset = loc - RExC_precomp; \
727 __ASSERT_(PASS2) Perl_ck_warner_d(aTHX_ packWARN(WARN_REGEXP), \
729 a1, REPORT_LOCATION_ARGS(offset)); \
732 #define ckWARN2reg(loc, m, a1) STMT_START { \
733 const IV offset = loc - RExC_precomp; \
734 __ASSERT_(PASS2) Perl_ck_warner(aTHX_ packWARN(WARN_REGEXP), m REPORT_LOCATION, \
735 a1, REPORT_LOCATION_ARGS(offset)); \
738 #define vWARN3(loc, m, a1, a2) STMT_START { \
739 const IV offset = loc - RExC_precomp; \
740 __ASSERT_(PASS2) Perl_warner(aTHX_ packWARN(WARN_REGEXP), m REPORT_LOCATION, \
741 a1, a2, REPORT_LOCATION_ARGS(offset)); \
744 #define ckWARN3reg(loc, m, a1, a2) STMT_START { \
745 const IV offset = loc - RExC_precomp; \
746 __ASSERT_(PASS2) Perl_ck_warner(aTHX_ packWARN(WARN_REGEXP), m REPORT_LOCATION, \
747 a1, a2, REPORT_LOCATION_ARGS(offset)); \
750 #define vWARN4(loc, m, a1, a2, a3) STMT_START { \
751 const IV offset = loc - RExC_precomp; \
752 __ASSERT_(PASS2) Perl_warner(aTHX_ packWARN(WARN_REGEXP), m REPORT_LOCATION, \
753 a1, a2, a3, REPORT_LOCATION_ARGS(offset)); \
756 #define ckWARN4reg(loc, m, a1, a2, a3) STMT_START { \
757 const IV offset = loc - RExC_precomp; \
758 __ASSERT_(PASS2) Perl_ck_warner(aTHX_ packWARN(WARN_REGEXP), m REPORT_LOCATION, \
759 a1, a2, a3, REPORT_LOCATION_ARGS(offset)); \
762 #define vWARN5(loc, m, a1, a2, a3, a4) STMT_START { \
763 const IV offset = loc - RExC_precomp; \
764 __ASSERT_(PASS2) Perl_warner(aTHX_ packWARN(WARN_REGEXP), m REPORT_LOCATION, \
765 a1, a2, a3, a4, REPORT_LOCATION_ARGS(offset)); \
768 /* Macros for recording node offsets. 20001227 mjd@plover.com
769 * Nodes are numbered 1, 2, 3, 4. Node #n's position is recorded in
770 * element 2*n-1 of the array. Element #2n holds the byte length node #n.
771 * Element 0 holds the number n.
772 * Position is 1 indexed.
774 #ifndef RE_TRACK_PATTERN_OFFSETS
775 #define Set_Node_Offset_To_R(node,byte)
776 #define Set_Node_Offset(node,byte)
777 #define Set_Cur_Node_Offset
778 #define Set_Node_Length_To_R(node,len)
779 #define Set_Node_Length(node,len)
780 #define Set_Node_Cur_Length(node,start)
781 #define Node_Offset(n)
782 #define Node_Length(n)
783 #define Set_Node_Offset_Length(node,offset,len)
784 #define ProgLen(ri) ri->u.proglen
785 #define SetProgLen(ri,x) ri->u.proglen = x
787 #define ProgLen(ri) ri->u.offsets[0]
788 #define SetProgLen(ri,x) ri->u.offsets[0] = x
789 #define Set_Node_Offset_To_R(node,byte) STMT_START { \
791 MJD_OFFSET_DEBUG(("** (%d) offset of node %d is %d.\n", \
792 __LINE__, (int)(node), (int)(byte))); \
794 Perl_croak(aTHX_ "value of node is %d in Offset macro", \
797 RExC_offsets[2*(node)-1] = (byte); \
802 #define Set_Node_Offset(node,byte) \
803 Set_Node_Offset_To_R((node)-RExC_emit_start, (byte)-RExC_start)
804 #define Set_Cur_Node_Offset Set_Node_Offset(RExC_emit, RExC_parse)
806 #define Set_Node_Length_To_R(node,len) STMT_START { \
808 MJD_OFFSET_DEBUG(("** (%d) size of node %d is %d.\n", \
809 __LINE__, (int)(node), (int)(len))); \
811 Perl_croak(aTHX_ "value of node is %d in Length macro", \
814 RExC_offsets[2*(node)] = (len); \
819 #define Set_Node_Length(node,len) \
820 Set_Node_Length_To_R((node)-RExC_emit_start, len)
821 #define Set_Node_Cur_Length(node, start) \
822 Set_Node_Length(node, RExC_parse - start)
824 /* Get offsets and lengths */
825 #define Node_Offset(n) (RExC_offsets[2*((n)-RExC_emit_start)-1])
826 #define Node_Length(n) (RExC_offsets[2*((n)-RExC_emit_start)])
828 #define Set_Node_Offset_Length(node,offset,len) STMT_START { \
829 Set_Node_Offset_To_R((node)-RExC_emit_start, (offset)); \
830 Set_Node_Length_To_R((node)-RExC_emit_start, (len)); \
834 #if PERL_ENABLE_EXPERIMENTAL_REGEX_OPTIMISATIONS
835 #define EXPERIMENTAL_INPLACESCAN
836 #endif /*PERL_ENABLE_EXPERIMENTAL_REGEX_OPTIMISATIONS*/
838 #define DEBUG_RExC_seen() \
839 DEBUG_OPTIMISE_MORE_r({ \
840 PerlIO_printf(Perl_debug_log,"RExC_seen: "); \
842 if (RExC_seen & REG_ZERO_LEN_SEEN) \
843 PerlIO_printf(Perl_debug_log,"REG_ZERO_LEN_SEEN "); \
845 if (RExC_seen & REG_LOOKBEHIND_SEEN) \
846 PerlIO_printf(Perl_debug_log,"REG_LOOKBEHIND_SEEN "); \
848 if (RExC_seen & REG_GPOS_SEEN) \
849 PerlIO_printf(Perl_debug_log,"REG_GPOS_SEEN "); \
851 if (RExC_seen & REG_RECURSE_SEEN) \
852 PerlIO_printf(Perl_debug_log,"REG_RECURSE_SEEN "); \
854 if (RExC_seen & REG_TOP_LEVEL_BRANCHES_SEEN) \
855 PerlIO_printf(Perl_debug_log,"REG_TOP_LEVEL_BRANCHES_SEEN "); \
857 if (RExC_seen & REG_VERBARG_SEEN) \
858 PerlIO_printf(Perl_debug_log,"REG_VERBARG_SEEN "); \
860 if (RExC_seen & REG_CUTGROUP_SEEN) \
861 PerlIO_printf(Perl_debug_log,"REG_CUTGROUP_SEEN "); \
863 if (RExC_seen & REG_RUN_ON_COMMENT_SEEN) \
864 PerlIO_printf(Perl_debug_log,"REG_RUN_ON_COMMENT_SEEN "); \
866 if (RExC_seen & REG_UNFOLDED_MULTI_SEEN) \
867 PerlIO_printf(Perl_debug_log,"REG_UNFOLDED_MULTI_SEEN "); \
869 if (RExC_seen & REG_GOSTART_SEEN) \
870 PerlIO_printf(Perl_debug_log,"REG_GOSTART_SEEN "); \
872 if (RExC_seen & REG_UNBOUNDED_QUANTIFIER_SEEN) \
873 PerlIO_printf(Perl_debug_log,"REG_UNBOUNDED_QUANTIFIER_SEEN "); \
875 PerlIO_printf(Perl_debug_log,"\n"); \
878 #define DEBUG_SHOW_STUDY_FLAG(flags,flag) \
879 if ((flags) & flag) PerlIO_printf(Perl_debug_log, "%s ", #flag)
881 #define DEBUG_SHOW_STUDY_FLAGS(flags,open_str,close_str) \
883 PerlIO_printf(Perl_debug_log, "%s", open_str); \
884 DEBUG_SHOW_STUDY_FLAG(flags,SF_FL_BEFORE_SEOL); \
885 DEBUG_SHOW_STUDY_FLAG(flags,SF_FL_BEFORE_MEOL); \
886 DEBUG_SHOW_STUDY_FLAG(flags,SF_IS_INF); \
887 DEBUG_SHOW_STUDY_FLAG(flags,SF_HAS_PAR); \
888 DEBUG_SHOW_STUDY_FLAG(flags,SF_IN_PAR); \
889 DEBUG_SHOW_STUDY_FLAG(flags,SF_HAS_EVAL); \
890 DEBUG_SHOW_STUDY_FLAG(flags,SCF_DO_SUBSTR); \
891 DEBUG_SHOW_STUDY_FLAG(flags,SCF_DO_STCLASS_AND); \
892 DEBUG_SHOW_STUDY_FLAG(flags,SCF_DO_STCLASS_OR); \
893 DEBUG_SHOW_STUDY_FLAG(flags,SCF_DO_STCLASS); \
894 DEBUG_SHOW_STUDY_FLAG(flags,SCF_WHILEM_VISITED_POS); \
895 DEBUG_SHOW_STUDY_FLAG(flags,SCF_TRIE_RESTUDY); \
896 DEBUG_SHOW_STUDY_FLAG(flags,SCF_SEEN_ACCEPT); \
897 DEBUG_SHOW_STUDY_FLAG(flags,SCF_TRIE_DOING_RESTUDY); \
898 DEBUG_SHOW_STUDY_FLAG(flags,SCF_IN_DEFINE); \
899 PerlIO_printf(Perl_debug_log, "%s", close_str); \
903 #define DEBUG_STUDYDATA(str,data,depth) \
904 DEBUG_OPTIMISE_MORE_r(if(data){ \
905 PerlIO_printf(Perl_debug_log, \
906 "%*s" str "Pos:%"IVdf"/%"IVdf \
908 (int)(depth)*2, "", \
909 (IV)((data)->pos_min), \
910 (IV)((data)->pos_delta), \
911 (UV)((data)->flags) \
913 DEBUG_SHOW_STUDY_FLAGS((data)->flags," [ ","]"); \
914 PerlIO_printf(Perl_debug_log, \
915 " Whilem_c: %"IVdf" Lcp: %"IVdf" %s", \
916 (IV)((data)->whilem_c), \
917 (IV)((data)->last_closep ? *((data)->last_closep) : -1), \
918 is_inf ? "INF " : "" \
920 if ((data)->last_found) \
921 PerlIO_printf(Perl_debug_log, \
922 "Last:'%s' %"IVdf":%"IVdf"/%"IVdf" %sFixed:'%s' @ %"IVdf \
923 " %sFloat: '%s' @ %"IVdf"/%"IVdf"", \
924 SvPVX_const((data)->last_found), \
925 (IV)((data)->last_end), \
926 (IV)((data)->last_start_min), \
927 (IV)((data)->last_start_max), \
928 ((data)->longest && \
929 (data)->longest==&((data)->longest_fixed)) ? "*" : "", \
930 SvPVX_const((data)->longest_fixed), \
931 (IV)((data)->offset_fixed), \
932 ((data)->longest && \
933 (data)->longest==&((data)->longest_float)) ? "*" : "", \
934 SvPVX_const((data)->longest_float), \
935 (IV)((data)->offset_float_min), \
936 (IV)((data)->offset_float_max) \
938 PerlIO_printf(Perl_debug_log,"\n"); \
941 /* is c a control character for which we have a mnemonic? */
942 #define isMNEMONIC_CNTRL(c) _IS_MNEMONIC_CNTRL_ONLY_FOR_USE_BY_REGCOMP_DOT_C(c)
945 S_cntrl_to_mnemonic(const U8 c)
947 /* Returns the mnemonic string that represents character 'c', if one
948 * exists; NULL otherwise. The only ones that exist for the purposes of
949 * this routine are a few control characters */
952 case '\a': return "\\a";
953 case '\b': return "\\b";
954 case ESC_NATIVE: return "\\e";
955 case '\f': return "\\f";
956 case '\n': return "\\n";
957 case '\r': return "\\r";
958 case '\t': return "\\t";
964 /* Mark that we cannot extend a found fixed substring at this point.
965 Update the longest found anchored substring and the longest found
966 floating substrings if needed. */
969 S_scan_commit(pTHX_ const RExC_state_t *pRExC_state, scan_data_t *data,
970 SSize_t *minlenp, int is_inf)
972 const STRLEN l = CHR_SVLEN(data->last_found);
973 const STRLEN old_l = CHR_SVLEN(*data->longest);
974 GET_RE_DEBUG_FLAGS_DECL;
976 PERL_ARGS_ASSERT_SCAN_COMMIT;
978 if ((l >= old_l) && ((l > old_l) || (data->flags & SF_BEFORE_EOL))) {
979 SvSetMagicSV(*data->longest, data->last_found);
980 if (*data->longest == data->longest_fixed) {
981 data->offset_fixed = l ? data->last_start_min : data->pos_min;
982 if (data->flags & SF_BEFORE_EOL)
984 |= ((data->flags & SF_BEFORE_EOL) << SF_FIX_SHIFT_EOL);
986 data->flags &= ~SF_FIX_BEFORE_EOL;
987 data->minlen_fixed=minlenp;
988 data->lookbehind_fixed=0;
990 else { /* *data->longest == data->longest_float */
991 data->offset_float_min = l ? data->last_start_min : data->pos_min;
992 data->offset_float_max = (l
993 ? data->last_start_max
994 : (data->pos_delta > SSize_t_MAX - data->pos_min
996 : data->pos_min + data->pos_delta));
998 || (STRLEN)data->offset_float_max > (STRLEN)SSize_t_MAX)
999 data->offset_float_max = SSize_t_MAX;
1000 if (data->flags & SF_BEFORE_EOL)
1002 |= ((data->flags & SF_BEFORE_EOL) << SF_FL_SHIFT_EOL);
1004 data->flags &= ~SF_FL_BEFORE_EOL;
1005 data->minlen_float=minlenp;
1006 data->lookbehind_float=0;
1009 SvCUR_set(data->last_found, 0);
1011 SV * const sv = data->last_found;
1012 if (SvUTF8(sv) && SvMAGICAL(sv)) {
1013 MAGIC * const mg = mg_find(sv, PERL_MAGIC_utf8);
1018 data->last_end = -1;
1019 data->flags &= ~SF_BEFORE_EOL;
1020 DEBUG_STUDYDATA("commit: ",data,0);
1023 /* An SSC is just a regnode_charclass_posix with an extra field: the inversion
1024 * list that describes which code points it matches */
1027 S_ssc_anything(pTHX_ regnode_ssc *ssc)
1029 /* Set the SSC 'ssc' to match an empty string or any code point */
1031 PERL_ARGS_ASSERT_SSC_ANYTHING;
1033 assert(is_ANYOF_SYNTHETIC(ssc));
1035 ssc->invlist = sv_2mortal(_new_invlist(2)); /* mortalize so won't leak */
1036 _append_range_to_invlist(ssc->invlist, 0, UV_MAX);
1037 ANYOF_FLAGS(ssc) |= SSC_MATCHES_EMPTY_STRING; /* Plus matches empty */
1041 S_ssc_is_anything(const regnode_ssc *ssc)
1043 /* Returns TRUE if the SSC 'ssc' can match the empty string and any code
1044 * point; FALSE otherwise. Thus, this is used to see if using 'ssc' buys
1045 * us anything: if the function returns TRUE, 'ssc' hasn't been restricted
1046 * in any way, so there's no point in using it */
1051 PERL_ARGS_ASSERT_SSC_IS_ANYTHING;
1053 assert(is_ANYOF_SYNTHETIC(ssc));
1055 if (! (ANYOF_FLAGS(ssc) & SSC_MATCHES_EMPTY_STRING)) {
1059 /* See if the list consists solely of the range 0 - Infinity */
1060 invlist_iterinit(ssc->invlist);
1061 ret = invlist_iternext(ssc->invlist, &start, &end)
1065 invlist_iterfinish(ssc->invlist);
1071 /* If e.g., both \w and \W are set, matches everything */
1072 if (ANYOF_POSIXL_SSC_TEST_ANY_SET(ssc)) {
1074 for (i = 0; i < ANYOF_POSIXL_MAX; i += 2) {
1075 if (ANYOF_POSIXL_TEST(ssc, i) && ANYOF_POSIXL_TEST(ssc, i+1)) {
1085 S_ssc_init(pTHX_ const RExC_state_t *pRExC_state, regnode_ssc *ssc)
1087 /* Initializes the SSC 'ssc'. This includes setting it to match an empty
1088 * string, any code point, or any posix class under locale */
1090 PERL_ARGS_ASSERT_SSC_INIT;
1092 Zero(ssc, 1, regnode_ssc);
1093 set_ANYOF_SYNTHETIC(ssc);
1094 ARG_SET(ssc, ANYOF_ONLY_HAS_BITMAP);
1097 /* If any portion of the regex is to operate under locale rules that aren't
1098 * fully known at compile time, initialization includes it. The reason
1099 * this isn't done for all regexes is that the optimizer was written under
1100 * the assumption that locale was all-or-nothing. Given the complexity and
1101 * lack of documentation in the optimizer, and that there are inadequate
1102 * test cases for locale, many parts of it may not work properly, it is
1103 * safest to avoid locale unless necessary. */
1104 if (RExC_contains_locale) {
1105 ANYOF_POSIXL_SETALL(ssc);
1108 ANYOF_POSIXL_ZERO(ssc);
1113 S_ssc_is_cp_posixl_init(const RExC_state_t *pRExC_state,
1114 const regnode_ssc *ssc)
1116 /* Returns TRUE if the SSC 'ssc' is in its initial state with regard only
1117 * to the list of code points matched, and locale posix classes; hence does
1118 * not check its flags) */
1123 PERL_ARGS_ASSERT_SSC_IS_CP_POSIXL_INIT;
1125 assert(is_ANYOF_SYNTHETIC(ssc));
1127 invlist_iterinit(ssc->invlist);
1128 ret = invlist_iternext(ssc->invlist, &start, &end)
1132 invlist_iterfinish(ssc->invlist);
1138 if (RExC_contains_locale && ! ANYOF_POSIXL_SSC_TEST_ALL_SET(ssc)) {
1146 S_get_ANYOF_cp_list_for_ssc(pTHX_ const RExC_state_t *pRExC_state,
1147 const regnode_charclass* const node)
1149 /* Returns a mortal inversion list defining which code points are matched
1150 * by 'node', which is of type ANYOF. Handles complementing the result if
1151 * appropriate. If some code points aren't knowable at this time, the
1152 * returned list must, and will, contain every code point that is a
1155 SV* invlist = sv_2mortal(_new_invlist(0));
1156 SV* only_utf8_locale_invlist = NULL;
1158 const U32 n = ARG(node);
1159 bool new_node_has_latin1 = FALSE;
1161 PERL_ARGS_ASSERT_GET_ANYOF_CP_LIST_FOR_SSC;
1163 /* Look at the data structure created by S_set_ANYOF_arg() */
1164 if (n != ANYOF_ONLY_HAS_BITMAP) {
1165 SV * const rv = MUTABLE_SV(RExC_rxi->data->data[n]);
1166 AV * const av = MUTABLE_AV(SvRV(rv));
1167 SV **const ary = AvARRAY(av);
1168 assert(RExC_rxi->data->what[n] == 's');
1170 if (ary[1] && ary[1] != &PL_sv_undef) { /* Has compile-time swash */
1171 invlist = sv_2mortal(invlist_clone(_get_swash_invlist(ary[1])));
1173 else if (ary[0] && ary[0] != &PL_sv_undef) {
1175 /* Here, no compile-time swash, and there are things that won't be
1176 * known until runtime -- we have to assume it could be anything */
1177 return _add_range_to_invlist(invlist, 0, UV_MAX);
1179 else if (ary[3] && ary[3] != &PL_sv_undef) {
1181 /* Here no compile-time swash, and no run-time only data. Use the
1182 * node's inversion list */
1183 invlist = sv_2mortal(invlist_clone(ary[3]));
1186 /* Get the code points valid only under UTF-8 locales */
1187 if ((ANYOF_FLAGS(node) & ANYOF_LOC_FOLD)
1188 && ary[2] && ary[2] != &PL_sv_undef)
1190 only_utf8_locale_invlist = ary[2];
1194 /* An ANYOF node contains a bitmap for the first NUM_ANYOF_CODE_POINTS
1195 * code points, and an inversion list for the others, but if there are code
1196 * points that should match only conditionally on the target string being
1197 * UTF-8, those are placed in the inversion list, and not the bitmap.
1198 * Since there are circumstances under which they could match, they are
1199 * included in the SSC. But if the ANYOF node is to be inverted, we have
1200 * to exclude them here, so that when we invert below, the end result
1201 * actually does include them. (Think about "\xe0" =~ /[^\xc0]/di;). We
1202 * have to do this here before we add the unconditionally matched code
1204 if (ANYOF_FLAGS(node) & ANYOF_INVERT) {
1205 _invlist_intersection_complement_2nd(invlist,
1210 /* Add in the points from the bit map */
1211 for (i = 0; i < NUM_ANYOF_CODE_POINTS; i++) {
1212 if (ANYOF_BITMAP_TEST(node, i)) {
1213 invlist = add_cp_to_invlist(invlist, i);
1214 new_node_has_latin1 = TRUE;
1218 /* If this can match all upper Latin1 code points, have to add them
1220 if (OP(node) == ANYOFD
1221 && (ANYOF_FLAGS(node) & ANYOF_SHARED_d_MATCHES_ALL_NON_UTF8_NON_ASCII_non_d_WARN_SUPER))
1223 _invlist_union(invlist, PL_UpperLatin1, &invlist);
1226 /* Similarly for these */
1227 if (ANYOF_FLAGS(node) & ANYOF_MATCHES_ALL_ABOVE_BITMAP) {
1228 _invlist_union_complement_2nd(invlist, PL_InBitmap, &invlist);
1231 if (ANYOF_FLAGS(node) & ANYOF_INVERT) {
1232 _invlist_invert(invlist);
1234 else if (new_node_has_latin1 && ANYOF_FLAGS(node) & ANYOF_LOC_FOLD) {
1236 /* Under /li, any 0-255 could fold to any other 0-255, depending on the
1237 * locale. We can skip this if there are no 0-255 at all. */
1238 _invlist_union(invlist, PL_Latin1, &invlist);
1241 /* Similarly add the UTF-8 locale possible matches. These have to be
1242 * deferred until after the non-UTF-8 locale ones are taken care of just
1243 * above, or it leads to wrong results under ANYOF_INVERT */
1244 if (only_utf8_locale_invlist) {
1245 _invlist_union_maybe_complement_2nd(invlist,
1246 only_utf8_locale_invlist,
1247 ANYOF_FLAGS(node) & ANYOF_INVERT,
1254 /* These two functions currently do the exact same thing */
1255 #define ssc_init_zero ssc_init
1257 #define ssc_add_cp(ssc, cp) ssc_add_range((ssc), (cp), (cp))
1258 #define ssc_match_all_cp(ssc) ssc_add_range(ssc, 0, UV_MAX)
1260 /* 'AND' a given class with another one. Can create false positives. 'ssc'
1261 * should not be inverted. 'and_with->flags & ANYOF_MATCHES_POSIXL' should be
1262 * 0 if 'and_with' is a regnode_charclass instead of a regnode_ssc. */
1265 S_ssc_and(pTHX_ const RExC_state_t *pRExC_state, regnode_ssc *ssc,
1266 const regnode_charclass *and_with)
1268 /* Accumulate into SSC 'ssc' its 'AND' with 'and_with', which is either
1269 * another SSC or a regular ANYOF class. Can create false positives. */
1274 PERL_ARGS_ASSERT_SSC_AND;
1276 assert(is_ANYOF_SYNTHETIC(ssc));
1278 /* 'and_with' is used as-is if it too is an SSC; otherwise have to extract
1279 * the code point inversion list and just the relevant flags */
1280 if (is_ANYOF_SYNTHETIC(and_with)) {
1281 anded_cp_list = ((regnode_ssc *)and_with)->invlist;
1282 anded_flags = ANYOF_FLAGS(and_with);
1284 /* XXX This is a kludge around what appears to be deficiencies in the
1285 * optimizer. If we make S_ssc_anything() add in the WARN_SUPER flag,
1286 * there are paths through the optimizer where it doesn't get weeded
1287 * out when it should. And if we don't make some extra provision for
1288 * it like the code just below, it doesn't get added when it should.
1289 * This solution is to add it only when AND'ing, which is here, and
1290 * only when what is being AND'ed is the pristine, original node
1291 * matching anything. Thus it is like adding it to ssc_anything() but
1292 * only when the result is to be AND'ed. Probably the same solution
1293 * could be adopted for the same problem we have with /l matching,
1294 * which is solved differently in S_ssc_init(), and that would lead to
1295 * fewer false positives than that solution has. But if this solution
1296 * creates bugs, the consequences are only that a warning isn't raised
1297 * that should be; while the consequences for having /l bugs is
1298 * incorrect matches */
1299 if (ssc_is_anything((regnode_ssc *)and_with)) {
1300 anded_flags |= ANYOF_SHARED_d_MATCHES_ALL_NON_UTF8_NON_ASCII_non_d_WARN_SUPER;
1304 anded_cp_list = get_ANYOF_cp_list_for_ssc(pRExC_state, and_with);
1305 if (OP(and_with) == ANYOFD) {
1306 anded_flags = ANYOF_FLAGS(and_with) & ANYOF_COMMON_FLAGS;
1309 anded_flags = ANYOF_FLAGS(and_with)
1310 &( ANYOF_COMMON_FLAGS
1311 |ANYOF_SHARED_d_MATCHES_ALL_NON_UTF8_NON_ASCII_non_d_WARN_SUPER);
1315 ANYOF_FLAGS(ssc) &= anded_flags;
1317 /* Below, C1 is the list of code points in 'ssc'; P1, its posix classes.
1318 * C2 is the list of code points in 'and-with'; P2, its posix classes.
1319 * 'and_with' may be inverted. When not inverted, we have the situation of
1321 * (C1 | P1) & (C2 | P2)
1322 * = (C1 & (C2 | P2)) | (P1 & (C2 | P2))
1323 * = ((C1 & C2) | (C1 & P2)) | ((P1 & C2) | (P1 & P2))
1324 * <= ((C1 & C2) | P2)) | ( P1 | (P1 & P2))
1325 * <= ((C1 & C2) | P1 | P2)
1326 * Alternatively, the last few steps could be:
1327 * = ((C1 & C2) | (C1 & P2)) | ((P1 & C2) | (P1 & P2))
1328 * <= ((C1 & C2) | C1 ) | ( C2 | (P1 & P2))
1329 * <= (C1 | C2 | (P1 & P2))
1330 * We favor the second approach if either P1 or P2 is non-empty. This is
1331 * because these components are a barrier to doing optimizations, as what
1332 * they match cannot be known until the moment of matching as they are
1333 * dependent on the current locale, 'AND"ing them likely will reduce or
1335 * But we can do better if we know that C1,P1 are in their initial state (a
1336 * frequent occurrence), each matching everything:
1337 * (<everything>) & (C2 | P2) = C2 | P2
1338 * Similarly, if C2,P2 are in their initial state (again a frequent
1339 * occurrence), the result is a no-op
1340 * (C1 | P1) & (<everything>) = C1 | P1
1343 * (C1 | P1) & ~(C2 | P2) = (C1 | P1) & (~C2 & ~P2)
1344 * = (C1 & (~C2 & ~P2)) | (P1 & (~C2 & ~P2))
1345 * <= (C1 & ~C2) | (P1 & ~P2)
1348 if ((ANYOF_FLAGS(and_with) & ANYOF_INVERT)
1349 && ! is_ANYOF_SYNTHETIC(and_with))
1353 ssc_intersection(ssc,
1355 FALSE /* Has already been inverted */
1358 /* If either P1 or P2 is empty, the intersection will be also; can skip
1360 if (! (ANYOF_FLAGS(and_with) & ANYOF_MATCHES_POSIXL)) {
1361 ANYOF_POSIXL_ZERO(ssc);
1363 else if (ANYOF_POSIXL_SSC_TEST_ANY_SET(ssc)) {
1365 /* Note that the Posix class component P from 'and_with' actually
1367 * P = Pa | Pb | ... | Pn
1368 * where each component is one posix class, such as in [\w\s].
1370 * ~P = ~(Pa | Pb | ... | Pn)
1371 * = ~Pa & ~Pb & ... & ~Pn
1372 * <= ~Pa | ~Pb | ... | ~Pn
1373 * The last is something we can easily calculate, but unfortunately
1374 * is likely to have many false positives. We could do better
1375 * in some (but certainly not all) instances if two classes in
1376 * P have known relationships. For example
1377 * :lower: <= :alpha: <= :alnum: <= \w <= :graph: <= :print:
1379 * :lower: & :print: = :lower:
1380 * And similarly for classes that must be disjoint. For example,
1381 * since \s and \w can have no elements in common based on rules in
1382 * the POSIX standard,
1383 * \w & ^\S = nothing
1384 * Unfortunately, some vendor locales do not meet the Posix
1385 * standard, in particular almost everything by Microsoft.
1386 * The loop below just changes e.g., \w into \W and vice versa */
1388 regnode_charclass_posixl temp;
1389 int add = 1; /* To calculate the index of the complement */
1391 ANYOF_POSIXL_ZERO(&temp);
1392 for (i = 0; i < ANYOF_MAX; i++) {
1394 || ! ANYOF_POSIXL_TEST((regnode_charclass_posixl*) and_with, i)
1395 || ! ANYOF_POSIXL_TEST((regnode_charclass_posixl*) and_with, i + 1));
1397 if (ANYOF_POSIXL_TEST((regnode_charclass_posixl*) and_with, i)) {
1398 ANYOF_POSIXL_SET(&temp, i + add);
1400 add = 0 - add; /* 1 goes to -1; -1 goes to 1 */
1402 ANYOF_POSIXL_AND(&temp, ssc);
1404 } /* else ssc already has no posixes */
1405 } /* else: Not inverted. This routine is a no-op if 'and_with' is an SSC
1406 in its initial state */
1407 else if (! is_ANYOF_SYNTHETIC(and_with)
1408 || ! ssc_is_cp_posixl_init(pRExC_state, (regnode_ssc *)and_with))
1410 /* But if 'ssc' is in its initial state, the result is just 'and_with';
1411 * copy it over 'ssc' */
1412 if (ssc_is_cp_posixl_init(pRExC_state, ssc)) {
1413 if (is_ANYOF_SYNTHETIC(and_with)) {
1414 StructCopy(and_with, ssc, regnode_ssc);
1417 ssc->invlist = anded_cp_list;
1418 ANYOF_POSIXL_ZERO(ssc);
1419 if (ANYOF_FLAGS(and_with) & ANYOF_MATCHES_POSIXL) {
1420 ANYOF_POSIXL_OR((regnode_charclass_posixl*) and_with, ssc);
1424 else if (ANYOF_POSIXL_SSC_TEST_ANY_SET(ssc)
1425 || (ANYOF_FLAGS(and_with) & ANYOF_MATCHES_POSIXL))
1427 /* One or the other of P1, P2 is non-empty. */
1428 if (ANYOF_FLAGS(and_with) & ANYOF_MATCHES_POSIXL) {
1429 ANYOF_POSIXL_AND((regnode_charclass_posixl*) and_with, ssc);
1431 ssc_union(ssc, anded_cp_list, FALSE);
1433 else { /* P1 = P2 = empty */
1434 ssc_intersection(ssc, anded_cp_list, FALSE);
1440 S_ssc_or(pTHX_ const RExC_state_t *pRExC_state, regnode_ssc *ssc,
1441 const regnode_charclass *or_with)
1443 /* Accumulate into SSC 'ssc' its 'OR' with 'or_with', which is either
1444 * another SSC or a regular ANYOF class. Can create false positives if
1445 * 'or_with' is to be inverted. */
1450 PERL_ARGS_ASSERT_SSC_OR;
1452 assert(is_ANYOF_SYNTHETIC(ssc));
1454 /* 'or_with' is used as-is if it too is an SSC; otherwise have to extract
1455 * the code point inversion list and just the relevant flags */
1456 if (is_ANYOF_SYNTHETIC(or_with)) {
1457 ored_cp_list = ((regnode_ssc*) or_with)->invlist;
1458 ored_flags = ANYOF_FLAGS(or_with);
1461 ored_cp_list = get_ANYOF_cp_list_for_ssc(pRExC_state, or_with);
1462 ored_flags = ANYOF_FLAGS(or_with) & ANYOF_COMMON_FLAGS;
1463 if (OP(or_with) != ANYOFD) {
1465 |= ANYOF_FLAGS(or_with)
1466 & ANYOF_SHARED_d_MATCHES_ALL_NON_UTF8_NON_ASCII_non_d_WARN_SUPER;
1470 ANYOF_FLAGS(ssc) |= ored_flags;
1472 /* Below, C1 is the list of code points in 'ssc'; P1, its posix classes.
1473 * C2 is the list of code points in 'or-with'; P2, its posix classes.
1474 * 'or_with' may be inverted. When not inverted, we have the simple
1475 * situation of computing:
1476 * (C1 | P1) | (C2 | P2) = (C1 | C2) | (P1 | P2)
1477 * If P1|P2 yields a situation with both a class and its complement are
1478 * set, like having both \w and \W, this matches all code points, and we
1479 * can delete these from the P component of the ssc going forward. XXX We
1480 * might be able to delete all the P components, but I (khw) am not certain
1481 * about this, and it is better to be safe.
1484 * (C1 | P1) | ~(C2 | P2) = (C1 | P1) | (~C2 & ~P2)
1485 * <= (C1 | P1) | ~C2
1486 * <= (C1 | ~C2) | P1
1487 * (which results in actually simpler code than the non-inverted case)
1490 if ((ANYOF_FLAGS(or_with) & ANYOF_INVERT)
1491 && ! is_ANYOF_SYNTHETIC(or_with))
1493 /* We ignore P2, leaving P1 going forward */
1494 } /* else Not inverted */
1495 else if (ANYOF_FLAGS(or_with) & ANYOF_MATCHES_POSIXL) {
1496 ANYOF_POSIXL_OR((regnode_charclass_posixl*)or_with, ssc);
1497 if (ANYOF_POSIXL_SSC_TEST_ANY_SET(ssc)) {
1499 for (i = 0; i < ANYOF_MAX; i += 2) {
1500 if (ANYOF_POSIXL_TEST(ssc, i) && ANYOF_POSIXL_TEST(ssc, i + 1))
1502 ssc_match_all_cp(ssc);
1503 ANYOF_POSIXL_CLEAR(ssc, i);
1504 ANYOF_POSIXL_CLEAR(ssc, i+1);
1512 FALSE /* Already has been inverted */
1516 PERL_STATIC_INLINE void
1517 S_ssc_union(pTHX_ regnode_ssc *ssc, SV* const invlist, const bool invert2nd)
1519 PERL_ARGS_ASSERT_SSC_UNION;
1521 assert(is_ANYOF_SYNTHETIC(ssc));
1523 _invlist_union_maybe_complement_2nd(ssc->invlist,
1529 PERL_STATIC_INLINE void
1530 S_ssc_intersection(pTHX_ regnode_ssc *ssc,
1532 const bool invert2nd)
1534 PERL_ARGS_ASSERT_SSC_INTERSECTION;
1536 assert(is_ANYOF_SYNTHETIC(ssc));
1538 _invlist_intersection_maybe_complement_2nd(ssc->invlist,
1544 PERL_STATIC_INLINE void
1545 S_ssc_add_range(pTHX_ regnode_ssc *ssc, const UV start, const UV end)
1547 PERL_ARGS_ASSERT_SSC_ADD_RANGE;
1549 assert(is_ANYOF_SYNTHETIC(ssc));
1551 ssc->invlist = _add_range_to_invlist(ssc->invlist, start, end);
1554 PERL_STATIC_INLINE void
1555 S_ssc_cp_and(pTHX_ regnode_ssc *ssc, const UV cp)
1557 /* AND just the single code point 'cp' into the SSC 'ssc' */
1559 SV* cp_list = _new_invlist(2);
1561 PERL_ARGS_ASSERT_SSC_CP_AND;
1563 assert(is_ANYOF_SYNTHETIC(ssc));
1565 cp_list = add_cp_to_invlist(cp_list, cp);
1566 ssc_intersection(ssc, cp_list,
1567 FALSE /* Not inverted */
1569 SvREFCNT_dec_NN(cp_list);
1572 PERL_STATIC_INLINE void
1573 S_ssc_clear_locale(regnode_ssc *ssc)
1575 /* Set the SSC 'ssc' to not match any locale things */
1576 PERL_ARGS_ASSERT_SSC_CLEAR_LOCALE;
1578 assert(is_ANYOF_SYNTHETIC(ssc));
1580 ANYOF_POSIXL_ZERO(ssc);
1581 ANYOF_FLAGS(ssc) &= ~ANYOF_LOCALE_FLAGS;
1584 #define NON_OTHER_COUNT NON_OTHER_COUNT_FOR_USE_ONLY_BY_REGCOMP_DOT_C
1587 S_is_ssc_worth_it(const RExC_state_t * pRExC_state, const regnode_ssc * ssc)
1589 /* The synthetic start class is used to hopefully quickly winnow down
1590 * places where a pattern could start a match in the target string. If it
1591 * doesn't really narrow things down that much, there isn't much point to
1592 * having the overhead of using it. This function uses some very crude
1593 * heuristics to decide if to use the ssc or not.
1595 * It returns TRUE if 'ssc' rules out more than half what it considers to
1596 * be the "likely" possible matches, but of course it doesn't know what the
1597 * actual things being matched are going to be; these are only guesses
1599 * For /l matches, it assumes that the only likely matches are going to be
1600 * in the 0-255 range, uniformly distributed, so half of that is 127
1601 * For /a and /d matches, it assumes that the likely matches will be just
1602 * the ASCII range, so half of that is 63
1603 * For /u and there isn't anything matching above the Latin1 range, it
1604 * assumes that that is the only range likely to be matched, and uses
1605 * half that as the cut-off: 127. If anything matches above Latin1,
1606 * it assumes that all of Unicode could match (uniformly), except for
1607 * non-Unicode code points and things in the General Category "Other"
1608 * (unassigned, private use, surrogates, controls and formats). This
1609 * is a much large number. */
1611 const U32 max_match = (LOC)
1615 : (invlist_highest(ssc->invlist) < 256)
1617 : ((NON_OTHER_COUNT + 1) / 2) - 1;
1618 U32 count = 0; /* Running total of number of code points matched by
1620 UV start, end; /* Start and end points of current range in inversion
1623 PERL_ARGS_ASSERT_IS_SSC_WORTH_IT;
1625 invlist_iterinit(ssc->invlist);
1626 while (invlist_iternext(ssc->invlist, &start, &end)) {
1628 /* /u is the only thing that we expect to match above 255; so if not /u
1629 * and even if there are matches above 255, ignore them. This catches
1630 * things like \d under /d which does match the digits above 255, but
1631 * since the pattern is /d, it is not likely to be expecting them */
1632 if (! UNI_SEMANTICS) {
1636 end = MIN(end, 255);
1638 count += end - start + 1;
1639 if (count > max_match) {
1640 invlist_iterfinish(ssc->invlist);
1650 S_ssc_finalize(pTHX_ RExC_state_t *pRExC_state, regnode_ssc *ssc)
1652 /* The inversion list in the SSC is marked mortal; now we need a more
1653 * permanent copy, which is stored the same way that is done in a regular
1654 * ANYOF node, with the first NUM_ANYOF_CODE_POINTS code points in a bit
1657 SV* invlist = invlist_clone(ssc->invlist);
1659 PERL_ARGS_ASSERT_SSC_FINALIZE;
1661 assert(is_ANYOF_SYNTHETIC(ssc));
1663 /* The code in this file assumes that all but these flags aren't relevant
1664 * to the SSC, except SSC_MATCHES_EMPTY_STRING, which should be cleared
1665 * by the time we reach here */
1666 assert(! (ANYOF_FLAGS(ssc)
1667 & ~( ANYOF_COMMON_FLAGS
1668 |ANYOF_SHARED_d_MATCHES_ALL_NON_UTF8_NON_ASCII_non_d_WARN_SUPER)));
1670 populate_ANYOF_from_invlist( (regnode *) ssc, &invlist);
1672 set_ANYOF_arg(pRExC_state, (regnode *) ssc, invlist,
1673 NULL, NULL, NULL, FALSE);
1675 /* Make sure is clone-safe */
1676 ssc->invlist = NULL;
1678 if (ANYOF_POSIXL_SSC_TEST_ANY_SET(ssc)) {
1679 ANYOF_FLAGS(ssc) |= ANYOF_MATCHES_POSIXL;
1682 if (RExC_contains_locale) {
1686 assert(! (ANYOF_FLAGS(ssc) & ANYOF_LOCALE_FLAGS) || RExC_contains_locale);
1689 #define TRIE_LIST_ITEM(state,idx) (trie->states[state].trans.list)[ idx ]
1690 #define TRIE_LIST_CUR(state) ( TRIE_LIST_ITEM( state, 0 ).forid )
1691 #define TRIE_LIST_LEN(state) ( TRIE_LIST_ITEM( state, 0 ).newstate )
1692 #define TRIE_LIST_USED(idx) ( trie->states[state].trans.list \
1693 ? (TRIE_LIST_CUR( idx ) - 1) \
1699 dump_trie(trie,widecharmap,revcharmap)
1700 dump_trie_interim_list(trie,widecharmap,revcharmap,next_alloc)
1701 dump_trie_interim_table(trie,widecharmap,revcharmap,next_alloc)
1703 These routines dump out a trie in a somewhat readable format.
1704 The _interim_ variants are used for debugging the interim
1705 tables that are used to generate the final compressed
1706 representation which is what dump_trie expects.
1708 Part of the reason for their existence is to provide a form
1709 of documentation as to how the different representations function.
1714 Dumps the final compressed table form of the trie to Perl_debug_log.
1715 Used for debugging make_trie().
1719 S_dump_trie(pTHX_ const struct _reg_trie_data *trie, HV *widecharmap,
1720 AV *revcharmap, U32 depth)
1723 SV *sv=sv_newmortal();
1724 int colwidth= widecharmap ? 6 : 4;
1726 GET_RE_DEBUG_FLAGS_DECL;
1728 PERL_ARGS_ASSERT_DUMP_TRIE;
1730 PerlIO_printf( Perl_debug_log, "%*sChar : %-6s%-6s%-4s ",
1731 (int)depth * 2 + 2,"",
1732 "Match","Base","Ofs" );
1734 for( state = 0 ; state < trie->uniquecharcount ; state++ ) {
1735 SV ** const tmp = av_fetch( revcharmap, state, 0);
1737 PerlIO_printf( Perl_debug_log, "%*s",
1739 pv_pretty(sv, SvPV_nolen_const(*tmp), SvCUR(*tmp), colwidth,
1740 PL_colors[0], PL_colors[1],
1741 (SvUTF8(*tmp) ? PERL_PV_ESCAPE_UNI : 0) |
1742 PERL_PV_ESCAPE_FIRSTCHAR
1747 PerlIO_printf( Perl_debug_log, "\n%*sState|-----------------------",
1748 (int)depth * 2 + 2,"");
1750 for( state = 0 ; state < trie->uniquecharcount ; state++ )
1751 PerlIO_printf( Perl_debug_log, "%.*s", colwidth, "--------");
1752 PerlIO_printf( Perl_debug_log, "\n");
1754 for( state = 1 ; state < trie->statecount ; state++ ) {
1755 const U32 base = trie->states[ state ].trans.base;
1757 PerlIO_printf( Perl_debug_log, "%*s#%4"UVXf"|",
1758 (int)depth * 2 + 2,"", (UV)state);
1760 if ( trie->states[ state ].wordnum ) {
1761 PerlIO_printf( Perl_debug_log, " W%4X",
1762 trie->states[ state ].wordnum );
1764 PerlIO_printf( Perl_debug_log, "%6s", "" );
1767 PerlIO_printf( Perl_debug_log, " @%4"UVXf" ", (UV)base );
1772 while( ( base + ofs < trie->uniquecharcount ) ||
1773 ( base + ofs - trie->uniquecharcount < trie->lasttrans
1774 && trie->trans[ base + ofs - trie->uniquecharcount ].check
1778 PerlIO_printf( Perl_debug_log, "+%2"UVXf"[ ", (UV)ofs);
1780 for ( ofs = 0 ; ofs < trie->uniquecharcount ; ofs++ ) {
1781 if ( ( base + ofs >= trie->uniquecharcount )
1782 && ( base + ofs - trie->uniquecharcount
1784 && trie->trans[ base + ofs
1785 - trie->uniquecharcount ].check == state )
1787 PerlIO_printf( Perl_debug_log, "%*"UVXf,
1789 (UV)trie->trans[ base + ofs
1790 - trie->uniquecharcount ].next );
1792 PerlIO_printf( Perl_debug_log, "%*s",colwidth," ." );
1796 PerlIO_printf( Perl_debug_log, "]");
1799 PerlIO_printf( Perl_debug_log, "\n" );
1801 PerlIO_printf(Perl_debug_log, "%*sword_info N:(prev,len)=",
1803 for (word=1; word <= trie->wordcount; word++) {
1804 PerlIO_printf(Perl_debug_log, " %d:(%d,%d)",
1805 (int)word, (int)(trie->wordinfo[word].prev),
1806 (int)(trie->wordinfo[word].len));
1808 PerlIO_printf(Perl_debug_log, "\n" );
1811 Dumps a fully constructed but uncompressed trie in list form.
1812 List tries normally only are used for construction when the number of
1813 possible chars (trie->uniquecharcount) is very high.
1814 Used for debugging make_trie().
1817 S_dump_trie_interim_list(pTHX_ const struct _reg_trie_data *trie,
1818 HV *widecharmap, AV *revcharmap, U32 next_alloc,
1822 SV *sv=sv_newmortal();
1823 int colwidth= widecharmap ? 6 : 4;
1824 GET_RE_DEBUG_FLAGS_DECL;
1826 PERL_ARGS_ASSERT_DUMP_TRIE_INTERIM_LIST;
1828 /* print out the table precompression. */
1829 PerlIO_printf( Perl_debug_log, "%*sState :Word | Transition Data\n%*s%s",
1830 (int)depth * 2 + 2,"", (int)depth * 2 + 2,"",
1831 "------:-----+-----------------\n" );
1833 for( state=1 ; state < next_alloc ; state ++ ) {
1836 PerlIO_printf( Perl_debug_log, "%*s %4"UVXf" :",
1837 (int)depth * 2 + 2,"", (UV)state );
1838 if ( ! trie->states[ state ].wordnum ) {
1839 PerlIO_printf( Perl_debug_log, "%5s| ","");
1841 PerlIO_printf( Perl_debug_log, "W%4x| ",
1842 trie->states[ state ].wordnum
1845 for( charid = 1 ; charid <= TRIE_LIST_USED( state ) ; charid++ ) {
1846 SV ** const tmp = av_fetch( revcharmap,
1847 TRIE_LIST_ITEM(state,charid).forid, 0);
1849 PerlIO_printf( Perl_debug_log, "%*s:%3X=%4"UVXf" | ",
1851 pv_pretty(sv, SvPV_nolen_const(*tmp), SvCUR(*tmp),
1853 PL_colors[0], PL_colors[1],
1854 (SvUTF8(*tmp) ? PERL_PV_ESCAPE_UNI : 0)
1855 | PERL_PV_ESCAPE_FIRSTCHAR
1857 TRIE_LIST_ITEM(state,charid).forid,
1858 (UV)TRIE_LIST_ITEM(state,charid).newstate
1861 PerlIO_printf(Perl_debug_log, "\n%*s| ",
1862 (int)((depth * 2) + 14), "");
1865 PerlIO_printf( Perl_debug_log, "\n");
1870 Dumps a fully constructed but uncompressed trie in table form.
1871 This is the normal DFA style state transition table, with a few
1872 twists to facilitate compression later.
1873 Used for debugging make_trie().
1876 S_dump_trie_interim_table(pTHX_ const struct _reg_trie_data *trie,
1877 HV *widecharmap, AV *revcharmap, U32 next_alloc,
1882 SV *sv=sv_newmortal();
1883 int colwidth= widecharmap ? 6 : 4;
1884 GET_RE_DEBUG_FLAGS_DECL;
1886 PERL_ARGS_ASSERT_DUMP_TRIE_INTERIM_TABLE;
1889 print out the table precompression so that we can do a visual check
1890 that they are identical.
1893 PerlIO_printf( Perl_debug_log, "%*sChar : ",(int)depth * 2 + 2,"" );
1895 for( charid = 0 ; charid < trie->uniquecharcount ; charid++ ) {
1896 SV ** const tmp = av_fetch( revcharmap, charid, 0);
1898 PerlIO_printf( Perl_debug_log, "%*s",
1900 pv_pretty(sv, SvPV_nolen_const(*tmp), SvCUR(*tmp), colwidth,
1901 PL_colors[0], PL_colors[1],
1902 (SvUTF8(*tmp) ? PERL_PV_ESCAPE_UNI : 0) |
1903 PERL_PV_ESCAPE_FIRSTCHAR
1909 PerlIO_printf( Perl_debug_log, "\n%*sState+-",(int)depth * 2 + 2,"" );
1911 for( charid=0 ; charid < trie->uniquecharcount ; charid++ ) {
1912 PerlIO_printf( Perl_debug_log, "%.*s", colwidth,"--------");
1915 PerlIO_printf( Perl_debug_log, "\n" );
1917 for( state=1 ; state < next_alloc ; state += trie->uniquecharcount ) {
1919 PerlIO_printf( Perl_debug_log, "%*s%4"UVXf" : ",
1920 (int)depth * 2 + 2,"",
1921 (UV)TRIE_NODENUM( state ) );
1923 for( charid = 0 ; charid < trie->uniquecharcount ; charid++ ) {
1924 UV v=(UV)SAFE_TRIE_NODENUM( trie->trans[ state + charid ].next );
1926 PerlIO_printf( Perl_debug_log, "%*"UVXf, colwidth, v );
1928 PerlIO_printf( Perl_debug_log, "%*s", colwidth, "." );
1930 if ( ! trie->states[ TRIE_NODENUM( state ) ].wordnum ) {
1931 PerlIO_printf( Perl_debug_log, " (%4"UVXf")\n",
1932 (UV)trie->trans[ state ].check );
1934 PerlIO_printf( Perl_debug_log, " (%4"UVXf") W%4X\n",
1935 (UV)trie->trans[ state ].check,
1936 trie->states[ TRIE_NODENUM( state ) ].wordnum );
1944 /* make_trie(startbranch,first,last,tail,word_count,flags,depth)
1945 startbranch: the first branch in the whole branch sequence
1946 first : start branch of sequence of branch-exact nodes.
1947 May be the same as startbranch
1948 last : Thing following the last branch.
1949 May be the same as tail.
1950 tail : item following the branch sequence
1951 count : words in the sequence
1952 flags : currently the OP() type we will be building one of /EXACT(|F|FA|FU|FU_SS|L|FLU8)/
1953 depth : indent depth
1955 Inplace optimizes a sequence of 2 or more Branch-Exact nodes into a TRIE node.
1957 A trie is an N'ary tree where the branches are determined by digital
1958 decomposition of the key. IE, at the root node you look up the 1st character and
1959 follow that branch repeat until you find the end of the branches. Nodes can be
1960 marked as "accepting" meaning they represent a complete word. Eg:
1964 would convert into the following structure. Numbers represent states, letters
1965 following numbers represent valid transitions on the letter from that state, if
1966 the number is in square brackets it represents an accepting state, otherwise it
1967 will be in parenthesis.
1969 +-h->+-e->[3]-+-r->(8)-+-s->[9]
1973 (1) +-i->(6)-+-s->[7]
1975 +-s->(3)-+-h->(4)-+-e->[5]
1977 Accept Word Mapping: 3=>1 (he),5=>2 (she), 7=>3 (his), 9=>4 (hers)
1979 This shows that when matching against the string 'hers' we will begin at state 1
1980 read 'h' and move to state 2, read 'e' and move to state 3 which is accepting,
1981 then read 'r' and go to state 8 followed by 's' which takes us to state 9 which
1982 is also accepting. Thus we know that we can match both 'he' and 'hers' with a
1983 single traverse. We store a mapping from accepting to state to which word was
1984 matched, and then when we have multiple possibilities we try to complete the
1985 rest of the regex in the order in which they occurred in the alternation.
1987 The only prior NFA like behaviour that would be changed by the TRIE support is
1988 the silent ignoring of duplicate alternations which are of the form:
1990 / (DUPE|DUPE) X? (?{ ... }) Y /x
1992 Thus EVAL blocks following a trie may be called a different number of times with
1993 and without the optimisation. With the optimisations dupes will be silently
1994 ignored. This inconsistent behaviour of EVAL type nodes is well established as
1995 the following demonstrates:
1997 'words'=~/(word|word|word)(?{ print $1 })[xyz]/
1999 which prints out 'word' three times, but
2001 'words'=~/(word|word|word)(?{ print $1 })S/
2003 which doesnt print it out at all. This is due to other optimisations kicking in.
2005 Example of what happens on a structural level:
2007 The regexp /(ac|ad|ab)+/ will produce the following debug output:
2009 1: CURLYM[1] {1,32767}(18)
2020 This would be optimizable with startbranch=5, first=5, last=16, tail=16
2021 and should turn into:
2023 1: CURLYM[1] {1,32767}(18)
2025 [Words:3 Chars Stored:6 Unique Chars:4 States:5 NCP:1]
2033 Cases where tail != last would be like /(?foo|bar)baz/:
2043 which would be optimizable with startbranch=1, first=1, last=7, tail=8
2044 and would end up looking like:
2047 [Words:2 Chars Stored:6 Unique Chars:5 States:7 NCP:1]
2054 d = uvchr_to_utf8_flags(d, uv, 0);
2056 is the recommended Unicode-aware way of saying
2061 #define TRIE_STORE_REVCHAR(val) \
2064 SV *zlopp = newSV(UTF8_MAXBYTES); \
2065 unsigned char *flrbbbbb = (unsigned char *) SvPVX(zlopp); \
2066 unsigned const char *const kapow = uvchr_to_utf8(flrbbbbb, val); \
2067 SvCUR_set(zlopp, kapow - flrbbbbb); \
2070 av_push(revcharmap, zlopp); \
2072 char ooooff = (char)val; \
2073 av_push(revcharmap, newSVpvn(&ooooff, 1)); \
2077 /* This gets the next character from the input, folding it if not already
2079 #define TRIE_READ_CHAR STMT_START { \
2082 /* if it is UTF then it is either already folded, or does not need \
2084 uvc = valid_utf8_to_uvchr( (const U8*) uc, &len); \
2086 else if (folder == PL_fold_latin1) { \
2087 /* This folder implies Unicode rules, which in the range expressible \
2088 * by not UTF is the lower case, with the two exceptions, one of \
2089 * which should have been taken care of before calling this */ \
2090 assert(*uc != LATIN_SMALL_LETTER_SHARP_S); \
2091 uvc = toLOWER_L1(*uc); \
2092 if (UNLIKELY(uvc == MICRO_SIGN)) uvc = GREEK_SMALL_LETTER_MU; \
2095 /* raw data, will be folded later if needed */ \
2103 #define TRIE_LIST_PUSH(state,fid,ns) STMT_START { \
2104 if ( TRIE_LIST_CUR( state ) >=TRIE_LIST_LEN( state ) ) { \
2105 U32 ging = TRIE_LIST_LEN( state ) *= 2; \
2106 Renew( trie->states[ state ].trans.list, ging, reg_trie_trans_le ); \
2108 TRIE_LIST_ITEM( state, TRIE_LIST_CUR( state ) ).forid = fid; \
2109 TRIE_LIST_ITEM( state, TRIE_LIST_CUR( state ) ).newstate = ns; \
2110 TRIE_LIST_CUR( state )++; \
2113 #define TRIE_LIST_NEW(state) STMT_START { \
2114 Newxz( trie->states[ state ].trans.list, \
2115 4, reg_trie_trans_le ); \
2116 TRIE_LIST_CUR( state ) = 1; \
2117 TRIE_LIST_LEN( state ) = 4; \
2120 #define TRIE_HANDLE_WORD(state) STMT_START { \
2121 U16 dupe= trie->states[ state ].wordnum; \
2122 regnode * const noper_next = regnext( noper ); \
2125 /* store the word for dumping */ \
2127 if (OP(noper) != NOTHING) \
2128 tmp = newSVpvn_utf8(STRING(noper), STR_LEN(noper), UTF); \
2130 tmp = newSVpvn_utf8( "", 0, UTF ); \
2131 av_push( trie_words, tmp ); \
2135 trie->wordinfo[curword].prev = 0; \
2136 trie->wordinfo[curword].len = wordlen; \
2137 trie->wordinfo[curword].accept = state; \
2139 if ( noper_next < tail ) { \
2141 trie->jump = (U16 *) PerlMemShared_calloc( word_count + 1, \
2143 trie->jump[curword] = (U16)(noper_next - convert); \
2145 jumper = noper_next; \
2147 nextbranch= regnext(cur); \
2151 /* It's a dupe. Pre-insert into the wordinfo[].prev */\
2152 /* chain, so that when the bits of chain are later */\
2153 /* linked together, the dups appear in the chain */\
2154 trie->wordinfo[curword].prev = trie->wordinfo[dupe].prev; \
2155 trie->wordinfo[dupe].prev = curword; \
2157 /* we haven't inserted this word yet. */ \
2158 trie->states[ state ].wordnum = curword; \
2163 #define TRIE_TRANS_STATE(state,base,ucharcount,charid,special) \
2164 ( ( base + charid >= ucharcount \
2165 && base + charid < ubound \
2166 && state == trie->trans[ base - ucharcount + charid ].check \
2167 && trie->trans[ base - ucharcount + charid ].next ) \
2168 ? trie->trans[ base - ucharcount + charid ].next \
2169 : ( state==1 ? special : 0 ) \
2173 #define MADE_JUMP_TRIE 2
2174 #define MADE_EXACT_TRIE 4
2177 S_make_trie(pTHX_ RExC_state_t *pRExC_state, regnode *startbranch,
2178 regnode *first, regnode *last, regnode *tail,
2179 U32 word_count, U32 flags, U32 depth)
2181 /* first pass, loop through and scan words */
2182 reg_trie_data *trie;
2183 HV *widecharmap = NULL;
2184 AV *revcharmap = newAV();
2190 regnode *jumper = NULL;
2191 regnode *nextbranch = NULL;
2192 regnode *convert = NULL;
2193 U32 *prev_states; /* temp array mapping each state to previous one */
2194 /* we just use folder as a flag in utf8 */
2195 const U8 * folder = NULL;
2198 const U32 data_slot = add_data( pRExC_state, STR_WITH_LEN("tuuu"));
2199 AV *trie_words = NULL;
2200 /* along with revcharmap, this only used during construction but both are
2201 * useful during debugging so we store them in the struct when debugging.
2204 const U32 data_slot = add_data( pRExC_state, STR_WITH_LEN("tu"));
2205 STRLEN trie_charcount=0;
2207 SV *re_trie_maxbuff;
2208 GET_RE_DEBUG_FLAGS_DECL;
2210 PERL_ARGS_ASSERT_MAKE_TRIE;
2212 PERL_UNUSED_ARG(depth);
2216 case EXACT: case EXACTL: break;
2220 case EXACTFLU8: folder = PL_fold_latin1; break;
2221 case EXACTF: folder = PL_fold; break;
2222 default: Perl_croak( aTHX_ "panic! In trie construction, unknown node type %u %s", (unsigned) flags, PL_reg_name[flags] );
2225 trie = (reg_trie_data *) PerlMemShared_calloc( 1, sizeof(reg_trie_data) );
2227 trie->startstate = 1;
2228 trie->wordcount = word_count;
2229 RExC_rxi->data->data[ data_slot ] = (void*)trie;
2230 trie->charmap = (U16 *) PerlMemShared_calloc( 256, sizeof(U16) );
2231 if (flags == EXACT || flags == EXACTL)
2232 trie->bitmap = (char *) PerlMemShared_calloc( ANYOF_BITMAP_SIZE, 1 );
2233 trie->wordinfo = (reg_trie_wordinfo *) PerlMemShared_calloc(
2234 trie->wordcount+1, sizeof(reg_trie_wordinfo));
2237 trie_words = newAV();
2240 re_trie_maxbuff = get_sv(RE_TRIE_MAXBUF_NAME, 1);
2241 assert(re_trie_maxbuff);
2242 if (!SvIOK(re_trie_maxbuff)) {
2243 sv_setiv(re_trie_maxbuff, RE_TRIE_MAXBUF_INIT);
2245 DEBUG_TRIE_COMPILE_r({
2246 PerlIO_printf( Perl_debug_log,
2247 "%*smake_trie start==%d, first==%d, last==%d, tail==%d depth=%d\n",
2248 (int)depth * 2 + 2, "",
2249 REG_NODE_NUM(startbranch),REG_NODE_NUM(first),
2250 REG_NODE_NUM(last), REG_NODE_NUM(tail), (int)depth);
2253 /* Find the node we are going to overwrite */
2254 if ( first == startbranch && OP( last ) != BRANCH ) {
2255 /* whole branch chain */
2258 /* branch sub-chain */
2259 convert = NEXTOPER( first );
2262 /* -- First loop and Setup --
2264 We first traverse the branches and scan each word to determine if it
2265 contains widechars, and how many unique chars there are, this is
2266 important as we have to build a table with at least as many columns as we
2269 We use an array of integers to represent the character codes 0..255
2270 (trie->charmap) and we use a an HV* to store Unicode characters. We use
2271 the native representation of the character value as the key and IV's for
2274 *TODO* If we keep track of how many times each character is used we can
2275 remap the columns so that the table compression later on is more
2276 efficient in terms of memory by ensuring the most common value is in the
2277 middle and the least common are on the outside. IMO this would be better
2278 than a most to least common mapping as theres a decent chance the most
2279 common letter will share a node with the least common, meaning the node
2280 will not be compressible. With a middle is most common approach the worst
2281 case is when we have the least common nodes twice.
2285 for ( cur = first ; cur < last ; cur = regnext( cur ) ) {
2286 regnode *noper = NEXTOPER( cur );
2287 const U8 *uc = (U8*)STRING( noper );
2288 const U8 *e = uc + STR_LEN( noper );
2290 U32 wordlen = 0; /* required init */
2291 STRLEN minchars = 0;
2292 STRLEN maxchars = 0;
2293 bool set_bit = trie->bitmap ? 1 : 0; /*store the first char in the
2296 if (OP(noper) == NOTHING) {
2297 regnode *noper_next= regnext(noper);
2298 if (noper_next != tail && OP(noper_next) == flags) {
2300 uc= (U8*)STRING(noper);
2301 e= uc + STR_LEN(noper);
2302 trie->minlen= STR_LEN(noper);
2309 if ( set_bit ) { /* bitmap only alloced when !(UTF&&Folding) */
2310 TRIE_BITMAP_SET(trie,*uc); /* store the raw first byte
2311 regardless of encoding */
2312 if (OP( noper ) == EXACTFU_SS) {
2313 /* false positives are ok, so just set this */
2314 TRIE_BITMAP_SET(trie, LATIN_SMALL_LETTER_SHARP_S);
2317 for ( ; uc < e ; uc += len ) { /* Look at each char in the current
2319 TRIE_CHARCOUNT(trie)++;
2322 /* TRIE_READ_CHAR returns the current character, or its fold if /i
2323 * is in effect. Under /i, this character can match itself, or
2324 * anything that folds to it. If not under /i, it can match just
2325 * itself. Most folds are 1-1, for example k, K, and KELVIN SIGN
2326 * all fold to k, and all are single characters. But some folds
2327 * expand to more than one character, so for example LATIN SMALL
2328 * LIGATURE FFI folds to the three character sequence 'ffi'. If
2329 * the string beginning at 'uc' is 'ffi', it could be matched by
2330 * three characters, or just by the one ligature character. (It
2331 * could also be matched by two characters: LATIN SMALL LIGATURE FF
2332 * followed by 'i', or by 'f' followed by LATIN SMALL LIGATURE FI).
2333 * (Of course 'I' and/or 'F' instead of 'i' and 'f' can also
2334 * match.) The trie needs to know the minimum and maximum number
2335 * of characters that could match so that it can use size alone to
2336 * quickly reject many match attempts. The max is simple: it is
2337 * the number of folded characters in this branch (since a fold is
2338 * never shorter than what folds to it. */
2342 /* And the min is equal to the max if not under /i (indicated by
2343 * 'folder' being NULL), or there are no multi-character folds. If
2344 * there is a multi-character fold, the min is incremented just
2345 * once, for the character that folds to the sequence. Each
2346 * character in the sequence needs to be added to the list below of
2347 * characters in the trie, but we count only the first towards the
2348 * min number of characters needed. This is done through the
2349 * variable 'foldlen', which is returned by the macros that look
2350 * for these sequences as the number of bytes the sequence
2351 * occupies. Each time through the loop, we decrement 'foldlen' by
2352 * how many bytes the current char occupies. Only when it reaches
2353 * 0 do we increment 'minchars' or look for another multi-character
2355 if (folder == NULL) {
2358 else if (foldlen > 0) {
2359 foldlen -= (UTF) ? UTF8SKIP(uc) : 1;
2364 /* See if *uc is the beginning of a multi-character fold. If
2365 * so, we decrement the length remaining to look at, to account
2366 * for the current character this iteration. (We can use 'uc'
2367 * instead of the fold returned by TRIE_READ_CHAR because for
2368 * non-UTF, the latin1_safe macro is smart enough to account
2369 * for all the unfolded characters, and because for UTF, the
2370 * string will already have been folded earlier in the
2371 * compilation process */
2373 if ((foldlen = is_MULTI_CHAR_FOLD_utf8_safe(uc, e))) {
2374 foldlen -= UTF8SKIP(uc);
2377 else if ((foldlen = is_MULTI_CHAR_FOLD_latin1_safe(uc, e))) {
2382 /* The current character (and any potential folds) should be added
2383 * to the possible matching characters for this position in this
2387 U8 folded= folder[ (U8) uvc ];
2388 if ( !trie->charmap[ folded ] ) {
2389 trie->charmap[ folded ]=( ++trie->uniquecharcount );
2390 TRIE_STORE_REVCHAR( folded );
2393 if ( !trie->charmap[ uvc ] ) {
2394 trie->charmap[ uvc ]=( ++trie->uniquecharcount );
2395 TRIE_STORE_REVCHAR( uvc );
2398 /* store the codepoint in the bitmap, and its folded
2400 TRIE_BITMAP_SET(trie, uvc);
2402 /* store the folded codepoint */
2403 if ( folder ) TRIE_BITMAP_SET(trie, folder[(U8) uvc ]);
2406 /* store first byte of utf8 representation of
2407 variant codepoints */
2408 if (! UVCHR_IS_INVARIANT(uvc)) {
2409 TRIE_BITMAP_SET(trie, UTF8_TWO_BYTE_HI(uvc));
2412 set_bit = 0; /* We've done our bit :-) */
2416 /* XXX We could come up with the list of code points that fold
2417 * to this using PL_utf8_foldclosures, except not for
2418 * multi-char folds, as there may be multiple combinations
2419 * there that could work, which needs to wait until runtime to
2420 * resolve (The comment about LIGATURE FFI above is such an
2425 widecharmap = newHV();
2427 svpp = hv_fetch( widecharmap, (char*)&uvc, sizeof( UV ), 1 );
2430 Perl_croak( aTHX_ "error creating/fetching widecharmap entry for 0x%"UVXf, uvc );
2432 if ( !SvTRUE( *svpp ) ) {
2433 sv_setiv( *svpp, ++trie->uniquecharcount );
2434 TRIE_STORE_REVCHAR(uvc);
2437 } /* end loop through characters in this branch of the trie */
2439 /* We take the min and max for this branch and combine to find the min
2440 * and max for all branches processed so far */
2441 if( cur == first ) {
2442 trie->minlen = minchars;
2443 trie->maxlen = maxchars;
2444 } else if (minchars < trie->minlen) {
2445 trie->minlen = minchars;
2446 } else if (maxchars > trie->maxlen) {
2447 trie->maxlen = maxchars;
2449 } /* end first pass */
2450 DEBUG_TRIE_COMPILE_r(
2451 PerlIO_printf( Perl_debug_log,
2452 "%*sTRIE(%s): W:%d C:%d Uq:%d Min:%d Max:%d\n",
2453 (int)depth * 2 + 2,"",
2454 ( widecharmap ? "UTF8" : "NATIVE" ), (int)word_count,
2455 (int)TRIE_CHARCOUNT(trie), trie->uniquecharcount,
2456 (int)trie->minlen, (int)trie->maxlen )
2460 We now know what we are dealing with in terms of unique chars and
2461 string sizes so we can calculate how much memory a naive
2462 representation using a flat table will take. If it's over a reasonable
2463 limit (as specified by ${^RE_TRIE_MAXBUF}) we use a more memory
2464 conservative but potentially much slower representation using an array
2467 At the end we convert both representations into the same compressed
2468 form that will be used in regexec.c for matching with. The latter
2469 is a form that cannot be used to construct with but has memory
2470 properties similar to the list form and access properties similar
2471 to the table form making it both suitable for fast searches and
2472 small enough that its feasable to store for the duration of a program.
2474 See the comment in the code where the compressed table is produced
2475 inplace from the flat tabe representation for an explanation of how
2476 the compression works.
2481 Newx(prev_states, TRIE_CHARCOUNT(trie) + 2, U32);
2484 if ( (IV)( ( TRIE_CHARCOUNT(trie) + 1 ) * trie->uniquecharcount + 1)
2485 > SvIV(re_trie_maxbuff) )
2488 Second Pass -- Array Of Lists Representation
2490 Each state will be represented by a list of charid:state records
2491 (reg_trie_trans_le) the first such element holds the CUR and LEN
2492 points of the allocated array. (See defines above).
2494 We build the initial structure using the lists, and then convert
2495 it into the compressed table form which allows faster lookups
2496 (but cant be modified once converted).
2499 STRLEN transcount = 1;
2501 DEBUG_TRIE_COMPILE_MORE_r( PerlIO_printf( Perl_debug_log,
2502 "%*sCompiling trie using list compiler\n",
2503 (int)depth * 2 + 2, ""));
2505 trie->states = (reg_trie_state *)
2506 PerlMemShared_calloc( TRIE_CHARCOUNT(trie) + 2,
2507 sizeof(reg_trie_state) );
2511 for ( cur = first ; cur < last ; cur = regnext( cur ) ) {
2513 regnode *noper = NEXTOPER( cur );
2514 U8 *uc = (U8*)STRING( noper );
2515 const U8 *e = uc + STR_LEN( noper );
2516 U32 state = 1; /* required init */
2517 U16 charid = 0; /* sanity init */
2518 U32 wordlen = 0; /* required init */
2520 if (OP(noper) == NOTHING) {
2521 regnode *noper_next= regnext(noper);
2522 if (noper_next != tail && OP(noper_next) == flags) {
2524 uc= (U8*)STRING(noper);
2525 e= uc + STR_LEN(noper);
2529 if (OP(noper) != NOTHING) {
2530 for ( ; uc < e ; uc += len ) {
2535 charid = trie->charmap[ uvc ];
2537 SV** const svpp = hv_fetch( widecharmap,
2544 charid=(U16)SvIV( *svpp );
2547 /* charid is now 0 if we dont know the char read, or
2548 * nonzero if we do */
2555 if ( !trie->states[ state ].trans.list ) {
2556 TRIE_LIST_NEW( state );
2559 check <= TRIE_LIST_USED( state );
2562 if ( TRIE_LIST_ITEM( state, check ).forid
2565 newstate = TRIE_LIST_ITEM( state, check ).newstate;
2570 newstate = next_alloc++;
2571 prev_states[newstate] = state;
2572 TRIE_LIST_PUSH( state, charid, newstate );
2577 Perl_croak( aTHX_ "panic! In trie construction, no char mapping for %"IVdf, uvc );
2581 TRIE_HANDLE_WORD(state);
2583 } /* end second pass */
2585 /* next alloc is the NEXT state to be allocated */
2586 trie->statecount = next_alloc;
2587 trie->states = (reg_trie_state *)
2588 PerlMemShared_realloc( trie->states,
2590 * sizeof(reg_trie_state) );
2592 /* and now dump it out before we compress it */
2593 DEBUG_TRIE_COMPILE_MORE_r(dump_trie_interim_list(trie, widecharmap,
2594 revcharmap, next_alloc,
2598 trie->trans = (reg_trie_trans *)
2599 PerlMemShared_calloc( transcount, sizeof(reg_trie_trans) );
2606 for( state=1 ; state < next_alloc ; state ++ ) {
2610 DEBUG_TRIE_COMPILE_MORE_r(
2611 PerlIO_printf( Perl_debug_log, "tp: %d zp: %d ",tp,zp)
2615 if (trie->states[state].trans.list) {
2616 U16 minid=TRIE_LIST_ITEM( state, 1).forid;
2620 for( idx = 2 ; idx <= TRIE_LIST_USED( state ) ; idx++ ) {
2621 const U16 forid = TRIE_LIST_ITEM( state, idx).forid;
2622 if ( forid < minid ) {
2624 } else if ( forid > maxid ) {
2628 if ( transcount < tp + maxid - minid + 1) {
2630 trie->trans = (reg_trie_trans *)
2631 PerlMemShared_realloc( trie->trans,
2633 * sizeof(reg_trie_trans) );
2634 Zero( trie->trans + (transcount / 2),
2638 base = trie->uniquecharcount + tp - minid;
2639 if ( maxid == minid ) {
2641 for ( ; zp < tp ; zp++ ) {
2642 if ( ! trie->trans[ zp ].next ) {
2643 base = trie->uniquecharcount + zp - minid;
2644 trie->trans[ zp ].next = TRIE_LIST_ITEM( state,
2646 trie->trans[ zp ].check = state;
2652 trie->trans[ tp ].next = TRIE_LIST_ITEM( state,
2654 trie->trans[ tp ].check = state;
2659 for ( idx=1; idx <= TRIE_LIST_USED( state ) ; idx++ ) {
2660 const U32 tid = base
2661 - trie->uniquecharcount
2662 + TRIE_LIST_ITEM( state, idx ).forid;
2663 trie->trans[ tid ].next = TRIE_LIST_ITEM( state,
2665 trie->trans[ tid ].check = state;
2667 tp += ( maxid - minid + 1 );
2669 Safefree(trie->states[ state ].trans.list);
2672 DEBUG_TRIE_COMPILE_MORE_r(
2673 PerlIO_printf( Perl_debug_log, " base: %d\n",base);
2676 trie->states[ state ].trans.base=base;
2678 trie->lasttrans = tp + 1;
2682 Second Pass -- Flat Table Representation.
2684 we dont use the 0 slot of either trans[] or states[] so we add 1 to
2685 each. We know that we will need Charcount+1 trans at most to store
2686 the data (one row per char at worst case) So we preallocate both
2687 structures assuming worst case.
2689 We then construct the trie using only the .next slots of the entry
2692 We use the .check field of the first entry of the node temporarily
2693 to make compression both faster and easier by keeping track of how
2694 many non zero fields are in the node.
2696 Since trans are numbered from 1 any 0 pointer in the table is a FAIL
2699 There are two terms at use here: state as a TRIE_NODEIDX() which is
2700 a number representing the first entry of the node, and state as a
2701 TRIE_NODENUM() which is the trans number. state 1 is TRIE_NODEIDX(1)
2702 and TRIE_NODENUM(1), state 2 is TRIE_NODEIDX(2) and TRIE_NODENUM(3)
2703 if there are 2 entrys per node. eg:
2711 The table is internally in the right hand, idx form. However as we
2712 also have to deal with the states array which is indexed by nodenum
2713 we have to use TRIE_NODENUM() to convert.
2716 DEBUG_TRIE_COMPILE_MORE_r( PerlIO_printf( Perl_debug_log,
2717 "%*sCompiling trie using table compiler\n",
2718 (int)depth * 2 + 2, ""));
2720 trie->trans = (reg_trie_trans *)
2721 PerlMemShared_calloc( ( TRIE_CHARCOUNT(trie) + 1 )
2722 * trie->uniquecharcount + 1,
2723 sizeof(reg_trie_trans) );
2724 trie->states = (reg_trie_state *)
2725 PerlMemShared_calloc( TRIE_CHARCOUNT(trie) + 2,
2726 sizeof(reg_trie_state) );
2727 next_alloc = trie->uniquecharcount + 1;
2730 for ( cur = first ; cur < last ; cur = regnext( cur ) ) {
2732 regnode *noper = NEXTOPER( cur );
2733 const U8 *uc = (U8*)STRING( noper );
2734 const U8 *e = uc + STR_LEN( noper );
2736 U32 state = 1; /* required init */
2738 U16 charid = 0; /* sanity init */
2739 U32 accept_state = 0; /* sanity init */
2741 U32 wordlen = 0; /* required init */
2743 if (OP(noper) == NOTHING) {
2744 regnode *noper_next= regnext(noper);
2745 if (noper_next != tail && OP(noper_next) == flags) {
2747 uc= (U8*)STRING(noper);
2748 e= uc + STR_LEN(noper);
2752 if ( OP(noper) != NOTHING ) {
2753 for ( ; uc < e ; uc += len ) {
2758 charid = trie->charmap[ uvc ];
2760 SV* const * const svpp = hv_fetch( widecharmap,
2764 charid = svpp ? (U16)SvIV(*svpp) : 0;
2768 if ( !trie->trans[ state + charid ].next ) {
2769 trie->trans[ state + charid ].next = next_alloc;
2770 trie->trans[ state ].check++;
2771 prev_states[TRIE_NODENUM(next_alloc)]
2772 = TRIE_NODENUM(state);
2773 next_alloc += trie->uniquecharcount;
2775 state = trie->trans[ state + charid ].next;
2777 Perl_croak( aTHX_ "panic! In trie construction, no char mapping for %"IVdf, uvc );
2779 /* charid is now 0 if we dont know the char read, or
2780 * nonzero if we do */
2783 accept_state = TRIE_NODENUM( state );
2784 TRIE_HANDLE_WORD(accept_state);
2786 } /* end second pass */
2788 /* and now dump it out before we compress it */
2789 DEBUG_TRIE_COMPILE_MORE_r(dump_trie_interim_table(trie, widecharmap,
2791 next_alloc, depth+1));
2795 * Inplace compress the table.*
2797 For sparse data sets the table constructed by the trie algorithm will
2798 be mostly 0/FAIL transitions or to put it another way mostly empty.
2799 (Note that leaf nodes will not contain any transitions.)
2801 This algorithm compresses the tables by eliminating most such
2802 transitions, at the cost of a modest bit of extra work during lookup:
2804 - Each states[] entry contains a .base field which indicates the
2805 index in the state[] array wheres its transition data is stored.
2807 - If .base is 0 there are no valid transitions from that node.
2809 - If .base is nonzero then charid is added to it to find an entry in
2812 -If trans[states[state].base+charid].check!=state then the
2813 transition is taken to be a 0/Fail transition. Thus if there are fail
2814 transitions at the front of the node then the .base offset will point
2815 somewhere inside the previous nodes data (or maybe even into a node
2816 even earlier), but the .check field determines if the transition is
2820 The following process inplace converts the table to the compressed
2821 table: We first do not compress the root node 1,and mark all its
2822 .check pointers as 1 and set its .base pointer as 1 as well. This
2823 allows us to do a DFA construction from the compressed table later,
2824 and ensures that any .base pointers we calculate later are greater
2827 - We set 'pos' to indicate the first entry of the second node.
2829 - We then iterate over the columns of the node, finding the first and
2830 last used entry at l and m. We then copy l..m into pos..(pos+m-l),
2831 and set the .check pointers accordingly, and advance pos
2832 appropriately and repreat for the next node. Note that when we copy
2833 the next pointers we have to convert them from the original
2834 NODEIDX form to NODENUM form as the former is not valid post
2837 - If a node has no transitions used we mark its base as 0 and do not
2838 advance the pos pointer.
2840 - If a node only has one transition we use a second pointer into the
2841 structure to fill in allocated fail transitions from other states.
2842 This pointer is independent of the main pointer and scans forward
2843 looking for null transitions that are allocated to a state. When it
2844 finds one it writes the single transition into the "hole". If the
2845 pointer doesnt find one the single transition is appended as normal.
2847 - Once compressed we can Renew/realloc the structures to release the
2850 See "Table-Compression Methods" in sec 3.9 of the Red Dragon,
2851 specifically Fig 3.47 and the associated pseudocode.
2855 const U32 laststate = TRIE_NODENUM( next_alloc );
2858 trie->statecount = laststate;
2860 for ( state = 1 ; state < laststate ; state++ ) {
2862 const U32 stateidx = TRIE_NODEIDX( state );
2863 const U32 o_used = trie->trans[ stateidx ].check;
2864 U32 used = trie->trans[ stateidx ].check;
2865 trie->trans[ stateidx ].check = 0;
2868 used && charid < trie->uniquecharcount;
2871 if ( flag || trie->trans[ stateidx + charid ].next ) {
2872 if ( trie->trans[ stateidx + charid ].next ) {
2874 for ( ; zp < pos ; zp++ ) {
2875 if ( ! trie->trans[ zp ].next ) {
2879 trie->states[ state ].trans.base
2881 + trie->uniquecharcount
2883 trie->trans[ zp ].next
2884 = SAFE_TRIE_NODENUM( trie->trans[ stateidx
2886 trie->trans[ zp ].check = state;
2887 if ( ++zp > pos ) pos = zp;
2894 trie->states[ state ].trans.base
2895 = pos + trie->uniquecharcount - charid ;
2897 trie->trans[ pos ].next
2898 = SAFE_TRIE_NODENUM(
2899 trie->trans[ stateidx + charid ].next );
2900 trie->trans[ pos ].check = state;
2905 trie->lasttrans = pos + 1;
2906 trie->states = (reg_trie_state *)
2907 PerlMemShared_realloc( trie->states, laststate
2908 * sizeof(reg_trie_state) );
2909 DEBUG_TRIE_COMPILE_MORE_r(
2910 PerlIO_printf( Perl_debug_log,
2911 "%*sAlloc: %d Orig: %"IVdf" elements, Final:%"IVdf". Savings of %%%5.2f\n",
2912 (int)depth * 2 + 2,"",
2913 (int)( ( TRIE_CHARCOUNT(trie) + 1 ) * trie->uniquecharcount
2917 ( ( next_alloc - pos ) * 100 ) / (double)next_alloc );
2920 } /* end table compress */
2922 DEBUG_TRIE_COMPILE_MORE_r(
2923 PerlIO_printf(Perl_debug_log,
2924 "%*sStatecount:%"UVxf" Lasttrans:%"UVxf"\n",
2925 (int)depth * 2 + 2, "",
2926 (UV)trie->statecount,
2927 (UV)trie->lasttrans)
2929 /* resize the trans array to remove unused space */
2930 trie->trans = (reg_trie_trans *)
2931 PerlMemShared_realloc( trie->trans, trie->lasttrans
2932 * sizeof(reg_trie_trans) );
2934 { /* Modify the program and insert the new TRIE node */
2935 U8 nodetype =(U8)(flags & 0xFF);
2939 regnode *optimize = NULL;
2940 #ifdef RE_TRACK_PATTERN_OFFSETS
2943 U32 mjd_nodelen = 0;
2944 #endif /* RE_TRACK_PATTERN_OFFSETS */
2945 #endif /* DEBUGGING */
2947 This means we convert either the first branch or the first Exact,
2948 depending on whether the thing following (in 'last') is a branch
2949 or not and whther first is the startbranch (ie is it a sub part of
2950 the alternation or is it the whole thing.)
2951 Assuming its a sub part we convert the EXACT otherwise we convert
2952 the whole branch sequence, including the first.
2954 /* Find the node we are going to overwrite */
2955 if ( first != startbranch || OP( last ) == BRANCH ) {
2956 /* branch sub-chain */
2957 NEXT_OFF( first ) = (U16)(last - first);
2958 #ifdef RE_TRACK_PATTERN_OFFSETS
2960 mjd_offset= Node_Offset((convert));
2961 mjd_nodelen= Node_Length((convert));
2964 /* whole branch chain */
2966 #ifdef RE_TRACK_PATTERN_OFFSETS
2969 const regnode *nop = NEXTOPER( convert );
2970 mjd_offset= Node_Offset((nop));
2971 mjd_nodelen= Node_Length((nop));
2975 PerlIO_printf(Perl_debug_log,
2976 "%*sMJD offset:%"UVuf" MJD length:%"UVuf"\n",
2977 (int)depth * 2 + 2, "",
2978 (UV)mjd_offset, (UV)mjd_nodelen)
2981 /* But first we check to see if there is a common prefix we can
2982 split out as an EXACT and put in front of the TRIE node. */
2983 trie->startstate= 1;
2984 if ( trie->bitmap && !widecharmap && !trie->jump ) {
2986 for ( state = 1 ; state < trie->statecount-1 ; state++ ) {
2990 const U32 base = trie->states[ state ].trans.base;
2992 if ( trie->states[state].wordnum )
2995 for ( ofs = 0 ; ofs < trie->uniquecharcount ; ofs++ ) {
2996 if ( ( base + ofs >= trie->uniquecharcount ) &&
2997 ( base + ofs - trie->uniquecharcount < trie->lasttrans ) &&
2998 trie->trans[ base + ofs - trie->uniquecharcount ].check == state )
3000 if ( ++count > 1 ) {
3001 SV **tmp = av_fetch( revcharmap, ofs, 0);
3002 const U8 *ch = (U8*)SvPV_nolen_const( *tmp );
3003 if ( state == 1 ) break;
3005 Zero(trie->bitmap, ANYOF_BITMAP_SIZE, char);
3007 PerlIO_printf(Perl_debug_log,
3008 "%*sNew Start State=%"UVuf" Class: [",
3009 (int)depth * 2 + 2, "",
3012 SV ** const tmp = av_fetch( revcharmap, idx, 0);
3013 const U8 * const ch = (U8*)SvPV_nolen_const( *tmp );
3015 TRIE_BITMAP_SET(trie,*ch);
3017 TRIE_BITMAP_SET(trie, folder[ *ch ]);
3019 PerlIO_printf(Perl_debug_log, "%s", (char*)ch)
3023 TRIE_BITMAP_SET(trie,*ch);
3025 TRIE_BITMAP_SET(trie,folder[ *ch ]);
3026 DEBUG_OPTIMISE_r(PerlIO_printf( Perl_debug_log,"%s", ch));
3032 SV **tmp = av_fetch( revcharmap, idx, 0);
3034 char *ch = SvPV( *tmp, len );
3036 SV *sv=sv_newmortal();
3037 PerlIO_printf( Perl_debug_log,
3038 "%*sPrefix State: %"UVuf" Idx:%"UVuf" Char='%s'\n",
3039 (int)depth * 2 + 2, "",
3041 pv_pretty(sv, SvPV_nolen_const(*tmp), SvCUR(*tmp), 6,
3042 PL_colors[0], PL_colors[1],
3043 (SvUTF8(*tmp) ? PERL_PV_ESCAPE_UNI : 0) |
3044 PERL_PV_ESCAPE_FIRSTCHAR
3049 OP( convert ) = nodetype;
3050 str=STRING(convert);
3053 STR_LEN(convert) += len;
3059 DEBUG_OPTIMISE_r(PerlIO_printf( Perl_debug_log,"]\n"));
3064 trie->prefixlen = (state-1);
3066 regnode *n = convert+NODE_SZ_STR(convert);
3067 NEXT_OFF(convert) = NODE_SZ_STR(convert);
3068 trie->startstate = state;
3069 trie->minlen -= (state - 1);
3070 trie->maxlen -= (state - 1);
3072 /* At least the UNICOS C compiler choked on this
3073 * being argument to DEBUG_r(), so let's just have
3076 #ifdef PERL_EXT_RE_BUILD
3082 regnode *fix = convert;
3083 U32 word = trie->wordcount;
3085 Set_Node_Offset_Length(convert, mjd_offset, state - 1);
3086 while( ++fix < n ) {
3087 Set_Node_Offset_Length(fix, 0, 0);
3090 SV ** const tmp = av_fetch( trie_words, word, 0 );
3092 if ( STR_LEN(convert) <= SvCUR(*tmp) )
3093 sv_chop(*tmp, SvPV_nolen(*tmp) + STR_LEN(convert));
3095 sv_chop(*tmp, SvPV_nolen(*tmp) + SvCUR(*tmp));
3103 NEXT_OFF(convert) = (U16)(tail - convert);
3104 DEBUG_r(optimize= n);
3110 if ( trie->maxlen ) {
3111 NEXT_OFF( convert ) = (U16)(tail - convert);
3112 ARG_SET( convert, data_slot );
3113 /* Store the offset to the first unabsorbed branch in
3114 jump[0], which is otherwise unused by the jump logic.
3115 We use this when dumping a trie and during optimisation. */
3117 trie->jump[0] = (U16)(nextbranch - convert);
3119 /* If the start state is not accepting (meaning there is no empty string/NOTHING)
3120 * and there is a bitmap
3121 * and the first "jump target" node we found leaves enough room
3122 * then convert the TRIE node into a TRIEC node, with the bitmap
3123 * embedded inline in the opcode - this is hypothetically faster.
3125 if ( !trie->states[trie->startstate].wordnum
3127 && ( (char *)jumper - (char *)convert) >= (int)sizeof(struct regnode_charclass) )
3129 OP( convert ) = TRIEC;
3130 Copy(trie->bitmap, ((struct regnode_charclass *)convert)->bitmap, ANYOF_BITMAP_SIZE, char);
3131 PerlMemShared_free(trie->bitmap);
3134 OP( convert ) = TRIE;
3136 /* store the type in the flags */
3137 convert->flags = nodetype;
3141 + regarglen[ OP( convert ) ];
3143 /* XXX We really should free up the resource in trie now,
3144 as we won't use them - (which resources?) dmq */
3146 /* needed for dumping*/
3147 DEBUG_r(if (optimize) {
3148 regnode *opt = convert;
3150 while ( ++opt < optimize) {
3151 Set_Node_Offset_Length(opt,0,0);
3154 Try to clean up some of the debris left after the
3157 while( optimize < jumper ) {
3158 mjd_nodelen += Node_Length((optimize));
3159 OP( optimize ) = OPTIMIZED;
3160 Set_Node_Offset_Length(optimize,0,0);
3163 Set_Node_Offset_Length(convert,mjd_offset,mjd_nodelen);
3165 } /* end node insert */
3167 /* Finish populating the prev field of the wordinfo array. Walk back
3168 * from each accept state until we find another accept state, and if
3169 * so, point the first word's .prev field at the second word. If the
3170 * second already has a .prev field set, stop now. This will be the
3171 * case either if we've already processed that word's accept state,
3172 * or that state had multiple words, and the overspill words were
3173 * already linked up earlier.
3180 for (word=1; word <= trie->wordcount; word++) {
3182 if (trie->wordinfo[word].prev)
3184 state = trie->wordinfo[word].accept;
3186 state = prev_states[state];
3189 prev = trie->states[state].wordnum;
3193 trie->wordinfo[word].prev = prev;
3195 Safefree(prev_states);
3199 /* and now dump out the compressed format */
3200 DEBUG_TRIE_COMPILE_r(dump_trie(trie, widecharmap, revcharmap, depth+1));
3202 RExC_rxi->data->data[ data_slot + 1 ] = (void*)widecharmap;
3204 RExC_rxi->data->data[ data_slot + TRIE_WORDS_OFFSET ] = (void*)trie_words;
3205 RExC_rxi->data->data[ data_slot + 3 ] = (void*)revcharmap;
3207 SvREFCNT_dec_NN(revcharmap);
3211 : trie->startstate>1
3217 S_construct_ahocorasick_from_trie(pTHX_ RExC_state_t *pRExC_state, regnode *source, U32 depth)
3219 /* The Trie is constructed and compressed now so we can build a fail array if
3222 This is basically the Aho-Corasick algorithm. Its from exercise 3.31 and
3224 "Red Dragon" -- Compilers, principles, techniques, and tools. Aho, Sethi,
3228 We find the fail state for each state in the trie, this state is the longest
3229 proper suffix of the current state's 'word' that is also a proper prefix of
3230 another word in our trie. State 1 represents the word '' and is thus the
3231 default fail state. This allows the DFA not to have to restart after its
3232 tried and failed a word at a given point, it simply continues as though it
3233 had been matching the other word in the first place.
3235 'abcdgu'=~/abcdefg|cdgu/
3236 When we get to 'd' we are still matching the first word, we would encounter
3237 'g' which would fail, which would bring us to the state representing 'd' in
3238 the second word where we would try 'g' and succeed, proceeding to match
3241 /* add a fail transition */
3242 const U32 trie_offset = ARG(source);
3243 reg_trie_data *trie=(reg_trie_data *)RExC_rxi->data->data[trie_offset];
3245 const U32 ucharcount = trie->uniquecharcount;
3246 const U32 numstates = trie->statecount;
3247 const U32 ubound = trie->lasttrans + ucharcount;
3251 U32 base = trie->states[ 1 ].trans.base;
3254 const U32 data_slot = add_data( pRExC_state, STR_WITH_LEN("T"));
3256 GET_RE_DEBUG_FLAGS_DECL;
3258 PERL_ARGS_ASSERT_CONSTRUCT_AHOCORASICK_FROM_TRIE;
3259 PERL_UNUSED_CONTEXT;
3261 PERL_UNUSED_ARG(depth);
3264 if ( OP(source) == TRIE ) {
3265 struct regnode_1 *op = (struct regnode_1 *)
3266 PerlMemShared_calloc(1, sizeof(struct regnode_1));
3267 StructCopy(source,op,struct regnode_1);
3268 stclass = (regnode *)op;
3270 struct regnode_charclass *op = (struct regnode_charclass *)
3271 PerlMemShared_calloc(1, sizeof(struct regnode_charclass));
3272 StructCopy(source,op,struct regnode_charclass);
3273 stclass = (regnode *)op;
3275 OP(stclass)+=2; /* convert the TRIE type to its AHO-CORASICK equivalent */
3277 ARG_SET( stclass, data_slot );
3278 aho = (reg_ac_data *) PerlMemShared_calloc( 1, sizeof(reg_ac_data) );
3279 RExC_rxi->data->data[ data_slot ] = (void*)aho;
3280 aho->trie=trie_offset;
3281 aho->states=(reg_trie_state *)PerlMemShared_malloc( numstates * sizeof(reg_trie_state) );
3282 Copy( trie->states, aho->states, numstates, reg_trie_state );
3283 Newxz( q, numstates, U32);
3284 aho->fail = (U32 *) PerlMemShared_calloc( numstates, sizeof(U32) );
3287 /* initialize fail[0..1] to be 1 so that we always have
3288 a valid final fail state */
3289 fail[ 0 ] = fail[ 1 ] = 1;
3291 for ( charid = 0; charid < ucharcount ; charid++ ) {
3292 const U32 newstate = TRIE_TRANS_STATE( 1, base, ucharcount, charid, 0 );
3294 q[ q_write ] = newstate;
3295 /* set to point at the root */
3296 fail[ q[ q_write++ ] ]=1;
3299 while ( q_read < q_write) {
3300 const U32 cur = q[ q_read++ % numstates ];
3301 base = trie->states[ cur ].trans.base;
3303 for ( charid = 0 ; charid < ucharcount ; charid++ ) {
3304 const U32 ch_state = TRIE_TRANS_STATE( cur, base, ucharcount, charid, 1 );
3306 U32 fail_state = cur;
3309 fail_state = fail[ fail_state ];
3310 fail_base = aho->states[ fail_state ].trans.base;
3311 } while ( !TRIE_TRANS_STATE( fail_state, fail_base, ucharcount, charid, 1 ) );
3313 fail_state = TRIE_TRANS_STATE( fail_state, fail_base, ucharcount, charid, 1 );
3314 fail[ ch_state ] = fail_state;
3315 if ( !aho->states[ ch_state ].wordnum && aho->states[ fail_state ].wordnum )
3317 aho->states[ ch_state ].wordnum = aho->states[ fail_state ].wordnum;
3319 q[ q_write++ % numstates] = ch_state;
3323 /* restore fail[0..1] to 0 so that we "fall out" of the AC loop
3324 when we fail in state 1, this allows us to use the
3325 charclass scan to find a valid start char. This is based on the principle
3326 that theres a good chance the string being searched contains lots of stuff
3327 that cant be a start char.
3329 fail[ 0 ] = fail[ 1 ] = 0;
3330 DEBUG_TRIE_COMPILE_r({
3331 PerlIO_printf(Perl_debug_log,
3332 "%*sStclass Failtable (%"UVuf" states): 0",
3333 (int)(depth * 2), "", (UV)numstates
3335 for( q_read=1; q_read<numstates; q_read++ ) {
3336 PerlIO_printf(Perl_debug_log, ", %"UVuf, (UV)fail[q_read]);
3338 PerlIO_printf(Perl_debug_log, "\n");
3341 /*RExC_seen |= REG_TRIEDFA_SEEN;*/
3346 #define DEBUG_PEEP(str,scan,depth) \
3347 DEBUG_OPTIMISE_r({if (scan){ \
3348 regnode *Next = regnext(scan); \
3349 regprop(RExC_rx, RExC_mysv, scan, NULL, pRExC_state); \
3350 PerlIO_printf(Perl_debug_log, "%*s" str ">%3d: %s (%d)", \
3351 (int)depth*2, "", REG_NODE_NUM(scan), SvPV_nolen_const(RExC_mysv),\
3352 Next ? (REG_NODE_NUM(Next)) : 0 ); \
3353 DEBUG_SHOW_STUDY_FLAGS(flags," [ ","]");\
3354 PerlIO_printf(Perl_debug_log, "\n"); \
3357 /* The below joins as many adjacent EXACTish nodes as possible into a single
3358 * one. The regop may be changed if the node(s) contain certain sequences that
3359 * require special handling. The joining is only done if:
3360 * 1) there is room in the current conglomerated node to entirely contain the
3362 * 2) they are the exact same node type
3364 * The adjacent nodes actually may be separated by NOTHING-kind nodes, and
3365 * these get optimized out
3367 * If a node is to match under /i (folded), the number of characters it matches
3368 * can be different than its character length if it contains a multi-character
3369 * fold. *min_subtract is set to the total delta number of characters of the
3372 * And *unfolded_multi_char is set to indicate whether or not the node contains
3373 * an unfolded multi-char fold. This happens when whether the fold is valid or
3374 * not won't be known until runtime; namely for EXACTF nodes that contain LATIN
3375 * SMALL LETTER SHARP S, as only if the target string being matched against
3376 * turns out to be UTF-8 is that fold valid; and also for EXACTFL nodes whose
3377 * folding rules depend on the locale in force at runtime. (Multi-char folds
3378 * whose components are all above the Latin1 range are not run-time locale
3379 * dependent, and have already been folded by the time this function is
3382 * This is as good a place as any to discuss the design of handling these
3383 * multi-character fold sequences. It's been wrong in Perl for a very long
3384 * time. There are three code points in Unicode whose multi-character folds
3385 * were long ago discovered to mess things up. The previous designs for
3386 * dealing with these involved assigning a special node for them. This
3387 * approach doesn't always work, as evidenced by this example:
3388 * "\xDFs" =~ /s\xDF/ui # Used to fail before these patches
3389 * Both sides fold to "sss", but if the pattern is parsed to create a node that
3390 * would match just the \xDF, it won't be able to handle the case where a
3391 * successful match would have to cross the node's boundary. The new approach
3392 * that hopefully generally solves the problem generates an EXACTFU_SS node
3393 * that is "sss" in this case.
3395 * It turns out that there are problems with all multi-character folds, and not
3396 * just these three. Now the code is general, for all such cases. The
3397 * approach taken is:
3398 * 1) This routine examines each EXACTFish node that could contain multi-
3399 * character folded sequences. Since a single character can fold into
3400 * such a sequence, the minimum match length for this node is less than
3401 * the number of characters in the node. This routine returns in
3402 * *min_subtract how many characters to subtract from the the actual
3403 * length of the string to get a real minimum match length; it is 0 if
3404 * there are no multi-char foldeds. This delta is used by the caller to
3405 * adjust the min length of the match, and the delta between min and max,
3406 * so that the optimizer doesn't reject these possibilities based on size
3408 * 2) For the sequence involving the Sharp s (\xDF), the node type EXACTFU_SS
3409 * is used for an EXACTFU node that contains at least one "ss" sequence in
3410 * it. For non-UTF-8 patterns and strings, this is the only case where
3411 * there is a possible fold length change. That means that a regular
3412 * EXACTFU node without UTF-8 involvement doesn't have to concern itself
3413 * with length changes, and so can be processed faster. regexec.c takes
3414 * advantage of this. Generally, an EXACTFish node that is in UTF-8 is
3415 * pre-folded by regcomp.c (except EXACTFL, some of whose folds aren't
3416 * known until runtime). This saves effort in regex matching. However,
3417 * the pre-folding isn't done for non-UTF8 patterns because the fold of
3418 * the MICRO SIGN requires UTF-8, and we don't want to slow things down by
3419 * forcing the pattern into UTF8 unless necessary. Also what EXACTF (and,
3420 * again, EXACTFL) nodes fold to isn't known until runtime. The fold
3421 * possibilities for the non-UTF8 patterns are quite simple, except for
3422 * the sharp s. All the ones that don't involve a UTF-8 target string are
3423 * members of a fold-pair, and arrays are set up for all of them so that
3424 * the other member of the pair can be found quickly. Code elsewhere in
3425 * this file makes sure that in EXACTFU nodes, the sharp s gets folded to
3426 * 'ss', even if the pattern isn't UTF-8. This avoids the issues
3427 * described in the next item.
3428 * 3) A problem remains for unfolded multi-char folds. (These occur when the
3429 * validity of the fold won't be known until runtime, and so must remain
3430 * unfolded for now. This happens for the sharp s in EXACTF and EXACTFA
3431 * nodes when the pattern isn't in UTF-8. (Note, BTW, that there cannot
3432 * be an EXACTF node with a UTF-8 pattern.) They also occur for various
3433 * folds in EXACTFL nodes, regardless of the UTF-ness of the pattern.)
3434 * The reason this is a problem is that the optimizer part of regexec.c
3435 * (probably unwittingly, in Perl_regexec_flags()) makes an assumption
3436 * that a character in the pattern corresponds to at most a single
3437 * character in the target string. (And I do mean character, and not byte
3438 * here, unlike other parts of the documentation that have never been
3439 * updated to account for multibyte Unicode.) sharp s in EXACTF and
3440 * EXACTFL nodes can match the two character string 'ss'; in EXACTFA nodes
3441 * it can match "\x{17F}\x{17F}". These, along with other ones in EXACTFL
3442 * nodes, violate the assumption, and they are the only instances where it
3443 * is violated. I'm reluctant to try to change the assumption, as the
3444 * code involved is impenetrable to me (khw), so instead the code here
3445 * punts. This routine examines EXACTFL nodes, and (when the pattern
3446 * isn't UTF-8) EXACTF and EXACTFA for such unfolded folds, and returns a
3447 * boolean indicating whether or not the node contains such a fold. When
3448 * it is true, the caller sets a flag that later causes the optimizer in
3449 * this file to not set values for the floating and fixed string lengths,
3450 * and thus avoids the optimizer code in regexec.c that makes the invalid
3451 * assumption. Thus, there is no optimization based on string lengths for
3452 * EXACTFL nodes that contain these few folds, nor for non-UTF8-pattern
3453 * EXACTF and EXACTFA nodes that contain the sharp s. (The reason the
3454 * assumption is wrong only in these cases is that all other non-UTF-8
3455 * folds are 1-1; and, for UTF-8 patterns, we pre-fold all other folds to
3456 * their expanded versions. (Again, we can't prefold sharp s to 'ss' in
3457 * EXACTF nodes because we don't know at compile time if it actually
3458 * matches 'ss' or not. For EXACTF nodes it will match iff the target
3459 * string is in UTF-8. This is in contrast to EXACTFU nodes, where it
3460 * always matches; and EXACTFA where it never does. In an EXACTFA node in
3461 * a UTF-8 pattern, sharp s is folded to "\x{17F}\x{17F}, avoiding the
3462 * problem; but in a non-UTF8 pattern, folding it to that above-Latin1
3463 * string would require the pattern to be forced into UTF-8, the overhead
3464 * of which we want to avoid. Similarly the unfolded multi-char folds in
3465 * EXACTFL nodes will match iff the locale at the time of match is a UTF-8
3468 * Similarly, the code that generates tries doesn't currently handle
3469 * not-already-folded multi-char folds, and it looks like a pain to change
3470 * that. Therefore, trie generation of EXACTFA nodes with the sharp s
3471 * doesn't work. Instead, such an EXACTFA is turned into a new regnode,
3472 * EXACTFA_NO_TRIE, which the trie code knows not to handle. Most people
3473 * using /iaa matching will be doing so almost entirely with ASCII
3474 * strings, so this should rarely be encountered in practice */
3476 #define JOIN_EXACT(scan,min_subtract,unfolded_multi_char, flags) \
3477 if (PL_regkind[OP(scan)] == EXACT) \
3478 join_exact(pRExC_state,(scan),(min_subtract),unfolded_multi_char, (flags),NULL,depth+1)
3481 S_join_exact(pTHX_ RExC_state_t *pRExC_state, regnode *scan,
3482 UV *min_subtract, bool *unfolded_multi_char,
3483 U32 flags,regnode *val, U32 depth)
3485 /* Merge several consecutive EXACTish nodes into one. */
3486 regnode *n = regnext(scan);
3488 regnode *next = scan + NODE_SZ_STR(scan);
3492 regnode *stop = scan;
3493 GET_RE_DEBUG_FLAGS_DECL;
3495 PERL_UNUSED_ARG(depth);
3498 PERL_ARGS_ASSERT_JOIN_EXACT;
3499 #ifndef EXPERIMENTAL_INPLACESCAN
3500 PERL_UNUSED_ARG(flags);
3501 PERL_UNUSED_ARG(val);
3503 DEBUG_PEEP("join",scan,depth);
3505 /* Look through the subsequent nodes in the chain. Skip NOTHING, merge
3506 * EXACT ones that are mergeable to the current one. */
3508 && (PL_regkind[OP(n)] == NOTHING
3509 || (stringok && OP(n) == OP(scan)))
3511 && NEXT_OFF(scan) + NEXT_OFF(n) < I16_MAX)
3514 if (OP(n) == TAIL || n > next)
3516 if (PL_regkind[OP(n)] == NOTHING) {
3517 DEBUG_PEEP("skip:",n,depth);
3518 NEXT_OFF(scan) += NEXT_OFF(n);
3519 next = n + NODE_STEP_REGNODE;
3526 else if (stringok) {
3527 const unsigned int oldl = STR_LEN(scan);
3528 regnode * const nnext = regnext(n);
3530 /* XXX I (khw) kind of doubt that this works on platforms (should
3531 * Perl ever run on one) where U8_MAX is above 255 because of lots
3532 * of other assumptions */
3533 /* Don't join if the sum can't fit into a single node */
3534 if (oldl + STR_LEN(n) > U8_MAX)
3537 DEBUG_PEEP("merg",n,depth);
3540 NEXT_OFF(scan) += NEXT_OFF(n);
3541 STR_LEN(scan) += STR_LEN(n);
3542 next = n + NODE_SZ_STR(n);
3543 /* Now we can overwrite *n : */
3544 Move(STRING(n), STRING(scan) + oldl, STR_LEN(n), char);
3552 #ifdef EXPERIMENTAL_INPLACESCAN
3553 if (flags && !NEXT_OFF(n)) {
3554 DEBUG_PEEP("atch", val, depth);
3555 if (reg_off_by_arg[OP(n)]) {
3556 ARG_SET(n, val - n);
3559 NEXT_OFF(n) = val - n;
3567 *unfolded_multi_char = FALSE;
3569 /* Here, all the adjacent mergeable EXACTish nodes have been merged. We
3570 * can now analyze for sequences of problematic code points. (Prior to
3571 * this final joining, sequences could have been split over boundaries, and
3572 * hence missed). The sequences only happen in folding, hence for any
3573 * non-EXACT EXACTish node */
3574 if (OP(scan) != EXACT && OP(scan) != EXACTL) {
3575 U8* s0 = (U8*) STRING(scan);
3577 U8* s_end = s0 + STR_LEN(scan);
3579 int total_count_delta = 0; /* Total delta number of characters that
3580 multi-char folds expand to */
3582 /* One pass is made over the node's string looking for all the
3583 * possibilities. To avoid some tests in the loop, there are two main
3584 * cases, for UTF-8 patterns (which can't have EXACTF nodes) and
3589 if (OP(scan) == EXACTFL) {
3592 /* An EXACTFL node would already have been changed to another
3593 * node type unless there is at least one character in it that
3594 * is problematic; likely a character whose fold definition
3595 * won't be known until runtime, and so has yet to be folded.
3596 * For all but the UTF-8 locale, folds are 1-1 in length, but
3597 * to handle the UTF-8 case, we need to create a temporary
3598 * folded copy using UTF-8 locale rules in order to analyze it.
3599 * This is because our macros that look to see if a sequence is
3600 * a multi-char fold assume everything is folded (otherwise the
3601 * tests in those macros would be too complicated and slow).
3602 * Note that here, the non-problematic folds will have already
3603 * been done, so we can just copy such characters. We actually
3604 * don't completely fold the EXACTFL string. We skip the
3605 * unfolded multi-char folds, as that would just create work
3606 * below to figure out the size they already are */
3608 Newx(folded, UTF8_MAX_FOLD_CHAR_EXPAND * STR_LEN(scan) + 1, U8);
3611 STRLEN s_len = UTF8SKIP(s);
3612 if (! is_PROBLEMATIC_LOCALE_FOLD_utf8(s)) {
3613 Copy(s, d, s_len, U8);
3616 else if (is_FOLDS_TO_MULTI_utf8(s)) {
3617 *unfolded_multi_char = TRUE;
3618 Copy(s, d, s_len, U8);
3621 else if (isASCII(*s)) {
3622 *(d++) = toFOLD(*s);
3626 _to_utf8_fold_flags(s, d, &len, FOLD_FLAGS_FULL);
3632 /* Point the remainder of the routine to look at our temporary
3636 } /* End of creating folded copy of EXACTFL string */
3638 /* Examine the string for a multi-character fold sequence. UTF-8
3639 * patterns have all characters pre-folded by the time this code is
3641 while (s < s_end - 1) /* Can stop 1 before the end, as minimum
3642 length sequence we are looking for is 2 */
3644 int count = 0; /* How many characters in a multi-char fold */
3645 int len = is_MULTI_CHAR_FOLD_utf8_safe(s, s_end);
3646 if (! len) { /* Not a multi-char fold: get next char */
3651 /* Nodes with 'ss' require special handling, except for
3652 * EXACTFA-ish for which there is no multi-char fold to this */
3653 if (len == 2 && *s == 's' && *(s+1) == 's'
3654 && OP(scan) != EXACTFA
3655 && OP(scan) != EXACTFA_NO_TRIE)
3658 if (OP(scan) != EXACTFL) {
3659 OP(scan) = EXACTFU_SS;
3663 else { /* Here is a generic multi-char fold. */
3664 U8* multi_end = s + len;
3666 /* Count how many characters are in it. In the case of
3667 * /aa, no folds which contain ASCII code points are
3668 * allowed, so check for those, and skip if found. */
3669 if (OP(scan) != EXACTFA && OP(scan) != EXACTFA_NO_TRIE) {
3670 count = utf8_length(s, multi_end);
3674 while (s < multi_end) {
3677 goto next_iteration;
3687 /* The delta is how long the sequence is minus 1 (1 is how long
3688 * the character that folds to the sequence is) */
3689 total_count_delta += count - 1;
3693 /* We created a temporary folded copy of the string in EXACTFL
3694 * nodes. Therefore we need to be sure it doesn't go below zero,
3695 * as the real string could be shorter */
3696 if (OP(scan) == EXACTFL) {
3697 int total_chars = utf8_length((U8*) STRING(scan),
3698 (U8*) STRING(scan) + STR_LEN(scan));
3699 if (total_count_delta > total_chars) {
3700 total_count_delta = total_chars;
3704 *min_subtract += total_count_delta;
3707 else if (OP(scan) == EXACTFA) {
3709 /* Non-UTF-8 pattern, EXACTFA node. There can't be a multi-char
3710 * fold to the ASCII range (and there are no existing ones in the
3711 * upper latin1 range). But, as outlined in the comments preceding
3712 * this function, we need to flag any occurrences of the sharp s.
3713 * This character forbids trie formation (because of added
3715 #if UNICODE_MAJOR_VERSION > 3 /* no multifolds in early Unicode */ \
3716 || (UNICODE_MAJOR_VERSION == 3 && ( UNICODE_DOT_VERSION > 0) \
3717 || UNICODE_DOT_DOT_VERSION > 0)
3719 if (*s == LATIN_SMALL_LETTER_SHARP_S) {
3720 OP(scan) = EXACTFA_NO_TRIE;
3721 *unfolded_multi_char = TRUE;
3729 /* Non-UTF-8 pattern, not EXACTFA node. Look for the multi-char
3730 * folds that are all Latin1. As explained in the comments
3731 * preceding this function, we look also for the sharp s in EXACTF
3732 * and EXACTFL nodes; it can be in the final position. Otherwise
3733 * we can stop looking 1 byte earlier because have to find at least
3734 * two characters for a multi-fold */
3735 const U8* upper = (OP(scan) == EXACTF || OP(scan) == EXACTFL)
3740 int len = is_MULTI_CHAR_FOLD_latin1_safe(s, s_end);
3741 if (! len) { /* Not a multi-char fold. */
3742 if (*s == LATIN_SMALL_LETTER_SHARP_S
3743 && (OP(scan) == EXACTF || OP(scan) == EXACTFL))
3745 *unfolded_multi_char = TRUE;
3752 && isALPHA_FOLD_EQ(*s, 's')
3753 && isALPHA_FOLD_EQ(*(s+1), 's'))
3756 /* EXACTF nodes need to know that the minimum length
3757 * changed so that a sharp s in the string can match this
3758 * ss in the pattern, but they remain EXACTF nodes, as they
3759 * won't match this unless the target string is is UTF-8,
3760 * which we don't know until runtime. EXACTFL nodes can't
3761 * transform into EXACTFU nodes */
3762 if (OP(scan) != EXACTF && OP(scan) != EXACTFL) {
3763 OP(scan) = EXACTFU_SS;
3767 *min_subtract += len - 1;
3775 /* Allow dumping but overwriting the collection of skipped
3776 * ops and/or strings with fake optimized ops */
3777 n = scan + NODE_SZ_STR(scan);
3785 DEBUG_OPTIMISE_r(if (merged){DEBUG_PEEP("finl",scan,depth)});
3789 /* REx optimizer. Converts nodes into quicker variants "in place".
3790 Finds fixed substrings. */
3792 /* Stops at toplevel WHILEM as well as at "last". At end *scanp is set
3793 to the position after last scanned or to NULL. */
3795 #define INIT_AND_WITHP \
3796 assert(!and_withp); \
3797 Newx(and_withp,1, regnode_ssc); \
3798 SAVEFREEPV(and_withp)
3802 S_unwind_scan_frames(pTHX_ const void *p)
3804 scan_frame *f= (scan_frame *)p;
3806 scan_frame *n= f->next_frame;
3814 S_study_chunk(pTHX_ RExC_state_t *pRExC_state, regnode **scanp,
3815 SSize_t *minlenp, SSize_t *deltap,
3820 regnode_ssc *and_withp,
3821 U32 flags, U32 depth)
3822 /* scanp: Start here (read-write). */
3823 /* deltap: Write maxlen-minlen here. */
3824 /* last: Stop before this one. */
3825 /* data: string data about the pattern */
3826 /* stopparen: treat close N as END */
3827 /* recursed: which subroutines have we recursed into */
3828 /* and_withp: Valid if flags & SCF_DO_STCLASS_OR */
3830 /* There must be at least this number of characters to match */
3833 regnode *scan = *scanp, *next;
3835 int is_inf = (flags & SCF_DO_SUBSTR) && (data->flags & SF_IS_INF);
3836 int is_inf_internal = 0; /* The studied chunk is infinite */
3837 I32 is_par = OP(scan) == OPEN ? ARG(scan) : 0;
3838 scan_data_t data_fake;
3839 SV *re_trie_maxbuff = NULL;
3840 regnode *first_non_open = scan;
3841 SSize_t stopmin = SSize_t_MAX;
3842 scan_frame *frame = NULL;
3843 GET_RE_DEBUG_FLAGS_DECL;
3845 PERL_ARGS_ASSERT_STUDY_CHUNK;
3849 while (first_non_open && OP(first_non_open) == OPEN)
3850 first_non_open=regnext(first_non_open);
3856 RExC_study_chunk_recursed_count++;
3858 DEBUG_OPTIMISE_MORE_r(
3860 PerlIO_printf(Perl_debug_log,
3861 "%*sstudy_chunk stopparen=%ld recursed_count=%lu depth=%lu recursed_depth=%lu scan=%p last=%p",
3862 (int)(depth*2), "", (long)stopparen,
3863 (unsigned long)RExC_study_chunk_recursed_count,
3864 (unsigned long)depth, (unsigned long)recursed_depth,
3867 if (recursed_depth) {
3870 for ( j = 0 ; j < recursed_depth ; j++ ) {
3871 for ( i = 0 ; i < (U32)RExC_npar ; i++ ) {
3873 PAREN_TEST(RExC_study_chunk_recursed +
3874 ( j * RExC_study_chunk_recursed_bytes), i )
3877 !PAREN_TEST(RExC_study_chunk_recursed +
3878 (( j - 1 ) * RExC_study_chunk_recursed_bytes), i)
3881 PerlIO_printf(Perl_debug_log," %d",(int)i);
3885 if ( j + 1 < recursed_depth ) {
3886 PerlIO_printf(Perl_debug_log, ",");
3890 PerlIO_printf(Perl_debug_log,"\n");
3893 while ( scan && OP(scan) != END && scan < last ){
3894 UV min_subtract = 0; /* How mmany chars to subtract from the minimum
3895 node length to get a real minimum (because
3896 the folded version may be shorter) */
3897 bool unfolded_multi_char = FALSE;
3898 /* Peephole optimizer: */
3899 DEBUG_STUDYDATA("Peep:", data, depth);
3900 DEBUG_PEEP("Peep", scan, depth);
3903 /* The reason we do this here we need to deal with things like /(?:f)(?:o)(?:o)/
3904 * which cant be dealt with by the normal EXACT parsing code, as each (?:..) is handled
3905 * by a different invocation of reg() -- Yves
3907 JOIN_EXACT(scan,&min_subtract, &unfolded_multi_char, 0);
3909 /* Follow the next-chain of the current node and optimize
3910 away all the NOTHINGs from it. */
3911 if (OP(scan) != CURLYX) {
3912 const int max = (reg_off_by_arg[OP(scan)]
3914 /* I32 may be smaller than U16 on CRAYs! */
3915 : (I32_MAX < U16_MAX ? I32_MAX : U16_MAX));
3916 int off = (reg_off_by_arg[OP(scan)] ? ARG(scan) : NEXT_OFF(scan));
3920 /* Skip NOTHING and LONGJMP. */
3921 while ((n = regnext(n))
3922 && ((PL_regkind[OP(n)] == NOTHING && (noff = NEXT_OFF(n)))
3923 || ((OP(n) == LONGJMP) && (noff = ARG(n))))
3924 && off + noff < max)
3926 if (reg_off_by_arg[OP(scan)])
3929 NEXT_OFF(scan) = off;
3932 /* The principal pseudo-switch. Cannot be a switch, since we
3933 look into several different things. */
3934 if ( OP(scan) == DEFINEP ) {
3936 SSize_t deltanext = 0;
3937 SSize_t fake_last_close = 0;
3938 I32 f = SCF_IN_DEFINE;
3940 StructCopy(&zero_scan_data, &data_fake, scan_data_t);
3941 scan = regnext(scan);
3942 assert( OP(scan) == IFTHEN );
3943 DEBUG_PEEP("expect IFTHEN", scan, depth);
3945 data_fake.last_closep= &fake_last_close;
3947 next = regnext(scan);
3948 scan = NEXTOPER(NEXTOPER(scan));
3949 DEBUG_PEEP("scan", scan, depth);
3950 DEBUG_PEEP("next", next, depth);
3952 /* we suppose the run is continuous, last=next...
3953 * NOTE we dont use the return here! */
3954 (void)study_chunk(pRExC_state, &scan, &minlen,
3955 &deltanext, next, &data_fake, stopparen,
3956 recursed_depth, NULL, f, depth+1);
3961 OP(scan) == BRANCH ||
3962 OP(scan) == BRANCHJ ||
3965 next = regnext(scan);
3968 /* The op(next)==code check below is to see if we
3969 * have "BRANCH-BRANCH", "BRANCHJ-BRANCHJ", "IFTHEN-IFTHEN"
3970 * IFTHEN is special as it might not appear in pairs.
3971 * Not sure whether BRANCH-BRANCHJ is possible, regardless
3972 * we dont handle it cleanly. */
3973 if (OP(next) == code || code == IFTHEN) {
3974 /* NOTE - There is similar code to this block below for
3975 * handling TRIE nodes on a re-study. If you change stuff here
3976 * check there too. */
3977 SSize_t max1 = 0, min1 = SSize_t_MAX, num = 0;
3979 regnode * const startbranch=scan;
3981 if (flags & SCF_DO_SUBSTR) {
3982 /* Cannot merge strings after this. */
3983 scan_commit(pRExC_state, data, minlenp, is_inf);
3986 if (flags & SCF_DO_STCLASS)
3987 ssc_init_zero(pRExC_state, &accum);
3989 while (OP(scan) == code) {
3990 SSize_t deltanext, minnext, fake;
3992 regnode_ssc this_class;
3994 DEBUG_PEEP("Branch", scan, depth);
3997 StructCopy(&zero_scan_data, &data_fake, scan_data_t);
3999 data_fake.whilem_c = data->whilem_c;
4000 data_fake.last_closep = data->last_closep;
4003 data_fake.last_closep = &fake;
4005 data_fake.pos_delta = delta;
4006 next = regnext(scan);
4008 scan = NEXTOPER(scan); /* everything */
4009 if (code != BRANCH) /* everything but BRANCH */
4010 scan = NEXTOPER(scan);
4012 if (flags & SCF_DO_STCLASS) {
4013 ssc_init(pRExC_state, &this_class);
4014 data_fake.start_class = &this_class;
4015 f = SCF_DO_STCLASS_AND;
4017 if (flags & SCF_WHILEM_VISITED_POS)
4018 f |= SCF_WHILEM_VISITED_POS;
4020 /* we suppose the run is continuous, last=next...*/
4021 minnext = study_chunk(pRExC_state, &scan, minlenp,
4022 &deltanext, next, &data_fake, stopparen,
4023 recursed_depth, NULL, f,depth+1);
4027 if (deltanext == SSize_t_MAX) {
4028 is_inf = is_inf_internal = 1;
4030 } else if (max1 < minnext + deltanext)
4031 max1 = minnext + deltanext;
4033 if (data_fake.flags & (SF_HAS_PAR|SF_IN_PAR))
4035 if (data_fake.flags & SCF_SEEN_ACCEPT) {
4036 if ( stopmin > minnext)
4037 stopmin = min + min1;
4038 flags &= ~SCF_DO_SUBSTR;
4040 data->flags |= SCF_SEEN_ACCEPT;
4043 if (data_fake.flags & SF_HAS_EVAL)
4044 data->flags |= SF_HAS_EVAL;
4045 data->whilem_c = data_fake.whilem_c;
4047 if (flags & SCF_DO_STCLASS)
4048 ssc_or(pRExC_state, &accum, (regnode_charclass*)&this_class);
4050 if (code == IFTHEN && num < 2) /* Empty ELSE branch */
4052 if (flags & SCF_DO_SUBSTR) {
4053 data->pos_min += min1;
4054 if (data->pos_delta >= SSize_t_MAX - (max1 - min1))
4055 data->pos_delta = SSize_t_MAX;
4057 data->pos_delta += max1 - min1;
4058 if (max1 != min1 || is_inf)
4059 data->longest = &(data->longest_float);
4062 if (delta == SSize_t_MAX
4063 || SSize_t_MAX - delta - (max1 - min1) < 0)
4064 delta = SSize_t_MAX;
4066 delta += max1 - min1;
4067 if (flags & SCF_DO_STCLASS_OR) {
4068 ssc_or(pRExC_state, data->start_class, (regnode_charclass*) &accum);
4070 ssc_and(pRExC_state, data->start_class, (regnode_charclass *) and_withp);
4071 flags &= ~SCF_DO_STCLASS;
4074 else if (flags & SCF_DO_STCLASS_AND) {
4076 ssc_and(pRExC_state, data->start_class, (regnode_charclass *) &accum);
4077 flags &= ~SCF_DO_STCLASS;
4080 /* Switch to OR mode: cache the old value of
4081 * data->start_class */
4083 StructCopy(data->start_class, and_withp, regnode_ssc);
4084 flags &= ~SCF_DO_STCLASS_AND;
4085 StructCopy(&accum, data->start_class, regnode_ssc);
4086 flags |= SCF_DO_STCLASS_OR;
4090 if (PERL_ENABLE_TRIE_OPTIMISATION &&
4091 OP( startbranch ) == BRANCH )
4095 Assuming this was/is a branch we are dealing with: 'scan'
4096 now points at the item that follows the branch sequence,
4097 whatever it is. We now start at the beginning of the
4098 sequence and look for subsequences of
4104 which would be constructed from a pattern like
4107 If we can find such a subsequence we need to turn the first
4108 element into a trie and then add the subsequent branch exact
4109 strings to the trie.
4113 1. patterns where the whole set of branches can be
4116 2. patterns where only a subset can be converted.
4118 In case 1 we can replace the whole set with a single regop
4119 for the trie. In case 2 we need to keep the start and end
4122 'BRANCH EXACT; BRANCH EXACT; BRANCH X'
4123 becomes BRANCH TRIE; BRANCH X;
4125 There is an additional case, that being where there is a
4126 common prefix, which gets split out into an EXACT like node
4127 preceding the TRIE node.
4129 If x(1..n)==tail then we can do a simple trie, if not we make
4130 a "jump" trie, such that when we match the appropriate word
4131 we "jump" to the appropriate tail node. Essentially we turn
4132 a nested if into a case structure of sorts.
4137 if (!re_trie_maxbuff) {
4138 re_trie_maxbuff = get_sv(RE_TRIE_MAXBUF_NAME, 1);
4139 if (!SvIOK(re_trie_maxbuff))
4140 sv_setiv(re_trie_maxbuff, RE_TRIE_MAXBUF_INIT);
4142 if ( SvIV(re_trie_maxbuff)>=0 ) {
4144 regnode *first = (regnode *)NULL;
4145 regnode *last = (regnode *)NULL;
4146 regnode *tail = scan;
4150 /* var tail is used because there may be a TAIL
4151 regop in the way. Ie, the exacts will point to the
4152 thing following the TAIL, but the last branch will
4153 point at the TAIL. So we advance tail. If we
4154 have nested (?:) we may have to move through several
4158 while ( OP( tail ) == TAIL ) {
4159 /* this is the TAIL generated by (?:) */
4160 tail = regnext( tail );
4164 DEBUG_TRIE_COMPILE_r({
4165 regprop(RExC_rx, RExC_mysv, tail, NULL, pRExC_state);
4166 PerlIO_printf( Perl_debug_log, "%*s%s%s\n",
4167 (int)depth * 2 + 2, "",
4168 "Looking for TRIE'able sequences. Tail node is: ",
4169 SvPV_nolen_const( RExC_mysv )
4175 Step through the branches
4176 cur represents each branch,
4177 noper is the first thing to be matched as part
4179 noper_next is the regnext() of that node.
4181 We normally handle a case like this
4182 /FOO[xyz]|BAR[pqr]/ via a "jump trie" but we also
4183 support building with NOJUMPTRIE, which restricts
4184 the trie logic to structures like /FOO|BAR/.
4186 If noper is a trieable nodetype then the branch is
4187 a possible optimization target. If we are building
4188 under NOJUMPTRIE then we require that noper_next is
4189 the same as scan (our current position in the regex
4192 Once we have two or more consecutive such branches
4193 we can create a trie of the EXACT's contents and
4194 stitch it in place into the program.
4196 If the sequence represents all of the branches in
4197 the alternation we replace the entire thing with a
4200 Otherwise when it is a subsequence we need to
4201 stitch it in place and replace only the relevant
4202 branches. This means the first branch has to remain
4203 as it is used by the alternation logic, and its
4204 next pointer, and needs to be repointed at the item
4205 on the branch chain following the last branch we
4206 have optimized away.
4208 This could be either a BRANCH, in which case the
4209 subsequence is internal, or it could be the item
4210 following the branch sequence in which case the
4211 subsequence is at the end (which does not
4212 necessarily mean the first node is the start of the
4215 TRIE_TYPE(X) is a define which maps the optype to a
4219 ----------------+-----------
4223 EXACTFU_SS | EXACTFU
4226 EXACTFLU8 | EXACTFLU8
4230 #define TRIE_TYPE(X) ( ( NOTHING == (X) ) \
4232 : ( EXACT == (X) ) \
4234 : ( EXACTFU == (X) || EXACTFU_SS == (X) ) \
4236 : ( EXACTFA == (X) ) \
4238 : ( EXACTL == (X) ) \
4240 : ( EXACTFLU8 == (X) ) \
4244 /* dont use tail as the end marker for this traverse */
4245 for ( cur = startbranch ; cur != scan ; cur = regnext( cur ) ) {
4246 regnode * const noper = NEXTOPER( cur );
4247 U8 noper_type = OP( noper );
4248 U8 noper_trietype = TRIE_TYPE( noper_type );
4249 #if defined(DEBUGGING) || defined(NOJUMPTRIE)
4250 regnode * const noper_next = regnext( noper );
4251 U8 noper_next_type = (noper_next && noper_next != tail) ? OP(noper_next) : 0;
4252 U8 noper_next_trietype = (noper_next && noper_next != tail) ? TRIE_TYPE( noper_next_type ) :0;
4255 DEBUG_TRIE_COMPILE_r({
4256 regprop(RExC_rx, RExC_mysv, cur, NULL, pRExC_state);
4257 PerlIO_printf( Perl_debug_log, "%*s- %s (%d)",
4258 (int)depth * 2 + 2,"", SvPV_nolen_const( RExC_mysv ), REG_NODE_NUM(cur) );
4260 regprop(RExC_rx, RExC_mysv, noper, NULL, pRExC_state);
4261 PerlIO_printf( Perl_debug_log, " -> %s",
4262 SvPV_nolen_const(RExC_mysv));
4265 regprop(RExC_rx, RExC_mysv, noper_next, NULL, pRExC_state);
4266 PerlIO_printf( Perl_debug_log,"\t=> %s\t",
4267 SvPV_nolen_const(RExC_mysv));
4269 PerlIO_printf( Perl_debug_log, "(First==%d,Last==%d,Cur==%d,tt==%s,nt==%s,nnt==%s)\n",
4270 REG_NODE_NUM(first), REG_NODE_NUM(last), REG_NODE_NUM(cur),
4271 PL_reg_name[trietype], PL_reg_name[noper_trietype], PL_reg_name[noper_next_trietype]
4275 /* Is noper a trieable nodetype that can be merged
4276 * with the current trie (if there is one)? */
4280 ( noper_trietype == NOTHING)
4281 || ( trietype == NOTHING )
4282 || ( trietype == noper_trietype )
4285 && noper_next == tail
4289 /* Handle mergable triable node Either we are
4290 * the first node in a new trieable sequence,
4291 * in which case we do some bookkeeping,
4292 * otherwise we update the end pointer. */
4295 if ( noper_trietype == NOTHING ) {
4296 #if !defined(DEBUGGING) && !defined(NOJUMPTRIE)
4297 regnode * const noper_next = regnext( noper );
4298 U8 noper_next_type = (noper_next && noper_next!=tail) ? OP(noper_next) : 0;
4299 U8 noper_next_trietype = noper_next_type ? TRIE_TYPE( noper_next_type ) :0;
4302 if ( noper_next_trietype ) {
4303 trietype = noper_next_trietype;
4304 } else if (noper_next_type) {
4305 /* a NOTHING regop is 1 regop wide.
4306 * We need at least two for a trie
4307 * so we can't merge this in */
4311 trietype = noper_trietype;
4314 if ( trietype == NOTHING )
4315 trietype = noper_trietype;
4320 } /* end handle mergable triable node */
4322 /* handle unmergable node -
4323 * noper may either be a triable node which can
4324 * not be tried together with the current trie,
4325 * or a non triable node */
4327 /* If last is set and trietype is not
4328 * NOTHING then we have found at least two
4329 * triable branch sequences in a row of a
4330 * similar trietype so we can turn them
4331 * into a trie. If/when we allow NOTHING to
4332 * start a trie sequence this condition
4333 * will be required, and it isn't expensive
4334 * so we leave it in for now. */
4335 if ( trietype && trietype != NOTHING )
4336 make_trie( pRExC_state,
4337 startbranch, first, cur, tail,
4338 count, trietype, depth+1 );
4339 last = NULL; /* note: we clear/update
4340 first, trietype etc below,
4341 so we dont do it here */
4345 && noper_next == tail
4348 /* noper is triable, so we can start a new
4352 trietype = noper_trietype;
4354 /* if we already saw a first but the
4355 * current node is not triable then we have
4356 * to reset the first information. */
4361 } /* end handle unmergable node */
4362 } /* loop over branches */
4363 DEBUG_TRIE_COMPILE_r({
4364 regprop(RExC_rx, RExC_mysv, cur, NULL, pRExC_state);
4365 PerlIO_printf( Perl_debug_log,
4366 "%*s- %s (%d) <SCAN FINISHED>\n",
4368 "", SvPV_nolen_const( RExC_mysv ),REG_NODE_NUM(cur));
4371 if ( last && trietype ) {
4372 if ( trietype != NOTHING ) {
4373 /* the last branch of the sequence was part of
4374 * a trie, so we have to construct it here
4375 * outside of the loop */
4376 made= make_trie( pRExC_state, startbranch,
4377 first, scan, tail, count,
4378 trietype, depth+1 );
4379 #ifdef TRIE_STUDY_OPT
4380 if ( ((made == MADE_EXACT_TRIE &&
4381 startbranch == first)
4382 || ( first_non_open == first )) &&
4384 flags |= SCF_TRIE_RESTUDY;
4385 if ( startbranch == first
4388 RExC_seen &=~REG_TOP_LEVEL_BRANCHES_SEEN;
4393 /* at this point we know whatever we have is a
4394 * NOTHING sequence/branch AND if 'startbranch'
4395 * is 'first' then we can turn the whole thing
4398 if ( startbranch == first ) {
4400 /* the entire thing is a NOTHING sequence,
4401 * something like this: (?:|) So we can
4402 * turn it into a plain NOTHING op. */
4403 DEBUG_TRIE_COMPILE_r({
4404 regprop(RExC_rx, RExC_mysv, cur, NULL, pRExC_state);
4405 PerlIO_printf( Perl_debug_log,
4406 "%*s- %s (%d) <NOTHING BRANCH SEQUENCE>\n", (int)depth * 2 + 2,
4407 "", SvPV_nolen_const( RExC_mysv ),REG_NODE_NUM(cur));
4410 OP(startbranch)= NOTHING;
4411 NEXT_OFF(startbranch)= tail - startbranch;
4412 for ( opt= startbranch + 1; opt < tail ; opt++ )
4416 } /* end if ( last) */
4417 } /* TRIE_MAXBUF is non zero */
4422 else if ( code == BRANCHJ ) { /* single branch is optimized. */
4423 scan = NEXTOPER(NEXTOPER(scan));
4424 } else /* single branch is optimized. */
4425 scan = NEXTOPER(scan);
4427 } else if (OP(scan) == SUSPEND || OP(scan) == GOSUB || OP(scan) == GOSTART) {
4429 regnode *start = NULL;
4430 regnode *end = NULL;
4431 U32 my_recursed_depth= recursed_depth;
4434 if (OP(scan) != SUSPEND) { /* GOSUB/GOSTART */
4435 /* Do setup, note this code has side effects beyond
4436 * the rest of this block. Specifically setting
4437 * RExC_recurse[] must happen at least once during
4439 if (OP(scan) == GOSUB) {
4441 RExC_recurse[ARG2L(scan)] = scan;
4442 start = RExC_open_parens[paren-1];
4443 end = RExC_close_parens[paren-1];
4445 start = RExC_rxi->program + 1;
4448 /* NOTE we MUST always execute the above code, even
4449 * if we do nothing with a GOSUB/GOSTART */
4451 ( flags & SCF_IN_DEFINE )
4454 (is_inf_internal || is_inf || (data && data->flags & SF_IS_INF))
4456 ( (flags & (SCF_DO_STCLASS | SCF_DO_SUBSTR)) == 0 )
4459 /* no need to do anything here if we are in a define. */
4460 /* or we are after some kind of infinite construct
4461 * so we can skip recursing into this item.
4462 * Since it is infinite we will not change the maxlen
4463 * or delta, and if we miss something that might raise
4464 * the minlen it will merely pessimise a little.
4466 * Iow /(?(DEFINE)(?<foo>foo|food))a+(?&foo)/
4467 * might result in a minlen of 1 and not of 4,
4468 * but this doesn't make us mismatch, just try a bit
4469 * harder than we should.
4471 scan= regnext(scan);
4478 !PAREN_TEST(RExC_study_chunk_recursed + ((recursed_depth-1) * RExC_study_chunk_recursed_bytes), paren)
4480 /* it is quite possible that there are more efficient ways
4481 * to do this. We maintain a bitmap per level of recursion
4482 * of which patterns we have entered so we can detect if a
4483 * pattern creates a possible infinite loop. When we
4484 * recurse down a level we copy the previous levels bitmap
4485 * down. When we are at recursion level 0 we zero the top
4486 * level bitmap. It would be nice to implement a different
4487 * more efficient way of doing this. In particular the top
4488 * level bitmap may be unnecessary.
4490 if (!recursed_depth) {
4491 Zero(RExC_study_chunk_recursed, RExC_study_chunk_recursed_bytes, U8);
4493 Copy(RExC_study_chunk_recursed + ((recursed_depth-1) * RExC_study_chunk_recursed_bytes),
4494 RExC_study_chunk_recursed + (recursed_depth * RExC_study_chunk_recursed_bytes),
4495 RExC_study_chunk_recursed_bytes, U8);
4497 /* we havent recursed into this paren yet, so recurse into it */
4498 DEBUG_STUDYDATA("set:", data,depth);
4499 PAREN_SET(RExC_study_chunk_recursed + (recursed_depth * RExC_study_chunk_recursed_bytes), paren);
4500 my_recursed_depth= recursed_depth + 1;
4502 DEBUG_STUDYDATA("inf:", data,depth);
4503 /* some form of infinite recursion, assume infinite length
4505 if (flags & SCF_DO_SUBSTR) {
4506 scan_commit(pRExC_state, data, minlenp, is_inf);
4507 data->longest = &(data->longest_float);
4509 is_inf = is_inf_internal = 1;
4510 if (flags & SCF_DO_STCLASS_OR) /* Allow everything */
4511 ssc_anything(data->start_class);
4512 flags &= ~SCF_DO_STCLASS;
4514 start= NULL; /* reset start so we dont recurse later on. */
4519 end = regnext(scan);
4522 scan_frame *newframe;
4524 if (!RExC_frame_last) {
4525 Newxz(newframe, 1, scan_frame);
4526 SAVEDESTRUCTOR_X(S_unwind_scan_frames, newframe);
4527 RExC_frame_head= newframe;
4529 } else if (!RExC_frame_last->next_frame) {
4530 Newxz(newframe,1,scan_frame);
4531 RExC_frame_last->next_frame= newframe;
4532 newframe->prev_frame= RExC_frame_last;
4535 newframe= RExC_frame_last->next_frame;
4537 RExC_frame_last= newframe;
4539 newframe->next_regnode = regnext(scan);
4540 newframe->last_regnode = last;
4541 newframe->stopparen = stopparen;
4542 newframe->prev_recursed_depth = recursed_depth;
4543 newframe->this_prev_frame= frame;
4545 DEBUG_STUDYDATA("frame-new:",data,depth);
4546 DEBUG_PEEP("fnew", scan, depth);
4553 recursed_depth= my_recursed_depth;
4558 else if (OP(scan) == EXACT || OP(scan) == EXACTL) {
4559 SSize_t l = STR_LEN(scan);
4562 const U8 * const s = (U8*)STRING(scan);
4563 uc = utf8_to_uvchr_buf(s, s + l, NULL);
4564 l = utf8_length(s, s + l);
4566 uc = *((U8*)STRING(scan));
4569 if (flags & SCF_DO_SUBSTR) { /* Update longest substr. */
4570 /* The code below prefers earlier match for fixed
4571 offset, later match for variable offset. */
4572 if (data->last_end == -1) { /* Update the start info. */
4573 data->last_start_min = data->pos_min;
4574 data->last_start_max = is_inf
4575 ? SSize_t_MAX : data->pos_min + data->pos_delta;
4577 sv_catpvn(data->last_found, STRING(scan), STR_LEN(scan));
4579 SvUTF8_on(data->last_found);
4581 SV * const sv = data->last_found;
4582 MAGIC * const mg = SvUTF8(sv) && SvMAGICAL(sv) ?
4583 mg_find(sv, PERL_MAGIC_utf8) : NULL;
4584 if (mg && mg->mg_len >= 0)
4585 mg->mg_len += utf8_length((U8*)STRING(scan),
4586 (U8*)STRING(scan)+STR_LEN(scan));
4588 data->last_end = data->pos_min + l;
4589 data->pos_min += l; /* As in the first entry. */
4590 data->flags &= ~SF_BEFORE_EOL;
4593 /* ANDing the code point leaves at most it, and not in locale, and
4594 * can't match null string */
4595 if (flags & SCF_DO_STCLASS_AND) {
4596 ssc_cp_and(data->start_class, uc);
4597 ANYOF_FLAGS(data->start_class) &= ~SSC_MATCHES_EMPTY_STRING;
4598 ssc_clear_locale(data->start_class);
4600 else if (flags & SCF_DO_STCLASS_OR) {
4601 ssc_add_cp(data->start_class, uc);
4602 ssc_and(pRExC_state, data->start_class, (regnode_charclass *) and_withp);
4604 /* See commit msg 749e076fceedeb708a624933726e7989f2302f6a */
4605 ANYOF_FLAGS(data->start_class) &= ~SSC_MATCHES_EMPTY_STRING;
4607 flags &= ~SCF_DO_STCLASS;
4609 else if (PL_regkind[OP(scan)] == EXACT) {
4610 /* But OP != EXACT!, so is EXACTFish */
4611 SSize_t l = STR_LEN(scan);
4612 const U8 * s = (U8*)STRING(scan);
4614 /* Search for fixed substrings supports EXACT only. */
4615 if (flags & SCF_DO_SUBSTR) {
4617 scan_commit(pRExC_state, data, minlenp, is_inf);
4620 l = utf8_length(s, s + l);
4622 if (unfolded_multi_char) {
4623 RExC_seen |= REG_UNFOLDED_MULTI_SEEN;
4625 min += l - min_subtract;
4627 delta += min_subtract;
4628 if (flags & SCF_DO_SUBSTR) {
4629 data->pos_min += l - min_subtract;
4630 if (data->pos_min < 0) {
4633 data->pos_delta += min_subtract;
4635 data->longest = &(data->longest_float);
4639 if (flags & SCF_DO_STCLASS) {
4640 SV* EXACTF_invlist = _make_exactf_invlist(pRExC_state, scan);
4642 assert(EXACTF_invlist);
4643 if (flags & SCF_DO_STCLASS_AND) {
4644 if (OP(scan) != EXACTFL)
4645 ssc_clear_locale(data->start_class);
4646 ANYOF_FLAGS(data->start_class) &= ~SSC_MATCHES_EMPTY_STRING;
4647 ANYOF_POSIXL_ZERO(data->start_class);
4648 ssc_intersection(data->start_class, EXACTF_invlist, FALSE);
4650 else { /* SCF_DO_STCLASS_OR */
4651 ssc_union(data->start_class, EXACTF_invlist, FALSE);
4652 ssc_and(pRExC_state, data->start_class, (regnode_charclass *) and_withp);
4654 /* See commit msg 749e076fceedeb708a624933726e7989f2302f6a */
4655 ANYOF_FLAGS(data->start_class) &= ~SSC_MATCHES_EMPTY_STRING;
4657 flags &= ~SCF_DO_STCLASS;
4658 SvREFCNT_dec(EXACTF_invlist);
4661 else if (REGNODE_VARIES(OP(scan))) {
4662 SSize_t mincount, maxcount, minnext, deltanext, pos_before = 0;
4663 I32 fl = 0, f = flags;
4664 regnode * const oscan = scan;
4665 regnode_ssc this_class;
4666 regnode_ssc *oclass = NULL;
4667 I32 next_is_eval = 0;
4669 switch (PL_regkind[OP(scan)]) {
4670 case WHILEM: /* End of (?:...)* . */
4671 scan = NEXTOPER(scan);
4674 if (flags & (SCF_DO_SUBSTR | SCF_DO_STCLASS)) {
4675 next = NEXTOPER(scan);
4676 if (OP(next) == EXACT
4677 || OP(next) == EXACTL
4678 || (flags & SCF_DO_STCLASS))
4681 maxcount = REG_INFTY;
4682 next = regnext(scan);
4683 scan = NEXTOPER(scan);
4687 if (flags & SCF_DO_SUBSTR)
4692 if (flags & SCF_DO_STCLASS) {
4694 maxcount = REG_INFTY;
4695 next = regnext(scan);
4696 scan = NEXTOPER(scan);
4699 if (flags & SCF_DO_SUBSTR) {
4700 scan_commit(pRExC_state, data, minlenp, is_inf);
4701 /* Cannot extend fixed substrings */
4702 data->longest = &(data->longest_float);
4704 is_inf = is_inf_internal = 1;
4705 scan = regnext(scan);
4706 goto optimize_curly_tail;
4708 if (stopparen>0 && (OP(scan)==CURLYN || OP(scan)==CURLYM)
4709 && (scan->flags == stopparen))
4714 mincount = ARG1(scan);
4715 maxcount = ARG2(scan);
4717 next = regnext(scan);
4718 if (OP(scan) == CURLYX) {
4719 I32 lp = (data ? *(data->last_closep) : 0);
4720 scan->flags = ((lp <= (I32)U8_MAX) ? (U8)lp : U8_MAX);
4722 scan = NEXTOPER(scan) + EXTRA_STEP_2ARGS;
4723 next_is_eval = (OP(scan) == EVAL);
4725 if (flags & SCF_DO_SUBSTR) {
4727 scan_commit(pRExC_state, data, minlenp, is_inf);
4728 /* Cannot extend fixed substrings */
4729 pos_before = data->pos_min;
4733 data->flags &= ~(SF_HAS_PAR|SF_IN_PAR|SF_HAS_EVAL);
4735 data->flags |= SF_IS_INF;
4737 if (flags & SCF_DO_STCLASS) {
4738 ssc_init(pRExC_state, &this_class);
4739 oclass = data->start_class;
4740 data->start_class = &this_class;
4741 f |= SCF_DO_STCLASS_AND;
4742 f &= ~SCF_DO_STCLASS_OR;
4744 /* Exclude from super-linear cache processing any {n,m}
4745 regops for which the combination of input pos and regex
4746 pos is not enough information to determine if a match
4749 For example, in the regex /foo(bar\s*){4,8}baz/ with the
4750 regex pos at the \s*, the prospects for a match depend not
4751 only on the input position but also on how many (bar\s*)
4752 repeats into the {4,8} we are. */
4753 if ((mincount > 1) || (maxcount > 1 && maxcount != REG_INFTY))
4754 f &= ~SCF_WHILEM_VISITED_POS;
4756 /* This will finish on WHILEM, setting scan, or on NULL: */
4757 minnext = study_chunk(pRExC_state, &scan, minlenp, &deltanext,
4758 last, data, stopparen, recursed_depth, NULL,
4760 ? (f & ~SCF_DO_SUBSTR)
4764 if (flags & SCF_DO_STCLASS)
4765 data->start_class = oclass;
4766 if (mincount == 0 || minnext == 0) {
4767 if (flags & SCF_DO_STCLASS_OR) {
4768 ssc_or(pRExC_state, data->start_class, (regnode_charclass *) &this_class);
4770 else if (flags & SCF_DO_STCLASS_AND) {
4771 /* Switch to OR mode: cache the old value of
4772 * data->start_class */
4774 StructCopy(data->start_class, and_withp, regnode_ssc);
4775 flags &= ~SCF_DO_STCLASS_AND;
4776 StructCopy(&this_class, data->start_class, regnode_ssc);
4777 flags |= SCF_DO_STCLASS_OR;
4778 ANYOF_FLAGS(data->start_class)
4779 |= SSC_MATCHES_EMPTY_STRING;
4781 } else { /* Non-zero len */
4782 if (flags & SCF_DO_STCLASS_OR) {
4783 ssc_or(pRExC_state, data->start_class, (regnode_charclass *) &this_class);
4784 ssc_and(pRExC_state, data->start_class, (regnode_charclass *) and_withp);
4786 else if (flags & SCF_DO_STCLASS_AND)
4787 ssc_and(pRExC_state, data->start_class, (regnode_charclass *) &this_class);
4788 flags &= ~SCF_DO_STCLASS;
4790 if (!scan) /* It was not CURLYX, but CURLY. */
4792 if (!(flags & SCF_TRIE_DOING_RESTUDY)
4793 /* ? quantifier ok, except for (?{ ... }) */
4794 && (next_is_eval || !(mincount == 0 && maxcount == 1))
4795 && (minnext == 0) && (deltanext == 0)
4796 && data && !(data->flags & (SF_HAS_PAR|SF_IN_PAR))
4797 && maxcount <= REG_INFTY/3) /* Complement check for big
4800 /* Fatal warnings may leak the regexp without this: */
4801 SAVEFREESV(RExC_rx_sv);
4802 Perl_ck_warner(aTHX_ packWARN(WARN_REGEXP),
4803 "Quantifier unexpected on zero-length expression "
4804 "in regex m/%"UTF8f"/",
4805 UTF8fARG(UTF, RExC_end - RExC_precomp,
4807 (void)ReREFCNT_inc(RExC_rx_sv);
4810 min += minnext * mincount;
4811 is_inf_internal |= deltanext == SSize_t_MAX
4812 || (maxcount == REG_INFTY && minnext + deltanext > 0);
4813 is_inf |= is_inf_internal;
4815 delta = SSize_t_MAX;
4817 delta += (minnext + deltanext) * maxcount
4818 - minnext * mincount;
4820 /* Try powerful optimization CURLYX => CURLYN. */
4821 if ( OP(oscan) == CURLYX && data
4822 && data->flags & SF_IN_PAR
4823 && !(data->flags & SF_HAS_EVAL)
4824 && !deltanext && minnext == 1 ) {
4825 /* Try to optimize to CURLYN. */
4826 regnode *nxt = NEXTOPER(oscan) + EXTRA_STEP_2ARGS;
4827 regnode * const nxt1 = nxt;
4834 if (!REGNODE_SIMPLE(OP(nxt))
4835 && !(PL_regkind[OP(nxt)] == EXACT
4836 && STR_LEN(nxt) == 1))
4842 if (OP(nxt) != CLOSE)
4844 if (RExC_open_parens) {
4845 RExC_open_parens[ARG(nxt1)-1]=oscan; /*open->CURLYM*/
4846 RExC_close_parens[ARG(nxt1)-1]=nxt+2; /*close->while*/
4848 /* Now we know that nxt2 is the only contents: */
4849 oscan->flags = (U8)ARG(nxt);
4851 OP(nxt1) = NOTHING; /* was OPEN. */
4854 OP(nxt1 + 1) = OPTIMIZED; /* was count. */
4855 NEXT_OFF(nxt1+ 1) = 0; /* just for consistency. */
4856 NEXT_OFF(nxt2) = 0; /* just for consistency with CURLY. */
4857 OP(nxt) = OPTIMIZED; /* was CLOSE. */
4858 OP(nxt + 1) = OPTIMIZED; /* was count. */
4859 NEXT_OFF(nxt+ 1) = 0; /* just for consistency. */
4864 /* Try optimization CURLYX => CURLYM. */
4865 if ( OP(oscan) == CURLYX && data
4866 && !(data->flags & SF_HAS_PAR)
4867 && !(data->flags & SF_HAS_EVAL)
4868 && !deltanext /* atom is fixed width */
4869 && minnext != 0 /* CURLYM can't handle zero width */
4871 /* Nor characters whose fold at run-time may be
4872 * multi-character */
4873 && ! (RExC_seen & REG_UNFOLDED_MULTI_SEEN)
4875 /* XXXX How to optimize if data == 0? */
4876 /* Optimize to a simpler form. */
4877 regnode *nxt = NEXTOPER(oscan) + EXTRA_STEP_2ARGS; /* OPEN */
4881 while ( (nxt2 = regnext(nxt)) /* skip over embedded stuff*/
4882 && (OP(nxt2) != WHILEM))
4884 OP(nxt2) = SUCCEED; /* Whas WHILEM */
4885 /* Need to optimize away parenths. */
4886 if ((data->flags & SF_IN_PAR) && OP(nxt) == CLOSE) {
4887 /* Set the parenth number. */
4888 regnode *nxt1 = NEXTOPER(oscan) + EXTRA_STEP_2ARGS; /* OPEN*/
4890 oscan->flags = (U8)ARG(nxt);
4891 if (RExC_open_parens) {
4892 RExC_open_parens[ARG(nxt1)-1]=oscan; /*open->CURLYM*/
4893 RExC_close_parens[ARG(nxt1)-1]=nxt2+1; /*close->NOTHING*/
4895 OP(nxt1) = OPTIMIZED; /* was OPEN. */
4896 OP(nxt) = OPTIMIZED; /* was CLOSE. */
4899 OP(nxt1 + 1) = OPTIMIZED; /* was count. */
4900 OP(nxt + 1) = OPTIMIZED; /* was count. */
4901 NEXT_OFF(nxt1 + 1) = 0; /* just for consistency. */
4902 NEXT_OFF(nxt + 1) = 0; /* just for consistency. */
4905 while ( nxt1 && (OP(nxt1) != WHILEM)) {
4906 regnode *nnxt = regnext(nxt1);
4908 if (reg_off_by_arg[OP(nxt1)])
4909 ARG_SET(nxt1, nxt2 - nxt1);
4910 else if (nxt2 - nxt1 < U16_MAX)
4911 NEXT_OFF(nxt1) = nxt2 - nxt1;
4913 OP(nxt) = NOTHING; /* Cannot beautify */
4918 /* Optimize again: */
4919 study_chunk(pRExC_state, &nxt1, minlenp, &deltanext, nxt,
4920 NULL, stopparen, recursed_depth, NULL, 0,depth+1);
4925 else if ((OP(oscan) == CURLYX)
4926 && (flags & SCF_WHILEM_VISITED_POS)
4927 /* See the comment on a similar expression above.
4928 However, this time it's not a subexpression
4929 we care about, but the expression itself. */
4930 && (maxcount == REG_INFTY)
4931 && data && ++data->whilem_c < 16) {
4932 /* This stays as CURLYX, we can put the count/of pair. */
4933 /* Find WHILEM (as in regexec.c) */
4934 regnode *nxt = oscan + NEXT_OFF(oscan);
4936 if (OP(PREVOPER(nxt)) == NOTHING) /* LONGJMP */
4938 PREVOPER(nxt)->flags = (U8)(data->whilem_c
4939 | (RExC_whilem_seen << 4)); /* On WHILEM */
4941 if (data && fl & (SF_HAS_PAR|SF_IN_PAR))
4943 if (flags & SCF_DO_SUBSTR) {
4944 SV *last_str = NULL;
4945 STRLEN last_chrs = 0;
4946 int counted = mincount != 0;
4948 if (data->last_end > 0 && mincount != 0) { /* Ends with a
4950 SSize_t b = pos_before >= data->last_start_min
4951 ? pos_before : data->last_start_min;
4953 const char * const s = SvPV_const(data->last_found, l);
4954 SSize_t old = b - data->last_start_min;
4957 old = utf8_hop((U8*)s, old) - (U8*)s;
4959 /* Get the added string: */
4960 last_str = newSVpvn_utf8(s + old, l, UTF);
4961 last_chrs = UTF ? utf8_length((U8*)(s + old),
4962 (U8*)(s + old + l)) : l;
4963 if (deltanext == 0 && pos_before == b) {
4964 /* What was added is a constant string */
4967 SvGROW(last_str, (mincount * l) + 1);
4968 repeatcpy(SvPVX(last_str) + l,
4969 SvPVX_const(last_str), l,
4971 SvCUR_set(last_str, SvCUR(last_str) * mincount);
4972 /* Add additional parts. */
4973 SvCUR_set(data->last_found,
4974 SvCUR(data->last_found) - l);
4975 sv_catsv(data->last_found, last_str);
4977 SV * sv = data->last_found;
4979 SvUTF8(sv) && SvMAGICAL(sv) ?
4980 mg_find(sv, PERL_MAGIC_utf8) : NULL;
4981 if (mg && mg->mg_len >= 0)
4982 mg->mg_len += last_chrs * (mincount-1);
4984 last_chrs *= mincount;
4985 data->last_end += l * (mincount - 1);
4988 /* start offset must point into the last copy */
4989 data->last_start_min += minnext * (mincount - 1);
4990 data->last_start_max =
4993 : data->last_start_max +
4994 (maxcount - 1) * (minnext + data->pos_delta);
4997 /* It is counted once already... */
4998 data->pos_min += minnext * (mincount - counted);
5000 PerlIO_printf(Perl_debug_log, "counted=%"UVuf" deltanext=%"UVuf
5001 " SSize_t_MAX=%"UVuf" minnext=%"UVuf
5002 " maxcount=%"UVuf" mincount=%"UVuf"\n",
5003 (UV)counted, (UV)deltanext, (UV)SSize_t_MAX, (UV)minnext, (UV)maxcount,
5005 if (deltanext != SSize_t_MAX)
5006 PerlIO_printf(Perl_debug_log, "LHS=%"UVuf" RHS=%"UVuf"\n",
5007 (UV)(-counted * deltanext + (minnext + deltanext) * maxcount
5008 - minnext * mincount), (UV)(SSize_t_MAX - data->pos_delta));
5010 if (deltanext == SSize_t_MAX
5011 || -counted * deltanext + (minnext + deltanext) * maxcount - minnext * mincount >= SSize_t_MAX - data->pos_delta)
5012 data->pos_delta = SSize_t_MAX;
5014 data->pos_delta += - counted * deltanext +
5015 (minnext + deltanext) * maxcount - minnext * mincount;
5016 if (mincount != maxcount) {
5017 /* Cannot extend fixed substrings found inside
5019 scan_commit(pRExC_state, data, minlenp, is_inf);
5020 if (mincount && last_str) {
5021 SV * const sv = data->last_found;
5022 MAGIC * const mg = SvUTF8(sv) && SvMAGICAL(sv) ?
5023 mg_find(sv, PERL_MAGIC_utf8) : NULL;
5027 sv_setsv(sv, last_str);
5028 data->last_end = data->pos_min;
5029 data->last_start_min = data->pos_min - last_chrs;
5030 data->last_start_max = is_inf
5032 : data->pos_min + data->pos_delta - last_chrs;
5034 data->longest = &(data->longest_float);
5036 SvREFCNT_dec(last_str);
5038 if (data && (fl & SF_HAS_EVAL))
5039 data->flags |= SF_HAS_EVAL;
5040 optimize_curly_tail:
5041 if (OP(oscan) != CURLYX) {
5042 while (PL_regkind[OP(next = regnext(oscan))] == NOTHING
5044 NEXT_OFF(oscan) += NEXT_OFF(next);
5050 Perl_croak(aTHX_ "panic: unexpected varying REx opcode %d",
5055 if (flags & SCF_DO_SUBSTR) {
5056 /* Cannot expect anything... */
5057 scan_commit(pRExC_state, data, minlenp, is_inf);
5058 data->longest = &(data->longest_float);
5060 is_inf = is_inf_internal = 1;
5061 if (flags & SCF_DO_STCLASS_OR) {
5062 if (OP(scan) == CLUMP) {
5063 /* Actually is any start char, but very few code points
5064 * aren't start characters */
5065 ssc_match_all_cp(data->start_class);
5068 ssc_anything(data->start_class);
5071 flags &= ~SCF_DO_STCLASS;
5075 else if (OP(scan) == LNBREAK) {
5076 if (flags & SCF_DO_STCLASS) {
5077 if (flags & SCF_DO_STCLASS_AND) {
5078 ssc_intersection(data->start_class,
5079 PL_XPosix_ptrs[_CC_VERTSPACE], FALSE);
5080 ssc_clear_locale(data->start_class);
5081 ANYOF_FLAGS(data->start_class)
5082 &= ~SSC_MATCHES_EMPTY_STRING;
5084 else if (flags & SCF_DO_STCLASS_OR) {
5085 ssc_union(data->start_class,
5086 PL_XPosix_ptrs[_CC_VERTSPACE],
5088 ssc_and(pRExC_state, data->start_class, (regnode_charclass *) and_withp);
5090 /* See commit msg for
5091 * 749e076fceedeb708a624933726e7989f2302f6a */
5092 ANYOF_FLAGS(data->start_class)
5093 &= ~SSC_MATCHES_EMPTY_STRING;
5095 flags &= ~SCF_DO_STCLASS;
5098 if (delta != SSize_t_MAX)
5099 delta++; /* Because of the 2 char string cr-lf */
5100 if (flags & SCF_DO_SUBSTR) {
5101 /* Cannot expect anything... */
5102 scan_commit(pRExC_state, data, minlenp, is_inf);
5104 data->pos_delta += 1;
5105 data->longest = &(data->longest_float);
5108 else if (REGNODE_SIMPLE(OP(scan))) {
5110 if (flags & SCF_DO_SUBSTR) {
5111 scan_commit(pRExC_state, data, minlenp, is_inf);
5115 if (flags & SCF_DO_STCLASS) {
5117 SV* my_invlist = NULL;
5120 /* See commit msg 749e076fceedeb708a624933726e7989f2302f6a */
5121 ANYOF_FLAGS(data->start_class) &= ~SSC_MATCHES_EMPTY_STRING;
5123 /* Some of the logic below assumes that switching
5124 locale on will only add false positives. */
5129 Perl_croak(aTHX_ "panic: unexpected simple REx opcode %d",
5133 if (flags & SCF_DO_STCLASS_OR) /* Allow everything */
5134 ssc_match_all_cp(data->start_class);
5139 SV* REG_ANY_invlist = _new_invlist(2);
5140 REG_ANY_invlist = add_cp_to_invlist(REG_ANY_invlist,
5142 if (flags & SCF_DO_STCLASS_OR) {
5143 ssc_union(data->start_class,
5145 TRUE /* TRUE => invert, hence all but \n
5149 else if (flags & SCF_DO_STCLASS_AND) {
5150 ssc_intersection(data->start_class,
5152 TRUE /* TRUE => invert */
5154 ssc_clear_locale(data->start_class);
5156 SvREFCNT_dec_NN(REG_ANY_invlist);
5163 if (flags & SCF_DO_STCLASS_AND)
5164 ssc_and(pRExC_state, data->start_class,
5165 (regnode_charclass *) scan);
5167 ssc_or(pRExC_state, data->start_class,
5168 (regnode_charclass *) scan);
5176 namedclass = classnum_to_namedclass(FLAGS(scan)) + invert;
5177 if (flags & SCF_DO_STCLASS_AND) {
5178 bool was_there = cBOOL(
5179 ANYOF_POSIXL_TEST(data->start_class,
5181 ANYOF_POSIXL_ZERO(data->start_class);
5182 if (was_there) { /* Do an AND */
5183 ANYOF_POSIXL_SET(data->start_class, namedclass);
5185 /* No individual code points can now match */
5186 data->start_class->invlist
5187 = sv_2mortal(_new_invlist(0));
5190 int complement = namedclass + ((invert) ? -1 : 1);
5192 assert(flags & SCF_DO_STCLASS_OR);
5194 /* If the complement of this class was already there,
5195 * the result is that they match all code points,
5196 * (\d + \D == everything). Remove the classes from
5197 * future consideration. Locale is not relevant in
5199 if (ANYOF_POSIXL_TEST(data->start_class, complement)) {
5200 ssc_match_all_cp(data->start_class);
5201 ANYOF_POSIXL_CLEAR(data->start_class, namedclass);
5202 ANYOF_POSIXL_CLEAR(data->start_class, complement);
5204 else { /* The usual case; just add this class to the
5206 ANYOF_POSIXL_SET(data->start_class, namedclass);
5211 case NPOSIXA: /* For these, we always know the exact set of
5216 if (FLAGS(scan) == _CC_ASCII) {
5217 my_invlist = invlist_clone(PL_XPosix_ptrs[_CC_ASCII]);
5220 _invlist_intersection(PL_XPosix_ptrs[FLAGS(scan)],
5221 PL_XPosix_ptrs[_CC_ASCII],
5232 my_invlist = invlist_clone(PL_XPosix_ptrs[FLAGS(scan)]);
5234 /* NPOSIXD matches all upper Latin1 code points unless the
5235 * target string being matched is UTF-8, which is
5236 * unknowable until match time. Since we are going to
5237 * invert, we want to get rid of all of them so that the
5238 * inversion will match all */
5239 if (OP(scan) == NPOSIXD) {
5240 _invlist_subtract(my_invlist, PL_UpperLatin1,
5246 if (flags & SCF_DO_STCLASS_AND) {
5247 ssc_intersection(data->start_class, my_invlist, invert);
5248 ssc_clear_locale(data->start_class);
5251 assert(flags & SCF_DO_STCLASS_OR);
5252 ssc_union(data->start_class, my_invlist, invert);
5254 SvREFCNT_dec(my_invlist);
5256 if (flags & SCF_DO_STCLASS_OR)
5257 ssc_and(pRExC_state, data->start_class, (regnode_charclass *) and_withp);
5258 flags &= ~SCF_DO_STCLASS;
5261 else if (PL_regkind[OP(scan)] == EOL && flags & SCF_DO_SUBSTR) {
5262 data->flags |= (OP(scan) == MEOL
5265 scan_commit(pRExC_state, data, minlenp, is_inf);
5268 else if ( PL_regkind[OP(scan)] == BRANCHJ
5269 /* Lookbehind, or need to calculate parens/evals/stclass: */
5270 && (scan->flags || data || (flags & SCF_DO_STCLASS))
5271 && (OP(scan) == IFMATCH || OP(scan) == UNLESSM))
5273 if ( !PERL_ENABLE_POSITIVE_ASSERTION_STUDY
5274 || OP(scan) == UNLESSM )
5276 /* Negative Lookahead/lookbehind
5277 In this case we can't do fixed string optimisation.
5280 SSize_t deltanext, minnext, fake = 0;
5285 StructCopy(&zero_scan_data, &data_fake, scan_data_t);
5287 data_fake.whilem_c = data->whilem_c;
5288 data_fake.last_closep = data->last_closep;
5291 data_fake.last_closep = &fake;
5292 data_fake.pos_delta = delta;
5293 if ( flags & SCF_DO_STCLASS && !scan->flags
5294 && OP(scan) == IFMATCH ) { /* Lookahead */
5295 ssc_init(pRExC_state, &intrnl);
5296 data_fake.start_class = &intrnl;
5297 f |= SCF_DO_STCLASS_AND;
5299 if (flags & SCF_WHILEM_VISITED_POS)
5300 f |= SCF_WHILEM_VISITED_POS;
5301 next = regnext(scan);
5302 nscan = NEXTOPER(NEXTOPER(scan));
5303 minnext = study_chunk(pRExC_state, &nscan, minlenp, &deltanext,
5304 last, &data_fake, stopparen,
5305 recursed_depth, NULL, f, depth+1);
5308 FAIL("Variable length lookbehind not implemented");
5310 else if (minnext > (I32)U8_MAX) {
5311 FAIL2("Lookbehind longer than %"UVuf" not implemented",
5314 scan->flags = (U8)minnext;
5317 if (data_fake.flags & (SF_HAS_PAR|SF_IN_PAR))
5319 if (data_fake.flags & SF_HAS_EVAL)
5320 data->flags |= SF_HAS_EVAL;
5321 data->whilem_c = data_fake.whilem_c;
5323 if (f & SCF_DO_STCLASS_AND) {
5324 if (flags & SCF_DO_STCLASS_OR) {
5325 /* OR before, AND after: ideally we would recurse with
5326 * data_fake to get the AND applied by study of the
5327 * remainder of the pattern, and then derecurse;
5328 * *** HACK *** for now just treat as "no information".
5329 * See [perl #56690].
5331 ssc_init(pRExC_state, data->start_class);
5333 /* AND before and after: combine and continue. These
5334 * assertions are zero-length, so can match an EMPTY
5336 ssc_and(pRExC_state, data->start_class, (regnode_charclass *) &intrnl);
5337 ANYOF_FLAGS(data->start_class)
5338 |= SSC_MATCHES_EMPTY_STRING;
5342 #if PERL_ENABLE_POSITIVE_ASSERTION_STUDY
5344 /* Positive Lookahead/lookbehind
5345 In this case we can do fixed string optimisation,
5346 but we must be careful about it. Note in the case of
5347 lookbehind the positions will be offset by the minimum
5348 length of the pattern, something we won't know about
5349 until after the recurse.
5351 SSize_t deltanext, fake = 0;
5355 /* We use SAVEFREEPV so that when the full compile
5356 is finished perl will clean up the allocated
5357 minlens when it's all done. This way we don't
5358 have to worry about freeing them when we know
5359 they wont be used, which would be a pain.
5362 Newx( minnextp, 1, SSize_t );
5363 SAVEFREEPV(minnextp);
5366 StructCopy(data, &data_fake, scan_data_t);
5367 if ((flags & SCF_DO_SUBSTR) && data->last_found) {
5370 scan_commit(pRExC_state, &data_fake, minlenp, is_inf);
5371 data_fake.last_found=newSVsv(data->last_found);
5375 data_fake.last_closep = &fake;
5376 data_fake.flags = 0;
5377 data_fake.pos_delta = delta;
5379 data_fake.flags |= SF_IS_INF;
5380 if ( flags & SCF_DO_STCLASS && !scan->flags
5381 && OP(scan) == IFMATCH ) { /* Lookahead */
5382 ssc_init(pRExC_state, &intrnl);
5383 data_fake.start_class = &intrnl;
5384 f |= SCF_DO_STCLASS_AND;
5386 if (flags & SCF_WHILEM_VISITED_POS)
5387 f |= SCF_WHILEM_VISITED_POS;
5388 next = regnext(scan);
5389 nscan = NEXTOPER(NEXTOPER(scan));
5391 *minnextp = study_chunk(pRExC_state, &nscan, minnextp,
5392 &deltanext, last, &data_fake,
5393 stopparen, recursed_depth, NULL,
5397 FAIL("Variable length lookbehind not implemented");
5399 else if (*minnextp > (I32)U8_MAX) {
5400 FAIL2("Lookbehind longer than %"UVuf" not implemented",
5403 scan->flags = (U8)*minnextp;
5408 if (f & SCF_DO_STCLASS_AND) {
5409 ssc_and(pRExC_state, data->start_class, (regnode_charclass *) &intrnl);
5410 ANYOF_FLAGS(data->start_class) |= SSC_MATCHES_EMPTY_STRING;
5413 if (data_fake.flags & (SF_HAS_PAR|SF_IN_PAR))
5415 if (data_fake.flags & SF_HAS_EVAL)
5416 data->flags |= SF_HAS_EVAL;
5417 data->whilem_c = data_fake.whilem_c;
5418 if ((flags & SCF_DO_SUBSTR) && data_fake.last_found) {
5419 if (RExC_rx->minlen<*minnextp)
5420 RExC_rx->minlen=*minnextp;
5421 scan_commit(pRExC_state, &data_fake, minnextp, is_inf);
5422 SvREFCNT_dec_NN(data_fake.last_found);
5424 if ( data_fake.minlen_fixed != minlenp )
5426 data->offset_fixed= data_fake.offset_fixed;
5427 data->minlen_fixed= data_fake.minlen_fixed;
5428 data->lookbehind_fixed+= scan->flags;
5430 if ( data_fake.minlen_float != minlenp )
5432 data->minlen_float= data_fake.minlen_float;
5433 data->offset_float_min=data_fake.offset_float_min;
5434 data->offset_float_max=data_fake.offset_float_max;
5435 data->lookbehind_float+= scan->flags;
5442 else if (OP(scan) == OPEN) {
5443 if (stopparen != (I32)ARG(scan))
5446 else if (OP(scan) == CLOSE) {
5447 if (stopparen == (I32)ARG(scan)) {
5450 if ((I32)ARG(scan) == is_par) {
5451 next = regnext(scan);
5453 if ( next && (OP(next) != WHILEM) && next < last)
5454 is_par = 0; /* Disable optimization */
5457 *(data->last_closep) = ARG(scan);
5459 else if (OP(scan) == EVAL) {
5461 data->flags |= SF_HAS_EVAL;
5463 else if ( PL_regkind[OP(scan)] == ENDLIKE ) {
5464 if (flags & SCF_DO_SUBSTR) {
5465 scan_commit(pRExC_state, data, minlenp, is_inf);
5466 flags &= ~SCF_DO_SUBSTR;
5468 if (data && OP(scan)==ACCEPT) {
5469 data->flags |= SCF_SEEN_ACCEPT;
5474 else if (OP(scan) == LOGICAL && scan->flags == 2) /* Embedded follows */
5476 if (flags & SCF_DO_SUBSTR) {
5477 scan_commit(pRExC_state, data, minlenp, is_inf);
5478 data->longest = &(data->longest_float);
5480 is_inf = is_inf_internal = 1;
5481 if (flags & SCF_DO_STCLASS_OR) /* Allow everything */
5482 ssc_anything(data->start_class);
5483 flags &= ~SCF_DO_STCLASS;
5485 else if (OP(scan) == GPOS) {
5486 if (!(RExC_rx->intflags & PREGf_GPOS_FLOAT) &&
5487 !(delta || is_inf || (data && data->pos_delta)))
5489 if (!(RExC_rx->intflags & PREGf_ANCH) && (flags & SCF_DO_SUBSTR))
5490 RExC_rx->intflags |= PREGf_ANCH_GPOS;
5491 if (RExC_rx->gofs < (STRLEN)min)
5492 RExC_rx->gofs = min;
5494 RExC_rx->intflags |= PREGf_GPOS_FLOAT;
5498 #ifdef TRIE_STUDY_OPT
5499 #ifdef FULL_TRIE_STUDY
5500 else if (PL_regkind[OP(scan)] == TRIE) {
5501 /* NOTE - There is similar code to this block above for handling
5502 BRANCH nodes on the initial study. If you change stuff here
5504 regnode *trie_node= scan;
5505 regnode *tail= regnext(scan);
5506 reg_trie_data *trie = (reg_trie_data*)RExC_rxi->data->data[ ARG(scan) ];
5507 SSize_t max1 = 0, min1 = SSize_t_MAX;
5510 if (flags & SCF_DO_SUBSTR) { /* XXXX Add !SUSPEND? */
5511 /* Cannot merge strings after this. */
5512 scan_commit(pRExC_state, data, minlenp, is_inf);
5514 if (flags & SCF_DO_STCLASS)
5515 ssc_init_zero(pRExC_state, &accum);
5521 const regnode *nextbranch= NULL;
5524 for ( word=1 ; word <= trie->wordcount ; word++)
5526 SSize_t deltanext=0, minnext=0, f = 0, fake;
5527 regnode_ssc this_class;
5529 StructCopy(&zero_scan_data, &data_fake, scan_data_t);
5531 data_fake.whilem_c = data->whilem_c;
5532 data_fake.last_closep = data->last_closep;
5535 data_fake.last_closep = &fake;
5536 data_fake.pos_delta = delta;
5537 if (flags & SCF_DO_STCLASS) {
5538 ssc_init(pRExC_state, &this_class);
5539 data_fake.start_class = &this_class;
5540 f = SCF_DO_STCLASS_AND;
5542 if (flags & SCF_WHILEM_VISITED_POS)
5543 f |= SCF_WHILEM_VISITED_POS;
5545 if (trie->jump[word]) {
5547 nextbranch = trie_node + trie->jump[0];
5548 scan= trie_node + trie->jump[word];
5549 /* We go from the jump point to the branch that follows
5550 it. Note this means we need the vestigal unused
5551 branches even though they arent otherwise used. */
5552 minnext = study_chunk(pRExC_state, &scan, minlenp,
5553 &deltanext, (regnode *)nextbranch, &data_fake,
5554 stopparen, recursed_depth, NULL, f,depth+1);
5556 if (nextbranch && PL_regkind[OP(nextbranch)]==BRANCH)
5557 nextbranch= regnext((regnode*)nextbranch);
5559 if (min1 > (SSize_t)(minnext + trie->minlen))
5560 min1 = minnext + trie->minlen;
5561 if (deltanext == SSize_t_MAX) {
5562 is_inf = is_inf_internal = 1;
5564 } else if (max1 < (SSize_t)(minnext + deltanext + trie->maxlen))
5565 max1 = minnext + deltanext + trie->maxlen;
5567 if (data_fake.flags & (SF_HAS_PAR|SF_IN_PAR))
5569 if (data_fake.flags & SCF_SEEN_ACCEPT) {
5570 if ( stopmin > min + min1)
5571 stopmin = min + min1;
5572 flags &= ~SCF_DO_SUBSTR;
5574 data->flags |= SCF_SEEN_ACCEPT;
5577 if (data_fake.flags & SF_HAS_EVAL)
5578 data->flags |= SF_HAS_EVAL;
5579 data->whilem_c = data_fake.whilem_c;
5581 if (flags & SCF_DO_STCLASS)
5582 ssc_or(pRExC_state, &accum, (regnode_charclass *) &this_class);
5585 if (flags & SCF_DO_SUBSTR) {
5586 data->pos_min += min1;
5587 data->pos_delta += max1 - min1;
5588 if (max1 != min1 || is_inf)
5589 data->longest = &(data->longest_float);
5592 if (delta != SSize_t_MAX)
5593 delta += max1 - min1;
5594 if (flags & SCF_DO_STCLASS_OR) {
5595 ssc_or(pRExC_state, data->start_class, (regnode_charclass *) &accum);
5597 ssc_and(pRExC_state, data->start_class, (regnode_charclass *) and_withp);
5598 flags &= ~SCF_DO_STCLASS;
5601 else if (flags & SCF_DO_STCLASS_AND) {
5603 ssc_and(pRExC_state, data->start_class, (regnode_charclass *) &accum);
5604 flags &= ~SCF_DO_STCLASS;
5607 /* Switch to OR mode: cache the old value of
5608 * data->start_class */
5610 StructCopy(data->start_class, and_withp, regnode_ssc);
5611 flags &= ~SCF_DO_STCLASS_AND;
5612 StructCopy(&accum, data->start_class, regnode_ssc);
5613 flags |= SCF_DO_STCLASS_OR;
5620 else if (PL_regkind[OP(scan)] == TRIE) {
5621 reg_trie_data *trie = (reg_trie_data*)RExC_rxi->data->data[ ARG(scan) ];
5624 min += trie->minlen;
5625 delta += (trie->maxlen - trie->minlen);
5626 flags &= ~SCF_DO_STCLASS; /* xxx */
5627 if (flags & SCF_DO_SUBSTR) {
5628 /* Cannot expect anything... */
5629 scan_commit(pRExC_state, data, minlenp, is_inf);
5630 data->pos_min += trie->minlen;
5631 data->pos_delta += (trie->maxlen - trie->minlen);
5632 if (trie->maxlen != trie->minlen)
5633 data->longest = &(data->longest_float);
5635 if (trie->jump) /* no more substrings -- for now /grr*/
5636 flags &= ~SCF_DO_SUBSTR;
5638 #endif /* old or new */
5639 #endif /* TRIE_STUDY_OPT */
5641 /* Else: zero-length, ignore. */
5642 scan = regnext(scan);
5644 /* If we are exiting a recursion we can unset its recursed bit
5645 * and allow ourselves to enter it again - no danger of an
5646 * infinite loop there.
5647 if (stopparen > -1 && recursed) {
5648 DEBUG_STUDYDATA("unset:", data,depth);
5649 PAREN_UNSET( recursed, stopparen);
5655 DEBUG_STUDYDATA("frame-end:",data,depth);
5656 DEBUG_PEEP("fend", scan, depth);
5658 /* restore previous context */
5659 last = frame->last_regnode;
5660 scan = frame->next_regnode;
5661 stopparen = frame->stopparen;
5662 recursed_depth = frame->prev_recursed_depth;
5664 RExC_frame_last = frame->prev_frame;
5665 frame = frame->this_prev_frame;
5666 goto fake_study_recurse;
5671 DEBUG_STUDYDATA("pre-fin:",data,depth);
5674 *deltap = is_inf_internal ? SSize_t_MAX : delta;
5676 if (flags & SCF_DO_SUBSTR && is_inf)
5677 data->pos_delta = SSize_t_MAX - data->pos_min;
5678 if (is_par > (I32)U8_MAX)
5680 if (is_par && pars==1 && data) {
5681 data->flags |= SF_IN_PAR;
5682 data->flags &= ~SF_HAS_PAR;
5684 else if (pars && data) {
5685 data->flags |= SF_HAS_PAR;
5686 data->flags &= ~SF_IN_PAR;
5688 if (flags & SCF_DO_STCLASS_OR)
5689 ssc_and(pRExC_state, data->start_class, (regnode_charclass *) and_withp);
5690 if (flags & SCF_TRIE_RESTUDY)
5691 data->flags |= SCF_TRIE_RESTUDY;
5693 DEBUG_STUDYDATA("post-fin:",data,depth);
5696 SSize_t final_minlen= min < stopmin ? min : stopmin;
5698 if (!(RExC_seen & REG_UNBOUNDED_QUANTIFIER_SEEN)) {
5699 if (final_minlen > SSize_t_MAX - delta)
5700 RExC_maxlen = SSize_t_MAX;
5701 else if (RExC_maxlen < final_minlen + delta)
5702 RExC_maxlen = final_minlen + delta;
5704 return final_minlen;
5706 NOT_REACHED; /* NOTREACHED */
5710 S_add_data(RExC_state_t* const pRExC_state, const char* const s, const U32 n)
5712 U32 count = RExC_rxi->data ? RExC_rxi->data->count : 0;
5714 PERL_ARGS_ASSERT_ADD_DATA;
5716 Renewc(RExC_rxi->data,
5717 sizeof(*RExC_rxi->data) + sizeof(void*) * (count + n - 1),
5718 char, struct reg_data);
5720 Renew(RExC_rxi->data->what, count + n, U8);
5722 Newx(RExC_rxi->data->what, n, U8);
5723 RExC_rxi->data->count = count + n;
5724 Copy(s, RExC_rxi->data->what + count, n, U8);
5728 /*XXX: todo make this not included in a non debugging perl, but appears to be
5729 * used anyway there, in 'use re' */
5730 #ifndef PERL_IN_XSUB_RE
5732 Perl_reginitcolors(pTHX)
5734 const char * const s = PerlEnv_getenv("PERL_RE_COLORS");
5736 char *t = savepv(s);
5740 t = strchr(t, '\t');
5746 PL_colors[i] = t = (char *)"";
5751 PL_colors[i++] = (char *)"";
5758 #ifdef TRIE_STUDY_OPT
5759 #define CHECK_RESTUDY_GOTO_butfirst(dOsomething) \
5762 (data.flags & SCF_TRIE_RESTUDY) \
5770 #define CHECK_RESTUDY_GOTO_butfirst
5774 * pregcomp - compile a regular expression into internal code
5776 * Decides which engine's compiler to call based on the hint currently in
5780 #ifndef PERL_IN_XSUB_RE
5782 /* return the currently in-scope regex engine (or the default if none) */
5784 regexp_engine const *
5785 Perl_current_re_engine(pTHX)
5787 if (IN_PERL_COMPILETIME) {
5788 HV * const table = GvHV(PL_hintgv);
5791 if (!table || !(PL_hints & HINT_LOCALIZE_HH))
5792 return &PL_core_reg_engine;
5793 ptr = hv_fetchs(table, "regcomp", FALSE);
5794 if ( !(ptr && SvIOK(*ptr) && SvIV(*ptr)))
5795 return &PL_core_reg_engine;
5796 return INT2PTR(regexp_engine*,SvIV(*ptr));
5800 if (!PL_curcop->cop_hints_hash)
5801 return &PL_core_reg_engine;
5802 ptr = cop_hints_fetch_pvs(PL_curcop, "regcomp", 0);
5803 if ( !(ptr && SvIOK(ptr) && SvIV(ptr)))
5804 return &PL_core_reg_engine;
5805 return INT2PTR(regexp_engine*,SvIV(ptr));
5811 Perl_pregcomp(pTHX_ SV * const pattern, const U32 flags)
5813 regexp_engine const *eng = current_re_engine();
5814 GET_RE_DEBUG_FLAGS_DECL;
5816 PERL_ARGS_ASSERT_PREGCOMP;
5818 /* Dispatch a request to compile a regexp to correct regexp engine. */
5820 PerlIO_printf(Perl_debug_log, "Using engine %"UVxf"\n",
5823 return CALLREGCOMP_ENG(eng, pattern, flags);
5827 /* public(ish) entry point for the perl core's own regex compiling code.
5828 * It's actually a wrapper for Perl_re_op_compile that only takes an SV
5829 * pattern rather than a list of OPs, and uses the internal engine rather
5830 * than the current one */
5833 Perl_re_compile(pTHX_ SV * const pattern, U32 rx_flags)
5835 SV *pat = pattern; /* defeat constness! */
5836 PERL_ARGS_ASSERT_RE_COMPILE;
5837 return Perl_re_op_compile(aTHX_ &pat, 1, NULL,
5838 #ifdef PERL_IN_XSUB_RE
5841 &PL_core_reg_engine,
5843 NULL, NULL, rx_flags, 0);
5847 /* upgrade pattern pat_p of length plen_p to UTF8, and if there are code
5848 * blocks, recalculate the indices. Update pat_p and plen_p in-place to
5849 * point to the realloced string and length.
5851 * This is essentially a copy of Perl_bytes_to_utf8() with the code index
5855 S_pat_upgrade_to_utf8(pTHX_ RExC_state_t * const pRExC_state,
5856 char **pat_p, STRLEN *plen_p, int num_code_blocks)
5858 U8 *const src = (U8*)*pat_p;
5863 GET_RE_DEBUG_FLAGS_DECL;
5865 DEBUG_PARSE_r(PerlIO_printf(Perl_debug_log,
5866 "UTF8 mismatch! Converting to utf8 for resizing and compile\n"));
5868 Newx(dst, *plen_p * 2 + 1, U8);
5871 while (s < *plen_p) {
5872 append_utf8_from_native_byte(src[s], &d);
5873 if (n < num_code_blocks) {
5874 if (!do_end && pRExC_state->code_blocks[n].start == s) {
5875 pRExC_state->code_blocks[n].start = d - dst - 1;
5876 assert(*(d - 1) == '(');
5879 else if (do_end && pRExC_state->code_blocks[n].end == s) {
5880 pRExC_state->code_blocks[n].end = d - dst - 1;
5881 assert(*(d - 1) == ')');
5890 *pat_p = (char*) dst;
5892 RExC_orig_utf8 = RExC_utf8 = 1;
5897 /* S_concat_pat(): concatenate a list of args to the pattern string pat,
5898 * while recording any code block indices, and handling overloading,
5899 * nested qr// objects etc. If pat is null, it will allocate a new
5900 * string, or just return the first arg, if there's only one.
5902 * Returns the malloced/updated pat.
5903 * patternp and pat_count is the array of SVs to be concatted;
5904 * oplist is the optional list of ops that generated the SVs;
5905 * recompile_p is a pointer to a boolean that will be set if
5906 * the regex will need to be recompiled.
5907 * delim, if non-null is an SV that will be inserted between each element
5911 S_concat_pat(pTHX_ RExC_state_t * const pRExC_state,
5912 SV *pat, SV ** const patternp, int pat_count,
5913 OP *oplist, bool *recompile_p, SV *delim)
5917 bool use_delim = FALSE;
5918 bool alloced = FALSE;
5920 /* if we know we have at least two args, create an empty string,
5921 * then concatenate args to that. For no args, return an empty string */
5922 if (!pat && pat_count != 1) {
5928 for (svp = patternp; svp < patternp + pat_count; svp++) {
5931 STRLEN orig_patlen = 0;
5933 SV *msv = use_delim ? delim : *svp;
5934 if (!msv) msv = &PL_sv_undef;
5936 /* if we've got a delimiter, we go round the loop twice for each
5937 * svp slot (except the last), using the delimiter the second
5946 if (SvTYPE(msv) == SVt_PVAV) {
5947 /* we've encountered an interpolated array within
5948 * the pattern, e.g. /...@a..../. Expand the list of elements,
5949 * then recursively append elements.
5950 * The code in this block is based on S_pushav() */
5952 AV *const av = (AV*)msv;
5953 const SSize_t maxarg = AvFILL(av) + 1;
5957 assert(oplist->op_type == OP_PADAV
5958 || oplist->op_type == OP_RV2AV);
5959 oplist = OpSIBLING(oplist);
5962 if (SvRMAGICAL(av)) {
5965 Newx(array, maxarg, SV*);
5967 for (i=0; i < maxarg; i++) {
5968 SV ** const svp = av_fetch(av, i, FALSE);
5969 array[i] = svp ? *svp : &PL_sv_undef;
5973 array = AvARRAY(av);
5975 pat = S_concat_pat(aTHX_ pRExC_state, pat,
5976 array, maxarg, NULL, recompile_p,
5978 GvSV((gv_fetchpvs("\"", GV_ADDMULTI, SVt_PV))));
5984 /* we make the assumption here that each op in the list of
5985 * op_siblings maps to one SV pushed onto the stack,
5986 * except for code blocks, with have both an OP_NULL and
5988 * This allows us to match up the list of SVs against the
5989 * list of OPs to find the next code block.
5991 * Note that PUSHMARK PADSV PADSV ..
5993 * PADRANGE PADSV PADSV ..
5994 * so the alignment still works. */
5997 if (oplist->op_type == OP_NULL
5998 && (oplist->op_flags & OPf_SPECIAL))
6000 assert(n < pRExC_state->num_code_blocks);
6001 pRExC_state->code_blocks[n].start = pat ? SvCUR(pat) : 0;
6002 pRExC_state->code_blocks[n].block = oplist;
6003 pRExC_state->code_blocks[n].src_regex = NULL;
6006 oplist = OpSIBLING(oplist); /* skip CONST */
6009 oplist = OpSIBLING(oplist);;
6012 /* apply magic and QR overloading to arg */
6015 if (SvROK(msv) && SvAMAGIC(msv)) {
6016 SV *sv = AMG_CALLunary(msv, regexp_amg);
6020 if (SvTYPE(sv) != SVt_REGEXP)
6021 Perl_croak(aTHX_ "Overloaded qr did not return a REGEXP");
6026 /* try concatenation overload ... */
6027 if (pat && (SvAMAGIC(pat) || SvAMAGIC(msv)) &&
6028 (sv = amagic_call(pat, msv, concat_amg, AMGf_assign)))
6031 /* overloading involved: all bets are off over literal
6032 * code. Pretend we haven't seen it */
6033 pRExC_state->num_code_blocks -= n;
6037 /* ... or failing that, try "" overload */
6038 while (SvAMAGIC(msv)
6039 && (sv = AMG_CALLunary(msv, string_amg))
6043 && SvRV(msv) == SvRV(sv))
6048 if (SvROK(msv) && SvTYPE(SvRV(msv)) == SVt_REGEXP)
6052 /* this is a partially unrolled
6053 * sv_catsv_nomg(pat, msv);
6054 * that allows us to adjust code block indices if
6057 char *dst = SvPV_force_nomg(pat, dlen);
6059 if (SvUTF8(msv) && !SvUTF8(pat)) {
6060 S_pat_upgrade_to_utf8(aTHX_ pRExC_state, &dst, &dlen, n);
6061 sv_setpvn(pat, dst, dlen);
6064 sv_catsv_nomg(pat, msv);
6071 pRExC_state->code_blocks[n-1].end = SvCUR(pat)-1;
6074 /* extract any code blocks within any embedded qr//'s */
6075 if (rx && SvTYPE(rx) == SVt_REGEXP
6076 && RX_ENGINE((REGEXP*)rx)->op_comp)
6079 RXi_GET_DECL(ReANY((REGEXP *)rx), ri);
6080 if (ri->num_code_blocks) {
6082 /* the presence of an embedded qr// with code means
6083 * we should always recompile: the text of the
6084 * qr// may not have changed, but it may be a
6085 * different closure than last time */
6087 Renew(pRExC_state->code_blocks,
6088 pRExC_state->num_code_blocks + ri->num_code_blocks,
6089 struct reg_code_block);
6090 pRExC_state->num_code_blocks += ri->num_code_blocks;
6092 for (i=0; i < ri->num_code_blocks; i++) {
6093 struct reg_code_block *src, *dst;
6094 STRLEN offset = orig_patlen
6095 + ReANY((REGEXP *)rx)->pre_prefix;
6096 assert(n < pRExC_state->num_code_blocks);
6097 src = &ri->code_blocks[i];
6098 dst = &pRExC_state->code_blocks[n];
6099 dst->start = src->start + offset;
6100 dst->end = src->end + offset;
6101 dst->block = src->block;
6102 dst->src_regex = (REGEXP*) SvREFCNT_inc( (SV*)
6111 /* avoid calling magic multiple times on a single element e.g. =~ $qr */
6120 /* see if there are any run-time code blocks in the pattern.
6121 * False positives are allowed */
6124 S_has_runtime_code(pTHX_ RExC_state_t * const pRExC_state,
6125 char *pat, STRLEN plen)
6130 PERL_UNUSED_CONTEXT;
6132 for (s = 0; s < plen; s++) {
6133 if (n < pRExC_state->num_code_blocks
6134 && s == pRExC_state->code_blocks[n].start)
6136 s = pRExC_state->code_blocks[n].end;
6140 /* TODO ideally should handle [..], (#..), /#.../x to reduce false
6142 if (pat[s] == '(' && s+2 <= plen && pat[s+1] == '?' &&
6144 || (s + 2 <= plen && pat[s+2] == '?' && pat[s+3] == '{'))
6151 /* Handle run-time code blocks. We will already have compiled any direct
6152 * or indirect literal code blocks. Now, take the pattern 'pat' and make a
6153 * copy of it, but with any literal code blocks blanked out and
6154 * appropriate chars escaped; then feed it into
6156 * eval "qr'modified_pattern'"
6160 * a\bc(?{"this was literal"})def'ghi\\jkl(?{"this is runtime"})mno
6164 * qr'a\\bc_______________________def\'ghi\\\\jkl(?{"this is runtime"})mno'
6166 * After eval_sv()-ing that, grab any new code blocks from the returned qr
6167 * and merge them with any code blocks of the original regexp.
6169 * If the pat is non-UTF8, while the evalled qr is UTF8, don't merge;
6170 * instead, just save the qr and return FALSE; this tells our caller that
6171 * the original pattern needs upgrading to utf8.
6175 S_compile_runtime_code(pTHX_ RExC_state_t * const pRExC_state,
6176 char *pat, STRLEN plen)
6180 GET_RE_DEBUG_FLAGS_DECL;
6182 if (pRExC_state->runtime_code_qr) {
6183 /* this is the second time we've been called; this should
6184 * only happen if the main pattern got upgraded to utf8
6185 * during compilation; re-use the qr we compiled first time
6186 * round (which should be utf8 too)
6188 qr = pRExC_state->runtime_code_qr;
6189 pRExC_state->runtime_code_qr = NULL;
6190 assert(RExC_utf8 && SvUTF8(qr));
6196 int newlen = plen + 6; /* allow for "qr''x\0" extra chars */
6200 /* determine how many extra chars we need for ' and \ escaping */
6201 for (s = 0; s < plen; s++) {
6202 if (pat[s] == '\'' || pat[s] == '\\')
6206 Newx(newpat, newlen, char);
6208 *p++ = 'q'; *p++ = 'r'; *p++ = '\'';
6210 for (s = 0; s < plen; s++) {
6211 if (n < pRExC_state->num_code_blocks
6212 && s == pRExC_state->code_blocks[n].start)
6214 /* blank out literal code block */
6215 assert(pat[s] == '(');
6216 while (s <= pRExC_state->code_blocks[n].end) {
6224 if (pat[s] == '\'' || pat[s] == '\\')
6229 if (pRExC_state->pm_flags & RXf_PMf_EXTENDED)
6233 PerlIO_printf(Perl_debug_log,
6234 "%sre-parsing pattern for runtime code:%s %s\n",
6235 PL_colors[4],PL_colors[5],newpat);
6238 sv = newSVpvn_flags(newpat, p-newpat-1, RExC_utf8 ? SVf_UTF8 : 0);
6244 PUSHSTACKi(PERLSI_REQUIRE);
6245 /* G_RE_REPARSING causes the toker to collapse \\ into \ when
6246 * parsing qr''; normally only q'' does this. It also alters
6248 eval_sv(sv, G_SCALAR|G_RE_REPARSING);
6249 SvREFCNT_dec_NN(sv);
6254 SV * const errsv = ERRSV;
6255 if (SvTRUE_NN(errsv))
6257 Safefree(pRExC_state->code_blocks);
6258 /* use croak_sv ? */
6259 Perl_croak_nocontext("%"SVf, SVfARG(errsv));
6262 assert(SvROK(qr_ref));
6264 assert(SvTYPE(qr) == SVt_REGEXP && RX_ENGINE((REGEXP*)qr)->op_comp);
6265 /* the leaving below frees the tmp qr_ref.
6266 * Give qr a life of its own */
6274 if (!RExC_utf8 && SvUTF8(qr)) {
6275 /* first time through; the pattern got upgraded; save the
6276 * qr for the next time through */
6277 assert(!pRExC_state->runtime_code_qr);
6278 pRExC_state->runtime_code_qr = qr;
6283 /* extract any code blocks within the returned qr// */
6286 /* merge the main (r1) and run-time (r2) code blocks into one */
6288 RXi_GET_DECL(ReANY((REGEXP *)qr), r2);
6289 struct reg_code_block *new_block, *dst;
6290 RExC_state_t * const r1 = pRExC_state; /* convenient alias */
6293 if (!r2->num_code_blocks) /* we guessed wrong */
6295 SvREFCNT_dec_NN(qr);
6300 r1->num_code_blocks + r2->num_code_blocks,
6301 struct reg_code_block);
6304 while ( i1 < r1->num_code_blocks
6305 || i2 < r2->num_code_blocks)
6307 struct reg_code_block *src;
6310 if (i1 == r1->num_code_blocks) {
6311 src = &r2->code_blocks[i2++];
6314 else if (i2 == r2->num_code_blocks)
6315 src = &r1->code_blocks[i1++];
6316 else if ( r1->code_blocks[i1].start
6317 < r2->code_blocks[i2].start)
6319 src = &r1->code_blocks[i1++];
6320 assert(src->end < r2->code_blocks[i2].start);
6323 assert( r1->code_blocks[i1].start
6324 > r2->code_blocks[i2].start);
6325 src = &r2->code_blocks[i2++];
6327 assert(src->end < r1->code_blocks[i1].start);
6330 assert(pat[src->start] == '(');
6331 assert(pat[src->end] == ')');
6332 dst->start = src->start;
6333 dst->end = src->end;
6334 dst->block = src->block;
6335 dst->src_regex = is_qr ? (REGEXP*) SvREFCNT_inc( (SV*) qr)
6339 r1->num_code_blocks += r2->num_code_blocks;
6340 Safefree(r1->code_blocks);
6341 r1->code_blocks = new_block;
6344 SvREFCNT_dec_NN(qr);
6350 S_setup_longest(pTHX_ RExC_state_t *pRExC_state, SV* sv_longest,
6351 SV** rx_utf8, SV** rx_substr, SSize_t* rx_end_shift,
6352 SSize_t lookbehind, SSize_t offset, SSize_t *minlen,
6353 STRLEN longest_length, bool eol, bool meol)
6355 /* This is the common code for setting up the floating and fixed length
6356 * string data extracted from Perl_re_op_compile() below. Returns a boolean
6357 * as to whether succeeded or not */
6362 if (! (longest_length
6363 || (eol /* Can't have SEOL and MULTI */
6364 && (! meol || (RExC_flags & RXf_PMf_MULTILINE)))
6366 /* See comments for join_exact for why REG_UNFOLDED_MULTI_SEEN */
6367 || (RExC_seen & REG_UNFOLDED_MULTI_SEEN))
6372 /* copy the information about the longest from the reg_scan_data
6373 over to the program. */
6374 if (SvUTF8(sv_longest)) {
6375 *rx_utf8 = sv_longest;
6378 *rx_substr = sv_longest;
6381 /* end_shift is how many chars that must be matched that
6382 follow this item. We calculate it ahead of time as once the
6383 lookbehind offset is added in we lose the ability to correctly
6385 ml = minlen ? *(minlen) : (SSize_t)longest_length;
6386 *rx_end_shift = ml - offset
6387 - longest_length + (SvTAIL(sv_longest) != 0)
6390 t = (eol/* Can't have SEOL and MULTI */
6391 && (! meol || (RExC_flags & RXf_PMf_MULTILINE)));
6392 fbm_compile(sv_longest, t ? FBMcf_TAIL : 0);
6398 * Perl_re_op_compile - the perl internal RE engine's function to compile a
6399 * regular expression into internal code.
6400 * The pattern may be passed either as:
6401 * a list of SVs (patternp plus pat_count)
6402 * a list of OPs (expr)
6403 * If both are passed, the SV list is used, but the OP list indicates
6404 * which SVs are actually pre-compiled code blocks
6406 * The SVs in the list have magic and qr overloading applied to them (and
6407 * the list may be modified in-place with replacement SVs in the latter
6410 * If the pattern hasn't changed from old_re, then old_re will be
6413 * eng is the current engine. If that engine has an op_comp method, then
6414 * handle directly (i.e. we assume that op_comp was us); otherwise, just
6415 * do the initial concatenation of arguments and pass on to the external
6418 * If is_bare_re is not null, set it to a boolean indicating whether the
6419 * arg list reduced (after overloading) to a single bare regex which has
6420 * been returned (i.e. /$qr/).
6422 * orig_rx_flags contains RXf_* flags. See perlreapi.pod for more details.
6424 * pm_flags contains the PMf_* flags, typically based on those from the
6425 * pm_flags field of the related PMOP. Currently we're only interested in
6426 * PMf_HAS_CV, PMf_IS_QR, PMf_USE_RE_EVAL.
6428 * We can't allocate space until we know how big the compiled form will be,
6429 * but we can't compile it (and thus know how big it is) until we've got a
6430 * place to put the code. So we cheat: we compile it twice, once with code
6431 * generation turned off and size counting turned on, and once "for real".
6432 * This also means that we don't allocate space until we are sure that the
6433 * thing really will compile successfully, and we never have to move the
6434 * code and thus invalidate pointers into it. (Note that it has to be in
6435 * one piece because free() must be able to free it all.) [NB: not true in perl]
6437 * Beware that the optimization-preparation code in here knows about some
6438 * of the structure of the compiled regexp. [I'll say.]
6442 Perl_re_op_compile(pTHX_ SV ** const patternp, int pat_count,
6443 OP *expr, const regexp_engine* eng, REGEXP *old_re,
6444 bool *is_bare_re, U32 orig_rx_flags, U32 pm_flags)
6448 regexp_internal *ri;
6456 SV *code_blocksv = NULL;
6457 SV** new_patternp = patternp;
6459 /* these are all flags - maybe they should be turned
6460 * into a single int with different bit masks */
6461 I32 sawlookahead = 0;
6466 regex_charset initial_charset = get_regex_charset(orig_rx_flags);
6468 bool runtime_code = 0;
6470 RExC_state_t RExC_state;
6471 RExC_state_t * const pRExC_state = &RExC_state;
6472 #ifdef TRIE_STUDY_OPT
6474 RExC_state_t copyRExC_state;
6476 GET_RE_DEBUG_FLAGS_DECL;
6478 PERL_ARGS_ASSERT_RE_OP_COMPILE;
6480 DEBUG_r(if (!PL_colorset) reginitcolors());
6482 /* Initialize these here instead of as-needed, as is quick and avoids
6483 * having to test them each time otherwise */
6484 if (! PL_AboveLatin1) {
6485 PL_AboveLatin1 = _new_invlist_C_array(AboveLatin1_invlist);
6486 PL_Latin1 = _new_invlist_C_array(Latin1_invlist);
6487 PL_UpperLatin1 = _new_invlist_C_array(UpperLatin1_invlist);
6488 PL_utf8_foldable = _new_invlist_C_array(_Perl_Any_Folds_invlist);
6489 PL_HasMultiCharFold =
6490 _new_invlist_C_array(_Perl_Folds_To_Multi_Char_invlist);
6492 /* This is calculated here, because the Perl program that generates the
6493 * static global ones doesn't currently have access to
6494 * NUM_ANYOF_CODE_POINTS */
6495 PL_InBitmap = _new_invlist(2);
6496 PL_InBitmap = _add_range_to_invlist(PL_InBitmap, 0,
6497 NUM_ANYOF_CODE_POINTS - 1);
6500 pRExC_state->code_blocks = NULL;
6501 pRExC_state->num_code_blocks = 0;
6504 *is_bare_re = FALSE;
6506 if (expr && (expr->op_type == OP_LIST ||
6507 (expr->op_type == OP_NULL && expr->op_targ == OP_LIST))) {
6508 /* allocate code_blocks if needed */
6512 for (o = cLISTOPx(expr)->op_first; o; o = OpSIBLING(o))
6513 if (o->op_type == OP_NULL && (o->op_flags & OPf_SPECIAL))
6514 ncode++; /* count of DO blocks */
6516 pRExC_state->num_code_blocks = ncode;
6517 Newx(pRExC_state->code_blocks, ncode, struct reg_code_block);
6522 /* compile-time pattern with just OP_CONSTs and DO blocks */
6527 /* find how many CONSTs there are */
6530 if (expr->op_type == OP_CONST)
6533 for (o = cLISTOPx(expr)->op_first; o; o = OpSIBLING(o)) {
6534 if (o->op_type == OP_CONST)
6538 /* fake up an SV array */
6540 assert(!new_patternp);
6541 Newx(new_patternp, n, SV*);
6542 SAVEFREEPV(new_patternp);
6546 if (expr->op_type == OP_CONST)
6547 new_patternp[n] = cSVOPx_sv(expr);
6549 for (o = cLISTOPx(expr)->op_first; o; o = OpSIBLING(o)) {
6550 if (o->op_type == OP_CONST)
6551 new_patternp[n++] = cSVOPo_sv;
6556 DEBUG_PARSE_r(PerlIO_printf(Perl_debug_log,
6557 "Assembling pattern from %d elements%s\n", pat_count,
6558 orig_rx_flags & RXf_SPLIT ? " for split" : ""));
6560 /* set expr to the first arg op */
6562 if (pRExC_state->num_code_blocks
6563 && expr->op_type != OP_CONST)
6565 expr = cLISTOPx(expr)->op_first;
6566 assert( expr->op_type == OP_PUSHMARK
6567 || (expr->op_type == OP_NULL && expr->op_targ == OP_PUSHMARK)
6568 || expr->op_type == OP_PADRANGE);
6569 expr = OpSIBLING(expr);
6572 pat = S_concat_pat(aTHX_ pRExC_state, NULL, new_patternp, pat_count,
6573 expr, &recompile, NULL);
6575 /* handle bare (possibly after overloading) regex: foo =~ $re */
6580 if (SvTYPE(re) == SVt_REGEXP) {
6584 Safefree(pRExC_state->code_blocks);
6585 DEBUG_PARSE_r(PerlIO_printf(Perl_debug_log,
6586 "Precompiled pattern%s\n",
6587 orig_rx_flags & RXf_SPLIT ? " for split" : ""));
6593 exp = SvPV_nomg(pat, plen);
6595 if (!eng->op_comp) {
6596 if ((SvUTF8(pat) && IN_BYTES)
6597 || SvGMAGICAL(pat) || SvAMAGIC(pat))
6599 /* make a temporary copy; either to convert to bytes,
6600 * or to avoid repeating get-magic / overloaded stringify */
6601 pat = newSVpvn_flags(exp, plen, SVs_TEMP |
6602 (IN_BYTES ? 0 : SvUTF8(pat)));
6604 Safefree(pRExC_state->code_blocks);
6605 return CALLREGCOMP_ENG(eng, pat, orig_rx_flags);
6608 /* ignore the utf8ness if the pattern is 0 length */
6609 RExC_utf8 = RExC_orig_utf8 = (plen == 0 || IN_BYTES) ? 0 : SvUTF8(pat);
6611 RExC_uni_semantics = 0;
6612 RExC_seen_unfolded_sharp_s = 0;
6613 RExC_contains_locale = 0;
6614 RExC_contains_i = 0;
6615 RExC_strict = cBOOL(pm_flags & RXf_PMf_STRICT);
6616 pRExC_state->runtime_code_qr = NULL;
6617 RExC_frame_head= NULL;
6618 RExC_frame_last= NULL;
6619 RExC_frame_count= 0;
6622 RExC_mysv1= sv_newmortal();
6623 RExC_mysv2= sv_newmortal();
6626 SV *dsv= sv_newmortal();
6627 RE_PV_QUOTED_DECL(s, RExC_utf8, dsv, exp, plen, 60);
6628 PerlIO_printf(Perl_debug_log, "%sCompiling REx%s %s\n",
6629 PL_colors[4],PL_colors[5],s);
6633 /* we jump here if we have to recompile, e.g., from upgrading the pattern
6636 if ((pm_flags & PMf_USE_RE_EVAL)
6637 /* this second condition covers the non-regex literal case,
6638 * i.e. $foo =~ '(?{})'. */
6639 || (IN_PERL_COMPILETIME && (PL_hints & HINT_RE_EVAL))
6641 runtime_code = S_has_runtime_code(aTHX_ pRExC_state, exp, plen);
6643 /* return old regex if pattern hasn't changed */
6644 /* XXX: note in the below we have to check the flags as well as the
6647 * Things get a touch tricky as we have to compare the utf8 flag
6648 * independently from the compile flags. */
6652 && !!RX_UTF8(old_re) == !!RExC_utf8
6653 && ( RX_COMPFLAGS(old_re) == ( orig_rx_flags & RXf_PMf_FLAGCOPYMASK ) )
6654 && RX_PRECOMP(old_re)
6655 && RX_PRELEN(old_re) == plen
6656 && memEQ(RX_PRECOMP(old_re), exp, plen)
6657 && !runtime_code /* with runtime code, always recompile */ )
6659 Safefree(pRExC_state->code_blocks);
6663 rx_flags = orig_rx_flags;
6665 if (rx_flags & PMf_FOLD) {
6666 RExC_contains_i = 1;
6668 if ( initial_charset == REGEX_DEPENDS_CHARSET
6669 && (RExC_utf8 ||RExC_uni_semantics))
6672 /* Set to use unicode semantics if the pattern is in utf8 and has the
6673 * 'depends' charset specified, as it means unicode when utf8 */
6674 set_regex_charset(&rx_flags, REGEX_UNICODE_CHARSET);
6678 RExC_flags = rx_flags;
6679 RExC_pm_flags = pm_flags;
6682 if (TAINTING_get && TAINT_get)
6683 Perl_croak(aTHX_ "Eval-group in insecure regular expression");
6685 if (!S_compile_runtime_code(aTHX_ pRExC_state, exp, plen)) {
6686 /* whoops, we have a non-utf8 pattern, whilst run-time code
6687 * got compiled as utf8. Try again with a utf8 pattern */
6688 S_pat_upgrade_to_utf8(aTHX_ pRExC_state, &exp, &plen,
6689 pRExC_state->num_code_blocks);
6690 goto redo_first_pass;
6693 assert(!pRExC_state->runtime_code_qr);
6699 RExC_in_lookbehind = 0;
6700 RExC_seen_zerolen = *exp == '^' ? -1 : 0;
6702 RExC_override_recoding = 0;
6704 RExC_recode_x_to_native = 0;
6706 RExC_in_multi_char_class = 0;
6708 /* First pass: determine size, legality. */
6711 RExC_end = exp + plen;
6716 RExC_emit = (regnode *) &RExC_emit_dummy;
6717 RExC_whilem_seen = 0;
6718 RExC_open_parens = NULL;
6719 RExC_close_parens = NULL;
6721 RExC_paren_names = NULL;
6723 RExC_paren_name_list = NULL;
6725 RExC_recurse = NULL;
6726 RExC_study_chunk_recursed = NULL;
6727 RExC_study_chunk_recursed_bytes= 0;
6728 RExC_recurse_count = 0;
6729 pRExC_state->code_index = 0;
6732 PerlIO_printf(Perl_debug_log, "Starting first pass (sizing)\n");
6734 RExC_lastparse=NULL;
6736 /* reg may croak on us, not giving us a chance to free
6737 pRExC_state->code_blocks. We cannot SAVEFREEPV it now, as we may
6738 need it to survive as long as the regexp (qr/(?{})/).
6739 We must check that code_blocksv is not already set, because we may
6740 have jumped back to restart the sizing pass. */
6741 if (pRExC_state->code_blocks && !code_blocksv) {
6742 code_blocksv = newSV_type(SVt_PV);
6743 SAVEFREESV(code_blocksv);
6744 SvPV_set(code_blocksv, (char *)pRExC_state->code_blocks);
6745 SvLEN_set(code_blocksv, 1); /*sufficient to make sv_clear free it*/
6747 if (reg(pRExC_state, 0, &flags,1) == NULL) {
6748 /* It's possible to write a regexp in ascii that represents Unicode
6749 codepoints outside of the byte range, such as via \x{100}. If we
6750 detect such a sequence we have to convert the entire pattern to utf8
6751 and then recompile, as our sizing calculation will have been based
6752 on 1 byte == 1 character, but we will need to use utf8 to encode
6753 at least some part of the pattern, and therefore must convert the whole
6756 if (flags & RESTART_PASS1) {
6757 if (flags & NEED_UTF8) {
6758 S_pat_upgrade_to_utf8(aTHX_ pRExC_state, &exp, &plen,
6759 pRExC_state->num_code_blocks);
6762 DEBUG_PARSE_r(PerlIO_printf(Perl_debug_log,
6763 "Need to redo pass 1\n"));
6766 goto redo_first_pass;
6768 Perl_croak(aTHX_ "panic: reg returned NULL to re_op_compile for sizing pass, flags=%#"UVxf"", (UV) flags);
6771 SvLEN_set(code_blocksv,0); /* no you can't have it, sv_clear */
6774 PerlIO_printf(Perl_debug_log,
6775 "Required size %"IVdf" nodes\n"
6776 "Starting second pass (creation)\n",
6779 RExC_lastparse=NULL;
6782 /* The first pass could have found things that force Unicode semantics */
6783 if ((RExC_utf8 || RExC_uni_semantics)
6784 && get_regex_charset(rx_flags) == REGEX_DEPENDS_CHARSET)
6786 set_regex_charset(&rx_flags, REGEX_UNICODE_CHARSET);
6789 /* Small enough for pointer-storage convention?
6790 If extralen==0, this means that we will not need long jumps. */
6791 if (RExC_size >= 0x10000L && RExC_extralen)
6792 RExC_size += RExC_extralen;
6795 if (RExC_whilem_seen > 15)
6796 RExC_whilem_seen = 15;
6798 /* Allocate space and zero-initialize. Note, the two step process
6799 of zeroing when in debug mode, thus anything assigned has to
6800 happen after that */
6801 rx = (REGEXP*) newSV_type(SVt_REGEXP);
6803 Newxc(ri, sizeof(regexp_internal) + (unsigned)RExC_size * sizeof(regnode),
6804 char, regexp_internal);
6805 if ( r == NULL || ri == NULL )
6806 FAIL("Regexp out of space");
6808 /* avoid reading uninitialized memory in DEBUGGING code in study_chunk() */
6809 Zero(ri, sizeof(regexp_internal) + (unsigned)RExC_size * sizeof(regnode),
6812 /* bulk initialize base fields with 0. */
6813 Zero(ri, sizeof(regexp_internal), char);
6816 /* non-zero initialization begins here */
6819 r->extflags = rx_flags;
6820 RXp_COMPFLAGS(r) = orig_rx_flags & RXf_PMf_FLAGCOPYMASK;
6822 if (pm_flags & PMf_IS_QR) {
6823 ri->code_blocks = pRExC_state->code_blocks;
6824 ri->num_code_blocks = pRExC_state->num_code_blocks;
6829 for (n = 0; n < pRExC_state->num_code_blocks; n++)
6830 if (pRExC_state->code_blocks[n].src_regex)
6831 SAVEFREESV(pRExC_state->code_blocks[n].src_regex);
6832 if(pRExC_state->code_blocks)
6833 SAVEFREEPV(pRExC_state->code_blocks); /* often null */
6837 bool has_p = ((r->extflags & RXf_PMf_KEEPCOPY) == RXf_PMf_KEEPCOPY);
6838 bool has_charset = (get_regex_charset(r->extflags)
6839 != REGEX_DEPENDS_CHARSET);
6841 /* The caret is output if there are any defaults: if not all the STD
6842 * flags are set, or if no character set specifier is needed */
6844 (((r->extflags & RXf_PMf_STD_PMMOD) != RXf_PMf_STD_PMMOD)
6846 bool has_runon = ((RExC_seen & REG_RUN_ON_COMMENT_SEEN)
6847 == REG_RUN_ON_COMMENT_SEEN);
6848 U8 reganch = (U8)((r->extflags & RXf_PMf_STD_PMMOD)
6849 >> RXf_PMf_STD_PMMOD_SHIFT);
6850 const char *fptr = STD_PAT_MODS; /*"msixn"*/
6853 /* We output all the necessary flags; we never output a minus, as all
6854 * those are defaults, so are
6855 * covered by the caret */
6856 const STRLEN wraplen = plen + has_p + has_runon
6857 + has_default /* If needs a caret */
6858 + PL_bitcount[reganch] /* 1 char for each set standard flag */
6860 /* If needs a character set specifier */
6861 + ((has_charset) ? MAX_CHARSET_NAME_LENGTH : 0)
6862 + (sizeof("(?:)") - 1);
6864 /* make sure PL_bitcount bounds not exceeded */
6865 assert(sizeof(STD_PAT_MODS) <= 8);
6867 Newx(p, wraplen + 1, char); /* +1 for the ending NUL */
6868 r->xpv_len_u.xpvlenu_pv = p;
6870 SvFLAGS(rx) |= SVf_UTF8;
6873 /* If a default, cover it using the caret */
6875 *p++= DEFAULT_PAT_MOD;
6879 const char* const name = get_regex_charset_name(r->extflags, &len);
6880 Copy(name, p, len, char);
6884 *p++ = KEEPCOPY_PAT_MOD; /*'p'*/
6887 while((ch = *fptr++)) {
6895 Copy(RExC_precomp, p, plen, char);
6896 assert ((RX_WRAPPED(rx) - p) < 16);
6897 r->pre_prefix = p - RX_WRAPPED(rx);
6903 SvCUR_set(rx, p - RX_WRAPPED(rx));
6907 r->nparens = RExC_npar - 1; /* set early to validate backrefs */
6909 /* setup various meta data about recursion, this all requires
6910 * RExC_npar to be correctly set, and a bit later on we clear it */
6911 if (RExC_seen & REG_RECURSE_SEEN) {
6912 Newxz(RExC_open_parens, RExC_npar,regnode *);
6913 SAVEFREEPV(RExC_open_parens);
6914 Newxz(RExC_close_parens,RExC_npar,regnode *);
6915 SAVEFREEPV(RExC_close_parens);
6917 if (RExC_seen & (REG_RECURSE_SEEN | REG_GOSTART_SEEN)) {
6918 /* Note, RExC_npar is 1 + the number of parens in a pattern.
6919 * So its 1 if there are no parens. */
6920 RExC_study_chunk_recursed_bytes= (RExC_npar >> 3) +
6921 ((RExC_npar & 0x07) != 0);
6922 Newx(RExC_study_chunk_recursed,
6923 RExC_study_chunk_recursed_bytes * RExC_npar, U8);
6924 SAVEFREEPV(RExC_study_chunk_recursed);
6927 /* Useful during FAIL. */
6928 #ifdef RE_TRACK_PATTERN_OFFSETS
6929 Newxz(ri->u.offsets, 2*RExC_size+1, U32); /* MJD 20001228 */
6930 DEBUG_OFFSETS_r(PerlIO_printf(Perl_debug_log,
6931 "%s %"UVuf" bytes for offset annotations.\n",
6932 ri->u.offsets ? "Got" : "Couldn't get",
6933 (UV)((2*RExC_size+1) * sizeof(U32))));
6935 SetProgLen(ri,RExC_size);
6940 /* Second pass: emit code. */
6941 RExC_flags = rx_flags; /* don't let top level (?i) bleed */
6942 RExC_pm_flags = pm_flags;
6944 RExC_end = exp + plen;
6947 RExC_emit_start = ri->program;
6948 RExC_emit = ri->program;
6949 RExC_emit_bound = ri->program + RExC_size + 1;
6950 pRExC_state->code_index = 0;
6952 *((char*) RExC_emit++) = (char) REG_MAGIC;
6953 if (reg(pRExC_state, 0, &flags,1) == NULL) {
6955 Perl_croak(aTHX_ "panic: reg returned NULL to re_op_compile for generation pass, flags=%#"UVxf"", (UV) flags);
6957 /* XXXX To minimize changes to RE engine we always allocate
6958 3-units-long substrs field. */
6959 Newx(r->substrs, 1, struct reg_substr_data);
6960 if (RExC_recurse_count) {
6961 Newxz(RExC_recurse,RExC_recurse_count,regnode *);
6962 SAVEFREEPV(RExC_recurse);
6966 r->minlen = minlen = sawlookahead = sawplus = sawopen = sawminmod = 0;
6968 RExC_study_chunk_recursed_count= 0;
6970 Zero(r->substrs, 1, struct reg_substr_data);
6971 if (RExC_study_chunk_recursed) {
6972 Zero(RExC_study_chunk_recursed,
6973 RExC_study_chunk_recursed_bytes * RExC_npar, U8);
6977 #ifdef TRIE_STUDY_OPT
6979 StructCopy(&zero_scan_data, &data, scan_data_t);
6980 copyRExC_state = RExC_state;
6983 DEBUG_OPTIMISE_r(PerlIO_printf(Perl_debug_log,"Restudying\n"));
6985 RExC_state = copyRExC_state;
6986 if (seen & REG_TOP_LEVEL_BRANCHES_SEEN)
6987 RExC_seen |= REG_TOP_LEVEL_BRANCHES_SEEN;
6989 RExC_seen &= ~REG_TOP_LEVEL_BRANCHES_SEEN;
6990 StructCopy(&zero_scan_data, &data, scan_data_t);
6993 StructCopy(&zero_scan_data, &data, scan_data_t);
6996 /* Dig out information for optimizations. */
6997 r->extflags = RExC_flags; /* was pm_op */
6998 /*dmq: removed as part of de-PMOP: pm->op_pmflags = RExC_flags; */
7001 SvUTF8_on(rx); /* Unicode in it? */
7002 ri->regstclass = NULL;
7003 if (RExC_naughty >= TOO_NAUGHTY) /* Probably an expensive pattern. */
7004 r->intflags |= PREGf_NAUGHTY;
7005 scan = ri->program + 1; /* First BRANCH. */
7007 /* testing for BRANCH here tells us whether there is "must appear"
7008 data in the pattern. If there is then we can use it for optimisations */
7009 if (!(RExC_seen & REG_TOP_LEVEL_BRANCHES_SEEN)) { /* Only one top-level choice.
7012 STRLEN longest_float_length, longest_fixed_length;
7013 regnode_ssc ch_class; /* pointed to by data */
7015 SSize_t last_close = 0; /* pointed to by data */
7016 regnode *first= scan;
7017 regnode *first_next= regnext(first);
7019 * Skip introductions and multiplicators >= 1
7020 * so that we can extract the 'meat' of the pattern that must
7021 * match in the large if() sequence following.
7022 * NOTE that EXACT is NOT covered here, as it is normally
7023 * picked up by the optimiser separately.
7025 * This is unfortunate as the optimiser isnt handling lookahead
7026 * properly currently.
7029 while ((OP(first) == OPEN && (sawopen = 1)) ||
7030 /* An OR of *one* alternative - should not happen now. */
7031 (OP(first) == BRANCH && OP(first_next) != BRANCH) ||
7032 /* for now we can't handle lookbehind IFMATCH*/
7033 (OP(first) == IFMATCH && !first->flags && (sawlookahead = 1)) ||
7034 (OP(first) == PLUS) ||
7035 (OP(first) == MINMOD) ||
7036 /* An {n,m} with n>0 */
7037 (PL_regkind[OP(first)] == CURLY && ARG1(first) > 0) ||
7038 (OP(first) == NOTHING && PL_regkind[OP(first_next)] != END ))
7041 * the only op that could be a regnode is PLUS, all the rest
7042 * will be regnode_1 or regnode_2.
7044 * (yves doesn't think this is true)
7046 if (OP(first) == PLUS)
7049 if (OP(first) == MINMOD)
7051 first += regarglen[OP(first)];
7053 first = NEXTOPER(first);
7054 first_next= regnext(first);
7057 /* Starting-point info. */
7059 DEBUG_PEEP("first:",first,0);
7060 /* Ignore EXACT as we deal with it later. */
7061 if (PL_regkind[OP(first)] == EXACT) {
7062 if (OP(first) == EXACT || OP(first) == EXACTL)
7063 NOOP; /* Empty, get anchored substr later. */
7065 ri->regstclass = first;
7068 else if (PL_regkind[OP(first)] == TRIE &&
7069 ((reg_trie_data *)ri->data->data[ ARG(first) ])->minlen>0)
7071 /* this can happen only on restudy */
7072 ri->regstclass = construct_ahocorasick_from_trie(pRExC_state, (regnode *)first, 0);
7075 else if (REGNODE_SIMPLE(OP(first)))
7076 ri->regstclass = first;
7077 else if (PL_regkind[OP(first)] == BOUND ||
7078 PL_regkind[OP(first)] == NBOUND)
7079 ri->regstclass = first;
7080 else if (PL_regkind[OP(first)] == BOL) {
7081 r->intflags |= (OP(first) == MBOL
7084 first = NEXTOPER(first);
7087 else if (OP(first) == GPOS) {
7088 r->intflags |= PREGf_ANCH_GPOS;
7089 first = NEXTOPER(first);
7092 else if ((!sawopen || !RExC_sawback) &&
7094 (OP(first) == STAR &&
7095 PL_regkind[OP(NEXTOPER(first))] == REG_ANY) &&
7096 !(r->intflags & PREGf_ANCH) && !pRExC_state->num_code_blocks)
7098 /* turn .* into ^.* with an implied $*=1 */
7100 (OP(NEXTOPER(first)) == REG_ANY)
7103 r->intflags |= (type | PREGf_IMPLICIT);
7104 first = NEXTOPER(first);
7107 if (sawplus && !sawminmod && !sawlookahead
7108 && (!sawopen || !RExC_sawback)
7109 && !pRExC_state->num_code_blocks) /* May examine pos and $& */
7110 /* x+ must match at the 1st pos of run of x's */
7111 r->intflags |= PREGf_SKIP;
7113 /* Scan is after the zeroth branch, first is atomic matcher. */
7114 #ifdef TRIE_STUDY_OPT
7117 PerlIO_printf(Perl_debug_log, "first at %"IVdf"\n",
7118 (IV)(first - scan + 1))
7122 PerlIO_printf(Perl_debug_log, "first at %"IVdf"\n",
7123 (IV)(first - scan + 1))
7129 * If there's something expensive in the r.e., find the
7130 * longest literal string that must appear and make it the
7131 * regmust. Resolve ties in favor of later strings, since
7132 * the regstart check works with the beginning of the r.e.
7133 * and avoiding duplication strengthens checking. Not a
7134 * strong reason, but sufficient in the absence of others.
7135 * [Now we resolve ties in favor of the earlier string if
7136 * it happens that c_offset_min has been invalidated, since the
7137 * earlier string may buy us something the later one won't.]
7140 data.longest_fixed = newSVpvs("");
7141 data.longest_float = newSVpvs("");
7142 data.last_found = newSVpvs("");
7143 data.longest = &(data.longest_fixed);
7144 ENTER_with_name("study_chunk");
7145 SAVEFREESV(data.longest_fixed);
7146 SAVEFREESV(data.longest_float);
7147 SAVEFREESV(data.last_found);
7149 if (!ri->regstclass) {
7150 ssc_init(pRExC_state, &ch_class);
7151 data.start_class = &ch_class;
7152 stclass_flag = SCF_DO_STCLASS_AND;
7153 } else /* XXXX Check for BOUND? */
7155 data.last_closep = &last_close;
7158 minlen = study_chunk(pRExC_state, &first, &minlen, &fake,
7159 scan + RExC_size, /* Up to end */
7161 SCF_DO_SUBSTR | SCF_WHILEM_VISITED_POS | stclass_flag
7162 | (restudied ? SCF_TRIE_DOING_RESTUDY : 0),
7166 CHECK_RESTUDY_GOTO_butfirst(LEAVE_with_name("study_chunk"));
7169 if ( RExC_npar == 1 && data.longest == &(data.longest_fixed)
7170 && data.last_start_min == 0 && data.last_end > 0
7171 && !RExC_seen_zerolen
7172 && !(RExC_seen & REG_VERBARG_SEEN)
7173 && !(RExC_seen & REG_GPOS_SEEN)
7175 r->extflags |= RXf_CHECK_ALL;
7177 scan_commit(pRExC_state, &data,&minlen,0);
7179 longest_float_length = CHR_SVLEN(data.longest_float);
7181 if (! ((SvCUR(data.longest_fixed) /* ok to leave SvCUR */
7182 && data.offset_fixed == data.offset_float_min
7183 && SvCUR(data.longest_fixed) == SvCUR(data.longest_float)))
7184 && S_setup_longest (aTHX_ pRExC_state,
7188 &(r->float_end_shift),
7189 data.lookbehind_float,
7190 data.offset_float_min,
7192 longest_float_length,
7193 cBOOL(data.flags & SF_FL_BEFORE_EOL),
7194 cBOOL(data.flags & SF_FL_BEFORE_MEOL)))
7196 r->float_min_offset = data.offset_float_min - data.lookbehind_float;
7197 r->float_max_offset = data.offset_float_max;
7198 if (data.offset_float_max < SSize_t_MAX) /* Don't offset infinity */
7199 r->float_max_offset -= data.lookbehind_float;
7200 SvREFCNT_inc_simple_void_NN(data.longest_float);
7203 r->float_substr = r->float_utf8 = NULL;
7204 longest_float_length = 0;
7207 longest_fixed_length = CHR_SVLEN(data.longest_fixed);
7209 if (S_setup_longest (aTHX_ pRExC_state,
7211 &(r->anchored_utf8),
7212 &(r->anchored_substr),
7213 &(r->anchored_end_shift),
7214 data.lookbehind_fixed,
7217 longest_fixed_length,
7218 cBOOL(data.flags & SF_FIX_BEFORE_EOL),
7219 cBOOL(data.flags & SF_FIX_BEFORE_MEOL)))
7221 r->anchored_offset = data.offset_fixed - data.lookbehind_fixed;
7222 SvREFCNT_inc_simple_void_NN(data.longest_fixed);
7225 r->anchored_substr = r->anchored_utf8 = NULL;
7226 longest_fixed_length = 0;
7228 LEAVE_with_name("study_chunk");
7231 && (OP(ri->regstclass) == REG_ANY || OP(ri->regstclass) == SANY))
7232 ri->regstclass = NULL;
7234 if ((!(r->anchored_substr || r->anchored_utf8) || r->anchored_offset)
7236 && ! (ANYOF_FLAGS(data.start_class) & SSC_MATCHES_EMPTY_STRING)
7237 && is_ssc_worth_it(pRExC_state, data.start_class))
7239 const U32 n = add_data(pRExC_state, STR_WITH_LEN("f"));
7241 ssc_finalize(pRExC_state, data.start_class);
7243 Newx(RExC_rxi->data->data[n], 1, regnode_ssc);
7244 StructCopy(data.start_class,
7245 (regnode_ssc*)RExC_rxi->data->data[n],
7247 ri->regstclass = (regnode*)RExC_rxi->data->data[n];
7248 r->intflags &= ~PREGf_SKIP; /* Used in find_byclass(). */
7249 DEBUG_COMPILE_r({ SV *sv = sv_newmortal();
7250 regprop(r, sv, (regnode*)data.start_class, NULL, pRExC_state);
7251 PerlIO_printf(Perl_debug_log,
7252 "synthetic stclass \"%s\".\n",
7253 SvPVX_const(sv));});
7254 data.start_class = NULL;
7257 /* A temporary algorithm prefers floated substr to fixed one to dig
7259 if (longest_fixed_length > longest_float_length) {
7260 r->substrs->check_ix = 0;
7261 r->check_end_shift = r->anchored_end_shift;
7262 r->check_substr = r->anchored_substr;
7263 r->check_utf8 = r->anchored_utf8;
7264 r->check_offset_min = r->check_offset_max = r->anchored_offset;
7265 if (r->intflags & (PREGf_ANCH_SBOL|PREGf_ANCH_GPOS))
7266 r->intflags |= PREGf_NOSCAN;
7269 r->substrs->check_ix = 1;
7270 r->check_end_shift = r->float_end_shift;
7271 r->check_substr = r->float_substr;
7272 r->check_utf8 = r->float_utf8;
7273 r->check_offset_min = r->float_min_offset;
7274 r->check_offset_max = r->float_max_offset;
7276 if ((r->check_substr || r->check_utf8) ) {
7277 r->extflags |= RXf_USE_INTUIT;
7278 if (SvTAIL(r->check_substr ? r->check_substr : r->check_utf8))
7279 r->extflags |= RXf_INTUIT_TAIL;
7281 r->substrs->data[0].max_offset = r->substrs->data[0].min_offset;
7283 /* XXX Unneeded? dmq (shouldn't as this is handled elsewhere)
7284 if ( (STRLEN)minlen < longest_float_length )
7285 minlen= longest_float_length;
7286 if ( (STRLEN)minlen < longest_fixed_length )
7287 minlen= longest_fixed_length;
7291 /* Several toplevels. Best we can is to set minlen. */
7293 regnode_ssc ch_class;
7294 SSize_t last_close = 0;
7296 DEBUG_PARSE_r(PerlIO_printf(Perl_debug_log, "\nMulti Top Level\n"));
7298 scan = ri->program + 1;
7299 ssc_init(pRExC_state, &ch_class);
7300 data.start_class = &ch_class;
7301 data.last_closep = &last_close;
7304 minlen = study_chunk(pRExC_state,
7305 &scan, &minlen, &fake, scan + RExC_size, &data, -1, 0, NULL,
7306 SCF_DO_STCLASS_AND|SCF_WHILEM_VISITED_POS|(restudied
7307 ? SCF_TRIE_DOING_RESTUDY
7311 CHECK_RESTUDY_GOTO_butfirst(NOOP);
7313 r->check_substr = r->check_utf8 = r->anchored_substr = r->anchored_utf8
7314 = r->float_substr = r->float_utf8 = NULL;
7316 if (! (ANYOF_FLAGS(data.start_class) & SSC_MATCHES_EMPTY_STRING)
7317 && is_ssc_worth_it(pRExC_state, data.start_class))
7319 const U32 n = add_data(pRExC_state, STR_WITH_LEN("f"));
7321 ssc_finalize(pRExC_state, data.start_class);
7323 Newx(RExC_rxi->data->data[n], 1, regnode_ssc);
7324 StructCopy(data.start_class,
7325 (regnode_ssc*)RExC_rxi->data->data[n],
7327 ri->regstclass = (regnode*)RExC_rxi->data->data[n];
7328 r->intflags &= ~PREGf_SKIP; /* Used in find_byclass(). */
7329 DEBUG_COMPILE_r({ SV* sv = sv_newmortal();
7330 regprop(r, sv, (regnode*)data.start_class, NULL, pRExC_state);
7331 PerlIO_printf(Perl_debug_log,
7332 "synthetic stclass \"%s\".\n",
7333 SvPVX_const(sv));});
7334 data.start_class = NULL;
7338 if (RExC_seen & REG_UNBOUNDED_QUANTIFIER_SEEN) {
7339 r->extflags |= RXf_UNBOUNDED_QUANTIFIER_SEEN;
7340 r->maxlen = REG_INFTY;
7343 r->maxlen = RExC_maxlen;
7346 /* Guard against an embedded (?=) or (?<=) with a longer minlen than
7347 the "real" pattern. */
7349 PerlIO_printf(Perl_debug_log,"minlen: %"IVdf" r->minlen:%"IVdf" maxlen:%"IVdf"\n",
7350 (IV)minlen, (IV)r->minlen, (IV)RExC_maxlen);
7352 r->minlenret = minlen;
7353 if (r->minlen < minlen)
7356 if (RExC_seen & REG_GPOS_SEEN)
7357 r->intflags |= PREGf_GPOS_SEEN;
7358 if (RExC_seen & REG_LOOKBEHIND_SEEN)
7359 r->extflags |= RXf_NO_INPLACE_SUBST; /* inplace might break the
7361 if (pRExC_state->num_code_blocks)
7362 r->extflags |= RXf_EVAL_SEEN;
7363 if (RExC_seen & REG_VERBARG_SEEN)
7365 r->intflags |= PREGf_VERBARG_SEEN;
7366 r->extflags |= RXf_NO_INPLACE_SUBST; /* don't understand this! Yves */
7368 if (RExC_seen & REG_CUTGROUP_SEEN)
7369 r->intflags |= PREGf_CUTGROUP_SEEN;
7370 if (pm_flags & PMf_USE_RE_EVAL)
7371 r->intflags |= PREGf_USE_RE_EVAL;
7372 if (RExC_paren_names)
7373 RXp_PAREN_NAMES(r) = MUTABLE_HV(SvREFCNT_inc(RExC_paren_names));
7375 RXp_PAREN_NAMES(r) = NULL;
7377 /* If we have seen an anchor in our pattern then we set the extflag RXf_IS_ANCHORED
7378 * so it can be used in pp.c */
7379 if (r->intflags & PREGf_ANCH)
7380 r->extflags |= RXf_IS_ANCHORED;
7384 /* this is used to identify "special" patterns that might result
7385 * in Perl NOT calling the regex engine and instead doing the match "itself",
7386 * particularly special cases in split//. By having the regex compiler
7387 * do this pattern matching at a regop level (instead of by inspecting the pattern)
7388 * we avoid weird issues with equivalent patterns resulting in different behavior,
7389 * AND we allow non Perl engines to get the same optimizations by the setting the
7390 * flags appropriately - Yves */
7391 regnode *first = ri->program + 1;
7393 regnode *next = regnext(first);
7396 if (PL_regkind[fop] == NOTHING && nop == END)
7397 r->extflags |= RXf_NULL;
7398 else if ((fop == MBOL || (fop == SBOL && !first->flags)) && nop == END)
7399 /* when fop is SBOL first->flags will be true only when it was
7400 * produced by parsing /\A/, and not when parsing /^/. This is
7401 * very important for the split code as there we want to
7402 * treat /^/ as /^/m, but we do not want to treat /\A/ as /^/m.
7403 * See rt #122761 for more details. -- Yves */
7404 r->extflags |= RXf_START_ONLY;
7405 else if (fop == PLUS
7406 && PL_regkind[nop] == POSIXD && FLAGS(next) == _CC_SPACE
7408 r->extflags |= RXf_WHITE;
7409 else if ( r->extflags & RXf_SPLIT
7410 && (fop == EXACT || fop == EXACTL)
7411 && STR_LEN(first) == 1
7412 && *(STRING(first)) == ' '
7414 r->extflags |= (RXf_SKIPWHITE|RXf_WHITE);
7418 if (RExC_contains_locale) {
7419 RXp_EXTFLAGS(r) |= RXf_TAINTED;
7423 if (RExC_paren_names) {
7424 ri->name_list_idx = add_data( pRExC_state, STR_WITH_LEN("a"));
7425 ri->data->data[ri->name_list_idx]
7426 = (void*)SvREFCNT_inc(RExC_paren_name_list);
7429 ri->name_list_idx = 0;
7431 if (RExC_recurse_count) {
7432 for ( ; RExC_recurse_count ; RExC_recurse_count-- ) {
7433 const regnode *scan = RExC_recurse[RExC_recurse_count-1];
7434 ARG2L_SET( scan, RExC_open_parens[ARG(scan)-1] - scan );
7437 Newxz(r->offs, RExC_npar, regexp_paren_pair);
7438 /* assume we don't need to swap parens around before we match */
7440 PerlIO_printf(Perl_debug_log,"study_chunk_recursed_count: %lu\n",
7441 (unsigned long)RExC_study_chunk_recursed_count);
7445 PerlIO_printf(Perl_debug_log,"Final program:\n");
7448 #ifdef RE_TRACK_PATTERN_OFFSETS
7449 DEBUG_OFFSETS_r(if (ri->u.offsets) {
7450 const STRLEN len = ri->u.offsets[0];
7452 GET_RE_DEBUG_FLAGS_DECL;
7453 PerlIO_printf(Perl_debug_log,
7454 "Offsets: [%"UVuf"]\n\t", (UV)ri->u.offsets[0]);
7455 for (i = 1; i <= len; i++) {
7456 if (ri->u.offsets[i*2-1] || ri->u.offsets[i*2])
7457 PerlIO_printf(Perl_debug_log, "%"UVuf":%"UVuf"[%"UVuf"] ",
7458 (UV)i, (UV)ri->u.offsets[i*2-1], (UV)ri->u.offsets[i*2]);
7460 PerlIO_printf(Perl_debug_log, "\n");
7465 /* under ithreads the ?pat? PMf_USED flag on the pmop is simulated
7466 * by setting the regexp SV to readonly-only instead. If the
7467 * pattern's been recompiled, the USEDness should remain. */
7468 if (old_re && SvREADONLY(old_re))
7476 Perl_reg_named_buff(pTHX_ REGEXP * const rx, SV * const key, SV * const value,
7479 PERL_ARGS_ASSERT_REG_NAMED_BUFF;
7481 PERL_UNUSED_ARG(value);
7483 if (flags & RXapif_FETCH) {
7484 return reg_named_buff_fetch(rx, key, flags);
7485 } else if (flags & (RXapif_STORE | RXapif_DELETE | RXapif_CLEAR)) {
7486 Perl_croak_no_modify();
7488 } else if (flags & RXapif_EXISTS) {
7489 return reg_named_buff_exists(rx, key, flags)
7492 } else if (flags & RXapif_REGNAMES) {
7493 return reg_named_buff_all(rx, flags);
7494 } else if (flags & (RXapif_SCALAR | RXapif_REGNAMES_COUNT)) {
7495 return reg_named_buff_scalar(rx, flags);
7497 Perl_croak(aTHX_ "panic: Unknown flags %d in named_buff", (int)flags);
7503 Perl_reg_named_buff_iter(pTHX_ REGEXP * const rx, const SV * const lastkey,
7506 PERL_ARGS_ASSERT_REG_NAMED_BUFF_ITER;
7507 PERL_UNUSED_ARG(lastkey);
7509 if (flags & RXapif_FIRSTKEY)
7510 return reg_named_buff_firstkey(rx, flags);
7511 else if (flags & RXapif_NEXTKEY)
7512 return reg_named_buff_nextkey(rx, flags);
7514 Perl_croak(aTHX_ "panic: Unknown flags %d in named_buff_iter",
7521 Perl_reg_named_buff_fetch(pTHX_ REGEXP * const r, SV * const namesv,
7524 AV *retarray = NULL;
7526 struct regexp *const rx = ReANY(r);
7528 PERL_ARGS_ASSERT_REG_NAMED_BUFF_FETCH;
7530 if (flags & RXapif_ALL)
7533 if (rx && RXp_PAREN_NAMES(rx)) {
7534 HE *he_str = hv_fetch_ent( RXp_PAREN_NAMES(rx), namesv, 0, 0 );
7537 SV* sv_dat=HeVAL(he_str);
7538 I32 *nums=(I32*)SvPVX(sv_dat);
7539 for ( i=0; i<SvIVX(sv_dat); i++ ) {
7540 if ((I32)(rx->nparens) >= nums[i]
7541 && rx->offs[nums[i]].start != -1
7542 && rx->offs[nums[i]].end != -1)
7545 CALLREG_NUMBUF_FETCH(r,nums[i],ret);
7550 ret = newSVsv(&PL_sv_undef);
7553 av_push(retarray, ret);
7556 return newRV_noinc(MUTABLE_SV(retarray));
7563 Perl_reg_named_buff_exists(pTHX_ REGEXP * const r, SV * const key,
7566 struct regexp *const rx = ReANY(r);
7568 PERL_ARGS_ASSERT_REG_NAMED_BUFF_EXISTS;
7570 if (rx && RXp_PAREN_NAMES(rx)) {
7571 if (flags & RXapif_ALL) {
7572 return hv_exists_ent(RXp_PAREN_NAMES(rx), key, 0);
7574 SV *sv = CALLREG_NAMED_BUFF_FETCH(r, key, flags);
7576 SvREFCNT_dec_NN(sv);
7588 Perl_reg_named_buff_firstkey(pTHX_ REGEXP * const r, const U32 flags)
7590 struct regexp *const rx = ReANY(r);
7592 PERL_ARGS_ASSERT_REG_NAMED_BUFF_FIRSTKEY;
7594 if ( rx && RXp_PAREN_NAMES(rx) ) {
7595 (void)hv_iterinit(RXp_PAREN_NAMES(rx));
7597 return CALLREG_NAMED_BUFF_NEXTKEY(r, NULL, flags & ~RXapif_FIRSTKEY);
7604 Perl_reg_named_buff_nextkey(pTHX_ REGEXP * const r, const U32 flags)
7606 struct regexp *const rx = ReANY(r);
7607 GET_RE_DEBUG_FLAGS_DECL;
7609 PERL_ARGS_ASSERT_REG_NAMED_BUFF_NEXTKEY;
7611 if (rx && RXp_PAREN_NAMES(rx)) {
7612 HV *hv = RXp_PAREN_NAMES(rx);
7614 while ( (temphe = hv_iternext_flags(hv,0)) ) {
7617 SV* sv_dat = HeVAL(temphe);
7618 I32 *nums = (I32*)SvPVX(sv_dat);
7619 for ( i = 0; i < SvIVX(sv_dat); i++ ) {
7620 if ((I32)(rx->lastparen) >= nums[i] &&
7621 rx->offs[nums[i]].start != -1 &&
7622 rx->offs[nums[i]].end != -1)
7628 if (parno || flags & RXapif_ALL) {
7629 return newSVhek(HeKEY_hek(temphe));
7637 Perl_reg_named_buff_scalar(pTHX_ REGEXP * const r, const U32 flags)
7642 struct regexp *const rx = ReANY(r);
7644 PERL_ARGS_ASSERT_REG_NAMED_BUFF_SCALAR;
7646 if (rx && RXp_PAREN_NAMES(rx)) {
7647 if (flags & (RXapif_ALL | RXapif_REGNAMES_COUNT)) {
7648 return newSViv(HvTOTALKEYS(RXp_PAREN_NAMES(rx)));
7649 } else if (flags & RXapif_ONE) {
7650 ret = CALLREG_NAMED_BUFF_ALL(r, (flags | RXapif_REGNAMES));
7651 av = MUTABLE_AV(SvRV(ret));
7652 length = av_tindex(av);
7653 SvREFCNT_dec_NN(ret);
7654 return newSViv(length + 1);
7656 Perl_croak(aTHX_ "panic: Unknown flags %d in named_buff_scalar",
7661 return &PL_sv_undef;
7665 Perl_reg_named_buff_all(pTHX_ REGEXP * const r, const U32 flags)
7667 struct regexp *const rx = ReANY(r);
7670 PERL_ARGS_ASSERT_REG_NAMED_BUFF_ALL;
7672 if (rx && RXp_PAREN_NAMES(rx)) {
7673 HV *hv= RXp_PAREN_NAMES(rx);
7675 (void)hv_iterinit(hv);
7676 while ( (temphe = hv_iternext_flags(hv,0)) ) {
7679 SV* sv_dat = HeVAL(temphe);
7680 I32 *nums = (I32*)SvPVX(sv_dat);
7681 for ( i = 0; i < SvIVX(sv_dat); i++ ) {
7682 if ((I32)(rx->lastparen) >= nums[i] &&
7683 rx->offs[nums[i]].start != -1 &&
7684 rx->offs[nums[i]].end != -1)
7690 if (parno || flags & RXapif_ALL) {
7691 av_push(av, newSVhek(HeKEY_hek(temphe)));
7696 return newRV_noinc(MUTABLE_SV(av));
7700 Perl_reg_numbered_buff_fetch(pTHX_ REGEXP * const r, const I32 paren,
7703 struct regexp *const rx = ReANY(r);
7709 PERL_ARGS_ASSERT_REG_NUMBERED_BUFF_FETCH;
7711 if ( n == RX_BUFF_IDX_CARET_PREMATCH
7712 || n == RX_BUFF_IDX_CARET_FULLMATCH
7713 || n == RX_BUFF_IDX_CARET_POSTMATCH
7716 bool keepcopy = cBOOL(rx->extflags & RXf_PMf_KEEPCOPY);
7718 /* on something like
7721 * the KEEPCOPY is set on the PMOP rather than the regex */
7722 if (PL_curpm && r == PM_GETRE(PL_curpm))
7723 keepcopy = cBOOL(PL_curpm->op_pmflags & PMf_KEEPCOPY);
7732 if (n == RX_BUFF_IDX_CARET_FULLMATCH)
7733 /* no need to distinguish between them any more */
7734 n = RX_BUFF_IDX_FULLMATCH;
7736 if ((n == RX_BUFF_IDX_PREMATCH || n == RX_BUFF_IDX_CARET_PREMATCH)
7737 && rx->offs[0].start != -1)
7739 /* $`, ${^PREMATCH} */
7740 i = rx->offs[0].start;
7744 if ((n == RX_BUFF_IDX_POSTMATCH || n == RX_BUFF_IDX_CARET_POSTMATCH)
7745 && rx->offs[0].end != -1)
7747 /* $', ${^POSTMATCH} */
7748 s = rx->subbeg - rx->suboffset + rx->offs[0].end;
7749 i = rx->sublen + rx->suboffset - rx->offs[0].end;
7752 if ( 0 <= n && n <= (I32)rx->nparens &&
7753 (s1 = rx->offs[n].start) != -1 &&
7754 (t1 = rx->offs[n].end) != -1)
7756 /* $&, ${^MATCH}, $1 ... */
7758 s = rx->subbeg + s1 - rx->suboffset;
7763 assert(s >= rx->subbeg);
7764 assert((STRLEN)rx->sublen >= (STRLEN)((s - rx->subbeg) + i) );
7766 #ifdef NO_TAINT_SUPPORT
7767 sv_setpvn(sv, s, i);
7769 const int oldtainted = TAINT_get;
7771 sv_setpvn(sv, s, i);
7772 TAINT_set(oldtainted);
7774 if (RXp_MATCH_UTF8(rx))
7779 if (RXp_MATCH_TAINTED(rx)) {
7780 if (SvTYPE(sv) >= SVt_PVMG) {
7781 MAGIC* const mg = SvMAGIC(sv);
7784 SvMAGIC_set(sv, mg->mg_moremagic);
7786 if ((mgt = SvMAGIC(sv))) {
7787 mg->mg_moremagic = mgt;
7788 SvMAGIC_set(sv, mg);
7799 sv_setsv(sv,&PL_sv_undef);
7805 Perl_reg_numbered_buff_store(pTHX_ REGEXP * const rx, const I32 paren,
7806 SV const * const value)
7808 PERL_ARGS_ASSERT_REG_NUMBERED_BUFF_STORE;
7810 PERL_UNUSED_ARG(rx);
7811 PERL_UNUSED_ARG(paren);
7812 PERL_UNUSED_ARG(value);
7815 Perl_croak_no_modify();
7819 Perl_reg_numbered_buff_length(pTHX_ REGEXP * const r, const SV * const sv,
7822 struct regexp *const rx = ReANY(r);
7826 PERL_ARGS_ASSERT_REG_NUMBERED_BUFF_LENGTH;
7828 if ( paren == RX_BUFF_IDX_CARET_PREMATCH
7829 || paren == RX_BUFF_IDX_CARET_FULLMATCH
7830 || paren == RX_BUFF_IDX_CARET_POSTMATCH
7833 bool keepcopy = cBOOL(rx->extflags & RXf_PMf_KEEPCOPY);
7835 /* on something like
7838 * the KEEPCOPY is set on the PMOP rather than the regex */
7839 if (PL_curpm && r == PM_GETRE(PL_curpm))
7840 keepcopy = cBOOL(PL_curpm->op_pmflags & PMf_KEEPCOPY);
7846 /* Some of this code was originally in C<Perl_magic_len> in F<mg.c> */
7848 case RX_BUFF_IDX_CARET_PREMATCH: /* ${^PREMATCH} */
7849 case RX_BUFF_IDX_PREMATCH: /* $` */
7850 if (rx->offs[0].start != -1) {
7851 i = rx->offs[0].start;
7860 case RX_BUFF_IDX_CARET_POSTMATCH: /* ${^POSTMATCH} */
7861 case RX_BUFF_IDX_POSTMATCH: /* $' */
7862 if (rx->offs[0].end != -1) {
7863 i = rx->sublen - rx->offs[0].end;
7865 s1 = rx->offs[0].end;
7872 default: /* $& / ${^MATCH}, $1, $2, ... */
7873 if (paren <= (I32)rx->nparens &&
7874 (s1 = rx->offs[paren].start) != -1 &&
7875 (t1 = rx->offs[paren].end) != -1)
7881 if (ckWARN(WARN_UNINITIALIZED))
7882 report_uninit((const SV *)sv);
7887 if (i > 0 && RXp_MATCH_UTF8(rx)) {
7888 const char * const s = rx->subbeg - rx->suboffset + s1;
7893 if (is_utf8_string_loclen((U8*)s, i, &ep, &el))
7900 Perl_reg_qr_package(pTHX_ REGEXP * const rx)
7902 PERL_ARGS_ASSERT_REG_QR_PACKAGE;
7903 PERL_UNUSED_ARG(rx);
7907 return newSVpvs("Regexp");
7910 /* Scans the name of a named buffer from the pattern.
7911 * If flags is REG_RSN_RETURN_NULL returns null.
7912 * If flags is REG_RSN_RETURN_NAME returns an SV* containing the name
7913 * If flags is REG_RSN_RETURN_DATA returns the data SV* corresponding
7914 * to the parsed name as looked up in the RExC_paren_names hash.
7915 * If there is an error throws a vFAIL().. type exception.
7918 #define REG_RSN_RETURN_NULL 0
7919 #define REG_RSN_RETURN_NAME 1
7920 #define REG_RSN_RETURN_DATA 2
7923 S_reg_scan_name(pTHX_ RExC_state_t *pRExC_state, U32 flags)
7925 char *name_start = RExC_parse;
7927 PERL_ARGS_ASSERT_REG_SCAN_NAME;
7929 assert (RExC_parse <= RExC_end);
7930 if (RExC_parse == RExC_end) NOOP;
7931 else if (isIDFIRST_lazy_if(RExC_parse, UTF)) {
7932 /* skip IDFIRST by using do...while */
7935 RExC_parse += UTF8SKIP(RExC_parse);
7936 } while (isWORDCHAR_utf8((U8*)RExC_parse));
7940 } while (isWORDCHAR(*RExC_parse));
7942 RExC_parse++; /* so the <- from the vFAIL is after the offending
7944 vFAIL("Group name must start with a non-digit word character");
7948 = newSVpvn_flags(name_start, (int)(RExC_parse - name_start),
7949 SVs_TEMP | (UTF ? SVf_UTF8 : 0));
7950 if ( flags == REG_RSN_RETURN_NAME)
7952 else if (flags==REG_RSN_RETURN_DATA) {
7955 if ( ! sv_name ) /* should not happen*/
7956 Perl_croak(aTHX_ "panic: no svname in reg_scan_name");
7957 if (RExC_paren_names)
7958 he_str = hv_fetch_ent( RExC_paren_names, sv_name, 0, 0 );
7960 sv_dat = HeVAL(he_str);
7962 vFAIL("Reference to nonexistent named group");
7966 Perl_croak(aTHX_ "panic: bad flag %lx in reg_scan_name",
7967 (unsigned long) flags);
7969 NOT_REACHED; /* NOTREACHED */
7974 #define DEBUG_PARSE_MSG(funcname) DEBUG_PARSE_r({ \
7976 if (RExC_lastparse!=RExC_parse) { \
7977 PerlIO_printf(Perl_debug_log, "%s", \
7978 Perl_pv_pretty(aTHX_ RExC_mysv1, RExC_parse, \
7979 RExC_end - RExC_parse, 16, \
7981 PERL_PV_ESCAPE_UNI_DETECT | \
7982 PERL_PV_PRETTY_ELLIPSES | \
7983 PERL_PV_PRETTY_LTGT | \
7984 PERL_PV_ESCAPE_RE | \
7985 PERL_PV_PRETTY_EXACTSIZE \
7989 PerlIO_printf(Perl_debug_log,"%16s",""); \
7992 num = RExC_size + 1; \
7994 num=REG_NODE_NUM(RExC_emit); \
7995 if (RExC_lastnum!=num) \
7996 PerlIO_printf(Perl_debug_log,"|%4d",num); \
7998 PerlIO_printf(Perl_debug_log,"|%4s",""); \
7999 PerlIO_printf(Perl_debug_log,"|%*s%-4s", \
8000 (int)((depth*2)), "", \
8004 RExC_lastparse=RExC_parse; \
8009 #define DEBUG_PARSE(funcname) DEBUG_PARSE_r({ \
8010 DEBUG_PARSE_MSG((funcname)); \
8011 PerlIO_printf(Perl_debug_log,"%4s","\n"); \
8013 #define DEBUG_PARSE_FMT(funcname,fmt,args) DEBUG_PARSE_r({ \
8014 DEBUG_PARSE_MSG((funcname)); \
8015 PerlIO_printf(Perl_debug_log,fmt "\n",args); \
8018 /* This section of code defines the inversion list object and its methods. The
8019 * interfaces are highly subject to change, so as much as possible is static to
8020 * this file. An inversion list is here implemented as a malloc'd C UV array
8021 * as an SVt_INVLIST scalar.
8023 * An inversion list for Unicode is an array of code points, sorted by ordinal
8024 * number. The zeroth element is the first code point in the list. The 1th
8025 * element is the first element beyond that not in the list. In other words,
8026 * the first range is
8027 * invlist[0]..(invlist[1]-1)
8028 * The other ranges follow. Thus every element whose index is divisible by two
8029 * marks the beginning of a range that is in the list, and every element not
8030 * divisible by two marks the beginning of a range not in the list. A single
8031 * element inversion list that contains the single code point N generally
8032 * consists of two elements
8035 * (The exception is when N is the highest representable value on the
8036 * machine, in which case the list containing just it would be a single
8037 * element, itself. By extension, if the last range in the list extends to
8038 * infinity, then the first element of that range will be in the inversion list
8039 * at a position that is divisible by two, and is the final element in the
8041 * Taking the complement (inverting) an inversion list is quite simple, if the
8042 * first element is 0, remove it; otherwise add a 0 element at the beginning.
8043 * This implementation reserves an element at the beginning of each inversion
8044 * list to always contain 0; there is an additional flag in the header which
8045 * indicates if the list begins at the 0, or is offset to begin at the next
8048 * More about inversion lists can be found in "Unicode Demystified"
8049 * Chapter 13 by Richard Gillam, published by Addison-Wesley.
8050 * More will be coming when functionality is added later.
8052 * The inversion list data structure is currently implemented as an SV pointing
8053 * to an array of UVs that the SV thinks are bytes. This allows us to have an
8054 * array of UV whose memory management is automatically handled by the existing
8055 * facilities for SV's.
8057 * Some of the methods should always be private to the implementation, and some
8058 * should eventually be made public */
8060 /* The header definitions are in F<invlist_inline.h> */
8062 PERL_STATIC_INLINE UV*
8063 S__invlist_array_init(SV* const invlist, const bool will_have_0)
8065 /* Returns a pointer to the first element in the inversion list's array.
8066 * This is called upon initialization of an inversion list. Where the
8067 * array begins depends on whether the list has the code point U+0000 in it
8068 * or not. The other parameter tells it whether the code that follows this
8069 * call is about to put a 0 in the inversion list or not. The first
8070 * element is either the element reserved for 0, if TRUE, or the element
8071 * after it, if FALSE */
8073 bool* offset = get_invlist_offset_addr(invlist);
8074 UV* zero_addr = (UV *) SvPVX(invlist);
8076 PERL_ARGS_ASSERT__INVLIST_ARRAY_INIT;
8079 assert(! _invlist_len(invlist));
8083 /* 1^1 = 0; 1^0 = 1 */
8084 *offset = 1 ^ will_have_0;
8085 return zero_addr + *offset;
8088 PERL_STATIC_INLINE void
8089 S_invlist_set_len(pTHX_ SV* const invlist, const UV len, const bool offset)
8091 /* Sets the current number of elements stored in the inversion list.
8092 * Updates SvCUR correspondingly */
8093 PERL_UNUSED_CONTEXT;
8094 PERL_ARGS_ASSERT_INVLIST_SET_LEN;
8096 assert(SvTYPE(invlist) == SVt_INVLIST);
8101 : TO_INTERNAL_SIZE(len + offset));
8102 assert(SvLEN(invlist) == 0 || SvCUR(invlist) <= SvLEN(invlist));
8105 #ifndef PERL_IN_XSUB_RE
8107 PERL_STATIC_INLINE IV*
8108 S_get_invlist_previous_index_addr(SV* invlist)
8110 /* Return the address of the IV that is reserved to hold the cached index
8112 PERL_ARGS_ASSERT_GET_INVLIST_PREVIOUS_INDEX_ADDR;
8114 assert(SvTYPE(invlist) == SVt_INVLIST);
8116 return &(((XINVLIST*) SvANY(invlist))->prev_index);
8119 PERL_STATIC_INLINE IV
8120 S_invlist_previous_index(SV* const invlist)
8122 /* Returns cached index of previous search */
8124 PERL_ARGS_ASSERT_INVLIST_PREVIOUS_INDEX;
8126 return *get_invlist_previous_index_addr(invlist);
8129 PERL_STATIC_INLINE void
8130 S_invlist_set_previous_index(SV* const invlist, const IV index)
8132 /* Caches <index> for later retrieval */
8134 PERL_ARGS_ASSERT_INVLIST_SET_PREVIOUS_INDEX;
8136 assert(index == 0 || index < (int) _invlist_len(invlist));
8138 *get_invlist_previous_index_addr(invlist) = index;
8141 PERL_STATIC_INLINE void
8142 S_invlist_trim(SV* const invlist)
8144 PERL_ARGS_ASSERT_INVLIST_TRIM;
8146 assert(SvTYPE(invlist) == SVt_INVLIST);
8148 /* Change the length of the inversion list to how many entries it currently
8150 SvPV_shrink_to_cur((SV *) invlist);
8153 PERL_STATIC_INLINE bool
8154 S_invlist_is_iterating(SV* const invlist)
8156 PERL_ARGS_ASSERT_INVLIST_IS_ITERATING;
8158 return *(get_invlist_iter_addr(invlist)) < (STRLEN) UV_MAX;
8161 #endif /* ifndef PERL_IN_XSUB_RE */
8163 PERL_STATIC_INLINE UV
8164 S_invlist_max(SV* const invlist)
8166 /* Returns the maximum number of elements storable in the inversion list's
8167 * array, without having to realloc() */
8169 PERL_ARGS_ASSERT_INVLIST_MAX;
8171 assert(SvTYPE(invlist) == SVt_INVLIST);
8173 /* Assumes worst case, in which the 0 element is not counted in the
8174 * inversion list, so subtracts 1 for that */
8175 return SvLEN(invlist) == 0 /* This happens under _new_invlist_C_array */
8176 ? FROM_INTERNAL_SIZE(SvCUR(invlist)) - 1
8177 : FROM_INTERNAL_SIZE(SvLEN(invlist)) - 1;
8180 #ifndef PERL_IN_XSUB_RE
8182 Perl__new_invlist(pTHX_ IV initial_size)
8185 /* Return a pointer to a newly constructed inversion list, with enough
8186 * space to store 'initial_size' elements. If that number is negative, a
8187 * system default is used instead */
8191 if (initial_size < 0) {
8195 /* Allocate the initial space */
8196 new_list = newSV_type(SVt_INVLIST);
8198 /* First 1 is in case the zero element isn't in the list; second 1 is for
8200 SvGROW(new_list, TO_INTERNAL_SIZE(initial_size + 1) + 1);
8201 invlist_set_len(new_list, 0, 0);
8203 /* Force iterinit() to be used to get iteration to work */
8204 *get_invlist_iter_addr(new_list) = (STRLEN) UV_MAX;
8206 *get_invlist_previous_index_addr(new_list) = 0;
8212 Perl__new_invlist_C_array(pTHX_ const UV* const list)
8214 /* Return a pointer to a newly constructed inversion list, initialized to
8215 * point to <list>, which has to be in the exact correct inversion list
8216 * form, including internal fields. Thus this is a dangerous routine that
8217 * should not be used in the wrong hands. The passed in 'list' contains
8218 * several header fields at the beginning that are not part of the
8219 * inversion list body proper */
8221 const STRLEN length = (STRLEN) list[0];
8222 const UV version_id = list[1];
8223 const bool offset = cBOOL(list[2]);
8224 #define HEADER_LENGTH 3
8225 /* If any of the above changes in any way, you must change HEADER_LENGTH
8226 * (if appropriate) and regenerate INVLIST_VERSION_ID by running
8227 * perl -E 'say int(rand 2**31-1)'
8229 #define INVLIST_VERSION_ID 148565664 /* This is a combination of a version and
8230 data structure type, so that one being
8231 passed in can be validated to be an
8232 inversion list of the correct vintage.
8235 SV* invlist = newSV_type(SVt_INVLIST);
8237 PERL_ARGS_ASSERT__NEW_INVLIST_C_ARRAY;
8239 if (version_id != INVLIST_VERSION_ID) {
8240 Perl_croak(aTHX_ "panic: Incorrect version for previously generated inversion list");
8243 /* The generated array passed in includes header elements that aren't part
8244 * of the list proper, so start it just after them */
8245 SvPV_set(invlist, (char *) (list + HEADER_LENGTH));
8247 SvLEN_set(invlist, 0); /* Means we own the contents, and the system
8248 shouldn't touch it */
8250 *(get_invlist_offset_addr(invlist)) = offset;
8252 /* The 'length' passed to us is the physical number of elements in the
8253 * inversion list. But if there is an offset the logical number is one
8255 invlist_set_len(invlist, length - offset, offset);
8257 invlist_set_previous_index(invlist, 0);
8259 /* Initialize the iteration pointer. */
8260 invlist_iterfinish(invlist);
8262 SvREADONLY_on(invlist);
8266 #endif /* ifndef PERL_IN_XSUB_RE */
8269 S_invlist_extend(pTHX_ SV* const invlist, const UV new_max)
8271 /* Grow the maximum size of an inversion list */
8273 PERL_ARGS_ASSERT_INVLIST_EXTEND;
8275 assert(SvTYPE(invlist) == SVt_INVLIST);
8277 /* Add one to account for the zero element at the beginning which may not
8278 * be counted by the calling parameters */
8279 SvGROW((SV *)invlist, TO_INTERNAL_SIZE(new_max + 1));
8283 S__append_range_to_invlist(pTHX_ SV* const invlist,
8284 const UV start, const UV end)
8286 /* Subject to change or removal. Append the range from 'start' to 'end' at
8287 * the end of the inversion list. The range must be above any existing
8291 UV max = invlist_max(invlist);
8292 UV len = _invlist_len(invlist);
8295 PERL_ARGS_ASSERT__APPEND_RANGE_TO_INVLIST;
8297 if (len == 0) { /* Empty lists must be initialized */
8298 offset = start != 0;
8299 array = _invlist_array_init(invlist, ! offset);
8302 /* Here, the existing list is non-empty. The current max entry in the
8303 * list is generally the first value not in the set, except when the
8304 * set extends to the end of permissible values, in which case it is
8305 * the first entry in that final set, and so this call is an attempt to
8306 * append out-of-order */
8308 UV final_element = len - 1;
8309 array = invlist_array(invlist);
8310 if (array[final_element] > start
8311 || ELEMENT_RANGE_MATCHES_INVLIST(final_element))
8313 Perl_croak(aTHX_ "panic: attempting to append to an inversion list, but wasn't at the end of the list, final=%"UVuf", start=%"UVuf", match=%c",
8314 array[final_element], start,
8315 ELEMENT_RANGE_MATCHES_INVLIST(final_element) ? 't' : 'f');
8318 /* Here, it is a legal append. If the new range begins with the first
8319 * value not in the set, it is extending the set, so the new first
8320 * value not in the set is one greater than the newly extended range.
8322 offset = *get_invlist_offset_addr(invlist);
8323 if (array[final_element] == start) {
8324 if (end != UV_MAX) {
8325 array[final_element] = end + 1;
8328 /* But if the end is the maximum representable on the machine,
8329 * just let the range that this would extend to have no end */
8330 invlist_set_len(invlist, len - 1, offset);
8336 /* Here the new range doesn't extend any existing set. Add it */
8338 len += 2; /* Includes an element each for the start and end of range */
8340 /* If wll overflow the existing space, extend, which may cause the array to
8343 invlist_extend(invlist, len);
8345 /* Have to set len here to avoid assert failure in invlist_array() */
8346 invlist_set_len(invlist, len, offset);
8348 array = invlist_array(invlist);
8351 invlist_set_len(invlist, len, offset);
8354 /* The next item on the list starts the range, the one after that is
8355 * one past the new range. */
8356 array[len - 2] = start;
8357 if (end != UV_MAX) {
8358 array[len - 1] = end + 1;
8361 /* But if the end is the maximum representable on the machine, just let
8362 * the range have no end */
8363 invlist_set_len(invlist, len - 1, offset);
8367 #ifndef PERL_IN_XSUB_RE
8370 Perl__invlist_search(SV* const invlist, const UV cp)
8372 /* Searches the inversion list for the entry that contains the input code
8373 * point <cp>. If <cp> is not in the list, -1 is returned. Otherwise, the
8374 * return value is the index into the list's array of the range that
8379 IV high = _invlist_len(invlist);
8380 const IV highest_element = high - 1;
8383 PERL_ARGS_ASSERT__INVLIST_SEARCH;
8385 /* If list is empty, return failure. */
8390 /* (We can't get the array unless we know the list is non-empty) */
8391 array = invlist_array(invlist);
8393 mid = invlist_previous_index(invlist);
8394 assert(mid >=0 && mid <= highest_element);
8396 /* <mid> contains the cache of the result of the previous call to this
8397 * function (0 the first time). See if this call is for the same result,
8398 * or if it is for mid-1. This is under the theory that calls to this
8399 * function will often be for related code points that are near each other.
8400 * And benchmarks show that caching gives better results. We also test
8401 * here if the code point is within the bounds of the list. These tests
8402 * replace others that would have had to be made anyway to make sure that
8403 * the array bounds were not exceeded, and these give us extra information
8404 * at the same time */
8405 if (cp >= array[mid]) {
8406 if (cp >= array[highest_element]) {
8407 return highest_element;
8410 /* Here, array[mid] <= cp < array[highest_element]. This means that
8411 * the final element is not the answer, so can exclude it; it also
8412 * means that <mid> is not the final element, so can refer to 'mid + 1'
8414 if (cp < array[mid + 1]) {
8420 else { /* cp < aray[mid] */
8421 if (cp < array[0]) { /* Fail if outside the array */
8425 if (cp >= array[mid - 1]) {
8430 /* Binary search. What we are looking for is <i> such that
8431 * array[i] <= cp < array[i+1]
8432 * The loop below converges on the i+1. Note that there may not be an
8433 * (i+1)th element in the array, and things work nonetheless */
8434 while (low < high) {
8435 mid = (low + high) / 2;
8436 assert(mid <= highest_element);
8437 if (array[mid] <= cp) { /* cp >= array[mid] */
8440 /* We could do this extra test to exit the loop early.
8441 if (cp < array[low]) {
8446 else { /* cp < array[mid] */
8453 invlist_set_previous_index(invlist, high);
8458 Perl__invlist_populate_swatch(SV* const invlist,
8459 const UV start, const UV end, U8* swatch)
8461 /* populates a swatch of a swash the same way swatch_get() does in utf8.c,
8462 * but is used when the swash has an inversion list. This makes this much
8463 * faster, as it uses a binary search instead of a linear one. This is
8464 * intimately tied to that function, and perhaps should be in utf8.c,
8465 * except it is intimately tied to inversion lists as well. It assumes
8466 * that <swatch> is all 0's on input */
8469 const IV len = _invlist_len(invlist);
8473 PERL_ARGS_ASSERT__INVLIST_POPULATE_SWATCH;
8475 if (len == 0) { /* Empty inversion list */
8479 array = invlist_array(invlist);
8481 /* Find which element it is */
8482 i = _invlist_search(invlist, start);
8484 /* We populate from <start> to <end> */
8485 while (current < end) {
8488 /* The inversion list gives the results for every possible code point
8489 * after the first one in the list. Only those ranges whose index is
8490 * even are ones that the inversion list matches. For the odd ones,
8491 * and if the initial code point is not in the list, we have to skip
8492 * forward to the next element */
8493 if (i == -1 || ! ELEMENT_RANGE_MATCHES_INVLIST(i)) {
8495 if (i >= len) { /* Finished if beyond the end of the array */
8499 if (current >= end) { /* Finished if beyond the end of what we
8501 if (LIKELY(end < UV_MAX)) {
8505 /* We get here when the upper bound is the maximum
8506 * representable on the machine, and we are looking for just
8507 * that code point. Have to special case it */
8509 goto join_end_of_list;
8512 assert(current >= start);
8514 /* The current range ends one below the next one, except don't go past
8517 upper = (i < len && array[i] < end) ? array[i] : end;
8519 /* Here we are in a range that matches. Populate a bit in the 3-bit U8
8520 * for each code point in it */
8521 for (; current < upper; current++) {
8522 const STRLEN offset = (STRLEN)(current - start);
8523 swatch[offset >> 3] |= 1 << (offset & 7);
8528 /* Quit if at the end of the list */
8531 /* But first, have to deal with the highest possible code point on
8532 * the platform. The previous code assumes that <end> is one
8533 * beyond where we want to populate, but that is impossible at the
8534 * platform's infinity, so have to handle it specially */
8535 if (UNLIKELY(end == UV_MAX && ELEMENT_RANGE_MATCHES_INVLIST(len-1)))
8537 const STRLEN offset = (STRLEN)(end - start);
8538 swatch[offset >> 3] |= 1 << (offset & 7);
8543 /* Advance to the next range, which will be for code points not in the
8552 Perl__invlist_union_maybe_complement_2nd(pTHX_ SV* const a, SV* const b,
8553 const bool complement_b, SV** output)
8555 /* Take the union of two inversion lists and point <output> to it. *output
8556 * SHOULD BE DEFINED upon input, and if it points to one of the two lists,
8557 * the reference count to that list will be decremented if not already a
8558 * temporary (mortal); otherwise *output will be made correspondingly
8559 * mortal. The first list, <a>, may be NULL, in which case a copy of the
8560 * second list is returned. If <complement_b> is TRUE, the union is taken
8561 * of the complement (inversion) of <b> instead of b itself.
8563 * The basis for this comes from "Unicode Demystified" Chapter 13 by
8564 * Richard Gillam, published by Addison-Wesley, and explained at some
8565 * length there. The preface says to incorporate its examples into your
8566 * code at your own risk.
8568 * The algorithm is like a merge sort.
8570 * XXX A potential performance improvement is to keep track as we go along
8571 * if only one of the inputs contributes to the result, meaning the other
8572 * is a subset of that one. In that case, we can skip the final copy and
8573 * return the larger of the input lists, but then outside code might need
8574 * to keep track of whether to free the input list or not */
8576 const UV* array_a; /* a's array */
8578 UV len_a; /* length of a's array */
8581 SV* u; /* the resulting union */
8585 UV i_a = 0; /* current index into a's array */
8589 /* running count, as explained in the algorithm source book; items are
8590 * stopped accumulating and are output when the count changes to/from 0.
8591 * The count is incremented when we start a range that's in the set, and
8592 * decremented when we start a range that's not in the set. So its range
8593 * is 0 to 2. Only when the count is zero is something not in the set.
8597 PERL_ARGS_ASSERT__INVLIST_UNION_MAYBE_COMPLEMENT_2ND;
8600 /* If either one is empty, the union is the other one */
8601 if (a == NULL || ((len_a = _invlist_len(a)) == 0)) {
8602 bool make_temp = FALSE; /* Should we mortalize the result? */
8606 if (! (make_temp = cBOOL(SvTEMP(a)))) {
8612 *output = invlist_clone(b);
8614 _invlist_invert(*output);
8616 } /* else *output already = b; */
8619 sv_2mortal(*output);
8623 else if ((len_b = _invlist_len(b)) == 0) {
8624 bool make_temp = FALSE;
8626 if (! (make_temp = cBOOL(SvTEMP(b)))) {
8631 /* The complement of an empty list is a list that has everything in it,
8632 * so the union with <a> includes everything too */
8635 if (! (make_temp = cBOOL(SvTEMP(a)))) {
8639 *output = _new_invlist(1);
8640 _append_range_to_invlist(*output, 0, UV_MAX);
8642 else if (*output != a) {
8643 *output = invlist_clone(a);
8645 /* else *output already = a; */
8648 sv_2mortal(*output);
8653 /* Here both lists exist and are non-empty */
8654 array_a = invlist_array(a);
8655 array_b = invlist_array(b);
8657 /* If are to take the union of 'a' with the complement of b, set it
8658 * up so are looking at b's complement. */
8661 /* To complement, we invert: if the first element is 0, remove it. To
8662 * do this, we just pretend the array starts one later */
8663 if (array_b[0] == 0) {
8669 /* But if the first element is not zero, we pretend the list starts
8670 * at the 0 that is always stored immediately before the array. */
8676 /* Size the union for the worst case: that the sets are completely
8678 u = _new_invlist(len_a + len_b);
8680 /* Will contain U+0000 if either component does */
8681 array_u = _invlist_array_init(u, (len_a > 0 && array_a[0] == 0)
8682 || (len_b > 0 && array_b[0] == 0));
8684 /* Go through each list item by item, stopping when exhausted one of
8686 while (i_a < len_a && i_b < len_b) {
8687 UV cp; /* The element to potentially add to the union's array */
8688 bool cp_in_set; /* is it in the the input list's set or not */
8690 /* We need to take one or the other of the two inputs for the union.
8691 * Since we are merging two sorted lists, we take the smaller of the
8692 * next items. In case of a tie, we take the one that is in its set
8693 * first. If we took one not in the set first, it would decrement the
8694 * count, possibly to 0 which would cause it to be output as ending the
8695 * range, and the next time through we would take the same number, and
8696 * output it again as beginning the next range. By doing it the
8697 * opposite way, there is no possibility that the count will be
8698 * momentarily decremented to 0, and thus the two adjoining ranges will
8699 * be seamlessly merged. (In a tie and both are in the set or both not
8700 * in the set, it doesn't matter which we take first.) */
8701 if (array_a[i_a] < array_b[i_b]
8702 || (array_a[i_a] == array_b[i_b]
8703 && ELEMENT_RANGE_MATCHES_INVLIST(i_a)))
8705 cp_in_set = ELEMENT_RANGE_MATCHES_INVLIST(i_a);
8709 cp_in_set = ELEMENT_RANGE_MATCHES_INVLIST(i_b);
8710 cp = array_b[i_b++];
8713 /* Here, have chosen which of the two inputs to look at. Only output
8714 * if the running count changes to/from 0, which marks the
8715 * beginning/end of a range in that's in the set */
8718 array_u[i_u++] = cp;
8725 array_u[i_u++] = cp;
8730 /* Here, we are finished going through at least one of the lists, which
8731 * means there is something remaining in at most one. We check if the list
8732 * that hasn't been exhausted is positioned such that we are in the middle
8733 * of a range in its set or not. (i_a and i_b point to the element beyond
8734 * the one we care about.) If in the set, we decrement 'count'; if 0, there
8735 * is potentially more to output.
8736 * There are four cases:
8737 * 1) Both weren't in their sets, count is 0, and remains 0. What's left
8738 * in the union is entirely from the non-exhausted set.
8739 * 2) Both were in their sets, count is 2. Nothing further should
8740 * be output, as everything that remains will be in the exhausted
8741 * list's set, hence in the union; decrementing to 1 but not 0 insures
8743 * 3) the exhausted was in its set, non-exhausted isn't, count is 1.
8744 * Nothing further should be output because the union includes
8745 * everything from the exhausted set. Not decrementing ensures that.
8746 * 4) the exhausted wasn't in its set, non-exhausted is, count is 1;
8747 * decrementing to 0 insures that we look at the remainder of the
8748 * non-exhausted set */
8749 if ((i_a != len_a && PREV_RANGE_MATCHES_INVLIST(i_a))
8750 || (i_b != len_b && PREV_RANGE_MATCHES_INVLIST(i_b)))
8755 /* The final length is what we've output so far, plus what else is about to
8756 * be output. (If 'count' is non-zero, then the input list we exhausted
8757 * has everything remaining up to the machine's limit in its set, and hence
8758 * in the union, so there will be no further output. */
8761 /* At most one of the subexpressions will be non-zero */
8762 len_u += (len_a - i_a) + (len_b - i_b);
8765 /* Set result to final length, which can change the pointer to array_u, so
8767 if (len_u != _invlist_len(u)) {
8768 invlist_set_len(u, len_u, *get_invlist_offset_addr(u));
8770 array_u = invlist_array(u);
8773 /* When 'count' is 0, the list that was exhausted (if one was shorter than
8774 * the other) ended with everything above it not in its set. That means
8775 * that the remaining part of the union is precisely the same as the
8776 * non-exhausted list, so can just copy it unchanged. (If both list were
8777 * exhausted at the same time, then the operations below will be both 0.)
8780 IV copy_count; /* At most one will have a non-zero copy count */
8781 if ((copy_count = len_a - i_a) > 0) {
8782 Copy(array_a + i_a, array_u + i_u, copy_count, UV);
8784 else if ((copy_count = len_b - i_b) > 0) {
8785 Copy(array_b + i_b, array_u + i_u, copy_count, UV);
8789 /* We may be removing a reference to one of the inputs. If so, the output
8790 * is made mortal if the input was. (Mortal SVs shouldn't have their ref
8791 * count decremented) */
8792 if (a == *output || b == *output) {
8793 assert(! invlist_is_iterating(*output));
8794 if ((SvTEMP(*output))) {
8798 SvREFCNT_dec_NN(*output);
8808 Perl__invlist_intersection_maybe_complement_2nd(pTHX_ SV* const a, SV* const b,
8809 const bool complement_b, SV** i)
8811 /* Take the intersection of two inversion lists and point <i> to it. *i
8812 * SHOULD BE DEFINED upon input, and if it points to one of the two lists,
8813 * the reference count to that list will be decremented if not already a
8814 * temporary (mortal); otherwise *i will be made correspondingly mortal.
8815 * The first list, <a>, may be NULL, in which case an empty list is
8816 * returned. If <complement_b> is TRUE, the result will be the
8817 * intersection of <a> and the complement (or inversion) of <b> instead of
8820 * The basis for this comes from "Unicode Demystified" Chapter 13 by
8821 * Richard Gillam, published by Addison-Wesley, and explained at some
8822 * length there. The preface says to incorporate its examples into your
8823 * code at your own risk. In fact, it had bugs
8825 * The algorithm is like a merge sort, and is essentially the same as the
8829 const UV* array_a; /* a's array */
8831 UV len_a; /* length of a's array */
8834 SV* r; /* the resulting intersection */
8838 UV i_a = 0; /* current index into a's array */
8842 /* running count, as explained in the algorithm source book; items are
8843 * stopped accumulating and are output when the count changes to/from 2.
8844 * The count is incremented when we start a range that's in the set, and
8845 * decremented when we start a range that's not in the set. So its range
8846 * is 0 to 2. Only when the count is 2 is something in the intersection.
8850 PERL_ARGS_ASSERT__INVLIST_INTERSECTION_MAYBE_COMPLEMENT_2ND;
8853 /* Special case if either one is empty */
8854 len_a = (a == NULL) ? 0 : _invlist_len(a);
8855 if ((len_a == 0) || ((len_b = _invlist_len(b)) == 0)) {
8856 bool make_temp = FALSE;
8858 if (len_a != 0 && complement_b) {
8860 /* Here, 'a' is not empty, therefore from the above 'if', 'b' must
8861 * be empty. Here, also we are using 'b's complement, which hence
8862 * must be every possible code point. Thus the intersection is
8866 if (! (make_temp = cBOOL(SvTEMP(b)))) {
8871 *i = invlist_clone(a);
8873 /* else *i is already 'a' */
8881 /* Here, 'a' or 'b' is empty and not using the complement of 'b'. The
8882 * intersection must be empty */
8884 if (! (make_temp = cBOOL(SvTEMP(a)))) {
8889 if (! (make_temp = cBOOL(SvTEMP(b)))) {
8893 *i = _new_invlist(0);
8901 /* Here both lists exist and are non-empty */
8902 array_a = invlist_array(a);
8903 array_b = invlist_array(b);
8905 /* If are to take the intersection of 'a' with the complement of b, set it
8906 * up so are looking at b's complement. */
8909 /* To complement, we invert: if the first element is 0, remove it. To
8910 * do this, we just pretend the array starts one later */
8911 if (array_b[0] == 0) {
8917 /* But if the first element is not zero, we pretend the list starts
8918 * at the 0 that is always stored immediately before the array. */
8924 /* Size the intersection for the worst case: that the intersection ends up
8925 * fragmenting everything to be completely disjoint */
8926 r= _new_invlist(len_a + len_b);
8928 /* Will contain U+0000 iff both components do */
8929 array_r = _invlist_array_init(r, len_a > 0 && array_a[0] == 0
8930 && len_b > 0 && array_b[0] == 0);
8932 /* Go through each list item by item, stopping when exhausted one of
8934 while (i_a < len_a && i_b < len_b) {
8935 UV cp; /* The element to potentially add to the intersection's
8937 bool cp_in_set; /* Is it in the input list's set or not */
8939 /* We need to take one or the other of the two inputs for the
8940 * intersection. Since we are merging two sorted lists, we take the
8941 * smaller of the next items. In case of a tie, we take the one that
8942 * is not in its set first (a difference from the union algorithm). If
8943 * we took one in the set first, it would increment the count, possibly
8944 * to 2 which would cause it to be output as starting a range in the
8945 * intersection, and the next time through we would take that same
8946 * number, and output it again as ending the set. By doing it the
8947 * opposite of this, there is no possibility that the count will be
8948 * momentarily incremented to 2. (In a tie and both are in the set or
8949 * both not in the set, it doesn't matter which we take first.) */
8950 if (array_a[i_a] < array_b[i_b]
8951 || (array_a[i_a] == array_b[i_b]
8952 && ! ELEMENT_RANGE_MATCHES_INVLIST(i_a)))
8954 cp_in_set = ELEMENT_RANGE_MATCHES_INVLIST(i_a);
8958 cp_in_set = ELEMENT_RANGE_MATCHES_INVLIST(i_b);
8962 /* Here, have chosen which of the two inputs to look at. Only output
8963 * if the running count changes to/from 2, which marks the
8964 * beginning/end of a range that's in the intersection */
8968 array_r[i_r++] = cp;
8973 array_r[i_r++] = cp;
8979 /* Here, we are finished going through at least one of the lists, which
8980 * means there is something remaining in at most one. We check if the list
8981 * that has been exhausted is positioned such that we are in the middle
8982 * of a range in its set or not. (i_a and i_b point to elements 1 beyond
8983 * the ones we care about.) There are four cases:
8984 * 1) Both weren't in their sets, count is 0, and remains 0. There's
8985 * nothing left in the intersection.
8986 * 2) Both were in their sets, count is 2 and perhaps is incremented to
8987 * above 2. What should be output is exactly that which is in the
8988 * non-exhausted set, as everything it has is also in the intersection
8989 * set, and everything it doesn't have can't be in the intersection
8990 * 3) The exhausted was in its set, non-exhausted isn't, count is 1, and
8991 * gets incremented to 2. Like the previous case, the intersection is
8992 * everything that remains in the non-exhausted set.
8993 * 4) the exhausted wasn't in its set, non-exhausted is, count is 1, and
8994 * remains 1. And the intersection has nothing more. */
8995 if ((i_a == len_a && PREV_RANGE_MATCHES_INVLIST(i_a))
8996 || (i_b == len_b && PREV_RANGE_MATCHES_INVLIST(i_b)))
9001 /* The final length is what we've output so far plus what else is in the
9002 * intersection. At most one of the subexpressions below will be non-zero
9006 len_r += (len_a - i_a) + (len_b - i_b);
9009 /* Set result to final length, which can change the pointer to array_r, so
9011 if (len_r != _invlist_len(r)) {
9012 invlist_set_len(r, len_r, *get_invlist_offset_addr(r));
9014 array_r = invlist_array(r);
9017 /* Finish outputting any remaining */
9018 if (count >= 2) { /* At most one will have a non-zero copy count */
9020 if ((copy_count = len_a - i_a) > 0) {
9021 Copy(array_a + i_a, array_r + i_r, copy_count, UV);
9023 else if ((copy_count = len_b - i_b) > 0) {
9024 Copy(array_b + i_b, array_r + i_r, copy_count, UV);
9028 /* We may be removing a reference to one of the inputs. If so, the output
9029 * is made mortal if the input was. (Mortal SVs shouldn't have their ref
9030 * count decremented) */
9031 if (a == *i || b == *i) {
9032 assert(! invlist_is_iterating(*i));
9037 SvREFCNT_dec_NN(*i);
9047 Perl__add_range_to_invlist(pTHX_ SV* invlist, const UV start, const UV end)
9049 /* Add the range from 'start' to 'end' inclusive to the inversion list's
9050 * set. A pointer to the inversion list is returned. This may actually be
9051 * a new list, in which case the passed in one has been destroyed. The
9052 * passed-in inversion list can be NULL, in which case a new one is created
9053 * with just the one range in it */
9058 if (invlist == NULL) {
9059 invlist = _new_invlist(2);
9063 len = _invlist_len(invlist);
9066 /* If comes after the final entry actually in the list, can just append it
9069 || (! ELEMENT_RANGE_MATCHES_INVLIST(len - 1)
9070 && start >= invlist_array(invlist)[len - 1]))
9072 _append_range_to_invlist(invlist, start, end);
9076 /* Here, can't just append things, create and return a new inversion list
9077 * which is the union of this range and the existing inversion list. (If
9078 * the new range is well-behaved wrt to the old one, we could just insert
9079 * it, doing a Move() down on the tail of the old one (potentially growing
9080 * it first). But to determine that means we would have the extra
9081 * (possibly throw-away) work of first finding where the new one goes and
9082 * whether it disrupts (splits) an existing range, so it doesn't appear to
9083 * me (khw) that it's worth it) */
9084 range_invlist = _new_invlist(2);
9085 _append_range_to_invlist(range_invlist, start, end);
9087 _invlist_union(invlist, range_invlist, &invlist);
9089 /* The temporary can be freed */
9090 SvREFCNT_dec_NN(range_invlist);
9096 Perl__setup_canned_invlist(pTHX_ const STRLEN size, const UV element0,
9097 UV** other_elements_ptr)
9099 /* Create and return an inversion list whose contents are to be populated
9100 * by the caller. The caller gives the number of elements (in 'size') and
9101 * the very first element ('element0'). This function will set
9102 * '*other_elements_ptr' to an array of UVs, where the remaining elements
9105 * Obviously there is some trust involved that the caller will properly
9106 * fill in the other elements of the array.
9108 * (The first element needs to be passed in, as the underlying code does
9109 * things differently depending on whether it is zero or non-zero) */
9111 SV* invlist = _new_invlist(size);
9114 PERL_ARGS_ASSERT__SETUP_CANNED_INVLIST;
9116 _append_range_to_invlist(invlist, element0, element0);
9117 offset = *get_invlist_offset_addr(invlist);
9119 invlist_set_len(invlist, size, offset);
9120 *other_elements_ptr = invlist_array(invlist) + 1;
9126 PERL_STATIC_INLINE SV*
9127 S_add_cp_to_invlist(pTHX_ SV* invlist, const UV cp) {
9128 return _add_range_to_invlist(invlist, cp, cp);
9131 #ifndef PERL_IN_XSUB_RE
9133 Perl__invlist_invert(pTHX_ SV* const invlist)
9135 /* Complement the input inversion list. This adds a 0 if the list didn't
9136 * have a zero; removes it otherwise. As described above, the data
9137 * structure is set up so that this is very efficient */
9139 PERL_ARGS_ASSERT__INVLIST_INVERT;
9141 assert(! invlist_is_iterating(invlist));
9143 /* The inverse of matching nothing is matching everything */
9144 if (_invlist_len(invlist) == 0) {
9145 _append_range_to_invlist(invlist, 0, UV_MAX);
9149 *get_invlist_offset_addr(invlist) = ! *get_invlist_offset_addr(invlist);
9154 PERL_STATIC_INLINE SV*
9155 S_invlist_clone(pTHX_ SV* const invlist)
9158 /* Return a new inversion list that is a copy of the input one, which is
9159 * unchanged. The new list will not be mortal even if the old one was. */
9161 /* Need to allocate extra space to accommodate Perl's addition of a
9162 * trailing NUL to SvPV's, since it thinks they are always strings */
9163 SV* new_invlist = _new_invlist(_invlist_len(invlist) + 1);
9164 STRLEN physical_length = SvCUR(invlist);
9165 bool offset = *(get_invlist_offset_addr(invlist));
9167 PERL_ARGS_ASSERT_INVLIST_CLONE;
9169 *(get_invlist_offset_addr(new_invlist)) = offset;
9170 invlist_set_len(new_invlist, _invlist_len(invlist), offset);
9171 Copy(SvPVX(invlist), SvPVX(new_invlist), physical_length, char);
9176 PERL_STATIC_INLINE STRLEN*
9177 S_get_invlist_iter_addr(SV* invlist)
9179 /* Return the address of the UV that contains the current iteration
9182 PERL_ARGS_ASSERT_GET_INVLIST_ITER_ADDR;
9184 assert(SvTYPE(invlist) == SVt_INVLIST);
9186 return &(((XINVLIST*) SvANY(invlist))->iterator);
9189 PERL_STATIC_INLINE void
9190 S_invlist_iterinit(SV* invlist) /* Initialize iterator for invlist */
9192 PERL_ARGS_ASSERT_INVLIST_ITERINIT;
9194 *get_invlist_iter_addr(invlist) = 0;
9197 PERL_STATIC_INLINE void
9198 S_invlist_iterfinish(SV* invlist)
9200 /* Terminate iterator for invlist. This is to catch development errors.
9201 * Any iteration that is interrupted before completed should call this
9202 * function. Functions that add code points anywhere else but to the end
9203 * of an inversion list assert that they are not in the middle of an
9204 * iteration. If they were, the addition would make the iteration
9205 * problematical: if the iteration hadn't reached the place where things
9206 * were being added, it would be ok */
9208 PERL_ARGS_ASSERT_INVLIST_ITERFINISH;
9210 *get_invlist_iter_addr(invlist) = (STRLEN) UV_MAX;
9214 S_invlist_iternext(SV* invlist, UV* start, UV* end)
9216 /* An C<invlist_iterinit> call on <invlist> must be used to set this up.
9217 * This call sets in <*start> and <*end>, the next range in <invlist>.
9218 * Returns <TRUE> if successful and the next call will return the next
9219 * range; <FALSE> if was already at the end of the list. If the latter,
9220 * <*start> and <*end> are unchanged, and the next call to this function
9221 * will start over at the beginning of the list */
9223 STRLEN* pos = get_invlist_iter_addr(invlist);
9224 UV len = _invlist_len(invlist);
9227 PERL_ARGS_ASSERT_INVLIST_ITERNEXT;
9230 *pos = (STRLEN) UV_MAX; /* Force iterinit() to be required next time */
9234 array = invlist_array(invlist);
9236 *start = array[(*pos)++];
9242 *end = array[(*pos)++] - 1;
9248 PERL_STATIC_INLINE UV
9249 S_invlist_highest(SV* const invlist)
9251 /* Returns the highest code point that matches an inversion list. This API
9252 * has an ambiguity, as it returns 0 under either the highest is actually
9253 * 0, or if the list is empty. If this distinction matters to you, check
9254 * for emptiness before calling this function */
9256 UV len = _invlist_len(invlist);
9259 PERL_ARGS_ASSERT_INVLIST_HIGHEST;
9265 array = invlist_array(invlist);
9267 /* The last element in the array in the inversion list always starts a
9268 * range that goes to infinity. That range may be for code points that are
9269 * matched in the inversion list, or it may be for ones that aren't
9270 * matched. In the latter case, the highest code point in the set is one
9271 * less than the beginning of this range; otherwise it is the final element
9272 * of this range: infinity */
9273 return (ELEMENT_RANGE_MATCHES_INVLIST(len - 1))
9275 : array[len - 1] - 1;
9278 #ifndef PERL_IN_XSUB_RE
9280 Perl__invlist_contents(pTHX_ SV* const invlist)
9282 /* Get the contents of an inversion list into a string SV so that they can
9283 * be printed out. It uses the format traditionally done for debug tracing
9287 SV* output = newSVpvs("\n");
9289 PERL_ARGS_ASSERT__INVLIST_CONTENTS;
9291 assert(! invlist_is_iterating(invlist));
9293 invlist_iterinit(invlist);
9294 while (invlist_iternext(invlist, &start, &end)) {
9295 if (end == UV_MAX) {
9296 Perl_sv_catpvf(aTHX_ output, "%04"UVXf"\tINFINITY\n", start);
9298 else if (end != start) {
9299 Perl_sv_catpvf(aTHX_ output, "%04"UVXf"\t%04"UVXf"\n",
9303 Perl_sv_catpvf(aTHX_ output, "%04"UVXf"\n", start);
9311 #ifndef PERL_IN_XSUB_RE
9313 Perl__invlist_dump(pTHX_ PerlIO *file, I32 level,
9314 const char * const indent, SV* const invlist)
9316 /* Designed to be called only by do_sv_dump(). Dumps out the ranges of the
9317 * inversion list 'invlist' to 'file' at 'level' Each line is prefixed by
9318 * the string 'indent'. The output looks like this:
9319 [0] 0x000A .. 0x000D
9321 [4] 0x2028 .. 0x2029
9322 [6] 0x3104 .. INFINITY
9323 * This means that the first range of code points matched by the list are
9324 * 0xA through 0xD; the second range contains only the single code point
9325 * 0x85, etc. An inversion list is an array of UVs. Two array elements
9326 * are used to define each range (except if the final range extends to
9327 * infinity, only a single element is needed). The array index of the
9328 * first element for the corresponding range is given in brackets. */
9333 PERL_ARGS_ASSERT__INVLIST_DUMP;
9335 if (invlist_is_iterating(invlist)) {
9336 Perl_dump_indent(aTHX_ level, file,
9337 "%sCan't dump inversion list because is in middle of iterating\n",
9342 invlist_iterinit(invlist);
9343 while (invlist_iternext(invlist, &start, &end)) {
9344 if (end == UV_MAX) {
9345 Perl_dump_indent(aTHX_ level, file,
9346 "%s[%"UVuf"] 0x%04"UVXf" .. INFINITY\n",
9347 indent, (UV)count, start);
9349 else if (end != start) {
9350 Perl_dump_indent(aTHX_ level, file,
9351 "%s[%"UVuf"] 0x%04"UVXf" .. 0x%04"UVXf"\n",
9352 indent, (UV)count, start, end);
9355 Perl_dump_indent(aTHX_ level, file, "%s[%"UVuf"] 0x%04"UVXf"\n",
9356 indent, (UV)count, start);
9363 Perl__load_PL_utf8_foldclosures (pTHX)
9365 assert(! PL_utf8_foldclosures);
9367 /* If the folds haven't been read in, call a fold function
9369 if (! PL_utf8_tofold) {
9370 U8 dummy[UTF8_MAXBYTES_CASE+1];
9372 /* This string is just a short named one above \xff */
9373 to_utf8_fold((U8*) HYPHEN_UTF8, dummy, NULL);
9374 assert(PL_utf8_tofold); /* Verify that worked */
9376 PL_utf8_foldclosures = _swash_inversion_hash(PL_utf8_tofold);
9380 #ifdef PERL_ARGS_ASSERT__INVLISTEQ
9382 S__invlistEQ(pTHX_ SV* const a, SV* const b, const bool complement_b)
9384 /* Return a boolean as to if the two passed in inversion lists are
9385 * identical. The final argument, if TRUE, says to take the complement of
9386 * the second inversion list before doing the comparison */
9388 const UV* array_a = invlist_array(a);
9389 const UV* array_b = invlist_array(b);
9390 UV len_a = _invlist_len(a);
9391 UV len_b = _invlist_len(b);
9393 UV i = 0; /* current index into the arrays */
9394 bool retval = TRUE; /* Assume are identical until proven otherwise */
9396 PERL_ARGS_ASSERT__INVLISTEQ;
9398 /* If are to compare 'a' with the complement of b, set it
9399 * up so are looking at b's complement. */
9402 /* The complement of nothing is everything, so <a> would have to have
9403 * just one element, starting at zero (ending at infinity) */
9405 return (len_a == 1 && array_a[0] == 0);
9407 else if (array_b[0] == 0) {
9409 /* Otherwise, to complement, we invert. Here, the first element is
9410 * 0, just remove it. To do this, we just pretend the array starts
9418 /* But if the first element is not zero, we pretend the list starts
9419 * at the 0 that is always stored immediately before the array. */
9425 /* Make sure that the lengths are the same, as well as the final element
9426 * before looping through the remainder. (Thus we test the length, final,
9427 * and first elements right off the bat) */
9428 if (len_a != len_b || array_a[len_a-1] != array_b[len_a-1]) {
9431 else for (i = 0; i < len_a - 1; i++) {
9432 if (array_a[i] != array_b[i]) {
9443 * As best we can, determine the characters that can match the start of
9444 * the given EXACTF-ish node.
9446 * Returns the invlist as a new SV*; it is the caller's responsibility to
9447 * call SvREFCNT_dec() when done with it.
9450 S__make_exactf_invlist(pTHX_ RExC_state_t *pRExC_state, regnode *node)
9452 const U8 * s = (U8*)STRING(node);
9453 SSize_t bytelen = STR_LEN(node);
9455 /* Start out big enough for 2 separate code points */
9456 SV* invlist = _new_invlist(4);
9458 PERL_ARGS_ASSERT__MAKE_EXACTF_INVLIST;
9463 /* We punt and assume can match anything if the node begins
9464 * with a multi-character fold. Things are complicated. For
9465 * example, /ffi/i could match any of:
9466 * "\N{LATIN SMALL LIGATURE FFI}"
9467 * "\N{LATIN SMALL LIGATURE FF}I"
9468 * "F\N{LATIN SMALL LIGATURE FI}"
9469 * plus several other things; and making sure we have all the
9470 * possibilities is hard. */
9471 if (is_MULTI_CHAR_FOLD_latin1_safe(s, s + bytelen)) {
9472 invlist = _add_range_to_invlist(invlist, 0, UV_MAX);
9475 /* Any Latin1 range character can potentially match any
9476 * other depending on the locale */
9477 if (OP(node) == EXACTFL) {
9478 _invlist_union(invlist, PL_Latin1, &invlist);
9481 /* But otherwise, it matches at least itself. We can
9482 * quickly tell if it has a distinct fold, and if so,
9483 * it matches that as well */
9484 invlist = add_cp_to_invlist(invlist, uc);
9485 if (IS_IN_SOME_FOLD_L1(uc))
9486 invlist = add_cp_to_invlist(invlist, PL_fold_latin1[uc]);
9489 /* Some characters match above-Latin1 ones under /i. This
9490 * is true of EXACTFL ones when the locale is UTF-8 */
9491 if (HAS_NONLATIN1_SIMPLE_FOLD_CLOSURE(uc)
9492 && (! isASCII(uc) || (OP(node) != EXACTFA
9493 && OP(node) != EXACTFA_NO_TRIE)))
9495 add_above_Latin1_folds(pRExC_state, (U8) uc, &invlist);
9499 else { /* Pattern is UTF-8 */
9500 U8 folded[UTF8_MAX_FOLD_CHAR_EXPAND * UTF8_MAXBYTES_CASE + 1] = { '\0' };
9501 STRLEN foldlen = UTF8SKIP(s);
9502 const U8* e = s + bytelen;
9505 uc = utf8_to_uvchr_buf(s, s + bytelen, NULL);
9507 /* The only code points that aren't folded in a UTF EXACTFish
9508 * node are are the problematic ones in EXACTFL nodes */
9509 if (OP(node) == EXACTFL && is_PROBLEMATIC_LOCALE_FOLDEDS_START_cp(uc)) {
9510 /* We need to check for the possibility that this EXACTFL
9511 * node begins with a multi-char fold. Therefore we fold
9512 * the first few characters of it so that we can make that
9517 for (i = 0; i < UTF8_MAX_FOLD_CHAR_EXPAND && s < e; i++) {
9519 *(d++) = (U8) toFOLD(*s);
9524 to_utf8_fold(s, d, &len);
9530 /* And set up so the code below that looks in this folded
9531 * buffer instead of the node's string */
9533 foldlen = UTF8SKIP(folded);
9537 /* When we reach here 's' points to the fold of the first
9538 * character(s) of the node; and 'e' points to far enough along
9539 * the folded string to be just past any possible multi-char
9540 * fold. 'foldlen' is the length in bytes of the first
9543 * Unlike the non-UTF-8 case, the macro for determining if a
9544 * string is a multi-char fold requires all the characters to
9545 * already be folded. This is because of all the complications
9546 * if not. Note that they are folded anyway, except in EXACTFL
9547 * nodes. Like the non-UTF case above, we punt if the node
9548 * begins with a multi-char fold */
9550 if (is_MULTI_CHAR_FOLD_utf8_safe(s, e)) {
9551 invlist = _add_range_to_invlist(invlist, 0, UV_MAX);
9553 else { /* Single char fold */
9555 /* It matches all the things that fold to it, which are
9556 * found in PL_utf8_foldclosures (including itself) */
9557 invlist = add_cp_to_invlist(invlist, uc);
9558 if (! PL_utf8_foldclosures)
9559 _load_PL_utf8_foldclosures();
9560 if ((listp = hv_fetch(PL_utf8_foldclosures,
9561 (char *) s, foldlen, FALSE)))
9563 AV* list = (AV*) *listp;
9565 for (k = 0; k <= av_tindex(list); k++) {
9566 SV** c_p = av_fetch(list, k, FALSE);
9572 /* /aa doesn't allow folds between ASCII and non- */
9573 if ((OP(node) == EXACTFA || OP(node) == EXACTFA_NO_TRIE)
9574 && isASCII(c) != isASCII(uc))
9579 invlist = add_cp_to_invlist(invlist, c);
9588 #undef HEADER_LENGTH
9589 #undef TO_INTERNAL_SIZE
9590 #undef FROM_INTERNAL_SIZE
9591 #undef INVLIST_VERSION_ID
9593 /* End of inversion list object */
9596 S_parse_lparen_question_flags(pTHX_ RExC_state_t *pRExC_state)
9598 /* This parses the flags that are in either the '(?foo)' or '(?foo:bar)'
9599 * constructs, and updates RExC_flags with them. On input, RExC_parse
9600 * should point to the first flag; it is updated on output to point to the
9601 * final ')' or ':'. There needs to be at least one flag, or this will
9604 /* for (?g), (?gc), and (?o) warnings; warning
9605 about (?c) will warn about (?g) -- japhy */
9607 #define WASTED_O 0x01
9608 #define WASTED_G 0x02
9609 #define WASTED_C 0x04
9610 #define WASTED_GC (WASTED_G|WASTED_C)
9611 I32 wastedflags = 0x00;
9612 U32 posflags = 0, negflags = 0;
9613 U32 *flagsp = &posflags;
9614 char has_charset_modifier = '\0';
9616 bool has_use_defaults = FALSE;
9617 const char* const seqstart = RExC_parse - 1; /* Point to the '?' */
9618 int x_mod_count = 0;
9620 PERL_ARGS_ASSERT_PARSE_LPAREN_QUESTION_FLAGS;
9622 /* '^' as an initial flag sets certain defaults */
9623 if (UCHARAT(RExC_parse) == '^') {
9625 has_use_defaults = TRUE;
9626 STD_PMMOD_FLAGS_CLEAR(&RExC_flags);
9627 set_regex_charset(&RExC_flags, (RExC_utf8 || RExC_uni_semantics)
9628 ? REGEX_UNICODE_CHARSET
9629 : REGEX_DEPENDS_CHARSET);
9632 cs = get_regex_charset(RExC_flags);
9633 if (cs == REGEX_DEPENDS_CHARSET
9634 && (RExC_utf8 || RExC_uni_semantics))
9636 cs = REGEX_UNICODE_CHARSET;
9639 while (*RExC_parse) {
9640 /* && strchr("iogcmsx", *RExC_parse) */
9641 /* (?g), (?gc) and (?o) are useless here
9642 and must be globally applied -- japhy */
9643 switch (*RExC_parse) {
9645 /* Code for the imsxn flags */
9646 CASE_STD_PMMOD_FLAGS_PARSE_SET(flagsp, x_mod_count);
9648 case LOCALE_PAT_MOD:
9649 if (has_charset_modifier) {
9650 goto excess_modifier;
9652 else if (flagsp == &negflags) {
9655 cs = REGEX_LOCALE_CHARSET;
9656 has_charset_modifier = LOCALE_PAT_MOD;
9658 case UNICODE_PAT_MOD:
9659 if (has_charset_modifier) {
9660 goto excess_modifier;
9662 else if (flagsp == &negflags) {
9665 cs = REGEX_UNICODE_CHARSET;
9666 has_charset_modifier = UNICODE_PAT_MOD;
9668 case ASCII_RESTRICT_PAT_MOD:
9669 if (flagsp == &negflags) {
9672 if (has_charset_modifier) {
9673 if (cs != REGEX_ASCII_RESTRICTED_CHARSET) {
9674 goto excess_modifier;
9676 /* Doubled modifier implies more restricted */
9677 cs = REGEX_ASCII_MORE_RESTRICTED_CHARSET;
9680 cs = REGEX_ASCII_RESTRICTED_CHARSET;
9682 has_charset_modifier = ASCII_RESTRICT_PAT_MOD;
9684 case DEPENDS_PAT_MOD:
9685 if (has_use_defaults) {
9686 goto fail_modifiers;
9688 else if (flagsp == &negflags) {
9691 else if (has_charset_modifier) {
9692 goto excess_modifier;
9695 /* The dual charset means unicode semantics if the
9696 * pattern (or target, not known until runtime) are
9697 * utf8, or something in the pattern indicates unicode
9699 cs = (RExC_utf8 || RExC_uni_semantics)
9700 ? REGEX_UNICODE_CHARSET
9701 : REGEX_DEPENDS_CHARSET;
9702 has_charset_modifier = DEPENDS_PAT_MOD;
9706 if (has_charset_modifier == ASCII_RESTRICT_PAT_MOD) {
9707 vFAIL2("Regexp modifier \"%c\" may appear a maximum of twice", ASCII_RESTRICT_PAT_MOD);
9709 else if (has_charset_modifier == *(RExC_parse - 1)) {
9710 vFAIL2("Regexp modifier \"%c\" may not appear twice",
9714 vFAIL3("Regexp modifiers \"%c\" and \"%c\" are mutually exclusive", has_charset_modifier, *(RExC_parse - 1));
9716 NOT_REACHED; /*NOTREACHED*/
9719 vFAIL2("Regexp modifier \"%c\" may not appear after the \"-\"",
9721 NOT_REACHED; /*NOTREACHED*/
9722 case ONCE_PAT_MOD: /* 'o' */
9723 case GLOBAL_PAT_MOD: /* 'g' */
9724 if (PASS2 && ckWARN(WARN_REGEXP)) {
9725 const I32 wflagbit = *RExC_parse == 'o'
9728 if (! (wastedflags & wflagbit) ) {
9729 wastedflags |= wflagbit;
9730 /* diag_listed_as: Useless (?-%s) - don't use /%s modifier in regex; marked by <-- HERE in m/%s/ */
9733 "Useless (%s%c) - %suse /%c modifier",
9734 flagsp == &negflags ? "?-" : "?",
9736 flagsp == &negflags ? "don't " : "",
9743 case CONTINUE_PAT_MOD: /* 'c' */
9744 if (PASS2 && ckWARN(WARN_REGEXP)) {
9745 if (! (wastedflags & WASTED_C) ) {
9746 wastedflags |= WASTED_GC;
9747 /* diag_listed_as: Useless (?-%s) - don't use /%s modifier in regex; marked by <-- HERE in m/%s/ */
9750 "Useless (%sc) - %suse /gc modifier",
9751 flagsp == &negflags ? "?-" : "?",
9752 flagsp == &negflags ? "don't " : ""
9757 case KEEPCOPY_PAT_MOD: /* 'p' */
9758 if (flagsp == &negflags) {
9760 ckWARNreg(RExC_parse + 1,"Useless use of (?-p)");
9762 *flagsp |= RXf_PMf_KEEPCOPY;
9766 /* A flag is a default iff it is following a minus, so
9767 * if there is a minus, it means will be trying to
9768 * re-specify a default which is an error */
9769 if (has_use_defaults || flagsp == &negflags) {
9770 goto fail_modifiers;
9773 wastedflags = 0; /* reset so (?g-c) warns twice */
9777 RExC_flags |= posflags;
9778 RExC_flags &= ~negflags;
9779 set_regex_charset(&RExC_flags, cs);
9780 if (RExC_flags & RXf_PMf_FOLD) {
9781 RExC_contains_i = 1;
9784 STD_PMMOD_FLAGS_PARSE_X_WARN(x_mod_count);
9790 RExC_parse += SKIP_IF_CHAR(RExC_parse);
9791 /* diag_listed_as: Sequence (?%s...) not recognized in regex; marked by <-- HERE in m/%s/ */
9792 vFAIL2utf8f("Sequence (%"UTF8f"...) not recognized",
9793 UTF8fARG(UTF, RExC_parse-seqstart, seqstart));
9794 NOT_REACHED; /*NOTREACHED*/
9801 STD_PMMOD_FLAGS_PARSE_X_WARN(x_mod_count);
9806 - reg - regular expression, i.e. main body or parenthesized thing
9808 * Caller must absorb opening parenthesis.
9810 * Combining parenthesis handling with the base level of regular expression
9811 * is a trifle forced, but the need to tie the tails of the branches to what
9812 * follows makes it hard to avoid.
9814 #define REGTAIL(x,y,z) regtail((x),(y),(z),depth+1)
9816 #define REGTAIL_STUDY(x,y,z) regtail_study((x),(y),(z),depth+1)
9818 #define REGTAIL_STUDY(x,y,z) regtail((x),(y),(z),depth+1)
9821 /* Returns NULL, setting *flagp to TRYAGAIN at the end of (?) that only sets
9822 flags. Returns NULL, setting *flagp to RESTART_PASS1 if the sizing scan
9823 needs to be restarted, or'd with NEED_UTF8 if the pattern needs to be
9824 upgraded to UTF-8. Otherwise would only return NULL if regbranch() returns
9825 NULL, which cannot happen. */
9827 S_reg(pTHX_ RExC_state_t *pRExC_state, I32 paren, I32 *flagp,U32 depth)
9828 /* paren: Parenthesized? 0=top; 1,2=inside '(': changed to letter.
9829 * 2 is like 1, but indicates that nextchar() has been called to advance
9830 * RExC_parse beyond the '('. Things like '(?' are indivisible tokens, and
9831 * this flag alerts us to the need to check for that */
9833 regnode *ret; /* Will be the head of the group. */
9836 regnode *ender = NULL;
9839 U32 oregflags = RExC_flags;
9840 bool have_branch = 0;
9842 I32 freeze_paren = 0;
9843 I32 after_freeze = 0;
9844 I32 num; /* numeric backreferences */
9846 char * parse_start = RExC_parse; /* MJD */
9847 char * const oregcomp_parse = RExC_parse;
9849 GET_RE_DEBUG_FLAGS_DECL;
9851 PERL_ARGS_ASSERT_REG;
9852 DEBUG_PARSE("reg ");
9854 *flagp = 0; /* Tentatively. */
9857 /* Make an OPEN node, if parenthesized. */
9860 /* Under /x, space and comments can be gobbled up between the '(' and
9861 * here (if paren ==2). The forms '(*VERB' and '(?...' disallow such
9862 * intervening space, as the sequence is a token, and a token should be
9864 bool has_intervening_patws = paren == 2 && *(RExC_parse - 1) != '(';
9866 if ( *RExC_parse == '*') { /* (*VERB:ARG) */
9867 char *start_verb = RExC_parse;
9868 STRLEN verb_len = 0;
9869 char *start_arg = NULL;
9870 unsigned char op = 0;
9871 int arg_required = 0;
9872 int internal_argval = -1; /* if >-1 we are not allowed an argument*/
9874 if (has_intervening_patws) {
9876 vFAIL("In '(*VERB...)', the '(' and '*' must be adjacent");
9878 while ( *RExC_parse && *RExC_parse != ')' ) {
9879 if ( *RExC_parse == ':' ) {
9880 start_arg = RExC_parse + 1;
9886 verb_len = RExC_parse - start_verb;
9889 while ( *RExC_parse && *RExC_parse != ')' )
9891 if ( *RExC_parse != ')' )
9892 vFAIL("Unterminated verb pattern argument");
9893 if ( RExC_parse == start_arg )
9896 if ( *RExC_parse != ')' )
9897 vFAIL("Unterminated verb pattern");
9900 switch ( *start_verb ) {
9901 case 'A': /* (*ACCEPT) */
9902 if ( memEQs(start_verb,verb_len,"ACCEPT") ) {
9904 internal_argval = RExC_nestroot;
9907 case 'C': /* (*COMMIT) */
9908 if ( memEQs(start_verb,verb_len,"COMMIT") )
9911 case 'F': /* (*FAIL) */
9912 if ( verb_len==1 || memEQs(start_verb,verb_len,"FAIL") ) {
9916 case ':': /* (*:NAME) */
9917 case 'M': /* (*MARK:NAME) */
9918 if ( verb_len==0 || memEQs(start_verb,verb_len,"MARK") ) {
9923 case 'P': /* (*PRUNE) */
9924 if ( memEQs(start_verb,verb_len,"PRUNE") )
9927 case 'S': /* (*SKIP) */
9928 if ( memEQs(start_verb,verb_len,"SKIP") )
9931 case 'T': /* (*THEN) */
9932 /* [19:06] <TimToady> :: is then */
9933 if ( memEQs(start_verb,verb_len,"THEN") ) {
9935 RExC_seen |= REG_CUTGROUP_SEEN;
9940 RExC_parse += UTF ? UTF8SKIP(RExC_parse) : 1;
9942 "Unknown verb pattern '%"UTF8f"'",
9943 UTF8fARG(UTF, verb_len, start_verb));
9945 if ( arg_required && !start_arg ) {
9946 vFAIL3("Verb pattern '%.*s' has a mandatory argument",
9947 verb_len, start_verb);
9949 if (internal_argval == -1) {
9950 ret = reganode(pRExC_state, op, 0);
9952 ret = reg2Lanode(pRExC_state, op, 0, internal_argval);
9954 RExC_seen |= REG_VERBARG_SEEN;
9955 if ( ! SIZE_ONLY ) {
9957 SV *sv = newSVpvn( start_arg,
9958 RExC_parse - start_arg);
9959 ARG(ret) = add_data( pRExC_state,
9961 RExC_rxi->data->data[ARG(ret)]=(void*)sv;
9966 if ( internal_argval != -1 )
9967 ARG2L_SET(ret, internal_argval);
9969 nextchar(pRExC_state);
9972 else if (*RExC_parse == '?') { /* (?...) */
9973 bool is_logical = 0;
9974 const char * const seqstart = RExC_parse;
9975 const char * endptr;
9976 if (has_intervening_patws) {
9978 vFAIL("In '(?...)', the '(' and '?' must be adjacent");
9982 paren = *RExC_parse++;
9983 ret = NULL; /* For look-ahead/behind. */
9986 case 'P': /* (?P...) variants for those used to PCRE/Python */
9987 paren = *RExC_parse++;
9988 if ( paren == '<') /* (?P<...>) named capture */
9990 else if (paren == '>') { /* (?P>name) named recursion */
9991 goto named_recursion;
9993 else if (paren == '=') { /* (?P=...) named backref */
9994 /* this pretty much dupes the code for \k<NAME> in
9995 * regatom(), if you change this make sure you change that
9997 char* name_start = RExC_parse;
9999 SV *sv_dat = reg_scan_name(pRExC_state,
10000 SIZE_ONLY ? REG_RSN_RETURN_NULL : REG_RSN_RETURN_DATA);
10001 if (RExC_parse == name_start || *RExC_parse != ')')
10002 /* diag_listed_as: Sequence ?P=... not terminated in regex; marked by <-- HERE in m/%s/ */
10003 vFAIL2("Sequence %.3s... not terminated",parse_start);
10006 num = add_data( pRExC_state, STR_WITH_LEN("S"));
10007 RExC_rxi->data->data[num]=(void*)sv_dat;
10008 SvREFCNT_inc_simple_void(sv_dat);
10011 ret = reganode(pRExC_state,
10014 : (ASCII_FOLD_RESTRICTED)
10016 : (AT_LEAST_UNI_SEMANTICS)
10022 *flagp |= HASWIDTH;
10024 Set_Node_Offset(ret, parse_start+1);
10025 Set_Node_Cur_Length(ret, parse_start);
10027 nextchar(pRExC_state);
10031 RExC_parse += SKIP_IF_CHAR(RExC_parse);
10032 /* diag_listed_as: Sequence (?%s...) not recognized in regex; marked by <-- HERE in m/%s/ */
10033 vFAIL3("Sequence (%.*s...) not recognized",
10034 RExC_parse-seqstart, seqstart);
10035 NOT_REACHED; /*NOTREACHED*/
10036 case '<': /* (?<...) */
10037 if (*RExC_parse == '!')
10039 else if (*RExC_parse != '=')
10045 case '\'': /* (?'...') */
10046 name_start= RExC_parse;
10047 svname = reg_scan_name(pRExC_state,
10048 SIZE_ONLY /* reverse test from the others */
10049 ? REG_RSN_RETURN_NAME
10050 : REG_RSN_RETURN_NULL);
10051 if (RExC_parse == name_start || *RExC_parse != paren)
10052 vFAIL2("Sequence (?%c... not terminated",
10053 paren=='>' ? '<' : paren);
10057 if (!svname) /* shouldn't happen */
10059 "panic: reg_scan_name returned NULL");
10060 if (!RExC_paren_names) {
10061 RExC_paren_names= newHV();
10062 sv_2mortal(MUTABLE_SV(RExC_paren_names));
10064 RExC_paren_name_list= newAV();
10065 sv_2mortal(MUTABLE_SV(RExC_paren_name_list));
10068 he_str = hv_fetch_ent( RExC_paren_names, svname, 1, 0 );
10070 sv_dat = HeVAL(he_str);
10072 /* croak baby croak */
10074 "panic: paren_name hash element allocation failed");
10075 } else if ( SvPOK(sv_dat) ) {
10076 /* (?|...) can mean we have dupes so scan to check
10077 its already been stored. Maybe a flag indicating
10078 we are inside such a construct would be useful,
10079 but the arrays are likely to be quite small, so
10080 for now we punt -- dmq */
10081 IV count = SvIV(sv_dat);
10082 I32 *pv = (I32*)SvPVX(sv_dat);
10084 for ( i = 0 ; i < count ; i++ ) {
10085 if ( pv[i] == RExC_npar ) {
10091 pv = (I32*)SvGROW(sv_dat,
10092 SvCUR(sv_dat) + sizeof(I32)+1);
10093 SvCUR_set(sv_dat, SvCUR(sv_dat) + sizeof(I32));
10094 pv[count] = RExC_npar;
10095 SvIV_set(sv_dat, SvIVX(sv_dat) + 1);
10098 (void)SvUPGRADE(sv_dat,SVt_PVNV);
10099 sv_setpvn(sv_dat, (char *)&(RExC_npar),
10102 SvIV_set(sv_dat, 1);
10105 /* Yes this does cause a memory leak in debugging Perls
10107 if (!av_store(RExC_paren_name_list,
10108 RExC_npar, SvREFCNT_inc(svname)))
10109 SvREFCNT_dec_NN(svname);
10112 /*sv_dump(sv_dat);*/
10114 nextchar(pRExC_state);
10116 goto capturing_parens;
10118 RExC_seen |= REG_LOOKBEHIND_SEEN;
10119 RExC_in_lookbehind++;
10122 case '=': /* (?=...) */
10123 RExC_seen_zerolen++;
10125 case '!': /* (?!...) */
10126 RExC_seen_zerolen++;
10127 /* check if we're really just a "FAIL" assertion */
10128 skip_to_be_ignored_text(pRExC_state, &RExC_parse,
10129 FALSE /* Don't force to /x */ );
10130 if (*RExC_parse == ')') {
10131 ret=reganode(pRExC_state, OPFAIL, 0);
10132 nextchar(pRExC_state);
10136 case '|': /* (?|...) */
10137 /* branch reset, behave like a (?:...) except that
10138 buffers in alternations share the same numbers */
10140 after_freeze = freeze_paren = RExC_npar;
10142 case ':': /* (?:...) */
10143 case '>': /* (?>...) */
10145 case '$': /* (?$...) */
10146 case '@': /* (?@...) */
10147 vFAIL2("Sequence (?%c...) not implemented", (int)paren);
10149 case '0' : /* (?0) */
10150 case 'R' : /* (?R) */
10151 if (*RExC_parse != ')')
10152 FAIL("Sequence (?R) not terminated");
10153 ret = reg_node(pRExC_state, GOSTART);
10154 RExC_seen |= REG_GOSTART_SEEN;
10155 *flagp |= POSTPONED;
10156 nextchar(pRExC_state);
10159 /* named and numeric backreferences */
10160 case '&': /* (?&NAME) */
10161 parse_start = RExC_parse - 1;
10164 SV *sv_dat = reg_scan_name(pRExC_state,
10165 SIZE_ONLY ? REG_RSN_RETURN_NULL : REG_RSN_RETURN_DATA);
10166 num = sv_dat ? *((I32 *)SvPVX(sv_dat)) : 0;
10168 if (RExC_parse == RExC_end || *RExC_parse != ')')
10169 vFAIL("Sequence (?&... not terminated");
10170 goto gen_recurse_regop;
10173 if (!(RExC_parse[0] >= '1' && RExC_parse[0] <= '9')) {
10175 vFAIL("Illegal pattern");
10177 goto parse_recursion;
10179 case '-': /* (?-1) */
10180 if (!(RExC_parse[0] >= '1' && RExC_parse[0] <= '9')) {
10181 RExC_parse--; /* rewind to let it be handled later */
10185 case '1': case '2': case '3': case '4': /* (?1) */
10186 case '5': case '6': case '7': case '8': case '9':
10190 bool is_neg = FALSE;
10192 parse_start = RExC_parse - 1; /* MJD */
10193 if (*RExC_parse == '-') {
10197 if (grok_atoUV(RExC_parse, &unum, &endptr)
10201 RExC_parse = (char*)endptr;
10205 /* Some limit for num? */
10209 if (*RExC_parse!=')')
10210 vFAIL("Expecting close bracket");
10213 if ( paren == '-' ) {
10215 Diagram of capture buffer numbering.
10216 Top line is the normal capture buffer numbers
10217 Bottom line is the negative indexing as from
10221 /(a(x)y)(a(b(c(?-2)d)e)f)(g(h))/
10225 num = RExC_npar + num;
10228 vFAIL("Reference to nonexistent group");
10230 } else if ( paren == '+' ) {
10231 num = RExC_npar + num - 1;
10234 ret = reg2Lanode(pRExC_state, GOSUB, num, RExC_recurse_count);
10236 if (num > (I32)RExC_rx->nparens) {
10238 vFAIL("Reference to nonexistent group");
10240 RExC_recurse_count++;
10241 DEBUG_OPTIMISE_MORE_r(PerlIO_printf(Perl_debug_log,
10242 "%*s%*s Recurse #%"UVuf" to %"IVdf"\n",
10243 22, "| |", (int)(depth * 2 + 1), "",
10244 (UV)ARG(ret), (IV)ARG2L(ret)));
10246 RExC_seen |= REG_RECURSE_SEEN;
10247 Set_Node_Length(ret, 1 + regarglen[OP(ret)]); /* MJD */
10248 Set_Node_Offset(ret, parse_start); /* MJD */
10250 *flagp |= POSTPONED;
10251 nextchar(pRExC_state);
10256 case '?': /* (??...) */
10258 if (*RExC_parse != '{') {
10259 RExC_parse += SKIP_IF_CHAR(RExC_parse);
10260 /* diag_listed_as: Sequence (?%s...) not recognized in regex; marked by <-- HERE in m/%s/ */
10262 "Sequence (%"UTF8f"...) not recognized",
10263 UTF8fARG(UTF, RExC_parse-seqstart, seqstart));
10264 NOT_REACHED; /*NOTREACHED*/
10266 *flagp |= POSTPONED;
10267 paren = *RExC_parse++;
10269 case '{': /* (?{...}) */
10272 struct reg_code_block *cb;
10274 RExC_seen_zerolen++;
10276 if ( !pRExC_state->num_code_blocks
10277 || pRExC_state->code_index >= pRExC_state->num_code_blocks
10278 || pRExC_state->code_blocks[pRExC_state->code_index].start
10279 != (STRLEN)((RExC_parse -3 - (is_logical ? 1 : 0))
10282 if (RExC_pm_flags & PMf_USE_RE_EVAL)
10283 FAIL("panic: Sequence (?{...}): no code block found\n");
10284 FAIL("Eval-group not allowed at runtime, use re 'eval'");
10286 /* this is a pre-compiled code block (?{...}) */
10287 cb = &pRExC_state->code_blocks[pRExC_state->code_index];
10288 RExC_parse = RExC_start + cb->end;
10291 if (cb->src_regex) {
10292 n = add_data(pRExC_state, STR_WITH_LEN("rl"));
10293 RExC_rxi->data->data[n] =
10294 (void*)SvREFCNT_inc((SV*)cb->src_regex);
10295 RExC_rxi->data->data[n+1] = (void*)o;
10298 n = add_data(pRExC_state,
10299 (RExC_pm_flags & PMf_HAS_CV) ? "L" : "l", 1);
10300 RExC_rxi->data->data[n] = (void*)o;
10303 pRExC_state->code_index++;
10304 nextchar(pRExC_state);
10308 ret = reg_node(pRExC_state, LOGICAL);
10310 eval = reg2Lanode(pRExC_state, EVAL,
10313 /* for later propagation into (??{})
10315 RExC_flags & RXf_PMf_COMPILETIME
10320 REGTAIL(pRExC_state, ret, eval);
10321 /* deal with the length of this later - MJD */
10324 ret = reg2Lanode(pRExC_state, EVAL, n, 0);
10325 Set_Node_Length(ret, RExC_parse - parse_start + 1);
10326 Set_Node_Offset(ret, parse_start);
10329 case '(': /* (?(?{...})...) and (?(?=...)...) */
10332 const int DEFINE_len = sizeof("DEFINE") - 1;
10333 if (RExC_parse[0] == '?') { /* (?(?...)) */
10335 RExC_parse[1] == '=' ||
10336 RExC_parse[1] == '!' ||
10337 RExC_parse[1] == '<' ||
10338 RExC_parse[1] == '{'
10339 ) { /* Lookahead or eval. */
10343 ret = reg_node(pRExC_state, LOGICAL);
10347 tail = reg(pRExC_state, 1, &flag, depth+1);
10348 if (flag & (RESTART_PASS1|NEED_UTF8)) {
10349 *flagp = flag & (RESTART_PASS1|NEED_UTF8);
10352 REGTAIL(pRExC_state, ret, tail);
10355 /* Fall through to ‘Unknown switch condition’ at the
10356 end of the if/else chain. */
10358 else if ( RExC_parse[0] == '<' /* (?(<NAME>)...) */
10359 || RExC_parse[0] == '\'' ) /* (?('NAME')...) */
10361 char ch = RExC_parse[0] == '<' ? '>' : '\'';
10362 char *name_start= RExC_parse++;
10364 SV *sv_dat=reg_scan_name(pRExC_state,
10365 SIZE_ONLY ? REG_RSN_RETURN_NULL : REG_RSN_RETURN_DATA);
10366 if (RExC_parse == name_start || *RExC_parse != ch)
10367 vFAIL2("Sequence (?(%c... not terminated",
10368 (ch == '>' ? '<' : ch));
10371 num = add_data( pRExC_state, STR_WITH_LEN("S"));
10372 RExC_rxi->data->data[num]=(void*)sv_dat;
10373 SvREFCNT_inc_simple_void(sv_dat);
10375 ret = reganode(pRExC_state,NGROUPP,num);
10376 goto insert_if_check_paren;
10378 else if (RExC_end - RExC_parse >= DEFINE_len
10379 && strnEQ(RExC_parse, "DEFINE", DEFINE_len))
10381 ret = reganode(pRExC_state,DEFINEP,0);
10382 RExC_parse += DEFINE_len;
10384 goto insert_if_check_paren;
10386 else if (RExC_parse[0] == 'R') {
10389 if (RExC_parse[0] >= '1' && RExC_parse[0] <= '9' ) {
10391 if (grok_atoUV(RExC_parse, &uv, &endptr)
10395 RExC_parse = (char*)endptr;
10397 /* else "Switch condition not recognized" below */
10398 } else if (RExC_parse[0] == '&') {
10401 sv_dat = reg_scan_name(pRExC_state,
10403 ? REG_RSN_RETURN_NULL
10404 : REG_RSN_RETURN_DATA);
10405 parno = sv_dat ? *((I32 *)SvPVX(sv_dat)) : 0;
10407 ret = reganode(pRExC_state,INSUBP,parno);
10408 goto insert_if_check_paren;
10410 else if (RExC_parse[0] >= '1' && RExC_parse[0] <= '9' ) {
10414 if (grok_atoUV(RExC_parse, &uv, &endptr)
10418 RExC_parse = (char*)endptr;
10421 vFAIL("panic: grok_atoUV returned FALSE");
10423 ret = reganode(pRExC_state, GROUPP, parno);
10425 insert_if_check_paren:
10426 if (UCHARAT(RExC_parse) != ')') {
10427 RExC_parse += UTF ? UTF8SKIP(RExC_parse) : 1;
10428 vFAIL("Switch condition not recognized");
10430 nextchar(pRExC_state);
10432 REGTAIL(pRExC_state, ret, reganode(pRExC_state, IFTHEN, 0));
10433 br = regbranch(pRExC_state, &flags, 1,depth+1);
10435 if (flags & (RESTART_PASS1|NEED_UTF8)) {
10436 *flagp = flags & (RESTART_PASS1|NEED_UTF8);
10439 FAIL2("panic: regbranch returned NULL, flags=%#"UVxf"",
10442 REGTAIL(pRExC_state, br, reganode(pRExC_state,
10444 c = UCHARAT(RExC_parse);
10445 nextchar(pRExC_state);
10446 if (flags&HASWIDTH)
10447 *flagp |= HASWIDTH;
10450 vFAIL("(?(DEFINE)....) does not allow branches");
10452 /* Fake one for optimizer. */
10453 lastbr = reganode(pRExC_state, IFTHEN, 0);
10455 if (!regbranch(pRExC_state, &flags, 1,depth+1)) {
10456 if (flags & (RESTART_PASS1|NEED_UTF8)) {
10457 *flagp = flags & (RESTART_PASS1|NEED_UTF8);
10460 FAIL2("panic: regbranch returned NULL, flags=%#"UVxf"",
10463 REGTAIL(pRExC_state, ret, lastbr);
10464 if (flags&HASWIDTH)
10465 *flagp |= HASWIDTH;
10466 c = UCHARAT(RExC_parse);
10467 nextchar(pRExC_state);
10472 if (RExC_parse>RExC_end)
10473 vFAIL("Switch (?(condition)... not terminated");
10475 vFAIL("Switch (?(condition)... contains too many branches");
10477 ender = reg_node(pRExC_state, TAIL);
10478 REGTAIL(pRExC_state, br, ender);
10480 REGTAIL(pRExC_state, lastbr, ender);
10481 REGTAIL(pRExC_state, NEXTOPER(NEXTOPER(lastbr)), ender);
10484 REGTAIL(pRExC_state, ret, ender);
10485 RExC_size++; /* XXX WHY do we need this?!!
10486 For large programs it seems to be required
10487 but I can't figure out why. -- dmq*/
10490 RExC_parse += UTF ? UTF8SKIP(RExC_parse) : 1;
10491 vFAIL("Unknown switch condition (?(...))");
10493 case '[': /* (?[ ... ]) */
10494 return handle_regex_sets(pRExC_state, NULL, flagp, depth,
10497 RExC_parse--; /* for vFAIL to print correctly */
10498 vFAIL("Sequence (? incomplete");
10500 default: /* e.g., (?i) */
10503 parse_lparen_question_flags(pRExC_state);
10504 if (UCHARAT(RExC_parse) != ':') {
10506 nextchar(pRExC_state);
10511 nextchar(pRExC_state);
10516 else if (!(RExC_flags & RXf_PMf_NOCAPTURE)) { /* (...) */
10521 ret = reganode(pRExC_state, OPEN, parno);
10523 if (!RExC_nestroot)
10524 RExC_nestroot = parno;
10525 if (RExC_seen & REG_RECURSE_SEEN
10526 && !RExC_open_parens[parno-1])
10528 DEBUG_OPTIMISE_MORE_r(PerlIO_printf(Perl_debug_log,
10529 "%*s%*s Setting open paren #%"IVdf" to %d\n",
10530 22, "| |", (int)(depth * 2 + 1), "",
10531 (IV)parno, REG_NODE_NUM(ret)));
10532 RExC_open_parens[parno-1]= ret;
10535 Set_Node_Length(ret, 1); /* MJD */
10536 Set_Node_Offset(ret, RExC_parse); /* MJD */
10539 /* with RXf_PMf_NOCAPTURE treat (...) as (?:...) */
10548 /* Pick up the branches, linking them together. */
10549 parse_start = RExC_parse; /* MJD */
10550 br = regbranch(pRExC_state, &flags, 1,depth+1);
10552 /* branch_len = (paren != 0); */
10555 if (flags & (RESTART_PASS1|NEED_UTF8)) {
10556 *flagp = flags & (RESTART_PASS1|NEED_UTF8);
10559 FAIL2("panic: regbranch returned NULL, flags=%#"UVxf"", (UV) flags);
10561 if (*RExC_parse == '|') {
10562 if (!SIZE_ONLY && RExC_extralen) {
10563 reginsert(pRExC_state, BRANCHJ, br, depth+1);
10566 reginsert(pRExC_state, BRANCH, br, depth+1);
10567 Set_Node_Length(br, paren != 0);
10568 Set_Node_Offset_To_R(br-RExC_emit_start, parse_start-RExC_start);
10572 RExC_extralen += 1; /* For BRANCHJ-BRANCH. */
10574 else if (paren == ':') {
10575 *flagp |= flags&SIMPLE;
10577 if (is_open) { /* Starts with OPEN. */
10578 REGTAIL(pRExC_state, ret, br); /* OPEN -> first. */
10580 else if (paren != '?') /* Not Conditional */
10582 *flagp |= flags & (SPSTART | HASWIDTH | POSTPONED);
10584 while (*RExC_parse == '|') {
10585 if (!SIZE_ONLY && RExC_extralen) {
10586 ender = reganode(pRExC_state, LONGJMP,0);
10588 /* Append to the previous. */
10589 REGTAIL(pRExC_state, NEXTOPER(NEXTOPER(lastbr)), ender);
10592 RExC_extralen += 2; /* Account for LONGJMP. */
10593 nextchar(pRExC_state);
10594 if (freeze_paren) {
10595 if (RExC_npar > after_freeze)
10596 after_freeze = RExC_npar;
10597 RExC_npar = freeze_paren;
10599 br = regbranch(pRExC_state, &flags, 0, depth+1);
10602 if (flags & (RESTART_PASS1|NEED_UTF8)) {
10603 *flagp = flags & (RESTART_PASS1|NEED_UTF8);
10606 FAIL2("panic: regbranch returned NULL, flags=%#"UVxf"", (UV) flags);
10608 REGTAIL(pRExC_state, lastbr, br); /* BRANCH -> BRANCH. */
10610 *flagp |= flags & (SPSTART | HASWIDTH | POSTPONED);
10613 if (have_branch || paren != ':') {
10614 /* Make a closing node, and hook it on the end. */
10617 ender = reg_node(pRExC_state, TAIL);
10620 ender = reganode(pRExC_state, CLOSE, parno);
10621 if (!SIZE_ONLY && RExC_seen & REG_RECURSE_SEEN) {
10622 DEBUG_OPTIMISE_MORE_r(PerlIO_printf(Perl_debug_log,
10623 "%*s%*s Setting close paren #%"IVdf" to %d\n",
10624 22, "| |", (int)(depth * 2 + 1), "", (IV)parno, REG_NODE_NUM(ender)));
10625 RExC_close_parens[parno-1]= ender;
10626 if (RExC_nestroot == parno)
10629 Set_Node_Offset(ender,RExC_parse+1); /* MJD */
10630 Set_Node_Length(ender,1); /* MJD */
10636 *flagp &= ~HASWIDTH;
10639 ender = reg_node(pRExC_state, SUCCEED);
10642 ender = reg_node(pRExC_state, END);
10644 assert(!RExC_opend); /* there can only be one! */
10645 RExC_opend = ender;
10649 DEBUG_PARSE_r(if (!SIZE_ONLY) {
10650 DEBUG_PARSE_MSG("lsbr");
10651 regprop(RExC_rx, RExC_mysv1, lastbr, NULL, pRExC_state);
10652 regprop(RExC_rx, RExC_mysv2, ender, NULL, pRExC_state);
10653 PerlIO_printf(Perl_debug_log, "~ tying lastbr %s (%"IVdf") to ender %s (%"IVdf") offset %"IVdf"\n",
10654 SvPV_nolen_const(RExC_mysv1),
10655 (IV)REG_NODE_NUM(lastbr),
10656 SvPV_nolen_const(RExC_mysv2),
10657 (IV)REG_NODE_NUM(ender),
10658 (IV)(ender - lastbr)
10661 REGTAIL(pRExC_state, lastbr, ender);
10663 if (have_branch && !SIZE_ONLY) {
10664 char is_nothing= 1;
10666 RExC_seen |= REG_TOP_LEVEL_BRANCHES_SEEN;
10668 /* Hook the tails of the branches to the closing node. */
10669 for (br = ret; br; br = regnext(br)) {
10670 const U8 op = PL_regkind[OP(br)];
10671 if (op == BRANCH) {
10672 REGTAIL_STUDY(pRExC_state, NEXTOPER(br), ender);
10673 if ( OP(NEXTOPER(br)) != NOTHING
10674 || regnext(NEXTOPER(br)) != ender)
10677 else if (op == BRANCHJ) {
10678 REGTAIL_STUDY(pRExC_state, NEXTOPER(NEXTOPER(br)), ender);
10679 /* for now we always disable this optimisation * /
10680 if ( OP(NEXTOPER(NEXTOPER(br))) != NOTHING
10681 || regnext(NEXTOPER(NEXTOPER(br))) != ender)
10687 br= PL_regkind[OP(ret)] != BRANCH ? regnext(ret) : ret;
10688 DEBUG_PARSE_r(if (!SIZE_ONLY) {
10689 DEBUG_PARSE_MSG("NADA");
10690 regprop(RExC_rx, RExC_mysv1, ret, NULL, pRExC_state);
10691 regprop(RExC_rx, RExC_mysv2, ender, NULL, pRExC_state);
10692 PerlIO_printf(Perl_debug_log, "~ converting ret %s (%"IVdf") to ender %s (%"IVdf") offset %"IVdf"\n",
10693 SvPV_nolen_const(RExC_mysv1),
10694 (IV)REG_NODE_NUM(ret),
10695 SvPV_nolen_const(RExC_mysv2),
10696 (IV)REG_NODE_NUM(ender),
10701 if (OP(ender) == TAIL) {
10706 for ( opt= br + 1; opt < ender ; opt++ )
10707 OP(opt)= OPTIMIZED;
10708 NEXT_OFF(br)= ender - br;
10716 static const char parens[] = "=!<,>";
10718 if (paren && (p = strchr(parens, paren))) {
10719 U8 node = ((p - parens) % 2) ? UNLESSM : IFMATCH;
10720 int flag = (p - parens) > 1;
10723 node = SUSPEND, flag = 0;
10724 reginsert(pRExC_state, node,ret, depth+1);
10725 Set_Node_Cur_Length(ret, parse_start);
10726 Set_Node_Offset(ret, parse_start + 1);
10728 REGTAIL_STUDY(pRExC_state, ret, reg_node(pRExC_state, TAIL));
10732 /* Check for proper termination. */
10734 /* restore original flags, but keep (?p) and, if we've changed from /d
10735 * rules to /u, keep the /u */
10736 RExC_flags = oregflags | (RExC_flags & RXf_PMf_KEEPCOPY);
10737 if (DEPENDS_SEMANTICS && RExC_uni_semantics) {
10738 set_regex_charset(&RExC_flags, REGEX_UNICODE_CHARSET);
10740 if (RExC_parse >= RExC_end || UCHARAT(RExC_parse) != ')') {
10741 RExC_parse = oregcomp_parse;
10742 vFAIL("Unmatched (");
10744 nextchar(pRExC_state);
10746 else if (!paren && RExC_parse < RExC_end) {
10747 if (*RExC_parse == ')') {
10749 vFAIL("Unmatched )");
10752 FAIL("Junk on end of regexp"); /* "Can't happen". */
10753 NOT_REACHED; /* NOTREACHED */
10756 if (RExC_in_lookbehind) {
10757 RExC_in_lookbehind--;
10759 if (after_freeze > RExC_npar)
10760 RExC_npar = after_freeze;
10765 - regbranch - one alternative of an | operator
10767 * Implements the concatenation operator.
10769 * Returns NULL, setting *flagp to RESTART_PASS1 if the sizing scan needs to be
10770 * restarted, or'd with NEED_UTF8 if the pattern needs to be upgraded to UTF-8
10773 S_regbranch(pTHX_ RExC_state_t *pRExC_state, I32 *flagp, I32 first, U32 depth)
10776 regnode *chain = NULL;
10778 I32 flags = 0, c = 0;
10779 GET_RE_DEBUG_FLAGS_DECL;
10781 PERL_ARGS_ASSERT_REGBRANCH;
10783 DEBUG_PARSE("brnc");
10788 if (!SIZE_ONLY && RExC_extralen)
10789 ret = reganode(pRExC_state, BRANCHJ,0);
10791 ret = reg_node(pRExC_state, BRANCH);
10792 Set_Node_Length(ret, 1);
10796 if (!first && SIZE_ONLY)
10797 RExC_extralen += 1; /* BRANCHJ */
10799 *flagp = WORST; /* Tentatively. */
10801 skip_to_be_ignored_text(pRExC_state, &RExC_parse,
10802 FALSE /* Don't force to /x */ );
10803 while (RExC_parse < RExC_end && *RExC_parse != '|' && *RExC_parse != ')') {
10804 flags &= ~TRYAGAIN;
10805 latest = regpiece(pRExC_state, &flags,depth+1);
10806 if (latest == NULL) {
10807 if (flags & TRYAGAIN)
10809 if (flags & (RESTART_PASS1|NEED_UTF8)) {
10810 *flagp = flags & (RESTART_PASS1|NEED_UTF8);
10813 FAIL2("panic: regpiece returned NULL, flags=%#"UVxf"", (UV) flags);
10815 else if (ret == NULL)
10817 *flagp |= flags&(HASWIDTH|POSTPONED);
10818 if (chain == NULL) /* First piece. */
10819 *flagp |= flags&SPSTART;
10821 /* FIXME adding one for every branch after the first is probably
10822 * excessive now we have TRIE support. (hv) */
10824 REGTAIL(pRExC_state, chain, latest);
10829 if (chain == NULL) { /* Loop ran zero times. */
10830 chain = reg_node(pRExC_state, NOTHING);
10835 *flagp |= flags&SIMPLE;
10842 - regpiece - something followed by possible [*+?]
10844 * Note that the branching code sequences used for ? and the general cases
10845 * of * and + are somewhat optimized: they use the same NOTHING node as
10846 * both the endmarker for their branch list and the body of the last branch.
10847 * It might seem that this node could be dispensed with entirely, but the
10848 * endmarker role is not redundant.
10850 * Returns NULL, setting *flagp to TRYAGAIN if regatom() returns NULL with
10852 * Returns NULL, setting *flagp to RESTART_PASS1 if the sizing scan needs to be
10853 * restarted, or'd with NEED_UTF8 if the pattern needs to be upgraded to UTF-8
10856 S_regpiece(pTHX_ RExC_state_t *pRExC_state, I32 *flagp, U32 depth)
10862 const char * const origparse = RExC_parse;
10864 I32 max = REG_INFTY;
10865 #ifdef RE_TRACK_PATTERN_OFFSETS
10868 const char *maxpos = NULL;
10871 /* Save the original in case we change the emitted regop to a FAIL. */
10872 regnode * const orig_emit = RExC_emit;
10874 GET_RE_DEBUG_FLAGS_DECL;
10876 PERL_ARGS_ASSERT_REGPIECE;
10878 DEBUG_PARSE("piec");
10880 ret = regatom(pRExC_state, &flags,depth+1);
10882 if (flags & (TRYAGAIN|RESTART_PASS1|NEED_UTF8))
10883 *flagp |= flags & (TRYAGAIN|RESTART_PASS1|NEED_UTF8);
10885 FAIL2("panic: regatom returned NULL, flags=%#"UVxf"", (UV) flags);
10891 if (op == '{' && regcurly(RExC_parse)) {
10893 #ifdef RE_TRACK_PATTERN_OFFSETS
10894 parse_start = RExC_parse; /* MJD */
10896 next = RExC_parse + 1;
10897 while (isDIGIT(*next) || *next == ',') {
10898 if (*next == ',') {
10906 if (*next == '}') { /* got one */
10907 const char* endptr;
10911 if (isDIGIT(*RExC_parse)) {
10912 if (!grok_atoUV(RExC_parse, &uv, &endptr))
10913 vFAIL("Invalid quantifier in {,}");
10914 if (uv >= REG_INFTY)
10915 vFAIL2("Quantifier in {,} bigger than %d", REG_INFTY - 1);
10920 if (*maxpos == ',')
10923 maxpos = RExC_parse;
10924 if (isDIGIT(*maxpos)) {
10925 if (!grok_atoUV(maxpos, &uv, &endptr))
10926 vFAIL("Invalid quantifier in {,}");
10927 if (uv >= REG_INFTY)
10928 vFAIL2("Quantifier in {,} bigger than %d", REG_INFTY - 1);
10931 max = REG_INFTY; /* meaning "infinity" */
10934 nextchar(pRExC_state);
10935 if (max < min) { /* If can't match, warn and optimize to fail
10939 /* We can't back off the size because we have to reserve
10940 * enough space for all the things we are about to throw
10941 * away, but we can shrink it by the ammount we are about
10942 * to re-use here */
10943 RExC_size = PREVOPER(RExC_size) - regarglen[(U8)OPFAIL];
10946 ckWARNreg(RExC_parse, "Quantifier {n,m} with n > m can't match");
10947 RExC_emit = orig_emit;
10949 ret = reganode(pRExC_state, OPFAIL, 0);
10952 else if (min == max && RExC_parse < RExC_end && *RExC_parse == '?')
10955 ckWARN2reg(RExC_parse + 1,
10956 "Useless use of greediness modifier '%c'",
10959 /* Absorb the modifier, so later code doesn't see nor use it */
10960 nextchar(pRExC_state);
10964 if ((flags&SIMPLE)) {
10965 if (min == 0 && max == REG_INFTY) {
10966 reginsert(pRExC_state, STAR, ret, depth+1);
10969 RExC_seen |= REG_UNBOUNDED_QUANTIFIER_SEEN;
10972 if (min == 1 && max == REG_INFTY) {
10973 reginsert(pRExC_state, PLUS, ret, depth+1);
10976 RExC_seen |= REG_UNBOUNDED_QUANTIFIER_SEEN;
10979 MARK_NAUGHTY_EXP(2, 2);
10980 reginsert(pRExC_state, CURLY, ret, depth+1);
10981 Set_Node_Offset(ret, parse_start+1); /* MJD */
10982 Set_Node_Cur_Length(ret, parse_start);
10985 regnode * const w = reg_node(pRExC_state, WHILEM);
10988 REGTAIL(pRExC_state, ret, w);
10989 if (!SIZE_ONLY && RExC_extralen) {
10990 reginsert(pRExC_state, LONGJMP,ret, depth+1);
10991 reginsert(pRExC_state, NOTHING,ret, depth+1);
10992 NEXT_OFF(ret) = 3; /* Go over LONGJMP. */
10994 reginsert(pRExC_state, CURLYX,ret, depth+1);
10996 Set_Node_Offset(ret, parse_start+1);
10997 Set_Node_Length(ret,
10998 op == '{' ? (RExC_parse - parse_start) : 1);
11000 if (!SIZE_ONLY && RExC_extralen)
11001 NEXT_OFF(ret) = 3; /* Go over NOTHING to LONGJMP. */
11002 REGTAIL(pRExC_state, ret, reg_node(pRExC_state, NOTHING));
11004 RExC_whilem_seen++, RExC_extralen += 3;
11005 MARK_NAUGHTY_EXP(1, 4); /* compound interest */
11012 *flagp |= HASWIDTH;
11014 ARG1_SET(ret, (U16)min);
11015 ARG2_SET(ret, (U16)max);
11017 if (max == REG_INFTY)
11018 RExC_seen |= REG_UNBOUNDED_QUANTIFIER_SEEN;
11024 if (!ISMULT1(op)) {
11029 #if 0 /* Now runtime fix should be reliable. */
11031 /* if this is reinstated, don't forget to put this back into perldiag:
11033 =item Regexp *+ operand could be empty at {#} in regex m/%s/
11035 (F) The part of the regexp subject to either the * or + quantifier
11036 could match an empty string. The {#} shows in the regular
11037 expression about where the problem was discovered.
11041 if (!(flags&HASWIDTH) && op != '?')
11042 vFAIL("Regexp *+ operand could be empty");
11045 #ifdef RE_TRACK_PATTERN_OFFSETS
11046 parse_start = RExC_parse;
11048 nextchar(pRExC_state);
11050 *flagp = (op != '+') ? (WORST|SPSTART|HASWIDTH) : (WORST|HASWIDTH);
11056 else if (op == '+') {
11060 else if (op == '?') {
11065 if (!SIZE_ONLY && !(flags&(HASWIDTH|POSTPONED)) && max > REG_INFTY/3) {
11066 SAVEFREESV(RExC_rx_sv); /* in case of fatal warnings */
11067 ckWARN2reg(RExC_parse,
11068 "%"UTF8f" matches null string many times",
11069 UTF8fARG(UTF, (RExC_parse >= origparse
11070 ? RExC_parse - origparse
11073 (void)ReREFCNT_inc(RExC_rx_sv);
11076 if (RExC_parse < RExC_end && *RExC_parse == '?') {
11077 nextchar(pRExC_state);
11078 reginsert(pRExC_state, MINMOD, ret, depth+1);
11079 REGTAIL(pRExC_state, ret, ret + NODE_STEP_REGNODE);
11082 if (RExC_parse < RExC_end && *RExC_parse == '+') {
11084 nextchar(pRExC_state);
11085 ender = reg_node(pRExC_state, SUCCEED);
11086 REGTAIL(pRExC_state, ret, ender);
11087 reginsert(pRExC_state, SUSPEND, ret, depth+1);
11089 ender = reg_node(pRExC_state, TAIL);
11090 REGTAIL(pRExC_state, ret, ender);
11093 if (RExC_parse < RExC_end && ISMULT2(RExC_parse)) {
11095 vFAIL("Nested quantifiers");
11102 S_grok_bslash_N(pTHX_ RExC_state_t *pRExC_state,
11110 /* This routine teases apart the various meanings of \N and returns
11111 * accordingly. The input parameters constrain which meaning(s) is/are valid
11112 * in the current context.
11114 * Exactly one of <node_p> and <code_point_p> must be non-NULL.
11116 * If <code_point_p> is not NULL, the context is expecting the result to be a
11117 * single code point. If this \N instance turns out to a single code point,
11118 * the function returns TRUE and sets *code_point_p to that code point.
11120 * If <node_p> is not NULL, the context is expecting the result to be one of
11121 * the things representable by a regnode. If this \N instance turns out to be
11122 * one such, the function generates the regnode, returns TRUE and sets *node_p
11123 * to point to that regnode.
11125 * If this instance of \N isn't legal in any context, this function will
11126 * generate a fatal error and not return.
11128 * On input, RExC_parse should point to the first char following the \N at the
11129 * time of the call. On successful return, RExC_parse will have been updated
11130 * to point to just after the sequence identified by this routine. Also
11131 * *flagp has been updated as needed.
11133 * When there is some problem with the current context and this \N instance,
11134 * the function returns FALSE, without advancing RExC_parse, nor setting
11135 * *node_p, nor *code_point_p, nor *flagp.
11137 * If <cp_count> is not NULL, the caller wants to know the length (in code
11138 * points) that this \N sequence matches. This is set even if the function
11139 * returns FALSE, as detailed below.
11141 * There are 5 possibilities here, as detailed in the next 5 paragraphs.
11143 * Probably the most common case is for the \N to specify a single code point.
11144 * *cp_count will be set to 1, and *code_point_p will be set to that code
11147 * Another possibility is for the input to be an empty \N{}, which for
11148 * backwards compatibility we accept. *cp_count will be set to 0. *node_p
11149 * will be set to a generated NOTHING node.
11151 * Still another possibility is for the \N to mean [^\n]. *cp_count will be
11152 * set to 0. *node_p will be set to a generated REG_ANY node.
11154 * The fourth possibility is that \N resolves to a sequence of more than one
11155 * code points. *cp_count will be set to the number of code points in the
11156 * sequence. *node_p * will be set to a generated node returned by this
11157 * function calling S_reg().
11159 * The final possibility is that it is premature to be calling this function;
11160 * that pass1 needs to be restarted. This can happen when this changes from
11161 * /d to /u rules, or when the pattern needs to be upgraded to UTF-8. The
11162 * latter occurs only when the fourth possibility would otherwise be in
11163 * effect, and is because one of those code points requires the pattern to be
11164 * recompiled as UTF-8. The function returns FALSE, and sets the
11165 * RESTART_PASS1 and NEED_UTF8 flags in *flagp, as appropriate. When this
11166 * happens, the caller needs to desist from continuing parsing, and return
11167 * this information to its caller. This is not set for when there is only one
11168 * code point, as this can be called as part of an ANYOF node, and they can
11169 * store above-Latin1 code points without the pattern having to be in UTF-8.
11171 * For non-single-quoted regexes, the tokenizer has resolved character and
11172 * sequence names inside \N{...} into their Unicode values, normalizing the
11173 * result into what we should see here: '\N{U+c1.c2...}', where c1... are the
11174 * hex-represented code points in the sequence. This is done there because
11175 * the names can vary based on what charnames pragma is in scope at the time,
11176 * so we need a way to take a snapshot of what they resolve to at the time of
11177 * the original parse. [perl #56444].
11179 * That parsing is skipped for single-quoted regexes, so we may here get
11180 * '\N{NAME}'. This is a fatal error. These names have to be resolved by the
11181 * parser. But if the single-quoted regex is something like '\N{U+41}', that
11182 * is legal and handled here. The code point is Unicode, and has to be
11183 * translated into the native character set for non-ASCII platforms.
11186 char * endbrace; /* points to '}' following the name */
11187 char *endchar; /* Points to '.' or '}' ending cur char in the input
11189 char* p = RExC_parse; /* Temporary */
11191 GET_RE_DEBUG_FLAGS_DECL;
11193 PERL_ARGS_ASSERT_GROK_BSLASH_N;
11195 GET_RE_DEBUG_FLAGS;
11197 assert(cBOOL(node_p) ^ cBOOL(code_point_p)); /* Exactly one should be set */
11198 assert(! (node_p && cp_count)); /* At most 1 should be set */
11200 if (cp_count) { /* Initialize return for the most common case */
11204 /* The [^\n] meaning of \N ignores spaces and comments under the /x
11205 * modifier. The other meanings do not, so use a temporary until we find
11206 * out which we are being called with */
11207 skip_to_be_ignored_text(pRExC_state, &p,
11208 FALSE /* Don't force to /x */ );
11210 /* Disambiguate between \N meaning a named character versus \N meaning
11211 * [^\n]. The latter is assumed when the {...} following the \N is a legal
11212 * quantifier, or there is no '{' at all */
11213 if (*p != '{' || regcurly(p)) {
11223 *node_p = reg_node(pRExC_state, REG_ANY);
11224 *flagp |= HASWIDTH|SIMPLE;
11226 Set_Node_Length(*node_p, 1); /* MJD */
11230 /* Here, we have decided it should be a named character or sequence */
11232 /* The test above made sure that the next real character is a '{', but
11233 * under the /x modifier, it could be separated by space (or a comment and
11234 * \n) and this is not allowed (for consistency with \x{...} and the
11235 * tokenizer handling of \N{NAME}). */
11236 if (*RExC_parse != '{') {
11237 vFAIL("Missing braces on \\N{}");
11240 RExC_parse++; /* Skip past the '{' */
11242 if (! (endbrace = strchr(RExC_parse, '}')) /* no trailing brace */
11243 || ! (endbrace == RExC_parse /* nothing between the {} */
11244 || (endbrace - RExC_parse >= 2 /* U+ (bad hex is checked... */
11245 && strnEQ(RExC_parse, "U+", 2)))) /* ... below for a better
11248 if (endbrace) RExC_parse = endbrace; /* position msg's '<--HERE' */
11249 vFAIL("\\N{NAME} must be resolved by the lexer");
11252 REQUIRE_UNI_RULES(flagp, FALSE); /* Unicode named chars imply Unicode
11255 if (endbrace == RExC_parse) { /* empty: \N{} */
11259 nextchar(pRExC_state);
11264 *node_p = reg_node(pRExC_state,NOTHING);
11268 RExC_parse += 2; /* Skip past the 'U+' */
11270 endchar = RExC_parse + strcspn(RExC_parse, ".}");
11272 /* Code points are separated by dots. If none, there is only one code
11273 * point, and is terminated by the brace */
11275 if (endchar >= endbrace) {
11276 STRLEN length_of_hex;
11277 I32 grok_hex_flags;
11279 /* Here, exactly one code point. If that isn't what is wanted, fail */
11280 if (! code_point_p) {
11285 /* Convert code point from hex */
11286 length_of_hex = (STRLEN)(endchar - RExC_parse);
11287 grok_hex_flags = PERL_SCAN_ALLOW_UNDERSCORES
11288 | PERL_SCAN_DISALLOW_PREFIX
11290 /* No errors in the first pass (See [perl
11291 * #122671].) We let the code below find the
11292 * errors when there are multiple chars. */
11294 ? PERL_SCAN_SILENT_ILLDIGIT
11297 /* This routine is the one place where both single- and double-quotish
11298 * \N{U+xxxx} are evaluated. The value is a Unicode code point which
11299 * must be converted to native. */
11300 *code_point_p = UNI_TO_NATIVE(grok_hex(RExC_parse,
11305 /* The tokenizer should have guaranteed validity, but it's possible to
11306 * bypass it by using single quoting, so check. Don't do the check
11307 * here when there are multiple chars; we do it below anyway. */
11308 if (length_of_hex == 0
11309 || length_of_hex != (STRLEN)(endchar - RExC_parse) )
11311 RExC_parse += length_of_hex; /* Includes all the valid */
11312 RExC_parse += (RExC_orig_utf8) /* point to after 1st invalid */
11313 ? UTF8SKIP(RExC_parse)
11315 /* Guard against malformed utf8 */
11316 if (RExC_parse >= endchar) {
11317 RExC_parse = endchar;
11319 vFAIL("Invalid hexadecimal number in \\N{U+...}");
11322 RExC_parse = endbrace + 1;
11325 else { /* Is a multiple character sequence */
11326 SV * substitute_parse;
11328 char *orig_end = RExC_end;
11331 /* Count the code points, if desired, in the sequence */
11334 while (RExC_parse < endbrace) {
11335 /* Point to the beginning of the next character in the sequence. */
11336 RExC_parse = endchar + 1;
11337 endchar = RExC_parse + strcspn(RExC_parse, ".}");
11342 /* Fail if caller doesn't want to handle a multi-code-point sequence.
11343 * But don't backup up the pointer if the caller want to know how many
11344 * code points there are (they can then handle things) */
11352 /* What is done here is to convert this to a sub-pattern of the form
11353 * \x{char1}\x{char2}... and then call reg recursively to parse it
11354 * (enclosing in "(?: ... )" ). That way, it retains its atomicness,
11355 * while not having to worry about special handling that some code
11356 * points may have. */
11358 substitute_parse = newSVpvs("?:");
11360 while (RExC_parse < endbrace) {
11362 /* Convert to notation the rest of the code understands */
11363 sv_catpv(substitute_parse, "\\x{");
11364 sv_catpvn(substitute_parse, RExC_parse, endchar - RExC_parse);
11365 sv_catpv(substitute_parse, "}");
11367 /* Point to the beginning of the next character in the sequence. */
11368 RExC_parse = endchar + 1;
11369 endchar = RExC_parse + strcspn(RExC_parse, ".}");
11372 sv_catpv(substitute_parse, ")");
11374 RExC_parse = SvPV(substitute_parse, len);
11376 /* Don't allow empty number */
11377 if (len < (STRLEN) 8) {
11378 RExC_parse = endbrace;
11379 vFAIL("Invalid hexadecimal number in \\N{U+...}");
11381 RExC_end = RExC_parse + len;
11383 /* The values are Unicode, and therefore not subject to recoding, but
11384 * have to be converted to native on a non-Unicode (meaning non-ASCII)
11386 RExC_override_recoding = 1;
11388 RExC_recode_x_to_native = 1;
11392 if (!(*node_p = reg(pRExC_state, 1, &flags, depth+1))) {
11393 if (flags & (RESTART_PASS1|NEED_UTF8)) {
11394 *flagp = flags & (RESTART_PASS1|NEED_UTF8);
11397 FAIL2("panic: reg returned NULL to grok_bslash_N, flags=%#"UVxf"",
11400 *flagp |= flags&(HASWIDTH|SPSTART|SIMPLE|POSTPONED);
11403 /* Restore the saved values */
11404 RExC_parse = endbrace;
11405 RExC_end = orig_end;
11406 RExC_override_recoding = 0;
11408 RExC_recode_x_to_native = 0;
11411 SvREFCNT_dec_NN(substitute_parse);
11412 nextchar(pRExC_state);
11422 * It returns the code point in utf8 for the value in *encp.
11423 * value: a code value in the source encoding
11424 * encp: a pointer to an Encode object
11426 * If the result from Encode is not a single character,
11427 * it returns U+FFFD (Replacement character) and sets *encp to NULL.
11430 S_reg_recode(pTHX_ const char value, SV **encp)
11433 SV * const sv = newSVpvn_flags(&value, numlen, SVs_TEMP);
11434 const char * const s = *encp ? sv_recode_to_utf8(sv, *encp) : SvPVX(sv);
11435 const STRLEN newlen = SvCUR(sv);
11436 UV uv = UNICODE_REPLACEMENT;
11438 PERL_ARGS_ASSERT_REG_RECODE;
11442 ? utf8n_to_uvchr((U8*)s, newlen, &numlen, UTF8_ALLOW_DEFAULT)
11445 if (!newlen || numlen != newlen) {
11446 uv = UNICODE_REPLACEMENT;
11452 PERL_STATIC_INLINE U8
11453 S_compute_EXACTish(RExC_state_t *pRExC_state)
11457 PERL_ARGS_ASSERT_COMPUTE_EXACTISH;
11465 op = get_regex_charset(RExC_flags);
11466 if (op >= REGEX_ASCII_RESTRICTED_CHARSET) {
11467 op--; /* /a is same as /u, and map /aa's offset to what /a's would have
11468 been, so there is no hole */
11471 return op + EXACTF;
11474 PERL_STATIC_INLINE void
11475 S_alloc_maybe_populate_EXACT(pTHX_ RExC_state_t *pRExC_state,
11476 regnode *node, I32* flagp, STRLEN len, UV code_point,
11479 /* This knows the details about sizing an EXACTish node, setting flags for
11480 * it (by setting <*flagp>, and potentially populating it with a single
11483 * If <len> (the length in bytes) is non-zero, this function assumes that
11484 * the node has already been populated, and just does the sizing. In this
11485 * case <code_point> should be the final code point that has already been
11486 * placed into the node. This value will be ignored except that under some
11487 * circumstances <*flagp> is set based on it.
11489 * If <len> is zero, the function assumes that the node is to contain only
11490 * the single character given by <code_point> and calculates what <len>
11491 * should be. In pass 1, it sizes the node appropriately. In pass 2, it
11492 * additionally will populate the node's STRING with <code_point> or its
11495 * In both cases <*flagp> is appropriately set
11497 * It knows that under FOLD, the Latin Sharp S and UTF characters above
11498 * 255, must be folded (the former only when the rules indicate it can
11501 * When it does the populating, it looks at the flag 'downgradable'. If
11502 * true with a node that folds, it checks if the single code point
11503 * participates in a fold, and if not downgrades the node to an EXACT.
11504 * This helps the optimizer */
11506 bool len_passed_in = cBOOL(len != 0);
11507 U8 character[UTF8_MAXBYTES_CASE+1];
11509 PERL_ARGS_ASSERT_ALLOC_MAYBE_POPULATE_EXACT;
11511 /* Don't bother to check for downgrading in PASS1, as it doesn't make any
11512 * sizing difference, and is extra work that is thrown away */
11513 if (downgradable && ! PASS2) {
11514 downgradable = FALSE;
11517 if (! len_passed_in) {
11519 if (UVCHR_IS_INVARIANT(code_point)) {
11520 if (LOC || ! FOLD) { /* /l defers folding until runtime */
11521 *character = (U8) code_point;
11523 else { /* Here is /i and not /l. (toFOLD() is defined on just
11524 ASCII, which isn't the same thing as INVARIANT on
11525 EBCDIC, but it works there, as the extra invariants
11526 fold to themselves) */
11527 *character = toFOLD((U8) code_point);
11529 /* We can downgrade to an EXACT node if this character
11530 * isn't a folding one. Note that this assumes that
11531 * nothing above Latin1 folds to some other invariant than
11532 * one of these alphabetics; otherwise we would also have
11534 * && (! HAS_NONLATIN1_FOLD_CLOSURE(code_point)
11535 * || ASCII_FOLD_RESTRICTED))
11537 if (downgradable && PL_fold[code_point] == code_point) {
11543 else if (FOLD && (! LOC
11544 || ! is_PROBLEMATIC_LOCALE_FOLD_cp(code_point)))
11545 { /* Folding, and ok to do so now */
11546 UV folded = _to_uni_fold_flags(
11550 FOLD_FLAGS_FULL | ((ASCII_FOLD_RESTRICTED)
11551 ? FOLD_FLAGS_NOMIX_ASCII
11554 && folded == code_point /* This quickly rules out many
11555 cases, avoiding the
11556 _invlist_contains_cp() overhead
11558 && ! _invlist_contains_cp(PL_utf8_foldable, code_point))
11565 else if (code_point <= MAX_UTF8_TWO_BYTE) {
11567 /* Not folding this cp, and can output it directly */
11568 *character = UTF8_TWO_BYTE_HI(code_point);
11569 *(character + 1) = UTF8_TWO_BYTE_LO(code_point);
11573 uvchr_to_utf8( character, code_point);
11574 len = UTF8SKIP(character);
11576 } /* Else pattern isn't UTF8. */
11578 *character = (U8) code_point;
11580 } /* Else is folded non-UTF8 */
11581 #if UNICODE_MAJOR_VERSION > 3 /* no multifolds in early Unicode */ \
11582 || (UNICODE_MAJOR_VERSION == 3 && ( UNICODE_DOT_VERSION > 0) \
11583 || UNICODE_DOT_DOT_VERSION > 0)
11584 else if (LIKELY(code_point != LATIN_SMALL_LETTER_SHARP_S)) {
11588 /* We don't fold any non-UTF8 except possibly the Sharp s (see
11589 * comments at join_exact()); */
11590 *character = (U8) code_point;
11593 /* Can turn into an EXACT node if we know the fold at compile time,
11594 * and it folds to itself and doesn't particpate in other folds */
11597 && PL_fold_latin1[code_point] == code_point
11598 && (! HAS_NONLATIN1_FOLD_CLOSURE(code_point)
11599 || (isASCII(code_point) && ASCII_FOLD_RESTRICTED)))
11603 } /* else is Sharp s. May need to fold it */
11604 else if (AT_LEAST_UNI_SEMANTICS && ! ASCII_FOLD_RESTRICTED) {
11606 *(character + 1) = 's';
11610 *character = LATIN_SMALL_LETTER_SHARP_S;
11616 RExC_size += STR_SZ(len);
11619 RExC_emit += STR_SZ(len);
11620 STR_LEN(node) = len;
11621 if (! len_passed_in) {
11622 Copy((char *) character, STRING(node), len, char);
11626 *flagp |= HASWIDTH;
11628 /* A single character node is SIMPLE, except for the special-cased SHARP S
11630 if ((len == 1 || (UTF && len == UVCHR_SKIP(code_point)))
11631 #if UNICODE_MAJOR_VERSION > 3 /* no multifolds in early Unicode */ \
11632 || (UNICODE_MAJOR_VERSION == 3 && ( UNICODE_DOT_VERSION > 0) \
11633 || UNICODE_DOT_DOT_VERSION > 0)
11634 && ( code_point != LATIN_SMALL_LETTER_SHARP_S
11635 || ! FOLD || ! DEPENDS_SEMANTICS)
11641 /* The OP may not be well defined in PASS1 */
11642 if (PASS2 && OP(node) == EXACTFL) {
11643 RExC_contains_locale = 1;
11648 /* Parse backref decimal value, unless it's too big to sensibly be a backref,
11649 * in which case return I32_MAX (rather than possibly 32-bit wrapping) */
11652 S_backref_value(char *p)
11654 const char* endptr;
11656 if (grok_atoUV(p, &val, &endptr) && val <= I32_MAX)
11663 - regatom - the lowest level
11665 Try to identify anything special at the start of the pattern. If there
11666 is, then handle it as required. This may involve generating a single regop,
11667 such as for an assertion; or it may involve recursing, such as to
11668 handle a () structure.
11670 If the string doesn't start with something special then we gobble up
11671 as much literal text as we can.
11673 Once we have been able to handle whatever type of thing started the
11674 sequence, we return.
11676 Note: we have to be careful with escapes, as they can be both literal
11677 and special, and in the case of \10 and friends, context determines which.
11679 A summary of the code structure is:
11681 switch (first_byte) {
11682 cases for each special:
11683 handle this special;
11686 switch (2nd byte) {
11687 cases for each unambiguous special:
11688 handle this special;
11690 cases for each ambigous special/literal:
11692 if (special) handle here
11694 default: // unambiguously literal:
11697 default: // is a literal char
11700 create EXACTish node for literal;
11701 while (more input and node isn't full) {
11702 switch (input_byte) {
11703 cases for each special;
11704 make sure parse pointer is set so that the next call to
11705 regatom will see this special first
11706 goto loopdone; // EXACTish node terminated by prev. char
11708 append char to EXACTISH node;
11710 get next input byte;
11714 return the generated node;
11716 Specifically there are two separate switches for handling
11717 escape sequences, with the one for handling literal escapes requiring
11718 a dummy entry for all of the special escapes that are actually handled
11721 Returns NULL, setting *flagp to TRYAGAIN if reg() returns NULL with
11723 Returns NULL, setting *flagp to RESTART_PASS1 if the sizing scan needs to be
11724 restarted, or'd with NEED_UTF8 if the pattern needs to be upgraded to UTF-8
11725 Otherwise does not return NULL.
11729 S_regatom(pTHX_ RExC_state_t *pRExC_state, I32 *flagp, U32 depth)
11731 regnode *ret = NULL;
11738 GET_RE_DEBUG_FLAGS_DECL;
11740 *flagp = WORST; /* Tentatively. */
11742 DEBUG_PARSE("atom");
11744 PERL_ARGS_ASSERT_REGATOM;
11747 parse_start = RExC_parse;
11748 switch ((U8)*RExC_parse) {
11750 RExC_seen_zerolen++;
11751 nextchar(pRExC_state);
11752 if (RExC_flags & RXf_PMf_MULTILINE)
11753 ret = reg_node(pRExC_state, MBOL);
11755 ret = reg_node(pRExC_state, SBOL);
11756 Set_Node_Length(ret, 1); /* MJD */
11759 nextchar(pRExC_state);
11761 RExC_seen_zerolen++;
11762 if (RExC_flags & RXf_PMf_MULTILINE)
11763 ret = reg_node(pRExC_state, MEOL);
11765 ret = reg_node(pRExC_state, SEOL);
11766 Set_Node_Length(ret, 1); /* MJD */
11769 nextchar(pRExC_state);
11770 if (RExC_flags & RXf_PMf_SINGLELINE)
11771 ret = reg_node(pRExC_state, SANY);
11773 ret = reg_node(pRExC_state, REG_ANY);
11774 *flagp |= HASWIDTH|SIMPLE;
11776 Set_Node_Length(ret, 1); /* MJD */
11780 char * const oregcomp_parse = ++RExC_parse;
11781 ret = regclass(pRExC_state, flagp,depth+1,
11782 FALSE, /* means parse the whole char class */
11783 TRUE, /* allow multi-char folds */
11784 FALSE, /* don't silence non-portable warnings. */
11785 (bool) RExC_strict,
11786 TRUE, /* Allow an optimized regnode result */
11789 if (*flagp & (RESTART_PASS1|NEED_UTF8))
11791 FAIL2("panic: regclass returned NULL to regatom, flags=%#"UVxf"",
11794 if (*RExC_parse != ']') {
11795 RExC_parse = oregcomp_parse;
11796 vFAIL("Unmatched [");
11798 nextchar(pRExC_state);
11799 Set_Node_Length(ret, RExC_parse - oregcomp_parse + 1); /* MJD */
11803 nextchar(pRExC_state);
11804 ret = reg(pRExC_state, 2, &flags,depth+1);
11806 if (flags & TRYAGAIN) {
11807 if (RExC_parse == RExC_end) {
11808 /* Make parent create an empty node if needed. */
11809 *flagp |= TRYAGAIN;
11814 if (flags & (RESTART_PASS1|NEED_UTF8)) {
11815 *flagp = flags & (RESTART_PASS1|NEED_UTF8);
11818 FAIL2("panic: reg returned NULL to regatom, flags=%#"UVxf"",
11821 *flagp |= flags&(HASWIDTH|SPSTART|SIMPLE|POSTPONED);
11825 if (flags & TRYAGAIN) {
11826 *flagp |= TRYAGAIN;
11829 vFAIL("Internal urp");
11830 /* Supposed to be caught earlier. */
11836 vFAIL("Quantifier follows nothing");
11841 This switch handles escape sequences that resolve to some kind
11842 of special regop and not to literal text. Escape sequnces that
11843 resolve to literal text are handled below in the switch marked
11846 Every entry in this switch *must* have a corresponding entry
11847 in the literal escape switch. However, the opposite is not
11848 required, as the default for this switch is to jump to the
11849 literal text handling code.
11851 switch ((U8)*++RExC_parse) {
11852 /* Special Escapes */
11854 RExC_seen_zerolen++;
11855 ret = reg_node(pRExC_state, SBOL);
11856 /* SBOL is shared with /^/ so we set the flags so we can tell
11857 * /\A/ from /^/ in split. We check ret because first pass we
11858 * have no regop struct to set the flags on. */
11862 goto finish_meta_pat;
11864 ret = reg_node(pRExC_state, GPOS);
11865 RExC_seen |= REG_GPOS_SEEN;
11867 goto finish_meta_pat;
11869 RExC_seen_zerolen++;
11870 ret = reg_node(pRExC_state, KEEPS);
11872 /* XXX:dmq : disabling in-place substitution seems to
11873 * be necessary here to avoid cases of memory corruption, as
11874 * with: C<$_="x" x 80; s/x\K/y/> -- rgs
11876 RExC_seen |= REG_LOOKBEHIND_SEEN;
11877 goto finish_meta_pat;
11879 ret = reg_node(pRExC_state, SEOL);
11881 RExC_seen_zerolen++; /* Do not optimize RE away */
11882 goto finish_meta_pat;
11884 ret = reg_node(pRExC_state, EOS);
11886 RExC_seen_zerolen++; /* Do not optimize RE away */
11887 goto finish_meta_pat;
11889 vFAIL("\\C no longer supported");
11891 ret = reg_node(pRExC_state, CLUMP);
11892 *flagp |= HASWIDTH;
11893 goto finish_meta_pat;
11899 arg = ANYOF_WORDCHAR;
11907 regex_charset charset = get_regex_charset(RExC_flags);
11909 RExC_seen_zerolen++;
11910 RExC_seen |= REG_LOOKBEHIND_SEEN;
11911 op = BOUND + charset;
11913 if (op == BOUNDL) {
11914 RExC_contains_locale = 1;
11917 ret = reg_node(pRExC_state, op);
11919 if (*(RExC_parse + 1) != '{') {
11920 FLAGS(ret) = TRADITIONAL_BOUND;
11921 if (PASS2 && op > BOUNDA) { /* /aa is same as /a */
11927 char name = *RExC_parse;
11930 endbrace = strchr(RExC_parse, '}');
11933 vFAIL2("Missing right brace on \\%c{}", name);
11935 /* XXX Need to decide whether to take spaces or not. Should be
11936 * consistent with \p{}, but that currently is SPACE, which
11937 * means vertical too, which seems wrong
11938 * while (isBLANK(*RExC_parse)) {
11941 if (endbrace == RExC_parse) {
11942 RExC_parse++; /* After the '}' */
11943 vFAIL2("Empty \\%c{}", name);
11945 length = endbrace - RExC_parse;
11946 /*while (isBLANK(*(RExC_parse + length - 1))) {
11949 switch (*RExC_parse) {
11952 && (length != 3 || strnNE(RExC_parse + 1, "cb", 2)))
11954 goto bad_bound_type;
11956 FLAGS(ret) = GCB_BOUND;
11959 if (length != 2 || *(RExC_parse + 1) != 'b') {
11960 goto bad_bound_type;
11962 FLAGS(ret) = SB_BOUND;
11965 if (length != 2 || *(RExC_parse + 1) != 'b') {
11966 goto bad_bound_type;
11968 FLAGS(ret) = WB_BOUND;
11972 RExC_parse = endbrace;
11974 "'%"UTF8f"' is an unknown bound type",
11975 UTF8fARG(UTF, length, endbrace - length));
11976 NOT_REACHED; /*NOTREACHED*/
11978 RExC_parse = endbrace;
11979 REQUIRE_UNI_RULES(flagp, NULL);
11981 if (PASS2 && op >= BOUNDA) { /* /aa is same as /a */
11985 /* Don't have to worry about UTF-8, in this message because
11986 * to get here the contents of the \b must be ASCII */
11987 ckWARN4reg(RExC_parse + 1, /* Include the '}' in msg */
11988 "Using /u for '%.*s' instead of /%s",
11990 endbrace - length + 1,
11991 (charset == REGEX_ASCII_RESTRICTED_CHARSET)
11992 ? ASCII_RESTRICT_PAT_MODS
11993 : ASCII_MORE_RESTRICT_PAT_MODS);
11997 if (PASS2 && invert) {
11998 OP(ret) += NBOUND - BOUND;
12000 goto finish_meta_pat;
12008 if (! DEPENDS_SEMANTICS) {
12012 /* \d doesn't have any matches in the upper Latin1 range, hence /d
12013 * is equivalent to /u. Changing to /u saves some branches at
12016 goto join_posix_op_known;
12019 ret = reg_node(pRExC_state, LNBREAK);
12020 *flagp |= HASWIDTH|SIMPLE;
12021 goto finish_meta_pat;
12029 goto join_posix_op_known;
12035 arg = ANYOF_VERTWS;
12037 goto join_posix_op_known;
12047 op = POSIXD + get_regex_charset(RExC_flags);
12048 if (op > POSIXA) { /* /aa is same as /a */
12051 else if (op == POSIXL) {
12052 RExC_contains_locale = 1;
12055 join_posix_op_known:
12058 op += NPOSIXD - POSIXD;
12061 ret = reg_node(pRExC_state, op);
12063 FLAGS(ret) = namedclass_to_classnum(arg);
12066 *flagp |= HASWIDTH|SIMPLE;
12070 nextchar(pRExC_state);
12071 Set_Node_Length(ret, 2); /* MJD */
12077 ret = regclass(pRExC_state, flagp,depth+1,
12078 TRUE, /* means just parse this element */
12079 FALSE, /* don't allow multi-char folds */
12080 FALSE, /* don't silence non-portable warnings. It
12081 would be a bug if these returned
12083 (bool) RExC_strict,
12084 TRUE, /* Allow an optimized regnode result */
12086 if (*flagp & RESTART_PASS1)
12088 /* regclass() can only return RESTART_PASS1 and NEED_UTF8 if
12089 * multi-char folds are allowed. */
12091 FAIL2("panic: regclass returned NULL to regatom, flags=%#"UVxf"",
12096 Set_Node_Offset(ret, parse_start);
12097 Set_Node_Cur_Length(ret, parse_start - 2);
12098 nextchar(pRExC_state);
12101 /* Handle \N, \N{} and \N{NAMED SEQUENCE} (the latter meaning the
12102 * \N{...} evaluates to a sequence of more than one code points).
12103 * The function call below returns a regnode, which is our result.
12104 * The parameters cause it to fail if the \N{} evaluates to a
12105 * single code point; we handle those like any other literal. The
12106 * reason that the multicharacter case is handled here and not as
12107 * part of the EXACtish code is because of quantifiers. In
12108 * /\N{BLAH}+/, the '+' applies to the whole thing, and doing it
12109 * this way makes that Just Happen. dmq.
12110 * join_exact() will join this up with adjacent EXACTish nodes
12111 * later on, if appropriate. */
12113 if (grok_bslash_N(pRExC_state,
12114 &ret, /* Want a regnode returned */
12115 NULL, /* Fail if evaluates to a single code
12117 NULL, /* Don't need a count of how many code
12125 if (*flagp & RESTART_PASS1)
12128 /* Here, evaluates to a single code point. Go get that */
12129 RExC_parse = parse_start;
12132 case 'k': /* Handle \k<NAME> and \k'NAME' */
12135 char ch= RExC_parse[1];
12136 if (ch != '<' && ch != '\'' && ch != '{') {
12138 /* diag_listed_as: Sequence \%s... not terminated in regex; marked by <-- HERE in m/%s/ */
12139 vFAIL2("Sequence %.2s... not terminated",parse_start);
12141 /* this pretty much dupes the code for (?P=...) in reg(), if
12142 you change this make sure you change that */
12143 char* name_start = (RExC_parse += 2);
12145 SV *sv_dat = reg_scan_name(pRExC_state,
12146 SIZE_ONLY ? REG_RSN_RETURN_NULL : REG_RSN_RETURN_DATA);
12147 ch= (ch == '<') ? '>' : (ch == '{') ? '}' : '\'';
12148 if (RExC_parse == name_start || *RExC_parse != ch)
12149 /* diag_listed_as: Sequence \%s... not terminated in regex; marked by <-- HERE in m/%s/ */
12150 vFAIL2("Sequence %.3s... not terminated",parse_start);
12153 num = add_data( pRExC_state, STR_WITH_LEN("S"));
12154 RExC_rxi->data->data[num]=(void*)sv_dat;
12155 SvREFCNT_inc_simple_void(sv_dat);
12159 ret = reganode(pRExC_state,
12162 : (ASCII_FOLD_RESTRICTED)
12164 : (AT_LEAST_UNI_SEMANTICS)
12170 *flagp |= HASWIDTH;
12172 /* override incorrect value set in reganode MJD */
12173 Set_Node_Offset(ret, parse_start+1);
12174 Set_Node_Cur_Length(ret, parse_start);
12175 nextchar(pRExC_state);
12181 case '1': case '2': case '3': case '4':
12182 case '5': case '6': case '7': case '8': case '9':
12187 if (*RExC_parse == 'g') {
12191 if (*RExC_parse == '{') {
12195 if (*RExC_parse == '-') {
12199 if (hasbrace && !isDIGIT(*RExC_parse)) {
12200 if (isrel) RExC_parse--;
12202 goto parse_named_seq;
12205 num = S_backref_value(RExC_parse);
12207 vFAIL("Reference to invalid group 0");
12208 else if (num == I32_MAX) {
12209 if (isDIGIT(*RExC_parse))
12210 vFAIL("Reference to nonexistent group");
12212 vFAIL("Unterminated \\g... pattern");
12216 num = RExC_npar - num;
12218 vFAIL("Reference to nonexistent or unclosed group");
12222 num = S_backref_value(RExC_parse);
12223 /* bare \NNN might be backref or octal - if it is larger
12224 * than or equal RExC_npar then it is assumed to be an
12225 * octal escape. Note RExC_npar is +1 from the actual
12226 * number of parens. */
12227 /* Note we do NOT check if num == I32_MAX here, as that is
12228 * handled by the RExC_npar check */
12231 /* any numeric escape < 10 is always a backref */
12233 /* any numeric escape < RExC_npar is a backref */
12234 && num >= RExC_npar
12235 /* cannot be an octal escape if it starts with 8 */
12236 && *RExC_parse != '8'
12237 /* cannot be an octal escape it it starts with 9 */
12238 && *RExC_parse != '9'
12241 /* Probably not a backref, instead likely to be an
12242 * octal character escape, e.g. \35 or \777.
12243 * The above logic should make it obvious why using
12244 * octal escapes in patterns is problematic. - Yves */
12245 RExC_parse = parse_start;
12250 /* At this point RExC_parse points at a numeric escape like
12251 * \12 or \88 or something similar, which we should NOT treat
12252 * as an octal escape. It may or may not be a valid backref
12253 * escape. For instance \88888888 is unlikely to be a valid
12255 while (isDIGIT(*RExC_parse))
12258 if (*RExC_parse != '}')
12259 vFAIL("Unterminated \\g{...} pattern");
12263 if (num > (I32)RExC_rx->nparens)
12264 vFAIL("Reference to nonexistent group");
12267 ret = reganode(pRExC_state,
12270 : (ASCII_FOLD_RESTRICTED)
12272 : (AT_LEAST_UNI_SEMANTICS)
12278 *flagp |= HASWIDTH;
12280 /* override incorrect value set in reganode MJD */
12281 Set_Node_Offset(ret, parse_start);
12282 Set_Node_Cur_Length(ret, parse_start-1);
12283 skip_to_be_ignored_text(pRExC_state, &RExC_parse,
12284 FALSE /* Don't force to /x */ );
12288 if (RExC_parse >= RExC_end)
12289 FAIL("Trailing \\");
12292 /* Do not generate "unrecognized" warnings here, we fall
12293 back into the quick-grab loop below */
12294 RExC_parse = parse_start;
12296 } /* end of switch on a \foo sequence */
12301 /* '#' comments should have been spaced over before this function was
12303 assert((RExC_flags & RXf_PMf_EXTENDED) == 0);
12305 if (RExC_flags & RXf_PMf_EXTENDED) {
12306 RExC_parse = reg_skipcomment( pRExC_state, RExC_parse );
12307 if (RExC_parse < RExC_end)
12317 /* Here, we have determined that the next thing is probably a
12318 * literal character. RExC_parse points to the first byte of its
12319 * definition. (It still may be an escape sequence that evaluates
12320 * to a single character) */
12326 #define MAX_NODE_STRING_SIZE 127
12327 char foldbuf[MAX_NODE_STRING_SIZE+UTF8_MAXBYTES_CASE];
12329 U8 upper_parse = MAX_NODE_STRING_SIZE;
12330 U8 node_type = compute_EXACTish(pRExC_state);
12331 bool next_is_quantifier;
12332 char * oldp = NULL;
12334 /* We can convert EXACTF nodes to EXACTFU if they contain only
12335 * characters that match identically regardless of the target
12336 * string's UTF8ness. The reason to do this is that EXACTF is not
12337 * trie-able, EXACTFU is.
12339 * Similarly, we can convert EXACTFL nodes to EXACTFLU8 if they
12340 * contain only above-Latin1 characters (hence must be in UTF8),
12341 * which don't participate in folds with Latin1-range characters,
12342 * as the latter's folds aren't known until runtime. (We don't
12343 * need to figure this out until pass 2) */
12344 bool maybe_exactfu = PASS2
12345 && (node_type == EXACTF || node_type == EXACTFL);
12347 /* If a folding node contains only code points that don't
12348 * participate in folds, it can be changed into an EXACT node,
12349 * which allows the optimizer more things to look for */
12352 ret = reg_node(pRExC_state, node_type);
12354 /* In pass1, folded, we use a temporary buffer instead of the
12355 * actual node, as the node doesn't exist yet */
12356 s = (SIZE_ONLY && FOLD) ? foldbuf : STRING(ret);
12362 /* We look for the EXACTFish to EXACT node optimizaton only if
12363 * folding. (And we don't need to figure this out until pass 2) */
12364 maybe_exact = FOLD && PASS2;
12366 /* XXX The node can hold up to 255 bytes, yet this only goes to
12367 * 127. I (khw) do not know why. Keeping it somewhat less than
12368 * 255 allows us to not have to worry about overflow due to
12369 * converting to utf8 and fold expansion, but that value is
12370 * 255-UTF8_MAXBYTES_CASE. join_exact() may join adjacent nodes
12371 * split up by this limit into a single one using the real max of
12372 * 255. Even at 127, this breaks under rare circumstances. If
12373 * folding, we do not want to split a node at a character that is a
12374 * non-final in a multi-char fold, as an input string could just
12375 * happen to want to match across the node boundary. The join
12376 * would solve that problem if the join actually happens. But a
12377 * series of more than two nodes in a row each of 127 would cause
12378 * the first join to succeed to get to 254, but then there wouldn't
12379 * be room for the next one, which could at be one of those split
12380 * multi-char folds. I don't know of any fool-proof solution. One
12381 * could back off to end with only a code point that isn't such a
12382 * non-final, but it is possible for there not to be any in the
12385 assert( ! UTF /* Is at the beginning of a character */
12386 || UTF8_IS_INVARIANT(UCHARAT(RExC_parse))
12387 || UTF8_IS_START(UCHARAT(RExC_parse)));
12389 for (p = RExC_parse;
12390 len < upper_parse && p < RExC_end;
12395 /* White space has already been ignored */
12396 assert( (RExC_flags & RXf_PMf_EXTENDED) == 0
12397 || ! is_PATWS_safe((p), RExC_end, UTF));
12409 /* Literal Escapes Switch
12411 This switch is meant to handle escape sequences that
12412 resolve to a literal character.
12414 Every escape sequence that represents something
12415 else, like an assertion or a char class, is handled
12416 in the switch marked 'Special Escapes' above in this
12417 routine, but also has an entry here as anything that
12418 isn't explicitly mentioned here will be treated as
12419 an unescaped equivalent literal.
12422 switch ((U8)*++p) {
12423 /* These are all the special escapes. */
12424 case 'A': /* Start assertion */
12425 case 'b': case 'B': /* Word-boundary assertion*/
12426 case 'C': /* Single char !DANGEROUS! */
12427 case 'd': case 'D': /* digit class */
12428 case 'g': case 'G': /* generic-backref, pos assertion */
12429 case 'h': case 'H': /* HORIZWS */
12430 case 'k': case 'K': /* named backref, keep marker */
12431 case 'p': case 'P': /* Unicode property */
12432 case 'R': /* LNBREAK */
12433 case 's': case 'S': /* space class */
12434 case 'v': case 'V': /* VERTWS */
12435 case 'w': case 'W': /* word class */
12436 case 'X': /* eXtended Unicode "combining
12437 character sequence" */
12438 case 'z': case 'Z': /* End of line/string assertion */
12442 /* Anything after here is an escape that resolves to a
12443 literal. (Except digits, which may or may not)
12449 case 'N': /* Handle a single-code point named character. */
12450 RExC_parse = p + 1;
12451 if (! grok_bslash_N(pRExC_state,
12452 NULL, /* Fail if evaluates to
12453 anything other than a
12454 single code point */
12455 &ender, /* The returned single code
12457 NULL, /* Don't need a count of
12458 how many code points */
12462 if (*flagp & NEED_UTF8)
12463 FAIL("panic: grok_bslash_N set NEED_UTF8");
12464 if (*flagp & RESTART_PASS1)
12467 /* Here, it wasn't a single code point. Go close
12468 * up this EXACTish node. The switch() prior to
12469 * this switch handles the other cases */
12470 RExC_parse = p = oldp;
12474 if (ender > 0xff) {
12475 REQUIRE_UTF8(flagp);
12491 ender = ESC_NATIVE;
12501 const char* error_msg;
12503 bool valid = grok_bslash_o(&p,
12506 PASS2, /* out warnings */
12507 (bool) RExC_strict,
12508 TRUE, /* Output warnings
12513 RExC_parse = p; /* going to die anyway; point
12514 to exact spot of failure */
12518 if (IN_ENCODING && ender < 0x100) {
12519 goto recode_encoding;
12521 if (ender > 0xff) {
12522 REQUIRE_UTF8(flagp);
12528 UV result = UV_MAX; /* initialize to erroneous
12530 const char* error_msg;
12532 bool valid = grok_bslash_x(&p,
12535 PASS2, /* out warnings */
12536 (bool) RExC_strict,
12537 TRUE, /* Silence warnings
12542 RExC_parse = p; /* going to die anyway; point
12543 to exact spot of failure */
12548 if (ender < 0x100) {
12550 if (RExC_recode_x_to_native) {
12551 ender = LATIN1_TO_NATIVE(ender);
12556 goto recode_encoding;
12560 REQUIRE_UTF8(flagp);
12566 ender = grok_bslash_c(*p++, PASS2);
12568 case '8': case '9': /* must be a backreference */
12570 /* we have an escape like \8 which cannot be an octal escape
12571 * so we exit the loop, and let the outer loop handle this
12572 * escape which may or may not be a legitimate backref. */
12574 case '1': case '2': case '3':case '4':
12575 case '5': case '6': case '7':
12576 /* When we parse backslash escapes there is ambiguity
12577 * between backreferences and octal escapes. Any escape
12578 * from \1 - \9 is a backreference, any multi-digit
12579 * escape which does not start with 0 and which when
12580 * evaluated as decimal could refer to an already
12581 * parsed capture buffer is a back reference. Anything
12584 * Note this implies that \118 could be interpreted as
12585 * 118 OR as "\11" . "8" depending on whether there
12586 * were 118 capture buffers defined already in the
12589 /* NOTE, RExC_npar is 1 more than the actual number of
12590 * parens we have seen so far, hence the < RExC_npar below. */
12592 if ( !isDIGIT(p[1]) || S_backref_value(p) < RExC_npar)
12593 { /* Not to be treated as an octal constant, go
12601 I32 flags = PERL_SCAN_SILENT_ILLDIGIT;
12603 ender = grok_oct(p, &numlen, &flags, NULL);
12604 if (ender > 0xff) {
12605 REQUIRE_UTF8(flagp);
12608 if (PASS2 /* like \08, \178 */
12611 && isDIGIT(*p) && ckWARN(WARN_REGEXP))
12613 reg_warn_non_literal_string(
12615 form_short_octal_warning(p, numlen));
12618 if (IN_ENCODING && ender < 0x100)
12619 goto recode_encoding;
12622 if (! RExC_override_recoding) {
12623 SV* enc = _get_encoding();
12624 ender = reg_recode((const char)(U8)ender, &enc);
12626 ckWARNreg(p, "Invalid escape in the specified encoding");
12627 REQUIRE_UTF8(flagp);
12632 FAIL("Trailing \\");
12635 if (!SIZE_ONLY&& isALPHANUMERIC(*p)) {
12636 /* Include any left brace following the alpha to emphasize
12637 * that it could be part of an escape at some point
12639 int len = (isALPHA(*p) && *(p + 1) == '{') ? 2 : 1;
12640 ckWARN3reg(p + len, "Unrecognized escape \\%.*s passed through", len, p);
12642 goto normal_default;
12643 } /* End of switch on '\' */
12646 /* Currently we don't warn when the lbrace is at the start
12647 * of a construct. This catches it in the middle of a
12648 * literal string, or when it's the first thing after
12649 * something like "\b" */
12651 && (len || (p > RExC_start && isALPHA_A(*(p -1)))))
12653 ckWARNregdep(p + 1, "Unescaped left brace in regex is deprecated, passed through");
12656 default: /* A literal character */
12658 if (! UTF8_IS_INVARIANT(*p) && UTF) {
12660 ender = utf8n_to_uvchr((U8*)p, RExC_end - p,
12661 &numlen, UTF8_ALLOW_DEFAULT);
12667 } /* End of switch on the literal */
12669 /* Here, have looked at the literal character and <ender>
12670 * contains its ordinal, <p> points to the character after it.
12671 * We need to check if the next non-ignored thing is a
12672 * quantifier. Move <p> to after anything that should be
12673 * ignored, which, as a side effect, positions <p> for the next
12674 * loop iteration */
12675 skip_to_be_ignored_text(pRExC_state, &p,
12676 FALSE /* Don't force to /x */ );
12678 /* If the next thing is a quantifier, it applies to this
12679 * character only, which means that this character has to be in
12680 * its own node and can't just be appended to the string in an
12681 * existing node, so if there are already other characters in
12682 * the node, close the node with just them, and set up to do
12683 * this character again next time through, when it will be the
12684 * only thing in its new node */
12685 if ((next_is_quantifier = ( LIKELY(p < RExC_end)
12686 && UNLIKELY(ISMULT2(p))))
12693 /* Ready to add 'ender' to the node */
12695 if (! FOLD) { /* The simple case, just append the literal */
12697 /* In the sizing pass, we need only the size of the
12698 * character we are appending, hence we can delay getting
12699 * its representation until PASS2. */
12702 const STRLEN unilen = UVCHR_SKIP(ender);
12705 /* We have to subtract 1 just below (and again in
12706 * the corresponding PASS2 code) because the loop
12707 * increments <len> each time, as all but this path
12708 * (and one other) through it add a single byte to
12709 * the EXACTish node. But these paths would change
12710 * len to be the correct final value, so cancel out
12711 * the increment that follows */
12717 } else { /* PASS2 */
12720 U8 * new_s = uvchr_to_utf8((U8*)s, ender);
12721 len += (char *) new_s - s - 1;
12722 s = (char *) new_s;
12725 *(s++) = (char) ender;
12729 else if (LOC && is_PROBLEMATIC_LOCALE_FOLD_cp(ender)) {
12731 /* Here are folding under /l, and the code point is
12732 * problematic. First, we know we can't simplify things */
12733 maybe_exact = FALSE;
12734 maybe_exactfu = FALSE;
12736 /* A problematic code point in this context means that its
12737 * fold isn't known until runtime, so we can't fold it now.
12738 * (The non-problematic code points are the above-Latin1
12739 * ones that fold to also all above-Latin1. Their folds
12740 * don't vary no matter what the locale is.) But here we
12741 * have characters whose fold depends on the locale.
12742 * Unlike the non-folding case above, we have to keep track
12743 * of these in the sizing pass, so that we can make sure we
12744 * don't split too-long nodes in the middle of a potential
12745 * multi-char fold. And unlike the regular fold case
12746 * handled in the else clauses below, we don't actually
12747 * fold and don't have special cases to consider. What we
12748 * do for both passes is the PASS2 code for non-folding */
12749 goto not_fold_common;
12751 else /* A regular FOLD code point */
12753 #if UNICODE_MAJOR_VERSION > 3 /* no multifolds in early Unicode */ \
12754 || (UNICODE_MAJOR_VERSION == 3 && ( UNICODE_DOT_VERSION > 0) \
12755 || UNICODE_DOT_DOT_VERSION > 0)
12756 /* See comments for join_exact() as to why we fold this
12757 * non-UTF at compile time */
12758 || (node_type == EXACTFU
12759 && ender == LATIN_SMALL_LETTER_SHARP_S)
12762 /* Here, are folding and are not UTF-8 encoded; therefore
12763 * the character must be in the range 0-255, and is not /l
12764 * (Not /l because we already handled these under /l in
12765 * is_PROBLEMATIC_LOCALE_FOLD_cp) */
12766 if (IS_IN_SOME_FOLD_L1(ender)) {
12767 maybe_exact = FALSE;
12769 /* See if the character's fold differs between /d and
12770 * /u. This includes the multi-char fold SHARP S to
12772 if (UNLIKELY(ender == LATIN_SMALL_LETTER_SHARP_S)) {
12773 RExC_seen_unfolded_sharp_s = 1;
12774 maybe_exactfu = FALSE;
12776 else if (maybe_exactfu
12777 && (PL_fold[ender] != PL_fold_latin1[ender]
12778 #if UNICODE_MAJOR_VERSION > 3 /* no multifolds in early Unicode */ \
12779 || (UNICODE_MAJOR_VERSION == 3 && ( UNICODE_DOT_VERSION > 0) \
12780 || UNICODE_DOT_DOT_VERSION > 0)
12782 && isALPHA_FOLD_EQ(ender, 's')
12783 && isALPHA_FOLD_EQ(*(s-1), 's'))
12786 maybe_exactfu = FALSE;
12790 /* Even when folding, we store just the input character, as
12791 * we have an array that finds its fold quickly */
12792 *(s++) = (char) ender;
12794 else { /* FOLD, and UTF (or sharp s) */
12795 /* Unlike the non-fold case, we do actually have to
12796 * calculate the results here in pass 1. This is for two
12797 * reasons, the folded length may be longer than the
12798 * unfolded, and we have to calculate how many EXACTish
12799 * nodes it will take; and we may run out of room in a node
12800 * in the middle of a potential multi-char fold, and have
12801 * to back off accordingly. */
12804 if (isASCII_uni(ender)) {
12805 folded = toFOLD(ender);
12806 *(s)++ = (U8) folded;
12811 folded = _to_uni_fold_flags(
12815 FOLD_FLAGS_FULL | ((ASCII_FOLD_RESTRICTED)
12816 ? FOLD_FLAGS_NOMIX_ASCII
12820 /* The loop increments <len> each time, as all but this
12821 * path (and one other) through it add a single byte to
12822 * the EXACTish node. But this one has changed len to
12823 * be the correct final value, so subtract one to
12824 * cancel out the increment that follows */
12825 len += foldlen - 1;
12827 /* If this node only contains non-folding code points so
12828 * far, see if this new one is also non-folding */
12830 if (folded != ender) {
12831 maybe_exact = FALSE;
12834 /* Here the fold is the original; we have to check
12835 * further to see if anything folds to it */
12836 if (_invlist_contains_cp(PL_utf8_foldable,
12839 maybe_exact = FALSE;
12846 if (next_is_quantifier) {
12848 /* Here, the next input is a quantifier, and to get here,
12849 * the current character is the only one in the node.
12850 * Also, here <len> doesn't include the final byte for this
12856 } /* End of loop through literal characters */
12858 /* Here we have either exhausted the input or ran out of room in
12859 * the node. (If we encountered a character that can't be in the
12860 * node, transfer is made directly to <loopdone>, and so we
12861 * wouldn't have fallen off the end of the loop.) In the latter
12862 * case, we artificially have to split the node into two, because
12863 * we just don't have enough space to hold everything. This
12864 * creates a problem if the final character participates in a
12865 * multi-character fold in the non-final position, as a match that
12866 * should have occurred won't, due to the way nodes are matched,
12867 * and our artificial boundary. So back off until we find a non-
12868 * problematic character -- one that isn't at the beginning or
12869 * middle of such a fold. (Either it doesn't participate in any
12870 * folds, or appears only in the final position of all the folds it
12871 * does participate in.) A better solution with far fewer false
12872 * positives, and that would fill the nodes more completely, would
12873 * be to actually have available all the multi-character folds to
12874 * test against, and to back-off only far enough to be sure that
12875 * this node isn't ending with a partial one. <upper_parse> is set
12876 * further below (if we need to reparse the node) to include just
12877 * up through that final non-problematic character that this code
12878 * identifies, so when it is set to less than the full node, we can
12879 * skip the rest of this */
12880 if (FOLD && p < RExC_end && upper_parse == MAX_NODE_STRING_SIZE) {
12882 const STRLEN full_len = len;
12884 assert(len >= MAX_NODE_STRING_SIZE);
12886 /* Here, <s> points to the final byte of the final character.
12887 * Look backwards through the string until find a non-
12888 * problematic character */
12892 /* This has no multi-char folds to non-UTF characters */
12893 if (ASCII_FOLD_RESTRICTED) {
12897 while (--s >= s0 && IS_NON_FINAL_FOLD(*s)) { }
12901 if (! PL_NonL1NonFinalFold) {
12902 PL_NonL1NonFinalFold = _new_invlist_C_array(
12903 NonL1_Perl_Non_Final_Folds_invlist);
12906 /* Point to the first byte of the final character */
12907 s = (char *) utf8_hop((U8 *) s, -1);
12909 while (s >= s0) { /* Search backwards until find
12910 non-problematic char */
12911 if (UTF8_IS_INVARIANT(*s)) {
12913 /* There are no ascii characters that participate
12914 * in multi-char folds under /aa. In EBCDIC, the
12915 * non-ascii invariants are all control characters,
12916 * so don't ever participate in any folds. */
12917 if (ASCII_FOLD_RESTRICTED
12918 || ! IS_NON_FINAL_FOLD(*s))
12923 else if (UTF8_IS_DOWNGRADEABLE_START(*s)) {
12924 if (! IS_NON_FINAL_FOLD(EIGHT_BIT_UTF8_TO_NATIVE(
12930 else if (! _invlist_contains_cp(
12931 PL_NonL1NonFinalFold,
12932 valid_utf8_to_uvchr((U8 *) s, NULL)))
12937 /* Here, the current character is problematic in that
12938 * it does occur in the non-final position of some
12939 * fold, so try the character before it, but have to
12940 * special case the very first byte in the string, so
12941 * we don't read outside the string */
12942 s = (s == s0) ? s -1 : (char *) utf8_hop((U8 *) s, -1);
12943 } /* End of loop backwards through the string */
12945 /* If there were only problematic characters in the string,
12946 * <s> will point to before s0, in which case the length
12947 * should be 0, otherwise include the length of the
12948 * non-problematic character just found */
12949 len = (s < s0) ? 0 : s - s0 + UTF8SKIP(s);
12952 /* Here, have found the final character, if any, that is
12953 * non-problematic as far as ending the node without splitting
12954 * it across a potential multi-char fold. <len> contains the
12955 * number of bytes in the node up-to and including that
12956 * character, or is 0 if there is no such character, meaning
12957 * the whole node contains only problematic characters. In
12958 * this case, give up and just take the node as-is. We can't
12963 /* If the node ends in an 's' we make sure it stays EXACTF,
12964 * as if it turns into an EXACTFU, it could later get
12965 * joined with another 's' that would then wrongly match
12967 if (maybe_exactfu && isALPHA_FOLD_EQ(ender, 's'))
12969 maybe_exactfu = FALSE;
12973 /* Here, the node does contain some characters that aren't
12974 * problematic. If one such is the final character in the
12975 * node, we are done */
12976 if (len == full_len) {
12979 else if (len + ((UTF) ? UTF8SKIP(s) : 1) == full_len) {
12981 /* If the final character is problematic, but the
12982 * penultimate is not, back-off that last character to
12983 * later start a new node with it */
12988 /* Here, the final non-problematic character is earlier
12989 * in the input than the penultimate character. What we do
12990 * is reparse from the beginning, going up only as far as
12991 * this final ok one, thus guaranteeing that the node ends
12992 * in an acceptable character. The reason we reparse is
12993 * that we know how far in the character is, but we don't
12994 * know how to correlate its position with the input parse.
12995 * An alternate implementation would be to build that
12996 * correlation as we go along during the original parse,
12997 * but that would entail extra work for every node, whereas
12998 * this code gets executed only when the string is too
12999 * large for the node, and the final two characters are
13000 * problematic, an infrequent occurrence. Yet another
13001 * possible strategy would be to save the tail of the
13002 * string, and the next time regatom is called, initialize
13003 * with that. The problem with this is that unless you
13004 * back off one more character, you won't be guaranteed
13005 * regatom will get called again, unless regbranch,
13006 * regpiece ... are also changed. If you do back off that
13007 * extra character, so that there is input guaranteed to
13008 * force calling regatom, you can't handle the case where
13009 * just the first character in the node is acceptable. I
13010 * (khw) decided to try this method which doesn't have that
13011 * pitfall; if performance issues are found, we can do a
13012 * combination of the current approach plus that one */
13018 } /* End of verifying node ends with an appropriate char */
13020 loopdone: /* Jumped to when encounters something that shouldn't be
13023 /* I (khw) don't know if you can get here with zero length, but the
13024 * old code handled this situation by creating a zero-length EXACT
13025 * node. Might as well be NOTHING instead */
13031 /* If 'maybe_exact' is still set here, means there are no
13032 * code points in the node that participate in folds;
13033 * similarly for 'maybe_exactfu' and code points that match
13034 * differently depending on UTF8ness of the target string
13035 * (for /u), or depending on locale for /l */
13041 else if (maybe_exactfu) {
13047 alloc_maybe_populate_EXACT(pRExC_state, ret, flagp, len, ender,
13048 FALSE /* Don't look to see if could
13049 be turned into an EXACT
13050 node, as we have already
13055 RExC_parse = p - 1;
13056 Set_Node_Cur_Length(ret, parse_start);
13058 skip_to_be_ignored_text(pRExC_state, &RExC_parse,
13059 FALSE /* Don't force to /x */ );
13061 /* len is STRLEN which is unsigned, need to copy to signed */
13064 vFAIL("Internal disaster");
13067 } /* End of label 'defchar:' */
13069 } /* End of giant switch on input character */
13076 S_populate_ANYOF_from_invlist(pTHX_ regnode *node, SV** invlist_ptr)
13078 /* Uses the inversion list '*invlist_ptr' to populate the ANYOF 'node'. It
13079 * sets up the bitmap and any flags, removing those code points from the
13080 * inversion list, setting it to NULL should it become completely empty */
13082 PERL_ARGS_ASSERT_POPULATE_ANYOF_FROM_INVLIST;
13083 assert(PL_regkind[OP(node)] == ANYOF);
13085 ANYOF_BITMAP_ZERO(node);
13086 if (*invlist_ptr) {
13088 /* This gets set if we actually need to modify things */
13089 bool change_invlist = FALSE;
13093 /* Start looking through *invlist_ptr */
13094 invlist_iterinit(*invlist_ptr);
13095 while (invlist_iternext(*invlist_ptr, &start, &end)) {
13099 if (end == UV_MAX && start <= NUM_ANYOF_CODE_POINTS) {
13100 ANYOF_FLAGS(node) |= ANYOF_MATCHES_ALL_ABOVE_BITMAP;
13102 else if (end >= NUM_ANYOF_CODE_POINTS) {
13103 ANYOF_FLAGS(node) |= ANYOF_HAS_UTF8_NONBITMAP_MATCHES;
13106 /* Quit if are above what we should change */
13107 if (start >= NUM_ANYOF_CODE_POINTS) {
13111 change_invlist = TRUE;
13113 /* Set all the bits in the range, up to the max that we are doing */
13114 high = (end < NUM_ANYOF_CODE_POINTS - 1)
13116 : NUM_ANYOF_CODE_POINTS - 1;
13117 for (i = start; i <= (int) high; i++) {
13118 if (! ANYOF_BITMAP_TEST(node, i)) {
13119 ANYOF_BITMAP_SET(node, i);
13123 invlist_iterfinish(*invlist_ptr);
13125 /* Done with loop; remove any code points that are in the bitmap from
13126 * *invlist_ptr; similarly for code points above the bitmap if we have
13127 * a flag to match all of them anyways */
13128 if (change_invlist) {
13129 _invlist_subtract(*invlist_ptr, PL_InBitmap, invlist_ptr);
13131 if (ANYOF_FLAGS(node) & ANYOF_MATCHES_ALL_ABOVE_BITMAP) {
13132 _invlist_intersection(*invlist_ptr, PL_InBitmap, invlist_ptr);
13135 /* If have completely emptied it, remove it completely */
13136 if (_invlist_len(*invlist_ptr) == 0) {
13137 SvREFCNT_dec_NN(*invlist_ptr);
13138 *invlist_ptr = NULL;
13143 /* Parse POSIX character classes: [[:foo:]], [[=foo=]], [[.foo.]].
13144 Character classes ([:foo:]) can also be negated ([:^foo:]).
13145 Returns a named class id (ANYOF_XXX) if successful, -1 otherwise.
13146 Equivalence classes ([=foo=]) and composites ([.foo.]) are parsed,
13147 but trigger failures because they are currently unimplemented. */
13149 #define POSIXCC_DONE(c) ((c) == ':')
13150 #define POSIXCC_NOTYET(c) ((c) == '=' || (c) == '.')
13151 #define POSIXCC(c) (POSIXCC_DONE(c) || POSIXCC_NOTYET(c))
13153 PERL_STATIC_INLINE I32
13154 S_regpposixcc(pTHX_ RExC_state_t *pRExC_state, I32 value, const bool strict)
13156 I32 namedclass = OOB_NAMEDCLASS;
13158 PERL_ARGS_ASSERT_REGPPOSIXCC;
13160 if (value == '[' && RExC_parse + 1 < RExC_end &&
13161 /* I smell either [: or [= or [. -- POSIX has been here, right? */
13162 POSIXCC(UCHARAT(RExC_parse)))
13164 const char c = UCHARAT(RExC_parse);
13165 char* const s = RExC_parse++;
13167 while (RExC_parse < RExC_end && UCHARAT(RExC_parse) != c)
13169 if (RExC_parse == RExC_end) {
13172 /* Try to give a better location for the error (than the end of
13173 * the string) by looking for the matching ']' */
13175 while (RExC_parse < RExC_end && UCHARAT(RExC_parse) != ']') {
13178 vFAIL2("Unmatched '%c' in POSIX class", c);
13180 /* Grandfather lone [:, [=, [. */
13184 const char* const t = RExC_parse++; /* skip over the c */
13187 if (UCHARAT(RExC_parse) == ']') {
13188 const char *posixcc = s + 1;
13189 RExC_parse++; /* skip over the ending ] */
13192 const I32 complement = *posixcc == '^' ? *posixcc++ : 0;
13193 const I32 skip = t - posixcc;
13195 /* Initially switch on the length of the name. */
13198 if (memEQ(posixcc, "word", 4)) /* this is not POSIX,
13199 this is the Perl \w
13201 namedclass = ANYOF_WORDCHAR;
13204 /* Names all of length 5. */
13205 /* alnum alpha ascii blank cntrl digit graph lower
13206 print punct space upper */
13207 /* Offset 4 gives the best switch position. */
13208 switch (posixcc[4]) {
13210 if (memEQ(posixcc, "alph", 4)) /* alpha */
13211 namedclass = ANYOF_ALPHA;
13214 if (memEQ(posixcc, "spac", 4)) /* space */
13215 namedclass = ANYOF_SPACE;
13218 if (memEQ(posixcc, "grap", 4)) /* graph */
13219 namedclass = ANYOF_GRAPH;
13222 if (memEQ(posixcc, "asci", 4)) /* ascii */
13223 namedclass = ANYOF_ASCII;
13226 if (memEQ(posixcc, "blan", 4)) /* blank */
13227 namedclass = ANYOF_BLANK;
13230 if (memEQ(posixcc, "cntr", 4)) /* cntrl */
13231 namedclass = ANYOF_CNTRL;
13234 if (memEQ(posixcc, "alnu", 4)) /* alnum */
13235 namedclass = ANYOF_ALPHANUMERIC;
13238 if (memEQ(posixcc, "lowe", 4)) /* lower */
13239 namedclass = (FOLD) ? ANYOF_CASED : ANYOF_LOWER;
13240 else if (memEQ(posixcc, "uppe", 4)) /* upper */
13241 namedclass = (FOLD) ? ANYOF_CASED : ANYOF_UPPER;
13244 if (memEQ(posixcc, "digi", 4)) /* digit */
13245 namedclass = ANYOF_DIGIT;
13246 else if (memEQ(posixcc, "prin", 4)) /* print */
13247 namedclass = ANYOF_PRINT;
13248 else if (memEQ(posixcc, "punc", 4)) /* punct */
13249 namedclass = ANYOF_PUNCT;
13254 if (memEQ(posixcc, "xdigit", 6))
13255 namedclass = ANYOF_XDIGIT;
13259 if (namedclass == OOB_NAMEDCLASS)
13261 "POSIX class [:%"UTF8f":] unknown",
13262 UTF8fARG(UTF, t - s - 1, s + 1));
13264 /* The #defines are structured so each complement is +1 to
13265 * the normal one */
13269 assert (posixcc[skip] == ':');
13270 assert (posixcc[skip+1] == ']');
13271 } else if (!SIZE_ONLY) {
13272 /* [[=foo=]] and [[.foo.]] are still future. */
13274 /* adjust RExC_parse so the warning shows after
13275 the class closes */
13276 while (UCHARAT(RExC_parse) && UCHARAT(RExC_parse) != ']')
13278 vFAIL3("POSIX syntax [%c %c] is reserved for future extensions", c, c);
13281 /* Maternal grandfather:
13282 * "[:" ending in ":" but not in ":]" */
13284 vFAIL("Unmatched '[' in POSIX class");
13287 /* Grandfather lone [:, [=, [. */
13297 S_could_it_be_a_POSIX_class(RExC_state_t *pRExC_state)
13299 /* This applies some heuristics at the current parse position (which should
13300 * be at a '[') to see if what follows might be intended to be a [:posix:]
13301 * class. It returns true if it really is a posix class, of course, but it
13302 * also can return true if it thinks that what was intended was a posix
13303 * class that didn't quite make it.
13305 * It will return true for
13307 * [:alphanumerics] (as long as the ] isn't followed immediately by a
13308 * ')' indicating the end of the (?[
13309 * [:any garbage including %^&$ punctuation:]
13311 * This is designed to be called only from S_handle_regex_sets; it could be
13312 * easily adapted to be called from the spot at the beginning of regclass()
13313 * that checks to see in a normal bracketed class if the surrounding []
13314 * have been omitted ([:word:] instead of [[:word:]]). But doing so would
13315 * change long-standing behavior, so I (khw) didn't do that */
13316 char* p = RExC_parse + 1;
13317 char first_char = *p;
13319 PERL_ARGS_ASSERT_COULD_IT_BE_A_POSIX_CLASS;
13321 assert(*(p - 1) == '[');
13323 if (! POSIXCC(first_char)) {
13328 while (p < RExC_end && isWORDCHAR(*p)) p++;
13330 if (p >= RExC_end) {
13334 if (p - RExC_parse > 2 /* Got at least 1 word character */
13335 && (*p == first_char
13336 || (*p == ']' && p + 1 < RExC_end && *(p + 1) != ')')))
13341 p = (char *) memchr(RExC_parse, ']', RExC_end - RExC_parse);
13344 && p - RExC_parse > 2 /* [:] evaluates to colon;
13345 [::] is a bad posix class. */
13346 && first_char == *(p - 1));
13349 STATIC unsigned int
13350 S_regex_set_precedence(const U8 my_operator) {
13352 /* Returns the precedence in the (?[...]) construct of the input operator,
13353 * specified by its character representation. The precedence follows
13354 * general Perl rules, but it extends this so that ')' and ']' have (low)
13355 * precedence even though they aren't really operators */
13357 switch (my_operator) {
13373 NOT_REACHED; /* NOTREACHED */
13374 return 0; /* Silence compiler warning */
13378 S_handle_regex_sets(pTHX_ RExC_state_t *pRExC_state, SV** return_invlist,
13379 I32 *flagp, U32 depth,
13380 char * const oregcomp_parse)
13382 /* Handle the (?[...]) construct to do set operations */
13384 U8 curchar; /* Current character being parsed */
13385 UV start, end; /* End points of code point ranges */
13386 SV* final = NULL; /* The end result inversion list */
13387 SV* result_string; /* 'final' stringified */
13388 AV* stack; /* stack of operators and operands not yet
13390 AV* fence_stack = NULL; /* A stack containing the positions in
13391 'stack' of where the undealt-with left
13392 parens would be if they were actually
13394 IV fence = 0; /* Position of where most recent undealt-
13395 with left paren in stack is; -1 if none.
13397 STRLEN len; /* Temporary */
13398 regnode* node; /* Temporary, and final regnode returned by
13400 const bool save_fold = FOLD; /* Temporary */
13401 char *save_end, *save_parse; /* Temporaries */
13402 const bool in_locale = LOC; /* we turn off /l during processing */
13404 GET_RE_DEBUG_FLAGS_DECL;
13406 PERL_ARGS_ASSERT_HANDLE_REGEX_SETS;
13409 set_regex_charset(&RExC_flags, REGEX_UNICODE_CHARSET);
13412 REQUIRE_UNI_RULES(flagp, NULL); /* The use of this operator implies /u.
13413 This is required so that the compile
13414 time values are valid in all runtime
13417 /* This will return only an ANYOF regnode, or (unlikely) something smaller
13418 * (such as EXACT). Thus we can skip most everything if just sizing. We
13419 * call regclass to handle '[]' so as to not have to reinvent its parsing
13420 * rules here (throwing away the size it computes each time). And, we exit
13421 * upon an unescaped ']' that isn't one ending a regclass. To do both
13422 * these things, we need to realize that something preceded by a backslash
13423 * is escaped, so we have to keep track of backslashes */
13425 UV depth = 0; /* how many nested (?[...]) constructs */
13427 while (RExC_parse < RExC_end) {
13428 SV* current = NULL;
13430 skip_to_be_ignored_text(pRExC_state, &RExC_parse,
13431 TRUE /* Force /x */ );
13433 switch (*RExC_parse) {
13435 if (RExC_parse[1] == '[') depth++, RExC_parse++;
13440 /* Skip the next byte (which could cause us to end up in
13441 * the middle of a UTF-8 character, but since none of those
13442 * are confusable with anything we currently handle in this
13443 * switch (invariants all), it's safe. We'll just hit the
13444 * default: case next time and keep on incrementing until
13445 * we find one of the invariants we do handle. */
13450 /* If this looks like it is a [:posix:] class, leave the
13451 * parse pointer at the '[' to fool regclass() into
13452 * thinking it is part of a '[[:posix:]]'. That function
13453 * will use strict checking to force a syntax error if it
13454 * doesn't work out to a legitimate class */
13455 bool is_posix_class
13456 = could_it_be_a_POSIX_class(pRExC_state);
13457 if (! is_posix_class) {
13461 /* regclass() can only return RESTART_PASS1 and NEED_UTF8
13462 * if multi-char folds are allowed. */
13463 if (!regclass(pRExC_state, flagp,depth+1,
13464 is_posix_class, /* parse the whole char
13465 class only if not a
13467 FALSE, /* don't allow multi-char folds */
13468 TRUE, /* silence non-portable warnings. */
13470 FALSE, /* Require return to be an ANYOF */
13473 FAIL2("panic: regclass returned NULL to handle_sets, "
13474 "flags=%#"UVxf"", (UV) *flagp);
13476 /* function call leaves parse pointing to the ']', except
13477 * if we faked it */
13478 if (is_posix_class) {
13482 SvREFCNT_dec(current); /* In case it returned something */
13487 if (depth--) break;
13489 if (RExC_parse < RExC_end
13490 && *RExC_parse == ')')
13492 node = reganode(pRExC_state, ANYOF, 0);
13493 RExC_size += ANYOF_SKIP;
13494 nextchar(pRExC_state);
13495 Set_Node_Length(node,
13496 RExC_parse - oregcomp_parse + 1); /* MJD */
13498 set_regex_charset(&RExC_flags, REGEX_LOCALE_CHARSET);
13506 RExC_parse += UTF ? UTF8SKIP(RExC_parse) : 1;
13510 FAIL("Syntax error in (?[...])");
13513 /* Pass 2 only after this. */
13514 Perl_ck_warner_d(aTHX_
13515 packWARN(WARN_EXPERIMENTAL__REGEX_SETS),
13516 "The regex_sets feature is experimental" REPORT_LOCATION,
13517 UTF8fARG(UTF, (RExC_parse - RExC_precomp), RExC_precomp),
13519 RExC_end - RExC_start - (RExC_parse - RExC_precomp),
13520 RExC_precomp + (RExC_parse - RExC_precomp)));
13522 /* Everything in this construct is a metacharacter. Operands begin with
13523 * either a '\' (for an escape sequence), or a '[' for a bracketed
13524 * character class. Any other character should be an operator, or
13525 * parenthesis for grouping. Both types of operands are handled by calling
13526 * regclass() to parse them. It is called with a parameter to indicate to
13527 * return the computed inversion list. The parsing here is implemented via
13528 * a stack. Each entry on the stack is a single character representing one
13529 * of the operators; or else a pointer to an operand inversion list. */
13531 #define IS_OPERATOR(a) SvIOK(a)
13532 #define IS_OPERAND(a) (! IS_OPERATOR(a))
13534 /* The stack is kept in Łukasiewicz order. (That's pronounced similar
13535 * to luke-a-shave-itch (or -itz), but people who didn't want to bother
13536 * with pronouncing it called it Reverse Polish instead, but now that YOU
13537 * know how to pronounce it you can use the correct term, thus giving due
13538 * credit to the person who invented it, and impressing your geek friends.
13539 * Wikipedia says that the pronounciation of "Ł" has been changing so that
13540 * it is now more like an English initial W (as in wonk) than an L.)
13542 * This means that, for example, 'a | b & c' is stored on the stack as
13550 * where the numbers in brackets give the stack [array] element number.
13551 * In this implementation, parentheses are not stored on the stack.
13552 * Instead a '(' creates a "fence" so that the part of the stack below the
13553 * fence is invisible except to the corresponding ')' (this allows us to
13554 * replace testing for parens, by using instead subtraction of the fence
13555 * position). As new operands are processed they are pushed onto the stack
13556 * (except as noted in the next paragraph). New operators of higher
13557 * precedence than the current final one are inserted on the stack before
13558 * the lhs operand (so that when the rhs is pushed next, everything will be
13559 * in the correct positions shown above. When an operator of equal or
13560 * lower precedence is encountered in parsing, all the stacked operations
13561 * of equal or higher precedence are evaluated, leaving the result as the
13562 * top entry on the stack. This makes higher precedence operations
13563 * evaluate before lower precedence ones, and causes operations of equal
13564 * precedence to left associate.
13566 * The only unary operator '!' is immediately pushed onto the stack when
13567 * encountered. When an operand is encountered, if the top of the stack is
13568 * a '!", the complement is immediately performed, and the '!' popped. The
13569 * resulting value is treated as a new operand, and the logic in the
13570 * previous paragraph is executed. Thus in the expression
13572 * the stack looks like
13578 * as 'b' gets parsed, the latter gets evaluated to '!b', and the stack
13585 * A ')' is treated as an operator with lower precedence than all the
13586 * aforementioned ones, which causes all operations on the stack above the
13587 * corresponding '(' to be evaluated down to a single resultant operand.
13588 * Then the fence for the '(' is removed, and the operand goes through the
13589 * algorithm above, without the fence.
13591 * A separate stack is kept of the fence positions, so that the position of
13592 * the latest so-far unbalanced '(' is at the top of it.
13594 * The ']' ending the construct is treated as the lowest operator of all,
13595 * so that everything gets evaluated down to a single operand, which is the
13598 sv_2mortal((SV *)(stack = newAV()));
13599 sv_2mortal((SV *)(fence_stack = newAV()));
13601 while (RExC_parse < RExC_end) {
13602 I32 top_index; /* Index of top-most element in 'stack' */
13603 SV** top_ptr; /* Pointer to top 'stack' element */
13604 SV* current = NULL; /* To contain the current inversion list
13606 SV* only_to_avoid_leaks;
13608 skip_to_be_ignored_text(pRExC_state, &RExC_parse,
13609 TRUE /* Force /x */ );
13610 if (RExC_parse >= RExC_end) {
13611 Perl_croak(aTHX_ "panic: Read past end of '(?[ ])'");
13614 curchar = UCHARAT(RExC_parse);
13618 top_index = av_tindex(stack);
13621 SV** stacked_ptr; /* Ptr to something already on 'stack' */
13622 char stacked_operator; /* The topmost operator on the 'stack'. */
13623 SV* lhs; /* Operand to the left of the operator */
13624 SV* rhs; /* Operand to the right of the operator */
13625 SV* fence_ptr; /* Pointer to top element of the fence
13630 if (RExC_parse < RExC_end && (UCHARAT(RExC_parse + 1) == '?'))
13632 /* If is a '(?', could be an embedded '(?flags:(?[...])'.
13633 * This happens when we have some thing like
13635 * my $thai_or_lao = qr/(?[ \p{Thai} + \p{Lao} ])/;
13637 * qr/(?[ \p{Digit} & $thai_or_lao ])/;
13639 * Here we would be handling the interpolated
13640 * '$thai_or_lao'. We handle this by a recursive call to
13641 * ourselves which returns the inversion list the
13642 * interpolated expression evaluates to. We use the flags
13643 * from the interpolated pattern. */
13644 U32 save_flags = RExC_flags;
13645 const char * save_parse;
13647 RExC_parse += 2; /* Skip past the '(?' */
13648 save_parse = RExC_parse;
13650 /* Parse any flags for the '(?' */
13651 parse_lparen_question_flags(pRExC_state);
13653 if (RExC_parse == save_parse /* Makes sure there was at
13654 least one flag (or else
13655 this embedding wasn't
13657 || RExC_parse >= RExC_end - 4
13658 || UCHARAT(RExC_parse) != ':'
13659 || UCHARAT(++RExC_parse) != '('
13660 || UCHARAT(++RExC_parse) != '?'
13661 || UCHARAT(++RExC_parse) != '[')
13664 /* In combination with the above, this moves the
13665 * pointer to the point just after the first erroneous
13666 * character (or if there are no flags, to where they
13667 * should have been) */
13668 if (RExC_parse >= RExC_end - 4) {
13669 RExC_parse = RExC_end;
13671 else if (RExC_parse != save_parse) {
13672 RExC_parse += (UTF) ? UTF8SKIP(RExC_parse) : 1;
13674 vFAIL("Expecting '(?flags:(?[...'");
13677 /* Recurse, with the meat of the embedded expression */
13679 (void) handle_regex_sets(pRExC_state, ¤t, flagp,
13680 depth+1, oregcomp_parse);
13682 /* Here, 'current' contains the embedded expression's
13683 * inversion list, and RExC_parse points to the trailing
13684 * ']'; the next character should be the ')' */
13686 assert(RExC_parse < RExC_end && UCHARAT(RExC_parse) == ')');
13688 /* Then the ')' matching the original '(' handled by this
13689 * case: statement */
13691 assert(RExC_parse < RExC_end && UCHARAT(RExC_parse) == ')');
13694 RExC_flags = save_flags;
13695 goto handle_operand;
13698 /* A regular '('. Look behind for illegal syntax */
13699 if (top_index - fence >= 0) {
13700 /* If the top entry on the stack is an operator, it had
13701 * better be a '!', otherwise the entry below the top
13702 * operand should be an operator */
13703 if ( ! (top_ptr = av_fetch(stack, top_index, FALSE))
13704 || (IS_OPERATOR(*top_ptr) && SvUV(*top_ptr) != '!')
13705 || ( IS_OPERAND(*top_ptr)
13706 && ( top_index - fence < 1
13707 || ! (stacked_ptr = av_fetch(stack,
13710 || ! IS_OPERATOR(*stacked_ptr))))
13713 vFAIL("Unexpected '(' with no preceding operator");
13717 /* Stack the position of this undealt-with left paren */
13718 fence = top_index + 1;
13719 av_push(fence_stack, newSViv(fence));
13723 /* regclass() can only return RESTART_PASS1 and NEED_UTF8 if
13724 * multi-char folds are allowed. */
13725 if (!regclass(pRExC_state, flagp,depth+1,
13726 TRUE, /* means parse just the next thing */
13727 FALSE, /* don't allow multi-char folds */
13728 FALSE, /* don't silence non-portable warnings. */
13730 FALSE, /* Require return to be an ANYOF */
13733 FAIL2("panic: regclass returned NULL to handle_sets, "
13734 "flags=%#"UVxf"", (UV) *flagp);
13737 /* regclass() will return with parsing just the \ sequence,
13738 * leaving the parse pointer at the next thing to parse */
13740 goto handle_operand;
13742 case '[': /* Is a bracketed character class */
13744 bool is_posix_class = could_it_be_a_POSIX_class(pRExC_state);
13746 if (! is_posix_class) {
13750 /* regclass() can only return RESTART_PASS1 and NEED_UTF8 if
13751 * multi-char folds are allowed. */
13752 if(!regclass(pRExC_state, flagp,depth+1,
13753 is_posix_class, /* parse the whole char class
13754 only if not a posix class */
13755 FALSE, /* don't allow multi-char folds */
13756 FALSE, /* don't silence non-portable warnings. */
13758 FALSE, /* Require return to be an ANYOF */
13762 FAIL2("panic: regclass returned NULL to handle_sets, "
13763 "flags=%#"UVxf"", (UV) *flagp);
13766 /* function call leaves parse pointing to the ']', except if we
13768 if (is_posix_class) {
13772 goto handle_operand;
13776 if (top_index >= 1) {
13777 goto join_operators;
13780 /* Only a single operand on the stack: are done */
13784 if (av_tindex(fence_stack) < 0) {
13786 vFAIL("Unexpected ')'");
13789 /* If at least two thing on the stack, treat this as an
13791 if (top_index - fence >= 1) {
13792 goto join_operators;
13795 /* Here only a single thing on the fenced stack, and there is a
13796 * fence. Get rid of it */
13797 fence_ptr = av_pop(fence_stack);
13799 fence = SvIV(fence_ptr) - 1;
13800 SvREFCNT_dec_NN(fence_ptr);
13807 /* Having gotten rid of the fence, we pop the operand at the
13808 * stack top and process it as a newly encountered operand */
13809 current = av_pop(stack);
13810 assert(IS_OPERAND(current));
13811 goto handle_operand;
13819 /* These binary operators should have a left operand already
13821 if ( top_index - fence < 0
13822 || top_index - fence == 1
13823 || ( ! (top_ptr = av_fetch(stack, top_index, FALSE)))
13824 || ! IS_OPERAND(*top_ptr))
13826 goto unexpected_binary;
13829 /* If only the one operand is on the part of the stack visible
13830 * to us, we just place this operator in the proper position */
13831 if (top_index - fence < 2) {
13833 /* Place the operator before the operand */
13835 SV* lhs = av_pop(stack);
13836 av_push(stack, newSVuv(curchar));
13837 av_push(stack, lhs);
13841 /* But if there is something else on the stack, we need to
13842 * process it before this new operator if and only if the
13843 * stacked operation has equal or higher precedence than the
13848 /* The operator on the stack is supposed to be below both its
13850 if ( ! (stacked_ptr = av_fetch(stack, top_index - 2, FALSE))
13851 || IS_OPERAND(*stacked_ptr))
13853 /* But if not, it's legal and indicates we are completely
13854 * done if and only if we're currently processing a ']',
13855 * which should be the final thing in the expression */
13856 if (curchar == ']') {
13862 vFAIL2("Unexpected binary operator '%c' with no "
13863 "preceding operand", curchar);
13865 stacked_operator = (char) SvUV(*stacked_ptr);
13867 if (regex_set_precedence(curchar)
13868 > regex_set_precedence(stacked_operator))
13870 /* Here, the new operator has higher precedence than the
13871 * stacked one. This means we need to add the new one to
13872 * the stack to await its rhs operand (and maybe more
13873 * stuff). We put it before the lhs operand, leaving
13874 * untouched the stacked operator and everything below it
13876 lhs = av_pop(stack);
13877 assert(IS_OPERAND(lhs));
13879 av_push(stack, newSVuv(curchar));
13880 av_push(stack, lhs);
13884 /* Here, the new operator has equal or lower precedence than
13885 * what's already there. This means the operation already
13886 * there should be performed now, before the new one. */
13888 rhs = av_pop(stack);
13889 if (! IS_OPERAND(rhs)) {
13891 /* This can happen when a ! is not followed by an operand,
13892 * like in /(?[\t &!])/ */
13896 lhs = av_pop(stack);
13897 assert(IS_OPERAND(lhs));
13899 switch (stacked_operator) {
13901 _invlist_intersection(lhs, rhs, &rhs);
13906 _invlist_union(lhs, rhs, &rhs);
13910 _invlist_subtract(lhs, rhs, &rhs);
13913 case '^': /* The union minus the intersection */
13919 _invlist_union(lhs, rhs, &u);
13920 _invlist_intersection(lhs, rhs, &i);
13921 /* _invlist_subtract will overwrite rhs
13922 without freeing what it already contains */
13924 _invlist_subtract(u, i, &rhs);
13925 SvREFCNT_dec_NN(i);
13926 SvREFCNT_dec_NN(u);
13927 SvREFCNT_dec_NN(element);
13933 /* Here, the higher precedence operation has been done, and the
13934 * result is in 'rhs'. We overwrite the stacked operator with
13935 * the result. Then we redo this code to either push the new
13936 * operator onto the stack or perform any higher precedence
13937 * stacked operation */
13938 only_to_avoid_leaks = av_pop(stack);
13939 SvREFCNT_dec(only_to_avoid_leaks);
13940 av_push(stack, rhs);
13943 case '!': /* Highest priority, right associative, so just push
13945 av_push(stack, newSVuv(curchar));
13949 RExC_parse += (UTF) ? UTF8SKIP(RExC_parse) : 1;
13950 vFAIL("Unexpected character");
13954 /* Here 'current' is the operand. If something is already on the
13955 * stack, we have to check if it is a !. */
13956 top_index = av_tindex(stack); /* Code above may have altered the
13957 * stack in the time since we
13958 * earlier set 'top_index'. */
13959 if (top_index - fence >= 0) {
13960 /* If the top entry on the stack is an operator, it had better
13961 * be a '!', otherwise the entry below the top operand should
13962 * be an operator */
13963 top_ptr = av_fetch(stack, top_index, FALSE);
13965 if (IS_OPERATOR(*top_ptr)) {
13967 /* The only permissible operator at the top of the stack is
13968 * '!', which is applied immediately to this operand. */
13969 curchar = (char) SvUV(*top_ptr);
13970 if (curchar != '!') {
13971 SvREFCNT_dec(current);
13972 vFAIL2("Unexpected binary operator '%c' with no "
13973 "preceding operand", curchar);
13976 _invlist_invert(current);
13978 only_to_avoid_leaks = av_pop(stack);
13979 SvREFCNT_dec(only_to_avoid_leaks);
13980 top_index = av_tindex(stack);
13982 /* And we redo with the inverted operand. This allows
13983 * handling multiple ! in a row */
13984 goto handle_operand;
13986 /* Single operand is ok only for the non-binary ')'
13988 else if ((top_index - fence == 0 && curchar != ')')
13989 || (top_index - fence > 0
13990 && (! (stacked_ptr = av_fetch(stack,
13993 || IS_OPERAND(*stacked_ptr))))
13995 SvREFCNT_dec(current);
13996 vFAIL("Operand with no preceding operator");
14000 /* Here there was nothing on the stack or the top element was
14001 * another operand. Just add this new one */
14002 av_push(stack, current);
14004 } /* End of switch on next parse token */
14006 RExC_parse += (UTF) ? UTF8SKIP(RExC_parse) : 1;
14007 } /* End of loop parsing through the construct */
14010 if (av_tindex(fence_stack) >= 0) {
14011 vFAIL("Unmatched (");
14014 if (av_tindex(stack) < 0 /* Was empty */
14015 || ((final = av_pop(stack)) == NULL)
14016 || ! IS_OPERAND(final)
14017 || SvTYPE(final) != SVt_INVLIST
14018 || av_tindex(stack) >= 0) /* More left on stack */
14021 SvREFCNT_dec(final);
14022 vFAIL("Incomplete expression within '(?[ ])'");
14025 /* Here, 'final' is the resultant inversion list from evaluating the
14026 * expression. Return it if so requested */
14027 if (return_invlist) {
14028 *return_invlist = final;
14032 /* Otherwise generate a resultant node, based on 'final'. regclass() is
14033 * expecting a string of ranges and individual code points */
14034 invlist_iterinit(final);
14035 result_string = newSVpvs("");
14036 while (invlist_iternext(final, &start, &end)) {
14037 if (start == end) {
14038 Perl_sv_catpvf(aTHX_ result_string, "\\x{%"UVXf"}", start);
14041 Perl_sv_catpvf(aTHX_ result_string, "\\x{%"UVXf"}-\\x{%"UVXf"}",
14046 /* About to generate an ANYOF (or similar) node from the inversion list we
14047 * have calculated */
14048 save_parse = RExC_parse;
14049 RExC_parse = SvPV(result_string, len);
14050 save_end = RExC_end;
14051 RExC_end = RExC_parse + len;
14053 /* We turn off folding around the call, as the class we have constructed
14054 * already has all folding taken into consideration, and we don't want
14055 * regclass() to add to that */
14056 RExC_flags &= ~RXf_PMf_FOLD;
14057 /* regclass() can only return RESTART_PASS1 and NEED_UTF8 if multi-char
14058 * folds are allowed. */
14059 node = regclass(pRExC_state, flagp,depth+1,
14060 FALSE, /* means parse the whole char class */
14061 FALSE, /* don't allow multi-char folds */
14062 TRUE, /* silence non-portable warnings. The above may very
14063 well have generated non-portable code points, but
14064 they're valid on this machine */
14065 FALSE, /* similarly, no need for strict */
14066 FALSE, /* Require return to be an ANYOF */
14070 FAIL2("panic: regclass returned NULL to handle_sets, flags=%#"UVxf,
14073 /* Fix up the node type if we are in locale. (We have pretended we are
14074 * under /u for the purposes of regclass(), as this construct will only
14075 * work under UTF-8 locales. But now we change the opcode to be ANYOFL (so
14076 * as to cause any warnings about bad locales to be output in regexec.c),
14077 * and add the flag that indicates to check if not in a UTF-8 locale. The
14078 * reason we above forbid optimization into something other than an ANYOF
14079 * node is simply to minimize the number of code changes in regexec.c.
14080 * Otherwise we would have to create new EXACTish node types and deal with
14081 * them. This decision could be revisited should this construct become
14084 * (One might think we could look at the resulting ANYOF node and suppress
14085 * the flag if everything is above 255, as those would be UTF-8 only,
14086 * but this isn't true, as the components that led to that result could
14087 * have been locale-affected, and just happen to cancel each other out
14088 * under UTF-8 locales.) */
14090 set_regex_charset(&RExC_flags, REGEX_LOCALE_CHARSET);
14092 assert(OP(node) == ANYOF);
14095 ANYOF_FLAGS(node) |= ANYOF_LOC_REQ_UTF8;
14099 RExC_flags |= RXf_PMf_FOLD;
14102 RExC_parse = save_parse + 1;
14103 RExC_end = save_end;
14104 SvREFCNT_dec_NN(final);
14105 SvREFCNT_dec_NN(result_string);
14107 nextchar(pRExC_state);
14108 Set_Node_Length(node, RExC_parse - oregcomp_parse + 1); /* MJD */
14115 S_add_above_Latin1_folds(pTHX_ RExC_state_t *pRExC_state, const U8 cp, SV** invlist)
14117 /* This hard-codes the Latin1/above-Latin1 folding rules, so that an
14118 * innocent-looking character class, like /[ks]/i won't have to go out to
14119 * disk to find the possible matches.
14121 * This should be called only for a Latin1-range code points, cp, which is
14122 * known to be involved in a simple fold with other code points above
14123 * Latin1. It would give false results if /aa has been specified.
14124 * Multi-char folds are outside the scope of this, and must be handled
14127 * XXX It would be better to generate these via regen, in case a new
14128 * version of the Unicode standard adds new mappings, though that is not
14129 * really likely, and may be caught by the default: case of the switch
14132 PERL_ARGS_ASSERT_ADD_ABOVE_LATIN1_FOLDS;
14134 assert(HAS_NONLATIN1_SIMPLE_FOLD_CLOSURE(cp));
14140 add_cp_to_invlist(*invlist, KELVIN_SIGN);
14144 *invlist = add_cp_to_invlist(*invlist, LATIN_SMALL_LETTER_LONG_S);
14147 *invlist = add_cp_to_invlist(*invlist, GREEK_CAPITAL_LETTER_MU);
14148 *invlist = add_cp_to_invlist(*invlist, GREEK_SMALL_LETTER_MU);
14150 case LATIN_CAPITAL_LETTER_A_WITH_RING_ABOVE:
14151 case LATIN_SMALL_LETTER_A_WITH_RING_ABOVE:
14152 *invlist = add_cp_to_invlist(*invlist, ANGSTROM_SIGN);
14154 case LATIN_SMALL_LETTER_Y_WITH_DIAERESIS:
14155 *invlist = add_cp_to_invlist(*invlist,
14156 LATIN_CAPITAL_LETTER_Y_WITH_DIAERESIS);
14159 #ifdef LATIN_CAPITAL_LETTER_SHARP_S /* not defined in early Unicode releases */
14161 case LATIN_SMALL_LETTER_SHARP_S:
14162 *invlist = add_cp_to_invlist(*invlist, LATIN_CAPITAL_LETTER_SHARP_S);
14167 #if UNICODE_MAJOR_VERSION < 3 \
14168 || (UNICODE_MAJOR_VERSION == 3 && UNICODE_DOT_VERSION == 0)
14170 /* In 3.0 and earlier, U+0130 folded simply to 'i'; and in 3.0.1 so did
14175 add_cp_to_invlist(*invlist, LATIN_CAPITAL_LETTER_I_WITH_DOT_ABOVE);
14176 # if UNICODE_DOT_DOT_VERSION == 1
14177 *invlist = add_cp_to_invlist(*invlist, LATIN_SMALL_LETTER_DOTLESS_I);
14183 /* Use deprecated warning to increase the chances of this being
14186 ckWARN2reg_d(RExC_parse, "Perl folding rules are not up-to-date for 0x%02X; please use the perlbug utility to report;", cp);
14193 S_add_multi_match(pTHX_ AV* multi_char_matches, SV* multi_string, const STRLEN cp_count)
14195 /* This adds the string scalar <multi_string> to the array
14196 * <multi_char_matches>. <multi_string> is known to have exactly
14197 * <cp_count> code points in it. This is used when constructing a
14198 * bracketed character class and we find something that needs to match more
14199 * than a single character.
14201 * <multi_char_matches> is actually an array of arrays. Each top-level
14202 * element is an array that contains all the strings known so far that are
14203 * the same length. And that length (in number of code points) is the same
14204 * as the index of the top-level array. Hence, the [2] element is an
14205 * array, each element thereof is a string containing TWO code points;
14206 * while element [3] is for strings of THREE characters, and so on. Since
14207 * this is for multi-char strings there can never be a [0] nor [1] element.
14209 * When we rewrite the character class below, we will do so such that the
14210 * longest strings are written first, so that it prefers the longest
14211 * matching strings first. This is done even if it turns out that any
14212 * quantifier is non-greedy, out of this programmer's (khw) laziness. Tom
14213 * Christiansen has agreed that this is ok. This makes the test for the
14214 * ligature 'ffi' come before the test for 'ff', for example */
14217 AV** this_array_ptr;
14219 PERL_ARGS_ASSERT_ADD_MULTI_MATCH;
14221 if (! multi_char_matches) {
14222 multi_char_matches = newAV();
14225 if (av_exists(multi_char_matches, cp_count)) {
14226 this_array_ptr = (AV**) av_fetch(multi_char_matches, cp_count, FALSE);
14227 this_array = *this_array_ptr;
14230 this_array = newAV();
14231 av_store(multi_char_matches, cp_count,
14234 av_push(this_array, multi_string);
14236 return multi_char_matches;
14239 /* The names of properties whose definitions are not known at compile time are
14240 * stored in this SV, after a constant heading. So if the length has been
14241 * changed since initialization, then there is a run-time definition. */
14242 #define HAS_NONLOCALE_RUNTIME_PROPERTY_DEFINITION \
14243 (SvCUR(listsv) != initial_listsv_len)
14245 /* There is a restricted set of white space characters that are legal when
14246 * ignoring white space in a bracketed character class. This generates the
14247 * code to skip them.
14249 * There is a line below that uses the same white space criteria but is outside
14250 * this macro. Both here and there must use the same definition */
14251 #define SKIP_BRACKETED_WHITE_SPACE(do_skip, p) \
14254 while ( p < RExC_end \
14255 && isBLANK_A(UCHARAT(p))) \
14263 S_regclass(pTHX_ RExC_state_t *pRExC_state, I32 *flagp, U32 depth,
14264 const bool stop_at_1, /* Just parse the next thing, don't
14265 look for a full character class */
14266 bool allow_multi_folds,
14267 const bool silence_non_portable, /* Don't output warnings
14271 bool optimizable, /* ? Allow a non-ANYOF return
14273 SV** ret_invlist /* Return an inversion list, not a node */
14276 /* parse a bracketed class specification. Most of these will produce an
14277 * ANYOF node; but something like [a] will produce an EXACT node; [aA], an
14278 * EXACTFish node; [[:ascii:]], a POSIXA node; etc. It is more complex
14279 * under /i with multi-character folds: it will be rewritten following the
14280 * paradigm of this example, where the <multi-fold>s are characters which
14281 * fold to multiple character sequences:
14282 * /[abc\x{multi-fold1}def\x{multi-fold2}ghi]/i
14283 * gets effectively rewritten as:
14284 * /(?:\x{multi-fold1}|\x{multi-fold2}|[abcdefghi]/i
14285 * reg() gets called (recursively) on the rewritten version, and this
14286 * function will return what it constructs. (Actually the <multi-fold>s
14287 * aren't physically removed from the [abcdefghi], it's just that they are
14288 * ignored in the recursion by means of a flag:
14289 * <RExC_in_multi_char_class>.)
14291 * ANYOF nodes contain a bit map for the first NUM_ANYOF_CODE_POINTS
14292 * characters, with the corresponding bit set if that character is in the
14293 * list. For characters above this, a range list or swash is used. There
14294 * are extra bits for \w, etc. in locale ANYOFs, as what these match is not
14295 * determinable at compile time
14297 * Returns NULL, setting *flagp to RESTART_PASS1 if the sizing scan needs
14298 * to be restarted, or'd with NEED_UTF8 if the pattern needs to be upgraded
14299 * to UTF-8. This can only happen if ret_invlist is non-NULL.
14302 UV prevvalue = OOB_UNICODE, save_prevvalue = OOB_UNICODE;
14304 UV value = OOB_UNICODE, save_value = OOB_UNICODE;
14307 IV namedclass = OOB_NAMEDCLASS;
14308 char *rangebegin = NULL;
14309 bool need_class = 0;
14311 STRLEN initial_listsv_len = 0; /* Kind of a kludge to see if it is more
14312 than just initialized. */
14313 SV* properties = NULL; /* Code points that match \p{} \P{} */
14314 SV* posixes = NULL; /* Code points that match classes like [:word:],
14315 extended beyond the Latin1 range. These have to
14316 be kept separate from other code points for much
14317 of this function because their handling is
14318 different under /i, and for most classes under
14320 SV* nposixes = NULL; /* Similarly for [:^word:]. These are kept
14321 separate for a while from the non-complemented
14322 versions because of complications with /d
14324 SV* simple_posixes = NULL; /* But under some conditions, the classes can be
14325 treated more simply than the general case,
14326 leading to less compilation and execution
14328 UV element_count = 0; /* Number of distinct elements in the class.
14329 Optimizations may be possible if this is tiny */
14330 AV * multi_char_matches = NULL; /* Code points that fold to more than one
14331 character; used under /i */
14333 char * stop_ptr = RExC_end; /* where to stop parsing */
14334 const bool skip_white = cBOOL(ret_invlist); /* ignore unescaped white
14337 /* Unicode properties are stored in a swash; this holds the current one
14338 * being parsed. If this swash is the only above-latin1 component of the
14339 * character class, an optimization is to pass it directly on to the
14340 * execution engine. Otherwise, it is set to NULL to indicate that there
14341 * are other things in the class that have to be dealt with at execution
14343 SV* swash = NULL; /* Code points that match \p{} \P{} */
14345 /* Set if a component of this character class is user-defined; just passed
14346 * on to the engine */
14347 bool has_user_defined_property = FALSE;
14349 /* inversion list of code points this node matches only when the target
14350 * string is in UTF-8. (Because is under /d) */
14351 SV* depends_list = NULL;
14353 /* Inversion list of code points this node matches regardless of things
14354 * like locale, folding, utf8ness of the target string */
14355 SV* cp_list = NULL;
14357 /* Like cp_list, but code points on this list need to be checked for things
14358 * that fold to/from them under /i */
14359 SV* cp_foldable_list = NULL;
14361 /* Like cp_list, but code points on this list are valid only when the
14362 * runtime locale is UTF-8 */
14363 SV* only_utf8_locale_list = NULL;
14365 /* In a range, if one of the endpoints is non-character-set portable,
14366 * meaning that it hard-codes a code point that may mean a different
14367 * charactger in ASCII vs. EBCDIC, as opposed to, say, a literal 'A' or a
14368 * mnemonic '\t' which each mean the same character no matter which
14369 * character set the platform is on. */
14370 unsigned int non_portable_endpoint = 0;
14372 /* Is the range unicode? which means on a platform that isn't 1-1 native
14373 * to Unicode (i.e. non-ASCII), each code point in it should be considered
14374 * to be a Unicode value. */
14375 bool unicode_range = FALSE;
14376 bool invert = FALSE; /* Is this class to be complemented */
14378 bool warn_super = ALWAYS_WARN_SUPER;
14380 regnode * const orig_emit = RExC_emit; /* Save the original RExC_emit in
14381 case we need to change the emitted regop to an EXACT. */
14382 const char * orig_parse = RExC_parse;
14383 const SSize_t orig_size = RExC_size;
14384 bool posixl_matches_all = FALSE; /* Does /l class have both e.g. \W,\w ? */
14385 GET_RE_DEBUG_FLAGS_DECL;
14387 PERL_ARGS_ASSERT_REGCLASS;
14389 PERL_UNUSED_ARG(depth);
14392 DEBUG_PARSE("clas");
14394 #if UNICODE_MAJOR_VERSION < 3 /* no multifolds in early Unicode */ \
14395 || (UNICODE_MAJOR_VERSION == 3 && UNICODE_DOT_VERSION == 0 \
14396 && UNICODE_DOT_DOT_VERSION == 0)
14397 allow_multi_folds = FALSE;
14400 /* Assume we are going to generate an ANYOF node. */
14401 ret = reganode(pRExC_state,
14404 : (DEPENDS_SEMANTICS)
14410 RExC_size += ANYOF_SKIP;
14411 listsv = &PL_sv_undef; /* For code scanners: listsv always non-NULL. */
14414 ANYOF_FLAGS(ret) = 0;
14416 RExC_emit += ANYOF_SKIP;
14417 listsv = newSVpvs_flags("# comment\n", SVs_TEMP);
14418 initial_listsv_len = SvCUR(listsv);
14419 SvTEMP_off(listsv); /* Grr, TEMPs and mortals are conflated. */
14422 SKIP_BRACKETED_WHITE_SPACE(skip_white, RExC_parse);
14424 if (UCHARAT(RExC_parse) == '^') { /* Complement of range. */
14427 allow_multi_folds = FALSE;
14429 SKIP_BRACKETED_WHITE_SPACE(skip_white, RExC_parse);
14432 /* Check that they didn't say [:posix:] instead of [[:posix:]] */
14433 if (!SIZE_ONLY && RExC_parse < RExC_end && POSIXCC(UCHARAT(RExC_parse))) {
14434 const char *s = RExC_parse;
14435 const char c = *s++;
14440 while (isWORDCHAR(*s))
14442 if (*s && c == *s && s[1] == ']') {
14443 SAVEFREESV(RExC_rx_sv);
14445 "POSIX syntax [%c %c] belongs inside character classes",
14447 (void)ReREFCNT_inc(RExC_rx_sv);
14451 /* If the caller wants us to just parse a single element, accomplish this
14452 * by faking the loop ending condition */
14453 if (stop_at_1 && RExC_end > RExC_parse) {
14454 stop_ptr = RExC_parse + 1;
14457 /* allow 1st char to be ']' (allowing it to be '-' is dealt with later) */
14458 if (UCHARAT(RExC_parse) == ']')
14459 goto charclassloop;
14462 if (RExC_parse >= stop_ptr) {
14466 SKIP_BRACKETED_WHITE_SPACE(skip_white, RExC_parse);
14468 if (UCHARAT(RExC_parse) == ']') {
14474 namedclass = OOB_NAMEDCLASS; /* initialize as illegal */
14475 save_value = value;
14476 save_prevvalue = prevvalue;
14479 rangebegin = RExC_parse;
14481 non_portable_endpoint = 0;
14484 value = utf8n_to_uvchr((U8*)RExC_parse,
14485 RExC_end - RExC_parse,
14486 &numlen, UTF8_ALLOW_DEFAULT);
14487 RExC_parse += numlen;
14490 value = UCHARAT(RExC_parse++);
14493 && RExC_parse < RExC_end
14494 && POSIXCC(UCHARAT(RExC_parse)))
14496 namedclass = regpposixcc(pRExC_state, value, strict);
14498 else if (value == '\\') {
14499 /* Is a backslash; get the code point of the char after it */
14500 if (UTF && ! UTF8_IS_INVARIANT(UCHARAT(RExC_parse))) {
14501 value = utf8n_to_uvchr((U8*)RExC_parse,
14502 RExC_end - RExC_parse,
14503 &numlen, UTF8_ALLOW_DEFAULT);
14504 RExC_parse += numlen;
14507 value = UCHARAT(RExC_parse++);
14509 /* Some compilers cannot handle switching on 64-bit integer
14510 * values, therefore value cannot be an UV. Yes, this will
14511 * be a problem later if we want switch on Unicode.
14512 * A similar issue a little bit later when switching on
14513 * namedclass. --jhi */
14515 /* If the \ is escaping white space when white space is being
14516 * skipped, it means that that white space is wanted literally, and
14517 * is already in 'value'. Otherwise, need to translate the escape
14518 * into what it signifies. */
14519 if (! skip_white || ! isBLANK_A(value)) switch ((I32)value) {
14521 case 'w': namedclass = ANYOF_WORDCHAR; break;
14522 case 'W': namedclass = ANYOF_NWORDCHAR; break;
14523 case 's': namedclass = ANYOF_SPACE; break;
14524 case 'S': namedclass = ANYOF_NSPACE; break;
14525 case 'd': namedclass = ANYOF_DIGIT; break;
14526 case 'D': namedclass = ANYOF_NDIGIT; break;
14527 case 'v': namedclass = ANYOF_VERTWS; break;
14528 case 'V': namedclass = ANYOF_NVERTWS; break;
14529 case 'h': namedclass = ANYOF_HORIZWS; break;
14530 case 'H': namedclass = ANYOF_NHORIZWS; break;
14531 case 'N': /* Handle \N{NAME} in class */
14533 const char * const backslash_N_beg = RExC_parse - 2;
14536 if (! grok_bslash_N(pRExC_state,
14537 NULL, /* No regnode */
14538 &value, /* Yes single value */
14539 &cp_count, /* Multiple code pt count */
14544 if (*flagp & NEED_UTF8)
14545 FAIL("panic: grok_bslash_N set NEED_UTF8");
14546 if (*flagp & RESTART_PASS1)
14549 if (cp_count < 0) {
14550 vFAIL("\\N in a character class must be a named character: \\N{...}");
14552 else if (cp_count == 0) {
14554 RExC_parse++; /* Position after the "}" */
14555 vFAIL("Zero length \\N{}");
14558 ckWARNreg(RExC_parse,
14559 "Ignoring zero length \\N{} in character class");
14562 else { /* cp_count > 1 */
14563 if (! RExC_in_multi_char_class) {
14564 if (invert || range || *RExC_parse == '-') {
14567 vFAIL("\\N{} in inverted character class or as a range end-point is restricted to one character");
14570 ckWARNreg(RExC_parse, "Using just the first character returned by \\N{} in character class");
14572 break; /* <value> contains the first code
14573 point. Drop out of the switch to
14577 SV * multi_char_N = newSVpvn(backslash_N_beg,
14578 RExC_parse - backslash_N_beg);
14580 = add_multi_match(multi_char_matches,
14585 } /* End of cp_count != 1 */
14587 /* This element should not be processed further in this
14590 value = save_value;
14591 prevvalue = save_prevvalue;
14592 continue; /* Back to top of loop to get next char */
14595 /* Here, is a single code point, and <value> contains it */
14596 unicode_range = TRUE; /* \N{} are Unicode */
14604 /* We will handle any undefined properties ourselves */
14605 U8 swash_init_flags = _CORE_SWASH_INIT_RETURN_IF_UNDEF
14606 /* And we actually would prefer to get
14607 * the straight inversion list of the
14608 * swash, since we will be accessing it
14609 * anyway, to save a little time */
14610 |_CORE_SWASH_INIT_ACCEPT_INVLIST;
14612 if (RExC_parse >= RExC_end)
14613 vFAIL2("Empty \\%c{}", (U8)value);
14614 if (*RExC_parse == '{') {
14615 const U8 c = (U8)value;
14616 e = strchr(RExC_parse, '}');
14619 vFAIL2("Missing right brace on \\%c{}", c);
14623 while (isSPACE(*RExC_parse)) {
14627 if (UCHARAT(RExC_parse) == '^') {
14629 /* toggle. (The rhs xor gets the single bit that
14630 * differs between P and p; the other xor inverts just
14632 value ^= 'P' ^ 'p';
14635 while (isSPACE(*RExC_parse)) {
14640 if (e == RExC_parse)
14641 vFAIL2("Empty \\%c{}", c);
14643 n = e - RExC_parse;
14644 while (isSPACE(*(RExC_parse + n - 1)))
14646 } /* The \p isn't immediately followed by a '{' */
14647 else if (! isALPHA(*RExC_parse)) {
14648 RExC_parse += (UTF) ? UTF8SKIP(RExC_parse) : 1;
14649 vFAIL2("Character following \\%c must be '{' or a "
14650 "single-character Unicode property name",
14660 char* base_name; /* name after any packages are stripped */
14661 const char * const colon_colon = "::";
14663 /* Try to get the definition of the property into
14664 * <invlist>. If /i is in effect, the effective property
14665 * will have its name be <__NAME_i>. The design is
14666 * discussed in commit
14667 * 2f833f5208e26b208886e51e09e2c072b5eabb46 */
14668 name = savepv(Perl_form(aTHX_
14670 (FOLD) ? "__" : "",
14676 /* Look up the property name, and get its swash and
14677 * inversion list, if the property is found */
14678 if (swash) { /* Return any left-overs */
14679 SvREFCNT_dec_NN(swash);
14681 swash = _core_swash_init("utf8", name, &PL_sv_undef,
14684 NULL, /* No inversion list */
14687 if (! swash || ! (invlist = _get_swash_invlist(swash))) {
14688 HV* curpkg = (IN_PERL_COMPILETIME)
14690 : CopSTASH(PL_curcop);
14694 if (swash) { /* Got a swash but no inversion list.
14695 Something is likely wrong that will
14696 be sorted-out later */
14697 SvREFCNT_dec_NN(swash);
14701 /* Here didn't find it. It could be a an error (like a
14702 * typo) in specifying a Unicode property, or it could
14703 * be a user-defined property that will be available at
14704 * run-time. The names of these must begin with 'In'
14705 * or 'Is' (after any packages are stripped off). So
14706 * if not one of those, or if we accept only
14707 * compile-time properties, is an error; otherwise add
14708 * it to the list for run-time look up. */
14709 if ((base_name = rninstr(name, name + n,
14710 colon_colon, colon_colon + 2)))
14711 { /* Has ::. We know this must be a user-defined
14714 final_n -= base_name - name;
14723 || base_name[0] != 'I'
14724 || (base_name[1] != 's' && base_name[1] != 'n')
14727 const char * const msg
14729 ? "Illegal user-defined property name"
14730 : "Can't find Unicode property definition";
14731 RExC_parse = e + 1;
14733 /* diag_listed_as: Can't find Unicode property definition "%s" */
14734 vFAIL3utf8f("%s \"%"UTF8f"\"",
14735 msg, UTF8fARG(UTF, n, name));
14738 /* If the property name doesn't already have a package
14739 * name, add the current one to it so that it can be
14740 * referred to outside it. [perl #121777] */
14741 if (! has_pkg && curpkg) {
14742 char* pkgname = HvNAME(curpkg);
14743 if (strNE(pkgname, "main")) {
14744 char* full_name = Perl_form(aTHX_
14748 n = strlen(full_name);
14750 name = savepvn(full_name, n);
14753 Perl_sv_catpvf(aTHX_ listsv, "%cutf8::%"UTF8f"\n",
14754 (value == 'p' ? '+' : '!'),
14755 UTF8fARG(UTF, n, name));
14756 has_user_defined_property = TRUE;
14757 optimizable = FALSE; /* Will have to leave this an
14760 /* We don't know yet, so have to assume that the
14761 * property could match something in the upper Latin1
14762 * range, hence something that isn't utf8. Note that
14763 * this would cause things in <depends_list> to match
14764 * inappropriately, except that any \p{}, including
14765 * this one forces Unicode semantics, which means there
14766 * is no <depends_list> */
14768 |= ANYOF_HAS_NONBITMAP_NON_UTF8_MATCHES;
14772 /* Here, did get the swash and its inversion list. If
14773 * the swash is from a user-defined property, then this
14774 * whole character class should be regarded as such */
14775 if (swash_init_flags
14776 & _CORE_SWASH_INIT_USER_DEFINED_PROPERTY)
14778 has_user_defined_property = TRUE;
14781 /* We warn on matching an above-Unicode code point
14782 * if the match would return true, except don't
14783 * warn for \p{All}, which has exactly one element
14785 (_invlist_contains_cp(invlist, 0x110000)
14786 && (! (_invlist_len(invlist) == 1
14787 && *invlist_array(invlist) == 0)))
14793 /* Invert if asking for the complement */
14794 if (value == 'P') {
14795 _invlist_union_complement_2nd(properties,
14799 /* The swash can't be used as-is, because we've
14800 * inverted things; delay removing it to here after
14801 * have copied its invlist above */
14802 SvREFCNT_dec_NN(swash);
14806 _invlist_union(properties, invlist, &properties);
14811 RExC_parse = e + 1;
14812 namedclass = ANYOF_UNIPROP; /* no official name, but it's
14815 /* \p means they want Unicode semantics */
14816 REQUIRE_UNI_RULES(flagp, NULL);
14819 case 'n': value = '\n'; break;
14820 case 'r': value = '\r'; break;
14821 case 't': value = '\t'; break;
14822 case 'f': value = '\f'; break;
14823 case 'b': value = '\b'; break;
14824 case 'e': value = ESC_NATIVE; break;
14825 case 'a': value = '\a'; break;
14827 RExC_parse--; /* function expects to be pointed at the 'o' */
14829 const char* error_msg;
14830 bool valid = grok_bslash_o(&RExC_parse,
14833 PASS2, /* warnings only in
14836 silence_non_portable,
14842 non_portable_endpoint++;
14843 if (IN_ENCODING && value < 0x100) {
14844 goto recode_encoding;
14848 RExC_parse--; /* function expects to be pointed at the 'x' */
14850 const char* error_msg;
14851 bool valid = grok_bslash_x(&RExC_parse,
14854 PASS2, /* Output warnings */
14856 silence_non_portable,
14862 non_portable_endpoint++;
14863 if (IN_ENCODING && value < 0x100)
14864 goto recode_encoding;
14867 value = grok_bslash_c(*RExC_parse++, PASS2);
14868 non_portable_endpoint++;
14870 case '0': case '1': case '2': case '3': case '4':
14871 case '5': case '6': case '7':
14873 /* Take 1-3 octal digits */
14874 I32 flags = PERL_SCAN_SILENT_ILLDIGIT;
14875 numlen = (strict) ? 4 : 3;
14876 value = grok_oct(--RExC_parse, &numlen, &flags, NULL);
14877 RExC_parse += numlen;
14880 RExC_parse += (UTF) ? UTF8SKIP(RExC_parse) : 1;
14881 vFAIL("Need exactly 3 octal digits");
14883 else if (! SIZE_ONLY /* like \08, \178 */
14885 && RExC_parse < RExC_end
14886 && isDIGIT(*RExC_parse)
14887 && ckWARN(WARN_REGEXP))
14889 SAVEFREESV(RExC_rx_sv);
14890 reg_warn_non_literal_string(
14892 form_short_octal_warning(RExC_parse, numlen));
14893 (void)ReREFCNT_inc(RExC_rx_sv);
14896 non_portable_endpoint++;
14897 if (IN_ENCODING && value < 0x100)
14898 goto recode_encoding;
14902 if (! RExC_override_recoding) {
14903 SV* enc = _get_encoding();
14904 value = reg_recode((const char)(U8)value, &enc);
14907 vFAIL("Invalid escape in the specified encoding");
14910 ckWARNreg(RExC_parse,
14911 "Invalid escape in the specified encoding");
14917 /* Allow \_ to not give an error */
14918 if (!SIZE_ONLY && isWORDCHAR(value) && value != '_') {
14920 vFAIL2("Unrecognized escape \\%c in character class",
14924 SAVEFREESV(RExC_rx_sv);
14925 ckWARN2reg(RExC_parse,
14926 "Unrecognized escape \\%c in character class passed through",
14928 (void)ReREFCNT_inc(RExC_rx_sv);
14932 } /* End of switch on char following backslash */
14933 } /* end of handling backslash escape sequences */
14935 /* Here, we have the current token in 'value' */
14937 if (namedclass > OOB_NAMEDCLASS) { /* this is a named class \blah */
14940 /* a bad range like a-\d, a-[:digit:]. The '-' is taken as a
14941 * literal, as is the character that began the false range, i.e.
14942 * the 'a' in the examples */
14945 const int w = (RExC_parse >= rangebegin)
14946 ? RExC_parse - rangebegin
14950 "False [] range \"%"UTF8f"\"",
14951 UTF8fARG(UTF, w, rangebegin));
14954 SAVEFREESV(RExC_rx_sv); /* in case of fatal warnings */
14955 ckWARN2reg(RExC_parse,
14956 "False [] range \"%"UTF8f"\"",
14957 UTF8fARG(UTF, w, rangebegin));
14958 (void)ReREFCNT_inc(RExC_rx_sv);
14959 cp_list = add_cp_to_invlist(cp_list, '-');
14960 cp_foldable_list = add_cp_to_invlist(cp_foldable_list,
14965 range = 0; /* this was not a true range */
14966 element_count += 2; /* So counts for three values */
14969 classnum = namedclass_to_classnum(namedclass);
14971 if (LOC && namedclass < ANYOF_POSIXL_MAX
14972 #ifndef HAS_ISASCII
14973 && classnum != _CC_ASCII
14976 /* What the Posix classes (like \w, [:space:]) match in locale
14977 * isn't knowable under locale until actual match time. Room
14978 * must be reserved (one time per outer bracketed class) to
14979 * store such classes. The space will contain a bit for each
14980 * named class that is to be matched against. This isn't
14981 * needed for \p{} and pseudo-classes, as they are not affected
14982 * by locale, and hence are dealt with separately */
14983 if (! need_class) {
14986 RExC_size += ANYOF_POSIXL_SKIP - ANYOF_SKIP;
14989 RExC_emit += ANYOF_POSIXL_SKIP - ANYOF_SKIP;
14991 ANYOF_FLAGS(ret) |= ANYOF_MATCHES_POSIXL;
14992 ANYOF_POSIXL_ZERO(ret);
14994 /* We can't change this into some other type of node
14995 * (unless this is the only element, in which case there
14996 * are nodes that mean exactly this) as has runtime
14998 optimizable = FALSE;
15001 /* Coverity thinks it is possible for this to be negative; both
15002 * jhi and khw think it's not, but be safer */
15003 assert(! (ANYOF_FLAGS(ret) & ANYOF_MATCHES_POSIXL)
15004 || (namedclass + ((namedclass % 2) ? -1 : 1)) >= 0);
15006 /* See if it already matches the complement of this POSIX
15008 if ((ANYOF_FLAGS(ret) & ANYOF_MATCHES_POSIXL)
15009 && ANYOF_POSIXL_TEST(ret, namedclass + ((namedclass % 2)
15013 posixl_matches_all = TRUE;
15014 break; /* No need to continue. Since it matches both
15015 e.g., \w and \W, it matches everything, and the
15016 bracketed class can be optimized into qr/./s */
15019 /* Add this class to those that should be checked at runtime */
15020 ANYOF_POSIXL_SET(ret, namedclass);
15022 /* The above-Latin1 characters are not subject to locale rules.
15023 * Just add them, in the second pass, to the
15024 * unconditionally-matched list */
15026 SV* scratch_list = NULL;
15028 /* Get the list of the above-Latin1 code points this
15030 _invlist_intersection_maybe_complement_2nd(PL_AboveLatin1,
15031 PL_XPosix_ptrs[classnum],
15033 /* Odd numbers are complements, like
15034 * NDIGIT, NASCII, ... */
15035 namedclass % 2 != 0,
15037 /* Checking if 'cp_list' is NULL first saves an extra
15038 * clone. Its reference count will be decremented at the
15039 * next union, etc, or if this is the only instance, at the
15040 * end of the routine */
15042 cp_list = scratch_list;
15045 _invlist_union(cp_list, scratch_list, &cp_list);
15046 SvREFCNT_dec_NN(scratch_list);
15048 continue; /* Go get next character */
15051 else if (! SIZE_ONLY) {
15053 /* Here, not in pass1 (in that pass we skip calculating the
15054 * contents of this class), and is /l, or is a POSIX class for
15055 * which /l doesn't matter (or is a Unicode property, which is
15056 * skipped here). */
15057 if (namedclass >= ANYOF_POSIXL_MAX) { /* If a special class */
15058 if (namedclass != ANYOF_UNIPROP) { /* UNIPROP = \p and \P */
15060 /* Here, should be \h, \H, \v, or \V. None of /d, /i
15061 * nor /l make a difference in what these match,
15062 * therefore we just add what they match to cp_list. */
15063 if (classnum != _CC_VERTSPACE) {
15064 assert( namedclass == ANYOF_HORIZWS
15065 || namedclass == ANYOF_NHORIZWS);
15067 /* It turns out that \h is just a synonym for
15069 classnum = _CC_BLANK;
15072 _invlist_union_maybe_complement_2nd(
15074 PL_XPosix_ptrs[classnum],
15075 namedclass % 2 != 0, /* Complement if odd
15076 (NHORIZWS, NVERTWS)
15081 else if (UNI_SEMANTICS
15082 || classnum == _CC_ASCII
15083 || (DEPENDS_SEMANTICS && (classnum == _CC_DIGIT
15084 || classnum == _CC_XDIGIT)))
15086 /* We usually have to worry about /d and /a affecting what
15087 * POSIX classes match, with special code needed for /d
15088 * because we won't know until runtime what all matches.
15089 * But there is no extra work needed under /u, and
15090 * [:ascii:] is unaffected by /a and /d; and :digit: and
15091 * :xdigit: don't have runtime differences under /d. So we
15092 * can special case these, and avoid some extra work below,
15093 * and at runtime. */
15094 _invlist_union_maybe_complement_2nd(
15096 PL_XPosix_ptrs[classnum],
15097 namedclass % 2 != 0,
15100 else { /* Garden variety class. If is NUPPER, NALPHA, ...
15101 complement and use nposixes */
15102 SV** posixes_ptr = namedclass % 2 == 0
15105 _invlist_union_maybe_complement_2nd(
15107 PL_XPosix_ptrs[classnum],
15108 namedclass % 2 != 0,
15112 } /* end of namedclass \blah */
15114 SKIP_BRACKETED_WHITE_SPACE(skip_white, RExC_parse);
15116 /* If 'range' is set, 'value' is the ending of a range--check its
15117 * validity. (If value isn't a single code point in the case of a
15118 * range, we should have figured that out above in the code that
15119 * catches false ranges). Later, we will handle each individual code
15120 * point in the range. If 'range' isn't set, this could be the
15121 * beginning of a range, so check for that by looking ahead to see if
15122 * the next real character to be processed is the range indicator--the
15127 /* For unicode ranges, we have to test that the Unicode as opposed
15128 * to the native values are not decreasing. (Above 255, there is
15129 * no difference between native and Unicode) */
15130 if (unicode_range && prevvalue < 255 && value < 255) {
15131 if (NATIVE_TO_LATIN1(prevvalue) > NATIVE_TO_LATIN1(value)) {
15132 goto backwards_range;
15137 if (prevvalue > value) /* b-a */ {
15142 w = RExC_parse - rangebegin;
15144 "Invalid [] range \"%"UTF8f"\"",
15145 UTF8fARG(UTF, w, rangebegin));
15146 NOT_REACHED; /* NOTREACHED */
15150 prevvalue = value; /* save the beginning of the potential range */
15151 if (! stop_at_1 /* Can't be a range if parsing just one thing */
15152 && *RExC_parse == '-')
15154 char* next_char_ptr = RExC_parse + 1;
15156 /* Get the next real char after the '-' */
15157 SKIP_BRACKETED_WHITE_SPACE(skip_white, next_char_ptr);
15159 /* If the '-' is at the end of the class (just before the ']',
15160 * it is a literal minus; otherwise it is a range */
15161 if (next_char_ptr < RExC_end && *next_char_ptr != ']') {
15162 RExC_parse = next_char_ptr;
15164 /* a bad range like \w-, [:word:]- ? */
15165 if (namedclass > OOB_NAMEDCLASS) {
15166 if (strict || (PASS2 && ckWARN(WARN_REGEXP))) {
15167 const int w = RExC_parse >= rangebegin
15168 ? RExC_parse - rangebegin
15171 vFAIL4("False [] range \"%*.*s\"",
15176 "False [] range \"%*.*s\"",
15181 cp_list = add_cp_to_invlist(cp_list, '-');
15185 range = 1; /* yeah, it's a range! */
15186 continue; /* but do it the next time */
15191 if (namedclass > OOB_NAMEDCLASS) {
15195 /* Here, we have a single value this time through the loop, and
15196 * <prevvalue> is the beginning of the range, if any; or <value> if
15199 /* non-Latin1 code point implies unicode semantics. Must be set in
15200 * pass1 so is there for the whole of pass 2 */
15202 REQUIRE_UNI_RULES(flagp, NULL);
15205 /* Ready to process either the single value, or the completed range.
15206 * For single-valued non-inverted ranges, we consider the possibility
15207 * of multi-char folds. (We made a conscious decision to not do this
15208 * for the other cases because it can often lead to non-intuitive
15209 * results. For example, you have the peculiar case that:
15210 * "s s" =~ /^[^\xDF]+$/i => Y
15211 * "ss" =~ /^[^\xDF]+$/i => N
15213 * See [perl #89750] */
15214 if (FOLD && allow_multi_folds && value == prevvalue) {
15215 if (value == LATIN_SMALL_LETTER_SHARP_S
15216 || (value > 255 && _invlist_contains_cp(PL_HasMultiCharFold,
15219 /* Here <value> is indeed a multi-char fold. Get what it is */
15221 U8 foldbuf[UTF8_MAXBYTES_CASE];
15224 UV folded = _to_uni_fold_flags(
15228 FOLD_FLAGS_FULL | (ASCII_FOLD_RESTRICTED
15229 ? FOLD_FLAGS_NOMIX_ASCII
15233 /* Here, <folded> should be the first character of the
15234 * multi-char fold of <value>, with <foldbuf> containing the
15235 * whole thing. But, if this fold is not allowed (because of
15236 * the flags), <fold> will be the same as <value>, and should
15237 * be processed like any other character, so skip the special
15239 if (folded != value) {
15241 /* Skip if we are recursed, currently parsing the class
15242 * again. Otherwise add this character to the list of
15243 * multi-char folds. */
15244 if (! RExC_in_multi_char_class) {
15245 STRLEN cp_count = utf8_length(foldbuf,
15246 foldbuf + foldlen);
15247 SV* multi_fold = sv_2mortal(newSVpvs(""));
15249 Perl_sv_catpvf(aTHX_ multi_fold, "\\x{%"UVXf"}", value);
15252 = add_multi_match(multi_char_matches,
15258 /* This element should not be processed further in this
15261 value = save_value;
15262 prevvalue = save_prevvalue;
15268 if (strict && PASS2 && ckWARN(WARN_REGEXP)) {
15271 /* If the range starts above 255, everything is portable and
15272 * likely to be so for any forseeable character set, so don't
15274 if (unicode_range && non_portable_endpoint && prevvalue < 256) {
15275 vWARN(RExC_parse, "Both or neither range ends should be Unicode");
15277 else if (prevvalue != value) {
15279 /* Under strict, ranges that stop and/or end in an ASCII
15280 * printable should have each end point be a portable value
15281 * for it (preferably like 'A', but we don't warn if it is
15282 * a (portable) Unicode name or code point), and the range
15283 * must be be all digits or all letters of the same case.
15284 * Otherwise, the range is non-portable and unclear as to
15285 * what it contains */
15286 if ((isPRINT_A(prevvalue) || isPRINT_A(value))
15287 && (non_portable_endpoint
15288 || ! ((isDIGIT_A(prevvalue) && isDIGIT_A(value))
15289 || (isLOWER_A(prevvalue) && isLOWER_A(value))
15290 || (isUPPER_A(prevvalue) && isUPPER_A(value)))))
15292 vWARN(RExC_parse, "Ranges of ASCII printables should be some subset of \"0-9\", \"A-Z\", or \"a-z\"");
15294 else if (prevvalue >= 0x660) { /* ARABIC_INDIC_DIGIT_ZERO */
15296 /* But the nature of Unicode and languages mean we
15297 * can't do the same checks for above-ASCII ranges,
15298 * except in the case of digit ones. These should
15299 * contain only digits from the same group of 10. The
15300 * ASCII case is handled just above. 0x660 is the
15301 * first digit character beyond ASCII. Hence here, the
15302 * range could be a range of digits. Find out. */
15303 IV index_start = _invlist_search(PL_XPosix_ptrs[_CC_DIGIT],
15305 IV index_final = _invlist_search(PL_XPosix_ptrs[_CC_DIGIT],
15308 /* If the range start and final points are in the same
15309 * inversion list element, it means that either both
15310 * are not digits, or both are digits in a consecutive
15311 * sequence of digits. (So far, Unicode has kept all
15312 * such sequences as distinct groups of 10, but assert
15313 * to make sure). If the end points are not in the
15314 * same element, neither should be a digit. */
15315 if (index_start == index_final) {
15316 assert(! ELEMENT_RANGE_MATCHES_INVLIST(index_start)
15317 || (invlist_array(PL_XPosix_ptrs[_CC_DIGIT])[index_start+1]
15318 - invlist_array(PL_XPosix_ptrs[_CC_DIGIT])[index_start]
15320 /* But actually Unicode did have one group of 11
15321 * 'digits' in 5.2, so in case we are operating
15322 * on that version, let that pass */
15323 || (invlist_array(PL_XPosix_ptrs[_CC_DIGIT])[index_start+1]
15324 - invlist_array(PL_XPosix_ptrs[_CC_DIGIT])[index_start]
15326 && invlist_array(PL_XPosix_ptrs[_CC_DIGIT])[index_start]
15330 else if ((index_start >= 0
15331 && ELEMENT_RANGE_MATCHES_INVLIST(index_start))
15332 || (index_final >= 0
15333 && ELEMENT_RANGE_MATCHES_INVLIST(index_final)))
15335 vWARN(RExC_parse, "Ranges of digits should be from the same group of 10");
15340 if ((! range || prevvalue == value) && non_portable_endpoint) {
15341 if (isPRINT_A(value)) {
15344 if (isBACKSLASHED_PUNCT(value)) {
15345 literal[d++] = '\\';
15347 literal[d++] = (char) value;
15348 literal[d++] = '\0';
15351 "\"%.*s\" is more clearly written simply as \"%s\"",
15352 (int) (RExC_parse - rangebegin),
15357 else if isMNEMONIC_CNTRL(value) {
15359 "\"%.*s\" is more clearly written simply as \"%s\"",
15360 (int) (RExC_parse - rangebegin),
15362 cntrl_to_mnemonic((char) value)
15368 /* Deal with this element of the class */
15372 cp_foldable_list = _add_range_to_invlist(cp_foldable_list,
15375 /* On non-ASCII platforms, for ranges that span all of 0..255, and
15376 * ones that don't require special handling, we can just add the
15377 * range like we do for ASCII platforms */
15378 if ((UNLIKELY(prevvalue == 0) && value >= 255)
15379 || ! (prevvalue < 256
15381 || (! non_portable_endpoint
15382 && ((isLOWER_A(prevvalue) && isLOWER_A(value))
15383 || (isUPPER_A(prevvalue)
15384 && isUPPER_A(value)))))))
15386 cp_foldable_list = _add_range_to_invlist(cp_foldable_list,
15390 /* Here, requires special handling. This can be because it is
15391 * a range whose code points are considered to be Unicode, and
15392 * so must be individually translated into native, or because
15393 * its a subrange of 'A-Z' or 'a-z' which each aren't
15394 * contiguous in EBCDIC, but we have defined them to include
15395 * only the "expected" upper or lower case ASCII alphabetics.
15396 * Subranges above 255 are the same in native and Unicode, so
15397 * can be added as a range */
15398 U8 start = NATIVE_TO_LATIN1(prevvalue);
15400 U8 end = (value < 256) ? NATIVE_TO_LATIN1(value) : 255;
15401 for (j = start; j <= end; j++) {
15402 cp_foldable_list = add_cp_to_invlist(cp_foldable_list, LATIN1_TO_NATIVE(j));
15405 cp_foldable_list = _add_range_to_invlist(cp_foldable_list,
15412 range = 0; /* this range (if it was one) is done now */
15413 } /* End of loop through all the text within the brackets */
15415 /* If anything in the class expands to more than one character, we have to
15416 * deal with them by building up a substitute parse string, and recursively
15417 * calling reg() on it, instead of proceeding */
15418 if (multi_char_matches) {
15419 SV * substitute_parse = newSVpvn_flags("?:", 2, SVs_TEMP);
15422 char *save_end = RExC_end;
15423 char *save_parse = RExC_parse;
15424 bool first_time = TRUE; /* First multi-char occurrence doesn't get
15429 #if 0 /* Have decided not to deal with multi-char folds in inverted classes,
15430 because too confusing */
15432 sv_catpv(substitute_parse, "(?:");
15436 /* Look at the longest folds first */
15437 for (cp_count = av_tindex(multi_char_matches); cp_count > 0; cp_count--) {
15439 if (av_exists(multi_char_matches, cp_count)) {
15440 AV** this_array_ptr;
15443 this_array_ptr = (AV**) av_fetch(multi_char_matches,
15445 while ((this_sequence = av_pop(*this_array_ptr)) !=
15448 if (! first_time) {
15449 sv_catpv(substitute_parse, "|");
15451 first_time = FALSE;
15453 sv_catpv(substitute_parse, SvPVX(this_sequence));
15458 /* If the character class contains anything else besides these
15459 * multi-character folds, have to include it in recursive parsing */
15460 if (element_count) {
15461 sv_catpv(substitute_parse, "|[");
15462 sv_catpvn(substitute_parse, orig_parse, RExC_parse - orig_parse);
15463 sv_catpv(substitute_parse, "]");
15466 sv_catpv(substitute_parse, ")");
15469 /* This is a way to get the parse to skip forward a whole named
15470 * sequence instead of matching the 2nd character when it fails the
15472 sv_catpv(substitute_parse, "(*THEN)(*SKIP)(*FAIL)|.)");
15476 RExC_parse = SvPV(substitute_parse, len);
15477 RExC_end = RExC_parse + len;
15478 RExC_in_multi_char_class = 1;
15479 RExC_override_recoding = 1;
15480 RExC_emit = (regnode *)orig_emit;
15482 ret = reg(pRExC_state, 1, ®_flags, depth+1);
15484 *flagp |= reg_flags&(HASWIDTH|SIMPLE|SPSTART|POSTPONED|RESTART_PASS1|NEED_UTF8);
15486 RExC_parse = save_parse;
15487 RExC_end = save_end;
15488 RExC_in_multi_char_class = 0;
15489 RExC_override_recoding = 0;
15490 SvREFCNT_dec_NN(multi_char_matches);
15494 /* Here, we've gone through the entire class and dealt with multi-char
15495 * folds. We are now in a position that we can do some checks to see if we
15496 * can optimize this ANYOF node into a simpler one, even in Pass 1.
15497 * Currently we only do two checks:
15498 * 1) is in the unlikely event that the user has specified both, eg. \w and
15499 * \W under /l, then the class matches everything. (This optimization
15500 * is done only to make the optimizer code run later work.)
15501 * 2) if the character class contains only a single element (including a
15502 * single range), we see if there is an equivalent node for it.
15503 * Other checks are possible */
15505 && ! ret_invlist /* Can't optimize if returning the constructed
15507 && (UNLIKELY(posixl_matches_all) || element_count == 1))
15512 if (UNLIKELY(posixl_matches_all)) {
15515 else if (namedclass > OOB_NAMEDCLASS) { /* this is a named class, like
15516 \w or [:digit:] or \p{foo}
15519 /* All named classes are mapped into POSIXish nodes, with its FLAG
15520 * argument giving which class it is */
15521 switch ((I32)namedclass) {
15522 case ANYOF_UNIPROP:
15525 /* These don't depend on the charset modifiers. They always
15526 * match under /u rules */
15527 case ANYOF_NHORIZWS:
15528 case ANYOF_HORIZWS:
15529 namedclass = ANYOF_BLANK + namedclass - ANYOF_HORIZWS;
15532 case ANYOF_NVERTWS:
15537 /* The actual POSIXish node for all the rest depends on the
15538 * charset modifier. The ones in the first set depend only on
15539 * ASCII or, if available on this platform, also locale */
15543 op = (LOC) ? POSIXL : POSIXA;
15549 /* The following don't have any matches in the upper Latin1
15550 * range, hence /d is equivalent to /u for them. Making it /u
15551 * saves some branches at runtime */
15555 case ANYOF_NXDIGIT:
15556 if (! DEPENDS_SEMANTICS) {
15557 goto treat_as_default;
15563 /* The following change to CASED under /i */
15569 namedclass = ANYOF_CASED + (namedclass % 2);
15573 /* The rest have more possibilities depending on the charset.
15574 * We take advantage of the enum ordering of the charset
15575 * modifiers to get the exact node type, */
15578 op = POSIXD + get_regex_charset(RExC_flags);
15579 if (op > POSIXA) { /* /aa is same as /a */
15584 /* The odd numbered ones are the complements of the
15585 * next-lower even number one */
15586 if (namedclass % 2 == 1) {
15590 arg = namedclass_to_classnum(namedclass);
15594 else if (value == prevvalue) {
15596 /* Here, the class consists of just a single code point */
15599 if (! LOC && value == '\n') {
15600 op = REG_ANY; /* Optimize [^\n] */
15601 *flagp |= HASWIDTH|SIMPLE;
15605 else if (value < 256 || UTF) {
15607 /* Optimize a single value into an EXACTish node, but not if it
15608 * would require converting the pattern to UTF-8. */
15609 op = compute_EXACTish(pRExC_state);
15611 } /* Otherwise is a range */
15612 else if (! LOC) { /* locale could vary these */
15613 if (prevvalue == '0') {
15614 if (value == '9') {
15619 else if (! FOLD || ASCII_FOLD_RESTRICTED) {
15620 /* We can optimize A-Z or a-z, but not if they could match
15621 * something like the KELVIN SIGN under /i. */
15622 if (prevvalue == 'A') {
15625 && ! non_portable_endpoint
15628 arg = (FOLD) ? _CC_ALPHA : _CC_UPPER;
15632 else if (prevvalue == 'a') {
15635 && ! non_portable_endpoint
15638 arg = (FOLD) ? _CC_ALPHA : _CC_LOWER;
15645 /* Here, we have changed <op> away from its initial value iff we found
15646 * an optimization */
15649 /* Throw away this ANYOF regnode, and emit the calculated one,
15650 * which should correspond to the beginning, not current, state of
15652 const char * cur_parse = RExC_parse;
15653 RExC_parse = (char *)orig_parse;
15657 /* To get locale nodes to not use the full ANYOF size would
15658 * require moving the code above that writes the portions
15659 * of it that aren't in other nodes to after this point.
15660 * e.g. ANYOF_POSIXL_SET */
15661 RExC_size = orig_size;
15665 RExC_emit = (regnode *)orig_emit;
15666 if (PL_regkind[op] == POSIXD) {
15667 if (op == POSIXL) {
15668 RExC_contains_locale = 1;
15671 op += NPOSIXD - POSIXD;
15676 ret = reg_node(pRExC_state, op);
15678 if (PL_regkind[op] == POSIXD || PL_regkind[op] == NPOSIXD) {
15682 *flagp |= HASWIDTH|SIMPLE;
15684 else if (PL_regkind[op] == EXACT) {
15685 alloc_maybe_populate_EXACT(pRExC_state, ret, flagp, 0, value,
15686 TRUE /* downgradable to EXACT */
15690 RExC_parse = (char *) cur_parse;
15692 SvREFCNT_dec(posixes);
15693 SvREFCNT_dec(nposixes);
15694 SvREFCNT_dec(simple_posixes);
15695 SvREFCNT_dec(cp_list);
15696 SvREFCNT_dec(cp_foldable_list);
15703 /****** !SIZE_ONLY (Pass 2) AFTER HERE *********/
15705 /* If folding, we calculate all characters that could fold to or from the
15706 * ones already on the list */
15707 if (cp_foldable_list) {
15709 UV start, end; /* End points of code point ranges */
15711 SV* fold_intersection = NULL;
15714 /* Our calculated list will be for Unicode rules. For locale
15715 * matching, we have to keep a separate list that is consulted at
15716 * runtime only when the locale indicates Unicode rules. For
15717 * non-locale, we just use the general list */
15719 use_list = &only_utf8_locale_list;
15722 use_list = &cp_list;
15725 /* Only the characters in this class that participate in folds need
15726 * be checked. Get the intersection of this class and all the
15727 * possible characters that are foldable. This can quickly narrow
15728 * down a large class */
15729 _invlist_intersection(PL_utf8_foldable, cp_foldable_list,
15730 &fold_intersection);
15732 /* The folds for all the Latin1 characters are hard-coded into this
15733 * program, but we have to go out to disk to get the others. */
15734 if (invlist_highest(cp_foldable_list) >= 256) {
15736 /* This is a hash that for a particular fold gives all
15737 * characters that are involved in it */
15738 if (! PL_utf8_foldclosures) {
15739 _load_PL_utf8_foldclosures();
15743 /* Now look at the foldable characters in this class individually */
15744 invlist_iterinit(fold_intersection);
15745 while (invlist_iternext(fold_intersection, &start, &end)) {
15748 /* Look at every character in the range */
15749 for (j = start; j <= end; j++) {
15750 U8 foldbuf[UTF8_MAXBYTES_CASE+1];
15756 if (IS_IN_SOME_FOLD_L1(j)) {
15758 /* ASCII is always matched; non-ASCII is matched
15759 * only under Unicode rules (which could happen
15760 * under /l if the locale is a UTF-8 one */
15761 if (isASCII(j) || ! DEPENDS_SEMANTICS) {
15762 *use_list = add_cp_to_invlist(*use_list,
15763 PL_fold_latin1[j]);
15767 add_cp_to_invlist(depends_list,
15768 PL_fold_latin1[j]);
15772 if (HAS_NONLATIN1_SIMPLE_FOLD_CLOSURE(j)
15773 && (! isASCII(j) || ! ASCII_FOLD_RESTRICTED))
15775 add_above_Latin1_folds(pRExC_state,
15782 /* Here is an above Latin1 character. We don't have the
15783 * rules hard-coded for it. First, get its fold. This is
15784 * the simple fold, as the multi-character folds have been
15785 * handled earlier and separated out */
15786 _to_uni_fold_flags(j, foldbuf, &foldlen,
15787 (ASCII_FOLD_RESTRICTED)
15788 ? FOLD_FLAGS_NOMIX_ASCII
15791 /* Single character fold of above Latin1. Add everything in
15792 * its fold closure to the list that this node should match.
15793 * The fold closures data structure is a hash with the keys
15794 * being the UTF-8 of every character that is folded to, like
15795 * 'k', and the values each an array of all code points that
15796 * fold to its key. e.g. [ 'k', 'K', KELVIN_SIGN ].
15797 * Multi-character folds are not included */
15798 if ((listp = hv_fetch(PL_utf8_foldclosures,
15799 (char *) foldbuf, foldlen, FALSE)))
15801 AV* list = (AV*) *listp;
15803 for (k = 0; k <= av_tindex(list); k++) {
15804 SV** c_p = av_fetch(list, k, FALSE);
15810 /* /aa doesn't allow folds between ASCII and non- */
15811 if ((ASCII_FOLD_RESTRICTED
15812 && (isASCII(c) != isASCII(j))))
15817 /* Folds under /l which cross the 255/256 boundary
15818 * are added to a separate list. (These are valid
15819 * only when the locale is UTF-8.) */
15820 if (c < 256 && LOC) {
15821 *use_list = add_cp_to_invlist(*use_list, c);
15825 if (isASCII(c) || c > 255 || AT_LEAST_UNI_SEMANTICS)
15827 cp_list = add_cp_to_invlist(cp_list, c);
15830 /* Similarly folds involving non-ascii Latin1
15831 * characters under /d are added to their list */
15832 depends_list = add_cp_to_invlist(depends_list,
15839 SvREFCNT_dec_NN(fold_intersection);
15842 /* Now that we have finished adding all the folds, there is no reason
15843 * to keep the foldable list separate */
15844 _invlist_union(cp_list, cp_foldable_list, &cp_list);
15845 SvREFCNT_dec_NN(cp_foldable_list);
15848 /* And combine the result (if any) with any inversion list from posix
15849 * classes. The lists are kept separate up to now because we don't want to
15850 * fold the classes (folding of those is automatically handled by the swash
15851 * fetching code) */
15852 if (simple_posixes) {
15853 _invlist_union(cp_list, simple_posixes, &cp_list);
15854 SvREFCNT_dec_NN(simple_posixes);
15856 if (posixes || nposixes) {
15857 if (posixes && AT_LEAST_ASCII_RESTRICTED) {
15858 /* Under /a and /aa, nothing above ASCII matches these */
15859 _invlist_intersection(posixes,
15860 PL_XPosix_ptrs[_CC_ASCII],
15864 if (DEPENDS_SEMANTICS) {
15865 /* Under /d, everything in the upper half of the Latin1 range
15866 * matches these complements */
15867 ANYOF_FLAGS(ret) |= ANYOF_SHARED_d_MATCHES_ALL_NON_UTF8_NON_ASCII_non_d_WARN_SUPER;
15869 else if (AT_LEAST_ASCII_RESTRICTED) {
15870 /* Under /a and /aa, everything above ASCII matches these
15872 _invlist_union_complement_2nd(nposixes,
15873 PL_XPosix_ptrs[_CC_ASCII],
15877 _invlist_union(posixes, nposixes, &posixes);
15878 SvREFCNT_dec_NN(nposixes);
15881 posixes = nposixes;
15884 if (! DEPENDS_SEMANTICS) {
15886 _invlist_union(cp_list, posixes, &cp_list);
15887 SvREFCNT_dec_NN(posixes);
15894 /* Under /d, we put into a separate list the Latin1 things that
15895 * match only when the target string is utf8 */
15896 SV* nonascii_but_latin1_properties = NULL;
15897 _invlist_intersection(posixes, PL_UpperLatin1,
15898 &nonascii_but_latin1_properties);
15899 _invlist_subtract(posixes, nonascii_but_latin1_properties,
15902 _invlist_union(cp_list, posixes, &cp_list);
15903 SvREFCNT_dec_NN(posixes);
15909 if (depends_list) {
15910 _invlist_union(depends_list, nonascii_but_latin1_properties,
15912 SvREFCNT_dec_NN(nonascii_but_latin1_properties);
15915 depends_list = nonascii_but_latin1_properties;
15920 /* And combine the result (if any) with any inversion list from properties.
15921 * The lists are kept separate up to now so that we can distinguish the two
15922 * in regards to matching above-Unicode. A run-time warning is generated
15923 * if a Unicode property is matched against a non-Unicode code point. But,
15924 * we allow user-defined properties to match anything, without any warning,
15925 * and we also suppress the warning if there is a portion of the character
15926 * class that isn't a Unicode property, and which matches above Unicode, \W
15927 * or [\x{110000}] for example.
15928 * (Note that in this case, unlike the Posix one above, there is no
15929 * <depends_list>, because having a Unicode property forces Unicode
15934 /* If it matters to the final outcome, see if a non-property
15935 * component of the class matches above Unicode. If so, the
15936 * warning gets suppressed. This is true even if just a single
15937 * such code point is specified, as though not strictly correct if
15938 * another such code point is matched against, the fact that they
15939 * are using above-Unicode code points indicates they should know
15940 * the issues involved */
15942 warn_super = ! (invert
15943 ^ (invlist_highest(cp_list) > PERL_UNICODE_MAX));
15946 _invlist_union(properties, cp_list, &cp_list);
15947 SvREFCNT_dec_NN(properties);
15950 cp_list = properties;
15955 |= ANYOF_SHARED_d_MATCHES_ALL_NON_UTF8_NON_ASCII_non_d_WARN_SUPER;
15957 /* Because an ANYOF node is the only one that warns, this node
15958 * can't be optimized into something else */
15959 optimizable = FALSE;
15963 /* Here, we have calculated what code points should be in the character
15966 * Now we can see about various optimizations. Fold calculation (which we
15967 * did above) needs to take place before inversion. Otherwise /[^k]/i
15968 * would invert to include K, which under /i would match k, which it
15969 * shouldn't. Therefore we can't invert folded locale now, as it won't be
15970 * folded until runtime */
15972 /* If we didn't do folding, it's because some information isn't available
15973 * until runtime; set the run-time fold flag for these. (We don't have to
15974 * worry about properties folding, as that is taken care of by the swash
15975 * fetching). We know to set the flag if we have a non-NULL list for UTF-8
15976 * locales, or the class matches at least one 0-255 range code point */
15978 if (only_utf8_locale_list) {
15979 ANYOF_FLAGS(ret) |= ANYOF_LOC_FOLD;
15981 else if (cp_list) { /* Look to see if a 0-255 code point is in list */
15983 invlist_iterinit(cp_list);
15984 if (invlist_iternext(cp_list, &start, &end) && start < 256) {
15985 ANYOF_FLAGS(ret) |= ANYOF_LOC_FOLD;
15987 invlist_iterfinish(cp_list);
15991 /* Optimize inverted simple patterns (e.g. [^a-z]) when everything is known
15992 * at compile time. Besides not inverting folded locale now, we can't
15993 * invert if there are things such as \w, which aren't known until runtime
15997 && ! (ANYOF_FLAGS(ret) & (ANYOF_LOCALE_FLAGS))
15999 && ! HAS_NONLOCALE_RUNTIME_PROPERTY_DEFINITION)
16001 _invlist_invert(cp_list);
16003 /* Any swash can't be used as-is, because we've inverted things */
16005 SvREFCNT_dec_NN(swash);
16009 /* Clear the invert flag since have just done it here */
16016 *ret_invlist = cp_list;
16017 SvREFCNT_dec(swash);
16019 /* Discard the generated node */
16021 RExC_size = orig_size;
16024 RExC_emit = orig_emit;
16029 /* Some character classes are equivalent to other nodes. Such nodes take
16030 * up less room and generally fewer operations to execute than ANYOF nodes.
16031 * Above, we checked for and optimized into some such equivalents for
16032 * certain common classes that are easy to test. Getting to this point in
16033 * the code means that the class didn't get optimized there. Since this
16034 * code is only executed in Pass 2, it is too late to save space--it has
16035 * been allocated in Pass 1, and currently isn't given back. But turning
16036 * things into an EXACTish node can allow the optimizer to join it to any
16037 * adjacent such nodes. And if the class is equivalent to things like /./,
16038 * expensive run-time swashes can be avoided. Now that we have more
16039 * complete information, we can find things necessarily missed by the
16040 * earlier code. I (khw) did some benchmarks and found essentially no
16041 * speed difference between using a POSIXA node versus an ANYOF node, so
16042 * there is no reason to optimize, for example [A-Za-z0-9_] into
16043 * [[:word:]]/a (although if we did it in the sizing pass it would save
16044 * space). _invlistEQ() could be used if one ever wanted to do something
16045 * like this at this point in the code */
16047 if (optimizable && cp_list && ! invert && ! depends_list) {
16049 U8 op = END; /* The optimzation node-type */
16050 const char * cur_parse= RExC_parse;
16052 invlist_iterinit(cp_list);
16053 if (! invlist_iternext(cp_list, &start, &end)) {
16055 /* Here, the list is empty. This happens, for example, when a
16056 * Unicode property that doesn't match anything is the only element
16057 * in the character class (perluniprops.pod notes such properties).
16060 *flagp |= HASWIDTH|SIMPLE;
16062 else if (start == end) { /* The range is a single code point */
16063 if (! invlist_iternext(cp_list, &start, &end)
16065 /* Don't do this optimization if it would require changing
16066 * the pattern to UTF-8 */
16067 && (start < 256 || UTF))
16069 /* Here, the list contains a single code point. Can optimize
16070 * into an EXACTish node */
16081 /* A locale node under folding with one code point can be
16082 * an EXACTFL, as its fold won't be calculated until
16088 /* Here, we are generally folding, but there is only one
16089 * code point to match. If we have to, we use an EXACT
16090 * node, but it would be better for joining with adjacent
16091 * nodes in the optimization pass if we used the same
16092 * EXACTFish node that any such are likely to be. We can
16093 * do this iff the code point doesn't participate in any
16094 * folds. For example, an EXACTF of a colon is the same as
16095 * an EXACT one, since nothing folds to or from a colon. */
16097 if (IS_IN_SOME_FOLD_L1(value)) {
16102 if (_invlist_contains_cp(PL_utf8_foldable, value)) {
16107 /* If we haven't found the node type, above, it means we
16108 * can use the prevailing one */
16110 op = compute_EXACTish(pRExC_state);
16114 } /* End of first range contains just a single code point */
16115 else if (start == 0) {
16116 if (end == UV_MAX) {
16118 *flagp |= HASWIDTH|SIMPLE;
16121 else if (end == '\n' - 1
16122 && invlist_iternext(cp_list, &start, &end)
16123 && start == '\n' + 1 && end == UV_MAX)
16126 *flagp |= HASWIDTH|SIMPLE;
16130 invlist_iterfinish(cp_list);
16133 RExC_parse = (char *)orig_parse;
16134 RExC_emit = (regnode *)orig_emit;
16136 if (regarglen[op]) {
16137 ret = reganode(pRExC_state, op, 0);
16139 ret = reg_node(pRExC_state, op);
16142 RExC_parse = (char *)cur_parse;
16144 if (PL_regkind[op] == EXACT) {
16145 alloc_maybe_populate_EXACT(pRExC_state, ret, flagp, 0, value,
16146 TRUE /* downgradable to EXACT */
16150 SvREFCNT_dec_NN(cp_list);
16155 /* Here, <cp_list> contains all the code points we can determine at
16156 * compile time that match under all conditions. Go through it, and
16157 * for things that belong in the bitmap, put them there, and delete from
16158 * <cp_list>. While we are at it, see if everything above 255 is in the
16159 * list, and if so, set a flag to speed up execution */
16161 populate_ANYOF_from_invlist(ret, &cp_list);
16164 ANYOF_FLAGS(ret) |= ANYOF_INVERT;
16167 /* Here, the bitmap has been populated with all the Latin1 code points that
16168 * always match. Can now add to the overall list those that match only
16169 * when the target string is UTF-8 (<depends_list>). */
16170 if (depends_list) {
16172 _invlist_union(cp_list, depends_list, &cp_list);
16173 SvREFCNT_dec_NN(depends_list);
16176 cp_list = depends_list;
16178 ANYOF_FLAGS(ret) |= ANYOF_HAS_UTF8_NONBITMAP_MATCHES;
16181 /* If there is a swash and more than one element, we can't use the swash in
16182 * the optimization below. */
16183 if (swash && element_count > 1) {
16184 SvREFCNT_dec_NN(swash);
16188 /* Note that the optimization of using 'swash' if it is the only thing in
16189 * the class doesn't have us change swash at all, so it can include things
16190 * that are also in the bitmap; otherwise we have purposely deleted that
16191 * duplicate information */
16192 set_ANYOF_arg(pRExC_state, ret, cp_list,
16193 (HAS_NONLOCALE_RUNTIME_PROPERTY_DEFINITION)
16195 only_utf8_locale_list,
16196 swash, has_user_defined_property);
16198 *flagp |= HASWIDTH|SIMPLE;
16200 if (ANYOF_FLAGS(ret) & ANYOF_LOCALE_FLAGS) {
16201 RExC_contains_locale = 1;
16207 #undef HAS_NONLOCALE_RUNTIME_PROPERTY_DEFINITION
16210 S_set_ANYOF_arg(pTHX_ RExC_state_t* const pRExC_state,
16211 regnode* const node,
16213 SV* const runtime_defns,
16214 SV* const only_utf8_locale_list,
16216 const bool has_user_defined_property)
16218 /* Sets the arg field of an ANYOF-type node 'node', using information about
16219 * the node passed-in. If there is nothing outside the node's bitmap, the
16220 * arg is set to ANYOF_ONLY_HAS_BITMAP. Otherwise, it sets the argument to
16221 * the count returned by add_data(), having allocated and stored an array,
16222 * av, that that count references, as follows:
16223 * av[0] stores the character class description in its textual form.
16224 * This is used later (regexec.c:Perl_regclass_swash()) to
16225 * initialize the appropriate swash, and is also useful for dumping
16226 * the regnode. This is set to &PL_sv_undef if the textual
16227 * description is not needed at run-time (as happens if the other
16228 * elements completely define the class)
16229 * av[1] if &PL_sv_undef, is a placeholder to later contain the swash
16230 * computed from av[0]. But if no further computation need be done,
16231 * the swash is stored here now (and av[0] is &PL_sv_undef).
16232 * av[2] stores the inversion list of code points that match only if the
16233 * current locale is UTF-8
16234 * av[3] stores the cp_list inversion list for use in addition or instead
16235 * of av[0]; used only if cp_list exists and av[1] is &PL_sv_undef.
16236 * (Otherwise everything needed is already in av[0] and av[1])
16237 * av[4] is set if any component of the class is from a user-defined
16238 * property; used only if av[3] exists */
16242 PERL_ARGS_ASSERT_SET_ANYOF_ARG;
16244 if (! cp_list && ! runtime_defns && ! only_utf8_locale_list) {
16245 assert(! (ANYOF_FLAGS(node)
16246 & (ANYOF_HAS_UTF8_NONBITMAP_MATCHES
16247 |ANYOF_HAS_NONBITMAP_NON_UTF8_MATCHES)));
16248 ARG_SET(node, ANYOF_ONLY_HAS_BITMAP);
16251 AV * const av = newAV();
16254 assert(ANYOF_FLAGS(node)
16255 & (ANYOF_HAS_UTF8_NONBITMAP_MATCHES
16256 |ANYOF_HAS_NONBITMAP_NON_UTF8_MATCHES|ANYOF_LOC_FOLD));
16258 av_store(av, 0, (runtime_defns)
16259 ? SvREFCNT_inc(runtime_defns) : &PL_sv_undef);
16262 av_store(av, 1, swash);
16263 SvREFCNT_dec_NN(cp_list);
16266 av_store(av, 1, &PL_sv_undef);
16268 av_store(av, 3, cp_list);
16269 av_store(av, 4, newSVuv(has_user_defined_property));
16273 if (only_utf8_locale_list) {
16274 av_store(av, 2, only_utf8_locale_list);
16277 av_store(av, 2, &PL_sv_undef);
16280 rv = newRV_noinc(MUTABLE_SV(av));
16281 n = add_data(pRExC_state, STR_WITH_LEN("s"));
16282 RExC_rxi->data->data[n] = (void*)rv;
16287 #if !defined(PERL_IN_XSUB_RE) || defined(PLUGGABLE_RE_EXTENSION)
16289 Perl__get_regclass_nonbitmap_data(pTHX_ const regexp *prog,
16290 const regnode* node,
16293 SV** only_utf8_locale_ptr,
16297 /* For internal core use only.
16298 * Returns the swash for the input 'node' in the regex 'prog'.
16299 * If <doinit> is 'true', will attempt to create the swash if not already
16301 * If <listsvp> is non-null, will return the printable contents of the
16302 * swash. This can be used to get debugging information even before the
16303 * swash exists, by calling this function with 'doinit' set to false, in
16304 * which case the components that will be used to eventually create the
16305 * swash are returned (in a printable form).
16306 * If <exclude_list> is not NULL, it is an inversion list of things to
16307 * exclude from what's returned in <listsvp>.
16308 * Tied intimately to how S_set_ANYOF_arg sets up the data structure. Note
16309 * that, in spite of this function's name, the swash it returns may include
16310 * the bitmap data as well */
16313 SV *si = NULL; /* Input swash initialization string */
16314 SV* invlist = NULL;
16316 RXi_GET_DECL(prog,progi);
16317 const struct reg_data * const data = prog ? progi->data : NULL;
16319 PERL_ARGS_ASSERT__GET_REGCLASS_NONBITMAP_DATA;
16321 assert(ANYOF_FLAGS(node)
16322 & (ANYOF_HAS_UTF8_NONBITMAP_MATCHES
16323 |ANYOF_HAS_NONBITMAP_NON_UTF8_MATCHES|ANYOF_LOC_FOLD));
16325 if (data && data->count) {
16326 const U32 n = ARG(node);
16328 if (data->what[n] == 's') {
16329 SV * const rv = MUTABLE_SV(data->data[n]);
16330 AV * const av = MUTABLE_AV(SvRV(rv));
16331 SV **const ary = AvARRAY(av);
16332 U8 swash_init_flags = _CORE_SWASH_INIT_ACCEPT_INVLIST;
16334 si = *ary; /* ary[0] = the string to initialize the swash with */
16336 /* Elements 3 and 4 are either both present or both absent. [3] is
16337 * any inversion list generated at compile time; [4] indicates if
16338 * that inversion list has any user-defined properties in it. */
16339 if (av_tindex(av) >= 2) {
16340 if (only_utf8_locale_ptr
16342 && ary[2] != &PL_sv_undef)
16344 *only_utf8_locale_ptr = ary[2];
16347 assert(only_utf8_locale_ptr);
16348 *only_utf8_locale_ptr = NULL;
16351 if (av_tindex(av) >= 3) {
16353 if (SvUV(ary[4])) {
16354 swash_init_flags |= _CORE_SWASH_INIT_USER_DEFINED_PROPERTY;
16362 /* Element [1] is reserved for the set-up swash. If already there,
16363 * return it; if not, create it and store it there */
16364 if (ary[1] && SvROK(ary[1])) {
16367 else if (doinit && ((si && si != &PL_sv_undef)
16368 || (invlist && invlist != &PL_sv_undef))) {
16370 sw = _core_swash_init("utf8", /* the utf8 package */
16374 0, /* not from tr/// */
16376 &swash_init_flags);
16377 (void)av_store(av, 1, sw);
16382 /* If requested, return a printable version of what this swash matches */
16384 SV* matches_string = newSVpvs("");
16386 /* The swash should be used, if possible, to get the data, as it
16387 * contains the resolved data. But this function can be called at
16388 * compile-time, before everything gets resolved, in which case we
16389 * return the currently best available information, which is the string
16390 * that will eventually be used to do that resolving, 'si' */
16391 if ((! sw || (invlist = _get_swash_invlist(sw)) == NULL)
16392 && (si && si != &PL_sv_undef))
16394 sv_catsv(matches_string, si);
16397 /* Add the inversion list to whatever we have. This may have come from
16398 * the swash, or from an input parameter */
16400 if (exclude_list) {
16401 SV* clone = invlist_clone(invlist);
16402 _invlist_subtract(clone, exclude_list, &clone);
16403 sv_catsv(matches_string, _invlist_contents(clone));
16404 SvREFCNT_dec_NN(clone);
16407 sv_catsv(matches_string, _invlist_contents(invlist));
16410 *listsvp = matches_string;
16415 #endif /* !defined(PERL_IN_XSUB_RE) || defined(PLUGGABLE_RE_EXTENSION) */
16417 /* reg_skipcomment()
16419 Absorbs an /x style # comment from the input stream,
16420 returning a pointer to the first character beyond the comment, or if the
16421 comment terminates the pattern without anything following it, this returns
16422 one past the final character of the pattern (in other words, RExC_end) and
16423 sets the REG_RUN_ON_COMMENT_SEEN flag.
16425 Note it's the callers responsibility to ensure that we are
16426 actually in /x mode
16430 PERL_STATIC_INLINE char*
16431 S_reg_skipcomment(RExC_state_t *pRExC_state, char* p)
16433 PERL_ARGS_ASSERT_REG_SKIPCOMMENT;
16437 while (p < RExC_end) {
16438 if (*(++p) == '\n') {
16443 /* we ran off the end of the pattern without ending the comment, so we have
16444 * to add an \n when wrapping */
16445 RExC_seen |= REG_RUN_ON_COMMENT_SEEN;
16450 S_skip_to_be_ignored_text(pTHX_ RExC_state_t *pRExC_state,
16452 const bool force_to_xmod
16455 /* If the text at the current parse position '*p' is a '(?#...)' comment,
16456 * or if we are under /x or 'force_to_xmod' is TRUE, and the text at '*p'
16457 * is /x whitespace, advance '*p' so that on exit it points to the first
16458 * byte past all such white space and comments */
16460 const bool use_xmod = force_to_xmod || (RExC_flags & RXf_PMf_EXTENDED);
16462 PERL_ARGS_ASSERT_SKIP_TO_BE_IGNORED_TEXT;
16464 assert( ! UTF || UTF8_IS_INVARIANT(**p) || UTF8_IS_START(**p));
16467 if (RExC_end - (*p) >= 3
16469 && *(*p + 1) == '?'
16470 && *(*p + 2) == '#')
16472 while (*(*p) != ')') {
16473 if ((*p) == RExC_end)
16474 FAIL("Sequence (?#... not terminated");
16482 const char * save_p = *p;
16483 while ((*p) < RExC_end) {
16485 if ((len = is_PATWS_safe((*p), RExC_end, UTF))) {
16488 else if (*(*p) == '#') {
16489 (*p) = reg_skipcomment(pRExC_state, (*p));
16495 if (*p != save_p) {
16508 Advances the parse position by one byte, unless that byte is the beginning
16509 of a '(?#...)' style comment, or is /x whitespace and /x is in effect. In
16510 those two cases, the parse position is advanced beyond all such comments and
16513 This is the UTF, (?#...), and /x friendly way of saying RExC_parse++.
16517 S_nextchar(pTHX_ RExC_state_t *pRExC_state)
16519 PERL_ARGS_ASSERT_NEXTCHAR;
16522 || UTF8_IS_INVARIANT(*RExC_parse)
16523 || UTF8_IS_START(*RExC_parse));
16525 RExC_parse += (UTF) ? UTF8SKIP(RExC_parse) : 1;
16527 skip_to_be_ignored_text(pRExC_state, &RExC_parse,
16528 FALSE /* Don't assume /x */ );
16532 S_regnode_guts(pTHX_ RExC_state_t *pRExC_state, const U8 op, const STRLEN extra_size, const char* const name)
16534 /* Allocate a regnode for 'op' and returns it, with 'extra_size' extra
16535 * space. In pass1, it aligns and increments RExC_size; in pass2,
16538 regnode * const ret = RExC_emit;
16539 GET_RE_DEBUG_FLAGS_DECL;
16541 PERL_ARGS_ASSERT_REGNODE_GUTS;
16543 assert(extra_size >= regarglen[op]);
16546 SIZE_ALIGN(RExC_size);
16547 RExC_size += 1 + extra_size;
16550 if (RExC_emit >= RExC_emit_bound)
16551 Perl_croak(aTHX_ "panic: reg_node overrun trying to emit %d, %p>=%p",
16552 op, (void*)RExC_emit, (void*)RExC_emit_bound);
16554 NODE_ALIGN_FILL(ret);
16555 #ifndef RE_TRACK_PATTERN_OFFSETS
16556 PERL_UNUSED_ARG(name);
16558 if (RExC_offsets) { /* MJD */
16560 ("%s:%d: (op %s) %s %"UVuf" (len %"UVuf") (max %"UVuf").\n",
16563 (UV)(RExC_emit - RExC_emit_start) > RExC_offsets[0]
16564 ? "Overwriting end of array!\n" : "OK",
16565 (UV)(RExC_emit - RExC_emit_start),
16566 (UV)(RExC_parse - RExC_start),
16567 (UV)RExC_offsets[0]));
16568 Set_Node_Offset(RExC_emit, RExC_parse + (op == END));
16575 - reg_node - emit a node
16577 STATIC regnode * /* Location. */
16578 S_reg_node(pTHX_ RExC_state_t *pRExC_state, U8 op)
16580 regnode * const ret = regnode_guts(pRExC_state, op, regarglen[op], "reg_node");
16582 PERL_ARGS_ASSERT_REG_NODE;
16584 assert(regarglen[op] == 0);
16587 regnode *ptr = ret;
16588 FILL_ADVANCE_NODE(ptr, op);
16595 - reganode - emit a node with an argument
16597 STATIC regnode * /* Location. */
16598 S_reganode(pTHX_ RExC_state_t *pRExC_state, U8 op, U32 arg)
16600 regnode * const ret = regnode_guts(pRExC_state, op, regarglen[op], "reganode");
16602 PERL_ARGS_ASSERT_REGANODE;
16604 assert(regarglen[op] == 1);
16607 regnode *ptr = ret;
16608 FILL_ADVANCE_NODE_ARG(ptr, op, arg);
16615 S_reg2Lanode(pTHX_ RExC_state_t *pRExC_state, const U8 op, const U32 arg1, const I32 arg2)
16617 /* emit a node with U32 and I32 arguments */
16619 regnode * const ret = regnode_guts(pRExC_state, op, regarglen[op], "reg2Lanode");
16621 PERL_ARGS_ASSERT_REG2LANODE;
16623 assert(regarglen[op] == 2);
16626 regnode *ptr = ret;
16627 FILL_ADVANCE_NODE_2L_ARG(ptr, op, arg1, arg2);
16634 - reginsert - insert an operator in front of already-emitted operand
16636 * Means relocating the operand.
16639 S_reginsert(pTHX_ RExC_state_t *pRExC_state, U8 op, regnode *opnd, U32 depth)
16644 const int offset = regarglen[(U8)op];
16645 const int size = NODE_STEP_REGNODE + offset;
16646 GET_RE_DEBUG_FLAGS_DECL;
16648 PERL_ARGS_ASSERT_REGINSERT;
16649 PERL_UNUSED_CONTEXT;
16650 PERL_UNUSED_ARG(depth);
16651 /* (PL_regkind[(U8)op] == CURLY ? EXTRA_STEP_2ARGS : 0); */
16652 DEBUG_PARSE_FMT("inst"," - %s",PL_reg_name[op]);
16661 if (RExC_open_parens) {
16663 /*DEBUG_PARSE_FMT("inst"," - %"IVdf, (IV)RExC_npar);*/
16664 for ( paren=0 ; paren < RExC_npar ; paren++ ) {
16665 if ( RExC_open_parens[paren] >= opnd ) {
16666 /*DEBUG_PARSE_FMT("open"," - %d",size);*/
16667 RExC_open_parens[paren] += size;
16669 /*DEBUG_PARSE_FMT("open"," - %s","ok");*/
16671 if ( RExC_close_parens[paren] >= opnd ) {
16672 /*DEBUG_PARSE_FMT("close"," - %d",size);*/
16673 RExC_close_parens[paren] += size;
16675 /*DEBUG_PARSE_FMT("close"," - %s","ok");*/
16680 while (src > opnd) {
16681 StructCopy(--src, --dst, regnode);
16682 #ifdef RE_TRACK_PATTERN_OFFSETS
16683 if (RExC_offsets) { /* MJD 20010112 */
16685 ("%s(%d): (op %s) %s copy %"UVuf" -> %"UVuf" (max %"UVuf").\n",
16689 (UV)(dst - RExC_emit_start) > RExC_offsets[0]
16690 ? "Overwriting end of array!\n" : "OK",
16691 (UV)(src - RExC_emit_start),
16692 (UV)(dst - RExC_emit_start),
16693 (UV)RExC_offsets[0]));
16694 Set_Node_Offset_To_R(dst-RExC_emit_start, Node_Offset(src));
16695 Set_Node_Length_To_R(dst-RExC_emit_start, Node_Length(src));
16701 place = opnd; /* Op node, where operand used to be. */
16702 #ifdef RE_TRACK_PATTERN_OFFSETS
16703 if (RExC_offsets) { /* MJD */
16705 ("%s(%d): (op %s) %s %"UVuf" <- %"UVuf" (max %"UVuf").\n",
16709 (UV)(place - RExC_emit_start) > RExC_offsets[0]
16710 ? "Overwriting end of array!\n" : "OK",
16711 (UV)(place - RExC_emit_start),
16712 (UV)(RExC_parse - RExC_start),
16713 (UV)RExC_offsets[0]));
16714 Set_Node_Offset(place, RExC_parse);
16715 Set_Node_Length(place, 1);
16718 src = NEXTOPER(place);
16719 FILL_ADVANCE_NODE(place, op);
16720 Zero(src, offset, regnode);
16724 - regtail - set the next-pointer at the end of a node chain of p to val.
16725 - SEE ALSO: regtail_study
16727 /* TODO: All three parms should be const */
16729 S_regtail(pTHX_ RExC_state_t *pRExC_state, regnode *p,
16730 const regnode *val,U32 depth)
16733 GET_RE_DEBUG_FLAGS_DECL;
16735 PERL_ARGS_ASSERT_REGTAIL;
16737 PERL_UNUSED_ARG(depth);
16743 /* Find last node. */
16746 regnode * const temp = regnext(scan);
16748 DEBUG_PARSE_MSG((scan==p ? "tail" : ""));
16749 regprop(RExC_rx, RExC_mysv, scan, NULL, pRExC_state);
16750 PerlIO_printf(Perl_debug_log, "~ %s (%d) %s %s\n",
16751 SvPV_nolen_const(RExC_mysv), REG_NODE_NUM(scan),
16752 (temp == NULL ? "->" : ""),
16753 (temp == NULL ? PL_reg_name[OP(val)] : "")
16761 if (reg_off_by_arg[OP(scan)]) {
16762 ARG_SET(scan, val - scan);
16765 NEXT_OFF(scan) = val - scan;
16771 - regtail_study - set the next-pointer at the end of a node chain of p to val.
16772 - Look for optimizable sequences at the same time.
16773 - currently only looks for EXACT chains.
16775 This is experimental code. The idea is to use this routine to perform
16776 in place optimizations on branches and groups as they are constructed,
16777 with the long term intention of removing optimization from study_chunk so
16778 that it is purely analytical.
16780 Currently only used when in DEBUG mode. The macro REGTAIL_STUDY() is used
16781 to control which is which.
16784 /* TODO: All four parms should be const */
16787 S_regtail_study(pTHX_ RExC_state_t *pRExC_state, regnode *p,
16788 const regnode *val,U32 depth)
16792 #ifdef EXPERIMENTAL_INPLACESCAN
16795 GET_RE_DEBUG_FLAGS_DECL;
16797 PERL_ARGS_ASSERT_REGTAIL_STUDY;
16803 /* Find last node. */
16807 regnode * const temp = regnext(scan);
16808 #ifdef EXPERIMENTAL_INPLACESCAN
16809 if (PL_regkind[OP(scan)] == EXACT) {
16810 bool unfolded_multi_char; /* Unexamined in this routine */
16811 if (join_exact(pRExC_state, scan, &min,
16812 &unfolded_multi_char, 1, val, depth+1))
16817 switch (OP(scan)) {
16821 case EXACTFA_NO_TRIE:
16827 if( exact == PSEUDO )
16829 else if ( exact != OP(scan) )
16838 DEBUG_PARSE_MSG((scan==p ? "tsdy" : ""));
16839 regprop(RExC_rx, RExC_mysv, scan, NULL, pRExC_state);
16840 PerlIO_printf(Perl_debug_log, "~ %s (%d) -> %s\n",
16841 SvPV_nolen_const(RExC_mysv),
16842 REG_NODE_NUM(scan),
16843 PL_reg_name[exact]);
16850 DEBUG_PARSE_MSG("");
16851 regprop(RExC_rx, RExC_mysv, val, NULL, pRExC_state);
16852 PerlIO_printf(Perl_debug_log,
16853 "~ attach to %s (%"IVdf") offset to %"IVdf"\n",
16854 SvPV_nolen_const(RExC_mysv),
16855 (IV)REG_NODE_NUM(val),
16859 if (reg_off_by_arg[OP(scan)]) {
16860 ARG_SET(scan, val - scan);
16863 NEXT_OFF(scan) = val - scan;
16871 - regdump - dump a regexp onto Perl_debug_log in vaguely comprehensible form
16876 S_regdump_intflags(pTHX_ const char *lead, const U32 flags)
16881 ASSUME(REG_INTFLAGS_NAME_SIZE <= sizeof(flags)*8);
16883 for (bit=0; bit<REG_INTFLAGS_NAME_SIZE; bit++) {
16884 if (flags & (1<<bit)) {
16885 if (!set++ && lead)
16886 PerlIO_printf(Perl_debug_log, "%s",lead);
16887 PerlIO_printf(Perl_debug_log, "%s ",PL_reg_intflags_name[bit]);
16892 PerlIO_printf(Perl_debug_log, "\n");
16894 PerlIO_printf(Perl_debug_log, "%s[none-set]\n",lead);
16899 S_regdump_extflags(pTHX_ const char *lead, const U32 flags)
16905 ASSUME(REG_EXTFLAGS_NAME_SIZE <= sizeof(flags)*8);
16907 for (bit=0; bit<REG_EXTFLAGS_NAME_SIZE; bit++) {
16908 if (flags & (1<<bit)) {
16909 if ((1<<bit) & RXf_PMf_CHARSET) { /* Output separately, below */
16912 if (!set++ && lead)
16913 PerlIO_printf(Perl_debug_log, "%s",lead);
16914 PerlIO_printf(Perl_debug_log, "%s ",PL_reg_extflags_name[bit]);
16917 if ((cs = get_regex_charset(flags)) != REGEX_DEPENDS_CHARSET) {
16918 if (!set++ && lead) {
16919 PerlIO_printf(Perl_debug_log, "%s",lead);
16922 case REGEX_UNICODE_CHARSET:
16923 PerlIO_printf(Perl_debug_log, "UNICODE");
16925 case REGEX_LOCALE_CHARSET:
16926 PerlIO_printf(Perl_debug_log, "LOCALE");
16928 case REGEX_ASCII_RESTRICTED_CHARSET:
16929 PerlIO_printf(Perl_debug_log, "ASCII-RESTRICTED");
16931 case REGEX_ASCII_MORE_RESTRICTED_CHARSET:
16932 PerlIO_printf(Perl_debug_log, "ASCII-MORE_RESTRICTED");
16935 PerlIO_printf(Perl_debug_log, "UNKNOWN CHARACTER SET");
16941 PerlIO_printf(Perl_debug_log, "\n");
16943 PerlIO_printf(Perl_debug_log, "%s[none-set]\n",lead);
16949 Perl_regdump(pTHX_ const regexp *r)
16952 SV * const sv = sv_newmortal();
16953 SV *dsv= sv_newmortal();
16954 RXi_GET_DECL(r,ri);
16955 GET_RE_DEBUG_FLAGS_DECL;
16957 PERL_ARGS_ASSERT_REGDUMP;
16959 (void)dumpuntil(r, ri->program, ri->program + 1, NULL, NULL, sv, 0, 0);
16961 /* Header fields of interest. */
16962 if (r->anchored_substr) {
16963 RE_PV_QUOTED_DECL(s, 0, dsv, SvPVX_const(r->anchored_substr),
16964 RE_SV_DUMPLEN(r->anchored_substr), 30);
16965 PerlIO_printf(Perl_debug_log,
16966 "anchored %s%s at %"IVdf" ",
16967 s, RE_SV_TAIL(r->anchored_substr),
16968 (IV)r->anchored_offset);
16969 } else if (r->anchored_utf8) {
16970 RE_PV_QUOTED_DECL(s, 1, dsv, SvPVX_const(r->anchored_utf8),
16971 RE_SV_DUMPLEN(r->anchored_utf8), 30);
16972 PerlIO_printf(Perl_debug_log,
16973 "anchored utf8 %s%s at %"IVdf" ",
16974 s, RE_SV_TAIL(r->anchored_utf8),
16975 (IV)r->anchored_offset);
16977 if (r->float_substr) {
16978 RE_PV_QUOTED_DECL(s, 0, dsv, SvPVX_const(r->float_substr),
16979 RE_SV_DUMPLEN(r->float_substr), 30);
16980 PerlIO_printf(Perl_debug_log,
16981 "floating %s%s at %"IVdf"..%"UVuf" ",
16982 s, RE_SV_TAIL(r->float_substr),
16983 (IV)r->float_min_offset, (UV)r->float_max_offset);
16984 } else if (r->float_utf8) {
16985 RE_PV_QUOTED_DECL(s, 1, dsv, SvPVX_const(r->float_utf8),
16986 RE_SV_DUMPLEN(r->float_utf8), 30);
16987 PerlIO_printf(Perl_debug_log,
16988 "floating utf8 %s%s at %"IVdf"..%"UVuf" ",
16989 s, RE_SV_TAIL(r->float_utf8),
16990 (IV)r->float_min_offset, (UV)r->float_max_offset);
16992 if (r->check_substr || r->check_utf8)
16993 PerlIO_printf(Perl_debug_log,
16995 (r->check_substr == r->float_substr
16996 && r->check_utf8 == r->float_utf8
16997 ? "(checking floating" : "(checking anchored"));
16998 if (r->intflags & PREGf_NOSCAN)
16999 PerlIO_printf(Perl_debug_log, " noscan");
17000 if (r->extflags & RXf_CHECK_ALL)
17001 PerlIO_printf(Perl_debug_log, " isall");
17002 if (r->check_substr || r->check_utf8)
17003 PerlIO_printf(Perl_debug_log, ") ");
17005 if (ri->regstclass) {
17006 regprop(r, sv, ri->regstclass, NULL, NULL);
17007 PerlIO_printf(Perl_debug_log, "stclass %s ", SvPVX_const(sv));
17009 if (r->intflags & PREGf_ANCH) {
17010 PerlIO_printf(Perl_debug_log, "anchored");
17011 if (r->intflags & PREGf_ANCH_MBOL)
17012 PerlIO_printf(Perl_debug_log, "(MBOL)");
17013 if (r->intflags & PREGf_ANCH_SBOL)
17014 PerlIO_printf(Perl_debug_log, "(SBOL)");
17015 if (r->intflags & PREGf_ANCH_GPOS)
17016 PerlIO_printf(Perl_debug_log, "(GPOS)");
17017 (void)PerlIO_putc(Perl_debug_log, ' ');
17019 if (r->intflags & PREGf_GPOS_SEEN)
17020 PerlIO_printf(Perl_debug_log, "GPOS:%"UVuf" ", (UV)r->gofs);
17021 if (r->intflags & PREGf_SKIP)
17022 PerlIO_printf(Perl_debug_log, "plus ");
17023 if (r->intflags & PREGf_IMPLICIT)
17024 PerlIO_printf(Perl_debug_log, "implicit ");
17025 PerlIO_printf(Perl_debug_log, "minlen %"IVdf" ", (IV)r->minlen);
17026 if (r->extflags & RXf_EVAL_SEEN)
17027 PerlIO_printf(Perl_debug_log, "with eval ");
17028 PerlIO_printf(Perl_debug_log, "\n");
17030 regdump_extflags("r->extflags: ",r->extflags);
17031 regdump_intflags("r->intflags: ",r->intflags);
17034 PERL_ARGS_ASSERT_REGDUMP;
17035 PERL_UNUSED_CONTEXT;
17036 PERL_UNUSED_ARG(r);
17037 #endif /* DEBUGGING */
17041 - regprop - printable representation of opcode, with run time support
17045 Perl_regprop(pTHX_ const regexp *prog, SV *sv, const regnode *o, const regmatch_info *reginfo, const RExC_state_t *pRExC_state)
17050 /* Should be synchronized with * ANYOF_ #xdefines in regcomp.h */
17051 static const char * const anyofs[] = {
17052 #if _CC_WORDCHAR != 0 || _CC_DIGIT != 1 || _CC_ALPHA != 2 || _CC_LOWER != 3 \
17053 || _CC_UPPER != 4 || _CC_PUNCT != 5 || _CC_PRINT != 6 \
17054 || _CC_ALPHANUMERIC != 7 || _CC_GRAPH != 8 || _CC_CASED != 9 \
17055 || _CC_SPACE != 10 || _CC_BLANK != 11 || _CC_XDIGIT != 12 \
17056 || _CC_CNTRL != 13 || _CC_ASCII != 14 || _CC_VERTSPACE != 15
17057 #error Need to adjust order of anyofs[]
17092 RXi_GET_DECL(prog,progi);
17093 GET_RE_DEBUG_FLAGS_DECL;
17095 PERL_ARGS_ASSERT_REGPROP;
17097 sv_setpvn(sv, "", 0);
17099 if (OP(o) > REGNODE_MAX) /* regnode.type is unsigned */
17100 /* It would be nice to FAIL() here, but this may be called from
17101 regexec.c, and it would be hard to supply pRExC_state. */
17102 Perl_croak(aTHX_ "Corrupted regexp opcode %d > %d",
17103 (int)OP(o), (int)REGNODE_MAX);
17104 sv_catpv(sv, PL_reg_name[OP(o)]); /* Take off const! */
17106 k = PL_regkind[OP(o)];
17109 sv_catpvs(sv, " ");
17110 /* Using is_utf8_string() (via PERL_PV_UNI_DETECT)
17111 * is a crude hack but it may be the best for now since
17112 * we have no flag "this EXACTish node was UTF-8"
17114 pv_pretty(sv, STRING(o), STR_LEN(o), 60, PL_colors[0], PL_colors[1],
17115 PERL_PV_ESCAPE_UNI_DETECT |
17116 PERL_PV_ESCAPE_NONASCII |
17117 PERL_PV_PRETTY_ELLIPSES |
17118 PERL_PV_PRETTY_LTGT |
17119 PERL_PV_PRETTY_NOCLEAR
17121 } else if (k == TRIE) {
17122 /* print the details of the trie in dumpuntil instead, as
17123 * progi->data isn't available here */
17124 const char op = OP(o);
17125 const U32 n = ARG(o);
17126 const reg_ac_data * const ac = IS_TRIE_AC(op) ?
17127 (reg_ac_data *)progi->data->data[n] :
17129 const reg_trie_data * const trie
17130 = (reg_trie_data*)progi->data->data[!IS_TRIE_AC(op) ? n : ac->trie];
17132 Perl_sv_catpvf(aTHX_ sv, "-%s",PL_reg_name[o->flags]);
17133 DEBUG_TRIE_COMPILE_r(
17134 Perl_sv_catpvf(aTHX_ sv,
17135 "<S:%"UVuf"/%"IVdf" W:%"UVuf" L:%"UVuf"/%"UVuf" C:%"UVuf"/%"UVuf">",
17136 (UV)trie->startstate,
17137 (IV)trie->statecount-1, /* -1 because of the unused 0 element */
17138 (UV)trie->wordcount,
17141 (UV)TRIE_CHARCOUNT(trie),
17142 (UV)trie->uniquecharcount
17145 if ( IS_ANYOF_TRIE(op) || trie->bitmap ) {
17146 sv_catpvs(sv, "[");
17147 (void) put_charclass_bitmap_innards(sv,
17148 (IS_ANYOF_TRIE(op))
17150 : TRIE_BITMAP(trie),
17152 sv_catpvs(sv, "]");
17155 } else if (k == CURLY) {
17156 U32 lo = ARG1(o), hi = ARG2(o);
17157 if (OP(o) == CURLYM || OP(o) == CURLYN || OP(o) == CURLYX)
17158 Perl_sv_catpvf(aTHX_ sv, "[%d]", o->flags); /* Parenth number */
17159 Perl_sv_catpvf(aTHX_ sv, "{%u,", (unsigned) lo);
17160 if (hi == REG_INFTY)
17161 sv_catpvs(sv, "INFTY");
17163 Perl_sv_catpvf(aTHX_ sv, "%u", (unsigned) hi);
17164 sv_catpvs(sv, "}");
17166 else if (k == WHILEM && o->flags) /* Ordinal/of */
17167 Perl_sv_catpvf(aTHX_ sv, "[%d/%d]", o->flags & 0xf, o->flags>>4);
17168 else if (k == REF || k == OPEN || k == CLOSE
17169 || k == GROUPP || OP(o)==ACCEPT)
17171 AV *name_list= NULL;
17172 U32 parno= OP(o) == ACCEPT ? (U32)ARG2L(o) : ARG(o);
17173 Perl_sv_catpvf(aTHX_ sv, "%"UVuf, (UV)parno); /* Parenth number */
17174 if ( RXp_PAREN_NAMES(prog) ) {
17175 name_list= MUTABLE_AV(progi->data->data[progi->name_list_idx]);
17176 } else if ( pRExC_state ) {
17177 name_list= RExC_paren_name_list;
17180 if ( k != REF || (OP(o) < NREF)) {
17181 SV **name= av_fetch(name_list, parno, 0 );
17183 Perl_sv_catpvf(aTHX_ sv, " '%"SVf"'", SVfARG(*name));
17186 SV *sv_dat= MUTABLE_SV(progi->data->data[ parno ]);
17187 I32 *nums=(I32*)SvPVX(sv_dat);
17188 SV **name= av_fetch(name_list, nums[0], 0 );
17191 for ( n=0; n<SvIVX(sv_dat); n++ ) {
17192 Perl_sv_catpvf(aTHX_ sv, "%s%"IVdf,
17193 (n ? "," : ""), (IV)nums[n]);
17195 Perl_sv_catpvf(aTHX_ sv, " '%"SVf"'", SVfARG(*name));
17199 if ( k == REF && reginfo) {
17200 U32 n = ARG(o); /* which paren pair */
17201 I32 ln = prog->offs[n].start;
17202 if (prog->lastparen < n || ln == -1)
17203 Perl_sv_catpvf(aTHX_ sv, ": FAIL");
17204 else if (ln == prog->offs[n].end)
17205 Perl_sv_catpvf(aTHX_ sv, ": ACCEPT - EMPTY STRING");
17207 const char *s = reginfo->strbeg + ln;
17208 Perl_sv_catpvf(aTHX_ sv, ": ");
17209 Perl_pv_pretty( aTHX_ sv, s, prog->offs[n].end - prog->offs[n].start, 32, 0, 0,
17210 PERL_PV_ESCAPE_UNI_DETECT|PERL_PV_PRETTY_NOCLEAR|PERL_PV_PRETTY_ELLIPSES|PERL_PV_PRETTY_QUOTE );
17213 } else if (k == GOSUB) {
17214 AV *name_list= NULL;
17215 if ( RXp_PAREN_NAMES(prog) ) {
17216 name_list= MUTABLE_AV(progi->data->data[progi->name_list_idx]);
17217 } else if ( pRExC_state ) {
17218 name_list= RExC_paren_name_list;
17221 /* Paren and offset */
17222 Perl_sv_catpvf(aTHX_ sv, "%d[%+d]", (int)ARG(o),(int)ARG2L(o));
17224 SV **name= av_fetch(name_list, ARG(o), 0 );
17226 Perl_sv_catpvf(aTHX_ sv, " '%"SVf"'", SVfARG(*name));
17229 else if (k == LOGICAL)
17230 /* 2: embedded, otherwise 1 */
17231 Perl_sv_catpvf(aTHX_ sv, "[%d]", o->flags);
17232 else if (k == ANYOF) {
17233 const U8 flags = ANYOF_FLAGS(o);
17235 SV* bitmap_invlist; /* Will hold what the bit map contains */
17238 if (OP(o) == ANYOFL) {
17239 if (flags & ANYOF_LOC_REQ_UTF8) {
17240 sv_catpvs(sv, "{utf8-loc}");
17243 sv_catpvs(sv, "{loc}");
17246 if (flags & ANYOF_LOC_FOLD)
17247 sv_catpvs(sv, "{i}");
17248 Perl_sv_catpvf(aTHX_ sv, "[%s", PL_colors[0]);
17249 if (flags & ANYOF_INVERT)
17250 sv_catpvs(sv, "^");
17252 /* output what the standard cp 0-NUM_ANYOF_CODE_POINTS-1 bitmap matches
17254 do_sep = put_charclass_bitmap_innards(sv, ANYOF_BITMAP(o),
17257 /* output any special charclass tests (used entirely under use
17259 if (ANYOF_POSIXL_TEST_ANY_SET(o)) {
17261 for (i = 0; i < ANYOF_POSIXL_MAX; i++) {
17262 if (ANYOF_POSIXL_TEST(o,i)) {
17263 sv_catpv(sv, anyofs[i]);
17269 if ((flags & (ANYOF_MATCHES_ALL_ABOVE_BITMAP
17270 |ANYOF_HAS_UTF8_NONBITMAP_MATCHES
17271 |ANYOF_HAS_NONBITMAP_NON_UTF8_MATCHES
17275 Perl_sv_catpvf(aTHX_ sv,"%s][%s",PL_colors[1],PL_colors[0]);
17276 if (flags & ANYOF_INVERT)
17277 /*make sure the invert info is in each */
17278 sv_catpvs(sv, "^");
17281 if (OP(o) == ANYOFD
17282 && (flags & ANYOF_SHARED_d_MATCHES_ALL_NON_UTF8_NON_ASCII_non_d_WARN_SUPER))
17284 sv_catpvs(sv, "{non-utf8-latin1-all}");
17287 if (flags & ANYOF_MATCHES_ALL_ABOVE_BITMAP)
17288 sv_catpvs(sv, "{above_bitmap_all}");
17290 if (ARG(o) != ANYOF_ONLY_HAS_BITMAP) {
17291 SV *lv; /* Set if there is something outside the bit map. */
17292 bool byte_output = FALSE; /* If something has been output */
17293 SV *only_utf8_locale;
17295 /* Get the stuff that wasn't in the bitmap. 'bitmap_invlist'
17296 * is used to guarantee that nothing in the bitmap gets
17298 (void) _get_regclass_nonbitmap_data(prog, o, FALSE,
17299 &lv, &only_utf8_locale,
17301 if (lv && lv != &PL_sv_undef) {
17302 char *s = savesvpv(lv);
17303 char * const origs = s;
17305 while (*s && *s != '\n')
17309 const char * const t = ++s;
17311 if (flags & ANYOF_HAS_NONBITMAP_NON_UTF8_MATCHES) {
17312 sv_catpvs(sv, "{outside bitmap}");
17315 sv_catpvs(sv, "{utf8}");
17319 sv_catpvs(sv, " ");
17325 /* Truncate very long output */
17326 if (s - origs > 256) {
17327 Perl_sv_catpvf(aTHX_ sv,
17329 (int) (s - origs - 1),
17335 else if (*s == '\t') {
17349 SvREFCNT_dec_NN(lv);
17352 if ((flags & ANYOF_LOC_FOLD)
17353 && only_utf8_locale
17354 && only_utf8_locale != &PL_sv_undef)
17357 int max_entries = 256;
17359 sv_catpvs(sv, "{utf8 locale}");
17360 invlist_iterinit(only_utf8_locale);
17361 while (invlist_iternext(only_utf8_locale,
17363 put_range(sv, start, end, FALSE);
17365 if (max_entries < 0) {
17366 sv_catpvs(sv, "...");
17370 invlist_iterfinish(only_utf8_locale);
17374 SvREFCNT_dec(bitmap_invlist);
17377 Perl_sv_catpvf(aTHX_ sv, "%s]", PL_colors[1]);
17379 else if (k == POSIXD || k == NPOSIXD) {
17380 U8 index = FLAGS(o) * 2;
17381 if (index < C_ARRAY_LENGTH(anyofs)) {
17382 if (*anyofs[index] != '[') {
17385 sv_catpv(sv, anyofs[index]);
17386 if (*anyofs[index] != '[') {
17391 Perl_sv_catpvf(aTHX_ sv, "[illegal type=%d])", index);
17394 else if (k == BOUND || k == NBOUND) {
17395 /* Must be synced with order of 'bound_type' in regcomp.h */
17396 const char * const bounds[] = {
17397 "", /* Traditional */
17402 sv_catpv(sv, bounds[FLAGS(o)]);
17404 else if (k == BRANCHJ && (OP(o) == UNLESSM || OP(o) == IFMATCH))
17405 Perl_sv_catpvf(aTHX_ sv, "[%d]", -(o->flags));
17406 else if (OP(o) == SBOL)
17407 Perl_sv_catpvf(aTHX_ sv, " /%s/", o->flags ? "\\A" : "^");
17409 /* add on the verb argument if there is one */
17410 if ( ( k == VERB || OP(o) == ACCEPT || OP(o) == OPFAIL ) && o->flags) {
17411 Perl_sv_catpvf(aTHX_ sv, ":%"SVf,
17412 SVfARG((MUTABLE_SV(progi->data->data[ ARG( o ) ]))));
17415 PERL_UNUSED_CONTEXT;
17416 PERL_UNUSED_ARG(sv);
17417 PERL_UNUSED_ARG(o);
17418 PERL_UNUSED_ARG(prog);
17419 PERL_UNUSED_ARG(reginfo);
17420 PERL_UNUSED_ARG(pRExC_state);
17421 #endif /* DEBUGGING */
17427 Perl_re_intuit_string(pTHX_ REGEXP * const r)
17428 { /* Assume that RE_INTUIT is set */
17429 struct regexp *const prog = ReANY(r);
17430 GET_RE_DEBUG_FLAGS_DECL;
17432 PERL_ARGS_ASSERT_RE_INTUIT_STRING;
17433 PERL_UNUSED_CONTEXT;
17437 const char * const s = SvPV_nolen_const(RX_UTF8(r)
17438 ? prog->check_utf8 : prog->check_substr);
17440 if (!PL_colorset) reginitcolors();
17441 PerlIO_printf(Perl_debug_log,
17442 "%sUsing REx %ssubstr:%s \"%s%.60s%s%s\"\n",
17444 RX_UTF8(r) ? "utf8 " : "",
17445 PL_colors[5],PL_colors[0],
17448 (strlen(s) > 60 ? "..." : ""));
17451 /* use UTF8 check substring if regexp pattern itself is in UTF8 */
17452 return RX_UTF8(r) ? prog->check_utf8 : prog->check_substr;
17458 handles refcounting and freeing the perl core regexp structure. When
17459 it is necessary to actually free the structure the first thing it
17460 does is call the 'free' method of the regexp_engine associated to
17461 the regexp, allowing the handling of the void *pprivate; member
17462 first. (This routine is not overridable by extensions, which is why
17463 the extensions free is called first.)
17465 See regdupe and regdupe_internal if you change anything here.
17467 #ifndef PERL_IN_XSUB_RE
17469 Perl_pregfree(pTHX_ REGEXP *r)
17475 Perl_pregfree2(pTHX_ REGEXP *rx)
17477 struct regexp *const r = ReANY(rx);
17478 GET_RE_DEBUG_FLAGS_DECL;
17480 PERL_ARGS_ASSERT_PREGFREE2;
17482 if (r->mother_re) {
17483 ReREFCNT_dec(r->mother_re);
17485 CALLREGFREE_PVT(rx); /* free the private data */
17486 SvREFCNT_dec(RXp_PAREN_NAMES(r));
17487 Safefree(r->xpv_len_u.xpvlenu_pv);
17490 SvREFCNT_dec(r->anchored_substr);
17491 SvREFCNT_dec(r->anchored_utf8);
17492 SvREFCNT_dec(r->float_substr);
17493 SvREFCNT_dec(r->float_utf8);
17494 Safefree(r->substrs);
17496 RX_MATCH_COPY_FREE(rx);
17497 #ifdef PERL_ANY_COW
17498 SvREFCNT_dec(r->saved_copy);
17501 SvREFCNT_dec(r->qr_anoncv);
17502 rx->sv_u.svu_rx = 0;
17507 This is a hacky workaround to the structural issue of match results
17508 being stored in the regexp structure which is in turn stored in
17509 PL_curpm/PL_reg_curpm. The problem is that due to qr// the pattern
17510 could be PL_curpm in multiple contexts, and could require multiple
17511 result sets being associated with the pattern simultaneously, such
17512 as when doing a recursive match with (??{$qr})
17514 The solution is to make a lightweight copy of the regexp structure
17515 when a qr// is returned from the code executed by (??{$qr}) this
17516 lightweight copy doesn't actually own any of its data except for
17517 the starp/end and the actual regexp structure itself.
17523 Perl_reg_temp_copy (pTHX_ REGEXP *ret_x, REGEXP *rx)
17525 struct regexp *ret;
17526 struct regexp *const r = ReANY(rx);
17527 const bool islv = ret_x && SvTYPE(ret_x) == SVt_PVLV;
17529 PERL_ARGS_ASSERT_REG_TEMP_COPY;
17532 ret_x = (REGEXP*) newSV_type(SVt_REGEXP);
17534 SvOK_off((SV *)ret_x);
17536 /* For PVLVs, SvANY points to the xpvlv body while sv_u points
17537 to the regexp. (For SVt_REGEXPs, sv_upgrade has already
17538 made both spots point to the same regexp body.) */
17539 REGEXP *temp = (REGEXP *)newSV_type(SVt_REGEXP);
17540 assert(!SvPVX(ret_x));
17541 ret_x->sv_u.svu_rx = temp->sv_any;
17542 temp->sv_any = NULL;
17543 SvFLAGS(temp) = (SvFLAGS(temp) & ~SVTYPEMASK) | SVt_NULL;
17544 SvREFCNT_dec_NN(temp);
17545 /* SvCUR still resides in the xpvlv struct, so the regexp copy-
17546 ing below will not set it. */
17547 SvCUR_set(ret_x, SvCUR(rx));
17550 /* This ensures that SvTHINKFIRST(sv) is true, and hence that
17551 sv_force_normal(sv) is called. */
17553 ret = ReANY(ret_x);
17555 SvFLAGS(ret_x) |= SvUTF8(rx);
17556 /* We share the same string buffer as the original regexp, on which we
17557 hold a reference count, incremented when mother_re is set below.
17558 The string pointer is copied here, being part of the regexp struct.
17560 memcpy(&(ret->xpv_cur), &(r->xpv_cur),
17561 sizeof(regexp) - STRUCT_OFFSET(regexp, xpv_cur));
17563 const I32 npar = r->nparens+1;
17564 Newx(ret->offs, npar, regexp_paren_pair);
17565 Copy(r->offs, ret->offs, npar, regexp_paren_pair);
17568 Newx(ret->substrs, 1, struct reg_substr_data);
17569 StructCopy(r->substrs, ret->substrs, struct reg_substr_data);
17571 SvREFCNT_inc_void(ret->anchored_substr);
17572 SvREFCNT_inc_void(ret->anchored_utf8);
17573 SvREFCNT_inc_void(ret->float_substr);
17574 SvREFCNT_inc_void(ret->float_utf8);
17576 /* check_substr and check_utf8, if non-NULL, point to either their
17577 anchored or float namesakes, and don't hold a second reference. */
17579 RX_MATCH_COPIED_off(ret_x);
17580 #ifdef PERL_ANY_COW
17581 ret->saved_copy = NULL;
17583 ret->mother_re = ReREFCNT_inc(r->mother_re ? r->mother_re : rx);
17584 SvREFCNT_inc_void(ret->qr_anoncv);
17590 /* regfree_internal()
17592 Free the private data in a regexp. This is overloadable by
17593 extensions. Perl takes care of the regexp structure in pregfree(),
17594 this covers the *pprivate pointer which technically perl doesn't
17595 know about, however of course we have to handle the
17596 regexp_internal structure when no extension is in use.
17598 Note this is called before freeing anything in the regexp
17603 Perl_regfree_internal(pTHX_ REGEXP * const rx)
17605 struct regexp *const r = ReANY(rx);
17606 RXi_GET_DECL(r,ri);
17607 GET_RE_DEBUG_FLAGS_DECL;
17609 PERL_ARGS_ASSERT_REGFREE_INTERNAL;
17615 SV *dsv= sv_newmortal();
17616 RE_PV_QUOTED_DECL(s, RX_UTF8(rx),
17617 dsv, RX_PRECOMP(rx), RX_PRELEN(rx), 60);
17618 PerlIO_printf(Perl_debug_log,"%sFreeing REx:%s %s\n",
17619 PL_colors[4],PL_colors[5],s);
17622 #ifdef RE_TRACK_PATTERN_OFFSETS
17624 Safefree(ri->u.offsets); /* 20010421 MJD */
17626 if (ri->code_blocks) {
17628 for (n = 0; n < ri->num_code_blocks; n++)
17629 SvREFCNT_dec(ri->code_blocks[n].src_regex);
17630 Safefree(ri->code_blocks);
17634 int n = ri->data->count;
17637 /* If you add a ->what type here, update the comment in regcomp.h */
17638 switch (ri->data->what[n]) {
17644 SvREFCNT_dec(MUTABLE_SV(ri->data->data[n]));
17647 Safefree(ri->data->data[n]);
17653 { /* Aho Corasick add-on structure for a trie node.
17654 Used in stclass optimization only */
17656 reg_ac_data *aho=(reg_ac_data*)ri->data->data[n];
17657 #ifdef USE_ITHREADS
17661 refcount = --aho->refcount;
17664 PerlMemShared_free(aho->states);
17665 PerlMemShared_free(aho->fail);
17666 /* do this last!!!! */
17667 PerlMemShared_free(ri->data->data[n]);
17668 /* we should only ever get called once, so
17669 * assert as much, and also guard the free
17670 * which /might/ happen twice. At the least
17671 * it will make code anlyzers happy and it
17672 * doesn't cost much. - Yves */
17673 assert(ri->regstclass);
17674 if (ri->regstclass) {
17675 PerlMemShared_free(ri->regstclass);
17676 ri->regstclass = 0;
17683 /* trie structure. */
17685 reg_trie_data *trie=(reg_trie_data*)ri->data->data[n];
17686 #ifdef USE_ITHREADS
17690 refcount = --trie->refcount;
17693 PerlMemShared_free(trie->charmap);
17694 PerlMemShared_free(trie->states);
17695 PerlMemShared_free(trie->trans);
17697 PerlMemShared_free(trie->bitmap);
17699 PerlMemShared_free(trie->jump);
17700 PerlMemShared_free(trie->wordinfo);
17701 /* do this last!!!! */
17702 PerlMemShared_free(ri->data->data[n]);
17707 Perl_croak(aTHX_ "panic: regfree data code '%c'",
17708 ri->data->what[n]);
17711 Safefree(ri->data->what);
17712 Safefree(ri->data);
17718 #define av_dup_inc(s,t) MUTABLE_AV(sv_dup_inc((const SV *)s,t))
17719 #define hv_dup_inc(s,t) MUTABLE_HV(sv_dup_inc((const SV *)s,t))
17720 #define SAVEPVN(p,n) ((p) ? savepvn(p,n) : NULL)
17723 re_dup - duplicate a regexp.
17725 This routine is expected to clone a given regexp structure. It is only
17726 compiled under USE_ITHREADS.
17728 After all of the core data stored in struct regexp is duplicated
17729 the regexp_engine.dupe method is used to copy any private data
17730 stored in the *pprivate pointer. This allows extensions to handle
17731 any duplication it needs to do.
17733 See pregfree() and regfree_internal() if you change anything here.
17735 #if defined(USE_ITHREADS)
17736 #ifndef PERL_IN_XSUB_RE
17738 Perl_re_dup_guts(pTHX_ const REGEXP *sstr, REGEXP *dstr, CLONE_PARAMS *param)
17742 const struct regexp *r = ReANY(sstr);
17743 struct regexp *ret = ReANY(dstr);
17745 PERL_ARGS_ASSERT_RE_DUP_GUTS;
17747 npar = r->nparens+1;
17748 Newx(ret->offs, npar, regexp_paren_pair);
17749 Copy(r->offs, ret->offs, npar, regexp_paren_pair);
17751 if (ret->substrs) {
17752 /* Do it this way to avoid reading from *r after the StructCopy().
17753 That way, if any of the sv_dup_inc()s dislodge *r from the L1
17754 cache, it doesn't matter. */
17755 const bool anchored = r->check_substr
17756 ? r->check_substr == r->anchored_substr
17757 : r->check_utf8 == r->anchored_utf8;
17758 Newx(ret->substrs, 1, struct reg_substr_data);
17759 StructCopy(r->substrs, ret->substrs, struct reg_substr_data);
17761 ret->anchored_substr = sv_dup_inc(ret->anchored_substr, param);
17762 ret->anchored_utf8 = sv_dup_inc(ret->anchored_utf8, param);
17763 ret->float_substr = sv_dup_inc(ret->float_substr, param);
17764 ret->float_utf8 = sv_dup_inc(ret->float_utf8, param);
17766 /* check_substr and check_utf8, if non-NULL, point to either their
17767 anchored or float namesakes, and don't hold a second reference. */
17769 if (ret->check_substr) {
17771 assert(r->check_utf8 == r->anchored_utf8);
17772 ret->check_substr = ret->anchored_substr;
17773 ret->check_utf8 = ret->anchored_utf8;
17775 assert(r->check_substr == r->float_substr);
17776 assert(r->check_utf8 == r->float_utf8);
17777 ret->check_substr = ret->float_substr;
17778 ret->check_utf8 = ret->float_utf8;
17780 } else if (ret->check_utf8) {
17782 ret->check_utf8 = ret->anchored_utf8;
17784 ret->check_utf8 = ret->float_utf8;
17789 RXp_PAREN_NAMES(ret) = hv_dup_inc(RXp_PAREN_NAMES(ret), param);
17790 ret->qr_anoncv = MUTABLE_CV(sv_dup_inc((const SV *)ret->qr_anoncv, param));
17793 RXi_SET(ret,CALLREGDUPE_PVT(dstr,param));
17795 if (RX_MATCH_COPIED(dstr))
17796 ret->subbeg = SAVEPVN(ret->subbeg, ret->sublen);
17798 ret->subbeg = NULL;
17799 #ifdef PERL_ANY_COW
17800 ret->saved_copy = NULL;
17803 /* Whether mother_re be set or no, we need to copy the string. We
17804 cannot refrain from copying it when the storage points directly to
17805 our mother regexp, because that's
17806 1: a buffer in a different thread
17807 2: something we no longer hold a reference on
17808 so we need to copy it locally. */
17809 RX_WRAPPED(dstr) = SAVEPVN(RX_WRAPPED(sstr), SvCUR(sstr)+1);
17810 ret->mother_re = NULL;
17812 #endif /* PERL_IN_XSUB_RE */
17817 This is the internal complement to regdupe() which is used to copy
17818 the structure pointed to by the *pprivate pointer in the regexp.
17819 This is the core version of the extension overridable cloning hook.
17820 The regexp structure being duplicated will be copied by perl prior
17821 to this and will be provided as the regexp *r argument, however
17822 with the /old/ structures pprivate pointer value. Thus this routine
17823 may override any copying normally done by perl.
17825 It returns a pointer to the new regexp_internal structure.
17829 Perl_regdupe_internal(pTHX_ REGEXP * const rx, CLONE_PARAMS *param)
17832 struct regexp *const r = ReANY(rx);
17833 regexp_internal *reti;
17835 RXi_GET_DECL(r,ri);
17837 PERL_ARGS_ASSERT_REGDUPE_INTERNAL;
17841 Newxc(reti, sizeof(regexp_internal) + len*sizeof(regnode),
17842 char, regexp_internal);
17843 Copy(ri->program, reti->program, len+1, regnode);
17845 reti->num_code_blocks = ri->num_code_blocks;
17846 if (ri->code_blocks) {
17848 Newxc(reti->code_blocks, ri->num_code_blocks, struct reg_code_block,
17849 struct reg_code_block);
17850 Copy(ri->code_blocks, reti->code_blocks, ri->num_code_blocks,
17851 struct reg_code_block);
17852 for (n = 0; n < ri->num_code_blocks; n++)
17853 reti->code_blocks[n].src_regex = (REGEXP*)
17854 sv_dup_inc((SV*)(ri->code_blocks[n].src_regex), param);
17857 reti->code_blocks = NULL;
17859 reti->regstclass = NULL;
17862 struct reg_data *d;
17863 const int count = ri->data->count;
17866 Newxc(d, sizeof(struct reg_data) + count*sizeof(void *),
17867 char, struct reg_data);
17868 Newx(d->what, count, U8);
17871 for (i = 0; i < count; i++) {
17872 d->what[i] = ri->data->what[i];
17873 switch (d->what[i]) {
17874 /* see also regcomp.h and regfree_internal() */
17875 case 'a': /* actually an AV, but the dup function is identical. */
17879 case 'u': /* actually an HV, but the dup function is identical. */
17880 d->data[i] = sv_dup_inc((const SV *)ri->data->data[i], param);
17883 /* This is cheating. */
17884 Newx(d->data[i], 1, regnode_ssc);
17885 StructCopy(ri->data->data[i], d->data[i], regnode_ssc);
17886 reti->regstclass = (regnode*)d->data[i];
17889 /* Trie stclasses are readonly and can thus be shared
17890 * without duplication. We free the stclass in pregfree
17891 * when the corresponding reg_ac_data struct is freed.
17893 reti->regstclass= ri->regstclass;
17897 ((reg_trie_data*)ri->data->data[i])->refcount++;
17902 d->data[i] = ri->data->data[i];
17905 Perl_croak(aTHX_ "panic: re_dup unknown data code '%c'",
17906 ri->data->what[i]);
17915 reti->name_list_idx = ri->name_list_idx;
17917 #ifdef RE_TRACK_PATTERN_OFFSETS
17918 if (ri->u.offsets) {
17919 Newx(reti->u.offsets, 2*len+1, U32);
17920 Copy(ri->u.offsets, reti->u.offsets, 2*len+1, U32);
17923 SetProgLen(reti,len);
17926 return (void*)reti;
17929 #endif /* USE_ITHREADS */
17931 #ifndef PERL_IN_XSUB_RE
17934 - regnext - dig the "next" pointer out of a node
17937 Perl_regnext(pTHX_ regnode *p)
17944 if (OP(p) > REGNODE_MAX) { /* regnode.type is unsigned */
17945 Perl_croak(aTHX_ "Corrupted regexp opcode %d > %d",
17946 (int)OP(p), (int)REGNODE_MAX);
17949 offset = (reg_off_by_arg[OP(p)] ? ARG(p) : NEXT_OFF(p));
17958 S_re_croak2(pTHX_ bool utf8, const char* pat1,const char* pat2,...)
17961 STRLEN l1 = strlen(pat1);
17962 STRLEN l2 = strlen(pat2);
17965 const char *message;
17967 PERL_ARGS_ASSERT_RE_CROAK2;
17973 Copy(pat1, buf, l1 , char);
17974 Copy(pat2, buf + l1, l2 , char);
17975 buf[l1 + l2] = '\n';
17976 buf[l1 + l2 + 1] = '\0';
17977 va_start(args, pat2);
17978 msv = vmess(buf, &args);
17980 message = SvPV_const(msv,l1);
17983 Copy(message, buf, l1 , char);
17984 /* l1-1 to avoid \n */
17985 Perl_croak(aTHX_ "%"UTF8f, UTF8fARG(utf8, l1-1, buf));
17988 /* XXX Here's a total kludge. But we need to re-enter for swash routines. */
17990 #ifndef PERL_IN_XSUB_RE
17992 Perl_save_re_context(pTHX)
17997 /* Save $1..$n (#18107: UTF-8 s/(\w+)/uc($1)/e); AMS 20021106. */
18000 const REGEXP * const rx = PM_GETRE(PL_curpm);
18002 nparens = RX_NPARENS(rx);
18005 /* RT #124109. This is a complete hack; in the SWASHNEW case we know
18006 * that PL_curpm will be null, but that utf8.pm and the modules it
18007 * loads will only use $1..$3.
18008 * The t/porting/re_context.t test file checks this assumption.
18013 for (i = 1; i <= nparens; i++) {
18014 char digits[TYPE_CHARS(long)];
18015 const STRLEN len = my_snprintf(digits, sizeof(digits),
18017 GV *const *const gvp
18018 = (GV**)hv_fetch(PL_defstash, digits, len, 0);
18021 GV * const gv = *gvp;
18022 if (SvTYPE(gv) == SVt_PVGV && GvSV(gv))
18032 S_put_code_point(pTHX_ SV *sv, UV c)
18034 PERL_ARGS_ASSERT_PUT_CODE_POINT;
18037 Perl_sv_catpvf(aTHX_ sv, "\\x{%04"UVXf"}", c);
18039 else if (isPRINT(c)) {
18040 const char string = (char) c;
18041 if (isBACKSLASHED_PUNCT(c))
18042 sv_catpvs(sv, "\\");
18043 sv_catpvn(sv, &string, 1);
18046 const char * const mnemonic = cntrl_to_mnemonic((char) c);
18048 Perl_sv_catpvf(aTHX_ sv, "%s", mnemonic);
18051 Perl_sv_catpvf(aTHX_ sv, "\\x{%02X}", (U8) c);
18056 #define MAX_PRINT_A MAX_PRINT_A_FOR_USE_ONLY_BY_REGCOMP_DOT_C
18059 S_put_range(pTHX_ SV *sv, UV start, const UV end, const bool allow_literals)
18061 /* Appends to 'sv' a displayable version of the range of code points from
18062 * 'start' to 'end'. It assumes that only ASCII printables are displayable
18063 * as-is (though some of these will be escaped by put_code_point()). */
18065 const unsigned int min_range_count = 3;
18067 assert(start <= end);
18069 PERL_ARGS_ASSERT_PUT_RANGE;
18071 while (start <= end) {
18073 const char * format;
18075 if (end - start < min_range_count) {
18077 /* Individual chars in short ranges */
18078 for (; start <= end; start++) {
18079 put_code_point(sv, start);
18084 /* If permitted by the input options, and there is a possibility that
18085 * this range contains a printable literal, look to see if there is
18087 if (allow_literals && start <= MAX_PRINT_A) {
18089 /* If the range begin isn't an ASCII printable, effectively split
18090 * the range into two parts:
18091 * 1) the portion before the first such printable,
18093 * and output them separately. */
18094 if (! isPRINT_A(start)) {
18095 UV temp_end = start + 1;
18097 /* There is no point looking beyond the final possible
18098 * printable, in MAX_PRINT_A */
18099 UV max = MIN(end, MAX_PRINT_A);
18101 while (temp_end <= max && ! isPRINT_A(temp_end)) {
18105 /* Here, temp_end points to one beyond the first printable if
18106 * found, or to one beyond 'max' if not. If none found, make
18107 * sure that we use the entire range */
18108 if (temp_end > MAX_PRINT_A) {
18109 temp_end = end + 1;
18112 /* Output the first part of the split range, the part that
18113 * doesn't have printables, with no looking for literals
18114 * (otherwise we would infinitely recurse) */
18115 put_range(sv, start, temp_end - 1, FALSE);
18117 /* The 2nd part of the range (if any) starts here. */
18120 /* We continue instead of dropping down because even if the 2nd
18121 * part is non-empty, it could be so short that we want to
18122 * output it specially, as tested for at the top of this loop.
18127 /* Here, 'start' is a printable ASCII. If it is an alphanumeric,
18128 * output a sub-range of just the digits or letters, then process
18129 * the remaining portion as usual. */
18130 if (isALPHANUMERIC_A(start)) {
18131 UV mask = (isDIGIT_A(start))
18136 UV temp_end = start + 1;
18138 /* Find the end of the sub-range that includes just the
18139 * characters in the same class as the first character in it */
18140 while (temp_end <= end && _generic_isCC_A(temp_end, mask)) {
18145 /* For short ranges, don't duplicate the code above to output
18146 * them; just call recursively */
18147 if (temp_end - start < min_range_count) {
18148 put_range(sv, start, temp_end, FALSE);
18150 else { /* Output as a range */
18151 put_code_point(sv, start);
18152 sv_catpvs(sv, "-");
18153 put_code_point(sv, temp_end);
18155 start = temp_end + 1;
18159 /* We output any other printables as individual characters */
18160 if (isPUNCT_A(start) || isSPACE_A(start)) {
18161 while (start <= end && (isPUNCT_A(start)
18162 || isSPACE_A(start)))
18164 put_code_point(sv, start);
18169 } /* End of looking for literals */
18171 /* Here is not to output as a literal. Some control characters have
18172 * mnemonic names. Split off any of those at the beginning and end of
18173 * the range to print mnemonically. It isn't possible for many of
18174 * these to be in a row, so this won't overwhelm with output */
18175 while (isMNEMONIC_CNTRL(start) && start <= end) {
18176 put_code_point(sv, start);
18179 if (start < end && isMNEMONIC_CNTRL(end)) {
18181 /* Here, the final character in the range has a mnemonic name.
18182 * Work backwards from the end to find the final non-mnemonic */
18183 UV temp_end = end - 1;
18184 while (isMNEMONIC_CNTRL(temp_end)) {
18188 /* And separately output the range that doesn't have mnemonics */
18189 put_range(sv, start, temp_end, FALSE);
18191 /* Then output the mnemonic trailing controls */
18192 start = temp_end + 1;
18193 while (start <= end) {
18194 put_code_point(sv, start);
18200 /* As a final resort, output the range or subrange as hex. */
18202 this_end = (end < NUM_ANYOF_CODE_POINTS)
18204 : NUM_ANYOF_CODE_POINTS - 1;
18205 #if NUM_ANYOF_CODE_POINTS > 256
18206 format = (this_end < 256)
18207 ? "\\x{%02"UVXf"}-\\x{%02"UVXf"}"
18208 : "\\x{%04"UVXf"}-\\x{%04"UVXf"}";
18210 format = "\\x{%02"UVXf"}-\\x{%02"UVXf"}";
18212 GCC_DIAG_IGNORE(-Wformat-nonliteral);
18213 Perl_sv_catpvf(aTHX_ sv, format, start, this_end);
18220 S_put_charclass_bitmap_innards(pTHX_ SV *sv, char *bitmap, SV** bitmap_invlist)
18222 /* Appends to 'sv' a displayable version of the innards of the bracketed
18223 * character class whose bitmap is 'bitmap'; Returns 'TRUE' if it actually
18224 * output anything, and bitmap_invlist, if not NULL, will point to an
18225 * inversion list of what is in the bit map */
18229 unsigned int punct_count = 0;
18230 SV* invlist = NULL;
18231 SV** invlist_ptr; /* Temporary, in case bitmap_invlist is NULL */
18232 bool allow_literals = TRUE;
18234 PERL_ARGS_ASSERT_PUT_CHARCLASS_BITMAP_INNARDS;
18236 invlist_ptr = (bitmap_invlist) ? bitmap_invlist : &invlist;
18238 /* Worst case is exactly every-other code point is in the list */
18239 *invlist_ptr = _new_invlist(NUM_ANYOF_CODE_POINTS / 2);
18241 /* Convert the bit map to an inversion list, keeping track of how many
18242 * ASCII puncts are set, including an extra amount for the backslashed
18244 for (i = 0; i < NUM_ANYOF_CODE_POINTS; i++) {
18245 if (BITMAP_TEST(bitmap, i)) {
18246 *invlist_ptr = add_cp_to_invlist(*invlist_ptr, i);
18247 if (isPUNCT_A(i)) {
18249 if isBACKSLASHED_PUNCT(i) {
18256 /* Nothing to output */
18257 if (_invlist_len(*invlist_ptr) == 0) {
18258 SvREFCNT_dec(invlist);
18262 /* Generally, it is more readable if printable characters are output as
18263 * literals, but if a range (nearly) spans all of them, it's best to output
18264 * it as a single range. This code will use a single range if all but 2
18265 * printables are in it */
18266 invlist_iterinit(*invlist_ptr);
18267 while (invlist_iternext(*invlist_ptr, &start, &end)) {
18269 /* If range starts beyond final printable, it doesn't have any in it */
18270 if (start > MAX_PRINT_A) {
18274 /* In both ASCII and EBCDIC, a SPACE is the lowest printable. To span
18275 * all but two, the range must start and end no later than 2 from
18277 if (start < ' ' + 2 && end > MAX_PRINT_A - 2) {
18278 if (end > MAX_PRINT_A) {
18284 if (end - start >= MAX_PRINT_A - ' ' - 2) {
18285 allow_literals = FALSE;
18290 invlist_iterfinish(*invlist_ptr);
18292 /* The legibility of the output depends mostly on how many punctuation
18293 * characters are output. There are 32 possible ASCII ones, and some have
18294 * an additional backslash, bringing it to currently 36, so if any more
18295 * than 18 are to be output, we can instead output it as its complement,
18296 * yielding fewer puncts, and making it more legible. But give some weight
18297 * to the fact that outputting it as a complement is less legible than a
18298 * straight output, so don't complement unless we are somewhat over the 18
18300 if (allow_literals && punct_count > 22) {
18301 sv_catpvs(sv, "^");
18303 /* Add everything remaining to the list, so when we invert it just
18304 * below, it will be excluded */
18305 _invlist_union_complement_2nd(*invlist_ptr, PL_InBitmap, invlist_ptr);
18306 _invlist_invert(*invlist_ptr);
18309 /* Here we have figured things out. Output each range */
18310 invlist_iterinit(*invlist_ptr);
18311 while (invlist_iternext(*invlist_ptr, &start, &end)) {
18312 if (start >= NUM_ANYOF_CODE_POINTS) {
18315 put_range(sv, start, end, allow_literals);
18317 invlist_iterfinish(*invlist_ptr);
18322 #define CLEAR_OPTSTART \
18323 if (optstart) STMT_START { \
18324 DEBUG_OPTIMISE_r(PerlIO_printf(Perl_debug_log, \
18325 " (%"IVdf" nodes)\n", (IV)(node - optstart))); \
18329 #define DUMPUNTIL(b,e) \
18331 node=dumpuntil(r,start,(b),(e),last,sv,indent+1,depth+1);
18333 STATIC const regnode *
18334 S_dumpuntil(pTHX_ const regexp *r, const regnode *start, const regnode *node,
18335 const regnode *last, const regnode *plast,
18336 SV* sv, I32 indent, U32 depth)
18338 U8 op = PSEUDO; /* Arbitrary non-END op. */
18339 const regnode *next;
18340 const regnode *optstart= NULL;
18342 RXi_GET_DECL(r,ri);
18343 GET_RE_DEBUG_FLAGS_DECL;
18345 PERL_ARGS_ASSERT_DUMPUNTIL;
18347 #ifdef DEBUG_DUMPUNTIL
18348 PerlIO_printf(Perl_debug_log, "--- %d : %d - %d - %d\n",indent,node-start,
18349 last ? last-start : 0,plast ? plast-start : 0);
18352 if (plast && plast < last)
18355 while (PL_regkind[op] != END && (!last || node < last)) {
18357 /* While that wasn't END last time... */
18360 if (op == CLOSE || op == WHILEM)
18362 next = regnext((regnode *)node);
18365 if (OP(node) == OPTIMIZED) {
18366 if (!optstart && RE_DEBUG_FLAG(RE_DEBUG_COMPILE_OPTIMISE))
18373 regprop(r, sv, node, NULL, NULL);
18374 PerlIO_printf(Perl_debug_log, "%4"IVdf":%*s%s", (IV)(node - start),
18375 (int)(2*indent + 1), "", SvPVX_const(sv));
18377 if (OP(node) != OPTIMIZED) {
18378 if (next == NULL) /* Next ptr. */
18379 PerlIO_printf(Perl_debug_log, " (0)");
18380 else if (PL_regkind[(U8)op] == BRANCH
18381 && PL_regkind[OP(next)] != BRANCH )
18382 PerlIO_printf(Perl_debug_log, " (FAIL)");
18384 PerlIO_printf(Perl_debug_log, " (%"IVdf")", (IV)(next - start));
18385 (void)PerlIO_putc(Perl_debug_log, '\n');
18389 if (PL_regkind[(U8)op] == BRANCHJ) {
18392 const regnode *nnode = (OP(next) == LONGJMP
18393 ? regnext((regnode *)next)
18395 if (last && nnode > last)
18397 DUMPUNTIL(NEXTOPER(NEXTOPER(node)), nnode);
18400 else if (PL_regkind[(U8)op] == BRANCH) {
18402 DUMPUNTIL(NEXTOPER(node), next);
18404 else if ( PL_regkind[(U8)op] == TRIE ) {
18405 const regnode *this_trie = node;
18406 const char op = OP(node);
18407 const U32 n = ARG(node);
18408 const reg_ac_data * const ac = op>=AHOCORASICK ?
18409 (reg_ac_data *)ri->data->data[n] :
18411 const reg_trie_data * const trie =
18412 (reg_trie_data*)ri->data->data[op<AHOCORASICK ? n : ac->trie];
18414 AV *const trie_words
18415 = MUTABLE_AV(ri->data->data[n + TRIE_WORDS_OFFSET]);
18417 const regnode *nextbranch= NULL;
18420 for (word_idx= 0; word_idx < (I32)trie->wordcount; word_idx++) {
18421 SV ** const elem_ptr = av_fetch(trie_words,word_idx,0);
18423 PerlIO_printf(Perl_debug_log, "%*s%s ",
18424 (int)(2*(indent+3)), "",
18426 ? pv_pretty(sv, SvPV_nolen_const(*elem_ptr),
18427 SvCUR(*elem_ptr), 60,
18428 PL_colors[0], PL_colors[1],
18430 ? PERL_PV_ESCAPE_UNI
18432 | PERL_PV_PRETTY_ELLIPSES
18433 | PERL_PV_PRETTY_LTGT
18438 U16 dist= trie->jump[word_idx+1];
18439 PerlIO_printf(Perl_debug_log, "(%"UVuf")\n",
18440 (UV)((dist ? this_trie + dist : next) - start));
18443 nextbranch= this_trie + trie->jump[0];
18444 DUMPUNTIL(this_trie + dist, nextbranch);
18446 if (nextbranch && PL_regkind[OP(nextbranch)]==BRANCH)
18447 nextbranch= regnext((regnode *)nextbranch);
18449 PerlIO_printf(Perl_debug_log, "\n");
18452 if (last && next > last)
18457 else if ( op == CURLY ) { /* "next" might be very big: optimizer */
18458 DUMPUNTIL(NEXTOPER(node) + EXTRA_STEP_2ARGS,
18459 NEXTOPER(node) + EXTRA_STEP_2ARGS + 1);
18461 else if (PL_regkind[(U8)op] == CURLY && op != CURLYX) {
18463 DUMPUNTIL(NEXTOPER(node) + EXTRA_STEP_2ARGS, next);
18465 else if ( op == PLUS || op == STAR) {
18466 DUMPUNTIL(NEXTOPER(node), NEXTOPER(node) + 1);
18468 else if (PL_regkind[(U8)op] == ANYOF) {
18469 /* arglen 1 + class block */
18470 node += 1 + ((ANYOF_FLAGS(node) & ANYOF_MATCHES_POSIXL)
18471 ? ANYOF_POSIXL_SKIP
18473 node = NEXTOPER(node);
18475 else if (PL_regkind[(U8)op] == EXACT) {
18476 /* Literal string, where present. */
18477 node += NODE_SZ_STR(node) - 1;
18478 node = NEXTOPER(node);
18481 node = NEXTOPER(node);
18482 node += regarglen[(U8)op];
18484 if (op == CURLYX || op == OPEN)
18488 #ifdef DEBUG_DUMPUNTIL
18489 PerlIO_printf(Perl_debug_log, "--- %d\n", (int)indent);
18494 #endif /* DEBUGGING */
18497 * ex: set ts=8 sts=4 sw=4 et: