5 * 'A fair jaw-cracker dwarf-language must be.' --Samwise Gamgee
7 * [p.285 of _The Lord of the Rings_, II/iii: "The Ring Goes South"]
10 /* This file contains functions for compiling a regular expression. See
11 * also regexec.c which funnily enough, contains functions for executing
12 * a regular expression.
14 * This file is also copied at build time to ext/re/re_comp.c, where
15 * it's built with -DPERL_EXT_RE_BUILD -DPERL_EXT_RE_DEBUG -DPERL_EXT.
16 * This causes the main functions to be compiled under new names and with
17 * debugging support added, which makes "use re 'debug'" work.
20 /* NOTE: this is derived from Henry Spencer's regexp code, and should not
21 * confused with the original package (see point 3 below). Thanks, Henry!
24 /* Additional note: this code is very heavily munged from Henry's version
25 * in places. In some spots I've traded clarity for efficiency, so don't
26 * blame Henry for some of the lack of readability.
29 /* The names of the functions have been changed from regcomp and
30 * regexec to pregcomp and pregexec in order to avoid conflicts
31 * with the POSIX routines of the same names.
34 #ifdef PERL_EXT_RE_BUILD
39 * pregcomp and pregexec -- regsub and regerror are not used in perl
41 * Copyright (c) 1986 by University of Toronto.
42 * Written by Henry Spencer. Not derived from licensed software.
44 * Permission is granted to anyone to use this software for any
45 * purpose on any computer system, and to redistribute it freely,
46 * subject to the following restrictions:
48 * 1. The author is not responsible for the consequences of use of
49 * this software, no matter how awful, even if they arise
52 * 2. The origin of this software must not be misrepresented, either
53 * by explicit claim or by omission.
55 * 3. Altered versions must be plainly marked as such, and must not
56 * be misrepresented as being the original software.
59 **** Alterations to Henry's code are...
61 **** Copyright (C) 1991, 1992, 1993, 1994, 1995, 1996, 1997, 1998, 1999,
62 **** 2000, 2001, 2002, 2003, 2004, 2005, 2006, 2007, 2008
63 **** by Larry Wall and others
65 **** You may distribute under the terms of either the GNU General Public
66 **** License or the Artistic License, as specified in the README file.
69 * Beware that some of this code is subtly aware of the way operator
70 * precedence is structured in regular expressions. Serious changes in
71 * regular-expression syntax might require a total rethink.
74 #define PERL_IN_REGCOMP_C
77 #ifndef PERL_IN_XSUB_RE
82 #ifdef PERL_IN_XSUB_RE
84 EXTERN_C const struct regexp_engine my_reg_engine;
89 #include "dquote_inline.h"
90 #include "invlist_inline.h"
91 #include "unicode_constants.h"
93 #define HAS_NONLATIN1_FOLD_CLOSURE(i) \
94 _HAS_NONLATIN1_FOLD_CLOSURE_ONLY_FOR_USE_BY_REGCOMP_DOT_C_AND_REGEXEC_DOT_C(i)
95 #define HAS_NONLATIN1_SIMPLE_FOLD_CLOSURE(i) \
96 _HAS_NONLATIN1_SIMPLE_FOLD_CLOSURE_ONLY_FOR_USE_BY_REGCOMP_DOT_C_AND_REGEXEC_DOT_C(i)
97 #define IS_NON_FINAL_FOLD(c) _IS_NON_FINAL_FOLD_ONLY_FOR_USE_BY_REGCOMP_DOT_C(c)
98 #define IS_IN_SOME_FOLD_L1(c) _IS_IN_SOME_FOLD_ONLY_FOR_USE_BY_REGCOMP_DOT_C(c)
101 #define STATIC static
105 #define MIN(a,b) ((a) < (b) ? (a) : (b))
108 /* this is a chain of data about sub patterns we are processing that
109 need to be handled separately/specially in study_chunk. Its so
110 we can simulate recursion without losing state. */
112 typedef struct scan_frame {
113 regnode *last_regnode; /* last node to process in this frame */
114 regnode *next_regnode; /* next node to process when last is reached */
115 U32 prev_recursed_depth;
116 I32 stopparen; /* what stopparen do we use */
117 U32 is_top_frame; /* what flags do we use? */
119 struct scan_frame *this_prev_frame; /* this previous frame */
120 struct scan_frame *prev_frame; /* previous frame */
121 struct scan_frame *next_frame; /* next frame */
124 /* Certain characters are output as a sequence with the first being a
126 #define isBACKSLASHED_PUNCT(c) \
127 ((c) == '-' || (c) == ']' || (c) == '\\' || (c) == '^')
130 struct RExC_state_t {
131 U32 flags; /* RXf_* are we folding, multilining? */
132 U32 pm_flags; /* PMf_* stuff from the calling PMOP */
133 char *precomp; /* uncompiled string. */
134 REGEXP *rx_sv; /* The SV that is the regexp. */
135 regexp *rx; /* perl core regexp structure */
136 regexp_internal *rxi; /* internal data for regexp object
138 char *start; /* Start of input for compile */
139 char *end; /* End of input for compile */
140 char *parse; /* Input-scan pointer. */
141 SSize_t whilem_seen; /* number of WHILEM in this expr */
142 regnode *emit_start; /* Start of emitted-code area */
143 regnode *emit_bound; /* First regnode outside of the
145 regnode *emit; /* Code-emit pointer; if = &emit_dummy,
146 implies compiling, so don't emit */
147 regnode_ssc emit_dummy; /* placeholder for emit to point to;
148 large enough for the largest
149 non-EXACTish node, so can use it as
151 I32 naughty; /* How bad is this pattern? */
152 I32 sawback; /* Did we see \1, ...? */
154 SSize_t size; /* Code size. */
155 I32 npar; /* Capture buffer count, (OPEN) plus
156 one. ("par" 0 is the whole
158 I32 nestroot; /* root parens we are in - used by
162 regnode **open_parens; /* pointers to open parens */
163 regnode **close_parens; /* pointers to close parens */
164 regnode *opend; /* END node in program */
165 I32 utf8; /* whether the pattern is utf8 or not */
166 I32 orig_utf8; /* whether the pattern was originally in utf8 */
167 /* XXX use this for future optimisation of case
168 * where pattern must be upgraded to utf8. */
169 I32 uni_semantics; /* If a d charset modifier should use unicode
170 rules, even if the pattern is not in
172 HV *paren_names; /* Paren names */
174 regnode **recurse; /* Recurse regops */
175 I32 recurse_count; /* Number of recurse regops */
176 U8 *study_chunk_recursed; /* bitmap of which subs we have moved
178 U32 study_chunk_recursed_bytes; /* bytes in bitmap */
182 I32 override_recoding;
184 I32 recode_x_to_native;
186 I32 in_multi_char_class;
187 struct reg_code_block *code_blocks; /* positions of literal (?{})
189 int num_code_blocks; /* size of code_blocks[] */
190 int code_index; /* next code_blocks[] slot */
191 SSize_t maxlen; /* mininum possible number of chars in string to match */
192 scan_frame *frame_head;
193 scan_frame *frame_last;
196 #ifdef ADD_TO_REGEXEC
197 char *starttry; /* -Dr: where regtry was called. */
198 #define RExC_starttry (pRExC_state->starttry)
200 SV *runtime_code_qr; /* qr with the runtime code blocks */
202 const char *lastparse;
204 AV *paren_name_list; /* idx -> name */
205 U32 study_chunk_recursed_count;
208 #define RExC_lastparse (pRExC_state->lastparse)
209 #define RExC_lastnum (pRExC_state->lastnum)
210 #define RExC_paren_name_list (pRExC_state->paren_name_list)
211 #define RExC_study_chunk_recursed_count (pRExC_state->study_chunk_recursed_count)
212 #define RExC_mysv (pRExC_state->mysv1)
213 #define RExC_mysv1 (pRExC_state->mysv1)
214 #define RExC_mysv2 (pRExC_state->mysv2)
217 bool seen_unfolded_sharp_s;
220 #define RExC_flags (pRExC_state->flags)
221 #define RExC_pm_flags (pRExC_state->pm_flags)
222 #define RExC_precomp (pRExC_state->precomp)
223 #define RExC_rx_sv (pRExC_state->rx_sv)
224 #define RExC_rx (pRExC_state->rx)
225 #define RExC_rxi (pRExC_state->rxi)
226 #define RExC_start (pRExC_state->start)
227 #define RExC_end (pRExC_state->end)
228 #define RExC_parse (pRExC_state->parse)
229 #define RExC_whilem_seen (pRExC_state->whilem_seen)
231 /* Set during the sizing pass when there is a LATIN SMALL LETTER SHARP S in any
232 * EXACTF node, hence was parsed under /di rules. If later in the parse,
233 * something forces the pattern into using /ui rules, the sharp s should be
234 * folded into the sequence 'ss', which takes up more space than previously
235 * calculated. This means that the sizing pass needs to be restarted. (The
236 * node also becomes an EXACTFU_SS.) For all other characters, an EXACTF node
237 * that gets converted to /ui (and EXACTFU) occupies the same amount of space,
238 * so there is no need to resize [perl #125990]. */
239 #define RExC_seen_unfolded_sharp_s (pRExC_state->seen_unfolded_sharp_s)
241 #ifdef RE_TRACK_PATTERN_OFFSETS
242 #define RExC_offsets (pRExC_state->rxi->u.offsets) /* I am not like the
245 #define RExC_emit (pRExC_state->emit)
246 #define RExC_emit_dummy (pRExC_state->emit_dummy)
247 #define RExC_emit_start (pRExC_state->emit_start)
248 #define RExC_emit_bound (pRExC_state->emit_bound)
249 #define RExC_sawback (pRExC_state->sawback)
250 #define RExC_seen (pRExC_state->seen)
251 #define RExC_size (pRExC_state->size)
252 #define RExC_maxlen (pRExC_state->maxlen)
253 #define RExC_npar (pRExC_state->npar)
254 #define RExC_nestroot (pRExC_state->nestroot)
255 #define RExC_extralen (pRExC_state->extralen)
256 #define RExC_seen_zerolen (pRExC_state->seen_zerolen)
257 #define RExC_utf8 (pRExC_state->utf8)
258 #define RExC_uni_semantics (pRExC_state->uni_semantics)
259 #define RExC_orig_utf8 (pRExC_state->orig_utf8)
260 #define RExC_open_parens (pRExC_state->open_parens)
261 #define RExC_close_parens (pRExC_state->close_parens)
262 #define RExC_opend (pRExC_state->opend)
263 #define RExC_paren_names (pRExC_state->paren_names)
264 #define RExC_recurse (pRExC_state->recurse)
265 #define RExC_recurse_count (pRExC_state->recurse_count)
266 #define RExC_study_chunk_recursed (pRExC_state->study_chunk_recursed)
267 #define RExC_study_chunk_recursed_bytes \
268 (pRExC_state->study_chunk_recursed_bytes)
269 #define RExC_in_lookbehind (pRExC_state->in_lookbehind)
270 #define RExC_contains_locale (pRExC_state->contains_locale)
271 #define RExC_contains_i (pRExC_state->contains_i)
272 #define RExC_override_recoding (pRExC_state->override_recoding)
274 # define RExC_recode_x_to_native (pRExC_state->recode_x_to_native)
276 #define RExC_in_multi_char_class (pRExC_state->in_multi_char_class)
277 #define RExC_frame_head (pRExC_state->frame_head)
278 #define RExC_frame_last (pRExC_state->frame_last)
279 #define RExC_frame_count (pRExC_state->frame_count)
280 #define RExC_strict (pRExC_state->strict)
282 /* Heuristic check on the complexity of the pattern: if TOO_NAUGHTY, we set
283 * a flag to disable back-off on the fixed/floating substrings - if it's
284 * a high complexity pattern we assume the benefit of avoiding a full match
285 * is worth the cost of checking for the substrings even if they rarely help.
287 #define RExC_naughty (pRExC_state->naughty)
288 #define TOO_NAUGHTY (10)
289 #define MARK_NAUGHTY(add) \
290 if (RExC_naughty < TOO_NAUGHTY) \
291 RExC_naughty += (add)
292 #define MARK_NAUGHTY_EXP(exp, add) \
293 if (RExC_naughty < TOO_NAUGHTY) \
294 RExC_naughty += RExC_naughty / (exp) + (add)
296 #define ISMULT1(c) ((c) == '*' || (c) == '+' || (c) == '?')
297 #define ISMULT2(s) ((*s) == '*' || (*s) == '+' || (*s) == '?' || \
298 ((*s) == '{' && regcurly(s)))
301 * Flags to be passed up and down.
303 #define WORST 0 /* Worst case. */
304 #define HASWIDTH 0x01 /* Known to match non-null strings. */
306 /* Simple enough to be STAR/PLUS operand; in an EXACTish node must be a single
307 * character. (There needs to be a case: in the switch statement in regexec.c
308 * for any node marked SIMPLE.) Note that this is not the same thing as
311 #define SPSTART 0x04 /* Starts with * or + */
312 #define POSTPONED 0x08 /* (?1),(?&name), (??{...}) or similar */
313 #define TRYAGAIN 0x10 /* Weeded out a declaration. */
314 #define RESTART_PASS1 0x20 /* Need to restart sizing pass */
315 #define NEED_UTF8 0x40 /* In conjunction with RESTART_PASS1, need to
316 calcuate sizes as UTF-8 */
318 #define REG_NODE_NUM(x) ((x) ? (int)((x)-RExC_emit_start) : -1)
320 /* whether trie related optimizations are enabled */
321 #if PERL_ENABLE_EXTENDED_TRIE_OPTIMISATION
322 #define TRIE_STUDY_OPT
323 #define FULL_TRIE_STUDY
329 #define PBYTE(u8str,paren) ((U8*)(u8str))[(paren) >> 3]
330 #define PBITVAL(paren) (1 << ((paren) & 7))
331 #define PAREN_TEST(u8str,paren) ( PBYTE(u8str,paren) & PBITVAL(paren))
332 #define PAREN_SET(u8str,paren) PBYTE(u8str,paren) |= PBITVAL(paren)
333 #define PAREN_UNSET(u8str,paren) PBYTE(u8str,paren) &= (~PBITVAL(paren))
335 #define REQUIRE_UTF8(flagp) STMT_START { \
338 *flagp = RESTART_PASS1|NEED_UTF8; \
343 /* Change from /d into /u rules, and restart the parse if we've already seen
344 * something whose size would increase as a result, by setting *flagp and
345 * returning 'restart_retval'. RExC_uni_semantics is a flag that indicates
346 * we've change to /u during the parse. */
347 #define REQUIRE_UNI_RULES(flagp, restart_retval) \
349 if (DEPENDS_SEMANTICS) { \
351 set_regex_charset(&RExC_flags, REGEX_UNICODE_CHARSET); \
352 RExC_uni_semantics = 1; \
353 if (RExC_seen_unfolded_sharp_s) { \
354 *flagp |= RESTART_PASS1; \
355 return restart_retval; \
360 /* This converts the named class defined in regcomp.h to its equivalent class
361 * number defined in handy.h. */
362 #define namedclass_to_classnum(class) ((int) ((class) / 2))
363 #define classnum_to_namedclass(classnum) ((classnum) * 2)
365 #define _invlist_union_complement_2nd(a, b, output) \
366 _invlist_union_maybe_complement_2nd(a, b, TRUE, output)
367 #define _invlist_intersection_complement_2nd(a, b, output) \
368 _invlist_intersection_maybe_complement_2nd(a, b, TRUE, output)
370 /* About scan_data_t.
372 During optimisation we recurse through the regexp program performing
373 various inplace (keyhole style) optimisations. In addition study_chunk
374 and scan_commit populate this data structure with information about
375 what strings MUST appear in the pattern. We look for the longest
376 string that must appear at a fixed location, and we look for the
377 longest string that may appear at a floating location. So for instance
382 Both 'FOO' and 'A' are fixed strings. Both 'B' and 'BAR' are floating
383 strings (because they follow a .* construct). study_chunk will identify
384 both FOO and BAR as being the longest fixed and floating strings respectively.
386 The strings can be composites, for instance
390 will result in a composite fixed substring 'foo'.
392 For each string some basic information is maintained:
394 - offset or min_offset
395 This is the position the string must appear at, or not before.
396 It also implicitly (when combined with minlenp) tells us how many
397 characters must match before the string we are searching for.
398 Likewise when combined with minlenp and the length of the string it
399 tells us how many characters must appear after the string we have
403 Only used for floating strings. This is the rightmost point that
404 the string can appear at. If set to SSize_t_MAX it indicates that the
405 string can occur infinitely far to the right.
408 A pointer to the minimum number of characters of the pattern that the
409 string was found inside. This is important as in the case of positive
410 lookahead or positive lookbehind we can have multiple patterns
415 The minimum length of the pattern overall is 3, the minimum length
416 of the lookahead part is 3, but the minimum length of the part that
417 will actually match is 1. So 'FOO's minimum length is 3, but the
418 minimum length for the F is 1. This is important as the minimum length
419 is used to determine offsets in front of and behind the string being
420 looked for. Since strings can be composites this is the length of the
421 pattern at the time it was committed with a scan_commit. Note that
422 the length is calculated by study_chunk, so that the minimum lengths
423 are not known until the full pattern has been compiled, thus the
424 pointer to the value.
428 In the case of lookbehind the string being searched for can be
429 offset past the start point of the final matching string.
430 If this value was just blithely removed from the min_offset it would
431 invalidate some of the calculations for how many chars must match
432 before or after (as they are derived from min_offset and minlen and
433 the length of the string being searched for).
434 When the final pattern is compiled and the data is moved from the
435 scan_data_t structure into the regexp structure the information
436 about lookbehind is factored in, with the information that would
437 have been lost precalculated in the end_shift field for the
440 The fields pos_min and pos_delta are used to store the minimum offset
441 and the delta to the maximum offset at the current point in the pattern.
445 typedef struct scan_data_t {
446 /*I32 len_min; unused */
447 /*I32 len_delta; unused */
451 SSize_t last_end; /* min value, <0 unless valid. */
452 SSize_t last_start_min;
453 SSize_t last_start_max;
454 SV **longest; /* Either &l_fixed, or &l_float. */
455 SV *longest_fixed; /* longest fixed string found in pattern */
456 SSize_t offset_fixed; /* offset where it starts */
457 SSize_t *minlen_fixed; /* pointer to the minlen relevant to the string */
458 I32 lookbehind_fixed; /* is the position of the string modfied by LB */
459 SV *longest_float; /* longest floating string found in pattern */
460 SSize_t offset_float_min; /* earliest point in string it can appear */
461 SSize_t offset_float_max; /* latest point in string it can appear */
462 SSize_t *minlen_float; /* pointer to the minlen relevant to the string */
463 SSize_t lookbehind_float; /* is the pos of the string modified by LB */
466 SSize_t *last_closep;
467 regnode_ssc *start_class;
471 * Forward declarations for pregcomp()'s friends.
474 static const scan_data_t zero_scan_data =
475 { 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 ,0};
477 #define SF_BEFORE_EOL (SF_BEFORE_SEOL|SF_BEFORE_MEOL)
478 #define SF_BEFORE_SEOL 0x0001
479 #define SF_BEFORE_MEOL 0x0002
480 #define SF_FIX_BEFORE_EOL (SF_FIX_BEFORE_SEOL|SF_FIX_BEFORE_MEOL)
481 #define SF_FL_BEFORE_EOL (SF_FL_BEFORE_SEOL|SF_FL_BEFORE_MEOL)
483 #define SF_FIX_SHIFT_EOL (+2)
484 #define SF_FL_SHIFT_EOL (+4)
486 #define SF_FIX_BEFORE_SEOL (SF_BEFORE_SEOL << SF_FIX_SHIFT_EOL)
487 #define SF_FIX_BEFORE_MEOL (SF_BEFORE_MEOL << SF_FIX_SHIFT_EOL)
489 #define SF_FL_BEFORE_SEOL (SF_BEFORE_SEOL << SF_FL_SHIFT_EOL)
490 #define SF_FL_BEFORE_MEOL (SF_BEFORE_MEOL << SF_FL_SHIFT_EOL) /* 0x20 */
491 #define SF_IS_INF 0x0040
492 #define SF_HAS_PAR 0x0080
493 #define SF_IN_PAR 0x0100
494 #define SF_HAS_EVAL 0x0200
495 #define SCF_DO_SUBSTR 0x0400
496 #define SCF_DO_STCLASS_AND 0x0800
497 #define SCF_DO_STCLASS_OR 0x1000
498 #define SCF_DO_STCLASS (SCF_DO_STCLASS_AND|SCF_DO_STCLASS_OR)
499 #define SCF_WHILEM_VISITED_POS 0x2000
501 #define SCF_TRIE_RESTUDY 0x4000 /* Do restudy? */
502 #define SCF_SEEN_ACCEPT 0x8000
503 #define SCF_TRIE_DOING_RESTUDY 0x10000
504 #define SCF_IN_DEFINE 0x20000
509 #define UTF cBOOL(RExC_utf8)
511 /* The enums for all these are ordered so things work out correctly */
512 #define LOC (get_regex_charset(RExC_flags) == REGEX_LOCALE_CHARSET)
513 #define DEPENDS_SEMANTICS (get_regex_charset(RExC_flags) \
514 == REGEX_DEPENDS_CHARSET)
515 #define UNI_SEMANTICS (get_regex_charset(RExC_flags) == REGEX_UNICODE_CHARSET)
516 #define AT_LEAST_UNI_SEMANTICS (get_regex_charset(RExC_flags) \
517 >= REGEX_UNICODE_CHARSET)
518 #define ASCII_RESTRICTED (get_regex_charset(RExC_flags) \
519 == REGEX_ASCII_RESTRICTED_CHARSET)
520 #define AT_LEAST_ASCII_RESTRICTED (get_regex_charset(RExC_flags) \
521 >= REGEX_ASCII_RESTRICTED_CHARSET)
522 #define ASCII_FOLD_RESTRICTED (get_regex_charset(RExC_flags) \
523 == REGEX_ASCII_MORE_RESTRICTED_CHARSET)
525 #define FOLD cBOOL(RExC_flags & RXf_PMf_FOLD)
527 /* For programs that want to be strictly Unicode compatible by dying if any
528 * attempt is made to match a non-Unicode code point against a Unicode
530 #define ALWAYS_WARN_SUPER ckDEAD(packWARN(WARN_NON_UNICODE))
532 #define OOB_NAMEDCLASS -1
534 /* There is no code point that is out-of-bounds, so this is problematic. But
535 * its only current use is to initialize a variable that is always set before
537 #define OOB_UNICODE 0xDEADBEEF
539 #define CHR_SVLEN(sv) (UTF ? sv_len_utf8(sv) : SvCUR(sv))
540 #define CHR_DIST(a,b) (UTF ? utf8_distance(a,b) : a - b)
543 /* length of regex to show in messages that don't mark a position within */
544 #define RegexLengthToShowInErrorMessages 127
547 * If MARKER[12] are adjusted, be sure to adjust the constants at the top
548 * of t/op/regmesg.t, the tests in t/op/re_tests, and those in
549 * op/pragma/warn/regcomp.
551 #define MARKER1 "<-- HERE" /* marker as it appears in the description */
552 #define MARKER2 " <-- HERE " /* marker as it appears within the regex */
554 #define REPORT_LOCATION " in regex; marked by " MARKER1 \
555 " in m/%"UTF8f MARKER2 "%"UTF8f"/"
557 #define REPORT_LOCATION_ARGS(offset) \
558 UTF8fARG(UTF, offset, RExC_precomp), \
559 UTF8fARG(UTF, RExC_end - RExC_precomp - offset, RExC_precomp + offset)
561 /* Used to point after bad bytes for an error message, but avoid skipping
562 * past a nul byte. */
563 #define SKIP_IF_CHAR(s) (!*(s) ? 0 : UTF ? UTF8SKIP(s) : 1)
566 * Calls SAVEDESTRUCTOR_X if needed, then calls Perl_croak with the given
567 * arg. Show regex, up to a maximum length. If it's too long, chop and add
570 #define _FAIL(code) STMT_START { \
571 const char *ellipses = ""; \
572 IV len = RExC_end - RExC_precomp; \
575 SAVEFREESV(RExC_rx_sv); \
576 if (len > RegexLengthToShowInErrorMessages) { \
577 /* chop 10 shorter than the max, to ensure meaning of "..." */ \
578 len = RegexLengthToShowInErrorMessages - 10; \
584 #define FAIL(msg) _FAIL( \
585 Perl_croak(aTHX_ "%s in regex m/%"UTF8f"%s/", \
586 msg, UTF8fARG(UTF, len, RExC_precomp), ellipses))
588 #define FAIL2(msg,arg) _FAIL( \
589 Perl_croak(aTHX_ msg " in regex m/%"UTF8f"%s/", \
590 arg, UTF8fARG(UTF, len, RExC_precomp), ellipses))
593 * Simple_vFAIL -- like FAIL, but marks the current location in the scan
595 #define Simple_vFAIL(m) STMT_START { \
597 (RExC_parse > RExC_end ? RExC_end : RExC_parse) - RExC_precomp; \
598 Perl_croak(aTHX_ "%s" REPORT_LOCATION, \
599 m, REPORT_LOCATION_ARGS(offset)); \
603 * Calls SAVEDESTRUCTOR_X if needed, then Simple_vFAIL()
605 #define vFAIL(m) STMT_START { \
607 SAVEFREESV(RExC_rx_sv); \
612 * Like Simple_vFAIL(), but accepts two arguments.
614 #define Simple_vFAIL2(m,a1) STMT_START { \
615 const IV offset = RExC_parse - RExC_precomp; \
616 S_re_croak2(aTHX_ UTF, m, REPORT_LOCATION, a1, \
617 REPORT_LOCATION_ARGS(offset)); \
621 * Calls SAVEDESTRUCTOR_X if needed, then Simple_vFAIL2().
623 #define vFAIL2(m,a1) STMT_START { \
625 SAVEFREESV(RExC_rx_sv); \
626 Simple_vFAIL2(m, a1); \
631 * Like Simple_vFAIL(), but accepts three arguments.
633 #define Simple_vFAIL3(m, a1, a2) STMT_START { \
634 const IV offset = RExC_parse - RExC_precomp; \
635 S_re_croak2(aTHX_ UTF, m, REPORT_LOCATION, a1, a2, \
636 REPORT_LOCATION_ARGS(offset)); \
640 * Calls SAVEDESTRUCTOR_X if needed, then Simple_vFAIL3().
642 #define vFAIL3(m,a1,a2) STMT_START { \
644 SAVEFREESV(RExC_rx_sv); \
645 Simple_vFAIL3(m, a1, a2); \
649 * Like Simple_vFAIL(), but accepts four arguments.
651 #define Simple_vFAIL4(m, a1, a2, a3) STMT_START { \
652 const IV offset = RExC_parse - RExC_precomp; \
653 S_re_croak2(aTHX_ UTF, m, REPORT_LOCATION, a1, a2, a3, \
654 REPORT_LOCATION_ARGS(offset)); \
657 #define vFAIL4(m,a1,a2,a3) STMT_START { \
659 SAVEFREESV(RExC_rx_sv); \
660 Simple_vFAIL4(m, a1, a2, a3); \
663 /* A specialized version of vFAIL2 that works with UTF8f */
664 #define vFAIL2utf8f(m, a1) STMT_START { \
665 const IV offset = RExC_parse - RExC_precomp; \
667 SAVEFREESV(RExC_rx_sv); \
668 S_re_croak2(aTHX_ UTF, m, REPORT_LOCATION, a1, \
669 REPORT_LOCATION_ARGS(offset)); \
672 #define vFAIL3utf8f(m, a1, a2) STMT_START { \
673 const IV offset = RExC_parse - RExC_precomp; \
675 SAVEFREESV(RExC_rx_sv); \
676 S_re_croak2(aTHX_ UTF, m, REPORT_LOCATION, a1, a2, \
677 REPORT_LOCATION_ARGS(offset)); \
680 /* These have asserts in them because of [perl #122671] Many warnings in
681 * regcomp.c can occur twice. If they get output in pass1 and later in that
682 * pass, the pattern has to be converted to UTF-8 and the pass restarted, they
683 * would get output again. So they should be output in pass2, and these
684 * asserts make sure new warnings follow that paradigm. */
686 /* m is not necessarily a "literal string", in this macro */
687 #define reg_warn_non_literal_string(loc, m) STMT_START { \
688 const IV offset = loc - RExC_precomp; \
689 __ASSERT_(PASS2) Perl_warner(aTHX_ packWARN(WARN_REGEXP), "%s" REPORT_LOCATION, \
690 m, REPORT_LOCATION_ARGS(offset)); \
693 #define ckWARNreg(loc,m) STMT_START { \
694 const IV offset = loc - RExC_precomp; \
695 __ASSERT_(PASS2) Perl_ck_warner(aTHX_ packWARN(WARN_REGEXP), m REPORT_LOCATION, \
696 REPORT_LOCATION_ARGS(offset)); \
699 #define vWARN(loc, m) STMT_START { \
700 const IV offset = loc - RExC_precomp; \
701 __ASSERT_(PASS2) Perl_warner(aTHX_ packWARN(WARN_REGEXP), m REPORT_LOCATION, \
702 REPORT_LOCATION_ARGS(offset)); \
705 #define vWARN_dep(loc, m) STMT_START { \
706 const IV offset = loc - RExC_precomp; \
707 __ASSERT_(PASS2) Perl_warner(aTHX_ packWARN(WARN_DEPRECATED), m REPORT_LOCATION, \
708 REPORT_LOCATION_ARGS(offset)); \
711 #define ckWARNdep(loc,m) STMT_START { \
712 const IV offset = loc - RExC_precomp; \
713 __ASSERT_(PASS2) Perl_ck_warner_d(aTHX_ packWARN(WARN_DEPRECATED), \
715 REPORT_LOCATION_ARGS(offset)); \
718 #define ckWARNregdep(loc,m) STMT_START { \
719 const IV offset = loc - RExC_precomp; \
720 __ASSERT_(PASS2) Perl_ck_warner_d(aTHX_ packWARN2(WARN_DEPRECATED, WARN_REGEXP), \
722 REPORT_LOCATION_ARGS(offset)); \
725 #define ckWARN2reg_d(loc,m, a1) STMT_START { \
726 const IV offset = loc - RExC_precomp; \
727 __ASSERT_(PASS2) Perl_ck_warner_d(aTHX_ packWARN(WARN_REGEXP), \
729 a1, REPORT_LOCATION_ARGS(offset)); \
732 #define ckWARN2reg(loc, m, a1) STMT_START { \
733 const IV offset = loc - RExC_precomp; \
734 __ASSERT_(PASS2) Perl_ck_warner(aTHX_ packWARN(WARN_REGEXP), m REPORT_LOCATION, \
735 a1, REPORT_LOCATION_ARGS(offset)); \
738 #define vWARN3(loc, m, a1, a2) STMT_START { \
739 const IV offset = loc - RExC_precomp; \
740 __ASSERT_(PASS2) Perl_warner(aTHX_ packWARN(WARN_REGEXP), m REPORT_LOCATION, \
741 a1, a2, REPORT_LOCATION_ARGS(offset)); \
744 #define ckWARN3reg(loc, m, a1, a2) STMT_START { \
745 const IV offset = loc - RExC_precomp; \
746 __ASSERT_(PASS2) Perl_ck_warner(aTHX_ packWARN(WARN_REGEXP), m REPORT_LOCATION, \
747 a1, a2, REPORT_LOCATION_ARGS(offset)); \
750 #define vWARN4(loc, m, a1, a2, a3) STMT_START { \
751 const IV offset = loc - RExC_precomp; \
752 __ASSERT_(PASS2) Perl_warner(aTHX_ packWARN(WARN_REGEXP), m REPORT_LOCATION, \
753 a1, a2, a3, REPORT_LOCATION_ARGS(offset)); \
756 #define ckWARN4reg(loc, m, a1, a2, a3) STMT_START { \
757 const IV offset = loc - RExC_precomp; \
758 __ASSERT_(PASS2) Perl_ck_warner(aTHX_ packWARN(WARN_REGEXP), m REPORT_LOCATION, \
759 a1, a2, a3, REPORT_LOCATION_ARGS(offset)); \
762 #define vWARN5(loc, m, a1, a2, a3, a4) STMT_START { \
763 const IV offset = loc - RExC_precomp; \
764 __ASSERT_(PASS2) Perl_warner(aTHX_ packWARN(WARN_REGEXP), m REPORT_LOCATION, \
765 a1, a2, a3, a4, REPORT_LOCATION_ARGS(offset)); \
768 /* Macros for recording node offsets. 20001227 mjd@plover.com
769 * Nodes are numbered 1, 2, 3, 4. Node #n's position is recorded in
770 * element 2*n-1 of the array. Element #2n holds the byte length node #n.
771 * Element 0 holds the number n.
772 * Position is 1 indexed.
774 #ifndef RE_TRACK_PATTERN_OFFSETS
775 #define Set_Node_Offset_To_R(node,byte)
776 #define Set_Node_Offset(node,byte)
777 #define Set_Cur_Node_Offset
778 #define Set_Node_Length_To_R(node,len)
779 #define Set_Node_Length(node,len)
780 #define Set_Node_Cur_Length(node,start)
781 #define Node_Offset(n)
782 #define Node_Length(n)
783 #define Set_Node_Offset_Length(node,offset,len)
784 #define ProgLen(ri) ri->u.proglen
785 #define SetProgLen(ri,x) ri->u.proglen = x
787 #define ProgLen(ri) ri->u.offsets[0]
788 #define SetProgLen(ri,x) ri->u.offsets[0] = x
789 #define Set_Node_Offset_To_R(node,byte) STMT_START { \
791 MJD_OFFSET_DEBUG(("** (%d) offset of node %d is %d.\n", \
792 __LINE__, (int)(node), (int)(byte))); \
794 Perl_croak(aTHX_ "value of node is %d in Offset macro", \
797 RExC_offsets[2*(node)-1] = (byte); \
802 #define Set_Node_Offset(node,byte) \
803 Set_Node_Offset_To_R((node)-RExC_emit_start, (byte)-RExC_start)
804 #define Set_Cur_Node_Offset Set_Node_Offset(RExC_emit, RExC_parse)
806 #define Set_Node_Length_To_R(node,len) STMT_START { \
808 MJD_OFFSET_DEBUG(("** (%d) size of node %d is %d.\n", \
809 __LINE__, (int)(node), (int)(len))); \
811 Perl_croak(aTHX_ "value of node is %d in Length macro", \
814 RExC_offsets[2*(node)] = (len); \
819 #define Set_Node_Length(node,len) \
820 Set_Node_Length_To_R((node)-RExC_emit_start, len)
821 #define Set_Node_Cur_Length(node, start) \
822 Set_Node_Length(node, RExC_parse - start)
824 /* Get offsets and lengths */
825 #define Node_Offset(n) (RExC_offsets[2*((n)-RExC_emit_start)-1])
826 #define Node_Length(n) (RExC_offsets[2*((n)-RExC_emit_start)])
828 #define Set_Node_Offset_Length(node,offset,len) STMT_START { \
829 Set_Node_Offset_To_R((node)-RExC_emit_start, (offset)); \
830 Set_Node_Length_To_R((node)-RExC_emit_start, (len)); \
834 #if PERL_ENABLE_EXPERIMENTAL_REGEX_OPTIMISATIONS
835 #define EXPERIMENTAL_INPLACESCAN
836 #endif /*PERL_ENABLE_EXPERIMENTAL_REGEX_OPTIMISATIONS*/
838 #define DEBUG_RExC_seen() \
839 DEBUG_OPTIMISE_MORE_r({ \
840 PerlIO_printf(Perl_debug_log,"RExC_seen: "); \
842 if (RExC_seen & REG_ZERO_LEN_SEEN) \
843 PerlIO_printf(Perl_debug_log,"REG_ZERO_LEN_SEEN "); \
845 if (RExC_seen & REG_LOOKBEHIND_SEEN) \
846 PerlIO_printf(Perl_debug_log,"REG_LOOKBEHIND_SEEN "); \
848 if (RExC_seen & REG_GPOS_SEEN) \
849 PerlIO_printf(Perl_debug_log,"REG_GPOS_SEEN "); \
851 if (RExC_seen & REG_RECURSE_SEEN) \
852 PerlIO_printf(Perl_debug_log,"REG_RECURSE_SEEN "); \
854 if (RExC_seen & REG_TOP_LEVEL_BRANCHES_SEEN) \
855 PerlIO_printf(Perl_debug_log,"REG_TOP_LEVEL_BRANCHES_SEEN "); \
857 if (RExC_seen & REG_VERBARG_SEEN) \
858 PerlIO_printf(Perl_debug_log,"REG_VERBARG_SEEN "); \
860 if (RExC_seen & REG_CUTGROUP_SEEN) \
861 PerlIO_printf(Perl_debug_log,"REG_CUTGROUP_SEEN "); \
863 if (RExC_seen & REG_RUN_ON_COMMENT_SEEN) \
864 PerlIO_printf(Perl_debug_log,"REG_RUN_ON_COMMENT_SEEN "); \
866 if (RExC_seen & REG_UNFOLDED_MULTI_SEEN) \
867 PerlIO_printf(Perl_debug_log,"REG_UNFOLDED_MULTI_SEEN "); \
869 if (RExC_seen & REG_GOSTART_SEEN) \
870 PerlIO_printf(Perl_debug_log,"REG_GOSTART_SEEN "); \
872 if (RExC_seen & REG_UNBOUNDED_QUANTIFIER_SEEN) \
873 PerlIO_printf(Perl_debug_log,"REG_UNBOUNDED_QUANTIFIER_SEEN "); \
875 PerlIO_printf(Perl_debug_log,"\n"); \
878 #define DEBUG_SHOW_STUDY_FLAG(flags,flag) \
879 if ((flags) & flag) PerlIO_printf(Perl_debug_log, "%s ", #flag)
881 #define DEBUG_SHOW_STUDY_FLAGS(flags,open_str,close_str) \
883 PerlIO_printf(Perl_debug_log, "%s", open_str); \
884 DEBUG_SHOW_STUDY_FLAG(flags,SF_FL_BEFORE_SEOL); \
885 DEBUG_SHOW_STUDY_FLAG(flags,SF_FL_BEFORE_MEOL); \
886 DEBUG_SHOW_STUDY_FLAG(flags,SF_IS_INF); \
887 DEBUG_SHOW_STUDY_FLAG(flags,SF_HAS_PAR); \
888 DEBUG_SHOW_STUDY_FLAG(flags,SF_IN_PAR); \
889 DEBUG_SHOW_STUDY_FLAG(flags,SF_HAS_EVAL); \
890 DEBUG_SHOW_STUDY_FLAG(flags,SCF_DO_SUBSTR); \
891 DEBUG_SHOW_STUDY_FLAG(flags,SCF_DO_STCLASS_AND); \
892 DEBUG_SHOW_STUDY_FLAG(flags,SCF_DO_STCLASS_OR); \
893 DEBUG_SHOW_STUDY_FLAG(flags,SCF_DO_STCLASS); \
894 DEBUG_SHOW_STUDY_FLAG(flags,SCF_WHILEM_VISITED_POS); \
895 DEBUG_SHOW_STUDY_FLAG(flags,SCF_TRIE_RESTUDY); \
896 DEBUG_SHOW_STUDY_FLAG(flags,SCF_SEEN_ACCEPT); \
897 DEBUG_SHOW_STUDY_FLAG(flags,SCF_TRIE_DOING_RESTUDY); \
898 DEBUG_SHOW_STUDY_FLAG(flags,SCF_IN_DEFINE); \
899 PerlIO_printf(Perl_debug_log, "%s", close_str); \
903 #define DEBUG_STUDYDATA(str,data,depth) \
904 DEBUG_OPTIMISE_MORE_r(if(data){ \
905 PerlIO_printf(Perl_debug_log, \
906 "%*s" str "Pos:%"IVdf"/%"IVdf \
908 (int)(depth)*2, "", \
909 (IV)((data)->pos_min), \
910 (IV)((data)->pos_delta), \
911 (UV)((data)->flags) \
913 DEBUG_SHOW_STUDY_FLAGS((data)->flags," [ ","]"); \
914 PerlIO_printf(Perl_debug_log, \
915 " Whilem_c: %"IVdf" Lcp: %"IVdf" %s", \
916 (IV)((data)->whilem_c), \
917 (IV)((data)->last_closep ? *((data)->last_closep) : -1), \
918 is_inf ? "INF " : "" \
920 if ((data)->last_found) \
921 PerlIO_printf(Perl_debug_log, \
922 "Last:'%s' %"IVdf":%"IVdf"/%"IVdf" %sFixed:'%s' @ %"IVdf \
923 " %sFloat: '%s' @ %"IVdf"/%"IVdf"", \
924 SvPVX_const((data)->last_found), \
925 (IV)((data)->last_end), \
926 (IV)((data)->last_start_min), \
927 (IV)((data)->last_start_max), \
928 ((data)->longest && \
929 (data)->longest==&((data)->longest_fixed)) ? "*" : "", \
930 SvPVX_const((data)->longest_fixed), \
931 (IV)((data)->offset_fixed), \
932 ((data)->longest && \
933 (data)->longest==&((data)->longest_float)) ? "*" : "", \
934 SvPVX_const((data)->longest_float), \
935 (IV)((data)->offset_float_min), \
936 (IV)((data)->offset_float_max) \
938 PerlIO_printf(Perl_debug_log,"\n"); \
941 /* is c a control character for which we have a mnemonic? */
942 #define isMNEMONIC_CNTRL(c) _IS_MNEMONIC_CNTRL_ONLY_FOR_USE_BY_REGCOMP_DOT_C(c)
945 S_cntrl_to_mnemonic(const U8 c)
947 /* Returns the mnemonic string that represents character 'c', if one
948 * exists; NULL otherwise. The only ones that exist for the purposes of
949 * this routine are a few control characters */
952 case '\a': return "\\a";
953 case '\b': return "\\b";
954 case ESC_NATIVE: return "\\e";
955 case '\f': return "\\f";
956 case '\n': return "\\n";
957 case '\r': return "\\r";
958 case '\t': return "\\t";
964 /* Mark that we cannot extend a found fixed substring at this point.
965 Update the longest found anchored substring and the longest found
966 floating substrings if needed. */
969 S_scan_commit(pTHX_ const RExC_state_t *pRExC_state, scan_data_t *data,
970 SSize_t *minlenp, int is_inf)
972 const STRLEN l = CHR_SVLEN(data->last_found);
973 const STRLEN old_l = CHR_SVLEN(*data->longest);
974 GET_RE_DEBUG_FLAGS_DECL;
976 PERL_ARGS_ASSERT_SCAN_COMMIT;
978 if ((l >= old_l) && ((l > old_l) || (data->flags & SF_BEFORE_EOL))) {
979 SvSetMagicSV(*data->longest, data->last_found);
980 if (*data->longest == data->longest_fixed) {
981 data->offset_fixed = l ? data->last_start_min : data->pos_min;
982 if (data->flags & SF_BEFORE_EOL)
984 |= ((data->flags & SF_BEFORE_EOL) << SF_FIX_SHIFT_EOL);
986 data->flags &= ~SF_FIX_BEFORE_EOL;
987 data->minlen_fixed=minlenp;
988 data->lookbehind_fixed=0;
990 else { /* *data->longest == data->longest_float */
991 data->offset_float_min = l ? data->last_start_min : data->pos_min;
992 data->offset_float_max = (l
993 ? data->last_start_max
994 : (data->pos_delta > SSize_t_MAX - data->pos_min
996 : data->pos_min + data->pos_delta));
998 || (STRLEN)data->offset_float_max > (STRLEN)SSize_t_MAX)
999 data->offset_float_max = SSize_t_MAX;
1000 if (data->flags & SF_BEFORE_EOL)
1002 |= ((data->flags & SF_BEFORE_EOL) << SF_FL_SHIFT_EOL);
1004 data->flags &= ~SF_FL_BEFORE_EOL;
1005 data->minlen_float=minlenp;
1006 data->lookbehind_float=0;
1009 SvCUR_set(data->last_found, 0);
1011 SV * const sv = data->last_found;
1012 if (SvUTF8(sv) && SvMAGICAL(sv)) {
1013 MAGIC * const mg = mg_find(sv, PERL_MAGIC_utf8);
1018 data->last_end = -1;
1019 data->flags &= ~SF_BEFORE_EOL;
1020 DEBUG_STUDYDATA("commit: ",data,0);
1023 /* An SSC is just a regnode_charclass_posix with an extra field: the inversion
1024 * list that describes which code points it matches */
1027 S_ssc_anything(pTHX_ regnode_ssc *ssc)
1029 /* Set the SSC 'ssc' to match an empty string or any code point */
1031 PERL_ARGS_ASSERT_SSC_ANYTHING;
1033 assert(is_ANYOF_SYNTHETIC(ssc));
1035 ssc->invlist = sv_2mortal(_new_invlist(2)); /* mortalize so won't leak */
1036 _append_range_to_invlist(ssc->invlist, 0, UV_MAX);
1037 ANYOF_FLAGS(ssc) |= SSC_MATCHES_EMPTY_STRING; /* Plus matches empty */
1041 S_ssc_is_anything(const regnode_ssc *ssc)
1043 /* Returns TRUE if the SSC 'ssc' can match the empty string and any code
1044 * point; FALSE otherwise. Thus, this is used to see if using 'ssc' buys
1045 * us anything: if the function returns TRUE, 'ssc' hasn't been restricted
1046 * in any way, so there's no point in using it */
1051 PERL_ARGS_ASSERT_SSC_IS_ANYTHING;
1053 assert(is_ANYOF_SYNTHETIC(ssc));
1055 if (! (ANYOF_FLAGS(ssc) & SSC_MATCHES_EMPTY_STRING)) {
1059 /* See if the list consists solely of the range 0 - Infinity */
1060 invlist_iterinit(ssc->invlist);
1061 ret = invlist_iternext(ssc->invlist, &start, &end)
1065 invlist_iterfinish(ssc->invlist);
1071 /* If e.g., both \w and \W are set, matches everything */
1072 if (ANYOF_POSIXL_SSC_TEST_ANY_SET(ssc)) {
1074 for (i = 0; i < ANYOF_POSIXL_MAX; i += 2) {
1075 if (ANYOF_POSIXL_TEST(ssc, i) && ANYOF_POSIXL_TEST(ssc, i+1)) {
1085 S_ssc_init(pTHX_ const RExC_state_t *pRExC_state, regnode_ssc *ssc)
1087 /* Initializes the SSC 'ssc'. This includes setting it to match an empty
1088 * string, any code point, or any posix class under locale */
1090 PERL_ARGS_ASSERT_SSC_INIT;
1092 Zero(ssc, 1, regnode_ssc);
1093 set_ANYOF_SYNTHETIC(ssc);
1094 ARG_SET(ssc, ANYOF_ONLY_HAS_BITMAP);
1097 /* If any portion of the regex is to operate under locale rules that aren't
1098 * fully known at compile time, initialization includes it. The reason
1099 * this isn't done for all regexes is that the optimizer was written under
1100 * the assumption that locale was all-or-nothing. Given the complexity and
1101 * lack of documentation in the optimizer, and that there are inadequate
1102 * test cases for locale, many parts of it may not work properly, it is
1103 * safest to avoid locale unless necessary. */
1104 if (RExC_contains_locale) {
1105 ANYOF_POSIXL_SETALL(ssc);
1108 ANYOF_POSIXL_ZERO(ssc);
1113 S_ssc_is_cp_posixl_init(const RExC_state_t *pRExC_state,
1114 const regnode_ssc *ssc)
1116 /* Returns TRUE if the SSC 'ssc' is in its initial state with regard only
1117 * to the list of code points matched, and locale posix classes; hence does
1118 * not check its flags) */
1123 PERL_ARGS_ASSERT_SSC_IS_CP_POSIXL_INIT;
1125 assert(is_ANYOF_SYNTHETIC(ssc));
1127 invlist_iterinit(ssc->invlist);
1128 ret = invlist_iternext(ssc->invlist, &start, &end)
1132 invlist_iterfinish(ssc->invlist);
1138 if (RExC_contains_locale && ! ANYOF_POSIXL_SSC_TEST_ALL_SET(ssc)) {
1146 S_get_ANYOF_cp_list_for_ssc(pTHX_ const RExC_state_t *pRExC_state,
1147 const regnode_charclass* const node)
1149 /* Returns a mortal inversion list defining which code points are matched
1150 * by 'node', which is of type ANYOF. Handles complementing the result if
1151 * appropriate. If some code points aren't knowable at this time, the
1152 * returned list must, and will, contain every code point that is a
1155 SV* invlist = sv_2mortal(_new_invlist(0));
1156 SV* only_utf8_locale_invlist = NULL;
1158 const U32 n = ARG(node);
1159 bool new_node_has_latin1 = FALSE;
1161 PERL_ARGS_ASSERT_GET_ANYOF_CP_LIST_FOR_SSC;
1163 /* Look at the data structure created by S_set_ANYOF_arg() */
1164 if (n != ANYOF_ONLY_HAS_BITMAP) {
1165 SV * const rv = MUTABLE_SV(RExC_rxi->data->data[n]);
1166 AV * const av = MUTABLE_AV(SvRV(rv));
1167 SV **const ary = AvARRAY(av);
1168 assert(RExC_rxi->data->what[n] == 's');
1170 if (ary[1] && ary[1] != &PL_sv_undef) { /* Has compile-time swash */
1171 invlist = sv_2mortal(invlist_clone(_get_swash_invlist(ary[1])));
1173 else if (ary[0] && ary[0] != &PL_sv_undef) {
1175 /* Here, no compile-time swash, and there are things that won't be
1176 * known until runtime -- we have to assume it could be anything */
1177 return _add_range_to_invlist(invlist, 0, UV_MAX);
1179 else if (ary[3] && ary[3] != &PL_sv_undef) {
1181 /* Here no compile-time swash, and no run-time only data. Use the
1182 * node's inversion list */
1183 invlist = sv_2mortal(invlist_clone(ary[3]));
1186 /* Get the code points valid only under UTF-8 locales */
1187 if ((ANYOF_FLAGS(node) & ANYOF_LOC_FOLD)
1188 && ary[2] && ary[2] != &PL_sv_undef)
1190 only_utf8_locale_invlist = ary[2];
1194 /* An ANYOF node contains a bitmap for the first NUM_ANYOF_CODE_POINTS
1195 * code points, and an inversion list for the others, but if there are code
1196 * points that should match only conditionally on the target string being
1197 * UTF-8, those are placed in the inversion list, and not the bitmap.
1198 * Since there are circumstances under which they could match, they are
1199 * included in the SSC. But if the ANYOF node is to be inverted, we have
1200 * to exclude them here, so that when we invert below, the end result
1201 * actually does include them. (Think about "\xe0" =~ /[^\xc0]/di;). We
1202 * have to do this here before we add the unconditionally matched code
1204 if (ANYOF_FLAGS(node) & ANYOF_INVERT) {
1205 _invlist_intersection_complement_2nd(invlist,
1210 /* Add in the points from the bit map */
1211 for (i = 0; i < NUM_ANYOF_CODE_POINTS; i++) {
1212 if (ANYOF_BITMAP_TEST(node, i)) {
1213 invlist = add_cp_to_invlist(invlist, i);
1214 new_node_has_latin1 = TRUE;
1218 /* If this can match all upper Latin1 code points, have to add them
1220 if (OP(node) == ANYOFD
1221 && (ANYOF_FLAGS(node) & ANYOF_SHARED_d_MATCHES_ALL_NON_UTF8_NON_ASCII_non_d_WARN_SUPER))
1223 _invlist_union(invlist, PL_UpperLatin1, &invlist);
1226 /* Similarly for these */
1227 if (ANYOF_FLAGS(node) & ANYOF_MATCHES_ALL_ABOVE_BITMAP) {
1228 _invlist_union_complement_2nd(invlist, PL_InBitmap, &invlist);
1231 if (ANYOF_FLAGS(node) & ANYOF_INVERT) {
1232 _invlist_invert(invlist);
1234 else if (new_node_has_latin1 && ANYOF_FLAGS(node) & ANYOF_LOC_FOLD) {
1236 /* Under /li, any 0-255 could fold to any other 0-255, depending on the
1237 * locale. We can skip this if there are no 0-255 at all. */
1238 _invlist_union(invlist, PL_Latin1, &invlist);
1241 /* Similarly add the UTF-8 locale possible matches. These have to be
1242 * deferred until after the non-UTF-8 locale ones are taken care of just
1243 * above, or it leads to wrong results under ANYOF_INVERT */
1244 if (only_utf8_locale_invlist) {
1245 _invlist_union_maybe_complement_2nd(invlist,
1246 only_utf8_locale_invlist,
1247 ANYOF_FLAGS(node) & ANYOF_INVERT,
1254 /* These two functions currently do the exact same thing */
1255 #define ssc_init_zero ssc_init
1257 #define ssc_add_cp(ssc, cp) ssc_add_range((ssc), (cp), (cp))
1258 #define ssc_match_all_cp(ssc) ssc_add_range(ssc, 0, UV_MAX)
1260 /* 'AND' a given class with another one. Can create false positives. 'ssc'
1261 * should not be inverted. 'and_with->flags & ANYOF_MATCHES_POSIXL' should be
1262 * 0 if 'and_with' is a regnode_charclass instead of a regnode_ssc. */
1265 S_ssc_and(pTHX_ const RExC_state_t *pRExC_state, regnode_ssc *ssc,
1266 const regnode_charclass *and_with)
1268 /* Accumulate into SSC 'ssc' its 'AND' with 'and_with', which is either
1269 * another SSC or a regular ANYOF class. Can create false positives. */
1274 PERL_ARGS_ASSERT_SSC_AND;
1276 assert(is_ANYOF_SYNTHETIC(ssc));
1278 /* 'and_with' is used as-is if it too is an SSC; otherwise have to extract
1279 * the code point inversion list and just the relevant flags */
1280 if (is_ANYOF_SYNTHETIC(and_with)) {
1281 anded_cp_list = ((regnode_ssc *)and_with)->invlist;
1282 anded_flags = ANYOF_FLAGS(and_with);
1284 /* XXX This is a kludge around what appears to be deficiencies in the
1285 * optimizer. If we make S_ssc_anything() add in the WARN_SUPER flag,
1286 * there are paths through the optimizer where it doesn't get weeded
1287 * out when it should. And if we don't make some extra provision for
1288 * it like the code just below, it doesn't get added when it should.
1289 * This solution is to add it only when AND'ing, which is here, and
1290 * only when what is being AND'ed is the pristine, original node
1291 * matching anything. Thus it is like adding it to ssc_anything() but
1292 * only when the result is to be AND'ed. Probably the same solution
1293 * could be adopted for the same problem we have with /l matching,
1294 * which is solved differently in S_ssc_init(), and that would lead to
1295 * fewer false positives than that solution has. But if this solution
1296 * creates bugs, the consequences are only that a warning isn't raised
1297 * that should be; while the consequences for having /l bugs is
1298 * incorrect matches */
1299 if (ssc_is_anything((regnode_ssc *)and_with)) {
1300 anded_flags |= ANYOF_SHARED_d_MATCHES_ALL_NON_UTF8_NON_ASCII_non_d_WARN_SUPER;
1304 anded_cp_list = get_ANYOF_cp_list_for_ssc(pRExC_state, and_with);
1305 if (OP(and_with) == ANYOFD) {
1306 anded_flags = ANYOF_FLAGS(and_with) & ANYOF_COMMON_FLAGS;
1309 anded_flags = ANYOF_FLAGS(and_with)
1310 &( ANYOF_COMMON_FLAGS
1311 |ANYOF_SHARED_d_MATCHES_ALL_NON_UTF8_NON_ASCII_non_d_WARN_SUPER);
1315 ANYOF_FLAGS(ssc) &= anded_flags;
1317 /* Below, C1 is the list of code points in 'ssc'; P1, its posix classes.
1318 * C2 is the list of code points in 'and-with'; P2, its posix classes.
1319 * 'and_with' may be inverted. When not inverted, we have the situation of
1321 * (C1 | P1) & (C2 | P2)
1322 * = (C1 & (C2 | P2)) | (P1 & (C2 | P2))
1323 * = ((C1 & C2) | (C1 & P2)) | ((P1 & C2) | (P1 & P2))
1324 * <= ((C1 & C2) | P2)) | ( P1 | (P1 & P2))
1325 * <= ((C1 & C2) | P1 | P2)
1326 * Alternatively, the last few steps could be:
1327 * = ((C1 & C2) | (C1 & P2)) | ((P1 & C2) | (P1 & P2))
1328 * <= ((C1 & C2) | C1 ) | ( C2 | (P1 & P2))
1329 * <= (C1 | C2 | (P1 & P2))
1330 * We favor the second approach if either P1 or P2 is non-empty. This is
1331 * because these components are a barrier to doing optimizations, as what
1332 * they match cannot be known until the moment of matching as they are
1333 * dependent on the current locale, 'AND"ing them likely will reduce or
1335 * But we can do better if we know that C1,P1 are in their initial state (a
1336 * frequent occurrence), each matching everything:
1337 * (<everything>) & (C2 | P2) = C2 | P2
1338 * Similarly, if C2,P2 are in their initial state (again a frequent
1339 * occurrence), the result is a no-op
1340 * (C1 | P1) & (<everything>) = C1 | P1
1343 * (C1 | P1) & ~(C2 | P2) = (C1 | P1) & (~C2 & ~P2)
1344 * = (C1 & (~C2 & ~P2)) | (P1 & (~C2 & ~P2))
1345 * <= (C1 & ~C2) | (P1 & ~P2)
1348 if ((ANYOF_FLAGS(and_with) & ANYOF_INVERT)
1349 && ! is_ANYOF_SYNTHETIC(and_with))
1353 ssc_intersection(ssc,
1355 FALSE /* Has already been inverted */
1358 /* If either P1 or P2 is empty, the intersection will be also; can skip
1360 if (! (ANYOF_FLAGS(and_with) & ANYOF_MATCHES_POSIXL)) {
1361 ANYOF_POSIXL_ZERO(ssc);
1363 else if (ANYOF_POSIXL_SSC_TEST_ANY_SET(ssc)) {
1365 /* Note that the Posix class component P from 'and_with' actually
1367 * P = Pa | Pb | ... | Pn
1368 * where each component is one posix class, such as in [\w\s].
1370 * ~P = ~(Pa | Pb | ... | Pn)
1371 * = ~Pa & ~Pb & ... & ~Pn
1372 * <= ~Pa | ~Pb | ... | ~Pn
1373 * The last is something we can easily calculate, but unfortunately
1374 * is likely to have many false positives. We could do better
1375 * in some (but certainly not all) instances if two classes in
1376 * P have known relationships. For example
1377 * :lower: <= :alpha: <= :alnum: <= \w <= :graph: <= :print:
1379 * :lower: & :print: = :lower:
1380 * And similarly for classes that must be disjoint. For example,
1381 * since \s and \w can have no elements in common based on rules in
1382 * the POSIX standard,
1383 * \w & ^\S = nothing
1384 * Unfortunately, some vendor locales do not meet the Posix
1385 * standard, in particular almost everything by Microsoft.
1386 * The loop below just changes e.g., \w into \W and vice versa */
1388 regnode_charclass_posixl temp;
1389 int add = 1; /* To calculate the index of the complement */
1391 ANYOF_POSIXL_ZERO(&temp);
1392 for (i = 0; i < ANYOF_MAX; i++) {
1394 || ! ANYOF_POSIXL_TEST((regnode_charclass_posixl*) and_with, i)
1395 || ! ANYOF_POSIXL_TEST((regnode_charclass_posixl*) and_with, i + 1));
1397 if (ANYOF_POSIXL_TEST((regnode_charclass_posixl*) and_with, i)) {
1398 ANYOF_POSIXL_SET(&temp, i + add);
1400 add = 0 - add; /* 1 goes to -1; -1 goes to 1 */
1402 ANYOF_POSIXL_AND(&temp, ssc);
1404 } /* else ssc already has no posixes */
1405 } /* else: Not inverted. This routine is a no-op if 'and_with' is an SSC
1406 in its initial state */
1407 else if (! is_ANYOF_SYNTHETIC(and_with)
1408 || ! ssc_is_cp_posixl_init(pRExC_state, (regnode_ssc *)and_with))
1410 /* But if 'ssc' is in its initial state, the result is just 'and_with';
1411 * copy it over 'ssc' */
1412 if (ssc_is_cp_posixl_init(pRExC_state, ssc)) {
1413 if (is_ANYOF_SYNTHETIC(and_with)) {
1414 StructCopy(and_with, ssc, regnode_ssc);
1417 ssc->invlist = anded_cp_list;
1418 ANYOF_POSIXL_ZERO(ssc);
1419 if (ANYOF_FLAGS(and_with) & ANYOF_MATCHES_POSIXL) {
1420 ANYOF_POSIXL_OR((regnode_charclass_posixl*) and_with, ssc);
1424 else if (ANYOF_POSIXL_SSC_TEST_ANY_SET(ssc)
1425 || (ANYOF_FLAGS(and_with) & ANYOF_MATCHES_POSIXL))
1427 /* One or the other of P1, P2 is non-empty. */
1428 if (ANYOF_FLAGS(and_with) & ANYOF_MATCHES_POSIXL) {
1429 ANYOF_POSIXL_AND((regnode_charclass_posixl*) and_with, ssc);
1431 ssc_union(ssc, anded_cp_list, FALSE);
1433 else { /* P1 = P2 = empty */
1434 ssc_intersection(ssc, anded_cp_list, FALSE);
1440 S_ssc_or(pTHX_ const RExC_state_t *pRExC_state, regnode_ssc *ssc,
1441 const regnode_charclass *or_with)
1443 /* Accumulate into SSC 'ssc' its 'OR' with 'or_with', which is either
1444 * another SSC or a regular ANYOF class. Can create false positives if
1445 * 'or_with' is to be inverted. */
1450 PERL_ARGS_ASSERT_SSC_OR;
1452 assert(is_ANYOF_SYNTHETIC(ssc));
1454 /* 'or_with' is used as-is if it too is an SSC; otherwise have to extract
1455 * the code point inversion list and just the relevant flags */
1456 if (is_ANYOF_SYNTHETIC(or_with)) {
1457 ored_cp_list = ((regnode_ssc*) or_with)->invlist;
1458 ored_flags = ANYOF_FLAGS(or_with);
1461 ored_cp_list = get_ANYOF_cp_list_for_ssc(pRExC_state, or_with);
1462 ored_flags = ANYOF_FLAGS(or_with) & ANYOF_COMMON_FLAGS;
1463 if (OP(or_with) != ANYOFD) {
1465 |= ANYOF_FLAGS(or_with)
1466 & ANYOF_SHARED_d_MATCHES_ALL_NON_UTF8_NON_ASCII_non_d_WARN_SUPER;
1470 ANYOF_FLAGS(ssc) |= ored_flags;
1472 /* Below, C1 is the list of code points in 'ssc'; P1, its posix classes.
1473 * C2 is the list of code points in 'or-with'; P2, its posix classes.
1474 * 'or_with' may be inverted. When not inverted, we have the simple
1475 * situation of computing:
1476 * (C1 | P1) | (C2 | P2) = (C1 | C2) | (P1 | P2)
1477 * If P1|P2 yields a situation with both a class and its complement are
1478 * set, like having both \w and \W, this matches all code points, and we
1479 * can delete these from the P component of the ssc going forward. XXX We
1480 * might be able to delete all the P components, but I (khw) am not certain
1481 * about this, and it is better to be safe.
1484 * (C1 | P1) | ~(C2 | P2) = (C1 | P1) | (~C2 & ~P2)
1485 * <= (C1 | P1) | ~C2
1486 * <= (C1 | ~C2) | P1
1487 * (which results in actually simpler code than the non-inverted case)
1490 if ((ANYOF_FLAGS(or_with) & ANYOF_INVERT)
1491 && ! is_ANYOF_SYNTHETIC(or_with))
1493 /* We ignore P2, leaving P1 going forward */
1494 } /* else Not inverted */
1495 else if (ANYOF_FLAGS(or_with) & ANYOF_MATCHES_POSIXL) {
1496 ANYOF_POSIXL_OR((regnode_charclass_posixl*)or_with, ssc);
1497 if (ANYOF_POSIXL_SSC_TEST_ANY_SET(ssc)) {
1499 for (i = 0; i < ANYOF_MAX; i += 2) {
1500 if (ANYOF_POSIXL_TEST(ssc, i) && ANYOF_POSIXL_TEST(ssc, i + 1))
1502 ssc_match_all_cp(ssc);
1503 ANYOF_POSIXL_CLEAR(ssc, i);
1504 ANYOF_POSIXL_CLEAR(ssc, i+1);
1512 FALSE /* Already has been inverted */
1516 PERL_STATIC_INLINE void
1517 S_ssc_union(pTHX_ regnode_ssc *ssc, SV* const invlist, const bool invert2nd)
1519 PERL_ARGS_ASSERT_SSC_UNION;
1521 assert(is_ANYOF_SYNTHETIC(ssc));
1523 _invlist_union_maybe_complement_2nd(ssc->invlist,
1529 PERL_STATIC_INLINE void
1530 S_ssc_intersection(pTHX_ regnode_ssc *ssc,
1532 const bool invert2nd)
1534 PERL_ARGS_ASSERT_SSC_INTERSECTION;
1536 assert(is_ANYOF_SYNTHETIC(ssc));
1538 _invlist_intersection_maybe_complement_2nd(ssc->invlist,
1544 PERL_STATIC_INLINE void
1545 S_ssc_add_range(pTHX_ regnode_ssc *ssc, const UV start, const UV end)
1547 PERL_ARGS_ASSERT_SSC_ADD_RANGE;
1549 assert(is_ANYOF_SYNTHETIC(ssc));
1551 ssc->invlist = _add_range_to_invlist(ssc->invlist, start, end);
1554 PERL_STATIC_INLINE void
1555 S_ssc_cp_and(pTHX_ regnode_ssc *ssc, const UV cp)
1557 /* AND just the single code point 'cp' into the SSC 'ssc' */
1559 SV* cp_list = _new_invlist(2);
1561 PERL_ARGS_ASSERT_SSC_CP_AND;
1563 assert(is_ANYOF_SYNTHETIC(ssc));
1565 cp_list = add_cp_to_invlist(cp_list, cp);
1566 ssc_intersection(ssc, cp_list,
1567 FALSE /* Not inverted */
1569 SvREFCNT_dec_NN(cp_list);
1572 PERL_STATIC_INLINE void
1573 S_ssc_clear_locale(regnode_ssc *ssc)
1575 /* Set the SSC 'ssc' to not match any locale things */
1576 PERL_ARGS_ASSERT_SSC_CLEAR_LOCALE;
1578 assert(is_ANYOF_SYNTHETIC(ssc));
1580 ANYOF_POSIXL_ZERO(ssc);
1581 ANYOF_FLAGS(ssc) &= ~ANYOF_LOCALE_FLAGS;
1584 #define NON_OTHER_COUNT NON_OTHER_COUNT_FOR_USE_ONLY_BY_REGCOMP_DOT_C
1587 S_is_ssc_worth_it(const RExC_state_t * pRExC_state, const regnode_ssc * ssc)
1589 /* The synthetic start class is used to hopefully quickly winnow down
1590 * places where a pattern could start a match in the target string. If it
1591 * doesn't really narrow things down that much, there isn't much point to
1592 * having the overhead of using it. This function uses some very crude
1593 * heuristics to decide if to use the ssc or not.
1595 * It returns TRUE if 'ssc' rules out more than half what it considers to
1596 * be the "likely" possible matches, but of course it doesn't know what the
1597 * actual things being matched are going to be; these are only guesses
1599 * For /l matches, it assumes that the only likely matches are going to be
1600 * in the 0-255 range, uniformly distributed, so half of that is 127
1601 * For /a and /d matches, it assumes that the likely matches will be just
1602 * the ASCII range, so half of that is 63
1603 * For /u and there isn't anything matching above the Latin1 range, it
1604 * assumes that that is the only range likely to be matched, and uses
1605 * half that as the cut-off: 127. If anything matches above Latin1,
1606 * it assumes that all of Unicode could match (uniformly), except for
1607 * non-Unicode code points and things in the General Category "Other"
1608 * (unassigned, private use, surrogates, controls and formats). This
1609 * is a much large number. */
1611 const U32 max_match = (LOC)
1615 : (invlist_highest(ssc->invlist) < 256)
1617 : ((NON_OTHER_COUNT + 1) / 2) - 1;
1618 U32 count = 0; /* Running total of number of code points matched by
1620 UV start, end; /* Start and end points of current range in inversion
1623 PERL_ARGS_ASSERT_IS_SSC_WORTH_IT;
1625 invlist_iterinit(ssc->invlist);
1626 while (invlist_iternext(ssc->invlist, &start, &end)) {
1628 /* /u is the only thing that we expect to match above 255; so if not /u
1629 * and even if there are matches above 255, ignore them. This catches
1630 * things like \d under /d which does match the digits above 255, but
1631 * since the pattern is /d, it is not likely to be expecting them */
1632 if (! UNI_SEMANTICS) {
1636 end = MIN(end, 255);
1638 count += end - start + 1;
1639 if (count > max_match) {
1640 invlist_iterfinish(ssc->invlist);
1650 S_ssc_finalize(pTHX_ RExC_state_t *pRExC_state, regnode_ssc *ssc)
1652 /* The inversion list in the SSC is marked mortal; now we need a more
1653 * permanent copy, which is stored the same way that is done in a regular
1654 * ANYOF node, with the first NUM_ANYOF_CODE_POINTS code points in a bit
1657 SV* invlist = invlist_clone(ssc->invlist);
1659 PERL_ARGS_ASSERT_SSC_FINALIZE;
1661 assert(is_ANYOF_SYNTHETIC(ssc));
1663 /* The code in this file assumes that all but these flags aren't relevant
1664 * to the SSC, except SSC_MATCHES_EMPTY_STRING, which should be cleared
1665 * by the time we reach here */
1666 assert(! (ANYOF_FLAGS(ssc)
1667 & ~( ANYOF_COMMON_FLAGS
1668 |ANYOF_SHARED_d_MATCHES_ALL_NON_UTF8_NON_ASCII_non_d_WARN_SUPER)));
1670 populate_ANYOF_from_invlist( (regnode *) ssc, &invlist);
1672 set_ANYOF_arg(pRExC_state, (regnode *) ssc, invlist,
1673 NULL, NULL, NULL, FALSE);
1675 /* Make sure is clone-safe */
1676 ssc->invlist = NULL;
1678 if (ANYOF_POSIXL_SSC_TEST_ANY_SET(ssc)) {
1679 ANYOF_FLAGS(ssc) |= ANYOF_MATCHES_POSIXL;
1682 if (RExC_contains_locale) {
1686 assert(! (ANYOF_FLAGS(ssc) & ANYOF_LOCALE_FLAGS) || RExC_contains_locale);
1689 #define TRIE_LIST_ITEM(state,idx) (trie->states[state].trans.list)[ idx ]
1690 #define TRIE_LIST_CUR(state) ( TRIE_LIST_ITEM( state, 0 ).forid )
1691 #define TRIE_LIST_LEN(state) ( TRIE_LIST_ITEM( state, 0 ).newstate )
1692 #define TRIE_LIST_USED(idx) ( trie->states[state].trans.list \
1693 ? (TRIE_LIST_CUR( idx ) - 1) \
1699 dump_trie(trie,widecharmap,revcharmap)
1700 dump_trie_interim_list(trie,widecharmap,revcharmap,next_alloc)
1701 dump_trie_interim_table(trie,widecharmap,revcharmap,next_alloc)
1703 These routines dump out a trie in a somewhat readable format.
1704 The _interim_ variants are used for debugging the interim
1705 tables that are used to generate the final compressed
1706 representation which is what dump_trie expects.
1708 Part of the reason for their existence is to provide a form
1709 of documentation as to how the different representations function.
1714 Dumps the final compressed table form of the trie to Perl_debug_log.
1715 Used for debugging make_trie().
1719 S_dump_trie(pTHX_ const struct _reg_trie_data *trie, HV *widecharmap,
1720 AV *revcharmap, U32 depth)
1723 SV *sv=sv_newmortal();
1724 int colwidth= widecharmap ? 6 : 4;
1726 GET_RE_DEBUG_FLAGS_DECL;
1728 PERL_ARGS_ASSERT_DUMP_TRIE;
1730 PerlIO_printf( Perl_debug_log, "%*sChar : %-6s%-6s%-4s ",
1731 (int)depth * 2 + 2,"",
1732 "Match","Base","Ofs" );
1734 for( state = 0 ; state < trie->uniquecharcount ; state++ ) {
1735 SV ** const tmp = av_fetch( revcharmap, state, 0);
1737 PerlIO_printf( Perl_debug_log, "%*s",
1739 pv_pretty(sv, SvPV_nolen_const(*tmp), SvCUR(*tmp), colwidth,
1740 PL_colors[0], PL_colors[1],
1741 (SvUTF8(*tmp) ? PERL_PV_ESCAPE_UNI : 0) |
1742 PERL_PV_ESCAPE_FIRSTCHAR
1747 PerlIO_printf( Perl_debug_log, "\n%*sState|-----------------------",
1748 (int)depth * 2 + 2,"");
1750 for( state = 0 ; state < trie->uniquecharcount ; state++ )
1751 PerlIO_printf( Perl_debug_log, "%.*s", colwidth, "--------");
1752 PerlIO_printf( Perl_debug_log, "\n");
1754 for( state = 1 ; state < trie->statecount ; state++ ) {
1755 const U32 base = trie->states[ state ].trans.base;
1757 PerlIO_printf( Perl_debug_log, "%*s#%4"UVXf"|",
1758 (int)depth * 2 + 2,"", (UV)state);
1760 if ( trie->states[ state ].wordnum ) {
1761 PerlIO_printf( Perl_debug_log, " W%4X",
1762 trie->states[ state ].wordnum );
1764 PerlIO_printf( Perl_debug_log, "%6s", "" );
1767 PerlIO_printf( Perl_debug_log, " @%4"UVXf" ", (UV)base );
1772 while( ( base + ofs < trie->uniquecharcount ) ||
1773 ( base + ofs - trie->uniquecharcount < trie->lasttrans
1774 && trie->trans[ base + ofs - trie->uniquecharcount ].check
1778 PerlIO_printf( Perl_debug_log, "+%2"UVXf"[ ", (UV)ofs);
1780 for ( ofs = 0 ; ofs < trie->uniquecharcount ; ofs++ ) {
1781 if ( ( base + ofs >= trie->uniquecharcount )
1782 && ( base + ofs - trie->uniquecharcount
1784 && trie->trans[ base + ofs
1785 - trie->uniquecharcount ].check == state )
1787 PerlIO_printf( Perl_debug_log, "%*"UVXf,
1789 (UV)trie->trans[ base + ofs
1790 - trie->uniquecharcount ].next );
1792 PerlIO_printf( Perl_debug_log, "%*s",colwidth," ." );
1796 PerlIO_printf( Perl_debug_log, "]");
1799 PerlIO_printf( Perl_debug_log, "\n" );
1801 PerlIO_printf(Perl_debug_log, "%*sword_info N:(prev,len)=",
1803 for (word=1; word <= trie->wordcount; word++) {
1804 PerlIO_printf(Perl_debug_log, " %d:(%d,%d)",
1805 (int)word, (int)(trie->wordinfo[word].prev),
1806 (int)(trie->wordinfo[word].len));
1808 PerlIO_printf(Perl_debug_log, "\n" );
1811 Dumps a fully constructed but uncompressed trie in list form.
1812 List tries normally only are used for construction when the number of
1813 possible chars (trie->uniquecharcount) is very high.
1814 Used for debugging make_trie().
1817 S_dump_trie_interim_list(pTHX_ const struct _reg_trie_data *trie,
1818 HV *widecharmap, AV *revcharmap, U32 next_alloc,
1822 SV *sv=sv_newmortal();
1823 int colwidth= widecharmap ? 6 : 4;
1824 GET_RE_DEBUG_FLAGS_DECL;
1826 PERL_ARGS_ASSERT_DUMP_TRIE_INTERIM_LIST;
1828 /* print out the table precompression. */
1829 PerlIO_printf( Perl_debug_log, "%*sState :Word | Transition Data\n%*s%s",
1830 (int)depth * 2 + 2,"", (int)depth * 2 + 2,"",
1831 "------:-----+-----------------\n" );
1833 for( state=1 ; state < next_alloc ; state ++ ) {
1836 PerlIO_printf( Perl_debug_log, "%*s %4"UVXf" :",
1837 (int)depth * 2 + 2,"", (UV)state );
1838 if ( ! trie->states[ state ].wordnum ) {
1839 PerlIO_printf( Perl_debug_log, "%5s| ","");
1841 PerlIO_printf( Perl_debug_log, "W%4x| ",
1842 trie->states[ state ].wordnum
1845 for( charid = 1 ; charid <= TRIE_LIST_USED( state ) ; charid++ ) {
1846 SV ** const tmp = av_fetch( revcharmap,
1847 TRIE_LIST_ITEM(state,charid).forid, 0);
1849 PerlIO_printf( Perl_debug_log, "%*s:%3X=%4"UVXf" | ",
1851 pv_pretty(sv, SvPV_nolen_const(*tmp), SvCUR(*tmp),
1853 PL_colors[0], PL_colors[1],
1854 (SvUTF8(*tmp) ? PERL_PV_ESCAPE_UNI : 0)
1855 | PERL_PV_ESCAPE_FIRSTCHAR
1857 TRIE_LIST_ITEM(state,charid).forid,
1858 (UV)TRIE_LIST_ITEM(state,charid).newstate
1861 PerlIO_printf(Perl_debug_log, "\n%*s| ",
1862 (int)((depth * 2) + 14), "");
1865 PerlIO_printf( Perl_debug_log, "\n");
1870 Dumps a fully constructed but uncompressed trie in table form.
1871 This is the normal DFA style state transition table, with a few
1872 twists to facilitate compression later.
1873 Used for debugging make_trie().
1876 S_dump_trie_interim_table(pTHX_ const struct _reg_trie_data *trie,
1877 HV *widecharmap, AV *revcharmap, U32 next_alloc,
1882 SV *sv=sv_newmortal();
1883 int colwidth= widecharmap ? 6 : 4;
1884 GET_RE_DEBUG_FLAGS_DECL;
1886 PERL_ARGS_ASSERT_DUMP_TRIE_INTERIM_TABLE;
1889 print out the table precompression so that we can do a visual check
1890 that they are identical.
1893 PerlIO_printf( Perl_debug_log, "%*sChar : ",(int)depth * 2 + 2,"" );
1895 for( charid = 0 ; charid < trie->uniquecharcount ; charid++ ) {
1896 SV ** const tmp = av_fetch( revcharmap, charid, 0);
1898 PerlIO_printf( Perl_debug_log, "%*s",
1900 pv_pretty(sv, SvPV_nolen_const(*tmp), SvCUR(*tmp), colwidth,
1901 PL_colors[0], PL_colors[1],
1902 (SvUTF8(*tmp) ? PERL_PV_ESCAPE_UNI : 0) |
1903 PERL_PV_ESCAPE_FIRSTCHAR
1909 PerlIO_printf( Perl_debug_log, "\n%*sState+-",(int)depth * 2 + 2,"" );
1911 for( charid=0 ; charid < trie->uniquecharcount ; charid++ ) {
1912 PerlIO_printf( Perl_debug_log, "%.*s", colwidth,"--------");
1915 PerlIO_printf( Perl_debug_log, "\n" );
1917 for( state=1 ; state < next_alloc ; state += trie->uniquecharcount ) {
1919 PerlIO_printf( Perl_debug_log, "%*s%4"UVXf" : ",
1920 (int)depth * 2 + 2,"",
1921 (UV)TRIE_NODENUM( state ) );
1923 for( charid = 0 ; charid < trie->uniquecharcount ; charid++ ) {
1924 UV v=(UV)SAFE_TRIE_NODENUM( trie->trans[ state + charid ].next );
1926 PerlIO_printf( Perl_debug_log, "%*"UVXf, colwidth, v );
1928 PerlIO_printf( Perl_debug_log, "%*s", colwidth, "." );
1930 if ( ! trie->states[ TRIE_NODENUM( state ) ].wordnum ) {
1931 PerlIO_printf( Perl_debug_log, " (%4"UVXf")\n",
1932 (UV)trie->trans[ state ].check );
1934 PerlIO_printf( Perl_debug_log, " (%4"UVXf") W%4X\n",
1935 (UV)trie->trans[ state ].check,
1936 trie->states[ TRIE_NODENUM( state ) ].wordnum );
1944 /* make_trie(startbranch,first,last,tail,word_count,flags,depth)
1945 startbranch: the first branch in the whole branch sequence
1946 first : start branch of sequence of branch-exact nodes.
1947 May be the same as startbranch
1948 last : Thing following the last branch.
1949 May be the same as tail.
1950 tail : item following the branch sequence
1951 count : words in the sequence
1952 flags : currently the OP() type we will be building one of /EXACT(|F|FA|FU|FU_SS|L|FLU8)/
1953 depth : indent depth
1955 Inplace optimizes a sequence of 2 or more Branch-Exact nodes into a TRIE node.
1957 A trie is an N'ary tree where the branches are determined by digital
1958 decomposition of the key. IE, at the root node you look up the 1st character and
1959 follow that branch repeat until you find the end of the branches. Nodes can be
1960 marked as "accepting" meaning they represent a complete word. Eg:
1964 would convert into the following structure. Numbers represent states, letters
1965 following numbers represent valid transitions on the letter from that state, if
1966 the number is in square brackets it represents an accepting state, otherwise it
1967 will be in parenthesis.
1969 +-h->+-e->[3]-+-r->(8)-+-s->[9]
1973 (1) +-i->(6)-+-s->[7]
1975 +-s->(3)-+-h->(4)-+-e->[5]
1977 Accept Word Mapping: 3=>1 (he),5=>2 (she), 7=>3 (his), 9=>4 (hers)
1979 This shows that when matching against the string 'hers' we will begin at state 1
1980 read 'h' and move to state 2, read 'e' and move to state 3 which is accepting,
1981 then read 'r' and go to state 8 followed by 's' which takes us to state 9 which
1982 is also accepting. Thus we know that we can match both 'he' and 'hers' with a
1983 single traverse. We store a mapping from accepting to state to which word was
1984 matched, and then when we have multiple possibilities we try to complete the
1985 rest of the regex in the order in which they occurred in the alternation.
1987 The only prior NFA like behaviour that would be changed by the TRIE support is
1988 the silent ignoring of duplicate alternations which are of the form:
1990 / (DUPE|DUPE) X? (?{ ... }) Y /x
1992 Thus EVAL blocks following a trie may be called a different number of times with
1993 and without the optimisation. With the optimisations dupes will be silently
1994 ignored. This inconsistent behaviour of EVAL type nodes is well established as
1995 the following demonstrates:
1997 'words'=~/(word|word|word)(?{ print $1 })[xyz]/
1999 which prints out 'word' three times, but
2001 'words'=~/(word|word|word)(?{ print $1 })S/
2003 which doesnt print it out at all. This is due to other optimisations kicking in.
2005 Example of what happens on a structural level:
2007 The regexp /(ac|ad|ab)+/ will produce the following debug output:
2009 1: CURLYM[1] {1,32767}(18)
2020 This would be optimizable with startbranch=5, first=5, last=16, tail=16
2021 and should turn into:
2023 1: CURLYM[1] {1,32767}(18)
2025 [Words:3 Chars Stored:6 Unique Chars:4 States:5 NCP:1]
2033 Cases where tail != last would be like /(?foo|bar)baz/:
2043 which would be optimizable with startbranch=1, first=1, last=7, tail=8
2044 and would end up looking like:
2047 [Words:2 Chars Stored:6 Unique Chars:5 States:7 NCP:1]
2054 d = uvchr_to_utf8_flags(d, uv, 0);
2056 is the recommended Unicode-aware way of saying
2061 #define TRIE_STORE_REVCHAR(val) \
2064 SV *zlopp = newSV(UTF8_MAXBYTES); \
2065 unsigned char *flrbbbbb = (unsigned char *) SvPVX(zlopp); \
2066 unsigned const char *const kapow = uvchr_to_utf8(flrbbbbb, val); \
2067 SvCUR_set(zlopp, kapow - flrbbbbb); \
2070 av_push(revcharmap, zlopp); \
2072 char ooooff = (char)val; \
2073 av_push(revcharmap, newSVpvn(&ooooff, 1)); \
2077 /* This gets the next character from the input, folding it if not already
2079 #define TRIE_READ_CHAR STMT_START { \
2082 /* if it is UTF then it is either already folded, or does not need \
2084 uvc = valid_utf8_to_uvchr( (const U8*) uc, &len); \
2086 else if (folder == PL_fold_latin1) { \
2087 /* This folder implies Unicode rules, which in the range expressible \
2088 * by not UTF is the lower case, with the two exceptions, one of \
2089 * which should have been taken care of before calling this */ \
2090 assert(*uc != LATIN_SMALL_LETTER_SHARP_S); \
2091 uvc = toLOWER_L1(*uc); \
2092 if (UNLIKELY(uvc == MICRO_SIGN)) uvc = GREEK_SMALL_LETTER_MU; \
2095 /* raw data, will be folded later if needed */ \
2103 #define TRIE_LIST_PUSH(state,fid,ns) STMT_START { \
2104 if ( TRIE_LIST_CUR( state ) >=TRIE_LIST_LEN( state ) ) { \
2105 U32 ging = TRIE_LIST_LEN( state ) *= 2; \
2106 Renew( trie->states[ state ].trans.list, ging, reg_trie_trans_le ); \
2108 TRIE_LIST_ITEM( state, TRIE_LIST_CUR( state ) ).forid = fid; \
2109 TRIE_LIST_ITEM( state, TRIE_LIST_CUR( state ) ).newstate = ns; \
2110 TRIE_LIST_CUR( state )++; \
2113 #define TRIE_LIST_NEW(state) STMT_START { \
2114 Newxz( trie->states[ state ].trans.list, \
2115 4, reg_trie_trans_le ); \
2116 TRIE_LIST_CUR( state ) = 1; \
2117 TRIE_LIST_LEN( state ) = 4; \
2120 #define TRIE_HANDLE_WORD(state) STMT_START { \
2121 U16 dupe= trie->states[ state ].wordnum; \
2122 regnode * const noper_next = regnext( noper ); \
2125 /* store the word for dumping */ \
2127 if (OP(noper) != NOTHING) \
2128 tmp = newSVpvn_utf8(STRING(noper), STR_LEN(noper), UTF); \
2130 tmp = newSVpvn_utf8( "", 0, UTF ); \
2131 av_push( trie_words, tmp ); \
2135 trie->wordinfo[curword].prev = 0; \
2136 trie->wordinfo[curword].len = wordlen; \
2137 trie->wordinfo[curword].accept = state; \
2139 if ( noper_next < tail ) { \
2141 trie->jump = (U16 *) PerlMemShared_calloc( word_count + 1, \
2143 trie->jump[curword] = (U16)(noper_next - convert); \
2145 jumper = noper_next; \
2147 nextbranch= regnext(cur); \
2151 /* It's a dupe. Pre-insert into the wordinfo[].prev */\
2152 /* chain, so that when the bits of chain are later */\
2153 /* linked together, the dups appear in the chain */\
2154 trie->wordinfo[curword].prev = trie->wordinfo[dupe].prev; \
2155 trie->wordinfo[dupe].prev = curword; \
2157 /* we haven't inserted this word yet. */ \
2158 trie->states[ state ].wordnum = curword; \
2163 #define TRIE_TRANS_STATE(state,base,ucharcount,charid,special) \
2164 ( ( base + charid >= ucharcount \
2165 && base + charid < ubound \
2166 && state == trie->trans[ base - ucharcount + charid ].check \
2167 && trie->trans[ base - ucharcount + charid ].next ) \
2168 ? trie->trans[ base - ucharcount + charid ].next \
2169 : ( state==1 ? special : 0 ) \
2173 #define MADE_JUMP_TRIE 2
2174 #define MADE_EXACT_TRIE 4
2177 S_make_trie(pTHX_ RExC_state_t *pRExC_state, regnode *startbranch,
2178 regnode *first, regnode *last, regnode *tail,
2179 U32 word_count, U32 flags, U32 depth)
2181 /* first pass, loop through and scan words */
2182 reg_trie_data *trie;
2183 HV *widecharmap = NULL;
2184 AV *revcharmap = newAV();
2190 regnode *jumper = NULL;
2191 regnode *nextbranch = NULL;
2192 regnode *convert = NULL;
2193 U32 *prev_states; /* temp array mapping each state to previous one */
2194 /* we just use folder as a flag in utf8 */
2195 const U8 * folder = NULL;
2198 const U32 data_slot = add_data( pRExC_state, STR_WITH_LEN("tuuu"));
2199 AV *trie_words = NULL;
2200 /* along with revcharmap, this only used during construction but both are
2201 * useful during debugging so we store them in the struct when debugging.
2204 const U32 data_slot = add_data( pRExC_state, STR_WITH_LEN("tu"));
2205 STRLEN trie_charcount=0;
2207 SV *re_trie_maxbuff;
2208 GET_RE_DEBUG_FLAGS_DECL;
2210 PERL_ARGS_ASSERT_MAKE_TRIE;
2212 PERL_UNUSED_ARG(depth);
2216 case EXACT: case EXACTL: break;
2220 case EXACTFLU8: folder = PL_fold_latin1; break;
2221 case EXACTF: folder = PL_fold; break;
2222 default: Perl_croak( aTHX_ "panic! In trie construction, unknown node type %u %s", (unsigned) flags, PL_reg_name[flags] );
2225 trie = (reg_trie_data *) PerlMemShared_calloc( 1, sizeof(reg_trie_data) );
2227 trie->startstate = 1;
2228 trie->wordcount = word_count;
2229 RExC_rxi->data->data[ data_slot ] = (void*)trie;
2230 trie->charmap = (U16 *) PerlMemShared_calloc( 256, sizeof(U16) );
2231 if (flags == EXACT || flags == EXACTL)
2232 trie->bitmap = (char *) PerlMemShared_calloc( ANYOF_BITMAP_SIZE, 1 );
2233 trie->wordinfo = (reg_trie_wordinfo *) PerlMemShared_calloc(
2234 trie->wordcount+1, sizeof(reg_trie_wordinfo));
2237 trie_words = newAV();
2240 re_trie_maxbuff = get_sv(RE_TRIE_MAXBUF_NAME, 1);
2241 assert(re_trie_maxbuff);
2242 if (!SvIOK(re_trie_maxbuff)) {
2243 sv_setiv(re_trie_maxbuff, RE_TRIE_MAXBUF_INIT);
2245 DEBUG_TRIE_COMPILE_r({
2246 PerlIO_printf( Perl_debug_log,
2247 "%*smake_trie start==%d, first==%d, last==%d, tail==%d depth=%d\n",
2248 (int)depth * 2 + 2, "",
2249 REG_NODE_NUM(startbranch),REG_NODE_NUM(first),
2250 REG_NODE_NUM(last), REG_NODE_NUM(tail), (int)depth);
2253 /* Find the node we are going to overwrite */
2254 if ( first == startbranch && OP( last ) != BRANCH ) {
2255 /* whole branch chain */
2258 /* branch sub-chain */
2259 convert = NEXTOPER( first );
2262 /* -- First loop and Setup --
2264 We first traverse the branches and scan each word to determine if it
2265 contains widechars, and how many unique chars there are, this is
2266 important as we have to build a table with at least as many columns as we
2269 We use an array of integers to represent the character codes 0..255
2270 (trie->charmap) and we use a an HV* to store Unicode characters. We use
2271 the native representation of the character value as the key and IV's for
2274 *TODO* If we keep track of how many times each character is used we can
2275 remap the columns so that the table compression later on is more
2276 efficient in terms of memory by ensuring the most common value is in the
2277 middle and the least common are on the outside. IMO this would be better
2278 than a most to least common mapping as theres a decent chance the most
2279 common letter will share a node with the least common, meaning the node
2280 will not be compressible. With a middle is most common approach the worst
2281 case is when we have the least common nodes twice.
2285 for ( cur = first ; cur < last ; cur = regnext( cur ) ) {
2286 regnode *noper = NEXTOPER( cur );
2287 const U8 *uc = (U8*)STRING( noper );
2288 const U8 *e = uc + STR_LEN( noper );
2290 U32 wordlen = 0; /* required init */
2291 STRLEN minchars = 0;
2292 STRLEN maxchars = 0;
2293 bool set_bit = trie->bitmap ? 1 : 0; /*store the first char in the
2296 if (OP(noper) == NOTHING) {
2297 regnode *noper_next= regnext(noper);
2298 if (noper_next != tail && OP(noper_next) == flags) {
2300 uc= (U8*)STRING(noper);
2301 e= uc + STR_LEN(noper);
2302 trie->minlen= STR_LEN(noper);
2309 if ( set_bit ) { /* bitmap only alloced when !(UTF&&Folding) */
2310 TRIE_BITMAP_SET(trie,*uc); /* store the raw first byte
2311 regardless of encoding */
2312 if (OP( noper ) == EXACTFU_SS) {
2313 /* false positives are ok, so just set this */
2314 TRIE_BITMAP_SET(trie, LATIN_SMALL_LETTER_SHARP_S);
2317 for ( ; uc < e ; uc += len ) { /* Look at each char in the current
2319 TRIE_CHARCOUNT(trie)++;
2322 /* TRIE_READ_CHAR returns the current character, or its fold if /i
2323 * is in effect. Under /i, this character can match itself, or
2324 * anything that folds to it. If not under /i, it can match just
2325 * itself. Most folds are 1-1, for example k, K, and KELVIN SIGN
2326 * all fold to k, and all are single characters. But some folds
2327 * expand to more than one character, so for example LATIN SMALL
2328 * LIGATURE FFI folds to the three character sequence 'ffi'. If
2329 * the string beginning at 'uc' is 'ffi', it could be matched by
2330 * three characters, or just by the one ligature character. (It
2331 * could also be matched by two characters: LATIN SMALL LIGATURE FF
2332 * followed by 'i', or by 'f' followed by LATIN SMALL LIGATURE FI).
2333 * (Of course 'I' and/or 'F' instead of 'i' and 'f' can also
2334 * match.) The trie needs to know the minimum and maximum number
2335 * of characters that could match so that it can use size alone to
2336 * quickly reject many match attempts. The max is simple: it is
2337 * the number of folded characters in this branch (since a fold is
2338 * never shorter than what folds to it. */
2342 /* And the min is equal to the max if not under /i (indicated by
2343 * 'folder' being NULL), or there are no multi-character folds. If
2344 * there is a multi-character fold, the min is incremented just
2345 * once, for the character that folds to the sequence. Each
2346 * character in the sequence needs to be added to the list below of
2347 * characters in the trie, but we count only the first towards the
2348 * min number of characters needed. This is done through the
2349 * variable 'foldlen', which is returned by the macros that look
2350 * for these sequences as the number of bytes the sequence
2351 * occupies. Each time through the loop, we decrement 'foldlen' by
2352 * how many bytes the current char occupies. Only when it reaches
2353 * 0 do we increment 'minchars' or look for another multi-character
2355 if (folder == NULL) {
2358 else if (foldlen > 0) {
2359 foldlen -= (UTF) ? UTF8SKIP(uc) : 1;
2364 /* See if *uc is the beginning of a multi-character fold. If
2365 * so, we decrement the length remaining to look at, to account
2366 * for the current character this iteration. (We can use 'uc'
2367 * instead of the fold returned by TRIE_READ_CHAR because for
2368 * non-UTF, the latin1_safe macro is smart enough to account
2369 * for all the unfolded characters, and because for UTF, the
2370 * string will already have been folded earlier in the
2371 * compilation process */
2373 if ((foldlen = is_MULTI_CHAR_FOLD_utf8_safe(uc, e))) {
2374 foldlen -= UTF8SKIP(uc);
2377 else if ((foldlen = is_MULTI_CHAR_FOLD_latin1_safe(uc, e))) {
2382 /* The current character (and any potential folds) should be added
2383 * to the possible matching characters for this position in this
2387 U8 folded= folder[ (U8) uvc ];
2388 if ( !trie->charmap[ folded ] ) {
2389 trie->charmap[ folded ]=( ++trie->uniquecharcount );
2390 TRIE_STORE_REVCHAR( folded );
2393 if ( !trie->charmap[ uvc ] ) {
2394 trie->charmap[ uvc ]=( ++trie->uniquecharcount );
2395 TRIE_STORE_REVCHAR( uvc );
2398 /* store the codepoint in the bitmap, and its folded
2400 TRIE_BITMAP_SET(trie, uvc);
2402 /* store the folded codepoint */
2403 if ( folder ) TRIE_BITMAP_SET(trie, folder[(U8) uvc ]);
2406 /* store first byte of utf8 representation of
2407 variant codepoints */
2408 if (! UVCHR_IS_INVARIANT(uvc)) {
2409 TRIE_BITMAP_SET(trie, UTF8_TWO_BYTE_HI(uvc));
2412 set_bit = 0; /* We've done our bit :-) */
2416 /* XXX We could come up with the list of code points that fold
2417 * to this using PL_utf8_foldclosures, except not for
2418 * multi-char folds, as there may be multiple combinations
2419 * there that could work, which needs to wait until runtime to
2420 * resolve (The comment about LIGATURE FFI above is such an
2425 widecharmap = newHV();
2427 svpp = hv_fetch( widecharmap, (char*)&uvc, sizeof( UV ), 1 );
2430 Perl_croak( aTHX_ "error creating/fetching widecharmap entry for 0x%"UVXf, uvc );
2432 if ( !SvTRUE( *svpp ) ) {
2433 sv_setiv( *svpp, ++trie->uniquecharcount );
2434 TRIE_STORE_REVCHAR(uvc);
2437 } /* end loop through characters in this branch of the trie */
2439 /* We take the min and max for this branch and combine to find the min
2440 * and max for all branches processed so far */
2441 if( cur == first ) {
2442 trie->minlen = minchars;
2443 trie->maxlen = maxchars;
2444 } else if (minchars < trie->minlen) {
2445 trie->minlen = minchars;
2446 } else if (maxchars > trie->maxlen) {
2447 trie->maxlen = maxchars;
2449 } /* end first pass */
2450 DEBUG_TRIE_COMPILE_r(
2451 PerlIO_printf( Perl_debug_log,
2452 "%*sTRIE(%s): W:%d C:%d Uq:%d Min:%d Max:%d\n",
2453 (int)depth * 2 + 2,"",
2454 ( widecharmap ? "UTF8" : "NATIVE" ), (int)word_count,
2455 (int)TRIE_CHARCOUNT(trie), trie->uniquecharcount,
2456 (int)trie->minlen, (int)trie->maxlen )
2460 We now know what we are dealing with in terms of unique chars and
2461 string sizes so we can calculate how much memory a naive
2462 representation using a flat table will take. If it's over a reasonable
2463 limit (as specified by ${^RE_TRIE_MAXBUF}) we use a more memory
2464 conservative but potentially much slower representation using an array
2467 At the end we convert both representations into the same compressed
2468 form that will be used in regexec.c for matching with. The latter
2469 is a form that cannot be used to construct with but has memory
2470 properties similar to the list form and access properties similar
2471 to the table form making it both suitable for fast searches and
2472 small enough that its feasable to store for the duration of a program.
2474 See the comment in the code where the compressed table is produced
2475 inplace from the flat tabe representation for an explanation of how
2476 the compression works.
2481 Newx(prev_states, TRIE_CHARCOUNT(trie) + 2, U32);
2484 if ( (IV)( ( TRIE_CHARCOUNT(trie) + 1 ) * trie->uniquecharcount + 1)
2485 > SvIV(re_trie_maxbuff) )
2488 Second Pass -- Array Of Lists Representation
2490 Each state will be represented by a list of charid:state records
2491 (reg_trie_trans_le) the first such element holds the CUR and LEN
2492 points of the allocated array. (See defines above).
2494 We build the initial structure using the lists, and then convert
2495 it into the compressed table form which allows faster lookups
2496 (but cant be modified once converted).
2499 STRLEN transcount = 1;
2501 DEBUG_TRIE_COMPILE_MORE_r( PerlIO_printf( Perl_debug_log,
2502 "%*sCompiling trie using list compiler\n",
2503 (int)depth * 2 + 2, ""));
2505 trie->states = (reg_trie_state *)
2506 PerlMemShared_calloc( TRIE_CHARCOUNT(trie) + 2,
2507 sizeof(reg_trie_state) );
2511 for ( cur = first ; cur < last ; cur = regnext( cur ) ) {
2513 regnode *noper = NEXTOPER( cur );
2514 U8 *uc = (U8*)STRING( noper );
2515 const U8 *e = uc + STR_LEN( noper );
2516 U32 state = 1; /* required init */
2517 U16 charid = 0; /* sanity init */
2518 U32 wordlen = 0; /* required init */
2520 if (OP(noper) == NOTHING) {
2521 regnode *noper_next= regnext(noper);
2522 if (noper_next != tail && OP(noper_next) == flags) {
2524 uc= (U8*)STRING(noper);
2525 e= uc + STR_LEN(noper);
2529 if (OP(noper) != NOTHING) {
2530 for ( ; uc < e ; uc += len ) {
2535 charid = trie->charmap[ uvc ];
2537 SV** const svpp = hv_fetch( widecharmap,
2544 charid=(U16)SvIV( *svpp );
2547 /* charid is now 0 if we dont know the char read, or
2548 * nonzero if we do */
2555 if ( !trie->states[ state ].trans.list ) {
2556 TRIE_LIST_NEW( state );
2559 check <= TRIE_LIST_USED( state );
2562 if ( TRIE_LIST_ITEM( state, check ).forid
2565 newstate = TRIE_LIST_ITEM( state, check ).newstate;
2570 newstate = next_alloc++;
2571 prev_states[newstate] = state;
2572 TRIE_LIST_PUSH( state, charid, newstate );
2577 Perl_croak( aTHX_ "panic! In trie construction, no char mapping for %"IVdf, uvc );
2581 TRIE_HANDLE_WORD(state);
2583 } /* end second pass */
2585 /* next alloc is the NEXT state to be allocated */
2586 trie->statecount = next_alloc;
2587 trie->states = (reg_trie_state *)
2588 PerlMemShared_realloc( trie->states,
2590 * sizeof(reg_trie_state) );
2592 /* and now dump it out before we compress it */
2593 DEBUG_TRIE_COMPILE_MORE_r(dump_trie_interim_list(trie, widecharmap,
2594 revcharmap, next_alloc,
2598 trie->trans = (reg_trie_trans *)
2599 PerlMemShared_calloc( transcount, sizeof(reg_trie_trans) );
2606 for( state=1 ; state < next_alloc ; state ++ ) {
2610 DEBUG_TRIE_COMPILE_MORE_r(
2611 PerlIO_printf( Perl_debug_log, "tp: %d zp: %d ",tp,zp)
2615 if (trie->states[state].trans.list) {
2616 U16 minid=TRIE_LIST_ITEM( state, 1).forid;
2620 for( idx = 2 ; idx <= TRIE_LIST_USED( state ) ; idx++ ) {
2621 const U16 forid = TRIE_LIST_ITEM( state, idx).forid;
2622 if ( forid < minid ) {
2624 } else if ( forid > maxid ) {
2628 if ( transcount < tp + maxid - minid + 1) {
2630 trie->trans = (reg_trie_trans *)
2631 PerlMemShared_realloc( trie->trans,
2633 * sizeof(reg_trie_trans) );
2634 Zero( trie->trans + (transcount / 2),
2638 base = trie->uniquecharcount + tp - minid;
2639 if ( maxid == minid ) {
2641 for ( ; zp < tp ; zp++ ) {
2642 if ( ! trie->trans[ zp ].next ) {
2643 base = trie->uniquecharcount + zp - minid;
2644 trie->trans[ zp ].next = TRIE_LIST_ITEM( state,
2646 trie->trans[ zp ].check = state;
2652 trie->trans[ tp ].next = TRIE_LIST_ITEM( state,
2654 trie->trans[ tp ].check = state;
2659 for ( idx=1; idx <= TRIE_LIST_USED( state ) ; idx++ ) {
2660 const U32 tid = base
2661 - trie->uniquecharcount
2662 + TRIE_LIST_ITEM( state, idx ).forid;
2663 trie->trans[ tid ].next = TRIE_LIST_ITEM( state,
2665 trie->trans[ tid ].check = state;
2667 tp += ( maxid - minid + 1 );
2669 Safefree(trie->states[ state ].trans.list);
2672 DEBUG_TRIE_COMPILE_MORE_r(
2673 PerlIO_printf( Perl_debug_log, " base: %d\n",base);
2676 trie->states[ state ].trans.base=base;
2678 trie->lasttrans = tp + 1;
2682 Second Pass -- Flat Table Representation.
2684 we dont use the 0 slot of either trans[] or states[] so we add 1 to
2685 each. We know that we will need Charcount+1 trans at most to store
2686 the data (one row per char at worst case) So we preallocate both
2687 structures assuming worst case.
2689 We then construct the trie using only the .next slots of the entry
2692 We use the .check field of the first entry of the node temporarily
2693 to make compression both faster and easier by keeping track of how
2694 many non zero fields are in the node.
2696 Since trans are numbered from 1 any 0 pointer in the table is a FAIL
2699 There are two terms at use here: state as a TRIE_NODEIDX() which is
2700 a number representing the first entry of the node, and state as a
2701 TRIE_NODENUM() which is the trans number. state 1 is TRIE_NODEIDX(1)
2702 and TRIE_NODENUM(1), state 2 is TRIE_NODEIDX(2) and TRIE_NODENUM(3)
2703 if there are 2 entrys per node. eg:
2711 The table is internally in the right hand, idx form. However as we
2712 also have to deal with the states array which is indexed by nodenum
2713 we have to use TRIE_NODENUM() to convert.
2716 DEBUG_TRIE_COMPILE_MORE_r( PerlIO_printf( Perl_debug_log,
2717 "%*sCompiling trie using table compiler\n",
2718 (int)depth * 2 + 2, ""));
2720 trie->trans = (reg_trie_trans *)
2721 PerlMemShared_calloc( ( TRIE_CHARCOUNT(trie) + 1 )
2722 * trie->uniquecharcount + 1,
2723 sizeof(reg_trie_trans) );
2724 trie->states = (reg_trie_state *)
2725 PerlMemShared_calloc( TRIE_CHARCOUNT(trie) + 2,
2726 sizeof(reg_trie_state) );
2727 next_alloc = trie->uniquecharcount + 1;
2730 for ( cur = first ; cur < last ; cur = regnext( cur ) ) {
2732 regnode *noper = NEXTOPER( cur );
2733 const U8 *uc = (U8*)STRING( noper );
2734 const U8 *e = uc + STR_LEN( noper );
2736 U32 state = 1; /* required init */
2738 U16 charid = 0; /* sanity init */
2739 U32 accept_state = 0; /* sanity init */
2741 U32 wordlen = 0; /* required init */
2743 if (OP(noper) == NOTHING) {
2744 regnode *noper_next= regnext(noper);
2745 if (noper_next != tail && OP(noper_next) == flags) {
2747 uc= (U8*)STRING(noper);
2748 e= uc + STR_LEN(noper);
2752 if ( OP(noper) != NOTHING ) {
2753 for ( ; uc < e ; uc += len ) {
2758 charid = trie->charmap[ uvc ];
2760 SV* const * const svpp = hv_fetch( widecharmap,
2764 charid = svpp ? (U16)SvIV(*svpp) : 0;
2768 if ( !trie->trans[ state + charid ].next ) {
2769 trie->trans[ state + charid ].next = next_alloc;
2770 trie->trans[ state ].check++;
2771 prev_states[TRIE_NODENUM(next_alloc)]
2772 = TRIE_NODENUM(state);
2773 next_alloc += trie->uniquecharcount;
2775 state = trie->trans[ state + charid ].next;
2777 Perl_croak( aTHX_ "panic! In trie construction, no char mapping for %"IVdf, uvc );
2779 /* charid is now 0 if we dont know the char read, or
2780 * nonzero if we do */
2783 accept_state = TRIE_NODENUM( state );
2784 TRIE_HANDLE_WORD(accept_state);
2786 } /* end second pass */
2788 /* and now dump it out before we compress it */
2789 DEBUG_TRIE_COMPILE_MORE_r(dump_trie_interim_table(trie, widecharmap,
2791 next_alloc, depth+1));
2795 * Inplace compress the table.*
2797 For sparse data sets the table constructed by the trie algorithm will
2798 be mostly 0/FAIL transitions or to put it another way mostly empty.
2799 (Note that leaf nodes will not contain any transitions.)
2801 This algorithm compresses the tables by eliminating most such
2802 transitions, at the cost of a modest bit of extra work during lookup:
2804 - Each states[] entry contains a .base field which indicates the
2805 index in the state[] array wheres its transition data is stored.
2807 - If .base is 0 there are no valid transitions from that node.
2809 - If .base is nonzero then charid is added to it to find an entry in
2812 -If trans[states[state].base+charid].check!=state then the
2813 transition is taken to be a 0/Fail transition. Thus if there are fail
2814 transitions at the front of the node then the .base offset will point
2815 somewhere inside the previous nodes data (or maybe even into a node
2816 even earlier), but the .check field determines if the transition is
2820 The following process inplace converts the table to the compressed
2821 table: We first do not compress the root node 1,and mark all its
2822 .check pointers as 1 and set its .base pointer as 1 as well. This
2823 allows us to do a DFA construction from the compressed table later,
2824 and ensures that any .base pointers we calculate later are greater
2827 - We set 'pos' to indicate the first entry of the second node.
2829 - We then iterate over the columns of the node, finding the first and
2830 last used entry at l and m. We then copy l..m into pos..(pos+m-l),
2831 and set the .check pointers accordingly, and advance pos
2832 appropriately and repreat for the next node. Note that when we copy
2833 the next pointers we have to convert them from the original
2834 NODEIDX form to NODENUM form as the former is not valid post
2837 - If a node has no transitions used we mark its base as 0 and do not
2838 advance the pos pointer.
2840 - If a node only has one transition we use a second pointer into the
2841 structure to fill in allocated fail transitions from other states.
2842 This pointer is independent of the main pointer and scans forward
2843 looking for null transitions that are allocated to a state. When it
2844 finds one it writes the single transition into the "hole". If the
2845 pointer doesnt find one the single transition is appended as normal.
2847 - Once compressed we can Renew/realloc the structures to release the
2850 See "Table-Compression Methods" in sec 3.9 of the Red Dragon,
2851 specifically Fig 3.47 and the associated pseudocode.
2855 const U32 laststate = TRIE_NODENUM( next_alloc );
2858 trie->statecount = laststate;
2860 for ( state = 1 ; state < laststate ; state++ ) {
2862 const U32 stateidx = TRIE_NODEIDX( state );
2863 const U32 o_used = trie->trans[ stateidx ].check;
2864 U32 used = trie->trans[ stateidx ].check;
2865 trie->trans[ stateidx ].check = 0;
2868 used && charid < trie->uniquecharcount;
2871 if ( flag || trie->trans[ stateidx + charid ].next ) {
2872 if ( trie->trans[ stateidx + charid ].next ) {
2874 for ( ; zp < pos ; zp++ ) {
2875 if ( ! trie->trans[ zp ].next ) {
2879 trie->states[ state ].trans.base
2881 + trie->uniquecharcount
2883 trie->trans[ zp ].next
2884 = SAFE_TRIE_NODENUM( trie->trans[ stateidx
2886 trie->trans[ zp ].check = state;
2887 if ( ++zp > pos ) pos = zp;
2894 trie->states[ state ].trans.base
2895 = pos + trie->uniquecharcount - charid ;
2897 trie->trans[ pos ].next
2898 = SAFE_TRIE_NODENUM(
2899 trie->trans[ stateidx + charid ].next );
2900 trie->trans[ pos ].check = state;
2905 trie->lasttrans = pos + 1;
2906 trie->states = (reg_trie_state *)
2907 PerlMemShared_realloc( trie->states, laststate
2908 * sizeof(reg_trie_state) );
2909 DEBUG_TRIE_COMPILE_MORE_r(
2910 PerlIO_printf( Perl_debug_log,
2911 "%*sAlloc: %d Orig: %"IVdf" elements, Final:%"IVdf". Savings of %%%5.2f\n",
2912 (int)depth * 2 + 2,"",
2913 (int)( ( TRIE_CHARCOUNT(trie) + 1 ) * trie->uniquecharcount
2917 ( ( next_alloc - pos ) * 100 ) / (double)next_alloc );
2920 } /* end table compress */
2922 DEBUG_TRIE_COMPILE_MORE_r(
2923 PerlIO_printf(Perl_debug_log,
2924 "%*sStatecount:%"UVxf" Lasttrans:%"UVxf"\n",
2925 (int)depth * 2 + 2, "",
2926 (UV)trie->statecount,
2927 (UV)trie->lasttrans)
2929 /* resize the trans array to remove unused space */
2930 trie->trans = (reg_trie_trans *)
2931 PerlMemShared_realloc( trie->trans, trie->lasttrans
2932 * sizeof(reg_trie_trans) );
2934 { /* Modify the program and insert the new TRIE node */
2935 U8 nodetype =(U8)(flags & 0xFF);
2939 regnode *optimize = NULL;
2940 #ifdef RE_TRACK_PATTERN_OFFSETS
2943 U32 mjd_nodelen = 0;
2944 #endif /* RE_TRACK_PATTERN_OFFSETS */
2945 #endif /* DEBUGGING */
2947 This means we convert either the first branch or the first Exact,
2948 depending on whether the thing following (in 'last') is a branch
2949 or not and whther first is the startbranch (ie is it a sub part of
2950 the alternation or is it the whole thing.)
2951 Assuming its a sub part we convert the EXACT otherwise we convert
2952 the whole branch sequence, including the first.
2954 /* Find the node we are going to overwrite */
2955 if ( first != startbranch || OP( last ) == BRANCH ) {
2956 /* branch sub-chain */
2957 NEXT_OFF( first ) = (U16)(last - first);
2958 #ifdef RE_TRACK_PATTERN_OFFSETS
2960 mjd_offset= Node_Offset((convert));
2961 mjd_nodelen= Node_Length((convert));
2964 /* whole branch chain */
2966 #ifdef RE_TRACK_PATTERN_OFFSETS
2969 const regnode *nop = NEXTOPER( convert );
2970 mjd_offset= Node_Offset((nop));
2971 mjd_nodelen= Node_Length((nop));
2975 PerlIO_printf(Perl_debug_log,
2976 "%*sMJD offset:%"UVuf" MJD length:%"UVuf"\n",
2977 (int)depth * 2 + 2, "",
2978 (UV)mjd_offset, (UV)mjd_nodelen)
2981 /* But first we check to see if there is a common prefix we can
2982 split out as an EXACT and put in front of the TRIE node. */
2983 trie->startstate= 1;
2984 if ( trie->bitmap && !widecharmap && !trie->jump ) {
2986 for ( state = 1 ; state < trie->statecount-1 ; state++ ) {
2990 const U32 base = trie->states[ state ].trans.base;
2992 if ( trie->states[state].wordnum )
2995 for ( ofs = 0 ; ofs < trie->uniquecharcount ; ofs++ ) {
2996 if ( ( base + ofs >= trie->uniquecharcount ) &&
2997 ( base + ofs - trie->uniquecharcount < trie->lasttrans ) &&
2998 trie->trans[ base + ofs - trie->uniquecharcount ].check == state )
3000 if ( ++count > 1 ) {
3001 SV **tmp = av_fetch( revcharmap, ofs, 0);
3002 const U8 *ch = (U8*)SvPV_nolen_const( *tmp );
3003 if ( state == 1 ) break;
3005 Zero(trie->bitmap, ANYOF_BITMAP_SIZE, char);
3007 PerlIO_printf(Perl_debug_log,
3008 "%*sNew Start State=%"UVuf" Class: [",
3009 (int)depth * 2 + 2, "",
3012 SV ** const tmp = av_fetch( revcharmap, idx, 0);
3013 const U8 * const ch = (U8*)SvPV_nolen_const( *tmp );
3015 TRIE_BITMAP_SET(trie,*ch);
3017 TRIE_BITMAP_SET(trie, folder[ *ch ]);
3019 PerlIO_printf(Perl_debug_log, "%s", (char*)ch)
3023 TRIE_BITMAP_SET(trie,*ch);
3025 TRIE_BITMAP_SET(trie,folder[ *ch ]);
3026 DEBUG_OPTIMISE_r(PerlIO_printf( Perl_debug_log,"%s", ch));
3032 SV **tmp = av_fetch( revcharmap, idx, 0);
3034 char *ch = SvPV( *tmp, len );
3036 SV *sv=sv_newmortal();
3037 PerlIO_printf( Perl_debug_log,
3038 "%*sPrefix State: %"UVuf" Idx:%"UVuf" Char='%s'\n",
3039 (int)depth * 2 + 2, "",
3041 pv_pretty(sv, SvPV_nolen_const(*tmp), SvCUR(*tmp), 6,
3042 PL_colors[0], PL_colors[1],
3043 (SvUTF8(*tmp) ? PERL_PV_ESCAPE_UNI : 0) |
3044 PERL_PV_ESCAPE_FIRSTCHAR
3049 OP( convert ) = nodetype;
3050 str=STRING(convert);
3053 STR_LEN(convert) += len;
3059 DEBUG_OPTIMISE_r(PerlIO_printf( Perl_debug_log,"]\n"));
3064 trie->prefixlen = (state-1);
3066 regnode *n = convert+NODE_SZ_STR(convert);
3067 NEXT_OFF(convert) = NODE_SZ_STR(convert);
3068 trie->startstate = state;
3069 trie->minlen -= (state - 1);
3070 trie->maxlen -= (state - 1);
3072 /* At least the UNICOS C compiler choked on this
3073 * being argument to DEBUG_r(), so let's just have
3076 #ifdef PERL_EXT_RE_BUILD
3082 regnode *fix = convert;
3083 U32 word = trie->wordcount;
3085 Set_Node_Offset_Length(convert, mjd_offset, state - 1);
3086 while( ++fix < n ) {
3087 Set_Node_Offset_Length(fix, 0, 0);
3090 SV ** const tmp = av_fetch( trie_words, word, 0 );
3092 if ( STR_LEN(convert) <= SvCUR(*tmp) )
3093 sv_chop(*tmp, SvPV_nolen(*tmp) + STR_LEN(convert));
3095 sv_chop(*tmp, SvPV_nolen(*tmp) + SvCUR(*tmp));
3103 NEXT_OFF(convert) = (U16)(tail - convert);
3104 DEBUG_r(optimize= n);
3110 if ( trie->maxlen ) {
3111 NEXT_OFF( convert ) = (U16)(tail - convert);
3112 ARG_SET( convert, data_slot );
3113 /* Store the offset to the first unabsorbed branch in
3114 jump[0], which is otherwise unused by the jump logic.
3115 We use this when dumping a trie and during optimisation. */
3117 trie->jump[0] = (U16)(nextbranch - convert);
3119 /* If the start state is not accepting (meaning there is no empty string/NOTHING)
3120 * and there is a bitmap
3121 * and the first "jump target" node we found leaves enough room
3122 * then convert the TRIE node into a TRIEC node, with the bitmap
3123 * embedded inline in the opcode - this is hypothetically faster.
3125 if ( !trie->states[trie->startstate].wordnum
3127 && ( (char *)jumper - (char *)convert) >= (int)sizeof(struct regnode_charclass) )
3129 OP( convert ) = TRIEC;
3130 Copy(trie->bitmap, ((struct regnode_charclass *)convert)->bitmap, ANYOF_BITMAP_SIZE, char);
3131 PerlMemShared_free(trie->bitmap);
3134 OP( convert ) = TRIE;
3136 /* store the type in the flags */
3137 convert->flags = nodetype;
3141 + regarglen[ OP( convert ) ];
3143 /* XXX We really should free up the resource in trie now,
3144 as we won't use them - (which resources?) dmq */
3146 /* needed for dumping*/
3147 DEBUG_r(if (optimize) {
3148 regnode *opt = convert;
3150 while ( ++opt < optimize) {
3151 Set_Node_Offset_Length(opt,0,0);
3154 Try to clean up some of the debris left after the
3157 while( optimize < jumper ) {
3158 mjd_nodelen += Node_Length((optimize));
3159 OP( optimize ) = OPTIMIZED;
3160 Set_Node_Offset_Length(optimize,0,0);
3163 Set_Node_Offset_Length(convert,mjd_offset,mjd_nodelen);
3165 } /* end node insert */
3167 /* Finish populating the prev field of the wordinfo array. Walk back
3168 * from each accept state until we find another accept state, and if
3169 * so, point the first word's .prev field at the second word. If the
3170 * second already has a .prev field set, stop now. This will be the
3171 * case either if we've already processed that word's accept state,
3172 * or that state had multiple words, and the overspill words were
3173 * already linked up earlier.
3180 for (word=1; word <= trie->wordcount; word++) {
3182 if (trie->wordinfo[word].prev)
3184 state = trie->wordinfo[word].accept;
3186 state = prev_states[state];
3189 prev = trie->states[state].wordnum;
3193 trie->wordinfo[word].prev = prev;
3195 Safefree(prev_states);
3199 /* and now dump out the compressed format */
3200 DEBUG_TRIE_COMPILE_r(dump_trie(trie, widecharmap, revcharmap, depth+1));
3202 RExC_rxi->data->data[ data_slot + 1 ] = (void*)widecharmap;
3204 RExC_rxi->data->data[ data_slot + TRIE_WORDS_OFFSET ] = (void*)trie_words;
3205 RExC_rxi->data->data[ data_slot + 3 ] = (void*)revcharmap;
3207 SvREFCNT_dec_NN(revcharmap);
3211 : trie->startstate>1
3217 S_construct_ahocorasick_from_trie(pTHX_ RExC_state_t *pRExC_state, regnode *source, U32 depth)
3219 /* The Trie is constructed and compressed now so we can build a fail array if
3222 This is basically the Aho-Corasick algorithm. Its from exercise 3.31 and
3224 "Red Dragon" -- Compilers, principles, techniques, and tools. Aho, Sethi,
3228 We find the fail state for each state in the trie, this state is the longest
3229 proper suffix of the current state's 'word' that is also a proper prefix of
3230 another word in our trie. State 1 represents the word '' and is thus the
3231 default fail state. This allows the DFA not to have to restart after its
3232 tried and failed a word at a given point, it simply continues as though it
3233 had been matching the other word in the first place.
3235 'abcdgu'=~/abcdefg|cdgu/
3236 When we get to 'd' we are still matching the first word, we would encounter
3237 'g' which would fail, which would bring us to the state representing 'd' in
3238 the second word where we would try 'g' and succeed, proceeding to match
3241 /* add a fail transition */
3242 const U32 trie_offset = ARG(source);
3243 reg_trie_data *trie=(reg_trie_data *)RExC_rxi->data->data[trie_offset];
3245 const U32 ucharcount = trie->uniquecharcount;
3246 const U32 numstates = trie->statecount;
3247 const U32 ubound = trie->lasttrans + ucharcount;
3251 U32 base = trie->states[ 1 ].trans.base;
3254 const U32 data_slot = add_data( pRExC_state, STR_WITH_LEN("T"));
3256 GET_RE_DEBUG_FLAGS_DECL;
3258 PERL_ARGS_ASSERT_CONSTRUCT_AHOCORASICK_FROM_TRIE;
3259 PERL_UNUSED_CONTEXT;
3261 PERL_UNUSED_ARG(depth);
3264 if ( OP(source) == TRIE ) {
3265 struct regnode_1 *op = (struct regnode_1 *)
3266 PerlMemShared_calloc(1, sizeof(struct regnode_1));
3267 StructCopy(source,op,struct regnode_1);
3268 stclass = (regnode *)op;
3270 struct regnode_charclass *op = (struct regnode_charclass *)
3271 PerlMemShared_calloc(1, sizeof(struct regnode_charclass));
3272 StructCopy(source,op,struct regnode_charclass);
3273 stclass = (regnode *)op;
3275 OP(stclass)+=2; /* convert the TRIE type to its AHO-CORASICK equivalent */
3277 ARG_SET( stclass, data_slot );
3278 aho = (reg_ac_data *) PerlMemShared_calloc( 1, sizeof(reg_ac_data) );
3279 RExC_rxi->data->data[ data_slot ] = (void*)aho;
3280 aho->trie=trie_offset;
3281 aho->states=(reg_trie_state *)PerlMemShared_malloc( numstates * sizeof(reg_trie_state) );
3282 Copy( trie->states, aho->states, numstates, reg_trie_state );
3283 Newxz( q, numstates, U32);
3284 aho->fail = (U32 *) PerlMemShared_calloc( numstates, sizeof(U32) );
3287 /* initialize fail[0..1] to be 1 so that we always have
3288 a valid final fail state */
3289 fail[ 0 ] = fail[ 1 ] = 1;
3291 for ( charid = 0; charid < ucharcount ; charid++ ) {
3292 const U32 newstate = TRIE_TRANS_STATE( 1, base, ucharcount, charid, 0 );
3294 q[ q_write ] = newstate;
3295 /* set to point at the root */
3296 fail[ q[ q_write++ ] ]=1;
3299 while ( q_read < q_write) {
3300 const U32 cur = q[ q_read++ % numstates ];
3301 base = trie->states[ cur ].trans.base;
3303 for ( charid = 0 ; charid < ucharcount ; charid++ ) {
3304 const U32 ch_state = TRIE_TRANS_STATE( cur, base, ucharcount, charid, 1 );
3306 U32 fail_state = cur;
3309 fail_state = fail[ fail_state ];
3310 fail_base = aho->states[ fail_state ].trans.base;
3311 } while ( !TRIE_TRANS_STATE( fail_state, fail_base, ucharcount, charid, 1 ) );
3313 fail_state = TRIE_TRANS_STATE( fail_state, fail_base, ucharcount, charid, 1 );
3314 fail[ ch_state ] = fail_state;
3315 if ( !aho->states[ ch_state ].wordnum && aho->states[ fail_state ].wordnum )
3317 aho->states[ ch_state ].wordnum = aho->states[ fail_state ].wordnum;
3319 q[ q_write++ % numstates] = ch_state;
3323 /* restore fail[0..1] to 0 so that we "fall out" of the AC loop
3324 when we fail in state 1, this allows us to use the
3325 charclass scan to find a valid start char. This is based on the principle
3326 that theres a good chance the string being searched contains lots of stuff
3327 that cant be a start char.
3329 fail[ 0 ] = fail[ 1 ] = 0;
3330 DEBUG_TRIE_COMPILE_r({
3331 PerlIO_printf(Perl_debug_log,
3332 "%*sStclass Failtable (%"UVuf" states): 0",
3333 (int)(depth * 2), "", (UV)numstates
3335 for( q_read=1; q_read<numstates; q_read++ ) {
3336 PerlIO_printf(Perl_debug_log, ", %"UVuf, (UV)fail[q_read]);
3338 PerlIO_printf(Perl_debug_log, "\n");
3341 /*RExC_seen |= REG_TRIEDFA_SEEN;*/
3346 #define DEBUG_PEEP(str,scan,depth) \
3347 DEBUG_OPTIMISE_r({if (scan){ \
3348 regnode *Next = regnext(scan); \
3349 regprop(RExC_rx, RExC_mysv, scan, NULL, pRExC_state); \
3350 PerlIO_printf(Perl_debug_log, "%*s" str ">%3d: %s (%d)", \
3351 (int)depth*2, "", REG_NODE_NUM(scan), SvPV_nolen_const(RExC_mysv),\
3352 Next ? (REG_NODE_NUM(Next)) : 0 ); \
3353 DEBUG_SHOW_STUDY_FLAGS(flags," [ ","]");\
3354 PerlIO_printf(Perl_debug_log, "\n"); \
3357 /* The below joins as many adjacent EXACTish nodes as possible into a single
3358 * one. The regop may be changed if the node(s) contain certain sequences that
3359 * require special handling. The joining is only done if:
3360 * 1) there is room in the current conglomerated node to entirely contain the
3362 * 2) they are the exact same node type
3364 * The adjacent nodes actually may be separated by NOTHING-kind nodes, and
3365 * these get optimized out
3367 * If a node is to match under /i (folded), the number of characters it matches
3368 * can be different than its character length if it contains a multi-character
3369 * fold. *min_subtract is set to the total delta number of characters of the
3372 * And *unfolded_multi_char is set to indicate whether or not the node contains
3373 * an unfolded multi-char fold. This happens when whether the fold is valid or
3374 * not won't be known until runtime; namely for EXACTF nodes that contain LATIN
3375 * SMALL LETTER SHARP S, as only if the target string being matched against
3376 * turns out to be UTF-8 is that fold valid; and also for EXACTFL nodes whose
3377 * folding rules depend on the locale in force at runtime. (Multi-char folds
3378 * whose components are all above the Latin1 range are not run-time locale
3379 * dependent, and have already been folded by the time this function is
3382 * This is as good a place as any to discuss the design of handling these
3383 * multi-character fold sequences. It's been wrong in Perl for a very long
3384 * time. There are three code points in Unicode whose multi-character folds
3385 * were long ago discovered to mess things up. The previous designs for
3386 * dealing with these involved assigning a special node for them. This
3387 * approach doesn't always work, as evidenced by this example:
3388 * "\xDFs" =~ /s\xDF/ui # Used to fail before these patches
3389 * Both sides fold to "sss", but if the pattern is parsed to create a node that
3390 * would match just the \xDF, it won't be able to handle the case where a
3391 * successful match would have to cross the node's boundary. The new approach
3392 * that hopefully generally solves the problem generates an EXACTFU_SS node
3393 * that is "sss" in this case.
3395 * It turns out that there are problems with all multi-character folds, and not
3396 * just these three. Now the code is general, for all such cases. The
3397 * approach taken is:
3398 * 1) This routine examines each EXACTFish node that could contain multi-
3399 * character folded sequences. Since a single character can fold into
3400 * such a sequence, the minimum match length for this node is less than
3401 * the number of characters in the node. This routine returns in
3402 * *min_subtract how many characters to subtract from the the actual
3403 * length of the string to get a real minimum match length; it is 0 if
3404 * there are no multi-char foldeds. This delta is used by the caller to
3405 * adjust the min length of the match, and the delta between min and max,
3406 * so that the optimizer doesn't reject these possibilities based on size
3408 * 2) For the sequence involving the Sharp s (\xDF), the node type EXACTFU_SS
3409 * is used for an EXACTFU node that contains at least one "ss" sequence in
3410 * it. For non-UTF-8 patterns and strings, this is the only case where
3411 * there is a possible fold length change. That means that a regular
3412 * EXACTFU node without UTF-8 involvement doesn't have to concern itself
3413 * with length changes, and so can be processed faster. regexec.c takes
3414 * advantage of this. Generally, an EXACTFish node that is in UTF-8 is
3415 * pre-folded by regcomp.c (except EXACTFL, some of whose folds aren't
3416 * known until runtime). This saves effort in regex matching. However,
3417 * the pre-folding isn't done for non-UTF8 patterns because the fold of
3418 * the MICRO SIGN requires UTF-8, and we don't want to slow things down by
3419 * forcing the pattern into UTF8 unless necessary. Also what EXACTF (and,
3420 * again, EXACTFL) nodes fold to isn't known until runtime. The fold
3421 * possibilities for the non-UTF8 patterns are quite simple, except for
3422 * the sharp s. All the ones that don't involve a UTF-8 target string are
3423 * members of a fold-pair, and arrays are set up for all of them so that
3424 * the other member of the pair can be found quickly. Code elsewhere in
3425 * this file makes sure that in EXACTFU nodes, the sharp s gets folded to
3426 * 'ss', even if the pattern isn't UTF-8. This avoids the issues
3427 * described in the next item.
3428 * 3) A problem remains for unfolded multi-char folds. (These occur when the
3429 * validity of the fold won't be known until runtime, and so must remain
3430 * unfolded for now. This happens for the sharp s in EXACTF and EXACTFA
3431 * nodes when the pattern isn't in UTF-8. (Note, BTW, that there cannot
3432 * be an EXACTF node with a UTF-8 pattern.) They also occur for various
3433 * folds in EXACTFL nodes, regardless of the UTF-ness of the pattern.)
3434 * The reason this is a problem is that the optimizer part of regexec.c
3435 * (probably unwittingly, in Perl_regexec_flags()) makes an assumption
3436 * that a character in the pattern corresponds to at most a single
3437 * character in the target string. (And I do mean character, and not byte
3438 * here, unlike other parts of the documentation that have never been
3439 * updated to account for multibyte Unicode.) sharp s in EXACTF and
3440 * EXACTFL nodes can match the two character string 'ss'; in EXACTFA nodes
3441 * it can match "\x{17F}\x{17F}". These, along with other ones in EXACTFL
3442 * nodes, violate the assumption, and they are the only instances where it
3443 * is violated. I'm reluctant to try to change the assumption, as the
3444 * code involved is impenetrable to me (khw), so instead the code here
3445 * punts. This routine examines EXACTFL nodes, and (when the pattern
3446 * isn't UTF-8) EXACTF and EXACTFA for such unfolded folds, and returns a
3447 * boolean indicating whether or not the node contains such a fold. When
3448 * it is true, the caller sets a flag that later causes the optimizer in
3449 * this file to not set values for the floating and fixed string lengths,
3450 * and thus avoids the optimizer code in regexec.c that makes the invalid
3451 * assumption. Thus, there is no optimization based on string lengths for
3452 * EXACTFL nodes that contain these few folds, nor for non-UTF8-pattern
3453 * EXACTF and EXACTFA nodes that contain the sharp s. (The reason the
3454 * assumption is wrong only in these cases is that all other non-UTF-8
3455 * folds are 1-1; and, for UTF-8 patterns, we pre-fold all other folds to
3456 * their expanded versions. (Again, we can't prefold sharp s to 'ss' in
3457 * EXACTF nodes because we don't know at compile time if it actually
3458 * matches 'ss' or not. For EXACTF nodes it will match iff the target
3459 * string is in UTF-8. This is in contrast to EXACTFU nodes, where it
3460 * always matches; and EXACTFA where it never does. In an EXACTFA node in
3461 * a UTF-8 pattern, sharp s is folded to "\x{17F}\x{17F}, avoiding the
3462 * problem; but in a non-UTF8 pattern, folding it to that above-Latin1
3463 * string would require the pattern to be forced into UTF-8, the overhead
3464 * of which we want to avoid. Similarly the unfolded multi-char folds in
3465 * EXACTFL nodes will match iff the locale at the time of match is a UTF-8
3468 * Similarly, the code that generates tries doesn't currently handle
3469 * not-already-folded multi-char folds, and it looks like a pain to change
3470 * that. Therefore, trie generation of EXACTFA nodes with the sharp s
3471 * doesn't work. Instead, such an EXACTFA is turned into a new regnode,
3472 * EXACTFA_NO_TRIE, which the trie code knows not to handle. Most people
3473 * using /iaa matching will be doing so almost entirely with ASCII
3474 * strings, so this should rarely be encountered in practice */
3476 #define JOIN_EXACT(scan,min_subtract,unfolded_multi_char, flags) \
3477 if (PL_regkind[OP(scan)] == EXACT) \
3478 join_exact(pRExC_state,(scan),(min_subtract),unfolded_multi_char, (flags),NULL,depth+1)
3481 S_join_exact(pTHX_ RExC_state_t *pRExC_state, regnode *scan,
3482 UV *min_subtract, bool *unfolded_multi_char,
3483 U32 flags,regnode *val, U32 depth)
3485 /* Merge several consecutive EXACTish nodes into one. */
3486 regnode *n = regnext(scan);
3488 regnode *next = scan + NODE_SZ_STR(scan);
3492 regnode *stop = scan;
3493 GET_RE_DEBUG_FLAGS_DECL;
3495 PERL_UNUSED_ARG(depth);
3498 PERL_ARGS_ASSERT_JOIN_EXACT;
3499 #ifndef EXPERIMENTAL_INPLACESCAN
3500 PERL_UNUSED_ARG(flags);
3501 PERL_UNUSED_ARG(val);
3503 DEBUG_PEEP("join",scan,depth);
3505 /* Look through the subsequent nodes in the chain. Skip NOTHING, merge
3506 * EXACT ones that are mergeable to the current one. */
3508 && (PL_regkind[OP(n)] == NOTHING
3509 || (stringok && OP(n) == OP(scan)))
3511 && NEXT_OFF(scan) + NEXT_OFF(n) < I16_MAX)
3514 if (OP(n) == TAIL || n > next)
3516 if (PL_regkind[OP(n)] == NOTHING) {
3517 DEBUG_PEEP("skip:",n,depth);
3518 NEXT_OFF(scan) += NEXT_OFF(n);
3519 next = n + NODE_STEP_REGNODE;
3526 else if (stringok) {
3527 const unsigned int oldl = STR_LEN(scan);
3528 regnode * const nnext = regnext(n);
3530 /* XXX I (khw) kind of doubt that this works on platforms (should
3531 * Perl ever run on one) where U8_MAX is above 255 because of lots
3532 * of other assumptions */
3533 /* Don't join if the sum can't fit into a single node */
3534 if (oldl + STR_LEN(n) > U8_MAX)
3537 DEBUG_PEEP("merg",n,depth);
3540 NEXT_OFF(scan) += NEXT_OFF(n);
3541 STR_LEN(scan) += STR_LEN(n);
3542 next = n + NODE_SZ_STR(n);
3543 /* Now we can overwrite *n : */
3544 Move(STRING(n), STRING(scan) + oldl, STR_LEN(n), char);
3552 #ifdef EXPERIMENTAL_INPLACESCAN
3553 if (flags && !NEXT_OFF(n)) {
3554 DEBUG_PEEP("atch", val, depth);
3555 if (reg_off_by_arg[OP(n)]) {
3556 ARG_SET(n, val - n);
3559 NEXT_OFF(n) = val - n;
3567 *unfolded_multi_char = FALSE;
3569 /* Here, all the adjacent mergeable EXACTish nodes have been merged. We
3570 * can now analyze for sequences of problematic code points. (Prior to
3571 * this final joining, sequences could have been split over boundaries, and
3572 * hence missed). The sequences only happen in folding, hence for any
3573 * non-EXACT EXACTish node */
3574 if (OP(scan) != EXACT && OP(scan) != EXACTL) {
3575 U8* s0 = (U8*) STRING(scan);
3577 U8* s_end = s0 + STR_LEN(scan);
3579 int total_count_delta = 0; /* Total delta number of characters that
3580 multi-char folds expand to */
3582 /* One pass is made over the node's string looking for all the
3583 * possibilities. To avoid some tests in the loop, there are two main
3584 * cases, for UTF-8 patterns (which can't have EXACTF nodes) and
3589 if (OP(scan) == EXACTFL) {
3592 /* An EXACTFL node would already have been changed to another
3593 * node type unless there is at least one character in it that
3594 * is problematic; likely a character whose fold definition
3595 * won't be known until runtime, and so has yet to be folded.
3596 * For all but the UTF-8 locale, folds are 1-1 in length, but
3597 * to handle the UTF-8 case, we need to create a temporary
3598 * folded copy using UTF-8 locale rules in order to analyze it.
3599 * This is because our macros that look to see if a sequence is
3600 * a multi-char fold assume everything is folded (otherwise the
3601 * tests in those macros would be too complicated and slow).
3602 * Note that here, the non-problematic folds will have already
3603 * been done, so we can just copy such characters. We actually
3604 * don't completely fold the EXACTFL string. We skip the
3605 * unfolded multi-char folds, as that would just create work
3606 * below to figure out the size they already are */
3608 Newx(folded, UTF8_MAX_FOLD_CHAR_EXPAND * STR_LEN(scan) + 1, U8);
3611 STRLEN s_len = UTF8SKIP(s);
3612 if (! is_PROBLEMATIC_LOCALE_FOLD_utf8(s)) {
3613 Copy(s, d, s_len, U8);
3616 else if (is_FOLDS_TO_MULTI_utf8(s)) {
3617 *unfolded_multi_char = TRUE;
3618 Copy(s, d, s_len, U8);
3621 else if (isASCII(*s)) {
3622 *(d++) = toFOLD(*s);
3626 _to_utf8_fold_flags(s, d, &len, FOLD_FLAGS_FULL);
3632 /* Point the remainder of the routine to look at our temporary
3636 } /* End of creating folded copy of EXACTFL string */
3638 /* Examine the string for a multi-character fold sequence. UTF-8
3639 * patterns have all characters pre-folded by the time this code is
3641 while (s < s_end - 1) /* Can stop 1 before the end, as minimum
3642 length sequence we are looking for is 2 */
3644 int count = 0; /* How many characters in a multi-char fold */
3645 int len = is_MULTI_CHAR_FOLD_utf8_safe(s, s_end);
3646 if (! len) { /* Not a multi-char fold: get next char */
3651 /* Nodes with 'ss' require special handling, except for
3652 * EXACTFA-ish for which there is no multi-char fold to this */
3653 if (len == 2 && *s == 's' && *(s+1) == 's'
3654 && OP(scan) != EXACTFA
3655 && OP(scan) != EXACTFA_NO_TRIE)
3658 if (OP(scan) != EXACTFL) {
3659 OP(scan) = EXACTFU_SS;
3663 else { /* Here is a generic multi-char fold. */
3664 U8* multi_end = s + len;
3666 /* Count how many characters are in it. In the case of
3667 * /aa, no folds which contain ASCII code points are
3668 * allowed, so check for those, and skip if found. */
3669 if (OP(scan) != EXACTFA && OP(scan) != EXACTFA_NO_TRIE) {
3670 count = utf8_length(s, multi_end);
3674 while (s < multi_end) {
3677 goto next_iteration;
3687 /* The delta is how long the sequence is minus 1 (1 is how long
3688 * the character that folds to the sequence is) */
3689 total_count_delta += count - 1;
3693 /* We created a temporary folded copy of the string in EXACTFL
3694 * nodes. Therefore we need to be sure it doesn't go below zero,
3695 * as the real string could be shorter */
3696 if (OP(scan) == EXACTFL) {
3697 int total_chars = utf8_length((U8*) STRING(scan),
3698 (U8*) STRING(scan) + STR_LEN(scan));
3699 if (total_count_delta > total_chars) {
3700 total_count_delta = total_chars;
3704 *min_subtract += total_count_delta;
3707 else if (OP(scan) == EXACTFA) {
3709 /* Non-UTF-8 pattern, EXACTFA node. There can't be a multi-char
3710 * fold to the ASCII range (and there are no existing ones in the
3711 * upper latin1 range). But, as outlined in the comments preceding
3712 * this function, we need to flag any occurrences of the sharp s.
3713 * This character forbids trie formation (because of added
3715 #if UNICODE_MAJOR_VERSION > 3 /* no multifolds in early Unicode */ \
3716 || (UNICODE_MAJOR_VERSION == 3 && ( UNICODE_DOT_VERSION > 0) \
3717 || UNICODE_DOT_DOT_VERSION > 0)
3719 if (*s == LATIN_SMALL_LETTER_SHARP_S) {
3720 OP(scan) = EXACTFA_NO_TRIE;
3721 *unfolded_multi_char = TRUE;
3729 /* Non-UTF-8 pattern, not EXACTFA node. Look for the multi-char
3730 * folds that are all Latin1. As explained in the comments
3731 * preceding this function, we look also for the sharp s in EXACTF
3732 * and EXACTFL nodes; it can be in the final position. Otherwise
3733 * we can stop looking 1 byte earlier because have to find at least
3734 * two characters for a multi-fold */
3735 const U8* upper = (OP(scan) == EXACTF || OP(scan) == EXACTFL)
3740 int len = is_MULTI_CHAR_FOLD_latin1_safe(s, s_end);
3741 if (! len) { /* Not a multi-char fold. */
3742 if (*s == LATIN_SMALL_LETTER_SHARP_S
3743 && (OP(scan) == EXACTF || OP(scan) == EXACTFL))
3745 *unfolded_multi_char = TRUE;
3752 && isALPHA_FOLD_EQ(*s, 's')
3753 && isALPHA_FOLD_EQ(*(s+1), 's'))
3756 /* EXACTF nodes need to know that the minimum length
3757 * changed so that a sharp s in the string can match this
3758 * ss in the pattern, but they remain EXACTF nodes, as they
3759 * won't match this unless the target string is is UTF-8,
3760 * which we don't know until runtime. EXACTFL nodes can't
3761 * transform into EXACTFU nodes */
3762 if (OP(scan) != EXACTF && OP(scan) != EXACTFL) {
3763 OP(scan) = EXACTFU_SS;
3767 *min_subtract += len - 1;
3775 /* Allow dumping but overwriting the collection of skipped
3776 * ops and/or strings with fake optimized ops */
3777 n = scan + NODE_SZ_STR(scan);
3785 DEBUG_OPTIMISE_r(if (merged){DEBUG_PEEP("finl",scan,depth)});
3789 /* REx optimizer. Converts nodes into quicker variants "in place".
3790 Finds fixed substrings. */
3792 /* Stops at toplevel WHILEM as well as at "last". At end *scanp is set
3793 to the position after last scanned or to NULL. */
3795 #define INIT_AND_WITHP \
3796 assert(!and_withp); \
3797 Newx(and_withp,1, regnode_ssc); \
3798 SAVEFREEPV(and_withp)
3802 S_unwind_scan_frames(pTHX_ const void *p)
3804 scan_frame *f= (scan_frame *)p;
3806 scan_frame *n= f->next_frame;
3814 S_study_chunk(pTHX_ RExC_state_t *pRExC_state, regnode **scanp,
3815 SSize_t *minlenp, SSize_t *deltap,
3820 regnode_ssc *and_withp,
3821 U32 flags, U32 depth)
3822 /* scanp: Start here (read-write). */
3823 /* deltap: Write maxlen-minlen here. */
3824 /* last: Stop before this one. */
3825 /* data: string data about the pattern */
3826 /* stopparen: treat close N as END */
3827 /* recursed: which subroutines have we recursed into */
3828 /* and_withp: Valid if flags & SCF_DO_STCLASS_OR */
3830 /* There must be at least this number of characters to match */
3833 regnode *scan = *scanp, *next;
3835 int is_inf = (flags & SCF_DO_SUBSTR) && (data->flags & SF_IS_INF);
3836 int is_inf_internal = 0; /* The studied chunk is infinite */
3837 I32 is_par = OP(scan) == OPEN ? ARG(scan) : 0;
3838 scan_data_t data_fake;
3839 SV *re_trie_maxbuff = NULL;
3840 regnode *first_non_open = scan;
3841 SSize_t stopmin = SSize_t_MAX;
3842 scan_frame *frame = NULL;
3843 GET_RE_DEBUG_FLAGS_DECL;
3845 PERL_ARGS_ASSERT_STUDY_CHUNK;
3849 while (first_non_open && OP(first_non_open) == OPEN)
3850 first_non_open=regnext(first_non_open);
3856 RExC_study_chunk_recursed_count++;
3858 DEBUG_OPTIMISE_MORE_r(
3860 PerlIO_printf(Perl_debug_log,
3861 "%*sstudy_chunk stopparen=%ld recursed_count=%lu depth=%lu recursed_depth=%lu scan=%p last=%p",
3862 (int)(depth*2), "", (long)stopparen,
3863 (unsigned long)RExC_study_chunk_recursed_count,
3864 (unsigned long)depth, (unsigned long)recursed_depth,
3867 if (recursed_depth) {
3870 for ( j = 0 ; j < recursed_depth ; j++ ) {
3871 for ( i = 0 ; i < (U32)RExC_npar ; i++ ) {
3873 PAREN_TEST(RExC_study_chunk_recursed +
3874 ( j * RExC_study_chunk_recursed_bytes), i )
3877 !PAREN_TEST(RExC_study_chunk_recursed +
3878 (( j - 1 ) * RExC_study_chunk_recursed_bytes), i)
3881 PerlIO_printf(Perl_debug_log," %d",(int)i);
3885 if ( j + 1 < recursed_depth ) {
3886 PerlIO_printf(Perl_debug_log, ",");
3890 PerlIO_printf(Perl_debug_log,"\n");
3893 while ( scan && OP(scan) != END && scan < last ){
3894 UV min_subtract = 0; /* How mmany chars to subtract from the minimum
3895 node length to get a real minimum (because
3896 the folded version may be shorter) */
3897 bool unfolded_multi_char = FALSE;
3898 /* Peephole optimizer: */
3899 DEBUG_STUDYDATA("Peep:", data, depth);
3900 DEBUG_PEEP("Peep", scan, depth);
3903 /* The reason we do this here we need to deal with things like /(?:f)(?:o)(?:o)/
3904 * which cant be dealt with by the normal EXACT parsing code, as each (?:..) is handled
3905 * by a different invocation of reg() -- Yves
3907 JOIN_EXACT(scan,&min_subtract, &unfolded_multi_char, 0);
3909 /* Follow the next-chain of the current node and optimize
3910 away all the NOTHINGs from it. */
3911 if (OP(scan) != CURLYX) {
3912 const int max = (reg_off_by_arg[OP(scan)]
3914 /* I32 may be smaller than U16 on CRAYs! */
3915 : (I32_MAX < U16_MAX ? I32_MAX : U16_MAX));
3916 int off = (reg_off_by_arg[OP(scan)] ? ARG(scan) : NEXT_OFF(scan));
3920 /* Skip NOTHING and LONGJMP. */
3921 while ((n = regnext(n))
3922 && ((PL_regkind[OP(n)] == NOTHING && (noff = NEXT_OFF(n)))
3923 || ((OP(n) == LONGJMP) && (noff = ARG(n))))
3924 && off + noff < max)
3926 if (reg_off_by_arg[OP(scan)])
3929 NEXT_OFF(scan) = off;
3932 /* The principal pseudo-switch. Cannot be a switch, since we
3933 look into several different things. */
3934 if ( OP(scan) == DEFINEP ) {
3936 SSize_t deltanext = 0;
3937 SSize_t fake_last_close = 0;
3938 I32 f = SCF_IN_DEFINE;
3940 StructCopy(&zero_scan_data, &data_fake, scan_data_t);
3941 scan = regnext(scan);
3942 assert( OP(scan) == IFTHEN );
3943 DEBUG_PEEP("expect IFTHEN", scan, depth);
3945 data_fake.last_closep= &fake_last_close;
3947 next = regnext(scan);
3948 scan = NEXTOPER(NEXTOPER(scan));
3949 DEBUG_PEEP("scan", scan, depth);
3950 DEBUG_PEEP("next", next, depth);
3952 /* we suppose the run is continuous, last=next...
3953 * NOTE we dont use the return here! */
3954 (void)study_chunk(pRExC_state, &scan, &minlen,
3955 &deltanext, next, &data_fake, stopparen,
3956 recursed_depth, NULL, f, depth+1);
3961 OP(scan) == BRANCH ||
3962 OP(scan) == BRANCHJ ||
3965 next = regnext(scan);
3968 /* The op(next)==code check below is to see if we
3969 * have "BRANCH-BRANCH", "BRANCHJ-BRANCHJ", "IFTHEN-IFTHEN"
3970 * IFTHEN is special as it might not appear in pairs.
3971 * Not sure whether BRANCH-BRANCHJ is possible, regardless
3972 * we dont handle it cleanly. */
3973 if (OP(next) == code || code == IFTHEN) {
3974 /* NOTE - There is similar code to this block below for
3975 * handling TRIE nodes on a re-study. If you change stuff here
3976 * check there too. */
3977 SSize_t max1 = 0, min1 = SSize_t_MAX, num = 0;
3979 regnode * const startbranch=scan;
3981 if (flags & SCF_DO_SUBSTR) {
3982 /* Cannot merge strings after this. */
3983 scan_commit(pRExC_state, data, minlenp, is_inf);
3986 if (flags & SCF_DO_STCLASS)
3987 ssc_init_zero(pRExC_state, &accum);
3989 while (OP(scan) == code) {
3990 SSize_t deltanext, minnext, fake;
3992 regnode_ssc this_class;
3994 DEBUG_PEEP("Branch", scan, depth);
3997 StructCopy(&zero_scan_data, &data_fake, scan_data_t);
3999 data_fake.whilem_c = data->whilem_c;
4000 data_fake.last_closep = data->last_closep;
4003 data_fake.last_closep = &fake;
4005 data_fake.pos_delta = delta;
4006 next = regnext(scan);
4008 scan = NEXTOPER(scan); /* everything */
4009 if (code != BRANCH) /* everything but BRANCH */
4010 scan = NEXTOPER(scan);
4012 if (flags & SCF_DO_STCLASS) {
4013 ssc_init(pRExC_state, &this_class);
4014 data_fake.start_class = &this_class;
4015 f = SCF_DO_STCLASS_AND;
4017 if (flags & SCF_WHILEM_VISITED_POS)
4018 f |= SCF_WHILEM_VISITED_POS;
4020 /* we suppose the run is continuous, last=next...*/
4021 minnext = study_chunk(pRExC_state, &scan, minlenp,
4022 &deltanext, next, &data_fake, stopparen,
4023 recursed_depth, NULL, f,depth+1);
4027 if (deltanext == SSize_t_MAX) {
4028 is_inf = is_inf_internal = 1;
4030 } else if (max1 < minnext + deltanext)
4031 max1 = minnext + deltanext;
4033 if (data_fake.flags & (SF_HAS_PAR|SF_IN_PAR))
4035 if (data_fake.flags & SCF_SEEN_ACCEPT) {
4036 if ( stopmin > minnext)
4037 stopmin = min + min1;
4038 flags &= ~SCF_DO_SUBSTR;
4040 data->flags |= SCF_SEEN_ACCEPT;
4043 if (data_fake.flags & SF_HAS_EVAL)
4044 data->flags |= SF_HAS_EVAL;
4045 data->whilem_c = data_fake.whilem_c;
4047 if (flags & SCF_DO_STCLASS)
4048 ssc_or(pRExC_state, &accum, (regnode_charclass*)&this_class);
4050 if (code == IFTHEN && num < 2) /* Empty ELSE branch */
4052 if (flags & SCF_DO_SUBSTR) {
4053 data->pos_min += min1;
4054 if (data->pos_delta >= SSize_t_MAX - (max1 - min1))
4055 data->pos_delta = SSize_t_MAX;
4057 data->pos_delta += max1 - min1;
4058 if (max1 != min1 || is_inf)
4059 data->longest = &(data->longest_float);
4062 if (delta == SSize_t_MAX
4063 || SSize_t_MAX - delta - (max1 - min1) < 0)
4064 delta = SSize_t_MAX;
4066 delta += max1 - min1;
4067 if (flags & SCF_DO_STCLASS_OR) {
4068 ssc_or(pRExC_state, data->start_class, (regnode_charclass*) &accum);
4070 ssc_and(pRExC_state, data->start_class, (regnode_charclass *) and_withp);
4071 flags &= ~SCF_DO_STCLASS;
4074 else if (flags & SCF_DO_STCLASS_AND) {
4076 ssc_and(pRExC_state, data->start_class, (regnode_charclass *) &accum);
4077 flags &= ~SCF_DO_STCLASS;
4080 /* Switch to OR mode: cache the old value of
4081 * data->start_class */
4083 StructCopy(data->start_class, and_withp, regnode_ssc);
4084 flags &= ~SCF_DO_STCLASS_AND;
4085 StructCopy(&accum, data->start_class, regnode_ssc);
4086 flags |= SCF_DO_STCLASS_OR;
4090 if (PERL_ENABLE_TRIE_OPTIMISATION &&
4091 OP( startbranch ) == BRANCH )
4095 Assuming this was/is a branch we are dealing with: 'scan'
4096 now points at the item that follows the branch sequence,
4097 whatever it is. We now start at the beginning of the
4098 sequence and look for subsequences of
4104 which would be constructed from a pattern like
4107 If we can find such a subsequence we need to turn the first
4108 element into a trie and then add the subsequent branch exact
4109 strings to the trie.
4113 1. patterns where the whole set of branches can be
4116 2. patterns where only a subset can be converted.
4118 In case 1 we can replace the whole set with a single regop
4119 for the trie. In case 2 we need to keep the start and end
4122 'BRANCH EXACT; BRANCH EXACT; BRANCH X'
4123 becomes BRANCH TRIE; BRANCH X;
4125 There is an additional case, that being where there is a
4126 common prefix, which gets split out into an EXACT like node
4127 preceding the TRIE node.
4129 If x(1..n)==tail then we can do a simple trie, if not we make
4130 a "jump" trie, such that when we match the appropriate word
4131 we "jump" to the appropriate tail node. Essentially we turn
4132 a nested if into a case structure of sorts.
4137 if (!re_trie_maxbuff) {
4138 re_trie_maxbuff = get_sv(RE_TRIE_MAXBUF_NAME, 1);
4139 if (!SvIOK(re_trie_maxbuff))
4140 sv_setiv(re_trie_maxbuff, RE_TRIE_MAXBUF_INIT);
4142 if ( SvIV(re_trie_maxbuff)>=0 ) {
4144 regnode *first = (regnode *)NULL;
4145 regnode *last = (regnode *)NULL;
4146 regnode *tail = scan;
4150 /* var tail is used because there may be a TAIL
4151 regop in the way. Ie, the exacts will point to the
4152 thing following the TAIL, but the last branch will
4153 point at the TAIL. So we advance tail. If we
4154 have nested (?:) we may have to move through several
4158 while ( OP( tail ) == TAIL ) {
4159 /* this is the TAIL generated by (?:) */
4160 tail = regnext( tail );
4164 DEBUG_TRIE_COMPILE_r({
4165 regprop(RExC_rx, RExC_mysv, tail, NULL, pRExC_state);
4166 PerlIO_printf( Perl_debug_log, "%*s%s%s\n",
4167 (int)depth * 2 + 2, "",
4168 "Looking for TRIE'able sequences. Tail node is: ",
4169 SvPV_nolen_const( RExC_mysv )
4175 Step through the branches
4176 cur represents each branch,
4177 noper is the first thing to be matched as part
4179 noper_next is the regnext() of that node.
4181 We normally handle a case like this
4182 /FOO[xyz]|BAR[pqr]/ via a "jump trie" but we also
4183 support building with NOJUMPTRIE, which restricts
4184 the trie logic to structures like /FOO|BAR/.
4186 If noper is a trieable nodetype then the branch is
4187 a possible optimization target. If we are building
4188 under NOJUMPTRIE then we require that noper_next is
4189 the same as scan (our current position in the regex
4192 Once we have two or more consecutive such branches
4193 we can create a trie of the EXACT's contents and
4194 stitch it in place into the program.
4196 If the sequence represents all of the branches in
4197 the alternation we replace the entire thing with a
4200 Otherwise when it is a subsequence we need to
4201 stitch it in place and replace only the relevant
4202 branches. This means the first branch has to remain
4203 as it is used by the alternation logic, and its
4204 next pointer, and needs to be repointed at the item
4205 on the branch chain following the last branch we
4206 have optimized away.
4208 This could be either a BRANCH, in which case the
4209 subsequence is internal, or it could be the item
4210 following the branch sequence in which case the
4211 subsequence is at the end (which does not
4212 necessarily mean the first node is the start of the
4215 TRIE_TYPE(X) is a define which maps the optype to a
4219 ----------------+-----------
4223 EXACTFU_SS | EXACTFU
4226 EXACTFLU8 | EXACTFLU8
4230 #define TRIE_TYPE(X) ( ( NOTHING == (X) ) \
4232 : ( EXACT == (X) ) \
4234 : ( EXACTFU == (X) || EXACTFU_SS == (X) ) \
4236 : ( EXACTFA == (X) ) \
4238 : ( EXACTL == (X) ) \
4240 : ( EXACTFLU8 == (X) ) \
4244 /* dont use tail as the end marker for this traverse */
4245 for ( cur = startbranch ; cur != scan ; cur = regnext( cur ) ) {
4246 regnode * const noper = NEXTOPER( cur );
4247 U8 noper_type = OP( noper );
4248 U8 noper_trietype = TRIE_TYPE( noper_type );
4249 #if defined(DEBUGGING) || defined(NOJUMPTRIE)
4250 regnode * const noper_next = regnext( noper );
4251 U8 noper_next_type = (noper_next && noper_next != tail) ? OP(noper_next) : 0;
4252 U8 noper_next_trietype = (noper_next && noper_next != tail) ? TRIE_TYPE( noper_next_type ) :0;
4255 DEBUG_TRIE_COMPILE_r({
4256 regprop(RExC_rx, RExC_mysv, cur, NULL, pRExC_state);
4257 PerlIO_printf( Perl_debug_log, "%*s- %s (%d)",
4258 (int)depth * 2 + 2,"", SvPV_nolen_const( RExC_mysv ), REG_NODE_NUM(cur) );
4260 regprop(RExC_rx, RExC_mysv, noper, NULL, pRExC_state);
4261 PerlIO_printf( Perl_debug_log, " -> %s",
4262 SvPV_nolen_const(RExC_mysv));
4265 regprop(RExC_rx, RExC_mysv, noper_next, NULL, pRExC_state);
4266 PerlIO_printf( Perl_debug_log,"\t=> %s\t",
4267 SvPV_nolen_const(RExC_mysv));
4269 PerlIO_printf( Perl_debug_log, "(First==%d,Last==%d,Cur==%d,tt==%s,nt==%s,nnt==%s)\n",
4270 REG_NODE_NUM(first), REG_NODE_NUM(last), REG_NODE_NUM(cur),
4271 PL_reg_name[trietype], PL_reg_name[noper_trietype], PL_reg_name[noper_next_trietype]
4275 /* Is noper a trieable nodetype that can be merged
4276 * with the current trie (if there is one)? */
4280 ( noper_trietype == NOTHING)
4281 || ( trietype == NOTHING )
4282 || ( trietype == noper_trietype )
4285 && noper_next == tail
4289 /* Handle mergable triable node Either we are
4290 * the first node in a new trieable sequence,
4291 * in which case we do some bookkeeping,
4292 * otherwise we update the end pointer. */
4295 if ( noper_trietype == NOTHING ) {
4296 #if !defined(DEBUGGING) && !defined(NOJUMPTRIE)
4297 regnode * const noper_next = regnext( noper );
4298 U8 noper_next_type = (noper_next && noper_next!=tail) ? OP(noper_next) : 0;
4299 U8 noper_next_trietype = noper_next_type ? TRIE_TYPE( noper_next_type ) :0;
4302 if ( noper_next_trietype ) {
4303 trietype = noper_next_trietype;
4304 } else if (noper_next_type) {
4305 /* a NOTHING regop is 1 regop wide.
4306 * We need at least two for a trie
4307 * so we can't merge this in */
4311 trietype = noper_trietype;
4314 if ( trietype == NOTHING )
4315 trietype = noper_trietype;
4320 } /* end handle mergable triable node */
4322 /* handle unmergable node -
4323 * noper may either be a triable node which can
4324 * not be tried together with the current trie,
4325 * or a non triable node */
4327 /* If last is set and trietype is not
4328 * NOTHING then we have found at least two
4329 * triable branch sequences in a row of a
4330 * similar trietype so we can turn them
4331 * into a trie. If/when we allow NOTHING to
4332 * start a trie sequence this condition
4333 * will be required, and it isn't expensive
4334 * so we leave it in for now. */
4335 if ( trietype && trietype != NOTHING )
4336 make_trie( pRExC_state,
4337 startbranch, first, cur, tail,
4338 count, trietype, depth+1 );
4339 last = NULL; /* note: we clear/update
4340 first, trietype etc below,
4341 so we dont do it here */
4345 && noper_next == tail
4348 /* noper is triable, so we can start a new
4352 trietype = noper_trietype;
4354 /* if we already saw a first but the
4355 * current node is not triable then we have
4356 * to reset the first information. */
4361 } /* end handle unmergable node */
4362 } /* loop over branches */
4363 DEBUG_TRIE_COMPILE_r({
4364 regprop(RExC_rx, RExC_mysv, cur, NULL, pRExC_state);
4365 PerlIO_printf( Perl_debug_log,
4366 "%*s- %s (%d) <SCAN FINISHED>\n",
4368 "", SvPV_nolen_const( RExC_mysv ),REG_NODE_NUM(cur));
4371 if ( last && trietype ) {
4372 if ( trietype != NOTHING ) {
4373 /* the last branch of the sequence was part of
4374 * a trie, so we have to construct it here
4375 * outside of the loop */
4376 made= make_trie( pRExC_state, startbranch,
4377 first, scan, tail, count,
4378 trietype, depth+1 );
4379 #ifdef TRIE_STUDY_OPT
4380 if ( ((made == MADE_EXACT_TRIE &&
4381 startbranch == first)
4382 || ( first_non_open == first )) &&
4384 flags |= SCF_TRIE_RESTUDY;
4385 if ( startbranch == first
4388 RExC_seen &=~REG_TOP_LEVEL_BRANCHES_SEEN;
4393 /* at this point we know whatever we have is a
4394 * NOTHING sequence/branch AND if 'startbranch'
4395 * is 'first' then we can turn the whole thing
4398 if ( startbranch == first ) {
4400 /* the entire thing is a NOTHING sequence,
4401 * something like this: (?:|) So we can
4402 * turn it into a plain NOTHING op. */
4403 DEBUG_TRIE_COMPILE_r({
4404 regprop(RExC_rx, RExC_mysv, cur, NULL, pRExC_state);
4405 PerlIO_printf( Perl_debug_log,
4406 "%*s- %s (%d) <NOTHING BRANCH SEQUENCE>\n", (int)depth * 2 + 2,
4407 "", SvPV_nolen_const( RExC_mysv ),REG_NODE_NUM(cur));
4410 OP(startbranch)= NOTHING;
4411 NEXT_OFF(startbranch)= tail - startbranch;
4412 for ( opt= startbranch + 1; opt < tail ; opt++ )
4416 } /* end if ( last) */
4417 } /* TRIE_MAXBUF is non zero */
4422 else if ( code == BRANCHJ ) { /* single branch is optimized. */
4423 scan = NEXTOPER(NEXTOPER(scan));
4424 } else /* single branch is optimized. */
4425 scan = NEXTOPER(scan);
4427 } else if (OP(scan) == SUSPEND || OP(scan) == GOSUB || OP(scan) == GOSTART) {
4429 regnode *start = NULL;
4430 regnode *end = NULL;
4431 U32 my_recursed_depth= recursed_depth;
4434 if (OP(scan) != SUSPEND) { /* GOSUB/GOSTART */
4435 /* Do setup, note this code has side effects beyond
4436 * the rest of this block. Specifically setting
4437 * RExC_recurse[] must happen at least once during
4439 if (OP(scan) == GOSUB) {
4441 RExC_recurse[ARG2L(scan)] = scan;
4442 start = RExC_open_parens[paren-1];
4443 end = RExC_close_parens[paren-1];
4445 start = RExC_rxi->program + 1;
4448 /* NOTE we MUST always execute the above code, even
4449 * if we do nothing with a GOSUB/GOSTART */
4451 ( flags & SCF_IN_DEFINE )
4454 (is_inf_internal || is_inf || (data && data->flags & SF_IS_INF))
4456 ( (flags & (SCF_DO_STCLASS | SCF_DO_SUBSTR)) == 0 )
4459 /* no need to do anything here if we are in a define. */
4460 /* or we are after some kind of infinite construct
4461 * so we can skip recursing into this item.
4462 * Since it is infinite we will not change the maxlen
4463 * or delta, and if we miss something that might raise
4464 * the minlen it will merely pessimise a little.
4466 * Iow /(?(DEFINE)(?<foo>foo|food))a+(?&foo)/
4467 * might result in a minlen of 1 and not of 4,
4468 * but this doesn't make us mismatch, just try a bit
4469 * harder than we should.
4471 scan= regnext(scan);
4478 !PAREN_TEST(RExC_study_chunk_recursed + ((recursed_depth-1) * RExC_study_chunk_recursed_bytes), paren)
4480 /* it is quite possible that there are more efficient ways
4481 * to do this. We maintain a bitmap per level of recursion
4482 * of which patterns we have entered so we can detect if a
4483 * pattern creates a possible infinite loop. When we
4484 * recurse down a level we copy the previous levels bitmap
4485 * down. When we are at recursion level 0 we zero the top
4486 * level bitmap. It would be nice to implement a different
4487 * more efficient way of doing this. In particular the top
4488 * level bitmap may be unnecessary.
4490 if (!recursed_depth) {
4491 Zero(RExC_study_chunk_recursed, RExC_study_chunk_recursed_bytes, U8);
4493 Copy(RExC_study_chunk_recursed + ((recursed_depth-1) * RExC_study_chunk_recursed_bytes),
4494 RExC_study_chunk_recursed + (recursed_depth * RExC_study_chunk_recursed_bytes),
4495 RExC_study_chunk_recursed_bytes, U8);
4497 /* we havent recursed into this paren yet, so recurse into it */
4498 DEBUG_STUDYDATA("set:", data,depth);
4499 PAREN_SET(RExC_study_chunk_recursed + (recursed_depth * RExC_study_chunk_recursed_bytes), paren);
4500 my_recursed_depth= recursed_depth + 1;
4502 DEBUG_STUDYDATA("inf:", data,depth);
4503 /* some form of infinite recursion, assume infinite length
4505 if (flags & SCF_DO_SUBSTR) {
4506 scan_commit(pRExC_state, data, minlenp, is_inf);
4507 data->longest = &(data->longest_float);
4509 is_inf = is_inf_internal = 1;
4510 if (flags & SCF_DO_STCLASS_OR) /* Allow everything */
4511 ssc_anything(data->start_class);
4512 flags &= ~SCF_DO_STCLASS;
4514 start= NULL; /* reset start so we dont recurse later on. */
4519 end = regnext(scan);
4522 scan_frame *newframe;
4524 if (!RExC_frame_last) {
4525 Newxz(newframe, 1, scan_frame);
4526 SAVEDESTRUCTOR_X(S_unwind_scan_frames, newframe);
4527 RExC_frame_head= newframe;
4529 } else if (!RExC_frame_last->next_frame) {
4530 Newxz(newframe,1,scan_frame);
4531 RExC_frame_last->next_frame= newframe;
4532 newframe->prev_frame= RExC_frame_last;
4535 newframe= RExC_frame_last->next_frame;
4537 RExC_frame_last= newframe;
4539 newframe->next_regnode = regnext(scan);
4540 newframe->last_regnode = last;
4541 newframe->stopparen = stopparen;
4542 newframe->prev_recursed_depth = recursed_depth;
4543 newframe->this_prev_frame= frame;
4545 DEBUG_STUDYDATA("frame-new:",data,depth);
4546 DEBUG_PEEP("fnew", scan, depth);
4553 recursed_depth= my_recursed_depth;
4558 else if (OP(scan) == EXACT || OP(scan) == EXACTL) {
4559 SSize_t l = STR_LEN(scan);
4562 const U8 * const s = (U8*)STRING(scan);
4563 uc = utf8_to_uvchr_buf(s, s + l, NULL);
4564 l = utf8_length(s, s + l);
4566 uc = *((U8*)STRING(scan));
4569 if (flags & SCF_DO_SUBSTR) { /* Update longest substr. */
4570 /* The code below prefers earlier match for fixed
4571 offset, later match for variable offset. */
4572 if (data->last_end == -1) { /* Update the start info. */
4573 data->last_start_min = data->pos_min;
4574 data->last_start_max = is_inf
4575 ? SSize_t_MAX : data->pos_min + data->pos_delta;
4577 sv_catpvn(data->last_found, STRING(scan), STR_LEN(scan));
4579 SvUTF8_on(data->last_found);
4581 SV * const sv = data->last_found;
4582 MAGIC * const mg = SvUTF8(sv) && SvMAGICAL(sv) ?
4583 mg_find(sv, PERL_MAGIC_utf8) : NULL;
4584 if (mg && mg->mg_len >= 0)
4585 mg->mg_len += utf8_length((U8*)STRING(scan),
4586 (U8*)STRING(scan)+STR_LEN(scan));
4588 data->last_end = data->pos_min + l;
4589 data->pos_min += l; /* As in the first entry. */
4590 data->flags &= ~SF_BEFORE_EOL;
4593 /* ANDing the code point leaves at most it, and not in locale, and
4594 * can't match null string */
4595 if (flags & SCF_DO_STCLASS_AND) {
4596 ssc_cp_and(data->start_class, uc);
4597 ANYOF_FLAGS(data->start_class) &= ~SSC_MATCHES_EMPTY_STRING;
4598 ssc_clear_locale(data->start_class);
4600 else if (flags & SCF_DO_STCLASS_OR) {
4601 ssc_add_cp(data->start_class, uc);
4602 ssc_and(pRExC_state, data->start_class, (regnode_charclass *) and_withp);
4604 /* See commit msg 749e076fceedeb708a624933726e7989f2302f6a */
4605 ANYOF_FLAGS(data->start_class) &= ~SSC_MATCHES_EMPTY_STRING;
4607 flags &= ~SCF_DO_STCLASS;
4609 else if (PL_regkind[OP(scan)] == EXACT) {
4610 /* But OP != EXACT!, so is EXACTFish */
4611 SSize_t l = STR_LEN(scan);
4612 const U8 * s = (U8*)STRING(scan);
4614 /* Search for fixed substrings supports EXACT only. */
4615 if (flags & SCF_DO_SUBSTR) {
4617 scan_commit(pRExC_state, data, minlenp, is_inf);
4620 l = utf8_length(s, s + l);
4622 if (unfolded_multi_char) {
4623 RExC_seen |= REG_UNFOLDED_MULTI_SEEN;
4625 min += l - min_subtract;
4627 delta += min_subtract;
4628 if (flags & SCF_DO_SUBSTR) {
4629 data->pos_min += l - min_subtract;
4630 if (data->pos_min < 0) {
4633 data->pos_delta += min_subtract;
4635 data->longest = &(data->longest_float);
4639 if (flags & SCF_DO_STCLASS) {
4640 SV* EXACTF_invlist = _make_exactf_invlist(pRExC_state, scan);
4642 assert(EXACTF_invlist);
4643 if (flags & SCF_DO_STCLASS_AND) {
4644 if (OP(scan) != EXACTFL)
4645 ssc_clear_locale(data->start_class);
4646 ANYOF_FLAGS(data->start_class) &= ~SSC_MATCHES_EMPTY_STRING;
4647 ANYOF_POSIXL_ZERO(data->start_class);
4648 ssc_intersection(data->start_class, EXACTF_invlist, FALSE);
4650 else { /* SCF_DO_STCLASS_OR */
4651 ssc_union(data->start_class, EXACTF_invlist, FALSE);
4652 ssc_and(pRExC_state, data->start_class, (regnode_charclass *) and_withp);
4654 /* See commit msg 749e076fceedeb708a624933726e7989f2302f6a */
4655 ANYOF_FLAGS(data->start_class) &= ~SSC_MATCHES_EMPTY_STRING;
4657 flags &= ~SCF_DO_STCLASS;
4658 SvREFCNT_dec(EXACTF_invlist);
4661 else if (REGNODE_VARIES(OP(scan))) {
4662 SSize_t mincount, maxcount, minnext, deltanext, pos_before = 0;
4663 I32 fl = 0, f = flags;
4664 regnode * const oscan = scan;
4665 regnode_ssc this_class;
4666 regnode_ssc *oclass = NULL;
4667 I32 next_is_eval = 0;
4669 switch (PL_regkind[OP(scan)]) {
4670 case WHILEM: /* End of (?:...)* . */
4671 scan = NEXTOPER(scan);
4674 if (flags & (SCF_DO_SUBSTR | SCF_DO_STCLASS)) {
4675 next = NEXTOPER(scan);
4676 if (OP(next) == EXACT
4677 || OP(next) == EXACTL
4678 || (flags & SCF_DO_STCLASS))
4681 maxcount = REG_INFTY;
4682 next = regnext(scan);
4683 scan = NEXTOPER(scan);
4687 if (flags & SCF_DO_SUBSTR)
4692 if (flags & SCF_DO_STCLASS) {
4694 maxcount = REG_INFTY;
4695 next = regnext(scan);
4696 scan = NEXTOPER(scan);
4699 if (flags & SCF_DO_SUBSTR) {
4700 scan_commit(pRExC_state, data, minlenp, is_inf);
4701 /* Cannot extend fixed substrings */
4702 data->longest = &(data->longest_float);
4704 is_inf = is_inf_internal = 1;
4705 scan = regnext(scan);
4706 goto optimize_curly_tail;
4708 if (stopparen>0 && (OP(scan)==CURLYN || OP(scan)==CURLYM)
4709 && (scan->flags == stopparen))
4714 mincount = ARG1(scan);
4715 maxcount = ARG2(scan);
4717 next = regnext(scan);
4718 if (OP(scan) == CURLYX) {
4719 I32 lp = (data ? *(data->last_closep) : 0);
4720 scan->flags = ((lp <= (I32)U8_MAX) ? (U8)lp : U8_MAX);
4722 scan = NEXTOPER(scan) + EXTRA_STEP_2ARGS;
4723 next_is_eval = (OP(scan) == EVAL);
4725 if (flags & SCF_DO_SUBSTR) {
4727 scan_commit(pRExC_state, data, minlenp, is_inf);
4728 /* Cannot extend fixed substrings */
4729 pos_before = data->pos_min;
4733 data->flags &= ~(SF_HAS_PAR|SF_IN_PAR|SF_HAS_EVAL);
4735 data->flags |= SF_IS_INF;
4737 if (flags & SCF_DO_STCLASS) {
4738 ssc_init(pRExC_state, &this_class);
4739 oclass = data->start_class;
4740 data->start_class = &this_class;
4741 f |= SCF_DO_STCLASS_AND;
4742 f &= ~SCF_DO_STCLASS_OR;
4744 /* Exclude from super-linear cache processing any {n,m}
4745 regops for which the combination of input pos and regex
4746 pos is not enough information to determine if a match
4749 For example, in the regex /foo(bar\s*){4,8}baz/ with the
4750 regex pos at the \s*, the prospects for a match depend not
4751 only on the input position but also on how many (bar\s*)
4752 repeats into the {4,8} we are. */
4753 if ((mincount > 1) || (maxcount > 1 && maxcount != REG_INFTY))
4754 f &= ~SCF_WHILEM_VISITED_POS;
4756 /* This will finish on WHILEM, setting scan, or on NULL: */
4757 minnext = study_chunk(pRExC_state, &scan, minlenp, &deltanext,
4758 last, data, stopparen, recursed_depth, NULL,
4760 ? (f & ~SCF_DO_SUBSTR)
4764 if (flags & SCF_DO_STCLASS)
4765 data->start_class = oclass;
4766 if (mincount == 0 || minnext == 0) {
4767 if (flags & SCF_DO_STCLASS_OR) {
4768 ssc_or(pRExC_state, data->start_class, (regnode_charclass *) &this_class);
4770 else if (flags & SCF_DO_STCLASS_AND) {
4771 /* Switch to OR mode: cache the old value of
4772 * data->start_class */
4774 StructCopy(data->start_class, and_withp, regnode_ssc);
4775 flags &= ~SCF_DO_STCLASS_AND;
4776 StructCopy(&this_class, data->start_class, regnode_ssc);
4777 flags |= SCF_DO_STCLASS_OR;
4778 ANYOF_FLAGS(data->start_class)
4779 |= SSC_MATCHES_EMPTY_STRING;
4781 } else { /* Non-zero len */
4782 if (flags & SCF_DO_STCLASS_OR) {
4783 ssc_or(pRExC_state, data->start_class, (regnode_charclass *) &this_class);
4784 ssc_and(pRExC_state, data->start_class, (regnode_charclass *) and_withp);
4786 else if (flags & SCF_DO_STCLASS_AND)
4787 ssc_and(pRExC_state, data->start_class, (regnode_charclass *) &this_class);
4788 flags &= ~SCF_DO_STCLASS;
4790 if (!scan) /* It was not CURLYX, but CURLY. */
4792 if (!(flags & SCF_TRIE_DOING_RESTUDY)
4793 /* ? quantifier ok, except for (?{ ... }) */
4794 && (next_is_eval || !(mincount == 0 && maxcount == 1))
4795 && (minnext == 0) && (deltanext == 0)
4796 && data && !(data->flags & (SF_HAS_PAR|SF_IN_PAR))
4797 && maxcount <= REG_INFTY/3) /* Complement check for big
4800 /* Fatal warnings may leak the regexp without this: */
4801 SAVEFREESV(RExC_rx_sv);
4802 Perl_ck_warner(aTHX_ packWARN(WARN_REGEXP),
4803 "Quantifier unexpected on zero-length expression "
4804 "in regex m/%"UTF8f"/",
4805 UTF8fARG(UTF, RExC_end - RExC_precomp,
4807 (void)ReREFCNT_inc(RExC_rx_sv);
4810 min += minnext * mincount;
4811 is_inf_internal |= deltanext == SSize_t_MAX
4812 || (maxcount == REG_INFTY && minnext + deltanext > 0);
4813 is_inf |= is_inf_internal;
4815 delta = SSize_t_MAX;
4817 delta += (minnext + deltanext) * maxcount
4818 - minnext * mincount;
4820 /* Try powerful optimization CURLYX => CURLYN. */
4821 if ( OP(oscan) == CURLYX && data
4822 && data->flags & SF_IN_PAR
4823 && !(data->flags & SF_HAS_EVAL)
4824 && !deltanext && minnext == 1 ) {
4825 /* Try to optimize to CURLYN. */
4826 regnode *nxt = NEXTOPER(oscan) + EXTRA_STEP_2ARGS;
4827 regnode * const nxt1 = nxt;
4834 if (!REGNODE_SIMPLE(OP(nxt))
4835 && !(PL_regkind[OP(nxt)] == EXACT
4836 && STR_LEN(nxt) == 1))
4842 if (OP(nxt) != CLOSE)
4844 if (RExC_open_parens) {
4845 RExC_open_parens[ARG(nxt1)-1]=oscan; /*open->CURLYM*/
4846 RExC_close_parens[ARG(nxt1)-1]=nxt+2; /*close->while*/
4848 /* Now we know that nxt2 is the only contents: */
4849 oscan->flags = (U8)ARG(nxt);
4851 OP(nxt1) = NOTHING; /* was OPEN. */
4854 OP(nxt1 + 1) = OPTIMIZED; /* was count. */
4855 NEXT_OFF(nxt1+ 1) = 0; /* just for consistency. */
4856 NEXT_OFF(nxt2) = 0; /* just for consistency with CURLY. */
4857 OP(nxt) = OPTIMIZED; /* was CLOSE. */
4858 OP(nxt + 1) = OPTIMIZED; /* was count. */
4859 NEXT_OFF(nxt+ 1) = 0; /* just for consistency. */
4864 /* Try optimization CURLYX => CURLYM. */
4865 if ( OP(oscan) == CURLYX && data
4866 && !(data->flags & SF_HAS_PAR)
4867 && !(data->flags & SF_HAS_EVAL)
4868 && !deltanext /* atom is fixed width */
4869 && minnext != 0 /* CURLYM can't handle zero width */
4871 /* Nor characters whose fold at run-time may be
4872 * multi-character */
4873 && ! (RExC_seen & REG_UNFOLDED_MULTI_SEEN)
4875 /* XXXX How to optimize if data == 0? */
4876 /* Optimize to a simpler form. */
4877 regnode *nxt = NEXTOPER(oscan) + EXTRA_STEP_2ARGS; /* OPEN */
4881 while ( (nxt2 = regnext(nxt)) /* skip over embedded stuff*/
4882 && (OP(nxt2) != WHILEM))
4884 OP(nxt2) = SUCCEED; /* Whas WHILEM */
4885 /* Need to optimize away parenths. */
4886 if ((data->flags & SF_IN_PAR) && OP(nxt) == CLOSE) {
4887 /* Set the parenth number. */
4888 regnode *nxt1 = NEXTOPER(oscan) + EXTRA_STEP_2ARGS; /* OPEN*/
4890 oscan->flags = (U8)ARG(nxt);
4891 if (RExC_open_parens) {
4892 RExC_open_parens[ARG(nxt1)-1]=oscan; /*open->CURLYM*/
4893 RExC_close_parens[ARG(nxt1)-1]=nxt2+1; /*close->NOTHING*/
4895 OP(nxt1) = OPTIMIZED; /* was OPEN. */
4896 OP(nxt) = OPTIMIZED; /* was CLOSE. */
4899 OP(nxt1 + 1) = OPTIMIZED; /* was count. */
4900 OP(nxt + 1) = OPTIMIZED; /* was count. */
4901 NEXT_OFF(nxt1 + 1) = 0; /* just for consistency. */
4902 NEXT_OFF(nxt + 1) = 0; /* just for consistency. */
4905 while ( nxt1 && (OP(nxt1) != WHILEM)) {
4906 regnode *nnxt = regnext(nxt1);
4908 if (reg_off_by_arg[OP(nxt1)])
4909 ARG_SET(nxt1, nxt2 - nxt1);
4910 else if (nxt2 - nxt1 < U16_MAX)
4911 NEXT_OFF(nxt1) = nxt2 - nxt1;
4913 OP(nxt) = NOTHING; /* Cannot beautify */
4918 /* Optimize again: */
4919 study_chunk(pRExC_state, &nxt1, minlenp, &deltanext, nxt,
4920 NULL, stopparen, recursed_depth, NULL, 0,depth+1);
4925 else if ((OP(oscan) == CURLYX)
4926 && (flags & SCF_WHILEM_VISITED_POS)
4927 /* See the comment on a similar expression above.
4928 However, this time it's not a subexpression
4929 we care about, but the expression itself. */
4930 && (maxcount == REG_INFTY)
4931 && data && ++data->whilem_c < 16) {
4932 /* This stays as CURLYX, we can put the count/of pair. */
4933 /* Find WHILEM (as in regexec.c) */
4934 regnode *nxt = oscan + NEXT_OFF(oscan);
4936 if (OP(PREVOPER(nxt)) == NOTHING) /* LONGJMP */
4938 PREVOPER(nxt)->flags = (U8)(data->whilem_c
4939 | (RExC_whilem_seen << 4)); /* On WHILEM */
4941 if (data && fl & (SF_HAS_PAR|SF_IN_PAR))
4943 if (flags & SCF_DO_SUBSTR) {
4944 SV *last_str = NULL;
4945 STRLEN last_chrs = 0;
4946 int counted = mincount != 0;
4948 if (data->last_end > 0 && mincount != 0) { /* Ends with a
4950 SSize_t b = pos_before >= data->last_start_min
4951 ? pos_before : data->last_start_min;
4953 const char * const s = SvPV_const(data->last_found, l);
4954 SSize_t old = b - data->last_start_min;
4957 old = utf8_hop((U8*)s, old) - (U8*)s;
4959 /* Get the added string: */
4960 last_str = newSVpvn_utf8(s + old, l, UTF);
4961 last_chrs = UTF ? utf8_length((U8*)(s + old),
4962 (U8*)(s + old + l)) : l;
4963 if (deltanext == 0 && pos_before == b) {
4964 /* What was added is a constant string */
4967 SvGROW(last_str, (mincount * l) + 1);
4968 repeatcpy(SvPVX(last_str) + l,
4969 SvPVX_const(last_str), l,
4971 SvCUR_set(last_str, SvCUR(last_str) * mincount);
4972 /* Add additional parts. */
4973 SvCUR_set(data->last_found,
4974 SvCUR(data->last_found) - l);
4975 sv_catsv(data->last_found, last_str);
4977 SV * sv = data->last_found;
4979 SvUTF8(sv) && SvMAGICAL(sv) ?
4980 mg_find(sv, PERL_MAGIC_utf8) : NULL;
4981 if (mg && mg->mg_len >= 0)
4982 mg->mg_len += last_chrs * (mincount-1);
4984 last_chrs *= mincount;
4985 data->last_end += l * (mincount - 1);
4988 /* start offset must point into the last copy */
4989 data->last_start_min += minnext * (mincount - 1);
4990 data->last_start_max =
4993 : data->last_start_max +
4994 (maxcount - 1) * (minnext + data->pos_delta);
4997 /* It is counted once already... */
4998 data->pos_min += minnext * (mincount - counted);
5000 PerlIO_printf(Perl_debug_log, "counted=%"UVuf" deltanext=%"UVuf
5001 " SSize_t_MAX=%"UVuf" minnext=%"UVuf
5002 " maxcount=%"UVuf" mincount=%"UVuf"\n",
5003 (UV)counted, (UV)deltanext, (UV)SSize_t_MAX, (UV)minnext, (UV)maxcount,
5005 if (deltanext != SSize_t_MAX)
5006 PerlIO_printf(Perl_debug_log, "LHS=%"UVuf" RHS=%"UVuf"\n",
5007 (UV)(-counted * deltanext + (minnext + deltanext) * maxcount
5008 - minnext * mincount), (UV)(SSize_t_MAX - data->pos_delta));
5010 if (deltanext == SSize_t_MAX
5011 || -counted * deltanext + (minnext + deltanext) * maxcount - minnext * mincount >= SSize_t_MAX - data->pos_delta)
5012 data->pos_delta = SSize_t_MAX;
5014 data->pos_delta += - counted * deltanext +
5015 (minnext + deltanext) * maxcount - minnext * mincount;
5016 if (mincount != maxcount) {
5017 /* Cannot extend fixed substrings found inside
5019 scan_commit(pRExC_state, data, minlenp, is_inf);
5020 if (mincount && last_str) {
5021 SV * const sv = data->last_found;
5022 MAGIC * const mg = SvUTF8(sv) && SvMAGICAL(sv) ?
5023 mg_find(sv, PERL_MAGIC_utf8) : NULL;
5027 sv_setsv(sv, last_str);
5028 data->last_end = data->pos_min;
5029 data->last_start_min = data->pos_min - last_chrs;
5030 data->last_start_max = is_inf
5032 : data->pos_min + data->pos_delta - last_chrs;
5034 data->longest = &(data->longest_float);
5036 SvREFCNT_dec(last_str);
5038 if (data && (fl & SF_HAS_EVAL))
5039 data->flags |= SF_HAS_EVAL;
5040 optimize_curly_tail:
5041 if (OP(oscan) != CURLYX) {
5042 while (PL_regkind[OP(next = regnext(oscan))] == NOTHING
5044 NEXT_OFF(oscan) += NEXT_OFF(next);
5050 Perl_croak(aTHX_ "panic: unexpected varying REx opcode %d",
5055 if (flags & SCF_DO_SUBSTR) {
5056 /* Cannot expect anything... */
5057 scan_commit(pRExC_state, data, minlenp, is_inf);
5058 data->longest = &(data->longest_float);
5060 is_inf = is_inf_internal = 1;
5061 if (flags & SCF_DO_STCLASS_OR) {
5062 if (OP(scan) == CLUMP) {
5063 /* Actually is any start char, but very few code points
5064 * aren't start characters */
5065 ssc_match_all_cp(data->start_class);
5068 ssc_anything(data->start_class);
5071 flags &= ~SCF_DO_STCLASS;
5075 else if (OP(scan) == LNBREAK) {
5076 if (flags & SCF_DO_STCLASS) {
5077 if (flags & SCF_DO_STCLASS_AND) {
5078 ssc_intersection(data->start_class,
5079 PL_XPosix_ptrs[_CC_VERTSPACE], FALSE);
5080 ssc_clear_locale(data->start_class);
5081 ANYOF_FLAGS(data->start_class)
5082 &= ~SSC_MATCHES_EMPTY_STRING;
5084 else if (flags & SCF_DO_STCLASS_OR) {
5085 ssc_union(data->start_class,
5086 PL_XPosix_ptrs[_CC_VERTSPACE],
5088 ssc_and(pRExC_state, data->start_class, (regnode_charclass *) and_withp);
5090 /* See commit msg for
5091 * 749e076fceedeb708a624933726e7989f2302f6a */
5092 ANYOF_FLAGS(data->start_class)
5093 &= ~SSC_MATCHES_EMPTY_STRING;
5095 flags &= ~SCF_DO_STCLASS;
5098 if (delta != SSize_t_MAX)
5099 delta++; /* Because of the 2 char string cr-lf */
5100 if (flags & SCF_DO_SUBSTR) {
5101 /* Cannot expect anything... */
5102 scan_commit(pRExC_state, data, minlenp, is_inf);
5104 data->pos_delta += 1;
5105 data->longest = &(data->longest_float);
5108 else if (REGNODE_SIMPLE(OP(scan))) {
5110 if (flags & SCF_DO_SUBSTR) {
5111 scan_commit(pRExC_state, data, minlenp, is_inf);
5115 if (flags & SCF_DO_STCLASS) {
5117 SV* my_invlist = NULL;
5120 /* See commit msg 749e076fceedeb708a624933726e7989f2302f6a */
5121 ANYOF_FLAGS(data->start_class) &= ~SSC_MATCHES_EMPTY_STRING;
5123 /* Some of the logic below assumes that switching
5124 locale on will only add false positives. */
5129 Perl_croak(aTHX_ "panic: unexpected simple REx opcode %d",
5133 if (flags & SCF_DO_STCLASS_OR) /* Allow everything */
5134 ssc_match_all_cp(data->start_class);
5139 SV* REG_ANY_invlist = _new_invlist(2);
5140 REG_ANY_invlist = add_cp_to_invlist(REG_ANY_invlist,
5142 if (flags & SCF_DO_STCLASS_OR) {
5143 ssc_union(data->start_class,
5145 TRUE /* TRUE => invert, hence all but \n
5149 else if (flags & SCF_DO_STCLASS_AND) {
5150 ssc_intersection(data->start_class,
5152 TRUE /* TRUE => invert */
5154 ssc_clear_locale(data->start_class);
5156 SvREFCNT_dec_NN(REG_ANY_invlist);
5163 if (flags & SCF_DO_STCLASS_AND)
5164 ssc_and(pRExC_state, data->start_class,
5165 (regnode_charclass *) scan);
5167 ssc_or(pRExC_state, data->start_class,
5168 (regnode_charclass *) scan);
5176 namedclass = classnum_to_namedclass(FLAGS(scan)) + invert;
5177 if (flags & SCF_DO_STCLASS_AND) {
5178 bool was_there = cBOOL(
5179 ANYOF_POSIXL_TEST(data->start_class,
5181 ANYOF_POSIXL_ZERO(data->start_class);
5182 if (was_there) { /* Do an AND */
5183 ANYOF_POSIXL_SET(data->start_class, namedclass);
5185 /* No individual code points can now match */
5186 data->start_class->invlist
5187 = sv_2mortal(_new_invlist(0));
5190 int complement = namedclass + ((invert) ? -1 : 1);
5192 assert(flags & SCF_DO_STCLASS_OR);
5194 /* If the complement of this class was already there,
5195 * the result is that they match all code points,
5196 * (\d + \D == everything). Remove the classes from
5197 * future consideration. Locale is not relevant in
5199 if (ANYOF_POSIXL_TEST(data->start_class, complement)) {
5200 ssc_match_all_cp(data->start_class);
5201 ANYOF_POSIXL_CLEAR(data->start_class, namedclass);
5202 ANYOF_POSIXL_CLEAR(data->start_class, complement);
5204 else { /* The usual case; just add this class to the
5206 ANYOF_POSIXL_SET(data->start_class, namedclass);
5211 case NPOSIXA: /* For these, we always know the exact set of
5216 if (FLAGS(scan) == _CC_ASCII) {
5217 my_invlist = invlist_clone(PL_XPosix_ptrs[_CC_ASCII]);
5220 _invlist_intersection(PL_XPosix_ptrs[FLAGS(scan)],
5221 PL_XPosix_ptrs[_CC_ASCII],
5232 my_invlist = invlist_clone(PL_XPosix_ptrs[FLAGS(scan)]);
5234 /* NPOSIXD matches all upper Latin1 code points unless the
5235 * target string being matched is UTF-8, which is
5236 * unknowable until match time. Since we are going to
5237 * invert, we want to get rid of all of them so that the
5238 * inversion will match all */
5239 if (OP(scan) == NPOSIXD) {
5240 _invlist_subtract(my_invlist, PL_UpperLatin1,
5246 if (flags & SCF_DO_STCLASS_AND) {
5247 ssc_intersection(data->start_class, my_invlist, invert);
5248 ssc_clear_locale(data->start_class);
5251 assert(flags & SCF_DO_STCLASS_OR);
5252 ssc_union(data->start_class, my_invlist, invert);
5254 SvREFCNT_dec(my_invlist);
5256 if (flags & SCF_DO_STCLASS_OR)
5257 ssc_and(pRExC_state, data->start_class, (regnode_charclass *) and_withp);
5258 flags &= ~SCF_DO_STCLASS;
5261 else if (PL_regkind[OP(scan)] == EOL && flags & SCF_DO_SUBSTR) {
5262 data->flags |= (OP(scan) == MEOL
5265 scan_commit(pRExC_state, data, minlenp, is_inf);
5268 else if ( PL_regkind[OP(scan)] == BRANCHJ
5269 /* Lookbehind, or need to calculate parens/evals/stclass: */
5270 && (scan->flags || data || (flags & SCF_DO_STCLASS))
5271 && (OP(scan) == IFMATCH || OP(scan) == UNLESSM))
5273 if ( !PERL_ENABLE_POSITIVE_ASSERTION_STUDY
5274 || OP(scan) == UNLESSM )
5276 /* Negative Lookahead/lookbehind
5277 In this case we can't do fixed string optimisation.
5280 SSize_t deltanext, minnext, fake = 0;
5285 StructCopy(&zero_scan_data, &data_fake, scan_data_t);
5287 data_fake.whilem_c = data->whilem_c;
5288 data_fake.last_closep = data->last_closep;
5291 data_fake.last_closep = &fake;
5292 data_fake.pos_delta = delta;
5293 if ( flags & SCF_DO_STCLASS && !scan->flags
5294 && OP(scan) == IFMATCH ) { /* Lookahead */
5295 ssc_init(pRExC_state, &intrnl);
5296 data_fake.start_class = &intrnl;
5297 f |= SCF_DO_STCLASS_AND;
5299 if (flags & SCF_WHILEM_VISITED_POS)
5300 f |= SCF_WHILEM_VISITED_POS;
5301 next = regnext(scan);
5302 nscan = NEXTOPER(NEXTOPER(scan));
5303 minnext = study_chunk(pRExC_state, &nscan, minlenp, &deltanext,
5304 last, &data_fake, stopparen,
5305 recursed_depth, NULL, f, depth+1);
5308 FAIL("Variable length lookbehind not implemented");
5310 else if (minnext > (I32)U8_MAX) {
5311 FAIL2("Lookbehind longer than %"UVuf" not implemented",
5314 scan->flags = (U8)minnext;
5317 if (data_fake.flags & (SF_HAS_PAR|SF_IN_PAR))
5319 if (data_fake.flags & SF_HAS_EVAL)
5320 data->flags |= SF_HAS_EVAL;
5321 data->whilem_c = data_fake.whilem_c;
5323 if (f & SCF_DO_STCLASS_AND) {
5324 if (flags & SCF_DO_STCLASS_OR) {
5325 /* OR before, AND after: ideally we would recurse with
5326 * data_fake to get the AND applied by study of the
5327 * remainder of the pattern, and then derecurse;
5328 * *** HACK *** for now just treat as "no information".
5329 * See [perl #56690].
5331 ssc_init(pRExC_state, data->start_class);
5333 /* AND before and after: combine and continue. These
5334 * assertions are zero-length, so can match an EMPTY
5336 ssc_and(pRExC_state, data->start_class, (regnode_charclass *) &intrnl);
5337 ANYOF_FLAGS(data->start_class)
5338 |= SSC_MATCHES_EMPTY_STRING;
5342 #if PERL_ENABLE_POSITIVE_ASSERTION_STUDY
5344 /* Positive Lookahead/lookbehind
5345 In this case we can do fixed string optimisation,
5346 but we must be careful about it. Note in the case of
5347 lookbehind the positions will be offset by the minimum
5348 length of the pattern, something we won't know about
5349 until after the recurse.
5351 SSize_t deltanext, fake = 0;
5355 /* We use SAVEFREEPV so that when the full compile
5356 is finished perl will clean up the allocated
5357 minlens when it's all done. This way we don't
5358 have to worry about freeing them when we know
5359 they wont be used, which would be a pain.
5362 Newx( minnextp, 1, SSize_t );
5363 SAVEFREEPV(minnextp);
5366 StructCopy(data, &data_fake, scan_data_t);
5367 if ((flags & SCF_DO_SUBSTR) && data->last_found) {
5370 scan_commit(pRExC_state, &data_fake, minlenp, is_inf);
5371 data_fake.last_found=newSVsv(data->last_found);
5375 data_fake.last_closep = &fake;
5376 data_fake.flags = 0;
5377 data_fake.pos_delta = delta;
5379 data_fake.flags |= SF_IS_INF;
5380 if ( flags & SCF_DO_STCLASS && !scan->flags
5381 && OP(scan) == IFMATCH ) { /* Lookahead */
5382 ssc_init(pRExC_state, &intrnl);
5383 data_fake.start_class = &intrnl;
5384 f |= SCF_DO_STCLASS_AND;
5386 if (flags & SCF_WHILEM_VISITED_POS)
5387 f |= SCF_WHILEM_VISITED_POS;
5388 next = regnext(scan);
5389 nscan = NEXTOPER(NEXTOPER(scan));
5391 *minnextp = study_chunk(pRExC_state, &nscan, minnextp,
5392 &deltanext, last, &data_fake,
5393 stopparen, recursed_depth, NULL,
5397 FAIL("Variable length lookbehind not implemented");
5399 else if (*minnextp > (I32)U8_MAX) {
5400 FAIL2("Lookbehind longer than %"UVuf" not implemented",
5403 scan->flags = (U8)*minnextp;
5408 if (f & SCF_DO_STCLASS_AND) {
5409 ssc_and(pRExC_state, data->start_class, (regnode_charclass *) &intrnl);
5410 ANYOF_FLAGS(data->start_class) |= SSC_MATCHES_EMPTY_STRING;
5413 if (data_fake.flags & (SF_HAS_PAR|SF_IN_PAR))
5415 if (data_fake.flags & SF_HAS_EVAL)
5416 data->flags |= SF_HAS_EVAL;
5417 data->whilem_c = data_fake.whilem_c;
5418 if ((flags & SCF_DO_SUBSTR) && data_fake.last_found) {
5419 if (RExC_rx->minlen<*minnextp)
5420 RExC_rx->minlen=*minnextp;
5421 scan_commit(pRExC_state, &data_fake, minnextp, is_inf);
5422 SvREFCNT_dec_NN(data_fake.last_found);
5424 if ( data_fake.minlen_fixed != minlenp )
5426 data->offset_fixed= data_fake.offset_fixed;
5427 data->minlen_fixed= data_fake.minlen_fixed;
5428 data->lookbehind_fixed+= scan->flags;
5430 if ( data_fake.minlen_float != minlenp )
5432 data->minlen_float= data_fake.minlen_float;
5433 data->offset_float_min=data_fake.offset_float_min;
5434 data->offset_float_max=data_fake.offset_float_max;
5435 data->lookbehind_float+= scan->flags;
5442 else if (OP(scan) == OPEN) {
5443 if (stopparen != (I32)ARG(scan))
5446 else if (OP(scan) == CLOSE) {
5447 if (stopparen == (I32)ARG(scan)) {
5450 if ((I32)ARG(scan) == is_par) {
5451 next = regnext(scan);
5453 if ( next && (OP(next) != WHILEM) && next < last)
5454 is_par = 0; /* Disable optimization */
5457 *(data->last_closep) = ARG(scan);
5459 else if (OP(scan) == EVAL) {
5461 data->flags |= SF_HAS_EVAL;
5463 else if ( PL_regkind[OP(scan)] == ENDLIKE ) {
5464 if (flags & SCF_DO_SUBSTR) {
5465 scan_commit(pRExC_state, data, minlenp, is_inf);
5466 flags &= ~SCF_DO_SUBSTR;
5468 if (data && OP(scan)==ACCEPT) {
5469 data->flags |= SCF_SEEN_ACCEPT;
5474 else if (OP(scan) == LOGICAL && scan->flags == 2) /* Embedded follows */
5476 if (flags & SCF_DO_SUBSTR) {
5477 scan_commit(pRExC_state, data, minlenp, is_inf);
5478 data->longest = &(data->longest_float);
5480 is_inf = is_inf_internal = 1;
5481 if (flags & SCF_DO_STCLASS_OR) /* Allow everything */
5482 ssc_anything(data->start_class);
5483 flags &= ~SCF_DO_STCLASS;
5485 else if (OP(scan) == GPOS) {
5486 if (!(RExC_rx->intflags & PREGf_GPOS_FLOAT) &&
5487 !(delta || is_inf || (data && data->pos_delta)))
5489 if (!(RExC_rx->intflags & PREGf_ANCH) && (flags & SCF_DO_SUBSTR))
5490 RExC_rx->intflags |= PREGf_ANCH_GPOS;
5491 if (RExC_rx->gofs < (STRLEN)min)
5492 RExC_rx->gofs = min;
5494 RExC_rx->intflags |= PREGf_GPOS_FLOAT;
5498 #ifdef TRIE_STUDY_OPT
5499 #ifdef FULL_TRIE_STUDY
5500 else if (PL_regkind[OP(scan)] == TRIE) {
5501 /* NOTE - There is similar code to this block above for handling
5502 BRANCH nodes on the initial study. If you change stuff here
5504 regnode *trie_node= scan;
5505 regnode *tail= regnext(scan);
5506 reg_trie_data *trie = (reg_trie_data*)RExC_rxi->data->data[ ARG(scan) ];
5507 SSize_t max1 = 0, min1 = SSize_t_MAX;
5510 if (flags & SCF_DO_SUBSTR) { /* XXXX Add !SUSPEND? */
5511 /* Cannot merge strings after this. */
5512 scan_commit(pRExC_state, data, minlenp, is_inf);
5514 if (flags & SCF_DO_STCLASS)
5515 ssc_init_zero(pRExC_state, &accum);
5521 const regnode *nextbranch= NULL;
5524 for ( word=1 ; word <= trie->wordcount ; word++)
5526 SSize_t deltanext=0, minnext=0, f = 0, fake;
5527 regnode_ssc this_class;
5529 StructCopy(&zero_scan_data, &data_fake, scan_data_t);
5531 data_fake.whilem_c = data->whilem_c;
5532 data_fake.last_closep = data->last_closep;
5535 data_fake.last_closep = &fake;
5536 data_fake.pos_delta = delta;
5537 if (flags & SCF_DO_STCLASS) {
5538 ssc_init(pRExC_state, &this_class);
5539 data_fake.start_class = &this_class;
5540 f = SCF_DO_STCLASS_AND;
5542 if (flags & SCF_WHILEM_VISITED_POS)
5543 f |= SCF_WHILEM_VISITED_POS;
5545 if (trie->jump[word]) {
5547 nextbranch = trie_node + trie->jump[0];
5548 scan= trie_node + trie->jump[word];
5549 /* We go from the jump point to the branch that follows
5550 it. Note this means we need the vestigal unused
5551 branches even though they arent otherwise used. */
5552 minnext = study_chunk(pRExC_state, &scan, minlenp,
5553 &deltanext, (regnode *)nextbranch, &data_fake,
5554 stopparen, recursed_depth, NULL, f,depth+1);
5556 if (nextbranch && PL_regkind[OP(nextbranch)]==BRANCH)
5557 nextbranch= regnext((regnode*)nextbranch);
5559 if (min1 > (SSize_t)(minnext + trie->minlen))
5560 min1 = minnext + trie->minlen;
5561 if (deltanext == SSize_t_MAX) {
5562 is_inf = is_inf_internal = 1;
5564 } else if (max1 < (SSize_t)(minnext + deltanext + trie->maxlen))
5565 max1 = minnext + deltanext + trie->maxlen;
5567 if (data_fake.flags & (SF_HAS_PAR|SF_IN_PAR))
5569 if (data_fake.flags & SCF_SEEN_ACCEPT) {
5570 if ( stopmin > min + min1)
5571 stopmin = min + min1;
5572 flags &= ~SCF_DO_SUBSTR;
5574 data->flags |= SCF_SEEN_ACCEPT;
5577 if (data_fake.flags & SF_HAS_EVAL)
5578 data->flags |= SF_HAS_EVAL;
5579 data->whilem_c = data_fake.whilem_c;
5581 if (flags & SCF_DO_STCLASS)
5582 ssc_or(pRExC_state, &accum, (regnode_charclass *) &this_class);
5585 if (flags & SCF_DO_SUBSTR) {
5586 data->pos_min += min1;
5587 data->pos_delta += max1 - min1;
5588 if (max1 != min1 || is_inf)
5589 data->longest = &(data->longest_float);
5592 if (delta != SSize_t_MAX)
5593 delta += max1 - min1;
5594 if (flags & SCF_DO_STCLASS_OR) {
5595 ssc_or(pRExC_state, data->start_class, (regnode_charclass *) &accum);
5597 ssc_and(pRExC_state, data->start_class, (regnode_charclass *) and_withp);
5598 flags &= ~SCF_DO_STCLASS;
5601 else if (flags & SCF_DO_STCLASS_AND) {
5603 ssc_and(pRExC_state, data->start_class, (regnode_charclass *) &accum);
5604 flags &= ~SCF_DO_STCLASS;
5607 /* Switch to OR mode: cache the old value of
5608 * data->start_class */
5610 StructCopy(data->start_class, and_withp, regnode_ssc);
5611 flags &= ~SCF_DO_STCLASS_AND;
5612 StructCopy(&accum, data->start_class, regnode_ssc);
5613 flags |= SCF_DO_STCLASS_OR;
5620 else if (PL_regkind[OP(scan)] == TRIE) {
5621 reg_trie_data *trie = (reg_trie_data*)RExC_rxi->data->data[ ARG(scan) ];
5624 min += trie->minlen;
5625 delta += (trie->maxlen - trie->minlen);
5626 flags &= ~SCF_DO_STCLASS; /* xxx */
5627 if (flags & SCF_DO_SUBSTR) {
5628 /* Cannot expect anything... */
5629 scan_commit(pRExC_state, data, minlenp, is_inf);
5630 data->pos_min += trie->minlen;
5631 data->pos_delta += (trie->maxlen - trie->minlen);
5632 if (trie->maxlen != trie->minlen)
5633 data->longest = &(data->longest_float);
5635 if (trie->jump) /* no more substrings -- for now /grr*/
5636 flags &= ~SCF_DO_SUBSTR;
5638 #endif /* old or new */
5639 #endif /* TRIE_STUDY_OPT */
5641 /* Else: zero-length, ignore. */
5642 scan = regnext(scan);
5644 /* If we are exiting a recursion we can unset its recursed bit
5645 * and allow ourselves to enter it again - no danger of an
5646 * infinite loop there.
5647 if (stopparen > -1 && recursed) {
5648 DEBUG_STUDYDATA("unset:", data,depth);
5649 PAREN_UNSET( recursed, stopparen);
5655 DEBUG_STUDYDATA("frame-end:",data,depth);
5656 DEBUG_PEEP("fend", scan, depth);
5658 /* restore previous context */
5659 last = frame->last_regnode;
5660 scan = frame->next_regnode;
5661 stopparen = frame->stopparen;
5662 recursed_depth = frame->prev_recursed_depth;
5664 RExC_frame_last = frame->prev_frame;
5665 frame = frame->this_prev_frame;
5666 goto fake_study_recurse;
5671 DEBUG_STUDYDATA("pre-fin:",data,depth);
5674 *deltap = is_inf_internal ? SSize_t_MAX : delta;
5676 if (flags & SCF_DO_SUBSTR && is_inf)
5677 data->pos_delta = SSize_t_MAX - data->pos_min;
5678 if (is_par > (I32)U8_MAX)
5680 if (is_par && pars==1 && data) {
5681 data->flags |= SF_IN_PAR;
5682 data->flags &= ~SF_HAS_PAR;
5684 else if (pars && data) {
5685 data->flags |= SF_HAS_PAR;
5686 data->flags &= ~SF_IN_PAR;
5688 if (flags & SCF_DO_STCLASS_OR)
5689 ssc_and(pRExC_state, data->start_class, (regnode_charclass *) and_withp);
5690 if (flags & SCF_TRIE_RESTUDY)
5691 data->flags |= SCF_TRIE_RESTUDY;
5693 DEBUG_STUDYDATA("post-fin:",data,depth);
5696 SSize_t final_minlen= min < stopmin ? min : stopmin;
5698 if (!(RExC_seen & REG_UNBOUNDED_QUANTIFIER_SEEN)) {
5699 if (final_minlen > SSize_t_MAX - delta)
5700 RExC_maxlen = SSize_t_MAX;
5701 else if (RExC_maxlen < final_minlen + delta)
5702 RExC_maxlen = final_minlen + delta;
5704 return final_minlen;
5706 NOT_REACHED; /* NOTREACHED */
5710 S_add_data(RExC_state_t* const pRExC_state, const char* const s, const U32 n)
5712 U32 count = RExC_rxi->data ? RExC_rxi->data->count : 0;
5714 PERL_ARGS_ASSERT_ADD_DATA;
5716 Renewc(RExC_rxi->data,
5717 sizeof(*RExC_rxi->data) + sizeof(void*) * (count + n - 1),
5718 char, struct reg_data);
5720 Renew(RExC_rxi->data->what, count + n, U8);
5722 Newx(RExC_rxi->data->what, n, U8);
5723 RExC_rxi->data->count = count + n;
5724 Copy(s, RExC_rxi->data->what + count, n, U8);
5728 /*XXX: todo make this not included in a non debugging perl, but appears to be
5729 * used anyway there, in 'use re' */
5730 #ifndef PERL_IN_XSUB_RE
5732 Perl_reginitcolors(pTHX)
5734 const char * const s = PerlEnv_getenv("PERL_RE_COLORS");
5736 char *t = savepv(s);
5740 t = strchr(t, '\t');
5746 PL_colors[i] = t = (char *)"";
5751 PL_colors[i++] = (char *)"";
5758 #ifdef TRIE_STUDY_OPT
5759 #define CHECK_RESTUDY_GOTO_butfirst(dOsomething) \
5762 (data.flags & SCF_TRIE_RESTUDY) \
5770 #define CHECK_RESTUDY_GOTO_butfirst
5774 * pregcomp - compile a regular expression into internal code
5776 * Decides which engine's compiler to call based on the hint currently in
5780 #ifndef PERL_IN_XSUB_RE
5782 /* return the currently in-scope regex engine (or the default if none) */
5784 regexp_engine const *
5785 Perl_current_re_engine(pTHX)
5787 if (IN_PERL_COMPILETIME) {
5788 HV * const table = GvHV(PL_hintgv);
5791 if (!table || !(PL_hints & HINT_LOCALIZE_HH))
5792 return &PL_core_reg_engine;
5793 ptr = hv_fetchs(table, "regcomp", FALSE);
5794 if ( !(ptr && SvIOK(*ptr) && SvIV(*ptr)))
5795 return &PL_core_reg_engine;
5796 return INT2PTR(regexp_engine*,SvIV(*ptr));
5800 if (!PL_curcop->cop_hints_hash)
5801 return &PL_core_reg_engine;
5802 ptr = cop_hints_fetch_pvs(PL_curcop, "regcomp", 0);
5803 if ( !(ptr && SvIOK(ptr) && SvIV(ptr)))
5804 return &PL_core_reg_engine;
5805 return INT2PTR(regexp_engine*,SvIV(ptr));
5811 Perl_pregcomp(pTHX_ SV * const pattern, const U32 flags)
5813 regexp_engine const *eng = current_re_engine();
5814 GET_RE_DEBUG_FLAGS_DECL;
5816 PERL_ARGS_ASSERT_PREGCOMP;
5818 /* Dispatch a request to compile a regexp to correct regexp engine. */
5820 PerlIO_printf(Perl_debug_log, "Using engine %"UVxf"\n",
5823 return CALLREGCOMP_ENG(eng, pattern, flags);
5827 /* public(ish) entry point for the perl core's own regex compiling code.
5828 * It's actually a wrapper for Perl_re_op_compile that only takes an SV
5829 * pattern rather than a list of OPs, and uses the internal engine rather
5830 * than the current one */
5833 Perl_re_compile(pTHX_ SV * const pattern, U32 rx_flags)
5835 SV *pat = pattern; /* defeat constness! */
5836 PERL_ARGS_ASSERT_RE_COMPILE;
5837 return Perl_re_op_compile(aTHX_ &pat, 1, NULL,
5838 #ifdef PERL_IN_XSUB_RE
5841 &PL_core_reg_engine,
5843 NULL, NULL, rx_flags, 0);
5847 /* upgrade pattern pat_p of length plen_p to UTF8, and if there are code
5848 * blocks, recalculate the indices. Update pat_p and plen_p in-place to
5849 * point to the realloced string and length.
5851 * This is essentially a copy of Perl_bytes_to_utf8() with the code index
5855 S_pat_upgrade_to_utf8(pTHX_ RExC_state_t * const pRExC_state,
5856 char **pat_p, STRLEN *plen_p, int num_code_blocks)
5858 U8 *const src = (U8*)*pat_p;
5863 GET_RE_DEBUG_FLAGS_DECL;
5865 DEBUG_PARSE_r(PerlIO_printf(Perl_debug_log,
5866 "UTF8 mismatch! Converting to utf8 for resizing and compile\n"));
5868 Newx(dst, *plen_p * 2 + 1, U8);
5871 while (s < *plen_p) {
5872 append_utf8_from_native_byte(src[s], &d);
5873 if (n < num_code_blocks) {
5874 if (!do_end && pRExC_state->code_blocks[n].start == s) {
5875 pRExC_state->code_blocks[n].start = d - dst - 1;
5876 assert(*(d - 1) == '(');
5879 else if (do_end && pRExC_state->code_blocks[n].end == s) {
5880 pRExC_state->code_blocks[n].end = d - dst - 1;
5881 assert(*(d - 1) == ')');
5890 *pat_p = (char*) dst;
5892 RExC_orig_utf8 = RExC_utf8 = 1;
5897 /* S_concat_pat(): concatenate a list of args to the pattern string pat,
5898 * while recording any code block indices, and handling overloading,
5899 * nested qr// objects etc. If pat is null, it will allocate a new
5900 * string, or just return the first arg, if there's only one.
5902 * Returns the malloced/updated pat.
5903 * patternp and pat_count is the array of SVs to be concatted;
5904 * oplist is the optional list of ops that generated the SVs;
5905 * recompile_p is a pointer to a boolean that will be set if
5906 * the regex will need to be recompiled.
5907 * delim, if non-null is an SV that will be inserted between each element
5911 S_concat_pat(pTHX_ RExC_state_t * const pRExC_state,
5912 SV *pat, SV ** const patternp, int pat_count,
5913 OP *oplist, bool *recompile_p, SV *delim)
5917 bool use_delim = FALSE;
5918 bool alloced = FALSE;
5920 /* if we know we have at least two args, create an empty string,
5921 * then concatenate args to that. For no args, return an empty string */
5922 if (!pat && pat_count != 1) {
5928 for (svp = patternp; svp < patternp + pat_count; svp++) {
5931 STRLEN orig_patlen = 0;
5933 SV *msv = use_delim ? delim : *svp;
5934 if (!msv) msv = &PL_sv_undef;
5936 /* if we've got a delimiter, we go round the loop twice for each
5937 * svp slot (except the last), using the delimiter the second
5946 if (SvTYPE(msv) == SVt_PVAV) {
5947 /* we've encountered an interpolated array within
5948 * the pattern, e.g. /...@a..../. Expand the list of elements,
5949 * then recursively append elements.
5950 * The code in this block is based on S_pushav() */
5952 AV *const av = (AV*)msv;
5953 const SSize_t maxarg = AvFILL(av) + 1;
5957 assert(oplist->op_type == OP_PADAV
5958 || oplist->op_type == OP_RV2AV);
5959 oplist = OpSIBLING(oplist);
5962 if (SvRMAGICAL(av)) {
5965 Newx(array, maxarg, SV*);
5967 for (i=0; i < maxarg; i++) {
5968 SV ** const svp = av_fetch(av, i, FALSE);
5969 array[i] = svp ? *svp : &PL_sv_undef;
5973 array = AvARRAY(av);
5975 pat = S_concat_pat(aTHX_ pRExC_state, pat,
5976 array, maxarg, NULL, recompile_p,
5978 GvSV((gv_fetchpvs("\"", GV_ADDMULTI, SVt_PV))));
5984 /* we make the assumption here that each op in the list of
5985 * op_siblings maps to one SV pushed onto the stack,
5986 * except for code blocks, with have both an OP_NULL and
5988 * This allows us to match up the list of SVs against the
5989 * list of OPs to find the next code block.
5991 * Note that PUSHMARK PADSV PADSV ..
5993 * PADRANGE PADSV PADSV ..
5994 * so the alignment still works. */
5997 if (oplist->op_type == OP_NULL
5998 && (oplist->op_flags & OPf_SPECIAL))
6000 assert(n < pRExC_state->num_code_blocks);
6001 pRExC_state->code_blocks[n].start = pat ? SvCUR(pat) : 0;
6002 pRExC_state->code_blocks[n].block = oplist;
6003 pRExC_state->code_blocks[n].src_regex = NULL;
6006 oplist = OpSIBLING(oplist); /* skip CONST */
6009 oplist = OpSIBLING(oplist);;
6012 /* apply magic and QR overloading to arg */
6015 if (SvROK(msv) && SvAMAGIC(msv)) {
6016 SV *sv = AMG_CALLunary(msv, regexp_amg);
6020 if (SvTYPE(sv) != SVt_REGEXP)
6021 Perl_croak(aTHX_ "Overloaded qr did not return a REGEXP");
6026 /* try concatenation overload ... */
6027 if (pat && (SvAMAGIC(pat) || SvAMAGIC(msv)) &&
6028 (sv = amagic_call(pat, msv, concat_amg, AMGf_assign)))
6031 /* overloading involved: all bets are off over literal
6032 * code. Pretend we haven't seen it */
6033 pRExC_state->num_code_blocks -= n;
6037 /* ... or failing that, try "" overload */
6038 while (SvAMAGIC(msv)
6039 && (sv = AMG_CALLunary(msv, string_amg))
6043 && SvRV(msv) == SvRV(sv))
6048 if (SvROK(msv) && SvTYPE(SvRV(msv)) == SVt_REGEXP)
6052 /* this is a partially unrolled
6053 * sv_catsv_nomg(pat, msv);
6054 * that allows us to adjust code block indices if
6057 char *dst = SvPV_force_nomg(pat, dlen);
6059 if (SvUTF8(msv) && !SvUTF8(pat)) {
6060 S_pat_upgrade_to_utf8(aTHX_ pRExC_state, &dst, &dlen, n);
6061 sv_setpvn(pat, dst, dlen);
6064 sv_catsv_nomg(pat, msv);
6071 pRExC_state->code_blocks[n-1].end = SvCUR(pat)-1;
6074 /* extract any code blocks within any embedded qr//'s */
6075 if (rx && SvTYPE(rx) == SVt_REGEXP
6076 && RX_ENGINE((REGEXP*)rx)->op_comp)
6079 RXi_GET_DECL(ReANY((REGEXP *)rx), ri);
6080 if (ri->num_code_blocks) {
6082 /* the presence of an embedded qr// with code means
6083 * we should always recompile: the text of the
6084 * qr// may not have changed, but it may be a
6085 * different closure than last time */
6087 Renew(pRExC_state->code_blocks,
6088 pRExC_state->num_code_blocks + ri->num_code_blocks,
6089 struct reg_code_block);
6090 pRExC_state->num_code_blocks += ri->num_code_blocks;
6092 for (i=0; i < ri->num_code_blocks; i++) {
6093 struct reg_code_block *src, *dst;
6094 STRLEN offset = orig_patlen
6095 + ReANY((REGEXP *)rx)->pre_prefix;
6096 assert(n < pRExC_state->num_code_blocks);
6097 src = &ri->code_blocks[i];
6098 dst = &pRExC_state->code_blocks[n];
6099 dst->start = src->start + offset;
6100 dst->end = src->end + offset;
6101 dst->block = src->block;
6102 dst->src_regex = (REGEXP*) SvREFCNT_inc( (SV*)
6111 /* avoid calling magic multiple times on a single element e.g. =~ $qr */
6120 /* see if there are any run-time code blocks in the pattern.
6121 * False positives are allowed */
6124 S_has_runtime_code(pTHX_ RExC_state_t * const pRExC_state,
6125 char *pat, STRLEN plen)
6130 PERL_UNUSED_CONTEXT;
6132 for (s = 0; s < plen; s++) {
6133 if (n < pRExC_state->num_code_blocks
6134 && s == pRExC_state->code_blocks[n].start)
6136 s = pRExC_state->code_blocks[n].end;
6140 /* TODO ideally should handle [..], (#..), /#.../x to reduce false
6142 if (pat[s] == '(' && s+2 <= plen && pat[s+1] == '?' &&
6144 || (s + 2 <= plen && pat[s+2] == '?' && pat[s+3] == '{'))
6151 /* Handle run-time code blocks. We will already have compiled any direct
6152 * or indirect literal code blocks. Now, take the pattern 'pat' and make a
6153 * copy of it, but with any literal code blocks blanked out and
6154 * appropriate chars escaped; then feed it into
6156 * eval "qr'modified_pattern'"
6160 * a\bc(?{"this was literal"})def'ghi\\jkl(?{"this is runtime"})mno
6164 * qr'a\\bc_______________________def\'ghi\\\\jkl(?{"this is runtime"})mno'
6166 * After eval_sv()-ing that, grab any new code blocks from the returned qr
6167 * and merge them with any code blocks of the original regexp.
6169 * If the pat is non-UTF8, while the evalled qr is UTF8, don't merge;
6170 * instead, just save the qr and return FALSE; this tells our caller that
6171 * the original pattern needs upgrading to utf8.
6175 S_compile_runtime_code(pTHX_ RExC_state_t * const pRExC_state,
6176 char *pat, STRLEN plen)
6180 GET_RE_DEBUG_FLAGS_DECL;
6182 if (pRExC_state->runtime_code_qr) {
6183 /* this is the second time we've been called; this should
6184 * only happen if the main pattern got upgraded to utf8
6185 * during compilation; re-use the qr we compiled first time
6186 * round (which should be utf8 too)
6188 qr = pRExC_state->runtime_code_qr;
6189 pRExC_state->runtime_code_qr = NULL;
6190 assert(RExC_utf8 && SvUTF8(qr));
6196 int newlen = plen + 6; /* allow for "qr''x\0" extra chars */
6200 /* determine how many extra chars we need for ' and \ escaping */
6201 for (s = 0; s < plen; s++) {
6202 if (pat[s] == '\'' || pat[s] == '\\')
6206 Newx(newpat, newlen, char);
6208 *p++ = 'q'; *p++ = 'r'; *p++ = '\'';
6210 for (s = 0; s < plen; s++) {
6211 if (n < pRExC_state->num_code_blocks
6212 && s == pRExC_state->code_blocks[n].start)
6214 /* blank out literal code block */
6215 assert(pat[s] == '(');
6216 while (s <= pRExC_state->code_blocks[n].end) {
6224 if (pat[s] == '\'' || pat[s] == '\\')
6229 if (pRExC_state->pm_flags & RXf_PMf_EXTENDED)
6233 PerlIO_printf(Perl_debug_log,
6234 "%sre-parsing pattern for runtime code:%s %s\n",
6235 PL_colors[4],PL_colors[5],newpat);
6238 sv = newSVpvn_flags(newpat, p-newpat-1, RExC_utf8 ? SVf_UTF8 : 0);
6244 PUSHSTACKi(PERLSI_REQUIRE);
6245 /* G_RE_REPARSING causes the toker to collapse \\ into \ when
6246 * parsing qr''; normally only q'' does this. It also alters
6248 eval_sv(sv, G_SCALAR|G_RE_REPARSING);
6249 SvREFCNT_dec_NN(sv);
6254 SV * const errsv = ERRSV;
6255 if (SvTRUE_NN(errsv))
6257 Safefree(pRExC_state->code_blocks);
6258 /* use croak_sv ? */
6259 Perl_croak_nocontext("%"SVf, SVfARG(errsv));
6262 assert(SvROK(qr_ref));
6264 assert(SvTYPE(qr) == SVt_REGEXP && RX_ENGINE((REGEXP*)qr)->op_comp);
6265 /* the leaving below frees the tmp qr_ref.
6266 * Give qr a life of its own */
6274 if (!RExC_utf8 && SvUTF8(qr)) {
6275 /* first time through; the pattern got upgraded; save the
6276 * qr for the next time through */
6277 assert(!pRExC_state->runtime_code_qr);
6278 pRExC_state->runtime_code_qr = qr;
6283 /* extract any code blocks within the returned qr// */
6286 /* merge the main (r1) and run-time (r2) code blocks into one */
6288 RXi_GET_DECL(ReANY((REGEXP *)qr), r2);
6289 struct reg_code_block *new_block, *dst;
6290 RExC_state_t * const r1 = pRExC_state; /* convenient alias */
6293 if (!r2->num_code_blocks) /* we guessed wrong */
6295 SvREFCNT_dec_NN(qr);
6300 r1->num_code_blocks + r2->num_code_blocks,
6301 struct reg_code_block);
6304 while ( i1 < r1->num_code_blocks
6305 || i2 < r2->num_code_blocks)
6307 struct reg_code_block *src;
6310 if (i1 == r1->num_code_blocks) {
6311 src = &r2->code_blocks[i2++];
6314 else if (i2 == r2->num_code_blocks)
6315 src = &r1->code_blocks[i1++];
6316 else if ( r1->code_blocks[i1].start
6317 < r2->code_blocks[i2].start)
6319 src = &r1->code_blocks[i1++];
6320 assert(src->end < r2->code_blocks[i2].start);
6323 assert( r1->code_blocks[i1].start
6324 > r2->code_blocks[i2].start);
6325 src = &r2->code_blocks[i2++];
6327 assert(src->end < r1->code_blocks[i1].start);
6330 assert(pat[src->start] == '(');
6331 assert(pat[src->end] == ')');
6332 dst->start = src->start;
6333 dst->end = src->end;
6334 dst->block = src->block;
6335 dst->src_regex = is_qr ? (REGEXP*) SvREFCNT_inc( (SV*) qr)
6339 r1->num_code_blocks += r2->num_code_blocks;
6340 Safefree(r1->code_blocks);
6341 r1->code_blocks = new_block;
6344 SvREFCNT_dec_NN(qr);
6350 S_setup_longest(pTHX_ RExC_state_t *pRExC_state, SV* sv_longest,
6351 SV** rx_utf8, SV** rx_substr, SSize_t* rx_end_shift,
6352 SSize_t lookbehind, SSize_t offset, SSize_t *minlen,
6353 STRLEN longest_length, bool eol, bool meol)
6355 /* This is the common code for setting up the floating and fixed length
6356 * string data extracted from Perl_re_op_compile() below. Returns a boolean
6357 * as to whether succeeded or not */
6362 if (! (longest_length
6363 || (eol /* Can't have SEOL and MULTI */
6364 && (! meol || (RExC_flags & RXf_PMf_MULTILINE)))
6366 /* See comments for join_exact for why REG_UNFOLDED_MULTI_SEEN */
6367 || (RExC_seen & REG_UNFOLDED_MULTI_SEEN))
6372 /* copy the information about the longest from the reg_scan_data
6373 over to the program. */
6374 if (SvUTF8(sv_longest)) {
6375 *rx_utf8 = sv_longest;
6378 *rx_substr = sv_longest;
6381 /* end_shift is how many chars that must be matched that
6382 follow this item. We calculate it ahead of time as once the
6383 lookbehind offset is added in we lose the ability to correctly
6385 ml = minlen ? *(minlen) : (SSize_t)longest_length;
6386 *rx_end_shift = ml - offset
6387 - longest_length + (SvTAIL(sv_longest) != 0)
6390 t = (eol/* Can't have SEOL and MULTI */
6391 && (! meol || (RExC_flags & RXf_PMf_MULTILINE)));
6392 fbm_compile(sv_longest, t ? FBMcf_TAIL : 0);
6398 * Perl_re_op_compile - the perl internal RE engine's function to compile a
6399 * regular expression into internal code.
6400 * The pattern may be passed either as:
6401 * a list of SVs (patternp plus pat_count)
6402 * a list of OPs (expr)
6403 * If both are passed, the SV list is used, but the OP list indicates
6404 * which SVs are actually pre-compiled code blocks
6406 * The SVs in the list have magic and qr overloading applied to them (and
6407 * the list may be modified in-place with replacement SVs in the latter
6410 * If the pattern hasn't changed from old_re, then old_re will be
6413 * eng is the current engine. If that engine has an op_comp method, then
6414 * handle directly (i.e. we assume that op_comp was us); otherwise, just
6415 * do the initial concatenation of arguments and pass on to the external
6418 * If is_bare_re is not null, set it to a boolean indicating whether the
6419 * arg list reduced (after overloading) to a single bare regex which has
6420 * been returned (i.e. /$qr/).
6422 * orig_rx_flags contains RXf_* flags. See perlreapi.pod for more details.
6424 * pm_flags contains the PMf_* flags, typically based on those from the
6425 * pm_flags field of the related PMOP. Currently we're only interested in
6426 * PMf_HAS_CV, PMf_IS_QR, PMf_USE_RE_EVAL.
6428 * We can't allocate space until we know how big the compiled form will be,
6429 * but we can't compile it (and thus know how big it is) until we've got a
6430 * place to put the code. So we cheat: we compile it twice, once with code
6431 * generation turned off and size counting turned on, and once "for real".
6432 * This also means that we don't allocate space until we are sure that the
6433 * thing really will compile successfully, and we never have to move the
6434 * code and thus invalidate pointers into it. (Note that it has to be in
6435 * one piece because free() must be able to free it all.) [NB: not true in perl]
6437 * Beware that the optimization-preparation code in here knows about some
6438 * of the structure of the compiled regexp. [I'll say.]
6442 Perl_re_op_compile(pTHX_ SV ** const patternp, int pat_count,
6443 OP *expr, const regexp_engine* eng, REGEXP *old_re,
6444 bool *is_bare_re, U32 orig_rx_flags, U32 pm_flags)
6448 regexp_internal *ri;
6456 SV *code_blocksv = NULL;
6457 SV** new_patternp = patternp;
6459 /* these are all flags - maybe they should be turned
6460 * into a single int with different bit masks */
6461 I32 sawlookahead = 0;
6466 regex_charset initial_charset = get_regex_charset(orig_rx_flags);
6468 bool runtime_code = 0;
6470 RExC_state_t RExC_state;
6471 RExC_state_t * const pRExC_state = &RExC_state;
6472 #ifdef TRIE_STUDY_OPT
6474 RExC_state_t copyRExC_state;
6476 GET_RE_DEBUG_FLAGS_DECL;
6478 PERL_ARGS_ASSERT_RE_OP_COMPILE;
6480 DEBUG_r(if (!PL_colorset) reginitcolors());
6482 /* Initialize these here instead of as-needed, as is quick and avoids
6483 * having to test them each time otherwise */
6484 if (! PL_AboveLatin1) {
6485 PL_AboveLatin1 = _new_invlist_C_array(AboveLatin1_invlist);
6486 PL_Latin1 = _new_invlist_C_array(Latin1_invlist);
6487 PL_UpperLatin1 = _new_invlist_C_array(UpperLatin1_invlist);
6488 PL_utf8_foldable = _new_invlist_C_array(_Perl_Any_Folds_invlist);
6489 PL_HasMultiCharFold =
6490 _new_invlist_C_array(_Perl_Folds_To_Multi_Char_invlist);
6492 /* This is calculated here, because the Perl program that generates the
6493 * static global ones doesn't currently have access to
6494 * NUM_ANYOF_CODE_POINTS */
6495 PL_InBitmap = _new_invlist(2);
6496 PL_InBitmap = _add_range_to_invlist(PL_InBitmap, 0,
6497 NUM_ANYOF_CODE_POINTS - 1);
6500 pRExC_state->code_blocks = NULL;
6501 pRExC_state->num_code_blocks = 0;
6504 *is_bare_re = FALSE;
6506 if (expr && (expr->op_type == OP_LIST ||
6507 (expr->op_type == OP_NULL && expr->op_targ == OP_LIST))) {
6508 /* allocate code_blocks if needed */
6512 for (o = cLISTOPx(expr)->op_first; o; o = OpSIBLING(o))
6513 if (o->op_type == OP_NULL && (o->op_flags & OPf_SPECIAL))
6514 ncode++; /* count of DO blocks */
6516 pRExC_state->num_code_blocks = ncode;
6517 Newx(pRExC_state->code_blocks, ncode, struct reg_code_block);
6522 /* compile-time pattern with just OP_CONSTs and DO blocks */
6527 /* find how many CONSTs there are */
6530 if (expr->op_type == OP_CONST)
6533 for (o = cLISTOPx(expr)->op_first; o; o = OpSIBLING(o)) {
6534 if (o->op_type == OP_CONST)
6538 /* fake up an SV array */
6540 assert(!new_patternp);
6541 Newx(new_patternp, n, SV*);
6542 SAVEFREEPV(new_patternp);
6546 if (expr->op_type == OP_CONST)
6547 new_patternp[n] = cSVOPx_sv(expr);
6549 for (o = cLISTOPx(expr)->op_first; o; o = OpSIBLING(o)) {
6550 if (o->op_type == OP_CONST)
6551 new_patternp[n++] = cSVOPo_sv;
6556 DEBUG_PARSE_r(PerlIO_printf(Perl_debug_log,
6557 "Assembling pattern from %d elements%s\n", pat_count,
6558 orig_rx_flags & RXf_SPLIT ? " for split" : ""));
6560 /* set expr to the first arg op */
6562 if (pRExC_state->num_code_blocks
6563 && expr->op_type != OP_CONST)
6565 expr = cLISTOPx(expr)->op_first;
6566 assert( expr->op_type == OP_PUSHMARK
6567 || (expr->op_type == OP_NULL && expr->op_targ == OP_PUSHMARK)
6568 || expr->op_type == OP_PADRANGE);
6569 expr = OpSIBLING(expr);
6572 pat = S_concat_pat(aTHX_ pRExC_state, NULL, new_patternp, pat_count,
6573 expr, &recompile, NULL);
6575 /* handle bare (possibly after overloading) regex: foo =~ $re */
6580 if (SvTYPE(re) == SVt_REGEXP) {
6584 Safefree(pRExC_state->code_blocks);
6585 DEBUG_PARSE_r(PerlIO_printf(Perl_debug_log,
6586 "Precompiled pattern%s\n",
6587 orig_rx_flags & RXf_SPLIT ? " for split" : ""));
6593 exp = SvPV_nomg(pat, plen);
6595 if (!eng->op_comp) {
6596 if ((SvUTF8(pat) && IN_BYTES)
6597 || SvGMAGICAL(pat) || SvAMAGIC(pat))
6599 /* make a temporary copy; either to convert to bytes,
6600 * or to avoid repeating get-magic / overloaded stringify */
6601 pat = newSVpvn_flags(exp, plen, SVs_TEMP |
6602 (IN_BYTES ? 0 : SvUTF8(pat)));
6604 Safefree(pRExC_state->code_blocks);
6605 return CALLREGCOMP_ENG(eng, pat, orig_rx_flags);
6608 /* ignore the utf8ness if the pattern is 0 length */
6609 RExC_utf8 = RExC_orig_utf8 = (plen == 0 || IN_BYTES) ? 0 : SvUTF8(pat);
6611 RExC_uni_semantics = 0;
6612 RExC_seen_unfolded_sharp_s = 0;
6613 RExC_contains_locale = 0;
6614 RExC_contains_i = 0;
6615 RExC_strict = cBOOL(pm_flags & RXf_PMf_STRICT);
6616 pRExC_state->runtime_code_qr = NULL;
6617 RExC_frame_head= NULL;
6618 RExC_frame_last= NULL;
6619 RExC_frame_count= 0;
6622 RExC_mysv1= sv_newmortal();
6623 RExC_mysv2= sv_newmortal();
6626 SV *dsv= sv_newmortal();
6627 RE_PV_QUOTED_DECL(s, RExC_utf8, dsv, exp, plen, 60);
6628 PerlIO_printf(Perl_debug_log, "%sCompiling REx%s %s\n",
6629 PL_colors[4],PL_colors[5],s);
6633 /* we jump here if we have to recompile, e.g., from upgrading the pattern
6636 if ((pm_flags & PMf_USE_RE_EVAL)
6637 /* this second condition covers the non-regex literal case,
6638 * i.e. $foo =~ '(?{})'. */
6639 || (IN_PERL_COMPILETIME && (PL_hints & HINT_RE_EVAL))
6641 runtime_code = S_has_runtime_code(aTHX_ pRExC_state, exp, plen);
6643 /* return old regex if pattern hasn't changed */
6644 /* XXX: note in the below we have to check the flags as well as the
6647 * Things get a touch tricky as we have to compare the utf8 flag
6648 * independently from the compile flags. */
6652 && !!RX_UTF8(old_re) == !!RExC_utf8
6653 && ( RX_COMPFLAGS(old_re) == ( orig_rx_flags & RXf_PMf_FLAGCOPYMASK ) )
6654 && RX_PRECOMP(old_re)
6655 && RX_PRELEN(old_re) == plen
6656 && memEQ(RX_PRECOMP(old_re), exp, plen)
6657 && !runtime_code /* with runtime code, always recompile */ )
6659 Safefree(pRExC_state->code_blocks);
6663 rx_flags = orig_rx_flags;
6665 if (rx_flags & PMf_FOLD) {
6666 RExC_contains_i = 1;
6668 if ( initial_charset == REGEX_DEPENDS_CHARSET
6669 && (RExC_utf8 ||RExC_uni_semantics))
6672 /* Set to use unicode semantics if the pattern is in utf8 and has the
6673 * 'depends' charset specified, as it means unicode when utf8 */
6674 set_regex_charset(&rx_flags, REGEX_UNICODE_CHARSET);
6678 RExC_flags = rx_flags;
6679 RExC_pm_flags = pm_flags;
6682 assert(TAINTING_get || !TAINT_get);
6684 Perl_croak(aTHX_ "Eval-group in insecure regular expression");
6686 if (!S_compile_runtime_code(aTHX_ pRExC_state, exp, plen)) {
6687 /* whoops, we have a non-utf8 pattern, whilst run-time code
6688 * got compiled as utf8. Try again with a utf8 pattern */
6689 S_pat_upgrade_to_utf8(aTHX_ pRExC_state, &exp, &plen,
6690 pRExC_state->num_code_blocks);
6691 goto redo_first_pass;
6694 assert(!pRExC_state->runtime_code_qr);
6700 RExC_in_lookbehind = 0;
6701 RExC_seen_zerolen = *exp == '^' ? -1 : 0;
6703 RExC_override_recoding = 0;
6705 RExC_recode_x_to_native = 0;
6707 RExC_in_multi_char_class = 0;
6709 /* First pass: determine size, legality. */
6712 RExC_end = exp + plen;
6717 RExC_emit = (regnode *) &RExC_emit_dummy;
6718 RExC_whilem_seen = 0;
6719 RExC_open_parens = NULL;
6720 RExC_close_parens = NULL;
6722 RExC_paren_names = NULL;
6724 RExC_paren_name_list = NULL;
6726 RExC_recurse = NULL;
6727 RExC_study_chunk_recursed = NULL;
6728 RExC_study_chunk_recursed_bytes= 0;
6729 RExC_recurse_count = 0;
6730 pRExC_state->code_index = 0;
6733 PerlIO_printf(Perl_debug_log, "Starting first pass (sizing)\n");
6735 RExC_lastparse=NULL;
6737 /* reg may croak on us, not giving us a chance to free
6738 pRExC_state->code_blocks. We cannot SAVEFREEPV it now, as we may
6739 need it to survive as long as the regexp (qr/(?{})/).
6740 We must check that code_blocksv is not already set, because we may
6741 have jumped back to restart the sizing pass. */
6742 if (pRExC_state->code_blocks && !code_blocksv) {
6743 code_blocksv = newSV_type(SVt_PV);
6744 SAVEFREESV(code_blocksv);
6745 SvPV_set(code_blocksv, (char *)pRExC_state->code_blocks);
6746 SvLEN_set(code_blocksv, 1); /*sufficient to make sv_clear free it*/
6748 if (reg(pRExC_state, 0, &flags,1) == NULL) {
6749 /* It's possible to write a regexp in ascii that represents Unicode
6750 codepoints outside of the byte range, such as via \x{100}. If we
6751 detect such a sequence we have to convert the entire pattern to utf8
6752 and then recompile, as our sizing calculation will have been based
6753 on 1 byte == 1 character, but we will need to use utf8 to encode
6754 at least some part of the pattern, and therefore must convert the whole
6757 if (flags & RESTART_PASS1) {
6758 if (flags & NEED_UTF8) {
6759 S_pat_upgrade_to_utf8(aTHX_ pRExC_state, &exp, &plen,
6760 pRExC_state->num_code_blocks);
6763 DEBUG_PARSE_r(PerlIO_printf(Perl_debug_log,
6764 "Need to redo pass 1\n"));
6767 goto redo_first_pass;
6769 Perl_croak(aTHX_ "panic: reg returned NULL to re_op_compile for sizing pass, flags=%#"UVxf"", (UV) flags);
6772 SvLEN_set(code_blocksv,0); /* no you can't have it, sv_clear */
6775 PerlIO_printf(Perl_debug_log,
6776 "Required size %"IVdf" nodes\n"
6777 "Starting second pass (creation)\n",
6780 RExC_lastparse=NULL;
6783 /* The first pass could have found things that force Unicode semantics */
6784 if ((RExC_utf8 || RExC_uni_semantics)
6785 && get_regex_charset(rx_flags) == REGEX_DEPENDS_CHARSET)
6787 set_regex_charset(&rx_flags, REGEX_UNICODE_CHARSET);
6790 /* Small enough for pointer-storage convention?
6791 If extralen==0, this means that we will not need long jumps. */
6792 if (RExC_size >= 0x10000L && RExC_extralen)
6793 RExC_size += RExC_extralen;
6796 if (RExC_whilem_seen > 15)
6797 RExC_whilem_seen = 15;
6799 /* Allocate space and zero-initialize. Note, the two step process
6800 of zeroing when in debug mode, thus anything assigned has to
6801 happen after that */
6802 rx = (REGEXP*) newSV_type(SVt_REGEXP);
6804 Newxc(ri, sizeof(regexp_internal) + (unsigned)RExC_size * sizeof(regnode),
6805 char, regexp_internal);
6806 if ( r == NULL || ri == NULL )
6807 FAIL("Regexp out of space");
6809 /* avoid reading uninitialized memory in DEBUGGING code in study_chunk() */
6810 Zero(ri, sizeof(regexp_internal) + (unsigned)RExC_size * sizeof(regnode),
6813 /* bulk initialize base fields with 0. */
6814 Zero(ri, sizeof(regexp_internal), char);
6817 /* non-zero initialization begins here */
6820 r->extflags = rx_flags;
6821 RXp_COMPFLAGS(r) = orig_rx_flags & RXf_PMf_FLAGCOPYMASK;
6823 if (pm_flags & PMf_IS_QR) {
6824 ri->code_blocks = pRExC_state->code_blocks;
6825 ri->num_code_blocks = pRExC_state->num_code_blocks;
6830 for (n = 0; n < pRExC_state->num_code_blocks; n++)
6831 if (pRExC_state->code_blocks[n].src_regex)
6832 SAVEFREESV(pRExC_state->code_blocks[n].src_regex);
6833 if(pRExC_state->code_blocks)
6834 SAVEFREEPV(pRExC_state->code_blocks); /* often null */
6838 bool has_p = ((r->extflags & RXf_PMf_KEEPCOPY) == RXf_PMf_KEEPCOPY);
6839 bool has_charset = (get_regex_charset(r->extflags)
6840 != REGEX_DEPENDS_CHARSET);
6842 /* The caret is output if there are any defaults: if not all the STD
6843 * flags are set, or if no character set specifier is needed */
6845 (((r->extflags & RXf_PMf_STD_PMMOD) != RXf_PMf_STD_PMMOD)
6847 bool has_runon = ((RExC_seen & REG_RUN_ON_COMMENT_SEEN)
6848 == REG_RUN_ON_COMMENT_SEEN);
6849 U8 reganch = (U8)((r->extflags & RXf_PMf_STD_PMMOD)
6850 >> RXf_PMf_STD_PMMOD_SHIFT);
6851 const char *fptr = STD_PAT_MODS; /*"msixn"*/
6854 /* We output all the necessary flags; we never output a minus, as all
6855 * those are defaults, so are
6856 * covered by the caret */
6857 const STRLEN wraplen = plen + has_p + has_runon
6858 + has_default /* If needs a caret */
6859 + PL_bitcount[reganch] /* 1 char for each set standard flag */
6861 /* If needs a character set specifier */
6862 + ((has_charset) ? MAX_CHARSET_NAME_LENGTH : 0)
6863 + (sizeof("(?:)") - 1);
6865 /* make sure PL_bitcount bounds not exceeded */
6866 assert(sizeof(STD_PAT_MODS) <= 8);
6868 Newx(p, wraplen + 1, char); /* +1 for the ending NUL */
6869 r->xpv_len_u.xpvlenu_pv = p;
6871 SvFLAGS(rx) |= SVf_UTF8;
6874 /* If a default, cover it using the caret */
6876 *p++= DEFAULT_PAT_MOD;
6880 const char* const name = get_regex_charset_name(r->extflags, &len);
6881 Copy(name, p, len, char);
6885 *p++ = KEEPCOPY_PAT_MOD; /*'p'*/
6888 while((ch = *fptr++)) {
6896 Copy(RExC_precomp, p, plen, char);
6897 assert ((RX_WRAPPED(rx) - p) < 16);
6898 r->pre_prefix = p - RX_WRAPPED(rx);
6904 SvCUR_set(rx, p - RX_WRAPPED(rx));
6908 r->nparens = RExC_npar - 1; /* set early to validate backrefs */
6910 /* setup various meta data about recursion, this all requires
6911 * RExC_npar to be correctly set, and a bit later on we clear it */
6912 if (RExC_seen & REG_RECURSE_SEEN) {
6913 Newxz(RExC_open_parens, RExC_npar,regnode *);
6914 SAVEFREEPV(RExC_open_parens);
6915 Newxz(RExC_close_parens,RExC_npar,regnode *);
6916 SAVEFREEPV(RExC_close_parens);
6918 if (RExC_seen & (REG_RECURSE_SEEN | REG_GOSTART_SEEN)) {
6919 /* Note, RExC_npar is 1 + the number of parens in a pattern.
6920 * So its 1 if there are no parens. */
6921 RExC_study_chunk_recursed_bytes= (RExC_npar >> 3) +
6922 ((RExC_npar & 0x07) != 0);
6923 Newx(RExC_study_chunk_recursed,
6924 RExC_study_chunk_recursed_bytes * RExC_npar, U8);
6925 SAVEFREEPV(RExC_study_chunk_recursed);
6928 /* Useful during FAIL. */
6929 #ifdef RE_TRACK_PATTERN_OFFSETS
6930 Newxz(ri->u.offsets, 2*RExC_size+1, U32); /* MJD 20001228 */
6931 DEBUG_OFFSETS_r(PerlIO_printf(Perl_debug_log,
6932 "%s %"UVuf" bytes for offset annotations.\n",
6933 ri->u.offsets ? "Got" : "Couldn't get",
6934 (UV)((2*RExC_size+1) * sizeof(U32))));
6936 SetProgLen(ri,RExC_size);
6941 /* Second pass: emit code. */
6942 RExC_flags = rx_flags; /* don't let top level (?i) bleed */
6943 RExC_pm_flags = pm_flags;
6945 RExC_end = exp + plen;
6948 RExC_emit_start = ri->program;
6949 RExC_emit = ri->program;
6950 RExC_emit_bound = ri->program + RExC_size + 1;
6951 pRExC_state->code_index = 0;
6953 *((char*) RExC_emit++) = (char) REG_MAGIC;
6954 if (reg(pRExC_state, 0, &flags,1) == NULL) {
6956 Perl_croak(aTHX_ "panic: reg returned NULL to re_op_compile for generation pass, flags=%#"UVxf"", (UV) flags);
6958 /* XXXX To minimize changes to RE engine we always allocate
6959 3-units-long substrs field. */
6960 Newx(r->substrs, 1, struct reg_substr_data);
6961 if (RExC_recurse_count) {
6962 Newxz(RExC_recurse,RExC_recurse_count,regnode *);
6963 SAVEFREEPV(RExC_recurse);
6967 r->minlen = minlen = sawlookahead = sawplus = sawopen = sawminmod = 0;
6969 RExC_study_chunk_recursed_count= 0;
6971 Zero(r->substrs, 1, struct reg_substr_data);
6972 if (RExC_study_chunk_recursed) {
6973 Zero(RExC_study_chunk_recursed,
6974 RExC_study_chunk_recursed_bytes * RExC_npar, U8);
6978 #ifdef TRIE_STUDY_OPT
6980 StructCopy(&zero_scan_data, &data, scan_data_t);
6981 copyRExC_state = RExC_state;
6984 DEBUG_OPTIMISE_r(PerlIO_printf(Perl_debug_log,"Restudying\n"));
6986 RExC_state = copyRExC_state;
6987 if (seen & REG_TOP_LEVEL_BRANCHES_SEEN)
6988 RExC_seen |= REG_TOP_LEVEL_BRANCHES_SEEN;
6990 RExC_seen &= ~REG_TOP_LEVEL_BRANCHES_SEEN;
6991 StructCopy(&zero_scan_data, &data, scan_data_t);
6994 StructCopy(&zero_scan_data, &data, scan_data_t);
6997 /* Dig out information for optimizations. */
6998 r->extflags = RExC_flags; /* was pm_op */
6999 /*dmq: removed as part of de-PMOP: pm->op_pmflags = RExC_flags; */
7002 SvUTF8_on(rx); /* Unicode in it? */
7003 ri->regstclass = NULL;
7004 if (RExC_naughty >= TOO_NAUGHTY) /* Probably an expensive pattern. */
7005 r->intflags |= PREGf_NAUGHTY;
7006 scan = ri->program + 1; /* First BRANCH. */
7008 /* testing for BRANCH here tells us whether there is "must appear"
7009 data in the pattern. If there is then we can use it for optimisations */
7010 if (!(RExC_seen & REG_TOP_LEVEL_BRANCHES_SEEN)) { /* Only one top-level choice.
7013 STRLEN longest_float_length, longest_fixed_length;
7014 regnode_ssc ch_class; /* pointed to by data */
7016 SSize_t last_close = 0; /* pointed to by data */
7017 regnode *first= scan;
7018 regnode *first_next= regnext(first);
7020 * Skip introductions and multiplicators >= 1
7021 * so that we can extract the 'meat' of the pattern that must
7022 * match in the large if() sequence following.
7023 * NOTE that EXACT is NOT covered here, as it is normally
7024 * picked up by the optimiser separately.
7026 * This is unfortunate as the optimiser isnt handling lookahead
7027 * properly currently.
7030 while ((OP(first) == OPEN && (sawopen = 1)) ||
7031 /* An OR of *one* alternative - should not happen now. */
7032 (OP(first) == BRANCH && OP(first_next) != BRANCH) ||
7033 /* for now we can't handle lookbehind IFMATCH*/
7034 (OP(first) == IFMATCH && !first->flags && (sawlookahead = 1)) ||
7035 (OP(first) == PLUS) ||
7036 (OP(first) == MINMOD) ||
7037 /* An {n,m} with n>0 */
7038 (PL_regkind[OP(first)] == CURLY && ARG1(first) > 0) ||
7039 (OP(first) == NOTHING && PL_regkind[OP(first_next)] != END ))
7042 * the only op that could be a regnode is PLUS, all the rest
7043 * will be regnode_1 or regnode_2.
7045 * (yves doesn't think this is true)
7047 if (OP(first) == PLUS)
7050 if (OP(first) == MINMOD)
7052 first += regarglen[OP(first)];
7054 first = NEXTOPER(first);
7055 first_next= regnext(first);
7058 /* Starting-point info. */
7060 DEBUG_PEEP("first:",first,0);
7061 /* Ignore EXACT as we deal with it later. */
7062 if (PL_regkind[OP(first)] == EXACT) {
7063 if (OP(first) == EXACT || OP(first) == EXACTL)
7064 NOOP; /* Empty, get anchored substr later. */
7066 ri->regstclass = first;
7069 else if (PL_regkind[OP(first)] == TRIE &&
7070 ((reg_trie_data *)ri->data->data[ ARG(first) ])->minlen>0)
7072 /* this can happen only on restudy */
7073 ri->regstclass = construct_ahocorasick_from_trie(pRExC_state, (regnode *)first, 0);
7076 else if (REGNODE_SIMPLE(OP(first)))
7077 ri->regstclass = first;
7078 else if (PL_regkind[OP(first)] == BOUND ||
7079 PL_regkind[OP(first)] == NBOUND)
7080 ri->regstclass = first;
7081 else if (PL_regkind[OP(first)] == BOL) {
7082 r->intflags |= (OP(first) == MBOL
7085 first = NEXTOPER(first);
7088 else if (OP(first) == GPOS) {
7089 r->intflags |= PREGf_ANCH_GPOS;
7090 first = NEXTOPER(first);
7093 else if ((!sawopen || !RExC_sawback) &&
7095 (OP(first) == STAR &&
7096 PL_regkind[OP(NEXTOPER(first))] == REG_ANY) &&
7097 !(r->intflags & PREGf_ANCH) && !pRExC_state->num_code_blocks)
7099 /* turn .* into ^.* with an implied $*=1 */
7101 (OP(NEXTOPER(first)) == REG_ANY)
7104 r->intflags |= (type | PREGf_IMPLICIT);
7105 first = NEXTOPER(first);
7108 if (sawplus && !sawminmod && !sawlookahead
7109 && (!sawopen || !RExC_sawback)
7110 && !pRExC_state->num_code_blocks) /* May examine pos and $& */
7111 /* x+ must match at the 1st pos of run of x's */
7112 r->intflags |= PREGf_SKIP;
7114 /* Scan is after the zeroth branch, first is atomic matcher. */
7115 #ifdef TRIE_STUDY_OPT
7118 PerlIO_printf(Perl_debug_log, "first at %"IVdf"\n",
7119 (IV)(first - scan + 1))
7123 PerlIO_printf(Perl_debug_log, "first at %"IVdf"\n",
7124 (IV)(first - scan + 1))
7130 * If there's something expensive in the r.e., find the
7131 * longest literal string that must appear and make it the
7132 * regmust. Resolve ties in favor of later strings, since
7133 * the regstart check works with the beginning of the r.e.
7134 * and avoiding duplication strengthens checking. Not a
7135 * strong reason, but sufficient in the absence of others.
7136 * [Now we resolve ties in favor of the earlier string if
7137 * it happens that c_offset_min has been invalidated, since the
7138 * earlier string may buy us something the later one won't.]
7141 data.longest_fixed = newSVpvs("");
7142 data.longest_float = newSVpvs("");
7143 data.last_found = newSVpvs("");
7144 data.longest = &(data.longest_fixed);
7145 ENTER_with_name("study_chunk");
7146 SAVEFREESV(data.longest_fixed);
7147 SAVEFREESV(data.longest_float);
7148 SAVEFREESV(data.last_found);
7150 if (!ri->regstclass) {
7151 ssc_init(pRExC_state, &ch_class);
7152 data.start_class = &ch_class;
7153 stclass_flag = SCF_DO_STCLASS_AND;
7154 } else /* XXXX Check for BOUND? */
7156 data.last_closep = &last_close;
7159 minlen = study_chunk(pRExC_state, &first, &minlen, &fake,
7160 scan + RExC_size, /* Up to end */
7162 SCF_DO_SUBSTR | SCF_WHILEM_VISITED_POS | stclass_flag
7163 | (restudied ? SCF_TRIE_DOING_RESTUDY : 0),
7167 CHECK_RESTUDY_GOTO_butfirst(LEAVE_with_name("study_chunk"));
7170 if ( RExC_npar == 1 && data.longest == &(data.longest_fixed)
7171 && data.last_start_min == 0 && data.last_end > 0
7172 && !RExC_seen_zerolen
7173 && !(RExC_seen & REG_VERBARG_SEEN)
7174 && !(RExC_seen & REG_GPOS_SEEN)
7176 r->extflags |= RXf_CHECK_ALL;
7178 scan_commit(pRExC_state, &data,&minlen,0);
7180 longest_float_length = CHR_SVLEN(data.longest_float);
7182 if (! ((SvCUR(data.longest_fixed) /* ok to leave SvCUR */
7183 && data.offset_fixed == data.offset_float_min
7184 && SvCUR(data.longest_fixed) == SvCUR(data.longest_float)))
7185 && S_setup_longest (aTHX_ pRExC_state,
7189 &(r->float_end_shift),
7190 data.lookbehind_float,
7191 data.offset_float_min,
7193 longest_float_length,
7194 cBOOL(data.flags & SF_FL_BEFORE_EOL),
7195 cBOOL(data.flags & SF_FL_BEFORE_MEOL)))
7197 r->float_min_offset = data.offset_float_min - data.lookbehind_float;
7198 r->float_max_offset = data.offset_float_max;
7199 if (data.offset_float_max < SSize_t_MAX) /* Don't offset infinity */
7200 r->float_max_offset -= data.lookbehind_float;
7201 SvREFCNT_inc_simple_void_NN(data.longest_float);
7204 r->float_substr = r->float_utf8 = NULL;
7205 longest_float_length = 0;
7208 longest_fixed_length = CHR_SVLEN(data.longest_fixed);
7210 if (S_setup_longest (aTHX_ pRExC_state,
7212 &(r->anchored_utf8),
7213 &(r->anchored_substr),
7214 &(r->anchored_end_shift),
7215 data.lookbehind_fixed,
7218 longest_fixed_length,
7219 cBOOL(data.flags & SF_FIX_BEFORE_EOL),
7220 cBOOL(data.flags & SF_FIX_BEFORE_MEOL)))
7222 r->anchored_offset = data.offset_fixed - data.lookbehind_fixed;
7223 SvREFCNT_inc_simple_void_NN(data.longest_fixed);
7226 r->anchored_substr = r->anchored_utf8 = NULL;
7227 longest_fixed_length = 0;
7229 LEAVE_with_name("study_chunk");
7232 && (OP(ri->regstclass) == REG_ANY || OP(ri->regstclass) == SANY))
7233 ri->regstclass = NULL;
7235 if ((!(r->anchored_substr || r->anchored_utf8) || r->anchored_offset)
7237 && ! (ANYOF_FLAGS(data.start_class) & SSC_MATCHES_EMPTY_STRING)
7238 && is_ssc_worth_it(pRExC_state, data.start_class))
7240 const U32 n = add_data(pRExC_state, STR_WITH_LEN("f"));
7242 ssc_finalize(pRExC_state, data.start_class);
7244 Newx(RExC_rxi->data->data[n], 1, regnode_ssc);
7245 StructCopy(data.start_class,
7246 (regnode_ssc*)RExC_rxi->data->data[n],
7248 ri->regstclass = (regnode*)RExC_rxi->data->data[n];
7249 r->intflags &= ~PREGf_SKIP; /* Used in find_byclass(). */
7250 DEBUG_COMPILE_r({ SV *sv = sv_newmortal();
7251 regprop(r, sv, (regnode*)data.start_class, NULL, pRExC_state);
7252 PerlIO_printf(Perl_debug_log,
7253 "synthetic stclass \"%s\".\n",
7254 SvPVX_const(sv));});
7255 data.start_class = NULL;
7258 /* A temporary algorithm prefers floated substr to fixed one to dig
7260 if (longest_fixed_length > longest_float_length) {
7261 r->substrs->check_ix = 0;
7262 r->check_end_shift = r->anchored_end_shift;
7263 r->check_substr = r->anchored_substr;
7264 r->check_utf8 = r->anchored_utf8;
7265 r->check_offset_min = r->check_offset_max = r->anchored_offset;
7266 if (r->intflags & (PREGf_ANCH_SBOL|PREGf_ANCH_GPOS))
7267 r->intflags |= PREGf_NOSCAN;
7270 r->substrs->check_ix = 1;
7271 r->check_end_shift = r->float_end_shift;
7272 r->check_substr = r->float_substr;
7273 r->check_utf8 = r->float_utf8;
7274 r->check_offset_min = r->float_min_offset;
7275 r->check_offset_max = r->float_max_offset;
7277 if ((r->check_substr || r->check_utf8) ) {
7278 r->extflags |= RXf_USE_INTUIT;
7279 if (SvTAIL(r->check_substr ? r->check_substr : r->check_utf8))
7280 r->extflags |= RXf_INTUIT_TAIL;
7282 r->substrs->data[0].max_offset = r->substrs->data[0].min_offset;
7284 /* XXX Unneeded? dmq (shouldn't as this is handled elsewhere)
7285 if ( (STRLEN)minlen < longest_float_length )
7286 minlen= longest_float_length;
7287 if ( (STRLEN)minlen < longest_fixed_length )
7288 minlen= longest_fixed_length;
7292 /* Several toplevels. Best we can is to set minlen. */
7294 regnode_ssc ch_class;
7295 SSize_t last_close = 0;
7297 DEBUG_PARSE_r(PerlIO_printf(Perl_debug_log, "\nMulti Top Level\n"));
7299 scan = ri->program + 1;
7300 ssc_init(pRExC_state, &ch_class);
7301 data.start_class = &ch_class;
7302 data.last_closep = &last_close;
7305 minlen = study_chunk(pRExC_state,
7306 &scan, &minlen, &fake, scan + RExC_size, &data, -1, 0, NULL,
7307 SCF_DO_STCLASS_AND|SCF_WHILEM_VISITED_POS|(restudied
7308 ? SCF_TRIE_DOING_RESTUDY
7312 CHECK_RESTUDY_GOTO_butfirst(NOOP);
7314 r->check_substr = r->check_utf8 = r->anchored_substr = r->anchored_utf8
7315 = r->float_substr = r->float_utf8 = NULL;
7317 if (! (ANYOF_FLAGS(data.start_class) & SSC_MATCHES_EMPTY_STRING)
7318 && is_ssc_worth_it(pRExC_state, data.start_class))
7320 const U32 n = add_data(pRExC_state, STR_WITH_LEN("f"));
7322 ssc_finalize(pRExC_state, data.start_class);
7324 Newx(RExC_rxi->data->data[n], 1, regnode_ssc);
7325 StructCopy(data.start_class,
7326 (regnode_ssc*)RExC_rxi->data->data[n],
7328 ri->regstclass = (regnode*)RExC_rxi->data->data[n];
7329 r->intflags &= ~PREGf_SKIP; /* Used in find_byclass(). */
7330 DEBUG_COMPILE_r({ SV* sv = sv_newmortal();
7331 regprop(r, sv, (regnode*)data.start_class, NULL, pRExC_state);
7332 PerlIO_printf(Perl_debug_log,
7333 "synthetic stclass \"%s\".\n",
7334 SvPVX_const(sv));});
7335 data.start_class = NULL;
7339 if (RExC_seen & REG_UNBOUNDED_QUANTIFIER_SEEN) {
7340 r->extflags |= RXf_UNBOUNDED_QUANTIFIER_SEEN;
7341 r->maxlen = REG_INFTY;
7344 r->maxlen = RExC_maxlen;
7347 /* Guard against an embedded (?=) or (?<=) with a longer minlen than
7348 the "real" pattern. */
7350 PerlIO_printf(Perl_debug_log,"minlen: %"IVdf" r->minlen:%"IVdf" maxlen:%"IVdf"\n",
7351 (IV)minlen, (IV)r->minlen, (IV)RExC_maxlen);
7353 r->minlenret = minlen;
7354 if (r->minlen < minlen)
7357 if (RExC_seen & REG_GPOS_SEEN)
7358 r->intflags |= PREGf_GPOS_SEEN;
7359 if (RExC_seen & REG_LOOKBEHIND_SEEN)
7360 r->extflags |= RXf_NO_INPLACE_SUBST; /* inplace might break the
7362 if (pRExC_state->num_code_blocks)
7363 r->extflags |= RXf_EVAL_SEEN;
7364 if (RExC_seen & REG_VERBARG_SEEN)
7366 r->intflags |= PREGf_VERBARG_SEEN;
7367 r->extflags |= RXf_NO_INPLACE_SUBST; /* don't understand this! Yves */
7369 if (RExC_seen & REG_CUTGROUP_SEEN)
7370 r->intflags |= PREGf_CUTGROUP_SEEN;
7371 if (pm_flags & PMf_USE_RE_EVAL)
7372 r->intflags |= PREGf_USE_RE_EVAL;
7373 if (RExC_paren_names)
7374 RXp_PAREN_NAMES(r) = MUTABLE_HV(SvREFCNT_inc(RExC_paren_names));
7376 RXp_PAREN_NAMES(r) = NULL;
7378 /* If we have seen an anchor in our pattern then we set the extflag RXf_IS_ANCHORED
7379 * so it can be used in pp.c */
7380 if (r->intflags & PREGf_ANCH)
7381 r->extflags |= RXf_IS_ANCHORED;
7385 /* this is used to identify "special" patterns that might result
7386 * in Perl NOT calling the regex engine and instead doing the match "itself",
7387 * particularly special cases in split//. By having the regex compiler
7388 * do this pattern matching at a regop level (instead of by inspecting the pattern)
7389 * we avoid weird issues with equivalent patterns resulting in different behavior,
7390 * AND we allow non Perl engines to get the same optimizations by the setting the
7391 * flags appropriately - Yves */
7392 regnode *first = ri->program + 1;
7394 regnode *next = regnext(first);
7397 if (PL_regkind[fop] == NOTHING && nop == END)
7398 r->extflags |= RXf_NULL;
7399 else if ((fop == MBOL || (fop == SBOL && !first->flags)) && nop == END)
7400 /* when fop is SBOL first->flags will be true only when it was
7401 * produced by parsing /\A/, and not when parsing /^/. This is
7402 * very important for the split code as there we want to
7403 * treat /^/ as /^/m, but we do not want to treat /\A/ as /^/m.
7404 * See rt #122761 for more details. -- Yves */
7405 r->extflags |= RXf_START_ONLY;
7406 else if (fop == PLUS
7407 && PL_regkind[nop] == POSIXD && FLAGS(next) == _CC_SPACE
7409 r->extflags |= RXf_WHITE;
7410 else if ( r->extflags & RXf_SPLIT
7411 && (fop == EXACT || fop == EXACTL)
7412 && STR_LEN(first) == 1
7413 && *(STRING(first)) == ' '
7415 r->extflags |= (RXf_SKIPWHITE|RXf_WHITE);
7419 if (RExC_contains_locale) {
7420 RXp_EXTFLAGS(r) |= RXf_TAINTED;
7424 if (RExC_paren_names) {
7425 ri->name_list_idx = add_data( pRExC_state, STR_WITH_LEN("a"));
7426 ri->data->data[ri->name_list_idx]
7427 = (void*)SvREFCNT_inc(RExC_paren_name_list);
7430 ri->name_list_idx = 0;
7432 if (RExC_recurse_count) {
7433 for ( ; RExC_recurse_count ; RExC_recurse_count-- ) {
7434 const regnode *scan = RExC_recurse[RExC_recurse_count-1];
7435 ARG2L_SET( scan, RExC_open_parens[ARG(scan)-1] - scan );
7438 Newxz(r->offs, RExC_npar, regexp_paren_pair);
7439 /* assume we don't need to swap parens around before we match */
7441 PerlIO_printf(Perl_debug_log,"study_chunk_recursed_count: %lu\n",
7442 (unsigned long)RExC_study_chunk_recursed_count);
7446 PerlIO_printf(Perl_debug_log,"Final program:\n");
7449 #ifdef RE_TRACK_PATTERN_OFFSETS
7450 DEBUG_OFFSETS_r(if (ri->u.offsets) {
7451 const STRLEN len = ri->u.offsets[0];
7453 GET_RE_DEBUG_FLAGS_DECL;
7454 PerlIO_printf(Perl_debug_log,
7455 "Offsets: [%"UVuf"]\n\t", (UV)ri->u.offsets[0]);
7456 for (i = 1; i <= len; i++) {
7457 if (ri->u.offsets[i*2-1] || ri->u.offsets[i*2])
7458 PerlIO_printf(Perl_debug_log, "%"UVuf":%"UVuf"[%"UVuf"] ",
7459 (UV)i, (UV)ri->u.offsets[i*2-1], (UV)ri->u.offsets[i*2]);
7461 PerlIO_printf(Perl_debug_log, "\n");
7466 /* under ithreads the ?pat? PMf_USED flag on the pmop is simulated
7467 * by setting the regexp SV to readonly-only instead. If the
7468 * pattern's been recompiled, the USEDness should remain. */
7469 if (old_re && SvREADONLY(old_re))
7477 Perl_reg_named_buff(pTHX_ REGEXP * const rx, SV * const key, SV * const value,
7480 PERL_ARGS_ASSERT_REG_NAMED_BUFF;
7482 PERL_UNUSED_ARG(value);
7484 if (flags & RXapif_FETCH) {
7485 return reg_named_buff_fetch(rx, key, flags);
7486 } else if (flags & (RXapif_STORE | RXapif_DELETE | RXapif_CLEAR)) {
7487 Perl_croak_no_modify();
7489 } else if (flags & RXapif_EXISTS) {
7490 return reg_named_buff_exists(rx, key, flags)
7493 } else if (flags & RXapif_REGNAMES) {
7494 return reg_named_buff_all(rx, flags);
7495 } else if (flags & (RXapif_SCALAR | RXapif_REGNAMES_COUNT)) {
7496 return reg_named_buff_scalar(rx, flags);
7498 Perl_croak(aTHX_ "panic: Unknown flags %d in named_buff", (int)flags);
7504 Perl_reg_named_buff_iter(pTHX_ REGEXP * const rx, const SV * const lastkey,
7507 PERL_ARGS_ASSERT_REG_NAMED_BUFF_ITER;
7508 PERL_UNUSED_ARG(lastkey);
7510 if (flags & RXapif_FIRSTKEY)
7511 return reg_named_buff_firstkey(rx, flags);
7512 else if (flags & RXapif_NEXTKEY)
7513 return reg_named_buff_nextkey(rx, flags);
7515 Perl_croak(aTHX_ "panic: Unknown flags %d in named_buff_iter",
7522 Perl_reg_named_buff_fetch(pTHX_ REGEXP * const r, SV * const namesv,
7525 AV *retarray = NULL;
7527 struct regexp *const rx = ReANY(r);
7529 PERL_ARGS_ASSERT_REG_NAMED_BUFF_FETCH;
7531 if (flags & RXapif_ALL)
7534 if (rx && RXp_PAREN_NAMES(rx)) {
7535 HE *he_str = hv_fetch_ent( RXp_PAREN_NAMES(rx), namesv, 0, 0 );
7538 SV* sv_dat=HeVAL(he_str);
7539 I32 *nums=(I32*)SvPVX(sv_dat);
7540 for ( i=0; i<SvIVX(sv_dat); i++ ) {
7541 if ((I32)(rx->nparens) >= nums[i]
7542 && rx->offs[nums[i]].start != -1
7543 && rx->offs[nums[i]].end != -1)
7546 CALLREG_NUMBUF_FETCH(r,nums[i],ret);
7551 ret = newSVsv(&PL_sv_undef);
7554 av_push(retarray, ret);
7557 return newRV_noinc(MUTABLE_SV(retarray));
7564 Perl_reg_named_buff_exists(pTHX_ REGEXP * const r, SV * const key,
7567 struct regexp *const rx = ReANY(r);
7569 PERL_ARGS_ASSERT_REG_NAMED_BUFF_EXISTS;
7571 if (rx && RXp_PAREN_NAMES(rx)) {
7572 if (flags & RXapif_ALL) {
7573 return hv_exists_ent(RXp_PAREN_NAMES(rx), key, 0);
7575 SV *sv = CALLREG_NAMED_BUFF_FETCH(r, key, flags);
7577 SvREFCNT_dec_NN(sv);
7589 Perl_reg_named_buff_firstkey(pTHX_ REGEXP * const r, const U32 flags)
7591 struct regexp *const rx = ReANY(r);
7593 PERL_ARGS_ASSERT_REG_NAMED_BUFF_FIRSTKEY;
7595 if ( rx && RXp_PAREN_NAMES(rx) ) {
7596 (void)hv_iterinit(RXp_PAREN_NAMES(rx));
7598 return CALLREG_NAMED_BUFF_NEXTKEY(r, NULL, flags & ~RXapif_FIRSTKEY);
7605 Perl_reg_named_buff_nextkey(pTHX_ REGEXP * const r, const U32 flags)
7607 struct regexp *const rx = ReANY(r);
7608 GET_RE_DEBUG_FLAGS_DECL;
7610 PERL_ARGS_ASSERT_REG_NAMED_BUFF_NEXTKEY;
7612 if (rx && RXp_PAREN_NAMES(rx)) {
7613 HV *hv = RXp_PAREN_NAMES(rx);
7615 while ( (temphe = hv_iternext_flags(hv,0)) ) {
7618 SV* sv_dat = HeVAL(temphe);
7619 I32 *nums = (I32*)SvPVX(sv_dat);
7620 for ( i = 0; i < SvIVX(sv_dat); i++ ) {
7621 if ((I32)(rx->lastparen) >= nums[i] &&
7622 rx->offs[nums[i]].start != -1 &&
7623 rx->offs[nums[i]].end != -1)
7629 if (parno || flags & RXapif_ALL) {
7630 return newSVhek(HeKEY_hek(temphe));
7638 Perl_reg_named_buff_scalar(pTHX_ REGEXP * const r, const U32 flags)
7643 struct regexp *const rx = ReANY(r);
7645 PERL_ARGS_ASSERT_REG_NAMED_BUFF_SCALAR;
7647 if (rx && RXp_PAREN_NAMES(rx)) {
7648 if (flags & (RXapif_ALL | RXapif_REGNAMES_COUNT)) {
7649 return newSViv(HvTOTALKEYS(RXp_PAREN_NAMES(rx)));
7650 } else if (flags & RXapif_ONE) {
7651 ret = CALLREG_NAMED_BUFF_ALL(r, (flags | RXapif_REGNAMES));
7652 av = MUTABLE_AV(SvRV(ret));
7653 length = av_tindex(av);
7654 SvREFCNT_dec_NN(ret);
7655 return newSViv(length + 1);
7657 Perl_croak(aTHX_ "panic: Unknown flags %d in named_buff_scalar",
7662 return &PL_sv_undef;
7666 Perl_reg_named_buff_all(pTHX_ REGEXP * const r, const U32 flags)
7668 struct regexp *const rx = ReANY(r);
7671 PERL_ARGS_ASSERT_REG_NAMED_BUFF_ALL;
7673 if (rx && RXp_PAREN_NAMES(rx)) {
7674 HV *hv= RXp_PAREN_NAMES(rx);
7676 (void)hv_iterinit(hv);
7677 while ( (temphe = hv_iternext_flags(hv,0)) ) {
7680 SV* sv_dat = HeVAL(temphe);
7681 I32 *nums = (I32*)SvPVX(sv_dat);
7682 for ( i = 0; i < SvIVX(sv_dat); i++ ) {
7683 if ((I32)(rx->lastparen) >= nums[i] &&
7684 rx->offs[nums[i]].start != -1 &&
7685 rx->offs[nums[i]].end != -1)
7691 if (parno || flags & RXapif_ALL) {
7692 av_push(av, newSVhek(HeKEY_hek(temphe)));
7697 return newRV_noinc(MUTABLE_SV(av));
7701 Perl_reg_numbered_buff_fetch(pTHX_ REGEXP * const r, const I32 paren,
7704 struct regexp *const rx = ReANY(r);
7710 PERL_ARGS_ASSERT_REG_NUMBERED_BUFF_FETCH;
7712 if ( n == RX_BUFF_IDX_CARET_PREMATCH
7713 || n == RX_BUFF_IDX_CARET_FULLMATCH
7714 || n == RX_BUFF_IDX_CARET_POSTMATCH
7717 bool keepcopy = cBOOL(rx->extflags & RXf_PMf_KEEPCOPY);
7719 /* on something like
7722 * the KEEPCOPY is set on the PMOP rather than the regex */
7723 if (PL_curpm && r == PM_GETRE(PL_curpm))
7724 keepcopy = cBOOL(PL_curpm->op_pmflags & PMf_KEEPCOPY);
7733 if (n == RX_BUFF_IDX_CARET_FULLMATCH)
7734 /* no need to distinguish between them any more */
7735 n = RX_BUFF_IDX_FULLMATCH;
7737 if ((n == RX_BUFF_IDX_PREMATCH || n == RX_BUFF_IDX_CARET_PREMATCH)
7738 && rx->offs[0].start != -1)
7740 /* $`, ${^PREMATCH} */
7741 i = rx->offs[0].start;
7745 if ((n == RX_BUFF_IDX_POSTMATCH || n == RX_BUFF_IDX_CARET_POSTMATCH)
7746 && rx->offs[0].end != -1)
7748 /* $', ${^POSTMATCH} */
7749 s = rx->subbeg - rx->suboffset + rx->offs[0].end;
7750 i = rx->sublen + rx->suboffset - rx->offs[0].end;
7753 if ( 0 <= n && n <= (I32)rx->nparens &&
7754 (s1 = rx->offs[n].start) != -1 &&
7755 (t1 = rx->offs[n].end) != -1)
7757 /* $&, ${^MATCH}, $1 ... */
7759 s = rx->subbeg + s1 - rx->suboffset;
7764 assert(s >= rx->subbeg);
7765 assert((STRLEN)rx->sublen >= (STRLEN)((s - rx->subbeg) + i) );
7767 #ifdef NO_TAINT_SUPPORT
7768 sv_setpvn(sv, s, i);
7770 const int oldtainted = TAINT_get;
7772 sv_setpvn(sv, s, i);
7773 TAINT_set(oldtainted);
7775 if (RXp_MATCH_UTF8(rx))
7780 if (RXp_MATCH_TAINTED(rx)) {
7781 if (SvTYPE(sv) >= SVt_PVMG) {
7782 MAGIC* const mg = SvMAGIC(sv);
7785 SvMAGIC_set(sv, mg->mg_moremagic);
7787 if ((mgt = SvMAGIC(sv))) {
7788 mg->mg_moremagic = mgt;
7789 SvMAGIC_set(sv, mg);
7800 sv_setsv(sv,&PL_sv_undef);
7806 Perl_reg_numbered_buff_store(pTHX_ REGEXP * const rx, const I32 paren,
7807 SV const * const value)
7809 PERL_ARGS_ASSERT_REG_NUMBERED_BUFF_STORE;
7811 PERL_UNUSED_ARG(rx);
7812 PERL_UNUSED_ARG(paren);
7813 PERL_UNUSED_ARG(value);
7816 Perl_croak_no_modify();
7820 Perl_reg_numbered_buff_length(pTHX_ REGEXP * const r, const SV * const sv,
7823 struct regexp *const rx = ReANY(r);
7827 PERL_ARGS_ASSERT_REG_NUMBERED_BUFF_LENGTH;
7829 if ( paren == RX_BUFF_IDX_CARET_PREMATCH
7830 || paren == RX_BUFF_IDX_CARET_FULLMATCH
7831 || paren == RX_BUFF_IDX_CARET_POSTMATCH
7834 bool keepcopy = cBOOL(rx->extflags & RXf_PMf_KEEPCOPY);
7836 /* on something like
7839 * the KEEPCOPY is set on the PMOP rather than the regex */
7840 if (PL_curpm && r == PM_GETRE(PL_curpm))
7841 keepcopy = cBOOL(PL_curpm->op_pmflags & PMf_KEEPCOPY);
7847 /* Some of this code was originally in C<Perl_magic_len> in F<mg.c> */
7849 case RX_BUFF_IDX_CARET_PREMATCH: /* ${^PREMATCH} */
7850 case RX_BUFF_IDX_PREMATCH: /* $` */
7851 if (rx->offs[0].start != -1) {
7852 i = rx->offs[0].start;
7861 case RX_BUFF_IDX_CARET_POSTMATCH: /* ${^POSTMATCH} */
7862 case RX_BUFF_IDX_POSTMATCH: /* $' */
7863 if (rx->offs[0].end != -1) {
7864 i = rx->sublen - rx->offs[0].end;
7866 s1 = rx->offs[0].end;
7873 default: /* $& / ${^MATCH}, $1, $2, ... */
7874 if (paren <= (I32)rx->nparens &&
7875 (s1 = rx->offs[paren].start) != -1 &&
7876 (t1 = rx->offs[paren].end) != -1)
7882 if (ckWARN(WARN_UNINITIALIZED))
7883 report_uninit((const SV *)sv);
7888 if (i > 0 && RXp_MATCH_UTF8(rx)) {
7889 const char * const s = rx->subbeg - rx->suboffset + s1;
7894 if (is_utf8_string_loclen((U8*)s, i, &ep, &el))
7901 Perl_reg_qr_package(pTHX_ REGEXP * const rx)
7903 PERL_ARGS_ASSERT_REG_QR_PACKAGE;
7904 PERL_UNUSED_ARG(rx);
7908 return newSVpvs("Regexp");
7911 /* Scans the name of a named buffer from the pattern.
7912 * If flags is REG_RSN_RETURN_NULL returns null.
7913 * If flags is REG_RSN_RETURN_NAME returns an SV* containing the name
7914 * If flags is REG_RSN_RETURN_DATA returns the data SV* corresponding
7915 * to the parsed name as looked up in the RExC_paren_names hash.
7916 * If there is an error throws a vFAIL().. type exception.
7919 #define REG_RSN_RETURN_NULL 0
7920 #define REG_RSN_RETURN_NAME 1
7921 #define REG_RSN_RETURN_DATA 2
7924 S_reg_scan_name(pTHX_ RExC_state_t *pRExC_state, U32 flags)
7926 char *name_start = RExC_parse;
7928 PERL_ARGS_ASSERT_REG_SCAN_NAME;
7930 assert (RExC_parse <= RExC_end);
7931 if (RExC_parse == RExC_end) NOOP;
7932 else if (isIDFIRST_lazy_if(RExC_parse, UTF)) {
7933 /* skip IDFIRST by using do...while */
7936 RExC_parse += UTF8SKIP(RExC_parse);
7937 } while (isWORDCHAR_utf8((U8*)RExC_parse));
7941 } while (isWORDCHAR(*RExC_parse));
7943 RExC_parse++; /* so the <- from the vFAIL is after the offending
7945 vFAIL("Group name must start with a non-digit word character");
7949 = newSVpvn_flags(name_start, (int)(RExC_parse - name_start),
7950 SVs_TEMP | (UTF ? SVf_UTF8 : 0));
7951 if ( flags == REG_RSN_RETURN_NAME)
7953 else if (flags==REG_RSN_RETURN_DATA) {
7956 if ( ! sv_name ) /* should not happen*/
7957 Perl_croak(aTHX_ "panic: no svname in reg_scan_name");
7958 if (RExC_paren_names)
7959 he_str = hv_fetch_ent( RExC_paren_names, sv_name, 0, 0 );
7961 sv_dat = HeVAL(he_str);
7963 vFAIL("Reference to nonexistent named group");
7967 Perl_croak(aTHX_ "panic: bad flag %lx in reg_scan_name",
7968 (unsigned long) flags);
7970 NOT_REACHED; /* NOTREACHED */
7975 #define DEBUG_PARSE_MSG(funcname) DEBUG_PARSE_r({ \
7977 if (RExC_lastparse!=RExC_parse) { \
7978 PerlIO_printf(Perl_debug_log, "%s", \
7979 Perl_pv_pretty(aTHX_ RExC_mysv1, RExC_parse, \
7980 RExC_end - RExC_parse, 16, \
7982 PERL_PV_ESCAPE_UNI_DETECT | \
7983 PERL_PV_PRETTY_ELLIPSES | \
7984 PERL_PV_PRETTY_LTGT | \
7985 PERL_PV_ESCAPE_RE | \
7986 PERL_PV_PRETTY_EXACTSIZE \
7990 PerlIO_printf(Perl_debug_log,"%16s",""); \
7993 num = RExC_size + 1; \
7995 num=REG_NODE_NUM(RExC_emit); \
7996 if (RExC_lastnum!=num) \
7997 PerlIO_printf(Perl_debug_log,"|%4d",num); \
7999 PerlIO_printf(Perl_debug_log,"|%4s",""); \
8000 PerlIO_printf(Perl_debug_log,"|%*s%-4s", \
8001 (int)((depth*2)), "", \
8005 RExC_lastparse=RExC_parse; \
8010 #define DEBUG_PARSE(funcname) DEBUG_PARSE_r({ \
8011 DEBUG_PARSE_MSG((funcname)); \
8012 PerlIO_printf(Perl_debug_log,"%4s","\n"); \
8014 #define DEBUG_PARSE_FMT(funcname,fmt,args) DEBUG_PARSE_r({ \
8015 DEBUG_PARSE_MSG((funcname)); \
8016 PerlIO_printf(Perl_debug_log,fmt "\n",args); \
8019 /* This section of code defines the inversion list object and its methods. The
8020 * interfaces are highly subject to change, so as much as possible is static to
8021 * this file. An inversion list is here implemented as a malloc'd C UV array
8022 * as an SVt_INVLIST scalar.
8024 * An inversion list for Unicode is an array of code points, sorted by ordinal
8025 * number. The zeroth element is the first code point in the list. The 1th
8026 * element is the first element beyond that not in the list. In other words,
8027 * the first range is
8028 * invlist[0]..(invlist[1]-1)
8029 * The other ranges follow. Thus every element whose index is divisible by two
8030 * marks the beginning of a range that is in the list, and every element not
8031 * divisible by two marks the beginning of a range not in the list. A single
8032 * element inversion list that contains the single code point N generally
8033 * consists of two elements
8036 * (The exception is when N is the highest representable value on the
8037 * machine, in which case the list containing just it would be a single
8038 * element, itself. By extension, if the last range in the list extends to
8039 * infinity, then the first element of that range will be in the inversion list
8040 * at a position that is divisible by two, and is the final element in the
8042 * Taking the complement (inverting) an inversion list is quite simple, if the
8043 * first element is 0, remove it; otherwise add a 0 element at the beginning.
8044 * This implementation reserves an element at the beginning of each inversion
8045 * list to always contain 0; there is an additional flag in the header which
8046 * indicates if the list begins at the 0, or is offset to begin at the next
8049 * More about inversion lists can be found in "Unicode Demystified"
8050 * Chapter 13 by Richard Gillam, published by Addison-Wesley.
8051 * More will be coming when functionality is added later.
8053 * The inversion list data structure is currently implemented as an SV pointing
8054 * to an array of UVs that the SV thinks are bytes. This allows us to have an
8055 * array of UV whose memory management is automatically handled by the existing
8056 * facilities for SV's.
8058 * Some of the methods should always be private to the implementation, and some
8059 * should eventually be made public */
8061 /* The header definitions are in F<invlist_inline.h> */
8063 PERL_STATIC_INLINE UV*
8064 S__invlist_array_init(SV* const invlist, const bool will_have_0)
8066 /* Returns a pointer to the first element in the inversion list's array.
8067 * This is called upon initialization of an inversion list. Where the
8068 * array begins depends on whether the list has the code point U+0000 in it
8069 * or not. The other parameter tells it whether the code that follows this
8070 * call is about to put a 0 in the inversion list or not. The first
8071 * element is either the element reserved for 0, if TRUE, or the element
8072 * after it, if FALSE */
8074 bool* offset = get_invlist_offset_addr(invlist);
8075 UV* zero_addr = (UV *) SvPVX(invlist);
8077 PERL_ARGS_ASSERT__INVLIST_ARRAY_INIT;
8080 assert(! _invlist_len(invlist));
8084 /* 1^1 = 0; 1^0 = 1 */
8085 *offset = 1 ^ will_have_0;
8086 return zero_addr + *offset;
8089 PERL_STATIC_INLINE void
8090 S_invlist_set_len(pTHX_ SV* const invlist, const UV len, const bool offset)
8092 /* Sets the current number of elements stored in the inversion list.
8093 * Updates SvCUR correspondingly */
8094 PERL_UNUSED_CONTEXT;
8095 PERL_ARGS_ASSERT_INVLIST_SET_LEN;
8097 assert(SvTYPE(invlist) == SVt_INVLIST);
8102 : TO_INTERNAL_SIZE(len + offset));
8103 assert(SvLEN(invlist) == 0 || SvCUR(invlist) <= SvLEN(invlist));
8106 #ifndef PERL_IN_XSUB_RE
8108 PERL_STATIC_INLINE IV*
8109 S_get_invlist_previous_index_addr(SV* invlist)
8111 /* Return the address of the IV that is reserved to hold the cached index
8113 PERL_ARGS_ASSERT_GET_INVLIST_PREVIOUS_INDEX_ADDR;
8115 assert(SvTYPE(invlist) == SVt_INVLIST);
8117 return &(((XINVLIST*) SvANY(invlist))->prev_index);
8120 PERL_STATIC_INLINE IV
8121 S_invlist_previous_index(SV* const invlist)
8123 /* Returns cached index of previous search */
8125 PERL_ARGS_ASSERT_INVLIST_PREVIOUS_INDEX;
8127 return *get_invlist_previous_index_addr(invlist);
8130 PERL_STATIC_INLINE void
8131 S_invlist_set_previous_index(SV* const invlist, const IV index)
8133 /* Caches <index> for later retrieval */
8135 PERL_ARGS_ASSERT_INVLIST_SET_PREVIOUS_INDEX;
8137 assert(index == 0 || index < (int) _invlist_len(invlist));
8139 *get_invlist_previous_index_addr(invlist) = index;
8142 PERL_STATIC_INLINE void
8143 S_invlist_trim(SV* const invlist)
8145 PERL_ARGS_ASSERT_INVLIST_TRIM;
8147 assert(SvTYPE(invlist) == SVt_INVLIST);
8149 /* Change the length of the inversion list to how many entries it currently
8151 SvPV_shrink_to_cur((SV *) invlist);
8154 PERL_STATIC_INLINE bool
8155 S_invlist_is_iterating(SV* const invlist)
8157 PERL_ARGS_ASSERT_INVLIST_IS_ITERATING;
8159 return *(get_invlist_iter_addr(invlist)) < (STRLEN) UV_MAX;
8162 #endif /* ifndef PERL_IN_XSUB_RE */
8164 PERL_STATIC_INLINE UV
8165 S_invlist_max(SV* const invlist)
8167 /* Returns the maximum number of elements storable in the inversion list's
8168 * array, without having to realloc() */
8170 PERL_ARGS_ASSERT_INVLIST_MAX;
8172 assert(SvTYPE(invlist) == SVt_INVLIST);
8174 /* Assumes worst case, in which the 0 element is not counted in the
8175 * inversion list, so subtracts 1 for that */
8176 return SvLEN(invlist) == 0 /* This happens under _new_invlist_C_array */
8177 ? FROM_INTERNAL_SIZE(SvCUR(invlist)) - 1
8178 : FROM_INTERNAL_SIZE(SvLEN(invlist)) - 1;
8181 #ifndef PERL_IN_XSUB_RE
8183 Perl__new_invlist(pTHX_ IV initial_size)
8186 /* Return a pointer to a newly constructed inversion list, with enough
8187 * space to store 'initial_size' elements. If that number is negative, a
8188 * system default is used instead */
8192 if (initial_size < 0) {
8196 /* Allocate the initial space */
8197 new_list = newSV_type(SVt_INVLIST);
8199 /* First 1 is in case the zero element isn't in the list; second 1 is for
8201 SvGROW(new_list, TO_INTERNAL_SIZE(initial_size + 1) + 1);
8202 invlist_set_len(new_list, 0, 0);
8204 /* Force iterinit() to be used to get iteration to work */
8205 *get_invlist_iter_addr(new_list) = (STRLEN) UV_MAX;
8207 *get_invlist_previous_index_addr(new_list) = 0;
8213 Perl__new_invlist_C_array(pTHX_ const UV* const list)
8215 /* Return a pointer to a newly constructed inversion list, initialized to
8216 * point to <list>, which has to be in the exact correct inversion list
8217 * form, including internal fields. Thus this is a dangerous routine that
8218 * should not be used in the wrong hands. The passed in 'list' contains
8219 * several header fields at the beginning that are not part of the
8220 * inversion list body proper */
8222 const STRLEN length = (STRLEN) list[0];
8223 const UV version_id = list[1];
8224 const bool offset = cBOOL(list[2]);
8225 #define HEADER_LENGTH 3
8226 /* If any of the above changes in any way, you must change HEADER_LENGTH
8227 * (if appropriate) and regenerate INVLIST_VERSION_ID by running
8228 * perl -E 'say int(rand 2**31-1)'
8230 #define INVLIST_VERSION_ID 148565664 /* This is a combination of a version and
8231 data structure type, so that one being
8232 passed in can be validated to be an
8233 inversion list of the correct vintage.
8236 SV* invlist = newSV_type(SVt_INVLIST);
8238 PERL_ARGS_ASSERT__NEW_INVLIST_C_ARRAY;
8240 if (version_id != INVLIST_VERSION_ID) {
8241 Perl_croak(aTHX_ "panic: Incorrect version for previously generated inversion list");
8244 /* The generated array passed in includes header elements that aren't part
8245 * of the list proper, so start it just after them */
8246 SvPV_set(invlist, (char *) (list + HEADER_LENGTH));
8248 SvLEN_set(invlist, 0); /* Means we own the contents, and the system
8249 shouldn't touch it */
8251 *(get_invlist_offset_addr(invlist)) = offset;
8253 /* The 'length' passed to us is the physical number of elements in the
8254 * inversion list. But if there is an offset the logical number is one
8256 invlist_set_len(invlist, length - offset, offset);
8258 invlist_set_previous_index(invlist, 0);
8260 /* Initialize the iteration pointer. */
8261 invlist_iterfinish(invlist);
8263 SvREADONLY_on(invlist);
8267 #endif /* ifndef PERL_IN_XSUB_RE */
8270 S_invlist_extend(pTHX_ SV* const invlist, const UV new_max)
8272 /* Grow the maximum size of an inversion list */
8274 PERL_ARGS_ASSERT_INVLIST_EXTEND;
8276 assert(SvTYPE(invlist) == SVt_INVLIST);
8278 /* Add one to account for the zero element at the beginning which may not
8279 * be counted by the calling parameters */
8280 SvGROW((SV *)invlist, TO_INTERNAL_SIZE(new_max + 1));
8284 S__append_range_to_invlist(pTHX_ SV* const invlist,
8285 const UV start, const UV end)
8287 /* Subject to change or removal. Append the range from 'start' to 'end' at
8288 * the end of the inversion list. The range must be above any existing
8292 UV max = invlist_max(invlist);
8293 UV len = _invlist_len(invlist);
8296 PERL_ARGS_ASSERT__APPEND_RANGE_TO_INVLIST;
8298 if (len == 0) { /* Empty lists must be initialized */
8299 offset = start != 0;
8300 array = _invlist_array_init(invlist, ! offset);
8303 /* Here, the existing list is non-empty. The current max entry in the
8304 * list is generally the first value not in the set, except when the
8305 * set extends to the end of permissible values, in which case it is
8306 * the first entry in that final set, and so this call is an attempt to
8307 * append out-of-order */
8309 UV final_element = len - 1;
8310 array = invlist_array(invlist);
8311 if (array[final_element] > start
8312 || ELEMENT_RANGE_MATCHES_INVLIST(final_element))
8314 Perl_croak(aTHX_ "panic: attempting to append to an inversion list, but wasn't at the end of the list, final=%"UVuf", start=%"UVuf", match=%c",
8315 array[final_element], start,
8316 ELEMENT_RANGE_MATCHES_INVLIST(final_element) ? 't' : 'f');
8319 /* Here, it is a legal append. If the new range begins with the first
8320 * value not in the set, it is extending the set, so the new first
8321 * value not in the set is one greater than the newly extended range.
8323 offset = *get_invlist_offset_addr(invlist);
8324 if (array[final_element] == start) {
8325 if (end != UV_MAX) {
8326 array[final_element] = end + 1;
8329 /* But if the end is the maximum representable on the machine,
8330 * just let the range that this would extend to have no end */
8331 invlist_set_len(invlist, len - 1, offset);
8337 /* Here the new range doesn't extend any existing set. Add it */
8339 len += 2; /* Includes an element each for the start and end of range */
8341 /* If wll overflow the existing space, extend, which may cause the array to
8344 invlist_extend(invlist, len);
8346 /* Have to set len here to avoid assert failure in invlist_array() */
8347 invlist_set_len(invlist, len, offset);
8349 array = invlist_array(invlist);
8352 invlist_set_len(invlist, len, offset);
8355 /* The next item on the list starts the range, the one after that is
8356 * one past the new range. */
8357 array[len - 2] = start;
8358 if (end != UV_MAX) {
8359 array[len - 1] = end + 1;
8362 /* But if the end is the maximum representable on the machine, just let
8363 * the range have no end */
8364 invlist_set_len(invlist, len - 1, offset);
8368 #ifndef PERL_IN_XSUB_RE
8371 Perl__invlist_search(SV* const invlist, const UV cp)
8373 /* Searches the inversion list for the entry that contains the input code
8374 * point <cp>. If <cp> is not in the list, -1 is returned. Otherwise, the
8375 * return value is the index into the list's array of the range that
8380 IV high = _invlist_len(invlist);
8381 const IV highest_element = high - 1;
8384 PERL_ARGS_ASSERT__INVLIST_SEARCH;
8386 /* If list is empty, return failure. */
8391 /* (We can't get the array unless we know the list is non-empty) */
8392 array = invlist_array(invlist);
8394 mid = invlist_previous_index(invlist);
8395 assert(mid >=0 && mid <= highest_element);
8397 /* <mid> contains the cache of the result of the previous call to this
8398 * function (0 the first time). See if this call is for the same result,
8399 * or if it is for mid-1. This is under the theory that calls to this
8400 * function will often be for related code points that are near each other.
8401 * And benchmarks show that caching gives better results. We also test
8402 * here if the code point is within the bounds of the list. These tests
8403 * replace others that would have had to be made anyway to make sure that
8404 * the array bounds were not exceeded, and these give us extra information
8405 * at the same time */
8406 if (cp >= array[mid]) {
8407 if (cp >= array[highest_element]) {
8408 return highest_element;
8411 /* Here, array[mid] <= cp < array[highest_element]. This means that
8412 * the final element is not the answer, so can exclude it; it also
8413 * means that <mid> is not the final element, so can refer to 'mid + 1'
8415 if (cp < array[mid + 1]) {
8421 else { /* cp < aray[mid] */
8422 if (cp < array[0]) { /* Fail if outside the array */
8426 if (cp >= array[mid - 1]) {
8431 /* Binary search. What we are looking for is <i> such that
8432 * array[i] <= cp < array[i+1]
8433 * The loop below converges on the i+1. Note that there may not be an
8434 * (i+1)th element in the array, and things work nonetheless */
8435 while (low < high) {
8436 mid = (low + high) / 2;
8437 assert(mid <= highest_element);
8438 if (array[mid] <= cp) { /* cp >= array[mid] */
8441 /* We could do this extra test to exit the loop early.
8442 if (cp < array[low]) {
8447 else { /* cp < array[mid] */
8454 invlist_set_previous_index(invlist, high);
8459 Perl__invlist_populate_swatch(SV* const invlist,
8460 const UV start, const UV end, U8* swatch)
8462 /* populates a swatch of a swash the same way swatch_get() does in utf8.c,
8463 * but is used when the swash has an inversion list. This makes this much
8464 * faster, as it uses a binary search instead of a linear one. This is
8465 * intimately tied to that function, and perhaps should be in utf8.c,
8466 * except it is intimately tied to inversion lists as well. It assumes
8467 * that <swatch> is all 0's on input */
8470 const IV len = _invlist_len(invlist);
8474 PERL_ARGS_ASSERT__INVLIST_POPULATE_SWATCH;
8476 if (len == 0) { /* Empty inversion list */
8480 array = invlist_array(invlist);
8482 /* Find which element it is */
8483 i = _invlist_search(invlist, start);
8485 /* We populate from <start> to <end> */
8486 while (current < end) {
8489 /* The inversion list gives the results for every possible code point
8490 * after the first one in the list. Only those ranges whose index is
8491 * even are ones that the inversion list matches. For the odd ones,
8492 * and if the initial code point is not in the list, we have to skip
8493 * forward to the next element */
8494 if (i == -1 || ! ELEMENT_RANGE_MATCHES_INVLIST(i)) {
8496 if (i >= len) { /* Finished if beyond the end of the array */
8500 if (current >= end) { /* Finished if beyond the end of what we
8502 if (LIKELY(end < UV_MAX)) {
8506 /* We get here when the upper bound is the maximum
8507 * representable on the machine, and we are looking for just
8508 * that code point. Have to special case it */
8510 goto join_end_of_list;
8513 assert(current >= start);
8515 /* The current range ends one below the next one, except don't go past
8518 upper = (i < len && array[i] < end) ? array[i] : end;
8520 /* Here we are in a range that matches. Populate a bit in the 3-bit U8
8521 * for each code point in it */
8522 for (; current < upper; current++) {
8523 const STRLEN offset = (STRLEN)(current - start);
8524 swatch[offset >> 3] |= 1 << (offset & 7);
8529 /* Quit if at the end of the list */
8532 /* But first, have to deal with the highest possible code point on
8533 * the platform. The previous code assumes that <end> is one
8534 * beyond where we want to populate, but that is impossible at the
8535 * platform's infinity, so have to handle it specially */
8536 if (UNLIKELY(end == UV_MAX && ELEMENT_RANGE_MATCHES_INVLIST(len-1)))
8538 const STRLEN offset = (STRLEN)(end - start);
8539 swatch[offset >> 3] |= 1 << (offset & 7);
8544 /* Advance to the next range, which will be for code points not in the
8553 Perl__invlist_union_maybe_complement_2nd(pTHX_ SV* const a, SV* const b,
8554 const bool complement_b, SV** output)
8556 /* Take the union of two inversion lists and point <output> to it. *output
8557 * SHOULD BE DEFINED upon input, and if it points to one of the two lists,
8558 * the reference count to that list will be decremented if not already a
8559 * temporary (mortal); otherwise *output will be made correspondingly
8560 * mortal. The first list, <a>, may be NULL, in which case a copy of the
8561 * second list is returned. If <complement_b> is TRUE, the union is taken
8562 * of the complement (inversion) of <b> instead of b itself.
8564 * The basis for this comes from "Unicode Demystified" Chapter 13 by
8565 * Richard Gillam, published by Addison-Wesley, and explained at some
8566 * length there. The preface says to incorporate its examples into your
8567 * code at your own risk.
8569 * The algorithm is like a merge sort.
8571 * XXX A potential performance improvement is to keep track as we go along
8572 * if only one of the inputs contributes to the result, meaning the other
8573 * is a subset of that one. In that case, we can skip the final copy and
8574 * return the larger of the input lists, but then outside code might need
8575 * to keep track of whether to free the input list or not */
8577 const UV* array_a; /* a's array */
8579 UV len_a; /* length of a's array */
8582 SV* u; /* the resulting union */
8586 UV i_a = 0; /* current index into a's array */
8590 /* running count, as explained in the algorithm source book; items are
8591 * stopped accumulating and are output when the count changes to/from 0.
8592 * The count is incremented when we start a range that's in the set, and
8593 * decremented when we start a range that's not in the set. So its range
8594 * is 0 to 2. Only when the count is zero is something not in the set.
8598 PERL_ARGS_ASSERT__INVLIST_UNION_MAYBE_COMPLEMENT_2ND;
8601 /* If either one is empty, the union is the other one */
8602 if (a == NULL || ((len_a = _invlist_len(a)) == 0)) {
8603 bool make_temp = FALSE; /* Should we mortalize the result? */
8607 if (! (make_temp = cBOOL(SvTEMP(a)))) {
8613 *output = invlist_clone(b);
8615 _invlist_invert(*output);
8617 } /* else *output already = b; */
8620 sv_2mortal(*output);
8624 else if ((len_b = _invlist_len(b)) == 0) {
8625 bool make_temp = FALSE;
8627 if (! (make_temp = cBOOL(SvTEMP(b)))) {
8632 /* The complement of an empty list is a list that has everything in it,
8633 * so the union with <a> includes everything too */
8636 if (! (make_temp = cBOOL(SvTEMP(a)))) {
8640 *output = _new_invlist(1);
8641 _append_range_to_invlist(*output, 0, UV_MAX);
8643 else if (*output != a) {
8644 *output = invlist_clone(a);
8646 /* else *output already = a; */
8649 sv_2mortal(*output);
8654 /* Here both lists exist and are non-empty */
8655 array_a = invlist_array(a);
8656 array_b = invlist_array(b);
8658 /* If are to take the union of 'a' with the complement of b, set it
8659 * up so are looking at b's complement. */
8662 /* To complement, we invert: if the first element is 0, remove it. To
8663 * do this, we just pretend the array starts one later */
8664 if (array_b[0] == 0) {
8670 /* But if the first element is not zero, we pretend the list starts
8671 * at the 0 that is always stored immediately before the array. */
8677 /* Size the union for the worst case: that the sets are completely
8679 u = _new_invlist(len_a + len_b);
8681 /* Will contain U+0000 if either component does */
8682 array_u = _invlist_array_init(u, (len_a > 0 && array_a[0] == 0)
8683 || (len_b > 0 && array_b[0] == 0));
8685 /* Go through each list item by item, stopping when exhausted one of
8687 while (i_a < len_a && i_b < len_b) {
8688 UV cp; /* The element to potentially add to the union's array */
8689 bool cp_in_set; /* is it in the the input list's set or not */
8691 /* We need to take one or the other of the two inputs for the union.
8692 * Since we are merging two sorted lists, we take the smaller of the
8693 * next items. In case of a tie, we take the one that is in its set
8694 * first. If we took one not in the set first, it would decrement the
8695 * count, possibly to 0 which would cause it to be output as ending the
8696 * range, and the next time through we would take the same number, and
8697 * output it again as beginning the next range. By doing it the
8698 * opposite way, there is no possibility that the count will be
8699 * momentarily decremented to 0, and thus the two adjoining ranges will
8700 * be seamlessly merged. (In a tie and both are in the set or both not
8701 * in the set, it doesn't matter which we take first.) */
8702 if (array_a[i_a] < array_b[i_b]
8703 || (array_a[i_a] == array_b[i_b]
8704 && ELEMENT_RANGE_MATCHES_INVLIST(i_a)))
8706 cp_in_set = ELEMENT_RANGE_MATCHES_INVLIST(i_a);
8710 cp_in_set = ELEMENT_RANGE_MATCHES_INVLIST(i_b);
8711 cp = array_b[i_b++];
8714 /* Here, have chosen which of the two inputs to look at. Only output
8715 * if the running count changes to/from 0, which marks the
8716 * beginning/end of a range in that's in the set */
8719 array_u[i_u++] = cp;
8726 array_u[i_u++] = cp;
8731 /* Here, we are finished going through at least one of the lists, which
8732 * means there is something remaining in at most one. We check if the list
8733 * that hasn't been exhausted is positioned such that we are in the middle
8734 * of a range in its set or not. (i_a and i_b point to the element beyond
8735 * the one we care about.) If in the set, we decrement 'count'; if 0, there
8736 * is potentially more to output.
8737 * There are four cases:
8738 * 1) Both weren't in their sets, count is 0, and remains 0. What's left
8739 * in the union is entirely from the non-exhausted set.
8740 * 2) Both were in their sets, count is 2. Nothing further should
8741 * be output, as everything that remains will be in the exhausted
8742 * list's set, hence in the union; decrementing to 1 but not 0 insures
8744 * 3) the exhausted was in its set, non-exhausted isn't, count is 1.
8745 * Nothing further should be output because the union includes
8746 * everything from the exhausted set. Not decrementing ensures that.
8747 * 4) the exhausted wasn't in its set, non-exhausted is, count is 1;
8748 * decrementing to 0 insures that we look at the remainder of the
8749 * non-exhausted set */
8750 if ((i_a != len_a && PREV_RANGE_MATCHES_INVLIST(i_a))
8751 || (i_b != len_b && PREV_RANGE_MATCHES_INVLIST(i_b)))
8756 /* The final length is what we've output so far, plus what else is about to
8757 * be output. (If 'count' is non-zero, then the input list we exhausted
8758 * has everything remaining up to the machine's limit in its set, and hence
8759 * in the union, so there will be no further output. */
8762 /* At most one of the subexpressions will be non-zero */
8763 len_u += (len_a - i_a) + (len_b - i_b);
8766 /* Set result to final length, which can change the pointer to array_u, so
8768 if (len_u != _invlist_len(u)) {
8769 invlist_set_len(u, len_u, *get_invlist_offset_addr(u));
8771 array_u = invlist_array(u);
8774 /* When 'count' is 0, the list that was exhausted (if one was shorter than
8775 * the other) ended with everything above it not in its set. That means
8776 * that the remaining part of the union is precisely the same as the
8777 * non-exhausted list, so can just copy it unchanged. (If both list were
8778 * exhausted at the same time, then the operations below will be both 0.)
8781 IV copy_count; /* At most one will have a non-zero copy count */
8782 if ((copy_count = len_a - i_a) > 0) {
8783 Copy(array_a + i_a, array_u + i_u, copy_count, UV);
8785 else if ((copy_count = len_b - i_b) > 0) {
8786 Copy(array_b + i_b, array_u + i_u, copy_count, UV);
8790 /* We may be removing a reference to one of the inputs. If so, the output
8791 * is made mortal if the input was. (Mortal SVs shouldn't have their ref
8792 * count decremented) */
8793 if (a == *output || b == *output) {
8794 assert(! invlist_is_iterating(*output));
8795 if ((SvTEMP(*output))) {
8799 SvREFCNT_dec_NN(*output);
8809 Perl__invlist_intersection_maybe_complement_2nd(pTHX_ SV* const a, SV* const b,
8810 const bool complement_b, SV** i)
8812 /* Take the intersection of two inversion lists and point <i> to it. *i
8813 * SHOULD BE DEFINED upon input, and if it points to one of the two lists,
8814 * the reference count to that list will be decremented if not already a
8815 * temporary (mortal); otherwise *i will be made correspondingly mortal.
8816 * The first list, <a>, may be NULL, in which case an empty list is
8817 * returned. If <complement_b> is TRUE, the result will be the
8818 * intersection of <a> and the complement (or inversion) of <b> instead of
8821 * The basis for this comes from "Unicode Demystified" Chapter 13 by
8822 * Richard Gillam, published by Addison-Wesley, and explained at some
8823 * length there. The preface says to incorporate its examples into your
8824 * code at your own risk. In fact, it had bugs
8826 * The algorithm is like a merge sort, and is essentially the same as the
8830 const UV* array_a; /* a's array */
8832 UV len_a; /* length of a's array */
8835 SV* r; /* the resulting intersection */
8839 UV i_a = 0; /* current index into a's array */
8843 /* running count, as explained in the algorithm source book; items are
8844 * stopped accumulating and are output when the count changes to/from 2.
8845 * The count is incremented when we start a range that's in the set, and
8846 * decremented when we start a range that's not in the set. So its range
8847 * is 0 to 2. Only when the count is 2 is something in the intersection.
8851 PERL_ARGS_ASSERT__INVLIST_INTERSECTION_MAYBE_COMPLEMENT_2ND;
8854 /* Special case if either one is empty */
8855 len_a = (a == NULL) ? 0 : _invlist_len(a);
8856 if ((len_a == 0) || ((len_b = _invlist_len(b)) == 0)) {
8857 bool make_temp = FALSE;
8859 if (len_a != 0 && complement_b) {
8861 /* Here, 'a' is not empty, therefore from the above 'if', 'b' must
8862 * be empty. Here, also we are using 'b's complement, which hence
8863 * must be every possible code point. Thus the intersection is
8867 if (! (make_temp = cBOOL(SvTEMP(b)))) {
8872 *i = invlist_clone(a);
8874 /* else *i is already 'a' */
8882 /* Here, 'a' or 'b' is empty and not using the complement of 'b'. The
8883 * intersection must be empty */
8885 if (! (make_temp = cBOOL(SvTEMP(a)))) {
8890 if (! (make_temp = cBOOL(SvTEMP(b)))) {
8894 *i = _new_invlist(0);
8902 /* Here both lists exist and are non-empty */
8903 array_a = invlist_array(a);
8904 array_b = invlist_array(b);
8906 /* If are to take the intersection of 'a' with the complement of b, set it
8907 * up so are looking at b's complement. */
8910 /* To complement, we invert: if the first element is 0, remove it. To
8911 * do this, we just pretend the array starts one later */
8912 if (array_b[0] == 0) {
8918 /* But if the first element is not zero, we pretend the list starts
8919 * at the 0 that is always stored immediately before the array. */
8925 /* Size the intersection for the worst case: that the intersection ends up
8926 * fragmenting everything to be completely disjoint */
8927 r= _new_invlist(len_a + len_b);
8929 /* Will contain U+0000 iff both components do */
8930 array_r = _invlist_array_init(r, len_a > 0 && array_a[0] == 0
8931 && len_b > 0 && array_b[0] == 0);
8933 /* Go through each list item by item, stopping when exhausted one of
8935 while (i_a < len_a && i_b < len_b) {
8936 UV cp; /* The element to potentially add to the intersection's
8938 bool cp_in_set; /* Is it in the input list's set or not */
8940 /* We need to take one or the other of the two inputs for the
8941 * intersection. Since we are merging two sorted lists, we take the
8942 * smaller of the next items. In case of a tie, we take the one that
8943 * is not in its set first (a difference from the union algorithm). If
8944 * we took one in the set first, it would increment the count, possibly
8945 * to 2 which would cause it to be output as starting a range in the
8946 * intersection, and the next time through we would take that same
8947 * number, and output it again as ending the set. By doing it the
8948 * opposite of this, there is no possibility that the count will be
8949 * momentarily incremented to 2. (In a tie and both are in the set or
8950 * both not in the set, it doesn't matter which we take first.) */
8951 if (array_a[i_a] < array_b[i_b]
8952 || (array_a[i_a] == array_b[i_b]
8953 && ! ELEMENT_RANGE_MATCHES_INVLIST(i_a)))
8955 cp_in_set = ELEMENT_RANGE_MATCHES_INVLIST(i_a);
8959 cp_in_set = ELEMENT_RANGE_MATCHES_INVLIST(i_b);
8963 /* Here, have chosen which of the two inputs to look at. Only output
8964 * if the running count changes to/from 2, which marks the
8965 * beginning/end of a range that's in the intersection */
8969 array_r[i_r++] = cp;
8974 array_r[i_r++] = cp;
8980 /* Here, we are finished going through at least one of the lists, which
8981 * means there is something remaining in at most one. We check if the list
8982 * that has been exhausted is positioned such that we are in the middle
8983 * of a range in its set or not. (i_a and i_b point to elements 1 beyond
8984 * the ones we care about.) There are four cases:
8985 * 1) Both weren't in their sets, count is 0, and remains 0. There's
8986 * nothing left in the intersection.
8987 * 2) Both were in their sets, count is 2 and perhaps is incremented to
8988 * above 2. What should be output is exactly that which is in the
8989 * non-exhausted set, as everything it has is also in the intersection
8990 * set, and everything it doesn't have can't be in the intersection
8991 * 3) The exhausted was in its set, non-exhausted isn't, count is 1, and
8992 * gets incremented to 2. Like the previous case, the intersection is
8993 * everything that remains in the non-exhausted set.
8994 * 4) the exhausted wasn't in its set, non-exhausted is, count is 1, and
8995 * remains 1. And the intersection has nothing more. */
8996 if ((i_a == len_a && PREV_RANGE_MATCHES_INVLIST(i_a))
8997 || (i_b == len_b && PREV_RANGE_MATCHES_INVLIST(i_b)))
9002 /* The final length is what we've output so far plus what else is in the
9003 * intersection. At most one of the subexpressions below will be non-zero
9007 len_r += (len_a - i_a) + (len_b - i_b);
9010 /* Set result to final length, which can change the pointer to array_r, so
9012 if (len_r != _invlist_len(r)) {
9013 invlist_set_len(r, len_r, *get_invlist_offset_addr(r));
9015 array_r = invlist_array(r);
9018 /* Finish outputting any remaining */
9019 if (count >= 2) { /* At most one will have a non-zero copy count */
9021 if ((copy_count = len_a - i_a) > 0) {
9022 Copy(array_a + i_a, array_r + i_r, copy_count, UV);
9024 else if ((copy_count = len_b - i_b) > 0) {
9025 Copy(array_b + i_b, array_r + i_r, copy_count, UV);
9029 /* We may be removing a reference to one of the inputs. If so, the output
9030 * is made mortal if the input was. (Mortal SVs shouldn't have their ref
9031 * count decremented) */
9032 if (a == *i || b == *i) {
9033 assert(! invlist_is_iterating(*i));
9038 SvREFCNT_dec_NN(*i);
9048 Perl__add_range_to_invlist(pTHX_ SV* invlist, const UV start, const UV end)
9050 /* Add the range from 'start' to 'end' inclusive to the inversion list's
9051 * set. A pointer to the inversion list is returned. This may actually be
9052 * a new list, in which case the passed in one has been destroyed. The
9053 * passed-in inversion list can be NULL, in which case a new one is created
9054 * with just the one range in it */
9059 if (invlist == NULL) {
9060 invlist = _new_invlist(2);
9064 len = _invlist_len(invlist);
9067 /* If comes after the final entry actually in the list, can just append it
9070 || (! ELEMENT_RANGE_MATCHES_INVLIST(len - 1)
9071 && start >= invlist_array(invlist)[len - 1]))
9073 _append_range_to_invlist(invlist, start, end);
9077 /* Here, can't just append things, create and return a new inversion list
9078 * which is the union of this range and the existing inversion list. (If
9079 * the new range is well-behaved wrt to the old one, we could just insert
9080 * it, doing a Move() down on the tail of the old one (potentially growing
9081 * it first). But to determine that means we would have the extra
9082 * (possibly throw-away) work of first finding where the new one goes and
9083 * whether it disrupts (splits) an existing range, so it doesn't appear to
9084 * me (khw) that it's worth it) */
9085 range_invlist = _new_invlist(2);
9086 _append_range_to_invlist(range_invlist, start, end);
9088 _invlist_union(invlist, range_invlist, &invlist);
9090 /* The temporary can be freed */
9091 SvREFCNT_dec_NN(range_invlist);
9097 Perl__setup_canned_invlist(pTHX_ const STRLEN size, const UV element0,
9098 UV** other_elements_ptr)
9100 /* Create and return an inversion list whose contents are to be populated
9101 * by the caller. The caller gives the number of elements (in 'size') and
9102 * the very first element ('element0'). This function will set
9103 * '*other_elements_ptr' to an array of UVs, where the remaining elements
9106 * Obviously there is some trust involved that the caller will properly
9107 * fill in the other elements of the array.
9109 * (The first element needs to be passed in, as the underlying code does
9110 * things differently depending on whether it is zero or non-zero) */
9112 SV* invlist = _new_invlist(size);
9115 PERL_ARGS_ASSERT__SETUP_CANNED_INVLIST;
9117 _append_range_to_invlist(invlist, element0, element0);
9118 offset = *get_invlist_offset_addr(invlist);
9120 invlist_set_len(invlist, size, offset);
9121 *other_elements_ptr = invlist_array(invlist) + 1;
9127 PERL_STATIC_INLINE SV*
9128 S_add_cp_to_invlist(pTHX_ SV* invlist, const UV cp) {
9129 return _add_range_to_invlist(invlist, cp, cp);
9132 #ifndef PERL_IN_XSUB_RE
9134 Perl__invlist_invert(pTHX_ SV* const invlist)
9136 /* Complement the input inversion list. This adds a 0 if the list didn't
9137 * have a zero; removes it otherwise. As described above, the data
9138 * structure is set up so that this is very efficient */
9140 PERL_ARGS_ASSERT__INVLIST_INVERT;
9142 assert(! invlist_is_iterating(invlist));
9144 /* The inverse of matching nothing is matching everything */
9145 if (_invlist_len(invlist) == 0) {
9146 _append_range_to_invlist(invlist, 0, UV_MAX);
9150 *get_invlist_offset_addr(invlist) = ! *get_invlist_offset_addr(invlist);
9155 PERL_STATIC_INLINE SV*
9156 S_invlist_clone(pTHX_ SV* const invlist)
9159 /* Return a new inversion list that is a copy of the input one, which is
9160 * unchanged. The new list will not be mortal even if the old one was. */
9162 /* Need to allocate extra space to accommodate Perl's addition of a
9163 * trailing NUL to SvPV's, since it thinks they are always strings */
9164 SV* new_invlist = _new_invlist(_invlist_len(invlist) + 1);
9165 STRLEN physical_length = SvCUR(invlist);
9166 bool offset = *(get_invlist_offset_addr(invlist));
9168 PERL_ARGS_ASSERT_INVLIST_CLONE;
9170 *(get_invlist_offset_addr(new_invlist)) = offset;
9171 invlist_set_len(new_invlist, _invlist_len(invlist), offset);
9172 Copy(SvPVX(invlist), SvPVX(new_invlist), physical_length, char);
9177 PERL_STATIC_INLINE STRLEN*
9178 S_get_invlist_iter_addr(SV* invlist)
9180 /* Return the address of the UV that contains the current iteration
9183 PERL_ARGS_ASSERT_GET_INVLIST_ITER_ADDR;
9185 assert(SvTYPE(invlist) == SVt_INVLIST);
9187 return &(((XINVLIST*) SvANY(invlist))->iterator);
9190 PERL_STATIC_INLINE void
9191 S_invlist_iterinit(SV* invlist) /* Initialize iterator for invlist */
9193 PERL_ARGS_ASSERT_INVLIST_ITERINIT;
9195 *get_invlist_iter_addr(invlist) = 0;
9198 PERL_STATIC_INLINE void
9199 S_invlist_iterfinish(SV* invlist)
9201 /* Terminate iterator for invlist. This is to catch development errors.
9202 * Any iteration that is interrupted before completed should call this
9203 * function. Functions that add code points anywhere else but to the end
9204 * of an inversion list assert that they are not in the middle of an
9205 * iteration. If they were, the addition would make the iteration
9206 * problematical: if the iteration hadn't reached the place where things
9207 * were being added, it would be ok */
9209 PERL_ARGS_ASSERT_INVLIST_ITERFINISH;
9211 *get_invlist_iter_addr(invlist) = (STRLEN) UV_MAX;
9215 S_invlist_iternext(SV* invlist, UV* start, UV* end)
9217 /* An C<invlist_iterinit> call on <invlist> must be used to set this up.
9218 * This call sets in <*start> and <*end>, the next range in <invlist>.
9219 * Returns <TRUE> if successful and the next call will return the next
9220 * range; <FALSE> if was already at the end of the list. If the latter,
9221 * <*start> and <*end> are unchanged, and the next call to this function
9222 * will start over at the beginning of the list */
9224 STRLEN* pos = get_invlist_iter_addr(invlist);
9225 UV len = _invlist_len(invlist);
9228 PERL_ARGS_ASSERT_INVLIST_ITERNEXT;
9231 *pos = (STRLEN) UV_MAX; /* Force iterinit() to be required next time */
9235 array = invlist_array(invlist);
9237 *start = array[(*pos)++];
9243 *end = array[(*pos)++] - 1;
9249 PERL_STATIC_INLINE UV
9250 S_invlist_highest(SV* const invlist)
9252 /* Returns the highest code point that matches an inversion list. This API
9253 * has an ambiguity, as it returns 0 under either the highest is actually
9254 * 0, or if the list is empty. If this distinction matters to you, check
9255 * for emptiness before calling this function */
9257 UV len = _invlist_len(invlist);
9260 PERL_ARGS_ASSERT_INVLIST_HIGHEST;
9266 array = invlist_array(invlist);
9268 /* The last element in the array in the inversion list always starts a
9269 * range that goes to infinity. That range may be for code points that are
9270 * matched in the inversion list, or it may be for ones that aren't
9271 * matched. In the latter case, the highest code point in the set is one
9272 * less than the beginning of this range; otherwise it is the final element
9273 * of this range: infinity */
9274 return (ELEMENT_RANGE_MATCHES_INVLIST(len - 1))
9276 : array[len - 1] - 1;
9279 #ifndef PERL_IN_XSUB_RE
9281 Perl__invlist_contents(pTHX_ SV* const invlist)
9283 /* Get the contents of an inversion list into a string SV so that they can
9284 * be printed out. It uses the format traditionally done for debug tracing
9288 SV* output = newSVpvs("\n");
9290 PERL_ARGS_ASSERT__INVLIST_CONTENTS;
9292 assert(! invlist_is_iterating(invlist));
9294 invlist_iterinit(invlist);
9295 while (invlist_iternext(invlist, &start, &end)) {
9296 if (end == UV_MAX) {
9297 Perl_sv_catpvf(aTHX_ output, "%04"UVXf"\tINFINITY\n", start);
9299 else if (end != start) {
9300 Perl_sv_catpvf(aTHX_ output, "%04"UVXf"\t%04"UVXf"\n",
9304 Perl_sv_catpvf(aTHX_ output, "%04"UVXf"\n", start);
9312 #ifndef PERL_IN_XSUB_RE
9314 Perl__invlist_dump(pTHX_ PerlIO *file, I32 level,
9315 const char * const indent, SV* const invlist)
9317 /* Designed to be called only by do_sv_dump(). Dumps out the ranges of the
9318 * inversion list 'invlist' to 'file' at 'level' Each line is prefixed by
9319 * the string 'indent'. The output looks like this:
9320 [0] 0x000A .. 0x000D
9322 [4] 0x2028 .. 0x2029
9323 [6] 0x3104 .. INFINITY
9324 * This means that the first range of code points matched by the list are
9325 * 0xA through 0xD; the second range contains only the single code point
9326 * 0x85, etc. An inversion list is an array of UVs. Two array elements
9327 * are used to define each range (except if the final range extends to
9328 * infinity, only a single element is needed). The array index of the
9329 * first element for the corresponding range is given in brackets. */
9334 PERL_ARGS_ASSERT__INVLIST_DUMP;
9336 if (invlist_is_iterating(invlist)) {
9337 Perl_dump_indent(aTHX_ level, file,
9338 "%sCan't dump inversion list because is in middle of iterating\n",
9343 invlist_iterinit(invlist);
9344 while (invlist_iternext(invlist, &start, &end)) {
9345 if (end == UV_MAX) {
9346 Perl_dump_indent(aTHX_ level, file,
9347 "%s[%"UVuf"] 0x%04"UVXf" .. INFINITY\n",
9348 indent, (UV)count, start);
9350 else if (end != start) {
9351 Perl_dump_indent(aTHX_ level, file,
9352 "%s[%"UVuf"] 0x%04"UVXf" .. 0x%04"UVXf"\n",
9353 indent, (UV)count, start, end);
9356 Perl_dump_indent(aTHX_ level, file, "%s[%"UVuf"] 0x%04"UVXf"\n",
9357 indent, (UV)count, start);
9364 Perl__load_PL_utf8_foldclosures (pTHX)
9366 assert(! PL_utf8_foldclosures);
9368 /* If the folds haven't been read in, call a fold function
9370 if (! PL_utf8_tofold) {
9371 U8 dummy[UTF8_MAXBYTES_CASE+1];
9373 /* This string is just a short named one above \xff */
9374 to_utf8_fold((U8*) HYPHEN_UTF8, dummy, NULL);
9375 assert(PL_utf8_tofold); /* Verify that worked */
9377 PL_utf8_foldclosures = _swash_inversion_hash(PL_utf8_tofold);
9381 #ifdef PERL_ARGS_ASSERT__INVLISTEQ
9383 S__invlistEQ(pTHX_ SV* const a, SV* const b, const bool complement_b)
9385 /* Return a boolean as to if the two passed in inversion lists are
9386 * identical. The final argument, if TRUE, says to take the complement of
9387 * the second inversion list before doing the comparison */
9389 const UV* array_a = invlist_array(a);
9390 const UV* array_b = invlist_array(b);
9391 UV len_a = _invlist_len(a);
9392 UV len_b = _invlist_len(b);
9394 UV i = 0; /* current index into the arrays */
9395 bool retval = TRUE; /* Assume are identical until proven otherwise */
9397 PERL_ARGS_ASSERT__INVLISTEQ;
9399 /* If are to compare 'a' with the complement of b, set it
9400 * up so are looking at b's complement. */
9403 /* The complement of nothing is everything, so <a> would have to have
9404 * just one element, starting at zero (ending at infinity) */
9406 return (len_a == 1 && array_a[0] == 0);
9408 else if (array_b[0] == 0) {
9410 /* Otherwise, to complement, we invert. Here, the first element is
9411 * 0, just remove it. To do this, we just pretend the array starts
9419 /* But if the first element is not zero, we pretend the list starts
9420 * at the 0 that is always stored immediately before the array. */
9426 /* Make sure that the lengths are the same, as well as the final element
9427 * before looping through the remainder. (Thus we test the length, final,
9428 * and first elements right off the bat) */
9429 if (len_a != len_b || array_a[len_a-1] != array_b[len_a-1]) {
9432 else for (i = 0; i < len_a - 1; i++) {
9433 if (array_a[i] != array_b[i]) {
9444 * As best we can, determine the characters that can match the start of
9445 * the given EXACTF-ish node.
9447 * Returns the invlist as a new SV*; it is the caller's responsibility to
9448 * call SvREFCNT_dec() when done with it.
9451 S__make_exactf_invlist(pTHX_ RExC_state_t *pRExC_state, regnode *node)
9453 const U8 * s = (U8*)STRING(node);
9454 SSize_t bytelen = STR_LEN(node);
9456 /* Start out big enough for 2 separate code points */
9457 SV* invlist = _new_invlist(4);
9459 PERL_ARGS_ASSERT__MAKE_EXACTF_INVLIST;
9464 /* We punt and assume can match anything if the node begins
9465 * with a multi-character fold. Things are complicated. For
9466 * example, /ffi/i could match any of:
9467 * "\N{LATIN SMALL LIGATURE FFI}"
9468 * "\N{LATIN SMALL LIGATURE FF}I"
9469 * "F\N{LATIN SMALL LIGATURE FI}"
9470 * plus several other things; and making sure we have all the
9471 * possibilities is hard. */
9472 if (is_MULTI_CHAR_FOLD_latin1_safe(s, s + bytelen)) {
9473 invlist = _add_range_to_invlist(invlist, 0, UV_MAX);
9476 /* Any Latin1 range character can potentially match any
9477 * other depending on the locale */
9478 if (OP(node) == EXACTFL) {
9479 _invlist_union(invlist, PL_Latin1, &invlist);
9482 /* But otherwise, it matches at least itself. We can
9483 * quickly tell if it has a distinct fold, and if so,
9484 * it matches that as well */
9485 invlist = add_cp_to_invlist(invlist, uc);
9486 if (IS_IN_SOME_FOLD_L1(uc))
9487 invlist = add_cp_to_invlist(invlist, PL_fold_latin1[uc]);
9490 /* Some characters match above-Latin1 ones under /i. This
9491 * is true of EXACTFL ones when the locale is UTF-8 */
9492 if (HAS_NONLATIN1_SIMPLE_FOLD_CLOSURE(uc)
9493 && (! isASCII(uc) || (OP(node) != EXACTFA
9494 && OP(node) != EXACTFA_NO_TRIE)))
9496 add_above_Latin1_folds(pRExC_state, (U8) uc, &invlist);
9500 else { /* Pattern is UTF-8 */
9501 U8 folded[UTF8_MAX_FOLD_CHAR_EXPAND * UTF8_MAXBYTES_CASE + 1] = { '\0' };
9502 STRLEN foldlen = UTF8SKIP(s);
9503 const U8* e = s + bytelen;
9506 uc = utf8_to_uvchr_buf(s, s + bytelen, NULL);
9508 /* The only code points that aren't folded in a UTF EXACTFish
9509 * node are are the problematic ones in EXACTFL nodes */
9510 if (OP(node) == EXACTFL && is_PROBLEMATIC_LOCALE_FOLDEDS_START_cp(uc)) {
9511 /* We need to check for the possibility that this EXACTFL
9512 * node begins with a multi-char fold. Therefore we fold
9513 * the first few characters of it so that we can make that
9518 for (i = 0; i < UTF8_MAX_FOLD_CHAR_EXPAND && s < e; i++) {
9520 *(d++) = (U8) toFOLD(*s);
9525 to_utf8_fold(s, d, &len);
9531 /* And set up so the code below that looks in this folded
9532 * buffer instead of the node's string */
9534 foldlen = UTF8SKIP(folded);
9538 /* When we reach here 's' points to the fold of the first
9539 * character(s) of the node; and 'e' points to far enough along
9540 * the folded string to be just past any possible multi-char
9541 * fold. 'foldlen' is the length in bytes of the first
9544 * Unlike the non-UTF-8 case, the macro for determining if a
9545 * string is a multi-char fold requires all the characters to
9546 * already be folded. This is because of all the complications
9547 * if not. Note that they are folded anyway, except in EXACTFL
9548 * nodes. Like the non-UTF case above, we punt if the node
9549 * begins with a multi-char fold */
9551 if (is_MULTI_CHAR_FOLD_utf8_safe(s, e)) {
9552 invlist = _add_range_to_invlist(invlist, 0, UV_MAX);
9554 else { /* Single char fold */
9556 /* It matches all the things that fold to it, which are
9557 * found in PL_utf8_foldclosures (including itself) */
9558 invlist = add_cp_to_invlist(invlist, uc);
9559 if (! PL_utf8_foldclosures)
9560 _load_PL_utf8_foldclosures();
9561 if ((listp = hv_fetch(PL_utf8_foldclosures,
9562 (char *) s, foldlen, FALSE)))
9564 AV* list = (AV*) *listp;
9566 for (k = 0; k <= av_tindex(list); k++) {
9567 SV** c_p = av_fetch(list, k, FALSE);
9573 /* /aa doesn't allow folds between ASCII and non- */
9574 if ((OP(node) == EXACTFA || OP(node) == EXACTFA_NO_TRIE)
9575 && isASCII(c) != isASCII(uc))
9580 invlist = add_cp_to_invlist(invlist, c);
9589 #undef HEADER_LENGTH
9590 #undef TO_INTERNAL_SIZE
9591 #undef FROM_INTERNAL_SIZE
9592 #undef INVLIST_VERSION_ID
9594 /* End of inversion list object */
9597 S_parse_lparen_question_flags(pTHX_ RExC_state_t *pRExC_state)
9599 /* This parses the flags that are in either the '(?foo)' or '(?foo:bar)'
9600 * constructs, and updates RExC_flags with them. On input, RExC_parse
9601 * should point to the first flag; it is updated on output to point to the
9602 * final ')' or ':'. There needs to be at least one flag, or this will
9605 /* for (?g), (?gc), and (?o) warnings; warning
9606 about (?c) will warn about (?g) -- japhy */
9608 #define WASTED_O 0x01
9609 #define WASTED_G 0x02
9610 #define WASTED_C 0x04
9611 #define WASTED_GC (WASTED_G|WASTED_C)
9612 I32 wastedflags = 0x00;
9613 U32 posflags = 0, negflags = 0;
9614 U32 *flagsp = &posflags;
9615 char has_charset_modifier = '\0';
9617 bool has_use_defaults = FALSE;
9618 const char* const seqstart = RExC_parse - 1; /* Point to the '?' */
9619 int x_mod_count = 0;
9621 PERL_ARGS_ASSERT_PARSE_LPAREN_QUESTION_FLAGS;
9623 /* '^' as an initial flag sets certain defaults */
9624 if (UCHARAT(RExC_parse) == '^') {
9626 has_use_defaults = TRUE;
9627 STD_PMMOD_FLAGS_CLEAR(&RExC_flags);
9628 set_regex_charset(&RExC_flags, (RExC_utf8 || RExC_uni_semantics)
9629 ? REGEX_UNICODE_CHARSET
9630 : REGEX_DEPENDS_CHARSET);
9633 cs = get_regex_charset(RExC_flags);
9634 if (cs == REGEX_DEPENDS_CHARSET
9635 && (RExC_utf8 || RExC_uni_semantics))
9637 cs = REGEX_UNICODE_CHARSET;
9640 while (*RExC_parse) {
9641 /* && strchr("iogcmsx", *RExC_parse) */
9642 /* (?g), (?gc) and (?o) are useless here
9643 and must be globally applied -- japhy */
9644 switch (*RExC_parse) {
9646 /* Code for the imsxn flags */
9647 CASE_STD_PMMOD_FLAGS_PARSE_SET(flagsp, x_mod_count);
9649 case LOCALE_PAT_MOD:
9650 if (has_charset_modifier) {
9651 goto excess_modifier;
9653 else if (flagsp == &negflags) {
9656 cs = REGEX_LOCALE_CHARSET;
9657 has_charset_modifier = LOCALE_PAT_MOD;
9659 case UNICODE_PAT_MOD:
9660 if (has_charset_modifier) {
9661 goto excess_modifier;
9663 else if (flagsp == &negflags) {
9666 cs = REGEX_UNICODE_CHARSET;
9667 has_charset_modifier = UNICODE_PAT_MOD;
9669 case ASCII_RESTRICT_PAT_MOD:
9670 if (flagsp == &negflags) {
9673 if (has_charset_modifier) {
9674 if (cs != REGEX_ASCII_RESTRICTED_CHARSET) {
9675 goto excess_modifier;
9677 /* Doubled modifier implies more restricted */
9678 cs = REGEX_ASCII_MORE_RESTRICTED_CHARSET;
9681 cs = REGEX_ASCII_RESTRICTED_CHARSET;
9683 has_charset_modifier = ASCII_RESTRICT_PAT_MOD;
9685 case DEPENDS_PAT_MOD:
9686 if (has_use_defaults) {
9687 goto fail_modifiers;
9689 else if (flagsp == &negflags) {
9692 else if (has_charset_modifier) {
9693 goto excess_modifier;
9696 /* The dual charset means unicode semantics if the
9697 * pattern (or target, not known until runtime) are
9698 * utf8, or something in the pattern indicates unicode
9700 cs = (RExC_utf8 || RExC_uni_semantics)
9701 ? REGEX_UNICODE_CHARSET
9702 : REGEX_DEPENDS_CHARSET;
9703 has_charset_modifier = DEPENDS_PAT_MOD;
9707 if (has_charset_modifier == ASCII_RESTRICT_PAT_MOD) {
9708 vFAIL2("Regexp modifier \"%c\" may appear a maximum of twice", ASCII_RESTRICT_PAT_MOD);
9710 else if (has_charset_modifier == *(RExC_parse - 1)) {
9711 vFAIL2("Regexp modifier \"%c\" may not appear twice",
9715 vFAIL3("Regexp modifiers \"%c\" and \"%c\" are mutually exclusive", has_charset_modifier, *(RExC_parse - 1));
9717 NOT_REACHED; /*NOTREACHED*/
9720 vFAIL2("Regexp modifier \"%c\" may not appear after the \"-\"",
9722 NOT_REACHED; /*NOTREACHED*/
9723 case ONCE_PAT_MOD: /* 'o' */
9724 case GLOBAL_PAT_MOD: /* 'g' */
9725 if (PASS2 && ckWARN(WARN_REGEXP)) {
9726 const I32 wflagbit = *RExC_parse == 'o'
9729 if (! (wastedflags & wflagbit) ) {
9730 wastedflags |= wflagbit;
9731 /* diag_listed_as: Useless (?-%s) - don't use /%s modifier in regex; marked by <-- HERE in m/%s/ */
9734 "Useless (%s%c) - %suse /%c modifier",
9735 flagsp == &negflags ? "?-" : "?",
9737 flagsp == &negflags ? "don't " : "",
9744 case CONTINUE_PAT_MOD: /* 'c' */
9745 if (PASS2 && ckWARN(WARN_REGEXP)) {
9746 if (! (wastedflags & WASTED_C) ) {
9747 wastedflags |= WASTED_GC;
9748 /* diag_listed_as: Useless (?-%s) - don't use /%s modifier in regex; marked by <-- HERE in m/%s/ */
9751 "Useless (%sc) - %suse /gc modifier",
9752 flagsp == &negflags ? "?-" : "?",
9753 flagsp == &negflags ? "don't " : ""
9758 case KEEPCOPY_PAT_MOD: /* 'p' */
9759 if (flagsp == &negflags) {
9761 ckWARNreg(RExC_parse + 1,"Useless use of (?-p)");
9763 *flagsp |= RXf_PMf_KEEPCOPY;
9767 /* A flag is a default iff it is following a minus, so
9768 * if there is a minus, it means will be trying to
9769 * re-specify a default which is an error */
9770 if (has_use_defaults || flagsp == &negflags) {
9771 goto fail_modifiers;
9774 wastedflags = 0; /* reset so (?g-c) warns twice */
9778 RExC_flags |= posflags;
9779 RExC_flags &= ~negflags;
9780 set_regex_charset(&RExC_flags, cs);
9781 if (RExC_flags & RXf_PMf_FOLD) {
9782 RExC_contains_i = 1;
9785 STD_PMMOD_FLAGS_PARSE_X_WARN(x_mod_count);
9791 RExC_parse += SKIP_IF_CHAR(RExC_parse);
9792 /* diag_listed_as: Sequence (?%s...) not recognized in regex; marked by <-- HERE in m/%s/ */
9793 vFAIL2utf8f("Sequence (%"UTF8f"...) not recognized",
9794 UTF8fARG(UTF, RExC_parse-seqstart, seqstart));
9795 NOT_REACHED; /*NOTREACHED*/
9801 vFAIL("Sequence (?... not terminated");
9805 - reg - regular expression, i.e. main body or parenthesized thing
9807 * Caller must absorb opening parenthesis.
9809 * Combining parenthesis handling with the base level of regular expression
9810 * is a trifle forced, but the need to tie the tails of the branches to what
9811 * follows makes it hard to avoid.
9813 #define REGTAIL(x,y,z) regtail((x),(y),(z),depth+1)
9815 #define REGTAIL_STUDY(x,y,z) regtail_study((x),(y),(z),depth+1)
9817 #define REGTAIL_STUDY(x,y,z) regtail((x),(y),(z),depth+1)
9820 /* Returns NULL, setting *flagp to TRYAGAIN at the end of (?) that only sets
9821 flags. Returns NULL, setting *flagp to RESTART_PASS1 if the sizing scan
9822 needs to be restarted, or'd with NEED_UTF8 if the pattern needs to be
9823 upgraded to UTF-8. Otherwise would only return NULL if regbranch() returns
9824 NULL, which cannot happen. */
9826 S_reg(pTHX_ RExC_state_t *pRExC_state, I32 paren, I32 *flagp,U32 depth)
9827 /* paren: Parenthesized? 0=top; 1,2=inside '(': changed to letter.
9828 * 2 is like 1, but indicates that nextchar() has been called to advance
9829 * RExC_parse beyond the '('. Things like '(?' are indivisible tokens, and
9830 * this flag alerts us to the need to check for that */
9832 regnode *ret; /* Will be the head of the group. */
9835 regnode *ender = NULL;
9838 U32 oregflags = RExC_flags;
9839 bool have_branch = 0;
9841 I32 freeze_paren = 0;
9842 I32 after_freeze = 0;
9843 I32 num; /* numeric backreferences */
9845 char * parse_start = RExC_parse; /* MJD */
9846 char * const oregcomp_parse = RExC_parse;
9848 GET_RE_DEBUG_FLAGS_DECL;
9850 PERL_ARGS_ASSERT_REG;
9851 DEBUG_PARSE("reg ");
9853 *flagp = 0; /* Tentatively. */
9856 /* Make an OPEN node, if parenthesized. */
9859 /* Under /x, space and comments can be gobbled up between the '(' and
9860 * here (if paren ==2). The forms '(*VERB' and '(?...' disallow such
9861 * intervening space, as the sequence is a token, and a token should be
9863 bool has_intervening_patws = paren == 2 && *(RExC_parse - 1) != '(';
9865 if ( *RExC_parse == '*') { /* (*VERB:ARG) */
9866 char *start_verb = RExC_parse;
9867 STRLEN verb_len = 0;
9868 char *start_arg = NULL;
9869 unsigned char op = 0;
9870 int arg_required = 0;
9871 int internal_argval = -1; /* if >-1 we are not allowed an argument*/
9873 if (has_intervening_patws) {
9875 vFAIL("In '(*VERB...)', the '(' and '*' must be adjacent");
9877 while ( *RExC_parse && *RExC_parse != ')' ) {
9878 if ( *RExC_parse == ':' ) {
9879 start_arg = RExC_parse + 1;
9885 verb_len = RExC_parse - start_verb;
9888 while ( *RExC_parse && *RExC_parse != ')' )
9890 if ( *RExC_parse != ')' )
9891 vFAIL("Unterminated verb pattern argument");
9892 if ( RExC_parse == start_arg )
9895 if ( *RExC_parse != ')' )
9896 vFAIL("Unterminated verb pattern");
9899 switch ( *start_verb ) {
9900 case 'A': /* (*ACCEPT) */
9901 if ( memEQs(start_verb,verb_len,"ACCEPT") ) {
9903 internal_argval = RExC_nestroot;
9906 case 'C': /* (*COMMIT) */
9907 if ( memEQs(start_verb,verb_len,"COMMIT") )
9910 case 'F': /* (*FAIL) */
9911 if ( verb_len==1 || memEQs(start_verb,verb_len,"FAIL") ) {
9915 case ':': /* (*:NAME) */
9916 case 'M': /* (*MARK:NAME) */
9917 if ( verb_len==0 || memEQs(start_verb,verb_len,"MARK") ) {
9922 case 'P': /* (*PRUNE) */
9923 if ( memEQs(start_verb,verb_len,"PRUNE") )
9926 case 'S': /* (*SKIP) */
9927 if ( memEQs(start_verb,verb_len,"SKIP") )
9930 case 'T': /* (*THEN) */
9931 /* [19:06] <TimToady> :: is then */
9932 if ( memEQs(start_verb,verb_len,"THEN") ) {
9934 RExC_seen |= REG_CUTGROUP_SEEN;
9939 RExC_parse += UTF ? UTF8SKIP(RExC_parse) : 1;
9941 "Unknown verb pattern '%"UTF8f"'",
9942 UTF8fARG(UTF, verb_len, start_verb));
9944 if ( arg_required && !start_arg ) {
9945 vFAIL3("Verb pattern '%.*s' has a mandatory argument",
9946 verb_len, start_verb);
9948 if (internal_argval == -1) {
9949 ret = reganode(pRExC_state, op, 0);
9951 ret = reg2Lanode(pRExC_state, op, 0, internal_argval);
9953 RExC_seen |= REG_VERBARG_SEEN;
9954 if ( ! SIZE_ONLY ) {
9956 SV *sv = newSVpvn( start_arg,
9957 RExC_parse - start_arg);
9958 ARG(ret) = add_data( pRExC_state,
9960 RExC_rxi->data->data[ARG(ret)]=(void*)sv;
9965 if ( internal_argval != -1 )
9966 ARG2L_SET(ret, internal_argval);
9968 nextchar(pRExC_state);
9971 else if (*RExC_parse == '?') { /* (?...) */
9972 bool is_logical = 0;
9973 const char * const seqstart = RExC_parse;
9974 const char * endptr;
9975 if (has_intervening_patws) {
9977 vFAIL("In '(?...)', the '(' and '?' must be adjacent");
9981 paren = *RExC_parse++;
9982 ret = NULL; /* For look-ahead/behind. */
9985 case 'P': /* (?P...) variants for those used to PCRE/Python */
9986 paren = *RExC_parse++;
9987 if ( paren == '<') /* (?P<...>) named capture */
9989 else if (paren == '>') { /* (?P>name) named recursion */
9990 goto named_recursion;
9992 else if (paren == '=') { /* (?P=...) named backref */
9993 /* this pretty much dupes the code for \k<NAME> in
9994 * regatom(), if you change this make sure you change that
9996 char* name_start = RExC_parse;
9998 SV *sv_dat = reg_scan_name(pRExC_state,
9999 SIZE_ONLY ? REG_RSN_RETURN_NULL : REG_RSN_RETURN_DATA);
10000 if (RExC_parse == name_start || *RExC_parse != ')')
10001 /* diag_listed_as: Sequence ?P=... not terminated in regex; marked by <-- HERE in m/%s/ */
10002 vFAIL2("Sequence %.3s... not terminated",parse_start);
10005 num = add_data( pRExC_state, STR_WITH_LEN("S"));
10006 RExC_rxi->data->data[num]=(void*)sv_dat;
10007 SvREFCNT_inc_simple_void(sv_dat);
10010 ret = reganode(pRExC_state,
10013 : (ASCII_FOLD_RESTRICTED)
10015 : (AT_LEAST_UNI_SEMANTICS)
10021 *flagp |= HASWIDTH;
10023 Set_Node_Offset(ret, parse_start+1);
10024 Set_Node_Cur_Length(ret, parse_start);
10026 nextchar(pRExC_state);
10030 RExC_parse += SKIP_IF_CHAR(RExC_parse);
10031 /* diag_listed_as: Sequence (?%s...) not recognized in regex; marked by <-- HERE in m/%s/ */
10032 vFAIL3("Sequence (%.*s...) not recognized",
10033 RExC_parse-seqstart, seqstart);
10034 NOT_REACHED; /*NOTREACHED*/
10035 case '<': /* (?<...) */
10036 if (*RExC_parse == '!')
10038 else if (*RExC_parse != '=')
10044 case '\'': /* (?'...') */
10045 name_start= RExC_parse;
10046 svname = reg_scan_name(pRExC_state,
10047 SIZE_ONLY /* reverse test from the others */
10048 ? REG_RSN_RETURN_NAME
10049 : REG_RSN_RETURN_NULL);
10050 if (RExC_parse == name_start || *RExC_parse != paren)
10051 vFAIL2("Sequence (?%c... not terminated",
10052 paren=='>' ? '<' : paren);
10056 if (!svname) /* shouldn't happen */
10058 "panic: reg_scan_name returned NULL");
10059 if (!RExC_paren_names) {
10060 RExC_paren_names= newHV();
10061 sv_2mortal(MUTABLE_SV(RExC_paren_names));
10063 RExC_paren_name_list= newAV();
10064 sv_2mortal(MUTABLE_SV(RExC_paren_name_list));
10067 he_str = hv_fetch_ent( RExC_paren_names, svname, 1, 0 );
10069 sv_dat = HeVAL(he_str);
10071 /* croak baby croak */
10073 "panic: paren_name hash element allocation failed");
10074 } else if ( SvPOK(sv_dat) ) {
10075 /* (?|...) can mean we have dupes so scan to check
10076 its already been stored. Maybe a flag indicating
10077 we are inside such a construct would be useful,
10078 but the arrays are likely to be quite small, so
10079 for now we punt -- dmq */
10080 IV count = SvIV(sv_dat);
10081 I32 *pv = (I32*)SvPVX(sv_dat);
10083 for ( i = 0 ; i < count ; i++ ) {
10084 if ( pv[i] == RExC_npar ) {
10090 pv = (I32*)SvGROW(sv_dat,
10091 SvCUR(sv_dat) + sizeof(I32)+1);
10092 SvCUR_set(sv_dat, SvCUR(sv_dat) + sizeof(I32));
10093 pv[count] = RExC_npar;
10094 SvIV_set(sv_dat, SvIVX(sv_dat) + 1);
10097 (void)SvUPGRADE(sv_dat,SVt_PVNV);
10098 sv_setpvn(sv_dat, (char *)&(RExC_npar),
10101 SvIV_set(sv_dat, 1);
10104 /* Yes this does cause a memory leak in debugging Perls
10106 if (!av_store(RExC_paren_name_list,
10107 RExC_npar, SvREFCNT_inc(svname)))
10108 SvREFCNT_dec_NN(svname);
10111 /*sv_dump(sv_dat);*/
10113 nextchar(pRExC_state);
10115 goto capturing_parens;
10117 RExC_seen |= REG_LOOKBEHIND_SEEN;
10118 RExC_in_lookbehind++;
10121 case '=': /* (?=...) */
10122 RExC_seen_zerolen++;
10124 case '!': /* (?!...) */
10125 RExC_seen_zerolen++;
10126 /* check if we're really just a "FAIL" assertion */
10127 skip_to_be_ignored_text(pRExC_state, &RExC_parse,
10128 FALSE /* Don't force to /x */ );
10129 if (*RExC_parse == ')') {
10130 ret=reganode(pRExC_state, OPFAIL, 0);
10131 nextchar(pRExC_state);
10135 case '|': /* (?|...) */
10136 /* branch reset, behave like a (?:...) except that
10137 buffers in alternations share the same numbers */
10139 after_freeze = freeze_paren = RExC_npar;
10141 case ':': /* (?:...) */
10142 case '>': /* (?>...) */
10144 case '$': /* (?$...) */
10145 case '@': /* (?@...) */
10146 vFAIL2("Sequence (?%c...) not implemented", (int)paren);
10148 case '0' : /* (?0) */
10149 case 'R' : /* (?R) */
10150 if (*RExC_parse != ')')
10151 FAIL("Sequence (?R) not terminated");
10152 ret = reg_node(pRExC_state, GOSTART);
10153 RExC_seen |= REG_GOSTART_SEEN;
10154 *flagp |= POSTPONED;
10155 nextchar(pRExC_state);
10158 /* named and numeric backreferences */
10159 case '&': /* (?&NAME) */
10160 parse_start = RExC_parse - 1;
10163 SV *sv_dat = reg_scan_name(pRExC_state,
10164 SIZE_ONLY ? REG_RSN_RETURN_NULL : REG_RSN_RETURN_DATA);
10165 num = sv_dat ? *((I32 *)SvPVX(sv_dat)) : 0;
10167 if (RExC_parse == RExC_end || *RExC_parse != ')')
10168 vFAIL("Sequence (?&... not terminated");
10169 goto gen_recurse_regop;
10172 if (!(RExC_parse[0] >= '1' && RExC_parse[0] <= '9')) {
10174 vFAIL("Illegal pattern");
10176 goto parse_recursion;
10178 case '-': /* (?-1) */
10179 if (!(RExC_parse[0] >= '1' && RExC_parse[0] <= '9')) {
10180 RExC_parse--; /* rewind to let it be handled later */
10184 case '1': case '2': case '3': case '4': /* (?1) */
10185 case '5': case '6': case '7': case '8': case '9':
10189 bool is_neg = FALSE;
10191 parse_start = RExC_parse - 1; /* MJD */
10192 if (*RExC_parse == '-') {
10196 if (grok_atoUV(RExC_parse, &unum, &endptr)
10200 RExC_parse = (char*)endptr;
10204 /* Some limit for num? */
10208 if (*RExC_parse!=')')
10209 vFAIL("Expecting close bracket");
10212 if ( paren == '-' ) {
10214 Diagram of capture buffer numbering.
10215 Top line is the normal capture buffer numbers
10216 Bottom line is the negative indexing as from
10220 /(a(x)y)(a(b(c(?-2)d)e)f)(g(h))/
10224 num = RExC_npar + num;
10227 vFAIL("Reference to nonexistent group");
10229 } else if ( paren == '+' ) {
10230 num = RExC_npar + num - 1;
10233 ret = reg2Lanode(pRExC_state, GOSUB, num, RExC_recurse_count);
10235 if (num > (I32)RExC_rx->nparens) {
10237 vFAIL("Reference to nonexistent group");
10239 RExC_recurse_count++;
10240 DEBUG_OPTIMISE_MORE_r(PerlIO_printf(Perl_debug_log,
10241 "%*s%*s Recurse #%"UVuf" to %"IVdf"\n",
10242 22, "| |", (int)(depth * 2 + 1), "",
10243 (UV)ARG(ret), (IV)ARG2L(ret)));
10245 RExC_seen |= REG_RECURSE_SEEN;
10246 Set_Node_Length(ret, 1 + regarglen[OP(ret)]); /* MJD */
10247 Set_Node_Offset(ret, parse_start); /* MJD */
10249 *flagp |= POSTPONED;
10250 nextchar(pRExC_state);
10255 case '?': /* (??...) */
10257 if (*RExC_parse != '{') {
10258 RExC_parse += SKIP_IF_CHAR(RExC_parse);
10259 /* diag_listed_as: Sequence (?%s...) not recognized in regex; marked by <-- HERE in m/%s/ */
10261 "Sequence (%"UTF8f"...) not recognized",
10262 UTF8fARG(UTF, RExC_parse-seqstart, seqstart));
10263 NOT_REACHED; /*NOTREACHED*/
10265 *flagp |= POSTPONED;
10266 paren = *RExC_parse++;
10268 case '{': /* (?{...}) */
10271 struct reg_code_block *cb;
10273 RExC_seen_zerolen++;
10275 if ( !pRExC_state->num_code_blocks
10276 || pRExC_state->code_index >= pRExC_state->num_code_blocks
10277 || pRExC_state->code_blocks[pRExC_state->code_index].start
10278 != (STRLEN)((RExC_parse -3 - (is_logical ? 1 : 0))
10281 if (RExC_pm_flags & PMf_USE_RE_EVAL)
10282 FAIL("panic: Sequence (?{...}): no code block found\n");
10283 FAIL("Eval-group not allowed at runtime, use re 'eval'");
10285 /* this is a pre-compiled code block (?{...}) */
10286 cb = &pRExC_state->code_blocks[pRExC_state->code_index];
10287 RExC_parse = RExC_start + cb->end;
10290 if (cb->src_regex) {
10291 n = add_data(pRExC_state, STR_WITH_LEN("rl"));
10292 RExC_rxi->data->data[n] =
10293 (void*)SvREFCNT_inc((SV*)cb->src_regex);
10294 RExC_rxi->data->data[n+1] = (void*)o;
10297 n = add_data(pRExC_state,
10298 (RExC_pm_flags & PMf_HAS_CV) ? "L" : "l", 1);
10299 RExC_rxi->data->data[n] = (void*)o;
10302 pRExC_state->code_index++;
10303 nextchar(pRExC_state);
10307 ret = reg_node(pRExC_state, LOGICAL);
10309 eval = reg2Lanode(pRExC_state, EVAL,
10312 /* for later propagation into (??{})
10314 RExC_flags & RXf_PMf_COMPILETIME
10319 REGTAIL(pRExC_state, ret, eval);
10320 /* deal with the length of this later - MJD */
10323 ret = reg2Lanode(pRExC_state, EVAL, n, 0);
10324 Set_Node_Length(ret, RExC_parse - parse_start + 1);
10325 Set_Node_Offset(ret, parse_start);
10328 case '(': /* (?(?{...})...) and (?(?=...)...) */
10331 const int DEFINE_len = sizeof("DEFINE") - 1;
10332 if (RExC_parse[0] == '?') { /* (?(?...)) */
10334 RExC_parse[1] == '=' ||
10335 RExC_parse[1] == '!' ||
10336 RExC_parse[1] == '<' ||
10337 RExC_parse[1] == '{'
10338 ) { /* Lookahead or eval. */
10342 ret = reg_node(pRExC_state, LOGICAL);
10346 tail = reg(pRExC_state, 1, &flag, depth+1);
10347 if (flag & (RESTART_PASS1|NEED_UTF8)) {
10348 *flagp = flag & (RESTART_PASS1|NEED_UTF8);
10351 REGTAIL(pRExC_state, ret, tail);
10354 /* Fall through to ‘Unknown switch condition’ at the
10355 end of the if/else chain. */
10357 else if ( RExC_parse[0] == '<' /* (?(<NAME>)...) */
10358 || RExC_parse[0] == '\'' ) /* (?('NAME')...) */
10360 char ch = RExC_parse[0] == '<' ? '>' : '\'';
10361 char *name_start= RExC_parse++;
10363 SV *sv_dat=reg_scan_name(pRExC_state,
10364 SIZE_ONLY ? REG_RSN_RETURN_NULL : REG_RSN_RETURN_DATA);
10365 if (RExC_parse == name_start || *RExC_parse != ch)
10366 vFAIL2("Sequence (?(%c... not terminated",
10367 (ch == '>' ? '<' : ch));
10370 num = add_data( pRExC_state, STR_WITH_LEN("S"));
10371 RExC_rxi->data->data[num]=(void*)sv_dat;
10372 SvREFCNT_inc_simple_void(sv_dat);
10374 ret = reganode(pRExC_state,NGROUPP,num);
10375 goto insert_if_check_paren;
10377 else if (RExC_end - RExC_parse >= DEFINE_len
10378 && strnEQ(RExC_parse, "DEFINE", DEFINE_len))
10380 ret = reganode(pRExC_state,DEFINEP,0);
10381 RExC_parse += DEFINE_len;
10383 goto insert_if_check_paren;
10385 else if (RExC_parse[0] == 'R') {
10388 if (RExC_parse[0] >= '1' && RExC_parse[0] <= '9' ) {
10390 if (grok_atoUV(RExC_parse, &uv, &endptr)
10394 RExC_parse = (char*)endptr;
10396 /* else "Switch condition not recognized" below */
10397 } else if (RExC_parse[0] == '&') {
10400 sv_dat = reg_scan_name(pRExC_state,
10402 ? REG_RSN_RETURN_NULL
10403 : REG_RSN_RETURN_DATA);
10404 parno = sv_dat ? *((I32 *)SvPVX(sv_dat)) : 0;
10406 ret = reganode(pRExC_state,INSUBP,parno);
10407 goto insert_if_check_paren;
10409 else if (RExC_parse[0] >= '1' && RExC_parse[0] <= '9' ) {
10413 if (grok_atoUV(RExC_parse, &uv, &endptr)
10417 RExC_parse = (char*)endptr;
10420 vFAIL("panic: grok_atoUV returned FALSE");
10422 ret = reganode(pRExC_state, GROUPP, parno);
10424 insert_if_check_paren:
10425 if (UCHARAT(RExC_parse) != ')') {
10426 RExC_parse += UTF ? UTF8SKIP(RExC_parse) : 1;
10427 vFAIL("Switch condition not recognized");
10429 nextchar(pRExC_state);
10431 REGTAIL(pRExC_state, ret, reganode(pRExC_state, IFTHEN, 0));
10432 br = regbranch(pRExC_state, &flags, 1,depth+1);
10434 if (flags & (RESTART_PASS1|NEED_UTF8)) {
10435 *flagp = flags & (RESTART_PASS1|NEED_UTF8);
10438 FAIL2("panic: regbranch returned NULL, flags=%#"UVxf"",
10441 REGTAIL(pRExC_state, br, reganode(pRExC_state,
10443 c = UCHARAT(RExC_parse);
10444 nextchar(pRExC_state);
10445 if (flags&HASWIDTH)
10446 *flagp |= HASWIDTH;
10449 vFAIL("(?(DEFINE)....) does not allow branches");
10451 /* Fake one for optimizer. */
10452 lastbr = reganode(pRExC_state, IFTHEN, 0);
10454 if (!regbranch(pRExC_state, &flags, 1,depth+1)) {
10455 if (flags & (RESTART_PASS1|NEED_UTF8)) {
10456 *flagp = flags & (RESTART_PASS1|NEED_UTF8);
10459 FAIL2("panic: regbranch returned NULL, flags=%#"UVxf"",
10462 REGTAIL(pRExC_state, ret, lastbr);
10463 if (flags&HASWIDTH)
10464 *flagp |= HASWIDTH;
10465 c = UCHARAT(RExC_parse);
10466 nextchar(pRExC_state);
10471 if (RExC_parse>RExC_end)
10472 vFAIL("Switch (?(condition)... not terminated");
10474 vFAIL("Switch (?(condition)... contains too many branches");
10476 ender = reg_node(pRExC_state, TAIL);
10477 REGTAIL(pRExC_state, br, ender);
10479 REGTAIL(pRExC_state, lastbr, ender);
10480 REGTAIL(pRExC_state, NEXTOPER(NEXTOPER(lastbr)), ender);
10483 REGTAIL(pRExC_state, ret, ender);
10484 RExC_size++; /* XXX WHY do we need this?!!
10485 For large programs it seems to be required
10486 but I can't figure out why. -- dmq*/
10489 RExC_parse += UTF ? UTF8SKIP(RExC_parse) : 1;
10490 vFAIL("Unknown switch condition (?(...))");
10492 case '[': /* (?[ ... ]) */
10493 return handle_regex_sets(pRExC_state, NULL, flagp, depth,
10496 RExC_parse--; /* for vFAIL to print correctly */
10497 vFAIL("Sequence (? incomplete");
10499 default: /* e.g., (?i) */
10502 parse_lparen_question_flags(pRExC_state);
10503 if (UCHARAT(RExC_parse) != ':') {
10505 nextchar(pRExC_state);
10510 nextchar(pRExC_state);
10515 else if (!(RExC_flags & RXf_PMf_NOCAPTURE)) { /* (...) */
10520 ret = reganode(pRExC_state, OPEN, parno);
10522 if (!RExC_nestroot)
10523 RExC_nestroot = parno;
10524 if (RExC_seen & REG_RECURSE_SEEN
10525 && !RExC_open_parens[parno-1])
10527 DEBUG_OPTIMISE_MORE_r(PerlIO_printf(Perl_debug_log,
10528 "%*s%*s Setting open paren #%"IVdf" to %d\n",
10529 22, "| |", (int)(depth * 2 + 1), "",
10530 (IV)parno, REG_NODE_NUM(ret)));
10531 RExC_open_parens[parno-1]= ret;
10534 Set_Node_Length(ret, 1); /* MJD */
10535 Set_Node_Offset(ret, RExC_parse); /* MJD */
10538 /* with RXf_PMf_NOCAPTURE treat (...) as (?:...) */
10547 /* Pick up the branches, linking them together. */
10548 parse_start = RExC_parse; /* MJD */
10549 br = regbranch(pRExC_state, &flags, 1,depth+1);
10551 /* branch_len = (paren != 0); */
10554 if (flags & (RESTART_PASS1|NEED_UTF8)) {
10555 *flagp = flags & (RESTART_PASS1|NEED_UTF8);
10558 FAIL2("panic: regbranch returned NULL, flags=%#"UVxf"", (UV) flags);
10560 if (*RExC_parse == '|') {
10561 if (!SIZE_ONLY && RExC_extralen) {
10562 reginsert(pRExC_state, BRANCHJ, br, depth+1);
10565 reginsert(pRExC_state, BRANCH, br, depth+1);
10566 Set_Node_Length(br, paren != 0);
10567 Set_Node_Offset_To_R(br-RExC_emit_start, parse_start-RExC_start);
10571 RExC_extralen += 1; /* For BRANCHJ-BRANCH. */
10573 else if (paren == ':') {
10574 *flagp |= flags&SIMPLE;
10576 if (is_open) { /* Starts with OPEN. */
10577 REGTAIL(pRExC_state, ret, br); /* OPEN -> first. */
10579 else if (paren != '?') /* Not Conditional */
10581 *flagp |= flags & (SPSTART | HASWIDTH | POSTPONED);
10583 while (*RExC_parse == '|') {
10584 if (!SIZE_ONLY && RExC_extralen) {
10585 ender = reganode(pRExC_state, LONGJMP,0);
10587 /* Append to the previous. */
10588 REGTAIL(pRExC_state, NEXTOPER(NEXTOPER(lastbr)), ender);
10591 RExC_extralen += 2; /* Account for LONGJMP. */
10592 nextchar(pRExC_state);
10593 if (freeze_paren) {
10594 if (RExC_npar > after_freeze)
10595 after_freeze = RExC_npar;
10596 RExC_npar = freeze_paren;
10598 br = regbranch(pRExC_state, &flags, 0, depth+1);
10601 if (flags & (RESTART_PASS1|NEED_UTF8)) {
10602 *flagp = flags & (RESTART_PASS1|NEED_UTF8);
10605 FAIL2("panic: regbranch returned NULL, flags=%#"UVxf"", (UV) flags);
10607 REGTAIL(pRExC_state, lastbr, br); /* BRANCH -> BRANCH. */
10609 *flagp |= flags & (SPSTART | HASWIDTH | POSTPONED);
10612 if (have_branch || paren != ':') {
10613 /* Make a closing node, and hook it on the end. */
10616 ender = reg_node(pRExC_state, TAIL);
10619 ender = reganode(pRExC_state, CLOSE, parno);
10620 if (!SIZE_ONLY && RExC_seen & REG_RECURSE_SEEN) {
10621 DEBUG_OPTIMISE_MORE_r(PerlIO_printf(Perl_debug_log,
10622 "%*s%*s Setting close paren #%"IVdf" to %d\n",
10623 22, "| |", (int)(depth * 2 + 1), "", (IV)parno, REG_NODE_NUM(ender)));
10624 RExC_close_parens[parno-1]= ender;
10625 if (RExC_nestroot == parno)
10628 Set_Node_Offset(ender,RExC_parse+1); /* MJD */
10629 Set_Node_Length(ender,1); /* MJD */
10635 *flagp &= ~HASWIDTH;
10638 ender = reg_node(pRExC_state, SUCCEED);
10641 ender = reg_node(pRExC_state, END);
10643 assert(!RExC_opend); /* there can only be one! */
10644 RExC_opend = ender;
10648 DEBUG_PARSE_r(if (!SIZE_ONLY) {
10649 DEBUG_PARSE_MSG("lsbr");
10650 regprop(RExC_rx, RExC_mysv1, lastbr, NULL, pRExC_state);
10651 regprop(RExC_rx, RExC_mysv2, ender, NULL, pRExC_state);
10652 PerlIO_printf(Perl_debug_log, "~ tying lastbr %s (%"IVdf") to ender %s (%"IVdf") offset %"IVdf"\n",
10653 SvPV_nolen_const(RExC_mysv1),
10654 (IV)REG_NODE_NUM(lastbr),
10655 SvPV_nolen_const(RExC_mysv2),
10656 (IV)REG_NODE_NUM(ender),
10657 (IV)(ender - lastbr)
10660 REGTAIL(pRExC_state, lastbr, ender);
10662 if (have_branch && !SIZE_ONLY) {
10663 char is_nothing= 1;
10665 RExC_seen |= REG_TOP_LEVEL_BRANCHES_SEEN;
10667 /* Hook the tails of the branches to the closing node. */
10668 for (br = ret; br; br = regnext(br)) {
10669 const U8 op = PL_regkind[OP(br)];
10670 if (op == BRANCH) {
10671 REGTAIL_STUDY(pRExC_state, NEXTOPER(br), ender);
10672 if ( OP(NEXTOPER(br)) != NOTHING
10673 || regnext(NEXTOPER(br)) != ender)
10676 else if (op == BRANCHJ) {
10677 REGTAIL_STUDY(pRExC_state, NEXTOPER(NEXTOPER(br)), ender);
10678 /* for now we always disable this optimisation * /
10679 if ( OP(NEXTOPER(NEXTOPER(br))) != NOTHING
10680 || regnext(NEXTOPER(NEXTOPER(br))) != ender)
10686 br= PL_regkind[OP(ret)] != BRANCH ? regnext(ret) : ret;
10687 DEBUG_PARSE_r(if (!SIZE_ONLY) {
10688 DEBUG_PARSE_MSG("NADA");
10689 regprop(RExC_rx, RExC_mysv1, ret, NULL, pRExC_state);
10690 regprop(RExC_rx, RExC_mysv2, ender, NULL, pRExC_state);
10691 PerlIO_printf(Perl_debug_log, "~ converting ret %s (%"IVdf") to ender %s (%"IVdf") offset %"IVdf"\n",
10692 SvPV_nolen_const(RExC_mysv1),
10693 (IV)REG_NODE_NUM(ret),
10694 SvPV_nolen_const(RExC_mysv2),
10695 (IV)REG_NODE_NUM(ender),
10700 if (OP(ender) == TAIL) {
10705 for ( opt= br + 1; opt < ender ; opt++ )
10706 OP(opt)= OPTIMIZED;
10707 NEXT_OFF(br)= ender - br;
10715 static const char parens[] = "=!<,>";
10717 if (paren && (p = strchr(parens, paren))) {
10718 U8 node = ((p - parens) % 2) ? UNLESSM : IFMATCH;
10719 int flag = (p - parens) > 1;
10722 node = SUSPEND, flag = 0;
10723 reginsert(pRExC_state, node,ret, depth+1);
10724 Set_Node_Cur_Length(ret, parse_start);
10725 Set_Node_Offset(ret, parse_start + 1);
10727 REGTAIL_STUDY(pRExC_state, ret, reg_node(pRExC_state, TAIL));
10731 /* Check for proper termination. */
10733 /* restore original flags, but keep (?p) and, if we've changed from /d
10734 * rules to /u, keep the /u */
10735 RExC_flags = oregflags | (RExC_flags & RXf_PMf_KEEPCOPY);
10736 if (DEPENDS_SEMANTICS && RExC_uni_semantics) {
10737 set_regex_charset(&RExC_flags, REGEX_UNICODE_CHARSET);
10739 if (RExC_parse >= RExC_end || UCHARAT(RExC_parse) != ')') {
10740 RExC_parse = oregcomp_parse;
10741 vFAIL("Unmatched (");
10743 nextchar(pRExC_state);
10745 else if (!paren && RExC_parse < RExC_end) {
10746 if (*RExC_parse == ')') {
10748 vFAIL("Unmatched )");
10751 FAIL("Junk on end of regexp"); /* "Can't happen". */
10752 NOT_REACHED; /* NOTREACHED */
10755 if (RExC_in_lookbehind) {
10756 RExC_in_lookbehind--;
10758 if (after_freeze > RExC_npar)
10759 RExC_npar = after_freeze;
10764 - regbranch - one alternative of an | operator
10766 * Implements the concatenation operator.
10768 * Returns NULL, setting *flagp to RESTART_PASS1 if the sizing scan needs to be
10769 * restarted, or'd with NEED_UTF8 if the pattern needs to be upgraded to UTF-8
10772 S_regbranch(pTHX_ RExC_state_t *pRExC_state, I32 *flagp, I32 first, U32 depth)
10775 regnode *chain = NULL;
10777 I32 flags = 0, c = 0;
10778 GET_RE_DEBUG_FLAGS_DECL;
10780 PERL_ARGS_ASSERT_REGBRANCH;
10782 DEBUG_PARSE("brnc");
10787 if (!SIZE_ONLY && RExC_extralen)
10788 ret = reganode(pRExC_state, BRANCHJ,0);
10790 ret = reg_node(pRExC_state, BRANCH);
10791 Set_Node_Length(ret, 1);
10795 if (!first && SIZE_ONLY)
10796 RExC_extralen += 1; /* BRANCHJ */
10798 *flagp = WORST; /* Tentatively. */
10800 skip_to_be_ignored_text(pRExC_state, &RExC_parse,
10801 FALSE /* Don't force to /x */ );
10802 while (RExC_parse < RExC_end && *RExC_parse != '|' && *RExC_parse != ')') {
10803 flags &= ~TRYAGAIN;
10804 latest = regpiece(pRExC_state, &flags,depth+1);
10805 if (latest == NULL) {
10806 if (flags & TRYAGAIN)
10808 if (flags & (RESTART_PASS1|NEED_UTF8)) {
10809 *flagp = flags & (RESTART_PASS1|NEED_UTF8);
10812 FAIL2("panic: regpiece returned NULL, flags=%#"UVxf"", (UV) flags);
10814 else if (ret == NULL)
10816 *flagp |= flags&(HASWIDTH|POSTPONED);
10817 if (chain == NULL) /* First piece. */
10818 *flagp |= flags&SPSTART;
10820 /* FIXME adding one for every branch after the first is probably
10821 * excessive now we have TRIE support. (hv) */
10823 REGTAIL(pRExC_state, chain, latest);
10828 if (chain == NULL) { /* Loop ran zero times. */
10829 chain = reg_node(pRExC_state, NOTHING);
10834 *flagp |= flags&SIMPLE;
10841 - regpiece - something followed by possible [*+?]
10843 * Note that the branching code sequences used for ? and the general cases
10844 * of * and + are somewhat optimized: they use the same NOTHING node as
10845 * both the endmarker for their branch list and the body of the last branch.
10846 * It might seem that this node could be dispensed with entirely, but the
10847 * endmarker role is not redundant.
10849 * Returns NULL, setting *flagp to TRYAGAIN if regatom() returns NULL with
10851 * Returns NULL, setting *flagp to RESTART_PASS1 if the sizing scan needs to be
10852 * restarted, or'd with NEED_UTF8 if the pattern needs to be upgraded to UTF-8
10855 S_regpiece(pTHX_ RExC_state_t *pRExC_state, I32 *flagp, U32 depth)
10861 const char * const origparse = RExC_parse;
10863 I32 max = REG_INFTY;
10864 #ifdef RE_TRACK_PATTERN_OFFSETS
10867 const char *maxpos = NULL;
10870 /* Save the original in case we change the emitted regop to a FAIL. */
10871 regnode * const orig_emit = RExC_emit;
10873 GET_RE_DEBUG_FLAGS_DECL;
10875 PERL_ARGS_ASSERT_REGPIECE;
10877 DEBUG_PARSE("piec");
10879 ret = regatom(pRExC_state, &flags,depth+1);
10881 if (flags & (TRYAGAIN|RESTART_PASS1|NEED_UTF8))
10882 *flagp |= flags & (TRYAGAIN|RESTART_PASS1|NEED_UTF8);
10884 FAIL2("panic: regatom returned NULL, flags=%#"UVxf"", (UV) flags);
10890 if (op == '{' && regcurly(RExC_parse)) {
10892 #ifdef RE_TRACK_PATTERN_OFFSETS
10893 parse_start = RExC_parse; /* MJD */
10895 next = RExC_parse + 1;
10896 while (isDIGIT(*next) || *next == ',') {
10897 if (*next == ',') {
10905 if (*next == '}') { /* got one */
10906 const char* endptr;
10910 if (isDIGIT(*RExC_parse)) {
10911 if (!grok_atoUV(RExC_parse, &uv, &endptr))
10912 vFAIL("Invalid quantifier in {,}");
10913 if (uv >= REG_INFTY)
10914 vFAIL2("Quantifier in {,} bigger than %d", REG_INFTY - 1);
10919 if (*maxpos == ',')
10922 maxpos = RExC_parse;
10923 if (isDIGIT(*maxpos)) {
10924 if (!grok_atoUV(maxpos, &uv, &endptr))
10925 vFAIL("Invalid quantifier in {,}");
10926 if (uv >= REG_INFTY)
10927 vFAIL2("Quantifier in {,} bigger than %d", REG_INFTY - 1);
10930 max = REG_INFTY; /* meaning "infinity" */
10933 nextchar(pRExC_state);
10934 if (max < min) { /* If can't match, warn and optimize to fail
10938 /* We can't back off the size because we have to reserve
10939 * enough space for all the things we are about to throw
10940 * away, but we can shrink it by the ammount we are about
10941 * to re-use here */
10942 RExC_size += PREVOPER(RExC_size) - regarglen[(U8)OPFAIL];
10945 ckWARNreg(RExC_parse, "Quantifier {n,m} with n > m can't match");
10946 RExC_emit = orig_emit;
10948 ret = reganode(pRExC_state, OPFAIL, 0);
10951 else if (min == max && RExC_parse < RExC_end && *RExC_parse == '?')
10954 ckWARN2reg(RExC_parse + 1,
10955 "Useless use of greediness modifier '%c'",
10961 if ((flags&SIMPLE)) {
10962 if (min == 0 && max == REG_INFTY) {
10963 reginsert(pRExC_state, STAR, ret, depth+1);
10966 RExC_seen |= REG_UNBOUNDED_QUANTIFIER_SEEN;
10969 if (min == 1 && max == REG_INFTY) {
10970 reginsert(pRExC_state, PLUS, ret, depth+1);
10973 RExC_seen |= REG_UNBOUNDED_QUANTIFIER_SEEN;
10976 MARK_NAUGHTY_EXP(2, 2);
10977 reginsert(pRExC_state, CURLY, ret, depth+1);
10978 Set_Node_Offset(ret, parse_start+1); /* MJD */
10979 Set_Node_Cur_Length(ret, parse_start);
10982 regnode * const w = reg_node(pRExC_state, WHILEM);
10985 REGTAIL(pRExC_state, ret, w);
10986 if (!SIZE_ONLY && RExC_extralen) {
10987 reginsert(pRExC_state, LONGJMP,ret, depth+1);
10988 reginsert(pRExC_state, NOTHING,ret, depth+1);
10989 NEXT_OFF(ret) = 3; /* Go over LONGJMP. */
10991 reginsert(pRExC_state, CURLYX,ret, depth+1);
10993 Set_Node_Offset(ret, parse_start+1);
10994 Set_Node_Length(ret,
10995 op == '{' ? (RExC_parse - parse_start) : 1);
10997 if (!SIZE_ONLY && RExC_extralen)
10998 NEXT_OFF(ret) = 3; /* Go over NOTHING to LONGJMP. */
10999 REGTAIL(pRExC_state, ret, reg_node(pRExC_state, NOTHING));
11001 RExC_whilem_seen++, RExC_extralen += 3;
11002 MARK_NAUGHTY_EXP(1, 4); /* compound interest */
11009 *flagp |= HASWIDTH;
11011 ARG1_SET(ret, (U16)min);
11012 ARG2_SET(ret, (U16)max);
11014 if (max == REG_INFTY)
11015 RExC_seen |= REG_UNBOUNDED_QUANTIFIER_SEEN;
11021 if (!ISMULT1(op)) {
11026 #if 0 /* Now runtime fix should be reliable. */
11028 /* if this is reinstated, don't forget to put this back into perldiag:
11030 =item Regexp *+ operand could be empty at {#} in regex m/%s/
11032 (F) The part of the regexp subject to either the * or + quantifier
11033 could match an empty string. The {#} shows in the regular
11034 expression about where the problem was discovered.
11038 if (!(flags&HASWIDTH) && op != '?')
11039 vFAIL("Regexp *+ operand could be empty");
11042 #ifdef RE_TRACK_PATTERN_OFFSETS
11043 parse_start = RExC_parse;
11045 nextchar(pRExC_state);
11047 *flagp = (op != '+') ? (WORST|SPSTART|HASWIDTH) : (WORST|HASWIDTH);
11053 else if (op == '+') {
11057 else if (op == '?') {
11062 if (!SIZE_ONLY && !(flags&(HASWIDTH|POSTPONED)) && max > REG_INFTY/3) {
11063 SAVEFREESV(RExC_rx_sv); /* in case of fatal warnings */
11064 ckWARN2reg(RExC_parse,
11065 "%"UTF8f" matches null string many times",
11066 UTF8fARG(UTF, (RExC_parse >= origparse
11067 ? RExC_parse - origparse
11070 (void)ReREFCNT_inc(RExC_rx_sv);
11073 if (RExC_parse < RExC_end && *RExC_parse == '?') {
11074 nextchar(pRExC_state);
11075 reginsert(pRExC_state, MINMOD, ret, depth+1);
11076 REGTAIL(pRExC_state, ret, ret + NODE_STEP_REGNODE);
11079 if (RExC_parse < RExC_end && *RExC_parse == '+') {
11081 nextchar(pRExC_state);
11082 ender = reg_node(pRExC_state, SUCCEED);
11083 REGTAIL(pRExC_state, ret, ender);
11084 reginsert(pRExC_state, SUSPEND, ret, depth+1);
11086 ender = reg_node(pRExC_state, TAIL);
11087 REGTAIL(pRExC_state, ret, ender);
11090 if (RExC_parse < RExC_end && ISMULT2(RExC_parse)) {
11092 vFAIL("Nested quantifiers");
11099 S_grok_bslash_N(pTHX_ RExC_state_t *pRExC_state,
11107 /* This routine teases apart the various meanings of \N and returns
11108 * accordingly. The input parameters constrain which meaning(s) is/are valid
11109 * in the current context.
11111 * Exactly one of <node_p> and <code_point_p> must be non-NULL.
11113 * If <code_point_p> is not NULL, the context is expecting the result to be a
11114 * single code point. If this \N instance turns out to a single code point,
11115 * the function returns TRUE and sets *code_point_p to that code point.
11117 * If <node_p> is not NULL, the context is expecting the result to be one of
11118 * the things representable by a regnode. If this \N instance turns out to be
11119 * one such, the function generates the regnode, returns TRUE and sets *node_p
11120 * to point to that regnode.
11122 * If this instance of \N isn't legal in any context, this function will
11123 * generate a fatal error and not return.
11125 * On input, RExC_parse should point to the first char following the \N at the
11126 * time of the call. On successful return, RExC_parse will have been updated
11127 * to point to just after the sequence identified by this routine. Also
11128 * *flagp has been updated as needed.
11130 * When there is some problem with the current context and this \N instance,
11131 * the function returns FALSE, without advancing RExC_parse, nor setting
11132 * *node_p, nor *code_point_p, nor *flagp.
11134 * If <cp_count> is not NULL, the caller wants to know the length (in code
11135 * points) that this \N sequence matches. This is set even if the function
11136 * returns FALSE, as detailed below.
11138 * There are 5 possibilities here, as detailed in the next 5 paragraphs.
11140 * Probably the most common case is for the \N to specify a single code point.
11141 * *cp_count will be set to 1, and *code_point_p will be set to that code
11144 * Another possibility is for the input to be an empty \N{}, which for
11145 * backwards compatibility we accept. *cp_count will be set to 0. *node_p
11146 * will be set to a generated NOTHING node.
11148 * Still another possibility is for the \N to mean [^\n]. *cp_count will be
11149 * set to 0. *node_p will be set to a generated REG_ANY node.
11151 * The fourth possibility is that \N resolves to a sequence of more than one
11152 * code points. *cp_count will be set to the number of code points in the
11153 * sequence. *node_p * will be set to a generated node returned by this
11154 * function calling S_reg().
11156 * The final possibility is that it is premature to be calling this function;
11157 * that pass1 needs to be restarted. This can happen when this changes from
11158 * /d to /u rules, or when the pattern needs to be upgraded to UTF-8. The
11159 * latter occurs only when the fourth possibility would otherwise be in
11160 * effect, and is because one of those code points requires the pattern to be
11161 * recompiled as UTF-8. The function returns FALSE, and sets the
11162 * RESTART_PASS1 and NEED_UTF8 flags in *flagp, as appropriate. When this
11163 * happens, the caller needs to desist from continuing parsing, and return
11164 * this information to its caller. This is not set for when there is only one
11165 * code point, as this can be called as part of an ANYOF node, and they can
11166 * store above-Latin1 code points without the pattern having to be in UTF-8.
11168 * For non-single-quoted regexes, the tokenizer has resolved character and
11169 * sequence names inside \N{...} into their Unicode values, normalizing the
11170 * result into what we should see here: '\N{U+c1.c2...}', where c1... are the
11171 * hex-represented code points in the sequence. This is done there because
11172 * the names can vary based on what charnames pragma is in scope at the time,
11173 * so we need a way to take a snapshot of what they resolve to at the time of
11174 * the original parse. [perl #56444].
11176 * That parsing is skipped for single-quoted regexes, so we may here get
11177 * '\N{NAME}'. This is a fatal error. These names have to be resolved by the
11178 * parser. But if the single-quoted regex is something like '\N{U+41}', that
11179 * is legal and handled here. The code point is Unicode, and has to be
11180 * translated into the native character set for non-ASCII platforms.
11183 char * endbrace; /* points to '}' following the name */
11184 char *endchar; /* Points to '.' or '}' ending cur char in the input
11186 char* p = RExC_parse; /* Temporary */
11188 GET_RE_DEBUG_FLAGS_DECL;
11190 PERL_ARGS_ASSERT_GROK_BSLASH_N;
11192 GET_RE_DEBUG_FLAGS;
11194 assert(cBOOL(node_p) ^ cBOOL(code_point_p)); /* Exactly one should be set */
11195 assert(! (node_p && cp_count)); /* At most 1 should be set */
11197 if (cp_count) { /* Initialize return for the most common case */
11201 /* The [^\n] meaning of \N ignores spaces and comments under the /x
11202 * modifier. The other meanings do not, so use a temporary until we find
11203 * out which we are being called with */
11204 skip_to_be_ignored_text(pRExC_state, &p,
11205 FALSE /* Don't force to /x */ );
11207 /* Disambiguate between \N meaning a named character versus \N meaning
11208 * [^\n]. The latter is assumed when the {...} following the \N is a legal
11209 * quantifier, or there is no '{' at all */
11210 if (*p != '{' || regcurly(p)) {
11220 *node_p = reg_node(pRExC_state, REG_ANY);
11221 *flagp |= HASWIDTH|SIMPLE;
11223 Set_Node_Length(*node_p, 1); /* MJD */
11227 /* Here, we have decided it should be a named character or sequence */
11229 /* The test above made sure that the next real character is a '{', but
11230 * under the /x modifier, it could be separated by space (or a comment and
11231 * \n) and this is not allowed (for consistency with \x{...} and the
11232 * tokenizer handling of \N{NAME}). */
11233 if (*RExC_parse != '{') {
11234 vFAIL("Missing braces on \\N{}");
11237 RExC_parse++; /* Skip past the '{' */
11239 if (! (endbrace = strchr(RExC_parse, '}')) /* no trailing brace */
11240 || ! (endbrace == RExC_parse /* nothing between the {} */
11241 || (endbrace - RExC_parse >= 2 /* U+ (bad hex is checked... */
11242 && strnEQ(RExC_parse, "U+", 2)))) /* ... below for a better
11245 if (endbrace) RExC_parse = endbrace; /* position msg's '<--HERE' */
11246 vFAIL("\\N{NAME} must be resolved by the lexer");
11249 REQUIRE_UNI_RULES(flagp, FALSE); /* Unicode named chars imply Unicode
11252 if (endbrace == RExC_parse) { /* empty: \N{} */
11256 nextchar(pRExC_state);
11261 *node_p = reg_node(pRExC_state,NOTHING);
11265 RExC_parse += 2; /* Skip past the 'U+' */
11267 endchar = RExC_parse + strcspn(RExC_parse, ".}");
11269 /* Code points are separated by dots. If none, there is only one code
11270 * point, and is terminated by the brace */
11272 if (endchar >= endbrace) {
11273 STRLEN length_of_hex;
11274 I32 grok_hex_flags;
11276 /* Here, exactly one code point. If that isn't what is wanted, fail */
11277 if (! code_point_p) {
11282 /* Convert code point from hex */
11283 length_of_hex = (STRLEN)(endchar - RExC_parse);
11284 grok_hex_flags = PERL_SCAN_ALLOW_UNDERSCORES
11285 | PERL_SCAN_DISALLOW_PREFIX
11287 /* No errors in the first pass (See [perl
11288 * #122671].) We let the code below find the
11289 * errors when there are multiple chars. */
11291 ? PERL_SCAN_SILENT_ILLDIGIT
11294 /* This routine is the one place where both single- and double-quotish
11295 * \N{U+xxxx} are evaluated. The value is a Unicode code point which
11296 * must be converted to native. */
11297 *code_point_p = UNI_TO_NATIVE(grok_hex(RExC_parse,
11302 /* The tokenizer should have guaranteed validity, but it's possible to
11303 * bypass it by using single quoting, so check. Don't do the check
11304 * here when there are multiple chars; we do it below anyway. */
11305 if (length_of_hex == 0
11306 || length_of_hex != (STRLEN)(endchar - RExC_parse) )
11308 RExC_parse += length_of_hex; /* Includes all the valid */
11309 RExC_parse += (RExC_orig_utf8) /* point to after 1st invalid */
11310 ? UTF8SKIP(RExC_parse)
11312 /* Guard against malformed utf8 */
11313 if (RExC_parse >= endchar) {
11314 RExC_parse = endchar;
11316 vFAIL("Invalid hexadecimal number in \\N{U+...}");
11319 RExC_parse = endbrace + 1;
11322 else { /* Is a multiple character sequence */
11323 SV * substitute_parse;
11325 char *orig_end = RExC_end;
11328 /* Count the code points, if desired, in the sequence */
11331 while (RExC_parse < endbrace) {
11332 /* Point to the beginning of the next character in the sequence. */
11333 RExC_parse = endchar + 1;
11334 endchar = RExC_parse + strcspn(RExC_parse, ".}");
11339 /* Fail if caller doesn't want to handle a multi-code-point sequence.
11340 * But don't backup up the pointer if the caller want to know how many
11341 * code points there are (they can then handle things) */
11349 /* What is done here is to convert this to a sub-pattern of the form
11350 * \x{char1}\x{char2}... and then call reg recursively to parse it
11351 * (enclosing in "(?: ... )" ). That way, it retains its atomicness,
11352 * while not having to worry about special handling that some code
11353 * points may have. */
11355 substitute_parse = newSVpvs("?:");
11357 while (RExC_parse < endbrace) {
11359 /* Convert to notation the rest of the code understands */
11360 sv_catpv(substitute_parse, "\\x{");
11361 sv_catpvn(substitute_parse, RExC_parse, endchar - RExC_parse);
11362 sv_catpv(substitute_parse, "}");
11364 /* Point to the beginning of the next character in the sequence. */
11365 RExC_parse = endchar + 1;
11366 endchar = RExC_parse + strcspn(RExC_parse, ".}");
11369 sv_catpv(substitute_parse, ")");
11371 RExC_parse = SvPV(substitute_parse, len);
11373 /* Don't allow empty number */
11374 if (len < (STRLEN) 8) {
11375 RExC_parse = endbrace;
11376 vFAIL("Invalid hexadecimal number in \\N{U+...}");
11378 RExC_end = RExC_parse + len;
11380 /* The values are Unicode, and therefore not subject to recoding, but
11381 * have to be converted to native on a non-Unicode (meaning non-ASCII)
11383 RExC_override_recoding = 1;
11385 RExC_recode_x_to_native = 1;
11389 if (!(*node_p = reg(pRExC_state, 1, &flags, depth+1))) {
11390 if (flags & (RESTART_PASS1|NEED_UTF8)) {
11391 *flagp = flags & (RESTART_PASS1|NEED_UTF8);
11394 FAIL2("panic: reg returned NULL to grok_bslash_N, flags=%#"UVxf"",
11397 *flagp |= flags&(HASWIDTH|SPSTART|SIMPLE|POSTPONED);
11400 /* Restore the saved values */
11401 RExC_parse = endbrace;
11402 RExC_end = orig_end;
11403 RExC_override_recoding = 0;
11405 RExC_recode_x_to_native = 0;
11408 SvREFCNT_dec_NN(substitute_parse);
11409 nextchar(pRExC_state);
11419 * It returns the code point in utf8 for the value in *encp.
11420 * value: a code value in the source encoding
11421 * encp: a pointer to an Encode object
11423 * If the result from Encode is not a single character,
11424 * it returns U+FFFD (Replacement character) and sets *encp to NULL.
11427 S_reg_recode(pTHX_ const U8 value, SV **encp)
11430 SV * const sv = newSVpvn_flags((const char *) &value, numlen, SVs_TEMP);
11431 const char * const s = *encp ? sv_recode_to_utf8(sv, *encp) : SvPVX(sv);
11432 const STRLEN newlen = SvCUR(sv);
11433 UV uv = UNICODE_REPLACEMENT;
11435 PERL_ARGS_ASSERT_REG_RECODE;
11439 ? utf8n_to_uvchr((U8*)s, newlen, &numlen, UTF8_ALLOW_DEFAULT)
11442 if (!newlen || numlen != newlen) {
11443 uv = UNICODE_REPLACEMENT;
11449 PERL_STATIC_INLINE U8
11450 S_compute_EXACTish(RExC_state_t *pRExC_state)
11454 PERL_ARGS_ASSERT_COMPUTE_EXACTISH;
11462 op = get_regex_charset(RExC_flags);
11463 if (op >= REGEX_ASCII_RESTRICTED_CHARSET) {
11464 op--; /* /a is same as /u, and map /aa's offset to what /a's would have
11465 been, so there is no hole */
11468 return op + EXACTF;
11471 PERL_STATIC_INLINE void
11472 S_alloc_maybe_populate_EXACT(pTHX_ RExC_state_t *pRExC_state,
11473 regnode *node, I32* flagp, STRLEN len, UV code_point,
11476 /* This knows the details about sizing an EXACTish node, setting flags for
11477 * it (by setting <*flagp>, and potentially populating it with a single
11480 * If <len> (the length in bytes) is non-zero, this function assumes that
11481 * the node has already been populated, and just does the sizing. In this
11482 * case <code_point> should be the final code point that has already been
11483 * placed into the node. This value will be ignored except that under some
11484 * circumstances <*flagp> is set based on it.
11486 * If <len> is zero, the function assumes that the node is to contain only
11487 * the single character given by <code_point> and calculates what <len>
11488 * should be. In pass 1, it sizes the node appropriately. In pass 2, it
11489 * additionally will populate the node's STRING with <code_point> or its
11492 * In both cases <*flagp> is appropriately set
11494 * It knows that under FOLD, the Latin Sharp S and UTF characters above
11495 * 255, must be folded (the former only when the rules indicate it can
11498 * When it does the populating, it looks at the flag 'downgradable'. If
11499 * true with a node that folds, it checks if the single code point
11500 * participates in a fold, and if not downgrades the node to an EXACT.
11501 * This helps the optimizer */
11503 bool len_passed_in = cBOOL(len != 0);
11504 U8 character[UTF8_MAXBYTES_CASE+1];
11506 PERL_ARGS_ASSERT_ALLOC_MAYBE_POPULATE_EXACT;
11508 /* Don't bother to check for downgrading in PASS1, as it doesn't make any
11509 * sizing difference, and is extra work that is thrown away */
11510 if (downgradable && ! PASS2) {
11511 downgradable = FALSE;
11514 if (! len_passed_in) {
11516 if (UVCHR_IS_INVARIANT(code_point)) {
11517 if (LOC || ! FOLD) { /* /l defers folding until runtime */
11518 *character = (U8) code_point;
11520 else { /* Here is /i and not /l. (toFOLD() is defined on just
11521 ASCII, which isn't the same thing as INVARIANT on
11522 EBCDIC, but it works there, as the extra invariants
11523 fold to themselves) */
11524 *character = toFOLD((U8) code_point);
11526 /* We can downgrade to an EXACT node if this character
11527 * isn't a folding one. Note that this assumes that
11528 * nothing above Latin1 folds to some other invariant than
11529 * one of these alphabetics; otherwise we would also have
11531 * && (! HAS_NONLATIN1_FOLD_CLOSURE(code_point)
11532 * || ASCII_FOLD_RESTRICTED))
11534 if (downgradable && PL_fold[code_point] == code_point) {
11540 else if (FOLD && (! LOC
11541 || ! is_PROBLEMATIC_LOCALE_FOLD_cp(code_point)))
11542 { /* Folding, and ok to do so now */
11543 UV folded = _to_uni_fold_flags(
11547 FOLD_FLAGS_FULL | ((ASCII_FOLD_RESTRICTED)
11548 ? FOLD_FLAGS_NOMIX_ASCII
11551 && folded == code_point /* This quickly rules out many
11552 cases, avoiding the
11553 _invlist_contains_cp() overhead
11555 && ! _invlist_contains_cp(PL_utf8_foldable, code_point))
11562 else if (code_point <= MAX_UTF8_TWO_BYTE) {
11564 /* Not folding this cp, and can output it directly */
11565 *character = UTF8_TWO_BYTE_HI(code_point);
11566 *(character + 1) = UTF8_TWO_BYTE_LO(code_point);
11570 uvchr_to_utf8( character, code_point);
11571 len = UTF8SKIP(character);
11573 } /* Else pattern isn't UTF8. */
11575 *character = (U8) code_point;
11577 } /* Else is folded non-UTF8 */
11578 #if UNICODE_MAJOR_VERSION > 3 /* no multifolds in early Unicode */ \
11579 || (UNICODE_MAJOR_VERSION == 3 && ( UNICODE_DOT_VERSION > 0) \
11580 || UNICODE_DOT_DOT_VERSION > 0)
11581 else if (LIKELY(code_point != LATIN_SMALL_LETTER_SHARP_S)) {
11585 /* We don't fold any non-UTF8 except possibly the Sharp s (see
11586 * comments at join_exact()); */
11587 *character = (U8) code_point;
11590 /* Can turn into an EXACT node if we know the fold at compile time,
11591 * and it folds to itself and doesn't particpate in other folds */
11594 && PL_fold_latin1[code_point] == code_point
11595 && (! HAS_NONLATIN1_FOLD_CLOSURE(code_point)
11596 || (isASCII(code_point) && ASCII_FOLD_RESTRICTED)))
11600 } /* else is Sharp s. May need to fold it */
11601 else if (AT_LEAST_UNI_SEMANTICS && ! ASCII_FOLD_RESTRICTED) {
11603 *(character + 1) = 's';
11607 *character = LATIN_SMALL_LETTER_SHARP_S;
11613 RExC_size += STR_SZ(len);
11616 RExC_emit += STR_SZ(len);
11617 STR_LEN(node) = len;
11618 if (! len_passed_in) {
11619 Copy((char *) character, STRING(node), len, char);
11623 *flagp |= HASWIDTH;
11625 /* A single character node is SIMPLE, except for the special-cased SHARP S
11627 if ((len == 1 || (UTF && len == UVCHR_SKIP(code_point)))
11628 #if UNICODE_MAJOR_VERSION > 3 /* no multifolds in early Unicode */ \
11629 || (UNICODE_MAJOR_VERSION == 3 && ( UNICODE_DOT_VERSION > 0) \
11630 || UNICODE_DOT_DOT_VERSION > 0)
11631 && ( code_point != LATIN_SMALL_LETTER_SHARP_S
11632 || ! FOLD || ! DEPENDS_SEMANTICS)
11638 /* The OP may not be well defined in PASS1 */
11639 if (PASS2 && OP(node) == EXACTFL) {
11640 RExC_contains_locale = 1;
11645 /* Parse backref decimal value, unless it's too big to sensibly be a backref,
11646 * in which case return I32_MAX (rather than possibly 32-bit wrapping) */
11649 S_backref_value(char *p)
11651 const char* endptr;
11653 if (grok_atoUV(p, &val, &endptr) && val <= I32_MAX)
11660 - regatom - the lowest level
11662 Try to identify anything special at the start of the pattern. If there
11663 is, then handle it as required. This may involve generating a single regop,
11664 such as for an assertion; or it may involve recursing, such as to
11665 handle a () structure.
11667 If the string doesn't start with something special then we gobble up
11668 as much literal text as we can.
11670 Once we have been able to handle whatever type of thing started the
11671 sequence, we return.
11673 Note: we have to be careful with escapes, as they can be both literal
11674 and special, and in the case of \10 and friends, context determines which.
11676 A summary of the code structure is:
11678 switch (first_byte) {
11679 cases for each special:
11680 handle this special;
11683 switch (2nd byte) {
11684 cases for each unambiguous special:
11685 handle this special;
11687 cases for each ambigous special/literal:
11689 if (special) handle here
11691 default: // unambiguously literal:
11694 default: // is a literal char
11697 create EXACTish node for literal;
11698 while (more input and node isn't full) {
11699 switch (input_byte) {
11700 cases for each special;
11701 make sure parse pointer is set so that the next call to
11702 regatom will see this special first
11703 goto loopdone; // EXACTish node terminated by prev. char
11705 append char to EXACTISH node;
11707 get next input byte;
11711 return the generated node;
11713 Specifically there are two separate switches for handling
11714 escape sequences, with the one for handling literal escapes requiring
11715 a dummy entry for all of the special escapes that are actually handled
11718 Returns NULL, setting *flagp to TRYAGAIN if reg() returns NULL with
11720 Returns NULL, setting *flagp to RESTART_PASS1 if the sizing scan needs to be
11721 restarted, or'd with NEED_UTF8 if the pattern needs to be upgraded to UTF-8
11722 Otherwise does not return NULL.
11726 S_regatom(pTHX_ RExC_state_t *pRExC_state, I32 *flagp, U32 depth)
11728 regnode *ret = NULL;
11735 GET_RE_DEBUG_FLAGS_DECL;
11737 *flagp = WORST; /* Tentatively. */
11739 DEBUG_PARSE("atom");
11741 PERL_ARGS_ASSERT_REGATOM;
11744 parse_start = RExC_parse;
11745 switch ((U8)*RExC_parse) {
11747 RExC_seen_zerolen++;
11748 nextchar(pRExC_state);
11749 if (RExC_flags & RXf_PMf_MULTILINE)
11750 ret = reg_node(pRExC_state, MBOL);
11752 ret = reg_node(pRExC_state, SBOL);
11753 Set_Node_Length(ret, 1); /* MJD */
11756 nextchar(pRExC_state);
11758 RExC_seen_zerolen++;
11759 if (RExC_flags & RXf_PMf_MULTILINE)
11760 ret = reg_node(pRExC_state, MEOL);
11762 ret = reg_node(pRExC_state, SEOL);
11763 Set_Node_Length(ret, 1); /* MJD */
11766 nextchar(pRExC_state);
11767 if (RExC_flags & RXf_PMf_SINGLELINE)
11768 ret = reg_node(pRExC_state, SANY);
11770 ret = reg_node(pRExC_state, REG_ANY);
11771 *flagp |= HASWIDTH|SIMPLE;
11773 Set_Node_Length(ret, 1); /* MJD */
11777 char * const oregcomp_parse = ++RExC_parse;
11778 ret = regclass(pRExC_state, flagp,depth+1,
11779 FALSE, /* means parse the whole char class */
11780 TRUE, /* allow multi-char folds */
11781 FALSE, /* don't silence non-portable warnings. */
11782 (bool) RExC_strict,
11783 TRUE, /* Allow an optimized regnode result */
11786 if (*flagp & (RESTART_PASS1|NEED_UTF8))
11788 FAIL2("panic: regclass returned NULL to regatom, flags=%#"UVxf"",
11791 if (*RExC_parse != ']') {
11792 RExC_parse = oregcomp_parse;
11793 vFAIL("Unmatched [");
11795 nextchar(pRExC_state);
11796 Set_Node_Length(ret, RExC_parse - oregcomp_parse + 1); /* MJD */
11800 nextchar(pRExC_state);
11801 ret = reg(pRExC_state, 2, &flags,depth+1);
11803 if (flags & TRYAGAIN) {
11804 if (RExC_parse == RExC_end) {
11805 /* Make parent create an empty node if needed. */
11806 *flagp |= TRYAGAIN;
11811 if (flags & (RESTART_PASS1|NEED_UTF8)) {
11812 *flagp = flags & (RESTART_PASS1|NEED_UTF8);
11815 FAIL2("panic: reg returned NULL to regatom, flags=%#"UVxf"",
11818 *flagp |= flags&(HASWIDTH|SPSTART|SIMPLE|POSTPONED);
11822 if (flags & TRYAGAIN) {
11823 *flagp |= TRYAGAIN;
11826 vFAIL("Internal urp");
11827 /* Supposed to be caught earlier. */
11833 vFAIL("Quantifier follows nothing");
11838 This switch handles escape sequences that resolve to some kind
11839 of special regop and not to literal text. Escape sequnces that
11840 resolve to literal text are handled below in the switch marked
11843 Every entry in this switch *must* have a corresponding entry
11844 in the literal escape switch. However, the opposite is not
11845 required, as the default for this switch is to jump to the
11846 literal text handling code.
11848 switch ((U8)*++RExC_parse) {
11849 /* Special Escapes */
11851 RExC_seen_zerolen++;
11852 ret = reg_node(pRExC_state, SBOL);
11853 /* SBOL is shared with /^/ so we set the flags so we can tell
11854 * /\A/ from /^/ in split. We check ret because first pass we
11855 * have no regop struct to set the flags on. */
11859 goto finish_meta_pat;
11861 ret = reg_node(pRExC_state, GPOS);
11862 RExC_seen |= REG_GPOS_SEEN;
11864 goto finish_meta_pat;
11866 RExC_seen_zerolen++;
11867 ret = reg_node(pRExC_state, KEEPS);
11869 /* XXX:dmq : disabling in-place substitution seems to
11870 * be necessary here to avoid cases of memory corruption, as
11871 * with: C<$_="x" x 80; s/x\K/y/> -- rgs
11873 RExC_seen |= REG_LOOKBEHIND_SEEN;
11874 goto finish_meta_pat;
11876 ret = reg_node(pRExC_state, SEOL);
11878 RExC_seen_zerolen++; /* Do not optimize RE away */
11879 goto finish_meta_pat;
11881 ret = reg_node(pRExC_state, EOS);
11883 RExC_seen_zerolen++; /* Do not optimize RE away */
11884 goto finish_meta_pat;
11886 vFAIL("\\C no longer supported");
11888 ret = reg_node(pRExC_state, CLUMP);
11889 *flagp |= HASWIDTH;
11890 goto finish_meta_pat;
11896 arg = ANYOF_WORDCHAR;
11904 regex_charset charset = get_regex_charset(RExC_flags);
11906 RExC_seen_zerolen++;
11907 RExC_seen |= REG_LOOKBEHIND_SEEN;
11908 op = BOUND + charset;
11910 if (op == BOUNDL) {
11911 RExC_contains_locale = 1;
11914 ret = reg_node(pRExC_state, op);
11916 if (*(RExC_parse + 1) != '{') {
11917 FLAGS(ret) = TRADITIONAL_BOUND;
11918 if (PASS2 && op > BOUNDA) { /* /aa is same as /a */
11924 char name = *RExC_parse;
11927 endbrace = strchr(RExC_parse, '}');
11930 vFAIL2("Missing right brace on \\%c{}", name);
11932 /* XXX Need to decide whether to take spaces or not. Should be
11933 * consistent with \p{}, but that currently is SPACE, which
11934 * means vertical too, which seems wrong
11935 * while (isBLANK(*RExC_parse)) {
11938 if (endbrace == RExC_parse) {
11939 RExC_parse++; /* After the '}' */
11940 vFAIL2("Empty \\%c{}", name);
11942 length = endbrace - RExC_parse;
11943 /*while (isBLANK(*(RExC_parse + length - 1))) {
11946 switch (*RExC_parse) {
11949 && (length != 3 || strnNE(RExC_parse + 1, "cb", 2)))
11951 goto bad_bound_type;
11953 FLAGS(ret) = GCB_BOUND;
11956 if (length != 2 || *(RExC_parse + 1) != 'b') {
11957 goto bad_bound_type;
11959 FLAGS(ret) = SB_BOUND;
11962 if (length != 2 || *(RExC_parse + 1) != 'b') {
11963 goto bad_bound_type;
11965 FLAGS(ret) = WB_BOUND;
11969 RExC_parse = endbrace;
11971 "'%"UTF8f"' is an unknown bound type",
11972 UTF8fARG(UTF, length, endbrace - length));
11973 NOT_REACHED; /*NOTREACHED*/
11975 RExC_parse = endbrace;
11976 REQUIRE_UNI_RULES(flagp, NULL);
11978 if (PASS2 && op >= BOUNDA) { /* /aa is same as /a */
11982 /* Don't have to worry about UTF-8, in this message because
11983 * to get here the contents of the \b must be ASCII */
11984 ckWARN4reg(RExC_parse + 1, /* Include the '}' in msg */
11985 "Using /u for '%.*s' instead of /%s",
11987 endbrace - length + 1,
11988 (charset == REGEX_ASCII_RESTRICTED_CHARSET)
11989 ? ASCII_RESTRICT_PAT_MODS
11990 : ASCII_MORE_RESTRICT_PAT_MODS);
11994 if (PASS2 && invert) {
11995 OP(ret) += NBOUND - BOUND;
11997 goto finish_meta_pat;
12005 if (! DEPENDS_SEMANTICS) {
12009 /* \d doesn't have any matches in the upper Latin1 range, hence /d
12010 * is equivalent to /u. Changing to /u saves some branches at
12013 goto join_posix_op_known;
12016 ret = reg_node(pRExC_state, LNBREAK);
12017 *flagp |= HASWIDTH|SIMPLE;
12018 goto finish_meta_pat;
12026 goto join_posix_op_known;
12032 arg = ANYOF_VERTWS;
12034 goto join_posix_op_known;
12044 op = POSIXD + get_regex_charset(RExC_flags);
12045 if (op > POSIXA) { /* /aa is same as /a */
12048 else if (op == POSIXL) {
12049 RExC_contains_locale = 1;
12052 join_posix_op_known:
12055 op += NPOSIXD - POSIXD;
12058 ret = reg_node(pRExC_state, op);
12060 FLAGS(ret) = namedclass_to_classnum(arg);
12063 *flagp |= HASWIDTH|SIMPLE;
12067 nextchar(pRExC_state);
12068 Set_Node_Length(ret, 2); /* MJD */
12074 ret = regclass(pRExC_state, flagp,depth+1,
12075 TRUE, /* means just parse this element */
12076 FALSE, /* don't allow multi-char folds */
12077 FALSE, /* don't silence non-portable warnings. It
12078 would be a bug if these returned
12080 (bool) RExC_strict,
12081 TRUE, /* Allow an optimized regnode result */
12083 if (*flagp & RESTART_PASS1)
12085 /* regclass() can only return RESTART_PASS1 and NEED_UTF8 if
12086 * multi-char folds are allowed. */
12088 FAIL2("panic: regclass returned NULL to regatom, flags=%#"UVxf"",
12093 Set_Node_Offset(ret, parse_start);
12094 Set_Node_Cur_Length(ret, parse_start - 2);
12095 nextchar(pRExC_state);
12098 /* Handle \N, \N{} and \N{NAMED SEQUENCE} (the latter meaning the
12099 * \N{...} evaluates to a sequence of more than one code points).
12100 * The function call below returns a regnode, which is our result.
12101 * The parameters cause it to fail if the \N{} evaluates to a
12102 * single code point; we handle those like any other literal. The
12103 * reason that the multicharacter case is handled here and not as
12104 * part of the EXACtish code is because of quantifiers. In
12105 * /\N{BLAH}+/, the '+' applies to the whole thing, and doing it
12106 * this way makes that Just Happen. dmq.
12107 * join_exact() will join this up with adjacent EXACTish nodes
12108 * later on, if appropriate. */
12110 if (grok_bslash_N(pRExC_state,
12111 &ret, /* Want a regnode returned */
12112 NULL, /* Fail if evaluates to a single code
12114 NULL, /* Don't need a count of how many code
12122 if (*flagp & RESTART_PASS1)
12125 /* Here, evaluates to a single code point. Go get that */
12126 RExC_parse = parse_start;
12129 case 'k': /* Handle \k<NAME> and \k'NAME' */
12132 char ch= RExC_parse[1];
12133 if (ch != '<' && ch != '\'' && ch != '{') {
12135 /* diag_listed_as: Sequence \%s... not terminated in regex; marked by <-- HERE in m/%s/ */
12136 vFAIL2("Sequence %.2s... not terminated",parse_start);
12138 /* this pretty much dupes the code for (?P=...) in reg(), if
12139 you change this make sure you change that */
12140 char* name_start = (RExC_parse += 2);
12142 SV *sv_dat = reg_scan_name(pRExC_state,
12143 SIZE_ONLY ? REG_RSN_RETURN_NULL : REG_RSN_RETURN_DATA);
12144 ch= (ch == '<') ? '>' : (ch == '{') ? '}' : '\'';
12145 if (RExC_parse == name_start || *RExC_parse != ch)
12146 /* diag_listed_as: Sequence \%s... not terminated in regex; marked by <-- HERE in m/%s/ */
12147 vFAIL2("Sequence %.3s... not terminated",parse_start);
12150 num = add_data( pRExC_state, STR_WITH_LEN("S"));
12151 RExC_rxi->data->data[num]=(void*)sv_dat;
12152 SvREFCNT_inc_simple_void(sv_dat);
12156 ret = reganode(pRExC_state,
12159 : (ASCII_FOLD_RESTRICTED)
12161 : (AT_LEAST_UNI_SEMANTICS)
12167 *flagp |= HASWIDTH;
12169 /* override incorrect value set in reganode MJD */
12170 Set_Node_Offset(ret, parse_start+1);
12171 Set_Node_Cur_Length(ret, parse_start);
12172 nextchar(pRExC_state);
12178 case '1': case '2': case '3': case '4':
12179 case '5': case '6': case '7': case '8': case '9':
12184 if (*RExC_parse == 'g') {
12188 if (*RExC_parse == '{') {
12192 if (*RExC_parse == '-') {
12196 if (hasbrace && !isDIGIT(*RExC_parse)) {
12197 if (isrel) RExC_parse--;
12199 goto parse_named_seq;
12202 num = S_backref_value(RExC_parse);
12204 vFAIL("Reference to invalid group 0");
12205 else if (num == I32_MAX) {
12206 if (isDIGIT(*RExC_parse))
12207 vFAIL("Reference to nonexistent group");
12209 vFAIL("Unterminated \\g... pattern");
12213 num = RExC_npar - num;
12215 vFAIL("Reference to nonexistent or unclosed group");
12219 num = S_backref_value(RExC_parse);
12220 /* bare \NNN might be backref or octal - if it is larger
12221 * than or equal RExC_npar then it is assumed to be an
12222 * octal escape. Note RExC_npar is +1 from the actual
12223 * number of parens. */
12224 /* Note we do NOT check if num == I32_MAX here, as that is
12225 * handled by the RExC_npar check */
12228 /* any numeric escape < 10 is always a backref */
12230 /* any numeric escape < RExC_npar is a backref */
12231 && num >= RExC_npar
12232 /* cannot be an octal escape if it starts with 8 */
12233 && *RExC_parse != '8'
12234 /* cannot be an octal escape it it starts with 9 */
12235 && *RExC_parse != '9'
12238 /* Probably not a backref, instead likely to be an
12239 * octal character escape, e.g. \35 or \777.
12240 * The above logic should make it obvious why using
12241 * octal escapes in patterns is problematic. - Yves */
12242 RExC_parse = parse_start;
12247 /* At this point RExC_parse points at a numeric escape like
12248 * \12 or \88 or something similar, which we should NOT treat
12249 * as an octal escape. It may or may not be a valid backref
12250 * escape. For instance \88888888 is unlikely to be a valid
12252 while (isDIGIT(*RExC_parse))
12255 if (*RExC_parse != '}')
12256 vFAIL("Unterminated \\g{...} pattern");
12260 if (num > (I32)RExC_rx->nparens)
12261 vFAIL("Reference to nonexistent group");
12264 ret = reganode(pRExC_state,
12267 : (ASCII_FOLD_RESTRICTED)
12269 : (AT_LEAST_UNI_SEMANTICS)
12275 *flagp |= HASWIDTH;
12277 /* override incorrect value set in reganode MJD */
12278 Set_Node_Offset(ret, parse_start);
12279 Set_Node_Cur_Length(ret, parse_start-1);
12280 skip_to_be_ignored_text(pRExC_state, &RExC_parse,
12281 FALSE /* Don't force to /x */ );
12285 if (RExC_parse >= RExC_end)
12286 FAIL("Trailing \\");
12289 /* Do not generate "unrecognized" warnings here, we fall
12290 back into the quick-grab loop below */
12291 RExC_parse = parse_start;
12293 } /* end of switch on a \foo sequence */
12298 /* '#' comments should have been spaced over before this function was
12300 assert((RExC_flags & RXf_PMf_EXTENDED) == 0);
12302 if (RExC_flags & RXf_PMf_EXTENDED) {
12303 RExC_parse = reg_skipcomment( pRExC_state, RExC_parse );
12304 if (RExC_parse < RExC_end)
12314 /* Here, we have determined that the next thing is probably a
12315 * literal character. RExC_parse points to the first byte of its
12316 * definition. (It still may be an escape sequence that evaluates
12317 * to a single character) */
12323 #define MAX_NODE_STRING_SIZE 127
12324 char foldbuf[MAX_NODE_STRING_SIZE+UTF8_MAXBYTES_CASE];
12326 U8 upper_parse = MAX_NODE_STRING_SIZE;
12327 U8 node_type = compute_EXACTish(pRExC_state);
12328 bool next_is_quantifier;
12329 char * oldp = NULL;
12331 /* We can convert EXACTF nodes to EXACTFU if they contain only
12332 * characters that match identically regardless of the target
12333 * string's UTF8ness. The reason to do this is that EXACTF is not
12334 * trie-able, EXACTFU is.
12336 * Similarly, we can convert EXACTFL nodes to EXACTFLU8 if they
12337 * contain only above-Latin1 characters (hence must be in UTF8),
12338 * which don't participate in folds with Latin1-range characters,
12339 * as the latter's folds aren't known until runtime. (We don't
12340 * need to figure this out until pass 2) */
12341 bool maybe_exactfu = PASS2
12342 && (node_type == EXACTF || node_type == EXACTFL);
12344 /* If a folding node contains only code points that don't
12345 * participate in folds, it can be changed into an EXACT node,
12346 * which allows the optimizer more things to look for */
12349 ret = reg_node(pRExC_state, node_type);
12351 /* In pass1, folded, we use a temporary buffer instead of the
12352 * actual node, as the node doesn't exist yet */
12353 s = (SIZE_ONLY && FOLD) ? foldbuf : STRING(ret);
12359 /* We look for the EXACTFish to EXACT node optimizaton only if
12360 * folding. (And we don't need to figure this out until pass 2) */
12361 maybe_exact = FOLD && PASS2;
12363 /* XXX The node can hold up to 255 bytes, yet this only goes to
12364 * 127. I (khw) do not know why. Keeping it somewhat less than
12365 * 255 allows us to not have to worry about overflow due to
12366 * converting to utf8 and fold expansion, but that value is
12367 * 255-UTF8_MAXBYTES_CASE. join_exact() may join adjacent nodes
12368 * split up by this limit into a single one using the real max of
12369 * 255. Even at 127, this breaks under rare circumstances. If
12370 * folding, we do not want to split a node at a character that is a
12371 * non-final in a multi-char fold, as an input string could just
12372 * happen to want to match across the node boundary. The join
12373 * would solve that problem if the join actually happens. But a
12374 * series of more than two nodes in a row each of 127 would cause
12375 * the first join to succeed to get to 254, but then there wouldn't
12376 * be room for the next one, which could at be one of those split
12377 * multi-char folds. I don't know of any fool-proof solution. One
12378 * could back off to end with only a code point that isn't such a
12379 * non-final, but it is possible for there not to be any in the
12382 assert( ! UTF /* Is at the beginning of a character */
12383 || UTF8_IS_INVARIANT(UCHARAT(RExC_parse))
12384 || UTF8_IS_START(UCHARAT(RExC_parse)));
12386 for (p = RExC_parse;
12387 len < upper_parse && p < RExC_end;
12392 /* White space has already been ignored */
12393 assert( (RExC_flags & RXf_PMf_EXTENDED) == 0
12394 || ! is_PATWS_safe((p), RExC_end, UTF));
12406 /* Literal Escapes Switch
12408 This switch is meant to handle escape sequences that
12409 resolve to a literal character.
12411 Every escape sequence that represents something
12412 else, like an assertion or a char class, is handled
12413 in the switch marked 'Special Escapes' above in this
12414 routine, but also has an entry here as anything that
12415 isn't explicitly mentioned here will be treated as
12416 an unescaped equivalent literal.
12419 switch ((U8)*++p) {
12420 /* These are all the special escapes. */
12421 case 'A': /* Start assertion */
12422 case 'b': case 'B': /* Word-boundary assertion*/
12423 case 'C': /* Single char !DANGEROUS! */
12424 case 'd': case 'D': /* digit class */
12425 case 'g': case 'G': /* generic-backref, pos assertion */
12426 case 'h': case 'H': /* HORIZWS */
12427 case 'k': case 'K': /* named backref, keep marker */
12428 case 'p': case 'P': /* Unicode property */
12429 case 'R': /* LNBREAK */
12430 case 's': case 'S': /* space class */
12431 case 'v': case 'V': /* VERTWS */
12432 case 'w': case 'W': /* word class */
12433 case 'X': /* eXtended Unicode "combining
12434 character sequence" */
12435 case 'z': case 'Z': /* End of line/string assertion */
12439 /* Anything after here is an escape that resolves to a
12440 literal. (Except digits, which may or may not)
12446 case 'N': /* Handle a single-code point named character. */
12447 RExC_parse = p + 1;
12448 if (! grok_bslash_N(pRExC_state,
12449 NULL, /* Fail if evaluates to
12450 anything other than a
12451 single code point */
12452 &ender, /* The returned single code
12454 NULL, /* Don't need a count of
12455 how many code points */
12459 if (*flagp & NEED_UTF8)
12460 FAIL("panic: grok_bslash_N set NEED_UTF8");
12461 if (*flagp & RESTART_PASS1)
12464 /* Here, it wasn't a single code point. Go close
12465 * up this EXACTish node. The switch() prior to
12466 * this switch handles the other cases */
12467 RExC_parse = p = oldp;
12471 if (ender > 0xff) {
12472 REQUIRE_UTF8(flagp);
12488 ender = ESC_NATIVE;
12498 const char* error_msg;
12500 bool valid = grok_bslash_o(&p,
12503 PASS2, /* out warnings */
12504 (bool) RExC_strict,
12505 TRUE, /* Output warnings
12510 RExC_parse = p; /* going to die anyway; point
12511 to exact spot of failure */
12515 if (IN_ENCODING && ender < 0x100) {
12516 goto recode_encoding;
12518 if (ender > 0xff) {
12519 REQUIRE_UTF8(flagp);
12525 UV result = UV_MAX; /* initialize to erroneous
12527 const char* error_msg;
12529 bool valid = grok_bslash_x(&p,
12532 PASS2, /* out warnings */
12533 (bool) RExC_strict,
12534 TRUE, /* Silence warnings
12539 RExC_parse = p; /* going to die anyway; point
12540 to exact spot of failure */
12545 if (ender < 0x100) {
12547 if (RExC_recode_x_to_native) {
12548 ender = LATIN1_TO_NATIVE(ender);
12553 goto recode_encoding;
12557 REQUIRE_UTF8(flagp);
12563 ender = grok_bslash_c(*p++, PASS2);
12565 case '8': case '9': /* must be a backreference */
12567 /* we have an escape like \8 which cannot be an octal escape
12568 * so we exit the loop, and let the outer loop handle this
12569 * escape which may or may not be a legitimate backref. */
12571 case '1': case '2': case '3':case '4':
12572 case '5': case '6': case '7':
12573 /* When we parse backslash escapes there is ambiguity
12574 * between backreferences and octal escapes. Any escape
12575 * from \1 - \9 is a backreference, any multi-digit
12576 * escape which does not start with 0 and which when
12577 * evaluated as decimal could refer to an already
12578 * parsed capture buffer is a back reference. Anything
12581 * Note this implies that \118 could be interpreted as
12582 * 118 OR as "\11" . "8" depending on whether there
12583 * were 118 capture buffers defined already in the
12586 /* NOTE, RExC_npar is 1 more than the actual number of
12587 * parens we have seen so far, hence the < RExC_npar below. */
12589 if ( !isDIGIT(p[1]) || S_backref_value(p) < RExC_npar)
12590 { /* Not to be treated as an octal constant, go
12598 I32 flags = PERL_SCAN_SILENT_ILLDIGIT;
12600 ender = grok_oct(p, &numlen, &flags, NULL);
12601 if (ender > 0xff) {
12602 REQUIRE_UTF8(flagp);
12605 if (PASS2 /* like \08, \178 */
12608 && isDIGIT(*p) && ckWARN(WARN_REGEXP))
12610 reg_warn_non_literal_string(
12612 form_short_octal_warning(p, numlen));
12615 if (IN_ENCODING && ender < 0x100)
12616 goto recode_encoding;
12619 if (! RExC_override_recoding) {
12620 SV* enc = _get_encoding();
12621 ender = reg_recode((U8)ender, &enc);
12623 ckWARNreg(p, "Invalid escape in the specified encoding");
12624 REQUIRE_UTF8(flagp);
12629 FAIL("Trailing \\");
12632 if (!SIZE_ONLY&& isALPHANUMERIC(*p)) {
12633 /* Include any left brace following the alpha to emphasize
12634 * that it could be part of an escape at some point
12636 int len = (isALPHA(*p) && *(p + 1) == '{') ? 2 : 1;
12637 ckWARN3reg(p + len, "Unrecognized escape \\%.*s passed through", len, p);
12639 goto normal_default;
12640 } /* End of switch on '\' */
12643 /* Currently we don't warn when the lbrace is at the start
12644 * of a construct. This catches it in the middle of a
12645 * literal string, or when it's the first thing after
12646 * something like "\b" */
12648 && (len || (p > RExC_start && isALPHA_A(*(p -1)))))
12650 ckWARNregdep(p + 1, "Unescaped left brace in regex is deprecated, passed through");
12653 default: /* A literal character */
12655 if (! UTF8_IS_INVARIANT(*p) && UTF) {
12657 ender = utf8n_to_uvchr((U8*)p, RExC_end - p,
12658 &numlen, UTF8_ALLOW_DEFAULT);
12664 } /* End of switch on the literal */
12666 /* Here, have looked at the literal character and <ender>
12667 * contains its ordinal, <p> points to the character after it.
12668 * We need to check if the next non-ignored thing is a
12669 * quantifier. Move <p> to after anything that should be
12670 * ignored, which, as a side effect, positions <p> for the next
12671 * loop iteration */
12672 skip_to_be_ignored_text(pRExC_state, &p,
12673 FALSE /* Don't force to /x */ );
12675 /* If the next thing is a quantifier, it applies to this
12676 * character only, which means that this character has to be in
12677 * its own node and can't just be appended to the string in an
12678 * existing node, so if there are already other characters in
12679 * the node, close the node with just them, and set up to do
12680 * this character again next time through, when it will be the
12681 * only thing in its new node */
12682 if ((next_is_quantifier = ( LIKELY(p < RExC_end)
12683 && UNLIKELY(ISMULT2(p))))
12690 /* Ready to add 'ender' to the node */
12692 if (! FOLD) { /* The simple case, just append the literal */
12694 /* In the sizing pass, we need only the size of the
12695 * character we are appending, hence we can delay getting
12696 * its representation until PASS2. */
12699 const STRLEN unilen = UVCHR_SKIP(ender);
12702 /* We have to subtract 1 just below (and again in
12703 * the corresponding PASS2 code) because the loop
12704 * increments <len> each time, as all but this path
12705 * (and one other) through it add a single byte to
12706 * the EXACTish node. But these paths would change
12707 * len to be the correct final value, so cancel out
12708 * the increment that follows */
12714 } else { /* PASS2 */
12717 U8 * new_s = uvchr_to_utf8((U8*)s, ender);
12718 len += (char *) new_s - s - 1;
12719 s = (char *) new_s;
12722 *(s++) = (char) ender;
12726 else if (LOC && is_PROBLEMATIC_LOCALE_FOLD_cp(ender)) {
12728 /* Here are folding under /l, and the code point is
12729 * problematic. First, we know we can't simplify things */
12730 maybe_exact = FALSE;
12731 maybe_exactfu = FALSE;
12733 /* A problematic code point in this context means that its
12734 * fold isn't known until runtime, so we can't fold it now.
12735 * (The non-problematic code points are the above-Latin1
12736 * ones that fold to also all above-Latin1. Their folds
12737 * don't vary no matter what the locale is.) But here we
12738 * have characters whose fold depends on the locale.
12739 * Unlike the non-folding case above, we have to keep track
12740 * of these in the sizing pass, so that we can make sure we
12741 * don't split too-long nodes in the middle of a potential
12742 * multi-char fold. And unlike the regular fold case
12743 * handled in the else clauses below, we don't actually
12744 * fold and don't have special cases to consider. What we
12745 * do for both passes is the PASS2 code for non-folding */
12746 goto not_fold_common;
12748 else /* A regular FOLD code point */
12750 #if UNICODE_MAJOR_VERSION > 3 /* no multifolds in early Unicode */ \
12751 || (UNICODE_MAJOR_VERSION == 3 && ( UNICODE_DOT_VERSION > 0) \
12752 || UNICODE_DOT_DOT_VERSION > 0)
12753 /* See comments for join_exact() as to why we fold
12754 * this non-UTF at compile time */
12755 || ( node_type == EXACTFU
12756 && ender == LATIN_SMALL_LETTER_SHARP_S)
12759 /* Here, are folding and are not UTF-8 encoded; therefore
12760 * the character must be in the range 0-255, and is not /l
12761 * (Not /l because we already handled these under /l in
12762 * is_PROBLEMATIC_LOCALE_FOLD_cp) */
12763 if (IS_IN_SOME_FOLD_L1(ender)) {
12764 maybe_exact = FALSE;
12766 /* See if the character's fold differs between /d and
12767 * /u. This includes the multi-char fold SHARP S to
12769 if (UNLIKELY(ender == LATIN_SMALL_LETTER_SHARP_S)) {
12770 RExC_seen_unfolded_sharp_s = 1;
12771 maybe_exactfu = FALSE;
12773 else if (maybe_exactfu
12774 && (PL_fold[ender] != PL_fold_latin1[ender]
12775 #if UNICODE_MAJOR_VERSION > 3 /* no multifolds in early Unicode */ \
12776 || (UNICODE_MAJOR_VERSION == 3 && ( UNICODE_DOT_VERSION > 0) \
12777 || UNICODE_DOT_DOT_VERSION > 0)
12779 && isALPHA_FOLD_EQ(ender, 's')
12780 && isALPHA_FOLD_EQ(*(s-1), 's'))
12783 maybe_exactfu = FALSE;
12787 /* Even when folding, we store just the input character, as
12788 * we have an array that finds its fold quickly */
12789 *(s++) = (char) ender;
12791 else { /* FOLD, and UTF (or sharp s) */
12792 /* Unlike the non-fold case, we do actually have to
12793 * calculate the results here in pass 1. This is for two
12794 * reasons, the folded length may be longer than the
12795 * unfolded, and we have to calculate how many EXACTish
12796 * nodes it will take; and we may run out of room in a node
12797 * in the middle of a potential multi-char fold, and have
12798 * to back off accordingly. */
12801 if (isASCII_uni(ender)) {
12802 folded = toFOLD(ender);
12803 *(s)++ = (U8) folded;
12808 folded = _to_uni_fold_flags(
12812 FOLD_FLAGS_FULL | ((ASCII_FOLD_RESTRICTED)
12813 ? FOLD_FLAGS_NOMIX_ASCII
12817 /* The loop increments <len> each time, as all but this
12818 * path (and one other) through it add a single byte to
12819 * the EXACTish node. But this one has changed len to
12820 * be the correct final value, so subtract one to
12821 * cancel out the increment that follows */
12822 len += foldlen - 1;
12824 /* If this node only contains non-folding code points so
12825 * far, see if this new one is also non-folding */
12827 if (folded != ender) {
12828 maybe_exact = FALSE;
12831 /* Here the fold is the original; we have to check
12832 * further to see if anything folds to it */
12833 if (_invlist_contains_cp(PL_utf8_foldable,
12836 maybe_exact = FALSE;
12843 if (next_is_quantifier) {
12845 /* Here, the next input is a quantifier, and to get here,
12846 * the current character is the only one in the node.
12847 * Also, here <len> doesn't include the final byte for this
12853 } /* End of loop through literal characters */
12855 /* Here we have either exhausted the input or ran out of room in
12856 * the node. (If we encountered a character that can't be in the
12857 * node, transfer is made directly to <loopdone>, and so we
12858 * wouldn't have fallen off the end of the loop.) In the latter
12859 * case, we artificially have to split the node into two, because
12860 * we just don't have enough space to hold everything. This
12861 * creates a problem if the final character participates in a
12862 * multi-character fold in the non-final position, as a match that
12863 * should have occurred won't, due to the way nodes are matched,
12864 * and our artificial boundary. So back off until we find a non-
12865 * problematic character -- one that isn't at the beginning or
12866 * middle of such a fold. (Either it doesn't participate in any
12867 * folds, or appears only in the final position of all the folds it
12868 * does participate in.) A better solution with far fewer false
12869 * positives, and that would fill the nodes more completely, would
12870 * be to actually have available all the multi-character folds to
12871 * test against, and to back-off only far enough to be sure that
12872 * this node isn't ending with a partial one. <upper_parse> is set
12873 * further below (if we need to reparse the node) to include just
12874 * up through that final non-problematic character that this code
12875 * identifies, so when it is set to less than the full node, we can
12876 * skip the rest of this */
12877 if (FOLD && p < RExC_end && upper_parse == MAX_NODE_STRING_SIZE) {
12879 const STRLEN full_len = len;
12881 assert(len >= MAX_NODE_STRING_SIZE);
12883 /* Here, <s> points to the final byte of the final character.
12884 * Look backwards through the string until find a non-
12885 * problematic character */
12889 /* This has no multi-char folds to non-UTF characters */
12890 if (ASCII_FOLD_RESTRICTED) {
12894 while (--s >= s0 && IS_NON_FINAL_FOLD(*s)) { }
12898 if (! PL_NonL1NonFinalFold) {
12899 PL_NonL1NonFinalFold = _new_invlist_C_array(
12900 NonL1_Perl_Non_Final_Folds_invlist);
12903 /* Point to the first byte of the final character */
12904 s = (char *) utf8_hop((U8 *) s, -1);
12906 while (s >= s0) { /* Search backwards until find
12907 non-problematic char */
12908 if (UTF8_IS_INVARIANT(*s)) {
12910 /* There are no ascii characters that participate
12911 * in multi-char folds under /aa. In EBCDIC, the
12912 * non-ascii invariants are all control characters,
12913 * so don't ever participate in any folds. */
12914 if (ASCII_FOLD_RESTRICTED
12915 || ! IS_NON_FINAL_FOLD(*s))
12920 else if (UTF8_IS_DOWNGRADEABLE_START(*s)) {
12921 if (! IS_NON_FINAL_FOLD(EIGHT_BIT_UTF8_TO_NATIVE(
12927 else if (! _invlist_contains_cp(
12928 PL_NonL1NonFinalFold,
12929 valid_utf8_to_uvchr((U8 *) s, NULL)))
12934 /* Here, the current character is problematic in that
12935 * it does occur in the non-final position of some
12936 * fold, so try the character before it, but have to
12937 * special case the very first byte in the string, so
12938 * we don't read outside the string */
12939 s = (s == s0) ? s -1 : (char *) utf8_hop((U8 *) s, -1);
12940 } /* End of loop backwards through the string */
12942 /* If there were only problematic characters in the string,
12943 * <s> will point to before s0, in which case the length
12944 * should be 0, otherwise include the length of the
12945 * non-problematic character just found */
12946 len = (s < s0) ? 0 : s - s0 + UTF8SKIP(s);
12949 /* Here, have found the final character, if any, that is
12950 * non-problematic as far as ending the node without splitting
12951 * it across a potential multi-char fold. <len> contains the
12952 * number of bytes in the node up-to and including that
12953 * character, or is 0 if there is no such character, meaning
12954 * the whole node contains only problematic characters. In
12955 * this case, give up and just take the node as-is. We can't
12960 /* If the node ends in an 's' we make sure it stays EXACTF,
12961 * as if it turns into an EXACTFU, it could later get
12962 * joined with another 's' that would then wrongly match
12964 if (maybe_exactfu && isALPHA_FOLD_EQ(ender, 's'))
12966 maybe_exactfu = FALSE;
12970 /* Here, the node does contain some characters that aren't
12971 * problematic. If one such is the final character in the
12972 * node, we are done */
12973 if (len == full_len) {
12976 else if (len + ((UTF) ? UTF8SKIP(s) : 1) == full_len) {
12978 /* If the final character is problematic, but the
12979 * penultimate is not, back-off that last character to
12980 * later start a new node with it */
12985 /* Here, the final non-problematic character is earlier
12986 * in the input than the penultimate character. What we do
12987 * is reparse from the beginning, going up only as far as
12988 * this final ok one, thus guaranteeing that the node ends
12989 * in an acceptable character. The reason we reparse is
12990 * that we know how far in the character is, but we don't
12991 * know how to correlate its position with the input parse.
12992 * An alternate implementation would be to build that
12993 * correlation as we go along during the original parse,
12994 * but that would entail extra work for every node, whereas
12995 * this code gets executed only when the string is too
12996 * large for the node, and the final two characters are
12997 * problematic, an infrequent occurrence. Yet another
12998 * possible strategy would be to save the tail of the
12999 * string, and the next time regatom is called, initialize
13000 * with that. The problem with this is that unless you
13001 * back off one more character, you won't be guaranteed
13002 * regatom will get called again, unless regbranch,
13003 * regpiece ... are also changed. If you do back off that
13004 * extra character, so that there is input guaranteed to
13005 * force calling regatom, you can't handle the case where
13006 * just the first character in the node is acceptable. I
13007 * (khw) decided to try this method which doesn't have that
13008 * pitfall; if performance issues are found, we can do a
13009 * combination of the current approach plus that one */
13015 } /* End of verifying node ends with an appropriate char */
13017 loopdone: /* Jumped to when encounters something that shouldn't be
13020 /* I (khw) don't know if you can get here with zero length, but the
13021 * old code handled this situation by creating a zero-length EXACT
13022 * node. Might as well be NOTHING instead */
13028 /* If 'maybe_exact' is still set here, means there are no
13029 * code points in the node that participate in folds;
13030 * similarly for 'maybe_exactfu' and code points that match
13031 * differently depending on UTF8ness of the target string
13032 * (for /u), or depending on locale for /l */
13038 else if (maybe_exactfu) {
13044 alloc_maybe_populate_EXACT(pRExC_state, ret, flagp, len, ender,
13045 FALSE /* Don't look to see if could
13046 be turned into an EXACT
13047 node, as we have already
13052 RExC_parse = p - 1;
13053 Set_Node_Cur_Length(ret, parse_start);
13055 skip_to_be_ignored_text(pRExC_state, &RExC_parse,
13056 FALSE /* Don't force to /x */ );
13058 /* len is STRLEN which is unsigned, need to copy to signed */
13061 vFAIL("Internal disaster");
13064 } /* End of label 'defchar:' */
13066 } /* End of giant switch on input character */
13073 S_populate_ANYOF_from_invlist(pTHX_ regnode *node, SV** invlist_ptr)
13075 /* Uses the inversion list '*invlist_ptr' to populate the ANYOF 'node'. It
13076 * sets up the bitmap and any flags, removing those code points from the
13077 * inversion list, setting it to NULL should it become completely empty */
13079 PERL_ARGS_ASSERT_POPULATE_ANYOF_FROM_INVLIST;
13080 assert(PL_regkind[OP(node)] == ANYOF);
13082 ANYOF_BITMAP_ZERO(node);
13083 if (*invlist_ptr) {
13085 /* This gets set if we actually need to modify things */
13086 bool change_invlist = FALSE;
13090 /* Start looking through *invlist_ptr */
13091 invlist_iterinit(*invlist_ptr);
13092 while (invlist_iternext(*invlist_ptr, &start, &end)) {
13096 if (end == UV_MAX && start <= NUM_ANYOF_CODE_POINTS) {
13097 ANYOF_FLAGS(node) |= ANYOF_MATCHES_ALL_ABOVE_BITMAP;
13099 else if (end >= NUM_ANYOF_CODE_POINTS) {
13100 ANYOF_FLAGS(node) |= ANYOF_HAS_UTF8_NONBITMAP_MATCHES;
13103 /* Quit if are above what we should change */
13104 if (start >= NUM_ANYOF_CODE_POINTS) {
13108 change_invlist = TRUE;
13110 /* Set all the bits in the range, up to the max that we are doing */
13111 high = (end < NUM_ANYOF_CODE_POINTS - 1)
13113 : NUM_ANYOF_CODE_POINTS - 1;
13114 for (i = start; i <= (int) high; i++) {
13115 if (! ANYOF_BITMAP_TEST(node, i)) {
13116 ANYOF_BITMAP_SET(node, i);
13120 invlist_iterfinish(*invlist_ptr);
13122 /* Done with loop; remove any code points that are in the bitmap from
13123 * *invlist_ptr; similarly for code points above the bitmap if we have
13124 * a flag to match all of them anyways */
13125 if (change_invlist) {
13126 _invlist_subtract(*invlist_ptr, PL_InBitmap, invlist_ptr);
13128 if (ANYOF_FLAGS(node) & ANYOF_MATCHES_ALL_ABOVE_BITMAP) {
13129 _invlist_intersection(*invlist_ptr, PL_InBitmap, invlist_ptr);
13132 /* If have completely emptied it, remove it completely */
13133 if (_invlist_len(*invlist_ptr) == 0) {
13134 SvREFCNT_dec_NN(*invlist_ptr);
13135 *invlist_ptr = NULL;
13140 /* Parse POSIX character classes: [[:foo:]], [[=foo=]], [[.foo.]].
13141 Character classes ([:foo:]) can also be negated ([:^foo:]).
13142 Returns a named class id (ANYOF_XXX) if successful, -1 otherwise.
13143 Equivalence classes ([=foo=]) and composites ([.foo.]) are parsed,
13144 but trigger failures because they are currently unimplemented. */
13146 #define POSIXCC_DONE(c) ((c) == ':')
13147 #define POSIXCC_NOTYET(c) ((c) == '=' || (c) == '.')
13148 #define POSIXCC(c) (POSIXCC_DONE(c) || POSIXCC_NOTYET(c))
13150 PERL_STATIC_INLINE I32
13151 S_regpposixcc(pTHX_ RExC_state_t *pRExC_state, I32 value, const bool strict)
13153 I32 namedclass = OOB_NAMEDCLASS;
13155 PERL_ARGS_ASSERT_REGPPOSIXCC;
13157 if (value == '[' && RExC_parse + 1 < RExC_end &&
13158 /* I smell either [: or [= or [. -- POSIX has been here, right? */
13159 POSIXCC(UCHARAT(RExC_parse)))
13161 const char c = UCHARAT(RExC_parse);
13162 char* const s = RExC_parse++;
13164 while (RExC_parse < RExC_end && UCHARAT(RExC_parse) != c)
13166 if (RExC_parse == RExC_end) {
13169 /* Try to give a better location for the error (than the end of
13170 * the string) by looking for the matching ']' */
13172 while (RExC_parse < RExC_end && UCHARAT(RExC_parse) != ']') {
13175 vFAIL2("Unmatched '%c' in POSIX class", c);
13177 /* Grandfather lone [:, [=, [. */
13181 const char* const t = RExC_parse++; /* skip over the c */
13184 if (UCHARAT(RExC_parse) == ']') {
13185 const char *posixcc = s + 1;
13186 RExC_parse++; /* skip over the ending ] */
13189 const I32 complement = *posixcc == '^' ? *posixcc++ : 0;
13190 const I32 skip = t - posixcc;
13192 /* Initially switch on the length of the name. */
13195 if (memEQ(posixcc, "word", 4)) /* this is not POSIX,
13196 this is the Perl \w
13198 namedclass = ANYOF_WORDCHAR;
13201 /* Names all of length 5. */
13202 /* alnum alpha ascii blank cntrl digit graph lower
13203 print punct space upper */
13204 /* Offset 4 gives the best switch position. */
13205 switch (posixcc[4]) {
13207 if (memEQ(posixcc, "alph", 4)) /* alpha */
13208 namedclass = ANYOF_ALPHA;
13211 if (memEQ(posixcc, "spac", 4)) /* space */
13212 namedclass = ANYOF_SPACE;
13215 if (memEQ(posixcc, "grap", 4)) /* graph */
13216 namedclass = ANYOF_GRAPH;
13219 if (memEQ(posixcc, "asci", 4)) /* ascii */
13220 namedclass = ANYOF_ASCII;
13223 if (memEQ(posixcc, "blan", 4)) /* blank */
13224 namedclass = ANYOF_BLANK;
13227 if (memEQ(posixcc, "cntr", 4)) /* cntrl */
13228 namedclass = ANYOF_CNTRL;
13231 if (memEQ(posixcc, "alnu", 4)) /* alnum */
13232 namedclass = ANYOF_ALPHANUMERIC;
13235 if (memEQ(posixcc, "lowe", 4)) /* lower */
13236 namedclass = (FOLD) ? ANYOF_CASED : ANYOF_LOWER;
13237 else if (memEQ(posixcc, "uppe", 4)) /* upper */
13238 namedclass = (FOLD) ? ANYOF_CASED : ANYOF_UPPER;
13241 if (memEQ(posixcc, "digi", 4)) /* digit */
13242 namedclass = ANYOF_DIGIT;
13243 else if (memEQ(posixcc, "prin", 4)) /* print */
13244 namedclass = ANYOF_PRINT;
13245 else if (memEQ(posixcc, "punc", 4)) /* punct */
13246 namedclass = ANYOF_PUNCT;
13251 if (memEQ(posixcc, "xdigit", 6))
13252 namedclass = ANYOF_XDIGIT;
13256 if (namedclass == OOB_NAMEDCLASS)
13258 "POSIX class [:%"UTF8f":] unknown",
13259 UTF8fARG(UTF, t - s - 1, s + 1));
13261 /* The #defines are structured so each complement is +1 to
13262 * the normal one */
13266 assert (posixcc[skip] == ':');
13267 assert (posixcc[skip+1] == ']');
13268 } else if (!SIZE_ONLY) {
13269 /* [[=foo=]] and [[.foo.]] are still future. */
13271 /* adjust RExC_parse so the warning shows after
13272 the class closes */
13273 while (UCHARAT(RExC_parse) && UCHARAT(RExC_parse) != ']')
13275 vFAIL3("POSIX syntax [%c %c] is reserved for future extensions", c, c);
13278 /* Maternal grandfather:
13279 * "[:" ending in ":" but not in ":]" */
13281 vFAIL("Unmatched '[' in POSIX class");
13284 /* Grandfather lone [:, [=, [. */
13294 S_could_it_be_a_POSIX_class(RExC_state_t *pRExC_state)
13296 /* This applies some heuristics at the current parse position (which should
13297 * be at a '[') to see if what follows might be intended to be a [:posix:]
13298 * class. It returns true if it really is a posix class, of course, but it
13299 * also can return true if it thinks that what was intended was a posix
13300 * class that didn't quite make it.
13302 * It will return true for
13304 * [:alphanumerics] (as long as the ] isn't followed immediately by a
13305 * ')' indicating the end of the (?[
13306 * [:any garbage including %^&$ punctuation:]
13308 * This is designed to be called only from S_handle_regex_sets; it could be
13309 * easily adapted to be called from the spot at the beginning of regclass()
13310 * that checks to see in a normal bracketed class if the surrounding []
13311 * have been omitted ([:word:] instead of [[:word:]]). But doing so would
13312 * change long-standing behavior, so I (khw) didn't do that */
13313 char* p = RExC_parse + 1;
13314 char first_char = *p;
13316 PERL_ARGS_ASSERT_COULD_IT_BE_A_POSIX_CLASS;
13318 assert(*(p - 1) == '[');
13320 if (! POSIXCC(first_char)) {
13325 while (p < RExC_end && isWORDCHAR(*p)) p++;
13327 if (p >= RExC_end) {
13331 if (p - RExC_parse > 2 /* Got at least 1 word character */
13332 && (*p == first_char
13333 || (*p == ']' && p + 1 < RExC_end && *(p + 1) != ')')))
13338 p = (char *) memchr(RExC_parse, ']', RExC_end - RExC_parse);
13341 && p - RExC_parse > 2 /* [:] evaluates to colon;
13342 [::] is a bad posix class. */
13343 && first_char == *(p - 1));
13346 STATIC unsigned int
13347 S_regex_set_precedence(const U8 my_operator) {
13349 /* Returns the precedence in the (?[...]) construct of the input operator,
13350 * specified by its character representation. The precedence follows
13351 * general Perl rules, but it extends this so that ')' and ']' have (low)
13352 * precedence even though they aren't really operators */
13354 switch (my_operator) {
13370 NOT_REACHED; /* NOTREACHED */
13371 return 0; /* Silence compiler warning */
13375 S_handle_regex_sets(pTHX_ RExC_state_t *pRExC_state, SV** return_invlist,
13376 I32 *flagp, U32 depth,
13377 char * const oregcomp_parse)
13379 /* Handle the (?[...]) construct to do set operations */
13381 U8 curchar; /* Current character being parsed */
13382 UV start, end; /* End points of code point ranges */
13383 SV* final = NULL; /* The end result inversion list */
13384 SV* result_string; /* 'final' stringified */
13385 AV* stack; /* stack of operators and operands not yet
13387 AV* fence_stack = NULL; /* A stack containing the positions in
13388 'stack' of where the undealt-with left
13389 parens would be if they were actually
13391 IV fence = 0; /* Position of where most recent undealt-
13392 with left paren in stack is; -1 if none.
13394 STRLEN len; /* Temporary */
13395 regnode* node; /* Temporary, and final regnode returned by
13397 const bool save_fold = FOLD; /* Temporary */
13398 char *save_end, *save_parse; /* Temporaries */
13399 const bool in_locale = LOC; /* we turn off /l during processing */
13401 GET_RE_DEBUG_FLAGS_DECL;
13403 PERL_ARGS_ASSERT_HANDLE_REGEX_SETS;
13406 set_regex_charset(&RExC_flags, REGEX_UNICODE_CHARSET);
13409 REQUIRE_UNI_RULES(flagp, NULL); /* The use of this operator implies /u.
13410 This is required so that the compile
13411 time values are valid in all runtime
13414 /* This will return only an ANYOF regnode, or (unlikely) something smaller
13415 * (such as EXACT). Thus we can skip most everything if just sizing. We
13416 * call regclass to handle '[]' so as to not have to reinvent its parsing
13417 * rules here (throwing away the size it computes each time). And, we exit
13418 * upon an unescaped ']' that isn't one ending a regclass. To do both
13419 * these things, we need to realize that something preceded by a backslash
13420 * is escaped, so we have to keep track of backslashes */
13422 UV depth = 0; /* how many nested (?[...]) constructs */
13424 while (RExC_parse < RExC_end) {
13425 SV* current = NULL;
13427 skip_to_be_ignored_text(pRExC_state, &RExC_parse,
13428 TRUE /* Force /x */ );
13430 switch (*RExC_parse) {
13432 if (RExC_parse[1] == '[') depth++, RExC_parse++;
13437 /* Skip the next byte (which could cause us to end up in
13438 * the middle of a UTF-8 character, but since none of those
13439 * are confusable with anything we currently handle in this
13440 * switch (invariants all), it's safe. We'll just hit the
13441 * default: case next time and keep on incrementing until
13442 * we find one of the invariants we do handle. */
13444 if (*RExC_parse == 'c') {
13445 /* Skip the \cX notation for control characters */
13446 RExC_parse += UTF ? UTF8SKIP(RExC_parse) : 1;
13451 /* If this looks like it is a [:posix:] class, leave the
13452 * parse pointer at the '[' to fool regclass() into
13453 * thinking it is part of a '[[:posix:]]'. That function
13454 * will use strict checking to force a syntax error if it
13455 * doesn't work out to a legitimate class */
13456 bool is_posix_class
13457 = could_it_be_a_POSIX_class(pRExC_state);
13458 if (! is_posix_class) {
13462 /* regclass() can only return RESTART_PASS1 and NEED_UTF8
13463 * if multi-char folds are allowed. */
13464 if (!regclass(pRExC_state, flagp,depth+1,
13465 is_posix_class, /* parse the whole char
13466 class only if not a
13468 FALSE, /* don't allow multi-char folds */
13469 TRUE, /* silence non-portable warnings. */
13471 FALSE, /* Require return to be an ANYOF */
13474 FAIL2("panic: regclass returned NULL to handle_sets, "
13475 "flags=%#"UVxf"", (UV) *flagp);
13477 /* function call leaves parse pointing to the ']', except
13478 * if we faked it */
13479 if (is_posix_class) {
13483 SvREFCNT_dec(current); /* In case it returned something */
13488 if (depth--) break;
13490 if (RExC_parse < RExC_end
13491 && *RExC_parse == ')')
13493 node = reganode(pRExC_state, ANYOF, 0);
13494 RExC_size += ANYOF_SKIP;
13495 nextchar(pRExC_state);
13496 Set_Node_Length(node,
13497 RExC_parse - oregcomp_parse + 1); /* MJD */
13499 set_regex_charset(&RExC_flags, REGEX_LOCALE_CHARSET);
13507 RExC_parse += UTF ? UTF8SKIP(RExC_parse) : 1;
13511 FAIL("Syntax error in (?[...])");
13514 /* Pass 2 only after this. */
13515 Perl_ck_warner_d(aTHX_
13516 packWARN(WARN_EXPERIMENTAL__REGEX_SETS),
13517 "The regex_sets feature is experimental" REPORT_LOCATION,
13518 UTF8fARG(UTF, (RExC_parse - RExC_precomp), RExC_precomp),
13520 RExC_end - RExC_start - (RExC_parse - RExC_precomp),
13521 RExC_precomp + (RExC_parse - RExC_precomp)));
13523 /* Everything in this construct is a metacharacter. Operands begin with
13524 * either a '\' (for an escape sequence), or a '[' for a bracketed
13525 * character class. Any other character should be an operator, or
13526 * parenthesis for grouping. Both types of operands are handled by calling
13527 * regclass() to parse them. It is called with a parameter to indicate to
13528 * return the computed inversion list. The parsing here is implemented via
13529 * a stack. Each entry on the stack is a single character representing one
13530 * of the operators; or else a pointer to an operand inversion list. */
13532 #define IS_OPERATOR(a) SvIOK(a)
13533 #define IS_OPERAND(a) (! IS_OPERATOR(a))
13535 /* The stack is kept in Łukasiewicz order. (That's pronounced similar
13536 * to luke-a-shave-itch (or -itz), but people who didn't want to bother
13537 * with pronouncing it called it Reverse Polish instead, but now that YOU
13538 * know how to pronounce it you can use the correct term, thus giving due
13539 * credit to the person who invented it, and impressing your geek friends.
13540 * Wikipedia says that the pronounciation of "Ł" has been changing so that
13541 * it is now more like an English initial W (as in wonk) than an L.)
13543 * This means that, for example, 'a | b & c' is stored on the stack as
13551 * where the numbers in brackets give the stack [array] element number.
13552 * In this implementation, parentheses are not stored on the stack.
13553 * Instead a '(' creates a "fence" so that the part of the stack below the
13554 * fence is invisible except to the corresponding ')' (this allows us to
13555 * replace testing for parens, by using instead subtraction of the fence
13556 * position). As new operands are processed they are pushed onto the stack
13557 * (except as noted in the next paragraph). New operators of higher
13558 * precedence than the current final one are inserted on the stack before
13559 * the lhs operand (so that when the rhs is pushed next, everything will be
13560 * in the correct positions shown above. When an operator of equal or
13561 * lower precedence is encountered in parsing, all the stacked operations
13562 * of equal or higher precedence are evaluated, leaving the result as the
13563 * top entry on the stack. This makes higher precedence operations
13564 * evaluate before lower precedence ones, and causes operations of equal
13565 * precedence to left associate.
13567 * The only unary operator '!' is immediately pushed onto the stack when
13568 * encountered. When an operand is encountered, if the top of the stack is
13569 * a '!", the complement is immediately performed, and the '!' popped. The
13570 * resulting value is treated as a new operand, and the logic in the
13571 * previous paragraph is executed. Thus in the expression
13573 * the stack looks like
13579 * as 'b' gets parsed, the latter gets evaluated to '!b', and the stack
13586 * A ')' is treated as an operator with lower precedence than all the
13587 * aforementioned ones, which causes all operations on the stack above the
13588 * corresponding '(' to be evaluated down to a single resultant operand.
13589 * Then the fence for the '(' is removed, and the operand goes through the
13590 * algorithm above, without the fence.
13592 * A separate stack is kept of the fence positions, so that the position of
13593 * the latest so-far unbalanced '(' is at the top of it.
13595 * The ']' ending the construct is treated as the lowest operator of all,
13596 * so that everything gets evaluated down to a single operand, which is the
13599 sv_2mortal((SV *)(stack = newAV()));
13600 sv_2mortal((SV *)(fence_stack = newAV()));
13602 while (RExC_parse < RExC_end) {
13603 I32 top_index; /* Index of top-most element in 'stack' */
13604 SV** top_ptr; /* Pointer to top 'stack' element */
13605 SV* current = NULL; /* To contain the current inversion list
13607 SV* only_to_avoid_leaks;
13609 skip_to_be_ignored_text(pRExC_state, &RExC_parse,
13610 TRUE /* Force /x */ );
13611 if (RExC_parse >= RExC_end) {
13612 Perl_croak(aTHX_ "panic: Read past end of '(?[ ])'");
13615 curchar = UCHARAT(RExC_parse);
13619 top_index = av_tindex(stack);
13622 SV** stacked_ptr; /* Ptr to something already on 'stack' */
13623 char stacked_operator; /* The topmost operator on the 'stack'. */
13624 SV* lhs; /* Operand to the left of the operator */
13625 SV* rhs; /* Operand to the right of the operator */
13626 SV* fence_ptr; /* Pointer to top element of the fence
13631 if (RExC_parse < RExC_end && (UCHARAT(RExC_parse + 1) == '?'))
13633 /* If is a '(?', could be an embedded '(?flags:(?[...])'.
13634 * This happens when we have some thing like
13636 * my $thai_or_lao = qr/(?[ \p{Thai} + \p{Lao} ])/;
13638 * qr/(?[ \p{Digit} & $thai_or_lao ])/;
13640 * Here we would be handling the interpolated
13641 * '$thai_or_lao'. We handle this by a recursive call to
13642 * ourselves which returns the inversion list the
13643 * interpolated expression evaluates to. We use the flags
13644 * from the interpolated pattern. */
13645 U32 save_flags = RExC_flags;
13646 const char * save_parse;
13648 RExC_parse += 2; /* Skip past the '(?' */
13649 save_parse = RExC_parse;
13651 /* Parse any flags for the '(?' */
13652 parse_lparen_question_flags(pRExC_state);
13654 if (RExC_parse == save_parse /* Makes sure there was at
13655 least one flag (or else
13656 this embedding wasn't
13658 || RExC_parse >= RExC_end - 4
13659 || UCHARAT(RExC_parse) != ':'
13660 || UCHARAT(++RExC_parse) != '('
13661 || UCHARAT(++RExC_parse) != '?'
13662 || UCHARAT(++RExC_parse) != '[')
13665 /* In combination with the above, this moves the
13666 * pointer to the point just after the first erroneous
13667 * character (or if there are no flags, to where they
13668 * should have been) */
13669 if (RExC_parse >= RExC_end - 4) {
13670 RExC_parse = RExC_end;
13672 else if (RExC_parse != save_parse) {
13673 RExC_parse += (UTF) ? UTF8SKIP(RExC_parse) : 1;
13675 vFAIL("Expecting '(?flags:(?[...'");
13678 /* Recurse, with the meat of the embedded expression */
13680 (void) handle_regex_sets(pRExC_state, ¤t, flagp,
13681 depth+1, oregcomp_parse);
13683 /* Here, 'current' contains the embedded expression's
13684 * inversion list, and RExC_parse points to the trailing
13685 * ']'; the next character should be the ')' */
13687 assert(RExC_parse < RExC_end && UCHARAT(RExC_parse) == ')');
13689 /* Then the ')' matching the original '(' handled by this
13690 * case: statement */
13692 assert(RExC_parse < RExC_end && UCHARAT(RExC_parse) == ')');
13695 RExC_flags = save_flags;
13696 goto handle_operand;
13699 /* A regular '('. Look behind for illegal syntax */
13700 if (top_index - fence >= 0) {
13701 /* If the top entry on the stack is an operator, it had
13702 * better be a '!', otherwise the entry below the top
13703 * operand should be an operator */
13704 if ( ! (top_ptr = av_fetch(stack, top_index, FALSE))
13705 || (IS_OPERATOR(*top_ptr) && SvUV(*top_ptr) != '!')
13706 || ( IS_OPERAND(*top_ptr)
13707 && ( top_index - fence < 1
13708 || ! (stacked_ptr = av_fetch(stack,
13711 || ! IS_OPERATOR(*stacked_ptr))))
13714 vFAIL("Unexpected '(' with no preceding operator");
13718 /* Stack the position of this undealt-with left paren */
13719 fence = top_index + 1;
13720 av_push(fence_stack, newSViv(fence));
13724 /* regclass() can only return RESTART_PASS1 and NEED_UTF8 if
13725 * multi-char folds are allowed. */
13726 if (!regclass(pRExC_state, flagp,depth+1,
13727 TRUE, /* means parse just the next thing */
13728 FALSE, /* don't allow multi-char folds */
13729 FALSE, /* don't silence non-portable warnings. */
13731 FALSE, /* Require return to be an ANYOF */
13734 FAIL2("panic: regclass returned NULL to handle_sets, "
13735 "flags=%#"UVxf"", (UV) *flagp);
13738 /* regclass() will return with parsing just the \ sequence,
13739 * leaving the parse pointer at the next thing to parse */
13741 goto handle_operand;
13743 case '[': /* Is a bracketed character class */
13745 bool is_posix_class = could_it_be_a_POSIX_class(pRExC_state);
13747 if (! is_posix_class) {
13751 /* regclass() can only return RESTART_PASS1 and NEED_UTF8 if
13752 * multi-char folds are allowed. */
13753 if(!regclass(pRExC_state, flagp,depth+1,
13754 is_posix_class, /* parse the whole char class
13755 only if not a posix class */
13756 FALSE, /* don't allow multi-char folds */
13757 FALSE, /* don't silence non-portable warnings. */
13759 FALSE, /* Require return to be an ANYOF */
13763 FAIL2("panic: regclass returned NULL to handle_sets, "
13764 "flags=%#"UVxf"", (UV) *flagp);
13767 /* function call leaves parse pointing to the ']', except if we
13769 if (is_posix_class) {
13773 goto handle_operand;
13777 if (top_index >= 1) {
13778 goto join_operators;
13781 /* Only a single operand on the stack: are done */
13785 if (av_tindex(fence_stack) < 0) {
13787 vFAIL("Unexpected ')'");
13790 /* If at least two thing on the stack, treat this as an
13792 if (top_index - fence >= 1) {
13793 goto join_operators;
13796 /* Here only a single thing on the fenced stack, and there is a
13797 * fence. Get rid of it */
13798 fence_ptr = av_pop(fence_stack);
13800 fence = SvIV(fence_ptr) - 1;
13801 SvREFCNT_dec_NN(fence_ptr);
13808 /* Having gotten rid of the fence, we pop the operand at the
13809 * stack top and process it as a newly encountered operand */
13810 current = av_pop(stack);
13811 if (IS_OPERAND(current)) {
13812 goto handle_operand;
13824 /* These binary operators should have a left operand already
13826 if ( top_index - fence < 0
13827 || top_index - fence == 1
13828 || ( ! (top_ptr = av_fetch(stack, top_index, FALSE)))
13829 || ! IS_OPERAND(*top_ptr))
13831 goto unexpected_binary;
13834 /* If only the one operand is on the part of the stack visible
13835 * to us, we just place this operator in the proper position */
13836 if (top_index - fence < 2) {
13838 /* Place the operator before the operand */
13840 SV* lhs = av_pop(stack);
13841 av_push(stack, newSVuv(curchar));
13842 av_push(stack, lhs);
13846 /* But if there is something else on the stack, we need to
13847 * process it before this new operator if and only if the
13848 * stacked operation has equal or higher precedence than the
13853 /* The operator on the stack is supposed to be below both its
13855 if ( ! (stacked_ptr = av_fetch(stack, top_index - 2, FALSE))
13856 || IS_OPERAND(*stacked_ptr))
13858 /* But if not, it's legal and indicates we are completely
13859 * done if and only if we're currently processing a ']',
13860 * which should be the final thing in the expression */
13861 if (curchar == ']') {
13867 vFAIL2("Unexpected binary operator '%c' with no "
13868 "preceding operand", curchar);
13870 stacked_operator = (char) SvUV(*stacked_ptr);
13872 if (regex_set_precedence(curchar)
13873 > regex_set_precedence(stacked_operator))
13875 /* Here, the new operator has higher precedence than the
13876 * stacked one. This means we need to add the new one to
13877 * the stack to await its rhs operand (and maybe more
13878 * stuff). We put it before the lhs operand, leaving
13879 * untouched the stacked operator and everything below it
13881 lhs = av_pop(stack);
13882 assert(IS_OPERAND(lhs));
13884 av_push(stack, newSVuv(curchar));
13885 av_push(stack, lhs);
13889 /* Here, the new operator has equal or lower precedence than
13890 * what's already there. This means the operation already
13891 * there should be performed now, before the new one. */
13893 rhs = av_pop(stack);
13894 if (! IS_OPERAND(rhs)) {
13896 /* This can happen when a ! is not followed by an operand,
13897 * like in /(?[\t &!])/ */
13901 lhs = av_pop(stack);
13903 if (! IS_OPERAND(lhs)) {
13905 /* This can happen when there is an empty (), like in
13906 * /(?[[0]+()+])/ */
13910 switch (stacked_operator) {
13912 _invlist_intersection(lhs, rhs, &rhs);
13917 _invlist_union(lhs, rhs, &rhs);
13921 _invlist_subtract(lhs, rhs, &rhs);
13924 case '^': /* The union minus the intersection */
13930 _invlist_union(lhs, rhs, &u);
13931 _invlist_intersection(lhs, rhs, &i);
13932 /* _invlist_subtract will overwrite rhs
13933 without freeing what it already contains */
13935 _invlist_subtract(u, i, &rhs);
13936 SvREFCNT_dec_NN(i);
13937 SvREFCNT_dec_NN(u);
13938 SvREFCNT_dec_NN(element);
13944 /* Here, the higher precedence operation has been done, and the
13945 * result is in 'rhs'. We overwrite the stacked operator with
13946 * the result. Then we redo this code to either push the new
13947 * operator onto the stack or perform any higher precedence
13948 * stacked operation */
13949 only_to_avoid_leaks = av_pop(stack);
13950 SvREFCNT_dec(only_to_avoid_leaks);
13951 av_push(stack, rhs);
13954 case '!': /* Highest priority, right associative */
13956 /* If what's already at the top of the stack is another '!",
13957 * they just cancel each other out */
13958 if ( (top_ptr = av_fetch(stack, top_index, FALSE))
13959 && (IS_OPERATOR(*top_ptr) && SvUV(*top_ptr) == '!'))
13961 only_to_avoid_leaks = av_pop(stack);
13962 SvREFCNT_dec(only_to_avoid_leaks);
13964 else { /* Otherwise, since it's right associative, just push
13966 av_push(stack, newSVuv(curchar));
13971 RExC_parse += (UTF) ? UTF8SKIP(RExC_parse) : 1;
13972 vFAIL("Unexpected character");
13976 /* Here 'current' is the operand. If something is already on the
13977 * stack, we have to check if it is a !. */
13978 top_index = av_tindex(stack); /* Code above may have altered the
13979 * stack in the time since we
13980 * earlier set 'top_index'. */
13981 if (top_index - fence >= 0) {
13982 /* If the top entry on the stack is an operator, it had better
13983 * be a '!', otherwise the entry below the top operand should
13984 * be an operator */
13985 top_ptr = av_fetch(stack, top_index, FALSE);
13987 if (IS_OPERATOR(*top_ptr)) {
13989 /* The only permissible operator at the top of the stack is
13990 * '!', which is applied immediately to this operand. */
13991 curchar = (char) SvUV(*top_ptr);
13992 if (curchar != '!') {
13993 SvREFCNT_dec(current);
13994 vFAIL2("Unexpected binary operator '%c' with no "
13995 "preceding operand", curchar);
13998 _invlist_invert(current);
14000 only_to_avoid_leaks = av_pop(stack);
14001 SvREFCNT_dec(only_to_avoid_leaks);
14002 top_index = av_tindex(stack);
14004 /* And we redo with the inverted operand. This allows
14005 * handling multiple ! in a row */
14006 goto handle_operand;
14008 /* Single operand is ok only for the non-binary ')'
14010 else if ((top_index - fence == 0 && curchar != ')')
14011 || (top_index - fence > 0
14012 && (! (stacked_ptr = av_fetch(stack,
14015 || IS_OPERAND(*stacked_ptr))))
14017 SvREFCNT_dec(current);
14018 vFAIL("Operand with no preceding operator");
14022 /* Here there was nothing on the stack or the top element was
14023 * another operand. Just add this new one */
14024 av_push(stack, current);
14026 } /* End of switch on next parse token */
14028 RExC_parse += (UTF) ? UTF8SKIP(RExC_parse) : 1;
14029 } /* End of loop parsing through the construct */
14032 if (av_tindex(fence_stack) >= 0) {
14033 vFAIL("Unmatched (");
14036 if (av_tindex(stack) < 0 /* Was empty */
14037 || ((final = av_pop(stack)) == NULL)
14038 || ! IS_OPERAND(final)
14039 || SvTYPE(final) != SVt_INVLIST
14040 || av_tindex(stack) >= 0) /* More left on stack */
14043 SvREFCNT_dec(final);
14044 vFAIL("Incomplete expression within '(?[ ])'");
14047 /* Here, 'final' is the resultant inversion list from evaluating the
14048 * expression. Return it if so requested */
14049 if (return_invlist) {
14050 *return_invlist = final;
14054 /* Otherwise generate a resultant node, based on 'final'. regclass() is
14055 * expecting a string of ranges and individual code points */
14056 invlist_iterinit(final);
14057 result_string = newSVpvs("");
14058 while (invlist_iternext(final, &start, &end)) {
14059 if (start == end) {
14060 Perl_sv_catpvf(aTHX_ result_string, "\\x{%"UVXf"}", start);
14063 Perl_sv_catpvf(aTHX_ result_string, "\\x{%"UVXf"}-\\x{%"UVXf"}",
14068 /* About to generate an ANYOF (or similar) node from the inversion list we
14069 * have calculated */
14070 save_parse = RExC_parse;
14071 RExC_parse = SvPV(result_string, len);
14072 save_end = RExC_end;
14073 RExC_end = RExC_parse + len;
14075 /* We turn off folding around the call, as the class we have constructed
14076 * already has all folding taken into consideration, and we don't want
14077 * regclass() to add to that */
14078 RExC_flags &= ~RXf_PMf_FOLD;
14079 /* regclass() can only return RESTART_PASS1 and NEED_UTF8 if multi-char
14080 * folds are allowed. */
14081 node = regclass(pRExC_state, flagp,depth+1,
14082 FALSE, /* means parse the whole char class */
14083 FALSE, /* don't allow multi-char folds */
14084 TRUE, /* silence non-portable warnings. The above may very
14085 well have generated non-portable code points, but
14086 they're valid on this machine */
14087 FALSE, /* similarly, no need for strict */
14088 FALSE, /* Require return to be an ANYOF */
14092 FAIL2("panic: regclass returned NULL to handle_sets, flags=%#"UVxf,
14095 /* Fix up the node type if we are in locale. (We have pretended we are
14096 * under /u for the purposes of regclass(), as this construct will only
14097 * work under UTF-8 locales. But now we change the opcode to be ANYOFL (so
14098 * as to cause any warnings about bad locales to be output in regexec.c),
14099 * and add the flag that indicates to check if not in a UTF-8 locale. The
14100 * reason we above forbid optimization into something other than an ANYOF
14101 * node is simply to minimize the number of code changes in regexec.c.
14102 * Otherwise we would have to create new EXACTish node types and deal with
14103 * them. This decision could be revisited should this construct become
14106 * (One might think we could look at the resulting ANYOF node and suppress
14107 * the flag if everything is above 255, as those would be UTF-8 only,
14108 * but this isn't true, as the components that led to that result could
14109 * have been locale-affected, and just happen to cancel each other out
14110 * under UTF-8 locales.) */
14112 set_regex_charset(&RExC_flags, REGEX_LOCALE_CHARSET);
14114 assert(OP(node) == ANYOF);
14117 ANYOF_FLAGS(node) |= ANYOF_LOC_REQ_UTF8;
14121 RExC_flags |= RXf_PMf_FOLD;
14124 RExC_parse = save_parse + 1;
14125 RExC_end = save_end;
14126 SvREFCNT_dec_NN(final);
14127 SvREFCNT_dec_NN(result_string);
14129 nextchar(pRExC_state);
14130 Set_Node_Length(node, RExC_parse - oregcomp_parse + 1); /* MJD */
14137 S_add_above_Latin1_folds(pTHX_ RExC_state_t *pRExC_state, const U8 cp, SV** invlist)
14139 /* This hard-codes the Latin1/above-Latin1 folding rules, so that an
14140 * innocent-looking character class, like /[ks]/i won't have to go out to
14141 * disk to find the possible matches.
14143 * This should be called only for a Latin1-range code points, cp, which is
14144 * known to be involved in a simple fold with other code points above
14145 * Latin1. It would give false results if /aa has been specified.
14146 * Multi-char folds are outside the scope of this, and must be handled
14149 * XXX It would be better to generate these via regen, in case a new
14150 * version of the Unicode standard adds new mappings, though that is not
14151 * really likely, and may be caught by the default: case of the switch
14154 PERL_ARGS_ASSERT_ADD_ABOVE_LATIN1_FOLDS;
14156 assert(HAS_NONLATIN1_SIMPLE_FOLD_CLOSURE(cp));
14162 add_cp_to_invlist(*invlist, KELVIN_SIGN);
14166 *invlist = add_cp_to_invlist(*invlist, LATIN_SMALL_LETTER_LONG_S);
14169 *invlist = add_cp_to_invlist(*invlist, GREEK_CAPITAL_LETTER_MU);
14170 *invlist = add_cp_to_invlist(*invlist, GREEK_SMALL_LETTER_MU);
14172 case LATIN_CAPITAL_LETTER_A_WITH_RING_ABOVE:
14173 case LATIN_SMALL_LETTER_A_WITH_RING_ABOVE:
14174 *invlist = add_cp_to_invlist(*invlist, ANGSTROM_SIGN);
14176 case LATIN_SMALL_LETTER_Y_WITH_DIAERESIS:
14177 *invlist = add_cp_to_invlist(*invlist,
14178 LATIN_CAPITAL_LETTER_Y_WITH_DIAERESIS);
14181 #ifdef LATIN_CAPITAL_LETTER_SHARP_S /* not defined in early Unicode releases */
14183 case LATIN_SMALL_LETTER_SHARP_S:
14184 *invlist = add_cp_to_invlist(*invlist, LATIN_CAPITAL_LETTER_SHARP_S);
14189 #if UNICODE_MAJOR_VERSION < 3 \
14190 || (UNICODE_MAJOR_VERSION == 3 && UNICODE_DOT_VERSION == 0)
14192 /* In 3.0 and earlier, U+0130 folded simply to 'i'; and in 3.0.1 so did
14197 add_cp_to_invlist(*invlist, LATIN_CAPITAL_LETTER_I_WITH_DOT_ABOVE);
14198 # if UNICODE_DOT_DOT_VERSION == 1
14199 *invlist = add_cp_to_invlist(*invlist, LATIN_SMALL_LETTER_DOTLESS_I);
14205 /* Use deprecated warning to increase the chances of this being
14208 ckWARN2reg_d(RExC_parse, "Perl folding rules are not up-to-date for 0x%02X; please use the perlbug utility to report;", cp);
14215 S_add_multi_match(pTHX_ AV* multi_char_matches, SV* multi_string, const STRLEN cp_count)
14217 /* This adds the string scalar <multi_string> to the array
14218 * <multi_char_matches>. <multi_string> is known to have exactly
14219 * <cp_count> code points in it. This is used when constructing a
14220 * bracketed character class and we find something that needs to match more
14221 * than a single character.
14223 * <multi_char_matches> is actually an array of arrays. Each top-level
14224 * element is an array that contains all the strings known so far that are
14225 * the same length. And that length (in number of code points) is the same
14226 * as the index of the top-level array. Hence, the [2] element is an
14227 * array, each element thereof is a string containing TWO code points;
14228 * while element [3] is for strings of THREE characters, and so on. Since
14229 * this is for multi-char strings there can never be a [0] nor [1] element.
14231 * When we rewrite the character class below, we will do so such that the
14232 * longest strings are written first, so that it prefers the longest
14233 * matching strings first. This is done even if it turns out that any
14234 * quantifier is non-greedy, out of this programmer's (khw) laziness. Tom
14235 * Christiansen has agreed that this is ok. This makes the test for the
14236 * ligature 'ffi' come before the test for 'ff', for example */
14239 AV** this_array_ptr;
14241 PERL_ARGS_ASSERT_ADD_MULTI_MATCH;
14243 if (! multi_char_matches) {
14244 multi_char_matches = newAV();
14247 if (av_exists(multi_char_matches, cp_count)) {
14248 this_array_ptr = (AV**) av_fetch(multi_char_matches, cp_count, FALSE);
14249 this_array = *this_array_ptr;
14252 this_array = newAV();
14253 av_store(multi_char_matches, cp_count,
14256 av_push(this_array, multi_string);
14258 return multi_char_matches;
14261 /* The names of properties whose definitions are not known at compile time are
14262 * stored in this SV, after a constant heading. So if the length has been
14263 * changed since initialization, then there is a run-time definition. */
14264 #define HAS_NONLOCALE_RUNTIME_PROPERTY_DEFINITION \
14265 (SvCUR(listsv) != initial_listsv_len)
14267 /* There is a restricted set of white space characters that are legal when
14268 * ignoring white space in a bracketed character class. This generates the
14269 * code to skip them.
14271 * There is a line below that uses the same white space criteria but is outside
14272 * this macro. Both here and there must use the same definition */
14273 #define SKIP_BRACKETED_WHITE_SPACE(do_skip, p) \
14276 while ( p < RExC_end \
14277 && isBLANK_A(UCHARAT(p))) \
14285 S_regclass(pTHX_ RExC_state_t *pRExC_state, I32 *flagp, U32 depth,
14286 const bool stop_at_1, /* Just parse the next thing, don't
14287 look for a full character class */
14288 bool allow_multi_folds,
14289 const bool silence_non_portable, /* Don't output warnings
14293 bool optimizable, /* ? Allow a non-ANYOF return
14295 SV** ret_invlist /* Return an inversion list, not a node */
14298 /* parse a bracketed class specification. Most of these will produce an
14299 * ANYOF node; but something like [a] will produce an EXACT node; [aA], an
14300 * EXACTFish node; [[:ascii:]], a POSIXA node; etc. It is more complex
14301 * under /i with multi-character folds: it will be rewritten following the
14302 * paradigm of this example, where the <multi-fold>s are characters which
14303 * fold to multiple character sequences:
14304 * /[abc\x{multi-fold1}def\x{multi-fold2}ghi]/i
14305 * gets effectively rewritten as:
14306 * /(?:\x{multi-fold1}|\x{multi-fold2}|[abcdefghi]/i
14307 * reg() gets called (recursively) on the rewritten version, and this
14308 * function will return what it constructs. (Actually the <multi-fold>s
14309 * aren't physically removed from the [abcdefghi], it's just that they are
14310 * ignored in the recursion by means of a flag:
14311 * <RExC_in_multi_char_class>.)
14313 * ANYOF nodes contain a bit map for the first NUM_ANYOF_CODE_POINTS
14314 * characters, with the corresponding bit set if that character is in the
14315 * list. For characters above this, a range list or swash is used. There
14316 * are extra bits for \w, etc. in locale ANYOFs, as what these match is not
14317 * determinable at compile time
14319 * Returns NULL, setting *flagp to RESTART_PASS1 if the sizing scan needs
14320 * to be restarted, or'd with NEED_UTF8 if the pattern needs to be upgraded
14321 * to UTF-8. This can only happen if ret_invlist is non-NULL.
14324 UV prevvalue = OOB_UNICODE, save_prevvalue = OOB_UNICODE;
14326 UV value = OOB_UNICODE, save_value = OOB_UNICODE;
14329 IV namedclass = OOB_NAMEDCLASS;
14330 char *rangebegin = NULL;
14331 bool need_class = 0;
14333 STRLEN initial_listsv_len = 0; /* Kind of a kludge to see if it is more
14334 than just initialized. */
14335 SV* properties = NULL; /* Code points that match \p{} \P{} */
14336 SV* posixes = NULL; /* Code points that match classes like [:word:],
14337 extended beyond the Latin1 range. These have to
14338 be kept separate from other code points for much
14339 of this function because their handling is
14340 different under /i, and for most classes under
14342 SV* nposixes = NULL; /* Similarly for [:^word:]. These are kept
14343 separate for a while from the non-complemented
14344 versions because of complications with /d
14346 SV* simple_posixes = NULL; /* But under some conditions, the classes can be
14347 treated more simply than the general case,
14348 leading to less compilation and execution
14350 UV element_count = 0; /* Number of distinct elements in the class.
14351 Optimizations may be possible if this is tiny */
14352 AV * multi_char_matches = NULL; /* Code points that fold to more than one
14353 character; used under /i */
14355 char * stop_ptr = RExC_end; /* where to stop parsing */
14356 const bool skip_white = cBOOL(ret_invlist); /* ignore unescaped white
14359 /* Unicode properties are stored in a swash; this holds the current one
14360 * being parsed. If this swash is the only above-latin1 component of the
14361 * character class, an optimization is to pass it directly on to the
14362 * execution engine. Otherwise, it is set to NULL to indicate that there
14363 * are other things in the class that have to be dealt with at execution
14365 SV* swash = NULL; /* Code points that match \p{} \P{} */
14367 /* Set if a component of this character class is user-defined; just passed
14368 * on to the engine */
14369 bool has_user_defined_property = FALSE;
14371 /* inversion list of code points this node matches only when the target
14372 * string is in UTF-8. (Because is under /d) */
14373 SV* depends_list = NULL;
14375 /* Inversion list of code points this node matches regardless of things
14376 * like locale, folding, utf8ness of the target string */
14377 SV* cp_list = NULL;
14379 /* Like cp_list, but code points on this list need to be checked for things
14380 * that fold to/from them under /i */
14381 SV* cp_foldable_list = NULL;
14383 /* Like cp_list, but code points on this list are valid only when the
14384 * runtime locale is UTF-8 */
14385 SV* only_utf8_locale_list = NULL;
14387 /* In a range, if one of the endpoints is non-character-set portable,
14388 * meaning that it hard-codes a code point that may mean a different
14389 * charactger in ASCII vs. EBCDIC, as opposed to, say, a literal 'A' or a
14390 * mnemonic '\t' which each mean the same character no matter which
14391 * character set the platform is on. */
14392 unsigned int non_portable_endpoint = 0;
14394 /* Is the range unicode? which means on a platform that isn't 1-1 native
14395 * to Unicode (i.e. non-ASCII), each code point in it should be considered
14396 * to be a Unicode value. */
14397 bool unicode_range = FALSE;
14398 bool invert = FALSE; /* Is this class to be complemented */
14400 bool warn_super = ALWAYS_WARN_SUPER;
14402 regnode * const orig_emit = RExC_emit; /* Save the original RExC_emit in
14403 case we need to change the emitted regop to an EXACT. */
14404 const char * orig_parse = RExC_parse;
14405 const SSize_t orig_size = RExC_size;
14406 bool posixl_matches_all = FALSE; /* Does /l class have both e.g. \W,\w ? */
14407 GET_RE_DEBUG_FLAGS_DECL;
14409 PERL_ARGS_ASSERT_REGCLASS;
14411 PERL_UNUSED_ARG(depth);
14414 DEBUG_PARSE("clas");
14416 #if UNICODE_MAJOR_VERSION < 3 /* no multifolds in early Unicode */ \
14417 || (UNICODE_MAJOR_VERSION == 3 && UNICODE_DOT_VERSION == 0 \
14418 && UNICODE_DOT_DOT_VERSION == 0)
14419 allow_multi_folds = FALSE;
14422 /* Assume we are going to generate an ANYOF node. */
14423 ret = reganode(pRExC_state,
14426 : (DEPENDS_SEMANTICS)
14432 RExC_size += ANYOF_SKIP;
14433 listsv = &PL_sv_undef; /* For code scanners: listsv always non-NULL. */
14436 ANYOF_FLAGS(ret) = 0;
14438 RExC_emit += ANYOF_SKIP;
14439 listsv = newSVpvs_flags("# comment\n", SVs_TEMP);
14440 initial_listsv_len = SvCUR(listsv);
14441 SvTEMP_off(listsv); /* Grr, TEMPs and mortals are conflated. */
14444 SKIP_BRACKETED_WHITE_SPACE(skip_white, RExC_parse);
14446 if (UCHARAT(RExC_parse) == '^') { /* Complement of range. */
14449 allow_multi_folds = FALSE;
14451 SKIP_BRACKETED_WHITE_SPACE(skip_white, RExC_parse);
14454 /* Check that they didn't say [:posix:] instead of [[:posix:]] */
14455 if (!SIZE_ONLY && RExC_parse < RExC_end && POSIXCC(UCHARAT(RExC_parse))) {
14456 const char *s = RExC_parse;
14457 const char c = *s++;
14462 while (isWORDCHAR(*s))
14464 if (*s && c == *s && s[1] == ']') {
14465 SAVEFREESV(RExC_rx_sv);
14467 "POSIX syntax [%c %c] belongs inside character classes",
14469 (void)ReREFCNT_inc(RExC_rx_sv);
14473 /* If the caller wants us to just parse a single element, accomplish this
14474 * by faking the loop ending condition */
14475 if (stop_at_1 && RExC_end > RExC_parse) {
14476 stop_ptr = RExC_parse + 1;
14479 /* allow 1st char to be ']' (allowing it to be '-' is dealt with later) */
14480 if (UCHARAT(RExC_parse) == ']')
14481 goto charclassloop;
14484 if (RExC_parse >= stop_ptr) {
14488 SKIP_BRACKETED_WHITE_SPACE(skip_white, RExC_parse);
14490 if (UCHARAT(RExC_parse) == ']') {
14496 namedclass = OOB_NAMEDCLASS; /* initialize as illegal */
14497 save_value = value;
14498 save_prevvalue = prevvalue;
14501 rangebegin = RExC_parse;
14503 non_portable_endpoint = 0;
14506 value = utf8n_to_uvchr((U8*)RExC_parse,
14507 RExC_end - RExC_parse,
14508 &numlen, UTF8_ALLOW_DEFAULT);
14509 RExC_parse += numlen;
14512 value = UCHARAT(RExC_parse++);
14515 && RExC_parse < RExC_end
14516 && POSIXCC(UCHARAT(RExC_parse)))
14518 namedclass = regpposixcc(pRExC_state, value, strict);
14520 else if (value == '\\') {
14521 /* Is a backslash; get the code point of the char after it */
14522 if (UTF && ! UTF8_IS_INVARIANT(UCHARAT(RExC_parse))) {
14523 value = utf8n_to_uvchr((U8*)RExC_parse,
14524 RExC_end - RExC_parse,
14525 &numlen, UTF8_ALLOW_DEFAULT);
14526 RExC_parse += numlen;
14529 value = UCHARAT(RExC_parse++);
14531 /* Some compilers cannot handle switching on 64-bit integer
14532 * values, therefore value cannot be an UV. Yes, this will
14533 * be a problem later if we want switch on Unicode.
14534 * A similar issue a little bit later when switching on
14535 * namedclass. --jhi */
14537 /* If the \ is escaping white space when white space is being
14538 * skipped, it means that that white space is wanted literally, and
14539 * is already in 'value'. Otherwise, need to translate the escape
14540 * into what it signifies. */
14541 if (! skip_white || ! isBLANK_A(value)) switch ((I32)value) {
14543 case 'w': namedclass = ANYOF_WORDCHAR; break;
14544 case 'W': namedclass = ANYOF_NWORDCHAR; break;
14545 case 's': namedclass = ANYOF_SPACE; break;
14546 case 'S': namedclass = ANYOF_NSPACE; break;
14547 case 'd': namedclass = ANYOF_DIGIT; break;
14548 case 'D': namedclass = ANYOF_NDIGIT; break;
14549 case 'v': namedclass = ANYOF_VERTWS; break;
14550 case 'V': namedclass = ANYOF_NVERTWS; break;
14551 case 'h': namedclass = ANYOF_HORIZWS; break;
14552 case 'H': namedclass = ANYOF_NHORIZWS; break;
14553 case 'N': /* Handle \N{NAME} in class */
14555 const char * const backslash_N_beg = RExC_parse - 2;
14558 if (! grok_bslash_N(pRExC_state,
14559 NULL, /* No regnode */
14560 &value, /* Yes single value */
14561 &cp_count, /* Multiple code pt count */
14566 if (*flagp & NEED_UTF8)
14567 FAIL("panic: grok_bslash_N set NEED_UTF8");
14568 if (*flagp & RESTART_PASS1)
14571 if (cp_count < 0) {
14572 vFAIL("\\N in a character class must be a named character: \\N{...}");
14574 else if (cp_count == 0) {
14576 RExC_parse++; /* Position after the "}" */
14577 vFAIL("Zero length \\N{}");
14580 ckWARNreg(RExC_parse,
14581 "Ignoring zero length \\N{} in character class");
14584 else { /* cp_count > 1 */
14585 if (! RExC_in_multi_char_class) {
14586 if (invert || range || *RExC_parse == '-') {
14589 vFAIL("\\N{} in inverted character class or as a range end-point is restricted to one character");
14592 ckWARNreg(RExC_parse, "Using just the first character returned by \\N{} in character class");
14594 break; /* <value> contains the first code
14595 point. Drop out of the switch to
14599 SV * multi_char_N = newSVpvn(backslash_N_beg,
14600 RExC_parse - backslash_N_beg);
14602 = add_multi_match(multi_char_matches,
14607 } /* End of cp_count != 1 */
14609 /* This element should not be processed further in this
14612 value = save_value;
14613 prevvalue = save_prevvalue;
14614 continue; /* Back to top of loop to get next char */
14617 /* Here, is a single code point, and <value> contains it */
14618 unicode_range = TRUE; /* \N{} are Unicode */
14626 /* We will handle any undefined properties ourselves */
14627 U8 swash_init_flags = _CORE_SWASH_INIT_RETURN_IF_UNDEF
14628 /* And we actually would prefer to get
14629 * the straight inversion list of the
14630 * swash, since we will be accessing it
14631 * anyway, to save a little time */
14632 |_CORE_SWASH_INIT_ACCEPT_INVLIST;
14634 if (RExC_parse >= RExC_end)
14635 vFAIL2("Empty \\%c{}", (U8)value);
14636 if (*RExC_parse == '{') {
14637 const U8 c = (U8)value;
14638 e = strchr(RExC_parse, '}');
14641 vFAIL2("Missing right brace on \\%c{}", c);
14645 while (isSPACE(*RExC_parse)) {
14649 if (UCHARAT(RExC_parse) == '^') {
14651 /* toggle. (The rhs xor gets the single bit that
14652 * differs between P and p; the other xor inverts just
14654 value ^= 'P' ^ 'p';
14657 while (isSPACE(*RExC_parse)) {
14662 if (e == RExC_parse)
14663 vFAIL2("Empty \\%c{}", c);
14665 n = e - RExC_parse;
14666 while (isSPACE(*(RExC_parse + n - 1)))
14668 } /* The \p isn't immediately followed by a '{' */
14669 else if (! isALPHA(*RExC_parse)) {
14670 RExC_parse += (UTF) ? UTF8SKIP(RExC_parse) : 1;
14671 vFAIL2("Character following \\%c must be '{' or a "
14672 "single-character Unicode property name",
14682 char* base_name; /* name after any packages are stripped */
14683 const char * const colon_colon = "::";
14685 /* Try to get the definition of the property into
14686 * <invlist>. If /i is in effect, the effective property
14687 * will have its name be <__NAME_i>. The design is
14688 * discussed in commit
14689 * 2f833f5208e26b208886e51e09e2c072b5eabb46 */
14690 name = savepv(Perl_form(aTHX_
14692 (FOLD) ? "__" : "",
14698 /* Look up the property name, and get its swash and
14699 * inversion list, if the property is found */
14700 if (swash) { /* Return any left-overs */
14701 SvREFCNT_dec_NN(swash);
14703 swash = _core_swash_init("utf8", name, &PL_sv_undef,
14706 NULL, /* No inversion list */
14709 if (! swash || ! (invlist = _get_swash_invlist(swash))) {
14710 HV* curpkg = (IN_PERL_COMPILETIME)
14712 : CopSTASH(PL_curcop);
14716 if (swash) { /* Got a swash but no inversion list.
14717 Something is likely wrong that will
14718 be sorted-out later */
14719 SvREFCNT_dec_NN(swash);
14723 /* Here didn't find it. It could be a an error (like a
14724 * typo) in specifying a Unicode property, or it could
14725 * be a user-defined property that will be available at
14726 * run-time. The names of these must begin with 'In'
14727 * or 'Is' (after any packages are stripped off). So
14728 * if not one of those, or if we accept only
14729 * compile-time properties, is an error; otherwise add
14730 * it to the list for run-time look up. */
14731 if ((base_name = rninstr(name, name + n,
14732 colon_colon, colon_colon + 2)))
14733 { /* Has ::. We know this must be a user-defined
14736 final_n -= base_name - name;
14745 || base_name[0] != 'I'
14746 || (base_name[1] != 's' && base_name[1] != 'n')
14749 const char * const msg
14751 ? "Illegal user-defined property name"
14752 : "Can't find Unicode property definition";
14753 RExC_parse = e + 1;
14755 /* diag_listed_as: Can't find Unicode property definition "%s" */
14756 vFAIL3utf8f("%s \"%"UTF8f"\"",
14757 msg, UTF8fARG(UTF, n, name));
14760 /* If the property name doesn't already have a package
14761 * name, add the current one to it so that it can be
14762 * referred to outside it. [perl #121777] */
14763 if (! has_pkg && curpkg) {
14764 char* pkgname = HvNAME(curpkg);
14765 if (strNE(pkgname, "main")) {
14766 char* full_name = Perl_form(aTHX_
14770 n = strlen(full_name);
14772 name = savepvn(full_name, n);
14775 Perl_sv_catpvf(aTHX_ listsv, "%cutf8::%"UTF8f"\n",
14776 (value == 'p' ? '+' : '!'),
14777 UTF8fARG(UTF, n, name));
14778 has_user_defined_property = TRUE;
14779 optimizable = FALSE; /* Will have to leave this an
14782 /* We don't know yet, so have to assume that the
14783 * property could match something in the upper Latin1
14784 * range, hence something that isn't utf8. Note that
14785 * this would cause things in <depends_list> to match
14786 * inappropriately, except that any \p{}, including
14787 * this one forces Unicode semantics, which means there
14788 * is no <depends_list> */
14790 |= ANYOF_HAS_NONBITMAP_NON_UTF8_MATCHES;
14794 /* Here, did get the swash and its inversion list. If
14795 * the swash is from a user-defined property, then this
14796 * whole character class should be regarded as such */
14797 if (swash_init_flags
14798 & _CORE_SWASH_INIT_USER_DEFINED_PROPERTY)
14800 has_user_defined_property = TRUE;
14803 /* We warn on matching an above-Unicode code point
14804 * if the match would return true, except don't
14805 * warn for \p{All}, which has exactly one element
14807 (_invlist_contains_cp(invlist, 0x110000)
14808 && (! (_invlist_len(invlist) == 1
14809 && *invlist_array(invlist) == 0)))
14815 /* Invert if asking for the complement */
14816 if (value == 'P') {
14817 _invlist_union_complement_2nd(properties,
14821 /* The swash can't be used as-is, because we've
14822 * inverted things; delay removing it to here after
14823 * have copied its invlist above */
14824 SvREFCNT_dec_NN(swash);
14828 _invlist_union(properties, invlist, &properties);
14833 RExC_parse = e + 1;
14834 namedclass = ANYOF_UNIPROP; /* no official name, but it's
14837 /* \p means they want Unicode semantics */
14838 REQUIRE_UNI_RULES(flagp, NULL);
14841 case 'n': value = '\n'; break;
14842 case 'r': value = '\r'; break;
14843 case 't': value = '\t'; break;
14844 case 'f': value = '\f'; break;
14845 case 'b': value = '\b'; break;
14846 case 'e': value = ESC_NATIVE; break;
14847 case 'a': value = '\a'; break;
14849 RExC_parse--; /* function expects to be pointed at the 'o' */
14851 const char* error_msg;
14852 bool valid = grok_bslash_o(&RExC_parse,
14855 PASS2, /* warnings only in
14858 silence_non_portable,
14864 non_portable_endpoint++;
14865 if (IN_ENCODING && value < 0x100) {
14866 goto recode_encoding;
14870 RExC_parse--; /* function expects to be pointed at the 'x' */
14872 const char* error_msg;
14873 bool valid = grok_bslash_x(&RExC_parse,
14876 PASS2, /* Output warnings */
14878 silence_non_portable,
14884 non_portable_endpoint++;
14885 if (IN_ENCODING && value < 0x100)
14886 goto recode_encoding;
14889 value = grok_bslash_c(*RExC_parse++, PASS2);
14890 non_portable_endpoint++;
14892 case '0': case '1': case '2': case '3': case '4':
14893 case '5': case '6': case '7':
14895 /* Take 1-3 octal digits */
14896 I32 flags = PERL_SCAN_SILENT_ILLDIGIT;
14897 numlen = (strict) ? 4 : 3;
14898 value = grok_oct(--RExC_parse, &numlen, &flags, NULL);
14899 RExC_parse += numlen;
14902 RExC_parse += (UTF) ? UTF8SKIP(RExC_parse) : 1;
14903 vFAIL("Need exactly 3 octal digits");
14905 else if (! SIZE_ONLY /* like \08, \178 */
14907 && RExC_parse < RExC_end
14908 && isDIGIT(*RExC_parse)
14909 && ckWARN(WARN_REGEXP))
14911 SAVEFREESV(RExC_rx_sv);
14912 reg_warn_non_literal_string(
14914 form_short_octal_warning(RExC_parse, numlen));
14915 (void)ReREFCNT_inc(RExC_rx_sv);
14918 non_portable_endpoint++;
14919 if (IN_ENCODING && value < 0x100)
14920 goto recode_encoding;
14924 if (! RExC_override_recoding) {
14925 SV* enc = _get_encoding();
14926 value = reg_recode((U8)value, &enc);
14929 vFAIL("Invalid escape in the specified encoding");
14932 ckWARNreg(RExC_parse,
14933 "Invalid escape in the specified encoding");
14939 /* Allow \_ to not give an error */
14940 if (!SIZE_ONLY && isWORDCHAR(value) && value != '_') {
14942 vFAIL2("Unrecognized escape \\%c in character class",
14946 SAVEFREESV(RExC_rx_sv);
14947 ckWARN2reg(RExC_parse,
14948 "Unrecognized escape \\%c in character class passed through",
14950 (void)ReREFCNT_inc(RExC_rx_sv);
14954 } /* End of switch on char following backslash */
14955 } /* end of handling backslash escape sequences */
14957 /* Here, we have the current token in 'value' */
14959 if (namedclass > OOB_NAMEDCLASS) { /* this is a named class \blah */
14962 /* a bad range like a-\d, a-[:digit:]. The '-' is taken as a
14963 * literal, as is the character that began the false range, i.e.
14964 * the 'a' in the examples */
14967 const int w = (RExC_parse >= rangebegin)
14968 ? RExC_parse - rangebegin
14972 "False [] range \"%"UTF8f"\"",
14973 UTF8fARG(UTF, w, rangebegin));
14976 SAVEFREESV(RExC_rx_sv); /* in case of fatal warnings */
14977 ckWARN2reg(RExC_parse,
14978 "False [] range \"%"UTF8f"\"",
14979 UTF8fARG(UTF, w, rangebegin));
14980 (void)ReREFCNT_inc(RExC_rx_sv);
14981 cp_list = add_cp_to_invlist(cp_list, '-');
14982 cp_foldable_list = add_cp_to_invlist(cp_foldable_list,
14987 range = 0; /* this was not a true range */
14988 element_count += 2; /* So counts for three values */
14991 classnum = namedclass_to_classnum(namedclass);
14993 if (LOC && namedclass < ANYOF_POSIXL_MAX
14994 #ifndef HAS_ISASCII
14995 && classnum != _CC_ASCII
14998 /* What the Posix classes (like \w, [:space:]) match in locale
14999 * isn't knowable under locale until actual match time. Room
15000 * must be reserved (one time per outer bracketed class) to
15001 * store such classes. The space will contain a bit for each
15002 * named class that is to be matched against. This isn't
15003 * needed for \p{} and pseudo-classes, as they are not affected
15004 * by locale, and hence are dealt with separately */
15005 if (! need_class) {
15008 RExC_size += ANYOF_POSIXL_SKIP - ANYOF_SKIP;
15011 RExC_emit += ANYOF_POSIXL_SKIP - ANYOF_SKIP;
15013 ANYOF_FLAGS(ret) |= ANYOF_MATCHES_POSIXL;
15014 ANYOF_POSIXL_ZERO(ret);
15016 /* We can't change this into some other type of node
15017 * (unless this is the only element, in which case there
15018 * are nodes that mean exactly this) as has runtime
15020 optimizable = FALSE;
15023 /* Coverity thinks it is possible for this to be negative; both
15024 * jhi and khw think it's not, but be safer */
15025 assert(! (ANYOF_FLAGS(ret) & ANYOF_MATCHES_POSIXL)
15026 || (namedclass + ((namedclass % 2) ? -1 : 1)) >= 0);
15028 /* See if it already matches the complement of this POSIX
15030 if ((ANYOF_FLAGS(ret) & ANYOF_MATCHES_POSIXL)
15031 && ANYOF_POSIXL_TEST(ret, namedclass + ((namedclass % 2)
15035 posixl_matches_all = TRUE;
15036 break; /* No need to continue. Since it matches both
15037 e.g., \w and \W, it matches everything, and the
15038 bracketed class can be optimized into qr/./s */
15041 /* Add this class to those that should be checked at runtime */
15042 ANYOF_POSIXL_SET(ret, namedclass);
15044 /* The above-Latin1 characters are not subject to locale rules.
15045 * Just add them, in the second pass, to the
15046 * unconditionally-matched list */
15048 SV* scratch_list = NULL;
15050 /* Get the list of the above-Latin1 code points this
15052 _invlist_intersection_maybe_complement_2nd(PL_AboveLatin1,
15053 PL_XPosix_ptrs[classnum],
15055 /* Odd numbers are complements, like
15056 * NDIGIT, NASCII, ... */
15057 namedclass % 2 != 0,
15059 /* Checking if 'cp_list' is NULL first saves an extra
15060 * clone. Its reference count will be decremented at the
15061 * next union, etc, or if this is the only instance, at the
15062 * end of the routine */
15064 cp_list = scratch_list;
15067 _invlist_union(cp_list, scratch_list, &cp_list);
15068 SvREFCNT_dec_NN(scratch_list);
15070 continue; /* Go get next character */
15073 else if (! SIZE_ONLY) {
15075 /* Here, not in pass1 (in that pass we skip calculating the
15076 * contents of this class), and is /l, or is a POSIX class for
15077 * which /l doesn't matter (or is a Unicode property, which is
15078 * skipped here). */
15079 if (namedclass >= ANYOF_POSIXL_MAX) { /* If a special class */
15080 if (namedclass != ANYOF_UNIPROP) { /* UNIPROP = \p and \P */
15082 /* Here, should be \h, \H, \v, or \V. None of /d, /i
15083 * nor /l make a difference in what these match,
15084 * therefore we just add what they match to cp_list. */
15085 if (classnum != _CC_VERTSPACE) {
15086 assert( namedclass == ANYOF_HORIZWS
15087 || namedclass == ANYOF_NHORIZWS);
15089 /* It turns out that \h is just a synonym for
15091 classnum = _CC_BLANK;
15094 _invlist_union_maybe_complement_2nd(
15096 PL_XPosix_ptrs[classnum],
15097 namedclass % 2 != 0, /* Complement if odd
15098 (NHORIZWS, NVERTWS)
15103 else if (UNI_SEMANTICS
15104 || classnum == _CC_ASCII
15105 || (DEPENDS_SEMANTICS && (classnum == _CC_DIGIT
15106 || classnum == _CC_XDIGIT)))
15108 /* We usually have to worry about /d and /a affecting what
15109 * POSIX classes match, with special code needed for /d
15110 * because we won't know until runtime what all matches.
15111 * But there is no extra work needed under /u, and
15112 * [:ascii:] is unaffected by /a and /d; and :digit: and
15113 * :xdigit: don't have runtime differences under /d. So we
15114 * can special case these, and avoid some extra work below,
15115 * and at runtime. */
15116 _invlist_union_maybe_complement_2nd(
15118 PL_XPosix_ptrs[classnum],
15119 namedclass % 2 != 0,
15122 else { /* Garden variety class. If is NUPPER, NALPHA, ...
15123 complement and use nposixes */
15124 SV** posixes_ptr = namedclass % 2 == 0
15127 _invlist_union_maybe_complement_2nd(
15129 PL_XPosix_ptrs[classnum],
15130 namedclass % 2 != 0,
15134 } /* end of namedclass \blah */
15136 SKIP_BRACKETED_WHITE_SPACE(skip_white, RExC_parse);
15138 /* If 'range' is set, 'value' is the ending of a range--check its
15139 * validity. (If value isn't a single code point in the case of a
15140 * range, we should have figured that out above in the code that
15141 * catches false ranges). Later, we will handle each individual code
15142 * point in the range. If 'range' isn't set, this could be the
15143 * beginning of a range, so check for that by looking ahead to see if
15144 * the next real character to be processed is the range indicator--the
15149 /* For unicode ranges, we have to test that the Unicode as opposed
15150 * to the native values are not decreasing. (Above 255, there is
15151 * no difference between native and Unicode) */
15152 if (unicode_range && prevvalue < 255 && value < 255) {
15153 if (NATIVE_TO_LATIN1(prevvalue) > NATIVE_TO_LATIN1(value)) {
15154 goto backwards_range;
15159 if (prevvalue > value) /* b-a */ {
15164 w = RExC_parse - rangebegin;
15166 "Invalid [] range \"%"UTF8f"\"",
15167 UTF8fARG(UTF, w, rangebegin));
15168 NOT_REACHED; /* NOTREACHED */
15172 prevvalue = value; /* save the beginning of the potential range */
15173 if (! stop_at_1 /* Can't be a range if parsing just one thing */
15174 && *RExC_parse == '-')
15176 char* next_char_ptr = RExC_parse + 1;
15178 /* Get the next real char after the '-' */
15179 SKIP_BRACKETED_WHITE_SPACE(skip_white, next_char_ptr);
15181 /* If the '-' is at the end of the class (just before the ']',
15182 * it is a literal minus; otherwise it is a range */
15183 if (next_char_ptr < RExC_end && *next_char_ptr != ']') {
15184 RExC_parse = next_char_ptr;
15186 /* a bad range like \w-, [:word:]- ? */
15187 if (namedclass > OOB_NAMEDCLASS) {
15188 if (strict || (PASS2 && ckWARN(WARN_REGEXP))) {
15189 const int w = RExC_parse >= rangebegin
15190 ? RExC_parse - rangebegin
15193 vFAIL4("False [] range \"%*.*s\"",
15198 "False [] range \"%*.*s\"",
15203 cp_list = add_cp_to_invlist(cp_list, '-');
15207 range = 1; /* yeah, it's a range! */
15208 continue; /* but do it the next time */
15213 if (namedclass > OOB_NAMEDCLASS) {
15217 /* Here, we have a single value this time through the loop, and
15218 * <prevvalue> is the beginning of the range, if any; or <value> if
15221 /* non-Latin1 code point implies unicode semantics. Must be set in
15222 * pass1 so is there for the whole of pass 2 */
15224 REQUIRE_UNI_RULES(flagp, NULL);
15227 /* Ready to process either the single value, or the completed range.
15228 * For single-valued non-inverted ranges, we consider the possibility
15229 * of multi-char folds. (We made a conscious decision to not do this
15230 * for the other cases because it can often lead to non-intuitive
15231 * results. For example, you have the peculiar case that:
15232 * "s s" =~ /^[^\xDF]+$/i => Y
15233 * "ss" =~ /^[^\xDF]+$/i => N
15235 * See [perl #89750] */
15236 if (FOLD && allow_multi_folds && value == prevvalue) {
15237 if (value == LATIN_SMALL_LETTER_SHARP_S
15238 || (value > 255 && _invlist_contains_cp(PL_HasMultiCharFold,
15241 /* Here <value> is indeed a multi-char fold. Get what it is */
15243 U8 foldbuf[UTF8_MAXBYTES_CASE];
15246 UV folded = _to_uni_fold_flags(
15250 FOLD_FLAGS_FULL | (ASCII_FOLD_RESTRICTED
15251 ? FOLD_FLAGS_NOMIX_ASCII
15255 /* Here, <folded> should be the first character of the
15256 * multi-char fold of <value>, with <foldbuf> containing the
15257 * whole thing. But, if this fold is not allowed (because of
15258 * the flags), <fold> will be the same as <value>, and should
15259 * be processed like any other character, so skip the special
15261 if (folded != value) {
15263 /* Skip if we are recursed, currently parsing the class
15264 * again. Otherwise add this character to the list of
15265 * multi-char folds. */
15266 if (! RExC_in_multi_char_class) {
15267 STRLEN cp_count = utf8_length(foldbuf,
15268 foldbuf + foldlen);
15269 SV* multi_fold = sv_2mortal(newSVpvs(""));
15271 Perl_sv_catpvf(aTHX_ multi_fold, "\\x{%"UVXf"}", value);
15274 = add_multi_match(multi_char_matches,
15280 /* This element should not be processed further in this
15283 value = save_value;
15284 prevvalue = save_prevvalue;
15290 if (strict && PASS2 && ckWARN(WARN_REGEXP)) {
15293 /* If the range starts above 255, everything is portable and
15294 * likely to be so for any forseeable character set, so don't
15296 if (unicode_range && non_portable_endpoint && prevvalue < 256) {
15297 vWARN(RExC_parse, "Both or neither range ends should be Unicode");
15299 else if (prevvalue != value) {
15301 /* Under strict, ranges that stop and/or end in an ASCII
15302 * printable should have each end point be a portable value
15303 * for it (preferably like 'A', but we don't warn if it is
15304 * a (portable) Unicode name or code point), and the range
15305 * must be be all digits or all letters of the same case.
15306 * Otherwise, the range is non-portable and unclear as to
15307 * what it contains */
15308 if ((isPRINT_A(prevvalue) || isPRINT_A(value))
15309 && (non_portable_endpoint
15310 || ! ((isDIGIT_A(prevvalue) && isDIGIT_A(value))
15311 || (isLOWER_A(prevvalue) && isLOWER_A(value))
15312 || (isUPPER_A(prevvalue) && isUPPER_A(value)))))
15314 vWARN(RExC_parse, "Ranges of ASCII printables should be some subset of \"0-9\", \"A-Z\", or \"a-z\"");
15316 else if (prevvalue >= 0x660) { /* ARABIC_INDIC_DIGIT_ZERO */
15318 /* But the nature of Unicode and languages mean we
15319 * can't do the same checks for above-ASCII ranges,
15320 * except in the case of digit ones. These should
15321 * contain only digits from the same group of 10. The
15322 * ASCII case is handled just above. 0x660 is the
15323 * first digit character beyond ASCII. Hence here, the
15324 * range could be a range of digits. Find out. */
15325 IV index_start = _invlist_search(PL_XPosix_ptrs[_CC_DIGIT],
15327 IV index_final = _invlist_search(PL_XPosix_ptrs[_CC_DIGIT],
15330 /* If the range start and final points are in the same
15331 * inversion list element, it means that either both
15332 * are not digits, or both are digits in a consecutive
15333 * sequence of digits. (So far, Unicode has kept all
15334 * such sequences as distinct groups of 10, but assert
15335 * to make sure). If the end points are not in the
15336 * same element, neither should be a digit. */
15337 if (index_start == index_final) {
15338 assert(! ELEMENT_RANGE_MATCHES_INVLIST(index_start)
15339 || (invlist_array(PL_XPosix_ptrs[_CC_DIGIT])[index_start+1]
15340 - invlist_array(PL_XPosix_ptrs[_CC_DIGIT])[index_start]
15342 /* But actually Unicode did have one group of 11
15343 * 'digits' in 5.2, so in case we are operating
15344 * on that version, let that pass */
15345 || (invlist_array(PL_XPosix_ptrs[_CC_DIGIT])[index_start+1]
15346 - invlist_array(PL_XPosix_ptrs[_CC_DIGIT])[index_start]
15348 && invlist_array(PL_XPosix_ptrs[_CC_DIGIT])[index_start]
15352 else if ((index_start >= 0
15353 && ELEMENT_RANGE_MATCHES_INVLIST(index_start))
15354 || (index_final >= 0
15355 && ELEMENT_RANGE_MATCHES_INVLIST(index_final)))
15357 vWARN(RExC_parse, "Ranges of digits should be from the same group of 10");
15362 if ((! range || prevvalue == value) && non_portable_endpoint) {
15363 if (isPRINT_A(value)) {
15366 if (isBACKSLASHED_PUNCT(value)) {
15367 literal[d++] = '\\';
15369 literal[d++] = (char) value;
15370 literal[d++] = '\0';
15373 "\"%.*s\" is more clearly written simply as \"%s\"",
15374 (int) (RExC_parse - rangebegin),
15379 else if isMNEMONIC_CNTRL(value) {
15381 "\"%.*s\" is more clearly written simply as \"%s\"",
15382 (int) (RExC_parse - rangebegin),
15384 cntrl_to_mnemonic((char) value)
15390 /* Deal with this element of the class */
15394 cp_foldable_list = _add_range_to_invlist(cp_foldable_list,
15397 /* On non-ASCII platforms, for ranges that span all of 0..255, and
15398 * ones that don't require special handling, we can just add the
15399 * range like we do for ASCII platforms */
15400 if ((UNLIKELY(prevvalue == 0) && value >= 255)
15401 || ! (prevvalue < 256
15403 || (! non_portable_endpoint
15404 && ((isLOWER_A(prevvalue) && isLOWER_A(value))
15405 || (isUPPER_A(prevvalue)
15406 && isUPPER_A(value)))))))
15408 cp_foldable_list = _add_range_to_invlist(cp_foldable_list,
15412 /* Here, requires special handling. This can be because it is
15413 * a range whose code points are considered to be Unicode, and
15414 * so must be individually translated into native, or because
15415 * its a subrange of 'A-Z' or 'a-z' which each aren't
15416 * contiguous in EBCDIC, but we have defined them to include
15417 * only the "expected" upper or lower case ASCII alphabetics.
15418 * Subranges above 255 are the same in native and Unicode, so
15419 * can be added as a range */
15420 U8 start = NATIVE_TO_LATIN1(prevvalue);
15422 U8 end = (value < 256) ? NATIVE_TO_LATIN1(value) : 255;
15423 for (j = start; j <= end; j++) {
15424 cp_foldable_list = add_cp_to_invlist(cp_foldable_list, LATIN1_TO_NATIVE(j));
15427 cp_foldable_list = _add_range_to_invlist(cp_foldable_list,
15434 range = 0; /* this range (if it was one) is done now */
15435 } /* End of loop through all the text within the brackets */
15437 /* If anything in the class expands to more than one character, we have to
15438 * deal with them by building up a substitute parse string, and recursively
15439 * calling reg() on it, instead of proceeding */
15440 if (multi_char_matches) {
15441 SV * substitute_parse = newSVpvn_flags("?:", 2, SVs_TEMP);
15444 char *save_end = RExC_end;
15445 char *save_parse = RExC_parse;
15446 bool first_time = TRUE; /* First multi-char occurrence doesn't get
15451 #if 0 /* Have decided not to deal with multi-char folds in inverted classes,
15452 because too confusing */
15454 sv_catpv(substitute_parse, "(?:");
15458 /* Look at the longest folds first */
15459 for (cp_count = av_tindex(multi_char_matches); cp_count > 0; cp_count--) {
15461 if (av_exists(multi_char_matches, cp_count)) {
15462 AV** this_array_ptr;
15465 this_array_ptr = (AV**) av_fetch(multi_char_matches,
15467 while ((this_sequence = av_pop(*this_array_ptr)) !=
15470 if (! first_time) {
15471 sv_catpv(substitute_parse, "|");
15473 first_time = FALSE;
15475 sv_catpv(substitute_parse, SvPVX(this_sequence));
15480 /* If the character class contains anything else besides these
15481 * multi-character folds, have to include it in recursive parsing */
15482 if (element_count) {
15483 sv_catpv(substitute_parse, "|[");
15484 sv_catpvn(substitute_parse, orig_parse, RExC_parse - orig_parse);
15485 sv_catpv(substitute_parse, "]");
15488 sv_catpv(substitute_parse, ")");
15491 /* This is a way to get the parse to skip forward a whole named
15492 * sequence instead of matching the 2nd character when it fails the
15494 sv_catpv(substitute_parse, "(*THEN)(*SKIP)(*FAIL)|.)");
15498 RExC_parse = SvPV(substitute_parse, len);
15499 RExC_end = RExC_parse + len;
15500 RExC_in_multi_char_class = 1;
15501 RExC_override_recoding = 1;
15502 RExC_emit = (regnode *)orig_emit;
15504 ret = reg(pRExC_state, 1, ®_flags, depth+1);
15506 *flagp |= reg_flags&(HASWIDTH|SIMPLE|SPSTART|POSTPONED|RESTART_PASS1|NEED_UTF8);
15508 RExC_parse = save_parse;
15509 RExC_end = save_end;
15510 RExC_in_multi_char_class = 0;
15511 RExC_override_recoding = 0;
15512 SvREFCNT_dec_NN(multi_char_matches);
15516 /* Here, we've gone through the entire class and dealt with multi-char
15517 * folds. We are now in a position that we can do some checks to see if we
15518 * can optimize this ANYOF node into a simpler one, even in Pass 1.
15519 * Currently we only do two checks:
15520 * 1) is in the unlikely event that the user has specified both, eg. \w and
15521 * \W under /l, then the class matches everything. (This optimization
15522 * is done only to make the optimizer code run later work.)
15523 * 2) if the character class contains only a single element (including a
15524 * single range), we see if there is an equivalent node for it.
15525 * Other checks are possible */
15527 && ! ret_invlist /* Can't optimize if returning the constructed
15529 && (UNLIKELY(posixl_matches_all) || element_count == 1))
15534 if (UNLIKELY(posixl_matches_all)) {
15537 else if (namedclass > OOB_NAMEDCLASS) { /* this is a named class, like
15538 \w or [:digit:] or \p{foo}
15541 /* All named classes are mapped into POSIXish nodes, with its FLAG
15542 * argument giving which class it is */
15543 switch ((I32)namedclass) {
15544 case ANYOF_UNIPROP:
15547 /* These don't depend on the charset modifiers. They always
15548 * match under /u rules */
15549 case ANYOF_NHORIZWS:
15550 case ANYOF_HORIZWS:
15551 namedclass = ANYOF_BLANK + namedclass - ANYOF_HORIZWS;
15554 case ANYOF_NVERTWS:
15559 /* The actual POSIXish node for all the rest depends on the
15560 * charset modifier. The ones in the first set depend only on
15561 * ASCII or, if available on this platform, also locale */
15565 op = (LOC) ? POSIXL : POSIXA;
15571 /* The following don't have any matches in the upper Latin1
15572 * range, hence /d is equivalent to /u for them. Making it /u
15573 * saves some branches at runtime */
15577 case ANYOF_NXDIGIT:
15578 if (! DEPENDS_SEMANTICS) {
15579 goto treat_as_default;
15585 /* The following change to CASED under /i */
15591 namedclass = ANYOF_CASED + (namedclass % 2);
15595 /* The rest have more possibilities depending on the charset.
15596 * We take advantage of the enum ordering of the charset
15597 * modifiers to get the exact node type, */
15600 op = POSIXD + get_regex_charset(RExC_flags);
15601 if (op > POSIXA) { /* /aa is same as /a */
15606 /* The odd numbered ones are the complements of the
15607 * next-lower even number one */
15608 if (namedclass % 2 == 1) {
15612 arg = namedclass_to_classnum(namedclass);
15616 else if (value == prevvalue) {
15618 /* Here, the class consists of just a single code point */
15621 if (! LOC && value == '\n') {
15622 op = REG_ANY; /* Optimize [^\n] */
15623 *flagp |= HASWIDTH|SIMPLE;
15627 else if (value < 256 || UTF) {
15629 /* Optimize a single value into an EXACTish node, but not if it
15630 * would require converting the pattern to UTF-8. */
15631 op = compute_EXACTish(pRExC_state);
15633 } /* Otherwise is a range */
15634 else if (! LOC) { /* locale could vary these */
15635 if (prevvalue == '0') {
15636 if (value == '9') {
15641 else if (! FOLD || ASCII_FOLD_RESTRICTED) {
15642 /* We can optimize A-Z or a-z, but not if they could match
15643 * something like the KELVIN SIGN under /i. */
15644 if (prevvalue == 'A') {
15647 && ! non_portable_endpoint
15650 arg = (FOLD) ? _CC_ALPHA : _CC_UPPER;
15654 else if (prevvalue == 'a') {
15657 && ! non_portable_endpoint
15660 arg = (FOLD) ? _CC_ALPHA : _CC_LOWER;
15667 /* Here, we have changed <op> away from its initial value iff we found
15668 * an optimization */
15671 /* Throw away this ANYOF regnode, and emit the calculated one,
15672 * which should correspond to the beginning, not current, state of
15674 const char * cur_parse = RExC_parse;
15675 RExC_parse = (char *)orig_parse;
15679 /* To get locale nodes to not use the full ANYOF size would
15680 * require moving the code above that writes the portions
15681 * of it that aren't in other nodes to after this point.
15682 * e.g. ANYOF_POSIXL_SET */
15683 RExC_size = orig_size;
15687 RExC_emit = (regnode *)orig_emit;
15688 if (PL_regkind[op] == POSIXD) {
15689 if (op == POSIXL) {
15690 RExC_contains_locale = 1;
15693 op += NPOSIXD - POSIXD;
15698 ret = reg_node(pRExC_state, op);
15700 if (PL_regkind[op] == POSIXD || PL_regkind[op] == NPOSIXD) {
15704 *flagp |= HASWIDTH|SIMPLE;
15706 else if (PL_regkind[op] == EXACT) {
15707 alloc_maybe_populate_EXACT(pRExC_state, ret, flagp, 0, value,
15708 TRUE /* downgradable to EXACT */
15712 RExC_parse = (char *) cur_parse;
15714 SvREFCNT_dec(posixes);
15715 SvREFCNT_dec(nposixes);
15716 SvREFCNT_dec(simple_posixes);
15717 SvREFCNT_dec(cp_list);
15718 SvREFCNT_dec(cp_foldable_list);
15725 /****** !SIZE_ONLY (Pass 2) AFTER HERE *********/
15727 /* If folding, we calculate all characters that could fold to or from the
15728 * ones already on the list */
15729 if (cp_foldable_list) {
15731 UV start, end; /* End points of code point ranges */
15733 SV* fold_intersection = NULL;
15736 /* Our calculated list will be for Unicode rules. For locale
15737 * matching, we have to keep a separate list that is consulted at
15738 * runtime only when the locale indicates Unicode rules. For
15739 * non-locale, we just use the general list */
15741 use_list = &only_utf8_locale_list;
15744 use_list = &cp_list;
15747 /* Only the characters in this class that participate in folds need
15748 * be checked. Get the intersection of this class and all the
15749 * possible characters that are foldable. This can quickly narrow
15750 * down a large class */
15751 _invlist_intersection(PL_utf8_foldable, cp_foldable_list,
15752 &fold_intersection);
15754 /* The folds for all the Latin1 characters are hard-coded into this
15755 * program, but we have to go out to disk to get the others. */
15756 if (invlist_highest(cp_foldable_list) >= 256) {
15758 /* This is a hash that for a particular fold gives all
15759 * characters that are involved in it */
15760 if (! PL_utf8_foldclosures) {
15761 _load_PL_utf8_foldclosures();
15765 /* Now look at the foldable characters in this class individually */
15766 invlist_iterinit(fold_intersection);
15767 while (invlist_iternext(fold_intersection, &start, &end)) {
15770 /* Look at every character in the range */
15771 for (j = start; j <= end; j++) {
15772 U8 foldbuf[UTF8_MAXBYTES_CASE+1];
15778 if (IS_IN_SOME_FOLD_L1(j)) {
15780 /* ASCII is always matched; non-ASCII is matched
15781 * only under Unicode rules (which could happen
15782 * under /l if the locale is a UTF-8 one */
15783 if (isASCII(j) || ! DEPENDS_SEMANTICS) {
15784 *use_list = add_cp_to_invlist(*use_list,
15785 PL_fold_latin1[j]);
15789 add_cp_to_invlist(depends_list,
15790 PL_fold_latin1[j]);
15794 if (HAS_NONLATIN1_SIMPLE_FOLD_CLOSURE(j)
15795 && (! isASCII(j) || ! ASCII_FOLD_RESTRICTED))
15797 add_above_Latin1_folds(pRExC_state,
15804 /* Here is an above Latin1 character. We don't have the
15805 * rules hard-coded for it. First, get its fold. This is
15806 * the simple fold, as the multi-character folds have been
15807 * handled earlier and separated out */
15808 _to_uni_fold_flags(j, foldbuf, &foldlen,
15809 (ASCII_FOLD_RESTRICTED)
15810 ? FOLD_FLAGS_NOMIX_ASCII
15813 /* Single character fold of above Latin1. Add everything in
15814 * its fold closure to the list that this node should match.
15815 * The fold closures data structure is a hash with the keys
15816 * being the UTF-8 of every character that is folded to, like
15817 * 'k', and the values each an array of all code points that
15818 * fold to its key. e.g. [ 'k', 'K', KELVIN_SIGN ].
15819 * Multi-character folds are not included */
15820 if ((listp = hv_fetch(PL_utf8_foldclosures,
15821 (char *) foldbuf, foldlen, FALSE)))
15823 AV* list = (AV*) *listp;
15825 for (k = 0; k <= av_tindex(list); k++) {
15826 SV** c_p = av_fetch(list, k, FALSE);
15832 /* /aa doesn't allow folds between ASCII and non- */
15833 if ((ASCII_FOLD_RESTRICTED
15834 && (isASCII(c) != isASCII(j))))
15839 /* Folds under /l which cross the 255/256 boundary
15840 * are added to a separate list. (These are valid
15841 * only when the locale is UTF-8.) */
15842 if (c < 256 && LOC) {
15843 *use_list = add_cp_to_invlist(*use_list, c);
15847 if (isASCII(c) || c > 255 || AT_LEAST_UNI_SEMANTICS)
15849 cp_list = add_cp_to_invlist(cp_list, c);
15852 /* Similarly folds involving non-ascii Latin1
15853 * characters under /d are added to their list */
15854 depends_list = add_cp_to_invlist(depends_list,
15861 SvREFCNT_dec_NN(fold_intersection);
15864 /* Now that we have finished adding all the folds, there is no reason
15865 * to keep the foldable list separate */
15866 _invlist_union(cp_list, cp_foldable_list, &cp_list);
15867 SvREFCNT_dec_NN(cp_foldable_list);
15870 /* And combine the result (if any) with any inversion list from posix
15871 * classes. The lists are kept separate up to now because we don't want to
15872 * fold the classes (folding of those is automatically handled by the swash
15873 * fetching code) */
15874 if (simple_posixes) {
15875 _invlist_union(cp_list, simple_posixes, &cp_list);
15876 SvREFCNT_dec_NN(simple_posixes);
15878 if (posixes || nposixes) {
15879 if (posixes && AT_LEAST_ASCII_RESTRICTED) {
15880 /* Under /a and /aa, nothing above ASCII matches these */
15881 _invlist_intersection(posixes,
15882 PL_XPosix_ptrs[_CC_ASCII],
15886 if (DEPENDS_SEMANTICS) {
15887 /* Under /d, everything in the upper half of the Latin1 range
15888 * matches these complements */
15889 ANYOF_FLAGS(ret) |= ANYOF_SHARED_d_MATCHES_ALL_NON_UTF8_NON_ASCII_non_d_WARN_SUPER;
15891 else if (AT_LEAST_ASCII_RESTRICTED) {
15892 /* Under /a and /aa, everything above ASCII matches these
15894 _invlist_union_complement_2nd(nposixes,
15895 PL_XPosix_ptrs[_CC_ASCII],
15899 _invlist_union(posixes, nposixes, &posixes);
15900 SvREFCNT_dec_NN(nposixes);
15903 posixes = nposixes;
15906 if (! DEPENDS_SEMANTICS) {
15908 _invlist_union(cp_list, posixes, &cp_list);
15909 SvREFCNT_dec_NN(posixes);
15916 /* Under /d, we put into a separate list the Latin1 things that
15917 * match only when the target string is utf8 */
15918 SV* nonascii_but_latin1_properties = NULL;
15919 _invlist_intersection(posixes, PL_UpperLatin1,
15920 &nonascii_but_latin1_properties);
15921 _invlist_subtract(posixes, nonascii_but_latin1_properties,
15924 _invlist_union(cp_list, posixes, &cp_list);
15925 SvREFCNT_dec_NN(posixes);
15931 if (depends_list) {
15932 _invlist_union(depends_list, nonascii_but_latin1_properties,
15934 SvREFCNT_dec_NN(nonascii_but_latin1_properties);
15937 depends_list = nonascii_but_latin1_properties;
15942 /* And combine the result (if any) with any inversion list from properties.
15943 * The lists are kept separate up to now so that we can distinguish the two
15944 * in regards to matching above-Unicode. A run-time warning is generated
15945 * if a Unicode property is matched against a non-Unicode code point. But,
15946 * we allow user-defined properties to match anything, without any warning,
15947 * and we also suppress the warning if there is a portion of the character
15948 * class that isn't a Unicode property, and which matches above Unicode, \W
15949 * or [\x{110000}] for example.
15950 * (Note that in this case, unlike the Posix one above, there is no
15951 * <depends_list>, because having a Unicode property forces Unicode
15956 /* If it matters to the final outcome, see if a non-property
15957 * component of the class matches above Unicode. If so, the
15958 * warning gets suppressed. This is true even if just a single
15959 * such code point is specified, as though not strictly correct if
15960 * another such code point is matched against, the fact that they
15961 * are using above-Unicode code points indicates they should know
15962 * the issues involved */
15964 warn_super = ! (invert
15965 ^ (invlist_highest(cp_list) > PERL_UNICODE_MAX));
15968 _invlist_union(properties, cp_list, &cp_list);
15969 SvREFCNT_dec_NN(properties);
15972 cp_list = properties;
15977 |= ANYOF_SHARED_d_MATCHES_ALL_NON_UTF8_NON_ASCII_non_d_WARN_SUPER;
15979 /* Because an ANYOF node is the only one that warns, this node
15980 * can't be optimized into something else */
15981 optimizable = FALSE;
15985 /* Here, we have calculated what code points should be in the character
15988 * Now we can see about various optimizations. Fold calculation (which we
15989 * did above) needs to take place before inversion. Otherwise /[^k]/i
15990 * would invert to include K, which under /i would match k, which it
15991 * shouldn't. Therefore we can't invert folded locale now, as it won't be
15992 * folded until runtime */
15994 /* If we didn't do folding, it's because some information isn't available
15995 * until runtime; set the run-time fold flag for these. (We don't have to
15996 * worry about properties folding, as that is taken care of by the swash
15997 * fetching). We know to set the flag if we have a non-NULL list for UTF-8
15998 * locales, or the class matches at least one 0-255 range code point */
16000 if (only_utf8_locale_list) {
16001 ANYOF_FLAGS(ret) |= ANYOF_LOC_FOLD;
16003 else if (cp_list) { /* Look to see if a 0-255 code point is in list */
16005 invlist_iterinit(cp_list);
16006 if (invlist_iternext(cp_list, &start, &end) && start < 256) {
16007 ANYOF_FLAGS(ret) |= ANYOF_LOC_FOLD;
16009 invlist_iterfinish(cp_list);
16013 /* Optimize inverted simple patterns (e.g. [^a-z]) when everything is known
16014 * at compile time. Besides not inverting folded locale now, we can't
16015 * invert if there are things such as \w, which aren't known until runtime
16019 && ! (ANYOF_FLAGS(ret) & (ANYOF_LOCALE_FLAGS))
16021 && ! HAS_NONLOCALE_RUNTIME_PROPERTY_DEFINITION)
16023 _invlist_invert(cp_list);
16025 /* Any swash can't be used as-is, because we've inverted things */
16027 SvREFCNT_dec_NN(swash);
16031 /* Clear the invert flag since have just done it here */
16038 *ret_invlist = cp_list;
16039 SvREFCNT_dec(swash);
16041 /* Discard the generated node */
16043 RExC_size = orig_size;
16046 RExC_emit = orig_emit;
16051 /* Some character classes are equivalent to other nodes. Such nodes take
16052 * up less room and generally fewer operations to execute than ANYOF nodes.
16053 * Above, we checked for and optimized into some such equivalents for
16054 * certain common classes that are easy to test. Getting to this point in
16055 * the code means that the class didn't get optimized there. Since this
16056 * code is only executed in Pass 2, it is too late to save space--it has
16057 * been allocated in Pass 1, and currently isn't given back. But turning
16058 * things into an EXACTish node can allow the optimizer to join it to any
16059 * adjacent such nodes. And if the class is equivalent to things like /./,
16060 * expensive run-time swashes can be avoided. Now that we have more
16061 * complete information, we can find things necessarily missed by the
16062 * earlier code. I (khw) did some benchmarks and found essentially no
16063 * speed difference between using a POSIXA node versus an ANYOF node, so
16064 * there is no reason to optimize, for example [A-Za-z0-9_] into
16065 * [[:word:]]/a (although if we did it in the sizing pass it would save
16066 * space). _invlistEQ() could be used if one ever wanted to do something
16067 * like this at this point in the code */
16069 if (optimizable && cp_list && ! invert && ! depends_list) {
16071 U8 op = END; /* The optimzation node-type */
16072 const char * cur_parse= RExC_parse;
16074 invlist_iterinit(cp_list);
16075 if (! invlist_iternext(cp_list, &start, &end)) {
16077 /* Here, the list is empty. This happens, for example, when a
16078 * Unicode property that doesn't match anything is the only element
16079 * in the character class (perluniprops.pod notes such properties).
16082 *flagp |= HASWIDTH|SIMPLE;
16084 else if (start == end) { /* The range is a single code point */
16085 if (! invlist_iternext(cp_list, &start, &end)
16087 /* Don't do this optimization if it would require changing
16088 * the pattern to UTF-8 */
16089 && (start < 256 || UTF))
16091 /* Here, the list contains a single code point. Can optimize
16092 * into an EXACTish node */
16103 /* A locale node under folding with one code point can be
16104 * an EXACTFL, as its fold won't be calculated until
16110 /* Here, we are generally folding, but there is only one
16111 * code point to match. If we have to, we use an EXACT
16112 * node, but it would be better for joining with adjacent
16113 * nodes in the optimization pass if we used the same
16114 * EXACTFish node that any such are likely to be. We can
16115 * do this iff the code point doesn't participate in any
16116 * folds. For example, an EXACTF of a colon is the same as
16117 * an EXACT one, since nothing folds to or from a colon. */
16119 if (IS_IN_SOME_FOLD_L1(value)) {
16124 if (_invlist_contains_cp(PL_utf8_foldable, value)) {
16129 /* If we haven't found the node type, above, it means we
16130 * can use the prevailing one */
16132 op = compute_EXACTish(pRExC_state);
16136 } /* End of first range contains just a single code point */
16137 else if (start == 0) {
16138 if (end == UV_MAX) {
16140 *flagp |= HASWIDTH|SIMPLE;
16143 else if (end == '\n' - 1
16144 && invlist_iternext(cp_list, &start, &end)
16145 && start == '\n' + 1 && end == UV_MAX)
16148 *flagp |= HASWIDTH|SIMPLE;
16152 invlist_iterfinish(cp_list);
16155 RExC_parse = (char *)orig_parse;
16156 RExC_emit = (regnode *)orig_emit;
16158 if (regarglen[op]) {
16159 ret = reganode(pRExC_state, op, 0);
16161 ret = reg_node(pRExC_state, op);
16164 RExC_parse = (char *)cur_parse;
16166 if (PL_regkind[op] == EXACT) {
16167 alloc_maybe_populate_EXACT(pRExC_state, ret, flagp, 0, value,
16168 TRUE /* downgradable to EXACT */
16172 SvREFCNT_dec_NN(cp_list);
16177 /* Here, <cp_list> contains all the code points we can determine at
16178 * compile time that match under all conditions. Go through it, and
16179 * for things that belong in the bitmap, put them there, and delete from
16180 * <cp_list>. While we are at it, see if everything above 255 is in the
16181 * list, and if so, set a flag to speed up execution */
16183 populate_ANYOF_from_invlist(ret, &cp_list);
16186 ANYOF_FLAGS(ret) |= ANYOF_INVERT;
16189 /* Here, the bitmap has been populated with all the Latin1 code points that
16190 * always match. Can now add to the overall list those that match only
16191 * when the target string is UTF-8 (<depends_list>). */
16192 if (depends_list) {
16194 _invlist_union(cp_list, depends_list, &cp_list);
16195 SvREFCNT_dec_NN(depends_list);
16198 cp_list = depends_list;
16200 ANYOF_FLAGS(ret) |= ANYOF_HAS_UTF8_NONBITMAP_MATCHES;
16203 /* If there is a swash and more than one element, we can't use the swash in
16204 * the optimization below. */
16205 if (swash && element_count > 1) {
16206 SvREFCNT_dec_NN(swash);
16210 /* Note that the optimization of using 'swash' if it is the only thing in
16211 * the class doesn't have us change swash at all, so it can include things
16212 * that are also in the bitmap; otherwise we have purposely deleted that
16213 * duplicate information */
16214 set_ANYOF_arg(pRExC_state, ret, cp_list,
16215 (HAS_NONLOCALE_RUNTIME_PROPERTY_DEFINITION)
16217 only_utf8_locale_list,
16218 swash, has_user_defined_property);
16220 *flagp |= HASWIDTH|SIMPLE;
16222 if (ANYOF_FLAGS(ret) & ANYOF_LOCALE_FLAGS) {
16223 RExC_contains_locale = 1;
16229 #undef HAS_NONLOCALE_RUNTIME_PROPERTY_DEFINITION
16232 S_set_ANYOF_arg(pTHX_ RExC_state_t* const pRExC_state,
16233 regnode* const node,
16235 SV* const runtime_defns,
16236 SV* const only_utf8_locale_list,
16238 const bool has_user_defined_property)
16240 /* Sets the arg field of an ANYOF-type node 'node', using information about
16241 * the node passed-in. If there is nothing outside the node's bitmap, the
16242 * arg is set to ANYOF_ONLY_HAS_BITMAP. Otherwise, it sets the argument to
16243 * the count returned by add_data(), having allocated and stored an array,
16244 * av, that that count references, as follows:
16245 * av[0] stores the character class description in its textual form.
16246 * This is used later (regexec.c:Perl_regclass_swash()) to
16247 * initialize the appropriate swash, and is also useful for dumping
16248 * the regnode. This is set to &PL_sv_undef if the textual
16249 * description is not needed at run-time (as happens if the other
16250 * elements completely define the class)
16251 * av[1] if &PL_sv_undef, is a placeholder to later contain the swash
16252 * computed from av[0]. But if no further computation need be done,
16253 * the swash is stored here now (and av[0] is &PL_sv_undef).
16254 * av[2] stores the inversion list of code points that match only if the
16255 * current locale is UTF-8
16256 * av[3] stores the cp_list inversion list for use in addition or instead
16257 * of av[0]; used only if cp_list exists and av[1] is &PL_sv_undef.
16258 * (Otherwise everything needed is already in av[0] and av[1])
16259 * av[4] is set if any component of the class is from a user-defined
16260 * property; used only if av[3] exists */
16264 PERL_ARGS_ASSERT_SET_ANYOF_ARG;
16266 if (! cp_list && ! runtime_defns && ! only_utf8_locale_list) {
16267 assert(! (ANYOF_FLAGS(node)
16268 & (ANYOF_HAS_UTF8_NONBITMAP_MATCHES
16269 |ANYOF_HAS_NONBITMAP_NON_UTF8_MATCHES)));
16270 ARG_SET(node, ANYOF_ONLY_HAS_BITMAP);
16273 AV * const av = newAV();
16276 assert(ANYOF_FLAGS(node)
16277 & (ANYOF_HAS_UTF8_NONBITMAP_MATCHES
16278 |ANYOF_HAS_NONBITMAP_NON_UTF8_MATCHES|ANYOF_LOC_FOLD));
16280 av_store(av, 0, (runtime_defns)
16281 ? SvREFCNT_inc(runtime_defns) : &PL_sv_undef);
16284 av_store(av, 1, swash);
16285 SvREFCNT_dec_NN(cp_list);
16288 av_store(av, 1, &PL_sv_undef);
16290 av_store(av, 3, cp_list);
16291 av_store(av, 4, newSVuv(has_user_defined_property));
16295 if (only_utf8_locale_list) {
16296 av_store(av, 2, only_utf8_locale_list);
16299 av_store(av, 2, &PL_sv_undef);
16302 rv = newRV_noinc(MUTABLE_SV(av));
16303 n = add_data(pRExC_state, STR_WITH_LEN("s"));
16304 RExC_rxi->data->data[n] = (void*)rv;
16309 #if !defined(PERL_IN_XSUB_RE) || defined(PLUGGABLE_RE_EXTENSION)
16311 Perl__get_regclass_nonbitmap_data(pTHX_ const regexp *prog,
16312 const regnode* node,
16315 SV** only_utf8_locale_ptr,
16319 /* For internal core use only.
16320 * Returns the swash for the input 'node' in the regex 'prog'.
16321 * If <doinit> is 'true', will attempt to create the swash if not already
16323 * If <listsvp> is non-null, will return the printable contents of the
16324 * swash. This can be used to get debugging information even before the
16325 * swash exists, by calling this function with 'doinit' set to false, in
16326 * which case the components that will be used to eventually create the
16327 * swash are returned (in a printable form).
16328 * If <exclude_list> is not NULL, it is an inversion list of things to
16329 * exclude from what's returned in <listsvp>.
16330 * Tied intimately to how S_set_ANYOF_arg sets up the data structure. Note
16331 * that, in spite of this function's name, the swash it returns may include
16332 * the bitmap data as well */
16335 SV *si = NULL; /* Input swash initialization string */
16336 SV* invlist = NULL;
16338 RXi_GET_DECL(prog,progi);
16339 const struct reg_data * const data = prog ? progi->data : NULL;
16341 PERL_ARGS_ASSERT__GET_REGCLASS_NONBITMAP_DATA;
16343 assert(ANYOF_FLAGS(node)
16344 & (ANYOF_HAS_UTF8_NONBITMAP_MATCHES
16345 |ANYOF_HAS_NONBITMAP_NON_UTF8_MATCHES|ANYOF_LOC_FOLD));
16347 if (data && data->count) {
16348 const U32 n = ARG(node);
16350 if (data->what[n] == 's') {
16351 SV * const rv = MUTABLE_SV(data->data[n]);
16352 AV * const av = MUTABLE_AV(SvRV(rv));
16353 SV **const ary = AvARRAY(av);
16354 U8 swash_init_flags = _CORE_SWASH_INIT_ACCEPT_INVLIST;
16356 si = *ary; /* ary[0] = the string to initialize the swash with */
16358 /* Elements 3 and 4 are either both present or both absent. [3] is
16359 * any inversion list generated at compile time; [4] indicates if
16360 * that inversion list has any user-defined properties in it. */
16361 if (av_tindex(av) >= 2) {
16362 if (only_utf8_locale_ptr
16364 && ary[2] != &PL_sv_undef)
16366 *only_utf8_locale_ptr = ary[2];
16369 assert(only_utf8_locale_ptr);
16370 *only_utf8_locale_ptr = NULL;
16373 if (av_tindex(av) >= 3) {
16375 if (SvUV(ary[4])) {
16376 swash_init_flags |= _CORE_SWASH_INIT_USER_DEFINED_PROPERTY;
16384 /* Element [1] is reserved for the set-up swash. If already there,
16385 * return it; if not, create it and store it there */
16386 if (ary[1] && SvROK(ary[1])) {
16389 else if (doinit && ((si && si != &PL_sv_undef)
16390 || (invlist && invlist != &PL_sv_undef))) {
16392 sw = _core_swash_init("utf8", /* the utf8 package */
16396 0, /* not from tr/// */
16398 &swash_init_flags);
16399 (void)av_store(av, 1, sw);
16404 /* If requested, return a printable version of what this swash matches */
16406 SV* matches_string = newSVpvs("");
16408 /* The swash should be used, if possible, to get the data, as it
16409 * contains the resolved data. But this function can be called at
16410 * compile-time, before everything gets resolved, in which case we
16411 * return the currently best available information, which is the string
16412 * that will eventually be used to do that resolving, 'si' */
16413 if ((! sw || (invlist = _get_swash_invlist(sw)) == NULL)
16414 && (si && si != &PL_sv_undef))
16416 sv_catsv(matches_string, si);
16419 /* Add the inversion list to whatever we have. This may have come from
16420 * the swash, or from an input parameter */
16422 if (exclude_list) {
16423 SV* clone = invlist_clone(invlist);
16424 _invlist_subtract(clone, exclude_list, &clone);
16425 sv_catsv(matches_string, _invlist_contents(clone));
16426 SvREFCNT_dec_NN(clone);
16429 sv_catsv(matches_string, _invlist_contents(invlist));
16432 *listsvp = matches_string;
16437 #endif /* !defined(PERL_IN_XSUB_RE) || defined(PLUGGABLE_RE_EXTENSION) */
16439 /* reg_skipcomment()
16441 Absorbs an /x style # comment from the input stream,
16442 returning a pointer to the first character beyond the comment, or if the
16443 comment terminates the pattern without anything following it, this returns
16444 one past the final character of the pattern (in other words, RExC_end) and
16445 sets the REG_RUN_ON_COMMENT_SEEN flag.
16447 Note it's the callers responsibility to ensure that we are
16448 actually in /x mode
16452 PERL_STATIC_INLINE char*
16453 S_reg_skipcomment(RExC_state_t *pRExC_state, char* p)
16455 PERL_ARGS_ASSERT_REG_SKIPCOMMENT;
16459 while (p < RExC_end) {
16460 if (*(++p) == '\n') {
16465 /* we ran off the end of the pattern without ending the comment, so we have
16466 * to add an \n when wrapping */
16467 RExC_seen |= REG_RUN_ON_COMMENT_SEEN;
16472 S_skip_to_be_ignored_text(pTHX_ RExC_state_t *pRExC_state,
16474 const bool force_to_xmod
16477 /* If the text at the current parse position '*p' is a '(?#...)' comment,
16478 * or if we are under /x or 'force_to_xmod' is TRUE, and the text at '*p'
16479 * is /x whitespace, advance '*p' so that on exit it points to the first
16480 * byte past all such white space and comments */
16482 const bool use_xmod = force_to_xmod || (RExC_flags & RXf_PMf_EXTENDED);
16484 PERL_ARGS_ASSERT_SKIP_TO_BE_IGNORED_TEXT;
16486 assert( ! UTF || UTF8_IS_INVARIANT(**p) || UTF8_IS_START(**p));
16489 if (RExC_end - (*p) >= 3
16491 && *(*p + 1) == '?'
16492 && *(*p + 2) == '#')
16494 while (*(*p) != ')') {
16495 if ((*p) == RExC_end)
16496 FAIL("Sequence (?#... not terminated");
16504 const char * save_p = *p;
16505 while ((*p) < RExC_end) {
16507 if ((len = is_PATWS_safe((*p), RExC_end, UTF))) {
16510 else if (*(*p) == '#') {
16511 (*p) = reg_skipcomment(pRExC_state, (*p));
16517 if (*p != save_p) {
16530 Advances the parse position by one byte, unless that byte is the beginning
16531 of a '(?#...)' style comment, or is /x whitespace and /x is in effect. In
16532 those two cases, the parse position is advanced beyond all such comments and
16535 This is the UTF, (?#...), and /x friendly way of saying RExC_parse++.
16539 S_nextchar(pTHX_ RExC_state_t *pRExC_state)
16541 PERL_ARGS_ASSERT_NEXTCHAR;
16544 || UTF8_IS_INVARIANT(*RExC_parse)
16545 || UTF8_IS_START(*RExC_parse));
16547 RExC_parse += (UTF) ? UTF8SKIP(RExC_parse) : 1;
16549 skip_to_be_ignored_text(pRExC_state, &RExC_parse,
16550 FALSE /* Don't assume /x */ );
16554 S_regnode_guts(pTHX_ RExC_state_t *pRExC_state, const U8 op, const STRLEN extra_size, const char* const name)
16556 /* Allocate a regnode for 'op' and returns it, with 'extra_size' extra
16557 * space. In pass1, it aligns and increments RExC_size; in pass2,
16560 regnode * const ret = RExC_emit;
16561 GET_RE_DEBUG_FLAGS_DECL;
16563 PERL_ARGS_ASSERT_REGNODE_GUTS;
16565 assert(extra_size >= regarglen[op]);
16568 SIZE_ALIGN(RExC_size);
16569 RExC_size += 1 + extra_size;
16572 if (RExC_emit >= RExC_emit_bound)
16573 Perl_croak(aTHX_ "panic: reg_node overrun trying to emit %d, %p>=%p",
16574 op, (void*)RExC_emit, (void*)RExC_emit_bound);
16576 NODE_ALIGN_FILL(ret);
16577 #ifndef RE_TRACK_PATTERN_OFFSETS
16578 PERL_UNUSED_ARG(name);
16580 if (RExC_offsets) { /* MJD */
16582 ("%s:%d: (op %s) %s %"UVuf" (len %"UVuf") (max %"UVuf").\n",
16585 (UV)(RExC_emit - RExC_emit_start) > RExC_offsets[0]
16586 ? "Overwriting end of array!\n" : "OK",
16587 (UV)(RExC_emit - RExC_emit_start),
16588 (UV)(RExC_parse - RExC_start),
16589 (UV)RExC_offsets[0]));
16590 Set_Node_Offset(RExC_emit, RExC_parse + (op == END));
16597 - reg_node - emit a node
16599 STATIC regnode * /* Location. */
16600 S_reg_node(pTHX_ RExC_state_t *pRExC_state, U8 op)
16602 regnode * const ret = regnode_guts(pRExC_state, op, regarglen[op], "reg_node");
16604 PERL_ARGS_ASSERT_REG_NODE;
16606 assert(regarglen[op] == 0);
16609 regnode *ptr = ret;
16610 FILL_ADVANCE_NODE(ptr, op);
16617 - reganode - emit a node with an argument
16619 STATIC regnode * /* Location. */
16620 S_reganode(pTHX_ RExC_state_t *pRExC_state, U8 op, U32 arg)
16622 regnode * const ret = regnode_guts(pRExC_state, op, regarglen[op], "reganode");
16624 PERL_ARGS_ASSERT_REGANODE;
16626 assert(regarglen[op] == 1);
16629 regnode *ptr = ret;
16630 FILL_ADVANCE_NODE_ARG(ptr, op, arg);
16637 S_reg2Lanode(pTHX_ RExC_state_t *pRExC_state, const U8 op, const U32 arg1, const I32 arg2)
16639 /* emit a node with U32 and I32 arguments */
16641 regnode * const ret = regnode_guts(pRExC_state, op, regarglen[op], "reg2Lanode");
16643 PERL_ARGS_ASSERT_REG2LANODE;
16645 assert(regarglen[op] == 2);
16648 regnode *ptr = ret;
16649 FILL_ADVANCE_NODE_2L_ARG(ptr, op, arg1, arg2);
16656 - reginsert - insert an operator in front of already-emitted operand
16658 * Means relocating the operand.
16661 S_reginsert(pTHX_ RExC_state_t *pRExC_state, U8 op, regnode *opnd, U32 depth)
16666 const int offset = regarglen[(U8)op];
16667 const int size = NODE_STEP_REGNODE + offset;
16668 GET_RE_DEBUG_FLAGS_DECL;
16670 PERL_ARGS_ASSERT_REGINSERT;
16671 PERL_UNUSED_CONTEXT;
16672 PERL_UNUSED_ARG(depth);
16673 /* (PL_regkind[(U8)op] == CURLY ? EXTRA_STEP_2ARGS : 0); */
16674 DEBUG_PARSE_FMT("inst"," - %s",PL_reg_name[op]);
16683 if (RExC_open_parens) {
16685 /*DEBUG_PARSE_FMT("inst"," - %"IVdf, (IV)RExC_npar);*/
16686 for ( paren=0 ; paren < RExC_npar ; paren++ ) {
16687 if ( RExC_open_parens[paren] >= opnd ) {
16688 /*DEBUG_PARSE_FMT("open"," - %d",size);*/
16689 RExC_open_parens[paren] += size;
16691 /*DEBUG_PARSE_FMT("open"," - %s","ok");*/
16693 if ( RExC_close_parens[paren] >= opnd ) {
16694 /*DEBUG_PARSE_FMT("close"," - %d",size);*/
16695 RExC_close_parens[paren] += size;
16697 /*DEBUG_PARSE_FMT("close"," - %s","ok");*/
16702 while (src > opnd) {
16703 StructCopy(--src, --dst, regnode);
16704 #ifdef RE_TRACK_PATTERN_OFFSETS
16705 if (RExC_offsets) { /* MJD 20010112 */
16707 ("%s(%d): (op %s) %s copy %"UVuf" -> %"UVuf" (max %"UVuf").\n",
16711 (UV)(dst - RExC_emit_start) > RExC_offsets[0]
16712 ? "Overwriting end of array!\n" : "OK",
16713 (UV)(src - RExC_emit_start),
16714 (UV)(dst - RExC_emit_start),
16715 (UV)RExC_offsets[0]));
16716 Set_Node_Offset_To_R(dst-RExC_emit_start, Node_Offset(src));
16717 Set_Node_Length_To_R(dst-RExC_emit_start, Node_Length(src));
16723 place = opnd; /* Op node, where operand used to be. */
16724 #ifdef RE_TRACK_PATTERN_OFFSETS
16725 if (RExC_offsets) { /* MJD */
16727 ("%s(%d): (op %s) %s %"UVuf" <- %"UVuf" (max %"UVuf").\n",
16731 (UV)(place - RExC_emit_start) > RExC_offsets[0]
16732 ? "Overwriting end of array!\n" : "OK",
16733 (UV)(place - RExC_emit_start),
16734 (UV)(RExC_parse - RExC_start),
16735 (UV)RExC_offsets[0]));
16736 Set_Node_Offset(place, RExC_parse);
16737 Set_Node_Length(place, 1);
16740 src = NEXTOPER(place);
16741 FILL_ADVANCE_NODE(place, op);
16742 Zero(src, offset, regnode);
16746 - regtail - set the next-pointer at the end of a node chain of p to val.
16747 - SEE ALSO: regtail_study
16749 /* TODO: All three parms should be const */
16751 S_regtail(pTHX_ RExC_state_t *pRExC_state, regnode *p,
16752 const regnode *val,U32 depth)
16755 GET_RE_DEBUG_FLAGS_DECL;
16757 PERL_ARGS_ASSERT_REGTAIL;
16759 PERL_UNUSED_ARG(depth);
16765 /* Find last node. */
16768 regnode * const temp = regnext(scan);
16770 DEBUG_PARSE_MSG((scan==p ? "tail" : ""));
16771 regprop(RExC_rx, RExC_mysv, scan, NULL, pRExC_state);
16772 PerlIO_printf(Perl_debug_log, "~ %s (%d) %s %s\n",
16773 SvPV_nolen_const(RExC_mysv), REG_NODE_NUM(scan),
16774 (temp == NULL ? "->" : ""),
16775 (temp == NULL ? PL_reg_name[OP(val)] : "")
16783 if (reg_off_by_arg[OP(scan)]) {
16784 ARG_SET(scan, val - scan);
16787 NEXT_OFF(scan) = val - scan;
16793 - regtail_study - set the next-pointer at the end of a node chain of p to val.
16794 - Look for optimizable sequences at the same time.
16795 - currently only looks for EXACT chains.
16797 This is experimental code. The idea is to use this routine to perform
16798 in place optimizations on branches and groups as they are constructed,
16799 with the long term intention of removing optimization from study_chunk so
16800 that it is purely analytical.
16802 Currently only used when in DEBUG mode. The macro REGTAIL_STUDY() is used
16803 to control which is which.
16806 /* TODO: All four parms should be const */
16809 S_regtail_study(pTHX_ RExC_state_t *pRExC_state, regnode *p,
16810 const regnode *val,U32 depth)
16814 #ifdef EXPERIMENTAL_INPLACESCAN
16817 GET_RE_DEBUG_FLAGS_DECL;
16819 PERL_ARGS_ASSERT_REGTAIL_STUDY;
16825 /* Find last node. */
16829 regnode * const temp = regnext(scan);
16830 #ifdef EXPERIMENTAL_INPLACESCAN
16831 if (PL_regkind[OP(scan)] == EXACT) {
16832 bool unfolded_multi_char; /* Unexamined in this routine */
16833 if (join_exact(pRExC_state, scan, &min,
16834 &unfolded_multi_char, 1, val, depth+1))
16839 switch (OP(scan)) {
16843 case EXACTFA_NO_TRIE:
16849 if( exact == PSEUDO )
16851 else if ( exact != OP(scan) )
16860 DEBUG_PARSE_MSG((scan==p ? "tsdy" : ""));
16861 regprop(RExC_rx, RExC_mysv, scan, NULL, pRExC_state);
16862 PerlIO_printf(Perl_debug_log, "~ %s (%d) -> %s\n",
16863 SvPV_nolen_const(RExC_mysv),
16864 REG_NODE_NUM(scan),
16865 PL_reg_name[exact]);
16872 DEBUG_PARSE_MSG("");
16873 regprop(RExC_rx, RExC_mysv, val, NULL, pRExC_state);
16874 PerlIO_printf(Perl_debug_log,
16875 "~ attach to %s (%"IVdf") offset to %"IVdf"\n",
16876 SvPV_nolen_const(RExC_mysv),
16877 (IV)REG_NODE_NUM(val),
16881 if (reg_off_by_arg[OP(scan)]) {
16882 ARG_SET(scan, val - scan);
16885 NEXT_OFF(scan) = val - scan;
16893 - regdump - dump a regexp onto Perl_debug_log in vaguely comprehensible form
16898 S_regdump_intflags(pTHX_ const char *lead, const U32 flags)
16903 ASSUME(REG_INTFLAGS_NAME_SIZE <= sizeof(flags)*8);
16905 for (bit=0; bit<REG_INTFLAGS_NAME_SIZE; bit++) {
16906 if (flags & (1<<bit)) {
16907 if (!set++ && lead)
16908 PerlIO_printf(Perl_debug_log, "%s",lead);
16909 PerlIO_printf(Perl_debug_log, "%s ",PL_reg_intflags_name[bit]);
16914 PerlIO_printf(Perl_debug_log, "\n");
16916 PerlIO_printf(Perl_debug_log, "%s[none-set]\n",lead);
16921 S_regdump_extflags(pTHX_ const char *lead, const U32 flags)
16927 ASSUME(REG_EXTFLAGS_NAME_SIZE <= sizeof(flags)*8);
16929 for (bit=0; bit<REG_EXTFLAGS_NAME_SIZE; bit++) {
16930 if (flags & (1<<bit)) {
16931 if ((1<<bit) & RXf_PMf_CHARSET) { /* Output separately, below */
16934 if (!set++ && lead)
16935 PerlIO_printf(Perl_debug_log, "%s",lead);
16936 PerlIO_printf(Perl_debug_log, "%s ",PL_reg_extflags_name[bit]);
16939 if ((cs = get_regex_charset(flags)) != REGEX_DEPENDS_CHARSET) {
16940 if (!set++ && lead) {
16941 PerlIO_printf(Perl_debug_log, "%s",lead);
16944 case REGEX_UNICODE_CHARSET:
16945 PerlIO_printf(Perl_debug_log, "UNICODE");
16947 case REGEX_LOCALE_CHARSET:
16948 PerlIO_printf(Perl_debug_log, "LOCALE");
16950 case REGEX_ASCII_RESTRICTED_CHARSET:
16951 PerlIO_printf(Perl_debug_log, "ASCII-RESTRICTED");
16953 case REGEX_ASCII_MORE_RESTRICTED_CHARSET:
16954 PerlIO_printf(Perl_debug_log, "ASCII-MORE_RESTRICTED");
16957 PerlIO_printf(Perl_debug_log, "UNKNOWN CHARACTER SET");
16963 PerlIO_printf(Perl_debug_log, "\n");
16965 PerlIO_printf(Perl_debug_log, "%s[none-set]\n",lead);
16971 Perl_regdump(pTHX_ const regexp *r)
16974 SV * const sv = sv_newmortal();
16975 SV *dsv= sv_newmortal();
16976 RXi_GET_DECL(r,ri);
16977 GET_RE_DEBUG_FLAGS_DECL;
16979 PERL_ARGS_ASSERT_REGDUMP;
16981 (void)dumpuntil(r, ri->program, ri->program + 1, NULL, NULL, sv, 0, 0);
16983 /* Header fields of interest. */
16984 if (r->anchored_substr) {
16985 RE_PV_QUOTED_DECL(s, 0, dsv, SvPVX_const(r->anchored_substr),
16986 RE_SV_DUMPLEN(r->anchored_substr), 30);
16987 PerlIO_printf(Perl_debug_log,
16988 "anchored %s%s at %"IVdf" ",
16989 s, RE_SV_TAIL(r->anchored_substr),
16990 (IV)r->anchored_offset);
16991 } else if (r->anchored_utf8) {
16992 RE_PV_QUOTED_DECL(s, 1, dsv, SvPVX_const(r->anchored_utf8),
16993 RE_SV_DUMPLEN(r->anchored_utf8), 30);
16994 PerlIO_printf(Perl_debug_log,
16995 "anchored utf8 %s%s at %"IVdf" ",
16996 s, RE_SV_TAIL(r->anchored_utf8),
16997 (IV)r->anchored_offset);
16999 if (r->float_substr) {
17000 RE_PV_QUOTED_DECL(s, 0, dsv, SvPVX_const(r->float_substr),
17001 RE_SV_DUMPLEN(r->float_substr), 30);
17002 PerlIO_printf(Perl_debug_log,
17003 "floating %s%s at %"IVdf"..%"UVuf" ",
17004 s, RE_SV_TAIL(r->float_substr),
17005 (IV)r->float_min_offset, (UV)r->float_max_offset);
17006 } else if (r->float_utf8) {
17007 RE_PV_QUOTED_DECL(s, 1, dsv, SvPVX_const(r->float_utf8),
17008 RE_SV_DUMPLEN(r->float_utf8), 30);
17009 PerlIO_printf(Perl_debug_log,
17010 "floating utf8 %s%s at %"IVdf"..%"UVuf" ",
17011 s, RE_SV_TAIL(r->float_utf8),
17012 (IV)r->float_min_offset, (UV)r->float_max_offset);
17014 if (r->check_substr || r->check_utf8)
17015 PerlIO_printf(Perl_debug_log,
17017 (r->check_substr == r->float_substr
17018 && r->check_utf8 == r->float_utf8
17019 ? "(checking floating" : "(checking anchored"));
17020 if (r->intflags & PREGf_NOSCAN)
17021 PerlIO_printf(Perl_debug_log, " noscan");
17022 if (r->extflags & RXf_CHECK_ALL)
17023 PerlIO_printf(Perl_debug_log, " isall");
17024 if (r->check_substr || r->check_utf8)
17025 PerlIO_printf(Perl_debug_log, ") ");
17027 if (ri->regstclass) {
17028 regprop(r, sv, ri->regstclass, NULL, NULL);
17029 PerlIO_printf(Perl_debug_log, "stclass %s ", SvPVX_const(sv));
17031 if (r->intflags & PREGf_ANCH) {
17032 PerlIO_printf(Perl_debug_log, "anchored");
17033 if (r->intflags & PREGf_ANCH_MBOL)
17034 PerlIO_printf(Perl_debug_log, "(MBOL)");
17035 if (r->intflags & PREGf_ANCH_SBOL)
17036 PerlIO_printf(Perl_debug_log, "(SBOL)");
17037 if (r->intflags & PREGf_ANCH_GPOS)
17038 PerlIO_printf(Perl_debug_log, "(GPOS)");
17039 (void)PerlIO_putc(Perl_debug_log, ' ');
17041 if (r->intflags & PREGf_GPOS_SEEN)
17042 PerlIO_printf(Perl_debug_log, "GPOS:%"UVuf" ", (UV)r->gofs);
17043 if (r->intflags & PREGf_SKIP)
17044 PerlIO_printf(Perl_debug_log, "plus ");
17045 if (r->intflags & PREGf_IMPLICIT)
17046 PerlIO_printf(Perl_debug_log, "implicit ");
17047 PerlIO_printf(Perl_debug_log, "minlen %"IVdf" ", (IV)r->minlen);
17048 if (r->extflags & RXf_EVAL_SEEN)
17049 PerlIO_printf(Perl_debug_log, "with eval ");
17050 PerlIO_printf(Perl_debug_log, "\n");
17052 regdump_extflags("r->extflags: ",r->extflags);
17053 regdump_intflags("r->intflags: ",r->intflags);
17056 PERL_ARGS_ASSERT_REGDUMP;
17057 PERL_UNUSED_CONTEXT;
17058 PERL_UNUSED_ARG(r);
17059 #endif /* DEBUGGING */
17063 - regprop - printable representation of opcode, with run time support
17067 Perl_regprop(pTHX_ const regexp *prog, SV *sv, const regnode *o, const regmatch_info *reginfo, const RExC_state_t *pRExC_state)
17072 /* Should be synchronized with * ANYOF_ #xdefines in regcomp.h */
17073 static const char * const anyofs[] = {
17074 #if _CC_WORDCHAR != 0 || _CC_DIGIT != 1 || _CC_ALPHA != 2 || _CC_LOWER != 3 \
17075 || _CC_UPPER != 4 || _CC_PUNCT != 5 || _CC_PRINT != 6 \
17076 || _CC_ALPHANUMERIC != 7 || _CC_GRAPH != 8 || _CC_CASED != 9 \
17077 || _CC_SPACE != 10 || _CC_BLANK != 11 || _CC_XDIGIT != 12 \
17078 || _CC_CNTRL != 13 || _CC_ASCII != 14 || _CC_VERTSPACE != 15
17079 #error Need to adjust order of anyofs[]
17114 RXi_GET_DECL(prog,progi);
17115 GET_RE_DEBUG_FLAGS_DECL;
17117 PERL_ARGS_ASSERT_REGPROP;
17119 sv_setpvn(sv, "", 0);
17121 if (OP(o) > REGNODE_MAX) /* regnode.type is unsigned */
17122 /* It would be nice to FAIL() here, but this may be called from
17123 regexec.c, and it would be hard to supply pRExC_state. */
17124 Perl_croak(aTHX_ "Corrupted regexp opcode %d > %d",
17125 (int)OP(o), (int)REGNODE_MAX);
17126 sv_catpv(sv, PL_reg_name[OP(o)]); /* Take off const! */
17128 k = PL_regkind[OP(o)];
17131 sv_catpvs(sv, " ");
17132 /* Using is_utf8_string() (via PERL_PV_UNI_DETECT)
17133 * is a crude hack but it may be the best for now since
17134 * we have no flag "this EXACTish node was UTF-8"
17136 pv_pretty(sv, STRING(o), STR_LEN(o), 60, PL_colors[0], PL_colors[1],
17137 PERL_PV_ESCAPE_UNI_DETECT |
17138 PERL_PV_ESCAPE_NONASCII |
17139 PERL_PV_PRETTY_ELLIPSES |
17140 PERL_PV_PRETTY_LTGT |
17141 PERL_PV_PRETTY_NOCLEAR
17143 } else if (k == TRIE) {
17144 /* print the details of the trie in dumpuntil instead, as
17145 * progi->data isn't available here */
17146 const char op = OP(o);
17147 const U32 n = ARG(o);
17148 const reg_ac_data * const ac = IS_TRIE_AC(op) ?
17149 (reg_ac_data *)progi->data->data[n] :
17151 const reg_trie_data * const trie
17152 = (reg_trie_data*)progi->data->data[!IS_TRIE_AC(op) ? n : ac->trie];
17154 Perl_sv_catpvf(aTHX_ sv, "-%s",PL_reg_name[o->flags]);
17155 DEBUG_TRIE_COMPILE_r(
17156 Perl_sv_catpvf(aTHX_ sv,
17157 "<S:%"UVuf"/%"IVdf" W:%"UVuf" L:%"UVuf"/%"UVuf" C:%"UVuf"/%"UVuf">",
17158 (UV)trie->startstate,
17159 (IV)trie->statecount-1, /* -1 because of the unused 0 element */
17160 (UV)trie->wordcount,
17163 (UV)TRIE_CHARCOUNT(trie),
17164 (UV)trie->uniquecharcount
17167 if ( IS_ANYOF_TRIE(op) || trie->bitmap ) {
17168 sv_catpvs(sv, "[");
17169 (void) put_charclass_bitmap_innards(sv,
17170 (IS_ANYOF_TRIE(op))
17172 : TRIE_BITMAP(trie),
17174 sv_catpvs(sv, "]");
17177 } else if (k == CURLY) {
17178 U32 lo = ARG1(o), hi = ARG2(o);
17179 if (OP(o) == CURLYM || OP(o) == CURLYN || OP(o) == CURLYX)
17180 Perl_sv_catpvf(aTHX_ sv, "[%d]", o->flags); /* Parenth number */
17181 Perl_sv_catpvf(aTHX_ sv, "{%u,", (unsigned) lo);
17182 if (hi == REG_INFTY)
17183 sv_catpvs(sv, "INFTY");
17185 Perl_sv_catpvf(aTHX_ sv, "%u", (unsigned) hi);
17186 sv_catpvs(sv, "}");
17188 else if (k == WHILEM && o->flags) /* Ordinal/of */
17189 Perl_sv_catpvf(aTHX_ sv, "[%d/%d]", o->flags & 0xf, o->flags>>4);
17190 else if (k == REF || k == OPEN || k == CLOSE
17191 || k == GROUPP || OP(o)==ACCEPT)
17193 AV *name_list= NULL;
17194 U32 parno= OP(o) == ACCEPT ? (U32)ARG2L(o) : ARG(o);
17195 Perl_sv_catpvf(aTHX_ sv, "%"UVuf, (UV)parno); /* Parenth number */
17196 if ( RXp_PAREN_NAMES(prog) ) {
17197 name_list= MUTABLE_AV(progi->data->data[progi->name_list_idx]);
17198 } else if ( pRExC_state ) {
17199 name_list= RExC_paren_name_list;
17202 if ( k != REF || (OP(o) < NREF)) {
17203 SV **name= av_fetch(name_list, parno, 0 );
17205 Perl_sv_catpvf(aTHX_ sv, " '%"SVf"'", SVfARG(*name));
17208 SV *sv_dat= MUTABLE_SV(progi->data->data[ parno ]);
17209 I32 *nums=(I32*)SvPVX(sv_dat);
17210 SV **name= av_fetch(name_list, nums[0], 0 );
17213 for ( n=0; n<SvIVX(sv_dat); n++ ) {
17214 Perl_sv_catpvf(aTHX_ sv, "%s%"IVdf,
17215 (n ? "," : ""), (IV)nums[n]);
17217 Perl_sv_catpvf(aTHX_ sv, " '%"SVf"'", SVfARG(*name));
17221 if ( k == REF && reginfo) {
17222 U32 n = ARG(o); /* which paren pair */
17223 I32 ln = prog->offs[n].start;
17224 if (prog->lastparen < n || ln == -1)
17225 Perl_sv_catpvf(aTHX_ sv, ": FAIL");
17226 else if (ln == prog->offs[n].end)
17227 Perl_sv_catpvf(aTHX_ sv, ": ACCEPT - EMPTY STRING");
17229 const char *s = reginfo->strbeg + ln;
17230 Perl_sv_catpvf(aTHX_ sv, ": ");
17231 Perl_pv_pretty( aTHX_ sv, s, prog->offs[n].end - prog->offs[n].start, 32, 0, 0,
17232 PERL_PV_ESCAPE_UNI_DETECT|PERL_PV_PRETTY_NOCLEAR|PERL_PV_PRETTY_ELLIPSES|PERL_PV_PRETTY_QUOTE );
17235 } else if (k == GOSUB) {
17236 AV *name_list= NULL;
17237 if ( RXp_PAREN_NAMES(prog) ) {
17238 name_list= MUTABLE_AV(progi->data->data[progi->name_list_idx]);
17239 } else if ( pRExC_state ) {
17240 name_list= RExC_paren_name_list;
17243 /* Paren and offset */
17244 Perl_sv_catpvf(aTHX_ sv, "%d[%+d]", (int)ARG(o),(int)ARG2L(o));
17246 SV **name= av_fetch(name_list, ARG(o), 0 );
17248 Perl_sv_catpvf(aTHX_ sv, " '%"SVf"'", SVfARG(*name));
17251 else if (k == LOGICAL)
17252 /* 2: embedded, otherwise 1 */
17253 Perl_sv_catpvf(aTHX_ sv, "[%d]", o->flags);
17254 else if (k == ANYOF) {
17255 const U8 flags = ANYOF_FLAGS(o);
17257 SV* bitmap_invlist; /* Will hold what the bit map contains */
17260 if (OP(o) == ANYOFL) {
17261 if (flags & ANYOF_LOC_REQ_UTF8) {
17262 sv_catpvs(sv, "{utf8-loc}");
17265 sv_catpvs(sv, "{loc}");
17268 if (flags & ANYOF_LOC_FOLD)
17269 sv_catpvs(sv, "{i}");
17270 Perl_sv_catpvf(aTHX_ sv, "[%s", PL_colors[0]);
17271 if (flags & ANYOF_INVERT)
17272 sv_catpvs(sv, "^");
17274 /* output what the standard cp 0-NUM_ANYOF_CODE_POINTS-1 bitmap matches
17276 do_sep = put_charclass_bitmap_innards(sv, ANYOF_BITMAP(o),
17279 /* output any special charclass tests (used entirely under use
17281 if (ANYOF_POSIXL_TEST_ANY_SET(o)) {
17283 for (i = 0; i < ANYOF_POSIXL_MAX; i++) {
17284 if (ANYOF_POSIXL_TEST(o,i)) {
17285 sv_catpv(sv, anyofs[i]);
17291 if ((flags & (ANYOF_MATCHES_ALL_ABOVE_BITMAP
17292 |ANYOF_HAS_UTF8_NONBITMAP_MATCHES
17293 |ANYOF_HAS_NONBITMAP_NON_UTF8_MATCHES
17297 Perl_sv_catpvf(aTHX_ sv,"%s][%s",PL_colors[1],PL_colors[0]);
17298 if (flags & ANYOF_INVERT)
17299 /*make sure the invert info is in each */
17300 sv_catpvs(sv, "^");
17303 if (OP(o) == ANYOFD
17304 && (flags & ANYOF_SHARED_d_MATCHES_ALL_NON_UTF8_NON_ASCII_non_d_WARN_SUPER))
17306 sv_catpvs(sv, "{non-utf8-latin1-all}");
17309 if (flags & ANYOF_MATCHES_ALL_ABOVE_BITMAP)
17310 sv_catpvs(sv, "{above_bitmap_all}");
17312 if (ARG(o) != ANYOF_ONLY_HAS_BITMAP) {
17313 SV *lv; /* Set if there is something outside the bit map. */
17314 bool byte_output = FALSE; /* If something has been output */
17315 SV *only_utf8_locale;
17317 /* Get the stuff that wasn't in the bitmap. 'bitmap_invlist'
17318 * is used to guarantee that nothing in the bitmap gets
17320 (void) _get_regclass_nonbitmap_data(prog, o, FALSE,
17321 &lv, &only_utf8_locale,
17323 if (lv && lv != &PL_sv_undef) {
17324 char *s = savesvpv(lv);
17325 char * const origs = s;
17327 while (*s && *s != '\n')
17331 const char * const t = ++s;
17333 if (flags & ANYOF_HAS_NONBITMAP_NON_UTF8_MATCHES) {
17334 sv_catpvs(sv, "{outside bitmap}");
17337 sv_catpvs(sv, "{utf8}");
17341 sv_catpvs(sv, " ");
17347 /* Truncate very long output */
17348 if (s - origs > 256) {
17349 Perl_sv_catpvf(aTHX_ sv,
17351 (int) (s - origs - 1),
17357 else if (*s == '\t') {
17371 SvREFCNT_dec_NN(lv);
17374 if ((flags & ANYOF_LOC_FOLD)
17375 && only_utf8_locale
17376 && only_utf8_locale != &PL_sv_undef)
17379 int max_entries = 256;
17381 sv_catpvs(sv, "{utf8 locale}");
17382 invlist_iterinit(only_utf8_locale);
17383 while (invlist_iternext(only_utf8_locale,
17385 put_range(sv, start, end, FALSE);
17387 if (max_entries < 0) {
17388 sv_catpvs(sv, "...");
17392 invlist_iterfinish(only_utf8_locale);
17396 SvREFCNT_dec(bitmap_invlist);
17399 Perl_sv_catpvf(aTHX_ sv, "%s]", PL_colors[1]);
17401 else if (k == POSIXD || k == NPOSIXD) {
17402 U8 index = FLAGS(o) * 2;
17403 if (index < C_ARRAY_LENGTH(anyofs)) {
17404 if (*anyofs[index] != '[') {
17407 sv_catpv(sv, anyofs[index]);
17408 if (*anyofs[index] != '[') {
17413 Perl_sv_catpvf(aTHX_ sv, "[illegal type=%d])", index);
17416 else if (k == BOUND || k == NBOUND) {
17417 /* Must be synced with order of 'bound_type' in regcomp.h */
17418 const char * const bounds[] = {
17419 "", /* Traditional */
17424 sv_catpv(sv, bounds[FLAGS(o)]);
17426 else if (k == BRANCHJ && (OP(o) == UNLESSM || OP(o) == IFMATCH))
17427 Perl_sv_catpvf(aTHX_ sv, "[%d]", -(o->flags));
17428 else if (OP(o) == SBOL)
17429 Perl_sv_catpvf(aTHX_ sv, " /%s/", o->flags ? "\\A" : "^");
17431 /* add on the verb argument if there is one */
17432 if ( ( k == VERB || OP(o) == ACCEPT || OP(o) == OPFAIL ) && o->flags) {
17433 Perl_sv_catpvf(aTHX_ sv, ":%"SVf,
17434 SVfARG((MUTABLE_SV(progi->data->data[ ARG( o ) ]))));
17437 PERL_UNUSED_CONTEXT;
17438 PERL_UNUSED_ARG(sv);
17439 PERL_UNUSED_ARG(o);
17440 PERL_UNUSED_ARG(prog);
17441 PERL_UNUSED_ARG(reginfo);
17442 PERL_UNUSED_ARG(pRExC_state);
17443 #endif /* DEBUGGING */
17449 Perl_re_intuit_string(pTHX_ REGEXP * const r)
17450 { /* Assume that RE_INTUIT is set */
17451 struct regexp *const prog = ReANY(r);
17452 GET_RE_DEBUG_FLAGS_DECL;
17454 PERL_ARGS_ASSERT_RE_INTUIT_STRING;
17455 PERL_UNUSED_CONTEXT;
17459 const char * const s = SvPV_nolen_const(RX_UTF8(r)
17460 ? prog->check_utf8 : prog->check_substr);
17462 if (!PL_colorset) reginitcolors();
17463 PerlIO_printf(Perl_debug_log,
17464 "%sUsing REx %ssubstr:%s \"%s%.60s%s%s\"\n",
17466 RX_UTF8(r) ? "utf8 " : "",
17467 PL_colors[5],PL_colors[0],
17470 (strlen(s) > 60 ? "..." : ""));
17473 /* use UTF8 check substring if regexp pattern itself is in UTF8 */
17474 return RX_UTF8(r) ? prog->check_utf8 : prog->check_substr;
17480 handles refcounting and freeing the perl core regexp structure. When
17481 it is necessary to actually free the structure the first thing it
17482 does is call the 'free' method of the regexp_engine associated to
17483 the regexp, allowing the handling of the void *pprivate; member
17484 first. (This routine is not overridable by extensions, which is why
17485 the extensions free is called first.)
17487 See regdupe and regdupe_internal if you change anything here.
17489 #ifndef PERL_IN_XSUB_RE
17491 Perl_pregfree(pTHX_ REGEXP *r)
17497 Perl_pregfree2(pTHX_ REGEXP *rx)
17499 struct regexp *const r = ReANY(rx);
17500 GET_RE_DEBUG_FLAGS_DECL;
17502 PERL_ARGS_ASSERT_PREGFREE2;
17504 if (r->mother_re) {
17505 ReREFCNT_dec(r->mother_re);
17507 CALLREGFREE_PVT(rx); /* free the private data */
17508 SvREFCNT_dec(RXp_PAREN_NAMES(r));
17509 Safefree(r->xpv_len_u.xpvlenu_pv);
17512 SvREFCNT_dec(r->anchored_substr);
17513 SvREFCNT_dec(r->anchored_utf8);
17514 SvREFCNT_dec(r->float_substr);
17515 SvREFCNT_dec(r->float_utf8);
17516 Safefree(r->substrs);
17518 RX_MATCH_COPY_FREE(rx);
17519 #ifdef PERL_ANY_COW
17520 SvREFCNT_dec(r->saved_copy);
17523 SvREFCNT_dec(r->qr_anoncv);
17524 rx->sv_u.svu_rx = 0;
17529 This is a hacky workaround to the structural issue of match results
17530 being stored in the regexp structure which is in turn stored in
17531 PL_curpm/PL_reg_curpm. The problem is that due to qr// the pattern
17532 could be PL_curpm in multiple contexts, and could require multiple
17533 result sets being associated with the pattern simultaneously, such
17534 as when doing a recursive match with (??{$qr})
17536 The solution is to make a lightweight copy of the regexp structure
17537 when a qr// is returned from the code executed by (??{$qr}) this
17538 lightweight copy doesn't actually own any of its data except for
17539 the starp/end and the actual regexp structure itself.
17545 Perl_reg_temp_copy (pTHX_ REGEXP *ret_x, REGEXP *rx)
17547 struct regexp *ret;
17548 struct regexp *const r = ReANY(rx);
17549 const bool islv = ret_x && SvTYPE(ret_x) == SVt_PVLV;
17551 PERL_ARGS_ASSERT_REG_TEMP_COPY;
17554 ret_x = (REGEXP*) newSV_type(SVt_REGEXP);
17556 SvOK_off((SV *)ret_x);
17558 /* For PVLVs, SvANY points to the xpvlv body while sv_u points
17559 to the regexp. (For SVt_REGEXPs, sv_upgrade has already
17560 made both spots point to the same regexp body.) */
17561 REGEXP *temp = (REGEXP *)newSV_type(SVt_REGEXP);
17562 assert(!SvPVX(ret_x));
17563 ret_x->sv_u.svu_rx = temp->sv_any;
17564 temp->sv_any = NULL;
17565 SvFLAGS(temp) = (SvFLAGS(temp) & ~SVTYPEMASK) | SVt_NULL;
17566 SvREFCNT_dec_NN(temp);
17567 /* SvCUR still resides in the xpvlv struct, so the regexp copy-
17568 ing below will not set it. */
17569 SvCUR_set(ret_x, SvCUR(rx));
17572 /* This ensures that SvTHINKFIRST(sv) is true, and hence that
17573 sv_force_normal(sv) is called. */
17575 ret = ReANY(ret_x);
17577 SvFLAGS(ret_x) |= SvUTF8(rx);
17578 /* We share the same string buffer as the original regexp, on which we
17579 hold a reference count, incremented when mother_re is set below.
17580 The string pointer is copied here, being part of the regexp struct.
17582 memcpy(&(ret->xpv_cur), &(r->xpv_cur),
17583 sizeof(regexp) - STRUCT_OFFSET(regexp, xpv_cur));
17585 const I32 npar = r->nparens+1;
17586 Newx(ret->offs, npar, regexp_paren_pair);
17587 Copy(r->offs, ret->offs, npar, regexp_paren_pair);
17590 Newx(ret->substrs, 1, struct reg_substr_data);
17591 StructCopy(r->substrs, ret->substrs, struct reg_substr_data);
17593 SvREFCNT_inc_void(ret->anchored_substr);
17594 SvREFCNT_inc_void(ret->anchored_utf8);
17595 SvREFCNT_inc_void(ret->float_substr);
17596 SvREFCNT_inc_void(ret->float_utf8);
17598 /* check_substr and check_utf8, if non-NULL, point to either their
17599 anchored or float namesakes, and don't hold a second reference. */
17601 RX_MATCH_COPIED_off(ret_x);
17602 #ifdef PERL_ANY_COW
17603 ret->saved_copy = NULL;
17605 ret->mother_re = ReREFCNT_inc(r->mother_re ? r->mother_re : rx);
17606 SvREFCNT_inc_void(ret->qr_anoncv);
17612 /* regfree_internal()
17614 Free the private data in a regexp. This is overloadable by
17615 extensions. Perl takes care of the regexp structure in pregfree(),
17616 this covers the *pprivate pointer which technically perl doesn't
17617 know about, however of course we have to handle the
17618 regexp_internal structure when no extension is in use.
17620 Note this is called before freeing anything in the regexp
17625 Perl_regfree_internal(pTHX_ REGEXP * const rx)
17627 struct regexp *const r = ReANY(rx);
17628 RXi_GET_DECL(r,ri);
17629 GET_RE_DEBUG_FLAGS_DECL;
17631 PERL_ARGS_ASSERT_REGFREE_INTERNAL;
17637 SV *dsv= sv_newmortal();
17638 RE_PV_QUOTED_DECL(s, RX_UTF8(rx),
17639 dsv, RX_PRECOMP(rx), RX_PRELEN(rx), 60);
17640 PerlIO_printf(Perl_debug_log,"%sFreeing REx:%s %s\n",
17641 PL_colors[4],PL_colors[5],s);
17644 #ifdef RE_TRACK_PATTERN_OFFSETS
17646 Safefree(ri->u.offsets); /* 20010421 MJD */
17648 if (ri->code_blocks) {
17650 for (n = 0; n < ri->num_code_blocks; n++)
17651 SvREFCNT_dec(ri->code_blocks[n].src_regex);
17652 Safefree(ri->code_blocks);
17656 int n = ri->data->count;
17659 /* If you add a ->what type here, update the comment in regcomp.h */
17660 switch (ri->data->what[n]) {
17666 SvREFCNT_dec(MUTABLE_SV(ri->data->data[n]));
17669 Safefree(ri->data->data[n]);
17675 { /* Aho Corasick add-on structure for a trie node.
17676 Used in stclass optimization only */
17678 reg_ac_data *aho=(reg_ac_data*)ri->data->data[n];
17679 #ifdef USE_ITHREADS
17683 refcount = --aho->refcount;
17686 PerlMemShared_free(aho->states);
17687 PerlMemShared_free(aho->fail);
17688 /* do this last!!!! */
17689 PerlMemShared_free(ri->data->data[n]);
17690 /* we should only ever get called once, so
17691 * assert as much, and also guard the free
17692 * which /might/ happen twice. At the least
17693 * it will make code anlyzers happy and it
17694 * doesn't cost much. - Yves */
17695 assert(ri->regstclass);
17696 if (ri->regstclass) {
17697 PerlMemShared_free(ri->regstclass);
17698 ri->regstclass = 0;
17705 /* trie structure. */
17707 reg_trie_data *trie=(reg_trie_data*)ri->data->data[n];
17708 #ifdef USE_ITHREADS
17712 refcount = --trie->refcount;
17715 PerlMemShared_free(trie->charmap);
17716 PerlMemShared_free(trie->states);
17717 PerlMemShared_free(trie->trans);
17719 PerlMemShared_free(trie->bitmap);
17721 PerlMemShared_free(trie->jump);
17722 PerlMemShared_free(trie->wordinfo);
17723 /* do this last!!!! */
17724 PerlMemShared_free(ri->data->data[n]);
17729 Perl_croak(aTHX_ "panic: regfree data code '%c'",
17730 ri->data->what[n]);
17733 Safefree(ri->data->what);
17734 Safefree(ri->data);
17740 #define av_dup_inc(s,t) MUTABLE_AV(sv_dup_inc((const SV *)s,t))
17741 #define hv_dup_inc(s,t) MUTABLE_HV(sv_dup_inc((const SV *)s,t))
17742 #define SAVEPVN(p,n) ((p) ? savepvn(p,n) : NULL)
17745 re_dup - duplicate a regexp.
17747 This routine is expected to clone a given regexp structure. It is only
17748 compiled under USE_ITHREADS.
17750 After all of the core data stored in struct regexp is duplicated
17751 the regexp_engine.dupe method is used to copy any private data
17752 stored in the *pprivate pointer. This allows extensions to handle
17753 any duplication it needs to do.
17755 See pregfree() and regfree_internal() if you change anything here.
17757 #if defined(USE_ITHREADS)
17758 #ifndef PERL_IN_XSUB_RE
17760 Perl_re_dup_guts(pTHX_ const REGEXP *sstr, REGEXP *dstr, CLONE_PARAMS *param)
17764 const struct regexp *r = ReANY(sstr);
17765 struct regexp *ret = ReANY(dstr);
17767 PERL_ARGS_ASSERT_RE_DUP_GUTS;
17769 npar = r->nparens+1;
17770 Newx(ret->offs, npar, regexp_paren_pair);
17771 Copy(r->offs, ret->offs, npar, regexp_paren_pair);
17773 if (ret->substrs) {
17774 /* Do it this way to avoid reading from *r after the StructCopy().
17775 That way, if any of the sv_dup_inc()s dislodge *r from the L1
17776 cache, it doesn't matter. */
17777 const bool anchored = r->check_substr
17778 ? r->check_substr == r->anchored_substr
17779 : r->check_utf8 == r->anchored_utf8;
17780 Newx(ret->substrs, 1, struct reg_substr_data);
17781 StructCopy(r->substrs, ret->substrs, struct reg_substr_data);
17783 ret->anchored_substr = sv_dup_inc(ret->anchored_substr, param);
17784 ret->anchored_utf8 = sv_dup_inc(ret->anchored_utf8, param);
17785 ret->float_substr = sv_dup_inc(ret->float_substr, param);
17786 ret->float_utf8 = sv_dup_inc(ret->float_utf8, param);
17788 /* check_substr and check_utf8, if non-NULL, point to either their
17789 anchored or float namesakes, and don't hold a second reference. */
17791 if (ret->check_substr) {
17793 assert(r->check_utf8 == r->anchored_utf8);
17794 ret->check_substr = ret->anchored_substr;
17795 ret->check_utf8 = ret->anchored_utf8;
17797 assert(r->check_substr == r->float_substr);
17798 assert(r->check_utf8 == r->float_utf8);
17799 ret->check_substr = ret->float_substr;
17800 ret->check_utf8 = ret->float_utf8;
17802 } else if (ret->check_utf8) {
17804 ret->check_utf8 = ret->anchored_utf8;
17806 ret->check_utf8 = ret->float_utf8;
17811 RXp_PAREN_NAMES(ret) = hv_dup_inc(RXp_PAREN_NAMES(ret), param);
17812 ret->qr_anoncv = MUTABLE_CV(sv_dup_inc((const SV *)ret->qr_anoncv, param));
17815 RXi_SET(ret,CALLREGDUPE_PVT(dstr,param));
17817 if (RX_MATCH_COPIED(dstr))
17818 ret->subbeg = SAVEPVN(ret->subbeg, ret->sublen);
17820 ret->subbeg = NULL;
17821 #ifdef PERL_ANY_COW
17822 ret->saved_copy = NULL;
17825 /* Whether mother_re be set or no, we need to copy the string. We
17826 cannot refrain from copying it when the storage points directly to
17827 our mother regexp, because that's
17828 1: a buffer in a different thread
17829 2: something we no longer hold a reference on
17830 so we need to copy it locally. */
17831 RX_WRAPPED(dstr) = SAVEPVN(RX_WRAPPED(sstr), SvCUR(sstr)+1);
17832 ret->mother_re = NULL;
17834 #endif /* PERL_IN_XSUB_RE */
17839 This is the internal complement to regdupe() which is used to copy
17840 the structure pointed to by the *pprivate pointer in the regexp.
17841 This is the core version of the extension overridable cloning hook.
17842 The regexp structure being duplicated will be copied by perl prior
17843 to this and will be provided as the regexp *r argument, however
17844 with the /old/ structures pprivate pointer value. Thus this routine
17845 may override any copying normally done by perl.
17847 It returns a pointer to the new regexp_internal structure.
17851 Perl_regdupe_internal(pTHX_ REGEXP * const rx, CLONE_PARAMS *param)
17854 struct regexp *const r = ReANY(rx);
17855 regexp_internal *reti;
17857 RXi_GET_DECL(r,ri);
17859 PERL_ARGS_ASSERT_REGDUPE_INTERNAL;
17863 Newxc(reti, sizeof(regexp_internal) + len*sizeof(regnode),
17864 char, regexp_internal);
17865 Copy(ri->program, reti->program, len+1, regnode);
17867 reti->num_code_blocks = ri->num_code_blocks;
17868 if (ri->code_blocks) {
17870 Newxc(reti->code_blocks, ri->num_code_blocks, struct reg_code_block,
17871 struct reg_code_block);
17872 Copy(ri->code_blocks, reti->code_blocks, ri->num_code_blocks,
17873 struct reg_code_block);
17874 for (n = 0; n < ri->num_code_blocks; n++)
17875 reti->code_blocks[n].src_regex = (REGEXP*)
17876 sv_dup_inc((SV*)(ri->code_blocks[n].src_regex), param);
17879 reti->code_blocks = NULL;
17881 reti->regstclass = NULL;
17884 struct reg_data *d;
17885 const int count = ri->data->count;
17888 Newxc(d, sizeof(struct reg_data) + count*sizeof(void *),
17889 char, struct reg_data);
17890 Newx(d->what, count, U8);
17893 for (i = 0; i < count; i++) {
17894 d->what[i] = ri->data->what[i];
17895 switch (d->what[i]) {
17896 /* see also regcomp.h and regfree_internal() */
17897 case 'a': /* actually an AV, but the dup function is identical. */
17901 case 'u': /* actually an HV, but the dup function is identical. */
17902 d->data[i] = sv_dup_inc((const SV *)ri->data->data[i], param);
17905 /* This is cheating. */
17906 Newx(d->data[i], 1, regnode_ssc);
17907 StructCopy(ri->data->data[i], d->data[i], regnode_ssc);
17908 reti->regstclass = (regnode*)d->data[i];
17911 /* Trie stclasses are readonly and can thus be shared
17912 * without duplication. We free the stclass in pregfree
17913 * when the corresponding reg_ac_data struct is freed.
17915 reti->regstclass= ri->regstclass;
17919 ((reg_trie_data*)ri->data->data[i])->refcount++;
17924 d->data[i] = ri->data->data[i];
17927 Perl_croak(aTHX_ "panic: re_dup unknown data code '%c'",
17928 ri->data->what[i]);
17937 reti->name_list_idx = ri->name_list_idx;
17939 #ifdef RE_TRACK_PATTERN_OFFSETS
17940 if (ri->u.offsets) {
17941 Newx(reti->u.offsets, 2*len+1, U32);
17942 Copy(ri->u.offsets, reti->u.offsets, 2*len+1, U32);
17945 SetProgLen(reti,len);
17948 return (void*)reti;
17951 #endif /* USE_ITHREADS */
17953 #ifndef PERL_IN_XSUB_RE
17956 - regnext - dig the "next" pointer out of a node
17959 Perl_regnext(pTHX_ regnode *p)
17966 if (OP(p) > REGNODE_MAX) { /* regnode.type is unsigned */
17967 Perl_croak(aTHX_ "Corrupted regexp opcode %d > %d",
17968 (int)OP(p), (int)REGNODE_MAX);
17971 offset = (reg_off_by_arg[OP(p)] ? ARG(p) : NEXT_OFF(p));
17980 S_re_croak2(pTHX_ bool utf8, const char* pat1,const char* pat2,...)
17983 STRLEN l1 = strlen(pat1);
17984 STRLEN l2 = strlen(pat2);
17987 const char *message;
17989 PERL_ARGS_ASSERT_RE_CROAK2;
17995 Copy(pat1, buf, l1 , char);
17996 Copy(pat2, buf + l1, l2 , char);
17997 buf[l1 + l2] = '\n';
17998 buf[l1 + l2 + 1] = '\0';
17999 va_start(args, pat2);
18000 msv = vmess(buf, &args);
18002 message = SvPV_const(msv,l1);
18005 Copy(message, buf, l1 , char);
18006 /* l1-1 to avoid \n */
18007 Perl_croak(aTHX_ "%"UTF8f, UTF8fARG(utf8, l1-1, buf));
18010 /* XXX Here's a total kludge. But we need to re-enter for swash routines. */
18012 #ifndef PERL_IN_XSUB_RE
18014 Perl_save_re_context(pTHX)
18019 /* Save $1..$n (#18107: UTF-8 s/(\w+)/uc($1)/e); AMS 20021106. */
18022 const REGEXP * const rx = PM_GETRE(PL_curpm);
18024 nparens = RX_NPARENS(rx);
18027 /* RT #124109. This is a complete hack; in the SWASHNEW case we know
18028 * that PL_curpm will be null, but that utf8.pm and the modules it
18029 * loads will only use $1..$3.
18030 * The t/porting/re_context.t test file checks this assumption.
18035 for (i = 1; i <= nparens; i++) {
18036 char digits[TYPE_CHARS(long)];
18037 const STRLEN len = my_snprintf(digits, sizeof(digits),
18039 GV *const *const gvp
18040 = (GV**)hv_fetch(PL_defstash, digits, len, 0);
18043 GV * const gv = *gvp;
18044 if (SvTYPE(gv) == SVt_PVGV && GvSV(gv))
18054 S_put_code_point(pTHX_ SV *sv, UV c)
18056 PERL_ARGS_ASSERT_PUT_CODE_POINT;
18059 Perl_sv_catpvf(aTHX_ sv, "\\x{%04"UVXf"}", c);
18061 else if (isPRINT(c)) {
18062 const char string = (char) c;
18063 if (isBACKSLASHED_PUNCT(c))
18064 sv_catpvs(sv, "\\");
18065 sv_catpvn(sv, &string, 1);
18068 const char * const mnemonic = cntrl_to_mnemonic((char) c);
18070 Perl_sv_catpvf(aTHX_ sv, "%s", mnemonic);
18073 Perl_sv_catpvf(aTHX_ sv, "\\x{%02X}", (U8) c);
18078 #define MAX_PRINT_A MAX_PRINT_A_FOR_USE_ONLY_BY_REGCOMP_DOT_C
18081 S_put_range(pTHX_ SV *sv, UV start, const UV end, const bool allow_literals)
18083 /* Appends to 'sv' a displayable version of the range of code points from
18084 * 'start' to 'end'. It assumes that only ASCII printables are displayable
18085 * as-is (though some of these will be escaped by put_code_point()). */
18087 const unsigned int min_range_count = 3;
18089 assert(start <= end);
18091 PERL_ARGS_ASSERT_PUT_RANGE;
18093 while (start <= end) {
18095 const char * format;
18097 if (end - start < min_range_count) {
18099 /* Individual chars in short ranges */
18100 for (; start <= end; start++) {
18101 put_code_point(sv, start);
18106 /* If permitted by the input options, and there is a possibility that
18107 * this range contains a printable literal, look to see if there is
18109 if (allow_literals && start <= MAX_PRINT_A) {
18111 /* If the range begin isn't an ASCII printable, effectively split
18112 * the range into two parts:
18113 * 1) the portion before the first such printable,
18115 * and output them separately. */
18116 if (! isPRINT_A(start)) {
18117 UV temp_end = start + 1;
18119 /* There is no point looking beyond the final possible
18120 * printable, in MAX_PRINT_A */
18121 UV max = MIN(end, MAX_PRINT_A);
18123 while (temp_end <= max && ! isPRINT_A(temp_end)) {
18127 /* Here, temp_end points to one beyond the first printable if
18128 * found, or to one beyond 'max' if not. If none found, make
18129 * sure that we use the entire range */
18130 if (temp_end > MAX_PRINT_A) {
18131 temp_end = end + 1;
18134 /* Output the first part of the split range, the part that
18135 * doesn't have printables, with no looking for literals
18136 * (otherwise we would infinitely recurse) */
18137 put_range(sv, start, temp_end - 1, FALSE);
18139 /* The 2nd part of the range (if any) starts here. */
18142 /* We continue instead of dropping down because even if the 2nd
18143 * part is non-empty, it could be so short that we want to
18144 * output it specially, as tested for at the top of this loop.
18149 /* Here, 'start' is a printable ASCII. If it is an alphanumeric,
18150 * output a sub-range of just the digits or letters, then process
18151 * the remaining portion as usual. */
18152 if (isALPHANUMERIC_A(start)) {
18153 UV mask = (isDIGIT_A(start))
18158 UV temp_end = start + 1;
18160 /* Find the end of the sub-range that includes just the
18161 * characters in the same class as the first character in it */
18162 while (temp_end <= end && _generic_isCC_A(temp_end, mask)) {
18167 /* For short ranges, don't duplicate the code above to output
18168 * them; just call recursively */
18169 if (temp_end - start < min_range_count) {
18170 put_range(sv, start, temp_end, FALSE);
18172 else { /* Output as a range */
18173 put_code_point(sv, start);
18174 sv_catpvs(sv, "-");
18175 put_code_point(sv, temp_end);
18177 start = temp_end + 1;
18181 /* We output any other printables as individual characters */
18182 if (isPUNCT_A(start) || isSPACE_A(start)) {
18183 while (start <= end && (isPUNCT_A(start)
18184 || isSPACE_A(start)))
18186 put_code_point(sv, start);
18191 } /* End of looking for literals */
18193 /* Here is not to output as a literal. Some control characters have
18194 * mnemonic names. Split off any of those at the beginning and end of
18195 * the range to print mnemonically. It isn't possible for many of
18196 * these to be in a row, so this won't overwhelm with output */
18197 while (isMNEMONIC_CNTRL(start) && start <= end) {
18198 put_code_point(sv, start);
18201 if (start < end && isMNEMONIC_CNTRL(end)) {
18203 /* Here, the final character in the range has a mnemonic name.
18204 * Work backwards from the end to find the final non-mnemonic */
18205 UV temp_end = end - 1;
18206 while (isMNEMONIC_CNTRL(temp_end)) {
18210 /* And separately output the range that doesn't have mnemonics */
18211 put_range(sv, start, temp_end, FALSE);
18213 /* Then output the mnemonic trailing controls */
18214 start = temp_end + 1;
18215 while (start <= end) {
18216 put_code_point(sv, start);
18222 /* As a final resort, output the range or subrange as hex. */
18224 this_end = (end < NUM_ANYOF_CODE_POINTS)
18226 : NUM_ANYOF_CODE_POINTS - 1;
18227 #if NUM_ANYOF_CODE_POINTS > 256
18228 format = (this_end < 256)
18229 ? "\\x{%02"UVXf"}-\\x{%02"UVXf"}"
18230 : "\\x{%04"UVXf"}-\\x{%04"UVXf"}";
18232 format = "\\x{%02"UVXf"}-\\x{%02"UVXf"}";
18234 GCC_DIAG_IGNORE(-Wformat-nonliteral);
18235 Perl_sv_catpvf(aTHX_ sv, format, start, this_end);
18242 S_put_charclass_bitmap_innards(pTHX_ SV *sv, char *bitmap, SV** bitmap_invlist)
18244 /* Appends to 'sv' a displayable version of the innards of the bracketed
18245 * character class whose bitmap is 'bitmap'; Returns 'TRUE' if it actually
18246 * output anything, and bitmap_invlist, if not NULL, will point to an
18247 * inversion list of what is in the bit map */
18251 unsigned int punct_count = 0;
18252 SV* invlist = NULL;
18253 SV** invlist_ptr; /* Temporary, in case bitmap_invlist is NULL */
18254 bool allow_literals = TRUE;
18256 PERL_ARGS_ASSERT_PUT_CHARCLASS_BITMAP_INNARDS;
18258 invlist_ptr = (bitmap_invlist) ? bitmap_invlist : &invlist;
18260 /* Worst case is exactly every-other code point is in the list */
18261 *invlist_ptr = _new_invlist(NUM_ANYOF_CODE_POINTS / 2);
18263 /* Convert the bit map to an inversion list, keeping track of how many
18264 * ASCII puncts are set, including an extra amount for the backslashed
18266 for (i = 0; i < NUM_ANYOF_CODE_POINTS; i++) {
18267 if (BITMAP_TEST(bitmap, i)) {
18268 *invlist_ptr = add_cp_to_invlist(*invlist_ptr, i);
18269 if (isPUNCT_A(i)) {
18271 if isBACKSLASHED_PUNCT(i) {
18278 /* Nothing to output */
18279 if (_invlist_len(*invlist_ptr) == 0) {
18280 SvREFCNT_dec(invlist);
18284 /* Generally, it is more readable if printable characters are output as
18285 * literals, but if a range (nearly) spans all of them, it's best to output
18286 * it as a single range. This code will use a single range if all but 2
18287 * printables are in it */
18288 invlist_iterinit(*invlist_ptr);
18289 while (invlist_iternext(*invlist_ptr, &start, &end)) {
18291 /* If range starts beyond final printable, it doesn't have any in it */
18292 if (start > MAX_PRINT_A) {
18296 /* In both ASCII and EBCDIC, a SPACE is the lowest printable. To span
18297 * all but two, the range must start and end no later than 2 from
18299 if (start < ' ' + 2 && end > MAX_PRINT_A - 2) {
18300 if (end > MAX_PRINT_A) {
18306 if (end - start >= MAX_PRINT_A - ' ' - 2) {
18307 allow_literals = FALSE;
18312 invlist_iterfinish(*invlist_ptr);
18314 /* The legibility of the output depends mostly on how many punctuation
18315 * characters are output. There are 32 possible ASCII ones, and some have
18316 * an additional backslash, bringing it to currently 36, so if any more
18317 * than 18 are to be output, we can instead output it as its complement,
18318 * yielding fewer puncts, and making it more legible. But give some weight
18319 * to the fact that outputting it as a complement is less legible than a
18320 * straight output, so don't complement unless we are somewhat over the 18
18322 if (allow_literals && punct_count > 22) {
18323 sv_catpvs(sv, "^");
18325 /* Add everything remaining to the list, so when we invert it just
18326 * below, it will be excluded */
18327 _invlist_union_complement_2nd(*invlist_ptr, PL_InBitmap, invlist_ptr);
18328 _invlist_invert(*invlist_ptr);
18331 /* Here we have figured things out. Output each range */
18332 invlist_iterinit(*invlist_ptr);
18333 while (invlist_iternext(*invlist_ptr, &start, &end)) {
18334 if (start >= NUM_ANYOF_CODE_POINTS) {
18337 put_range(sv, start, end, allow_literals);
18339 invlist_iterfinish(*invlist_ptr);
18344 #define CLEAR_OPTSTART \
18345 if (optstart) STMT_START { \
18346 DEBUG_OPTIMISE_r(PerlIO_printf(Perl_debug_log, \
18347 " (%"IVdf" nodes)\n", (IV)(node - optstart))); \
18351 #define DUMPUNTIL(b,e) \
18353 node=dumpuntil(r,start,(b),(e),last,sv,indent+1,depth+1);
18355 STATIC const regnode *
18356 S_dumpuntil(pTHX_ const regexp *r, const regnode *start, const regnode *node,
18357 const regnode *last, const regnode *plast,
18358 SV* sv, I32 indent, U32 depth)
18360 U8 op = PSEUDO; /* Arbitrary non-END op. */
18361 const regnode *next;
18362 const regnode *optstart= NULL;
18364 RXi_GET_DECL(r,ri);
18365 GET_RE_DEBUG_FLAGS_DECL;
18367 PERL_ARGS_ASSERT_DUMPUNTIL;
18369 #ifdef DEBUG_DUMPUNTIL
18370 PerlIO_printf(Perl_debug_log, "--- %d : %d - %d - %d\n",indent,node-start,
18371 last ? last-start : 0,plast ? plast-start : 0);
18374 if (plast && plast < last)
18377 while (PL_regkind[op] != END && (!last || node < last)) {
18379 /* While that wasn't END last time... */
18382 if (op == CLOSE || op == WHILEM)
18384 next = regnext((regnode *)node);
18387 if (OP(node) == OPTIMIZED) {
18388 if (!optstart && RE_DEBUG_FLAG(RE_DEBUG_COMPILE_OPTIMISE))
18395 regprop(r, sv, node, NULL, NULL);
18396 PerlIO_printf(Perl_debug_log, "%4"IVdf":%*s%s", (IV)(node - start),
18397 (int)(2*indent + 1), "", SvPVX_const(sv));
18399 if (OP(node) != OPTIMIZED) {
18400 if (next == NULL) /* Next ptr. */
18401 PerlIO_printf(Perl_debug_log, " (0)");
18402 else if (PL_regkind[(U8)op] == BRANCH
18403 && PL_regkind[OP(next)] != BRANCH )
18404 PerlIO_printf(Perl_debug_log, " (FAIL)");
18406 PerlIO_printf(Perl_debug_log, " (%"IVdf")", (IV)(next - start));
18407 (void)PerlIO_putc(Perl_debug_log, '\n');
18411 if (PL_regkind[(U8)op] == BRANCHJ) {
18414 const regnode *nnode = (OP(next) == LONGJMP
18415 ? regnext((regnode *)next)
18417 if (last && nnode > last)
18419 DUMPUNTIL(NEXTOPER(NEXTOPER(node)), nnode);
18422 else if (PL_regkind[(U8)op] == BRANCH) {
18424 DUMPUNTIL(NEXTOPER(node), next);
18426 else if ( PL_regkind[(U8)op] == TRIE ) {
18427 const regnode *this_trie = node;
18428 const char op = OP(node);
18429 const U32 n = ARG(node);
18430 const reg_ac_data * const ac = op>=AHOCORASICK ?
18431 (reg_ac_data *)ri->data->data[n] :
18433 const reg_trie_data * const trie =
18434 (reg_trie_data*)ri->data->data[op<AHOCORASICK ? n : ac->trie];
18436 AV *const trie_words
18437 = MUTABLE_AV(ri->data->data[n + TRIE_WORDS_OFFSET]);
18439 const regnode *nextbranch= NULL;
18442 for (word_idx= 0; word_idx < (I32)trie->wordcount; word_idx++) {
18443 SV ** const elem_ptr = av_fetch(trie_words,word_idx,0);
18445 PerlIO_printf(Perl_debug_log, "%*s%s ",
18446 (int)(2*(indent+3)), "",
18448 ? pv_pretty(sv, SvPV_nolen_const(*elem_ptr),
18449 SvCUR(*elem_ptr), 60,
18450 PL_colors[0], PL_colors[1],
18452 ? PERL_PV_ESCAPE_UNI
18454 | PERL_PV_PRETTY_ELLIPSES
18455 | PERL_PV_PRETTY_LTGT
18460 U16 dist= trie->jump[word_idx+1];
18461 PerlIO_printf(Perl_debug_log, "(%"UVuf")\n",
18462 (UV)((dist ? this_trie + dist : next) - start));
18465 nextbranch= this_trie + trie->jump[0];
18466 DUMPUNTIL(this_trie + dist, nextbranch);
18468 if (nextbranch && PL_regkind[OP(nextbranch)]==BRANCH)
18469 nextbranch= regnext((regnode *)nextbranch);
18471 PerlIO_printf(Perl_debug_log, "\n");
18474 if (last && next > last)
18479 else if ( op == CURLY ) { /* "next" might be very big: optimizer */
18480 DUMPUNTIL(NEXTOPER(node) + EXTRA_STEP_2ARGS,
18481 NEXTOPER(node) + EXTRA_STEP_2ARGS + 1);
18483 else if (PL_regkind[(U8)op] == CURLY && op != CURLYX) {
18485 DUMPUNTIL(NEXTOPER(node) + EXTRA_STEP_2ARGS, next);
18487 else if ( op == PLUS || op == STAR) {
18488 DUMPUNTIL(NEXTOPER(node), NEXTOPER(node) + 1);
18490 else if (PL_regkind[(U8)op] == ANYOF) {
18491 /* arglen 1 + class block */
18492 node += 1 + ((ANYOF_FLAGS(node) & ANYOF_MATCHES_POSIXL)
18493 ? ANYOF_POSIXL_SKIP
18495 node = NEXTOPER(node);
18497 else if (PL_regkind[(U8)op] == EXACT) {
18498 /* Literal string, where present. */
18499 node += NODE_SZ_STR(node) - 1;
18500 node = NEXTOPER(node);
18503 node = NEXTOPER(node);
18504 node += regarglen[(U8)op];
18506 if (op == CURLYX || op == OPEN)
18510 #ifdef DEBUG_DUMPUNTIL
18511 PerlIO_printf(Perl_debug_log, "--- %d\n", (int)indent);
18516 #endif /* DEBUGGING */
18519 * ex: set ts=8 sts=4 sw=4 et: